summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStan Hu <stanhu@gmail.com>2018-12-04 07:10:11 +0000
committerStan Hu <stanhu@gmail.com>2018-12-04 07:10:11 +0000
commit83f0798e7dc588f0e4cb6816daadeef7dbfc8b81 (patch)
tree13f1d960da531496f369126381385a9ee36c19ba
parenteeb0e98df8f038a6498f6388c515fa7d3c136ee4 (diff)
parent9c059a026d7801b144a1f4888adc414d428c2a78 (diff)
downloadgitlab-ce-83f0798e7dc588f0e4cb6816daadeef7dbfc8b81.tar.gz
Merge branch 'ab-approximate-counts' into 'master'
Approximate table counts based on TABLESAMPLE See merge request gitlab-org/gitlab-ce!22650
-rw-r--r--changelogs/unreleased/ab-approximate-counts.yml5
-rw-r--r--lib/gitlab/database/count.rb79
-rw-r--r--lib/gitlab/database/count/exact_count_strategy.rb31
-rw-r--r--lib/gitlab/database/count/reltuples_count_strategy.rb79
-rw-r--r--lib/gitlab/database/count/tablesample_count_strategy.rb66
-rw-r--r--spec/lib/gitlab/database/count/exact_count_strategy_spec.rb34
-rw-r--r--spec/lib/gitlab/database/count/reltuples_count_strategy_spec.rb48
-rw-r--r--spec/lib/gitlab/database/count/tablesample_count_strategy_spec.rb65
-rw-r--r--spec/lib/gitlab/database/count_spec.rb72
9 files changed, 381 insertions, 98 deletions
diff --git a/changelogs/unreleased/ab-approximate-counts.yml b/changelogs/unreleased/ab-approximate-counts.yml
new file mode 100644
index 00000000000..8a67239d031
--- /dev/null
+++ b/changelogs/unreleased/ab-approximate-counts.yml
@@ -0,0 +1,5 @@
+---
+title: Approximate counting strategy with TABLESAMPLE.
+merge_request: 22650
+author:
+type: performance
diff --git a/lib/gitlab/database/count.rb b/lib/gitlab/database/count.rb
index ea6529e2dc4..c996d786909 100644
--- a/lib/gitlab/database/count.rb
+++ b/lib/gitlab/database/count.rb
@@ -1,7 +1,12 @@
# frozen_string_literal: true
# For large tables, PostgreSQL can take a long time to count rows due to MVCC.
-# We can optimize this by using the reltuples count as described in https://wiki.postgresql.org/wiki/Slow_Counting.
+# We can optimize this by using various strategies for approximate counting.
+#
+# For example, we can use the reltuples count as described in https://wiki.postgresql.org/wiki/Slow_Counting.
+#
+# However, since statistics are not always up to date, we also implement a table sampling strategy
+# that performs an exact count but only on a sample of the table. See TablesampleCountStrategy.
module Gitlab
module Database
module Count
@@ -20,68 +25,30 @@ module Gitlab
end
# Takes in an array of models and returns a Hash for the approximate
- # counts for them. If the model's table has not been vacuumed or
- # analyzed recently, simply run the Model.count to get the data.
+ # counts for them.
+ #
+ # Various count strategies can be specified that are executed in
+ # sequence until all tables have an approximate count attached
+ # or we run out of strategies.
+ #
+ # Note that not all strategies are available on all supported RDBMS.
#
# @param [Array]
# @return [Hash] of Model -> count mapping
- def self.approximate_counts(models)
- table_to_model_map = models.each_with_object({}) do |model, hash|
- hash[model.table_name] = model
- end
-
- table_names = table_to_model_map.keys
- counts_by_table_name = Gitlab::Database.postgresql? ? reltuples_from_recently_updated(table_names) : {}
+ def self.approximate_counts(models, strategies: [TablesampleCountStrategy, ReltuplesCountStrategy, ExactCountStrategy])
+ strategies.each_with_object({}) do |strategy, counts_by_model|
+ if strategy.enabled?
+ models_with_missing_counts = models - counts_by_model.keys
- # Convert table -> count to Model -> count
- counts_by_model = counts_by_table_name.each_with_object({}) do |pair, hash|
- model = table_to_model_map[pair.first]
- hash[model] = pair.second
- end
+ break if models_with_missing_counts.empty?
- missing_tables = table_names - counts_by_table_name.keys
+ counts = strategy.new(models_with_missing_counts).count
- missing_tables.each do |table|
- model = table_to_model_map[table]
- counts_by_model[model] = model.count
+ counts.each do |model, count|
+ counts_by_model[model] = count
+ end
+ end
end
-
- counts_by_model
- end
-
- # Returns a hash of the table names that have recently updated tuples.
- #
- # @param [Array] table names
- # @returns [Hash] Table name to count mapping (e.g. { 'projects' => 5, 'users' => 100 })
- def self.reltuples_from_recently_updated(table_names)
- query = postgresql_estimate_query(table_names)
- rows = []
-
- # Querying tuple stats only works on the primary. Due to load
- # balancing, we need to ensure this query hits the load balancer. The
- # easiest way to do this is to start a transaction.
- ActiveRecord::Base.transaction do
- rows = ActiveRecord::Base.connection.select_all(query)
- end
-
- rows.each_with_object({}) { |row, data| data[row['table_name']] = row['estimate'].to_i }
- rescue *CONNECTION_ERRORS
- {}
- end
-
- # Generates the PostgreSQL query to return the tuples for tables
- # that have been vacuumed or analyzed in the last hour.
- #
- # @param [Array] table names
- # @returns [Hash] Table name to count mapping (e.g. { 'projects' => 5, 'users' => 100 })
- def self.postgresql_estimate_query(table_names)
- time = "to_timestamp(#{1.hour.ago.to_i})"
- <<~SQL
- SELECT pg_class.relname AS table_name, reltuples::bigint AS estimate FROM pg_class
- LEFT JOIN pg_stat_user_tables ON pg_class.relname = pg_stat_user_tables.relname
- WHERE pg_class.relname IN (#{table_names.map { |table| "'#{table}'" }.join(',')})
- AND (last_vacuum > #{time} OR last_autovacuum > #{time} OR last_analyze > #{time} OR last_autoanalyze > #{time})
- SQL
end
end
end
diff --git a/lib/gitlab/database/count/exact_count_strategy.rb b/lib/gitlab/database/count/exact_count_strategy.rb
new file mode 100644
index 00000000000..0276fe2b54f
--- /dev/null
+++ b/lib/gitlab/database/count/exact_count_strategy.rb
@@ -0,0 +1,31 @@
+# frozen_string_literal: true
+
+module Gitlab
+ module Database
+ module Count
+ # This strategy performs an exact count on the model.
+ #
+ # This is guaranteed to be accurate, however it also scans the
+ # whole table. Hence, there are no guarantees with respect
+ # to runtime.
+ #
+ # Note that for very large tables, this may even timeout.
+ class ExactCountStrategy
+ attr_reader :models
+ def initialize(models)
+ @models = models
+ end
+
+ def count
+ models.each_with_object({}) do |model, data|
+ data[model] = model.count
+ end
+ end
+
+ def self.enabled?
+ true
+ end
+ end
+ end
+ end
+end
diff --git a/lib/gitlab/database/count/reltuples_count_strategy.rb b/lib/gitlab/database/count/reltuples_count_strategy.rb
new file mode 100644
index 00000000000..c3a674aeb7e
--- /dev/null
+++ b/lib/gitlab/database/count/reltuples_count_strategy.rb
@@ -0,0 +1,79 @@
+# frozen_string_literal: true
+
+module Gitlab
+ module Database
+ module Count
+ class PgClass < ActiveRecord::Base
+ self.table_name = 'pg_class'
+ end
+
+ # This strategy counts based on PostgreSQL's statistics in pg_stat_user_tables.
+ #
+ # Specifically, it relies on the column reltuples in said table. An additional
+ # check is performed to make sure statistics were updated within the last hour.
+ #
+ # Otherwise, this strategy skips tables with outdated statistics.
+ #
+ # There are no guarantees with respect to the accuracy of this strategy. Runtime
+ # however is guaranteed to be "fast", because it only looks up statistics.
+ class ReltuplesCountStrategy
+ attr_reader :models
+ def initialize(models)
+ @models = models
+ end
+
+ # Returns a hash of the table names that have recently updated tuples.
+ #
+ # @returns [Hash] Table name to count mapping (e.g. { 'projects' => 5, 'users' => 100 })
+ def count
+ size_estimates
+ rescue *CONNECTION_ERRORS
+ {}
+ end
+
+ def self.enabled?
+ Gitlab::Database.postgresql?
+ end
+
+ private
+
+ def table_names
+ models.map(&:table_name)
+ end
+
+ def size_estimates(check_statistics: true)
+ table_to_model = models.each_with_object({}) { |model, h| h[model.table_name] = model }
+
+ # Querying tuple stats only works on the primary. Due to load balancing, the
+ # easiest way to do this is to start a transaction.
+ ActiveRecord::Base.transaction do
+ get_statistics(table_names, check_statistics: check_statistics).each_with_object({}) do |row, data|
+ model = table_to_model[row.table_name]
+ data[model] = row.estimate
+ end
+ end
+ end
+
+ # Generates the PostgreSQL query to return the tuples for tables
+ # that have been vacuumed or analyzed in the last hour.
+ #
+ # @param [Array] table names
+ # @returns [Hash] Table name to count mapping (e.g. { 'projects' => 5, 'users' => 100 })
+ def get_statistics(table_names, check_statistics: true)
+ time = 1.hour.ago
+
+ query = PgClass.joins("LEFT JOIN pg_stat_user_tables USING (relname)")
+ .where(relname: table_names)
+ .select('pg_class.relname AS table_name, reltuples::bigint AS estimate')
+
+ if check_statistics
+ query = query.where('last_vacuum > ? OR last_autovacuum > ? OR last_analyze > ? OR last_autoanalyze > ?',
+ time, time, time, time)
+ end
+
+ query
+ end
+ end
+ end
+ end
+end
diff --git a/lib/gitlab/database/count/tablesample_count_strategy.rb b/lib/gitlab/database/count/tablesample_count_strategy.rb
new file mode 100644
index 00000000000..cf1cf054dbf
--- /dev/null
+++ b/lib/gitlab/database/count/tablesample_count_strategy.rb
@@ -0,0 +1,66 @@
+# frozen_string_literal: true
+
+module Gitlab
+ module Database
+ module Count
+ # A tablesample count executes in two phases:
+ # * Estimate table sizes based on reltuples.
+ # * Based on the estimate:
+ # * If the table is considered 'small', execute an exact relation count.
+ # * Otherwise, count on a sample of the table using TABLESAMPLE.
+ #
+ # The size of the sample is chosen in a way that we always roughly scan
+ # the same amount of rows (see TABLESAMPLE_ROW_TARGET).
+ #
+ # There are no guarantees with respect to the accuracy of the result or runtime.
+ class TablesampleCountStrategy < ReltuplesCountStrategy
+ EXACT_COUNT_THRESHOLD = 10_000
+ TABLESAMPLE_ROW_TARGET = 10_000
+
+ def count
+ estimates = size_estimates(check_statistics: false)
+
+ models.each_with_object({}) do |model, count_by_model|
+ count = perform_count(model, estimates[model])
+ count_by_model[model] = count if count
+ end
+ rescue *CONNECTION_ERRORS
+ {}
+ end
+
+ def self.enabled?
+ Gitlab::Database.postgresql? && Feature.enabled?(:tablesample_counts)
+ end
+
+ private
+
+ def perform_count(model, estimate)
+ # If we estimate 0, we may not have statistics at all. Don't use them.
+ return nil unless estimate && estimate > 0
+
+ if estimate < EXACT_COUNT_THRESHOLD
+ # The table is considered small, the assumption here is that
+ # the exact count will be fast anyways.
+ model.count
+ else
+ # The table is considered large, let's only count on a sample.
+ tablesample_count(model, estimate)
+ end
+ end
+
+ def tablesample_count(model, estimate)
+ portion = (TABLESAMPLE_ROW_TARGET.to_f / estimate).round(4)
+ inverse = 1 / portion
+ query = <<~SQL
+ SELECT (COUNT(*)*#{inverse})::integer AS count
+ FROM #{model.table_name} TABLESAMPLE SYSTEM (#{portion * 100})
+ SQL
+
+ rows = ActiveRecord::Base.connection.select_all(query)
+
+ Integer(rows.first['count'])
+ end
+ end
+ end
+ end
+end
diff --git a/spec/lib/gitlab/database/count/exact_count_strategy_spec.rb b/spec/lib/gitlab/database/count/exact_count_strategy_spec.rb
new file mode 100644
index 00000000000..f518bb3dc3e
--- /dev/null
+++ b/spec/lib/gitlab/database/count/exact_count_strategy_spec.rb
@@ -0,0 +1,34 @@
+require 'spec_helper'
+
+describe Gitlab::Database::Count::ExactCountStrategy do
+ before do
+ create_list(:project, 3)
+ create(:identity)
+ end
+
+ let(:models) { [Project, Identity] }
+
+ subject { described_class.new(models).count }
+
+ describe '#count' do
+ it 'counts all models' do
+ expect(models).to all(receive(:count).and_call_original)
+
+ expect(subject).to eq({ Project => 3, Identity => 1 })
+ end
+ end
+
+ describe '.enabled?' do
+ it 'is enabled for PostgreSQL' do
+ allow(Gitlab::Database).to receive(:postgresql?).and_return(true)
+
+ expect(described_class.enabled?).to be_truthy
+ end
+
+ it 'is enabled for MySQL' do
+ allow(Gitlab::Database).to receive(:postgresql?).and_return(false)
+
+ expect(described_class.enabled?).to be_truthy
+ end
+ end
+end
diff --git a/spec/lib/gitlab/database/count/reltuples_count_strategy_spec.rb b/spec/lib/gitlab/database/count/reltuples_count_strategy_spec.rb
new file mode 100644
index 00000000000..b44e8c5a110
--- /dev/null
+++ b/spec/lib/gitlab/database/count/reltuples_count_strategy_spec.rb
@@ -0,0 +1,48 @@
+require 'spec_helper'
+
+describe Gitlab::Database::Count::ReltuplesCountStrategy do
+ before do
+ create_list(:project, 3)
+ create(:identity)
+ end
+
+ let(:models) { [Project, Identity] }
+ subject { described_class.new(models).count }
+
+ describe '#count', :postgresql do
+ context 'when reltuples is up to date' do
+ before do
+ ActiveRecord::Base.connection.execute('ANALYZE projects')
+ ActiveRecord::Base.connection.execute('ANALYZE identities')
+ end
+
+ it 'uses statistics to do the count' do
+ models.each { |model| expect(model).not_to receive(:count) }
+
+ expect(subject).to eq({ Project => 3, Identity => 1 })
+ end
+ end
+
+ context 'insufficient permissions' do
+ it 'returns an empty hash' do
+ allow(ActiveRecord::Base).to receive(:transaction).and_raise(PG::InsufficientPrivilege)
+
+ expect(subject).to eq({})
+ end
+ end
+ end
+
+ describe '.enabled?' do
+ it 'is enabled for PostgreSQL' do
+ allow(Gitlab::Database).to receive(:postgresql?).and_return(true)
+
+ expect(described_class.enabled?).to be_truthy
+ end
+
+ it 'is disabled for MySQL' do
+ allow(Gitlab::Database).to receive(:postgresql?).and_return(false)
+
+ expect(described_class.enabled?).to be_falsey
+ end
+ end
+end
diff --git a/spec/lib/gitlab/database/count/tablesample_count_strategy_spec.rb b/spec/lib/gitlab/database/count/tablesample_count_strategy_spec.rb
new file mode 100644
index 00000000000..203f9344a41
--- /dev/null
+++ b/spec/lib/gitlab/database/count/tablesample_count_strategy_spec.rb
@@ -0,0 +1,65 @@
+require 'spec_helper'
+
+describe Gitlab::Database::Count::TablesampleCountStrategy do
+ before do
+ create_list(:project, 3)
+ create(:identity)
+ end
+
+ let(:models) { [Project, Identity] }
+ let(:strategy) { described_class.new(models) }
+
+ subject { strategy.count }
+
+ describe '#count', :postgresql do
+ let(:estimates) { { Project => threshold + 1, Identity => threshold - 1 } }
+ let(:threshold) { Gitlab::Database::Count::TablesampleCountStrategy::EXACT_COUNT_THRESHOLD }
+
+ before do
+ allow(strategy).to receive(:size_estimates).with(check_statistics: false).and_return(estimates)
+ end
+
+ context 'for tables with an estimated small size' do
+ it 'performs an exact count' do
+ expect(Identity).to receive(:count).and_call_original
+
+ expect(subject).to include({ Identity => 1 })
+ end
+ end
+
+ context 'for tables with an estimated large size' do
+ it 'performs a tablesample count' do
+ expect(Project).not_to receive(:count)
+
+ result = subject
+ expect(result[Project]).to eq(3)
+ end
+ end
+
+ context 'insufficient permissions' do
+ it 'returns an empty hash' do
+ allow(strategy).to receive(:size_estimates).and_raise(PG::InsufficientPrivilege)
+
+ expect(subject).to eq({})
+ end
+ end
+ end
+
+ describe '.enabled?' do
+ before do
+ stub_feature_flags(tablesample_counts: true)
+ end
+
+ it 'is enabled for PostgreSQL' do
+ allow(Gitlab::Database).to receive(:postgresql?).and_return(true)
+
+ expect(described_class.enabled?).to be_truthy
+ end
+
+ it 'is disabled for MySQL' do
+ allow(Gitlab::Database).to receive(:postgresql?).and_return(false)
+
+ expect(described_class.enabled?).to be_falsey
+ end
+ end
+end
diff --git a/spec/lib/gitlab/database/count_spec.rb b/spec/lib/gitlab/database/count_spec.rb
index 407d9470785..1d096b8fa7c 100644
--- a/spec/lib/gitlab/database/count_spec.rb
+++ b/spec/lib/gitlab/database/count_spec.rb
@@ -8,63 +8,51 @@ describe Gitlab::Database::Count do
let(:models) { [Project, Identity] }
- describe '.approximate_counts' do
- context 'with MySQL' do
- context 'when reltuples have not been updated' do
- it 'counts all models the normal way' do
- expect(Gitlab::Database).to receive(:postgresql?).and_return(false)
+ context '.approximate_counts' do
+ context 'selecting strategies' do
+ let(:strategies) { [double('s1', enabled?: true), double('s2', enabled?: false)] }
- expect(Project).to receive(:count).and_call_original
- expect(Identity).to receive(:count).and_call_original
+ it 'uses only enabled strategies' do
+ expect(strategies[0]).to receive(:new).and_return(double('strategy1', count: {}))
+ expect(strategies[1]).not_to receive(:new)
- expect(described_class.approximate_counts(models)).to eq({ Project => 3, Identity => 1 })
- end
+ described_class.approximate_counts(models, strategies: strategies)
end
end
- context 'with PostgreSQL', :postgresql do
- describe 'when reltuples have not been updated' do
- it 'counts all models the normal way' do
- expect(described_class).to receive(:reltuples_from_recently_updated).with(%w(projects identities)).and_return({})
+ context 'fallbacks' do
+ subject { described_class.approximate_counts(models, strategies: strategies) }
- expect(Project).to receive(:count).and_call_original
- expect(Identity).to receive(:count).and_call_original
- expect(described_class.approximate_counts(models)).to eq({ Project => 3, Identity => 1 })
- end
+ let(:strategies) do
+ [
+ double('s1', enabled?: true, new: first_strategy),
+ double('s2', enabled?: true, new: second_strategy)
+ ]
end
- describe 'no permission' do
- it 'falls back to standard query' do
- allow(described_class).to receive(:postgresql_estimate_query).and_raise(PG::InsufficientPrivilege)
+ let(:first_strategy) { double('first strategy', count: {}) }
+ let(:second_strategy) { double('second strategy', count: {}) }
- expect(Project).to receive(:count).and_call_original
- expect(Identity).to receive(:count).and_call_original
- expect(described_class.approximate_counts(models)).to eq({ Project => 3, Identity => 1 })
- end
+ it 'gets results from first strategy' do
+ expect(strategies[0]).to receive(:new).with(models).and_return(first_strategy)
+ expect(first_strategy).to receive(:count)
+
+ subject
end
- describe 'when some reltuples have been updated' do
- it 'counts projects in the fast way' do
- expect(described_class).to receive(:reltuples_from_recently_updated).with(%w(projects identities)).and_return({ 'projects' => 3 })
+ it 'gets more results from second strategy if some counts are missing' do
+ expect(first_strategy).to receive(:count).and_return({ Project => 3 })
+ expect(strategies[1]).to receive(:new).with([Identity]).and_return(second_strategy)
+ expect(second_strategy).to receive(:count).and_return({ Identity => 1 })
- expect(Project).not_to receive(:count).and_call_original
- expect(Identity).to receive(:count).and_call_original
- expect(described_class.approximate_counts(models)).to eq({ Project => 3, Identity => 1 })
- end
+ expect(subject).to eq({ Project => 3, Identity => 1 })
end
- describe 'when all reltuples have been updated' do
- before do
- ActiveRecord::Base.connection.execute('ANALYZE projects')
- ActiveRecord::Base.connection.execute('ANALYZE identities')
- end
-
- it 'counts models with the standard way' do
- expect(Project).not_to receive(:count)
- expect(Identity).not_to receive(:count)
+ it 'does not get more results as soon as all counts are present' do
+ expect(first_strategy).to receive(:count).and_return({ Project => 3, Identity => 1 })
+ expect(strategies[1]).not_to receive(:new)
- expect(described_class.approximate_counts(models)).to eq({ Project => 3, Identity => 1 })
- end
+ subject
end
end
end