diff options
author | Stan Hu <stanhu@gmail.com> | 2018-12-04 07:10:11 +0000 |
---|---|---|
committer | Stan Hu <stanhu@gmail.com> | 2018-12-04 07:10:11 +0000 |
commit | 83f0798e7dc588f0e4cb6816daadeef7dbfc8b81 (patch) | |
tree | 13f1d960da531496f369126381385a9ee36c19ba | |
parent | eeb0e98df8f038a6498f6388c515fa7d3c136ee4 (diff) | |
parent | 9c059a026d7801b144a1f4888adc414d428c2a78 (diff) | |
download | gitlab-ce-83f0798e7dc588f0e4cb6816daadeef7dbfc8b81.tar.gz |
Merge branch 'ab-approximate-counts' into 'master'
Approximate table counts based on TABLESAMPLE
See merge request gitlab-org/gitlab-ce!22650
-rw-r--r-- | changelogs/unreleased/ab-approximate-counts.yml | 5 | ||||
-rw-r--r-- | lib/gitlab/database/count.rb | 79 | ||||
-rw-r--r-- | lib/gitlab/database/count/exact_count_strategy.rb | 31 | ||||
-rw-r--r-- | lib/gitlab/database/count/reltuples_count_strategy.rb | 79 | ||||
-rw-r--r-- | lib/gitlab/database/count/tablesample_count_strategy.rb | 66 | ||||
-rw-r--r-- | spec/lib/gitlab/database/count/exact_count_strategy_spec.rb | 34 | ||||
-rw-r--r-- | spec/lib/gitlab/database/count/reltuples_count_strategy_spec.rb | 48 | ||||
-rw-r--r-- | spec/lib/gitlab/database/count/tablesample_count_strategy_spec.rb | 65 | ||||
-rw-r--r-- | spec/lib/gitlab/database/count_spec.rb | 72 |
9 files changed, 381 insertions, 98 deletions
diff --git a/changelogs/unreleased/ab-approximate-counts.yml b/changelogs/unreleased/ab-approximate-counts.yml new file mode 100644 index 00000000000..8a67239d031 --- /dev/null +++ b/changelogs/unreleased/ab-approximate-counts.yml @@ -0,0 +1,5 @@ +--- +title: Approximate counting strategy with TABLESAMPLE. +merge_request: 22650 +author: +type: performance diff --git a/lib/gitlab/database/count.rb b/lib/gitlab/database/count.rb index ea6529e2dc4..c996d786909 100644 --- a/lib/gitlab/database/count.rb +++ b/lib/gitlab/database/count.rb @@ -1,7 +1,12 @@ # frozen_string_literal: true # For large tables, PostgreSQL can take a long time to count rows due to MVCC. -# We can optimize this by using the reltuples count as described in https://wiki.postgresql.org/wiki/Slow_Counting. +# We can optimize this by using various strategies for approximate counting. +# +# For example, we can use the reltuples count as described in https://wiki.postgresql.org/wiki/Slow_Counting. +# +# However, since statistics are not always up to date, we also implement a table sampling strategy +# that performs an exact count but only on a sample of the table. See TablesampleCountStrategy. module Gitlab module Database module Count @@ -20,68 +25,30 @@ module Gitlab end # Takes in an array of models and returns a Hash for the approximate - # counts for them. If the model's table has not been vacuumed or - # analyzed recently, simply run the Model.count to get the data. + # counts for them. + # + # Various count strategies can be specified that are executed in + # sequence until all tables have an approximate count attached + # or we run out of strategies. + # + # Note that not all strategies are available on all supported RDBMS. # # @param [Array] # @return [Hash] of Model -> count mapping - def self.approximate_counts(models) - table_to_model_map = models.each_with_object({}) do |model, hash| - hash[model.table_name] = model - end - - table_names = table_to_model_map.keys - counts_by_table_name = Gitlab::Database.postgresql? ? reltuples_from_recently_updated(table_names) : {} + def self.approximate_counts(models, strategies: [TablesampleCountStrategy, ReltuplesCountStrategy, ExactCountStrategy]) + strategies.each_with_object({}) do |strategy, counts_by_model| + if strategy.enabled? + models_with_missing_counts = models - counts_by_model.keys - # Convert table -> count to Model -> count - counts_by_model = counts_by_table_name.each_with_object({}) do |pair, hash| - model = table_to_model_map[pair.first] - hash[model] = pair.second - end + break if models_with_missing_counts.empty? - missing_tables = table_names - counts_by_table_name.keys + counts = strategy.new(models_with_missing_counts).count - missing_tables.each do |table| - model = table_to_model_map[table] - counts_by_model[model] = model.count + counts.each do |model, count| + counts_by_model[model] = count + end + end end - - counts_by_model - end - - # Returns a hash of the table names that have recently updated tuples. - # - # @param [Array] table names - # @returns [Hash] Table name to count mapping (e.g. { 'projects' => 5, 'users' => 100 }) - def self.reltuples_from_recently_updated(table_names) - query = postgresql_estimate_query(table_names) - rows = [] - - # Querying tuple stats only works on the primary. Due to load - # balancing, we need to ensure this query hits the load balancer. The - # easiest way to do this is to start a transaction. - ActiveRecord::Base.transaction do - rows = ActiveRecord::Base.connection.select_all(query) - end - - rows.each_with_object({}) { |row, data| data[row['table_name']] = row['estimate'].to_i } - rescue *CONNECTION_ERRORS - {} - end - - # Generates the PostgreSQL query to return the tuples for tables - # that have been vacuumed or analyzed in the last hour. - # - # @param [Array] table names - # @returns [Hash] Table name to count mapping (e.g. { 'projects' => 5, 'users' => 100 }) - def self.postgresql_estimate_query(table_names) - time = "to_timestamp(#{1.hour.ago.to_i})" - <<~SQL - SELECT pg_class.relname AS table_name, reltuples::bigint AS estimate FROM pg_class - LEFT JOIN pg_stat_user_tables ON pg_class.relname = pg_stat_user_tables.relname - WHERE pg_class.relname IN (#{table_names.map { |table| "'#{table}'" }.join(',')}) - AND (last_vacuum > #{time} OR last_autovacuum > #{time} OR last_analyze > #{time} OR last_autoanalyze > #{time}) - SQL end end end diff --git a/lib/gitlab/database/count/exact_count_strategy.rb b/lib/gitlab/database/count/exact_count_strategy.rb new file mode 100644 index 00000000000..0276fe2b54f --- /dev/null +++ b/lib/gitlab/database/count/exact_count_strategy.rb @@ -0,0 +1,31 @@ +# frozen_string_literal: true + +module Gitlab + module Database + module Count + # This strategy performs an exact count on the model. + # + # This is guaranteed to be accurate, however it also scans the + # whole table. Hence, there are no guarantees with respect + # to runtime. + # + # Note that for very large tables, this may even timeout. + class ExactCountStrategy + attr_reader :models + def initialize(models) + @models = models + end + + def count + models.each_with_object({}) do |model, data| + data[model] = model.count + end + end + + def self.enabled? + true + end + end + end + end +end diff --git a/lib/gitlab/database/count/reltuples_count_strategy.rb b/lib/gitlab/database/count/reltuples_count_strategy.rb new file mode 100644 index 00000000000..c3a674aeb7e --- /dev/null +++ b/lib/gitlab/database/count/reltuples_count_strategy.rb @@ -0,0 +1,79 @@ +# frozen_string_literal: true + +module Gitlab + module Database + module Count + class PgClass < ActiveRecord::Base + self.table_name = 'pg_class' + end + + # This strategy counts based on PostgreSQL's statistics in pg_stat_user_tables. + # + # Specifically, it relies on the column reltuples in said table. An additional + # check is performed to make sure statistics were updated within the last hour. + # + # Otherwise, this strategy skips tables with outdated statistics. + # + # There are no guarantees with respect to the accuracy of this strategy. Runtime + # however is guaranteed to be "fast", because it only looks up statistics. + class ReltuplesCountStrategy + attr_reader :models + def initialize(models) + @models = models + end + + # Returns a hash of the table names that have recently updated tuples. + # + # @returns [Hash] Table name to count mapping (e.g. { 'projects' => 5, 'users' => 100 }) + def count + size_estimates + rescue *CONNECTION_ERRORS + {} + end + + def self.enabled? + Gitlab::Database.postgresql? + end + + private + + def table_names + models.map(&:table_name) + end + + def size_estimates(check_statistics: true) + table_to_model = models.each_with_object({}) { |model, h| h[model.table_name] = model } + + # Querying tuple stats only works on the primary. Due to load balancing, the + # easiest way to do this is to start a transaction. + ActiveRecord::Base.transaction do + get_statistics(table_names, check_statistics: check_statistics).each_with_object({}) do |row, data| + model = table_to_model[row.table_name] + data[model] = row.estimate + end + end + end + + # Generates the PostgreSQL query to return the tuples for tables + # that have been vacuumed or analyzed in the last hour. + # + # @param [Array] table names + # @returns [Hash] Table name to count mapping (e.g. { 'projects' => 5, 'users' => 100 }) + def get_statistics(table_names, check_statistics: true) + time = 1.hour.ago + + query = PgClass.joins("LEFT JOIN pg_stat_user_tables USING (relname)") + .where(relname: table_names) + .select('pg_class.relname AS table_name, reltuples::bigint AS estimate') + + if check_statistics + query = query.where('last_vacuum > ? OR last_autovacuum > ? OR last_analyze > ? OR last_autoanalyze > ?', + time, time, time, time) + end + + query + end + end + end + end +end diff --git a/lib/gitlab/database/count/tablesample_count_strategy.rb b/lib/gitlab/database/count/tablesample_count_strategy.rb new file mode 100644 index 00000000000..cf1cf054dbf --- /dev/null +++ b/lib/gitlab/database/count/tablesample_count_strategy.rb @@ -0,0 +1,66 @@ +# frozen_string_literal: true + +module Gitlab + module Database + module Count + # A tablesample count executes in two phases: + # * Estimate table sizes based on reltuples. + # * Based on the estimate: + # * If the table is considered 'small', execute an exact relation count. + # * Otherwise, count on a sample of the table using TABLESAMPLE. + # + # The size of the sample is chosen in a way that we always roughly scan + # the same amount of rows (see TABLESAMPLE_ROW_TARGET). + # + # There are no guarantees with respect to the accuracy of the result or runtime. + class TablesampleCountStrategy < ReltuplesCountStrategy + EXACT_COUNT_THRESHOLD = 10_000 + TABLESAMPLE_ROW_TARGET = 10_000 + + def count + estimates = size_estimates(check_statistics: false) + + models.each_with_object({}) do |model, count_by_model| + count = perform_count(model, estimates[model]) + count_by_model[model] = count if count + end + rescue *CONNECTION_ERRORS + {} + end + + def self.enabled? + Gitlab::Database.postgresql? && Feature.enabled?(:tablesample_counts) + end + + private + + def perform_count(model, estimate) + # If we estimate 0, we may not have statistics at all. Don't use them. + return nil unless estimate && estimate > 0 + + if estimate < EXACT_COUNT_THRESHOLD + # The table is considered small, the assumption here is that + # the exact count will be fast anyways. + model.count + else + # The table is considered large, let's only count on a sample. + tablesample_count(model, estimate) + end + end + + def tablesample_count(model, estimate) + portion = (TABLESAMPLE_ROW_TARGET.to_f / estimate).round(4) + inverse = 1 / portion + query = <<~SQL + SELECT (COUNT(*)*#{inverse})::integer AS count + FROM #{model.table_name} TABLESAMPLE SYSTEM (#{portion * 100}) + SQL + + rows = ActiveRecord::Base.connection.select_all(query) + + Integer(rows.first['count']) + end + end + end + end +end diff --git a/spec/lib/gitlab/database/count/exact_count_strategy_spec.rb b/spec/lib/gitlab/database/count/exact_count_strategy_spec.rb new file mode 100644 index 00000000000..f518bb3dc3e --- /dev/null +++ b/spec/lib/gitlab/database/count/exact_count_strategy_spec.rb @@ -0,0 +1,34 @@ +require 'spec_helper' + +describe Gitlab::Database::Count::ExactCountStrategy do + before do + create_list(:project, 3) + create(:identity) + end + + let(:models) { [Project, Identity] } + + subject { described_class.new(models).count } + + describe '#count' do + it 'counts all models' do + expect(models).to all(receive(:count).and_call_original) + + expect(subject).to eq({ Project => 3, Identity => 1 }) + end + end + + describe '.enabled?' do + it 'is enabled for PostgreSQL' do + allow(Gitlab::Database).to receive(:postgresql?).and_return(true) + + expect(described_class.enabled?).to be_truthy + end + + it 'is enabled for MySQL' do + allow(Gitlab::Database).to receive(:postgresql?).and_return(false) + + expect(described_class.enabled?).to be_truthy + end + end +end diff --git a/spec/lib/gitlab/database/count/reltuples_count_strategy_spec.rb b/spec/lib/gitlab/database/count/reltuples_count_strategy_spec.rb new file mode 100644 index 00000000000..b44e8c5a110 --- /dev/null +++ b/spec/lib/gitlab/database/count/reltuples_count_strategy_spec.rb @@ -0,0 +1,48 @@ +require 'spec_helper' + +describe Gitlab::Database::Count::ReltuplesCountStrategy do + before do + create_list(:project, 3) + create(:identity) + end + + let(:models) { [Project, Identity] } + subject { described_class.new(models).count } + + describe '#count', :postgresql do + context 'when reltuples is up to date' do + before do + ActiveRecord::Base.connection.execute('ANALYZE projects') + ActiveRecord::Base.connection.execute('ANALYZE identities') + end + + it 'uses statistics to do the count' do + models.each { |model| expect(model).not_to receive(:count) } + + expect(subject).to eq({ Project => 3, Identity => 1 }) + end + end + + context 'insufficient permissions' do + it 'returns an empty hash' do + allow(ActiveRecord::Base).to receive(:transaction).and_raise(PG::InsufficientPrivilege) + + expect(subject).to eq({}) + end + end + end + + describe '.enabled?' do + it 'is enabled for PostgreSQL' do + allow(Gitlab::Database).to receive(:postgresql?).and_return(true) + + expect(described_class.enabled?).to be_truthy + end + + it 'is disabled for MySQL' do + allow(Gitlab::Database).to receive(:postgresql?).and_return(false) + + expect(described_class.enabled?).to be_falsey + end + end +end diff --git a/spec/lib/gitlab/database/count/tablesample_count_strategy_spec.rb b/spec/lib/gitlab/database/count/tablesample_count_strategy_spec.rb new file mode 100644 index 00000000000..203f9344a41 --- /dev/null +++ b/spec/lib/gitlab/database/count/tablesample_count_strategy_spec.rb @@ -0,0 +1,65 @@ +require 'spec_helper' + +describe Gitlab::Database::Count::TablesampleCountStrategy do + before do + create_list(:project, 3) + create(:identity) + end + + let(:models) { [Project, Identity] } + let(:strategy) { described_class.new(models) } + + subject { strategy.count } + + describe '#count', :postgresql do + let(:estimates) { { Project => threshold + 1, Identity => threshold - 1 } } + let(:threshold) { Gitlab::Database::Count::TablesampleCountStrategy::EXACT_COUNT_THRESHOLD } + + before do + allow(strategy).to receive(:size_estimates).with(check_statistics: false).and_return(estimates) + end + + context 'for tables with an estimated small size' do + it 'performs an exact count' do + expect(Identity).to receive(:count).and_call_original + + expect(subject).to include({ Identity => 1 }) + end + end + + context 'for tables with an estimated large size' do + it 'performs a tablesample count' do + expect(Project).not_to receive(:count) + + result = subject + expect(result[Project]).to eq(3) + end + end + + context 'insufficient permissions' do + it 'returns an empty hash' do + allow(strategy).to receive(:size_estimates).and_raise(PG::InsufficientPrivilege) + + expect(subject).to eq({}) + end + end + end + + describe '.enabled?' do + before do + stub_feature_flags(tablesample_counts: true) + end + + it 'is enabled for PostgreSQL' do + allow(Gitlab::Database).to receive(:postgresql?).and_return(true) + + expect(described_class.enabled?).to be_truthy + end + + it 'is disabled for MySQL' do + allow(Gitlab::Database).to receive(:postgresql?).and_return(false) + + expect(described_class.enabled?).to be_falsey + end + end +end diff --git a/spec/lib/gitlab/database/count_spec.rb b/spec/lib/gitlab/database/count_spec.rb index 407d9470785..1d096b8fa7c 100644 --- a/spec/lib/gitlab/database/count_spec.rb +++ b/spec/lib/gitlab/database/count_spec.rb @@ -8,63 +8,51 @@ describe Gitlab::Database::Count do let(:models) { [Project, Identity] } - describe '.approximate_counts' do - context 'with MySQL' do - context 'when reltuples have not been updated' do - it 'counts all models the normal way' do - expect(Gitlab::Database).to receive(:postgresql?).and_return(false) + context '.approximate_counts' do + context 'selecting strategies' do + let(:strategies) { [double('s1', enabled?: true), double('s2', enabled?: false)] } - expect(Project).to receive(:count).and_call_original - expect(Identity).to receive(:count).and_call_original + it 'uses only enabled strategies' do + expect(strategies[0]).to receive(:new).and_return(double('strategy1', count: {})) + expect(strategies[1]).not_to receive(:new) - expect(described_class.approximate_counts(models)).to eq({ Project => 3, Identity => 1 }) - end + described_class.approximate_counts(models, strategies: strategies) end end - context 'with PostgreSQL', :postgresql do - describe 'when reltuples have not been updated' do - it 'counts all models the normal way' do - expect(described_class).to receive(:reltuples_from_recently_updated).with(%w(projects identities)).and_return({}) + context 'fallbacks' do + subject { described_class.approximate_counts(models, strategies: strategies) } - expect(Project).to receive(:count).and_call_original - expect(Identity).to receive(:count).and_call_original - expect(described_class.approximate_counts(models)).to eq({ Project => 3, Identity => 1 }) - end + let(:strategies) do + [ + double('s1', enabled?: true, new: first_strategy), + double('s2', enabled?: true, new: second_strategy) + ] end - describe 'no permission' do - it 'falls back to standard query' do - allow(described_class).to receive(:postgresql_estimate_query).and_raise(PG::InsufficientPrivilege) + let(:first_strategy) { double('first strategy', count: {}) } + let(:second_strategy) { double('second strategy', count: {}) } - expect(Project).to receive(:count).and_call_original - expect(Identity).to receive(:count).and_call_original - expect(described_class.approximate_counts(models)).to eq({ Project => 3, Identity => 1 }) - end + it 'gets results from first strategy' do + expect(strategies[0]).to receive(:new).with(models).and_return(first_strategy) + expect(first_strategy).to receive(:count) + + subject end - describe 'when some reltuples have been updated' do - it 'counts projects in the fast way' do - expect(described_class).to receive(:reltuples_from_recently_updated).with(%w(projects identities)).and_return({ 'projects' => 3 }) + it 'gets more results from second strategy if some counts are missing' do + expect(first_strategy).to receive(:count).and_return({ Project => 3 }) + expect(strategies[1]).to receive(:new).with([Identity]).and_return(second_strategy) + expect(second_strategy).to receive(:count).and_return({ Identity => 1 }) - expect(Project).not_to receive(:count).and_call_original - expect(Identity).to receive(:count).and_call_original - expect(described_class.approximate_counts(models)).to eq({ Project => 3, Identity => 1 }) - end + expect(subject).to eq({ Project => 3, Identity => 1 }) end - describe 'when all reltuples have been updated' do - before do - ActiveRecord::Base.connection.execute('ANALYZE projects') - ActiveRecord::Base.connection.execute('ANALYZE identities') - end - - it 'counts models with the standard way' do - expect(Project).not_to receive(:count) - expect(Identity).not_to receive(:count) + it 'does not get more results as soon as all counts are present' do + expect(first_strategy).to receive(:count).and_return({ Project => 3, Identity => 1 }) + expect(strategies[1]).not_to receive(:new) - expect(described_class.approximate_counts(models)).to eq({ Project => 3, Identity => 1 }) - end + subject end end end |