diff options
author | Andreas Brandl <abrandl@gitlab.com> | 2018-10-28 19:16:47 +0100 |
---|---|---|
committer | Andreas Brandl <abrandl@gitlab.com> | 2018-11-27 16:46:55 +0100 |
commit | dc33e6d2af51cadb28fbe5af4a5319f7186e5c88 (patch) | |
tree | 16305a0e192d3fca9773a36f3fcf7d4dec650f84 | |
parent | c60e9837b908771dbc2e382deea9a7e308a7406e (diff) | |
download | gitlab-ce-dc33e6d2af51cadb28fbe5af4a5319f7186e5c88.tar.gz |
Implement TablesampleCountStrategy.
A tablesample count executes in two phases:
* Estimate table sizes based on reltuples.
* Based on the estimate:
* If the table is considered 'small', execute an exact relation count.
* Otherwise, count on a sample of the table using TABLESAMPLE.
The size of the sample is chosen in a way that we always roughly scan
the same amount of rows (see TABLESAMPLE_ROW_TARGET).
-rw-r--r-- | lib/gitlab/database/count.rb | 128 | ||||
-rw-r--r-- | spec/lib/gitlab/database/count_spec.rb | 53 |
2 files changed, 161 insertions, 20 deletions
diff --git a/lib/gitlab/database/count.rb b/lib/gitlab/database/count.rb index e244bf40bbf..01fabcced65 100644 --- a/lib/gitlab/database/count.rb +++ b/lib/gitlab/database/count.rb @@ -1,7 +1,12 @@ # frozen_string_literal: true # For large tables, PostgreSQL can take a long time to count rows due to MVCC. -# We can optimize this by using the reltuples count as described in https://wiki.postgresql.org/wiki/Slow_Counting. +# We can optimize this by using various strategies for approximate counting. +# +# For example, we can use the reltuples count as described in https://wiki.postgresql.org/wiki/Slow_Counting. +# +# However, since statistics are not always up to date, we also implement a table sampling strategy +# that performs an exact count but only on a sample of the table. See TablesampleCountStrategy. module Gitlab module Database module Count @@ -20,12 +25,17 @@ module Gitlab end # Takes in an array of models and returns a Hash for the approximate - # counts for them. If the model's table has not been vacuumed or - # analyzed recently, simply run the Model.count to get the data. + # counts for them. + # + # Various count strategies can be specified that are executed in + # sequence until all tables have an approximate count attached + # or we run out of strategies. + # + # Note that not all strategies are available on all supported RDBMS. # # @param [Array] # @return [Hash] of Model -> count mapping - def self.approximate_counts(models, strategies: [ReltuplesCountStrategy, ExactCountStrategy]) + def self.approximate_counts(models, strategies: [TablesampleCountStrategy, ReltuplesCountStrategy, ExactCountStrategy]) strategies.each_with_object({}) do |strategy, counts_by_model| if strategy.enabled? models_with_missing_counts = models - counts_by_model.keys @@ -41,6 +51,13 @@ module Gitlab end end + # This strategy performs an exact count on the model. + # + # This is guaranteed to be accurate, however it also scans the + # whole table. Hence, there are no guarantees with respect + # to runtime. + # + # Note that for very large tables, this may even timeout. class ExactCountStrategy attr_reader :models def initialize(models) @@ -58,6 +75,15 @@ module Gitlab end end + # This strategy counts based on PostgreSQL's statistics in pg_stat_user_tables. + # + # Specifically, it relies on the column reltuples in said table. An additional + # check is performed to make sure statistics were updated within the last hour. + # + # Otherwise, this strategy skips tables with outdated statistics. + # + # There are no guarantees with respect to the accuracy of this strategy. Runtime + # however is guaranteed to be "fast", because it only looks up statistics. class ReltuplesCountStrategy attr_reader :models def initialize(models) @@ -68,7 +94,23 @@ module Gitlab # # @returns [Hash] Table name to count mapping (e.g. { 'projects' => 5, 'users' => 100 }) def count - query = postgresql_estimate_query(table_names) + size_estimates + rescue *CONNECTION_ERRORS + {} + end + + def self.enabled? + Gitlab::Database.postgresql? + end + + private + + def table_names + models.map(&:table_name) + end + + def size_estimates(check_statistics: true) + query = postgresql_estimate_query(table_names, check_statistics: check_statistics) rows = [] # Querying tuple stats only works on the primary. Due to load @@ -83,18 +125,6 @@ module Gitlab model = table_to_model[row['table_name']] data[model] = row['estimate'].to_i end - rescue *CONNECTION_ERRORS => e - {} - end - - def self.enabled? - Gitlab::Database.postgresql? - end - - private - - def table_names - models.map(&:table_name) end # Generates the PostgreSQL query to return the tuples for tables @@ -102,14 +132,72 @@ module Gitlab # # @param [Array] table names # @returns [Hash] Table name to count mapping (e.g. { 'projects' => 5, 'users' => 100 }) - def postgresql_estimate_query(table_names) + def postgresql_estimate_query(table_names, check_statistics: true) time = "to_timestamp(#{1.hour.ago.to_i})" - <<~SQL + base_query = <<~SQL SELECT pg_class.relname AS table_name, reltuples::bigint AS estimate FROM pg_class LEFT JOIN pg_stat_user_tables ON pg_class.relname = pg_stat_user_tables.relname WHERE pg_class.relname IN (#{table_names.map { |table| "'#{table}'" }.join(',')}) - AND (last_vacuum > #{time} OR last_autovacuum > #{time} OR last_analyze > #{time} OR last_autoanalyze > #{time}) SQL + if check_statistics + base_query + "AND (last_vacuum > #{time} OR last_autovacuum > #{time} OR last_analyze > #{time} OR last_autoanalyze > #{time})" + else + base_query + end + end + end + + # A tablesample count executes in two phases: + # * Estimate table sizes based on reltuples. + # * Based on the estimate: + # * If the table is considered 'small', execute an exact relation count. + # * Otherwise, count on a sample of the table using TABLESAMPLE. + # + # The size of the sample is chosen in a way that we always roughly scan + # the same amount of rows (see TABLESAMPLE_ROW_TARGET). + # + # There are no guarantees with respect to the accuracy of the result or runtime. + class TablesampleCountStrategy < ReltuplesCountStrategy + EXACT_COUNT_THRESHOLD = 100_000 + TABLESAMPLE_ROW_TARGET = 100_000 + + def count + estimates = size_estimates(check_statistics: false) + + models.each_with_object({}) do |model, count_by_model| + count = perform_count(model, estimates[model]) + count_by_model[model] = count if count + end + rescue *CONNECTION_ERRORS + {} + end + + private + def perform_count(model, estimate) + # If we estimate 0, we may not have statistics at all. Don't use them. + return nil unless estimate && estimate > 0 + + if estimate < EXACT_COUNT_THRESHOLD + # The table is considered small, the assumption here is that + # the exact count will be fast anyways. + model.count + else + # The table is considered large, let's only count on a sample. + tablesample_count(model, estimate) + end + end + + def tablesample_count(model, estimate) + portion = (TABLESAMPLE_ROW_TARGET.to_f / estimate).round(4) + inverse = 1/portion + query = <<~SQL + SELECT (COUNT(*)*#{inverse})::integer AS count + FROM #{model.table_name} TABLESAMPLE SYSTEM (#{portion*100}) + SQL + + rows = ActiveRecord::Base.connection.select_all(query) + + Integer(rows.first['count']) end end end diff --git a/spec/lib/gitlab/database/count_spec.rb b/spec/lib/gitlab/database/count_spec.rb index 699422abb6b..d6668cee23e 100644 --- a/spec/lib/gitlab/database/count_spec.rb +++ b/spec/lib/gitlab/database/count_spec.rb @@ -123,4 +123,57 @@ describe Gitlab::Database::Count do end end end + + describe Gitlab::Database::Count::TablesampleCountStrategy do + subject { strategy.count } + let(:strategy) { described_class.new(models) } + + describe '#count' do + let(:estimates) { { Project => threshold + 1, Identity => threshold - 1 } } + let(:threshold) { Gitlab::Database::Count::TablesampleCountStrategy::EXACT_COUNT_THRESHOLD } + + before do + allow(strategy).to receive(:size_estimates).with(check_statistics: false).and_return(estimates) + end + + context 'for tables with an estimated small size' do + it 'performs an exact count' do + expect(Identity).to receive(:count).and_call_original + + expect(subject).to include({ Identity => 1 }) + end + end + + context 'for tables with an estimated large size' do + it 'performs a tablesample count' do + expect(Project).not_to receive(:count) + + result = subject + expect(result[Project]).to eq(3) + end + end + + context 'insufficient permissions' do + it 'returns an empty hash' do + allow(strategy).to receive(:size_estimates).and_raise(PG::InsufficientPrivilege) + + expect(subject).to eq({}) + end + end + end + + describe '.enabled?' do + it 'is enabled for PostgreSQL' do + allow(Gitlab::Database).to receive(:postgresql?).and_return(true) + + expect(described_class.enabled?).to be_truthy + end + + it 'is disabled for MySQL' do + allow(Gitlab::Database).to receive(:postgresql?).and_return(false) + + expect(described_class.enabled?).to be_falsey + end + end + end end |