Implement TablesampleCountStrategy.

A tablesample count executes in two phases: * Estimate table sizes based on reltuples. * Based on the estimate: * If the table is considered 'small', execute an exact relation count. * Otherwise, count on a sample of the table using TABLESAMPLE. The size of the sample is chosen in a way that we always roughly scan the same amount of rows (see TABLESAMPLE_ROW_TARGET).
author: Andreas Brandl <abrandl@gitlab.com> 2018-10-28 19:16:47 +0100
committer: Andreas Brandl <abrandl@gitlab.com> 2018-11-27 16:46:55 +0100
commit: dc33e6d2af51cadb28fbe5af4a5319f7186e5c88 (patch)
tree: 16305a0e192d3fca9773a36f3fcf7d4dec650f84
parent: c60e9837b908771dbc2e382deea9a7e308a7406e (diff)
download: gitlab-ce-dc33e6d2af51cadb28fbe5af4a5319f7186e5c88.tar.gz
2 files changed, 161 insertions, 20 deletions
diff --git a/lib/gitlab/database/count.rb b/lib/gitlab/database/count.rb
index e244bf40bbf..01fabcced65 100644
--- a/lib/gitlab/database/count.rb
+++ b/lib/gitlab/database/count.rb
@@ -1,7 +1,12 @@
 # frozen_string_literal: true
 
 # For large tables, PostgreSQL can take a long time to count rows due to MVCC.
-# We can optimize this by using the reltuples count as described in https://wiki.postgresql.org/wiki/Slow_Counting.
+# We can optimize this by using various strategies for approximate counting.
+#
+# For example, we can use the reltuples count as described in https://wiki.postgresql.org/wiki/Slow_Counting.
+#
+# However, since statistics are not always up to date, we also implement a table sampling strategy
+# that performs an exact count but only on a sample of the table. See TablesampleCountStrategy.
 module Gitlab
   module Database
     module Count
@@ -20,12 +25,17 @@ module Gitlab
         end
 
       # Takes in an array of models and returns a Hash for the approximate
-      # counts for them.  If the model's table has not been vacuumed or
-      # analyzed recently, simply run the Model.count to get the data.
+      # counts for them.
+      #
+      # Various count strategies can be specified that are executed in
+      # sequence until all tables have an approximate count attached
+      # or we run out of strategies.
+      #
+      # Note that not all strategies are available on all supported RDBMS.
       #
       # @param [Array]
       # @return [Hash] of Model -> count mapping
-      def self.approximate_counts(models, strategies: [ReltuplesCountStrategy, ExactCountStrategy])
+      def self.approximate_counts(models, strategies: [TablesampleCountStrategy, ReltuplesCountStrategy, ExactCountStrategy])
         strategies.each_with_object({}) do |strategy, counts_by_model|
           if strategy.enabled?
             models_with_missing_counts = models - counts_by_model.keys
@@ -41,6 +51,13 @@ module Gitlab
         end
       end
 
+      # This strategy performs an exact count on the model.
+      #
+      # This is guaranteed to be accurate, however it also scans the
+      # whole table. Hence, there are no guarantees with respect
+      # to runtime.
+      #
+      # Note that for very large tables, this may even timeout.
       class ExactCountStrategy
         attr_reader :models
         def initialize(models)
@@ -58,6 +75,15 @@ module Gitlab
         end
       end
 
+      # This strategy counts based on PostgreSQL's statistics in pg_stat_user_tables.
+      #
+      # Specifically, it relies on the column reltuples in said table. An additional
+      # check is performed to make sure statistics were updated within the last hour.
+      #
+      # Otherwise, this strategy skips tables with outdated statistics.
+      #
+      # There are no guarantees with respect to the accuracy of this strategy. Runtime
+      # however is guaranteed to be "fast", because it only looks up statistics.
       class ReltuplesCountStrategy
         attr_reader :models
         def initialize(models)
@@ -68,7 +94,23 @@ module Gitlab
         #
         # @returns [Hash] Table name to count mapping (e.g. { 'projects' => 5, 'users' => 100 })
         def count
-          query = postgresql_estimate_query(table_names)
+          size_estimates
+        rescue *CONNECTION_ERRORS
+          {}
+        end
+
+        def self.enabled?
+          Gitlab::Database.postgresql?
+        end
+
+        private
+
+        def table_names
+          models.map(&:table_name)
+        end
+
+        def size_estimates(check_statistics: true)
+          query = postgresql_estimate_query(table_names, check_statistics: check_statistics)
           rows = []
 
           # Querying tuple stats only works on the primary. Due to load
@@ -83,18 +125,6 @@ module Gitlab
             model = table_to_model[row['table_name']]
             data[model] = row['estimate'].to_i
           end
-        rescue *CONNECTION_ERRORS => e
-          {}
-        end
-
-        def self.enabled?
-          Gitlab::Database.postgresql?
-        end
-
-        private
-
-        def table_names
-          models.map(&:table_name)
         end
 
         # Generates the PostgreSQL query to return the tuples for tables
@@ -102,14 +132,72 @@ module Gitlab
         #
         # @param [Array] table names
         # @returns [Hash] Table name to count mapping (e.g. { 'projects' => 5, 'users' => 100 })
-        def postgresql_estimate_query(table_names)
+        def postgresql_estimate_query(table_names, check_statistics: true)
           time = "to_timestamp(#{1.hour.ago.to_i})"
-          <<~SQL
+          base_query = <<~SQL
           SELECT pg_class.relname AS table_name, reltuples::bigint AS estimate FROM pg_class
           LEFT JOIN pg_stat_user_tables ON pg_class.relname = pg_stat_user_tables.relname
           WHERE pg_class.relname IN (#{table_names.map { |table| "'#{table}'" }.join(',')})
-          AND (last_vacuum > #{time} OR last_autovacuum > #{time} OR last_analyze > #{time} OR last_autoanalyze > #{time})
           SQL
+          if check_statistics
+            base_query + "AND (last_vacuum > #{time} OR last_autovacuum > #{time} OR last_analyze > #{time} OR last_autoanalyze > #{time})"
+          else
+            base_query
+          end
+        end
+      end
+
+      # A tablesample count executes in two phases:
+      # * Estimate table sizes based on reltuples.
+      # * Based on the estimate:
+      #   * If the table is considered 'small', execute an exact relation count.
+      #   * Otherwise, count on a sample of the table using TABLESAMPLE.
+      #
+      # The size of the sample is chosen in a way that we always roughly scan
+      # the same amount of rows (see TABLESAMPLE_ROW_TARGET).
+      #
+      # There are no guarantees with respect to the accuracy of the result or runtime.
+      class TablesampleCountStrategy < ReltuplesCountStrategy
+        EXACT_COUNT_THRESHOLD = 100_000
+        TABLESAMPLE_ROW_TARGET = 100_000
+
+        def count
+          estimates = size_estimates(check_statistics: false)
+
+          models.each_with_object({}) do |model, count_by_model|
+            count = perform_count(model, estimates[model])
+            count_by_model[model] = count if count
+          end
+        rescue *CONNECTION_ERRORS
+          {}
+        end
+
+        private
+        def perform_count(model, estimate)
+          # If we estimate 0, we may not have statistics at all. Don't use them.
+          return nil unless estimate && estimate > 0
+
+          if estimate < EXACT_COUNT_THRESHOLD
+            # The table is considered small, the assumption here is that
+            # the exact count will be fast anyways.
+            model.count
+          else
+            # The table is considered large, let's only count on a sample.
+            tablesample_count(model, estimate)
+          end
+        end
+
+        def tablesample_count(model, estimate)
+          portion = (TABLESAMPLE_ROW_TARGET.to_f / estimate).round(4)
+          inverse = 1/portion
+          query = <<~SQL
+            SELECT (COUNT(*)*#{inverse})::integer AS count
+            FROM #{model.table_name} TABLESAMPLE SYSTEM (#{portion*100})
+          SQL
+
+          rows = ActiveRecord::Base.connection.select_all(query)
+
+          Integer(rows.first['count'])
         end
       end
     end
diff --git a/spec/lib/gitlab/database/count_spec.rb b/spec/lib/gitlab/database/count_spec.rb
index 699422abb6b..d6668cee23e 100644
--- a/spec/lib/gitlab/database/count_spec.rb
+++ b/spec/lib/gitlab/database/count_spec.rb
@@ -123,4 +123,57 @@ describe Gitlab::Database::Count do
       end
     end
   end
+
+  describe Gitlab::Database::Count::TablesampleCountStrategy do
+    subject { strategy.count }
+    let(:strategy) { described_class.new(models) }
+
+    describe '#count' do
+      let(:estimates) { { Project => threshold + 1, Identity => threshold - 1 } }
+      let(:threshold) { Gitlab::Database::Count::TablesampleCountStrategy::EXACT_COUNT_THRESHOLD }
+
+      before do
+        allow(strategy).to receive(:size_estimates).with(check_statistics: false).and_return(estimates)
+      end
+
+      context 'for tables with an estimated small size' do
+        it 'performs an exact count' do
+          expect(Identity).to receive(:count).and_call_original
+
+          expect(subject).to include({ Identity => 1 })
+        end
+      end
+
+      context 'for tables with an estimated large size' do
+        it 'performs a tablesample count' do
+          expect(Project).not_to receive(:count)
+
+          result = subject
+          expect(result[Project]).to eq(3)
+        end
+      end
+
+      context 'insufficient permissions' do
+        it 'returns an empty hash' do
+          allow(strategy).to receive(:size_estimates).and_raise(PG::InsufficientPrivilege)
+
+          expect(subject).to eq({})
+        end
+      end
+    end
+
+    describe '.enabled?' do
+      it 'is enabled for PostgreSQL' do
+        allow(Gitlab::Database).to receive(:postgresql?).and_return(true)
+
+        expect(described_class.enabled?).to be_truthy
+      end
+
+      it 'is disabled for MySQL' do
+        allow(Gitlab::Database).to receive(:postgresql?).and_return(false)
+
+        expect(described_class.enabled?).to be_falsey
+      end
+    end
+  end
 end
author	Andreas Brandl <abrandl@gitlab.com>	2018-10-28 19:16:47 +0100
committer	Andreas Brandl <abrandl@gitlab.com>	2018-11-27 16:46:55 +0100
commit	dc33e6d2af51cadb28fbe5af4a5319f7186e5c88 (patch)
tree	16305a0e192d3fca9773a36f3fcf7d4dec650f84
parent	c60e9837b908771dbc2e382deea9a7e308a7406e (diff)
download	gitlab-ce-dc33e6d2af51cadb28fbe5af4a5319f7186e5c88.tar.gz