1 files changed, 23 insertions, 56 deletions
diff --git a/lib/gitlab/database/count.rb b/lib/gitlab/database/count.rb
index ea6529e2dc4..f3d37ccd72a 100644
--- a/lib/gitlab/database/count.rb
+++ b/lib/gitlab/database/count.rb
@@ -1,7 +1,12 @@
 # frozen_string_literal: true
 
 # For large tables, PostgreSQL can take a long time to count rows due to MVCC.
-# We can optimize this by using the reltuples count as described in https://wiki.postgresql.org/wiki/Slow_Counting.
+# We can optimize this by using various strategies for approximate counting.
+#
+# For example, we can use the reltuples count as described in https://wiki.postgresql.org/wiki/Slow_Counting.
+#
+# However, since statistics are not always up to date, we also implement a table sampling strategy
+# that performs an exact count but only on a sample of the table. See TablesampleCountStrategy.
 module Gitlab
   module Database
     module Count
@@ -20,68 +25,30 @@ module Gitlab
         end
 
       # Takes in an array of models and returns a Hash for the approximate
-      # counts for them.  If the model's table has not been vacuumed or
-      # analyzed recently, simply run the Model.count to get the data.
+      # counts for them.
+      #
+      # Various count strategies can be specified that are executed in
+      # sequence until all tables have an approximate count attached
+      # or we run out of strategies.
+      #
+      # Note that not all strategies are available on all supported RDBMS.
       #
       # @param [Array]
       # @return [Hash] of Model -> count mapping
-      def self.approximate_counts(models)
-        table_to_model_map = models.each_with_object({}) do |model, hash|
-          hash[model.table_name] = model
-        end
-
-        table_names = table_to_model_map.keys
-        counts_by_table_name = Gitlab::Database.postgresql? ? reltuples_from_recently_updated(table_names) : {}
+      def self.approximate_counts(models, strategies: [TablesampleCountStrategy, ReltuplesCountStrategy, ExactCountStrategy])
+        strategies.each_with_object({}) do |strategy, counts_by_model|
+          if strategy.enabled?
+            models_with_missing_counts = models - counts_by_model.keys
 
-        # Convert table -> count to Model -> count
-        counts_by_model = counts_by_table_name.each_with_object({}) do |pair, hash|
-          model = table_to_model_map[pair.first]
-          hash[model] = pair.second
-        end
+            break counts_by_model if models_with_missing_counts.empty?
 
-        missing_tables = table_names - counts_by_table_name.keys
+            counts = strategy.new(models_with_missing_counts).count
 
-        missing_tables.each do |table|
-          model = table_to_model_map[table]
-          counts_by_model[model] = model.count
+            counts.each do |model, count|
+              counts_by_model[model] = count
+            end
+          end
         end
-
-        counts_by_model
-      end
-
-      # Returns a hash of the table names that have recently updated tuples.
-      #
-      # @param [Array] table names
-      # @returns [Hash] Table name to count mapping (e.g. { 'projects' => 5, 'users' => 100 })
-      def self.reltuples_from_recently_updated(table_names)
-        query = postgresql_estimate_query(table_names)
-        rows = []
-
-        # Querying tuple stats only works on the primary. Due to load
-        # balancing, we need to ensure this query hits the load balancer.  The
-        # easiest way to do this is to start a transaction.
-        ActiveRecord::Base.transaction do
-          rows = ActiveRecord::Base.connection.select_all(query)
-        end
-
-        rows.each_with_object({}) { |row, data| data[row['table_name']] = row['estimate'].to_i }
-      rescue *CONNECTION_ERRORS
-        {}
-      end
-
-      # Generates the PostgreSQL query to return the tuples for tables
-      # that have been vacuumed or analyzed in the last hour.
-      #
-      # @param [Array] table names
-      # @returns [Hash] Table name to count mapping (e.g. { 'projects' => 5, 'users' => 100 })
-      def self.postgresql_estimate_query(table_names)
-        time = "to_timestamp(#{1.hour.ago.to_i})"
-        <<~SQL
-          SELECT pg_class.relname AS table_name, reltuples::bigint AS estimate FROM pg_class
-          LEFT JOIN pg_stat_user_tables ON pg_class.relname = pg_stat_user_tables.relname
-          WHERE pg_class.relname IN (#{table_names.map { |table| "'#{table}'" }.join(',')})
-          AND (last_vacuum > #{time} OR last_autovacuum > #{time} OR last_analyze > #{time} OR last_autoanalyze > #{time})
-        SQL
       end
     end
   end