summaryrefslogtreecommitdiff
path: root/lib/gitlab/database/count.rb
diff options
context:
space:
mode:
Diffstat (limited to 'lib/gitlab/database/count.rb')
-rw-r--r--lib/gitlab/database/count.rb79
1 files changed, 23 insertions, 56 deletions
diff --git a/lib/gitlab/database/count.rb b/lib/gitlab/database/count.rb
index ea6529e2dc4..f3d37ccd72a 100644
--- a/lib/gitlab/database/count.rb
+++ b/lib/gitlab/database/count.rb
@@ -1,7 +1,12 @@
# frozen_string_literal: true
# For large tables, PostgreSQL can take a long time to count rows due to MVCC.
-# We can optimize this by using the reltuples count as described in https://wiki.postgresql.org/wiki/Slow_Counting.
+# We can optimize this by using various strategies for approximate counting.
+#
+# For example, we can use the reltuples count as described in https://wiki.postgresql.org/wiki/Slow_Counting.
+#
+# However, since statistics are not always up to date, we also implement a table sampling strategy
+# that performs an exact count but only on a sample of the table. See TablesampleCountStrategy.
module Gitlab
module Database
module Count
@@ -20,68 +25,30 @@ module Gitlab
end
# Takes in an array of models and returns a Hash for the approximate
- # counts for them. If the model's table has not been vacuumed or
- # analyzed recently, simply run the Model.count to get the data.
+ # counts for them.
+ #
+ # Various count strategies can be specified that are executed in
+ # sequence until all tables have an approximate count attached
+ # or we run out of strategies.
+ #
+ # Note that not all strategies are available on all supported RDBMS.
#
# @param [Array]
# @return [Hash] of Model -> count mapping
- def self.approximate_counts(models)
- table_to_model_map = models.each_with_object({}) do |model, hash|
- hash[model.table_name] = model
- end
-
- table_names = table_to_model_map.keys
- counts_by_table_name = Gitlab::Database.postgresql? ? reltuples_from_recently_updated(table_names) : {}
+ def self.approximate_counts(models, strategies: [TablesampleCountStrategy, ReltuplesCountStrategy, ExactCountStrategy])
+ strategies.each_with_object({}) do |strategy, counts_by_model|
+ if strategy.enabled?
+ models_with_missing_counts = models - counts_by_model.keys
- # Convert table -> count to Model -> count
- counts_by_model = counts_by_table_name.each_with_object({}) do |pair, hash|
- model = table_to_model_map[pair.first]
- hash[model] = pair.second
- end
+ break counts_by_model if models_with_missing_counts.empty?
- missing_tables = table_names - counts_by_table_name.keys
+ counts = strategy.new(models_with_missing_counts).count
- missing_tables.each do |table|
- model = table_to_model_map[table]
- counts_by_model[model] = model.count
+ counts.each do |model, count|
+ counts_by_model[model] = count
+ end
+ end
end
-
- counts_by_model
- end
-
- # Returns a hash of the table names that have recently updated tuples.
- #
- # @param [Array] table names
- # @returns [Hash] Table name to count mapping (e.g. { 'projects' => 5, 'users' => 100 })
- def self.reltuples_from_recently_updated(table_names)
- query = postgresql_estimate_query(table_names)
- rows = []
-
- # Querying tuple stats only works on the primary. Due to load
- # balancing, we need to ensure this query hits the load balancer. The
- # easiest way to do this is to start a transaction.
- ActiveRecord::Base.transaction do
- rows = ActiveRecord::Base.connection.select_all(query)
- end
-
- rows.each_with_object({}) { |row, data| data[row['table_name']] = row['estimate'].to_i }
- rescue *CONNECTION_ERRORS
- {}
- end
-
- # Generates the PostgreSQL query to return the tuples for tables
- # that have been vacuumed or analyzed in the last hour.
- #
- # @param [Array] table names
- # @returns [Hash] Table name to count mapping (e.g. { 'projects' => 5, 'users' => 100 })
- def self.postgresql_estimate_query(table_names)
- time = "to_timestamp(#{1.hour.ago.to_i})"
- <<~SQL
- SELECT pg_class.relname AS table_name, reltuples::bigint AS estimate FROM pg_class
- LEFT JOIN pg_stat_user_tables ON pg_class.relname = pg_stat_user_tables.relname
- WHERE pg_class.relname IN (#{table_names.map { |table| "'#{table}'" }.join(',')})
- AND (last_vacuum > #{time} OR last_autovacuum > #{time} OR last_analyze > #{time} OR last_autoanalyze > #{time})
- SQL
end
end
end