diff options
Diffstat (limited to 'lib/gitlab/database/count.rb')
-rw-r--r-- | lib/gitlab/database/count.rb | 79 |
1 files changed, 23 insertions, 56 deletions
diff --git a/lib/gitlab/database/count.rb b/lib/gitlab/database/count.rb index ea6529e2dc4..f3d37ccd72a 100644 --- a/lib/gitlab/database/count.rb +++ b/lib/gitlab/database/count.rb @@ -1,7 +1,12 @@ # frozen_string_literal: true # For large tables, PostgreSQL can take a long time to count rows due to MVCC. -# We can optimize this by using the reltuples count as described in https://wiki.postgresql.org/wiki/Slow_Counting. +# We can optimize this by using various strategies for approximate counting. +# +# For example, we can use the reltuples count as described in https://wiki.postgresql.org/wiki/Slow_Counting. +# +# However, since statistics are not always up to date, we also implement a table sampling strategy +# that performs an exact count but only on a sample of the table. See TablesampleCountStrategy. module Gitlab module Database module Count @@ -20,68 +25,30 @@ module Gitlab end # Takes in an array of models and returns a Hash for the approximate - # counts for them. If the model's table has not been vacuumed or - # analyzed recently, simply run the Model.count to get the data. + # counts for them. + # + # Various count strategies can be specified that are executed in + # sequence until all tables have an approximate count attached + # or we run out of strategies. + # + # Note that not all strategies are available on all supported RDBMS. # # @param [Array] # @return [Hash] of Model -> count mapping - def self.approximate_counts(models) - table_to_model_map = models.each_with_object({}) do |model, hash| - hash[model.table_name] = model - end - - table_names = table_to_model_map.keys - counts_by_table_name = Gitlab::Database.postgresql? ? reltuples_from_recently_updated(table_names) : {} + def self.approximate_counts(models, strategies: [TablesampleCountStrategy, ReltuplesCountStrategy, ExactCountStrategy]) + strategies.each_with_object({}) do |strategy, counts_by_model| + if strategy.enabled? + models_with_missing_counts = models - counts_by_model.keys - # Convert table -> count to Model -> count - counts_by_model = counts_by_table_name.each_with_object({}) do |pair, hash| - model = table_to_model_map[pair.first] - hash[model] = pair.second - end + break counts_by_model if models_with_missing_counts.empty? - missing_tables = table_names - counts_by_table_name.keys + counts = strategy.new(models_with_missing_counts).count - missing_tables.each do |table| - model = table_to_model_map[table] - counts_by_model[model] = model.count + counts.each do |model, count| + counts_by_model[model] = count + end + end end - - counts_by_model - end - - # Returns a hash of the table names that have recently updated tuples. - # - # @param [Array] table names - # @returns [Hash] Table name to count mapping (e.g. { 'projects' => 5, 'users' => 100 }) - def self.reltuples_from_recently_updated(table_names) - query = postgresql_estimate_query(table_names) - rows = [] - - # Querying tuple stats only works on the primary. Due to load - # balancing, we need to ensure this query hits the load balancer. The - # easiest way to do this is to start a transaction. - ActiveRecord::Base.transaction do - rows = ActiveRecord::Base.connection.select_all(query) - end - - rows.each_with_object({}) { |row, data| data[row['table_name']] = row['estimate'].to_i } - rescue *CONNECTION_ERRORS - {} - end - - # Generates the PostgreSQL query to return the tuples for tables - # that have been vacuumed or analyzed in the last hour. - # - # @param [Array] table names - # @returns [Hash] Table name to count mapping (e.g. { 'projects' => 5, 'users' => 100 }) - def self.postgresql_estimate_query(table_names) - time = "to_timestamp(#{1.hour.ago.to_i})" - <<~SQL - SELECT pg_class.relname AS table_name, reltuples::bigint AS estimate FROM pg_class - LEFT JOIN pg_stat_user_tables ON pg_class.relname = pg_stat_user_tables.relname - WHERE pg_class.relname IN (#{table_names.map { |table| "'#{table}'" }.join(',')}) - AND (last_vacuum > #{time} OR last_autovacuum > #{time} OR last_analyze > #{time} OR last_autoanalyze > #{time}) - SQL end end end |