summaryrefslogtreecommitdiff
path: root/lib/gitlab/database/batch_counter.rb
diff options
context:
space:
mode:
Diffstat (limited to 'lib/gitlab/database/batch_counter.rb')
-rw-r--r--lib/gitlab/database/batch_counter.rb159
1 files changed, 159 insertions, 0 deletions
diff --git a/lib/gitlab/database/batch_counter.rb b/lib/gitlab/database/batch_counter.rb
new file mode 100644
index 00000000000..5f2e404c9da
--- /dev/null
+++ b/lib/gitlab/database/batch_counter.rb
@@ -0,0 +1,159 @@
# frozen_string_literal: true

module Gitlab
  module Database
    # Runs an aggregate operation (+:count+ by default) over a relation in
    # consecutive ranges of a column's values and merges the per-range results.
    #
    # Each range is queried separately, with a short sleep between queries.
    # When a query is canceled the batch size is halved and the same range is
    # retried; once the batch size cannot be halved any further the counter
    # gives up and returns FALLBACK.
    class BatchCounter
      # Returned by #count whenever a result cannot be produced.
      FALLBACK = -1
      # For :count, batch sizes at or below this value are rejected; it is also
      # the floor used when halving the batch size after a canceled query.
      MIN_REQUIRED_BATCH_SIZE = 1_250
      # Default (and minimum accepted) batch size for :sum.
      DEFAULT_SUM_BATCH_SIZE = 1_000
      # Upper bound on the number of batches a single #count call may plan for.
      MAX_ALLOWED_LOOPS = 10_000
      SLEEP_TIME_IN_SECONDS = 0.01 # 10 msec sleep
      ALLOWED_MODES = %i[itself distinct].freeze
      # Used as the maximum column value when the relation yields none.
      FALLBACK_FINISH = 0
      # Added to the finish value because batches use an exclusive upper bound.
      OFFSET_BY_ONE = 1

      # Each query should take < 500ms https://gitlab.com/gitlab-org/gitlab/-/merge_requests/22705
      DEFAULT_DISTINCT_BATCH_SIZE = 10_000
      DEFAULT_BATCH_SIZE = 100_000

      # @param relation the relation to aggregate over
      # @param column column used to slice the relation into ranges; defaults
      #   to +relation.primary_key+
      # @param operation [Symbol] calculation method sent to each batch relation
      # @param operation_args extra arguments splatted into +operation+
      def initialize(relation, column: nil, operation: :count, operation_args: nil)
        @relation = relation
        @column = column || relation.primary_key
        @operation = operation
        @operation_args = operation_args
      end

      # Tells whether the given range / batch size combination must not be run.
      #
      # @param finish [Integer] exclusive upper bound of the whole range
      # @param batch_size [Integer] number of column values covered per batch
      # @param start [Integer] inclusive lower bound of the whole range
      # @return [Boolean] true when #count should return FALLBACK
      def unwanted_configuration?(finish, batch_size, start)
        # A zero batch size would divide by zero below and a negative one would
        # make the batching loop in #count move backwards forever. :count and
        # :sum already reject these through their minimums; this guard covers
        # every other operation as well.
        return true if batch_size <= 0

        (@operation == :count && batch_size <= MIN_REQUIRED_BATCH_SIZE) ||
          (@operation == :sum && batch_size < DEFAULT_SUM_BATCH_SIZE) ||
          (finish - start) / batch_size >= MAX_ALLOWED_LOOPS ||
          start >= finish
      end

      # Runs the configured operation batch by batch and returns the merged result.
      #
      # @param batch_size [Integer, nil] column values per batch; a default is
      #   chosen from the mode and operation when nil
      # @param mode [Symbol] one of ALLOWED_MODES
      # @param start [Integer, nil] first column value; the relation's minimum when nil
      # @param finish [Integer, nil] last column value; the relation's maximum when nil
      # @return the merged per-batch results, or FALLBACK when the configuration
      #   is unwanted, a query keeps being canceled, or the loose index scan
      #   reports a column configuration error
      # @raise [RuntimeError] inside an open transaction, for an unsupported
      #   mode, or when start / finish are negative
      def count(batch_size: nil, mode: :itself, start: nil, finish: nil)
        raise 'BatchCount can not be run inside a transaction' if ActiveRecord::Base.connection.transaction_open?

        check_mode!(mode)

        # non-distinct have better performance
        batch_size ||= batch_size_for_mode_and_operation(mode, @operation)

        start = actual_start(start)
        finish = actual_finish(finish)

        raise "Batch counting expects positive values only for #{@column}" if start < 0 || finish < 0
        return FALLBACK if unwanted_configuration?(finish, batch_size, start)

        results = nil
        batch_start = start

        while batch_start < finish
          # Reset per iteration so the cancellation log below never reports the
          # query of a previous batch.
          batch_relation = nil

          begin
            batch_end = [batch_start + batch_size, finish].min
            batch_relation = build_relation_batch(batch_start, batch_end, mode)

            results = merge_results(results, batch_relation.send(@operation, *operation_args_for(mode))) # rubocop:disable GitlabSecurity/PublicSend
            batch_start = batch_end
          rescue ActiveRecord::QueryCanceled => error
            # retry with a safe batch size & warmer cache
            if batch_size >= 2 * MIN_REQUIRED_BATCH_SIZE
              batch_size /= 2
            else
              log_canceled_batch_fetch(batch_start, mode, batch_relation&.to_sql, error)
              return FALLBACK
            end
          rescue Gitlab::Database::LooseIndexScanDistinctCount::ColumnConfigurationError => error
            log_column_configuration_error(mode, error)

            return FALLBACK
          end

          sleep(SLEEP_TIME_IN_SECONDS)
        end

        results
      end

      # Folds one batch result into the accumulated results.
      #
      # Hashes are merged key by key with colliding values added together
      # (+results+ is updated in place); any other object is combined with +.
      #
      # @param results accumulated value so far, or nil before the first batch
      # @param object result of the latest batch
      # @return the new accumulated value
      def merge_results(results, object)
        return object unless results

        if object.is_a?(Hash)
          results.merge!(object) { |_, a, b| a + b }
        else
          results + object
        end
      end

      private

      # Builds the relation for the column range start...finish (finish excluded).
      def build_relation_batch(start, finish, mode)
        if use_loose_index_scan_for_distinct_values?(mode)
          Gitlab::Database::LooseIndexScanDistinctCount.new(@relation, @column).build_query(from: start, to: finish)
        else
          @relation.select(@column).public_send(mode).where(between_condition(start, finish)) # rubocop:disable GitlabSecurity/PublicSend
        end
      end

      # Arguments passed to the operation for one batch. A :count without
      # explicit arguments that goes through the loose index scan counts the
      # scan's column alias; everything else uses the configured arguments.
      def operation_args_for(mode)
        if @operation == :count && @operation_args.blank? && use_loose_index_scan_for_distinct_values?(mode)
          [Gitlab::Database::LooseIndexScanDistinctCount::COLUMN_ALIAS]
        else
          @operation_args
        end
      end

      # Default batch size when the caller does not provide one.
      def batch_size_for_mode_and_operation(mode, operation)
        return DEFAULT_SUM_BATCH_SIZE if operation == :sum

        mode == :distinct ? DEFAULT_DISTINCT_BATCH_SIZE : DEFAULT_BATCH_SIZE
      end

      # Range condition for +where+; the upper bound is exclusive in both forms.
      def between_condition(start, finish)
        return @column.between(start...finish) if @column.is_a?(Arel::Attributes::Attribute)

        { @column => start...finish }
      end

      # Caller-provided start, else the column minimum (ignoring grouping), else 0.
      def actual_start(start)
        start || @relation.unscope(:group, :having).minimum(@column) || 0
      end

      # Caller-provided finish, else the column maximum (ignoring grouping),
      # else FALLBACK_FINISH; shifted by one so the exclusive batch ranges
      # still include the last value.
      def actual_finish(finish)
        (finish || @relation.unscope(:group, :having).maximum(@column) || FALLBACK_FINISH) + OFFSET_BY_ONE
      end

      # Validates the requested mode against the relation and column.
      #
      # NOTE(review): the last check compares against the Symbol :id only; a
      # column given as the String 'id' would not match - confirm this is intended.
      def check_mode!(mode)
        raise "The mode #{mode.inspect} is not supported" unless ALLOWED_MODES.include?(mode)
        raise 'Use distinct count for optimized distinct counting' if @relation.limit(1).distinct_value.present? && mode != :distinct
        raise 'Use distinct count only with non id fields' if @column == :id && mode == :distinct
      end

      # Logs a canceled batch query; +query+ may be nil when the batch relation
      # had not been built yet.
      def log_canceled_batch_fetch(batch_start, mode, query, error)
        Gitlab::AppJsonLogger
          .error(
            event: 'batch_count',
            relation: @relation.table_name,
            operation: @operation,
            operation_args: @operation_args,
            start: batch_start,
            mode: mode,
            query: query,
            message: "Query has been canceled with message: #{error.message}"
          )
      end

      # Logs a column configuration error raised by the loose index scan.
      def log_column_configuration_error(mode, error)
        Gitlab::AppJsonLogger
          .error(
            event: 'batch_count',
            relation: @relation.table_name,
            operation: @operation,
            operation_args: @operation_args,
            mode: mode,
            message: "LooseIndexScanDistinctCount column error: #{error.message}"
          )
      end

      # The loose index scan is used only for distinct mode, on non-grouped
      # queries, and while the feature flag is enabled.
      def use_loose_index_scan_for_distinct_values?(mode)
        Feature.enabled?(:loose_index_scan_for_distinct_values) && not_group_by_query? && mode == :distinct
      end

      def not_group_by_query?
        !@relation.is_a?(ActiveRecord::Relation) || @relation.group_values.blank?
      end
    end
  end
end