diff options
author | GitLab Bot <gitlab-bot@gitlab.com> | 2020-11-30 11:02:35 +0000 |
---|---|---|
committer | GitLab Bot <gitlab-bot@gitlab.com> | 2020-11-30 11:02:35 +0000 |
commit | 434a0ce52d75e13d48eac9ce83774954c7c5d48d (patch) | |
tree | de3b7a7cf1ce8b07555f28df592297c76894c90f /lib/gitlab/database | |
parent | 0a0d9493ca481c56b739a3df27c31262283150fe (diff) | |
download | gitlab-ce-13.7.0-rc2.tar.gz |
Add latest changes from gitlab-org/gitlab@13-7-stable-ee (tag: v13.7.0-rc2)
Diffstat (limited to 'lib/gitlab/database')
-rw-r--r-- | lib/gitlab/database/batch_count.rb | 15 | ||||
-rw-r--r-- | lib/gitlab/database/migrations/background_migration_helpers.rb | 11 | ||||
-rw-r--r-- | lib/gitlab/database/postgres_hll_batch_distinct_counter.rb | 156 |
3 files changed, 173 insertions, 9 deletions
diff --git a/lib/gitlab/database/batch_count.rb b/lib/gitlab/database/batch_count.rb index 6f79e965cd5..5a506da0d05 100644 --- a/lib/gitlab/database/batch_count.rb +++ b/lib/gitlab/database/batch_count.rb @@ -49,6 +49,8 @@ module Gitlab MAX_ALLOWED_LOOPS = 10_000 SLEEP_TIME_IN_SECONDS = 0.01 # 10 msec sleep ALLOWED_MODES = [:itself, :distinct].freeze + FALLBACK_FINISH = 0 + OFFSET_BY_ONE = 1 # Each query should take < 500ms https://gitlab.com/gitlab-org/gitlab/-/merge_requests/22705 DEFAULT_DISTINCT_BATCH_SIZE = 10_000 @@ -65,7 +67,7 @@ module Gitlab (@operation == :count && batch_size <= MIN_REQUIRED_BATCH_SIZE) || (@operation == :sum && batch_size < DEFAULT_SUM_BATCH_SIZE) || (finish - start) / batch_size >= MAX_ALLOWED_LOOPS || - start > finish + start >= finish end def count(batch_size: nil, mode: :itself, start: nil, finish: nil) @@ -85,11 +87,13 @@ module Gitlab results = nil batch_start = start - while batch_start <= finish - batch_relation = build_relation_batch(batch_start, batch_start + batch_size, mode) + while batch_start < finish + batch_end = [batch_start + batch_size, finish].min + batch_relation = build_relation_batch(batch_start, batch_end, mode) + begin results = merge_results(results, batch_relation.send(@operation, *@operation_args)) # rubocop:disable GitlabSecurity/PublicSend - batch_start += batch_size + batch_start = batch_end rescue ActiveRecord::QueryCanceled => error # retry with a safe batch size & warmer cache if batch_size >= 2 * MIN_REQUIRED_BATCH_SIZE @@ -99,6 +103,7 @@ module Gitlab return FALLBACK end end + sleep(SLEEP_TIME_IN_SECONDS) end @@ -138,7 +143,7 @@ module Gitlab end def actual_finish(finish) - finish || @relation.unscope(:group, :having).maximum(@column) || 0 + (finish || @relation.unscope(:group, :having).maximum(@column) || FALLBACK_FINISH) + OFFSET_BY_ONE end def check_mode!(mode) diff --git a/lib/gitlab/database/migrations/background_migration_helpers.rb b/lib/gitlab/database/migrations/background_migration_helpers.rb 
index a6cc03aa9eb..36073844765 100644 --- a/lib/gitlab/database/migrations/background_migration_helpers.rb +++ b/lib/gitlab/database/migrations/background_migration_helpers.rb @@ -55,7 +55,8 @@ module Gitlab bulk_migrate_async(jobs) unless jobs.empty? end - # Queues background migration jobs for an entire table, batched by ID range. + # Queues background migration jobs for an entire table in batches. + # The default batching column used is the standard primary key `id`. # Each job is scheduled with a `delay_interval` in between. # If you use a small interval, then some jobs may run at the same time. # @@ -68,6 +69,7 @@ module Gitlab # is scheduled to be run. These records can be used to trace execution of the background job, but there is no # builtin support to manage that automatically at this time. You should only set this flag if you are aware of # how it works, and intend to manually cleanup the database records in your background job. + # primary_column_name - The name of the primary key column if the primary key is not `id` # # *Returns the final migration delay* # @@ -87,8 +89,9 @@ module Gitlab # # do something # end # end - def queue_background_migration_jobs_by_range_at_intervals(model_class, job_class_name, delay_interval, batch_size: BACKGROUND_MIGRATION_BATCH_SIZE, other_job_arguments: [], initial_delay: 0, track_jobs: false) - raise "#{model_class} does not have an ID to use for batch ranges" unless model_class.column_names.include?('id') + def queue_background_migration_jobs_by_range_at_intervals(model_class, job_class_name, delay_interval, batch_size: BACKGROUND_MIGRATION_BATCH_SIZE, other_job_arguments: [], initial_delay: 0, track_jobs: false, primary_column_name: :id) + raise "#{model_class} does not have an ID column of #{primary_column_name} to use for batch ranges" unless model_class.column_names.include?(primary_column_name.to_s) + raise "#{primary_column_name} is not an integer column" unless 
model_class.columns_hash[primary_column_name.to_s].type == :integer # To not overload the worker too much we enforce a minimum interval both # when scheduling and performing jobs. @@ -99,7 +102,7 @@ module Gitlab final_delay = 0 model_class.each_batch(of: batch_size) do |relation, index| - start_id, end_id = relation.pluck(Arel.sql('MIN(id), MAX(id)')).first + start_id, end_id = relation.pluck(Arel.sql("MIN(#{primary_column_name}), MAX(#{primary_column_name})")).first # `BackgroundMigrationWorker.bulk_perform_in` schedules all jobs for # the same time, which is not helpful in most cases where we wish to diff --git a/lib/gitlab/database/postgres_hll_batch_distinct_counter.rb b/lib/gitlab/database/postgres_hll_batch_distinct_counter.rb new file mode 100644 index 00000000000..4b2afcb128f --- /dev/null +++ b/lib/gitlab/database/postgres_hll_batch_distinct_counter.rb @@ -0,0 +1,156 @@ +# frozen_string_literal: true + +module Gitlab + module Database + # For large tables, PostgreSQL can take a long time to count rows due to MVCC. + # Implements a distinct batch counter based on HyperLogLog algorithm + # Needs indexes on the column below to calculate max, min and range queries + # For larger tables just set higher batch_size with index optimization + # + # In order to not use a possible complex time consuming query when calculating min and max values, + # the start and finish can be sent specifically, start and finish should contain max and min values for PRIMARY KEY of + # relation (most cases `id` column) rather than counted attribute eg: + # estimate_distinct_count(start: ::Project.with_active_services.minimum(:id), finish: ::Project.with_active_services.maximum(:id)) + # + # Grouped relations are NOT supported yet. 
+ # + # @example Usage + # ::Gitlab::Database::PostgresHllBatchDistinctCounter.new(::Project, :creator_id).estimate_distinct_count + # ::Gitlab::Database::PostgresHllBatchDistinctCounter.new(::Project.with_active_services.service_desk_enabled.where(time_period)) + # .estimate_distinct_count( + # batch_size: 1_000, + # start: ::Project.with_active_services.service_desk_enabled.where(time_period).minimum(:id), + # finish: ::Project.with_active_services.service_desk_enabled.where(time_period).maximum(:id) + # ) + # + # @note HyperLogLog is a PROBABILISTIC algorithm that ESTIMATES distinct count of given attribute value for supplied relation + # Like all probabilistic algorithms it has an ERROR RATE margin that can affect values, + # for given implementation no higher value was reported (https://gitlab.com/gitlab-org/gitlab/-/merge_requests/45673#accuracy-estimation) than 5.3% + # for most cases this value is lower. However, if the exact value is necessary other tools have to be used. + class PostgresHllBatchDistinctCounter + FALLBACK = -1 + MIN_REQUIRED_BATCH_SIZE = 1_250 + MAX_ALLOWED_LOOPS = 10_000 + SLEEP_TIME_IN_SECONDS = 0.01 # 10 msec sleep + + # Each query should take < 500ms https://gitlab.com/gitlab-org/gitlab/-/merge_requests/22705 + DEFAULT_BATCH_SIZE = 100_000 + + BIT_31_MASK = "B'0#{'1' * 31}'" + BIT_9_MASK = "B'#{'0' * 23}#{'1' * 9}'" + # @example source_query + # SELECT CAST(('X' || md5(CAST(%{column} as text))) as bit(32)) attr_hash_32_bits + # FROM %{relation} + # WHERE %{pkey} >= %{batch_start} + # AND %{pkey} < %{batch_end} + # AND %{column} IS NOT NULL + BUCKETED_DATA_SQL = <<~SQL + WITH hashed_attributes AS (%{source_query}) + SELECT (attr_hash_32_bits & #{BIT_9_MASK})::int AS bucket_num, + (31 - floor(log(2, min((attr_hash_32_bits & #{BIT_31_MASK})::int))))::int as bucket_hash + FROM hashed_attributes + GROUP BY 1 ORDER BY 1 + SQL + + TOTAL_BUCKETS_NUMBER = 512 + + def initialize(relation, column = nil) + @relation = relation + @column = column
|| relation.primary_key + end + + def unwanted_configuration?(finish, batch_size, start) + batch_size <= MIN_REQUIRED_BATCH_SIZE || + (finish - start) / batch_size >= MAX_ALLOWED_LOOPS || + start > finish + end + + def estimate_distinct_count(batch_size: nil, start: nil, finish: nil) + raise 'BatchCount can not be run inside a transaction' if ActiveRecord::Base.connection.transaction_open? + + batch_size ||= DEFAULT_BATCH_SIZE + + start = actual_start(start) + finish = actual_finish(finish) + + raise "Batch counting expects positive values only for #{@column}" if start < 0 || finish < 0 + return FALLBACK if unwanted_configuration?(finish, batch_size, start) + + batch_start = start + hll_blob = {} + + while batch_start <= finish + begin + hll_blob.merge!(hll_blob_for_batch(batch_start, batch_start + batch_size)) {|_key, old, new| new > old ? new : old } + batch_start += batch_size + end + sleep(SLEEP_TIME_IN_SECONDS) + end + + estimate_cardinality(hll_blob) + end + + private + + # arbitrary values that are present in #estimate_cardinality + # are sourced from https://www.sisense.com/blog/hyperloglog-in-pure-sql/ + # article, they are not representing any entity and serves as tune value + # for the whole equation + def estimate_cardinality(hll_blob) + num_zero_buckets = TOTAL_BUCKETS_NUMBER - hll_blob.size + + num_uniques = ( + ((TOTAL_BUCKETS_NUMBER**2) * (0.7213 / (1 + 1.079 / TOTAL_BUCKETS_NUMBER))) / + (num_zero_buckets + hll_blob.values.sum { |bucket_hash, _| 2**(-1 * bucket_hash)} ) + ).to_i + + if num_zero_buckets > 0 && num_uniques < 2.5 * TOTAL_BUCKETS_NUMBER + ((0.7213 / (1 + 1.079 / TOTAL_BUCKETS_NUMBER)) * (TOTAL_BUCKETS_NUMBER * + Math.log2(TOTAL_BUCKETS_NUMBER.to_f / num_zero_buckets))) + else + num_uniques + end + end + + def hll_blob_for_batch(start, finish) + @relation + .connection + .execute(BUCKETED_DATA_SQL % { source_query: source_query(start, finish) }) + .map(&:values) + .to_h + end + + # Generate the source query SQL snippet for the provided 
id range + # + # @example SQL query template + # SELECT CAST(('X' || md5(CAST(%{column} as text))) as bit(32)) attr_hash_32_bits + # FROM %{relation} + # WHERE %{pkey} >= %{batch_start} AND %{pkey} < %{batch_end} + # AND %{column} IS NOT NULL + # + # @param start initial id range + # @param finish final id range + # @return [String] SQL query fragment + def source_query(start, finish) + col_as_arel = @column.is_a?(Arel::Attributes::Attribute) ? @column : Arel.sql(@column.to_s) + col_as_text = Arel::Nodes::NamedFunction.new('CAST', [col_as_arel.as('text')]) + md5_of_col = Arel::Nodes::NamedFunction.new('md5', [col_as_text]) + md5_as_hex = Arel::Nodes::Concat.new(Arel.sql("'X'"), md5_of_col) + bits = Arel::Nodes::NamedFunction.new('CAST', [md5_as_hex.as('bit(32)')]) + + @relation + .where(@relation.primary_key => (start...finish)) + .where(col_as_arel.not_eq(nil)) + .select(bits.as('attr_hash_32_bits')).to_sql + end + + def actual_start(start) + start || @relation.unscope(:group, :having).minimum(@relation.primary_key) || 0 + end + + def actual_finish(finish) + finish || @relation.unscope(:group, :having).maximum(@relation.primary_key) || 0 + end + end + end +end |