diff options
Diffstat (limited to 'lib/gitlab/database/background_migration')
5 files changed, 179 insertions, 24 deletions
diff --git a/lib/gitlab/database/background_migration/batch_optimizer.rb b/lib/gitlab/database/background_migration/batch_optimizer.rb new file mode 100644 index 00000000000..0668490dda8 --- /dev/null +++ b/lib/gitlab/database/background_migration/batch_optimizer.rb @@ -0,0 +1,67 @@ +# frozen_string_literal: true + +module Gitlab + module Database + module BackgroundMigration + # This is an optimizer for throughput of batched migration jobs + # + # The underyling mechanic is based on the concept of time efficiency: + # time efficiency = job duration / interval + # Ideally, this is close but lower than 1 - so we're using time efficiently. + # + # We aim to land in the 90%-98% range, which gives the database a little breathing room + # in between. + # + # The optimizer is based on calculating the exponential moving average of time efficiencies + # for the last N jobs. If we're outside the range, we add 10% to or decrease by 20% of the batch size. + class BatchOptimizer + # Target time efficiency for a job + # Time efficiency is defined as: job duration / interval + TARGET_EFFICIENCY = (0.9..0.95).freeze + + # Lower and upper bound for the batch size + ALLOWED_BATCH_SIZE = (1_000..2_000_000).freeze + + # Limit for the multiplier of the batch size + MAX_MULTIPLIER = 1.2 + + # When smoothing time efficiency, use this many jobs + NUMBER_OF_JOBS = 20 + + # Smoothing factor for exponential moving average + EMA_ALPHA = 0.4 + + attr_reader :migration, :number_of_jobs, :ema_alpha + + def initialize(migration, number_of_jobs: NUMBER_OF_JOBS, ema_alpha: EMA_ALPHA) + @migration = migration + @number_of_jobs = number_of_jobs + @ema_alpha = ema_alpha + end + + def optimize! + return unless Feature.enabled?(:optimize_batched_migrations, type: :ops, default_enabled: :yaml) + + if multiplier = batch_size_multiplier + migration.batch_size = (migration.batch_size * multiplier).to_i.clamp(ALLOWED_BATCH_SIZE) + migration.save! + end + end + + private + + def batch_size_multiplier + efficiency = migration.smoothed_time_efficiency(number_of_jobs: number_of_jobs, alpha: ema_alpha) + + return if efficiency.nil? || efficiency == 0 + + # We hit the range - no change + return if TARGET_EFFICIENCY.include?(efficiency) + + # Assumption: time efficiency is linear in the batch size + [TARGET_EFFICIENCY.max / efficiency, MAX_MULTIPLIER].min + end + end + end + end +end diff --git a/lib/gitlab/database/background_migration/batched_job.rb b/lib/gitlab/database/background_migration/batched_job.rb index 3b624df2bfd..869b97b8ac0 100644 --- a/lib/gitlab/database/background_migration/batched_job.rb +++ b/lib/gitlab/database/background_migration/batched_job.rb @@ -4,10 +4,23 @@ module Gitlab module Database module BackgroundMigration class BatchedJob < ActiveRecord::Base # rubocop:disable Rails/ApplicationRecord + include FromUnion + self.table_name = :batched_background_migration_jobs + MAX_ATTEMPTS = 3 + STUCK_JOBS_TIMEOUT = 1.hour.freeze + belongs_to :batched_migration, foreign_key: :batched_background_migration_id + scope :active, -> { where(status: [:pending, :running]) } + scope :stuck, -> { active.where('updated_at <= ?', STUCK_JOBS_TIMEOUT.ago) } + scope :retriable, -> { + failed_jobs = where(status: :failed).where('attempts < ?', MAX_ATTEMPTS) + + from_union([failed_jobs, self.stuck]) + } + enum status: { pending: 0, running: 1, @@ -15,8 +28,22 @@ module Gitlab succeeded: 3 } + scope :successful_in_execution_order, -> { where.not(finished_at: nil).succeeded.order(:finished_at) } + delegate :aborted?, :job_class, :table_name, :column_name, :job_arguments, to: :batched_migration, prefix: :migration + + attribute :pause_ms, :integer, default: 100 + + def time_efficiency + return unless succeeded? + return unless finished_at && started_at + + duration = finished_at - started_at + + # TODO: Switch to individual job interval (prereq: https://gitlab.com/gitlab-org/gitlab/-/issues/328801) + duration.to_f / batched_migration.interval + end end end end diff --git a/lib/gitlab/database/background_migration/batched_migration.rb b/lib/gitlab/database/background_migration/batched_migration.rb index 4aa33ed7946..e85162f355e 100644 --- a/lib/gitlab/database/background_migration/batched_migration.rb +++ b/lib/gitlab/database/background_migration/batched_migration.rb @@ -20,9 +20,12 @@ module Gitlab paused: 0, active: 1, aborted: 2, - finished: 3 + finished: 3, + failed: 4 } + attribute :pause_ms, :integer, default: 100 + def self.active_migration active.queue_order.first end @@ -35,7 +38,13 @@ module Gitlab end def create_batched_job!(min, max) - batched_jobs.create!(min_value: min, max_value: max, batch_size: batch_size, sub_batch_size: sub_batch_size) + batched_jobs.create!( + min_value: min, + max_value: max, + batch_size: batch_size, + sub_batch_size: sub_batch_size, + pause_ms: pause_ms + ) end def next_min_value @@ -58,12 +67,40 @@ module Gitlab write_attribute(:batch_class_name, class_name.demodulize) end + def migrated_tuple_count + batched_jobs.succeeded.sum(:batch_size) + end + def prometheus_labels @prometheus_labels ||= { migration_id: id, migration_identifier: "%s/%s.%s" % [job_class_name, table_name, column_name] } end + + def smoothed_time_efficiency(number_of_jobs: 10, alpha: 0.2) + jobs = batched_jobs.successful_in_execution_order.reverse_order.limit(number_of_jobs) + + return if jobs.size < number_of_jobs + + efficiencies = jobs.map(&:time_efficiency).reject(&:nil?).each_with_index + + dividend = efficiencies.reduce(0) do |total, (job_eff, i)| + total + job_eff * (1 - alpha)**i + end + + divisor = efficiencies.reduce(0) do |total, (job_eff, i)| + total + (1 - alpha)**i + end + + return if divisor == 0 + + (dividend / divisor).round(2) + end + + def optimize! + BatchOptimizer.new(self).optimize! + end end end end diff --git a/lib/gitlab/database/background_migration/batched_migration_runner.rb b/lib/gitlab/database/background_migration/batched_migration_runner.rb index cf8b61f5feb..67fe6c536e6 100644 --- a/lib/gitlab/database/background_migration/batched_migration_runner.rb +++ b/lib/gitlab/database/background_migration/batched_migration_runner.rb @@ -19,8 +19,10 @@ module Gitlab # # Note that this method is primarily intended to called by a scheduled worker. def run_migration_job(active_migration) - if next_batched_job = create_next_batched_job!(active_migration) + if next_batched_job = find_or_create_next_batched_job(active_migration) migration_wrapper.perform(next_batched_job) + + active_migration.optimize! else finish_active_migration(active_migration) end @@ -46,12 +48,12 @@ module Gitlab attr_reader :migration_wrapper - def create_next_batched_job!(active_migration) - next_batch_range = find_next_batch_range(active_migration) - - return if next_batch_range.nil? - - active_migration.create_batched_job!(next_batch_range.min, next_batch_range.max) + def find_or_create_next_batched_job(active_migration) + if next_batch_range = find_next_batch_range(active_migration) + active_migration.create_batched_job!(next_batch_range.min, next_batch_range.max) + else + active_migration.batched_jobs.retriable.first + end end def find_next_batch_range(active_migration) @@ -80,7 +82,13 @@ module Gitlab end def finish_active_migration(active_migration) - active_migration.finished! + return if active_migration.batched_jobs.active.exists? + + if active_migration.batched_jobs.failed.exists? + active_migration.failed! + else + active_migration.finished! + end end end end diff --git a/lib/gitlab/database/background_migration/batched_migration_wrapper.rb b/lib/gitlab/database/background_migration/batched_migration_wrapper.rb index c276f8ce75b..e37df102872 100644 --- a/lib/gitlab/database/background_migration/batched_migration_wrapper.rb +++ b/lib/gitlab/database/background_migration/batched_migration_wrapper.rb @@ -19,10 +19,10 @@ module Gitlab execute_batch(batch_tracking_record) batch_tracking_record.status = :succeeded - rescue => e + rescue Exception # rubocop:disable Lint/RescueException batch_tracking_record.status = :failed - raise e + raise ensure finish_tracking_execution(batch_tracking_record) track_prometheus_metrics(batch_tracking_record) @@ -31,7 +31,7 @@ module Gitlab private def start_tracking_execution(tracking_record) - tracking_record.update!(attempts: tracking_record.attempts + 1, status: :running, started_at: Time.current) + tracking_record.update!(attempts: tracking_record.attempts + 1, status: :running, started_at: Time.current, finished_at: nil, metrics: {}) end def execute_batch(tracking_record) @@ -43,6 +43,7 @@ module Gitlab tracking_record.migration_table_name, tracking_record.migration_column_name, tracking_record.sub_batch_size, + tracking_record.pause_ms, *tracking_record.migration_job_arguments) if job_instance.respond_to?(:batch_metrics) @@ -61,11 +62,12 @@ module Gitlab metric_for(:gauge_batch_size).set(base_labels, tracking_record.batch_size) metric_for(:gauge_sub_batch_size).set(base_labels, tracking_record.sub_batch_size) + metric_for(:gauge_interval).set(base_labels, tracking_record.batched_migration.interval) + metric_for(:gauge_job_duration).set(base_labels, (tracking_record.finished_at - tracking_record.started_at).to_i) metric_for(:counter_updated_tuples).increment(base_labels, tracking_record.batch_size) - - # Time efficiency: Ratio of duration to interval (ideal: less than, but close to 1) - efficiency = (tracking_record.finished_at - tracking_record.started_at).to_i / migration.interval.to_f - metric_for(:histogram_time_efficiency).observe(base_labels, efficiency) + metric_for(:gauge_migrated_tuples).set(base_labels, tracking_record.batched_migration.migrated_tuple_count) + metric_for(:gauge_total_tuple_count).set(base_labels, tracking_record.batched_migration.total_tuple_count) + metric_for(:gauge_last_update_time).set(base_labels, Time.current.to_i) if metrics = tracking_record.metrics metrics['timings']&.each do |key, timings| @@ -94,21 +96,35 @@ module Gitlab :batched_migration_job_sub_batch_size, 'Sub-batch size for a batched migration job' ), + gauge_interval: Gitlab::Metrics.gauge( + :batched_migration_job_interval_seconds, + 'Interval for a batched migration job' + ), + gauge_job_duration: Gitlab::Metrics.gauge( + :batched_migration_job_duration_seconds, + 'Duration for a batched migration job' + ), counter_updated_tuples: Gitlab::Metrics.counter( :batched_migration_job_updated_tuples_total, 'Number of tuples updated by batched migration job' ), + gauge_migrated_tuples: Gitlab::Metrics.gauge( + :batched_migration_migrated_tuples_total, + 'Total number of tuples migrated by a batched migration' + ), histogram_timings: Gitlab::Metrics.histogram( - :batched_migration_job_duration_seconds, - 'Timings for a batched migration job', + :batched_migration_job_query_duration_seconds, + 'Query timings for a batched migration job', {}, [0.1, 0.25, 0.5, 1, 5].freeze ), - histogram_time_efficiency: Gitlab::Metrics.histogram( - :batched_migration_job_time_efficiency, - 'Ratio of job duration to interval', - {}, - [0.5, 0.9, 1, 1.5, 2].freeze + gauge_total_tuple_count: Gitlab::Metrics.gauge( + :batched_migration_total_tuple_count, + 'Total tuple count the migration needs to touch' + ), + gauge_last_update_time: Gitlab::Metrics.gauge( + :batched_migration_last_update_time_seconds, + 'Unix epoch time in seconds' ) } end |