summaryrefslogtreecommitdiff
path: root/lib/gitlab/database/background_migration
diff options
context:
space:
mode:
Diffstat (limited to 'lib/gitlab/database/background_migration')
-rw-r--r--lib/gitlab/database/background_migration/batch_optimizer.rb67
-rw-r--r--lib/gitlab/database/background_migration/batched_job.rb27
-rw-r--r--lib/gitlab/database/background_migration/batched_migration.rb41
-rw-r--r--lib/gitlab/database/background_migration/batched_migration_runner.rb24
-rw-r--r--lib/gitlab/database/background_migration/batched_migration_wrapper.rb44
5 files changed, 179 insertions, 24 deletions
diff --git a/lib/gitlab/database/background_migration/batch_optimizer.rb b/lib/gitlab/database/background_migration/batch_optimizer.rb
new file mode 100644
index 00000000000..0668490dda8
--- /dev/null
+++ b/lib/gitlab/database/background_migration/batch_optimizer.rb
@@ -0,0 +1,67 @@
+# frozen_string_literal: true
+
+module Gitlab
+ module Database
+ module BackgroundMigration
+ # This is an optimizer for throughput of batched migration jobs
+ #
+ # The underyling mechanic is based on the concept of time efficiency:
+ # time efficiency = job duration / interval
+ # Ideally, this is close but lower than 1 - so we're using time efficiently.
+ #
+ # We aim to land in the 90%-98% range, which gives the database a little breathing room
+ # in between.
+ #
+ # The optimizer is based on calculating the exponential moving average of time efficiencies
+ # for the last N jobs. If we're outside the range, we add 10% to or decrease by 20% of the batch size.
+ class BatchOptimizer
+ # Target time efficiency for a job
+ # Time efficiency is defined as: job duration / interval
+ TARGET_EFFICIENCY = (0.9..0.95).freeze
+
+ # Lower and upper bound for the batch size
+ ALLOWED_BATCH_SIZE = (1_000..2_000_000).freeze
+
+ # Limit for the multiplier of the batch size
+ MAX_MULTIPLIER = 1.2
+
+ # When smoothing time efficiency, use this many jobs
+ NUMBER_OF_JOBS = 20
+
+ # Smoothing factor for exponential moving average
+ EMA_ALPHA = 0.4
+
+ attr_reader :migration, :number_of_jobs, :ema_alpha
+
+ def initialize(migration, number_of_jobs: NUMBER_OF_JOBS, ema_alpha: EMA_ALPHA)
+ @migration = migration
+ @number_of_jobs = number_of_jobs
+ @ema_alpha = ema_alpha
+ end
+
+ def optimize!
+ return unless Feature.enabled?(:optimize_batched_migrations, type: :ops, default_enabled: :yaml)
+
+ if multiplier = batch_size_multiplier
+ migration.batch_size = (migration.batch_size * multiplier).to_i.clamp(ALLOWED_BATCH_SIZE)
+ migration.save!
+ end
+ end
+
+ private
+
+ def batch_size_multiplier
+ efficiency = migration.smoothed_time_efficiency(number_of_jobs: number_of_jobs, alpha: ema_alpha)
+
+ return if efficiency.nil? || efficiency == 0
+
+ # We hit the range - no change
+ return if TARGET_EFFICIENCY.include?(efficiency)
+
+ # Assumption: time efficiency is linear in the batch size
+ [TARGET_EFFICIENCY.max / efficiency, MAX_MULTIPLIER].min
+ end
+ end
+ end
+ end
+end
diff --git a/lib/gitlab/database/background_migration/batched_job.rb b/lib/gitlab/database/background_migration/batched_job.rb
index 3b624df2bfd..869b97b8ac0 100644
--- a/lib/gitlab/database/background_migration/batched_job.rb
+++ b/lib/gitlab/database/background_migration/batched_job.rb
@@ -4,10 +4,23 @@ module Gitlab
module Database
module BackgroundMigration
class BatchedJob < ActiveRecord::Base # rubocop:disable Rails/ApplicationRecord
+ include FromUnion
+
self.table_name = :batched_background_migration_jobs
+ MAX_ATTEMPTS = 3
+ STUCK_JOBS_TIMEOUT = 1.hour.freeze
+
belongs_to :batched_migration, foreign_key: :batched_background_migration_id
+ scope :active, -> { where(status: [:pending, :running]) }
+ scope :stuck, -> { active.where('updated_at <= ?', STUCK_JOBS_TIMEOUT.ago) }
+ scope :retriable, -> {
+ failed_jobs = where(status: :failed).where('attempts < ?', MAX_ATTEMPTS)
+
+ from_union([failed_jobs, self.stuck])
+ }
+
enum status: {
pending: 0,
running: 1,
@@ -15,8 +28,22 @@ module Gitlab
succeeded: 3
}
+ scope :successful_in_execution_order, -> { where.not(finished_at: nil).succeeded.order(:finished_at) }
+
delegate :aborted?, :job_class, :table_name, :column_name, :job_arguments,
to: :batched_migration, prefix: :migration
+
+ attribute :pause_ms, :integer, default: 100
+
+ def time_efficiency
+ return unless succeeded?
+ return unless finished_at && started_at
+
+ duration = finished_at - started_at
+
+ # TODO: Switch to individual job interval (prereq: https://gitlab.com/gitlab-org/gitlab/-/issues/328801)
+ duration.to_f / batched_migration.interval
+ end
end
end
end
diff --git a/lib/gitlab/database/background_migration/batched_migration.rb b/lib/gitlab/database/background_migration/batched_migration.rb
index 4aa33ed7946..e85162f355e 100644
--- a/lib/gitlab/database/background_migration/batched_migration.rb
+++ b/lib/gitlab/database/background_migration/batched_migration.rb
@@ -20,9 +20,12 @@ module Gitlab
paused: 0,
active: 1,
aborted: 2,
- finished: 3
+ finished: 3,
+ failed: 4
}
+ attribute :pause_ms, :integer, default: 100
+
def self.active_migration
active.queue_order.first
end
@@ -35,7 +38,13 @@ module Gitlab
end
def create_batched_job!(min, max)
- batched_jobs.create!(min_value: min, max_value: max, batch_size: batch_size, sub_batch_size: sub_batch_size)
+ batched_jobs.create!(
+ min_value: min,
+ max_value: max,
+ batch_size: batch_size,
+ sub_batch_size: sub_batch_size,
+ pause_ms: pause_ms
+ )
end
def next_min_value
@@ -58,12 +67,40 @@ module Gitlab
write_attribute(:batch_class_name, class_name.demodulize)
end
+ def migrated_tuple_count
+ batched_jobs.succeeded.sum(:batch_size)
+ end
+
def prometheus_labels
@prometheus_labels ||= {
migration_id: id,
migration_identifier: "%s/%s.%s" % [job_class_name, table_name, column_name]
}
end
+
+ def smoothed_time_efficiency(number_of_jobs: 10, alpha: 0.2)
+ jobs = batched_jobs.successful_in_execution_order.reverse_order.limit(number_of_jobs)
+
+ return if jobs.size < number_of_jobs
+
+ efficiencies = jobs.map(&:time_efficiency).reject(&:nil?).each_with_index
+
+ dividend = efficiencies.reduce(0) do |total, (job_eff, i)|
+ total + job_eff * (1 - alpha)**i
+ end
+
+ divisor = efficiencies.reduce(0) do |total, (job_eff, i)|
+ total + (1 - alpha)**i
+ end
+
+ return if divisor == 0
+
+ (dividend / divisor).round(2)
+ end
+
+ def optimize!
+ BatchOptimizer.new(self).optimize!
+ end
end
end
end
diff --git a/lib/gitlab/database/background_migration/batched_migration_runner.rb b/lib/gitlab/database/background_migration/batched_migration_runner.rb
index cf8b61f5feb..67fe6c536e6 100644
--- a/lib/gitlab/database/background_migration/batched_migration_runner.rb
+++ b/lib/gitlab/database/background_migration/batched_migration_runner.rb
@@ -19,8 +19,10 @@ module Gitlab
#
# Note that this method is primarily intended to called by a scheduled worker.
def run_migration_job(active_migration)
- if next_batched_job = create_next_batched_job!(active_migration)
+ if next_batched_job = find_or_create_next_batched_job(active_migration)
migration_wrapper.perform(next_batched_job)
+
+ active_migration.optimize!
else
finish_active_migration(active_migration)
end
@@ -46,12 +48,12 @@ module Gitlab
attr_reader :migration_wrapper
- def create_next_batched_job!(active_migration)
- next_batch_range = find_next_batch_range(active_migration)
-
- return if next_batch_range.nil?
-
- active_migration.create_batched_job!(next_batch_range.min, next_batch_range.max)
+ def find_or_create_next_batched_job(active_migration)
+ if next_batch_range = find_next_batch_range(active_migration)
+ active_migration.create_batched_job!(next_batch_range.min, next_batch_range.max)
+ else
+ active_migration.batched_jobs.retriable.first
+ end
end
def find_next_batch_range(active_migration)
@@ -80,7 +82,13 @@ module Gitlab
end
def finish_active_migration(active_migration)
- active_migration.finished!
+ return if active_migration.batched_jobs.active.exists?
+
+ if active_migration.batched_jobs.failed.exists?
+ active_migration.failed!
+ else
+ active_migration.finished!
+ end
end
end
end
diff --git a/lib/gitlab/database/background_migration/batched_migration_wrapper.rb b/lib/gitlab/database/background_migration/batched_migration_wrapper.rb
index c276f8ce75b..e37df102872 100644
--- a/lib/gitlab/database/background_migration/batched_migration_wrapper.rb
+++ b/lib/gitlab/database/background_migration/batched_migration_wrapper.rb
@@ -19,10 +19,10 @@ module Gitlab
execute_batch(batch_tracking_record)
batch_tracking_record.status = :succeeded
- rescue => e
+ rescue Exception # rubocop:disable Lint/RescueException
batch_tracking_record.status = :failed
- raise e
+ raise
ensure
finish_tracking_execution(batch_tracking_record)
track_prometheus_metrics(batch_tracking_record)
@@ -31,7 +31,7 @@ module Gitlab
private
def start_tracking_execution(tracking_record)
- tracking_record.update!(attempts: tracking_record.attempts + 1, status: :running, started_at: Time.current)
+ tracking_record.update!(attempts: tracking_record.attempts + 1, status: :running, started_at: Time.current, finished_at: nil, metrics: {})
end
def execute_batch(tracking_record)
@@ -43,6 +43,7 @@ module Gitlab
tracking_record.migration_table_name,
tracking_record.migration_column_name,
tracking_record.sub_batch_size,
+ tracking_record.pause_ms,
*tracking_record.migration_job_arguments)
if job_instance.respond_to?(:batch_metrics)
@@ -61,11 +62,12 @@ module Gitlab
metric_for(:gauge_batch_size).set(base_labels, tracking_record.batch_size)
metric_for(:gauge_sub_batch_size).set(base_labels, tracking_record.sub_batch_size)
+ metric_for(:gauge_interval).set(base_labels, tracking_record.batched_migration.interval)
+ metric_for(:gauge_job_duration).set(base_labels, (tracking_record.finished_at - tracking_record.started_at).to_i)
metric_for(:counter_updated_tuples).increment(base_labels, tracking_record.batch_size)
-
- # Time efficiency: Ratio of duration to interval (ideal: less than, but close to 1)
- efficiency = (tracking_record.finished_at - tracking_record.started_at).to_i / migration.interval.to_f
- metric_for(:histogram_time_efficiency).observe(base_labels, efficiency)
+ metric_for(:gauge_migrated_tuples).set(base_labels, tracking_record.batched_migration.migrated_tuple_count)
+ metric_for(:gauge_total_tuple_count).set(base_labels, tracking_record.batched_migration.total_tuple_count)
+ metric_for(:gauge_last_update_time).set(base_labels, Time.current.to_i)
if metrics = tracking_record.metrics
metrics['timings']&.each do |key, timings|
@@ -94,21 +96,35 @@ module Gitlab
:batched_migration_job_sub_batch_size,
'Sub-batch size for a batched migration job'
),
+ gauge_interval: Gitlab::Metrics.gauge(
+ :batched_migration_job_interval_seconds,
+ 'Interval for a batched migration job'
+ ),
+ gauge_job_duration: Gitlab::Metrics.gauge(
+ :batched_migration_job_duration_seconds,
+ 'Duration for a batched migration job'
+ ),
counter_updated_tuples: Gitlab::Metrics.counter(
:batched_migration_job_updated_tuples_total,
'Number of tuples updated by batched migration job'
),
+ gauge_migrated_tuples: Gitlab::Metrics.gauge(
+ :batched_migration_migrated_tuples_total,
+ 'Total number of tuples migrated by a batched migration'
+ ),
histogram_timings: Gitlab::Metrics.histogram(
- :batched_migration_job_duration_seconds,
- 'Timings for a batched migration job',
+ :batched_migration_job_query_duration_seconds,
+ 'Query timings for a batched migration job',
{},
[0.1, 0.25, 0.5, 1, 5].freeze
),
- histogram_time_efficiency: Gitlab::Metrics.histogram(
- :batched_migration_job_time_efficiency,
- 'Ratio of job duration to interval',
- {},
- [0.5, 0.9, 1, 1.5, 2].freeze
+ gauge_total_tuple_count: Gitlab::Metrics.gauge(
+ :batched_migration_total_tuple_count,
+ 'Total tuple count the migration needs to touch'
+ ),
+ gauge_last_update_time: Gitlab::Metrics.gauge(
+ :batched_migration_last_update_time_seconds,
+ 'Unix epoch time in seconds'
)
}
end