-rw-r--r-- | app/models/postgresql/replication_slot.rb        | 32
-rw-r--r-- | app/workers/background_migration_worker.rb       | 61
-rw-r--r-- | doc/development/background_migrations.md         |  3
-rw-r--r-- | lib/gitlab/background_migration.rb               |  6
-rw-r--r-- | lib/gitlab/database/migration_helpers.rb         |  4
-rw-r--r-- | spec/models/postgresql/replication_slot_spec.rb  | 31
-rw-r--r-- | spec/workers/background_migration_worker_spec.rb | 52
7 files changed, 179 insertions, 10 deletions
diff --git a/app/models/postgresql/replication_slot.rb b/app/models/postgresql/replication_slot.rb
new file mode 100644
index 00000000000..70c7432e6b5
--- /dev/null
+++ b/app/models/postgresql/replication_slot.rb
@@ -0,0 +1,32 @@
+# frozen_string_literal: true
+
+module Postgresql
+  class ReplicationSlot < ActiveRecord::Base
+    self.table_name = 'pg_replication_slots'
+
+    # Returns true if the lag observed across all replication slots exceeds a
+    # given threshold.
+    #
+    # max - The maximum replication lag size, in bytes. Based on GitLab.com
+    #       statistics it takes between 1 and 5 seconds to replicate around
+    #       100 MB of data.
+    def self.lag_too_great?(max = 100.megabytes)
+      lag_function = "#{Gitlab::Database.pg_wal_lsn_diff}" \
+        "(#{Gitlab::Database.pg_current_wal_insert_lsn}(), restart_lsn)::bigint"
+
+      # We force the use of a transaction here so the query always goes to the
+      # primary, even when using the EE DB load balancer.
+      sizes = transaction { pluck(lag_function) }
+      too_great = sizes.count { |size| size >= max }
+
+      # If too many replicas are falling behind too much, the availability of a
+      # GitLab instance might suffer. To prevent this from happening we require
+      # at least 1 replica to have data recent enough.
+      if sizes.any? && too_great.positive?
+        (sizes.length - too_great) <= 1
+      else
+        false
+      end
+    end
+  end
+end
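For context on the model above: on PostgreSQL 10 and later the lag expression it builds expands to pg_wal_lsn_diff(pg_current_wal_insert_lsn(), restart_lsn)::bigint; the Gitlab::Database helpers presumably fall back to the older pg_xlog_location_diff/pg_current_xlog_insert_location names on 9.x. A minimal sketch of exercising the check from a Rails console, assuming a PostgreSQL 10 primary; the returned values are made up for illustration:

    # Per-slot replication lag in bytes, computed on the primary (the
    # transaction forces the query away from any read replica).
    Postgresql::ReplicationSlot.transaction do
      Postgresql::ReplicationSlot.pluck(
        'pg_wal_lsn_diff(pg_current_wal_insert_lsn(), restart_lsn)::bigint'
      )
    end
    # => [52428800, 157286400]   # hypothetical: ~50 MB and ~150 MB behind

    Postgresql::ReplicationSlot.lag_too_great?               # default 100 MB threshold
    Postgresql::ReplicationSlot.lag_too_great?(50.megabytes) # stricter, hypothetical threshold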
diff --git a/app/workers/background_migration_worker.rb b/app/workers/background_migration_worker.rb
index eaec7d48f35..7d006cc348e 100644
--- a/app/workers/background_migration_worker.rb
+++ b/app/workers/background_migration_worker.rb
@@ -6,10 +6,22 @@ class BackgroundMigrationWorker
   # The minimum amount of time between processing two jobs of the same migration
   # class.
   #
-  # This interval is set to 5 minutes so autovacuuming and other maintenance
-  # related tasks have plenty of time to clean up after a migration has been
-  # performed.
-  MIN_INTERVAL = 5.minutes.to_i
+  # This interval is set to 2 or 5 minutes so autovacuuming and other
+  # maintenance related tasks have plenty of time to clean up after a migration
+  # has been performed.
+  def self.minimum_interval
+    if enable_health_check?
+      2.minutes.to_i
+    else
+      5.minutes.to_i
+    end
+  end
+
+  def self.enable_health_check?
+    Rails.env.development? ||
+      Rails.env.test? ||
+      Feature.enabled?('background_migration_health_check')
+  end
 
   # Performs the background migration.
   #
@@ -27,7 +39,8 @@ class BackgroundMigrationWorker
       # running a migration of this class or we ran one recently. In this case
       # we'll reschedule the job in such a way that it is picked up again around
       # the time the lease expires.
-      self.class.perform_in(ttl || MIN_INTERVAL, class_name, arguments)
+      self.class
+        .perform_in(ttl || self.class.minimum_interval, class_name, arguments)
     end
   end
 
@@ -39,17 +52,51 @@ class BackgroundMigrationWorker
       [true, nil]
     else
       lease = lease_for(class_name)
+      perform = !!lease.try_obtain
+
+      # If we managed to acquire the lease but the DB is not healthy, then we
+      # want to simply reschedule our job and try again _after_ the lease
+      # expires.
+      if perform && !healthy_database?
+        database_unhealthy_counter.increment
 
-      [lease.try_obtain, lease.ttl]
+        perform = false
+      end
+
+      [perform, lease.ttl]
     end
   end
 
   def lease_for(class_name)
     Gitlab::ExclusiveLease
-      .new("#{self.class.name}:#{class_name}", timeout: MIN_INTERVAL)
+      .new(lease_key_for(class_name), timeout: self.class.minimum_interval)
+  end
+
+  def lease_key_for(class_name)
+    "#{self.class.name}:#{class_name}"
   end
 
   def always_perform?
     Rails.env.test?
   end
+
+  # Returns true if the database is healthy enough to allow the migration to be
+  # performed.
+  #
+  # class_name - The name of the background migration that we might want to
+  #              run.
+  def healthy_database?
+    return true unless self.class.enable_health_check?
+
+    return true unless Gitlab::Database.postgresql?
+
+    !Postgresql::ReplicationSlot.lag_too_great?
+  end
+
+  def database_unhealthy_counter
+    Gitlab::Metrics.counter(
+      :background_migration_database_health_reschedules,
+      'The number of times a background migration is rescheduled because the database is unhealthy.'
+    )
+  end
 end
diff --git a/doc/development/background_migrations.md b/doc/development/background_migrations.md
index 16195cbbbdf..f0d5af9fcb5 100644
--- a/doc/development/background_migrations.md
+++ b/doc/development/background_migrations.md
@@ -5,6 +5,9 @@ otherwise take a very long time (hours, days, years, etc) to complete. For
 example, you can use background migrations to migrate data so that instead of
 storing data in a single JSON column the data is stored in a separate table.
 
+If the database cluster is considered to be in an unhealthy state, background
+migrations automatically reschedule themselves for a later point in time.
+
 ## When To Use Background Migrations
 
 >**Note:**
diff --git a/lib/gitlab/background_migration.rb b/lib/gitlab/background_migration.rb
index d3f66877672..36c85dec544 100644
--- a/lib/gitlab/background_migration.rb
+++ b/lib/gitlab/background_migration.rb
@@ -46,7 +46,11 @@ module Gitlab
     # arguments - The arguments to pass to the background migration's "perform"
     #             method.
     def self.perform(class_name, arguments)
-      const_get(class_name).new.perform(*arguments)
+      migration_class_for(class_name).new.perform(*arguments)
+    end
+
+    def self.migration_class_for(class_name)
+      const_get(class_name)
     end
   end
 end
diff --git a/lib/gitlab/database/migration_helpers.rb b/lib/gitlab/database/migration_helpers.rb
index 4fe5b4cc835..f39b3b6eb5b 100644
--- a/lib/gitlab/database/migration_helpers.rb
+++ b/lib/gitlab/database/migration_helpers.rb
@@ -979,8 +979,8 @@ into similar problems in the future (e.g. when new tables are created).
 
       # To not overload the worker too much we enforce a minimum interval both
       # when scheduling and performing jobs.
-      if delay_interval < BackgroundMigrationWorker::MIN_INTERVAL
-        delay_interval = BackgroundMigrationWorker::MIN_INTERVAL
+      if delay_interval < BackgroundMigrationWorker.minimum_interval
+        delay_interval = BackgroundMigrationWorker.minimum_interval
       end
 
       model_class.each_batch(of: batch_size) do |relation, index|
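The migration_helpers.rb hunk above appears to sit inside queue_background_migration_jobs_by_range_at_intervals, the helper that enqueues background migration jobs in batches, so any requested delay shorter than BackgroundMigrationWorker.minimum_interval is quietly raised to that minimum. A sketch of a post-deployment migration using that helper; ExampleModel, example_table and 'ExampleBackfill' are made-up names, and the plain ActiveRecord::Migration superclass assumes the pre-Rails 5 style of the time (newer Rails needs a version suffix such as ActiveRecord::Migration[5.0]):

    # Hypothetical post-deployment migration; only the helper and EachBatch are
    # real GitLab APIs, everything named Example* is illustrative.
    class ScheduleExampleBackfill < ActiveRecord::Migration
      include Gitlab::Database::MigrationHelpers

      DOWNTIME = false
      MIGRATION = 'ExampleBackfill'.freeze

      disable_ddl_transaction!

      # Minimal model so each_batch can walk the table in primary-key ranges.
      class ExampleModel < ActiveRecord::Base
        include EachBatch

        self.table_name = 'example_table'
      end

      def up
        # A 1-minute delay is below BackgroundMigrationWorker.minimum_interval,
        # so the helper bumps it to 2 (or 5) minutes before enqueueing jobs.
        queue_background_migration_jobs_by_range_at_intervals(
          ExampleModel, MIGRATION, 1.minute, batch_size: 10_000
        )
      end

      def down
        # Background migrations are not automatically reverted.
      end
    end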
diff --git a/spec/models/postgresql/replication_slot_spec.rb b/spec/models/postgresql/replication_slot_spec.rb
new file mode 100644
index 00000000000..919a7526803
--- /dev/null
+++ b/spec/models/postgresql/replication_slot_spec.rb
@@ -0,0 +1,31 @@
+# frozen_string_literal: true
+
+require 'spec_helper'
+
+describe Postgresql::ReplicationSlot, :postgresql do
+  describe '.lag_too_great?' do
+    it 'returns true when replication lag is too great' do
+      expect(described_class)
+        .to receive(:pluck)
+        .and_return([125.megabytes])
+
+      expect(described_class.lag_too_great?).to eq(true)
+    end
+
+    it 'returns false when more than one replicas is up to date enough' do
+      expect(described_class)
+        .to receive(:pluck)
+        .and_return([125.megabytes, 0.megabytes, 0.megabytes])
+
+      expect(described_class.lag_too_great?).to eq(false)
+    end
+
+    it 'returns false when replication lag is not too great' do
+      expect(described_class)
+        .to receive(:pluck)
+        .and_return([0.megabytes])
+
+      expect(described_class.lag_too_great?).to eq(false)
+    end
+  end
+end
diff --git a/spec/workers/background_migration_worker_spec.rb b/spec/workers/background_migration_worker_spec.rb
index d67e7698635..3bd072e7125 100644
--- a/spec/workers/background_migration_worker_spec.rb
+++ b/spec/workers/background_migration_worker_spec.rb
@@ -3,6 +3,12 @@ require 'spec_helper'
 describe BackgroundMigrationWorker, :sidekiq, :clean_gitlab_redis_shared_state do
   let(:worker) { described_class.new }
 
+  describe '.minimum_interval' do
+    it 'returns 2 minutes' do
+      expect(described_class.minimum_interval).to eq(2.minutes.to_i)
+    end
+  end
+
   describe '.perform' do
     it 'performs a background migration' do
       expect(Gitlab::BackgroundMigration)
@@ -28,5 +34,51 @@ describe BackgroundMigrationWorker, :sidekiq, :clean_gitlab_redis_shared_state d
 
       worker.perform('Foo', [10, 20])
     end
+
+    it 'reschedules a migration if the database is not healthy' do
+      allow(worker)
+        .to receive(:always_perform?)
+        .and_return(false)
+
+      allow(worker)
+        .to receive(:healthy_database?)
+        .and_return(false)
+
+      expect(described_class)
+        .to receive(:perform_in)
+        .with(a_kind_of(Numeric), 'Foo', [10, 20])
+
+      worker.perform('Foo', [10, 20])
+    end
   end
+
+  describe '#healthy_database?' do
+    context 'using MySQL', :mysql do
+      it 'returns true' do
+        expect(worker.healthy_database?).to eq(true)
+      end
+    end
+
+    context 'using PostgreSQL', :postgresql do
+      context 'when replication lag is too great' do
+        it 'returns false' do
+          allow(Postgresql::ReplicationSlot)
+            .to receive(:lag_too_great?)
+            .and_return(true)
+
+          expect(worker.healthy_database?).to eq(false)
+        end
+      end
+
+      context 'when replication lag is small enough' do
+        it 'returns true' do
+          allow(Postgresql::ReplicationSlot)
+            .to receive(:lag_too_great?)
+            .and_return(false)
+
+          expect(worker.healthy_database?).to eq(true)
+        end
+      end
+    end
+  end
 end
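Outside of development and test the health check is gated behind the background_migration_health_check feature flag introduced above. A short console sketch of turning it on and observing the effect, assuming the flag starts out disabled (Feature.enable and Feature.enabled? are GitLab's standard feature-flag calls):

    # Before: the flag is off, so the stricter interval and DB check are skipped.
    BackgroundMigrationWorker.enable_health_check?  # => false in production
    BackgroundMigrationWorker.minimum_interval      # => 300 (5 minutes)

    Feature.enable('background_migration_health_check')

    BackgroundMigrationWorker.enable_health_check?  # => true
    BackgroundMigrationWorker.minimum_interval      # => 120 (2 minutes)

    # Reschedules caused by an unhealthy database show up in the
    # background_migration_database_health_reschedules Prometheus counter
    # registered in the worker above.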