-rw-r--r-- | app/models/postgresql/replication_slot.rb        | 32
-rw-r--r-- | app/workers/background_migration_worker.rb       | 61
-rw-r--r-- | doc/development/background_migrations.md         |  3
-rw-r--r-- | lib/gitlab/background_migration.rb               |  6
-rw-r--r-- | lib/gitlab/database/migration_helpers.rb         |  4
-rw-r--r-- | spec/models/postgresql/replication_slot_spec.rb  | 31
-rw-r--r-- | spec/workers/background_migration_worker_spec.rb | 52
7 files changed, 179 insertions, 10 deletions
diff --git a/app/models/postgresql/replication_slot.rb b/app/models/postgresql/replication_slot.rb
new file mode 100644
index 00000000000..70c7432e6b5
--- /dev/null
+++ b/app/models/postgresql/replication_slot.rb
@@ -0,0 +1,32 @@
+# frozen_string_literal: true
+
+module Postgresql
+  class ReplicationSlot < ActiveRecord::Base
+    self.table_name = 'pg_replication_slots'
+
+    # Returns true if the lag observed across all replication slots exceeds a
+    # given threshold.
+    #
+    # max - The maximum replication lag size, in bytes. Based on GitLab.com
+    #       statistics it takes between 1 and 5 seconds to replicate around
+    #       100 MB of data.
+    def self.lag_too_great?(max = 100.megabytes)
+      lag_function = "#{Gitlab::Database.pg_wal_lsn_diff}" \
+        "(#{Gitlab::Database.pg_current_wal_insert_lsn}(), restart_lsn)::bigint"
+
+      # We force the use of a transaction here so the query always goes to the
+      # primary, even when using the EE DB load balancer.
+      sizes = transaction { pluck(lag_function) }
+      too_great = sizes.count { |size| size >= max }
+
+      # If too many replicas are falling behind too much, the availability of a
+      # GitLab instance might suffer. To prevent this from happening we require
+      # at least 1 replica to have data recent enough.
+      if sizes.any? && too_great.positive?
+        (sizes.length - too_great) <= 1
+      else
+        false
+      end
+    end
+  end
+end
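For context on the model above: on PostgreSQL 10 and later the lag expression it builds expands to pg_wal_lsn_diff(pg_current_wal_insert_lsn(), restart_lsn)::bigint; the Gitlab::Database helpers presumably fall back to the older pg_xlog_location_diff/pg_current_xlog_insert_location names on 9.x. A minimal sketch of exercising the check from a Rails console, assuming a PostgreSQL 10 primary; the returned values are made up for illustration:

    # Per-slot replication lag in bytes, computed on the primary (the
    # transaction forces the query away from any read replica).
    Postgresql::ReplicationSlot.transaction do
      Postgresql::ReplicationSlot.pluck(
        'pg_wal_lsn_diff(pg_current_wal_insert_lsn(), restart_lsn)::bigint'
      )
    end
    # => [52428800, 157286400]   # hypothetical: ~50 MB and ~150 MB behind

    Postgresql::ReplicationSlot.lag_too_great?               # default 100 MB threshold
    Postgresql::ReplicationSlot.lag_too_great?(50.megabytes) # stricter, hypothetical threshold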
diff --git a/app/workers/background_migration_worker.rb b/app/workers/background_migration_worker.rb
index eaec7d48f35..7d006cc348e 100644
--- a/app/workers/background_migration_worker.rb
+++ b/app/workers/background_migration_worker.rb
@@ -6,10 +6,22 @@ class BackgroundMigrationWorker
   # The minimum amount of time between processing two jobs of the same migration
   # class.
   #
-  # This interval is set to 5 minutes so autovacuuming and other maintenance
-  # related tasks have plenty of time to clean up after a migration has been
-  # performed.
-  MIN_INTERVAL = 5.minutes.to_i
+  # This interval is set to 2 or 5 minutes so autovacuuming and other
+  # maintenance related tasks have plenty of time to clean up after a migration
+  # has been performed.
+  def self.minimum_interval
+    if enable_health_check?
+      2.minutes.to_i
+    else
+      5.minutes.to_i
+    end
+  end
+
+  def self.enable_health_check?
+    Rails.env.development? ||
+      Rails.env.test? ||
+      Feature.enabled?('background_migration_health_check')
+  end
 
   # Performs the background migration.
   #
@@ -27,7 +39,8 @@ class BackgroundMigrationWorker
       # running a migration of this class or we ran one recently. In this case
       # we'll reschedule the job in such a way that it is picked up again around
       # the time the lease expires.
-      self.class.perform_in(ttl || MIN_INTERVAL, class_name, arguments)
+      self.class
+        .perform_in(ttl || self.class.minimum_interval, class_name, arguments)
     end
   end
 
@@ -39,17 +52,51 @@ class BackgroundMigrationWorker
       [true, nil]
     else
       lease = lease_for(class_name)
+      perform = !!lease.try_obtain
+
+      # If we managed to acquire the lease but the DB is not healthy, then we
+      # want to simply reschedule our job and try again _after_ the lease
+      # expires.
+      if perform && !healthy_database?
+        database_unhealthy_counter.increment
 
-      [lease.try_obtain, lease.ttl]
+        perform = false
+      end
+
+      [perform, lease.ttl]
     end
   end
 
   def lease_for(class_name)
     Gitlab::ExclusiveLease
-      .new("#{self.class.name}:#{class_name}", timeout: MIN_INTERVAL)
+      .new(lease_key_for(class_name), timeout: self.class.minimum_interval)
+  end
+
+  def lease_key_for(class_name)
+    "#{self.class.name}:#{class_name}"
   end
 
   def always_perform?
     Rails.env.test?
   end
+
+  # Returns true if the database is healthy enough to allow the migration to be
+  # performed.
+  #
+  # class_name - The name of the background migration that we might want to
+  #              run.
+  def healthy_database?
+    return true unless self.class.enable_health_check?
+
+    return true unless Gitlab::Database.postgresql?
+
+    !Postgresql::ReplicationSlot.lag_too_great?
+  end
+
+  def database_unhealthy_counter
+    Gitlab::Metrics.counter(
+      :background_migration_database_health_reschedules,
+      'The number of times a background migration is rescheduled because the database is unhealthy.'
+    )
+  end
 end
diff --git a/doc/development/background_migrations.md b/doc/development/background_migrations.md
index 16195cbbbdf..f0d5af9fcb5 100644
--- a/doc/development/background_migrations.md
+++ b/doc/development/background_migrations.md
@@ -5,6 +5,9 @@ otherwise take a very long time (hours, days, years, etc) to complete. For
 example, you can use background migrations to migrate data so that instead of
 storing data in a single JSON column the data is stored in a separate table.
 
+If the database cluster is considered to be in an unhealthy state, background
+migrations automatically reschedule themselves for a later point in time.
+
 ## When To Use Background Migrations
 
 >**Note:**
diff --git a/lib/gitlab/background_migration.rb b/lib/gitlab/background_migration.rb
index d3f66877672..36c85dec544 100644
--- a/lib/gitlab/background_migration.rb
+++ b/lib/gitlab/background_migration.rb
@@ -46,7 +46,11 @@ module Gitlab
     # arguments - The arguments to pass to the background migration's "perform"
     #             method.
     def self.perform(class_name, arguments)
-      const_get(class_name).new.perform(*arguments)
+      migration_class_for(class_name).new.perform(*arguments)
+    end
+
+    def self.migration_class_for(class_name)
+      const_get(class_name)
     end
   end
 end
diff --git a/lib/gitlab/database/migration_helpers.rb b/lib/gitlab/database/migration_helpers.rb
index 4fe5b4cc835..f39b3b6eb5b 100644
--- a/lib/gitlab/database/migration_helpers.rb
+++ b/lib/gitlab/database/migration_helpers.rb
@@ -979,8 +979,8 @@ into similar problems in the future (e.g. when new tables are created).
 
       # To not overload the worker too much we enforce a minimum interval both
       # when scheduling and performing jobs.
-      if delay_interval < BackgroundMigrationWorker::MIN_INTERVAL
-        delay_interval = BackgroundMigrationWorker::MIN_INTERVAL
+      if delay_interval < BackgroundMigrationWorker.minimum_interval
+        delay_interval = BackgroundMigrationWorker.minimum_interval
       end
 
       model_class.each_batch(of: batch_size) do |relation, index|
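The migration_helpers.rb hunk above appears to sit inside queue_background_migration_jobs_by_range_at_intervals, the helper that enqueues background migration jobs in batches, so any requested delay shorter than BackgroundMigrationWorker.minimum_interval is quietly raised to that minimum. A sketch of a post-deployment migration using that helper; ExampleModel, example_table and 'ExampleBackfill' are made-up names, and the plain ActiveRecord::Migration superclass assumes the pre-Rails 5 style of the time (newer Rails needs a version suffix such as ActiveRecord::Migration[5.0]):

    # Hypothetical post-deployment migration; only the helper and EachBatch are
    # real GitLab APIs, everything named Example* is illustrative.
    class ScheduleExampleBackfill < ActiveRecord::Migration
      include Gitlab::Database::MigrationHelpers

      DOWNTIME = false
      MIGRATION = 'ExampleBackfill'.freeze

      disable_ddl_transaction!

      # Minimal model so each_batch can walk the table in primary-key ranges.
      class ExampleModel < ActiveRecord::Base
        include EachBatch

        self.table_name = 'example_table'
      end

      def up
        # A 1-minute delay is below BackgroundMigrationWorker.minimum_interval,
        # so the helper bumps it to 2 (or 5) minutes before enqueueing jobs.
        queue_background_migration_jobs_by_range_at_intervals(
          ExampleModel, MIGRATION, 1.minute, batch_size: 10_000
        )
      end

      def down
        # Background migrations are not automatically reverted.
      end
    end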
diff --git a/spec/models/postgresql/replication_slot_spec.rb b/spec/models/postgresql/replication_slot_spec.rb
new file mode 100644
index 00000000000..919a7526803
--- /dev/null
+++ b/spec/models/postgresql/replication_slot_spec.rb
@@ -0,0 +1,31 @@
+# frozen_string_literal: true
+
+require 'spec_helper'
+
+describe Postgresql::ReplicationSlot, :postgresql do
+  describe '.lag_too_great?' do
+    it 'returns true when replication lag is too great' do
+      expect(described_class)
+        .to receive(:pluck)
+        .and_return([125.megabytes])
+
+      expect(described_class.lag_too_great?).to eq(true)
+    end
+
+    it 'returns false when more than one replicas is up to date enough' do
+      expect(described_class)
+        .to receive(:pluck)
+        .and_return([125.megabytes, 0.megabytes, 0.megabytes])
+
+      expect(described_class.lag_too_great?).to eq(false)
+    end
+
+    it 'returns false when replication lag is not too great' do
+      expect(described_class)
+        .to receive(:pluck)
+        .and_return([0.megabytes])
+
+      expect(described_class.lag_too_great?).to eq(false)
+    end
+  end
+end
diff --git a/spec/workers/background_migration_worker_spec.rb b/spec/workers/background_migration_worker_spec.rb
index d67e7698635..3bd072e7125 100644
--- a/spec/workers/background_migration_worker_spec.rb
+++ b/spec/workers/background_migration_worker_spec.rb
@@ -3,6 +3,12 @@ require 'spec_helper'
 describe BackgroundMigrationWorker, :sidekiq, :clean_gitlab_redis_shared_state do
   let(:worker) { described_class.new }
 
+  describe '.minimum_interval' do
+    it 'returns 2 minutes' do
+      expect(described_class.minimum_interval).to eq(2.minutes.to_i)
+    end
+  end
+
   describe '.perform' do
     it 'performs a background migration' do
       expect(Gitlab::BackgroundMigration)
@@ -28,5 +34,51 @@ describe BackgroundMigrationWorker, :sidekiq, :clean_gitlab_redis_shared_state d
 
       worker.perform('Foo', [10, 20])
     end
+
+    it 'reschedules a migration if the database is not healthy' do
+      allow(worker)
+        .to receive(:always_perform?)
+        .and_return(false)
+
+      allow(worker)
+        .to receive(:healthy_database?)
+        .and_return(false)
+
+      expect(described_class)
+        .to receive(:perform_in)
+        .with(a_kind_of(Numeric), 'Foo', [10, 20])
+
+      worker.perform('Foo', [10, 20])
+    end
   end
+
+  describe '#healthy_database?' do
+    context 'using MySQL', :mysql do
+      it 'returns true' do
+        expect(worker.healthy_database?).to eq(true)
+      end
+    end
+
+    context 'using PostgreSQL', :postgresql do
+      context 'when replication lag is too great' do
+        it 'returns false' do
+          allow(Postgresql::ReplicationSlot)
+            .to receive(:lag_too_great?)
+            .and_return(true)
+
+          expect(worker.healthy_database?).to eq(false)
+        end
+      end
+
+      context 'when replication lag is small enough' do
+        it 'returns true' do
+          allow(Postgresql::ReplicationSlot)
+            .to receive(:lag_too_great?)
+            .and_return(false)
+
+          expect(worker.healthy_database?).to eq(true)
+        end
+      end
+    end
+  end
 end
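Outside of development and test the health check is gated behind the background_migration_health_check feature flag introduced above. A short console sketch of turning it on and observing the effect, assuming the flag starts out disabled (Feature.enable and Feature.enabled? are GitLab's standard feature-flag calls):

    # Before: the flag is off, so the stricter interval and DB check are skipped.
    BackgroundMigrationWorker.enable_health_check?  # => false in production
    BackgroundMigrationWorker.minimum_interval      # => 300 (5 minutes)

    Feature.enable('background_migration_health_check')

    BackgroundMigrationWorker.enable_health_check?  # => true
    BackgroundMigrationWorker.minimum_interval      # => 120 (2 minutes)

    # Reschedules caused by an unhealthy database show up in the
    # background_migration_database_health_reschedules Prometheus counter
    # registered in the worker above.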