author    Yorick Peterse <yorickpeterse@gmail.com>  2018-07-19 17:16:47 +0200
committer Yorick Peterse <yorickpeterse@gmail.com>  2018-08-06 15:20:36 +0200
commit    91b752dce63147bc99d7784d3d37865efb5e9352 (patch)
tree      447dcd9dc5efcb14af5439f247d87938daf845dc
parent    5f742eb95a0080343167469ccabfeccd3630007d (diff)
Respond to DB health in background migrations
This changes the BackgroundMigration worker so that it checks the health of the database before performing a background migration. This in turn allows us to reduce the minimum interval, without having to worry about blowing things up if we schedule too many migrations.

In this setup, the BackgroundMigration worker will reschedule jobs as long as the database is considered to be in an unhealthy state. Once the database has recovered, the migration can be performed.

To determine whether the database is in a healthy state, we look at the replication lag of any replication slots defined on the primary. If the lag is deemed too great (100 MB by default) for too many slots, the migration is rescheduled for a later point in time.

The health checking code is hidden behind a feature flag, allowing us to disable it if necessary.
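The health check is gated behind the background_migration_health_check feature flag. As a rough illustration (not part of this commit), the flag can be toggled from a Rails console using GitLab's standard feature-flag helpers:

    # Enable the health check, and with it the shorter 2 minute minimum interval:
    Feature.enable('background_migration_health_check')

    # Disable it again: outside development/test this falls back to the previous
    # 5 minute interval and skips the replication lag check entirely.
    Feature.disable('background_migration_health_check')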
-rw-r--r--  app/models/postgresql/replication_slot.rb         32
-rw-r--r--  app/workers/background_migration_worker.rb        61
-rw-r--r--  doc/development/background_migrations.md           3
-rw-r--r--  lib/gitlab/background_migration.rb                  6
-rw-r--r--  lib/gitlab/database/migration_helpers.rb            4
-rw-r--r--  spec/models/postgresql/replication_slot_spec.rb    31
-rw-r--r--  spec/workers/background_migration_worker_spec.rb   52
7 files changed, 179 insertions, 10 deletions
diff --git a/app/models/postgresql/replication_slot.rb b/app/models/postgresql/replication_slot.rb
new file mode 100644
index 00000000000..70c7432e6b5
--- /dev/null
+++ b/app/models/postgresql/replication_slot.rb
@@ -0,0 +1,32 @@
+# frozen_string_literal: true
+
+module Postgresql
+ class ReplicationSlot < ActiveRecord::Base
+ self.table_name = 'pg_replication_slots'
+
+ # Returns true if too many replication slots are lagging behind the primary
+ # by more than the given threshold.
+ #
+ # max - The maximum replication lag size, in bytes. Based on GitLab.com
+ # statistics it takes between 1 and 5 seconds to replicate around
+ # 100 MB of data.
+ def self.lag_too_great?(max = 100.megabytes)
+ lag_function = "#{Gitlab::Database.pg_wal_lsn_diff}" \
+ "(#{Gitlab::Database.pg_current_wal_insert_lsn}(), restart_lsn)::bigint"
+
+ # We force the use of a transaction here so the query always goes to the
+ # primary, even when using the EE DB load balancer.
+ sizes = transaction { pluck(lag_function) }
+ too_great = sizes.count { |size| size >= max }
+
+ # If too many replicas are falling behind too much, the availability of a
+ # GitLab instance might suffer. To prevent this from happening, whenever any
+ # slot is lagging we require more than one replica to still be up to date.
+ if sizes.any? && too_great.positive?
+ (sizes.length - too_great) <= 1
+ else
+ false
+ end
+ end
+ end
+end
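As a usage sketch (not part of this commit), the check can be exercised directly from a Rails console on a PostgreSQL primary with replication slots configured. Gitlab::Database.pg_wal_lsn_diff and Gitlab::Database.pg_current_wal_insert_lsn presumably return the appropriate function name for the running PostgreSQL version, since these functions were renamed in PostgreSQL 10:

    # Uses the default 100 MB threshold:
    Postgresql::ReplicationSlot.lag_too_great?

    # A stricter, hypothetical threshold of 50 MB:
    Postgresql::ReplicationSlot.lag_too_great?(50.megabytes)

    # The transaction { pluck(...) } inside the model forces the query to the
    # primary (relevant when the EE database load balancer is in use), so the
    # lag figures are not themselves read from a lagging replica.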
diff --git a/app/workers/background_migration_worker.rb b/app/workers/background_migration_worker.rb
index eaec7d48f35..7d006cc348e 100644
--- a/app/workers/background_migration_worker.rb
+++ b/app/workers/background_migration_worker.rb
@@ -6,10 +6,22 @@ class BackgroundMigrationWorker
# The minimum amount of time between processing two jobs of the same migration
# class.
#
- # This interval is set to 5 minutes so autovacuuming and other maintenance
- # related tasks have plenty of time to clean up after a migration has been
- # performed.
- MIN_INTERVAL = 5.minutes.to_i
+ # This interval is set to 2 or 5 minutes so autovacuuming and other
+ # maintenance related tasks have plenty of time to clean up after a migration
+ # has been performed.
+ def self.minimum_interval
+ if enable_health_check?
+ 2.minutes.to_i
+ else
+ 5.minutes.to_i
+ end
+ end
+
+ def self.enable_health_check?
+ Rails.env.development? ||
+ Rails.env.test? ||
+ Feature.enabled?('background_migration_health_check')
+ end
# Performs the background migration.
#
@@ -27,7 +39,8 @@ class BackgroundMigrationWorker
# running a migration of this class or we ran one recently. In this case
# we'll reschedule the job in such a way that it is picked up again around
# the time the lease expires.
- self.class.perform_in(ttl || MIN_INTERVAL, class_name, arguments)
+ self.class
+ .perform_in(ttl || self.class.minimum_interval, class_name, arguments)
end
end
@@ -39,17 +52,51 @@ class BackgroundMigrationWorker
[true, nil]
else
lease = lease_for(class_name)
+ perform = !!lease.try_obtain
+
+ # If we managed to acquire the lease but the DB is not healthy, then we
+ # want to simply reschedule our job and try again _after_ the lease
+ # expires.
+ if perform && !healthy_database?
+ database_unhealthy_counter.increment
- [lease.try_obtain, lease.ttl]
+ perform = false
+ end
+
+ [perform, lease.ttl]
end
end
def lease_for(class_name)
Gitlab::ExclusiveLease
- .new("#{self.class.name}:#{class_name}", timeout: MIN_INTERVAL)
+ .new(lease_key_for(class_name), timeout: self.class.minimum_interval)
+ end
+
+ def lease_key_for(class_name)
+ "#{self.class.name}:#{class_name}"
end
def always_perform?
Rails.env.test?
end
+
+ # Returns true if the database is healthy enough to allow the migration to be
+ # performed.
+ def healthy_database?
+ return true unless self.class.enable_health_check?
+
+ return true unless Gitlab::Database.postgresql?
+
+ !Postgresql::ReplicationSlot.lag_too_great?
+ end
+
+ def database_unhealthy_counter
+ Gitlab::Metrics.counter(
+ :background_migration_database_health_reschedules,
+ 'The number of times a background migration is rescheduled because the database is unhealthy.'
+ )
+ end
end
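Pieced together from the hunks above, the worker's perform flow after this change looks roughly as follows (a condensed sketch, not the literal file contents):

    def perform(class_name, arguments = [])
      should_perform, ttl = perform_and_ttl(class_name)

      if should_perform
        Gitlab::BackgroundMigration.perform(class_name, arguments)
      else
        # Either another process holds the lease for this migration class, or
        # the database is currently unhealthy; retry around the time the lease
        # window expires.
        self.class
          .perform_in(ttl || self.class.minimum_interval, class_name, arguments)
      end
    end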
diff --git a/doc/development/background_migrations.md b/doc/development/background_migrations.md
index 16195cbbbdf..f0d5af9fcb5 100644
--- a/doc/development/background_migrations.md
+++ b/doc/development/background_migrations.md
@@ -5,6 +5,9 @@ otherwise take a very long time (hours, days, years, etc) to complete. For
example, you can use background migrations to migrate data so that instead of
storing data in a single JSON column the data is stored in a separate table.
+If the database cluster is considered to be in an unhealthy state, background
+migrations automatically reschedule themselves for a later point in time.
+
## When To Use Background Migrations
>**Note:**
diff --git a/lib/gitlab/background_migration.rb b/lib/gitlab/background_migration.rb
index d3f66877672..36c85dec544 100644
--- a/lib/gitlab/background_migration.rb
+++ b/lib/gitlab/background_migration.rb
@@ -46,7 +46,11 @@ module Gitlab
# arguments - The arguments to pass to the background migration's "perform"
# method.
def self.perform(class_name, arguments)
- const_get(class_name).new.perform(*arguments)
+ migration_class_for(class_name).new.perform(*arguments)
+ end
+
+ def self.migration_class_for(class_name)
+ const_get(class_name)
end
end
end
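Extracting migration_class_for keeps the behaviour of perform identical while giving callers and specs a single place to resolve the migration class. A rough illustration, using a hypothetical migration name:

    # Resolves the constant within the Gitlab::BackgroundMigration namespace:
    Gitlab::BackgroundMigration.migration_class_for('MigrateFooToBar')
    # => Gitlab::BackgroundMigration::MigrateFooToBar (hypothetical class)

    # Running a migration inline, e.g. from a console or a spec:
    Gitlab::BackgroundMigration.perform('MigrateFooToBar', [1, 100])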
diff --git a/lib/gitlab/database/migration_helpers.rb b/lib/gitlab/database/migration_helpers.rb
index 4fe5b4cc835..f39b3b6eb5b 100644
--- a/lib/gitlab/database/migration_helpers.rb
+++ b/lib/gitlab/database/migration_helpers.rb
@@ -979,8 +979,8 @@ into similar problems in the future (e.g. when new tables are created).
# To avoid overloading the worker we enforce a minimum interval both when
# scheduling and when performing jobs.
- if delay_interval < BackgroundMigrationWorker::MIN_INTERVAL
- delay_interval = BackgroundMigrationWorker::MIN_INTERVAL
+ if delay_interval < BackgroundMigrationWorker.minimum_interval
+ delay_interval = BackgroundMigrationWorker.minimum_interval
end
model_class.each_batch(of: batch_size) do |relation, index|
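For migrations that schedule background jobs, the change is transparent: the helper shown in this hunk clamps any interval below the minimum. A hedged sketch of calling it from a post-deployment migration (the model and job class names are hypothetical, and the surrounding migration class boilerplate is omitted):

    # Inside a post-deployment migration that includes
    # Gitlab::Database::MigrationHelpers:
    queue_background_migration_jobs_by_range_at_intervals(
      Foo,                # hypothetical model iterated over in batches
      'MigrateFooToBar',  # hypothetical background migration class name
      2.minutes           # raised to BackgroundMigrationWorker.minimum_interval if too small
    )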
diff --git a/spec/models/postgresql/replication_slot_spec.rb b/spec/models/postgresql/replication_slot_spec.rb
new file mode 100644
index 00000000000..919a7526803
--- /dev/null
+++ b/spec/models/postgresql/replication_slot_spec.rb
@@ -0,0 +1,31 @@
+# frozen_string_literal: true
+
+require 'spec_helper'
+
+describe Postgresql::ReplicationSlot, :postgresql do
+ describe '.lag_too_great?' do
+ it 'returns true when replication lag is too great' do
+ expect(described_class)
+ .to receive(:pluck)
+ .and_return([125.megabytes])
+
+ expect(described_class.lag_too_great?).to eq(true)
+ end
+
+ it 'returns false when more than one replica is up to date' do
+ expect(described_class)
+ .to receive(:pluck)
+ .and_return([125.megabytes, 0.megabytes, 0.megabytes])
+
+ expect(described_class.lag_too_great?).to eq(false)
+ end
+
+ it 'returns false when replication lag is not too great' do
+ expect(described_class)
+ .to receive(:pluck)
+ .and_return([0.megabytes])
+
+ expect(described_class.lag_too_great?).to eq(false)
+ end
+ end
+end
diff --git a/spec/workers/background_migration_worker_spec.rb b/spec/workers/background_migration_worker_spec.rb
index d67e7698635..3bd072e7125 100644
--- a/spec/workers/background_migration_worker_spec.rb
+++ b/spec/workers/background_migration_worker_spec.rb
@@ -3,6 +3,12 @@ require 'spec_helper'
describe BackgroundMigrationWorker, :sidekiq, :clean_gitlab_redis_shared_state do
let(:worker) { described_class.new }
+ describe '.minimum_interval' do
+ it 'returns 2 minutes' do
+ expect(described_class.minimum_interval).to eq(2.minutes.to_i)
+ end
+ end
+
describe '.perform' do
it 'performs a background migration' do
expect(Gitlab::BackgroundMigration)
@@ -28,5 +34,51 @@ describe BackgroundMigrationWorker, :sidekiq, :clean_gitlab_redis_shared_state d
worker.perform('Foo', [10, 20])
end
+
+ it 'reschedules a migration if the database is not healthy' do
+ allow(worker)
+ .to receive(:always_perform?)
+ .and_return(false)
+
+ allow(worker)
+ .to receive(:healthy_database?)
+ .and_return(false)
+
+ expect(described_class)
+ .to receive(:perform_in)
+ .with(a_kind_of(Numeric), 'Foo', [10, 20])
+
+ worker.perform('Foo', [10, 20])
+ end
+ end
+
+ describe '#healthy_database?' do
+ context 'using MySQL', :mysql do
+ it 'returns true' do
+ expect(worker.healthy_database?).to eq(true)
+ end
+ end
+
+ context 'using PostgreSQL', :postgresql do
+ context 'when replication lag is too great' do
+ it 'returns false' do
+ allow(Postgresql::ReplicationSlot)
+ .to receive(:lag_too_great?)
+ .and_return(true)
+
+ expect(worker.healthy_database?).to eq(false)
+ end
+ end
+
+ context 'when replication lag is small enough' do
+ it 'returns true' do
+ allow(Postgresql::ReplicationSlot)
+ .to receive(:lag_too_great?)
+ .and_return(false)
+
+ expect(worker.healthy_database?).to eq(true)
+ end
+ end
+ end
end
end