author | Stan Hu <stanhu@gmail.com> | 2018-08-06 16:16:09 +0000
---|---|---
committer | Stan Hu <stanhu@gmail.com> | 2018-08-06 16:16:09 +0000
commit | 964d9f431f64754f171c5c523309417447c2ee71 (patch) |
tree | d9d06c1ba2397ce9717a8ce0b01deb96d2bcce6b /app |
parent | b4415c01740430cef58baf9bb0cbda2fb1055edb (diff) |
parent | 1e5192cc8c2ebd3e0d740f3a044b7f5e4c086730 (diff) |
download | gitlab-ce-964d9f431f64754f171c5c523309417447c2ee71.tar.gz |
Merge branch 'background-migrations-system-load' into 'master'
Respond to DB health in background migrations
See merge request gitlab-org/gitlab-ce!20720
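
The change gates each background migration job on database health: before running, the worker checks replication-slot lag and, if replicas are too far behind, reschedules itself instead of running. A minimal sketch of that flow follows, using only the class and method names introduced in the diff below; the `run_or_reschedule` wrapper and its `class_name`/`arguments` parameters are illustrative stand-ins, not code from this merge request.

```ruby
# Illustrative sketch, assuming a GitLab Rails environment where the classes
# touched by this merge request are loaded. `run_or_reschedule`, `class_name`
# and `arguments` are hypothetical stand-ins for the worker's job arguments.
def run_or_reschedule(class_name, arguments)
  db_healthy =
    !BackgroundMigrationWorker.enable_health_check? ||
    !Gitlab::Database.postgresql? ||
    !Postgresql::ReplicationSlot.lag_too_great?

  if db_healthy
    # ... run the next chunk of the background migration as before ...
  else
    # Replicas are lagging too far behind: requeue the job so the database
    # gets at least `minimum_interval` seconds to catch up.
    BackgroundMigrationWorker.perform_in(
      BackgroundMigrationWorker.minimum_interval, class_name, arguments
    )
  end
end
```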
Diffstat (limited to 'app')

-rw-r--r-- | app/models/postgresql/replication_slot.rb | 32
-rw-r--r-- | app/workers/background_migration_worker.rb | 61

2 files changed, 86 insertions, 7 deletions
diff --git a/app/models/postgresql/replication_slot.rb b/app/models/postgresql/replication_slot.rb
new file mode 100644
index 00000000000..70c7432e6b5
--- /dev/null
+++ b/app/models/postgresql/replication_slot.rb
@@ -0,0 +1,32 @@
+# frozen_string_literal: true
+
+module Postgresql
+  class ReplicationSlot < ActiveRecord::Base
+    self.table_name = 'pg_replication_slots'
+
+    # Returns true if the lag observed across all replication slots exceeds a
+    # given threshold.
+    #
+    # max - The maximum replication lag size, in bytes. Based on GitLab.com
+    #       statistics it takes between 1 and 5 seconds to replicate around
+    #       100 MB of data.
+    def self.lag_too_great?(max = 100.megabytes)
+      lag_function = "#{Gitlab::Database.pg_wal_lsn_diff}" \
+        "(#{Gitlab::Database.pg_current_wal_insert_lsn}(), restart_lsn)::bigint"
+
+      # We force the use of a transaction here so the query always goes to the
+      # primary, even when using the EE DB load balancer.
+      sizes = transaction { pluck(lag_function) }
+      too_great = sizes.count { |size| size >= max }
+
+      # If too many replicas are falling behind too much, the availability of a
+      # GitLab instance might suffer. To prevent this from happening we require
+      # at least 1 replica to have data recent enough.
+      if sizes.any? && too_great.positive?
+        (sizes.length - too_great) <= 1
+      else
+        false
+      end
+    end
+  end
+end
diff --git a/app/workers/background_migration_worker.rb b/app/workers/background_migration_worker.rb
index eaec7d48f35..7d006cc348e
--- a/app/workers/background_migration_worker.rb
+++ b/app/workers/background_migration_worker.rb
@@ -6,10 +6,22 @@ class BackgroundMigrationWorker
   # The minimum amount of time between processing two jobs of the same migration
   # class.
   #
-  # This interval is set to 5 minutes so autovacuuming and other maintenance
-  # related tasks have plenty of time to clean up after a migration has been
-  # performed.
-  MIN_INTERVAL = 5.minutes.to_i
+  # This interval is set to 2 or 5 minutes so autovacuuming and other
+  # maintenance related tasks have plenty of time to clean up after a migration
+  # has been performed.
+  def self.minimum_interval
+    if enable_health_check?
+      2.minutes.to_i
+    else
+      5.minutes.to_i
+    end
+  end
+
+  def self.enable_health_check?
+    Rails.env.development? ||
+      Rails.env.test? ||
+      Feature.enabled?('background_migration_health_check')
+  end
 
   # Performs the background migration.
   #
@@ -27,7 +39,8 @@ class BackgroundMigrationWorker
       # running a migration of this class or we ran one recently. In this case
       # we'll reschedule the job in such a way that it is picked up again around
       # the time the lease expires.
-      self.class.perform_in(ttl || MIN_INTERVAL, class_name, arguments)
+      self.class
+        .perform_in(ttl || self.class.minimum_interval, class_name, arguments)
     end
   end
 
@@ -39,17 +52,51 @@ class BackgroundMigrationWorker
       [true, nil]
     else
       lease = lease_for(class_name)
+      perform = !!lease.try_obtain
+
+      # If we managed to acquire the lease but the DB is not healthy, then we
+      # want to simply reschedule our job and try again _after_ the lease
+      # expires.
+      if perform && !healthy_database?
+        database_unhealthy_counter.increment
 
-      [lease.try_obtain, lease.ttl]
+        perform = false
+      end
+
+      [perform, lease.ttl]
     end
   end
 
   def lease_for(class_name)
     Gitlab::ExclusiveLease
-      .new("#{self.class.name}:#{class_name}", timeout: MIN_INTERVAL)
+      .new(lease_key_for(class_name), timeout: self.class.minimum_interval)
+  end
+
+  def lease_key_for(class_name)
+    "#{self.class.name}:#{class_name}"
   end
 
   def always_perform?
     Rails.env.test?
   end
+
+  # Returns true if the database is healthy enough to allow the migration to be
+  # performed.
+  #
+  # class_name - The name of the background migration that we might want to
+  #              run.
+  def healthy_database?
+    return true unless self.class.enable_health_check?
+
+    return true unless Gitlab::Database.postgresql?
+
+    !Postgresql::ReplicationSlot.lag_too_great?
+  end
+
+  def database_unhealthy_counter
+    Gitlab::Metrics.counter(
+      :background_migration_database_health_reschedules,
+      'The number of times a background migration is rescheduled because the database is unhealthy.'
+    )
+  end
 end
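
The heart of `Postgresql::ReplicationSlot.lag_too_great?` is the replica-counting decision shown above. Below is a standalone sketch of just that decision, with per-slot lag sizes passed in as plain integers instead of being plucked from `pg_replication_slots` via `pg_wal_lsn_diff`, so it can be run outside Rails (`100 * 1024 * 1024` stands in for ActiveSupport's `100.megabytes`).

```ruby
# Standalone illustration of the decision inside lag_too_great? above.
# `sizes` holds one lag value in bytes per replication slot.
def lag_too_great?(sizes, max = 100 * 1024 * 1024)
  too_great = sizes.count { |size| size >= max }

  # The check only trips when at most one replica is still within the limit.
  if !sizes.empty? && too_great > 0
    (sizes.length - too_great) <= 1
  else
    false
  end
end

puts lag_too_great?([10, 20, 300_000_000])            # false: two replicas are current
puts lag_too_great?([250_000_000, 20, 300_000_000])   # true: only one replica is current
```

This mirrors the behaviour the inline comment describes: rescheduling kicks in only once fewer than two replicas still have recent enough data.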