author | Stan Hu <stanhu@gmail.com> | 2018-08-06 16:16:09 +0000
---|---|---
committer | Stan Hu <stanhu@gmail.com> | 2018-08-06 16:16:09 +0000
commit | 964d9f431f64754f171c5c523309417447c2ee71 (patch) |
tree | d9d06c1ba2397ce9717a8ce0b01deb96d2bcce6b /app |
parent | b4415c01740430cef58baf9bb0cbda2fb1055edb (diff) |
parent | 1e5192cc8c2ebd3e0d740f3a044b7f5e4c086730 (diff) |
download | gitlab-ce-964d9f431f64754f171c5c523309417447c2ee71.tar.gz |
Merge branch 'background-migrations-system-load' into 'master'
Respond to DB health in background migrations
See merge request gitlab-org/gitlab-ce!20720
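
The change gates each background migration job on database health: before running, the worker checks replication-slot lag and, if replicas are too far behind, reschedules itself instead of running. A minimal sketch of that flow follows, using only the class and method names introduced in the diff below; the `run_or_reschedule` wrapper and its `class_name`/`arguments` parameters are illustrative stand-ins, not code from this merge request.

```ruby
# Illustrative sketch, assuming a GitLab Rails environment where the classes
# touched by this merge request are loaded. `run_or_reschedule`, `class_name`
# and `arguments` are hypothetical stand-ins for the worker's job arguments.
def run_or_reschedule(class_name, arguments)
  db_healthy =
    !BackgroundMigrationWorker.enable_health_check? ||
    !Gitlab::Database.postgresql? ||
    !Postgresql::ReplicationSlot.lag_too_great?

  if db_healthy
    # ... run the next chunk of the background migration as before ...
  else
    # Replicas are lagging too far behind: requeue the job so the database
    # gets at least `minimum_interval` seconds to catch up.
    BackgroundMigrationWorker.perform_in(
      BackgroundMigrationWorker.minimum_interval, class_name, arguments
    )
  end
end
```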
Diffstat (limited to 'app')

-rw-r--r-- | app/models/postgresql/replication_slot.rb | 32
-rw-r--r-- | app/workers/background_migration_worker.rb | 61

2 files changed, 86 insertions, 7 deletions
diff --git a/app/models/postgresql/replication_slot.rb b/app/models/postgresql/replication_slot.rb
new file mode 100644
index 00000000000..70c7432e6b5
--- /dev/null
+++ b/app/models/postgresql/replication_slot.rb
@@ -0,0 +1,32 @@
+# frozen_string_literal: true
+
+module Postgresql
+  class ReplicationSlot < ActiveRecord::Base
+    self.table_name = 'pg_replication_slots'
+
+    # Returns true if the lag observed across all replication slots exceeds a
+    # given threshold.
+    #
+    # max - The maximum replication lag size, in bytes. Based on GitLab.com
+    #       statistics it takes between 1 and 5 seconds to replicate around
+    #       100 MB of data.
+    def self.lag_too_great?(max = 100.megabytes)
+      lag_function = "#{Gitlab::Database.pg_wal_lsn_diff}" \
+        "(#{Gitlab::Database.pg_current_wal_insert_lsn}(), restart_lsn)::bigint"
+
+      # We force the use of a transaction here so the query always goes to the
+      # primary, even when using the EE DB load balancer.
+      sizes = transaction { pluck(lag_function) }
+      too_great = sizes.count { |size| size >= max }
+
+      # If too many replicas are falling behind too much, the availability of a
+      # GitLab instance might suffer. To prevent this from happening we require
+      # at least 1 replica to have data recent enough.
+      if sizes.any? && too_great.positive?
+        (sizes.length - too_great) <= 1
+      else
+        false
+      end
+    end
+  end
+end
diff --git a/app/workers/background_migration_worker.rb b/app/workers/background_migration_worker.rb
index eaec7d48f35..7d006cc348e
--- a/app/workers/background_migration_worker.rb
+++ b/app/workers/background_migration_worker.rb
@@ -6,10 +6,22 @@ class BackgroundMigrationWorker
   # The minimum amount of time between processing two jobs of the same migration
   # class.
   #
-  # This interval is set to 5 minutes so autovacuuming and other maintenance
-  # related tasks have plenty of time to clean up after a migration has been
-  # performed.
-  MIN_INTERVAL = 5.minutes.to_i
+  # This interval is set to 2 or 5 minutes so autovacuuming and other
+  # maintenance related tasks have plenty of time to clean up after a migration
+  # has been performed.
+  def self.minimum_interval
+    if enable_health_check?
+      2.minutes.to_i
+    else
+      5.minutes.to_i
+    end
+  end
+
+  def self.enable_health_check?
+    Rails.env.development? ||
+      Rails.env.test? ||
+      Feature.enabled?('background_migration_health_check')
+  end
 
   # Performs the background migration.
   #
@@ -27,7 +39,8 @@ class BackgroundMigrationWorker
       # running a migration of this class or we ran one recently. In this case
       # we'll reschedule the job in such a way that it is picked up again around
       # the time the lease expires.
-      self.class.perform_in(ttl || MIN_INTERVAL, class_name, arguments)
+      self.class
+        .perform_in(ttl || self.class.minimum_interval, class_name, arguments)
     end
   end
 
@@ -39,17 +52,51 @@ class BackgroundMigrationWorker
       [true, nil]
     else
       lease = lease_for(class_name)
+      perform = !!lease.try_obtain
+
+      # If we managed to acquire the lease but the DB is not healthy, then we
+      # want to simply reschedule our job and try again _after_ the lease
+      # expires.
+      if perform && !healthy_database?
+        database_unhealthy_counter.increment
 
-      [lease.try_obtain, lease.ttl]
+        perform = false
+      end
+
+      [perform, lease.ttl]
     end
   end
 
   def lease_for(class_name)
     Gitlab::ExclusiveLease
-      .new("#{self.class.name}:#{class_name}", timeout: MIN_INTERVAL)
+      .new(lease_key_for(class_name), timeout: self.class.minimum_interval)
+  end
+
+  def lease_key_for(class_name)
+    "#{self.class.name}:#{class_name}"
   end
 
   def always_perform?
     Rails.env.test?
   end
+
+  # Returns true if the database is healthy enough to allow the migration to be
+  # performed.
+  #
+  # class_name - The name of the background migration that we might want to
+  #              run.
+  def healthy_database?
+    return true unless self.class.enable_health_check?
+
+    return true unless Gitlab::Database.postgresql?
+
+    !Postgresql::ReplicationSlot.lag_too_great?
+  end
+
+  def database_unhealthy_counter
+    Gitlab::Metrics.counter(
+      :background_migration_database_health_reschedules,
+      'The number of times a background migration is rescheduled because the database is unhealthy.'
+    )
+  end
 end
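
The heart of `Postgresql::ReplicationSlot.lag_too_great?` is the replica-counting decision shown above. Below is a standalone sketch of just that decision, with per-slot lag sizes passed in as plain integers instead of being plucked from `pg_replication_slots` via `pg_wal_lsn_diff`, so it can be run outside Rails (`100 * 1024 * 1024` stands in for ActiveSupport's `100.megabytes`).

```ruby
# Standalone illustration of the decision inside lag_too_great? above.
# `sizes` holds one lag value in bytes per replication slot.
def lag_too_great?(sizes, max = 100 * 1024 * 1024)
  too_great = sizes.count { |size| size >= max }

  # The check only trips when at most one replica is still within the limit.
  if !sizes.empty? && too_great > 0
    (sizes.length - too_great) <= 1
  else
    false
  end
end

puts lag_too_great?([10, 20, 300_000_000])            # false: two replicas are current
puts lag_too_great?([250_000_000, 20, 300_000_000])   # true: only one replica is current
```

This mirrors the behaviour the inline comment describes: rescheduling kicks in only once fewer than two replicas still have recent enough data.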