-rw-r--r--  app/models/postgresql/replication_slot.rb        | 32
-rw-r--r--  app/workers/background_migration_worker.rb       | 61
-rw-r--r--  doc/development/background_migrations.md         |  3
-rw-r--r--  lib/gitlab/background_migration.rb               |  6
-rw-r--r--  lib/gitlab/database/migration_helpers.rb         |  4
-rw-r--r--  spec/models/postgresql/replication_slot_spec.rb  | 31
-rw-r--r--  spec/workers/background_migration_worker_spec.rb | 52
7 files changed, 179 insertions(+), 10 deletions(-)
diff --git a/app/models/postgresql/replication_slot.rb b/app/models/postgresql/replication_slot.rb
new file mode 100644
index 00000000000..70c7432e6b5
--- /dev/null
+++ b/app/models/postgresql/replication_slot.rb
@@ -0,0 +1,32 @@
+# frozen_string_literal: true
+
+module Postgresql
+  class ReplicationSlot < ActiveRecord::Base
+    self.table_name = 'pg_replication_slots'
+
+    # Returns true if at least one replication slot lags beyond the given
+    # threshold while at most one slot remains within it.
+    #
+    # max - The maximum replication lag size, in bytes. Based on GitLab.com
+    #       statistics it takes between 1 and 5 seconds to replicate around
+    #       100 MB of data.
+    def self.lag_too_great?(max = 100.megabytes)
+      lag_function = "#{Gitlab::Database.pg_wal_lsn_diff}" \
+        "(#{Gitlab::Database.pg_current_wal_insert_lsn}(), restart_lsn)::bigint"
+
+      # We force the use of a transaction here so the query always goes to the
+      # primary, even when using the EE DB load balancer.
+      sizes = transaction { pluck(lag_function) }
+      too_great = sizes.count { |size| size >= max }
+
+      # If too many replicas are falling behind too much, the availability of a
+      # GitLab instance might suffer. To prevent this from happening we require
+      # more than one replica to have data recent enough.
+      if sizes.any? && too_great.positive?
+        (sizes.length - too_great) <= 1
+      else
+        false
+      end
+    end
+  end
+end
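
To make the decision rule above concrete, here is a small, self-contained
sketch in plain Ruby rather than the GitLab code path: the SQL lag query is
replaced by an array of byte counts, mirroring the values used in the spec
further down. The helper name and the literal byte values are illustrative
only.

    # Same rule as lag_too_great? above: true only when at least one slot lags
    # beyond the threshold and at most one slot remains within it.
    def lag_rule(sizes, max = 100 * 1024 * 1024)
      too_great = sizes.count { |size| size >= max }

      return false unless sizes.any? && too_great.positive?

      (sizes.length - too_great) <= 1
    end

    mb = 1024 * 1024
    lag_rule([125 * mb])        # => true:  the only replica lags too far behind
    lag_rule([125 * mb, 0, 0])  # => false: two replicas are still up to date
    lag_rule([0])               # => false: no slot exceeds the threshold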
diff --git a/app/workers/background_migration_worker.rb b/app/workers/background_migration_worker.rb
index eaec7d48f35..7d006cc348e 100644
--- a/app/workers/background_migration_worker.rb
+++ b/app/workers/background_migration_worker.rb
@@ -6,10 +6,22 @@ class BackgroundMigrationWorker
  # The minimum amount of time between processing two jobs of the same migration
  # class.
  #
-  # This interval is set to 5 minutes so autovacuuming and other maintenance
-  # related tasks have plenty of time to clean up after a migration has been
-  # performed.
-  MIN_INTERVAL = 5.minutes.to_i
+  # This interval is set to 2 or 5 minutes so autovacuuming and other
+  # maintenance related tasks have plenty of time to clean up after a migration
+  # has been performed.
+  def self.minimum_interval
+    if enable_health_check?
+      2.minutes.to_i
+    else
+      5.minutes.to_i
+    end
+  end
+
+  def self.enable_health_check?
+    Rails.env.development? ||
+      Rails.env.test? ||
+      Feature.enabled?('background_migration_health_check')
+  end

  # Performs the background migration.
  #
@@ -27,7 +39,8 @@ class BackgroundMigrationWorker
      # running a migration of this class or we ran one recently. In this case
      # we'll reschedule the job in such a way that it is picked up again around
      # the time the lease expires.
-      self.class.perform_in(ttl || MIN_INTERVAL, class_name, arguments)
+      self.class
+        .perform_in(ttl || self.class.minimum_interval, class_name, arguments)
    end
  end

@@ -39,17 +52,51 @@ class BackgroundMigrationWorker
      [true, nil]
    else
      lease = lease_for(class_name)
+      perform = !!lease.try_obtain
+
+      # If we managed to acquire the lease but the DB is not healthy, then we
+      # want to simply reschedule our job and try again _after_ the lease
+      # expires.
+      if perform && !healthy_database?
+        database_unhealthy_counter.increment

-      [lease.try_obtain, lease.ttl]
+        perform = false
+      end
+
+      [perform, lease.ttl]
    end
  end

  def lease_for(class_name)
    Gitlab::ExclusiveLease
-      .new("#{self.class.name}:#{class_name}", timeout: MIN_INTERVAL)
+      .new(lease_key_for(class_name), timeout: self.class.minimum_interval)
+  end
+
+  def lease_key_for(class_name)
+    "#{self.class.name}:#{class_name}"
  end

  def always_perform?
    Rails.env.test?
  end
+
+  # Returns true if the database is healthy enough to allow the migration to be
+  # performed.
+  #
+  # The check only applies to PostgreSQL and, outside development and test,
+  # is controlled by the background_migration_health_check feature flag.
+  def healthy_database?
+    return true unless self.class.enable_health_check?
+
+    return true unless Gitlab::Database.postgresql?
+
+    !Postgresql::ReplicationSlot.lag_too_great?
+  end
+
+  def database_unhealthy_counter
+    Gitlab::Metrics.counter(
+      :background_migration_database_health_reschedules,
+      'The number of times a background migration is rescheduled because the database is unhealthy.'
+    )
+  end
end
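
Taken together, the worker now only runs a migration when it both obtains the
exclusive lease and sees a healthy database; otherwise it re-enqueues itself
via perform_in. A hypothetical Rails console session on a production-style
instance (where enable_health_check? depends on the feature flag) might look
roughly like this; Feature.enable is assumed to behave as on a standard GitLab
install:

    Feature.enabled?('background_migration_health_check') # => false
    BackgroundMigrationWorker.minimum_interval            # => 300 (5 minutes)

    Feature.enable('background_migration_health_check')
    BackgroundMigrationWorker.minimum_interval            # => 120 (2 minutes)

    # With the health check active, a lagging set of replicas makes
    # healthy_database? return false, so perform('Foo', [10, 20]) reschedules
    # the job instead of calling Gitlab::BackgroundMigration.perform.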
diff --git a/doc/development/background_migrations.md b/doc/development/background_migrations.md
index 16195cbbbdf..f0d5af9fcb5 100644
--- a/doc/development/background_migrations.md
+++ b/doc/development/background_migrations.md
@@ -5,6 +5,9 @@ otherwise take a very long time (hours, days, years, etc) to complete. For
example, you can use background migrations to migrate data so that instead of
storing data in a single JSON column the data is stored in a separate table.

+If the database cluster is considered to be in an unhealthy state, background
+migrations automatically reschedule themselves for a later point in time.
+
## When To Use Background Migrations

>**Note:**
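
The scheduling API for background migrations is unchanged by this; based on
the worker's perform(class_name, arguments) signature, queueing a migration
still looks roughly like the sketch below. 'ExampleMigration' and its
arguments are purely illustrative.

    # Enqueue one job immediately and one ten minutes from now. When the
    # database is unhealthy at execution time, each job reschedules itself.
    BackgroundMigrationWorker.perform_async('ExampleMigration', [1, 100])
    BackgroundMigrationWorker.perform_in(10.minutes, 'ExampleMigration', [101, 200])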
diff --git a/lib/gitlab/background_migration.rb b/lib/gitlab/background_migration.rb
index d3f66877672..36c85dec544 100644
--- a/lib/gitlab/background_migration.rb
+++ b/lib/gitlab/background_migration.rb
@@ -46,7 +46,11 @@ module Gitlab
    # arguments - The arguments to pass to the background migration's "perform"
    #             method.
    def self.perform(class_name, arguments)
-      const_get(class_name).new.perform(*arguments)
+      migration_class_for(class_name).new.perform(*arguments)
+    end
+
+    def self.migration_class_for(class_name)
+      const_get(class_name)
    end
  end
end
diff --git a/lib/gitlab/database/migration_helpers.rb b/lib/gitlab/database/migration_helpers.rb
index 4fe5b4cc835..f39b3b6eb5b 100644
--- a/lib/gitlab/database/migration_helpers.rb
+++ b/lib/gitlab/database/migration_helpers.rb
@@ -979,8 +979,8 @@ into similar problems in the future (e.g. when new tables are created).
        # To not overload the worker too much we enforce a minimum interval both
        # when scheduling and performing jobs.
-        if delay_interval < BackgroundMigrationWorker::MIN_INTERVAL
-          delay_interval = BackgroundMigrationWorker::MIN_INTERVAL
+        if delay_interval < BackgroundMigrationWorker.minimum_interval
+          delay_interval = BackgroundMigrationWorker.minimum_interval
        end

        model_class.each_batch(of: batch_size) do |relation, index|
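
In effect the helper clamps the requested spacing between jobs to the worker's
minimum; a minimal sketch of that behaviour, using an assumed 30-second request:

    requested = 30.seconds.to_i
    delay_interval = [requested, BackgroundMigrationWorker.minimum_interval].max
    # => 120 when the health check is enabled (2 minutes), 300 otherwise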
diff --git a/spec/models/postgresql/replication_slot_spec.rb b/spec/models/postgresql/replication_slot_spec.rb
new file mode 100644
index 00000000000..919a7526803
--- /dev/null
+++ b/spec/models/postgresql/replication_slot_spec.rb
@@ -0,0 +1,31 @@
+# frozen_string_literal: true
+
+require 'spec_helper'
+
+describe Postgresql::ReplicationSlot, :postgresql do
+  describe '.lag_too_great?' do
+    it 'returns true when replication lag is too great' do
+      expect(described_class)
+        .to receive(:pluck)
+        .and_return([125.megabytes])
+
+      expect(described_class.lag_too_great?).to eq(true)
+    end
+
+    it 'returns false when more than one replica is up to date enough' do
+      expect(described_class)
+        .to receive(:pluck)
+        .and_return([125.megabytes, 0.megabytes, 0.megabytes])
+
+      expect(described_class.lag_too_great?).to eq(false)
+    end
+
+    it 'returns false when replication lag is not too great' do
+      expect(described_class)
+        .to receive(:pluck)
+        .and_return([0.megabytes])
+
+      expect(described_class.lag_too_great?).to eq(false)
+    end
+  end
+end
diff --git a/spec/workers/background_migration_worker_spec.rb b/spec/workers/background_migration_worker_spec.rb
index d67e7698635..3bd072e7125 100644
--- a/spec/workers/background_migration_worker_spec.rb
+++ b/spec/workers/background_migration_worker_spec.rb
@@ -3,6 +3,12 @@ require 'spec_helper'
describe BackgroundMigrationWorker, :sidekiq, :clean_gitlab_redis_shared_state do
  let(:worker) { described_class.new }

+  describe '.minimum_interval' do
+    it 'returns 2 minutes' do
+      expect(described_class.minimum_interval).to eq(2.minutes.to_i)
+    end
+  end
+
  describe '.perform' do
    it 'performs a background migration' do
      expect(Gitlab::BackgroundMigration)
@@ -28,5 +34,51 @@ describe BackgroundMigrationWorker, :sidekiq, :clean_gitlab_redis_shared_state do

      worker.perform('Foo', [10, 20])
    end
+
+    it 'reschedules a migration if the database is not healthy' do
+      allow(worker)
+        .to receive(:always_perform?)
+        .and_return(false)
+
+      allow(worker)
+        .to receive(:healthy_database?)
+        .and_return(false)
+
+      expect(described_class)
+        .to receive(:perform_in)
+        .with(a_kind_of(Numeric), 'Foo', [10, 20])
+
+      worker.perform('Foo', [10, 20])
+    end
+  end
+
+  describe '#healthy_database?' do
+    context 'using MySQL', :mysql do
+      it 'returns true' do
+        expect(worker.healthy_database?).to eq(true)
+      end
+    end
+
+    context 'using PostgreSQL', :postgresql do
+      context 'when replication lag is too great' do
+        it 'returns false' do
+          allow(Postgresql::ReplicationSlot)
+            .to receive(:lag_too_great?)
+            .and_return(true)
+
+          expect(worker.healthy_database?).to eq(false)
+        end
+      end
+
+      context 'when replication lag is small enough' do
+        it 'returns true' do
+          allow(Postgresql::ReplicationSlot)
+            .to receive(:lag_too_great?)
+            .and_return(false)
+
+          expect(worker.healthy_database?).to eq(true)
+        end
+      end
+    end
+  end
  end
end