author     Yorick Peterse <yorickpeterse@gmail.com>  2018-07-18 16:34:04 +0200
committer  Yorick Peterse <yorickpeterse@gmail.com>  2018-07-18 16:34:04 +0200
commit     42e1080d54cf32919a5592bbdeef1857afc90783 (patch)
tree       bb312c77c341d9959b67ddf65325c60fcefc3924
parent     4736e0519b0d8c66ab35fd636f5f81089822ba7a (diff)
download   gitlab-ce-parallel-db-migrations.tar.gz

WIP parallel DB migrations (parallel-db-migrations)

-rw-r--r--  app/models/concerns/each_batch.rb    4
-rw-r--r--  parallel_migrate.rb                144
2 files changed, 148 insertions(+), 0 deletions(-)
diff --git a/app/models/concerns/each_batch.rb b/app/models/concerns/each_batch.rb
index 6ddbb8da1a9..6baa9edc263 100644
--- a/app/models/concerns/each_batch.rb
+++ b/app/models/concerns/each_batch.rb
@@ -38,6 +38,10 @@ module EachBatch
# of - The number of rows to retrieve per batch.
# column - The column to use for ordering the batches.
def each_batch(of: 1000, column: primary_key)
+ unless block_given?
+ return to_enum(__method__, of: of, column: column).lazy
+ end
+
unless column
raise ArgumentError,
'the column: argument must be set to a column name to use for ordering rows'
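
The change above lets each_batch be called without a block, returning a lazy enumerator that the script below chains with each_slice. A minimal usage sketch, not part of the patch, assuming a model that includes EachBatch (the Event scope, batch size, and slice size are only illustrative); note that each enumerator entry is a [relation, index] pair:

Event.send(:include, EachBatch)

Event.each_batch(of: 500).each_slice(4) do |batches|
  batches.each do |(relation, index)|
    # relation is an ActiveRecord::Relation covering one batch of rows.
    puts "batch #{index}: #{relation.count} rows"
  end
end
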
diff --git a/parallel_migrate.rb b/parallel_migrate.rb
new file mode 100644
index 00000000000..8c4c115bb8e
--- /dev/null
+++ b/parallel_migrate.rb
@@ -0,0 +1,144 @@
+# frozen_string_literal: true
+# rubocop: disable all
+
+class PostgresSetting < ActiveRecord::Base
+ self.table_name = 'pg_settings'
+
+ def self.autovacuum_interval
+ select('setting::bigint AS interval')
+ .where(name: 'autovacuum_naptime')
+ .take
+ .interval
+ end
+end
+
+class PostgresTableStatistic < ActiveRecord::Base
+ self.table_name = 'pg_stat_all_tables'
+
+ # This threshold is based on what GitLab.com can usually replicate in a matter
+ # of seconds.
+ MAX_DEAD_TUPLES = 100_000
+
+ alias_attribute :dead_tuples, :n_dead_tup
+ alias_attribute :live_tuples, :n_live_tup
+
+ def self.for_table(table)
+ find_by(relname: table)
+ end
+
+ def too_many_dead_tuples?
+ # If there are no live tuples (e.g. the statistics are out of date), the
+ # below calculation would return true, even if there are only a few dead
+ # tuples.
+ return false if live_tuples.zero?
+
+ # For small tables we want to limit the number of dead tuples to 1% of the
+ # total number of tuples.
+ #
+ # For large tables this can result in a very large number (millions for
+ # example), so we enforce an upper limit on the number of dead tuples.
+ return true if dead_tuples >= MAX_DEAD_TUPLES
+
+ (dead_tuples.to_f / live_tuples.to_f) >= 0.01
+ end
+end
+
+class Table
+ def initialize(name)
+ @name = name
+ end
+
+ def too_many_dead_tuples?
+ # Table statistics are only refreshed when running ANALYZE. To ensure this
+ # method doesn't return the wrong value, we first run ANALYZE before
+ # checking the statistics.
+ analyze
+
+ # We force the use of a transaction here so the query always goes to the
+ # primary, even when using the EE DB load balancer.
+ stats = PostgresTableStatistic.transaction do
+ PostgresTableStatistic.for_table(@name)
+ end
+
+ stats.too_many_dead_tuples?
+ end
+
+ def analyze
+ ActiveRecord::Base.connection.execute("ANALYZE #{quoted_table_name}")
+ end
+
+ def vacuum_analyze
+ ActiveRecord::Base.connection.execute("VACUUM ANALYZE #{quoted_table_name}")
+ end
+
+ def quoted_table_name
+ ActiveRecord::Base.connection.quote_table_name(@name)
+ end
+end
+
+class ReplicationSlot < ActiveRecord::Base
+ self.table_name = 'pg_replication_slots'
+
+ LAG_FUNCTION =
+ if Gitlab::Database.postgresql? && Gitlab::Database.version.to_f < 10.0
+ 'pg_xlog_location_diff(pg_current_xlog_insert_location(), restart_lsn)::bigint'
+ else
+ 'pg_wal_lsn_diff(pg_current_wal_insert_lsn(), restart_lsn)::bigint'
+ end
+
+ MAX_LAG = 100.megabytes
+
+  def self.lag_too_great?
+    # We force the use of a transaction here so the query always goes to the
+    # primary, even when using the EE DB load balancer.
+    sizes = transaction { pluck(LAG_FUNCTION) }
+
+    # Without any replication slots there is no lag to wait for.
+    return false if sizes.empty?
+
+    # If too many replicas are falling behind too much, the availability of a
+    # GitLab instance might suffer. To prevent this from happening we require at
+    # least 1 replica to have data recent enough, and only report the lag as
+    # too great when not even one replica is up to date.
+    up_to_date = sizes.count { |size| size < MAX_LAG }
+
+    up_to_date < 1
+  end
+end
+
+Event.send(:include, EachBatch)
+
+def parallel_migrate(query, batch_size: 1_000, concurrency: 8)
+ old_config = ActiveRecord::Base.configurations[Rails.env]
+ new_config = old_config.merge('pool' => concurrency)
+
+ ActiveRecord::Base.establish_connection(new_config)
+
+ begin
+    query.each_batch(of: batch_size).each_slice(concurrency) do |batches|
+      # Each enumerator entry is a `[relation, index]` pair, so only the
+      # relation is passed on to the caller's block. with_connection makes
+      # every worker thread return its connection to the pool when it's done,
+      # instead of holding on to it until the thread is reaped.
+      threads = batches.map do |(batch, _index)|
+        Thread.new do
+          ActiveRecord::Base.connection_pool.with_connection do
+            yield batch
+          end
+        end
+      end
+
+ threads.each(&:join)
+
+ table = Table.new(query.table_name)
+ start = Gitlab::Metrics::System.monotonic_time
+
+ while ReplicationSlot.lag_too_great? || table.too_many_dead_tuples?
+ sleep(5)
+
+ # If vacuuming hasn't kicked in yet we'll manually vacuum the table,
+ # ensuring we don't wait in this loop for too long.
+ #
+      # We wait for `autovacuum_naptime * 2` so we don't start a manual vacuum
+      # just as autovacuum itself is waking up.
+ if (Gitlab::Metrics::System.monotonic_time - start) >= (PostgresSetting.autovacuum_interval * 2)
+ puts 'MANUALLY VACUUMING'
+ table.vacuum_analyze
+ end
+ end
+ end
+  ensure
+    ActiveRecord::Base.establish_connection(old_config)
+  end
+end
+
+parallel_migrate(Event, concurrency: 16) do |batch|
+ batch.update_all('updated_at = now()')
+end
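
The invocation above touches every row in events. A hedged sketch of how the same helper could be pointed at a narrower relation instead (the MergeRequest scope, column, and numbers here are hypothetical, picked only to show the calling convention; the script itself would typically be loaded in a console or via rails runner):

MergeRequest.send(:include, EachBatch)

parallel_migrate(MergeRequest.where(state: 'merged'), batch_size: 500, concurrency: 8) do |batch|
  # batch is an ActiveRecord::Relation limited to one each_batch slice.
  batch.update_all('updated_at = now()')
end
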