From ff78af152cc3054a7bb76af718943dca7a69a3c5 Mon Sep 17 00:00:00 2001 From: Yorick Peterse Date: Fri, 7 Jul 2017 14:49:05 +0200 Subject: Added EachBatch for iterating tables in batches This module provides a class method called `each_batch` that can be used to iterate tables in batches in a more efficient way compared to Rails' `in_batches` method. This commit also includes a RuboCop cop to blacklist the use of `in_batches` in favour of this new method. --- app/models/concerns/each_batch.rb | 81 ++++++++++++++++++++++++++ doc/development/README.md | 1 + doc/development/iterating_tables_in_batches.md | 37 ++++++++++++ lib/gitlab/otp_key_rotator.rb | 2 +- rubocop/cop/in_batches.rb | 16 +++++ rubocop/rubocop.rb | 1 + spec/models/concerns/each_batch_spec.rb | 53 +++++++++++++++++ spec/rubocop/cop/in_batches_spec.rb | 19 ++++++ 8 files changed, 209 insertions(+), 1 deletion(-) create mode 100644 app/models/concerns/each_batch.rb create mode 100644 doc/development/iterating_tables_in_batches.md create mode 100644 rubocop/cop/in_batches.rb create mode 100644 spec/models/concerns/each_batch_spec.rb create mode 100644 spec/rubocop/cop/in_batches_spec.rb diff --git a/app/models/concerns/each_batch.rb b/app/models/concerns/each_batch.rb new file mode 100644 index 00000000000..6ddbb8da1a9 --- /dev/null +++ b/app/models/concerns/each_batch.rb @@ -0,0 +1,81 @@ +module EachBatch + extend ActiveSupport::Concern + + module ClassMethods + # Iterates over the rows in a relation in batches, similar to Rails' + # `in_batches` but in a more efficient way. + # + # Unlike `in_batches` provided by Rails this method does not support a + # custom start/end range, nor does it provide support for the `load:` + # keyword argument. + # + # This method will yield an ActiveRecord::Relation to the supplied block, or + # return an Enumerator if no block is given. + # + # Example: + # + # User.each_batch do |relation| + # relation.update_all(updated_at: Time.now) + # end + # + # The supplied block is also passed an optional batch index: + # + # User.each_batch do |relation, index| + # puts index # => 1, 2, 3, ... + # end + # + # You can also specify an alternative column to use for ordering the rows: + # + # User.each_batch(column: :created_at) do |relation| + # ... + # end + # + # This will produce SQL queries along the lines of: + # + # User Load (0.7ms) SELECT "users"."id" FROM "users" WHERE ("users"."id" >= 41654) ORDER BY "users"."id" ASC LIMIT 1 OFFSET 1000 + # (0.7ms) SELECT COUNT(*) FROM "users" WHERE ("users"."id" >= 41654) AND ("users"."id" < 42687) + # + # of - The number of rows to retrieve per batch. + # column - The column to use for ordering the batches. + def each_batch(of: 1000, column: primary_key) + unless column + raise ArgumentError, + 'the column: argument must be set to a column name to use for ordering rows' + end + + start = except(:select) + .select(column) + .reorder(column => :asc) + .take + + return unless start + + start_id = start[column] + arel_table = self.arel_table + + 1.step do |index| + stop = except(:select) + .select(column) + .where(arel_table[column].gteq(start_id)) + .reorder(column => :asc) + .offset(of) + .limit(1) + .take + + relation = where(arel_table[column].gteq(start_id)) + + if stop + stop_id = stop[column] + start_id = stop_id + relation = relation.where(arel_table[column].lt(stop_id)) + end + + # Any ORDER BYs are useless for this relation and can lead to less + # efficient UPDATE queries, hence we get rid of it. + yield relation.except(:order), index + + break unless stop + end + end + end +end diff --git a/doc/development/README.md b/doc/development/README.md index a2a07c37ced..58993c52dcd 100644 --- a/doc/development/README.md +++ b/doc/development/README.md @@ -55,6 +55,7 @@ - [Single Table Inheritance](single_table_inheritance.md) - [Background Migrations](background_migrations.md) - [Storing SHA1 Hashes As Binary](sha1_as_binary.md) +- [Iterating Tables In Batches](iterating_tables_in_batches.md) ## i18n diff --git a/doc/development/iterating_tables_in_batches.md b/doc/development/iterating_tables_in_batches.md new file mode 100644 index 00000000000..590c8cbba2d --- /dev/null +++ b/doc/development/iterating_tables_in_batches.md @@ -0,0 +1,37 @@ +# Iterating Tables In Batches + +Rails provides a method called `in_batches` that can be used to iterate over +rows in batches. For example: + +```ruby +User.in_batches(of: 10) do |relation| + relation.update_all(updated_at: Time.now) +end +``` + +Unfortunately this method is implemented in a way that is not very efficient, +both query and memory usage wise. + +To work around this you can include the `EachBatch` module into your models, +then use the `each_batch` class method. For example: + +```ruby +class User < ActiveRecord::Base + include EachBatch +end + +User.each_batch(of: 10) do |relation| + relation.update_all(updated_at: Time.now) +end +``` + +This will end up producing queries such as: + +``` +User Load (0.7ms) SELECT "users"."id" FROM "users" WHERE ("users"."id" >= 41654) ORDER BY "users"."id" ASC LIMIT 1 OFFSET 1000 + (0.7ms) SELECT COUNT(*) FROM "users" WHERE ("users"."id" >= 41654) AND ("users"."id" < 42687) +``` + +The API of this method is similar to `in_batches`, though it doesn't support +all of the arguments that `in_batches` supports. You should always use +`each_batch` _unless_ you have a specific need for `in_batches`. diff --git a/lib/gitlab/otp_key_rotator.rb b/lib/gitlab/otp_key_rotator.rb index 0d541935bc6..22332474945 100644 --- a/lib/gitlab/otp_key_rotator.rb +++ b/lib/gitlab/otp_key_rotator.rb @@ -34,7 +34,7 @@ module Gitlab write_csv do |csv| ActiveRecord::Base.transaction do - User.with_two_factor.in_batches do |relation| + User.with_two_factor.in_batches do |relation| # rubocop: disable Cop/InBatches rows = relation.pluck(:id, :encrypted_otp_secret, :encrypted_otp_secret_iv, :encrypted_otp_secret_salt) rows.each do |row| user = %i[id ciphertext iv salt].zip(row).to_h diff --git a/rubocop/cop/in_batches.rb b/rubocop/cop/in_batches.rb new file mode 100644 index 00000000000..c0240187e66 --- /dev/null +++ b/rubocop/cop/in_batches.rb @@ -0,0 +1,16 @@ +require_relative '../model_helpers' + +module RuboCop + module Cop + # Cop that prevents the use of `in_batches` + class InBatches < RuboCop::Cop::Cop + MSG = 'Do not use `in_batches`, use `each_batch` from the EachBatch module instead'.freeze + + def on_send(node) + return unless node.children[1] == :in_batches + + add_offense(node, :selector) + end + end + end +end diff --git a/rubocop/rubocop.rb b/rubocop/rubocop.rb index 1e70314598a..f76144275c9 100644 --- a/rubocop/rubocop.rb +++ b/rubocop/rubocop.rb @@ -5,6 +5,7 @@ require_relative 'cop/redirect_with_status' require_relative 'cop/polymorphic_associations' require_relative 'cop/project_path_helper' require_relative 'cop/active_record_dependent' +require_relative 'cop/in_batches' require_relative 'cop/migration/add_column' require_relative 'cop/migration/add_column_with_default_to_large_table' require_relative 'cop/migration/add_concurrent_foreign_key' diff --git a/spec/models/concerns/each_batch_spec.rb b/spec/models/concerns/each_batch_spec.rb new file mode 100644 index 00000000000..951690a217b --- /dev/null +++ b/spec/models/concerns/each_batch_spec.rb @@ -0,0 +1,53 @@ +require 'spec_helper' + +describe EachBatch do + describe '.each_batch' do + let(:model) do + Class.new(ActiveRecord::Base) do + include EachBatch + + self.table_name = 'users' + end + end + + before do + 5.times { create(:user, updated_at: 1.day.ago) } + end + + it 'yields an ActiveRecord::Relation when a block is given' do + model.each_batch do |relation| + expect(relation).to be_a_kind_of(ActiveRecord::Relation) + end + end + + it 'yields a batch index as the second argument' do + model.each_batch do |_, index| + expect(index).to eq(1) + end + end + + it 'accepts a custom batch size' do + amount = 0 + + model.each_batch(of: 1) { amount += 1 } + + expect(amount).to eq(5) + end + + it 'does not include ORDER BYs in the yielded relations' do + model.each_batch do |relation| + expect(relation.to_sql).not_to include('ORDER BY') + end + end + + it 'allows updating of the yielded relations' do + time = Time.now + + model.each_batch do |relation| + relation.update_all(updated_at: time) + end + + expect(model.where(updated_at: time).count).to eq(5) + end + end +end diff --git a/spec/rubocop/cop/in_batches_spec.rb b/spec/rubocop/cop/in_batches_spec.rb new file mode 100644 index 00000000000..072481984c6 --- /dev/null +++ b/spec/rubocop/cop/in_batches_spec.rb @@ -0,0 +1,19 @@ +require 'spec_helper' +require 'rubocop' +require 'rubocop/rspec/support' +require_relative '../../../rubocop/cop/in_batches' + +describe RuboCop::Cop::InBatches do + include CopHelper + + subject(:cop) { described_class.new } + + it 'registers an offense when in_batches is used' do + inspect_source(cop, 'foo.in_batches do; end') + + aggregate_failures do + expect(cop.offenses.size).to eq(1) + expect(cop.offenses.map(&:line)).to eq([1]) + end + end +end -- cgit v1.2.1