summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYorick Peterse <yorickpeterse@gmail.com>2017-07-07 14:49:05 +0200
committerYorick Peterse <yorickpeterse@gmail.com>2017-07-07 16:39:36 +0200
commitff78af152cc3054a7bb76af718943dca7a69a3c5 (patch)
tree098a917c696179405026a0f59c9cc43c53a4459b
parentd40445e4c9964ae0ab793bfdd7ba530de4259716 (diff)
downloadgitlab-ce-ff78af152cc3054a7bb76af718943dca7a69a3c5.tar.gz
Added EachBatch for iterating tables in batchesactive-record-each-batch
This module provides a class method called `each_batch` that can be used to iterate tables in batches in a more efficient way compared to Rails' `in_batches` method. This commit also includes a RuboCop cop to blacklist the use of `in_batches` in favour of this new method.
-rw-r--r--app/models/concerns/each_batch.rb81
-rw-r--r--doc/development/README.md1
-rw-r--r--doc/development/iterating_tables_in_batches.md37
-rw-r--r--lib/gitlab/otp_key_rotator.rb2
-rw-r--r--rubocop/cop/in_batches.rb16
-rw-r--r--rubocop/rubocop.rb1
-rw-r--r--spec/models/concerns/each_batch_spec.rb53
-rw-r--r--spec/rubocop/cop/in_batches_spec.rb19
8 files changed, 209 insertions, 1 deletions
diff --git a/app/models/concerns/each_batch.rb b/app/models/concerns/each_batch.rb
new file mode 100644
index 00000000000..6ddbb8da1a9
--- /dev/null
+++ b/app/models/concerns/each_batch.rb
@@ -0,0 +1,81 @@
+module EachBatch
+ extend ActiveSupport::Concern
+
+ module ClassMethods
+ # Iterates over the rows in a relation in batches, similar to Rails'
+ # `in_batches` but in a more efficient way.
+ #
+ # Unlike `in_batches` provided by Rails this method does not support a
+ # custom start/end range, nor does it provide support for the `load:`
+ # keyword argument.
+ #
+ # This method will yield an ActiveRecord::Relation to the supplied block, or
+ # return an Enumerator if no block is given.
+ #
+ # Example:
+ #
+ # User.each_batch do |relation|
+ # relation.update_all(updated_at: Time.now)
+ # end
+ #
+ # The supplied block is also passed an optional batch index:
+ #
+ # User.each_batch do |relation, index|
+ # puts index # => 1, 2, 3, ...
+ # end
+ #
+ # You can also specify an alternative column to use for ordering the rows:
+ #
+ # User.each_batch(column: :created_at) do |relation|
+ # ...
+ # end
+ #
+ # This will produce SQL queries along the lines of:
+ #
+ # User Load (0.7ms) SELECT "users"."id" FROM "users" WHERE ("users"."id" >= 41654) ORDER BY "users"."id" ASC LIMIT 1 OFFSET 1000
+ # (0.7ms) SELECT COUNT(*) FROM "users" WHERE ("users"."id" >= 41654) AND ("users"."id" < 42687)
+ #
+ # of - The number of rows to retrieve per batch.
+ # column - The column to use for ordering the batches.
+ def each_batch(of: 1000, column: primary_key)
+ unless column
+ raise ArgumentError,
+ 'the column: argument must be set to a column name to use for ordering rows'
+ end
+
+ start = except(:select)
+ .select(column)
+ .reorder(column => :asc)
+ .take
+
+ return unless start
+
+ start_id = start[column]
+ arel_table = self.arel_table
+
+ 1.step do |index|
+ stop = except(:select)
+ .select(column)
+ .where(arel_table[column].gteq(start_id))
+ .reorder(column => :asc)
+ .offset(of)
+ .limit(1)
+ .take
+
+ relation = where(arel_table[column].gteq(start_id))
+
+ if stop
+ stop_id = stop[column]
+ start_id = stop_id
+ relation = relation.where(arel_table[column].lt(stop_id))
+ end
+
+ # Any ORDER BYs are useless for this relation and can lead to less
+ # efficient UPDATE queries, hence we get rid of it.
+ yield relation.except(:order), index
+
+ break unless stop
+ end
+ end
+ end
+end
diff --git a/doc/development/README.md b/doc/development/README.md
index a2a07c37ced..58993c52dcd 100644
--- a/doc/development/README.md
+++ b/doc/development/README.md
@@ -55,6 +55,7 @@
- [Single Table Inheritance](single_table_inheritance.md)
- [Background Migrations](background_migrations.md)
- [Storing SHA1 Hashes As Binary](sha1_as_binary.md)
+- [Iterating Tables In Batches](iterating_tables_in_batches.md)
## i18n
diff --git a/doc/development/iterating_tables_in_batches.md b/doc/development/iterating_tables_in_batches.md
new file mode 100644
index 00000000000..590c8cbba2d
--- /dev/null
+++ b/doc/development/iterating_tables_in_batches.md
@@ -0,0 +1,37 @@
+# Iterating Tables In Batches
+
+Rails provides a method called `in_batches` that can be used to iterate over
+rows in batches. For example:
+
+```ruby
+User.in_batches(of: 10) do |relation|
+ relation.update_all(updated_at: Time.now)
+end
+```
+
+Unfortunately this method is implemented in a way that is not very efficient,
+both query and memory usage wise.
+
+To work around this you can include the `EachBatch` module into your models,
+then use the `each_batch` class method. For example:
+
+```ruby
+class User < ActiveRecord::Base
+ include EachBatch
+end
+
+User.each_batch(of: 10) do |relation|
+ relation.update_all(updated_at: Time.now)
+end
+```
+
+This will end up producing queries such as:
+
+```
+User Load (0.7ms) SELECT "users"."id" FROM "users" WHERE ("users"."id" >= 41654) ORDER BY "users"."id" ASC LIMIT 1 OFFSET 1000
+ (0.7ms) SELECT COUNT(*) FROM "users" WHERE ("users"."id" >= 41654) AND ("users"."id" < 42687)
+```
+
+The API of this method is similar to `in_batches`, though it doesn't support
+all of the arguments that `in_batches` supports. You should always use
+`each_batch` _unless_ you have a specific need for `in_batches`.
diff --git a/lib/gitlab/otp_key_rotator.rb b/lib/gitlab/otp_key_rotator.rb
index 0d541935bc6..22332474945 100644
--- a/lib/gitlab/otp_key_rotator.rb
+++ b/lib/gitlab/otp_key_rotator.rb
@@ -34,7 +34,7 @@ module Gitlab
write_csv do |csv|
ActiveRecord::Base.transaction do
- User.with_two_factor.in_batches do |relation|
+ User.with_two_factor.in_batches do |relation| # rubocop: disable Cop/InBatches
rows = relation.pluck(:id, :encrypted_otp_secret, :encrypted_otp_secret_iv, :encrypted_otp_secret_salt)
rows.each do |row|
user = %i[id ciphertext iv salt].zip(row).to_h
diff --git a/rubocop/cop/in_batches.rb b/rubocop/cop/in_batches.rb
new file mode 100644
index 00000000000..c0240187e66
--- /dev/null
+++ b/rubocop/cop/in_batches.rb
@@ -0,0 +1,16 @@
+require_relative '../model_helpers'
+
+module RuboCop
+ module Cop
+ # Cop that prevents the use of `in_batches`
+ class InBatches < RuboCop::Cop::Cop
+ MSG = 'Do not use `in_batches`, use `each_batch` from the EachBatch module instead'.freeze
+
+ def on_send(node)
+ return unless node.children[1] == :in_batches
+
+ add_offense(node, :selector)
+ end
+ end
+ end
+end
diff --git a/rubocop/rubocop.rb b/rubocop/rubocop.rb
index 1e70314598a..f76144275c9 100644
--- a/rubocop/rubocop.rb
+++ b/rubocop/rubocop.rb
@@ -5,6 +5,7 @@ require_relative 'cop/redirect_with_status'
require_relative 'cop/polymorphic_associations'
require_relative 'cop/project_path_helper'
require_relative 'cop/active_record_dependent'
+require_relative 'cop/in_batches'
require_relative 'cop/migration/add_column'
require_relative 'cop/migration/add_column_with_default_to_large_table'
require_relative 'cop/migration/add_concurrent_foreign_key'
diff --git a/spec/models/concerns/each_batch_spec.rb b/spec/models/concerns/each_batch_spec.rb
new file mode 100644
index 00000000000..951690a217b
--- /dev/null
+++ b/spec/models/concerns/each_batch_spec.rb
@@ -0,0 +1,53 @@
+require 'spec_helper'
+
+describe EachBatch do
+ describe '.each_batch' do
+ let(:model) do
+ Class.new(ActiveRecord::Base) do
+ include EachBatch
+
+ self.table_name = 'users'
+ end
+ end
+
+ before do
+ 5.times { create(:user, updated_at: 1.day.ago) }
+ end
+
+ it 'yields an ActiveRecord::Relation when a block is given' do
+ model.each_batch do |relation|
+ expect(relation).to be_a_kind_of(ActiveRecord::Relation)
+ end
+ end
+
+ it 'yields a batch index as the second argument' do
+ model.each_batch do |_, index|
+ expect(index).to eq(1)
+ end
+ end
+
+ it 'accepts a custom batch size' do
+ amount = 0
+
+ model.each_batch(of: 1) { amount += 1 }
+
+ expect(amount).to eq(5)
+ end
+
+ it 'does not include ORDER BYs in the yielded relations' do
+ model.each_batch do |relation|
+ expect(relation.to_sql).not_to include('ORDER BY')
+ end
+ end
+
+ it 'allows updating of the yielded relations' do
+ time = Time.now
+
+ model.each_batch do |relation|
+ relation.update_all(updated_at: time)
+ end
+
+ expect(model.where(updated_at: time).count).to eq(5)
+ end
+ end
+end
diff --git a/spec/rubocop/cop/in_batches_spec.rb b/spec/rubocop/cop/in_batches_spec.rb
new file mode 100644
index 00000000000..072481984c6
--- /dev/null
+++ b/spec/rubocop/cop/in_batches_spec.rb
@@ -0,0 +1,19 @@
+require 'spec_helper'
+require 'rubocop'
+require 'rubocop/rspec/support'
+require_relative '../../../rubocop/cop/in_batches'
+
+describe RuboCop::Cop::InBatches do
+ include CopHelper
+
+ subject(:cop) { described_class.new }
+
+ it 'registers an offense when in_batches is used' do
+ inspect_source(cop, 'foo.in_batches do; end')
+
+ aggregate_failures do
+ expect(cop.offenses.size).to eq(1)
+ expect(cop.offenses.map(&:line)).to eq([1])
+ end
+ end
+end