diff options
Diffstat (limited to 'lib/gitlab/import_export/fast_hash_serializer.rb')
-rw-r--r-- | lib/gitlab/import_export/fast_hash_serializer.rb | 108 |
1 files changed, 108 insertions, 0 deletions
diff --git a/lib/gitlab/import_export/fast_hash_serializer.rb b/lib/gitlab/import_export/fast_hash_serializer.rb new file mode 100644 index 00000000000..a6ab4f3a3d9 --- /dev/null +++ b/lib/gitlab/import_export/fast_hash_serializer.rb @@ -0,0 +1,108 @@ +# frozen_string_literal: true + +# ActiveModel::Serialization (https://github.com/rails/rails/blob/v5.0.7/activemodel/lib/active_model/serialization.rb#L184) +# is simple in that it recursively calls `as_json` on each object to +# serialize everything. However, for a model like a Project, this can +# generate a query for every single association, which can add up to tens +# of thousands of queries and lead to memory bloat. +# +# To improve this, we can do several things: + +# 1. Use the option tree in http://api.rubyonrails.org/classes/ActiveModel/Serializers/JSON.html +# to generate the necessary preload clauses. +# +# 2. We observe that a single project has many issues, merge requests, +# etc. Instead of serializing everything at once, which could lead to +# database timeouts and high memory usage, we take each top-level +# association and serialize the data in batches. +# +# For example, we serialize the first 100 issues and preload all of +# their associated events, notes, etc. before moving onto the next +# batch. When we're done, we serialize merge requests in the same way. +# We repeat this pattern for the remaining associations specified in +# import_export.yml. +module Gitlab + module ImportExport + class FastHashSerializer + attr_reader :subject, :tree + + BATCH_SIZE = 100 + + def initialize(subject, tree, batch_size: BATCH_SIZE) + @subject = subject + @batch_size = batch_size + @tree = tree + end + + # Serializes the subject into a Hash for the given option tree + # (e.g. Project#as_json) + def execute + simple_serialize.merge(serialize_includes) + end + + private + + def simple_serialize + subject.as_json( + tree.merge(include: nil, preloads: nil)) + end + + def serialize_includes + return {} unless includes + + includes + .map(&method(:serialize_include_definition)) + .compact + .to_h + end + + # definition: + # { labels: { includes: ... } } + def serialize_include_definition(definition) + raise ArgumentError, 'definition needs to be Hash' unless definition.is_a?(Hash) + raise ArgumentError, 'definition needs to have exactly one Hash element' unless definition.one? + + key = definition.first.first + options = definition.first.second + + record = subject.public_send(key) # rubocop: disable GitlabSecurity/PublicSend + return unless record + + serialized_record = serialize_record(key, record, options) + return unless serialized_record + + # `#as_json` always returns keys as `strings` + [key.to_s, serialized_record] + end + + def serialize_record(key, record, options) + unless record.respond_to?(:as_json) + raise "Invalid type of #{key} is #{record.class}" + end + + # no has-many relation + unless record.is_a?(ActiveRecord::Relation) + return record.as_json(options) + end + + # has-many relation + data = [] + + record.in_batches(of: @batch_size) do |batch| # rubocop:disable Cop/InBatches + batch = batch.preload(preloads[key]) if preloads&.key?(key) + data += batch.as_json(options) + end + + data + end + + def includes + tree[:include] + end + + def preloads + tree[:preload] + end + end + end +end |