summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorKamil Trzciński <ayufan@ayufan.eu>2019-09-09 15:40:49 +0000
committerStan Hu <stanhu@gmail.com>2019-09-09 15:40:49 +0000
commit0e56c1e7cb3e1bbf3e81ab9907a26d385e28022c (patch)
tree4022cd2fe891d64eb34ceb5537467737a4054538 /lib
parent383f363589ac405cce07d3b54e796f9c949d2ffb (diff)
downloadgitlab-ce-0e56c1e7cb3e1bbf3e81ab9907a26d385e28022c.tar.gz
Improve performance and memory usage of project export
ActiveModel::Serialization is simple in that it recursively calls `as_json` on each object to serialize everything. However, for a model like a Project, this can generate a query for every single association, which can add up to tens of thousands of queries and lead to memory bloat. To improve this, we can do several things: 1. We use `tree:` and `preload:` to automatically generate a list of all preloads that could be used to serialize objects in bulk. 2. We observe that a single project has many issues, merge requests, etc. Instead of serializing everything at once, which could lead to database timeouts and high memory usage, we take each top-level association and serialize the data in batches. For example, we serialize the first 100 issues and preload all of their associated events, notes, etc. before moving onto the next batch. When we're done, we serialize merge requests in the same way. We repeat this pattern for the remaining associations specified in import_export.yml.
Diffstat (limited to 'lib')
-rw-r--r--lib/gitlab/import_export/attributes_finder.rb19
-rw-r--r--lib/gitlab/import_export/fast_hash_serializer.rb108
-rw-r--r--lib/gitlab/import_export/import_export.yml10
-rw-r--r--lib/gitlab/import_export/project_tree_saver.rb8
4 files changed, 143 insertions, 2 deletions
diff --git a/lib/gitlab/import_export/attributes_finder.rb b/lib/gitlab/import_export/attributes_finder.rb
index 13883ca7f3d..28d48ce6dfe 100644
--- a/lib/gitlab/import_export/attributes_finder.rb
+++ b/lib/gitlab/import_export/attributes_finder.rb
@@ -8,6 +8,7 @@ module Gitlab
@included_attributes = config[:included_attributes] || {}
@excluded_attributes = config[:excluded_attributes] || {}
@methods = config[:methods] || {}
+ @preloads = config[:preloads] || {}
end
def find_root(model_key)
@@ -29,10 +30,26 @@ module Gitlab
only: @included_attributes[model_key],
except: @excluded_attributes[model_key],
methods: @methods[model_key],
- include: resolve_model_tree(model_tree)
+ include: resolve_model_tree(model_tree),
+ preload: resolve_preloads(model_key, model_tree)
}.compact
end
+ def resolve_preloads(model_key, model_tree)
+ model_tree
+ .map { |submodel_key, submodel_tree| resolve_preload(model_key, submodel_key, submodel_tree) }
+ .compact
+ .to_h
+ .deep_merge(@preloads[model_key].to_h)
+ .presence
+ end
+
+ def resolve_preload(parent_model_key, model_key, model_tree)
+ return if @methods[parent_model_key]&.include?(model_key)
+
+ [model_key, resolve_preloads(model_key, model_tree)]
+ end
+
def resolve_model_tree(model_tree)
return unless model_tree
diff --git a/lib/gitlab/import_export/fast_hash_serializer.rb b/lib/gitlab/import_export/fast_hash_serializer.rb
new file mode 100644
index 00000000000..a6ab4f3a3d9
--- /dev/null
+++ b/lib/gitlab/import_export/fast_hash_serializer.rb
@@ -0,0 +1,108 @@
+# frozen_string_literal: true
+
+# ActiveModel::Serialization (https://github.com/rails/rails/blob/v5.0.7/activemodel/lib/active_model/serialization.rb#L184)
+# is simple in that it recursively calls `as_json` on each object to
+# serialize everything. However, for a model like a Project, this can
+# generate a query for every single association, which can add up to tens
+# of thousands of queries and lead to memory bloat.
+#
+# To improve this, we can do several things:
+#
+# 1. Use the option tree in http://api.rubyonrails.org/classes/ActiveModel/Serializers/JSON.html
+# to generate the necessary preload clauses.
+#
+# 2. We observe that a single project has many issues, merge requests,
+# etc. Instead of serializing everything at once, which could lead to
+# database timeouts and high memory usage, we take each top-level
+# association and serialize the data in batches.
+#
+# For example, we serialize the first 100 issues and preload all of
+# their associated events, notes, etc. before moving onto the next
+# batch. When we're done, we serialize merge requests in the same way.
+# We repeat this pattern for the remaining associations specified in
+# import_export.yml.
+module Gitlab
+ module ImportExport
+ class FastHashSerializer
+ attr_reader :subject, :tree
+
+ BATCH_SIZE = 100
+
+ def initialize(subject, tree, batch_size: BATCH_SIZE)
+ @subject = subject
+ @batch_size = batch_size
+ @tree = tree
+ end
+
+ # Serializes the subject into a Hash for the given option tree
+ # (e.g. Project#as_json)
+ def execute
+ simple_serialize.merge(serialize_includes)
+ end
+
+ private
+
+ def simple_serialize
+ subject.as_json(
+ tree.merge(include: nil, preloads: nil))
+ end
+
+ def serialize_includes
+ return {} unless includes
+
+ includes
+ .map(&method(:serialize_include_definition))
+ .compact
+ .to_h
+ end
+
+ # definition:
+ # { labels: { include: ... } }
+ def serialize_include_definition(definition)
+ raise ArgumentError, 'definition needs to be Hash' unless definition.is_a?(Hash)
+ raise ArgumentError, 'definition needs to have exactly one Hash element' unless definition.one?
+
+ key = definition.first.first
+ options = definition.first.second
+
+ record = subject.public_send(key) # rubocop: disable GitlabSecurity/PublicSend
+ return unless record
+
+ serialized_record = serialize_record(key, record, options)
+ return unless serialized_record
+
+ # `#as_json` always returns keys as `strings`
+ [key.to_s, serialized_record]
+ end
+
+ def serialize_record(key, record, options)
+ unless record.respond_to?(:as_json)
+ raise "Invalid type of #{key} is #{record.class}"
+ end
+
+ # not a has-many relation
+ unless record.is_a?(ActiveRecord::Relation)
+ return record.as_json(options)
+ end
+
+ # has-many relation
+ data = []
+
+ record.in_batches(of: @batch_size) do |batch| # rubocop:disable Cop/InBatches
+ batch = batch.preload(preloads[key]) if preloads&.key?(key)
+ data += batch.as_json(options)
+ end
+
+ data
+ end
+
+ def includes
+ tree[:include]
+ end
+
+ def preloads
+ tree[:preload]
+ end
+ end
+ end
+end
diff --git a/lib/gitlab/import_export/import_export.yml b/lib/gitlab/import_export/import_export.yml
index 06c94beead8..511b702553e 100644
--- a/lib/gitlab/import_export/import_export.yml
+++ b/lib/gitlab/import_export/import_export.yml
@@ -231,6 +231,16 @@ methods:
ci_pipelines:
- :notes
+preloads:
+ statuses:
+ # TODO: We cannot preload tags, as they are not part of `GenericCommitStatus`
+ # tags: # needed by tag_list
+ project: # deprecated: needed by coverage_regex of Ci::Build
+ merge_requests:
+ source_project: # needed by source_branch_sha and diff_head_sha
+ target_project: # needed by target_branch_sha
+ assignees: # needed by assignee_id that is implemented by DeprecatedAssignee
+
# EE specific relationships and settings to include. All of this will be merged
# into the previous structures if EE is used.
ee:
diff --git a/lib/gitlab/import_export/project_tree_saver.rb b/lib/gitlab/import_export/project_tree_saver.rb
index f1b3db6b208..f75f69b2c75 100644
--- a/lib/gitlab/import_export/project_tree_saver.rb
+++ b/lib/gitlab/import_export/project_tree_saver.rb
@@ -41,7 +41,13 @@ module Gitlab
end
def serialize_project_tree
- @project.as_json(reader.project_tree)
+ if Feature.enabled?(:export_fast_serialize, default_enabled: true)
+ Gitlab::ImportExport::FastHashSerializer
+ .new(@project, reader.project_tree)
+ .execute
+ else
+ @project.as_json(reader.project_tree)
+ end
end
def reader