summaryrefslogtreecommitdiff
path: root/lib/gitlab/import_export/fast_hash_serializer.rb
blob: e5d52f945b573e6c6e0813e32f6e6255e4fba4c4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# frozen_string_literal: true

# ActiveModel::Serialization (https://github.com/rails/rails/blob/v5.0.7/activemodel/lib/active_model/serialization.rb#L184)
# is simple in that it recursively calls `as_json` on each object to
# serialize everything. However, for a model like a Project, this can
# generate a query for every single association, which can add up to tens
# of thousands of queries and lead to memory bloat.
#
# To improve this, we can do several things:
#
# 1. Use the option tree in http://api.rubyonrails.org/classes/ActiveModel/Serializers/JSON.html
#    to generate the necessary preload clauses.
#
# 2. We observe that a single project has many issues, merge requests,
#    etc. Instead of serializing everything at once, which could lead to
#    database timeouts and high memory usage, we take each top-level
#    association and serialize the data in batches.
#
#  For example, we serialize the first 100 issues and preload all of
#  their associated events, notes, etc. before moving onto the next
#  batch. When we're done, we serialize merge requests in the same way.
#  We repeat this pattern for the remaining associations specified in
#  import_export.yml.
module Gitlab
  module ImportExport
    class FastHashSerializer
      # `subject` is the root record being exported (e.g. a Project);
      # `tree` is the import/export option tree carrying the
      # `:include` and `:preload` clauses consumed below.
      attr_reader :subject, :tree

      # Usage of this class results in delayed
      # serialization of relation. The serialization
      # will be triggered when the `JSON.generate`
      # is executed.
      #
      # This class uses memory-optimised, lazily
      # initialised, fast to recycle relation
      # serialization.
      #
      # The `JSON.generate` does use `#to_json`,
      # that returns raw JSON content that is written
      # directly to file.
      class JSONBatchRelation
        include Gitlab::Utils::StrongMemoize

        # relation: a batch of records to serialize
        # options:  per-record `to_json` options from the option tree
        # preloads: preload clauses applied to the batch before rendering
        def initialize(relation, options, preloads)
          @relation = relation
          @options = options
          @preloads = preloads
        end

        # Renders every record of the batch with `to_json(@options)` and
        # joins the fragments with commas (no surrounding brackets).
        # Memoized, so repeated `to_json` calls reuse the rendered string.
        def raw_json
          strong_memoize(:raw_json) do
            # `+''` yields a mutable string despite `frozen_string_literal`.
            result = +''

            batch = @relation
            batch = batch.preload(@preloads) if @preloads
            batch.each do |item|
              result.concat(",") unless result.empty?
              result.concat(item.to_json(@options))
            end

            result
          end
        end

        # Called by `JSON.generate`. The `options` argument is intentionally
        # ignored: the JSON was already rendered using `@options`.
        def to_json(options = {})
          raw_json
        end

        # Materialising a hash for the whole batch would defeat the purpose
        # of this class, so hash-style access is explicitly unsupported.
        def as_json(*)
          raise NotImplementedError
        end
      end

      # Number of records serialized per database batch.
      BATCH_SIZE = 100

      def initialize(subject, tree, batch_size: BATCH_SIZE)
        @subject = subject
        @batch_size = batch_size
        @tree = tree
      end

      # With the usage of `JSONBatchRelation`, it returns partially
      # serialized hash which is not easily accessible.
      # It means you can only manipulate and replace top-level objects.
      # All future mutations of the hash (such as `fix_project_tree`)
      # should be aware of that.
      def execute
        simple_serialize.merge(serialize_includes)
      end

      private

      # Serializes only the subject's own attributes: `include`/`preloads`
      # are stripped so no associations are walked here.
      def simple_serialize
        subject.as_json(
          tree.merge(include: nil, preloads: nil))
      end

      # Serializes each top-level association listed under `tree[:include]`,
      # returning a hash keyed by association name (String keys, to match
      # the `#as_json` output merged in `execute`).
      def serialize_includes
        return {} unless includes

        includes
          .map(&method(:serialize_include_definition))
          .tap { |entries| entries.compact! }
          .to_h
      end

      # definition:
      # { labels: { includes: ... } }
      #
      # Returns a `[key, serialized_value]` pair, or nil when the
      # association is absent or serializes to nothing (nil entries are
      # compacted away by `serialize_includes`).
      def serialize_include_definition(definition)
        raise ArgumentError, 'definition needs to be Hash' unless definition.is_a?(Hash)
        raise ArgumentError, 'definition needs to have exactly one Hash element' unless definition.one?

        key = definition.first.first
        options = definition.first.second

        record = subject.public_send(key) # rubocop: disable GitlabSecurity/PublicSend
        return unless record

        serialized_record = serialize_record(key, record, options)
        return unless serialized_record

        # `#as_json` always returns keys as `strings`
        [key.to_s, serialized_record]
      end

      # Serializes a single association. Non-relation objects are serialized
      # directly via `as_json`; `ActiveRecord::Relation`s are read in
      # primary-key-ordered batches of `@batch_size`, each wrapped in a
      # `JSONBatchRelation` whose JSON is rendered immediately (memoized via
      # the `tap(&:raw_json)`) while the batch's records are still loaded.
      def serialize_record(key, record, options)
        unless record.respond_to?(:as_json)
          raise "Invalid type of #{key} is #{record.class}"
        end

        # no has-many relation
        unless record.is_a?(ActiveRecord::Relation)
          return record.as_json(options)
        end

        data = []

        record.in_batches(of: @batch_size) do |batch| # rubocop:disable Cop/InBatches
          # order each batch by it's primary key to ensure
          # consistent and predictable ordering of each exported relation
          # as additional `WHERE` clauses can impact the order in which data is being
          # returned by database when no `ORDER` is specified
          batch = batch.reorder(batch.klass.primary_key)

          data.append(JSONBatchRelation.new(batch, options, preloads[key]).tap(&:raw_json))
        end

        data
      end

      # Top-level associations to serialize, from the option tree.
      def includes
        tree[:include]
      end

      # Per-association preload clauses, from the option tree.
      def preloads
        tree[:preload]
      end
    end
  end
end