summaryrefslogtreecommitdiff
path: root/lib/gitlab/github_import/single_endpoint_notes_importing.rb
blob: 3584288da573106b1a69a5acb6b74c4d1687d772 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# frozen_string_literal: true

# This module is used in:
#  - SingleEndpointDiffNotesImporter
#  - SingleEndpointIssueNotesImporter
#  - SingleEndpointMergeRequestNotesImporter
# if enabled by Gitlab::GithubImport::Settings
#
# - SingleEndpointIssueEventsImporter
# if enabled by Gitlab::GithubImport::Settings
#
# Fetches associated objects page by page for each item of the parent collection.
# Currently, `associated` is a note or an event.
# Currently, `parent` is a MergeRequest or an Issue record.
#
# It fetches one PR's associated objects at a time using the `issue_comments`
# or `pull_request_comments` endpoint. This is slower than `NotesImporter`, but
# it makes sure all notes are imported. That is not always the case for
# `NotesImporter`, because the `issues_comments` endpoint it uses can be limited
# by the GitHub API and may not return all available pages.
module Gitlab
  module GithubImport
    # Mixin that walks a parent collection in batches and, for every parent
    # record, requests its associated objects one API page at a time.
    #
    # Besides the three hooks that raise NotImplementedError below, the methods
    # here call `project`, `client`, `collection_method`, `collection_options`,
    # `object_type`, `already_imported?` and `mark_as_imported`. None of those
    # are defined in this module, so the including class has to provide them.
    module SingleEndpointNotesImporting
      # Batch size handed to `each_batch` when iterating the parent collection.
      BATCH_SIZE = 100

      # Passes every object of every fetched page, together with its parent
      # record and the caller's block, to `each_associated`.
      def each_object_to_import(&block)
        each_associated_page do |parent, page|
          page.objects.each { |object| each_associated(parent, object, &block) }
        end
      end

      # Returns whatever the associated object stores under `:id`.
      def id_for_already_imported_cache(associated)
        associated[:id]
      end

      # Hook for the including class. The returned object is iterated with
      # `each_batch(of: BATCH_SIZE, column: :iid)` and every record in a batch
      # is asked for its `iid`.
      def parent_collection
        raise NotImplementedError
      end

      # Hook for the including class. The returned key is the import-cache set
      # that `mark_parent_imported` writes to and `already_imported_parents`
      # reads from.
      def parent_imported_cache_key
        raise NotImplementedError
      end

      # Hook for the including class. The returned value is given to
      # `PageCounter.new` as the counter identifier for `parent`.
      def page_counter_id(parent)
        raise NotImplementedError
      end

      private

      # Converts one associated object to a hash and, unless it is reported as
      # already imported, bumps the `:fetched` counter, yields the hash and
      # marks it as imported.
      #
      # The parent record is unused here. It is part of the signature so that an
      # including class can override this method and copy extra information
      # from the parent onto the associated record when the GitHub API response
      # does not carry it. For example:
      # lib/gitlab/github_import/importer/single_endpoint_issue_events_importer.rb:26
      def each_associated(_parent_record, associated)
        attributes = associated.to_h
        return if already_imported?(attributes)

        Gitlab::GithubImport::ObjectCounter.increment(project, object_type, :fetched)
        yield attributes
        mark_as_imported(attributes)
      end

      # Iterates the parent collection in batches of BATCH_SIZE keyed on `iid`
      # and forwards each batch, along with the caller's block, to
      # `process_batch`.
      def each_associated_page(&block)
        parent_collection.each_batch(of: BATCH_SIZE, column: :iid) { |parents| process_batch(parents, &block) }
      end

      # For every parent in the batch: requests its pages starting from the
      # page counter's current value, yields `(parent, page)` for each page the
      # counter accepts, and finally records the parent as imported.
      def process_batch(batch)
        batch.each do |parent|
          # One counter per parent: a shared counter would make us skip pages
          # of notes that belong to a parent which was already imported.
          counter = PageCounter.new(project, page_counter_id(parent))
          source_repo = project.import_source
          request_options = collection_options.merge(page: counter.current)

          client.each_page(collection_method, source_repo, parent.iid, request_options) do |page|
            # A falsy result from `set` means this page must not be yielded.
            yield(parent, page) if counter.set(page.number)
          end

          mark_parent_imported(parent)
        end
      end

      # Adds the parent's `iid` to the import-cache set for this importer.
      def mark_parent_imported(parent)
        Gitlab::Cache::Import::Caching.set_add(parent_imported_cache_key, parent.iid)
      end

      # Returns the values currently stored in the import-cache set of
      # processed parents.
      def already_imported_parents
        cache_key = parent_imported_cache_key
        Gitlab::Cache::Import::Caching.values_from_set(cache_key)
      end
    end
  end
end