lib/gitlab/github_import/client.rb


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295

# frozen_string_literal: true

module Gitlab
  module GithubImport
    # HTTP client for interacting with the GitHub API.
    #
    # This class is basically a fancy wrapper around Octokit while adding some
    # functionality to deal with rate limiting and parallel imports. Usage is
    # mostly the same as Octokit, for example:
    #
    #     client = GithubImport::Client.new('hunter2')
    #
    #     client.labels.each do |label|
    #       puts label.name
    #     end
    class Client
      # Mixed in for `strong_memoize`, which #rate_limiting_enabled? uses.
      include ::Gitlab::Utils::StrongMemoize

      # The underlying Octokit client created in #initialize.
      attr_reader :octokit

      # Request limit that identifies the search API: #requests_remaining?
      # switches to the search threshold when the reported limit equals this.
      SEARCH_MAX_REQUESTS_PER_MINUTE = 30
      # Page size used when #initialize is not given an explicit `per_page`.
      DEFAULT_PER_PAGE = 100
      # NOTE(review): not referenced in this file; presumably a reduced page
      # size for callers elsewhere -- confirm.
      LOWER_PER_PAGE = 50
      # Error class that #with_retry retries on.
      CLIENT_CONNECTION_ERROR = ::Faraday::ConnectionFailed # used/set in sawyer agent which octokit uses

      # A single page of data and the corresponding page number.
      Page = Struct.new(:objects, :number)

      # The minimum number of requests we want to keep available.
      #
      # We don't use a value of 0 as multiple threads may be using the same
      # token in parallel. This could result in all of them hitting the GitHub
      # rate limit at once. The threshold is put in place to not hit the limit
      # in most cases.
      RATE_LIMIT_THRESHOLD = 50
      # Same idea as RATE_LIMIT_THRESHOLD, but applied by #requests_remaining?
      # when the search API limit is in effect.
      SEARCH_RATE_LIMIT_THRESHOLD = 3

      # token - The GitHub API token to use.
      #
      # host - The GitHub hostname. If nil, github.com will be used.
      #
      # per_page - The number of objects that should be displayed per page.
      #
      # parallel - When set to true hitting the rate limit will result in a
      #            dedicated error being raised. When set to `false` we will
      #            instead just `sleep()` until the rate limit is reset. Setting
      #            this value to `true` for parallel importing is crucial as
      #            otherwise hitting the rate limit will result in a thread
      #            being blocked in a `sleep()` call for up to an hour.
      def initialize(token, host: nil, per_page: DEFAULT_PER_PAGE, parallel: true)
        # @host has to be assigned before the endpoints are resolved, because
        # #api_endpoint and #web_endpoint both read it.
        @host = host
        @parallel = parallel

        client_options = {
          access_token: token,
          per_page: per_page,
          api_endpoint: api_endpoint,
          web_endpoint: web_endpoint
        }

        @octokit = ::Octokit::Client.new(**client_options)

        # Apply the configured SSL verification setting to Octokit's connections.
        @octokit.connection_options[:ssl] = { verify: verify_ssl }
      end

      # Returns the `parallel` flag given to #initialize. It decides whether
      # #raise_or_wait_for_rate_limit raises or sleeps.
      def parallel?
        @parallel
      end

      # Returns the details of a GitHub user.
      #
      # username - The username of the user.
      def user(username)
        # Routed through #with_rate_limit so rate limit handling applies.
        with_rate_limit { octokit.user(username) }
      end

      # Iterates over the reviews of a single pull request via #each_object.
      #
      # repo_name - The repository argument passed on to Octokit.
      # iid - The pull request number passed on to Octokit.
      #
      # Returns an Enumerator, as no block is forwarded to #each_object.
      def pull_request_reviews(repo_name, iid)
        each_object(:pull_request_reviews, repo_name, iid)
      end

      # Returns the details of a GitHub repository.
      #
      # name - The path (in the form `owner/repository`) of the repository.
      def repository(name)
        # Routed through #with_rate_limit so rate limit handling applies.
        with_rate_limit { octokit.repo(name) }
      end

      # Returns the details of a single pull request.
      #
      # repo_name - The repository argument passed on to Octokit.
      # iid - The pull request number passed on to Octokit.
      def pull_request(repo_name, iid)
        with_rate_limit { octokit.pull_request(repo_name, iid) }
      end

      # Iterates over labels via #each_object. `args` are passed on to the
      # Octokit `labels` method. Returns an Enumerator, as no block is forwarded.
      def labels(*args)
        each_object(:labels, *args)
      end

      # Iterates over milestones via #each_object. `args` are passed on to the
      # Octokit `milestones` method. Returns an Enumerator, as no block is
      # forwarded.
      def milestones(*args)
        each_object(:milestones, *args)
      end

      # Iterates over releases via #each_object. `args` are passed on to the
      # Octokit `releases` method. Returns an Enumerator, as no block is
      # forwarded.
      def releases(*args)
        each_object(:releases, *args)
      end

      # Fetches data from the GitHub API and yields a Page object for every page
      # of data, without loading all of them into memory.
      #
      # method - The Octokit method to use for getting the data.
      # args - Arguments to pass to the Octokit method.
      #
      # rubocop: disable GitlabSecurity/PublicSend
      def each_page(method, *args, &block)
        return to_enum(__method__, method, *args) unless block

        # Number pages from the `:page` option when the trailing argument is an
        # options Hash carrying one; otherwise start at the first page.
        options = args.last
        page_number = (options[:page] if options.is_a?(Hash)) || 1

        first_batch = with_rate_limit { octokit.public_send(method, *args) }
        # Read straight after the request so the links belong to that response.
        next_link = octokit.last_response.rels[:next]

        yield Page.new(first_batch, page_number)

        # Follow the "next" relation until GitHub stops providing one.
        while next_link
          response = with_rate_limit { next_link.get }
          next_link = response.rels[:next]
          page_number += 1

          yield Page.new(response.data, page_number)
        end
      end

      # Iterates over all of the objects for the given method (e.g. `:labels`).
      #
      # method - The method to send to Octokit for querying data.
      # args - Any arguments to pass to the Octokit method.
      def each_object(method, *args, &block)
        return to_enum(__method__, method, *args) unless block

        # Flatten the pages: hand every object of every page to the caller.
        each_page(method, *args) { |page| page.objects.each(&block) }
      end

      # Yields the supplied block, responding to any rate limit errors.
      #
      # The exact strategy used for handling rate limiting errors depends on
      # whether we are running in parallel mode or not. For more information see
      # `#raise_or_wait_for_rate_limit`.
      def with_rate_limit
        if rate_limiting_enabled?
          request_count_counter.increment

          # Back off (or raise) up front when too few requests are left.
          raise_or_wait_for_rate_limit unless requests_remaining?

          begin
            with_retry { yield }
          rescue ::Octokit::TooManyRequests
            raise_or_wait_for_rate_limit

            # Only reached in sequential mode: in parallel mode the call above
            # raises instead of returning.
            retry
          end
        else
          # No rate limit bookkeeping; connection retries still apply.
          with_retry { yield }
        end
      end

      # Searches repositories by name using the query built by #search_query.
      #
      # name - The text to look for in repository names.
      # options - Passed on to Octokit's `search_repositories`.
      #
      # NOTE(review): this goes through #with_retry only, not #with_rate_limit.
      def search_repos_by_name(name, options = {})
        with_retry { octokit.search_repositories(search_query(str: name, type: :name), options) }
      end

      # Builds the search query string for the authenticated user.
      #
      # str - The text to search for.
      # type - The field to search in (used as `in:<type>`).
      # include_collaborations - When true, appends one `repo:` filter per
      #                          repository the user collaborates on.
      # include_orgs - When true, appends one `org:` filter per organization.
      #
      # Empty subqueries are left out, so the result never contains doubled or
      # trailing spaces when the user has no collaborations or organizations.
      def search_query(str:, type:, include_collaborations: true, include_orgs: true)
        parts = ["#{str} in:#{type} is:public,private user:#{octokit.user.login}"]

        parts << collaborations_subquery if include_collaborations
        parts << organizations_subquery if include_orgs

        parts.reject { |part| part.nil? || part.empty? }.join(' ')
      end

      # Returns `true` if we're still allowed to perform API calls.
      # Search API has rate limit of 30, use lowered threshold when search is used.
      def requests_remaining?
        # A limit equal to the search quota means the search API limit is in
        # effect, so the smaller safety margin is used.
        threshold =
          if requests_limit == SEARCH_MAX_REQUESTS_PER_MINUTE
            SEARCH_RATE_LIMIT_THRESHOLD
          else
            RATE_LIMIT_THRESHOLD
          end

        remaining_requests > threshold
      end

      # Returns the `remaining` value of the rate limit reported by Octokit.
      def remaining_requests
        octokit.rate_limit.remaining
      end

      # Returns the `limit` value of the rate limit reported by Octokit.
      def requests_limit
        octokit.rate_limit.limit
      end

      # Records a rate limit hit, then raises RateLimitError in parallel mode or
      # sleeps until the limit has reset in sequential mode.
      def raise_or_wait_for_rate_limit
        rate_limit_counter.increment

        raise RateLimitError if parallel?

        sleep(rate_limit_resets_in)
      end

      # Returns how long to wait for the rate limit to reset: the value Octokit
      # reports plus a small safety margin.
      def rate_limit_resets_in
        reported_reset = octokit.rate_limit.resets_in

        # The extra seconds avoid resuming at the exact reset moment, when
        # GitHub may not have reset the limit yet.
        reported_reset + 5
      end

      # Returns true when the API endpoint contains `.github.com`; only then
      # does #with_rate_limit do its rate limit bookkeeping. The result is
      # memoized.
      #
      # NOTE(review): this is a substring match, so an endpoint that merely
      # contains `.github.com` somewhere in its host also counts -- confirm
      # that is intended.
      def rate_limiting_enabled?
        strong_memoize(:rate_limiting_enabled) { api_endpoint.include?('.github.com') }
      end

      # Returns the API endpoint for Octokit: the explicit host when given,
      # else the endpoint from the OmniAuth provider settings, else the default.
      def api_endpoint
        return @host if @host

        custom_api_endpoint || default_api_endpoint
      end

      # Returns the web endpoint for Octokit: the explicit host when given,
      # else the endpoint from the OmniAuth provider settings, else Octokit's
      # default web endpoint.
      def web_endpoint
        return @host if @host

        custom_api_endpoint || ::Octokit::Default.web_endpoint
      end

      # Returns the `args.client_options.site` value from the GitHub OmniAuth
      # provider settings, or nil when it is not set.
      def custom_api_endpoint
        github_omniauth_provider.dig('args', 'client_options', 'site')
      end

      # Returns the site from the OmniAuth GitHub strategy defaults, falling
      # back to Octokit's default API endpoint when none is set.
      def default_api_endpoint
        omniauth_site = OmniAuth::Strategies::GitHub.default_options[:client_options][:site]

        omniauth_site || ::Octokit::Default.api_endpoint
      end

      # Returns the `verify_ssl` value from the GitHub OmniAuth provider
      # settings, defaulting to true when the key is absent.
      def verify_ssl
        github_omniauth_provider.fetch('verify_ssl', true)
      end

      # Returns the GitHub OmniAuth provider settings as a Hash, looked up once
      # and then cached on the instance.
      def github_omniauth_provider
        @github_omniauth_provider ||= begin
          provider_config = Gitlab::Auth::OAuth::Provider.config_for('github')
          provider_config.to_h
        end
      end

      # Returns the metrics counter for rate limit hits, created on first use
      # and cached on the instance.
      def rate_limit_counter
        @rate_limit_counter ||= begin
          name = :github_importer_rate_limit_hits
          description = 'The number of times we hit the GitHub rate limit when importing projects'

          Gitlab::Metrics.counter(name, description)
        end
      end

      # Returns the metrics counter for performed API calls, created on first
      # use and cached on the instance.
      def request_count_counter
        @request_counter ||= begin
          name = :github_importer_request_count
          description = 'The number of GitHub API calls performed when importing projects'

          Gitlab::Metrics.counter(name, description)
        end
      end

      private

      # Builds a space-separated list of `repo:<full_name>` filters, one per
      # repository returned for the `collaborator` affiliation.
      def collaborations_subquery
        collaborator_repos = each_object(:repos, nil, { affiliation: 'collaborator' })
        repo_filters = collaborator_repos.map { |collaboration| "repo:#{collaboration.full_name}" }

        repo_filters.join(' ')
      end

      # Builds a space-separated list of `org:<login>` filters, one per
      # organization returned by Octokit.
      def organizations_subquery
        org_filters = each_object(:organizations).map { |organization| "org:#{organization.login}" }

        org_filters.join(' ')
      end

      # Runs the block through Retriable, retrying on connection failures and
      # logging each retry via #on_retry.
      def with_retry
        retry_options = { on: CLIENT_CONNECTION_ERROR, on_retry: on_retry }

        Retriable.retriable(**retry_options) { yield }
      end

      # Returns the callback handed to Retriable; it logs the error, the attempt
      # number and the timing of every retry.
      def on_retry
        proc do |error, attempt, elapsed, next_wait|
          payload = {
            message: "GitHub connection retry triggered",
            'error.class': error.class,
            'error.message': error.message,
            try_count: attempt,
            elapsed_time_s: elapsed,
            wait_to_retry_s: next_wait
          }

          Gitlab::Import::Logger.info(**payload)
        end
      end
    end
  end
end