diff options
Diffstat (limited to 'lib/gitlab/github_import/client.rb')
-rw-r--r-- | lib/gitlab/github_import/client.rb | 239 |
1 files changed, 138 insertions, 101 deletions
diff --git a/lib/gitlab/github_import/client.rb b/lib/gitlab/github_import/client.rb index 0550f9695bd..844530b1ea7 100644 --- a/lib/gitlab/github_import/client.rb +++ b/lib/gitlab/github_import/client.rb @@ -1,147 +1,184 @@ +# frozen_string_literal: true + module Gitlab module GithubImport + # HTTP client for interacting with the GitHub API. + # + # This class is basically a fancy wrapped around Octokit while adding some + # functionality to deal with rate limiting and parallel imports. Usage is + # mostly the same as Octokit, for example: + # + # client = GithubImport::Client.new('hunter2') + # + # client.labels.each do |label| + # puts label.name + # end class Client - GITHUB_SAFE_REMAINING_REQUESTS = 100 - GITHUB_SAFE_SLEEP_TIME = 500 + attr_reader :octokit - attr_reader :access_token, :host, :api_version + # A single page of data and the corresponding page number. + Page = Struct.new(:objects, :number) - def initialize(access_token, host: nil, api_version: 'v3') - @access_token = access_token - @host = host.to_s.sub(%r{/+\z}, '') - @api_version = api_version - @users = {} + # The minimum number of requests we want to keep available. + # + # We don't use a value of 0 as multiple threads may be using the same + # token in parallel. This could result in all of them hitting the GitHub + # rate limit at once. The threshold is put in place to not hit the limit + # in most cases. + RATE_LIMIT_THRESHOLD = 50 - if access_token - ::Octokit.auto_paginate = false - end + # token - The GitHub API token to use. + # + # per_page - The number of objects that should be displayed per page. + # + # parallel - When set to true hitting the rate limit will result in a + # dedicated error being raised. When set to `false` we will + # instead just `sleep()` until the rate limit is reset. Setting + # this value to `true` for parallel importing is crucial as + # otherwise hitting the rate limit will result in a thread + # being blocked in a `sleep()` call for up to an hour. + def initialize(token, per_page: 100, parallel: true) + @octokit = Octokit::Client.new(access_token: token, per_page: per_page) + @parallel = parallel end - def api - @api ||= ::Octokit::Client.new( - access_token: access_token, - api_endpoint: api_endpoint, - # If there is no config, we're connecting to github.com and we - # should verify ssl. - connection_options: { - ssl: { verify: config ? config['verify_ssl'] : true } - } - ) + def parallel? + @parallel end - def client - unless config - raise Projects::ImportService::Error, - 'OAuth configuration for GitHub missing.' - end - - @client ||= ::OAuth2::Client.new( - config.app_id, - config.app_secret, - github_options.merge(ssl: { verify: config['verify_ssl'] }) - ) + # Returns the details of a GitHub user. + # + # username - The username of the user. + def user(username) + with_rate_limit { octokit.user(username) } end - def authorize_url(redirect_uri) - client.auth_code.authorize_url({ - redirect_uri: redirect_uri, - scope: "repo, user, user:email" - }) + # Returns the details of a GitHub repository. + # + # name - The path (in the form `owner/repository`) of the repository. + def repository(name) + with_rate_limit { octokit.repo(name) } end - def get_token(code) - client.auth_code.get_token(code).token + def labels(*args) + each_object(:labels, *args) end - def method_missing(method, *args, &block) - if api.respond_to?(method) - request(method, *args, &block) - else - super(method, *args, &block) - end + def milestones(*args) + each_object(:milestones, *args) end - def respond_to?(method) - api.respond_to?(method) || super + def releases(*args) + each_object(:releases, *args) end - def user(login) - return nil unless login.present? - return @users[login] if @users.key?(login) + # Fetches data from the GitHub API and yields a Page object for every page + # of data, without loading all of them into memory. + # + # method - The Octokit method to use for getting the data. + # args - Arguments to pass to the Octokit method. + # + # rubocop: disable GitlabSecurity/PublicSend + def each_page(method, *args, &block) + return to_enum(__method__, method, *args) unless block_given? - @users[login] = api.user(login) - end + page = + if args.last.is_a?(Hash) && args.last[:page] + args.last[:page] + else + 1 + end - private + collection = with_rate_limit { octokit.public_send(method, *args) } + next_url = octokit.last_response.rels[:next] - def api_endpoint - if host.present? && api_version.present? - "#{host}/api/#{api_version}" - else - github_options[:site] + yield Page.new(collection, page) + + while next_url + response = with_rate_limit { next_url.get } + next_url = response.rels[:next] + + yield Page.new(response.data, page += 1) end end - def config - Gitlab.config.omniauth.providers.find { |provider| provider.name == "github" } - end + # Iterates over all of the objects for the given method (e.g. `:labels`). + # + # method - The method to send to Octokit for querying data. + # args - Any arguments to pass to the Octokit method. + def each_object(method, *args, &block) + return to_enum(__method__, method, *args) unless block_given? - def github_options - if config - config["args"]["client_options"].deep_symbolize_keys - else - OmniAuth::Strategies::GitHub.default_options[:client_options].symbolize_keys + each_page(method, *args) do |page| + page.objects.each do |object| + yield object + end end end - def rate_limit - api.rate_limit! - # GitHub Rate Limit API returns 404 when the rate limit is - # disabled. In this case we just want to return gracefully - # instead of spitting out an error. - rescue Octokit::NotFound - nil - end + # Yields the supplied block, responding to any rate limit errors. + # + # The exact strategy used for handling rate limiting errors depends on + # whether we are running in parallel mode or not. For more information see + # `#rate_or_wait_for_rate_limit`. + def with_rate_limit + request_count_counter.increment - def has_rate_limit? - return @has_rate_limit if defined?(@has_rate_limit) + raise_or_wait_for_rate_limit unless requests_remaining? - @has_rate_limit = rate_limit.present? - end + begin + yield + rescue Octokit::TooManyRequests + raise_or_wait_for_rate_limit - def rate_limit_exceed? - has_rate_limit? && rate_limit.remaining <= GITHUB_SAFE_REMAINING_REQUESTS + # This retry will only happen when running in sequential mode as we'll + # raise an error in parallel mode. + retry + end end - def rate_limit_sleep_time - rate_limit.resets_in + GITHUB_SAFE_SLEEP_TIME + # Returns `true` if we're still allowed to perform API calls. + def requests_remaining? + remaining_requests > RATE_LIMIT_THRESHOLD end - def request(method, *args, &block) - sleep rate_limit_sleep_time if rate_limit_exceed? - - data = api.__send__(method, *args) # rubocop:disable GitlabSecurity/PublicSend - return data unless data.is_a?(Array) + def remaining_requests + octokit.rate_limit.remaining + end - last_response = api.last_response + def raise_or_wait_for_rate_limit + rate_limit_counter.increment - if block_given? - yield data - # api.last_response could change while we're yielding (e.g. fetching labels for each PR) - # so we cache our own last response - each_response_page(last_response, &block) + if parallel? + raise RateLimitError else - each_response_page(last_response) { |page| data.concat(page) } - data + sleep(rate_limit_resets_in) end end - def each_response_page(last_response) - while last_response.rels[:next] - sleep rate_limit_sleep_time if rate_limit_exceed? - last_response = last_response.rels[:next].get - yield last_response.data if last_response.data.is_a?(Array) - end + def rate_limit_resets_in + # We add a few seconds to the rate limit so we don't _immediately_ + # resume when the rate limit resets as this may result in us performing + # a request before GitHub has a chance to reset the limit. + octokit.rate_limit.resets_in + 5 + end + + def respond_to_missing?(method, include_private = false) + octokit.respond_to?(method, include_private) + end + + def rate_limit_counter + @rate_limit_counter ||= Gitlab::Metrics.counter( + :github_importer_rate_limit_hits, + 'The number of times we hit the GitHub rate limit when importing projects' + ) + end + + def request_count_counter + @request_counter ||= Gitlab::Metrics.counter( + :github_importer_request_count, + 'The number of GitHub API calls performed when importing projects' + ) end end end |