summary refs log tree commit diff
path: root/app/workers/repository_check/batch_worker.rb
blob: 72f0a9b0619764d551a41b33f1996b59d3d868ec (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
module RepositoryCheck
  # Worker that runs repository checks on a batch of projects, one project at
  # a time, until it either runs out of candidates or exceeds its time budget.
  #
  # Candidates are, in order:
  #   1. projects that have never been checked and were created more than
  #      24 hours ago;
  #   2. projects whose last check is more than one month old, least recently
  #      checked first.
  class BatchWorker
    include ApplicationWorker
    include CronjobQueue

    # Time budget for a single run, in seconds. The budget is only tested
    # between projects, so a run can overshoot by the duration of one check.
    RUN_TIME = 3600

    # Maximum number of project IDs fetched by each of the two candidate
    # queries (so up to 2 * BATCH_SIZE IDs may be loaded per run).
    BATCH_SIZE = 10_000

    # Entry point. Does nothing when repository checks are disabled in the
    # application settings.
    def perform
      return unless Gitlab::CurrentSettings.repository_checks_enabled

      # Measure elapsed time with a monotonic clock: unlike Time.now it is not
      # affected by NTP adjustments or manual changes to the system clock,
      # either of which could otherwise cut the run short or let it run far
      # past RUN_TIME.
      started_at = monotonic_now

      # This loop will break after a little more than one hour ('a little
      # more' because `git fsck` may take a few minutes), or if it runs out of
      # projects to check. By default sidekiq-cron will start a new
      # RepositoryCheck::BatchWorker each hour so that as long as there are
      # repositories to check, only one (or two) will be checked at a time.
      project_ids.each do |project_id|
        break if monotonic_now - started_at >= RUN_TIME

        # Skip projects for which the lease could not be obtained.
        next unless try_obtain_lease(project_id)

        # Run the check inline (a direct method call, not an enqueued job) so
        # that projects are processed strictly one after another.
        SingleRepositoryWorker.new.perform(project_id)
      end
    end

    private

    # Current reading of the monotonic clock, in (fractional) seconds. Only
    # differences between two readings are meaningful.
    def monotonic_now
      Process.clock_gettime(Process::CLOCK_MONOTONIC)
    end

    # Project.find_each does not support WHERE clauses and
    # Project.find_in_batches does not support ordering. So we just build an
    # array of ID's. This is OK because we do it only once an hour, because
    # getting ID's from Postgres is not terribly slow, and because no user
    # has to sit and wait for this query to finish.
    #
    # Returns the never-checked IDs followed by the stale-check IDs.
    def project_ids
      never_checked_project_ids(BATCH_SIZE) + old_checked_project_ids(BATCH_SIZE)
    end

    # IDs of up to `batch_size` projects that have no recorded repository
    # check and were created more than 24 hours ago. No explicit ordering is
    # applied to this query.
    def never_checked_project_ids(batch_size)
      Project.where(last_repository_check_at: nil)
        .where('created_at < ?', 24.hours.ago)
        .limit(batch_size).pluck(:id)
    end

    # IDs of up to `batch_size` projects whose last repository check is more
    # than one month old, ordered from least to most recently checked.
    def old_checked_project_ids(batch_size)
      Project.where.not(last_repository_check_at: nil)
        .where('last_repository_check_at < ?', 1.month.ago)
        .reorder(last_repository_check_at: :asc)
        .limit(batch_size).pluck(:id)
    end

    # Tries to take the per-project repository-check lease for project `id`.
    #
    # Returns whatever Gitlab::ExclusiveLease#try_obtain returns; the caller
    # treats a falsy value as "lease not obtained" and skips the project.
    # NOTE(review): the lease is never explicitly released here, so it
    # presumably lasts for the full timeout -- confirm against ExclusiveLease.
    def try_obtain_lease(id)
      # Use a 24-hour timeout because on servers/projects where 'git fsck' is
      # super slow we definitely do not want to run it twice in parallel.
      Gitlab::ExclusiveLease.new(
        "project_repository_check:#{id}",
        timeout: 24.hours
      ).try_obtain
    end
  end
end