diff options
author | Stan Hu <stanhu@gmail.com> | 2018-07-06 09:05:58 -0700 |
---|---|---|
committer | Stan Hu <stanhu@gmail.com> | 2018-07-06 10:11:59 -0700 |
commit | b33661d6ec8498ae1dadfb3b2be0e4a80e61f108 (patch) | |
tree | d6b80c77965177619a6f4d149a10b613815fbfe6 /app/workers | |
parent | a291bcdf0d3ed892dcb805e11a43afcadbc20e8b (diff) | |
download | gitlab-ce-b33661d6ec8498ae1dadfb3b2be0e4a80e61f108.tar.gz |
Add ExclusiveLease guards for RepositoryCheck::{DispatchWorker,BatchWorker}
We saw in production that DispatchWorker was running about twice an hour,
which would schedule twice as many jobs as it should.
For some reason, BatchWorker was running 1000 times per hour, possibly
because Sidekiq processes were being killed for excessive memory (RSS)
usage, which caused these jobs to restart.
Adding a one-hour ExclusiveLease to each worker prevents these jobs from
running more often than they should.
Relates to https://gitlab.com/gitlab-com/infrastructure/issues/4526
Diffstat (limited to 'app/workers')
-rw-r--r-- | app/workers/repository_check/batch_worker.rb | 20 | ||||
-rw-r--r-- | app/workers/repository_check/dispatch_worker.rb | 13 |
2 files changed, 29 insertions, 4 deletions
diff --git a/app/workers/repository_check/batch_worker.rb b/app/workers/repository_check/batch_worker.rb index 051382a08a9..07559ea479b 100644 --- a/app/workers/repository_check/batch_worker.rb +++ b/app/workers/repository_check/batch_worker.rb @@ -4,9 +4,11 @@ module RepositoryCheck class BatchWorker include ApplicationWorker include RepositoryCheckQueue + include ExclusiveLeaseGuard RUN_TIME = 3600 BATCH_SIZE = 10_000 + LEASE_TIMEOUT = 1.hour attr_reader :shard_name @@ -16,6 +18,20 @@ module RepositoryCheck return unless Gitlab::CurrentSettings.repository_checks_enabled return unless Gitlab::ShardHealthCache.healthy_shard?(shard_name) + try_obtain_lease do + perform_repository_checks + end + end + + def lease_timeout + LEASE_TIMEOUT + end + + def lease_key + "repository_check_batch_worker:#{shard_name}" + end + + def perform_repository_checks start = Time.now # This loop will break after a little more than one hour ('a little @@ -26,7 +42,7 @@ module RepositoryCheck project_ids.each do |project_id| break if Time.now - start >= RUN_TIME - next unless try_obtain_lease(project_id) + next unless try_obtain_lease_for_project(project_id) SingleRepositoryWorker.new.perform(project_id) end @@ -60,7 +76,7 @@ module RepositoryCheck Project.where(repository_storage: shard_name) end - def try_obtain_lease(id) + def try_obtain_lease_for_project(id) # Use a 24-hour timeout because on servers/projects where 'git fsck' is # super slow we definitely do not want to run it twice in parallel. 
Gitlab::ExclusiveLease.new( diff --git a/app/workers/repository_check/dispatch_worker.rb b/app/workers/repository_check/dispatch_worker.rb index 891a273afd7..96634f09a15 100644 --- a/app/workers/repository_check/dispatch_worker.rb +++ b/app/workers/repository_check/dispatch_worker.rb @@ -3,13 +3,22 @@ module RepositoryCheck include ApplicationWorker include CronjobQueue include ::EachShardWorker + include ExclusiveLeaseGuard + + LEASE_TIMEOUT = 1.hour def perform return unless Gitlab::CurrentSettings.repository_checks_enabled - each_eligible_shard do |shard_name| - RepositoryCheck::BatchWorker.perform_async(shard_name) + try_obtain_lease do + each_eligible_shard do |shard_name| + RepositoryCheck::BatchWorker.perform_async(shard_name) + end end end + + def lease_timeout + LEASE_TIMEOUT + end end end |