summaryrefslogtreecommitdiff
path: root/app/workers
diff options
context:
space:
mode:
authorJacob Vosmaer <contact@jacobvosmaer.nl>2016-04-13 15:56:05 +0200
committerJacob Vosmaer <contact@jacobvosmaer.nl>2016-04-13 15:56:05 +0200
commit0f602be99f99f1ae493478a8a28df2907cfa0082 (patch)
treea49a9a7ac70ca6825547db88a32f8b4c00343aa3 /app/workers
parent9a30d3b5aef732e782e9496b2e8ae62069ba521a (diff)
downloadgitlab-ce-0f602be99f99f1ae493478a8a28df2907cfa0082.tar.gz
Clear repository check columns asynchronously
Diffstat (limited to 'app/workers')
-rw-r--r--app/workers/repository_check/batch_worker.rb63
-rw-r--r--app/workers/repository_check/clear_worker.rb17
-rw-r--r--app/workers/repository_check/single_repository_worker.rb36
-rw-r--r--app/workers/repository_check_worker.rb61
-rw-r--r--app/workers/single_repository_check_worker.rb34
5 files changed, 116 insertions, 95 deletions
diff --git a/app/workers/repository_check/batch_worker.rb b/app/workers/repository_check/batch_worker.rb
new file mode 100644
index 00000000000..16cd77a9bf0
--- /dev/null
+++ b/app/workers/repository_check/batch_worker.rb
@@ -0,0 +1,63 @@
+module RepositoryCheck
+ class BatchWorker
+ include Sidekiq::Worker
+
+ RUN_TIME = 3600
+
+ sidekiq_options retry: false
+
+ def perform
+ start = Time.now
+
+ # This loop will break after a little more than one hour ('a little
+ # more' because `git fsck` may take a few minutes), or if it runs out of
+ # projects to check. By default sidekiq-cron will start a new
+ # RepositoryCheckWorker each hour so that as long as there are repositories to
+ # check, only one (or two) will be checked at a time.
+ project_ids.each do |project_id|
+ break if Time.now - start >= RUN_TIME
+ break unless current_settings.repository_checks_enabled
+
+ next unless try_obtain_lease(project_id)
+
+ SingleRepositoryWorker.new.perform(project_id)
+ end
+ end
+
+ private
+
+ # Project.find_each does not support WHERE clauses and
+ # Project.find_in_batches does not support ordering. So we just build an
+ # array of ID's. This is OK because we do it only once an hour, because
+ # getting ID's from Postgres is not terribly slow, and because no user
+ # has to sit and wait for this query to finish.
+ def project_ids
+ limit = 10_000
+ never_checked_projects = Project.where('last_repository_check_at IS NULL').limit(limit).
+ pluck(:id)
+ old_check_projects = Project.where('last_repository_check_at < ?', 1.week.ago).
+ reorder('last_repository_check_at ASC').limit(limit).pluck(:id)
+ never_checked_projects + old_check_projects
+ end
+
+ def try_obtain_lease(id)
+ # Use a 24-hour timeout because on servers/projects where 'git fsck' is
+ # super slow we definitely do not want to run it twice in parallel.
+ Gitlab::ExclusiveLease.new(
+ "project_repository_check:#{id}",
+ timeout: 24.hours
+ ).try_obtain
+ end
+
+ def current_settings
+ # No caching of the settings! If we cache them and an admin disables
+ # this feature, an active RepositoryCheckWorker would keep going for up
+ # to 1 hour after the feature was disabled.
+ if Rails.env.test?
+ Gitlab::CurrentSettings.fake_application_settings
+ else
+ ApplicationSetting.current
+ end
+ end
+ end
+end
diff --git a/app/workers/repository_check/clear_worker.rb b/app/workers/repository_check/clear_worker.rb
new file mode 100644
index 00000000000..fe0cce9aab7
--- /dev/null
+++ b/app/workers/repository_check/clear_worker.rb
@@ -0,0 +1,17 @@
+module RepositoryCheck
+ class ClearWorker
+ include Sidekiq::Worker
+
+ sidekiq_options retry: false
+
+ def perform
+ # Do batched updates because these updates will be slow and locking
+ Project.select(:id).find_in_batches(batch_size: 1000) do |batch|
+ Project.where(id: batch.map(&:id)).update_all(
+ last_repository_check_failed: nil,
+ last_repository_check_at: nil,
+ )
+ end
+ end
+ end
+end \ No newline at end of file
diff --git a/app/workers/repository_check/single_repository_worker.rb b/app/workers/repository_check/single_repository_worker.rb
new file mode 100644
index 00000000000..e54ae86d06c
--- /dev/null
+++ b/app/workers/repository_check/single_repository_worker.rb
@@ -0,0 +1,36 @@
+module RepositoryCheck
+ class SingleRepositoryWorker
+ include Sidekiq::Worker
+
+ sidekiq_options retry: false
+
+ def perform(project_id)
+ project = Project.find(project_id)
+ project.update_columns(
+ last_repository_check_failed: !check(project),
+ last_repository_check_at: Time.now,
+ )
+ end
+
+ private
+
+ def check(project)
+ # Use 'map do', not 'all? do', to prevent short-circuiting
+ [project.repository, project.wiki.repository].map do |repository|
+ git_fsck(repository.path_to_repo)
+ end.all?
+ end
+
+ def git_fsck(path)
+ cmd = %W(nice git --git-dir=#{path} fsck)
+ output, status = Gitlab::Popen.popen(cmd)
+
+ if status.zero?
+ true
+ else
+ Gitlab::RepositoryCheckLogger.error("command failed: #{cmd.join(' ')}\n#{output}")
+ false
+ end
+ end
+ end
+end
diff --git a/app/workers/repository_check_worker.rb b/app/workers/repository_check_worker.rb
deleted file mode 100644
index d7ead91f94e..00000000000
--- a/app/workers/repository_check_worker.rb
+++ /dev/null
@@ -1,61 +0,0 @@
-class RepositoryCheckWorker
- include Sidekiq::Worker
-
- RUN_TIME = 3600
-
- sidekiq_options retry: false
-
- def perform
- start = Time.now
-
- # This loop will break after a little more than one hour ('a little
- # more' because `git fsck` may take a few minutes), or if it runs out of
- # projects to check. By default sidekiq-cron will start a new
- # RepositoryCheckWorker each hour so that as long as there are repositories to
- # check, only one (or two) will be checked at a time.
- project_ids.each do |project_id|
- break if Time.now - start >= RUN_TIME
- break unless current_settings.repository_checks_enabled
-
- next unless try_obtain_lease(project_id)
-
- SingleRepositoryCheckWorker.new.perform(project_id)
- end
- end
-
- private
-
- # Project.find_each does not support WHERE clauses and
- # Project.find_in_batches does not support ordering. So we just build an
- # array of ID's. This is OK because we do it only once an hour, because
- # getting ID's from Postgres is not terribly slow, and because no user
- # has to sit and wait for this query to finish.
- def project_ids
- limit = 10_000
- never_checked_projects = Project.where('last_repository_check_at IS NULL').limit(limit).
- pluck(:id)
- old_check_projects = Project.where('last_repository_check_at < ?', 1.week.ago).
- reorder('last_repository_check_at ASC').limit(limit).pluck(:id)
- never_checked_projects + old_check_projects
- end
-
- def try_obtain_lease(id)
- # Use a 24-hour timeout because on servers/projects where 'git fsck' is
- # super slow we definitely do not want to run it twice in parallel.
- Gitlab::ExclusiveLease.new(
- "project_repository_check:#{id}",
- timeout: 24.hours
- ).try_obtain
- end
-
- def current_settings
- # No caching of the settings! If we cache them and an admin disables
- # this feature, an active RepositoryCheckWorker would keep going for up
- # to 1 hour after the feature was disabled.
- if Rails.env.test?
- Gitlab::CurrentSettings.fake_application_settings
- else
- ApplicationSetting.current
- end
- end
-end
diff --git a/app/workers/single_repository_check_worker.rb b/app/workers/single_repository_check_worker.rb
deleted file mode 100644
index f6c345df8b5..00000000000
--- a/app/workers/single_repository_check_worker.rb
+++ /dev/null
@@ -1,34 +0,0 @@
-class SingleRepositoryCheckWorker
- include Sidekiq::Worker
-
- sidekiq_options retry: false
-
- def perform(project_id)
- project = Project.find(project_id)
- project.update_columns(
- last_repository_check_failed: !check(project),
- last_repository_check_at: Time.now,
- )
- end
-
- private
-
- def check(project)
- # Use 'map do', not 'all? do', to prevent short-circuiting
- [project.repository, project.wiki.repository].map do |repository|
- git_fsck(repository.path_to_repo)
- end.all?
- end
-
- def git_fsck(path)
- cmd = %W(nice git --git-dir=#{path} fsck)
- output, status = Gitlab::Popen.popen(cmd)
-
- if status.zero?
- true
- else
- Gitlab::RepositoryCheckLogger.error("command failed: #{cmd.join(' ')}\n#{output}")
- false
- end
- end
-end