summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKamil Trzciński <ayufan@ayufan.eu>2017-03-06 16:59:29 +0000
committerKamil Trzciński <ayufan@ayufan.eu>2017-03-06 16:59:29 +0000
commit34f3d8999a10b14edd0a440cac7b40d11589b54c (patch)
tree67bc001e038e4f5f5ae75f104f0b65a7e93e0d2e
parentb63c41e12e9e6f7e9fd1d79bedf56bd42cc17035 (diff)
parent4d4e99a2f163408de44d39ea98131a4231667c24 (diff)
downloadgitlab-ce-34f3d8999a10b14edd0a440cac7b40d11589b54c.tar.gz
Merge branch '27523-make-stuck-build-detection-more-performant' into 'master'
Make stuck builds detection more performant Closes #27523 See merge request !9025
-rw-r--r--app/workers/stuck_ci_builds_worker.rb19
-rw-r--r--app/workers/stuck_ci_jobs_worker.rb59
-rw-r--r--changelogs/unreleased/27523-make-stuck-build-detection-more-performant.yml4
-rw-r--r--config/gitlab.yml.example6
-rw-r--r--config/initializers/1_settings.rb6
-rw-r--r--spec/workers/stuck_ci_builds_worker_spec.rb57
-rw-r--r--spec/workers/stuck_ci_jobs_worker_spec.rb129
7 files changed, 198 insertions, 82 deletions
diff --git a/app/workers/stuck_ci_builds_worker.rb b/app/workers/stuck_ci_builds_worker.rb
deleted file mode 100644
index b70df5a1afa..00000000000
--- a/app/workers/stuck_ci_builds_worker.rb
+++ /dev/null
@@ -1,19 +0,0 @@
-class StuckCiBuildsWorker
- include Sidekiq::Worker
- include CronjobQueue
-
- BUILD_STUCK_TIMEOUT = 1.day
-
- def perform
- Rails.logger.info 'Cleaning stuck builds'
-
- builds = Ci::Build.joins(:project).running_or_pending.where('ci_builds.updated_at < ?', BUILD_STUCK_TIMEOUT.ago)
- builds.find_each(batch_size: 50).each do |build|
- Rails.logger.debug "Dropping stuck #{build.status} build #{build.id} for runner #{build.runner_id}"
- build.drop
- end
-
- # Update builds that failed to drop
- builds.update_all(status: 'failed')
- end
-end
diff --git a/app/workers/stuck_ci_jobs_worker.rb b/app/workers/stuck_ci_jobs_worker.rb
new file mode 100644
index 00000000000..ae8c980c9e4
--- /dev/null
+++ b/app/workers/stuck_ci_jobs_worker.rb
@@ -0,0 +1,59 @@
+class StuckCiJobsWorker
+ include Sidekiq::Worker
+ include CronjobQueue
+
+ EXCLUSIVE_LEASE_KEY = 'stuck_ci_builds_worker_lease'.freeze
+
+ BUILD_RUNNING_OUTDATED_TIMEOUT = 1.hour
+ BUILD_PENDING_OUTDATED_TIMEOUT = 1.day
+ BUILD_PENDING_STUCK_TIMEOUT = 1.hour
+
+ def perform
+ return unless try_obtain_lease
+
+ Rails.logger.info "#{self.class}: Cleaning stuck builds"
+
+ drop :running, BUILD_RUNNING_OUTDATED_TIMEOUT
+ drop :pending, BUILD_PENDING_OUTDATED_TIMEOUT
+ drop_stuck :pending, BUILD_PENDING_STUCK_TIMEOUT
+
+ remove_lease
+ end
+
+ private
+
+ def try_obtain_lease
+ @uuid = Gitlab::ExclusiveLease.new(EXCLUSIVE_LEASE_KEY, timeout: 30.minutes).try_obtain
+ end
+
+ def remove_lease
+ Gitlab::ExclusiveLease.cancel(EXCLUSIVE_LEASE_KEY, @uuid)
+ end
+
+ def drop(status, timeout)
+ search(status, timeout) do |build|
+ drop_build :outdated, build, status, timeout
+ end
+ end
+
+ def drop_stuck(status, timeout)
+ search(status, timeout) do |build|
+ return unless build.stuck?
+ drop_build :stuck, build, status, timeout
+ end
+ end
+
+ def search(status, timeout)
+ builds = Ci::Build.where(status: status).where('ci_builds.updated_at < ?', timeout.ago)
+ builds.joins(:project).includes(:tags, :runner, project: :namespace).find_each(batch_size: 50).each do |build|
+ yield(build)
+ end
+ end
+
+ def drop_build(type, build, status, timeout)
+ Rails.logger.info "#{self.class}: Dropping #{type} build #{build.id} for runner #{build.runner_id} (status: #{status}, timeout: #{timeout})"
+ Gitlab::OptimisticLocking.retry_lock(build, 3) do |b|
+ b.drop
+ end
+ end
+end
diff --git a/changelogs/unreleased/27523-make-stuck-build-detection-more-performant.yml b/changelogs/unreleased/27523-make-stuck-build-detection-more-performant.yml
new file mode 100644
index 00000000000..a4ef2b23aaa
--- /dev/null
+++ b/changelogs/unreleased/27523-make-stuck-build-detection-more-performant.yml
@@ -0,0 +1,4 @@
+---
+title: Make stuck builds detection more performant
+merge_request: 9025
+author:
diff --git a/config/gitlab.yml.example b/config/gitlab.yml.example
index a82ff605a70..8f99a4d541f 100644
--- a/config/gitlab.yml.example
+++ b/config/gitlab.yml.example
@@ -177,9 +177,9 @@ production: &base
# Periodically executed jobs, to self-heal Gitlab, do external synchronizations, etc.
# Please read here for more information: https://github.com/ondrejbartas/sidekiq-cron#adding-cron-job
cron_jobs:
- # Flag stuck CI builds as failed
- stuck_ci_builds_worker:
- cron: "0 0 * * *"
+ # Flag stuck CI jobs as failed
+ stuck_ci_jobs_worker:
+ cron: "0 * * * *"
# Remove expired build artifacts
expire_build_artifacts_worker:
cron: "50 * * * *"
diff --git a/config/initializers/1_settings.rb b/config/initializers/1_settings.rb
index c64ae15fa92..5aeaa7eab81 100644
--- a/config/initializers/1_settings.rb
+++ b/config/initializers/1_settings.rb
@@ -308,9 +308,9 @@ Settings.gravatar['host'] = Settings.host_without_www(Settings.gravatar[
# Cron Jobs
#
Settings['cron_jobs'] ||= Settingslogic.new({})
-Settings.cron_jobs['stuck_ci_builds_worker'] ||= Settingslogic.new({})
-Settings.cron_jobs['stuck_ci_builds_worker']['cron'] ||= '0 0 * * *'
-Settings.cron_jobs['stuck_ci_builds_worker']['job_class'] = 'StuckCiBuildsWorker'
+Settings.cron_jobs['stuck_ci_jobs_worker'] ||= Settingslogic.new({})
+Settings.cron_jobs['stuck_ci_jobs_worker']['cron'] ||= '0 * * * *'
+Settings.cron_jobs['stuck_ci_jobs_worker']['job_class'] = 'StuckCiJobsWorker'
Settings.cron_jobs['expire_build_artifacts_worker'] ||= Settingslogic.new({})
Settings.cron_jobs['expire_build_artifacts_worker']['cron'] ||= '50 * * * *'
Settings.cron_jobs['expire_build_artifacts_worker']['job_class'] = 'ExpireBuildArtifactsWorker'
diff --git a/spec/workers/stuck_ci_builds_worker_spec.rb b/spec/workers/stuck_ci_builds_worker_spec.rb
deleted file mode 100644
index 801fa31b45d..00000000000
--- a/spec/workers/stuck_ci_builds_worker_spec.rb
+++ /dev/null
@@ -1,57 +0,0 @@
-require "spec_helper"
-
-describe StuckCiBuildsWorker do
- let!(:build) { create :ci_build }
- let(:worker) { described_class.new }
-
- subject do
- build.reload
- build.status
- end
-
- %w(pending running).each do |status|
- context "#{status} build" do
- before do
- build.update!(status: status)
- end
-
- it 'gets dropped if it was updated over 2 days ago' do
- build.update!(updated_at: 2.days.ago)
- worker.perform
- is_expected.to eq('failed')
- end
-
- it "is still #{status}" do
- build.update!(updated_at: 1.minute.ago)
- worker.perform
- is_expected.to eq(status)
- end
- end
- end
-
- %w(success failed canceled).each do |status|
- context "#{status} build" do
- before do
- build.update!(status: status)
- end
-
- it "is still #{status}" do
- build.update!(updated_at: 2.days.ago)
- worker.perform
- is_expected.to eq(status)
- end
- end
- end
-
- context "for deleted project" do
- before do
- build.update!(status: :running, updated_at: 2.days.ago)
- build.project.update(pending_delete: true)
- end
-
- it "does not drop build" do
- expect_any_instance_of(Ci::Build).not_to receive(:drop)
- worker.perform
- end
- end
-end
diff --git a/spec/workers/stuck_ci_jobs_worker_spec.rb b/spec/workers/stuck_ci_jobs_worker_spec.rb
new file mode 100644
index 00000000000..8434b0c8e5b
--- /dev/null
+++ b/spec/workers/stuck_ci_jobs_worker_spec.rb
@@ -0,0 +1,129 @@
+require 'spec_helper'
+
+describe StuckCiJobsWorker do
+ let!(:runner) { create :ci_runner }
+ let!(:job) { create :ci_build, runner: runner }
+ let(:worker) { described_class.new }
+ let(:exclusive_lease_uuid) { SecureRandom.uuid }
+
+ subject do
+ job.reload
+ job.status
+ end
+
+ before do
+ job.update!(status: status, updated_at: updated_at)
+ allow_any_instance_of(Gitlab::ExclusiveLease).to receive(:try_obtain).and_return(exclusive_lease_uuid)
+ end
+
+ shared_examples 'job is dropped' do
+ it 'changes status' do
+ worker.perform
+ is_expected.to eq('failed')
+ end
+ end
+
+ shared_examples 'job is unchanged' do
+ it "doesn't change status" do
+ worker.perform
+ is_expected.to eq(status)
+ end
+ end
+
+ context 'when job is pending' do
+ let(:status) { 'pending' }
+
+ context 'when job is not stuck' do
+ before { allow_any_instance_of(Ci::Build).to receive(:stuck?).and_return(false) }
+
+ context 'when job was not updated for more than 1 day ago' do
+ let(:updated_at) { 2.days.ago }
+ it_behaves_like 'job is dropped'
+ end
+
+ context 'when job was updated in less than 1 day ago' do
+ let(:updated_at) { 6.hours.ago }
+ it_behaves_like 'job is unchanged'
+ end
+
+ context 'when job was not updated for more than 1 hour ago' do
+ let(:updated_at) { 2.hours.ago }
+ it_behaves_like 'job is unchanged'
+ end
+ end
+
+ context 'when job is stuck' do
+ before { allow_any_instance_of(Ci::Build).to receive(:stuck?).and_return(true) }
+
+ context 'when job was not updated for more than 1 hour ago' do
+ let(:updated_at) { 2.hours.ago }
+ it_behaves_like 'job is dropped'
+ end
+
+ context 'when job was updated in less than 1 hour ago' do
+ let(:updated_at) { 30.minutes.ago }
+ it_behaves_like 'job is unchanged'
+ end
+ end
+ end
+
+ context 'when job is running' do
+ let(:status) { 'running' }
+
+ context 'when job was not updated for more than 1 hour ago' do
+ let(:updated_at) { 2.hours.ago }
+ it_behaves_like 'job is dropped'
+ end
+
+ context 'when job was updated in less than 1 hour ago' do
+ let(:updated_at) { 30.minutes.ago }
+ it_behaves_like 'job is unchanged'
+ end
+ end
+
+ %w(success skipped failed canceled).each do |status|
+ context "when job is #{status}" do
+ let(:status) { status }
+ let(:updated_at) { 2.days.ago }
+ it_behaves_like 'job is unchanged'
+ end
+ end
+
+ context 'for deleted project' do
+ let(:status) { 'running' }
+ let(:updated_at) { 2.days.ago }
+
+ before { job.project.update(pending_delete: true) }
+
+ it 'does not drop job' do
+ expect_any_instance_of(Ci::Build).not_to receive(:drop)
+ worker.perform
+ end
+ end
+
+ describe 'exclusive lease' do
+ let(:status) { 'running' }
+ let(:updated_at) { 2.days.ago }
+ let(:worker2) { described_class.new }
+
+ it 'is guard by exclusive lease when executed concurrently' do
+ expect(worker).to receive(:drop).at_least(:once)
+ expect(worker2).not_to receive(:drop)
+ worker.perform
+ allow_any_instance_of(Gitlab::ExclusiveLease).to receive(:try_obtain).and_return(false)
+ worker2.perform
+ end
+
+ it 'can be executed in sequence' do
+ expect(worker).to receive(:drop).at_least(:once)
+ expect(worker2).to receive(:drop).at_least(:once)
+ worker.perform
+ worker2.perform
+ end
+
+ it 'cancels exclusive lease after worker perform' do
+ expect(Gitlab::ExclusiveLease).to receive(:cancel).with(described_class::EXCLUSIVE_LEASE_KEY, exclusive_lease_uuid)
+ worker.perform
+ end
+ end
+end