summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTomasz Maczukin <tomasz@maczukin.pl>2019-08-06 14:00:38 +0200
committerTomasz Maczukin <tomasz@maczukin.pl>2019-08-27 21:39:55 +0200
commitb49ebae145828dfbc9d238ce94fbee8a87d9eeb9 (patch)
tree38b77cd467ee81075fd2ae647f7def2d573c3ad0
parent4eecb0ad965e29a84ce6209d559d40c13b0de505 (diff)
downloadgitlab-ce-ce-split-conflict-and-validation-errors-from-job-scheduling-error-analyse.tar.gz
When job is being scheduled, GitLab tries to detect concurrent requests that would get the same job. In that case only the first Runner that asked for a job gets it, and all other requests are internally handled again (until there are other jobs in the queue that matches suchi specific Runner). When such "conflict" situation was detected and no job was found in the loop repeat, Runner receives a `409 Conflict` response, which leaves a track about such situation. It additionally triggers a request back-off on Runner, so in highly concurrent installations it should generate some intervals between concurrent requests from different Runners. However, one of the symptoms used for finding the "conflict" situation is the `StateMachines::InvalidTransition` exception, which may be also thrown when some data doesn't pass validation. This is a case that should definitely not produce a `409 Conflict` response, but instead a `400 Bad Request` one with details about what was wrong should be sent. This commit adds a needed update to the `Ci::RegisterJobService` service and the `/api/v4/jobs/request` endpoint. It efectively splits the 'conflict' and 'validation error' cases for job scheduling. More details about the problem and why we'fe decided to handle it like that can be found in https://gitlab.com/gitlab-org/gitlab-runner/issues/4360.
-rw-r--r--app/services/ci/register_job_service.rb39
-rw-r--r--changelogs/unreleased/split-conflict-and-validation-errors-from-job-scheduling-error-analyse.yml5
-rw-r--r--lib/api/runner.rb7
-rw-r--r--spec/requests/api/runner_spec.rb11
-rw-r--r--spec/services/ci/register_job_service_spec.rb80
5 files changed, 111 insertions, 31 deletions
diff --git a/app/services/ci/register_job_service.rb b/app/services/ci/register_job_service.rb
index 9d4cf5df713..7c58f2a6d45 100644
--- a/app/services/ci/register_job_service.rb
+++ b/app/services/ci/register_job_service.rb
@@ -9,7 +9,19 @@ module Ci
JOB_QUEUE_DURATION_SECONDS_BUCKETS = [1, 3, 10, 30, 60, 300, 900].freeze
JOBS_RUNNING_FOR_PROJECT_MAX_BUCKET = 5.freeze
- Result = Struct.new(:build, :valid?)
+ Result = Struct.new(:build, :state) do
+ def stale?
+ state == :stale
+ end
+
+ def invalid?
+ state == :invalid
+ end
+
+ def valid?
+ !stale? && !invalid?
+ end
+ end
def initialize(runner)
@runner = runner
@@ -26,7 +38,7 @@ module Ci
builds_for_project_runner
end
- valid = true
+ job_state = nil
# pick builds that does not have other tags than runner's one
builds = builds.matches_tag_ids(runner.tags.ids)
@@ -50,9 +62,9 @@ module Ci
if assign_runner!(build, params)
register_success(build)
- return Result.new(build, true)
+ return Result.new(build, :valid)
end
- rescue StateMachines::InvalidTransition, ActiveRecord::StaleObjectError
+ rescue ActiveRecord::StaleObjectError
# We are looping to find another build that is not conflicting
# It also indicates that this build can be picked and passed to runner.
# If we don't do it, basically a bunch of runners would be competing for a build
@@ -62,12 +74,27 @@ module Ci
# In case we hit the concurrency-access lock,
# we still have to return 409 in the end,
# to make sure that this is properly handled by runner.
- valid = false
+
+ job_state = :stale
+ rescue StateMachines::InvalidTransition
+ # StateMachines::InvalidTransition may also be a symptom of a conflicting
+ # situation. But it may be also returned, when a validation error happens.
+ # In that case, instead of looping, we should return a 400 Bad Request response
+ # to the Runner.
+ # https://gitlab.com/gitlab-org/gitlab-runner/issues/4360 for a context
+
+ if build.valid?
+ job_state = :stale
+ else
+ register_failure
+
+ return Result.new(build, :invalid)
+ end
end
end
register_failure
- Result.new(nil, valid)
+ Result.new(nil, job_state)
end
# rubocop: enable CodeReuse/ActiveRecord
diff --git a/changelogs/unreleased/split-conflict-and-validation-errors-from-job-scheduling-error-analyse.yml b/changelogs/unreleased/split-conflict-and-validation-errors-from-job-scheduling-error-analyse.yml
new file mode 100644
index 00000000000..e1b6d223d15
--- /dev/null
+++ b/changelogs/unreleased/split-conflict-and-validation-errors-from-job-scheduling-error-analyse.yml
@@ -0,0 +1,5 @@
+---
+title: Distinguish between validation errors and stale resources errors when Runner requests a new job
+merge_request: 31678
+author:
+type: fixed
diff --git a/lib/api/runner.rb b/lib/api/runner.rb
index fdf4904e9f5..1580b2059ec 100644
--- a/lib/api/runner.rb
+++ b/lib/api/runner.rb
@@ -128,10 +128,13 @@ module API
header 'X-GitLab-Last-Update', new_update
no_content!
end
- else
+ elsif result.stale?
# We received build that is invalid due to concurrency conflict
- Gitlab::Metrics.add_event(:build_invalid)
+ Gitlab::Metrics.add_event(:build_conflicted)
conflict!
+ else
+ Gitlab::Metrics.add_event(:build_invalid)
+ render_validation_error!(result.build)
end
end
diff --git a/spec/requests/api/runner_spec.rb b/spec/requests/api/runner_spec.rb
index d9ef5edb848..c48322ff2dd 100644
--- a/spec/requests/api/runner_spec.rb
+++ b/spec/requests/api/runner_spec.rb
@@ -681,6 +681,17 @@ describe API::Runner, :clean_gitlab_redis_shared_state do
end
end
+ context 'when invalid data is provided' do
+ it 'returns a conflict' do
+ request_job(runner.token, { session: { url: 'http://###:1234/' } })
+
+ expect(response).to have_gitlab_http_status(400)
+
+ expect(json_response).to have_key('message')
+ expect(json_response['message']).to have_key('runner_session.url')
+ end
+ end
+
context 'when project and pipeline have multiple jobs' do
let!(:job) { create(:ci_build, :tag, pipeline: pipeline, name: 'spinach', stage: 'test', stage_idx: 0) }
let!(:job2) { create(:ci_build, :tag, pipeline: pipeline, name: 'rubocop', stage: 'test', stage_idx: 0) }
diff --git a/spec/services/ci/register_job_service_spec.rb b/spec/services/ci/register_job_service_spec.rb
index 874945c8585..d6f4857aef5 100644
--- a/spec/services/ci/register_job_service_spec.rb
+++ b/spec/services/ci/register_job_service_spec.rb
@@ -244,49 +244,83 @@ module Ci
end
end
- context 'when first build is stalled' do
+ context 'when a conflicted schedule is made' do
before do
allow_any_instance_of(Ci::RegisterJobService).to receive(:assign_runner!).and_call_original
- allow_any_instance_of(Ci::RegisterJobService).to receive(:assign_runner!)
- .with(pending_job, anything).and_raise(ActiveRecord::StaleObjectError)
end
subject { described_class.new(specific_runner).execute }
- context 'with multiple builds are in queue' do
- let!(:other_build) { create :ci_build, pipeline: pipeline }
+ shared_examples 'reports conflicts' do
+ context 'with multiple builds are in queue' do
+ let!(:other_build) { create :ci_build, pipeline: pipeline }
- before do
- allow_any_instance_of(Ci::RegisterJobService).to receive(:builds_for_project_runner)
- .and_return(Ci::Build.where(id: [pending_job, other_build]))
+ before do
+ allow_any_instance_of(Ci::RegisterJobService).to receive(:builds_for_project_runner)
+ .and_return(Ci::Build.where(id: [pending_job, other_build]))
+ end
+
+ it "receives second build from the queue" do
+ expect(subject).not_to be_stale
+ expect(subject).to be_valid
+ expect(subject.build).to eq(other_build)
+ end
end
- it "receives second build from the queue" do
- expect(subject).to be_valid
- expect(subject.build).to eq(other_build)
+ context 'when single build is in queue' do
+ before do
+ allow_any_instance_of(Ci::RegisterJobService).to receive(:builds_for_project_runner)
+ .and_return(Ci::Build.where(id: pending_job))
+ end
+
+ it "does receive a conflict result" do
+ expect(subject).to be_stale
+ expect(subject).not_to be_valid
+ end
+ end
+
+ context 'when there is no build in queue' do
+ before do
+ allow_any_instance_of(Ci::RegisterJobService).to receive(:builds_for_project_runner)
+ .and_return(Ci::Build.none)
+ end
+
+ it "does not receive builds but result is not a conflict" do
+ expect(subject).not_to be_stale
+ expect(subject).to be_valid
+ expect(subject.build).to be_nil
+ end
end
end
- context 'when single build is in queue' do
+ context 'when first build is stalled' do
before do
- allow_any_instance_of(Ci::RegisterJobService).to receive(:builds_for_project_runner)
- .and_return(Ci::Build.where(id: pending_job))
+ allow_any_instance_of(Ci::RegisterJobService).to receive(:assign_runner!)
+ .with(pending_job, anything).and_raise(ActiveRecord::StaleObjectError)
end
- it "does not receive any valid result" do
- expect(subject).not_to be_valid
- end
+ it_behaves_like 'reports conflicts'
end
- context 'when there is no build in queue' do
+ context 'when state machine transition is invalid' do
before do
- allow_any_instance_of(Ci::RegisterJobService).to receive(:builds_for_project_runner)
- .and_return(Ci::Build.none)
+ allow_any_instance_of(Ci::RegisterJobService).to receive(:assign_runner!)
+ .with(pending_job, anything)
+ .and_raise(StateMachines::InvalidTransition.new(pending_job, Ci::Build.state_machines[:status], :run))
end
- it "does not receive builds but result is valid" do
- expect(subject).to be_valid
- expect(subject.build).to be_nil
+ it_behaves_like 'reports conflicts'
+
+ context "when it's caused by validation error" do
+ before do
+ allow_any_instance_of(Ci::Build).to receive(:valid?).and_return(false)
+ end
+
+ it 'returns an invalid result' do
+ expect(subject).not_to be_stale
+ expect(subject).not_to be_valid
+ expect(subject.build).not_to be_nil
+ end
end
end
end