diff options
author | Tomasz Maczukin <tomasz@maczukin.pl> | 2019-08-06 14:00:38 +0200 |
---|---|---|
committer | Tomasz Maczukin <tomasz@maczukin.pl> | 2019-08-27 21:39:55 +0200 |
commit | b49ebae145828dfbc9d238ce94fbee8a87d9eeb9 (patch) | |
tree | 38b77cd467ee81075fd2ae647f7def2d573c3ad0 | |
parent | 4eecb0ad965e29a84ce6209d559d40c13b0de505 (diff) | |
download | gitlab-ce-ce-split-conflict-and-validation-errors-from-job-scheduling-error-analyse.tar.gz |
Improve responses for Runner's new job requestsce-split-conflict-and-validation-errors-from-job-scheduling-error-analyse
When job is being scheduled, GitLab tries to detect concurrent requests
that would get the same job. In that case only the first Runner that
asked for a job gets it, and all other requests are internally handled
again (until there are other jobs in the queue that matches suchi
specific Runner).
When such "conflict" situation was detected and no job was found in the
loop repeat, Runner receives a `409 Conflict` response, which leaves a
track about such situation. It additionally triggers a request back-off
on Runner, so in highly concurrent installations it should generate some
intervals between concurrent requests from different Runners.
However, one of the symptoms used for finding the "conflict" situation
is the `StateMachines::InvalidTransition` exception, which may be also
thrown when some data doesn't pass validation. This is a case that
should definitely not produce a `409 Conflict` response, but instead a
`400 Bad Request` one with details about what was wrong should be sent.
This commit adds a needed update to the `Ci::RegisterJobService` service
and the `/api/v4/jobs/request` endpoint. It efectively splits the
'conflict' and 'validation error' cases for job scheduling.
More details about the problem and why we'fe decided to handle it like
that can be found in
https://gitlab.com/gitlab-org/gitlab-runner/issues/4360.
-rw-r--r-- | app/services/ci/register_job_service.rb | 39 | ||||
-rw-r--r-- | changelogs/unreleased/split-conflict-and-validation-errors-from-job-scheduling-error-analyse.yml | 5 | ||||
-rw-r--r-- | lib/api/runner.rb | 7 | ||||
-rw-r--r-- | spec/requests/api/runner_spec.rb | 11 | ||||
-rw-r--r-- | spec/services/ci/register_job_service_spec.rb | 80 |
5 files changed, 111 insertions, 31 deletions
diff --git a/app/services/ci/register_job_service.rb b/app/services/ci/register_job_service.rb index 9d4cf5df713..7c58f2a6d45 100644 --- a/app/services/ci/register_job_service.rb +++ b/app/services/ci/register_job_service.rb @@ -9,7 +9,19 @@ module Ci JOB_QUEUE_DURATION_SECONDS_BUCKETS = [1, 3, 10, 30, 60, 300, 900].freeze JOBS_RUNNING_FOR_PROJECT_MAX_BUCKET = 5.freeze - Result = Struct.new(:build, :valid?) + Result = Struct.new(:build, :state) do + def stale? + state == :stale + end + + def invalid? + state == :invalid + end + + def valid? + !stale? && !invalid? + end + end def initialize(runner) @runner = runner @@ -26,7 +38,7 @@ module Ci builds_for_project_runner end - valid = true + job_state = nil # pick builds that does not have other tags than runner's one builds = builds.matches_tag_ids(runner.tags.ids) @@ -50,9 +62,9 @@ module Ci if assign_runner!(build, params) register_success(build) - return Result.new(build, true) + return Result.new(build, :valid) end - rescue StateMachines::InvalidTransition, ActiveRecord::StaleObjectError + rescue ActiveRecord::StaleObjectError # We are looping to find another build that is not conflicting # It also indicates that this build can be picked and passed to runner. # If we don't do it, basically a bunch of runners would be competing for a build @@ -62,12 +74,27 @@ module Ci # In case we hit the concurrency-access lock, # we still have to return 409 in the end, # to make sure that this is properly handled by runner. - valid = false + + job_state = :stale + rescue StateMachines::InvalidTransition + # StateMachines::InvalidTransition may also be a symptom of a conflicting + # situation. But it may be also returned, when a validation error happens. + # In that case, instead of looping, we should return a 400 Bad Request response + # to the Runner. + # https://gitlab.com/gitlab-org/gitlab-runner/issues/4360 for a context + + if build.valid? + job_state = :stale + else + register_failure + + return Result.new(build, :invalid) + end end end register_failure - Result.new(nil, valid) + Result.new(nil, job_state) end # rubocop: enable CodeReuse/ActiveRecord diff --git a/changelogs/unreleased/split-conflict-and-validation-errors-from-job-scheduling-error-analyse.yml b/changelogs/unreleased/split-conflict-and-validation-errors-from-job-scheduling-error-analyse.yml new file mode 100644 index 00000000000..e1b6d223d15 --- /dev/null +++ b/changelogs/unreleased/split-conflict-and-validation-errors-from-job-scheduling-error-analyse.yml @@ -0,0 +1,5 @@ +--- +title: Distinguish between validation errors and stale resources errors when Runner requests a new job +merge_request: 31678 +author: +type: fixed diff --git a/lib/api/runner.rb b/lib/api/runner.rb index fdf4904e9f5..1580b2059ec 100644 --- a/lib/api/runner.rb +++ b/lib/api/runner.rb @@ -128,10 +128,13 @@ module API header 'X-GitLab-Last-Update', new_update no_content! end - else + elsif result.stale? # We received build that is invalid due to concurrency conflict - Gitlab::Metrics.add_event(:build_invalid) + Gitlab::Metrics.add_event(:build_conflicted) conflict! + else + Gitlab::Metrics.add_event(:build_invalid) + render_validation_error!(result.build) end end diff --git a/spec/requests/api/runner_spec.rb b/spec/requests/api/runner_spec.rb index d9ef5edb848..c48322ff2dd 100644 --- a/spec/requests/api/runner_spec.rb +++ b/spec/requests/api/runner_spec.rb @@ -681,6 +681,17 @@ describe API::Runner, :clean_gitlab_redis_shared_state do end end + context 'when invalid data is provided' do + it 'returns a conflict' do + request_job(runner.token, { session: { url: 'http://###:1234/' } }) + + expect(response).to have_gitlab_http_status(400) + + expect(json_response).to have_key('message') + expect(json_response['message']).to have_key('runner_session.url') + end + end + context 'when project and pipeline have multiple jobs' do let!(:job) { create(:ci_build, :tag, pipeline: pipeline, name: 'spinach', stage: 'test', stage_idx: 0) } let!(:job2) { create(:ci_build, :tag, pipeline: pipeline, name: 'rubocop', stage: 'test', stage_idx: 0) } diff --git a/spec/services/ci/register_job_service_spec.rb b/spec/services/ci/register_job_service_spec.rb index 874945c8585..d6f4857aef5 100644 --- a/spec/services/ci/register_job_service_spec.rb +++ b/spec/services/ci/register_job_service_spec.rb @@ -244,49 +244,83 @@ module Ci end end - context 'when first build is stalled' do + context 'when a conflicted schedule is made' do before do allow_any_instance_of(Ci::RegisterJobService).to receive(:assign_runner!).and_call_original - allow_any_instance_of(Ci::RegisterJobService).to receive(:assign_runner!) - .with(pending_job, anything).and_raise(ActiveRecord::StaleObjectError) end subject { described_class.new(specific_runner).execute } - context 'with multiple builds are in queue' do - let!(:other_build) { create :ci_build, pipeline: pipeline } + shared_examples 'reports conflicts' do + context 'with multiple builds are in queue' do + let!(:other_build) { create :ci_build, pipeline: pipeline } - before do - allow_any_instance_of(Ci::RegisterJobService).to receive(:builds_for_project_runner) - .and_return(Ci::Build.where(id: [pending_job, other_build])) + before do + allow_any_instance_of(Ci::RegisterJobService).to receive(:builds_for_project_runner) + .and_return(Ci::Build.where(id: [pending_job, other_build])) + end + + it "receives second build from the queue" do + expect(subject).not_to be_stale + expect(subject).to be_valid + expect(subject.build).to eq(other_build) + end end - it "receives second build from the queue" do - expect(subject).to be_valid - expect(subject.build).to eq(other_build) + context 'when single build is in queue' do + before do + allow_any_instance_of(Ci::RegisterJobService).to receive(:builds_for_project_runner) + .and_return(Ci::Build.where(id: pending_job)) + end + + it "does receive a conflict result" do + expect(subject).to be_stale + expect(subject).not_to be_valid + end + end + + context 'when there is no build in queue' do + before do + allow_any_instance_of(Ci::RegisterJobService).to receive(:builds_for_project_runner) + .and_return(Ci::Build.none) + end + + it "does not receive builds but result is not a conflict" do + expect(subject).not_to be_stale + expect(subject).to be_valid + expect(subject.build).to be_nil + end end end - context 'when single build is in queue' do + context 'when first build is stalled' do before do - allow_any_instance_of(Ci::RegisterJobService).to receive(:builds_for_project_runner) - .and_return(Ci::Build.where(id: pending_job)) + allow_any_instance_of(Ci::RegisterJobService).to receive(:assign_runner!) + .with(pending_job, anything).and_raise(ActiveRecord::StaleObjectError) end - it "does not receive any valid result" do - expect(subject).not_to be_valid - end + it_behaves_like 'reports conflicts' end - context 'when there is no build in queue' do + context 'when state machine transition is invalid' do before do - allow_any_instance_of(Ci::RegisterJobService).to receive(:builds_for_project_runner) - .and_return(Ci::Build.none) + allow_any_instance_of(Ci::RegisterJobService).to receive(:assign_runner!) + .with(pending_job, anything) + .and_raise(StateMachines::InvalidTransition.new(pending_job, Ci::Build.state_machines[:status], :run)) end - it "does not receive builds but result is valid" do - expect(subject).to be_valid - expect(subject.build).to be_nil + it_behaves_like 'reports conflicts' + + context "when it's caused by validation error" do + before do + allow_any_instance_of(Ci::Build).to receive(:valid?).and_return(false) + end + + it 'returns an invalid result' do + expect(subject).not_to be_stale + expect(subject).not_to be_valid + expect(subject.build).not_to be_nil + end end end end |