diff options
author | Shinya Maeda <shinya@gitlab.com> | 2019-07-18 16:22:46 +0700 |
---|---|---|
committer | Shinya Maeda <shinya@gitlab.com> | 2019-07-23 17:26:08 +0700 |
commit | c2e0e689f355555db231ac6db40ab1b654c90233 (patch) | |
tree | 1ab8b2e6561598a61a10a0197a975019ed13f464 | |
parent | 1a3fda63a5f9756cde19bc7e221651b0c33cb5dc (diff) | |
download | gitlab-ce-c2e0e689f355555db231ac6db40ab1b654c90233.tar.gz |
Validate the existence of archived traces before removing live tracesafe-archiving-for-traces
Often live traces are removed even though the archived trace
doesn't exist. This commit checkes the existence strictly.
-rw-r--r-- | app/models/ci/build.rb | 8 | ||||
-rw-r--r-- | app/models/ci/job_artifact.rb | 4 | ||||
-rw-r--r-- | app/services/ci/archive_trace_service.rb | 28 | ||||
-rw-r--r-- | app/workers/archive_trace_worker.rb | 2 | ||||
-rw-r--r-- | app/workers/ci/archive_traces_cron_worker.rb | 2 | ||||
-rw-r--r-- | changelogs/unreleased/safe-archiving-for-traces.yml | 5 | ||||
-rw-r--r-- | lib/gitlab/ci/trace.rb | 12 | ||||
-rw-r--r-- | lib/gitlab/ci/trace/chunked_io.rb | 7 | ||||
-rw-r--r-- | spec/lib/gitlab/ci/trace/chunked_io_spec.rb | 10 | ||||
-rw-r--r-- | spec/models/ci/build_spec.rb | 28 | ||||
-rw-r--r-- | spec/models/ci/job_artifact_spec.rb | 25 | ||||
-rw-r--r-- | spec/services/ci/archive_trace_service_spec.rb | 38 | ||||
-rw-r--r-- | spec/support/shared_examples/ci_trace_shared_examples.rb | 52 | ||||
-rw-r--r-- | spec/workers/archive_trace_worker_spec.rb | 2 | ||||
-rw-r--r-- | spec/workers/ci/archive_traces_cron_worker_spec.rb | 7 | ||||
-rw-r--r-- | spec/workers/stuck_ci_jobs_worker_spec.rb | 4 |
16 files changed, 214 insertions, 20 deletions
diff --git a/app/models/ci/build.rb b/app/models/ci/build.rb index 635fcc86166..da70cb9a9a7 100644 --- a/app/models/ci/build.rb +++ b/app/models/ci/build.rb @@ -531,6 +531,14 @@ module Ci trace.exist? end + def has_live_trace? + trace.live_trace_exist? + end + + def has_archived_trace? + trace.archived_trace_exist? + end + def artifacts_file job_artifacts_archive&.file end diff --git a/app/models/ci/job_artifact.rb b/app/models/ci/job_artifact.rb index f80e98e5bca..e132cb045e2 100644 --- a/app/models/ci/job_artifact.rb +++ b/app/models/ci/job_artifact.rb @@ -176,6 +176,10 @@ module Ci end end + def self.archived_trace_exists_for?(job_id) + where(job_id: job_id).trace.take&.file&.file&.exists? + end + private def file_format_adapter_class diff --git a/app/services/ci/archive_trace_service.rb b/app/services/ci/archive_trace_service.rb index b5cfa1d019c..700d78361a4 100644 --- a/app/services/ci/archive_trace_service.rb +++ b/app/services/ci/archive_trace_service.rb @@ -2,8 +2,25 @@ module Ci class ArchiveTraceService - def execute(job) + def execute(job, worker_name:) + # TODO: Remove this logging once we confirmed new live trace architecture is functional. + # See https://gitlab.com/gitlab-com/gl-infra/infrastructure/issues/4667. + unless job.has_live_trace? + Sidekiq.logger.warn(class: worker_name, + message: 'The job does not have live trace but going to be archived.', + job_id: job.id) + return + end + job.trace.archive! + + # TODO: Remove this logging once we confirmed new live trace architecture is functional. + # See https://gitlab.com/gitlab-com/gl-infra/infrastructure/issues/4667. + unless job.has_archived_trace? + Sidekiq.logger.warn(class: worker_name, + message: 'The job does not have archived trace after archiving.', + job_id: job.id) + end rescue ::Gitlab::Ci::Trace::AlreadyArchivedError # It's already archived, thus we can safely ignore this exception. rescue => e @@ -11,7 +28,7 @@ module Ci # If `archive!` keeps failing for over a week, that could incur data loss. # (See more https://docs.gitlab.com/ee/administration/job_traces.html#new-live-trace-architecture) # In order to avoid interrupting the system, we do not raise an exception here. - archive_error(e, job) + archive_error(e, job, worker_name) end private @@ -22,9 +39,12 @@ module Ci "Counter of failed attempts of trace archiving") end - def archive_error(error, job) + def archive_error(error, job, worker_name) failed_archive_counter.increment - Rails.logger.error "Failed to archive trace. id: #{job.id} message: #{error.message}" # rubocop:disable Gitlab/RailsLogger + + Sidekiq.logger.warn(class: worker_name, + message: "Failed to archive trace. message: #{error.message}.", + job_id: job.id) Gitlab::Sentry .track_exception(error, diff --git a/app/workers/archive_trace_worker.rb b/app/workers/archive_trace_worker.rb index 4a9becf0ca7..66f9b8d9e80 100644 --- a/app/workers/archive_trace_worker.rb +++ b/app/workers/archive_trace_worker.rb @@ -7,7 +7,7 @@ class ArchiveTraceWorker # rubocop: disable CodeReuse/ActiveRecord def perform(job_id) Ci::Build.without_archived_trace.find_by(id: job_id).try do |job| - Ci::ArchiveTraceService.new.execute(job) + Ci::ArchiveTraceService.new.execute(job, worker_name: self.class.name) end end # rubocop: enable CodeReuse/ActiveRecord diff --git a/app/workers/ci/archive_traces_cron_worker.rb b/app/workers/ci/archive_traces_cron_worker.rb index f65ff239866..75e68d0233a 100644 --- a/app/workers/ci/archive_traces_cron_worker.rb +++ b/app/workers/ci/archive_traces_cron_worker.rb @@ -11,7 +11,7 @@ module Ci # This could happen when ArchiveTraceWorker sidekiq jobs were lost by receiving SIGKILL # More details in https://gitlab.com/gitlab-org/gitlab-ce/issues/36791 Ci::Build.finished.with_live_trace.find_each(batch_size: 100) do |build| - Ci::ArchiveTraceService.new.execute(build) + Ci::ArchiveTraceService.new.execute(build, worker_name: self.class.name) end end # rubocop: enable CodeReuse/ActiveRecord diff --git a/changelogs/unreleased/safe-archiving-for-traces.yml b/changelogs/unreleased/safe-archiving-for-traces.yml new file mode 100644 index 00000000000..2b9070bacfe --- /dev/null +++ b/changelogs/unreleased/safe-archiving-for-traces.yml @@ -0,0 +1,5 @@ +--- +title: Extra logging for new live trace architecture +merge_request: 30892 +author: +type: fixed diff --git a/lib/gitlab/ci/trace.rb b/lib/gitlab/ci/trace.rb index ce5857965bf..cb617080c76 100644 --- a/lib/gitlab/ci/trace.rb +++ b/lib/gitlab/ci/trace.rb @@ -63,7 +63,15 @@ module Gitlab end def exist? - trace_artifact&.exists? || job.trace_chunks.any? || current_path.present? || old_trace.present? + archived_trace_exist? || live_trace_exist? + end + + def archived_trace_exist? + trace_artifact&.exists? + end + + def live_trace_exist? + job.trace_chunks.any? || current_path.present? || old_trace.present? end def read @@ -167,7 +175,7 @@ module Gitlab def clone_file!(src_stream, temp_dir) FileUtils.mkdir_p(temp_dir) - Dir.mktmpdir('tmp-trace', temp_dir) do |dir_path| + Dir.mktmpdir("tmp-trace-#{job.id}", temp_dir) do |dir_path| temp_path = File.join(dir_path, "job.log") FileUtils.touch(temp_path) size = IO.copy_stream(src_stream, temp_path) diff --git a/lib/gitlab/ci/trace/chunked_io.rb b/lib/gitlab/ci/trace/chunked_io.rb index 8c6fd56493f..e99889f4a25 100644 --- a/lib/gitlab/ci/trace/chunked_io.rb +++ b/lib/gitlab/ci/trace/chunked_io.rb @@ -166,6 +166,13 @@ module Gitlab end def destroy! + # TODO: Remove this logging once we confirmed new live trace architecture is functional. + # See https://gitlab.com/gitlab-com/gl-infra/infrastructure/issues/4667. + unless build.has_archived_trace? + Sidekiq.logger.warn(message: 'The job does not have archived trace but going to be destroyed.', + job_id: build.id) + end + trace_chunks.fast_destroy_all @tell = @size = 0 ensure diff --git a/spec/lib/gitlab/ci/trace/chunked_io_spec.rb b/spec/lib/gitlab/ci/trace/chunked_io_spec.rb index 546a9e7d0cc..3e5cd81f929 100644 --- a/spec/lib/gitlab/ci/trace/chunked_io_spec.rb +++ b/spec/lib/gitlab/ci/trace/chunked_io_spec.rb @@ -442,5 +442,15 @@ describe Gitlab::Ci::Trace::ChunkedIO, :clean_gitlab_redis_cache do expect(Ci::BuildTraceChunk.where(build: build).count).to eq(0) end + + context 'when the job does not have archived trace' do + it 'leaves a message in sidekiq log' do + expect(Sidekiq.logger).to receive(:warn).with( + message: 'The job does not have archived trace but going to be destroyed.', + job_id: build.id).and_call_original + + subject + end + end end end diff --git a/spec/models/ci/build_spec.rb b/spec/models/ci/build_spec.rb index 78862de0657..c30cb70e1c1 100644 --- a/spec/models/ci/build_spec.rb +++ b/spec/models/ci/build_spec.rb @@ -692,6 +692,34 @@ describe Ci::Build do end end + describe '#has_live_trace?' do + subject { build.has_live_trace? } + + let(:build) { create(:ci_build, :trace_live) } + + it { is_expected.to be_truthy } + + context 'when build does not have live trace' do + let(:build) { create(:ci_build) } + + it { is_expected.to be_falsy } + end + end + + describe '#has_archived_trace?' do + subject { build.has_archived_trace? } + + let(:build) { create(:ci_build, :trace_artifact) } + + it { is_expected.to be_truthy } + + context 'when build does not have archived trace' do + let(:build) { create(:ci_build) } + + it { is_expected.to be_falsy } + end + end + describe '#has_job_artifacts?' do subject { build.has_job_artifacts? } diff --git a/spec/models/ci/job_artifact_spec.rb b/spec/models/ci/job_artifact_spec.rb index 1ba66565e03..1413da231e0 100644 --- a/spec/models/ci/job_artifact_spec.rb +++ b/spec/models/ci/job_artifact_spec.rb @@ -70,6 +70,31 @@ describe Ci::JobArtifact do end end + describe '.archived_trace_exists_for?' do + subject { described_class.archived_trace_exists_for?(job_id) } + + let!(:artifact) { create(:ci_job_artifact, :trace, job: job) } + let(:job) { create(:ci_build) } + + context 'when the specified job_id exists' do + let(:job_id) { job.id } + + it { is_expected.to be_truthy } + + context 'when the job does have archived trace' do + let!(:artifact) { } + + it { is_expected.to be_falsy } + end + end + + context 'when the specified job_id does not exist' do + let(:job_id) { 10000 } + + it { is_expected.to be_falsy } + end + end + describe 'callbacks' do subject { create(:ci_job_artifact, :archive) } diff --git a/spec/services/ci/archive_trace_service_spec.rb b/spec/services/ci/archive_trace_service_spec.rb index 44a77c29086..454db3d5a48 100644 --- a/spec/services/ci/archive_trace_service_spec.rb +++ b/spec/services/ci/archive_trace_service_spec.rb @@ -3,7 +3,7 @@ require 'spec_helper' describe Ci::ArchiveTraceService, '#execute' do - subject { described_class.new.execute(job) } + subject { described_class.new.execute(job, worker_name: ArchiveTraceWorker.name) } context 'when job is finished' do let(:job) { create(:ci_build, :success, :trace_live) } @@ -25,6 +25,34 @@ describe Ci::ArchiveTraceService, '#execute' do expect { subject }.not_to change { Ci::JobArtifact.trace.count } end end + + context 'when job does not have trace' do + let(:job) { create(:ci_build, :success) } + + it 'leaves a warning message in sidekiq log' do + expect(Sidekiq.logger).to receive(:warn).with( + class: ArchiveTraceWorker.name, + message: 'The job does not have live trace but going to be archived.', + job_id: job.id) + + subject + end + end + + context 'when job failed to archive trace but did not raise an exception' do + before do + allow_any_instance_of(Gitlab::Ci::Trace).to receive(:archive!) {} + end + + it 'leaves a warning message in sidekiq log' do + expect(Sidekiq.logger).to receive(:warn).with( + class: ArchiveTraceWorker.name, + message: 'The job does not have archived trace after archiving.', + job_id: job.id) + + subject + end + end end context 'when job is running' do @@ -37,10 +65,10 @@ describe Ci::ArchiveTraceService, '#execute' do issue_url: 'https://gitlab.com/gitlab-org/gitlab-ce/issues/51502', extra: { job_id: job.id } ).once - expect(Rails.logger) - .to receive(:error) - .with("Failed to archive trace. id: #{job.id} message: Job is not finished yet") - .and_call_original + expect(Sidekiq.logger).to receive(:warn).with( + class: ArchiveTraceWorker.name, + message: "Failed to archive trace. message: Job is not finished yet.", + job_id: job.id).and_call_original expect(Gitlab::Metrics) .to receive(:counter) diff --git a/spec/support/shared_examples/ci_trace_shared_examples.rb b/spec/support/shared_examples/ci_trace_shared_examples.rb index ab0550e2613..68c2b6e10e2 100644 --- a/spec/support/shared_examples/ci_trace_shared_examples.rb +++ b/spec/support/shared_examples/ci_trace_shared_examples.rb @@ -720,6 +720,58 @@ shared_examples_for 'trace with enabled live trace feature' do end end + describe '#archived_trace_exist?' do + subject { trace.archived_trace_exist? } + + context 'when trace does not exist' do + it { is_expected.to be_falsy } + end + + context 'when archived trace exists' do + before do + create(:ci_job_artifact, :trace, job: build) + end + + it { is_expected.to be_truthy } + end + + context 'when live trace exists' do + before do + Gitlab::Ci::Trace::ChunkedIO.new(build) do |stream| + stream.write('abc') + end + end + + it { is_expected.to be_falsy } + end + end + + describe '#live_trace_exist?' do + subject { trace.live_trace_exist? } + + context 'when trace does not exist' do + it { is_expected.to be_falsy } + end + + context 'when archived trace exists' do + before do + create(:ci_job_artifact, :trace, job: build) + end + + it { is_expected.to be_falsy } + end + + context 'when live trace exists' do + before do + Gitlab::Ci::Trace::ChunkedIO.new(build) do |stream| + stream.write('abc') + end + end + + it { is_expected.to be_truthy } + end + end + describe '#archive!' do subject { trace.archive! } diff --git a/spec/workers/archive_trace_worker_spec.rb b/spec/workers/archive_trace_worker_spec.rb index 368ed3f3db1..44f7be15201 100644 --- a/spec/workers/archive_trace_worker_spec.rb +++ b/spec/workers/archive_trace_worker_spec.rb @@ -11,7 +11,7 @@ describe ArchiveTraceWorker do it 'executes service' do expect_any_instance_of(Ci::ArchiveTraceService) - .to receive(:execute).with(job) + .to receive(:execute).with(job, anything) subject end diff --git a/spec/workers/ci/archive_traces_cron_worker_spec.rb b/spec/workers/ci/archive_traces_cron_worker_spec.rb index eca6cf5235f..28381fdc3be 100644 --- a/spec/workers/ci/archive_traces_cron_worker_spec.rb +++ b/spec/workers/ci/archive_traces_cron_worker_spec.rb @@ -34,7 +34,7 @@ describe Ci::ArchiveTracesCronWorker do it 'executes service' do expect_any_instance_of(Ci::ArchiveTraceService) - .to receive(:execute).with(build) + .to receive(:execute).with(build, anything) subject end @@ -60,7 +60,10 @@ describe Ci::ArchiveTracesCronWorker do end it 'puts a log' do - expect(Rails.logger).to receive(:error).with("Failed to archive trace. id: #{build.id} message: Unexpected error") + expect(Sidekiq.logger).to receive(:warn).with( + class: described_class.name, + message: "Failed to archive trace. message: Unexpected error.", + job_id: build.id) subject end diff --git a/spec/workers/stuck_ci_jobs_worker_spec.rb b/spec/workers/stuck_ci_jobs_worker_spec.rb index 72de62f1188..c3d577e2dae 100644 --- a/spec/workers/stuck_ci_jobs_worker_spec.rb +++ b/spec/workers/stuck_ci_jobs_worker_spec.rb @@ -7,8 +7,6 @@ describe StuckCiJobsWorker do let!(:runner) { create :ci_runner } let!(:job) { create :ci_build, runner: runner } - let(:trace_lease_key) { "trace:write:lock:#{job.id}" } - let(:trace_lease_uuid) { SecureRandom.uuid } let(:worker_lease_key) { StuckCiJobsWorker::EXCLUSIVE_LEASE_KEY } let(:worker_lease_uuid) { SecureRandom.uuid } @@ -16,7 +14,6 @@ describe StuckCiJobsWorker do before do stub_exclusive_lease(worker_lease_key, worker_lease_uuid) - stub_exclusive_lease(trace_lease_key, trace_lease_uuid) job.update!(status: status, updated_at: updated_at) end @@ -195,7 +192,6 @@ describe StuckCiJobsWorker do end it 'cancels exclusive leases after worker perform' do - expect_to_cancel_exclusive_lease(trace_lease_key, trace_lease_uuid) expect_to_cancel_exclusive_lease(worker_lease_key, worker_lease_uuid) worker.perform |