diff options
author | Sean McGivern <sean@mcgivern.me.uk> | 2018-03-05 11:16:26 +0000 |
---|---|---|
committer | Sean McGivern <sean@mcgivern.me.uk> | 2018-03-05 11:16:26 +0000 |
commit | b7cacaaf4fedd3d9b3b19ea3f1fe3eb585112b88 (patch) | |
tree | 4f2e1e6405eb72b46cfbcbb1d6e9d06f57269220 | |
parent | d50caa64f8701bdd0520eee5f9e8ad6755e2601b (diff) | |
parent | 6f945f20b4c3683bc862ebc476bad9331d72784e (diff) | |
download | gitlab-ce-b7cacaaf4fedd3d9b3b19ea3f1fe3eb585112b88.tar.gz |
Merge branch 'ee-4862-verify-file-checksums' into 'master'
Foreground verification of uploads and LFS objects
See merge request gitlab-org/gitlab-ce!17402
-rw-r--r-- | app/models/lfs_object.rb | 4 | ||||
-rw-r--r-- | changelogs/unreleased/ee-4862-verify-file-checksums.yml | 5 | ||||
-rw-r--r-- | doc/administration/raketasks/check.md | 27 | ||||
-rw-r--r-- | lib/gitlab/verify/batch_verifier.rb | 64 | ||||
-rw-r--r-- | lib/gitlab/verify/lfs_objects.rb | 27 | ||||
-rw-r--r-- | lib/gitlab/verify/rake_task.rb | 53 | ||||
-rw-r--r-- | lib/gitlab/verify/uploads.rb | 27 | ||||
-rw-r--r-- | lib/tasks/gitlab/lfs/check.rake | 8 | ||||
-rw-r--r-- | lib/tasks/gitlab/uploads.rake | 44 | ||||
-rw-r--r-- | lib/tasks/gitlab/uploads/check.rake | 8 | ||||
-rw-r--r-- | spec/factories/lfs_objects.rb | 6 | ||||
-rw-r--r-- | spec/lib/gitlab/verify/lfs_objects_spec.rb | 35 | ||||
-rw-r--r-- | spec/lib/gitlab/verify/uploads_spec.rb | 44 | ||||
-rw-r--r-- | spec/support/gitlab_verify.rb | 45 | ||||
-rw-r--r-- | spec/tasks/gitlab/lfs/check_rake_spec.rb | 28 | ||||
-rw-r--r-- | spec/tasks/gitlab/uploads/check_rake_spec.rb (renamed from spec/tasks/gitlab/uploads_rake_spec.rb) | 13 |
16 files changed, 378 insertions, 60 deletions
diff --git a/app/models/lfs_object.rb b/app/models/lfs_object.rb index fc586fa216e..b444812a4cf 100644 --- a/app/models/lfs_object.rb +++ b/app/models/lfs_object.rb @@ -15,4 +15,8 @@ class LfsObject < ActiveRecord::Base .where(lfs_objects_projects: { id: nil }) .destroy_all end + + def self.calculate_oid(path) + Digest::SHA256.file(path).hexdigest + end end diff --git a/changelogs/unreleased/ee-4862-verify-file-checksums.yml b/changelogs/unreleased/ee-4862-verify-file-checksums.yml new file mode 100644 index 00000000000..392c766ab37 --- /dev/null +++ b/changelogs/unreleased/ee-4862-verify-file-checksums.yml @@ -0,0 +1,5 @@ +--- +title: Foreground verification of uploads and LFS objects +merge_request: 17402 +author: +type: added diff --git a/doc/administration/raketasks/check.md b/doc/administration/raketasks/check.md index d1ed152b58c..d73d9422d2c 100644 --- a/doc/administration/raketasks/check.md +++ b/doc/administration/raketasks/check.md @@ -78,34 +78,41 @@ Example output: ## Uploaded Files Integrity -The uploads check Rake task will loop through all uploads in the database -and run two checks to determine the integrity of each file: +Various types of file can be uploaded to a GitLab installation by users. +Checksums are generated and stored in the database upon upload, and integrity +checks using those checksums can be run. These checks also detect missing files. -1. Check if the file exist on the file system. -1. Check if the checksum of the file on the file system matches the checksum in the database. +Currently, integrity checks are supported for the following types of file: + +* LFS objects +* User uploads **Omnibus Installation** ``` +sudo gitlab-rake gitlab:lfs:check sudo gitlab-rake gitlab:uploads:check ``` **Source Installation** ```bash +sudo -u git -H bundle exec rake gitlab:lfs:check RAILS_ENV=production sudo -u git -H bundle exec rake gitlab:uploads:check RAILS_ENV=production ``` -This task also accepts some environment variables which you can use to override +These tasks also accept some environment variables which you can use to override certain values: -Variable | Type | Description --------- | ---- | ----------- -`BATCH` | integer | Specifies the size of the batch. Defaults to 200. -`ID_FROM` | integer | Specifies the ID to start from, inclusive of the value. -`ID_TO` | integer | Specifies the ID value to end at, inclusive of the value. +Variable | Type | Description +--------- | ------- | ----------- +`BATCH` | integer | Specifies the size of the batch. Defaults to 200. +`ID_FROM` | integer | Specifies the ID to start from, inclusive of the value. +`ID_TO` | integer | Specifies the ID value to end at, inclusive of the value. +`VERBOSE` | boolean | Causes failures to be listed individually, rather than being summarized. ```bash +sudo gitlab-rake gitlab:lfs:check BATCH=100 ID_FROM=50 ID_TO=250 sudo gitlab-rake gitlab:uploads:check BATCH=100 ID_FROM=50 ID_TO=250 ``` diff --git a/lib/gitlab/verify/batch_verifier.rb b/lib/gitlab/verify/batch_verifier.rb new file mode 100644 index 00000000000..1ef369a4b67 --- /dev/null +++ b/lib/gitlab/verify/batch_verifier.rb @@ -0,0 +1,64 @@ +module Gitlab + module Verify + class BatchVerifier + attr_reader :batch_size, :start, :finish + + def initialize(batch_size:, start: nil, finish: nil) + @batch_size = batch_size + @start = start + @finish = finish + end + + # Yields a Range of IDs and a Hash of failed verifications (object => error) + def run_batches(&blk) + relation.in_batches(of: batch_size, start: start, finish: finish) do |relation| # rubocop: disable Cop/InBatches + range = relation.first.id..relation.last.id + failures = run_batch(relation) + + yield(range, failures) + end + end + + def name + raise NotImplementedError.new + end + + def describe(_object) + raise NotImplementedError.new + end + + private + + def run_batch(relation) + relation.map { |upload| verify(upload) }.compact.to_h + end + + def verify(object) + expected = expected_checksum(object) + actual = actual_checksum(object) + + raise 'Checksum missing' unless expected.present? + raise 'Checksum mismatch' unless expected == actual + + nil + rescue => err + [object, err] + end + + # This should return an ActiveRecord::Relation suitable for calling #in_batches on + def relation + raise NotImplementedError.new + end + + # The checksum we expect the object to have + def expected_checksum(_object) + raise NotImplementedError.new + end + + # The freshly-recalculated checksum of the object + def actual_checksum(_object) + raise NotImplementedError.new + end + end + end +end diff --git a/lib/gitlab/verify/lfs_objects.rb b/lib/gitlab/verify/lfs_objects.rb new file mode 100644 index 00000000000..fe51edbdeeb --- /dev/null +++ b/lib/gitlab/verify/lfs_objects.rb @@ -0,0 +1,27 @@ +module Gitlab + module Verify + class LfsObjects < BatchVerifier + def name + 'LFS objects' + end + + def describe(object) + "LFS object: #{object.oid}" + end + + private + + def relation + LfsObject.all + end + + def expected_checksum(lfs_object) + lfs_object.oid + end + + def actual_checksum(lfs_object) + LfsObject.calculate_oid(lfs_object.file.path) + end + end + end +end diff --git a/lib/gitlab/verify/rake_task.rb b/lib/gitlab/verify/rake_task.rb new file mode 100644 index 00000000000..dd138e6b92b --- /dev/null +++ b/lib/gitlab/verify/rake_task.rb @@ -0,0 +1,53 @@ +module Gitlab + module Verify + class RakeTask + def self.run!(verify_kls) + verifier = verify_kls.new( + batch_size: ENV.fetch('BATCH', 200).to_i, + start: ENV['ID_FROM'], + finish: ENV['ID_TO'] + ) + + verbose = Gitlab::Utils.to_boolean(ENV['VERBOSE']) + + new(verifier, verbose).run! + end + + attr_reader :verifier, :output + + def initialize(verifier, verbose) + @verifier = verifier + @verbose = verbose + end + + def run! + say "Checking integrity of #{verifier.name}" + + verifier.run_batches { |*args| run_batch(*args) } + + say 'Done!' + end + + def verbose? + !!@verbose + end + + private + + def say(text) + puts(text) # rubocop:disable Rails/Output + end + + def run_batch(range, failures) + status_color = failures.empty? ? :green : :red + say "- #{range}: Failures: #{failures.count}".color(status_color) + + return unless verbose? + + failures.each do |object, error| + say " - #{verifier.describe(object)}: #{error.inspect}".color(:red) + end + end + end + end +end diff --git a/lib/gitlab/verify/uploads.rb b/lib/gitlab/verify/uploads.rb new file mode 100644 index 00000000000..6972e517ea5 --- /dev/null +++ b/lib/gitlab/verify/uploads.rb @@ -0,0 +1,27 @@ +module Gitlab + module Verify + class Uploads < BatchVerifier + def name + 'Uploads' + end + + def describe(object) + "Upload: #{object.id}" + end + + private + + def relation + Upload.all + end + + def expected_checksum(upload) + upload.checksum + end + + def actual_checksum(upload) + Upload.hexdigest(upload.absolute_path) + end + end + end +end diff --git a/lib/tasks/gitlab/lfs/check.rake b/lib/tasks/gitlab/lfs/check.rake new file mode 100644 index 00000000000..869463d4e5d --- /dev/null +++ b/lib/tasks/gitlab/lfs/check.rake @@ -0,0 +1,8 @@ +namespace :gitlab do + namespace :lfs do + desc 'GitLab | LFS | Check integrity of uploaded LFS objects' + task check: :environment do + Gitlab::Verify::RakeTask.run!(Gitlab::Verify::LfsObjects) + end + end +end diff --git a/lib/tasks/gitlab/uploads.rake b/lib/tasks/gitlab/uploads.rake deleted file mode 100644 index df31567ce64..00000000000 --- a/lib/tasks/gitlab/uploads.rake +++ /dev/null @@ -1,44 +0,0 @@ -namespace :gitlab do - namespace :uploads do - desc 'GitLab | Uploads | Check integrity of uploaded files' - task check: :environment do - puts 'Checking integrity of uploaded files' - - uploads_batches do |batch| - batch.each do |upload| - puts "- Checking file (#{upload.id}): #{upload.absolute_path}".color(:green) - - if upload.exist? - check_checksum(upload) - else - puts " * File does not exist on the file system".color(:red) - end - end - end - - puts 'Done!' - end - - def batch_size - ENV.fetch('BATCH', 200).to_i - end - - def calculate_checksum(absolute_path) - Digest::SHA256.file(absolute_path).hexdigest - end - - def check_checksum(upload) - checksum = calculate_checksum(upload.absolute_path) - - if checksum != upload.checksum - puts " * File checksum (#{checksum}) does not match the one in the database (#{upload.checksum})".color(:red) - end - end - - def uploads_batches(&block) - Upload.all.in_batches(of: batch_size, start: ENV['ID_FROM'], finish: ENV['ID_TO']) do |relation| # rubocop: disable Cop/InBatches - yield relation - end - end - end -end diff --git a/lib/tasks/gitlab/uploads/check.rake b/lib/tasks/gitlab/uploads/check.rake new file mode 100644 index 00000000000..2be2ec7f9c9 --- /dev/null +++ b/lib/tasks/gitlab/uploads/check.rake @@ -0,0 +1,8 @@ +namespace :gitlab do + namespace :uploads do + desc 'GitLab | Uploads | Check integrity of uploaded files' + task check: :environment do + Gitlab::Verify::RakeTask.run!(Gitlab::Verify::Uploads) + end + end +end diff --git a/spec/factories/lfs_objects.rb b/spec/factories/lfs_objects.rb index 8eb709022ce..caaed4d5246 100644 --- a/spec/factories/lfs_objects.rb +++ b/spec/factories/lfs_objects.rb @@ -9,4 +9,10 @@ FactoryBot.define do trait :with_file do file { fixture_file_upload(Rails.root + "spec/fixtures/dk.png", "`/png") } end + + # The uniqueness constraint means we can't use the correct OID for all LFS + # objects, so the test needs to decide which (if any) object gets it + trait :correct_oid do + oid 'b804383982bb89b00e828e3f44c038cc991d3d1768009fc39ba8e2c081b9fb75' + end end diff --git a/spec/lib/gitlab/verify/lfs_objects_spec.rb b/spec/lib/gitlab/verify/lfs_objects_spec.rb new file mode 100644 index 00000000000..64f3a9660e0 --- /dev/null +++ b/spec/lib/gitlab/verify/lfs_objects_spec.rb @@ -0,0 +1,35 @@ +require 'spec_helper' + +describe Gitlab::Verify::LfsObjects do + include GitlabVerifyHelpers + + it_behaves_like 'Gitlab::Verify::BatchVerifier subclass' do + let!(:objects) { create_list(:lfs_object, 3, :with_file) } + end + + describe '#run_batches' do + let(:failures) { collect_failures } + let(:failure) { failures[lfs_object] } + + let!(:lfs_object) { create(:lfs_object, :with_file, :correct_oid) } + + it 'passes LFS objects with the correct file' do + expect(failures).to eq({}) + end + + it 'fails LFS objects with a missing file' do + FileUtils.rm_f(lfs_object.file.path) + + expect(failures.keys).to contain_exactly(lfs_object) + expect(failure).to be_a(Errno::ENOENT) + expect(failure.to_s).to include(lfs_object.file.path) + end + + it 'fails LFS objects with a mismatched oid' do + File.truncate(lfs_object.file.path, 0) + + expect(failures.keys).to contain_exactly(lfs_object) + expect(failure.to_s).to include('Checksum mismatch') + end + end +end diff --git a/spec/lib/gitlab/verify/uploads_spec.rb b/spec/lib/gitlab/verify/uploads_spec.rb new file mode 100644 index 00000000000..6146ce61226 --- /dev/null +++ b/spec/lib/gitlab/verify/uploads_spec.rb @@ -0,0 +1,44 @@ +require 'spec_helper' + +describe Gitlab::Verify::Uploads do + include GitlabVerifyHelpers + + it_behaves_like 'Gitlab::Verify::BatchVerifier subclass' do + let(:projects) { create_list(:project, 3, :with_avatar) } + let!(:objects) { projects.flat_map(&:uploads) } + end + + describe '#run_batches' do + let(:project) { create(:project, :with_avatar) } + let(:failures) { collect_failures } + let(:failure) { failures[upload] } + + let!(:upload) { project.uploads.first } + + it 'passes uploads with the correct file' do + expect(failures).to eq({}) + end + + it 'fails uploads with a missing file' do + FileUtils.rm_f(upload.absolute_path) + + expect(failures.keys).to contain_exactly(upload) + expect(failure).to be_a(Errno::ENOENT) + expect(failure.to_s).to include(upload.absolute_path) + end + + it 'fails uploads with a mismatched checksum' do + upload.update!(checksum: 'something incorrect') + + expect(failures.keys).to contain_exactly(upload) + expect(failure.to_s).to include('Checksum mismatch') + end + + it 'fails uploads with a missing precalculated checksum' do + upload.update!(checksum: '') + + expect(failures.keys).to contain_exactly(upload) + expect(failure.to_s).to include('Checksum missing') + end + end +end diff --git a/spec/support/gitlab_verify.rb b/spec/support/gitlab_verify.rb new file mode 100644 index 00000000000..13e2e37624d --- /dev/null +++ b/spec/support/gitlab_verify.rb @@ -0,0 +1,45 @@ +RSpec.shared_examples 'Gitlab::Verify::BatchVerifier subclass' do + describe 'batching' do + let(:first_batch) { objects[0].id..objects[0].id } + let(:second_batch) { objects[1].id..objects[1].id } + let(:third_batch) { objects[2].id..objects[2].id } + + it 'iterates through objects in batches' do + expect(collect_ranges).to eq([first_batch, second_batch, third_batch]) + end + + it 'allows the starting ID to be specified' do + expect(collect_ranges(start: second_batch.first)).to eq([second_batch, third_batch]) + end + + it 'allows the finishing ID to be specified' do + expect(collect_ranges(finish: second_batch.last)).to eq([first_batch, second_batch]) + end + end +end + +module GitlabVerifyHelpers + def collect_ranges(args = {}) + verifier = described_class.new(args.merge(batch_size: 1)) + + collect_results(verifier).map { |range, _| range } + end + + def collect_failures + verifier = described_class.new(batch_size: 1) + + out = {} + + collect_results(verifier).map { |_, failures| out.merge!(failures) } + + out + end + + def collect_results(verifier) + out = [] + + verifier.run_batches { |*args| out << args } + + out + end +end diff --git a/spec/tasks/gitlab/lfs/check_rake_spec.rb b/spec/tasks/gitlab/lfs/check_rake_spec.rb new file mode 100644 index 00000000000..2610edf8bac --- /dev/null +++ b/spec/tasks/gitlab/lfs/check_rake_spec.rb @@ -0,0 +1,28 @@ +require 'rake_helper' + +describe 'gitlab:lfs rake tasks' do + describe 'check' do + let!(:lfs_object) { create(:lfs_object, :with_file, :correct_oid) } + + before do + Rake.application.rake_require('tasks/gitlab/lfs/check') + stub_env('VERBOSE' => 'true') + end + + it 'outputs the integrity check for each batch' do + expect { run_rake_task('gitlab:lfs:check') }.to output(/Failures: 0/).to_stdout + end + + it 'errors out about missing files on the file system' do + FileUtils.rm_f(lfs_object.file.path) + + expect { run_rake_task('gitlab:lfs:check') }.to output(/No such file.*#{Regexp.quote(lfs_object.file.path)}/).to_stdout + end + + it 'errors out about invalid checksum' do + File.truncate(lfs_object.file.path, 0) + + expect { run_rake_task('gitlab:lfs:check') }.to output(/Checksum mismatch/).to_stdout + end + end +end diff --git a/spec/tasks/gitlab/uploads_rake_spec.rb b/spec/tasks/gitlab/uploads/check_rake_spec.rb index ac0005e51e0..5d597c66133 100644 --- a/spec/tasks/gitlab/uploads_rake_spec.rb +++ b/spec/tasks/gitlab/uploads/check_rake_spec.rb @@ -5,23 +5,24 @@ describe 'gitlab:uploads rake tasks' do let!(:upload) { create(:upload, path: Rails.root.join('spec/fixtures/banana_sample.gif')) } before do - Rake.application.rake_require 'tasks/gitlab/uploads' + Rake.application.rake_require('tasks/gitlab/uploads/check') + stub_env('VERBOSE' => 'true') end - it 'outputs the integrity check for each uploaded file' do - expect { run_rake_task('gitlab:uploads:check') }.to output(/Checking file \(#{upload.id}\): #{Regexp.quote(upload.absolute_path)}/).to_stdout + it 'outputs the integrity check for each batch' do + expect { run_rake_task('gitlab:uploads:check') }.to output(/Failures: 0/).to_stdout end it 'errors out about missing files on the file system' do - create(:upload) + missing_upload = create(:upload) - expect { run_rake_task('gitlab:uploads:check') }.to output(/File does not exist on the file system/).to_stdout + expect { run_rake_task('gitlab:uploads:check') }.to output(/No such file.*#{Regexp.quote(missing_upload.absolute_path)}/).to_stdout end it 'errors out about invalid checksum' do upload.update_column(:checksum, '01a3156db2cf4f67ec823680b40b7302f89ab39179124ad219f94919b8a1769e') - expect { run_rake_task('gitlab:uploads:check') }.to output(/File checksum \(9e697aa09fe196909813ee36103e34f721fe47a5fdc8aac0e4e4ac47b9b38282\) does not match the one in the database \(#{upload.checksum}\)/).to_stdout + expect { run_rake_task('gitlab:uploads:check') }.to output(/Checksum mismatch/).to_stdout end end end |