summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNick Thomas <nick@gitlab.com>2018-02-27 19:15:25 +0000
committerNick Thomas <nick@gitlab.com>2018-03-02 14:16:17 +0000
commit6f945f20b4c3683bc862ebc476bad9331d72784e (patch)
tree9504637169d3a410e03e9e75f1d72ee6a323187f
parent40c61acb6a9ba84928cebcbce8b57630bd439615 (diff)
downloadgitlab-ce-ee-4862-verify-file-checksums.tar.gz
Foreground verification of uploads and LFS objectsee-4862-verify-file-checksums
-rw-r--r--app/models/lfs_object.rb4
-rw-r--r--changelogs/unreleased/ee-4862-verify-file-checksums.yml5
-rw-r--r--doc/administration/raketasks/check.md27
-rw-r--r--lib/gitlab/verify/batch_verifier.rb64
-rw-r--r--lib/gitlab/verify/lfs_objects.rb27
-rw-r--r--lib/gitlab/verify/rake_task.rb53
-rw-r--r--lib/gitlab/verify/uploads.rb27
-rw-r--r--lib/tasks/gitlab/lfs/check.rake8
-rw-r--r--lib/tasks/gitlab/uploads.rake44
-rw-r--r--lib/tasks/gitlab/uploads/check.rake8
-rw-r--r--spec/factories/lfs_objects.rb6
-rw-r--r--spec/lib/gitlab/verify/lfs_objects_spec.rb35
-rw-r--r--spec/lib/gitlab/verify/uploads_spec.rb44
-rw-r--r--spec/support/gitlab_verify.rb45
-rw-r--r--spec/tasks/gitlab/lfs/check_rake_spec.rb28
-rw-r--r--spec/tasks/gitlab/uploads/check_rake_spec.rb (renamed from spec/tasks/gitlab/uploads_rake_spec.rb)13
16 files changed, 378 insertions, 60 deletions
diff --git a/app/models/lfs_object.rb b/app/models/lfs_object.rb
index fc586fa216e..b444812a4cf 100644
--- a/app/models/lfs_object.rb
+++ b/app/models/lfs_object.rb
@@ -15,4 +15,8 @@ class LfsObject < ActiveRecord::Base
.where(lfs_objects_projects: { id: nil })
.destroy_all
end
+
+ def self.calculate_oid(path)
+ Digest::SHA256.file(path).hexdigest
+ end
end
diff --git a/changelogs/unreleased/ee-4862-verify-file-checksums.yml b/changelogs/unreleased/ee-4862-verify-file-checksums.yml
new file mode 100644
index 00000000000..392c766ab37
--- /dev/null
+++ b/changelogs/unreleased/ee-4862-verify-file-checksums.yml
@@ -0,0 +1,5 @@
+---
+title: Foreground verification of uploads and LFS objects
+merge_request: 17402
+author:
+type: added
diff --git a/doc/administration/raketasks/check.md b/doc/administration/raketasks/check.md
index d1ed152b58c..d73d9422d2c 100644
--- a/doc/administration/raketasks/check.md
+++ b/doc/administration/raketasks/check.md
@@ -78,34 +78,41 @@ Example output:
## Uploaded Files Integrity
-The uploads check Rake task will loop through all uploads in the database
-and run two checks to determine the integrity of each file:
+Various types of file can be uploaded to a GitLab installation by users.
+Checksums are generated and stored in the database upon upload, and integrity
+checks using those checksums can be run. These checks also detect missing files.
-1. Check if the file exist on the file system.
-1. Check if the checksum of the file on the file system matches the checksum in the database.
+Currently, integrity checks are supported for the following types of file:
+
+* LFS objects
+* User uploads
**Omnibus Installation**
```
+sudo gitlab-rake gitlab:lfs:check
sudo gitlab-rake gitlab:uploads:check
```
**Source Installation**
```bash
+sudo -u git -H bundle exec rake gitlab:lfs:check RAILS_ENV=production
sudo -u git -H bundle exec rake gitlab:uploads:check RAILS_ENV=production
```
-This task also accepts some environment variables which you can use to override
+These tasks also accept some environment variables which you can use to override
certain values:
-Variable | Type | Description
--------- | ---- | -----------
-`BATCH` | integer | Specifies the size of the batch. Defaults to 200.
-`ID_FROM` | integer | Specifies the ID to start from, inclusive of the value.
-`ID_TO` | integer | Specifies the ID value to end at, inclusive of the value.
+Variable | Type | Description
+--------- | ------- | -----------
+`BATCH` | integer | Specifies the size of the batch. Defaults to 200.
+`ID_FROM` | integer | Specifies the ID to start from, inclusive of the value.
+`ID_TO` | integer | Specifies the ID value to end at, inclusive of the value.
+`VERBOSE` | boolean | Causes failures to be listed individually, rather than being summarized.
```bash
+sudo gitlab-rake gitlab:lfs:check BATCH=100 ID_FROM=50 ID_TO=250
sudo gitlab-rake gitlab:uploads:check BATCH=100 ID_FROM=50 ID_TO=250
```
diff --git a/lib/gitlab/verify/batch_verifier.rb b/lib/gitlab/verify/batch_verifier.rb
new file mode 100644
index 00000000000..1ef369a4b67
--- /dev/null
+++ b/lib/gitlab/verify/batch_verifier.rb
@@ -0,0 +1,64 @@
+module Gitlab
+ module Verify
+ class BatchVerifier
+ attr_reader :batch_size, :start, :finish
+
+ def initialize(batch_size:, start: nil, finish: nil)
+ @batch_size = batch_size
+ @start = start
+ @finish = finish
+ end
+
+ # Yields a Range of IDs and a Hash of failed verifications (object => error)
+ def run_batches(&blk)
+ relation.in_batches(of: batch_size, start: start, finish: finish) do |relation| # rubocop: disable Cop/InBatches
+ range = relation.first.id..relation.last.id
+ failures = run_batch(relation)
+
+ yield(range, failures)
+ end
+ end
+
+ def name
+ raise NotImplementedError.new
+ end
+
+ def describe(_object)
+ raise NotImplementedError.new
+ end
+
+ private
+
+ def run_batch(relation)
+ relation.map { |upload| verify(upload) }.compact.to_h
+ end
+
+ def verify(object)
+ expected = expected_checksum(object)
+ actual = actual_checksum(object)
+
+ raise 'Checksum missing' unless expected.present?
+ raise 'Checksum mismatch' unless expected == actual
+
+ nil
+ rescue => err
+ [object, err]
+ end
+
+ # This should return an ActiveRecord::Relation suitable for calling #in_batches on
+ def relation
+ raise NotImplementedError.new
+ end
+
+ # The checksum we expect the object to have
+ def expected_checksum(_object)
+ raise NotImplementedError.new
+ end
+
+ # The freshly-recalculated checksum of the object
+ def actual_checksum(_object)
+ raise NotImplementedError.new
+ end
+ end
+ end
+end
diff --git a/lib/gitlab/verify/lfs_objects.rb b/lib/gitlab/verify/lfs_objects.rb
new file mode 100644
index 00000000000..fe51edbdeeb
--- /dev/null
+++ b/lib/gitlab/verify/lfs_objects.rb
@@ -0,0 +1,27 @@
+module Gitlab
+ module Verify
+ class LfsObjects < BatchVerifier
+ def name
+ 'LFS objects'
+ end
+
+ def describe(object)
+ "LFS object: #{object.oid}"
+ end
+
+ private
+
+ def relation
+ LfsObject.all
+ end
+
+ def expected_checksum(lfs_object)
+ lfs_object.oid
+ end
+
+ def actual_checksum(lfs_object)
+ LfsObject.calculate_oid(lfs_object.file.path)
+ end
+ end
+ end
+end
diff --git a/lib/gitlab/verify/rake_task.rb b/lib/gitlab/verify/rake_task.rb
new file mode 100644
index 00000000000..dd138e6b92b
--- /dev/null
+++ b/lib/gitlab/verify/rake_task.rb
@@ -0,0 +1,53 @@
+module Gitlab
+ module Verify
+ class RakeTask
+ def self.run!(verify_kls)
+ verifier = verify_kls.new(
+ batch_size: ENV.fetch('BATCH', 200).to_i,
+ start: ENV['ID_FROM'],
+ finish: ENV['ID_TO']
+ )
+
+ verbose = Gitlab::Utils.to_boolean(ENV['VERBOSE'])
+
+ new(verifier, verbose).run!
+ end
+
+ attr_reader :verifier, :output
+
+ def initialize(verifier, verbose)
+ @verifier = verifier
+ @verbose = verbose
+ end
+
+ def run!
+ say "Checking integrity of #{verifier.name}"
+
+ verifier.run_batches { |*args| run_batch(*args) }
+
+ say 'Done!'
+ end
+
+ def verbose?
+ !!@verbose
+ end
+
+ private
+
+ def say(text)
+ puts(text) # rubocop:disable Rails/Output
+ end
+
+ def run_batch(range, failures)
+ status_color = failures.empty? ? :green : :red
+ say "- #{range}: Failures: #{failures.count}".color(status_color)
+
+ return unless verbose?
+
+ failures.each do |object, error|
+ say " - #{verifier.describe(object)}: #{error.inspect}".color(:red)
+ end
+ end
+ end
+ end
+end
diff --git a/lib/gitlab/verify/uploads.rb b/lib/gitlab/verify/uploads.rb
new file mode 100644
index 00000000000..6972e517ea5
--- /dev/null
+++ b/lib/gitlab/verify/uploads.rb
@@ -0,0 +1,27 @@
+module Gitlab
+ module Verify
+ class Uploads < BatchVerifier
+ def name
+ 'Uploads'
+ end
+
+ def describe(object)
+ "Upload: #{object.id}"
+ end
+
+ private
+
+ def relation
+ Upload.all
+ end
+
+ def expected_checksum(upload)
+ upload.checksum
+ end
+
+ def actual_checksum(upload)
+ Upload.hexdigest(upload.absolute_path)
+ end
+ end
+ end
+end
diff --git a/lib/tasks/gitlab/lfs/check.rake b/lib/tasks/gitlab/lfs/check.rake
new file mode 100644
index 00000000000..869463d4e5d
--- /dev/null
+++ b/lib/tasks/gitlab/lfs/check.rake
@@ -0,0 +1,8 @@
+namespace :gitlab do
+ namespace :lfs do
+ desc 'GitLab | LFS | Check integrity of uploaded LFS objects'
+ task check: :environment do
+ Gitlab::Verify::RakeTask.run!(Gitlab::Verify::LfsObjects)
+ end
+ end
+end
diff --git a/lib/tasks/gitlab/uploads.rake b/lib/tasks/gitlab/uploads.rake
deleted file mode 100644
index df31567ce64..00000000000
--- a/lib/tasks/gitlab/uploads.rake
+++ /dev/null
@@ -1,44 +0,0 @@
-namespace :gitlab do
- namespace :uploads do
- desc 'GitLab | Uploads | Check integrity of uploaded files'
- task check: :environment do
- puts 'Checking integrity of uploaded files'
-
- uploads_batches do |batch|
- batch.each do |upload|
- puts "- Checking file (#{upload.id}): #{upload.absolute_path}".color(:green)
-
- if upload.exist?
- check_checksum(upload)
- else
- puts " * File does not exist on the file system".color(:red)
- end
- end
- end
-
- puts 'Done!'
- end
-
- def batch_size
- ENV.fetch('BATCH', 200).to_i
- end
-
- def calculate_checksum(absolute_path)
- Digest::SHA256.file(absolute_path).hexdigest
- end
-
- def check_checksum(upload)
- checksum = calculate_checksum(upload.absolute_path)
-
- if checksum != upload.checksum
- puts " * File checksum (#{checksum}) does not match the one in the database (#{upload.checksum})".color(:red)
- end
- end
-
- def uploads_batches(&block)
- Upload.all.in_batches(of: batch_size, start: ENV['ID_FROM'], finish: ENV['ID_TO']) do |relation| # rubocop: disable Cop/InBatches
- yield relation
- end
- end
- end
-end
diff --git a/lib/tasks/gitlab/uploads/check.rake b/lib/tasks/gitlab/uploads/check.rake
new file mode 100644
index 00000000000..2be2ec7f9c9
--- /dev/null
+++ b/lib/tasks/gitlab/uploads/check.rake
@@ -0,0 +1,8 @@
+namespace :gitlab do
+ namespace :uploads do
+ desc 'GitLab | Uploads | Check integrity of uploaded files'
+ task check: :environment do
+ Gitlab::Verify::RakeTask.run!(Gitlab::Verify::Uploads)
+ end
+ end
+end
diff --git a/spec/factories/lfs_objects.rb b/spec/factories/lfs_objects.rb
index 8eb709022ce..caaed4d5246 100644
--- a/spec/factories/lfs_objects.rb
+++ b/spec/factories/lfs_objects.rb
@@ -9,4 +9,10 @@ FactoryBot.define do
trait :with_file do
file { fixture_file_upload(Rails.root + "spec/fixtures/dk.png", "`/png") }
end
+
+ # The uniqueness constraint means we can't use the correct OID for all LFS
+ # objects, so the test needs to decide which (if any) object gets it
+ trait :correct_oid do
+ oid 'b804383982bb89b00e828e3f44c038cc991d3d1768009fc39ba8e2c081b9fb75'
+ end
end
diff --git a/spec/lib/gitlab/verify/lfs_objects_spec.rb b/spec/lib/gitlab/verify/lfs_objects_spec.rb
new file mode 100644
index 00000000000..64f3a9660e0
--- /dev/null
+++ b/spec/lib/gitlab/verify/lfs_objects_spec.rb
@@ -0,0 +1,35 @@
+require 'spec_helper'
+
+describe Gitlab::Verify::LfsObjects do
+ include GitlabVerifyHelpers
+
+ it_behaves_like 'Gitlab::Verify::BatchVerifier subclass' do
+ let!(:objects) { create_list(:lfs_object, 3, :with_file) }
+ end
+
+ describe '#run_batches' do
+ let(:failures) { collect_failures }
+ let(:failure) { failures[lfs_object] }
+
+ let!(:lfs_object) { create(:lfs_object, :with_file, :correct_oid) }
+
+ it 'passes LFS objects with the correct file' do
+ expect(failures).to eq({})
+ end
+
+ it 'fails LFS objects with a missing file' do
+ FileUtils.rm_f(lfs_object.file.path)
+
+ expect(failures.keys).to contain_exactly(lfs_object)
+ expect(failure).to be_a(Errno::ENOENT)
+ expect(failure.to_s).to include(lfs_object.file.path)
+ end
+
+ it 'fails LFS objects with a mismatched oid' do
+ File.truncate(lfs_object.file.path, 0)
+
+ expect(failures.keys).to contain_exactly(lfs_object)
+ expect(failure.to_s).to include('Checksum mismatch')
+ end
+ end
+end
diff --git a/spec/lib/gitlab/verify/uploads_spec.rb b/spec/lib/gitlab/verify/uploads_spec.rb
new file mode 100644
index 00000000000..6146ce61226
--- /dev/null
+++ b/spec/lib/gitlab/verify/uploads_spec.rb
@@ -0,0 +1,44 @@
+require 'spec_helper'
+
+describe Gitlab::Verify::Uploads do
+ include GitlabVerifyHelpers
+
+ it_behaves_like 'Gitlab::Verify::BatchVerifier subclass' do
+ let(:projects) { create_list(:project, 3, :with_avatar) }
+ let!(:objects) { projects.flat_map(&:uploads) }
+ end
+
+ describe '#run_batches' do
+ let(:project) { create(:project, :with_avatar) }
+ let(:failures) { collect_failures }
+ let(:failure) { failures[upload] }
+
+ let!(:upload) { project.uploads.first }
+
+ it 'passes uploads with the correct file' do
+ expect(failures).to eq({})
+ end
+
+ it 'fails uploads with a missing file' do
+ FileUtils.rm_f(upload.absolute_path)
+
+ expect(failures.keys).to contain_exactly(upload)
+ expect(failure).to be_a(Errno::ENOENT)
+ expect(failure.to_s).to include(upload.absolute_path)
+ end
+
+ it 'fails uploads with a mismatched checksum' do
+ upload.update!(checksum: 'something incorrect')
+
+ expect(failures.keys).to contain_exactly(upload)
+ expect(failure.to_s).to include('Checksum mismatch')
+ end
+
+ it 'fails uploads with a missing precalculated checksum' do
+ upload.update!(checksum: '')
+
+ expect(failures.keys).to contain_exactly(upload)
+ expect(failure.to_s).to include('Checksum missing')
+ end
+ end
+end
diff --git a/spec/support/gitlab_verify.rb b/spec/support/gitlab_verify.rb
new file mode 100644
index 00000000000..13e2e37624d
--- /dev/null
+++ b/spec/support/gitlab_verify.rb
@@ -0,0 +1,45 @@
+RSpec.shared_examples 'Gitlab::Verify::BatchVerifier subclass' do
+ describe 'batching' do
+ let(:first_batch) { objects[0].id..objects[0].id }
+ let(:second_batch) { objects[1].id..objects[1].id }
+ let(:third_batch) { objects[2].id..objects[2].id }
+
+ it 'iterates through objects in batches' do
+ expect(collect_ranges).to eq([first_batch, second_batch, third_batch])
+ end
+
+ it 'allows the starting ID to be specified' do
+ expect(collect_ranges(start: second_batch.first)).to eq([second_batch, third_batch])
+ end
+
+ it 'allows the finishing ID to be specified' do
+ expect(collect_ranges(finish: second_batch.last)).to eq([first_batch, second_batch])
+ end
+ end
+end
+
+module GitlabVerifyHelpers
+ def collect_ranges(args = {})
+ verifier = described_class.new(args.merge(batch_size: 1))
+
+ collect_results(verifier).map { |range, _| range }
+ end
+
+ def collect_failures
+ verifier = described_class.new(batch_size: 1)
+
+ out = {}
+
+ collect_results(verifier).map { |_, failures| out.merge!(failures) }
+
+ out
+ end
+
+ def collect_results(verifier)
+ out = []
+
+ verifier.run_batches { |*args| out << args }
+
+ out
+ end
+end
diff --git a/spec/tasks/gitlab/lfs/check_rake_spec.rb b/spec/tasks/gitlab/lfs/check_rake_spec.rb
new file mode 100644
index 00000000000..2610edf8bac
--- /dev/null
+++ b/spec/tasks/gitlab/lfs/check_rake_spec.rb
@@ -0,0 +1,28 @@
+require 'rake_helper'
+
+describe 'gitlab:lfs rake tasks' do
+ describe 'check' do
+ let!(:lfs_object) { create(:lfs_object, :with_file, :correct_oid) }
+
+ before do
+ Rake.application.rake_require('tasks/gitlab/lfs/check')
+ stub_env('VERBOSE' => 'true')
+ end
+
+ it 'outputs the integrity check for each batch' do
+ expect { run_rake_task('gitlab:lfs:check') }.to output(/Failures: 0/).to_stdout
+ end
+
+ it 'errors out about missing files on the file system' do
+ FileUtils.rm_f(lfs_object.file.path)
+
+ expect { run_rake_task('gitlab:lfs:check') }.to output(/No such file.*#{Regexp.quote(lfs_object.file.path)}/).to_stdout
+ end
+
+ it 'errors out about invalid checksum' do
+ File.truncate(lfs_object.file.path, 0)
+
+ expect { run_rake_task('gitlab:lfs:check') }.to output(/Checksum mismatch/).to_stdout
+ end
+ end
+end
diff --git a/spec/tasks/gitlab/uploads_rake_spec.rb b/spec/tasks/gitlab/uploads/check_rake_spec.rb
index ac0005e51e0..5d597c66133 100644
--- a/spec/tasks/gitlab/uploads_rake_spec.rb
+++ b/spec/tasks/gitlab/uploads/check_rake_spec.rb
@@ -5,23 +5,24 @@ describe 'gitlab:uploads rake tasks' do
let!(:upload) { create(:upload, path: Rails.root.join('spec/fixtures/banana_sample.gif')) }
before do
- Rake.application.rake_require 'tasks/gitlab/uploads'
+ Rake.application.rake_require('tasks/gitlab/uploads/check')
+ stub_env('VERBOSE' => 'true')
end
- it 'outputs the integrity check for each uploaded file' do
- expect { run_rake_task('gitlab:uploads:check') }.to output(/Checking file \(#{upload.id}\): #{Regexp.quote(upload.absolute_path)}/).to_stdout
+ it 'outputs the integrity check for each batch' do
+ expect { run_rake_task('gitlab:uploads:check') }.to output(/Failures: 0/).to_stdout
end
it 'errors out about missing files on the file system' do
- create(:upload)
+ missing_upload = create(:upload)
- expect { run_rake_task('gitlab:uploads:check') }.to output(/File does not exist on the file system/).to_stdout
+ expect { run_rake_task('gitlab:uploads:check') }.to output(/No such file.*#{Regexp.quote(missing_upload.absolute_path)}/).to_stdout
end
it 'errors out about invalid checksum' do
upload.update_column(:checksum, '01a3156db2cf4f67ec823680b40b7302f89ab39179124ad219f94919b8a1769e')
- expect { run_rake_task('gitlab:uploads:check') }.to output(/File checksum \(9e697aa09fe196909813ee36103e34f721fe47a5fdc8aac0e4e4ac47b9b38282\) does not match the one in the database \(#{upload.checksum}\)/).to_stdout
+ expect { run_rake_task('gitlab:uploads:check') }.to output(/Checksum mismatch/).to_stdout
end
end
end