diff options
author | Rubén Dávila <ruben@gitlab.com> | 2018-06-06 13:49:38 -0500 |
---|---|---|
committer | Rubén Dávila <ruben@gitlab.com> | 2018-06-06 13:49:38 -0500 |
commit | 29818672e12fe779034920933098f47269fdef16 (patch) | |
tree | b71983e6a7909a81294e14be777fe7b8bd9e102b | |
parent | 90a5bcc5c9bcf90b4eb17ace072212335b85ea29 (diff) | |
download | gitlab-ce-rd-45062-further-improve-lfs-integrity-check-performance.tar.gz |
Improve performance used to check LFS integrityrd-45062-further-improve-lfs-integrity-check-performance
Instead of using `git rev-list` and loading each Blob to inspect its
content, we're now relying on the usage of `git diff` and `git
cat-file`. This has the main advantage
that we're reusing the same technique used for other existing checks,
hence all the Blobs are cached.
-rw-r--r-- | changelogs/unreleased/rd-45062-further-improve-lfs-integrity-check-performance.yml | 5 | ||||
-rw-r--r-- | lib/extracts_path.rb | 3 | ||||
-rw-r--r-- | lib/gitlab/checks/change_access.rb | 2 | ||||
-rw-r--r-- | lib/gitlab/checks/lfs_integrity.rb | 7 | ||||
-rw-r--r-- | lib/gitlab/git/blob.rb | 25 | ||||
-rw-r--r-- | lib/gitlab/git/raw_diff_change.rb | 4 | ||||
-rw-r--r-- | spec/lib/gitlab/checks/lfs_integrity_spec.rb | 17 | ||||
-rw-r--r-- | spec/lib/gitlab/git/blob_spec.rb | 61 |
8 files changed, 105 insertions, 19 deletions
diff --git a/changelogs/unreleased/rd-45062-further-improve-lfs-integrity-check-performance.yml b/changelogs/unreleased/rd-45062-further-improve-lfs-integrity-check-performance.yml new file mode 100644 index 00000000000..a502cc1d698 --- /dev/null +++ b/changelogs/unreleased/rd-45062-further-improve-lfs-integrity-check-performance.yml @@ -0,0 +1,5 @@ +--- +title: Improve performance of LFS integrity check +merge_request: 19071 +author: +type: performance diff --git a/lib/extracts_path.rb b/lib/extracts_path.rb index a9b04c183ad..48e3d81e860 100644 --- a/lib/extracts_path.rb +++ b/lib/extracts_path.rb @@ -138,8 +138,7 @@ module ExtractsPath end def lfs_blob_ids - blob_ids = tree.blobs.map(&:id) - @lfs_blob_ids = Gitlab::Git::Blob.batch_lfs_pointers(@project.repository, blob_ids).map(&:id) # rubocop:disable Gitlab/ModuleWithInstanceVariables + @lfs_blob_ids = Gitlab::Git::Blob.lfs_pointers_from_tree(@project.repository, tree).map(&:id) # rubocop:disable Gitlab/ModuleWithInstanceVariables end private diff --git a/lib/gitlab/checks/change_access.rb b/lib/gitlab/checks/change_access.rb index 51ba09aa129..e2326fa882c 100644 --- a/lib/gitlab/checks/change_access.rb +++ b/lib/gitlab/checks/change_access.rb @@ -169,7 +169,7 @@ module Gitlab end def lfs_objects_exist_check - lfs_check = Checks::LfsIntegrity.new(project, newrev) + lfs_check = Checks::LfsIntegrity.new(project, oldrev, newrev) if lfs_check.objects_missing? raise GitAccess::UnauthorizedError, ERROR_MESSAGES[:lfs_objects_missing] diff --git a/lib/gitlab/checks/lfs_integrity.rb b/lib/gitlab/checks/lfs_integrity.rb index f0e5773ec3c..603cb36d85d 100644 --- a/lib/gitlab/checks/lfs_integrity.rb +++ b/lib/gitlab/checks/lfs_integrity.rb @@ -1,17 +1,16 @@ module Gitlab module Checks class LfsIntegrity - REV_LIST_OBJECT_LIMIT = 2_000 - - def initialize(project, newrev) + def initialize(project, oldrev, newrev) @project = project + @oldrev = oldrev @newrev = newrev end def objects_missing? return false unless @newrev && @project.lfs_enabled? - new_lfs_pointers = Gitlab::Git::LfsChanges.new(@project.repository, @newrev).new_pointers(object_limit: REV_LIST_OBJECT_LIMIT) + new_lfs_pointers = Gitlab::Git::Blob.lfs_pointers_between(@project.repository, @oldrev, @newrev) return false unless new_lfs_pointers.present? diff --git a/lib/gitlab/git/blob.rb b/lib/gitlab/git/blob.rb index 156d077a69c..f90f90fbe26 100644 --- a/lib/gitlab/git/blob.rb +++ b/lib/gitlab/git/blob.rb @@ -85,6 +85,31 @@ module Gitlab end end + # Find LFS blobs old_rev and new_rev + # old_rev and new_rev are commit ID's + # Returns array of Gitlab::Git::Blob + # Does not guarantee blob data will be set + def lfs_pointers_between(repository, old_rev, new_rev) + revs_with_paths = repository.raw_changes_between(old_rev, new_rev) + .reject { |c| c.deleted? } + .select { |c| size_could_be_lfs?(c.blob_size) } + .map! { |c| [new_rev, c.new_path] } + + return [] if revs_with_paths.blank? + + batch(repository, revs_with_paths, blob_size_limit: LFS_POINTER_MAX_SIZE) + .select(&:lfs_pointer?) + end + + def lfs_pointers_from_tree(repository, tree) + revs_with_paths = tree.blobs.map! { |blob| [tree.sha, blob.path] } + + return [] if revs_with_paths.blank? + + batch(repository, revs_with_paths, blob_size_limit: LFS_POINTER_MAX_SIZE) + .select { |blob| size_could_be_lfs?(blob.size) && blob.lfs_pointer? } + end + def binary?(data) EncodingHelper.detect_libgit2_binary?(data) end diff --git a/lib/gitlab/git/raw_diff_change.rb b/lib/gitlab/git/raw_diff_change.rb index 98de9328071..a2815c5603d 100644 --- a/lib/gitlab/git/raw_diff_change.rb +++ b/lib/gitlab/git/raw_diff_change.rb @@ -17,6 +17,10 @@ module Gitlab end end + def deleted? + operation == :deleted + end + private # Input data has the following format: diff --git a/spec/lib/gitlab/checks/lfs_integrity_spec.rb b/spec/lib/gitlab/checks/lfs_integrity_spec.rb index ec22e3a198e..9b6524b518c 100644 --- a/spec/lib/gitlab/checks/lfs_integrity_spec.rb +++ b/spec/lib/gitlab/checks/lfs_integrity_spec.rb @@ -5,24 +5,17 @@ describe Gitlab::Checks::LfsIntegrity do let(:project) { create(:project, :repository) } let(:repository) { project.repository } - let(:newrev) do - operations = Gitlab::GitalyClient::StorageSettings.allow_disk_access do - BareRepoOperations.new(repository.path) - end - - # Create a commit not pointed at by any ref to emulate being in the - # pre-receive hook so that `--not --all` returns some objects - operations.commit_tree('8856a329dd38ca86dfb9ce5aa58a16d88cc119bd', "New LFS objects") - end + let(:oldrev) { Gitlab::Git::EMPTY_TREE_ID } + let(:newrev) { TestEnv::BRANCH_SHA['lfs'] } - subject { described_class.new(project, newrev) } + subject { described_class.new(project, oldrev, newrev) } describe '#objects_missing?' do let(:blob_object) { repository.blob_at_branch('lfs', 'files/lfs/lfs_object.iso') } context 'with LFS not enabled' do it 'skips integrity check' do - expect_any_instance_of(Gitlab::Git::LfsChanges).not_to receive(:new_pointers) + expect(Gitlab::Git::Blob).not_to receive(:lfs_pointers_between) subject.objects_missing? end @@ -37,7 +30,7 @@ describe Gitlab::Checks::LfsIntegrity do let(:newrev) { nil } it 'skips integrity check' do - expect_any_instance_of(Gitlab::Git::LfsChanges).not_to receive(:new_pointers) + expect(Gitlab::Git::Blob).not_to receive(:lfs_pointers_between) expect(subject.objects_missing?).to be_falsey end diff --git a/spec/lib/gitlab/git/blob_spec.rb b/spec/lib/gitlab/git/blob_spec.rb index 94eaf86ef80..796dabf7d0e 100644 --- a/spec/lib/gitlab/git/blob_spec.rb +++ b/spec/lib/gitlab/git/blob_spec.rb @@ -277,6 +277,67 @@ describe Gitlab::Git::Blob, seed_helper: true do end end + describe '.lfs_pointers_between' do + let(:repository) { create(:project, :repository).repository } + let(:old_rev) { Gitlab::Git::EMPTY_TREE_ID } + let(:lfs_rev) { TestEnv::BRANCH_SHA['lfs'] } + let(:non_lfs_rev) { TestEnv::BRANCH_SHA['feature'] } + + context 'with LFS objects' do + it 'returns a list of Gitlab::Git::Blob' do + blobs = described_class.lfs_pointers_between(repository, old_rev, lfs_rev) + + expect(blobs.count).to eq(1) + expect(blobs).to all( be_a(Gitlab::Git::Blob) ) + expect(blobs).to be_an(Array) + end + + context 'when changes consist of a single LFS pointer' do + let(:parent_commit) { '5f923865dde3436854e9ceb9cdb7815618d4e849' } + let(:commit_with_lfs_pointer) { '048721d90c449b244b7b4c53a9186b04330174ec' } + + it 'returns the single LFS pointer found' do + blobs = described_class.lfs_pointers_between(repository, parent_commit, commit_with_lfs_pointer) + + expect(blobs.count).to eq(1) + expect(blobs).to all( be_a(Gitlab::Git::Blob) ) + end + end + end + + context 'without LFS objects' do + it 'returns an empty Array' do + blobs = described_class.lfs_pointers_between(repository, old_rev, non_lfs_rev) + + expect(blobs).to eq([]) + end + end + end + + describe '.lfs_pointers_from_tree' do + let(:repository) { create(:project, :repository).repository } + let(:tree_with_lfs_objects) { repository.tree('lfs', 'files/lfs') } + let(:tree_without_lfs_objects) { repository.tree('feature', '/') } + + context 'with a tree with LFS objects' do + it 'returns a list of Gitlab::Git::Blob' do + blobs = described_class.lfs_pointers_from_tree(repository, tree_with_lfs_objects) + + expect(blobs.count).to eq(1) + expect(blobs).to all( be_a(Gitlab::Git::Blob) ) + expect(blobs).to be_an(Array) + end + end + + context 'with a tree without LFS objects' do + it 'returns an empty Array' do + blobs = described_class.lfs_pointers_from_tree(repository, tree_without_lfs_objects) + + expect(blobs).to eq([]) + end + end + end + describe '.batch_lfs_pointers' do let(:tree_object) { repository.rugged.rev_parse('master^{tree}') } |