summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJames Edwards-Jones <jedwardsjones@gitlab.com>2017-08-25 02:30:12 +0100
committerJames Edwards-Jones <jedwardsjones@gitlab.com>2017-11-01 12:37:25 +0000
commitad3692b8c8dfb7898b6016a86e8f508b75ce3ba2 (patch)
tree258f8666fd6f8af3605ab8e639bea213599f6140
parentb5e7035c1a5e89845faada20addff98fdcefbd1e (diff)
downloadgitlab-ce-jej/lfs-change-detection.tar.gz
Detect changes to LFS pointers for pruning and integrity checkjej/lfs-change-detection
Gitlab::Git::Blob.batch_lfs_metadata can be used to check for LFS pointers. It uses a lazy enumorator and filters by blob size
-rw-r--r--lib/gitlab/git/blob.rb57
-rw-r--r--lib/gitlab/git/lfs_changes.rb36
-rw-r--r--lib/gitlab/git/repository.rb8
-rw-r--r--lib/gitlab/git/rev_list.rb45
-rw-r--r--spec/lib/gitlab/git/blob_spec.rb55
-rw-r--r--spec/lib/gitlab/git/lfs_changes_spec.rb48
-rw-r--r--spec/lib/gitlab/git/repository_spec.rb18
-rw-r--r--spec/lib/gitlab/git/rev_list_spec.rb87
8 files changed, 313 insertions, 41 deletions
diff --git a/lib/gitlab/git/blob.rb b/lib/gitlab/git/blob.rb
index a4336facee5..6473397f8c3 100644
--- a/lib/gitlab/git/blob.rb
+++ b/lib/gitlab/git/blob.rb
@@ -12,6 +12,12 @@ module Gitlab
# blob data should use load_all_data!.
MAX_DATA_DISPLAY_SIZE = 10.megabytes
+ # These limits are used as a heuristic to ignore files which can't be LFS
+ # pointers. The format of these is described in
+ # https://github.com/git-lfs/git-lfs/blob/master/docs/spec.md#the-pointer
+ LFS_POINTER_MIN_SIZE = 120.bytes
+ LFS_POINTER_MAX_SIZE = 200.bytes
+
attr_accessor :name, :path, :size, :data, :mode, :id, :commit_id, :loaded_size, :binary
class << self
@@ -30,16 +36,7 @@ module Gitlab
if is_enabled
Gitlab::GitalyClient::BlobService.new(repository).get_blob(oid: sha, limit: MAX_DATA_DISPLAY_SIZE)
else
- blob = repository.lookup(sha)
-
- next unless blob.is_a?(Rugged::Blob)
-
- new(
- id: blob.oid,
- size: blob.size,
- data: blob.content(MAX_DATA_DISPLAY_SIZE),
- binary: blob.binary?
- )
+ rugged_raw(repository, sha, limit: MAX_DATA_DISPLAY_SIZE)
end
end
end
@@ -59,10 +56,25 @@ module Gitlab
end
end
+ # Find LFS blobs given an array of sha ids
+ # Returns array of Gitlab::Git::Blob
+ # Does not guarantee blob data will be set
+ def batch_lfs_metadata(repository, blob_ids)
+ blob_ids.lazy
+ .select { |sha| possible_lfs_blob?(repository, sha) }
+ .map { |sha| rugged_raw(repository, sha, limit: LFS_POINTER_MAX_SIZE) }
+ .select(&:lfs_pointer?)
+ .force
+ end
+
def binary?(data)
EncodingHelper.detect_libgit2_binary?(data)
end
+ def size_could_be_lfs?(size)
+ size.between?(LFS_POINTER_MIN_SIZE, LFS_POINTER_MAX_SIZE)
+ end
+
private
# Recursive search of blob id by path
@@ -167,6 +179,29 @@ module Gitlab
end
end
end
+
+ def rugged_raw(repository, sha, limit:)
+ blob = repository.lookup(sha)
+
+ return unless blob.is_a?(Rugged::Blob)
+
+ new(
+ id: blob.oid,
+ size: blob.size,
+ data: blob.content(limit),
+ binary: blob.binary?
+ )
+ end
+
+ # Efficient lookup to determine if object size
+ # and type make it a possible LFS blob without loading
+ # blob content into memory with repository.lookup(sha)
+ def possible_lfs_blob?(repository, sha)
+ object_header = repository.rugged.read_header(sha)
+
+ object_header[:type] == :blob &&
+ size_could_be_lfs?(object_header[:len])
+ end
end
def initialize(options)
@@ -226,7 +261,7 @@ module Gitlab
# size
# see https://github.com/github/git-lfs/blob/v1.1.0/docs/spec.md#the-pointer
def lfs_pointer?
- has_lfs_version_key? && lfs_oid.present? && lfs_size.present?
+ self.class.size_could_be_lfs?(size) && has_lfs_version_key? && lfs_oid.present? && lfs_size.present?
end
def lfs_oid
diff --git a/lib/gitlab/git/lfs_changes.rb b/lib/gitlab/git/lfs_changes.rb
new file mode 100644
index 00000000000..f8ab2621b71
--- /dev/null
+++ b/lib/gitlab/git/lfs_changes.rb
@@ -0,0 +1,36 @@
+module Gitlab
+ module Git
+ class LfsChanges
+ def initialize(repository, newrev)
+ @repository = repository
+ @newrev = newrev
+ end
+
+ def new_pointers(object_limit: nil, not_in: nil)
+ @new_pointers ||= begin
+ object_ids = new_objects(not_in: not_in)
+ object_ids = object_ids.take(object_limit) if object_limit
+
+ Gitlab::Git::Blob.batch_lfs_metadata(@repository, object_ids)
+ end
+ end
+
+ def all_pointers
+ object_ids = rev_list.all_objects(require_path: true)
+
+ Gitlab::Git::Blob.batch_lfs_metadata(@repository, object_ids)
+ end
+
+ private
+
+ def new_objects(not_in:)
+ rev_list.new_objects(require_path: true, lazy: true, not_in: not_in)
+ end
+
+ def rev_list
+ ::Gitlab::Git::RevList.new(path_to_repo: @repository.path_to_repo,
+ newrev: @newrev)
+ end
+ end
+ end
+end
diff --git a/lib/gitlab/git/repository.rb b/lib/gitlab/git/repository.rb
index 59a54b48ed9..9c407b1b1c2 100644
--- a/lib/gitlab/git/repository.rb
+++ b/lib/gitlab/git/repository.rb
@@ -290,6 +290,14 @@ module Gitlab
end
end
+ def batch_existence(object_ids, existing: true)
+ filter_method = existing ? :select : :reject
+
+ object_ids.public_send(filter_method) do |oid| # rubocop:disable GitlabSecurity/PublicSend
+ rugged.exists?(oid)
+ end
+ end
+
# Returns an Array of branch and tag names
def ref_names
branch_names + tag_names
diff --git a/lib/gitlab/git/rev_list.rb b/lib/gitlab/git/rev_list.rb
index 60b2a4ec411..e0c884aceaa 100644
--- a/lib/gitlab/git/rev_list.rb
+++ b/lib/gitlab/git/rev_list.rb
@@ -13,11 +13,31 @@ module Gitlab
@path_to_repo = path_to_repo
end
- # This method returns an array of new references
+ # This method returns an array of new commit references
def new_refs
execute([*base_args, newrev, '--not', '--all'])
end
+ # Finds newly added objects
+ # Returns an array of shas
+ #
+ # Can skip objects which do not have a path using required_path: true
+ # This skips commit objects and root trees, which might not be needed when
+ # looking for blobs
+ #
+ # Can return a lazy enumerator to limit work done on megabytes of data
+ def new_objects(require_path: nil, lazy: false, not_in: nil)
+ object_output = execute([*base_args, newrev, *not_in_refs(not_in), '--objects'])
+
+ objects_from_output(object_output, require_path: require_path, lazy: lazy)
+ end
+
+ def all_objects(require_path: nil)
+ object_output = execute([*base_args, '--all', '--objects'])
+
+ objects_from_output(object_output, require_path: require_path, lazy: true)
+ end
+
# This methods returns an array of missed references
#
# Should become obsolete after https://gitlab.com/gitlab-org/gitaly/issues/348.
@@ -27,6 +47,13 @@ module Gitlab
private
+ def not_in_refs(references)
+ return ['--not', '--all'] unless references
+ return [] if references.empty?
+
+ references.prepend('--not')
+ end
+
def execute(args)
output, status = popen(args, nil, Gitlab::Git::Env.to_env_hash)
@@ -44,6 +71,22 @@ module Gitlab
'rev-list'
]
end
+
+ def objects_from_output(object_output, require_path: nil, lazy: nil)
+ objects = object_output.lazy.map do |output_line|
+ sha, path = output_line.split(' ', 2)
+
+ next if require_path && path.blank?
+
+ sha
+ end.reject(&:nil?)
+
+ if lazy
+ objects
+ else
+ objects.force
+ end
+ end
end
end
end
diff --git a/spec/lib/gitlab/git/blob_spec.rb b/spec/lib/gitlab/git/blob_spec.rb
index 412a0093d97..ecd6f037b70 100644
--- a/spec/lib/gitlab/git/blob_spec.rb
+++ b/spec/lib/gitlab/git/blob_spec.rb
@@ -143,6 +143,16 @@ describe Gitlab::Git::Blob, seed_helper: true do
expect(blob.loaded_size).to eq(blob_size)
end
end
+
+ context 'when sha references a tree' do
+ it 'returns nil' do
+ tree = Gitlab::Git::Commit.find(repository, 'master').tree
+
+ blob = Gitlab::Git::Blob.raw(repository, tree.oid)
+
+ expect(blob).to be_nil
+ end
+ end
end
describe '.raw' do
@@ -226,6 +236,51 @@ describe Gitlab::Git::Blob, seed_helper: true do
end
end
+ describe '.batch_lfs_metadata' do
+ let(:tree_object) { Gitlab::Git::Commit.find(repository, 'master').tree }
+
+ let(:non_lfs_blob) do
+ Gitlab::Git::Blob.find(
+ repository,
+ 'master',
+ 'README.md'
+ )
+ end
+
+ let(:lfs_blob) do
+ Gitlab::Git::Blob.find(
+ repository,
+ '33bcff41c232a11727ac6d660bd4b0c2ba86d63d',
+ 'files/lfs/image.jpg'
+ )
+ end
+
+ it 'returns a list of Gitlab::Git::Blob' do
+ blobs = described_class.batch_lfs_metadata(repository, [lfs_blob.id])
+
+ expect(blobs.count).to eq(1)
+ expect(blobs).to all( be_a(Gitlab::Git::Blob) )
+ end
+
+ it 'silently ignores tree objects' do
+ blobs = described_class.batch_lfs_metadata(repository, [tree_object.oid])
+
+ expect(blobs).to eq([])
+ end
+
+ it 'silently ignores non lfs objects' do
+ blobs = described_class.batch_lfs_metadata(repository, [non_lfs_blob.id])
+
+ expect(blobs).to eq([])
+ end
+
+ it 'avoids loading large blobs into memory' do
+ expect(repository).not_to receive(:lookup)
+
+ described_class.batch_lfs_metadata(repository, [non_lfs_blob.id])
+ end
+ end
+
describe 'encoding' do
context 'file with russian text' do
let(:blob) { Gitlab::Git::Blob.find(repository, SeedRepo::Commit::ID, "encoding/russian.rb") }
diff --git a/spec/lib/gitlab/git/lfs_changes_spec.rb b/spec/lib/gitlab/git/lfs_changes_spec.rb
new file mode 100644
index 00000000000..dc165dc36ff
--- /dev/null
+++ b/spec/lib/gitlab/git/lfs_changes_spec.rb
@@ -0,0 +1,48 @@
+require 'spec_helper'
+
+describe Gitlab::Git::LfsChanges do
+ let(:project) { create(:project, :repository) }
+ let(:newrev) { '54fcc214b94e78d7a41a9a8fe6d87a5e59500e51' }
+ let(:blob_object_id) { '0c304a93cb8430108629bbbcaa27db3343299bc0' }
+
+ subject { described_class.new(project.repository, newrev) }
+
+ describe 'new_pointers' do
+ before do
+ allow_any_instance_of(Gitlab::Git::RevList).to receive(:new_objects).and_return([blob_object_id])
+ end
+
+ it 'uses rev-list to find new objects' do
+ rev_list = double
+ allow(Gitlab::Git::RevList).to receive(:new).and_return(rev_list)
+
+ expect(rev_list).to receive(:new_objects).and_return([])
+
+ subject.new_pointers
+ end
+
+ it 'filters new objects to find lfs pointers' do
+ expect(Gitlab::Git::Blob).to receive(:batch_lfs_metadata).with(project.repository, [blob_object_id])
+
+ subject.new_pointers(object_limit: 1)
+ end
+
+ it 'limits new_objects using object_limit' do
+ expect(Gitlab::Git::Blob).to receive(:batch_lfs_metadata).with(project.repository, [])
+
+ subject.new_pointers(object_limit: 0)
+ end
+ end
+
+ describe 'all_pointers' do
+ it 'uses rev-list to find all objects' do
+ rev_list = double
+ allow(Gitlab::Git::RevList).to receive(:new).and_return(rev_list)
+ allow(rev_list).to receive(:all_objects).and_return([blob_object_id])
+
+ expect(Gitlab::Git::Blob).to receive(:batch_lfs_metadata).with(project.repository, [blob_object_id])
+
+ subject.all_pointers
+ end
+ end
+end
diff --git a/spec/lib/gitlab/git/repository_spec.rb b/spec/lib/gitlab/git/repository_spec.rb
index b2d2f770e3d..ad3d81b41e2 100644
--- a/spec/lib/gitlab/git/repository_spec.rb
+++ b/spec/lib/gitlab/git/repository_spec.rb
@@ -1310,6 +1310,24 @@ describe Gitlab::Git::Repository, seed_helper: true do
end
end
+ describe '#batch_existence' do
+ let(:refs) { ['deadbeef', SeedRepo::RubyBlob::ID, '909e6157199'] }
+
+ it 'returns existing refs back' do
+ result = repository.batch_existence(refs)
+
+ expect(result).to eq([SeedRepo::RubyBlob::ID])
+ end
+
+ context 'existing: true' do
+ it 'inverts meaning and returns non-existing refs' do
+ result = repository.batch_existence(refs, existing: false)
+
+ expect(result).to eq(%w(deadbeef 909e6157199))
+ end
+ end
+ end
+
describe '#local_branches' do
before(:all) do
@repo = Gitlab::Git::Repository.new('default', TEST_MUTABLE_REPO_PATH, '')
diff --git a/spec/lib/gitlab/git/rev_list_spec.rb b/spec/lib/gitlab/git/rev_list_spec.rb
index c0eac98d718..643a4b2d03e 100644
--- a/spec/lib/gitlab/git/rev_list_spec.rb
+++ b/spec/lib/gitlab/git/rev_list_spec.rb
@@ -2,53 +2,82 @@ require 'spec_helper'
describe Gitlab::Git::RevList do
let(:project) { create(:project, :repository) }
+ let(:rev_list) { described_class.new(newrev: 'newrev', path_to_repo: project.repository.path_to_repo) }
before do
- expect(Gitlab::Git::Env).to receive(:all).and_return({
+ allow(Gitlab::Git::Env).to receive(:all).and_return({
GIT_OBJECT_DIRECTORY: 'foo',
GIT_ALTERNATE_OBJECT_DIRECTORIES: 'bar'
})
end
- context "#new_refs" do
- let(:rev_list) { described_class.new(newrev: 'newrev', path_to_repo: project.repository.path_to_repo) }
+ def stub_popen_rev_list(*additional_args, output:)
+ expect(rev_list).to receive(:popen).with([
+ Gitlab.config.git.bin_path,
+ "--git-dir=#{project.repository.path_to_repo}",
+ 'rev-list',
+ *additional_args
+ ],
+ nil,
+ {
+ 'GIT_OBJECT_DIRECTORY' => 'foo',
+ 'GIT_ALTERNATE_OBJECT_DIRECTORIES' => 'bar'
+ }).and_return([output, 0])
+ end
+ context "#new_refs" do
it 'calls out to `popen`' do
- expect(rev_list).to receive(:popen).with([
- Gitlab.config.git.bin_path,
- "--git-dir=#{project.repository.path_to_repo}",
- 'rev-list',
- 'newrev',
- '--not',
- '--all'
- ],
- nil,
- {
- 'GIT_OBJECT_DIRECTORY' => 'foo',
- 'GIT_ALTERNATE_OBJECT_DIRECTORIES' => 'bar'
- }).and_return(["sha1\nsha2", 0])
+ stub_popen_rev_list('newrev', '--not', '--all', output: "sha1\nsha2")
expect(rev_list.new_refs).to eq(%w[sha1 sha2])
end
end
+ context '#new_objects' do
+ it 'fetches list of newly pushed objects using rev-list' do
+ stub_popen_rev_list('newrev', '--not', '--all', '--objects', output: "sha1\nsha2")
+
+ expect(rev_list.new_objects).to eq(%w[sha1 sha2])
+ end
+
+ it 'can skip pathless objects' do
+ stub_popen_rev_list('newrev', '--not', '--all', '--objects', output: "sha1\nsha2 path/to/file")
+
+ expect(rev_list.new_objects(require_path: true)).to eq(%w[sha2])
+ end
+
+ it 'can return a lazy enumerator' do
+ stub_popen_rev_list('newrev', '--not', '--all', '--objects', output: "sha1\nsha2")
+
+ expect(rev_list.new_objects(lazy: true)).to be_a Enumerator::Lazy
+ end
+
+ it 'can accept list of references to exclude' do
+ stub_popen_rev_list('newrev', '--not', 'master', '--objects', output: "sha1\nsha2")
+
+ expect(rev_list.new_objects(not_in: ['master'])).to eq(%w[sha1 sha2])
+ end
+
+ it 'handles empty list of references to exclude as listing all known objects' do
+ stub_popen_rev_list('newrev', '--objects', output: "sha1\nsha2")
+
+ expect(rev_list.new_objects(not_in: [])).to eq(%w[sha1 sha2])
+ end
+ end
+
+ context '#all_objects' do
+ it 'fetches list of all pushed objects using rev-list' do
+ stub_popen_rev_list('--all', '--objects', output: "sha1\nsha2")
+
+ expect(rev_list.all_objects.force).to eq(%w[sha1 sha2])
+ end
+ end
+
context "#missed_ref" do
let(:rev_list) { described_class.new(oldrev: 'oldrev', newrev: 'newrev', path_to_repo: project.repository.path_to_repo) }
it 'calls out to `popen`' do
- expect(rev_list).to receive(:popen).with([
- Gitlab.config.git.bin_path,
- "--git-dir=#{project.repository.path_to_repo}",
- 'rev-list',
- '--max-count=1',
- 'oldrev',
- '^newrev'
- ],
- nil,
- {
- 'GIT_OBJECT_DIRECTORY' => 'foo',
- 'GIT_ALTERNATE_OBJECT_DIRECTORIES' => 'bar'
- }).and_return(["sha1\nsha2", 0])
+ stub_popen_rev_list('--max-count=1', 'oldrev', '^newrev', output: "sha1\nsha2")
expect(rev_list.missed_ref).to eq(%w[sha1 sha2])
end