From 0683d31ab162284f9f37004da7365ce2f3e6da55 Mon Sep 17 00:00:00 2001 From: James Edwards-Jones Date: Thu, 11 Jan 2018 23:12:34 +0000 Subject: Can parse root .gitattributes file for a ref --- .../rugged_use_gitlab_git_attributes.rb | 4 +- lib/gitlab/git/attributes.rb | 136 ------------------ lib/gitlab/git/attributes_at_ref_parser.rb | 14 ++ lib/gitlab/git/attributes_parser.rb | 115 ++++++++++++++++ lib/gitlab/git/info_attributes.rb | 49 +++++++ lib/gitlab/git/repository.rb | 14 +- .../gitlab/git/attributes_at_ref_parser_spec.rb | 28 ++++ spec/lib/gitlab/git/attributes_parser_spec.rb | 153 +++++++++++++++++++++ spec/lib/gitlab/git/attributes_spec.rb | 150 -------------------- spec/lib/gitlab/git/info_attributes_spec.rb | 43 ++++++ spec/support/test_env.rb | 2 +- 11 files changed, 418 insertions(+), 290 deletions(-) delete mode 100644 lib/gitlab/git/attributes.rb create mode 100644 lib/gitlab/git/attributes_at_ref_parser.rb create mode 100644 lib/gitlab/git/attributes_parser.rb create mode 100644 lib/gitlab/git/info_attributes.rb create mode 100644 spec/lib/gitlab/git/attributes_at_ref_parser_spec.rb create mode 100644 spec/lib/gitlab/git/attributes_parser_spec.rb delete mode 100644 spec/lib/gitlab/git/attributes_spec.rb create mode 100644 spec/lib/gitlab/git/info_attributes_spec.rb diff --git a/config/initializers/rugged_use_gitlab_git_attributes.rb b/config/initializers/rugged_use_gitlab_git_attributes.rb index 1cfb3bcb4bd..c0d45caec42 100644 --- a/config/initializers/rugged_use_gitlab_git_attributes.rb +++ b/config/initializers/rugged_use_gitlab_git_attributes.rb @@ -7,7 +7,7 @@ # repository-wide language statistics: # # -# The options passed by Linguist are those assumed by Gitlab::Git::Attributes +# The options passed by Linguist are those assumed by Gitlab::Git::InfoAttributes # anyway, and there is no great efficiency gain from just fetching the listed # attributes with our implementation, so we ignore the additional arguments. # @@ -19,7 +19,7 @@ module Rugged end def attributes - @attributes ||= Gitlab::Git::Attributes.new(path) + @attributes ||= Gitlab::Git::InfoAttributes.new(path) end end diff --git a/lib/gitlab/git/attributes.rb b/lib/gitlab/git/attributes.rb deleted file mode 100644 index 2d20cd473a7..00000000000 --- a/lib/gitlab/git/attributes.rb +++ /dev/null @@ -1,136 +0,0 @@ -# Gitaly note: JV: not sure what to make of this class. Why does it use -# the full disk path of the repository to look up attributes This is -# problematic in Gitaly, because Gitaly hides the full disk path to the -# repository from gitlab-ce. - -module Gitlab - module Git - # Class for parsing Git attribute files and extracting the attributes for - # file patterns. - # - # Unlike Rugged this parser only needs a single IO call (a call to `open`), - # vastly reducing the time spent in extracting attributes. - # - # This class _only_ supports parsing the attributes file located at - # `$GIT_DIR/info/attributes` as GitLab doesn't use any other files - # (`.gitattributes` is copied to this particular path). - # - # Basic usage: - # - # attributes = Gitlab::Git::Attributes.new(some_repo.path) - # - # attributes.attributes('README.md') # => { "eol" => "lf } - class Attributes - # path - The path to the Git repository. - def initialize(path) - @path = File.expand_path(path) - @patterns = nil - end - - # Returns all the Git attributes for the given path. - # - # path - A path to a file for which to get the attributes. - # - # Returns a Hash. - def attributes(path) - full_path = File.join(@path, path) - - patterns.each do |pattern, attrs| - return attrs if File.fnmatch?(pattern, full_path) - end - - {} - end - - # Returns a Hash containing the file patterns and their attributes. - def patterns - @patterns ||= parse_file - end - - # Parses an attribute string. - # - # These strings can be in the following formats: - # - # text # => { "text" => true } - # -text # => { "text" => false } - # key=value # => { "key" => "value" } - # - # string - The string to parse. - # - # Returns a Hash containing the attributes and their values. - def parse_attributes(string) - values = {} - dash = '-' - equal = '=' - binary = 'binary' - - string.split(/\s+/).each do |chunk| - # Data such as "foo = bar" should be treated as "foo" and "bar" being - # separate boolean attributes. - next if chunk == equal - - key = chunk - - # Input: "-foo" - if chunk.start_with?(dash) - key = chunk.byteslice(1, chunk.length - 1) - value = false - - # Input: "foo=bar" - elsif chunk.include?(equal) - key, value = chunk.split(equal, 2) - - # Input: "foo" - else - value = true - end - - values[key] = value - - # When the "binary" option is set the "diff" option should be set to - # the inverse. If "diff" is later set it should overwrite the - # automatically set value. - values['diff'] = false if key == binary && value - end - - values - end - - # Iterates over every line in the attributes file. - def each_line - full_path = File.join(@path, 'info/attributes') - - return unless File.exist?(full_path) - - File.open(full_path, 'r') do |handle| - handle.each_line do |line| - break unless line.valid_encoding? - - yield line.strip - end - end - end - - private - - # Parses the Git attributes file. - def parse_file - pairs = [] - comment = '#' - - each_line do |line| - next if line.start_with?(comment) || line.empty? - - pattern, attrs = line.split(/\s+/, 2) - - parsed = attrs ? parse_attributes(attrs) : {} - - pairs << [File.join(@path, pattern), parsed] - end - - # Newer entries take precedence over older entries. - pairs.reverse.to_h - end - end - end -end diff --git a/lib/gitlab/git/attributes_at_ref_parser.rb b/lib/gitlab/git/attributes_at_ref_parser.rb new file mode 100644 index 00000000000..26b5bd520d5 --- /dev/null +++ b/lib/gitlab/git/attributes_at_ref_parser.rb @@ -0,0 +1,14 @@ +module Gitlab + module Git + # Parses root .gitattributes file at a given ref + class AttributesAtRefParser + delegate :attributes, to: :@parser + + def initialize(repository, ref) + blob = repository.blob_at(ref, '.gitattributes') + + @parser = AttributesParser.new(blob&.data) + end + end + end +end diff --git a/lib/gitlab/git/attributes_parser.rb b/lib/gitlab/git/attributes_parser.rb new file mode 100644 index 00000000000..d8aeabb6cba --- /dev/null +++ b/lib/gitlab/git/attributes_parser.rb @@ -0,0 +1,115 @@ +module Gitlab + module Git + # Class for parsing Git attribute files and extracting the attributes for + # file patterns. + class AttributesParser + def initialize(attributes_data) + @data = attributes_data || "" + + if @data.is_a?(File) + @patterns = parse_file + end + end + + # Returns all the Git attributes for the given path. + # + # file_path - A path to a file for which to get the attributes. + # + # Returns a Hash. + def attributes(file_path) + absolute_path = File.join('/', file_path) + + patterns.each do |pattern, attrs| + return attrs if File.fnmatch?(pattern, absolute_path) + end + + {} + end + + # Returns a Hash containing the file patterns and their attributes. + def patterns + @patterns ||= parse_file + end + + # Parses an attribute string. + # + # These strings can be in the following formats: + # + # text # => { "text" => true } + # -text # => { "text" => false } + # key=value # => { "key" => "value" } + # + # string - The string to parse. + # + # Returns a Hash containing the attributes and their values. + def parse_attributes(string) + values = {} + dash = '-' + equal = '=' + binary = 'binary' + + string.split(/\s+/).each do |chunk| + # Data such as "foo = bar" should be treated as "foo" and "bar" being + # separate boolean attributes. + next if chunk == equal + + key = chunk + + # Input: "-foo" + if chunk.start_with?(dash) + key = chunk.byteslice(1, chunk.length - 1) + value = false + + # Input: "foo=bar" + elsif chunk.include?(equal) + key, value = chunk.split(equal, 2) + + # Input: "foo" + else + value = true + end + + values[key] = value + + # When the "binary" option is set the "diff" option should be set to + # the inverse. If "diff" is later set it should overwrite the + # automatically set value. + values['diff'] = false if key == binary && value + end + + values + end + + # Iterates over every line in the attributes file. + def each_line + @data.each_line do |line| + break unless line.valid_encoding? + + yield line.strip + end + end + + private + + # Parses the Git attributes file. + def parse_file + pairs = [] + comment = '#' + + each_line do |line| + next if line.start_with?(comment) || line.empty? + + pattern, attrs = line.split(/\s+/, 2) + + parsed = attrs ? parse_attributes(attrs) : {} + + absolute_pattern = File.join('/', pattern) + pairs << [absolute_pattern, parsed] + end + + # Newer entries take precedence over older entries. + pairs.reverse.to_h + end + end + end +end diff --git a/lib/gitlab/git/info_attributes.rb b/lib/gitlab/git/info_attributes.rb new file mode 100644 index 00000000000..e79a440950b --- /dev/null +++ b/lib/gitlab/git/info_attributes.rb @@ -0,0 +1,49 @@ +# Gitaly note: JV: not sure what to make of this class. Why does it use +# the full disk path of the repository to look up attributes This is +# problematic in Gitaly, because Gitaly hides the full disk path to the +# repository from gitlab-ce. + +module Gitlab + module Git + # Parses gitattributes at `$GIT_DIR/info/attributes` + # + # Unlike Rugged this parser only needs a single IO call (a call to `open`), + # vastly reducing the time spent in extracting attributes. + # + # This class _only_ supports parsing the attributes file located at + # `$GIT_DIR/info/attributes` as GitLab doesn't use any other files + # (`.gitattributes` is copied to this particular path). + # + # Basic usage: + # + # attributes = Gitlab::Git::InfoAttributes.new(some_repo.path) + # + # attributes.attributes('README.md') # => { "eol" => "lf } + class InfoAttributes + delegate :attributes, :patterns, to: :parser + + # path - The path to the Git repository. + def initialize(path) + @repo_path = File.expand_path(path) + end + + def parser + @parser ||= begin + if File.exist?(attributes_path) + File.open(attributes_path, 'r') do |file_handle| + AttributesParser.new(file_handle) + end + else + AttributesParser.new("") + end + end + end + + private + + def attributes_path + @attributes_path ||= File.join(@repo_path, 'info/attributes') + end + end + end +end diff --git a/lib/gitlab/git/repository.rb b/lib/gitlab/git/repository.rb index 283134e043e..736aaf0a642 100644 --- a/lib/gitlab/git/repository.rb +++ b/lib/gitlab/git/repository.rb @@ -102,7 +102,7 @@ module Gitlab ) @path = File.join(storage_path, @relative_path) @name = @relative_path.split("/").last - @attributes = Gitlab::Git::Attributes.new(path) + @attributes = Gitlab::Git::InfoAttributes.new(path) end def ==(other) @@ -1011,6 +1011,18 @@ module Gitlab attributes(path)[name] end + # Check .gitattributes for a given ref + # + # This only checks the root .gitattributes file, + # it does not traverse subfolders to find additional .gitattributes files + # + # This method is around 30 times slower than `attributes`, + # which uses `$GIT_DIR/info/attributes` + def attributes_at(ref, file_path) + parser = AttributesAtRefParser.new(self, ref) + parser.attributes(file_path) + end + def languages(ref = nil) Gitlab::GitalyClient.migrate(:commit_languages) do |is_enabled| if is_enabled diff --git a/spec/lib/gitlab/git/attributes_at_ref_parser_spec.rb b/spec/lib/gitlab/git/attributes_at_ref_parser_spec.rb new file mode 100644 index 00000000000..5d22dcfb508 --- /dev/null +++ b/spec/lib/gitlab/git/attributes_at_ref_parser_spec.rb @@ -0,0 +1,28 @@ +require 'spec_helper' + +describe Gitlab::Git::AttributesAtRefParser, seed_helper: true do + let(:project) { create(:project, :repository) } + let(:repository) { project.repository } + + subject { described_class.new(repository, 'lfs') } + + it 'loads .gitattributes blob' do + repository.raw # Initialize repository in advance since this also checks attributes + + expected_filter = 'filter=lfs diff=lfs merge=lfs' + receive_blob = receive(:new).with(a_string_including(expected_filter)) + expect(Gitlab::Git::AttributesParser).to receive_blob.and_call_original + + subject + end + + it 'handles missing blobs' do + expect { described_class.new(repository, 'non-existant-branch') }.not_to raise_error + end + + describe '#attributes' do + it 'returns the attributes as a Hash' do + expect(subject.attributes('test.lfs')['filter']).to eq('lfs') + end + end +end diff --git a/spec/lib/gitlab/git/attributes_parser_spec.rb b/spec/lib/gitlab/git/attributes_parser_spec.rb new file mode 100644 index 00000000000..323334e99a5 --- /dev/null +++ b/spec/lib/gitlab/git/attributes_parser_spec.rb @@ -0,0 +1,153 @@ +require 'spec_helper' + +describe Gitlab::Git::AttributesParser, seed_helper: true do + let(:attributes_path) { File.join(SEED_STORAGE_PATH, 'with-git-attributes.git', 'info', 'attributes') } + let(:data) { File.read(attributes_path) } + + subject { described_class.new(data) } + + describe '#attributes' do + context 'using a path with attributes' do + it 'returns the attributes as a Hash' do + expect(subject.attributes('test.txt')).to eq({ 'text' => true }) + end + + it 'returns a Hash containing multiple attributes' do + expect(subject.attributes('test.sh')) + .to eq({ 'eol' => 'lf', 'gitlab-language' => 'shell' }) + end + + it 'returns a Hash containing attributes for a file with multiple extensions' do + expect(subject.attributes('test.haml.html')) + .to eq({ 'gitlab-language' => 'haml' }) + end + + it 'returns a Hash containing attributes for a file in a directory' do + expect(subject.attributes('foo/bar.txt')).to eq({ 'foo' => true }) + end + + it 'returns a Hash containing attributes with query string parameters' do + expect(subject.attributes('foo.cgi')) + .to eq({ 'key' => 'value?p1=v1&p2=v2' }) + end + + it 'returns a Hash containing the attributes for an absolute path' do + expect(subject.attributes('/test.txt')).to eq({ 'text' => true }) + end + + it 'returns a Hash containing the attributes when a pattern is defined using an absolute path' do + # When a path is given without a leading slash it should still match + # patterns defined with a leading slash. + expect(subject.attributes('foo.png')) + .to eq({ 'gitlab-language' => 'png' }) + + expect(subject.attributes('/foo.png')) + .to eq({ 'gitlab-language' => 'png' }) + end + + it 'returns an empty Hash for a defined path without attributes' do + expect(subject.attributes('bla/bla.txt')).to eq({}) + end + + context 'when the "binary" option is set for a path' do + it 'returns true for the "binary" option' do + expect(subject.attributes('test.binary')['binary']).to eq(true) + end + + it 'returns false for the "diff" option' do + expect(subject.attributes('test.binary')['diff']).to eq(false) + end + end + end + + context 'using a path without any attributes' do + it 'returns an empty Hash' do + expect(subject.attributes('test.foo')).to eq({}) + end + end + + context 'when attributes data is a file handle' do + subject do + File.open(attributes_path, 'r') do |file_handle| + described_class.new(file_handle) + end + end + + it 'returns the attributes as a Hash' do + expect(subject.attributes('test.txt')).to eq({ 'text' => true }) + end + end + + context 'when attributes data is nil' do + let(:data) { nil } + + it 'returns an empty Hash' do + expect(subject.attributes('test.foo')).to eq({}) + end + end + end + + describe '#patterns' do + it 'parses a file with entries' do + expect(subject.patterns).to be_an_instance_of(Hash) + end + + it 'parses an entry that uses a tab to separate the pattern and attributes' do + expect(subject.patterns[File.join('/', '*.md')]) + .to eq({ 'gitlab-language' => 'markdown' }) + end + + it 'stores patterns in reverse order' do + first = subject.patterns.to_a[0] + + expect(first[0]).to eq(File.join('/', 'bla/bla.txt')) + end + + # It's a bit hard to test for something _not_ being processed. As such we'll + # just test the number of entries. + it 'ignores any comments and empty lines' do + expect(subject.patterns.length).to eq(10) + end + end + + describe '#parse_attributes' do + it 'parses a boolean attribute' do + expect(subject.parse_attributes('text')).to eq({ 'text' => true }) + end + + it 'parses a negated boolean attribute' do + expect(subject.parse_attributes('-text')).to eq({ 'text' => false }) + end + + it 'parses a key-value pair' do + expect(subject.parse_attributes('foo=bar')).to eq({ 'foo' => 'bar' }) + end + + it 'parses multiple attributes' do + input = 'boolean key=value -negated' + + expect(subject.parse_attributes(input)) + .to eq({ 'boolean' => true, 'key' => 'value', 'negated' => false }) + end + + it 'parses attributes with query string parameters' do + expect(subject.parse_attributes('foo=bar?baz=1')) + .to eq({ 'foo' => 'bar?baz=1' }) + end + end + + describe '#each_line' do + it 'iterates over every line in the attributes file' do + args = [String] * 14 # the number of lines in the file + + expect { |b| subject.each_line(&b) }.to yield_successive_args(*args) + end + + it 'does not yield when the attributes file has an unsupported encoding' do + path = File.join(SEED_STORAGE_PATH, 'with-invalid-git-attributes.git', 'info', 'attributes') + attrs = described_class.new(File.read(path)) + + expect { |b| attrs.each_line(&b) }.not_to yield_control + end + end +end diff --git a/spec/lib/gitlab/git/attributes_spec.rb b/spec/lib/gitlab/git/attributes_spec.rb deleted file mode 100644 index b715fc3410a..00000000000 --- a/spec/lib/gitlab/git/attributes_spec.rb +++ /dev/null @@ -1,150 +0,0 @@ -require 'spec_helper' - -describe Gitlab::Git::Attributes, seed_helper: true do - let(:path) do - File.join(SEED_STORAGE_PATH, 'with-git-attributes.git') - end - - subject { described_class.new(path) } - - describe '#attributes' do - context 'using a path with attributes' do - it 'returns the attributes as a Hash' do - expect(subject.attributes('test.txt')).to eq({ 'text' => true }) - end - - it 'returns a Hash containing multiple attributes' do - expect(subject.attributes('test.sh')) - .to eq({ 'eol' => 'lf', 'gitlab-language' => 'shell' }) - end - - it 'returns a Hash containing attributes for a file with multiple extensions' do - expect(subject.attributes('test.haml.html')) - .to eq({ 'gitlab-language' => 'haml' }) - end - - it 'returns a Hash containing attributes for a file in a directory' do - expect(subject.attributes('foo/bar.txt')).to eq({ 'foo' => true }) - end - - it 'returns a Hash containing attributes with query string parameters' do - expect(subject.attributes('foo.cgi')) - .to eq({ 'key' => 'value?p1=v1&p2=v2' }) - end - - it 'returns a Hash containing the attributes for an absolute path' do - expect(subject.attributes('/test.txt')).to eq({ 'text' => true }) - end - - it 'returns a Hash containing the attributes when a pattern is defined using an absolute path' do - # When a path is given without a leading slash it should still match - # patterns defined with a leading slash. - expect(subject.attributes('foo.png')) - .to eq({ 'gitlab-language' => 'png' }) - - expect(subject.attributes('/foo.png')) - .to eq({ 'gitlab-language' => 'png' }) - end - - it 'returns an empty Hash for a defined path without attributes' do - expect(subject.attributes('bla/bla.txt')).to eq({}) - end - - context 'when the "binary" option is set for a path' do - it 'returns true for the "binary" option' do - expect(subject.attributes('test.binary')['binary']).to eq(true) - end - - it 'returns false for the "diff" option' do - expect(subject.attributes('test.binary')['diff']).to eq(false) - end - end - end - - context 'using a path without any attributes' do - it 'returns an empty Hash' do - expect(subject.attributes('test.foo')).to eq({}) - end - end - end - - describe '#patterns' do - it 'parses a file with entries' do - expect(subject.patterns).to be_an_instance_of(Hash) - end - - it 'parses an entry that uses a tab to separate the pattern and attributes' do - expect(subject.patterns[File.join(path, '*.md')]) - .to eq({ 'gitlab-language' => 'markdown' }) - end - - it 'stores patterns in reverse order' do - first = subject.patterns.to_a[0] - - expect(first[0]).to eq(File.join(path, 'bla/bla.txt')) - end - - # It's a bit hard to test for something _not_ being processed. As such we'll - # just test the number of entries. - it 'ignores any comments and empty lines' do - expect(subject.patterns.length).to eq(10) - end - - it 'does not parse anything when the attributes file does not exist' do - expect(File).to receive(:exist?) - .with(File.join(path, 'info/attributes')) - .and_return(false) - - expect(subject.patterns).to eq({}) - end - end - - describe '#parse_attributes' do - it 'parses a boolean attribute' do - expect(subject.parse_attributes('text')).to eq({ 'text' => true }) - end - - it 'parses a negated boolean attribute' do - expect(subject.parse_attributes('-text')).to eq({ 'text' => false }) - end - - it 'parses a key-value pair' do - expect(subject.parse_attributes('foo=bar')).to eq({ 'foo' => 'bar' }) - end - - it 'parses multiple attributes' do - input = 'boolean key=value -negated' - - expect(subject.parse_attributes(input)) - .to eq({ 'boolean' => true, 'key' => 'value', 'negated' => false }) - end - - it 'parses attributes with query string parameters' do - expect(subject.parse_attributes('foo=bar?baz=1')) - .to eq({ 'foo' => 'bar?baz=1' }) - end - end - - describe '#each_line' do - it 'iterates over every line in the attributes file' do - args = [String] * 14 # the number of lines in the file - - expect { |b| subject.each_line(&b) }.to yield_successive_args(*args) - end - - it 'does not yield when the attributes file does not exist' do - expect(File).to receive(:exist?) - .with(File.join(path, 'info/attributes')) - .and_return(false) - - expect { |b| subject.each_line(&b) }.not_to yield_control - end - - it 'does not yield when the attributes file has an unsupported encoding' do - path = File.join(SEED_STORAGE_PATH, 'with-invalid-git-attributes.git') - attrs = described_class.new(path) - - expect { |b| attrs.each_line(&b) }.not_to yield_control - end - end -end diff --git a/spec/lib/gitlab/git/info_attributes_spec.rb b/spec/lib/gitlab/git/info_attributes_spec.rb new file mode 100644 index 00000000000..ea84909c3e0 --- /dev/null +++ b/spec/lib/gitlab/git/info_attributes_spec.rb @@ -0,0 +1,43 @@ +require 'spec_helper' + +describe Gitlab::Git::InfoAttributes, seed_helper: true do + let(:path) do + File.join(SEED_STORAGE_PATH, 'with-git-attributes.git') + end + + subject { described_class.new(path) } + + describe '#attributes' do + context 'using a path with attributes' do + it 'returns the attributes as a Hash' do + expect(subject.attributes('test.txt')).to eq({ 'text' => true }) + end + + it 'returns an empty Hash for a defined path without attributes' do + expect(subject.attributes('bla/bla.txt')).to eq({}) + end + end + end + + describe '#parser' do + it 'parses a file with entries' do + expect(subject.patterns).to be_an_instance_of(Hash) + expect(subject.patterns["/*.txt"]).to eq({ 'text' => true }) + end + + it 'does not parse anything when the attributes file does not exist' do + expect(File).to receive(:exist?) + .with(File.join(path, 'info/attributes')) + .and_return(false) + + expect(subject.patterns).to eq({}) + end + + it 'does not parse attributes files with unsupported encoding' do + path = File.join(SEED_STORAGE_PATH, 'with-invalid-git-attributes.git') + subject = described_class.new(path) + + expect(subject.patterns).to eq({}) + end + end +end diff --git a/spec/support/test_env.rb b/spec/support/test_env.rb index 664698fcbaf..a00ef543128 100644 --- a/spec/support/test_env.rb +++ b/spec/support/test_env.rb @@ -20,7 +20,7 @@ module TestEnv 'improve/awesome' => '5937ac0', 'merged-target' => '21751bf', 'markdown' => '0ed8c6c', - 'lfs' => 'be93687', + 'lfs' => '55bc176', 'master' => 'b83d6e3', 'merge-test' => '5937ac0', "'test'" => 'e56497b', -- cgit v1.2.1