diff options
author | Kamil Trzciński <ayufan@ayufan.eu> | 2018-07-09 13:34:18 +0200 |
---|---|---|
committer | Kamil Trzciński <ayufan@ayufan.eu> | 2018-07-09 14:19:52 +0200 |
commit | bc00803af03147452c12e9e2c7e8f0c0cba86f73 (patch) | |
tree | 5d360cbb1422b7d063bffa4fe40bcf1e64b61db7 | |
parent | 67157de1e4cc482b5321ba2f246bfd80a7893f93 (diff) | |
download | gitlab-ce-bc00803af03147452c12e9e2c7e8f0c0cba86f73.tar.gz |
Access metadata directly from Object Storage
Previously we would pull the file, now, we just stream-it as needed from Object Storage
-rw-r--r-- | app/models/ci/build.rb | 4 | ||||
-rw-r--r-- | app/uploaders/gitlab_uploader.rb | 17 | ||||
-rw-r--r-- | app/uploaders/job_artifact_uploader.rb | 8 | ||||
-rw-r--r-- | changelogs/unreleased/improve-metadata-access-performance.yml | 5 | ||||
-rw-r--r-- | lib/gitlab/ci/build/artifacts/metadata.rb | 19 | ||||
-rw-r--r-- | lib/gitlab/ci/trace/http_io.rb | 197 | ||||
-rw-r--r-- | lib/gitlab/http_io.rb | 193 | ||||
-rw-r--r-- | spec/controllers/projects/jobs_controller_spec.rb | 2 | ||||
-rw-r--r-- | spec/lib/gitlab/ci/build/artifacts/metadata_spec.rb | 24 | ||||
-rw-r--r-- | spec/lib/gitlab/http_io_spec.rb (renamed from spec/lib/gitlab/ci/trace/http_io_spec.rb) | 2 | ||||
-rw-r--r-- | spec/support/http_io/http_io_helpers.rb | 4 | ||||
-rw-r--r-- | spec/uploaders/job_artifact_uploader_spec.rb | 2 |
12 files changed, 258 insertions, 219 deletions
diff --git a/app/models/ci/build.rb b/app/models/ci/build.rb index 19949f83351..acf555a5369 100644 --- a/app/models/ci/build.rb +++ b/app/models/ci/build.rb @@ -437,9 +437,9 @@ module Ci end def artifacts_metadata_entry(path, **options) - artifacts_metadata.use_file do |metadata_path| + artifacts_metadata.open do |metadata_stream| metadata = Gitlab::Ci::Build::Artifacts::Metadata.new( - metadata_path, + metadata_stream, path, **options) diff --git a/app/uploaders/gitlab_uploader.rb b/app/uploaders/gitlab_uploader.rb index 7919f126075..7636acf255c 100644 --- a/app/uploaders/gitlab_uploader.rb +++ b/app/uploaders/gitlab_uploader.rb @@ -71,6 +71,23 @@ class GitlabUploader < CarrierWave::Uploader::Base File.join('/', self.class.base_dir, dynamic_segment, filename) end + def open + stream = if file_storage? + File.open(path, "rb") if path + else + ::Gitlab::HttpIO.new(url, cached_size) if url + end + + return unless stream + return stream unless block_given? + + begin + yield(stream) + ensure + stream.close + end + end + private # Designed to be overridden by child uploaders that have a dynamic path diff --git a/app/uploaders/job_artifact_uploader.rb b/app/uploaders/job_artifact_uploader.rb index 855cf2fc21c..f6af023e0f9 100644 --- a/app/uploaders/job_artifact_uploader.rb +++ b/app/uploaders/job_artifact_uploader.rb @@ -18,14 +18,6 @@ class JobArtifactUploader < GitlabUploader dynamic_segment end - def open - if file_storage? - File.open(path, "rb") if path - else - ::Gitlab::Ci::Trace::HttpIO.new(url, cached_size) if url - end - end - private def dynamic_segment diff --git a/changelogs/unreleased/improve-metadata-access-performance.yml b/changelogs/unreleased/improve-metadata-access-performance.yml new file mode 100644 index 00000000000..b16fa99dd3b --- /dev/null +++ b/changelogs/unreleased/improve-metadata-access-performance.yml @@ -0,0 +1,5 @@ +--- +title: Access metadata directly from Object Storage +merge_request: +author: +type: performance diff --git a/lib/gitlab/ci/build/artifacts/metadata.rb b/lib/gitlab/ci/build/artifacts/metadata.rb index 0bbd60d8ffe..375d8bc1ff5 100644 --- a/lib/gitlab/ci/build/artifacts/metadata.rb +++ b/lib/gitlab/ci/build/artifacts/metadata.rb @@ -7,14 +7,15 @@ module Gitlab module Artifacts class Metadata ParserError = Class.new(StandardError) + InvalidStreamError = Class.new(StandardError) VERSION_PATTERN = /^[\w\s]+(\d+\.\d+\.\d+)/ INVALID_PATH_PATTERN = %r{(^\.?\.?/)|(/\.?\.?/)} - attr_reader :file, :path, :full_version + attr_reader :stream, :path, :full_version - def initialize(file, path, **opts) - @file, @path, @opts = file, path, opts + def initialize(stream, path, **opts) + @stream, @path, @opts = stream, path, opts @full_version = read_version end @@ -103,7 +104,17 @@ module Gitlab end def gzip(&block) - Zlib::GzipReader.open(@file, &block) + raise InvalidStreamError, "Invalid stream" unless @stream + + # restart gzip reading + @stream.seek(0) + + gz = Zlib::GzipReader.new(@stream) + yield(gz) + rescue Zlib::Error => e + raise InvalidStreamError, e.message + ensure + gz&.finish end end end diff --git a/lib/gitlab/ci/trace/http_io.rb b/lib/gitlab/ci/trace/http_io.rb deleted file mode 100644 index 8788af57a67..00000000000 --- a/lib/gitlab/ci/trace/http_io.rb +++ /dev/null @@ -1,197 +0,0 @@ -## -# This class is compatible with IO class (https://ruby-doc.org/core-2.3.1/IO.html) -# source: https://gitlab.com/snippets/1685610 -module Gitlab - module Ci - class Trace - class HttpIO - BUFFER_SIZE = 128.kilobytes - - InvalidURLError = Class.new(StandardError) - FailedToGetChunkError = Class.new(StandardError) - - attr_reader :uri, :size - attr_reader :tell - attr_reader :chunk, :chunk_range - - alias_method :pos, :tell - - def initialize(url, size) - raise InvalidURLError unless ::Gitlab::UrlSanitizer.valid?(url) - - @uri = URI(url) - @size = size - @tell = 0 - end - - def close - # no-op - end - - def binmode - # no-op - end - - def binmode? - true - end - - def path - nil - end - - def url - @uri.to_s - end - - def seek(pos, where = IO::SEEK_SET) - new_pos = - case where - when IO::SEEK_END - size + pos - when IO::SEEK_SET - pos - when IO::SEEK_CUR - tell + pos - else - -1 - end - - raise 'new position is outside of file' if new_pos < 0 || new_pos > size - - @tell = new_pos - end - - def eof? - tell == size - end - - def each_line - until eof? - line = readline - break if line.nil? - - yield(line) - end - end - - def read(length = nil, outbuf = "") - out = "" - - length ||= size - tell - - until length <= 0 || eof? - data = get_chunk - break if data.empty? - - chunk_bytes = [BUFFER_SIZE - chunk_offset, length].min - chunk_data = data.byteslice(0, chunk_bytes) - - out << chunk_data - @tell += chunk_data.bytesize - length -= chunk_data.bytesize - end - - # If outbuf is passed, we put the output into the buffer. This supports IO.copy_stream functionality - if outbuf - outbuf.slice!(0, outbuf.bytesize) - outbuf << out - end - - out - end - - def readline - out = "" - - until eof? - data = get_chunk - new_line = data.index("\n") - - if !new_line.nil? - out << data[0..new_line] - @tell += new_line + 1 - break - else - out << data - @tell += data.bytesize - end - end - - out - end - - def write(data) - raise NotImplementedError - end - - def truncate(offset) - raise NotImplementedError - end - - def flush - raise NotImplementedError - end - - def present? - true - end - - private - - ## - # The below methods are not implemented in IO class - # - def in_range? - @chunk_range&.include?(tell) - end - - def get_chunk - unless in_range? - response = Net::HTTP.start(uri.hostname, uri.port, proxy_from_env: true, use_ssl: uri.scheme == 'https') do |http| - http.request(request) - end - - raise FailedToGetChunkError unless response.code == '200' || response.code == '206' - - @chunk = response.body.force_encoding(Encoding::BINARY) - @chunk_range = response.content_range - - ## - # Note: If provider does not return content_range, then we set it as we requested - # Provider: minio - # - When the file size is larger than requested Content-range, the Content-range is included in responces with Net::HTTPPartialContent 206 - # - When the file size is smaller than requested Content-range, the Content-range is included in responces with Net::HTTPPartialContent 206 - # Provider: AWS - # - When the file size is larger than requested Content-range, the Content-range is included in responces with Net::HTTPPartialContent 206 - # - When the file size is smaller than requested Content-range, the Content-range is included in responces with Net::HTTPPartialContent 206 - # Provider: GCS - # - When the file size is larger than requested Content-range, the Content-range is included in responces with Net::HTTPPartialContent 206 - # - When the file size is smaller than requested Content-range, the Content-range is included in responces with Net::HTTPOK 200 - @chunk_range ||= (chunk_start...(chunk_start + @chunk.bytesize)) - end - - @chunk[chunk_offset..BUFFER_SIZE] - end - - def request - Net::HTTP::Get.new(uri).tap do |request| - request.set_range(chunk_start, BUFFER_SIZE) - end - end - - def chunk_offset - tell % BUFFER_SIZE - end - - def chunk_start - (tell / BUFFER_SIZE) * BUFFER_SIZE - end - - def chunk_end - [chunk_start + BUFFER_SIZE, size].min - end - end - end - end -end diff --git a/lib/gitlab/http_io.rb b/lib/gitlab/http_io.rb new file mode 100644 index 00000000000..ce24817db54 --- /dev/null +++ b/lib/gitlab/http_io.rb @@ -0,0 +1,193 @@ +## +# This class is compatible with IO class (https://ruby-doc.org/core-2.3.1/IO.html) +# source: https://gitlab.com/snippets/1685610 +module Gitlab + class HttpIO + BUFFER_SIZE = 128.kilobytes + + InvalidURLError = Class.new(StandardError) + FailedToGetChunkError = Class.new(StandardError) + + attr_reader :uri, :size + attr_reader :tell + attr_reader :chunk, :chunk_range + + alias_method :pos, :tell + + def initialize(url, size) + raise InvalidURLError unless ::Gitlab::UrlSanitizer.valid?(url) + + @uri = URI(url) + @size = size + @tell = 0 + end + + def close + # no-op + end + + def binmode + # no-op + end + + def binmode? + true + end + + def path + nil + end + + def url + @uri.to_s + end + + def seek(pos, where = IO::SEEK_SET) + new_pos = + case where + when IO::SEEK_END + size + pos + when IO::SEEK_SET + pos + when IO::SEEK_CUR + tell + pos + else + -1 + end + + raise 'new position is outside of file' if new_pos < 0 || new_pos > size + + @tell = new_pos + end + + def eof? + tell == size + end + + def each_line + until eof? + line = readline + break if line.nil? + + yield(line) + end + end + + def read(length = nil, outbuf = "") + out = "" + + length ||= size - tell + + until length <= 0 || eof? + data = get_chunk + break if data.empty? + + chunk_bytes = [BUFFER_SIZE - chunk_offset, length].min + chunk_data = data.byteslice(0, chunk_bytes) + + out << chunk_data + @tell += chunk_data.bytesize + length -= chunk_data.bytesize + end + + # If outbuf is passed, we put the output into the buffer. This supports IO.copy_stream functionality + if outbuf + outbuf.slice!(0, outbuf.bytesize) + outbuf << out + end + + out + end + + def readline + out = "" + + until eof? + data = get_chunk + new_line = data.index("\n") + + if !new_line.nil? + out << data[0..new_line] + @tell += new_line + 1 + break + else + out << data + @tell += data.bytesize + end + end + + out + end + + def write(data) + raise NotImplementedError + end + + def truncate(offset) + raise NotImplementedError + end + + def flush + raise NotImplementedError + end + + def present? + true + end + + private + + ## + # The below methods are not implemented in IO class + # + def in_range? + @chunk_range&.include?(tell) + end + + def get_chunk + unless in_range? + response = Net::HTTP.start(uri.hostname, uri.port, proxy_from_env: true, use_ssl: uri.scheme == 'https') do |http| + http.request(request) + end + + raise FailedToGetChunkError unless response.code == '200' || response.code == '206' + + @chunk = response.body.force_encoding(Encoding::BINARY) + @chunk_range = response.content_range + + ## + # Note: If provider does not return content_range, then we set it as we requested + # Provider: minio + # - When the file size is larger than requested Content-range, the Content-range is included in responces with Net::HTTPPartialContent 206 + # - When the file size is smaller than requested Content-range, the Content-range is included in responces with Net::HTTPPartialContent 206 + # Provider: AWS + # - When the file size is larger than requested Content-range, the Content-range is included in responces with Net::HTTPPartialContent 206 + # - When the file size is smaller than requested Content-range, the Content-range is included in responces with Net::HTTPPartialContent 206 + # Provider: GCS + # - When the file size is larger than requested Content-range, the Content-range is included in responces with Net::HTTPPartialContent 206 + # - When the file size is smaller than requested Content-range, the Content-range is included in responces with Net::HTTPOK 200 + @chunk_range ||= (chunk_start...(chunk_start + @chunk.bytesize)) + end + + @chunk[chunk_offset..BUFFER_SIZE] + end + + def request + Net::HTTP::Get.new(uri).tap do |request| + request.set_range(chunk_start, BUFFER_SIZE) + end + end + + def chunk_offset + tell % BUFFER_SIZE + end + + def chunk_start + (tell / BUFFER_SIZE) * BUFFER_SIZE + end + + def chunk_end + [chunk_start + BUFFER_SIZE, size].min + end + end +end diff --git a/spec/controllers/projects/jobs_controller_spec.rb b/spec/controllers/projects/jobs_controller_spec.rb index e6332a10265..5f53a2d087a 100644 --- a/spec/controllers/projects/jobs_controller_spec.rb +++ b/spec/controllers/projects/jobs_controller_spec.rb @@ -245,7 +245,7 @@ describe Projects::JobsController, :clean_gitlab_redis_shared_state do end it 'returns a trace' do - expect { get_trace }.to raise_error(Gitlab::Ci::Trace::HttpIO::FailedToGetChunkError) + expect { get_trace }.to raise_error(Gitlab::HttpIO::FailedToGetChunkError) end end end diff --git a/spec/lib/gitlab/ci/build/artifacts/metadata_spec.rb b/spec/lib/gitlab/ci/build/artifacts/metadata_spec.rb index 6a52ae01b2f..e327399d82d 100644 --- a/spec/lib/gitlab/ci/build/artifacts/metadata_spec.rb +++ b/spec/lib/gitlab/ci/build/artifacts/metadata_spec.rb @@ -2,13 +2,21 @@ require 'spec_helper' describe Gitlab::Ci::Build::Artifacts::Metadata do def metadata(path = '', **opts) - described_class.new(metadata_file_path, path, **opts) + described_class.new(metadata_file_stream, path, **opts) end let(:metadata_file_path) do Rails.root + 'spec/fixtures/ci_build_artifacts_metadata.gz' end + let(:metadata_file_stream) do + File.open(metadata_file_path) if metadata_file_path + end + + after do + metadata_file_stream&.close + end + context 'metadata file exists' do describe '#find_entries! empty string' do subject { metadata('').find_entries! } @@ -86,11 +94,21 @@ describe Gitlab::Ci::Build::Artifacts::Metadata do end context 'metadata file does not exist' do - let(:metadata_file_path) { '' } + let(:metadata_file_path) { nil } + + describe '#find_entries!' do + it 'raises error' do + expect { metadata.find_entries! }.to raise_error(described_class::InvalidStreamError, /Invalid stream/) + end + end + end + + context 'metadata file is invalid' do + let(:metadata_file_path) { Rails.root + 'spec/fixtures/ci_build_artifacts.zip' } describe '#find_entries!' do it 'raises error' do - expect { metadata.find_entries! }.to raise_error(Errno::ENOENT) + expect { metadata.find_entries! }.to raise_error(described_class::InvalidStreamError, /not in gzip format/) end end end diff --git a/spec/lib/gitlab/ci/trace/http_io_spec.rb b/spec/lib/gitlab/http_io_spec.rb index 5474e2f518c..2c9a73609a9 100644 --- a/spec/lib/gitlab/ci/trace/http_io_spec.rb +++ b/spec/lib/gitlab/http_io_spec.rb @@ -1,6 +1,6 @@ require 'spec_helper' -describe Gitlab::Ci::Trace::HttpIO do +describe Gitlab::HttpIO do include HttpIOHelpers let(:http_io) { described_class.new(url, size) } diff --git a/spec/support/http_io/http_io_helpers.rb b/spec/support/http_io/http_io_helpers.rb index 2c68c2cd9a6..86b254aa67d 100644 --- a/spec/support/http_io/http_io_helpers.rb +++ b/spec/support/http_io/http_io_helpers.rb @@ -54,12 +54,12 @@ module HttpIOHelpers def set_smaller_buffer_size_than(file_size) blocks = (file_size / 128) new_size = (blocks / 2) * 128 - stub_const("Gitlab::Ci::Trace::HttpIO::BUFFER_SIZE", new_size) + stub_const("Gitlab::HttpIO::BUFFER_SIZE", new_size) end def set_larger_buffer_size_than(file_size) blocks = (file_size / 128) new_size = (blocks * 2) * 128 - stub_const("Gitlab::Ci::Trace::HttpIO::BUFFER_SIZE", new_size) + stub_const("Gitlab::HttpIO::BUFFER_SIZE", new_size) end end diff --git a/spec/uploaders/job_artifact_uploader_spec.rb b/spec/uploaders/job_artifact_uploader_spec.rb index 026e4356ed6..d0b14768867 100644 --- a/spec/uploaders/job_artifact_uploader_spec.rb +++ b/spec/uploaders/job_artifact_uploader_spec.rb @@ -55,7 +55,7 @@ describe JobArtifactUploader do end it 'returns http io stream' do - is_expected.to be_a(Gitlab::Ci::Trace::HttpIO) + is_expected.to be_a(Gitlab::HttpIO) end end end |