diff options
-rw-r--r-- | GITALY_SERVER_VERSION | 2 | ||||
-rw-r--r-- | app/models/merge_request_diff.rb | 4 | ||||
-rw-r--r-- | app/models/note_diff_file.rb | 4 | ||||
-rw-r--r-- | app/services/projects/cleanup_service.rb | 47 | ||||
-rw-r--r-- | changelogs/unreleased/30093-apply-bfg-object-map-to-database.yml | 5 | ||||
-rw-r--r-- | doc/user/project/repository/reducing_the_repo_size_using_git.md | 6 | ||||
-rw-r--r-- | lib/gitlab/discussions_diff/highlight_cache.rb | 13 | ||||
-rw-r--r-- | lib/gitlab/git/repository_cleaner.rb | 4 | ||||
-rw-r--r-- | lib/gitlab/gitaly_client/cleanup_service.rb | 33 | ||||
-rw-r--r-- | spec/lib/gitlab/discussions_diff/highlight_cache_spec.rb | 110 | ||||
-rw-r--r-- | spec/lib/gitlab/git/repository_cleaner_spec.rb | 71 | ||||
-rw-r--r-- | spec/lib/gitlab/gitaly_client/cleanup_service_spec.rb | 10 | ||||
-rw-r--r-- | spec/models/note_diff_file_spec.rb | 27 | ||||
-rw-r--r-- | spec/services/projects/cleanup_service_spec.rb | 89 |
14 files changed, 302 insertions, 123 deletions
diff --git a/GITALY_SERVER_VERSION b/GITALY_SERVER_VERSION index 32b7211cb61..7d47e599800 100644 --- a/GITALY_SERVER_VERSION +++ b/GITALY_SERVER_VERSION @@ -1 +1 @@ -1.40.0 +1.41.0 diff --git a/app/models/merge_request_diff.rb b/app/models/merge_request_diff.rb index 5f5d92bc2f0..f45bd0e03de 100644 --- a/app/models/merge_request_diff.rb +++ b/app/models/merge_request_diff.rb @@ -51,6 +51,10 @@ class MergeRequestDiff < ApplicationRecord joins(:merge_request_diff_commits).where(merge_request_diff_commits: { sha: sha }).reorder(nil) end + scope :by_project_id, -> (project_id) do + joins(:merge_request).where(merge_requests: { target_project_id: project_id }) + end + scope :recent, -> { order(id: :desc).limit(100) } scope :files_in_database, -> { where(stored_externally: [false, nil]) } diff --git a/app/models/note_diff_file.rb b/app/models/note_diff_file.rb index 9afb94c869a..fcc9e2b3fd8 100644 --- a/app/models/note_diff_file.rb +++ b/app/models/note_diff_file.rb @@ -7,6 +7,10 @@ class NoteDiffFile < ApplicationRecord joins(:diff_note).where("resolved_at IS NULL OR noteable_type = 'Commit'") end + scope :referencing_sha, -> (oids, project_id:) do + joins(:diff_note).where(notes: { project_id: project_id, commit_id: oids }) + end + delegate :original_position, :project, to: :diff_note belongs_to :diff_note, inverse_of: :note_diff_file diff --git a/app/services/projects/cleanup_service.rb b/app/services/projects/cleanup_service.rb index 12103ea34b5..5972bfd4071 100644 --- a/app/services/projects/cleanup_service.rb +++ b/app/services/projects/cleanup_service.rb @@ -18,9 +18,6 @@ module Projects # per rewritten object, with the old and new SHAs space-separated. It can be # used to update or remove content that references the objects that BFG has # altered - # - # Currently, only the project repository is modified by this service, but we - # may wish to modify other data sources in the future. def execute apply_bfg_object_map! @@ -41,10 +38,52 @@ module Projects raise NoUploadError unless project.bfg_object_map.exists? project.bfg_object_map.open do |io| - repository_cleaner.apply_bfg_object_map(io) + repository_cleaner.apply_bfg_object_map_stream(io) do |response| + cleanup_diffs(response) + end + end + end + + def cleanup_diffs(response) + old_commit_shas = extract_old_commit_shas(response.entries) + + ActiveRecord::Base.transaction do + cleanup_merge_request_diffs(old_commit_shas) + cleanup_note_diff_files(old_commit_shas) end end + def extract_old_commit_shas(batch) + batch.lazy.select { |entry| entry.type == :COMMIT }.map(&:old_oid).force + end + + def cleanup_merge_request_diffs(old_commit_shas) + merge_request_diffs = MergeRequestDiff + .by_project_id(project.id) + .by_commit_sha(old_commit_shas) + + # It's important to run the ActiveRecord callbacks here + merge_request_diffs.destroy_all # rubocop:disable Cop/DestroyAll + + # TODO: ensure the highlight cache is removed immediately. It's too hard + # to calculate the Redis keys at present. + # + # https://gitlab.com/gitlab-org/gitlab-ce/issues/61115 + end + + def cleanup_note_diff_files(old_commit_shas) + # Pluck the IDs instead of running the query twice to ensure we clear the + # cache for exactly the note diffs we remove + ids = NoteDiffFile + .referencing_sha(old_commit_shas, project_id: project.id) + .pluck_primary_key + + NoteDiffFile.id_in(ids).delete_all + + # A highlighted version of the diff is stored in redis. Remove it now. + Gitlab::DiscussionsDiff::HighlightCache.clear_multiple(ids) + end + def repository_cleaner @repository_cleaner ||= Gitlab::Git::RepositoryCleaner.new(repository.raw) end diff --git a/changelogs/unreleased/30093-apply-bfg-object-map-to-database.yml b/changelogs/unreleased/30093-apply-bfg-object-map-to-database.yml new file mode 100644 index 00000000000..ec851dfcacc --- /dev/null +++ b/changelogs/unreleased/30093-apply-bfg-object-map-to-database.yml @@ -0,0 +1,5 @@ +--- +title: Remove cleaned up OIDs from database and cache +merge_request: 26555 +author: +type: added diff --git a/doc/user/project/repository/reducing_the_repo_size_using_git.md b/doc/user/project/repository/reducing_the_repo_size_using_git.md index 672567a8d7d..2339759ecc8 100644 --- a/doc/user/project/repository/reducing_the_repo_size_using_git.md +++ b/doc/user/project/repository/reducing_the_repo_size_using_git.md @@ -98,6 +98,12 @@ up its own internal state, maximizing the space saved. `git gc` against the repository. You will receive an email once it has completed. +This process will remove some copies of the rewritten commits from GitLab's +cache and database, but there are still numerous gaps in coverage - at present, +some of the copies may persist indefinitely. [Clearing the instance cache] +(../../../administration/raketasks/maintenance.md#clear-redis-cache) may help to +remove some of them, but it should not be depended on for security purposes! + ## Using `git filter-branch` 1. Navigate to your repository: diff --git a/lib/gitlab/discussions_diff/highlight_cache.rb b/lib/gitlab/discussions_diff/highlight_cache.rb index 270cfb89488..369c6b87fb4 100644 --- a/lib/gitlab/discussions_diff/highlight_cache.rb +++ b/lib/gitlab/discussions_diff/highlight_cache.rb @@ -52,6 +52,19 @@ module Gitlab end end + # Clears multiple cache keys at once. + # + # raw_keys - An Array of unique cache keys, without namespaces. + # + # It returns the number of cache keys cleared. Ex.: 42 + def clear_multiple(raw_keys) + return [] if raw_keys.empty? + + keys = raw_keys.map { |id| cache_key_for(id) } + + Redis::Cache.with { |redis| redis.del(keys) } + end + def cache_key_for(raw_key) "#{cache_key_prefix}:#{raw_key}" end diff --git a/lib/gitlab/git/repository_cleaner.rb b/lib/gitlab/git/repository_cleaner.rb index 2d1d8435cf3..9dd0ddfb44b 100644 --- a/lib/gitlab/git/repository_cleaner.rb +++ b/lib/gitlab/git/repository_cleaner.rb @@ -12,9 +12,9 @@ module Gitlab @repository = repository end - def apply_bfg_object_map(io) + def apply_bfg_object_map_stream(io, &blk) wrapped_gitaly_errors do - gitaly_cleanup_client.apply_bfg_object_map(io) + gitaly_cleanup_client.apply_bfg_object_map_stream(io, &blk) end end diff --git a/lib/gitlab/gitaly_client/cleanup_service.rb b/lib/gitlab/gitaly_client/cleanup_service.rb index 3e8d6a773ca..a56bc35f6d7 100644 --- a/lib/gitlab/gitaly_client/cleanup_service.rb +++ b/lib/gitlab/gitaly_client/cleanup_service.rb @@ -12,25 +12,32 @@ module Gitlab @storage = repository.storage end - def apply_bfg_object_map(io) - first_request = Gitaly::ApplyBfgObjectMapRequest.new(repository: gitaly_repo) + def apply_bfg_object_map_stream(io, &blk) + responses = GitalyClient.call( + storage, + :cleanup_service, + :apply_bfg_object_map_stream, + build_object_map_enum(io), + timeout: GitalyClient.no_timeout + ) + + responses.each(&blk) + end + + private - enum = Enumerator.new do |y| - y.yield first_request + def build_object_map_enum(io) + Enumerator.new do |y| + # First request. For simplicity, doesn't include any object map data + y << Gitaly::ApplyBfgObjectMapStreamRequest.new(repository: gitaly_repo) + # Now stream the BFG object map file to gitaly in chunks while data = io.read(RepositoryService::MAX_MSG_SIZE) - y.yield Gitaly::ApplyBfgObjectMapRequest.new(object_map: data) + y << Gitaly::ApplyBfgObjectMapStreamRequest.new(object_map: data) + break if io&.eof? end end - - GitalyClient.call( - storage, - :cleanup_service, - :apply_bfg_object_map, - enum, - timeout: GitalyClient.no_timeout - ) end end end diff --git a/spec/lib/gitlab/discussions_diff/highlight_cache_spec.rb b/spec/lib/gitlab/discussions_diff/highlight_cache_spec.rb index fe26ebb8796..15ee8c40b55 100644 --- a/spec/lib/gitlab/discussions_diff/highlight_cache_spec.rb +++ b/spec/lib/gitlab/discussions_diff/highlight_cache_spec.rb @@ -3,31 +3,32 @@ require 'spec_helper' describe Gitlab::DiscussionsDiff::HighlightCache, :clean_gitlab_redis_cache do + def fake_file(offset) + { + text: 'foo', + type: 'new', + index: 2 + offset, + old_pos: 10 + offset, + new_pos: 11 + offset, + line_code: 'xpto', + rich_text: '<blips>blops</blips>' + } + end + + let(:mapping) do + { + 3 => [ + fake_file(0), + fake_file(1) + ], + 4 => [ + fake_file(2) + ] + } + end + describe '#write_multiple' do it 'sets multiple keys serializing content as JSON' do - mapping = { - 3 => [ - { - text: 'foo', - type: 'new', - index: 2, - old_pos: 10, - new_pos: 11, - line_code: 'xpto', - rich_text: '<blips>blops</blips>' - }, - { - text: 'foo', - type: 'new', - index: 3, - old_pos: 11, - new_pos: 12, - line_code: 'xpto', - rich_text: '<blops>blips</blops>' - } - ] - } - described_class.write_multiple(mapping) mapping.each do |key, value| @@ -41,53 +42,16 @@ describe Gitlab::DiscussionsDiff::HighlightCache, :clean_gitlab_redis_cache do describe '#read_multiple' do it 'reads multiple keys and serializes content into Gitlab::Diff::Line objects' do - mapping = { - 3 => [ - { - text: 'foo', - type: 'new', - index: 2, - old_pos: 11, - new_pos: 12, - line_code: 'xpto', - rich_text: '<blips>blops</blips>' - }, - { - text: 'foo', - type: 'new', - index: 3, - old_pos: 10, - new_pos: 11, - line_code: 'xpto', - rich_text: '<blips>blops</blips>' - } - ] - } - described_class.write_multiple(mapping) found = described_class.read_multiple(mapping.keys) - expect(found.size).to eq(1) + expect(found.size).to eq(2) expect(found.first.size).to eq(2) expect(found.first).to all(be_a(Gitlab::Diff::Line)) end it 'returns nil when cached key is not found' do - mapping = { - 3 => [ - { - text: 'foo', - type: 'new', - index: 2, - old_pos: 11, - new_pos: 12, - line_code: 'xpto', - rich_text: '<blips>blops</blips>' - } - ] - } - described_class.write_multiple(mapping) found = described_class.read_multiple([2, 3]) @@ -95,8 +59,30 @@ describe Gitlab::DiscussionsDiff::HighlightCache, :clean_gitlab_redis_cache do expect(found.size).to eq(2) expect(found.first).to eq(nil) - expect(found.second.size).to eq(1) + expect(found.second.size).to eq(2) expect(found.second).to all(be_a(Gitlab::Diff::Line)) end end + + describe '#clear_multiple' do + it 'removes all named keys' do + described_class.write_multiple(mapping) + + described_class.clear_multiple(mapping.keys) + + expect(described_class.read_multiple(mapping.keys)).to all(be_nil) + end + + it 'only removed named keys' do + to_clear, to_leave = mapping.keys + + described_class.write_multiple(mapping) + described_class.clear_multiple([to_clear]) + + cleared, left = described_class.read_multiple([to_clear, to_leave]) + + expect(cleared).to be_nil + expect(left).to all(be_a(Gitlab::Diff::Line)) + end + end end diff --git a/spec/lib/gitlab/git/repository_cleaner_spec.rb b/spec/lib/gitlab/git/repository_cleaner_spec.rb index 6602f22843f..7bba0107e58 100644 --- a/spec/lib/gitlab/git/repository_cleaner_spec.rb +++ b/spec/lib/gitlab/git/repository_cleaner_spec.rb @@ -6,55 +6,62 @@ describe Gitlab::Git::RepositoryCleaner do let(:project) { create(:project, :repository) } let(:repository) { project.repository } let(:head_sha) { repository.head_commit.id } - let(:object_map_data) { "#{head_sha} #{'0' * 40}" } + let(:object_map_data) { "#{head_sha} #{Gitlab::Git::BLANK_SHA}" } - subject(:cleaner) { described_class.new(repository.raw) } + let(:clean_refs) { %W[refs/environments/1 refs/merge-requests/1 refs/keep-around/#{head_sha}] } + let(:keep_refs) { %w[refs/heads/_keep refs/tags/_keep] } - describe '#apply_bfg_object_map' do - let(:clean_refs) { %W[refs/environments/1 refs/merge-requests/1 refs/keep-around/#{head_sha}] } - let(:keep_refs) { %w[refs/heads/_keep refs/tags/_keep] } + subject(:cleaner) { described_class.new(repository.raw) } + shared_examples_for '#apply_bfg_object_map_stream' do before do (clean_refs + keep_refs).each { |ref| repository.create_ref(head_sha, ref) } end - context 'from StringIO' do - let(:object_map) { StringIO.new(object_map_data) } + it 'removes internal references' do + entries = [] - it 'removes internal references' do - cleaner.apply_bfg_object_map(object_map) + cleaner.apply_bfg_object_map_stream(object_map) do |rsp| + entries.concat(rsp.entries) + end - aggregate_failures do - clean_refs.each { |ref| expect(repository.ref_exists?(ref)).to be_falsy } - keep_refs.each { |ref| expect(repository.ref_exists?(ref)).to be_truthy } - end + aggregate_failures do + clean_refs.each { |ref| expect(repository.ref_exists?(ref)).to be(false) } + keep_refs.each { |ref| expect(repository.ref_exists?(ref)).to be(true) } + + expect(entries).to contain_exactly( + Gitaly::ApplyBfgObjectMapStreamResponse::Entry.new( + type: :COMMIT, + old_oid: head_sha, + new_oid: Gitlab::Git::BLANK_SHA + ) + ) end end + end - context 'from Gitlab::HttpIO' do - let(:url) { 'http://example.com/bfg_object_map.txt' } - let(:tempfile) { Tempfile.new } - let(:object_map) { Gitlab::HttpIO.new(url, object_map_data.size) } + describe '#apply_bfg_object_map_stream (from StringIO)' do + let(:object_map) { StringIO.new(object_map_data) } - around do |example| - tempfile.write(object_map_data) - tempfile.close + include_examples '#apply_bfg_object_map_stream' + end - example.run - ensure - tempfile.unlink - end + describe '#apply_bfg_object_map_stream (from Gitlab::HttpIO)' do + let(:url) { 'http://example.com/bfg_object_map.txt' } + let(:tempfile) { Tempfile.new } + let(:object_map) { Gitlab::HttpIO.new(url, object_map_data.size) } - it 'removes internal references' do - stub_remote_url_200(url, tempfile.path) + around do |example| + tempfile.write(object_map_data) + tempfile.close - cleaner.apply_bfg_object_map(object_map) + stub_remote_url_200(url, tempfile.path) - aggregate_failures do - clean_refs.each { |ref| expect(repository.ref_exists?(ref)).to be_falsy } - keep_refs.each { |ref| expect(repository.ref_exists?(ref)).to be_truthy } - end - end + example.run + ensure + tempfile.unlink end + + include_examples '#apply_bfg_object_map_stream' end end diff --git a/spec/lib/gitlab/gitaly_client/cleanup_service_spec.rb b/spec/lib/gitlab/gitaly_client/cleanup_service_spec.rb index 369deff732a..c42332dc27b 100644 --- a/spec/lib/gitlab/gitaly_client/cleanup_service_spec.rb +++ b/spec/lib/gitlab/gitaly_client/cleanup_service_spec.rb @@ -6,14 +6,14 @@ describe Gitlab::GitalyClient::CleanupService do let(:relative_path) { project.disk_path + '.git' } let(:client) { described_class.new(project.repository) } - describe '#apply_bfg_object_map' do - it 'sends an apply_bfg_object_map message' do + describe '#apply_bfg_object_map_stream' do + it 'sends an apply_bfg_object_map_stream message' do expect_any_instance_of(Gitaly::CleanupService::Stub) - .to receive(:apply_bfg_object_map) + .to receive(:apply_bfg_object_map_stream) .with(kind_of(Enumerator), kind_of(Hash)) - .and_return(double) + .and_return([]) - client.apply_bfg_object_map(StringIO.new) + client.apply_bfg_object_map_stream(StringIO.new) end end end diff --git a/spec/models/note_diff_file_spec.rb b/spec/models/note_diff_file_spec.rb index 99eeac8d778..b15bedd257e 100644 --- a/spec/models/note_diff_file_spec.rb +++ b/spec/models/note_diff_file_spec.rb @@ -10,4 +10,31 @@ describe NoteDiffFile do describe 'validations' do it { is_expected.to validate_presence_of(:diff_note) } end + + describe '.referencing_sha' do + let!(:diff_note) { create(:diff_note_on_commit) } + + let(:note_diff_file) { diff_note.note_diff_file } + let(:project) { diff_note.project } + + it 'finds note diff files by project and sha' do + found = described_class.referencing_sha(diff_note.commit_id, project_id: project.id) + + expect(found).to contain_exactly(note_diff_file) + end + + it 'excludes note diff files with the wrong project' do + other_project = create(:project) + + found = described_class.referencing_sha(diff_note.commit_id, project_id: other_project.id) + + expect(found).to be_empty + end + + it 'excludes note diff files with the wrong sha' do + found = described_class.referencing_sha(Gitlab::Git::BLANK_SHA, project_id: project.id) + + expect(found).to be_empty + end + end end diff --git a/spec/services/projects/cleanup_service_spec.rb b/spec/services/projects/cleanup_service_spec.rb index 29eabc86327..5c246854eb7 100644 --- a/spec/services/projects/cleanup_service_spec.rb +++ b/spec/services/projects/cleanup_service_spec.rb @@ -6,13 +6,13 @@ describe Projects::CleanupService do let(:project) { create(:project, :repository, bfg_object_map: fixture_file_upload('spec/fixtures/bfg_object_map.txt')) } let(:object_map) { project.bfg_object_map } + let(:cleaner) { service.__send__(:repository_cleaner) } + subject(:service) { described_class.new(project) } describe '#execute' do - it 'runs the apply_bfg_object_map gitaly RPC' do - expect_next_instance_of(Gitlab::Git::RepositoryCleaner) do |cleaner| - expect(cleaner).to receive(:apply_bfg_object_map).with(kind_of(IO)) - end + it 'runs the apply_bfg_object_map_stream gitaly RPC' do + expect(cleaner).to receive(:apply_bfg_object_map_stream).with(kind_of(IO)) service.execute end @@ -37,10 +37,91 @@ describe Projects::CleanupService do expect(object_map.exists?).to be_falsy end + context 'with a tainted merge request diff' do + let(:merge_request) { create(:merge_request, source_project: project, target_project: project) } + let(:diff) { merge_request.merge_request_diff } + let(:entry) { build_entry(diff.commits.first.id) } + + before do + allow(cleaner) + .to receive(:apply_bfg_object_map_stream) + .and_yield(Gitaly::ApplyBfgObjectMapStreamResponse.new(entries: [entry])) + end + + it 'removes the tainted commit from the database' do + service.execute + + expect(MergeRequestDiff.exists?(diff.id)).to be_falsy + end + + it 'ignores non-commit responses from Gitaly' do + entry.type = :UNKNOWN + + service.execute + + expect(MergeRequestDiff.exists?(diff.id)).to be_truthy + end + end + + context 'with a tainted diff note' do + let(:diff_note) { create(:diff_note_on_commit, project: project) } + let(:note_diff_file) { diff_note.note_diff_file } + let(:entry) { build_entry(diff_note.commit_id) } + + let(:highlight_cache) { Gitlab::DiscussionsDiff::HighlightCache } + let(:cache_id) { note_diff_file.id } + + before do + allow(cleaner) + .to receive(:apply_bfg_object_map_stream) + .and_yield(Gitaly::ApplyBfgObjectMapStreamResponse.new(entries: [entry])) + end + + it 'removes the tainted commit from the database' do + service.execute + + expect(NoteDiffFile.exists?(note_diff_file.id)).to be_falsy + end + + it 'removes the highlight cache from redis' do + write_cache(highlight_cache, cache_id, [{}]) + + expect(read_cache(highlight_cache, cache_id)).not_to be_nil + + service.execute + + expect(read_cache(highlight_cache, cache_id)).to be_nil + end + + it 'ignores non-commit responses from Gitaly' do + entry.type = :UNKNOWN + + service.execute + + expect(NoteDiffFile.exists?(note_diff_file.id)).to be_truthy + end + end + it 'raises an error if no object map can be found' do object_map.remove! expect { service.execute }.to raise_error(described_class::NoUploadError) end end + + def build_entry(old_oid) + Gitaly::ApplyBfgObjectMapStreamResponse::Entry.new( + type: :COMMIT, + old_oid: old_oid, + new_oid: Gitlab::Git::BLANK_SHA + ) + end + + def read_cache(cache, key) + cache.read_multiple([key]).first + end + + def write_cache(cache, key, value) + cache.write_multiple(key => value) + end end |