diff options
author | Stan Hu <stanhu@gmail.com> | 2018-08-13 15:36:15 -0700 |
---|---|---|
committer | Stan Hu <stanhu@gmail.com> | 2018-08-17 16:43:58 -0700 |
commit | 5ffa7220130b57251187d9b953e6946754589503 (patch) | |
tree | 84e4a3fb4e1323691cb990b831ba10a7fff82a83 | |
parent | 74eec89ebe0b118f8aba6051f53d5dbe5e68c2d1 (diff) | |
download | gitlab-ce-sh-insert-git-data-in-separate-transaction.tar.gz |
Bitbucket Server importer: Eliminate most idle-in-transaction issues (branch: sh-insert-git-data-in-separate-transaction)
Just like the GitHub importer, the Bitbucket Server importer can hit the
default 60-second idle-in-transaction timeout if it takes too long to create a
merge request. We solve this by using the same approach as the GitHub importer:
1. Bypass all validation and hooks in creating a merge request
2. Insert the Git data in a separate transaction
Part of #50021
-rw-r--r-- | changelogs/unreleased/sh-insert-git-data-in-separate-transaction.yml | 5 | ||||
-rw-r--r-- | lib/gitlab/bitbucket_server_import/importer.rb | 16 | ||||
-rw-r--r-- | lib/gitlab/github_import.rb | 18 | ||||
-rw-r--r-- | lib/gitlab/github_import/importer/issue_importer.rb | 4 | ||||
-rw-r--r-- | lib/gitlab/github_import/importer/pull_request_importer.rb | 90 | ||||
-rw-r--r-- | lib/gitlab/import/database_helpers.rb | 25 | ||||
-rw-r--r-- | lib/gitlab/import/merge_request_creator.rb | 38 | ||||
-rw-r--r-- | lib/gitlab/import/merge_request_helpers.rb | 64 | ||||
-rw-r--r-- | spec/lib/gitlab/github_import/importer/issue_importer_spec.rb | 6 | ||||
-rw-r--r-- | spec/lib/gitlab/github_import/importer/pull_request_importer_spec.rb | 8 | ||||
-rw-r--r-- | spec/lib/gitlab/github_import_spec.rb | 33 | ||||
-rw-r--r-- | spec/lib/gitlab/import/database_helpers_spec.rb | 44 | ||||
-rw-r--r-- | spec/lib/gitlab/import/merge_request_creator_spec.rb | 43 |
13 files changed, 259 insertions, 135 deletions
diff --git a/changelogs/unreleased/sh-insert-git-data-in-separate-transaction.yml b/changelogs/unreleased/sh-insert-git-data-in-separate-transaction.yml new file mode 100644 index 00000000000..116929b2f53 --- /dev/null +++ b/changelogs/unreleased/sh-insert-git-data-in-separate-transaction.yml @@ -0,0 +1,5 @@ +--- +title: 'Bitbucket Server importer: Eliminate most idle-in-transaction issues' +merge_request: +author: +type: performance diff --git a/lib/gitlab/bitbucket_server_import/importer.rb b/lib/gitlab/bitbucket_server_import/importer.rb index 268d21a77d1..b97ee882349 100644 --- a/lib/gitlab/bitbucket_server_import/importer.rb +++ b/lib/gitlab/bitbucket_server_import/importer.rb @@ -2,6 +2,7 @@ module Gitlab module BitbucketServerImport class Importer include Gitlab::ShellAdapter + attr_reader :recover_missing_commits attr_reader :project, :project_key, :repository_slug, :client, :errors, :users @@ -175,21 +176,18 @@ module Gitlab description = '' description += @formatter.author_line(pull_request.author) unless find_user_id(pull_request.author_email) description += pull_request.description if pull_request.description - - source_branch_sha = pull_request.source_branch_sha - target_branch_sha = pull_request.target_branch_sha author_id = gitlab_user_id(pull_request.author_email) attributes = { iid: pull_request.iid, title: pull_request.title, description: description, - source_project: project, + source_project_id: project.id, source_branch: Gitlab::Git.ref_name(pull_request.source_branch_name), - source_branch_sha: source_branch_sha, - target_project: project, + source_branch_sha: pull_request.source_branch_sha, + target_project_id: project.id, target_branch: Gitlab::Git.ref_name(pull_request.target_branch_name), - target_branch_sha: target_branch_sha, + target_branch_sha: pull_request.target_branch_sha, state: pull_request.state, author_id: author_id, assignee_id: nil, @@ -197,7 +195,9 @@ module Gitlab updated_at: pull_request.updated_at } - merge_request = 
project.merge_requests.create!(attributes) + creator = Gitlab::Import::MergeRequestCreator.new(project) + merge_request = creator.execute(attributes) + import_pull_request_comments(pull_request, merge_request) if merge_request.persisted? end diff --git a/lib/gitlab/github_import.rb b/lib/gitlab/github_import.rb index 65b5e30c70f..d40b06f969f 100644 --- a/lib/gitlab/github_import.rb +++ b/lib/gitlab/github_import.rb @@ -10,24 +10,6 @@ module Gitlab Client.new(token_to_use, parallel: parallel) end - # Inserts a raw row and returns the ID of the inserted row. - # - # attributes - The attributes/columns to set. - # relation - An ActiveRecord::Relation to use for finding the ID of the row - # when using MySQL. - def self.insert_and_return_id(attributes, relation) - # We use bulk_insert here so we can bypass any queries executed by - # callbacks or validation rules, as doing this wouldn't scale when - # importing very large projects. - result = Gitlab::Database - .bulk_insert(relation.table_name, [attributes], return_ids: true) - - # MySQL doesn't support returning the IDs of a bulk insert in a way that - # is not a pain, so in this case we'll issue an extra query instead. - result.first || - relation.where(iid: attributes[:iid]).limit(1).pluck(:id).first - end - # Returns the ID of the ghost user. 
def self.ghost_user_id key = 'github-import/ghost-user-id' diff --git a/lib/gitlab/github_import/importer/issue_importer.rb b/lib/gitlab/github_import/importer/issue_importer.rb index 31fefebf787..fe10bec395d 100644 --- a/lib/gitlab/github_import/importer/issue_importer.rb +++ b/lib/gitlab/github_import/importer/issue_importer.rb @@ -4,6 +4,8 @@ module Gitlab module GithubImport module Importer class IssueImporter + include Gitlab::Import::DatabaseHelpers + attr_reader :project, :issue, :client, :user_finder, :milestone_finder, :issuable_finder @@ -55,7 +57,7 @@ module Gitlab updated_at: issue.updated_at } - GithubImport.insert_and_return_id(attributes, project.issues) + insert_and_return_id(attributes, project.issues) rescue ActiveRecord::InvalidForeignKey # It's possible the project has been deleted since scheduling this # job. In this case we'll just skip creating the issue. diff --git a/lib/gitlab/github_import/importer/pull_request_importer.rb b/lib/gitlab/github_import/importer/pull_request_importer.rb index 6b3688c4381..74fc64f3c6b 100644 --- a/lib/gitlab/github_import/importer/pull_request_importer.rb +++ b/lib/gitlab/github_import/importer/pull_request_importer.rb @@ -4,6 +4,8 @@ module Gitlab module GithubImport module Importer class PullRequestImporter + include Gitlab::Import::MergeRequestHelpers + attr_reader :pull_request, :project, :client, :user_finder, :milestone_finder, :issuable_finder @@ -30,6 +32,10 @@ module Gitlab end end + def insert_git_data(merge_request, already_exists) + insert_or_replace_git_data(merge_request, pull_request.source_branch_sha, pull_request.target_branch_sha, already_exists) + end + # Creates the merge request and returns its ID. 
# # This method will return `nil` if the merge request could not be @@ -44,75 +50,23 @@ module Gitlab description = MarkdownText .format(pull_request.description, pull_request.author, author_found) - # This work must be wrapped in a transaction as otherwise we can leave - # behind incomplete data in the event of an error. This can then lead - # to duplicate key errors when jobs are retried. - MergeRequest.transaction do - attributes = { - iid: pull_request.iid, - title: pull_request.truncated_title, - description: description, - source_project_id: project.id, - target_project_id: project.id, - source_branch: pull_request.formatted_source_branch, - target_branch: pull_request.target_branch, - state: pull_request.state, - milestone_id: milestone_finder.id_for(pull_request), - author_id: author_id, - assignee_id: user_finder.assignee_id_for(pull_request), - created_at: pull_request.created_at, - updated_at: pull_request.updated_at - } - - # When creating merge requests there are a lot of hooks that may - # run, for many different reasons. Many of these hooks (e.g. the - # ones used for rendering Markdown) are completely unnecessary and - # may even lead to transaction timeouts. - # - # To ensure importing pull requests has a minimal impact and can - # complete in a reasonable time we bypass all the hooks by inserting - # the row and then retrieving it. We then only perform the - # additional work that is strictly necessary. - merge_request_id = GithubImport - .insert_and_return_id(attributes, project.merge_requests) - - [project.merge_requests.find(merge_request_id), false] - end - rescue ActiveRecord::InvalidForeignKey - # It's possible the project has been deleted since scheduling this - # job. In this case we'll just skip creating the merge request. - [] - rescue ActiveRecord::RecordNotUnique - # It's possible we previously created the MR, but failed when updating - # the Git data. In this case we'll just continue working on the - # existing row. 
- [project.merge_requests.find_by(iid: pull_request.iid), true] - end - - def insert_git_data(merge_request, already_exists = false) - # These fields are set so we can create the correct merge request - # diffs. - merge_request.source_branch_sha = pull_request.source_branch_sha - merge_request.target_branch_sha = pull_request.target_branch_sha - - merge_request.keep_around_commit - - # MR diffs normally use an "after_save" hook to pull data from Git. - # All of this happens in the transaction started by calling - # create/save/etc. This in turn can lead to these transactions being - # held open for much longer than necessary. To work around this we - # first save the diff, then populate it. - diff = - if already_exists - merge_request.merge_request_diffs.take || - merge_request.merge_request_diffs.build - else - merge_request.merge_request_diffs.build - end + attributes = { + iid: pull_request.iid, + title: pull_request.truncated_title, + description: description, + source_project_id: project.id, + target_project_id: project.id, + source_branch: pull_request.formatted_source_branch, + target_branch: pull_request.target_branch, + state: pull_request.state, + milestone_id: milestone_finder.id_for(pull_request), + author_id: author_id, + assignee_id: user_finder.assignee_id_for(pull_request), + created_at: pull_request.created_at, + updated_at: pull_request.updated_at + } - diff.importing = true - diff.save - diff.save_git_content + create_merge_request_without_hooks(project, attributes, pull_request.iid) end end end diff --git a/lib/gitlab/import/database_helpers.rb b/lib/gitlab/import/database_helpers.rb new file mode 100644 index 00000000000..80857061933 --- /dev/null +++ b/lib/gitlab/import/database_helpers.rb @@ -0,0 +1,25 @@ +# frozen_string_literal: true + +module Gitlab + module Import + module DatabaseHelpers + # Inserts a raw row and returns the ID of the inserted row. + # + # attributes - The attributes/columns to set. 
+ # relation - An ActiveRecord::Relation to use for finding the ID of the row + # when using MySQL. + def insert_and_return_id(attributes, relation) + # We use bulk_insert here so we can bypass any queries executed by + # callbacks or validation rules, as doing this wouldn't scale when + # importing very large projects. + result = Gitlab::Database + .bulk_insert(relation.table_name, [attributes], return_ids: true) + + # MySQL doesn't support returning the IDs of a bulk insert in a way that + # is not a pain, so in this case we'll issue an extra query instead. + result.first || + relation.where(iid: attributes[:iid]).limit(1).pluck(:id).first + end + end + end +end diff --git a/lib/gitlab/import/merge_request_creator.rb b/lib/gitlab/import/merge_request_creator.rb new file mode 100644 index 00000000000..327abdd4183 --- /dev/null +++ b/lib/gitlab/import/merge_request_creator.rb @@ -0,0 +1,38 @@ +# This module is designed for importers that need to create many merge +# requests quickly. When creating merge requests there are a lot of hooks +# that may run, for many different reasons. Many of these hooks (e.g. the ones +# used for rendering Markdown) are completely unnecessary and may even lead to +# transaction timeouts. +# +# To ensure importing merge requests requests has a minimal impact and can +# complete in a reasonable time we bypass all the hooks by inserting the row +# and then retrieving it. We then only perform the additional work that is +# strictly necessary. 
+module Gitlab + module Import + class MergeRequestCreator + include ::Gitlab::Import::DatabaseHelpers + include ::Gitlab::Import::MergeRequestHelpers + + attr_accessor :project + + def initialize(project) + @project = project + end + + def execute(attributes) + source_branch_sha = attributes.delete(:source_branch_sha) + target_branch_sha = attributes.delete(:target_branch_sha) + iid = attributes[:iid] + + merge_request, already_exists = create_merge_request_without_hooks(project, attributes, iid) + + if merge_request + insert_or_replace_git_data(merge_request, source_branch_sha, target_branch_sha, already_exists) + end + + merge_request + end + end + end +end diff --git a/lib/gitlab/import/merge_request_helpers.rb b/lib/gitlab/import/merge_request_helpers.rb new file mode 100644 index 00000000000..89a312574aa --- /dev/null +++ b/lib/gitlab/import/merge_request_helpers.rb @@ -0,0 +1,64 @@ +# frozen_string_literal: true + +module Gitlab + module Import + module MergeRequestHelpers + include DatabaseHelpers + + def create_merge_request_without_hooks(project, attributes, iid) + # This work must be wrapped in a transaction as otherwise we can leave + # behind incomplete data in the event of an error. This can then lead + # to duplicate key errors when jobs are retried. + MergeRequest.transaction do + # When creating merge requests there are a lot of hooks that may + # run, for many different reasons. Many of these hooks (e.g. the + # ones used for rendering Markdown) are completely unnecessary and + # may even lead to transaction timeouts. + # + # To ensure importing pull requests has a minimal impact and can + # complete in a reasonable time we bypass all the hooks by inserting + # the row and then retrieving it. We then only perform the + # additional work that is strictly necessary. 
+ merge_request_id = insert_and_return_id(attributes, project.merge_requests) + + [project.merge_requests.find(merge_request_id), false] + end + rescue ActiveRecord::InvalidForeignKey + # It's possible the project has been deleted since scheduling this + # job. In this case we'll just skip creating the merge request. + [] + rescue ActiveRecord::RecordNotUnique + # It's possible we previously created the MR, but failed when updating + # the Git data. In this case we'll just continue working on the + # existing row. + [project.merge_requests.find_by(iid: iid), true] + end + + def insert_or_replace_git_data(merge_request, source_branch_sha, target_branch_sha, already_exists = false) + # These fields are set so we can create the correct merge request + # diffs. + merge_request.source_branch_sha = source_branch_sha + merge_request.target_branch_sha = target_branch_sha + + merge_request.keep_around_commit + + # MR diffs normally use an "after_save" hook to pull data from Git. + # All of this happens in the transaction started by calling + # create/save/etc. This in turn can lead to these transactions being + # held open for much longer than necessary. To work around this we + # first save the diff, then populate it. 
+ diff = + if already_exists + merge_request.merge_request_diffs.take || + merge_request.merge_request_diffs.build + else + merge_request.merge_request_diffs.build + end + + diff.importing = true + diff.save + diff.save_git_content + end + end + end +end diff --git a/spec/lib/gitlab/github_import/importer/issue_importer_spec.rb b/spec/lib/gitlab/github_import/importer/issue_importer_spec.rb index 81fe97c1e49..7901ae005d9 100644 --- a/spec/lib/gitlab/github_import/importer/issue_importer_spec.rb +++ b/spec/lib/gitlab/github_import/importer/issue_importer_spec.rb @@ -87,7 +87,7 @@ describe Gitlab::GithubImport::Importer::IssueImporter, :clean_gitlab_redis_cach .with(issue) .and_return([user.id, true]) - expect(Gitlab::GithubImport) + expect(importer) .to receive(:insert_and_return_id) .with( { @@ -116,7 +116,7 @@ describe Gitlab::GithubImport::Importer::IssueImporter, :clean_gitlab_redis_cach .with(issue) .and_return([project.creator_id, false]) - expect(Gitlab::GithubImport) + expect(importer) .to receive(:insert_and_return_id) .with( { @@ -145,7 +145,7 @@ describe Gitlab::GithubImport::Importer::IssueImporter, :clean_gitlab_redis_cach .with(issue) .and_return([user.id, true]) - expect(Gitlab::GithubImport) + expect(importer) .to receive(:insert_and_return_id) .and_raise(ActiveRecord::InvalidForeignKey, 'invalid foreign key') diff --git a/spec/lib/gitlab/github_import/importer/pull_request_importer_spec.rb b/spec/lib/gitlab/github_import/importer/pull_request_importer_spec.rb index 3422a1e82fc..0f21b8843b6 100644 --- a/spec/lib/gitlab/github_import/importer/pull_request_importer_spec.rb +++ b/spec/lib/gitlab/github_import/importer/pull_request_importer_spec.rb @@ -80,7 +80,7 @@ describe Gitlab::GithubImport::Importer::PullRequestImporter, :clean_gitlab_redi end it 'imports the pull request with the pull request author as the merge request author' do - expect(Gitlab::GithubImport) + expect(importer) .to receive(:insert_and_return_id) .with( { @@ -125,7 +125,7 @@ 
describe Gitlab::GithubImport::Importer::PullRequestImporter, :clean_gitlab_redi .with(pull_request) .and_return(user.id) - expect(Gitlab::GithubImport) + expect(importer) .to receive(:insert_and_return_id) .with( { @@ -171,7 +171,7 @@ describe Gitlab::GithubImport::Importer::PullRequestImporter, :clean_gitlab_redi .to receive(:source_branch) .and_return('master') - expect(Gitlab::GithubImport) + expect(importer) .to receive(:insert_and_return_id) .with( { @@ -209,7 +209,7 @@ describe Gitlab::GithubImport::Importer::PullRequestImporter, :clean_gitlab_redi .with(pull_request) .and_return(user.id) - expect(Gitlab::GithubImport) + expect(importer) .to receive(:insert_and_return_id) .and_raise(ActiveRecord::InvalidForeignKey, 'invalid foreign key') diff --git a/spec/lib/gitlab/github_import_spec.rb b/spec/lib/gitlab/github_import_spec.rb index 51414800e8c..496244c91bf 100644 --- a/spec/lib/gitlab/github_import_spec.rb +++ b/spec/lib/gitlab/github_import_spec.rb @@ -27,39 +27,6 @@ describe Gitlab::GithubImport do end end - describe '.insert_and_return_id' do - let(:attributes) { { iid: 1, title: 'foo' } } - let(:project) { create(:project) } - - context 'on PostgreSQL' do - it 'returns the ID returned by the query' do - expect(Gitlab::Database) - .to receive(:bulk_insert) - .with(Issue.table_name, [attributes], return_ids: true) - .and_return([10]) - - id = described_class.insert_and_return_id(attributes, project.issues) - - expect(id).to eq(10) - end - end - - context 'on MySQL' do - it 'uses a separate query to retrieve the ID' do - issue = create(:issue, project: project, iid: attributes[:iid]) - - expect(Gitlab::Database) - .to receive(:bulk_insert) - .with(Issue.table_name, [attributes], return_ids: true) - .and_return([]) - - id = described_class.insert_and_return_id(attributes, project.issues) - - expect(id).to eq(issue.id) - end - end - end - describe '.ghost_user_id', :clean_gitlab_redis_cache do it 'returns the ID of the ghost user' do 
expect(described_class.ghost_user_id).to eq(User.ghost.id) diff --git a/spec/lib/gitlab/import/database_helpers_spec.rb b/spec/lib/gitlab/import/database_helpers_spec.rb new file mode 100644 index 00000000000..ec7d5710b97 --- /dev/null +++ b/spec/lib/gitlab/import/database_helpers_spec.rb @@ -0,0 +1,44 @@ +# frozen_string_literal: true + +require 'spec_helper' + +class DatabaseHelpersTest + include Gitlab::Import::DatabaseHelpers +end + +describe Gitlab::Import::DatabaseHelpers do + subject { DatabaseHelpersTest.new } + + describe '.insert_and_return_id' do + let(:attributes) { { iid: 1, title: 'foo' } } + let(:project) { create(:project) } + + context 'on PostgreSQL' do + it 'returns the ID returned by the query' do + expect(Gitlab::Database) + .to receive(:bulk_insert) + .with(Issue.table_name, [attributes], return_ids: true) + .and_return([10]) + + id = subject.insert_and_return_id(attributes, project.issues) + + expect(id).to eq(10) + end + end + + context 'on MySQL' do + it 'uses a separate query to retrieve the ID' do + issue = create(:issue, project: project, iid: attributes[:iid]) + + expect(Gitlab::Database) + .to receive(:bulk_insert) + .with(Issue.table_name, [attributes], return_ids: true) + .and_return([]) + + id = subject.insert_and_return_id(attributes, project.issues) + + expect(id).to eq(issue.id) + end + end + end +end diff --git a/spec/lib/gitlab/import/merge_request_creator_spec.rb b/spec/lib/gitlab/import/merge_request_creator_spec.rb new file mode 100644 index 00000000000..3683558f59f --- /dev/null +++ b/spec/lib/gitlab/import/merge_request_creator_spec.rb @@ -0,0 +1,43 @@ +# frozen_string_literal: true + +require 'spec_helper' + +describe Gitlab::Import::MergeRequestCreator do + let(:project) { create(:project, :repository) } + + subject { described_class.new(project) } + + describe '#execute' do + context 'merge request already exists' do + let(:merge_request) { create(:merge_request, target_project: project, source_project: project) } + 
let(:commits) { merge_request.merge_request_diffs.first.commits } + let(:attributes) { HashWithIndifferentAccess.new(merge_request.attributes) } + + it 'updates the data' do + commits_count = commits.count + merge_request.merge_request_diffs.destroy_all + + expect(merge_request.merge_request_diffs.count).to eq(0) + + subject.execute(attributes) + + expect(merge_request.reload.merge_request_diffs.count).to eq(1) + expect(merge_request.reload.merge_request_diffs.first.commits.count).to eq(commits_count) + end + end + + context 'new merge request' do + let(:merge_request) { build(:merge_request, target_project: project, source_project: project) } + let(:attributes) { HashWithIndifferentAccess.new(merge_request.attributes) } + + it 'creates a new merge request' do + attributes.delete(:id) + + expect { subject.execute(attributes) }.to change { MergeRequest.count }.by(1) + + new_mr = MergeRequest.last + expect(new_mr.merge_request_diffs.count).to eq(1) + end + end + end +end |