summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Michael Kozono <mkozono@gmail.com> 2017-11-20 16:27:24 -0800
committer: Michael Kozono <mkozono@gmail.com> 2017-12-01 15:26:41 -0800
commit: edb5cac46c1cba1029fb3e67d4853027590584f6 (patch)
tree: 3f17ff04ba05aea3143a69abd78124417437b024
parent: 17ce21d74eab4d2973d372cb3f97258eb3b81de9 (diff)
download: gitlab-ce-edb5cac46c1cba1029fb3e67d4853027590584f6.tar.gz
Use bulk inserts
-rw-r--r-- lib/gitlab/background_migration/prepare_untracked_uploads.rb 55
-rw-r--r-- spec/lib/gitlab/background_migration/prepare_untracked_uploads_spec.rb 192
-rw-r--r-- spec/migrations/track_untracked_uploads_spec.rb 8
3 files changed, 184 insertions, 71 deletions
diff --git a/lib/gitlab/background_migration/prepare_untracked_uploads.rb b/lib/gitlab/background_migration/prepare_untracked_uploads.rb
index c3f5dddb07d..022b2f41393 100644
--- a/lib/gitlab/background_migration/prepare_untracked_uploads.rb
+++ b/lib/gitlab/background_migration/prepare_untracked_uploads.rb
@@ -20,7 +20,19 @@ module Gitlab
def perform
ensure_temporary_tracking_table_exists
+
+ # Since Postgres < 9.5 does not have ON CONFLICT DO NOTHING, and since
+ # doing inserts-if-not-exists without ON CONFLICT DO NOTHING would be
+ # slow, start with an empty table for Postgres < 9.5.
+ # That way we can do bulk inserts at ~30x the speed of individual
+ # inserts (~20 minutes worth of inserts at GitLab.com scale instead of
+ # ~10 hours).
+ # In all other cases, installations will get both bulk inserts and the
+ # ability for these jobs to retry without having to clear and reinsert.
+ clear_untracked_file_paths unless can_bulk_insert_and_ignore_duplicates?
+
store_untracked_file_paths
+
schedule_populate_untracked_uploads_jobs
end
@@ -44,6 +56,10 @@ module Gitlab
end
end
+ def clear_untracked_file_paths
+ UntrackedFile.delete_all
+ end
+
def store_untracked_file_paths
return unless Dir.exist?(ABSOLUTE_UPLOAD_DIR)
@@ -96,36 +112,35 @@ module Gitlab
end
def insert_file_paths(file_paths)
- ActiveRecord::Base.transaction do
- file_paths.each do |file_path|
- insert_file_path(file_path)
- end
- end
- end
+ sql = if postgresql_pre_9_5?
+ "INSERT INTO #{table_columns_and_values_for_insert(file_paths)};"
+ elsif postgresql?
+ "INSERT INTO #{table_columns_and_values_for_insert(file_paths)} ON CONFLICT DO NOTHING;"
+ else # MySQL
+ "INSERT IGNORE INTO #{table_columns_and_values_for_insert(file_paths)};"
+ end
- def insert_file_path(file_path)
- if postgresql_pre_9_5?
- # No easy way to do ON CONFLICT DO NOTHING before Postgres 9.5 so just use Rails
- return UntrackedFile.where(path: file_path).first_or_create
- end
+ ActiveRecord::Base.connection.execute(sql)
+ end
- table_columns_and_values = 'untracked_files_for_uploads (path, created_at, updated_at) VALUES (?, ?, ?)'
+ def table_columns_and_values_for_insert(file_paths)
+ timestamp = Time.now.utc.iso8601
- sql = if postgresql?
- "INSERT INTO #{table_columns_and_values} ON CONFLICT DO NOTHING;"
- else
- "INSERT IGNORE INTO #{table_columns_and_values};"
- end
+ values = file_paths.map do |file_path|
+ ActiveRecord::Base.send(:sanitize_sql_array, ['(?, ?, ?)', file_path, timestamp, timestamp]) # rubocop:disable GitlabSecurity/PublicSend
+ end.join(', ')
- timestamp = Time.now.utc.iso8601
- sql = ActiveRecord::Base.send(:sanitize_sql_array, [sql, file_path, timestamp, timestamp]) # rubocop:disable GitlabSecurity/PublicSend
- ActiveRecord::Base.connection.execute(sql)
+ "#{UntrackedFile.table_name} (path, created_at, updated_at) VALUES #{values}"
end
def postgresql?
@postgresql ||= Gitlab::Database.postgresql?
end
+ def can_bulk_insert_and_ignore_duplicates?
+ !postgresql_pre_9_5?
+ end
+
def postgresql_pre_9_5?
@postgresql_pre_9_5 ||= postgresql? &&
ActiveRecord::Base.connection.select_value('SHOW server_version_num').to_i < 90500
diff --git a/spec/lib/gitlab/background_migration/prepare_untracked_uploads_spec.rb b/spec/lib/gitlab/background_migration/prepare_untracked_uploads_spec.rb
index b6b046ec3aa..f1eb7173717 100644
--- a/spec/lib/gitlab/background_migration/prepare_untracked_uploads_spec.rb
+++ b/spec/lib/gitlab/background_migration/prepare_untracked_uploads_spec.rb
@@ -53,80 +53,178 @@ describe Gitlab::BackgroundMigration::PrepareUntrackedUploads, :migration, :side
expect(record.reload.path.size).to eq(519)
end
- context 'when files were uploaded before and after hashed storage was enabled' do
- let!(:appearance) { create(:appearance, logo: uploaded_file, header_logo: uploaded_file) }
- let!(:user) { create(:user, :with_avatar) }
- let!(:project1) { create(:project, :with_avatar) }
- let(:project2) { create(:project) } # instantiate after enabling hashed_storage
+ context "test bulk insert with ON CONFLICT DO NOTHING or IGNORE" do
+ around do |example|
+ # If this is CI, we use Postgres 9.2 so this whole context should be
+ # skipped since we're unable to use ON CONFLICT DO NOTHING or IGNORE.
+ if described_class.new.send(:can_bulk_insert_and_ignore_duplicates?)
+ example.run
+ end
+ end
- before do
- # Markdown upload before enabling hashed_storage
- UploadService.new(project1, uploaded_file, FileUploader).execute
+ context 'when files were uploaded before and after hashed storage was enabled' do
+ let!(:appearance) { create(:appearance, logo: uploaded_file, header_logo: uploaded_file) }
+ let!(:user) { create(:user, :with_avatar) }
+ let!(:project1) { create(:project, :with_avatar) }
+ let(:project2) { create(:project) } # instantiate after enabling hashed_storage
- stub_application_setting(hashed_storage_enabled: true)
+ before do
+ # Markdown upload before enabling hashed_storage
+ UploadService.new(project1, uploaded_file, FileUploader).execute
- # Markdown upload after enabling hashed_storage
- UploadService.new(project2, uploaded_file, FileUploader).execute
- end
+ stub_application_setting(hashed_storage_enabled: true)
- it 'adds unhashed files to the untracked_files_for_uploads table' do
- described_class.new.perform
+ # Markdown upload after enabling hashed_storage
+ UploadService.new(project2, uploaded_file, FileUploader).execute
+ end
- expect(untracked_files_for_uploads.count).to eq(5)
- end
+ it 'adds unhashed files to the untracked_files_for_uploads table' do
+ described_class.new.perform
- it 'adds files with paths relative to CarrierWave.root' do
- described_class.new.perform
- untracked_files_for_uploads.all.each do |file|
- expect(file.path.start_with?('uploads/')).to be_truthy
+ expect(untracked_files_for_uploads.count).to eq(5)
end
- end
- it 'does not add hashed files to the untracked_files_for_uploads table' do
- described_class.new.perform
-
- hashed_file_path = project2.uploads.where(uploader: 'FileUploader').first.path
- expect(untracked_files_for_uploads.where("path like '%#{hashed_file_path}%'").exists?).to be_falsey
- end
+ it 'adds files with paths relative to CarrierWave.root' do
+ described_class.new.perform
+ untracked_files_for_uploads.all.each do |file|
+ expect(file.path.start_with?('uploads/')).to be_truthy
+ end
+ end
- it 'correctly schedules the follow-up background migration jobs' do
- described_class.new.perform
+ it 'does not add hashed files to the untracked_files_for_uploads table' do
+ described_class.new.perform
- expect(described_class::FOLLOW_UP_MIGRATION).to be_scheduled_migration(1, 5)
- expect(BackgroundMigrationWorker.jobs.size).to eq(1)
- end
+ hashed_file_path = project2.uploads.where(uploader: 'FileUploader').first.path
+ expect(untracked_files_for_uploads.where("path like '%#{hashed_file_path}%'").exists?).to be_falsey
+ end
- # E.g. from a previous failed run of this background migration
- context 'when there is existing data in untracked_files_for_uploads' do
- before do
+ it 'correctly schedules the follow-up background migration jobs' do
described_class.new.perform
+
+ expect(described_class::FOLLOW_UP_MIGRATION).to be_scheduled_migration(1, 5)
+ expect(BackgroundMigrationWorker.jobs.size).to eq(1)
end
- it 'does not error or produce duplicates of existing data' do
- expect do
+ # E.g. from a previous failed run of this background migration
+ context 'when there is existing data in untracked_files_for_uploads' do
+ before do
described_class.new.perform
- end.not_to change { untracked_files_for_uploads.count }.from(5)
+ end
+
+ it 'does not error or produce duplicates of existing data' do
+ expect do
+ described_class.new.perform
+ end.not_to change { untracked_files_for_uploads.count }.from(5)
+ end
end
+
+ # E.g. The installation is in use at the time of migration, and someone has
+ # just uploaded a file
+ context 'when there are files in /uploads/tmp' do
+ let(:tmp_file) { Rails.root.join(described_class::ABSOLUTE_UPLOAD_DIR, 'tmp', 'some_file.jpg') }
+
+ before do
+ FileUtils.touch(tmp_file)
+ end
+
+ after do
+ FileUtils.rm(tmp_file)
+ end
+
+ it 'does not add files from /uploads/tmp' do
+ described_class.new.perform
+
+ expect(untracked_files_for_uploads.count).to eq(5)
+ end
+ end
+ end
+ end
+
+ context 'test bulk insert without ON CONFLICT DO NOTHING or IGNORE' do
+ before do
+ # If this is CI, we use Postgres 9.2 so this stub has no effect.
+ #
+ # If this is being run on Postgres 9.5+ or MySQL, then this stub allows us
+ # to test the bulk insert functionality without ON CONFLICT DO NOTHING or
+ # IGNORE.
+ allow_any_instance_of(described_class).to receive(:postgresql_pre_9_5?).and_return(true)
end
- # E.g. The installation is in use at the time of migration, and someone has
- # just uploaded a file
- context 'when there are files in /uploads/tmp' do
- let(:tmp_file) { Rails.root.join(described_class::ABSOLUTE_UPLOAD_DIR, 'tmp', 'some_file.jpg') }
+ context 'when files were uploaded before and after hashed storage was enabled' do
+ let!(:appearance) { create(:appearance, logo: uploaded_file, header_logo: uploaded_file) }
+ let!(:user) { create(:user, :with_avatar) }
+ let!(:project1) { create(:project, :with_avatar) }
+ let(:project2) { create(:project) } # instantiate after enabling hashed_storage
before do
- FileUtils.touch(tmp_file)
- end
+ # Markdown upload before enabling hashed_storage
+ UploadService.new(project1, uploaded_file, FileUploader).execute
- after do
- FileUtils.rm(tmp_file)
+ stub_application_setting(hashed_storage_enabled: true)
+
+ # Markdown upload after enabling hashed_storage
+ UploadService.new(project2, uploaded_file, FileUploader).execute
end
- it 'does not add files from /uploads/tmp' do
+ it 'adds unhashed files to the untracked_files_for_uploads table' do
described_class.new.perform
expect(untracked_files_for_uploads.count).to eq(5)
end
+
+ it 'adds files with paths relative to CarrierWave.root' do
+ described_class.new.perform
+ untracked_files_for_uploads.all.each do |file|
+ expect(file.path.start_with?('uploads/')).to be_truthy
+ end
+ end
+
+ it 'does not add hashed files to the untracked_files_for_uploads table' do
+ described_class.new.perform
+
+ hashed_file_path = project2.uploads.where(uploader: 'FileUploader').first.path
+ expect(untracked_files_for_uploads.where("path like '%#{hashed_file_path}%'").exists?).to be_falsey
+ end
+
+ it 'correctly schedules the follow-up background migration jobs' do
+ described_class.new.perform
+
+ expect(described_class::FOLLOW_UP_MIGRATION).to be_scheduled_migration(1, 5)
+ expect(BackgroundMigrationWorker.jobs.size).to eq(1)
+ end
+
+ # E.g. from a previous failed run of this background migration
+ context 'when there is existing data in untracked_files_for_uploads' do
+ before do
+ described_class.new.perform
+ end
+
+ it 'does not error or produce duplicates of existing data' do
+ expect do
+ described_class.new.perform
+ end.not_to change { untracked_files_for_uploads.count }.from(5)
+ end
+ end
+
+ # E.g. The installation is in use at the time of migration, and someone has
+ # just uploaded a file
+ context 'when there are files in /uploads/tmp' do
+ let(:tmp_file) { Rails.root.join(described_class::ABSOLUTE_UPLOAD_DIR, 'tmp', 'some_file.jpg') }
+
+ before do
+ FileUtils.touch(tmp_file)
+ end
+
+ after do
+ FileUtils.rm(tmp_file)
+ end
+
+ it 'does not add files from /uploads/tmp' do
+ described_class.new.perform
+
+ expect(untracked_files_for_uploads.count).to eq(5)
+ end
+ end
end
end
diff --git a/spec/migrations/track_untracked_uploads_spec.rb b/spec/migrations/track_untracked_uploads_spec.rb
index 11824bebb91..01bfe26744f 100644
--- a/spec/migrations/track_untracked_uploads_spec.rb
+++ b/spec/migrations/track_untracked_uploads_spec.rb
@@ -62,8 +62,8 @@ describe TrackUntrackedUploads, :migration, :sidekiq do
expect(appearance.reload.uploads.where("path like '%/header_logo/%'").first.attributes).to include(@appearance_header_logo_attributes)
expect(user2.reload.uploads.first.attributes).to include(@user2_avatar_attributes)
- expect(project2.reload.uploads.first.attributes).to include(@project2_avatar_attributes)
- expect(project2.uploads.last.attributes).to include(@project2_markdown_attributes)
+ expect(project2.reload.uploads.where(uploader: 'AvatarUploader').first.attributes).to include(@project2_avatar_attributes)
+ expect(project2.uploads.where(uploader: 'FileUploader').first.attributes).to include(@project2_markdown_attributes)
end
it 'ignores already-tracked uploads' do
@@ -71,8 +71,8 @@ describe TrackUntrackedUploads, :migration, :sidekiq do
expect(appearance.reload.uploads.where("path like '%/logo/%'").first.attributes).to include(@appearance_logo_attributes)
expect(user1.reload.uploads.first.attributes).to include(@user1_avatar_attributes)
- expect(project1.reload.uploads.first.attributes).to include(@project1_avatar_attributes)
- expect(project1.uploads.last.attributes).to include(@project1_markdown_attributes)
+ expect(project1.reload.uploads.where(uploader: 'AvatarUploader').first.attributes).to include(@project1_avatar_attributes)
+ expect(project1.uploads.where(uploader: 'FileUploader').first.attributes).to include(@project1_markdown_attributes)
end
it 'the temporary table untracked_files_for_uploads no longer exists' do