diff options
author | Michael Kozono <mkozono@gmail.com> | 2017-11-20 16:27:24 -0800 |
---|---|---|
committer | Michael Kozono <mkozono@gmail.com> | 2017-12-01 15:26:41 -0800 |
commit | edb5cac46c1cba1029fb3e67d4853027590584f6 (patch) | |
tree | 3f17ff04ba05aea3143a69abd78124417437b024 | |
parent | 17ce21d74eab4d2973d372cb3f97258eb3b81de9 (diff) | |
download | gitlab-ce-edb5cac46c1cba1029fb3e67d4853027590584f6.tar.gz |
Use bulk inserts
3 files changed, 184 insertions, 71 deletions
diff --git a/lib/gitlab/background_migration/prepare_untracked_uploads.rb b/lib/gitlab/background_migration/prepare_untracked_uploads.rb index c3f5dddb07d..022b2f41393 100644 --- a/lib/gitlab/background_migration/prepare_untracked_uploads.rb +++ b/lib/gitlab/background_migration/prepare_untracked_uploads.rb @@ -20,7 +20,19 @@ module Gitlab def perform ensure_temporary_tracking_table_exists + + # Since Postgres < 9.5 does not have ON CONFLICT DO NOTHING, and since + # doing inserts-if-not-exists without ON CONFLICT DO NOTHING would be + # slow, start with an empty table for Postgres < 9.5. + # That way we can do bulk inserts at ~30x the speed of individual + # inserts (~20 minutes worth of inserts at GitLab.com scale instead of + # ~10 hours). + # In all other cases, installations will get both bulk inserts and the + # ability for these jobs to retry without having to clear and reinsert. + clear_untracked_file_paths unless can_bulk_insert_and_ignore_duplicates? + store_untracked_file_paths + schedule_populate_untracked_uploads_jobs end @@ -44,6 +56,10 @@ module Gitlab end end + def clear_untracked_file_paths + UntrackedFile.delete_all + end + def store_untracked_file_paths return unless Dir.exist?(ABSOLUTE_UPLOAD_DIR) @@ -96,36 +112,35 @@ module Gitlab end def insert_file_paths(file_paths) - ActiveRecord::Base.transaction do - file_paths.each do |file_path| - insert_file_path(file_path) - end - end - end + sql = if postgresql_pre_9_5? + "INSERT INTO #{table_columns_and_values_for_insert(file_paths)};" + elsif postgresql? + "INSERT INTO #{table_columns_and_values_for_insert(file_paths)} ON CONFLICT DO NOTHING;" + else # MySQL + "INSERT IGNORE INTO #{table_columns_and_values_for_insert(file_paths)};" + end - def insert_file_path(file_path) - if postgresql_pre_9_5? - # No easy way to do ON CONFLICT DO NOTHING before Postgres 9.5 so just use Rails - return UntrackedFile.where(path: file_path).first_or_create - end + ActiveRecord::Base.connection.execute(sql) + end - table_columns_and_values = 'untracked_files_for_uploads (path, created_at, updated_at) VALUES (?, ?, ?)' + def table_columns_and_values_for_insert(file_paths) + timestamp = Time.now.utc.iso8601 - sql = if postgresql? - "INSERT INTO #{table_columns_and_values} ON CONFLICT DO NOTHING;" - else - "INSERT IGNORE INTO #{table_columns_and_values};" - end + values = file_paths.map do |file_path| + ActiveRecord::Base.send(:sanitize_sql_array, ['(?, ?, ?)', file_path, timestamp, timestamp]) # rubocop:disable GitlabSecurity/PublicSend + end.join(', ') - timestamp = Time.now.utc.iso8601 - sql = ActiveRecord::Base.send(:sanitize_sql_array, [sql, file_path, timestamp, timestamp]) # rubocop:disable GitlabSecurity/PublicSend - ActiveRecord::Base.connection.execute(sql) + "#{UntrackedFile.table_name} (path, created_at, updated_at) VALUES #{values}" end def postgresql? @postgresql ||= Gitlab::Database.postgresql? end + def can_bulk_insert_and_ignore_duplicates? + !postgresql_pre_9_5? + end + def postgresql_pre_9_5? @postgresql_pre_9_5 ||= postgresql? && ActiveRecord::Base.connection.select_value('SHOW server_version_num').to_i < 90500 diff --git a/spec/lib/gitlab/background_migration/prepare_untracked_uploads_spec.rb b/spec/lib/gitlab/background_migration/prepare_untracked_uploads_spec.rb index b6b046ec3aa..f1eb7173717 100644 --- a/spec/lib/gitlab/background_migration/prepare_untracked_uploads_spec.rb +++ b/spec/lib/gitlab/background_migration/prepare_untracked_uploads_spec.rb @@ -53,80 +53,178 @@ describe Gitlab::BackgroundMigration::PrepareUntrackedUploads, :migration, :side expect(record.reload.path.size).to eq(519) end - context 'when files were uploaded before and after hashed storage was enabled' do - let!(:appearance) { create(:appearance, logo: uploaded_file, header_logo: uploaded_file) } - let!(:user) { create(:user, :with_avatar) } - let!(:project1) { create(:project, :with_avatar) } - let(:project2) { create(:project) } # instantiate after enabling hashed_storage + context "test bulk insert with ON CONFLICT DO NOTHING or IGNORE" do + around do |example| + # If this is CI, we use Postgres 9.2 so this whole context should be + # skipped since we're unable to use ON CONFLICT DO NOTHING or IGNORE. + if described_class.new.send(:can_bulk_insert_and_ignore_duplicates?) + example.run + end + end - before do - # Markdown upload before enabling hashed_storage - UploadService.new(project1, uploaded_file, FileUploader).execute + context 'when files were uploaded before and after hashed storage was enabled' do + let!(:appearance) { create(:appearance, logo: uploaded_file, header_logo: uploaded_file) } + let!(:user) { create(:user, :with_avatar) } + let!(:project1) { create(:project, :with_avatar) } + let(:project2) { create(:project) } # instantiate after enabling hashed_storage - stub_application_setting(hashed_storage_enabled: true) + before do + # Markdown upload before enabling hashed_storage + UploadService.new(project1, uploaded_file, FileUploader).execute - # Markdown upload after enabling hashed_storage - UploadService.new(project2, uploaded_file, FileUploader).execute - end + stub_application_setting(hashed_storage_enabled: true) - it 'adds unhashed files to the untracked_files_for_uploads table' do - described_class.new.perform + # Markdown upload after enabling hashed_storage + UploadService.new(project2, uploaded_file, FileUploader).execute + end - expect(untracked_files_for_uploads.count).to eq(5) - end + it 'adds unhashed files to the untracked_files_for_uploads table' do + described_class.new.perform - it 'adds files with paths relative to CarrierWave.root' do - described_class.new.perform - untracked_files_for_uploads.all.each do |file| - expect(file.path.start_with?('uploads/')).to be_truthy + expect(untracked_files_for_uploads.count).to eq(5) end - end - it 'does not add hashed files to the untracked_files_for_uploads table' do - described_class.new.perform - - hashed_file_path = project2.uploads.where(uploader: 'FileUploader').first.path - expect(untracked_files_for_uploads.where("path like '%#{hashed_file_path}%'").exists?).to be_falsey - end + it 'adds files with paths relative to CarrierWave.root' do + described_class.new.perform + untracked_files_for_uploads.all.each do |file| + expect(file.path.start_with?('uploads/')).to be_truthy + end + end - it 'correctly schedules the follow-up background migration jobs' do - described_class.new.perform + it 'does not add hashed files to the untracked_files_for_uploads table' do + described_class.new.perform - expect(described_class::FOLLOW_UP_MIGRATION).to be_scheduled_migration(1, 5) - expect(BackgroundMigrationWorker.jobs.size).to eq(1) - end + hashed_file_path = project2.uploads.where(uploader: 'FileUploader').first.path + expect(untracked_files_for_uploads.where("path like '%#{hashed_file_path}%'").exists?).to be_falsey + end - # E.g. from a previous failed run of this background migration - context 'when there is existing data in untracked_files_for_uploads' do - before do + it 'correctly schedules the follow-up background migration jobs' do described_class.new.perform + + expect(described_class::FOLLOW_UP_MIGRATION).to be_scheduled_migration(1, 5) + expect(BackgroundMigrationWorker.jobs.size).to eq(1) end - it 'does not error or produce duplicates of existing data' do - expect do + # E.g. from a previous failed run of this background migration + context 'when there is existing data in untracked_files_for_uploads' do + before do described_class.new.perform - end.not_to change { untracked_files_for_uploads.count }.from(5) + end + + it 'does not error or produce duplicates of existing data' do + expect do + described_class.new.perform + end.not_to change { untracked_files_for_uploads.count }.from(5) + end end + + # E.g. The installation is in use at the time of migration, and someone has + # just uploaded a file + context 'when there are files in /uploads/tmp' do + let(:tmp_file) { Rails.root.join(described_class::ABSOLUTE_UPLOAD_DIR, 'tmp', 'some_file.jpg') } + + before do + FileUtils.touch(tmp_file) + end + + after do + FileUtils.rm(tmp_file) + end + + it 'does not add files from /uploads/tmp' do + described_class.new.perform + + expect(untracked_files_for_uploads.count).to eq(5) + end + end + end + end + + context 'test bulk insert without ON CONFLICT DO NOTHING or IGNORE' do + before do + # If this is CI, we use Postgres 9.2 so this stub has no effect. + # + # If this is being run on Postgres 9.5+ or MySQL, then this stub allows us + # to test the bulk insert functionality without ON CONFLICT DO NOTHING or + # IGNORE. + allow_any_instance_of(described_class).to receive(:postgresql_pre_9_5?).and_return(true) end - # E.g. The installation is in use at the time of migration, and someone has - # just uploaded a file - context 'when there are files in /uploads/tmp' do - let(:tmp_file) { Rails.root.join(described_class::ABSOLUTE_UPLOAD_DIR, 'tmp', 'some_file.jpg') } + context 'when files were uploaded before and after hashed storage was enabled' do + let!(:appearance) { create(:appearance, logo: uploaded_file, header_logo: uploaded_file) } + let!(:user) { create(:user, :with_avatar) } + let!(:project1) { create(:project, :with_avatar) } + let(:project2) { create(:project) } # instantiate after enabling hashed_storage before do - FileUtils.touch(tmp_file) - end + # Markdown upload before enabling hashed_storage + UploadService.new(project1, uploaded_file, FileUploader).execute - after do - FileUtils.rm(tmp_file) + stub_application_setting(hashed_storage_enabled: true) + + # Markdown upload after enabling hashed_storage + UploadService.new(project2, uploaded_file, FileUploader).execute end - it 'does not add files from /uploads/tmp' do + it 'adds unhashed files to the untracked_files_for_uploads table' do described_class.new.perform expect(untracked_files_for_uploads.count).to eq(5) end + + it 'adds files with paths relative to CarrierWave.root' do + described_class.new.perform + untracked_files_for_uploads.all.each do |file| + expect(file.path.start_with?('uploads/')).to be_truthy + end + end + + it 'does not add hashed files to the untracked_files_for_uploads table' do + described_class.new.perform + + hashed_file_path = project2.uploads.where(uploader: 'FileUploader').first.path + expect(untracked_files_for_uploads.where("path like '%#{hashed_file_path}%'").exists?).to be_falsey + end + + it 'correctly schedules the follow-up background migration jobs' do + described_class.new.perform + + expect(described_class::FOLLOW_UP_MIGRATION).to be_scheduled_migration(1, 5) + expect(BackgroundMigrationWorker.jobs.size).to eq(1) + end + + # E.g. from a previous failed run of this background migration + context 'when there is existing data in untracked_files_for_uploads' do + before do + described_class.new.perform + end + + it 'does not error or produce duplicates of existing data' do + expect do + described_class.new.perform + end.not_to change { untracked_files_for_uploads.count }.from(5) + end + end + + # E.g. The installation is in use at the time of migration, and someone has + # just uploaded a file + context 'when there are files in /uploads/tmp' do + let(:tmp_file) { Rails.root.join(described_class::ABSOLUTE_UPLOAD_DIR, 'tmp', 'some_file.jpg') } + + before do + FileUtils.touch(tmp_file) + end + + after do + FileUtils.rm(tmp_file) + end + + it 'does not add files from /uploads/tmp' do + described_class.new.perform + + expect(untracked_files_for_uploads.count).to eq(5) + end + end end end diff --git a/spec/migrations/track_untracked_uploads_spec.rb b/spec/migrations/track_untracked_uploads_spec.rb index 11824bebb91..01bfe26744f 100644 --- a/spec/migrations/track_untracked_uploads_spec.rb +++ b/spec/migrations/track_untracked_uploads_spec.rb @@ -62,8 +62,8 @@ describe TrackUntrackedUploads, :migration, :sidekiq do expect(appearance.reload.uploads.where("path like '%/header_logo/%'").first.attributes).to include(@appearance_header_logo_attributes) expect(user2.reload.uploads.first.attributes).to include(@user2_avatar_attributes) - expect(project2.reload.uploads.first.attributes).to include(@project2_avatar_attributes) - expect(project2.uploads.last.attributes).to include(@project2_markdown_attributes) + expect(project2.reload.uploads.where(uploader: 'AvatarUploader').first.attributes).to include(@project2_avatar_attributes) + expect(project2.uploads.where(uploader: 'FileUploader').first.attributes).to include(@project2_markdown_attributes) end it 'ignores already-tracked uploads' do @@ -71,8 +71,8 @@ describe TrackUntrackedUploads, :migration, :sidekiq do expect(appearance.reload.uploads.where("path like '%/logo/%'").first.attributes).to include(@appearance_logo_attributes) expect(user1.reload.uploads.first.attributes).to include(@user1_avatar_attributes) - expect(project1.reload.uploads.first.attributes).to include(@project1_avatar_attributes) - expect(project1.uploads.last.attributes).to include(@project1_markdown_attributes) + expect(project1.reload.uploads.where(uploader: 'AvatarUploader').first.attributes).to include(@project1_avatar_attributes) + expect(project1.uploads.where(uploader: 'FileUploader').first.attributes).to include(@project1_markdown_attributes) end it 'the temporary table untracked_files_for_uploads no longer exists' do |