diff options
author | Stan Hu <stanhu@gmail.com> | 2017-12-07 02:34:58 +0000 |
---|---|---|
committer | Stan Hu <stanhu@gmail.com> | 2017-12-07 02:34:58 +0000 |
commit | 29e39e55c3d4b5c6c34c6faec84b0dcd5a3efffa (patch) | |
tree | d6b3ca53d05fa47db768c8d0508fe51c81b0f6ac | |
parent | ba44a57f101a87ebb9155485d5bde1287f7892cf (diff) | |
parent | 24c348f0d1270fe27268aa23e034473651b0cdf9 (diff) | |
download | gitlab-ce-29e39e55c3d4b5c6c34c6faec84b0dcd5a3efffa.tar.gz |
Merge branch 'mk-add-old-attachments-to-uploads-table' into 'master'
Add old files to uploads table
See merge request gitlab-org/gitlab-ce!15270
-rw-r--r-- | changelogs/unreleased/mk-add-old-attachments-to-uploads-table.yml | 5 | ||||
-rw-r--r-- | db/migrate/20171103000000_set_uploads_path_size_for_mysql.rb | 25 | ||||
-rw-r--r-- | db/post_migrate/20171103140253_track_untracked_uploads.rb | 21 | ||||
-rw-r--r-- | db/schema.rb | 2 | ||||
-rw-r--r-- | lib/gitlab/background_migration/populate_untracked_uploads.rb | 259 | ||||
-rw-r--r-- | lib/gitlab/background_migration/prepare_untracked_uploads.rb | 163 | ||||
-rw-r--r-- | lib/gitlab/database.rb | 10 | ||||
-rw-r--r-- | lib/gitlab/utils.rb | 17 | ||||
-rw-r--r-- | spec/lib/gitlab/background_migration/populate_untracked_uploads_spec.rb | 510 | ||||
-rw-r--r-- | spec/lib/gitlab/background_migration/prepare_untracked_uploads_spec.rb | 242 | ||||
-rw-r--r-- | spec/lib/gitlab/database_spec.rb | 16 | ||||
-rw-r--r-- | spec/lib/gitlab/utils_spec.rb | 10 | ||||
-rw-r--r-- | spec/migrations/track_untracked_uploads_spec.rb | 27 | ||||
-rw-r--r-- | spec/support/track_untracked_uploads_helpers.rb | 20 |
14 files changed, 1323 insertions, 4 deletions
diff --git a/changelogs/unreleased/mk-add-old-attachments-to-uploads-table.yml b/changelogs/unreleased/mk-add-old-attachments-to-uploads-table.yml new file mode 100644 index 00000000000..499543ef883 --- /dev/null +++ b/changelogs/unreleased/mk-add-old-attachments-to-uploads-table.yml @@ -0,0 +1,5 @@ +--- +title: Add untracked files to uploads table +merge_request: 15270 +author: +type: other diff --git a/db/migrate/20171103000000_set_uploads_path_size_for_mysql.rb b/db/migrate/20171103000000_set_uploads_path_size_for_mysql.rb new file mode 100644 index 00000000000..1fbe505f804 --- /dev/null +++ b/db/migrate/20171103000000_set_uploads_path_size_for_mysql.rb @@ -0,0 +1,25 @@ +# See http://doc.gitlab.com/ce/development/migration_style_guide.html +# for more information on how to write migrations for GitLab. + +class SetUploadsPathSizeForMysql < ActiveRecord::Migration + include Gitlab::Database::MigrationHelpers + + # Set this constant to true if this migration requires downtime. + DOWNTIME = false + + def up + # We need at least 297 at the moment. For more detail on that number, see: + # https://gitlab.com/gitlab-org/gitlab-ce/issues/40168#what-is-the-expected-correct-behavior + # + # Rails + PostgreSQL `string` is equivalent to a `text` field, but + # Rails + MySQL `string` is `varchar(255)` by default. Also, note that we + # have an upper limit because with a unique index, MySQL has a max key + # length of 3072 bytes which seems to correspond to `varchar(1024)`. + change_column :uploads, :path, :string, limit: 511 + end + + def down + # It was unspecified, which is varchar(255) by default in Rails for MySQL. + change_column :uploads, :path, :string + end +end diff --git a/db/post_migrate/20171103140253_track_untracked_uploads.rb b/db/post_migrate/20171103140253_track_untracked_uploads.rb new file mode 100644 index 00000000000..548a94d2d38 --- /dev/null +++ b/db/post_migrate/20171103140253_track_untracked_uploads.rb @@ -0,0 +1,21 @@ +# See http://doc.gitlab.com/ce/development/migration_style_guide.html +# for more information on how to write migrations for GitLab. + +class TrackUntrackedUploads < ActiveRecord::Migration + include Gitlab::Database::MigrationHelpers + + disable_ddl_transaction! + + DOWNTIME = false + MIGRATION = 'PrepareUntrackedUploads' + + def up + BackgroundMigrationWorker.perform_async(MIGRATION) + end + + def down + if table_exists?(:untracked_files_for_uploads) + drop_table :untracked_files_for_uploads + end + end +end diff --git a/db/schema.rb b/db/schema.rb index 0984ca6487f..2be1b745342 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -1737,7 +1737,7 @@ ActiveRecord::Schema.define(version: 20171124150326) do create_table "uploads", force: :cascade do |t| t.integer "size", limit: 8, null: false - t.string "path", null: false + t.string "path", limit: 511, null: false t.string "checksum", limit: 64 t.integer "model_id" t.string "model_type" diff --git a/lib/gitlab/background_migration/populate_untracked_uploads.rb b/lib/gitlab/background_migration/populate_untracked_uploads.rb new file mode 100644 index 00000000000..81e95e5832d --- /dev/null +++ b/lib/gitlab/background_migration/populate_untracked_uploads.rb @@ -0,0 +1,259 @@ +# frozen_string_literal: true + +module Gitlab + module BackgroundMigration + # This class processes a batch of rows in `untracked_files_for_uploads` by + # adding each file to the `uploads` table if it does not exist. + class PopulateUntrackedUploads # rubocop:disable Metrics/ClassLength + # This class is responsible for producing the attributes necessary to + # track an uploaded file in the `uploads` table. + class UntrackedFile < ActiveRecord::Base # rubocop:disable Metrics/ClassLength, Metrics/LineLength + self.table_name = 'untracked_files_for_uploads' + + # Ends with /:random_hex/:filename + FILE_UPLOADER_PATH = %r{/\h+/[^/]+\z} + FULL_PATH_CAPTURE = %r{\A(.+)#{FILE_UPLOADER_PATH}} + + # These regex patterns are tested against a relative path, relative to + # the upload directory. + # For convenience, if there exists a capture group in the pattern, then + # it indicates the model_id. + PATH_PATTERNS = [ + { + pattern: %r{\A-/system/appearance/logo/(\d+)/}, + uploader: 'AttachmentUploader', + model_type: 'Appearance' + }, + { + pattern: %r{\A-/system/appearance/header_logo/(\d+)/}, + uploader: 'AttachmentUploader', + model_type: 'Appearance' + }, + { + pattern: %r{\A-/system/note/attachment/(\d+)/}, + uploader: 'AttachmentUploader', + model_type: 'Note' + }, + { + pattern: %r{\A-/system/user/avatar/(\d+)/}, + uploader: 'AvatarUploader', + model_type: 'User' + }, + { + pattern: %r{\A-/system/group/avatar/(\d+)/}, + uploader: 'AvatarUploader', + model_type: 'Namespace' + }, + { + pattern: %r{\A-/system/project/avatar/(\d+)/}, + uploader: 'AvatarUploader', + model_type: 'Project' + }, + { + pattern: FILE_UPLOADER_PATH, + uploader: 'FileUploader', + model_type: 'Project' + } + ].freeze + + def to_h + @upload_hash ||= { + path: upload_path, + uploader: uploader, + model_type: model_type, + model_id: model_id, + size: file_size, + checksum: checksum + } + end + + def upload_path + # UntrackedFile#path is absolute, but Upload#path depends on uploader + @upload_path ||= + if uploader == 'FileUploader' + # Path relative to project directory in uploads + matchd = path_relative_to_upload_dir.match(FILE_UPLOADER_PATH) + matchd[0].sub(%r{\A/}, '') # remove leading slash + else + path + end + end + + def uploader + matching_pattern_map[:uploader] + end + + def model_type + matching_pattern_map[:model_type] + end + + def model_id + return @model_id if defined?(@model_id) + + pattern = matching_pattern_map[:pattern] + matchd = path_relative_to_upload_dir.match(pattern) + + # If something is captured (matchd[1] is not nil), it is a model_id + # Only the FileUploader pattern will not match an ID + @model_id = matchd[1] ? matchd[1].to_i : file_uploader_model_id + end + + def file_size + File.size(absolute_path) + end + + def checksum + Digest::SHA256.file(absolute_path).hexdigest + end + + private + + def matching_pattern_map + @matching_pattern_map ||= PATH_PATTERNS.find do |path_pattern_map| + path_relative_to_upload_dir.match(path_pattern_map[:pattern]) + end + + unless @matching_pattern_map + raise "Unknown upload path pattern \"#{path}\"" + end + + @matching_pattern_map + end + + def file_uploader_model_id + matchd = path_relative_to_upload_dir.match(FULL_PATH_CAPTURE) + not_found_msg = <<~MSG + Could not capture project full_path from a FileUploader path: + "#{path_relative_to_upload_dir}" + MSG + raise not_found_msg unless matchd + + full_path = matchd[1] + project = Project.find_by_full_path(full_path) + return nil unless project + + project.id + end + + # Not including a leading slash + def path_relative_to_upload_dir + upload_dir = Gitlab::BackgroundMigration::PrepareUntrackedUploads::RELATIVE_UPLOAD_DIR # rubocop:disable Metrics/LineLength + base = %r{\A#{Regexp.escape(upload_dir)}/} + @path_relative_to_upload_dir ||= path.sub(base, '') + end + + def absolute_path + File.join(CarrierWave.root, path) + end + end + + # This class is used to query the `uploads` table. + class Upload < ActiveRecord::Base + self.table_name = 'uploads' + end + + def perform(start_id, end_id) + return unless migrate? + + files = UntrackedFile.where(id: start_id..end_id) + processed_files = insert_uploads_if_needed(files) + processed_files.delete_all + + drop_temp_table_if_finished + end + + private + + def migrate? + UntrackedFile.table_exists? && Upload.table_exists? + end + + def insert_uploads_if_needed(files) + filtered_files, error_files = filter_error_files(files) + filtered_files = filter_existing_uploads(filtered_files) + filtered_files = filter_deleted_models(filtered_files) + insert(filtered_files) + + processed_files = files.where.not(id: error_files.map(&:id)) + processed_files + end + + def filter_error_files(files) + files.partition do |file| + begin + file.to_h + true + rescue => e + msg = <<~MSG + Error parsing path "#{file.path}": + #{e.message} + #{e.backtrace.join("\n ")} + MSG + Rails.logger.error(msg) + false + end + end + end + + def filter_existing_uploads(files) + paths = files.map(&:upload_path) + existing_paths = Upload.where(path: paths).pluck(:path).to_set + + files.reject do |file| + existing_paths.include?(file.upload_path) + end + end + + # There are files on disk that are not in the uploads table because their + # model was deleted, and we don't delete the files on disk. + def filter_deleted_models(files) + ids = deleted_model_ids(files) + + files.reject do |file| + ids[file.model_type].include?(file.model_id) + end + end + + def deleted_model_ids(files) + ids = { + 'Appearance' => [], + 'Namespace' => [], + 'Note' => [], + 'Project' => [], + 'User' => [] + } + + # group model IDs by model type + files.each do |file| + ids[file.model_type] << file.model_id + end + + ids.each do |model_type, model_ids| + model_class = Object.const_get(model_type) + found_ids = model_class.where(id: model_ids.uniq).pluck(:id) + deleted_ids = ids[model_type] - found_ids + ids[model_type] = deleted_ids + end + + ids + end + + def insert(files) + rows = files.map do |file| + file.to_h.merge(created_at: 'NOW()') + end + + Gitlab::Database.bulk_insert('uploads', + rows, + disable_quote: :created_at) + end + + def drop_temp_table_if_finished + if UntrackedFile.all.empty? + UntrackedFile.connection.drop_table(:untracked_files_for_uploads, + if_exists: true) + end + end + end + end +end diff --git a/lib/gitlab/background_migration/prepare_untracked_uploads.rb b/lib/gitlab/background_migration/prepare_untracked_uploads.rb new file mode 100644 index 00000000000..476c46341ae --- /dev/null +++ b/lib/gitlab/background_migration/prepare_untracked_uploads.rb @@ -0,0 +1,163 @@ +# frozen_string_literal: true + +module Gitlab + module BackgroundMigration + # This class finds all non-hashed uploaded file paths and saves them to a + # `untracked_files_for_uploads` table. + class PrepareUntrackedUploads # rubocop:disable Metrics/ClassLength + # For bulk_queue_background_migration_jobs_by_range + include Database::MigrationHelpers + + FIND_BATCH_SIZE = 500 + RELATIVE_UPLOAD_DIR = "uploads".freeze + ABSOLUTE_UPLOAD_DIR = "#{CarrierWave.root}/#{RELATIVE_UPLOAD_DIR}".freeze + FOLLOW_UP_MIGRATION = 'PopulateUntrackedUploads'.freeze + START_WITH_CARRIERWAVE_ROOT_REGEX = %r{\A#{CarrierWave.root}/} + EXCLUDED_HASHED_UPLOADS_PATH = "#{ABSOLUTE_UPLOAD_DIR}/@hashed/*".freeze + EXCLUDED_TMP_UPLOADS_PATH = "#{ABSOLUTE_UPLOAD_DIR}/tmp/*".freeze + + # This class is used to iterate over batches of + # `untracked_files_for_uploads` rows. + class UntrackedFile < ActiveRecord::Base + include EachBatch + + self.table_name = 'untracked_files_for_uploads' + end + + def perform + ensure_temporary_tracking_table_exists + + # Since Postgres < 9.5 does not have ON CONFLICT DO NOTHING, and since + # doing inserts-if-not-exists without ON CONFLICT DO NOTHING would be + # slow, start with an empty table for Postgres < 9.5. + # That way we can do bulk inserts at ~30x the speed of individual + # inserts (~20 minutes worth of inserts at GitLab.com scale instead of + # ~10 hours). + # In all other cases, installations will get both bulk inserts and the + # ability for these jobs to retry without having to clear and reinsert. + clear_untracked_file_paths unless can_bulk_insert_and_ignore_duplicates? + + store_untracked_file_paths + + schedule_populate_untracked_uploads_jobs + end + + private + + def ensure_temporary_tracking_table_exists + table_name = :untracked_files_for_uploads + unless UntrackedFile.connection.table_exists?(table_name) + UntrackedFile.connection.create_table table_name do |t| + t.string :path, limit: 600, null: false + t.index :path, unique: true + end + end + end + + def clear_untracked_file_paths + UntrackedFile.delete_all + end + + def store_untracked_file_paths + return unless Dir.exist?(ABSOLUTE_UPLOAD_DIR) + + each_file_batch(ABSOLUTE_UPLOAD_DIR, FIND_BATCH_SIZE) do |file_paths| + insert_file_paths(file_paths) + end + end + + def each_file_batch(search_dir, batch_size, &block) + cmd = build_find_command(search_dir) + + Open3.popen2(*cmd) do |stdin, stdout, status_thread| + yield_paths_in_batches(stdout, batch_size, &block) + + raise "Find command failed" unless status_thread.value.success? + end + end + + def yield_paths_in_batches(stdout, batch_size, &block) + paths = [] + + stdout.each_line("\0") do |line| + paths << line.chomp("\0").sub(START_WITH_CARRIERWAVE_ROOT_REGEX, '') + + if paths.size >= batch_size + yield(paths) + paths = [] + end + end + + yield(paths) + end + + def build_find_command(search_dir) + cmd = %W[find -L #{search_dir} + -type f + ! ( -path #{EXCLUDED_HASHED_UPLOADS_PATH} -prune ) + ! ( -path #{EXCLUDED_TMP_UPLOADS_PATH} -prune ) + -print0] + + ionice = which_ionice + cmd = %W[#{ionice} -c Idle] + cmd if ionice + + log_msg = "PrepareUntrackedUploads find command: \"#{cmd.join(' ')}\"" + Rails.logger.info log_msg + + cmd + end + + def which_ionice + Gitlab::Utils.which('ionice') + rescue StandardError + # In this case, returning false is relatively safe, + # even though it isn't very nice + false + end + + def insert_file_paths(file_paths) + sql = insert_sql(file_paths) + + ActiveRecord::Base.connection.execute(sql) + end + + def insert_sql(file_paths) + if postgresql_pre_9_5? + "INSERT INTO #{table_columns_and_values_for_insert(file_paths)};" + elsif postgresql? + "INSERT INTO #{table_columns_and_values_for_insert(file_paths)}"\ + " ON CONFLICT DO NOTHING;" + else # MySQL + "INSERT IGNORE INTO"\ + " #{table_columns_and_values_for_insert(file_paths)};" + end + end + + def table_columns_and_values_for_insert(file_paths) + values = file_paths.map do |file_path| + ActiveRecord::Base.send(:sanitize_sql_array, ['(?)', file_path]) # rubocop:disable GitlabSecurity/PublicSend, Metrics/LineLength + end.join(', ') + + "#{UntrackedFile.table_name} (path) VALUES #{values}" + end + + def postgresql? + @postgresql ||= Gitlab::Database.postgresql? + end + + def can_bulk_insert_and_ignore_duplicates? + !postgresql_pre_9_5? + end + + def postgresql_pre_9_5? + @postgresql_pre_9_5 ||= postgresql? && + Gitlab::Database.version.to_f < 9.5 + end + + def schedule_populate_untracked_uploads_jobs + bulk_queue_background_migration_jobs_by_range( + UntrackedFile, FOLLOW_UP_MIGRATION) + end + end + end +end diff --git a/lib/gitlab/database.rb b/lib/gitlab/database.rb index 96922e1a62f..e51794fef99 100644 --- a/lib/gitlab/database.rb +++ b/lib/gitlab/database.rb @@ -120,15 +120,21 @@ module Gitlab # values. # return_ids - When set to true the return value will be an Array of IDs of # the inserted rows, this only works on PostgreSQL. - def self.bulk_insert(table, rows, return_ids: false) + # disable_quote - A key or an Array of keys to exclude from quoting (You + # become responsible for protection from SQL injection for + # these keys!) + def self.bulk_insert(table, rows, return_ids: false, disable_quote: []) return if rows.empty? keys = rows.first.keys columns = keys.map { |key| connection.quote_column_name(key) } return_ids = false if mysql? + disable_quote = Array(disable_quote).to_set tuples = rows.map do |row| - row.values_at(*keys).map { |value| connection.quote(value) } + keys.map do |k| + disable_quote.include?(k) ? row[k] : connection.quote(row[k]) + end end sql = <<-EOF diff --git a/lib/gitlab/utils.rb b/lib/gitlab/utils.rb index abb3d3a02c3..b3baaf036d8 100644 --- a/lib/gitlab/utils.rb +++ b/lib/gitlab/utils.rb @@ -46,5 +46,22 @@ module Gitlab def random_string Random.rand(Float::MAX.to_i).to_s(36) end + + # See: http://stackoverflow.com/questions/2108727/which-in-ruby-checking-if-program-exists-in-path-from-ruby + # Cross-platform way of finding an executable in the $PATH. + # + # which('ruby') #=> /usr/bin/ruby + def which(cmd, env = ENV) + exts = env['PATHEXT'] ? env['PATHEXT'].split(';') : [''] + + env['PATH'].split(File::PATH_SEPARATOR).each do |path| + exts.each do |ext| + exe = File.join(path, "#{cmd}#{ext}") + return exe if File.executable?(exe) && !File.directory?(exe) + end + end + + nil + end end end diff --git a/spec/lib/gitlab/background_migration/populate_untracked_uploads_spec.rb b/spec/lib/gitlab/background_migration/populate_untracked_uploads_spec.rb new file mode 100644 index 00000000000..b80df6956b0 --- /dev/null +++ b/spec/lib/gitlab/background_migration/populate_untracked_uploads_spec.rb @@ -0,0 +1,510 @@ +require 'spec_helper' + +describe Gitlab::BackgroundMigration::PopulateUntrackedUploads, :sidekiq do + include TrackUntrackedUploadsHelpers + + subject { described_class.new } + + let!(:untracked_files_for_uploads) { described_class::UntrackedFile } + let!(:uploads) { described_class::Upload } + + before do + DatabaseCleaner.clean + drop_temp_table_if_exists + ensure_temporary_tracking_table_exists + uploads.delete_all + end + + after(:all) do + drop_temp_table_if_exists + end + + context 'with untracked files and tracked files in untracked_files_for_uploads' do + let!(:appearance) { create_or_update_appearance(logo: uploaded_file, header_logo: uploaded_file) } + let!(:user1) { create(:user, :with_avatar) } + let!(:user2) { create(:user, :with_avatar) } + let!(:project1) { create(:project, :with_avatar) } + let!(:project2) { create(:project, :with_avatar) } + + before do + UploadService.new(project1, uploaded_file, FileUploader).execute # Markdown upload + UploadService.new(project2, uploaded_file, FileUploader).execute # Markdown upload + + # File records created by PrepareUntrackedUploads + untracked_files_for_uploads.create!(path: appearance.uploads.first.path) + untracked_files_for_uploads.create!(path: appearance.uploads.last.path) + untracked_files_for_uploads.create!(path: user1.uploads.first.path) + untracked_files_for_uploads.create!(path: user2.uploads.first.path) + untracked_files_for_uploads.create!(path: project1.uploads.first.path) + untracked_files_for_uploads.create!(path: project2.uploads.first.path) + untracked_files_for_uploads.create!(path: "#{Gitlab::BackgroundMigration::PrepareUntrackedUploads::RELATIVE_UPLOAD_DIR}/#{project1.full_path}/#{project1.uploads.last.path}") + untracked_files_for_uploads.create!(path: "#{Gitlab::BackgroundMigration::PrepareUntrackedUploads::RELATIVE_UPLOAD_DIR}/#{project2.full_path}/#{project2.uploads.last.path}") + + # Untrack 4 files + user2.uploads.delete_all + project2.uploads.delete_all # 2 files: avatar and a Markdown upload + appearance.uploads.where("path like '%header_logo%'").delete_all + end + + it 'adds untracked files to the uploads table' do + expect do + subject.perform(1, untracked_files_for_uploads.last.id) + end.to change { uploads.count }.from(4).to(8) + + expect(user2.uploads.count).to eq(1) + expect(project2.uploads.count).to eq(2) + expect(appearance.uploads.count).to eq(2) + end + + it 'deletes rows after processing them' do + expect(subject).to receive(:drop_temp_table_if_finished) # Don't drop the table so we can look at it + + expect do + subject.perform(1, untracked_files_for_uploads.last.id) + end.to change { untracked_files_for_uploads.count }.from(8).to(0) + end + + it 'does not create duplicate uploads of already tracked files' do + subject.perform(1, untracked_files_for_uploads.last.id) + + expect(user1.uploads.count).to eq(1) + expect(project1.uploads.count).to eq(2) + expect(appearance.uploads.count).to eq(2) + end + + it 'uses the start and end batch ids [only 1st half]' do + ids = untracked_files_for_uploads.all.order(:id).pluck(:id) + start_id = ids[0] + end_id = ids[3] + + expect do + subject.perform(start_id, end_id) + end.to change { uploads.count }.from(4).to(6) + + expect(user1.uploads.count).to eq(1) + expect(user2.uploads.count).to eq(1) + expect(appearance.uploads.count).to eq(2) + expect(project1.uploads.count).to eq(2) + expect(project2.uploads.count).to eq(0) + + # Only 4 have been either confirmed or added to uploads + expect(untracked_files_for_uploads.count).to eq(4) + end + + it 'uses the start and end batch ids [only 2nd half]' do + ids = untracked_files_for_uploads.all.order(:id).pluck(:id) + start_id = ids[4] + end_id = ids[7] + + expect do + subject.perform(start_id, end_id) + end.to change { uploads.count }.from(4).to(6) + + expect(user1.uploads.count).to eq(1) + expect(user2.uploads.count).to eq(0) + expect(appearance.uploads.count).to eq(1) + expect(project1.uploads.count).to eq(2) + expect(project2.uploads.count).to eq(2) + + # Only 4 have been either confirmed or added to uploads + expect(untracked_files_for_uploads.count).to eq(4) + end + + it 'does not drop the temporary tracking table after processing the batch, if there are still untracked rows' do + subject.perform(1, untracked_files_for_uploads.last.id - 1) + + expect(ActiveRecord::Base.connection.table_exists?(:untracked_files_for_uploads)).to be_truthy + end + + it 'drops the temporary tracking table after processing the batch, if there are no untracked rows left' do + subject.perform(1, untracked_files_for_uploads.last.id) + + expect(ActiveRecord::Base.connection.table_exists?(:untracked_files_for_uploads)).to be_falsey + end + + it 'does not block a whole batch because of one bad path' do + untracked_files_for_uploads.create!(path: "#{Gitlab::BackgroundMigration::PrepareUntrackedUploads::RELATIVE_UPLOAD_DIR}/#{project2.full_path}/._7d37bf4c747916390e596744117d5d1a") + expect(untracked_files_for_uploads.count).to eq(9) + expect(uploads.count).to eq(4) + + subject.perform(1, untracked_files_for_uploads.last.id) + + expect(untracked_files_for_uploads.count).to eq(1) + expect(uploads.count).to eq(8) + end + + it 'an unparseable path is shown in error output' do + bad_path = "#{Gitlab::BackgroundMigration::PrepareUntrackedUploads::RELATIVE_UPLOAD_DIR}/#{project2.full_path}/._7d37bf4c747916390e596744117d5d1a" + untracked_files_for_uploads.create!(path: bad_path) + + expect(Rails.logger).to receive(:error).with(/Error parsing path "#{bad_path}":/) + + subject.perform(1, untracked_files_for_uploads.last.id) + end + end + + context 'with no untracked files' do + it 'does not add to the uploads table (and does not raise error)' do + expect do + subject.perform(1, 1000) + end.not_to change { uploads.count }.from(0) + end + end + + describe 'upload outcomes for each path pattern' do + shared_examples_for 'non_markdown_file' do + let!(:expected_upload_attrs) { model.uploads.first.attributes.slice('path', 'uploader', 'size', 'checksum') } + let!(:untracked_file) { untracked_files_for_uploads.create!(path: expected_upload_attrs['path']) } + + before do + model.uploads.delete_all + end + + it 'creates an Upload record' do + expect do + subject.perform(1, untracked_files_for_uploads.last.id) + end.to change { model.reload.uploads.count }.from(0).to(1) + + expect(model.uploads.first.attributes).to include(expected_upload_attrs) + end + end + + context 'for an appearance logo file path' do + let(:model) { create_or_update_appearance(logo: uploaded_file) } + + it_behaves_like 'non_markdown_file' + end + + context 'for an appearance header_logo file path' do + let(:model) { create_or_update_appearance(header_logo: uploaded_file) } + + it_behaves_like 'non_markdown_file' + end + + context 'for a pre-Markdown Note attachment file path' do + class Note < ActiveRecord::Base + has_many :uploads, as: :model, dependent: :destroy + end + + let(:model) { create(:note, :with_attachment) } + + it_behaves_like 'non_markdown_file' + end + + context 'for a user avatar file path' do + let(:model) { create(:user, :with_avatar) } + + it_behaves_like 'non_markdown_file' + end + + context 'for a group avatar file path' do + let(:model) { create(:group, :with_avatar) } + + it_behaves_like 'non_markdown_file' + end + + context 'for a project avatar file path' do + let(:model) { create(:project, :with_avatar) } + + it_behaves_like 'non_markdown_file' + end + + context 'for a project Markdown attachment (notes, issues, MR descriptions) file path' do + let(:model) { create(:project) } + + before do + # Upload the file + UploadService.new(model, uploaded_file, FileUploader).execute + + # Create the untracked_files_for_uploads record + untracked_files_for_uploads.create!(path: "#{Gitlab::BackgroundMigration::PrepareUntrackedUploads::RELATIVE_UPLOAD_DIR}/#{model.full_path}/#{model.uploads.first.path}") + + # Save the expected upload attributes + @expected_upload_attrs = model.reload.uploads.first.attributes.slice('path', 'uploader', 'size', 'checksum') + + # Untrack the file + model.reload.uploads.delete_all + end + + it 'creates an Upload record' do + expect do + subject.perform(1, untracked_files_for_uploads.last.id) + end.to change { model.reload.uploads.count }.from(0).to(1) + + expect(model.uploads.first.attributes).to include(@expected_upload_attrs) + end + end + end +end + +describe Gitlab::BackgroundMigration::PopulateUntrackedUploads::UntrackedFile do + include TrackUntrackedUploadsHelpers + + let(:upload_class) { Gitlab::BackgroundMigration::PopulateUntrackedUploads::Upload } + + before(:all) do + ensure_temporary_tracking_table_exists + end + + after(:all) do + drop_temp_table_if_exists + end + + describe '#upload_path' do + def assert_upload_path(file_path, expected_upload_path) + untracked_file = create_untracked_file(file_path) + + expect(untracked_file.upload_path).to eq(expected_upload_path) + end + + context 'for an appearance logo file path' do + it 'returns the file path relative to the CarrierWave root' do + assert_upload_path('/-/system/appearance/logo/1/some_logo.jpg', 'uploads/-/system/appearance/logo/1/some_logo.jpg') + end + end + + context 'for an appearance header_logo file path' do + it 'returns the file path relative to the CarrierWave root' do + assert_upload_path('/-/system/appearance/header_logo/1/some_logo.jpg', 'uploads/-/system/appearance/header_logo/1/some_logo.jpg') + end + end + + context 'for a pre-Markdown Note attachment file path' do + it 'returns the file path relative to the CarrierWave root' do + assert_upload_path('/-/system/note/attachment/1234/some_attachment.pdf', 'uploads/-/system/note/attachment/1234/some_attachment.pdf') + end + end + + context 'for a user avatar file path' do + it 'returns the file path relative to the CarrierWave root' do + assert_upload_path('/-/system/user/avatar/1234/avatar.jpg', 'uploads/-/system/user/avatar/1234/avatar.jpg') + end + end + + context 'for a group avatar file path' do + it 'returns the file path relative to the CarrierWave root' do + assert_upload_path('/-/system/group/avatar/1234/avatar.jpg', 'uploads/-/system/group/avatar/1234/avatar.jpg') + end + end + + context 'for a project avatar file path' do + it 'returns the file path relative to the CarrierWave root' do + assert_upload_path('/-/system/project/avatar/1234/avatar.jpg', 'uploads/-/system/project/avatar/1234/avatar.jpg') + end + end + + context 'for a project Markdown attachment (notes, issues, MR descriptions) file path' do + it 'returns the file path relative to the project directory in uploads' do + project = create(:project) + random_hex = SecureRandom.hex + + assert_upload_path("/#{project.full_path}/#{random_hex}/Some file.jpg", "#{random_hex}/Some file.jpg") + end + end + end + + describe '#uploader' do + def assert_uploader(file_path, expected_uploader) + untracked_file = create_untracked_file(file_path) + + expect(untracked_file.uploader).to eq(expected_uploader) + end + + context 'for an appearance logo file path' do + it 'returns AttachmentUploader as a string' do + assert_uploader('/-/system/appearance/logo/1/some_logo.jpg', 'AttachmentUploader') + end + end + + context 'for an appearance header_logo file path' do + it 'returns AttachmentUploader as a string' do + assert_uploader('/-/system/appearance/header_logo/1/some_logo.jpg', 'AttachmentUploader') + end + end + + context 'for a pre-Markdown Note attachment file path' do + it 'returns AttachmentUploader as a string' do + assert_uploader('/-/system/note/attachment/1234/some_attachment.pdf', 'AttachmentUploader') + end + end + + context 'for a user avatar file path' do + it 'returns AvatarUploader as a string' do + assert_uploader('/-/system/user/avatar/1234/avatar.jpg', 'AvatarUploader') + end + end + + context 'for a group avatar file path' do + it 'returns AvatarUploader as a string' do + assert_uploader('/-/system/group/avatar/1234/avatar.jpg', 'AvatarUploader') + end + end + + context 'for a project avatar file path' do + it 'returns AvatarUploader as a string' do + assert_uploader('/-/system/project/avatar/1234/avatar.jpg', 'AvatarUploader') + end + end + + context 'for a project Markdown attachment (notes, issues, MR descriptions) file path' do + it 'returns FileUploader as a string' do + project = create(:project) + + assert_uploader("/#{project.full_path}/#{SecureRandom.hex}/Some file.jpg", 'FileUploader') + end + end + end + + describe '#model_type' do + def assert_model_type(file_path, expected_model_type) + untracked_file = create_untracked_file(file_path) + + expect(untracked_file.model_type).to eq(expected_model_type) + end + + context 'for an appearance logo file path' do + it 'returns Appearance as a string' do + assert_model_type('/-/system/appearance/logo/1/some_logo.jpg', 'Appearance') + end + end + + context 'for an appearance header_logo file path' do + it 'returns Appearance as a string' do + assert_model_type('/-/system/appearance/header_logo/1/some_logo.jpg', 'Appearance') + end + end + + context 'for a pre-Markdown Note attachment file path' do + it 'returns Note as a string' do + assert_model_type('/-/system/note/attachment/1234/some_attachment.pdf', 'Note') + end + end + + context 'for a user avatar file path' do + it 'returns User as a string' do + assert_model_type('/-/system/user/avatar/1234/avatar.jpg', 'User') + end + end + + context 'for a group avatar file path' do + it 'returns Namespace as a string' do + assert_model_type('/-/system/group/avatar/1234/avatar.jpg', 'Namespace') + end + end + + context 'for a project avatar file path' do + it 'returns Project as a string' do + assert_model_type('/-/system/project/avatar/1234/avatar.jpg', 'Project') + end + end + + context 'for a project Markdown attachment (notes, issues, MR descriptions) file path' do + it 'returns Project as a string' do + project = create(:project) + + assert_model_type("/#{project.full_path}/#{SecureRandom.hex}/Some file.jpg", 'Project') + end + end + end + + describe '#model_id' do + def assert_model_id(file_path, expected_model_id) + untracked_file = create_untracked_file(file_path) + + expect(untracked_file.model_id).to eq(expected_model_id) + end + + context 'for an appearance logo file path' do + it 'returns the ID as a string' do + assert_model_id('/-/system/appearance/logo/1/some_logo.jpg', 1) + end + end + + context 'for an appearance header_logo file path' do + it 'returns the ID as a string' do + assert_model_id('/-/system/appearance/header_logo/1/some_logo.jpg', 1) + end + end + + context 'for a pre-Markdown Note attachment file path' do + it 'returns the ID as a string' do + assert_model_id('/-/system/note/attachment/1234/some_attachment.pdf', 1234) + end + end + + context 'for a user avatar file path' do + it 'returns the ID as a string' do + assert_model_id('/-/system/user/avatar/1234/avatar.jpg', 1234) + end + end + + context 'for a group avatar file path' do + it 'returns the ID as a string' do + assert_model_id('/-/system/group/avatar/1234/avatar.jpg', 1234) + end + end + + context 'for a project avatar file path' do + it 'returns the ID as a string' do + assert_model_id('/-/system/project/avatar/1234/avatar.jpg', 1234) + end + end + + context 'for a project Markdown attachment (notes, issues, MR descriptions) file path' do + it 'returns the ID as a string' do + project = create(:project) + + assert_model_id("/#{project.full_path}/#{SecureRandom.hex}/Some file.jpg", project.id) + end + end + end + + describe '#file_size' do + context 'for an appearance logo file path' do + let(:appearance) { create_or_update_appearance(logo: uploaded_file) } + let(:untracked_file) { described_class.create!(path: appearance.uploads.first.path) } + + it 'returns the file size' do + expect(untracked_file.file_size).to eq(35255) + end + + it 'returns the same thing that CarrierWave would return' do + expect(untracked_file.file_size).to eq(appearance.logo.size) + end + end + + context 'for a project avatar file path' do + let(:project) { create(:project, avatar: uploaded_file) } + let(:untracked_file) { described_class.create!(path: project.uploads.first.path) } + + it 'returns the file size' do + expect(untracked_file.file_size).to eq(35255) + end + + it 'returns the same thing that CarrierWave would return' do + expect(untracked_file.file_size).to eq(project.avatar.size) + end + end + + context 'for a project Markdown attachment (notes, issues, MR descriptions) file path' do + let(:project) { create(:project) } + let(:untracked_file) { create_untracked_file("/#{project.full_path}/#{project.uploads.first.path}") } + + before do + UploadService.new(project, uploaded_file, FileUploader).execute + end + + it 'returns the file size' do + expect(untracked_file.file_size).to eq(35255) + end + + it 'returns the same thing that CarrierWave would return' do + expect(untracked_file.file_size).to eq(project.uploads.first.size) + end + end + end + + def create_untracked_file(path_relative_to_upload_dir) + described_class.create!(path: "#{Gitlab::BackgroundMigration::PrepareUntrackedUploads::RELATIVE_UPLOAD_DIR}#{path_relative_to_upload_dir}") + end +end diff --git a/spec/lib/gitlab/background_migration/prepare_untracked_uploads_spec.rb b/spec/lib/gitlab/background_migration/prepare_untracked_uploads_spec.rb new file mode 100644 index 00000000000..cd3f1a45270 --- /dev/null +++ b/spec/lib/gitlab/background_migration/prepare_untracked_uploads_spec.rb @@ -0,0 +1,242 @@ +require 'spec_helper' + +describe Gitlab::BackgroundMigration::PrepareUntrackedUploads, :sidekiq do + include TrackUntrackedUploadsHelpers + + let!(:untracked_files_for_uploads) { described_class::UntrackedFile } + + matcher :be_scheduled_migration do |*expected| + match do |migration| + BackgroundMigrationWorker.jobs.any? do |job| + job['args'] == [migration, expected] + end + end + + failure_message do |migration| + "Migration `#{migration}` with args `#{expected.inspect}` not scheduled!" + end + end + + before do + DatabaseCleaner.clean + + drop_temp_table_if_exists + end + + after do + drop_temp_table_if_exists + end + + around do |example| + # Especially important so the follow-up migration does not get run + Sidekiq::Testing.fake! do + example.run + end + end + + it 'ensures the untracked_files_for_uploads table exists' do + expect do + described_class.new.perform + end.to change { ActiveRecord::Base.connection.table_exists?(:untracked_files_for_uploads) }.from(false).to(true) + end + + it 'has a path field long enough for really long paths' do + described_class.new.perform + + component = 'a' * 255 + + long_path = [ + 'uploads', + component, # project.full_path + component # filename + ].flatten.join('/') + + record = untracked_files_for_uploads.create!(path: long_path) + expect(record.reload.path.size).to eq(519) + end + + context "test bulk insert with ON CONFLICT DO NOTHING or IGNORE" do + around do |example| + # If this is CI, we use Postgres 9.2 so this whole context should be + # skipped since we're unable to use ON CONFLICT DO NOTHING or IGNORE. + if described_class.new.send(:can_bulk_insert_and_ignore_duplicates?) + example.run + end + end + + context 'when files were uploaded before and after hashed storage was enabled' do + let!(:appearance) { create_or_update_appearance(logo: uploaded_file, header_logo: uploaded_file) } + let!(:user) { create(:user, :with_avatar) } + let!(:project1) { create(:project, :with_avatar) } + let(:project2) { create(:project) } # instantiate after enabling hashed_storage + + before do + # Markdown upload before enabling hashed_storage + UploadService.new(project1, uploaded_file, FileUploader).execute + + stub_application_setting(hashed_storage_enabled: true) + + # Markdown upload after enabling hashed_storage + UploadService.new(project2, uploaded_file, FileUploader).execute + end + + it 'adds unhashed files to the untracked_files_for_uploads table' do + described_class.new.perform + + expect(untracked_files_for_uploads.count).to eq(5) + end + + it 'adds files with paths relative to CarrierWave.root' do + described_class.new.perform + untracked_files_for_uploads.all.each do |file| + expect(file.path.start_with?('uploads/')).to be_truthy + end + end + + it 'does not add hashed files to the untracked_files_for_uploads table' do + described_class.new.perform + + hashed_file_path = project2.uploads.where(uploader: 'FileUploader').first.path + expect(untracked_files_for_uploads.where("path like '%#{hashed_file_path}%'").exists?).to be_falsey + end + + it 'correctly schedules the follow-up background migration jobs' do + described_class.new.perform + + expect(described_class::FOLLOW_UP_MIGRATION).to be_scheduled_migration(1, 5) + expect(BackgroundMigrationWorker.jobs.size).to eq(1) + end + + # E.g. from a previous failed run of this background migration + context 'when there is existing data in untracked_files_for_uploads' do + before do + described_class.new.perform + end + + it 'does not error or produce duplicates of existing data' do + expect do + described_class.new.perform + end.not_to change { untracked_files_for_uploads.count }.from(5) + end + end + + # E.g. The installation is in use at the time of migration, and someone has + # just uploaded a file + context 'when there are files in /uploads/tmp' do + let(:tmp_file) { Rails.root.join(described_class::ABSOLUTE_UPLOAD_DIR, 'tmp', 'some_file.jpg') } + + before do + FileUtils.touch(tmp_file) + end + + after do + FileUtils.rm(tmp_file) + end + + it 'does not add files from /uploads/tmp' do + described_class.new.perform + + expect(untracked_files_for_uploads.count).to eq(5) + end + end + end + end + + context 'test bulk insert without ON CONFLICT DO NOTHING or IGNORE' do + before do + # If this is CI, we use Postgres 9.2 so this stub has no effect. + # + # If this is being run on Postgres 9.5+ or MySQL, then this stub allows us + # to test the bulk insert functionality without ON CONFLICT DO NOTHING or + # IGNORE. + allow_any_instance_of(described_class).to receive(:postgresql_pre_9_5?).and_return(true) + end + + context 'when files were uploaded before and after hashed storage was enabled' do + let!(:appearance) { create_or_update_appearance(logo: uploaded_file, header_logo: uploaded_file) } + let!(:user) { create(:user, :with_avatar) } + let!(:project1) { create(:project, :with_avatar) } + let(:project2) { create(:project) } # instantiate after enabling hashed_storage + + before do + # Markdown upload before enabling hashed_storage + UploadService.new(project1, uploaded_file, FileUploader).execute + + stub_application_setting(hashed_storage_enabled: true) + + # Markdown upload after enabling hashed_storage + UploadService.new(project2, uploaded_file, FileUploader).execute + end + + it 'adds unhashed files to the untracked_files_for_uploads table' do + described_class.new.perform + + expect(untracked_files_for_uploads.count).to eq(5) + end + + it 'adds files with paths relative to CarrierWave.root' do + described_class.new.perform + untracked_files_for_uploads.all.each do |file| + expect(file.path.start_with?('uploads/')).to be_truthy + end + end + + it 'does not add hashed files to the untracked_files_for_uploads table' do + described_class.new.perform + + hashed_file_path = project2.uploads.where(uploader: 'FileUploader').first.path + expect(untracked_files_for_uploads.where("path like '%#{hashed_file_path}%'").exists?).to be_falsey + end + + it 'correctly schedules the follow-up background migration jobs' do + described_class.new.perform + + expect(described_class::FOLLOW_UP_MIGRATION).to be_scheduled_migration(1, 5) + expect(BackgroundMigrationWorker.jobs.size).to eq(1) + end + + # E.g. from a previous failed run of this background migration + context 'when there is existing data in untracked_files_for_uploads' do + before do + described_class.new.perform + end + + it 'does not error or produce duplicates of existing data' do + expect do + described_class.new.perform + end.not_to change { untracked_files_for_uploads.count }.from(5) + end + end + + # E.g. The installation is in use at the time of migration, and someone has + # just uploaded a file + context 'when there are files in /uploads/tmp' do + let(:tmp_file) { Rails.root.join(described_class::ABSOLUTE_UPLOAD_DIR, 'tmp', 'some_file.jpg') } + + before do + FileUtils.touch(tmp_file) + end + + after do + FileUtils.rm(tmp_file) + end + + it 'does not add files from /uploads/tmp' do + described_class.new.perform + + expect(untracked_files_for_uploads.count).to eq(5) + end + end + end + end + + # Very new or lightly-used installations that are running this migration + # may not have an upload directory because they have no uploads. + context 'when no files were ever uploaded' do + it 'does not add to the untracked_files_for_uploads table (and does not raise error)' do + described_class.new.perform + + expect(untracked_files_for_uploads.count).to eq(0) + end + end +end diff --git a/spec/lib/gitlab/database_spec.rb b/spec/lib/gitlab/database_spec.rb index a5657b81952..b2f13fae73f 100644 --- a/spec/lib/gitlab/database_spec.rb +++ b/spec/lib/gitlab/database_spec.rb @@ -221,6 +221,22 @@ describe Gitlab::Database do described_class.bulk_insert('test', rows) end + it 'does not quote values of a column in the disable_quote option' do + [1, 2, 4, 5].each do |i| + expect(connection).to receive(:quote).with(i) + end + + described_class.bulk_insert('test', rows, disable_quote: :c) + end + + it 'does not quote values of columns in the disable_quote option' do + [2, 5].each do |i| + expect(connection).to receive(:quote).with(i) + end + + described_class.bulk_insert('test', rows, disable_quote: [:a, :c]) + end + it 'handles non-UTF-8 data' do expect { described_class.bulk_insert('test', [{ a: "\255" }]) }.not_to raise_error end diff --git a/spec/lib/gitlab/utils_spec.rb b/spec/lib/gitlab/utils_spec.rb index 3137a72fdc4..e872a5290c5 100644 --- a/spec/lib/gitlab/utils_spec.rb +++ b/spec/lib/gitlab/utils_spec.rb @@ -1,7 +1,7 @@ require 'spec_helper' describe Gitlab::Utils do - delegate :to_boolean, :boolean_to_yes_no, :slugify, :random_string, to: :described_class + delegate :to_boolean, :boolean_to_yes_no, :slugify, :random_string, :which, to: :described_class describe '.slugify' do { @@ -59,4 +59,12 @@ describe Gitlab::Utils do expect(random_string).to be_kind_of(String) end end + + describe '.which' do + it 'finds the full path to an executable binary' do + expect(File).to receive(:executable?).with('/bin/sh').and_return(true) + + expect(which('sh', 'PATH' => '/bin')).to eq('/bin/sh') + end + end end diff --git a/spec/migrations/track_untracked_uploads_spec.rb b/spec/migrations/track_untracked_uploads_spec.rb new file mode 100644 index 00000000000..7fe7a140e2f --- /dev/null +++ b/spec/migrations/track_untracked_uploads_spec.rb @@ -0,0 +1,27 @@ +require 'spec_helper' +require Rails.root.join('db', 'post_migrate', '20171103140253_track_untracked_uploads') + +describe TrackUntrackedUploads, :migration, :sidekiq do + include TrackUntrackedUploadsHelpers + + matcher :be_scheduled_migration do + match do |migration| + BackgroundMigrationWorker.jobs.any? do |job| + job['args'] == [migration] + end + end + + failure_message do |migration| + "Migration `#{migration}` with args `#{expected.inspect}` not scheduled!" + end + end + + it 'correctly schedules the follow-up background migration' do + Sidekiq::Testing.fake! do + migrate! + + expect(described_class::MIGRATION).to be_scheduled_migration + expect(BackgroundMigrationWorker.jobs.size).to eq(1) + end + end +end diff --git a/spec/support/track_untracked_uploads_helpers.rb b/spec/support/track_untracked_uploads_helpers.rb new file mode 100644 index 00000000000..d05eda08201 --- /dev/null +++ b/spec/support/track_untracked_uploads_helpers.rb @@ -0,0 +1,20 @@ +module TrackUntrackedUploadsHelpers + def uploaded_file + fixture_path = Rails.root.join('spec', 'fixtures', 'rails_sample.jpg') + fixture_file_upload(fixture_path) + end + + def ensure_temporary_tracking_table_exists + Gitlab::BackgroundMigration::PrepareUntrackedUploads.new.send(:ensure_temporary_tracking_table_exists) + end + + def drop_temp_table_if_exists + ActiveRecord::Base.connection.drop_table(:untracked_files_for_uploads) if ActiveRecord::Base.connection.table_exists?(:untracked_files_for_uploads) + end + + def create_or_update_appearance(attrs) + a = Appearance.first_or_initialize(title: 'foo', description: 'bar') + a.update!(attrs) + a + end +end |