diff options
Diffstat (limited to 'lib/gitlab/background_migration/prepare_untracked_uploads.rb')
-rw-r--r-- | lib/gitlab/background_migration/prepare_untracked_uploads.rb | 163 |
1 files changed, 163 insertions, 0 deletions
diff --git a/lib/gitlab/background_migration/prepare_untracked_uploads.rb b/lib/gitlab/background_migration/prepare_untracked_uploads.rb new file mode 100644 index 00000000000..476c46341ae --- /dev/null +++ b/lib/gitlab/background_migration/prepare_untracked_uploads.rb @@ -0,0 +1,163 @@ +# frozen_string_literal: true + +module Gitlab + module BackgroundMigration + # This class finds all non-hashed uploaded file paths and saves them to a + # `untracked_files_for_uploads` table. + class PrepareUntrackedUploads # rubocop:disable Metrics/ClassLength + # For bulk_queue_background_migration_jobs_by_range + include Database::MigrationHelpers + + FIND_BATCH_SIZE = 500 + RELATIVE_UPLOAD_DIR = "uploads".freeze + ABSOLUTE_UPLOAD_DIR = "#{CarrierWave.root}/#{RELATIVE_UPLOAD_DIR}".freeze + FOLLOW_UP_MIGRATION = 'PopulateUntrackedUploads'.freeze + START_WITH_CARRIERWAVE_ROOT_REGEX = %r{\A#{CarrierWave.root}/} + EXCLUDED_HASHED_UPLOADS_PATH = "#{ABSOLUTE_UPLOAD_DIR}/@hashed/*".freeze + EXCLUDED_TMP_UPLOADS_PATH = "#{ABSOLUTE_UPLOAD_DIR}/tmp/*".freeze + + # This class is used to iterate over batches of + # `untracked_files_for_uploads` rows. + class UntrackedFile < ActiveRecord::Base + include EachBatch + + self.table_name = 'untracked_files_for_uploads' + end + + def perform + ensure_temporary_tracking_table_exists + + # Since Postgres < 9.5 does not have ON CONFLICT DO NOTHING, and since + # doing inserts-if-not-exists without ON CONFLICT DO NOTHING would be + # slow, start with an empty table for Postgres < 9.5. + # That way we can do bulk inserts at ~30x the speed of individual + # inserts (~20 minutes worth of inserts at GitLab.com scale instead of + # ~10 hours). + # In all other cases, installations will get both bulk inserts and the + # ability for these jobs to retry without having to clear and reinsert. + clear_untracked_file_paths unless can_bulk_insert_and_ignore_duplicates? + + store_untracked_file_paths + + schedule_populate_untracked_uploads_jobs + end + + private + + def ensure_temporary_tracking_table_exists + table_name = :untracked_files_for_uploads + unless UntrackedFile.connection.table_exists?(table_name) + UntrackedFile.connection.create_table table_name do |t| + t.string :path, limit: 600, null: false + t.index :path, unique: true + end + end + end + + def clear_untracked_file_paths + UntrackedFile.delete_all + end + + def store_untracked_file_paths + return unless Dir.exist?(ABSOLUTE_UPLOAD_DIR) + + each_file_batch(ABSOLUTE_UPLOAD_DIR, FIND_BATCH_SIZE) do |file_paths| + insert_file_paths(file_paths) + end + end + + def each_file_batch(search_dir, batch_size, &block) + cmd = build_find_command(search_dir) + + Open3.popen2(*cmd) do |stdin, stdout, status_thread| + yield_paths_in_batches(stdout, batch_size, &block) + + raise "Find command failed" unless status_thread.value.success? + end + end + + def yield_paths_in_batches(stdout, batch_size, &block) + paths = [] + + stdout.each_line("\0") do |line| + paths << line.chomp("\0").sub(START_WITH_CARRIERWAVE_ROOT_REGEX, '') + + if paths.size >= batch_size + yield(paths) + paths = [] + end + end + + yield(paths) + end + + def build_find_command(search_dir) + cmd = %W[find -L #{search_dir} + -type f + ! ( -path #{EXCLUDED_HASHED_UPLOADS_PATH} -prune ) + ! ( -path #{EXCLUDED_TMP_UPLOADS_PATH} -prune ) + -print0] + + ionice = which_ionice + cmd = %W[#{ionice} -c Idle] + cmd if ionice + + log_msg = "PrepareUntrackedUploads find command: \"#{cmd.join(' ')}\"" + Rails.logger.info log_msg + + cmd + end + + def which_ionice + Gitlab::Utils.which('ionice') + rescue StandardError + # In this case, returning false is relatively safe, + # even though it isn't very nice + false + end + + def insert_file_paths(file_paths) + sql = insert_sql(file_paths) + + ActiveRecord::Base.connection.execute(sql) + end + + def insert_sql(file_paths) + if postgresql_pre_9_5? + "INSERT INTO #{table_columns_and_values_for_insert(file_paths)};" + elsif postgresql? + "INSERT INTO #{table_columns_and_values_for_insert(file_paths)}"\ + " ON CONFLICT DO NOTHING;" + else # MySQL + "INSERT IGNORE INTO"\ + " #{table_columns_and_values_for_insert(file_paths)};" + end + end + + def table_columns_and_values_for_insert(file_paths) + values = file_paths.map do |file_path| + ActiveRecord::Base.send(:sanitize_sql_array, ['(?)', file_path]) # rubocop:disable GitlabSecurity/PublicSend, Metrics/LineLength + end.join(', ') + + "#{UntrackedFile.table_name} (path) VALUES #{values}" + end + + def postgresql? + @postgresql ||= Gitlab::Database.postgresql? + end + + def can_bulk_insert_and_ignore_duplicates? + !postgresql_pre_9_5? + end + + def postgresql_pre_9_5? + @postgresql_pre_9_5 ||= postgresql? && + Gitlab::Database.version.to_f < 9.5 + end + + def schedule_populate_untracked_uploads_jobs + bulk_queue_background_migration_jobs_by_range( + UntrackedFile, FOLLOW_UP_MIGRATION) + end + end + end +end |