diff options
author | Stan Hu <stanhu@gmail.com> | 2017-12-07 02:34:58 +0000 |
---|---|---|
committer | Stan Hu <stanhu@gmail.com> | 2017-12-07 02:34:58 +0000 |
commit | 29e39e55c3d4b5c6c34c6faec84b0dcd5a3efffa (patch) | |
tree | d6b3ca53d05fa47db768c8d0508fe51c81b0f6ac /lib | |
parent | ba44a57f101a87ebb9155485d5bde1287f7892cf (diff) | |
parent | 24c348f0d1270fe27268aa23e034473651b0cdf9 (diff) | |
download | gitlab-ce-29e39e55c3d4b5c6c34c6faec84b0dcd5a3efffa.tar.gz |
Merge branch 'mk-add-old-attachments-to-uploads-table' into 'master'
Add old files to uploads table
See merge request gitlab-org/gitlab-ce!15270
Diffstat (limited to 'lib')
-rw-r--r-- | lib/gitlab/background_migration/populate_untracked_uploads.rb | 259 | ||||
-rw-r--r-- | lib/gitlab/background_migration/prepare_untracked_uploads.rb | 163 | ||||
-rw-r--r-- | lib/gitlab/database.rb | 10 | ||||
-rw-r--r-- | lib/gitlab/utils.rb | 17 |
4 files changed, 447 insertions, 2 deletions
diff --git a/lib/gitlab/background_migration/populate_untracked_uploads.rb b/lib/gitlab/background_migration/populate_untracked_uploads.rb new file mode 100644 index 00000000000..81e95e5832d --- /dev/null +++ b/lib/gitlab/background_migration/populate_untracked_uploads.rb @@ -0,0 +1,259 @@ +# frozen_string_literal: true + +module Gitlab + module BackgroundMigration + # This class processes a batch of rows in `untracked_files_for_uploads` by + # adding each file to the `uploads` table if it does not exist. + class PopulateUntrackedUploads # rubocop:disable Metrics/ClassLength + # This class is responsible for producing the attributes necessary to + # track an uploaded file in the `uploads` table. + class UntrackedFile < ActiveRecord::Base # rubocop:disable Metrics/ClassLength, Metrics/LineLength + self.table_name = 'untracked_files_for_uploads' + + # Ends with /:random_hex/:filename + FILE_UPLOADER_PATH = %r{/\h+/[^/]+\z} + FULL_PATH_CAPTURE = %r{\A(.+)#{FILE_UPLOADER_PATH}} + + # These regex patterns are tested against a relative path, relative to + # the upload directory. + # For convenience, if there exists a capture group in the pattern, then + # it indicates the model_id. + PATH_PATTERNS = [ + { + pattern: %r{\A-/system/appearance/logo/(\d+)/}, + uploader: 'AttachmentUploader', + model_type: 'Appearance' + }, + { + pattern: %r{\A-/system/appearance/header_logo/(\d+)/}, + uploader: 'AttachmentUploader', + model_type: 'Appearance' + }, + { + pattern: %r{\A-/system/note/attachment/(\d+)/}, + uploader: 'AttachmentUploader', + model_type: 'Note' + }, + { + pattern: %r{\A-/system/user/avatar/(\d+)/}, + uploader: 'AvatarUploader', + model_type: 'User' + }, + { + pattern: %r{\A-/system/group/avatar/(\d+)/}, + uploader: 'AvatarUploader', + model_type: 'Namespace' + }, + { + pattern: %r{\A-/system/project/avatar/(\d+)/}, + uploader: 'AvatarUploader', + model_type: 'Project' + }, + { + pattern: FILE_UPLOADER_PATH, + uploader: 'FileUploader', + model_type: 'Project' + } + ].freeze + + def to_h + @upload_hash ||= { + path: upload_path, + uploader: uploader, + model_type: model_type, + model_id: model_id, + size: file_size, + checksum: checksum + } + end + + def upload_path + # UntrackedFile#path is absolute, but Upload#path depends on uploader + @upload_path ||= + if uploader == 'FileUploader' + # Path relative to project directory in uploads + matchd = path_relative_to_upload_dir.match(FILE_UPLOADER_PATH) + matchd[0].sub(%r{\A/}, '') # remove leading slash + else + path + end + end + + def uploader + matching_pattern_map[:uploader] + end + + def model_type + matching_pattern_map[:model_type] + end + + def model_id + return @model_id if defined?(@model_id) + + pattern = matching_pattern_map[:pattern] + matchd = path_relative_to_upload_dir.match(pattern) + + # If something is captured (matchd[1] is not nil), it is a model_id + # Only the FileUploader pattern will not match an ID + @model_id = matchd[1] ? matchd[1].to_i : file_uploader_model_id + end + + def file_size + File.size(absolute_path) + end + + def checksum + Digest::SHA256.file(absolute_path).hexdigest + end + + private + + def matching_pattern_map + @matching_pattern_map ||= PATH_PATTERNS.find do |path_pattern_map| + path_relative_to_upload_dir.match(path_pattern_map[:pattern]) + end + + unless @matching_pattern_map + raise "Unknown upload path pattern \"#{path}\"" + end + + @matching_pattern_map + end + + def file_uploader_model_id + matchd = path_relative_to_upload_dir.match(FULL_PATH_CAPTURE) + not_found_msg = <<~MSG + Could not capture project full_path from a FileUploader path: + "#{path_relative_to_upload_dir}" + MSG + raise not_found_msg unless matchd + + full_path = matchd[1] + project = Project.find_by_full_path(full_path) + return nil unless project + + project.id + end + + # Not including a leading slash + def path_relative_to_upload_dir + upload_dir = Gitlab::BackgroundMigration::PrepareUntrackedUploads::RELATIVE_UPLOAD_DIR # rubocop:disable Metrics/LineLength + base = %r{\A#{Regexp.escape(upload_dir)}/} + @path_relative_to_upload_dir ||= path.sub(base, '') + end + + def absolute_path + File.join(CarrierWave.root, path) + end + end + + # This class is used to query the `uploads` table. + class Upload < ActiveRecord::Base + self.table_name = 'uploads' + end + + def perform(start_id, end_id) + return unless migrate? + + files = UntrackedFile.where(id: start_id..end_id) + processed_files = insert_uploads_if_needed(files) + processed_files.delete_all + + drop_temp_table_if_finished + end + + private + + def migrate? + UntrackedFile.table_exists? && Upload.table_exists? + end + + def insert_uploads_if_needed(files) + filtered_files, error_files = filter_error_files(files) + filtered_files = filter_existing_uploads(filtered_files) + filtered_files = filter_deleted_models(filtered_files) + insert(filtered_files) + + processed_files = files.where.not(id: error_files.map(&:id)) + processed_files + end + + def filter_error_files(files) + files.partition do |file| + begin + file.to_h + true + rescue => e + msg = <<~MSG + Error parsing path "#{file.path}": + #{e.message} + #{e.backtrace.join("\n ")} + MSG + Rails.logger.error(msg) + false + end + end + end + + def filter_existing_uploads(files) + paths = files.map(&:upload_path) + existing_paths = Upload.where(path: paths).pluck(:path).to_set + + files.reject do |file| + existing_paths.include?(file.upload_path) + end + end + + # There are files on disk that are not in the uploads table because their + # model was deleted, and we don't delete the files on disk. + def filter_deleted_models(files) + ids = deleted_model_ids(files) + + files.reject do |file| + ids[file.model_type].include?(file.model_id) + end + end + + def deleted_model_ids(files) + ids = { + 'Appearance' => [], + 'Namespace' => [], + 'Note' => [], + 'Project' => [], + 'User' => [] + } + + # group model IDs by model type + files.each do |file| + ids[file.model_type] << file.model_id + end + + ids.each do |model_type, model_ids| + model_class = Object.const_get(model_type) + found_ids = model_class.where(id: model_ids.uniq).pluck(:id) + deleted_ids = ids[model_type] - found_ids + ids[model_type] = deleted_ids + end + + ids + end + + def insert(files) + rows = files.map do |file| + file.to_h.merge(created_at: 'NOW()') + end + + Gitlab::Database.bulk_insert('uploads', + rows, + disable_quote: :created_at) + end + + def drop_temp_table_if_finished + if UntrackedFile.all.empty? + UntrackedFile.connection.drop_table(:untracked_files_for_uploads, + if_exists: true) + end + end + end + end +end diff --git a/lib/gitlab/background_migration/prepare_untracked_uploads.rb b/lib/gitlab/background_migration/prepare_untracked_uploads.rb new file mode 100644 index 00000000000..476c46341ae --- /dev/null +++ b/lib/gitlab/background_migration/prepare_untracked_uploads.rb @@ -0,0 +1,163 @@ +# frozen_string_literal: true + +module Gitlab + module BackgroundMigration + # This class finds all non-hashed uploaded file paths and saves them to a + # `untracked_files_for_uploads` table. + class PrepareUntrackedUploads # rubocop:disable Metrics/ClassLength + # For bulk_queue_background_migration_jobs_by_range + include Database::MigrationHelpers + + FIND_BATCH_SIZE = 500 + RELATIVE_UPLOAD_DIR = "uploads".freeze + ABSOLUTE_UPLOAD_DIR = "#{CarrierWave.root}/#{RELATIVE_UPLOAD_DIR}".freeze + FOLLOW_UP_MIGRATION = 'PopulateUntrackedUploads'.freeze + START_WITH_CARRIERWAVE_ROOT_REGEX = %r{\A#{CarrierWave.root}/} + EXCLUDED_HASHED_UPLOADS_PATH = "#{ABSOLUTE_UPLOAD_DIR}/@hashed/*".freeze + EXCLUDED_TMP_UPLOADS_PATH = "#{ABSOLUTE_UPLOAD_DIR}/tmp/*".freeze + + # This class is used to iterate over batches of + # `untracked_files_for_uploads` rows. + class UntrackedFile < ActiveRecord::Base + include EachBatch + + self.table_name = 'untracked_files_for_uploads' + end + + def perform + ensure_temporary_tracking_table_exists + + # Since Postgres < 9.5 does not have ON CONFLICT DO NOTHING, and since + # doing inserts-if-not-exists without ON CONFLICT DO NOTHING would be + # slow, start with an empty table for Postgres < 9.5. + # That way we can do bulk inserts at ~30x the speed of individual + # inserts (~20 minutes worth of inserts at GitLab.com scale instead of + # ~10 hours). + # In all other cases, installations will get both bulk inserts and the + # ability for these jobs to retry without having to clear and reinsert. + clear_untracked_file_paths unless can_bulk_insert_and_ignore_duplicates? + + store_untracked_file_paths + + schedule_populate_untracked_uploads_jobs + end + + private + + def ensure_temporary_tracking_table_exists + table_name = :untracked_files_for_uploads + unless UntrackedFile.connection.table_exists?(table_name) + UntrackedFile.connection.create_table table_name do |t| + t.string :path, limit: 600, null: false + t.index :path, unique: true + end + end + end + + def clear_untracked_file_paths + UntrackedFile.delete_all + end + + def store_untracked_file_paths + return unless Dir.exist?(ABSOLUTE_UPLOAD_DIR) + + each_file_batch(ABSOLUTE_UPLOAD_DIR, FIND_BATCH_SIZE) do |file_paths| + insert_file_paths(file_paths) + end + end + + def each_file_batch(search_dir, batch_size, &block) + cmd = build_find_command(search_dir) + + Open3.popen2(*cmd) do |stdin, stdout, status_thread| + yield_paths_in_batches(stdout, batch_size, &block) + + raise "Find command failed" unless status_thread.value.success? + end + end + + def yield_paths_in_batches(stdout, batch_size, &block) + paths = [] + + stdout.each_line("\0") do |line| + paths << line.chomp("\0").sub(START_WITH_CARRIERWAVE_ROOT_REGEX, '') + + if paths.size >= batch_size + yield(paths) + paths = [] + end + end + + yield(paths) + end + + def build_find_command(search_dir) + cmd = %W[find -L #{search_dir} + -type f + ! ( -path #{EXCLUDED_HASHED_UPLOADS_PATH} -prune ) + ! ( -path #{EXCLUDED_TMP_UPLOADS_PATH} -prune ) + -print0] + + ionice = which_ionice + cmd = %W[#{ionice} -c Idle] + cmd if ionice + + log_msg = "PrepareUntrackedUploads find command: \"#{cmd.join(' ')}\"" + Rails.logger.info log_msg + + cmd + end + + def which_ionice + Gitlab::Utils.which('ionice') + rescue StandardError + # In this case, returning false is relatively safe, + # even though it isn't very nice + false + end + + def insert_file_paths(file_paths) + sql = insert_sql(file_paths) + + ActiveRecord::Base.connection.execute(sql) + end + + def insert_sql(file_paths) + if postgresql_pre_9_5? + "INSERT INTO #{table_columns_and_values_for_insert(file_paths)};" + elsif postgresql? + "INSERT INTO #{table_columns_and_values_for_insert(file_paths)}"\ + " ON CONFLICT DO NOTHING;" + else # MySQL + "INSERT IGNORE INTO"\ + " #{table_columns_and_values_for_insert(file_paths)};" + end + end + + def table_columns_and_values_for_insert(file_paths) + values = file_paths.map do |file_path| + ActiveRecord::Base.send(:sanitize_sql_array, ['(?)', file_path]) # rubocop:disable GitlabSecurity/PublicSend, Metrics/LineLength + end.join(', ') + + "#{UntrackedFile.table_name} (path) VALUES #{values}" + end + + def postgresql? + @postgresql ||= Gitlab::Database.postgresql? + end + + def can_bulk_insert_and_ignore_duplicates? + !postgresql_pre_9_5? + end + + def postgresql_pre_9_5? + @postgresql_pre_9_5 ||= postgresql? && + Gitlab::Database.version.to_f < 9.5 + end + + def schedule_populate_untracked_uploads_jobs + bulk_queue_background_migration_jobs_by_range( + UntrackedFile, FOLLOW_UP_MIGRATION) + end + end + end +end diff --git a/lib/gitlab/database.rb b/lib/gitlab/database.rb index 96922e1a62f..e51794fef99 100644 --- a/lib/gitlab/database.rb +++ b/lib/gitlab/database.rb @@ -120,15 +120,21 @@ module Gitlab # values. # return_ids - When set to true the return value will be an Array of IDs of # the inserted rows, this only works on PostgreSQL. - def self.bulk_insert(table, rows, return_ids: false) + # disable_quote - A key or an Array of keys to exclude from quoting (You + # become responsible for protection from SQL injection for + # these keys!) + def self.bulk_insert(table, rows, return_ids: false, disable_quote: []) return if rows.empty? keys = rows.first.keys columns = keys.map { |key| connection.quote_column_name(key) } return_ids = false if mysql? + disable_quote = Array(disable_quote).to_set tuples = rows.map do |row| - row.values_at(*keys).map { |value| connection.quote(value) } + keys.map do |k| + disable_quote.include?(k) ? row[k] : connection.quote(row[k]) + end end sql = <<-EOF diff --git a/lib/gitlab/utils.rb b/lib/gitlab/utils.rb index abb3d3a02c3..b3baaf036d8 100644 --- a/lib/gitlab/utils.rb +++ b/lib/gitlab/utils.rb @@ -46,5 +46,22 @@ module Gitlab def random_string Random.rand(Float::MAX.to_i).to_s(36) end + + # See: http://stackoverflow.com/questions/2108727/which-in-ruby-checking-if-program-exists-in-path-from-ruby + # Cross-platform way of finding an executable in the $PATH. + # + # which('ruby') #=> /usr/bin/ruby + def which(cmd, env = ENV) + exts = env['PATHEXT'] ? env['PATHEXT'].split(';') : [''] + + env['PATH'].split(File::PATH_SEPARATOR).each do |path| + exts.each do |ext| + exe = File.join(path, "#{cmd}#{ext}") + return exe if File.executable?(exe) && !File.directory?(exe) + end + end + + nil + end end end |