summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorStan Hu <stanhu@gmail.com>2017-12-07 02:34:58 +0000
committerStan Hu <stanhu@gmail.com>2017-12-07 02:34:58 +0000
commit29e39e55c3d4b5c6c34c6faec84b0dcd5a3efffa (patch)
treed6b3ca53d05fa47db768c8d0508fe51c81b0f6ac /lib
parentba44a57f101a87ebb9155485d5bde1287f7892cf (diff)
parent24c348f0d1270fe27268aa23e034473651b0cdf9 (diff)
downloadgitlab-ce-29e39e55c3d4b5c6c34c6faec84b0dcd5a3efffa.tar.gz
Merge branch 'mk-add-old-attachments-to-uploads-table' into 'master'
Add old files to uploads table See merge request gitlab-org/gitlab-ce!15270
Diffstat (limited to 'lib')
-rw-r--r--lib/gitlab/background_migration/populate_untracked_uploads.rb259
-rw-r--r--lib/gitlab/background_migration/prepare_untracked_uploads.rb163
-rw-r--r--lib/gitlab/database.rb10
-rw-r--r--lib/gitlab/utils.rb17
4 files changed, 447 insertions, 2 deletions
diff --git a/lib/gitlab/background_migration/populate_untracked_uploads.rb b/lib/gitlab/background_migration/populate_untracked_uploads.rb
new file mode 100644
index 00000000000..81e95e5832d
--- /dev/null
+++ b/lib/gitlab/background_migration/populate_untracked_uploads.rb
@@ -0,0 +1,259 @@
+# frozen_string_literal: true
+
+module Gitlab
+ module BackgroundMigration
+ # This class processes a batch of rows in `untracked_files_for_uploads` by
+ # adding each file to the `uploads` table if it does not exist.
+ class PopulateUntrackedUploads # rubocop:disable Metrics/ClassLength
+ # This class is responsible for producing the attributes necessary to
+ # track an uploaded file in the `uploads` table.
+ class UntrackedFile < ActiveRecord::Base # rubocop:disable Metrics/ClassLength, Metrics/LineLength
+ self.table_name = 'untracked_files_for_uploads'
+
+ # Ends with /:random_hex/:filename
+ FILE_UPLOADER_PATH = %r{/\h+/[^/]+\z}
+ FULL_PATH_CAPTURE = %r{\A(.+)#{FILE_UPLOADER_PATH}}
+
+ # These regex patterns are tested against a relative path, relative to
+ # the upload directory.
+ # For convenience, if there exists a capture group in the pattern, then
+ # it indicates the model_id.
+ PATH_PATTERNS = [
+ {
+ pattern: %r{\A-/system/appearance/logo/(\d+)/},
+ uploader: 'AttachmentUploader',
+ model_type: 'Appearance'
+ },
+ {
+ pattern: %r{\A-/system/appearance/header_logo/(\d+)/},
+ uploader: 'AttachmentUploader',
+ model_type: 'Appearance'
+ },
+ {
+ pattern: %r{\A-/system/note/attachment/(\d+)/},
+ uploader: 'AttachmentUploader',
+ model_type: 'Note'
+ },
+ {
+ pattern: %r{\A-/system/user/avatar/(\d+)/},
+ uploader: 'AvatarUploader',
+ model_type: 'User'
+ },
+ {
+ pattern: %r{\A-/system/group/avatar/(\d+)/},
+ uploader: 'AvatarUploader',
+ model_type: 'Namespace'
+ },
+ {
+ pattern: %r{\A-/system/project/avatar/(\d+)/},
+ uploader: 'AvatarUploader',
+ model_type: 'Project'
+ },
+ {
+ pattern: FILE_UPLOADER_PATH,
+ uploader: 'FileUploader',
+ model_type: 'Project'
+ }
+ ].freeze
+
+ def to_h
+ @upload_hash ||= {
+ path: upload_path,
+ uploader: uploader,
+ model_type: model_type,
+ model_id: model_id,
+ size: file_size,
+ checksum: checksum
+ }
+ end
+
+ def upload_path
+ # UntrackedFile#path is absolute, but Upload#path depends on uploader
+ @upload_path ||=
+ if uploader == 'FileUploader'
+ # Path relative to project directory in uploads
+ matchd = path_relative_to_upload_dir.match(FILE_UPLOADER_PATH)
+ matchd[0].sub(%r{\A/}, '') # remove leading slash
+ else
+ path
+ end
+ end
+
+ def uploader
+ matching_pattern_map[:uploader]
+ end
+
+ def model_type
+ matching_pattern_map[:model_type]
+ end
+
+ def model_id
+ return @model_id if defined?(@model_id)
+
+ pattern = matching_pattern_map[:pattern]
+ matchd = path_relative_to_upload_dir.match(pattern)
+
+ # If something is captured (matchd[1] is not nil), it is a model_id
+ # Only the FileUploader pattern will not match an ID
+ @model_id = matchd[1] ? matchd[1].to_i : file_uploader_model_id
+ end
+
+ def file_size
+ File.size(absolute_path)
+ end
+
+ def checksum
+ Digest::SHA256.file(absolute_path).hexdigest
+ end
+
+ private
+
+ def matching_pattern_map
+ @matching_pattern_map ||= PATH_PATTERNS.find do |path_pattern_map|
+ path_relative_to_upload_dir.match(path_pattern_map[:pattern])
+ end
+
+ unless @matching_pattern_map
+ raise "Unknown upload path pattern \"#{path}\""
+ end
+
+ @matching_pattern_map
+ end
+
+ def file_uploader_model_id
+ matchd = path_relative_to_upload_dir.match(FULL_PATH_CAPTURE)
+ not_found_msg = <<~MSG
+ Could not capture project full_path from a FileUploader path:
+ "#{path_relative_to_upload_dir}"
+ MSG
+ raise not_found_msg unless matchd
+
+ full_path = matchd[1]
+ project = Project.find_by_full_path(full_path)
+ return nil unless project
+
+ project.id
+ end
+
+ # Not including a leading slash
+ def path_relative_to_upload_dir
+ upload_dir = Gitlab::BackgroundMigration::PrepareUntrackedUploads::RELATIVE_UPLOAD_DIR # rubocop:disable Metrics/LineLength
+ base = %r{\A#{Regexp.escape(upload_dir)}/}
+ @path_relative_to_upload_dir ||= path.sub(base, '')
+ end
+
+ def absolute_path
+ File.join(CarrierWave.root, path)
+ end
+ end
+
+ # This class is used to query the `uploads` table.
+ class Upload < ActiveRecord::Base
+ self.table_name = 'uploads'
+ end
+
+ def perform(start_id, end_id)
+ return unless migrate?
+
+ files = UntrackedFile.where(id: start_id..end_id)
+ processed_files = insert_uploads_if_needed(files)
+ processed_files.delete_all
+
+ drop_temp_table_if_finished
+ end
+
+ private
+
+ def migrate?
+ UntrackedFile.table_exists? && Upload.table_exists?
+ end
+
+ def insert_uploads_if_needed(files)
+ filtered_files, error_files = filter_error_files(files)
+ filtered_files = filter_existing_uploads(filtered_files)
+ filtered_files = filter_deleted_models(filtered_files)
+ insert(filtered_files)
+
+ processed_files = files.where.not(id: error_files.map(&:id))
+ processed_files
+ end
+
+ def filter_error_files(files)
+ files.partition do |file|
+ begin
+ file.to_h
+ true
+ rescue => e
+ msg = <<~MSG
+ Error parsing path "#{file.path}":
+ #{e.message}
+ #{e.backtrace.join("\n ")}
+ MSG
+ Rails.logger.error(msg)
+ false
+ end
+ end
+ end
+
+ def filter_existing_uploads(files)
+ paths = files.map(&:upload_path)
+ existing_paths = Upload.where(path: paths).pluck(:path).to_set
+
+ files.reject do |file|
+ existing_paths.include?(file.upload_path)
+ end
+ end
+
+ # There are files on disk that are not in the uploads table because their
+ # model was deleted, and we don't delete the files on disk.
+ def filter_deleted_models(files)
+ ids = deleted_model_ids(files)
+
+ files.reject do |file|
+ ids[file.model_type].include?(file.model_id)
+ end
+ end
+
+ def deleted_model_ids(files)
+ ids = {
+ 'Appearance' => [],
+ 'Namespace' => [],
+ 'Note' => [],
+ 'Project' => [],
+ 'User' => []
+ }
+
+ # group model IDs by model type
+ files.each do |file|
+ ids[file.model_type] << file.model_id
+ end
+
+ ids.each do |model_type, model_ids|
+ model_class = Object.const_get(model_type)
+ found_ids = model_class.where(id: model_ids.uniq).pluck(:id)
+ deleted_ids = ids[model_type] - found_ids
+ ids[model_type] = deleted_ids
+ end
+
+ ids
+ end
+
+ def insert(files)
+ rows = files.map do |file|
+ file.to_h.merge(created_at: 'NOW()')
+ end
+
+ Gitlab::Database.bulk_insert('uploads',
+ rows,
+ disable_quote: :created_at)
+ end
+
+ def drop_temp_table_if_finished
+ if UntrackedFile.all.empty?
+ UntrackedFile.connection.drop_table(:untracked_files_for_uploads,
+ if_exists: true)
+ end
+ end
+ end
+ end
+end
diff --git a/lib/gitlab/background_migration/prepare_untracked_uploads.rb b/lib/gitlab/background_migration/prepare_untracked_uploads.rb
new file mode 100644
index 00000000000..476c46341ae
--- /dev/null
+++ b/lib/gitlab/background_migration/prepare_untracked_uploads.rb
@@ -0,0 +1,163 @@
+# frozen_string_literal: true
+
+module Gitlab
+ module BackgroundMigration
+ # This class finds all non-hashed uploaded file paths and saves them to a
+ # `untracked_files_for_uploads` table.
+ class PrepareUntrackedUploads # rubocop:disable Metrics/ClassLength
+ # For bulk_queue_background_migration_jobs_by_range
+ include Database::MigrationHelpers
+
+ FIND_BATCH_SIZE = 500
+ RELATIVE_UPLOAD_DIR = "uploads".freeze
+ ABSOLUTE_UPLOAD_DIR = "#{CarrierWave.root}/#{RELATIVE_UPLOAD_DIR}".freeze
+ FOLLOW_UP_MIGRATION = 'PopulateUntrackedUploads'.freeze
+ START_WITH_CARRIERWAVE_ROOT_REGEX = %r{\A#{CarrierWave.root}/}
+ EXCLUDED_HASHED_UPLOADS_PATH = "#{ABSOLUTE_UPLOAD_DIR}/@hashed/*".freeze
+ EXCLUDED_TMP_UPLOADS_PATH = "#{ABSOLUTE_UPLOAD_DIR}/tmp/*".freeze
+
+ # This class is used to iterate over batches of
+ # `untracked_files_for_uploads` rows.
+ class UntrackedFile < ActiveRecord::Base
+ include EachBatch
+
+ self.table_name = 'untracked_files_for_uploads'
+ end
+
+ def perform
+ ensure_temporary_tracking_table_exists
+
+ # Since Postgres < 9.5 does not have ON CONFLICT DO NOTHING, and since
+ # doing inserts-if-not-exists without ON CONFLICT DO NOTHING would be
+ # slow, start with an empty table for Postgres < 9.5.
+ # That way we can do bulk inserts at ~30x the speed of individual
+ # inserts (~20 minutes worth of inserts at GitLab.com scale instead of
+ # ~10 hours).
+ # In all other cases, installations will get both bulk inserts and the
+ # ability for these jobs to retry without having to clear and reinsert.
+ clear_untracked_file_paths unless can_bulk_insert_and_ignore_duplicates?
+
+ store_untracked_file_paths
+
+ schedule_populate_untracked_uploads_jobs
+ end
+
+ private
+
+ def ensure_temporary_tracking_table_exists
+ table_name = :untracked_files_for_uploads
+ unless UntrackedFile.connection.table_exists?(table_name)
+ UntrackedFile.connection.create_table table_name do |t|
+ t.string :path, limit: 600, null: false
+ t.index :path, unique: true
+ end
+ end
+ end
+
+ def clear_untracked_file_paths
+ UntrackedFile.delete_all
+ end
+
+ def store_untracked_file_paths
+ return unless Dir.exist?(ABSOLUTE_UPLOAD_DIR)
+
+ each_file_batch(ABSOLUTE_UPLOAD_DIR, FIND_BATCH_SIZE) do |file_paths|
+ insert_file_paths(file_paths)
+ end
+ end
+
+ def each_file_batch(search_dir, batch_size, &block)
+ cmd = build_find_command(search_dir)
+
+ Open3.popen2(*cmd) do |stdin, stdout, status_thread|
+ yield_paths_in_batches(stdout, batch_size, &block)
+
+ raise "Find command failed" unless status_thread.value.success?
+ end
+ end
+
+ def yield_paths_in_batches(stdout, batch_size, &block)
+ paths = []
+
+ stdout.each_line("\0") do |line|
+ paths << line.chomp("\0").sub(START_WITH_CARRIERWAVE_ROOT_REGEX, '')
+
+ if paths.size >= batch_size
+ yield(paths)
+ paths = []
+ end
+ end
+
+ yield(paths)
+ end
+
+ def build_find_command(search_dir)
+ cmd = %W[find -L #{search_dir}
+ -type f
+ ! ( -path #{EXCLUDED_HASHED_UPLOADS_PATH} -prune )
+ ! ( -path #{EXCLUDED_TMP_UPLOADS_PATH} -prune )
+ -print0]
+
+ ionice = which_ionice
+ cmd = %W[#{ionice} -c Idle] + cmd if ionice
+
+ log_msg = "PrepareUntrackedUploads find command: \"#{cmd.join(' ')}\""
+ Rails.logger.info log_msg
+
+ cmd
+ end
+
+ def which_ionice
+ Gitlab::Utils.which('ionice')
+ rescue StandardError
+ # In this case, returning false is relatively safe,
+ # even though it isn't very nice
+ false
+ end
+
+ def insert_file_paths(file_paths)
+ sql = insert_sql(file_paths)
+
+ ActiveRecord::Base.connection.execute(sql)
+ end
+
+ def insert_sql(file_paths)
+ if postgresql_pre_9_5?
+ "INSERT INTO #{table_columns_and_values_for_insert(file_paths)};"
+ elsif postgresql?
+ "INSERT INTO #{table_columns_and_values_for_insert(file_paths)}"\
+ " ON CONFLICT DO NOTHING;"
+ else # MySQL
+ "INSERT IGNORE INTO"\
+ " #{table_columns_and_values_for_insert(file_paths)};"
+ end
+ end
+
+ def table_columns_and_values_for_insert(file_paths)
+ values = file_paths.map do |file_path|
+ ActiveRecord::Base.send(:sanitize_sql_array, ['(?)', file_path]) # rubocop:disable GitlabSecurity/PublicSend, Metrics/LineLength
+ end.join(', ')
+
+ "#{UntrackedFile.table_name} (path) VALUES #{values}"
+ end
+
+ def postgresql?
+ @postgresql ||= Gitlab::Database.postgresql?
+ end
+
+ def can_bulk_insert_and_ignore_duplicates?
+ !postgresql_pre_9_5?
+ end
+
+ def postgresql_pre_9_5?
+ @postgresql_pre_9_5 ||= postgresql? &&
+ Gitlab::Database.version.to_f < 9.5
+ end
+
+ def schedule_populate_untracked_uploads_jobs
+ bulk_queue_background_migration_jobs_by_range(
+ UntrackedFile, FOLLOW_UP_MIGRATION)
+ end
+ end
+ end
+end
diff --git a/lib/gitlab/database.rb b/lib/gitlab/database.rb
index 96922e1a62f..e51794fef99 100644
--- a/lib/gitlab/database.rb
+++ b/lib/gitlab/database.rb
@@ -120,15 +120,21 @@ module Gitlab
# values.
# return_ids - When set to true the return value will be an Array of IDs of
# the inserted rows, this only works on PostgreSQL.
- def self.bulk_insert(table, rows, return_ids: false)
+ # disable_quote - A key or an Array of keys to exclude from quoting (You
+ # become responsible for protection from SQL injection for
+ # these keys!)
+ def self.bulk_insert(table, rows, return_ids: false, disable_quote: [])
return if rows.empty?
keys = rows.first.keys
columns = keys.map { |key| connection.quote_column_name(key) }
return_ids = false if mysql?
+ disable_quote = Array(disable_quote).to_set
tuples = rows.map do |row|
- row.values_at(*keys).map { |value| connection.quote(value) }
+ keys.map do |k|
+ disable_quote.include?(k) ? row[k] : connection.quote(row[k])
+ end
end
sql = <<-EOF
diff --git a/lib/gitlab/utils.rb b/lib/gitlab/utils.rb
index abb3d3a02c3..b3baaf036d8 100644
--- a/lib/gitlab/utils.rb
+++ b/lib/gitlab/utils.rb
@@ -46,5 +46,22 @@ module Gitlab
def random_string
Random.rand(Float::MAX.to_i).to_s(36)
end
+
+ # See: http://stackoverflow.com/questions/2108727/which-in-ruby-checking-if-program-exists-in-path-from-ruby
+ # Cross-platform way of finding an executable in the $PATH.
+ #
+ # which('ruby') #=> /usr/bin/ruby
+ def which(cmd, env = ENV)
+ exts = env['PATHEXT'] ? env['PATHEXT'].split(';') : ['']
+
+ env['PATH'].split(File::PATH_SEPARATOR).each do |path|
+ exts.each do |ext|
+ exe = File.join(path, "#{cmd}#{ext}")
+ return exe if File.executable?(exe) && !File.directory?(exe)
+ end
+ end
+
+ nil
+ end
end
end