From dabd91b2c8a42ac0d0c357190002a5a4b96a57a6 Mon Sep 17 00:00:00 2001
From: Toon Claes
Date: Thu, 13 Jun 2019 23:07:59 +0200
Subject: Add rake task to clean orphan artifact files

This adds the rake task rake gitlab:cleanup:orphan_job_artifact_files.
This rake task cleans all orphan job artifact files it can find on disk.

It performs a search on the complete folder of all artifacts on disk.
Then it filters out all the job artifact IDs for which it could not
find a record with matching ID in the database. For these, the file is
deleted from disk.
---
 lib/gitlab/cleanup/orphan_job_artifact_files.rb    | 150 +++++++++++++++++++++
 .../cleanup/orphan_job_artifact_files_batch.rb     |  90 +++++++++++++
 lib/tasks/gitlab/cleanup.rake                      |  28 ++++
 3 files changed, 268 insertions(+)
 create mode 100644 lib/gitlab/cleanup/orphan_job_artifact_files.rb
 create mode 100644 lib/gitlab/cleanup/orphan_job_artifact_files_batch.rb

(limited to 'lib')

diff --git a/lib/gitlab/cleanup/orphan_job_artifact_files.rb b/lib/gitlab/cleanup/orphan_job_artifact_files.rb
new file mode 100644
index 00000000000..ee7164b3e55
--- /dev/null
+++ b/lib/gitlab/cleanup/orphan_job_artifact_files.rb
@@ -0,0 +1,150 @@
+# frozen_string_literal: true
+
+module Gitlab
+  module Cleanup
+    # Scans the job artifact directory on disk with `find`, groups the artifact
+    # directories it finds into batches and lets OrphanJobArtifactFilesBatch
+    # remove those without a matching Ci::JobArtifact record. With dry_run
+    # enabled (the default) orphans are only logged.
+    class OrphanJobArtifactFiles
+      include Gitlab::Utils::StrongMemoize
+
+      ABSOLUTE_ARTIFACT_DIR = ::JobArtifactUploader.root.freeze
+      LOST_AND_FOUND = File.join(ABSOLUTE_ARTIFACT_DIR, '-', 'lost+found').freeze
+      BATCH_SIZE = 500
+      DEFAULT_NICENESS = 'Best-effort'
+
+      attr_accessor :batch, :total_found, :total_cleaned
+      attr_reader :limit, :dry_run, :niceness, :logger
+
+      # limit    - stop after this many orphans have been found (nil: no limit)
+      # dry_run  - when true, orphans are reported but not removed
+      # niceness - I/O scheduling class handed to `ionice --class`
+      # logger   - falls back to Rails.logger
+      def initialize(limit: nil, dry_run: true, niceness: nil, logger: nil)
+        @limit = limit
+        @dry_run = dry_run
+        @niceness = niceness || DEFAULT_NICENESS
+        @logger = logger || Rails.logger
+        @total_found = @total_cleaned = 0
+
+        new_batch!
+      end
+
+      # Streams the artifact directories reported by `find`, cleaning one batch
+      # at a time, and logs the totals when done.
+      def run!
+        log_info('Looking for orphan job artifacts to clean up')
+
+        find_artifacts do |artifact_file|
+          batch << artifact_file
+
+          clean_batch! if batch.full?
+          break if limit_reached?
+        end
+
+        # Flush the last, partially filled batch
+        clean_batch!
+
+        log_info("Processed #{total_found} job artifacts to find and clean #{total_cleaned} orphans.")
+      end
+
+      private
+
+      def new_batch!
+        self.batch = ::Gitlab::Cleanup::OrphanJobArtifactFilesBatch
+          .new(batch_size: batch_size, logger: logger, dry_run: dry_run)
+      end
+
+      def clean_batch!
+        batch.clean!
+
+        update_stats!(batch)
+
+        new_batch!
+      end
+
+      def update_stats!(batch)
+        self.total_found += batch.artifact_files.count
+        self.total_cleaned += batch.lost_and_found.count
+      end
+
+      def limit_reached?
+        return false unless limit
+
+        total_cleaned >= limit
+      end
+
+      # Size of the next batch: BATCH_SIZE, capped by what is left of the limit.
+      # Returns nil once the limit has been reached.
+      def batch_size
+        return BATCH_SIZE unless limit
+        return if limit_reached?
+
+        todo = limit - total_cleaned
+        [BATCH_SIZE, todo].min
+      end
+
+      # Yields the path of every artifact directory printed by `find`.
+      def find_artifacts
+        Open3.popen3(*find_command) do |stdin, stdout, stderr, status_thread|
+          stdout.each_line do |line|
+            # each_line keeps the trailing newline; strip it so the path
+            # refers to the real directory when it is removed later on.
+            yield line.chomp
+          end
+
+          log_error(stderr.read.color(:red)) unless status_thread.value.success?
+        end
+      end
+
+      def find_command
+        strong_memoize(:find_command) do
+          cmd = %W[find -L #{absolute_artifact_dir}]
+
+          # Search for Job Artifact IDs, they are found 6 directory
+          # levels deep. For example:
+          # shared/artifacts/2c/62/2c...a3/2019_02_27/836/628/job.log
+          #                  1  2  3       4          5   6
+          #                  |  |  |       ^- date    |   ^- Job Artifact ID
+          #                  |  |  |                  ^- Job ID
+          #                  ^--+--+- components of hashed storage project path
+          cmd += %w[-mindepth 6 -maxdepth 6]
+
+          # Artifact directories are named on their ID
+          cmd += %w[-type d]
+
+          if ionice
+            # \A and \z anchor the whole string; ^ and $ would accept any
+            # multi-line value that has a single valid line.
+            raise ArgumentError, 'Invalid niceness' unless niceness.match?(/\A\w[\w\-]*\z/)
+
+            cmd.unshift(*%W[#{ionice} --class #{niceness}])
+          end
+
+          log_info("find command: '#{cmd.join(' ')}'")
+
+          cmd
+        end
+      end
+
+      def absolute_artifact_dir
+        File.absolute_path(ABSOLUTE_ARTIFACT_DIR)
+      end
+
+      def ionice
+        strong_memoize(:ionice) do
+          Gitlab::Utils.which('ionice')
+        end
+      end
+
+      def log_info(msg, params = {})
+        logger.info("#{'[DRY RUN]' if dry_run} #{msg}")
+      end
+
+      def log_error(msg, params = {})
+        logger.error(msg)
+      end
+    end
+  end
+end
diff --git a/lib/gitlab/cleanup/orphan_job_artifact_files_batch.rb b/lib/gitlab/cleanup/orphan_job_artifact_files_batch.rb
new file mode 100644
index 00000000000..5c30258c0fc
--- /dev/null
+++ b/lib/gitlab/cleanup/orphan_job_artifact_files_batch.rb
@@ -0,0 +1,90 @@
+# frozen_string_literal: true
+
+module Gitlab
+  module Cleanup
+    # A bounded set of artifact directories found on disk. The ones without a
+    # matching Ci::JobArtifact record are the orphans ("lost and found").
+    class OrphanJobArtifactFilesBatch
+      BatchFull = Class.new(StandardError)
+
+      # Wraps the path of an artifact directory; the last path segment is
+      # read as the job artifact ID.
+      class ArtifactFile
+        attr_accessor :path
+
+        def initialize(path)
+          @path = path
+        end
+
+        def artifact_id
+          path.split('/').last.to_i
+        end
+      end
+
+      include Gitlab::Utils::StrongMemoize
+
+      attr_reader :batch_size, :dry_run
+      attr_accessor :artifact_files
+
+      def initialize(batch_size:, dry_run: true, logger: Rails.logger)
+        @batch_size = batch_size
+        @dry_run = dry_run
+        @logger = logger
+        @artifact_files = []
+      end
+
+      # Logs every orphan in the batch and, unless dry_run is set, removes it.
+      def clean!
+        return if artifact_files.empty?
+
+        lost_and_found.each do |artifact|
+          clean_one!(artifact)
+        end
+      end
+
+      def full?
+        artifact_files.count >= batch_size
+      end
+
+      # Adds the artifact directory at artifact_path to the batch.
+      # Raises BatchFull when the batch already holds batch_size entries.
+      def <<(artifact_path)
+        raise BatchFull, "Batch full! Already contains #{artifact_files.count} artifacts" if full?
+
+        artifact_files << ArtifactFile.new(artifact_path)
+      end
+
+      # The artifact files in this batch whose ID is not present in the
+      # database. Memoized, so it reflects the batch at the first call.
+      def lost_and_found
+        strong_memoize(:lost_and_found) do
+          artifact_file_ids = artifact_files.map(&:artifact_id)
+          # A Set keeps the membership test below constant-time per file
+          existing_artifact_ids = ::Ci::JobArtifact.id_in(artifact_file_ids).pluck_primary_key.to_set
+
+          artifact_files.reject { |artifact| existing_artifact_ids.include?(artifact.artifact_id) }
+        end
+      end
+
+      private
+
+      def clean_one!(artifact_file)
+        log_debug("Found orphan job artifact file @ #{artifact_file.path}")
+
+        remove_file!(artifact_file) unless dry_run
+      end
+
+      def remove_file!(artifact_file)
+        FileUtils.rm_rf(artifact_file.path)
+      end
+
+      def log_info(msg, params = {})
+        @logger.info("#{'[DRY RUN]' if dry_run} #{msg}")
+      end
+
+      def log_debug(msg, params = {})
+        @logger.debug(msg)
+      end
+    end
+  end
+end
diff --git a/lib/tasks/gitlab/cleanup.rake b/lib/tasks/gitlab/cleanup.rake
index 760331620ef..105ef417df3 100644
--- a/lib/tasks/gitlab/cleanup.rake
+++ b/lib/tasks/gitlab/cleanup.rake
@@ -115,6 +115,18 @@ namespace :gitlab do
       end
     end
 
+    desc 'GitLab | Cleanup | Clean orphan job artifact files'
+    task orphan_job_artifact_files: :gitlab_environment do
+      warn_user_is_not_gitlab
+
+      cleaner = Gitlab::Cleanup::OrphanJobArtifactFiles.new(limit: limit, dry_run: dry_run?, niceness: niceness, logger: logger)
+      cleaner.run!
+
+      if dry_run?
+        logger.info "To clean up these files run this command with DRY_RUN=false".color(:yellow)
+      end
+    end
+
     def remove?
       ENV['REMOVE'] == 'true'
     end
@@ -123,12 +135,28 @@ namespace :gitlab do
       ENV['DRY_RUN'] != 'false'
     end
 
+    # Any non-blank DEBUG value switches the stdout logger to debug level
+    def debug?
+      ENV['DEBUG'].present?
+    end
+
+    # Optional cap read from LIMIT as an integer (nil when unset)
+    def limit
+      ENV['LIMIT']&.to_i
+    end
+
+    # Optional ionice class read from NICENESS (nil when unset or blank)
+    def niceness
+      ENV['NICENESS'].presence
+    end
+
     def logger
       return @logger if defined?(@logger)
 
       @logger = if Rails.env.development? || Rails.env.production?
                   Logger.new(STDOUT).tap do |stdout_logger|
                     stdout_logger.extend(ActiveSupport::Logger.broadcast(Rails.logger))
+                    stdout_logger.level = debug? ? Logger::DEBUG : Logger::INFO
                   end
                 else
                   Rails.logger
-- 
cgit v1.2.1