# frozen_string_literal: true

module Gitlab
  module Sanitizers
    # Removes EXIF metadata from uploaded images by shelling out to the
    # `exiftool` command-line tool.
    #
    # Tags listed in WHITELISTED_TAGS are kept in the image; tags listed in
    # IGNORED_TAGS are not treated as a reason to rewrite the image. The IPTC
    # and XMP-iptcExt groups are excluded from both inspection and removal.
    class Exif
      # these tags are not removed from the image
      WHITELISTED_TAGS = %w(
        ResolutionUnit
        XResolution
        YResolution
        YCbCrSubSampling
        YCbCrPositioning
        BitsPerSample
        ImageHeight
        ImageWidth
        ImageSize
        Copyright
        CopyrightNotice
        Orientation
      ).freeze

      # these tags are common in exiftool output, these
      # do not contain any sensitive information, but
      # we don't need to preserve them when removing
      # exif tags
      IGNORED_TAGS = %w(
        ColorComponents
        EncodingProcess
        ExifByteOrder
        ExifToolVersion
        JFIFVersion
        Directory
        FileAccessDate
        FileInodeChangeDate
        FileModifyDate
        FileName
        FilePermissions
        FileSize
        SourceFile
        Megapixels
        FileType
        FileTypeExtension
        MIMEType
      ).freeze

      # Tags whose presence alone does not require the image to be rewritten.
      ALLOWED_TAGS = (WHITELISTED_TAGS + IGNORED_TAGS).freeze

      # exiftool arguments ("-TagName") naming the whitelisted tags; they are
      # passed after "-tagsFromFile @" in #exec_remove_exif!.
      EXCLUDE_PARAMS = WHITELISTED_TAGS.map { |tag| "-#{tag}" }.freeze

      attr_reader :logger

      # @param logger [#info, #error, #debug] destination for progress and
      #   failure messages
      def initialize(logger: Gitlab::AppLogger)
        @logger = logger
      end

      # Runs #clean over every Upload whose path ends in .jpg, .jpeg or .tiff
      # (case-insensitive).
      #
      # A failure while processing one upload is logged and does not stop the
      # iteration.
      #
      # @param start_id [#to_i, nil] first upload id to process (no lower
      #   bound when blank)
      # @param stop_id [#to_i, nil] last upload id to process (defaults to the
      #   id of `Upload.last` when blank)
      # @param dry_run [Boolean] when true, only log what would be removed
      # @param sleep_time [Numeric, nil] passed to `sleep` after each
      #   successfully processed upload
      # @param uploader [String, nil] when given, restricts uploads to the
      #   ones with this `uploader` value
      # @param since [Object, nil] when given, restricts uploads to the ones
      #   with `created_at` greater than this value
      #
      # rubocop: disable CodeReuse/ActiveRecord
      def batch_clean(start_id: nil, stop_id: nil, dry_run: true, sleep_time: nil, uploader: nil, since: nil)
        relation = Upload.where('lower(path) like ? or lower(path) like ? or lower(path) like ?',
                                '%.jpg', '%.jpeg', '%.tiff')
        relation = relation.where(uploader: uploader) if uploader
        relation = relation.where('created_at > ?', since) if since

        logger.info "running in dry run mode, no images will be rewritten" if dry_run

        find_params = {
          start: start_id.present? ? start_id.to_i : nil,
          finish: stop_id.present? ? stop_id.to_i : Upload.last&.id,
          batch_size: 1000
        }

        # The options must be splatted: find_each takes keyword arguments and
        # a positional Hash is no longer converted to keywords on Ruby 3.
        relation.find_each(**find_params) do |upload|
          clean(upload.retrieve_uploader, dry_run: dry_run)
          sleep sleep_time if sleep_time
        rescue => err
          logger.error "failed to sanitize #{upload_ref(upload)}: #{err.message}"
          # backtrace is nil for exceptions that were never raised by Ruby
          # itself, so guard the join to keep the error handler from failing
          logger.debug Array(err.backtrace).join("\n ")
        end
      end
      # rubocop: enable CodeReuse/ActiveRecord

      # Downloads the uploader's file into a temporary directory, checks which
      # tags would have to be removed and, unless `dry_run` is set, strips
      # them and stores the file back through the uploader.
      #
      # @param uploader [Object] must respond to `filename`, `read`, `upload`
      #   and (when not a dry run) `store!`
      # @param dry_run [Boolean] when true, the file is only inspected
      def clean(uploader, dry_run: true)
        Dir.mktmpdir('gitlab-exif') do |tmpdir|
          src_path = fetch_upload_to_file(uploader, tmpdir)

          to_remove = extra_tags(src_path)

          if to_remove.empty?
            logger.info "#{upload_ref(uploader.upload)}: only whitelisted tags present, skipping"
            break
          end

          logger.info "#{upload_ref(uploader.upload)}: found exif tags to remove: #{to_remove}"

          break if dry_run

          remove_and_store(tmpdir, src_path, uploader)
        end
      end

      # @param path [String] path of the image file to inspect
      # @return [Array<String>] names of the tags reported by exiftool that
      #   are not in ALLOWED_TAGS
      def extra_tags(path)
        exif_tags(path).keys - ALLOWED_TAGS
      end

      private

      # Strips the tags from the file at `src_path` and hands the rewritten
      # file to `uploader.store!`. The first argument is unused.
      def remove_and_store(_tmpdir, src_path, uploader)
        exec_remove_exif!(src_path)
        logger.info "#{upload_ref(uploader.upload)}: exif removed, storing"
        File.open(src_path, 'r') { |f| uploader.store!(f) }
      end

      # Rewrites the file at `path` in place using exiftool.
      #
      # @raise [RuntimeError] when exiftool exits with a non-zero status, when
      #   the resulting file is empty, or when its size equals the size of
      #   the backup of the original file
      def exec_remove_exif!(path)
        # IPTC and XMP-iptcExt groups may keep copyright information so
        # we always preserve them
        cmd = ["exiftool", "-all=", "-tagsFromFile", "@", *EXCLUDE_PARAMS, "--IPTC:all", "--XMP-iptcExt:all", path]
        output, status = Gitlab::Popen.popen(cmd)

        if status != 0
          raise "exiftool return code is #{status}: #{output}"
        end

        if File.size(path) == 0
          raise "size of file is 0"
        end

        # exiftool creates backup of the original file in filename_original
        old_path = "#{path}_original"
        if File.size(path) == File.size(old_path)
          raise "size of sanitized file is same as original size"
        end
      end

      # Writes the uploader's content into `dir` and returns the file path.
      def fetch_upload_to_file(uploader, dir)
        # upload is stored into the file with the original name - this filename
        # is used by carrierwave when storing the file back to the storage
        filename = File.join(dir, uploader.filename)

        File.open(filename, 'w') do |file|
          file.binmode
          file.write uploader.read
        end

        filename
      end

      # Short "id:path" identifier used in log messages.
      def upload_ref(upload)
        "#{upload.id}:#{upload.path}"
      end

      # Runs exiftool in JSON mode (IPTC and XMP-iptcExt groups excluded) and
      # returns the first element of the parsed output.
      #
      # @raise [RuntimeError] when exiftool exits with a non-zero status
      def exif_tags(path)
        cmd = ["exiftool", "-all", "-j", "-sort", "--IPTC:all", "--XMP-iptcExt:all", path]
        output, status = Gitlab::Popen.popen(cmd)

        raise "failed to get exif tags: #{output}" if status != 0

        Gitlab::Json.parse(output).first
      end
    end
  end
end