diff options
author | Jan Provaznik <jprovaznik@gitlab.com> | 2019-04-02 07:48:35 +0000 |
---|---|---|
committer | GitLab Release Tools Bot <robert+release-tools@gitlab.com> | 2019-04-02 07:48:35 +0000 |
commit | a466d97e62a89b320713da44d67d452284ad8282 (patch) | |
tree | 58f9b4a2cf07ebc875c6e9ba6168361dcb5773ae /lib | |
parent | 6557858faeb5ae56d18a2f4463d43e0bc22d700f (diff) | |
download | gitlab-ce-a466d97e62a89b320713da44d67d452284ad8282.tar.gz |
Rake task for removing exif from uploads
Adds a rake task which can be used for removing EXIF
data from existing uploads.
Diffstat (limited to 'lib')
-rw-r--r-- | lib/gitlab/sanitizers/exif.rb | 156 | ||||
-rw-r--r-- | lib/tasks/gitlab/uploads/sanitize.rake | 18 |
2 files changed, 174 insertions, 0 deletions
# frozen_string_literal: true

# ---------------------------------------------------------------------------
# lib/gitlab/sanitizers/exif.rb
# ---------------------------------------------------------------------------

module Gitlab
  module Sanitizers
    # Strips EXIF metadata from already-stored image uploads by shelling out
    # to `exiftool`. Tags listed in WHITELISTED_TAGS (plus the IPTC and
    # XMP-iptcExt groups) are kept; everything else is removed.
    class Exif
      # these tags are not removed from the image
      WHITELISTED_TAGS = %w(
        ResolutionUnit
        XResolution
        YResolution
        YCbCrSubSampling
        YCbCrPositioning
        BitsPerSample
        ImageHeight
        ImageWidth
        ImageSize
        Copyright
        CopyrightNotice
        Orientation
      ).freeze

      # these tags are common in exiftool output, these
      # do not contain any sensitive information, but
      # we don't need to preserve them when removing
      # exif tags
      IGNORED_TAGS = %w(
        ColorComponents
        EncodingProcess
        ExifByteOrder
        ExifToolVersion
        JFIFVersion
        Directory
        FileAccessDate
        FileInodeChangeDate
        FileModifyDate
        FileName
        FilePermissions
        FileSize
        SourceFile
        Megapixels
        FileType
        FileTypeExtension
        MIMEType
      ).freeze

      # Tags whose presence does not trigger a rewrite of the image.
      # Frozen so the derived list cannot drift from its two sources.
      ALLOWED_TAGS = (WHITELISTED_TAGS + IGNORED_TAGS).freeze

      # exiftool arguments ("-TagName") that copy each whitelisted tag back
      # from the original file after "-all=" has wiped everything.
      EXCLUDE_PARAMS = WHITELISTED_TAGS.map { |tag| "-#{tag}" }.freeze

      attr_reader :logger

      # @param logger [#info, #error, #debug] destination for progress and
      #   failure messages; defaults to the Rails logger
      def initialize(logger: Rails.logger)
        @logger = logger
      end

      # Walks over JPEG/TIFF uploads in id order and sanitizes each of them.
      #
      # A failure on one upload is logged and does not abort the batch. Note
      # that the throttling sleep is skipped for an upload whose cleaning
      # raised, because the rescue is entered before `sleep` is reached.
      #
      # @param start_id [Integer, String, nil] first upload id to process
      # @param stop_id [Integer, String, nil] last upload id to process;
      #   when blank, the id of the last upload at call time is used
      # @param dry_run [Boolean] when true, only report what would be removed
      # @param sleep_time [Numeric, nil] seconds to pause after each upload
      #
      # rubocop: disable CodeReuse/ActiveRecord
      def batch_clean(start_id: nil, stop_id: nil, dry_run: true, sleep_time: nil)
        relation = Upload.where('lower(path) like ? or lower(path) like ? or lower(path) like ?',
                                '%.jpg', '%.jpeg', '%.tiff')

        logger.info "running in dry run mode, no images will be rewritten" if dry_run

        find_params = {
          start: start_id.present? ? start_id.to_i : nil,
          finish: stop_id.present? ? stop_id.to_i : Upload.last&.id
        }

        relation.find_each(find_params) do |upload|
          clean(upload.build_uploader, dry_run: dry_run)
          sleep sleep_time if sleep_time
        rescue => err
          logger.error "failed to sanitize #{upload_ref(upload)}: #{err.message}"
          logger.debug err.backtrace.join("\n ")
        end
      end
      # rubocop: enable CodeReuse/ActiveRecord

      # Sanitizes a single upload.
      #
      # The upload is copied into a temporary directory, inspected, and (unless
      # +dry_run+ is set or nothing needs removing) rewritten and stored back
      # through the uploader.
      #
      # @param uploader [Object] uploader responding to #upload, #filename,
      #   #read and #store!
      # @param dry_run [Boolean] when true, only log the tags that would go
      def clean(uploader, dry_run: true)
        Dir.mktmpdir('gitlab-exif') do |tmpdir|
          src_path = fetch_upload_to_file(uploader, tmpdir)

          to_remove = extra_tags(src_path)

          if to_remove.empty?
            logger.info "#{upload_ref(uploader.upload)}: only whitelisted tags present, skipping"
            # `break` leaves the mktmpdir block early
            break
          end

          logger.info "#{upload_ref(uploader.upload)}: found exif tags to remove: #{to_remove}"

          break if dry_run

          remove_and_store(tmpdir, src_path, uploader)
        end
      end

      # @param path [String] path of a local image file
      # @return [Array<String>] tag names reported by exiftool that are not
      #   in ALLOWED_TAGS
      # @raise [RuntimeError] when exiftool fails or reports no file entry
      def extra_tags(path)
        exif_tags(path).keys - ALLOWED_TAGS
      end

      private

      # Strips the tags from the local copy and hands the result back to the
      # uploader. The first argument (the temporary directory) is unused and
      # is only kept for call compatibility.
      def remove_and_store(_tmpdir, src_path, uploader)
        exec_remove_exif!(src_path)
        logger.info "#{upload_ref(uploader.upload)}: exif removed, storing"
        File.open(src_path, 'r') { |f| uploader.store!(f) }
      end

      # Rewrites the file at +path+ in place without the non-whitelisted tags.
      #
      # @raise [RuntimeError] when exiftool exits non-zero, the result is
      #   empty, or the result has the same size as the original file
      def exec_remove_exif!(path)
        # IPTC and XMP-iptcExt groups may keep copyright information so
        # we always preserve them
        cmd = ["exiftool", "-all=", "-tagsFromFile", "@", *EXCLUDE_PARAMS, "--IPTC:all", "--XMP-iptcExt:all", path]
        output, status = Gitlab::Popen.popen(cmd)

        raise "exiftool return code is #{status}: #{output}" if status != 0
        raise "size of file is 0" if File.size(path) == 0

        # exiftool creates backup of the original file in filename_original
        old_path = "#{path}_original"

        # an unchanged size is treated as "nothing was stripped"
        raise "size of sanitized file is same as original size" if File.size(path) == File.size(old_path)
      end

      # Copies the upload's content into +dir+ and returns the local path.
      def fetch_upload_to_file(uploader, dir)
        # upload is stored into the file with the original name - this filename
        # is used by carrierwave when storing the file back to the storage
        filename = File.join(dir, uploader.filename)

        File.open(filename, 'wb') { |file| file.write(uploader.read) }

        filename
      end

      # Short "id:path" identifier used in log messages.
      def upload_ref(upload)
        "#{upload.id}:#{upload.path}"
      end

      # Runs exiftool in JSON mode and returns the tag hash of the file.
      #
      # @raise [RuntimeError] when exiftool exits non-zero or its JSON output
      #   contains no entry for the file
      def exif_tags(path)
        cmd = ["exiftool", "-all", "-j", "-sort", "--IPTC:all", "--XMP-iptcExt:all", path]
        output, status = Gitlab::Popen.popen(cmd)

        raise "failed to get exif tags: #{output}" if status != 0

        tags = JSON.parse(output).first
        # An empty JSON array would otherwise surface later as an opaque
        # NoMethodError on nil; fail here with the tool output instead.
        raise "failed to get exif tags: #{output}" if tags.nil?

        tags
      end
    end
  end
end

# ---------------------------------------------------------------------------
# lib/tasks/gitlab/uploads/sanitize.rake
# ---------------------------------------------------------------------------

namespace :gitlab do
  namespace :uploads do
    namespace :sanitize do
      # Arguments arrive as strings from the rake command line:
      # dry_run stays enabled unless it is exactly 'false', and sleep_time
      # (seconds between uploads) defaults to 0.3.
      desc 'GitLab | Uploads | Remove EXIF from images.'
      task :remove_exif, [:start_id, :stop_id, :dry_run, :sleep_time] => :environment do |_task, args|
        args.with_defaults(dry_run: 'true')
        args.with_defaults(sleep_time: 0.3)

        logger = Logger.new(STDOUT)

        sanitizer = Gitlab::Sanitizers::Exif.new(logger: logger)
        sanitizer.batch_clean(start_id: args.start_id, stop_id: args.stop_id,
                              dry_run: args.dry_run != 'false',
                              sleep_time: args.sleep_time.to_f)
      end
    end
  end
end