summaryrefslogtreecommitdiff
path: root/lib/gitlab/sanitizers
diff options
context:
space:
mode:
author Jan Provaznik <jprovaznik@gitlab.com> 2019-04-02 07:48:35 +0000
committer GitLab Release Tools Bot <robert+release-tools@gitlab.com> 2019-04-02 07:48:35 +0000
commit a466d97e62a89b320713da44d67d452284ad8282 (patch)
tree 58f9b4a2cf07ebc875c6e9ba6168361dcb5773ae /lib/gitlab/sanitizers
parent 6557858faeb5ae56d18a2f4463d43e0bc22d700f (diff)
download gitlab-ce-a466d97e62a89b320713da44d67d452284ad8282.tar.gz
Rake task for removing exif from uploads
Adds a rake task which can be used for removing EXIF data from existing uploads.
Diffstat (limited to 'lib/gitlab/sanitizers')
-rw-r--r-- lib/gitlab/sanitizers/exif.rb 156
1 files changed, 156 insertions, 0 deletions
diff --git a/lib/gitlab/sanitizers/exif.rb b/lib/gitlab/sanitizers/exif.rb
new file mode 100644
index 00000000000..0928ccdc324
--- /dev/null
+++ b/lib/gitlab/sanitizers/exif.rb
@@ -0,0 +1,156 @@
+# frozen_string_literal: true
+
+module Gitlab
+ module Sanitizers
+ class Exif
+ # these tags are not removed from the image
+ WHITELISTED_TAGS = %w(
+ ResolutionUnit
+ XResolution
+ YResolution
+ YCbCrSubSampling
+ YCbCrPositioning
+ BitsPerSample
+ ImageHeight
+ ImageWidth
+ ImageSize
+ Copyright
+ CopyrightNotice
+ Orientation
+ ).freeze
+
+ # these tags are common in exiftool output, these
+ # do not contain any sensitive information, but
+ # we don't need to preserve them when removing
+ # exif tags
+ IGNORED_TAGS = %w(
+ ColorComponents
+ EncodingProcess
+ ExifByteOrder
+ ExifToolVersion
+ JFIFVersion
+ Directory
+ FileAccessDate
+ FileInodeChangeDate
+ FileModifyDate
+ FileName
+ FilePermissions
+ FileSize
+ SourceFile
+ Megapixels
+ FileType
+ FileTypeExtension
+ MIMEType
+ ).freeze
+
+ ALLOWED_TAGS = WHITELISTED_TAGS + IGNORED_TAGS
+ EXCLUDE_PARAMS = WHITELISTED_TAGS.map { |tag| "-#{tag}" }
+
+ attr_reader :logger
+
+ def initialize(logger: Rails.logger)
+ @logger = logger
+ end
+
+ # rubocop: disable CodeReuse/ActiveRecord
+ def batch_clean(start_id: nil, stop_id: nil, dry_run: true, sleep_time: nil)
+ relation = Upload.where('lower(path) like ? or lower(path) like ? or lower(path) like ?',
+ '%.jpg', '%.jpeg', '%.tiff')
+
+ logger.info "running in dry run mode, no images will be rewritten" if dry_run
+
+ find_params = {
+ start: start_id.present? ? start_id.to_i : nil,
+ finish: stop_id.present? ? stop_id.to_i : Upload.last&.id
+ }
+
+ relation.find_each(find_params) do |upload|
+ clean(upload.build_uploader, dry_run: dry_run)
+ sleep sleep_time if sleep_time
+ rescue => err
+ logger.error "failed to sanitize #{upload_ref(upload)}: #{err.message}"
+ logger.debug err.backtrace.join("\n ")
+ end
+ end
+ # rubocop: enable CodeReuse/ActiveRecord
+
+ def clean(uploader, dry_run: true)
+ Dir.mktmpdir('gitlab-exif') do |tmpdir|
+ src_path = fetch_upload_to_file(uploader, tmpdir)
+
+ to_remove = extra_tags(src_path)
+
+ if to_remove.empty?
+ logger.info "#{upload_ref(uploader.upload)}: only whitelisted tags present, skipping"
+ break
+ end
+
+ logger.info "#{upload_ref(uploader.upload)}: found exif tags to remove: #{to_remove}"
+
+ break if dry_run
+
+ remove_and_store(tmpdir, src_path, uploader)
+ end
+ end
+
+ def extra_tags(path)
+ exif_tags(path).keys - ALLOWED_TAGS
+ end
+
+ private
+
+ def remove_and_store(tmpdir, src_path, uploader)
+ exec_remove_exif!(src_path)
+ logger.info "#{upload_ref(uploader.upload)}: exif removed, storing"
+ File.open(src_path, 'r') { |f| uploader.store!(f) }
+ end
+
+ def exec_remove_exif!(path)
+ # IPTC and XMP-iptcExt groups may keep copyright information so
+ # we always preserve them
+ cmd = ["exiftool", "-all=", "-tagsFromFile", "@", *EXCLUDE_PARAMS, "--IPTC:all", "--XMP-iptcExt:all", path]
+ output, status = Gitlab::Popen.popen(cmd)
+
+ if status != 0
+ raise "exiftool return code is #{status}: #{output}"
+ end
+
+ if File.size(path) == 0
+ raise "size of file is 0"
+ end
+
+ # exiftool creates backup of the original file in filename_original
+ old_path = "#{path}_original"
+ if File.size(path) == File.size(old_path)
+ raise "size of sanitized file is same as original size"
+ end
+ end
+
+ def fetch_upload_to_file(uploader, dir)
+ # upload is stored into the file with the original name - this filename
+ # is used by carrierwave when storing the file back to the storage
+ filename = File.join(dir, uploader.filename)
+
+ File.open(filename, 'w') do |file|
+ file.binmode
+ file.write uploader.read
+ end
+
+ filename
+ end
+
+ def upload_ref(upload)
+ "#{upload.id}:#{upload.path}"
+ end
+
+ def exif_tags(path)
+ cmd = ["exiftool", "-all", "-j", "-sort", "--IPTC:all", "--XMP-iptcExt:all", path]
+ output, status = Gitlab::Popen.popen(cmd)
+
+ raise "failed to get exif tags: #{output}" if status != 0
+
+ JSON.parse(output).first
+ end
+ end
+ end
+end