1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
|
# frozen_string_literal: true
module Gitlab
  module Sanitizers
    # Removes EXIF metadata from uploaded images by shelling out to
    # `exiftool` (through Gitlab::Popen). Tags listed in WHITELISTED_TAGS
    # and the IPTC / XMP-iptcExt groups are preserved.
    class Exif
      # these tags are not removed from the image
      WHITELISTED_TAGS = %w(
        ResolutionUnit
        XResolution
        YResolution
        YCbCrSubSampling
        YCbCrPositioning
        BitsPerSample
        ImageHeight
        ImageWidth
        ImageSize
        Copyright
        CopyrightNotice
        Orientation
      ).freeze

      # these tags are common in exiftool output, these
      # do not contain any sensitive information, but
      # we don't need to preserve them when removing
      # exif tags
      IGNORED_TAGS = %w(
        ColorComponents
        EncodingProcess
        ExifByteOrder
        ExifToolVersion
        JFIFVersion
        Directory
        FileAccessDate
        FileInodeChangeDate
        FileModifyDate
        FileName
        FilePermissions
        FileSize
        SourceFile
        Megapixels
        FileType
        FileTypeExtension
        MIMEType
      ).freeze

      # an image that only carries these tags does not need to be rewritten
      ALLOWED_TAGS = (WHITELISTED_TAGS + IGNORED_TAGS).freeze

      # "-TagName" arguments handed to exiftool after `-tagsFromFile @`
      # so that the whitelisted tags survive the `-all=` wipe
      EXCLUDE_PARAMS = WHITELISTED_TAGS.map { |tag| "-#{tag}" }.freeze

      attr_reader :logger

      # logger - object responding to `info`, `error` and `debug`
      def initialize(logger: Rails.logger) # rubocop:disable Gitlab/RailsLogger
        @logger = logger
      end

      # Sanitizes every upload whose path ends in .jpg, .jpeg or .tiff
      # (case-insensitive), iterating in batches of 1000.
      #
      # start_id   - optional lower id bound, converted with `to_i`
      # stop_id    - optional upper id bound, converted with `to_i`; when
      #              blank the id of `Upload.last` is used
      # dry_run    - when true (the default) only log what would be removed
      # sleep_time - optional number of seconds to sleep after each upload
      #              that was processed without an error
      # uploader   - optional value to match against the `uploader` column
      # since      - optional exclusive lower bound for `created_at`
      #
      # An error raised while processing one upload is logged and the
      # iteration continues with the next upload.
      #
      # rubocop: disable CodeReuse/ActiveRecord
      def batch_clean(start_id: nil, stop_id: nil, dry_run: true, sleep_time: nil, uploader: nil, since: nil)
        relation = Upload.where('lower(path) like ? or lower(path) like ? or lower(path) like ?',
                                '%.jpg', '%.jpeg', '%.tiff')
        relation = relation.where(uploader: uploader) if uploader
        relation = relation.where('created_at > ?', since) if since

        logger.info "running in dry run mode, no images will be rewritten" if dry_run

        find_params = {
          start: start_id.present? ? start_id.to_i : nil,
          finish: stop_id.present? ? stop_id.to_i : Upload.last&.id,
          batch_size: 1000
        }

        # find_each takes keyword arguments, so the hash has to be splatted:
        # passing it positionally raises ArgumentError on Ruby 3
        relation.find_each(**find_params) do |upload|
          clean(upload.build_uploader, dry_run: dry_run)
          sleep sleep_time if sleep_time
        rescue StandardError => err
          logger.error "failed to sanitize #{upload_ref(upload)}: #{err.message}"
          logger.debug err.backtrace.join("\n ")
        end
      end
      # rubocop: enable CodeReuse/ActiveRecord

      # Downloads the uploader's file into a temporary directory, checks it
      # for tags outside ALLOWED_TAGS and, unless dry_run is set, strips
      # them and stores the rewritten file back through the uploader.
      #
      # uploader - object responding to `filename`, `read`, `upload` and
      #            `store!`
      # dry_run  - when true (the default) the file is only inspected
      #
      # Raises when exiftool fails or the rewritten file looks wrong.
      def clean(uploader, dry_run: true)
        Dir.mktmpdir('gitlab-exif') do |tmpdir|
          src_path = fetch_upload_to_file(uploader, tmpdir)

          to_remove = extra_tags(src_path)

          if to_remove.empty?
            logger.info "#{upload_ref(uploader.upload)}: only whitelisted tags present, skipping"
            break
          end

          logger.info "#{upload_ref(uploader.upload)}: found exif tags to remove: #{to_remove}"

          break if dry_run

          remove_and_store(tmpdir, src_path, uploader)
        end
      end

      # Returns the names of the tags exiftool reports for the file at
      # `path` that are not in ALLOWED_TAGS.
      def extra_tags(path)
        exif_tags(path).keys - ALLOWED_TAGS
      end

      private

      # Strips the tags from src_path in place and hands the result to
      # `uploader.store!`. `_tmpdir` is not used; it is kept so the
      # signature stays as it was.
      def remove_and_store(_tmpdir, src_path, uploader)
        exec_remove_exif!(src_path)
        logger.info "#{upload_ref(uploader.upload)}: exif removed, storing"
        # images are binary data, so open the file in binary mode
        File.open(src_path, 'rb') { |f| uploader.store!(f) }
      end

      # Runs exiftool on `path` to remove the tags in place.
      #
      # Raises RuntimeError when exiftool exits with a non-zero status,
      # when the resulting file is empty, or when it has the same size as
      # the backup copy exiftool leaves next to it.
      def exec_remove_exif!(path)
        # IPTC and XMP-iptcExt groups may keep copyright information so
        # we always preserve them
        cmd = ["exiftool", "-all=", "-tagsFromFile", "@", *EXCLUDE_PARAMS, "--IPTC:all", "--XMP-iptcExt:all", path]
        output, status = Gitlab::Popen.popen(cmd)

        raise "exiftool return code is #{status}: #{output}" if status != 0
        raise "size of file is 0" if File.size(path).zero?

        # exiftool creates backup of the original file in filename_original
        old_path = "#{path}_original"
        raise "size of sanitized file is same as original size" if File.size(path) == File.size(old_path)
      end

      # Writes the uploader's content into `dir` and returns the path of
      # the created file.
      def fetch_upload_to_file(uploader, dir)
        # upload is stored into the file with the original name - this filename
        # is used by carrierwave when storing the file back to the storage.
        # Only the base name is used so the file cannot end up outside `dir`.
        filename = File.join(dir, File.basename(uploader.filename))

        # read first, so a failing download does not leave an empty file
        contents = uploader.read
        File.binwrite(filename, contents)

        filename
      end

      # Short "id:path" label used in log messages.
      def upload_ref(upload)
        "#{upload.id}:#{upload.path}"
      end

      # Returns the Hash of tag name => value that exiftool reports for
      # the file at `path` (IPTC and XMP-iptcExt groups excluded).
      #
      # Raises RuntimeError when exiftool exits with a non-zero status or
      # when its JSON output does not contain a tag Hash for the file.
      def exif_tags(path)
        cmd = ["exiftool", "-all", "-j", "-sort", "--IPTC:all", "--XMP-iptcExt:all", path]
        output, status = Gitlab::Popen.popen(cmd)

        raise "failed to get exif tags: #{output}" if status != 0

        tags = JSON.parse(output).first
        # an empty result would otherwise surface as NoMethodError on nil
        raise "failed to get exif tags: #{output}" unless tags.is_a?(Hash)

        tags
      end
    end
  end
end
|