summaryrefslogtreecommitdiff
path: root/lib/gitlab/sanitizers/exif.rb
blob: bb4e4ce7bbc0a42a7d9990da15132e7b4077b17f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# frozen_string_literal: true

module Gitlab
  module Sanitizers
    # Removes EXIF metadata from uploaded images by shelling out to the
    # `exiftool` command line utility. Tags listed in WHITELISTED_TAGS (and
    # the IPTC / XMP-iptcExt groups) are kept, everything else is stripped.
    class Exif
      # these tags are not removed from the image
      WHITELISTED_TAGS = %w(
        ResolutionUnit
        XResolution
        YResolution
        YCbCrSubSampling
        YCbCrPositioning
        BitsPerSample
        ImageHeight
        ImageWidth
        ImageSize
        Copyright
        CopyrightNotice
        Orientation
      ).freeze

      # these tags are common in exiftool output, these
      # do not contain any sensitive information, but
      # we don't need to preserve them when removing
      # exif tags
      IGNORED_TAGS = %w(
        ColorComponents
        EncodingProcess
        ExifByteOrder
        ExifToolVersion
        JFIFVersion
        Directory
        FileAccessDate
        FileInodeChangeDate
        FileModifyDate
        FileName
        FilePermissions
        FileSize
        SourceFile
        Megapixels
        FileType
        FileTypeExtension
        MIMEType
      ).freeze

      # tags whose presence does not trigger a rewrite of the image
      ALLOWED_TAGS = (WHITELISTED_TAGS + IGNORED_TAGS).freeze

      # "-TagName" arguments handed to exiftool right after "-tagsFromFile @"
      # (see #exec_remove_exif!), one per whitelisted tag
      EXCLUDE_PARAMS = WHITELISTED_TAGS.map { |tag| "-#{tag}" }.freeze

      attr_reader :logger

      # logger - object responding to #info, #error and #debug
      def initialize(logger: Rails.logger) # rubocop:disable Gitlab/RailsLogger
        @logger = logger
      end

      # Sanitizes every Upload whose path ends in .jpg, .jpeg or .tiff
      # (case-insensitive).
      #
      # start_id   - first Upload id to process (optional)
      # stop_id    - last Upload id to process, defaults to the id of
      #              Upload.last
      # dry_run    - when true only log what would be removed
      # sleep_time - value passed to `sleep` after each processed upload
      #              (optional)
      #
      # A failure on a single upload is logged and does not stop the batch.
      #
      # rubocop: disable CodeReuse/ActiveRecord
      def batch_clean(start_id: nil, stop_id: nil, dry_run: true, sleep_time: nil)
        relation = Upload.where('lower(path) like ? or lower(path) like ? or lower(path) like ?',
                                '%.jpg', '%.jpeg', '%.tiff')

        logger.info "running in dry run mode, no images will be rewritten" if dry_run

        find_params = {
          start: start_id.present? ? start_id.to_i : nil,
          finish: stop_id.present? ? stop_id.to_i : Upload.last&.id
        }

        # find_each takes keyword arguments, so the hash has to be
        # double-splatted: Ruby 3 no longer converts a positional hash
        # into keywords implicitly
        relation.find_each(**find_params) do |upload|
          clean(upload.build_uploader, dry_run: dry_run)
          sleep sleep_time if sleep_time
        rescue => err
          logger.error "failed to sanitize #{upload_ref(upload)}: #{err.message}"
          logger.debug err.backtrace.join("\n ")
        end
      end
      # rubocop: enable CodeReuse/ActiveRecord

      # Downloads the uploaded file into a temporary directory, checks
      # which tags outside ALLOWED_TAGS it carries and, unless dry_run is
      # set, strips them and stores the file back through the uploader.
      #
      # uploader - object responding to #filename, #read, #upload and #store!
      # dry_run  - when true the file is only inspected, never rewritten
      def clean(uploader, dry_run: true)
        Dir.mktmpdir('gitlab-exif') do |tmpdir|
          src_path = fetch_upload_to_file(uploader, tmpdir)

          to_remove = extra_tags(src_path)

          if to_remove.empty?
            logger.info "#{upload_ref(uploader.upload)}: only whitelisted tags present, skipping"
            break
          end

          logger.info "#{upload_ref(uploader.upload)}: found exif tags to remove: #{to_remove}"

          # `break` leaves the mktmpdir block early; the temporary
          # directory is still cleaned up by mktmpdir itself
          break if dry_run

          remove_and_store(tmpdir, src_path, uploader)
        end
      end

      # Returns the names of the tags reported by exiftool for the file at
      # `path` that are not part of ALLOWED_TAGS.
      def extra_tags(path)
        exif_tags(path).keys - ALLOWED_TAGS
      end

      private

      # Strips the tags from the file at src_path and hands the result to
      # uploader.store!. `tmpdir` is currently unused; it is kept so the
      # method signature stays unchanged.
      def remove_and_store(tmpdir, src_path, uploader)
        exec_remove_exif!(src_path)
        logger.info "#{upload_ref(uploader.upload)}: exif removed, storing"
        File.open(src_path, 'r') { |f| uploader.store!(f) }
      end

      # Runs exiftool on `path` to rewrite the file in place.
      #
      # Raises RuntimeError when exiftool exits with a non-zero status, when
      # the resulting file is empty, or when its size equals the size of the
      # "<path>_original" backup.
      def exec_remove_exif!(path)
        # IPTC and XMP-iptcExt groups may keep copyright information so
        # we always preserve them
        cmd = ["exiftool", "-all=", "-tagsFromFile", "@", *EXCLUDE_PARAMS, "--IPTC:all", "--XMP-iptcExt:all", path]
        output, status = Gitlab::Popen.popen(cmd)

        raise "exiftool return code is #{status}: #{output}" if status != 0

        new_size = File.size(path)
        raise "size of file is 0" if new_size == 0

        # exiftool creates backup of the original file in filename_original
        old_path = "#{path}_original"
        raise "size of sanitized file is same as original size" if new_size == File.size(old_path)
      end

      # Writes the content of the upload into `dir` and returns the path of
      # the created file.
      def fetch_upload_to_file(uploader, dir)
        # upload is stored into the file with the original name - this filename
        # is used by carrierwave when storing the file back to the storage
        filename = File.join(dir, uploader.filename)

        # read before opening the destination so that a failing read does
        # not leave an empty file behind
        contents = uploader.read

        File.open(filename, 'wb') { |file| file.write(contents) }

        filename
      end

      # Short "id:path" identifier used in log messages.
      def upload_ref(upload)
        "#{upload.id}:#{upload.path}"
      end

      # Returns the first element of exiftool's JSON output for `path`
      # (the IPTC and XMP-iptcExt groups are excluded from the listing).
      #
      # Raises RuntimeError when exiftool exits with a non-zero status.
      def exif_tags(path)
        cmd = ["exiftool", "-all", "-j", "-sort", "--IPTC:all", "--XMP-iptcExt:all", path]
        output, status = Gitlab::Popen.popen(cmd)

        raise "failed to get exif tags: #{output}" if status != 0

        JSON.parse(output).first
      end
    end
  end
end