summaryrefslogtreecommitdiff
path: root/lib/gitlab/background_migration/recalculate_vulnerabilities_occurrences_uuid.rb
blob: 9a42d035285ab4fa69264408e83b6cf63f5cd1bb (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
# frozen_string_literal: true

# rubocop: disable Style/Documentation
class Gitlab::BackgroundMigration::RecalculateVulnerabilitiesOccurrencesUuid # rubocop:disable Metrics/ClassLength
  # rubocop: disable Gitlab/NamespacedClass
  # Migration-local model mapped to the `vulnerability_identifiers` table.
  class VulnerabilitiesIdentifier < ActiveRecord::Base
    self.table_name = 'vulnerability_identifiers'

    # Findings that point at this identifier through their `primary_identifier_id` column.
    has_many :primary_findings,
      class_name: 'VulnerabilitiesFinding',
      foreign_key: 'primary_identifier_id',
      inverse_of: :primary_identifier
  end

  # Migration-local model mapped to the `vulnerability_occurrences` table.
  class VulnerabilitiesFinding < ActiveRecord::Base
    include EachBatch
    include ShaAttribute

    # Integer values backing the `report_type` enum.
    REPORT_TYPES = {
      sast: 0,
      dependency_scanning: 1,
      container_scanning: 2,
      dast: 3,
      secret_detection: 4,
      coverage_fuzzing: 5,
      api_fuzzing: 6,
      cluster_image_scanning: 7,
      generic: 99
    }.with_indifferent_access.freeze

    self.table_name = 'vulnerability_occurrences'

    has_many :signatures,
      class_name: 'VulnerabilityFindingSignature',
      foreign_key: 'finding_id',
      inverse_of: :finding

    belongs_to :primary_identifier,
      class_name: 'VulnerabilitiesIdentifier',
      foreign_key: 'primary_identifier_id',
      inverse_of: :primary_findings

    enum report_type: REPORT_TYPES

    # NOTE(review): `fingerprint` is presumably not a column of this table but the value
    # selected from the joined identifier row in #perform - confirm against the schema.
    sha_attribute :fingerprint
    sha_attribute :location_fingerprint
  end

  # Migration-local model mapped to the `vulnerability_finding_signatures` table.
  class VulnerabilityFindingSignature < ActiveRecord::Base
    include ShaAttribute

    self.table_name = 'vulnerability_finding_signatures'

    # The finding this signature was recorded for (via `finding_id`).
    belongs_to :finding,
      class_name: 'VulnerabilitiesFinding',
      foreign_key: 'finding_id',
      inverse_of: :signatures

    sha_attribute :signature_sha
  end

  # Migration-local model mapped to the `vulnerability_occurrence_pipelines` table.
  # Only used here to delete the rows that reference removed findings.
  class VulnerabilitiesFindingPipeline < ActiveRecord::Base
    include EachBatch

    self.table_name = 'vulnerability_occurrence_pipelines'
  end

  # Migration-local model mapped to the `vulnerabilities` table.
  # Only used here to delete the vulnerabilities of removed findings.
  class Vulnerability < ActiveRecord::Base
    include EachBatch

    self.table_name = 'vulnerabilities'
  end

  # Builds the version 5 UUID of a finding from its name string, using a namespace
  # UUID chosen by the current Rails environment.
  class CalculateFindingUUID
    # Namespace UUID per Rails environment.
    FINDING_NAMESPACES_IDS = {
      development: "a143e9e2-41b3-47bc-9a19-081d089229f4",
      test: "a143e9e2-41b3-47bc-9a19-081d089229f4",
      staging: "a6930898-a1b2-4365-ab18-12aa474d9b26",
      production: "58dc0f06-936c-43b3-93bb-71693f1b6570"
    }.freeze

    # Splits a textual UUID into six hex groups of 8, 4, 4, 4, 4 and 8 digits ...
    NAMESPACE_REGEX = /(\h{8})-(\h{4})-(\h{4})-(\h{4})-(\h{4})(\h{8})/.freeze
    # ... which are packed big-endian as 32/16/16/16/16/32-bit unsigned integers (16 bytes).
    PACK_PATTERN = "NnnnnN"

    class << self
      # Returns the UUIDv5 for +value+ within the environment's namespace.
      def call(value)
        Digest::UUID.uuid_v5(namespace_id, value)
      end

      # Returns the environment's namespace UUID as a 16 byte binary string.
      # Raises KeyError when the current environment has no namespace configured.
      def namespace_id
        textual_uuid = FINDING_NAMESPACES_IDS.fetch(Rails.env.to_sym)

        # Digest::UUID is broken when using an UUID in namespace_id
        # https://github.com/rails/rails/issues/37681#issue-520718028
        hex_groups = textual_uuid.scan(NAMESPACE_REGEX).flatten
        hex_groups.map { |group| group.hex }.pack(PACK_PATTERN)
      end
    end
  end
  # rubocop: enable Gitlab/NamespacedClass

  # rubocop: disable Metrics/AbcSize,Metrics/MethodLength,Metrics/BlockLength
  # Recalculates the UUID of every finding with an id in `start_id..end_id` and marks the
  # job as succeeded once the whole range has been processed.
  #
  # Work is done in batches of 50 findings. For each batch:
  #
  # 1. Rows sharing report type, location fingerprint, primary identifier and project with
  #    a finding of the batch are deleted (see #find_duplicates and #remove_findings).
  # 2. A UUIDv5 is calculated for each remaining finding; a finding whose UUID was already
  #    produced by an earlier finding of the same batch is deleted.
  # 3. The new UUIDs are written with one bulk update. When that fails with
  #    ActiveRecord::RecordNotUnique, the row currently holding the conflicting UUID is
  #    deleted and the batch is retried - at most once per finding of the batch, so an
  #    unresolvable conflict fails the job instead of looping forever.
  #
  # start_id - lower bound (inclusive) of the finding ids to process.
  # end_id   - upper bound (inclusive) of the finding ids to process.
  #
  # Any StandardError is logged and handed to Gitlab::ErrorTracking.track_and_raise_exception.
  def perform(start_id, end_id)
    log_info('Migration started', start_id: start_id, end_id: end_id)

    # NOTE(review): `:fingerprint` is presumably resolved from the joined identifier row
    # (it feeds `primary_identifier_fingerprint` in the UUID name) - confirm against the schema.
    VulnerabilitiesFinding
      .joins(:primary_identifier)
      .includes(:signatures)
      .select(:id, :report_type, :primary_identifier_id, :fingerprint, :location_fingerprint, :project_id, :created_at, :vulnerability_id, :uuid)
      .where(id: start_id..end_id)
      .each_batch(of: 50) do |relation|
      duplicates = find_duplicates(relation)
      remove_findings(ids: duplicates) if duplicates.present?

      to_update = relation.reject { |finding| duplicates.include?(finding.id) }

      # Each retry below deals with the single row holding one of the target UUIDs, so a
      # batch legitimately needs at most one retry per finding. Both counters live outside
      # the `begin` block because `retry` re-runs that block from its start.
      max_conflict_retries = to_update.size
      conflict_retries = 0

      begin
        known_uuids = Set.new
        to_be_deleted = []

        mappings = to_update.each_with_object({}) do |finding, hash|
          uuid = calculate_uuid_v5_for_finding(finding)

          if known_uuids.add?(uuid)
            hash[finding] = { uuid: uuid }
          else
            to_be_deleted << finding.id
          end
        end

        # It is technically still possible to have duplicate uuids
        # if the data integrity is broken somehow and the primary identifiers of
        # the findings are pointing to different projects with the same fingerprint values.
        if to_be_deleted.present?
          log_info('Conflicting UUIDs found within the batch', finding_ids: to_be_deleted)

          remove_findings(ids: to_be_deleted)
        end

        ::Gitlab::Database::BulkUpdate.execute(%i[uuid], mappings) if mappings.present?

        log_info('Recalculation is done', finding_ids: mappings.keys.pluck(:id))
      rescue ActiveRecord::RecordNotUnique => error
        log_info('RecordNotUnique error received')

        # The conflicting value is parsed out of the database error message.
        match_data = /\(uuid\)=\((?<uuid>\S{36})\)/.match(error.message)

        # This exception returns the **correct** UUIDv5 which probably comes from a later record
        # and it's the one we can drop in the easiest way before retrying the UPDATE query
        if match_data && conflict_retries < max_conflict_retries
          conflict_retries += 1

          uuid = match_data[:uuid]
          log_info('Conflicting UUID found', uuid: uuid)

          id = VulnerabilitiesFinding.find_by(uuid: uuid)&.id
          remove_findings(ids: id) if id
          retry
        elsif match_data
          # Retrying again would not make progress; fail the job instead of spinning.
          log_error('Giving up after too many UUID conflicts', uuid: match_data[:uuid], retries: conflict_retries)

          Gitlab::ErrorTracking.track_and_raise_exception(error)
        else
          log_error('Couldnt find conflicting uuid')

          Gitlab::ErrorTracking.track_and_raise_exception(error)
        end
      end
    end

    mark_job_as_succeeded(start_id, end_id)
  rescue StandardError => error
    log_error('An exception happened')

    Gitlab::ErrorTracking.track_and_raise_exception(error)
  end
  # rubocop: enable Metrics/AbcSize,Metrics/MethodLength,Metrics/BlockLength

  private

  # Returns the ids of findings that duplicate a record of +relation+, i.e. rows with the
  # same report type, location fingerprint, primary identifier and project.
  #
  # One lookup is issued per record. An id can appear more than once in the result when
  # several records of +relation+ share the same duplicate.
  def find_duplicates(relation)
    visited_ids = []

    relation.flat_map do |finding|
      # Records are visited in order and every visited id is excluded from later lookups.
      # Without this, for a pair such as 31 and 40, processing 31 would report 40 and
      # processing 40 would report 31 - and both copies would end up being removed.
      visited_ids << finding.id

      matching_attributes = {
        report_type: finding.report_type,
        location_fingerprint: finding.location_fingerprint,
        primary_identifier_id: finding.primary_identifier_id,
        project_id: finding.project_id
      }

      VulnerabilitiesFinding
        .where(matching_attributes)
        .where.not(id: visited_ids)
        .pluck(:id)
    end
  end

  # Deletes the findings with the given ids together with the rows depending on them:
  # their pipeline links, then the vulnerabilities they reference, then the findings.
  #
  # ids - a single finding id or a collection of them.
  def remove_findings(ids:)
    ids = Array(ids)
    log_info('Removing Findings and associated records', ids: ids)

    # Collected before anything is deleted; findings without a vulnerability yield nil.
    vulnerability_ids = VulnerabilitiesFinding.where(id: ids).pluck(:vulnerability_id).uniq.compact

    # Explicit one-argument blocks are kept on purpose: `each_batch` may yield more than
    # the relation, which a `&:delete_all` shorthand would forward as arguments.
    VulnerabilitiesFindingPipeline.where(occurrence_id: ids).each_batch do |pipeline_links|
      pipeline_links.delete_all
    end

    Vulnerability.where(id: vulnerability_ids).each_batch do |vulnerabilities|
      vulnerabilities.delete_all
    end

    VulnerabilitiesFinding.where(id: ids).delete_all
  end

  # Calculates the UUIDv5 of a finding from the name
  # "<report_type>-<fingerprint>-<location fingerprint>-<project_id>".
  #
  # When the finding has signatures, the `signature_sha` of the one sorting last by raw
  # `algorithm_type` replaces the finding's own location fingerprint.
  #
  # Returns nil when no finding is given.
  def calculate_uuid_v5_for_finding(vulnerability_finding)
    return unless vulnerability_finding

    ordered_signatures = vulnerability_finding.signatures.sort_by(&:algorithm_type_before_type_cast)
    signature_sha = ordered_signatures.last&.signature_sha

    name = [
      vulnerability_finding.report_type,
      vulnerability_finding.fingerprint,
      signature_sha || vulnerability_finding.location_fingerprint,
      vulnerability_finding.project_id
    ].join('-')

    CalculateFindingUUID.call(name)
  end

  # Logs +message+ at info level, tagged with this migration's name.
  # Any +extra+ key/value pairs are added to the log entry.
  def log_info(message, **extra)
    entry = { migrator: 'RecalculateVulnerabilitiesOccurrencesUuid', message: message, **extra }

    logger.info(**entry)
  end

  # Logs +message+ at error level, tagged with this migration's name.
  # Any +extra+ key/value pairs are added to the log entry.
  def log_error(message, **extra)
    entry = { migrator: 'RecalculateVulnerabilitiesOccurrencesUuid', message: message, **extra }

    logger.error(**entry)
  end

  # Returns the background migration logger, building it on first use and
  # reusing the same instance afterwards.
  def logger
    return @logger if @logger

    @logger = Gitlab::BackgroundMigration::Logger.build
  end

  # Marks the tracked background migration job(s) recorded for this migration with
  # exactly these +arguments+ as succeeded.
  def mark_job_as_succeeded(*arguments)
    migration_name = 'RecalculateVulnerabilitiesOccurrencesUuid'

    Gitlab::Database::BackgroundMigrationJob.mark_all_as_succeeded(migration_name, arguments)
  end
end