# frozen_string_literal: true

# Background migration that recalculates the `uuid` column of the
# `vulnerability_occurrences` rows within an id range as a UUIDv5 value,
# removing the duplicate findings (and their associated records) it
# encounters along the way.
#
# rubocop: disable Style/Documentation
class Gitlab::BackgroundMigration::RecalculateVulnerabilitiesOccurrencesUuid # rubocop:disable Metrics/ClassLength
  # rubocop: disable Gitlab/NamespacedClass

  # Migration-local model for the `vulnerability_identifiers` table.
  class VulnerabilitiesIdentifier < ActiveRecord::Base
    self.table_name = "vulnerability_identifiers"

    has_many :primary_findings,
             class_name: 'VulnerabilitiesFinding',
             inverse_of: :primary_identifier,
             foreign_key: 'primary_identifier_id'
  end

  # Migration-local model for the `vulnerability_occurrences` table.
  class VulnerabilitiesFinding < ActiveRecord::Base
    include EachBatch
    include ShaAttribute

    self.table_name = "vulnerability_occurrences"

    has_many :signatures,
             foreign_key: 'finding_id',
             class_name: 'VulnerabilityFindingSignature',
             inverse_of: :finding
    belongs_to :primary_identifier,
               class_name: 'VulnerabilitiesIdentifier',
               inverse_of: :primary_findings,
               foreign_key: 'primary_identifier_id'

    REPORT_TYPES = {
      sast: 0,
      dependency_scanning: 1,
      container_scanning: 2,
      dast: 3,
      secret_detection: 4,
      coverage_fuzzing: 5,
      api_fuzzing: 6,
      cluster_image_scanning: 7,
      generic: 99
    }.with_indifferent_access.freeze

    enum report_type: REPORT_TYPES

    sha_attribute :fingerprint
    sha_attribute :location_fingerprint
  end

  # Migration-local model for the `vulnerability_finding_signatures` table.
  class VulnerabilityFindingSignature < ActiveRecord::Base
    include ShaAttribute

    self.table_name = 'vulnerability_finding_signatures'

    belongs_to :finding,
               foreign_key: 'finding_id',
               inverse_of: :signatures,
               class_name: 'VulnerabilitiesFinding'

    sha_attribute :signature_sha
  end

  # Migration-local model for the `vulnerability_occurrence_pipelines` table.
  class VulnerabilitiesFindingPipeline < ActiveRecord::Base
    include EachBatch

    self.table_name = "vulnerability_occurrence_pipelines"
  end

  # Migration-local model for the `vulnerabilities` table.
  class Vulnerability < ActiveRecord::Base
    include EachBatch

    self.table_name = "vulnerabilities"
  end

  # Computes a UUIDv5 for a finding name inside an environment-specific namespace.
  class CalculateFindingUUID
    FINDING_NAMESPACES_IDS = {
      development: "a143e9e2-41b3-47bc-9a19-081d089229f4",
      test: "a143e9e2-41b3-47bc-9a19-081d089229f4",
      staging: "a6930898-a1b2-4365-ab18-12aa474d9b26",
      production: "58dc0f06-936c-43b3-93bb-71693f1b6570"
    }.freeze

    NAMESPACE_REGEX = /(\h{8})-(\h{4})-(\h{4})-(\h{4})-(\h{4})(\h{8})/.freeze
    PACK_PATTERN = "NnnnnN"

    # @param value [String] the name to hash into the namespace
    # @return [String] the UUIDv5 produced by `Digest::UUID.uuid_v5`
    def self.call(value)
      Digest::UUID.uuid_v5(namespace_id, value)
    end

    # Returns the namespace UUID of the current Rails environment packed
    # into its binary representation.
    #
    # @raise [KeyError] when the current Rails environment has no namespace
    def self.namespace_id
      namespace_uuid = FINDING_NAMESPACES_IDS.fetch(Rails.env.to_sym)

      # Digest::UUID is broken when using an UUID in namespace_id
      # https://github.com/rails/rails/issues/37681#issue-520718028
      namespace_uuid.scan(NAMESPACE_REGEX).flatten.map { |s| s.to_i(16) }.pack(PACK_PATTERN)
    end
  end
  # rubocop: enable Gitlab/NamespacedClass

  # Recalculates the UUIDs of the findings whose ids fall in
  # `start_id..end_id`, in batches of 50, and marks the job as succeeded
  # once every batch has been processed.
  #
  # Any StandardError is logged and then handed to
  # `Gitlab::ErrorTracking.track_and_raise_exception`.
  #
  # @param start_id [Integer] first finding id of the range (inclusive)
  # @param end_id [Integer] last finding id of the range (inclusive)
  #
  # rubocop: disable Metrics/AbcSize,Metrics/MethodLength,Metrics/BlockLength
  def perform(start_id, end_id)
    log_info('Migration started', start_id: start_id, end_id: end_id)

    VulnerabilitiesFinding
      .joins(:primary_identifier)
      .includes(:signatures)
      .select(:id, :report_type, :primary_identifier_id, :fingerprint, :location_fingerprint, :project_id, :created_at, :vulnerability_id, :uuid)
      .where(id: start_id..end_id)
      .each_batch(of: 50) do |relation|
        duplicates = find_duplicates(relation)
        remove_findings(ids: duplicates) if duplicates.present?

        to_update = relation.reject { |finding| duplicates.include?(finding.id) }

        begin
          # These are rebuilt from scratch on every `retry`.
          known_uuids = Set.new
          to_be_deleted = []

          mappings = to_update.each_with_object({}) do |finding, hash|
            uuid = calculate_uuid_v5_for_finding(finding)

            # `Set#add?` returns nil when the uuid was already seen in this batch.
            if known_uuids.add?(uuid)
              hash[finding] = { uuid: uuid }
            else
              to_be_deleted << finding.id
            end
          end

          # It is technically still possible to have duplicate uuids
          # if the data integrity is broken somehow and the primary identifiers of
          # the findings are pointing to different projects with the same fingerprint values.
          if to_be_deleted.present?
            log_info('Conflicting UUIDs found within the batch', finding_ids: to_be_deleted)

            remove_findings(ids: to_be_deleted)
          end

          ::Gitlab::Database::BulkUpdate.execute(%i[uuid], mappings) if mappings.present?

          log_info('Recalculation is done', finding_ids: mappings.keys.pluck(:id))
        rescue ActiveRecord::RecordNotUnique => error
          log_info('RecordNotUnique error received')

          # The named group is required: the uuid is read back below via
          # `match_data[:uuid]`.
          match_data = /\(uuid\)=\((?<uuid>\S{36})\)/.match(error.message)

          # This exception returns the **correct** UUIDv5 which probably comes from a later record
          # and it's the one we can drop in the easiest way before retrying the UPDATE query
          if match_data
            uuid = match_data[:uuid]
            log_info('Conflicting UUID found', uuid: uuid)

            id = VulnerabilitiesFinding.find_by(uuid: uuid)&.id
            remove_findings(ids: id) if id
            # NOTE(review): this retry is unbounded; it also retries when no
            # finding with the conflicting uuid could be found — confirm that
            # this cannot loop indefinitely.
            retry
          else
            log_error('Couldnt find conflicting uuid')

            Gitlab::ErrorTracking.track_and_raise_exception(error)
          end
        end
      end

    mark_job_as_succeeded(start_id, end_id)
  rescue StandardError => error
    log_error('An exception happened')

    Gitlab::ErrorTracking.track_and_raise_exception(error)
  end
  # rubocop: enable Metrics/AbcSize,Metrics/MethodLength,Metrics/BlockLength

  private

  # Collects the ids of the findings that share the report type, location
  # fingerprint, primary identifier and project with a record of `relation`.
  #
  # @param relation [Enumerable<VulnerabilitiesFinding>] the batch being processed
  # @return [Array<Integer>] ids of the duplicate findings
  def find_duplicates(relation)
    to_exclude = []

    relation.flat_map do |record|
      # Already-visited ids are excluded from the lookup. Otherwise, assuming
      # we're scanning id 31 and the duplicate is id 40, first we'd process 31
      # and add 40 to the list of ids to remove, then we would process
      # record 40 and add 31 to the list of removals, so we would drop both
      # records.
      to_exclude << record.id

      VulnerabilitiesFinding.where(
        report_type: record.report_type,
        location_fingerprint: record.location_fingerprint,
        primary_identifier_id: record.primary_identifier_id,
        project_id: record.project_id
      ).where.not(id: to_exclude).pluck(:id)
    end
  end

  # Deletes the given findings together with their pipeline rows and the
  # vulnerabilities they reference.
  #
  # @param ids [Integer, Array<Integer>] a single finding id or a list of ids
  def remove_findings(ids:)
    ids = Array(ids)
    log_info('Removing Findings and associated records', ids: ids)

    # The vulnerability ids have to be read before the findings are deleted.
    vulnerability_ids = VulnerabilitiesFinding.where(id: ids).pluck(:vulnerability_id).uniq.compact

    VulnerabilitiesFindingPipeline.where(occurrence_id: ids).each_batch { |batch| batch.delete_all }
    Vulnerability.where(id: vulnerability_ids).each_batch { |batch| batch.delete_all }
    VulnerabilitiesFinding.where(id: ids).delete_all
  end

  # Builds the UUIDv5 of a finding from its report type, fingerprint,
  # location fingerprint and project id, joined with '-'.
  #
  # The `signature_sha` of the signature sorted last by its raw
  # `algorithm_type` value replaces the finding's own location fingerprint
  # when the finding has signatures.
  #
  # @param vulnerability_finding [VulnerabilitiesFinding, nil]
  # @return [String, nil] the UUID, or nil when no finding is given
  def calculate_uuid_v5_for_finding(vulnerability_finding)
    return unless vulnerability_finding

    signatures = vulnerability_finding.signatures.sort_by { |signature| signature.algorithm_type_before_type_cast }
    location_fingerprint = signatures.last&.signature_sha || vulnerability_finding.location_fingerprint

    uuid_v5_name_components = {
      report_type: vulnerability_finding.report_type,
      primary_identifier_fingerprint: vulnerability_finding.fingerprint,
      location_fingerprint: location_fingerprint,
      project_id: vulnerability_finding.project_id
    }

    name = uuid_v5_name_components.values.join('-')

    CalculateFindingUUID.call(name)
  end

  # Logs `message` at info level, tagged with the migrator name.
  def log_info(message, **extra)
    logger.info(migrator: 'RecalculateVulnerabilitiesOccurrencesUuid', message: message, **extra)
  end

  # Logs `message` at error level, tagged with the migrator name.
  def log_error(message, **extra)
    logger.error(migrator: 'RecalculateVulnerabilitiesOccurrencesUuid', message: message, **extra)
  end

  # Memoized background migration logger.
  def logger
    @logger ||= Gitlab::BackgroundMigration::Logger.build
  end

  # Marks the background migration job with the given arguments as succeeded.
  def mark_job_as_succeeded(*arguments)
    Gitlab::Database::BackgroundMigrationJob.mark_all_as_succeeded(
      'RecalculateVulnerabilitiesOccurrencesUuid',
      arguments
    )
  end
end