summary refs log tree commit diff
path: root/lib/gitlab/background_migration/remove_duplicate_vulnerabilities_findings.rb
diff options
context:
space:
mode:
Diffstat (limited to 'lib/gitlab/background_migration/remove_duplicate_vulnerabilities_findings.rb')
-rw-r--r-- lib/gitlab/background_migration/remove_duplicate_vulnerabilities_findings.rb 50
1 files changed, 50 insertions, 0 deletions
diff --git a/lib/gitlab/background_migration/remove_duplicate_vulnerabilities_findings.rb b/lib/gitlab/background_migration/remove_duplicate_vulnerabilities_findings.rb
new file mode 100644
index 00000000000..ca61118a06c
--- /dev/null
+++ b/lib/gitlab/background_migration/remove_duplicate_vulnerabilities_findings.rb
@@ -0,0 +1,50 @@
+# frozen_string_literal: true
+
+# rubocop: disable Style/Documentation
+class Gitlab::BackgroundMigration::RemoveDuplicateVulnerabilitiesFindings
+ DELETE_BATCH_SIZE = 100
+
+ # rubocop:disable Gitlab/NamespacedClass
+ class VulnerabilitiesFinding < ActiveRecord::Base
+ self.table_name = "vulnerability_occurrences"
+ end
+ # rubocop:enable Gitlab/NamespacedClass
+
+ def perform(start_id, end_id)
+ batch = VulnerabilitiesFinding.where(id: start_id..end_id)
+
+ cte = Gitlab::SQL::CTE.new(:batch, batch.select(:report_type, :location_fingerprint, :primary_identifier_id, :project_id))
+
+ query = VulnerabilitiesFinding
+ .select('batch.report_type', 'batch.location_fingerprint', 'batch.primary_identifier_id', 'batch.project_id', 'array_agg(id) as ids')
+ .distinct
+ .with(cte.to_arel)
+ .from(cte.alias_to(Arel.sql('batch')))
+ .joins(
+ %(
+ INNER JOIN
+ vulnerability_occurrences ON
+ vulnerability_occurrences.report_type = batch.report_type AND
+ vulnerability_occurrences.location_fingerprint = batch.location_fingerprint AND
+ vulnerability_occurrences.primary_identifier_id = batch.primary_identifier_id AND
+ vulnerability_occurrences.project_id = batch.project_id
+ )).group('batch.report_type', 'batch.location_fingerprint', 'batch.primary_identifier_id', 'batch.project_id')
+ .having('COUNT(*) > 1')
+
+ ids_to_delete = []
+
+ query.to_a.each do |record|
+ # We want to keep the latest finding since it might have recent metadata
+ duplicate_ids = record.ids.uniq.sort
+ duplicate_ids.pop
+ ids_to_delete.concat(duplicate_ids)
+
+ if ids_to_delete.size == DELETE_BATCH_SIZE
+ VulnerabilitiesFinding.where(id: ids_to_delete).delete_all
+ ids_to_delete.clear
+ end
+ end
+
+ VulnerabilitiesFinding.where(id: ids_to_delete).delete_all if ids_to_delete.any?
+ end
+end