1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
|
# frozen_string_literal: true
class RemoveDuplicateLabelsFromProject < ActiveRecord::Migration[6.0]
DOWNTIME = false
CREATE = 1
RENAME = 2
disable_ddl_transaction!
class BackupLabel < Label
self.table_name = 'backup_labels'
end
class Label < ApplicationRecord
self.table_name = 'labels'
end
class Project < ApplicationRecord
include EachBatch
self.table_name = 'projects'
end
BATCH_SIZE = 100_000
def up
# Split to smaller chunks
# Loop rather than background job, every 100,000
# there are 45,000,000 projects in total
Project.each_batch(of: BATCH_SIZE) do |batch|
range = batch.pluck('MIN(id)', 'MAX(id)').first
transaction do
remove_full_duplicates(*range)
end
transaction do
rename_partial_duplicates(*range)
end
end
end
def down
Project.each_batch(of: BATCH_SIZE) do |batch|
range = batch.pluck('MIN(id)', 'MAX(id)').first
restore_renamed_labels(*range)
restore_deleted_labels(*range)
end
end
def remove_full_duplicates(start_id, stop_id)
# Fields that are considered duplicate:
# project_id title template description type color
duplicate_labels = ApplicationRecord.connection.execute(<<-SQL.squish)
WITH data AS #{Gitlab::Database::AsWithMaterialized.materialized_if_supported} (
SELECT labels.*,
row_number() OVER (PARTITION BY labels.project_id, labels.title, labels.template, labels.description, labels.type, labels.color ORDER BY labels.id) AS row_number,
#{CREATE} AS restore_action
FROM labels
WHERE labels.project_id BETWEEN #{start_id} AND #{stop_id}
AND NOT EXISTS (SELECT * FROM board_labels WHERE board_labels.label_id = labels.id)
AND NOT EXISTS (SELECT * FROM label_links WHERE label_links.label_id = labels.id)
AND NOT EXISTS (SELECT * FROM label_priorities WHERE label_priorities.label_id = labels.id)
AND NOT EXISTS (SELECT * FROM lists WHERE lists.label_id = labels.id)
AND NOT EXISTS (SELECT * FROM resource_label_events WHERE resource_label_events.label_id = labels.id)
) SELECT * FROM data WHERE row_number > 1;
SQL
if duplicate_labels.any?
# create backup records
BackupLabel.insert_all!(duplicate_labels.map { |label| label.except("row_number") })
Label.where(id: duplicate_labels.pluck("id")).delete_all
end
end
def rename_partial_duplicates(start_id, stop_id)
# We need to ensure that the new title (with `_duplicate#{ID}`) doesn't exceed the limit.
# Truncate the original title (if needed) to 245 characters minus the length of the ID
# then add `_duplicate#{ID}`
soft_duplicates = ApplicationRecord.connection.execute(<<-SQL.squish)
WITH data AS #{Gitlab::Database::AsWithMaterialized.materialized_if_supported} (
SELECT
*,
substring(title from 1 for 245 - length(id::text)) || '_duplicate' || id::text as new_title,
#{RENAME} AS restore_action,
row_number() OVER (PARTITION BY project_id, title ORDER BY id) AS row_number
FROM labels
WHERE project_id BETWEEN #{start_id} AND #{stop_id}
) SELECT * FROM data WHERE row_number > 1;
SQL
if soft_duplicates.any?
# create backup records
BackupLabel.insert_all!(soft_duplicates.map { |label| label.except("row_number") })
ApplicationRecord.connection.execute(<<-SQL.squish)
UPDATE labels SET title = substring(title from 1 for 245 - length(id::text)) || '_duplicate' || id::text
WHERE labels.id IN (#{soft_duplicates.map { |dup| dup["id"] }.join(", ")});
SQL
end
end
def restore_renamed_labels(start_id, stop_id)
# the backup label IDs are not incremental, they are copied directly from the Labels table
ApplicationRecord.connection.execute(<<-SQL.squish)
WITH backups AS #{Gitlab::Database::AsWithMaterialized.materialized_if_supported} (
SELECT id, title
FROM backup_labels
WHERE project_id BETWEEN #{start_id} AND #{stop_id} AND
restore_action = #{RENAME}
) UPDATE labels SET title = backups.title
FROM backups
WHERE labels.id = backups.id;
SQL
end
def restore_deleted_labels(start_id, stop_id)
ActiveRecord::Base.connection.execute(<<-SQL.squish)
INSERT INTO labels
SELECT id, title, color, project_id, created_at, updated_at, template, description, description_html, type, group_id, cached_markdown_version FROM backup_labels
WHERE backup_labels.project_id BETWEEN #{start_id} AND #{stop_id}
AND backup_labels.restore_action = #{CREATE}
SQL
end
end
|