summaryrefslogtreecommitdiff
path: root/db/post_migrate/20200511083541_cleanup_projects_with_missing_namespace.rb
blob: 9e606b2264b4d1b82e29e59c2c5647d016be00fe (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
# frozen_string_literal: true

# rubocop:disable Migration/PreventStrings

# This migration cleans up Projects that were orphaned when their namespace was deleted
# Instead of deleting them, we:
# - Find (or create) the Ghost User
# - Create (if not already exists) a `lost-and-found` group owned by the Ghost User
# - Find orphaned projects --> namespace_id can not be found in namespaces
# - Move the orphaned projects to the `lost-and-found` group
#   (while making them private and setting `archived=true`)
#
# On GitLab.com (2020-05-11) this migration will update 66 orphaned projects
class CleanupProjectsWithMissingNamespace < ActiveRecord::Migration[6.0]
  include Gitlab::Database::MigrationHelpers

  DOWNTIME = false
  VISIBILITY_PRIVATE = 0
  ACCESS_LEVEL_OWNER = 50

  # The batch size of projects to check in each iteration
  # We expect the selectivity for orphaned projects to be very low:
  #  (66 orphaned projects out of a total 13.6M)
  # so 10K should be a safe choice
  BATCH_SIZE = 10000

  disable_ddl_transaction!

  # Migration-local model mapped to the `user_details` table.
  # It is written to when a user's bio is copied over by
  # User#ensure_bio_is_assigned_to_user_details.
  class UserDetail < ActiveRecord::Base
    self.table_name = 'user_details'

    # The owning record is the migration-local User model, not the application one.
    belongs_to :user, class_name: 'CleanupProjectsWithMissingNamespace::User'
  end

  # Migration-local model for the `users` table.
  #
  # It carries only what this migration needs: looking up (or creating) the
  # ghost user, and looking up (or creating) the `lost-and-found` group that
  # the ghost user owns.
  class User < ActiveRecord::Base
    self.table_name = 'users'

    LOST_AND_FOUND_GROUP = 'lost-and-found'
    USER_TYPE_GHOST = 5
    DEFAULT_PROJECTS_LIMIT = 100_000

    default_value_for :admin, false
    default_value_for :can_create_group, true # we need this to create the group
    default_value_for :can_create_team, false
    default_value_for :project_view, :files
    default_value_for :notified_of_own_activity, false
    default_value_for :preferred_language, I18n.default_locale

    has_one :user_detail, class_name: 'CleanupProjectsWithMissingNamespace::UserDetail'
    has_one :namespace, -> { where(type: nil) },
            foreign_key: :owner_id, inverse_of: :owner, autosave: true,
            class_name: 'CleanupProjectsWithMissingNamespace::Namespace'

    before_save :ensure_namespace_correct
    before_save :ensure_bio_is_assigned_to_user_details, if: :bio_changed?

    enum project_view: { readme: 0, activity: 1, files: 2 }

    # Return (or create if necessary) the ghost user
    def self.ghost
      ghost_scope = where(user_type: USER_TYPE_GHOST)

      unique_internal(ghost_scope, 'ghost', 'ghost%s@example.com') do |ghost_user|
        ghost_user.bio = _('This is a "Ghost User", created to hold all issues authored by users that have since been deleted. This user cannot be removed.')
        ghost_user.name = 'Ghost User'
      end
    end

    # Returns the first record of +scope+, creating one through
    # create_unique_internal when the scope is empty.
    def self.unique_internal(scope, username, email_pattern, &block)
      scope.first || create_unique_internal(scope, username, email_pattern, &block)
    end

    # Creates the internal user while holding an exclusive lease, so that only
    # one such user is ever created per instance even under concurrency.
    #
    # scope         - relation that identifies the internal user
    # username      - preferred username; made unique if already taken
    # email_pattern - sprintf pattern with one `%s` slot for the uniquifier
    # creation_block - forwarded to User.create! to set additional attributes
    #
    # Returns the already existing user when one appeared while waiting for
    # the lease, otherwise the newly created user.
    def self.create_unique_internal(scope, username, email_pattern, &creation_block)
      lease_key = "user:unique_internal:#{username}"
      lease = Gitlab::ExclusiveLease.new(lease_key, timeout: 1.minute.to_i)

      # Poll for the lease once per second to avoid hammering Redis.
      sleep(1) until (uuid = lease.try_obtain)

      # Another process may have created the user between the caller's check
      # and the moment the lease was obtained, so look again (bypassing the
      # query cache).
      already_created = uncached { scope.first }
      return already_created if already_created.present?

      uniquifier = Uniquify.new

      free_username = uniquifier.string(username) do |candidate|
        User.find_by_username(candidate)
      end

      email_builder = ->(suffix) { Kernel.sprintf(email_pattern, suffix) }
      free_email = uniquifier.string(email_builder) do |candidate|
        User.find_by_email(candidate)
      end

      User.create!(
        username: free_username,
        email: free_email,
        user_type: USER_TYPE_GHOST,
        projects_limit: DEFAULT_PROJECTS_LIMIT,
        state: :active,
        &creation_block
      )
    ensure
      # Always give the lease back, whether or not a user was created.
      Gitlab::ExclusiveLease.cancel(lease_key, uuid)
    end

    # before_save hook: builds the personal namespace when there is none,
    # otherwise mirrors a changed username / name onto it.
    def ensure_namespace_correct
      return build_namespace(path: username, name: name) unless namespace

      namespace.path = username if username_changed?
      namespace.name = name if name_changed?
    end

    # before_save hook (only when the bio changed): copies the bio, cut to at
    # most 255 characters, onto the associated user_detail record.
    def ensure_bio_is_assigned_to_user_details
      truncated_bio = bio.to_s[0, 255]
      user_detail.bio = truncated_bio
    end

    # The associated user_detail, built on demand when none is present.
    def user_detail
      current_detail = super
      current_detail.presence || build_user_detail
    end

    # Return (or create if necessary) the `lost-and-found` group
    def lost_and_found_group
      existing_lost_and_found_group || Group.create_unique_group(self, LOST_AND_FOUND_GROUP)
    end

    # Looks for a group whose name starts with LOST_AND_FOUND_GROUP and in
    # which this user is an accepted (not merely requested) owner.
    # There should only be one such group for the ghost user.
    def existing_lost_and_found_group
      name_has_prefix = Group.arel_table[:name].matches("#{LOST_AND_FOUND_GROUP}%")

      groups_owned_by_user =
        Group
          .joins('INNER JOIN members ON namespaces.id = members.source_id')
          .where('namespaces.type = ?', 'Group')
          .where('members.type = ?', 'GroupMember')
          .where('members.source_type = ?', 'Namespace')
          .where('members.user_id = ?', id)
          .where('members.requested_at IS NULL')
          .where('members.access_level = ?', ACCESS_LEVEL_OWNER)

      groups_owned_by_user.find_by(name_has_prefix)
    end
  end

  # Migration-local model mapped to the `namespaces` table.
  # Group (below) inherits from it and therefore shares the same table.
  class Namespace < ActiveRecord::Base
    self.table_name = 'namespaces'

    # Counterpart of User's `has_one :namespace` (which names this as its inverse).
    belongs_to :owner, class_name: 'CleanupProjectsWithMissingNamespace::User'
  end

  # Migration-local model for groups, stored in the `namespaces` table.
  class Group < Namespace
    # Disable STI to allow us to manually set "type = 'Group'"
    # Otherwise rails forces "type = CleanupProjectsWithMissingNamespace::Group"
    self.inheritance_column = :_type_disabled

    # Creates a private top-level group owned by +user+.
    #
    # user       - the user that becomes owner of the new group
    # group_name - preferred name; it is also used as the group's path and is
    #              made unique when already taken
    #
    # Returns the created Group. Raises if either the group or the owner
    # membership cannot be saved; in that case nothing is persisted.
    def self.create_unique_group(user, group_name)
      # 'lost-and-found' may be already defined, find a unique one.
      # The chosen string is written to both `name` and `path`, so a candidate
      # is rejected when a top-level namespace already uses it for either one.
      # (STI is disabled above, so this looks at every row of `namespaces`.)
      group_name = Uniquify.new.string(group_name) do |str|
        Group
          .where(parent_id: nil)
          .where('namespaces.name = :candidate OR namespaces.path = :candidate', candidate: str)
          .exists?
      end

      # The migration runs with disable_ddl_transaction!, so group and owner
      # membership are wrapped in an explicit transaction: if adding the owner
      # fails, no ownerless group is left behind that a re-run would neither
      # find (it has no owner) nor reuse.
      Group.transaction do
        group = Group.create!(
          name: group_name,
          path: group_name,
          type: 'Group',
          description: 'Group to store orphaned projects',
          visibility_level: VISIBILITY_PRIVATE
        )

        # No need to create a route for the lost-and-found group

        GroupMember.add_user(group, user, ACCESS_LEVEL_OWNER)

        group
      end
    end
  end

  # Migration-local model mapped to the `members` table.
  # GroupMember (below) inherits from it and shares the same table.
  class Member < ActiveRecord::Base
    self.table_name = 'members'
  end

  # Migration-local model for group memberships, stored in the `members` table.
  class GroupMember < Member
    NOTIFICATION_SETTING_GLOBAL = 3

    # Disable STI to allow us to manually set "type = 'GroupMember'"
    # Otherwise rails forces "type = CleanupProjectsWithMissingNamespace::GroupMember"
    self.inheritance_column = :_type_disabled

    # Inserts a membership row linking +user+ to +source+ (a namespace row)
    # with the given +access_level+. Raises when the row cannot be saved.
    def self.add_user(source, user, access_level)
      membership_attributes = {
        type: 'GroupMember',
        source_id: source.id,
        user_id: user.id,
        source_type: 'Namespace',
        access_level: access_level,
        notification_level: NOTIFICATION_SETTING_GLOBAL
      }

      GroupMember.create!(membership_attributes)
    end
  end

  # Migration-local model mapped to the `projects` table, iterable in batches.
  class Project < ActiveRecord::Base
    include ::EachBatch

    self.table_name = 'projects'

    # Restricts the relation to projects whose namespace_id does not match
    # the id of any row in `namespaces`.
    def self.without_namespace
      missing_namespace_sql =
        'NOT EXISTS (
          SELECT 1
          FROM namespaces
          WHERE projects.namespace_id = namespaces.id
        )'

      where(missing_namespace_sql)
    end
  end

  # Moves every project whose namespace no longer exists into the
  # `lost-and-found` group owned by the ghost user, archiving it, making it
  # private and suffixing its name and path with the project id.
  def up
    # Reset the column information of all the models that update the database
    # to ensure the Active Record's knowledge of the table structure is current.
    # UserDetail is included because creating the ghost user writes its bio
    # to `user_details`.
    [User, UserDetail, Namespace, Member, Project].each(&:reset_column_information)

    # Find or Create the ghost user
    ghost_user = User.ghost

    # Find or Create the `lost-and-found`
    lost_and_found = ghost_user.lost_and_found_group

    # The SET clause is identical for every batch, so build it once.
    move_to_lost_and_found = <<~SQL
      namespace_id = #{lost_and_found.id},
      archived = TRUE,
      visibility_level = #{VISIBILITY_PRIVATE},

      -- Names are expected to be unique inside their namespace
      --  (uniqueness validation on namespace_id, name)
      -- Attach the id to the name and path to make sure that they are unique
      name = name || '_' || id::text,
      path = path || '_' || id::text
    SQL

    # With BATCH_SIZE=10K and projects.count=13.6M
    # ~1360 iterations will be run:
    # - each requires on average ~160ms for relation.without_namespace
    # - worst case scenario is that 66 of those batches will trigger an update (~200ms each)
    #   In general, we expect less than 5% (=66/13.6M x 10K) to trigger an update
    # Expected total run time: ~235 seconds (== 220 seconds + 14 seconds)
    Project.each_batch(of: BATCH_SIZE) do |relation|
      relation.without_namespace.update_all(move_to_lost_and_found)
    end
  end

  # Intentionally does nothing; this migration is not reversible.
  def down
    # no-op: the original state for those projects was inconsistent
    # Also, the original namespace_id for each project is lost during the update
    # (`up` overwrites namespace_id in place and keeps no record of the old value)
  end
end
# rubocop:enable Migration/PreventStrings