summaryrefslogtreecommitdiff
path: root/db/post_migrate/20200511083541_cleanup_projects_with_missing_namespace.rb
diff options
context:
space:
mode:
Diffstat (limited to 'db/post_migrate/20200511083541_cleanup_projects_with_missing_namespace.rb')
-rw-r--r--db/post_migrate/20200511083541_cleanup_projects_with_missing_namespace.rb263
1 files changed, 263 insertions, 0 deletions
diff --git a/db/post_migrate/20200511083541_cleanup_projects_with_missing_namespace.rb b/db/post_migrate/20200511083541_cleanup_projects_with_missing_namespace.rb
new file mode 100644
index 00000000000..442acfc6d16
--- /dev/null
+++ b/db/post_migrate/20200511083541_cleanup_projects_with_missing_namespace.rb
@@ -0,0 +1,263 @@
+# frozen_string_literal: true
+
+# rubocop:disable Migration/PreventStrings
+
+# This migration cleans up Projects that were orphaned when their namespace was deleted
+# Instead of deleting them, we:
+# - Find (or create) the Ghost User
+# - Create (if not already exists) a `lost-and-found` group owned by the Ghost User
+# - Find orphaned projects --> namespace_id can not be found in namespaces
+# - Move the orphaned projects to the `lost-and-found` group
+# (while making them private and setting `archived=true`)
+#
+# On GitLab.com (2020-05-11) this migration will update 66 orphaned projects
+class CleanupProjectsWithMissingNamespace < ActiveRecord::Migration[6.0]
+ include Gitlab::Database::MigrationHelpers
+
+ DOWNTIME = false
+ VISIBILITY_PRIVATE = 0
+ ACCESS_LEVEL_OWNER = 50
+
+ # The batch size of projects to check in each iteration
+ # We expect the selectivity for orphaned projects to be very low:
+ # (66 orphaned projects out of a total 13.6M)
+ # so 10K should be a safe choice
+ BATCH_SIZE = 10000
+
+ disable_ddl_transaction!
+
+ class UserDetail < ActiveRecord::Base
+ self.table_name = 'user_details'
+
+ belongs_to :user, class_name: 'CleanupProjectsWithMissingNamespace::User'
+ end
+
+ class User < ActiveRecord::Base
+ self.table_name = 'users'
+
+ LOST_AND_FOUND_GROUP = 'lost-and-found'
+ USER_TYPE_GHOST = 5
+ DEFAULT_PROJECTS_LIMIT = 100000
+
+ default_value_for :admin, false
+ default_value_for :can_create_group, true # we need this to create the group
+ default_value_for :can_create_team, false
+ default_value_for :project_view, :files
+ default_value_for :notified_of_own_activity, false
+ default_value_for :preferred_language, I18n.default_locale
+
+ has_one :user_detail, class_name: 'CleanupProjectsWithMissingNamespace::UserDetail'
+ has_one :namespace, -> { where(type: nil) },
+ foreign_key: :owner_id, inverse_of: :owner, autosave: true,
+ class_name: 'CleanupProjectsWithMissingNamespace::Namespace'
+
+ before_save :ensure_namespace_correct
+ before_save :ensure_bio_is_assigned_to_user_details, if: :bio_changed?
+
+ enum project_view: { readme: 0, activity: 1, files: 2 }
+
+ def ensure_namespace_correct
+ if namespace
+ namespace.path = username if username_changed?
+ namespace.name = name if name_changed?
+ else
+ build_namespace(path: username, name: name)
+ end
+ end
+
+ def ensure_bio_is_assigned_to_user_details
+ return if Feature.disabled?(:migrate_bio_to_user_details, default_enabled: true)
+
+ user_detail.bio = bio.to_s[0...255]
+ end
+
+ def user_detail
+ super.presence || build_user_detail
+ end
+
+ # Return (or create if necessary) the `lost-and-found` group
+ def lost_and_found_group
+ existing_lost_and_found_group || Group.create_unique_group(self, LOST_AND_FOUND_GROUP)
+ end
+
+ def existing_lost_and_found_group
+ # There should only be one Group for User Ghost starting with LOST_AND_FOUND_GROUP
+ Group
+ .joins('INNER JOIN members ON namespaces.id = members.source_id')
+ .where('namespaces.type = ?', 'Group')
+ .where('members.type = ?', 'GroupMember')
+ .where('members.source_type = ?', 'Namespace')
+ .where('members.user_id = ?', self.id)
+ .where('members.requested_at IS NULL')
+ .where('members.access_level = ?', ACCESS_LEVEL_OWNER)
+ .find_by(Group.arel_table[:name].matches("#{LOST_AND_FOUND_GROUP}%"))
+ end
+
+ class << self
+ # Return (or create if necessary) the ghost user
+ def ghost
+ email = 'ghost%s@example.com'
+
+ unique_internal(where(user_type: USER_TYPE_GHOST), 'ghost', email) do |u|
+ u.bio = _('This is a "Ghost User", created to hold all issues authored by users that have since been deleted. This user cannot be removed.')
+ u.name = 'Ghost User'
+ end
+ end
+
+ def unique_internal(scope, username, email_pattern, &block)
+ scope.first || create_unique_internal(scope, username, email_pattern, &block)
+ end
+
+ def create_unique_internal(scope, username, email_pattern, &creation_block)
+ # Since we only want a single one of these in an instance, we use an
+ # exclusive lease to ensure that this block is never run concurrently.
+ lease_key = "user:unique_internal:#{username}"
+ lease = Gitlab::ExclusiveLease.new(lease_key, timeout: 1.minute.to_i)
+
+ until uuid = lease.try_obtain
+ # Keep trying until we obtain the lease. To prevent hammering Redis too
+ # much we'll wait for a bit between retries.
+ sleep(1)
+ end
+
+ # Recheck if the user is already present. One might have been
+ # added between the time we last checked (first line of this method)
+ # and the time we acquired the lock.
+ existing_user = uncached { scope.first }
+ return existing_user if existing_user.present?
+
+ uniquify = Uniquify.new
+
+ username = uniquify.string(username) { |s| User.find_by_username(s) }
+
+ email = uniquify.string(-> (n) { Kernel.sprintf(email_pattern, n) }) do |s|
+ User.find_by_email(s)
+ end
+
+ User.create!(
+ username: username,
+ email: email,
+ user_type: USER_TYPE_GHOST,
+ projects_limit: DEFAULT_PROJECTS_LIMIT,
+ state: :active,
+ &creation_block
+ )
+ ensure
+ Gitlab::ExclusiveLease.cancel(lease_key, uuid)
+ end
+ end
+ end
+
+ class Namespace < ActiveRecord::Base
+ self.table_name = 'namespaces'
+
+ belongs_to :owner, class_name: 'CleanupProjectsWithMissingNamespace::User'
+ end
+
+ class Group < Namespace
+ # Disable STI to allow us to manually set "type = 'Group'"
+ # Otherwise rails forces "type = CleanupProjectsWithMissingNamespace::Group"
+ self.inheritance_column = :_type_disabled
+
+ def self.create_unique_group(user, group_name)
+ # 'lost-and-found' may be already defined, find a unique one
+ group_name = Uniquify.new.string(group_name) do |str|
+ Group.where(parent_id: nil, name: str).exists?
+ end
+
+ group = Group.create!(
+ name: group_name,
+ path: group_name,
+ type: 'Group',
+ description: 'Group to store orphaned projects',
+ visibility_level: VISIBILITY_PRIVATE
+ )
+
+ # No need to create a route for the lost-and-found group
+
+ GroupMember.add_user(group, user, ACCESS_LEVEL_OWNER)
+
+ group
+ end
+ end
+
+ class Member < ActiveRecord::Base
+ self.table_name = 'members'
+ end
+
+ class GroupMember < Member
+ NOTIFICATION_SETTING_GLOBAL = 3
+
+ # Disable STI to allow us to manually set "type = 'GroupMember'"
+ # Otherwise rails forces "type = CleanupProjectsWithMissingNamespace::GroupMember"
+ self.inheritance_column = :_type_disabled
+
+ def self.add_user(source, user, access_level)
+ GroupMember.create!(
+ type: 'GroupMember',
+ source_id: source.id,
+ user_id: user.id,
+ source_type: 'Namespace',
+ access_level: access_level,
+ notification_level: NOTIFICATION_SETTING_GLOBAL
+ )
+ end
+ end
+
+ class Project < ActiveRecord::Base
+ self.table_name = 'projects'
+
+ include ::EachBatch
+
+ def self.without_namespace
+ where(
+ 'NOT EXISTS (
+ SELECT 1
+ FROM namespaces
+ WHERE projects.namespace_id = namespaces.id
+ )'
+ )
+ end
+ end
+
+ def up
+ # Reset the column information of all the models that update the database
+ # to ensure the Active Record's knowledge of the table structure is current
+ User.reset_column_information
+ Namespace.reset_column_information
+ Member.reset_column_information
+ Project.reset_column_information
+
+ # Find or Create the ghost user
+ ghost_user = User.ghost
+
+ # Find or Create the `lost-and-found`
+ lost_and_found = ghost_user.lost_and_found_group
+
+ # With BATCH_SIZE=10K and projects.count=13.6M
+ # ~1360 iterations will be run:
+ # - each requires on average ~160ms for relation.without_namespace
+ # - worst case scenario is that 66 of those batches will trigger an update (~200ms each)
+ # In general, we expect less than 5% (=66/13.6M x 10K) to trigger an update
+ # Expected total run time: ~235 seconds (== 220 seconds + 14 seconds)
+ Project.each_batch(of: BATCH_SIZE) do |relation|
+ relation.without_namespace.update_all <<~SQL
+ namespace_id = #{lost_and_found.id},
+ archived = TRUE,
+ visibility_level = #{VISIBILITY_PRIVATE},
+
+ -- Names are expected to be unique inside their namespace
+ -- (uniqueness validation on namespace_id, name)
+ -- Attach the id to the name and path to make sure that they are unique
+ name = name || '_' || id,
+ path = path || '_' || id
+ SQL
+ end
+ end
+
+ def down
+ # no-op: the original state for those projects was inconsistent
+ # Also, the original namespace_id for each project is lost during the update
+ end
+end
+# rubocop:enable Migration/PreventStrings