diff options
Diffstat (limited to 'lib/gitlab/background_migration')
14 files changed, 461 insertions, 20 deletions
diff --git a/lib/gitlab/background_migration/backfill_issue_search_data.rb b/lib/gitlab/background_migration/backfill_issue_search_data.rb new file mode 100644 index 00000000000..ec206cbfd41 --- /dev/null +++ b/lib/gitlab/background_migration/backfill_issue_search_data.rb @@ -0,0 +1,63 @@ +# frozen_string_literal: true +# rubocop:disable Style/Documentation + +module Gitlab + module BackgroundMigration + # Backfills the new `issue_search_data` table, which contains + # the tsvector from the issue title and description. + class BackfillIssueSearchData + include Gitlab::Database::DynamicModelHelpers + + def perform(start_id, stop_id, batch_table, batch_column, sub_batch_size, pause_ms) + define_batchable_model(batch_table, connection: ActiveRecord::Base.connection).where(batch_column => start_id..stop_id).each_batch(of: sub_batch_size) do |sub_batch| + update_search_data(sub_batch) + + sleep(pause_ms * 0.001) + rescue ActiveRecord::StatementInvalid => e + raise unless e.cause.is_a?(PG::ProgramLimitExceeded) && e.message.include?('string is too long for tsvector') + + update_search_data_individually(sub_batch, pause_ms) + end + end + + private + + def update_search_data(relation) + relation.klass.connection.execute( + <<~SQL + INSERT INTO issue_search_data (project_id, issue_id, search_vector, created_at, updated_at) + SELECT + project_id, + id, + setweight(to_tsvector('english', LEFT(title, 255)), 'A') || setweight(to_tsvector('english', LEFT(REGEXP_REPLACE(description, '[A-Za-z0-9+/@]{50,}', ' ', 'g'), 1048576)), 'B'), + NOW(), + NOW() + FROM issues + WHERE issues.id IN (#{relation.select(:id).to_sql}) + ON CONFLICT DO NOTHING + SQL + ) + end + + def update_search_data_individually(relation, pause_ms) + relation.pluck(:id).each do |issue_id| + update_search_data(relation.klass.where(id: issue_id)) + + sleep(pause_ms * 0.001) + rescue ActiveRecord::StatementInvalid => e + raise unless e.cause.is_a?(PG::ProgramLimitExceeded) && e.message.include?('string is too long for tsvector') + + logger.error( + message: 'Error updating search data: string is too long for tsvector', + class: relation.klass.name, + model_id: issue_id + ) + end + end + + def logger + @logger ||= Gitlab::BackgroundMigration::Logger.build + end + end + end +end diff --git a/lib/gitlab/background_migration/backfill_jira_tracker_deployment_type2.rb b/lib/gitlab/background_migration/backfill_jira_tracker_deployment_type2.rb index 61145f6a445..669e5338dd1 100644 --- a/lib/gitlab/background_migration/backfill_jira_tracker_deployment_type2.rb +++ b/lib/gitlab/background_migration/backfill_jira_tracker_deployment_type2.rb @@ -79,7 +79,7 @@ module Gitlab end def mark_jobs_as_succeeded(*arguments) - Gitlab::Database::BackgroundMigrationJob.mark_all_as_succeeded(self.class.name, arguments) + Gitlab::Database::BackgroundMigrationJob.mark_all_as_succeeded(self.class.name.demodulize, arguments) end end end diff --git a/lib/gitlab/background_migration/backfill_member_namespace_for_group_members.rb b/lib/gitlab/background_migration/backfill_member_namespace_for_group_members.rb new file mode 100644 index 00000000000..1ed147d67c7 --- /dev/null +++ b/lib/gitlab/background_migration/backfill_member_namespace_for_group_members.rb @@ -0,0 +1,38 @@ +# frozen_string_literal: true + +module Gitlab + module BackgroundMigration + # Backfills the `members.member_namespace_id` column for `type=GroupMember` + class BackfillMemberNamespaceForGroupMembers + include Gitlab::Database::DynamicModelHelpers + + def perform(start_id, end_id, batch_table, batch_column, sub_batch_size, pause_ms) + parent_batch_relation = relation_scoped_to_range(batch_table, batch_column, start_id, end_id) + + parent_batch_relation.each_batch(column: batch_column, of: sub_batch_size) do |sub_batch| + batch_metrics.time_operation(:update_all) do + sub_batch.update_all('member_namespace_id=source_id') + end + + pause_ms = [0, pause_ms].max + sleep(pause_ms * 0.001) + end + end + + def batch_metrics + @batch_metrics ||= Gitlab::Database::BackgroundMigration::BatchMetrics.new + end + + private + + def relation_scoped_to_range(source_table, source_key_column, start_id, stop_id) + define_batchable_model(source_table, connection: ActiveRecord::Base.connection) + .joins('INNER JOIN namespaces ON members.source_id = namespaces.id') + .where(source_key_column => start_id..stop_id) + .where(type: 'GroupMember') + .where(source_type: 'Namespace') + .where(member_namespace_id: nil) + end + end + end +end diff --git a/lib/gitlab/background_migration/batching_strategies/base_strategy.rb b/lib/gitlab/background_migration/batching_strategies/base_strategy.rb new file mode 100644 index 00000000000..37bddea4f61 --- /dev/null +++ b/lib/gitlab/background_migration/batching_strategies/base_strategy.rb @@ -0,0 +1,26 @@ +# frozen_string_literal: true + +module Gitlab + module BackgroundMigration + module BatchingStrategies + # Simple base class for batching strategy job classes. + # + # Any strategy class that inherits from the base class will have connection to the tracking database set on + # initialization. + class BaseStrategy + def initialize(connection:) + @connection = connection + end + + def next_batch(*arguments) + raise NotImplementedError, + "#{self.class} does not implement #{__method__}" + end + + private + + attr_reader :connection + end + end + end +end diff --git a/lib/gitlab/background_migration/batching_strategies/primary_key_batching_strategy.rb b/lib/gitlab/background_migration/batching_strategies/primary_key_batching_strategy.rb index 09700438d47..5569bac0e19 100644 --- a/lib/gitlab/background_migration/batching_strategies/primary_key_batching_strategy.rb +++ b/lib/gitlab/background_migration/batching_strategies/primary_key_batching_strategy.rb @@ -8,7 +8,7 @@ module Gitlab # values for the next batch as an array. # # If no more batches exist in the table, returns nil. - class PrimaryKeyBatchingStrategy + class PrimaryKeyBatchingStrategy < BaseStrategy include Gitlab::Database::DynamicModelHelpers # Finds and returns the next batch in the table. @@ -19,7 +19,7 @@ module Gitlab # batch_size - The size of the next batch # job_arguments - The migration job arguments def next_batch(table_name, column_name, batch_min_value:, batch_size:, job_arguments:) - model_class = define_batchable_model(table_name, connection: ActiveRecord::Base.connection) + model_class = define_batchable_model(table_name, connection: connection) quoted_column_name = model_class.connection.quote_column_name(column_name) relation = model_class.where("#{quoted_column_name} >= ?", batch_min_value) diff --git a/lib/gitlab/background_migration/encrypt_integration_properties.rb b/lib/gitlab/background_migration/encrypt_integration_properties.rb new file mode 100644 index 00000000000..3843356af69 --- /dev/null +++ b/lib/gitlab/background_migration/encrypt_integration_properties.rb @@ -0,0 +1,84 @@ +# frozen_string_literal: true + +module Gitlab + module BackgroundMigration + # Migrates the integration.properties column from plaintext to encrypted text. + class EncryptIntegrationProperties + # The Integration model, with just the relevant bits. + class Integration < ActiveRecord::Base + include EachBatch + + ALGORITHM = 'aes-256-gcm' + + self.table_name = 'integrations' + self.inheritance_column = :_type_disabled + + scope :with_properties, -> { where.not(properties: nil) } + scope :not_already_encrypted, -> { where(encrypted_properties: nil) } + scope :for_batch, ->(range) { where(id: range) } + + attr_encrypted :encrypted_properties_tmp, + attribute: :encrypted_properties, + mode: :per_attribute_iv, + key: ::Settings.attr_encrypted_db_key_base_32, + algorithm: ALGORITHM, + marshal: true, + marshaler: ::Gitlab::Json, + encode: false, + encode_iv: false + + # See 'Integration#reencrypt_properties' + def encrypt_properties + data = ::Gitlab::Json.parse(properties) + iv = generate_iv(ALGORITHM) + ep = self.class.encrypt(:encrypted_properties_tmp, data, { iv: iv }) + + [ep, iv] + end + end + + def perform(start_id, stop_id) + batch_query = Integration.with_properties.not_already_encrypted.for_batch(start_id..stop_id) + encrypt_batch(batch_query) + mark_job_as_succeeded(start_id, stop_id) + end + + private + + def mark_job_as_succeeded(*arguments) + Gitlab::Database::BackgroundMigrationJob.mark_all_as_succeeded( + self.class.name.demodulize, + arguments + ) + end + + # represent binary string as a PSQL binary literal: + # https://www.postgresql.org/docs/9.4/datatype-binary.html + def bytea(value) + "'\\x#{value.unpack1('H*')}'::bytea" + end + + def encrypt_batch(batch_query) + values = batch_query.select(:id, :properties).map do |record| + encrypted_properties, encrypted_properties_iv = record.encrypt_properties + "(#{record.id}, #{bytea(encrypted_properties)}, #{bytea(encrypted_properties_iv)})" + end + + return if values.empty? + + Integration.connection.execute(<<~SQL.squish) + WITH cte(cte_id, cte_encrypted_properties, cte_encrypted_properties_iv) + AS #{::Gitlab::Database::AsWithMaterialized.materialized_if_supported} ( + SELECT * + FROM (VALUES #{values.join(',')}) AS t (id, encrypted_properties, encrypted_properties_iv) + ) + UPDATE #{Integration.table_name} + SET encrypted_properties = cte_encrypted_properties + , encrypted_properties_iv = cte_encrypted_properties_iv + FROM cte + WHERE cte_id = id + SQL + end + end + end +end diff --git a/lib/gitlab/background_migration/fix_vulnerability_occurrences_with_hashes_as_raw_metadata.rb b/lib/gitlab/background_migration/fix_vulnerability_occurrences_with_hashes_as_raw_metadata.rb index 2b049ea2d2f..a34e923545c 100644 --- a/lib/gitlab/background_migration/fix_vulnerability_occurrences_with_hashes_as_raw_metadata.rb +++ b/lib/gitlab/background_migration/fix_vulnerability_occurrences_with_hashes_as_raw_metadata.rb @@ -59,7 +59,7 @@ module Gitlab private def mark_job_as_succeeded(*arguments) - Gitlab::Database::BackgroundMigrationJob.mark_all_as_succeeded( + ::Gitlab::Database::BackgroundMigrationJob.mark_all_as_succeeded( 'FixVulnerabilityOccurrencesWithHashesAsRawMetadata', arguments ) diff --git a/lib/gitlab/background_migration/job_coordinator.rb b/lib/gitlab/background_migration/job_coordinator.rb index b7d47c389df..acbb5f76ad8 100644 --- a/lib/gitlab/background_migration/job_coordinator.rb +++ b/lib/gitlab/background_migration/job_coordinator.rb @@ -50,34 +50,41 @@ module Gitlab Gitlab::Database::SharedModel.using_connection(connection, &block) end - def steal(steal_class, retry_dead_jobs: false) - with_shared_connection do + def pending_jobs(include_dead_jobs: false) + Enumerator.new do |y| queues = [ Sidekiq::ScheduledSet.new, Sidekiq::Queue.new(self.queue) ] - if retry_dead_jobs + if include_dead_jobs queues << Sidekiq::RetrySet.new queues << Sidekiq::DeadSet.new end queues.each do |queue| queue.each do |job| - migration_class, migration_args = job.args + y << job if job.klass == worker_class.name + end + end + end + end + + def steal(steal_class, retry_dead_jobs: false) + with_shared_connection do + pending_jobs(include_dead_jobs: retry_dead_jobs).each do |job| + migration_class, migration_args = job.args - next unless job.klass == worker_class.name - next unless migration_class == steal_class - next if block_given? && !(yield job) + next unless migration_class == steal_class + next if block_given? && !(yield job) - begin - perform(migration_class, migration_args) if job.delete - rescue Exception # rubocop:disable Lint/RescueException - worker_class # enqueue this migration again - .perform_async(migration_class, migration_args) + begin + perform(migration_class, migration_args) if job.delete + rescue Exception # rubocop:disable Lint/RescueException + worker_class # enqueue this migration again + .perform_async(migration_class, migration_args) - raise - end + raise end end end diff --git a/lib/gitlab/background_migration/migrate_personal_namespace_project_maintainer_to_owner.rb b/lib/gitlab/background_migration/migrate_personal_namespace_project_maintainer_to_owner.rb new file mode 100644 index 00000000000..49eff6e2771 --- /dev/null +++ b/lib/gitlab/background_migration/migrate_personal_namespace_project_maintainer_to_owner.rb @@ -0,0 +1,45 @@ +# frozen_string_literal: true + +module Gitlab + module BackgroundMigration + # Migrates personal namespace project `maintainer` memberships (for the associated user only) to OWNER + # Does not create any missing records, simply migrates existing ones + class MigratePersonalNamespaceProjectMaintainerToOwner + include Gitlab::Database::DynamicModelHelpers + + def perform(start_id, end_id, batch_table, batch_column, sub_batch_size, pause_ms) + parent_batch_relation = relation_scoped_to_range(batch_table, batch_column, start_id, end_id) + + parent_batch_relation.each_batch(column: batch_column, of: sub_batch_size) do |sub_batch| + batch_metrics.time_operation(:update_all) do + sub_batch.update_all('access_level = 50') + end + + pause_ms = 0 if pause_ms < 0 + sleep(pause_ms * 0.001) + end + end + + def batch_metrics + @batch_metrics ||= Gitlab::Database::BackgroundMigration::BatchMetrics.new + end + + private + + def relation_scoped_to_range(source_table, source_key_column, start_id, stop_id) + # members of projects within their own personal namespace + + # rubocop: disable CodeReuse/ActiveRecord + define_batchable_model(:members, connection: ApplicationRecord.connection) + .where(source_key_column => start_id..stop_id) + .joins("INNER JOIN projects ON members.source_id = projects.id") + .joins("INNER JOIN namespaces ON projects.namespace_id = namespaces.id") + .where(type: 'ProjectMember') + .where("namespaces.type = 'User'") + .where('members.access_level < 50') + .where('namespaces.owner_id = members.user_id') + end + end + # rubocop: enable CodeReuse/ActiveRecord + end +end diff --git a/lib/gitlab/background_migration/nullify_orphan_runner_id_on_ci_builds.rb b/lib/gitlab/background_migration/nullify_orphan_runner_id_on_ci_builds.rb new file mode 100644 index 00000000000..78e897d9ae1 --- /dev/null +++ b/lib/gitlab/background_migration/nullify_orphan_runner_id_on_ci_builds.rb @@ -0,0 +1,40 @@ +# frozen_string_literal: true + +module Gitlab + module BackgroundMigration + # A job to nullify orphan runner_id on ci_builds table + class NullifyOrphanRunnerIdOnCiBuilds + include Gitlab::Database::DynamicModelHelpers + + def perform(start_id, end_id, batch_table, batch_column, sub_batch_size, pause_ms) + pause_ms = 0 if pause_ms < 0 + + batch_relation = relation_scoped_to_range(batch_table, batch_column, start_id, end_id) + batch_relation.each_batch(column: batch_column, of: sub_batch_size, order_hint: :type) do |sub_batch| + batch_metrics.time_operation(:update_all) do + sub_batch.update_all(runner_id: nil) + end + + sleep(pause_ms * 0.001) + end + end + + def batch_metrics + @batch_metrics ||= Gitlab::Database::BackgroundMigration::BatchMetrics.new + end + + private + + def connection + ActiveRecord::Base.connection + end + + def relation_scoped_to_range(source_table, source_key_column, start_id, stop_id) + define_batchable_model(source_table, connection: connection) + .joins('LEFT OUTER JOIN ci_runners ON ci_runners.id = ci_builds.runner_id') + .where('ci_builds.runner_id IS NOT NULL AND ci_runners.id IS NULL') + .where(source_key_column => start_id..stop_id) + end + end + end +end diff --git a/lib/gitlab/background_migration/project_namespaces/backfill_project_namespaces.rb b/lib/gitlab/background_migration/project_namespaces/backfill_project_namespaces.rb index ba3f7c47047..c34cc57ce60 100644 --- a/lib/gitlab/background_migration/project_namespaces/backfill_project_namespaces.rb +++ b/lib/gitlab/background_migration/project_namespaces/backfill_project_namespaces.rb @@ -34,8 +34,11 @@ module Gitlab def backfill_project_namespaces(namespace_id) project_ids.each_slice(sub_batch_size) do |project_ids| - ActiveRecord::Base.connection.execute("select gin_clean_pending_list('index_namespaces_on_name_trigram')") - ActiveRecord::Base.connection.execute("select gin_clean_pending_list('index_namespaces_on_path_trigram')") + # cleanup gin indexes on namespaces table + cleanup_gin_index('namespaces') + + # cleanup gin indexes on projects table + cleanup_gin_index('projects') # We need to lock these project records for the period when we create project namespaces # and link them to projects so that if a project is modified in the time between creating @@ -53,6 +56,14 @@ module Gitlab end end + def cleanup_gin_index(table_name) + index_names = ActiveRecord::Base.connection.select_values("select indexname::text from pg_indexes where tablename = '#{table_name}' and indexdef ilike '%gin%'") + + index_names.each do |index_name| + ActiveRecord::Base.connection.execute("select gin_clean_pending_list('#{index_name}')") + end + end + def cleanup_backfilled_project_namespaces(namespace_id) project_ids.each_slice(sub_batch_size) do |project_ids| # IMPORTANT: first nullify project_namespace_id in projects table to avoid removing projects when records diff --git a/lib/gitlab/background_migration/remove_all_trace_expiration_dates.rb b/lib/gitlab/background_migration/remove_all_trace_expiration_dates.rb new file mode 100644 index 00000000000..d47aa76f24b --- /dev/null +++ b/lib/gitlab/background_migration/remove_all_trace_expiration_dates.rb @@ -0,0 +1,53 @@ +# frozen_string_literal: true + +module Gitlab + module BackgroundMigration + # Removing expire_at timestamps that shouldn't have + # been written to traces on gitlab.com. + class RemoveAllTraceExpirationDates + include Gitlab::Database::MigrationHelpers + + BATCH_SIZE = 1_000 + + # Stubbed class to connect to the CI database + # connects_to has to be called in abstract classes. + class MultiDbAdaptableClass < ActiveRecord::Base + self.abstract_class = true + + if Gitlab::Database.has_config?(:ci) + connects_to database: { writing: :ci, reading: :ci } + end + end + + # Stubbed class to access the ci_job_artifacts table + class JobArtifact < MultiDbAdaptableClass + include EachBatch + + self.table_name = 'ci_job_artifacts' + + TARGET_TIMESTAMPS = [ + Date.new(2021, 04, 22).midnight.utc, + Date.new(2021, 05, 22).midnight.utc, + Date.new(2021, 06, 22).midnight.utc, + Date.new(2022, 01, 22).midnight.utc, + Date.new(2022, 02, 22).midnight.utc, + Date.new(2022, 03, 22).midnight.utc, + Date.new(2022, 04, 22).midnight.utc + ].freeze + + scope :traces, -> { where(file_type: 3) } + scope :between, -> (start_id, end_id) { where(id: start_id..end_id) } + scope :in_targeted_timestamps, -> { where(expire_at: TARGET_TIMESTAMPS) } + end + + def perform(start_id, end_id) + return unless Gitlab.com? + + JobArtifact.traces + .between(start_id, end_id) + .in_targeted_timestamps + .each_batch(of: BATCH_SIZE) { |batch| batch.update_all(expire_at: nil) } + end + end + end +end diff --git a/lib/gitlab/background_migration/reset_duplicate_ci_runners_token_encrypted_values_on_projects.rb b/lib/gitlab/background_migration/reset_duplicate_ci_runners_token_encrypted_values_on_projects.rb new file mode 100644 index 00000000000..80ca76ef37f --- /dev/null +++ b/lib/gitlab/background_migration/reset_duplicate_ci_runners_token_encrypted_values_on_projects.rb @@ -0,0 +1,37 @@ +# frozen_string_literal: true + +module Gitlab + module BackgroundMigration + # A job to nullify duplicate runners_token_encrypted values in projects table in batches + class ResetDuplicateCiRunnersTokenEncryptedValuesOnProjects + class Project < ActiveRecord::Base # rubocop:disable Style/Documentation + include ::EachBatch + + self.table_name = 'projects' + + scope :base_query, -> do + where.not(runners_token_encrypted: nil) + end + end + + def perform(start_id, end_id) + # Reset duplicate runner tokens that would prevent creating an unique index. + duplicate_tokens = Project.base_query + .where(id: start_id..end_id) + .group(:runners_token_encrypted) + .having('COUNT(*) > 1') + .pluck(:runners_token_encrypted) + + Project.where(runners_token_encrypted: duplicate_tokens).update_all(runners_token_encrypted: nil) if duplicate_tokens.any? + + mark_job_as_succeeded(start_id, end_id) + end + + private + + def mark_job_as_succeeded(*arguments) + Gitlab::Database::BackgroundMigrationJob.mark_all_as_succeeded('ResetDuplicateCiRunnersTokenEncryptedValuesOnProjects', arguments) + end + end + end +end diff --git a/lib/gitlab/background_migration/reset_duplicate_ci_runners_token_values_on_projects.rb b/lib/gitlab/background_migration/reset_duplicate_ci_runners_token_values_on_projects.rb new file mode 100644 index 00000000000..d87ce6c88d3 --- /dev/null +++ b/lib/gitlab/background_migration/reset_duplicate_ci_runners_token_values_on_projects.rb @@ -0,0 +1,37 @@ +# frozen_string_literal: true + +module Gitlab + module BackgroundMigration + # A job to nullify duplicate ci_runners_token values in projects table in batches + class ResetDuplicateCiRunnersTokenValuesOnProjects + class Project < ActiveRecord::Base # rubocop:disable Style/Documentation + include ::EachBatch + + self.table_name = 'projects' + + scope :base_query, -> do + where.not(runners_token: nil) + end + end + + def perform(start_id, end_id) + # Reset duplicate runner tokens that would prevent creating an unique index. + duplicate_tokens = Project.base_query + .where(id: start_id..end_id) + .group(:runners_token) + .having('COUNT(*) > 1') + .pluck(:runners_token) + + Project.where(runners_token: duplicate_tokens).update_all(runners_token: nil) if duplicate_tokens.any? + + mark_job_as_succeeded(start_id, end_id) + end + + private + + def mark_job_as_succeeded(*arguments) + Gitlab::Database::BackgroundMigrationJob.mark_all_as_succeeded('ResetDuplicateCiRunnerValuesTokensOnProjects', arguments) + end + end + end +end |