Diffstat (limited to 'doc/development/database/loose_foreign_keys.md')
-rw-r--r-- | doc/development/database/loose_foreign_keys.md | 423 |
1 files changed, 400 insertions, 23 deletions
```plaintext
Created database 'gitlabhq_test_ee'
Created database 'gitlabhq_geo_test_ee'

Showing cross-schema foreign keys (20):

    ID | HAS_LFK | FROM                             | TO             | COLUMN                | ON_DELETE
     0 | N       | ci_builds                        | projects       | project_id            | cascade
     1 | N       | ci_job_artifacts                 | projects       | project_id            | cascade
     2 | N       | ci_pipelines                     | projects       | project_id            | cascade
     3 | Y       | ci_pipelines                     | merge_requests | merge_request_id      | cascade
     4 | N       | external_pull_requests           | projects       | project_id            | cascade
     5 | N       | ci_sources_pipelines             | projects       | project_id            | cascade
     6 | N       | ci_stages                        | projects       | project_id            | cascade
     7 | N       | ci_pipeline_schedules            | projects       | project_id            | cascade
     8 | N       | ci_runner_projects               | projects       | project_id            | cascade
     9 | Y       | dast_site_profiles_pipelines     | ci_pipelines   | ci_pipeline_id        | cascade
    10 | Y       | vulnerability_feedback           | ci_pipelines   | pipeline_id           | nullify
    11 | N       | ci_variables                     | projects       | project_id            | cascade
    12 | N       | ci_refs                          | projects       | project_id            | cascade
    13 | N       | ci_builds_metadata               | projects       | project_id            | cascade
    14 | N       | ci_subscriptions_projects        | projects       | downstream_project_id | cascade
    15 | N       | ci_subscriptions_projects        | projects       | upstream_project_id   | cascade
    16 | N       | ci_sources_projects              | projects       | source_project_id     | cascade
    17 | N       | ci_job_token_project_scope_links | projects       | source_project_id     | cascade
    18 | N       | ci_job_token_project_scope_links | projects       | target_project_id     | cascade
    19 | N       | ci_project_monthly_usages        | projects       | project_id            | cascade

To match FK write one or many filters to match against FROM/TO/COLUMN:

- scripts/decomposition/generate-loose-foreign-key <filter(s)...>
```
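The same information can be cross-checked against the PostgreSQL catalog. For
example, to list the foreign keys defined on `ci_pipelines` (a plain catalog
query, independent of the script above):

```sql
-- List foreign key constraints on ci_pipelines and the tables they reference.
SELECT conname AS constraint_name,
       conrelid::regclass AS from_table,
       confrelid::regclass AS to_table
FROM pg_constraint
WHERE contype = 'f'
  AND conrelid = 'ci_pipelines'::regclass;
```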
### Track record changes

To know about deletions in the `projects` table, configure a `DELETE` trigger
using a [post-deployment migration](post_deployment_migrations.md). The trigger
needs to be configured only once. If the model already has at least one
`loose_foreign_key` definition, then this step can be skipped.
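As a rough sketch of what such a migration installs: a statement-level
`AFTER DELETE` trigger on the parent table. The trigger and function names
below are illustrative placeholders, not the generated ones; the real migration
uses GitLab's migration helpers, and the `INSERT` statement the trigger function
runs is shown later on this page.

```sql
-- Illustrative sketch only: names are placeholders.
CREATE TRIGGER projects_loose_fk_trigger
AFTER DELETE ON projects
REFERENCING OLD TABLE AS old_table
FOR EACH STATEMENT
EXECUTE FUNCTION insert_into_loose_foreign_keys_deleted_records();
```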
The migration must run after the `DELETE` trigger is installed and the loose
foreign key definition is deployed. As such, it must be a [post-deployment
migration](post_deployment_migrations.md) dated after the migration for the
trigger. If the foreign key is deleted earlier, there is a good chance of
introducing data inconsistency which needs manual cleanup.

One example of such an inconsistency: code that assumes the parent record
exists fails when it executes `occurrence.pipeline.created_at`. When looping
through the vulnerability occurrences in the Sidekiq worker, we could try to
load the corresponding pipeline and choose to skip processing that occurrence
if the pipeline is not found.

## Architecture

The loose foreign keys feature is implemented within the `LooseForeignKeys`
Ruby namespace. The code is isolated from the core application code and,
theoretically, it could be a standalone library.

The feature is invoked solely in the [`LooseForeignKeys::CleanupWorker`](https://gitlab.com/gitlab-org/gitlab/-/blob/master/app/workers/loose_foreign_keys/cleanup_worker.rb)
worker class. The worker is scheduled via a cron job where the schedule depends
on the configuration of the GitLab instance.

- Non-decomposed GitLab (1 database): invoked every minute.
- Decomposed GitLab (2 databases, CI and Main): invoked every minute, cleaning
up one database at a time. For example, the cleanup worker for the main
database runs every two minutes.

To avoid lock contention and the processing of the same database rows, the
worker does not run in parallel. This behavior is ensured with a Redis lock.

**Record cleanup procedure:**

1. Acquire the Redis lock.
1. Determine which database to clean up.
1. Collect all database tables where the deletions are tracked (parent tables).
   - This is achieved by reading the `config/gitlab_loose_foreign_keys.yml` file.
   - A table is considered "tracked" when a loose foreign key definition exists
   for the table and the `DELETE` trigger is installed.
1. Cycle through the tables with an infinite loop.
1. For each table, load a batch of deleted parent records to clean up (a
simplified query sketch follows this list).
1. Depending on the YAML configuration, build `DELETE` or `UPDATE` (nullify)
queries for the referenced child tables.
1. Invoke the queries.
1. Repeat until all child records are cleaned up or the maximum limit is reached.
1. Remove the deleted parent records when all child records are cleaned up.
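The batch loading in step 5 could look roughly like the following query against
the tracking table described in the next section. The table name filter, batch
size, and locking clause are illustrative assumptions based on the batching
behavior documented below, not the exact query the worker generates:

```sql
-- Illustrative sketch: load one batch of pending deleted `projects` records.
SELECT id, primary_key_value
FROM loose_foreign_keys_deleted_records
WHERE fully_qualified_table_name = 'public.projects'
  AND status = 1        -- pending
  AND consume_after < NOW()
ORDER BY consume_after
LIMIT 100
FOR UPDATE SKIP LOCKED;
```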
### Database structure

The feature relies on triggers installed on the parent tables. When a parent
record is deleted, the trigger automatically inserts a new record into the
`loose_foreign_keys_deleted_records` database table.

The inserted record stores the following information about the deleted record:

- `fully_qualified_table_name`: name of the database table where the record was located.
- `primary_key_value`: the ID of the record; the value is present in the child
tables as the foreign key value. At the moment, composite primary keys are not
supported; the parent table must have an `id` column.
- `status`: defaults to pending, represents the status of the cleanup process.
- `consume_after`: defaults to the current time.
- `cleanup_attempts`: defaults to 0. The number of times the worker tried to
clean up this record. A non-zero number would mean that this record has many
child records and cleaning it up requires several runs.

#### Database decomposition

The `loose_foreign_keys_deleted_records` table will exist on both database
servers (Ci and Main) after the [database decomposition](https://gitlab.com/groups/gitlab-org/-/epics/6168).
The worker will determine which parent tables belong to which database by
reading the `lib/gitlab/database/gitlab_schemas.yml` YAML file.

Example:

- Main database tables
  - `projects`
  - `namespaces`
  - `merge_requests`
- Ci database tables
  - `ci_builds`
  - `ci_pipelines`

When the worker is invoked for the Ci database, it loads deleted records only
from the `ci_builds` and `ci_pipelines` tables. During the cleanup process,
`DELETE` and `UPDATE` queries mostly run on tables located in the Main
database. In this example, one `UPDATE` query nullifies the
`merge_requests.head_pipeline_id` column.

#### Database partitioning

Due to the large volume of inserts the database table receives daily, a special
partitioning strategy was implemented to address data bloat concerns.
Originally, the [time-decay](https://about.gitlab.com/company/team/structure/working-groups/database-scalability/time-decay.html)
strategy was considered for the feature but due to the large data volume we
decided to implement a new strategy.

A deleted record is considered fully processed when all its direct child
records have been cleaned up. When this happens, the loose foreign key worker
updates the `status` column of the deleted record. After this step, the record
is no longer needed.

The sliding partitioning strategy provides an efficient way of cleaning up old,
unused data by adding a new database partition and removing the old one when
certain conditions are met. The `loose_foreign_keys_deleted_records` database
table is list partitioned, where most of the time only one partition is
attached to the table.

```sql
                           Partitioned table "public.loose_foreign_keys_deleted_records"
           Column           |           Type           | Collation | Nullable |                            Default                              | Storage  | Stats target | Description
----------------------------+--------------------------+-----------+----------+-----------------------------------------------------------------+----------+--------------+-------------
 id                         | bigint                   |           | not null | nextval('loose_foreign_keys_deleted_records_id_seq'::regclass)  | plain    |              |
 partition                  | bigint                   |           | not null | 84                                                              | plain    |              |
 primary_key_value          | bigint                   |           | not null |                                                                 | plain    |              |
 status                     | smallint                 |           | not null | 1                                                               | plain    |              |
 created_at                 | timestamp with time zone |           | not null | now()                                                           | plain    |              |
 fully_qualified_table_name | text                     |           | not null |                                                                 | extended |              |
 consume_after              | timestamp with time zone |           |          | now()                                                           | plain    |              |
 cleanup_attempts           | smallint                 |           |          | 0                                                               | plain    |              |
Partition key: LIST (partition)
Indexes:
    "loose_foreign_keys_deleted_records_pkey" PRIMARY KEY, btree (partition, id)
    "index_loose_foreign_keys_deleted_records_for_partitioned_query" btree (partition, fully_qualified_table_name, consume_after, id) WHERE status = 1
Check constraints:
    "check_1a541f3235" CHECK (char_length(fully_qualified_table_name) <= 150)
Partitions: gitlab_partitions_dynamic.loose_foreign_keys_deleted_records_84 FOR VALUES IN ('84')
```

The `partition` column controls the insert direction: the `partition` value
determines which partition gets the deleted rows inserted via the trigger.
Notice that the default value of the `partition` column matches the value of
the list partition (84). In the `INSERT` query within the trigger, the value of
the `partition` column is omitted; the trigger always relies on the default
value of the column.

Example `INSERT` query for the trigger:

```sql
INSERT INTO loose_foreign_keys_deleted_records
(fully_qualified_table_name, primary_key_value)
SELECT TG_TABLE_SCHEMA || '.' || TG_TABLE_NAME, old_table.id FROM old_table;
```

The partition "sliding" process is controlled by two regularly executed
callbacks. These callbacks are defined within the
`LooseForeignKeys::DeletedRecord` model.

The `next_partition_if` callback controls when to create a new partition. A new
partition is created when the current partition has at least one record older
than 24 hours. A new partition is added by the [`PartitionManager`](https://gitlab.com/gitlab-org/gitlab/-/blob/master/lib/gitlab/database/partitioning/partition_manager.rb)
using the following steps:

1. Create a new partition, where the `VALUE` for the partition is `CURRENT_PARTITION + 1`.
1. Update the default value of the `partition` column to `CURRENT_PARTITION + 1`.

With these steps, new `INSERT` statements via the triggers end up in the new
partition. At this point, the database table has two partitions.

The `detach_partition_if` callback determines if the old partitions can be
detached from the table. A partition is detachable if there are no pending
(unprocessed, `status = 1`) records in the partition. The detached partitions
remain available for some time; you can see the list of detached partitions in
the `detached_partitions` table:

```sql
select * from detached_partitions;
```
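To see which partition is currently attached, the standard PostgreSQL catalogs
can be queried directly:

```sql
-- List the partitions currently attached to the tracking table.
SELECT inhrelid::regclass AS attached_partition
FROM pg_inherits
WHERE inhparent = 'loose_foreign_keys_deleted_records'::regclass;
```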
#### Cleanup queries

The `LooseForeignKeys::CleanupWorker` has its own database query builder, which
depends on `Arel`. The feature doesn't reference any application-specific
`ActiveRecord` models to avoid unexpected side effects. The database queries
are batched, which means that several parent records are cleaned up at the
same time.

Example `DELETE` query:

```sql
DELETE
FROM "merge_request_metrics"
WHERE ("merge_request_metrics"."id") IN
  (SELECT "merge_request_metrics"."id"
   FROM "merge_request_metrics"
   WHERE "merge_request_metrics"."pipeline_id" IN (1, 2, 10, 20)
   LIMIT 1000 FOR UPDATE SKIP LOCKED)
```

The primary key values of the parent records are 1, 2, 10, and 20.

Example `UPDATE` (nullify) query:

```sql
UPDATE "merge_requests"
SET "head_pipeline_id" = NULL
WHERE ("merge_requests"."id") IN
  (SELECT "merge_requests"."id"
   FROM "merge_requests"
   WHERE "merge_requests"."head_pipeline_id" IN (3, 4, 30, 40)
   LIMIT 500 FOR UPDATE SKIP LOCKED)
```

These queries are batched, which means that in many cases, several invocations
are needed to clean up all associated child records.

The batching is implemented with loops; the processing stops when all
associated child records are cleaned up or the limit is reached.

```ruby
# First pass: process rows that are not locked by other processes.
loop do
  modification_count = process_batch_with_skip_locked

  break if modification_count == 0 || over_limit?
end

# Second pass: process the remaining rows, waiting for locks if necessary.
loop do
  modification_count = process_batch

  break if modification_count == 0 || over_limit?
end
```

The loop-based batch processing is preferred over `EachBatch` for the following
reasons:

- The records in the batch are modified, so the next batch contains different
records.
- There is always an index on the foreign key column; however, the column is
usually not unique. `EachBatch` requires a unique column for the iteration.
- The record order doesn't matter for the cleanup.

Notice that we have two loops. The initial loop processes records with the
`SKIP LOCKED` clause. The query skips rows that are locked by other application
processes. This makes it less likely for the cleanup worker to become blocked.
The second loop executes the database queries without `SKIP LOCKED` to ensure
that all records have been processed.

#### Processing limits

A constant, large volume of record updates or deletions can cause incidents and
affect the availability of GitLab:

- Increased table bloat.
- Increased number of pending WAL files.
- Busy tables, difficulty when acquiring locks.

To mitigate these issues, several limits are applied when the worker runs:

- Each query has a `LIMIT`; a query cannot process an unbounded number of rows.
- The maximum number of record deletions and record updates is limited.
- The maximum runtime (30 seconds) for the database queries is limited.

The limit rules are implemented in the `LooseForeignKeys::ModificationTracker`
class. When one of the limits (record modification count, time limit) is
reached, the processing is stopped immediately. After some time, the next
scheduled worker continues the cleanup process.

#### Performance characteristics

The database trigger on the parent tables **decreases** the record deletion
speed. Each statement that removes rows from the parent table invokes the
trigger to insert records into the `loose_foreign_keys_deleted_records` table.
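As a rough illustration of where that overhead comes from (fictional IDs, not
meant to be run on a real instance), a single parent `DELETE` now pays for two
writes, the delete itself and the tracking `INSERT` done by the trigger:

```sql
-- Deleting parent rows fires the statement-level trigger once.
DELETE FROM projects WHERE id IN (1, 2, 3);

-- The trigger recorded the deleted rows as pending:
SELECT primary_key_value, status
FROM loose_foreign_keys_deleted_records
WHERE fully_qualified_table_name = 'public.projects'
  AND primary_key_value IN (1, 2, 3);
```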
The queries within the cleanup worker are fairly efficient index scans; with
the limits in place, they're unlikely to affect other parts of the application.

The database queries do not run in a transaction; when an error happens, for
example a statement timeout or a worker crash, the next job continues the
processing.

## Troubleshooting

### Accumulation of deleted records

There can be cases where the workers need to process an unusually large amount
of data. This can happen under normal usage, for example when a large project
or group is deleted. In this scenario, there can be several million rows to be
deleted or nullified. Due to the limits enforced by the worker, processing this
data takes some time.

When cleaning up "heavy-hitters", the feature ensures fair processing by
rescheduling larger batches for later. This gives time for other deleted
records to be processed.

For example, a project with millions of `ci_builds` records is deleted. The
`ci_builds` records are deleted by the loose foreign keys feature.

1. The cleanup worker is scheduled and picks up a batch of deleted `projects`
records. The large project is part of the batch.
1. Deletion of the orphaned `ci_builds` rows has started.
1. The time limit is reached, but the cleanup is not complete.
1. The `cleanup_attempts` column is incremented for the deleted records.
1. Go to step 1. The next cleanup worker continues the cleanup.
1. When `cleanup_attempts` reaches 3, the batch is rescheduled 10 minutes later
by updating the `consume_after` column.
1. The next cleanup worker processes a different batch.

We have Prometheus metrics in place to monitor the deleted record cleanup:

- `loose_foreign_key_processed_deleted_records`: Number of processed deleted
records. When a large cleanup happens, this number would decrease.
- `loose_foreign_key_incremented_deleted_records`: Number of deleted records
whose processing did not finish; their `cleanup_attempts` column was
incremented.
- `loose_foreign_key_rescheduled_deleted_records`: Number of deleted records
that had to be rescheduled at a later time after 3 cleanup attempts.

Example Thanos query:

```plaintext
loose_foreign_key_rescheduled_deleted_records{env="gprd", table="ci_runners"}
```

Another way to look at the situation is by running a database query. This query
gives the exact counts of the unprocessed records:

```sql
SELECT partition, fully_qualified_table_name, count(*)
FROM loose_foreign_keys_deleted_records
WHERE status = 1
GROUP BY 1, 2;
```

Example output:

```sql
 partition | fully_qualified_table_name | count
-----------+----------------------------+-------
        87 | public.ci_builds           |   874
        87 | public.ci_job_artifacts    |  6658
        87 | public.ci_pipelines        |   102
        87 | public.ci_runners          |   111
        87 | public.merge_requests      |   255
        87 | public.namespaces          |    25
        87 | public.projects            |     6
```

The query includes the partition number, which can be useful to detect if the
cleanup process is significantly lagging behind. When multiple different
partition values are present in the list, the cleanup of some deleted records
didn't finish in several days (1 new partition is added every day).
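The `cleanup_attempts` column can be aggregated in a similar ad-hoc way to
check whether a few heavy records keep being retried (an illustrative query,
not part of the documented runbooks):

```sql
-- Distribution of retry counts for pending records, per parent table.
SELECT fully_qualified_table_name, cleanup_attempts, count(*)
FROM loose_foreign_keys_deleted_records
WHERE status = 1
GROUP BY 1, 2
ORDER BY 1, 2;
```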
Steps to diagnose the problem:

- Check which records are accumulating.
- Try to get an estimate of the number of remaining records.
- Look into the worker performance stats (Kibana or Thanos).

Possible solutions:

- Short-term: increase the batch sizes.
- Long-term: invoke the worker more frequently. Parallelize the worker.

For a one-time fix, we can run the cleanup worker several times from the Rails
console. The worker can run in parallel; however, this can introduce lock
contention and could increase the worker runtime.

```ruby
LooseForeignKeys::CleanupWorker.new.perform
```

When the cleanup is done, the older partitions are automatically detached by
the `PartitionManager`.

### PartitionManager bug

NOTE:
This issue happened in the past on Staging and it has been mitigated.

When adding a new partition, the default value of the `partition` column is
also updated. This is a schema change that is executed in the same transaction
as the new partition creation. It's highly unlikely that the `partition` column
goes out of sync.

However, if this happens, it can cause application-wide incidents because the
`partition` value points to a partition that doesn't exist. Symptom: deletion
of records from tables where the `DELETE` trigger is installed fails.

```sql
\d+ loose_foreign_keys_deleted_records;

           Column           |           Type           | Collation | Nullable |                            Default                              | Storage  | Stats target | Description
----------------------------+--------------------------+-----------+----------+-----------------------------------------------------------------+----------+--------------+-------------
 id                         | bigint                   |           | not null | nextval('loose_foreign_keys_deleted_records_id_seq'::regclass)  | plain    |              |
 partition                  | bigint                   |           | not null | 4                                                               | plain    |              |
 primary_key_value          | bigint                   |           | not null |                                                                 | plain    |              |
 status                     | smallint                 |           | not null | 1                                                               | plain    |              |
 created_at                 | timestamp with time zone |           | not null | now()                                                           | plain    |              |
 fully_qualified_table_name | text                     |           | not null |                                                                 | extended |              |
 consume_after              | timestamp with time zone |           |          | now()                                                           | plain    |              |
 cleanup_attempts           | smallint                 |           |          | 0                                                               | plain    |              |
Partition key: LIST (partition)
Indexes:
    "loose_foreign_keys_deleted_records_pkey" PRIMARY KEY, btree (partition, id)
    "index_loose_foreign_keys_deleted_records_for_partitioned_query" btree (partition, fully_qualified_table_name, consume_after, id) WHERE status = 1
Check constraints:
    "check_1a541f3235" CHECK (char_length(fully_qualified_table_name) <= 150)
Partitions: gitlab_partitions_dynamic.loose_foreign_keys_deleted_records_3 FOR VALUES IN ('3')
```

Check the default value of the `partition` column and compare it with the
available partitions (4 vs 3). The partition with the value of 4 does not
exist. To mitigate the problem, an emergency schema change is required:

```sql
ALTER TABLE loose_foreign_keys_deleted_records ALTER COLUMN partition SET DEFAULT 3;
```
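After the schema change, the new default can be double-checked against the
catalog; a quick verification sketch:

```sql
-- Confirm that the default now matches an existing partition.
SELECT column_default
FROM information_schema.columns
WHERE table_name = 'loose_foreign_keys_deleted_records'
  AND column_name = 'partition';
```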