From 8e6ab9a259d921298940190161fadfd118c6dc15 Mon Sep 17 00:00:00 2001
From: Jordi Serra Torrens
Date: Thu, 30 Dec 2021 12:18:25 +0000
Subject: SERVER-62245 MigrationRecovery must not assume that only one
 migration needs to be recovered

---
 etc/backports_required_for_multiversion_tests.yml |  4 +
 .../recover_multiple_migrations_on_stepup.js      | 85 ++++++++++++++++++++++
 src/mongo/db/s/migration_coordinator.cpp          | 41 +++++++++--
 src/mongo/db/s/migration_coordinator.h            |  2 +
 src/mongo/db/s/migration_util.cpp                 | 25 ++-----
 5 files changed, 133 insertions(+), 24 deletions(-)
 create mode 100644 jstests/sharding/recover_multiple_migrations_on_stepup.js

diff --git a/etc/backports_required_for_multiversion_tests.yml b/etc/backports_required_for_multiversion_tests.yml
index d8cd34bf4a1..582f4cb127e 100644
--- a/etc/backports_required_for_multiversion_tests.yml
+++ b/etc/backports_required_for_multiversion_tests.yml
@@ -132,6 +132,8 @@ last-continuous:
    test_file: jstests/auth/dbcheck.js
  - ticket: SERVER-61666
    test_file: jstests/replsets/tenant_migration_transaction_boundary.js
+  - ticket: SERVER-62245
+    test_file: jstests/sharding/recover_multiple_migrations_on_stepup.js
  - ticket: SERVER-62212
    test_file: jstests/replsets/dbcheck_write_concern.js
  - ticket: SERVER-62296
@@ -409,6 +411,8 @@ last-lts:
    test_file: jstests/auth/dbcheck.js
  - ticket: SERVER-61666
    test_file: jstests/replsets/tenant_migration_transaction_boundary.js
+  - ticket: SERVER-62245
+    test_file: jstests/sharding/recover_multiple_migrations_on_stepup.js
  - ticket: SERVER-62296
    test_file: jstests/sharding/migration_recovers_unfinished_migrations.js
diff --git a/jstests/sharding/recover_multiple_migrations_on_stepup.js b/jstests/sharding/recover_multiple_migrations_on_stepup.js
new file mode 100644
index 00000000000..b8cf4644df8
--- /dev/null
+++ b/jstests/sharding/recover_multiple_migrations_on_stepup.js
@@ -0,0 +1,85 @@
+/**
+ * Tests that if on stepup a shard finds more than one config.migrationCoordinators document
+ * (which could happen as a consequence of the bug described in SERVER-62245), the shard will be
+ * able to properly recover all migrations.
+ *
+ * TODO: SERVER-62316 This test should be deleted when 6.0 becomes the lastLTS version, since a
+ * situation where there is more than one migrationCoordinators document will no longer be
+ * possible.
+ */
+(function() {
+"use strict";
+
+load("jstests/libs/fail_point_util.js");
+load('./jstests/libs/chunk_manipulation_util.js');
+
+// Disable checking for index consistency to ensure that the config server doesn't trigger a
+// StaleShardVersion exception on the shards and cause them to refresh their sharding metadata.
+// That would interfere with the precise migration recovery interleaving this test requires.
+const nodeOptions = {
+    setParameter: {enableShardedIndexConsistencyCheck: false}
+};
+
+var st = new ShardingTest({shards: 2, other: {configOptions: nodeOptions, enableBalancer: false}});
+let staticMongod = MongoRunner.runMongod({});
+
+const dbName = "test";
+const collNameA = "foo";
+const collNameB = "bar";
+const nsA = dbName + "." + collNameA;
+const nsB = dbName + "." + collNameB;
+const collA = st.s.getDB(dbName)[collNameA];
+const collB = st.s.getDB(dbName)[collNameB];
+
+assert.commandWorked(st.s.adminCommand({enableSharding: dbName}));
+st.ensurePrimaryShard(dbName, st.shard0.shardName);
+assert.commandWorked(st.s.adminCommand({shardCollection: nsA, key: {_id: 1}}));
+assert.commandWorked(st.s.adminCommand({shardCollection: nsB, key: {_id: 1}}));
+
+// Run a first migration just to save its associated config.migrationCoordinators document for the
+// purposes of the test.
+var moveChunkHangAtStep3Failpoint = configureFailPoint(st.rs0.getPrimary(), "moveChunkHangAtStep3");
+var joinMoveChunk1 = moveChunkParallel(
+    staticMongod, st.s0.host, {_id: 0}, null, nsA, st.shard1.shardName, true /* expectSuccess */);
+moveChunkHangAtStep3Failpoint.wait();
+
+let migrationCoordinatorDocuments =
+    st.rs0.getPrimary().getDB('config')['migrationCoordinators'].find().toArray();
+assert.eq(1, migrationCoordinatorDocuments.length);
+let firstMigrationCoordinatorDoc = migrationCoordinatorDocuments[0];
+
+// Let the first migration finish and delete its migrationCoordinator document. Otherwise, because
+// of the fix introduced in SERVER-62296, no new migration could start until the first one has
+// deleted its recovery document.
+moveChunkHangAtStep3Failpoint.off();
+joinMoveChunk1();
+
+// Start a second migration on a different collection, wait until it persists its recovery
+// document, and then step down the donor.
+var moveChunkHangAtStep3Failpoint = configureFailPoint(st.rs0.getPrimary(), "moveChunkHangAtStep3");
+
+var joinMoveChunk2 = moveChunkParallel(
+    staticMongod, st.s0.host, {_id: 0}, null, nsB, st.shard1.shardName, false /* expectSuccess */);
+
+moveChunkHangAtStep3Failpoint.wait();
+
+// Insert the recovery document from the first migration to simulate the bug described in
+// SERVER-62245.
+assert.commandWorked(st.rs0.getPrimary().getDB('config')['migrationCoordinators'].insert(
+    firstMigrationCoordinatorDoc));
+
+// Now we have two migrations pending to be recovered.
+assert.eq(2, st.rs0.getPrimary().getDB('config')['migrationCoordinators'].countDocuments({}));
+
+// Step down the donor shard.
+assert.commandWorked(st.rs0.getPrimary().adminCommand({replSetStepDown: 5, force: true}));
+moveChunkHangAtStep3Failpoint.off();
+joinMoveChunk2();
+
+// Check that the donor shard has been able to recover the shard version for both collections.
+assert.eq(0, collA.find().itcount());
+assert.eq(0, collB.find().itcount());
+
+MongoRunner.stopMongod(staticMongod);
+st.stop();
+})();
diff --git a/src/mongo/db/s/migration_coordinator.cpp b/src/mongo/db/s/migration_coordinator.cpp
index 5b983056cf3..dfab697554e 100644
--- a/src/mongo/db/s/migration_coordinator.cpp
+++ b/src/mongo/db/s/migration_coordinator.cpp
@@ -90,7 +90,7 @@ MigrationCoordinator::MigrationCoordinator(MigrationSessionId sessionId,
       _waitForDelete(waitForDelete) {}
 
 MigrationCoordinator::MigrationCoordinator(const MigrationCoordinatorDocument& doc)
-    : _migrationInfo(doc) {}
+    : _migrationInfo(doc), _recoveringMigration(true) {}
 
 MigrationCoordinator::~MigrationCoordinator() = default;
 
@@ -208,10 +208,26 @@ SemiFuture<void> MigrationCoordinator::_commitMigrationOnDonorAndRecipient(
                 "lsid"_attr = _migrationInfo.getLsid(),
                 "currentTxnNumber"_attr = _migrationInfo.getTxnNumber(),
                 "migrationId"_attr = _migrationInfo.getId());
-    migrationutil::advanceTransactionOnRecipient(opCtx,
-                                                 _migrationInfo.getRecipientShardId(),
-                                                 _migrationInfo.getLsid(),
-                                                 _migrationInfo.getTxnNumber());
+    try {
+        migrationutil::advanceTransactionOnRecipient(opCtx,
+                                                     _migrationInfo.getRecipientShardId(),
+                                                     _migrationInfo.getLsid(),
+                                                     _migrationInfo.getTxnNumber());
+    } catch (const ExceptionFor<ErrorCodes::TransactionTooOld>& ex) {
+        // TODO: SERVER-62316: No longer catch after 6.0 branches out
+        if (_recoveringMigration) {
+            LOGV2_WARNING(6224500,
+                          "Transaction number on recipient shard was already advanced by a later "
+                          "migration that started before this one finished recovery",
+                          "namespace"_attr = _migrationInfo.getNss(),
+                          "migrationId"_attr = _migrationInfo.getId(),
+                          "lsid"_attr = _migrationInfo.getLsid(),
+                          "currentTxnNumber"_attr = _migrationInfo.getTxnNumber(),
+                          "error"_attr = redact(ex));
+        } else {
+            throw;
+        }
+    }
 
     hangBeforeSendingCommitDecision.pauseWhileSet();
 
@@ -293,8 +309,21 @@ void MigrationCoordinator::_abortMigrationOnDonorAndRecipient(OperationContext*
                       "recipientShardId"_attr = _migrationInfo.getRecipientShardId(),
                       "currentTxnNumber"_attr = _migrationInfo.getTxnNumber(),
                       "error"_attr = exShardNotFound);
+    } catch (const ExceptionFor<ErrorCodes::TransactionTooOld>& ex) {
+        // TODO: SERVER-62316: No longer catch after 6.0 branches out
+        if (_recoveringMigration) {
+            LOGV2_WARNING(6224501,
+                          "Transaction number on recipient shard was already advanced by a later "
+                          "migration that started before this one finished recovery",
+                          "namespace"_attr = _migrationInfo.getNss(),
+                          "migrationId"_attr = _migrationInfo.getId(),
+                          "lsid"_attr = _migrationInfo.getLsid(),
+                          "currentTxnNumber"_attr = _migrationInfo.getTxnNumber(),
+                          "error"_attr = redact(ex));
+        } else {
+            throw;
+        }
     }
-
     LOGV2_DEBUG(23902,
                 2,
                 "Marking range deletion task on recipient as ready for processing",
diff --git a/src/mongo/db/s/migration_coordinator.h b/src/mongo/db/s/migration_coordinator.h
index 0c96d1eab15..88db1a68008 100644
--- a/src/mongo/db/s/migration_coordinator.h
+++ b/src/mongo/db/s/migration_coordinator.h
@@ -125,6 +125,8 @@ private:
     MigrationCoordinatorDocument _migrationInfo;
     bool _waitForDelete = false;
     boost::optional> _releaseRecipientCriticalSectionFuture;
+    const bool _recoveringMigration =
+        false;  // TODO: SERVER-62316: Can be removed after 6.0 branches out
 };
 
 }  // namespace migrationutil
diff --git a/src/mongo/db/s/migration_util.cpp b/src/mongo/db/s/migration_util.cpp
index 61fe9f728eb..fa60f7990a8 100644
--- a/src/mongo/db/s/migration_util.cpp
+++ b/src/mongo/db/s/migration_util.cpp
@@ -816,7 +816,12 @@ void markAsReadyRangeDeletionTaskLocally(OperationContext* opCtx, const UUID& mi
     auto update = BSON("$unset" << BSON(RangeDeletionTask::kPendingFieldName << ""));
 
     hangInReadyRangeDeletionLocallyInterruptible.pauseWhileSet(opCtx);
-    store.update(opCtx, query, update);
+    try {
+        store.update(opCtx, query, update);
+    } catch (const ExceptionFor<ErrorCodes::NoMatchingDocument>&) {
+        // If we are recovering the migration, the range-deletion may have already finished. So its
+        // associated document may already have been removed.
+    }
 
     if (hangInReadyRangeDeletionLocallyThenSimulateErrorUninterruptible.shouldFail()) {
         hangInReadyRangeDeletionLocallyThenSimulateErrorUninterruptible.pauseWhileSet(opCtx);
@@ -884,16 +889,6 @@ void resumeMigrationCoordinationsOnStepUp(OperationContext* opCtx) {
     store.forEach(opCtx,
                   BSONObj{},
                   [&opCtx, &unfinishedMigrationsCount](const MigrationCoordinatorDocument& doc) {
-                      // MigrationCoordinators are only created under the MigrationBlockingGuard,
-                      // which means that only one can possibly exist on an instance at a time.
-                      // Furthermore, recovery of an incomplete MigrationCoordator also acquires the
-                      // MigrationBlockingGuard. Because of this it is not possible to have more
-                      // than one unfinished migration.
-                      invariant(unfinishedMigrationsCount == 0,
-                                str::stream()
-                                    << "Upon step-up a second migration coordinator was found"
-                                    << redact(doc.toBSON()));
-
                       unfinishedMigrationsCount++;
                       LOGV2_DEBUG(4798511,
                                   3,
@@ -908,14 +903,8 @@ void resumeMigrationCoordinationsOnStepUp(OperationContext* opCtx) {
                           CollectionShardingRuntime::get(opCtx, nss)->clearFilteringMetadata(opCtx);
                       }
 
-                      auto mbg = std::make_shared<MigrationBlockingGuard>(
-                          opCtx,
-                          str::stream() << "Recovery of migration session "
-                                        << doc.getMigrationSessionId().toString()
-                                        << " on collection " << nss);
-
                       ExecutorFuture<void>(getMigrationUtilExecutor(opCtx->getServiceContext()))
-                          .then([serviceContext = opCtx->getServiceContext(), nss, mbg] {
+                          .then([serviceContext = opCtx->getServiceContext(), nss] {
                               ThreadClient tc("TriggerMigrationRecovery", serviceContext);
                               {
                                   stdx::lock_guard<Client> lk(*tc.get());
-- 
cgit v1.2.1