author     auto-revert-processor <dev-prod-dag@mongodb.com>    2023-04-15 04:58:07 +0000
committer  Evergreen Agent <no-reply@evergreen.mongodb.com>    2023-04-15 05:32:18 +0000
commit     fecd7c876ae3838942f752267f4a0b30e5e0d373 (patch)
tree       31dc0c3b5f93149b007e2365c06b24df10cb0b05 /jstests/replsets
parent     88a17d29b2890c0ab25ed44ddc349f3589fb6934 (diff)
Revert "SERVER-75990: Tenant Migrations are not resilient to recipient failover"
This reverts commit c92d1161fdf333a3e3219631557a480fae30a593.
Diffstat (limited to 'jstests/replsets')
-rw-r--r--  jstests/replsets/tenant_migration_cloner_stats_with_failover.js  146
-rw-r--r--  jstests/replsets/tenant_migration_cluster_time_keys_cloning.js  101
-rw-r--r--  jstests/replsets/tenant_migration_donor_resume_on_stepup_and_restart.js  488
-rw-r--r--  jstests/replsets/tenant_migration_external_keys_ttl.js  228
-rw-r--r--  jstests/replsets/tenant_migration_recipient_failover_before_creating_oplog_buffer.js  54
-rw-r--r--  jstests/replsets/tenant_migration_recipient_initial_sync_cloning.js  199
-rw-r--r--  jstests/replsets/tenant_migration_recipient_resume_on_stepup_and_restart.js  211
-rw-r--r--  jstests/replsets/tenant_migration_recipient_retryable_writes_failover.js  110
-rw-r--r--  jstests/replsets/tenant_migration_recipient_rollback_recovery.js  335
-rw-r--r--  jstests/replsets/tenant_migration_recipient_sync_donor_timestamp.js  94
-rw-r--r--  jstests/replsets/tenant_migration_recipient_sync_source_reconnect_delayed_secondary.js  66
-rw-r--r--  jstests/replsets/tenant_migration_recipient_sync_source_restart_donor_secondary.js  67
-rw-r--r--  jstests/replsets/tenant_migration_resume_collection_cloner_after_recipient_failover.js  132
-rw-r--r--  jstests/replsets/tenant_migration_resume_collection_cloner_after_recipient_failover_with_dropped_views.js  133
-rw-r--r--  jstests/replsets/tenant_migration_resume_collection_cloner_after_rename.js  124
-rw-r--r--  jstests/replsets/tenant_migration_resume_oplog_application.js  119
-rw-r--r--  jstests/replsets/tenant_migration_retryable_write_retry_on_recipient.js  4
-rw-r--r--  jstests/replsets/tenant_migration_timeseries_retryable_write_retry_on_recipient.js  3
18 files changed, 2441 insertions, 173 deletions
diff --git a/jstests/replsets/tenant_migration_cloner_stats_with_failover.js b/jstests/replsets/tenant_migration_cloner_stats_with_failover.js
new file mode 100644
index 00000000000..77c98dea613
--- /dev/null
+++ b/jstests/replsets/tenant_migration_cloner_stats_with_failover.js
@@ -0,0 +1,146 @@
+/**
+ * Tests tenant migration cloner stats such as 'approxTotalDataSize', 'approxTotalBytesCopied',
+ * 'databasesClonedBeforeFailover' across multiple databases and collections with failovers.
+ *
+ * This test does the following:
+ * 1. Insert two databases on the donor. The first database consists of one collection, the second
+ * consists of two collections.
+ * 2. Wait for the primary (referred to as the original primary) to clone one batch from the second
+ * database's second collection.
+ * 3. Step up the new primary. Ensure that the stats such as 'databasesClonedBeforeFailover' tally.
+ * 4. Allow the tenant migration to complete and commit. Ensure that stats are sensible.
+ *
+ * @tags: [
+ * incompatible_with_macos,
+ * incompatible_with_shard_merge,
+ * incompatible_with_windows_tls,
+ * requires_majority_read_concern,
+ * requires_persistence,
+ * serverless,
+ * ]
+ */
+
+import {TenantMigrationTest} from "jstests/replsets/libs/tenant_migration_test.js";
+load("jstests/libs/uuid_util.js"); // For extractUUIDFromObject().
+load("jstests/libs/fail_point_util.js"); // For configureFailPoint().
+
+// Limit the batch size to test the stat in between batches.
+const tenantMigrationTest = new TenantMigrationTest(
+ {name: jsTestName(), sharedOptions: {setParameter: {collectionClonerBatchSize: 10}}});
+
+const kMigrationId = UUID();
+const kTenantId = ObjectId().str;
+const kReadPreference = {
+ mode: "primary"
+};
+const migrationOpts = {
+ migrationIdString: extractUUIDFromObject(kMigrationId),
+ tenantId: kTenantId,
+ readPreference: kReadPreference
+};
+
+const dbName = tenantMigrationTest.tenantDB(kTenantId, "testDB");
+const collName = "coll";
+const dbName1 = dbName + '_db_1';
+const dbName2 = dbName + '_db_2';
+const db2Coll1 = collName + "_db_2_1";
+const db2Coll2 = collName + "_db_2_2";
+
+// Add a large amount of data to the donor.
+jsTestLog("Adding data to donor.");
+const dataForEachCollection = [...Array(100).keys()].map((i) => ({a: i, b: 'metanoia'}));
+tenantMigrationTest.insertDonorDB(dbName1, collName + "_1", dataForEachCollection);
+tenantMigrationTest.insertDonorDB(dbName2, db2Coll1, dataForEachCollection);
+tenantMigrationTest.insertDonorDB(dbName2, db2Coll2, dataForEachCollection);
+
+const originalRecipientPrimary = tenantMigrationTest.getRecipientPrimary();
+const newRecipientPrimary = tenantMigrationTest.getRecipientRst().getSecondaries()[0];
+
+jsTestLog("Collecting the stats of the databases and collections from the donor.");
+const donorPrimary = tenantMigrationTest.getDonorPrimary();
+const donorDB2 = donorPrimary.getDB(dbName2);
+
+const db1Size = assert.commandWorked(donorPrimary.getDB(dbName1).runCommand({dbStats: 1})).dataSize;
+const db2Size = assert.commandWorked(donorDB2.runCommand({dbStats: 1})).dataSize;
+const db2Collection1Size = assert.commandWorked(donorDB2.runCommand({collStats: db2Coll1})).size;
+const db2Collection2Size = assert.commandWorked(donorDB2.runCommand({collStats: db2Coll2})).size;
+
+const donorStats = {
+ db1Size,
+ db2Size,
+ db2Collection1Size,
+ db2Collection2Size
+};
+jsTestLog("Collected the following stats on the donor: " + tojson(donorStats));
+
+// The last collection to be cloned is the one with a greater UUID.
+const collInfo = donorDB2.getCollectionInfos();
+const uuid1 = collInfo[0].info.uuid;
+const uuid2 = collInfo[1].info.uuid;
+const lastCollection = (uuid1 > uuid2) ? db2Coll1 : db2Coll2;
+
+// Create a failpoint to pause after one batch of the second database's second collection has been
+// cloned.
+const fpAfterBatchOfSecondDB = configureFailPoint(
+ originalRecipientPrimary,
+ "tenantMigrationHangCollectionClonerAfterHandlingBatchResponse",
+ {nss: originalRecipientPrimary.getDB(dbName2).getCollection(lastCollection).getFullName()});
+
+jsTestLog("Starting tenant migration with migrationId: " + kMigrationId +
+ ", tenantId: " + kTenantId);
+assert.commandWorked(tenantMigrationTest.startMigration(migrationOpts));
+
+let res = 0;
+let currOp = 0;
+jsTestLog("Waiting until one batch of second database has been cloned by original primary.");
+fpAfterBatchOfSecondDB.wait();
+// Since documents are inserted on a separate thread, wait until the expected stats are seen. The
+// failpoint needs to be maintained so that the next batch isn't processed.
+assert.soon(() => {
+ res = originalRecipientPrimary.adminCommand(
+ {currentOp: true, desc: "tenant recipient migration"});
+ currOp = res.inprog[0];
+
+ // Wait until one batch of documents of the second database's second collection has been copied.
+ return currOp.approxTotalBytesCopied > db1Size + db2Collection1Size;
+}, res);
+
+assert.eq(currOp.approxTotalDataSize, db1Size + db2Size, res);
+// Since the two collections on the second database are the same size,
+// 'db1Size + db2Collection1Size' and 'db1Size + db2Collection2Size' evaluate to the same value.
+assert.gt(currOp.approxTotalBytesCopied, db1Size + db2Collection1Size, res);
+assert.lt(currOp.approxTotalBytesCopied, db1Size + db2Size, res);
+assert.eq(currOp.databases.databasesClonedBeforeFailover, 0, res);
+assert.eq(currOp.databases[dbName2].clonedCollectionsBeforeFailover, 0, res);
+const bytesCopiedIncludingSecondDB = currOp.approxTotalBytesCopied;
+jsTestLog("Bytes copied after first batch of second database: " + bytesCopiedIncludingSecondDB);
+
+// Wait until the batch of the second collection of the second database has been replicated from the
+// original primary to the new primary. Then, step up the new primary.
+const fpAfterCreatingCollectionOfSecondDB =
+ configureFailPoint(newRecipientPrimary, "tenantCollectionClonerHangAfterCreateCollection");
+tenantMigrationTest.getRecipientRst().stepUp(newRecipientPrimary);
+fpAfterBatchOfSecondDB.off();
+
+jsTestLog("Wait until the new primary creates collection of second database.");
+fpAfterCreatingCollectionOfSecondDB.wait();
+res = newRecipientPrimary.adminCommand({currentOp: true, desc: "tenant recipient migration"});
+currOp = res.inprog[0];
+assert.eq(currOp.approxTotalDataSize, db1Size + db2Size, res);
+assert.eq(currOp.approxTotalBytesCopied, bytesCopiedIncludingSecondDB, res);
+assert.eq(currOp.databases.databasesClonedBeforeFailover, 1, res);
+assert.eq(currOp.databases[dbName2].clonedCollectionsBeforeFailover, 1, res);
+fpAfterCreatingCollectionOfSecondDB.off();
+
+// After the migration completes, the total bytes copied should be equal to the total data size.
+jsTestLog("Waiting for migration to complete.");
+TenantMigrationTest.assertCommitted(tenantMigrationTest.waitForMigrationToComplete(migrationOpts));
+res = newRecipientPrimary.adminCommand({currentOp: true, desc: "tenant recipient migration"});
+currOp = res.inprog[0];
+assert.eq(currOp.approxTotalDataSize, db1Size + db2Size, res);
+assert.eq(currOp.approxTotalBytesCopied, db1Size + db2Size, res);
+assert.eq(currOp.databases.databasesClonedBeforeFailover, 1, res);
+assert.eq(currOp.databases[dbName2].clonedCollectionsBeforeFailover, 1, res);
+
+tenantMigrationTest.stop();
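
Every stats check in the test above reads the recipient's cloner progress from the currentOp output filtered by desc: "tenant recipient migration". A minimal sketch of that polling pattern as a reusable helper (the helper name and the minBytes parameter are illustrative, not part of the test):

// Illustrative helper, not part of the original test: poll the recipient's currentOp
// until the migration reports more than `minBytes` copied, then return that entry.
function waitForBytesCopied(recipientPrimary, minBytes) {
    let currOp;
    assert.soon(() => {
        const res = assert.commandWorked(recipientPrimary.adminCommand(
            {currentOp: true, desc: "tenant recipient migration"}));
        currOp = res.inprog[0];
        return currOp && currOp.approxTotalBytesCopied > minBytes;
    }, "recipient never reported more than " + minBytes + " bytes copied");
    return currOp;
}
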
diff --git a/jstests/replsets/tenant_migration_cluster_time_keys_cloning.js b/jstests/replsets/tenant_migration_cluster_time_keys_cloning.js
index f3ab1df7e24..f3737dd4706 100644
--- a/jstests/replsets/tenant_migration_cluster_time_keys_cloning.js
+++ b/jstests/replsets/tenant_migration_cluster_time_keys_cloning.js
@@ -177,58 +177,55 @@ const migrationX509Options = makeX509OptionsForTest();
tenantMigrationTest.stop();
})();
-// TODO SERVER-76128: Tenant Migrations are not robust to recipient failover.
-// (() => {
-// jsTest.log("Test that the donor and recipient correctly copy each other's cluster time keys "
-// +
-// "when there is recipient failover.");
-// const recipientRst = new ReplSetTest({
-// nodes: 3,
-// name: "recipientRst",
-// serverless: true,
-// nodeOptions: migrationX509Options.recipient
-// });
-// recipientRst.startSet();
-// recipientRst.initiate();
-// if (isShardMergeEnabled(recipientRst.getPrimary().getDB("adminDB"))) {
-// jsTestLog("Skip: featureFlagShardMerge enabled, but shard merge does not survive
-// failover");
-// recipientRst.stopSet();
-// return;
-// }
-
-// const tenantMigrationTest = new TenantMigrationTest({name: jsTestName(), recipientRst});
-
-// const recipientPrimary = recipientRst.getPrimary();
-// const fp = configureFailPoint(recipientPrimary,
-// "fpAfterPersistingTenantMigrationRecipientInstanceStateDoc",
-// {action: "hang"});
-
-// const migrationId = UUID();
-// const migrationOpts = {
-// migrationIdString: extractUUIDFromObject(migrationId),
-// tenantId: kTenantId1,
-// };
-// assert.commandWorked(tenantMigrationTest.startMigration(migrationOpts));
-// fp.wait();
-
-// assert.commandWorked(
-// recipientPrimary.adminCommand({replSetStepDown: ReplSetTest.kForeverSecs, force: true}));
-// assert.commandWorked(recipientPrimary.adminCommand({replSetFreeze: 0}));
-
-// fp.off();
-// TenantMigrationTest.assertCommitted(tenantMigrationTest.waitForMigrationToComplete(
-// migrationOpts, true /* retryOnRetryableErrors */));
-
-// assertCopiedExternalKeys(tenantMigrationTest, migrationId);
-
-// // After another migration, the first's keys should still exist.
-// runMigrationAndAssertExternalKeysCopied(tenantMigrationTest, kTenantId2);
-// assertCopiedExternalKeys(tenantMigrationTest, migrationId);
-
-// recipientRst.stopSet();
-// tenantMigrationTest.stop();
-// })();
+(() => {
+ jsTest.log("Test that the donor and recipient correctly copy each other's cluster time keys " +
+ "when there is recipient failover.");
+ const recipientRst = new ReplSetTest({
+ nodes: 3,
+ name: "recipientRst",
+ serverless: true,
+ nodeOptions: migrationX509Options.recipient
+ });
+ recipientRst.startSet();
+ recipientRst.initiate();
+ if (isShardMergeEnabled(recipientRst.getPrimary().getDB("adminDB"))) {
+ jsTestLog("Skip: featureFlagShardMerge enabled, but shard merge does not survive failover");
+ recipientRst.stopSet();
+ return;
+ }
+
+ const tenantMigrationTest = new TenantMigrationTest({name: jsTestName(), recipientRst});
+
+ const recipientPrimary = recipientRst.getPrimary();
+ const fp = configureFailPoint(recipientPrimary,
+ "fpAfterPersistingTenantMigrationRecipientInstanceStateDoc",
+ {action: "hang"});
+
+ const migrationId = UUID();
+ const migrationOpts = {
+ migrationIdString: extractUUIDFromObject(migrationId),
+ tenantId: kTenantId1,
+ };
+ assert.commandWorked(tenantMigrationTest.startMigration(migrationOpts));
+ fp.wait();
+
+ assert.commandWorked(
+ recipientPrimary.adminCommand({replSetStepDown: ReplSetTest.kForeverSecs, force: true}));
+ assert.commandWorked(recipientPrimary.adminCommand({replSetFreeze: 0}));
+
+ fp.off();
+ TenantMigrationTest.assertCommitted(tenantMigrationTest.waitForMigrationToComplete(
+ migrationOpts, true /* retryOnRetryableErrors */));
+
+ assertCopiedExternalKeys(tenantMigrationTest, migrationId);
+
+ // After another migration, the first's keys should still exist.
+ runMigrationAndAssertExternalKeysCopied(tenantMigrationTest, kTenantId2);
+ assertCopiedExternalKeys(tenantMigrationTest, migrationId);
+
+ recipientRst.stopSet();
+ tenantMigrationTest.stop();
+})();
(() => {
jsTest.log("Test that the donor waits for copied external keys to replicate to every node");
diff --git a/jstests/replsets/tenant_migration_donor_resume_on_stepup_and_restart.js b/jstests/replsets/tenant_migration_donor_resume_on_stepup_and_restart.js
new file mode 100644
index 00000000000..b3b158c7134
--- /dev/null
+++ b/jstests/replsets/tenant_migration_donor_resume_on_stepup_and_restart.js
@@ -0,0 +1,488 @@
+/**
+ * Tests that tenant migrations resume successfully on donor stepup and restart.
+ *
+ * Incompatible with shard merge, which can't handle restart.
+ *
+ * @tags: [
+ * incompatible_with_macos,
+ * incompatible_with_shard_merge,
+ * incompatible_with_windows_tls,
+ * # Some tenant migration statistics field names were changed in 6.1.
+ * requires_fcv_61,
+ * requires_majority_read_concern,
+ * requires_persistence,
+ * # Tenant migrations are only used in serverless.
+ * serverless,
+ * ]
+ */
+
+import {TenantMigrationTest} from "jstests/replsets/libs/tenant_migration_test.js";
+import {
+ forgetMigrationAsync,
+ isShardMergeEnabled,
+ makeX509OptionsForTest,
+ runMigrationAsync,
+ tryAbortMigrationAsync
+} from "jstests/replsets/libs/tenant_migration_util.js";
+
+load("jstests/libs/fail_point_util.js");
+load("jstests/libs/parallelTester.js");
+load("jstests/libs/uuid_util.js");
+load("jstests/replsets/rslib.js"); // 'createRstArgs'
+
+const kMaxSleepTimeMS = 100;
+const kTenantId = ObjectId().str;
+const kMigrationFpNames = [
+ "pauseTenantMigrationBeforeLeavingDataSyncState",
+ "pauseTenantMigrationBeforeLeavingBlockingState",
+ "abortTenantMigrationBeforeLeavingBlockingState",
+ ""
+];
+
+// Set the delay before a state doc is garbage collected to be short to speed up the test but long
+// enough for the state doc to still be around after stepup or restart.
+const kGarbageCollectionDelayMS = 30 * 1000;
+
+// Set the TTL monitor to run at a smaller interval to speed up the test.
+const kTTLMonitorSleepSecs = 1;
+
+const migrationX509Options = makeX509OptionsForTest();
+
+/**
+ * Runs the donorStartMigration command to start a migration, interrupts the migration on the
+ * donor using 'interruptFunc', and asserts that the migration eventually commits.
+ */
+function testDonorStartMigrationInterrupt(interruptFunc,
+ {donorRestarted = false, disableForShardMerge = true}) {
+ const donorRst = new ReplSetTest(
+ {nodes: 3, name: "donorRst", serverless: true, nodeOptions: migrationX509Options.donor});
+
+ donorRst.startSet();
+ donorRst.initiate();
+
+ const tenantMigrationTest = new TenantMigrationTest({name: jsTestName(), donorRst});
+
+ let donorPrimary = tenantMigrationTest.getDonorPrimary();
+ const recipientPrimary = tenantMigrationTest.getRecipientPrimary();
+
+ if (disableForShardMerge && isShardMergeEnabled(recipientPrimary.getDB("admin"))) {
+ jsTest.log("Skipping test for shard merge");
+ tenantMigrationTest.stop();
+ donorRst.stopSet();
+ return;
+ }
+
+ const migrationId = UUID();
+ const migrationOpts = {
+ migrationIdString: extractUUIDFromObject(migrationId),
+ tenantId: kTenantId,
+ recipientConnString: tenantMigrationTest.getRecipientConnString(),
+ };
+ const donorRstArgs = createRstArgs(donorRst);
+
+ const runMigrationThread =
+ new Thread(runMigrationAsync, migrationOpts, donorRstArgs, {retryOnRetryableErrors: true});
+ runMigrationThread.start();
+
+ // Wait for donorStartMigration command to start.
+ assert.soon(() => donorPrimary.adminCommand({currentOp: true, desc: "tenant donor migration"})
+ .inprog.length > 0);
+
+ sleep(Math.random() * kMaxSleepTimeMS);
+ interruptFunc(donorRst);
+
+ TenantMigrationTest.assertCommitted(runMigrationThread.returnData());
+ tenantMigrationTest.waitForDonorNodesToReachState(donorRst.nodes,
+ migrationId,
+ migrationOpts.tenantId,
+ TenantMigrationTest.DonorState.kCommitted);
+ assert.commandWorked(tenantMigrationTest.forgetMigration(migrationOpts.migrationIdString));
+
+ donorPrimary = tenantMigrationTest.getDonorPrimary(); // Could change after interrupt.
+ const donorStats = tenantMigrationTest.getTenantMigrationStats(donorPrimary);
+ jsTestLog(`Stats at the donor primary: ${tojson(donorStats)}`);
+ if (donorRestarted) {
+ // If a full restart happened, the count could be lost completely.
+ assert.gte(1, donorStats.totalMigrationDonationsCommitted);
+ } else {
+ // The double counting happens when the failover occurs after the migration completes
+ // but before the state doc's garbage collection mark is persisted. While this test
+ // targets that scenario, it is low-probability in production.
+ assert(1 == donorStats.totalMigrationDonationsCommitted ||
+ 2 == donorStats.totalMigrationDonationsCommitted);
+ }
+ // Skip checking the stats on the recipient since enableRecipientTesting is false
+ // so the recipient is forced to respond to recipientSyncData without starting the
+ // migration.
+
+ tenantMigrationTest.stop();
+ donorRst.stopSet();
+}
+
+/**
+ * Starts a migration and waits for it to commit, then runs donorForgetMigration, interrupts the
+ * donor using 'interruptFunc', and asserts that the migration state is eventually garbage
+ * collected.
+ */
+function testDonorForgetMigrationInterrupt(interruptFunc) {
+ const donorRst = new ReplSetTest({
+ nodes: 3,
+ name: "donorRst",
+ serverless: true,
+ nodeOptions: Object.assign({}, migrationX509Options.donor, {
+ setParameter: {
+ tenantMigrationGarbageCollectionDelayMS: kGarbageCollectionDelayMS,
+ ttlMonitorSleepSecs: kTTLMonitorSleepSecs,
+ }
+ })
+ });
+ const recipientRst = new ReplSetTest({
+ nodes: 1,
+ name: "recipientRst",
+ serverless: true,
+ nodeOptions: Object.assign({}, migrationX509Options.recipient, {
+ setParameter: {
+ tenantMigrationGarbageCollectionDelayMS: kGarbageCollectionDelayMS,
+ ttlMonitorSleepSecs: kTTLMonitorSleepSecs,
+ }
+ })
+ });
+
+ donorRst.startSet();
+ donorRst.initiate();
+
+ recipientRst.startSet();
+ recipientRst.initiate();
+
+ const tenantMigrationTest =
+ new TenantMigrationTest({name: jsTestName(), donorRst, recipientRst});
+
+ const donorPrimary = tenantMigrationTest.getDonorPrimary();
+
+ const migrationId = UUID();
+ const migrationOpts = {
+ migrationIdString: extractUUIDFromObject(migrationId),
+ tenantId: kTenantId,
+ recipientConnString: recipientRst.getURL(),
+ };
+ const donorRstArgs = createRstArgs(donorRst);
+
+ TenantMigrationTest.assertCommitted(
+ tenantMigrationTest.runMigration(migrationOpts, {automaticForgetMigration: false}));
+ const forgetMigrationThread = new Thread(forgetMigrationAsync,
+ migrationOpts.migrationIdString,
+ donorRstArgs,
+ true /* retryOnRetryableErrors */);
+ forgetMigrationThread.start();
+
+ // Wait for donorForgetMigration command to start.
+ assert.soon(() => {
+ const res = assert.commandWorked(
+ donorPrimary.adminCommand({currentOp: true, desc: "tenant donor migration"}));
+ return res.inprog[0].expireAt != null;
+ });
+ sleep(Math.random() * kMaxSleepTimeMS);
+ interruptFunc(donorRst);
+
+ assert.commandWorkedOrFailedWithCode(
+ tenantMigrationTest.forgetMigration(migrationOpts.migrationIdString),
+ ErrorCodes.NoSuchTenantMigration);
+
+ assert.commandWorked(forgetMigrationThread.returnData());
+ tenantMigrationTest.waitForMigrationGarbageCollection(migrationId, migrationOpts.tenantId);
+
+ tenantMigrationTest.stop();
+ donorRst.stopSet();
+ recipientRst.stopSet();
+}
+
+/**
+ * Starts a migration and sets the passed-in failpoint, then runs donorAbortMigration, interrupts
+ * the donor using 'interruptFunc', and asserts that the migration state is eventually garbage
+ * collected.
+ */
+function testDonorAbortMigrationInterrupt(
+ interruptFunc, fpName, {fpWaitBeforeAbort = false, isShutdown = false} = {}) {
+ const donorRst = new ReplSetTest({
+ nodes: 3,
+ name: "donorRst",
+ serverless: true,
+ nodeOptions: Object.assign({}, migrationX509Options.donor, {
+ setParameter: {
+ tenantMigrationGarbageCollectionDelayMS: kGarbageCollectionDelayMS,
+ ttlMonitorSleepSecs: kTTLMonitorSleepSecs,
+ }
+ })
+ });
+ const recipientRst = new ReplSetTest({
+ nodes: 1,
+ name: "recipientRst",
+ serverless: true,
+ nodeOptions: Object.assign({}, migrationX509Options.recipient, {
+ setParameter: {
+ tenantMigrationGarbageCollectionDelayMS: kGarbageCollectionDelayMS,
+ ttlMonitorSleepSecs: kTTLMonitorSleepSecs,
+ }
+ })
+ });
+
+ donorRst.startSet();
+ donorRst.initiate();
+
+ recipientRst.startSet();
+ recipientRst.initiate();
+
+ const tenantMigrationTest =
+ new TenantMigrationTest({name: jsTestName(), donorRst, recipientRst});
+
+ const migrationId = UUID();
+ const migrationOpts = {
+ migrationIdString: extractUUIDFromObject(migrationId),
+ tenantId: kTenantId,
+ recipientConnString: recipientRst.getURL(),
+ };
+ const donorRstArgs = createRstArgs(donorRst);
+ let donorPrimary = tenantMigrationTest.getDonorPrimary();
+
+ // If we passed in a valid failpoint we set it, otherwise we let the migration run normally.
+ let fp;
+ if (fpName) {
+ fp = configureFailPoint(donorPrimary, fpName);
+ }
+
+ assert.commandWorked(tenantMigrationTest.startMigration(migrationOpts));
+
+ if (fp && !isShutdown && fpWaitBeforeAbort) {
+ fp.wait();
+ }
+
+ const tryAbortThread = new Thread(tryAbortMigrationAsync,
+ {migrationIdString: migrationOpts.migrationIdString},
+ donorRstArgs,
+ true /* retryOnRetryableErrors */);
+ tryAbortThread.start();
+
+ // Wait for donorAbortMigration command to start.
+ assert.soon(() => {
+ const res = assert.commandWorked(
+ donorPrimary.adminCommand({currentOp: true, desc: "tenant donor migration"}));
+ return res.inprog[0].receivedCancellation;
+ });
+
+ interruptFunc(donorRst);
+
+ if (fp && !isShutdown) {
+ // Turn off failpoint in order to allow the migration to resume after stepup.
+ fp.off();
+ }
+
+ tryAbortThread.join();
+
+ let res = tryAbortThread.returnData();
+ assert.commandWorkedOrFailedWithCode(res, ErrorCodes.TenantMigrationCommitted);
+
+ donorPrimary = tenantMigrationTest.getDonorPrimary();
+ let configDonorsColl = donorPrimary.getCollection(TenantMigrationTest.kConfigDonorsNS);
+ let donorDoc = configDonorsColl.findOne({tenantId: kTenantId});
+
+ if (!res.ok) {
+ assert.eq(donorDoc.state, TenantMigrationTest.DonorState.kCommitted);
+ } else {
+ assert.eq(donorDoc.state, TenantMigrationTest.DonorState.kAborted);
+ }
+
+ tenantMigrationTest.stop();
+ donorRst.stopSet();
+ recipientRst.stopSet();
+}
+
+/**
+ * Starts a migration and sets the passed-in failpoint, then either waits for the failpoint or lets
+ * the migration run successfully, and interrupts the donor using 'interruptFunc'. After
+ * restarting, checks to see if the donorDoc data has persisted.
+ */
+function testStateDocPersistenceOnFailover(interruptFunc, fpName, isShutdown = false) {
+ const donorRst = new ReplSetTest(
+ {nodes: 3, name: "donorRst", serverless: true, nodeOptions: migrationX509Options.donor});
+
+ donorRst.startSet();
+ donorRst.initiate();
+
+ const tenantMigrationTest = new TenantMigrationTest({name: jsTestName(), donorRst});
+
+ const migrationId = UUID();
+ const migrationOpts = {
+ migrationIdString: extractUUIDFromObject(migrationId),
+ tenantId: kTenantId,
+ recipientConnString: tenantMigrationTest.getRecipientConnString(),
+ };
+ let donorPrimary = tenantMigrationTest.getDonorPrimary();
+
+ // If we passed in a valid failpoint we set it, otherwise we let the migration run normally.
+ let fp;
+ if (fpName) {
+ fp = configureFailPoint(donorPrimary, fpName);
+ assert.commandWorked(tenantMigrationTest.startMigration(migrationOpts));
+ fp.wait();
+ } else {
+ TenantMigrationTest.assertCommitted(tenantMigrationTest.runMigration(migrationOpts));
+ }
+
+ let configDonorsColl = donorPrimary.getCollection(TenantMigrationTest.kConfigDonorsNS);
+ let donorDocBeforeFailover = configDonorsColl.findOne({tenantId: kTenantId});
+
+ interruptFunc(tenantMigrationTest.getDonorRst());
+
+ if (fp && !isShutdown) {
+ // Turn off failpoint in order to allow the migration to resume after stepup.
+ fp.off();
+ }
+
+ donorPrimary = tenantMigrationTest.getDonorPrimary();
+ configDonorsColl = donorPrimary.getCollection(TenantMigrationTest.kConfigDonorsNS);
+ let donorDocAfterFailover = configDonorsColl.findOne({tenantId: kTenantId});
+
+ // Check persisted fields in the donor doc.
+ assert.eq(donorDocBeforeFailover._id, donorDocAfterFailover._id);
+ assert.eq(donorDocBeforeFailover.recipientConnString,
+ donorDocAfterFailover.recipientConnString);
+ assert.eq(donorDocBeforeFailover.readPreference, donorDocAfterFailover.readPreference);
+ assert.eq(donorDocBeforeFailover.startMigrationDonorTimestamp,
+ donorDocAfterFailover.startMigrationDonorTimestamp);
+ assert.eq(donorDocBeforeFailover.migration, donorDocAfterFailover.migration);
+ assert.eq(donorDocBeforeFailover.tenantId, donorDocAfterFailover.tenantId);
+ assert.eq(donorDocBeforeFailover.donorCertificateForRecipient,
+ donorDocAfterFailover.donorCertificateForRecipient);
+ assert.eq(donorDocBeforeFailover.recipientCertificateForDonor,
+ donorDocAfterFailover.recipientCertificateForDonor);
+ assert.eq(donorDocBeforeFailover.migrationStart, donorDocAfterFailover.migrationStart);
+
+ tenantMigrationTest.stop();
+ donorRst.stopSet();
+}
+
+(() => {
+ jsTest.log("Test that the migration resumes on stepup");
+ testDonorStartMigrationInterrupt((donorRst) => {
+ // Force the primary to step down but make it likely to step back up.
+ const donorPrimary = donorRst.getPrimary();
+ assert.commandWorked(
+ donorPrimary.adminCommand({replSetStepDown: ReplSetTest.kForeverSecs, force: true}));
+ assert.commandWorked(donorPrimary.adminCommand({replSetFreeze: 0}));
+ }, {donorRestarted: false});
+})();
+
+(() => {
+ jsTest.log("Test that the migration resumes after restart");
+ testDonorStartMigrationInterrupt((donorRst) => {
+ // Skip validation on shutdown because the full validation can conflict with the tenant
+ // migration and cause it to fail.
+ donorRst.stopSet(null /* signal */, true /*forRestart */, {skipValidation: true});
+ donorRst.startSet({restart: true});
+ }, {donorRestarted: true, disableForShardMerge: true});
+})();
+
+(() => {
+ jsTest.log("Test that the donorForgetMigration command can be retried on stepup");
+ testDonorForgetMigrationInterrupt((donorRst) => {
+ // Force the primary to step down but make it likely to step back up.
+ const donorPrimary = donorRst.getPrimary();
+ assert.commandWorked(
+ donorPrimary.adminCommand({replSetStepDown: ReplSetTest.kForeverSecs, force: true}));
+ assert.commandWorked(donorPrimary.adminCommand({replSetFreeze: 0}));
+ });
+})();
+
+(() => {
+ jsTest.log("Test that the donorForgetMigration command can be retried after restart");
+ testDonorForgetMigrationInterrupt((donorRst) => {
+ // Skip validation on shutdown because the full validation can conflict with the tenant
+ // migration and cause it to fail.
+ donorRst.stopSet(null /* signal */, true /*forRestart */, {skipValidation: true});
+ donorRst.startSet({restart: true});
+ });
+})();
+
+(() => {
+ jsTest.log("Test that the donorAbortMigration command can be retried after restart");
+
+ kMigrationFpNames.forEach(fpName => {
+ if (!fpName) {
+ jsTest.log("Testing without setting a failpoint.");
+ } else {
+ jsTest.log("Testing with failpoint: " + fpName);
+ }
+
+ testDonorAbortMigrationInterrupt((donorRst) => {
+ // Skip validation on shutdown because the full validation can conflict with the tenant
+ // migration and cause it to fail.
+ donorRst.stopSet(null /* signal */, true /*forRestart */, {skipValidation: true});
+ donorRst.startSet({restart: true});
+ }, fpName, {isShutdown: true});
+ });
+})();
+
+(() => {
+ jsTest.log(
+ "Test that the donorAbortMigration command fails if issued after state == kCommitted");
+
+ testDonorAbortMigrationInterrupt((donorRst) => {},
+ "pauseTenantMigrationAfterUpdatingToCommittedState",
+ {fpWaitBeforeAbort: true});
+})();
+
+(() => {
+ jsTest.log("Test that the donorAbortMigration command can be retried on stepup");
+ kMigrationFpNames.forEach(fpName => {
+ if (!fpName) {
+ jsTest.log("Testing without setting a failpoint.");
+ } else {
+ jsTest.log("Testing with failpoint: " + fpName);
+ }
+
+ testDonorAbortMigrationInterrupt((donorRst) => {
+ // Force the primary to step down but make it likely to step back up.
+ const donorPrimary = donorRst.getPrimary();
+ assert.commandWorked(donorPrimary.adminCommand(
+ {replSetStepDown: ReplSetTest.kForeverSecs, force: true}));
+ assert.commandWorked(donorPrimary.adminCommand({replSetFreeze: 0}));
+ }, fpName);
+ });
+})();
+
+(() => {
+ jsTest.log("Test stateDoc data persistence on restart.");
+ kMigrationFpNames.forEach(fpName => {
+ if (!fpName) {
+ jsTest.log("Testing without setting a failpoint.");
+ } else {
+ jsTest.log("Testing with failpoint: " + fpName);
+ }
+
+ testStateDocPersistenceOnFailover((donorRst) => {
+ // Skip validation on shutdown because the full validation can conflict with the tenant
+ // migration and cause it to fail.
+ donorRst.stopSet(null /* signal */, true /*forRestart */, {skipValidation: true});
+ donorRst.startSet({restart: true});
+ }, fpName, true);
+ });
+})();
+
+(() => {
+ jsTest.log("Test stateDoc data persistence on stepup.");
+ kMigrationFpNames.forEach(fpName => {
+ if (!fpName) {
+ jsTest.log("Testing without setting a failpoint.");
+ } else {
+ jsTest.log("Testing with failpoint: " + fpName);
+ }
+
+ testStateDocPersistenceOnFailover((donorRst) => {
+ // Force the primary to step down but make it likely to step back up.
+ const donorPrimary = donorRst.getPrimary();
+ assert.commandWorked(donorPrimary.adminCommand(
+ {replSetStepDown: ReplSetTest.kForeverSecs, force: true}));
+ assert.commandWorked(donorPrimary.adminCommand({replSetFreeze: 0}));
+ }, fpName);
+ });
+})();
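
Each helper in the file above takes an 'interruptFunc' callback and hands it the donor replica set, so the same scenarios can be exercised under stepdown or full restart. A hedged sketch of the two callback variants used throughout, pulled out as named functions (the names are illustrative, not part of the test):

// Illustrative sketches, not part of the original test.
// Step the primary down but leave it electable so it is likely to step back up.
function stepDownAndUnfreeze(rst) {
    const primary = rst.getPrimary();
    assert.commandWorked(
        primary.adminCommand({replSetStepDown: ReplSetTest.kForeverSecs, force: true}));
    assert.commandWorked(primary.adminCommand({replSetFreeze: 0}));
}

// Restart every node; validation is skipped because it can conflict with the migration.
function restartSet(rst) {
    rst.stopSet(null /* signal */, true /* forRestart */, {skipValidation: true});
    rst.startSet({restart: true});
}
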
diff --git a/jstests/replsets/tenant_migration_external_keys_ttl.js b/jstests/replsets/tenant_migration_external_keys_ttl.js
index 172600f1d2a..28611f9abaf 100644
--- a/jstests/replsets/tenant_migration_external_keys_ttl.js
+++ b/jstests/replsets/tenant_migration_external_keys_ttl.js
@@ -286,39 +286,38 @@ function makeTestParams() {
teardown();
}
- // TODO SERVER-76128: Tenant Migrations are not robust to recipient failover.
- // jsTestLog("Recipient failover before receiving forgetMigration");
- // {
- // const {tmt, teardown} = setup();
- // const [tenantId, migrationId, migrationOpts] = makeTestParams();
- // const recipientPrimary = tmt.getRecipientPrimary();
- // const fp = configureFailPoint(recipientPrimary,
- // "fpAfterConnectingTenantMigrationRecipientInstance",
- // {action: "hang"});
-
- // assert.commandWorked(tmt.startMigration(migrationOpts));
- // fp.wait();
-
- // assert.commandWorked(recipientPrimary.adminCommand(
- // {replSetStepDown: ReplSetTest.kForeverSecs, force: true}));
- // assert.commandWorked(recipientPrimary.adminCommand({replSetFreeze: 0}));
- // fp.off();
-
- // TenantMigrationTest.assertCommitted(
- // tmt.waitForMigrationToComplete(migrationOpts, true /* retryOnRetryableErrors */));
-
- // // The keys should have been created without a TTL deadline.
- // verifyExternalKeys(tmt.getDonorPrimary(), {migrationId, expectTTLValue: false});
- // verifyExternalKeys(tmt.getRecipientPrimary(), {migrationId, expectTTLValue: false});
-
- // assert.commandWorked(tmt.forgetMigration(migrationOpts.migrationIdString));
-
- // // After running donorForgetMigration, the TTL value should be updated. The default TTL
- // // buffer is 1 day so the keys will not have been deleted.
- // verifyExternalKeys(tmt.getDonorPrimary(), {migrationId, expectTTLValue: true});
- // verifyExternalKeys(tmt.getRecipientPrimary(), {migrationId, expectTTLValue: true});
- // teardown();
- // }
+ jsTestLog("Recipient failover before receiving forgetMigration");
+ {
+ const {tmt, teardown} = setup();
+ const [tenantId, migrationId, migrationOpts] = makeTestParams();
+ const recipientPrimary = tmt.getRecipientPrimary();
+ const fp = configureFailPoint(recipientPrimary,
+ "fpAfterConnectingTenantMigrationRecipientInstance",
+ {action: "hang"});
+
+ assert.commandWorked(tmt.startMigration(migrationOpts));
+ fp.wait();
+
+ assert.commandWorked(recipientPrimary.adminCommand(
+ {replSetStepDown: ReplSetTest.kForeverSecs, force: true}));
+ assert.commandWorked(recipientPrimary.adminCommand({replSetFreeze: 0}));
+ fp.off();
+
+ TenantMigrationTest.assertCommitted(
+ tmt.waitForMigrationToComplete(migrationOpts, true /* retryOnRetryableErrors */));
+
+ // The keys should have been created without a TTL deadline.
+ verifyExternalKeys(tmt.getDonorPrimary(), {migrationId, expectTTLValue: false});
+ verifyExternalKeys(tmt.getRecipientPrimary(), {migrationId, expectTTLValue: false});
+
+ assert.commandWorked(tmt.forgetMigration(migrationOpts.migrationIdString));
+
+ // After running donorForgetMigration, the TTL value should be updated. The default TTL
+ // buffer is 1 day so the keys will not have been deleted.
+ verifyExternalKeys(tmt.getDonorPrimary(), {migrationId, expectTTLValue: true});
+ verifyExternalKeys(tmt.getRecipientPrimary(), {migrationId, expectTTLValue: true});
+ teardown();
+ }
jsTestLog(
"Donor failover after receiving forgetMigration before marking keys garbage collectable");
@@ -356,42 +355,39 @@ function makeTestParams() {
teardown();
}
- // TODO SERVER-76128: Tenant Migrations are not robust to recipient failover.
- // jsTestLog(
- // "Recipient failover after receiving forgetMigration before marking keys garbage
- // collectable");
- // {
- // const {tmt, donorRst, teardown} = setup();
- // const [tenantId, migrationId, migrationOpts] = makeTestParams();
- // const recipientPrimary = tmt.getRecipientPrimary();
-
- // assert.commandWorked(tmt.startMigration(migrationOpts));
- // TenantMigrationTest.assertCommitted(
- // tmt.waitForMigrationToComplete(migrationOpts, true /* retryOnRetryableErrors */));
-
- // // The keys should have been created without a TTL deadline.
- // verifyExternalKeys(tmt.getDonorPrimary(), {migrationId, expectTTLValue: false});
- // verifyExternalKeys(tmt.getRecipientPrimary(), {migrationId, expectTTLValue: false});
-
- // const fp = configureFailPoint(
- // recipientPrimary, "pauseTenantMigrationBeforeMarkingExternalKeysGarbageCollectable");
- // const forgetMigrationThread = new Thread(
- // forgetMigrationAsync, migrationOpts.migrationIdString, createRstArgs(donorRst),
- // true);
- // forgetMigrationThread.start();
- // fp.wait();
-
- // assert.commandWorked(recipientPrimary.adminCommand(
- // {replSetStepDown: ReplSetTest.kForeverSecs, force: true}));
- // assert.commandWorked(recipientPrimary.adminCommand({replSetFreeze: 0}));
- // fp.off();
-
- // assert.commandWorked(forgetMigrationThread.returnData());
-
- // verifyExternalKeys(tmt.getDonorPrimary(), {migrationId, expectTTLValue: true});
- // verifyExternalKeys(tmt.getRecipientPrimary(), {migrationId, expectTTLValue: true});
- // teardown();
- // }
+ jsTestLog(
+ "Recipient failover after receiving forgetMigration before marking keys garbage collectable");
+ {
+ const {tmt, donorRst, teardown} = setup();
+ const [tenantId, migrationId, migrationOpts] = makeTestParams();
+ const recipientPrimary = tmt.getRecipientPrimary();
+
+ assert.commandWorked(tmt.startMigration(migrationOpts));
+ TenantMigrationTest.assertCommitted(
+ tmt.waitForMigrationToComplete(migrationOpts, true /* retryOnRetryableErrors */));
+
+ // The keys should have been created without a TTL deadline.
+ verifyExternalKeys(tmt.getDonorPrimary(), {migrationId, expectTTLValue: false});
+ verifyExternalKeys(tmt.getRecipientPrimary(), {migrationId, expectTTLValue: false});
+
+ const fp = configureFailPoint(
+ recipientPrimary, "pauseTenantMigrationBeforeMarkingExternalKeysGarbageCollectable");
+ const forgetMigrationThread = new Thread(
+ forgetMigrationAsync, migrationOpts.migrationIdString, createRstArgs(donorRst), true);
+ forgetMigrationThread.start();
+ fp.wait();
+
+ assert.commandWorked(recipientPrimary.adminCommand(
+ {replSetStepDown: ReplSetTest.kForeverSecs, force: true}));
+ assert.commandWorked(recipientPrimary.adminCommand({replSetFreeze: 0}));
+ fp.off();
+
+ assert.commandWorked(forgetMigrationThread.returnData());
+
+ verifyExternalKeys(tmt.getDonorPrimary(), {migrationId, expectTTLValue: true});
+ verifyExternalKeys(tmt.getRecipientPrimary(), {migrationId, expectTTLValue: true});
+ teardown();
+ }
jsTestLog("Donor failover after receiving forgetMigration after updating keys.");
{
@@ -437,53 +433,49 @@ function makeTestParams() {
teardown();
}
- // TODO SERVER-76128: Tenant Migrations are not robust to recipient failover.
- // jsTestLog("Recipient failover after receiving forgetMigration after updating keys.");
- // {
- // const {tmt, donorRst, recipientRst, teardown} = setup();
- // // this test expects the external keys to expire, so lower the expiration timeouts.
- // const lowerExternalKeysBufferSecs = 5;
- // const lowerStateDocExpirationMS = 500;
- // for (let conn of [...donorRst.nodes, ...recipientRst.nodes]) {
- // setTenantMigrationExpirationParams(
- // conn, lowerStateDocExpirationMS, lowerExternalKeysBufferSecs);
- // }
- // const [tenantId, migrationId, migrationOpts] = makeTestParams();
- // const recipientPrimary = tmt.getRecipientPrimary();
-
- // assert.commandWorked(tmt.startMigration(migrationOpts));
- // TenantMigrationTest.assertCommitted(
- // tmt.waitForMigrationToComplete(migrationOpts, true /* retryOnRetryableErrors */));
-
- // // The keys should have been created without a TTL deadline.
- // verifyExternalKeys(tmt.getDonorPrimary(), {migrationId, expectTTLValue: false});
- // verifyExternalKeys(tmt.getRecipientPrimary(), {migrationId, expectTTLValue: false});
-
- // const fp = configureFailPoint(
- // recipientPrimary, "fpAfterReceivingRecipientForgetMigration", {action: "hang"});
- // const forgetMigrationThread = new Thread(
- // forgetMigrationAsync, migrationOpts.migrationIdString, createRstArgs(donorRst),
- // true);
- // forgetMigrationThread.start();
- // fp.wait();
-
- // // Let the keys expire on the donor before the state document is deleted to verify
- // retrying
- // // recipientForgetMigration can handle this case. The keys won't be deleted until the
- // buffer
- // // expires, so sleep to avoid wasted work.
- // sleep((lowerExternalKeysBufferSecs * 1000) + lowerStateDocExpirationMS + 500);
- // waitForExternalKeysToBeDeleted(tmt.getRecipientPrimary(), migrationId);
-
- // assert.commandWorked(recipientPrimary.adminCommand(
- // {replSetStepDown: ReplSetTest.kForeverSecs, force: true}));
- // assert.commandWorked(recipientPrimary.adminCommand({replSetFreeze: 0}));
- // fp.off();
-
- // assert.commandWorked(forgetMigrationThread.returnData());
-
- // // Eventually the donor's keys should be deleted too.
- // waitForExternalKeysToBeDeleted(tmt.getDonorPrimary(), migrationId);
- // teardown();
- // }
+ jsTestLog("Recipient failover after receiving forgetMigration after updating keys.");
+ {
+ const {tmt, donorRst, recipientRst, teardown} = setup();
+ // this test expects the external keys to expire, so lower the expiration timeouts.
+ const lowerExternalKeysBufferSecs = 5;
+ const lowerStateDocExpirationMS = 500;
+ for (let conn of [...donorRst.nodes, ...recipientRst.nodes]) {
+ setTenantMigrationExpirationParams(
+ conn, lowerStateDocExpirationMS, lowerExternalKeysBufferSecs);
+ }
+ const [tenantId, migrationId, migrationOpts] = makeTestParams();
+ const recipientPrimary = tmt.getRecipientPrimary();
+
+ assert.commandWorked(tmt.startMigration(migrationOpts));
+ TenantMigrationTest.assertCommitted(
+ tmt.waitForMigrationToComplete(migrationOpts, true /* retryOnRetryableErrors */));
+
+ // The keys should have been created without a TTL deadline.
+ verifyExternalKeys(tmt.getDonorPrimary(), {migrationId, expectTTLValue: false});
+ verifyExternalKeys(tmt.getRecipientPrimary(), {migrationId, expectTTLValue: false});
+
+ const fp = configureFailPoint(
+ recipientPrimary, "fpAfterReceivingRecipientForgetMigration", {action: "hang"});
+ const forgetMigrationThread = new Thread(
+ forgetMigrationAsync, migrationOpts.migrationIdString, createRstArgs(donorRst), true);
+ forgetMigrationThread.start();
+ fp.wait();
+
+ // Let the keys expire on the donor before the state document is deleted to verify retrying
+ // recipientForgetMigration can handle this case. The keys won't be deleted until the buffer
+ // expires, so sleep to avoid wasted work.
+ sleep((lowerExternalKeysBufferSecs * 1000) + lowerStateDocExpirationMS + 500);
+ waitForExternalKeysToBeDeleted(tmt.getRecipientPrimary(), migrationId);
+
+ assert.commandWorked(recipientPrimary.adminCommand(
+ {replSetStepDown: ReplSetTest.kForeverSecs, force: true}));
+ assert.commandWorked(recipientPrimary.adminCommand({replSetFreeze: 0}));
+ fp.off();
+
+ assert.commandWorked(forgetMigrationThread.returnData());
+
+ // Eventually the donor's keys should be deleted too.
+ waitForExternalKeysToBeDeleted(tmt.getDonorPrimary(), migrationId);
+ teardown();
+ }
})();
diff --git a/jstests/replsets/tenant_migration_recipient_failover_before_creating_oplog_buffer.js b/jstests/replsets/tenant_migration_recipient_failover_before_creating_oplog_buffer.js
new file mode 100644
index 00000000000..4cd9ae5a415
--- /dev/null
+++ b/jstests/replsets/tenant_migration_recipient_failover_before_creating_oplog_buffer.js
@@ -0,0 +1,54 @@
+/**
+ * Tests whether the recipient returns an appropriate error code to the donor when the recipient
+ * primary is made to step down before creating the oplog buffer collection.
+ *
+ * @tags: [
+ * incompatible_with_macos,
+ * incompatible_with_shard_merge,
+ * incompatible_with_windows_tls,
+ * requires_persistence,
+ * requires_replication,
+ * serverless,
+ * ]
+ */
+
+import {TenantMigrationTest} from "jstests/replsets/libs/tenant_migration_test.js";
+load("jstests/libs/uuid_util.js"); // For extractUUIDFromObject().
+load("jstests/libs/fail_point_util.js"); // For configureFailPoint().
+
+const tenantMigrationTest =
+ new TenantMigrationTest({name: jsTestName(), sharedOptions: {nodes: 2}});
+
+const kMigrationId = UUID();
+const kTenantId = ObjectId().str;
+const kReadPreference = {
+ mode: "primary"
+};
+const migrationOpts = {
+ migrationIdString: extractUUIDFromObject(kMigrationId),
+ tenantId: kTenantId,
+ readPreference: kReadPreference
+};
+
+const fpBeforeCreatingOplogBuffer =
+ configureFailPoint(tenantMigrationTest.getRecipientPrimary(),
+ "fpAfterRetrievingStartOpTimesMigrationRecipientInstance",
+ {action: "hang"});
+
+jsTestLog("Starting tenant migration with migrationId: " + kMigrationId +
+ ", tenantId: " + kTenantId);
+assert.commandWorked(tenantMigrationTest.startMigration(migrationOpts));
+
+jsTestLog("Waiting until the recipient primary is about to create an oplog buffer collection.");
+fpBeforeCreatingOplogBuffer.wait();
+
+jsTestLog("Stepping a new primary up.");
+tenantMigrationTest.getRecipientRst().stepUp(
+ tenantMigrationTest.getRecipientRst().getSecondaries()[0]);
+
+fpBeforeCreatingOplogBuffer.off();
+
+jsTestLog("Waiting for migration to complete.");
+TenantMigrationTest.assertCommitted(tenantMigrationTest.waitForMigrationToComplete(migrationOpts));
+
+tenantMigrationTest.stop();
diff --git a/jstests/replsets/tenant_migration_recipient_initial_sync_cloning.js b/jstests/replsets/tenant_migration_recipient_initial_sync_cloning.js
new file mode 100644
index 00000000000..b0c58169b01
--- /dev/null
+++ b/jstests/replsets/tenant_migration_recipient_initial_sync_cloning.js
@@ -0,0 +1,199 @@
+/**
+ * Tests that during tenant migration, a new recipient node's state document and in-memory state
+ * are initialized after initial sync, when the node 1) hasn't begun cloning data yet, 2) is
+ * cloning data, and 3) is in the tenant oplog application phase.
+ *
+ * @tags: [
+ * incompatible_with_macos,
+ * incompatible_with_shard_merge,
+ * incompatible_with_windows_tls,
+ * requires_majority_read_concern,
+ * requires_persistence,
+ * serverless,
+ * ]
+ */
+
+import {TenantMigrationTest} from "jstests/replsets/libs/tenant_migration_test.js";
+import {makeX509OptionsForTest} from "jstests/replsets/libs/tenant_migration_util.js";
+
+load("jstests/libs/fail_point_util.js");
+load("jstests/libs/uuid_util.js");
+load('jstests/replsets/rslib.js'); // for waitForNewlyAddedRemovalForNodeToBeCommitted
+
+const migrationX509Options = makeX509OptionsForTest();
+
+const testDBName = 'testDB';
+const testCollName = 'testColl';
+
+// Restarts a node, allows the node to go through initial sync, and then makes sure its state
+// matches up with the primary's. Returns the initial sync node.
+function restartNodeAndCheckState(tenantId, tenantMigrationTest, checkMtab) {
+ // Restart a node and allow it to complete initial sync.
+ const recipientRst = tenantMigrationTest.getRecipientRst();
+ const originalRecipientPrimary = recipientRst.getPrimary();
+
+ jsTestLog("Restarting a node from the recipient replica set.");
+ let initialSyncNode = recipientRst.getSecondaries()[0];
+ initialSyncNode =
+ recipientRst.restart(initialSyncNode, {startClean: true, skipValidation: true});
+
+ // Allow the new node to finish initial sync.
+ waitForNewlyAddedRemovalForNodeToBeCommitted(originalRecipientPrimary,
+ recipientRst.getNodeId(initialSyncNode));
+ recipientRst.awaitSecondaryNodes();
+ recipientRst.awaitReplication();
+
+ jsTestLog("Ensure that the new node's state matches up with the primary's.");
+ // Make sure the new node's state makes sense.
+ let recipientDocOnPrimary = undefined;
+ let recipientDocOnNewNode = undefined;
+ assert.soon(
+ () => {
+ recipientDocOnPrimary =
+ originalRecipientPrimary.getCollection(TenantMigrationTest.kConfigRecipientsNS)
+ .findOne({tenantId});
+ recipientDocOnNewNode =
+ initialSyncNode.getCollection(TenantMigrationTest.kConfigRecipientsNS)
+ .findOne({tenantId});
+
+ return recipientDocOnPrimary.state == recipientDocOnNewNode.state;
+ },
+ `States never matched, primary: ${recipientDocOnPrimary}, on new node: ${
+ recipientDocOnNewNode}`);
+
+ if (checkMtab) {
+ jsTestLog("Ensuring TenantMigrationAccessBlocker states match.");
+ const primaryMtab = tenantMigrationTest.getTenantMigrationAccessBlocker(
+ {recipientNode: originalRecipientPrimary, tenantId});
+ const newNodeMtab = tenantMigrationTest.getTenantMigrationAccessBlocker(
+ {recipientNode: initialSyncNode, tenantId});
+
+ assert.eq(primaryMtab.recipient.state,
+ newNodeMtab.recipient.state,
+ `Mtab didn't match, primary: ${primaryMtab}, on new node: ${newNodeMtab}`);
+ }
+
+ return initialSyncNode;
+}
+
+// Restarts a node without tenant oplog application. Ensures its state matches up with the
+// primary's, and then steps it up.
+function restartNodeAndCheckStateWithoutOplogApplication(
+ tenantId, tenantMigrationTest, checkMtab, fpOnRecipient) {
+ fpOnRecipient.wait();
+
+ const initialSyncNode = restartNodeAndCheckState(tenantId, tenantMigrationTest, checkMtab);
+
+ jsTestLog("Stepping up the new node.");
+ // Now step up the new node
+ tenantMigrationTest.getRecipientRst().stepUp(initialSyncNode);
+ fpOnRecipient.off();
+}
+
+// Pauses the recipient before the tenant oplog application phase, and inserts documents on the
+// donor that the recipient tenant oplog applier must apply. Then restarts a node, allows initial
+// sync, and steps the restarted node up.
+function restartNodeAndCheckStateDuringOplogApplication(
+ tenantId, tenantMigrationTest, checkMtab, fpOnRecipient) {
+ fpOnRecipient.wait();
+
+ // Pause the tenant oplog applier before applying a batch.
+ const originalRecipientPrimary = tenantMigrationTest.getRecipientPrimary();
+ const fpPauseOplogApplierOnBatch =
+ configureFailPoint(originalRecipientPrimary, "fpBeforeTenantOplogApplyingBatch");
+
+ // Insert documents into the donor after data cloning but before tenant oplog application, so
+ // that the recipient has entries to apply during tenant oplog application.
+ tenantMigrationTest.insertDonorDB(
+ tenantMigrationTest.tenantDB(tenantId, testDBName),
+ testCollName,
+ [...Array(30).keys()].map((i) => ({a: i, b: "George Harrison - All Things Must Pass"})));
+
+ // Wait until the oplog applier has started and is trying to apply a batch. Then restart a node.
+ fpPauseOplogApplierOnBatch.wait();
+ const initialSyncNode = restartNodeAndCheckState(tenantId, tenantMigrationTest, checkMtab);
+
+ jsTestLog("Stepping up the new node.");
+ // Now step up the new node
+ tenantMigrationTest.getRecipientRst().stepUp(initialSyncNode);
+ fpPauseOplogApplierOnBatch.off();
+ fpOnRecipient.off();
+}
+
+// This function does the following:
+// 1. Configures a failpoint on the recipient primary, depending on the 'recipientFailpoint' that is
+// passed into the function.
+// 2. Starts a tenant migration.
+// 3. Waits for the recipient failpoint to be hit. Restarts a node, to make it go through initial
+// sync.
+// 4. Makes sure the restarted node's state is as expected.
+// 5. Steps up the restarted node as the recipient primary, lifts the recipient failpoint, and
+// allows the migration to complete.
+function runTestCase(recipientFailpoint, checkMtab, restartNodeAndCheckStateFunction) {
+ const tenantId = ObjectId().str;
+ const donorRst = new ReplSetTest({
+ name: "donorRst",
+ nodes: 1,
+ serverless: true,
+ nodeOptions: Object.assign(migrationX509Options.donor, {
+ setParameter: {
+ // Allow non-timestamped reads on donor after migration completes for testing.
+ 'failpoint.tenantMigrationDonorAllowsNonTimestampedReads':
+ tojson({mode: 'alwaysOn'}),
+ }
+ })
+ });
+ donorRst.startSet();
+ donorRst.initiate();
+
+ const tenantMigrationTest = new TenantMigrationTest({
+ name: jsTestName(),
+ donorRst,
+ sharedOptions: {setParameter: {tenantApplierBatchSizeOps: 2}}
+ });
+
+ const migrationOpts = {migrationIdString: extractUUIDFromObject(UUID()), tenantId};
+ const dbName = tenantMigrationTest.tenantDB(tenantId, testDBName);
+ const originalRecipientPrimary = tenantMigrationTest.getRecipientPrimary();
+
+ const fpOnRecipient =
+ configureFailPoint(originalRecipientPrimary, recipientFailpoint, {action: "hang"});
+ tenantMigrationTest.insertDonorDB(dbName, testCollName);
+
+ jsTestLog(`Starting a tenant migration with migrationID ${
+ migrationOpts.migrationIdString}, and tenantId ${tenantId}`);
+ assert.commandWorked(tenantMigrationTest.startMigration(migrationOpts));
+
+ restartNodeAndCheckStateFunction(tenantId, tenantMigrationTest, checkMtab, fpOnRecipient);
+
+ // Allow the migration to run to completion.
+ jsTestLog("Allowing migration to run to completion.");
+ TenantMigrationTest.assertCommitted(
+ tenantMigrationTest.waitForMigrationToComplete(migrationOpts));
+
+ assert.commandWorked(tenantMigrationTest.forgetMigration(migrationOpts.migrationIdString));
+
+ tenantMigrationTest.stop();
+ donorRst.stopSet();
+}
+
+// These two test cases are for before the mtab is created, and before the oplog applier has been
+// started.
+runTestCase("fpAfterStartingOplogFetcherMigrationRecipientInstance",
+ false /* checkMtab */,
+ restartNodeAndCheckStateWithoutOplogApplication);
+runTestCase("tenantCollectionClonerHangAfterCreateCollection",
+ false /* checkMtab */,
+ restartNodeAndCheckStateWithoutOplogApplication);
+
+// Test case to initial sync a node while the recipient is in the oplog application phase.
+runTestCase("fpBeforeFulfillingDataConsistentPromise",
+ true /* checkMtab */,
+ restartNodeAndCheckStateDuringOplogApplication);
+
+// A case after data consistency so that the mtab exists. We do not care about the oplog applier in
+// this case.
+runTestCase("fpAfterWaitForRejectReadsBeforeTimestamp",
+ true /* checkMtab */,
+ restartNodeAndCheckStateWithoutOplogApplication);
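
The restartNodeAndCheckState helper above forces initial sync by restarting a recipient secondary with a clean data directory and waiting for it to rejoin as a secondary. A hedged sketch of just that restart step (the function name is illustrative; the full test additionally waits for the node's 'newlyAdded' removal to be committed before comparing state):

// Illustrative sketch, not part of the original test: force a node through initial sync.
function forceInitialSync(rst, node) {
    const resynced = rst.restart(node, {startClean: true, skipValidation: true});
    rst.awaitSecondaryNodes();
    rst.awaitReplication();
    return resynced;
}
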
diff --git a/jstests/replsets/tenant_migration_recipient_resume_on_stepup_and_restart.js b/jstests/replsets/tenant_migration_recipient_resume_on_stepup_and_restart.js
new file mode 100644
index 00000000000..f00a4c4f083
--- /dev/null
+++ b/jstests/replsets/tenant_migration_recipient_resume_on_stepup_and_restart.js
@@ -0,0 +1,211 @@
+/**
+ * Tests that tenant migrations resume successfully on recipient stepup and restart.
+ *
+ * @tags: [
+ * incompatible_with_macos,
+ * incompatible_with_windows_tls,
+ * incompatible_with_shard_merge,
+ * # Some tenant migration statistics field names were changed in 6.1.
+ * requires_fcv_61,
+ * requires_majority_read_concern,
+ * requires_persistence,
+ * serverless,
+ * ]
+ */
+
+import {TenantMigrationTest} from "jstests/replsets/libs/tenant_migration_test.js";
+import {
+ forgetMigrationAsync,
+ makeX509OptionsForTest,
+ runMigrationAsync,
+} from "jstests/replsets/libs/tenant_migration_util.js";
+
+load("jstests/libs/fail_point_util.js");
+load("jstests/libs/parallelTester.js");
+load("jstests/libs/uuid_util.js");
+load('jstests/replsets/rslib.js'); // 'createRstArgs'
+
+const kMaxSleepTimeMS = 100;
+const kTenantId = ObjectId().str;
+
+// Set the delay before a state doc is garbage collected to be short to speed up the test but long
+// enough for the state doc to still be around after stepup or restart.
+const kGarbageCollectionDelayMS = 30 * 1000;
+
+// Set the TTL monitor to run at a smaller interval to speed up the test.
+const kTTLMonitorSleepSecs = 1;
+
+const migrationX509Options = makeX509OptionsForTest();
+
+/**
+ * Runs the donorStartMigration command to start a migration, interrupts the migration on the
+ * recipient using 'interruptFunc' after the migration starts on the recipient side, and
+ * asserts that the migration eventually commits.
+ * @param {boolean} recipientRestarted Needed to properly assert the tenant migration stats count.
+ */
+function testRecipientSyncDataInterrupt(interruptFunc, recipientRestarted) {
+ const recipientRst = new ReplSetTest({
+ nodes: 3,
+ name: "recipientRst",
+ serverless: true,
+ nodeOptions: migrationX509Options.recipient
+ });
+ recipientRst.startSet();
+ recipientRst.initiate();
+
+ const tenantMigrationTest = new TenantMigrationTest({name: jsTestName(), recipientRst});
+
+ const donorRst = tenantMigrationTest.getDonorRst();
+ const donorPrimary = tenantMigrationTest.getDonorPrimary();
+ let recipientPrimary = tenantMigrationTest.getRecipientPrimary();
+
+ const migrationId = UUID();
+ const migrationOpts = {
+ migrationIdString: extractUUIDFromObject(migrationId),
+ tenantId: kTenantId,
+ recipientConnString: tenantMigrationTest.getRecipientConnString(),
+ };
+ const donorRstArgs = createRstArgs(donorRst);
+
+ const runMigrationThread = new Thread(runMigrationAsync, migrationOpts, donorRstArgs);
+ runMigrationThread.start();
+
+ // Wait for recipientSyncData command to start.
+ assert.soon(
+ () => recipientPrimary.adminCommand({currentOp: true, desc: "tenant recipient migration"})
+ .inprog.length > 0);
+
+ sleep(Math.random() * kMaxSleepTimeMS);
+ interruptFunc(recipientRst);
+
+ TenantMigrationTest.assertCommitted(runMigrationThread.returnData());
+ tenantMigrationTest.waitForDonorNodesToReachState(donorRst.nodes,
+ migrationId,
+ migrationOpts.tenantId,
+ TenantMigrationTest.DonorState.kCommitted);
+ assert.commandWorked(tenantMigrationTest.forgetMigration(migrationOpts.migrationIdString));
+
+ const donorStats = tenantMigrationTest.getTenantMigrationStats(donorPrimary);
+ assert.eq(1, donorStats.totalMigrationDonationsCommitted);
+
+ tenantMigrationTest.stop();
+ recipientRst.stopSet();
+}
+
+/**
+ * Starts a migration and waits for it to commit, then runs donorForgetMigration, interrupts the
+ * recipient using 'interruptFunc', and asserts that the migration state is eventually garbage
+ * collected.
+ */
+function testRecipientForgetMigrationInterrupt(interruptFunc) {
+ const donorRst = new ReplSetTest({
+ nodes: 1,
+ name: "donorRst",
+ serverless: true,
+ nodeOptions: Object.assign({}, migrationX509Options.donor, {
+ setParameter: {
+ tenantMigrationGarbageCollectionDelayMS: kGarbageCollectionDelayMS,
+ ttlMonitorSleepSecs: kTTLMonitorSleepSecs,
+ }
+ })
+ });
+ const recipientRst = new ReplSetTest({
+ nodes: 3,
+ name: "recipientRst",
+ serverless: true,
+ nodeOptions: Object.assign({}, migrationX509Options.recipient, {
+ setParameter: {
+ tenantMigrationGarbageCollectionDelayMS: kGarbageCollectionDelayMS,
+ ttlMonitorSleepSecs: kTTLMonitorSleepSecs,
+ }
+ })
+ });
+
+ donorRst.startSet();
+ donorRst.initiate();
+
+ recipientRst.startSet();
+ recipientRst.initiate();
+
+ const tenantMigrationTest =
+ new TenantMigrationTest({name: jsTestName(), donorRst, recipientRst});
+ const recipientPrimary = tenantMigrationTest.getRecipientPrimary();
+
+ const migrationId = UUID();
+ const migrationOpts = {
+ migrationIdString: extractUUIDFromObject(migrationId),
+ tenantId: kTenantId,
+ recipientConnString: recipientRst.getURL(),
+ };
+ const donorRstArgs = createRstArgs(donorRst);
+
+ TenantMigrationTest.assertCommitted(
+ tenantMigrationTest.runMigration(migrationOpts, {automaticForgetMigration: false}));
+ const forgetMigrationThread = new Thread(forgetMigrationAsync,
+ migrationOpts.migrationIdString,
+ donorRstArgs,
+ false /* retryOnRetryableErrors */);
+ forgetMigrationThread.start();
+
+ // Wait for recipientForgetMigration command to start.
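+ // The 'expireAt' field on the recipient state doc is only set once the recipient has processed
+ // recipientForgetMigration and marked the doc as garbage collectable.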
+ assert.soon(() => {
+ const res = assert.commandWorked(
+ recipientPrimary.adminCommand({currentOp: true, desc: "tenant recipient migration"}));
+ return res.inprog[0].expireAt != null;
+ });
+ sleep(Math.random() * kMaxSleepTimeMS);
+ interruptFunc(recipientRst);
+
+ assert.commandWorkedOrFailedWithCode(
+ tenantMigrationTest.forgetMigration(migrationOpts.migrationIdString),
+ ErrorCodes.NoSuchTenantMigration);
+
+ assert.commandWorked(forgetMigrationThread.returnData());
+ tenantMigrationTest.waitForMigrationGarbageCollection(migrationId, migrationOpts.tenantId);
+
+ tenantMigrationTest.stop();
+ donorRst.stopSet();
+ recipientRst.stopSet();
+}
+
+(() => {
+ jsTest.log("Test that the migration resumes on stepup");
+ testRecipientSyncDataInterrupt((recipientRst) => {
+ // Force the primary to step down but make it likely to step back up.
+ const recipientPrimary = recipientRst.getPrimary();
+ assert.commandWorked(recipientPrimary.adminCommand(
+ {replSetStepDown: ReplSetTest.kForeverSecs, force: true}));
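+ // Clearing the freeze period allows the stepped-down node to stand for election again right away.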
+ assert.commandWorked(recipientPrimary.adminCommand({replSetFreeze: 0}));
+ }, false);
+})();
+
+(() => {
+ jsTest.log("Test that the migration resumes after restart");
+ testRecipientSyncDataInterrupt((recipientRst) => {
+ recipientRst.stopSet(null /* signal */, true /*forRestart */);
+ recipientRst.startSet({restart: true});
+ recipientRst.awaitSecondaryNodes();
+ recipientRst.getPrimary();
+ }, true);
+})();
+
+(() => {
+ jsTest.log("Test that the recipientForgetMigration command can be retried on stepup");
+ testRecipientForgetMigrationInterrupt((recipientRst) => {
+ // Force the primary to step down but make it likely to step back up.
+ const recipientPrimary = recipientRst.getPrimary();
+ assert.commandWorked(recipientPrimary.adminCommand(
+ {replSetStepDown: ReplSetTest.kForeverSecs, force: true}));
+ assert.commandWorked(recipientPrimary.adminCommand({replSetFreeze: 0}));
+ });
+})();
+
+(() => {
+ jsTest.log("Test that the recipientForgetMigration command can be retried after restart");
+ testRecipientForgetMigrationInterrupt((recipientRst) => {
+ recipientRst.stopSet(null /* signal */, true /*forRestart */);
+ recipientRst.startSet({restart: true});
+ recipientRst.awaitSecondaryNodes();
+ recipientRst.getPrimary();
+ });
+})();
diff --git a/jstests/replsets/tenant_migration_recipient_retryable_writes_failover.js b/jstests/replsets/tenant_migration_recipient_retryable_writes_failover.js
new file mode 100644
index 00000000000..03b5f7eb972
--- /dev/null
+++ b/jstests/replsets/tenant_migration_recipient_retryable_writes_failover.js
@@ -0,0 +1,110 @@
+/**
+ * Tests whether the recipient correctly clears its oplog buffer if the recipient primary
+ * fails over while fetching retryable writes oplog entries from the donor.
+ *
+ * @tags: [
+ * incompatible_with_macos,
+ * incompatible_with_shard_merge,
+ * incompatible_with_windows_tls,
+ * requires_majority_read_concern,
+ * requires_persistence,
+ * serverless,
+ * ]
+ */
+
+import {TenantMigrationTest} from "jstests/replsets/libs/tenant_migration_test.js";
+load("jstests/libs/uuid_util.js"); // For extractUUIDFromObject().
+load("jstests/libs/fail_point_util.js"); // For configureFailPoint().
+
+const tenantMigrationTest =
+ new TenantMigrationTest({name: jsTestName(), sharedOptions: {nodes: 2}});
+
+const kMigrationId = UUID();
+const kTenantId = ObjectId().str;
+const kDbName = tenantMigrationTest.tenantDB(kTenantId, "testDb");
+const kCollName = "testColl";
+const migrationOpts = {
+ migrationIdString: extractUUIDFromObject(kMigrationId),
+ tenantId: kTenantId,
+};
+
+const donorRst = tenantMigrationTest.getDonorRst();
+const donorPrimary = tenantMigrationTest.getDonorPrimary();
+const rsConn = new Mongo(donorRst.getURL());
+const recipientPrimary = tenantMigrationTest.getRecipientPrimary();
+
+const session = rsConn.startSession({retryWrites: true});
+const sessionColl = session.getDatabase(kDbName)[kCollName];
+
+const session2 = rsConn.startSession({retryWrites: true});
+const sessionColl2 = session2.getDatabase(kDbName)[kCollName];
+
+jsTestLog("Run retryable writes prior to the migration.");
+assert.commandWorked(sessionColl.insert({_id: "retryableWrite1"}));
+assert.commandWorked(sessionColl2.insert({_id: "retryableWrite2"}));
+
+jsTestLog("Setting up failpoints.");
+// Use `pauseAfterRetrievingRetryableWritesBatch` to hang after inserting the first batch of results
+// from the aggregation request into the oplog buffer.
+const fpPauseAfterRetrievingRetryableWritesBatch =
+ configureFailPoint(recipientPrimary, "pauseAfterRetrievingRetryableWritesBatch");
+
+// Set aggregation request batch size to 1 so that we can failover in between batches.
+const fpSetSmallAggregationBatchSize =
+ configureFailPoint(recipientPrimary, "fpSetSmallAggregationBatchSize");
+
+jsTestLog("Starting tenant migration with migrationId: " + kMigrationId +
+ ", tenantId: " + kTenantId);
+assert.commandWorked(tenantMigrationTest.startMigration(migrationOpts));
+
+jsTestLog("Waiting until the recipient primary fetches a batch of retryable writes oplog entries.");
+fpSetSmallAggregationBatchSize.wait();
+fpPauseAfterRetrievingRetryableWritesBatch.wait();
+
+// Check that the oplog buffer is correctly populated.
+const kOplogBufferNS = "repl.migration.oplog_" + migrationOpts.migrationIdString;
+let recipientOplogBuffer = recipientPrimary.getDB("config")[kOplogBufferNS];
+// We expect to have only retryableWrite1 since the cursor batch size is 1 and we paused after
+// inserting the first batch of results from the aggregation request.
+let cursor = recipientOplogBuffer.find();
+assert.eq(cursor.itcount(), 1, "Incorrect number of oplog entries in buffer: " + cursor.toArray());
+
+// Check that we haven't completed the retryable writes fetching stage yet.
+let recipientConfigColl = recipientPrimary.getCollection(TenantMigrationTest.kConfigRecipientsNS);
+let recipientDoc = recipientConfigColl.find({"_id": kMigrationId}).toArray();
+assert.eq(recipientDoc.length, 1);
+assert.eq(recipientDoc[0].completedFetchingRetryableWritesBeforeStartOpTime, false);
+
+jsTestLog("Stepping a new primary up.");
+const recipientRst = tenantMigrationTest.getRecipientRst();
+const recipientSecondary = recipientRst.getSecondary();
+// Use `fpAfterFetchingRetryableWritesEntriesBeforeStartOpTime` to hang after populating the oplog
+// buffer with retryable writes entries. Set this before stepping up instead of after so that the
+// new primary will not be able to pass this stage without the failpoint being set.
+const fpAfterFetchingRetryableWritesEntries = configureFailPoint(
+ recipientSecondary, "fpAfterFetchingRetryableWritesEntriesBeforeStartOpTime", {action: "hang"});
+
+recipientRst.stepUp(recipientSecondary);
+
+fpPauseAfterRetrievingRetryableWritesBatch.off();
+const newRecipientPrimary = recipientRst.getPrimary();
+
+fpAfterFetchingRetryableWritesEntries.wait();
+// The new primary should have cleared its oplog buffer and refetched both retryableWrite1 and
+// retryableWrite2. Otherwise, the server would hit an invariant failure when trying to add those
+// entries.
+recipientOplogBuffer = newRecipientPrimary.getDB("config")[kOplogBufferNS];
+cursor = recipientOplogBuffer.find();
+assert.eq(cursor.itcount(), 2, "Incorrect number of oplog entries in buffer: " + cursor.toArray());
+
+recipientConfigColl = newRecipientPrimary.getCollection(TenantMigrationTest.kConfigRecipientsNS);
+recipientDoc = recipientConfigColl.find({"_id": kMigrationId}).toArray();
+assert.eq(recipientDoc.length, 1);
+assert.eq(recipientDoc[0].completedFetchingRetryableWritesBeforeStartOpTime, true);
+
+fpAfterFetchingRetryableWritesEntries.off();
+fpSetSmallAggregationBatchSize.off();
+
+jsTestLog("Waiting for migration to complete.");
+TenantMigrationTest.assertCommitted(tenantMigrationTest.waitForMigrationToComplete(migrationOpts));
+
+tenantMigrationTest.stop();
diff --git a/jstests/replsets/tenant_migration_recipient_rollback_recovery.js b/jstests/replsets/tenant_migration_recipient_rollback_recovery.js
new file mode 100644
index 00000000000..ffb2cc4919e
--- /dev/null
+++ b/jstests/replsets/tenant_migration_recipient_rollback_recovery.js
@@ -0,0 +1,335 @@
+/**
+ * Tests that tenant migrations that go through recipient rollback are recovered correctly.
+ *
+ * @tags: [
+ * incompatible_with_macos,
+ * incompatible_with_shard_merge,
+ * incompatible_with_windows_tls,
+ * requires_majority_read_concern,
+ * requires_persistence,
+ * serverless,
+ * ]
+ */
+
+import {TenantMigrationTest} from "jstests/replsets/libs/tenant_migration_test.js";
+import {
+ forgetMigrationAsync,
+ makeX509OptionsForTest,
+ runMigrationAsync,
+} from "jstests/replsets/libs/tenant_migration_util.js";
+
+load("jstests/libs/fail_point_util.js");
+load("jstests/libs/uuid_util.js");
+load("jstests/libs/parallelTester.js");
+load("jstests/replsets/libs/rollback_test.js");
+load("jstests/replsets/rslib.js"); // 'createRstArgs'
+
+const kTenantId = ObjectId().str;
+
+const kMaxSleepTimeMS = 250;
+
+// Set the delay before a state doc is garbage collected to be short to speed up the test but long
+// enough for the state doc to still be around after the recipient is back in the replication steady
+// state.
+const kGarbageCollectionDelayMS = 30 * 1000;
+
+const migrationX509Options = makeX509OptionsForTest();
+
+function makeMigrationOpts(tenantMigrationTest, migrationId, tenantId) {
+ return {
+ migrationIdString: extractUUIDFromObject(migrationId),
+ tenantId: tenantId,
+ recipientConnString: tenantMigrationTest.getRecipientConnString(),
+ readPreference: {mode: "primary"},
+ };
+}
+
+/**
+ * Starts a recipient ReplSetTest and creates a TenantMigrationTest for it. Runs 'setUpFunc' after
+ * initiating the recipient. Then, runs 'rollbackOpsFunc' while replication is disabled on the
+ * secondaries, shuts down the primary and restarts it after re-election to force the operations in
+ * 'rollbackOpsFunc' to be rolled back. Finally, runs 'steadyStateFunc' after it is back in the
+ * replication steady state.
+ */
+function testRollBack(setUpFunc, rollbackOpsFunc, steadyStateFunc) {
+ const donorRst = new ReplSetTest({
+ name: "donorRst",
+ nodes: 1,
+ serverless: true,
+ nodeOptions: Object.assign({}, migrationX509Options.donor, {
+ setParameter: {
+ tenantMigrationGarbageCollectionDelayMS: kGarbageCollectionDelayMS,
+ ttlMonitorSleepSecs: 1,
+ }
+ })
+ });
+ donorRst.startSet();
+ donorRst.initiate();
+
+ const donorRstArgs = createRstArgs(donorRst);
+
+ const recipientRst = new ReplSetTest({
+ name: "recipientRst",
+ nodes: 3,
+ serverless: true,
+ nodeOptions: Object.assign({}, migrationX509Options.recipient, {
+ setParameter: {
+ tenantMigrationGarbageCollectionDelayMS: kGarbageCollectionDelayMS,
+ ttlMonitorSleepSecs: 1,
+ }
+ })
+ });
+ recipientRst.startSet();
+ recipientRst.initiate();
+
+ const tenantMigrationTest =
+ new TenantMigrationTest({name: jsTestName(), donorRst, recipientRst});
+ setUpFunc(tenantMigrationTest, donorRstArgs);
+
+ let originalRecipientPrimary = recipientRst.getPrimary();
+ const originalRecipientSecondaries = recipientRst.getSecondaries();
+ // The default WC is majority and stopServerReplication will prevent satisfying any majority
+ // writes.
+ assert.commandWorked(originalRecipientPrimary.adminCommand(
+ {setDefaultRWConcern: 1, defaultWriteConcern: {w: 1}, writeConcern: {w: "majority"}}));
+ recipientRst.awaitLastOpCommitted();
+
+ // Disable replication on the secondaries so that writes during this step will be rolled back.
+ stopServerReplication(originalRecipientSecondaries);
+ rollbackOpsFunc(tenantMigrationTest, donorRstArgs);
+
+ // Shut down the primary and re-enable replication to allow one of the secondaries to get
+ // elected, and make the writes above get rolled back on the original primary when it comes
+ // back up.
+ recipientRst.stop(originalRecipientPrimary);
+ restartServerReplication(originalRecipientSecondaries);
+ const newRecipientPrimary = recipientRst.getPrimary();
+ assert.neq(originalRecipientPrimary, newRecipientPrimary);
+
+ // Restart the original primary.
+ originalRecipientPrimary =
+ recipientRst.start(originalRecipientPrimary, {waitForConnect: true}, true /* restart */);
+ originalRecipientPrimary.setSecondaryOk();
+ recipientRst.awaitReplication();
+
+ steadyStateFunc(tenantMigrationTest);
+
+ donorRst.stopSet();
+ recipientRst.stopSet();
+}
+
+/**
+ * Starts a migration and waits for the recipient's primary to insert the recipient's state doc.
+ * Forces the write to be rolled back. After the replication steady state is reached, asserts that
+ * recipientSyncData can restart the migration on the new primary.
+ */
+function testRollbackInitialState() {
+ const migrationId = UUID();
+ let migrationOpts;
+ let migrationThread;
+
+ let setUpFunc = (tenantMigrationTest, donorRstArgs) => {};
+
+ let rollbackOpsFunc = (tenantMigrationTest, donorRstArgs) => {
+ const recipientPrimary = tenantMigrationTest.getRecipientPrimary();
+
+ // Start the migration asynchronously and wait for the primary to insert the state doc.
+ migrationOpts = makeMigrationOpts(tenantMigrationTest, migrationId, ObjectId().str);
+ migrationThread = new Thread(runMigrationAsync, migrationOpts, donorRstArgs);
+ migrationThread.start();
+ assert.soon(() => {
+ return 1 ===
+ recipientPrimary.getCollection(TenantMigrationTest.kConfigRecipientsNS).count({
+ _id: migrationId
+ });
+ });
+ };
+
+ let steadyStateFunc = (tenantMigrationTest) => {
+ // Verify that the migration restarted successfully on the new primary despite rollback.
+ TenantMigrationTest.assertCommitted(migrationThread.returnData());
+ tenantMigrationTest.assertRecipientNodesInExpectedState({
+ nodes: tenantMigrationTest.getRecipientRst().nodes,
+ migrationId: migrationId,
+ tenantId: migrationOpts.tenantId,
+ expectedState: TenantMigrationTest.RecipientState.kConsistent,
+ expectedAccessState: TenantMigrationTest.RecipientAccessState.kRejectBefore
+ });
+ assert.commandWorked(tenantMigrationTest.forgetMigration(migrationOpts.migrationIdString));
+ };
+
+ testRollBack(setUpFunc, rollbackOpsFunc, steadyStateFunc);
+}
+
+/**
+ * Starts a migration after enabling 'pauseFailPoint' (must pause the migration) and
+ * 'setUpFailPoints' on the recipient's primary. Waits for the primary to do the write to transition
+ * to 'nextState' after reaching 'pauseFailPoint' (i.e. the state doc matches 'query'), then forces
+ * the write to be rolled back. After the replication steady state is reached, asserts that the
+ * migration is resumed successfully by the new primary regardless of what the rolled back state
+ * transition is.
+ */
+function testRollBackStateTransition(pauseFailPoint, setUpFailPoints, nextState, query) {
+ jsTest.log(`Test roll back the write to transition to state "${
+ nextState}" after reaching failpoint "${pauseFailPoint}"`);
+
+ const migrationId = UUID();
+ let migrationOpts;
+ let migrationThread, pauseFp;
+
+ let setUpFunc = (tenantMigrationTest, donorRstArgs) => {
+ const recipientPrimary = tenantMigrationTest.getRecipientPrimary();
+ setUpFailPoints.forEach(failPoint => configureFailPoint(recipientPrimary, failPoint));
+ pauseFp = configureFailPoint(recipientPrimary, pauseFailPoint, {action: "hang"});
+
+ migrationOpts = makeMigrationOpts(tenantMigrationTest, migrationId, ObjectId().str);
+ migrationThread = new Thread(runMigrationAsync, migrationOpts, donorRstArgs);
+ migrationThread.start();
+ pauseFp.wait();
+ };
+
+ let rollbackOpsFunc = (tenantMigrationTest, donorRstArgs) => {
+ const recipientPrimary = tenantMigrationTest.getRecipientPrimary();
+ // Resume the migration and wait for the primary to do the write for the state transition.
+ pauseFp.off();
+ assert.soon(() => {
+ return 1 ===
+ recipientPrimary.getCollection(TenantMigrationTest.kConfigRecipientsNS)
+ .count(Object.assign({_id: migrationId}, query));
+ });
+ };
+
+ let steadyStateFunc = (tenantMigrationTest) => {
+ // Verify that the migration resumed successfully on the new primary despite the rollback.
+ TenantMigrationTest.assertCommitted(migrationThread.returnData());
+ tenantMigrationTest.waitForRecipientNodesToReachState(
+ tenantMigrationTest.getRecipientRst().nodes,
+ migrationId,
+ migrationOpts.tenantId,
+ TenantMigrationTest.RecipientState.kConsistent,
+ TenantMigrationTest.RecipientAccessState.kRejectBefore);
+ assert.commandWorked(tenantMigrationTest.forgetMigration(migrationOpts.migrationIdString));
+ };
+
+ testRollBack(setUpFunc, rollbackOpsFunc, steadyStateFunc);
+}
+
+/**
+ * Runs donorForgetMigration after completing a migration. Waits for the recipient's primary to
+ * mark the recipient's state doc as garbage collectable, then forces the write to be rolled back.
+ * After the replication steady state is reached, asserts that recipientForgetMigration can be
+ * retried on the new primary and that the state doc is eventually garbage collected.
+ */
+function testRollBackMarkingStateGarbageCollectable() {
+ const migrationId = UUID();
+ let migrationOpts;
+ let forgetMigrationThread;
+
+ let setUpFunc = (tenantMigrationTest, donorRstArgs) => {
+ migrationOpts = makeMigrationOpts(tenantMigrationTest, migrationId, ObjectId().str);
+ TenantMigrationTest.assertCommitted(
+ tenantMigrationTest.runMigration(migrationOpts, {automaticForgetMigration: false}));
+ };
+
+ let rollbackOpsFunc = (tenantMigrationTest, donorRstArgs) => {
+ const recipientPrimary = tenantMigrationTest.getRecipientPrimary();
+ // Run donorForgetMigration and wait for the primary to do the write to mark the state doc
+ // as garbage collectable.
+ forgetMigrationThread = new Thread(forgetMigrationAsync,
+ migrationOpts.migrationIdString,
+ donorRstArgs,
+ false /* retryOnRetryableErrors */);
+ forgetMigrationThread.start();
+ assert.soon(() => {
+ return 1 ===
+ recipientPrimary.getCollection(TenantMigrationTest.kConfigRecipientsNS)
+ .count({_id: migrationId, expireAt: {$exists: 1}});
+ });
+ };
+
+ let steadyStateFunc = (tenantMigrationTest) => {
+ // Verify that the migration state got garbage collected successfully despite the rollback.
+ assert.commandWorked(forgetMigrationThread.returnData());
+ tenantMigrationTest.waitForMigrationGarbageCollection(
+ migrationId,
+ migrationOpts.tenantId,
+ tenantMigrationTest.getDonorRst().nodes,
+ tenantMigrationTest.getRecipientRst().nodes);
+ };
+
+ testRollBack(setUpFunc, rollbackOpsFunc, steadyStateFunc);
+}
+
+/**
+ * Starts a migration and forces the recipient's primary to go through rollback after a random
+ * amount of time. After the replication steady state is reached, asserts that the migration is
+ * resumed successfully.
+ */
+function testRollBackRandom() {
+ const migrationId = UUID();
+ let migrationOpts;
+ let migrationThread;
+
+ let setUpFunc = (tenantMigrationTest, donorRstArgs) => {
+ migrationOpts = makeMigrationOpts(tenantMigrationTest, migrationId, ObjectId().str);
+ migrationThread = new Thread(async (donorRstArgs, migrationOpts) => {
+ const {runMigrationAsync, forgetMigrationAsync} =
+ await import("jstests/replsets/libs/tenant_migration_util.js");
+ assert.commandWorked(await runMigrationAsync(migrationOpts, donorRstArgs));
+ assert.commandWorked(await forgetMigrationAsync(
+ migrationOpts.migrationIdString, donorRstArgs, false /* retryOnRetryableErrors */));
+ }, donorRstArgs, migrationOpts);
+
+ // Start the migration and wait for a random amount of time before transitioning to the
+ // rollback operations state.
+ migrationThread.start();
+ sleep(Math.random() * kMaxSleepTimeMS);
+ };
+
+ let rollbackOpsFunc = (tenantMigrationTest, donorRstArgs) => {
+ // Let the migration run in the rollback operations state for a random amount of time.
+ sleep(Math.random() * kMaxSleepTimeMS);
+ };
+
+ let steadyStateFunc = (tenantMigrationTest) => {
+ // Verify that the migration completed and was garbage collected successfully despite the
+ // rollback.
+ migrationThread.join();
+ tenantMigrationTest.waitForRecipientNodesToReachState(
+ tenantMigrationTest.getRecipientRst().nodes,
+ migrationId,
+ migrationOpts.tenantId,
+ TenantMigrationTest.RecipientState.kDone,
+ TenantMigrationTest.RecipientAccessState.kRejectBefore);
+ tenantMigrationTest.waitForMigrationGarbageCollection(
+ migrationId,
+ migrationOpts.tenantId,
+ tenantMigrationTest.getDonorRst().nodes,
+ tenantMigrationTest.getRecipientRst().nodes);
+ };
+
+ testRollBack(setUpFunc, rollbackOpsFunc, steadyStateFunc);
+}
+
+jsTest.log("Test roll back recipient's state doc insert");
+testRollbackInitialState();
+
+jsTest.log("Test roll back recipient's state doc update");
+[{
+ pauseFailPoint: "fpBeforeMarkingCloneSuccess",
+ nextState: "reject",
+ query: {dataConsistentStopDonorOpTime: {$exists: 1}}
+},
+ {
+ pauseFailPoint: "fpBeforePersistingRejectReadsBeforeTimestamp",
+ nextState: "rejectBefore",
+ query: {rejectReadsBeforeTimestamp: {$exists: 1}}
+ }].forEach(({pauseFailPoint, setUpFailPoints = [], nextState, query}) => {
+ testRollBackStateTransition(pauseFailPoint, setUpFailPoints, nextState, query);
+});
+
+jsTest.log("Test roll back marking the donor's state doc as garbage collectable");
+testRollBackMarkingStateGarbageCollectable();
+
+jsTest.log("Test roll back random");
+testRollBackRandom();
diff --git a/jstests/replsets/tenant_migration_recipient_sync_donor_timestamp.js b/jstests/replsets/tenant_migration_recipient_sync_donor_timestamp.js
new file mode 100644
index 00000000000..81422e28454
--- /dev/null
+++ b/jstests/replsets/tenant_migration_recipient_sync_donor_timestamp.js
@@ -0,0 +1,94 @@
+/**
+ * Exercises the code path for the recipientSyncData command that waits until a timestamp provided
+ * by the donor is majority committed: make sure that when the recipient is interrupted by a
+ * primary stepdown in this code path, it properly swaps the error code to the true interruption
+ * code (such as a primary stepdown error) that the donor can retry on.
+ *
+ * @tags: [
+ * incompatible_with_macos,
+ * incompatible_with_shard_merge,
+ * incompatible_with_windows_tls,
+ * requires_persistence,
+ * requires_replication,
+ * serverless,
+ * ]
+ */
+
+import {TenantMigrationTest} from "jstests/replsets/libs/tenant_migration_test.js";
+load("jstests/libs/fail_point_util.js");
+load("jstests/libs/uuid_util.js"); // For extractUUIDFromObject()
+
+// Make the batch size small so that we can pause before all the batches are applied.
+const tenantMigrationTest = new TenantMigrationTest(
+ {name: jsTestName(), sharedOptions: {setParameter: {tenantApplierBatchSizeOps: 2}}});
+
+const kMigrationId = UUID();
+const kTenantId = ObjectId().str;
+const kReadPreference = {
+ mode: "primary"
+};
+const migrationOpts = {
+ migrationIdString: extractUUIDFromObject(kMigrationId),
+ tenantId: kTenantId,
+ readPreference: kReadPreference
+};
+
+const dbName = tenantMigrationTest.tenantDB(kTenantId, "testDB");
+const collName = jsTestName() + "_collection";
+
+const recipientRst = tenantMigrationTest.getRecipientRst();
+const recipientPrimary = recipientRst.getPrimary();
+
+// FailPoint to pause right before the data consistent promise is fulfilled.
+const fpBeforeDataConsistent = configureFailPoint(
+ recipientPrimary, "fpBeforeFulfillingDataConsistentPromise", {action: "hang"});
+const fpBeforeApplierFutureCalled =
+ configureFailPoint(recipientPrimary, "fpWaitUntilTimestampMajorityCommitted");
+
+tenantMigrationTest.insertDonorDB(dbName, collName);
+
+jsTestLog("Starting migration.");
+// Start the migration, and allow it to progress to the point where the _dataConsistentPromise has
+// been fulfilled.
+tenantMigrationTest.startMigration(migrationOpts);
+
+jsTestLog("Waiting for data consistent promise.");
+// Pause right before the _dataConsistentPromise is fulfilled. Therefore, the applier has
+// finished applying entries at least until dataConsistentStopDonorOpTime.
+fpBeforeDataConsistent.wait();
+
+jsTestLog("Pausing the tenant oplog applier.");
+// Pause the applier now. All the entries that the applier cannot process now are past the
+// dataConsistentStopDonorOpTime.
+const fpPauseOplogApplier =
+ configureFailPoint(recipientPrimary, "fpBeforeTenantOplogApplyingBatch");
+
+jsTestLog("Writing to donor db.");
+// Send writes to the donor. The applier will not be able to process these as it is paused.
+const docsToApply = [...Array(10).keys()].map((i) => ({a: i}));
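+// The ten documents above ({a: 0} through {a: 9}) span several applier batches, since
+// tenantApplierBatchSizeOps is set to 2 for this test.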
+tenantMigrationTest.insertDonorDB(dbName, collName, docsToApply);
+
+jsTestLog("Waiting to hit failpoint in tenant oplog applier.");
+fpPauseOplogApplier.wait();
+
+jsTestLog("Allowing recipient to respond.");
+// Allow the recipient to respond to the donor for the recipientSyncData command that waits on the
+// fulfillment of the _dataConsistentPromise. The donor will then send another recipientSyncData
+// command that waits on the provided donor timestamp to be majority committed.
+fpBeforeDataConsistent.off();
+
+jsTestLog("Reach the point where we are waiting for the tenant oplog applier to catch up.");
+fpBeforeApplierFutureCalled.wait();
+fpBeforeApplierFutureCalled.off();
+
+jsTestLog("Stepping another node up.");
+// Make a new recipient primary step up. This will ask the applier to shutdown.
+recipientRst.stepUp(recipientRst.getSecondaries()[0]);
+
+jsTestLog("Release the tenant oplog applier failpoint.");
+fpPauseOplogApplier.off();
+
+jsTestLog("Waiting for migration to complete.");
+TenantMigrationTest.assertCommitted(tenantMigrationTest.waitForMigrationToComplete(migrationOpts));
+
+tenantMigrationTest.stop();
diff --git a/jstests/replsets/tenant_migration_recipient_sync_source_reconnect_delayed_secondary.js b/jstests/replsets/tenant_migration_recipient_sync_source_reconnect_delayed_secondary.js
new file mode 100644
index 00000000000..c95fb70d474
--- /dev/null
+++ b/jstests/replsets/tenant_migration_recipient_sync_source_reconnect_delayed_secondary.js
@@ -0,0 +1,66 @@
+/**
+ * Tests that a migration will continuously retry sync source selection when there are no available
+ * donor hosts. Also checks that a donor host is considered an ineligible sync source when it has a
+ * majority OpTime earlier than the recipient's stored 'startApplyingDonorOpTime'.
+ *
+ * Tests that if the stale donor host advances its majority OpTime to 'startApplyingDonorOpTime'
+ * or later, the recipient will successfully choose that donor as sync source and resume the
+ * migration.
+ *
+ * @tags: [
+ * incompatible_with_macos,
+ * incompatible_with_shard_merge,
+ * incompatible_with_windows_tls,
+ * requires_majority_read_concern,
+ * requires_persistence,
+ * # The currentOp output field 'dataSyncCompleted' was renamed to 'migrationCompleted'.
+ * requires_fcv_70,
+ * serverless,
+ * ]
+ */
+
+import {
+ setUpMigrationSyncSourceTest
+} from "jstests/replsets/libs/tenant_migration_recipient_sync_source.js";
+import {TenantMigrationTest} from "jstests/replsets/libs/tenant_migration_test.js";
+
+// After this setUp() call, we should have a migration with 'secondary' read preference. The
+// recipient should be continuously retrying sync source selection, unable to choose
+// 'delayedSecondary' because it is too stale and 'donorSecondary' because it is down.
+const {
+ tenantMigrationTest,
+ migrationOpts,
+ donorSecondary,
+ delayedSecondary,
+ hangAfterCreatingConnections
+} = setUpMigrationSyncSourceTest();
+
+if (!tenantMigrationTest) {
+ // Feature flag was not enabled.
+ quit();
+}
+
+jsTestLog("Restarting replication on 'delayedSecondary'");
+restartServerReplication(delayedSecondary);
+
+// The recipient should eventually be able to connect to the lagged secondary, after the secondary
+// has caught up and the exclude timeout has expired.
+hangAfterCreatingConnections.wait();
+
+const recipientPrimary = tenantMigrationTest.getRecipientPrimary();
+const res = recipientPrimary.adminCommand({currentOp: true, desc: "tenant recipient migration"});
+const currOp = res.inprog[0];
+assert.eq(delayedSecondary.host,
+ currOp.donorSyncSource,
+ `the recipient should only be able to choose 'delayedSecondary' as sync source`);
+
+hangAfterCreatingConnections.off();
+
+TenantMigrationTest.assertCommitted(tenantMigrationTest.waitForMigrationToComplete(migrationOpts));
+assert.commandWorked(tenantMigrationTest.forgetMigration(migrationOpts.migrationIdString));
+
+// Remove 'donorSecondary' so that the test can complete properly.
+const donorRst = tenantMigrationTest.getDonorRst();
+donorRst.remove(donorSecondary);
+donorRst.stopSet();
+tenantMigrationTest.stop();
diff --git a/jstests/replsets/tenant_migration_recipient_sync_source_restart_donor_secondary.js b/jstests/replsets/tenant_migration_recipient_sync_source_restart_donor_secondary.js
new file mode 100644
index 00000000000..10ded49a34b
--- /dev/null
+++ b/jstests/replsets/tenant_migration_recipient_sync_source_restart_donor_secondary.js
@@ -0,0 +1,67 @@
+/**
+ * Tests that a migration will continuously retry sync source selection when there are no available
+ * donor hosts. Also checks that a donor host is considered an ineligible sync source when it has a
+ * majority OpTime earlier than the recipient's stored 'startApplyingDonorOpTime'.
+ *
+ * Tests that if a donor host becomes available, the recipient will successfully choose it as a
+ * sync source and resume the migration.
+ *
+ * @tags: [
+ * incompatible_with_macos,
+ * incompatible_with_shard_merge,
+ * incompatible_with_windows_tls,
+ * requires_majority_read_concern,
+ * requires_persistence,
+ * # The currentOp output field 'dataSyncCompleted' was renamed to 'migrationCompleted'.
+ * requires_fcv_70,
+ * serverless,
+ * ]
+ */
+
+import {
+ setUpMigrationSyncSourceTest
+} from "jstests/replsets/libs/tenant_migration_recipient_sync_source.js";
+import {TenantMigrationTest} from "jstests/replsets/libs/tenant_migration_test.js";
+
+// After this setUp() call, we should have a migration with 'secondary' read preference. The
+// recipient should be continuously retrying sync source selection, unable to choose
+// 'delayedSecondary' because it is too stale and 'donorSecondary' because it is down.
+const {
+ tenantMigrationTest,
+ migrationOpts,
+ donorSecondary,
+ delayedSecondary,
+ hangAfterCreatingConnections
+} = setUpMigrationSyncSourceTest();
+
+if (!tenantMigrationTest) {
+ // Feature flag was not enabled.
+ quit();
+}
+
+const donorRst = tenantMigrationTest.getDonorRst();
+
+jsTestLog("Restarting 'donorSecondary'");
+donorRst.start(donorSecondary, null /* options */, true /* restart */);
+
+// The recipient should eventually be able to connect to the donor secondary, after the node reaches
+// 'secondary' state.
+hangAfterCreatingConnections.wait();
+
+const recipientPrimary = tenantMigrationTest.getRecipientPrimary();
+const res = recipientPrimary.adminCommand({currentOp: true, desc: "tenant recipient migration"});
+const currOp = res.inprog[0];
+// 'donorSecondary' should always be the chosen sync source, since read preference is 'secondary'
+// and 'delayedSecondary' cannot be chosen because it is too stale.
+assert.eq(donorSecondary.host,
+ currOp.donorSyncSource,
+ `the recipient should only be able to choose 'donorSecondary' as sync source`);
+
+hangAfterCreatingConnections.off();
+restartServerReplication(delayedSecondary);
+
+TenantMigrationTest.assertCommitted(tenantMigrationTest.waitForMigrationToComplete(migrationOpts));
+assert.commandWorked(tenantMigrationTest.forgetMigration(migrationOpts.migrationIdString));
+
+donorRst.stopSet();
+tenantMigrationTest.stop();
diff --git a/jstests/replsets/tenant_migration_resume_collection_cloner_after_recipient_failover.js b/jstests/replsets/tenant_migration_resume_collection_cloner_after_recipient_failover.js
new file mode 100644
index 00000000000..ea9654d97ac
--- /dev/null
+++ b/jstests/replsets/tenant_migration_resume_collection_cloner_after_recipient_failover.js
@@ -0,0 +1,132 @@
+/**
+ * Tests that in a tenant migration, the recipient set can resume collection cloning from the last
+ * document cloned after a failover.
+ * @tags: [
+ * incompatible_with_macos,
+ * incompatible_with_shard_merge,
+ * incompatible_with_windows_tls,
+ * requires_majority_read_concern,
+ * requires_persistence,
+ * serverless,
+ * ]
+ */
+
+import {TenantMigrationTest} from "jstests/replsets/libs/tenant_migration_test.js";
+import {
+ checkTenantDBHashes,
+ makeX509OptionsForTest,
+} from "jstests/replsets/libs/tenant_migration_util.js";
+
+load("jstests/libs/fail_point_util.js");
+load("jstests/libs/uuid_util.js"); // for 'extractUUIDFromObject'
+
+const tenantMigrationFailoverTest = function(isTimeSeries, createCollFn, docs) {
+ const batchSize = 2;
+ const recipientRst = new ReplSetTest({
+ nodes: 2,
+ name: jsTestName() + "_recipient",
+ serverless: true,
+ nodeOptions: Object.assign(makeX509OptionsForTest().recipient, {
+ setParameter: {
+ // Use a batch size of 2 so that collection cloner requires more than a single
+ // batch to complete.
+ collectionClonerBatchSize: batchSize,
+ // Allow reads on recipient before migration completes for testing.
+ 'failpoint.tenantMigrationRecipientNotRejectReads': tojson({mode: 'alwaysOn'}),
+ }
+ })
+ });
+
+ recipientRst.startSet();
+ recipientRst.initiate();
+
+ const tenantMigrationTest =
+ new TenantMigrationTest({name: jsTestName(), recipientRst: recipientRst});
+ const donorPrimary = tenantMigrationTest.getDonorPrimary();
+
+ const tenantId = ObjectId().str;
+ const dbName = tenantMigrationTest.tenantDB(tenantId, "testDB");
+ const donorDB = donorPrimary.getDB(dbName);
+ const collName = "testColl";
+
+ const recipientPrimary = tenantMigrationTest.getRecipientPrimary();
+
+ // Create collection and insert documents.
+ assert.commandWorked(createCollFn(donorDB, collName));
+ tenantMigrationTest.insertDonorDB(dbName, collName, docs);
+
+ const migrationId = UUID();
+ const migrationIdString = extractUUIDFromObject(migrationId);
+ const migrationOpts = {
+ migrationIdString: migrationIdString,
+ recipientConnString: tenantMigrationTest.getRecipientConnString(),
+ tenantId,
+ };
+
+ // Configure a fail point to have the recipient primary hang after cloning 2 documents.
+ const recipientDb = recipientPrimary.getDB(dbName);
+ let recipientColl = isTimeSeries ? recipientDb.getCollection("system.buckets." + collName)
+ : recipientDb.getCollection(collName);
+
+ const hangDuringCollectionClone =
+ configureFailPoint(recipientDb,
+ "tenantMigrationHangCollectionClonerAfterHandlingBatchResponse",
+ {nss: recipientColl.getFullName()});
+
+ // Start a migration and wait for the recipient to hang after cloning 2 documents.
+ assert.commandWorked(tenantMigrationTest.startMigration(migrationOpts));
+ hangDuringCollectionClone.wait();
+ assert.soon(() => recipientColl.find().itcount() === batchSize);
+
+ // Insert some documents that will be fetched by the recipient. This is to test that on
+ // failover, the fetcher will resume fetching from where it left off. The system is expected
+ // to crash if the recipient fetches a duplicate oplog entry upon resuming the migration.
+ tenantMigrationTest.insertDonorDB(dbName, "aNewColl", [{_id: "docToBeFetched"}]);
+ assert.soon(() => {
+ const configDb = recipientPrimary.getDB("config");
+ const oplogBuffer = configDb.getCollection("repl.migration.oplog_" + migrationIdString);
+ return oplogBuffer.find({"entry.o._id": "docToBeFetched"}).count() === 1;
+ });
+
+ // Step up a new node in the recipient set and trigger a failover. The new primary should resume
+ // cloning starting from the third document.
+ const newRecipientPrimary = recipientRst.getSecondaries()[0];
+ recipientRst.stepUp(newRecipientPrimary);
+ hangDuringCollectionClone.off();
+ recipientRst.getPrimary();
+
+ // The migration should go through after recipient failover.
+ TenantMigrationTest.assertCommitted(
+ tenantMigrationTest.waitForMigrationToComplete(migrationOpts));
+
+ // Check that recipient has cloned all documents in the collection.
+ recipientColl = newRecipientPrimary.getDB(dbName).getCollection(collName);
+ assert.eq(docs.length, recipientColl.find().itcount());
+ assert.docEq(docs, recipientColl.find().sort({_id: 1}).toArray());
+ checkTenantDBHashes({
+ donorRst: tenantMigrationTest.getDonorRst(),
+ recipientRst: tenantMigrationTest.getRecipientRst(),
+ tenantId
+ });
+
+ tenantMigrationTest.stop();
+ recipientRst.stopSet();
+};
+
+jsTestLog("Running tenant migration test for time-series collection");
+tenantMigrationFailoverTest(true,
+ (db, collName) => db.createCollection(
+ collName, {timeseries: {timeField: "time", metaField: "bucket"}}),
+ [
+ // Group each document in its own bucket in order to work with the
+ // collectionClonerBatchSize we set at the recipient replSet.
+ {_id: 1, time: ISODate(), bucket: "a"},
+ {_id: 2, time: ISODate(), bucket: "b"},
+ {_id: 3, time: ISODate(), bucket: "c"},
+ {_id: 4, time: ISODate(), bucket: "d"}
+ ]);
+
+jsTestLog("Running tenant migration test for regular collection");
+tenantMigrationFailoverTest(false,
+ (db, collName) => db.createCollection(collName),
+ [{_id: 0}, {_id: "string"}, {_id: UUID()}, {_id: new Date()}]);
diff --git a/jstests/replsets/tenant_migration_resume_collection_cloner_after_recipient_failover_with_dropped_views.js b/jstests/replsets/tenant_migration_resume_collection_cloner_after_recipient_failover_with_dropped_views.js
new file mode 100644
index 00000000000..262e40edf00
--- /dev/null
+++ b/jstests/replsets/tenant_migration_resume_collection_cloner_after_recipient_failover_with_dropped_views.js
@@ -0,0 +1,133 @@
+/**
+ * Tests that in a tenant migration, a collection recreated on a dropped view namespace is handled
+ * correctly when the logical tenant collection cloning phase resumes after a recipient failover.
+ * @tags: [
+ * incompatible_with_macos,
+ * incompatible_with_shard_merge,
+ * incompatible_with_windows_tls,
+ * requires_majority_read_concern,
+ * requires_persistence,
+ * serverless,
+ * ]
+ */
+
+import {TenantMigrationTest} from "jstests/replsets/libs/tenant_migration_test.js";
+import {makeX509OptionsForTest} from "jstests/replsets/libs/tenant_migration_util.js";
+
+load("jstests/libs/fail_point_util.js");
+load("jstests/libs/uuid_util.js"); // for 'extractUUIDFromObject'
+
+const tenantMigrationFailoverTest = function(isTimeSeries, createCollFn) {
+ const recipientRst = new ReplSetTest({
+ nodes: 2,
+ name: jsTestName() + "_recipient",
+ serverless: true,
+ nodeOptions: Object.assign(makeX509OptionsForTest().recipient, {
+ setParameter: {
+ // Allow reads on recipient before migration completes for testing.
+ 'failpoint.tenantMigrationRecipientNotRejectReads': tojson({mode: 'alwaysOn'}),
+ }
+ })
+ });
+
+ recipientRst.startSet();
+ recipientRst.initiate();
+
+ const tenantMigrationTest =
+ new TenantMigrationTest({name: jsTestName(), recipientRst: recipientRst});
+
+ const donorRst = tenantMigrationTest.getDonorRst();
+ const donorPrimary = donorRst.getPrimary();
+
+ const tenantId = ObjectId().str;
+ const dbName = tenantMigrationTest.tenantDB(tenantId, "testDB");
+ const donorDB = donorPrimary.getDB(dbName);
+ const collName = "testColl";
+ const donorColl = donorDB[collName];
+
+ let getCollectionInfo = function(conn) {
+ return conn.getDB(dbName).getCollectionInfos().filter(coll => {
+ return coll.name === collName;
+ });
+ };
+
+ // Create a timeseries collection or a regular view.
+ assert.commandWorked(createCollFn(donorDB, collName));
+ donorRst.awaitReplication();
+
+ const migrationId = UUID();
+ const migrationIdString = extractUUIDFromObject(migrationId);
+ const migrationOpts = {
+ migrationIdString: migrationIdString,
+ recipientConnString: tenantMigrationTest.getRecipientConnString(),
+ tenantId,
+ };
+
+ const recipientPrimary = recipientRst.getPrimary();
+ const recipientDb = recipientPrimary.getDB(dbName);
+ const recipientSystemViewsColl = recipientDb.getCollection("system.views");
+
+ // Configure a fail point to have the recipient primary hang after cloning
+ // "<OID>_testDB.system.views" collection.
+ const hangDuringCollectionClone =
+ configureFailPoint(recipientPrimary,
+ "tenantMigrationHangCollectionClonerAfterHandlingBatchResponse",
+ {nss: recipientSystemViewsColl.getFullName()});
+
+ // Start the migration and wait for the migration to hang after cloning
+ // "<OID>_testDB.system.views" collection.
+ assert.commandWorked(tenantMigrationTest.startMigration(migrationOpts));
+ hangDuringCollectionClone.wait();
+
+ assert.soon(() => recipientSystemViewsColl.find().itcount() >= 1);
+ recipientRst.awaitLastOpCommitted();
+ const newRecipientPrimary = recipientRst.getSecondaries()[0];
+
+ // Verify that a view has been registered for "<OID>_testDB.testColl" on the new
+ // recipient primary.
+ let collectionInfo = getCollectionInfo(newRecipientPrimary);
+ assert.eq(1, collectionInfo.length);
+ assert(collectionInfo[0].type === (isTimeSeries ? "timeseries" : "view"),
+ "data store type mismatch: " + tojson(collectionInfo[0]));
+
+ // Drop the view and create a regular collection with the same namespace as the
+ // dropped view on donor.
+ assert(donorColl.drop());
+ assert.commandWorked(donorDB.createCollection(collName));
+
+ // We need to skip TenantDatabaseCloner::listExistingCollectionsStage() to make sure the
+ // recipient always clones the newly created regular collection above after the failover.
+ // Currently, after a failover we restart cloning only from the collection whose UUID is
+ // greater than or equal to that of the last collection we have on disk.
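+ // For example (hypothetical UUID ordering): if the collection re-created on the donor happened
+ // to get a UUID that sorts below that of the last collection already cloned, the resume logic
+ // would otherwise skip cloning it entirely.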
+ const skiplistExistingCollectionsStage =
+ configureFailPoint(newRecipientPrimary, "skiplistExistingCollectionsStage");
+
+ // Step up a new node in the recipient set and trigger a failover.
+ recipientRst.stepUp(newRecipientPrimary);
+ hangDuringCollectionClone.off();
+
+ // The migration should go through after recipient failover.
+ TenantMigrationTest.assertCommitted(
+ tenantMigrationTest.waitForMigrationToComplete(migrationOpts));
+
+ // Check that the recipient has dropped the view and re-created the regular collection as part
+ // of the migration oplog catch-up phase.
+ collectionInfo = getCollectionInfo(newRecipientPrimary);
+ assert.eq(1, collectionInfo.length);
+ assert(collectionInfo[0].type === "collection",
+ "data store type mismatch: " + tojson(collectionInfo[0]));
+
+ tenantMigrationTest.stop();
+ recipientRst.stopSet();
+};
+
+jsTestLog("Running tenant migration test for time-series collection");
+// Creating a timeseries collection implicitly creates a view on the 'collName' collection
+// namespace.
+tenantMigrationFailoverTest(true,
+ (db, collName) => db.createCollection(
+ collName, {timeseries: {timeField: "time", metaField: "bucket"}}));
+
+jsTestLog("Running tenant migration test for regular view");
+tenantMigrationFailoverTest(false,
+ (db, collName) => db.createView(collName, "sourceCollection", []));
diff --git a/jstests/replsets/tenant_migration_resume_collection_cloner_after_rename.js b/jstests/replsets/tenant_migration_resume_collection_cloner_after_rename.js
new file mode 100644
index 00000000000..0919240cf24
--- /dev/null
+++ b/jstests/replsets/tenant_migration_resume_collection_cloner_after_rename.js
@@ -0,0 +1,124 @@
+/**
+ * Tests that in a tenant migration, the recipient set can resume collection cloning from the last
+ * document cloned after a failover even if the collection has been renamed on the donor.
+ * @tags: [
+ * incompatible_with_macos,
+ * incompatible_with_shard_merge,
+ * incompatible_with_windows_tls,
+ * requires_majority_read_concern,
+ * requires_persistence,
+ * serverless,
+ * ]
+ */
+
+import {TenantMigrationTest} from "jstests/replsets/libs/tenant_migration_test.js";
+import {
+ checkTenantDBHashes,
+ makeX509OptionsForTest,
+ runMigrationAsync
+} from "jstests/replsets/libs/tenant_migration_util.js";
+
+load("jstests/libs/fail_point_util.js");
+load("jstests/libs/uuid_util.js"); // for 'extractUUIDFromObject'
+load("jstests/libs/parallelTester.js"); // for 'Thread'
+load('jstests/replsets/rslib.js'); // 'createRstArgs'
+
+const recipientRst = new ReplSetTest({
+ nodes: 2,
+ name: jsTestName() + "_recipient",
+ serverless: true,
+ nodeOptions: Object.assign(makeX509OptionsForTest().recipient, {
+ setParameter: {
+ // Use a batch size of 2 so that collection cloner requires more than a single batch to
+ // complete.
+ collectionClonerBatchSize: 2,
+ // Allow reads on recipient before migration completes for testing.
+ 'failpoint.tenantMigrationRecipientNotRejectReads': tojson({mode: 'alwaysOn'}),
+ }
+ })
+});
+
+recipientRst.startSet();
+recipientRst.initiate();
+
+const tenantMigrationTest =
+ new TenantMigrationTest({name: jsTestName(), recipientRst: recipientRst});
+const tenantId = ObjectId().str;
+const dbName = tenantMigrationTest.tenantDB(tenantId, "testDB");
+const collName = "testColl";
+
+const recipientPrimary = tenantMigrationTest.getRecipientPrimary();
+const donorPrimary = tenantMigrationTest.getDonorPrimary();
+
+// Test _id with mixed bson types.
+const docs = [{_id: 0}, {_id: "string"}, {_id: UUID()}, {_id: new Date()}];
+tenantMigrationTest.insertDonorDB(dbName, collName, docs);
+
+const migrationId = UUID();
+const migrationIdString = extractUUIDFromObject(migrationId);
+const migrationOpts = {
+ migrationIdString: migrationIdString,
+ recipientConnString: tenantMigrationTest.getRecipientConnString(),
+ tenantId,
+};
+
+// Configure a fail point to have the recipient primary hang after cloning 2 documents.
+const recipientDb = recipientPrimary.getDB(dbName);
+let recipientColl = recipientDb.getCollection(collName);
+const hangDuringCollectionClone =
+ configureFailPoint(recipientDb,
+ "tenantMigrationHangCollectionClonerAfterHandlingBatchResponse",
+ {nss: recipientColl.getFullName()});
+
+// Start a migration and wait for the recipient to hang after cloning 2 documents.
+const donorRstArgs = createRstArgs(tenantMigrationTest.getDonorRst());
+const migrationThread = new Thread(runMigrationAsync, migrationOpts, donorRstArgs);
+migrationThread.start();
+hangDuringCollectionClone.wait();
+assert.soon(() => recipientColl.find().itcount() === 2);
+
+// Insert some documents that will be fetched by the recipient. This is to test that on failover,
+// the fetcher will resume fetching from where it left off. The system is expected to crash if
+// the recipient fetches a duplicate oplog entry upon resuming the migration.
+tenantMigrationTest.insertDonorDB(dbName, "aNewColl", [{_id: "docToBeFetched"}]);
+assert.soon(() => {
+ const configDb = recipientPrimary.getDB("config");
+ const oplogBuffer = configDb.getCollection("repl.migration.oplog_" + migrationIdString);
+ return oplogBuffer.find({"entry.o._id": "docToBeFetched"}).count() === 1;
+});
+
+recipientRst.awaitLastOpCommitted();
+
+// Set a failpoint to prevent the new recipient primary from completing the migration before the
+// donor renames the collection.
+const newRecipientPrimary = recipientRst.getSecondaries()[0];
+const fpPauseAtStartOfMigration =
+ configureFailPoint(newRecipientPrimary, "pauseAfterRunTenantMigrationRecipientInstance");
+
+// Step up a new node in the recipient set and trigger a failover. The new primary should resume
+// cloning starting from the third document.
+recipientRst.stepUp(newRecipientPrimary);
+hangDuringCollectionClone.off();
+recipientRst.getPrimary();
+
+// Rename the collection on the donor.
+const donorColl = donorPrimary.getDB(dbName).getCollection(collName);
+const collNameRenamed = collName + "_renamed";
+assert.commandWorked(donorColl.renameCollection(collNameRenamed));
+
+// The migration should go through after recipient failover.
+fpPauseAtStartOfMigration.off();
+TenantMigrationTest.assertCommitted(migrationThread.returnData());
+
+// Check that recipient has cloned all documents in the renamed collection.
+recipientColl = newRecipientPrimary.getDB(dbName).getCollection(collNameRenamed);
+assert.eq(4, recipientColl.find().itcount());
+assert.eq(recipientColl.find().sort({_id: 1}).toArray(), docs);
+checkTenantDBHashes({
+ donorRst: tenantMigrationTest.getDonorRst(),
+ recipientRst: tenantMigrationTest.getRecipientRst(),
+ tenantId
+});
+
+tenantMigrationTest.stop();
+recipientRst.stopSet();
diff --git a/jstests/replsets/tenant_migration_resume_oplog_application.js b/jstests/replsets/tenant_migration_resume_oplog_application.js
new file mode 100644
index 00000000000..70849750914
--- /dev/null
+++ b/jstests/replsets/tenant_migration_resume_oplog_application.js
@@ -0,0 +1,119 @@
+/**
+ * Tests that in a tenant migration, the recipient primary will resume oplog application on
+ * failover.
+ * @tags: [
+ * incompatible_with_macos,
+ * incompatible_with_shard_merge,
+ * incompatible_with_windows_tls,
+ * requires_majority_read_concern,
+ * requires_persistence,
+ * serverless,
+ * ]
+ */
+
+import {TenantMigrationTest} from "jstests/replsets/libs/tenant_migration_test.js";
+import {
+ checkTenantDBHashes,
+ makeX509OptionsForTest,
+ runMigrationAsync,
+} from "jstests/replsets/libs/tenant_migration_util.js";
+
+load("jstests/libs/fail_point_util.js");
+load("jstests/libs/uuid_util.js"); // for 'extractUUIDFromObject'
+load("jstests/libs/parallelTester.js"); // for 'Thread'
+load("jstests/libs/write_concern_util.js"); // for 'stopReplicationOnSecondaries'
+load("jstests/aggregation/extras/utils.js"); // For assertArrayEq.
+load('jstests/replsets/rslib.js'); // For 'createRstArgs'
+
+const recipientRst = new ReplSetTest({
+ nodes: 3,
+ name: jsTestName() + "_recipient",
+ serverless: true,
+ // Use a batch size of 2 so that we can hang in the middle of tenant oplog application.
+ nodeOptions: Object.assign(makeX509OptionsForTest().recipient,
+ {setParameter: {tenantApplierBatchSizeOps: 2}})
+});
+
+recipientRst.startSet();
+recipientRst.initiate();
+
+const tenantMigrationTest =
+ new TenantMigrationTest({name: jsTestName(), recipientRst: recipientRst});
+
+const tenantId = ObjectId().str;
+const dbName = tenantMigrationTest.tenantDB(tenantId, "testDB");
+const collName = "testColl";
+
+const donorPrimary = tenantMigrationTest.getDonorPrimary();
+const recipientPrimary = tenantMigrationTest.getRecipientPrimary();
+const donorRst = tenantMigrationTest.getDonorRst();
+const donorTestColl = donorPrimary.getDB(dbName).getCollection(collName);
+
+// Populate the donor replica set with some initial data and make sure it is majority committed.
+const majorityCommittedDocs = [{_id: 0, x: 0}, {_id: 1, x: 1}];
+assert.commandWorked(donorTestColl.insert(majorityCommittedDocs, {writeConcern: {w: "majority"}}));
+assert.eq(2, donorTestColl.find().readConcern("majority").itcount());
+
+const migrationId = UUID();
+const migrationOpts = {
+ migrationIdString: extractUUIDFromObject(migrationId),
+ recipientConnString: tenantMigrationTest.getRecipientConnString(),
+ tenantId,
+};
+
+// Configure fail point to have the recipient primary hang after the cloner completes and the oplog
+// applier has started.
+let waitAfterDatabaseClone = configureFailPoint(
+ recipientPrimary, "fpAfterStartingOplogApplierMigrationRecipientInstance", {action: "hang"});
+// Configure fail point to hang the tenant oplog applier after it applies the first batch.
+let waitInOplogApplier = configureFailPoint(recipientPrimary, "hangInTenantOplogApplication");
+
+// Start a migration and wait for the recipient to hang after the tenant database cloner completes.
+const donorRstArgs = createRstArgs(donorRst);
+const migrationThread = new Thread(runMigrationAsync, migrationOpts, donorRstArgs);
+migrationThread.start();
+waitAfterDatabaseClone.wait();
+
+// Insert some writes that will eventually be picked up by the tenant oplog applier on the
+// recipient.
+const docsToApply = [{_id: 2, x: 2}, {_id: 3, x: 3}, {_id: 4, x: 4}];
+tenantMigrationTest.insertDonorDB(dbName, collName, docsToApply);
+
+// Wait for the applied oplog batch to be replicated.
+waitInOplogApplier.wait();
+recipientRst.awaitReplication();
+let local = recipientPrimary.getDB("local");
+let appliedNoOps = local.oplog.rs.find({fromTenantMigration: migrationId, op: "n"});
+let resultsArr = appliedNoOps.toArray();
+// It is possible that the first batch applied includes a resume no-op token. We do not write no-op
+// entries for resume token entries in tenant migrations.
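+// Hence, with tenantApplierBatchSizeOps set to 2 for this test, the first applied batch should
+// yield either one or two migration no-op entries, depending on whether it included a resume token.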
+assert.gt(appliedNoOps.count(), 0, resultsArr);
+assert.lte(appliedNoOps.count(), 2, resultsArr);
+assert.eq(docsToApply[0], resultsArr[0].o2.o, resultsArr);
+if (appliedNoOps.count() === 2) {
+ assert.eq(docsToApply[1], resultsArr[1].o2.o, resultsArr);
+}
+// Step up a new node in the recipient set and trigger a failover. The new primary should resume
+// oplog application starting from the unapplied entries.
+const newRecipientPrimary = recipientRst.getSecondaries()[0];
+recipientRst.stepUp(newRecipientPrimary);
+waitAfterDatabaseClone.off();
+waitInOplogApplier.off();
+recipientRst.getPrimary();
+
+// The migration should go through after recipient failover.
+TenantMigrationTest.assertCommitted(migrationThread.returnData());
+// Validate that the last no-op entry is applied.
+local = newRecipientPrimary.getDB("local");
+appliedNoOps = local.oplog.rs.find({fromTenantMigration: migrationId, op: "n"});
+resultsArr = appliedNoOps.toArray();
+assert.eq(3, appliedNoOps.count(), appliedNoOps);
+assert.eq(docsToApply[2], resultsArr[2].o2.o, resultsArr);
+
+checkTenantDBHashes({
+ donorRst: tenantMigrationTest.getDonorRst(),
+ recipientRst: tenantMigrationTest.getRecipientRst(),
+ tenantId
+});
+tenantMigrationTest.stop();
+recipientRst.stopSet();
diff --git a/jstests/replsets/tenant_migration_retryable_write_retry_on_recipient.js b/jstests/replsets/tenant_migration_retryable_write_retry_on_recipient.js
index a9ee61f0993..245673daa62 100644
--- a/jstests/replsets/tenant_migration_retryable_write_retry_on_recipient.js
+++ b/jstests/replsets/tenant_migration_retryable_write_retry_on_recipient.js
@@ -186,7 +186,6 @@ assert.commandWorked(
jsTest.log("Waiting for migration to complete");
waitBeforeFetchingTransactions.off();
TenantMigrationTest.assertCommitted(migrationThread.returnData());
-tenantMigrationTest.forgetMigration(migrationOpts.migrationIdString);
// Print the no-op oplog entries for debugging purposes.
jsTestLog("Recipient oplog migration entries.");
@@ -263,7 +262,6 @@ function testRecipientRetryableWrites(db, writes) {
jsTestLog("Run retryable write on primary after the migration");
testRecipientRetryableWrites(recipientDb, beforeWrites);
testRecipientRetryableWrites(recipientDb, duringWrites);
-
jsTestLog("Step up secondary");
const recipientRst = tenantMigrationTest.getRecipientRst();
recipientRst.stepUp(recipientRst.getSecondary());
@@ -271,6 +269,8 @@ jsTestLog("Run retryable write on secondary after the migration");
testRecipientRetryableWrites(recipientRst.getPrimary().getDB(kDbName), beforeWrites);
testRecipientRetryableWrites(recipientRst.getPrimary().getDB(kDbName), duringWrites);
+tenantMigrationTest.forgetMigration(migrationOpts.migrationIdString);
+
jsTestLog("Trying a back-to-back migration");
const tenantMigrationTest2 = new TenantMigrationTest(
{name: jsTestName() + "2", donorRst: tenantMigrationTest.getRecipientRst()});
diff --git a/jstests/replsets/tenant_migration_timeseries_retryable_write_retry_on_recipient.js b/jstests/replsets/tenant_migration_timeseries_retryable_write_retry_on_recipient.js
index b5a466046f8..4a3475178f5 100644
--- a/jstests/replsets/tenant_migration_timeseries_retryable_write_retry_on_recipient.js
+++ b/jstests/replsets/tenant_migration_timeseries_retryable_write_retry_on_recipient.js
@@ -105,7 +105,6 @@ function testRetryOnRecipient(ordered) {
jsTest.log("Waiting for migration to complete");
pauseTenantMigrationBeforeLeavingDataSyncState.off();
TenantMigrationTest.assertCommitted(migrationThread.returnData());
- tenantMigrationTest.forgetMigration(migrationOpts.migrationIdString);
// Print the no-op oplog entries for debugging purposes.
jsTestLog("Recipient oplog migration entries.");
@@ -133,6 +132,8 @@ function testRetryOnRecipient(ordered) {
testRecipientRetryableWrites(recipientRst.getPrimary().getDB(kDbName), beforeWrites);
testRecipientRetryableWrites(recipientRst.getPrimary().getDB(kDbName), duringWrites);
+ tenantMigrationTest.forgetMigration(migrationOpts.migrationIdString);
+
jsTestLog("Trying a back-to-back migration");
const tenantMigrationTest2 = new TenantMigrationTest(
{name: jsTestName() + "2", donorRst: tenantMigrationTest.getRecipientRst()});