diff options
author | Christopher Caplinger <christopher.caplinger@mongodb.com> | 2023-04-17 13:37:45 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2023-04-17 14:44:05 +0000 |
commit | f8968230334cca0e504035188445697fcc8088cf (patch) | |
tree | 80eb2eab4753c88b67c7535616d6f8439ed83915 /jstests/replsets | |
parent | 408cbcd9802208741d5a4d96e2b52aedd97ce50f (diff) | |
download | mongo-f8968230334cca0e504035188445697fcc8088cf.tar.gz |
SERVER-75990: Tenant Migrations are not resilient to recipient failover
Diffstat (limited to 'jstests/replsets')
18 files changed, 173 insertions, 2441 deletions
diff --git a/jstests/replsets/tenant_migration_cloner_stats_with_failover.js b/jstests/replsets/tenant_migration_cloner_stats_with_failover.js deleted file mode 100644 index 77c98dea613..00000000000 --- a/jstests/replsets/tenant_migration_cloner_stats_with_failover.js +++ /dev/null @@ -1,146 +0,0 @@ -/** - * Tests tenant migration cloner stats such as 'approxTotalDataSize', 'approxTotalBytesCopied', - * 'databasesClonedBeforeFailover' across multiple databases and collections with failovers. - * - * This test does the following: - * 1. Insert two databases on the donor. The first database consists of one collection, the second - * consists of two collections. - * 2. Wait for the primary (referred to as the original primary) to clone one batch from the second - * database's second collection. - * 3. Step up the new primary. Ensure that the stats such as 'databasesClonedBeforeFailover' tally. - * 4. Allow the tenant migration to complete and commit. Ensure that stats are sensible. - * - * @tags: [ - * incompatible_with_macos, - * incompatible_with_shard_merge, - * incompatible_with_windows_tls, - * requires_majority_read_concern, - * requires_persistence, - * serverless, - * incompatible_with_shard_merge, - * ] - */ - -import {TenantMigrationTest} from "jstests/replsets/libs/tenant_migration_test.js"; -load("jstests/libs/uuid_util.js"); // For extractUUIDFromObject(). -load("jstests/libs/fail_point_util.js"); // For configureFailPoint(). - -// Limit the batch size to test the stat in between batches. -const tenantMigrationTest = new TenantMigrationTest( - {name: jsTestName(), sharedOptions: {setParameter: {collectionClonerBatchSize: 10}}}); - -const kMigrationId = UUID(); -const kTenantId = ObjectId().str; -const kReadPreference = { - mode: "primary" -}; -const migrationOpts = { - migrationIdString: extractUUIDFromObject(kMigrationId), - tenantId: kTenantId, - readPreference: kReadPreference -}; - -const dbName = tenantMigrationTest.tenantDB(kTenantId, "testDB"); -const collName = "coll"; -const dbName1 = dbName + '_db_1'; -const dbName2 = dbName + '_db_2'; -const db2Coll1 = collName + "_db_2_1"; -const db2Coll2 = collName + "_db_2_2"; - -// Add a large amount of data to the donor. -jsTestLog("Adding data to donor."); -const dataForEachCollection = [...Array(100).keys()].map((i) => ({a: i, b: 'metanoia'})); -tenantMigrationTest.insertDonorDB(dbName1, collName + "_1", dataForEachCollection); -tenantMigrationTest.insertDonorDB(dbName2, db2Coll1, dataForEachCollection); -tenantMigrationTest.insertDonorDB(dbName2, db2Coll2, dataForEachCollection); - -const originalRecipientPrimary = tenantMigrationTest.getRecipientPrimary(); -const newRecipientPrimary = tenantMigrationTest.getRecipientRst().getSecondaries()[0]; - -jsTestLog("Collecting the stats of the databases and collections from the donor."); -const donorPrimary = tenantMigrationTest.getDonorPrimary(); -const donorDB2 = donorPrimary.getDB(dbName2); - -const db1Size = assert.commandWorked(donorPrimary.getDB(dbName1).runCommand({dbStats: 1})).dataSize; -const db2Size = assert.commandWorked(donorDB2.runCommand({dbStats: 1})).dataSize; -const db2Collection1Size = assert.commandWorked(donorDB2.runCommand({collStats: db2Coll1})).size; -const db2Collection2Size = assert.commandWorked(donorDB2.runCommand({collStats: db2Coll2})).size; - -const donorStats = { - db1Size, - db2Size, - db2Collection1Size, - db2Collection2Size -}; -jsTestLog("Collected the following stats on the donor: " + tojson(donorStats)); - -// The last collection to be cloned is the one with a greater UUID. -const collInfo = donorDB2.getCollectionInfos(); -const uuid1 = collInfo[0].info.uuid; -const uuid2 = collInfo[1].info.uuid; -const lastCollection = (uuid1 > uuid2) ? db2Coll1 : db2Coll2; - -// Create a failpoint to pause after one batch of the second database's second collection has been -// cloned. -const fpAfterBatchOfSecondDB = configureFailPoint( - originalRecipientPrimary, - "tenantMigrationHangCollectionClonerAfterHandlingBatchResponse", - {nss: originalRecipientPrimary.getDB(dbName2).getCollection(lastCollection).getFullName()}); - -jsTestLog("Starting tenant migration with migrationId: " + kMigrationId + - ", tenantId: " + kTenantId); -assert.commandWorked(tenantMigrationTest.startMigration(migrationOpts)); - -let res = 0; -let currOp = 0; -jsTestLog("Waiting until one batch of second database has been cloned by original primary."); -fpAfterBatchOfSecondDB.wait(); -// Since documents are inserted on a separate thread, wait until the expected stats are seen. The -// failpoint needs to be maintained so that the next batch isn't processed. -assert.soon(() => { - res = originalRecipientPrimary.adminCommand( - {currentOp: true, desc: "tenant recipient migration"}); - currOp = res.inprog[0]; - - // Wait until one batch of documents of the second database's second collection has been copied. - return currOp.approxTotalBytesCopied > db1Size + db2Collection1Size; -}, res); - -assert.eq(currOp.approxTotalDataSize, db1Size + db2Size, res); -// Since the two collections on the second database are the same size, -// 'db1Size + db2Collection1Size' and 'db1Size + db2Collection2Size' evaluate to the same value. -assert.gt(currOp.approxTotalBytesCopied, db1Size + db2Collection1Size, res); -assert.lt(currOp.approxTotalBytesCopied, db1Size + db2Size, res); -assert.eq(currOp.databases.databasesClonedBeforeFailover, 0, res); -assert.eq(currOp.databases[dbName2].clonedCollectionsBeforeFailover, 0, res); -const bytesCopiedIncludingSecondDB = currOp.approxTotalBytesCopied; -jsTestLog("Bytes copied after first batch of second database: " + bytesCopiedIncludingSecondDB); - -// Wait until the batch of the second collection of the second database has been replicated from the -// original primary to the new primary. Then, step up the new primary. -const fpAfterCreatingCollectionOfSecondDB = - configureFailPoint(newRecipientPrimary, "tenantCollectionClonerHangAfterCreateCollection"); -tenantMigrationTest.getRecipientRst().stepUp(newRecipientPrimary); -fpAfterBatchOfSecondDB.off(); - -jsTestLog("Wait until the new primary creates collection of second database."); -fpAfterCreatingCollectionOfSecondDB.wait(); -res = newRecipientPrimary.adminCommand({currentOp: true, desc: "tenant recipient migration"}); -currOp = res.inprog[0]; -assert.eq(currOp.approxTotalDataSize, db1Size + db2Size, res); -assert.eq(currOp.approxTotalBytesCopied, bytesCopiedIncludingSecondDB, res); -assert.eq(currOp.databases.databasesClonedBeforeFailover, 1, res); -assert.eq(currOp.databases[dbName2].clonedCollectionsBeforeFailover, 1, res); -fpAfterCreatingCollectionOfSecondDB.off(); - -// After the migration completes, the total bytes copied should be equal to the total data size. -jsTestLog("Waiting for migration to complete."); -TenantMigrationTest.assertCommitted(tenantMigrationTest.waitForMigrationToComplete(migrationOpts)); -res = newRecipientPrimary.adminCommand({currentOp: true, desc: "tenant recipient migration"}); -currOp = res.inprog[0]; -assert.eq(currOp.approxTotalDataSize, db1Size + db2Size, res); -assert.eq(currOp.approxTotalBytesCopied, db1Size + db2Size, res); -assert.eq(currOp.databases.databasesClonedBeforeFailover, 1, res); -assert.eq(currOp.databases[dbName2].clonedCollectionsBeforeFailover, 1, res); - -tenantMigrationTest.stop(); diff --git a/jstests/replsets/tenant_migration_cluster_time_keys_cloning.js b/jstests/replsets/tenant_migration_cluster_time_keys_cloning.js index f3737dd4706..f3ab1df7e24 100644 --- a/jstests/replsets/tenant_migration_cluster_time_keys_cloning.js +++ b/jstests/replsets/tenant_migration_cluster_time_keys_cloning.js @@ -177,55 +177,58 @@ const migrationX509Options = makeX509OptionsForTest(); tenantMigrationTest.stop(); })(); -(() => { - jsTest.log("Test that the donor and recipient correctly copy each other's cluster time keys " + - "when there is recipient failover."); - const recipientRst = new ReplSetTest({ - nodes: 3, - name: "recipientRst", - serverless: true, - nodeOptions: migrationX509Options.recipient - }); - recipientRst.startSet(); - recipientRst.initiate(); - if (isShardMergeEnabled(recipientRst.getPrimary().getDB("adminDB"))) { - jsTestLog("Skip: featureFlagShardMerge enabled, but shard merge does not survive failover"); - recipientRst.stopSet(); - return; - } - - const tenantMigrationTest = new TenantMigrationTest({name: jsTestName(), recipientRst}); - - const recipientPrimary = recipientRst.getPrimary(); - const fp = configureFailPoint(recipientPrimary, - "fpAfterPersistingTenantMigrationRecipientInstanceStateDoc", - {action: "hang"}); - - const migrationId = UUID(); - const migrationOpts = { - migrationIdString: extractUUIDFromObject(migrationId), - tenantId: kTenantId1, - }; - assert.commandWorked(tenantMigrationTest.startMigration(migrationOpts)); - fp.wait(); - - assert.commandWorked( - recipientPrimary.adminCommand({replSetStepDown: ReplSetTest.kForeverSecs, force: true})); - assert.commandWorked(recipientPrimary.adminCommand({replSetFreeze: 0})); - - fp.off(); - TenantMigrationTest.assertCommitted(tenantMigrationTest.waitForMigrationToComplete( - migrationOpts, true /* retryOnRetryableErrors */)); - - assertCopiedExternalKeys(tenantMigrationTest, migrationId); - - // After another migration, the first's keys should still exist. - runMigrationAndAssertExternalKeysCopied(tenantMigrationTest, kTenantId2); - assertCopiedExternalKeys(tenantMigrationTest, migrationId); - - recipientRst.stopSet(); - tenantMigrationTest.stop(); -})(); +// TODO SERVER-76128: Tenant Migrations are not robust to recipient failover. +// (() => { +// jsTest.log("Test that the donor and recipient correctly copy each other's cluster time keys " +// + +// "when there is recipient failover."); +// const recipientRst = new ReplSetTest({ +// nodes: 3, +// name: "recipientRst", +// serverless: true, +// nodeOptions: migrationX509Options.recipient +// }); +// recipientRst.startSet(); +// recipientRst.initiate(); +// if (isShardMergeEnabled(recipientRst.getPrimary().getDB("adminDB"))) { +// jsTestLog("Skip: featureFlagShardMerge enabled, but shard merge does not survive +// failover"); +// recipientRst.stopSet(); +// return; +// } + +// const tenantMigrationTest = new TenantMigrationTest({name: jsTestName(), recipientRst}); + +// const recipientPrimary = recipientRst.getPrimary(); +// const fp = configureFailPoint(recipientPrimary, +// "fpAfterPersistingTenantMigrationRecipientInstanceStateDoc", +// {action: "hang"}); + +// const migrationId = UUID(); +// const migrationOpts = { +// migrationIdString: extractUUIDFromObject(migrationId), +// tenantId: kTenantId1, +// }; +// assert.commandWorked(tenantMigrationTest.startMigration(migrationOpts)); +// fp.wait(); + +// assert.commandWorked( +// recipientPrimary.adminCommand({replSetStepDown: ReplSetTest.kForeverSecs, force: true})); +// assert.commandWorked(recipientPrimary.adminCommand({replSetFreeze: 0})); + +// fp.off(); +// TenantMigrationTest.assertCommitted(tenantMigrationTest.waitForMigrationToComplete( +// migrationOpts, true /* retryOnRetryableErrors */)); + +// assertCopiedExternalKeys(tenantMigrationTest, migrationId); + +// // After another migration, the first's keys should still exist. +// runMigrationAndAssertExternalKeysCopied(tenantMigrationTest, kTenantId2); +// assertCopiedExternalKeys(tenantMigrationTest, migrationId); + +// recipientRst.stopSet(); +// tenantMigrationTest.stop(); +// })(); (() => { jsTest.log("Test that the donor waits for copied external keys to replicate to every node"); diff --git a/jstests/replsets/tenant_migration_donor_resume_on_stepup_and_restart.js b/jstests/replsets/tenant_migration_donor_resume_on_stepup_and_restart.js deleted file mode 100644 index b3b158c7134..00000000000 --- a/jstests/replsets/tenant_migration_donor_resume_on_stepup_and_restart.js +++ /dev/null @@ -1,488 +0,0 @@ -/** - * Tests that tenant migrations resume successfully on donor stepup and restart. - * - * Incompatible with shard merge, which can't handle restart. - * - * @tags: [ - * incompatible_with_macos, - * incompatible_with_shard_merge, - * incompatible_with_windows_tls, - * # Some tenant migration statistics field names were changed in 6.1. - * requires_fcv_61, - * requires_majority_read_concern, - * requires_persistence, - * # Tenant migrations are only used in serverless. - * serverless, - * ] - */ - -import {TenantMigrationTest} from "jstests/replsets/libs/tenant_migration_test.js"; -import { - forgetMigrationAsync, - isShardMergeEnabled, - makeX509OptionsForTest, - runMigrationAsync, - tryAbortMigrationAsync -} from "jstests/replsets/libs/tenant_migration_util.js"; - -load("jstests/libs/fail_point_util.js"); -load("jstests/libs/parallelTester.js"); -load("jstests/libs/uuid_util.js"); -load("jstests/replsets/rslib.js"); // 'createRstArgs' - -const kMaxSleepTimeMS = 100; -const kTenantId = ObjectId().str; -const kMigrationFpNames = [ - "pauseTenantMigrationBeforeLeavingDataSyncState", - "pauseTenantMigrationBeforeLeavingBlockingState", - "abortTenantMigrationBeforeLeavingBlockingState", - "" -]; - -// Set the delay before a state doc is garbage collected to be short to speed up the test but long -// enough for the state doc to still be around after stepup or restart. -const kGarbageCollectionDelayMS = 30 * 1000; - -// Set the TTL monitor to run at a smaller interval to speed up the test. -const kTTLMonitorSleepSecs = 1; - -const migrationX509Options = makeX509OptionsForTest(); - -/** - * Runs the donorStartMigration command to start a migration, and interrupts the migration on the - * donor using the 'interruptFunc', and asserts that migration eventually commits. - */ -function testDonorStartMigrationInterrupt(interruptFunc, - {donorRestarted = false, disableForShardMerge = true}) { - const donorRst = new ReplSetTest( - {nodes: 3, name: "donorRst", serverless: true, nodeOptions: migrationX509Options.donor}); - - donorRst.startSet(); - donorRst.initiate(); - - const tenantMigrationTest = new TenantMigrationTest({name: jsTestName(), donorRst}); - - let donorPrimary = tenantMigrationTest.getDonorPrimary(); - const recipientPrimary = tenantMigrationTest.getRecipientPrimary(); - - if (disableForShardMerge && isShardMergeEnabled(recipientPrimary.getDB("admin"))) { - jsTest.log("Skipping test for shard merge"); - tenantMigrationTest.stop(); - donorRst.stopSet(); - return; - } - - const migrationId = UUID(); - const migrationOpts = { - migrationIdString: extractUUIDFromObject(migrationId), - tenantId: kTenantId, - recipientConnString: tenantMigrationTest.getRecipientConnString(), - }; - const donorRstArgs = createRstArgs(donorRst); - - const runMigrationThread = - new Thread(runMigrationAsync, migrationOpts, donorRstArgs, {retryOnRetryableErrors: true}); - runMigrationThread.start(); - - // Wait for donorStartMigration command to start. - assert.soon(() => donorPrimary.adminCommand({currentOp: true, desc: "tenant donor migration"}) - .inprog.length > 0); - - sleep(Math.random() * kMaxSleepTimeMS); - interruptFunc(donorRst); - - TenantMigrationTest.assertCommitted(runMigrationThread.returnData()); - tenantMigrationTest.waitForDonorNodesToReachState(donorRst.nodes, - migrationId, - migrationOpts.tenantId, - TenantMigrationTest.DonorState.kCommitted); - assert.commandWorked(tenantMigrationTest.forgetMigration(migrationOpts.migrationIdString)); - - donorPrimary = tenantMigrationTest.getDonorPrimary(); // Could change after interrupt. - const donorStats = tenantMigrationTest.getTenantMigrationStats(donorPrimary); - jsTestLog(`Stats at the donor primary: ${tojson(donorStats)}`); - if (donorRestarted) { - // If full restart happened the count could be lost completely. - assert.gte(1, donorStats.totalMigrationDonationsCommitted); - } else { - // The double counting happens when the failover happens after migration completes - // but before the state doc GC mark is persisted. While this test is targeting this - // scenario it is low probability in production. - assert(1 == donorStats.totalMigrationDonationsCommitted || - 2 == donorStats.totalMigrationDonationsCommitted); - } - // Skip checking the stats on the recipient since enableRecipientTesting is false - // so the recipient is forced to respond to recipientSyncData without starting the - // migration. - - tenantMigrationTest.stop(); - donorRst.stopSet(); -} - -/** - * Starts a migration and waits for it to commit, then runs the donorForgetMigration, and interrupts - * the donor using the 'interruptFunc', and asserts that the migration state is eventually garbage - * collected. - */ -function testDonorForgetMigrationInterrupt(interruptFunc) { - const donorRst = new ReplSetTest({ - nodes: 3, - name: "donorRst", - serverless: true, - nodeOptions: Object.assign({}, migrationX509Options.donor, { - setParameter: { - tenantMigrationGarbageCollectionDelayMS: kGarbageCollectionDelayMS, - ttlMonitorSleepSecs: kTTLMonitorSleepSecs, - } - }) - }); - const recipientRst = new ReplSetTest({ - nodes: 1, - name: "recipientRst", - serverless: true, - nodeOptions: Object.assign({}, migrationX509Options.recipient, { - setParameter: { - tenantMigrationGarbageCollectionDelayMS: kGarbageCollectionDelayMS, - ttlMonitorSleepSecs: kTTLMonitorSleepSecs, - } - }) - }); - - donorRst.startSet(); - donorRst.initiate(); - - recipientRst.startSet(); - recipientRst.initiate(); - - const tenantMigrationTest = - new TenantMigrationTest({name: jsTestName(), donorRst, recipientRst}); - - const donorPrimary = tenantMigrationTest.getDonorPrimary(); - - const migrationId = UUID(); - const migrationOpts = { - migrationIdString: extractUUIDFromObject(migrationId), - tenantId: kTenantId, - recipientConnString: recipientRst.getURL(), - }; - const donorRstArgs = createRstArgs(donorRst); - - TenantMigrationTest.assertCommitted( - tenantMigrationTest.runMigration(migrationOpts, {automaticForgetMigration: false})); - const forgetMigrationThread = new Thread(forgetMigrationAsync, - migrationOpts.migrationIdString, - donorRstArgs, - true /* retryOnRetryableErrors */); - forgetMigrationThread.start(); - - // Wait for donorForgetMigration command to start. - assert.soon(() => { - const res = assert.commandWorked( - donorPrimary.adminCommand({currentOp: true, desc: "tenant donor migration"})); - return res.inprog[0].expireAt != null; - }); - sleep(Math.random() * kMaxSleepTimeMS); - interruptFunc(donorRst); - - assert.commandWorkedOrFailedWithCode( - tenantMigrationTest.forgetMigration(migrationOpts.migrationIdString), - ErrorCodes.NoSuchTenantMigration); - - assert.commandWorked(forgetMigrationThread.returnData()); - tenantMigrationTest.waitForMigrationGarbageCollection(migrationId, migrationOpts.tenantId); - - tenantMigrationTest.stop(); - donorRst.stopSet(); - recipientRst.stopSet(); -} - -/** - * Starts a migration and sets the passed in failpoint, then runs the donorAbortMigration, and - * interrupts the donor using the 'interruptFunc', and asserts that the migration state is - * eventually garbage collected. - */ -function testDonorAbortMigrationInterrupt( - interruptFunc, fpName, {fpWaitBeforeAbort = false, isShutdown = false} = {}) { - const donorRst = new ReplSetTest({ - nodes: 3, - name: "donorRst", - serverless: true, - nodeOptions: Object.assign({}, migrationX509Options.donor, { - setParameter: { - tenantMigrationGarbageCollectionDelayMS: kGarbageCollectionDelayMS, - ttlMonitorSleepSecs: kTTLMonitorSleepSecs, - } - }) - }); - const recipientRst = new ReplSetTest({ - nodes: 1, - name: "recipientRst", - serverless: true, - nodeOptions: Object.assign({}, migrationX509Options.recipient, { - setParameter: { - tenantMigrationGarbageCollectionDelayMS: kGarbageCollectionDelayMS, - ttlMonitorSleepSecs: kTTLMonitorSleepSecs, - } - }) - }); - - donorRst.startSet(); - donorRst.initiate(); - - recipientRst.startSet(); - recipientRst.initiate(); - - const tenantMigrationTest = - new TenantMigrationTest({name: jsTestName(), donorRst, recipientRst}); - - const migrationId = UUID(); - const migrationOpts = { - migrationIdString: extractUUIDFromObject(migrationId), - tenantId: kTenantId, - recipientConnString: recipientRst.getURL(), - }; - const donorRstArgs = createRstArgs(donorRst); - let donorPrimary = tenantMigrationTest.getDonorPrimary(); - - // If we passed in a valid failpoint we set it, otherwise we let the migration run normally. - let fp; - if (fpName) { - fp = configureFailPoint(donorPrimary, fpName); - } - - assert.commandWorked(tenantMigrationTest.startMigration(migrationOpts)); - - if (fp && !isShutdown && fpWaitBeforeAbort) { - fp.wait(); - } - - const tryAbortThread = new Thread(tryAbortMigrationAsync, - {migrationIdString: migrationOpts.migrationIdString}, - donorRstArgs, - true /* retryOnRetryableErrors */); - tryAbortThread.start(); - - // Wait for donorAbortMigration command to start. - assert.soon(() => { - const res = assert.commandWorked( - donorPrimary.adminCommand({currentOp: true, desc: "tenant donor migration"})); - return res.inprog[0].receivedCancellation; - }); - - interruptFunc(donorRst); - - if (fp && !isShutdown) { - // Turn off failpoint in order to allow the migration to resume after stepup. - fp.off(); - } - - tryAbortThread.join(); - - let res = tryAbortThread.returnData(); - assert.commandWorkedOrFailedWithCode(res, ErrorCodes.TenantMigrationCommitted); - - donorPrimary = tenantMigrationTest.getDonorPrimary(); - let configDonorsColl = donorPrimary.getCollection(TenantMigrationTest.kConfigDonorsNS); - let donorDoc = configDonorsColl.findOne({tenantId: kTenantId}); - - if (!res.ok) { - assert.eq(donorDoc.state, TenantMigrationTest.DonorState.kCommitted); - } else { - assert.eq(donorDoc.state, TenantMigrationTest.DonorState.kAborted); - } - - tenantMigrationTest.stop(); - donorRst.stopSet(); - recipientRst.stopSet(); -} - -/** - * Starts a migration and sets the passed in failpoint, then either waits for the failpoint or lets - * the migration run successfully and interrupts the donor using the 'interruptFunc'. After - * restarting, check the to see if the donorDoc data has persisted. - */ -function testStateDocPersistenceOnFailover(interruptFunc, fpName, isShutdown = false) { - const donorRst = new ReplSetTest( - {nodes: 3, name: "donorRst", serverless: true, nodeOptions: migrationX509Options.donor}); - - donorRst.startSet(); - donorRst.initiate(); - - const tenantMigrationTest = new TenantMigrationTest({name: jsTestName(), donorRst}); - - const migrationId = UUID(); - const migrationOpts = { - migrationIdString: extractUUIDFromObject(migrationId), - tenantId: kTenantId, - recipientConnString: tenantMigrationTest.getRecipientConnString(), - }; - let donorPrimary = tenantMigrationTest.getDonorPrimary(); - - // If we passed in a valid failpoint we set it, otherwise we let the migration run normally. - let fp; - if (fpName) { - fp = configureFailPoint(donorPrimary, fpName); - assert.commandWorked(tenantMigrationTest.startMigration(migrationOpts)); - fp.wait(); - } else { - TenantMigrationTest.assertCommitted(tenantMigrationTest.runMigration(migrationOpts)); - } - - let configDonorsColl = donorPrimary.getCollection(TenantMigrationTest.kConfigDonorsNS); - let donorDocBeforeFailover = configDonorsColl.findOne({tenantId: kTenantId}); - - interruptFunc(tenantMigrationTest.getDonorRst()); - - if (fp && !isShutdown) { - // Turn off failpoint in order to allow the migration to resume after stepup. - fp.off(); - } - - donorPrimary = tenantMigrationTest.getDonorPrimary(); - configDonorsColl = donorPrimary.getCollection(TenantMigrationTest.kConfigDonorsNS); - let donorDocAfterFailover = configDonorsColl.findOne({tenantId: kTenantId}); - - // Check persisted fields in the donor doc. - assert.eq(donorDocBeforeFailover._id, donorDocAfterFailover._id); - assert.eq(donorDocBeforeFailover.recipientConnString, - donorDocAfterFailover.recipientConnString); - assert.eq(donorDocBeforeFailover.readPreference, donorDocAfterFailover.readPreference); - assert.eq(donorDocBeforeFailover.startMigrationDonorTimestamp, - donorDocAfterFailover.startMigrationDonorTimestamp); - assert.eq(donorDocBeforeFailover.migration, donorDocAfterFailover.migration); - assert.eq(donorDocBeforeFailover.tenantId, donorDocAfterFailover.tenantId); - assert.eq(donorDocBeforeFailover.donorCertificateForRecipient, - donorDocAfterFailover.donorCertificateForRecipient); - assert.eq(donorDocBeforeFailover.recipientCertificateForDonor, - donorDocAfterFailover.recipientCertificateForDonor); - assert.eq(donorDocBeforeFailover.migrationStart, donorDocAfterFailover.migrationStart); - - tenantMigrationTest.stop(); - donorRst.stopSet(); -} - -(() => { - jsTest.log("Test that the migration resumes on stepup"); - testDonorStartMigrationInterrupt((donorRst) => { - // Force the primary to step down but make it likely to step back up. - const donorPrimary = donorRst.getPrimary(); - assert.commandWorked( - donorPrimary.adminCommand({replSetStepDown: ReplSetTest.kForeverSecs, force: true})); - assert.commandWorked(donorPrimary.adminCommand({replSetFreeze: 0})); - }, {donorRestarted: false}); -})(); - -(() => { - jsTest.log("Test that the migration resumes after restart"); - testDonorStartMigrationInterrupt((donorRst) => { - // Skip validation on shutdown because the full validation can conflict with the tenant - // migration and cause it to fail. - donorRst.stopSet(null /* signal */, true /*forRestart */, {skipValidation: true}); - donorRst.startSet({restart: true}); - }, {donorRestarted: true, disableForShardMerge: true}); -})(); - -(() => { - jsTest.log("Test that the donorForgetMigration command can be retried on stepup"); - testDonorForgetMigrationInterrupt((donorRst) => { - // Force the primary to step down but make it likely to step back up. - const donorPrimary = donorRst.getPrimary(); - assert.commandWorked( - donorPrimary.adminCommand({replSetStepDown: ReplSetTest.kForeverSecs, force: true})); - assert.commandWorked(donorPrimary.adminCommand({replSetFreeze: 0})); - }); -})(); - -(() => { - jsTest.log("Test that the donorForgetMigration command can be retried after restart"); - testDonorForgetMigrationInterrupt((donorRst) => { - // Skip validation on shutdown because the full validation can conflict with the tenant - // migration and cause it to fail. - donorRst.stopSet(null /* signal */, true /*forRestart */, {skipValidation: true}); - donorRst.startSet({restart: true}); - }); -})(); - -(() => { - jsTest.log("Test that the donorAbortMigration command can be retried after restart"); - - kMigrationFpNames.forEach(fpName => { - if (!fpName) { - jsTest.log("Testing without setting a failpoint."); - } else { - jsTest.log("Testing with failpoint: " + fpName); - } - - testDonorAbortMigrationInterrupt((donorRst) => { - // Skip validation on shutdown because the full validation can conflict with the tenant - // migration and cause it to fail. - donorRst.stopSet(null /* signal */, true /*forRestart */, {skipValidation: true}); - donorRst.startSet({restart: true}); - }, fpName, {isShutdown: true}); - }); -})(); - -(() => { - jsTest.log( - "Test that the donorAbortMigration command fails if issued after state == kCommitted"); - - testDonorAbortMigrationInterrupt((donorRst) => {}, - "pauseTenantMigrationAfterUpdatingToCommittedState", - {fpWaitBeforeAbort: true}); -})(); - -(() => { - jsTest.log("Test that the donorAbortMigration command can be retried on stepup"); - kMigrationFpNames.forEach(fpName => { - if (!fpName) { - jsTest.log("Testing without setting a failpoint."); - } else { - jsTest.log("Testing with failpoint: " + fpName); - } - - testDonorAbortMigrationInterrupt((donorRst) => { - // Force the primary to step down but make it likely to step back up. - const donorPrimary = donorRst.getPrimary(); - assert.commandWorked(donorPrimary.adminCommand( - {replSetStepDown: ReplSetTest.kForeverSecs, force: true})); - assert.commandWorked(donorPrimary.adminCommand({replSetFreeze: 0})); - }, fpName); - }); -})(); - -(() => { - jsTest.log("Test stateDoc data persistence on restart."); - kMigrationFpNames.forEach(fpName => { - if (!fpName) { - jsTest.log("Testing without setting a failpoint."); - } else { - jsTest.log("Testing with failpoint: " + fpName); - } - - testStateDocPersistenceOnFailover((donorRst) => { - // Skip validation on shutdown because the full validation can conflict with the tenant - // migration and cause it to fail. - donorRst.stopSet(null /* signal */, true /*forRestart */, {skipValidation: true}); - donorRst.startSet({restart: true}); - }, fpName, true); - }); -})(); - -(() => { - jsTest.log("Test stateDoc data persistence on stepup."); - kMigrationFpNames.forEach(fpName => { - if (!fpName) { - jsTest.log("Testing without setting a failpoint."); - } else { - jsTest.log("Testing with failpoint: " + fpName); - } - - testStateDocPersistenceOnFailover((donorRst) => { - // Force the primary to step down but make it likely to step back up. - const donorPrimary = donorRst.getPrimary(); - assert.commandWorked(donorPrimary.adminCommand( - {replSetStepDown: ReplSetTest.kForeverSecs, force: true})); - assert.commandWorked(donorPrimary.adminCommand({replSetFreeze: 0})); - }, fpName); - }); -})(); diff --git a/jstests/replsets/tenant_migration_external_keys_ttl.js b/jstests/replsets/tenant_migration_external_keys_ttl.js index 28611f9abaf..172600f1d2a 100644 --- a/jstests/replsets/tenant_migration_external_keys_ttl.js +++ b/jstests/replsets/tenant_migration_external_keys_ttl.js @@ -286,38 +286,39 @@ function makeTestParams() { teardown(); } - jsTestLog("Recipient failover before receiving forgetMigration"); - { - const {tmt, teardown} = setup(); - const [tenantId, migrationId, migrationOpts] = makeTestParams(); - const recipientPrimary = tmt.getRecipientPrimary(); - const fp = configureFailPoint(recipientPrimary, - "fpAfterConnectingTenantMigrationRecipientInstance", - {action: "hang"}); - - assert.commandWorked(tmt.startMigration(migrationOpts)); - fp.wait(); - - assert.commandWorked(recipientPrimary.adminCommand( - {replSetStepDown: ReplSetTest.kForeverSecs, force: true})); - assert.commandWorked(recipientPrimary.adminCommand({replSetFreeze: 0})); - fp.off(); - - TenantMigrationTest.assertCommitted( - tmt.waitForMigrationToComplete(migrationOpts, true /* retryOnRetryableErrors */)); - - // The keys should have been created without a TTL deadline. - verifyExternalKeys(tmt.getDonorPrimary(), {migrationId, expectTTLValue: false}); - verifyExternalKeys(tmt.getRecipientPrimary(), {migrationId, expectTTLValue: false}); - - assert.commandWorked(tmt.forgetMigration(migrationOpts.migrationIdString)); - - // After running donorForgetMigration, the TTL value should be updated. The default TTL - // buffer is 1 day so the keys will not have been deleted. - verifyExternalKeys(tmt.getDonorPrimary(), {migrationId, expectTTLValue: true}); - verifyExternalKeys(tmt.getRecipientPrimary(), {migrationId, expectTTLValue: true}); - teardown(); - } + // TODO SERVER-76128: Tenant Migrations are not robust to recipient failover. + // jsTestLog("Recipient failover before receiving forgetMigration"); + // { + // const {tmt, teardown} = setup(); + // const [tenantId, migrationId, migrationOpts] = makeTestParams(); + // const recipientPrimary = tmt.getRecipientPrimary(); + // const fp = configureFailPoint(recipientPrimary, + // "fpAfterConnectingTenantMigrationRecipientInstance", + // {action: "hang"}); + + // assert.commandWorked(tmt.startMigration(migrationOpts)); + // fp.wait(); + + // assert.commandWorked(recipientPrimary.adminCommand( + // {replSetStepDown: ReplSetTest.kForeverSecs, force: true})); + // assert.commandWorked(recipientPrimary.adminCommand({replSetFreeze: 0})); + // fp.off(); + + // TenantMigrationTest.assertCommitted( + // tmt.waitForMigrationToComplete(migrationOpts, true /* retryOnRetryableErrors */)); + + // // The keys should have been created without a TTL deadline. + // verifyExternalKeys(tmt.getDonorPrimary(), {migrationId, expectTTLValue: false}); + // verifyExternalKeys(tmt.getRecipientPrimary(), {migrationId, expectTTLValue: false}); + + // assert.commandWorked(tmt.forgetMigration(migrationOpts.migrationIdString)); + + // // After running donorForgetMigration, the TTL value should be updated. The default TTL + // // buffer is 1 day so the keys will not have been deleted. + // verifyExternalKeys(tmt.getDonorPrimary(), {migrationId, expectTTLValue: true}); + // verifyExternalKeys(tmt.getRecipientPrimary(), {migrationId, expectTTLValue: true}); + // teardown(); + // } jsTestLog( "Donor failover after receiving forgetMigration before marking keys garbage collectable"); @@ -355,39 +356,42 @@ function makeTestParams() { teardown(); } - jsTestLog( - "Recipient failover after receiving forgetMigration before marking keys garbage collectable"); - { - const {tmt, donorRst, teardown} = setup(); - const [tenantId, migrationId, migrationOpts] = makeTestParams(); - const recipientPrimary = tmt.getRecipientPrimary(); - - assert.commandWorked(tmt.startMigration(migrationOpts)); - TenantMigrationTest.assertCommitted( - tmt.waitForMigrationToComplete(migrationOpts, true /* retryOnRetryableErrors */)); - - // The keys should have been created without a TTL deadline. - verifyExternalKeys(tmt.getDonorPrimary(), {migrationId, expectTTLValue: false}); - verifyExternalKeys(tmt.getRecipientPrimary(), {migrationId, expectTTLValue: false}); - - const fp = configureFailPoint( - recipientPrimary, "pauseTenantMigrationBeforeMarkingExternalKeysGarbageCollectable"); - const forgetMigrationThread = new Thread( - forgetMigrationAsync, migrationOpts.migrationIdString, createRstArgs(donorRst), true); - forgetMigrationThread.start(); - fp.wait(); - - assert.commandWorked(recipientPrimary.adminCommand( - {replSetStepDown: ReplSetTest.kForeverSecs, force: true})); - assert.commandWorked(recipientPrimary.adminCommand({replSetFreeze: 0})); - fp.off(); - - assert.commandWorked(forgetMigrationThread.returnData()); - - verifyExternalKeys(tmt.getDonorPrimary(), {migrationId, expectTTLValue: true}); - verifyExternalKeys(tmt.getRecipientPrimary(), {migrationId, expectTTLValue: true}); - teardown(); - } + // TODO SERVER-76128: Tenant Migrations are not robust to recipient failover. + // jsTestLog( + // "Recipient failover after receiving forgetMigration before marking keys garbage + // collectable"); + // { + // const {tmt, donorRst, teardown} = setup(); + // const [tenantId, migrationId, migrationOpts] = makeTestParams(); + // const recipientPrimary = tmt.getRecipientPrimary(); + + // assert.commandWorked(tmt.startMigration(migrationOpts)); + // TenantMigrationTest.assertCommitted( + // tmt.waitForMigrationToComplete(migrationOpts, true /* retryOnRetryableErrors */)); + + // // The keys should have been created without a TTL deadline. + // verifyExternalKeys(tmt.getDonorPrimary(), {migrationId, expectTTLValue: false}); + // verifyExternalKeys(tmt.getRecipientPrimary(), {migrationId, expectTTLValue: false}); + + // const fp = configureFailPoint( + // recipientPrimary, "pauseTenantMigrationBeforeMarkingExternalKeysGarbageCollectable"); + // const forgetMigrationThread = new Thread( + // forgetMigrationAsync, migrationOpts.migrationIdString, createRstArgs(donorRst), + // true); + // forgetMigrationThread.start(); + // fp.wait(); + + // assert.commandWorked(recipientPrimary.adminCommand( + // {replSetStepDown: ReplSetTest.kForeverSecs, force: true})); + // assert.commandWorked(recipientPrimary.adminCommand({replSetFreeze: 0})); + // fp.off(); + + // assert.commandWorked(forgetMigrationThread.returnData()); + + // verifyExternalKeys(tmt.getDonorPrimary(), {migrationId, expectTTLValue: true}); + // verifyExternalKeys(tmt.getRecipientPrimary(), {migrationId, expectTTLValue: true}); + // teardown(); + // } jsTestLog("Donor failover after receiving forgetMigration after updating keys."); { @@ -433,49 +437,53 @@ function makeTestParams() { teardown(); } - jsTestLog("Recipient failover after receiving forgetMigration after updating keys."); - { - const {tmt, donorRst, recipientRst, teardown} = setup(); - // this test expects the external keys to expire, so lower the expiration timeouts. - const lowerExternalKeysBufferSecs = 5; - const lowerStateDocExpirationMS = 500; - for (let conn of [...donorRst.nodes, ...recipientRst.nodes]) { - setTenantMigrationExpirationParams( - conn, lowerStateDocExpirationMS, lowerExternalKeysBufferSecs); - } - const [tenantId, migrationId, migrationOpts] = makeTestParams(); - const recipientPrimary = tmt.getRecipientPrimary(); - - assert.commandWorked(tmt.startMigration(migrationOpts)); - TenantMigrationTest.assertCommitted( - tmt.waitForMigrationToComplete(migrationOpts, true /* retryOnRetryableErrors */)); - - // The keys should have been created without a TTL deadline. - verifyExternalKeys(tmt.getDonorPrimary(), {migrationId, expectTTLValue: false}); - verifyExternalKeys(tmt.getRecipientPrimary(), {migrationId, expectTTLValue: false}); - - const fp = configureFailPoint( - recipientPrimary, "fpAfterReceivingRecipientForgetMigration", {action: "hang"}); - const forgetMigrationThread = new Thread( - forgetMigrationAsync, migrationOpts.migrationIdString, createRstArgs(donorRst), true); - forgetMigrationThread.start(); - fp.wait(); - - // Let the keys expire on the donor before the state document is deleted to verify retrying - // recipientForgetMigration can handle this case. The keys won't be deleted until the buffer - // expires, so sleep to avoid wasted work. - sleep((lowerExternalKeysBufferSecs * 1000) + lowerStateDocExpirationMS + 500); - waitForExternalKeysToBeDeleted(tmt.getRecipientPrimary(), migrationId); - - assert.commandWorked(recipientPrimary.adminCommand( - {replSetStepDown: ReplSetTest.kForeverSecs, force: true})); - assert.commandWorked(recipientPrimary.adminCommand({replSetFreeze: 0})); - fp.off(); - - assert.commandWorked(forgetMigrationThread.returnData()); - - // Eventually the donor's keys should be deleted too. - waitForExternalKeysToBeDeleted(tmt.getDonorPrimary(), migrationId); - teardown(); - } + // TODO SERVER-76128: Tenant Migrations are not robust to recipient failover. + // jsTestLog("Recipient failover after receiving forgetMigration after updating keys."); + // { + // const {tmt, donorRst, recipientRst, teardown} = setup(); + // // this test expects the external keys to expire, so lower the expiration timeouts. + // const lowerExternalKeysBufferSecs = 5; + // const lowerStateDocExpirationMS = 500; + // for (let conn of [...donorRst.nodes, ...recipientRst.nodes]) { + // setTenantMigrationExpirationParams( + // conn, lowerStateDocExpirationMS, lowerExternalKeysBufferSecs); + // } + // const [tenantId, migrationId, migrationOpts] = makeTestParams(); + // const recipientPrimary = tmt.getRecipientPrimary(); + + // assert.commandWorked(tmt.startMigration(migrationOpts)); + // TenantMigrationTest.assertCommitted( + // tmt.waitForMigrationToComplete(migrationOpts, true /* retryOnRetryableErrors */)); + + // // The keys should have been created without a TTL deadline. + // verifyExternalKeys(tmt.getDonorPrimary(), {migrationId, expectTTLValue: false}); + // verifyExternalKeys(tmt.getRecipientPrimary(), {migrationId, expectTTLValue: false}); + + // const fp = configureFailPoint( + // recipientPrimary, "fpAfterReceivingRecipientForgetMigration", {action: "hang"}); + // const forgetMigrationThread = new Thread( + // forgetMigrationAsync, migrationOpts.migrationIdString, createRstArgs(donorRst), + // true); + // forgetMigrationThread.start(); + // fp.wait(); + + // // Let the keys expire on the donor before the state document is deleted to verify + // retrying + // // recipientForgetMigration can handle this case. The keys won't be deleted until the + // buffer + // // expires, so sleep to avoid wasted work. + // sleep((lowerExternalKeysBufferSecs * 1000) + lowerStateDocExpirationMS + 500); + // waitForExternalKeysToBeDeleted(tmt.getRecipientPrimary(), migrationId); + + // assert.commandWorked(recipientPrimary.adminCommand( + // {replSetStepDown: ReplSetTest.kForeverSecs, force: true})); + // assert.commandWorked(recipientPrimary.adminCommand({replSetFreeze: 0})); + // fp.off(); + + // assert.commandWorked(forgetMigrationThread.returnData()); + + // // Eventually the donor's keys should be deleted too. + // waitForExternalKeysToBeDeleted(tmt.getDonorPrimary(), migrationId); + // teardown(); + // } })(); diff --git a/jstests/replsets/tenant_migration_recipient_failover_before_creating_oplog_buffer.js b/jstests/replsets/tenant_migration_recipient_failover_before_creating_oplog_buffer.js deleted file mode 100644 index 4cd9ae5a415..00000000000 --- a/jstests/replsets/tenant_migration_recipient_failover_before_creating_oplog_buffer.js +++ /dev/null @@ -1,54 +0,0 @@ -/** - * Tests whether the recipient returns an appropriate error code to the donor when the recipient - * primary is made to step down before creating the oplog buffer collection. - * - * @tags: [ - * incompatible_with_macos, - * incompatible_with_shard_merge, - * incompatible_with_windows_tls, - * requires_persistence, - * requires_replication, - * serverless, - * ] - */ - -import {TenantMigrationTest} from "jstests/replsets/libs/tenant_migration_test.js"; -load("jstests/libs/uuid_util.js"); // For extractUUIDFromObject(). -load("jstests/libs/fail_point_util.js"); // For configureFailPoint(). - -const tenantMigrationTest = - new TenantMigrationTest({name: jsTestName(), sharedOptions: {nodes: 2}}); - -const kMigrationId = UUID(); -const kTenantId = ObjectId().str; -const kReadPreference = { - mode: "primary" -}; -const migrationOpts = { - migrationIdString: extractUUIDFromObject(kMigrationId), - tenantId: kTenantId, - readPreference: kReadPreference -}; - -const fpBeforeCreatingOplogBuffer = - configureFailPoint(tenantMigrationTest.getRecipientPrimary(), - "fpAfterRetrievingStartOpTimesMigrationRecipientInstance", - {action: "hang"}); - -jsTestLog("Starting tenant migration with migrationId: " + kMigrationId + - ", tenantId: " + kTenantId); -assert.commandWorked(tenantMigrationTest.startMigration(migrationOpts)); - -jsTestLog("Waiting until the recipient primary is about to create an oplog buffer collection."); -fpBeforeCreatingOplogBuffer.wait(); - -jsTestLog("Stepping a new primary up."); -tenantMigrationTest.getRecipientRst().stepUp( - tenantMigrationTest.getRecipientRst().getSecondaries()[0]); - -fpBeforeCreatingOplogBuffer.off(); - -jsTestLog("Waiting for migration to complete."); -TenantMigrationTest.assertCommitted(tenantMigrationTest.waitForMigrationToComplete(migrationOpts)); - -tenantMigrationTest.stop(); diff --git a/jstests/replsets/tenant_migration_recipient_initial_sync_cloning.js b/jstests/replsets/tenant_migration_recipient_initial_sync_cloning.js deleted file mode 100644 index b0c58169b01..00000000000 --- a/jstests/replsets/tenant_migration_recipient_initial_sync_cloning.js +++ /dev/null @@ -1,199 +0,0 @@ -/** - * Tests that during tenant migration, a new recipient node's state document and in-memory state is - * initialized after initial sync, when 1) the node hasn't begun cloning data yet, 2) is cloning - * data, and 3) is in the tenant oplog application phase. - * - * @tags: [ - * incompatible_with_macos, - * incompatible_with_shard_merge, - * incompatible_with_windows_tls, - * requires_majority_read_concern, - * requires_persistence, - * serverless, - * incompatible_with_shard_merge, - * ] - */ - -import {TenantMigrationTest} from "jstests/replsets/libs/tenant_migration_test.js"; -import {makeX509OptionsForTest} from "jstests/replsets/libs/tenant_migration_util.js"; - -load("jstests/libs/fail_point_util.js"); -load("jstests/libs/uuid_util.js"); -load('jstests/replsets/rslib.js'); // for waitForNewlyAddedRemovalForNodeToBeCommitted - -const migrationX509Options = makeX509OptionsForTest(); - -const testDBName = 'testDB'; -const testCollName = 'testColl'; - -// Restarts a node, allows the node to go through initial sync, and then makes sure its state -// matches up with the primary's. Returns the initial sync node. -function restartNodeAndCheckState(tenantId, tenantMigrationTest, checkMtab) { - // Restart a node and allow it to complete initial sync. - const recipientRst = tenantMigrationTest.getRecipientRst(); - const originalRecipientPrimary = recipientRst.getPrimary(); - - jsTestLog("Restarting a node from the recipient replica set."); - let initialSyncNode = recipientRst.getSecondaries()[0]; - initialSyncNode = - recipientRst.restart(initialSyncNode, {startClean: true, skipValidation: true}); - - // Allow the new node to finish initial sync. - waitForNewlyAddedRemovalForNodeToBeCommitted(originalRecipientPrimary, - recipientRst.getNodeId(initialSyncNode)); - recipientRst.awaitSecondaryNodes(); - recipientRst.awaitReplication(); - - jsTestLog("Ensure that the new node's state matches up with the primary's."); - // Make sure the new node's state makes sense. - let recipientDocOnPrimary = undefined; - let recipientDocOnNewNode = undefined; - assert.soon( - () => { - recipientDocOnPrimary = - originalRecipientPrimary.getCollection(TenantMigrationTest.kConfigRecipientsNS) - .findOne({tenantId}); - recipientDocOnNewNode = - initialSyncNode.getCollection(TenantMigrationTest.kConfigRecipientsNS) - .findOne({tenantId}); - - return recipientDocOnPrimary.state == recipientDocOnNewNode.state; - }, - `States never matched, primary: ${recipientDocOnPrimary}, on new node: ${ - recipientDocOnNewNode}`); - - if (checkMtab) { - jsTestLog("Ensuring TenantMigrationAccessBlocker states match."); - const primaryMtab = tenantMigrationTest.getTenantMigrationAccessBlocker( - {recipientNode: originalRecipientPrimary, tenantId}); - const newNodeMtab = tenantMigrationTest.getTenantMigrationAccessBlocker( - {recipientNode: initialSyncNode, tenantId}); - - assert.eq(primaryMtab.recipient.state, - newNodeMtab.recipient.state, - `Mtab didn't match, primary: ${primaryMtab}, on new node: ${newNodeMtab}`); - } - - return initialSyncNode; -} - -// Restarts a node without tenant oplog application. Ensures its state matches up with the -// primary's, and then steps it up. -function restartNodeAndCheckStateWithoutOplogApplication( - tenantId, tenantMigrationTest, checkMtab, fpOnRecipient) { - fpOnRecipient.wait(); - - const initialSyncNode = restartNodeAndCheckState(tenantId, tenantMigrationTest, checkMtab); - - jsTestLog("Stepping up the new node."); - // Now step up the new node - tenantMigrationTest.getRecipientRst().stepUp(initialSyncNode); - fpOnRecipient.off(); -} - -// Pauses the recipient before the tenant oplog application phase, and inserts documents on the -// donor that the recipient tenant oplog applier must apply. Then restarts node, allows initial -// sync, and steps the restarted node up. -function restartNodeAndCheckStateDuringOplogApplication( - tenantId, tenantMigrationTest, checkMtab, fpOnRecipient) { - fpOnRecipient.wait(); - - // Pause the tenant oplog applier before applying a batch. - const originalRecipientPrimary = tenantMigrationTest.getRecipientPrimary(); - const fpPauseOplogApplierOnBatch = - configureFailPoint(originalRecipientPrimary, "fpBeforeTenantOplogApplyingBatch"); - - // Insert documents into the donor after data cloning but before tenant oplog application, so - // that the recipient has entries to apply during tenant oplog application. - tenantMigrationTest.insertDonorDB( - tenantMigrationTest.tenantDB(tenantId, testDBName), - testCollName, - [...Array(30).keys()].map((i) => ({a: i, b: "George Harrison - All Things Must Pass"}))); - - // Wait until the oplog applier has started and is trying to apply a batch. Then restart a node. - fpPauseOplogApplierOnBatch.wait(); - const initialSyncNode = restartNodeAndCheckState(tenantId, tenantMigrationTest, checkMtab); - - jsTestLog("Stepping up the new node."); - // Now step up the new node - tenantMigrationTest.getRecipientRst().stepUp(initialSyncNode); - fpPauseOplogApplierOnBatch.off(); - fpOnRecipient.off(); -} - -// This function does the following: -// 1. Configures a failpoint on the recipient primary, depending on the 'recipientFailpoint' that is -// passed into the function. -// 2. Starts a tenant migration. -// 3. Waits for the recipient failpoint to be hit. Restarts a node, to make it go through initial -// sync. -// 4. Makes sure the restarted node's state is as expected. -// 5. Steps up the restarted node as the recipient primary, lifts the recipient failpoint, and -// allows the migration to complete. -function runTestCase(recipientFailpoint, checkMtab, restartNodeAndCheckStateFunction) { - const tenantId = ObjectId().str; - const donorRst = new ReplSetTest({ - name: "donorRst", - nodes: 1, - serverless: true, - nodeOptions: Object.assign(migrationX509Options.donor, { - setParameter: { - // Allow non-timestamped reads on donor after migration completes for testing. - 'failpoint.tenantMigrationDonorAllowsNonTimestampedReads': - tojson({mode: 'alwaysOn'}), - } - }) - }); - donorRst.startSet(); - donorRst.initiate(); - - const tenantMigrationTest = new TenantMigrationTest({ - name: jsTestName(), - donorRst, - sharedOptions: {setParameter: {tenantApplierBatchSizeOps: 2}} - }); - - const migrationOpts = {migrationIdString: extractUUIDFromObject(UUID()), tenantId}; - const dbName = tenantMigrationTest.tenantDB(tenantId, testDBName); - const originalRecipientPrimary = tenantMigrationTest.getRecipientPrimary(); - - const fpOnRecipient = - configureFailPoint(originalRecipientPrimary, recipientFailpoint, {action: "hang"}); - tenantMigrationTest.insertDonorDB(dbName, testCollName); - - jsTestLog(`Starting a tenant migration with migrationID ${ - migrationOpts.migrationIdString}, and tenantId ${tenantId}`); - assert.commandWorked(tenantMigrationTest.startMigration(migrationOpts)); - - restartNodeAndCheckStateFunction(tenantId, tenantMigrationTest, checkMtab, fpOnRecipient); - - // Allow the migration to run to completion. - jsTestLog("Allowing migration to run to completion."); - TenantMigrationTest.assertCommitted( - tenantMigrationTest.waitForMigrationToComplete(migrationOpts)); - - assert.commandWorked(tenantMigrationTest.forgetMigration(migrationOpts.migrationIdString)); - - tenantMigrationTest.stop(); - donorRst.stopSet(); -} - -// These two test cases are for before the mtab is created, and before the oplog applier has been -// started. -runTestCase("fpAfterStartingOplogFetcherMigrationRecipientInstance", - false /* checkMtab */, - restartNodeAndCheckStateWithoutOplogApplication); -runTestCase("tenantCollectionClonerHangAfterCreateCollection", - false /* checkMtab */, - restartNodeAndCheckStateWithoutOplogApplication); - -// Test case to initial sync a node while the recipient is in the oplog application phase. -runTestCase("fpBeforeFulfillingDataConsistentPromise", - true /* checkMtab */, - restartNodeAndCheckStateDuringOplogApplication); - -// A case after data consistency so that the mtab exists. We do not care about the oplog applier in -// this case. -runTestCase("fpAfterWaitForRejectReadsBeforeTimestamp", - true /* checkMtab */, - restartNodeAndCheckStateWithoutOplogApplication); diff --git a/jstests/replsets/tenant_migration_recipient_resume_on_stepup_and_restart.js b/jstests/replsets/tenant_migration_recipient_resume_on_stepup_and_restart.js deleted file mode 100644 index f00a4c4f083..00000000000 --- a/jstests/replsets/tenant_migration_recipient_resume_on_stepup_and_restart.js +++ /dev/null @@ -1,211 +0,0 @@ -/** - * Tests that tenant migrations resume successfully on recipient stepup and restart. - * - * @tags: [ - * incompatible_with_macos, - * incompatible_with_windows_tls, - * incompatible_with_shard_merge, - * # Some tenant migration statistics field names were changed in 6.1. - * requires_fcv_61, - * requires_majority_read_concern, - * requires_persistence, - * serverless, - * ] - */ - -import {TenantMigrationTest} from "jstests/replsets/libs/tenant_migration_test.js"; -import { - forgetMigrationAsync, - makeX509OptionsForTest, - runMigrationAsync, -} from "jstests/replsets/libs/tenant_migration_util.js"; - -load("jstests/libs/fail_point_util.js"); -load("jstests/libs/parallelTester.js"); -load("jstests/libs/uuid_util.js"); -load('jstests/replsets/rslib.js'); // 'createRstArgs' - -const kMaxSleepTimeMS = 100; -const kTenantId = ObjectId().str; - -// Set the delay before a state doc is garbage collected to be short to speed up the test but long -// enough for the state doc to still be around after stepup or restart. -const kGarbageCollectionDelayMS = 30 * 1000; - -// Set the TTL monitor to run at a smaller interval to speed up the test. -const kTTLMonitorSleepSecs = 1; - -const migrationX509Options = makeX509OptionsForTest(); - -/** - * Runs the donorStartMigration command to start a migration, and interrupts the migration on the - * recipient using the 'interruptFunc' after the migration starts on the recipient side, and - * asserts that migration eventually commits. - * @param {recipientRestarted} bool is needed to properly assert the tenant migrations stat count. - */ -function testRecipientSyncDataInterrupt(interruptFunc, recipientRestarted) { - const recipientRst = new ReplSetTest({ - nodes: 3, - name: "recipientRst", - serverless: true, - nodeOptions: migrationX509Options.recipient - }); - recipientRst.startSet(); - recipientRst.initiate(); - - const tenantMigrationTest = new TenantMigrationTest({name: jsTestName(), recipientRst}); - - const donorRst = tenantMigrationTest.getDonorRst(); - const donorPrimary = tenantMigrationTest.getDonorPrimary(); - let recipientPrimary = tenantMigrationTest.getRecipientPrimary(); - - const migrationId = UUID(); - const migrationOpts = { - migrationIdString: extractUUIDFromObject(migrationId), - tenantId: kTenantId, - recipientConnString: tenantMigrationTest.getRecipientConnString(), - }; - const donorRstArgs = createRstArgs(donorRst); - - const runMigrationThread = new Thread(runMigrationAsync, migrationOpts, donorRstArgs); - runMigrationThread.start(); - - // Wait for recipientSyncData command to start. - assert.soon( - () => recipientPrimary.adminCommand({currentOp: true, desc: "tenant recipient migration"}) - .inprog.length > 0); - - sleep(Math.random() * kMaxSleepTimeMS); - interruptFunc(recipientRst); - - TenantMigrationTest.assertCommitted(runMigrationThread.returnData()); - tenantMigrationTest.waitForDonorNodesToReachState(donorRst.nodes, - migrationId, - migrationOpts.tenantId, - TenantMigrationTest.DonorState.kCommitted); - assert.commandWorked(tenantMigrationTest.forgetMigration(migrationOpts.migrationIdString)); - - const donorStats = tenantMigrationTest.getTenantMigrationStats(donorPrimary); - assert.eq(1, donorStats.totalMigrationDonationsCommitted); - - tenantMigrationTest.stop(); - recipientRst.stopSet(); -} - -/** - * Starts a migration and waits for it to commit, then runs the donorForgetMigration, and interrupts - * the recipient using the 'interruptFunc', and asserts that the migration state is eventually - * garbage collected. - */ -function testRecipientForgetMigrationInterrupt(interruptFunc) { - const donorRst = new ReplSetTest({ - nodes: 1, - name: "donorRst", - serverless: true, - nodeOptions: Object.assign({}, migrationX509Options.donor, { - setParameter: { - tenantMigrationGarbageCollectionDelayMS: kGarbageCollectionDelayMS, - ttlMonitorSleepSecs: kTTLMonitorSleepSecs, - } - }) - }); - const recipientRst = new ReplSetTest({ - nodes: 3, - name: "recipientRst", - serverless: true, - nodeOptions: Object.assign({}, migrationX509Options.recipient, { - setParameter: { - tenantMigrationGarbageCollectionDelayMS: kGarbageCollectionDelayMS, - ttlMonitorSleepSecs: kTTLMonitorSleepSecs, - } - }) - }); - - donorRst.startSet(); - donorRst.initiate(); - - recipientRst.startSet(); - recipientRst.initiate(); - - const tenantMigrationTest = - new TenantMigrationTest({name: jsTestName(), donorRst, recipientRst}); - const recipientPrimary = tenantMigrationTest.getRecipientPrimary(); - - const migrationId = UUID(); - const migrationOpts = { - migrationIdString: extractUUIDFromObject(migrationId), - tenantId: kTenantId, - recipientConnString: recipientRst.getURL(), - }; - const donorRstArgs = createRstArgs(donorRst); - - TenantMigrationTest.assertCommitted( - tenantMigrationTest.runMigration(migrationOpts, {automaticForgetMigration: false})); - const forgetMigrationThread = new Thread(forgetMigrationAsync, - migrationOpts.migrationIdString, - donorRstArgs, - false /* retryOnRetryableErrors */); - forgetMigrationThread.start(); - - // Wait for recipientForgetMigration command to start. - assert.soon(() => { - const res = assert.commandWorked( - recipientPrimary.adminCommand({currentOp: true, desc: "tenant recipient migration"})); - return res.inprog[0].expireAt != null; - }); - sleep(Math.random() * kMaxSleepTimeMS); - interruptFunc(recipientRst); - - assert.commandWorkedOrFailedWithCode( - tenantMigrationTest.forgetMigration(migrationOpts.migrationIdString), - ErrorCodes.NoSuchTenantMigration); - - assert.commandWorked(forgetMigrationThread.returnData()); - tenantMigrationTest.waitForMigrationGarbageCollection(migrationId, migrationOpts.tenantId); - - tenantMigrationTest.stop(); - donorRst.stopSet(); - recipientRst.stopSet(); -} - -(() => { - jsTest.log("Test that the migration resumes on stepup"); - testRecipientSyncDataInterrupt((recipientRst) => { - // Force the primary to step down but make it likely to step back up. - const recipientPrimary = recipientRst.getPrimary(); - assert.commandWorked(recipientPrimary.adminCommand( - {replSetStepDown: ReplSetTest.kForeverSecs, force: true})); - assert.commandWorked(recipientPrimary.adminCommand({replSetFreeze: 0})); - }, false); -})(); - -(() => { - jsTest.log("Test that the migration resumes after restart"); - testRecipientSyncDataInterrupt((recipientRst) => { - recipientRst.stopSet(null /* signal */, true /*forRestart */); - recipientRst.startSet({restart: true}); - recipientRst.awaitSecondaryNodes(); - recipientRst.getPrimary(); - }, true); -})(); - -(() => { - jsTest.log("Test that the recipientForgetMigration command can be retried on stepup"); - testRecipientForgetMigrationInterrupt((recipientRst) => { - // Force the primary to step down but make it likely to step back up. - const recipientPrimary = recipientRst.getPrimary(); - assert.commandWorked(recipientPrimary.adminCommand( - {replSetStepDown: ReplSetTest.kForeverSecs, force: true})); - assert.commandWorked(recipientPrimary.adminCommand({replSetFreeze: 0})); - }); -})(); - -(() => { - jsTest.log("Test that the recipientForgetMigration command can be retried after restart"); - testRecipientForgetMigrationInterrupt((recipientRst) => { - recipientRst.stopSet(null /* signal */, true /*forRestart */); - recipientRst.startSet({restart: true}); - recipientRst.awaitSecondaryNodes(); - recipientRst.getPrimary(); - }); -})(); diff --git a/jstests/replsets/tenant_migration_recipient_retryable_writes_failover.js b/jstests/replsets/tenant_migration_recipient_retryable_writes_failover.js deleted file mode 100644 index 03b5f7eb972..00000000000 --- a/jstests/replsets/tenant_migration_recipient_retryable_writes_failover.js +++ /dev/null @@ -1,110 +0,0 @@ -/** - * Tests whether the recipient correctly clears its oplog buffer if the recipient primary - * fails over while fetching retryable writes oplog entries from the donor. - * - * @tags: [ - * incompatible_with_macos, - * incompatible_with_shard_merge, - * incompatible_with_windows_tls, - * requires_majority_read_concern, - * requires_persistence, - * serverless, - * ] - */ - -import {TenantMigrationTest} from "jstests/replsets/libs/tenant_migration_test.js"; -load("jstests/libs/uuid_util.js"); // For extractUUIDFromObject(). -load("jstests/libs/fail_point_util.js"); // For configureFailPoint(). - -const tenantMigrationTest = - new TenantMigrationTest({name: jsTestName(), sharedOptions: {nodes: 2}}); - -const kMigrationId = UUID(); -const kTenantId = ObjectId().str; -const kDbName = tenantMigrationTest.tenantDB(kTenantId, "testDb"); -const kCollName = "testColl"; -const migrationOpts = { - migrationIdString: extractUUIDFromObject(kMigrationId), - tenantId: kTenantId, -}; - -const donorRst = tenantMigrationTest.getDonorRst(); -const donorPrimary = tenantMigrationTest.getDonorPrimary(); -const rsConn = new Mongo(donorRst.getURL()); -const recipientPrimary = tenantMigrationTest.getRecipientPrimary(); - -const session = rsConn.startSession({retryWrites: true}); -const sessionColl = session.getDatabase(kDbName)[kCollName]; - -const session2 = rsConn.startSession({retryWrites: true}); -const sessionColl2 = session2.getDatabase(kDbName)[kCollName]; - -jsTestLog("Run retryable writes prior to the migration."); -assert.commandWorked(sessionColl.insert({_id: "retryableWrite1"})); -assert.commandWorked(sessionColl2.insert({_id: "retryableWrite2"})); - -jsTestLog("Setting up failpoints."); -// Use `pauseAfterRetrievingRetryableWritesBatch` to hang after inserting the first batch of results -// from the aggregation request into the oplog buffer. -const fpPauseAfterRetrievingRetryableWritesBatch = - configureFailPoint(recipientPrimary, "pauseAfterRetrievingRetryableWritesBatch"); - -// Set aggregation request batch size to 1 so that we can failover in between batches. -const fpSetSmallAggregationBatchSize = - configureFailPoint(recipientPrimary, "fpSetSmallAggregationBatchSize"); - -jsTestLog("Starting tenant migration with migrationId: " + kMigrationId + - ", tenantId: " + kTenantId); -assert.commandWorked(tenantMigrationTest.startMigration(migrationOpts)); - -jsTestLog("Waiting until the recipient primary fetches a batch of retryable writes oplog entries."); -fpSetSmallAggregationBatchSize.wait(); -fpPauseAfterRetrievingRetryableWritesBatch.wait(); - -// Check that the oplog buffer is correctly populated. -const kOplogBufferNS = "repl.migration.oplog_" + migrationOpts.migrationIdString; -let recipientOplogBuffer = recipientPrimary.getDB("config")[kOplogBufferNS]; -// We expect to have only retryableWrite1 since the cursor batch size is 1 and we paused after -// inserting the first branch of results from the aggregation request. -let cursor = recipientOplogBuffer.find(); -assert.eq(cursor.itcount(), 1, "Incorrect number of oplog entries in buffer: " + cursor.toArray()); - -// Check that we haven't completed the retryable writes fetching stage yet. -let recipientConfigColl = recipientPrimary.getCollection(TenantMigrationTest.kConfigRecipientsNS); -let recipientDoc = recipientConfigColl.find({"_id": kMigrationId}).toArray(); -assert.eq(recipientDoc.length, 1); -assert.eq(recipientDoc[0].completedFetchingRetryableWritesBeforeStartOpTime, false); - -jsTestLog("Stepping a new primary up."); -const recipientRst = tenantMigrationTest.getRecipientRst(); -const recipientSecondary = recipientRst.getSecondary(); -// Use `fpAfterFetchingRetryableWritesEntriesBeforeStartOpTime` to hang after populating the oplog -// buffer with retryable writes entries. Set this before stepping up instead of after so that the -// new primary will not be able to pass this stage without the failpoint being set. -const fpAfterFetchingRetryableWritesEntries = configureFailPoint( - recipientSecondary, "fpAfterFetchingRetryableWritesEntriesBeforeStartOpTime", {action: "hang"}); - -recipientRst.stepUp(recipientSecondary); - -fpPauseAfterRetrievingRetryableWritesBatch.off(); -const newRecipientPrimary = recipientRst.getPrimary(); - -fpAfterFetchingRetryableWritesEntries.wait(); -// The new primary should have cleared its oplog buffer and refetched both retryableWrite1 and -// retryableWrite2. Otherwise, we will invariant when trying to add those entries. -recipientOplogBuffer = newRecipientPrimary.getDB("config")[kOplogBufferNS]; -cursor = recipientOplogBuffer.find(); -assert.eq(cursor.itcount(), 2, "Incorrect number of oplog entries in buffer: " + cursor.toArray()); - -recipientConfigColl = newRecipientPrimary.getCollection(TenantMigrationTest.kConfigRecipientsNS); -recipientDoc = recipientConfigColl.find({"_id": kMigrationId}).toArray(); -assert.eq(recipientDoc.length, 1); -assert.eq(recipientDoc[0].completedFetchingRetryableWritesBeforeStartOpTime, true); - -fpAfterFetchingRetryableWritesEntries.off(); -fpSetSmallAggregationBatchSize.off(); - -jsTestLog("Waiting for migration to complete."); -TenantMigrationTest.assertCommitted(tenantMigrationTest.waitForMigrationToComplete(migrationOpts)); - -tenantMigrationTest.stop(); diff --git a/jstests/replsets/tenant_migration_recipient_rollback_recovery.js b/jstests/replsets/tenant_migration_recipient_rollback_recovery.js deleted file mode 100644 index ffb2cc4919e..00000000000 --- a/jstests/replsets/tenant_migration_recipient_rollback_recovery.js +++ /dev/null @@ -1,335 +0,0 @@ -/** - * Tests that tenant migrations that go through recipient rollback are recovered correctly. - * - * @tags: [ - * incompatible_with_macos, - * incompatible_with_shard_merge, - * incompatible_with_windows_tls, - * requires_majority_read_concern, - * requires_persistence, - * serverless, - * ] - */ - -import {TenantMigrationTest} from "jstests/replsets/libs/tenant_migration_test.js"; -import { - forgetMigrationAsync, - makeX509OptionsForTest, - runMigrationAsync, -} from "jstests/replsets/libs/tenant_migration_util.js"; - -load("jstests/libs/fail_point_util.js"); -load("jstests/libs/uuid_util.js"); -load("jstests/libs/parallelTester.js"); -load("jstests/replsets/libs/rollback_test.js"); -load("jstests/replsets/rslib.js"); // 'createRstArgs' - -const kTenantId = ObjectId().str; - -const kMaxSleepTimeMS = 250; - -// Set the delay before a state doc is garbage collected to be short to speed up the test but long -// enough for the state doc to still be around after the recipient is back in the replication steady -// state. -const kGarbageCollectionDelayMS = 30 * 1000; - -const migrationX509Options = makeX509OptionsForTest(); - -function makeMigrationOpts(tenantMigrationTest, migrationId, tenantId) { - return { - migrationIdString: extractUUIDFromObject(migrationId), - tenantId: tenantId, - recipientConnString: tenantMigrationTest.getRecipientConnString(), - readPreference: {mode: "primary"}, - }; -} - -/** - * Starts a recipient ReplSetTest and creates a TenantMigrationTest for it. Runs 'setUpFunc' after - * initiating the recipient. Then, runs 'rollbackOpsFunc' while replication is disabled on the - * secondaries, shuts down the primary and restarts it after re-election to force the operations in - * 'rollbackOpsFunc' to be rolled back. Finally, runs 'steadyStateFunc' after it is back in the - * replication steady state. - */ -function testRollBack(setUpFunc, rollbackOpsFunc, steadyStateFunc) { - const donorRst = new ReplSetTest({ - name: "donorRst", - nodes: 1, - serverless: true, - nodeOptions: Object.assign({}, migrationX509Options.donor, { - setParameter: { - tenantMigrationGarbageCollectionDelayMS: kGarbageCollectionDelayMS, - ttlMonitorSleepSecs: 1, - } - }) - }); - donorRst.startSet(); - donorRst.initiate(); - - const donorRstArgs = createRstArgs(donorRst); - - const recipientRst = new ReplSetTest({ - name: "recipientRst", - nodes: 3, - serverless: true, - nodeOptions: Object.assign({}, migrationX509Options.recipient, { - setParameter: { - tenantMigrationGarbageCollectionDelayMS: kGarbageCollectionDelayMS, - ttlMonitorSleepSecs: 1, - } - }) - }); - recipientRst.startSet(); - recipientRst.initiate(); - - const tenantMigrationTest = - new TenantMigrationTest({name: jsTestName(), donorRst, recipientRst}); - setUpFunc(tenantMigrationTest, donorRstArgs); - - let originalRecipientPrimary = recipientRst.getPrimary(); - const originalRecipientSecondaries = recipientRst.getSecondaries(); - // The default WC is majority and stopServerReplication will prevent satisfying any majority - // writes. - assert.commandWorked(originalRecipientPrimary.adminCommand( - {setDefaultRWConcern: 1, defaultWriteConcern: {w: 1}, writeConcern: {w: "majority"}})); - recipientRst.awaitLastOpCommitted(); - - // Disable replication on the secondaries so that writes during this step will be rolled back. - stopServerReplication(originalRecipientSecondaries); - rollbackOpsFunc(tenantMigrationTest, donorRstArgs); - - // Shut down the primary and re-enable replication to allow one of the secondaries to get - // elected, and make the writes above get rolled back on the original primary when it comes - // back up. - recipientRst.stop(originalRecipientPrimary); - restartServerReplication(originalRecipientSecondaries); - const newRecipientPrimary = recipientRst.getPrimary(); - assert.neq(originalRecipientPrimary, newRecipientPrimary); - - // Restart the original primary. - originalRecipientPrimary = - recipientRst.start(originalRecipientPrimary, {waitForConnect: true}, true /* restart */); - originalRecipientPrimary.setSecondaryOk(); - recipientRst.awaitReplication(); - - steadyStateFunc(tenantMigrationTest); - - donorRst.stopSet(); - recipientRst.stopSet(); -} - -/** - * Starts a migration and waits for the recipient's primary to insert the recipient's state doc. - * Forces the write to be rolled back. After the replication steady state is reached, asserts that - * recipientSyncData can restart the migration on the new primary. - */ -function testRollbackInitialState() { - const migrationId = UUID(); - let migrationOpts; - let migrationThread; - - let setUpFunc = (tenantMigrationTest, donorRstArgs) => {}; - - let rollbackOpsFunc = (tenantMigrationTest, donorRstArgs) => { - const recipientPrimary = tenantMigrationTest.getRecipientPrimary(); - - // Start the migration asynchronously and wait for the primary to insert the state doc. - migrationOpts = makeMigrationOpts(tenantMigrationTest, migrationId, ObjectId().str); - migrationThread = new Thread(runMigrationAsync, migrationOpts, donorRstArgs); - migrationThread.start(); - assert.soon(() => { - return 1 === - recipientPrimary.getCollection(TenantMigrationTest.kConfigRecipientsNS).count({ - _id: migrationId - }); - }); - }; - - let steadyStateFunc = (tenantMigrationTest) => { - // Verify that the migration restarted successfully on the new primary despite rollback. - TenantMigrationTest.assertCommitted(migrationThread.returnData()); - tenantMigrationTest.assertRecipientNodesInExpectedState({ - nodes: tenantMigrationTest.getRecipientRst().nodes, - migrationId: migrationId, - tenantId: migrationOpts.tenantId, - expectedState: TenantMigrationTest.RecipientState.kConsistent, - expectedAccessState: TenantMigrationTest.RecipientAccessState.kRejectBefore - }); - assert.commandWorked(tenantMigrationTest.forgetMigration(migrationOpts.migrationIdString)); - }; - - testRollBack(setUpFunc, rollbackOpsFunc, steadyStateFunc); -} - -/** - * Starts a migration after enabling 'pauseFailPoint' (must pause the migration) and - * 'setUpFailPoints' on the recipient's primary. Waits for the primary to do the write to transition - * to 'nextState' after reaching 'pauseFailPoint' (i.e. the state doc matches 'query'), then forces - * the write to be rolled back. After the replication steady state is reached, asserts that the - * migration is resumed successfully by new primary regardless of what the rolled back state - * transition is. - */ -function testRollBackStateTransition(pauseFailPoint, setUpFailPoints, nextState, query) { - jsTest.log(`Test roll back the write to transition to state "${ - nextState}" after reaching failpoint "${pauseFailPoint}"`); - - const migrationId = UUID(); - let migrationOpts; - let migrationThread, pauseFp; - - let setUpFunc = (tenantMigrationTest, donorRstArgs) => { - const recipientPrimary = tenantMigrationTest.getRecipientPrimary(); - setUpFailPoints.forEach(failPoint => configureFailPoint(recipientPrimary, failPoint)); - pauseFp = configureFailPoint(recipientPrimary, pauseFailPoint, {action: "hang"}); - - migrationOpts = makeMigrationOpts(tenantMigrationTest, migrationId, ObjectId().str); - migrationThread = new Thread(runMigrationAsync, migrationOpts, donorRstArgs); - migrationThread.start(); - pauseFp.wait(); - }; - - let rollbackOpsFunc = (tenantMigrationTest, donorRstArgs) => { - const recipientPrimary = tenantMigrationTest.getRecipientPrimary(); - // Resume the migration and wait for the primary to do the write for the state transition. - pauseFp.off(); - assert.soon(() => { - return 1 === - recipientPrimary.getCollection(TenantMigrationTest.kConfigRecipientsNS) - .count(Object.assign({_id: migrationId}, query)); - }); - }; - - let steadyStateFunc = (tenantMigrationTest) => { - // Verify that the migration resumed successfully on the new primary despite the rollback. - TenantMigrationTest.assertCommitted(migrationThread.returnData()); - tenantMigrationTest.waitForRecipientNodesToReachState( - tenantMigrationTest.getRecipientRst().nodes, - migrationId, - migrationOpts.tenantId, - TenantMigrationTest.RecipientState.kConsistent, - TenantMigrationTest.RecipientAccessState.kRejectBefore); - assert.commandWorked(tenantMigrationTest.forgetMigration(migrationOpts.migrationIdString)); - }; - - testRollBack(setUpFunc, rollbackOpsFunc, steadyStateFunc); -} - -/** - * Runs donorForgetMigration after completing a migration. Waits for the recipient's primary to - * mark the recipient's state doc as garbage collectable, then forces the write to be rolled back. - * After the replication steady state is reached, asserts that recipientForgetMigration can be - * retried on the new primary and that the state doc is eventually garbage collected. - */ -function testRollBackMarkingStateGarbageCollectable() { - const migrationId = UUID(); - let migrationOpts; - let forgetMigrationThread; - - let setUpFunc = (tenantMigrationTest, donorRstArgs) => { - migrationOpts = makeMigrationOpts(tenantMigrationTest, migrationId, ObjectId().str); - TenantMigrationTest.assertCommitted( - tenantMigrationTest.runMigration(migrationOpts, {automaticForgetMigration: false})); - }; - - let rollbackOpsFunc = (tenantMigrationTest, donorRstArgs) => { - const recipientPrimary = tenantMigrationTest.getRecipientPrimary(); - // Run donorForgetMigration and wait for the primary to do the write to mark the state doc - // as garbage collectable. - forgetMigrationThread = new Thread(forgetMigrationAsync, - migrationOpts.migrationIdString, - donorRstArgs, - false /* retryOnRetryableErrors */); - forgetMigrationThread.start(); - assert.soon(() => { - return 1 === - recipientPrimary.getCollection(TenantMigrationTest.kConfigRecipientsNS) - .count({_id: migrationId, expireAt: {$exists: 1}}); - }); - }; - - let steadyStateFunc = (tenantMigrationTest) => { - // Verify that the migration state got garbage collected successfully despite the rollback. - assert.commandWorked(forgetMigrationThread.returnData()); - tenantMigrationTest.waitForMigrationGarbageCollection( - migrationId, - migrationOpts.tenantId, - tenantMigrationTest.getDonorRst().nodes, - tenantMigrationTest.getRecipientRst().nodes); - }; - - testRollBack(setUpFunc, rollbackOpsFunc, steadyStateFunc); -} - -/** - * Starts a migration and forces the recipient's primary to go through rollback after a random - * amount of time. After the replication steady state is reached, asserts that the migration is - * resumed successfully. - */ -function testRollBackRandom() { - const migrationId = UUID(); - let migrationOpts; - let migrationThread; - - let setUpFunc = (tenantMigrationTest, donorRstArgs) => { - migrationOpts = makeMigrationOpts(tenantMigrationTest, migrationId, ObjectId().str); - migrationThread = new Thread(async (donorRstArgs, migrationOpts) => { - const {runMigrationAsync, forgetMigrationAsync} = - await import("jstests/replsets/libs/tenant_migration_util.js"); - assert.commandWorked(await runMigrationAsync(migrationOpts, donorRstArgs)); - assert.commandWorked(await forgetMigrationAsync( - migrationOpts.migrationIdString, donorRstArgs, false /* retryOnRetryableErrors */)); - }, donorRstArgs, migrationOpts); - - // Start the migration and wait for a random amount of time before transitioning to the - // rollback operations state. - migrationThread.start(); - sleep(Math.random() * kMaxSleepTimeMS); - }; - - let rollbackOpsFunc = (tenantMigrationTest, donorRstArgs) => { - // Let the migration run in the rollback operations state for a random amount of time. - sleep(Math.random() * kMaxSleepTimeMS); - }; - - let steadyStateFunc = (tenantMigrationTest) => { - // Verify that the migration completed and was garbage collected successfully despite the - // rollback. - migrationThread.join(); - tenantMigrationTest.waitForRecipientNodesToReachState( - tenantMigrationTest.getRecipientRst().nodes, - migrationId, - migrationOpts.tenantId, - TenantMigrationTest.RecipientState.kDone, - TenantMigrationTest.RecipientAccessState.kRejectBefore); - tenantMigrationTest.waitForMigrationGarbageCollection( - migrationId, - migrationOpts.tenantId, - tenantMigrationTest.getDonorRst().nodes, - tenantMigrationTest.getRecipientRst().nodes); - }; - - testRollBack(setUpFunc, rollbackOpsFunc, steadyStateFunc); -} - -jsTest.log("Test roll back recipient's state doc insert"); -testRollbackInitialState(); - -jsTest.log("Test roll back recipient's state doc update"); -[{ - pauseFailPoint: "fpBeforeMarkingCloneSuccess", - nextState: "reject", - query: {dataConsistentStopDonorOpTime: {$exists: 1}} -}, - { - pauseFailPoint: "fpBeforePersistingRejectReadsBeforeTimestamp", - nextState: "rejectBefore", - query: {rejectReadsBeforeTimestamp: {$exists: 1}} - }].forEach(({pauseFailPoint, setUpFailPoints = [], nextState, query}) => { - testRollBackStateTransition(pauseFailPoint, setUpFailPoints, nextState, query); -}); - -jsTest.log("Test roll back marking the donor's state doc as garbage collectable"); -testRollBackMarkingStateGarbageCollectable(); - -jsTest.log("Test roll back random"); -testRollBackRandom(); diff --git a/jstests/replsets/tenant_migration_recipient_sync_donor_timestamp.js b/jstests/replsets/tenant_migration_recipient_sync_donor_timestamp.js deleted file mode 100644 index 81422e28454..00000000000 --- a/jstests/replsets/tenant_migration_recipient_sync_donor_timestamp.js +++ /dev/null @@ -1,94 +0,0 @@ -/** - * Exercises the code path for the recipientSyncData command that waits until a timestamp provided - * by the donor is majority committed: make sure that in this code path, when the recipient is - * interrupted by a primary step down, the recipient properly swaps the error code to the true code - * (like primary step down) that the donor can retry on. - * - * @tags: [ - * incompatible_with_macos, - * incompatible_with_shard_merge, - * incompatible_with_windows_tls, - * requires_persistence, - * requires_replication, - * serverless, - * ] - */ - -import {TenantMigrationTest} from "jstests/replsets/libs/tenant_migration_test.js"; -load("jstests/libs/fail_point_util.js"); -load("jstests/libs/uuid_util.js"); // For extractUUIDFromObject() - -// Make the batch size small so that we can pause before all the batches are applied. -const tenantMigrationTest = new TenantMigrationTest( - {name: jsTestName(), sharedOptions: {setParameter: {tenantApplierBatchSizeOps: 2}}}); - -const kMigrationId = UUID(); -const kTenantId = ObjectId().str; -const kReadPreference = { - mode: "primary" -}; -const migrationOpts = { - migrationIdString: extractUUIDFromObject(kMigrationId), - tenantId: kTenantId, - readPreference: kReadPreference -}; - -const dbName = tenantMigrationTest.tenantDB(kTenantId, "testDB"); -const collName = jsTestName() + "_collection"; - -const recipientRst = tenantMigrationTest.getRecipientRst(); -const recipientPrimary = recipientRst.getPrimary(); - -// FailPoint to pause right before the data consistent promise is fulfilled. -const fpBeforeDataConsistent = configureFailPoint( - recipientPrimary, "fpBeforeFulfillingDataConsistentPromise", {action: "hang"}); -const fpBeforeApplierFutureCalled = - configureFailPoint(recipientPrimary, "fpWaitUntilTimestampMajorityCommitted"); - -tenantMigrationTest.insertDonorDB(dbName, collName); - -jsTestLog("Starting migration."); -// Start the migration, and allow it to progress to the point where the _dataConsistentPromise has -// been fulfilled. -tenantMigrationTest.startMigration(migrationOpts); - -jsTestLog("Waiting for data consistent promise."); -// Pause right before the _dataConsistentPromise is fulfilled. Therefore, the applier has -// finished applying entries at least until dataConsistentStopDonorOpTime. -fpBeforeDataConsistent.wait(); - -jsTestLog("Pausing the tenant oplog applier."); -// Pause the applier now. All the entries that the applier cannot process now are past the -// dataConsistentStopDonorOpTime. -const fpPauseOplogApplier = - configureFailPoint(recipientPrimary, "fpBeforeTenantOplogApplyingBatch"); - -jsTestLog("Writing to donor db."); -// Send writes to the donor. The applier will not be able to process these as it is paused. -const docsToApply = [...Array(10).keys()].map((i) => ({a: i})); -tenantMigrationTest.insertDonorDB(dbName, collName, docsToApply); - -jsTestLog("Waiting to hit failpoint in tenant oplog applier."); -fpPauseOplogApplier.wait(); - -jsTestLog("Allowing recipient to respond."); -// Allow the recipient to respond to the donor for the recipientSyncData command that waits on the -// fulfillment of the _dataConsistentPromise. The donor will then send another recipientSyncData -// command that waits on the provided donor timestamp to be majority committed. -fpBeforeDataConsistent.off(); - -jsTestLog("Reach the point where we are waiting for the tenant oplog applier to catch up."); -fpBeforeApplierFutureCalled.wait(); -fpBeforeApplierFutureCalled.off(); - -jsTestLog("Stepping another node up."); -// Make a new recipient primary step up. This will ask the applier to shutdown. -recipientRst.stepUp(recipientRst.getSecondaries()[0]); - -jsTestLog("Release the tenant oplog applier failpoint."); -fpPauseOplogApplier.off(); - -jsTestLog("Waiting for migration to complete."); -TenantMigrationTest.assertCommitted(tenantMigrationTest.waitForMigrationToComplete(migrationOpts)); - -tenantMigrationTest.stop(); diff --git a/jstests/replsets/tenant_migration_recipient_sync_source_reconnect_delayed_secondary.js b/jstests/replsets/tenant_migration_recipient_sync_source_reconnect_delayed_secondary.js deleted file mode 100644 index c95fb70d474..00000000000 --- a/jstests/replsets/tenant_migration_recipient_sync_source_reconnect_delayed_secondary.js +++ /dev/null @@ -1,66 +0,0 @@ -/** - * Tests that a migration will continuously retry sync source selection when there are no available - * donor hosts. Also checks that a donor host is considered an uneligible sync source when it has a - * majority OpTime earlier than the recipient's stored 'startApplyingDonorOpTime'. - * - * Tests that if the stale donor host advances its majority OpTime to 'startApplyingDonorOpTime' - * or later, the recipient will successfully choose that donor as sync source and resume the - * migration. - * - * @tags: [ - * incompatible_with_macos, - * incompatible_with_shard_merge, - * incompatible_with_windows_tls, - * requires_majority_read_concern, - * requires_persistence, - * # The currentOp output field 'dataSyncCompleted' was renamed to 'migrationCompleted'. - * requires_fcv_70, - * serverless, - * ] - */ - -import { - setUpMigrationSyncSourceTest -} from "jstests/replsets/libs/tenant_migration_recipient_sync_source.js"; -import {TenantMigrationTest} from "jstests/replsets/libs/tenant_migration_test.js"; - -// After this setUp() call, we should have a migration with 'secondary' read preference. The -// recipient should be continuously retrying sync source selection, unable to choose -// 'delayedSecondary' because it is too stale and 'donorSecondary' because it is down. -const { - tenantMigrationTest, - migrationOpts, - donorSecondary, - delayedSecondary, - hangAfterCreatingConnections -} = setUpMigrationSyncSourceTest(); - -if (!tenantMigrationTest) { - // Feature flag was not enabled. - quit(); -} - -jsTestLog("Restarting replication on 'delayedSecondary'"); -restartServerReplication(delayedSecondary); - -// The recipient should eventually be able to connect to the lagged secondary, after the secondary -// has caught up and the exclude timeout has expired. -hangAfterCreatingConnections.wait(); - -const recipientPrimary = tenantMigrationTest.getRecipientPrimary(); -const res = recipientPrimary.adminCommand({currentOp: true, desc: "tenant recipient migration"}); -const currOp = res.inprog[0]; -assert.eq(delayedSecondary.host, - currOp.donorSyncSource, - `the recipient should only be able to choose 'delayedSecondary' as sync source`); - -hangAfterCreatingConnections.off(); - -TenantMigrationTest.assertCommitted(tenantMigrationTest.waitForMigrationToComplete(migrationOpts)); -assert.commandWorked(tenantMigrationTest.forgetMigration(migrationOpts.migrationIdString)); - -// Remove 'donorSecondary' so that the test can complete properly. -const donorRst = tenantMigrationTest.getDonorRst(); -donorRst.remove(donorSecondary); -donorRst.stopSet(); -tenantMigrationTest.stop(); diff --git a/jstests/replsets/tenant_migration_recipient_sync_source_restart_donor_secondary.js b/jstests/replsets/tenant_migration_recipient_sync_source_restart_donor_secondary.js deleted file mode 100644 index 10ded49a34b..00000000000 --- a/jstests/replsets/tenant_migration_recipient_sync_source_restart_donor_secondary.js +++ /dev/null @@ -1,67 +0,0 @@ -/** - * Tests that a migration will continuously retry sync source selection when there are no available - * donor hosts. Also checks that a donor host is considered an uneligible sync source when it has a - * majority OpTime earlier than the recipient's stored 'startApplyingDonorOpTime'. - * - * Tests that if a donor host becomes available, the recipient will successfully choose it as a - * sync source and resume the migration. - * - * @tags: [ - * incompatible_with_macos, - * incompatible_with_shard_merge, - * incompatible_with_windows_tls, - * requires_majority_read_concern, - * requires_persistence, - * # The currentOp output field 'dataSyncCompleted' was renamed to 'migrationCompleted'. - * requires_fcv_70, - * serverless, - * ] - */ - -import { - setUpMigrationSyncSourceTest -} from "jstests/replsets/libs/tenant_migration_recipient_sync_source.js"; -import {TenantMigrationTest} from "jstests/replsets/libs/tenant_migration_test.js"; - -// After this setUp() call, we should have a migration with 'secondary' read preference. The -// recipient should be continuously retrying sync source selection, unable to choose -// 'delayedSecondary' because it is too stale and 'donorSecondary' because it is down. -const { - tenantMigrationTest, - migrationOpts, - donorSecondary, - delayedSecondary, - hangAfterCreatingConnections -} = setUpMigrationSyncSourceTest(); - -if (!tenantMigrationTest) { - // Feature flag was not enabled. - quit(); -} - -const donorRst = tenantMigrationTest.getDonorRst(); - -jsTestLog("Restarting 'donorSecondary'"); -donorRst.start(donorSecondary, null /* options */, true /* restart */); - -// The recipient should eventually be able to connect to the donor secondary, after the node reaches -// 'secondary' state. -hangAfterCreatingConnections.wait(); - -const recipientPrimary = tenantMigrationTest.getRecipientPrimary(); -const res = recipientPrimary.adminCommand({currentOp: true, desc: "tenant recipient migration"}); -const currOp = res.inprog[0]; -// 'donorSecondary' should always be the chosen sync source, since read preference is 'secondary' -// and 'delayedSecondary' cannot be chosen because it is too stale. -assert.eq(donorSecondary.host, - currOp.donorSyncSource, - `the recipient should only be able to choose 'donorSecondary' as sync source`); - -hangAfterCreatingConnections.off(); -restartServerReplication(delayedSecondary); - -TenantMigrationTest.assertCommitted(tenantMigrationTest.waitForMigrationToComplete(migrationOpts)); -assert.commandWorked(tenantMigrationTest.forgetMigration(migrationOpts.migrationIdString)); - -donorRst.stopSet(); -tenantMigrationTest.stop(); diff --git a/jstests/replsets/tenant_migration_resume_collection_cloner_after_recipient_failover.js b/jstests/replsets/tenant_migration_resume_collection_cloner_after_recipient_failover.js deleted file mode 100644 index ea9654d97ac..00000000000 --- a/jstests/replsets/tenant_migration_resume_collection_cloner_after_recipient_failover.js +++ /dev/null @@ -1,132 +0,0 @@ -/** - * Tests that in tenant migration, the recipient set can resume collection cloning from the last - * document cloned after a failover. - * @tags: [ - * incompatible_with_macos, - * incompatible_with_shard_merge, - * incompatible_with_windows_tls, - * requires_majority_read_concern, - * requires_persistence, - * serverless, - * ] - */ - -import {TenantMigrationTest} from "jstests/replsets/libs/tenant_migration_test.js"; -import { - checkTenantDBHashes, - makeX509OptionsForTest, -} from "jstests/replsets/libs/tenant_migration_util.js"; - -load("jstests/libs/fail_point_util.js"); -load("jstests/libs/uuid_util.js"); // for 'extractUUIDFromObject' - -const tenantMigrationFailoverTest = function(isTimeSeries, createCollFn, docs) { - const batchSize = 2; - const recipientRst = new ReplSetTest({ - nodes: 2, - name: jsTestName() + "_recipient", - serverless: true, - nodeOptions: Object.assign(makeX509OptionsForTest().recipient, { - setParameter: { - // Use a batch size of 2 so that collection cloner requires more than a single - // batch to complete. - collectionClonerBatchSize: batchSize, - // Allow reads on recipient before migration completes for testing. - 'failpoint.tenantMigrationRecipientNotRejectReads': tojson({mode: 'alwaysOn'}), - } - }) - }); - - recipientRst.startSet(); - recipientRst.initiate(); - - const tenantMigrationTest = - new TenantMigrationTest({name: jsTestName(), recipientRst: recipientRst}); - const donorPrimary = tenantMigrationTest.getDonorPrimary(); - - const tenantId = ObjectId().str; - const dbName = tenantMigrationTest.tenantDB(tenantId, "testDB"); - const donorDB = donorPrimary.getDB(dbName); - const collName = "testColl"; - - const recipientPrimary = tenantMigrationTest.getRecipientPrimary(); - - // Create collection and insert documents. - assert.commandWorked(createCollFn(donorDB, collName)); - tenantMigrationTest.insertDonorDB(dbName, collName, docs); - - const migrationId = UUID(); - const migrationIdString = extractUUIDFromObject(migrationId); - const migrationOpts = { - migrationIdString: migrationIdString, - recipientConnString: tenantMigrationTest.getRecipientConnString(), - tenantId, - }; - - // Configure a fail point to have the recipient primary hang after cloning 2 documents. - const recipientDb = recipientPrimary.getDB(dbName); - let recipientColl = isTimeSeries ? recipientDb.getCollection("system.buckets." + collName) - : recipientDb.getCollection(collName); - - const hangDuringCollectionClone = - configureFailPoint(recipientDb, - "tenantMigrationHangCollectionClonerAfterHandlingBatchResponse", - {nss: recipientColl.getFullName()}); - - // Start a migration and wait for recipient to hang after cloning 2 documents. - assert.commandWorked(tenantMigrationTest.startMigration(migrationOpts)); - hangDuringCollectionClone.wait(); - assert.soon(() => recipientColl.find().itcount() === batchSize); - - // Insert some documents that will be fetched by the recipient. This is to test that on - // failover, the fetcher will resume fetching from where it left off. The system is expected - // to crash if the recipient fetches a duplicate oplog entry upon resuming the migration. - tenantMigrationTest.insertDonorDB(dbName, "aNewColl", [{_id: "docToBeFetched"}]); - assert.soon(() => { - const configDb = recipientPrimary.getDB("config"); - const oplogBuffer = configDb.getCollection("repl.migration.oplog_" + migrationIdString); - return oplogBuffer.find({"entry.o._id": "docToBeFetched"}).count() === 1; - }); - - // Step up a new node in the recipient set and trigger a failover. The new primary should resume - // cloning starting from the third document. - const newRecipientPrimary = recipientRst.getSecondaries()[0]; - recipientRst.stepUp(newRecipientPrimary); - hangDuringCollectionClone.off(); - recipientRst.getPrimary(); - - // The migration should go through after recipient failover. - TenantMigrationTest.assertCommitted( - tenantMigrationTest.waitForMigrationToComplete(migrationOpts)); - - // Check that recipient has cloned all documents in the collection. - recipientColl = newRecipientPrimary.getDB(dbName).getCollection(collName); - assert.eq(docs.length, recipientColl.find().itcount()); - assert.docEq(docs, recipientColl.find().sort({_id: 1}).toArray()); - checkTenantDBHashes({ - donorRst: tenantMigrationTest.getDonorRst(), - recipientRst: tenantMigrationTest.getRecipientRst(), - tenantId - }); - - tenantMigrationTest.stop(); - recipientRst.stopSet(); -}; - -jsTestLog("Running tenant migration test for time-series collection"); -tenantMigrationFailoverTest(true, - (db, collName) => db.createCollection( - collName, {timeseries: {timeField: "time", metaField: "bucket"}}), - [ - // Group each document in its own bucket in order to work with the - // collectionClonerBatchSize we set at the recipient replSet. - {_id: 1, time: ISODate(), bucket: "a"}, - {_id: 2, time: ISODate(), bucket: "b"}, - {_id: 3, time: ISODate(), bucket: "c"}, - {_id: 4, time: ISODate(), bucket: "d"} - ]); - -jsTestLog("Running tenant migration test for regular collection"); -tenantMigrationFailoverTest(false, - (db, collName) => db.createCollection(collName), - [{_id: 0}, {_id: "string"}, {_id: UUID()}, {_id: new Date()}]); diff --git a/jstests/replsets/tenant_migration_resume_collection_cloner_after_recipient_failover_with_dropped_views.js b/jstests/replsets/tenant_migration_resume_collection_cloner_after_recipient_failover_with_dropped_views.js deleted file mode 100644 index 262e40edf00..00000000000 --- a/jstests/replsets/tenant_migration_resume_collection_cloner_after_recipient_failover_with_dropped_views.js +++ /dev/null @@ -1,133 +0,0 @@ -/** - * Tests that in tenant migration, the collection recreated on a dropped view namespace is handled - * correctly on resuming the logical tenant collection cloning phase due to recipient failover. - * @tags: [ - * incompatible_with_macos, - * incompatible_with_shard_merge, - * incompatible_with_windows_tls, - * requires_majority_read_concern, - * requires_persistence, - * serverless, - * ] - */ - -import {TenantMigrationTest} from "jstests/replsets/libs/tenant_migration_test.js"; -import {makeX509OptionsForTest} from "jstests/replsets/libs/tenant_migration_util.js"; - -load("jstests/libs/fail_point_util.js"); -load("jstests/libs/uuid_util.js"); // for 'extractUUIDFromObject' - -const tenantMigrationFailoverTest = function(isTimeSeries, createCollFn) { - const recipientRst = new ReplSetTest({ - nodes: 2, - name: jsTestName() + "_recipient", - serverless: true, - nodeOptions: Object.assign(makeX509OptionsForTest().recipient, { - setParameter: { - // Allow reads on recipient before migration completes for testing. - 'failpoint.tenantMigrationRecipientNotRejectReads': tojson({mode: 'alwaysOn'}), - } - }) - }); - - recipientRst.startSet(); - recipientRst.initiate(); - - const tenantMigrationTest = - new TenantMigrationTest({name: jsTestName(), recipientRst: recipientRst}); - - const donorRst = tenantMigrationTest.getDonorRst(); - const donorPrimary = donorRst.getPrimary(); - - const tenantId = ObjectId().str; - const dbName = tenantMigrationTest.tenantDB(tenantId, "testDB"); - const donorDB = donorPrimary.getDB(dbName); - const collName = "testColl"; - const donorColl = donorDB[collName]; - - let getCollectionInfo = function(conn) { - return conn.getDB(dbName).getCollectionInfos().filter(coll => { - return coll.name === collName; - }); - }; - - // Create a timeseries collection or a regular view. - assert.commandWorked(createCollFn(donorDB, collName)); - donorRst.awaitReplication(); - - const migrationId = UUID(); - const migrationIdString = extractUUIDFromObject(migrationId); - const migrationOpts = { - migrationIdString: migrationIdString, - recipientConnString: tenantMigrationTest.getRecipientConnString(), - tenantId, - }; - - const recipientPrimary = recipientRst.getPrimary(); - const recipientDb = recipientPrimary.getDB(dbName); - const recipientSystemViewsColl = recipientDb.getCollection("system.views"); - - // Configure a fail point to have the recipient primary hang after cloning - // "<OID>_testDB.system.views" collection. - const hangDuringCollectionClone = - configureFailPoint(recipientPrimary, - "tenantMigrationHangCollectionClonerAfterHandlingBatchResponse", - {nss: recipientSystemViewsColl.getFullName()}); - - // Start the migration and wait for the migration to hang after cloning - // "<OID>_testDB.system.views" collection. - assert.commandWorked(tenantMigrationTest.startMigration(migrationOpts)); - hangDuringCollectionClone.wait(); - - assert.soon(() => recipientSystemViewsColl.find().itcount() >= 1); - recipientRst.awaitLastOpCommitted(); - const newRecipientPrimary = recipientRst.getSecondaries()[0]; - - // Verify that a view has been registered for "<OID>_testDB.testColl" on the new - // recipient primary. - let collectionInfo = getCollectionInfo(newRecipientPrimary); - assert.eq(1, collectionInfo.length); - assert(collectionInfo[0].type === (isTimeSeries ? "timeseries" : "view"), - "data store type mismatch: " + tojson(collectionInfo[0])); - - // Drop the view and create a regular collection with the same namespace as the - // dropped view on donor. - assert(donorColl.drop()); - assert.commandWorked(donorDB.createCollection(collName)); - - // We need to skip TenantDatabaseCloner::listExistingCollectionsStage() to make sure - // the recipient always clone the above newly created regular collection after the failover. - // Currently, we restart cloning after a failover, only from the collection whose UUID is - // greater than or equal to the last collection we have on disk. - const skiplistExistingCollectionsStage = - configureFailPoint(newRecipientPrimary, "skiplistExistingCollectionsStage"); - - // Step up a new node in the recipient set and trigger a failover. - recipientRst.stepUp(newRecipientPrimary); - hangDuringCollectionClone.off(); - - // The migration should go through after recipient failover. - TenantMigrationTest.assertCommitted( - tenantMigrationTest.waitForMigrationToComplete(migrationOpts)); - - // Check that recipient has dropped the view and and re-created the regular collection as part - // of migration oplog catchup phase. - collectionInfo = getCollectionInfo(newRecipientPrimary); - assert.eq(1, collectionInfo.length); - assert(collectionInfo[0].type === "collection", - "data store type mismatch: " + tojson(collectionInfo[0])); - - tenantMigrationTest.stop(); - recipientRst.stopSet(); -}; - -jsTestLog("Running tenant migration test for time-series collection"); -// Creating a timeseries collection, implicity creates a view on the 'collName' collection -// namespace. -tenantMigrationFailoverTest(true, - (db, collName) => db.createCollection( - collName, {timeseries: {timeField: "time", metaField: "bucket"}})); - -jsTestLog("Running tenant migration test for regular view"); -tenantMigrationFailoverTest(false, - (db, collName) => db.createView(collName, "sourceCollection", [])); diff --git a/jstests/replsets/tenant_migration_resume_collection_cloner_after_rename.js b/jstests/replsets/tenant_migration_resume_collection_cloner_after_rename.js deleted file mode 100644 index 0919240cf24..00000000000 --- a/jstests/replsets/tenant_migration_resume_collection_cloner_after_rename.js +++ /dev/null @@ -1,124 +0,0 @@ -/** - * Tests that in tenant migration, the recipient set can resume collection cloning from the last - * document cloned after a failover even if the collection has been renamed on the donor. - * @tags: [ - * incompatible_with_macos, - * incompatible_with_shard_merge, - * incompatible_with_windows_tls, - * requires_majority_read_concern, - * requires_persistence, - * serverless, - * ] - */ - -import {TenantMigrationTest} from "jstests/replsets/libs/tenant_migration_test.js"; -import { - checkTenantDBHashes, - makeX509OptionsForTest, - runMigrationAsync -} from "jstests/replsets/libs/tenant_migration_util.js"; - -load("jstests/libs/fail_point_util.js"); -load("jstests/libs/uuid_util.js"); // for 'extractUUIDFromObject' -load("jstests/libs/parallelTester.js"); // for 'Thread' -load('jstests/replsets/rslib.js'); // 'createRstArgs' - -const recipientRst = new ReplSetTest({ - nodes: 2, - name: jsTestName() + "_recipient", - serverless: true, - nodeOptions: Object.assign(makeX509OptionsForTest().recipient, { - setParameter: { - // Use a batch size of 2 so that collection cloner requires more than a single batch to - // complete. - collectionClonerBatchSize: 2, - // Allow reads on recipient before migration completes for testing. - 'failpoint.tenantMigrationRecipientNotRejectReads': tojson({mode: 'alwaysOn'}), - } - }) -}); - -recipientRst.startSet(); -recipientRst.initiate(); - -const tenantMigrationTest = - new TenantMigrationTest({name: jsTestName(), recipientRst: recipientRst}); -const tenantId = ObjectId().str; -const dbName = tenantMigrationTest.tenantDB(tenantId, "testDB"); -const collName = "testColl"; - -const recipientPrimary = tenantMigrationTest.getRecipientPrimary(); -const donorPrimary = tenantMigrationTest.getDonorPrimary(); - -// Test _id with mixed bson types. -const docs = [{_id: 0}, {_id: "string"}, {_id: UUID()}, {_id: new Date()}]; -tenantMigrationTest.insertDonorDB(dbName, collName, docs); - -const migrationId = UUID(); -const migrationIdString = extractUUIDFromObject(migrationId); -const migrationOpts = { - migrationIdString: migrationIdString, - recipientConnString: tenantMigrationTest.getRecipientConnString(), - tenantId, -}; - -// Configure a fail point to have the recipient primary hang after cloning 2 documents. -const recipientDb = recipientPrimary.getDB(dbName); -let recipientColl = recipientDb.getCollection(collName); -const hangDuringCollectionClone = - configureFailPoint(recipientDb, - "tenantMigrationHangCollectionClonerAfterHandlingBatchResponse", - {nss: recipientColl.getFullName()}); - -// Start a migration and wait for recipient to hang after cloning 2 documents. -const donorRstArgs = createRstArgs(tenantMigrationTest.getDonorRst()); -const migrationThread = new Thread(runMigrationAsync, migrationOpts, donorRstArgs); -migrationThread.start(); -hangDuringCollectionClone.wait(); -assert.soon(() => recipientColl.find().itcount() === 2); - -// Insert some documents that will be fetched by the recipient. This is to test that on failover, -// the fetcher will resume fetching from where it left off. The system is expected to crash if -// the recipient fetches a duplicate oplog entry upon resuming the migration. -tenantMigrationTest.insertDonorDB(dbName, "aNewColl", [{_id: "docToBeFetched"}]); -assert.soon(() => { - const configDb = recipientPrimary.getDB("config"); - const oplogBuffer = configDb.getCollection("repl.migration.oplog_" + migrationIdString); - return oplogBuffer.find({"entry.o._id": "docToBeFetched"}).count() === 1; -}); - -recipientRst.awaitLastOpCommitted(); - -// Set a failpoint to prevent the new recipient primary from completing the migration before the -// donor renames the collection. -const newRecipientPrimary = recipientRst.getSecondaries()[0]; -const fpPauseAtStartOfMigration = - configureFailPoint(newRecipientPrimary, "pauseAfterRunTenantMigrationRecipientInstance"); - -// Step up a new node in the recipient set and trigger a failover. The new primary should resume -// cloning starting from the third document. -recipientRst.stepUp(newRecipientPrimary); -hangDuringCollectionClone.off(); -recipientRst.getPrimary(); - -// Rename the collection on the donor. -const donorColl = donorPrimary.getDB(dbName).getCollection(collName); -const collNameRenamed = collName + "_renamed"; -assert.commandWorked(donorColl.renameCollection(collNameRenamed)); - -// The migration should go through after recipient failover. -fpPauseAtStartOfMigration.off(); -TenantMigrationTest.assertCommitted(migrationThread.returnData()); - -// Check that recipient has cloned all documents in the renamed collection. -recipientColl = newRecipientPrimary.getDB(dbName).getCollection(collNameRenamed); -assert.eq(4, recipientColl.find().itcount()); -assert.eq(recipientColl.find().sort({_id: 1}).toArray(), docs); -checkTenantDBHashes({ - donorRst: tenantMigrationTest.getDonorRst(), - recipientRst: tenantMigrationTest.getRecipientRst(), - tenantId -}); - -tenantMigrationTest.stop(); -recipientRst.stopSet(); diff --git a/jstests/replsets/tenant_migration_resume_oplog_application.js b/jstests/replsets/tenant_migration_resume_oplog_application.js deleted file mode 100644 index 70849750914..00000000000 --- a/jstests/replsets/tenant_migration_resume_oplog_application.js +++ /dev/null @@ -1,119 +0,0 @@ -/** - * Tests that in a tenant migration, the recipient primary will resume oplog application on - * failover. - * @tags: [ - * incompatible_with_macos, - * incompatible_with_shard_merge, - * incompatible_with_windows_tls, - * requires_majority_read_concern, - * requires_persistence, - * serverless, - * ] - */ - -import {TenantMigrationTest} from "jstests/replsets/libs/tenant_migration_test.js"; -import { - checkTenantDBHashes, - makeX509OptionsForTest, - runMigrationAsync, -} from "jstests/replsets/libs/tenant_migration_util.js"; - -load("jstests/libs/fail_point_util.js"); -load("jstests/libs/uuid_util.js"); // for 'extractUUIDFromObject' -load("jstests/libs/parallelTester.js"); // for 'Thread' -load("jstests/libs/write_concern_util.js"); // for 'stopReplicationOnSecondaries' -load("jstests/aggregation/extras/utils.js"); // For assertArrayEq. -load('jstests/replsets/rslib.js'); // For 'createRstArgs' - -const recipientRst = new ReplSetTest({ - nodes: 3, - name: jsTestName() + "_recipient", - serverless: true, - // Use a batch size of 2 so that we can hang in the middle of tenant oplog application. - nodeOptions: Object.assign(makeX509OptionsForTest().recipient, - {setParameter: {tenantApplierBatchSizeOps: 2}}) -}); - -recipientRst.startSet(); -recipientRst.initiate(); - -const tenantMigrationTest = - new TenantMigrationTest({name: jsTestName(), recipientRst: recipientRst}); - -const tenantId = ObjectId().str; -const dbName = tenantMigrationTest.tenantDB(tenantId, "testDB"); -const collName = "testColl"; - -const donorPrimary = tenantMigrationTest.getDonorPrimary(); -const recipientPrimary = tenantMigrationTest.getRecipientPrimary(); -const donorRst = tenantMigrationTest.getDonorRst(); -const donorTestColl = donorPrimary.getDB(dbName).getCollection(collName); - -// Populate the donor replica set with some initial data and make sure it is majority committed. -const majorityCommittedDocs = [{_id: 0, x: 0}, {_id: 1, x: 1}]; -assert.commandWorked(donorTestColl.insert(majorityCommittedDocs, {writeConcern: {w: "majority"}})); -assert.eq(2, donorTestColl.find().readConcern("majority").itcount()); - -const migrationId = UUID(); -const migrationOpts = { - migrationIdString: extractUUIDFromObject(migrationId), - recipientConnString: tenantMigrationTest.getRecipientConnString(), - tenantId, -}; - -// Configure fail point to have the recipient primary hang after the cloner completes and the oplog -// applier has started. -let waitAfterDatabaseClone = configureFailPoint( - recipientPrimary, "fpAfterStartingOplogApplierMigrationRecipientInstance", {action: "hang"}); -// Configure fail point to hang the tenant oplog applier after it applies the first batch. -let waitInOplogApplier = configureFailPoint(recipientPrimary, "hangInTenantOplogApplication"); - -// Start a migration and wait for recipient to hang in the tenant database cloner. -const donorRstArgs = createRstArgs(donorRst); -const migrationThread = new Thread(runMigrationAsync, migrationOpts, donorRstArgs); -migrationThread.start(); -waitAfterDatabaseClone.wait(); - -// Insert some writes that will eventually be picked up by the tenant oplog applier on the -// recipient. -const docsToApply = [{_id: 2, x: 2}, {_id: 3, x: 3}, {_id: 4, x: 4}]; -tenantMigrationTest.insertDonorDB(dbName, collName, docsToApply); - -// Wait for the applied oplog batch to be replicated. -waitInOplogApplier.wait(); -recipientRst.awaitReplication(); -let local = recipientPrimary.getDB("local"); -let appliedNoOps = local.oplog.rs.find({fromTenantMigration: migrationId, op: "n"}); -let resultsArr = appliedNoOps.toArray(); -// It is possible that the first batch applied includes a resume no-op token. We do not write no-op -// entries for resume token entries in tenant migrations. -assert.gt(appliedNoOps.count(), 0, resultsArr); -assert.lte(appliedNoOps.count(), 2, resultsArr); -assert.eq(docsToApply[0], resultsArr[0].o2.o, resultsArr); -if (appliedNoOps.count() === 2) { - assert.eq(docsToApply[1], resultsArr[1].o2.o, resultsArr); -} -// Step up a new node in the recipient set and trigger a failover. The new primary should resume -// fetching starting from the unapplied documents. -const newRecipientPrimary = recipientRst.getSecondaries()[0]; -recipientRst.stepUp(newRecipientPrimary); -waitAfterDatabaseClone.off(); -waitInOplogApplier.off(); -recipientRst.getPrimary(); - -// The migration should go through after recipient failover. -TenantMigrationTest.assertCommitted(migrationThread.returnData()); -// Validate that the last no-op entry is applied. -local = newRecipientPrimary.getDB("local"); -appliedNoOps = local.oplog.rs.find({fromTenantMigration: migrationId, op: "n"}); -resultsArr = appliedNoOps.toArray(); -assert.eq(3, appliedNoOps.count(), appliedNoOps); -assert.eq(docsToApply[2], resultsArr[2].o2.o, resultsArr); - -checkTenantDBHashes({ - donorRst: tenantMigrationTest.getDonorRst(), - recipientRst: tenantMigrationTest.getRecipientRst(), - tenantId -}); -tenantMigrationTest.stop(); -recipientRst.stopSet(); diff --git a/jstests/replsets/tenant_migration_retryable_write_retry_on_recipient.js b/jstests/replsets/tenant_migration_retryable_write_retry_on_recipient.js index 245673daa62..a9ee61f0993 100644 --- a/jstests/replsets/tenant_migration_retryable_write_retry_on_recipient.js +++ b/jstests/replsets/tenant_migration_retryable_write_retry_on_recipient.js @@ -186,6 +186,7 @@ assert.commandWorked( jsTest.log("Waiting for migration to complete"); waitBeforeFetchingTransactions.off(); TenantMigrationTest.assertCommitted(migrationThread.returnData()); +tenantMigrationTest.forgetMigration(migrationOpts.migrationIdString); // Print the no-op oplog entries for debugging purposes. jsTestLog("Recipient oplog migration entries."); @@ -262,6 +263,7 @@ function testRecipientRetryableWrites(db, writes) { jsTestLog("Run retryable write on primary after the migration"); testRecipientRetryableWrites(recipientDb, beforeWrites); testRecipientRetryableWrites(recipientDb, duringWrites); + jsTestLog("Step up secondary"); const recipientRst = tenantMigrationTest.getRecipientRst(); recipientRst.stepUp(recipientRst.getSecondary()); @@ -269,8 +271,6 @@ jsTestLog("Run retryable write on secondary after the migration"); testRecipientRetryableWrites(recipientRst.getPrimary().getDB(kDbName), beforeWrites); testRecipientRetryableWrites(recipientRst.getPrimary().getDB(kDbName), duringWrites); -tenantMigrationTest.forgetMigration(migrationOpts.migrationIdString); - jsTestLog("Trying a back-to-back migration"); const tenantMigrationTest2 = new TenantMigrationTest( {name: jsTestName() + "2", donorRst: tenantMigrationTest.getRecipientRst()}); diff --git a/jstests/replsets/tenant_migration_timeseries_retryable_write_retry_on_recipient.js b/jstests/replsets/tenant_migration_timeseries_retryable_write_retry_on_recipient.js index 4a3475178f5..b5a466046f8 100644 --- a/jstests/replsets/tenant_migration_timeseries_retryable_write_retry_on_recipient.js +++ b/jstests/replsets/tenant_migration_timeseries_retryable_write_retry_on_recipient.js @@ -105,6 +105,7 @@ function testRetryOnRecipient(ordered) { jsTest.log("Waiting for migration to complete"); pauseTenantMigrationBeforeLeavingDataSyncState.off(); TenantMigrationTest.assertCommitted(migrationThread.returnData()); + tenantMigrationTest.forgetMigration(migrationOpts.migrationIdString); // Print the no-op oplog entries for debugging purposes. jsTestLog("Recipient oplog migration entries."); @@ -132,8 +133,6 @@ function testRetryOnRecipient(ordered) { testRecipientRetryableWrites(recipientRst.getPrimary().getDB(kDbName), beforeWrites); testRecipientRetryableWrites(recipientRst.getPrimary().getDB(kDbName), duringWrites); - tenantMigrationTest.forgetMigration(migrationOpts.migrationIdString); - jsTestLog("Trying a back-to-back migration"); const tenantMigrationTest2 = new TenantMigrationTest( {name: jsTestName() + "2", donorRst: tenantMigrationTest.getRecipientRst()}); |