diff options
author | Jason Zhang <jason.zhang@mongodb.com> | 2020-09-18 16:57:21 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2020-09-29 22:54:59 +0000 |
commit | bdc9ed7d413d931d1b0097e6fd75bb127fbbc439 (patch) | |
tree | ceeba0e045506d66665632b596691ebfb1048311 | |
parent | d47784296374ed335d07c895820fdf8066e538e3 (diff) | |
download | mongo-bdc9ed7d413d931d1b0097e6fd75bb127fbbc439.tar.gz |
SERVER-50572 Make initial sync clear and recover a tenant migration donor's in-memory state
-rw-r--r-- | jstests/replsets/libs/tenant_migration_util.js | 11 | ||||
-rw-r--r-- | jstests/replsets/tenant_migration_donor_initial_sync_recovery.js | 131 | ||||
-rw-r--r-- | jstests/replsets/tenant_migration_donor_startup_recovery.js | 50 | ||||
-rw-r--r-- | src/mongo/db/repl/SConscript | 1 | ||||
-rw-r--r-- | src/mongo/db/repl/initial_syncer.cpp | 3 | ||||
-rw-r--r-- | src/mongo/db/repl/initial_syncer_test.cpp | 29 |
6 files changed, 211 insertions, 14 deletions
diff --git a/jstests/replsets/libs/tenant_migration_util.js b/jstests/replsets/libs/tenant_migration_util.js index a2313a56041..4adf50028bb 100644 --- a/jstests/replsets/libs/tenant_migration_util.js +++ b/jstests/replsets/libs/tenant_migration_util.js @@ -142,6 +142,14 @@ var TenantMigrationUtil = (function() { }); } + /** + * Returns the TenantMigrationAccessBlocker associated with given the database prefix on the + * node. + */ + function getTenantMigrationAccessBlocker(node, dbPrefix) { + return node.adminCommand({serverStatus: 1}).tenantMigrationAccessBlocker[dbPrefix]; + } + return { accessState, startMigration, @@ -150,6 +158,7 @@ var TenantMigrationUtil = (function() { forgetMigrationRetryOnNotPrimaryErrors, assertMigrationCommitted, waitForMigrationToCommit, - waitForMigrationGarbageCollection + waitForMigrationGarbageCollection, + getTenantMigrationAccessBlocker }; })(); diff --git a/jstests/replsets/tenant_migration_donor_initial_sync_recovery.js b/jstests/replsets/tenant_migration_donor_initial_sync_recovery.js new file mode 100644 index 00000000000..4f23e4ef9b0 --- /dev/null +++ b/jstests/replsets/tenant_migration_donor_initial_sync_recovery.js @@ -0,0 +1,131 @@ +/** + * Tests initial sync's recovery to a tenant migration's in-memory state. + * + * Tenant migrations are not expected to be run on servers with ephemeralForTest. + * + * @tags: [requires_fcv_47, requires_majority_read_concern, requires_persistence, + * incompatible_with_eft] + */ + +(function() { +"use strict"; + +load("jstests/libs/fail_point_util.js"); +load("jstests/libs/parallelTester.js"); +load("jstests/replsets/libs/tenant_migration_util.js"); + +const donorRst = new ReplSetTest( + {nodes: 1, name: 'donor', nodeOptions: {setParameter: {enableTenantMigrations: true}}}); +const recipientRst = new ReplSetTest( + {nodes: 1, name: 'recipient', nodeOptions: {setParameter: {enableTenantMigrations: true}}}); + +donorRst.startSet(); +donorRst.initiate(); + +recipientRst.startSet(); +recipientRst.initiate(); + +const kMaxSleepTimeMS = 1000; +const kDBPrefix = 'testDb'; +const kConfigDonorsNS = "config.tenantMigrationDonors"; + +let donorPrimary = donorRst.getPrimary(); +let kRecipientConnString = recipientRst.getURL(); + +function startMigration(host, recipientConnString, dbPrefix) { + const primary = new Mongo(host); + assert.commandWorked(primary.adminCommand({ + donorStartMigration: 1, + migrationId: UUID(), + recipientConnectionString: recipientConnString, + databasePrefix: dbPrefix, + readPreference: {mode: "primary"} + })); +} + +let migrationThread = + new Thread(startMigration, donorPrimary.host, kRecipientConnString, kDBPrefix); + +// Force the migration to pause after entering a randomly selected state to simulate a failure. +Random.setRandomSeed(); +const kMigrationFpNames = [ + "pauseTenantMigrationAfterDataSync", + "pauseTenantMigrationAfterBlockingStarts", + "abortTenantMigrationAfterBlockingStarts" +]; +const index = Random.randInt(kMigrationFpNames.length + 1); +if (index < kMigrationFpNames.length) { + configureFailPoint(donorPrimary, kMigrationFpNames[index]); +} + +migrationThread.start(); +sleep(Math.random() * kMaxSleepTimeMS); + +// Add the initial sync node and make sure that it does not step up. +var initialSyncNode = + donorRst.add({rsConfig: {priority: 0, votes: 0}, setParameter: {enableTenantMigrations: true}}); + +donorRst.reInitiate(); +jsTestLog("Waiting for initial sync to finish."); +donorRst.awaitSecondaryNodes(); + +let configDonorsColl = initialSyncNode.getCollection(kConfigDonorsNS); +let donorDoc = configDonorsColl.findOne({databasePrefix: kDBPrefix}); +if (donorDoc) { + let state = donorDoc.state; + switch (state) { + case "data sync": + assert.soon(() => TenantMigrationUtil + .getTenantMigrationAccessBlocker(initialSyncNode, kDBPrefix) + .access == TenantMigrationUtil.accessState.kAllow); + break; + case "blocking": + assert.soon( + () => + TenantMigrationUtil.getTenantMigrationAccessBlocker(initialSyncNode, kDBPrefix) + .access == TenantMigrationUtil.accessState.kBlockingReadsAndWrites); + assert.soon( + () => bsonWoCompare(TenantMigrationUtil + .getTenantMigrationAccessBlocker(initialSyncNode, kDBPrefix) + .blockTimestamp, + donorDoc.blockTimestamp) == 0); + break; + case "committed": + assert.soon(() => TenantMigrationUtil + .getTenantMigrationAccessBlocker(initialSyncNode, kDBPrefix) + .access == TenantMigrationUtil.accessState.kReject); + assert.soon( + () => bsonWoCompare(TenantMigrationUtil + .getTenantMigrationAccessBlocker(initialSyncNode, kDBPrefix) + .commitOrAbortOpTime, + donorDoc.commitOrAbortOpTime) == 0); + assert.soon( + () => bsonWoCompare(TenantMigrationUtil + .getTenantMigrationAccessBlocker(initialSyncNode, kDBPrefix) + .blockTimestamp, + donorDoc.blockTimestamp) == 0); + break; + case "aborted": + assert.soon(() => TenantMigrationUtil + .getTenantMigrationAccessBlocker(initialSyncNode, kDBPrefix) + .access == TenantMigrationUtil.accessState.kAllow); + assert.soon( + () => bsonWoCompare(TenantMigrationUtil + .getTenantMigrationAccessBlocker(initialSyncNode, kDBPrefix) + .commitOrAbortOpTime, + donorDoc.commitOrAbortOpTime) == 0); + assert.soon( + () => bsonWoCompare(TenantMigrationUtil + .getTenantMigrationAccessBlocker(initialSyncNode, kDBPrefix) + .blockTimestamp, + donorDoc.blockTimestamp) == 0); + break; + default: + throw new Error(`Invalid state "${state}" from donor doc.`); + } +} + +migrationThread.join(); +donorRst.stopSet(); +recipientRst.stopSet(); +})(); diff --git a/jstests/replsets/tenant_migration_donor_startup_recovery.js b/jstests/replsets/tenant_migration_donor_startup_recovery.js index 12fc5c8e5ce..9527570ab13 100644 --- a/jstests/replsets/tenant_migration_donor_startup_recovery.js +++ b/jstests/replsets/tenant_migration_donor_startup_recovery.js @@ -13,6 +13,7 @@ load("jstests/libs/fail_point_util.js"); load("jstests/libs/parallelTester.js"); +load("jstests/replsets/libs/tenant_migration_util.js"); // An object that mirrors the access states for the TenantMigrationAccessBlocker. const accessState = { @@ -93,40 +94,63 @@ donorRst.startSet({ "failpoint.PrimaryOnlyServiceSkipRebuildingInstances": "{'mode':'alwaysOn'}" } }); -donorPrimary = donorRst.getPrimary(); +donorPrimary = donorRst.getPrimary(); configDonorsColl = donorPrimary.getCollection(kConfigDonorsNS); let donorDoc = configDonorsColl.findOne({databasePrefix: kDBPrefix}); -let mtab = donorPrimary.adminCommand({serverStatus: 1}).tenantMigrationAccessBlocker; if (donorDoc) { let state = donorDoc.state; switch (state) { case "data sync": - assert.soon(() => mtab[kDBPrefix].access == accessState.kAllow); + assert.soon( + () => TenantMigrationUtil.getTenantMigrationAccessBlocker(donorPrimary, kDBPrefix) + .access == TenantMigrationUtil.accessState.kAllow); break; case "blocking": - assert.soon(() => mtab[kDBPrefix].access == accessState.kBlockingReadsAndWrites); assert.soon( - () => bsonWoCompare(mtab[kDBPrefix].blockTimestamp, donorDoc.blockTimestamp) == 0); + () => TenantMigrationUtil.getTenantMigrationAccessBlocker(donorPrimary, kDBPrefix) + .access == TenantMigrationUtil.accessState.kBlockingReadsAndWrites); + assert.soon( + () => bsonWoCompare(TenantMigrationUtil + .getTenantMigrationAccessBlocker(donorPrimary, kDBPrefix) + .blockTimestamp, + donorDoc.blockTimestamp) == 0); break; case "committed": - assert.soon(() => mtab[kDBPrefix].access == accessState.kReject); - assert.soon(() => bsonWoCompare(mtab[kDBPrefix].commitOrAbortOpTime, - donorDoc.commitOrAbortOpTime) == 0); assert.soon( - () => bsonWoCompare(mtab[kDBPrefix].blockTimestamp, donorDoc.blockTimestamp) == 0); + () => TenantMigrationUtil.getTenantMigrationAccessBlocker(donorPrimary, kDBPrefix) + .access == TenantMigrationUtil.accessState.kReject); + assert.soon( + () => bsonWoCompare(TenantMigrationUtil + .getTenantMigrationAccessBlocker(donorPrimary, kDBPrefix) + .commitOrAbortOpTime, + donorDoc.commitOrAbortOpTime) == 0); + assert.soon( + () => bsonWoCompare(TenantMigrationUtil + .getTenantMigrationAccessBlocker(donorPrimary, kDBPrefix) + .blockTimestamp, + donorDoc.blockTimestamp) == 0); break; case "aborted": - assert.soon(() => mtab[kDBPrefix].access == accessState.kAllow); - assert.soon(() => bsonWoCompare(mtab[kDBPrefix].commitOrAbortOpTime, - donorDoc.commitOrAbortOpTime) == 0); assert.soon( - () => bsonWoCompare(mtab[kDBPrefix].blockTimestamp, donorDoc.blockTimestamp) == 0); + () => TenantMigrationUtil.getTenantMigrationAccessBlocker(donorPrimary, kDBPrefix) + .access == TenantMigrationUtil.accessState.kAllow); + assert.soon( + () => bsonWoCompare(TenantMigrationUtil + .getTenantMigrationAccessBlocker(donorPrimary, kDBPrefix) + .commitOrAbortOpTime, + donorDoc.commitOrAbortOpTime) == 0); + assert.soon( + () => bsonWoCompare(TenantMigrationUtil + .getTenantMigrationAccessBlocker(donorPrimary, kDBPrefix) + .blockTimestamp, + donorDoc.blockTimestamp) == 0); break; default: throw new Error(`Invalid state "${state}" from donor doc.`); } } + migrationThread.join(); donorRst.stopSet(); recipientRst.stopSet(); diff --git a/src/mongo/db/repl/SConscript b/src/mongo/db/repl/SConscript index cd279da1fbf..140ee9bf8aa 100644 --- a/src/mongo/db/repl/SConscript +++ b/src/mongo/db/repl/SConscript @@ -1179,6 +1179,7 @@ env.Library( 'repl_sync_shared_data', 'rollback_checker', 'storage_interface', + 'tenant_migration_donor' ], LIBDEPS_PRIVATE=[ '$BUILD_DIR/mongo/db/commands/feature_compatibility_parsers', diff --git a/src/mongo/db/repl/initial_syncer.cpp b/src/mongo/db/repl/initial_syncer.cpp index aaad51704bd..4df92abfa0e 100644 --- a/src/mongo/db/repl/initial_syncer.cpp +++ b/src/mongo/db/repl/initial_syncer.cpp @@ -60,6 +60,7 @@ #include "mongo/db/repl/replication_process.h" #include "mongo/db/repl/storage_interface.h" #include "mongo/db/repl/sync_source_selector.h" +#include "mongo/db/repl/tenant_migration_donor_util.h" #include "mongo/db/repl/transaction_oplog_application.h" #include "mongo/db/session_txn_record_gen.h" #include "mongo/executor/task_executor.h" @@ -536,6 +537,8 @@ void InitialSyncer::_tearDown_inlock(OperationContext* opCtx, const bool orderedCommit = true; _storage->oplogDiskLocRegister(opCtx, initialDataTimestamp, orderedCommit); + TenantMigrationAccessBlockerByPrefix::get(opCtx->getServiceContext()).shutDown(); + tenant_migration_donor::recoverTenantMigrationAccessBlockers(opCtx); reconstructPreparedTransactions(opCtx, repl::OplogApplication::Mode::kInitialSync); _replicationProcess->getConsistencyMarkers()->setInitialSyncIdIfNotSet(opCtx); diff --git a/src/mongo/db/repl/initial_syncer_test.cpp b/src/mongo/db/repl/initial_syncer_test.cpp index 4031d4d3e2d..1ae6fa07d0a 100644 --- a/src/mongo/db/repl/initial_syncer_test.cpp +++ b/src/mongo/db/repl/initial_syncer_test.cpp @@ -1957,6 +1957,8 @@ TEST_F(InitialSyncerTest, auto net = getNet(); int baseRollbackId = 1; FailPointEnableBlock skipReconstructPreparedTransactions("skipReconstructPreparedTransactions"); + FailPointEnableBlock skipRecoverTenantMigrationAccessBlockers( + "skipRecoverTenantMigrationAccessBlockers"); { executor::NetworkInterfaceMock::InNetworkGuard guard(net); @@ -2059,6 +2061,9 @@ TEST_F(InitialSyncerTest, // InitialSyncerTest does not construct ServiceEntryPoint and this causes a segmentation fault // when reconstructPreparedTransactions uses DBDirectClient to call into ServiceEntryPoint. FailPointEnableBlock skipReconstructPreparedTransactions("skipReconstructPreparedTransactions"); + // Skip recovering tenant migration access blockers for the same reason as the above. + FailPointEnableBlock skipRecoverTenantMigrationAccessBlockers( + "skipRecoverTenantMigrationAccessBlockers"); auto initialSyncer = &getInitialSyncer(); auto opCtx = makeOpCtx(); @@ -2125,6 +2130,9 @@ TEST_F( // InitialSyncerTest does not construct ServiceEntryPoint and this causes a segmentation fault // when reconstructPreparedTransactions uses DBDirectClient to call into ServiceEntryPoint. FailPointEnableBlock skipReconstructPreparedTransactions("skipReconstructPreparedTransactions"); + // Skip recovering tenant migration access blockers for the same reason as the above. + FailPointEnableBlock skipRecoverTenantMigrationAccessBlockers( + "skipRecoverTenantMigrationAccessBlockers"); auto initialSyncer = &getInitialSyncer(); auto opCtx = makeOpCtx(); @@ -2431,6 +2439,9 @@ TEST_F(InitialSyncerTest, InitialSyncerRetriesLastOplogEntryFetcherNetworkError) // InitialSyncerTest does not construct ServiceEntryPoint and this causes a segmentation fault // when reconstructPreparedTransactions uses DBDirectClient to call into ServiceEntryPoint. FailPointEnableBlock skipReconstructPreparedTransactions("skipReconstructPreparedTransactions"); + // Skip recovering tenant migration access blockers for the same reason as the above. + FailPointEnableBlock skipRecoverTenantMigrationAccessBlockers( + "skipRecoverTenantMigrationAccessBlockers"); auto initialSyncer = &getInitialSyncer(); auto opCtx = makeOpCtx(); @@ -3070,6 +3081,9 @@ TEST_F(InitialSyncerTest, InitialSyncerHandlesNetworkErrorsFromRollbackCheckerAf // InitialSyncerTest does not construct ServiceEntryPoint and this causes a segmentation fault // when reconstructPreparedTransactions uses DBDirectClient to call into ServiceEntryPoint. FailPointEnableBlock skipReconstructPreparedTransactions("skipReconstructPreparedTransactions"); + // Skip recovering tenant migration access blockers for the same reason as the above. + FailPointEnableBlock skipRecoverTenantMigrationAccessBlockers( + "skipRecoverTenantMigrationAccessBlockers"); auto initialSyncer = &getInitialSyncer(); auto opCtx = makeOpCtx(); @@ -3379,6 +3393,9 @@ TEST_F(InitialSyncerTest, LastOpTimeShouldBeSetEvenIfNoOperationsAreAppliedAfter // InitialSyncerTest does not construct ServiceEntryPoint and this causes a segmentation fault // when reconstructPreparedTransactions uses DBDirectClient to call into ServiceEntryPoint. FailPointEnableBlock skipReconstructPreparedTransactions("skipReconstructPreparedTransactions"); + // Skip recovering tenant migration access blockers for the same reason as the above. + FailPointEnableBlock skipRecoverTenantMigrationAccessBlockers( + "skipRecoverTenantMigrationAccessBlockers"); auto initialSyncer = &getInitialSyncer(); auto opCtx = makeOpCtx(); @@ -4034,6 +4051,9 @@ TEST_F(InitialSyncerTest, // InitialSyncerTest does not construct ServiceEntryPoint and this causes a segmentation fault // when reconstructPreparedTransactions uses DBDirectClient to call into ServiceEntryPoint. FailPointEnableBlock skipReconstructPreparedTransactions("skipReconstructPreparedTransactions"); + // Skip recovering tenant migration access blockers for the same reason as the above. + FailPointEnableBlock skipRecoverTenantMigrationAccessBlockers( + "skipRecoverTenantMigrationAccessBlockers"); doSuccessfulInitialSyncWithOneBatch(); } @@ -4044,6 +4064,9 @@ TEST_F(InitialSyncerTest, // InitialSyncerTest does not construct ServiceEntryPoint and this causes a segmentation fault // when reconstructPreparedTransactions uses DBDirectClient to call into ServiceEntryPoint. FailPointEnableBlock skipReconstructPreparedTransactions("skipReconstructPreparedTransactions"); + // Skip recovering tenant migration access blockers for the same reason as the above. + FailPointEnableBlock skipRecoverTenantMigrationAccessBlockers( + "skipRecoverTenantMigrationAccessBlockers"); auto initialSyncer = &getInitialSyncer(); auto opCtx = makeOpCtx(); @@ -4222,6 +4245,9 @@ TEST_F(InitialSyncerTest, GetInitialSyncProgressReturnsCorrectProgress) { // InitialSyncerTest does not construct ServiceEntryPoint and this causes a segmentation fault // when reconstructPreparedTransactions uses DBDirectClient to call into ServiceEntryPoint. FailPointEnableBlock skipReconstructPreparedTransactions("skipReconstructPreparedTransactions"); + // Skip recovering tenant migration access blockers for the same reason as the above. + FailPointEnableBlock skipRecoverTenantMigrationAccessBlockers( + "skipRecoverTenantMigrationAccessBlockers"); // Skip clearing initial sync progress so that we can check initialSyncStatus fields after // initial sync is complete. @@ -4535,6 +4561,9 @@ TEST_F(InitialSyncerTest, GetInitialSyncProgressReturnsCorrectProgressForNetwork // InitialSyncerTest does not construct ServiceEntryPoint and this causes a segmentation fault // when reconstructPreparedTransactions uses DBDirectClient to call into ServiceEntryPoint. FailPointEnableBlock skipReconstructPreparedTransactions("skipReconstructPreparedTransactions"); + // Skip recovering tenant migration access blockers for the same reason as the above. + FailPointEnableBlock skipRecoverTenantMigrationAccessBlockers( + "skipRecoverTenantMigrationAccessBlockers"); // Skip clearing initial sync progress so that we can check initialSyncStatus fields after // initial sync is complete. |