summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJason Zhang <jason.zhang@mongodb.com>2020-09-18 16:57:21 +0000
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2020-09-29 22:54:59 +0000
commitbdc9ed7d413d931d1b0097e6fd75bb127fbbc439 (patch)
treeceeba0e045506d66665632b596691ebfb1048311
parentd47784296374ed335d07c895820fdf8066e538e3 (diff)
downloadmongo-bdc9ed7d413d931d1b0097e6fd75bb127fbbc439.tar.gz
SERVER-50572 Make initial sync clear and recover a tenant migration donor's in-memory state
-rw-r--r--jstests/replsets/libs/tenant_migration_util.js11
-rw-r--r--jstests/replsets/tenant_migration_donor_initial_sync_recovery.js131
-rw-r--r--jstests/replsets/tenant_migration_donor_startup_recovery.js50
-rw-r--r--src/mongo/db/repl/SConscript1
-rw-r--r--src/mongo/db/repl/initial_syncer.cpp3
-rw-r--r--src/mongo/db/repl/initial_syncer_test.cpp29
6 files changed, 211 insertions, 14 deletions
diff --git a/jstests/replsets/libs/tenant_migration_util.js b/jstests/replsets/libs/tenant_migration_util.js
index a2313a56041..4adf50028bb 100644
--- a/jstests/replsets/libs/tenant_migration_util.js
+++ b/jstests/replsets/libs/tenant_migration_util.js
@@ -142,6 +142,14 @@ var TenantMigrationUtil = (function() {
});
}
+ /**
+ * Returns the TenantMigrationAccessBlocker associated with given the database prefix on the
+ * node.
+ */
+ function getTenantMigrationAccessBlocker(node, dbPrefix) {
+ return node.adminCommand({serverStatus: 1}).tenantMigrationAccessBlocker[dbPrefix];
+ }
+
return {
accessState,
startMigration,
@@ -150,6 +158,7 @@ var TenantMigrationUtil = (function() {
forgetMigrationRetryOnNotPrimaryErrors,
assertMigrationCommitted,
waitForMigrationToCommit,
- waitForMigrationGarbageCollection
+ waitForMigrationGarbageCollection,
+ getTenantMigrationAccessBlocker
};
})();
diff --git a/jstests/replsets/tenant_migration_donor_initial_sync_recovery.js b/jstests/replsets/tenant_migration_donor_initial_sync_recovery.js
new file mode 100644
index 00000000000..4f23e4ef9b0
--- /dev/null
+++ b/jstests/replsets/tenant_migration_donor_initial_sync_recovery.js
@@ -0,0 +1,131 @@
+/**
+ * Tests initial sync's recovery to a tenant migration's in-memory state.
+ *
+ * Tenant migrations are not expected to be run on servers with ephemeralForTest.
+ *
+ * @tags: [requires_fcv_47, requires_majority_read_concern, requires_persistence,
+ * incompatible_with_eft]
+ */
+
+(function() {
+"use strict";
+
+load("jstests/libs/fail_point_util.js");
+load("jstests/libs/parallelTester.js");
+load("jstests/replsets/libs/tenant_migration_util.js");
+
+const donorRst = new ReplSetTest(
+ {nodes: 1, name: 'donor', nodeOptions: {setParameter: {enableTenantMigrations: true}}});
+const recipientRst = new ReplSetTest(
+ {nodes: 1, name: 'recipient', nodeOptions: {setParameter: {enableTenantMigrations: true}}});
+
+donorRst.startSet();
+donorRst.initiate();
+
+recipientRst.startSet();
+recipientRst.initiate();
+
+const kMaxSleepTimeMS = 1000;
+const kDBPrefix = 'testDb';
+const kConfigDonorsNS = "config.tenantMigrationDonors";
+
+let donorPrimary = donorRst.getPrimary();
+let kRecipientConnString = recipientRst.getURL();
+
+function startMigration(host, recipientConnString, dbPrefix) {
+ const primary = new Mongo(host);
+ assert.commandWorked(primary.adminCommand({
+ donorStartMigration: 1,
+ migrationId: UUID(),
+ recipientConnectionString: recipientConnString,
+ databasePrefix: dbPrefix,
+ readPreference: {mode: "primary"}
+ }));
+}
+
+let migrationThread =
+ new Thread(startMigration, donorPrimary.host, kRecipientConnString, kDBPrefix);
+
+// Force the migration to pause after entering a randomly selected state to simulate a failure.
+Random.setRandomSeed();
+const kMigrationFpNames = [
+ "pauseTenantMigrationAfterDataSync",
+ "pauseTenantMigrationAfterBlockingStarts",
+ "abortTenantMigrationAfterBlockingStarts"
+];
+const index = Random.randInt(kMigrationFpNames.length + 1);
+if (index < kMigrationFpNames.length) {
+ configureFailPoint(donorPrimary, kMigrationFpNames[index]);
+}
+
+migrationThread.start();
+sleep(Math.random() * kMaxSleepTimeMS);
+
+// Add the initial sync node and make sure that it does not step up.
+var initialSyncNode =
+ donorRst.add({rsConfig: {priority: 0, votes: 0}, setParameter: {enableTenantMigrations: true}});
+
+donorRst.reInitiate();
+jsTestLog("Waiting for initial sync to finish.");
+donorRst.awaitSecondaryNodes();
+
+let configDonorsColl = initialSyncNode.getCollection(kConfigDonorsNS);
+let donorDoc = configDonorsColl.findOne({databasePrefix: kDBPrefix});
+if (donorDoc) {
+ let state = donorDoc.state;
+ switch (state) {
+ case "data sync":
+ assert.soon(() => TenantMigrationUtil
+ .getTenantMigrationAccessBlocker(initialSyncNode, kDBPrefix)
+ .access == TenantMigrationUtil.accessState.kAllow);
+ break;
+ case "blocking":
+ assert.soon(
+ () =>
+ TenantMigrationUtil.getTenantMigrationAccessBlocker(initialSyncNode, kDBPrefix)
+ .access == TenantMigrationUtil.accessState.kBlockingReadsAndWrites);
+ assert.soon(
+ () => bsonWoCompare(TenantMigrationUtil
+ .getTenantMigrationAccessBlocker(initialSyncNode, kDBPrefix)
+ .blockTimestamp,
+ donorDoc.blockTimestamp) == 0);
+ break;
+ case "committed":
+ assert.soon(() => TenantMigrationUtil
+ .getTenantMigrationAccessBlocker(initialSyncNode, kDBPrefix)
+ .access == TenantMigrationUtil.accessState.kReject);
+ assert.soon(
+ () => bsonWoCompare(TenantMigrationUtil
+ .getTenantMigrationAccessBlocker(initialSyncNode, kDBPrefix)
+ .commitOrAbortOpTime,
+ donorDoc.commitOrAbortOpTime) == 0);
+ assert.soon(
+ () => bsonWoCompare(TenantMigrationUtil
+ .getTenantMigrationAccessBlocker(initialSyncNode, kDBPrefix)
+ .blockTimestamp,
+ donorDoc.blockTimestamp) == 0);
+ break;
+ case "aborted":
+ assert.soon(() => TenantMigrationUtil
+ .getTenantMigrationAccessBlocker(initialSyncNode, kDBPrefix)
+ .access == TenantMigrationUtil.accessState.kAllow);
+ assert.soon(
+ () => bsonWoCompare(TenantMigrationUtil
+ .getTenantMigrationAccessBlocker(initialSyncNode, kDBPrefix)
+ .commitOrAbortOpTime,
+ donorDoc.commitOrAbortOpTime) == 0);
+ assert.soon(
+ () => bsonWoCompare(TenantMigrationUtil
+ .getTenantMigrationAccessBlocker(initialSyncNode, kDBPrefix)
+ .blockTimestamp,
+ donorDoc.blockTimestamp) == 0);
+ break;
+ default:
+ throw new Error(`Invalid state "${state}" from donor doc.`);
+ }
+}
+
+migrationThread.join();
+donorRst.stopSet();
+recipientRst.stopSet();
+})();
diff --git a/jstests/replsets/tenant_migration_donor_startup_recovery.js b/jstests/replsets/tenant_migration_donor_startup_recovery.js
index 12fc5c8e5ce..9527570ab13 100644
--- a/jstests/replsets/tenant_migration_donor_startup_recovery.js
+++ b/jstests/replsets/tenant_migration_donor_startup_recovery.js
@@ -13,6 +13,7 @@
load("jstests/libs/fail_point_util.js");
load("jstests/libs/parallelTester.js");
+load("jstests/replsets/libs/tenant_migration_util.js");
// An object that mirrors the access states for the TenantMigrationAccessBlocker.
const accessState = {
@@ -93,40 +94,63 @@ donorRst.startSet({
"failpoint.PrimaryOnlyServiceSkipRebuildingInstances": "{'mode':'alwaysOn'}"
}
});
-donorPrimary = donorRst.getPrimary();
+donorPrimary = donorRst.getPrimary();
configDonorsColl = donorPrimary.getCollection(kConfigDonorsNS);
let donorDoc = configDonorsColl.findOne({databasePrefix: kDBPrefix});
-let mtab = donorPrimary.adminCommand({serverStatus: 1}).tenantMigrationAccessBlocker;
if (donorDoc) {
let state = donorDoc.state;
switch (state) {
case "data sync":
- assert.soon(() => mtab[kDBPrefix].access == accessState.kAllow);
+ assert.soon(
+ () => TenantMigrationUtil.getTenantMigrationAccessBlocker(donorPrimary, kDBPrefix)
+ .access == TenantMigrationUtil.accessState.kAllow);
break;
case "blocking":
- assert.soon(() => mtab[kDBPrefix].access == accessState.kBlockingReadsAndWrites);
assert.soon(
- () => bsonWoCompare(mtab[kDBPrefix].blockTimestamp, donorDoc.blockTimestamp) == 0);
+ () => TenantMigrationUtil.getTenantMigrationAccessBlocker(donorPrimary, kDBPrefix)
+ .access == TenantMigrationUtil.accessState.kBlockingReadsAndWrites);
+ assert.soon(
+ () => bsonWoCompare(TenantMigrationUtil
+ .getTenantMigrationAccessBlocker(donorPrimary, kDBPrefix)
+ .blockTimestamp,
+ donorDoc.blockTimestamp) == 0);
break;
case "committed":
- assert.soon(() => mtab[kDBPrefix].access == accessState.kReject);
- assert.soon(() => bsonWoCompare(mtab[kDBPrefix].commitOrAbortOpTime,
- donorDoc.commitOrAbortOpTime) == 0);
assert.soon(
- () => bsonWoCompare(mtab[kDBPrefix].blockTimestamp, donorDoc.blockTimestamp) == 0);
+ () => TenantMigrationUtil.getTenantMigrationAccessBlocker(donorPrimary, kDBPrefix)
+ .access == TenantMigrationUtil.accessState.kReject);
+ assert.soon(
+ () => bsonWoCompare(TenantMigrationUtil
+ .getTenantMigrationAccessBlocker(donorPrimary, kDBPrefix)
+ .commitOrAbortOpTime,
+ donorDoc.commitOrAbortOpTime) == 0);
+ assert.soon(
+ () => bsonWoCompare(TenantMigrationUtil
+ .getTenantMigrationAccessBlocker(donorPrimary, kDBPrefix)
+ .blockTimestamp,
+ donorDoc.blockTimestamp) == 0);
break;
case "aborted":
- assert.soon(() => mtab[kDBPrefix].access == accessState.kAllow);
- assert.soon(() => bsonWoCompare(mtab[kDBPrefix].commitOrAbortOpTime,
- donorDoc.commitOrAbortOpTime) == 0);
assert.soon(
- () => bsonWoCompare(mtab[kDBPrefix].blockTimestamp, donorDoc.blockTimestamp) == 0);
+ () => TenantMigrationUtil.getTenantMigrationAccessBlocker(donorPrimary, kDBPrefix)
+ .access == TenantMigrationUtil.accessState.kAllow);
+ assert.soon(
+ () => bsonWoCompare(TenantMigrationUtil
+ .getTenantMigrationAccessBlocker(donorPrimary, kDBPrefix)
+ .commitOrAbortOpTime,
+ donorDoc.commitOrAbortOpTime) == 0);
+ assert.soon(
+ () => bsonWoCompare(TenantMigrationUtil
+ .getTenantMigrationAccessBlocker(donorPrimary, kDBPrefix)
+ .blockTimestamp,
+ donorDoc.blockTimestamp) == 0);
break;
default:
throw new Error(`Invalid state "${state}" from donor doc.`);
}
}
+
migrationThread.join();
donorRst.stopSet();
recipientRst.stopSet();
diff --git a/src/mongo/db/repl/SConscript b/src/mongo/db/repl/SConscript
index cd279da1fbf..140ee9bf8aa 100644
--- a/src/mongo/db/repl/SConscript
+++ b/src/mongo/db/repl/SConscript
@@ -1179,6 +1179,7 @@ env.Library(
'repl_sync_shared_data',
'rollback_checker',
'storage_interface',
+ 'tenant_migration_donor'
],
LIBDEPS_PRIVATE=[
'$BUILD_DIR/mongo/db/commands/feature_compatibility_parsers',
diff --git a/src/mongo/db/repl/initial_syncer.cpp b/src/mongo/db/repl/initial_syncer.cpp
index aaad51704bd..4df92abfa0e 100644
--- a/src/mongo/db/repl/initial_syncer.cpp
+++ b/src/mongo/db/repl/initial_syncer.cpp
@@ -60,6 +60,7 @@
#include "mongo/db/repl/replication_process.h"
#include "mongo/db/repl/storage_interface.h"
#include "mongo/db/repl/sync_source_selector.h"
+#include "mongo/db/repl/tenant_migration_donor_util.h"
#include "mongo/db/repl/transaction_oplog_application.h"
#include "mongo/db/session_txn_record_gen.h"
#include "mongo/executor/task_executor.h"
@@ -536,6 +537,8 @@ void InitialSyncer::_tearDown_inlock(OperationContext* opCtx,
const bool orderedCommit = true;
_storage->oplogDiskLocRegister(opCtx, initialDataTimestamp, orderedCommit);
+ TenantMigrationAccessBlockerByPrefix::get(opCtx->getServiceContext()).shutDown();
+ tenant_migration_donor::recoverTenantMigrationAccessBlockers(opCtx);
reconstructPreparedTransactions(opCtx, repl::OplogApplication::Mode::kInitialSync);
_replicationProcess->getConsistencyMarkers()->setInitialSyncIdIfNotSet(opCtx);
diff --git a/src/mongo/db/repl/initial_syncer_test.cpp b/src/mongo/db/repl/initial_syncer_test.cpp
index 4031d4d3e2d..1ae6fa07d0a 100644
--- a/src/mongo/db/repl/initial_syncer_test.cpp
+++ b/src/mongo/db/repl/initial_syncer_test.cpp
@@ -1957,6 +1957,8 @@ TEST_F(InitialSyncerTest,
auto net = getNet();
int baseRollbackId = 1;
FailPointEnableBlock skipReconstructPreparedTransactions("skipReconstructPreparedTransactions");
+ FailPointEnableBlock skipRecoverTenantMigrationAccessBlockers(
+ "skipRecoverTenantMigrationAccessBlockers");
{
executor::NetworkInterfaceMock::InNetworkGuard guard(net);
@@ -2059,6 +2061,9 @@ TEST_F(InitialSyncerTest,
// InitialSyncerTest does not construct ServiceEntryPoint and this causes a segmentation fault
// when reconstructPreparedTransactions uses DBDirectClient to call into ServiceEntryPoint.
FailPointEnableBlock skipReconstructPreparedTransactions("skipReconstructPreparedTransactions");
+ // Skip recovering tenant migration access blockers for the same reason as the above.
+ FailPointEnableBlock skipRecoverTenantMigrationAccessBlockers(
+ "skipRecoverTenantMigrationAccessBlockers");
auto initialSyncer = &getInitialSyncer();
auto opCtx = makeOpCtx();
@@ -2125,6 +2130,9 @@ TEST_F(
// InitialSyncerTest does not construct ServiceEntryPoint and this causes a segmentation fault
// when reconstructPreparedTransactions uses DBDirectClient to call into ServiceEntryPoint.
FailPointEnableBlock skipReconstructPreparedTransactions("skipReconstructPreparedTransactions");
+ // Skip recovering tenant migration access blockers for the same reason as the above.
+ FailPointEnableBlock skipRecoverTenantMigrationAccessBlockers(
+ "skipRecoverTenantMigrationAccessBlockers");
auto initialSyncer = &getInitialSyncer();
auto opCtx = makeOpCtx();
@@ -2431,6 +2439,9 @@ TEST_F(InitialSyncerTest, InitialSyncerRetriesLastOplogEntryFetcherNetworkError)
// InitialSyncerTest does not construct ServiceEntryPoint and this causes a segmentation fault
// when reconstructPreparedTransactions uses DBDirectClient to call into ServiceEntryPoint.
FailPointEnableBlock skipReconstructPreparedTransactions("skipReconstructPreparedTransactions");
+ // Skip recovering tenant migration access blockers for the same reason as the above.
+ FailPointEnableBlock skipRecoverTenantMigrationAccessBlockers(
+ "skipRecoverTenantMigrationAccessBlockers");
auto initialSyncer = &getInitialSyncer();
auto opCtx = makeOpCtx();
@@ -3070,6 +3081,9 @@ TEST_F(InitialSyncerTest, InitialSyncerHandlesNetworkErrorsFromRollbackCheckerAf
// InitialSyncerTest does not construct ServiceEntryPoint and this causes a segmentation fault
// when reconstructPreparedTransactions uses DBDirectClient to call into ServiceEntryPoint.
FailPointEnableBlock skipReconstructPreparedTransactions("skipReconstructPreparedTransactions");
+ // Skip recovering tenant migration access blockers for the same reason as the above.
+ FailPointEnableBlock skipRecoverTenantMigrationAccessBlockers(
+ "skipRecoverTenantMigrationAccessBlockers");
auto initialSyncer = &getInitialSyncer();
auto opCtx = makeOpCtx();
@@ -3379,6 +3393,9 @@ TEST_F(InitialSyncerTest, LastOpTimeShouldBeSetEvenIfNoOperationsAreAppliedAfter
// InitialSyncerTest does not construct ServiceEntryPoint and this causes a segmentation fault
// when reconstructPreparedTransactions uses DBDirectClient to call into ServiceEntryPoint.
FailPointEnableBlock skipReconstructPreparedTransactions("skipReconstructPreparedTransactions");
+ // Skip recovering tenant migration access blockers for the same reason as the above.
+ FailPointEnableBlock skipRecoverTenantMigrationAccessBlockers(
+ "skipRecoverTenantMigrationAccessBlockers");
auto initialSyncer = &getInitialSyncer();
auto opCtx = makeOpCtx();
@@ -4034,6 +4051,9 @@ TEST_F(InitialSyncerTest,
// InitialSyncerTest does not construct ServiceEntryPoint and this causes a segmentation fault
// when reconstructPreparedTransactions uses DBDirectClient to call into ServiceEntryPoint.
FailPointEnableBlock skipReconstructPreparedTransactions("skipReconstructPreparedTransactions");
+ // Skip recovering tenant migration access blockers for the same reason as the above.
+ FailPointEnableBlock skipRecoverTenantMigrationAccessBlockers(
+ "skipRecoverTenantMigrationAccessBlockers");
doSuccessfulInitialSyncWithOneBatch();
}
@@ -4044,6 +4064,9 @@ TEST_F(InitialSyncerTest,
// InitialSyncerTest does not construct ServiceEntryPoint and this causes a segmentation fault
// when reconstructPreparedTransactions uses DBDirectClient to call into ServiceEntryPoint.
FailPointEnableBlock skipReconstructPreparedTransactions("skipReconstructPreparedTransactions");
+ // Skip recovering tenant migration access blockers for the same reason as the above.
+ FailPointEnableBlock skipRecoverTenantMigrationAccessBlockers(
+ "skipRecoverTenantMigrationAccessBlockers");
auto initialSyncer = &getInitialSyncer();
auto opCtx = makeOpCtx();
@@ -4222,6 +4245,9 @@ TEST_F(InitialSyncerTest, GetInitialSyncProgressReturnsCorrectProgress) {
// InitialSyncerTest does not construct ServiceEntryPoint and this causes a segmentation fault
// when reconstructPreparedTransactions uses DBDirectClient to call into ServiceEntryPoint.
FailPointEnableBlock skipReconstructPreparedTransactions("skipReconstructPreparedTransactions");
+ // Skip recovering tenant migration access blockers for the same reason as the above.
+ FailPointEnableBlock skipRecoverTenantMigrationAccessBlockers(
+ "skipRecoverTenantMigrationAccessBlockers");
// Skip clearing initial sync progress so that we can check initialSyncStatus fields after
// initial sync is complete.
@@ -4535,6 +4561,9 @@ TEST_F(InitialSyncerTest, GetInitialSyncProgressReturnsCorrectProgressForNetwork
// InitialSyncerTest does not construct ServiceEntryPoint and this causes a segmentation fault
// when reconstructPreparedTransactions uses DBDirectClient to call into ServiceEntryPoint.
FailPointEnableBlock skipReconstructPreparedTransactions("skipReconstructPreparedTransactions");
+ // Skip recovering tenant migration access blockers for the same reason as the above.
+ FailPointEnableBlock skipRecoverTenantMigrationAccessBlockers(
+ "skipRecoverTenantMigrationAccessBlockers");
// Skip clearing initial sync progress so that we can check initialSyncStatus fields after
// initial sync is complete.