summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCheahuychou Mao <mao.cheahuychou@gmail.com>2021-05-03 20:18:40 +0000
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2021-05-04 16:09:22 +0000
commitc02a82e18fe3fc3cf9ed76962fe05c22bf376332 (patch)
tree2edfb08617311713868f562a3acd8b6852b5321f
parent00e0dd57a75e2ec51857afd38f106e3ba3c13281 (diff)
downloadmongo-c02a82e18fe3fc3cf9ed76962fe05c22bf376332.tar.gz
SERVER-54460 Ensure that opCtx that waits on PrimaryOnlyService completion promise gets killed on stepdown
-rw-r--r--jstests/replsets/libs/tenant_migration_test.js9
-rw-r--r--jstests/replsets/tenant_migration_concurrent_state_doc_removal_and_stepdown.js69
-rw-r--r--src/mongo/db/commands/tenant_migration_donor_cmds.cpp1
-rw-r--r--src/mongo/db/commands/tenant_migration_recipient_cmds.cpp1
-rw-r--r--src/mongo/db/repl/primary_only_service_op_observer.cpp22
-rw-r--r--src/mongo/db/repl/tenant_migration_donor_service.cpp4
-rw-r--r--src/mongo/db/s/config/configsvr_reshard_collection_cmd.cpp1
7 files changed, 97 insertions, 10 deletions
diff --git a/jstests/replsets/libs/tenant_migration_test.js b/jstests/replsets/libs/tenant_migration_test.js
index fc9744d6460..577ab5f3f1b 100644
--- a/jstests/replsets/libs/tenant_migration_test.js
+++ b/jstests/replsets/libs/tenant_migration_test.js
@@ -22,6 +22,8 @@ load("jstests/replsets/libs/tenant_migration_util.js");
* each RST will contain, and 'setParameter' <object>, an object with various server parameters.
* @param {boolean} [allowDonorReadAfterMigration] whether donor would allow reads after a committed
* migration.
+ * @param {boolean} [initiateRstWithHighElectionTimeout] whether donor and recipient replica sets
+ * should be initiated with high election timeout.
*/
function TenantMigrationTest({
name = "TenantMigrationTest",
@@ -31,6 +33,7 @@ function TenantMigrationTest({
sharedOptions = {},
// Default this to true so it is easier for data consistency checks.
allowStaleReadsOnDonor = true,
+ initiateRstWithHighElectionTimeout = true
}) {
const donorPassedIn = (donorRst !== undefined);
const recipientPassedIn = (recipientRst !== undefined);
@@ -81,7 +84,11 @@ function TenantMigrationTest({
const rstName = `${name}_${(isDonor ? "donor" : "recipient")}`;
const rst = new ReplSetTest({name: rstName, nodes, nodeOptions});
rst.startSet();
- rst.initiateWithHighElectionTimeout();
+ if (initiateRstWithHighElectionTimeout) {
+ rst.initiateWithHighElectionTimeout();
+ } else {
+ rst.initiate();
+ }
return rst;
}
diff --git a/jstests/replsets/tenant_migration_concurrent_state_doc_removal_and_stepdown.js b/jstests/replsets/tenant_migration_concurrent_state_doc_removal_and_stepdown.js
new file mode 100644
index 00000000000..7f431576382
--- /dev/null
+++ b/jstests/replsets/tenant_migration_concurrent_state_doc_removal_and_stepdown.js
@@ -0,0 +1,69 @@
+/**
+ * Tests that donorForgetMigration command doesn't hang if failover occurs immediately after the
+ * state doc for the migration has been removed.
+ *
+ * @tags: [requires_fcv_47, requires_majority_read_concern, incompatible_with_eft,
+ * incompatible_with_windows_tls, incompatible_with_macos, requires_persistence]
+ */
+
+(function() {
+"use strict";
+
+load("jstests/libs/parallelTester.js");
+load("jstests/libs/fail_point_util.js");
+load("jstests/libs/uuid_util.js");
+load("jstests/replsets/libs/tenant_migration_test.js");
+load("jstests/replsets/libs/tenant_migration_util.js");
+
+const tenantMigrationTest = new TenantMigrationTest({
+ name: jsTestName(),
+ sharedOptions: {
+ setParameter: {
+ tenantMigrationGarbageCollectionDelayMS: 1,
+ ttlMonitorSleepSecs: 1,
+ }
+ },
+ initiateRstWithHighElectionTimeout: false
+});
+if (!tenantMigrationTest.isFeatureFlagEnabled()) {
+ jsTestLog("Skipping test because the tenant migrations feature flag is disabled");
+ return;
+}
+
+const kTenantId = "testTenantId";
+
+const donorRst = tenantMigrationTest.getDonorRst();
+const donorRstArgs = TenantMigrationUtil.createRstArgs(donorRst);
+let donorPrimary = tenantMigrationTest.getDonorPrimary();
+
+const migrationId = UUID();
+const migrationOpts = {
+ migrationIdString: extractUUIDFromObject(migrationId),
+ tenantId: kTenantId,
+};
+
+assert.commandWorked(tenantMigrationTest.runMigration(
+ migrationOpts, false /* retryOnRetryableErrors */, false /* automaticForgetMigration */));
+
+let fp = configureFailPoint(donorPrimary,
+ "pauseTenantMigrationDonorAfterMarkingStateGarbageCollectable");
+const forgetMigrationThread = new Thread(TenantMigrationUtil.forgetMigrationAsync,
+ migrationOpts.migrationIdString,
+ donorRstArgs,
+ false /* retryOnRetryableErrors */);
+forgetMigrationThread.start();
+fp.wait();
+tenantMigrationTest.waitForMigrationGarbageCollection(migrationId, migrationOpts.tenantId);
+
+assert.commandWorked(
+ donorPrimary.adminCommand({replSetStepDown: ReplSetTest.kForeverSecs, force: true}));
+assert.commandWorked(donorPrimary.adminCommand({replSetFreeze: 0}));
+fp.off();
+donorPrimary = donorRst.getPrimary();
+
+assert.commandFailedWithCode(forgetMigrationThread.returnData(),
+ ErrorCodes.InterruptedDueToReplStateChange);
+
+donorRst.stopSet();
+tenantMigrationTest.stop();
+})(); \ No newline at end of file
diff --git a/src/mongo/db/commands/tenant_migration_donor_cmds.cpp b/src/mongo/db/commands/tenant_migration_donor_cmds.cpp
index 129aa787b40..c81e5a551e1 100644
--- a/src/mongo/db/commands/tenant_migration_donor_cmds.cpp
+++ b/src/mongo/db/commands/tenant_migration_donor_cmds.cpp
@@ -181,6 +181,7 @@ public:
const auto& cmd = request();
+ opCtx->setAlwaysInterruptAtStepDownOrUp();
auto donorService =
repl::PrimaryOnlyServiceRegistry::get(opCtx->getServiceContext())
->lookupServiceByName(TenantMigrationDonorService::kServiceName);
diff --git a/src/mongo/db/commands/tenant_migration_recipient_cmds.cpp b/src/mongo/db/commands/tenant_migration_recipient_cmds.cpp
index 3d42735fd65..52f638374ca 100644
--- a/src/mongo/db/commands/tenant_migration_recipient_cmds.cpp
+++ b/src/mongo/db/commands/tenant_migration_recipient_cmds.cpp
@@ -190,6 +190,7 @@ public:
const auto& cmd = request();
+ opCtx->setAlwaysInterruptAtStepDownOrUp();
auto recipientService =
repl::PrimaryOnlyServiceRegistry::get(opCtx->getServiceContext())
->lookupServiceByName(repl::TenantMigrationRecipientService::
diff --git a/src/mongo/db/repl/primary_only_service_op_observer.cpp b/src/mongo/db/repl/primary_only_service_op_observer.cpp
index 711b7e765a2..4c939efb7d1 100644
--- a/src/mongo/db/repl/primary_only_service_op_observer.cpp
+++ b/src/mongo/db/repl/primary_only_service_op_observer.cpp
@@ -70,10 +70,12 @@ void PrimaryOnlyServiceOpObserver::onDelete(OperationContext* opCtx,
if (!service) {
return;
}
- // Passing OK() as an argument does not invoke the interrupt() method on the instance.
- // TODO(SERVER-54460): when state document deletion race is fixed in resharding, release with
- // error as in 'onDropCollection()'.
- service->releaseInstance(documentId, Status::OK());
+ opCtx->recoveryUnit()->onCommit(
+ [service, documentId, nss](boost::optional<Timestamp> unusedCommitTime) {
+ // Release the instance without interrupting it since for some primary-only services
+ // there is still work to be done after the state document is removed.
+ service->releaseInstance(documentId, Status::OK());
+ });
}
@@ -84,11 +86,13 @@ repl::OpTime PrimaryOnlyServiceOpObserver::onDropCollection(OperationContext* op
const CollectionDropType dropType) {
auto service = _registry->lookupServiceByNamespace(collectionName);
if (service) {
- // Dropping the state doc collection also interrups all the instances with 'interrupted'
- // status.
- service->releaseAllInstances(Status(ErrorCodes::Interrupted,
- str::stream() << collectionName << " is dropped",
- BSON("collection" << collectionName.toString())));
+ opCtx->recoveryUnit()->onCommit(
+ [service, collectionName](boost::optional<Timestamp> unusedCommitTime) {
+ // Release and interrupt all the instances since the state document collection is
+ // not supposed to be dropped.
+ service->releaseAllInstances(Status(
+ ErrorCodes::Interrupted, str::stream() << collectionName << " is dropped"));
+ });
}
return {};
}
diff --git a/src/mongo/db/repl/tenant_migration_donor_service.cpp b/src/mongo/db/repl/tenant_migration_donor_service.cpp
index 35023a2f7b7..5ea899d3071 100644
--- a/src/mongo/db/repl/tenant_migration_donor_service.cpp
+++ b/src/mongo/db/repl/tenant_migration_donor_service.cpp
@@ -68,6 +68,7 @@ MONGO_FAIL_POINT_DEFINE(pauseTenantMigrationBeforeLeavingDataSyncState);
MONGO_FAIL_POINT_DEFINE(pauseTenantMigrationBeforeFetchingKeys);
MONGO_FAIL_POINT_DEFINE(pauseTenantMigrationDonorBeforeWaitingForKeysToReplicate);
MONGO_FAIL_POINT_DEFINE(pauseTenantMigrationDonorBeforeMarkingStateGarbageCollectable);
+MONGO_FAIL_POINT_DEFINE(pauseTenantMigrationDonorAfterMarkingStateGarbageCollectable);
MONGO_FAIL_POINT_DEFINE(pauseTenantMigrationBeforeEnteringFutureChain);
MONGO_FAIL_POINT_DEFINE(pauseTenantMigrationAfterFetchingAndStoringKeys);
MONGO_FAIL_POINT_DEFINE(pauseTenantMigrationDonorWhileUpdatingStateDoc);
@@ -1226,6 +1227,9 @@ TenantMigrationDonorService::Instance::_waitForForgetMigrationThenMarkMigrationG
})
.then([this, self = shared_from_this(), executor, token](repl::OpTime opTime) {
return _waitForMajorityWriteConcern(executor, std::move(opTime), token);
+ })
+ .then([this, self = shared_from_this()] {
+ pauseTenantMigrationDonorAfterMarkingStateGarbageCollectable.pauseWhileSet();
});
}
diff --git a/src/mongo/db/s/config/configsvr_reshard_collection_cmd.cpp b/src/mongo/db/s/config/configsvr_reshard_collection_cmd.cpp
index 5cee7b9b6c0..653d1dcd006 100644
--- a/src/mongo/db/s/config/configsvr_reshard_collection_cmd.cpp
+++ b/src/mongo/db/s/config/configsvr_reshard_collection_cmd.cpp
@@ -141,6 +141,7 @@ public:
coordinatorDoc.setPresetReshardedChunks(request().get_presetReshardedChunks());
coordinatorDoc.setNumInitialChunks(request().getNumInitialChunks());
+ opCtx->setAlwaysInterruptAtStepDownOrUp();
auto registry = repl::PrimaryOnlyServiceRegistry::get(opCtx->getServiceContext());
auto service =
registry->lookupServiceByName(ReshardingCoordinatorService::kServiceName);