From c02a82e18fe3fc3cf9ed76962fe05c22bf376332 Mon Sep 17 00:00:00 2001 From: Cheahuychou Mao Date: Mon, 3 May 2021 20:18:40 +0000 Subject: SERVER-54460 Ensure that opCtx that waits on PrimaryOnlyService completion promise gets killed on stepdown --- jstests/replsets/libs/tenant_migration_test.js | 9 ++- ...on_concurrent_state_doc_removal_and_stepdown.js | 69 ++++++++++++++++++++++ .../db/commands/tenant_migration_donor_cmds.cpp | 1 + .../commands/tenant_migration_recipient_cmds.cpp | 1 + .../db/repl/primary_only_service_op_observer.cpp | 22 ++++--- .../db/repl/tenant_migration_donor_service.cpp | 4 ++ .../s/config/configsvr_reshard_collection_cmd.cpp | 1 + 7 files changed, 97 insertions(+), 10 deletions(-) create mode 100644 jstests/replsets/tenant_migration_concurrent_state_doc_removal_and_stepdown.js diff --git a/jstests/replsets/libs/tenant_migration_test.js b/jstests/replsets/libs/tenant_migration_test.js index fc9744d6460..577ab5f3f1b 100644 --- a/jstests/replsets/libs/tenant_migration_test.js +++ b/jstests/replsets/libs/tenant_migration_test.js @@ -22,6 +22,8 @@ load("jstests/replsets/libs/tenant_migration_util.js"); * each RST will contain, and 'setParameter' , an object with various server parameters. * @param {boolean} [allowDonorReadAfterMigration] whether donor would allow reads after a committed * migration. + * @param {boolean} [initiateRstWithHighElectionTimeout] whether donor and recipient replica sets + * should be initiated with high election timeout. */ function TenantMigrationTest({ name = "TenantMigrationTest", @@ -31,6 +33,7 @@ function TenantMigrationTest({ sharedOptions = {}, // Default this to true so it is easier for data consistency checks. allowStaleReadsOnDonor = true, + initiateRstWithHighElectionTimeout = true }) { const donorPassedIn = (donorRst !== undefined); const recipientPassedIn = (recipientRst !== undefined); @@ -81,7 +84,11 @@ function TenantMigrationTest({ const rstName = `${name}_${(isDonor ? "donor" : "recipient")}`; const rst = new ReplSetTest({name: rstName, nodes, nodeOptions}); rst.startSet(); - rst.initiateWithHighElectionTimeout(); + if (initiateRstWithHighElectionTimeout) { + rst.initiateWithHighElectionTimeout(); + } else { + rst.initiate(); + } return rst; } diff --git a/jstests/replsets/tenant_migration_concurrent_state_doc_removal_and_stepdown.js b/jstests/replsets/tenant_migration_concurrent_state_doc_removal_and_stepdown.js new file mode 100644 index 00000000000..7f431576382 --- /dev/null +++ b/jstests/replsets/tenant_migration_concurrent_state_doc_removal_and_stepdown.js @@ -0,0 +1,69 @@ +/** + * Tests that donorForgetMigration command doesn't hang if failover occurs immediately after the + * state doc for the migration has been removed. + * + * @tags: [requires_fcv_47, requires_majority_read_concern, incompatible_with_eft, + * incompatible_with_windows_tls, incompatible_with_macos, requires_persistence] + */ + +(function() { +"use strict"; + +load("jstests/libs/parallelTester.js"); +load("jstests/libs/fail_point_util.js"); +load("jstests/libs/uuid_util.js"); +load("jstests/replsets/libs/tenant_migration_test.js"); +load("jstests/replsets/libs/tenant_migration_util.js"); + +const tenantMigrationTest = new TenantMigrationTest({ + name: jsTestName(), + sharedOptions: { + setParameter: { + tenantMigrationGarbageCollectionDelayMS: 1, + ttlMonitorSleepSecs: 1, + } + }, + initiateRstWithHighElectionTimeout: false +}); +if (!tenantMigrationTest.isFeatureFlagEnabled()) { + jsTestLog("Skipping test because the tenant migrations feature flag is disabled"); + return; +} + +const kTenantId = "testTenantId"; + +const donorRst = tenantMigrationTest.getDonorRst(); +const donorRstArgs = TenantMigrationUtil.createRstArgs(donorRst); +let donorPrimary = tenantMigrationTest.getDonorPrimary(); + +const migrationId = UUID(); +const migrationOpts = { + migrationIdString: extractUUIDFromObject(migrationId), + tenantId: kTenantId, +}; + +assert.commandWorked(tenantMigrationTest.runMigration( + migrationOpts, false /* retryOnRetryableErrors */, false /* automaticForgetMigration */)); + +let fp = configureFailPoint(donorPrimary, + "pauseTenantMigrationDonorAfterMarkingStateGarbageCollectable"); +const forgetMigrationThread = new Thread(TenantMigrationUtil.forgetMigrationAsync, + migrationOpts.migrationIdString, + donorRstArgs, + false /* retryOnRetryableErrors */); +forgetMigrationThread.start(); +fp.wait(); +tenantMigrationTest.waitForMigrationGarbageCollection(migrationId, migrationOpts.tenantId); + +assert.commandWorked( + donorPrimary.adminCommand({replSetStepDown: ReplSetTest.kForeverSecs, force: true})); +assert.commandWorked(donorPrimary.adminCommand({replSetFreeze: 0})); +fp.off(); +donorPrimary = donorRst.getPrimary(); + +assert.commandFailedWithCode(forgetMigrationThread.returnData(), + ErrorCodes.InterruptedDueToReplStateChange); + +donorRst.stopSet(); +tenantMigrationTest.stop(); +})(); \ No newline at end of file diff --git a/src/mongo/db/commands/tenant_migration_donor_cmds.cpp b/src/mongo/db/commands/tenant_migration_donor_cmds.cpp index 129aa787b40..c81e5a551e1 100644 --- a/src/mongo/db/commands/tenant_migration_donor_cmds.cpp +++ b/src/mongo/db/commands/tenant_migration_donor_cmds.cpp @@ -181,6 +181,7 @@ public: const auto& cmd = request(); + opCtx->setAlwaysInterruptAtStepDownOrUp(); auto donorService = repl::PrimaryOnlyServiceRegistry::get(opCtx->getServiceContext()) ->lookupServiceByName(TenantMigrationDonorService::kServiceName); diff --git a/src/mongo/db/commands/tenant_migration_recipient_cmds.cpp b/src/mongo/db/commands/tenant_migration_recipient_cmds.cpp index 3d42735fd65..52f638374ca 100644 --- a/src/mongo/db/commands/tenant_migration_recipient_cmds.cpp +++ b/src/mongo/db/commands/tenant_migration_recipient_cmds.cpp @@ -190,6 +190,7 @@ public: const auto& cmd = request(); + opCtx->setAlwaysInterruptAtStepDownOrUp(); auto recipientService = repl::PrimaryOnlyServiceRegistry::get(opCtx->getServiceContext()) ->lookupServiceByName(repl::TenantMigrationRecipientService:: diff --git a/src/mongo/db/repl/primary_only_service_op_observer.cpp b/src/mongo/db/repl/primary_only_service_op_observer.cpp index 711b7e765a2..4c939efb7d1 100644 --- a/src/mongo/db/repl/primary_only_service_op_observer.cpp +++ b/src/mongo/db/repl/primary_only_service_op_observer.cpp @@ -70,10 +70,12 @@ void PrimaryOnlyServiceOpObserver::onDelete(OperationContext* opCtx, if (!service) { return; } - // Passing OK() as an argument does not invoke the interrupt() method on the instance. - // TODO(SERVER-54460): when state document deletion race is fixed in resharding, release with - // error as in 'onDropCollection()'. - service->releaseInstance(documentId, Status::OK()); + opCtx->recoveryUnit()->onCommit( + [service, documentId, nss](boost::optional unusedCommitTime) { + // Release the instance without interrupting it since for some primary-only services + // there is still work to be done after the state document is removed. + service->releaseInstance(documentId, Status::OK()); + }); } @@ -84,11 +86,13 @@ repl::OpTime PrimaryOnlyServiceOpObserver::onDropCollection(OperationContext* op const CollectionDropType dropType) { auto service = _registry->lookupServiceByNamespace(collectionName); if (service) { - // Dropping the state doc collection also interrups all the instances with 'interrupted' - // status. - service->releaseAllInstances(Status(ErrorCodes::Interrupted, - str::stream() << collectionName << " is dropped", - BSON("collection" << collectionName.toString()))); + opCtx->recoveryUnit()->onCommit( + [service, collectionName](boost::optional unusedCommitTime) { + // Release and interrupt all the instances since the state document collection is + // not supposed to be dropped. + service->releaseAllInstances(Status( + ErrorCodes::Interrupted, str::stream() << collectionName << " is dropped")); + }); } return {}; } diff --git a/src/mongo/db/repl/tenant_migration_donor_service.cpp b/src/mongo/db/repl/tenant_migration_donor_service.cpp index 35023a2f7b7..5ea899d3071 100644 --- a/src/mongo/db/repl/tenant_migration_donor_service.cpp +++ b/src/mongo/db/repl/tenant_migration_donor_service.cpp @@ -68,6 +68,7 @@ MONGO_FAIL_POINT_DEFINE(pauseTenantMigrationBeforeLeavingDataSyncState); MONGO_FAIL_POINT_DEFINE(pauseTenantMigrationBeforeFetchingKeys); MONGO_FAIL_POINT_DEFINE(pauseTenantMigrationDonorBeforeWaitingForKeysToReplicate); MONGO_FAIL_POINT_DEFINE(pauseTenantMigrationDonorBeforeMarkingStateGarbageCollectable); +MONGO_FAIL_POINT_DEFINE(pauseTenantMigrationDonorAfterMarkingStateGarbageCollectable); MONGO_FAIL_POINT_DEFINE(pauseTenantMigrationBeforeEnteringFutureChain); MONGO_FAIL_POINT_DEFINE(pauseTenantMigrationAfterFetchingAndStoringKeys); MONGO_FAIL_POINT_DEFINE(pauseTenantMigrationDonorWhileUpdatingStateDoc); @@ -1226,6 +1227,9 @@ TenantMigrationDonorService::Instance::_waitForForgetMigrationThenMarkMigrationG }) .then([this, self = shared_from_this(), executor, token](repl::OpTime opTime) { return _waitForMajorityWriteConcern(executor, std::move(opTime), token); + }) + .then([this, self = shared_from_this()] { + pauseTenantMigrationDonorAfterMarkingStateGarbageCollectable.pauseWhileSet(); }); } diff --git a/src/mongo/db/s/config/configsvr_reshard_collection_cmd.cpp b/src/mongo/db/s/config/configsvr_reshard_collection_cmd.cpp index 5cee7b9b6c0..653d1dcd006 100644 --- a/src/mongo/db/s/config/configsvr_reshard_collection_cmd.cpp +++ b/src/mongo/db/s/config/configsvr_reshard_collection_cmd.cpp @@ -141,6 +141,7 @@ public: coordinatorDoc.setPresetReshardedChunks(request().get_presetReshardedChunks()); coordinatorDoc.setNumInitialChunks(request().getNumInitialChunks()); + opCtx->setAlwaysInterruptAtStepDownOrUp(); auto registry = repl::PrimaryOnlyServiceRegistry::get(opCtx->getServiceContext()); auto service = registry->lookupServiceByName(ReshardingCoordinatorService::kServiceName); -- cgit v1.2.1