diff options
author | Matt Broadstone <mbroadst@mongodb.com> | 2022-05-18 12:14:15 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2022-05-18 13:00:33 +0000 |
commit | 005aabeba20d762170e76e924ccb9b9b663fe983 (patch) | |
tree | 5b6ee51ab36e3394d0c2d8034d827a477d8849e8 /src/mongo/db | |
parent | fe5d7920309ac836c7b45daa0f314de442c00636 (diff) | |
download | mongo-005aabeba20d762170e76e924ccb9b9b663fe983.tar.gz |
SERVER-65076 Add stepdown passthrough suites for shard split
Diffstat (limited to 'src/mongo/db')
4 files changed, 44 insertions, 73 deletions
diff --git a/src/mongo/db/repl/tenant_migration_access_blocker_util.cpp b/src/mongo/db/repl/tenant_migration_access_blocker_util.cpp index 0cbea9ac7f3..53e7b24f135 100644 --- a/src/mongo/db/repl/tenant_migration_access_blocker_util.cpp +++ b/src/mongo/db/repl/tenant_migration_access_blocker_util.cpp @@ -448,19 +448,16 @@ void recoverTenantMigrationAccessBlockers(OperationContext* opCtx) { return true; } - // TODO(SERVER-64619) No longer use a dummy connectiong string when it is no longer a - // required parameter. - std::string dummmyRecipientConnectionString = "mongodb://FAKE_URI/?replSet=INVALID"; - auto optionalTenants = doc.getTenantIds(); invariant(optionalTenants); for (const auto& tenantId : optionalTenants.get()) { + invariant(doc.getRecipientConnectionString()); auto mtab = std::make_shared<TenantMigrationDonorAccessBlocker>( opCtx->getServiceContext(), doc.getId(), tenantId.toString(), MigrationProtocolEnum::kMultitenantMigrations, - dummmyRecipientConnectionString); + doc.getRecipientConnectionString()->toString()); TenantMigrationAccessBlockerRegistry::get(opCtx->getServiceContext()) .add(tenantId.toString(), mtab); diff --git a/src/mongo/db/serverless/shard_split_donor_op_observer.cpp b/src/mongo/db/serverless/shard_split_donor_op_observer.cpp index 900c1249458..ce1d0e55ddf 100644 --- a/src/mongo/db/serverless/shard_split_donor_op_observer.cpp +++ b/src/mongo/db/serverless/shard_split_donor_op_observer.cpp @@ -138,14 +138,19 @@ void onBlockerInitialization(OperationContext* opCtx, // The primary create and sets the tenant access blocker to blocking within the // ShardSplitDonorService. if (isSecondary(opCtx)) { - auto recipientTagName = donorStateDoc.getRecipientTagName(); - auto recipientSetName = donorStateDoc.getRecipientSetName(); - invariant(recipientTagName); - invariant(recipientSetName); - - auto config = repl::ReplicationCoordinator::get(cc().getServiceContext())->getConfig(); - auto recipientConnectionString = - serverless::makeRecipientConnectionString(config, *recipientTagName, *recipientSetName); + auto recipientConnectionString = [stateDoc = donorStateDoc]() { + if (stateDoc.getRecipientConnectionString()) { + return *stateDoc.getRecipientConnectionString(); + } + + auto recipientTagName = stateDoc.getRecipientTagName(); + invariant(recipientTagName); + auto recipientSetName = stateDoc.getRecipientSetName(); + invariant(recipientSetName); + auto config = repl::ReplicationCoordinator::get(cc().getServiceContext())->getConfig(); + return serverless::makeRecipientConnectionString( + config, *recipientTagName, *recipientSetName); + }(); for (const auto& tenantId : tenantIds) { auto mtab = std::make_shared<TenantMigrationDonorAccessBlocker>( diff --git a/src/mongo/db/serverless/shard_split_donor_service.cpp b/src/mongo/db/serverless/shard_split_donor_service.cpp index 9553aae29df..5ad7b0de654 100644 --- a/src/mongo/db/serverless/shard_split_donor_service.cpp +++ b/src/mongo/db/serverless/shard_split_donor_service.cpp @@ -58,7 +58,6 @@ namespace mongo { namespace { MONGO_FAIL_POINT_DEFINE(abortShardSplitBeforeLeavingBlockingState); -MONGO_FAIL_POINT_DEFINE(pauseShardSplitRecipientListenerCompletion); MONGO_FAIL_POINT_DEFINE(pauseShardSplitBeforeBlockingState); MONGO_FAIL_POINT_DEFINE(pauseShardSplitAfterBlocking); MONGO_FAIL_POINT_DEFINE(pauseShardSplitAfterDecision); @@ -66,6 +65,7 @@ MONGO_FAIL_POINT_DEFINE(skipShardSplitWaitForSplitAcceptance); MONGO_FAIL_POINT_DEFINE(pauseShardSplitBeforeRecipientCleanup); MONGO_FAIL_POINT_DEFINE(pauseShardSplitAfterMarkingStateGarbageCollectable); MONGO_FAIL_POINT_DEFINE(skipShardSplitRecipientCleanup); +MONGO_FAIL_POINT_DEFINE(pauseShardSplitBeforeLeavingBlockingState); const Backoff kExponentialBackoff(Seconds(1), Milliseconds::max()); @@ -197,13 +197,6 @@ SemiFuture<void> makeRecipientAcceptSplitFuture( monitor->shutdown(); } - LOGV2(6634900, - "Shutting down shard split recipient listener.", - "id"_attr = migrationId, - "recipientReady"_attr = listener->getFuture().isReady()); - - pauseShardSplitRecipientListenerCompletion.pauseWhileSet(); - return s; }) .semi(); @@ -417,10 +410,6 @@ SemiFuture<void> ShardSplitDonorService::DonorStateMachine::run( auto opCtx = _cancelableOpCtxFactory->makeOperationContext(&cc()); pauseShardSplitAfterBlocking.pauseWhileSet(opCtx.get()); - if (MONGO_unlikely(abortShardSplitBeforeLeavingBlockingState.shouldFail())) { - uasserted(ErrorCodes::InternalError, "simulate a shard split error"); - } - return _waitForRecipientToReachBlockTimestamp(executor, abortToken); }) .then([this, executor, abortToken] { @@ -655,14 +644,34 @@ ShardSplitDonorService::DonorStateMachine::_waitForRecipientToAcceptSplitAndTrig LOGV2(6142501, "Waiting for recipient to accept the split.", "id"_attr = _migrationId); - return _splitAcceptancePromise.getFuture() - .thenRunOn(**executor) + return ExecutorFuture(**executor) + .then([&]() { return _splitAcceptancePromise.getFuture(); }) + .then([this] { + auto opCtx = _cancelableOpCtxFactory->makeOperationContext(&cc()); + if (MONGO_unlikely(pauseShardSplitBeforeLeavingBlockingState.shouldFail())) { + pauseShardSplitBeforeLeavingBlockingState.execute([&](const BSONObj& data) { + if (!data.hasField("blockTimeMS")) { + pauseShardSplitBeforeLeavingBlockingState.pauseWhileSet(opCtx.get()); + } else { + const auto blockTime = Milliseconds{data.getIntField("blockTimeMS")}; + LOGV2(8423359, + "Keeping shard split in blocking state.", + "blockTime"_attr = blockTime); + opCtx->sleepFor(blockTime); + } + }); + } + + if (MONGO_unlikely(abortShardSplitBeforeLeavingBlockingState.shouldFail())) { + uasserted(ErrorCodes::InternalError, "simulate a shard split error"); + } + }) .then([this, recipients, abortToken, remoteCommandExecutor] { LOGV2(6493901, - "Triggering an election after applying the split config.", + "Triggering an election after recipient has accepted the split.", "id"_attr = _migrationId); - // replSetStepUp on a random node will succeed as long as it's note the most out-of-date + // replSetStepUp on a random node will succeed as long as it's not the most out-of-date // node (in that case at least another node will vote for it and the election will // succeed). Selecting a random node has a 2/3 chance to succeed for replSetStepUp. If // the first command fail, we know this node is the most out-of-date. Therefore we @@ -679,7 +688,7 @@ ShardSplitDonorService::DonorStateMachine::_waitForRecipientToAcceptSplitAndTrig if (!replSetStepUpStatus.isOK()) { LOGV2(6493904, "Failed to trigger an election on the recipient replica set.", - "replSetStatus"_attr = replSetStepUpStatus); + "status"_attr = replSetStepUpStatus); } // Even if replSetStepUp failed, the recipient nodes have joined the @@ -690,7 +699,7 @@ ShardSplitDonorService::DonorStateMachine::_waitForRecipientToAcceptSplitAndTrig }); }) .thenRunOn(**executor) - .onCompletion([this, executor, abortToken](Status status) { + .then([this, executor, abortToken]() { LOGV2(6142503, "Entering 'committed' state.", "id"_attr = _stateDoc.getId()); return _updateStateDocument(executor, abortToken, ShardSplitDonorStateEnum::kCommitted) diff --git a/src/mongo/db/serverless/shard_split_donor_service_test.cpp b/src/mongo/db/serverless/shard_split_donor_service_test.cpp index 3104e978af8..9c939042986 100644 --- a/src/mongo/db/serverless/shard_split_donor_service_test.cpp +++ b/src/mongo/db/serverless/shard_split_donor_service_test.cpp @@ -30,6 +30,7 @@ #include <memory> +#include "mongo/client/connection_string.h" #include "mongo/client/replica_set_monitor.h" #include "mongo/client/sdam/server_description_builder.h" #include "mongo/client/streamable_replica_set_monitor_for_testing.h" @@ -644,48 +645,6 @@ TEST_F(ShardSplitDonorServiceTest, ShardSplitDonorServiceTimeout) { ASSERT_TRUE(serviceInstance->isGarbageCollectable()); } -// Verify the recipient acceptance listener stops when the shard split aborts due to an error. If it -// does not, the test will timeout. -TEST_F(ShardSplitDonorServiceTest, ListenerTerminatesOnError) { - auto opCtx = makeOperationContext(); - auto serviceContext = getServiceContext(); - - FailPointEnableBlock fp("abortShardSplitBeforeLeavingBlockingState"); - - auto listenerCompleted = - std::make_unique<FailPointEnableBlock>("pauseShardSplitRecipientListenerCompletion"); - const auto initialTimes = listenerCompleted->initialTimesEntered(); - - // Ensure the listener is created. - ShardSplitDonorService::DonorStateMachine::setSplitAcceptanceTaskExecutor_forTest(_executor); - _skipAcceptanceFP.reset(); - - test::shard_split::ScopedTenantAccessBlocker scopedTenants(_tenantIds, opCtx.get()); - test::shard_split::reconfigToAddRecipientNodes( - serviceContext, _recipientTagName, _replSet.getHosts(), _recipientSet.getHosts()); - - auto stateDocument = defaultStateDocument(); - - auto serviceInstance = ShardSplitDonorService::DonorStateMachine::getOrCreate( - opCtx.get(), _service, stateDocument.toBSON()); - ASSERT(serviceInstance.get()); - - auto result = serviceInstance->decisionFuture().get(opCtx.get()); - - ASSERT(!!result.abortReason); - ASSERT_EQ(result.abortReason->code(), ErrorCodes::InternalError); - ASSERT_EQ(result.state, mongo::ShardSplitDonorStateEnum::kAborted); - - serviceInstance->tryForget(); - - ASSERT_OK(serviceInstance->completionFuture().getNoThrow()); - ASSERT_TRUE(serviceInstance->isGarbageCollectable()); - - // If the listener does not stop, this will hang indefinitely and the test will timeout. - (*listenerCompleted)->waitForTimesEntered(initialTimes + 1); - listenerCompleted.reset(); -} - // Abort scenario : abortSplit called before startSplit. TEST_F(ShardSplitDonorServiceTest, CreateInstanceInAbortState) { auto opCtx = makeOperationContext(); @@ -1015,6 +974,7 @@ public: auto stateDocument = defaultStateDocument(); stateDocument.setBlockTimestamp(Timestamp(1, 1)); stateDocument.setState(ShardSplitDonorStateEnum::kBlocking); + stateDocument.setRecipientConnectionString(ConnectionString::forLocal()); return stateDocument; } |