From ec65af7e2185c7e266bc6a04ea7777eea713fa58 Mon Sep 17 00:00:00 2001 From: nandinibhartiyaMDB Date: Thu, 3 Nov 2022 12:45:29 +0000 Subject: SERVER-70373: Avoid invariant failure while aborting a resharding operation (cherry picked from commit fed6f5b5c41b8f9ba5451192f5bf72eb7742c1d5) SERVER-71112: Fix resharding failures count in unittest (cherry picked from commit d8329fbe00da54b5f8b0f60889364e7920968756) --- src/mongo/db/s/resharding/resharding_metrics.cpp | 7 ++ .../resharding_recipient_service_test.cpp | 102 ++++++++++++--------- 2 files changed, 64 insertions(+), 45 deletions(-) diff --git a/src/mongo/db/s/resharding/resharding_metrics.cpp b/src/mongo/db/s/resharding/resharding_metrics.cpp index fcd4abc7f27..bda2e2379c7 100644 --- a/src/mongo/db/s/resharding/resharding_metrics.cpp +++ b/src/mongo/db/s/resharding/resharding_metrics.cpp @@ -536,6 +536,9 @@ void ReshardingMetrics::setDonorState(DonorStateEnum state) noexcept { void ReshardingMetrics::setRecipientState(RecipientStateEnum state) noexcept { stdx::lock_guard lk(_mutex); + if (!_currentOp && state == RecipientStateEnum::kDone) { + return; + } invariant(_currentOp, kNoOperationInProgress); const auto oldState = std::exchange(_currentOp->recipientState, state); @@ -686,6 +689,10 @@ void ReshardingMetrics::enterCriticalSection(Date_t start) { void ReshardingMetrics::leaveCriticalSection(Date_t end) { stdx::lock_guard lk(_mutex); + if (!_currentOp) { + return; + } + _currentOp->inCriticalSection.forceEnd(end); } diff --git a/src/mongo/db/s/resharding/resharding_recipient_service_test.cpp b/src/mongo/db/s/resharding/resharding_recipient_service_test.cpp index 0a8beca187e..0d861d0a686 100644 --- a/src/mongo/db/s/resharding/resharding_recipient_service_test.cpp +++ b/src/mongo/db/s/resharding/resharding_recipient_service_test.cpp @@ -502,69 +502,81 @@ DEATH_TEST_REGEX_F(ReshardingRecipientServiceTest, CommitFn, "4457001.*tripwire" TEST_F(ReshardingRecipientServiceTest, DropsTemporaryReshardingCollectionOnAbort) { auto metrics = ReshardingRecipientServiceTest::metrics(); for (bool isAlsoDonor : {false, true}) { - LOGV2(5551107, - "Running case", - "test"_attr = _agent.getTestName(), - "isAlsoDonor"_attr = isAlsoDonor); - - boost::optional doneTransitionGuard; - doneTransitionGuard.emplace(controller(), RecipientStateEnum::kDone); - - auto doc = makeStateDocument(isAlsoDonor); - auto instanceId = - BSON(ReshardingRecipientDocument::kReshardingUUIDFieldName << doc.getReshardingUUID()); + for (bool waitForMetricsInitialized : {false, true}) { + LOGV2(5551107, + "Running case", + "test"_attr = _agent.getTestName(), + "isAlsoDonor"_attr = isAlsoDonor, + "waitForMetricsInitialized"_attr = waitForMetricsInitialized); + + boost::optional doneTransitionGuard; + doneTransitionGuard.emplace(controller(), RecipientStateEnum::kDone); + + auto doc = makeStateDocument(isAlsoDonor); + auto instanceId = BSON(ReshardingRecipientDocument::kReshardingUUIDFieldName + << doc.getReshardingUUID()); + + auto opCtx = makeOperationContext(); + + if (isAlsoDonor) { + // If the recipient is also a donor, the original collection should already exist on + // this shard. + createSourceCollection(opCtx.get(), doc); + } - auto opCtx = makeOperationContext(); + RecipientStateMachine::insertStateDocument(opCtx.get(), doc); + auto recipient = + RecipientStateMachine::getOrCreate(opCtx.get(), _service, doc.toBSON()); - if (isAlsoDonor) { - // If the recipient is also a donor, the original collection should already exist on - // this shard. - createSourceCollection(opCtx.get(), doc); - } + notifyToStartCloning(opCtx.get(), *recipient, doc); + if (waitForMetricsInitialized) { + // Waiting for the metrics to be initialized here causes the second abort to occur + // before the metrics are initialized after the step up, thereby testing a different + // code path. + doneTransitionGuard->wait(RecipientStateEnum::kCreatingCollection); + } - RecipientStateMachine::insertStateDocument(opCtx.get(), doc); - auto recipient = RecipientStateMachine::getOrCreate(opCtx.get(), _service, doc.toBSON()); + recipient->abort(false); - notifyToStartCloning(opCtx.get(), *recipient, doc); - recipient->abort(false); + doneTransitionGuard->wait(RecipientStateEnum::kDone); - doneTransitionGuard->wait(RecipientStateEnum::kDone); - stepDown(); + stepDown(); - ASSERT_EQ(recipient->getCompletionFuture().getNoThrow(), - ErrorCodes::InterruptedDueToReplStateChange); + ASSERT_EQ(recipient->getCompletionFuture().getNoThrow(), + ErrorCodes::InterruptedDueToReplStateChange); - recipient.reset(); - stepUp(opCtx.get()); + recipient.reset(); + stepUp(opCtx.get()); - auto maybeRecipient = RecipientStateMachine::lookup(opCtx.get(), _service, instanceId); - ASSERT_TRUE(bool(maybeRecipient)); - recipient = *maybeRecipient; + auto maybeRecipient = RecipientStateMachine::lookup(opCtx.get(), _service, instanceId); + ASSERT_TRUE(bool(maybeRecipient)); + recipient = *maybeRecipient; - doneTransitionGuard.reset(); - recipient->abort(false); + doneTransitionGuard.reset(); + recipient->abort(false); - ASSERT_OK(recipient->getCompletionFuture().getNoThrow()); - checkStateDocumentRemoved(opCtx.get()); + ASSERT_OK(recipient->getCompletionFuture().getNoThrow()); + checkStateDocumentRemoved(opCtx.get()); - if (isAlsoDonor) { - // Verify original collection still exists after aborting. - AutoGetCollection coll(opCtx.get(), doc.getSourceNss(), MODE_IS); - ASSERT_TRUE(bool(coll)); - ASSERT_EQ(coll->uuid(), doc.getSourceUUID()); - } + if (isAlsoDonor) { + // Verify original collection still exists after aborting. + AutoGetCollection coll(opCtx.get(), doc.getSourceNss(), MODE_IS); + ASSERT_TRUE(bool(coll)); + ASSERT_EQ(coll->uuid(), doc.getSourceUUID()); + } - // Verify the temporary collection no longer exists. - { - AutoGetCollection coll(opCtx.get(), doc.getTempReshardingNss(), MODE_IS); - ASSERT_FALSE(bool(coll)); + // Verify the temporary collection no longer exists. + { + AutoGetCollection coll(opCtx.get(), doc.getTempReshardingNss(), MODE_IS); + ASSERT_FALSE(bool(coll)); + } } } BSONObjBuilder result; metrics->serializeCumulativeOpMetrics(&result); - ASSERT_EQ(result.obj().getField("countReshardingFailures").numberLong(), 2); + ASSERT_LESS_THAN_OR_EQUALS(result.obj().getField("countReshardingFailures").numberLong(), 4); } TEST_F(ReshardingRecipientServiceTest, RenamesTemporaryReshardingCollectionWhenDone) { -- cgit v1.2.1