summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authornandinibhartiyaMDB <nandini.bhartiya@mongodb.com>2022-11-03 12:45:29 +0000
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2022-11-09 16:01:19 +0000
commitec65af7e2185c7e266bc6a04ea7777eea713fa58 (patch)
tree65dd1168cb3019ac034d3ce60eda0a15d3f8f734
parentdefa7826e96929cabbaa204f91a39d6bd6e30239 (diff)
downloadmongo-ec65af7e2185c7e266bc6a04ea7777eea713fa58.tar.gz
SERVER-70373: Avoid invariant failure while aborting a resharding operation
(cherry picked from commit fed6f5b5c41b8f9ba5451192f5bf72eb7742c1d5) SERVER-71112: Fix resharding failures count in unittest (cherry picked from commit d8329fbe00da54b5f8b0f60889364e7920968756)
-rw-r--r--src/mongo/db/s/resharding/resharding_metrics.cpp7
-rw-r--r--src/mongo/db/s/resharding/resharding_recipient_service_test.cpp102
2 files changed, 64 insertions, 45 deletions
diff --git a/src/mongo/db/s/resharding/resharding_metrics.cpp b/src/mongo/db/s/resharding/resharding_metrics.cpp
index fcd4abc7f27..bda2e2379c7 100644
--- a/src/mongo/db/s/resharding/resharding_metrics.cpp
+++ b/src/mongo/db/s/resharding/resharding_metrics.cpp
@@ -536,6 +536,9 @@ void ReshardingMetrics::setDonorState(DonorStateEnum state) noexcept {
void ReshardingMetrics::setRecipientState(RecipientStateEnum state) noexcept {
stdx::lock_guard<Latch> lk(_mutex);
+ if (!_currentOp && state == RecipientStateEnum::kDone) {
+ return;
+ }
invariant(_currentOp, kNoOperationInProgress);
const auto oldState = std::exchange(_currentOp->recipientState, state);
@@ -686,6 +689,10 @@ void ReshardingMetrics::enterCriticalSection(Date_t start) {
void ReshardingMetrics::leaveCriticalSection(Date_t end) {
stdx::lock_guard<Latch> lk(_mutex);
+ if (!_currentOp) {
+ return;
+ }
+
_currentOp->inCriticalSection.forceEnd(end);
}
diff --git a/src/mongo/db/s/resharding/resharding_recipient_service_test.cpp b/src/mongo/db/s/resharding/resharding_recipient_service_test.cpp
index 0a8beca187e..0d861d0a686 100644
--- a/src/mongo/db/s/resharding/resharding_recipient_service_test.cpp
+++ b/src/mongo/db/s/resharding/resharding_recipient_service_test.cpp
@@ -502,69 +502,81 @@ DEATH_TEST_REGEX_F(ReshardingRecipientServiceTest, CommitFn, "4457001.*tripwire"
TEST_F(ReshardingRecipientServiceTest, DropsTemporaryReshardingCollectionOnAbort) {
auto metrics = ReshardingRecipientServiceTest::metrics();
for (bool isAlsoDonor : {false, true}) {
- LOGV2(5551107,
- "Running case",
- "test"_attr = _agent.getTestName(),
- "isAlsoDonor"_attr = isAlsoDonor);
-
- boost::optional<PauseDuringStateTransitions> doneTransitionGuard;
- doneTransitionGuard.emplace(controller(), RecipientStateEnum::kDone);
-
- auto doc = makeStateDocument(isAlsoDonor);
- auto instanceId =
- BSON(ReshardingRecipientDocument::kReshardingUUIDFieldName << doc.getReshardingUUID());
+ for (bool waitForMetricsInitialized : {false, true}) {
+ LOGV2(5551107,
+ "Running case",
+ "test"_attr = _agent.getTestName(),
+ "isAlsoDonor"_attr = isAlsoDonor,
+ "waitForMetricsInitialized"_attr = waitForMetricsInitialized);
+
+ boost::optional<PauseDuringStateTransitions> doneTransitionGuard;
+ doneTransitionGuard.emplace(controller(), RecipientStateEnum::kDone);
+
+ auto doc = makeStateDocument(isAlsoDonor);
+ auto instanceId = BSON(ReshardingRecipientDocument::kReshardingUUIDFieldName
+ << doc.getReshardingUUID());
+
+ auto opCtx = makeOperationContext();
+
+ if (isAlsoDonor) {
+ // If the recipient is also a donor, the original collection should already exist on
+ // this shard.
+ createSourceCollection(opCtx.get(), doc);
+ }
- auto opCtx = makeOperationContext();
+ RecipientStateMachine::insertStateDocument(opCtx.get(), doc);
+ auto recipient =
+ RecipientStateMachine::getOrCreate(opCtx.get(), _service, doc.toBSON());
- if (isAlsoDonor) {
- // If the recipient is also a donor, the original collection should already exist on
- // this shard.
- createSourceCollection(opCtx.get(), doc);
- }
+ notifyToStartCloning(opCtx.get(), *recipient, doc);
+ if (waitForMetricsInitialized) {
+ // Waiting for the metrics to be initialized here causes the second abort to occur
+ // before the metrics are initialized after the step up, thereby testing a different
+ // code path.
+ doneTransitionGuard->wait(RecipientStateEnum::kCreatingCollection);
+ }
- RecipientStateMachine::insertStateDocument(opCtx.get(), doc);
- auto recipient = RecipientStateMachine::getOrCreate(opCtx.get(), _service, doc.toBSON());
+ recipient->abort(false);
- notifyToStartCloning(opCtx.get(), *recipient, doc);
- recipient->abort(false);
+ doneTransitionGuard->wait(RecipientStateEnum::kDone);
- doneTransitionGuard->wait(RecipientStateEnum::kDone);
- stepDown();
+ stepDown();
- ASSERT_EQ(recipient->getCompletionFuture().getNoThrow(),
- ErrorCodes::InterruptedDueToReplStateChange);
+ ASSERT_EQ(recipient->getCompletionFuture().getNoThrow(),
+ ErrorCodes::InterruptedDueToReplStateChange);
- recipient.reset();
- stepUp(opCtx.get());
+ recipient.reset();
+ stepUp(opCtx.get());
- auto maybeRecipient = RecipientStateMachine::lookup(opCtx.get(), _service, instanceId);
- ASSERT_TRUE(bool(maybeRecipient));
- recipient = *maybeRecipient;
+ auto maybeRecipient = RecipientStateMachine::lookup(opCtx.get(), _service, instanceId);
+ ASSERT_TRUE(bool(maybeRecipient));
+ recipient = *maybeRecipient;
- doneTransitionGuard.reset();
- recipient->abort(false);
+ doneTransitionGuard.reset();
+ recipient->abort(false);
- ASSERT_OK(recipient->getCompletionFuture().getNoThrow());
- checkStateDocumentRemoved(opCtx.get());
+ ASSERT_OK(recipient->getCompletionFuture().getNoThrow());
+ checkStateDocumentRemoved(opCtx.get());
- if (isAlsoDonor) {
- // Verify original collection still exists after aborting.
- AutoGetCollection coll(opCtx.get(), doc.getSourceNss(), MODE_IS);
- ASSERT_TRUE(bool(coll));
- ASSERT_EQ(coll->uuid(), doc.getSourceUUID());
- }
+ if (isAlsoDonor) {
+ // Verify original collection still exists after aborting.
+ AutoGetCollection coll(opCtx.get(), doc.getSourceNss(), MODE_IS);
+ ASSERT_TRUE(bool(coll));
+ ASSERT_EQ(coll->uuid(), doc.getSourceUUID());
+ }
- // Verify the temporary collection no longer exists.
- {
- AutoGetCollection coll(opCtx.get(), doc.getTempReshardingNss(), MODE_IS);
- ASSERT_FALSE(bool(coll));
+ // Verify the temporary collection no longer exists.
+ {
+ AutoGetCollection coll(opCtx.get(), doc.getTempReshardingNss(), MODE_IS);
+ ASSERT_FALSE(bool(coll));
+ }
}
}
BSONObjBuilder result;
metrics->serializeCumulativeOpMetrics(&result);
- ASSERT_EQ(result.obj().getField("countReshardingFailures").numberLong(), 2);
+ ASSERT_LESS_THAN_OR_EQUALS(result.obj().getField("countReshardingFailures").numberLong(), 4);
}
TEST_F(ReshardingRecipientServiceTest, RenamesTemporaryReshardingCollectionWhenDone) {