diff options
author | Matt Walak <matt.walak@mongodb.com> | 2021-07-27 16:15:58 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-08-12 13:49:51 +0000 |
commit | 407ebc679c51541204fdacb736ae0399c45bc78b (patch) | |
tree | 0b95d5605878eeac2633df4749cbdebee2c765de | |
parent | 88ef21684a4baafa8d018520c22604179fee9aad (diff) | |
download | mongo-407ebc679c51541204fdacb736ae0399c45bc78b.tar.gz |
SERVER-58781 ReshardingCoordinatorObserver should not fulfill promises when stepping up in state kInitializing
3 files changed, 66 insertions, 2 deletions
diff --git a/src/mongo/db/s/resharding/resharding_coordinator_observer.cpp b/src/mongo/db/s/resharding/resharding_coordinator_observer.cpp index 4c4e402e288..ff6cda2ab37 100644 --- a/src/mongo/db/s/resharding/resharding_coordinator_observer.cpp +++ b/src/mongo/db/s/resharding/resharding_coordinator_observer.cpp @@ -61,11 +61,16 @@ const std::vector<RecipientShardEntry>& getParticipants( /** * Returns true if all participants are in a state greater than or equal to the expectedState. + * Returns false if the participants list is empty. */ template <class TState, class TParticipant> bool allParticipantsInStateGTE(WithLock lk, TState expectedState, const std::vector<TParticipant>& participants) { + if (participants.size() == 0) { + return false; + } + for (const auto& shard : participants) { if (shard.getMutableState().getState() < expectedState) { return false; diff --git a/src/mongo/db/s/resharding/resharding_coordinator_service.cpp b/src/mongo/db/s/resharding/resharding_coordinator_service.cpp index a703e0c80fa..30fc19f9afb 100644 --- a/src/mongo/db/s/resharding/resharding_coordinator_service.cpp +++ b/src/mongo/db/s/resharding/resharding_coordinator_service.cpp @@ -78,6 +78,7 @@ MONGO_FAIL_POINT_DEFINE(reshardingPauseCoordinatorBeforeCompletion); MONGO_FAIL_POINT_DEFINE(reshardingPauseCoordinatorBeforeStartingErrorFlow); MONGO_FAIL_POINT_DEFINE(reshardingPauseCoordinatorBeforePersistingStateTransition); MONGO_FAIL_POINT_DEFINE(pauseBeforeTellDonorToRefresh); +MONGO_FAIL_POINT_DEFINE(pauseBeforeInsertCoordinatorDoc); const std::string kReshardingCoordinatorActiveIndexName = "ReshardingCoordinatorActiveIndex"; const Backoff kExponentialBackoff(Seconds(1), Milliseconds::max()); @@ -985,7 +986,7 @@ ReshardingCoordinatorService::ReshardingCoordinator::ReshardingCoordinator( // If the coordinator is recovering from step-up, make sure to properly initialize the // promises to reflect the latest state of this resharding operation. - if (coordinatorDoc.getState() != CoordinatorStateEnum::kUnused) { + if (coordinatorDoc.getState() > CoordinatorStateEnum::kInitializing) { _reshardingCoordinatorObserver->onReshardingParticipantTransition(coordinatorDoc); } } @@ -1463,6 +1464,7 @@ void ReshardingCoordinatorService::ReshardingCoordinator::_insertCoordDocAndChan // TODO SERVER-53914 to accommodate loading metrics for the coordinator. ReshardingMetrics::get(cc().getServiceContext()) ->onStart(ReshardingMetrics::Role::kCoordinator, getCurrentTime()); + pauseBeforeInsertCoordinatorDoc.pauseWhileSet(); } void ReshardingCoordinatorService::ReshardingCoordinator:: @@ -1470,7 +1472,6 @@ void ReshardingCoordinatorService::ReshardingCoordinator:: if (_coordinatorDoc.getState() > CoordinatorStateEnum::kInitializing) { return; } - auto opCtx = _cancelableOpCtxFactory->makeOperationContext(&cc()); ReshardingCoordinatorDocument updatedCoordinatorDoc = _coordinatorDoc; diff --git a/src/mongo/db/s/resharding/resharding_coordinator_service_test.cpp b/src/mongo/db/s/resharding/resharding_coordinator_service_test.cpp index 071ec5c94ae..806639af7dc 100644 --- a/src/mongo/db/s/resharding/resharding_coordinator_service_test.cpp +++ b/src/mongo/db/s/resharding/resharding_coordinator_service_test.cpp @@ -53,6 +53,7 @@ #include "mongo/s/catalog/type_shard.h" #include "mongo/unittest/unittest.h" #include "mongo/util/clock_source_mock.h" +#include "mongo/util/fail_point.h" namespace mongo { namespace { @@ -578,6 +579,63 @@ TEST_F(ReshardingCoordinatorServiceTest, ReshardingCoordinatorSuccessfullyTransi coordinator->getCompletionFuture().get(opCtx); } +TEST_F(ReshardingCoordinatorServiceTest, StepDownStepUpDuringInitializing) { + PauseDuringStateTransitions stateTransitionsGuard{controller(), + CoordinatorStateEnum::kPreparingToDonate}; + + auto opCtx = operationContext(); + auto pauseBeforeInsertCoordinatorDoc = + globalFailPointRegistry().find("pauseBeforeInsertCoordinatorDoc"); + auto timesEnteredFailPoint = pauseBeforeInsertCoordinatorDoc->setMode(FailPoint::alwaysOn, 0); + + auto doc = insertStateAndCatalogEntries(CoordinatorStateEnum::kUnused, _originalEpoch); + doc.setRecipientShards({}); + doc.setDonorShards({}); + + auto donorChunk = makeAndInsertChunksForDonorShard( + _originalUUID, _originalEpoch, _oldShardKey, std::vector{OID::gen(), OID::gen()}); + + auto initialChunks = + makeChunks(_reshardingUUID, _tempEpoch, _newShardKey, std::vector{OID::gen(), OID::gen()}); + + std::vector<ReshardedChunk> presetReshardedChunks; + for (const auto& chunk : initialChunks) { + presetReshardedChunks.emplace_back(chunk.getShard(), chunk.getMin(), chunk.getMax()); + } + + doc.setPresetReshardedChunks(presetReshardedChunks); + + (void)ReshardingCoordinator::getOrCreate(opCtx, _service, doc.toBSON()); + auto instanceId = + BSON(ReshardingCoordinatorDocument::kReshardingUUIDFieldName << doc.getReshardingUUID()); + + pauseBeforeInsertCoordinatorDoc->waitForTimesEntered(timesEnteredFailPoint + 1); + + auto coordinator = getCoordinator(opCtx, instanceId); + stepDown(opCtx); + pauseBeforeInsertCoordinatorDoc->setMode(FailPoint::off, 0); + ASSERT_EQ(coordinator->getCompletionFuture().getNoThrow(), + ErrorCodes::InterruptedDueToReplStateChange); + + coordinator.reset(); + stepUp(opCtx); + + stateTransitionsGuard.wait(CoordinatorStateEnum::kPreparingToDonate); + + // Ensure that promises are not fulfilled on the new coordinator. + auto newCoordinator = getCoordinator(opCtx, instanceId); + auto newObserver = newCoordinator->getObserver(); + ASSERT_FALSE(newObserver->awaitAllDonorsReadyToDonate().isReady()); + ASSERT_FALSE(newObserver->awaitAllRecipientsFinishedCloning().isReady()); + ASSERT_FALSE(newObserver->awaitAllRecipientsInStrictConsistency().isReady()); + ASSERT_FALSE(newObserver->awaitAllDonorsDone().isReady()); + ASSERT_FALSE(newObserver->awaitAllRecipientsDone().isReady()); + + stepDown(opCtx); + ASSERT_EQ(newCoordinator->getCompletionFuture().getNoThrow(), + ErrorCodes::InterruptedDueToReplStateChange); +} + /** * Test stepping down right when coordinator doc is being updated. Causing the change to be * rolled back and redo the work again on step up. |