author     Matt Walak <matt.walak@mongodb.com>  2021-07-27 16:15:58 +0000
committer  Evergreen Agent <no-reply@evergreen.mongodb.com>  2021-08-12 13:49:51 +0000
commit     407ebc679c51541204fdacb736ae0399c45bc78b (patch)
tree       0b95d5605878eeac2633df4749cbdebee2c765de
parent     88ef21684a4baafa8d018520c22604179fee9aad (diff)
download   mongo-407ebc679c51541204fdacb736ae0399c45bc78b.tar.gz
SERVER-58781 ReshardingCoordinatorObserver should not fulfill promises when stepping up in state kInitializing
-rw-r--r--  src/mongo/db/s/resharding/resharding_coordinator_observer.cpp      |  5
-rw-r--r--  src/mongo/db/s/resharding/resharding_coordinator_service.cpp       |  5
-rw-r--r--  src/mongo/db/s/resharding/resharding_coordinator_service_test.cpp  | 58
3 files changed, 66 insertions, 2 deletions
diff --git a/src/mongo/db/s/resharding/resharding_coordinator_observer.cpp b/src/mongo/db/s/resharding/resharding_coordinator_observer.cpp
index 4c4e402e288..ff6cda2ab37 100644
--- a/src/mongo/db/s/resharding/resharding_coordinator_observer.cpp
+++ b/src/mongo/db/s/resharding/resharding_coordinator_observer.cpp
@@ -61,11 +61,16 @@ const std::vector<RecipientShardEntry>& getParticipants(
/**
* Returns true if all participants are in a state greater than or equal to the expectedState.
+ * Returns false if the participants list is empty.
*/
template <class TState, class TParticipant>
bool allParticipantsInStateGTE(WithLock lk,
TState expectedState,
const std::vector<TParticipant>& participants) {
+ if (participants.size() == 0) {
+ return false;
+ }
+
for (const auto& shard : participants) {
if (shard.getMutableState().getState() < expectedState) {
return false;
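
For context, the early return added above closes a vacuous-truth hole: with no donor or recipient entries registered yet (as is the case while the coordinator document is still in kInitializing), the old loop succeeded trivially, which is how the observer's promises could be fulfilled prematurely on step-up (SERVER-58781). A minimal standalone sketch of the guarded check, using hypothetical simplified types rather than the actual MongoDB headers:

#include <cassert>
#include <vector>

// Hypothetical stand-ins for the real participant/state types (illustration only).
enum class State { kUnused = 0, kInitializing, kPreparingToDonate, kDonating };

struct Participant {
    State state;
};

// Mirrors the guarded predicate: an empty participant list no longer counts as
// "all participants in a state >= expectedState".
bool allParticipantsInStateGTE(State expectedState, const std::vector<Participant>& participants) {
    if (participants.empty()) {
        return false;
    }
    for (const auto& shard : participants) {
        if (shard.state < expectedState) {
            return false;
        }
    }
    return true;
}

int main() {
    // Empty list -> false, rather than vacuously true as before the guard.
    assert(!allParticipantsInStateGTE(State::kInitializing, {}));
    assert(allParticipantsInStateGTE(State::kInitializing,
                                     {{State::kPreparingToDonate}, {State::kDonating}}));
    assert(!allParticipantsInStateGTE(State::kDonating, {{State::kInitializing}}));
    return 0;
}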
diff --git a/src/mongo/db/s/resharding/resharding_coordinator_service.cpp b/src/mongo/db/s/resharding/resharding_coordinator_service.cpp
index a703e0c80fa..30fc19f9afb 100644
--- a/src/mongo/db/s/resharding/resharding_coordinator_service.cpp
+++ b/src/mongo/db/s/resharding/resharding_coordinator_service.cpp
@@ -78,6 +78,7 @@ MONGO_FAIL_POINT_DEFINE(reshardingPauseCoordinatorBeforeCompletion);
MONGO_FAIL_POINT_DEFINE(reshardingPauseCoordinatorBeforeStartingErrorFlow);
MONGO_FAIL_POINT_DEFINE(reshardingPauseCoordinatorBeforePersistingStateTransition);
MONGO_FAIL_POINT_DEFINE(pauseBeforeTellDonorToRefresh);
+MONGO_FAIL_POINT_DEFINE(pauseBeforeInsertCoordinatorDoc);
const std::string kReshardingCoordinatorActiveIndexName = "ReshardingCoordinatorActiveIndex";
const Backoff kExponentialBackoff(Seconds(1), Milliseconds::max());
@@ -985,7 +986,7 @@ ReshardingCoordinatorService::ReshardingCoordinator::ReshardingCoordinator(
// If the coordinator is recovering from step-up, make sure to properly initialize the
// promises to reflect the latest state of this resharding operation.
- if (coordinatorDoc.getState() != CoordinatorStateEnum::kUnused) {
+ if (coordinatorDoc.getState() > CoordinatorStateEnum::kInitializing) {
_reshardingCoordinatorObserver->onReshardingParticipantTransition(coordinatorDoc);
}
}
@@ -1463,6 +1464,7 @@ void ReshardingCoordinatorService::ReshardingCoordinator::_insertCoordDocAndChan
// TODO SERVER-53914 to accommodate loading metrics for the coordinator.
ReshardingMetrics::get(cc().getServiceContext())
->onStart(ReshardingMetrics::Role::kCoordinator, getCurrentTime());
+ pauseBeforeInsertCoordinatorDoc.pauseWhileSet();
}
void ReshardingCoordinatorService::ReshardingCoordinator::
@@ -1470,7 +1472,6 @@ void ReshardingCoordinatorService::ReshardingCoordinator::
if (_coordinatorDoc.getState() > CoordinatorStateEnum::kInitializing) {
return;
}
-
auto opCtx = _cancelableOpCtxFactory->makeOperationContext(&cc());
ReshardingCoordinatorDocument updatedCoordinatorDoc = _coordinatorDoc;
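
The constructor change above narrows when a recovered coordinator replays participant transitions into its observer: a document persisted in kInitializing may not yet list any participants, so only documents strictly past kInitializing should drive the observer on step-up. A rough sketch of the old versus new guard, using a hypothetical simplified enum (the real CoordinatorStateEnum has additional states; only the ones appearing in this diff are used here):

#include <cassert>

// Hypothetical, simplified subset of coordinator states in document order (illustration only).
enum class CoordinatorStateEnum { kUnused = 0, kInitializing, kPreparingToDonate };

// Old guard: any persisted (non-kUnused) document replayed participant transitions,
// including kInitializing documents whose participant lists are still empty.
bool shouldReplayTransitionsOld(CoordinatorStateEnum state) {
    return state != CoordinatorStateEnum::kUnused;
}

// New guard: only documents strictly past kInitializing replay transitions on step-up.
bool shouldReplayTransitionsNew(CoordinatorStateEnum state) {
    return state > CoordinatorStateEnum::kInitializing;
}

int main() {
    assert(shouldReplayTransitionsOld(CoordinatorStateEnum::kInitializing));   // old: replayed
    assert(!shouldReplayTransitionsNew(CoordinatorStateEnum::kInitializing));  // new: skipped
    assert(shouldReplayTransitionsNew(CoordinatorStateEnum::kPreparingToDonate));
    assert(!shouldReplayTransitionsNew(CoordinatorStateEnum::kUnused));
    return 0;
}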
diff --git a/src/mongo/db/s/resharding/resharding_coordinator_service_test.cpp b/src/mongo/db/s/resharding/resharding_coordinator_service_test.cpp
index 071ec5c94ae..806639af7dc 100644
--- a/src/mongo/db/s/resharding/resharding_coordinator_service_test.cpp
+++ b/src/mongo/db/s/resharding/resharding_coordinator_service_test.cpp
@@ -53,6 +53,7 @@
#include "mongo/s/catalog/type_shard.h"
#include "mongo/unittest/unittest.h"
#include "mongo/util/clock_source_mock.h"
+#include "mongo/util/fail_point.h"
namespace mongo {
namespace {
@@ -578,6 +579,63 @@ TEST_F(ReshardingCoordinatorServiceTest, ReshardingCoordinatorSuccessfullyTransi
coordinator->getCompletionFuture().get(opCtx);
}
+TEST_F(ReshardingCoordinatorServiceTest, StepDownStepUpDuringInitializing) {
+ PauseDuringStateTransitions stateTransitionsGuard{controller(),
+ CoordinatorStateEnum::kPreparingToDonate};
+
+ auto opCtx = operationContext();
+ auto pauseBeforeInsertCoordinatorDoc =
+ globalFailPointRegistry().find("pauseBeforeInsertCoordinatorDoc");
+ auto timesEnteredFailPoint = pauseBeforeInsertCoordinatorDoc->setMode(FailPoint::alwaysOn, 0);
+
+ auto doc = insertStateAndCatalogEntries(CoordinatorStateEnum::kUnused, _originalEpoch);
+ doc.setRecipientShards({});
+ doc.setDonorShards({});
+
+ auto donorChunk = makeAndInsertChunksForDonorShard(
+ _originalUUID, _originalEpoch, _oldShardKey, std::vector{OID::gen(), OID::gen()});
+
+ auto initialChunks =
+ makeChunks(_reshardingUUID, _tempEpoch, _newShardKey, std::vector{OID::gen(), OID::gen()});
+
+ std::vector<ReshardedChunk> presetReshardedChunks;
+ for (const auto& chunk : initialChunks) {
+ presetReshardedChunks.emplace_back(chunk.getShard(), chunk.getMin(), chunk.getMax());
+ }
+
+ doc.setPresetReshardedChunks(presetReshardedChunks);
+
+ (void)ReshardingCoordinator::getOrCreate(opCtx, _service, doc.toBSON());
+ auto instanceId =
+ BSON(ReshardingCoordinatorDocument::kReshardingUUIDFieldName << doc.getReshardingUUID());
+
+ pauseBeforeInsertCoordinatorDoc->waitForTimesEntered(timesEnteredFailPoint + 1);
+
+ auto coordinator = getCoordinator(opCtx, instanceId);
+ stepDown(opCtx);
+ pauseBeforeInsertCoordinatorDoc->setMode(FailPoint::off, 0);
+ ASSERT_EQ(coordinator->getCompletionFuture().getNoThrow(),
+ ErrorCodes::InterruptedDueToReplStateChange);
+
+ coordinator.reset();
+ stepUp(opCtx);
+
+ stateTransitionsGuard.wait(CoordinatorStateEnum::kPreparingToDonate);
+
+ // Ensure that promises are not fulfilled on the new coordinator.
+ auto newCoordinator = getCoordinator(opCtx, instanceId);
+ auto newObserver = newCoordinator->getObserver();
+ ASSERT_FALSE(newObserver->awaitAllDonorsReadyToDonate().isReady());
+ ASSERT_FALSE(newObserver->awaitAllRecipientsFinishedCloning().isReady());
+ ASSERT_FALSE(newObserver->awaitAllRecipientsInStrictConsistency().isReady());
+ ASSERT_FALSE(newObserver->awaitAllDonorsDone().isReady());
+ ASSERT_FALSE(newObserver->awaitAllRecipientsDone().isReady());
+
+ stepDown(opCtx);
+ ASSERT_EQ(newCoordinator->getCompletionFuture().getNoThrow(),
+ ErrorCodes::InterruptedDueToReplStateChange);
+}
+
/**
* Test stepping down right when coordinator doc is being updated. Causing the change to be
* rolled back and redo the work again on step up.