diff options
-rw-r--r-- | src/mongo/db/s/resharding/resharding_coordinator_commit_monitor.cpp | 30 | ||||
-rw-r--r-- | src/mongo/db/s/resharding/resharding_coordinator_commit_monitor_test.cpp | 41 |
2 files changed, 64 insertions, 7 deletions
diff --git a/src/mongo/db/s/resharding/resharding_coordinator_commit_monitor.cpp b/src/mongo/db/s/resharding/resharding_coordinator_commit_monitor.cpp index 4046e7d8448..963ec6a7c54 100644 --- a/src/mongo/db/s/resharding/resharding_coordinator_commit_monitor.cpp +++ b/src/mongo/db/s/resharding/resharding_coordinator_commit_monitor.cpp @@ -167,12 +167,20 @@ CoordinatorCommitMonitor::queryRemainingOperationTimeForRecipients() const { uassertStatusOKWithContext(status, errorContext); const auto remainingTime = extractOperationRemainingTime(shardResponse.data); - // A recipient shard does not report the remaining operation time when there is no data - // to copy and no oplog entry to apply. - if (remainingTime && remainingTime.value() < minRemainingTime) { + + // If any recipient omits the "remainingMillis" field of the response then + // we cannot conclude that it is safe to begin the critical section. + // It is possible that the recipient just had a failover and + // was not able to restore its metrics before it replied to the + // _shardsvrReshardingOperationTime command. + if (!remainingTime) { + maxRemainingTime = Milliseconds::max(); + continue; + } + if (remainingTime.value() < minRemainingTime) { minRemainingTime = remainingTime.value(); } - if (remainingTime && remainingTime.value() > maxRemainingTime) { + if (remainingTime.value() > maxRemainingTime) { maxRemainingTime = remainingTime.value(); } } @@ -206,11 +214,19 @@ ExecutorFuture<void> CoordinatorCommitMonitor::_makeFuture() const { "Encountered an error while querying recipients, will retry shortly", "error"_attr = status); - return RemainingOperationTimes{Milliseconds(0), Milliseconds::max()}; + // On error we definitely cannot begin the critical section. Therefore, + // return Milliseconds::max for remainingTimes.max (remainingTimes.max is used + // for determining whether the critical section should begin). + return RemainingOperationTimes{Milliseconds(-1), Milliseconds::max()}; }) .then([this, anchor = shared_from_this()](RemainingOperationTimes remainingTimes) { - _metrics->setCoordinatorHighEstimateRemainingTimeMillis(remainingTimes.max); - _metrics->setCoordinatorLowEstimateRemainingTimeMillis(remainingTimes.min); + // If remainingTimes.max (or remainingTimes.min) is Milliseconds::max, then use -1 so + // that the scale of the y-axis is still useful when looking at FTDC metrics. + auto clampIfMax = [](Milliseconds t) { + return t != Milliseconds::max() ? t : Milliseconds(-1); + }; + _metrics->setCoordinatorHighEstimateRemainingTimeMillis(clampIfMax(remainingTimes.max)); + _metrics->setCoordinatorLowEstimateRemainingTimeMillis(clampIfMax(remainingTimes.min)); // Check if all recipient shards are within the commit threshold. if (remainingTimes.max <= _threshold) diff --git a/src/mongo/db/s/resharding/resharding_coordinator_commit_monitor_test.cpp b/src/mongo/db/s/resharding/resharding_coordinator_commit_monitor_test.cpp index 11a54beed1e..b31766ddbbc 100644 --- a/src/mongo/db/s/resharding/resharding_coordinator_commit_monitor_test.cpp +++ b/src/mongo/db/s/resharding/resharding_coordinator_commit_monitor_test.cpp @@ -94,6 +94,8 @@ protected: void tearDown() override; void mockCommandForRecipients(Milliseconds remainingOperationTime); + void mockOmitRemainingMillisForRecipients(); + void mockOmitRemainingMillisForOneRecipient(); void mockRemaingOperationTimesCommandForRecipients( CoordinatorCommitMonitor::RemainingOperationTimes remainingOperationTimes); @@ -195,6 +197,31 @@ void CoordinatorCommitMonitorTest::mockCommandForRecipients(Milliseconds remaini _recipientShards.begin(), _recipientShards.end(), [&](const ShardId&) { onCommand(func); }); } +void CoordinatorCommitMonitorTest::mockOmitRemainingMillisForRecipients() { + // Omit remainingMillis from all shard responses. + std::for_each(_recipientShards.begin(), _recipientShards.end(), [this](const ShardId&) { + onCommand([](const executor::RemoteCommandRequest& request) -> StatusWith<BSONObj> { + // Return an empty BSON object. + return BSONObj(); + }); + }); +} + +void CoordinatorCommitMonitorTest::mockOmitRemainingMillisForOneRecipient() { + // Omit remainingMillis from a single recipient. + for (const auto& shard : _recipientShards) { + onCommand([&](const executor::RemoteCommandRequest&) -> StatusWith<BSONObj> { + if (shard == _recipientShards.front()) { + // Return an empty BSON object. + return BSONObj(); + } + auto threshold = Milliseconds(gRemainingReshardingOperationTimeThresholdMillis.load()); + return BSON("remainingMillis" + << durationCount<Milliseconds>(threshold - Milliseconds(1))); + }); + } +} + void CoordinatorCommitMonitorTest::mockRemaingOperationTimesCommandForRecipients( CoordinatorCommitMonitor::RemainingOperationTimes remainingOperationTimes) { bool useMin = true; @@ -279,6 +306,20 @@ TEST_F(CoordinatorCommitMonitorTest, RetriesWhenEncountersErrorsWhileQueryingRec future.get(); } +TEST_F(CoordinatorCommitMonitorTest, BlocksWhenRemainingMillisIsOmitted) { + auto future = getCommitMonitor()->waitUntilRecipientsAreWithinCommitThreshold(); + + mockOmitRemainingMillisForRecipients(); + ASSERT(!future.isReady()); + + // If even a single shard omits remainingMillis, we cannot begin the critical section. + mockOmitRemainingMillisForOneRecipient(); + ASSERT(!future.isReady()); + + respondWithReadyToCommit(); + future.get(); +} + } // namespace } // namespace resharding } // namespace mongo |