summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndrew Witten <andrew.witten@mongodb.com>2022-08-24 15:11:12 +0000
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2022-09-14 21:26:12 +0000
commit1db7b70c4ff5fd88fd4922350c65b09805e19710 (patch)
tree6859a9d4aabd134162c6d401136f6380a2767239
parentfadf5a97702162cda1642a5d6a4d47f0ece43994 (diff)
downloadmongo-1db7b70c4ff5fd88fd4922350c65b09805e19710.tar.gz
SERVER-67653 don't initiate critical section if remainingMillis is omitted
(cherry picked from commit abd6330d793235c8fbc5de7bf3ec53855ebea9d3) SERVER-69693 Use lambda instead of repeating code in resharding coordinator
-rw-r--r--src/mongo/db/s/resharding/resharding_coordinator_commit_monitor.cpp34
-rw-r--r--src/mongo/db/s/resharding/resharding_coordinator_commit_monitor_test.cpp41
2 files changed, 66 insertions, 9 deletions
diff --git a/src/mongo/db/s/resharding/resharding_coordinator_commit_monitor.cpp b/src/mongo/db/s/resharding/resharding_coordinator_commit_monitor.cpp
index 374dcd6538f..5700f0326ae 100644
--- a/src/mongo/db/s/resharding/resharding_coordinator_commit_monitor.cpp
+++ b/src/mongo/db/s/resharding/resharding_coordinator_commit_monitor.cpp
@@ -164,13 +164,21 @@ CoordinatorCommitMonitor::queryRemainingOperationTimeForRecipients() const {
uassertStatusOKWithContext(status, errorContext);
const auto remainingTime = extractOperationRemainingTime(shardResponse.data);
- // A recipient shard does not report the remaining operation time when there is no data
- // to copy and no oplog entry to apply.
- if (remainingTime && remainingTime.get() < minRemainingTime) {
- minRemainingTime = remainingTime.get();
+
+ // If any recipient omits the "remainingMillis" field of the response then
+ // we cannot conclude that it is safe to begin the critical section.
+ // It is possible that the recipient just had a failover and
+ // was not able to restore its metrics before it replied to the
+ // _shardsvrReshardingOperationTime command.
+ if (!remainingTime) {
+ maxRemainingTime = Milliseconds::max();
+ continue;
+ }
+ if (remainingTime.value() < minRemainingTime) {
+ minRemainingTime = remainingTime.value();
}
- if (remainingTime && remainingTime.get() > maxRemainingTime) {
- maxRemainingTime = remainingTime.get();
+ if (remainingTime.value() > maxRemainingTime) {
+ maxRemainingTime = remainingTime.value();
}
}
@@ -203,12 +211,20 @@ ExecutorFuture<void> CoordinatorCommitMonitor::_makeFuture() const {
"Encountered an error while querying recipients, will retry shortly",
"error"_attr = status);
- return RemainingOperationTimes{Milliseconds(0), Milliseconds::max()};
+ // On error we definitely cannot begin the critical section. Therefore,
+ // return Milliseconds::max for remainingTimes.max (remainingTimes.max is used
+ // for determining whether the critical section should begin).
+ return RemainingOperationTimes{Milliseconds(-1), Milliseconds::max()};
})
.then([this, anchor = shared_from_this()](RemainingOperationTimes remainingTimes) {
auto metrics = ReshardingMetrics::get(cc().getServiceContext());
- metrics->setMinRemainingOperationTime(remainingTimes.min);
- metrics->setMaxRemainingOperationTime(remainingTimes.max);
+ // If remainingTimes.max (or remainingTimes.min) is Milliseconds::max, then use -1 so
+ // that the scale of the y-axis is still useful when looking at FTDC metrics.
+ auto clampIfMax = [](Milliseconds t) {
+ return t != Milliseconds::max() ? t : Milliseconds(-1);
+ };
+ metrics->setMinRemainingOperationTime(clampIfMax(remainingTimes.min));
+ metrics->setMaxRemainingOperationTime(clampIfMax(remainingTimes.max));
// Check if all recipient shards are within the commit threshold.
if (remainingTimes.max <= _threshold)
diff --git a/src/mongo/db/s/resharding/resharding_coordinator_commit_monitor_test.cpp b/src/mongo/db/s/resharding/resharding_coordinator_commit_monitor_test.cpp
index e7f90cc41fa..2fe3075f1fc 100644
--- a/src/mongo/db/s/resharding/resharding_coordinator_commit_monitor_test.cpp
+++ b/src/mongo/db/s/resharding/resharding_coordinator_commit_monitor_test.cpp
@@ -92,6 +92,8 @@ protected:
void tearDown() override;
void mockCommandForRecipients(Milliseconds remainingOperationTime);
+ void mockOmitRemainingMillisForRecipients();
+ void mockOmitRemainingMillisForOneRecipient();
void mockRemaingOperationTimesCommandForRecipients(
CoordinatorCommitMonitor::RemainingOperationTimes remainingOperationTimes);
@@ -180,6 +182,31 @@ void CoordinatorCommitMonitorTest::mockCommandForRecipients(Milliseconds remaini
_recipientShards.begin(), _recipientShards.end(), [&](const ShardId&) { onCommand(func); });
}
+void CoordinatorCommitMonitorTest::mockOmitRemainingMillisForRecipients() {
+ // Omit remainingMillis from all shard responses.
+ std::for_each(_recipientShards.begin(), _recipientShards.end(), [this](const ShardId&) {
+ onCommand([](const executor::RemoteCommandRequest& request) -> StatusWith<BSONObj> {
+ // Return an empty BSON object.
+ return BSONObj();
+ });
+ });
+}
+
+void CoordinatorCommitMonitorTest::mockOmitRemainingMillisForOneRecipient() {
+ // Omit remainingMillis from a single recipient.
+ for (const auto& shard : _recipientShards) {
+ onCommand([&](const executor::RemoteCommandRequest&) -> StatusWith<BSONObj> {
+ if (shard == _recipientShards.front()) {
+ // Return an empty BSON object.
+ return BSONObj();
+ }
+ auto threshold = Milliseconds(gRemainingReshardingOperationTimeThresholdMillis.load());
+ return BSON("remainingMillis"
+ << durationCount<Milliseconds>(threshold - Milliseconds(1)));
+ });
+ }
+}
+
void CoordinatorCommitMonitorTest::mockRemaingOperationTimesCommandForRecipients(
CoordinatorCommitMonitor::RemainingOperationTimes remainingOperationTimes) {
bool useMin = true;
@@ -264,6 +291,20 @@ TEST_F(CoordinatorCommitMonitorTest, RetriesWhenEncountersErrorsWhileQueryingRec
future.get();
}
+TEST_F(CoordinatorCommitMonitorTest, BlocksWhenRemainingMillisIsOmitted) {
+ auto future = getCommitMonitor()->waitUntilRecipientsAreWithinCommitThreshold();
+
+ mockOmitRemainingMillisForRecipients();
+ ASSERT(!future.isReady());
+
+ // If even a single shard omits remainingMillis, we cannot begin the critical section.
+ mockOmitRemainingMillisForOneRecipient();
+ ASSERT(!future.isReady());
+
+ respondWithReadyToCommit();
+ future.get();
+}
+
} // namespace
} // namespace resharding
} // namespace mongo