summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Kshitij Gupta <kshitij.gupta@mongodb.com> 2021-03-24 15:19:50 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2021-03-31 16:24:07 +0000
commit: 5ac830813af2b84752aecc02d1dc6558274ff2fd (patch)
tree: db710b87b82815040866a5e9a7f50839e6c2b159
parent: 49381f5929198b1eff59bf46c9a5c5eca8ab2fb4 (diff)
download: mongo-5ac830813af2b84752aecc02d1dc6558274ff2fd.tar.gz
SERVER-53923: Enforce reshardingCriticalSectionTimeout parameter.
-rw-r--r--jstests/sharding/resharding_critical_section_timeout.js85
-rw-r--r--src/mongo/base/error_codes.yml1
-rw-r--r--src/mongo/db/s/resharding/resharding_coordinator_observer.cpp9
-rw-r--r--src/mongo/db/s/resharding/resharding_coordinator_observer.h6
-rw-r--r--src/mongo/db/s/resharding/resharding_coordinator_service.cpp25
-rw-r--r--src/mongo/db/s/resharding/resharding_coordinator_service.h3
-rw-r--r--src/mongo/db/s/resharding/resharding_server_parameters.idl12
7 files changed, 139 insertions, 2 deletions
diff --git a/jstests/sharding/resharding_critical_section_timeout.js b/jstests/sharding/resharding_critical_section_timeout.js
new file mode 100644
index 00000000000..d850d89911f
--- /dev/null
+++ b/jstests/sharding/resharding_critical_section_timeout.js
@@ -0,0 +1,85 @@
+/**
+ * Verifies that resharding honors the critical section timeout.
+ *
+ * @tags: [
+ * requires_fcv_49,
+ * uses_atclustertime,
+ * ]
+ */
+
+(function() {
+'use strict';
+
+load("jstests/libs/discover_topology.js");
+load("jstests/sharding/libs/resharding_test_fixture.js");
+
+function setupTest(reshardingTest, namespace, timeout) {
+ reshardingTest.setup();
+
+ jsTest.log(`Running test for criticalSectionTimeoutMillis = ${timeout}`);
+
+ const donorShardNames = reshardingTest.donorShardNames;
+ const inputCollection = reshardingTest.createShardedCollection({
+ ns: namespace,
+ shardKeyPattern: {oldKey: 1},
+ chunks: [
+ {min: {oldKey: MinKey}, max: {oldKey: 0}, shard: donorShardNames[0]},
+ {min: {oldKey: 0}, max: {oldKey: MaxKey}, shard: donorShardNames[1]},
+ ],
+ });
+
+ const mongos = inputCollection.getMongo();
+ const topology = DiscoverTopology.findConnectedNodes(mongos);
+ const coordinator = new Mongo(topology.configsvr.nodes[0]);
+ assert.commandWorked(coordinator.getDB("admin").adminCommand(
+ {setParameter: 1, reshardingCriticalSectionTimeoutMillis: timeout}));
+
+ assert.commandWorked(inputCollection.insert([
+ {_id: "stays on shard0", oldKey: -10, newKey: -10},
+ {_id: "moves to shard0", oldKey: 10, newKey: -10},
+ {_id: "moves to shard1", oldKey: -10, newKey: 10},
+ {_id: "stays on shard1", oldKey: 10, newKey: 10},
+ ]));
+}
+
+// This test will not timeout.
+const successReshardingTest =
+ new ReshardingTest({numDonors: 2, numRecipients: 2, reshardInPlace: true});
+const noTimeoutMillis = 8000;
+var namespace = `reshardingDb.coll${noTimeoutMillis}`;
+
+setupTest(successReshardingTest, namespace, noTimeoutMillis);
+
+var recipientShardNames = successReshardingTest.recipientShardNames;
+successReshardingTest.withReshardingInBackground({
+ newShardKeyPattern: {newKey: 1},
+ newChunks: [
+ {min: {newKey: MinKey}, max: {newKey: 0}, shard: recipientShardNames[0]},
+ {min: {newKey: 0}, max: {newKey: MaxKey}, shard: recipientShardNames[1]},
+ ],
+});
+
+successReshardingTest.teardown();
+
+// This test will timeout.
+const failureReshardingTest =
+ new ReshardingTest({numDonors: 2, numRecipients: 2, reshardInPlace: true});
+const shouldTimeoutMillis = 0;
+namespace = `reshardingDb.coll${shouldTimeoutMillis}`;
+
+setupTest(failureReshardingTest, namespace, shouldTimeoutMillis);
+
+recipientShardNames = failureReshardingTest.recipientShardNames;
+failureReshardingTest.withReshardingInBackground(
+ {
+ newShardKeyPattern: {newKey: 1},
+ newChunks: [
+ {min: {newKey: MinKey}, max: {newKey: 0}, shard: recipientShardNames[0]},
+ {min: {newKey: 0}, max: {newKey: MaxKey}, shard: recipientShardNames[1]},
+ ],
+ },
+ (tempNs) => {},
+ {expectedErrorCode: ErrorCodes.ReshardingCriticalSectionTimeout});
+
+failureReshardingTest.teardown();
+})();
diff --git a/src/mongo/base/error_codes.yml b/src/mongo/base/error_codes.yml
index ade1b613d3f..07a8353e4a7 100644
--- a/src/mongo/base/error_codes.yml
+++ b/src/mongo/base/error_codes.yml
@@ -423,6 +423,7 @@ error_codes:
- {code: 339, name: NoSuchReshardCollection}
- {code: 340, name: ReshardCollectionCommitted}
- {code: 341, name: ReshardCollectionAborted}
+ - {code: 342, name: ReshardingCriticalSectionTimeout}
# Error codes 4000-8999 are reserved.
diff --git a/src/mongo/db/s/resharding/resharding_coordinator_observer.cpp b/src/mongo/db/s/resharding/resharding_coordinator_observer.cpp
index 0522ac65ef4..2cc932b201e 100644
--- a/src/mongo/db/s/resharding/resharding_coordinator_observer.cpp
+++ b/src/mongo/db/s/resharding/resharding_coordinator_observer.cpp
@@ -275,6 +275,15 @@ void ReshardingCoordinatorObserver::interrupt(Status status) {
}
}
+void ReshardingCoordinatorObserver::onCriticalSectionTimeout() {
+ stdx::lock_guard<Latch> lk(_mutex);
+ if (!_allRecipientsReportedStrictConsistencyTimestamp.getFuture().isReady()) {
+ _allRecipientsReportedStrictConsistencyTimestamp.setError(
+ Status{ErrorCodes::ReshardingCriticalSectionTimeout,
+ "Resharding critical section timed out."});
+ }
+}
+
void ReshardingCoordinatorObserver::_onAbortOrStepdown(WithLock, Status status) {
if (!_allDonorsReportedMinFetchTimestamp.getFuture().isReady()) {
_allDonorsReportedMinFetchTimestamp.setError(status);
diff --git a/src/mongo/db/s/resharding/resharding_coordinator_observer.h b/src/mongo/db/s/resharding/resharding_coordinator_observer.h
index 50d4c2fdd5e..b0c6eedfe95 100644
--- a/src/mongo/db/s/resharding/resharding_coordinator_observer.h
+++ b/src/mongo/db/s/resharding/resharding_coordinator_observer.h
@@ -110,6 +110,12 @@ public:
SharedSemiFuture<void> awaitAllParticipantsDoneAborting();
/**
+ * Checks if all recipients are in steady state. Otherwise, sets an error state so that
+ * resharding is aborted.
+ */
+ void onCriticalSectionTimeout();
+
+ /**
* Sets errors on any promises that have not yet been fulfilled.
*/
void interrupt(Status status);
diff --git a/src/mongo/db/s/resharding/resharding_coordinator_service.cpp b/src/mongo/db/s/resharding/resharding_coordinator_service.cpp
index e76a0f93d03..50fe65125ee 100644
--- a/src/mongo/db/s/resharding/resharding_coordinator_service.cpp
+++ b/src/mongo/db/s/resharding/resharding_coordinator_service.cpp
@@ -1001,7 +1001,7 @@ SemiFuture<void> ReshardingCoordinatorService::ReshardingCoordinator::run(
const ReshardingCoordinatorDocument& updatedCoordinatorDoc) {
return _persistDecisionAndFinishReshardOperation(executor, updatedCoordinatorDoc);
})
- .onCompletion([this, self = shared_from_this()](Status status) {
+ .onCompletion([this, self = shared_from_this(), executor](Status status) {
// TODO SERVER-53914 depending on where we load metrics at the start of the operation,
// this may need to change
if (_coordinatorDoc.getState() != CoordinatorStateEnum::kUnused) {
@@ -1021,6 +1021,10 @@ SemiFuture<void> ReshardingCoordinatorService::ReshardingCoordinator::run(
_completionPromise.setError(status);
}
+ if (_criticalSectionTimeoutCbHandle) {
+ (*executor)->cancel(*_criticalSectionTimeoutCbHandle);
+ }
+
return status;
})
.thenRunOn(_coordinatorService->getInstanceCleanupExecutor())
@@ -1243,7 +1247,7 @@ ReshardingCoordinatorService::ReshardingCoordinator::_awaitAllRecipientsFinished
_reshardingCoordinatorObserver->awaitAllRecipientsFinishedApplying(),
_ctHolder->getAbortToken())
.thenRunOn(**executor)
- .then([this](ReshardingCoordinatorDocument coordinatorDocChangedOnDisk) {
+ .then([this, executor](ReshardingCoordinatorDocument coordinatorDocChangedOnDisk) {
{
auto opCtx = cc().makeOperationContext();
reshardingPauseCoordinatorInSteadyState.pauseWhileSetAndNotCanceled(
@@ -1252,6 +1256,23 @@ ReshardingCoordinatorService::ReshardingCoordinator::_awaitAllRecipientsFinished
this->_updateCoordinatorDocStateAndCatalogEntries(CoordinatorStateEnum::kBlockingWrites,
coordinatorDocChangedOnDisk);
+
+ const auto criticalSectionTimeout =
+ Milliseconds(resharding::gReshardingCriticalSectionTimeoutMillis.load());
+ auto swCbHandle = (*executor)->scheduleWorkAt(
+ (*executor)->now() + criticalSectionTimeout,
+ [this](const executor::TaskExecutor::CallbackArgs& cbData) {
+ if (!cbData.status.isOK()) {
+ return;
+ }
+ _reshardingCoordinatorObserver->onCriticalSectionTimeout();
+ });
+
+ if (!swCbHandle.isOK()) {
+ _reshardingCoordinatorObserver->interrupt(swCbHandle.getStatus());
+ }
+
+ _criticalSectionTimeoutCbHandle = swCbHandle.getValue();
});
}
diff --git a/src/mongo/db/s/resharding/resharding_coordinator_service.h b/src/mongo/db/s/resharding/resharding_coordinator_service.h
index b99accd3050..6efe33d2ea9 100644
--- a/src/mongo/db/s/resharding/resharding_coordinator_service.h
+++ b/src/mongo/db/s/resharding/resharding_coordinator_service.h
@@ -377,6 +377,9 @@ private:
// Promise that is resolved when the chain of work kicked off by run() has completed.
SharedPromise<void> _completionPromise;
+
+ // Callback handle for scheduled work to handle critical section timeout.
+ boost::optional<executor::TaskExecutor::CallbackHandle> _criticalSectionTimeoutCbHandle;
};
} // namespace mongo
diff --git a/src/mongo/db/s/resharding/resharding_server_parameters.idl b/src/mongo/db/s/resharding/resharding_server_parameters.idl
index 30cf8e9b728..4c13b83d93b 100644
--- a/src/mongo/db/s/resharding/resharding_server_parameters.idl
+++ b/src/mongo/db/s/resharding/resharding_server_parameters.idl
@@ -119,3 +119,15 @@ server_parameters:
expr: 5 * 60 * 1000
validator:
gte: 0
+
+ reshardingCriticalSectionTimeoutMillis:
+ description: >-
+ The upper limit on how long to wait to hear back from recipient shards reaching strict
+ consistency after engaging the critical section.
+ set_at: [startup, runtime]
+ cpp_vartype: AtomicWord<int>
+ cpp_varname: gReshardingCriticalSectionTimeoutMillis
+ default:
+ expr: 5 * 1000
+ validator:
+ gte: 0