summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMax Hirschhorn <max.hirschhorn@mongodb.com>2021-11-16 13:07:24 +0000
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2021-11-16 13:31:24 +0000
commit201967a1696da1340f5dd7e328fe1229667e8e36 (patch)
treea82b9629a205df13916a26593e994f5d1f3904b3
parent2921ff2372c9d05f86eb7614a07b731cd5b5b544 (diff)
downloadmongo-201967a1696da1340f5dd7e328fe1229667e8e36.tar.gz
SERVER-61483 Fix resharding coordinator to recover its abort decision.
(cherry picked from commit d9fcd9f124ece9ab0b3a3c46cb6d7052b7282dd2)
-rw-r--r--jstests/sharding/libs/resharding_test_fixture.js34
-rw-r--r--jstests/sharding/resharding_coordinator_recovers_abort_decision.js87
-rw-r--r--src/mongo/db/s/resharding/resharding_coordinator_service.cpp5
3 files changed, 123 insertions, 3 deletions
diff --git a/jstests/sharding/libs/resharding_test_fixture.js b/jstests/sharding/libs/resharding_test_fixture.js
index 80010620013..4584436f82d 100644
--- a/jstests/sharding/libs/resharding_test_fixture.js
+++ b/jstests/sharding/libs/resharding_test_fixture.js
@@ -540,11 +540,16 @@ var ReshardingTest = class {
});
} else {
this._callFunctionSafely(() => {
- this._pauseCoordinatorBeforeBlockingWrites.off();
+ this.retryOnceOnNetworkError( //
+ () => this._pauseCoordinatorBeforeBlockingWrites.off());
+
postCheckConsistencyFn();
- this._pauseCoordinatorBeforeDecisionPersistedFailpoint.off();
+ this.retryOnceOnNetworkError(
+ () => this._pauseCoordinatorBeforeDecisionPersistedFailpoint.off());
+
postDecisionPersistedFn();
- this._pauseCoordinatorBeforeCompletionFailpoint.off();
+ this.retryOnceOnNetworkError(
+ () => this._pauseCoordinatorBeforeCompletionFailpoint.off());
});
}
@@ -899,4 +904,27 @@ var ReshardingTest = class {
return cloneTimestamp;
}
+
+ /**
+ * Calls and returns the value from the supplied function.
+ *
+ * If a network error is thrown during its execution, then this function will invoke the
+ * supplied function a second time; an error thrown by that second call, or a non-network
+ * error thrown by the first, propagates to the caller. This pattern is useful for tolerating
+ * network errors which result from elections triggered by any of the stepUpNewPrimaryOnShard(),
+ * killAndRestartPrimaryOnShard(), and shutdownAndRestartPrimaryOnShard() methods.
+ * @param fn - the function to be called; it may be invoked twice.
+ * @returns the return value from fn.
+ */
+ retryOnceOnNetworkError(fn) {
+ try {
+ return fn();
+ } catch (e) {
+ if (!isNetworkError(e)) {
+ throw e;
+ }
+
+ return fn();
+ }
+ }
};
diff --git a/jstests/sharding/resharding_coordinator_recovers_abort_decision.js b/jstests/sharding/resharding_coordinator_recovers_abort_decision.js
new file mode 100644
index 00000000000..cc1e6605c46
--- /dev/null
+++ b/jstests/sharding/resharding_coordinator_recovers_abort_decision.js
@@ -0,0 +1,87 @@
+/**
+ * Tests that the resharding coordinator recovers its abort decision after a config server failover.
+ */
+(function() {
+"use strict";
+
+load("jstests/libs/discover_topology.js");
+load("jstests/libs/parallelTester.js");
+load("jstests/libs/parallel_shell_helpers.js");
+load("jstests/sharding/libs/resharding_test_fixture.js");
+
+const reshardingTest = new ReshardingTest({enableElections: true});
+reshardingTest.setup();
+
+const donorShardNames = reshardingTest.donorShardNames;
+const sourceCollection = reshardingTest.createShardedCollection({
+ ns: "reshardingDb.coll",
+ shardKeyPattern: {oldKey: 1},
+ chunks: [{min: {oldKey: MinKey}, max: {oldKey: MaxKey}, shard: donorShardNames[0]}],
+});
+
+const mongos = sourceCollection.getMongo();
+const topology = DiscoverTopology.findConnectedNodes(mongos);
+
+const recipientShardNames = reshardingTest.recipientShardNames;
+const recipient = new Mongo(topology.shards[recipientShardNames[0]].primary);
+
+// We have the recipient shard fail the _shardsvrAbortReshardCollection to synchronize around
+// (1) the resharding coordinator having persisted its abort decision locally,
+// (2) the resharding coordinator having waited for its abort decision to become majority
+// committed, and
+// (3) the resharding coordinator not yet having finished delivering the abort decision to all of
+// the participant shards.
+const shardsvrAbortReshardCollectionFailpoint = configureFailPoint(recipient, "failCommand", {
+ failInternalCommands: true,
+ errorCode: ErrorCodes.HostUnreachable,
+ failCommands: ["_shardsvrAbortReshardCollection"],
+});
+
+let awaitAbort;
+reshardingTest.withReshardingInBackground(
+ {
+ newShardKeyPattern: {newKey: 1},
+ newChunks: [{min: {newKey: MinKey}, max: {newKey: MaxKey}, shard: recipientShardNames[0]}],
+ },
+ () => {
+ // Wait until participants are aware of the resharding operation.
+ reshardingTest.awaitCloneTimestampChosen();
+
+ awaitAbort = startParallelShell(funWithArgs(function(ns) {
+ db.adminCommand({abortReshardCollection: ns});
+ }, sourceCollection.getFullName()), mongos.port);
+ },
+ {
+ expectedErrorCode: ErrorCodes.ReshardCollectionAborted,
+ postDecisionPersistedFn: () => {
+ shardsvrAbortReshardCollectionFailpoint.wait();
+
+ // Mongos automatically retries the abortReshardCollection command on retryable errors.
+ // We interrupt the abortReshardCollection command running on mongos to verify that the
+ // ReshardingCoordinator recovers the decision on its own.
+ const ops =
+ mongos.getDB("admin")
+ .aggregate([
+ {$currentOp: {localOps: true}},
+ {$match: {"command.abortReshardCollection": sourceCollection.getFullName()}}
+ ])
+ .toArray();
+
+ assert.neq([], ops, "failed to find abortReshardCollection command running on mongos");
+ assert.eq(
+ 1,
+ ops.length,
+ () =>
+ `found more than one abortReshardCollection command on mongos: ${tojson(ops)}`);
+
+ assert.commandWorked(mongos.getDB("admin").killOp(ops[0].opid));
+
+ reshardingTest.stepUpNewPrimaryOnShard(reshardingTest.configShardName);  // The failover.
+ shardsvrAbortReshardCollectionFailpoint.off();
+ },
+ });
+
+awaitAbort();  // Join the parallel shell that ran abortReshardCollection.
+
+reshardingTest.teardown();
+})();
diff --git a/src/mongo/db/s/resharding/resharding_coordinator_service.cpp b/src/mongo/db/s/resharding/resharding_coordinator_service.cpp
index e3003540412..c66ee925993 100644
--- a/src/mongo/db/s/resharding/resharding_coordinator_service.cpp
+++ b/src/mongo/db/s/resharding/resharding_coordinator_service.cpp
@@ -1463,6 +1463,10 @@ void ReshardingCoordinatorService::ReshardingCoordinator::_insertCoordDocAndChan
->onStepUp(ReshardingMetrics::Role::kCoordinator);
}
+ if (_coordinatorDoc.getState() == CoordinatorStateEnum::kAborting) {
+ _ctHolder->abort();
+ }
+
return;
}
@@ -1682,6 +1686,7 @@ ReshardingCoordinatorService::ReshardingCoordinator::_awaitAllRecipientsInStrict
Future<void> ReshardingCoordinatorService::ReshardingCoordinator::_commit(
const ReshardingCoordinatorDocument& coordinatorDoc) {
if (_coordinatorDoc.getState() > CoordinatorStateEnum::kBlockingWrites) {
+ invariant(_coordinatorDoc.getState() != CoordinatorStateEnum::kAborting);
return Status::OK();
}