SERVER-61483 Fix resharding coordinator to recover its abort decision.

(cherry picked from commit d9fcd9f124ece9ab0b3a3c46cb6d7052b7282dd2)
author: Max Hirschhorn <max.hirschhorn@mongodb.com> 2021-11-16 13:07:24 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2021-11-16 13:31:24 +0000
commit: 201967a1696da1340f5dd7e328fe1229667e8e36 (patch)
tree: a82b9629a205df13916a26593e994f5d1f3904b3
parent: 2921ff2372c9d05f86eb7614a07b731cd5b5b544 (diff)
download: mongo-201967a1696da1340f5dd7e328fe1229667e8e36.tar.gz
3 files changed, 123 insertions, 3 deletions
diff --git a/jstests/sharding/libs/resharding_test_fixture.js b/jstests/sharding/libs/resharding_test_fixture.js
index 80010620013..4584436f82d 100644
--- a/jstests/sharding/libs/resharding_test_fixture.js
+++ b/jstests/sharding/libs/resharding_test_fixture.js
@@ -540,11 +540,16 @@ var ReshardingTest = class {
             });
         } else {
             this._callFunctionSafely(() => {
-                this._pauseCoordinatorBeforeBlockingWrites.off();
+                this.retryOnceOnNetworkError(  //
+                    () => this._pauseCoordinatorBeforeBlockingWrites.off());
+
                 postCheckConsistencyFn();
-                this._pauseCoordinatorBeforeDecisionPersistedFailpoint.off();
+                this.retryOnceOnNetworkError(
+                    () => this._pauseCoordinatorBeforeDecisionPersistedFailpoint.off());
+
                 postDecisionPersistedFn();
-                this._pauseCoordinatorBeforeCompletionFailpoint.off();
+                this.retryOnceOnNetworkError(
+                    () => this._pauseCoordinatorBeforeCompletionFailpoint.off());
             });
         }
 
@@ -899,4 +904,27 @@ var ReshardingTest = class {
 
         return cloneTimestamp;
     }
+
+    /**
+     * Calls the supplied function and returns its result, retrying once on a network error.
+     *
+     * If the first call throws a network error (per isNetworkError()), fn is invoked once more;
+     * any other error, or an error from that second call, propagates to the caller. This
+     * tolerates network errors resulting from elections triggered by stepUpNewPrimaryOnShard(),
+     * killAndRestartPrimaryOnShard(), and shutdownAndRestartPrimaryOnShard().
+     *
+     * @param fn - the function to be called; it may be invoked twice.
+     * @returns the return value from fn.
+     */
+    retryOnceOnNetworkError(fn) {
+        try {
+            return fn();
+        } catch (e) {
+            if (!isNetworkError(e)) {
+                throw e;
+            }
+
+            return fn();  // Second and final attempt; an error thrown here is not caught.
+        }
+    }
 };
diff --git a/jstests/sharding/resharding_coordinator_recovers_abort_decision.js b/jstests/sharding/resharding_coordinator_recovers_abort_decision.js
new file mode 100644
index 00000000000..cc1e6605c46
--- /dev/null
+++ b/jstests/sharding/resharding_coordinator_recovers_abort_decision.js
@@ -0,0 +1,87 @@
+/**
+ * Tests that the resharding coordinator recovers its abort decision after a primary failover.
+ */
+(function() {
+"use strict";
+
+load("jstests/libs/discover_topology.js");
+load("jstests/libs/parallelTester.js");
+load("jstests/libs/parallel_shell_helpers.js");
+load("jstests/sharding/libs/resharding_test_fixture.js");
+
+const reshardingTest = new ReshardingTest({enableElections: true});
+reshardingTest.setup();
+
+const donorShardNames = reshardingTest.donorShardNames;
+const sourceCollection = reshardingTest.createShardedCollection({
+    ns: "reshardingDb.coll",
+    shardKeyPattern: {oldKey: 1},
+    chunks: [{min: {oldKey: MinKey}, max: {oldKey: MaxKey}, shard: donorShardNames[0]}],
+});
+
+const mongos = sourceCollection.getMongo();
+const topology = DiscoverTopology.findConnectedNodes(mongos);
+
+const recipientShardNames = reshardingTest.recipientShardNames;
+const recipient = new Mongo(topology.shards[recipientShardNames[0]].primary);
+
+// We have the recipient shard fail the _shardsvrAbortReshardCollection to synchronize around
+//   (1) the resharding coordinator having persisted its abort decision locally,
+//   (2) the resharding coordinator having waited for its abort decision to become majority
+//       committed, and
+//   (3) the resharding coordinator not yet having finished delivering the abort decision to all of
+//       the participant shards.
+const shardsvrAbortReshardCollectionFailpoint = configureFailPoint(recipient, "failCommand", {
+    failInternalCommands: true,
+    errorCode: ErrorCodes.HostUnreachable,
+    failCommands: ["_shardsvrAbortReshardCollection"],
+});
+
+let awaitAbort;  // Set by the callback below; called once withReshardingInBackground() returns.
+reshardingTest.withReshardingInBackground(
+    {
+        newShardKeyPattern: {newKey: 1},
+        newChunks: [{min: {newKey: MinKey}, max: {newKey: MaxKey}, shard: recipientShardNames[0]}],
+    },
+    () => {
+        // Wait until participants are aware of the resharding operation.
+        reshardingTest.awaitCloneTimestampChosen();
+
+        awaitAbort = startParallelShell(funWithArgs(function(ns) {
+                                            db.adminCommand({abortReshardCollection: ns});
+                                        }, sourceCollection.getFullName()), mongos.port);
+    },
+    {
+        expectedErrorCode: ErrorCodes.ReshardCollectionAborted,
+        postDecisionPersistedFn: () => {
+            shardsvrAbortReshardCollectionFailpoint.wait();
+
+            // Mongos automatically retries the abortReshardCollection command on retryable errors.
+            // We interrupt the abortReshardCollection command running on mongos to verify that the
+            // ReshardingCoordinator recovers the decision on its own.
+            const ops =
+                mongos.getDB("admin")
+                    .aggregate([
+                        {$currentOp: {localOps: true}},
+                        {$match: {"command.abortReshardCollection": sourceCollection.getFullName()}}
+                    ])
+                    .toArray();
+
+            assert.neq([], ops, "failed to find abortReshardCollection command running on mongos");
+            assert.eq(
+                1,
+                ops.length,
+                () =>
+                    `found more than one abortReshardCollection command on mongos: ${tojson(ops)}`);
+
+            assert.commandWorked(mongos.getDB("admin").killOp(ops[0].opid));
+
+            reshardingTest.stepUpNewPrimaryOnShard(reshardingTest.configShardName);
+            shardsvrAbortReshardCollectionFailpoint.off();
+        },
+    });
+
+awaitAbort();  // Calls the handle returned by startParallelShell(); presumably joins that shell.
+
+reshardingTest.teardown();
+})();
diff --git a/src/mongo/db/s/resharding/resharding_coordinator_service.cpp b/src/mongo/db/s/resharding/resharding_coordinator_service.cpp
index e3003540412..c66ee925993 100644
--- a/src/mongo/db/s/resharding/resharding_coordinator_service.cpp
+++ b/src/mongo/db/s/resharding/resharding_coordinator_service.cpp
@@ -1463,6 +1463,10 @@ void ReshardingCoordinatorService::ReshardingCoordinator::_insertCoordDocAndChan
                 ->onStepUp(ReshardingMetrics::Role::kCoordinator);
         }
 
+        if (_coordinatorDoc.getState() == CoordinatorStateEnum::kAborting) {
+            _ctHolder->abort();
+        }
+
         return;
     }
 
@@ -1682,6 +1686,7 @@ ReshardingCoordinatorService::ReshardingCoordinator::_awaitAllRecipientsInStrict
 Future<void> ReshardingCoordinatorService::ReshardingCoordinator::_commit(
     const ReshardingCoordinatorDocument& coordinatorDoc) {
     if (_coordinatorDoc.getState() > CoordinatorStateEnum::kBlockingWrites) {
+        invariant(_coordinatorDoc.getState() != CoordinatorStateEnum::kAborting);
         return Status::OK();
     }
author	Max Hirschhorn <max.hirschhorn@mongodb.com>	2021-11-16 13:07:24 +0000
committer	Evergreen Agent <no-reply@evergreen.mongodb.com>	2021-11-16 13:31:24 +0000
commit	201967a1696da1340f5dd7e328fe1229667e8e36 (patch)
tree	a82b9629a205df13916a26593e994f5d1f3904b3
parent	2921ff2372c9d05f86eb7614a07b731cd5b5b544 (diff)
download	mongo-201967a1696da1340f5dd7e328fe1229667e8e36.tar.gz