diff options
author | Max Hirschhorn <max.hirschhorn@mongodb.com> | 2021-11-16 13:07:24 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-11-16 13:31:24 +0000 |
commit | 201967a1696da1340f5dd7e328fe1229667e8e36 (patch) | |
tree | a82b9629a205df13916a26593e994f5d1f3904b3 | |
parent | 2921ff2372c9d05f86eb7614a07b731cd5b5b544 (diff) | |
download | mongo-201967a1696da1340f5dd7e328fe1229667e8e36.tar.gz |
SERVER-61483 Fix resharding coordinator to recover its abort decision.
(cherry picked from commit d9fcd9f124ece9ab0b3a3c46cb6d7052b7282dd2)
3 files changed, 123 insertions, 3 deletions
diff --git a/jstests/sharding/libs/resharding_test_fixture.js b/jstests/sharding/libs/resharding_test_fixture.js index 80010620013..4584436f82d 100644 --- a/jstests/sharding/libs/resharding_test_fixture.js +++ b/jstests/sharding/libs/resharding_test_fixture.js @@ -540,11 +540,16 @@ var ReshardingTest = class { }); } else { this._callFunctionSafely(() => { - this._pauseCoordinatorBeforeBlockingWrites.off(); + this.retryOnceOnNetworkError( // + () => this._pauseCoordinatorBeforeBlockingWrites.off()); + postCheckConsistencyFn(); - this._pauseCoordinatorBeforeDecisionPersistedFailpoint.off(); + this.retryOnceOnNetworkError( + () => this._pauseCoordinatorBeforeDecisionPersistedFailpoint.off()); + postDecisionPersistedFn(); - this._pauseCoordinatorBeforeCompletionFailpoint.off(); + this.retryOnceOnNetworkError( + () => this._pauseCoordinatorBeforeCompletionFailpoint.off()); }); } @@ -899,4 +904,27 @@ var ReshardingTest = class { return cloneTimestamp; } + + /** + * Calls and returns the value from the supplied function. + * + * If a network error is thrown during its execution, then this function will invoke the + * supplied function a second time. This pattern is useful for tolerating network errors which + * result from elections triggered by any of the stepUpNewPrimaryOnShard(), + * killAndRestartPrimaryOnShard(), and shutdownAndRestartPrimaryOnShard() methods. + * + * @param fn - the function to be called. + * @returns the return value from fn. + */ + retryOnceOnNetworkError(fn) { + try { + return fn(); + } catch (e) { + if (!isNetworkError(e)) { + throw e; + } + + return fn(); + } + } }; diff --git a/jstests/sharding/resharding_coordinator_recovers_abort_decision.js b/jstests/sharding/resharding_coordinator_recovers_abort_decision.js new file mode 100644 index 00000000000..cc1e6605c46 --- /dev/null +++ b/jstests/sharding/resharding_coordinator_recovers_abort_decision.js @@ -0,0 +1,87 @@ +/** + * Tests that the resharding coordinator recovers its abort decision after a primary failover. + */ +(function() { +"use strict"; + +load("jstests/libs/discover_topology.js"); +load("jstests/libs/parallelTester.js"); +load("jstests/libs/parallel_shell_helpers.js"); +load("jstests/sharding/libs/resharding_test_fixture.js"); + +const reshardingTest = new ReshardingTest({enableElections: true}); +reshardingTest.setup(); + +const donorShardNames = reshardingTest.donorShardNames; +const sourceCollection = reshardingTest.createShardedCollection({ + ns: "reshardingDb.coll", + shardKeyPattern: {oldKey: 1}, + chunks: [{min: {oldKey: MinKey}, max: {oldKey: MaxKey}, shard: donorShardNames[0]}], +}); + +const mongos = sourceCollection.getMongo(); +const topology = DiscoverTopology.findConnectedNodes(mongos); + +const recipientShardNames = reshardingTest.recipientShardNames; +const recipient = new Mongo(topology.shards[recipientShardNames[0]].primary); + +// We have the recipient shard fail the _shardsvrAbortReshardCollection to synchronize around +// (1) the resharding coordinator having persisted its abort decision locally, +// (2) the resharding coordinator having waited for its abort decision to become majority +// committed, and +// (3) the resharding coordinator not yet having finished delivering the abort decision to all of +// the participant shards. +const shardsvrAbortReshardCollectionFailpoint = configureFailPoint(recipient, "failCommand", { + failInternalCommands: true, + errorCode: ErrorCodes.HostUnreachable, + failCommands: ["_shardsvrAbortReshardCollection"], +}); + +let awaitAbort; +reshardingTest.withReshardingInBackground( + { + newShardKeyPattern: {newKey: 1}, + newChunks: [{min: {newKey: MinKey}, max: {newKey: MaxKey}, shard: recipientShardNames[0]}], + }, + () => { + // Wait until participants are aware of the resharding operation. + reshardingTest.awaitCloneTimestampChosen(); + + awaitAbort = startParallelShell(funWithArgs(function(ns) { + db.adminCommand({abortReshardCollection: ns}); + }, sourceCollection.getFullName()), mongos.port); + }, + { + expectedErrorCode: ErrorCodes.ReshardCollectionAborted, + postDecisionPersistedFn: () => { + shardsvrAbortReshardCollectionFailpoint.wait(); + + // Mongos automatically retries the abortReshardCollection command on retryable errors. + // We interrupt the abortReshardCollection command running on mongos to verify that the + // ReshardingCoordinator recovers the decision on its own. + const ops = + mongos.getDB("admin") + .aggregate([ + {$currentOp: {localOps: true}}, + {$match: {"command.abortReshardCollection": sourceCollection.getFullName()}} + ]) + .toArray(); + + assert.neq([], ops, "failed to find abortReshardCollection command running on mongos"); + assert.eq( + 1, + ops.length, + () => + `found more than one abortReshardCollection command on mongos: ${tojson(ops)}`); + + assert.commandWorked(mongos.getDB("admin").killOp(ops[0].opid)); + + reshardingTest.stepUpNewPrimaryOnShard(reshardingTest.configShardName); + shardsvrAbortReshardCollectionFailpoint.off(); + }, + }); + +awaitAbort(); + +reshardingTest.teardown(); +})(); diff --git a/src/mongo/db/s/resharding/resharding_coordinator_service.cpp b/src/mongo/db/s/resharding/resharding_coordinator_service.cpp index e3003540412..c66ee925993 100644 --- a/src/mongo/db/s/resharding/resharding_coordinator_service.cpp +++ b/src/mongo/db/s/resharding/resharding_coordinator_service.cpp @@ -1463,6 +1463,10 @@ void ReshardingCoordinatorService::ReshardingCoordinator::_insertCoordDocAndChan ->onStepUp(ReshardingMetrics::Role::kCoordinator); } + if (_coordinatorDoc.getState() == CoordinatorStateEnum::kAborting) { + _ctHolder->abort(); + } + return; } @@ -1682,6 +1686,7 @@ ReshardingCoordinatorService::ReshardingCoordinator::_awaitAllRecipientsInStrict Future<void> ReshardingCoordinatorService::ReshardingCoordinator::_commit( const ReshardingCoordinatorDocument& coordinatorDoc) { if (_coordinatorDoc.getState() > CoordinatorStateEnum::kBlockingWrites) { + invariant(_coordinatorDoc.getState() != CoordinatorStateEnum::kAborting); return Status::OK(); } |