/**
 * Tests that the resharding coordinator recovers its abort decision after a primary failover.
 *
 * The test starts a resharding operation and aborts it, while blocking the coordinator from
 * delivering the abort decision to the recipient shard. It then kills the client's
 * abortReshardCollection command and steps up a new config server primary, verifying that the new
 * ReshardingCoordinator recovers the persisted abort decision and finishes the abort on its own.
 */
(function() {
"use strict";

load("jstests/libs/discover_topology.js");
load("jstests/libs/parallelTester.js");
load("jstests/libs/parallel_shell_helpers.js");
load("jstests/sharding/libs/resharding_test_fixture.js");

// Elections must be enabled so stepUpNewPrimaryOnShard() can trigger a config server failover.
const reshardingTest = new ReshardingTest(
    {enableElections: true, logComponentVerbosity: tojson({sharding: 2, network: 4})});
reshardingTest.setup();

const donorShardNames = reshardingTest.donorShardNames;
const sourceCollection = reshardingTest.createShardedCollection({
    ns: "reshardingDb.coll",
    shardKeyPattern: {oldKey: 1},
    chunks: [{min: {oldKey: MinKey}, max: {oldKey: MaxKey}, shard: donorShardNames[0]}],
});

const mongos = sourceCollection.getMongo();
const ns = sourceCollection.getFullName();
let topology = DiscoverTopology.findConnectedNodes(mongos);

const recipientShardNames = reshardingTest.recipientShardNames;
const recipient = new Mongo(topology.shards[recipientShardNames[0]].primary);

// We have the recipient shard fail the _shardsvrAbortReshardCollection to synchronize around
// (1) the resharding coordinator having persisted its abort decision locally,
// (2) the resharding coordinator having waited for its abort decision to become majority
// committed, and
// (3) the resharding coordinator not yet having finished delivering the abort decision to all of
// the participant shards.
const shardsvrAbortReshardCollectionFailpoint = configureFailPoint(recipient, "failCommand", {
    failInternalCommands: true,
    errorCode: ErrorCodes.HostUnreachable,
    failCommands: ["_shardsvrAbortReshardCollection"],
});

// We pause the _configsvrReshardCollection command upon joining an existing ReshardingCoordinator
// instance on all of the config server replica set because we don't know which node will be elected
// primary from calling stepUpNewPrimaryOnShard().
const configsvrConnections = topology.configsvr.nodes.map(host => new Mongo(host));
const reshardCollectionJoinedFailPointsList = configsvrConnections.map(
    conn => configureFailPoint(conn, "reshardCollectionJoinedExistingOperation"));

let awaitAbort;
reshardingTest.withReshardingInBackground(
    {
        newShardKeyPattern: {newKey: 1},
        newChunks: [{min: {newKey: MinKey}, max: {newKey: MaxKey}, shard: recipientShardNames[0]}],
    },
    () => {
        // Wait until participants are aware of the resharding operation.
        reshardingTest.awaitCloneTimestampChosen();

        // Abort from a parallel shell so this thread can observe the coordinator's state
        // transitions while the abortReshardCollection command is still running.
        awaitAbort = startParallelShell(funWithArgs(function(ns) {
            db.adminCommand({abortReshardCollection: ns});
        }, ns), mongos.port);

        // Wait for the coordinator to have persisted its decision to abort the resharding operation
        // as a result of the abortReshardCollection command being processed.
        assert.soon(() => {
            const coordinatorDoc =
                mongos.getCollection("config.reshardingOperations").findOne({ns: ns});
            return coordinatorDoc !== null && coordinatorDoc.state === "aborting";
        });
    },
    {
        expectedErrorCode: ErrorCodes.ReshardCollectionAborted,
        postDecisionPersistedFn: () => {
            // Block until the coordinator has attempted (and failed) to deliver the abort decision
            // to the recipient shard, i.e. the decision is persisted but not yet fully delivered.
            shardsvrAbortReshardCollectionFailpoint.wait();

            // Mongos automatically retries the abortReshardCollection command on retryable errors.
            // We interrupt the abortReshardCollection command running on mongos to verify that the
            // ReshardingCoordinator recovers the decision on its own.
            const ops = mongos.getDB("admin")
                            .aggregate([
                                {$currentOp: {localOps: true}},
                                {$match: {"command.abortReshardCollection": ns}}
                            ])
                            .toArray();
            assert.neq([], ops, "failed to find abortReshardCollection command running on mongos");
            assert.eq(
                1,
                ops.length,
                () =>
                    `found more than one abortReshardCollection command on mongos: ${tojson(ops)}`);
            assert.commandWorked(mongos.getDB("admin").killOp(ops[0].opid));

            // Force a failover so a brand-new ReshardingCoordinator instance must recover the
            // already-persisted abort decision.
            reshardingTest.stepUpNewPrimaryOnShard(reshardingTest.configShardName);

            // After a stepdown, the _configsvrReshardCollection command will be retried by the
            // primary shard. We use the reshardCollectionJoinedExistingOperation failpoint to
            // ensure the primary shard upon retrying finds the ongoing resharding operation on the
            // new config server primary. It would otherwise be possible for the
            // reshardingPauseCoordinatorBeforeCompletion failpoint to be released by the
            // ReshardingTest fixture after this function returns, for the ongoing resharding
            // operation to complete, and for the retried _configsvrReshardCollection command to
            // spawn an entirely new resharding operation which won't get aborted by the test
            // client.
            topology = DiscoverTopology.findConnectedNodes(mongos);
            const configsvrPrimary = new Mongo(topology.configsvr.primary);
            const idx = reshardCollectionJoinedFailPointsList.findIndex(
                fp => fp.conn.host === configsvrPrimary.host);
            // Guard against findIndex() returning -1 (no connection matched the new primary's
            // host), which would otherwise surface as an opaque TypeError from calling .wait()
            // on undefined.
            assert.neq(
                -1,
                idx,
                () => `failed to find the new config server primary ${
                    configsvrPrimary.host} among the failpoint connections`);
            reshardCollectionJoinedFailPointsList[idx].wait();

            // Wait for secondaries to recover and catchup with primary before turning off the
            // failpoints as a replication roll back can disconnect the test client.
            const configRS = reshardingTest.getReplSetForShard(reshardingTest.configShardName);
            configRS.awaitSecondaryNodes();
            configRS.awaitReplication();

            reshardCollectionJoinedFailPointsList.forEach(fp => fp.off());
            shardsvrAbortReshardCollectionFailpoint.off();
        },
    });

awaitAbort();

reshardingTest.teardown();
})();