jstests/sharding/move_primary_donor_cleaned_up_if_coordinator_steps_up_aborted.js


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66

/**
 * Tests that the MovePrimaryCoordinator recovers and cleans up the MovePrimaryDonor after a
 * failover that happens when the coordinator has already aborted.
 *
 *  @tags: [
 *    requires_fcv_70,
 *    featureFlagOnlineMovePrimaryLifecycle
 * ]
 */
(function() {
'use strict';
load("jstests/libs/fail_point_util.js");
load("jstests/libs/parallel_shell_helpers.js");

// Two shards, each a 3-node replica set. The donor shard (rs0) is forced to step down twice below,
// presumably the reason each replica set needs three nodes.
const st = new ShardingTest({mongos: 1, shards: 2, rs: {nodes: 3}});

const mongos = st.s0;
const shard0 = st.shard0;
// Captured before any failover so the donor failpoint is set on the node that is primary when
// movePrimary starts.
const oldDonorPrimary = st.rs0.getPrimary();
const shard1 = st.shard1;

const dbName = 'test_db';
const collName = 'test_coll';
const collNS = `${dbName}.${collName}`;

// Create the database with shard0 as its primary shard and give it some data to move.
assert.commandWorked(mongos.adminCommand({enableSharding: dbName, primaryShard: shard0.shardName}));
assert.commandWorked(mongos.getCollection(collNS).insert({value: 1}));
assert.commandWorked(mongos.getCollection(collNS).insert({value: 2}));

// Pause the donor right after it transitions to the "cloning" state.
const donorStartedCloningFp = configureFailPoint(oldDonorPrimary,
                                                 "pauseDuringMovePrimaryDonorStateTransition",
                                                 {progress: "after", state: "cloning"});

// Run movePrimary and wait for MovePrimaryDonor to start. The command is expected to fail because
// the coordinator aborts after the failover triggered below.
const joinMovePrimary = startParallelShell(
    funWithArgs(function(dbName, toShard) {
        assert.commandFailed(db.adminCommand({movePrimary: dbName, to: toShard}));
    }, dbName, shard1.shardName), mongos.port);

donorStartedCloningFp.wait();

// Trigger a failover. The MovePrimaryCoordinator will abort on step up. Make sure it does not clean
// up the donor yet, by arming the "hang before cleaning up" failpoint on every rs0 node (any of
// them may become the next primary).
// NOTE(review): `put()` and the array-returning `values()` used below are not part of the standard
// ES6 Map API; this appears to rely on the Map provided by the mongo shell -- confirm.
const pauseCoordinatorFps = new Map();
st.rs0.nodes.forEach(node => {
    pauseCoordinatorFps.put(
        node, configureFailPoint(node, "movePrimaryCoordinatorHangBeforeCleaningUp"));
});
// NOTE(review): the step-down result is intentionally left unchecked, as in the original test.
st.rs0.getPrimary().adminCommand({replSetStepDown: ReplSetTest.kForeverSecs, force: 1});
// Release the donor failpoint that was set on the old primary.
donorStartedCloningFp.off();
st.rs0.awaitNodesAgreeOnPrimary();

// TODO SERVER-77115: Investigate why test times out if this sleep is removed.
sleep(5000);

// Trigger another failover when 1. the MovePrimaryCoordinator is already aborted and 2. the
// MovePrimaryDonor is still alive. This is the case this test is trying to set up.
// Wait until the current primary is hanging at the cleanup failpoint before stepping it down.
pauseCoordinatorFps.get(st.rs0.getPrimary()).wait();
st.rs0.getPrimary().adminCommand({replSetStepDown: ReplSetTest.kForeverSecs, force: 1});
st.rs0.awaitNodesAgreeOnPrimary();
// Let the coordinator proceed with cleanup on whichever node is primary now, then join the
// parallel shell that issued movePrimary.
pauseCoordinatorFps.values().forEach(fp => fp.off());
joinMovePrimary();

// Verify that the MovePrimaryCoordinator has cleaned up the MovePrimaryDonor.
assert.eq([], shard0.getDB("config").movePrimaryDonors.find({}).toArray());

st.stop();
})();