path: root/jstests/sharding/txn_two_phase_commit_wait_for_majority_commit_after_stepup.js
/**
 * Verifies that a node waits for the write done on stepup to become majority committed before
 * resuming coordinating transaction commits.
 *
 * @tags: [uses_transactions, uses_multi_shard_transaction]
 */

// The UUID consistency check uses connections to shards cached on the ShardingTest object, but this
// test causes failovers on a shard, so the cached connection is not usable.
TestData.skipCheckingUUIDsConsistentAcrossCluster = true;

(function() {
'use strict';

load("jstests/libs/fail_point_util.js");
load('jstests/libs/write_concern_util.js');  // for stopping/restarting replication

const dbName = "test";
const collName = "foo";
const ns = dbName + "." + collName;

let st = new ShardingTest({
    shards: 3,
    rs0: {nodes: [{}, {rsConfig: {priority: 0}}]},
    causallyConsistent: true,
    other: {
        mongosOptions: {verbose: 3},
    }
});

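// Shard0's replica set acts as the transaction coordinator (setUp() makes it the primary shard
// for the test database), so the failover scenario below is driven against its two nodes.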
let coordinatorReplSetTest = st.rs0;
let participant0 = st.shard0;
let participant1 = st.shard1;
let participant2 = st.shard2;

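// Session id and transaction number reused by every statement of the transaction driven below.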
let lsid = {id: UUID()};
let txnNumber = 0;

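// Sends commitTransaction through mongos in a parallel shell and expects it to fail with
// MaxTimeMSExpired, since the coordinator is blocked from completing the commit.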
const runCommitThroughMongosInParallelShellExpectTimeOut = function() {
    const runCommitExpectTimeOutCode = `
        assert.commandFailedWithCode(db.adminCommand({
            commitTransaction: 1,
            maxTimeMS: 1000 * 10,
            lsid: ${tojson(lsid)},
            txnNumber: NumberLong(${txnNumber}),
            stmtId: NumberInt(0),
            autocommit: false,
        }), ErrorCodes.MaxTimeMSExpired);`;
    return startParallelShell(runCommitExpectTimeOutCode, st.s.port);
};

const setUp = function() {
    // Create a sharded collection with a chunk on each shard:
    // shard0: [-inf, 0)
    // shard1: [0, 10)
    // shard2: [10, +inf)
    assert.commandWorked(st.s.adminCommand({enableSharding: dbName}));
    // The default WC is majority and stopServerReplication will prevent satisfying any majority
    // writes.
    assert.commandWorked(st.s.adminCommand(
        {setDefaultRWConcern: 1, defaultWriteConcern: {w: 1}, writeConcern: {w: "majority"}}));

    assert.commandWorked(st.s.adminCommand({movePrimary: dbName, to: participant0.shardName}));
    assert.commandWorked(st.s.adminCommand({shardCollection: ns, key: {_id: 1}}));
    assert.commandWorked(st.s.adminCommand({split: ns, middle: {_id: 0}}));
    assert.commandWorked(st.s.adminCommand({split: ns, middle: {_id: 10}}));
    assert.commandWorked(
        st.s.adminCommand({moveChunk: ns, find: {_id: 0}, to: participant1.shardName}));
    assert.commandWorked(
        st.s.adminCommand({moveChunk: ns, find: {_id: 10}, to: participant2.shardName}));

    // These forced refreshes are not strictly necessary; they just prevent extra TXN log lines
    // caused by the shards starting, aborting, and restarting the transaction because they need
    // to refresh their routing caches after the transaction has started.
    assert.commandWorked(participant0.adminCommand({_flushRoutingTableCacheUpdates: ns}));
    assert.commandWorked(participant1.adminCommand({_flushRoutingTableCacheUpdates: ns}));
    assert.commandWorked(participant2.adminCommand({_flushRoutingTableCacheUpdates: ns}));
    st.refreshCatalogCacheForNs(st.s, ns);

    // Start a new transaction by inserting a document onto each shard.
    assert.commandWorked(st.s.getDB(dbName).runCommand({
        insert: collName,
        documents: [{_id: -5}, {_id: 5}, {_id: 15}],
        lsid: lsid,
        txnNumber: NumberLong(txnNumber),
        stmtId: NumberInt(0),
        startTransaction: true,
        autocommit: false,
    }));
};
setUp();

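// rs0 has only two nodes and the secondary has priority 0, so after the forced stepdown below the
// original primary is expected to step back up.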
let coordPrimary = coordinatorReplSetTest.getPrimary();
let coordSecondary = coordinatorReplSetTest.getSecondary();

// Make the commit coordination hang before writing the decision, and send commitTransaction.
let failPoint = configureFailPoint(coordPrimary, "hangBeforeWritingDecision");
let awaitResult = runCommitThroughMongosInParallelShellExpectTimeOut();
failPoint.wait();

// Stop replication on all nodes in the coordinator replica set so that the write done on stepup
// cannot become majority committed, regardless of which node steps up.
stopServerReplication([coordPrimary, coordSecondary]);

// Induce the coordinator primary to step down, but allow it to immediately step back up.
assert.commandWorked(
    coordPrimary.adminCommand({replSetStepDown: ReplSetTest.kForeverSecs, force: true}));
assert.commandWorked(coordPrimary.adminCommand({replSetFreeze: 0}));

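// Release the failpoint so the coordinator can proceed. Having stepped down and back up, it must
// first wait for its stepup write to become majority committed, which stopped replication
// prevents, so commit coordination cannot resume yet.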
failPoint.off();

// The router should retry commitTransaction against the primary and time out waiting to
// access the coordinator catalog.
awaitResult();

// Re-enable replication, so that the write done on stepup can become majority committed.
restartReplSetReplication(coordinatorReplSetTest);

// Now, commitTransaction should succeed.
assert.commandWorked(st.s.adminCommand({
    commitTransaction: 1,
    lsid: lsid,
    txnNumber: NumberLong(txnNumber),
    stmtId: NumberInt(0),
    autocommit: false
}));

jsTest.log("Verify that the transaction was committed on all shards.");
assert.eq(3, st.s.getDB(dbName).getCollection(collName).find().itcount());

st.stop();
})();