summaryrefslogtreecommitdiff
path: root/jstests/sharding/transactions_stale_shard_version_errors.js
blob: 4d9b7f7edf32bad596c5e974acd7f0edaa55e025 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
// Tests mongos behavior on stale shard version errors received in a transaction.
//
// @tags: [requires_sharding, uses_transactions]
(function() {
    "use strict";

    // Asserts, via the router's view of config.chunks, how many chunks of a namespace each shard
    // owns.
    //
    // st     - fixture exposing a router connection as `st.s` and shards as `st.shard0`,
    //          `st.shard1`, ... (each with a `shardName`).
    // ns     - full namespace ("db.collection") whose chunks are counted.
    // chunks - array where chunks[i] is the expected number of chunks on `st.shard<i>`; only the
    //          shards covered by the array are checked.
    function expectChunks(st, ns, chunks) {
        for (const [shardIdx, expectedCount] of chunks.entries()) {
            const chunksColl = st.s.getDB("config").chunks;
            const actualCount = chunksColl.count({ns: ns, shard: st["shard" + shardIdx].shardName});
            const failureMsg = "unexpected number of chunks on shard " + shardIdx;
            assert.eq(expectedCount, actualCount, failureMsg);
        }
    }

    // Namespaces of the two sharded collections exercised by the test; both live in one database.
    const dbName = "test";
    const collName = "foo";
    const ns = `${dbName}.${collName}`;
    const otherCollName = "bar";
    const otherNs = `${dbName}.${otherCollName}`;

    const st = new ShardingTest({shards: 3, mongos: 2, config: 1});

    // Disable the best-effort recipient metadata refresh after migrations on every shard, to
    // simplify simulating stale shard version errors.
    [st.rs0, st.rs1, st.rs2].forEach((shardReplSet) => {
        assert.commandWorked(shardReplSet.getPrimary().adminCommand(
            {configureFailPoint: "doNotRefreshRecipientAfterCommit", mode: "alwaysOn"}));
    });

    // Inserts one majority-committed document on each side of the future split point (_id: 0).
    const insertSeedDocs = (name) => {
        [-5, 5].forEach((id) => {
            assert.writeOK(
                st.s.getDB(dbName)[name].insert({_id: id}, {writeConcern: {w: "majority"}}));
        });
    };

    // Shards the collection on _id and splits it into two chunks, [minKey, 0) and [0, maxKey).
    const shardAndSplitAtZero = (fullNs) => {
        assert.commandWorked(st.s.adminCommand({shardCollection: fullNs, key: {_id: 1}}));
        assert.commandWorked(st.s.adminCommand({split: fullNs, middle: {_id: 0}}));
    };

    // First collection: seed it, make Shard0 the database's primary shard, then shard and split.
    insertSeedDocs(collName);
    assert.commandWorked(st.s.adminCommand({enableSharding: dbName}));
    st.ensurePrimaryShard(dbName, st.shard0.shardName);
    shardAndSplitAtZero(ns);
    expectChunks(st, ns, [2, 0, 0]);

    // Second collection in the same database: same shape, both chunks on Shard0.
    insertSeedDocs(otherCollName);
    shardAndSplitAtZero(otherNs);
    expectChunks(st, otherNs, [2, 0, 0]);

    // All transactions below run on this session through the main mongos (st.s).
    const session = st.s.startSession();
    const sessionDB = session.getDatabase(dbName);

    //
    // Stale shard version on first overall command should succeed.
    //

    // Move a chunk in the first collection from Shard0 to Shard1 through the main mongos, so Shard1
    // is stale but not the router. (Shard1 stays stale because the recipient refresh after
    // migration commit was disabled via the doNotRefreshRecipientAfterCommit fail point.)
    assert.commandWorked(
        st.s.adminCommand({moveChunk: ns, find: {_id: 5}, to: st.shard1.shardName}));
    expectChunks(st, ns, [1, 1, 0]);

    session.startTransaction();

    // Targets Shard1, which is stale. This is the first statement of the transaction and it is
    // expected to succeed despite the stale shard.
    assert.commandWorked(sessionDB.runCommand({find: collName, filter: {_id: 5}}));

    // TODO SERVER-36304: Change this to commitTransaction once multi shard transactions can be
    // committed through mongos.
    session.abortTransaction();

    //
    // Stale shard version on second command to a shard should fail.
    //

    // Sanity check: the first collection's chunk layout is unchanged since the last migration.
    expectChunks(st, ns, [1, 1, 0]);

    // Move a chunk in the other collection from Shard0 to Shard1 through the main mongos, so Shard1
    // is stale for the other collection but not the router.
    assert.commandWorked(
        st.s.adminCommand({moveChunk: otherNs, find: {_id: 5}, to: st.shard1.shardName}));
    expectChunks(st, otherNs, [1, 1, 0]);

    session.startTransaction();

    // Targets Shard1 for the first ns, which is not stale.
    assert.commandWorked(sessionDB.runCommand({find: collName, filter: {_id: 5}}));

    // Targets the other sharded collection on Shard1, which is stale. Because a previous statement
    // has executed on Shard1, the retry will not restart the transaction, and will fail when it
    // finds the transaction has aborted because of the stale shard version.
    // `res` is declared with `let` here because later cases reassign it.
    let res =
        assert.commandFailedWithCode(sessionDB.runCommand({find: otherCollName, filter: {_id: 5}}),
                                     ErrorCodes.NoSuchTransaction);
    // The error response must carry exactly the TransientTransactionError label.
    assert.eq(res.errorLabels, ["TransientTransactionError"]);

    // End the transaction on the session before the next case starts a new one.
    session.abortTransaction();

    //
    // Stale shard version on first command to a new shard should succeed.
    //

    // Sanity check: the first collection still has one chunk on Shard0 and one on Shard1.
    expectChunks(st, ns, [1, 1, 0]);

    // Move a chunk for the other collection from Shard1 to Shard0 through the main mongos, so
    // Shard0 is stale for it and the router is not.
    assert.commandWorked(
        st.s.adminCommand({moveChunk: otherNs, find: {_id: 5}, to: st.shard0.shardName}));
    expectChunks(st, otherNs, [2, 0, 0]);

    session.startTransaction();

    // Targets Shard1 for the first ns, which is not stale.
    assert.commandWorked(sessionDB.runCommand({find: collName, filter: {_id: 5}}));

    // Targets Shard0 for the other ns, which is stale. This is the second statement of the
    // transaction but the first one sent to Shard0, and it is expected to succeed.
    assert.commandWorked(sessionDB.runCommand({find: otherCollName, filter: {_id: 5}}));

    // TODO SERVER-36304: Change this to commitTransaction.
    session.abortTransaction();

    //
    // Stale mongos aborts on old shard.
    //

    // Move a chunk in the first collection from Shard1 to Shard0 through the other mongos, so
    // Shard1 and the main mongos are stale for it.
    const otherMongos = st.s1;
    assert.commandWorked(
        otherMongos.adminCommand({moveChunk: ns, find: {_id: 5}, to: st.shard0.shardName}));
    expectChunks(st, ns, [2, 0, 0]);

    session.startTransaction();

    // Targets Shard1, which hits a stale version error, then re-targets Shard0, which is also
    // stale but should succeed.
    assert.commandWorked(sessionDB.runCommand({find: collName, filter: {_id: 5}}));

    // TODO SERVER-36304: Change this to commitTransaction.
    session.abortTransaction();

    // Verify there is no in-progress transaction on Shard1. The find is sent directly to Shard1's
    // primary (bypassing mongos) with the session's lsid and current txnNumber and with
    // autocommit: false; it is expected to fail with NoSuchTransaction and carry only the
    // TransientTransactionError label.
    res = assert.commandFailedWithCode(st.rs1.getPrimary().getDB(dbName).runCommand({
        find: collName,
        lsid: session.getSessionId(),
        txnNumber: NumberLong(session.getTxnNumber_forTesting()),
        autocommit: false,
    }),
                                       ErrorCodes.NoSuchTransaction);
    assert.eq(res.errorLabels, ["TransientTransactionError"]);

    //
    // More than one stale shard version error.
    //

    // Move chunks for the first ns from Shard0 to Shard1 and Shard2 through the main mongos, so
    // both are stale but not the router. The chunk containing {_id: 5} goes to Shard2 first, then
    // the chunk containing {_id: -5} goes to Shard1.
    assert.commandWorked(
        st.s.adminCommand({moveChunk: ns, find: {_id: 5}, to: st.shard2.shardName}));
    expectChunks(st, ns, [1, 0, 1]);

    assert.commandWorked(
        st.s.adminCommand({moveChunk: ns, find: {_id: -5}, to: st.shard1.shardName}));
    expectChunks(st, ns, [0, 1, 1]);

    session.startTransaction();

    // Targets all shards, two of which are stale.
    // NOTE(review): the layout asserted above is [0, 1, 1], so Shard0 owns no chunks of this
    // collection at this point; "all shards" presumably means every chunk-owning shard -- confirm.
    assert.commandWorked(sessionDB.runCommand({find: collName}));

    // TODO SERVER-36304: Change this to commitTransaction.
    session.abortTransaction();

    //
    // Can retry a stale write on the first statement.
    //

    // Move a chunk to Shard1 to make it stale. After this move both chunks of the first
    // collection are on Shard1.
    assert.commandWorked(
        st.s.adminCommand({moveChunk: ns, find: {_id: 5}, to: st.shard1.shardName}));
    expectChunks(st, ns, [0, 2, 0]);

    session.startTransaction();

    // Targets Shard1, which is stale. The insert is the first statement of the transaction and is
    // expected to succeed.
    assert.commandWorked(sessionDB.runCommand({insert: collName, documents: [{_id: 6}]}));

    // TODO SERVER-36304: Change this to commitTransaction.
    // Aborting also discards the {_id: 6} insert made inside the transaction.
    session.abortTransaction();

    //
    // Cannot retry a stale write past the first statement.
    //
    // TODO SERVER-37207: Change batch writes to retry only the failed writes in a batch, to allow
    // retrying writes beyond the first overall statement.
    //

    // Move a chunk to Shard2 to make it stale. The chunk containing {_id: 5} leaves Shard1, so
    // each of Shard1 and Shard2 ends up owning one chunk.
    assert.commandWorked(
        st.s.adminCommand({moveChunk: ns, find: {_id: 5}, to: st.shard2.shardName}));
    expectChunks(st, ns, [0, 1, 1]);

    session.startTransaction();

    // Targets Shard1, which is not stale.
    assert.commandWorked(sessionDB.runCommand({insert: collName, documents: [{_id: -4}]}));

    // Targets Shard2, which is stale. This write is the second statement of the transaction, and
    // it is expected to fail with NoSuchTransaction and the TransientTransactionError label.
    res = assert.commandFailedWithCode(
        sessionDB.runCommand({insert: collName, documents: [{_id: 7}]}),
        ErrorCodes.NoSuchTransaction);
    assert.eq(res.errorLabels, ["TransientTransactionError"]);

    // TODO SERVER-36304: Change this to commitTransaction.
    session.abortTransaction();

    //
    // NoSuchTransaction should be returned if the router exhausts its retries.
    //

    // Move a chunk to Shard0 to make it stale.
    assert.commandWorked(
        st.s.adminCommand({moveChunk: ns, find: {_id: -5}, to: st.shard0.shardName}));
    expectChunks(st, ns, [1, 0, 1]);

    // Disable metadata refreshes on the stale shard so it will indefinitely return a stale version
    // error.
    assert.commandWorked(st.rs0.getPrimary().adminCommand(
        {configureFailPoint: "skipShardFilteringMetadataRefresh", mode: "alwaysOn"}));

    session.startTransaction();

    // Targets Shard0, which is stale and won't refresh its metadata, so mongos should exhaust its
    // retries and implicitly abort the transaction.
    // NOTE(review): unlike the other NoSuchTransaction cases in this file, the response's
    // errorLabels are not checked here -- confirm whether that is intentional.
    assert.commandFailedWithCode(sessionDB.runCommand({find: collName, filter: {_id: -5}}),
                                 ErrorCodes.NoSuchTransaction);

    session.abortTransaction();

    // Turn the fail point back off so Shard0 can refresh its metadata again.
    assert.commandWorked(st.rs0.getPrimary().adminCommand(
        {configureFailPoint: "skipShardFilteringMetadataRefresh", mode: "off"}));

    // Shut down the sharded cluster started for this test.
    st.stop();
})();