summaryrefslogtreecommitdiff
path: root/jstests/aggregation/shard_targeting.js
blob: c78bb93a29a8f046efa1c98b989f7794d9e4c7c2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
/**
 * Test that aggregations are sent directly to a single shard in the case where the data required by
 * the pipeline's initial query all resides on that shard, and that we correctly back out and
 * re-target in the event that a stale config exception is received.
 *
 * In particular:
 *
 * - If the data required by the aggregation all resides on a single shard (including multi-chunk
 * range $matches), send the entire pipeline to that shard and do not perform a $mergeCursors.
 * - In the case of a stage which requires a primary shard merge, do not split the pipeline or
 * generate a $mergeCursors if the data required by the aggregation all resides on the primary
 * shard.
 * - In the event that a stale config exception is encountered:
 *     - If the pipeline is split but we now only need to target a single shard, coalesce the split
 *       pipeline and dispatch it to that shard.
 *     - If the pipeline is not split but we must now target more than one shard, split it and
 *       redispatch.
 *
 * Because wrapping these aggregations in a $facet stage will affect how the pipeline is targeted,
 * and will therefore invalidate the results of the test cases below, we tag this test to prevent it
 * running under the 'aggregation_facet_unwind' passthrough.
 *
 * @tags: [do_not_wrap_aggregations_in_facets]
 */
(function() {
    load("jstests/libs/profiler.js");  // For profilerHas*OrThrow helper functions.

    const st = new ShardingTest({shards: 2, mongos: 2, config: 1});

    // mongosForAgg will be used to perform all aggregations.
    // mongosForMove does all chunk migrations, leaving mongosForAgg with stale config metadata.
    const mongosForAgg = st.s0;
    const mongosForMove = st.s1;

    const mongosDB = mongosForAgg.getDB(jsTestName());
    const mongosColl = mongosDB.test;

    const shard0DB = primaryShardDB = st.shard0.getDB(jsTestName());
    const shard1DB = st.shard1.getDB(jsTestName());

    // Turn off best-effort recipient metadata refresh post-migration commit on both shards because
    // it creates non-determinism for the profiler.
    assert.commandWorked(st.shard0.getDB('admin').runCommand(
        {configureFailPoint: 'doNotRefreshRecipientAfterCommit', mode: 'alwaysOn'}));
    assert.commandWorked(st.shard1.getDB('admin').runCommand(
        {configureFailPoint: 'doNotRefreshRecipientAfterCommit', mode: 'alwaysOn'}));

    assert.commandWorked(mongosDB.dropDatabase());

    // Enable sharding on the test DB and ensure its primary is shard0000.
    assert.commandWorked(mongosDB.adminCommand({enableSharding: mongosDB.getName()}));
    st.ensurePrimaryShard(mongosDB.getName(), "shard0000");

    // Shard the test collection on _id.
    assert.commandWorked(
        mongosDB.adminCommand({shardCollection: mongosColl.getFullName(), key: {_id: 1}}));

    // Split the collection into 4 chunks: [MinKey, -100), [-100, 0), [0, 100), [100, MaxKey).
    assert.commandWorked(
        mongosDB.adminCommand({split: mongosColl.getFullName(), middle: {_id: -100}}));
    assert.commandWorked(
        mongosDB.adminCommand({split: mongosColl.getFullName(), middle: {_id: 0}}));
    assert.commandWorked(
        mongosDB.adminCommand({split: mongosColl.getFullName(), middle: {_id: 100}}));

    // Move the [0, 100) and [100, MaxKey) chunks to shard0001.
    assert.commandWorked(mongosDB.adminCommand(
        {moveChunk: mongosColl.getFullName(), find: {_id: 50}, to: "shard0001"}));
    assert.commandWorked(mongosDB.adminCommand(
        {moveChunk: mongosColl.getFullName(), find: {_id: 150}, to: "shard0001"}));

    // Write one document into each of the chunks.
    assert.writeOK(mongosColl.insert({_id: -150}));
    assert.writeOK(mongosColl.insert({_id: -50}));
    assert.writeOK(mongosColl.insert({_id: 50}));
    assert.writeOK(mongosColl.insert({_id: 150}));

    const shardExceptions =
        [ErrorCodes.StaleConfig, ErrorCodes.StaleShardVersion, ErrorCodes.StaleEpoch];

    // Create an $_internalSplitPipeline stage that forces the merge to occur on the Primary shard.
    const forcePrimaryMerge = [{$_internalSplitPipeline: {mergeType: "primaryShard"}}];

    /**
     * Runs the full suite of shard-targeting test cases, appending 'splitPoint' to each test
     * pipeline after its initial $match. Each test case asserts, via the shards' profilers, where
     * the pipeline (and any $mergeCursors) was dispatched. Resets the profilers and drops the $out
     * collection on completion so the function can be called repeatedly with different split
     * points.
     *
     * @param {Object[]} splitPoint - pipeline stages inserted between the leading $match and any
     *     trailing stages; used to vary where the pipeline splits between shard and merge halves.
     */
    function runAggShardTargetTest({splitPoint}) {
        // Ensure that both mongoS have up-to-date caches, and enable the profiler on both shards.
        assert.commandWorked(mongosForAgg.getDB("admin").runCommand({flushRouterConfig: 1}));
        assert.commandWorked(mongosForMove.getDB("admin").runCommand({flushRouterConfig: 1}));

        assert.commandWorked(shard0DB.setProfilingLevel(2));
        assert.commandWorked(shard1DB.setProfilingLevel(2));

        //
        // Test cases.
        //

        // 'testName' doubles as the aggregation's comment so profiler entries can be matched to
        // the specific test case that generated them.
        let testName, outColl;

        // Test that a range query is passed through if the chunks encompassed by the query all lie
        // on a single shard, in this case shard0000.
        testName = "agg_shard_targeting_range_single_shard_all_chunks_on_same_shard";
        assert.eq(mongosColl
                      .aggregate([{$match: {_id: {$gte: -150, $lte: -50}}}].concat(splitPoint),
                                 {comment: testName})
                      .itcount(),
                  2);

        // We expect one aggregation on shard0, none on shard1, and no $mergeCursors on shard0 (the
        // primary shard).
        profilerHasSingleMatchingEntryOrThrow({
            profileDB: shard0DB,
            filter: {"command.aggregate": mongosColl.getName(), "command.comment": testName}
        });
        profilerHasZeroMatchingEntriesOrThrow({
            profileDB: shard1DB,
            filter: {"command.aggregate": mongosColl.getName(), "command.comment": testName}
        });
        profilerHasZeroMatchingEntriesOrThrow({
            profileDB: primaryShardDB,
            filter: {
                "command.aggregate": mongosColl.getName(),
                "command.comment": testName,
                "command.pipeline.$mergeCursors": {$exists: 1}
            }
        });

        // Test that a range query with a stage that requires a primary shard merge ($out in this
        // case) is passed through if the chunk ranges encompassed by the query all lie on the
        // primary shard.
        testName = "agg_shard_targeting_range_all_chunks_on_primary_shard_out_no_merge";
        outColl = mongosDB[testName];

        assert.commandWorked(mongosDB.runCommand({
            aggregate: mongosColl.getName(),
            pipeline: [{$match: {_id: {$gte: -150, $lte: -50}}}].concat(splitPoint).concat([
                {$out: testName}
            ]),
            comment: testName,
            cursor: {}
        }));

        // We expect one aggregation on shard0, none on shard1, and no $mergeCursors on shard0 (the
        // primary shard).
        profilerHasSingleMatchingEntryOrThrow({
            profileDB: shard0DB,
            filter: {"command.aggregate": mongosColl.getName(), "command.comment": testName}
        });
        profilerHasZeroMatchingEntriesOrThrow({
            profileDB: shard1DB,
            filter: {"command.aggregate": mongosColl.getName(), "command.comment": testName}
        });
        profilerHasZeroMatchingEntriesOrThrow({
            profileDB: primaryShardDB,
            filter: {
                "command.aggregate": mongosColl.getName(),
                "command.comment": testName,
                "command.pipeline.$mergeCursors": {$exists: 1}
            }
        });

        // Verify that the contents of the $out collection are as expected.
        assert.eq(outColl.find().sort({_id: 1}).toArray(), [{_id: -150}, {_id: -50}]);

        // Test that a passthrough will back out and split the pipeline if we try to target a single
        // shard, get a stale config exception, and find that more than one shard is now involved.
        // Move the _id: [-100, 0) chunk from shard0000 to shard0001 via mongosForMove.
        assert.commandWorked(mongosForMove.getDB("admin").runCommand(
            {moveChunk: mongosColl.getFullName(), find: {_id: -50}, to: "shard0001"}));

        // Run the same aggregation that targeted a single shard via the now-stale mongoS. It should
        // attempt to send the aggregation to shard0000, hit a stale config exception, split the
        // pipeline and redispatch. We append an $_internalSplitPipeline stage in order to force a
        // shard merge rather than a mongoS merge.
        testName = "agg_shard_targeting_backout_passthrough_and_split_if_cache_is_stale";
        assert.eq(mongosColl
                      .aggregate([{$match: {_id: {$gte: -150, $lte: -50}}}]
                                     .concat(splitPoint)
                                     .concat(forcePrimaryMerge),
                                 {comment: testName})
                      .itcount(),
                  2);

        // Before the first dispatch:
        // - mongosForMove and shard0000 (the donor shard) are up to date.
        // - mongosForAgg and shard0001 are stale. mongosForAgg incorrectly believes that the
        // necessary data is all on shard0000.
        // We therefore expect that:
        // - mongosForAgg will throw a stale config error when it attempts to establish a
        // single-shard cursor on shard0000 (attempt 1).
        // - mongosForAgg will back out, refresh itself, split the pipeline and redispatch to both
        // shards.
        // - shard0001 will throw a stale config and refresh itself when the split pipeline is sent
        // to it (attempt 2).
        // - mongosForAgg will back out, retain the split pipeline and redispatch (attempt 3).
        // - The aggregation will succeed on the third dispatch.

        // We confirm this behaviour via the following profiler results:

        // - One aggregation on shard0000 with a shard version exception (indicating that the mongoS
        // was stale).
        profilerHasSingleMatchingEntryOrThrow({
            profileDB: shard0DB,
            filter: {
                "command.aggregate": mongosColl.getName(),
                "command.comment": testName,
                "command.pipeline.$mergeCursors": {$exists: false},
                exceptionCode: {$in: shardExceptions}
            }
        });

        // - One aggregation on shard0001 with a shard version exception (indicating that the shard
        // was stale).
        profilerHasSingleMatchingEntryOrThrow({
            profileDB: shard1DB,
            filter: {
                "command.aggregate": mongosColl.getName(),
                "command.comment": testName,
                "command.pipeline.$mergeCursors": {$exists: false},
                exceptionCode: {$in: shardExceptions}
            }
        });

        // - At most two aggregations on shard0000 with no stale config exceptions. The first, if
        // present, is an aborted cursor created if the command reaches shard0000 before shard0001
        // throws its stale config exception during attempt 2. The second profiler entry is from the
        // aggregation which succeeded.
        profilerHasAtLeastOneAtMostNumMatchingEntriesOrThrow({
            profileDB: shard0DB,
            filter: {
                "command.aggregate": mongosColl.getName(),
                "command.comment": testName,
                "command.pipeline.$mergeCursors": {$exists: false},
                exceptionCode: {$exists: false}
            },
            maxExpectedMatches: 2
        });

        // - One aggregation on shard0001 with no stale config exception.
        profilerHasSingleMatchingEntryOrThrow({
            profileDB: shard1DB,
            filter: {
                "command.aggregate": mongosColl.getName(),
                "command.comment": testName,
                "command.pipeline.$mergeCursors": {$exists: false},
                exceptionCode: {$exists: false}
            }
        });

        // - One $mergeCursors aggregation on primary shard0000, since we eventually target both
        // shards after backing out the passthrough and splitting the pipeline.
        profilerHasSingleMatchingEntryOrThrow({
            profileDB: primaryShardDB,
            filter: {
                "command.aggregate": mongosColl.getName(),
                "command.comment": testName,
                "command.pipeline.$mergeCursors": {$exists: true}
            }
        });

        // Test that a split pipeline will back out and reassemble the pipeline if we target
        // multiple shards, get a stale config exception, and find that we can now target a single
        // shard.
        // Move the _id: [-100, 0) chunk back from shard0001 to shard0000 via mongosForMove.
        assert.commandWorked(mongosForMove.getDB("admin").runCommand(
            {moveChunk: mongosColl.getFullName(), find: {_id: -50}, to: "shard0000"}));

        // Run the same aggregation via the now-stale mongoS. It should split the pipeline, hit a
        // stale config exception, and reset to the original single-shard pipeline upon refresh. We
        // append an $_internalSplitPipeline stage in order to force a shard merge rather than a
        // mongoS merge.
        testName = "agg_shard_targeting_backout_split_pipeline_and_reassemble_if_cache_is_stale";
        assert.eq(mongosColl
                      .aggregate([{$match: {_id: {$gte: -150, $lte: -50}}}]
                                     .concat(splitPoint)
                                     .concat(forcePrimaryMerge),
                                 {comment: testName})
                      .itcount(),
                  2);

        // Before the first dispatch:
        // - mongosForMove and shard0001 (the donor shard) are up to date.
        // - mongosForAgg and shard0000 are stale. mongosForAgg incorrectly believes that the
        // necessary data is spread across both shards.
        // We therefore expect that:
        // - mongosForAgg will throw a stale config error when it attempts to establish a cursor on
        // shard0001 (attempt 1).
        // - mongosForAgg will back out, refresh itself, coalesce the split pipeline into a single
        // pipeline and redispatch to shard0000.
        // - shard0000 will throw a stale config and refresh itself when the pipeline is sent to it
        // (attempt 2).
        // - mongosForAgg will back out, retain the single-shard pipeline and redispatch (attempt
        // 3).
        // - The aggregation will succeed on the third dispatch.

        // We confirm this behaviour via the following profiler results:

        // - One aggregation on shard0001 with a shard version exception (indicating that the mongoS
        // was stale).
        profilerHasSingleMatchingEntryOrThrow({
            profileDB: shard1DB,
            filter: {
                "command.aggregate": mongosColl.getName(),
                "command.comment": testName,
                "command.pipeline.$mergeCursors": {$exists: false},
                exceptionCode: {$in: shardExceptions}
            }
        });

        // - One aggregation on shard0000 with a shard version exception (indicating that the shard
        // was stale).
        profilerHasSingleMatchingEntryOrThrow({
            profileDB: shard0DB,
            filter: {
                "command.aggregate": mongosColl.getName(),
                "command.comment": testName,
                "command.pipeline.$mergeCursors": {$exists: false},
                exceptionCode: {$in: shardExceptions}
            }
        });

        // - At most two aggregations on shard0000 with no stale config exceptions. The first, if
        // present, is an aborted cursor created if the command reaches shard0000 before shard0001
        // throws its stale config exception during attempt 1. The second profiler entry is the
        // aggregation which succeeded.
        profilerHasAtLeastOneAtMostNumMatchingEntriesOrThrow({
            profileDB: shard0DB,
            filter: {
                "command.aggregate": mongosColl.getName(),
                "command.comment": testName,
                "command.pipeline.$mergeCursors": {$exists: false},
                exceptionCode: {$exists: false}
            },
            maxExpectedMatches: 2
        });

        // No $mergeCursors aggregation on primary shard0000, since after backing out the split
        // pipeline we eventually target only shard0000.
        profilerHasZeroMatchingEntriesOrThrow({
            profileDB: primaryShardDB,
            filter: {
                "command.aggregate": mongosColl.getName(),
                "command.comment": testName,
                "command.pipeline.$mergeCursors": {$exists: true}
            }
        });

        // Clean up the test run by dropping the $out collection and resetting the profiler.
        assert(outColl.drop());

        assert.commandWorked(shard0DB.setProfilingLevel(0));
        assert.commandWorked(shard1DB.setProfilingLevel(0));

        assert(shard0DB.system.profile.drop());
        assert(shard1DB.system.profile.drop());
    }

    // Run tests with a variety of splitpoints, testing the pipeline split and re-assembly logic in
    // cases where the merge pipeline is empty, where the split stage is moved from shard to merge
    // pipe ($facet, $lookup), and where there are both shard and merge versions of the split source
    // ($sort, $group, $limit). Each test case will ultimately produce the same output.
    runAggShardTargetTest({splitPoint: []});
    runAggShardTargetTest({splitPoint: [{$sort: {_id: 1}}]});
    runAggShardTargetTest({splitPoint: [{$group: {_id: "$_id"}}]});
    runAggShardTargetTest({splitPoint: [{$limit: 4}]});
    runAggShardTargetTest({
        splitPoint: [
            {$facet: {facetPipe: [{$match: {_id: {$gt: MinKey}}}]}},
            {$unwind: "$facetPipe"},
            {$replaceRoot: {newRoot: "$facetPipe"}}
        ]
    });
    runAggShardTargetTest({
        splitPoint: [
            {
              $lookup: {
                  from: "dummycoll",
                  localField: "dummyfield",
                  foreignField: "dummyfield",
                  as: "lookupRes"
              }
            },
            {$project: {lookupRes: 0}}
        ]
    });

    st.stop();
})();