summaryrefslogtreecommitdiff
path: root/jstests/core/columnstore/column_scan_skip_row_store_projection.js
blob: b69e515d7efbe406efb81f3047198657802b9380 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
/**
 * Tests that the row store expression is skipped when there is an appropriate group or projection
 * above a columnscan stage.
 *
 * @tags: [
 *   # explain is not supported in transactions
 *   does_not_support_transactions,
 *   requires_pipeline_optimization,
 *   # Runs explain on an aggregate command which is only compatible with readConcern local.
 *   assumes_read_concern_unchanged,
 *   # explain will be different in a sharded collection
 *   assumes_unsharded_collection,
 *   # column store row store expression skipping is new in 6.2.
 *   requires_fcv_62,
 *   # Columnstore tests set server parameters to disable columnstore query planning heuristics -
 *   # 1) server parameters are stored in-memory only so are not transferred onto the recipient,
 *   # 2) server parameters may not be set in stepdown passthroughs because it is a command that may
 *   #      return different values after a failover
 *   tenant_migration_incompatible,
 *   does_not_support_stepdowns,
 *   not_allowed_with_security_token,
 * ]
 */
(function() {
"use strict";

load('jstests/aggregation/extras/utils.js');  // For assertArrayEq.
load("jstests/libs/sbe_util.js");             // For checkSBEEnabled.
// For areAllCollectionsClustered.
load("jstests/libs/clustered_collections/clustered_collection_util.js");
load("jstests/libs/columnstore_util.js");  // For setUpServerForColumnStoreIndexTest.

const columnstoreEnabled = checkSBEEnabled(
    db, ["featureFlagColumnstoreIndexes", "featureFlagSbeFull"], true /* checkAllNodes */);
if (!columnstoreEnabled) {
    jsTestLog("Skipping columnstore index test since the feature flag is not enabled.");
    return;
}

if (!setUpServerForColumnStoreIndexTest(db)) {
    return;
}

const indexedColl = db.column_scan_skip_row_store_projection_indexed;
const unindexedColl = db.column_scan_skip_row_store_projection_unindexed;

function setupCollections() {
    indexedColl.drop();
    unindexedColl.drop();
    assert.commandWorked(indexedColl.createIndex({"$**": "columnstore"}));

    const docs = [
        {_id: "a_number", a: 4},
        {_id: "a_subobject_c_not_null", a: {c: "hi"}},
        {_id: "a_subobject_c_null", a: {c: null}},
        {_id: "a_subobject_c_undefined", a: {c: undefined}},
        {_id: "no_a", b: 1},
        {_id: "a_and_b_nested", a: 2, b: {d: 1}},
        {_id: "a_nested_and_b_nested", a: {c: 5}, b: {d: {f: 2}}, e: 1},
    ];
    assert.commandWorked(indexedColl.insertMany(docs));
    assert.commandWorked(unindexedColl.insertMany(docs));
}

function test({agg, requiresRowStoreExpr, requiredRowstoreReads}) {
    // Check that columnstore index is used, and we skip the row store expression appropriately.
    const explainPlan = indexedColl.explain("queryPlanner").aggregate(agg);
    let sbeStages = ('queryPlanner' in explainPlan)
        // entirely SBE plan
        ? explainPlan.queryPlanner.winningPlan.slotBasedPlan.stages
        // SBE + classic plan
        : explainPlan.stages[0]["$cursor"].queryPlanner.winningPlan.slotBasedPlan.stages;
    assert(sbeStages.includes('columnscan'), `No columnscan in SBE stages: ${sbeStages}`);
    const nullRegex =
        /columnscan s.* ((s.*)|(none)) paths\[.*\] pathFilters\[.*\] rowStoreExpr\[\] @.* @.*/;
    const notNullRegex =
        /columnscan s.* ((s.*)|(none)) paths\[.*\] pathFilters\[.*\] rowStoreExpr\[.*, \n/;
    if (requiresRowStoreExpr) {
        assert(!nullRegex.test(sbeStages), `Don't expect null rowstoreExpr in ${sbeStages}`);
        assert(notNullRegex.test(sbeStages), `Expected non-null rowstoreExpr in ${sbeStages}`);
    } else {
        assert(nullRegex.test(sbeStages), `Expected null rowStoreExpr in ${sbeStages}`);
        assert(!notNullRegex.test(sbeStages), `Don't expect non-null rowStoreExpr in ${sbeStages}`);
    }

    // Check the expected number of row store reads. The reads are triggered by encountering a
    // record that cannot be reconstructed from the index and come in the form of a fetch followed
    // by a few records scanned from the row store. The number of scanned records fluctuates
    // depending on the settings and the data patterns so the only invariant we can assert is that
    // the number of combined reads from the row store is at least as the number of "bad" records.
    const explainExec = indexedColl.explain("executionStats").aggregate(agg);
    const actualRowstoreReads =
        parseInt(JSON.stringify(explainExec).split('"numRowStoreFetches":')[1].split(",")[0]) +
        parseInt(JSON.stringify(explainExec).split('"numRowStoreScans":')[1].split(",")[0]);
    assert.gte(
        actualRowstoreReads,
        requiredRowstoreReads,
        `Unexpected nubmer of row store fetches in ${JSON.stringify(explainExec, null, '\t')}`);

    // Check that results are identical with and without columnstore index.
    assertArrayEq({
        actual: indexedColl.aggregate(agg).toArray(),
        expected: unindexedColl.aggregate(agg).toArray()
    });
}

function runAllAggregations() {
    // $project only.  Requires row store expression regardless of nesting under the projected path.
    test({agg: [{$project: {_id: 0, a: 1}}], requiresRowStoreExpr: true, requiredRowstoreReads: 4});
    test({agg: [{$project: {_id: 0, b: 1}}], requiresRowStoreExpr: true, requiredRowstoreReads: 2});

    // $group only.
    // The 4 cases below provide the same coverage but illustrate when row store fetches are needed.
    test({
        agg: [{$group: {_id: null, a: {$push: "$a"}}}],
        requiresRowStoreExpr: false,
        requiredRowstoreReads: 4
    });
    test({
        agg: [{$group: {_id: null, b: {$push: "$b"}}}],
        requiresRowStoreExpr: false,
        requiredRowstoreReads: 2
    });
    test({
        agg: [{$group: {_id: null, e: {$push: "$e"}}}],
        requiresRowStoreExpr: false,
        requiredRowstoreReads: 0
    });
    test({
        agg: [{$group: {_id: "$_id", a: {$push: "$a"}, b: {$push: "$b"}}}],
        requiresRowStoreExpr: false,
        requiredRowstoreReads: 5
    });

    // $group and $project, including _id.
    test({
        agg: [{$project: {_id: 1, a: 1}}, {$group: {_id: "$_id", a: {$push: "$a"}}}],
        requiresRowStoreExpr: false,
        requiredRowstoreReads: 4
    });

    // The rowStoreExpr is needed to prevent the $group from seeing b.
    test({
        agg: [
            {$project: {_id: 1, a: 1}},
            {$group: {_id: "$_id", a: {$push: "$a"}, b: {$push: "$b"}}}
        ],
        requiresRowStoreExpr: true,
        requiredRowstoreReads: 4
    });

    // Same as above, but add another $group later that would be eligible for skipping the row store
    // expression.
    test({
        agg: [
            {$project: {_id: 1, a: 1}},
            {$group: {_id: "$_id", a: {$push: "$a"}, b: {$push: "$b"}}},
            {$project: {_id: 1, a: 1}},
            {$group: {_id: "$_id", a: {$push: "$a"}}}
        ],
        requiresRowStoreExpr: true,
        requiredRowstoreReads: 4
    });

    // $group and $project, excluding _id.
    // Because _id is projected out, the $group will aggregate all docs together.  The rowStoreExpr
    // must not be skipped or else $group will behave incorrectly.
    test({
        agg: [{$project: {_id: 0, a: 1}}, {$group: {_id: "$_id", a: {$push: "$a"}}}],
        requiresRowStoreExpr: true,
        requiredRowstoreReads: 4
    });

    // $match with a filter that can be pushed down.
    test({
        agg: [{$match: {a: 2}}, {$group: {_id: "$_id", b: {$push: "$b"}, a: {$push: "$a"}}}],
        requiresRowStoreExpr: false,
        requiredRowstoreReads: 1
    });

    // $match with no group, and non-output filter that can't be pushed down.
    test({
        agg: [{$match: {e: {$exists: false}}}, {$project: {_id: 1, b: 1}}],
        requiresRowStoreExpr: false,
        requiredRowstoreReads: 2
    });
    // $match with no group, and non-output filter that can be pushed down.
    test({
        agg: [{$match: {e: {$exists: true}}}, {$project: {_id: 1, b: 1}}],
        requiresRowStoreExpr: true,
        requiredRowstoreReads: 1
    });

    // Nested paths.
    // The BrowserUsageByDistinctUserQuery that motivated this ticket is an example of this.
    test({
        agg: [{$match: {"a.c": 5}}, {$group: {_id: "$_id", b_d: {$push: "$b.d"}}}],
        requiresRowStoreExpr: false,
        requiredRowstoreReads: 1
    });

    // BrowserUsageByDistinctUserQuery from ColumnStoreIndex.yml in the genny repo.
    // $addFields is not implemented in SBE, so this will have an SBE plan + an agg pipeline.
    // This query does not match our documents, but the test checks for row store expression
    // elimination.
    test({
        agg: [
            {"$match": {"metadata.browser": {"$exists": true}}},
            {
                "$addFields":
                    {"browserName": {"$arrayElemAt": [{"$split": ["$metadata.browser", " "]}, 0]}}
            },
            {
                "$match": {
                    "browserName": {"$nin": [null, "", "null"]},
                    "created_at": {"$gte": ISODate("2020-03-10T01:17:41Z")}
                }
            },
            {
                "$group":
                    {"_id": {"__alias_0": "$browserName"}, "__alias_1": {"$addToSet": "$user_id"}}
            },
            {
                "$project":
                    {"_id": 0, "__alias_0": "$_id.__alias_0", "__alias_1": {"$size": "$__alias_1"}}
            },
            {"$project": {"label": "$__alias_0", "value": "$__alias_1", "_id": 0}},
            {"$limit": 5000}
        ],
        requiresRowStoreExpr: false,
        requiredRowstoreReads: 0
    });

    // Cases that may be improved by future work:

    // The limit below creates a Query Solution Node between the column scan and the group.
    // Our optimization is not clever enough to see that the limit QSN is irrelevant.
    test({
        agg: [{$limit: 100}, {$group: {_id: null, a: {$push: "$a"}}}],
        requiresRowStoreExpr: true,  // ideally this would be false
        requiredRowstoreReads: 4
    });

    // $match with a nested path filter than can be pushed down.
    // This fails to even use the column store index.  It should be able to in the future.
    assert.throws(() => {
        test({
            agg: [{$match: {"a.e": 1}}, {$group: {_id: "$_id", a: {$push: "$a"}}}],
            requiresRowStoreExpr: false,
            requiredRowstoreReads: 0
        });
    });
}

setupCollections();
runAllAggregations();
}());