path: root/jstests/cqf/ce_histogram.js
/**
 * This is an integration test for histogram CE to ensure that we can create a histogram and
 * retrieve that histogram to estimate a simple match predicate. Note that this tests predicates and
 * histograms on several scalar types.
 *
 * This test is designed such that the constructed histograms should always give an exact
 * answer for a given equality predicate on any distinct value produced by 'generateDocs' below.
 * This is the case because the 'ndv' is sufficiently small such that each histogram will have
 * one bucket per distinct value with that distinct value as a bucket boundary. This should never
 * change as a result of updates to estimation, since estimates for bucket boundaries should always
 * be accurate.
 */
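
// For illustration, a sketch of the histogram this test expects for the "int" field when ndv = 3
// (mirroring the 'expectedHistograms' construction in verifyCEForNDV below):
//   {
//     _id: "int",
//     statistics: {
//       documents: 6,  // 1 + 2 + 3 documents inserted in total.
//       scalarHistogram: {
//         buckets: [
//           {boundaryCount: 1, rangeCount: 0, cumulativeCount: 1, rangeDistincts: 0, cumulativeDistincts: 1},
//           {boundaryCount: 2, rangeCount: 0, cumulativeCount: 3, rangeDistincts: 0, cumulativeDistincts: 2},
//           {boundaryCount: 3, rangeCount: 0, cumulativeCount: 6, rangeDistincts: 0, cumulativeDistincts: 3}
//         ],
//         bounds: [1, 2, 3]
//       }
//     }
//   }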
(function() {
"use strict";

load('jstests/aggregation/extras/utils.js');  // For assertArrayEq.
load("jstests/libs/optimizer_utils.js");      // For checkCascadesOptimizerEnabled.
load("jstests/libs/sbe_util.js");             // For checkSBEEnabled.

const fields = ["int", "dbl", "str", "date"];
const tolerance = 0.01;

let _id;

// Helper functions.

/**
 * Generates 'val' identical documents (apart from '_id'), each populating every field in 'fields'
 * with a value derived from 'val', so that different values of 'val' produce distinct field
 * values.
 */
function generateDocs(val) {
    let docs = [];
    const fields = {
        int: val,
        dbl: val + 0.1,
        // Note: string bounds sort lexicographically, which does not necessarily match the order
        // of the int bounds. To keep the histogram-validation logic simple, we avoid the format
        // `string_${val}`: with that format the string generated for 'val' = 10 would sort as the
        // second entry in the histogram bounds array, even though it is the tenth distinct value.
        // Using a character offset keeps the string order aligned with the int order.
        str: `string_${String.fromCharCode(64 + val)}`,
        date: new Date(`02 December ${val + 2000}`)
    };
    for (let i = 0; i < val; i++) {
        docs.push(Object.assign({_id}, fields));
        _id += 1;
    }
    return docs;
}
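
// For example (a sketch, assuming the module-level '_id' starts at 0), generateDocs(2) produces
// two documents that are identical apart from '_id':
//   {_id: 0, int: 2, dbl: 2.1, str: "string_B", date: new Date("02 December 2002")}
//   {_id: 1, int: 2, dbl: 2.1, str: "string_B", date: new Date("02 December 2002")}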

/**
 * Retrieves the cardinality estimate from a node in explain.
 */
function extractCEFromExplain(node) {
    const ce = node.properties.adjustedCE;
    assert.neq(ce, null);
    return ce;
}

/**
 * Extracts the cardinality estimate for the given $match predicate, assuming we get an index scan
 * plan.
 */
function getIxscanCEForMatch(coll, predicate, hint) {
    // We expect the plan to include a BinaryJoin whose left child is an IxScan; the logical
    // representation of that scan should have been estimated via histograms.
    const explain = coll.explain().aggregate([{$match: predicate}], {hint});
    const ixScan = leftmostLeafStage(explain);
    assert.neq(ixScan, null);
    assert.eq(ixScan.nodeType, "IndexScan");
    return extractCEFromExplain(ixScan);
}
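
// The fragment of the explain output inspected above looks roughly like this (a sketch showing
// only the fields this test reads):
//   {..., nodeType: "IndexScan", properties: {adjustedCE: <cardinality estimate>, ...}, ...}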

/**
 * Asserts that expected and actual are equal, within a small tolerance.
 */
function assertApproxEq(expected, actual, msg) {
    assert(Math.abs(expected - actual) < tolerance, msg);
}

/**
 * Validates that the results and cardinality estimate for a given $match predicate agree.
 */
function verifyCEForMatch({coll, predicate, expected, hint}) {
    print("ACB", tojson(predicate));
    const actual = coll.aggregate([{$match: predicate}], {hint}).toArray();
    assertArrayEq({actual, expected});

    const expectedCE = expected.length;
    const actualCE = getIxscanCEForMatch(coll, predicate, hint);
    assertApproxEq(
        expectedCE,
        actualCE,
        `${tojson(predicate)} returned ${expectedCE} documents, estimated ${actualCE} docs.`);
}

/**
 * This is the main testing function. Note that the input value 'ndv' corresponds both to the
 * number of distinct values per type in 'fields' and to the number of buckets in each histogram
 * produced for this test.
 */
function verifyCEForNDV(ndv) {
    /**
     * For this test we create one collection per field, each with an index on that field. We use
     * a new collection name for each 'ndv' and for each field because of two problems.
     *  1. SERVER-70855: Re-running the analyze command does update the statistics collection
     * correctly; however, CollectionStatistics caches a stale histogram generated for the previous
     * 'ndv'.
     *  2. SERVER-70856: We also can't currently have multiple histograms on a collection because
     * there is no logic to correctly filter on field name, which means we will always retrieve the
     * first histogram generated for the collection (regardless of which field we care about), even
     * though we have correct histograms in the system collection for all fields.
     *
     * TODO: rewrite this test to reuse the same collection once 1) & 2) are addressed.
     */
    for (const field of fields) {
        // We can't use forceBonsai here because the new optimizer doesn't know how to handle the
        // analyze command.
        assert.commandWorked(
            db.adminCommand({setParameter: 1, internalQueryFrameworkControl: "tryBonsai"}));

        const collName = `ce_histogram_${field}_${ndv}`;
        const coll = db[collName];
        coll.drop();
        assert.commandWorked(coll.createIndex({[field]: 1}));

        const expectedHistograms = [];
        expectedHistograms.push(
            {_id: field, statistics: {documents: 0, scalarHistogram: {buckets: [], bounds: []}}});

        // Set up test collection and initialize the expected histograms in order to validate basic
        // histogram construction. We generate 'ndv' distinct values for each 'field', such that the
        // 'i'th distinct value has a frequency of 'i'. Because we have a small number of distinct
        // values, we expect to have one bucket per distinct value.
        _id = 0;
        let cumulativeCount = 0;
        let allDocs = [];
        for (let val = 1; val <= ndv; val++) {
            const docs = generateDocs(val);
            assert.commandWorked(coll.insertMany(docs));
            cumulativeCount += docs.length;
            for (const expectedHistogram of expectedHistograms) {
                const field = expectedHistogram._id;
                const {statistics} = expectedHistogram;
                statistics.documents = cumulativeCount;
                statistics.scalarHistogram.buckets.push({
                    boundaryCount: val,
                    rangeCount: 0,
                    cumulativeCount,
                    rangeDistincts: 0,
                    cumulativeDistincts: val
                });
                statistics.scalarHistogram.bounds.push(docs[0][field]);
            }
            allDocs = allDocs.concat(docs);
        }

        // Set up histogram for test collection.
        const stats = db.system.statistics[collName];
        stats.drop();

        const res = db.runCommand({analyze: collName, key: field});
        assert.commandWorked(res);

        // Validate histograms.
        const actualHistograms = stats.aggregate().toArray();
        assertArrayEq({actual: actualHistograms, expected: expectedHistograms});

        // We need to set the CE query knob to use histograms and force the use of the new optimizer
        // to ensure that we use histograms to estimate CE here.
        assert.commandWorked(
            db.adminCommand({setParameter: 1, internalQueryCardinalityEstimatorMode: "histogram"}));
        assert.commandWorked(
            db.adminCommand({setParameter: 1, internalQueryFrameworkControl: "forceBonsai"}));

        // Verify CE for all distinct values of each field across multiple types.
        let count = 0;
        const hint = {[field]: 1};
        for (let val = 1; val <= ndv; val++) {
            // Compute the documents expected to match each range predicate whose boundary is the
            // value generated for this 'val'.
            const docsEq = allDocs.slice(count, count + val);
            const docsLt = allDocs.slice(0, count);
            const docsLte = allDocs.slice(0, count + val);
            const docsGt = allDocs.slice(count + val);
            const docsGte = allDocs.slice(count);
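            // For example, with ndv = 3 and val = 2 we have count = 1 here, so: docsEq holds the
            // 2 documents generated for val = 2, docsLt the single document for val = 1, docsLte
            // the first 3 documents, docsGt the 3 documents for val = 3, and docsGte everything
            // except the first document.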

            for (const documentField of fields) {
                const fieldVal = docsEq[0][documentField];

                if (field == documentField) {
                    verifyCEForMatch(
                        {coll, predicate: {[field]: fieldVal}, hint, expected: docsEq});
                    verifyCEForMatch(
                        {coll, predicate: {[field]: {$lt: fieldVal}}, hint, expected: docsLt});
                    verifyCEForMatch(
                        {coll, predicate: {[field]: {$lte: fieldVal}}, hint, expected: docsLte});
                    verifyCEForMatch(
                        {coll, predicate: {[field]: {$gt: fieldVal}}, hint, expected: docsGt});
                    verifyCEForMatch(
                        {coll, predicate: {[field]: {$gte: fieldVal}}, hint, expected: docsGte});

                } else if (field == "int" && documentField == "dbl") {
                    // Each distinct double value corresponds to an int value + 0.1, so we shouldn't
                    // get any equality matches.
                    verifyCEForMatch({coll, predicate: {[field]: fieldVal}, hint, expected: []});

                    // When we have a predicate ~ < val + 0.1 or <= val + 0.1, it should match all
                    // integers <= val.
                    verifyCEForMatch(
                        {coll, predicate: {[field]: {$lt: fieldVal}}, hint, expected: docsLte});
                    verifyCEForMatch(
                        {coll, predicate: {[field]: {$lte: fieldVal}}, hint, expected: docsLte});

                    // When we have a predicate ~ > val + 0.1 or >= val + 0.1, it should match all
                    // integers > val.
                    verifyCEForMatch(
                        {coll, predicate: {[field]: {$gt: fieldVal}}, hint, expected: docsGt});
                    verifyCEForMatch(
                        {coll, predicate: {[field]: {$gte: fieldVal}}, hint, expected: docsGt});

                } else if (field == "dbl" && documentField == "int") {
                    // Each distinct double value corresponds to an int value + 0.1, so we shouldn't
                    // get any equality matches.
                    verifyCEForMatch({coll, predicate: {[field]: fieldVal}, hint, expected: []});

                    // When we have a predicate ~ < val - 0.1 or <= val - 0.1, it should match all
                    // doubles < val.
                    verifyCEForMatch(
                        {coll, predicate: {[field]: {$lt: fieldVal}}, hint, expected: docsLt});
                    verifyCEForMatch(
                        {coll, predicate: {[field]: {$lte: fieldVal}}, hint, expected: docsLt});

                    // When we have a predicate ~ > val - 0.1 or >= val - 0.1, it should match all
                    // doubles >= val.
                    verifyCEForMatch(
                        {coll, predicate: {[field]: {$gt: fieldVal}}, hint, expected: docsGte});
                    verifyCEForMatch(
                        {coll, predicate: {[field]: {$gte: fieldVal}}, hint, expected: docsGte});

                } else {
                    // Verify that we obtain a CE of 0 for types other than the 'field' type when at
                    // least one type is not numeric.
                    const expected = [];
                    verifyCEForMatch({coll, predicate: {[field]: fieldVal}, hint, expected});
                    verifyCEForMatch({coll, predicate: {[field]: {$lt: fieldVal}}, hint, expected});
                    verifyCEForMatch(
                        {coll, predicate: {[field]: {$lte: fieldVal}}, hint, expected});
                    verifyCEForMatch({coll, predicate: {[field]: {$gt: fieldVal}}, hint, expected});
                    verifyCEForMatch(
                        {coll, predicate: {[field]: {$gte: fieldVal}}, hint, expected});
                }
            }

            count += val;
        }

        // Verify CE for values outside the range of distinct values for each field.
        const docLow = {int: 0, dbl: 0.0, str: "string_0", date: new Date("02 December 2000")};
        const docHigh = generateDocs(ndv + 1)[0];
        const expected = [];
        verifyCEForMatch({coll, predicate: {[field]: docLow[field]}, hint, expected});
        verifyCEForMatch({coll, predicate: {[field]: docHigh[field]}, hint, expected});
    }
}

// Run test.

if (!checkCascadesOptimizerEnabled(db)) {
    jsTestLog("Skipping test because the optimizer is not enabled");
    return;
}

if (checkSBEEnabled(db, ["featureFlagSbeFull"], true)) {
    jsTestLog("Skipping the test because it doesn't work in Full SBE");
    return;
}

verifyCEForNDV(1);
verifyCEForNDV(2);
verifyCEForNDV(3);
verifyCEForNDV(10);
}());