/** * This is an integration test for histogram CE to ensure that we can create a histogram and * retrieve that histogram to estimate a simple match predicate. Note that this tests predicates and * histograms on several scalar types. * * This test is designed such that the constructed histograms should always give an exact * answer for a given equality predicate on any distinct value produced by 'generateDocs' below. * This is the case because the 'ndv' is sufficiently small such that each histogram will have * one bucket per distinct value with that distinct value as a bucket boundary. This should never * change as a result of updates to estimation, since estimates for bucket boundaries should always * be accurate. */ (function() { "use strict"; load('jstests/aggregation/extras/utils.js'); // For assertArrayEq. load("jstests/libs/optimizer_utils.js"); // For checkCascadesOptimizerEnabled. load("jstests/libs/sbe_util.js"); // For checkSBEEnabled. const fields = ["int", "dbl", "str", "date"]; const tolerance = 0.01; let _id; // Helper functions. /** * Generates 'val' documents where each document has a distinct value for each 'field' in 'fields'. */ function generateDocs(val) { let docs = []; const fields = { int: val, dbl: val + 0.1, // A small note: the ordering of string bounds (lexicographical) is different than that of // int bounds. In order to simplify the histogram validation logic, we don't want to have to // account for the fact that string bounds will be sorted differently than int bounds. To // illustrate this, if we were to use the format `string_${val}`, the string corresponding // to value 10 would be the second entry in the histogram bounds array, even though it would // be generated for 'val' = 10, not 'val' = 2. str: `string_${String.fromCharCode(64 + val)}`, date: new Date(`02 December ${val + 2000}`) }; for (let i = 0; i < val; i++) { docs.push(Object.assign({_id}, fields)); _id += 1; } return docs; } /** * Retrieves the cardinality estimate from a node in explain. */ function extractCEFromExplain(node) { const ce = node.properties.adjustedCE; assert.neq(ce, null); return ce; } /** * Extracts the cardinality estimate for the given $match predicate, assuming we get an index scan * plan. */ function getIxscanCEForMatch(coll, predicate, hint) { // We expect the plan to include a BinaryJoin whose left child is an IxScan whose logical // representation was estimated via histograms. const explain = coll.explain().aggregate([{$match: predicate}], {hint}); const ixScan = leftmostLeafStage(explain); assert.neq(ixScan, null); assert.eq(ixScan.nodeType, "IndexScan"); return extractCEFromExplain(ixScan); } /** * Asserts that expected and actual are equal, within a small tolerance. */ function assertApproxEq(expected, actual, msg) { assert(Math.abs(expected - actual) < tolerance, msg); } /** * Validates that the results and cardinality estimate for a given $match predicate agree. */ function verifyCEForMatch({coll, predicate, expected, hint}) { print("ACB", tojson(predicate)); const actual = coll.aggregate([{$match: predicate}], {hint}).toArray(); assertArrayEq({actual, expected}); const expectedCE = expected.length; const actualCE = getIxscanCEForMatch(coll, predicate, hint); assertApproxEq( actualCE, expectedCE, `${tojson(predicate)} returned ${expectedCE} documents, estimated ${actualCE} docs.`); } /** * This is the main testing function. Note that the input value 'ndv' corresponds to both the number * of distinct values per type in 'fields', as well as the number of buckets in each histogram * produced for this test. */ function verifyCEForNDV(ndv) { /** * For this test we create one collection and with an index for each field. We use a new * collection name for each ndv and for each field because of two problems. * 1. SERVER-70855: Re-running the analyze command does update the statistics collection * correctly; however, CollectionStatistics caches a stale histogram generated for the previous * 'ndv'. * 2. SERVER-70856: We also can't currently have multiple histograms on a collection because * there is no logic to correctly filter on field name, which means we will always retrieve the * first histogram generated for the collection (regardless of which field we care about), even * though we have correct histograms in the system collection for all fields. * * TODO: rewrite this test to reuse the same collection once 1) & 2) are addressed. */ for (const field of fields) { // We can't use forceBonsai here because the new optimizer doesn't know how to handle the // analyze command. assert.commandWorked( db.adminCommand({setParameter: 1, internalQueryFrameworkControl: "tryBonsai"})); const collName = `ce_histogram_${field}_${ndv}`; const coll = db[collName]; coll.drop(); assert.commandWorked(coll.createIndex({[field]: 1})); const expectedHistograms = []; expectedHistograms.push( {_id: field, statistics: {documents: 0, scalarHistogram: {buckets: [], bounds: []}}}); // Set up test collection and initialize the expected histograms in order to validate basic // histogram construction. We generate 'ndv' distinct values for each 'field', such that the // 'i'th distinct value has a frequency of 'i'. Because we have a small number of distinct // values, we expect to have one bucket per distinct value. _id = 0; let cumulativeCount = 0; let allDocs = []; for (let val = 1; val <= ndv; val++) { const docs = generateDocs(val); assert.commandWorked(coll.insertMany(docs)); cumulativeCount += docs.length; for (const expectedHistogram of expectedHistograms) { const field = expectedHistogram._id; const {statistics} = expectedHistogram; statistics.documents = cumulativeCount; statistics.scalarHistogram.buckets.push({ boundaryCount: val, rangeCount: 0, cumulativeCount, rangeDistincts: 0, cumulativeDistincts: val }); statistics.scalarHistogram.bounds.push(docs[0][field]); } allDocs = allDocs.concat(docs); } // Set up histogram for test collection. const stats = db.system.statistics[collName]; stats.drop(); const res = db.runCommand({analyze: collName, key: field}); assert.commandWorked(res); // Validate histograms. const actualHistograms = stats.aggregate().toArray(); assertArrayEq({actual: actualHistograms, expected: expectedHistograms}); // We need to set the CE query knob to use histograms and force the use of the new optimizer // to ensure that we use histograms to estimate CE here. assert.commandWorked( db.adminCommand({setParameter: 1, internalQueryCardinalityEstimatorMode: "histogram"})); assert.commandWorked( db.adminCommand({setParameter: 1, internalQueryFrameworkControl: "forceBonsai"})); // Verify CE for all distinct values of each field across multiple types. let count = 0; const hint = {[field]: 1}; for (let val = 1; val <= ndv; val++) { // Compute the expected documents selected for a single range using the boundary val. const docsEq = allDocs.slice(count, count + val); const docsLt = allDocs.slice(0, count); const docsLte = allDocs.slice(0, count + val); const docsGt = allDocs.slice(count + val); const docsGte = allDocs.slice(count); for (const documentField of fields) { const fieldVal = docsEq[0][documentField]; if (field == documentField) { verifyCEForMatch( {coll, predicate: {[field]: fieldVal}, hint, expected: docsEq}); verifyCEForMatch( {coll, predicate: {[field]: {$lt: fieldVal}}, hint, expected: docsLt}); verifyCEForMatch( {coll, predicate: {[field]: {$lte: fieldVal}}, hint, expected: docsLte}); verifyCEForMatch( {coll, predicate: {[field]: {$gt: fieldVal}}, hint, expected: docsGt}); verifyCEForMatch( {coll, predicate: {[field]: {$gte: fieldVal}}, hint, expected: docsGte}); } else if (field == "int" && documentField == "dbl") { // Each distinct double value corresponds to an int value + 0.1, so we shouldn't // get any equality matches. verifyCEForMatch({coll, predicate: {[field]: fieldVal}, hint, expected: []}); // When we have a predicate ~ < val + 0.1 or <= val + 0.1, it should match all // integers <= val. verifyCEForMatch( {coll, predicate: {[field]: {$lt: fieldVal}}, hint, expected: docsLte}); verifyCEForMatch( {coll, predicate: {[field]: {$lte: fieldVal}}, hint, expected: docsLte}); // When we have a predicate ~ > val + 0.1 or >= val + 0.1, it should match all // integers > val. verifyCEForMatch( {coll, predicate: {[field]: {$gt: fieldVal}}, hint, expected: docsGt}); verifyCEForMatch( {coll, predicate: {[field]: {$gte: fieldVal}}, hint, expected: docsGt}); } else if (field == "dbl" && documentField == "int") { // Each distinct double value corresponds to an int value + 0.1, so we shouldn't // get any equality matches. verifyCEForMatch({coll, predicate: {[field]: fieldVal}, hint, expected: []}); // When we have a predicate ~ < val - 0.1 or <= val - 0.1, it should match all // doubles < val. verifyCEForMatch( {coll, predicate: {[field]: {$lt: fieldVal}}, hint, expected: docsLt}); verifyCEForMatch( {coll, predicate: {[field]: {$lte: fieldVal}}, hint, expected: docsLt}); // When we have a predicate ~ > val - 0.1 or >= val - 0.1, it should match all // doubles >= val. verifyCEForMatch( {coll, predicate: {[field]: {$gt: fieldVal}}, hint, expected: docsGte}); verifyCEForMatch( {coll, predicate: {[field]: {$gte: fieldVal}}, hint, expected: docsGte}); } else { // Verify that we obtain a CE of 0 for types other than the 'field' type when at // least one type is not numeric. const expected = []; verifyCEForMatch({coll, predicate: {[field]: fieldVal}, hint, expected}); verifyCEForMatch({coll, predicate: {[field]: {$lt: fieldVal}}, hint, expected}); verifyCEForMatch( {coll, predicate: {[field]: {$lte: fieldVal}}, hint, expected}); verifyCEForMatch({coll, predicate: {[field]: {$gt: fieldVal}}, hint, expected}); verifyCEForMatch( {coll, predicate: {[field]: {$gte: fieldVal}}, hint, expected}); } } count += val; } // Verify CE for values outside the range of distinct values for each field. const docLow = {int: 0, dbl: 0.0, str: `string_0`, date: new Date(`02 December ${2000}`)}; const docHigh = generateDocs(ndv + 1)[0]; const expected = []; verifyCEForMatch({coll, predicate: {[field]: docLow[field]}, hint, expected}); verifyCEForMatch({coll, predicate: {[field]: docHigh[field]}, hint, expected}); } } // Run test. if (!checkCascadesOptimizerEnabled(db)) { jsTestLog("Skipping test because the optimizer is not enabled"); return; } if (checkSBEEnabled(db, ["featureFlagSbeFull"], true)) { jsTestLog("Skipping the test because it doesn't work in Full SBE"); return; } verifyCEForNDV(1); verifyCEForNDV(2); verifyCEForNDV(3); verifyCEForNDV(10); }());