diff options
author | Milena Ivanova <milena.ivanova@mongodb.com> | 2022-12-12 10:54:53 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2022-12-12 11:28:43 +0000 |
commit | 21aba5b624e0371869a453f89ad1b3711ee720b0 (patch) | |
tree | 9d9dd03c7ecf453cf51310282287a760e8eccd91 | |
parent | 20913e4b839e108613a711b60698bc515eb2b2ac (diff) | |
download | mongo-21aba5b624e0371869a453f89ad1b3711ee720b0.tar.gz |
SERVER-71989 Implement CE accuracy testing framework based on JS testing
-rw-r--r-- | jstests/cqf/analyze/array_histogram.js | 2 | ||||
-rw-r--r-- | jstests/cqf/analyze/ce_histogram.js | 2 | ||||
-rw-r--r-- | jstests/cqf/analyze/missing_histogram.js | 2 | ||||
-rw-r--r-- | jstests/cqf/analyze/type_counts.js | 6 | ||||
-rw-r--r-- | jstests/libs/ce_stats_utils.js | 21 | ||||
-rw-r--r-- | jstests/libs/optimizer_utils.js | 15 | ||||
-rw-r--r-- | jstests/query_golden/ce_accuracy.js | 167 | ||||
-rw-r--r-- | jstests/query_golden/expected_output/ce_accuracy | 470 | ||||
-rw-r--r-- | jstests/query_golden/libs/ce_data.js | 35 | ||||
-rw-r--r-- | jstests/query_golden/libs/compute_errors.js | 29 | ||||
-rw-r--r-- | jstests/query_golden/libs/generate_queries.js | 74 |
11 files changed, 810 insertions, 13 deletions
diff --git a/jstests/cqf/analyze/array_histogram.js b/jstests/cqf/analyze/array_histogram.js index 23277ab08ba..0ee10af9ebf 100644 --- a/jstests/cqf/analyze/array_histogram.js +++ b/jstests/cqf/analyze/array_histogram.js @@ -112,7 +112,7 @@ runHistogramsTest(function verifyArrayHistograms() { createAndValidateHistogram({coll, expectedHistogram}); // Verify CE. - forceHistogramCE(); + forceCE("histogram"); // Equality predicates. verifyCEForMatch( diff --git a/jstests/cqf/analyze/ce_histogram.js b/jstests/cqf/analyze/ce_histogram.js index c9f2691b887..cb27f55b3f9 100644 --- a/jstests/cqf/analyze/ce_histogram.js +++ b/jstests/cqf/analyze/ce_histogram.js @@ -187,7 +187,7 @@ function verifyCEForNDV(ndv) { }; // Verify CE for all distinct values of each field across multiple types. - forceHistogramCE(); + forceCE("histogram"); for (const field of fields) { jsTestLog(`Testing histogram for ndv ${ndv} and field ${field}`); diff --git a/jstests/cqf/analyze/missing_histogram.js b/jstests/cqf/analyze/missing_histogram.js index 7349337dd1d..57d16836883 100644 --- a/jstests/cqf/analyze/missing_histogram.js +++ b/jstests/cqf/analyze/missing_histogram.js @@ -29,7 +29,7 @@ runHistogramsTest(function testEmptyAndMissingHistograms() { // Ensure we have no histogram. assert.eq(db.system.statistics[noHistogramColl.getName()].count(), 0); - forceHistogramCE(); + forceCE("histogram"); // We actually use heuristics because we don't generate histograms for empty collections; // however, our estimate should still be 0, because the collection is empty. diff --git a/jstests/cqf/analyze/type_counts.js b/jstests/cqf/analyze/type_counts.js index b43a37dad55..d3a25eb85b9 100644 --- a/jstests/cqf/analyze/type_counts.js +++ b/jstests/cqf/analyze/type_counts.js @@ -109,7 +109,7 @@ runHistogramsTest(function testTypeCounts() { }); // Verify type count CE. - forceHistogramCE(); + forceCE("histogram"); let hint = {a: 1}; // TODO SERVER-70936: estimate boolean counts. @@ -352,7 +352,7 @@ runHistogramsTest(function testTypeCounts() { }); // Verify type count CE. - forceHistogramCE(); + forceCE("histogram"); hint = {"a.b": 1}; // Test CE for histogrammable types. @@ -551,7 +551,7 @@ runHistogramsTest(function testTypeCounts() { // Verify type count CE. Note that for non-$elemMatch preidcates, we include both array and // scalar type-counts, while for $elemMatch predicates, we include only array type counts in // our estimate. - forceHistogramCE(); + forceCE("histogram"); hint = {a: 1}; // TODO SERVER-70936: estimate boolean counts. diff --git a/jstests/libs/ce_stats_utils.js b/jstests/libs/ce_stats_utils.js index fdef187007a..48f8ec77f0c 100644 --- a/jstests/libs/ce_stats_utils.js +++ b/jstests/libs/ce_stats_utils.js @@ -126,12 +126,19 @@ function runHistogramsTest(test) { } /** - * We need to set the CE query knob to use histograms and force the use of the new optimizer to - * ensure that we use histograms to estimate CE. + * Creates a single-field index for each field in the 'fields' array. */ -function forceHistogramCE() { - assert.commandWorked( - db.adminCommand({setParameter: 1, internalQueryCardinalityEstimatorMode: "histogram"})); - assert.commandWorked( - db.adminCommand({setParameter: 1, internalQueryFrameworkControl: "forceBonsai"})); +function createIndexes(coll, fields) { + for (const field of fields) { + assert.commandWorked(coll.createIndex({[field]: 1})); + } +} + +/** + * Creates statistics for each field in the 'fields' array. + */ +function analyzeFields(coll, fields) { + for (const field of fields) { + assert.commandWorked(db.runCommand({analyze: coll.getName(), key: field})); + } } diff --git a/jstests/libs/optimizer_utils.js b/jstests/libs/optimizer_utils.js index bb363a6a54a..4f7b4ca7c3d 100644 --- a/jstests/libs/optimizer_utils.js +++ b/jstests/libs/optimizer_utils.js @@ -324,3 +324,18 @@ function runWithParams(keyValPairs, fn) { } } } + +function round2(n) { + return (Math.round(n * 100) / 100); +} + +/** + * Force cardinality estimation mode: "histogram", "heuristic", or "sampling". We need to force the + * use of the new optimizer. + */ +function forceCE(mode) { + assert.commandWorked( + db.adminCommand({setParameter: 1, internalQueryFrameworkControl: "forceBonsai"})); + assert.commandWorked( + db.adminCommand({setParameter: 1, internalQueryCardinalityEstimatorMode: mode})); +} diff --git a/jstests/query_golden/ce_accuracy.js b/jstests/query_golden/ce_accuracy.js new file mode 100644 index 00000000000..db011020a52 --- /dev/null +++ b/jstests/query_golden/ce_accuracy.js @@ -0,0 +1,167 @@ +/** + * Tests for cardinality estimation accuracy. + * @tags: [ + * requires_cqf, + * ] + */ + +(function() { + +load("jstests/libs/ce_stats_utils.js"); +load("jstests/libs/optimizer_utils.js"); +load("jstests/query_golden/libs/ce_data.js"); +load("jstests/query_golden/libs/compute_errors.js"); +load("jstests/query_golden/libs/generate_queries.js"); + +runHistogramsTest(function testCEAccuracy() { + // Flag to show more information for debugging purposes: + // - query results; + // - execution of sampling CE strategy. + const ceDebugFlag = false; + let ceStrategies = ["heuristic", "histogram"]; + if (ceDebugFlag) { + ceStrategies.push("sampling"); + } + + const coll = db.ce_accuracy; + coll.drop(); + + // Collection to store accuracy errors. + const errorColl = db.ce_errors; + errorColl.drop(); + + // Stats collection. + const statsColl = db.system.statistics.ce_accuracy; + + jsTestLog("Populating collection"); + assert.commandWorked(coll.insertMany(getCEDocs())); + const collSize = coll.find().itcount(); + print(`Collection count: ${coll.find().itcount()}\n`); + if (ceDebugFlag) { + show(coll.find().limit(100).toArray()); + } + + // Switch to 'tryBonsai' to create statistics. + assert.commandWorked( + db.adminCommand({setParameter: 1, internalQueryFrameworkControl: "tryBonsai"})); + // Create indexes and statistics. + const fields = ["a", "b", "c"]; + createIndexes(coll, fields); + analyzeFields(coll, fields); + for (const field of fields) { + let stats = statsColl.find({"_id": field})[0]; + assert.eq(stats["statistics"]["documents"], collSize); + } + + function runAggregationWithCE(testcase, strategy) { + if (testcase["nReturned"] == null) { + var explain = coll.explain("executionStats").aggregate(testcase.pipeline); + testcase["nReturned"] = explain.executionStats.nReturned; + } else { + var explain = coll.explain().aggregate(testcase.pipeline); + } + testcase[strategy] = round2(getRootCE(explain)); + } + + // Run all queries in the array testCases with all CE strategies. + function runQueries(testCases) { + ceStrategies.forEach(function(strategy) { + forceCE(strategy); + testCases.forEach(testCase => runAggregationWithCE(testCase, strategy)); + }); + } + + function computeStrategyErrors(testcase, strategy, collSize) { + let errorDoc = {}; + const absError = testcase[strategy] - testcase.nReturned; + let relError = 0.0; + if (testcase.nReturned > 0) { + relError = absError / testcase.nReturned; + } else if (testcase[strategy] > 0) { + // We cannot compute the relative error by division by zero. Take 10% of the absolute + // error instead. + relError = 0.1 * testcase[strategy]; + } + + // Selectivity error wrt collection size. + const selError = 100.0 * absError / collSize; + errorDoc["absError"] = round2(absError); + errorDoc["relError"] = round2(relError); + errorDoc["selError"] = round2(selError); + return errorDoc; + } + + function computeAndPrintErrors(testcase, collSize) { + let errorDoc = {_id: testcase._id}; + + ceStrategies.forEach(function(strategy) { + errors = computeStrategyErrors(testcase, strategy, collSize); + errorDoc[strategy] = errors; + print(`${strategy}: ${testcase[strategy]} `); + print(`AbsError: ${errors["absError"]}, RelError: ${errors["relError"]} , SelError: ${ + errors["selError"]}%\n`); + }); + return errorDoc; + } + + // Queries. + // Defined as documents: { _id: 1, pipeline: [{$match: {a: {$gt: 16}}}] } + let values = selectQueryValues(coll, "a"); + let testCasesInt = generateComparisons("a", values); + let testCasesStr = generateComparisons("b", ["", genRandomString(10), genRandomString(15)]); + let testCasesArr = generateRangePredicates("c", [ + [2, 4], + ]); + let testCases = testCasesInt.concat(testCasesStr).concat(testCasesArr); + + let i = 0; + for (let query of testCases) { + query["_id"] = i++; + } + + runQueries(testCases); + + // Compute errors per query. + for (const testcase of testCases) { + jsTestLog(`Query ${testcase._id}: ${tojsononeline(testcase.pipeline)}`); + if (ceDebugFlag) + show(coll.aggregate(testcase.pipeline)); + print(`Actual cardinality: ${testcase.nReturned}\n`); + print(`Cardinality estimates:\n`); + let errorDoc = computeAndPrintErrors(testcase, collSize); + errorColl.insert(errorDoc); + } + + if (ceDebugFlag) { + show(testCases); + show(errorColl.find()); + } + + // Switch to 'forceClassicEngine' to run the pipelines computing the errors. + assert.commandWorked( + db.adminCommand({setParameter: 1, internalQueryFrameworkControl: "forceClassicEngine"})); + + // Compute aggregate errors for all CE strategies across all queries. + const trialSize = errorColl.find().itcount(); + + print(`\nHeuristic mean errors: `); + print(`absRMSE: ${computeRMSE(errorColl, "$heuristic.absError", trialSize)}, `); + print(`relRMSE: ${computeRMSE(errorColl, "$heuristic.relError", trialSize)}, `); + print( + `meanAbsSelErr: ${computeMeanAbsSelError(errorColl, "$heuristic.selError", trialSize)}%\n`); + + print(`Histogram mean errors: `); + print(`absRMSE: ${computeRMSE(errorColl, "$histogram.absError", trialSize)}, `); + print(`relRMSE: ${computeRMSE(errorColl, "$histogram.relError", trialSize)}, `); + print( + `meanAbsSelErr: ${computeMeanAbsSelError(errorColl, "$histogram.selError", trialSize)}%\n`); + + if (ceDebugFlag) { + print(`Sampling mean errors: `); + print(`absRMSE: ${computeRMSE(errorColl, "$sampling.absError", trialSize)}, `); + print(`relRMSE: ${computeRMSE(errorColl, "$sampling.relError", trialSize)}, `); + print(`meanAbsSelErr: ${ + computeMeanAbsSelError(errorColl, "$sampling.selError", trialSize)}%\n`); + } +}); +})(); diff --git a/jstests/query_golden/expected_output/ce_accuracy b/jstests/query_golden/expected_output/ce_accuracy new file mode 100644 index 00000000000..518993f6b99 --- /dev/null +++ b/jstests/query_golden/expected_output/ce_accuracy @@ -0,0 +1,470 @@ + + +[jsTest] ---- +[jsTest] Populating collection +[jsTest] ---- + +Collection count: 10 + + +[jsTest] ---- +[jsTest] Query 0: [ { "$match" : { "a" : { "$eq" : 12 } } } ] +[jsTest] ---- + +Actual cardinality: 1 +Cardinality estimates: +heuristic: 3.16 AbsError: 2.16, RelError: 2.16 , SelError: 21.6% +histogram: 1 AbsError: 0, RelError: 0 , SelError: 0% + + +[jsTest] ---- +[jsTest] Query 1: [ { "$match" : { "a" : { "$eq" : 15 } } } ] +[jsTest] ---- + +Actual cardinality: 1 +Cardinality estimates: +heuristic: 3.16 AbsError: 2.16, RelError: 2.16 , SelError: 21.6% +histogram: 1 AbsError: 0, RelError: 0 , SelError: 0% + + +[jsTest] ---- +[jsTest] Query 2: [ { "$match" : { "a" : { "$eq" : 18 } } } ] +[jsTest] ---- + +Actual cardinality: 1 +Cardinality estimates: +heuristic: 3.16 AbsError: 2.16, RelError: 2.16 , SelError: 21.6% +histogram: 1 AbsError: 0, RelError: 0 , SelError: 0% + + +[jsTest] ---- +[jsTest] Query 3: [ { "$match" : { "a" : { "$lt" : 12 } } } ] +[jsTest] ---- + +Actual cardinality: 2 +Cardinality estimates: +heuristic: 5 AbsError: 3, RelError: 1.5 , SelError: 30% +histogram: 2 AbsError: 0, RelError: 0 , SelError: 0% + + +[jsTest] ---- +[jsTest] Query 4: [ { "$match" : { "a" : { "$lt" : 15 } } } ] +[jsTest] ---- + +Actual cardinality: 5 +Cardinality estimates: +heuristic: 5 AbsError: 0, RelError: 0 , SelError: 0% +histogram: 5 AbsError: 0, RelError: 0 , SelError: 0% + + +[jsTest] ---- +[jsTest] Query 5: [ { "$match" : { "a" : { "$lt" : 18 } } } ] +[jsTest] ---- + +Actual cardinality: 8 +Cardinality estimates: +heuristic: 5 AbsError: -3, RelError: -0.37 , SelError: -30% +histogram: 8 AbsError: 0, RelError: 0 , SelError: 0% + + +[jsTest] ---- +[jsTest] Query 6: [ { "$match" : { "a" : { "$lte" : 12 } } } ] +[jsTest] ---- + +Actual cardinality: 3 +Cardinality estimates: +heuristic: 5 AbsError: 2, RelError: 0.67 , SelError: 20% +histogram: 3 AbsError: 0, RelError: 0 , SelError: 0% + + +[jsTest] ---- +[jsTest] Query 7: [ { "$match" : { "a" : { "$lte" : 15 } } } ] +[jsTest] ---- + +Actual cardinality: 6 +Cardinality estimates: +heuristic: 5 AbsError: -1, RelError: -0.17 , SelError: -10% +histogram: 6 AbsError: 0, RelError: 0 , SelError: 0% + + +[jsTest] ---- +[jsTest] Query 8: [ { "$match" : { "a" : { "$lte" : 18 } } } ] +[jsTest] ---- + +Actual cardinality: 9 +Cardinality estimates: +heuristic: 5 AbsError: -4, RelError: -0.44 , SelError: -40% +histogram: 9 AbsError: 0, RelError: 0 , SelError: 0% + + +[jsTest] ---- +[jsTest] Query 9: [ { "$match" : { "a" : { "$gt" : 12 } } } ] +[jsTest] ---- + +Actual cardinality: 7 +Cardinality estimates: +heuristic: 7 AbsError: 0, RelError: 0 , SelError: 0% +histogram: 7 AbsError: 0, RelError: 0 , SelError: 0% + + +[jsTest] ---- +[jsTest] Query 10: [ { "$match" : { "a" : { "$gt" : 15 } } } ] +[jsTest] ---- + +Actual cardinality: 4 +Cardinality estimates: +heuristic: 7 AbsError: 3, RelError: 0.75 , SelError: 30% +histogram: 4 AbsError: 0, RelError: 0 , SelError: 0% + + +[jsTest] ---- +[jsTest] Query 11: [ { "$match" : { "a" : { "$gt" : 18 } } } ] +[jsTest] ---- + +Actual cardinality: 1 +Cardinality estimates: +heuristic: 7 AbsError: 6, RelError: 6 , SelError: 60% +histogram: 1 AbsError: 0, RelError: 0 , SelError: 0% + + +[jsTest] ---- +[jsTest] Query 12: [ { "$match" : { "a" : { "$gte" : 12 } } } ] +[jsTest] ---- + +Actual cardinality: 8 +Cardinality estimates: +heuristic: 7 AbsError: -1, RelError: -0.12 , SelError: -10% +histogram: 8 AbsError: 0, RelError: 0 , SelError: 0% + + +[jsTest] ---- +[jsTest] Query 13: [ { "$match" : { "a" : { "$gte" : 15 } } } ] +[jsTest] ---- + +Actual cardinality: 5 +Cardinality estimates: +heuristic: 7 AbsError: 2, RelError: 0.4 , SelError: 20% +histogram: 5 AbsError: 0, RelError: 0 , SelError: 0% + + +[jsTest] ---- +[jsTest] Query 14: [ { "$match" : { "a" : { "$gte" : 18 } } } ] +[jsTest] ---- + +Actual cardinality: 2 +Cardinality estimates: +heuristic: 7 AbsError: 5, RelError: 2.5 , SelError: 50% +histogram: 2 AbsError: 0, RelError: 0 , SelError: 0% + + +[jsTest] ---- +[jsTest] Query 15: [ { "$match" : { "b" : { "$eq" : "" } } } ] +[jsTest] ---- + +Actual cardinality: 0 +Cardinality estimates: +heuristic: 3.16 AbsError: 3.16, RelError: 0.32 , SelError: 31.6% +histogram: 0 AbsError: 0, RelError: 0 , SelError: 0% + + +[jsTest] ---- +[jsTest] Query 16: [ { "$match" : { "b" : { "$eq" : "mno" } } } ] +[jsTest] ---- + +Actual cardinality: 0 +Cardinality estimates: +heuristic: 3.16 AbsError: 3.16, RelError: 0.32 , SelError: 31.6% +histogram: 0 AbsError: 0, RelError: 0 , SelError: 0% + + +[jsTest] ---- +[jsTest] Query 17: [ { "$match" : { "b" : { "$eq" : "stuv" } } } ] +[jsTest] ---- + +Actual cardinality: 0 +Cardinality estimates: +heuristic: 3.16 AbsError: 3.16, RelError: 0.32 , SelError: 31.6% +histogram: 0 AbsError: 0, RelError: 0 , SelError: 0% + + +[jsTest] ---- +[jsTest] Query 18: [ { "$match" : { "b" : { "$lt" : "" } } } ] +[jsTest] ---- + +Actual cardinality: 0 +Cardinality estimates: +heuristic: 0 AbsError: 0, RelError: 0 , SelError: 0% +histogram: 0 AbsError: 0, RelError: 0 , SelError: 0% + + +[jsTest] ---- +[jsTest] Query 19: [ { "$match" : { "b" : { "$lt" : "mno" } } } ] +[jsTest] ---- + +Actual cardinality: 4 +Cardinality estimates: +heuristic: 5 AbsError: 1, RelError: 0.25 , SelError: 10% +histogram: 4 AbsError: 0, RelError: 0 , SelError: 0% + + +[jsTest] ---- +[jsTest] Query 20: [ { "$match" : { "b" : { "$lt" : "stuv" } } } ] +[jsTest] ---- + +Actual cardinality: 7 +Cardinality estimates: +heuristic: 5 AbsError: -2, RelError: -0.29 , SelError: -20% +histogram: 7 AbsError: 0, RelError: 0 , SelError: 0% + + +[jsTest] ---- +[jsTest] Query 21: [ { "$match" : { "b" : { "$lte" : "" } } } ] +[jsTest] ---- + +Actual cardinality: 0 +Cardinality estimates: +heuristic: 3.16 AbsError: 3.16, RelError: 0.32 , SelError: 31.6% +histogram: 0 AbsError: 0, RelError: 0 , SelError: 0% + + +[jsTest] ---- +[jsTest] Query 22: [ { "$match" : { "b" : { "$lte" : "mno" } } } ] +[jsTest] ---- + +Actual cardinality: 4 +Cardinality estimates: +heuristic: 5 AbsError: 1, RelError: 0.25 , SelError: 10% +histogram: 4 AbsError: 0, RelError: 0 , SelError: 0% + + +[jsTest] ---- +[jsTest] Query 23: [ { "$match" : { "b" : { "$lte" : "stuv" } } } ] +[jsTest] ---- + +Actual cardinality: 7 +Cardinality estimates: +heuristic: 5 AbsError: -2, RelError: -0.29 , SelError: -20% +histogram: 7 AbsError: 0, RelError: 0 , SelError: 0% + + +[jsTest] ---- +[jsTest] Query 24: [ { "$match" : { "b" : { "$gt" : "" } } } ] +[jsTest] ---- + +Actual cardinality: 10 +Cardinality estimates: +heuristic: 7 AbsError: -3, RelError: -0.3 , SelError: -30% +histogram: 10 AbsError: 0, RelError: 0 , SelError: 0% + + +[jsTest] ---- +[jsTest] Query 25: [ { "$match" : { "b" : { "$gt" : "mno" } } } ] +[jsTest] ---- + +Actual cardinality: 6 +Cardinality estimates: +heuristic: 7 AbsError: 1, RelError: 0.17 , SelError: 10% +histogram: 6 AbsError: 0, RelError: 0 , SelError: 0% + + +[jsTest] ---- +[jsTest] Query 26: [ { "$match" : { "b" : { "$gt" : "stuv" } } } ] +[jsTest] ---- + +Actual cardinality: 3 +Cardinality estimates: +heuristic: 7 AbsError: 4, RelError: 1.33 , SelError: 40% +histogram: 3 AbsError: 0, RelError: 0 , SelError: 0% + + +[jsTest] ---- +[jsTest] Query 27: [ { "$match" : { "b" : { "$gte" : "" } } } ] +[jsTest] ---- + +Actual cardinality: 10 +Cardinality estimates: +heuristic: 7 AbsError: -3, RelError: -0.3 , SelError: -30% +histogram: 10 AbsError: 0, RelError: 0 , SelError: 0% + + +[jsTest] ---- +[jsTest] Query 28: [ { "$match" : { "b" : { "$gte" : "mno" } } } ] +[jsTest] ---- + +Actual cardinality: 6 +Cardinality estimates: +heuristic: 7 AbsError: 1, RelError: 0.17 , SelError: 10% +histogram: 6 AbsError: 0, RelError: 0 , SelError: 0% + + +[jsTest] ---- +[jsTest] Query 29: [ { "$match" : { "b" : { "$gte" : "stuv" } } } ] +[jsTest] ---- + +Actual cardinality: 3 +Cardinality estimates: +heuristic: 7 AbsError: 4, RelError: 1.33 , SelError: 40% +histogram: 3 AbsError: 0, RelError: 0 , SelError: 0% + + +[jsTest] ---- +[jsTest] Query 30: [ { "$match" : { "c" : { "$gt" : 2, "$lt" : 4 } } } ] +[jsTest] ---- + +Actual cardinality: 2 +Cardinality estimates: +heuristic: 4.18 AbsError: 2.18, RelError: 1.09 , SelError: 21.8% +histogram: 2.85 AbsError: 0.85, RelError: 0.43 , SelError: 8.5% + + +[jsTest] ---- +[jsTest] Query 31: [ { "$match" : { "c" : { "$elemMatch" : { "$gt" : 2, "$lt" : 4 } } } } ] +[jsTest] ---- + +Actual cardinality: 2 +Cardinality estimates: +heuristic: 4.18 AbsError: 2.18, RelError: 1.09 , SelError: 21.8% +histogram: 1.65 AbsError: -0.35, RelError: -0.18 , SelError: -3.5% + + +[jsTest] ---- +[jsTest] Query 32: [ { "$match" : { "c" : { "$gt" : 2, "$lte" : 4 } } } ] +[jsTest] ---- + +Actual cardinality: 3 +Cardinality estimates: +heuristic: 4.18 AbsError: 1.18, RelError: 0.39 , SelError: 11.8% +histogram: 3.79 AbsError: 0.79, RelError: 0.26 , SelError: 7.9% + + +[jsTest] ---- +[jsTest] Query 33: [ { "$match" : { "c" : { "$elemMatch" : { "$gt" : 2, "$lte" : 4 } } } } ] +[jsTest] ---- + +Actual cardinality: 3 +Cardinality estimates: +heuristic: 4.18 AbsError: 1.18, RelError: 0.39 , SelError: 11.8% +histogram: 4.13 AbsError: 1.13, RelError: 0.38 , SelError: 11.3% + + +[jsTest] ---- +[jsTest] Query 34: [ { "$match" : { "c" : { "$gt" : 2, "$eq" : 4 } } } ] +[jsTest] ---- + +Actual cardinality: 3 +Cardinality estimates: +heuristic: 2.65 AbsError: -0.35, RelError: -0.12 , SelError: -3.5% +histogram: 2.85 AbsError: -0.15, RelError: -0.05 , SelError: -1.5% + + +[jsTest] ---- +[jsTest] Query 35: [ { "$match" : { "c" : { "$elemMatch" : { "$gt" : 2, "$eq" : 4 } } } } ] +[jsTest] ---- + +Actual cardinality: 3 +Cardinality estimates: +heuristic: 2.65 AbsError: -0.35, RelError: -0.12 , SelError: -3.5% +histogram: 3 AbsError: 0, RelError: 0 , SelError: 0% + + +[jsTest] ---- +[jsTest] Query 36: [ { "$match" : { "c" : { "$gte" : 2, "$lt" : 4 } } } ] +[jsTest] ---- + +Actual cardinality: 2 +Cardinality estimates: +heuristic: 4.18 AbsError: 2.18, RelError: 1.09 , SelError: 21.8% +histogram: 2.85 AbsError: 0.85, RelError: 0.43 , SelError: 8.5% + + +[jsTest] ---- +[jsTest] Query 37: [ { "$match" : { "c" : { "$elemMatch" : { "$gte" : 2, "$lt" : 4 } } } } ] +[jsTest] ---- + +Actual cardinality: 2 +Cardinality estimates: +heuristic: 4.18 AbsError: 2.18, RelError: 1.09 , SelError: 21.8% +histogram: 2.48 AbsError: 0.48, RelError: 0.24 , SelError: 4.8% + + +[jsTest] ---- +[jsTest] Query 38: [ { "$match" : { "c" : { "$gte" : 2, "$lte" : 4 } } } ] +[jsTest] ---- + +Actual cardinality: 3 +Cardinality estimates: +heuristic: 4.18 AbsError: 1.18, RelError: 0.39 , SelError: 11.8% +histogram: 3.79 AbsError: 0.79, RelError: 0.26 , SelError: 7.9% + + +[jsTest] ---- +[jsTest] Query 39: [ { "$match" : { "c" : { "$elemMatch" : { "$gte" : 2, "$lte" : 4 } } } } ] +[jsTest] ---- + +Actual cardinality: 3 +Cardinality estimates: +heuristic: 4.18 AbsError: 1.18, RelError: 0.39 , SelError: 11.8% +histogram: 4.96 AbsError: 1.96, RelError: 0.65 , SelError: 19.6% + + +[jsTest] ---- +[jsTest] Query 40: [ { "$match" : { "c" : { "$gte" : 2, "$eq" : 4 } } } ] +[jsTest] ---- + +Actual cardinality: 3 +Cardinality estimates: +heuristic: 2.65 AbsError: -0.35, RelError: -0.12 , SelError: -3.5% +histogram: 2.85 AbsError: -0.15, RelError: -0.05 , SelError: -1.5% + + +[jsTest] ---- +[jsTest] Query 41: [ { "$match" : { "c" : { "$elemMatch" : { "$gte" : 2, "$eq" : 4 } } } } ] +[jsTest] ---- + +Actual cardinality: 3 +Cardinality estimates: +heuristic: 2.65 AbsError: -0.35, RelError: -0.12 , SelError: -3.5% +histogram: 3 AbsError: 0, RelError: 0 , SelError: 0% + + +[jsTest] ---- +[jsTest] Query 42: [ { "$match" : { "c" : { "$eq" : 2, "$lt" : 4 } } } ] +[jsTest] ---- + +Actual cardinality: 1 +Cardinality estimates: +heuristic: 2.24 AbsError: 1.24, RelError: 1.24 , SelError: 12.4% +histogram: 0.55 AbsError: -0.45, RelError: -0.45 , SelError: -4.5% + + +[jsTest] ---- +[jsTest] Query 43: [ { "$match" : { "c" : { "$elemMatch" : { "$eq" : 2, "$lt" : 4 } } } } ] +[jsTest] ---- + +Actual cardinality: 1 +Cardinality estimates: +heuristic: 2.65 AbsError: 1.65, RelError: 1.65 , SelError: 16.5% +histogram: 1 AbsError: 0, RelError: 0 , SelError: 0% + + +[jsTest] ---- +[jsTest] Query 44: [ { "$match" : { "c" : { "$eq" : 2, "$lte" : 4 } } } ] +[jsTest] ---- + +Actual cardinality: 1 +Cardinality estimates: +heuristic: 2.24 AbsError: 1.24, RelError: 1.24 , SelError: 12.4% +histogram: 0.63 AbsError: -0.37, RelError: -0.37 , SelError: -3.7% + + +[jsTest] ---- +[jsTest] Query 45: [ { "$match" : { "c" : { "$elemMatch" : { "$eq" : 2, "$lte" : 4 } } } } ] +[jsTest] ---- + +Actual cardinality: 1 +Cardinality estimates: +heuristic: 2.65 AbsError: 1.65, RelError: 1.65 , SelError: 16.5% +histogram: 1 AbsError: 0, RelError: 0 , SelError: 0% + +Heuristic mean errors: absRMSE: 2.394, relRMSE: 1.306, meanAbsSelErr: 19.943% +Histogram mean errors: absRMSE: 0.431, relRMSE: 0.181, meanAbsSelErr: 1.809% diff --git a/jstests/query_golden/libs/ce_data.js b/jstests/query_golden/libs/ce_data.js new file mode 100644 index 00000000000..c59a6af58a9 --- /dev/null +++ b/jstests/query_golden/libs/ce_data.js @@ -0,0 +1,35 @@ +// Small data generator for the purpose of developing the test framework. + +const alphabet = "abcdefghijklmnopqrstuvwxyz"; +const len = alphabet.length; + +// Returns pseudo-random string where the symbols and the length are functions of the parameter n. +function genRandomString(n) { + let strLen = n % 4 + 1; + let str = ""; + let i = 0; + while (i < strLen) { + let idx = (100 * n + i++) % len; + str = str + alphabet[idx]; + } + return str; +} + +const seedArray = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]; +const arrLen = seedArray.length; + +// Returns pseudo-random array where the elements and the length are functions of the parameter n. +function genRandomArray(n) { + let aLen = (7 * n) % 5 + 1; + let start = (13 * n) % arrLen; + return seedArray.slice(start, start + aLen); +} + +/** + * Returns documents for cardinality estimation tests. + */ + +function getCEDocs() { + return Array.from({length: 10}, + (_, i) => ({_id: i, a: i + 10, b: genRandomString(i), c: genRandomArray(i)})); +} diff --git a/jstests/query_golden/libs/compute_errors.js b/jstests/query_golden/libs/compute_errors.js new file mode 100644 index 00000000000..4443376217f --- /dev/null +++ b/jstests/query_golden/libs/compute_errors.js @@ -0,0 +1,29 @@ +// Given a collection containing errors for individual queries, compute root-mean square error. +function computeRMSE(errorColl, errorField, collSize) { + const res = + errorColl + .aggregate([ + {$project: {error2: {$pow: [errorField, 2]}}}, + {$group: {_id: null, cumError: {$sum: "$error2"}}}, + { + $project: + {_id: 0, "rmse": {$round: [{$sqrt: {$divide: ["$cumError", collSize]}}, 3]}} + } + ]) + .toArray(); + return res[0].rmse; +} + +// Given a collection with a field with selectivity errors, compute the mean absolute selectivity +// error. +function computeMeanAbsSelError(errorColl, errorField, collSize) { + const res = + errorColl + .aggregate([ + {$project: {absError: {$abs: errorField}}}, + {$group: {_id: null, cumError: {$sum: "$absError"}}}, + {$project: {_id: 0, "meanErr": {$round: [{$divide: ["$cumError", collSize]}, 3]}}} + ]) + .toArray(); + return res[0].meanErr; +} diff --git a/jstests/query_golden/libs/generate_queries.js b/jstests/query_golden/libs/generate_queries.js new file mode 100644 index 00000000000..7129ed64869 --- /dev/null +++ b/jstests/query_golden/libs/generate_queries.js @@ -0,0 +1,74 @@ +/** + * Helper functions for generating of queries over a collection. + */ + +function makeMatchPredicate(field, boundary, compOp) { + return {"$match": {[field]: {[compOp]: boundary}}}; +} + +function makeRangePredicate(field, op1, bound1, op2, bound2, isElemMatch = false) { + if (isElemMatch) { + return {"$match": {[field]: {"$elemMatch": {[op1]: bound1, [op2]: bound2}}}}; + } + return {"$match": {[field]: {[op1]: bound1, [op2]: bound2}}}; +} + +function generateComparisons(field, boundaries) { + let predicates = []; + const compOps = ["$eq", "$lt", "$lte", "$gt", "$gte"]; + for (const op of compOps) { + boundaries.forEach(function(boundary) { + predicates.push(makeMatchPredicate(field, boundary, op)); + }); + } + let docs = []; + for (const pred of predicates) { + let doc = {"pipeline": [pred]}; + docs.push(doc); + } + return docs; +} + +// Generate range predicates with $match and $elemMatch. +// boundaries:[ [low1, high1], ...] +function generateRangePredicates(field, boundaries) { + let predicates = []; + const op1Option = ["$gt", "$gte", "$eq"]; + const op2Option = ["$lt", "$lte", "$eq"]; + boundaries.forEach(function(boundary) { + assert(boundary.length == 2); + for (let op1 of op1Option) { + for (let op2 of op2Option) { + if (op1 == "$eq" && op2 == "$eq") { + continue; + } + predicates.push(makeRangePredicate(field, op1, boundary[0], op2, boundary[1])); + if (boundary[0] <= boundary[1]) { + predicates.push( + makeRangePredicate(field, op1, boundary[0], op2, boundary[1], true)); + } + } + } + }); + + let docs = []; + for (let pred of predicates) { + let doc = {"pipeline": [pred]}; + docs.push(doc); + } + return docs; +} + +// Helper function to extract a number of values from a collection field, to be used for query +// generation. +function selectQueryValues(coll, field) { + let values = []; + const collSize = coll.find().itcount(); + let positions = [2, collSize / 2, collSize - 2]; + + for (const pos of positions) { + const res = coll.find({}, {"_id": 0, [field]: 1}).toArray()[pos]; + values.push(res[field]); + } + return values; +} |