SERVER-71989 Implement CE accuracy testing framework based on JS testing

author: Milena Ivanova <milena.ivanova@mongodb.com> 2022-12-12 10:54:53 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2022-12-12 11:28:43 +0000
commit: 21aba5b624e0371869a453f89ad1b3711ee720b0 (patch)
tree: 9d9dd03c7ecf453cf51310282287a760e8eccd91
parent: 20913e4b839e108613a711b60698bc515eb2b2ac (diff)
download: mongo-21aba5b624e0371869a453f89ad1b3711ee720b0.tar.gz
11 files changed, 810 insertions, 13 deletions
diff --git a/jstests/cqf/analyze/array_histogram.js b/jstests/cqf/analyze/array_histogram.js
index 23277ab08ba..0ee10af9ebf 100644
--- a/jstests/cqf/analyze/array_histogram.js
+++ b/jstests/cqf/analyze/array_histogram.js
@@ -112,7 +112,7 @@ runHistogramsTest(function verifyArrayHistograms() {
     createAndValidateHistogram({coll, expectedHistogram});
 
     // Verify CE.
-    forceHistogramCE();
+    forceCE("histogram");
 
     // Equality predicates.
     verifyCEForMatch(
diff --git a/jstests/cqf/analyze/ce_histogram.js b/jstests/cqf/analyze/ce_histogram.js
index c9f2691b887..cb27f55b3f9 100644
--- a/jstests/cqf/analyze/ce_histogram.js
+++ b/jstests/cqf/analyze/ce_histogram.js
@@ -187,7 +187,7 @@ function verifyCEForNDV(ndv) {
     };
 
     // Verify CE for all distinct values of each field across multiple types.
-    forceHistogramCE();
+    forceCE("histogram");
     for (const field of fields) {
         jsTestLog(`Testing histogram for ndv ${ndv} and field ${field}`);
 
diff --git a/jstests/cqf/analyze/missing_histogram.js b/jstests/cqf/analyze/missing_histogram.js
index 7349337dd1d..57d16836883 100644
--- a/jstests/cqf/analyze/missing_histogram.js
+++ b/jstests/cqf/analyze/missing_histogram.js
@@ -29,7 +29,7 @@ runHistogramsTest(function testEmptyAndMissingHistograms() {
     // Ensure we have no histogram.
     assert.eq(db.system.statistics[noHistogramColl.getName()].count(), 0);
 
-    forceHistogramCE();
+    forceCE("histogram");
 
     // We actually use heuristics because we don't generate histograms for empty collections;
     // however, our estimate should still be 0, because the collection is empty.
diff --git a/jstests/cqf/analyze/type_counts.js b/jstests/cqf/analyze/type_counts.js
index b43a37dad55..d3a25eb85b9 100644
--- a/jstests/cqf/analyze/type_counts.js
+++ b/jstests/cqf/analyze/type_counts.js
@@ -109,7 +109,7 @@ runHistogramsTest(function testTypeCounts() {
     });
 
     // Verify type count CE.
-    forceHistogramCE();
+    forceCE("histogram");
     let hint = {a: 1};
 
     // TODO SERVER-70936: estimate boolean counts.
@@ -352,7 +352,7 @@ runHistogramsTest(function testTypeCounts() {
     });
 
     // Verify type count CE.
-    forceHistogramCE();
+    forceCE("histogram");
     hint = {"a.b": 1};
 
     // Test CE for histogrammable types.
@@ -551,7 +551,7 @@ runHistogramsTest(function testTypeCounts() {
     // Verify type count CE. Note that for non-$elemMatch preidcates, we include both array and
     // scalar type-counts, while for $elemMatch predicates, we include only array type counts in
     // our estimate.
-    forceHistogramCE();
+    forceCE("histogram");
     hint = {a: 1};
 
     // TODO SERVER-70936: estimate boolean counts.
diff --git a/jstests/libs/ce_stats_utils.js b/jstests/libs/ce_stats_utils.js
index fdef187007a..48f8ec77f0c 100644
--- a/jstests/libs/ce_stats_utils.js
+++ b/jstests/libs/ce_stats_utils.js
@@ -126,12 +126,19 @@ function runHistogramsTest(test) {
 }
 
 /**
- * We need to set the CE query knob to use histograms and force the use of the new optimizer to
- * ensure that we use histograms to estimate CE.
+ * Creates a single-field index for each field in the 'fields' array.
  */
-function forceHistogramCE() {
-    assert.commandWorked(
-        db.adminCommand({setParameter: 1, internalQueryCardinalityEstimatorMode: "histogram"}));
-    assert.commandWorked(
-        db.adminCommand({setParameter: 1, internalQueryFrameworkControl: "forceBonsai"}));
+function createIndexes(coll, fields) {
+    for (const field of fields) {
+        assert.commandWorked(coll.createIndex({[field]: 1}));
+    }
+}
+
+/**
+ * Creates statistics for each field in the 'fields' array.
+ */
+function analyzeFields(coll, fields) {
+    for (const field of fields) {
+        assert.commandWorked(db.runCommand({analyze: coll.getName(), key: field}));
+    }
 }
diff --git a/jstests/libs/optimizer_utils.js b/jstests/libs/optimizer_utils.js
index bb363a6a54a..4f7b4ca7c3d 100644
--- a/jstests/libs/optimizer_utils.js
+++ b/jstests/libs/optimizer_utils.js
@@ -324,3 +324,18 @@ function runWithParams(keyValPairs, fn) {
         }
     }
 }
+
+function round2(n) {
+    return (Math.round(n * 100) / 100);
+}
+
+/**
+ * Force cardinality estimation mode: "histogram", "heuristic", or "sampling". We need to force the
+ * use of the new optimizer.
+ */
+function forceCE(mode) {
+    assert.commandWorked(
+        db.adminCommand({setParameter: 1, internalQueryFrameworkControl: "forceBonsai"}));
+    assert.commandWorked(
+        db.adminCommand({setParameter: 1, internalQueryCardinalityEstimatorMode: mode}));
+}
diff --git a/jstests/query_golden/ce_accuracy.js b/jstests/query_golden/ce_accuracy.js
new file mode 100644
index 00000000000..db011020a52
--- /dev/null
+++ b/jstests/query_golden/ce_accuracy.js
@@ -0,0 +1,167 @@
+/**
+ * Tests for cardinality estimation accuracy.
+ * @tags: [
+ *   requires_cqf,
+ * ]
+ */
+
+(function() {
+
+load("jstests/libs/ce_stats_utils.js");
+load("jstests/libs/optimizer_utils.js");
+load("jstests/query_golden/libs/ce_data.js");
+load("jstests/query_golden/libs/compute_errors.js");
+load("jstests/query_golden/libs/generate_queries.js");
+
+runHistogramsTest(function testCEAccuracy() {
+    // Flag to show more information for debugging purposes:
+    // - query results;
+    // - execution of sampling CE strategy.
+    const ceDebugFlag = false;
+    let ceStrategies = ["heuristic", "histogram"];
+    if (ceDebugFlag) {
+        ceStrategies.push("sampling");
+    }
+
+    const coll = db.ce_accuracy;
+    coll.drop();
+
+    // Collection to store accuracy errors.
+    const errorColl = db.ce_errors;
+    errorColl.drop();
+
+    // Stats collection.
+    const statsColl = db.system.statistics.ce_accuracy;
+
+    jsTestLog("Populating collection");
+    assert.commandWorked(coll.insertMany(getCEDocs()));
+    const collSize = coll.find().itcount();
+    print(`Collection count: ${coll.find().itcount()}\n`);
+    if (ceDebugFlag) {
+        show(coll.find().limit(100).toArray());
+    }
+
+    // Switch to 'tryBonsai' to create statistics.
+    assert.commandWorked(
+        db.adminCommand({setParameter: 1, internalQueryFrameworkControl: "tryBonsai"}));
+    // Create indexes and statistics.
+    const fields = ["a", "b", "c"];
+    createIndexes(coll, fields);
+    analyzeFields(coll, fields);
+    for (const field of fields) {
+        let stats = statsColl.find({"_id": field})[0];
+        assert.eq(stats["statistics"]["documents"], collSize);
+    }
+
+    function runAggregationWithCE(testcase, strategy) {
+        if (testcase["nReturned"] == null) {
+            var explain = coll.explain("executionStats").aggregate(testcase.pipeline);
+            testcase["nReturned"] = explain.executionStats.nReturned;
+        } else {
+            var explain = coll.explain().aggregate(testcase.pipeline);
+        }
+        testcase[strategy] = round2(getRootCE(explain));
+    }
+
+    // Run all queries in the array testCases with all CE strategies.
+    function runQueries(testCases) {
+        ceStrategies.forEach(function(strategy) {
+            forceCE(strategy);
+            testCases.forEach(testCase => runAggregationWithCE(testCase, strategy));
+        });
+    }
+
+    function computeStrategyErrors(testcase, strategy, collSize) {
+        let errorDoc = {};
+        const absError = testcase[strategy] - testcase.nReturned;
+        let relError = 0.0;
+        if (testcase.nReturned > 0) {
+            relError = absError / testcase.nReturned;
+        } else if (testcase[strategy] > 0) {
+            // We cannot compute the relative error by division by zero. Take 10% of the absolute
+            // error instead.
+            relError = 0.1 * testcase[strategy];
+        }
+
+        // Selectivity error wrt collection size.
+        const selError = 100.0 * absError / collSize;
+        errorDoc["absError"] = round2(absError);
+        errorDoc["relError"] = round2(relError);
+        errorDoc["selError"] = round2(selError);
+        return errorDoc;
+    }
+
+    function computeAndPrintErrors(testcase, collSize) {
+        let errorDoc = {_id: testcase._id};
+
+        ceStrategies.forEach(function(strategy) {
+            errors = computeStrategyErrors(testcase, strategy, collSize);
+            errorDoc[strategy] = errors;
+            print(`${strategy}: ${testcase[strategy]} `);
+            print(`AbsError: ${errors["absError"]}, RelError: ${errors["relError"]} , SelError: ${
+                errors["selError"]}%\n`);
+        });
+        return errorDoc;
+    }
+
+    // Queries.
+    // Defined as documents: { _id: 1, pipeline: [{$match: {a: {$gt: 16}}}] }
+    let values = selectQueryValues(coll, "a");
+    let testCasesInt = generateComparisons("a", values);
+    let testCasesStr = generateComparisons("b", ["", genRandomString(10), genRandomString(15)]);
+    let testCasesArr = generateRangePredicates("c", [
+        [2, 4],
+    ]);
+    let testCases = testCasesInt.concat(testCasesStr).concat(testCasesArr);
+
+    let i = 0;
+    for (let query of testCases) {
+        query["_id"] = i++;
+    }
+
+    runQueries(testCases);
+
+    // Compute errors per query.
+    for (const testcase of testCases) {
+        jsTestLog(`Query ${testcase._id}: ${tojsononeline(testcase.pipeline)}`);
+        if (ceDebugFlag)
+            show(coll.aggregate(testcase.pipeline));
+        print(`Actual cardinality: ${testcase.nReturned}\n`);
+        print(`Cardinality estimates:\n`);
+        let errorDoc = computeAndPrintErrors(testcase, collSize);
+        errorColl.insert(errorDoc);
+    }
+
+    if (ceDebugFlag) {
+        show(testCases);
+        show(errorColl.find());
+    }
+
+    // Switch to 'forceClassicEngine' to run the pipelines computing the errors.
+    assert.commandWorked(
+        db.adminCommand({setParameter: 1, internalQueryFrameworkControl: "forceClassicEngine"}));
+
+    // Compute aggregate errors for all CE strategies across all queries.
+    const trialSize = errorColl.find().itcount();
+
+    print(`\nHeuristic mean errors: `);
+    print(`absRMSE: ${computeRMSE(errorColl, "$heuristic.absError", trialSize)}, `);
+    print(`relRMSE: ${computeRMSE(errorColl, "$heuristic.relError", trialSize)}, `);
+    print(
+        `meanAbsSelErr: ${computeMeanAbsSelError(errorColl, "$heuristic.selError", trialSize)}%\n`);
+
+    print(`Histogram mean errors: `);
+    print(`absRMSE: ${computeRMSE(errorColl, "$histogram.absError", trialSize)}, `);
+    print(`relRMSE: ${computeRMSE(errorColl, "$histogram.relError", trialSize)}, `);
+    print(
+        `meanAbsSelErr: ${computeMeanAbsSelError(errorColl, "$histogram.selError", trialSize)}%\n`);
+
+    if (ceDebugFlag) {
+        print(`Sampling mean errors: `);
+        print(`absRMSE: ${computeRMSE(errorColl, "$sampling.absError", trialSize)}, `);
+        print(`relRMSE: ${computeRMSE(errorColl, "$sampling.relError", trialSize)}, `);
+        print(`meanAbsSelErr: ${
+            computeMeanAbsSelError(errorColl, "$sampling.selError", trialSize)}%\n`);
+    }
+});
+})();
diff --git a/jstests/query_golden/expected_output/ce_accuracy b/jstests/query_golden/expected_output/ce_accuracy
new file mode 100644
index 00000000000..518993f6b99
--- /dev/null
+++ b/jstests/query_golden/expected_output/ce_accuracy
@@ -0,0 +1,470 @@
+
+
+[jsTest] ----
+[jsTest] Populating collection
+[jsTest] ----
+
+Collection count: 10
+
+
+[jsTest] ----
+[jsTest] Query 0: [ { "$match" : { "a" : { "$eq" : 12 } } } ]
+[jsTest] ----
+
+Actual cardinality: 1
+Cardinality estimates:
+heuristic: 3.16 AbsError: 2.16, RelError: 2.16 , SelError: 21.6%
+histogram: 1 AbsError: 0, RelError: 0 , SelError: 0%
+
+
+[jsTest] ----
+[jsTest] Query 1: [ { "$match" : { "a" : { "$eq" : 15 } } } ]
+[jsTest] ----
+
+Actual cardinality: 1
+Cardinality estimates:
+heuristic: 3.16 AbsError: 2.16, RelError: 2.16 , SelError: 21.6%
+histogram: 1 AbsError: 0, RelError: 0 , SelError: 0%
+
+
+[jsTest] ----
+[jsTest] Query 2: [ { "$match" : { "a" : { "$eq" : 18 } } } ]
+[jsTest] ----
+
+Actual cardinality: 1
+Cardinality estimates:
+heuristic: 3.16 AbsError: 2.16, RelError: 2.16 , SelError: 21.6%
+histogram: 1 AbsError: 0, RelError: 0 , SelError: 0%
+
+
+[jsTest] ----
+[jsTest] Query 3: [ { "$match" : { "a" : { "$lt" : 12 } } } ]
+[jsTest] ----
+
+Actual cardinality: 2
+Cardinality estimates:
+heuristic: 5 AbsError: 3, RelError: 1.5 , SelError: 30%
+histogram: 2 AbsError: 0, RelError: 0 , SelError: 0%
+
+
+[jsTest] ----
+[jsTest] Query 4: [ { "$match" : { "a" : { "$lt" : 15 } } } ]
+[jsTest] ----
+
+Actual cardinality: 5
+Cardinality estimates:
+heuristic: 5 AbsError: 0, RelError: 0 , SelError: 0%
+histogram: 5 AbsError: 0, RelError: 0 , SelError: 0%
+
+
+[jsTest] ----
+[jsTest] Query 5: [ { "$match" : { "a" : { "$lt" : 18 } } } ]
+[jsTest] ----
+
+Actual cardinality: 8
+Cardinality estimates:
+heuristic: 5 AbsError: -3, RelError: -0.37 , SelError: -30%
+histogram: 8 AbsError: 0, RelError: 0 , SelError: 0%
+
+
+[jsTest] ----
+[jsTest] Query 6: [ { "$match" : { "a" : { "$lte" : 12 } } } ]
+[jsTest] ----
+
+Actual cardinality: 3
+Cardinality estimates:
+heuristic: 5 AbsError: 2, RelError: 0.67 , SelError: 20%
+histogram: 3 AbsError: 0, RelError: 0 , SelError: 0%
+
+
+[jsTest] ----
+[jsTest] Query 7: [ { "$match" : { "a" : { "$lte" : 15 } } } ]
+[jsTest] ----
+
+Actual cardinality: 6
+Cardinality estimates:
+heuristic: 5 AbsError: -1, RelError: -0.17 , SelError: -10%
+histogram: 6 AbsError: 0, RelError: 0 , SelError: 0%
+
+
+[jsTest] ----
+[jsTest] Query 8: [ { "$match" : { "a" : { "$lte" : 18 } } } ]
+[jsTest] ----
+
+Actual cardinality: 9
+Cardinality estimates:
+heuristic: 5 AbsError: -4, RelError: -0.44 , SelError: -40%
+histogram: 9 AbsError: 0, RelError: 0 , SelError: 0%
+
+
+[jsTest] ----
+[jsTest] Query 9: [ { "$match" : { "a" : { "$gt" : 12 } } } ]
+[jsTest] ----
+
+Actual cardinality: 7
+Cardinality estimates:
+heuristic: 7 AbsError: 0, RelError: 0 , SelError: 0%
+histogram: 7 AbsError: 0, RelError: 0 , SelError: 0%
+
+
+[jsTest] ----
+[jsTest] Query 10: [ { "$match" : { "a" : { "$gt" : 15 } } } ]
+[jsTest] ----
+
+Actual cardinality: 4
+Cardinality estimates:
+heuristic: 7 AbsError: 3, RelError: 0.75 , SelError: 30%
+histogram: 4 AbsError: 0, RelError: 0 , SelError: 0%
+
+
+[jsTest] ----
+[jsTest] Query 11: [ { "$match" : { "a" : { "$gt" : 18 } } } ]
+[jsTest] ----
+
+Actual cardinality: 1
+Cardinality estimates:
+heuristic: 7 AbsError: 6, RelError: 6 , SelError: 60%
+histogram: 1 AbsError: 0, RelError: 0 , SelError: 0%
+
+
+[jsTest] ----
+[jsTest] Query 12: [ { "$match" : { "a" : { "$gte" : 12 } } } ]
+[jsTest] ----
+
+Actual cardinality: 8
+Cardinality estimates:
+heuristic: 7 AbsError: -1, RelError: -0.12 , SelError: -10%
+histogram: 8 AbsError: 0, RelError: 0 , SelError: 0%
+
+
+[jsTest] ----
+[jsTest] Query 13: [ { "$match" : { "a" : { "$gte" : 15 } } } ]
+[jsTest] ----
+
+Actual cardinality: 5
+Cardinality estimates:
+heuristic: 7 AbsError: 2, RelError: 0.4 , SelError: 20%
+histogram: 5 AbsError: 0, RelError: 0 , SelError: 0%
+
+
+[jsTest] ----
+[jsTest] Query 14: [ { "$match" : { "a" : { "$gte" : 18 } } } ]
+[jsTest] ----
+
+Actual cardinality: 2
+Cardinality estimates:
+heuristic: 7 AbsError: 5, RelError: 2.5 , SelError: 50%
+histogram: 2 AbsError: 0, RelError: 0 , SelError: 0%
+
+
+[jsTest] ----
+[jsTest] Query 15: [ { "$match" : { "b" : { "$eq" : "" } } } ]
+[jsTest] ----
+
+Actual cardinality: 0
+Cardinality estimates:
+heuristic: 3.16 AbsError: 3.16, RelError: 0.32 , SelError: 31.6%
+histogram: 0 AbsError: 0, RelError: 0 , SelError: 0%
+
+
+[jsTest] ----
+[jsTest] Query 16: [ { "$match" : { "b" : { "$eq" : "mno" } } } ]
+[jsTest] ----
+
+Actual cardinality: 0
+Cardinality estimates:
+heuristic: 3.16 AbsError: 3.16, RelError: 0.32 , SelError: 31.6%
+histogram: 0 AbsError: 0, RelError: 0 , SelError: 0%
+
+
+[jsTest] ----
+[jsTest] Query 17: [ { "$match" : { "b" : { "$eq" : "stuv" } } } ]
+[jsTest] ----
+
+Actual cardinality: 0
+Cardinality estimates:
+heuristic: 3.16 AbsError: 3.16, RelError: 0.32 , SelError: 31.6%
+histogram: 0 AbsError: 0, RelError: 0 , SelError: 0%
+
+
+[jsTest] ----
+[jsTest] Query 18: [ { "$match" : { "b" : { "$lt" : "" } } } ]
+[jsTest] ----
+
+Actual cardinality: 0
+Cardinality estimates:
+heuristic: 0 AbsError: 0, RelError: 0 , SelError: 0%
+histogram: 0 AbsError: 0, RelError: 0 , SelError: 0%
+
+
+[jsTest] ----
+[jsTest] Query 19: [ { "$match" : { "b" : { "$lt" : "mno" } } } ]
+[jsTest] ----
+
+Actual cardinality: 4
+Cardinality estimates:
+heuristic: 5 AbsError: 1, RelError: 0.25 , SelError: 10%
+histogram: 4 AbsError: 0, RelError: 0 , SelError: 0%
+
+
+[jsTest] ----
+[jsTest] Query 20: [ { "$match" : { "b" : { "$lt" : "stuv" } } } ]
+[jsTest] ----
+
+Actual cardinality: 7
+Cardinality estimates:
+heuristic: 5 AbsError: -2, RelError: -0.29 , SelError: -20%
+histogram: 7 AbsError: 0, RelError: 0 , SelError: 0%
+
+
+[jsTest] ----
+[jsTest] Query 21: [ { "$match" : { "b" : { "$lte" : "" } } } ]
+[jsTest] ----
+
+Actual cardinality: 0
+Cardinality estimates:
+heuristic: 3.16 AbsError: 3.16, RelError: 0.32 , SelError: 31.6%
+histogram: 0 AbsError: 0, RelError: 0 , SelError: 0%
+
+
+[jsTest] ----
+[jsTest] Query 22: [ { "$match" : { "b" : { "$lte" : "mno" } } } ]
+[jsTest] ----
+
+Actual cardinality: 4
+Cardinality estimates:
+heuristic: 5 AbsError: 1, RelError: 0.25 , SelError: 10%
+histogram: 4 AbsError: 0, RelError: 0 , SelError: 0%
+
+
+[jsTest] ----
+[jsTest] Query 23: [ { "$match" : { "b" : { "$lte" : "stuv" } } } ]
+[jsTest] ----
+
+Actual cardinality: 7
+Cardinality estimates:
+heuristic: 5 AbsError: -2, RelError: -0.29 , SelError: -20%
+histogram: 7 AbsError: 0, RelError: 0 , SelError: 0%
+
+
+[jsTest] ----
+[jsTest] Query 24: [ { "$match" : { "b" : { "$gt" : "" } } } ]
+[jsTest] ----
+
+Actual cardinality: 10
+Cardinality estimates:
+heuristic: 7 AbsError: -3, RelError: -0.3 , SelError: -30%
+histogram: 10 AbsError: 0, RelError: 0 , SelError: 0%
+
+
+[jsTest] ----
+[jsTest] Query 25: [ { "$match" : { "b" : { "$gt" : "mno" } } } ]
+[jsTest] ----
+
+Actual cardinality: 6
+Cardinality estimates:
+heuristic: 7 AbsError: 1, RelError: 0.17 , SelError: 10%
+histogram: 6 AbsError: 0, RelError: 0 , SelError: 0%
+
+
+[jsTest] ----
+[jsTest] Query 26: [ { "$match" : { "b" : { "$gt" : "stuv" } } } ]
+[jsTest] ----
+
+Actual cardinality: 3
+Cardinality estimates:
+heuristic: 7 AbsError: 4, RelError: 1.33 , SelError: 40%
+histogram: 3 AbsError: 0, RelError: 0 , SelError: 0%
+
+
+[jsTest] ----
+[jsTest] Query 27: [ { "$match" : { "b" : { "$gte" : "" } } } ]
+[jsTest] ----
+
+Actual cardinality: 10
+Cardinality estimates:
+heuristic: 7 AbsError: -3, RelError: -0.3 , SelError: -30%
+histogram: 10 AbsError: 0, RelError: 0 , SelError: 0%
+
+
+[jsTest] ----
+[jsTest] Query 28: [ { "$match" : { "b" : { "$gte" : "mno" } } } ]
+[jsTest] ----
+
+Actual cardinality: 6
+Cardinality estimates:
+heuristic: 7 AbsError: 1, RelError: 0.17 , SelError: 10%
+histogram: 6 AbsError: 0, RelError: 0 , SelError: 0%
+
+
+[jsTest] ----
+[jsTest] Query 29: [ { "$match" : { "b" : { "$gte" : "stuv" } } } ]
+[jsTest] ----
+
+Actual cardinality: 3
+Cardinality estimates:
+heuristic: 7 AbsError: 4, RelError: 1.33 , SelError: 40%
+histogram: 3 AbsError: 0, RelError: 0 , SelError: 0%
+
+
+[jsTest] ----
+[jsTest] Query 30: [ { "$match" : { "c" : { "$gt" : 2, "$lt" : 4 } } } ]
+[jsTest] ----
+
+Actual cardinality: 2
+Cardinality estimates:
+heuristic: 4.18 AbsError: 2.18, RelError: 1.09 , SelError: 21.8%
+histogram: 2.85 AbsError: 0.85, RelError: 0.43 , SelError: 8.5%
+
+
+[jsTest] ----
+[jsTest] Query 31: [ { "$match" : { "c" : { "$elemMatch" : { "$gt" : 2, "$lt" : 4 } } } } ]
+[jsTest] ----
+
+Actual cardinality: 2
+Cardinality estimates:
+heuristic: 4.18 AbsError: 2.18, RelError: 1.09 , SelError: 21.8%
+histogram: 1.65 AbsError: -0.35, RelError: -0.18 , SelError: -3.5%
+
+
+[jsTest] ----
+[jsTest] Query 32: [ { "$match" : { "c" : { "$gt" : 2, "$lte" : 4 } } } ]
+[jsTest] ----
+
+Actual cardinality: 3
+Cardinality estimates:
+heuristic: 4.18 AbsError: 1.18, RelError: 0.39 , SelError: 11.8%
+histogram: 3.79 AbsError: 0.79, RelError: 0.26 , SelError: 7.9%
+
+
+[jsTest] ----
+[jsTest] Query 33: [ { "$match" : { "c" : { "$elemMatch" : { "$gt" : 2, "$lte" : 4 } } } } ]
+[jsTest] ----
+
+Actual cardinality: 3
+Cardinality estimates:
+heuristic: 4.18 AbsError: 1.18, RelError: 0.39 , SelError: 11.8%
+histogram: 4.13 AbsError: 1.13, RelError: 0.38 , SelError: 11.3%
+
+
+[jsTest] ----
+[jsTest] Query 34: [ { "$match" : { "c" : { "$gt" : 2, "$eq" : 4 } } } ]
+[jsTest] ----
+
+Actual cardinality: 3
+Cardinality estimates:
+heuristic: 2.65 AbsError: -0.35, RelError: -0.12 , SelError: -3.5%
+histogram: 2.85 AbsError: -0.15, RelError: -0.05 , SelError: -1.5%
+
+
+[jsTest] ----
+[jsTest] Query 35: [ { "$match" : { "c" : { "$elemMatch" : { "$gt" : 2, "$eq" : 4 } } } } ]
+[jsTest] ----
+
+Actual cardinality: 3
+Cardinality estimates:
+heuristic: 2.65 AbsError: -0.35, RelError: -0.12 , SelError: -3.5%
+histogram: 3 AbsError: 0, RelError: 0 , SelError: 0%
+
+
+[jsTest] ----
+[jsTest] Query 36: [ { "$match" : { "c" : { "$gte" : 2, "$lt" : 4 } } } ]
+[jsTest] ----
+
+Actual cardinality: 2
+Cardinality estimates:
+heuristic: 4.18 AbsError: 2.18, RelError: 1.09 , SelError: 21.8%
+histogram: 2.85 AbsError: 0.85, RelError: 0.43 , SelError: 8.5%
+
+
+[jsTest] ----
+[jsTest] Query 37: [ { "$match" : { "c" : { "$elemMatch" : { "$gte" : 2, "$lt" : 4 } } } } ]
+[jsTest] ----
+
+Actual cardinality: 2
+Cardinality estimates:
+heuristic: 4.18 AbsError: 2.18, RelError: 1.09 , SelError: 21.8%
+histogram: 2.48 AbsError: 0.48, RelError: 0.24 , SelError: 4.8%
+
+
+[jsTest] ----
+[jsTest] Query 38: [ { "$match" : { "c" : { "$gte" : 2, "$lte" : 4 } } } ]
+[jsTest] ----
+
+Actual cardinality: 3
+Cardinality estimates:
+heuristic: 4.18 AbsError: 1.18, RelError: 0.39 , SelError: 11.8%
+histogram: 3.79 AbsError: 0.79, RelError: 0.26 , SelError: 7.9%
+
+
+[jsTest] ----
+[jsTest] Query 39: [ { "$match" : { "c" : { "$elemMatch" : { "$gte" : 2, "$lte" : 4 } } } } ]
+[jsTest] ----
+
+Actual cardinality: 3
+Cardinality estimates:
+heuristic: 4.18 AbsError: 1.18, RelError: 0.39 , SelError: 11.8%
+histogram: 4.96 AbsError: 1.96, RelError: 0.65 , SelError: 19.6%
+
+
+[jsTest] ----
+[jsTest] Query 40: [ { "$match" : { "c" : { "$gte" : 2, "$eq" : 4 } } } ]
+[jsTest] ----
+
+Actual cardinality: 3
+Cardinality estimates:
+heuristic: 2.65 AbsError: -0.35, RelError: -0.12 , SelError: -3.5%
+histogram: 2.85 AbsError: -0.15, RelError: -0.05 , SelError: -1.5%
+
+
+[jsTest] ----
+[jsTest] Query 41: [ { "$match" : { "c" : { "$elemMatch" : { "$gte" : 2, "$eq" : 4 } } } } ]
+[jsTest] ----
+
+Actual cardinality: 3
+Cardinality estimates:
+heuristic: 2.65 AbsError: -0.35, RelError: -0.12 , SelError: -3.5%
+histogram: 3 AbsError: 0, RelError: 0 , SelError: 0%
+
+
+[jsTest] ----
+[jsTest] Query 42: [ { "$match" : { "c" : { "$eq" : 2, "$lt" : 4 } } } ]
+[jsTest] ----
+
+Actual cardinality: 1
+Cardinality estimates:
+heuristic: 2.24 AbsError: 1.24, RelError: 1.24 , SelError: 12.4%
+histogram: 0.55 AbsError: -0.45, RelError: -0.45 , SelError: -4.5%
+
+
+[jsTest] ----
+[jsTest] Query 43: [ { "$match" : { "c" : { "$elemMatch" : { "$eq" : 2, "$lt" : 4 } } } } ]
+[jsTest] ----
+
+Actual cardinality: 1
+Cardinality estimates:
+heuristic: 2.65 AbsError: 1.65, RelError: 1.65 , SelError: 16.5%
+histogram: 1 AbsError: 0, RelError: 0 , SelError: 0%
+
+
+[jsTest] ----
+[jsTest] Query 44: [ { "$match" : { "c" : { "$eq" : 2, "$lte" : 4 } } } ]
+[jsTest] ----
+
+Actual cardinality: 1
+Cardinality estimates:
+heuristic: 2.24 AbsError: 1.24, RelError: 1.24 , SelError: 12.4%
+histogram: 0.63 AbsError: -0.37, RelError: -0.37 , SelError: -3.7%
+
+
+[jsTest] ----
+[jsTest] Query 45: [ { "$match" : { "c" : { "$elemMatch" : { "$eq" : 2, "$lte" : 4 } } } } ]
+[jsTest] ----
+
+Actual cardinality: 1
+Cardinality estimates:
+heuristic: 2.65 AbsError: 1.65, RelError: 1.65 , SelError: 16.5%
+histogram: 1 AbsError: 0, RelError: 0 , SelError: 0%
+
+Heuristic mean errors: absRMSE: 2.394, relRMSE: 1.306, meanAbsSelErr: 19.943%
+Histogram mean errors: absRMSE: 0.431, relRMSE: 0.181, meanAbsSelErr: 1.809%
diff --git a/jstests/query_golden/libs/ce_data.js b/jstests/query_golden/libs/ce_data.js
new file mode 100644
index 00000000000..c59a6af58a9
--- /dev/null
+++ b/jstests/query_golden/libs/ce_data.js
@@ -0,0 +1,35 @@
+// Small data generator for the purpose of developing the test framework.
+
+const alphabet = "abcdefghijklmnopqrstuvwxyz";
+const len = alphabet.length;
+
+// Returns pseudo-random string where the symbols and the length are functions of the parameter n.
+function genRandomString(n) {
+    let strLen = n % 4 + 1;
+    let str = "";
+    let i = 0;
+    while (i < strLen) {
+        let idx = (100 * n + i++) % len;
+        str = str + alphabet[idx];
+    }
+    return str;
+}
+
+const seedArray = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
+const arrLen = seedArray.length;
+
+// Returns pseudo-random array where the elements and the length are functions of the parameter n.
+function genRandomArray(n) {
+    let aLen = (7 * n) % 5 + 1;
+    let start = (13 * n) % arrLen;
+    return seedArray.slice(start, start + aLen);
+}
+
+/**
+ * Returns documents for cardinality estimation tests.
+ */
+
+function getCEDocs() {
+    return Array.from({length: 10},
+                      (_, i) => ({_id: i, a: i + 10, b: genRandomString(i), c: genRandomArray(i)}));
+}
diff --git a/jstests/query_golden/libs/compute_errors.js b/jstests/query_golden/libs/compute_errors.js
new file mode 100644
index 00000000000..4443376217f
--- /dev/null
+++ b/jstests/query_golden/libs/compute_errors.js
@@ -0,0 +1,29 @@
+// Given a collection containing errors for individual queries, compute root-mean square error.
+function computeRMSE(errorColl, errorField, collSize) {
+    const res =
+        errorColl
+            .aggregate([
+                {$project: {error2: {$pow: [errorField, 2]}}},
+                {$group: {_id: null, cumError: {$sum: "$error2"}}},
+                {
+                    $project:
+                        {_id: 0, "rmse": {$round: [{$sqrt: {$divide: ["$cumError", collSize]}}, 3]}}
+                }
+            ])
+            .toArray();
+    return res[0].rmse;
+}
+
+// Given a collection with a field with selectivity errors, compute the mean absolute selectivity
+// error.
+function computeMeanAbsSelError(errorColl, errorField, collSize) {
+    const res =
+        errorColl
+            .aggregate([
+                {$project: {absError: {$abs: errorField}}},
+                {$group: {_id: null, cumError: {$sum: "$absError"}}},
+                {$project: {_id: 0, "meanErr": {$round: [{$divide: ["$cumError", collSize]}, 3]}}}
+            ])
+            .toArray();
+    return res[0].meanErr;
+}
diff --git a/jstests/query_golden/libs/generate_queries.js b/jstests/query_golden/libs/generate_queries.js
new file mode 100644
index 00000000000..7129ed64869
--- /dev/null
+++ b/jstests/query_golden/libs/generate_queries.js
@@ -0,0 +1,74 @@
+/**
+ * Helper functions for generating of queries over a collection.
+ */
+
+function makeMatchPredicate(field, boundary, compOp) {
+    return {"$match": {[field]: {[compOp]: boundary}}};
+}
+
+function makeRangePredicate(field, op1, bound1, op2, bound2, isElemMatch = false) {
+    if (isElemMatch) {
+        return {"$match": {[field]: {"$elemMatch": {[op1]: bound1, [op2]: bound2}}}};
+    }
+    return {"$match": {[field]: {[op1]: bound1, [op2]: bound2}}};
+}
+
+function generateComparisons(field, boundaries) {
+    let predicates = [];
+    const compOps = ["$eq", "$lt", "$lte", "$gt", "$gte"];
+    for (const op of compOps) {
+        boundaries.forEach(function(boundary) {
+            predicates.push(makeMatchPredicate(field, boundary, op));
+        });
+    }
+    let docs = [];
+    for (const pred of predicates) {
+        let doc = {"pipeline": [pred]};
+        docs.push(doc);
+    }
+    return docs;
+}
+
+// Generate range predicates with $match and $elemMatch.
+// boundaries:[ [low1, high1], ...]
+function generateRangePredicates(field, boundaries) {
+    let predicates = [];
+    const op1Option = ["$gt", "$gte", "$eq"];
+    const op2Option = ["$lt", "$lte", "$eq"];
+    boundaries.forEach(function(boundary) {
+        assert(boundary.length == 2);
+        for (let op1 of op1Option) {
+            for (let op2 of op2Option) {
+                if (op1 == "$eq" && op2 == "$eq") {
+                    continue;
+                }
+                predicates.push(makeRangePredicate(field, op1, boundary[0], op2, boundary[1]));
+                if (boundary[0] <= boundary[1]) {
+                    predicates.push(
+                        makeRangePredicate(field, op1, boundary[0], op2, boundary[1], true));
+                }
+            }
+        }
+    });
+
+    let docs = [];
+    for (let pred of predicates) {
+        let doc = {"pipeline": [pred]};
+        docs.push(doc);
+    }
+    return docs;
+}
+
+// Helper function to extract a number of values from a collection field, to be used for query
+// generation.
+function selectQueryValues(coll, field) {
+    let values = [];
+    const collSize = coll.find().itcount();
+    let positions = [2, collSize / 2, collSize - 2];
+
+    for (const pos of positions) {
+        const res = coll.find({}, {"_id": 0, [field]: 1}).toArray()[pos];
+        values.push(res[field]);
+    }
+    return values;
+}
author	Milena Ivanova <milena.ivanova@mongodb.com>	2022-12-12 10:54:53 +0000
committer	Evergreen Agent <no-reply@evergreen.mongodb.com>	2022-12-12 11:28:43 +0000
commit	21aba5b624e0371869a453f89ad1b3711ee720b0 (patch)
tree	9d9dd03c7ecf453cf51310282287a760e8eccd91
parent	20913e4b839e108613a711b60698bc515eb2b2ac (diff)
download	mongo-21aba5b624e0371869a453f89ad1b3711ee720b0.tar.gz