SERVER-71205 Fix histogram type to double conversion

author: Alya Berciu <alya.berciu@mongodb.com> 2022-12-02 09:21:40 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2022-12-02 10:43:32 +0000
commit: 0465129215092c9c7fdf1a168cb56bc1cdefb855 (patch)
tree: 47aaf241a397b6996aa9edbbb567aab14efd8c7c /jstests/cqf
parent: 10d13865c7429a4da344cdcadc661268429624d2 (diff)
download: mongo-0465129215092c9c7fdf1a168cb56bc1cdefb855.tar.gz
1 files changed, 80 insertions, 53 deletions
diff --git a/jstests/cqf/analyze/ce_histogram.js b/jstests/cqf/analyze/ce_histogram.js
index fb72d50a456..c9f2691b887 100644
--- a/jstests/cqf/analyze/ce_histogram.js
+++ b/jstests/cqf/analyze/ce_histogram.js
@@ -15,11 +15,29 @@
 
 load('jstests/libs/ce_stats_utils.js');
 
+const charCodeA = 65;
 const collName = "ce_histogram";
-const fields = ["int", "dbl", "str", "date"];
+const fields = ["int", "dbl", "dec", "strS", "strB", "date", "ts", "oid"];
 
 let _id;
 
+// Helper to sort ints lexicographically.
+// A small note: the ordering of string bounds (lexicographical) is different than that of int
+// bounds. In order to simplify the histogram validation logic, we don't want to have to account for
+// the fact that in the resulting histogram, string bounds will be sorted differently than int
+// bounds. To illustrate this, if we were to use the format `${val}` to generate strings, we would
+// generate a different order of bounds for the histogram on the string fields and the histogram on
+// the int fields, because 2 < 10, but "10" < "2". This test relies on all field values being sorted
+// the same way in the bounds.
+function lexint(val) {
+    return String.fromCharCode(charCodeA - 1 + val);  // Note: val >= 1.
+}
+
+// Helper to generate hex strings for ints from 1-10. These are trivially lexicographically sorted.
+function lexhex(val) {
+    return val > 9 ? String.fromCharCode((val - 10) + charCodeA) : `${val}`;
+}
+
 /**
  * Generates 'val' documents where each document has a distinct value for each 'field' in 'fields'.
  */
@@ -28,14 +46,15 @@ function generateDocs(val) {
     const fields = {
         int: NumberInt(val),  // Necessary to cast, otherwise we get a double here.
         dbl: val + 0.1,
-        // A small note: the ordering of string bounds (lexicographical) is different than that of
-        // int bounds. In order to simplify the histogram validation logic, we don't want to have to
-        // account for the fact that string bounds will be sorted differently than int bounds. To
-        // illustrate this, if we were to use the format `string_${val}`, the string corresponding
-        // to value 10 would be the second entry in the histogram bounds array, even though it would
-        // be generated for 'val' = 10, not 'val' = 2.
-        str: `string_${String.fromCharCode(64 + val)}`,
-        date: new Date(`02 December ${val + 2000}`)
+        dec: NumberDecimal(val + 0.1),
+        strS: `${lexint(val)}`,
+        strB: `string_${lexint(val)}`,
+        date: new Date(`02 December ${val + 2000}`),
+        ts: new Timestamp(val, 1),
+        // ObjectIds are 12 bytes, written here as 24 hex digits. We want their ordering to match
+        // the ordering of the other fields for the purposes of this test. As a result, we set the
+        // 4th most significant hex digit to a lexicographically increasing hexadecimal value.
+        oid: new ObjectId(`000${lexhex(val)}00000000000000000000`),
     };
     for (let i = 0; i < val; i++) {
         docs.push(Object.assign({_id}, fields));
@@ -53,15 +72,39 @@ function getTypeName(field) {
             return "NumberInt32";
         case "dbl":
             return "NumberDouble";
-        case "str":
+        case "dec":
+            return "NumberDecimal";
+        case "strS":
+            return "StringSmall";
+        case "strB":
             return "StringBig";
         case "date":
             return "Date";
+        case "ts":
+            return "Timestamp";
+        case "oid":
+            return "ObjectId";
         default:
             assert(false, `Name mapping for ${field} not defined.`);
     }
 }
 
+function isSubset(s1, s2) {
+    for (const e of s1) {
+        if (!s2.has(e)) {
+            return false;
+        }
+    }
+    return true;
+}
+
+function sameTypeClass(field, documentField) {
+    const numTypes = new Set(["dbl", "dec", "int"]);
+    const strTypes = new Set(["strS", "strB"]);
+    const fields = new Set([field, documentField]);
+    return isSubset(fields, numTypes) || isSubset(fields, strTypes);
+}
+
 /**
  * This is the main testing function. Note that the input value 'ndv' corresponds to both the number
  * of distinct values per type in 'fields', as well as the number of buckets in each histogram
@@ -98,6 +141,7 @@ function verifyCEForNDV(ndv) {
     let allDocs = [];
     for (let val = 1; val <= ndv; val++) {
         let docs = generateDocs(val);
+        jsTestLog(tojson(docs));
         assert.commandWorked(coll.insertMany(docs));
 
         // Small hack; when we insert a doc, we want to insert it as a NumberInt so that the
@@ -124,11 +168,29 @@ function verifyCEForNDV(ndv) {
         allDocs = allDocs.concat(docs);
     }
 
+    // Create histograms for all fields.
     for (const field of fields) {
         createAndValidateHistogram({coll, expectedHistogram: expectedHistograms[field]});
-        forceHistogramCE();
+    }
+
+    jsTestLog(tojson(db.system.statistics[collName].find().toArray()));
+
+    const doc0 = {
+        int: 0,
+        dbl: 0.0,
+        dec: NumberDecimal(0.0),
+        strS: "",
+        strB: "",
+        date: new Date(`02 December ${2000}`),
+        ts: new Timestamp(0, 1),
+        oid: new ObjectId("000000000000000000000000")
+    };
+
+    // Verify CE for all distinct values of each field across multiple types.
+    forceHistogramCE();
+    for (const field of fields) {
+        jsTestLog(`Testing histogram for ndv ${ndv} and field ${field}`);
 
-        // Verify CE for all distinct values of each field across multiple types.
         let count = 0;
         const hint = {[field]: 1};
         for (let val = 1; val <= ndv; val++) {
@@ -154,47 +216,13 @@ function verifyCEForNDV(ndv) {
                     verifyCEForMatch(
                         {coll, predicate: {[field]: {$gte: fieldVal}}, hint, expected: docsGte});
 
-                } else if (field == "int" && documentField == "dbl") {
-                    // Each distinct double value corresponds to an int value + 0.1, so we shouldn't
-                    // get any equality matches.
-                    verifyCEForMatch({coll, predicate: {[field]: fieldVal}, hint, expected: []});
-
-                    // When we have a predicate ~ < val + 0.1 or <= val + 0.1, it should match all
-                    // integers <= val.
-                    verifyCEForMatch(
-                        {coll, predicate: {[field]: {$lt: fieldVal}}, hint, expected: docsLte});
-                    verifyCEForMatch(
-                        {coll, predicate: {[field]: {$lte: fieldVal}}, hint, expected: docsLte});
-
-                    // When we have a predicate ~ > val + 0.1 or >= val + 0.1, it should match all
-                    // integers > val.
-                    verifyCEForMatch(
-                        {coll, predicate: {[field]: {$gt: fieldVal}}, hint, expected: docsGt});
-                    verifyCEForMatch(
-                        {coll, predicate: {[field]: {$gte: fieldVal}}, hint, expected: docsGt});
-
-                } else if (field == "dbl" && documentField == "int") {
-                    // Each distinct double value corresponds to an int value + 0.1, so we shouldn't
-                    // get any equality matches.
-                    verifyCEForMatch({coll, predicate: {[field]: fieldVal}, hint, expected: []});
-
-                    // When we have a predicate ~ < val - 0.1 or <= val - 0.1, it should match all
-                    // doubles < val.
-                    verifyCEForMatch(
-                        {coll, predicate: {[field]: {$lt: fieldVal}}, hint, expected: docsLt});
-                    verifyCEForMatch(
-                        {coll, predicate: {[field]: {$lte: fieldVal}}, hint, expected: docsLt});
-
-                    // When we have a predicate ~ > val - 0.1 or >= val - 0.1, it should match all
-                    // doubles >= val.
-                    verifyCEForMatch(
-                        {coll, predicate: {[field]: {$gt: fieldVal}}, hint, expected: docsGte});
-                    verifyCEForMatch(
-                        {coll, predicate: {[field]: {$gte: fieldVal}}, hint, expected: docsGte});
+                } else if (sameTypeClass(field, documentField)) {
+                    // Skip this estimation; we have already tested range predicates for this type.
+                    continue;
 
                 } else {
-                    // Verify that we obtain a CE of 0 for types other than the 'field' type when at
-                    // least one type is not numeric.
+                    // Verify that we obtain a CE of 0 for types outside the 'field' type's
+                    // type-class.
                     const expected = [];
                     verifyCEForMatch({coll, predicate: {[field]: fieldVal}, hint, expected});
                     verifyCEForMatch({coll, predicate: {[field]: {$lt: fieldVal}}, hint, expected});
@@ -210,10 +238,9 @@ function verifyCEForNDV(ndv) {
         }
 
         // Verify CE for values outside the range of distinct values for each field.
-        const docLow = {int: 0, dbl: 0.0, str: `string_0`, date: new Date(`02 December ${2000}`)};
         const docHigh = generateDocs(ndv + 1)[0];
         const expected = [];
-        verifyCEForMatch({coll, predicate: {[field]: docLow[field]}, hint, expected});
+        verifyCEForMatch({coll, predicate: {[field]: doc0[field]}, hint, expected});
         verifyCEForMatch({coll, predicate: {[field]: docHigh[field]}, hint, expected});
     }
 }
author	Alya Berciu <alya.berciu@mongodb.com>	2022-12-02 09:21:40 +0000
committer	Evergreen Agent <no-reply@evergreen.mongodb.com>	2022-12-02 10:43:32 +0000
commit	0465129215092c9c7fdf1a168cb56bc1cdefb855 (patch)
tree	47aaf241a397b6996aa9edbbb567aab14efd8c7c /jstests/cqf
parent	10d13865c7429a4da344cdcadc661268429624d2 (diff)
download	mongo-0465129215092c9c7fdf1a168cb56bc1cdefb855.tar.gz