summaryrefslogtreecommitdiff
path: root/jstests
diff options
context:
space:
mode:
authorAlya Berciu <alya.berciu@mongodb.com>2022-12-02 09:21:40 +0000
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2022-12-02 10:43:32 +0000
commit0465129215092c9c7fdf1a168cb56bc1cdefb855 (patch)
tree47aaf241a397b6996aa9edbbb567aab14efd8c7c /jstests
parent10d13865c7429a4da344cdcadc661268429624d2 (diff)
downloadmongo-0465129215092c9c7fdf1a168cb56bc1cdefb855.tar.gz
SERVER-71205 Fix histogram type to double conversion
Diffstat (limited to 'jstests')
-rw-r--r--jstests/cqf/analyze/ce_histogram.js133
-rw-r--r--jstests/libs/ce_stats_utils.js2
2 files changed, 81 insertions, 54 deletions
diff --git a/jstests/cqf/analyze/ce_histogram.js b/jstests/cqf/analyze/ce_histogram.js
index fb72d50a456..c9f2691b887 100644
--- a/jstests/cqf/analyze/ce_histogram.js
+++ b/jstests/cqf/analyze/ce_histogram.js
@@ -15,11 +15,29 @@
load('jstests/libs/ce_stats_utils.js');
+const charCodeA = 65;
const collName = "ce_histogram";
-const fields = ["int", "dbl", "str", "date"];
+const fields = ["int", "dbl", "dec", "strS", "strB", "date", "ts", "oid"];
let _id;
+// Helper mapping ints to one-character strings that sort lexicographically in int order.
+// A small note: the ordering of string bounds (lexicographical) is different than that of int
+// bounds. In order to simplify the histogram validation logic, we don't want to have to account for
+// the fact that in the resulting histogram, string bounds will be sorted differently than int
+// bounds. To illustrate this, if we were to use the format `${val}` to generate strings, we would
+// generate a different order of bounds for the histogram on the string fields and the histogram on
+// the int fields, because 2 < 10, but "10" < "2". This test relies on all field values being sorted
+// the same way in the bounds.
+function lexint(val) {
+ return String.fromCharCode(charCodeA - 1 + val); // Note: val >= 1.
+}
+
+// Helper to generate hex strings for ints from 1-10. These are trivially lexicographically sorted.
+function lexhex(val) {
+ return val > 9 ? String.fromCharCode((val - 10) + charCodeA) : `${val}`;
+}
+
/**
* Generates 'val' documents where each document has a distinct value for each 'field' in 'fields'.
*/
@@ -28,14 +46,15 @@ function generateDocs(val) {
const fields = {
int: NumberInt(val), // Necessary to cast, otherwise we get a double here.
dbl: val + 0.1,
- // A small note: the ordering of string bounds (lexicographical) is different than that of
- // int bounds. In order to simplify the histogram validation logic, we don't want to have to
- // account for the fact that string bounds will be sorted differently than int bounds. To
- // illustrate this, if we were to use the format `string_${val}`, the string corresponding
- // to value 10 would be the second entry in the histogram bounds array, even though it would
- // be generated for 'val' = 10, not 'val' = 2.
- str: `string_${String.fromCharCode(64 + val)}`,
- date: new Date(`02 December ${val + 2000}`)
+ dec: NumberDecimal(val + 0.1),
+ strS: `${lexint(val)}`,
+ strB: `string_${lexint(val)}`,
+ date: new Date(`02 December ${val + 2000}`),
+ ts: new Timestamp(val, 1),
+ // Object Ids are represented by 12 bytes (24 hex digits). We want to ensure the ordering
+ // matches the ordering of the other fields for the purposes of this test. As a result, we set
+ // the 4th most significant hex digit to a lexicographically increasing hexadecimal value.
+ oid: new ObjectId(`000${lexhex(val)}00000000000000000000`),
};
for (let i = 0; i < val; i++) {
docs.push(Object.assign({_id}, fields));
@@ -53,15 +72,39 @@ function getTypeName(field) {
return "NumberInt32";
case "dbl":
return "NumberDouble";
- case "str":
+ case "dec":
+ return "NumberDecimal";
+ case "strS":
+ return "StringSmall";
+ case "strB":
return "StringBig";
case "date":
return "Date";
+ case "ts":
+ return "Timestamp";
+ case "oid":
+ return "ObjectId";
default:
assert(false, `Name mapping for ${field} not defined.`);
}
}
+function isSubset(s1, s2) {
+ for (const e of s1) {
+ if (!s2.has(e)) {
+ return false;
+ }
+ }
+ return true;
+}
+
+function sameTypeClass(field, documentField) {
+ const numTypes = new Set(["dbl", "dec", "int"]);
+ const strTypes = new Set(["strS", "strB"]);
+ const fields = new Set([field, documentField]);
+ return isSubset(fields, numTypes) || isSubset(fields, strTypes);
+}
+
/**
* This is the main testing function. Note that the input value 'ndv' corresponds to both the number
* of distinct values per type in 'fields', as well as the number of buckets in each histogram
@@ -98,6 +141,7 @@ function verifyCEForNDV(ndv) {
let allDocs = [];
for (let val = 1; val <= ndv; val++) {
let docs = generateDocs(val);
+ jsTestLog(tojson(docs));
assert.commandWorked(coll.insertMany(docs));
// Small hack; when we insert a doc, we want to insert it as a NumberInt so that the
@@ -124,11 +168,29 @@ function verifyCEForNDV(ndv) {
allDocs = allDocs.concat(docs);
}
+ // Create histograms for all fields.
for (const field of fields) {
createAndValidateHistogram({coll, expectedHistogram: expectedHistograms[field]});
- forceHistogramCE();
+ }
+
+ jsTestLog(tojson(db.system.statistics[collName].find().toArray()));
+
+ const doc0 = {
+ int: 0,
+ dbl: 0.0,
+ dec: NumberDecimal(0.0),
+ strS: "",
+ strB: "",
+ date: new Date(`02 December ${2000}`),
+ ts: new Timestamp(0, 1),
+ oid: new ObjectId("000000000000000000000000")
+ };
+
+ // Verify CE for all distinct values of each field across multiple types.
+ forceHistogramCE();
+ for (const field of fields) {
+ jsTestLog(`Testing histogram for ndv ${ndv} and field ${field}`);
- // Verify CE for all distinct values of each field across multiple types.
let count = 0;
const hint = {[field]: 1};
for (let val = 1; val <= ndv; val++) {
@@ -154,47 +216,13 @@ function verifyCEForNDV(ndv) {
verifyCEForMatch(
{coll, predicate: {[field]: {$gte: fieldVal}}, hint, expected: docsGte});
- } else if (field == "int" && documentField == "dbl") {
- // Each distinct double value corresponds to an int value + 0.1, so we shouldn't
- // get any equality matches.
- verifyCEForMatch({coll, predicate: {[field]: fieldVal}, hint, expected: []});
-
- // When we have a predicate ~ < val + 0.1 or <= val + 0.1, it should match all
- // integers <= val.
- verifyCEForMatch(
- {coll, predicate: {[field]: {$lt: fieldVal}}, hint, expected: docsLte});
- verifyCEForMatch(
- {coll, predicate: {[field]: {$lte: fieldVal}}, hint, expected: docsLte});
-
- // When we have a predicate ~ > val + 0.1 or >= val + 0.1, it should match all
- // integers > val.
- verifyCEForMatch(
- {coll, predicate: {[field]: {$gt: fieldVal}}, hint, expected: docsGt});
- verifyCEForMatch(
- {coll, predicate: {[field]: {$gte: fieldVal}}, hint, expected: docsGt});
-
- } else if (field == "dbl" && documentField == "int") {
- // Each distinct double value corresponds to an int value + 0.1, so we shouldn't
- // get any equality matches.
- verifyCEForMatch({coll, predicate: {[field]: fieldVal}, hint, expected: []});
-
- // When we have a predicate ~ < val - 0.1 or <= val - 0.1, it should match all
- // doubles < val.
- verifyCEForMatch(
- {coll, predicate: {[field]: {$lt: fieldVal}}, hint, expected: docsLt});
- verifyCEForMatch(
- {coll, predicate: {[field]: {$lte: fieldVal}}, hint, expected: docsLt});
-
- // When we have a predicate ~ > val - 0.1 or >= val - 0.1, it should match all
- // doubles >= val.
- verifyCEForMatch(
- {coll, predicate: {[field]: {$gt: fieldVal}}, hint, expected: docsGte});
- verifyCEForMatch(
- {coll, predicate: {[field]: {$gte: fieldVal}}, hint, expected: docsGte});
+ } else if (sameTypeClass(field, documentField)) {
+ // Skip this estimation; we have already tested range predicates for this type.
+ continue;
} else {
- // Verify that we obtain a CE of 0 for types other than the 'field' type when at
- // least one type is not numeric.
+ // Verify that we obtain a CE of 0 for types outside the 'field' type's
+ // type-class.
const expected = [];
verifyCEForMatch({coll, predicate: {[field]: fieldVal}, hint, expected});
verifyCEForMatch({coll, predicate: {[field]: {$lt: fieldVal}}, hint, expected});
@@ -210,10 +238,9 @@ function verifyCEForNDV(ndv) {
}
// Verify CE for values outside the range of distinct values for each field.
- const docLow = {int: 0, dbl: 0.0, str: `string_0`, date: new Date(`02 December ${2000}`)};
const docHigh = generateDocs(ndv + 1)[0];
const expected = [];
- verifyCEForMatch({coll, predicate: {[field]: docLow[field]}, hint, expected});
+ verifyCEForMatch({coll, predicate: {[field]: doc0[field]}, hint, expected});
verifyCEForMatch({coll, predicate: {[field]: docHigh[field]}, hint, expected});
}
}
diff --git a/jstests/libs/ce_stats_utils.js b/jstests/libs/ce_stats_utils.js
index b664ec265ef..fdef187007a 100644
--- a/jstests/libs/ce_stats_utils.js
+++ b/jstests/libs/ce_stats_utils.js
@@ -35,6 +35,7 @@ function assertApproxEq(expected, actual, msg, tolerance = 0.01) {
* returns.
*/
function verifyCEForMatch({coll, predicate, expected, ce, hint}) {
+ jsTestLog(`Verify CE for match ${tojson(predicate)}`);
const CEs = ce ? [ce] : undefined;
return verifyCEForMatchNodes(
{coll, predicate, expected, getNodeCEs: (explain) => [getRootCE(explain)], CEs, hint});
@@ -78,7 +79,6 @@ function verifyCEForMatchNodes({coll, predicate, expected, getNodeCEs, CEs, hint
function createAndValidateHistogram({coll, expectedHistogram, empty = false}) {
const field = expectedHistogram._id;
const stats = db.system.statistics[coll.getName()];
- stats.drop();
// We can't use forceBonsai here because the new optimizer doesn't know how to handle the
// analyze command.