diff options
author | Alya Berciu <alya.berciu@mongodb.com> | 2022-12-02 09:21:40 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2022-12-02 10:43:32 +0000 |
commit | 0465129215092c9c7fdf1a168cb56bc1cdefb855 (patch) | |
tree | 47aaf241a397b6996aa9edbbb567aab14efd8c7c /jstests | |
parent | 10d13865c7429a4da344cdcadc661268429624d2 (diff) | |
download | mongo-0465129215092c9c7fdf1a168cb56bc1cdefb855.tar.gz |
SERVER-71205 Fix histogram type to double conversion
Diffstat (limited to 'jstests')
-rw-r--r-- | jstests/cqf/analyze/ce_histogram.js | 133 | ||||
-rw-r--r-- | jstests/libs/ce_stats_utils.js | 2 |
2 files changed, 81 insertions, 54 deletions
diff --git a/jstests/cqf/analyze/ce_histogram.js b/jstests/cqf/analyze/ce_histogram.js index fb72d50a456..c9f2691b887 100644 --- a/jstests/cqf/analyze/ce_histogram.js +++ b/jstests/cqf/analyze/ce_histogram.js @@ -15,11 +15,29 @@ load('jstests/libs/ce_stats_utils.js'); +const charCodeA = 65; const collName = "ce_histogram"; -const fields = ["int", "dbl", "str", "date"]; +const fields = ["int", "dbl", "dec", "strS", "strB", "date", "ts", "oid"]; let _id; +// Helper to sort ints lexicographically. +// A small note: the ordering of string bounds (lexicographical) is different than that of int +// bounds. In order to simplify the histogram validation logic, we don't want to have to account for +// the fact that in the resulting histogram, string bounds will be sorted differently than int +// bounds. To illustrate this, if we were to use the format `${val}` to generate strings, we would +// generate a different order of bounds for the histogram on the string fields and the histogram on +// the int fields, because 2 < 10, but "10" < "2". This test relies on all field values being sorted +// the same way in the bounds. +function lexint(val) { + return String.fromCharCode(charCodeA - 1 + val); // Note: val >= 1. +} + +// Helper to generate hex strings for ints from 1-10. These are trivially lexicographically sorted. +function lexhex(val) { + return val > 9 ? String.fromCharCode((val - 10) + charCodeA) : `${val}`; +} + /** * Generates 'val' documents where each document has a distinct value for each 'field' in 'fields'. */ @@ -28,14 +46,15 @@ function generateDocs(val) { const fields = { int: NumberInt(val), // Necessary to cast, otherwise we get a double here. dbl: val + 0.1, - // A small note: the ordering of string bounds (lexicographical) is different than that of - // int bounds. In order to simplify the histogram validation logic, we don't want to have to - // account for the fact that string bounds will be sorted differently than int bounds. To - // illustrate this, if we were to use the format `string_${val}`, the string corresponding - // to value 10 would be the second entry in the histogram bounds array, even though it would - // be generated for 'val' = 10, not 'val' = 2. - str: `string_${String.fromCharCode(64 + val)}`, - date: new Date(`02 December ${val + 2000}`) + dec: NumberDecimal(val + 0.1), + strS: `${lexint(val)}`, + strB: `string_${lexint(val)}`, + date: new Date(`02 December ${val + 2000}`), + ts: new Timestamp(val, 1), + // Object Ids are represented by 12 bytes. We want to ensure the ordering matches the + // ordering of the other fields for the purposes of this test. As a result, we set the 4th + // most significant byte to a lexicographically increasing hexadecimal value. + oid: new ObjectId(`000${lexhex(val)}00000000000000000000`), }; for (let i = 0; i < val; i++) { docs.push(Object.assign({_id}, fields)); @@ -53,15 +72,39 @@ function getTypeName(field) { return "NumberInt32"; case "dbl": return "NumberDouble"; - case "str": + case "dec": + return "NumberDecimal"; + case "strS": + return "StringSmall"; + case "strB": return "StringBig"; case "date": return "Date"; + case "ts": + return "Timestamp"; + case "oid": + return "ObjectId"; default: assert(false, `Name mapping for ${field} not defined.`); } } +function isSubset(s1, s2) { + for (const e of s1) { + if (!s2.has(e)) { + return false; + } + } + return true; +} + +function sameTypeClass(field, documentField) { + const numTypes = new Set(["dbl", "dec", "int"]); + const strTypes = new Set(["strS", "strB"]); + const fields = new Set([field, documentField]); + return isSubset(fields, numTypes) || isSubset(fields, strTypes); +} + /** * This is the main testing function. Note that the input value 'ndv' corresponds to both the number * of distinct values per type in 'fields', as well as the number of buckets in each histogram @@ -98,6 +141,7 @@ function verifyCEForNDV(ndv) { let allDocs = []; for (let val = 1; val <= ndv; val++) { let docs = generateDocs(val); + jsTestLog(tojson(docs)); assert.commandWorked(coll.insertMany(docs)); // Small hack; when we insert a doc, we want to insert it as a NumberInt so that the @@ -124,11 +168,29 @@ function verifyCEForNDV(ndv) { allDocs = allDocs.concat(docs); } + // Create histograms for all fields. for (const field of fields) { createAndValidateHistogram({coll, expectedHistogram: expectedHistograms[field]}); - forceHistogramCE(); + } + + jsTestLog(tojson(db.system.statistics[collName].find().toArray())); + + const doc0 = { + int: 0, + dbl: 0.0, + dec: NumberDecimal(0.0), + strS: "", + strB: "", + date: new Date(`02 December ${2000}`), + ts: new Timestamp(0, 1), + oid: new ObjectId("000000000000000000000000") + }; + + // Verify CE for all distinct values of each field across multiple types. + forceHistogramCE(); + for (const field of fields) { + jsTestLog(`Testing histogram for ndv ${ndv} and field ${field}`); - // Verify CE for all distinct values of each field across multiple types. let count = 0; const hint = {[field]: 1}; for (let val = 1; val <= ndv; val++) { @@ -154,47 +216,13 @@ function verifyCEForNDV(ndv) { verifyCEForMatch( {coll, predicate: {[field]: {$gte: fieldVal}}, hint, expected: docsGte}); - } else if (field == "int" && documentField == "dbl") { - // Each distinct double value corresponds to an int value + 0.1, so we shouldn't - // get any equality matches. - verifyCEForMatch({coll, predicate: {[field]: fieldVal}, hint, expected: []}); - - // When we have a predicate ~ < val + 0.1 or <= val + 0.1, it should match all - // integers <= val. - verifyCEForMatch( - {coll, predicate: {[field]: {$lt: fieldVal}}, hint, expected: docsLte}); - verifyCEForMatch( - {coll, predicate: {[field]: {$lte: fieldVal}}, hint, expected: docsLte}); - - // When we have a predicate ~ > val + 0.1 or >= val + 0.1, it should match all - // integers > val. - verifyCEForMatch( - {coll, predicate: {[field]: {$gt: fieldVal}}, hint, expected: docsGt}); - verifyCEForMatch( - {coll, predicate: {[field]: {$gte: fieldVal}}, hint, expected: docsGt}); - - } else if (field == "dbl" && documentField == "int") { - // Each distinct double value corresponds to an int value + 0.1, so we shouldn't - // get any equality matches. - verifyCEForMatch({coll, predicate: {[field]: fieldVal}, hint, expected: []}); - - // When we have a predicate ~ < val - 0.1 or <= val - 0.1, it should match all - // doubles < val. - verifyCEForMatch( - {coll, predicate: {[field]: {$lt: fieldVal}}, hint, expected: docsLt}); - verifyCEForMatch( - {coll, predicate: {[field]: {$lte: fieldVal}}, hint, expected: docsLt}); - - // When we have a predicate ~ > val - 0.1 or >= val - 0.1, it should match all - // doubles >= val. - verifyCEForMatch( - {coll, predicate: {[field]: {$gt: fieldVal}}, hint, expected: docsGte}); - verifyCEForMatch( - {coll, predicate: {[field]: {$gte: fieldVal}}, hint, expected: docsGte}); + } else if (sameTypeClass(field, documentField)) { + // Skip this estimation- we have already tested range predicates for this type. + continue; } else { - // Verify that we obtain a CE of 0 for types other than the 'field' type when at - // least one type is not numeric. + // Verify that we obtain a CE of 0 for types other outside the 'field' type's + // type-class. const expected = []; verifyCEForMatch({coll, predicate: {[field]: fieldVal}, hint, expected}); verifyCEForMatch({coll, predicate: {[field]: {$lt: fieldVal}}, hint, expected}); @@ -210,10 +238,9 @@ function verifyCEForNDV(ndv) { } // Verify CE for values outside the range of distinct values for each field. - const docLow = {int: 0, dbl: 0.0, str: `string_0`, date: new Date(`02 December ${2000}`)}; const docHigh = generateDocs(ndv + 1)[0]; const expected = []; - verifyCEForMatch({coll, predicate: {[field]: docLow[field]}, hint, expected}); + verifyCEForMatch({coll, predicate: {[field]: doc0[field]}, hint, expected}); verifyCEForMatch({coll, predicate: {[field]: docHigh[field]}, hint, expected}); } } diff --git a/jstests/libs/ce_stats_utils.js b/jstests/libs/ce_stats_utils.js index b664ec265ef..fdef187007a 100644 --- a/jstests/libs/ce_stats_utils.js +++ b/jstests/libs/ce_stats_utils.js @@ -35,6 +35,7 @@ function assertApproxEq(expected, actual, msg, tolerance = 0.01) { * returns. */ function verifyCEForMatch({coll, predicate, expected, ce, hint}) { + jsTestLog(`Verify CE for match ${tojson(predicate)}`); const CEs = ce ? [ce] : undefined; return verifyCEForMatchNodes( {coll, predicate, expected, getNodeCEs: (explain) => [getRootCE(explain)], CEs, hint}); @@ -78,7 +79,6 @@ function verifyCEForMatchNodes({coll, predicate, expected, getNodeCEs, CEs, hint function createAndValidateHistogram({coll, expectedHistogram, empty = false}) { const field = expectedHistogram._id; const stats = db.system.statistics[coll.getName()]; - stats.drop(); // We can't use forceBonsai here because the new optimizer doesn't know how to handle the // analyze command. |