diff options
author | Alya Berciu <alya.berciu@mongodb.com> | 2022-12-19 18:08:39 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2022-12-19 18:39:26 +0000 |
commit | c2e9caa2fe8093a8fde55d1252c8c85d62859f7a (patch) | |
tree | c0e31055af8b687bce53ab1edaecc54d81acfc70 | |
parent | f1acb7641a790d9ef7d34461826c47370d6d2f97 (diff) | |
download | mongo-c2e9caa2fe8093a8fde55d1252c8c85d62859f7a.tar.gz |
SERVER-71057 Count histogram types once per array
-rw-r--r-- | jstests/cqf/analyze/array_histogram.js | 2 | ||||
-rw-r--r-- | jstests/cqf/analyze/ce_histogram.js | 2 | ||||
-rw-r--r-- | jstests/cqf/analyze/type_counts.js | 70 | ||||
-rw-r--r-- | src/mongo/db/query/ce/generated_histograms_test.cpp | 4 | ||||
-rw-r--r-- | src/mongo/db/query/ce/histogram_array_data_test.cpp | 4 | ||||
-rw-r--r-- | src/mongo/db/query/ce/histogram_estimator_test.cpp | 18 | ||||
-rw-r--r-- | src/mongo/db/query/ce/histogram_interpolation_test.cpp | 6 | ||||
-rw-r--r-- | src/mongo/db/query/stats/SConscript | 10 | ||||
-rw-r--r-- | src/mongo/db/query/stats/array_histogram.cpp | 236 | ||||
-rw-r--r-- | src/mongo/db/query/stats/array_histogram.h | 7 | ||||
-rw-r--r-- | src/mongo/db/query/stats/max_diff.cpp | 27 | ||||
-rw-r--r-- | src/mongo/db/query/stats/type_count_test.cpp | 78 |
12 files changed, 336 insertions, 128 deletions
diff --git a/jstests/cqf/analyze/array_histogram.js b/jstests/cqf/analyze/array_histogram.js index 0ee10af9ebf..eb6285a1241 100644 --- a/jstests/cqf/analyze/array_histogram.js +++ b/jstests/cqf/analyze/array_histogram.js @@ -100,7 +100,7 @@ runHistogramsTest(function verifyArrayHistograms() { ], bounds: ["array", "mixed"] }, - typeCount: [{typeName: "StringSmall", count: 5}], + typeCount: [{typeName: "StringSmall", count: 3}], }, emptyArrayCount: 1, trueCount: 0, diff --git a/jstests/cqf/analyze/ce_histogram.js b/jstests/cqf/analyze/ce_histogram.js index cb27f55b3f9..2a7ab7f1510 100644 --- a/jstests/cqf/analyze/ce_histogram.js +++ b/jstests/cqf/analyze/ce_histogram.js @@ -249,6 +249,6 @@ runHistogramsTest(function testScalarHistograms() { verifyCEForNDV(1); verifyCEForNDV(2); verifyCEForNDV(3); - verifyCEForNDV(10); + verifyCEForNDV(4); }); }()); diff --git a/jstests/cqf/analyze/type_counts.js b/jstests/cqf/analyze/type_counts.js index 49ebe0f1eb9..dbee89a582e 100644 --- a/jstests/cqf/analyze/type_counts.js +++ b/jstests/cqf/analyze/type_counts.js @@ -98,7 +98,7 @@ runHistogramsTest(function testTypeCounts() { }], bounds: [3] }, - typeCount: [{typeName: "NumberDouble", count: 3}], + typeCount: [{typeName: "NumberDouble", count: 1}], }, emptyArrayCount: 4, trueCount: 1, @@ -335,7 +335,7 @@ runHistogramsTest(function testTypeCounts() { ], bounds: [2, 3] }, - typeCount: [{typeName: "NumberDouble", count: 5}], + typeCount: [{typeName: "NumberDouble", count: 2}], }, emptyArrayCount: 1, trueCount: 2, @@ -479,7 +479,6 @@ runHistogramsTest(function testTypeCounts() { ]; assert.commandWorked(coll.insertMany(docs)); - // TODO SERVER-71057: Only count types once per array. createAndValidateHistogram({ coll, expectedHistogram: { @@ -543,11 +542,11 @@ runHistogramsTest(function testTypeCounts() { bounds: ["c"] }, typeCount: [ - {typeName: "StringSmall", count: 3}, - {typeName: "Boolean", count: 10}, - {typeName: "Null", count: 5}, - {typeName: "Object", count: 6}, - {typeName: "Array", count: 13}, + {typeName: "StringSmall", count: 1}, + {typeName: "Boolean", count: 6}, + {typeName: "Null", count: 3}, + {typeName: "Object", count: 4}, + {typeName: "Array", count: 11}, ], }, emptyArrayCount: 1, @@ -558,14 +557,14 @@ runHistogramsTest(function testTypeCounts() { } }); - // Verify type count CE. Note that for non-$elemMatch preidcates, we include both array and + // Verify type count CE. Note that for non-$elemMatch predicates, we include both array and // scalar type-counts, while for $elemMatch predicates, we include only array type counts in // our estimate. forceCE("histogram"); hint = {a: 1}; - // Estimate boolean counts. Note that we have 10 boolean arrays, so that gets added to the - // counter estimates. + // Estimate boolean counts. Note that we have 6 boolean arrays, so that gets added to the + // counter estimate. verifyCEForMatch({ coll, predicate: {"a": true}, @@ -576,7 +575,7 @@ runHistogramsTest(function testTypeCounts() { {_id: 6, a: [[false, false], true]}, {_id: 27, a: [null, true, false, [], [1, 2, 3], ["a", "b", "c"], {a: 1}, {}]}, ], - ce: 11, + ce: 7, hint }); verifyCEForMatch({ @@ -589,7 +588,7 @@ runHistogramsTest(function testTypeCounts() { {_id: 5, a: [false, false, false]}, {_id: 27, a: [null, true, false, [], [1, 2, 3], ["a", "b", "c"], {a: 1}, {}]}, ], - ce: 11, + ce: 7, hint }); @@ -602,17 +601,17 @@ runHistogramsTest(function testTypeCounts() { {_id: 10, a: [{}]}, {_id: 27, a: [null, true, false, [], [1, 2, 3], ["a", "b", "c"], {a: 1}, {}]}, ], - ce: 8, + ce: 6, hint }); verifyCEForMatch({ coll, predicate: {a: {a: 1}}, expected: [{_id: 27, a: [null, true, false, [], [1, 2, 3], ["a", "b", "c"], {a: 1}, {}]}], - ce: 8, + ce: 6, hint }); - verifyCEForMatch({coll, predicate: {a: {b: 1}}, expected: [{_id: 12, a: {b: 1}}], ce: 8, hint}); + verifyCEForMatch({coll, predicate: {a: {b: 1}}, expected: [{_id: 12, a: {b: 1}}], ce: 6, hint}); // In the $elemMatch case for object predicates, we only include the array object counter value // in our estimate. @@ -623,22 +622,22 @@ runHistogramsTest(function testTypeCounts() { {_id: 10, a: [{}]}, {_id: 27, a: [null, true, false, [], [1, 2, 3], ["a", "b", "c"], {a: 1}, {}]}, ], - ce: 6, + ce: 4, hint }); verifyCEForMatch({ coll, predicate: {a: {$elemMatch: {$eq: {a: 1}}}}, expected: [{_id: 27, a: [null, true, false, [], [1, 2, 3], ["a", "b", "c"], {a: 1}, {}]}], - ce: 6, + ce: 4, hint }); verifyCEForMatch( - {coll, predicate: {a: {$elemMatch: {$eq: {b: 1}}}}, expected: [], ce: 6, hint}); + {coll, predicate: {a: {$elemMatch: {$eq: {b: 1}}}}, expected: [], ce: 4, hint}); // We are estimating the following predicates as two intervals joined by a conjunction: - // 1. [{}, {}] - estimated as the sum of scalar and array type counts 6 + 2 = 8. - // 2. [[{}], [{}]] - estimated as the count of nested arrays, 13. + // 1. [{}, {}] - estimated as the sum of scalar and array type counts 4 + 2 = 6. + // 2. [[{}], [{}]] - estimated as the count of nested arrays, 11. // The disjunction selectivities are then combined via exponential backoff. This estimate can be // found at the union of the two index scan nodes in the plan. However, the root node estimate // differs due to the filter node & group by node above the union, so we directly verify the @@ -651,7 +650,7 @@ runHistogramsTest(function testTypeCounts() { {_id: 11, a: [[{}]]}, ], getNodeCEs: getUnionNodeCE, - CEs: [15.323], + CEs: [12.931], hint }); verifyCEForMatchNodes({ @@ -659,7 +658,7 @@ runHistogramsTest(function testTypeCounts() { predicate: {a: [{c: 1}]}, expected: [{_id: 13, a: [{c: 1}]}], getNodeCEs: getUnionNodeCE, - CEs: [15.323], + CEs: [12.931], hint }); verifyCEForMatchNodes({ @@ -667,7 +666,7 @@ runHistogramsTest(function testTypeCounts() { predicate: {a: [{d: 1}]}, expected: [{_id: 15, a: [[{d: 1}]]}], getNodeCEs: getUnionNodeCE, - CEs: [15.323], + CEs: [12.931], hint }); @@ -687,20 +686,20 @@ runHistogramsTest(function testTypeCounts() { {_id: 20, a: [["a", "b", "c"]]}, {_id: 27, a: [null, true, false, [], [1, 2, 3], ["a", "b", "c"], {a: 1}, {}]}, ], - ce: 13, + ce: 11, hint }); verifyCEForMatch({ coll, predicate: {a: {$elemMatch: {$eq: [["a", "b", "c"]]}}}, expected: [{_id: 21, a: [[["a", "b", "c"]]]}], - ce: 13, + ce: 11, hint }); // In the following cases, we have two intervals: // 1. ["a", "a"] - This is estimated as 1 based on the array buckets. - // 2. [["a", "b", "c"], ["a", "b", "c"]] - this is estimated as the count of nested arrays: 13. + // 2. [["a", "b", "c"], ["a", "b", "c"]] - this is estimated as the count of nested arrays: 11. // The selectivities are then combined by disjunctive exponential backoff. Once again, we can // find this estimate in the Union node. verifyCEForMatchNodes({ @@ -712,7 +711,7 @@ runHistogramsTest(function testTypeCounts() { {_id: 27, a: [null, true, false, [], [1, 2, 3], ["a", "b", "c"], {a: 1}, {}]}, ], getNodeCEs: getUnionNodeCE, - CEs: [13.270], + CEs: [11.306], hint }); verifyCEForMatchNodes({ @@ -720,7 +719,7 @@ runHistogramsTest(function testTypeCounts() { predicate: {a: ["a"]}, expected: [], getNodeCEs: getUnionNodeCE, - CEs: [13.270], + CEs: [11.306], hint }); @@ -732,7 +731,7 @@ runHistogramsTest(function testTypeCounts() { predicate: {a: [["a", "b", "c"]]}, expected: [{_id: 20, a: [["a", "b", "c"]]}, {_id: 21, a: [[["a", "b", "c"]]]}], getNodeCEs: getUnionNodeCE, - CEs: [17.021], + CEs: [14.754], hint }); verifyCEForMatchNodes({ @@ -740,12 +739,11 @@ runHistogramsTest(function testTypeCounts() { predicate: {a: [[["a", "b", "c"]]]}, expected: [{_id: 21, a: [[["a", "b", "c"]]]}], getNodeCEs: getUnionNodeCE, - CEs: [17.021], + CEs: [14.754], hint }); // Verify null CE. - // TODO SERVER-71057: Only count each null once per array. verifyCEForMatch({ coll, predicate: {a: null}, @@ -756,7 +754,6 @@ runHistogramsTest(function testTypeCounts() { {_id: 25, a: [null, null, null]}, {_id: 27, a: [null, true, false, [], [1, 2, 3], ["a", "b", "c"], {a: 1}, {}]}, ], - ce: 7, hint }); verifyCEForMatch({ @@ -767,7 +764,7 @@ runHistogramsTest(function testTypeCounts() { {_id: 25, a: [null, null, null]}, {_id: 27, a: [null, true, false, [], [1, 2, 3], ["a", "b", "c"], {a: 1}, {}]}, ], - ce: 5, + ce: 3, hint }); @@ -780,7 +777,8 @@ runHistogramsTest(function testTypeCounts() { // {_id: 27, a: [null, true, false, [], [1, 2, 3], ["a", "b", "c"], {a: 1}, {}]}, // ], ce: 14, hint}); - // In the following case, we expect to count only nested empty arrays. + // In the following case, we expect to count only nested empty arrays (so we estimate it as the + // count of all nested arrays). verifyCEForMatch({ coll, predicate: {a: {$elemMatch: {$eq: []}}}, @@ -788,7 +786,7 @@ runHistogramsTest(function testTypeCounts() { {_id: 17, a: [[]]}, {_id: 27, a: [null, true, false, [], [1, 2, 3], ["a", "b", "c"], {a: 1}, {}]}, ], - ce: 13, + ce: 11, hint }); diff --git a/src/mongo/db/query/ce/generated_histograms_test.cpp b/src/mongo/db/query/ce/generated_histograms_test.cpp index 84863eb4404..b63fc81bb03 100644 --- a/src/mongo/db/query/ce/generated_histograms_test.cpp +++ b/src/mongo/db/query/ce/generated_histograms_test.cpp @@ -222,8 +222,8 @@ TEST(EstimatorTest, IntStrArrayEstimate) { TypeCounts typeCounts{{value::TypeTags::NumberInt64, 388}, {value::TypeTags::StringSmall, 319}, {value::TypeTags::Array, 293}}; - TypeCounts arrayTypeCounts{{value::TypeTags::NumberInt64, 874}, - {value::TypeTags::StringSmall, 340}}; + TypeCounts arrayTypeCounts{{value::TypeTags::NumberInt64, 282}, + {value::TypeTags::StringSmall, 222}}; const auto arrHist = ArrayHistogram::make(scalarHist, typeCounts, uniqueHist, diff --git a/src/mongo/db/query/ce/histogram_array_data_test.cpp b/src/mongo/db/query/ce/histogram_array_data_test.cpp index 347217f80d9..8e037ca43a7 100644 --- a/src/mongo/db/query/ce/histogram_array_data_test.cpp +++ b/src/mongo/db/query/ce/histogram_array_data_test.cpp @@ -160,7 +160,7 @@ TEST(EstimatorArrayDataTest, Histogram1000ArraysSmall10Buckets) { TypeCounts arrayTypeCounts; // Dataset generated as 1000 arrays of size between 3 to 5. typeCounts.insert({value::TypeTags::Array, 1000}); - arrayTypeCounts.insert({value::TypeTags::NumberInt32, 3996}); + arrayTypeCounts.insert({value::TypeTags::NumberInt32, 1000}); const auto arrHist = ArrayHistogram::make(scalarHist, typeCounts, @@ -251,7 +251,7 @@ TEST(EstimatorArrayDataTest, Histogram1000ArraysLarge10Buckets) { TypeCounts arrayTypeCounts; // Dataset generated as 1000 arrays of size between 8 to 10. typeCounts.insert({value::TypeTags::Array, 1000}); - arrayTypeCounts.insert({value::TypeTags::NumberInt32, 8940}); + arrayTypeCounts.insert({value::TypeTags::NumberInt32, 1000}); const auto arrHist = ArrayHistogram::make(scalarHist, typeCounts, diff --git a/src/mongo/db/query/ce/histogram_estimator_test.cpp b/src/mongo/db/query/ce/histogram_estimator_test.cpp index 11d3a6da729..1787ef55ff4 100644 --- a/src/mongo/db/query/ce/histogram_estimator_test.cpp +++ b/src/mongo/db/query/ce/histogram_estimator_test.cpp @@ -527,9 +527,9 @@ TEST(CEHistogramTest, TestArrayHistogramOnAtomicPredicates) { {Value(6), 1 /* frequency */}, {Value(10), 1 /* frequency */}, }, - {{sbe::value::TypeTags::NumberInt32, 13}}, // Array type counts. - 4, // 4 arrays (including []). - 1 // 1 empty array. + {{sbe::value::TypeTags::NumberInt32, 3}}, // Array type counts (3 arrays with ints). + 4, // 4 arrays (including []). + 1 // 1 empty array. )); // Test simple predicates against 'a'. Note: in the $elemMatch case, we exclude scalar @@ -609,7 +609,7 @@ TEST(CEHistogramTest, TestArrayHistogramOnCompositePredicates) { {Value(6), 35 /* frequency */}, {Value(10), 35 /* frequency */}, }, - {{sbe::value::TypeTags::NumberInt32, 420}}, // Array type count = 3*35+5*35+1*35+3*35. + {{sbe::value::TypeTags::NumberInt32, 140}}, // Arrays with ints = 4*35 = 140. kCollCard._value, // kCollCard arrays total. 35 // 35 empty arrays )); @@ -648,9 +648,9 @@ TEST(CEHistogramTest, TestArrayHistogramOnCompositePredicates) { {Value(6), 17 /* frequency */}, {Value(10), 17 /* frequency */}, }, - {{sbe::value::TypeTags::NumberInt32, 289}}, // Array type count = 3*17+5*17+6*17+3*17 - 88, // kCollCard arrays total. - 20 // 20 empty arrays. + {{sbe::value::TypeTags::NumberInt32, 68}}, // Arrays with ints = 17*4 = 68. + 88, // kCollCard arrays total. + 20 // 20 empty arrays. )); // Test cardinality of individual predicates. @@ -776,8 +776,8 @@ TEST(CEHistogramTest, TestMixedElemMatchAndNonElemMatch) { // Array max buckets. {Value(10), 1 /* frequency */}, }, - {{sbe::value::TypeTags::NumberInt32, 2}}, - // Array type counts. + // We only have one array with ints. + {{sbe::value::TypeTags::NumberInt32, 1}}, 1, 0)); diff --git a/src/mongo/db/query/ce/histogram_interpolation_test.cpp b/src/mongo/db/query/ce/histogram_interpolation_test.cpp index 16e876d8b26..c6091f2b2c7 100644 --- a/src/mongo/db/query/ce/histogram_interpolation_test.cpp +++ b/src/mongo/db/query/ce/histogram_interpolation_test.cpp @@ -369,7 +369,8 @@ TEST(EstimatorTest, UniformIntArrayOnlyEstimate) { uniqueHist, minHist, maxHist, - TypeCounts{{value::TypeTags::NumberInt64, 404}}, + // There are 100 non-empty int-only arrays. + TypeCounts{{value::TypeTags::NumberInt64, 100}}, 0); // Query in the middle of the domain: estimate from ArrayUnique histogram. @@ -473,7 +474,8 @@ TEST(EstimatorTest, UniformIntMixedArrayEstimate) { uniqueHist, minHist, maxHist, - TypeCounts{{value::TypeTags::NumberInt64, 375}}, + // There are 94 non-empty int-only arrays. + TypeCounts{{value::TypeTags::NumberInt64, 94}}, 0); value::TypeTags lowTag = value::TypeTags::NumberInt64; diff --git a/src/mongo/db/query/stats/SConscript b/src/mongo/db/query/stats/SConscript index 181c7db3d34..a6d6d2fc634 100644 --- a/src/mongo/db/query/stats/SConscript +++ b/src/mongo/db/query/stats/SConscript @@ -131,3 +131,13 @@ env.CppUnitTest( 'stats_test_utils', ], ) + +env.CppUnitTest( + target="type_count_test", + source=[ + "type_count_test.cpp", + ], + LIBDEPS=[ + 'stats_test_utils', + ], +) diff --git a/src/mongo/db/query/stats/array_histogram.cpp b/src/mongo/db/query/stats/array_histogram.cpp index 12b5eea8d47..759acc802e5 100644 --- a/src/mongo/db/query/stats/array_histogram.cpp +++ b/src/mongo/db/query/stats/array_histogram.cpp @@ -57,21 +57,6 @@ double getTagTypeCount(const TypeCounts& tc, sbe::value::TypeTags tag) { return 0.0; } -/** - * By default, aggregates the total number of tag counts in the histogrma together. - * If 'isHistogrammable' is set to true, then only counts histogrammable types. - * Otherwise, if 'isHistogrammable' is set to false, then only counts non-histogrammable types. - **/ -double getTotalCount(const TypeCounts& tc, boost::optional<bool> isHistogrammable = boost::none) { - double total = 0.0; - for (const auto& [tag, count] : tc) { - if (isHistogrammable || (*isHistogrammable == canEstimateTypeViaHistogram(tag))) { - total += count; - } - } - return total; -} - double getNumericTypeCount(const TypeCounts& tc) { return getTagTypeCount(tc, sbe::value::TypeTags::NumberInt32) + getTagTypeCount(tc, sbe::value::TypeTags::NumberInt64) + @@ -95,34 +80,154 @@ double getTypeBracketTypeCount(const TypeCounts& tc, sbe::value::TypeTags tag) { } } -void validateHistogramTypeCounts(const TypeCounts& tc, const ScalarHistogram& s) { - const auto& bounds = s.getBounds(); - const auto& buckets = s.getBuckets(); - size_t i = 0; - while (i < bounds.size()) { - // First, we have to aggregate all frequencies for the current type bracket. - const auto tagL = bounds.getAt(i).first; - double freq = 0.0; - for (; i < bounds.size(); i++) { - const auto tagR = bounds.getAt(i).first; - // Stop aggregating when the next bound belongs to a different type bracket. - if (!sameTypeBracket(tagL, tagR)) { - break; - } else { - const auto& bucketR = buckets[i]; - freq += bucketR._equalFreq + bucketR._rangeFreq; +/** + * Helper class to iterate over all of a histogram's type-brackets and their frequencies in order. + */ +class TypeBracketFrequencyIterator { +public: + TypeBracketFrequencyIterator(const ScalarHistogram& histogram) : histogram(histogram) {} + + bool hasNext() { + return _bracket < histogram.getBounds().size(); + } + + /** + * Iterates over the bounds & buckets one type-bracket at the time, starting from the first + * bucket/bound. It sums up the frequencies of all the buckets in the current type-bracket, then + * updates the internal counter so the next call to this function will return the left-most type + * tag and total frequency of the next type-bracket in the histogram as a pair {tag, frequency}. + * Once there are no more type-brackets left in the histogram, this will return the tag Nothing + * with a frequency of 0.0, and 'hasNext()' will return false. + */ + std::pair<sbe::value::TypeTags, double> getNext() { + const auto& bounds = histogram.getBounds(); + const auto& buckets = histogram.getBuckets(); + if (hasNext()) { + // Update tag & frequency for the left-most bucket in the current type-bracket. + const auto tagL = bounds.getAt(_bracket).first; + const auto& bucketL = buckets[_bracket]; + double freq = bucketL._equalFreq + bucketL._rangeFreq; + + // Increment the bucket counter to look at the next bucket. + _bracket++; + + // Aggregate all frequencies for the current type bracket. + for (; hasNext(); _bracket++) { + // Get the tag for the next bucket. + const auto tagR = bounds.getAt(_bracket).first; + if (!sameTypeBracket(tagL, tagR)) { + // Stop aggregating when the next bound belongs to a different type bracket. + return {tagL, freq}; + } else { + // This is the rightmost bucket in the current type-bracket (so far). Update the + // frequency counter and look at the next bucket. + const auto& bucketR = buckets[_bracket]; + freq += bucketR._equalFreq + bucketR._rangeFreq; + } } + + // This was the last type-bracket in the histogram. There are no more buckets left. + return {tagL, freq}; + } + return {sbe::value::TypeTags::Nothing, 0.0}; + } + + void reset() { + _bracket = 0; + } + + const ScalarHistogram& histogram; + +private: + size_t _bracket = 0; +}; + +/** + * Validates the type counts per type bracket compared to those in the scalar histogram. If + * 'lowBound' is set to false, these frequencies must be equal; otherwise, the type counts must be a + * lower bound on the counts in the histogram. + */ +void validateHistogramTypeCounts(const TypeCounts& tc, + const ScalarHistogram& s, + bool lowBound = false) { + // Ensure that all histogrammable type brackets are accounted for in the histogram. + TypeBracketFrequencyIterator it{s}; + sbe::value::TypeTags tag; + double freq; + while (it.hasNext()) { + std::tie(tag, freq) = it.getNext(); + const double tcFreq = getTypeBracketTypeCount(tc, tag); + + // If 'lowBound' is set, having more entries in the histogram than in the type counts is + // considered valid. + const bool invalid = lowBound ? (tcFreq > freq) : (tcFreq != freq); + if (invalid) { + uasserted(7105700, + str::stream() << "Type count frequency " << tcFreq << " of type bracket for " + << tag << " did not match histogram frequency " << freq); + } + } + + // Ensure that all histogrammable type counts are accounted for in the type counters. + const double totalTC = getTotalCount(tc, true /* histogrammable*/); + const double totalCard = s.getCardinality(); + const bool invalid = lowBound ? (totalTC > totalCard) : (totalTC != totalCard); + if (invalid) { + uasserted(7105701, + str::stream() << "The type counters count " << totalTC + << " values, but the histogram frequency is " << totalCard); + } +} + +/** + * Validates the bucket counts per type bracket compared to those in the scalar histogram. If + * 'lowBound' is set to false, these frequencies must be equal; otherwise, the frequencies of 'ls' + * must be a lower bound of the counts in the 'rs' histogram. + */ +void validateHistogramFrequencies(const ScalarHistogram& ls, + const ScalarHistogram& rs, + bool lowBound = false) { + // Ensure that the total cardinality of the histograms is comparatively correct. + const double cardL = ls.getCardinality(); + const double cardR = rs.getCardinality(); + const bool invalid = lowBound ? (cardL > cardR) : (cardL != cardR); + if (invalid) { + uasserted(7105702, + str::stream() << "The histogram cardinalities " << cardL << " and " << cardR + << " did not match."); + } + + // Validate all type brackets in both histograms against each other. + TypeBracketFrequencyIterator itL{ls}; + TypeBracketFrequencyIterator itR{rs}; + sbe::value::TypeTags tagL, tagR; + double freqL, freqR; + while (itL.hasNext() && itR.hasNext()) { + std::tie(tagL, freqL) = itL.getNext(); + std::tie(tagR, freqR) = itR.getNext(); + + if (tagL != tagR) { + // Regardless of whether or not 'ls' has fewer entries than 'rs', both must have the + // same number of type-brackets. + uasserted(7105703, + str::stream() << "Histograms had different type-brackets " << tagL << " and " + << tagR << " at the same bound position."); } - // We now have the expected frequency for this type bracket. Validate it against 'tc'. - double tcFreq = getTypeBracketTypeCount(tc, tagL); - if (tcFreq != freq) { - uasserted(7131014, + // If 'lowBound' is set, having more entries in the histogram than in the type counts is + // considered valid. + const bool invalid = lowBound ? (freqL > freqR) : (freqL != freqR); + if (invalid) { + uasserted(7105704, str::stream() - << "Type count frequency " << tcFreq << " did not match bucket frequency " - << freq << " of type bracket for " << tagL); + << "Histogram frequencies frequencies " << freqL << " and " << freqR + << " of type bracket for " << tagL << " did not match."); } } + + if (itL.hasNext()) { + uasserted(7105705, "One histogram had more type-brackets than the other."); + } } struct ArrayFields { @@ -144,32 +249,9 @@ void validate(const ScalarHistogram& scalar, uasserted(7131010, str::stream() << "Array histogram must have at least one array."); } - // We must have the same number of histogrammable type brackets in all arrays, and each such - // type bracket must have a min value and a max value. Therefore, both the min and max - // array histograms should have the same cardinality. Note that this is not a true count of - // arrays since we could have multiple type-brackets per array. - const double minArrCard = arrayFields->arrayMin.getCardinality(); - const double maxArrCard = arrayFields->arrayMax.getCardinality(); - if (minArrCard != maxArrCard) { - uasserted(7131011, - str::stream() << "Min array cardinality was " << minArrCard - << " while max array cardinality was " << maxArrCard); - } - - // Ensure we have at least as many histogrammable types counted in our type counters as - // histogrammable type-brackets across all arrays (this is counted by 'minArrCard'). - double arrayTypeCount = getTotalCount(arrayFields->typeCounts, true /* histogrammable*/); - if (minArrCard > arrayTypeCount) { - uasserted(7131012, - str::stream() << "The array type counters count " << arrayTypeCount - << " array values, but the minimum number of arrays we must " - "have according to the min/max histograms is " - << minArrCard); - } - // There must be at least as many arrays as there are empty arrays. if (numArrays < arrayFields->emptyArrayCount) { - uasserted(7131013, + uasserted(7131011, str::stream() << "The Array type counter counts " << numArrays << " arrays, but the minimum number of arrays we must have according to " @@ -177,9 +259,25 @@ void validate(const ScalarHistogram& scalar, << arrayFields->emptyArrayCount); } - // TODO SERVER-71057: validate array histograms once type counters only count each type in - // an array at most once per array. validateHistogramTypeCounts(getArrayTypeCounts(), - // getArrayUnique()); + // Validate array histograms based on array type counters. Since there is one entry per type + // bracket per array in the min/max histograms, ensure that there are at least as many + // histogrammable entries in the array type counts. Since we first validate that min & max + // are equivalent, we only compare min type brackets with the type counters. + validateHistogramFrequencies(arrayFields->arrayMax, arrayFields->arrayMin); + validateHistogramTypeCounts(arrayFields->typeCounts, + arrayFields->arrayMin, + true /* Type counts are a lower bound. */); + + // This is similarly true for unique histograms, only their counts are larger. Furthermore, + // the min/max histograms are a "lower bound" on the scalar histograms. Only check the min + // histogram since it has already been dteermined to be equivalent to the max histogram. + validateHistogramTypeCounts(arrayFields->typeCounts, + arrayFields->arrayUnique, + true /* Type counts are a lower bound. */); + validateHistogramFrequencies(arrayFields->arrayMin, + arrayFields->arrayUnique, + true /* Type counts are a lower bound. */); + } else if (numArrays > 0) { uasserted(7131000, "A scalar ArrayHistogram should not have any arrays in its counters."); } @@ -199,6 +297,16 @@ void validate(const ScalarHistogram& scalar, } // namespace +double getTotalCount(const TypeCounts& tc, boost::optional<bool> isHistogrammable) { + double total = 0.0; + for (const auto& [tag, count] : tc) { + if (!isHistogrammable || (*isHistogrammable == canEstimateTypeViaHistogram(tag))) { + total += count; + } + } + return total; +} + ArrayHistogram::ArrayHistogram() : ArrayHistogram(ScalarHistogram::make(), {} /* Type counts. */, diff --git a/src/mongo/db/query/stats/array_histogram.h b/src/mongo/db/query/stats/array_histogram.h index a4f39495035..d15ce681819 100644 --- a/src/mongo/db/query/stats/array_histogram.h +++ b/src/mongo/db/query/stats/array_histogram.h @@ -38,6 +38,13 @@ namespace mongo::stats { using TypeCounts = std::map<sbe::value::TypeTags, double>; +/** + * By default, aggregates the total number of tag counts in the histogram together. + * If 'isHistogrammable' is set to true, then only counts histogrammable types. + * Otherwise, if 'isHistogrammable' is set to false, then only counts non-histogrammable types. + **/ +double getTotalCount(const TypeCounts& tc, boost::optional<bool> isHistogrammable = boost::none); + class ArrayHistogram { public: /** diff --git a/src/mongo/db/query/stats/max_diff.cpp b/src/mongo/db/query/stats/max_diff.cpp index 1c40c92afe7..200010341fd 100644 --- a/src/mongo/db/query/stats/max_diff.cpp +++ b/src/mongo/db/query/stats/max_diff.cpp @@ -325,25 +325,30 @@ std::shared_ptr<const ArrayHistogram> createArrayEstimator(const std::vector<SBE continue; } - // TODO SERVER-71057: Only count types once per array for histogram CE. + // We only count types once per occurrence per array for histogram CE. + std::set<value::TypeTags> perArrayTags; for (size_t i = 0; i < arrSize; i++) { - const auto [tag, val] = arr->getAt(i); + const auto [elemTag, elemVal] = arr->getAt(i); - // Increment array type tag counts. - auto arrTagCount = arrayTypeCounts.insert({tag, 1}); - if (!arrTagCount.second) { - ++arrTagCount.first->second; - } - - if (!canEstimateTypeViaHistogram(tag)) { + perArrayTags.insert(elemTag); + if (!canEstimateTypeViaHistogram(elemTag)) { // If the elements of this array are not histogrammable, then we can only update - // the array type counters + // the array type counters; we cannot add this value to the histogram. continue; } - const auto [tagCopy, valCopy] = value::copyValue(tag, val); + const auto [tagCopy, valCopy] = value::copyValue(elemTag, elemVal); arrayElements.emplace_back(tagCopy, valCopy); } + + // Increment array type tag counts. + for (auto elemTag : perArrayTags) { + auto arrTagCount = arrayTypeCounts.insert({elemTag, 1}); + if (!arrTagCount.second) { + ++arrTagCount.first->second; + } + } + updateMinMaxUniqArrayVals(arrayElements, arrayMinData, arrayMaxData, arrayUniqueData); } else if (tag == value::TypeTags::Boolean) { diff --git a/src/mongo/db/query/stats/type_count_test.cpp b/src/mongo/db/query/stats/type_count_test.cpp new file mode 100644 index 00000000000..68aff8ca1a0 --- /dev/null +++ b/src/mongo/db/query/stats/type_count_test.cpp @@ -0,0 +1,78 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/query/stats/array_histogram.h" +#include "mongo/unittest/unittest.h" + +namespace mongo::stats { + +using TypeTags = sbe::value::TypeTags; + +TEST(TypeCountTest, HistogrammableCount) { + const TypeCounts _empty = {}; + ASSERT_EQ(getTotalCount(_empty), 0); + ASSERT_EQ(getTotalCount(_empty, true), 0); + ASSERT_EQ(getTotalCount(_empty, false), 0); + + // Histogrammable types only. + const TypeCounts _histogrammable = { + {TypeTags::NumberDecimal, 10}, + {TypeTags::StringSmall, 20}, + {TypeTags::Timestamp, 30}, + }; + ASSERT_EQ(getTotalCount(_histogrammable), 60); + ASSERT_EQ(getTotalCount(_histogrammable, true), 60); + ASSERT_EQ(getTotalCount(_histogrammable, false), 0); + + // Non-histogrammable types only. + const TypeCounts _nonHistogrammable = { + {TypeTags::Boolean, 10}, + {TypeTags::Object, 20}, + {TypeTags::Array, 30}, + }; + ASSERT_EQ(getTotalCount(_nonHistogrammable), 60); + ASSERT_EQ(getTotalCount(_nonHistogrammable, true), 0); + ASSERT_EQ(getTotalCount(_nonHistogrammable, false), 60); + + // Mixed types. + const TypeCounts _allTypes = { + // Non-histogrammable types. + {TypeTags::Boolean, 10}, + {TypeTags::Object, 20}, + {TypeTags::Array, 30}, + // Histogrammable types. + {TypeTags::NumberInt32, 50}, + {TypeTags::StringBig, 20}, + {TypeTags::ObjectId, 20}, + }; + ASSERT_EQ(getTotalCount(_allTypes), 150); + ASSERT_EQ(getTotalCount(_allTypes, true), 90); + ASSERT_EQ(getTotalCount(_allTypes, false), 60); +} +} // namespace mongo::stats |