diff options
author | Timour Katchaounov <timour.katchaounov@mongodb.com> | 2022-10-12 12:42:54 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2022-10-12 13:40:54 +0000 |
commit | ea85b99ada1fe21761f1e4f442f53509fda09d3a (patch) | |
tree | 87883d3c8c3d3b87e5e14dd5634e657151b04ee0 | |
parent | c66a4121d166b955523f001a4f81e3ffdabe961d (diff) | |
download | mongo-ea85b99ada1fe21761f1e4f442f53509fda09d3a.tar.gz |
SERVER-69795 PathArr estimation via Array type counter
-rw-r--r-- | src/mongo/db/query/ce/array_histogram.cpp | 28 | ||||
-rw-r--r-- | src/mongo/db/query/ce/array_histogram.h | 4 | ||||
-rw-r--r-- | src/mongo/db/query/ce/ce_dataflow_nodes_test.cpp | 6 | ||||
-rw-r--r-- | src/mongo/db/query/ce/ce_heuristic_test.cpp | 16 | ||||
-rw-r--r-- | src/mongo/db/query/ce/ce_histogram.cpp | 8 | ||||
-rw-r--r-- | src/mongo/db/query/ce/ce_histogram_test.cpp | 79 | ||||
-rw-r--r-- | src/mongo/db/query/ce/ce_test_utils.cpp | 23 | ||||
-rw-r--r-- | src/mongo/db/query/ce/ce_test_utils.h | 11 | ||||
-rw-r--r-- | src/mongo/db/query/ce/maxdiff_test_utils.cpp | 3 | ||||
-rw-r--r-- | src/mongo/db/query/ce/scalar_histogram.h | 7 |
10 files changed, 144 insertions, 41 deletions
diff --git a/src/mongo/db/query/ce/array_histogram.cpp b/src/mongo/db/query/ce/array_histogram.cpp index efbe7404289..52e2c3d32a2 100644 --- a/src/mongo/db/query/ce/array_histogram.cpp +++ b/src/mongo/db/query/ce/array_histogram.cpp @@ -48,6 +48,8 @@ ArrayHistogram::ArrayHistogram(ScalarHistogram scalar, _arrayMax(std::move(arrayMax)), _arrayTypeCounts(std::move(arrayTypeCounts)) { invariant(isArray()); + // ArrayMin/Max histograms must have the same number of buckets. + invariant(_arrayMin->getBuckets().size() == _arrayMax->getBuckets().size()); } ArrayHistogram::ArrayHistogram(ScalarHistogram scalar, TypeCounts typeCounts) @@ -121,4 +123,30 @@ const TypeCounts& ArrayHistogram::getArrayTypeCounts() const { return *_arrayTypeCounts; } +const size_t ArrayHistogram::getArrayCount() const { + if (isArray()) { + auto findArray = _typeCounts.find(value::TypeTags::Array); + uassert(6979504, + "Histogram with array data must have a total array count.", + findArray != _typeCounts.end()); + size_t arrayCount = findArray->second; + uassert(6979503, "Histogram with array data must have at least one array.", arrayCount > 0); + return arrayCount; + } + return 0; +} + +const size_t ArrayHistogram::getEmptyArrayCount() const { + if (isArray()) { + size_t nonEmptyArrayCount = _arrayMin->empty() ? 0 : _arrayMin->getCardinality(); + size_t totalArrCount = getArrayCount(); + uassert(6979502, + "The number of empty arrays is < the total number of arrays.", + totalArrCount >= nonEmptyArrayCount); + size_t emptyArrCount = totalArrCount - nonEmptyArrayCount; + return emptyArrCount; + } + return 0; +} + } // namespace mongo::ce diff --git a/src/mongo/db/query/ce/array_histogram.h b/src/mongo/db/query/ce/array_histogram.h index 2a6e9af3383..8a953f3414d 100644 --- a/src/mongo/db/query/ce/array_histogram.h +++ b/src/mongo/db/query/ce/array_histogram.h @@ -73,6 +73,10 @@ public: const ScalarHistogram& getArrayMax() const; const TypeCounts& getTypeCounts() const; const TypeCounts& getArrayTypeCounts() const; + // The total number of arrays in the histogram's path including empty arrays. + const size_t getArrayCount() const; + // The total number of empty arrays ( [] ) in the histogram's path. + const size_t getEmptyArrayCount() const; private: /* ScalarHistogram fields for all paths. */ diff --git a/src/mongo/db/query/ce/ce_dataflow_nodes_test.cpp b/src/mongo/db/query/ce/ce_dataflow_nodes_test.cpp index f03c50754ef..f859a355ff6 100644 --- a/src/mongo/db/query/ce/ce_dataflow_nodes_test.cpp +++ b/src/mongo/db/query/ce/ce_dataflow_nodes_test.cpp @@ -61,7 +61,7 @@ protected: TEST(CEDataflowTest, EstimateTrivialNodes) { DataflowCETester t; - const auto matchCard = t.getMatchCE("{a: 1}"); + const auto matchCard = t.getMatchCE<optimizer::RootNode>("{a: 1}"); // Verify 'CollationNode' estimate returns the input cardinality. ASSERT_CE(t, "[{$sort: {a: 1}}]", kCollCard); @@ -132,7 +132,7 @@ TEST(CEDataflowTest, EstimateUnionNode) { TEST(CEDataflowTest, EstimateLimitSkipNode) { DataflowCETester t; - const CEType matchCard = t.getMatchCE("{a: 1}"); + const CEType matchCard = t.getMatchCE<optimizer::RootNode>("{a: 1}"); // Verify that 'LimitSkipNode' estimate with only a limit set is min(limit, inputCE). ASSERT_CE(t, "[{$limit: 1}]", 1.0); @@ -204,7 +204,7 @@ TEST(CEDataflowTest, EstimateLimitSkipNode) { TEST(CEDataflowTest, EstimateUnwindNode) { DataflowCETester t; - const CEType matchCard = t.getMatchCE("{a: 1}"); + const CEType matchCard = t.getMatchCE<optimizer::RootNode>("{a: 1}"); // We assume that arrays on average have ~10 elements, so we estimate this as inputCard*10. ASSERT_CE(t, "[{$unwind: '$a'}]", 10 * kCollCard); diff --git a/src/mongo/db/query/ce/ce_heuristic_test.cpp b/src/mongo/db/query/ce/ce_heuristic_test.cpp index 0eaf38eb97b..6146a001fa3 100644 --- a/src/mongo/db/query/ce/ce_heuristic_test.cpp +++ b/src/mongo/db/query/ce/ce_heuristic_test.cpp @@ -206,8 +206,8 @@ TEST(CEHeuristicTest, CEWithoutOptimizationTraverseSelectivityDoesNotAccumulate) "{'b0.b1.b3': {$gt: 10}}" "]}"; HeuristicCETester ht(collName, kNoOptPhaseSet); - auto ce1 = ht.getMatchCE(query); - auto ce2 = ht.getMatchCE(queryWithLongPaths); + auto ce1 = ht.getMatchCE<optimizer::RootNode>(query); + auto ce2 = ht.getMatchCE<optimizer::RootNode>(queryWithLongPaths); ASSERT_APPROX_EQUAL(ce1, ce2, kMaxCEError); } @@ -649,8 +649,8 @@ TEST(CEHeuristicTest, CEWithoutOptimizationEquivalentConjunctions) { HeuristicCETester ht(collName, kNoOptPhaseSet); ht.setCollCard(kCollCard); - auto ce1 = ht.getCE(rootNode1); - auto ce2 = ht.getCE(rootNode2); + auto ce1 = ht.getCE<optimizer::RootNode>(rootNode1); + auto ce2 = ht.getCE<optimizer::RootNode>(rootNode2); ASSERT_APPROX_EQUAL(ce1, ce2, kMaxCEError); } @@ -750,7 +750,7 @@ TEST(CEHeuristicTest, CEAfterMemoSubstitutionPhase_DNF1pathComplex) { "{$and: [{a0: {$gt:40}}, {a0: {$lt: 99}}, {a0: {$gt: 42}}, {a0: {$lt: 88}}, {a0: {$lt: " "81}}, {a0: {$lt: 77}}]}" "]}"; - auto ce1 = ht.getMatchCE(query1); + auto ce1 = ht.getMatchCE<optimizer::RootNode>(query1); // The conjuncts are in inverse selectivity order. std::string query2 = "{$or: [" @@ -762,7 +762,7 @@ TEST(CEHeuristicTest, CEAfterMemoSubstitutionPhase_DNF1pathComplex) { "{$and: [{a0: {$gt: 9}}, {a0: {$lt: 12}}, {a0: {$gt: 42}}]}," "{$and: [{a0: {$gt: 9}}, {a0: {$lt: 12}}]}" "]}"; - auto ce2 = ht.getMatchCE(query2); + auto ce2 = ht.getMatchCE<optimizer::RootNode>(query2); ASSERT_APPROX_EQUAL(ce1, ce2, kMaxCEError); } @@ -804,10 +804,8 @@ TEST(CEHeuristicTest, CEAfterMemoSubstitutionPhase_CNF2paths) { } TEST(CEHeuristicTest, CEAfterMemoSubstitutionExplorationPhases) { - std::string query = "{a : 13, b : 42}"; HeuristicCETester ht(collName); - double ce = ht.getMatchCE(query); - ASSERT_APPROX_EQUAL(10.0, ce, kMaxCEError); + ASSERT_MATCH_CE(ht, "{a : 13, b : 42}", 10.0); } } // namespace diff --git a/src/mongo/db/query/ce/ce_histogram.cpp b/src/mongo/db/query/ce/ce_histogram.cpp index ac6938766b4..4ea46a7b8dd 100644 --- a/src/mongo/db/query/ce/ce_histogram.cpp +++ b/src/mongo/db/query/ce/ce_histogram.cpp @@ -177,10 +177,10 @@ public: const CEType totalCard = _stats->getCardinality(); if (conjunctReq.intervals.empty() && !conjunctReq.includeScalar) { - // TODO SERVER-69795: handle the case where we have a single 'PathArr' interval for - // this field by returning the number of arrays we expect in the output based on - // histogram type counters. - topLevelSelectivities.push_back(1.0); + // In this case there is a single 'PathArr' interval for this field. + // The selectivity of this interval is: (count of all arrays) / totalCard + double pathArrSel = conjunctReq.histogram.getArrayCount() / totalCard; + topLevelSelectivities.push_back(pathArrSel); } // Intervals are in DNF. diff --git a/src/mongo/db/query/ce/ce_histogram_test.cpp b/src/mongo/db/query/ce/ce_histogram_test.cpp index 6e38fd707b7..67a19f02f25 100644 --- a/src/mongo/db/query/ce/ce_histogram_test.cpp +++ b/src/mongo/db/query/ce/ce_histogram_test.cpp @@ -119,12 +119,21 @@ std::unique_ptr<ArrayHistogram> getArrayHistogramFromData( std::vector<TestBucket> arrayUniqueBuckets, std::vector<TestBucket> arrayMinBuckets, std::vector<TestBucket> arrayMaxBuckets, - TypeCounts arrayTypeCounts) { + TypeCounts arrayTypeCounts, + size_t emptyArrayCount = 0) { + auto arrayMinHist = getHistogramFromData(arrayMinBuckets); + auto arrayMaxHist = getHistogramFromData(arrayMaxBuckets); + TypeCounts typeCounts = getTypeCountsFromData(scalarBuckets); + // The min/max histograms contain one (min or max) value from each array. Therefore the + // total number of min/max values is equal to the number of non non-empty arrays. + // The total number of arrays needs to account for the additional empty arrays not present + // in histograms. + typeCounts[value::TypeTags::Array] = arrayMinHist.getCardinality() + emptyArrayCount; return std::make_unique<ArrayHistogram>(getHistogramFromData(scalarBuckets), - getTypeCountsFromData(scalarBuckets), + std::move(typeCounts), getHistogramFromData(arrayUniqueBuckets), - getHistogramFromData(arrayMinBuckets), - getHistogramFromData(arrayMaxBuckets), + std::move(arrayMinHist), + std::move(arrayMaxHist), std::move(arrayTypeCounts)); } @@ -477,7 +486,8 @@ TEST(CEHistogramTest, TestArrayHistogramOnAtomicPredicates) { {Value(6), 1 /* frequency */}, {Value(10), 1 /* frequency */}, }, - {{sbe::value::TypeTags::NumberInt32, 4}} // Array type counts. + {{sbe::value::TypeTags::NumberInt32, 4}}, // Array type counts. + 1 // 1 empty array. )); CEHistogramTester t(collName, collCardinality, collStats); @@ -536,17 +546,17 @@ TEST(CEHistogramTest, TestArrayHistogramOnCompositePredicates) { })); // An array histogram built on the following arrays with 35 occurrences of each: - // [[1, 2, 3], [5, 5, 5, 5, 5], [6], [], [8, 9, 10]] + // [{[1, 2, 3]: 35}, {[5, 5, 5, 5, 5]: 35}, {[6]: 35}, {[]: 35}, {[8, 9, 10]: 35}] collStats->addHistogram( "array", getArrayHistogramFromData( {/* No scalar buckets. */}, { // Array unique buckets. - {Value(1), 35 /* frequency */}, {Value(2), 35 /* frequency */, 35 /* range frequency */, 2 /* ndv */}, - {Value(5), 35 /* frequency */}, - {Value(6), 35 /* frequency */, 105 /* range frequency */, 4 /* ndv */}, + {Value(5), 35 /* frequency */, 35 /* range frequency */, 2 /* ndv */}, + {Value(6), 35 /* frequency */}, + {Value(10), 35 /* frequency */, 105 /* range frequency */, 3 /* ndv */}, }, { // Array min buckets. @@ -562,7 +572,8 @@ TEST(CEHistogramTest, TestArrayHistogramOnCompositePredicates) { {Value(6), 35 /* frequency */}, {Value(10), 35 /* frequency */}, }, - {{sbe::value::TypeTags::NumberInt32, collCardinality}} // Array type counts. + {{sbe::value::TypeTags::NumberInt32, collCardinality}}, // Array type counts. + 35 // 35 empty arrays )); collStats->addHistogram( @@ -572,7 +583,7 @@ TEST(CEHistogramTest, TestArrayHistogramOnCompositePredicates) { // [{[1, 2, 3]: 17}, {[5, 5, 5, 5, 5]: 17}, {[6]: 17}, {[]: 20}, {[8, 9, 10]: 17}] getArrayHistogramFromData( { - // Scalar buckets. + // Scalar buckets. These are half the number of values from the "scalar" histogram. {Value(1), 5 /* frequency */}, {Value(2), 5 /* frequency */}, {Value(3), 10 /* frequency */, 60 /* range frequency */, 5 /* ndv */}, @@ -580,9 +591,10 @@ TEST(CEHistogramTest, TestArrayHistogramOnCompositePredicates) { }, { // Array unique buckets. - {Value(1), 17 /* frequency */, 34 /* range frequency */, 3 /* ndv */}, - {Value(5), 17 /* frequency */}, - {Value(6), 17 /* frequency */, 51 /* range frequency */, 4 /* ndv */}, + {Value(2), 17 /* frequency */, 17 /* range frequency */, 2 /* ndv */}, + {Value(5), 17 /* frequency */, 17 /* range frequency */, 2 /* ndv */}, + {Value(6), 17 /* frequency */}, + {Value(10), 17 /* frequency */, 34 /* range frequency */, 3 /* ndv */}, }, { // Array min buckets. @@ -598,7 +610,8 @@ TEST(CEHistogramTest, TestArrayHistogramOnCompositePredicates) { {Value(6), 17 /* frequency */}, {Value(10), 17 /* frequency */}, }, - {{sbe::value::TypeTags::NumberInt32, 88}} // Array type counts. + {{sbe::value::TypeTags::NumberInt32, 88}}, // Array type counts. + 20 // 20 empty arrays. )); CEHistogramTester t(collName, collCardinality, collStats); @@ -647,8 +660,8 @@ TEST(CEHistogramTest, TestArrayHistogramOnCompositePredicates) { ASSERT_MATCH_CE(t, "{array: {$elemMatch: {$eq: 5}}, array: {$eq: 5}}", 35.0); // Test case with multiple predicates and ranges. - ASSERT_MATCH_CE(t, "{array: {$elemMatch: {$lt: 5}}, mixed: {$lt: 5}}", 68.75); - ASSERT_MATCH_CE(t, "{array: {$elemMatch: {$lt: 5}}, mixed: {$gt: 5}}", 28.19); + ASSERT_MATCH_CE(t, "{array: {$elemMatch: {$lt: 5}}, mixed: {$lt: 5}}", 55.959); + ASSERT_MATCH_CE(t, "{array: {$elemMatch: {$lt: 5}}, mixed: {$gt: 5}}", 25.4291); // Test multiple $elemMatches. ASSERT_MATCH_CE(t, "{scalar: {$elemMatch: {$eq: 5}}, array: {$elemMatch: {$eq: 5}}}", 0.0); @@ -666,8 +679,8 @@ TEST(CEHistogramTest, TestArrayHistogramOnCompositePredicates) { "{scalar: {$elemMatch: {$eq: 5}}, mixed: {$elemMatch: {$eq: 5}}, array: " "{$elemMatch: {$eq: 5}}}", 0.0); - ASSERT_MATCH_CE(t, "{array: {$elemMatch: {$lt: 5}}, mixed: {$elemMatch: {$lt: 5}}}", 32.09); - ASSERT_MATCH_CE(t, "{array: {$elemMatch: {$lt: 5}}, mixed: {$elemMatch: {$gt: 5}}}", 42.78); + ASSERT_MATCH_CE(t, "{array: {$elemMatch: {$lt: 5}}, mixed: {$elemMatch: {$lt: 5}}}", 24.124); + ASSERT_MATCH_CE(t, "{array: {$elemMatch: {$lt: 5}}, mixed: {$elemMatch: {$gt: 5}}}", 38.5984); // Verify that we still return an estimate of 0.0 for any $elemMatch predicate on a scalar // field when we have a non-multikey index. @@ -675,6 +688,34 @@ TEST(CEHistogramTest, TestArrayHistogramOnCompositePredicates) { makeIndexDefinition("scalar", CollationOp::Ascending, /* isMultiKey */ false)}}); ASSERT_MATCH_CE(t, "{scalar: {$elemMatch: {$eq: 5}}}", 0.0); ASSERT_MATCH_CE(t, "{scalar: {$elemMatch: {$gt: 1, $lt: 10}}}", 0.0); + + // Test how we estimate singular PathArr sargable predicate. + ASSERT_MATCH_CE_NODE(t, "{array: {$elemMatch: {}}}", 175.0, optimizer::SargableNode); + ASSERT_MATCH_CE_NODE(t, "{mixed: {$elemMatch: {}}}", 88.0, optimizer::SargableNode); + + // Take into account both empty and non-empty arrays. + auto makePathArrABT = [&](const std::string& path) { + const auto scanProjection = "scan_0"; + auto scanNode = make<ScanNode>(scanProjection, collName); + auto filterNode = make<FilterNode>( + make<EvalFilter>(make<PathGet>(path, make<PathArr>()), make<Variable>(scanProjection)), + std::move(scanNode)); + return make<RootNode>( + properties::ProjectionRequirement{ProjectionNameVector{scanProjection}}, + std::move(filterNode)); + }; + + // There are no arrays in the 'scalar' field. + ABT scalarABT = makePathArrABT("scalar"); + ASSERT_CE(t, scalarABT, 0.0); + + // About half the values of this field are arrays. + ABT mixedABT = makePathArrABT("mixed"); + ASSERT_CE(t, mixedABT, 88.0); + + // This field is always an array. + ABT arrayABT = makePathArrABT("array"); + ASSERT_CE(t, arrayABT, collCardinality); } TEST(CEHistogramTest, TestMixedElemMatchAndNonElemMatch) { diff --git a/src/mongo/db/query/ce/ce_test_utils.cpp b/src/mongo/db/query/ce/ce_test_utils.cpp index f4e2e4080ee..f91f1a3c6b3 100644 --- a/src/mongo/db/query/ce/ce_test_utils.cpp +++ b/src/mongo/db/query/ce/ce_test_utils.cpp @@ -52,22 +52,29 @@ CETester::CETester(std::string collName, addCollection(collName, collCard); } +template <class T> optimizer::CEType CETester::getMatchCE(const std::string& predicate) const { - return getCE("[{$match: " + predicate + "}]"); + return getCE<T>("[{$match: " + predicate + "}]"); } +template <class T> optimizer::CEType CETester::getCE(const std::string& pipeline) const { if constexpr (kCETestLogOnly) { - std::cout << "Query: " << pipeline << "\n"; + std::cout << "\n\nQuery: " << pipeline << "\n"; } // Construct ABT from pipeline and optimize. ABT abt = translatePipeline(pipeline, _collName); // Get cardinality estimate. - return getCE(abt); + return getCE<T>(abt); } +template optimizer::CEType CETester::getCE<optimizer::RootNode>(const std::string& query) const; +template optimizer::CEType CETester::getCE<optimizer::SargableNode>(const std::string& query) const; + + +template <class T> optimizer::CEType CETester::getCE(ABT& abt) const { if constexpr (kCETestLogOnly) { std::cout << ExplainGenerator::explainV2(abt) << std::endl; @@ -138,7 +145,7 @@ optimizer::CEType CETester::getCE(ABT& abt) const { } } - if (node.is<optimizer::RootNode>()) { + if (node.is<T>()) { // We want to return the cardinality for the entire ABT. outCard = memoCE; } @@ -153,12 +160,20 @@ optimizer::CEType CETester::getCE(ABT& abt) const { return outCard; } +template optimizer::CEType CETester::getCE<optimizer::RootNode>(ABT& abt) const; +template optimizer::CEType CETester::getCE<optimizer::SargableNode>(ABT& abt) const; +template optimizer::CEType CETester::getMatchCE<optimizer::RootNode>( + const std::string& matchPredicate) const; +template optimizer::CEType CETester::getMatchCE<optimizer::SargableNode>( + const std::string& matchPredicate) const; + ScanDefinition& CETester::getCollScanDefinition() { auto it = _metadata._scanDefs.find(_collName); invariant(it != _metadata._scanDefs.end()); return it->second; } + void CETester::setCollCard(double card) { auto& scanDef = getCollScanDefinition(); addCollection(_collName, card, scanDef.getIndexDefs()); diff --git a/src/mongo/db/query/ce/ce_test_utils.h b/src/mongo/db/query/ce/ce_test_utils.h index ccab2ba77ce..a62a3b4a3b8 100644 --- a/src/mongo/db/query/ce/ce_test_utils.h +++ b/src/mongo/db/query/ce/ce_test_utils.h @@ -86,7 +86,8 @@ const OptPhaseManager::PhaseSet kNoOptPhaseSet{}; (str::stream() << "{" << field << ": {$elemMatch: " << predicate << "}}") // This macro verifies the cardinality of a pipeline or an input ABT. -#define ASSERT_CE(ce, pipeline, expectedCE) _ASSERT_CE(ce.getCE(pipeline), (expectedCE)) +#define ASSERT_CE(ce, pipeline, expectedCE) \ + _ASSERT_CE(ce.getCE<optimizer::RootNode>(pipeline), (expectedCE)) // This macro does the same as above but also sets the collection cardinality. #define ASSERT_CE_CARD(ce, pipeline, expectedCE, collCard) \ @@ -95,7 +96,10 @@ const OptPhaseManager::PhaseSet kNoOptPhaseSet{}; // This macro verifies the cardinality of a pipeline with a single $match predicate. #define ASSERT_MATCH_CE(ce, predicate, expectedCE) \ - _ASSERT_CE(ce.getMatchCE(predicate), (expectedCE)) + _ASSERT_CE(ce.getMatchCE<optimizer::RootNode>(predicate), (expectedCE)) + +#define ASSERT_MATCH_CE_NODE(ce, predicate, expectedCE, nodeType) \ + _ASSERT_CE(ce.getMatchCE<nodeType>(predicate), (expectedCE)) // This macro does the same as above but also sets the collection cardinality. #define ASSERT_MATCH_CE_CARD(ce, predicate, expectedCE, collCard) \ @@ -124,16 +128,19 @@ public: /** * Returns the estimated cardinality of a given 'matchPredicate'. */ + template <class T> CEType getMatchCE(const std::string& matchPredicate) const; /** * Returns the estimated cardinality of a given 'pipeline'. */ + template <class T> CEType getCE(const std::string& pipeline) const; /** * Returns the estimated cardinality of a given 'abt'. */ + template <class T> CEType getCE(ABT& abt) const; /** diff --git a/src/mongo/db/query/ce/maxdiff_test_utils.cpp b/src/mongo/db/query/ce/maxdiff_test_utils.cpp index fdd42ff627b..96f86914633 100644 --- a/src/mongo/db/query/ce/maxdiff_test_utils.cpp +++ b/src/mongo/db/query/ce/maxdiff_test_utils.cpp @@ -137,6 +137,9 @@ std::string plotArrayEstimator(const ArrayHistogram& estimator, const std::strin os << tagCount.first << "=" << tagCount.second << " "; } } + if (estimator.isArray()) { + os << "\nEmpty array count: " << estimator.getEmptyArrayCount(); + } os << "\n"; return os.str(); diff --git a/src/mongo/db/query/ce/scalar_histogram.h b/src/mongo/db/query/ce/scalar_histogram.h index 203b9bd7d96..3ce391f20d2 100644 --- a/src/mongo/db/query/ce/scalar_histogram.h +++ b/src/mongo/db/query/ce/scalar_histogram.h @@ -84,6 +84,13 @@ public: const sbe::value::Array& getBounds() const; const std::vector<Bucket>& getBuckets() const; + // Return the total number of histogrammed values. + const size_t getCardinality() const { + if (_buckets.empty()) { + return 0.0; + } + return _buckets.back()._cumulativeFreq; + } bool empty() const { return _buckets.empty(); |