diff options
Diffstat (limited to 'src/mongo/db')
-rw-r--r-- | src/mongo/db/query/ce/SConscript | 10 | ||||
-rw-r--r-- | src/mongo/db/query/ce/ce_array_data_test.cpp | 285 | ||||
-rw-r--r-- | src/mongo/db/query/ce/ce_interpolation_test.cpp | 65 | ||||
-rw-r--r-- | src/mongo/db/query/ce/ce_test_utils.cpp | 27 | ||||
-rw-r--r-- | src/mongo/db/query/ce/ce_test_utils.h | 21 | ||||
-rw-r--r-- | src/mongo/db/query/ce/histogram_estimation.cpp | 85 | ||||
-rw-r--r-- | src/mongo/db/query/ce/histogram_estimation.h | 4 |
7 files changed, 433 insertions, 64 deletions
diff --git a/src/mongo/db/query/ce/SConscript b/src/mongo/db/query/ce/SConscript index 5439f5a078a..db504072718 100644 --- a/src/mongo/db/query/ce/SConscript +++ b/src/mongo/db/query/ce/SConscript @@ -85,6 +85,16 @@ env.CppUnitTest( ) env.CppUnitTest( + target="ce_array_data_test", + source=[ + "ce_array_data_test.cpp", + ], + LIBDEPS=[ + 'ce_test_utils', + ], +) + +env.CppUnitTest( target='stats_cache_loader_test', source=[ 'stats_cache_loader_test.cpp', diff --git a/src/mongo/db/query/ce/ce_array_data_test.cpp b/src/mongo/db/query/ce/ce_array_data_test.cpp new file mode 100644 index 00000000000..b8e7d203c59 --- /dev/null +++ b/src/mongo/db/query/ce/ce_array_data_test.cpp @@ -0,0 +1,285 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include <vector> + +#include "mongo/db/exec/sbe/values/value.h" +#include "mongo/db/query/ce/array_histogram.h" +#include "mongo/db/query/ce/ce_test_utils.h" +#include "mongo/db/query/ce/histogram_estimation.h" +#include "mongo/db/query/query_test_service_context.h" +#include "mongo/unittest/unittest.h" + +namespace mongo::ce { +namespace { + +using namespace sbe; + +/** + * Structure representing a range query and its estimated and actual cardinalities. + * Used to record hand-crafted queries over a pre-generated dataset. + */ +struct QuerySpec { + // Low bound of the query range. + int32_t low; + // Upper bound of the query range. + int32_t high; + // Estimated cardinality of $match query. + double estMatch; + // Actual cardinality of $match query. + double actMatch; + // Estimated cardinality of $elemMatch query. + double estElemMatch; + // Actual cardinality of $elemMatch query. + double actElemMatch; +}; + +static std::pair<double, double> computeErrors(size_t actualCard, double estimatedCard) { + double error = estimatedCard - actualCard; + double relError = (actualCard == 0) ? (estimatedCard == 0 ? 0.0 : -1.0) : error / actualCard; + return std::make_pair(error, relError); +} + +static std::string serializeQuery(QuerySpec& q, bool isElemMatch) { + std::ostringstream os; + os << "{$match: {a: {"; + if (isElemMatch) { + os << "$elemMatch: {"; + } + os << "$gt: " << q.low; + os << ", $lt: " << q.high; + if (isElemMatch) { + os << "}"; + } + os << "}}}\n"; + return os.str(); +} + +static std::string computeRMSE(std::vector<QuerySpec>& querySet, bool isElemMatch) { + double rms = 0.0, relRms = 0.0, meanAbsSelErr = 0.0; + size_t trialSize = querySet.size(); + const size_t dataSize = 1000; + + std::ostringstream os; + os << "\nQueries:\n"; + for (auto& q : querySet) { + double estimatedCard = isElemMatch ? q.estElemMatch : q.estMatch; + double actualCard = isElemMatch ? q.actElemMatch : q.actMatch; + + auto [error, relError] = computeErrors(actualCard, estimatedCard); + rms += error * error; + relRms += relError * relError; + meanAbsSelErr += std::abs(error); + os << serializeQuery(q, isElemMatch); + os << "Estimated: " << estimatedCard << " Actual " << actualCard << " (Error: " << error + << " RelError: " << relError << ")\n\n"; + } + rms = std::sqrt(rms / trialSize); + relRms = std::sqrt(relRms / trialSize); + meanAbsSelErr /= (trialSize * dataSize); + + os << "=====" << (isElemMatch ? " ElemMatch errors: " : "Match errors:") << "=====\n"; + os << "RMSE : " << rms << " RelRMSE : " << relRms + << " MeanAbsSelectivityError: " << meanAbsSelErr << std::endl; + return os.str(); +} + +TEST(EstimatorArrayDataTest, Histogram1000ArraysSmall10Buckets) { + std::vector<BucketData> scalarData{{}}; + const ScalarHistogram scalarHist = createHistogram(scalarData); + + std::vector<BucketData> minData{{0, 5.0, 0.0, 0.0}, + {553, 2.0, 935.0, 303.0}, + {591, 4.0, 2.0, 1.0}, + {656, 2.0, 21.0, 12.0}, + {678, 3.0, 6.0, 3.0}, + {693, 2.0, 1.0, 1.0}, + {730, 1.0, 6.0, 3.0}, + {788, 1.0, 2.0, 2.0}, + {847, 2.0, 4.0, 1.0}, + {867, 1.0, 0.0, 0.0}}; + + const ScalarHistogram aMinHist = createHistogram(minData); + + std::vector<BucketData> maxData{{117, 1.0, 0.0, 0.0}, + {210, 1.0, 1.0, 1.0}, + {591, 1.0, 8.0, 4.0}, + {656, 1.0, 0.0, 0.0}, + {353, 2.0, 18.0, 9.0}, + {610, 5.0, 125.0, 65.0}, + {733, 8.0, 134.0, 53.0}, + {768, 6.0, 50.0, 16.0}, + {957, 8.0, 448.0, 137.0}, + {1000, 7.0, 176.0, 40.0}}; + + const ScalarHistogram aMaxHist = createHistogram(maxData); + + std::vector<BucketData> uniqueData{{0, 5.0, 0.0, 0.0}, + {16, 11.0, 74.0, 13.0}, + {192, 13.0, 698.0, 148.0}, + {271, 9.0, 312.0, 70.0}, + {670, 7.0, 1545.0, 355.0}, + {712, 9.0, 159.0, 32.0}, + {776, 11.0, 247.0, 54.0}, + {869, 9.0, 361.0, 85.0}, + {957, 8.0, 323.0, 76.0}, + {1000, 7.0, 188.0, 40.0}}; + + const ScalarHistogram aUniqueHist = createHistogram(uniqueData); + + std::map<value::TypeTags, size_t> typeCounts; + std::map<value::TypeTags, size_t> arrayTypeCounts; + // Dataset generated as 1000 arrays of size between 3 to 5. + typeCounts.insert({value::TypeTags::Array, 1000}); + arrayTypeCounts.insert({value::TypeTags::NumberInt32, 3996}); + + const ArrayHistogram arrHist( + scalarHist, typeCounts, aUniqueHist, aMinHist, aMaxHist, arrayTypeCounts); + + std::vector<QuerySpec> querySet{{10, 20, 35.7, 93.0, 37.8, 39.0}, + {10, 60, 103.3, 240.0, 158.0, 196.0}, + {320, 330, 554.5, 746.0, 26.0, 30.0}, + {320, 400, 672.9, 832.0, 231.5, 298.0}, + {980, 990, 88.8, 101.0, 36.5, 41.0}, + {970, 1050, 129.7, 141.0, 129.7, 141.0}}; + + for (const auto q : querySet) { + // $match query, includeScalar = true. + double estCard = estimateCardRange(arrHist, + true, + false, + value::TypeTags::NumberInt32, + sbe::value::bitcastFrom<int32_t>(q.low), + false, + value::TypeTags::NumberInt32, + sbe::value::bitcastFrom<int32_t>(q.high)); + ASSERT_APPROX_EQUAL(estCard, q.estMatch, 0.1); + + // $elemMatch query, includeScalar = false. + estCard = estimateCardRange(arrHist, + false, + false, + value::TypeTags::NumberInt32, + sbe::value::bitcastFrom<int32_t>(q.low), + false, + value::TypeTags::NumberInt32, + sbe::value::bitcastFrom<int32_t>(q.high)); + ASSERT_APPROX_EQUAL(estCard, q.estElemMatch, 0.1); + } + std::cout << computeRMSE(querySet, false) << std::endl; + std::cout << computeRMSE(querySet, true) << std::endl; +} + +TEST(EstimatorArrayDataTest, Histogram1000ArraysLarge10Buckets) { + std::vector<BucketData> scalarData{{}}; + const ScalarHistogram scalarHist = createHistogram(scalarData); + + std::vector<BucketData> minData{{0, 2.0, 0.0, 0.0}, + {1324, 4.0, 925.0, 408.0}, + {1389, 5.0, 7.0, 5.0}, + {1521, 2.0, 16.0, 10.0}, + {1621, 2.0, 13.0, 7.0}, + {1852, 5.0, 10.0, 9.0}, + {1864, 2.0, 0.0, 0.0}, + {1971, 1.0, 3.0, 3.0}, + {2062, 2.0, 0.0, 0.0}, + {2873, 1.0, 0.0, 0.0}}; + + const ScalarHistogram aMinHist = createHistogram(minData); + + std::vector<BucketData> maxData{{2261, 1.0, 0.0, 0.0}, + {2673, 1.0, 0.0, 0.0}, + {2930, 1.0, 1.0, 1.0}, + {3048, 2.0, 2.0, 2.0}, + {3128, 3.0, 1.0, 1.0}, + {3281, 2.0, 0.0, 0.0}, + {3378, 2.0, 7.0, 5.0}, + {3453, 4.0, 2.0, 2.0}, + {3763, 6.0, 44.0, 23.0}, + {5000, 1.0, 920.0, 416.0}}; + + const ScalarHistogram aMaxHist = createHistogram(maxData); + + std::vector<BucketData> uniqueData{{0, 2.0, 0.0, 0.0}, + {1106, 9.0, 1970.0, 704.0}, + {1542, 11.0, 736.0, 280.0}, + {3267, 6.0, 3141.0, 1097.0}, + {3531, 6.0, 461.0, 175.0}, + {3570, 7.0, 48.0, 20.0}, + {4573, 8.0, 1851.0, 656.0}, + {4619, 6.0, 65.0, 30.0}, + {4782, 5.0, 265.0, 99.0}, + {5000, 1.0, 342.0, 135.0}}; + + const ScalarHistogram aUniqueHist = createHistogram(uniqueData); + + std::map<value::TypeTags, size_t> typeCounts; + std::map<value::TypeTags, size_t> arrayTypeCounts; + // Dataset generated as 1000 arrays of size between 8 to 10. + typeCounts.insert({value::TypeTags::Array, 1000}); + arrayTypeCounts.insert({value::TypeTags::NumberInt32, 8940}); + + const ArrayHistogram arrHist( + scalarHist, typeCounts, aUniqueHist, aMinHist, aMaxHist, arrayTypeCounts); + + std::vector<QuerySpec> querySet{{10, 20, 13.7, 39.0, 9.7, 26.0}, + {10, 60, 41.6, 108.0, 55.7, 101.0}, + {1000, 1010, 705.4, 861.0, 9.7, 7.0}, + {1000, 1050, 733.3, 884.0, 55.7, 87.0}, + {3250, 3300, 988.0, 988.0, 59.3, 86.0}, + {4970, 4980, 23.3, 53.0, 8.5, 16.0}}; + + for (const auto q : querySet) { + // $match query, includeScalar = true. + double estCard = estimateCardRange(arrHist, + true, + false, + value::TypeTags::NumberInt32, + sbe::value::bitcastFrom<int32_t>(q.low), + false, + value::TypeTags::NumberInt32, + sbe::value::bitcastFrom<int32_t>(q.high)); + ASSERT_APPROX_EQUAL(estCard, q.estMatch, 0.1); + + // $elemMatch query, includeScalar = false. + estCard = estimateCardRange(arrHist, + false, + false, + value::TypeTags::NumberInt32, + sbe::value::bitcastFrom<int32_t>(q.low), + false, + value::TypeTags::NumberInt32, + sbe::value::bitcastFrom<int32_t>(q.high)); + ASSERT_APPROX_EQUAL(estCard, q.estElemMatch, 0.1); + } + std::cout << computeRMSE(querySet, false) << std::endl; + std::cout << computeRMSE(querySet, true) << std::endl; +} +} // namespace +} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/ce_interpolation_test.cpp b/src/mongo/db/query/ce/ce_interpolation_test.cpp index 3e79374b627..6bbda0d0c34 100644 --- a/src/mongo/db/query/ce/ce_interpolation_test.cpp +++ b/src/mongo/db/query/ce/ce_interpolation_test.cpp @@ -44,47 +44,6 @@ double estimateIntValCard(const ScalarHistogram& hist, const int v, const Estima return estimate(hist, tag, val, type).card; }; -struct BucketData { - Value _v; - double _equalFreq; - double _rangeFreq; - double _ndv; - - BucketData(Value v, double equalFreq, double rangeFreq, double ndv) - : _v(v), _equalFreq(equalFreq), _rangeFreq(rangeFreq), _ndv(ndv) {} - BucketData(const std::string& v, double equalFreq, double rangeFreq, double ndv) - : BucketData(Value(v), equalFreq, rangeFreq, ndv) {} - BucketData(int v, double equalFreq, double rangeFreq, double ndv) - : BucketData(Value(v), equalFreq, rangeFreq, ndv) {} -}; - -ScalarHistogram createHistogram(const std::vector<BucketData>& data) { - sbe::value::Array array; - for (const auto& item : data) { - const auto [tag, val] = stage_builder::makeValue(item._v); - array.push_back(tag, val); - } - - value::Array bounds; - std::vector<Bucket> buckets; - - double cumulativeFreq = 0.0; - double cumulativeNDV = 0.0; - - for (size_t i = 0; i < data.size(); i++) { - const auto [tag, val] = array.getAt(i); - bounds.push_back(tag, val); - - const auto& item = data.at(i); - cumulativeFreq += item._equalFreq + item._rangeFreq; - cumulativeNDV += item._ndv + 1.0; - buckets.emplace_back( - item._equalFreq, item._rangeFreq, cumulativeFreq, item._ndv, cumulativeNDV); - } - - return {std::move(bounds), std::move(buckets)}; -} - TEST(EstimatorTest, ManualHistogram) { std::vector<BucketData> data{{0, 1.0, 1.0, 1.0}, {10, 1.0, 10.0, 5.0}, @@ -391,29 +350,29 @@ TEST(EstimatorTest, UniformIntArrayOnlyEstimate) { value::TypeTags highTag = value::TypeTags::NumberInt64; value::Value highVal = 600; - // Test interpolation for query: [{$match: {a: {$gt: 500, $lt: 600}}}]. + // Test interpolation for query: [{$match: {a: {$elemMatch: {$gt: 500, $lt: 600}}}}]. double expectedCard = estimateCardRange(arrHist, false, false, lowTag, lowVal, false, highTag, highVal); - ASSERT_APPROX_EQUAL(8.63, expectedCard, 0.1); + ASSERT_APPROX_EQUAL(27.0, expectedCard, 0.1); // actual 21. - // Test interpolation for query: [{$match: {a: {$elemMatch: {$gt: 500, $lt: 600}}}}]. - // Note: this should be the same as above, since we have no scalars. + // Test interpolation for query: [{$match: {a: {$gt: 500, $lt: 600}}}]. + // Note: although there are no scalars, the estimate is different than the + // above since we use different formulas. expectedCard = estimateCardRange(arrHist, true, false, lowTag, lowVal, false, highTag, highVal); - ASSERT_APPROX_EQUAL(8.63, expectedCard, 0.1); + ASSERT_APPROX_EQUAL(92.0, expectedCard, 0.1); // actual 92. // Query at the end of the domain: more precise estimates from ArrayMin, ArrayMax histograms. lowVal = 10; highVal = 110; - // Test interpolation for query: [{$match: {a: {$gt: 10, $lt: 110}}}]. + // Test interpolation for query: [{$match: {a: {$elemMatch: {$gt: 10, $lt: 110}}}}]. expectedCard = estimateCardRange(arrHist, false, false, lowTag, lowVal, false, highTag, highVal); - ASSERT_APPROX_EQUAL(24.1, expectedCard, 0.1); + ASSERT_APPROX_EQUAL(24.1, expectedCard, 0.1); // actual 29. - // Test interpolation for query: [{$match: {a: {$elemMatch: {$gt: 500, $lt: 600}}}}]. - // Note: this should be the same as above, since we have no scalars. + // Test interpolation for query: [{$match: {a: {$gt: 10, $lt: 110}}}]. expectedCard = estimateCardRange(arrHist, true, false, lowTag, lowVal, false, highTag, highVal); - ASSERT_APPROX_EQUAL(24.1, expectedCard, 0.1); + ASSERT_APPROX_EQUAL(27.8, expectedCard, 0.1); // actual 31. } TEST(EstimatorTest, UniformIntMixedArrayEstimate) { @@ -465,12 +424,12 @@ TEST(EstimatorTest, UniformIntMixedArrayEstimate) { // Test interpolation for query: [{$match: {a: {$gt: 500, $lt: 550}}}]. double expectedCard = estimateCardRange(arrHist, true, false, lowTag, lowVal, false, highTag, highVal); - ASSERT_APPROX_EQUAL(9.8, expectedCard, 0.1); // Actual: 94. + ASSERT_APPROX_EQUAL(90.9, expectedCard, 0.1); // Actual: 94. // Test interpolation for query: [{$match: {a: {$elemMatch: {$gt: 500, $lt: 550}}}}]. expectedCard = estimateCardRange(arrHist, false, false, lowTag, lowVal, false, highTag, highVal); - ASSERT_APPROX_EQUAL(5.6, expectedCard, 0.1); // Actual: 8. + ASSERT_APPROX_EQUAL(11.0, expectedCard, 0.1); // Actual: 8. } } // namespace diff --git a/src/mongo/db/query/ce/ce_test_utils.cpp b/src/mongo/db/query/ce/ce_test_utils.cpp index 6f68cc04304..fc34dd147ac 100644 --- a/src/mongo/db/query/ce/ce_test_utils.cpp +++ b/src/mongo/db/query/ce/ce_test_utils.cpp @@ -37,6 +37,7 @@ #include "mongo/db/query/optimizer/explain.h" #include "mongo/db/query/optimizer/opt_phase_manager.h" #include "mongo/db/query/optimizer/utils/unit_test_utils.h" +#include "mongo/db/query/sbe_stage_builder_helpers.h" #include "mongo/unittest/unittest.h" namespace mongo::ce { @@ -136,4 +137,30 @@ optimizer::CEType CETester::getCE(ABT& abt) const { return outCard; } +ScalarHistogram createHistogram(const std::vector<BucketData>& data) { + sbe::value::Array array; + for (const auto& item : data) { + const auto [tag, val] = stage_builder::makeValue(item._v); + array.push_back(tag, val); + } + + value::Array bounds; + std::vector<Bucket> buckets; + + double cumulativeFreq = 0.0; + double cumulativeNDV = 0.0; + + for (size_t i = 0; i < data.size(); i++) { + const auto [tag, val] = array.getAt(i); + bounds.push_back(tag, val); + + const auto& item = data.at(i); + cumulativeFreq += item._equalFreq + item._rangeFreq; + cumulativeNDV += item._ndv + 1.0; + buckets.emplace_back( + item._equalFreq, item._rangeFreq, cumulativeFreq, item._ndv, cumulativeNDV); + } + + return {std::move(bounds), std::move(buckets)}; +} } // namespace mongo::ce diff --git a/src/mongo/db/query/ce/ce_test_utils.h b/src/mongo/db/query/ce/ce_test_utils.h index 85ece3a1ee6..8d38b160a83 100644 --- a/src/mongo/db/query/ce/ce_test_utils.h +++ b/src/mongo/db/query/ce/ce_test_utils.h @@ -32,6 +32,7 @@ #include <cstddef> #include <sys/types.h> +#include "mongo/db/query/ce/scalar_histogram.h" #include "mongo/db/query/optimizer/cascades/interfaces.h" #include "mongo/db/query/optimizer/opt_phase_manager.h" @@ -49,6 +50,7 @@ class CEInterface; namespace ce { using namespace optimizer; +using namespace sbe; // Enable this flag to log all estimates, and let all tests pass. constexpr bool kCETestLogOnly = false; @@ -128,5 +130,24 @@ private: mutable PrefixId _prefixId; }; +/** + * Test utility for helping with creation of manual histograms in the unit tests. + */ +struct BucketData { + Value _v; + double _equalFreq; + double _rangeFreq; + double _ndv; + + BucketData(Value v, double equalFreq, double rangeFreq, double ndv) + : _v(v), _equalFreq(equalFreq), _rangeFreq(rangeFreq), _ndv(ndv) {} + BucketData(const std::string& v, double equalFreq, double rangeFreq, double ndv) + : BucketData(Value(v), equalFreq, rangeFreq, ndv) {} + BucketData(int v, double equalFreq, double rangeFreq, double ndv) + : BucketData(Value(v), equalFreq, rangeFreq, ndv) {} +}; + +ScalarHistogram createHistogram(const std::vector<BucketData>& data); + } // namespace ce } // namespace mongo diff --git a/src/mongo/db/query/ce/histogram_estimation.cpp b/src/mongo/db/query/ce/histogram_estimation.cpp index 948729ee1b4..4fd2d8376db 100644 --- a/src/mongo/db/query/ce/histogram_estimation.cpp +++ b/src/mongo/db/query/ce/histogram_estimation.cpp @@ -266,6 +266,29 @@ static EstimationResult estimateRange(const ScalarHistogram& histogram, return highEstimate - lowEstimate; } +/** + * Compute an estimate for range query on array data with formula: + * Card(ArrayMin(a < valHigh)) - Card(ArrayMax(a < valLow)) + */ +static EstimationResult estimateRangeQueryOnArray(const ScalarHistogram& histogramAmin, + const ScalarHistogram& histogramAmax, + bool lowInclusive, + value::TypeTags tagLow, + value::Value valLow, + bool highInclusive, + value::TypeTags tagHigh, + value::Value valHigh) { + const EstimationType highType = + highInclusive ? EstimationType::kLessOrEqual : EstimationType::kLess; + const EstimationResult highEstimate = estimate(histogramAmin, tagHigh, valHigh, highType); + + const EstimationType lowType = + lowInclusive ? EstimationType::kLess : EstimationType::kLessOrEqual; + const EstimationResult lowEstimate = estimate(histogramAmax, tagLow, valLow, lowType); + + return highEstimate - lowEstimate; +} + double estimateCardRange(const ArrayHistogram& ah, bool includeScalar, /* Define lower bound. */ @@ -275,7 +298,8 @@ double estimateCardRange(const ArrayHistogram& ah, /* Define upper bound. */ bool highInclusive, value::TypeTags tagHigh, - value::Value valHigh) { + value::Value valHigh, + EstimationAlgo estimationAlgo) { uassert(6695701, "Low bound must not be higher than high", compareValues3w(tagLow, valLow, tagHigh, valHigh) <= 0); @@ -287,16 +311,57 @@ double estimateCardRange(const ArrayHistogram& ah, double result = 0.0; if (ah.isArray()) { - const auto arrayMinEst = estRange(ah.getArrayMin()); - const auto arrayMaxEst = estRange(ah.getArrayMax()); - const auto arrayUniqueEst = estRange(ah.getArrayUnique()); - - // TODO: should we consider diving by sqrt(ndv) or just by ndv? - const double arrayUniqueDensity = (arrayUniqueEst.ndv == 0.0) - ? 0.0 - : (arrayUniqueEst.card / std::sqrt(arrayUniqueEst.ndv)); - result = std::max(std::max(arrayMinEst.card, arrayMaxEst.card), arrayUniqueDensity); + if (includeScalar) { + // Range query on array data. + const EstimationResult rangeCardOnArray = estimateRangeQueryOnArray(ah.getArrayMin(), + ah.getArrayMax(), + lowInclusive, + tagLow, + valLow, + highInclusive, + tagHigh, + valHigh); + result += rangeCardOnArray.card; + } else { + // $elemMatch query on array data. + const auto arrayMinEst = estRange(ah.getArrayMin()); + const auto arrayMaxEst = estRange(ah.getArrayMax()); + const auto arrayUniqueEst = estRange(ah.getArrayUnique()); + + const double totalArrayCount = getTotals(ah.getArrayMin()).card; + uassert( + 6715101, "Array histograms should contain at least one array", totalArrayCount > 0); + switch (estimationAlgo) { + case EstimationAlgo::HistogramV1: { + const double arrayUniqueDensity = (arrayUniqueEst.ndv == 0.0) + ? 0.0 + : (arrayUniqueEst.card / std::sqrt(arrayUniqueEst.ndv)); + result = + std::max(std::max(arrayMinEst.card, arrayMaxEst.card), arrayUniqueDensity); + break; + } + case EstimationAlgo::HistogramV2: { + const double avgArraySize = + getTotals(ah.getArrayUnique()).card / totalArrayCount; + const double adjustedUniqueCard = (avgArraySize == 0.0) + ? 0.0 + : std::min(arrayUniqueEst.card / pow(avgArraySize, 0.2), totalArrayCount); + result = + std::max(std::max(arrayMinEst.card, arrayMaxEst.card), adjustedUniqueCard); + break; + } + case EstimationAlgo::HistogramV3: { + const double adjustedUniqueCard = + 0.85 * std::min(arrayUniqueEst.card, totalArrayCount); + result = + std::max(std::max(arrayMinEst.card, arrayMaxEst.card), adjustedUniqueCard); + break; + } + default: + MONGO_UNREACHABLE; + } + } } if (includeScalar) { diff --git a/src/mongo/db/query/ce/histogram_estimation.h b/src/mongo/db/query/ce/histogram_estimation.h index 0e554993983..3225dab1272 100644 --- a/src/mongo/db/query/ce/histogram_estimation.h +++ b/src/mongo/db/query/ce/histogram_estimation.h @@ -36,6 +36,7 @@ namespace mongo::ce { enum class EstimationType { kEqual, kLess, kLessOrEqual, kGreater, kGreaterOrEqual }; +enum class EstimationAlgo { HistogramV1, HistogramV2, HistogramV3 }; const stdx::unordered_map<EstimationType, std::string> estimationTypeName = { {EstimationType::kEqual, "eq"}, @@ -95,6 +96,7 @@ double estimateCardRange(const ArrayHistogram& ah, /* Define upper bound. */ bool highInclusive, sbe::value::TypeTags tagHigh, - sbe::value::Value valHigh); + sbe::value::Value valHigh, + EstimationAlgo estAlgo = EstimationAlgo::HistogramV2); } // namespace mongo::ce |