diff options
author | Alya Berciu <alya.berciu@mongodb.com> | 2022-09-02 12:40:26 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2022-09-02 13:39:17 +0000 |
commit | 8581b4968646ea09b703d985c22b08cc01102597 (patch) | |
tree | a916997e4044cb0223a4ef483c4ec805353989b6 /src/mongo/db | |
parent | 26d222d66fbb97152b283a63f8e6393171c8710f (diff) | |
download | mongo-8581b4968646ea09b703d985c22b08cc01102597.tar.gz |
SERVER-68445 Integrate histogram bucket interpolation
Co-authored-by: Milena Ivanova <milena.ivanova@mongodb.com>
Co-authored-by: Alya Berciu <alya.berciu@mongodb.com>
Diffstat (limited to 'src/mongo/db')
-rw-r--r-- | src/mongo/db/query/ce/SConscript | 10 | ||||
-rw-r--r-- | src/mongo/db/query/ce/array_histogram.cpp | 12 | ||||
-rw-r--r-- | src/mongo/db/query/ce/array_histogram.h | 16 | ||||
-rw-r--r-- | src/mongo/db/query/ce/ce_histogram_test.cpp | 48 | ||||
-rw-r--r-- | src/mongo/db/query/ce/ce_interpolation_test.cpp | 480 | ||||
-rw-r--r-- | src/mongo/db/query/ce/ce_test_utils.cpp | 12 | ||||
-rw-r--r-- | src/mongo/db/query/ce/ce_test_utils.h | 7 | ||||
-rw-r--r-- | src/mongo/db/query/ce/histogram_estimation.cpp | 146 | ||||
-rw-r--r-- | src/mongo/db/query/ce/histogram_estimation.h | 25 |
9 files changed, 686 insertions, 70 deletions
diff --git a/src/mongo/db/query/ce/SConscript b/src/mongo/db/query/ce/SConscript index edb7b808623..ae0e61a8f92 100644 --- a/src/mongo/db/query/ce/SConscript +++ b/src/mongo/db/query/ce/SConscript @@ -63,6 +63,16 @@ env.CppUnitTest( ) env.CppUnitTest( + target="ce_interpolation_test", + source=[ + "ce_interpolation_test.cpp", + ], + LIBDEPS=[ + 'ce_test_utils', + ], +) + +env.CppUnitTest( target="ce_heuristic_test", source=[ "ce_heuristic_test.cpp", diff --git a/src/mongo/db/query/ce/array_histogram.cpp b/src/mongo/db/query/ce/array_histogram.cpp index 8e44c2973cf..453e31ac047 100644 --- a/src/mongo/db/query/ce/array_histogram.cpp +++ b/src/mongo/db/query/ce/array_histogram.cpp @@ -35,11 +35,11 @@ using namespace sbe; ArrayHistogram::ArrayHistogram() : ArrayHistogram(ScalarHistogram(), {}) {} ArrayHistogram::ArrayHistogram(ScalarHistogram scalar, - std::map<value::TypeTags, size_t> typeCounts, + TypeCounts typeCounts, ScalarHistogram arrayUnique, ScalarHistogram arrayMin, ScalarHistogram arrayMax, - std::map<value::TypeTags, size_t> arrayTypeCounts) + TypeCounts arrayTypeCounts) : _scalar(std::move(scalar)), _typeCounts(std::move(typeCounts)), _arrayUnique(std::move(arrayUnique)), @@ -49,7 +49,7 @@ ArrayHistogram::ArrayHistogram(ScalarHistogram scalar, invariant(isArray()); } -ArrayHistogram::ArrayHistogram(ScalarHistogram scalar, std::map<value::TypeTags, size_t> typeCounts) +ArrayHistogram::ArrayHistogram(ScalarHistogram scalar, TypeCounts typeCounts) : _scalar(std::move(scalar)), _typeCounts(std::move(typeCounts)), _arrayUnique(boost::none), @@ -63,7 +63,7 @@ bool ArrayHistogram::isArray() const { return _arrayUnique && _arrayMin && _arrayMax && _arrayTypeCounts; } -std::string typeCountsToString(const std::map<value::TypeTags, size_t>& typeCounts) { +std::string typeCountsToString(const TypeCounts& typeCounts) { std::ostringstream os; os << "{"; bool first = true; @@ -111,11 +111,11 @@ const ScalarHistogram& ArrayHistogram::getArrayMax() const { return 
*_arrayMax; } -const std::map<value::TypeTags, size_t>& ArrayHistogram::getTypeCounts() const { +const TypeCounts& ArrayHistogram::getTypeCounts() const { return _typeCounts; } -const std::map<value::TypeTags, size_t>& ArrayHistogram::getArrayTypeCounts() const { +const TypeCounts& ArrayHistogram::getArrayTypeCounts() const { invariant(isArray()); return *_arrayTypeCounts; } diff --git a/src/mongo/db/query/ce/array_histogram.h b/src/mongo/db/query/ce/array_histogram.h index 9acdeb6b7f7..2a6e9af3383 100644 --- a/src/mongo/db/query/ce/array_histogram.h +++ b/src/mongo/db/query/ce/array_histogram.h @@ -36,21 +36,23 @@ namespace mongo::ce { +using TypeCounts = std::map<sbe::value::TypeTags, size_t>; + class ArrayHistogram { public: // Constructs an empty scalar histogram. ArrayHistogram(); // Constructor for scalar field histograms. - ArrayHistogram(ScalarHistogram scalar, std::map<sbe::value::TypeTags, size_t> typeCounts); + ArrayHistogram(ScalarHistogram scalar, TypeCounts typeCounts); // Constructor for array field histograms. We have to initialize all array fields in this case. ArrayHistogram(ScalarHistogram scalar, - std::map<sbe::value::TypeTags, size_t> typeCounts, + TypeCounts typeCounts, ScalarHistogram arrayUnique, ScalarHistogram arrayMin, ScalarHistogram arrayMax, - std::map<sbe::value::TypeTags, size_t> arrayTypeCounts); + TypeCounts arrayTypeCounts); // ArrayHistogram is neither copy-constructible nor copy-assignable. ArrayHistogram(const ArrayHistogram&) = delete; @@ -69,15 +71,15 @@ public: const ScalarHistogram& getArrayUnique() const; const ScalarHistogram& getArrayMin() const; const ScalarHistogram& getArrayMax() const; - const std::map<sbe::value::TypeTags, size_t>& getTypeCounts() const; - const std::map<sbe::value::TypeTags, size_t>& getArrayTypeCounts() const; + const TypeCounts& getTypeCounts() const; + const TypeCounts& getArrayTypeCounts() const; private: /* ScalarHistogram fields for all paths. 
*/ // Contains values which appeared originally as scalars on the path. ScalarHistogram _scalar; - std::map<sbe::value::TypeTags, size_t> _typeCounts; + TypeCounts _typeCounts; /* ScalarHistogram fields for array paths (only initialized if arrays are present). */ @@ -87,7 +89,7 @@ private: boost::optional<ScalarHistogram> _arrayMin; // Contains maximum values originating from arrays **per class**. boost::optional<ScalarHistogram> _arrayMax; - boost::optional<std::map<sbe::value::TypeTags, size_t>> _arrayTypeCounts; + boost::optional<TypeCounts> _arrayTypeCounts; }; diff --git a/src/mongo/db/query/ce/ce_histogram_test.cpp b/src/mongo/db/query/ce/ce_histogram_test.cpp index 13ab164840c..be0073d96d1 100644 --- a/src/mongo/db/query/ce/ce_histogram_test.cpp +++ b/src/mongo/db/query/ce/ce_histogram_test.cpp @@ -30,6 +30,7 @@ #include "mongo/db/query/ce/ce_histogram.h" #include "mongo/db/query/ce/ce_test_utils.h" #include "mongo/db/query/ce/histogram_estimation.h" +#include "mongo/db/query/optimizer/utils/unit_test_utils.h" #include "mongo/db/query/sbe_stage_builder_helpers.h" #include "mongo/unittest/unittest.h" @@ -63,7 +64,7 @@ struct TestBucket { std::unique_ptr<ArrayHistogram> getHistogramFromData(std::vector<TestBucket> testBuckets) { sbe::value::Array bounds; std::vector<Bucket> buckets; - std::map<sbe::value::TypeTags, size_t> typeCounts; + TypeCounts typeCounts; int cumulativeFreq = 0; int cumulativeNDV = 0; @@ -306,24 +307,18 @@ TEST(CEHistogramTest, AssertOneBoundIntRangeHistogram) { ASSERT_MATCH_CE(t, "{intRange: {$eq: 20}}", 1.0); ASSERT_MATCH_CE(t, "{intRange: {$gte: 20}}", 1.0); ASSERT_MATCH_CE(t, "{intRange: {$gt: 10}}", 46.0); - ASSERT_MATCH_CE(t, "{intRange: {$gte: 15}}", 23.5); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 15}}", 28.5); ASSERT_MATCH_CE(t, "{intRange: {$gt: 15}}", 23.5); - ASSERT_MATCH_CE(t, "{intRange: {$gte: 11}, intRange: {$lte: 20}}", 23.5); - ASSERT_MATCH_CE(t, "{intRange: {$gt: 11}, intRange: {$lte: 20}}", 23.5); - ASSERT_MATCH_CE(t, 
"{intRange: {$gte: 11}, intRange: {$lt: 20}}", 23.27); - ASSERT_MATCH_CE(t, "{intRange: {$gt: 11}, intRange: {$lt: 20}}", 23.27); - ASSERT_MATCH_CE(t, "{intRange: {$gt: 12}, intRange: {$lt: 15}}", 17.25); - ASSERT_MATCH_CE(t, "{intRange: {$gte: 12}, intRange: {$lt: 15}}", 17.25); - ASSERT_MATCH_CE(t, "{intRange: {$gt: 12}, intRange: {$lte: 15}}", 17.25); - ASSERT_MATCH_CE(t, "{intRange: {$gte: 12}, intRange: {$lte: 15}}", 17.25); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 11}, intRange: {$lte: 20}}", 46.5); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 11}, intRange: {$lte: 20}}", 41.5); // Test ranges that partially overlap with the entire histogram. - ASSERT_MATCH_CE(t, "{intRange: {$lt: 11}}", 27.5); - ASSERT_MATCH_CE(t, "{intRange: {$lt: 15}}", 27.5); + ASSERT_MATCH_CE(t, "{intRange: {$lt: 11}}", 4.5); + ASSERT_MATCH_CE(t, "{intRange: {$lt: 15}}", 22.5); ASSERT_MATCH_CE(t, "{intRange: {$lte: 15}}", 27.5); ASSERT_MATCH_CE(t, "{intRange: {$gte: 8}, intRange: {$lte: 15}}", 27.5); ASSERT_MATCH_CE(t, "{intRange: {$gt: 8}, intRange: {$lte: 15}}", 27.5); - ASSERT_MATCH_CE(t, "{intRange: {$gt: 8}, intRange: {$lt: 15}}", 27.5); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 8}, intRange: {$lt: 15}}", 22.5); ASSERT_MATCH_CE(t, "{intRange: {$gte: 8}, intRange: {$lte: 15}}", 27.5); // Test ranges that include all values in the histogram. @@ -341,7 +336,6 @@ TEST(CEHistogramTest, AssertOneBoundIntRangeHistogram) { ASSERT_MATCH_CE(t, "{intRange: {$eq: 10.5}}", 5.0); ASSERT_MATCH_CE(t, "{intRange: {$eq: 12.5}}", 5.0); ASSERT_MATCH_CE(t, "{intRange: {$eq: 19.36}}", 5.0); - ASSERT_MATCH_CE(t, "{intRange: {$lt: 19}, intRange: {$gt: 11}}", 17.26); // Test ranges that don't overlap with the histogram. 
ASSERT_MATCH_CE(t, "{intRange: {$lt: 10}}", 0.0); @@ -360,6 +354,32 @@ TEST(CEHistogramTest, AssertOneBoundIntRangeHistogram) { ASSERT_MATCH_CE(t, "{intRange: {$gt: 0}, intRange: {$lt: 5}}", 0.0); ASSERT_MATCH_CE(t, "{intRange: {$gte: 0}, intRange: {$lt: 5}}", 0.0); ASSERT_MATCH_CE(t, "{intRange: {$gt: 0}, intRange: {$lte: 5}}", 0.0); + + // Because we don't specify any indexes here, these intervals do not go through simplification. + // This means that instead of having one key in the requirements map of the generated sargable + // node corresponding to the path "intRange", we have two keys and two ranges, both + // corresponding to the same path. As a consequence, we combine the estimates for the intervals + // using exponential backoff, which results in an overestimate. + ASSERT_MATCH_CE(t, "{intRange: {$gte: 11}, intRange: {$lt: 20}}", 46.04); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 11}, intRange: {$lt: 20}}", 41.09); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 12}, intRange: {$lt: 15}}", 19.16); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 12}, intRange: {$lt: 15}}", 20.42); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 12}, intRange: {$lte: 15}}", 23.42); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 12}, intRange: {$lte: 15}}", 24.96); + ASSERT_MATCH_CE(t, "{intRange: {$lt: 19}, intRange: {$gt: 11}}", 36.53); + + // When we specify that there is a non-multikey index on 'intRange', we expect to see interval + // simplification occurring, which should provide a better estimate for the following ranges. 
+ t.setIndexes( + {{"intRangeIndex", + makeIndexDefinition("intRange", CollationOp::Ascending, /* isMultiKey */ false)}}); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 11}, intRange: {$lt: 20}}", 45.5); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 11}, intRange: {$lt: 20}}", 40.5); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 12}, intRange: {$lt: 15}}", 8.5); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 12}, intRange: {$lt: 15}}", 13.5); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 12}, intRange: {$lte: 15}}", 13.5); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 12}, intRange: {$lte: 15}}", 18.5); + ASSERT_MATCH_CE(t, "{intRange: {$lt: 19}, intRange: {$gt: 11}}", 31.0); } TEST(CEHistogramTest, TestHistogramOnNestedPaths) { diff --git a/src/mongo/db/query/ce/ce_interpolation_test.cpp b/src/mongo/db/query/ce/ce_interpolation_test.cpp new file mode 100644 index 00000000000..152ac7480f3 --- /dev/null +++ b/src/mongo/db/query/ce/ce_interpolation_test.cpp @@ -0,0 +1,480 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. 
You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/query/ce/array_histogram.h" +#include "mongo/db/query/ce/ce_test_utils.h" +#include "mongo/db/query/ce/histogram_estimation.h" +#include "mongo/db/query/sbe_stage_builder_helpers.h" +#include "mongo/unittest/unittest.h" + +namespace mongo::ce { +namespace { + +using namespace sbe; + +const std::pair<value::TypeTags, value::Value> makeInt64Value(const int v) { + return std::make_pair(value::TypeTags::NumberInt64, value::bitcastFrom<int64_t>(v)); +}; + +double estimateIntValCard(const ScalarHistogram& hist, const int v, const EstimationType type) { + const auto [tag, val] = makeInt64Value(v); + return estimate(hist, tag, val, type).card; +}; + +struct BucketData { + Value _v; + double _equalFreq; + double _rangeFreq; + double _ndv; + + BucketData(Value v, double equalFreq, double rangeFreq, double ndv) + : _v(v), _equalFreq(equalFreq), _rangeFreq(rangeFreq), _ndv(ndv) {} + BucketData(const std::string& v, double equalFreq, double rangeFreq, double ndv) + : BucketData(Value(v), equalFreq, rangeFreq, ndv) {} + BucketData(int v, double equalFreq, double rangeFreq, double ndv) + : BucketData(Value(v), equalFreq, rangeFreq, ndv) {} +}; + +ScalarHistogram createHistogram(const std::vector<BucketData>& data) { + sbe::value::Array array; + for (const auto& item : data) { + const auto [tag, val] = stage_builder::makeValue(item._v); + array.push_back(tag, val); + } + + value::Array bounds; + std::vector<Bucket> buckets; + + double cumulativeFreq = 0.0; + double 
cumulativeNDV = 0.0; + + for (size_t i = 0; i < data.size(); i++) { + const auto [tag, val] = array.getAt(i); + bounds.push_back(tag, val); + + const auto& item = data.at(i); + cumulativeFreq += item._equalFreq + item._rangeFreq; + cumulativeNDV += item._ndv + 1.0; + buckets.emplace_back( + item._equalFreq, item._rangeFreq, cumulativeFreq, item._ndv, cumulativeNDV); + } + + return {std::move(bounds), std::move(buckets)}; +} + +TEST(EstimatorTest, ManualHistogram) { + std::vector<BucketData> data{{0, 1.0, 1.0, 1.0}, + {10, 1.0, 10.0, 5.0}, + {20, 3.0, 15.0, 3.0}, + {30, 1.0, 10.0, 4.0}, + {40, 2.0, 0.0, 0.0}, + {50, 1.0, 10.0, 5.0}}; + const ScalarHistogram hist = createHistogram(data); + + ASSERT_EQ(55.0, getTotals(hist).card); + + ASSERT_EQ(1.0, estimateIntValCard(hist, 0, EstimationType::kEqual)); + ASSERT_EQ(2.0, estimateIntValCard(hist, 5, EstimationType::kEqual)); + ASSERT_EQ(0.0, estimateIntValCard(hist, 35, EstimationType::kEqual)); + + ASSERT_EQ(15.5, estimateIntValCard(hist, 15, EstimationType::kLess)); + ASSERT_EQ(20.5, estimateIntValCard(hist, 15, EstimationType::kLessOrEqual)); + ASSERT_EQ(28, estimateIntValCard(hist, 20, EstimationType::kLess)); + ASSERT_EQ(31.0, estimateIntValCard(hist, 20, EstimationType::kLessOrEqual)); + + ASSERT_EQ(42, estimateIntValCard(hist, 10, EstimationType::kGreater)); + ASSERT_EQ(43, estimateIntValCard(hist, 10, EstimationType::kGreaterOrEqual)); + ASSERT_EQ(19, estimateIntValCard(hist, 25, EstimationType::kGreater)); + ASSERT_EQ(21.5, estimateIntValCard(hist, 25, EstimationType::kGreaterOrEqual)); +} + +TEST(EstimatorTest, UniformIntEstimate) { + // This hard-codes a maxdiff histogram with 10 buckets built off a uniform int distribution with + // a minimum of 0, a maximum of 1000, and 70 distinct values. 
+ std::vector<BucketData> data{{2, 1, 0, 0}, + {57, 3, 2, 1}, + {179, 5, 10, 6}, + {317, 5, 9, 6}, + {344, 3, 0, 0}, + {558, 4, 19, 12}, + {656, 2, 4, 3}, + {798, 3, 7, 4}, + {951, 5, 17, 7}, + {986, 1, 0, 0}}; + const ScalarHistogram hist = createHistogram(data); + + // Predicates over bucket bound. + double expectedCard = estimateIntValCard(hist, 558, EstimationType::kEqual); + ASSERT_EQ(4.0, expectedCard); + expectedCard = estimateIntValCard(hist, 558, EstimationType::kLess); + ASSERT_EQ(57.0, expectedCard); + expectedCard = estimateIntValCard(hist, 558, EstimationType::kLessOrEqual); + ASSERT_EQ(61.0, expectedCard); + + // Predicates over value inside of a bucket. + + // Query: [{$match: {a: {$eq: 530}}}]. + expectedCard = estimateIntValCard(hist, 530, EstimationType::kEqual); + ASSERT_APPROX_EQUAL(1.6, expectedCard, 0.1); // Actual: 1. + + // Query: [{$match: {a: {$lt: 530}}}]. + expectedCard = estimateIntValCard(hist, 530, EstimationType::kLess); + ASSERT_APPROX_EQUAL(52.9, expectedCard, 0.1); // Actual: 50. + + // Query: [{$match: {a: {$lte: 530}}}]. + expectedCard = estimateIntValCard(hist, 530, EstimationType::kLessOrEqual); + ASSERT_APPROX_EQUAL(54.5, expectedCard, 0.1); // Actual: 51. + + // Query: [{$match: {a: {$eq: 400}}}]. + expectedCard = estimateIntValCard(hist, 400, EstimationType::kEqual); + ASSERT_APPROX_EQUAL(1.6, expectedCard, 0.1); // Actual: 1. + + // Query: [{$match: {a: {$lt: 400}}}]. + expectedCard = estimateIntValCard(hist, 400, EstimationType::kLess); + ASSERT_APPROX_EQUAL(41.3, expectedCard, 0.1); // Actual: 39. + + // Query: [{$match: {a: {$lte: 400}}}]. + expectedCard = estimateIntValCard(hist, 400, EstimationType::kLessOrEqual); + ASSERT_APPROX_EQUAL(43.0, expectedCard, 0.1); // Actual: 40. +} + +TEST(EstimatorTest, NormalIntEstimate) { + // This hard-codes a maxdiff histogram with 10 buckets built off a normal int distribution with + // a minimum of 0, a maximum of 1000, and 70 distinct values. 
+ std::vector<BucketData> data{{2, 1, 0, 0}, + {317, 8, 20, 15}, + {344, 2, 0, 0}, + {388, 3, 0, 0}, + {423, 4, 2, 2}, + {579, 4, 12, 8}, + {632, 3, 2, 1}, + {696, 3, 5, 3}, + {790, 5, 4, 2}, + {993, 1, 21, 9}}; + const ScalarHistogram hist = createHistogram(data); + + // Predicates over bucket bound. + double expectedCard = estimateIntValCard(hist, 696, EstimationType::kEqual); + ASSERT_EQ(3.0, expectedCard); + expectedCard = estimateIntValCard(hist, 696, EstimationType::kLess); + ASSERT_EQ(66.0, expectedCard); + expectedCard = estimateIntValCard(hist, 696, EstimationType::kLessOrEqual); + ASSERT_EQ(69.0, expectedCard); + + // Predicates over value inside of a bucket. + + // Query: [{$match: {a: {$eq: 150}}}]. + expectedCard = estimateIntValCard(hist, 150, EstimationType::kEqual); + ASSERT_APPROX_EQUAL(1.3, expectedCard, 0.1); // Actual: 1. + + // Query: [{$match: {a: {$lt: 150}}}]. + expectedCard = estimateIntValCard(hist, 150, EstimationType::kLess); + ASSERT_APPROX_EQUAL(9.1, expectedCard, 0.1); // Actual: 9. + + // Query: [{$match: {a: {$lte: 150}}}]. + expectedCard = estimateIntValCard(hist, 150, EstimationType::kLessOrEqual); + ASSERT_APPROX_EQUAL(10.4, expectedCard, 0.1); // Actual: 10. +} + +TEST(EstimatorTest, UniformStrEstimate) { + // This hard-codes a maxdiff histogram with 10 buckets built off a uniform string distribution + // with a minimum length of 3, a maximum length of 5, and 80 distinct values. + std::vector<BucketData> data{{{"0ejz", 2, 0, 0}, + {"8DCaq", 3, 4, 4}, + {"Cy5Kw", 3, 3, 3}, + {"WXX7w", 3, 31, 20}, + {"YtzS", 2, 0, 0}, + {"fuK", 5, 13, 7}, + {"gLkp", 3, 0, 0}, + {"ixmVx", 2, 6, 2}, + {"qou", 1, 9, 6}, + {"z2b", 1, 9, 6}}}; + const ScalarHistogram hist = createHistogram(data); + + // Predicates over value inside of a bucket. + const auto [tag, value] = value::makeNewString("TTV"_sd); + value::ValueGuard vg(tag, value); + + // Query: [{$match: {a: {$eq: 'TTV'}}}]. 
+ double expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; + ASSERT_APPROX_EQUAL(1.55, expectedCard, 0.1); // Actual: 2. + + // Query: [{$match: {a: {$lt: 'TTV'}}}]. + expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; + ASSERT_APPROX_EQUAL(39.8, expectedCard, 0.1); // Actual: 39. + + // Query: [{$match: {a: {$lte: 'TTV'}}}]. + expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card; + ASSERT_APPROX_EQUAL(41.3, expectedCard, 0.1); // Actual: 41. +} + +TEST(EstimatorTest, NormalStrEstimate) { + // This hard-codes a maxdiff histogram with 10 buckets built off a normal string distribution + // with a minimum length of 3, a maximum length of 5, and 80 distinct values. + std::vector<BucketData> data{{ + {"0ejz", 1, 0, 0}, + {"4FGjc", 3, 5, 3}, + {"9bU3", 2, 3, 2}, + {"Cy5Kw", 3, 3, 3}, + {"Lm4U", 2, 11, 5}, + {"TTV", 5, 14, 8}, + {"YtzS", 2, 3, 2}, + {"o9cD4", 6, 26, 16}, + {"qfmnP", 1, 4, 2}, + {"xqbi", 2, 4, 4}, + }}; + const ScalarHistogram hist = createHistogram(data); + + // Predicates over bucket bound. + auto [tag, value] = value::makeNewString("TTV"_sd); + value::ValueGuard vg(tag, value); + + // Query: [{$match: {a: {$eq: 'TTV'}}}]. + double expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; + ASSERT_APPROX_EQUAL(5.0, expectedCard, 0.1); // Actual: 5. + + // Query: [{$match: {a: {$lt: 'TTV'}}}]. + expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; + ASSERT_APPROX_EQUAL(47.0, expectedCard, 0.1); // Actual: 47. + + // Query: [{$match: {a: {$lte: 'TTV'}}}]. + expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card; + ASSERT_APPROX_EQUAL(52.0, expectedCard, 0.1); // Actual: 52. + + // Predicates over value inside of a bucket. + std::tie(tag, value) = value::makeNewString("Pfa"_sd); + + // Query: [{$match: {a: {$eq: 'Pfa'}}}]. 
+ expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; + ASSERT_APPROX_EQUAL(1.75, expectedCard, 0.1); // Actual: 2. + + // Query: [{$match: {a: {$lt: 'Pfa'}}}]. + expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; + ASSERT_APPROX_EQUAL(38.3, expectedCard, 0.1); // Actual: 35. + + // Query: [{$match: {a: {$lte: 'Pfa'}}}]. + expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card; + ASSERT_APPROX_EQUAL(40.0, expectedCard, 0.1); // Actual: 37. +} + +TEST(EstimatorTest, UniformIntStrEstimate) { + // This hard-codes a maxdiff histogram with 20 buckets built off of a uniform distribution with + // two types occurring with equal probability: + // - 100 distinct ints between 0 and 1000, and + // - 100 distinct strings of length between 2 and 5. + std::vector<BucketData> data{{ + {2, 3, 0, 0}, {19, 4, 1, 1}, {226, 2, 49, 20}, {301, 5, 12, 4}, + {317, 3, 0, 0}, {344, 2, 3, 1}, {423, 5, 18, 6}, {445, 3, 0, 0}, + {495, 3, 4, 2}, {542, 5, 9, 3}, {696, 3, 44, 19}, {773, 4, 11, 5}, + {805, 2, 8, 4}, {931, 5, 21, 8}, {998, 4, 21, 3}, {"8N4", 5, 31, 14}, + {"MIb", 5, 45, 17}, {"Zgi", 3, 55, 22}, {"pZ", 6, 62, 25}, {"yUwxz", 5, 29, 12}, + }}; + const ScalarHistogram hist = createHistogram(data); + const ArrayHistogram arrHist( + hist, TypeCounts{{value::TypeTags::NumberInt64, 254}, {value::TypeTags::StringSmall, 246}}); + + // Predicates over value inside of the last numeric bucket. + + // Query: [{$match: {a: {$eq: 993}}}]. + double expectedCard = estimateIntValCard(hist, 993, EstimationType::kEqual); + ASSERT_APPROX_EQUAL(7.0, expectedCard, 0.1); // Actual: 9. + + // Query: [{$match: {a: {$lt: 993}}}]. + expectedCard = estimateIntValCard(hist, 993, EstimationType::kLess); + ASSERT_APPROX_EQUAL(241.4, expectedCard, 0.1); // Actual: 241. + + // Query: [{$match: {a: {$lte: 993}}}]. + expectedCard = estimateIntValCard(hist, 993, EstimationType::kLessOrEqual); + ASSERT_APPROX_EQUAL(248.4, expectedCard, 0.1); // Actual: 250. 
+ + // Predicates over value inside of the first string bucket. + auto [tag, value] = value::makeNewString("04e"_sd); + value::ValueGuard vg(tag, value); + + // Query: [{$match: {a: {$eq: '04e'}}}]. + expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; + ASSERT_APPROX_EQUAL(2.2, expectedCard, 0.1); // Actual: 3. + + value::TypeTags lowTag = value::TypeTags::NumberInt64; + value::Value lowVal = 100000000; + + // Type bracketing: low value of different type than the bucket bound. + // Query: [{$match: {a: {$eq: 100000000}}}]. + expectedCard = estimateCardEq(arrHist, lowTag, lowVal); + ASSERT_APPROX_EQUAL(0.0, expectedCard, 0.1); // Actual: 0. + + // No interpolation for inequality to values inside the first string bucket, fallback to half of + // the bucket frequency. + + // Query: [{$match: {a: {$lt: '04e'}}}]. + expectedCard = estimateCardRange(arrHist, true, false, lowTag, lowVal, false, tag, value); + ASSERT_APPROX_EQUAL(13.3, expectedCard, 0.1); // Actual: 0. + + // Query: [{$match: {a: {$lte: '04e'}}}]. + expectedCard = estimateCardRange( + arrHist, true, false, lowTag, lowVal, true /* highInclusive */, tag, value); + ASSERT_APPROX_EQUAL(15.5, expectedCard, 0.1); // Actual: 3. + + // Value towards the end of the bucket gets the same half bucket estimate. + std::tie(tag, value) = value::makeNewString("8B5"_sd); + + // Query: [{$match: {a: {$lt: '8B5'}}}]. + expectedCard = estimateCardRange(arrHist, true, false, lowTag, lowVal, false, tag, value); + ASSERT_APPROX_EQUAL(13.3, expectedCard, 0.1); // Actual: 24. + + // Query: [{$match: {a: {$lte: '8B5'}}}]. + expectedCard = estimateCardRange( + arrHist, true, false, lowTag, lowVal, true /* highInclusive */, tag, value); + ASSERT_APPROX_EQUAL(15.5, expectedCard, 0.1); // Actual: 29. 
+} + +TEST(EstimatorTest, UniformIntArrayOnlyEstimate) { + // This hard-codes a maxdiff histogram with 10 buckets built off of an array distribution with + // arrays between 3 and 5 elements long, each containing 100 distinct ints uniformly distributed + // between 0 and 1000. There are no scalar elements. + std::vector<BucketData> scalarData{{}}; + const ScalarHistogram scalarHist = createHistogram(scalarData); + + std::vector<BucketData> minData{{ + {5, 3, 0, 0}, {19, 5, 2, 1}, {57, 4, 4, 3}, {116, 7, 13, 7}, {198, 3, 15, 6}, + {228, 2, 3, 2}, {254, 4, 0, 0}, {280, 2, 2, 1}, {335, 3, 5, 3}, {344, 2, 0, 0}, + {388, 3, 0, 0}, {420, 2, 0, 0}, {454, 1, 6, 3}, {488, 2, 1, 1}, {530, 1, 0, 0}, + {561, 1, 0, 0}, {609, 1, 0, 0}, {685, 1, 0, 0}, {713, 1, 0, 0}, {758, 1, 0, 0}, + }}; + const ScalarHistogram minHist = createHistogram(minData); + + std::vector<BucketData> maxData{{ + {301, 1, 0, 0}, {408, 2, 0, 0}, {445, 1, 0, 0}, {605, 2, 0, 0}, {620, 1, 0, 0}, + {665, 1, 1, 1}, {687, 3, 0, 0}, {704, 2, 6, 2}, {718, 2, 2, 1}, {741, 2, 1, 1}, + {752, 2, 0, 0}, {823, 7, 3, 3}, {827, 1, 0, 0}, {852, 3, 0, 0}, {864, 5, 0, 0}, + {909, 7, 12, 5}, {931, 2, 3, 1}, {939, 3, 0, 0}, {970, 2, 12, 4}, {998, 1, 10, 4}, + }}; + const ScalarHistogram maxHist = createHistogram(maxData); + + std::vector<BucketData> uniqueData{{ + {5, 3, 0, 0}, {19, 6, 2, 1}, {57, 4, 4, 3}, {116, 7, 15, 8}, {228, 2, 38, 13}, + {254, 7, 0, 0}, {269, 10, 0, 0}, {280, 7, 3, 1}, {306, 4, 1, 1}, {317, 4, 0, 0}, + {344, 2, 19, 5}, {423, 2, 27, 8}, {507, 2, 22, 13}, {704, 8, 72, 34}, {718, 6, 3, 1}, + {758, 3, 13, 4}, {864, 7, 35, 14}, {883, 4, 0, 0}, {939, 5, 32, 10}, {998, 1, 24, 9}, + }}; + const ScalarHistogram uniqueHist = createHistogram(uniqueData); + + const ArrayHistogram arrHist( + scalarHist, TypeCounts{}, uniqueHist, minHist, maxHist, TypeCounts{}); + + // Query in the middle of the domain: estimate from ArrayUnique histogram. 
+ value::TypeTags lowTag = value::TypeTags::NumberInt64; + value::Value lowVal = 500; + value::TypeTags highTag = value::TypeTags::NumberInt64; + value::Value highVal = 600; + + // Test interpolation for query: [{$match: {a: {$gt: 500, $lt: 600}}}]. + double expectedCard = + estimateCardRange(arrHist, false, false, lowTag, lowVal, false, highTag, highVal); + ASSERT_APPROX_EQUAL(8.63, expectedCard, 0.1); + + // Test interpolation for query: [{$match: {a: {$elemMatch: {$gt: 500, $lt: 600}}}}]. + // Note: this should be the same as above, since we have no scalars. + expectedCard = estimateCardRange(arrHist, true, false, lowTag, lowVal, false, highTag, highVal); + ASSERT_APPROX_EQUAL(8.63, expectedCard, 0.1); + + // Query at the end of the domain: more precise estimates from ArrayMin, ArrayMax histograms. + lowVal = 10; + highVal = 110; + + // Test interpolation for query: [{$match: {a: {$gt: 10, $lt: 110}}}]. + expectedCard = + estimateCardRange(arrHist, false, false, lowTag, lowVal, false, highTag, highVal); + ASSERT_APPROX_EQUAL(24.1, expectedCard, 0.1); + + // Test interpolation for query: [{$match: {a: {$elemMatch: {$gt: 500, $lt: 600}}}}]. + // Note: this should be the same as above, since we have no scalars. + expectedCard = estimateCardRange(arrHist, true, false, lowTag, lowVal, false, highTag, highVal); + ASSERT_APPROX_EQUAL(24.1, expectedCard, 0.1); +} + +TEST(EstimatorTest, UniformIntMixedArrayEstimate) { + // This hard-codes a maxdiff histogram with 20 buckets built off of a mixed distribution split + // with equal probability between: + // - an array distribution between 3 and 5 elements long, each containing 80 distinct ints + // uniformly distributed between 0 and 1000, and + // - a uniform int distribution with 80 distinct ints between 0 and 1000. 
+ std::vector<BucketData> scalarData{{ + {25, 1, 0, 0}, {41, 2, 0, 0}, {142, 2, 3, 3}, {209, 3, 3, 1}, {243, 1, 2, 1}, + {296, 3, 4, 3}, {321, 5, 4, 2}, {480, 3, 9, 8}, {513, 3, 3, 2}, {554, 1, 0, 0}, + {637, 3, 3, 2}, {666, 2, 1, 1}, {697, 2, 2, 1}, {750, 3, 3, 2}, {768, 4, 0, 0}, + {791, 4, 3, 3}, {851, 2, 2, 2}, {927, 2, 10, 6}, {958, 3, 2, 1}, {980, 3, 0, 0}, + }}; + const ScalarHistogram scalarHist = createHistogram(scalarData); + + std::vector<BucketData> minData{{ + {3, 3, 0, 0}, {5, 8, 0, 0}, {9, 3, 0, 0}, {19, 2, 0, 0}, {49, 7, 4, 2}, + {69, 6, 0, 0}, {115, 3, 5, 3}, {125, 2, 0, 0}, {146, 1, 2, 1}, {198, 2, 4, 3}, + {214, 2, 0, 0}, {228, 3, 0, 0}, {260, 3, 4, 1}, {280, 1, 2, 2}, {330, 2, 2, 1}, + {344, 6, 0, 0}, {388, 2, 0, 0}, {420, 2, 0, 0}, {461, 2, 8, 4}, {696, 1, 2, 1}, + }}; + const ScalarHistogram minHist = createHistogram(minData); + + std::vector<BucketData> maxData{{ + {301, 1, 0, 0}, {445, 1, 0, 0}, {491, 1, 0, 0}, {533, 3, 0, 0}, {605, 3, 0, 0}, + {620, 2, 0, 0}, {647, 3, 0, 0}, {665, 4, 0, 0}, {713, 3, 10, 4}, {741, 3, 0, 0}, + {814, 3, 2, 2}, {839, 2, 1, 1}, {864, 1, 2, 2}, {883, 3, 0, 0}, {893, 7, 0, 0}, + {898, 5, 0, 0}, {909, 1, 12, 3}, {931, 2, 2, 1}, {953, 6, 3, 2}, {993, 1, 7, 5}, + }}; + const ScalarHistogram maxHist = createHistogram(maxData); + + std::vector<BucketData> uniqueData{{ + {3, 3, 0, 0}, {19, 5, 11, 2}, {49, 7, 5, 3}, {69, 8, 0, 0}, {75, 3, 0, 0}, + {125, 2, 10, 5}, {228, 3, 27, 14}, {260, 4, 5, 1}, {344, 6, 36, 13}, {423, 4, 20, 8}, + {605, 4, 61, 28}, {665, 8, 12, 6}, {758, 4, 41, 16}, {768, 5, 0, 0}, {776, 3, 0, 0}, + {864, 3, 15, 10}, {883, 8, 0, 0}, {911, 2, 28, 6}, {953, 6, 8, 4}, {993, 1, 7, 5}, + }}; + const ScalarHistogram uniqueHist = createHistogram(uniqueData); + + const ArrayHistogram arrHist( + scalarHist, TypeCounts{}, uniqueHist, minHist, maxHist, TypeCounts{}); + + value::TypeTags lowTag = value::TypeTags::NumberInt64; + value::Value lowVal = 500; + value::TypeTags highTag = value::TypeTags::NumberInt64; + 
value::Value highVal = 550; + + // Test interpolation for query: [{$match: {a: {$gt: 500, $lt: 550}}}]. + double expectedCard = + estimateCardRange(arrHist, true, false, lowTag, lowVal, false, highTag, highVal); + ASSERT_APPROX_EQUAL(9.8, expectedCard, 0.1); // Actual: 94. + + // Test interpolation for query: [{$match: {a: {$elemMatch: {$gt: 500, $lt: 550}}}}]. + expectedCard = + estimateCardRange(arrHist, false, false, lowTag, lowVal, false, highTag, highVal); + ASSERT_APPROX_EQUAL(5.6, expectedCard, 0.1); // Actual: 8. +} + +} // namespace +} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/ce_test_utils.cpp b/src/mongo/db/query/ce/ce_test_utils.cpp index 7beedf1adfa..05ca92c2be6 100644 --- a/src/mongo/db/query/ce/ce_test_utils.cpp +++ b/src/mongo/db/query/ce/ce_test_utils.cpp @@ -47,7 +47,7 @@ using namespace cascades; CETester::CETester(std::string collName, double collCard, const optimizer::OptPhaseManager::PhaseSet& optPhases) - : _collName(std::move(collName)), _collCard(collCard), _optPhases(optPhases) {} + : _collName(std::move(collName)), _collCard(collCard), _optPhases(optPhases), _indexes() {} optimizer::CEType CETester::getCE(const std::string& query) const { if constexpr (kCETestLogOnly) { @@ -90,8 +90,12 @@ optimizer::CEType CETester::getCE(ABT& abt) const { const auto& group = memo.getGroup(i); // If the 'optPhases' either ends with the MemoSubstitutionPhase or the - // MemoImplementationPhase, we should have exactly one logical node per group. - ASSERT_EQUALS(group._logicalNodes.size(), 1); + // MemoImplementationPhase, we should have exactly one logical node per group. However, if + // we have indexes, we may have multiple logical nodes as a result of interval + // simplification. In this case, we still want to pick the first Sargable node. + if (_indexes.empty()) { + ASSERT_EQUALS(group._logicalNodes.size(), 1); + } const auto& node = group._logicalNodes.at(0); // This gets the cardinality estimate actually produced during optimization. 
@@ -123,7 +127,7 @@ optimizer::CEType CETester::getCE(ABT& abt) const { } optimizer::OptPhaseManager CETester::getPhaseManager() const { - ScanDefinition sd({}, {}, {DistributionType::Centralized}, true, _collCard); + ScanDefinition sd({}, _indexes, {DistributionType::Centralized}, true, _collCard); Metadata metadata({{_collName, sd}}); return {_optPhases, _prefixId, diff --git a/src/mongo/db/query/ce/ce_test_utils.h b/src/mongo/db/query/ce/ce_test_utils.h index ae2af837e8e..a46af3b5624 100644 --- a/src/mongo/db/query/ce/ce_test_utils.h +++ b/src/mongo/db/query/ce/ce_test_utils.h @@ -108,6 +108,10 @@ public: _collCard = card; } + void setIndexes(opt::unordered_map<std::string, IndexDefinition>&& indexes) { + _indexes = std::move(indexes); + } + protected: /** * Subclasses need to override this method to initialize the transports they are testing. @@ -119,12 +123,11 @@ private: OptPhaseManager getPhaseManager() const; std::string _collName; - // The number of records in the collection we are testing. double _collCard; - // Phases to use when optimizing an input query. 
const OptPhaseManager::PhaseSet& _optPhases; + opt::unordered_map<std::string, IndexDefinition> _indexes; mutable PrefixId _prefixId; }; diff --git a/src/mongo/db/query/ce/histogram_estimation.cpp b/src/mongo/db/query/ce/histogram_estimation.cpp index c98e086e39c..948729ee1b4 100644 --- a/src/mongo/db/query/ce/histogram_estimation.cpp +++ b/src/mongo/db/query/ce/histogram_estimation.cpp @@ -34,6 +34,36 @@ namespace mongo::ce { using namespace sbe; namespace { + +bool sameTypeBracket(value::TypeTags tag1, value::TypeTags tag2) { + if (tag1 == tag2) { + return true; + } + return ((value::isNumber(tag1) && value::isNumber(tag2)) || + (value::isString(tag1) && value::isString(tag2))); +} + +double valueToDouble(value::TypeTags tag, value::Value val) { + double result = 0; + if (value::isNumber(tag)) { + result = value::numericCast<double>(tag, val); + } else if (value::isString(tag)) { + const StringData sd = value::getStringView(tag, val); + + // Convert a prefix of the string to a double. + const size_t maxPrecision = std::min(sd.size(), sizeof(double)); + for (size_t i = 0; i < maxPrecision; ++i) { + const char ch = sd[i]; + const double charToDbl = ch / std::pow(2, i * 8); + result += charToDbl; + } + } else { + uassert(6844500, "Unexpected value type", false); + } + + return result; +} + int32_t compareValues3w(value::TypeTags tag1, value::Value val1, value::TypeTags tag2, @@ -53,6 +83,70 @@ EstimationResult getTotals(const ScalarHistogram& h) { return {last._cumulativeFreq, last._cumulativeNDV}; } +/** + * Helper function that uses linear interpolation to estimate the cardinality and NDV for a value + * that falls inside of a histogram bucket. 
+ */ +EstimationResult interpolateEstimateInBucket(const ScalarHistogram& h, + value::TypeTags tag, + value::Value val, + EstimationType type, + size_t bucketIndex) { + + const Bucket& bucket = h.getBuckets().at(bucketIndex); + const auto [boundTag, boundVal] = h.getBounds().getAt(bucketIndex); + + double resultCard = bucket._cumulativeFreq - bucket._equalFreq - bucket._rangeFreq; + double resultNDV = bucket._cumulativeNDV - bucket._ndv - 1.0; + + // Check if the estimate is at the point of type brackets switch. If the current bucket is the + // first bucket of a new type bracket and the value is of another type, estimate cardinality + // from the current bucket as 0. + // + // For example, let bound 1 = 1000, bound 2 = "abc". The value 100000000 falls in bucket 2, the + // first bucket for strings, but should not get cardinality/NDV fraction from it. + if (!sameTypeBracket(tag, boundTag)) { + if (type == EstimationType::kEqual) { + return {0.0, 0.0}; + } else { + return {resultCard, resultNDV}; + } + } + + // Estimate for equality frequency inside of the bucket. + const double innerEqFreq = (bucket._ndv == 0.0) ? 0.0 : bucket._rangeFreq / bucket._ndv; + + if (type == EstimationType::kEqual) { + return {innerEqFreq, 1.0}; + } + + // For $lt and $lte operations use linear interpolation to take a fraction of the bucket + // cardinality and NDV if there is a preceding bucket with bound of the same type. Use half of + // the bucket estimates otherwise. 
+ double ratio = 0.5; + if (bucketIndex > 0) { + const auto [lowBoundTag, lowBoundVal] = h.getBounds().getAt(bucketIndex - 1); + if (sameTypeBracket(lowBoundTag, boundTag)) { + double doubleLowBound = valueToDouble(lowBoundTag, lowBoundVal); + double doubleUpperBound = valueToDouble(boundTag, boundVal); + double doubleVal = valueToDouble(tag, val); + ratio = (doubleVal - doubleLowBound) / (doubleUpperBound - doubleLowBound); + } + } + + resultCard += bucket._rangeFreq * ratio; + resultNDV += bucket._ndv * ratio; + + if (type == EstimationType::kLess) { + // Subtract from the estimate the cardinality and ndv corresponding to the equality + // operation. + const double innerEqNdv = (bucket._ndv * ratio <= 1.0) ? 0.0 : 1.0; + resultCard -= innerEqFreq; + resultNDV -= innerEqNdv; + } + return {resultCard, resultNDV}; +} + EstimationResult estimate(const ScalarHistogram& h, value::TypeTags tag, value::Value val, @@ -103,47 +197,32 @@ EstimationResult estimate(const ScalarHistogram& h, const auto [boundTag, boundVal] = h.getBounds().getAt(bucketIndex); const bool isEndpoint = compareValues3w(boundTag, boundVal, tag, val) == 0; - switch (type) { - case EstimationType::kEqual: { - if (isEndpoint) { + if (isEndpoint) { + switch (type) { + case EstimationType::kEqual: { return {bucket._equalFreq, 1.0}; } - return {(bucket._ndv == 0.0) ? 0.0 : bucket._rangeFreq / bucket._ndv, 1.0}; - } - case EstimationType::kLess: { - double resultCard = bucket._cumulativeFreq - bucket._equalFreq; - double resultNDV = bucket._cumulativeNDV - 1.0; - - if (!isEndpoint) { - // TODO: consider value interpolation instead of assigning 50% of the weight. 
- resultCard -= bucket._rangeFreq / 2.0; - resultNDV -= bucket._ndv / 2.0; + case EstimationType::kLess: { + double resultCard = bucket._cumulativeFreq - bucket._equalFreq; + double resultNDV = bucket._cumulativeNDV - 1.0; + return {resultCard, resultNDV}; } - return {resultCard, resultNDV}; - } - case EstimationType::kLessOrEqual: { - double resultCard = bucket._cumulativeFreq; - double resultNDV = bucket._cumulativeNDV; - - if (!isEndpoint) { - // TODO: consider value interpolation instead of assigning 50% of the weight. - resultCard -= bucket._equalFreq + bucket._rangeFreq / 2.0; - resultNDV -= 1.0 + bucket._ndv / 2.0; + case EstimationType::kLessOrEqual: { + double resultCard = bucket._cumulativeFreq; + double resultNDV = bucket._cumulativeNDV; + return {resultCard, resultNDV}; } - return {resultCard, resultNDV}; - } - default: - MONGO_UNREACHABLE; + default: + MONGO_UNREACHABLE; + } + } else { + return interpolateEstimateInBucket(h, tag, val, type, bucketIndex); } } -/** - * Estimates the cardinality of an equality predicate given an ArrayHistogram and an SBE value and - * type tag pair. - */ double estimateCardEq(const ArrayHistogram& ah, value::TypeTags tag, value::Value val) { if (tag != value::TypeTags::Null) { double card = estimate(ah.getScalar(), tag, val, EstimationType::kEqual).card; @@ -187,11 +266,6 @@ static EstimationResult estimateRange(const ScalarHistogram& histogram, return highEstimate - lowEstimate; } -/** - * Estimates the cardinality of a range predicate given an ArrayHistogram and a range predicate. - * Set 'includeScalar' to true to indicate whether or not the provided range should include no-array - * values. The other fields define the range of the estimation. - */ double estimateCardRange(const ArrayHistogram& ah, bool includeScalar, /* Define lower bound. 
*/ diff --git a/src/mongo/db/query/ce/histogram_estimation.h b/src/mongo/db/query/ce/histogram_estimation.h index b949d0b752e..0e554993983 100644 --- a/src/mongo/db/query/ce/histogram_estimation.h +++ b/src/mongo/db/query/ce/histogram_estimation.h @@ -59,7 +59,8 @@ struct EstimationResult { EstimationResult getTotals(const ScalarHistogram& h); /** - * Estimates the cardinality of a predicate of the given type against the histogram. + * Compute an estimate for a given value and estimation type. Use linear interpolation for values + * that fall inside of histogram buckets. */ EstimationResult estimate(const ScalarHistogram& h, sbe::value::TypeTags tag, @@ -74,4 +75,26 @@ double estimateIntervalCardinality(const ArrayHistogram& estimator, const optimizer::IntervalRequirement& interval, optimizer::CEType inputCardinality); +/** + * Estimates the cardinality of an equality predicate given an ArrayHistogram and an SBE value and + * type tag pair. + */ +double estimateCardEq(const ArrayHistogram& ah, sbe::value::TypeTags tag, sbe::value::Value val); + +/** + * Estimates the cardinality of a range predicate given an ArrayHistogram and a range predicate. + * Set 'includeScalar' to true to indicate whether or not the provided range should include no-array + * values. The other fields define the range of the estimation. + */ +double estimateCardRange(const ArrayHistogram& ah, + bool includeScalar, + /* Define lower bound. */ + bool lowInclusive, + sbe::value::TypeTags tagLow, + sbe::value::Value valLow, + /* Define upper bound. */ + bool highInclusive, + sbe::value::TypeTags tagHigh, + sbe::value::Value valHigh); + } // namespace mongo::ce |