SERVER-70659 Cardinality estimate of the minimum values of data types

author: Milena Ivanova <milena.ivanova@mongodb.com> 2022-11-10 15:52:46 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2022-11-10 16:25:21 +0000
commit: d5ff440817391833df0ee8d918689cef60140d90 (patch)
tree: d6d26f58da16ce3e11be567e6932c22f5c3a5bc6 /src/mongo/db
parent: e9c34ed06ca402773a01816dff99a462c43e933a (diff)
download: mongo-d5ff440817391833df0ee8d918689cef60140d90.tar.gz
4 files changed, 328 insertions, 64 deletions
diff --git a/src/mongo/db/query/ce/SConscript b/src/mongo/db/query/ce/SConscript
index f36be806896..8901a512479 100644
--- a/src/mongo/db/query/ce/SConscript
+++ b/src/mongo/db/query/ce/SConscript
@@ -161,6 +161,7 @@ env.CppUnitTest(
     ],
     LIBDEPS=[
         'ce_test_utils',
+        'query_stats_test_utils',
     ],
 )
 
diff --git a/src/mongo/db/query/ce/ce_edge_cases_test.cpp b/src/mongo/db/query/ce/ce_edge_cases_test.cpp
index 5df9124395d..51d400c10fb 100644
--- a/src/mongo/db/query/ce/ce_edge_cases_test.cpp
+++ b/src/mongo/db/query/ce/ce_edge_cases_test.cpp
@@ -27,9 +27,12 @@
  *    it in the license file.
  */
 
+#include "mongo/db/pipeline/abt/utils.h"
 #include "mongo/db/query/ce/array_histogram.h"
 #include "mongo/db/query/ce/ce_test_utils.h"
 #include "mongo/db/query/ce/histogram_estimation.h"
+#include "mongo/db/query/ce/maxdiff_test_utils.h"
+#include "mongo/db/query/ce/value_utils.h"
 #include "mongo/db/query/optimizer/utils/ce_math.h"
 #include "mongo/db/query/sbe_stage_builder_helpers.h"
 #include "mongo/unittest/unittest.h"
@@ -39,6 +42,8 @@ namespace {
 
 using namespace sbe;
 
+constexpr double kErrorBound = 0.01;
+
 TEST(EstimatorTest, OneBucketIntHistogram) {
     // Data set of 10 values, each with frequency 3, in the range (-inf, 100].
     // Example: { -100, -20, 0, 20, 50, 60, 70, 80, 90, 100}.
@@ -170,18 +175,23 @@ TEST(EstimatorTest, TwoBucketsIntHistogram) {
     // Estimates with a value inside the bucket. The estimates use interpolation.
     // The bucket ratio for the value of 10 is smaller than the estimate for equality
     // and the estimates for Less and LessOrEqual are the same.
-    ASSERT_APPROX_EQUAL(3.3, estimateIntValCard(hist, 10, EstimationType::kEqual), 0.1);
-    ASSERT_APPROX_EQUAL(3.4, estimateIntValCard(hist, 10, EstimationType::kLess), 0.1);
-    ASSERT_APPROX_EQUAL(3.4, estimateIntValCard(hist, 10, EstimationType::kLessOrEqual), 0.1);
-    ASSERT_APPROX_EQUAL(26.6, estimateIntValCard(hist, 10, EstimationType::kGreater), 0.1);
-    ASSERT_APPROX_EQUAL(26.6, estimateIntValCard(hist, 10, EstimationType::kGreaterOrEqual), 0.1);
+    ASSERT_APPROX_EQUAL(3.25, estimateIntValCard(hist, 10, EstimationType::kEqual), kErrorBound);
+    ASSERT_APPROX_EQUAL(3.36, estimateIntValCard(hist, 10, EstimationType::kLess), kErrorBound);
+    ASSERT_APPROX_EQUAL(
+        3.36, estimateIntValCard(hist, 10, EstimationType::kLessOrEqual), kErrorBound);
+
+    ASSERT_APPROX_EQUAL(26.64, estimateIntValCard(hist, 10, EstimationType::kGreater), kErrorBound);
+    ASSERT_APPROX_EQUAL(
+        26.64, estimateIntValCard(hist, 10, EstimationType::kGreaterOrEqual), kErrorBound);
 
     // Different estimates for Less and LessOrEqual for the value of 50.
-    ASSERT_APPROX_EQUAL(3.3, estimateIntValCard(hist, 50, EstimationType::kEqual), 0.1);
-    ASSERT_APPROX_EQUAL(10.6, estimateIntValCard(hist, 50, EstimationType::kLess), 0.1);
-    ASSERT_APPROX_EQUAL(13.9, estimateIntValCard(hist, 50, EstimationType::kLessOrEqual), 0.1);
-    ASSERT_APPROX_EQUAL(16.1, estimateIntValCard(hist, 50, EstimationType::kGreater), 0.1);
-    ASSERT_APPROX_EQUAL(19.4, estimateIntValCard(hist, 50, EstimationType::kGreaterOrEqual), 0.1);
+    ASSERT_APPROX_EQUAL(3.25, estimateIntValCard(hist, 50, EstimationType::kEqual), kErrorBound);
+    ASSERT_APPROX_EQUAL(10.61, estimateIntValCard(hist, 50, EstimationType::kLess), kErrorBound);
+    ASSERT_APPROX_EQUAL(
+        13.87, estimateIntValCard(hist, 50, EstimationType::kLessOrEqual), kErrorBound);
+    ASSERT_APPROX_EQUAL(16.13, estimateIntValCard(hist, 50, EstimationType::kGreater), kErrorBound);
+    ASSERT_APPROX_EQUAL(
+        19.38, estimateIntValCard(hist, 50, EstimationType::kGreaterOrEqual), kErrorBound);
 }
 
 TEST(EstimatorTest, ThreeExclusiveBucketsIntHistogram) {
@@ -232,8 +242,12 @@ TEST(EstimatorTest, OneBucketStrHistogram) {
     ASSERT_EQ(19.5, expectedCard);
 
     std::tie(tag, value) = value::makeNewString(""_sd);
+    // In the special case of a single string bucket, we estimate empty string equality as for any
+    // other string value. In practice if there are at least 2 buckets for the string data and an
+    // empty string in the data set, it will be chosen as a bound for the first bucket and produce
+    // precise estimates.
     expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card;
-    ASSERT_EQ(kMinCard, expectedCard);
+    ASSERT_EQ(3.0, expectedCard);
     expectedCard = estimate(hist, tag, value, EstimationType::kLess).card;
     ASSERT_EQ(0.0, expectedCard);
     expectedCard = estimate(hist, tag, value, EstimationType::kGreaterOrEqual).card;
@@ -299,28 +313,28 @@ TEST(EstimatorTest, TwoBucketsStrHistogram) {
     // Estimates for a value inside the bucket.
     std::tie(tag, value) = value::makeNewString("sun"_sd);
     expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card;
-    ASSERT_APPROX_EQUAL(2.0, expectedCard, 0.1);
+    ASSERT_APPROX_EQUAL(1.98, expectedCard, kErrorBound);
     expectedCard = estimate(hist, tag, value, EstimationType::kLess).card;
-    ASSERT_APPROX_EQUAL(74.4, expectedCard, 0.1);
+    ASSERT_APPROX_EQUAL(74.39, expectedCard, kErrorBound);
     expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card;
-    ASSERT_APPROX_EQUAL(76.4, expectedCard, 0.1);
+    ASSERT_APPROX_EQUAL(76.37, expectedCard, kErrorBound);
     expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card;
-    ASSERT_APPROX_EQUAL(23.6, expectedCard, 0.1);
+    ASSERT_APPROX_EQUAL(23.64, expectedCard, kErrorBound);
     expectedCard = estimate(hist, tag, value, EstimationType::kGreaterOrEqual).card;
-    ASSERT_APPROX_EQUAL(25.6, expectedCard, 0.1);
+    ASSERT_APPROX_EQUAL(25.62, expectedCard, kErrorBound);
 
     // Estimate for a value very close to the bucket bound.
     std::tie(tag, value) = value::makeNewString("xyw"_sd);
     expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card;
-    ASSERT_APPROX_EQUAL(2.0, expectedCard, 0.1);
+    ASSERT_APPROX_EQUAL(1.98, expectedCard, kErrorBound);
     expectedCard = estimate(hist, tag, value, EstimationType::kLess).card;
-    ASSERT_APPROX_EQUAL(95.0, expectedCard, 0.1);
+    ASSERT_APPROX_EQUAL(95.02, expectedCard, kErrorBound);
     expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card;
-    ASSERT_APPROX_EQUAL(97.0, expectedCard, 0.1);
+    ASSERT_APPROX_EQUAL(96.99, expectedCard, kErrorBound);
     expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card;
-    ASSERT_APPROX_EQUAL(3.0, expectedCard, 0.1);
+    ASSERT_APPROX_EQUAL(3.0, expectedCard, kErrorBound);
     expectedCard = estimate(hist, tag, value, EstimationType::kGreaterOrEqual).card;
-    ASSERT_APPROX_EQUAL(5.0, expectedCard, 0.1);
+    ASSERT_APPROX_EQUAL(4.98, expectedCard, kErrorBound);
 }
 
 TEST(EstimatorTest, TwoBucketsDateHistogram) {
@@ -366,9 +380,9 @@ TEST(EstimatorTest, TwoBucketsDateHistogram) {
     expectedCard = estimate(hist, value::TypeTags::Date, valueIn, EstimationType::kEqual).card;
     ASSERT_EQ(2.0, expectedCard);
     expectedCard = estimate(hist, value::TypeTags::Date, valueIn, EstimationType::kLess).card;
-    ASSERT_APPROX_EQUAL(48.8, expectedCard, 0.1);
+    ASSERT_APPROX_EQUAL(48.77, expectedCard, kErrorBound);
     expectedCard = estimate(hist, value::TypeTags::Date, valueIn, EstimationType::kGreater).card;
-    ASSERT_APPROX_EQUAL(49.2, expectedCard, 0.1);
+    ASSERT_APPROX_EQUAL(49.22, expectedCard, kErrorBound);
 
     const auto valueAfter = value::bitcastFrom<int64_t>(endInstant + 100);
     expectedCard = estimate(hist, value::TypeTags::Date, valueAfter, EstimationType::kEqual).card;
@@ -428,10 +442,10 @@ TEST(EstimatorTest, TwoBucketsTimestampHistogram) {
     expectedCard = estimate(hist, value::TypeTags::Timestamp, valueIn, EstimationType::kEqual).card;
     ASSERT_EQ(2.0, expectedCard);
     expectedCard = estimate(hist, value::TypeTags::Timestamp, valueIn, EstimationType::kLess).card;
-    ASSERT_APPROX_EQUAL(49.0, expectedCard, 0.1);
+    ASSERT_APPROX_EQUAL(49.0, expectedCard, kErrorBound);
     expectedCard =
         estimate(hist, value::TypeTags::Timestamp, valueIn, EstimationType::kGreater).card;
-    ASSERT_APPROX_EQUAL(49.0, expectedCard, 0.1);
+    ASSERT_APPROX_EQUAL(49.0, expectedCard, kErrorBound);
 
     const auto valueAfter = value::bitcastFrom<int64_t>(endTs.asULL() + 100);
     expectedCard =
@@ -489,11 +503,12 @@ TEST(EstimatorTest, TwoBucketsObjectIdHistogram) {
     const auto oidInside = OID("63340db2cd4d46ff39178e9d");
     oidInside.view().readInto(value::getObjectIdView(value));
     expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card;
-    ASSERT_APPROX_EQUAL(1.2, expectedCard, 0.1);
+    ASSERT_APPROX_EQUAL(1.25, expectedCard, kErrorBound);
+
     expectedCard = estimate(hist, tag, value, EstimationType::kLess).card;
-    ASSERT_APPROX_EQUAL(83.9, expectedCard, 0.1);
+    ASSERT_APPROX_EQUAL(83.95, expectedCard, kErrorBound);
     expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card;
-    ASSERT_APPROX_EQUAL(14.8, expectedCard, 0.1);
+    ASSERT_APPROX_EQUAL(14.78, expectedCard, kErrorBound);
 
     const auto oidAfter = OID("63340dbed6cd8af737d4139b");
     oidAfter.view().readInto(value::getObjectIdView(value));
@@ -525,7 +540,7 @@ TEST(EstimatorTest, TwoExclusiveBucketsMixedHistogram) {
                                             value::TypeTags::NumberInt32,
                                             value::bitcastFrom<int64_t>(1),
                                             true /* includeScalar */);
-    ASSERT_APPROX_EQUAL(0.0, expectedCard, 0.1);
+    ASSERT_APPROX_EQUAL(0.0, expectedCard, kErrorBound);
 
     // (NaN, 5).
     expectedCard = estimateCardRange(arrHist,
@@ -536,7 +551,7 @@ TEST(EstimatorTest, TwoExclusiveBucketsMixedHistogram) {
                                      value::TypeTags::NumberInt32,
                                      value::bitcastFrom<int64_t>(5),
                                      true /* includeScalar */);
-    ASSERT_APPROX_EQUAL(3.0, expectedCard, 0.1);
+    ASSERT_APPROX_EQUAL(3.0, expectedCard, kErrorBound);
 
     const auto [tagLowStr, valLowStr] = value::makeNewString(""_sd);
     value::ValueGuard vgLowStr(tagLowStr, valLowStr);
@@ -552,7 +567,7 @@ TEST(EstimatorTest, TwoExclusiveBucketsMixedHistogram) {
                                      tagLowStr,
                                      valLowStr,
                                      true /* includeScalar */);
-    ASSERT_APPROX_EQUAL(3.0, expectedCard, 0.1);
+    ASSERT_APPROX_EQUAL(3.0, expectedCard, kErrorBound);
 
     // ["", "a"].
     expectedCard = estimateCardRange(arrHist,
@@ -564,7 +579,7 @@ TEST(EstimatorTest, TwoExclusiveBucketsMixedHistogram) {
                                      value,
                                      true /* includeScalar */);
 
-    ASSERT_APPROX_EQUAL(0.0, expectedCard, 0.1);
+    ASSERT_APPROX_EQUAL(0.0, expectedCard, kErrorBound);
 
     std::tie(tag, value) = value::makeNewString("xyz"_sd);
     // ["", "xyz"].
@@ -577,7 +592,7 @@ TEST(EstimatorTest, TwoExclusiveBucketsMixedHistogram) {
                                      value,
                                      true /* includeScalar */);
 
-    ASSERT_APPROX_EQUAL(5.0, expectedCard, 0.1);
+    ASSERT_APPROX_EQUAL(5.0, expectedCard, kErrorBound);
 }
 
 TEST(EstimatorTest, TwoBucketsMixedHistogram) {
@@ -605,26 +620,28 @@ TEST(EstimatorTest, TwoBucketsMixedHistogram) {
     ASSERT_EQ(0.0, expectedCard);
 
     // Estimates for a value smaller than the first bucket bound.
-    ASSERT_APPROX_EQUAL(1.9, estimateIntValCard(hist, 50, EstimationType::kEqual), 0.1);
-    ASSERT_APPROX_EQUAL(6.6, estimateIntValCard(hist, 50, EstimationType::kLess), 0.1);
-    ASSERT_APPROX_EQUAL(8.5, estimateIntValCard(hist, 50, EstimationType::kLessOrEqual), 0.1);
-    ASSERT_APPROX_EQUAL(91.5, estimateIntValCard(hist, 50, EstimationType::kGreater), 0.1);
-    ASSERT_APPROX_EQUAL(93.4, estimateIntValCard(hist, 50, EstimationType::kGreaterOrEqual), 0.1);
+    ASSERT_APPROX_EQUAL(1.88, estimateIntValCard(hist, 50, EstimationType::kEqual), kErrorBound);
+    ASSERT_APPROX_EQUAL(6.61, estimateIntValCard(hist, 50, EstimationType::kLess), kErrorBound);
+    ASSERT_APPROX_EQUAL(
+        8.49, estimateIntValCard(hist, 50, EstimationType::kLessOrEqual), kErrorBound);
+    ASSERT_APPROX_EQUAL(91.5, estimateIntValCard(hist, 50, EstimationType::kGreater), kErrorBound);
+    ASSERT_APPROX_EQUAL(
+        93.39, estimateIntValCard(hist, 50, EstimationType::kGreaterOrEqual), kErrorBound);
 
     // Estimates for a value between bucket bounds.
     ASSERT_EQ(0.0, estimateIntValCard(hist, 105, EstimationType::kEqual));
 
     std::tie(tag, value) = value::makeNewString("a"_sd);
     expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card;
-    ASSERT_APPROX_EQUAL(3.0, expectedCard, 0.1);
+    ASSERT_APPROX_EQUAL(3.0, expectedCard, kErrorBound);
     expectedCard = estimate(hist, tag, value, EstimationType::kLess).card;
-    ASSERT_APPROX_EQUAL(54.5, expectedCard, 0.1);
+    ASSERT_APPROX_EQUAL(54.5, expectedCard, kErrorBound);
     expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card;
-    ASSERT_APPROX_EQUAL(57.5, expectedCard, 0.1);
+    ASSERT_APPROX_EQUAL(57.5, expectedCard, kErrorBound);
     expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card;
-    ASSERT_APPROX_EQUAL(42.5, expectedCard, 0.1);
+    ASSERT_APPROX_EQUAL(42.5, expectedCard, kErrorBound);
     expectedCard = estimate(hist, tag, value, EstimationType::kGreaterOrEqual).card;
-    ASSERT_APPROX_EQUAL(45.5, expectedCard, 0.1);
+    ASSERT_APPROX_EQUAL(45.5, expectedCard, kErrorBound);
 
     // Range estimates, including min/max values per data type.
     const auto [tagLowDbl, valLowDbl] =
@@ -642,7 +659,7 @@ TEST(EstimatorTest, TwoBucketsMixedHistogram) {
                                      value::TypeTags::NumberInt32,
                                      value::bitcastFrom<int64_t>(25),
                                      true /* includeScalar */);
-    ASSERT_APPROX_EQUAL(8.49, expectedCard, 0.1);
+    ASSERT_APPROX_EQUAL(8.49, expectedCard, kErrorBound);
 
     // [25, 1000000].
     expectedCard = estimateCardRange(arrHist,
@@ -653,7 +670,7 @@ TEST(EstimatorTest, TwoBucketsMixedHistogram) {
                                      tagHighInt,
                                      valHighInt,
                                      true /* includeScalar */);
-    ASSERT_APPROX_EQUAL(13.38, expectedCard, 0.1);
+    ASSERT_APPROX_EQUAL(13.38, expectedCard, kErrorBound);
 
     // [NaN, 1000000].
     expectedCard = estimateCardRange(arrHist,
@@ -664,7 +681,7 @@ TEST(EstimatorTest, TwoBucketsMixedHistogram) {
                                      tagHighInt,
                                      valHighInt,
                                      true /* includeScalar */);
-    ASSERT_APPROX_EQUAL(19.99, expectedCard, 0.1);
+    ASSERT_APPROX_EQUAL(20.0, expectedCard, kErrorBound);
 
     const auto [tagLowStr, valLowStr] = value::makeNewString(""_sd);
     value::ValueGuard vgLowStr(tagLowStr, valLowStr);
@@ -678,7 +695,7 @@ TEST(EstimatorTest, TwoBucketsMixedHistogram) {
                                      tagLowStr,
                                      valLowStr,
                                      true /* includeScalar */);
-    ASSERT_APPROX_EQUAL(19.99, expectedCard, 0.1);
+    ASSERT_APPROX_EQUAL(20.0, expectedCard, kErrorBound);
 
     // [25, "").
     expectedCard = estimateCardRange(arrHist,
@@ -689,7 +706,7 @@ TEST(EstimatorTest, TwoBucketsMixedHistogram) {
                                      tagLowStr,
                                      valLowStr,
                                      true /* includeScalar */);
-    ASSERT_APPROX_EQUAL(13.39, expectedCard, 0.1);
+    ASSERT_APPROX_EQUAL(13.39, expectedCard, kErrorBound);
 
     // ["", "a"].
     expectedCard = estimateCardRange(arrHist,
@@ -701,7 +718,7 @@ TEST(EstimatorTest, TwoBucketsMixedHistogram) {
                                      value,
                                      true /* includeScalar */);
 
-    ASSERT_APPROX_EQUAL(37.49, expectedCard, 0.1);
+    ASSERT_APPROX_EQUAL(37.49, expectedCard, kErrorBound);
 
     // ["", {}).
     auto [tagObj, valObj] = value::makeNewObject();
@@ -714,7 +731,7 @@ TEST(EstimatorTest, TwoBucketsMixedHistogram) {
                                      tagObj,
                                      valObj,
                                      true /* includeScalar */);
-    ASSERT_APPROX_EQUAL(79.99, expectedCard, 0.1);
+    ASSERT_APPROX_EQUAL(80.0, expectedCard, kErrorBound);
 
     // ["a", {}).
     expectedCard = estimateCardRange(arrHist,
@@ -726,8 +743,257 @@ TEST(EstimatorTest, TwoBucketsMixedHistogram) {
                                      valObj,
                                      true /* includeScalar */);
 
-    ASSERT_APPROX_EQUAL(45.5, expectedCard, 0.1);
+    ASSERT_APPROX_EQUAL(45.5, expectedCard, kErrorBound);
+}
+
+/**
+ * Tests for cardinality estimates for queries over minimum values of date, timestamp, and objectId
+ * types. When the histogram has at least 2 buckets per data type, the minimum value, if present in
+ * the data, is picked as a bound for the first bucket for the corresponding data type. In this case
+ * the cardinality estimates are precise. To test the approximate estimation, we force the histogram
+ * generation to use one bucket per type (except the first numeric type).
+ */
+TEST(EstimatorTest, MinValueMixedHistogramFromData) {
+    const int64_t startInstant = 1506777923000LL;
+    const int64_t endInstant = 1516864323000LL;
+    const Timestamp startTs{Seconds(1516864323LL), 0};
+    const Timestamp endTs{Seconds(1526864323LL), 0};
+    const auto startOid = OID("63340d8d27afef2de7357e8d");
+    //    const auto endOid = OID("63340dbed6cd8af737d4139a");
+
+    std::vector<SBEValue> data;
+    data.emplace_back(value::TypeTags::Date, value::bitcastFrom<int64_t>(startInstant));
+    data.emplace_back(value::TypeTags::Date, value::bitcastFrom<int64_t>(endInstant));
+
+    data.emplace_back(value::TypeTags::Timestamp, value::bitcastFrom<int64_t>(startTs.asULL()));
+    data.emplace_back(value::TypeTags::Timestamp, value::bitcastFrom<int64_t>(endTs.asULL()));
+
+    auto [tag, val] = makeInt64Value(100);
+    data.emplace_back(tag, val);
+    std::tie(tag, val) = makeInt64Value(1000);
+    data.emplace_back(tag, val);
+
+    auto [strTag, strVal] = value::makeNewString("abc"_sd);
+    value::ValueGuard strVG(strTag, strVal);
+    auto [copyTag, copyVal] = value::copyValue(strTag, strVal);
+    data.emplace_back(copyTag, copyVal);
+    std::tie(strTag, strVal) = value::makeNewString("xyz"_sd);
+    std::tie(copyTag, copyVal) = value::copyValue(strTag, strVal);
+    data.emplace_back(copyTag, copyVal);
+
+    auto [objTag, objVal] = value::makeNewObjectId();
+    value::ValueGuard objVG(objTag, objVal);
+    startOid.view().readInto(value::getObjectIdView(objVal));
+    std::tie(tag, val) = copyValue(objTag, objVal);
+    data.emplace_back(tag, val);
+    /* TODO: add another objectId value when mapping to double is fixed by SERVER-71205.
+        endOid.view().readInto(value::getObjectIdView(objVal));
+        std::tie(tag, val) = copyValue(objTag, objVal);
+        data.emplace_back(tag, val);
+    */
+
+    sortValueVector(data);
+
+    // Force each type except numbers to use a single bucket. This way there is no bucket for the
+    // min value if present in the data and it needs to be estimated.
+    const ScalarHistogram& hist = makeHistogram(data, 6);
+    // Mixed data are sorted in the histogram according to the BSON order as defined in bsontypes.h
+    // the canonicalizeBSONTypeUnsafeLookup function.
+    if constexpr (kCETestLogOnly) {
+        std::cout << printValueArray(data) << "\n";
+        std::cout << "Mixed types " << hist.dump();
+    }
+
+    // Minimum ObjectId.
+    auto&& [minOid, inclOid] = getMinMaxBoundForType(true /*isMin*/, value::TypeTags::ObjectId);
+    auto [minOidTag, minOidVal] = minOid->cast<mongo::optimizer::Constant>()->get();
+    double expectedCard = estimate(hist, minOidTag, minOidVal, EstimationType::kEqual).card;
+    ASSERT_EQ(1.0, expectedCard);
+
+    // Minimum date.
+    const auto&& [minDate, inclDate] = getMinMaxBoundForType(true /*isMin*/, value::TypeTags::Date);
+    const auto [minDateTag, minDateVal] = minDate->cast<mongo::optimizer::Constant>()->get();
+    expectedCard = estimate(hist, minDateTag, minDateVal, EstimationType::kEqual).card;
+    ASSERT_EQ(1.0, expectedCard);
+
+    // Minimum timestamp.
+    auto&& [minTs, inclTs] = getMinMaxBoundForType(true /*isMin*/, value::TypeTags::Timestamp);
+    auto [minTsTag, minTsVal] = minTs->cast<mongo::optimizer::Constant>()->get();
+    expectedCard = estimate(hist, minTsTag, minTsVal, EstimationType::kEqual).card;
+    ASSERT_EQ(1.0, expectedCard);
+
+    // Add minimum values to the data set and create another histogram.
+    const auto [tagLowStr, valLowStr] = value::makeNewString(""_sd);
+    value::ValueGuard vgLowStr(tagLowStr, valLowStr);
+    std::tie(copyTag, copyVal) = value::copyValue(tagLowStr, valLowStr);
+    data.emplace_back(copyTag, copyVal);
+    data.emplace_back(minDateTag, minDateVal);
+    data.emplace_back(minTsTag, minTsVal);
+
+    sortValueVector(data);
+    const ScalarHistogram& hist2 = makeHistogram(data, 6);
+    if constexpr (kCETestLogOnly) {
+        std::cout << printValueArray(data) << "\n";
+        std::cout << "Mixed types " << hist2.dump();
+    }
+
+    // Precise estimate for equality to empty string, it is a bucket boundary.
+    expectedCard = estimate(hist2, tagLowStr, valLowStr, EstimationType::kEqual).card;
+    ASSERT_EQ(1.0, expectedCard);
+    // Equality to the minimum date/ts value is estimated by range_frequency/NDV.
+    expectedCard = estimate(hist2, minDateTag, minDateVal, EstimationType::kEqual).card;
+    ASSERT_EQ(1.0, expectedCard);
+    expectedCard = estimate(hist2, minTsTag, minTsVal, EstimationType::kEqual).card;
+    ASSERT_EQ(1.0, expectedCard);
+
+    // Inequality predicates using min values.
+    const ArrayHistogram arrHist(hist2,
+                                 TypeCounts{
+                                     {value::TypeTags::NumberInt64, 2},
+                                     {value::TypeTags::StringSmall, 3},
+                                     {value::TypeTags::ObjectId, 1},
+                                     {value::TypeTags::Date, 3},
+                                     {value::TypeTags::Timestamp, 3},
+                                 });
+    // [minDate, startInstant], estimated by the half of the date bucket.
+    expectedCard = estimateCardRange(arrHist,
+                                     true /* lowInclusive */,
+                                     minDateTag,
+                                     minDateVal,
+                                     true /* highInclusive */,
+                                     value::TypeTags::Date,
+                                     value::bitcastFrom<int64_t>(startInstant),
+                                     true /* includeScalar */);
+    ASSERT_EQ(1.0, expectedCard);
+
+    // [minDate, endInstant], estimated by the entire date bucket.
+    expectedCard = estimateCardRange(arrHist,
+                                     true /* lowInclusive */,
+                                     minDateTag,
+                                     minDateVal,
+                                     true /* highInclusive */,
+                                     value::TypeTags::Date,
+                                     value::bitcastFrom<int64_t>(endInstant),
+                                     true /* includeScalar */);
+    ASSERT_EQ(3.0, expectedCard);
+
+    // [minDate, minTs), estimated by the entire date bucket.
+    // (is this interval possible or is it better to have maxDate upper bound?).
+    expectedCard = estimateCardRange(arrHist,
+                                     true /* lowInclusive */,
+                                     minDateTag,
+                                     minDateVal,
+                                     false /* highInclusive */,
+                                     minTsTag,
+                                     minTsVal,
+                                     true /* includeScalar */);
+    ASSERT_EQ(3.0, expectedCard);
+
+    // [minTs, startTs], estimated by the half of the timestamp bucket.
+    expectedCard = estimateCardRange(arrHist,
+                                     true /* lowInclusive */,
+                                     minTsTag,
+                                     minTsVal,
+                                     true /* highInclusive */,
+                                     value::TypeTags::Timestamp,
+                                     value::bitcastFrom<int64_t>(startTs.asULL()),
+                                     true /* includeScalar */);
+    ASSERT_EQ(1.0, expectedCard);
+
+    // [minTs, endTs], estimated by the entire timestamp bucket.
+    expectedCard = estimateCardRange(arrHist,
+                                     true /* lowInclusive */,
+                                     minTsTag,
+                                     minTsVal,
+                                     true /* highInclusive */,
+                                     value::TypeTags::Timestamp,
+                                     value::bitcastFrom<int64_t>(endTs.asULL()),
+                                     true /* includeScalar */);
+    ASSERT_EQ(3.0, expectedCard);
+
+    // [minTs, maxTs], estimated by the entire timestamp bucket.
+    auto&& [maxTs, inclMaxTs] = getMinMaxBoundForType(false /*isMin*/, value::TypeTags::Timestamp);
+    const auto [maxTsTag, maxTsVal] = maxTs->cast<mongo::optimizer::Constant>()->get();
+    expectedCard = estimateCardRange(arrHist,
+                                     true /* lowInclusive */,
+                                     minTsTag,
+                                     minTsVal,
+                                     true /* highInclusive */,
+                                     maxTsTag,
+                                     maxTsVal,
+                                     true /* includeScalar */);
+    ASSERT_EQ(3.0, expectedCard);
 }
 
+TEST(EstimatorTest, MinValueMixedHistogramFromBuckets) {
+    const auto endOid = OID("63340dbed6cd8af737d4139a");
+    const auto endDate = Date_t::fromMillisSinceEpoch(1526864323000LL);
+    const Timestamp endTs{Seconds(1526864323LL), 0};
+
+    std::vector<BucketData> data{
+        {0, 1.0, 0.0, 0.0},
+        {100, 4.0, 95.0, 30.0},
+        {"xyz", 5.0, 95.0, 25.0},
+        {Value(endOid), 5.0, 95.0, 50.0},
+        {Value(endDate), 4.0, 96.0, 24.0},
+        {Value(endTs), 5.0, 95.0, 50.0},
+    };
+    const ScalarHistogram hist = createHistogram(data);
+    if constexpr (kCETestLogOnly) {
+        std::cout << "Mixed types " << hist.dump();
+    }
+    ASSERT_EQ(500.0, getTotals(hist).card);
+
+    // Minimum ObjectId.
+    auto&& [minOid, inclOid] = getMinMaxBoundForType(true /*isMin*/, value::TypeTags::ObjectId);
+    auto [minOidTag, minOidVal] = minOid->cast<mongo::optimizer::Constant>()->get();
+    double expectedCard = estimate(hist, minOidTag, minOidVal, EstimationType::kEqual).card;
+    ASSERT_APPROX_EQUAL(1.9, expectedCard, kErrorBound);
+
+    // Minimum date.
+    const auto&& [minDate, inclDate] = getMinMaxBoundForType(true /*isMin*/, value::TypeTags::Date);
+    const auto [minDateTag, minDateVal] = minDate->cast<mongo::optimizer::Constant>()->get();
+    expectedCard = estimate(hist, minDateTag, minDateVal, EstimationType::kEqual).card;
+    ASSERT_EQ(4.0, expectedCard);
+
+    // Minimum timestamp.
+    auto&& [minTs, inclTs] = getMinMaxBoundForType(true /*isMin*/, value::TypeTags::Timestamp);
+    auto [minTsTag, minTsVal] = minTs->cast<mongo::optimizer::Constant>()->get();
+    expectedCard = estimate(hist, minTsTag, minTsVal, EstimationType::kEqual).card;
+    ASSERT_APPROX_EQUAL(1.9, expectedCard, kErrorBound);
+
+    // Inequality predicates using min values.
+    const ArrayHistogram arrHist(hist,
+                                 TypeCounts{
+                                     {value::TypeTags::NumberInt64, 100},
+                                     {value::TypeTags::StringSmall, 100},
+                                     {value::TypeTags::ObjectId, 100},
+                                     {value::TypeTags::Date, 100},
+                                     {value::TypeTags::Timestamp, 100},
+                                 });
+    // [minDate, innerDate], estimated by the half of the date bucket.
+    const int64_t innerDate = 1516864323000LL;
+    expectedCard = estimateCardRange(arrHist,
+                                     true /* lowInclusive */,
+                                     minDateTag,
+                                     minDateVal,
+                                     true /* highInclusive */,
+                                     value::TypeTags::Date,
+                                     value::bitcastFrom<int64_t>(innerDate),
+                                     true /* includeScalar */);
+    ASSERT_APPROX_EQUAL(48.0, expectedCard, kErrorBound);
+
+    // [minTs, innerTs], estimated by the half of the timestamp bucket.
+    const Timestamp innerTs{Seconds(1516864323LL), 0};
+    expectedCard = estimateCardRange(arrHist,
+                                     true /* lowInclusive */,
+                                     minTsTag,
+                                     minTsVal,
+                                     true /* highInclusive */,
+                                     value::TypeTags::Timestamp,
+                                     value::bitcastFrom<int64_t>(innerTs.asULL()),
+                                     true /* includeScalar */);
+    ASSERT_APPROX_EQUAL(47.5, expectedCard, kErrorBound);
+}
 }  // namespace
 }  // namespace mongo::ce
diff --git a/src/mongo/db/query/ce/ce_generated_histograms_test.cpp b/src/mongo/db/query/ce/ce_generated_histograms_test.cpp
index c3751acc418..93696346447 100644
--- a/src/mongo/db/query/ce/ce_generated_histograms_test.cpp
+++ b/src/mongo/db/query/ce/ce_generated_histograms_test.cpp
@@ -131,7 +131,7 @@ TEST(EstimatorTest, UniformIntStrEstimate) {
     // Queries over the low string bound.
     // Actual cardinality {$eq: ''} = 0.
     expectedCard = estimateCardEq(arrHist, tagLowStr, valLowStr, true);
-    ASSERT_APPROX_EQUAL(0.01, expectedCard, 0.001);
+    ASSERT_APPROX_EQUAL(2.727, expectedCard, 0.001);
 
     // Actual cardinality {$gt: ''} = 485.
     expectedCard = estimateCardRange(arrHist,
@@ -270,7 +270,7 @@ TEST(EstimatorTest, IntStrArrayEstimate) {
 
     // Actual cardinality {$eq: ''} = 0.
     expectedCard = estimateCardEq(arrHist, tagLowStr, valLowStr, true /* includeScalar */);
-    ASSERT_APPROX_EQUAL(0.02, expectedCard, 0.001);
+    ASSERT_APPROX_EQUAL(6.69, expectedCard, 0.001);
 
     // Actual cardinality {$eq: 'DD2'} = 2.
     auto [tagStr, valStr] = value::makeNewString("DD2"_sd);
diff --git a/src/mongo/db/query/ce/histogram_estimation.cpp b/src/mongo/db/query/ce/histogram_estimation.cpp
index c2cd8226ef1..cd1e52219c2 100644
--- a/src/mongo/db/query/ce/histogram_estimation.cpp
+++ b/src/mongo/db/query/ce/histogram_estimation.cpp
@@ -121,17 +121,6 @@ EstimationResult interpolateEstimateInBucket(const ScalarHistogram& h,
         }
     }
 
-    // If the value is minimal for its type, return minimal valid cardinality.
-    auto&& [minConstant, inclusive] = getMinMaxBoundForType(true /*isMin*/, tag);
-    auto [minTag, minVal] = getConstTypeVal(*minConstant);
-    if (compareValues(minTag, minVal, tag, val) == 0) {
-        if (type == EstimationType::kEqual) {
-            return {kMinCard, 1.0};
-        } else {
-            return {resultCard, resultNDV};
-        }
-    }
-
     // Estimate for equality frequency inside of the bucket.
     const double innerEqFreq = (bucket._ndv == 0.0) ? 0.0 : bucket._rangeFreq / bucket._ndv;
 
@@ -139,6 +128,14 @@ EstimationResult interpolateEstimateInBucket(const ScalarHistogram& h,
         return {innerEqFreq, 1.0};
     }
 
+    // If the value is minimal for its type, and the operation is $lt or $lte return cardinality up
+    // to the previous bucket.
+    auto&& [minConstant, inclusive] = getMinMaxBoundForType(true /*isMin*/, tag);
+    auto [minTag, minVal] = getConstTypeVal(*minConstant);
+    if (compareValues(minTag, minVal, tag, val) == 0) {
+        return {resultCard, resultNDV};
+    }
+
     // For $lt and $lte operations use linear interpolation to take a fraction of the bucket
     // cardinality and NDV if there is a preceeding bucket with bound of the same type. Use half of
     // the bucket estimates otherwise.
author	Milena Ivanova <milena.ivanova@mongodb.com>	2022-11-10 15:52:46 +0000
committer	Evergreen Agent <no-reply@evergreen.mongodb.com>	2022-11-10 16:25:21 +0000
commit	d5ff440817391833df0ee8d918689cef60140d90 (patch)
tree	d6d26f58da16ce3e11be567e6932c22f5c3a5bc6 /src/mongo/db
parent	e9c34ed06ca402773a01816dff99a462c43e933a (diff)
download	mongo-d5ff440817391833df0ee8d918689cef60140d90.tar.gz