diff options
author | Alya Berciu <alya.berciu@mongodb.com> | 2022-09-02 12:40:26 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2022-09-02 13:39:17 +0000 |
commit | 8581b4968646ea09b703d985c22b08cc01102597 (patch) | |
tree | a916997e4044cb0223a4ef483c4ec805353989b6 /src/mongo/db | |
parent | 26d222d66fbb97152b283a63f8e6393171c8710f (diff) | |
download | mongo-8581b4968646ea09b703d985c22b08cc01102597.tar.gz |
SERVER-68445 Integrate histogram bucket interpolation
Co-authored-by: Milena Ivanova <milena.ivanova@mongodb.com>
Co-authored-by: Alya Berciu <alya.berciu@mongodb.com>
Diffstat (limited to 'src/mongo/db')
-rw-r--r-- | src/mongo/db/query/ce/SConscript | 10 | ||||
-rw-r--r-- | src/mongo/db/query/ce/array_histogram.cpp | 12 | ||||
-rw-r--r-- | src/mongo/db/query/ce/array_histogram.h | 16 | ||||
-rw-r--r-- | src/mongo/db/query/ce/ce_histogram_test.cpp | 48 | ||||
-rw-r--r-- | src/mongo/db/query/ce/ce_interpolation_test.cpp | 480 | ||||
-rw-r--r-- | src/mongo/db/query/ce/ce_test_utils.cpp | 12 | ||||
-rw-r--r-- | src/mongo/db/query/ce/ce_test_utils.h | 7 | ||||
-rw-r--r-- | src/mongo/db/query/ce/histogram_estimation.cpp | 146 | ||||
-rw-r--r-- | src/mongo/db/query/ce/histogram_estimation.h | 25 |
9 files changed, 686 insertions, 70 deletions
diff --git a/src/mongo/db/query/ce/SConscript b/src/mongo/db/query/ce/SConscript index edb7b808623..ae0e61a8f92 100644 --- a/src/mongo/db/query/ce/SConscript +++ b/src/mongo/db/query/ce/SConscript @@ -63,6 +63,16 @@ env.CppUnitTest( ) env.CppUnitTest( + target="ce_interpolation_test", + source=[ + "ce_interpolation_test.cpp", + ], + LIBDEPS=[ + 'ce_test_utils', + ], +) + +env.CppUnitTest( target="ce_heuristic_test", source=[ "ce_heuristic_test.cpp", diff --git a/src/mongo/db/query/ce/array_histogram.cpp b/src/mongo/db/query/ce/array_histogram.cpp index 8e44c2973cf..453e31ac047 100644 --- a/src/mongo/db/query/ce/array_histogram.cpp +++ b/src/mongo/db/query/ce/array_histogram.cpp @@ -35,11 +35,11 @@ using namespace sbe; ArrayHistogram::ArrayHistogram() : ArrayHistogram(ScalarHistogram(), {}) {} ArrayHistogram::ArrayHistogram(ScalarHistogram scalar, - std::map<value::TypeTags, size_t> typeCounts, + TypeCounts typeCounts, ScalarHistogram arrayUnique, ScalarHistogram arrayMin, ScalarHistogram arrayMax, - std::map<value::TypeTags, size_t> arrayTypeCounts) + TypeCounts arrayTypeCounts) : _scalar(std::move(scalar)), _typeCounts(std::move(typeCounts)), _arrayUnique(std::move(arrayUnique)), @@ -49,7 +49,7 @@ ArrayHistogram::ArrayHistogram(ScalarHistogram scalar, invariant(isArray()); } -ArrayHistogram::ArrayHistogram(ScalarHistogram scalar, std::map<value::TypeTags, size_t> typeCounts) +ArrayHistogram::ArrayHistogram(ScalarHistogram scalar, TypeCounts typeCounts) : _scalar(std::move(scalar)), _typeCounts(std::move(typeCounts)), _arrayUnique(boost::none), @@ -63,7 +63,7 @@ bool ArrayHistogram::isArray() const { return _arrayUnique && _arrayMin && _arrayMax && _arrayTypeCounts; } -std::string typeCountsToString(const std::map<value::TypeTags, size_t>& typeCounts) { +std::string typeCountsToString(const TypeCounts& typeCounts) { std::ostringstream os; os << "{"; bool first = true; @@ -111,11 +111,11 @@ const ScalarHistogram& ArrayHistogram::getArrayMax() const { return 
*_arrayMax; } -const std::map<value::TypeTags, size_t>& ArrayHistogram::getTypeCounts() const { +const TypeCounts& ArrayHistogram::getTypeCounts() const { return _typeCounts; } -const std::map<value::TypeTags, size_t>& ArrayHistogram::getArrayTypeCounts() const { +const TypeCounts& ArrayHistogram::getArrayTypeCounts() const { invariant(isArray()); return *_arrayTypeCounts; } diff --git a/src/mongo/db/query/ce/array_histogram.h b/src/mongo/db/query/ce/array_histogram.h index 9acdeb6b7f7..2a6e9af3383 100644 --- a/src/mongo/db/query/ce/array_histogram.h +++ b/src/mongo/db/query/ce/array_histogram.h @@ -36,21 +36,23 @@ namespace mongo::ce { +using TypeCounts = std::map<sbe::value::TypeTags, size_t>; + class ArrayHistogram { public: // Constructs an empty scalar histogram. ArrayHistogram(); // Constructor for scalar field histograms. - ArrayHistogram(ScalarHistogram scalar, std::map<sbe::value::TypeTags, size_t> typeCounts); + ArrayHistogram(ScalarHistogram scalar, TypeCounts typeCounts); // Constructor for array field histograms. We have to initialize all array fields in this case. ArrayHistogram(ScalarHistogram scalar, - std::map<sbe::value::TypeTags, size_t> typeCounts, + TypeCounts typeCounts, ScalarHistogram arrayUnique, ScalarHistogram arrayMin, ScalarHistogram arrayMax, - std::map<sbe::value::TypeTags, size_t> arrayTypeCounts); + TypeCounts arrayTypeCounts); // ArrayHistogram is neither copy-constructible nor copy-assignable. ArrayHistogram(const ArrayHistogram&) = delete; @@ -69,15 +71,15 @@ public: const ScalarHistogram& getArrayUnique() const; const ScalarHistogram& getArrayMin() const; const ScalarHistogram& getArrayMax() const; - const std::map<sbe::value::TypeTags, size_t>& getTypeCounts() const; - const std::map<sbe::value::TypeTags, size_t>& getArrayTypeCounts() const; + const TypeCounts& getTypeCounts() const; + const TypeCounts& getArrayTypeCounts() const; private: /* ScalarHistogram fields for all paths. 
*/ // Contains values which appeared originally as scalars on the path. ScalarHistogram _scalar; - std::map<sbe::value::TypeTags, size_t> _typeCounts; + TypeCounts _typeCounts; /* ScalarHistogram fields for array paths (only initialized if arrays are present). */ @@ -87,7 +89,7 @@ private: boost::optional<ScalarHistogram> _arrayMin; // Contains maximum values originating from arrays **per class**. boost::optional<ScalarHistogram> _arrayMax; - boost::optional<std::map<sbe::value::TypeTags, size_t>> _arrayTypeCounts; + boost::optional<TypeCounts> _arrayTypeCounts; }; diff --git a/src/mongo/db/query/ce/ce_histogram_test.cpp b/src/mongo/db/query/ce/ce_histogram_test.cpp index 13ab164840c..be0073d96d1 100644 --- a/src/mongo/db/query/ce/ce_histogram_test.cpp +++ b/src/mongo/db/query/ce/ce_histogram_test.cpp @@ -30,6 +30,7 @@ #include "mongo/db/query/ce/ce_histogram.h" #include "mongo/db/query/ce/ce_test_utils.h" #include "mongo/db/query/ce/histogram_estimation.h" +#include "mongo/db/query/optimizer/utils/unit_test_utils.h" #include "mongo/db/query/sbe_stage_builder_helpers.h" #include "mongo/unittest/unittest.h" @@ -63,7 +64,7 @@ struct TestBucket { std::unique_ptr<ArrayHistogram> getHistogramFromData(std::vector<TestBucket> testBuckets) { sbe::value::Array bounds; std::vector<Bucket> buckets; - std::map<sbe::value::TypeTags, size_t> typeCounts; + TypeCounts typeCounts; int cumulativeFreq = 0; int cumulativeNDV = 0; @@ -306,24 +307,18 @@ TEST(CEHistogramTest, AssertOneBoundIntRangeHistogram) { ASSERT_MATCH_CE(t, "{intRange: {$eq: 20}}", 1.0); ASSERT_MATCH_CE(t, "{intRange: {$gte: 20}}", 1.0); ASSERT_MATCH_CE(t, "{intRange: {$gt: 10}}", 46.0); - ASSERT_MATCH_CE(t, "{intRange: {$gte: 15}}", 23.5); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 15}}", 28.5); ASSERT_MATCH_CE(t, "{intRange: {$gt: 15}}", 23.5); - ASSERT_MATCH_CE(t, "{intRange: {$gte: 11}, intRange: {$lte: 20}}", 23.5); - ASSERT_MATCH_CE(t, "{intRange: {$gt: 11}, intRange: {$lte: 20}}", 23.5); - ASSERT_MATCH_CE(t, 
"{intRange: {$gte: 11}, intRange: {$lt: 20}}", 23.27); - ASSERT_MATCH_CE(t, "{intRange: {$gt: 11}, intRange: {$lt: 20}}", 23.27); - ASSERT_MATCH_CE(t, "{intRange: {$gt: 12}, intRange: {$lt: 15}}", 17.25); - ASSERT_MATCH_CE(t, "{intRange: {$gte: 12}, intRange: {$lt: 15}}", 17.25); - ASSERT_MATCH_CE(t, "{intRange: {$gt: 12}, intRange: {$lte: 15}}", 17.25); - ASSERT_MATCH_CE(t, "{intRange: {$gte: 12}, intRange: {$lte: 15}}", 17.25); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 11}, intRange: {$lte: 20}}", 46.5); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 11}, intRange: {$lte: 20}}", 41.5); // Test ranges that partially overlap with the entire histogram. - ASSERT_MATCH_CE(t, "{intRange: {$lt: 11}}", 27.5); - ASSERT_MATCH_CE(t, "{intRange: {$lt: 15}}", 27.5); + ASSERT_MATCH_CE(t, "{intRange: {$lt: 11}}", 4.5); + ASSERT_MATCH_CE(t, "{intRange: {$lt: 15}}", 22.5); ASSERT_MATCH_CE(t, "{intRange: {$lte: 15}}", 27.5); ASSERT_MATCH_CE(t, "{intRange: {$gte: 8}, intRange: {$lte: 15}}", 27.5); ASSERT_MATCH_CE(t, "{intRange: {$gt: 8}, intRange: {$lte: 15}}", 27.5); - ASSERT_MATCH_CE(t, "{intRange: {$gt: 8}, intRange: {$lt: 15}}", 27.5); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 8}, intRange: {$lt: 15}}", 22.5); ASSERT_MATCH_CE(t, "{intRange: {$gte: 8}, intRange: {$lte: 15}}", 27.5); // Test ranges that include all values in the histogram. @@ -341,7 +336,6 @@ TEST(CEHistogramTest, AssertOneBoundIntRangeHistogram) { ASSERT_MATCH_CE(t, "{intRange: {$eq: 10.5}}", 5.0); ASSERT_MATCH_CE(t, "{intRange: {$eq: 12.5}}", 5.0); ASSERT_MATCH_CE(t, "{intRange: {$eq: 19.36}}", 5.0); - ASSERT_MATCH_CE(t, "{intRange: {$lt: 19}, intRange: {$gt: 11}}", 17.26); // Test ranges that don't overlap with the histogram. 
ASSERT_MATCH_CE(t, "{intRange: {$lt: 10}}", 0.0); @@ -360,6 +354,32 @@ TEST(CEHistogramTest, AssertOneBoundIntRangeHistogram) { ASSERT_MATCH_CE(t, "{intRange: {$gt: 0}, intRange: {$lt: 5}}", 0.0); ASSERT_MATCH_CE(t, "{intRange: {$gte: 0}, intRange: {$lt: 5}}", 0.0); ASSERT_MATCH_CE(t, "{intRange: {$gt: 0}, intRange: {$lte: 5}}", 0.0); + + // Because we don't specify any indexes here, these intervals do not go through simplification. + // This means that instead of having one key in the requirements map of the generated sargable + // node corresponding to the path "intRange", we have two keys and two ranges, both + // corresponding to the same path. As a consequence, we combine the estimates for the intervals + // using exponential backoff, which results in an overestimate. + ASSERT_MATCH_CE(t, "{intRange: {$gte: 11}, intRange: {$lt: 20}}", 46.04); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 11}, intRange: {$lt: 20}}", 41.09); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 12}, intRange: {$lt: 15}}", 19.16); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 12}, intRange: {$lt: 15}}", 20.42); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 12}, intRange: {$lte: 15}}", 23.42); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 12}, intRange: {$lte: 15}}", 24.96); + ASSERT_MATCH_CE(t, "{intRange: {$lt: 19}, intRange: {$gt: 11}}", 36.53); + + // When we specify that there is a non-multikey index on 'intRange', we expect to see interval + // simplification occurring, which should provide a better estimate for the following ranges. 
+ t.setIndexes( + {{"intRangeIndex", + makeIndexDefinition("intRange", CollationOp::Ascending, /* isMultiKey */ false)}}); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 11}, intRange: {$lt: 20}}", 45.5); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 11}, intRange: {$lt: 20}}", 40.5); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 12}, intRange: {$lt: 15}}", 8.5); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 12}, intRange: {$lt: 15}}", 13.5); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 12}, intRange: {$lte: 15}}", 13.5); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 12}, intRange: {$lte: 15}}", 18.5); + ASSERT_MATCH_CE(t, "{intRange: {$lt: 19}, intRange: {$gt: 11}}", 31.0); } TEST(CEHistogramTest, TestHistogramOnNestedPaths) { diff --git a/src/mongo/db/query/ce/ce_interpolation_test.cpp b/src/mongo/db/query/ce/ce_interpolation_test.cpp new file mode 100644 index 00000000000..152ac7480f3 --- /dev/null +++ b/src/mongo/db/query/ce/ce_interpolation_test.cpp @@ -0,0 +1,480 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. 
You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/query/ce/array_histogram.h" +#include "mongo/db/query/ce/ce_test_utils.h" +#include "mongo/db/query/ce/histogram_estimation.h" +#include "mongo/db/query/sbe_stage_builder_helpers.h" +#include "mongo/unittest/unittest.h" + +namespace mongo::ce { +namespace { + +using namespace sbe; + +const std::pair<value::TypeTags, value::Value> makeInt64Value(const int v) { + return std::make_pair(value::TypeTags::NumberInt64, value::bitcastFrom<int64_t>(v)); +}; + +double estimateIntValCard(const ScalarHistogram& hist, const int v, const EstimationType type) { + const auto [tag, val] = makeInt64Value(v); + return estimate(hist, tag, val, type).card; +}; + +struct BucketData { + Value _v; + double _equalFreq; + double _rangeFreq; + double _ndv; + + BucketData(Value v, double equalFreq, double rangeFreq, double ndv) + : _v(v), _equalFreq(equalFreq), _rangeFreq(rangeFreq), _ndv(ndv) {} + BucketData(const std::string& v, double equalFreq, double rangeFreq, double ndv) + : BucketData(Value(v), equalFreq, rangeFreq, ndv) {} + BucketData(int v, double equalFreq, double rangeFreq, double ndv) + : BucketData(Value(v), equalFreq, rangeFreq, ndv) {} +}; + +ScalarHistogram createHistogram(const std::vector<BucketData>& data) { + sbe::value::Array array; + for (const auto& item : data) { + const auto [tag, val] = stage_builder::makeValue(item._v); + array.push_back(tag, val); + } + + value::Array bounds; + std::vector<Bucket> buckets; + + double cumulativeFreq = 0.0; + double 
cumulativeNDV = 0.0; + + for (size_t i = 0; i < data.size(); i++) { + const auto [tag, val] = array.getAt(i); + bounds.push_back(tag, val); + + const auto& item = data.at(i); + cumulativeFreq += item._equalFreq + item._rangeFreq; + cumulativeNDV += item._ndv + 1.0; + buckets.emplace_back( + item._equalFreq, item._rangeFreq, cumulativeFreq, item._ndv, cumulativeNDV); + } + + return {std::move(bounds), std::move(buckets)}; +} + +TEST(EstimatorTest, ManualHistogram) { + std::vector<BucketData> data{{0, 1.0, 1.0, 1.0}, + {10, 1.0, 10.0, 5.0}, + {20, 3.0, 15.0, 3.0}, + {30, 1.0, 10.0, 4.0}, + {40, 2.0, 0.0, 0.0}, + {50, 1.0, 10.0, 5.0}}; + const ScalarHistogram hist = createHistogram(data); + + ASSERT_EQ(55.0, getTotals(hist).card); + + ASSERT_EQ(1.0, estimateIntValCard(hist, 0, EstimationType::kEqual)); + ASSERT_EQ(2.0, estimateIntValCard(hist, 5, EstimationType::kEqual)); + ASSERT_EQ(0.0, estimateIntValCard(hist, 35, EstimationType::kEqual)); + + ASSERT_EQ(15.5, estimateIntValCard(hist, 15, EstimationType::kLess)); + ASSERT_EQ(20.5, estimateIntValCard(hist, 15, EstimationType::kLessOrEqual)); + ASSERT_EQ(28, estimateIntValCard(hist, 20, EstimationType::kLess)); + ASSERT_EQ(31.0, estimateIntValCard(hist, 20, EstimationType::kLessOrEqual)); + + ASSERT_EQ(42, estimateIntValCard(hist, 10, EstimationType::kGreater)); + ASSERT_EQ(43, estimateIntValCard(hist, 10, EstimationType::kGreaterOrEqual)); + ASSERT_EQ(19, estimateIntValCard(hist, 25, EstimationType::kGreater)); + ASSERT_EQ(21.5, estimateIntValCard(hist, 25, EstimationType::kGreaterOrEqual)); +} + +TEST(EstimatorTest, UniformIntEstimate) { + // This hard-codes a maxdiff histogram with 10 buckets built off a uniform int distribution with + // a minimum of 0, a maximum of 1000, and 70 distinct values. 
+ std::vector<BucketData> data{{2, 1, 0, 0}, + {57, 3, 2, 1}, + {179, 5, 10, 6}, + {317, 5, 9, 6}, + {344, 3, 0, 0}, + {558, 4, 19, 12}, + {656, 2, 4, 3}, + {798, 3, 7, 4}, + {951, 5, 17, 7}, + {986, 1, 0, 0}}; + const ScalarHistogram hist = createHistogram(data); + + // Predicates over bucket bound. + double expectedCard = estimateIntValCard(hist, 558, EstimationType::kEqual); + ASSERT_EQ(4.0, expectedCard); + expectedCard = estimateIntValCard(hist, 558, EstimationType::kLess); + ASSERT_EQ(57.0, expectedCard); + expectedCard = estimateIntValCard(hist, 558, EstimationType::kLessOrEqual); + ASSERT_EQ(61.0, expectedCard); + + // Predicates over value inside of a bucket. + + // Query: [{$match: {a: {$eq: 530}}}]. + expectedCard = estimateIntValCard(hist, 530, EstimationType::kEqual); + ASSERT_APPROX_EQUAL(1.6, expectedCard, 0.1); // Actual: 1. + + // Query: [{$match: {a: {$lt: 530}}}]. + expectedCard = estimateIntValCard(hist, 530, EstimationType::kLess); + ASSERT_APPROX_EQUAL(52.9, expectedCard, 0.1); // Actual: 50. + + // Query: [{$match: {a: {$lte: 530}}}]. + expectedCard = estimateIntValCard(hist, 530, EstimationType::kLessOrEqual); + ASSERT_APPROX_EQUAL(54.5, expectedCard, 0.1); // Actual: 51. + + // Query: [{$match: {a: {$eq: 400}}}]. + expectedCard = estimateIntValCard(hist, 400, EstimationType::kEqual); + ASSERT_APPROX_EQUAL(1.6, expectedCard, 0.1); // Actual: 1. + + // Query: [{$match: {a: {$lt: 400}}}]. + expectedCard = estimateIntValCard(hist, 400, EstimationType::kLess); + ASSERT_APPROX_EQUAL(41.3, expectedCard, 0.1); // Actual: 39. + + // Query: [{$match: {a: {$lte: 400}}}]. + expectedCard = estimateIntValCard(hist, 400, EstimationType::kLessOrEqual); + ASSERT_APPROX_EQUAL(43.0, expectedCard, 0.1); // Actual: 40. +} + +TEST(EstimatorTest, NormalIntEstimate) { + // This hard-codes a maxdiff histogram with 10 buckets built off a normal int distribution with + // a minimum of 0, a maximum of 1000, and 70 distinct values. 
+ std::vector<BucketData> data{{2, 1, 0, 0}, + {317, 8, 20, 15}, + {344, 2, 0, 0}, + {388, 3, 0, 0}, + {423, 4, 2, 2}, + {579, 4, 12, 8}, + {632, 3, 2, 1}, + {696, 3, 5, 3}, + {790, 5, 4, 2}, + {993, 1, 21, 9}}; + const ScalarHistogram hist = createHistogram(data); + + // Predicates over bucket bound. + double expectedCard = estimateIntValCard(hist, 696, EstimationType::kEqual); + ASSERT_EQ(3.0, expectedCard); + expectedCard = estimateIntValCard(hist, 696, EstimationType::kLess); + ASSERT_EQ(66.0, expectedCard); + expectedCard = estimateIntValCard(hist, 696, EstimationType::kLessOrEqual); + ASSERT_EQ(69.0, expectedCard); + + // Predicates over value inside of a bucket. + + // Query: [{$match: {a: {$eq: 150}}}]. + expectedCard = estimateIntValCard(hist, 150, EstimationType::kEqual); + ASSERT_APPROX_EQUAL(1.3, expectedCard, 0.1); // Actual: 1. + + // Query: [{$match: {a: {$lt: 150}}}]. + expectedCard = estimateIntValCard(hist, 150, EstimationType::kLess); + ASSERT_APPROX_EQUAL(9.1, expectedCard, 0.1); // Actual: 9. + + // Query: [{$match: {a: {$lte: 150}}}]. + expectedCard = estimateIntValCard(hist, 150, EstimationType::kLessOrEqual); + ASSERT_APPROX_EQUAL(10.4, expectedCard, 0.1); // Actual: 10. +} + +TEST(EstimatorTest, UniformStrEstimate) { + // This hard-codes a maxdiff histogram with 10 buckets built off a uniform string distribution + // with a minimum length of 3, a maximum length of 5, and 80 distinct values. + std::vector<BucketData> data{{{"0ejz", 2, 0, 0}, + {"8DCaq", 3, 4, 4}, + {"Cy5Kw", 3, 3, 3}, + {"WXX7w", 3, 31, 20}, + {"YtzS", 2, 0, 0}, + {"fuK", 5, 13, 7}, + {"gLkp", 3, 0, 0}, + {"ixmVx", 2, 6, 2}, + {"qou", 1, 9, 6}, + {"z2b", 1, 9, 6}}}; + const ScalarHistogram hist = createHistogram(data); + + // Predicates over value inside of a bucket. + const auto [tag, value] = value::makeNewString("TTV"_sd); + value::ValueGuard vg(tag, value); + + // Query: [{$match: {a: {$eq: 'TTV'}}}]. 
+ double expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; + ASSERT_APPROX_EQUAL(1.55, expectedCard, 0.1); // Actual: 2. + + // Query: [{$match: {a: {$lt: 'TTV'}}}]. + expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; + ASSERT_APPROX_EQUAL(39.8, expectedCard, 0.1); // Actual: 39. + + // Query: [{$match: {a: {$lte: 'TTV'}}}]. + expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card; + ASSERT_APPROX_EQUAL(41.3, expectedCard, 0.1); // Actual: 41. +} + +TEST(EstimatorTest, NormalStrEstimate) { + // This hard-codes a maxdiff histogram with 10 buckets built off a normal string distribution + // with a minimum length of 3, a maximum length of 5, and 80 distinct values. + std::vector<BucketData> data{{ + {"0ejz", 1, 0, 0}, + {"4FGjc", 3, 5, 3}, + {"9bU3", 2, 3, 2}, + {"Cy5Kw", 3, 3, 3}, + {"Lm4U", 2, 11, 5}, + {"TTV", 5, 14, 8}, + {"YtzS", 2, 3, 2}, + {"o9cD4", 6, 26, 16}, + {"qfmnP", 1, 4, 2}, + {"xqbi", 2, 4, 4}, + }}; + const ScalarHistogram hist = createHistogram(data); + + // Predicates over bucket bound. + auto [tag, value] = value::makeNewString("TTV"_sd); + value::ValueGuard vg(tag, value); + + // Query: [{$match: {a: {$eq: 'TTV'}}}]. + double expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; + ASSERT_APPROX_EQUAL(5.0, expectedCard, 0.1); // Actual: 5. + + // Query: [{$match: {a: {$lt: 'TTV'}}}]. + expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; + ASSERT_APPROX_EQUAL(47.0, expectedCard, 0.1); // Actual: 47. + + // Query: [{$match: {a: {$lte: 'TTV'}}}]. + expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card; + ASSERT_APPROX_EQUAL(52.0, expectedCard, 0.1); // Actual: 52. + + // Predicates over value inside of a bucket. + std::tie(tag, value) = value::makeNewString("Pfa"_sd); + + // Query: [{$match: {a: {$eq: 'Pfa'}}}]. 
+ expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; + ASSERT_APPROX_EQUAL(1.75, expectedCard, 0.1); // Actual: 2. + + // Query: [{$match: {a: {$lt: 'Pfa'}}}]. + expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; + ASSERT_APPROX_EQUAL(38.3, expectedCard, 0.1); // Actual: 35. + + // Query: [{$match: {a: {$lte: 'Pfa'}}}]. + expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card; + ASSERT_APPROX_EQUAL(40.0, expectedCard, 0.1); // Actual: 37. +} + +TEST(EstimatorTest, UniformIntStrEstimate) { + // This hard-codes a maxdiff histogram with 20 buckets built off of a uniform distribution with + // two types occurring with equal probability: + // - 100 distinct ints between 0 and 1000, and + // - 100 distinct strings of length between 2 and 5. + std::vector<BucketData> data{{ + {2, 3, 0, 0}, {19, 4, 1, 1}, {226, 2, 49, 20}, {301, 5, 12, 4}, + {317, 3, 0, 0}, {344, 2, 3, 1}, {423, 5, 18, 6}, {445, 3, 0, 0}, + {495, 3, 4, 2}, {542, 5, 9, 3}, {696, 3, 44, 19}, {773, 4, 11, 5}, + {805, 2, 8, 4}, {931, 5, 21, 8}, {998, 4, 21, 3}, {"8N4", 5, 31, 14}, + {"MIb", 5, 45, 17}, {"Zgi", 3, 55, 22}, {"pZ", 6, 62, 25}, {"yUwxz", 5, 29, 12}, + }}; + const ScalarHistogram hist = createHistogram(data); + const ArrayHistogram arrHist( + hist, TypeCounts{{value::TypeTags::NumberInt64, 254}, {value::TypeTags::StringSmall, 246}}); + + // Predicates over value inside of the last numeric bucket. + + // Query: [{$match: {a: {$eq: 993}}}]. + double expectedCard = estimateIntValCard(hist, 993, EstimationType::kEqual); + ASSERT_APPROX_EQUAL(7.0, expectedCard, 0.1); // Actual: 9. + + // Query: [{$match: {a: {$lt: 993}}}]. + expectedCard = estimateIntValCard(hist, 993, EstimationType::kLess); + ASSERT_APPROX_EQUAL(241.4, expectedCard, 0.1); // Actual: 241. + + // Query: [{$match: {a: {$lte: 993}}}]. + expectedCard = estimateIntValCard(hist, 993, EstimationType::kLessOrEqual); + ASSERT_APPROX_EQUAL(248.4, expectedCard, 0.1); // Actual: 250. 
+ + // Predicates over value inside of the first string bucket. + auto [tag, value] = value::makeNewString("04e"_sd); + value::ValueGuard vg(tag, value); + + // Query: [{$match: {a: {$eq: '04e'}}}]. + expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; + ASSERT_APPROX_EQUAL(2.2, expectedCard, 0.1); // Actual: 3. + + value::TypeTags lowTag = value::TypeTags::NumberInt64; + value::Value lowVal = 100000000; + + // Type bracketing: low value of different type than the bucket bound. + // Query: [{$match: {a: {$eq: 100000000}}}]. + expectedCard = estimateCardEq(arrHist, lowTag, lowVal); + ASSERT_APPROX_EQUAL(0.0, expectedCard, 0.1); // Actual: 0. + + // No interpolation for inequality to values inside the first string bucket, fallback to half of + // the bucket frequency. + + // Query: [{$match: {a: {$lt: '04e'}}}]. + expectedCard = estimateCardRange(arrHist, true, false, lowTag, lowVal, false, tag, value); + ASSERT_APPROX_EQUAL(13.3, expectedCard, 0.1); // Actual: 0. + + // Query: [{$match: {a: {$lte: '04e'}}}]. + expectedCard = estimateCardRange( + arrHist, true, false, lowTag, lowVal, true /* highInclusive */, tag, value); + ASSERT_APPROX_EQUAL(15.5, expectedCard, 0.1); // Actual: 3. + + // Value towards the end of the bucket gets the same half bucket estimate. + std::tie(tag, value) = value::makeNewString("8B5"_sd); + + // Query: [{$match: {a: {$lt: '8B5'}}}]. + expectedCard = estimateCardRange(arrHist, true, false, lowTag, lowVal, false, tag, value); + ASSERT_APPROX_EQUAL(13.3, expectedCard, 0.1); // Actual: 24. + + // Query: [{$match: {a: {$lte: '8B5'}}}]. + expectedCard = estimateCardRange( + arrHist, true, false, lowTag, lowVal, true /* highInclusive */, tag, value); + ASSERT_APPROX_EQUAL(15.5, expectedCard, 0.1); // Actual: 29. 
+} + +TEST(EstimatorTest, UniformIntArrayOnlyEstimate) { + // This hard-codes a maxdiff histogram with 10 buckets built off of an array distribution with + // arrays between 3 and 5 elements long, each containing 100 distinct ints uniformly distributed + // between 0 and 1000. There are no scalar elements. + std::vector<BucketData> scalarData{{}}; + const ScalarHistogram scalarHist = createHistogram(scalarData); + + std::vector<BucketData> minData{{ + {5, 3, 0, 0}, {19, 5, 2, 1}, {57, 4, 4, 3}, {116, 7, 13, 7}, {198, 3, 15, 6}, + {228, 2, 3, 2}, {254, 4, 0, 0}, {280, 2, 2, 1}, {335, 3, 5, 3}, {344, 2, 0, 0}, + {388, 3, 0, 0}, {420, 2, 0, 0}, {454, 1, 6, 3}, {488, 2, 1, 1}, {530, 1, 0, 0}, + {561, 1, 0, 0}, {609, 1, 0, 0}, {685, 1, 0, 0}, {713, 1, 0, 0}, {758, 1, 0, 0}, + }}; + const ScalarHistogram minHist = createHistogram(minData); + + std::vector<BucketData> maxData{{ + {301, 1, 0, 0}, {408, 2, 0, 0}, {445, 1, 0, 0}, {605, 2, 0, 0}, {620, 1, 0, 0}, + {665, 1, 1, 1}, {687, 3, 0, 0}, {704, 2, 6, 2}, {718, 2, 2, 1}, {741, 2, 1, 1}, + {752, 2, 0, 0}, {823, 7, 3, 3}, {827, 1, 0, 0}, {852, 3, 0, 0}, {864, 5, 0, 0}, + {909, 7, 12, 5}, {931, 2, 3, 1}, {939, 3, 0, 0}, {970, 2, 12, 4}, {998, 1, 10, 4}, + }}; + const ScalarHistogram maxHist = createHistogram(maxData); + + std::vector<BucketData> uniqueData{{ + {5, 3, 0, 0}, {19, 6, 2, 1}, {57, 4, 4, 3}, {116, 7, 15, 8}, {228, 2, 38, 13}, + {254, 7, 0, 0}, {269, 10, 0, 0}, {280, 7, 3, 1}, {306, 4, 1, 1}, {317, 4, 0, 0}, + {344, 2, 19, 5}, {423, 2, 27, 8}, {507, 2, 22, 13}, {704, 8, 72, 34}, {718, 6, 3, 1}, + {758, 3, 13, 4}, {864, 7, 35, 14}, {883, 4, 0, 0}, {939, 5, 32, 10}, {998, 1, 24, 9}, + }}; + const ScalarHistogram uniqueHist = createHistogram(uniqueData); + + const ArrayHistogram arrHist( + scalarHist, TypeCounts{}, uniqueHist, minHist, maxHist, TypeCounts{}); + + // Query in the middle of the domain: estimate from ArrayUnique histogram. 
+ value::TypeTags lowTag = value::TypeTags::NumberInt64; + value::Value lowVal = 500; + value::TypeTags highTag = value::TypeTags::NumberInt64; + value::Value highVal = 600; + + // Test interpolation for query: [{$match: {a: {$gt: 500, $lt: 600}}}]. + double expectedCard = + estimateCardRange(arrHist, false, false, lowTag, lowVal, false, highTag, highVal); + ASSERT_APPROX_EQUAL(8.63, expectedCard, 0.1); + + // Test interpolation for query: [{$match: {a: {$elemMatch: {$gt: 500, $lt: 600}}}}]. + // Note: this should be the same as above, since we have no scalars. + expectedCard = estimateCardRange(arrHist, true, false, lowTag, lowVal, false, highTag, highVal); + ASSERT_APPROX_EQUAL(8.63, expectedCard, 0.1); + + // Query at the end of the domain: more precise estimates from ArrayMin, ArrayMax histograms. + lowVal = 10; + highVal = 110; + + // Test interpolation for query: [{$match: {a: {$gt: 10, $lt: 110}}}]. + expectedCard = + estimateCardRange(arrHist, false, false, lowTag, lowVal, false, highTag, highVal); + ASSERT_APPROX_EQUAL(24.1, expectedCard, 0.1); + + // Test interpolation for query: [{$match: {a: {$elemMatch: {$gt: 500, $lt: 600}}}}]. + // Note: this should be the same as above, since we have no scalars. + expectedCard = estimateCardRange(arrHist, true, false, lowTag, lowVal, false, highTag, highVal); + ASSERT_APPROX_EQUAL(24.1, expectedCard, 0.1); +} + +TEST(EstimatorTest, UniformIntMixedArrayEstimate) { + // This hard-codes a maxdiff histogram with 20 buckets built off of a mixed distribution split + // with equal probability between: + // - an array distribution between 3 and 5 elements long, each containing 80 distinct ints + // uniformly distributed between 0 and 1000, and + // - a uniform int distribution with 80 distinct ints between 0 and 1000. 
+ std::vector<BucketData> scalarData{{ + {25, 1, 0, 0}, {41, 2, 0, 0}, {142, 2, 3, 3}, {209, 3, 3, 1}, {243, 1, 2, 1}, + {296, 3, 4, 3}, {321, 5, 4, 2}, {480, 3, 9, 8}, {513, 3, 3, 2}, {554, 1, 0, 0}, + {637, 3, 3, 2}, {666, 2, 1, 1}, {697, 2, 2, 1}, {750, 3, 3, 2}, {768, 4, 0, 0}, + {791, 4, 3, 3}, {851, 2, 2, 2}, {927, 2, 10, 6}, {958, 3, 2, 1}, {980, 3, 0, 0}, + }}; + const ScalarHistogram scalarHist = createHistogram(scalarData); + + std::vector<BucketData> minData{{ + {3, 3, 0, 0}, {5, 8, 0, 0}, {9, 3, 0, 0}, {19, 2, 0, 0}, {49, 7, 4, 2}, + {69, 6, 0, 0}, {115, 3, 5, 3}, {125, 2, 0, 0}, {146, 1, 2, 1}, {198, 2, 4, 3}, + {214, 2, 0, 0}, {228, 3, 0, 0}, {260, 3, 4, 1}, {280, 1, 2, 2}, {330, 2, 2, 1}, + {344, 6, 0, 0}, {388, 2, 0, 0}, {420, 2, 0, 0}, {461, 2, 8, 4}, {696, 1, 2, 1}, + }}; + const ScalarHistogram minHist = createHistogram(minData); + + std::vector<BucketData> maxData{{ + {301, 1, 0, 0}, {445, 1, 0, 0}, {491, 1, 0, 0}, {533, 3, 0, 0}, {605, 3, 0, 0}, + {620, 2, 0, 0}, {647, 3, 0, 0}, {665, 4, 0, 0}, {713, 3, 10, 4}, {741, 3, 0, 0}, + {814, 3, 2, 2}, {839, 2, 1, 1}, {864, 1, 2, 2}, {883, 3, 0, 0}, {893, 7, 0, 0}, + {898, 5, 0, 0}, {909, 1, 12, 3}, {931, 2, 2, 1}, {953, 6, 3, 2}, {993, 1, 7, 5}, + }}; + const ScalarHistogram maxHist = createHistogram(maxData); + + std::vector<BucketData> uniqueData{{ + {3, 3, 0, 0}, {19, 5, 11, 2}, {49, 7, 5, 3}, {69, 8, 0, 0}, {75, 3, 0, 0}, + {125, 2, 10, 5}, {228, 3, 27, 14}, {260, 4, 5, 1}, {344, 6, 36, 13}, {423, 4, 20, 8}, + {605, 4, 61, 28}, {665, 8, 12, 6}, {758, 4, 41, 16}, {768, 5, 0, 0}, {776, 3, 0, 0}, + {864, 3, 15, 10}, {883, 8, 0, 0}, {911, 2, 28, 6}, {953, 6, 8, 4}, {993, 1, 7, 5}, + }}; + const ScalarHistogram uniqueHist = createHistogram(uniqueData); + + const ArrayHistogram arrHist( + scalarHist, TypeCounts{}, uniqueHist, minHist, maxHist, TypeCounts{}); + + value::TypeTags lowTag = value::TypeTags::NumberInt64; + value::Value lowVal = 500; + value::TypeTags highTag = value::TypeTags::NumberInt64; + 
value::Value highVal = 550; + + // Test interpolation for query: [{$match: {a: {$gt: 500, $lt: 550}}}]. + double expectedCard = + estimateCardRange(arrHist, true, false, lowTag, lowVal, false, highTag, highVal); + ASSERT_APPROX_EQUAL(9.8, expectedCard, 0.1); // Actual: 94. + + // Test interpolation for query: [{$match: {a: {$elemMatch: {$gt: 500, $lt: 550}}}}]. + expectedCard = + estimateCardRange(arrHist, false, false, lowTag, lowVal, false, highTag, highVal); + ASSERT_APPROX_EQUAL(5.6, expectedCard, 0.1); // Actual: 8. +} + +} // namespace +} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/ce_test_utils.cpp b/src/mongo/db/query/ce/ce_test_utils.cpp index 7beedf1adfa..05ca92c2be6 100644 --- a/src/mongo/db/query/ce/ce_test_utils.cpp +++ b/src/mongo/db/query/ce/ce_test_utils.cpp @@ -47,7 +47,7 @@ using namespace cascades; CETester::CETester(std::string collName, double collCard, const optimizer::OptPhaseManager::PhaseSet& optPhases) - : _collName(std::move(collName)), _collCard(collCard), _optPhases(optPhases) {} + : _collName(std::move(collName)), _collCard(collCard), _optPhases(optPhases), _indexes() {} optimizer::CEType CETester::getCE(const std::string& query) const { if constexpr (kCETestLogOnly) { @@ -90,8 +90,12 @@ optimizer::CEType CETester::getCE(ABT& abt) const { const auto& group = memo.getGroup(i); // If the 'optPhases' either ends with the MemoSubstitutionPhase or the - // MemoImplementationPhase, we should have exactly one logical node per group. - ASSERT_EQUALS(group._logicalNodes.size(), 1); + // MemoImplementationPhase, we should have exactly one logical node per group. However, if + // we have indexes, we may have multiple logical nodes as a result of interval + // simplification. In this case, we still want to pick the first Sargable node. + if (_indexes.empty()) { + ASSERT_EQUALS(group._logicalNodes.size(), 1); + } const auto& node = group._logicalNodes.at(0); // This gets the cardinality estimate actually produced during optimization. 
@@ -123,7 +127,7 @@ optimizer::CEType CETester::getCE(ABT& abt) const { } optimizer::OptPhaseManager CETester::getPhaseManager() const { - ScanDefinition sd({}, {}, {DistributionType::Centralized}, true, _collCard); + ScanDefinition sd({}, _indexes, {DistributionType::Centralized}, true, _collCard); Metadata metadata({{_collName, sd}}); return {_optPhases, _prefixId, diff --git a/src/mongo/db/query/ce/ce_test_utils.h b/src/mongo/db/query/ce/ce_test_utils.h index ae2af837e8e..a46af3b5624 100644 --- a/src/mongo/db/query/ce/ce_test_utils.h +++ b/src/mongo/db/query/ce/ce_test_utils.h @@ -108,6 +108,10 @@ public: _collCard = card; } + void setIndexes(opt::unordered_map<std::string, IndexDefinition>&& indexes) { + _indexes = std::move(indexes); + } + protected: /** * Subclasses need to override this method to initialize the transports they are testing. @@ -119,12 +123,11 @@ private: OptPhaseManager getPhaseManager() const; std::string _collName; - // The number of records in the collection we are testing. double _collCard; - // Phases to use when optimizing an input query. 
const OptPhaseManager::PhaseSet& _optPhases; + opt::unordered_map<std::string, IndexDefinition> _indexes; mutable PrefixId _prefixId; }; diff --git a/src/mongo/db/query/ce/histogram_estimation.cpp b/src/mongo/db/query/ce/histogram_estimation.cpp index c98e086e39c..948729ee1b4 100644 --- a/src/mongo/db/query/ce/histogram_estimation.cpp +++ b/src/mongo/db/query/ce/histogram_estimation.cpp @@ -34,6 +34,36 @@ namespace mongo::ce { using namespace sbe; namespace { + +bool sameTypeBracket(value::TypeTags tag1, value::TypeTags tag2) { + if (tag1 == tag2) { + return true; + } + return ((value::isNumber(tag1) && value::isNumber(tag2)) || + (value::isString(tag1) && value::isString(tag2))); +} + +double valueToDouble(value::TypeTags tag, value::Value val) { + double result = 0; + if (value::isNumber(tag)) { + result = value::numericCast<double>(tag, val); + } else if (value::isString(tag)) { + const StringData sd = value::getStringView(tag, val); + + // Convert a prefix of the string to a double. + const size_t maxPrecision = std::min(sd.size(), sizeof(double)); + for (size_t i = 0; i < maxPrecision; ++i) { + const char ch = sd[i]; + const double charToDbl = ch / std::pow(2, i * 8); + result += charToDbl; + } + } else { + uassert(6844500, "Unexpected value type", false); + } + + return result; +} + int32_t compareValues3w(value::TypeTags tag1, value::Value val1, value::TypeTags tag2, @@ -53,6 +83,70 @@ EstimationResult getTotals(const ScalarHistogram& h) { return {last._cumulativeFreq, last._cumulativeNDV}; } +/** + * Helper function that uses linear interpolation to estimate the cardinality and NDV for a value + * that falls inside of a histogram bucket. 
+ */ +EstimationResult interpolateEstimateInBucket(const ScalarHistogram& h, + value::TypeTags tag, + value::Value val, + EstimationType type, + size_t bucketIndex) { + + const Bucket& bucket = h.getBuckets().at(bucketIndex); + const auto [boundTag, boundVal] = h.getBounds().getAt(bucketIndex); + + double resultCard = bucket._cumulativeFreq - bucket._equalFreq - bucket._rangeFreq; + double resultNDV = bucket._cumulativeNDV - bucket._ndv - 1.0; + + // Check if the estimate is at the point of type brackets switch. If the current bucket is the + // first bucket of a new type bracket and the value is of another type, estimate cardinality + // from the current bucket as 0. + // + // For example, let bound 1 = 1000, bound 2 = "abc". The value 100000000 falls in bucket 2, the + // first bucket for strings, but should not get cardinality/NDV fraction from it. + if (!sameTypeBracket(tag, boundTag)) { + if (type == EstimationType::kEqual) { + return {0.0, 0.0}; + } else { + return {resultCard, resultNDV}; + } + } + + // Estimate for equality frequency inside of the bucket. + const double innerEqFreq = (bucket._ndv == 0.0) ? 0.0 : bucket._rangeFreq / bucket._ndv; + + if (type == EstimationType::kEqual) { + return {innerEqFreq, 1.0}; + } + + // For $lt and $lte operations use linear interpolation to take a fraction of the bucket + // cardinality and NDV if there is a preceding bucket with bound of the same type. Use half of + // the bucket estimates otherwise. 
+ double ratio = 0.5; + if (bucketIndex > 0) { + const auto [lowBoundTag, lowBoundVal] = h.getBounds().getAt(bucketIndex - 1); + if (sameTypeBracket(lowBoundTag, boundTag)) { + double doubleLowBound = valueToDouble(lowBoundTag, lowBoundVal); + double doubleUpperBound = valueToDouble(boundTag, boundVal); + double doubleVal = valueToDouble(tag, val); + ratio = (doubleVal - doubleLowBound) / (doubleUpperBound - doubleLowBound); + } + } + + resultCard += bucket._rangeFreq * ratio; + resultNDV += bucket._ndv * ratio; + + if (type == EstimationType::kLess) { + // Subtract from the estimate the cardinality and ndv corresponding to the equality + // operation. + const double innerEqNdv = (bucket._ndv * ratio <= 1.0) ? 0.0 : 1.0; + resultCard -= innerEqFreq; + resultNDV -= innerEqNdv; + } + return {resultCard, resultNDV}; +} + EstimationResult estimate(const ScalarHistogram& h, value::TypeTags tag, value::Value val, @@ -103,47 +197,32 @@ EstimationResult estimate(const ScalarHistogram& h, const auto [boundTag, boundVal] = h.getBounds().getAt(bucketIndex); const bool isEndpoint = compareValues3w(boundTag, boundVal, tag, val) == 0; - switch (type) { - case EstimationType::kEqual: { - if (isEndpoint) { + if (isEndpoint) { + switch (type) { + case EstimationType::kEqual: { return {bucket._equalFreq, 1.0}; } - return {(bucket._ndv == 0.0) ? 0.0 : bucket._rangeFreq / bucket._ndv, 1.0}; - } - case EstimationType::kLess: { - double resultCard = bucket._cumulativeFreq - bucket._equalFreq; - double resultNDV = bucket._cumulativeNDV - 1.0; - - if (!isEndpoint) { - // TODO: consider value interpolation instead of assigning 50% of the weight. 
- resultCard -= bucket._rangeFreq / 2.0; - resultNDV -= bucket._ndv / 2.0; + case EstimationType::kLess: { + double resultCard = bucket._cumulativeFreq - bucket._equalFreq; + double resultNDV = bucket._cumulativeNDV - 1.0; + return {resultCard, resultNDV}; } - return {resultCard, resultNDV}; - } - case EstimationType::kLessOrEqual: { - double resultCard = bucket._cumulativeFreq; - double resultNDV = bucket._cumulativeNDV; - - if (!isEndpoint) { - // TODO: consider value interpolation instead of assigning 50% of the weight. - resultCard -= bucket._equalFreq + bucket._rangeFreq / 2.0; - resultNDV -= 1.0 + bucket._ndv / 2.0; + case EstimationType::kLessOrEqual: { + double resultCard = bucket._cumulativeFreq; + double resultNDV = bucket._cumulativeNDV; + return {resultCard, resultNDV}; } - return {resultCard, resultNDV}; - } - default: - MONGO_UNREACHABLE; + default: + MONGO_UNREACHABLE; + } + } else { + return interpolateEstimateInBucket(h, tag, val, type, bucketIndex); } } -/** - * Estimates the cardinality of an equality predicate given an ArrayHistogram and an SBE value and - * type tag pair. - */ double estimateCardEq(const ArrayHistogram& ah, value::TypeTags tag, value::Value val) { if (tag != value::TypeTags::Null) { double card = estimate(ah.getScalar(), tag, val, EstimationType::kEqual).card; @@ -187,11 +266,6 @@ static EstimationResult estimateRange(const ScalarHistogram& histogram, return highEstimate - lowEstimate; } -/** - * Estimates the cardinality of a range predicate given an ArrayHistogram and a range predicate. - * Set 'includeScalar' to true to indicate whether or not the provided range should include no-array - * values. The other fields define the range of the estimation. - */ double estimateCardRange(const ArrayHistogram& ah, bool includeScalar, /* Define lower bound. 
*/ diff --git a/src/mongo/db/query/ce/histogram_estimation.h b/src/mongo/db/query/ce/histogram_estimation.h index b949d0b752e..0e554993983 100644 --- a/src/mongo/db/query/ce/histogram_estimation.h +++ b/src/mongo/db/query/ce/histogram_estimation.h @@ -59,7 +59,8 @@ struct EstimationResult { EstimationResult getTotals(const ScalarHistogram& h); /** - * Estimates the cardinality of a predicate of the given type against the histogram. + * Compute an estimate for a given value and estimation type. Use linear interpolation for values + * that fall inside of histogram buckets. */ EstimationResult estimate(const ScalarHistogram& h, sbe::value::TypeTags tag, @@ -74,4 +75,26 @@ double estimateIntervalCardinality(const ArrayHistogram& estimator, const optimizer::IntervalRequirement& interval, optimizer::CEType inputCardinality); +/** + * Estimates the cardinality of an equality predicate given an ArrayHistogram and an SBE value and + * type tag pair. + */ +double estimateCardEq(const ArrayHistogram& ah, sbe::value::TypeTags tag, sbe::value::Value val); + +/** + * Estimates the cardinality of a range predicate given an ArrayHistogram and a range predicate. + * Set 'includeScalar' to true to indicate whether or not the provided range should include no-array + * values. The other fields define the range of the estimation. + */ +double estimateCardRange(const ArrayHistogram& ah, + bool includeScalar, + /* Define lower bound. */ + bool lowInclusive, + sbe::value::TypeTags tagLow, + sbe::value::Value valLow, + /* Define upper bound. */ + bool highInclusive, + sbe::value::TypeTags tagHigh, + sbe::value::Value valHigh); + } // namespace mongo::ce |