summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMilena Ivanova <milena.ivanova@mongodb.com>2022-09-30 12:08:38 +0000
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2022-09-30 13:04:57 +0000
commit1c4fafd4ae5c082f36a8af1442aa48174962b1b4 (patch)
tree5f06af8c2227d5505daada02ea8746a09e6a89ed
parent5b1663a2e449ec0f563e0f9a7f80c0fddb709596 (diff)
downloadmongo-6/0.tar.gz
SERVER-68446 Unit testing of simple histogram CE with all data types6/0
-rw-r--r--src/mongo/db/query/ce/SConscript10
-rw-r--r--src/mongo/db/query/ce/ce_edge_cases_test.cpp508
-rw-r--r--src/mongo/db/query/ce/ce_histogram_test.cpp10
-rw-r--r--src/mongo/db/query/ce/ce_interpolation_test.cpp7
-rw-r--r--src/mongo/db/query/ce/ce_test_utils.cpp17
-rw-r--r--src/mongo/db/query/ce/ce_test_utils.h20
-rw-r--r--src/mongo/db/query/ce/histogram_estimation.cpp8
-rw-r--r--src/mongo/db/query/ce/maxdiff_histogram_test.cpp5
-rw-r--r--src/mongo/db/query/ce/value_utils.cpp12
9 files changed, 564 insertions, 33 deletions
diff --git a/src/mongo/db/query/ce/SConscript b/src/mongo/db/query/ce/SConscript
index 99b258b4e7f..37fdc253507 100644
--- a/src/mongo/db/query/ce/SConscript
+++ b/src/mongo/db/query/ce/SConscript
@@ -118,6 +118,16 @@ env.CppUnitTest(
)
env.CppUnitTest(
+ target="ce_edge_cases_test",
+ source=[
+ "ce_edge_cases_test.cpp",
+ ],
+ LIBDEPS=[
+ 'ce_test_utils',
+ ],
+)
+
+env.CppUnitTest(
target='stats_cache_loader_test',
source=[
'stats_cache_loader_test.cpp',
diff --git a/src/mongo/db/query/ce/ce_edge_cases_test.cpp b/src/mongo/db/query/ce/ce_edge_cases_test.cpp
new file mode 100644
index 00000000000..14cf86e17de
--- /dev/null
+++ b/src/mongo/db/query/ce/ce_edge_cases_test.cpp
@@ -0,0 +1,508 @@
+/**
+ * Copyright (C) 2022-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/db/query/ce/array_histogram.h"
+#include "mongo/db/query/ce/ce_test_utils.h"
+#include "mongo/db/query/ce/histogram_estimation.h"
+#include "mongo/db/query/sbe_stage_builder_helpers.h"
+#include "mongo/unittest/unittest.h"
+
+namespace mongo::ce {
+namespace {
+
+using namespace sbe;
+
+TEST(EstimatorTest, OneBucketIntHistogram) {
+ // Data set of 10 values, each with frequency 3, in the range (-inf, 100].
+ // Example: { -100, -20, 0, 20, 50, 60, 70, 80, 90, 100}.
+ std::vector<BucketData> data{{100, 3.0, 27.0, 9.0}};
+ const ScalarHistogram hist = createHistogram(data);
+
+ ASSERT_EQ(30.0, getTotals(hist).card);
+
+ // Estimates with the bucket bound.
+ ASSERT_EQ(3.0, estimateIntValCard(hist, 100, EstimationType::kEqual));
+ ASSERT_EQ(27.0, estimateIntValCard(hist, 100, EstimationType::kLess));
+ ASSERT_EQ(30.0, estimateIntValCard(hist, 100, EstimationType::kLessOrEqual));
+ ASSERT_EQ(0.0, estimateIntValCard(hist, 100, EstimationType::kGreater));
+ ASSERT_EQ(3.0, estimateIntValCard(hist, 100, EstimationType::kGreaterOrEqual));
+
+ // Estimates with a value inside the bucket.
+ ASSERT_EQ(3.0, estimateIntValCard(hist, 10, EstimationType::kEqual));
+ // No interpolation possible for estimates of inequalities in a single bucket. The estimates
+ // are based on the default cardinality of half bucket +/- the estimate of equality inside of
+ // the bucket.
+ ASSERT_EQ(10.5, estimateIntValCard(hist, 10, EstimationType::kLess));
+ ASSERT_EQ(13.5, estimateIntValCard(hist, 10, EstimationType::kLessOrEqual));
+ ASSERT_EQ(16.5, estimateIntValCard(hist, 10, EstimationType::kGreater));
+ ASSERT_EQ(19.5, estimateIntValCard(hist, 10, EstimationType::kGreaterOrEqual));
+
+ // Estimates for a value larger than the last bucket bound.
+ ASSERT_EQ(0.0, estimateIntValCard(hist, 1000, EstimationType::kEqual));
+ ASSERT_EQ(30.0, estimateIntValCard(hist, 1000, EstimationType::kLess));
+ ASSERT_EQ(30.0, estimateIntValCard(hist, 1000, EstimationType::kLessOrEqual));
+ ASSERT_EQ(0.0, estimateIntValCard(hist, 1000, EstimationType::kGreater));
+ ASSERT_EQ(0.0, estimateIntValCard(hist, 1000, EstimationType::kGreaterOrEqual));
+}
+
+TEST(EstimatorTest, OneExclusiveBucketIntHistogram) {
+ // Data set of a single value.
+ // By exclusive bucket we mean a bucket with only boundary, that is the range frequency and NDV
+ // are zero.
+ std::vector<BucketData> data{{100, 2.0, 0.0, 0.0}};
+ const ScalarHistogram hist = createHistogram(data);
+
+ ASSERT_EQ(2.0, getTotals(hist).card);
+
+ // Estimates with the bucket boundary.
+ ASSERT_EQ(2.0, estimateIntValCard(hist, 100, EstimationType::kEqual));
+ ASSERT_EQ(0.0, estimateIntValCard(hist, 100, EstimationType::kLess));
+ ASSERT_EQ(0.0, estimateIntValCard(hist, 100, EstimationType::kGreater));
+
+ ASSERT_EQ(0.0, estimateIntValCard(hist, 0, EstimationType::kEqual));
+ ASSERT_EQ(0.0, estimateIntValCard(hist, 0, EstimationType::kLess));
+ ASSERT_EQ(2.0, estimateIntValCard(hist, 0, EstimationType::kGreater));
+
+ ASSERT_EQ(0.0, estimateIntValCard(hist, 1000, EstimationType::kEqual));
+ ASSERT_EQ(2.0, estimateIntValCard(hist, 1000, EstimationType::kLess));
+ ASSERT_EQ(0.0, estimateIntValCard(hist, 1000, EstimationType::kGreater));
+}
+
+TEST(EstimatorTest, OneBucketTwoIntValuesHistogram) {
+ // Data set of two values, example {5, 100, 100}.
+ std::vector<BucketData> data{{100, 2.0, 1.0, 1.0}};
+ const ScalarHistogram hist = createHistogram(data);
+
+ ASSERT_EQ(3.0, getTotals(hist).card);
+
+ // Estimates with the bucket boundary.
+ ASSERT_EQ(2.0, estimateIntValCard(hist, 100, EstimationType::kEqual));
+ ASSERT_EQ(1.0, estimateIntValCard(hist, 100, EstimationType::kLess));
+ ASSERT_EQ(0.0, estimateIntValCard(hist, 100, EstimationType::kGreater));
+
+ ASSERT_EQ(1.0, estimateIntValCard(hist, 10, EstimationType::kEqual));
+ // Default estimate of half of the bucket's range frequency = 0.5.
+ ASSERT_EQ(0.5, estimateIntValCard(hist, 10, EstimationType::kLess));
+ ASSERT_EQ(2.5, estimateIntValCard(hist, 10, EstimationType::kGreater));
+
+ ASSERT_EQ(0.0, estimateIntValCard(hist, 1000, EstimationType::kEqual));
+ ASSERT_EQ(3.0, estimateIntValCard(hist, 1000, EstimationType::kLess));
+ ASSERT_EQ(0.0, estimateIntValCard(hist, 1000, EstimationType::kGreater));
+}
+
+TEST(EstimatorTest, OneBucketTwoIntValuesHistogram2) {
+ // Similar to the above test with higher frequency for the second value.
+ // Example {5, 5, 5, 100, 100}.
+ std::vector<BucketData> data{{100, 2.0, 3.0, 1.0}};
+ const ScalarHistogram hist = createHistogram(data);
+
+ ASSERT_EQ(5.0, getTotals(hist).card);
+
+ // Estimates with the bucket boundary.
+ ASSERT_EQ(2.0, estimateIntValCard(hist, 100, EstimationType::kEqual));
+ ASSERT_EQ(3.0, estimateIntValCard(hist, 100, EstimationType::kLess));
+ ASSERT_EQ(0.0, estimateIntValCard(hist, 100, EstimationType::kGreater));
+
+ ASSERT_EQ(3.0, estimateIntValCard(hist, 10, EstimationType::kEqual));
+ // Default estimate of half of the bucket's range frequency = 1.5.
+ ASSERT_EQ(1.5, estimateIntValCard(hist, 10, EstimationType::kLess));
+ ASSERT_EQ(3.5, estimateIntValCard(hist, 10, EstimationType::kGreater));
+
+ ASSERT_EQ(0.0, estimateIntValCard(hist, 1000, EstimationType::kEqual));
+ ASSERT_EQ(5.0, estimateIntValCard(hist, 1000, EstimationType::kLess));
+ ASSERT_EQ(0.0, estimateIntValCard(hist, 1000, EstimationType::kGreater));
+}
+
+
+TEST(EstimatorTest, TwoBucketsIntHistogram) {
+ // Data set of 10 values in the range [1, 100].
+ std::vector<BucketData> data{{1, 1.0, 0.0, 0.0}, {100, 3.0, 26.0, 8.0}};
+ const ScalarHistogram hist = createHistogram(data);
+
+ ASSERT_EQ(30.0, getTotals(hist).card);
+
+ // Estimates for a value smaller than the first bucket.
+ ASSERT_EQ(0.0, estimateIntValCard(hist, -42, EstimationType::kEqual));
+ ASSERT_EQ(0.0, estimateIntValCard(hist, -42, EstimationType::kLess));
+ ASSERT_EQ(0.0, estimateIntValCard(hist, -42, EstimationType::kLessOrEqual));
+ ASSERT_EQ(30.0, estimateIntValCard(hist, -42, EstimationType::kGreater));
+ ASSERT_EQ(30.0, estimateIntValCard(hist, -42, EstimationType::kGreaterOrEqual));
+
+ // Estimates with bucket bounds.
+ ASSERT_EQ(1.0, estimateIntValCard(hist, 1, EstimationType::kEqual));
+ ASSERT_EQ(0.0, estimateIntValCard(hist, 1, EstimationType::kLess));
+ ASSERT_EQ(1.0, estimateIntValCard(hist, 1, EstimationType::kLessOrEqual));
+ ASSERT_EQ(29.0, estimateIntValCard(hist, 1, EstimationType::kGreater));
+ ASSERT_EQ(30.0, estimateIntValCard(hist, 1, EstimationType::kGreaterOrEqual));
+
+ ASSERT_EQ(3.0, estimateIntValCard(hist, 100, EstimationType::kEqual));
+ ASSERT_EQ(27.0, estimateIntValCard(hist, 100, EstimationType::kLess));
+ ASSERT_EQ(30.0, estimateIntValCard(hist, 100, EstimationType::kLessOrEqual));
+ ASSERT_EQ(0.0, estimateIntValCard(hist, 100, EstimationType::kGreater));
+ ASSERT_EQ(3.0, estimateIntValCard(hist, 100, EstimationType::kGreaterOrEqual));
+
+ // Estimates with a value inside the bucket. The estimates use interpolation.
+ // The bucket ratio for the value of 10 is smaller than the estimate for equality
+ // and the estimates for Less and LessOrEqual are the same.
+ ASSERT_APPROX_EQUAL(3.3, estimateIntValCard(hist, 10, EstimationType::kEqual), 0.1);
+ ASSERT_APPROX_EQUAL(3.4, estimateIntValCard(hist, 10, EstimationType::kLess), 0.1);
+ ASSERT_APPROX_EQUAL(3.4, estimateIntValCard(hist, 10, EstimationType::kLessOrEqual), 0.1);
+ ASSERT_APPROX_EQUAL(26.6, estimateIntValCard(hist, 10, EstimationType::kGreater), 0.1);
+ ASSERT_APPROX_EQUAL(26.6, estimateIntValCard(hist, 10, EstimationType::kGreaterOrEqual), 0.1);
+
+ // Different estimates for Less and LessOrEqual for the value of 50.
+ ASSERT_APPROX_EQUAL(3.3, estimateIntValCard(hist, 50, EstimationType::kEqual), 0.1);
+ ASSERT_APPROX_EQUAL(10.6, estimateIntValCard(hist, 50, EstimationType::kLess), 0.1);
+ ASSERT_APPROX_EQUAL(13.9, estimateIntValCard(hist, 50, EstimationType::kLessOrEqual), 0.1);
+ ASSERT_APPROX_EQUAL(16.1, estimateIntValCard(hist, 50, EstimationType::kGreater), 0.1);
+ ASSERT_APPROX_EQUAL(19.4, estimateIntValCard(hist, 50, EstimationType::kGreaterOrEqual), 0.1);
+}
+
+TEST(EstimatorTest, ThreeExclusiveBucketsIntHistogram) {
+ std::vector<BucketData> data{{1, 1.0, 0.0, 0.0}, {10, 8.0, 0.0, 0.0}, {100, 1.0, 0.0, 0.0}};
+ const ScalarHistogram hist = createHistogram(data);
+
+ ASSERT_EQ(10.0, getTotals(hist).card);
+
+ ASSERT_EQ(0.0, estimateIntValCard(hist, 5, EstimationType::kEqual));
+ ASSERT_EQ(1.0, estimateIntValCard(hist, 5, EstimationType::kLess));
+ ASSERT_EQ(1.0, estimateIntValCard(hist, 5, EstimationType::kLessOrEqual));
+ ASSERT_EQ(9.0, estimateIntValCard(hist, 5, EstimationType::kGreater));
+ ASSERT_EQ(9.0, estimateIntValCard(hist, 5, EstimationType::kGreaterOrEqual));
+}
+TEST(EstimatorTest, OneBucketStrHistogram) {
+ std::vector<BucketData> data{{"xyz", 3.0, 27.0, 9.0}};
+ const ScalarHistogram hist = createHistogram(data);
+
+ ASSERT_EQ(30.0, getTotals(hist).card);
+
+ // Estimates with bucket bound.
+ auto [tag, value] = value::makeNewString("xyz"_sd);
+ value::ValueGuard vg(tag, value);
+ double expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card;
+ ASSERT_EQ(3.0, expectedCard);
+ expectedCard = estimate(hist, tag, value, EstimationType::kLess).card;
+ ASSERT_EQ(27.0, expectedCard);
+ expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card;
+ ASSERT_EQ(30.0, expectedCard);
+ expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card;
+ ASSERT_EQ(0.0, expectedCard);
+ expectedCard = estimate(hist, tag, value, EstimationType::kGreaterOrEqual).card;
+ ASSERT_EQ(3.0, expectedCard);
+
+ // Estimates for a value inside the bucket. Since there is no low value bound in the histogram
+ // all values smaller than the upper bound will be estimated the same way using half of the
+ // bucket cardinality.
+ std::tie(tag, value) = value::makeNewString("a"_sd);
+ expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card;
+ ASSERT_EQ(3.0, expectedCard);
+ expectedCard = estimate(hist, tag, value, EstimationType::kLess).card;
+ ASSERT_EQ(10.5, expectedCard);
+ expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card;
+ ASSERT_EQ(13.5, expectedCard);
+ expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card;
+ ASSERT_EQ(16.5, expectedCard);
+ expectedCard = estimate(hist, tag, value, EstimationType::kGreaterOrEqual).card;
+ ASSERT_EQ(19.5, expectedCard);
+
+ std::tie(tag, value) = value::makeNewString(""_sd);
+ expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card;
+ ASSERT_EQ(3.0, expectedCard);
+ expectedCard = estimate(hist, tag, value, EstimationType::kLess).card;
+ ASSERT_EQ(10.5, expectedCard);
+ // Can we do better? Figure out that the query value is the smallest in its data type.
+
+ // Estimates for a value larger than the upper bound.
+ std::tie(tag, value) = value::makeNewString("z"_sd);
+ expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card;
+ ASSERT_EQ(0.0, expectedCard);
+ expectedCard = estimate(hist, tag, value, EstimationType::kLess).card;
+ ASSERT_EQ(30.0, expectedCard);
+ expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card;
+ ASSERT_EQ(0.0, expectedCard);
+}
+
+TEST(EstimatorTest, TwoBucketsStrHistogram) {
+ // Data set of 100 strings in the range ["abc", "xyz"], with average frequency of 2.
+ std::vector<BucketData> data{{"abc", 2.0, 0.0, 0.0}, {"xyz", 3.0, 95.0, 48.0}};
+ const ScalarHistogram hist = createHistogram(data);
+
+ ASSERT_EQ(100.0, getTotals(hist).card);
+
+ // Estimates for a value smaller than the first bucket bound.
+ auto [tag, value] = value::makeNewString("a"_sd);
+ value::ValueGuard vg(tag, value);
+
+ double expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card;
+ ASSERT_EQ(0.0, expectedCard);
+ expectedCard = estimate(hist, tag, value, EstimationType::kLess).card;
+ ASSERT_EQ(0.0, expectedCard);
+ expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card;
+ ASSERT_EQ(0.0, expectedCard);
+ expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card;
+ ASSERT_EQ(100.0, expectedCard);
+ expectedCard = estimate(hist, tag, value, EstimationType::kGreaterOrEqual).card;
+ ASSERT_EQ(100.0, expectedCard);
+
+ // Estimates with bucket bounds.
+ std::tie(tag, value) = value::makeNewString("abc"_sd);
+ expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card;
+ ASSERT_EQ(2.0, expectedCard);
+ expectedCard = estimate(hist, tag, value, EstimationType::kLess).card;
+ ASSERT_EQ(0.0, expectedCard);
+ expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card;
+ ASSERT_EQ(2.0, expectedCard);
+ expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card;
+ ASSERT_EQ(98.0, expectedCard);
+ expectedCard = estimate(hist, tag, value, EstimationType::kGreaterOrEqual).card;
+ ASSERT_EQ(100.0, expectedCard);
+
+ std::tie(tag, value) = value::makeNewString("xyz"_sd);
+ expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card;
+ ASSERT_EQ(3.0, expectedCard);
+ expectedCard = estimate(hist, tag, value, EstimationType::kLess).card;
+ ASSERT_EQ(97.0, expectedCard);
+ expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card;
+ ASSERT_EQ(100.0, expectedCard);
+ expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card;
+ ASSERT_EQ(0.0, expectedCard);
+ expectedCard = estimate(hist, tag, value, EstimationType::kGreaterOrEqual).card;
+ ASSERT_EQ(3.0, expectedCard);
+
+ // Estimates for a value inside the bucket.
+ std::tie(tag, value) = value::makeNewString("sun"_sd);
+ expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card;
+ ASSERT_APPROX_EQUAL(2.0, expectedCard, 0.1);
+ expectedCard = estimate(hist, tag, value, EstimationType::kLess).card;
+ ASSERT_APPROX_EQUAL(74.4, expectedCard, 0.1);
+ expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card;
+ ASSERT_APPROX_EQUAL(76.4, expectedCard, 0.1);
+ expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card;
+ ASSERT_APPROX_EQUAL(23.6, expectedCard, 0.1);
+ expectedCard = estimate(hist, tag, value, EstimationType::kGreaterOrEqual).card;
+ ASSERT_APPROX_EQUAL(25.6, expectedCard, 0.1);
+
+ // Estimate for a value very close to the bucket bound.
+ std::tie(tag, value) = value::makeNewString("xyw"_sd);
+ expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card;
+ ASSERT_APPROX_EQUAL(2.0, expectedCard, 0.1);
+ expectedCard = estimate(hist, tag, value, EstimationType::kLess).card;
+ ASSERT_APPROX_EQUAL(95.0, expectedCard, 0.1);
+ expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card;
+ ASSERT_APPROX_EQUAL(97.0, expectedCard, 0.1);
+ expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card;
+ ASSERT_APPROX_EQUAL(3.0, expectedCard, 0.1);
+ expectedCard = estimate(hist, tag, value, EstimationType::kGreaterOrEqual).card;
+ ASSERT_APPROX_EQUAL(5.0, expectedCard, 0.1);
+}
+
+TEST(EstimatorTest, TwoBucketsDateHistogram) {
+ // June 6, 2017 -- June 7, 2017.
+ const int64_t startInstant = 1496777923000LL;
+ const int64_t endInstant = 1496864323000LL;
+ const auto startDate = Date_t::fromMillisSinceEpoch(startInstant);
+ const auto endDate = Date_t::fromMillisSinceEpoch(endInstant);
+
+ std::vector<BucketData> data{{Value(startDate), 3.0, 0.0, 0.0},
+ {Value(endDate), 1.0, 96.0, 48.0}};
+ const ScalarHistogram hist = createHistogram(data);
+
+ ASSERT_EQ(100.0, getTotals(hist).card);
+
+ const auto valueBefore = value::bitcastFrom<int64_t>(startInstant - 1);
+ double expectedCard =
+ estimate(hist, value::TypeTags::Date, valueBefore, EstimationType::kEqual).card;
+ ASSERT_EQ(0.0, expectedCard);
+ expectedCard = estimate(hist, value::TypeTags::Date, valueBefore, EstimationType::kLess).card;
+ ASSERT_EQ(0.0, expectedCard);
+ expectedCard =
+ estimate(hist, value::TypeTags::Date, valueBefore, EstimationType::kGreater).card;
+ ASSERT_EQ(100.0, expectedCard);
+
+ const auto valueStart = value::bitcastFrom<int64_t>(startInstant);
+ expectedCard = estimate(hist, value::TypeTags::Date, valueStart, EstimationType::kEqual).card;
+ ASSERT_EQ(3.0, expectedCard);
+ expectedCard = estimate(hist, value::TypeTags::Date, valueStart, EstimationType::kLess).card;
+ ASSERT_EQ(0.0, expectedCard);
+ expectedCard = estimate(hist, value::TypeTags::Date, valueStart, EstimationType::kGreater).card;
+ ASSERT_EQ(97.0, expectedCard);
+
+ const auto valueEnd = value::bitcastFrom<int64_t>(endInstant);
+ expectedCard = estimate(hist, value::TypeTags::Date, valueEnd, EstimationType::kEqual).card;
+ ASSERT_EQ(1.0, expectedCard);
+ expectedCard = estimate(hist, value::TypeTags::Date, valueEnd, EstimationType::kLess).card;
+ ASSERT_EQ(99.0, expectedCard);
+ expectedCard = estimate(hist, value::TypeTags::Date, valueEnd, EstimationType::kGreater).card;
+ ASSERT_EQ(0.0, expectedCard);
+
+ const auto valueIn = value::bitcastFrom<int64_t>(startInstant + 43000000);
+ expectedCard = estimate(hist, value::TypeTags::Date, valueIn, EstimationType::kEqual).card;
+ ASSERT_EQ(2.0, expectedCard);
+ expectedCard = estimate(hist, value::TypeTags::Date, valueIn, EstimationType::kLess).card;
+ ASSERT_APPROX_EQUAL(48.8, expectedCard, 0.1);
+ expectedCard = estimate(hist, value::TypeTags::Date, valueIn, EstimationType::kGreater).card;
+ ASSERT_APPROX_EQUAL(49.2, expectedCard, 0.1);
+
+ const auto valueAfter = value::bitcastFrom<int64_t>(endInstant + 100);
+ expectedCard = estimate(hist, value::TypeTags::Date, valueAfter, EstimationType::kEqual).card;
+ ASSERT_EQ(0.0, expectedCard);
+ expectedCard = estimate(hist, value::TypeTags::Date, valueAfter, EstimationType::kLess).card;
+ ASSERT_EQ(100.0, expectedCard);
+ expectedCard = estimate(hist, value::TypeTags::Date, valueAfter, EstimationType::kGreater).card;
+ ASSERT_EQ(0.0, expectedCard);
+}
+
+TEST(EstimatorTest, TwoBucketsTimestampHistogram) {
+ // June 6, 2017 -- June 7, 2017 in seconds.
+ const int64_t startInstant = 1496777923LL;
+ const int64_t endInstant = 1496864323LL;
+ const Timestamp startTs{Seconds(startInstant), 0};
+ const Timestamp endTs{Seconds(endInstant), 0};
+
+ std::vector<BucketData> data{{Value(startTs), 3.0, 0.0, 0.0}, {Value(endTs), 1.0, 96.0, 48.0}};
+ const ScalarHistogram hist = createHistogram(data);
+
+ ASSERT_EQ(100.0, getTotals(hist).card);
+
+ const auto valueBefore = value::bitcastFrom<int64_t>(startTs.asULL() - 1);
+ double expectedCard =
+ estimate(hist, value::TypeTags::Timestamp, valueBefore, EstimationType::kEqual).card;
+ ASSERT_EQ(0.0, expectedCard);
+ expectedCard =
+ estimate(hist, value::TypeTags::Timestamp, valueBefore, EstimationType::kLess).card;
+ ASSERT_EQ(0.0, expectedCard);
+ expectedCard =
+ estimate(hist, value::TypeTags::Timestamp, valueBefore, EstimationType::kGreater).card;
+ ASSERT_EQ(100.0, expectedCard);
+
+ const auto valueStart = value::bitcastFrom<int64_t>(
+ startTs.asULL()); // NB: startTs.asInt64() produces different value.
+ expectedCard =
+ estimate(hist, value::TypeTags::Timestamp, valueStart, EstimationType::kEqual).card;
+ ASSERT_EQ(3.0, expectedCard);
+ expectedCard =
+ estimate(hist, value::TypeTags::Timestamp, valueStart, EstimationType::kLess).card;
+ ASSERT_EQ(0.0, expectedCard);
+ expectedCard =
+ estimate(hist, value::TypeTags::Timestamp, valueStart, EstimationType::kGreater).card;
+ ASSERT_EQ(97.0, expectedCard);
+
+ const auto valueEnd = value::bitcastFrom<int64_t>(endTs.asULL());
+ expectedCard =
+ estimate(hist, value::TypeTags::Timestamp, valueEnd, EstimationType::kEqual).card;
+ ASSERT_EQ(1.0, expectedCard);
+ expectedCard = estimate(hist, value::TypeTags::Timestamp, valueEnd, EstimationType::kLess).card;
+ ASSERT_EQ(99.0, expectedCard);
+ expectedCard =
+ estimate(hist, value::TypeTags::Timestamp, valueEnd, EstimationType::kGreater).card;
+ ASSERT_EQ(0.0, expectedCard);
+
+ const auto valueIn = value::bitcastFrom<int64_t>((startTs.asULL() + endTs.asULL()) / 2);
+ expectedCard = estimate(hist, value::TypeTags::Timestamp, valueIn, EstimationType::kEqual).card;
+ ASSERT_EQ(2.0, expectedCard);
+ expectedCard = estimate(hist, value::TypeTags::Timestamp, valueIn, EstimationType::kLess).card;
+ ASSERT_APPROX_EQUAL(49.0, expectedCard, 0.1);
+ expectedCard =
+ estimate(hist, value::TypeTags::Timestamp, valueIn, EstimationType::kGreater).card;
+ ASSERT_APPROX_EQUAL(49.0, expectedCard, 0.1);
+
+ const auto valueAfter = value::bitcastFrom<int64_t>(endTs.asULL() + 100);
+ expectedCard =
+ estimate(hist, value::TypeTags::Timestamp, valueAfter, EstimationType::kEqual).card;
+ ASSERT_EQ(0.0, expectedCard);
+ expectedCard =
+ estimate(hist, value::TypeTags::Timestamp, valueAfter, EstimationType::kLess).card;
+ ASSERT_EQ(100.0, expectedCard);
+ expectedCard =
+ estimate(hist, value::TypeTags::Timestamp, valueAfter, EstimationType::kGreater).card;
+ ASSERT_EQ(0.0, expectedCard);
+}
+
+TEST(EstimatorTest, TwoBucketsObjectIdHistogram) {
+ const auto startOid = OID("63340d8d27afef2de7357e8d");
+ const auto endOid = OID("63340dbed6cd8af737d4139a");
+ ASSERT_TRUE(startOid < endOid);
+
+ std::vector<BucketData> data{{Value(startOid), 2.0, 0.0, 0.0},
+ {Value(endOid), 1.0, 97.0, 77.0}};
+ const ScalarHistogram hist = createHistogram(data);
+
+ ASSERT_EQ(100.0, getTotals(hist).card);
+
+ auto [tag, value] = value::makeNewObjectId();
+ value::ValueGuard vg(tag, value);
+ const auto oidBefore = OID("63340d8d27afef2de7357e8c");
+ oidBefore.view().readInto(value::getObjectIdView(value));
+
+ double expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card;
+ ASSERT_EQ(0.0, expectedCard);
+ expectedCard = estimate(hist, tag, value, EstimationType::kLess).card;
+ ASSERT_EQ(0.0, expectedCard);
+ expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card;
+ ASSERT_EQ(100.0, expectedCard);
+
+ // Bucket bounds.
+ startOid.view().readInto(value::getObjectIdView(value));
+ expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card;
+ ASSERT_EQ(2.0, expectedCard);
+ expectedCard = estimate(hist, tag, value, EstimationType::kLess).card;
+ ASSERT_EQ(0.0, expectedCard);
+ expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card;
+ ASSERT_EQ(98.0, expectedCard);
+
+ endOid.view().readInto(value::getObjectIdView(value));
+ expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card;
+ ASSERT_EQ(1.0, expectedCard);
+ expectedCard = estimate(hist, tag, value, EstimationType::kLess).card;
+ ASSERT_EQ(99.0, expectedCard);
+ expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card;
+ ASSERT_EQ(0.0, expectedCard);
+
+ // ObjectId value inside the bucket.
+ const auto oidInside = OID("63340db2cd4d46ff39178e9d");
+ oidInside.view().readInto(value::getObjectIdView(value));
+ expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card;
+ ASSERT_APPROX_EQUAL(1.2, expectedCard, 0.1);
+ expectedCard = estimate(hist, tag, value, EstimationType::kLess).card;
+ ASSERT_APPROX_EQUAL(83.9, expectedCard, 0.1);
+ expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card;
+ ASSERT_APPROX_EQUAL(14.8, expectedCard, 0.1);
+
+ const auto oidAfter = OID("63340dbed6cd8af737d4139b");
+ oidAfter.view().readInto(value::getObjectIdView(value));
+ expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card;
+ ASSERT_EQ(0.0, expectedCard);
+ expectedCard = estimate(hist, tag, value, EstimationType::kLess).card;
+ ASSERT_EQ(100.0, expectedCard);
+ expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card;
+ ASSERT_EQ(0.0, expectedCard);
+}
+
+} // namespace
+} // namespace mongo::ce
diff --git a/src/mongo/db/query/ce/ce_histogram_test.cpp b/src/mongo/db/query/ce/ce_histogram_test.cpp
index 70c884c6e8a..6e38fd707b7 100644
--- a/src/mongo/db/query/ce/ce_histogram_test.cpp
+++ b/src/mongo/db/query/ce/ce_histogram_test.cpp
@@ -335,11 +335,11 @@ TEST(CEHistogramTest, TestOneBoundIntRangeHistogram) {
ASSERT_MATCH_CE(t, "{intRange: {$gt: 10}}", 46.0);
ASSERT_MATCH_CE(t, "{intRange: {$gte: 15}}", 28.5);
ASSERT_MATCH_CE(t, "{intRange: {$gt: 15}}", 23.5);
- ASSERT_MATCH_CE(t, "{intRange: {$gte: 11}, intRange: {$lte: 20}}", 46.5);
+ ASSERT_MATCH_CE(t, "{intRange: {$gte: 11}, intRange: {$lte: 20}}", 41.5);
ASSERT_MATCH_CE(t, "{intRange: {$gt: 11}, intRange: {$lte: 20}}", 41.5);
// Test ranges that partially overlap with the entire histogram.
- ASSERT_MATCH_CE(t, "{intRange: {$lt: 11}}", 4.5);
+ ASSERT_MATCH_CE(t, "{intRange: {$lt: 11}}", 9.5);
ASSERT_MATCH_CE(t, "{intRange: {$lt: 15}}", 22.5);
ASSERT_MATCH_CE(t, "{intRange: {$lte: 15}}", 27.5);
ASSERT_MATCH_CE(t, "{intRange: {$gte: 8}, intRange: {$lte: 15}}", 27.5);
@@ -386,7 +386,7 @@ TEST(CEHistogramTest, TestOneBoundIntRangeHistogram) {
// node corresponding to the path "intRange", we have two keys and two ranges, both
// corresponding to the same path. As a consequence, we combine the estimates for the intervals
// using exponential backoff, which results in an overestimate.
- ASSERT_MATCH_CE(t, "{intRange: {$gte: 11}, intRange: {$lt: 20}}", 46.04);
+ ASSERT_MATCH_CE(t, "{intRange: {$gte: 11}, intRange: {$lt: 20}}", 41.09);
ASSERT_MATCH_CE(t, "{intRange: {$gt: 11}, intRange: {$lt: 20}}", 41.09);
ASSERT_MATCH_CE(t, "{intRange: {$gt: 12}, intRange: {$lt: 15}}", 19.16);
ASSERT_MATCH_CE(t, "{intRange: {$gte: 12}, intRange: {$lt: 15}}", 20.42);
@@ -399,7 +399,7 @@ TEST(CEHistogramTest, TestOneBoundIntRangeHistogram) {
t.setIndexes(
{{"intRangeIndex",
makeIndexDefinition("intRange", CollationOp::Ascending, /* isMultiKey */ false)}});
- ASSERT_MATCH_CE(t, "{intRange: {$gte: 11}, intRange: {$lt: 20}}", 45.5);
+ ASSERT_MATCH_CE(t, "{intRange: {$gte: 11}, intRange: {$lt: 20}}", 40.5);
ASSERT_MATCH_CE(t, "{intRange: {$gt: 11}, intRange: {$lt: 20}}", 40.5);
ASSERT_MATCH_CE(t, "{intRange: {$gt: 12}, intRange: {$lt: 15}}", 8.5);
ASSERT_MATCH_CE(t, "{intRange: {$gte: 12}, intRange: {$lt: 15}}", 13.5);
@@ -647,7 +647,7 @@ TEST(CEHistogramTest, TestArrayHistogramOnCompositePredicates) {
ASSERT_MATCH_CE(t, "{array: {$elemMatch: {$eq: 5}}, array: {$eq: 5}}", 35.0);
// Test case with multiple predicates and ranges.
- ASSERT_MATCH_CE(t, "{array: {$elemMatch: {$lt: 5}}, mixed: {$lt: 5}}", 67.88);
+ ASSERT_MATCH_CE(t, "{array: {$elemMatch: {$lt: 5}}, mixed: {$lt: 5}}", 68.75);
ASSERT_MATCH_CE(t, "{array: {$elemMatch: {$lt: 5}}, mixed: {$gt: 5}}", 28.19);
// Test multiple $elemMatches.
diff --git a/src/mongo/db/query/ce/ce_interpolation_test.cpp b/src/mongo/db/query/ce/ce_interpolation_test.cpp
index 043d9074b10..63c134cb131 100644
--- a/src/mongo/db/query/ce/ce_interpolation_test.cpp
+++ b/src/mongo/db/query/ce/ce_interpolation_test.cpp
@@ -38,11 +38,6 @@ namespace {
using namespace sbe;
-double estimateIntValCard(const ScalarHistogram& hist, const int v, const EstimationType type) {
- auto [tag, val] = std::make_pair(value::TypeTags::NumberInt64, value::bitcastFrom<int64_t>(v));
- return estimate(hist, tag, val, type).card;
-};
-
TEST(EstimatorTest, ManualHistogram) {
std::vector<BucketData> data{{0, 1.0, 1.0, 1.0},
{10, 1.0, 10.0, 5.0},
@@ -481,7 +476,7 @@ TEST(EstimatorTest, UniformIntMixedArrayEstimate) {
highTag,
highVal,
true /* includeScalar */);
- ASSERT_APPROX_EQUAL(90.9, expectedCard, 0.1); // Actual: 94.
+ ASSERT_APPROX_EQUAL(92.9, expectedCard, 0.1); // Actual: 94.
// Test interpolation for query: [{$match: {a: {$elemMatch: {$gt: 500, $lt: 550}}}}].
expectedCard = estimateCardRange(arrHist,
diff --git a/src/mongo/db/query/ce/ce_test_utils.cpp b/src/mongo/db/query/ce/ce_test_utils.cpp
index 407b6ecd47c..c4e76b2ef39 100644
--- a/src/mongo/db/query/ce/ce_test_utils.cpp
+++ b/src/mongo/db/query/ce/ce_test_utils.cpp
@@ -142,12 +142,6 @@ optimizer::CEType CETester::getCE(ABT& abt) const {
}
ScalarHistogram createHistogram(const std::vector<BucketData>& data) {
- sbe::value::Array array;
- for (const auto& item : data) {
- const auto [tag, val] = stage_builder::makeValue(item._v);
- array.push_back(tag, val);
- }
-
value::Array bounds;
std::vector<Bucket> buckets;
@@ -155,10 +149,10 @@ ScalarHistogram createHistogram(const std::vector<BucketData>& data) {
double cumulativeNDV = 0.0;
for (size_t i = 0; i < data.size(); i++) {
- const auto [tag, val] = array.getAt(i);
+ const auto& item = data.at(i);
+ const auto [tag, val] = stage_builder::makeValue(item._v);
bounds.push_back(tag, val);
- const auto& item = data.at(i);
cumulativeFreq += item._equalFreq + item._rangeFreq;
cumulativeNDV += item._ndv + 1.0;
buckets.emplace_back(
@@ -167,4 +161,11 @@ ScalarHistogram createHistogram(const std::vector<BucketData>& data) {
return {std::move(bounds), std::move(buckets)};
}
+
+double estimateIntValCard(const ScalarHistogram& hist, const int v, const EstimationType type) {
+ const auto [tag, val] =
+ std::make_pair(value::TypeTags::NumberInt64, value::bitcastFrom<int64_t>(v));
+ return estimate(hist, tag, val, type).card;
+};
+
} // namespace mongo::ce
diff --git a/src/mongo/db/query/ce/ce_test_utils.h b/src/mongo/db/query/ce/ce_test_utils.h
index 667a3b70dc3..9e6ebdeb5e8 100644
--- a/src/mongo/db/query/ce/ce_test_utils.h
+++ b/src/mongo/db/query/ce/ce_test_utils.h
@@ -32,6 +32,7 @@
#include <cstddef>
#include <sys/types.h>
+#include "mongo/db/query/ce/histogram_estimation.h"
#include "mongo/db/query/ce/scalar_histogram.h"
#include "mongo/db/query/optimizer/cascades/interfaces.h"
#include "mongo/db/query/optimizer/opt_phase_manager.h"
@@ -71,14 +72,15 @@ const OptPhaseManager::PhaseSet kNoOptPhaseSet{};
* expecting.
*/
-#define _ASSERT_MATCH_CE(ce, predicate, expectedCE) \
- if constexpr (kCETestLogOnly) { \
- if (std::abs(ce.getCE(predicate) - expectedCE) > kMaxCEError) { \
- std::cout << "ERROR: expected " << expectedCE << std::endl; \
- } \
- ASSERT_APPROX_EQUAL(1.0, 1.0, kMaxCEError); \
- } else { \
- ASSERT_APPROX_EQUAL(expectedCE, ce.getCE(predicate), kMaxCEError); \
+#define _ASSERT_MATCH_CE(ce, predicate, expectedCE) \
+ if constexpr (kCETestLogOnly) { \
+ if (std::abs(ce.getCE(predicate) - expectedCE) > kMaxCEError) { \
+ std::cout << "ERROR: cardinality " << ce.getCE(predicate) << " expected " \
+ << expectedCE << std::endl; \
+ } \
+ ASSERT_APPROX_EQUAL(1.0, 1.0, kMaxCEError); \
+ } else { \
+ ASSERT_APPROX_EQUAL(expectedCE, ce.getCE(predicate), kMaxCEError); \
}
#define _PREDICATE(field, predicate) (str::stream() << "{" << field << ": " << predicate "}")
#define _ELEMMATCH_PREDICATE(field, predicate) \
@@ -159,5 +161,7 @@ struct BucketData {
ScalarHistogram createHistogram(const std::vector<BucketData>& data);
+double estimateIntValCard(const ScalarHistogram& hist, int v, EstimationType type);
+
} // namespace ce
} // namespace mongo
diff --git a/src/mongo/db/query/ce/histogram_estimation.cpp b/src/mongo/db/query/ce/histogram_estimation.cpp
index fbd041380b4..52851e5118f 100644
--- a/src/mongo/db/query/ce/histogram_estimation.cpp
+++ b/src/mongo/db/query/ce/histogram_estimation.cpp
@@ -95,14 +95,16 @@ EstimationResult interpolateEstimateInBucket(const ScalarHistogram& h,
}
}
- resultCard += bucket._rangeFreq * ratio;
+ const double bucketFreqRatio = bucket._rangeFreq * ratio;
+ resultCard += bucketFreqRatio;
resultNDV += bucket._ndv * ratio;
if (type == EstimationType::kLess) {
// Subtract from the estimate the cardinality and ndv corresponding to the equality
- // operation.
+ // operation, if they are larger than the ratio taken from this bucket.
+ const double innerEqFreqCorrection = (bucketFreqRatio < innerEqFreq) ? 0.0 : innerEqFreq;
const double innerEqNdv = (bucket._ndv * ratio <= 1.0) ? 0.0 : 1.0;
- resultCard -= innerEqFreq;
+ resultCard -= innerEqFreqCorrection;
resultNDV -= innerEqNdv;
}
return {resultCard, resultNDV};
diff --git a/src/mongo/db/query/ce/maxdiff_histogram_test.cpp b/src/mongo/db/query/ce/maxdiff_histogram_test.cpp
index 78c6490a0e9..2192b586237 100644
--- a/src/mongo/db/query/ce/maxdiff_histogram_test.cpp
+++ b/src/mongo/db/query/ce/maxdiff_histogram_test.cpp
@@ -129,9 +129,8 @@ TEST_F(HistogramTest, MaxDiffTestInt) {
ASSERT_LTE(hist.getBuckets().size(), nBuckets);
const double estimatedCard = estimateCard(hist, 11, EstimationType::kLess);
-
ASSERT_EQ(36, actualCard);
- ASSERT_APPROX_EQUAL(39.73333, estimatedCard, kTolerance);
+ ASSERT_APPROX_EQUAL(43.7333, estimatedCard, kTolerance);
}
TEST_F(HistogramTest, MaxDiffTestString) {
@@ -159,7 +158,7 @@ TEST_F(HistogramTest, MaxDiffTestString) {
const double estimatedCard = estimate(hist, tag, val, EstimationType::kLess).card;
ASSERT_EQ(15, actualCard);
- ASSERT_APPROX_EQUAL(10.9443, estimatedCard, kTolerance);
+ ASSERT_APPROX_EQUAL(15.9443, estimatedCard, kTolerance);
}
TEST_F(HistogramTest, MaxDiffTestMixedTypes) {
diff --git a/src/mongo/db/query/ce/value_utils.cpp b/src/mongo/db/query/ce/value_utils.cpp
index 9838f91a285..f4bfd0e8b53 100644
--- a/src/mongo/db/query/ce/value_utils.cpp
+++ b/src/mongo/db/query/ce/value_utils.cpp
@@ -156,6 +156,18 @@ double valueToDouble(value::TypeTags tag, value::Value val) {
const double charToDbl = ch / std::pow(2, i * 8);
result += charToDbl;
}
+ } else if (tag == value::TypeTags::Date || tag == value::TypeTags::Timestamp) {
+ int64_t v = value::bitcastTo<int64_t>(val);
+ result = value::numericCast<double>(value::TypeTags::NumberInt64, v);
+
+ } else if (tag == value::TypeTags::ObjectId) {
+ auto objView =
+ ConstDataView(reinterpret_cast<const char*>(sbe::value::getObjectIdView(val)->data()));
+ // Take the first 8 bytes of the ObjectId.
+ // ToDo: consider using the entire ObjectId or other parts of it
+ // auto v = objView.read<LittleEndian<uint64_t>>(sizeof(uint32_t));
+ auto v = objView.read<LittleEndian<uint64_t>>();
+ result = value::numericCast<double>(value::TypeTags::NumberInt64, v);
} else {
uassert(6844500, "Unexpected value type", false);
}