From 1c4fafd4ae5c082f36a8af1442aa48174962b1b4 Mon Sep 17 00:00:00 2001 From: Milena Ivanova Date: Fri, 30 Sep 2022 12:08:38 +0000 Subject: SERVER-68446 Unit testing of simple histogram CE with all data types --- src/mongo/db/query/ce/SConscript | 10 + src/mongo/db/query/ce/ce_edge_cases_test.cpp | 508 +++++++++++++++++++++++ src/mongo/db/query/ce/ce_histogram_test.cpp | 10 +- src/mongo/db/query/ce/ce_interpolation_test.cpp | 7 +- src/mongo/db/query/ce/ce_test_utils.cpp | 17 +- src/mongo/db/query/ce/ce_test_utils.h | 20 +- src/mongo/db/query/ce/histogram_estimation.cpp | 8 +- src/mongo/db/query/ce/maxdiff_histogram_test.cpp | 5 +- src/mongo/db/query/ce/value_utils.cpp | 12 + 9 files changed, 564 insertions(+), 33 deletions(-) create mode 100644 src/mongo/db/query/ce/ce_edge_cases_test.cpp diff --git a/src/mongo/db/query/ce/SConscript b/src/mongo/db/query/ce/SConscript index 99b258b4e7f..37fdc253507 100644 --- a/src/mongo/db/query/ce/SConscript +++ b/src/mongo/db/query/ce/SConscript @@ -117,6 +117,16 @@ env.CppUnitTest( ], ) +env.CppUnitTest( + target="ce_edge_cases_test", + source=[ + "ce_edge_cases_test.cpp", + ], + LIBDEPS=[ + 'ce_test_utils', + ], +) + env.CppUnitTest( target='stats_cache_loader_test', source=[ diff --git a/src/mongo/db/query/ce/ce_edge_cases_test.cpp b/src/mongo/db/query/ce/ce_edge_cases_test.cpp new file mode 100644 index 00000000000..14cf86e17de --- /dev/null +++ b/src/mongo/db/query/ce/ce_edge_cases_test.cpp @@ -0,0 +1,508 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * . + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/query/ce/array_histogram.h" +#include "mongo/db/query/ce/ce_test_utils.h" +#include "mongo/db/query/ce/histogram_estimation.h" +#include "mongo/db/query/sbe_stage_builder_helpers.h" +#include "mongo/unittest/unittest.h" + +namespace mongo::ce { +namespace { + +using namespace sbe; + +TEST(EstimatorTest, OneBucketIntHistogram) { + // Data set of 10 values, each with frequency 3, in the range (-inf, 100]. + // Example: { -100, -20, 0, 20, 50, 60, 70, 80, 90, 100}. + std::vector data{{100, 3.0, 27.0, 9.0}}; + const ScalarHistogram hist = createHistogram(data); + + ASSERT_EQ(30.0, getTotals(hist).card); + + // Estimates with the bucket bound. + ASSERT_EQ(3.0, estimateIntValCard(hist, 100, EstimationType::kEqual)); + ASSERT_EQ(27.0, estimateIntValCard(hist, 100, EstimationType::kLess)); + ASSERT_EQ(30.0, estimateIntValCard(hist, 100, EstimationType::kLessOrEqual)); + ASSERT_EQ(0.0, estimateIntValCard(hist, 100, EstimationType::kGreater)); + ASSERT_EQ(3.0, estimateIntValCard(hist, 100, EstimationType::kGreaterOrEqual)); + + // Estimates with a value inside the bucket. + ASSERT_EQ(3.0, estimateIntValCard(hist, 10, EstimationType::kEqual)); + // No interpolation possible for estimates of inequalities in a single bucket. The estimates + // are based on the default cardinality of half bucket +/- the estimate of equality inside of + // the bucket. + ASSERT_EQ(10.5, estimateIntValCard(hist, 10, EstimationType::kLess)); + ASSERT_EQ(13.5, estimateIntValCard(hist, 10, EstimationType::kLessOrEqual)); + ASSERT_EQ(16.5, estimateIntValCard(hist, 10, EstimationType::kGreater)); + ASSERT_EQ(19.5, estimateIntValCard(hist, 10, EstimationType::kGreaterOrEqual)); + + // Estimates for a value larger than the last bucket bound. + ASSERT_EQ(0.0, estimateIntValCard(hist, 1000, EstimationType::kEqual)); + ASSERT_EQ(30.0, estimateIntValCard(hist, 1000, EstimationType::kLess)); + ASSERT_EQ(30.0, estimateIntValCard(hist, 1000, EstimationType::kLessOrEqual)); + ASSERT_EQ(0.0, estimateIntValCard(hist, 1000, EstimationType::kGreater)); + ASSERT_EQ(0.0, estimateIntValCard(hist, 1000, EstimationType::kGreaterOrEqual)); +} + +TEST(EstimatorTest, OneExclusiveBucketIntHistogram) { + // Data set of a single value. + // By exclusive bucket we mean a bucket with only boundary, that is the range frequency and NDV + // are zero. + std::vector data{{100, 2.0, 0.0, 0.0}}; + const ScalarHistogram hist = createHistogram(data); + + ASSERT_EQ(2.0, getTotals(hist).card); + + // Estimates with the bucket boundary. + ASSERT_EQ(2.0, estimateIntValCard(hist, 100, EstimationType::kEqual)); + ASSERT_EQ(0.0, estimateIntValCard(hist, 100, EstimationType::kLess)); + ASSERT_EQ(0.0, estimateIntValCard(hist, 100, EstimationType::kGreater)); + + ASSERT_EQ(0.0, estimateIntValCard(hist, 0, EstimationType::kEqual)); + ASSERT_EQ(0.0, estimateIntValCard(hist, 0, EstimationType::kLess)); + ASSERT_EQ(2.0, estimateIntValCard(hist, 0, EstimationType::kGreater)); + + ASSERT_EQ(0.0, estimateIntValCard(hist, 1000, EstimationType::kEqual)); + ASSERT_EQ(2.0, estimateIntValCard(hist, 1000, EstimationType::kLess)); + ASSERT_EQ(0.0, estimateIntValCard(hist, 1000, EstimationType::kGreater)); +} + +TEST(EstimatorTest, OneBucketTwoIntValuesHistogram) { + // Data set of two values, example {5, 100, 100}. + std::vector data{{100, 2.0, 1.0, 1.0}}; + const ScalarHistogram hist = createHistogram(data); + + ASSERT_EQ(3.0, getTotals(hist).card); + + // Estimates with the bucket boundary. + ASSERT_EQ(2.0, estimateIntValCard(hist, 100, EstimationType::kEqual)); + ASSERT_EQ(1.0, estimateIntValCard(hist, 100, EstimationType::kLess)); + ASSERT_EQ(0.0, estimateIntValCard(hist, 100, EstimationType::kGreater)); + + ASSERT_EQ(1.0, estimateIntValCard(hist, 10, EstimationType::kEqual)); + // Default estimate of half of the bucket's range frequency = 0.5. + ASSERT_EQ(0.5, estimateIntValCard(hist, 10, EstimationType::kLess)); + ASSERT_EQ(2.5, estimateIntValCard(hist, 10, EstimationType::kGreater)); + + ASSERT_EQ(0.0, estimateIntValCard(hist, 1000, EstimationType::kEqual)); + ASSERT_EQ(3.0, estimateIntValCard(hist, 1000, EstimationType::kLess)); + ASSERT_EQ(0.0, estimateIntValCard(hist, 1000, EstimationType::kGreater)); +} + +TEST(EstimatorTest, OneBucketTwoIntValuesHistogram2) { + // Similar to the above test with higher frequency for the second value. + // Example {5, 5, 5, 100, 100}. + std::vector data{{100, 2.0, 3.0, 1.0}}; + const ScalarHistogram hist = createHistogram(data); + + ASSERT_EQ(5.0, getTotals(hist).card); + + // Estimates with the bucket boundary. + ASSERT_EQ(2.0, estimateIntValCard(hist, 100, EstimationType::kEqual)); + ASSERT_EQ(3.0, estimateIntValCard(hist, 100, EstimationType::kLess)); + ASSERT_EQ(0.0, estimateIntValCard(hist, 100, EstimationType::kGreater)); + + ASSERT_EQ(3.0, estimateIntValCard(hist, 10, EstimationType::kEqual)); + // Default estimate of half of the bucket's range frequency = 1.5. + ASSERT_EQ(1.5, estimateIntValCard(hist, 10, EstimationType::kLess)); + ASSERT_EQ(3.5, estimateIntValCard(hist, 10, EstimationType::kGreater)); + + ASSERT_EQ(0.0, estimateIntValCard(hist, 1000, EstimationType::kEqual)); + ASSERT_EQ(5.0, estimateIntValCard(hist, 1000, EstimationType::kLess)); + ASSERT_EQ(0.0, estimateIntValCard(hist, 1000, EstimationType::kGreater)); +} + + +TEST(EstimatorTest, TwoBucketsIntHistogram) { + // Data set of 10 values in the range [1, 100]. + std::vector data{{1, 1.0, 0.0, 0.0}, {100, 3.0, 26.0, 8.0}}; + const ScalarHistogram hist = createHistogram(data); + + ASSERT_EQ(30.0, getTotals(hist).card); + + // Estimates for a value smaller than the first bucket. + ASSERT_EQ(0.0, estimateIntValCard(hist, -42, EstimationType::kEqual)); + ASSERT_EQ(0.0, estimateIntValCard(hist, -42, EstimationType::kLess)); + ASSERT_EQ(0.0, estimateIntValCard(hist, -42, EstimationType::kLessOrEqual)); + ASSERT_EQ(30.0, estimateIntValCard(hist, -42, EstimationType::kGreater)); + ASSERT_EQ(30.0, estimateIntValCard(hist, -42, EstimationType::kGreaterOrEqual)); + + // Estimates with bucket bounds. + ASSERT_EQ(1.0, estimateIntValCard(hist, 1, EstimationType::kEqual)); + ASSERT_EQ(0.0, estimateIntValCard(hist, 1, EstimationType::kLess)); + ASSERT_EQ(1.0, estimateIntValCard(hist, 1, EstimationType::kLessOrEqual)); + ASSERT_EQ(29.0, estimateIntValCard(hist, 1, EstimationType::kGreater)); + ASSERT_EQ(30.0, estimateIntValCard(hist, 1, EstimationType::kGreaterOrEqual)); + + ASSERT_EQ(3.0, estimateIntValCard(hist, 100, EstimationType::kEqual)); + ASSERT_EQ(27.0, estimateIntValCard(hist, 100, EstimationType::kLess)); + ASSERT_EQ(30.0, estimateIntValCard(hist, 100, EstimationType::kLessOrEqual)); + ASSERT_EQ(0.0, estimateIntValCard(hist, 100, EstimationType::kGreater)); + ASSERT_EQ(3.0, estimateIntValCard(hist, 100, EstimationType::kGreaterOrEqual)); + + // Estimates with a value inside the bucket. The estimates use interpolation. + // The bucket ratio for the value of 10 is smaller than the estimate for equality + // and the estimates for Less and LessOrEqual are the same. + ASSERT_APPROX_EQUAL(3.3, estimateIntValCard(hist, 10, EstimationType::kEqual), 0.1); + ASSERT_APPROX_EQUAL(3.4, estimateIntValCard(hist, 10, EstimationType::kLess), 0.1); + ASSERT_APPROX_EQUAL(3.4, estimateIntValCard(hist, 10, EstimationType::kLessOrEqual), 0.1); + ASSERT_APPROX_EQUAL(26.6, estimateIntValCard(hist, 10, EstimationType::kGreater), 0.1); + ASSERT_APPROX_EQUAL(26.6, estimateIntValCard(hist, 10, EstimationType::kGreaterOrEqual), 0.1); + + // Different estimates for Less and LessOrEqual for the value of 50. + ASSERT_APPROX_EQUAL(3.3, estimateIntValCard(hist, 50, EstimationType::kEqual), 0.1); + ASSERT_APPROX_EQUAL(10.6, estimateIntValCard(hist, 50, EstimationType::kLess), 0.1); + ASSERT_APPROX_EQUAL(13.9, estimateIntValCard(hist, 50, EstimationType::kLessOrEqual), 0.1); + ASSERT_APPROX_EQUAL(16.1, estimateIntValCard(hist, 50, EstimationType::kGreater), 0.1); + ASSERT_APPROX_EQUAL(19.4, estimateIntValCard(hist, 50, EstimationType::kGreaterOrEqual), 0.1); +} + +TEST(EstimatorTest, ThreeExclusiveBucketsIntHistogram) { + std::vector data{{1, 1.0, 0.0, 0.0}, {10, 8.0, 0.0, 0.0}, {100, 1.0, 0.0, 0.0}}; + const ScalarHistogram hist = createHistogram(data); + + ASSERT_EQ(10.0, getTotals(hist).card); + + ASSERT_EQ(0.0, estimateIntValCard(hist, 5, EstimationType::kEqual)); + ASSERT_EQ(1.0, estimateIntValCard(hist, 5, EstimationType::kLess)); + ASSERT_EQ(1.0, estimateIntValCard(hist, 5, EstimationType::kLessOrEqual)); + ASSERT_EQ(9.0, estimateIntValCard(hist, 5, EstimationType::kGreater)); + ASSERT_EQ(9.0, estimateIntValCard(hist, 5, EstimationType::kGreaterOrEqual)); +} +TEST(EstimatorTest, OneBucketStrHistogram) { + std::vector data{{"xyz", 3.0, 27.0, 9.0}}; + const ScalarHistogram hist = createHistogram(data); + + ASSERT_EQ(30.0, getTotals(hist).card); + + // Estimates with bucket bound. + auto [tag, value] = value::makeNewString("xyz"_sd); + value::ValueGuard vg(tag, value); + double expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; + ASSERT_EQ(3.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; + ASSERT_EQ(27.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card; + ASSERT_EQ(30.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card; + ASSERT_EQ(0.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kGreaterOrEqual).card; + ASSERT_EQ(3.0, expectedCard); + + // Estimates for a value inside the bucket. Since there is no low value bound in the histogram + // all values smaller than the upper bound will be estimated the same way using half of the + // bucket cardinality. + std::tie(tag, value) = value::makeNewString("a"_sd); + expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; + ASSERT_EQ(3.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; + ASSERT_EQ(10.5, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card; + ASSERT_EQ(13.5, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card; + ASSERT_EQ(16.5, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kGreaterOrEqual).card; + ASSERT_EQ(19.5, expectedCard); + + std::tie(tag, value) = value::makeNewString(""_sd); + expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; + ASSERT_EQ(3.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; + ASSERT_EQ(10.5, expectedCard); + // Can we do better? Figure out that the query value is the smallest in its data type. + + // Estimates for a value larger than the upper bound. + std::tie(tag, value) = value::makeNewString("z"_sd); + expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; + ASSERT_EQ(0.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; + ASSERT_EQ(30.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card; + ASSERT_EQ(0.0, expectedCard); +} + +TEST(EstimatorTest, TwoBucketsStrHistogram) { + // Data set of 100 strings in the range ["abc", "xyz"], with average frequency of 2. + std::vector data{{"abc", 2.0, 0.0, 0.0}, {"xyz", 3.0, 95.0, 48.0}}; + const ScalarHistogram hist = createHistogram(data); + + ASSERT_EQ(100.0, getTotals(hist).card); + + // Estimates for a value smaller than the first bucket bound. + auto [tag, value] = value::makeNewString("a"_sd); + value::ValueGuard vg(tag, value); + + double expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; + ASSERT_EQ(0.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; + ASSERT_EQ(0.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card; + ASSERT_EQ(0.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card; + ASSERT_EQ(100.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kGreaterOrEqual).card; + ASSERT_EQ(100.0, expectedCard); + + // Estimates with bucket bounds. + std::tie(tag, value) = value::makeNewString("abc"_sd); + expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; + ASSERT_EQ(2.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; + ASSERT_EQ(0.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card; + ASSERT_EQ(2.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card; + ASSERT_EQ(98.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kGreaterOrEqual).card; + ASSERT_EQ(100.0, expectedCard); + + std::tie(tag, value) = value::makeNewString("xyz"_sd); + expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; + ASSERT_EQ(3.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; + ASSERT_EQ(97.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card; + ASSERT_EQ(100.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card; + ASSERT_EQ(0.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kGreaterOrEqual).card; + ASSERT_EQ(3.0, expectedCard); + + // Estimates for a value inside the bucket. + std::tie(tag, value) = value::makeNewString("sun"_sd); + expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; + ASSERT_APPROX_EQUAL(2.0, expectedCard, 0.1); + expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; + ASSERT_APPROX_EQUAL(74.4, expectedCard, 0.1); + expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card; + ASSERT_APPROX_EQUAL(76.4, expectedCard, 0.1); + expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card; + ASSERT_APPROX_EQUAL(23.6, expectedCard, 0.1); + expectedCard = estimate(hist, tag, value, EstimationType::kGreaterOrEqual).card; + ASSERT_APPROX_EQUAL(25.6, expectedCard, 0.1); + + // Estimate for a value very close to the bucket bound. + std::tie(tag, value) = value::makeNewString("xyw"_sd); + expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; + ASSERT_APPROX_EQUAL(2.0, expectedCard, 0.1); + expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; + ASSERT_APPROX_EQUAL(95.0, expectedCard, 0.1); + expectedCard = estimate(hist, tag, value, EstimationType::kLessOrEqual).card; + ASSERT_APPROX_EQUAL(97.0, expectedCard, 0.1); + expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card; + ASSERT_APPROX_EQUAL(3.0, expectedCard, 0.1); + expectedCard = estimate(hist, tag, value, EstimationType::kGreaterOrEqual).card; + ASSERT_APPROX_EQUAL(5.0, expectedCard, 0.1); +} + +TEST(EstimatorTest, TwoBucketsDateHistogram) { + // June 6, 2017 -- June 7, 2017. + const int64_t startInstant = 1496777923000LL; + const int64_t endInstant = 1496864323000LL; + const auto startDate = Date_t::fromMillisSinceEpoch(startInstant); + const auto endDate = Date_t::fromMillisSinceEpoch(endInstant); + + std::vector data{{Value(startDate), 3.0, 0.0, 0.0}, + {Value(endDate), 1.0, 96.0, 48.0}}; + const ScalarHistogram hist = createHistogram(data); + + ASSERT_EQ(100.0, getTotals(hist).card); + + const auto valueBefore = value::bitcastFrom(startInstant - 1); + double expectedCard = + estimate(hist, value::TypeTags::Date, valueBefore, EstimationType::kEqual).card; + ASSERT_EQ(0.0, expectedCard); + expectedCard = estimate(hist, value::TypeTags::Date, valueBefore, EstimationType::kLess).card; + ASSERT_EQ(0.0, expectedCard); + expectedCard = + estimate(hist, value::TypeTags::Date, valueBefore, EstimationType::kGreater).card; + ASSERT_EQ(100.0, expectedCard); + + const auto valueStart = value::bitcastFrom(startInstant); + expectedCard = estimate(hist, value::TypeTags::Date, valueStart, EstimationType::kEqual).card; + ASSERT_EQ(3.0, expectedCard); + expectedCard = estimate(hist, value::TypeTags::Date, valueStart, EstimationType::kLess).card; + ASSERT_EQ(0.0, expectedCard); + expectedCard = estimate(hist, value::TypeTags::Date, valueStart, EstimationType::kGreater).card; + ASSERT_EQ(97.0, expectedCard); + + const auto valueEnd = value::bitcastFrom(endInstant); + expectedCard = estimate(hist, value::TypeTags::Date, valueEnd, EstimationType::kEqual).card; + ASSERT_EQ(1.0, expectedCard); + expectedCard = estimate(hist, value::TypeTags::Date, valueEnd, EstimationType::kLess).card; + ASSERT_EQ(99.0, expectedCard); + expectedCard = estimate(hist, value::TypeTags::Date, valueEnd, EstimationType::kGreater).card; + ASSERT_EQ(0.0, expectedCard); + + const auto valueIn = value::bitcastFrom(startInstant + 43000000); + expectedCard = estimate(hist, value::TypeTags::Date, valueIn, EstimationType::kEqual).card; + ASSERT_EQ(2.0, expectedCard); + expectedCard = estimate(hist, value::TypeTags::Date, valueIn, EstimationType::kLess).card; + ASSERT_APPROX_EQUAL(48.8, expectedCard, 0.1); + expectedCard = estimate(hist, value::TypeTags::Date, valueIn, EstimationType::kGreater).card; + ASSERT_APPROX_EQUAL(49.2, expectedCard, 0.1); + + const auto valueAfter = value::bitcastFrom(endInstant + 100); + expectedCard = estimate(hist, value::TypeTags::Date, valueAfter, EstimationType::kEqual).card; + ASSERT_EQ(0.0, expectedCard); + expectedCard = estimate(hist, value::TypeTags::Date, valueAfter, EstimationType::kLess).card; + ASSERT_EQ(100.0, expectedCard); + expectedCard = estimate(hist, value::TypeTags::Date, valueAfter, EstimationType::kGreater).card; + ASSERT_EQ(0.0, expectedCard); +} + +TEST(EstimatorTest, TwoBucketsTimestampHistogram) { + // June 6, 2017 -- June 7, 2017 in seconds. + const int64_t startInstant = 1496777923LL; + const int64_t endInstant = 1496864323LL; + const Timestamp startTs{Seconds(startInstant), 0}; + const Timestamp endTs{Seconds(endInstant), 0}; + + std::vector data{{Value(startTs), 3.0, 0.0, 0.0}, {Value(endTs), 1.0, 96.0, 48.0}}; + const ScalarHistogram hist = createHistogram(data); + + ASSERT_EQ(100.0, getTotals(hist).card); + + const auto valueBefore = value::bitcastFrom(startTs.asULL() - 1); + double expectedCard = + estimate(hist, value::TypeTags::Timestamp, valueBefore, EstimationType::kEqual).card; + ASSERT_EQ(0.0, expectedCard); + expectedCard = + estimate(hist, value::TypeTags::Timestamp, valueBefore, EstimationType::kLess).card; + ASSERT_EQ(0.0, expectedCard); + expectedCard = + estimate(hist, value::TypeTags::Timestamp, valueBefore, EstimationType::kGreater).card; + ASSERT_EQ(100.0, expectedCard); + + const auto valueStart = value::bitcastFrom( + startTs.asULL()); // NB: startTs.asInt64() produces different value. + expectedCard = + estimate(hist, value::TypeTags::Timestamp, valueStart, EstimationType::kEqual).card; + ASSERT_EQ(3.0, expectedCard); + expectedCard = + estimate(hist, value::TypeTags::Timestamp, valueStart, EstimationType::kLess).card; + ASSERT_EQ(0.0, expectedCard); + expectedCard = + estimate(hist, value::TypeTags::Timestamp, valueStart, EstimationType::kGreater).card; + ASSERT_EQ(97.0, expectedCard); + + const auto valueEnd = value::bitcastFrom(endTs.asULL()); + expectedCard = + estimate(hist, value::TypeTags::Timestamp, valueEnd, EstimationType::kEqual).card; + ASSERT_EQ(1.0, expectedCard); + expectedCard = estimate(hist, value::TypeTags::Timestamp, valueEnd, EstimationType::kLess).card; + ASSERT_EQ(99.0, expectedCard); + expectedCard = + estimate(hist, value::TypeTags::Timestamp, valueEnd, EstimationType::kGreater).card; + ASSERT_EQ(0.0, expectedCard); + + const auto valueIn = value::bitcastFrom((startTs.asULL() + endTs.asULL()) / 2); + expectedCard = estimate(hist, value::TypeTags::Timestamp, valueIn, EstimationType::kEqual).card; + ASSERT_EQ(2.0, expectedCard); + expectedCard = estimate(hist, value::TypeTags::Timestamp, valueIn, EstimationType::kLess).card; + ASSERT_APPROX_EQUAL(49.0, expectedCard, 0.1); + expectedCard = + estimate(hist, value::TypeTags::Timestamp, valueIn, EstimationType::kGreater).card; + ASSERT_APPROX_EQUAL(49.0, expectedCard, 0.1); + + const auto valueAfter = value::bitcastFrom(endTs.asULL() + 100); + expectedCard = + estimate(hist, value::TypeTags::Timestamp, valueAfter, EstimationType::kEqual).card; + ASSERT_EQ(0.0, expectedCard); + expectedCard = + estimate(hist, value::TypeTags::Timestamp, valueAfter, EstimationType::kLess).card; + ASSERT_EQ(100.0, expectedCard); + expectedCard = + estimate(hist, value::TypeTags::Timestamp, valueAfter, EstimationType::kGreater).card; + ASSERT_EQ(0.0, expectedCard); +} + +TEST(EstimatorTest, TwoBucketsObjectIdHistogram) { + const auto startOid = OID("63340d8d27afef2de7357e8d"); + const auto endOid = OID("63340dbed6cd8af737d4139a"); + ASSERT_TRUE(startOid < endOid); + + std::vector data{{Value(startOid), 2.0, 0.0, 0.0}, + {Value(endOid), 1.0, 97.0, 77.0}}; + const ScalarHistogram hist = createHistogram(data); + + ASSERT_EQ(100.0, getTotals(hist).card); + + auto [tag, value] = value::makeNewObjectId(); + value::ValueGuard vg(tag, value); + const auto oidBefore = OID("63340d8d27afef2de7357e8c"); + oidBefore.view().readInto(value::getObjectIdView(value)); + + double expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; + ASSERT_EQ(0.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; + ASSERT_EQ(0.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card; + ASSERT_EQ(100.0, expectedCard); + + // Bucket bounds. + startOid.view().readInto(value::getObjectIdView(value)); + expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; + ASSERT_EQ(2.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; + ASSERT_EQ(0.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card; + ASSERT_EQ(98.0, expectedCard); + + endOid.view().readInto(value::getObjectIdView(value)); + expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; + ASSERT_EQ(1.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; + ASSERT_EQ(99.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card; + ASSERT_EQ(0.0, expectedCard); + + // ObjectId value inside the bucket. + const auto oidInside = OID("63340db2cd4d46ff39178e9d"); + oidInside.view().readInto(value::getObjectIdView(value)); + expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; + ASSERT_APPROX_EQUAL(1.2, expectedCard, 0.1); + expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; + ASSERT_APPROX_EQUAL(83.9, expectedCard, 0.1); + expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card; + ASSERT_APPROX_EQUAL(14.8, expectedCard, 0.1); + + const auto oidAfter = OID("63340dbed6cd8af737d4139b"); + oidAfter.view().readInto(value::getObjectIdView(value)); + expectedCard = estimate(hist, tag, value, EstimationType::kEqual).card; + ASSERT_EQ(0.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kLess).card; + ASSERT_EQ(100.0, expectedCard); + expectedCard = estimate(hist, tag, value, EstimationType::kGreater).card; + ASSERT_EQ(0.0, expectedCard); +} + +} // namespace +} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/ce_histogram_test.cpp b/src/mongo/db/query/ce/ce_histogram_test.cpp index 70c884c6e8a..6e38fd707b7 100644 --- a/src/mongo/db/query/ce/ce_histogram_test.cpp +++ b/src/mongo/db/query/ce/ce_histogram_test.cpp @@ -335,11 +335,11 @@ TEST(CEHistogramTest, TestOneBoundIntRangeHistogram) { ASSERT_MATCH_CE(t, "{intRange: {$gt: 10}}", 46.0); ASSERT_MATCH_CE(t, "{intRange: {$gte: 15}}", 28.5); ASSERT_MATCH_CE(t, "{intRange: {$gt: 15}}", 23.5); - ASSERT_MATCH_CE(t, "{intRange: {$gte: 11}, intRange: {$lte: 20}}", 46.5); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 11}, intRange: {$lte: 20}}", 41.5); ASSERT_MATCH_CE(t, "{intRange: {$gt: 11}, intRange: {$lte: 20}}", 41.5); // Test ranges that partially overlap with the entire histogram. - ASSERT_MATCH_CE(t, "{intRange: {$lt: 11}}", 4.5); + ASSERT_MATCH_CE(t, "{intRange: {$lt: 11}}", 9.5); ASSERT_MATCH_CE(t, "{intRange: {$lt: 15}}", 22.5); ASSERT_MATCH_CE(t, "{intRange: {$lte: 15}}", 27.5); ASSERT_MATCH_CE(t, "{intRange: {$gte: 8}, intRange: {$lte: 15}}", 27.5); @@ -386,7 +386,7 @@ TEST(CEHistogramTest, TestOneBoundIntRangeHistogram) { // node corresponding to the path "intRange", we have two keys and two ranges, both // corresponding to the same path. As a consequence, we combine the estimates for the intervals // using exponential backoff, which results in an overestimate. - ASSERT_MATCH_CE(t, "{intRange: {$gte: 11}, intRange: {$lt: 20}}", 46.04); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 11}, intRange: {$lt: 20}}", 41.09); ASSERT_MATCH_CE(t, "{intRange: {$gt: 11}, intRange: {$lt: 20}}", 41.09); ASSERT_MATCH_CE(t, "{intRange: {$gt: 12}, intRange: {$lt: 15}}", 19.16); ASSERT_MATCH_CE(t, "{intRange: {$gte: 12}, intRange: {$lt: 15}}", 20.42); @@ -399,7 +399,7 @@ TEST(CEHistogramTest, TestOneBoundIntRangeHistogram) { t.setIndexes( {{"intRangeIndex", makeIndexDefinition("intRange", CollationOp::Ascending, /* isMultiKey */ false)}}); - ASSERT_MATCH_CE(t, "{intRange: {$gte: 11}, intRange: {$lt: 20}}", 45.5); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 11}, intRange: {$lt: 20}}", 40.5); ASSERT_MATCH_CE(t, "{intRange: {$gt: 11}, intRange: {$lt: 20}}", 40.5); ASSERT_MATCH_CE(t, "{intRange: {$gt: 12}, intRange: {$lt: 15}}", 8.5); ASSERT_MATCH_CE(t, "{intRange: {$gte: 12}, intRange: {$lt: 15}}", 13.5); @@ -647,7 +647,7 @@ TEST(CEHistogramTest, TestArrayHistogramOnCompositePredicates) { ASSERT_MATCH_CE(t, "{array: {$elemMatch: {$eq: 5}}, array: {$eq: 5}}", 35.0); // Test case with multiple predicates and ranges. - ASSERT_MATCH_CE(t, "{array: {$elemMatch: {$lt: 5}}, mixed: {$lt: 5}}", 67.88); + ASSERT_MATCH_CE(t, "{array: {$elemMatch: {$lt: 5}}, mixed: {$lt: 5}}", 68.75); ASSERT_MATCH_CE(t, "{array: {$elemMatch: {$lt: 5}}, mixed: {$gt: 5}}", 28.19); // Test multiple $elemMatches. diff --git a/src/mongo/db/query/ce/ce_interpolation_test.cpp b/src/mongo/db/query/ce/ce_interpolation_test.cpp index 043d9074b10..63c134cb131 100644 --- a/src/mongo/db/query/ce/ce_interpolation_test.cpp +++ b/src/mongo/db/query/ce/ce_interpolation_test.cpp @@ -38,11 +38,6 @@ namespace { using namespace sbe; -double estimateIntValCard(const ScalarHistogram& hist, const int v, const EstimationType type) { - auto [tag, val] = std::make_pair(value::TypeTags::NumberInt64, value::bitcastFrom(v)); - return estimate(hist, tag, val, type).card; -}; - TEST(EstimatorTest, ManualHistogram) { std::vector data{{0, 1.0, 1.0, 1.0}, {10, 1.0, 10.0, 5.0}, @@ -481,7 +476,7 @@ TEST(EstimatorTest, UniformIntMixedArrayEstimate) { highTag, highVal, true /* includeScalar */); - ASSERT_APPROX_EQUAL(90.9, expectedCard, 0.1); // Actual: 94. + ASSERT_APPROX_EQUAL(92.9, expectedCard, 0.1); // Actual: 94. // Test interpolation for query: [{$match: {a: {$elemMatch: {$gt: 500, $lt: 550}}}}]. expectedCard = estimateCardRange(arrHist, diff --git a/src/mongo/db/query/ce/ce_test_utils.cpp b/src/mongo/db/query/ce/ce_test_utils.cpp index 407b6ecd47c..c4e76b2ef39 100644 --- a/src/mongo/db/query/ce/ce_test_utils.cpp +++ b/src/mongo/db/query/ce/ce_test_utils.cpp @@ -142,12 +142,6 @@ optimizer::CEType CETester::getCE(ABT& abt) const { } ScalarHistogram createHistogram(const std::vector& data) { - sbe::value::Array array; - for (const auto& item : data) { - const auto [tag, val] = stage_builder::makeValue(item._v); - array.push_back(tag, val); - } - value::Array bounds; std::vector buckets; @@ -155,10 +149,10 @@ ScalarHistogram createHistogram(const std::vector& data) { double cumulativeNDV = 0.0; for (size_t i = 0; i < data.size(); i++) { - const auto [tag, val] = array.getAt(i); + const auto& item = data.at(i); + const auto [tag, val] = stage_builder::makeValue(item._v); bounds.push_back(tag, val); - const auto& item = data.at(i); cumulativeFreq += item._equalFreq + item._rangeFreq; cumulativeNDV += item._ndv + 1.0; buckets.emplace_back( @@ -167,4 +161,11 @@ ScalarHistogram createHistogram(const std::vector& data) { return {std::move(bounds), std::move(buckets)}; } + +double estimateIntValCard(const ScalarHistogram& hist, const int v, const EstimationType type) { + const auto [tag, val] = + std::make_pair(value::TypeTags::NumberInt64, value::bitcastFrom(v)); + return estimate(hist, tag, val, type).card; +}; + } // namespace mongo::ce diff --git a/src/mongo/db/query/ce/ce_test_utils.h b/src/mongo/db/query/ce/ce_test_utils.h index 667a3b70dc3..9e6ebdeb5e8 100644 --- a/src/mongo/db/query/ce/ce_test_utils.h +++ b/src/mongo/db/query/ce/ce_test_utils.h @@ -32,6 +32,7 @@ #include #include +#include "mongo/db/query/ce/histogram_estimation.h" #include "mongo/db/query/ce/scalar_histogram.h" #include "mongo/db/query/optimizer/cascades/interfaces.h" #include "mongo/db/query/optimizer/opt_phase_manager.h" @@ -71,14 +72,15 @@ const OptPhaseManager::PhaseSet kNoOptPhaseSet{}; * expecting. */ -#define _ASSERT_MATCH_CE(ce, predicate, expectedCE) \ - if constexpr (kCETestLogOnly) { \ - if (std::abs(ce.getCE(predicate) - expectedCE) > kMaxCEError) { \ - std::cout << "ERROR: expected " << expectedCE << std::endl; \ - } \ - ASSERT_APPROX_EQUAL(1.0, 1.0, kMaxCEError); \ - } else { \ - ASSERT_APPROX_EQUAL(expectedCE, ce.getCE(predicate), kMaxCEError); \ +#define _ASSERT_MATCH_CE(ce, predicate, expectedCE) \ + if constexpr (kCETestLogOnly) { \ + if (std::abs(ce.getCE(predicate) - expectedCE) > kMaxCEError) { \ + std::cout << "ERROR: cardinality " << ce.getCE(predicate) << " expected " \ + << expectedCE << std::endl; \ + } \ + ASSERT_APPROX_EQUAL(1.0, 1.0, kMaxCEError); \ + } else { \ + ASSERT_APPROX_EQUAL(expectedCE, ce.getCE(predicate), kMaxCEError); \ } #define _PREDICATE(field, predicate) (str::stream() << "{" << field << ": " << predicate "}") #define _ELEMMATCH_PREDICATE(field, predicate) \ @@ -159,5 +161,7 @@ struct BucketData { ScalarHistogram createHistogram(const std::vector& data); +double estimateIntValCard(const ScalarHistogram& hist, int v, EstimationType type); + } // namespace ce } // namespace mongo diff --git a/src/mongo/db/query/ce/histogram_estimation.cpp b/src/mongo/db/query/ce/histogram_estimation.cpp index fbd041380b4..52851e5118f 100644 --- a/src/mongo/db/query/ce/histogram_estimation.cpp +++ b/src/mongo/db/query/ce/histogram_estimation.cpp @@ -95,14 +95,16 @@ EstimationResult interpolateEstimateInBucket(const ScalarHistogram& h, } } - resultCard += bucket._rangeFreq * ratio; + const double bucketFreqRatio = bucket._rangeFreq * ratio; + resultCard += bucketFreqRatio; resultNDV += bucket._ndv * ratio; if (type == EstimationType::kLess) { // Subtract from the estimate the cardinality and ndv corresponding to the equality - // operation. + // operation, if they are larger than the ratio taken from this bucket. + const double innerEqFreqCorrection = (bucketFreqRatio < innerEqFreq) ? 0.0 : innerEqFreq; const double innerEqNdv = (bucket._ndv * ratio <= 1.0) ? 0.0 : 1.0; - resultCard -= innerEqFreq; + resultCard -= innerEqFreqCorrection; resultNDV -= innerEqNdv; } return {resultCard, resultNDV}; diff --git a/src/mongo/db/query/ce/maxdiff_histogram_test.cpp b/src/mongo/db/query/ce/maxdiff_histogram_test.cpp index 78c6490a0e9..2192b586237 100644 --- a/src/mongo/db/query/ce/maxdiff_histogram_test.cpp +++ b/src/mongo/db/query/ce/maxdiff_histogram_test.cpp @@ -129,9 +129,8 @@ TEST_F(HistogramTest, MaxDiffTestInt) { ASSERT_LTE(hist.getBuckets().size(), nBuckets); const double estimatedCard = estimateCard(hist, 11, EstimationType::kLess); - ASSERT_EQ(36, actualCard); - ASSERT_APPROX_EQUAL(39.73333, estimatedCard, kTolerance); + ASSERT_APPROX_EQUAL(43.7333, estimatedCard, kTolerance); } TEST_F(HistogramTest, MaxDiffTestString) { @@ -159,7 +158,7 @@ TEST_F(HistogramTest, MaxDiffTestString) { const double estimatedCard = estimate(hist, tag, val, EstimationType::kLess).card; ASSERT_EQ(15, actualCard); - ASSERT_APPROX_EQUAL(10.9443, estimatedCard, kTolerance); + ASSERT_APPROX_EQUAL(15.9443, estimatedCard, kTolerance); } TEST_F(HistogramTest, MaxDiffTestMixedTypes) { diff --git a/src/mongo/db/query/ce/value_utils.cpp b/src/mongo/db/query/ce/value_utils.cpp index 9838f91a285..f4bfd0e8b53 100644 --- a/src/mongo/db/query/ce/value_utils.cpp +++ b/src/mongo/db/query/ce/value_utils.cpp @@ -156,6 +156,18 @@ double valueToDouble(value::TypeTags tag, value::Value val) { const double charToDbl = ch / std::pow(2, i * 8); result += charToDbl; } + } else if (tag == value::TypeTags::Date || tag == value::TypeTags::Timestamp) { + int64_t v = value::bitcastTo(val); + result = value::numericCast(value::TypeTags::NumberInt64, v); + + } else if (tag == value::TypeTags::ObjectId) { + auto objView = + ConstDataView(reinterpret_cast(sbe::value::getObjectIdView(val)->data())); + // Take the first 8 bytes of the ObjectId. + // ToDo: consider using the entire ObjectId or other parts of it + // auto v = objView.read>(sizeof(uint32_t)); + auto v = objView.read>(); + result = value::numericCast(value::TypeTags::NumberInt64, v); } else { uassert(6844500, "Unexpected value type", false); } -- cgit v1.2.1