/** * Copyright (C) 2022-present MongoDB, Inc. * * This program is free software: you can redistribute it and/or modify * it under the terms of the Server Side Public License, version 1, * as published by MongoDB, Inc. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * Server Side Public License for more details. * * You should have received a copy of the Server Side Public License * along with this program. If not, see * . * * As a special exception, the copyright holders give permission to link the * code of portions of this program with the OpenSSL library under certain * conditions as described in each individual source file and distribute * linked combinations including the program with the OpenSSL library. You * must comply with the Server Side Public License in all respects for * all of the code used other than as permitted herein. If you modify file(s) * with this exception, you may extend this exception to your version of the * file(s), but you are not obligated to do so. If you do not wish to do so, * delete this exception statement from your version. If you delete this * exception statement from all source files in the program, then also delete * it in the license file. */ #include "mongo/db/query/ce/ce_histogram.h" #include "mongo/db/query/ce/ce_test_utils.h" #include "mongo/db/query/ce/collection_statistics_mock.h" #include "mongo/db/query/ce/histogram_estimation.h" #include "mongo/db/query/optimizer/utils/unit_test_utils.h" #include "mongo/db/query/sbe_stage_builder_helpers.h" #include "mongo/unittest/unittest.h" namespace mongo::ce { namespace { using namespace optimizer; using namespace cascades; std::string collName("test"); class CEHistogramTester : public CETester { public: CEHistogramTester(std::string collName, double numRecords, std::shared_ptr stats) : CETester(collName, numRecords), _stats{stats} {} protected: std::unique_ptr getCETransport() const override { // making a copy of CollecitonStatistics to override return std::make_unique(_stats); } private: std::shared_ptr _stats; }; struct TestBucket { Value val; int equalFreq; int rangeFreq = 0; int ndv = 1; /* ndv including bucket boundary*/ }; ScalarHistogram getHistogramFromData(std::vector testBuckets) { sbe::value::Array bounds; std::vector buckets; int cumulativeFreq = 0; int cumulativeNDV = 0; for (const auto& b : testBuckets) { // Add bucket boundary value to bounds. auto [tag, val] = stage_builder::makeValue(b.val); bounds.push_back(tag, val); cumulativeFreq += b.equalFreq + b.rangeFreq; cumulativeNDV += b.ndv; // Create a histogram bucket. buckets.emplace_back(b.equalFreq, b.rangeFreq, cumulativeFreq, b.ndv - 1, /* ndv excluding bucket boundary*/ cumulativeNDV); } return ScalarHistogram(std::move(bounds), std::move(buckets)); } TypeCounts getTypeCountsFromData(std::vector testBuckets) { TypeCounts typeCounts; for (const auto& b : testBuckets) { // Add bucket boundary value to bounds. auto sbeVal = stage_builder::makeValue(b.val); auto [tag, val] = sbeVal; // Increment count of values for each type tag. if (auto it = typeCounts.find(tag); it != typeCounts.end()) { it->second += b.equalFreq + b.rangeFreq; } else { typeCounts[tag] = b.equalFreq + b.rangeFreq; } } return typeCounts; } std::unique_ptr getArrayHistogramFromData(std::vector testBuckets) { return std::make_unique(getHistogramFromData(testBuckets), getTypeCountsFromData(testBuckets)); } std::unique_ptr getArrayHistogramFromData( std::vector scalarBuckets, std::vector arrayUniqueBuckets, std::vector arrayMinBuckets, std::vector arrayMaxBuckets, TypeCounts arrayTypeCounts) { return std::make_unique(getHistogramFromData(scalarBuckets), getTypeCountsFromData(scalarBuckets), getHistogramFromData(arrayUniqueBuckets), getHistogramFromData(arrayMinBuckets), getHistogramFromData(arrayMaxBuckets), std::move(arrayTypeCounts)); } TEST(CEHistogramTest, AssertSmallMaxDiffHistogramEstimatesAtomicPredicates) { const auto collCardinality = 8; std::shared_ptr collStats(new CollectionStatisticsMock(collCardinality)); // Construct a histogram with two buckets: one for 3 ints equal to 1, another for 5 strings // equal to "ing". const std::string& str = "ing"; collStats->addHistogram("a", getArrayHistogramFromData({ {Value(1), 3 /* frequency */}, {Value(str), 5 /* frequency */}, })); CEHistogramTester t(collName, collCardinality, collStats); // Test $eq. ASSERT_MATCH_CE(t, "{a: {$eq: 1}}", 3.0); ASSERT_MATCH_CE(t, "{a: {$eq: 2}}", 0.0); ASSERT_MATCH_CE(t, "{a: {$eq: \"ing\"}}", 5.0); ASSERT_MATCH_CE(t, "{a: {$eq: \"foo\"}}", 0.0); // Test case when field doesn't match fieldpath of histogram. This falls back to heuristics. ASSERT_MATCH_CE(t, "{b: {$eq: 1}}", 2.82843); // Test $gt. ASSERT_MATCH_CE(t, "{a: {$gt: 3}}", 0.0); ASSERT_MATCH_CE(t, "{a: {$gt: 1}}", 0.0); ASSERT_MATCH_CE(t, "{a: {$gt: 0}}", 3.0); ASSERT_MATCH_CE(t, "{a: {$gt: \"bar\"}}", 5.0); ASSERT_MATCH_CE(t, "{a: {$gt: \"ing\"}}", 0.0); ASSERT_MATCH_CE(t, "{a: {$gt: \"zap\"}}", 0.0); // Test $lt. ASSERT_MATCH_CE(t, "{a: {$lt: 3}}", 3.0); ASSERT_MATCH_CE(t, "{a: {$lt: 1}}", 0.0); ASSERT_MATCH_CE(t, "{a: {$lt: 0}}", 0.0); ASSERT_MATCH_CE(t, "{a: {$lt: \"bar\"}}", 0.0); ASSERT_MATCH_CE(t, "{a: {$lt: \"ing\"}}", 0.0); ASSERT_MATCH_CE(t, "{a: {$lt: \"zap\"}}", 5.0); // Test $gte. ASSERT_MATCH_CE(t, "{a: {$gte: 3}}", 0.0); ASSERT_MATCH_CE(t, "{a: {$gte: 1}}", 3.0); ASSERT_MATCH_CE(t, "{a: {$gte: 0}}", 3.0); ASSERT_MATCH_CE(t, "{a: {$gte: \"bar\"}}", 5.0); ASSERT_MATCH_CE(t, "{a: {$gte: \"ing\"}}", 5.0); ASSERT_MATCH_CE(t, "{a: {$gte: \"zap\"}}", 0.0); // Test $lte. ASSERT_MATCH_CE(t, "{a: {$lte: 3}}", 3.0); ASSERT_MATCH_CE(t, "{a: {$lte: 1}}", 3.0); ASSERT_MATCH_CE(t, "{a: {$lte: 0}}", 0.0); ASSERT_MATCH_CE(t, "{a: {$lte: \"bar\"}}", 0.0); ASSERT_MATCH_CE(t, "{a: {$lte: \"ing\"}}", 5.0); ASSERT_MATCH_CE(t, "{a: {$lte: \"zap\"}}", 5.0); } TEST(CEHistogramTest, AssertSmallHistogramEstimatesComplexPredicates) { const auto collCardinality = 9; std::shared_ptr collStats(new CollectionStatisticsMock(collCardinality)); // Construct a histogram with three int buckets for field 'a'. collStats->addHistogram("a", getArrayHistogramFromData({ {Value(1), 3 /* frequency */}, {Value(2), 5 /* frequency */}, {Value(3), 1 /* frequency */}, })); // Construct a histogram with two int buckets for field 'b'. collStats->addHistogram("b", getArrayHistogramFromData({ {Value(22), 3 /* frequency */}, {Value(33), 6 /* frequency */}, })); CEHistogramTester t(collName, collCardinality, collStats); // Test simple conjunctions on one field. Note the first example: the range we expect to see // here is (1, 3); however, the structure in the SargableNode gives us a conjunction of two // intervals instead: (1, "") ^ (nan, 3) This is then estimated using exponential backoff to // give us a less accurate result. The correct cardinality here would be 5. ASSERT_MATCH_CE(t, "{a: {$gt: 1}, a: {$lt: 3}}", 5.66); ASSERT_MATCH_CE(t, "{a: {$gt: 1}, a: {$lte: 3}}", 6.0); ASSERT_MATCH_CE(t, "{a: {$gte: 1}, a: {$lt: 3}}", 8.0); ASSERT_MATCH_CE(t, "{a: {$gte: 1}, a: {$lte: 3}}", 9.0); // Test ranges which exclude each other. ASSERT_MATCH_CE(t, "{a: {$lt: 1}, a: {$gt: 3}}", 0.0); // Test overlapping ranges. This is a similar case to {a: {$gt: 1}, a: {$lt: 3}} above: we // expect to see the range [2, 2]; instead, we see the range [nan, 2] ^ [2, ""). ASSERT_MATCH_CE(t, "{a: {$lte: 2}, a: {$gte: 2}}", 5.66); // Test conjunctions over multiple fields for which we have histograms. Here we expect a // cardinality estimated by exponential backoff. ASSERT_MATCH_CE(t, "{a: {$eq: 2}, b: {$eq: 22}}", 2.24); ASSERT_MATCH_CE(t, "{a: {$eq: 11}, b: {$eq: 22}}", 0.0); ASSERT_MATCH_CE(t, "{a: {$gt: 11}, a: {$lte: 100}, b: {$eq: 22}}", 0.0); ASSERT_MATCH_CE(t, "{a: {$lt: 3}, a: {$gte: 1}, b: {$lt: 100}, b: {$gt: 30}}", 5.66); // Test conjunctions over multiple fields for which we may not have histograms. This falls back // to heuristic estimation. ASSERT_MATCH_CE(t, "{a: {$eq: 2}, c: {$eq: 1}}", 1.73205); ASSERT_MATCH_CE(t, "{c: {$eq: 2}, d: {$eq: 22}}", 1.73205); } TEST(CEHistogramTest, SanityTestEmptyHistogram) { const auto collCardinality = 0; std::shared_ptr collStats(new CollectionStatisticsMock(collCardinality)); collStats->addHistogram("empty", std::make_unique()); CEHistogramTester t(collName, collCardinality, collStats); ASSERT_MATCH_CE(t, "{empty: {$eq: 1.0}}", 0.0); ASSERT_MATCH_CE(t, "{empty: {$lt: 1.0}, empty: {$gt: 0.0}}", 0.0); ASSERT_MATCH_CE(t, "{empty: {$eq: 1.0}, other: {$eq: \"anything\"}}", 0.0); ASSERT_MATCH_CE(t, "{other: {$eq: \"anything\"}, empty: {$eq: 1.0}}", 0.0); } TEST(CEHistogramTest, TestOneBucketOneIntHistogram) { const auto collName = "test"; const auto collCardinality = 50; std::shared_ptr collStats(new CollectionStatisticsMock(collCardinality)); // Create a histogram with a single bucket that contains exactly one int (42) with a frequency // of 50 (equal to the collection cardinality). collStats->addHistogram("soloInt", getArrayHistogramFromData({ {Value(42), collCardinality /* frequency */}, })); CEHistogramTester t(collName, collCardinality, collStats); // Check against a variety of intervals that include 42 as a bound. ASSERT_MATCH_CE(t, "{soloInt: {$eq: 42}}", collCardinality); ASSERT_MATCH_CE(t, "{soloInt: {$lt: 42}}", 0.0); ASSERT_MATCH_CE(t, "{soloInt: {$lte: 42}}", collCardinality); ASSERT_MATCH_CE(t, "{soloInt: {$gt: 42}}", 0.0); ASSERT_MATCH_CE(t, "{soloInt: {$gte: 42}}", collCardinality); ASSERT_MATCH_CE(t, "{soloInt: {$gt: 42}, soloInt: {$lt: 42}}", 0.0); ASSERT_MATCH_CE(t, "{soloInt: {$gt: 42}, soloInt: {$lte: 42}}", 0.0); ASSERT_MATCH_CE(t, "{soloInt: {$gte: 42}, soloInt: {$lt: 42}}", 0.0); ASSERT_MATCH_CE(t, "{soloInt: {$gte: 42}, soloInt: {$lte: 42}}", collCardinality); // Check against a variety of intervals that include 42 only as one bound. ASSERT_MATCH_CE(t, "{soloInt: {$gt: 42}, soloInt: {$lt: 43}}", 0.0); ASSERT_MATCH_CE(t, "{soloInt: {$gt: 42}, soloInt: {$lte: 43}}", 0.0); ASSERT_MATCH_CE(t, "{soloInt: {$gte: 42}, soloInt: {$lt: 43}}", collCardinality); ASSERT_MATCH_CE(t, "{soloInt: {$gte: 42}, soloInt: {$lte: 43}}", collCardinality); ASSERT_MATCH_CE(t, "{soloInt: {$gt: 41}, soloInt: {$lt: 42}}", 0.0); ASSERT_MATCH_CE(t, "{soloInt: {$gt: 41}, soloInt: {$lte: 42}}", collCardinality); ASSERT_MATCH_CE(t, "{soloInt: {$gte: 41}, soloInt: {$lt: 42}}", 0.0); ASSERT_MATCH_CE(t, "{soloInt: {$gte: 41}, soloInt: {$lte: 42}}", collCardinality); // Check against a variety of intervals close to 42 using a lower bound of 41 and a higher bound // of 43. ASSERT_MATCH_CE(t, "{soloInt: {$eq: 41}}", 0.0); ASSERT_MATCH_CE(t, "{soloInt: {$eq: 43}}", 0.0); ASSERT_MATCH_CE(t, "{soloInt: {$lt: 43}}", collCardinality); ASSERT_MATCH_CE(t, "{soloInt: {$lte: 43}}", collCardinality); ASSERT_MATCH_CE(t, "{soloInt: {$gt: 41}}", collCardinality); ASSERT_MATCH_CE(t, "{soloInt: {$gte: 41}}", collCardinality); ASSERT_MATCH_CE(t, "{soloInt: {$gt: 41}, soloInt: {$lt: 43}}", collCardinality); ASSERT_MATCH_CE(t, "{soloInt: {$gte: 41}, soloInt: {$lt: 43}}", collCardinality); ASSERT_MATCH_CE(t, "{soloInt: {$gt: 41}, soloInt: {$lte: 43}}", collCardinality); ASSERT_MATCH_CE(t, "{soloInt: {$gte: 41}, soloInt: {$lte: 43}}", collCardinality); // Check against different types. ASSERT_MATCH_CE(t, "{soloInt: {$eq: \"42\"}}", 0.0); ASSERT_MATCH_CE(t, "{soloInt: {$lt: \"42\"}}", 0.0); ASSERT_MATCH_CE(t, "{soloInt: {$lt: 42.1}}", collCardinality); } TEST(CEHistogramTest, TestOneBoundIntRangeHistogram) { const auto collName = "test"; const auto collCardinality = 51; std::shared_ptr collStats(new CollectionStatisticsMock(collCardinality)); collStats->addHistogram( "intRange", getArrayHistogramFromData({ {Value(10), 5 /* frequency */}, {Value(20), 1 /* frequency */, 45 /* range frequency */, 10 /* ndv */}, })); CEHistogramTester t(collName, collCardinality, collStats); // Test ranges that overlap only with the lower bound. // Note: 5 values equal 10. ASSERT_MATCH_CE(t, "{intRange: {$eq: 10}}", 5.0); ASSERT_MATCH_CE(t, "{intRange: {$lte: 10}}", 5.0); ASSERT_MATCH_CE(t, "{intRange: {$lte: 10}, intRange: {$gte: 10}}", 5.0); // Test ranges that overlap only with the upper bound. ASSERT_MATCH_CE(t, "{intRange: {$eq: 11}}", 5.0); ASSERT_MATCH_CE(t, "{intRange: {$eq: 15}}", 5.0); ASSERT_MATCH_CE(t, "{intRange: {$eq: 15.5}}", 5.0); ASSERT_MATCH_CE(t, "{intRange: {$eq: 20}}", 1.0); ASSERT_MATCH_CE(t, "{intRange: {$gte: 20}}", 1.0); ASSERT_MATCH_CE(t, "{intRange: {$gt: 10}}", 46.0); ASSERT_MATCH_CE(t, "{intRange: {$gte: 15}}", 28.5); ASSERT_MATCH_CE(t, "{intRange: {$gt: 15}}", 23.5); ASSERT_MATCH_CE(t, "{intRange: {$gte: 11}, intRange: {$lte: 20}}", 46.5); ASSERT_MATCH_CE(t, "{intRange: {$gt: 11}, intRange: {$lte: 20}}", 41.5); // Test ranges that partially overlap with the entire histogram. ASSERT_MATCH_CE(t, "{intRange: {$lt: 11}}", 4.5); ASSERT_MATCH_CE(t, "{intRange: {$lt: 15}}", 22.5); ASSERT_MATCH_CE(t, "{intRange: {$lte: 15}}", 27.5); ASSERT_MATCH_CE(t, "{intRange: {$gte: 8}, intRange: {$lte: 15}}", 27.5); ASSERT_MATCH_CE(t, "{intRange: {$gt: 8}, intRange: {$lte: 15}}", 27.5); ASSERT_MATCH_CE(t, "{intRange: {$gt: 8}, intRange: {$lt: 15}}", 22.5); ASSERT_MATCH_CE(t, "{intRange: {$gte: 8}, intRange: {$lte: 15}}", 27.5); // Test ranges that include all values in the histogram. ASSERT_MATCH_CE(t, "{intRange: {$gte: 10}, intRange: {$lte: 20}}", collCardinality); ASSERT_MATCH_CE(t, "{intRange: {$gte: 1}, intRange: {$lte: 30}}", collCardinality); ASSERT_MATCH_CE(t, "{intRange: {$gt: 1}, intRange: {$lt: 30}}", collCardinality); ASSERT_MATCH_CE(t, "{intRange: {$gt: 1}, intRange: {$lte: 30}}", collCardinality); ASSERT_MATCH_CE(t, "{intRange: {$gte: 1}, intRange: {$lt: 30}}", collCardinality); ASSERT_MATCH_CE(t, "{intRange: {$gt: 0}}", collCardinality); ASSERT_MATCH_CE(t, "{intRange: {$gte: 0}}", collCardinality); ASSERT_MATCH_CE(t, "{intRange: {$lt: 100}}", collCardinality); ASSERT_MATCH_CE(t, "{intRange: {$lte: 100}}", collCardinality); // Test ranges that are fully included in the histogram. ASSERT_MATCH_CE(t, "{intRange: {$eq: 10.5}}", 5.0); ASSERT_MATCH_CE(t, "{intRange: {$eq: 12.5}}", 5.0); ASSERT_MATCH_CE(t, "{intRange: {$eq: 19.36}}", 5.0); // Test ranges that don't overlap with the histogram. ASSERT_MATCH_CE(t, "{intRange: {$lt: 10}}", 0.0); ASSERT_MATCH_CE(t, "{intRange: {$lt: 5}}", 0.0); ASSERT_MATCH_CE(t, "{intRange: {$lte: 5}}", 0.0); ASSERT_MATCH_CE(t, "{intRange: {$eq: 20.1}}", 0.0); ASSERT_MATCH_CE(t, "{intRange: {$eq: 21}}", 0.0); ASSERT_MATCH_CE(t, "{intRange: {$gte: 21}}", 0.0); ASSERT_MATCH_CE(t, "{intRange: {$gt: 20}}", 0.0); ASSERT_MATCH_CE(t, "{intRange: {$gt: 100}}", 0.0); ASSERT_MATCH_CE(t, "{intRange: {$gte: 30}, intRange: {$lte: 50}}", 0.0); ASSERT_MATCH_CE(t, "{intRange: {$gt: 30}, intRange: {$lt: 50}}", 0.0); ASSERT_MATCH_CE(t, "{intRange: {$gte: 30}, intRange: {$lt: 50}}", 0.0); ASSERT_MATCH_CE(t, "{intRange: {$gt: 30}, intRange: {$lte: 50}}", 0.0); ASSERT_MATCH_CE(t, "{intRange: {$gte: 0}, intRange: {$lte: 5}}", 0.0); ASSERT_MATCH_CE(t, "{intRange: {$gt: 0}, intRange: {$lt: 5}}", 0.0); ASSERT_MATCH_CE(t, "{intRange: {$gte: 0}, intRange: {$lt: 5}}", 0.0); ASSERT_MATCH_CE(t, "{intRange: {$gt: 0}, intRange: {$lte: 5}}", 0.0); // Because we don't specify any indexes here, these intervals do not go through simplification. // This means that instead of having one key in the requirements map of the generated sargable // node corresponding to the path "intRange", we have two keys and two ranges, both // corresponding to the same path. As a consequence, we combine the estimates for the intervals // using exponential backoff, which results in an overestimate. ASSERT_MATCH_CE(t, "{intRange: {$gte: 11}, intRange: {$lt: 20}}", 46.04); ASSERT_MATCH_CE(t, "{intRange: {$gt: 11}, intRange: {$lt: 20}}", 41.09); ASSERT_MATCH_CE(t, "{intRange: {$gt: 12}, intRange: {$lt: 15}}", 19.16); ASSERT_MATCH_CE(t, "{intRange: {$gte: 12}, intRange: {$lt: 15}}", 20.42); ASSERT_MATCH_CE(t, "{intRange: {$gt: 12}, intRange: {$lte: 15}}", 23.42); ASSERT_MATCH_CE(t, "{intRange: {$gte: 12}, intRange: {$lte: 15}}", 24.96); ASSERT_MATCH_CE(t, "{intRange: {$lt: 19}, intRange: {$gt: 11}}", 36.53); // When we specify that there is a non-multikey index on 'intRange', we expect to see interval // simplification occurring, which should provide a better estimate for the following ranges. t.setIndexes( {{"intRangeIndex", makeIndexDefinition("intRange", CollationOp::Ascending, /* isMultiKey */ false)}}); ASSERT_MATCH_CE(t, "{intRange: {$gte: 11}, intRange: {$lt: 20}}", 45.5); ASSERT_MATCH_CE(t, "{intRange: {$gt: 11}, intRange: {$lt: 20}}", 40.5); ASSERT_MATCH_CE(t, "{intRange: {$gt: 12}, intRange: {$lt: 15}}", 8.5); ASSERT_MATCH_CE(t, "{intRange: {$gte: 12}, intRange: {$lt: 15}}", 13.5); ASSERT_MATCH_CE(t, "{intRange: {$gt: 12}, intRange: {$lte: 15}}", 13.5); ASSERT_MATCH_CE(t, "{intRange: {$gte: 12}, intRange: {$lte: 15}}", 18.5); ASSERT_MATCH_CE(t, "{intRange: {$lt: 19}, intRange: {$gt: 11}}", 31.0); } TEST(CEHistogramTest, TestHistogramOnNestedPaths) { const auto collCardinality = 50; std::shared_ptr collStats(new CollectionStatisticsMock(collCardinality)); // Create a histogram with a single bucket that contains exactly one int (42) with a frequency // of 50 (equal to the collection cardinality). collStats->addHistogram("path", getArrayHistogramFromData({ {Value(42), collCardinality /* frequency */}, })); collStats->addHistogram("a.histogram.path", getArrayHistogramFromData({ {Value(42), collCardinality /* frequency */}, })); CEHistogramTester t(collName, collCardinality, collStats); ASSERT_MATCH_CE(t, "{\"not.a.histogram.path\": {$eq: 42}}", 7.071 /* heuristic */); ASSERT_MATCH_CE(t, "{\"a.histogram.path\": {$eq: 42}}", collCardinality); ASSERT_MATCH_CE( t, "{\"a.histogram.path.with.no.histogram\": {$eq: 42}}", 7.071 /* heuristic */); // TODO SERVER-68596: this doesn't generate a SargableNode. When it does, it should return 0.0. // Currently, this estimate falls back to heuristic CE. ASSERT_MATCH_CE(t, "{\"a.histogram.path\": {$elemMatch: {$eq: 42}}}", 6.21); } TEST(CEHistogramTest, TestArrayHistogramOnAtomicPredicates) { const auto collName = "test"; const auto collCardinality = 6; std::shared_ptr collStats(new CollectionStatisticsMock(collCardinality)); collStats->addHistogram( "a", // Generate a histogram for this data: // {a: 1}, {a: 2}, {a: [1, 2, 3, 2, 2]}, {a: [10]}, {a: [2, 3, 3, 4, 5, 5, 6]}, {a: []} // - scalars: [1, 2] // - unique values: [1], [2], [1, 2, 3], [10], [2, 3, 4, 5, 6] // -> [1, 1, 2, 2, 2, 3, 3, 4, 5, 6, 10] // - min values: [1], [10], [2] -> [1, 1, 2, 2, 10] // - max values: [3], [10], [6] -> [1, 2, 3, 6, 10] getArrayHistogramFromData( {// Scalar buckets. {Value(1), 1 /* frequency */}, {Value(2), 1 /* frequency */}}, { // Array unique buckets. {Value(1), 1 /* frequency */}, {Value(2), 2 /* frequency */}, {Value(3), 2 /* frequency */}, {Value(4), 1 /* frequency */}, {Value(5), 1 /* frequency */}, {Value(6), 1 /* frequency */}, {Value(10), 1 /* frequency */}, }, { // Array min buckets. {Value(1), 1 /* frequency */}, {Value(2), 1 /* frequency */}, {Value(10), 1 /* frequency */}, }, { // Array max buckets. {Value(3), 1 /* frequency */}, {Value(6), 1 /* frequency */}, {Value(10), 1 /* frequency */}, }, {{sbe::value::TypeTags::NumberInt32, 4}} // Array type counts. )); CEHistogramTester t(collName, collCardinality, collStats); // Test simple predicates against 'a'. Note: in the $elemMatch case, we exclude scalar // estimates. Without $elemMatch, we add the array histogram and scalar histogram estimates // together. // Test equality predicates. ASSERT_EQ_ELEMMATCH_CE(t, 0.0 /* CE */, 0.0 /* $elemMatch CE */, "a", "{$eq: 0}"); ASSERT_EQ_ELEMMATCH_CE(t, 2.0 /* CE */, 1.0 /* $elemMatch CE */, "a", "{$eq: 1}"); ASSERT_EQ_ELEMMATCH_CE(t, 3.0 /* CE */, 2.0 /* $elemMatch CE */, "a", "{$eq: 2}"); ASSERT_EQ_ELEMMATCH_CE(t, 2.0 /* CE */, 2.0 /* $elemMatch CE */, "a", "{$eq: 3}"); ASSERT_EQ_ELEMMATCH_CE(t, 1.0 /* CE */, 1.0 /* $elemMatch CE */, "a", "{$eq: 4}"); ASSERT_EQ_ELEMMATCH_CE(t, 1.0 /* CE */, 1.0 /* $elemMatch CE */, "a", "{$eq: 5}"); ASSERT_EQ_ELEMMATCH_CE(t, 1.0 /* CE */, 1.0 /* $elemMatch CE */, "a", "{$eq: 6}"); ASSERT_EQ_ELEMMATCH_CE(t, 1.0 /* CE */, 1.0 /* $elemMatch CE */, "a", "{$eq: 10}"); ASSERT_EQ_ELEMMATCH_CE(t, 0.0 /* CE */, 0.0 /* $elemMatch CE */, "a", "{$eq: 11}"); // Test histogram boundary values. ASSERT_EQ_ELEMMATCH_CE(t, 0.0 /* CE */, 0.0 /* $elemMatch CE */, "a", "{$lt: 1}"); ASSERT_EQ_ELEMMATCH_CE(t, 2.0 /* CE */, 1.0 /* $elemMatch CE */, "a", "{$lte: 1}"); ASSERT_EQ_ELEMMATCH_CE(t, 0.0 /* CE */, 0.0 /* $elemMatch CE */, "a", "{$gt: 10}"); ASSERT_EQ_ELEMMATCH_CE(t, 1.0 /* CE */, 1.0 /* $elemMatch CE */, "a", "{$gte: 10}"); ASSERT_EQ_ELEMMATCH_CE(t, 5.0 /* CE */, 3.0 /* $elemMatch CE */, "a", "{$lte: 10}"); ASSERT_EQ_ELEMMATCH_CE(t, 4.0 /* CE */, 3.0 /* $elemMatch CE */, "a", "{$lt: 10}"); ASSERT_EQ_ELEMMATCH_CE(t, 4.0 /* CE */, 3.0 /* $elemMatch CE */, "a", "{$gt: 1}"); ASSERT_EQ_ELEMMATCH_CE(t, 5.0 /* CE */, 3.0 /* $elemMatch CE */, "a", "{$gte: 1}"); ASSERT_EQ_ELEMMATCH_CE(t, 4.0 /* CE */, 3.0 /* $elemMatch CE */, "a", "{$lte: 5}"); ASSERT_EQ_ELEMMATCH_CE(t, 4.0 /* CE */, 3.0 /* $elemMatch CE */, "a", "{$lt: 5}"); ASSERT_EQ_ELEMMATCH_CE(t, 2.0 /* CE */, 2.0 /* $elemMatch CE */, "a", "{$gt: 5}"); ASSERT_EQ_ELEMMATCH_CE(t, 2.0 /* CE */, 2.41 /* $elemMatch CE */, "a", "{$gte: 5}"); ASSERT_EQ_ELEMMATCH_CE(t, 2.45 /* CE */, 2.41 /* $elemMatch CE */, "a", "{$gt: 2, $lt: 5}"); ASSERT_EQ_ELEMMATCH_CE(t, 3.27 /* CE */, 3.0 /* $elemMatch CE */, "a", "{$gte: 2, $lt: 5}"); ASSERT_EQ_ELEMMATCH_CE(t, 2.45 /* CE */, 3.0 /* $elemMatch CE */, "a", "{$gt: 2, $lte: 5}"); ASSERT_EQ_ELEMMATCH_CE(t, 3.27 /* CE */, 3.0 /* $elemMatch CE */, "a", "{$gte: 2, $lte: 5}"); } TEST(CEHistogramTest, TestArrayHistogramOnCompositePredicates) { const auto collName = "test"; const auto collCardinality = 175; std::shared_ptr collStats(new CollectionStatisticsMock(collCardinality)); // A scalar histogram with values in the range [1,10], most of which are in the middle bucket. collStats->addHistogram( "scalar", getArrayHistogramFromData({ {Value(1), 10 /* frequency */}, {Value(2), 10 /* frequency */}, {Value(3), 20 /* frequency */, 120 /* range frequency */, 5 /* ndv */}, {Value(8), 5 /* frequency */, 10 /* range frequency */, 3 /* ndv */}, })); // An array histogram built on the following arrays with 35 occurrences of each: // [[1, 2, 3], [5, 5, 5, 5, 5], [6], [], [8, 9, 10]] collStats->addHistogram( "array", getArrayHistogramFromData( {/* No scalar buckets. */}, { // Array unique buckets. {Value(1), 35 /* frequency */}, {Value(2), 35 /* frequency */, 35 /* range frequency */, 2 /* ndv */}, {Value(5), 35 /* frequency */}, {Value(6), 35 /* frequency */, 105 /* range frequency */, 4 /* ndv */}, }, { // Array min buckets. {Value(1), 35 /* frequency */}, {Value(5), 35 /* frequency */}, {Value(6), 35 /* frequency */}, {Value(8), 35 /* frequency */}, }, { // Array max buckets. {Value(3), 35 /* frequency */}, {Value(5), 35 /* frequency */}, {Value(6), 35 /* frequency */}, {Value(10), 35 /* frequency */}, }, {{sbe::value::TypeTags::NumberInt32, collCardinality}} // Array type counts. )); collStats->addHistogram( "mixed", // The mixed histogram has 87 scalars that follow approximately the same distribution as // in the pure scalar case, and 88 arrays with the following distribution: // [{[1, 2, 3]: 17}, {[5, 5, 5, 5, 5]: 17}, {[6]: 17}, {[]: 20}, {[8, 9, 10]: 17}] getArrayHistogramFromData( { // Scalar buckets. {Value(1), 5 /* frequency */}, {Value(2), 5 /* frequency */}, {Value(3), 10 /* frequency */, 60 /* range frequency */, 5 /* ndv */}, {Value(8), 2 /* frequency */, 5 /* range frequency */, 3 /* ndv */}, }, { // Array unique buckets. {Value(1), 17 /* frequency */, 34 /* range frequency */, 3 /* ndv */}, {Value(5), 17 /* frequency */}, {Value(6), 17 /* frequency */, 51 /* range frequency */, 4 /* ndv */}, }, { // Array min buckets. {Value(1), 17 /* frequency */}, {Value(5), 17 /* frequency */}, {Value(6), 17 /* frequency */}, {Value(8), 17 /* frequency */}, }, { // Array max buckets. {Value(3), 17 /* frequency */}, {Value(5), 17 /* frequency */}, {Value(6), 17 /* frequency */}, {Value(10), 17 /* frequency */}, }, {{sbe::value::TypeTags::NumberInt32, 88}} // Array type counts. )); CEHistogramTester t(collName, collCardinality, collStats); // Test cardinality of individual predicates. ASSERT_EQ_ELEMMATCH_CE(t, 5.0 /* CE */, 0.0 /* $elemMatch CE */, "scalar", "{$eq: 5}"); ASSERT_EQ_ELEMMATCH_CE(t, 35.0 /* CE */, 35.0 /* $elemMatch CE */, "array", "{$eq: 5}"); ASSERT_EQ_ELEMMATCH_CE(t, 19.5 /* CE */, 17.0 /* $elemMatch CE */, "mixed", "{$eq: 5}"); // Test cardinality of predicate combinations; the following tests make sure we correctly track // which paths have $elemMatches and which don't. Some notes: // - Whenever we use 'scalar' + $elemMatch, we expect an estimate of 0 because $elemMatch never // returns documents on non-array paths. // - Whenever we use 'mixed' + $elemMatch, we expect the estimate to decrease because we omit // scalar values in 'mixed' from our estimate. // - We do not expect the estimate on 'array' to be affected by the presence of $elemMatch, // since we only have array values for this field. // Composite predicate on 'scalar' and 'array' fields. ASSERT_MATCH_CE(t, "{scalar: {$eq: 5}, array: {$eq: 5}}", 2.236); ASSERT_MATCH_CE(t, "{scalar: {$eq: 5}, array: {$elemMatch: {$eq: 5}}}", 2.236); ASSERT_MATCH_CE(t, "{scalar: {$elemMatch: {$eq: 5}}, array: {$eq: 5}}", 0.0); // Composite predicate on 'mixed' and 'array' fields. ASSERT_MATCH_CE(t, "{mixed: {$eq: 5}, array: {$eq: 5}}", 8.721); ASSERT_MATCH_CE(t, "{mixed: {$eq: 5}, array: {$elemMatch: {$eq: 5}}}", 8.721); ASSERT_MATCH_CE(t, "{mixed: {$elemMatch: {$eq: 5}}, array: {$eq: 5}}", 7.603); // Composite predicate on 'scalar' and 'mixed' fields. ASSERT_MATCH_CE(t, "{scalar: {$eq: 5}, mixed: {$eq: 5}}", 1.669); ASSERT_MATCH_CE(t, "{scalar: {$eq: 5}, mixed: {$elemMatch: {$eq: 5}}}", 1.559); ASSERT_MATCH_CE(t, "{scalar: {$elemMatch: {$eq: 5}}, mixed: {$eq: 5}}", 0.0); // Composite predicate on all three fields without '$elemMatch' on 'array'. ASSERT_MATCH_CE(t, "{scalar: {$eq: 5}, mixed: {$eq: 5}, array: {$eq: 5}}", 1.116); ASSERT_MATCH_CE(t, "{scalar: {$eq: 5}, mixed: {$elemMatch: {$eq: 5}}, array: {$eq: 5}}", 1.042); ASSERT_MATCH_CE(t, "{scalar: {$elemMatch: {$eq: 5}}, mixed: {$eq: 5}, array: {$eq: 5}}", 0.0); // Composite predicate on all three fields with '$elemMatch' on 'array' (same expected results // as above). ASSERT_MATCH_CE(t, "{scalar: {$eq: 5}, mixed: {$eq: 5}, array: {$elemMatch: {$eq: 5}}}", 1.116); // Test case where the same path has both $match and $elemMatch (same as $elemMatch case). ASSERT_MATCH_CE(t, "{scalar: {$elemMatch: {$eq: 5}}, scalar: {$eq: 5}}", 0.0); ASSERT_MATCH_CE(t, "{mixed: {$elemMatch: {$eq: 5}}, mixed: {$eq: 5}}", 17.0); ASSERT_MATCH_CE(t, "{array: {$elemMatch: {$eq: 5}}, array: {$eq: 5}}", 35.0); // Test case with multiple predicates and ranges. ASSERT_MATCH_CE(t, "{array: {$elemMatch: {$lt: 5}}, mixed: {$lt: 5}}", 67.88); ASSERT_MATCH_CE(t, "{array: {$elemMatch: {$lt: 5}}, mixed: {$gt: 5}}", 28.19); // Test multiple $elemMatches. ASSERT_MATCH_CE(t, "{scalar: {$elemMatch: {$eq: 5}}, array: {$elemMatch: {$eq: 5}}}", 0.0); ASSERT_MATCH_CE(t, "{mixed: {$elemMatch: {$eq: 5}}, array: {$elemMatch: {$eq: 5}}}", 7.603); ASSERT_MATCH_CE(t, "{scalar: {$elemMatch: {$eq: 5}}, mixed: {$elemMatch: {$eq: 5}}}", 0.0); ASSERT_MATCH_CE( t, "{scalar: {$elemMatch: {$eq: 5}}, mixed: {$elemMatch: {$eq: 5}}, array: {$eq: 5}}", 0.0); ASSERT_MATCH_CE( t, "{scalar: {$eq: 5}, mixed: {$elemMatch: {$eq: 5}}, array: {$elemMatch: {$eq: 5}}}", 1.042); ASSERT_MATCH_CE( t, "{scalar: {$elemMatch: {$eq: 5}}, mixed: {$eq: 5}, array: {$elemMatch: {$eq: 5}}}", 0.0); ASSERT_MATCH_CE(t, "{scalar: {$elemMatch: {$eq: 5}}, mixed: {$elemMatch: {$eq: 5}}, array: " "{$elemMatch: {$eq: 5}}}", 0.0); ASSERT_MATCH_CE(t, "{array: {$elemMatch: {$lt: 5}}, mixed: {$elemMatch: {$lt: 5}}}", 32.09); ASSERT_MATCH_CE(t, "{array: {$elemMatch: {$lt: 5}}, mixed: {$elemMatch: {$gt: 5}}}", 42.78); // Verify that we still return an estimate of 0.0 for any $elemMatch predicate on a scalar // field when we have a non-multikey index. t.setIndexes({{"aScalarIndex", makeIndexDefinition("scalar", CollationOp::Ascending, /* isMultiKey */ false)}}); ASSERT_MATCH_CE(t, "{scalar: {$elemMatch: {$eq: 5}}}", 0.0); ASSERT_MATCH_CE(t, "{scalar: {$elemMatch: {$gt: 1, $lt: 10}}}", 0.0); } TEST(CEHistogramTest, TestMixedElemMatchAndNonElemMatch) { const auto collName = "test"; const auto collCardinality = 1; std::shared_ptr collStats(new CollectionStatisticsMock(collCardinality)); // A very simple histogram encoding a collection with one document {a: [3, 10]}. collStats->addHistogram("a", getArrayHistogramFromData({/* No scalar buckets. */}, { // Array unique buckets. {Value(3), 1 /* frequency */}, {Value(10), 1 /* frequency */}, }, { // Array min buckets. {Value(3), 1 /* frequency */}, }, { // Array max buckets. {Value(10), 1 /* frequency */}, }, {{sbe::value::TypeTags::NumberInt32, collCardinality}} // Array type counts. )); CEHistogramTester t(collName, collCardinality, collStats); // Tests without indexes. ASSERT_MATCH_CE(t, "{a: {$elemMatch: {$gt: 3, $lt: 10}}}", 0.0); ASSERT_MATCH_CE(t, "{a: {$gt: 3, $lt: 10}}", 1.0); ASSERT_MATCH_CE(t, "{a: {$elemMatch: {$eq: 3}, $gt: 3, $lt: 10}}", 1.0); ASSERT_MATCH_CE(t, "{a: {$gt: 3, $lt: 10, $elemMatch: {$eq: 3}}}", 1.0); ASSERT_MATCH_CE(t, "{a: {$gt: 3, $lt: 10, $elemMatch: {$gt: 3, $lt: 10}}}", 0.0); ASSERT_MATCH_CE(t, "{a: {$elemMatch: {$gt: 3, $lt: 10}, $gt: 3, $lt: 10}}", 0.0); // Tests with multikey index (note that the index on "a" must be multikey due to arrays). t.setIndexes( {{"anIndex", makeIndexDefinition("a", CollationOp::Ascending, /* isMultiKey */ true)}}); ASSERT_MATCH_CE(t, "{a: {$elemMatch: {$gt: 3, $lt: 10}}}", 0.0); ASSERT_MATCH_CE(t, "{a: {$gt: 3, $lt: 10}}", 1.0); ASSERT_MATCH_CE(t, "{a: {$elemMatch: {$eq: 3}, $gt: 3, $lt: 10}}", 1.0); ASSERT_MATCH_CE(t, "{a: {$gt: 3, $lt: 10, $elemMatch: {$eq: 3}}}", 1.0); ASSERT_MATCH_CE(t, "{a: {$gt: 3, $lt: 10, $elemMatch: {$gt: 3, $lt: 10}}}", 0.0); ASSERT_MATCH_CE(t, "{a: {$elemMatch: {$gt: 3, $lt: 10}, $gt: 3, $lt: 10}}", 0.0); } } // namespace } // namespace mongo::ce