summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMilena Ivanova <milena.ivanova@mongodb.com>2022-09-15 13:40:33 +0000
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2022-09-15 14:33:28 +0000
commitb4c5194e3c0dfa8d092d39ae2389e54969b2a329 (patch)
tree0066df8946f2d08a10cb47cf2aeab775a2d2c00f
parent36420773c0fa0b76c6d756cec0cc6349e321e8c2 (diff)
downloadmongo-b4c5194e3c0dfa8d092d39ae2389e54969b2a329.tar.gz
SERVER-67151 Integrate elemMatch cardinality estimation into master
-rw-r--r--src/mongo/db/query/ce/SConscript10
-rw-r--r--src/mongo/db/query/ce/ce_array_data_test.cpp285
-rw-r--r--src/mongo/db/query/ce/ce_interpolation_test.cpp65
-rw-r--r--src/mongo/db/query/ce/ce_test_utils.cpp27
-rw-r--r--src/mongo/db/query/ce/ce_test_utils.h21
-rw-r--r--src/mongo/db/query/ce/histogram_estimation.cpp85
-rw-r--r--src/mongo/db/query/ce/histogram_estimation.h4
7 files changed, 433 insertions, 64 deletions
diff --git a/src/mongo/db/query/ce/SConscript b/src/mongo/db/query/ce/SConscript
index 5439f5a078a..db504072718 100644
--- a/src/mongo/db/query/ce/SConscript
+++ b/src/mongo/db/query/ce/SConscript
@@ -85,6 +85,16 @@ env.CppUnitTest(
)
env.CppUnitTest(
+ target="ce_array_data_test",
+ source=[
+ "ce_array_data_test.cpp",
+ ],
+ LIBDEPS=[
+ 'ce_test_utils',
+ ],
+)
+
+env.CppUnitTest(
target='stats_cache_loader_test',
source=[
'stats_cache_loader_test.cpp',
diff --git a/src/mongo/db/query/ce/ce_array_data_test.cpp b/src/mongo/db/query/ce/ce_array_data_test.cpp
new file mode 100644
index 00000000000..b8e7d203c59
--- /dev/null
+++ b/src/mongo/db/query/ce/ce_array_data_test.cpp
@@ -0,0 +1,285 @@
+/**
+ * Copyright (C) 2022-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include <vector>
+
+#include "mongo/db/exec/sbe/values/value.h"
+#include "mongo/db/query/ce/array_histogram.h"
+#include "mongo/db/query/ce/ce_test_utils.h"
+#include "mongo/db/query/ce/histogram_estimation.h"
+#include "mongo/db/query/query_test_service_context.h"
+#include "mongo/unittest/unittest.h"
+
+namespace mongo::ce {
+namespace {
+
+using namespace sbe;
+
+/**
+ * Structure representing a range query and its estimated and actual cardinalities.
+ * Used to record hand-crafted queries over a pre-generated dataset.
+ */
+struct QuerySpec {
+ // Low bound of the query range.
+ int32_t low;
+ // Upper bound of the query range.
+ int32_t high;
+ // Estimated cardinality of $match query.
+ double estMatch;
+ // Actual cardinality of $match query.
+ double actMatch;
+ // Estimated cardinality of $elemMatch query.
+ double estElemMatch;
+ // Actual cardinality of $elemMatch query.
+ double actElemMatch;
+};
+
+static std::pair<double, double> computeErrors(size_t actualCard, double estimatedCard) {
+ double error = estimatedCard - actualCard;
+ double relError = (actualCard == 0) ? (estimatedCard == 0 ? 0.0 : -1.0) : error / actualCard;
+ return std::make_pair(error, relError);
+}
+
+static std::string serializeQuery(QuerySpec& q, bool isElemMatch) {
+ std::ostringstream os;
+ os << "{$match: {a: {";
+ if (isElemMatch) {
+ os << "$elemMatch: {";
+ }
+ os << "$gt: " << q.low;
+ os << ", $lt: " << q.high;
+ if (isElemMatch) {
+ os << "}";
+ }
+ os << "}}}\n";
+ return os.str();
+}
+
+static std::string computeRMSE(std::vector<QuerySpec>& querySet, bool isElemMatch) {
+ double rms = 0.0, relRms = 0.0, meanAbsSelErr = 0.0;
+ size_t trialSize = querySet.size();
+ const size_t dataSize = 1000;
+
+ std::ostringstream os;
+ os << "\nQueries:\n";
+ for (auto& q : querySet) {
+ double estimatedCard = isElemMatch ? q.estElemMatch : q.estMatch;
+ double actualCard = isElemMatch ? q.actElemMatch : q.actMatch;
+
+ auto [error, relError] = computeErrors(actualCard, estimatedCard);
+ rms += error * error;
+ relRms += relError * relError;
+ meanAbsSelErr += std::abs(error);
+ os << serializeQuery(q, isElemMatch);
+ os << "Estimated: " << estimatedCard << " Actual " << actualCard << " (Error: " << error
+ << " RelError: " << relError << ")\n\n";
+ }
+ rms = std::sqrt(rms / trialSize);
+ relRms = std::sqrt(relRms / trialSize);
+ meanAbsSelErr /= (trialSize * dataSize);
+
+ os << "=====" << (isElemMatch ? " ElemMatch errors: " : "Match errors:") << "=====\n";
+ os << "RMSE : " << rms << " RelRMSE : " << relRms
+ << " MeanAbsSelectivityError: " << meanAbsSelErr << std::endl;
+ return os.str();
+}
+
+TEST(EstimatorArrayDataTest, Histogram1000ArraysSmall10Buckets) {
+ std::vector<BucketData> scalarData{{}};
+ const ScalarHistogram scalarHist = createHistogram(scalarData);
+
+ std::vector<BucketData> minData{{0, 5.0, 0.0, 0.0},
+ {553, 2.0, 935.0, 303.0},
+ {591, 4.0, 2.0, 1.0},
+ {656, 2.0, 21.0, 12.0},
+ {678, 3.0, 6.0, 3.0},
+ {693, 2.0, 1.0, 1.0},
+ {730, 1.0, 6.0, 3.0},
+ {788, 1.0, 2.0, 2.0},
+ {847, 2.0, 4.0, 1.0},
+ {867, 1.0, 0.0, 0.0}};
+
+ const ScalarHistogram aMinHist = createHistogram(minData);
+
+ std::vector<BucketData> maxData{{117, 1.0, 0.0, 0.0},
+ {210, 1.0, 1.0, 1.0},
+ {591, 1.0, 8.0, 4.0},
+ {656, 1.0, 0.0, 0.0},
+ {353, 2.0, 18.0, 9.0},
+ {610, 5.0, 125.0, 65.0},
+ {733, 8.0, 134.0, 53.0},
+ {768, 6.0, 50.0, 16.0},
+ {957, 8.0, 448.0, 137.0},
+ {1000, 7.0, 176.0, 40.0}};
+
+ const ScalarHistogram aMaxHist = createHistogram(maxData);
+
+ std::vector<BucketData> uniqueData{{0, 5.0, 0.0, 0.0},
+ {16, 11.0, 74.0, 13.0},
+ {192, 13.0, 698.0, 148.0},
+ {271, 9.0, 312.0, 70.0},
+ {670, 7.0, 1545.0, 355.0},
+ {712, 9.0, 159.0, 32.0},
+ {776, 11.0, 247.0, 54.0},
+ {869, 9.0, 361.0, 85.0},
+ {957, 8.0, 323.0, 76.0},
+ {1000, 7.0, 188.0, 40.0}};
+
+ const ScalarHistogram aUniqueHist = createHistogram(uniqueData);
+
+ std::map<value::TypeTags, size_t> typeCounts;
+ std::map<value::TypeTags, size_t> arrayTypeCounts;
+ // Dataset generated as 1000 arrays of size between 3 to 5.
+ typeCounts.insert({value::TypeTags::Array, 1000});
+ arrayTypeCounts.insert({value::TypeTags::NumberInt32, 3996});
+
+ const ArrayHistogram arrHist(
+ scalarHist, typeCounts, aUniqueHist, aMinHist, aMaxHist, arrayTypeCounts);
+
+ std::vector<QuerySpec> querySet{{10, 20, 35.7, 93.0, 37.8, 39.0},
+ {10, 60, 103.3, 240.0, 158.0, 196.0},
+ {320, 330, 554.5, 746.0, 26.0, 30.0},
+ {320, 400, 672.9, 832.0, 231.5, 298.0},
+ {980, 990, 88.8, 101.0, 36.5, 41.0},
+ {970, 1050, 129.7, 141.0, 129.7, 141.0}};
+
+ for (const auto q : querySet) {
+ // $match query, includeScalar = true.
+ double estCard = estimateCardRange(arrHist,
+ true,
+ false,
+ value::TypeTags::NumberInt32,
+ sbe::value::bitcastFrom<int32_t>(q.low),
+ false,
+ value::TypeTags::NumberInt32,
+ sbe::value::bitcastFrom<int32_t>(q.high));
+ ASSERT_APPROX_EQUAL(estCard, q.estMatch, 0.1);
+
+ // $elemMatch query, includeScalar = false.
+ estCard = estimateCardRange(arrHist,
+ false,
+ false,
+ value::TypeTags::NumberInt32,
+ sbe::value::bitcastFrom<int32_t>(q.low),
+ false,
+ value::TypeTags::NumberInt32,
+ sbe::value::bitcastFrom<int32_t>(q.high));
+ ASSERT_APPROX_EQUAL(estCard, q.estElemMatch, 0.1);
+ }
+ std::cout << computeRMSE(querySet, false) << std::endl;
+ std::cout << computeRMSE(querySet, true) << std::endl;
+}
+
+TEST(EstimatorArrayDataTest, Histogram1000ArraysLarge10Buckets) {
+ std::vector<BucketData> scalarData{{}};
+ const ScalarHistogram scalarHist = createHistogram(scalarData);
+
+ std::vector<BucketData> minData{{0, 2.0, 0.0, 0.0},
+ {1324, 4.0, 925.0, 408.0},
+ {1389, 5.0, 7.0, 5.0},
+ {1521, 2.0, 16.0, 10.0},
+ {1621, 2.0, 13.0, 7.0},
+ {1852, 5.0, 10.0, 9.0},
+ {1864, 2.0, 0.0, 0.0},
+ {1971, 1.0, 3.0, 3.0},
+ {2062, 2.0, 0.0, 0.0},
+ {2873, 1.0, 0.0, 0.0}};
+
+ const ScalarHistogram aMinHist = createHistogram(minData);
+
+ std::vector<BucketData> maxData{{2261, 1.0, 0.0, 0.0},
+ {2673, 1.0, 0.0, 0.0},
+ {2930, 1.0, 1.0, 1.0},
+ {3048, 2.0, 2.0, 2.0},
+ {3128, 3.0, 1.0, 1.0},
+ {3281, 2.0, 0.0, 0.0},
+ {3378, 2.0, 7.0, 5.0},
+ {3453, 4.0, 2.0, 2.0},
+ {3763, 6.0, 44.0, 23.0},
+ {5000, 1.0, 920.0, 416.0}};
+
+ const ScalarHistogram aMaxHist = createHistogram(maxData);
+
+ std::vector<BucketData> uniqueData{{0, 2.0, 0.0, 0.0},
+ {1106, 9.0, 1970.0, 704.0},
+ {1542, 11.0, 736.0, 280.0},
+ {3267, 6.0, 3141.0, 1097.0},
+ {3531, 6.0, 461.0, 175.0},
+ {3570, 7.0, 48.0, 20.0},
+ {4573, 8.0, 1851.0, 656.0},
+ {4619, 6.0, 65.0, 30.0},
+ {4782, 5.0, 265.0, 99.0},
+ {5000, 1.0, 342.0, 135.0}};
+
+ const ScalarHistogram aUniqueHist = createHistogram(uniqueData);
+
+ std::map<value::TypeTags, size_t> typeCounts;
+ std::map<value::TypeTags, size_t> arrayTypeCounts;
+ // Dataset generated as 1000 arrays of size between 8 to 10.
+ typeCounts.insert({value::TypeTags::Array, 1000});
+ arrayTypeCounts.insert({value::TypeTags::NumberInt32, 8940});
+
+ const ArrayHistogram arrHist(
+ scalarHist, typeCounts, aUniqueHist, aMinHist, aMaxHist, arrayTypeCounts);
+
+ std::vector<QuerySpec> querySet{{10, 20, 13.7, 39.0, 9.7, 26.0},
+ {10, 60, 41.6, 108.0, 55.7, 101.0},
+ {1000, 1010, 705.4, 861.0, 9.7, 7.0},
+ {1000, 1050, 733.3, 884.0, 55.7, 87.0},
+ {3250, 3300, 988.0, 988.0, 59.3, 86.0},
+ {4970, 4980, 23.3, 53.0, 8.5, 16.0}};
+
+ for (const auto q : querySet) {
+ // $match query, includeScalar = true.
+ double estCard = estimateCardRange(arrHist,
+ true,
+ false,
+ value::TypeTags::NumberInt32,
+ sbe::value::bitcastFrom<int32_t>(q.low),
+ false,
+ value::TypeTags::NumberInt32,
+ sbe::value::bitcastFrom<int32_t>(q.high));
+ ASSERT_APPROX_EQUAL(estCard, q.estMatch, 0.1);
+
+ // $elemMatch query, includeScalar = false.
+ estCard = estimateCardRange(arrHist,
+ false,
+ false,
+ value::TypeTags::NumberInt32,
+ sbe::value::bitcastFrom<int32_t>(q.low),
+ false,
+ value::TypeTags::NumberInt32,
+ sbe::value::bitcastFrom<int32_t>(q.high));
+ ASSERT_APPROX_EQUAL(estCard, q.estElemMatch, 0.1);
+ }
+ std::cout << computeRMSE(querySet, false) << std::endl;
+ std::cout << computeRMSE(querySet, true) << std::endl;
+}
+} // namespace
+} // namespace mongo::ce
diff --git a/src/mongo/db/query/ce/ce_interpolation_test.cpp b/src/mongo/db/query/ce/ce_interpolation_test.cpp
index 3e79374b627..6bbda0d0c34 100644
--- a/src/mongo/db/query/ce/ce_interpolation_test.cpp
+++ b/src/mongo/db/query/ce/ce_interpolation_test.cpp
@@ -44,47 +44,6 @@ double estimateIntValCard(const ScalarHistogram& hist, const int v, const Estima
return estimate(hist, tag, val, type).card;
};
-struct BucketData {
- Value _v;
- double _equalFreq;
- double _rangeFreq;
- double _ndv;
-
- BucketData(Value v, double equalFreq, double rangeFreq, double ndv)
- : _v(v), _equalFreq(equalFreq), _rangeFreq(rangeFreq), _ndv(ndv) {}
- BucketData(const std::string& v, double equalFreq, double rangeFreq, double ndv)
- : BucketData(Value(v), equalFreq, rangeFreq, ndv) {}
- BucketData(int v, double equalFreq, double rangeFreq, double ndv)
- : BucketData(Value(v), equalFreq, rangeFreq, ndv) {}
-};
-
-ScalarHistogram createHistogram(const std::vector<BucketData>& data) {
- sbe::value::Array array;
- for (const auto& item : data) {
- const auto [tag, val] = stage_builder::makeValue(item._v);
- array.push_back(tag, val);
- }
-
- value::Array bounds;
- std::vector<Bucket> buckets;
-
- double cumulativeFreq = 0.0;
- double cumulativeNDV = 0.0;
-
- for (size_t i = 0; i < data.size(); i++) {
- const auto [tag, val] = array.getAt(i);
- bounds.push_back(tag, val);
-
- const auto& item = data.at(i);
- cumulativeFreq += item._equalFreq + item._rangeFreq;
- cumulativeNDV += item._ndv + 1.0;
- buckets.emplace_back(
- item._equalFreq, item._rangeFreq, cumulativeFreq, item._ndv, cumulativeNDV);
- }
-
- return {std::move(bounds), std::move(buckets)};
-}
-
TEST(EstimatorTest, ManualHistogram) {
std::vector<BucketData> data{{0, 1.0, 1.0, 1.0},
{10, 1.0, 10.0, 5.0},
@@ -391,29 +350,29 @@ TEST(EstimatorTest, UniformIntArrayOnlyEstimate) {
value::TypeTags highTag = value::TypeTags::NumberInt64;
value::Value highVal = 600;
- // Test interpolation for query: [{$match: {a: {$gt: 500, $lt: 600}}}].
+ // Test interpolation for query: [{$match: {a: {$elemMatch: {$gt: 500, $lt: 600}}}}].
double expectedCard =
estimateCardRange(arrHist, false, false, lowTag, lowVal, false, highTag, highVal);
- ASSERT_APPROX_EQUAL(8.63, expectedCard, 0.1);
+ ASSERT_APPROX_EQUAL(27.0, expectedCard, 0.1); // actual 21.
- // Test interpolation for query: [{$match: {a: {$elemMatch: {$gt: 500, $lt: 600}}}}].
- // Note: this should be the same as above, since we have no scalars.
+ // Test interpolation for query: [{$match: {a: {$gt: 500, $lt: 600}}}].
+ // Note: although there are no scalars, the estimate is different than the
+ // above since we use different formulas.
expectedCard = estimateCardRange(arrHist, true, false, lowTag, lowVal, false, highTag, highVal);
- ASSERT_APPROX_EQUAL(8.63, expectedCard, 0.1);
+ ASSERT_APPROX_EQUAL(92.0, expectedCard, 0.1); // actual 92.
// Query at the end of the domain: more precise estimates from ArrayMin, ArrayMax histograms.
lowVal = 10;
highVal = 110;
- // Test interpolation for query: [{$match: {a: {$gt: 10, $lt: 110}}}].
+ // Test interpolation for query: [{$match: {a: {$elemMatch: {$gt: 10, $lt: 110}}}}].
expectedCard =
estimateCardRange(arrHist, false, false, lowTag, lowVal, false, highTag, highVal);
- ASSERT_APPROX_EQUAL(24.1, expectedCard, 0.1);
+ ASSERT_APPROX_EQUAL(24.1, expectedCard, 0.1); // actual 29.
- // Test interpolation for query: [{$match: {a: {$elemMatch: {$gt: 500, $lt: 600}}}}].
- // Note: this should be the same as above, since we have no scalars.
+ // Test interpolation for query: [{$match: {a: {$gt: 10, $lt: 110}}}].
expectedCard = estimateCardRange(arrHist, true, false, lowTag, lowVal, false, highTag, highVal);
- ASSERT_APPROX_EQUAL(24.1, expectedCard, 0.1);
+ ASSERT_APPROX_EQUAL(27.8, expectedCard, 0.1); // actual 31.
}
TEST(EstimatorTest, UniformIntMixedArrayEstimate) {
@@ -465,12 +424,12 @@ TEST(EstimatorTest, UniformIntMixedArrayEstimate) {
// Test interpolation for query: [{$match: {a: {$gt: 500, $lt: 550}}}].
double expectedCard =
estimateCardRange(arrHist, true, false, lowTag, lowVal, false, highTag, highVal);
- ASSERT_APPROX_EQUAL(9.8, expectedCard, 0.1); // Actual: 94.
+ ASSERT_APPROX_EQUAL(90.9, expectedCard, 0.1); // Actual: 94.
// Test interpolation for query: [{$match: {a: {$elemMatch: {$gt: 500, $lt: 550}}}}].
expectedCard =
estimateCardRange(arrHist, false, false, lowTag, lowVal, false, highTag, highVal);
- ASSERT_APPROX_EQUAL(5.6, expectedCard, 0.1); // Actual: 8.
+ ASSERT_APPROX_EQUAL(11.0, expectedCard, 0.1); // Actual: 8.
}
} // namespace
diff --git a/src/mongo/db/query/ce/ce_test_utils.cpp b/src/mongo/db/query/ce/ce_test_utils.cpp
index 6f68cc04304..fc34dd147ac 100644
--- a/src/mongo/db/query/ce/ce_test_utils.cpp
+++ b/src/mongo/db/query/ce/ce_test_utils.cpp
@@ -37,6 +37,7 @@
#include "mongo/db/query/optimizer/explain.h"
#include "mongo/db/query/optimizer/opt_phase_manager.h"
#include "mongo/db/query/optimizer/utils/unit_test_utils.h"
+#include "mongo/db/query/sbe_stage_builder_helpers.h"
#include "mongo/unittest/unittest.h"
namespace mongo::ce {
@@ -136,4 +137,30 @@ optimizer::CEType CETester::getCE(ABT& abt) const {
return outCard;
}
+ScalarHistogram createHistogram(const std::vector<BucketData>& data) {
+ sbe::value::Array array;
+ for (const auto& item : data) {
+ const auto [tag, val] = stage_builder::makeValue(item._v);
+ array.push_back(tag, val);
+ }
+
+ value::Array bounds;
+ std::vector<Bucket> buckets;
+
+ double cumulativeFreq = 0.0;
+ double cumulativeNDV = 0.0;
+
+ for (size_t i = 0; i < data.size(); i++) {
+ const auto [tag, val] = array.getAt(i);
+ bounds.push_back(tag, val);
+
+ const auto& item = data.at(i);
+ cumulativeFreq += item._equalFreq + item._rangeFreq;
+ cumulativeNDV += item._ndv + 1.0;
+ buckets.emplace_back(
+ item._equalFreq, item._rangeFreq, cumulativeFreq, item._ndv, cumulativeNDV);
+ }
+
+ return {std::move(bounds), std::move(buckets)};
+}
} // namespace mongo::ce
diff --git a/src/mongo/db/query/ce/ce_test_utils.h b/src/mongo/db/query/ce/ce_test_utils.h
index 85ece3a1ee6..8d38b160a83 100644
--- a/src/mongo/db/query/ce/ce_test_utils.h
+++ b/src/mongo/db/query/ce/ce_test_utils.h
@@ -32,6 +32,7 @@
#include <cstddef>
#include <sys/types.h>
+#include "mongo/db/query/ce/scalar_histogram.h"
#include "mongo/db/query/optimizer/cascades/interfaces.h"
#include "mongo/db/query/optimizer/opt_phase_manager.h"
@@ -49,6 +50,7 @@ class CEInterface;
namespace ce {
using namespace optimizer;
+using namespace sbe;
// Enable this flag to log all estimates, and let all tests pass.
constexpr bool kCETestLogOnly = false;
@@ -128,5 +130,24 @@ private:
mutable PrefixId _prefixId;
};
+/**
+ * Test utility for helping with creation of manual histograms in the unit tests.
+ */
+struct BucketData {
+ Value _v;
+ double _equalFreq;
+ double _rangeFreq;
+ double _ndv;
+
+ BucketData(Value v, double equalFreq, double rangeFreq, double ndv)
+ : _v(v), _equalFreq(equalFreq), _rangeFreq(rangeFreq), _ndv(ndv) {}
+ BucketData(const std::string& v, double equalFreq, double rangeFreq, double ndv)
+ : BucketData(Value(v), equalFreq, rangeFreq, ndv) {}
+ BucketData(int v, double equalFreq, double rangeFreq, double ndv)
+ : BucketData(Value(v), equalFreq, rangeFreq, ndv) {}
+};
+
+ScalarHistogram createHistogram(const std::vector<BucketData>& data);
+
} // namespace ce
} // namespace mongo
diff --git a/src/mongo/db/query/ce/histogram_estimation.cpp b/src/mongo/db/query/ce/histogram_estimation.cpp
index 948729ee1b4..4fd2d8376db 100644
--- a/src/mongo/db/query/ce/histogram_estimation.cpp
+++ b/src/mongo/db/query/ce/histogram_estimation.cpp
@@ -266,6 +266,29 @@ static EstimationResult estimateRange(const ScalarHistogram& histogram,
return highEstimate - lowEstimate;
}
+/**
+ * Compute an estimate for range query on array data with formula:
+ * Card(ArrayMin(a < valHigh)) - Card(ArrayMax(a < valLow))
+ */
+static EstimationResult estimateRangeQueryOnArray(const ScalarHistogram& histogramAmin,
+ const ScalarHistogram& histogramAmax,
+ bool lowInclusive,
+ value::TypeTags tagLow,
+ value::Value valLow,
+ bool highInclusive,
+ value::TypeTags tagHigh,
+ value::Value valHigh) {
+ const EstimationType highType =
+ highInclusive ? EstimationType::kLessOrEqual : EstimationType::kLess;
+ const EstimationResult highEstimate = estimate(histogramAmin, tagHigh, valHigh, highType);
+
+ const EstimationType lowType =
+ lowInclusive ? EstimationType::kLess : EstimationType::kLessOrEqual;
+ const EstimationResult lowEstimate = estimate(histogramAmax, tagLow, valLow, lowType);
+
+ return highEstimate - lowEstimate;
+}
+
double estimateCardRange(const ArrayHistogram& ah,
bool includeScalar,
/* Define lower bound. */
@@ -275,7 +298,8 @@ double estimateCardRange(const ArrayHistogram& ah,
/* Define upper bound. */
bool highInclusive,
value::TypeTags tagHigh,
- value::Value valHigh) {
+ value::Value valHigh,
+ EstimationAlgo estimationAlgo) {
uassert(6695701,
"Low bound must not be higher than high",
compareValues3w(tagLow, valLow, tagHigh, valHigh) <= 0);
@@ -287,16 +311,57 @@ double estimateCardRange(const ArrayHistogram& ah,
double result = 0.0;
if (ah.isArray()) {
- const auto arrayMinEst = estRange(ah.getArrayMin());
- const auto arrayMaxEst = estRange(ah.getArrayMax());
- const auto arrayUniqueEst = estRange(ah.getArrayUnique());
-
- // TODO: should we consider diving by sqrt(ndv) or just by ndv?
- const double arrayUniqueDensity = (arrayUniqueEst.ndv == 0.0)
- ? 0.0
- : (arrayUniqueEst.card / std::sqrt(arrayUniqueEst.ndv));
- result = std::max(std::max(arrayMinEst.card, arrayMaxEst.card), arrayUniqueDensity);
+ if (includeScalar) {
+ // Range query on array data.
+ const EstimationResult rangeCardOnArray = estimateRangeQueryOnArray(ah.getArrayMin(),
+ ah.getArrayMax(),
+ lowInclusive,
+ tagLow,
+ valLow,
+ highInclusive,
+ tagHigh,
+ valHigh);
+ result += rangeCardOnArray.card;
+ } else {
+ // $elemMatch query on array data.
+ const auto arrayMinEst = estRange(ah.getArrayMin());
+ const auto arrayMaxEst = estRange(ah.getArrayMax());
+ const auto arrayUniqueEst = estRange(ah.getArrayUnique());
+
+ const double totalArrayCount = getTotals(ah.getArrayMin()).card;
+ uassert(
+ 6715101, "Array histograms should contain at least one array", totalArrayCount > 0);
+ switch (estimationAlgo) {
+ case EstimationAlgo::HistogramV1: {
+ const double arrayUniqueDensity = (arrayUniqueEst.ndv == 0.0)
+ ? 0.0
+ : (arrayUniqueEst.card / std::sqrt(arrayUniqueEst.ndv));
+ result =
+ std::max(std::max(arrayMinEst.card, arrayMaxEst.card), arrayUniqueDensity);
+ break;
+ }
+ case EstimationAlgo::HistogramV2: {
+ const double avgArraySize =
+ getTotals(ah.getArrayUnique()).card / totalArrayCount;
+ const double adjustedUniqueCard = (avgArraySize == 0.0)
+ ? 0.0
+ : std::min(arrayUniqueEst.card / pow(avgArraySize, 0.2), totalArrayCount);
+ result =
+ std::max(std::max(arrayMinEst.card, arrayMaxEst.card), adjustedUniqueCard);
+ break;
+ }
+ case EstimationAlgo::HistogramV3: {
+ const double adjustedUniqueCard =
+ 0.85 * std::min(arrayUniqueEst.card, totalArrayCount);
+ result =
+ std::max(std::max(arrayMinEst.card, arrayMaxEst.card), adjustedUniqueCard);
+ break;
+ }
+ default:
+ MONGO_UNREACHABLE;
+ }
+ }
}
if (includeScalar) {
diff --git a/src/mongo/db/query/ce/histogram_estimation.h b/src/mongo/db/query/ce/histogram_estimation.h
index 0e554993983..3225dab1272 100644
--- a/src/mongo/db/query/ce/histogram_estimation.h
+++ b/src/mongo/db/query/ce/histogram_estimation.h
@@ -36,6 +36,7 @@
namespace mongo::ce {
enum class EstimationType { kEqual, kLess, kLessOrEqual, kGreater, kGreaterOrEqual };
+enum class EstimationAlgo { HistogramV1, HistogramV2, HistogramV3 };
const stdx::unordered_map<EstimationType, std::string> estimationTypeName = {
{EstimationType::kEqual, "eq"},
@@ -95,6 +96,7 @@ double estimateCardRange(const ArrayHistogram& ah,
/* Define upper bound. */
bool highInclusive,
sbe::value::TypeTags tagHigh,
- sbe::value::Value valHigh);
+ sbe::value::Value valHigh,
+ EstimationAlgo estAlgo = EstimationAlgo::HistogramV2);
} // namespace mongo::ce