summaryrefslogtreecommitdiff
path: root/src/mongo/db/query/ce/histogram_array_data_test.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/mongo/db/query/ce/histogram_array_data_test.cpp')
-rw-r--r--src/mongo/db/query/ce/histogram_array_data_test.cpp298
1 files changed, 298 insertions, 0 deletions
diff --git a/src/mongo/db/query/ce/histogram_array_data_test.cpp b/src/mongo/db/query/ce/histogram_array_data_test.cpp
new file mode 100644
index 00000000000..7f8bb92fc51
--- /dev/null
+++ b/src/mongo/db/query/ce/histogram_array_data_test.cpp
@@ -0,0 +1,298 @@
+/**
+ * Copyright (C) 2022-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include <vector>
+
+#include "mongo/db/exec/sbe/values/value.h"
+#include "mongo/db/query/ce/histogram_predicate_estimation.h"
+#include "mongo/db/query/ce/test_utils.h"
+#include "mongo/db/query/query_test_service_context.h"
+#include "mongo/db/query/stats/array_histogram.h"
+#include "mongo/unittest/unittest.h"
+
+namespace mongo::optimizer::ce {
+namespace {
+namespace value = sbe::value;
+
+using stats::ArrayHistogram;
+using stats::ScalarHistogram;
+using stats::TypeCounts;
+
+/**
+ * Structure representing a range query and its estimated and actual cardinalities.
+ * Used to record hand-crafted queries over a pre-generated dataset.
+ */
+struct QuerySpec {
+ // Low bound of the query range.
+ int32_t low;
+ // Upper bound of the query range.
+ int32_t high;
+ // Estimated cardinality of $match query.
+ double estMatch;
+ // Actual cardinality of $match query.
+ double actMatch;
+ // Estimated cardinality of $elemMatch query.
+ double estElemMatch;
+ // Actual cardinality of $elemMatch query.
+ double actElemMatch;
+};
+
+static std::pair<double, double> computeErrors(size_t actualCard, double estimatedCard) {
+ double error = estimatedCard - actualCard;
+ double relError = (actualCard == 0) ? (estimatedCard == 0 ? 0.0 : -1.0) : error / actualCard;
+ return std::make_pair(error, relError);
+}
+
+static std::string serializeQuery(QuerySpec& q, bool isElemMatch) {
+ std::ostringstream os;
+ os << "{$match: {a: {";
+ if (isElemMatch) {
+ os << "$elemMatch: {";
+ }
+ os << "$gt: " << q.low;
+ os << ", $lt: " << q.high;
+ if (isElemMatch) {
+ os << "}";
+ }
+ os << "}}}\n";
+ return os.str();
+}
+
+static std::string computeRMSE(std::vector<QuerySpec>& querySet, bool isElemMatch) {
+ double rms = 0.0, relRms = 0.0, meanAbsSelErr = 0.0;
+ size_t trialSize = querySet.size();
+ const size_t dataSize = 1000;
+
+ std::ostringstream os;
+ os << "\nQueries:\n";
+ for (auto& q : querySet) {
+ double estimatedCard = isElemMatch ? q.estElemMatch : q.estMatch;
+ double actualCard = isElemMatch ? q.actElemMatch : q.actMatch;
+
+ auto [error, relError] = computeErrors(actualCard, estimatedCard);
+ rms += error * error;
+ relRms += relError * relError;
+ meanAbsSelErr += std::abs(error);
+ os << serializeQuery(q, isElemMatch);
+ os << "Estimated: " << estimatedCard << " Actual " << actualCard << " (Error: " << error
+ << " RelError: " << relError << ")\n\n";
+ }
+ rms = std::sqrt(rms / trialSize);
+ relRms = std::sqrt(relRms / trialSize);
+ meanAbsSelErr /= (trialSize * dataSize);
+
+ os << "=====" << (isElemMatch ? " ElemMatch errors: " : "Match errors:") << "=====\n";
+ os << "RMSE : " << rms << " RelRMSE : " << relRms
+ << " MeanAbsSelectivityError: " << meanAbsSelErr << std::endl;
+ return os.str();
+}
+
+TEST(EstimatorArrayDataTest, Histogram1000ArraysSmall10Buckets) {
+ std::vector<BucketData> scalarData{{}};
+ const ScalarHistogram scalarHist = createHistogram(scalarData);
+
+ std::vector<BucketData> minData{{0, 5.0, 0.0, 0.0},
+ {553, 2.0, 935.0, 303.0},
+ {591, 4.0, 2.0, 1.0},
+ {656, 2.0, 21.0, 12.0},
+ {678, 3.0, 6.0, 3.0},
+ {693, 2.0, 1.0, 1.0},
+ {730, 1.0, 6.0, 3.0},
+ {788, 1.0, 2.0, 2.0},
+ {847, 2.0, 4.0, 1.0},
+ {867, 1.0, 0.0, 0.0}};
+
+ const ScalarHistogram aMinHist = createHistogram(minData);
+
+ std::vector<BucketData> maxData{{117, 1.0, 0.0, 0.0},
+ {210, 1.0, 1.0, 1.0},
+ {591, 1.0, 8.0, 4.0},
+ {656, 1.0, 0.0, 0.0},
+ {353, 2.0, 18.0, 9.0},
+ {610, 5.0, 125.0, 65.0},
+ {733, 8.0, 134.0, 53.0},
+ {768, 6.0, 50.0, 16.0},
+ {957, 8.0, 448.0, 137.0},
+ {1000, 7.0, 176.0, 40.0}};
+
+ const ScalarHistogram aMaxHist = createHistogram(maxData);
+
+ std::vector<BucketData> uniqueData{{0, 5.0, 0.0, 0.0},
+ {16, 11.0, 74.0, 13.0},
+ {192, 13.0, 698.0, 148.0},
+ {271, 9.0, 312.0, 70.0},
+ {670, 7.0, 1545.0, 355.0},
+ {712, 9.0, 159.0, 32.0},
+ {776, 11.0, 247.0, 54.0},
+ {869, 9.0, 361.0, 85.0},
+ {957, 8.0, 323.0, 76.0},
+ {1000, 7.0, 188.0, 40.0}};
+
+ const ScalarHistogram aUniqueHist = createHistogram(uniqueData);
+
+ TypeCounts typeCounts;
+ TypeCounts arrayTypeCounts;
+ // Dataset generated as 1000 arrays of size between 3 to 5.
+ typeCounts.insert({value::TypeTags::Array, 1000});
+ arrayTypeCounts.insert({value::TypeTags::NumberInt32, 3996});
+
+ const ArrayHistogram arrHist(scalarHist,
+ typeCounts,
+ aUniqueHist,
+ aMinHist,
+ aMaxHist,
+ arrayTypeCounts,
+ 0 /* emptyArrayCount */);
+
+ std::vector<QuerySpec> querySet{{10, 20, 35.7, 93.0, 37.8, 39.0},
+ {10, 60, 103.3, 240.0, 158.0, 196.0},
+ {320, 330, 554.5, 746.0, 26.0, 30.0},
+ {320, 400, 672.9, 832.0, 231.5, 298.0},
+ {980, 990, 88.8, 101.0, 36.5, 41.0},
+ {970, 1050, 129.7, 141.0, 129.7, 141.0}};
+
+ for (const auto q : querySet) {
+ // $match query, includeScalar = true.
+ double estCard = estimateCardRange(arrHist,
+ false /* lowInclusive */,
+ value::TypeTags::NumberInt32,
+ sbe::value::bitcastFrom<int32_t>(q.low),
+ false /* highInclusive */,
+ value::TypeTags::NumberInt32,
+ sbe::value::bitcastFrom<int32_t>(q.high),
+ true /* includeScalar */);
+ ASSERT_APPROX_EQUAL(estCard, q.estMatch, 0.1);
+
+ // $elemMatch query, includeScalar = false.
+ estCard = estimateCardRange(arrHist,
+ false /* lowInclusive */,
+ value::TypeTags::NumberInt32,
+ sbe::value::bitcastFrom<int32_t>(q.low),
+ false /* highInclusive */,
+ value::TypeTags::NumberInt32,
+ sbe::value::bitcastFrom<int32_t>(q.high),
+ false /* includeScalar */);
+ ASSERT_APPROX_EQUAL(estCard, q.estElemMatch, 0.1);
+ }
+ std::cout << computeRMSE(querySet, false /* isElemMatch */) << std::endl;
+ std::cout << computeRMSE(querySet, true /* isElemMatch */) << std::endl;
+}
+
+TEST(EstimatorArrayDataTest, Histogram1000ArraysLarge10Buckets) {
+ std::vector<BucketData> scalarData{{}};
+ const ScalarHistogram scalarHist = createHistogram(scalarData);
+
+ std::vector<BucketData> minData{{0, 2.0, 0.0, 0.0},
+ {1324, 4.0, 925.0, 408.0},
+ {1389, 5.0, 7.0, 5.0},
+ {1521, 2.0, 16.0, 10.0},
+ {1621, 2.0, 13.0, 7.0},
+ {1852, 5.0, 10.0, 9.0},
+ {1864, 2.0, 0.0, 0.0},
+ {1971, 1.0, 3.0, 3.0},
+ {2062, 2.0, 0.0, 0.0},
+ {2873, 1.0, 0.0, 0.0}};
+
+ const ScalarHistogram aMinHist = createHistogram(minData);
+
+ std::vector<BucketData> maxData{{2261, 1.0, 0.0, 0.0},
+ {2673, 1.0, 0.0, 0.0},
+ {2930, 1.0, 1.0, 1.0},
+ {3048, 2.0, 2.0, 2.0},
+ {3128, 3.0, 1.0, 1.0},
+ {3281, 2.0, 0.0, 0.0},
+ {3378, 2.0, 7.0, 5.0},
+ {3453, 4.0, 2.0, 2.0},
+ {3763, 6.0, 44.0, 23.0},
+ {5000, 1.0, 920.0, 416.0}};
+
+ const ScalarHistogram aMaxHist = createHistogram(maxData);
+
+ std::vector<BucketData> uniqueData{{0, 2.0, 0.0, 0.0},
+ {1106, 9.0, 1970.0, 704.0},
+ {1542, 11.0, 736.0, 280.0},
+ {3267, 6.0, 3141.0, 1097.0},
+ {3531, 6.0, 461.0, 175.0},
+ {3570, 7.0, 48.0, 20.0},
+ {4573, 8.0, 1851.0, 656.0},
+ {4619, 6.0, 65.0, 30.0},
+ {4782, 5.0, 265.0, 99.0},
+ {5000, 1.0, 342.0, 135.0}};
+
+ const ScalarHistogram aUniqueHist = createHistogram(uniqueData);
+
+ TypeCounts typeCounts;
+ TypeCounts arrayTypeCounts;
+ // Dataset generated as 1000 arrays of size between 8 to 10.
+ typeCounts.insert({value::TypeTags::Array, 1000});
+ arrayTypeCounts.insert({value::TypeTags::NumberInt32, 8940});
+
+ const ArrayHistogram arrHist(scalarHist,
+ typeCounts,
+ aUniqueHist,
+ aMinHist,
+ aMaxHist,
+ arrayTypeCounts,
+ 0 /* emptyArrayCount */);
+
+ std::vector<QuerySpec> querySet{{10, 20, 13.7, 39.0, 9.7, 26.0},
+ {10, 60, 41.6, 108.0, 55.7, 101.0},
+ {1000, 1010, 705.4, 861.0, 9.7, 7.0},
+ {1000, 1050, 733.3, 884.0, 55.7, 87.0},
+ {3250, 3300, 988.0, 988.0, 59.3, 86.0},
+ {4970, 4980, 23.3, 53.0, 8.5, 16.0}};
+
+ for (const auto q : querySet) {
+ // $match query, includeScalar = true.
+ double estCard = estimateCardRange(arrHist,
+ false /* lowInclusive */,
+ value::TypeTags::NumberInt32,
+ sbe::value::bitcastFrom<int32_t>(q.low),
+ false /* highInclusive */,
+ value::TypeTags::NumberInt32,
+ sbe::value::bitcastFrom<int32_t>(q.high),
+ true /* includeScalar */);
+ ASSERT_APPROX_EQUAL(estCard, q.estMatch, 0.1);
+
+ // $elemMatch query, includeScalar = false.
+ estCard = estimateCardRange(arrHist,
+ false /* lowInclusive */,
+ value::TypeTags::NumberInt32,
+ sbe::value::bitcastFrom<int32_t>(q.low),
+ false /* highInclusive */,
+ value::TypeTags::NumberInt32,
+ sbe::value::bitcastFrom<int32_t>(q.high),
+ false /* includeScalar */);
+ ASSERT_APPROX_EQUAL(estCard, q.estElemMatch, 0.1);
+ }
+ std::cout << computeRMSE(querySet, false /* isElemMatch */) << std::endl;
+ std::cout << computeRMSE(querySet, true /* isElemMatch */) << std::endl;
+}
+} // namespace
+} // namespace mongo::optimizer::ce