diff options
Diffstat (limited to 'src')
24 files changed, 2184 insertions, 18 deletions
diff --git a/src/mongo/db/commands/cqf/cqf_aggregate.cpp b/src/mongo/db/commands/cqf/cqf_aggregate.cpp index 827ae2fe066..c51302d28a8 100644 --- a/src/mongo/db/commands/cqf/cqf_aggregate.cpp +++ b/src/mongo/db/commands/cqf/cqf_aggregate.cpp @@ -33,7 +33,10 @@ #include "mongo/db/exec/sbe/abt/abt_lower.h" #include "mongo/db/pipeline/abt/document_source_visitor.h" #include "mongo/db/pipeline/abt/match_expression_visitor.h" +#include "mongo/db/query/ce/ce_histogram.h" #include "mongo/db/query/ce/ce_sampling.h" +#include "mongo/db/query/ce/collection_statistics.h" +#include "mongo/db/query/ce_mode_parameter.h" #include "mongo/db/query/optimizer/cascades/ce_heuristic.h" #include "mongo/db/query/optimizer/cascades/cost_derivation.h" #include "mongo/db/query/optimizer/explain.h" @@ -457,8 +460,8 @@ std::unique_ptr<PlanExecutor, PlanExecutor::Deleter> getSBEExecutorViaCascadesOp OPTIMIZER_DEBUG_LOG( 6264803, 5, "Translated ABT", "explain"_attr = ExplainGenerator::explainV2(abtTree)); - if (collectionExists && numRecords > 0 && - internalQueryEnableSamplingCardinalityEstimator.load()) { + if (internalQueryCardinalityEstimatorMode == ce::kSampling && collectionExists && + numRecords > 0) { Metadata metadataForSampling = metadata; // Do not use indexes for sampling. for (auto& entry : metadataForSampling._scanDefs) { @@ -486,17 +489,33 @@ std::unique_ptr<PlanExecutor, PlanExecutor::Deleter> getSBEExecutorViaCascadesOp return optimizeAndCreateExecutor( phaseManager, std::move(abtTree), opCtx, expCtx, nss, collection); - } - // Use heuristics. - OptPhaseManager phaseManager{OptPhaseManager::getAllRewritesSet(), - prefixId, - std::move(metadata), - DebugInfo::kDefaultForProd}; - phaseManager.getHints() = queryHints; + } else if (internalQueryCardinalityEstimatorMode == ce::kHistogram && + ce::CollectionStatistics::hasCollectionStatistics(nss)) { + const auto& stats = ce::CollectionStatistics::getCollectionStatistics(nss); + auto ceDerivation = std::make_unique<CEHistogramTransport>(opCtx, stats); + OptPhaseManager phaseManager{OptPhaseManager::getAllRewritesSet(), + prefixId, + false /*requireRID*/, + std::move(metadata), + std::move(ceDerivation), + std::make_unique<DefaultCosting>(), + DebugInfo::kDefaultForProd}; - return optimizeAndCreateExecutor( - phaseManager, std::move(abtTree), opCtx, expCtx, nss, collection); + return optimizeAndCreateExecutor( + phaseManager, std::move(abtTree), opCtx, expCtx, nss, collection); + + } else { + // Default to using heuristics. + OptPhaseManager phaseManager{OptPhaseManager::getAllRewritesSet(), + prefixId, + std::move(metadata), + DebugInfo::kDefaultForProd}; + phaseManager.getHints() = queryHints; + + return optimizeAndCreateExecutor( + phaseManager, std::move(abtTree), opCtx, expCtx, nss, collection); + } } } // namespace mongo diff --git a/src/mongo/db/query/SConscript b/src/mongo/db/query/SConscript index 2f3a46adb83..d72ed205996 100644 --- a/src/mongo/db/query/SConscript +++ b/src/mongo/db/query/SConscript @@ -259,6 +259,7 @@ env.Library( env.Library( target="query_knobs", source=[ + 'ce_mode_parameter.cpp', 'plan_cache_size_parameter.cpp', 'query_feature_flags.idl', 'query_knobs.idl', @@ -353,6 +354,7 @@ env.CppUnitTest( "canonical_query_encoder_test.cpp", "canonical_query_test.cpp", "canonical_query_test_util.cpp", + "ce_mode_parameter_test.cpp", "classic_stage_builder_test.cpp", "count_command_test.cpp", "cursor_response_test.cpp", diff --git a/src/mongo/db/query/ce/SConscript b/src/mongo/db/query/ce/SConscript index c570a88b23f..4941d74512d 100644 --- a/src/mongo/db/query/ce/SConscript +++ b/src/mongo/db/query/ce/SConscript @@ -7,10 +7,42 @@ env = env.Clone() env.Library( target="query_ce", source=[ + 'array_histogram.cpp', + 'ce_histogram.cpp', + 'ce_math.cpp', 'ce_sampling.cpp', + 'ce_estimation.cpp', + 'collection_statistics.cpp', + 'histogram.cpp', + 'utils.cpp', ], LIBDEPS_PRIVATE=[ '$BUILD_DIR/mongo/db/exec/sbe/query_sbe_abt', '$BUILD_DIR/mongo/db/query/optimizer/optimizer', ], ) + +env.Library( + target="ce_test_utils", + source=[ + 'ce_test_utils.cpp', + ], + LIBDEPS=[ + '$BUILD_DIR/mongo/base', + '$BUILD_DIR/mongo/db/exec/sbe/query_sbe_values', + '$BUILD_DIR/mongo/db/exec/sbe/sbe_abt_test_util', + '$BUILD_DIR/mongo/db/query/optimizer/unit_test_utils', + "$BUILD_DIR/mongo/unittest/unittest", + 'query_ce', + ], +) + +env.CppUnitTest( + target="ce_histogram_test", + source=[ + "ce_histogram_test.cpp", + ], + LIBDEPS=[ + 'ce_test_utils', + ], +) diff --git a/src/mongo/db/query/ce/array_histogram.cpp b/src/mongo/db/query/ce/array_histogram.cpp new file mode 100644 index 00000000000..32da0754b7e --- /dev/null +++ b/src/mongo/db/query/ce/array_histogram.cpp @@ -0,0 +1,123 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/query/ce/array_histogram.h" + +namespace mongo::ce { +using namespace sbe; + +ArrayHistogram::ArrayHistogram() : ArrayHistogram(Histogram(), {}) {} + +ArrayHistogram::ArrayHistogram(Histogram scalar, + std::map<value::TypeTags, size_t> typeCounts, + Histogram arrayUnique, + Histogram arrayMin, + Histogram arrayMax, + std::map<value::TypeTags, size_t> arrayTypeCounts) + : _scalar(std::move(scalar)), + _typeCounts(std::move(typeCounts)), + _arrayUnique(std::move(arrayUnique)), + _arrayMin(std::move(arrayMin)), + _arrayMax(std::move(arrayMax)), + _arrayTypeCounts(std::move(arrayTypeCounts)) { + invariant(isArray()); +} + +ArrayHistogram::ArrayHistogram(Histogram scalar, std::map<value::TypeTags, size_t> typeCounts) + : _scalar(std::move(scalar)), + _typeCounts(std::move(typeCounts)), + _arrayUnique(boost::none), + _arrayMin(boost::none), + _arrayMax(boost::none), + _arrayTypeCounts(boost::none) { + invariant(!isArray()); +} + +bool ArrayHistogram::isArray() const { + return _arrayUnique && _arrayMin && _arrayMax && _arrayTypeCounts; +} + +std::string typeCountsToString(const std::map<value::TypeTags, size_t>& typeCounts) { + std::ostringstream os; + os << "{"; + bool first = true; + for (auto [tag, count] : typeCounts) { + if (!first) + os << ", "; + os << tag << ": " << count; + first = false; + } + os << "}"; + return os.str(); +} + +std::string ArrayHistogram::toString() const { + std::ostringstream os; + os << "{\n"; + os << " scalar: " << _scalar.toString(); + os << ",\n typeCounts: " << typeCountsToString(_typeCounts); + if (isArray()) { + os << ",\n arrayUnique: " << _arrayUnique->toString(); + os << ",\n arrayMin: " << _arrayMin->toString(); + os << ",\n arrayMax: " << _arrayMax->toString(); + os << ",\n arrayTypeCounts: " << typeCountsToString(*_arrayTypeCounts); + } + os << "\n}\n"; + return os.str(); +} + +const Histogram& ArrayHistogram::getScalar() const { + return _scalar; +} + +const Histogram& ArrayHistogram::getArrayUnique() const { + invariant(isArray()); + return *_arrayUnique; +} + +const Histogram& ArrayHistogram::getArrayMin() const { + invariant(isArray()); + return *_arrayMin; +} + +const Histogram& ArrayHistogram::getArrayMax() const { + invariant(isArray()); + return *_arrayMax; +} + +const std::map<value::TypeTags, size_t>& ArrayHistogram::getTypeCounts() const { + return _typeCounts; +} + +const std::map<value::TypeTags, size_t>& ArrayHistogram::getArrayTypeCounts() const { + invariant(isArray()); + return *_arrayTypeCounts; +} + +} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/array_histogram.h b/src/mongo/db/query/ce/array_histogram.h new file mode 100644 index 00000000000..dce62420729 --- /dev/null +++ b/src/mongo/db/query/ce/array_histogram.h @@ -0,0 +1,84 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include "mongo/db/query/ce/histogram.h" + +namespace mongo::ce { + +class ArrayHistogram { +public: + // Constructs an empty scalar histogram. + ArrayHistogram(); + + // Constructor for scalar field histograms. + ArrayHistogram(Histogram scalar, std::map<value::TypeTags, size_t> typeCounts); + + // Constructor for array field histograms. We have to initialize all array fields in this case. + ArrayHistogram(Histogram scalar, + std::map<value::TypeTags, size_t> typeCounts, + Histogram arrayUnique, + Histogram arrayMin, + Histogram arrayMax, + std::map<value::TypeTags, size_t> arrayTypeCounts); + + ArrayHistogram(const ArrayHistogram&) = delete; + + // Returns whether or not this histogram includes array data points. + bool isArray() const; + + std::string toString() const; + const Histogram& getScalar() const; + const Histogram& getArrayUnique() const; + const Histogram& getArrayMin() const; + const Histogram& getArrayMax() const; + const std::map<value::TypeTags, size_t>& getTypeCounts() const; + const std::map<value::TypeTags, size_t>& getArrayTypeCounts() const; + +private: + /* Histogram fields for all paths. */ + + // Contains values which appeared originally as scalars on the path. + Histogram _scalar; + std::map<value::TypeTags, size_t> _typeCounts; + + /* Histogram fields for array paths (only initialized if arrays are present). */ + + // Contains unique scalar values originating from arrays. + boost::optional<Histogram> _arrayUnique; + // Contains minimum values originating from arrays **per class**. + boost::optional<Histogram> _arrayMin; + // Contains maximum values originating from arrays **per class**. + boost::optional<Histogram> _arrayMax; + boost::optional<std::map<value::TypeTags, size_t>> _arrayTypeCounts; +}; + + +} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/ce_estimation.cpp b/src/mongo/db/query/ce/ce_estimation.cpp new file mode 100644 index 00000000000..05be80562d3 --- /dev/null +++ b/src/mongo/db/query/ce/ce_estimation.cpp @@ -0,0 +1,168 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/query/ce/ce_estimation.h" +#include "mongo/db/exec/sbe/abt/abt_lower.h" +#include "mongo/db/query/optimizer/syntax/expr.h" + +namespace mongo::ce { +using namespace sbe; + +/** + * Estimates the cardinality of an equality predicate given an ArrayHistogram and an SBE value and + * type tag pair. + */ +double estimateCardEq(const ArrayHistogram& ah, value::TypeTags tag, value::Value val) { + if (tag != value::TypeTags::Null) { + double card = ah.getScalar().estimate(tag, val, EstimationType::kEqual)._card; + if (ah.isArray()) { + return card + ah.getArrayUnique().estimate(tag, val, EstimationType::kEqual)._card; + } + return card; + } else { + // Predicate: {field: null} + // Count the values that are either null or that do not contain the field. + // TODO: + // This prototype doesn't have the concept of missing values. It can be added easily + // by adding a cardinality estimate that is >= the number of values. + // Estimation of $exists can be built on top of this estimate: + // {$exists: true} matches the documents that contain the field, including those where the + // field value is null. + // {$exists: false} matches only the documents that do not contain the field. + auto findNull = ah.getTypeCounts().find(value::TypeTags::Null); + if (findNull != ah.getTypeCounts().end()) { + return findNull->second; + } + return 0.0; + } +} + +static EstimationResult estimateRange(const Histogram& histogram, + bool lowInclusive, + value::TypeTags tagLow, + value::Value valLow, + bool highInclusive, + value::TypeTags tagHigh, + value::Value valHigh) { + const EstimationType highType = + highInclusive ? EstimationType::kLessOrEqual : EstimationType::kLess; + const EstimationResult highEstimate = histogram.estimate(tagHigh, valHigh, highType); + + const EstimationType lowType = + lowInclusive ? EstimationType::kLess : EstimationType::kLessOrEqual; + const EstimationResult lowEstimate = histogram.estimate(tagLow, valLow, lowType); + + return highEstimate - lowEstimate; +} + +/** + * Estimates the cardinality of a range predicate given an ArrayHistogram and a range predicate. + * Set 'includeScalar' to true to indicate whether or not the provided range should include no-array + * values. The other fields define the range of the estimation. + */ +double estimateCardRange(const ArrayHistogram& ah, + bool includeScalar, + /* Define lower bound. */ + bool lowInclusive, + value::TypeTags tagLow, + value::Value valLow, + /* Define upper bound. */ + bool highInclusive, + value::TypeTags tagHigh, + value::Value valHigh) { + uassert(6695701, + "Low bound must not be higher than high", + compareValues3w(tagLow, valLow, tagHigh, valHigh) <= 0); + + // Helper lambda to shorten code for legibility. + auto estRange = [&](const Histogram& h) { + return estimateRange(h, lowInclusive, tagLow, valLow, highInclusive, tagHigh, valHigh); + }; + + double result = 0.0; + if (ah.isArray()) { + const auto arrayMinEst = estRange(ah.getArrayMin()); + const auto arrayMaxEst = estRange(ah.getArrayMax()); + const auto arrayUniqueEst = estRange(ah.getArrayUnique()); + + // TODO: should we consider diving by sqrt(ndv) or just by ndv? + const double arrayUniqueDensity = (arrayUniqueEst._ndv == 0.0) + ? 0.0 + : (arrayUniqueEst._card / std::sqrt(arrayUniqueEst._ndv)); + + result = std::max(std::max(arrayMinEst._card, arrayMaxEst._card), arrayUniqueDensity); + } + + if (includeScalar) { + const auto scalarEst = estRange(ah.getScalar()); + result += scalarEst._card; + } + + return result; +} + +double estimateIntervalCardinality(const ce::ArrayHistogram& ah, + const optimizer::IntervalRequirement& interval, + optimizer::CEType childResult) { + auto getBound = [](const optimizer::BoundRequirement& boundReq) { + return boundReq.getBound().cast<optimizer::Constant>()->get(); + }; + + if (interval.isFullyOpen()) { + return childResult; + } else if (interval.isEquality()) { + ce::SBEValue sbeValue = getBound(interval.getLowBound()); + auto cardinality = estimateCardEq(ah, sbeValue.getTag(), sbeValue.getValue()); + return cardinality; + } + + // Otherwise, we have a range. + ce::SBEValue lowSBEValue(sbe::value::TypeTags::MinKey, 0); + auto lowBound = interval.getLowBound(); + if (!lowBound.isInfinite()) { + lowSBEValue = getBound(lowBound); + } + + ce::SBEValue highSBEValue(sbe::value::TypeTags::MaxKey, 0); + auto highBound = interval.getHighBound(); + if (!highBound.isInfinite()) { + highSBEValue = getBound(highBound); + } + + return estimateCardRange(ah, + true /*includeScalar*/, + lowBound.isInclusive(), + lowSBEValue.getTag(), + lowSBEValue.getValue(), + highBound.isInclusive(), + highSBEValue.getTag(), + highSBEValue.getValue()); +} + +} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/ce_estimation.h b/src/mongo/db/query/ce/ce_estimation.h new file mode 100644 index 00000000000..02fc036bd61 --- /dev/null +++ b/src/mongo/db/query/ce/ce_estimation.h @@ -0,0 +1,48 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include "mongo/db/query/ce/array_histogram.h" +#include "mongo/db/query/optimizer/defs.h" +#include "mongo/db/query/optimizer/index_bounds.h" + +namespace mongo::ce { + +double estimateCardEq(const ArrayHistogram& ah, value::TypeTags tag, value::Value val); + +/** + * Given an array histogram, an interval, and the input cardinality, estimates the cardinality of + * the interval. + */ +double estimateIntervalCardinality(const ArrayHistogram& estimator, + const optimizer::IntervalRequirement& interval, + optimizer::CEType inputCardinality); + +} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/ce_histogram.cpp b/src/mongo/db/query/ce/ce_histogram.cpp new file mode 100644 index 00000000000..1a04b740333 --- /dev/null +++ b/src/mongo/db/query/ce/ce_histogram.cpp @@ -0,0 +1,160 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/exec/sbe/abt/abt_lower.h" + +#include "mongo/db/query/ce/ce_estimation.h" +#include "mongo/db/query/ce/ce_histogram.h" +#include "mongo/db/query/ce/ce_math.h" +#include "mongo/db/query/ce/collection_statistics.h" + +#include "mongo/db/query/optimizer/cascades/ce_heuristic.h" +#include "mongo/db/query/optimizer/utils/abt_hash.h" +#include "mongo/db/query/optimizer/utils/memo_utils.h" + +namespace mongo::optimizer::cascades { + +using namespace properties; + +class CEHistogramTransportImpl { +public: + CEHistogramTransportImpl(OperationContext* opCtx, const ce::CollectionStatistics& stats) + : _opCtx(opCtx), _heuristicCE(), _stats(stats) {} + + ~CEHistogramTransportImpl() {} + + CEType transport(const ABT& n, + const ScanNode& node, + const Memo& memo, + const LogicalProps& logicalProps, + CEType /*bindResult*/) { + return _stats.getCardinality(); + } + + CEType transport(const ABT& n, + const SargableNode& node, + const Memo& memo, + const LogicalProps& logicalProps, + CEType childResult, + CEType /*bindsResult*/, + CEType /*refsResult*/) { + // Early out and return 0 since we don't expect to get more results. + if (childResult == 0.0) { + return 0.0; + } + + std::vector<double> topLevelSelectivities; + for (const auto& [key, req] : node.getReqMap()) { + std::vector<double> disjSelectivities; + auto path = key._path.cast<PathGet>()->name(); + + // Fallback to heuristic if no histogram. + auto histogram = _stats.getHistogram(path); + if (!histogram) { + // For now, because of the structure of SargableNode and the implementation of + // HeuristicCE, we can't combine heuristic & histogram estimates. In this case, + // default to Heuristic if we don't have a histogram for any of the predicates. + return _heuristicCE.deriveCE(memo, logicalProps, n.ref()); + } + + // Intervals are in DNF. + const auto intervalDNF = req.getIntervals(); + const auto disjuncts = intervalDNF.cast<IntervalReqExpr::Disjunction>()->nodes(); + for (const auto& disjunct : disjuncts) { + const auto& conjuncts = disjunct.cast<IntervalReqExpr::Conjunction>()->nodes(); + + std::vector<double> conjSelectivities; + for (const auto& conjunct : conjuncts) { + const auto& interval = conjunct.cast<IntervalReqExpr::Atom>()->getExpr(); + auto cardinality = + ce::estimateIntervalCardinality(*histogram, interval, childResult); + + // We have to convert the cardinality to a selectivity. The histogram returns + // the cardinality for the entire collection; however, fewer records may be + // expected at the SargableNode. + conjSelectivities.push_back(cardinality / _stats.getCardinality()); + } + + auto backoff = ce::conjExponentialBackoff(conjSelectivities); + disjSelectivities.push_back(backoff); + } + + auto backoff = ce::disjExponentialBackoff(disjSelectivities); + topLevelSelectivities.push_back(backoff); + } + + // The elements of the PartialSchemaRequirements map represent an implicit conjunction. + auto backoff = ce::conjExponentialBackoff(topLevelSelectivities); + return backoff * childResult; + } + + CEType transport(const ABT& n, + const RootNode& node, + const Memo& memo, + const LogicalProps& logicalProps, + CEType childResult, + CEType /*refsResult*/) { + // Root node does not change cardinality. + return childResult; + } + + /** + * Other ABT types default to heuristic CE. + */ + template <typename T, typename... Ts> + CEType transport(const ABT& n, + const T& /*node*/, + const Memo& memo, + const LogicalProps& logicalProps, + Ts&&...) { + if (canBeLogicalNode<T>()) { + return _heuristicCE.deriveCE(memo, logicalProps, n.ref()); + } + return 0.0; + } + +private: + OperationContext* _opCtx; + HeuristicCE _heuristicCE; + const ce::CollectionStatistics& _stats; +}; + +CEHistogramTransport::CEHistogramTransport(OperationContext* opCtx, + const ce::CollectionStatistics& stats) + : _impl(std::make_unique<CEHistogramTransportImpl>(opCtx, stats)) {} + +CEHistogramTransport::~CEHistogramTransport() {} + +CEType CEHistogramTransport::deriveCE(const Memo& memo, + const LogicalProps& logicalProps, + const ABT::reference_type logicalNodeRef) const { + return algebra::transport<true>(logicalNodeRef, *this->_impl, memo, logicalProps); +} + +} // namespace mongo::optimizer::cascades diff --git a/src/mongo/db/query/ce/ce_histogram.h b/src/mongo/db/query/ce/ce_histogram.h new file mode 100644 index 00000000000..6132cc19339 --- /dev/null +++ b/src/mongo/db/query/ce/ce_histogram.h @@ -0,0 +1,52 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include "mongo/db/query/ce/collection_statistics.h" +#include "mongo/db/query/optimizer/cascades/interfaces.h" + +namespace mongo::optimizer::cascades { + +class CEHistogramTransportImpl; + +class CEHistogramTransport : public CEInterface { +public: + CEHistogramTransport(OperationContext* opCtx, const ce::CollectionStatistics& stats); + ~CEHistogramTransport(); + + CEType deriveCE(const Memo& memo, + const properties::LogicalProps& logicalProps, + ABT::reference_type logicalNodeRef) const final; + +private: + std::unique_ptr<CEHistogramTransportImpl> _impl; +}; + +} // namespace mongo::optimizer::cascades diff --git a/src/mongo/db/query/ce/ce_histogram_test.cpp b/src/mongo/db/query/ce/ce_histogram_test.cpp new file mode 100644 index 00000000000..76a32ef5fa0 --- /dev/null +++ b/src/mongo/db/query/ce/ce_histogram_test.cpp @@ -0,0 +1,366 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/query/ce/ce_estimation.h" +#include "mongo/db/query/ce/ce_histogram.h" +#include "mongo/db/query/ce/ce_test_utils.h" +#include "mongo/db/query/sbe_stage_builder_helpers.h" +#include "mongo/unittest/unittest.h" + +namespace mongo::ce { +namespace { + +using namespace optimizer; +using namespace cascades; + +class CEHistogramTester : public CETester { +public: + CEHistogramTester(std::string collName, double numRecords, const CollectionStatistics& stats) + : CETester(collName, numRecords), _stats{stats} {} + +protected: + std::unique_ptr<CEInterface> getCETransport(OperationContext* opCtx) override { + return std::make_unique<CEHistogramTransport>(opCtx, _stats); + } + +private: + const CollectionStatistics& _stats; +}; + +struct TestBucket { + Value val; + int equalFreq; + int rangeFreq = 0; + int ndv = 1; /* ndv including bucket boundary*/ +}; + +std::unique_ptr<ArrayHistogram> getHistogramFromData(std::vector<TestBucket> testBuckets) { + value::Array bounds; + std::vector<Bucket> buckets; + std::map<value::TypeTags, size_t> typeCounts; + + int cumulativeFreq = 0; + int cumulativeNDV = 0; + for (auto b : testBuckets) { + // Add bucket boundary value to bounds. + auto sbeVal = stage_builder::makeValue(b.val); + auto [tag, val] = sbeVal; + bounds.push_back(tag, val); + + // Increment count of values for each type tag. + if (auto it = typeCounts.find(tag); it != typeCounts.end()) { + it->second += b.equalFreq + b.rangeFreq; + } else { + typeCounts[tag] = b.equalFreq + b.rangeFreq; + } + cumulativeFreq += b.equalFreq + b.rangeFreq; + cumulativeNDV += b.ndv; + + // Create a histogram bucket. + buckets.emplace_back(b.equalFreq, + b.rangeFreq, + cumulativeFreq, + b.ndv - 1, /* ndv excluding bucket boundary*/ + cumulativeNDV); + } + + return std::make_unique<ArrayHistogram>(Histogram(std::move(bounds), std::move(buckets)), + std::move(typeCounts)); +} + +TEST(CEHistogramTest, AssertSmallMaxDiffHistogramEstimatesAtomicPredicates) { + const auto collName = "test"; + const auto collCardinality = 8; + + CollectionStatistics collStats(collCardinality); + + // Construct a histogram with two buckets: one for 3 ints equal to 1, another for 5 strings + // equal to "ing". + const std::string& str = "ing"; + collStats.addHistogram("a", + getHistogramFromData({ + {Value(1), 3 /* frequency */}, + {Value(str), 5 /* frequency */}, + })); + + CEHistogramTester t(collName, collCardinality, collStats); + + // Test $eq. + ASSERT_MATCH_CE(t, "{a: {$eq: 1}}", 3.0); + ASSERT_MATCH_CE(t, "{a: {$eq: 2}}", 0.0); + ASSERT_MATCH_CE(t, "{a: {$eq: \"ing\"}}", 5.0); + ASSERT_MATCH_CE(t, "{a: {$eq: \"foo\"}}", 0.0); + + // Test case when field doesn't match fieldpath of histogram. This falls back to heuristics. + ASSERT_MATCH_CE(t, "{b: {$eq: 1}}", 0.8); + + // Test $gt. + ASSERT_MATCH_CE(t, "{a: {$gt: 3}}", 0.0); + ASSERT_MATCH_CE(t, "{a: {$gt: 1}}", 0.0); + ASSERT_MATCH_CE(t, "{a: {$gt: 0}}", 3.0); + ASSERT_MATCH_CE(t, "{a: {$gt: \"bar\"}}", 5.0); + ASSERT_MATCH_CE(t, "{a: {$gt: \"ing\"}}", 0.0); + ASSERT_MATCH_CE(t, "{a: {$gt: \"zap\"}}", 0.0); + + // Test $lt. + ASSERT_MATCH_CE(t, "{a: {$lt: 3}}", 3.0); + ASSERT_MATCH_CE(t, "{a: {$lt: 1}}", 0.0); + ASSERT_MATCH_CE(t, "{a: {$lt: 0}}", 0.0); + ASSERT_MATCH_CE(t, "{a: {$lt: \"bar\"}}", 0.0); + ASSERT_MATCH_CE(t, "{a: {$lt: \"ing\"}}", 0.0); + ASSERT_MATCH_CE(t, "{a: {$lt: \"zap\"}}", 5.0); + + // Test $gte. + ASSERT_MATCH_CE(t, "{a: {$gte: 3}}", 0.0); + ASSERT_MATCH_CE(t, "{a: {$gte: 1}}", 3.0); + ASSERT_MATCH_CE(t, "{a: {$gte: 0}}", 3.0); + ASSERT_MATCH_CE(t, "{a: {$gte: \"bar\"}}", 5.0); + ASSERT_MATCH_CE(t, "{a: {$gte: \"ing\"}}", 5.0); + ASSERT_MATCH_CE(t, "{a: {$gte: \"zap\"}}", 0.0); + + // Test $lte. + ASSERT_MATCH_CE(t, "{a: {$lte: 3}}", 3.0); + ASSERT_MATCH_CE(t, "{a: {$lte: 1}}", 3.0); + ASSERT_MATCH_CE(t, "{a: {$lte: 0}}", 0.0); + ASSERT_MATCH_CE(t, "{a: {$lte: \"bar\"}}", 0.0); + ASSERT_MATCH_CE(t, "{a: {$lte: \"ing\"}}", 5.0); + ASSERT_MATCH_CE(t, "{a: {$lte: \"zap\"}}", 5.0); +} + +TEST(CEHistogramTest, AssertSmallHistogramEstimatesComplexPredicates) { + const auto collName = "test"; + const auto collCardinality = 9; + + CollectionStatistics collStats(collCardinality); + + // Construct a histogram with three int buckets for field 'a'. + collStats.addHistogram("a", + getHistogramFromData({ + {Value(1), 3 /* frequency */}, + {Value(2), 5 /* frequency */}, + {Value(3), 1 /* frequency */}, + })); + + // Construct a histogram with two int buckets for field 'b'. + collStats.addHistogram("b", + getHistogramFromData({ + {Value(22), 3 /* frequency */}, + {Value(33), 6 /* frequency */}, + })); + + CEHistogramTester t(collName, collCardinality, collStats); + + // Test simple conjunctions on one field. Note the first example: the range we expect to see + // here is (1, 3); however, the structure in the SargableNode gives us a conjunction of two + // intervals instead: (1, "") ^ (nan, 3) This is then estimated using exponential backoff to + // give us a less accurate result. The correct cardinality here would be 5. + ASSERT_MATCH_CE(t, "{a: {$gt: 1}, a: {$lt: 3}}", 5.66); + ASSERT_MATCH_CE(t, "{a: {$gt: 1}, a: {$lte: 3}}", 6.0); + ASSERT_MATCH_CE(t, "{a: {$gte: 1}, a: {$lt: 3}}", 8.0); + ASSERT_MATCH_CE(t, "{a: {$gte: 1}, a: {$lte: 3}}", 9.0); + + // Test ranges which exclude each other. + ASSERT_MATCH_CE(t, "{a: {$lt: 1}, a: {$gt: 3}}", 0.0); + + // Test overlapping ranges. This is a similar case to {a: {$gt: 1}, a: {$lt: 3}} above: we + // expect to see the range [2, 2]; instead, we see the range [nan, 2] ^ [2, ""). + ASSERT_MATCH_CE(t, "{a: {$lte: 2}, a: {$gte: 2}}", 5.66); + + // Test conjunctions over multiple fields for which we have histograms. Here we expect a + // cardinality estimated by exponential backoff. + ASSERT_MATCH_CE(t, "{a: {$eq: 2}, b: {$eq: 22}}", 2.24); + ASSERT_MATCH_CE(t, "{a: {$eq: 11}, b: {$eq: 22}}", 0.0); + ASSERT_MATCH_CE(t, "{a: {$gt: 11}, a: {$lte: 100}, b: {$eq: 22}}", 0.0); + ASSERT_MATCH_CE(t, "{a: {$lt: 3}, a: {$gte: 1}, b: {$lt: 100}, b: {$gt: 30}}", 5.66); + + // Test conjunctions over multiple fields for which we may not have histograms. This falls back + // to heuristic estimation. + ASSERT_MATCH_CE(t, "{a: {$eq: 2}, c: {$eq: 1}}", 0.09); + ASSERT_MATCH_CE(t, "{c: {$eq: 2}, d: {$eq: 22}}", 0.09); +} + +TEST(CEHistogramTest, SanityTestEmptyHistogram) { + const auto collName = "test"; + const auto collCardinality = 0; + + CollectionStatistics collStats(collCardinality); + collStats.addHistogram("empty", std::make_unique<ArrayHistogram>()); + CEHistogramTester t(collName, collCardinality, collStats); + + ASSERT_MATCH_CE(t, "{empty: {$eq: 1.0}}", 0.0); + ASSERT_MATCH_CE(t, "{empty: {$lt: 1.0}, empty: {$gt: 0.0}}", 0.0); + ASSERT_MATCH_CE(t, "{empty: {$eq: 1.0}, other: {$eq: \"anything\"}}", 0.0); + ASSERT_MATCH_CE(t, "{other: {$eq: \"anything\"}, empty: {$eq: 1.0}}", 0.0); +} + +TEST(CEHistogramTest, AssertOneBucketOneIntHistogram) { + const auto collName = "test"; + const auto collCardinality = 50; + + CollectionStatistics collStats(collCardinality); + + // Create a histogram with a single bucket that contains exactly one int (42) with a frequency + // of 50 (equal to the collection cardinality). + collStats.addHistogram("soloInt", + getHistogramFromData({ + {Value(42), collCardinality /* frequency */}, + })); + + CEHistogramTester t(collName, collCardinality, collStats); + + // Check against a variety of intervals that include 42 as a bound. + ASSERT_MATCH_CE(t, "{soloInt: {$eq: 42}}", collCardinality); + ASSERT_MATCH_CE(t, "{soloInt: {$lt: 42}}", 0.0); + ASSERT_MATCH_CE(t, "{soloInt: {$lte: 42}}", collCardinality); + ASSERT_MATCH_CE(t, "{soloInt: {$gt: 42}}", 0.0); + ASSERT_MATCH_CE(t, "{soloInt: {$gte: 42}}", collCardinality); + ASSERT_MATCH_CE(t, "{soloInt: {$gt: 42}, soloInt: {$lt: 42}}", 0.0); + ASSERT_MATCH_CE(t, "{soloInt: {$gt: 42}, soloInt: {$lte: 42}}", 0.0); + ASSERT_MATCH_CE(t, "{soloInt: {$gte: 42}, soloInt: {$lt: 42}}", 0.0); + ASSERT_MATCH_CE(t, "{soloInt: {$gte: 42}, soloInt: {$lte: 42}}", collCardinality); + + // Check against a variety of intervals that include 42 only as one bound. + ASSERT_MATCH_CE(t, "{soloInt: {$gt: 42}, soloInt: {$lt: 43}}", 0.0); + ASSERT_MATCH_CE(t, "{soloInt: {$gt: 42}, soloInt: {$lte: 43}}", 0.0); + ASSERT_MATCH_CE(t, "{soloInt: {$gte: 42}, soloInt: {$lt: 43}}", collCardinality); + ASSERT_MATCH_CE(t, "{soloInt: {$gte: 42}, soloInt: {$lte: 43}}", collCardinality); + ASSERT_MATCH_CE(t, "{soloInt: {$gt: 41}, soloInt: {$lt: 42}}", 0.0); + ASSERT_MATCH_CE(t, "{soloInt: {$gt: 41}, soloInt: {$lte: 42}}", collCardinality); + ASSERT_MATCH_CE(t, "{soloInt: {$gte: 41}, soloInt: {$lt: 42}}", 0.0); + ASSERT_MATCH_CE(t, "{soloInt: {$gte: 41}, soloInt: {$lte: 42}}", collCardinality); + + // Check against a variety of intervals close to 42 using a lower bound of 41 and a higher bound + // of 43. + ASSERT_MATCH_CE(t, "{soloInt: {$eq: 41}}", 0.0); + ASSERT_MATCH_CE(t, "{soloInt: {$eq: 43}}", 0.0); + ASSERT_MATCH_CE(t, "{soloInt: {$lt: 43}}", collCardinality); + ASSERT_MATCH_CE(t, "{soloInt: {$lte: 43}}", collCardinality); + ASSERT_MATCH_CE(t, "{soloInt: {$gt: 41}}", collCardinality); + ASSERT_MATCH_CE(t, "{soloInt: {$gte: 41}}", collCardinality); + ASSERT_MATCH_CE(t, "{soloInt: {$gt: 41}, soloInt: {$lt: 43}}", collCardinality); + ASSERT_MATCH_CE(t, "{soloInt: {$gte: 41}, soloInt: {$lt: 43}}", collCardinality); + ASSERT_MATCH_CE(t, "{soloInt: {$gt: 41}, soloInt: {$lte: 43}}", collCardinality); + ASSERT_MATCH_CE(t, "{soloInt: {$gte: 41}, soloInt: {$lte: 43}}", collCardinality); + + // Check against different types. + // TODO: investigate why this returns 5? + // ASSERT_MATCH_CE(t, "{soloInt: {$eq: [42]}}", 0.0); + // TODO: investigate why this line triggers an out of memory exception. + // ASSERT_MATCH_CE(t, "{soloInt: {$eq: {soloInt: 42}}}", 0.0); + ASSERT_MATCH_CE(t, "{soloInt: {$eq: \"42\"}}", 0.0); + ASSERT_MATCH_CE(t, "{soloInt: {$lt: \"42\"}}", 0.0); + ASSERT_MATCH_CE(t, "{soloInt: {$lt: 42.1}}", collCardinality); +} + +TEST(CEHistogramTest, AssertOneBoundIntRangeHistogram) { + const auto collName = "test"; + const auto collCardinality = 51; + + CollectionStatistics collStats(collCardinality); + + collStats.addHistogram( + "intRange", + getHistogramFromData({ + {Value(10), 5 /* frequency */}, + {Value(20), 1 /* frequency */, 45 /* range frequency */, 10 /* ndv */}, + })); + + CEHistogramTester t(collName, collCardinality, collStats); + + // Test ranges that overlap only with the lower bound. + // Note: 5 values equal 10. + ASSERT_MATCH_CE(t, "{intRange: {$eq: 10}}", 5.0); + ASSERT_MATCH_CE(t, "{intRange: {$lte: 10}}", 5.0); + ASSERT_MATCH_CE(t, "{intRange: {$lte: 10}, intRange: {$gte: 10}}", 5.0); + + // Test ranges that overlap only with the upper bound. + ASSERT_MATCH_CE(t, "{intRange: {$eq: 11}}", 5.0); + ASSERT_MATCH_CE(t, "{intRange: {$eq: 15}}", 5.0); + ASSERT_MATCH_CE(t, "{intRange: {$eq: 15.5}}", 5.0); + ASSERT_MATCH_CE(t, "{intRange: {$eq: 20}}", 1.0); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 20}}", 1.0); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 10}}", 46.0); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 15}}", 23.5); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 15}}", 23.5); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 11}, intRange: {$lte: 20}}", 23.5); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 11}, intRange: {$lte: 20}}", 23.5); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 11}, intRange: {$lt: 20}}", 23.27); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 11}, intRange: {$lt: 20}}", 23.27); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 12}, intRange: {$lt: 15}}", 17.25); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 12}, intRange: {$lt: 15}}", 17.25); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 12}, intRange: {$lte: 15}}", 17.25); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 12}, intRange: {$lte: 15}}", 17.25); + + // Test ranges that partially overlap with the entire histogram. + ASSERT_MATCH_CE(t, "{intRange: {$lt: 11}}", 27.5); + ASSERT_MATCH_CE(t, "{intRange: {$lt: 15}}", 27.5); + ASSERT_MATCH_CE(t, "{intRange: {$lte: 15}}", 27.5); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 8}, intRange: {$lte: 15}}", 27.5); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 8}, intRange: {$lte: 15}}", 27.5); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 8}, intRange: {$lt: 15}}", 27.5); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 8}, intRange: {$lte: 15}}", 27.5); + + // Test ranges that include all values in the histogram. + ASSERT_MATCH_CE(t, "{intRange: {$gte: 10}, intRange: {$lte: 20}}", collCardinality); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 1}, intRange: {$lte: 30}}", collCardinality); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 1}, intRange: {$lt: 30}}", collCardinality); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 1}, intRange: {$lte: 30}}", collCardinality); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 1}, intRange: {$lt: 30}}", collCardinality); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 0}}", collCardinality); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 0}}", collCardinality); + ASSERT_MATCH_CE(t, "{intRange: {$lt: 100}}", collCardinality); + ASSERT_MATCH_CE(t, "{intRange: {$lte: 100}}", collCardinality); + + // Test ranges that are fully included in the histogram. + ASSERT_MATCH_CE(t, "{intRange: {$eq: 10.5}}", 5.0); + ASSERT_MATCH_CE(t, "{intRange: {$eq: 12.5}}", 5.0); + ASSERT_MATCH_CE(t, "{intRange: {$eq: 19.36}}", 5.0); + ASSERT_MATCH_CE(t, "{intRange: {$lt: 19}, intRange: {$gt: 11}}", 17.26); + + // Test ranges that don't overlap with the histogram. + ASSERT_MATCH_CE(t, "{intRange: {$lt: 10}}", 0.0); + ASSERT_MATCH_CE(t, "{intRange: {$lt: 5}}", 0.0); + ASSERT_MATCH_CE(t, "{intRange: {$lte: 5}}", 0.0); + ASSERT_MATCH_CE(t, "{intRange: {$eq: 20.1}}", 0.0); + ASSERT_MATCH_CE(t, "{intRange: {$eq: 21}}", 0.0); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 21}}", 0.0); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 20}}", 0.0); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 100}}", 0.0); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 30}, intRange: {$lte: 50}}", 0.0); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 30}, intRange: {$lt: 50}}", 0.0); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 30}, intRange: {$lt: 50}}", 0.0); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 30}, intRange: {$lte: 50}}", 0.0); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 0}, intRange: {$lte: 5}}", 0.0); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 0}, intRange: {$lt: 5}}", 0.0); + ASSERT_MATCH_CE(t, "{intRange: {$gte: 0}, intRange: {$lt: 5}}", 0.0); + ASSERT_MATCH_CE(t, "{intRange: {$gt: 0}, intRange: {$lte: 5}}", 0.0); +} + +} // namespace +} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/ce_math.cpp b/src/mongo/db/query/ce/ce_math.cpp new file mode 100644 index 00000000000..a5dd7bf04de --- /dev/null +++ b/src/mongo/db/query/ce/ce_math.cpp @@ -0,0 +1,62 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include <algorithm> +#include <cmath> +#include <functional> + +#include "mongo/db/query/ce/ce_math.h" + +namespace mongo::ce { +double conjExponentialBackoff(std::vector<double> conjSelectivities) { + std::sort(conjSelectivities.begin(), conjSelectivities.end()); + double sel = conjSelectivities[0]; + double f = 1.0; + size_t i = 1; + while (i < conjSelectivities.size() && i <= kMaxBackoffIterations) { + f /= 2.0; + sel *= std::pow(conjSelectivities[i], f); + i++; + } + return sel; +} + +double disjExponentialBackoff(std::vector<double> disjSelectivities) { + std::sort(disjSelectivities.begin(), disjSelectivities.end(), std::greater<double>()); + double sel = 1.0 - disjSelectivities[0]; + double f = 1.0; + size_t i = 1; + while (i < disjSelectivities.size() && i <= kMaxBackoffIterations) { + f /= 2.0; + sel *= std::pow(1 - disjSelectivities[i], f); + i++; + } + return 1.0 - sel; +} +} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/ce_math.h b/src/mongo/db/query/ce/ce_math.h new file mode 100644 index 00000000000..3ef529a9224 --- /dev/null +++ b/src/mongo/db/query/ce/ce_math.h @@ -0,0 +1,52 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include <vector> + +namespace mongo::ce { + +/** + * Specifies the maximum number of iterations to use when estimating via exponential backoff. + */ +const size_t kMaxBackoffIterations = 3; + +/** + * Estimates the selectivity of a conjunction given the selectivities of its subexpressions using + * exponential backoff. + */ +double conjExponentialBackoff(std::vector<double> conjSelectivities); + +/** + * Estimates the selectivity of a disjunction given the selectivities of its subexpressions using + * exponential backoff. + */ +double disjExponentialBackoff(std::vector<double> disjSelectivities); +} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/ce_test_utils.cpp b/src/mongo/db/query/ce/ce_test_utils.cpp new file mode 100644 index 00000000000..eb7824eb483 --- /dev/null +++ b/src/mongo/db/query/ce/ce_test_utils.cpp @@ -0,0 +1,76 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/query/ce/ce_test_utils.h" + +#include "mongo/db/query/optimizer/cascades/ce_heuristic.h" +#include "mongo/db/query/optimizer/cascades/logical_props_derivation.h" +#include "mongo/db/query/optimizer/explain.h" +#include "mongo/db/query/optimizer/opt_phase_manager.h" +#include "mongo/db/query/optimizer/utils/unit_test_utils.h" +#include "mongo/unittest/unittest.h" + +namespace mongo::ce { + +using namespace optimizer; +using namespace cascades; + +CETester::CETester(std::string collName, double numRecords) + : _collName(std::move(collName)), _numRecords(std::move(numRecords)) {} + +double CETester::getCE(const std::string& query) { + // Mock opCtx for test. + QueryTestServiceContext serviceContext; + auto opCtx = serviceContext.makeOperationContext(); + + // Mock memo. + ScanDefinition sd({}, {}, {DistributionType::Centralized}, true, _numRecords); + Metadata metadata({{_collName, sd}}); + Memo memo(DebugInfo::kDefaultForTests, + metadata, + std::make_unique<DefaultLogicalPropsDerivation>(), + std::make_unique<HeuristicCE>()); + + // Construct placeholder PhaseManager. + PrefixId prefixId; + OptPhaseManager phaseManager({OptPhaseManager::OptPhase::MemoSubstitutionPhase}, + prefixId, + {{{_collName, {{}, {}}}}}, + DebugInfo::kDefaultForTests); + + // Construct ABT from pipeline and optimize. + ABT abt = translatePipeline("[{$match: " + query + "}]", _collName); + ASSERT_TRUE(phaseManager.optimize(abt)); + + // Get cardinality estimate. + auto cht = getCETransport(opCtx.get()); + return cht->deriveCE(memo, {}, abt.ref()); +} + +} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/ce_test_utils.h b/src/mongo/db/query/ce/ce_test_utils.h new file mode 100644 index 00000000000..e6fc660e27b --- /dev/null +++ b/src/mongo/db/query/ce/ce_test_utils.h @@ -0,0 +1,83 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include "mongo/db/query/optimizer/cascades/interfaces.h" + +namespace mongo { + +namespace optimizer { +namespace cascades { + +// Forward declaration. +class CEInterface; + +} // namespace cascades +} // namespace optimizer + +namespace ce { + +const double kMaxCEError = 0.01; + +/** + * A helpful macro for asserting that the CE of a $match predicate is approximately what we were + * expecting. + */ +#define ASSERT_MATCH_CE(ce, predicate, expectedCE) \ + ASSERT_APPROX_EQUAL(expectedCE, ce.getCE(predicate), kMaxCEError) + +/** + * A test utility class for helping verify the cardinality of CE transports on a given $match + * predicate. + */ +class CETester { +public: + CETester(std::string collName, double numRecords); + + /** + * Returns the estimated cardinality of a given 'matchPredicate'. + */ + double getCE(const std::string& matchPredicate); + +protected: + /** + * Subclasses need to override this method to initialize the transports they are testing. + */ + virtual std::unique_ptr<optimizer::cascades::CEInterface> getCETransport( + OperationContext* opCtx) = 0; + + std::string _collName; + + // The number of records in the collection we are testing. + double _numRecords; +}; + +} // namespace ce +} // namespace mongo diff --git a/src/mongo/db/query/ce/collection_statistics.cpp b/src/mongo/db/query/ce/collection_statistics.cpp new file mode 100644 index 00000000000..397228d785a --- /dev/null +++ b/src/mongo/db/query/ce/collection_statistics.cpp @@ -0,0 +1,63 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/query/ce/collection_statistics.h" +#include "mongo/util/assert_util.h" + +namespace mongo::ce { + +bool CollectionStatistics::hasCollectionStatistics(const NamespaceString& nss) { + return false; // TODO: actually check if we have statistics for 'nss' here. +} + +const CollectionStatistics& CollectionStatistics::getCollectionStatistics( + const NamespaceString& nss) { + MONGO_UNIMPLEMENTED; // TODO: actually get statistics here. +} + +CollectionStatistics::CollectionStatistics(double cardinality) + : _cardinality{cardinality}, _histograms{} {}; + +double CollectionStatistics::getCardinality() const { + return _cardinality; +} + +void CollectionStatistics::addHistogram(const std::string& path, + std::unique_ptr<ArrayHistogram> histogram) { + _histograms[path] = std::move(histogram); +} + +const ArrayHistogram* CollectionStatistics::getHistogram(const std::string& path) const { + if (auto mapIt = _histograms.find(path); mapIt != _histograms.end()) { + return mapIt->second.get(); + } + return nullptr; +} + +} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/collection_statistics.h b/src/mongo/db/query/ce/collection_statistics.h new file mode 100644 index 00000000000..4ac133129cf --- /dev/null +++ b/src/mongo/db/query/ce/collection_statistics.h @@ -0,0 +1,76 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include "mongo/db/namespace_string.h" +#include "mongo/db/query/ce/array_histogram.h" + +namespace mongo::ce { + +using Histograms = std::map<std::string, std::unique_ptr<ArrayHistogram>>; + +class CollectionStatistics { +public: + /** + * Returns whether collection statistics for a collection with namespace 'nss' are available. + */ + static bool hasCollectionStatistics(const NamespaceString& nss); + + /** + * Retrieves the collection statistics for a collection with namespace 'nss'. + * + * Note: Must check hasCollectionStatistics(nss) first, as this will throw if statistics are + * unavailable for 'nss'. + */ + static const CollectionStatistics& getCollectionStatistics(const NamespaceString& nss); + + CollectionStatistics(double cardinality); + + /** + * Returns the cardinality of the given collection. + */ + double getCardinality() const; + + /** + * Adds a histogram along the given path. + */ + void addHistogram(const std::string& path, std::unique_ptr<ArrayHistogram> histogram); + + /** + * Returns the histogram for the given field path, or nullptr if none exists. + */ + const ArrayHistogram* getHistogram(const std::string& path) const; + +private: + double _cardinality; + Histograms _histograms; +}; + +} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/histogram.cpp b/src/mongo/db/query/ce/histogram.cpp new file mode 100644 index 00000000000..2359929707d --- /dev/null +++ b/src/mongo/db/query/ce/histogram.cpp @@ -0,0 +1,220 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/query/ce/histogram.h" + +namespace mongo::ce { + +using namespace sbe; + +Bucket::Bucket( + double equalFreq, double rangeFreq, double cumulativeFreq, double ndv, double cumulativeNDV) + : _equalFreq(equalFreq), + _rangeFreq(rangeFreq), + _cumulativeFreq(cumulativeFreq), + _ndv(ndv), + _cumulativeNDV(cumulativeNDV) { + uassert(6695702, "Invalid equalFreq", _equalFreq >= 0.0); + uassert(6695703, "Invalid rangeFreq", _rangeFreq >= 0.0); + uassert(6695704, "Invalid ndv", _ndv <= _rangeFreq); + uassert(6695705, "Invalid cumulative frequency", _cumulativeFreq >= _equalFreq + _rangeFreq); + uassert(6695706, "Invalid cumulative ndv", _cumulativeNDV >= _ndv + 1.0); +} + +std::string Bucket::toString() const { + std::ostringstream os; + os << "equalFreq: " << _equalFreq << ", rangeFreq: " << _rangeFreq + << ", cumulativeFreq: " << _cumulativeFreq << ", ndv: " << _ndv + << ", cumulativeNDV: " << _cumulativeNDV; + return os.str(); +} + +Histogram::Histogram() : Histogram({}, {}) {} + +Histogram::Histogram(value::Array bounds, std::vector<Bucket> buckets) + : _bounds(std::move(bounds)), _buckets(std::move(buckets)) { + uassert(6695707, "Invalid sizes", bounds.size() == buckets.size()); +} + +std::string Histogram::toString() const { + std::ostringstream os; + os << "["; + for (size_t i = 0; i < _buckets.size(); i++) { + os << "{val: " << _bounds.getAt(i) << ", " << _buckets.at(i).toString() << "}"; + if (_buckets.size() - i > 1) + os << ","; + } + os << "]"; + return os.str(); +} + +std::string Histogram::plot() const { + std::ostringstream os; + double maxFreq = 0; + const double maxBucketSize = 100; + + for (const auto& bucket : _buckets) { + double maxBucketFreq = std::max(bucket._equalFreq, bucket._rangeFreq); + maxFreq = std::max(maxFreq, maxBucketFreq); + } + + std::vector<std::pair<double, std::string>> headers; + size_t maxHeaderSize = 0; + for (size_t i = 0; i < _buckets.size(); ++i) { + std::ostringstream rngHeader; + std::ostringstream eqlHeader; + double scaledRngF = maxBucketSize * _buckets[i]._rangeFreq / maxFreq; + double scaledEqlF = maxBucketSize * _buckets[i]._equalFreq / maxFreq; + rngHeader << _bounds.getAt(i) << ": " << _buckets[i]._rangeFreq; + eqlHeader << _bounds.getAt(i) << ": " << _buckets[i]._equalFreq; + auto rngStr = rngHeader.str(); + maxHeaderSize = std::max(maxHeaderSize, rngStr.size()); + headers.emplace_back(scaledRngF, rngStr); + auto eqlStr = eqlHeader.str(); + maxHeaderSize = std::max(maxHeaderSize, eqlStr.size()); + headers.emplace_back(scaledEqlF, eqlStr); + } + + const std::string maxLine(maxBucketSize + maxHeaderSize + 3, '-'); + os << maxLine << "\n"; + for (size_t j = 0; j < headers.size(); ++j) { + auto header = headers.at(j); + header.second.resize(maxHeaderSize, ' '); + const std::string bar(std::round(header.first), '*'); + os << header.second << " | " << bar << "\n"; + } + os << maxLine << "\n"; + + return os.str(); +} + +EstimationResult Histogram::getTotals() const { + if (_buckets.empty()) { + return {0.0, 0.0}; + } + + const Bucket& last = _buckets.back(); + return {last._cumulativeFreq, last._cumulativeNDV}; +} + +EstimationResult Histogram::estimate(value::TypeTags tag, + value::Value val, + EstimationType type) const { + switch (type) { + case EstimationType::kGreater: + return getTotals() - estimate(tag, val, EstimationType::kLessOrEqual); + + case EstimationType::kGreaterOrEqual: + return getTotals() - estimate(tag, val, EstimationType::kLess); + + default: + // Continue. + break; + } + + size_t bucketIndex = 0; + { + size_t len = _buckets.size(); + while (len > 0) { + const size_t half = len >> 1; + const auto [boundTag, boundVal] = _bounds.getAt(bucketIndex + half); + + if (compareValues3w(boundTag, boundVal, tag, val) < 0) { + bucketIndex += half + 1; + len -= half + 1; + } else { + len = half; + } + } + } + if (bucketIndex == _buckets.size()) { + // Value beyond the largest endpoint. + switch (type) { + case EstimationType::kEqual: + return {0.0, 0.0}; + + case EstimationType::kLess: + case EstimationType::kLessOrEqual: + return getTotals(); + + default: + MONGO_UNREACHABLE; + } + } + + const Bucket& bucket = _buckets.at(bucketIndex); + const auto [boundTag, boundVal] = _bounds.getAt(bucketIndex); + const bool isEndpoint = compareValues3w(boundTag, boundVal, tag, val) == 0; + + switch (type) { + case EstimationType::kEqual: { + if (isEndpoint) { + return {bucket._equalFreq, 1.0}; + } + return {(bucket._ndv == 0.0) ? 0.0 : bucket._rangeFreq / bucket._ndv, 1.0}; + } + + case EstimationType::kLess: { + double resultCard = bucket._cumulativeFreq - bucket._equalFreq; + double resultNDV = bucket._cumulativeNDV - 1.0; + + if (!isEndpoint) { + // TODO: consider value interpolation instead of assigning 50% of the weight. + resultCard -= bucket._rangeFreq / 2.0; + resultNDV -= bucket._ndv / 2.0; + } + return {resultCard, resultNDV}; + } + + case EstimationType::kLessOrEqual: { + double resultCard = bucket._cumulativeFreq; + double resultNDV = bucket._cumulativeNDV; + + if (!isEndpoint) { + // TODO: consider value interpolation instead of assigning 50% of the weight. + resultCard -= bucket._equalFreq + bucket._rangeFreq / 2.0; + resultNDV -= 1.0 + bucket._ndv / 2.0; + } + return {resultCard, resultNDV}; + } + + default: + MONGO_UNREACHABLE; + } +} + +const value::Array& Histogram::getBounds() const { + return _bounds; +} + +const std::vector<Bucket>& Histogram::getBuckets() const { + return _buckets; +} + +} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/histogram.h b/src/mongo/db/query/ce/histogram.h new file mode 100644 index 00000000000..d8e051b0c47 --- /dev/null +++ b/src/mongo/db/query/ce/histogram.h @@ -0,0 +1,125 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include <utility> +#include <vector> + +#include "mongo/db/query/ce/utils.h" + +namespace mongo::ce { + +using namespace sbe; + +/** + * Statistics related to a single histogram bucket. The boundary value is kept in a separate array, + * so that each bucket has a corresponding boundary value. The reason for this to manage the memory + * of values. + */ +struct Bucket { + Bucket(double equalFreq, + double rangeFreq, + double cumulativeFreq, + double ndv, + double cumulativeNDV); + + std::string toString() const; + + // Frequency of the bound value itself. + double _equalFreq; + + // Frequency of other values. + double _rangeFreq; + + // Sum of frequencies of preceding buckets to avoid recomputing. Includes both _equalFreq and + // _rangeFreq. + double _cumulativeFreq; + + // Number of distinct values in this bucket, excludes the bound. + double _ndv; + + double _cumulativeNDV; + + // TODO: add other statistical values like average size, etc +}; + +enum class EstimationType { kEqual, kLess, kLessOrEqual, kGreater, kGreaterOrEqual }; + +const stdx::unordered_map<EstimationType, std::string> estimationTypeName = { + {EstimationType::kEqual, "eq"}, + {EstimationType::kLess, "lt"}, + {EstimationType::kLessOrEqual, "lte"}, + {EstimationType::kGreater, "gt"}, + {EstimationType::kGreaterOrEqual, "gte"}}; + +struct EstimationResult { + double _card; + double _ndv; + + EstimationResult operator-(const EstimationResult& other) const { + return {_card - other._card, _ndv - other._ndv}; + } +}; + +/** + * A Histogram over a set of values. The histogram consists of two parallel vectors - one with the + * individual value statistics, and another one with the actual boundary values. + */ +class Histogram { +public: + Histogram(); + Histogram(value::Array bounds, std::vector<Bucket> buckets); + + std::string toString() const; + std::string plot() const; + + EstimationResult getTotals() const; + EstimationResult estimate(value::TypeTags tag, value::Value val, EstimationType type) const; + + const value::Array& getBounds() const; + const std::vector<Bucket>& getBuckets() const; + + bool empty() const { + return _buckets.empty(); + } + +private: + // Bucket bounds representing the **highest** value in each bucket. + value::Array _bounds; + + std::vector<Bucket> _buckets; + + // TODO: add counts for types of values not in histogram: arrays, objects, null, missing, etc. + + // TODO: _fieldPath - how to represent? + // TODO: other metadata? +}; + +} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/utils.cpp b/src/mongo/db/query/ce/utils.cpp new file mode 100644 index 00000000000..5ee6f3a083e --- /dev/null +++ b/src/mongo/db/query/ce/utils.cpp @@ -0,0 +1,140 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/query/ce/utils.h" + +#include "mongo/db/query/ce/histogram.h" + +/** + * Note: This is a temporary file used for histogram generation until the histogram implementation + * is finalized. + */ + +namespace mongo::ce { + +using namespace sbe; + +SBEValue::SBEValue(value::TypeTags tag, value::Value val) : _tag(tag), _val(val) {} + +SBEValue::SBEValue(std::pair<value::TypeTags, value::Value> v) : SBEValue(v.first, v.second) {} + +SBEValue::SBEValue(const SBEValue& other) { + auto [tag, val] = copyValue(other._tag, other._val); + _tag = tag; + _val = val; +} + +SBEValue::SBEValue(SBEValue&& other) { + _tag = other._tag; + _val = other._val; + + other._tag = value::TypeTags::Nothing; + other._val = 0; +} + +SBEValue::~SBEValue() { + value::releaseValue(_tag, _val); +} + +SBEValue& SBEValue::operator=(const SBEValue& other) { + value::releaseValue(_tag, _val); + + auto [tag, val] = copyValue(other._tag, other._val); + _tag = tag; + _val = val; + return *this; +} + +SBEValue& SBEValue::operator=(SBEValue&& other) { + value::releaseValue(_tag, _val); + + _tag = other._tag; + _val = other._val; + + other._tag = value::TypeTags::Nothing; + other._val = 0; + + return *this; +} + +std::pair<value::TypeTags, value::Value> SBEValue::get() const { + return std::make_pair(_tag, _val); +} + +value::TypeTags SBEValue::getTag() const { + return _tag; +} + +value::Value SBEValue::getValue() const { + return _val; +} + +std::pair<value::TypeTags, value::Value> makeInt64Value(const int v) { + return std::make_pair(value::TypeTags::NumberInt64, value::bitcastFrom<int64_t>(v)); +}; + +std::pair<value::TypeTags, value::Value> makeNullValue() { + return std::make_pair(value::TypeTags::Null, 0); +}; + +bool sameTypeClass(value::TypeTags tag1, value::TypeTags tag2) { + if (tag1 == tag2) { + return true; + } + + static constexpr const char* kTempFieldName = "temp"; + + BSONObjBuilder minb1; + minb1.appendMinForType(kTempFieldName, value::tagToType(tag1)); + const BSONObj min1 = minb1.obj(); + + BSONObjBuilder minb2; + minb2.appendMinForType(kTempFieldName, value::tagToType(tag2)); + const BSONObj min2 = minb2.obj(); + + return min1.woCompare(min2) == 0; +} + +int32_t compareValues3w(value::TypeTags tag1, + value::Value val1, + value::TypeTags tag2, + value::Value val2) { + const auto [compareTag, compareVal] = value::compareValue(tag1, val1, tag2, val2); + uassert(6695716, "Invalid comparison result", compareTag == value::TypeTags::NumberInt32); + return value::bitcastTo<int32_t>(compareVal); +} + +void sortValueVector(std::vector<SBEValue>& sortVector) { + const auto cmp = [](const SBEValue& a, const SBEValue& b) { + return compareValues3w(a.getTag(), a.getValue(), b.getTag(), b.getValue()) < 0; + }; + std::sort(sortVector.begin(), sortVector.end(), cmp); +} + +} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/utils.h b/src/mongo/db/query/ce/utils.h new file mode 100644 index 00000000000..61e39c0bc84 --- /dev/null +++ b/src/mongo/db/query/ce/utils.h @@ -0,0 +1,79 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include "mongo/db/exec/sbe/values/value.h" + +/** + * Note: This is a temporary file used for histogram generation until the histogram implementation + * is finalized. + */ + +namespace mongo::ce { + +using namespace sbe; + +enum class EstimationType; +class Histogram; + +class SBEValue { +public: + SBEValue(value::TypeTags tag, value::Value val); + SBEValue(std::pair<value::TypeTags, value::Value> v); + ~SBEValue(); + + SBEValue(const SBEValue& other); + SBEValue(SBEValue&& other); + + SBEValue& operator=(const SBEValue& other); + SBEValue& operator=(SBEValue&& other); + + std::pair<value::TypeTags, value::Value> get() const; + value::TypeTags getTag() const; + value::Value getValue() const; + +private: + value::TypeTags _tag; + value::Value _val; +}; + +std::pair<value::TypeTags, value::Value> makeInt64Value(int v); +std::pair<value::TypeTags, value::Value> makeNullValue(); + +bool sameTypeClass(value::TypeTags tag1, value::TypeTags tag2); + +int32_t compareValues3w(value::TypeTags tag1, + value::Value val1, + value::TypeTags tag2, + value::Value val2); + +void sortValueVector(std::vector<SBEValue>& sortVector); + +} // namespace mongo::ce diff --git a/src/mongo/db/query/ce_mode_parameter.cpp b/src/mongo/db/query/ce_mode_parameter.cpp new file mode 100644 index 00000000000..8c019d25746 --- /dev/null +++ b/src/mongo/db/query/ce_mode_parameter.cpp @@ -0,0 +1,40 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/query/ce_mode_parameter.h" +#include "mongo/db/query/query_knobs_gen.h" + +namespace mongo::ce { +Status validateCEMode(const std::string& value) { + if (value == kHeuristic || value == kHistogram || value == kSampling) { + return Status::OK(); + } + return Status(ErrorCodes::Error{6695700}, "Invalid cardinality estimation mode."); +} +} // namespace mongo::ce diff --git a/src/mongo/db/query/ce_mode_parameter.h b/src/mongo/db/query/ce_mode_parameter.h new file mode 100644 index 00000000000..b267067747f --- /dev/null +++ b/src/mongo/db/query/ce_mode_parameter.h @@ -0,0 +1,47 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include <string> + +#include "mongo/base/status.h" + +namespace mongo::ce { + +/** + * Defines cardinality estimation modes. + */ +const std::string kHeuristic = "heuristic"; +const std::string kHistogram = "histogram"; +const std::string kSampling = "sampling"; + +Status validateCEMode(const std::string& value); + +} // namespace mongo::ce diff --git a/src/mongo/db/query/ce_mode_parameter_test.cpp b/src/mongo/db/query/ce_mode_parameter_test.cpp new file mode 100644 index 00000000000..66e4b23bc38 --- /dev/null +++ b/src/mongo/db/query/ce_mode_parameter_test.cpp @@ -0,0 +1,47 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/query/ce_mode_parameter.h" + +#include "mongo/unittest/unittest.h" + +namespace mongo::ce { + +TEST(CEModeParameterTest, ValidatesValidCEModes) { + ASSERT_OK(validateCEMode("heuristic")); + ASSERT_OK(validateCEMode("histogram")); + ASSERT_OK(validateCEMode("sampling")); +} + +TEST(CEModeParameterTest, RejectsInvalidCEModes) { + ASSERT_NOT_OK(validateCEMode("notamode")); + ASSERT_NOT_OK(validateCEMode("")); +} + +} // namespace mongo::ce diff --git a/src/mongo/db/query/query_knobs.idl b/src/mongo/db/query/query_knobs.idl index 4b7198314f2..0f64623fa63 100644 --- a/src/mongo/db/query/query_knobs.idl +++ b/src/mongo/db/query/query_knobs.idl @@ -29,6 +29,7 @@ global: cpp_namespace: "mongo" cpp_includes: + - "mongo/db/query/ce_mode_parameter.h" - "mongo/db/query/plan_cache_size_parameter.h" - "mongo/db/query/sbe_plan_cache_on_parameter_change.h" - "mongo/platform/atomic_proxy.h" @@ -713,14 +714,15 @@ server_parameters: expr: 100 * 1024 * 1024 validator: gt: 0 - - internalQueryEnableSamplingCardinalityEstimator: - description: "Set to use the sampling-based method for estimating cardinality in the Cascades - optimizer." + + internalQueryCardinalityEstimatorMode: + description: "Set to select a method for estimating cardinality in the Cascades optimizer." set_at: [ startup, runtime ] - cpp_varname: "internalQueryEnableSamplingCardinalityEstimator" - cpp_vartype: AtomicWord<bool> - default: true + cpp_varname: "internalQueryCardinalityEstimatorMode" + cpp_vartype: std::string + default: sampling + validator: + callback: ce::validateCEMode internalQueryEnableCascadesOptimizer: description: "Set to use the new optimizer path, must be used in conjunction with the feature |