diff options
author | Timour Katchaounov <timour.katchaounov@mongodb.com> | 2022-07-19 09:57:59 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2022-07-19 10:30:38 +0000 |
commit | 4af213f08a582a90534f5f98a9cb1d0976e4945b (patch) | |
tree | dc6b991beacaa4c91dfe5a1d7d9874a548bf946a | |
parent | 0fa5c9a8075ef9d27d3d94b68876a6fbd5afd54f (diff) | |
download | mongo-4af213f08a582a90534f5f98a9cb1d0976e4945b.tar.gz |
SERVER-67166 Heuristic CE for SargableNode
-rw-r--r-- | src/mongo/db/commands/cqf/cqf_aggregate.cpp | 2 | ||||
-rw-r--r-- | src/mongo/db/query/ce/SConscript | 11 | ||||
-rw-r--r-- | src/mongo/db/query/ce/ce_heuristic_test.cpp | 218 | ||||
-rw-r--r-- | src/mongo/db/query/ce/ce_histogram.cpp | 18 | ||||
-rw-r--r-- | src/mongo/db/query/ce/ce_histogram.h | 2 | ||||
-rw-r--r-- | src/mongo/db/query/ce/ce_histogram_test.cpp | 12 | ||||
-rw-r--r-- | src/mongo/db/query/ce/ce_test_utils.cpp | 48 | ||||
-rw-r--r-- | src/mongo/db/query/ce/ce_test_utils.h | 37 | ||||
-rw-r--r-- | src/mongo/db/query/optimizer/SConscript | 1 | ||||
-rw-r--r-- | src/mongo/db/query/optimizer/cascades/ce_heuristic.cpp | 136 | ||||
-rw-r--r-- | src/mongo/db/query/optimizer/interval_intersection_test.cpp | 4 | ||||
-rw-r--r-- | src/mongo/db/query/optimizer/logical_rewriter_optimizer_test.cpp | 8 | ||||
-rw-r--r-- | src/mongo/db/query/optimizer/physical_rewriter_optimizer_test.cpp | 125 | ||||
-rw-r--r-- | src/mongo/db/query/optimizer/utils/ce_math.cpp (renamed from src/mongo/db/query/ce/ce_math.cpp) | 44 | ||||
-rw-r--r-- | src/mongo/db/query/optimizer/utils/ce_math.h (renamed from src/mongo/db/query/ce/ce_math.h) | 27 |
15 files changed, 604 insertions, 89 deletions
diff --git a/src/mongo/db/commands/cqf/cqf_aggregate.cpp b/src/mongo/db/commands/cqf/cqf_aggregate.cpp index c51302d28a8..304ea553f69 100644 --- a/src/mongo/db/commands/cqf/cqf_aggregate.cpp +++ b/src/mongo/db/commands/cqf/cqf_aggregate.cpp @@ -493,7 +493,7 @@ std::unique_ptr<PlanExecutor, PlanExecutor::Deleter> getSBEExecutorViaCascadesOp } else if (internalQueryCardinalityEstimatorMode == ce::kHistogram && ce::CollectionStatistics::hasCollectionStatistics(nss)) { const auto& stats = ce::CollectionStatistics::getCollectionStatistics(nss); - auto ceDerivation = std::make_unique<CEHistogramTransport>(opCtx, stats); + auto ceDerivation = std::make_unique<CEHistogramTransport>(stats); OptPhaseManager phaseManager{OptPhaseManager::getAllRewritesSet(), prefixId, false /*requireRID*/, diff --git a/src/mongo/db/query/ce/SConscript b/src/mongo/db/query/ce/SConscript index 4941d74512d..46c8a254e75 100644 --- a/src/mongo/db/query/ce/SConscript +++ b/src/mongo/db/query/ce/SConscript @@ -9,7 +9,6 @@ env.Library( source=[ 'array_histogram.cpp', 'ce_histogram.cpp', - 'ce_math.cpp', 'ce_sampling.cpp', 'ce_estimation.cpp', 'collection_statistics.cpp', @@ -46,3 +45,13 @@ env.CppUnitTest( 'ce_test_utils', ], ) + +env.CppUnitTest( + target="ce_heuristic_test", + source=[ + "ce_heuristic_test.cpp", + ], + LIBDEPS=[ + 'ce_test_utils', + ], +) diff --git a/src/mongo/db/query/ce/ce_heuristic_test.cpp b/src/mongo/db/query/ce/ce_heuristic_test.cpp new file mode 100644 index 00000000000..f8a98c1001e --- /dev/null +++ b/src/mongo/db/query/ce/ce_heuristic_test.cpp @@ -0,0 +1,218 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include <string> + +#include "mongo/db/query/ce/ce_test_utils.h" +#include "mongo/db/query/optimizer/cascades/ce_heuristic.h" +#include "mongo/db/query/optimizer/cascades/cost_derivation.h" +#include "mongo/db/query/optimizer/cascades/logical_props_derivation.h" +#include "mongo/db/query/optimizer/cascades/memo.h" +#include "mongo/db/query/optimizer/defs.h" +#include "mongo/db/query/optimizer/explain.h" +#include "mongo/db/query/optimizer/metadata.h" +#include "mongo/db/query/optimizer/opt_phase_manager.h" +#include "mongo/db/query/optimizer/utils/unit_test_utils.h" +#include "mongo/db/query/optimizer/utils/utils.h" +#include "mongo/unittest/unittest.h" + +namespace mongo::ce { +namespace { + +using namespace optimizer; +using namespace optimizer::cascades; + +constexpr double kCollCard = 10000.0; +const std::string collName = "test"; + +class HeuristicCETester : public CETester { +public: + HeuristicCETester(std::string collName) : CETester(collName, 0) {} + +protected: + std::unique_ptr<CEInterface> getCETransport() const override { + return std::make_unique<HeuristicCE>(); + } +}; + +HeuristicCETester ht(collName); + +TEST(CEHeuristicTest, CEwithoutOptimization) { + std::string query = "{a0 : {$gt : 14, $lt : 21}}"; + ht.setCollCard(kCollCard); + double ce = ht.getCE(query, 0); + ASSERT_APPROX_EQUAL(100.0, ce, kMaxCEError); +} + +TEST(CEHeuristicTest, CEAfterMemoSubstitutionPhase_Eq) { + std::string query = "{a : 123}"; + ASSERT_MATCH_CE_CARD(ht, query, 0.0, 0.0); + ASSERT_MATCH_CE_CARD(ht, query, 1.73205, 3.0); + ASSERT_MATCH_CE_CARD(ht, query, 2.64575, 7.0); + ASSERT_MATCH_CE_CARD(ht, query, 3.16228, 10.0); + ASSERT_MATCH_CE_CARD(ht, query, 10.0, 100.0); + ASSERT_MATCH_CE_CARD(ht, query, 100.0, 10000.0); +} + +TEST(CEHeuristicTest, CEAfterMemoSubstitutionPhase_Gt) { + std::string query = "{a: {$gt: 44}}"; + ASSERT_MATCH_CE_CARD(ht, query, 0.01, 0.0); + ASSERT_MATCH_CE_CARD(ht, query, 0.7, 1.0); + ASSERT_MATCH_CE_CARD(ht, query, 6.3, 9.0); + ASSERT_MATCH_CE_CARD(ht, query, 44.55, 99.0); + ASSERT_MATCH_CE_CARD(ht, query, 330, 1000.0); +} + +TEST(CEHeuristicTest, CEAfterMemoSubstitutionPhase_Gt_Lt) { + std::string query = "{a: {$gt: 44, $lt: 99}}"; + ASSERT_MATCH_CE_CARD(ht, query, 0.585662, 1.0); + ASSERT_MATCH_CE_CARD(ht, query, 5.27096, 9.0); + ASSERT_MATCH_CE_CARD(ht, query, 29.885, 99.0); + ASSERT_MATCH_CE_CARD(ht, query, 189.571, 1000.0); +} + +TEST(CEHeuristicTest, CEAfterMemoSubstitutionPhase_AND2Eq) { + std::string query = "{a : 13, b : 42}"; + ASSERT_MATCH_CE_CARD(ht, query, 1.31607, 3.0); + ASSERT_MATCH_CE_CARD(ht, query, 1.62658, 7.0); + ASSERT_MATCH_CE_CARD(ht, query, 1.77828, 10.0); + ASSERT_MATCH_CE_CARD(ht, query, 3.16228, 100.0); + ASSERT_MATCH_CE_CARD(ht, query, 10.0, 10000.0); +} + +TEST(CEHeuristicTest, CEAfterMemoSubstitutionPhase_AND3Eq) { + std::string query = "{a : 13, b : 42, c : 69}"; + ASSERT_MATCH_CE_CARD(ht, query, 1.1472, 3.0); + ASSERT_MATCH_CE_CARD(ht, query, 1.27537, 7.0); + ASSERT_MATCH_CE_CARD(ht, query, 1.33352, 10.0); + ASSERT_MATCH_CE_CARD(ht, query, 1.77828, 100.0); + ASSERT_MATCH_CE_CARD(ht, query, 3.16228, 10000.0); +} + +TEST(CEHeuristicTest, CEAfterMemoSubstitutionPhase_OR1path) { + std::string query = "{$or: [{a0: {$gt: 44}}, {a0: {$lt: 9}}]}"; + ASSERT_MATCH_CE_CARD(ht, query, 7.52115, 9.0); + ASSERT_MATCH_CE_CARD(ht, query, 58.6188, 99.0); + ASSERT_MATCH_CE_CARD(ht, query, 451.581, 1000.0); +} + +TEST(CEHeuristicTest, CEAfterMemoSubstitutionPhase_OR2paths) { + std::string query = "{$or: [{a0: {$gt:44}}, {b0: {$lt: 9}}]}"; + // TODO: Disjunctions on different paths are not SARGable, and are represented as a single + // FilterNode. FilterNode is currenlty estimated to be always 10% selectivity. + ASSERT_MATCH_CE_CARD(ht, query, 0.9, 9.0); + ASSERT_MATCH_CE_CARD(ht, query, 9.9, 99.0); + ASSERT_MATCH_CE_CARD(ht, query, 100.0, 1000.0); +} + +TEST(CEHeuristicTest, CEAfterMemoSubstitutionPhase_DNF1pathSimple) { + std::string query = + "{$or: [" + "{$and: [{a0: {$gt: 9}}, {a0: {$lt: 12}}]}," + "{$and: [{a0: {$gt:40}}, {a0: {$lt: 44}}]}" + "]}"; + ASSERT_MATCH_CE_CARD(ht, query, 6.59965, 9.0); + ASSERT_MATCH_CE_CARD(ht, query, 41.2515, 99.0); + ASSERT_MATCH_CE_CARD(ht, query, 270.42, 1000.0); +} + +TEST(CEHeuristicTest, CEAfterMemoSubstitutionPhase_DNF1pathComplex) { + // Each disjunct has different number of conjuncts, so that its selectivity is different. + // We need 5 disjuncts to test exponential backoff which cuts off at the first 4. + // The conjuncts are in selectivity order. + std::string query1 = + "{$or: [" + "{$and: [{a0: {$gt: 9}}, {a0: {$lt: 12}}]}," + "{$and: [{a0: {$gt: 9}}, {a0: {$lt: 12}}, {a0: {$gt: 42}}]}," + "{$and: [{a0: {$gt:40}}, {a0: {$lt: 99}}, {a0: {$gt: 42}}, {a0: {$lt: 88}}]}," + "{$and: [{a0: {$gt:40}}, {a0: {$lt: 99}}, {a0: {$gt: 42}}, {a0: {$lt: 88}}, {a0: {$lt: " + "81}}]}," + "{$and: [{a0: {$gt:40}}, {a0: {$lt: 99}}, {a0: {$gt: 42}}, {a0: {$lt: 88}}, {a0: {$lt: " + "81}}, {a0: {$lt: 77}}]}" + "]}"; + auto ce1 = ht.getCE(query1); + // The conjuncts are in inverse selectivity order. + std::string query2 = + "{$or: [" + "{$and: [{a0: {$gt:40}}, {a0: {$lt: 99}}, {a0: {$gt: 42}}, {a0: {$lt: 88}}, {a0: {$lt: " + "81}}, {a0: {$lt: 77}}]}," + "{$and: [{a0: {$gt:40}}, {a0: {$lt: 99}}, {a0: {$gt: 42}}, {a0: {$lt: 88}}, {a0: {$lt: " + "81}}]}," + "{$and: [{a0: {$gt:40}}, {a0: {$lt: 99}}, {a0: {$gt: 42}}, {a0: {$lt: 88}}]}," + "{$and: [{a0: {$gt: 9}}, {a0: {$lt: 12}}, {a0: {$gt: 42}}]}," + "{$and: [{a0: {$gt: 9}}, {a0: {$lt: 12}}]}" + "]}"; + auto ce2 = ht.getCE(query2); + ASSERT_APPROX_EQUAL(ce1, ce2, kMaxCEError); +} + +TEST(CEHeuristicTest, CEAfterMemoSubstitutionPhase_DNF2paths) { + // TODO: Disjunctions on different paths are not SARGable, and are represented as a single + // FilterNode. FilterNode is currently estimated to be always 10% selectivity. + std::string query = + "{$or: [" + "{$and: [{a0: {$gt: 9}}, {a0: {$lt: 12}}]}," + "{$and: [{b0: {$gt:40}}, {b0: {$lt: 44}}]}" + "]}"; + ASSERT_MATCH_CE_CARD(ht, query, 0.9, 9.0); + ASSERT_MATCH_CE_CARD(ht, query, 9.9, 99.0); + ASSERT_MATCH_CE_CARD(ht, query, 100.0, 1000.0); +} + +TEST(CEHeuristicTest, CEAfterMemoSubstitutionPhase_CNF1path) { + std::string query = + "{$and : [" + "{$or : [ {a0 : {$gt : 11}}, {a0 : {$lt : 44}} ]}," + "{$or : [ {a0 : {$gt : 77}}, {a0 : {$eq : 51}} ]}" + "]}"; + ASSERT_MATCH_CE_CARD(ht, query, 6.87663, 9.0); + ASSERT_MATCH_CE_CARD(ht, query, 42.7435, 99.0); + ASSERT_MATCH_CE_CARD(ht, query, 275.419, 1000.0); +} + +TEST(CEHeuristicTest, CEAfterMemoSubstitutionPhase_CNF2paths) { + std::string query = + "{$and : [" + "{$or : [ {a0 : {$gt : 11}}, {a0 : {$lt : 44}} ]}," + "{$or : [ {b0 : {$gt : 77}}, {b0 : {$eq : 51}} ]}" + "]}"; + ASSERT_MATCH_CE_CARD(ht, query, 6.21212, 9.0); + ASSERT_MATCH_CE_CARD(ht, query, 36.4418, 99.0); + ASSERT_MATCH_CE_CARD(ht, query, 228.935, 1000.0); +} + +TEST(CEHeuristicTest, CEAfterMemoSubstitutionExplorationPhases) { + std::string query = "{a : 13, b : 42}"; + ht.setCollCard(kCollCard); + double ce = ht.getCE(query, 2); + ASSERT_APPROX_EQUAL(10.0, ce, kMaxCEError); +} + +} // namespace +} // namespace mongo::ce diff --git a/src/mongo/db/query/ce/ce_histogram.cpp b/src/mongo/db/query/ce/ce_histogram.cpp index 1a04b740333..7ce299b4707 100644 --- a/src/mongo/db/query/ce/ce_histogram.cpp +++ b/src/mongo/db/query/ce/ce_histogram.cpp @@ -31,11 +31,11 @@ #include "mongo/db/query/ce/ce_estimation.h" #include "mongo/db/query/ce/ce_histogram.h" -#include "mongo/db/query/ce/ce_math.h" #include "mongo/db/query/ce/collection_statistics.h" #include "mongo/db/query/optimizer/cascades/ce_heuristic.h" #include "mongo/db/query/optimizer/utils/abt_hash.h" +#include "mongo/db/query/optimizer/utils/ce_math.h" #include "mongo/db/query/optimizer/utils/memo_utils.h" namespace mongo::optimizer::cascades { @@ -44,8 +44,8 @@ using namespace properties; class CEHistogramTransportImpl { public: - CEHistogramTransportImpl(OperationContext* opCtx, const ce::CollectionStatistics& stats) - : _opCtx(opCtx), _heuristicCE(), _stats(stats) {} + CEHistogramTransportImpl(const ce::CollectionStatistics& stats) + : _heuristicCE(), _stats(stats) {} ~CEHistogramTransportImpl() {} @@ -101,16 +101,16 @@ public: conjSelectivities.push_back(cardinality / _stats.getCardinality()); } - auto backoff = ce::conjExponentialBackoff(conjSelectivities); + auto backoff = ce::conjExponentialBackoff(std::move(conjSelectivities)); disjSelectivities.push_back(backoff); } - auto backoff = ce::disjExponentialBackoff(disjSelectivities); + auto backoff = ce::disjExponentialBackoff(std::move(disjSelectivities)); topLevelSelectivities.push_back(backoff); } // The elements of the PartialSchemaRequirements map represent an implicit conjunction. - auto backoff = ce::conjExponentialBackoff(topLevelSelectivities); + auto backoff = ce::conjExponentialBackoff(std::move(topLevelSelectivities)); return backoff * childResult; } @@ -140,14 +140,12 @@ public: } private: - OperationContext* _opCtx; HeuristicCE _heuristicCE; const ce::CollectionStatistics& _stats; }; -CEHistogramTransport::CEHistogramTransport(OperationContext* opCtx, - const ce::CollectionStatistics& stats) - : _impl(std::make_unique<CEHistogramTransportImpl>(opCtx, stats)) {} +CEHistogramTransport::CEHistogramTransport(const ce::CollectionStatistics& stats) + : _impl(std::make_unique<CEHistogramTransportImpl>(stats)) {} CEHistogramTransport::~CEHistogramTransport() {} diff --git a/src/mongo/db/query/ce/ce_histogram.h b/src/mongo/db/query/ce/ce_histogram.h index 6132cc19339..dfc556ebf87 100644 --- a/src/mongo/db/query/ce/ce_histogram.h +++ b/src/mongo/db/query/ce/ce_histogram.h @@ -38,7 +38,7 @@ class CEHistogramTransportImpl; class CEHistogramTransport : public CEInterface { public: - CEHistogramTransport(OperationContext* opCtx, const ce::CollectionStatistics& stats); + CEHistogramTransport(const ce::CollectionStatistics& stats); ~CEHistogramTransport(); CEType deriveCE(const Memo& memo, diff --git a/src/mongo/db/query/ce/ce_histogram_test.cpp b/src/mongo/db/query/ce/ce_histogram_test.cpp index 76a32ef5fa0..3be39685e83 100644 --- a/src/mongo/db/query/ce/ce_histogram_test.cpp +++ b/src/mongo/db/query/ce/ce_histogram_test.cpp @@ -27,6 +27,8 @@ * it in the license file. */ +#include <iostream> + #include "mongo/db/query/ce/ce_estimation.h" #include "mongo/db/query/ce/ce_histogram.h" #include "mongo/db/query/ce/ce_test_utils.h" @@ -45,8 +47,8 @@ public: : CETester(collName, numRecords), _stats{stats} {} protected: - std::unique_ptr<CEInterface> getCETransport(OperationContext* opCtx) override { - return std::make_unique<CEHistogramTransport>(opCtx, _stats); + std::unique_ptr<CEInterface> getCETransport() const override { + return std::make_unique<CEHistogramTransport>(_stats); } private: @@ -118,7 +120,7 @@ TEST(CEHistogramTest, AssertSmallMaxDiffHistogramEstimatesAtomicPredicates) { ASSERT_MATCH_CE(t, "{a: {$eq: \"foo\"}}", 0.0); // Test case when field doesn't match fieldpath of histogram. This falls back to heuristics. - ASSERT_MATCH_CE(t, "{b: {$eq: 1}}", 0.8); + ASSERT_MATCH_CE(t, "{b: {$eq: 1}}", 2.82843); // Test $gt. ASSERT_MATCH_CE(t, "{a: {$gt: 3}}", 0.0); @@ -201,8 +203,8 @@ TEST(CEHistogramTest, AssertSmallHistogramEstimatesComplexPredicates) { // Test conjunctions over multiple fields for which we may not have histograms. This falls back // to heuristic estimation. - ASSERT_MATCH_CE(t, "{a: {$eq: 2}, c: {$eq: 1}}", 0.09); - ASSERT_MATCH_CE(t, "{c: {$eq: 2}, d: {$eq: 22}}", 0.09); + ASSERT_MATCH_CE(t, "{a: {$eq: 2}, c: {$eq: 1}}", 1.73205); + ASSERT_MATCH_CE(t, "{c: {$eq: 2}, d: {$eq: 22}}", 1.73205); } TEST(CEHistogramTest, SanityTestEmptyHistogram) { diff --git a/src/mongo/db/query/ce/ce_test_utils.cpp b/src/mongo/db/query/ce/ce_test_utils.cpp index eb7824eb483..b35e36928dc 100644 --- a/src/mongo/db/query/ce/ce_test_utils.cpp +++ b/src/mongo/db/query/ce/ce_test_utils.cpp @@ -27,9 +27,12 @@ * it in the license file. */ +#include <cstddef> + #include "mongo/db/query/ce/ce_test_utils.h" #include "mongo/db/query/optimizer/cascades/ce_heuristic.h" +#include "mongo/db/query/optimizer/cascades/cost_derivation.h" #include "mongo/db/query/optimizer/cascades/logical_props_derivation.h" #include "mongo/db/query/optimizer/explain.h" #include "mongo/db/query/optimizer/opt_phase_manager.h" @@ -41,27 +44,31 @@ namespace mongo::ce { using namespace optimizer; using namespace cascades; -CETester::CETester(std::string collName, double numRecords) - : _collName(std::move(collName)), _numRecords(std::move(numRecords)) {} - -double CETester::getCE(const std::string& query) { - // Mock opCtx for test. - QueryTestServiceContext serviceContext; - auto opCtx = serviceContext.makeOperationContext(); +CETester::CETester(std::string collName, double collCard) + : _collName(std::move(collName)), _collCard(collCard) {} +double CETester::getCE(const std::string& query, size_t optimizationLevel) const { // Mock memo. - ScanDefinition sd({}, {}, {DistributionType::Centralized}, true, _numRecords); + ScanDefinition sd({}, {}, {DistributionType::Centralized}, true, _collCard); Metadata metadata({{_collName, sd}}); - Memo memo(DebugInfo::kDefaultForTests, - metadata, - std::make_unique<DefaultLogicalPropsDerivation>(), - std::make_unique<HeuristicCE>()); - // Construct placeholder PhaseManager. + std::vector<OptPhaseManager::OptPhase> optPhaseChoices{ + OptPhaseManager::OptPhase::MemoSubstitutionPhase, + OptPhaseManager::OptPhase::MemoExplorationPhase}; + optimizationLevel = std::min(optimizationLevel, optPhaseChoices.size()); + OptPhaseManager::PhaseSet optPhases; + for (size_t i = 0; i < optimizationLevel; ++i) { + optPhases.insert(optPhaseChoices[i]); + } + + // Construct placeholder PhaseManager. Notice that it also creates a Memo internally. PrefixId prefixId; - OptPhaseManager phaseManager({OptPhaseManager::OptPhase::MemoSubstitutionPhase}, + OptPhaseManager phaseManager(optPhases, prefixId, - {{{_collName, {{}, {}}}}}, + true /*requireRID*/, + metadata, + std::make_unique<HeuristicCE>(), + std::make_unique<DefaultCosting>(), DebugInfo::kDefaultForTests); // Construct ABT from pipeline and optimize. @@ -69,8 +76,15 @@ double CETester::getCE(const std::string& query) { ASSERT_TRUE(phaseManager.optimize(abt)); // Get cardinality estimate. - auto cht = getCETransport(opCtx.get()); - return cht->deriveCE(memo, {}, abt.ref()); + auto cht = getCETransport(); + auto ce = cht->deriveCE(phaseManager.getMemo(), {}, abt.ref()); + +#ifdef CE_TEST_LOG_MODE + std::cout << "Query: " << query << ", card: " << _collCard << ", Estimated: " << ce + << std::endl; +#endif + + return ce; } } // namespace mongo::ce diff --git a/src/mongo/db/query/ce/ce_test_utils.h b/src/mongo/db/query/ce/ce_test_utils.h index e6fc660e27b..4d21e834a4e 100644 --- a/src/mongo/db/query/ce/ce_test_utils.h +++ b/src/mongo/db/query/ce/ce_test_utils.h @@ -29,6 +29,9 @@ #pragma once +#include <cstddef> +#include <sys/types.h> + #include "mongo/db/query/optimizer/cascades/interfaces.h" namespace mongo { @@ -44,14 +47,35 @@ class CEInterface; namespace ce { +// Enable this flag to log all estimates, and let all tests pass. +//#define CE_TEST_LOG_MODE 1 + const double kMaxCEError = 0.01; /** - * A helpful macro for asserting that the CE of a $match predicate is approximately what we were + * Helpful macros for asserting that the CE of a $match predicate is approximately what we were * expecting. */ + +#ifndef CE_TEST_LOG_MODE #define ASSERT_MATCH_CE(ce, predicate, expectedCE) \ ASSERT_APPROX_EQUAL(expectedCE, ce.getCE(predicate), kMaxCEError) +#else +#define ASSERT_MATCH_CE(ce, predicate, expectedCE) \ + ce.getCE(predicate); \ + ASSERT_APPROX_EQUAL(1.0, 1.0, kMaxCEError) +#endif + +#ifndef CE_TEST_LOG_MODE +#define ASSERT_MATCH_CE_CARD(ce, predicate, expectedCE, collCard) \ + ce.setCollCard(collCard); \ + ASSERT_APPROX_EQUAL(expectedCE, ce.getCE(predicate), kMaxCEError) +#else +#define ASSERT_MATCH_CE_CARD(ce, predicate, expectedCE, collCard) \ + ce.setCollCard(collCard); \ + ce.getCE(predicate); \ + ASSERT_APPROX_EQUAL(1.0, 1.0, kMaxCEError) +#endif /** * A test utility class for helping verify the cardinality of CE transports on a given $match @@ -64,19 +88,22 @@ public: /** * Returns the estimated cardinality of a given 'matchPredicate'. */ - double getCE(const std::string& matchPredicate); + double getCE(const std::string& matchPredicate, size_t optimizationLevel = 1) const; + + void setCollCard(double card) { + _collCard = card; + } protected: /** * Subclasses need to override this method to initialize the transports they are testing. */ - virtual std::unique_ptr<optimizer::cascades::CEInterface> getCETransport( - OperationContext* opCtx) = 0; + virtual std::unique_ptr<optimizer::cascades::CEInterface> getCETransport() const = 0; std::string _collName; // The number of records in the collection we are testing. - double _numRecords; + double _collCard; }; } // namespace ce diff --git a/src/mongo/db/query/optimizer/SConscript b/src/mongo/db/query/optimizer/SConscript index 02b3b90bcd0..c050a0d79e7 100644 --- a/src/mongo/db/query/optimizer/SConscript +++ b/src/mongo/db/query/optimizer/SConscript @@ -39,6 +39,7 @@ env.Library( "rewrites/path_lower.cpp", "syntax/expr.cpp", "utils/abt_hash.cpp", + 'utils/ce_math.cpp', "utils/interval_utils.cpp", "utils/memo_utils.cpp", "utils/utils.cpp", diff --git a/src/mongo/db/query/optimizer/cascades/ce_heuristic.cpp b/src/mongo/db/query/optimizer/cascades/ce_heuristic.cpp index 6d89cd1a436..802c1d24892 100644 --- a/src/mongo/db/query/optimizer/cascades/ce_heuristic.cpp +++ b/src/mongo/db/query/optimizer/cascades/ce_heuristic.cpp @@ -28,18 +28,26 @@ */ #include "mongo/db/query/optimizer/cascades/ce_heuristic.h" -#include "mongo/db/query/optimizer/utils/memo_utils.h" + +#include "mongo/db/query/optimizer/cascades/memo.h" +#include "mongo/db/query/optimizer/utils/ce_math.h" +#include "mongo/util/assert_util.h" namespace mongo::optimizer::cascades { using namespace properties; +using namespace mongo::ce; + +// Invalid estimate - an arbitrary negative value used for initialization. +constexpr SelectivityType kInvalidSel = -1.0; +constexpr SelectivityType kDefaultFilterSel = 0.1; class CEHeuristicTransport { public: CEType transport(const ScanNode& node, CEType /*bindResult*/) { // Default cardinality estimate. const CEType metadataCE = _memo.getMetadata()._scanDefs.at(node.getScanDefName()).getCE(); - return (metadataCE < 0.0) ? 1000.00 : metadataCE; + return (metadataCE < 0.0) ? kDefaultCard : metadataCE; } CEType transport(const ValueScanNode& node, CEType /*bindResult*/) { @@ -53,6 +61,9 @@ public: } CEType transport(const FilterNode& node, CEType childResult, CEType /*exprResult*/) { + if (childResult == 0.0) { + return 0.0; + } if (node.getFilter() == Constant::boolean(true)) { // Trivially true filter. return childResult; @@ -61,7 +72,7 @@ public: return 0.0; } else { // Estimate filter selectivity at 0.1. - return 0.1 * childResult; + return std::max(kDefaultFilterSel * childResult, kMinCard); } } @@ -70,16 +81,120 @@ public: return childResult; } + /** + * Default selectivity of equalities. To avoid super small selectivities for small + * cardinalities, that would result in 0 cardinality for many small inputs, the + * estimate is scaled as inputCard grows. The bigger inputCard, the smaller the + * selectivity. + */ + SelectivityType equalitySel(const CEType inputCard) { + uassert(6716604, "Zero cardinality must be handled by the caller.", inputCard > 0.0); + return std::sqrt(inputCard) / inputCard; + } + + /** + * Default selectivity of intervals with bounds on both ends. These intervals are + * considered less selective than equalities. + * Examples: (a > 'abc' AND a < 'hta'), (0 < b <= 13) + */ + SelectivityType closedRangeSel(const CEType inputCard) { + CEType sel = kInvalidSel; + if (inputCard < 20.0) { + sel = 0.50; + } else if (inputCard < 100.0) { + sel = 0.33; + } else { + sel = 0.20; + } + return sel; + } + + /** + * Default selectivity of intervals open on one end. These intervals are + * considered less selective than those with both ends specified by the user query. + * Examples: (a > 'xyz'), (b <= 13) + */ + SelectivityType openRangeSel(const CEType inputCard) { + SelectivityType sel = kInvalidSel; + if (inputCard < 20.0) { + sel = 0.70; + } else if (inputCard < 100.0) { + sel = 0.45; + } else { + sel = 0.33; + } + return sel; + } + + mongo::sbe::value::TypeTags boundType(const BoundRequirement& bound) { + const auto constBoundPtr = bound.getBound().cast<Constant>(); + if (constBoundPtr == nullptr) { + return mongo::sbe::value::TypeTags::Nothing; + } + const auto [tag, val] = constBoundPtr->get(); + return tag; + } + + SelectivityType intervalSel(const IntervalRequirement& interval, const CEType inputCard) { + SelectivityType sel = kInvalidSel; + if (interval.isFullyOpen()) { + sel = 1.0; + } else if (interval.isEquality()) { + sel = equalitySel(inputCard); + } else if (interval.getHighBound().isPlusInf() || interval.getLowBound().isMinusInf() || + boundType(interval.getLowBound()) != boundType(interval.getHighBound())) { + // The interval has an actual bound only on one of it ends if: + // - one of the bounds is infinite, or + // - both bounds are of a different type - this is the case when due to type bracketing + // one of the bounds is the lowest/highest value of the previous/next type. + // TODO: Notice that sometimes type bracketing uses a min/max value from the same type, + // so sometimes we may not detect an open-ended interval. + sel = openRangeSel(inputCard); + } else { + sel = closedRangeSel(inputCard); + } + uassert(6716603, "Invalid selectivity.", validSelectivity(sel)); + return sel; + } + CEType transport(const SargableNode& node, - CEType /*childResult*/, + CEType childResult, CEType /*bindsResult*/, CEType /*refsResult*/) { - ABT lowered = node.getChild(); + // Early out and return 0 since we don't expect to get more results. + if (childResult == 0.0) { + return 0.0; + } + + SelectivityType topLevelSel = 1.0; + std::vector<SelectivityType> topLevelSelectivities; for (const auto& [key, req] : node.getReqMap()) { - // TODO: consider issuing one filter node per interval. - lowerPartialSchemaRequirement(key, req, lowered); + SelectivityType disjSel = 1.0; + std::vector<SelectivityType> disjSelectivities; + // Intervals are in DNF. + const auto intervalDNF = req.getIntervals(); + const auto disjuncts = intervalDNF.cast<IntervalReqExpr::Disjunction>()->nodes(); + for (const auto& disjunct : disjuncts) { + const auto& conjuncts = disjunct.cast<IntervalReqExpr::Conjunction>()->nodes(); + SelectivityType conjSel = 1.0; + std::vector<SelectivityType> conjSelectivities; + for (const auto& conjunct : conjuncts) { + const auto& interval = conjunct.cast<IntervalReqExpr::Atom>()->getExpr(); + const SelectivityType sel = intervalSel(interval, childResult); + conjSelectivities.push_back(sel); + } + conjSel = ce::conjExponentialBackoff(std::move(conjSelectivities)); + disjSelectivities.push_back(conjSel); + } + disjSel = ce::disjExponentialBackoff(std::move(disjSelectivities)); + topLevelSelectivities.push_back(disjSel); } - return algebra::transport<false>(lowered, *this); + + // The elements of the PartialSchemaRequirements map represent an implicit conjunction. + topLevelSel = ce::conjExponentialBackoff(std::move(topLevelSelectivities)); + CEType card = std::max(topLevelSel * childResult, kMinCard); + uassert(6716602, "Invalid cardinality.", mongo::ce::validCardinality(card)); + return card; } CEType transport(const RIDIntersectNode& node, @@ -96,7 +211,7 @@ public: CEType /*exprResult*/) { const auto& filter = node.getFilter(); - double selectivity = 0.1; + SelectivityType selectivity = kDefaultFilterSel; if (filter == Constant::boolean(false)) { selectivity = 0.0; } else if (filter == Constant::boolean(true)) { @@ -194,7 +309,8 @@ private: CEType HeuristicCE::deriveCE(const Memo& memo, const LogicalProps& /*logicalProps*/, const ABT::reference_type logicalNodeRef) const { - return CEHeuristicTransport::derive(memo, logicalNodeRef); + CEType card = CEHeuristicTransport::derive(memo, logicalNodeRef); + return card; } } // namespace mongo::optimizer::cascades diff --git a/src/mongo/db/query/optimizer/interval_intersection_test.cpp b/src/mongo/db/query/optimizer/interval_intersection_test.cpp index a965e2600e2..33c2d5c4b5b 100644 --- a/src/mongo/db/query/optimizer/interval_intersection_test.cpp +++ b/src/mongo/db/query/optimizer/interval_intersection_test.cpp @@ -114,8 +114,8 @@ TEST(IntervalIntersection, SingleFieldIntersection) { " BindBlock:\n" " [rid_0]\n" " Source []\n"}, - {"{$or: [{$and: [{a0: {$gt:9}}, {a0: {$lt: 12}}]}, {$and: [{a0: {$gt:40}}, {a0: {$lt: " - "44}}]}]}", + {"{$or: [{$and: [{a0: {$gt:9, $lt:999}}, {a0: {$gt: 0, $lt: 12}}]}," + " {$and: [{a0: {$gt:40, $lt:997}}, {a0: {$gt:0, $lt: 44}}]}]}", "Root []\n" "| | projections: \n" "| | scan_0\n" diff --git a/src/mongo/db/query/optimizer/logical_rewriter_optimizer_test.cpp b/src/mongo/db/query/optimizer/logical_rewriter_optimizer_test.cpp index a1468c7e8e7..6d0708d0703 100644 --- a/src/mongo/db/query/optimizer/logical_rewriter_optimizer_test.cpp +++ b/src/mongo/db/query/optimizer/logical_rewriter_optimizer_test.cpp @@ -1440,12 +1440,12 @@ TEST(LogicalRewriter, SargableCE) { " groupId: 1\n" " | | Logical properties:\n" " | | cardinalityEstimate: \n" - " | | ce: 10\n" + " | | ce: 5.62341\n" " | | requirementCEs: \n" " | | refProjection: ptest, path: 'PathGet [a] PathIdentity []', ce: " - "100\n" + "31.6228\n" " | | refProjection: ptest, path: 'PathGet [b] PathIdentity []', ce: " - "100\n" + "31.6228\n" " | | projections: \n" " | | ptest\n" " | | indexingAvailability: \n" @@ -1473,7 +1473,7 @@ TEST(LogicalRewriter, SargableCE) { " groupId: 2\n" " | | Logical properties:\n" " | | cardinalityEstimate: \n" - " | | ce: 10\n" + " | | ce: 5.62341\n" " | | projections: \n" " | | ptest\n" " | | indexingAvailability: \n" diff --git a/src/mongo/db/query/optimizer/physical_rewriter_optimizer_test.cpp b/src/mongo/db/query/optimizer/physical_rewriter_optimizer_test.cpp index 6f0a2654909..fc7cbf49485 100644 --- a/src/mongo/db/query/optimizer/physical_rewriter_optimizer_test.cpp +++ b/src/mongo/db/query/optimizer/physical_rewriter_optimizer_test.cpp @@ -40,6 +40,9 @@ namespace mongo::optimizer { namespace { +// Default selectivity of predicates used by HintedCE to force certain plans. +constexpr double kDefaultSelectivity = 0.1; + TEST(PhysRewriter, PhysicalRewriterBasic) { using namespace properties; PrefixId prefixId; @@ -1189,6 +1192,17 @@ TEST(PhysRewriter, FilterIndexing4) { ABT scanNode = make<ScanNode>("root", "c1"); + // TODO: SERVER-68006 will investigate why the plan changes without the hints. + PartialSchemaSelHints hints; + hints.emplace(PartialSchemaKey{"root", make<PathGet>("a", make<PathIdentity>())}, + kDefaultSelectivity); + hints.emplace(PartialSchemaKey{"root", make<PathGet>("b", make<PathIdentity>())}, + kDefaultSelectivity); + hints.emplace(PartialSchemaKey{"root", make<PathGet>("c", make<PathIdentity>())}, + kDefaultSelectivity); + hints.emplace(PartialSchemaKey{"root", make<PathGet>("d", make<PathIdentity>())}, + kDefaultSelectivity); + ABT evalNode = make<EvaluationNode>( "pa", make<EvalPath>(make<PathGet>("a", make<PathIdentity>()), make<Variable>("root")), @@ -1232,6 +1246,7 @@ TEST(PhysRewriter, FilterIndexing4) { OptPhaseManager::OptPhase::MemoExplorationPhase, OptPhaseManager::OptPhase::MemoImplementationPhase}, prefixId, + false /*requireRID*/, {{{"c1", ScanDefinition{ {}, @@ -1243,6 +1258,8 @@ TEST(PhysRewriter, FilterIndexing4) { false /*isMultiKey*/, {DistributionType::Centralized}, {}}}}}}}}, + std::make_unique<HintedCE>(std::move(hints)), + std::make_unique<DefaultCosting>(), {true /*debugMode*/, 2 /*debugLevel*/, DebugInfo::kIterationLimitForTests}); ABT optimized = std::move(rootNode); @@ -1251,7 +1268,7 @@ TEST(PhysRewriter, FilterIndexing4) { phaseManager.getHints()._disableHashJoinRIDIntersect = true; ASSERT_TRUE(phaseManager.optimize(optimized)); - ASSERT_BETWEEN(65, 80, phaseManager.getMemo().getStats()._physPlanExplorationCount); + ASSERT_BETWEEN(65, 110, phaseManager.getMemo().getStats()._physPlanExplorationCount); ASSERT_EXPLAIN_V2( "Root []\n" @@ -1342,7 +1359,7 @@ TEST(PhysRewriter, FilterIndexing5) { ABT optimized = std::move(rootNode); ASSERT_TRUE(phaseManager.optimize(optimized)); - ASSERT_BETWEEN(25, 55, phaseManager.getMemo().getStats()._physPlanExplorationCount); + ASSERT_BETWEEN(25, 70, phaseManager.getMemo().getStats()._physPlanExplorationCount); // We can cover both fields with the index, and need separate sort on "b". ASSERT_EXPLAIN_V2( @@ -1502,7 +1519,7 @@ TEST(PhysRewriter, FilterIndexingStress) { // Without the changes to restrict SargableNode split to which this test is tied, we would // be exploring 2^kFilterCount plans, one for each created group. - ASSERT_EQ(51, phaseManager.getMemo().getStats()._physPlanExplorationCount); + ASSERT_EQ(55, phaseManager.getMemo().getStats()._physPlanExplorationCount); const BSONObj& explainRoot = ExplainGenerator::explainBSONObj(optimized); ASSERT_BSON_PATH("\"Filter\"", explainRoot, "child.nodeType"); @@ -1554,6 +1571,14 @@ TEST(PhysRewriter, FilterIndexingVariable) { ABT scanNode = make<ScanNode>("root", "c1"); + // TODO: SERVER-68006 will investigate why the plan changes without the hints. + PartialSchemaSelHints hints; + hints.emplace(PartialSchemaKey{"root", + make<PathGet>("a", + make<PathTraverse>(make<PathIdentity>(), + PathTraverse::kSingleLevel))}, + kDefaultSelectivity); + // Encode a condition using two query parameters (expressed as functions): // "a" > param_0 AND "a" >= param_1 (observe param_1 comparison is inclusive). ABT filterNode = make<FilterNode>( @@ -1575,11 +1600,14 @@ TEST(PhysRewriter, FilterIndexingVariable) { OptPhaseManager::OptPhase::MemoExplorationPhase, OptPhaseManager::OptPhase::MemoImplementationPhase}, prefixId, + false /*requireRID*/, {{{"c1", ScanDefinition{ {}, {{"index1", makeIndexDefinition("a", CollationOp::Ascending, false /*isMultiKey*/)}}}}}}, + std::make_unique<HintedCE>(std::move(hints)), + std::make_unique<DefaultCosting>(), {true /*debugMode*/, 2 /*debugLevel*/, DebugInfo::kIterationLimitForTests}); ABT optimized = std::move(rootNode); @@ -1724,7 +1752,7 @@ TEST(PhysRewriter, FilterReorder) { make<PathGet>(projName, make<PathTraverse>(make<PathIdentity>(), PathTraverse::kSingleLevel))}, - 0.1 * (kFilterCount - i)); + kDefaultSelectivity * (kFilterCount - i)); result = make<FilterNode>( make<EvalFilter>(make<PathGet>(std::move(projName), make<PathTraverse>(make<PathCompare>(Operations::Eq, @@ -2086,6 +2114,12 @@ TEST(PhysRewriter, MultiKeyIndex) { using namespace properties; PrefixId prefixId; + PartialSchemaSelHints hints; + hints.emplace(PartialSchemaKey{"root", make<PathGet>("a", make<PathIdentity>())}, + kDefaultSelectivity); + hints.emplace(PartialSchemaKey{"root", make<PathGet>("b", make<PathIdentity>())}, + kDefaultSelectivity); + ABT scanNode = make<ScanNode>("root", "c1"); ABT evalANode = make<EvaluationNode>( @@ -2120,12 +2154,15 @@ TEST(PhysRewriter, MultiKeyIndex) { OptPhaseManager::OptPhase::MemoExplorationPhase, OptPhaseManager::OptPhase::MemoImplementationPhase}, prefixId, + false /*requireRID*/, {{{"c1", ScanDefinition{ {}, {{"index1", makeIndexDefinition("a", CollationOp::Ascending, false /*isMultiKey*/)}, {"index2", makeIndexDefinition("b", CollationOp::Descending, false /*isMultiKey*/)}}}}}}, + std::make_unique<HintedCE>(std::move(hints)), + std::make_unique<DefaultCosting>(), {true /*debugMode*/, 2 /*debugLevel*/, DebugInfo::kIterationLimitForTests}); { @@ -2135,7 +2172,7 @@ TEST(PhysRewriter, MultiKeyIndex) { phaseManager.getHints()._disableHashJoinRIDIntersect = true; ASSERT_TRUE(phaseManager.optimize(optimized)); - ASSERT_BETWEEN(15, 25, phaseManager.getMemo().getStats()._physPlanExplorationCount); + ASSERT_BETWEEN(13, 25, phaseManager.getMemo().getStats()._physPlanExplorationCount); // GroupBy+Union cannot propagate collation requirement, and we need a separate // CollationNode. @@ -2355,7 +2392,7 @@ TEST(PhysRewriter, CompoundIndex1) { ABT optimized = rootNode; ASSERT_TRUE(phaseManager.optimize(optimized)); - ASSERT_BETWEEN(60, 110, phaseManager.getMemo().getStats()._physPlanExplorationCount); + ASSERT_BETWEEN(60, 130, phaseManager.getMemo().getStats()._physPlanExplorationCount); const BSONObj& explainRoot = ExplainGenerator::explainBSONObj(optimized); ASSERT_BSON_PATH("\"BinaryJoin\"", explainRoot, "child.nodeType"); @@ -2443,7 +2480,7 @@ TEST(PhysRewriter, CompoundIndex2) { ABT optimized = rootNode; ASSERT_TRUE(phaseManager.optimize(optimized)); - ASSERT_BETWEEN(100, 170, phaseManager.getMemo().getStats()._physPlanExplorationCount); + ASSERT_BETWEEN(100, 210, phaseManager.getMemo().getStats()._physPlanExplorationCount); const BSONObj& explainRoot = ExplainGenerator::explainBSONObj(optimized); ASSERT_BSON_PATH("\"BinaryJoin\"", explainRoot, "child.nodeType"); @@ -2527,7 +2564,7 @@ TEST(PhysRewriter, CompoundIndex3) { ABT optimized = rootNode; ASSERT_TRUE(phaseManager.optimize(optimized)); - ASSERT_BETWEEN(70, 110, phaseManager.getMemo().getStats()._physPlanExplorationCount); + ASSERT_BETWEEN(70, 130, phaseManager.getMemo().getStats()._physPlanExplorationCount); ASSERT_EXPLAIN_V2( "Root []\n" @@ -2932,6 +2969,18 @@ TEST(PhysRewriter, IndexBoundsIntersect3) { using namespace properties; PrefixId prefixId; + PartialSchemaSelHints hints; + hints.emplace( + PartialSchemaKey{ + "root", + make<PathGet>( + "a", + make<PathTraverse>( + make<PathGet>( + "b", make<PathTraverse>(make<PathIdentity>(), PathTraverse::kSingleLevel)), + PathTraverse::kSingleLevel))}, + kDefaultSelectivity); + ABT scanNode = make<ScanNode>("root", "c1"); ABT filterNode = make<FilterNode>( @@ -2960,6 +3009,7 @@ TEST(PhysRewriter, IndexBoundsIntersect3) { OptPhaseManager::OptPhase::MemoExplorationPhase, OptPhaseManager::OptPhase::MemoImplementationPhase}, prefixId, + false /*requireRID*/, {{{"c1", ScanDefinition{ {}, @@ -2967,6 +3017,8 @@ TEST(PhysRewriter, IndexBoundsIntersect3) { IndexDefinition{{{makeIndexPath(FieldPathType{"a", "b"}, true /*isMultiKey*/), CollationOp::Ascending}}, true /*isMultiKey*/}}}}}}}, + std::make_unique<HintedCE>(std::move(hints)), + std::make_unique<DefaultCosting>(), {true /*debugMode*/, 2 /*debugLevel*/, DebugInfo::kIterationLimitForTests}); ABT optimized = rootNode; @@ -3159,10 +3211,10 @@ TEST(PhysRewriter, IndexResidualReq) { // Make sure we can use the index to cover "b" while testing "b.c" with a separate filter. ASSERT_EXPLAIN_PROPS_V2( - "Properties [cost: 0.070002, localCost: 0, adjustedCE: 10]\n" + "Properties [cost: 0.402121, localCost: 0, adjustedCE: 189.571]\n" "| | Logical:\n" "| | cardinalityEstimate: \n" - "| | ce: 10\n" + "| | ce: 189.571\n" "| | projections: \n" "| | pa\n" "| | root\n" @@ -3183,14 +3235,14 @@ TEST(PhysRewriter, IndexResidualReq) { "| | pa\n" "| RefBlock: \n" "| Variable [pa]\n" - "Properties [cost: 0.070002, localCost: 0.070002, adjustedCE: 10]\n" + "Properties [cost: 0.402121, localCost: 0.402121, adjustedCE: 189.571]\n" "| | Logical:\n" "| | cardinalityEstimate: \n" - "| | ce: 10\n" + "| | ce: 189.571\n" "| | requirementCEs: \n" - "| | refProjection: root, path: 'PathGet [a] PathIdentity []', ce: 100\n" + "| | refProjection: root, path: 'PathGet [a] PathIdentity []', ce: 330\n" "| | refProjection: root, path: 'PathGet [b] PathGet [c] PathIdentity []', " - "ce: 100\n" + "ce: 330\n" "| | projections: \n" "| | pa\n" "| | root\n" @@ -3286,7 +3338,7 @@ TEST(PhysRewriter, IndexResidualReq1) { ABT optimized = rootNode; ASSERT_TRUE(phaseManager.optimize(optimized)); - ASSERT_BETWEEN(25, 30, phaseManager.getMemo().getStats()._physPlanExplorationCount); + ASSERT_BETWEEN(25, 45, phaseManager.getMemo().getStats()._physPlanExplorationCount); // Prefer index1 over index2 and index3 in order to cover all fields. ASSERT_EXPLAIN_V2( @@ -3322,6 +3374,19 @@ TEST(PhysRewriter, IndexResidualReq2) { ABT scanNode = make<ScanNode>("root", "c1"); + // TODO: SERVER-68006 will investigate why the plan changes without the hints. + PartialSchemaSelHints hints; + hints.emplace(PartialSchemaKey{"root", + make<PathGet>("a", + make<PathTraverse>(make<PathIdentity>(), + PathTraverse::kSingleLevel))}, + kDefaultSelectivity); + hints.emplace(PartialSchemaKey{"root", + make<PathGet>("b", + make<PathTraverse>(make<PathIdentity>(), + PathTraverse::kSingleLevel))}, + kDefaultSelectivity); + ABT filterANode = make<FilterNode>( make<EvalFilter>( make<PathGet>("a", @@ -3346,6 +3411,7 @@ TEST(PhysRewriter, IndexResidualReq2) { OptPhaseManager::OptPhase::MemoExplorationPhase, OptPhaseManager::OptPhase::MemoImplementationPhase}, prefixId, + false /*requireRID*/, {{{"c1", ScanDefinition{{}, {{"index1", @@ -3353,6 +3419,8 @@ TEST(PhysRewriter, IndexResidualReq2) { {{"a", CollationOp::Ascending, true /*isMultiKey*/}, {"c", CollationOp::Ascending, true /*isMultiKey*/}, {"b", CollationOp::Ascending, true /*isMultiKey*/}})}}}}}}, + std::make_unique<HintedCE>(std::move(hints)), + std::make_unique<DefaultCosting>(), {true /*debugMode*/, 2 /*debugLevel*/, DebugInfo::kIterationLimitForTests}); ABT optimized = rootNode; @@ -3862,6 +3930,15 @@ TEST(PhysRewriter, IndexPartitioning) { ABT scanNode = make<ScanNode>("root", "c1"); + // TODO: SERVER-68006 This test results in an failed assert when cardinality estimates change. + // The assert is in physical_rewriter.cpp: + // "Must optimize successfully if found compatible properties!" + PartialSchemaSelHints hints; + hints.emplace(PartialSchemaKey{"root", make<PathGet>("a", make<PathIdentity>())}, + kDefaultSelectivity); + hints.emplace(PartialSchemaKey{"root", make<PathGet>("b", make<PathIdentity>())}, + kDefaultSelectivity); + ABT projectionANode = make<EvaluationNode>( "pa", make<EvalPath>(make<PathGet>("a", make<PathIdentity>()), make<Variable>("root")), @@ -3895,6 +3972,7 @@ TEST(PhysRewriter, IndexPartitioning) { OptPhaseManager::OptPhase::MemoExplorationPhase, OptPhaseManager::OptPhase::MemoImplementationPhase}, prefixId, + false /*requireRID*/, {{{"c1", ScanDefinition{ {}, @@ -3906,6 +3984,8 @@ TEST(PhysRewriter, IndexPartitioning) { {}}}}, {DistributionType::HashPartitioning, makeSeq(makeNonMultikeyIndexPath("b"))}}}}, 5 /*numberOfPartitions*/}, + std::make_unique<HintedCE>(std::move(hints)), + std::make_unique<DefaultCosting>(), {true /*debugMode*/, 2 /*debugLevel*/, DebugInfo::kIterationLimitForTests}); ABT optimized = rootNode; @@ -3973,6 +4053,18 @@ TEST(PhysRewriter, IndexPartitioning1) { ABT scanNode = make<ScanNode>("root", "c1"); + // TODO: SERVER-68006 This test results in an failed assert when cardinality estimates change. + // The assert is in physical_rewriter.cpp: + // "Must optimize successfully if found compatible properties!" + // The interesting thing is that it doesn't fail on each test run, but sometimes, which + // makes one think the failure depends on rounding errors of cost/cardinality estimates, + // which in turn results in different sub-plans. + PartialSchemaSelHints hints; + hints.emplace(PartialSchemaKey{"root", make<PathGet>("a", make<PathIdentity>())}, + kDefaultSelectivity); + hints.emplace(PartialSchemaKey{"root", make<PathGet>("b", make<PathIdentity>())}, + kDefaultSelectivity); + ABT projectionANode = make<EvaluationNode>( "pa", make<EvalPath>(make<PathGet>("a", make<PathIdentity>()), make<Variable>("root")), @@ -4006,6 +4098,7 @@ TEST(PhysRewriter, IndexPartitioning1) { OptPhaseManager::OptPhase::MemoExplorationPhase, OptPhaseManager::OptPhase::MemoImplementationPhase}, prefixId, + false /*requireRID*/, {{{"c1", ScanDefinition{ {}, @@ -4023,6 +4116,8 @@ TEST(PhysRewriter, IndexPartitioning1) { {}}}}, {DistributionType::HashPartitioning, makeSeq(makeNonMultikeyIndexPath("c"))}}}}, 5 /*numberOfPartitions*/}, + std::make_unique<HintedCE>(std::move(hints)), + std::make_unique<DefaultCosting>(), {true /*debugMode*/, 2 /*debugLevel*/, DebugInfo::kIterationLimitForTests}); ABT optimized = rootNode; diff --git a/src/mongo/db/query/ce/ce_math.cpp b/src/mongo/db/query/optimizer/utils/ce_math.cpp index a5dd7bf04de..b56adb4d860 100644 --- a/src/mongo/db/query/ce/ce_math.cpp +++ b/src/mongo/db/query/optimizer/utils/ce_math.cpp @@ -27,19 +27,31 @@ * it in the license file. */ -#include <algorithm> -#include <cmath> -#include <functional> +#include <algorithm> // std::sort +#include <cmath> // std::pow +#include <functional> // std::greater -#include "mongo/db/query/ce/ce_math.h" +#include "mongo/db/query/optimizer/utils/ce_math.h" namespace mongo::ce { -double conjExponentialBackoff(std::vector<double> conjSelectivities) { - std::sort(conjSelectivities.begin(), conjSelectivities.end()); - double sel = conjSelectivities[0]; - double f = 1.0; + +bool validSelectivity(SelectivityType sel) { + return (sel >= 0.0 && sel <= 1.0); +} + +bool validCardinality(CEType card) { + return (card >= kMinCard && card <= std::numeric_limits<CEType>::max()); +} + +SelectivityType conjExponentialBackoff(std::vector<SelectivityType> conjSelectivities) { + size_t actualMaxBackoffElements = std::min(conjSelectivities.size(), kMaxBackoffElements); + std::partial_sort(conjSelectivities.begin(), + conjSelectivities.begin() + actualMaxBackoffElements, + conjSelectivities.end()); + SelectivityType sel = conjSelectivities[0]; + SelectivityType f = 1.0; size_t i = 1; - while (i < conjSelectivities.size() && i <= kMaxBackoffIterations) { + while (i < actualMaxBackoffElements) { f /= 2.0; sel *= std::pow(conjSelectivities[i], f); i++; @@ -47,12 +59,16 @@ double conjExponentialBackoff(std::vector<double> conjSelectivities) { return sel; } -double disjExponentialBackoff(std::vector<double> disjSelectivities) { - std::sort(disjSelectivities.begin(), disjSelectivities.end(), std::greater<double>()); - double sel = 1.0 - disjSelectivities[0]; - double f = 1.0; +SelectivityType disjExponentialBackoff(std::vector<SelectivityType> disjSelectivities) { + size_t actualMaxBackoffElements = std::min(disjSelectivities.size(), kMaxBackoffElements); + std::partial_sort(disjSelectivities.begin(), + disjSelectivities.begin() + actualMaxBackoffElements, + disjSelectivities.end(), + std::greater<SelectivityType>()); + SelectivityType sel = 1.0 - disjSelectivities[0]; + SelectivityType f = 1.0; size_t i = 1; - while (i < disjSelectivities.size() && i <= kMaxBackoffIterations) { + while (i < actualMaxBackoffElements) { f /= 2.0; sel *= std::pow(1 - disjSelectivities[i], f); i++; diff --git a/src/mongo/db/query/ce/ce_math.h b/src/mongo/db/query/optimizer/utils/ce_math.h index 3ef529a9224..a29a7d1f34b 100644 --- a/src/mongo/db/query/ce/ce_math.h +++ b/src/mongo/db/query/optimizer/utils/ce_math.h @@ -29,24 +29,43 @@ #pragma once +#include <limits> #include <vector> +#include "mongo/db/query/optimizer/defs.h" + namespace mongo::ce { +using namespace mongo::optimizer; + +// Default cardinality when actual collection cardinality is unknown. +// Mostly used by unit tests. +constexpr CEType kDefaultCard = 1000.00; + +// Minimum estimated cardinality. In the absense of any statistics we can never +// assume there are less than this many matching documents. +constexpr CEType kMinCard = 0.01; + /** - * Specifies the maximum number of iterations to use when estimating via exponential backoff. + * Specifies the maximum number of elements (selectivities) to use when estimating via + * exponential backoff. */ -const size_t kMaxBackoffIterations = 3; +const size_t kMaxBackoffElements = 4; + + +bool validSelectivity(SelectivityType sel); + +bool validCardinality(CEType card); /** * Estimates the selectivity of a conjunction given the selectivities of its subexpressions using * exponential backoff. */ -double conjExponentialBackoff(std::vector<double> conjSelectivities); +SelectivityType conjExponentialBackoff(std::vector<SelectivityType> conjSelectivities); /** * Estimates the selectivity of a disjunction given the selectivities of its subexpressions using * exponential backoff. */ -double disjExponentialBackoff(std::vector<double> disjSelectivities); +SelectivityType disjExponentialBackoff(std::vector<SelectivityType> disjSelectivities); } // namespace mongo::ce |