diff options
author | Timour Katchaounov <timour.katchaounov@mongodb.com> | 2021-03-21 19:11:40 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-05-10 20:41:28 +0000 |
commit | 2f73b66d3cd004e0110bbee47151b75c7fc0791f (patch) | |
tree | d8a4926fe8cfac15aad840af0ce3373df66d9a59 /src/mongo/db/matcher | |
parent | f3dbbebb303be094a563989f086cbf35e61e1b69 (diff) | |
download | mongo-2f73b66d3cd004e0110bbee47151b75c7fc0791f.tar.gz |
SERVER-34012 Planner's logic for taking union of index bounds intervals is slow for large $or queries
Implement a rewrite of single-path ORs into IN-lists.
Diffstat (limited to 'src/mongo/db/matcher')
-rw-r--r-- | src/mongo/db/matcher/expression_leaf.cpp | 5 | ||||
-rw-r--r-- | src/mongo/db/matcher/expression_leaf.h | 6 | ||||
-rw-r--r-- | src/mongo/db/matcher/expression_optimize_test.cpp | 43 | ||||
-rw-r--r-- | src/mongo/db/matcher/expression_tree.cpp | 136 |
4 files changed, 190 insertions, 0 deletions
diff --git a/src/mongo/db/matcher/expression_leaf.cpp b/src/mongo/db/matcher/expression_leaf.cpp index b10ced9bfe4..25280408af2 100644 --- a/src/mongo/db/matcher/expression_leaf.cpp +++ b/src/mongo/db/matcher/expression_leaf.cpp @@ -389,6 +389,7 @@ std::unique_ptr<MatchExpression> InMatchExpression::shallowClone() const { next->_hasEmptyArray = _hasEmptyArray; next->_equalitySet = _equalitySet; next->_originalEqualityVector = _originalEqualityVector; + next->_equalityStorage = _equalityStorage; for (auto&& regex : _regexes) { std::unique_ptr<RegexMatchExpression> clonedRegex( static_cast<RegexMatchExpression*>(regex->shallowClone().release())); @@ -546,6 +547,10 @@ Status InMatchExpression::setEqualities(std::vector<BSONElement> equalities) { return Status::OK(); } +void InMatchExpression::setBackingBSON(BSONObj equalityStorage) { + _equalityStorage = std::move(equalityStorage); +} + Status InMatchExpression::addRegex(std::unique_ptr<RegexMatchExpression> expr) { _regexes.push_back(std::move(expr)); return Status::OK(); diff --git a/src/mongo/db/matcher/expression_leaf.h b/src/mongo/db/matcher/expression_leaf.h index fe6878eb05d..3c35445bc0b 100644 --- a/src/mongo/db/matcher/expression_leaf.h +++ b/src/mongo/db/matcher/expression_leaf.h @@ -584,6 +584,8 @@ public: Status setEqualities(std::vector<BSONElement> equalities); + void setBackingBSON(BSONObj equalityStorage); + Status addRegex(std::unique_ptr<RegexMatchExpression> expr); const std::vector<BSONElement>& getEqualities() const { @@ -648,6 +650,10 @@ private: // Container of regex elements this object owns. std::vector<std::unique_ptr<RegexMatchExpression>> _regexes; + + // When this $in is generated internally, e.g. via a rewrite, this is where we store the + // data of the corresponding equality elements. + BSONObj _equalityStorage; }; /** diff --git a/src/mongo/db/matcher/expression_optimize_test.cpp b/src/mongo/db/matcher/expression_optimize_test.cpp index eaaf42e3e76..6a41ad33643 100644 --- a/src/mongo/db/matcher/expression_optimize_test.cpp +++ b/src/mongo/db/matcher/expression_optimize_test.cpp @@ -29,6 +29,8 @@ #include "mongo/db/pipeline/expression.h" +#include <vector> + #include "mongo/db/matcher/expression_always_boolean.h" #include "mongo/db/pipeline/expression_context_for_test.h" #include "mongo/db/query/canonical_query.h" @@ -448,5 +450,46 @@ TEST(ExpressionOptimizeTest, NestedOrWithAlwaysTrueOptimizesToAlwaysTrue) { ASSERT_BSONOBJ_EQ(bob.obj(), fromjson("{$alwaysTrue: 1}")); } +TEST(ExpressionOptimizeTest, OrRewrittenToIn) { + const std::vector<std::pair<std::string, std::string>> queries = { + {"{$or: [{f1: 5}, {f1: 3}, {f1: 7}]}", "{ f1: { $in: [ 3, 5, 7 ] } }"}, + {"{$or: [{f1: {$eq: 5}}, {f1: {$eq: 3}}, {f1: {$eq: 7}}]}", "{ f1: { $in: [ 3, 5, 7 ] } }"}, + {"{$or: [{f1: 42}, {f1: NaN}, {f1: 99}]}", "{ f1: { $in: [ NaN, 42, 99 ] } }"}, + {"{$or: [{f1: /^x/}, {f1:'ab'}]}", "{ f1: { $in: [ \"ab\", /^x/ ] } }"}, + {"{$or: [{f1: /^x/}, {f1:'^a'}]}", "{ f1: { $in: [ \"^a\", /^x/ ] } }"}, + {"{$or: [{f1: 42}, {f1: null}, {f1: 99}]}", + "{ $or: [ { f1: { $in: [ 42, 99 ] } }, { f1: { $eq: null } } ] }"}, + {"{$or: [{f1: 1}, {f2: 9}, {f1: 99}]}", + "{ $or: [ { f1: { $in: [ 1, 99 ] } }, { f2: { $eq: 9 } } ] }"}, + {"{$and: [{$or: [{f1: 7}, {f1: 3}, {f1: 5}]}, {$or: [{f1: 1}, {f1: 2}, {f1: 3}]}]}", + "{ $and: [ { f1: { $in: [ 3, 5, 7 ] } }, { f1: { $in: [ 1, 2, 3 ] } } ] }"}, + {"{$or: [{$or: [{f1: 7}, {f1: 3}, {f1: 5}]}, {$or: [{f1: 1}, {f1: 2}, {f1: 3}]}]}", + "{ $or: [ { f1: { $in: [ 3, 5, 7 ] } }, { f1: { $in: [ 1, 2, 3 ] } } ] }"}, + {"{$or: [{$and: [{f1: 7}, {f2: 7}, {f1: 5}]}, {$or: [{f1: 1}, {f1: 2}, {f1: 3}]}]}", + "{ $or: [ { $and: [ { f1: { $eq: 7 } }, { f2: { $eq: 7 } }, { f1: { $eq: 5 } } ] }," + " { f1: { $in: [ 1, 2, 3 ] } } ] }"}, + }; + + auto optimizeExpr = [](std::string exprStr) { + auto obj = fromjson(exprStr); + std::unique_ptr<MatchExpression> matchExpression(parseMatchExpression(obj)); + auto optimizedMatchExpression = MatchExpression::optimize(std::move(matchExpression)); + BSONObjBuilder bob; + optimizedMatchExpression->serialize(&bob, true); + return bob.obj(); + }; + + ASSERT_BSONOBJ_EQ(optimizeExpr(queries[0].first), fromjson(queries[0].second)); + ASSERT_BSONOBJ_EQ(optimizeExpr(queries[1].first), fromjson(queries[1].second)); + ASSERT_BSONOBJ_EQ(optimizeExpr(queries[2].first), fromjson(queries[2].second)); + ASSERT_BSONOBJ_EQ(optimizeExpr(queries[3].first), fromjson(queries[3].second)); + ASSERT_BSONOBJ_EQ(optimizeExpr(queries[4].first), fromjson(queries[4].second)); + ASSERT_BSONOBJ_EQ(optimizeExpr(queries[5].first), fromjson(queries[5].second)); + ASSERT_BSONOBJ_EQ(optimizeExpr(queries[6].first), fromjson(queries[6].second)); + ASSERT_BSONOBJ_EQ(optimizeExpr(queries[7].first), fromjson(queries[7].second)); + ASSERT_BSONOBJ_EQ(optimizeExpr(queries[8].first), fromjson(queries[8].second)); + ASSERT_BSONOBJ_EQ(optimizeExpr(queries[9].first), fromjson(queries[9].second)); +} + } // namespace } // namespace mongo diff --git a/src/mongo/db/matcher/expression_tree.cpp b/src/mongo/db/matcher/expression_tree.cpp index 92fab31668b..9df86035a47 100644 --- a/src/mongo/db/matcher/expression_tree.cpp +++ b/src/mongo/db/matcher/expression_tree.cpp @@ -152,6 +152,142 @@ MatchExpression::ExpressionOptimizerFunc ListOfMatchExpression::getOptimizer() c } } + // Rewrite an OR with EQ conditions on the same path as an IN-list. Example: + // {$or: [{name: "Don"}, {name: "Alice"}]} + // is rewritten as: + // {name: {$in: ["Alice", "Don"]}} + if (matchType == MatchExpression::OR && children.size() > 1) { + size_t countEquivEqPaths = 0; + size_t countNonEquivExpr = 0; + boost::optional<std::string> childPath; + const CollatorInterface* eqCollator = nullptr; + + auto isNullOrRegEx = [](const BSONElement& elm) { + return (elm.isNull() || elm.type() == BSONType::RegEx); + }; + + // Check if all children are equality conditions or regular expressions with the + // same path argument, and same collation. + for (auto& childExpression : children) { + if (childExpression->matchType() != MatchExpression::EQ && + childExpression->matchType() != MatchExpression::REGEX) { + ++countNonEquivExpr; + continue; + } + + // Disjunctions of equalities use $eq comparison, which has different semantics + // from $in equality comparison in two cases: + // (1) ('null' $eq 'undefined' = true), while ('null' $in 'undefined' = false), + // that is, when comparing a 'null' argument to an 'undefined' value, the + // result is different for $eq vs $in. + // (2) the regex under the equality is matched literally as a string constant, + // while a regex inside $in is matched as a regular expression. + // $lookup processing explicitly depends on this different semantics. + // Both these cases should not be rewritten into $in because of the different + // comparison semantics. + const CollatorInterface* curCollator = nullptr; + if (childExpression->matchType() == MatchExpression::EQ) { + auto eqExpression = + static_cast<EqualityMatchExpression*>(childExpression.get()); + curCollator = eqExpression->getCollator(); + if (isNullOrRegEx(eqExpression->getData())) { + ++countNonEquivExpr; + continue; + } + } + + // childExpression is an equality with $in comparison semantics. + // The current approach assumes there is one (large) group of $eq disjuncts + // that are on the same path. + if (!childPath) { + // The path of the first equality. + childPath = childExpression->path().toString(); + eqCollator = curCollator; + countEquivEqPaths = 1; + } else if (*childPath == childExpression->path() && eqCollator == curCollator) { + ++countEquivEqPaths; // subsequent equality on the same path + } else { + ++countNonEquivExpr; // equality on another path + } + } + tassert(3401201, + "All expressions must be classified as either eq-equiv or non-eq-equiv.", + countEquivEqPaths + countNonEquivExpr == children.size()); + + // The condition above checks that there are at least two equalities that can be + // rewritten to an $in, and the we have classified all $or conditions into two disjunct + // groups. + if (countEquivEqPaths > 1) { + tassert(3401202, "There must be a common path.", childPath); + auto inExpression = std::make_unique<InMatchExpression>(StringData(*childPath)); + auto nonEquivOrExpr = + (countNonEquivExpr > 0) ? std::make_unique<OrMatchExpression>() : nullptr; + BSONArrayBuilder bab; + + for (auto& childExpression : children) { + if (*childPath != childExpression->path()) { + nonEquivOrExpr->add(std::move(childExpression)); + } else if (childExpression->matchType() == MatchExpression::EQ) { + std::unique_ptr<EqualityMatchExpression> eqExpressionPtr{ + static_cast<EqualityMatchExpression*>(childExpression.release())}; + if (isNullOrRegEx(eqExpressionPtr->getData()) || + eqExpressionPtr->getCollator() != eqCollator) { + nonEquivOrExpr->add(std::move(eqExpressionPtr)); + } else { + bab.append(eqExpressionPtr->getData()); + } + } else if (childExpression->matchType() == MatchExpression::REGEX) { + std::unique_ptr<RegexMatchExpression> regexExpressionPtr{ + static_cast<RegexMatchExpression*>(childExpression.release())}; + // Reset the path because when we parse a $in expression which contains a + // regexp, we create a RegexMatchExpression with an empty path. + regexExpressionPtr->setPath({}); + auto status = inExpression->addRegex(std::move(regexExpressionPtr)); + tassert(3401203, // TODO SERVER-55449 convert to tassertStatusOK. + "Conversion from OR to IN should always succeed.", + status == Status::OK()); + } else { + nonEquivOrExpr->add(std::move(childExpression)); + } + } + children.clear(); + tassert(3401204, + "Incorrect number of non-equivalent expressions", + !nonEquivOrExpr || nonEquivOrExpr->numChildren() == countNonEquivExpr); + + auto backingArr = bab.arr(); + std::vector<BSONElement> inEqualities; + backingArr.elems(inEqualities); + tassert(3401205, + "Incorrect number of in-equivalent expressions", + !countEquivEqPaths || + (inEqualities.size() + inExpression->getRegexes().size()) == + countEquivEqPaths); + + auto status = inExpression->setEqualities(std::move(inEqualities)); + tassert(3401206, // TODO SERVER-55449 convert to tassertStatusOK. + "Conversion from OR to IN should always succeed.", + status == Status::OK()); + + inExpression->setBackingBSON(std::move(backingArr)); + if (eqCollator) { + inExpression->setCollator(eqCollator); + } + + if (countNonEquivExpr > 0) { + auto parentOrExpr = std::make_unique<OrMatchExpression>(); + parentOrExpr->add(std::move(inExpression)); + if (countNonEquivExpr == 1) { + parentOrExpr->add(nonEquivOrExpr->releaseChild(0)); + } else { + parentOrExpr->add(std::move(nonEquivOrExpr)); + } + return parentOrExpr; + } + return inExpression; + } + } + return expression; }; } |