diff options
author | David Storch <david.storch@mongodb.com> | 2022-05-03 00:23:39 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2022-05-03 01:29:24 +0000 |
commit | 2cee6cb397119b83fa45b53743e3bcea4106c5fc (patch) | |
tree | d4249a2c7197f2a3a65d3ea936f70fe416ae9a40 /src/mongo/db/query | |
parent | 0ba7cf78b7d59497eafd38b5aafeac29e8484cf1 (diff) | |
download | mongo-2cee6cb397119b83fa45b53743e3bcea4106c5fc.tar.gz |
SERVER-66015 Distinguish simple and non-simple regexes in SBE plan cache key
Diffstat (limited to 'src/mongo/db/query')
-rw-r--r-- | src/mongo/db/query/SConscript | 4 | ||||
-rw-r--r-- | src/mongo/db/query/analyze_regex.cpp | 180 | ||||
-rw-r--r-- | src/mongo/db/query/analyze_regex.h | 49 | ||||
-rw-r--r-- | src/mongo/db/query/canonical_query_encoder.cpp | 16 | ||||
-rw-r--r-- | src/mongo/db/query/index_bounds_builder.cpp | 142 |
5 files changed, 252 insertions, 139 deletions
diff --git a/src/mongo/db/query/SConscript b/src/mongo/db/query/SConscript index e141fdb2642..1f42baff54c 100644 --- a/src/mongo/db/query/SConscript +++ b/src/mongo/db/query/SConscript @@ -32,6 +32,7 @@ env.Library( "sort_pattern", ], LIBDEPS_PRIVATE=[ + "common_query_enums_and_helpers", ], ) @@ -159,8 +160,9 @@ env.Library( target="common_query_enums_and_helpers", source=[ "allowed_contexts.cpp", + "analyze_regex.cpp", "explain_options.cpp", - "explain_verbosity.idl" + "explain_verbosity.idl", ], LIBDEPS=[ "$BUILD_DIR/mongo/base", diff --git a/src/mongo/db/query/analyze_regex.cpp b/src/mongo/db/query/analyze_regex.cpp new file mode 100644 index 00000000000..fc1296670cc --- /dev/null +++ b/src/mongo/db/query/analyze_regex.cpp @@ -0,0 +1,180 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/query/analyze_regex.h" + +#include <cstring> + +#include "mongo/util/ctype.h" +#include "mongo/util/str.h" + +namespace mongo::analyze_regex { + +namespace { +/** + * Returns true if 'str' contains a non-escaped pipe character '|' on a best-effort basis. This + * function reports no false negatives, but will return false positives. For example, a pipe + * character inside of a character class or the \Q...\E escape sequence has no special meaning but + * may still be reported by this function as being non-escaped. + */ +bool stringMayHaveUnescapedPipe(StringData str) { + if (str.size() > 0 && str[0] == '|') { + return true; + } + if (str.size() > 1 && str[1] == '|' && str[0] != '\\') { + return true; + } + + for (size_t i = 2U; i < str.size(); ++i) { + auto probe = str[i]; + auto prev = str[i - 1]; + auto tail = str[i - 2]; + + // We consider the pipe to have a special meaning if it is not preceded by a backslash, or + // preceded by a backslash that is itself escaped. + if (probe == '|' && (prev != '\\' || (prev == '\\' && tail == '\\'))) { + return true; + } + } + return false; +} +} // namespace + +std::pair<std::string, bool> getRegexPrefixMatch(const char* regex, const char* flags) { + bool multilineOK; + if (regex[0] == '\\' && regex[1] == 'A') { + multilineOK = true; + regex += 2; + } else if (regex[0] == '^') { + multilineOK = false; + regex += 1; + } else { + return {"", false}; + } + + // A regex with an unescaped pipe character is not considered a simple regex. + if (stringMayHaveUnescapedPipe(StringData(regex))) { + return {"", false}; + } + + bool extended = false; + while (*flags) { + switch (*(flags++)) { + case 'm': + // Multiline mode. + if (multilineOK) + continue; + else + return {"", false}; + case 's': + // Single-line mode specified. This just changes the behavior of the '.' character + // to match every character instead of every character except '\n'. + continue; + case 'x': + // Extended free-spacing mode. + extended = true; + break; + default: + // Cannot use the index. + return {"", false}; + } + } + + str::stream ss; + + std::string r = ""; + while (*regex) { + char c = *(regex++); + + if (c == '*' || c == '?') { + // These are the only two symbols that make the last char optional. + r = ss; + r = r.substr(0, r.size() - 1); + // Patterns like /^a?/ and /^a*/ can be implemented by scanning all the strings + // beginning with the prefix "a", but the regex must be reapplied. We cannot convert + // such regexes into exact bounds. + return {r, false}; + } else if (c == '\\') { + c = *(regex++); + if (c == 'Q') { + // \Q...\E quotes everything inside. + while (*regex) { + c = (*regex++); + if (c == '\\' && (*regex == 'E')) { + regex++; // skip the 'E' + break; // go back to start of outer loop + } else { + ss << c; // character should match itself + } + } + } else if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || + (c == '\0')) { + // Bail out on any of these escape sequences. + r = ss; + break; + } else { + // Slash followed by non-alphanumeric represents the following char. + ss << c; + } + } else if (strchr("^$.[()+{", c)) { + // List of "metacharacters" from man pcrepattern. + // + // For prefix patterns ending in '.*' (ex. /^abc.*/) we can build exact index bounds. + if (!multilineOK && (c == '.')) { + c = *(regex++); + if (c == '*' && *regex == 0) { + return {ss, true}; + } else { + c = *(regex--); + } + } + r = ss; + break; + } else if (extended && c == '#') { + // Comment. + r = ss; + break; + } else if (extended && ctype::isSpace(c)) { + continue; + } else { + // Self-matching char. + ss << c; + } + } + + bool isExactPrefixMatch = false; + if (r.empty() && *regex == 0) { + r = ss; + isExactPrefixMatch = !r.empty(); + } + + return {r, isExactPrefixMatch}; +} + +} // namespace mongo::analyze_regex diff --git a/src/mongo/db/query/analyze_regex.h b/src/mongo/db/query/analyze_regex.h new file mode 100644 index 00000000000..5cce3597f13 --- /dev/null +++ b/src/mongo/db/query/analyze_regex.h @@ -0,0 +1,49 @@ +/** + * Copyright (C) 2022-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include <string> +#include <utility> + +namespace mongo::analyze_regex { + +/** + * Analyzes the regular expression given by 'regex' and 'flags' and returns a pair containing the + * following: + * - A (possibly empty) string which must be the prefix of all strings which match the regex. For + * example, for /^bar?/ this would be the string "bar". + * - A boolean which indicates whether the regex can be converted to simple prefix match. For + * instance, /^bar/ means "starts with 'bar'" and would return the pair ("bar", true). On the other + * hand, /^bar?/ does not match all strings that start with "bar" and therefore would return + * ("bar", false). + */ +std::pair<std::string, bool> getRegexPrefixMatch(const char* regex, const char* flags); + +} // namespace mongo::analyze_regex diff --git a/src/mongo/db/query/canonical_query_encoder.cpp b/src/mongo/db/query/canonical_query_encoder.cpp index c03d5ee2e88..494894ec097 100644 --- a/src/mongo/db/query/canonical_query_encoder.cpp +++ b/src/mongo/db/query/canonical_query_encoder.cpp @@ -41,6 +41,7 @@ #include "mongo/db/matcher/expression_text_noop.h" #include "mongo/db/matcher/expression_where.h" #include "mongo/db/matcher/expression_where_noop.h" +#include "mongo/db/query/analyze_regex.h" #include "mongo/db/query/projection.h" #include "mongo/db/query/query_feature_flags_gen.h" #include "mongo/db/query/query_knobs_gen.h" @@ -93,6 +94,11 @@ const char kEncodeParamMarker = '?'; // constant is typically encoded as a BSON type byte followed by a BSON value (without the // BSONElement's field name). const char kEncodeConstantLiteralMarker = ':'; +// Precedes a byte which encodes the bounds tightness associated with a predicate. The structure of +// the plan (i.e. presence of filters) is affected by bounds tightness. Therefore, if different +// parameter values can result in different tightnesses, this must be explicitly encoded into the +// plan cache key. +const char kEncodeBoundsTightnessDiscriminator = ':'; /** * AppendChar provides the compiler with a type for a "appendChar(...)" member function. @@ -750,6 +756,16 @@ public: encodeParamMarker(*sourceRegexParam); encodeParamMarker(*compiledRegexParam); + + // Encode a discriminator so that a "simple" regex which is exactly convertible into + // index bounds has a different shape from a non-simple regex. + // + // We don't actually need to know the contents of the prefix string, so we ignore the + // first member of the pair. + auto [_, isExact] = analyze_regex::getRegexPrefixMatch(expr->getString().c_str(), + expr->getFlags().c_str()); + _builder->appendChar(kEncodeBoundsTightnessDiscriminator); + _builder->appendChar(static_cast<char>(isExact)); } else { tassert(6579301, "If source param is not set in $regex expression compiled param must be unset " diff --git a/src/mongo/db/query/index_bounds_builder.cpp b/src/mongo/db/query/index_bounds_builder.cpp index bd82140b5c1..01e13b5f058 100644 --- a/src/mongo/db/query/index_bounds_builder.cpp +++ b/src/mongo/db/query/index_bounds_builder.cpp @@ -43,6 +43,7 @@ #include "mongo/db/matcher/expression_geo.h" #include "mongo/db/matcher/expression_internal_bucket_geo_within.h" #include "mongo/db/matcher/expression_internal_expr_comparison.h" +#include "mongo/db/query/analyze_regex.h" #include "mongo/db/query/collation/collation_index_key.h" #include "mongo/db/query/collation/collator_interface.h" #include "mongo/db/query/expression_index.h" @@ -52,8 +53,6 @@ #include "mongo/db/query/planner_wildcard_helpers.h" #include "mongo/db/query/query_knobs_gen.h" #include "mongo/logv2/log.h" -#include "mongo/util/ctype.h" -#include "mongo/util/str.h" #include "third_party/s2/s2cell.h" #include "third_party/s2/s2regioncoverer.h" @@ -93,34 +92,6 @@ IndexBoundsBuilder::BoundsTightness getInequalityPredicateTightness(const Interv : IndexBoundsBuilder::INEXACT_FETCH; } -/** - * Returns true if 'str' contains a non-escaped pipe character '|' on a best-effort basis. This - * function reports no false negatives, but will return false positives. For example, a pipe - * character inside of a character class or the \Q...\E escape sequence has no special meaning but - * may still be reported by this function as being non-escaped. - */ -bool stringMayHaveUnescapedPipe(StringData str) { - if (str.size() > 0 && str[0] == '|') { - return true; - } - if (str.size() > 1 && str[1] == '|' && str[0] != '\\') { - return true; - } - - for (size_t i = 2U; i < str.size(); ++i) { - auto probe = str[i]; - auto prev = str[i - 1]; - auto tail = str[i - 2]; - - // We consider the pipe to have a special meaning if it is not preceded by a backslash, or - // preceded by a backslash that is itself escaped. - if (probe == '|' && (prev != '\\' || (prev == '\\' && tail == '\\'))) { - return true; - } - } - return false; -} - const BSONObj kUndefinedElementObj = BSON("" << BSONUndefined); const BSONObj kNullElementObj = BSON("" << BSONNULL); const BSONObj kEmptyArrayElementObj = BSON("" << BSONArray()); @@ -184,116 +155,11 @@ string IndexBoundsBuilder::simpleRegex(const char* regex, return ""; } - *tightnessOut = IndexBoundsBuilder::INEXACT_COVERED; - - bool multilineOK; - if (regex[0] == '\\' && regex[1] == 'A') { - multilineOK = true; - regex += 2; - } else if (regex[0] == '^') { - multilineOK = false; - regex += 1; - } else { - return ""; - } - - // A regex with an unescaped pipe character is not considered a simple regex. - if (stringMayHaveUnescapedPipe(StringData(regex))) { - return ""; - } - - bool extended = false; - while (*flags) { - switch (*(flags++)) { - case 'm': - // Multiline mode. - if (multilineOK) - continue; - else - return ""; - case 's': - // Single-line mode specified. This just changes the behavior of the '.' - // character to match every character instead of every character except '\n'. - continue; - case 'x': - // Extended free-spacing mode. - extended = true; - break; - default: - // Cannot use the index. - return ""; - } - } - - str::stream ss; - - string r = ""; - while (*regex) { - char c = *(regex++); - - if (c == '*' || c == '?') { - // These are the only two symbols that make the last char optional - r = ss; - r = r.substr(0, r.size() - 1); - return r; // breaking here fails with /^a?/ - } else if (c == '\\') { - c = *(regex++); - if (c == 'Q') { - // \Q...\E quotes everything inside - while (*regex) { - c = (*regex++); - if (c == '\\' && (*regex == 'E')) { - regex++; // skip the 'E' - break; // go back to start of outer loop - } else { - ss << c; // character should match itself - } - } - } else if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || - (c == '\0')) { - // don't know what to do with these - r = ss; - break; - } else { - // slash followed by non-alphanumeric represents the following char - ss << c; - } - } else if (strchr("^$.[()+{", c)) { - // list of "metacharacters" from man pcrepattern - // For prefix patterns ending in '.*' (ex. /^abc.*/) we can build exact index bounds. - if (!multilineOK && (c == '.')) { - c = *(regex++); - if (c == '*' && *regex == 0) { - *tightnessOut = IndexBoundsBuilder::EXACT; - return ss; - } else { - c = *(regex--); - } - } - r = ss; - break; - } else if (extended && c == '#') { - // comment - r = ss; - break; - } else if (extended && ctype::isSpace(c)) { - continue; - } else { - // self-matching char - ss << c; - } - } - - if (r.empty() && *regex == 0) { - r = ss; - *tightnessOut = r.empty() ? IndexBoundsBuilder::INEXACT_COVERED : IndexBoundsBuilder::EXACT; - } - - return r; + auto [prefixStr, isExact] = analyze_regex::getRegexPrefixMatch(regex, flags); + *tightnessOut = isExact ? IndexBoundsBuilder::EXACT : IndexBoundsBuilder::INEXACT_COVERED; + return prefixStr; } - -// static void IndexBoundsBuilder::allValuesForField(const BSONElement& elt, OrderedIntervalList* out) { // ARGH, BSONValue would make this shorter. BSONObjBuilder bob; |