summaryrefslogtreecommitdiff
path: root/src/mongo/db/query
diff options
context:
space:
mode:
authorDavid Storch <david.storch@mongodb.com>2022-05-03 00:23:39 +0000
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2022-05-03 01:29:24 +0000
commit2cee6cb397119b83fa45b53743e3bcea4106c5fc (patch)
treed4249a2c7197f2a3a65d3ea936f70fe416ae9a40 /src/mongo/db/query
parent0ba7cf78b7d59497eafd38b5aafeac29e8484cf1 (diff)
downloadmongo-2cee6cb397119b83fa45b53743e3bcea4106c5fc.tar.gz
SERVER-66015 Distinguish simple and non-simple regexes in SBE plan cache key
Diffstat (limited to 'src/mongo/db/query')
-rw-r--r--src/mongo/db/query/SConscript4
-rw-r--r--src/mongo/db/query/analyze_regex.cpp180
-rw-r--r--src/mongo/db/query/analyze_regex.h49
-rw-r--r--src/mongo/db/query/canonical_query_encoder.cpp16
-rw-r--r--src/mongo/db/query/index_bounds_builder.cpp142
5 files changed, 252 insertions, 139 deletions
diff --git a/src/mongo/db/query/SConscript b/src/mongo/db/query/SConscript
index e141fdb2642..1f42baff54c 100644
--- a/src/mongo/db/query/SConscript
+++ b/src/mongo/db/query/SConscript
@@ -32,6 +32,7 @@ env.Library(
"sort_pattern",
],
LIBDEPS_PRIVATE=[
+ "common_query_enums_and_helpers",
],
)
@@ -159,8 +160,9 @@ env.Library(
target="common_query_enums_and_helpers",
source=[
"allowed_contexts.cpp",
+ "analyze_regex.cpp",
"explain_options.cpp",
- "explain_verbosity.idl"
+ "explain_verbosity.idl",
],
LIBDEPS=[
"$BUILD_DIR/mongo/base",
diff --git a/src/mongo/db/query/analyze_regex.cpp b/src/mongo/db/query/analyze_regex.cpp
new file mode 100644
index 00000000000..fc1296670cc
--- /dev/null
+++ b/src/mongo/db/query/analyze_regex.cpp
@@ -0,0 +1,180 @@
+/**
+ * Copyright (C) 2022-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/db/query/analyze_regex.h"
+
+#include <cstring>
+
+#include "mongo/util/ctype.h"
+#include "mongo/util/str.h"
+
+namespace mongo::analyze_regex {
+
+namespace {
+/**
+ * Returns true if 'str' contains a non-escaped pipe character '|' on a best-effort basis. This
+ * function reports no false negatives, but will return false positives. For example, a pipe
+ * character inside of a character class or the \Q...\E escape sequence has no special meaning but
+ * may still be reported by this function as being non-escaped.
+ */
+bool stringMayHaveUnescapedPipe(StringData str) {
+ if (str.size() > 0 && str[0] == '|') {
+ return true;
+ }
+ if (str.size() > 1 && str[1] == '|' && str[0] != '\\') {
+ return true;
+ }
+
+ for (size_t i = 2U; i < str.size(); ++i) {
+ auto probe = str[i];
+ auto prev = str[i - 1];
+ auto tail = str[i - 2];
+
+ // We consider the pipe to have a special meaning if it is not preceded by a backslash, or
+ // preceded by a backslash that is itself escaped.
+ if (probe == '|' && (prev != '\\' || (prev == '\\' && tail == '\\'))) {
+ return true;
+ }
+ }
+ return false;
+}
+} // namespace
+
+std::pair<std::string, bool> getRegexPrefixMatch(const char* regex, const char* flags) {
+ bool multilineOK;
+ if (regex[0] == '\\' && regex[1] == 'A') {
+ multilineOK = true;
+ regex += 2;
+ } else if (regex[0] == '^') {
+ multilineOK = false;
+ regex += 1;
+ } else {
+ return {"", false};
+ }
+
+ // A regex with an unescaped pipe character is not considered a simple regex.
+ if (stringMayHaveUnescapedPipe(StringData(regex))) {
+ return {"", false};
+ }
+
+ bool extended = false;
+ while (*flags) {
+ switch (*(flags++)) {
+ case 'm':
+ // Multiline mode.
+ if (multilineOK)
+ continue;
+ else
+ return {"", false};
+ case 's':
+ // Single-line mode specified. This just changes the behavior of the '.' character
+ // to match every character instead of every character except '\n'.
+ continue;
+ case 'x':
+ // Extended free-spacing mode.
+ extended = true;
+ break;
+ default:
+ // Cannot use the index.
+ return {"", false};
+ }
+ }
+
+ str::stream ss;
+
+ std::string r = "";
+ while (*regex) {
+ char c = *(regex++);
+
+ if (c == '*' || c == '?') {
+ // These are the only two symbols that make the last char optional.
+ r = ss;
+ r = r.substr(0, r.size() - 1);
+ // Patterns like /^a?/ and /^a*/ can be implemented by scanning all the strings
+ // beginning with the prefix "a", but the regex must be reapplied. We cannot convert
+ // such regexes into exact bounds.
+ return {r, false};
+ } else if (c == '\\') {
+ c = *(regex++);
+ if (c == 'Q') {
+ // \Q...\E quotes everything inside.
+ while (*regex) {
+ c = (*regex++);
+ if (c == '\\' && (*regex == 'E')) {
+ regex++; // skip the 'E'
+ break; // go back to start of outer loop
+ } else {
+ ss << c; // character should match itself
+ }
+ }
+ } else if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') ||
+ (c == '\0')) {
+ // Bail out on any of these escape sequences.
+ r = ss;
+ break;
+ } else {
+ // Slash followed by non-alphanumeric represents the following char.
+ ss << c;
+ }
+ } else if (strchr("^$.[()+{", c)) {
+ // List of "metacharacters" from man pcrepattern.
+ //
+ // For prefix patterns ending in '.*' (ex. /^abc.*/) we can build exact index bounds.
+ if (!multilineOK && (c == '.')) {
+ c = *(regex++);
+ if (c == '*' && *regex == 0) {
+ return {ss, true};
+ } else {
+ c = *(regex--);
+ }
+ }
+ r = ss;
+ break;
+ } else if (extended && c == '#') {
+ // Comment.
+ r = ss;
+ break;
+ } else if (extended && ctype::isSpace(c)) {
+ continue;
+ } else {
+ // Self-matching char.
+ ss << c;
+ }
+ }
+
+ bool isExactPrefixMatch = false;
+ if (r.empty() && *regex == 0) {
+ r = ss;
+ isExactPrefixMatch = !r.empty();
+ }
+
+ return {r, isExactPrefixMatch};
+}
+
+} // namespace mongo::analyze_regex
diff --git a/src/mongo/db/query/analyze_regex.h b/src/mongo/db/query/analyze_regex.h
new file mode 100644
index 00000000000..5cce3597f13
--- /dev/null
+++ b/src/mongo/db/query/analyze_regex.h
@@ -0,0 +1,49 @@
+/**
+ * Copyright (C) 2022-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#pragma once
+
+#include <string>
+#include <utility>
+
+namespace mongo::analyze_regex {
+
+/**
+ * Analyzes the regular expression given by 'regex' and 'flags' and returns a pair containing the
+ * following:
+ * - A (possibly empty) string which must be the prefix of all strings which match the regex. For
+ * example, for /^bar?/ this would be the string "bar".
+ * - A boolean which indicates whether the regex can be converted to simple prefix match. For
+ * instance, /^bar/ means "starts with 'bar'" and would return the pair ("bar", true). On the other
+ * hand, /^bar?/ does not match all strings that start with "bar" and therefore would return
+ * ("bar", false).
+ */
+std::pair<std::string, bool> getRegexPrefixMatch(const char* regex, const char* flags);
+
+} // namespace mongo::analyze_regex
diff --git a/src/mongo/db/query/canonical_query_encoder.cpp b/src/mongo/db/query/canonical_query_encoder.cpp
index c03d5ee2e88..494894ec097 100644
--- a/src/mongo/db/query/canonical_query_encoder.cpp
+++ b/src/mongo/db/query/canonical_query_encoder.cpp
@@ -41,6 +41,7 @@
#include "mongo/db/matcher/expression_text_noop.h"
#include "mongo/db/matcher/expression_where.h"
#include "mongo/db/matcher/expression_where_noop.h"
+#include "mongo/db/query/analyze_regex.h"
#include "mongo/db/query/projection.h"
#include "mongo/db/query/query_feature_flags_gen.h"
#include "mongo/db/query/query_knobs_gen.h"
@@ -93,6 +94,11 @@ const char kEncodeParamMarker = '?';
// constant is typically encoded as a BSON type byte followed by a BSON value (without the
// BSONElement's field name).
const char kEncodeConstantLiteralMarker = ':';
+// Precedes a byte which encodes the bounds tightness associated with a predicate. The structure of
+// the plan (i.e. presence of filters) is affected by bounds tightness. Therefore, if different
+// parameter values can result in different tightnesses, this must be explicitly encoded into the
+// plan cache key.
+const char kEncodeBoundsTightnessDiscriminator = ':';
/**
* AppendChar provides the compiler with a type for a "appendChar(...)" member function.
@@ -750,6 +756,16 @@ public:
encodeParamMarker(*sourceRegexParam);
encodeParamMarker(*compiledRegexParam);
+
+ // Encode a discriminator so that a "simple" regex which is exactly convertible into
+ // index bounds has a different shape from a non-simple regex.
+ //
+ // We don't actually need to know the contents of the prefix string, so we ignore the
+ // first member of the pair.
+ auto [_, isExact] = analyze_regex::getRegexPrefixMatch(expr->getString().c_str(),
+ expr->getFlags().c_str());
+ _builder->appendChar(kEncodeBoundsTightnessDiscriminator);
+ _builder->appendChar(static_cast<char>(isExact));
} else {
tassert(6579301,
"If source param is not set in $regex expression compiled param must be unset "
diff --git a/src/mongo/db/query/index_bounds_builder.cpp b/src/mongo/db/query/index_bounds_builder.cpp
index bd82140b5c1..01e13b5f058 100644
--- a/src/mongo/db/query/index_bounds_builder.cpp
+++ b/src/mongo/db/query/index_bounds_builder.cpp
@@ -43,6 +43,7 @@
#include "mongo/db/matcher/expression_geo.h"
#include "mongo/db/matcher/expression_internal_bucket_geo_within.h"
#include "mongo/db/matcher/expression_internal_expr_comparison.h"
+#include "mongo/db/query/analyze_regex.h"
#include "mongo/db/query/collation/collation_index_key.h"
#include "mongo/db/query/collation/collator_interface.h"
#include "mongo/db/query/expression_index.h"
@@ -52,8 +53,6 @@
#include "mongo/db/query/planner_wildcard_helpers.h"
#include "mongo/db/query/query_knobs_gen.h"
#include "mongo/logv2/log.h"
-#include "mongo/util/ctype.h"
-#include "mongo/util/str.h"
#include "third_party/s2/s2cell.h"
#include "third_party/s2/s2regioncoverer.h"
@@ -93,34 +92,6 @@ IndexBoundsBuilder::BoundsTightness getInequalityPredicateTightness(const Interv
: IndexBoundsBuilder::INEXACT_FETCH;
}
-/**
- * Returns true if 'str' contains a non-escaped pipe character '|' on a best-effort basis. This
- * function reports no false negatives, but will return false positives. For example, a pipe
- * character inside of a character class or the \Q...\E escape sequence has no special meaning but
- * may still be reported by this function as being non-escaped.
- */
-bool stringMayHaveUnescapedPipe(StringData str) {
- if (str.size() > 0 && str[0] == '|') {
- return true;
- }
- if (str.size() > 1 && str[1] == '|' && str[0] != '\\') {
- return true;
- }
-
- for (size_t i = 2U; i < str.size(); ++i) {
- auto probe = str[i];
- auto prev = str[i - 1];
- auto tail = str[i - 2];
-
- // We consider the pipe to have a special meaning if it is not preceded by a backslash, or
- // preceded by a backslash that is itself escaped.
- if (probe == '|' && (prev != '\\' || (prev == '\\' && tail == '\\'))) {
- return true;
- }
- }
- return false;
-}
-
const BSONObj kUndefinedElementObj = BSON("" << BSONUndefined);
const BSONObj kNullElementObj = BSON("" << BSONNULL);
const BSONObj kEmptyArrayElementObj = BSON("" << BSONArray());
@@ -184,116 +155,11 @@ string IndexBoundsBuilder::simpleRegex(const char* regex,
return "";
}
- *tightnessOut = IndexBoundsBuilder::INEXACT_COVERED;
-
- bool multilineOK;
- if (regex[0] == '\\' && regex[1] == 'A') {
- multilineOK = true;
- regex += 2;
- } else if (regex[0] == '^') {
- multilineOK = false;
- regex += 1;
- } else {
- return "";
- }
-
- // A regex with an unescaped pipe character is not considered a simple regex.
- if (stringMayHaveUnescapedPipe(StringData(regex))) {
- return "";
- }
-
- bool extended = false;
- while (*flags) {
- switch (*(flags++)) {
- case 'm':
- // Multiline mode.
- if (multilineOK)
- continue;
- else
- return "";
- case 's':
- // Single-line mode specified. This just changes the behavior of the '.'
- // character to match every character instead of every character except '\n'.
- continue;
- case 'x':
- // Extended free-spacing mode.
- extended = true;
- break;
- default:
- // Cannot use the index.
- return "";
- }
- }
-
- str::stream ss;
-
- string r = "";
- while (*regex) {
- char c = *(regex++);
-
- if (c == '*' || c == '?') {
- // These are the only two symbols that make the last char optional
- r = ss;
- r = r.substr(0, r.size() - 1);
- return r; // breaking here fails with /^a?/
- } else if (c == '\\') {
- c = *(regex++);
- if (c == 'Q') {
- // \Q...\E quotes everything inside
- while (*regex) {
- c = (*regex++);
- if (c == '\\' && (*regex == 'E')) {
- regex++; // skip the 'E'
- break; // go back to start of outer loop
- } else {
- ss << c; // character should match itself
- }
- }
- } else if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') ||
- (c == '\0')) {
- // don't know what to do with these
- r = ss;
- break;
- } else {
- // slash followed by non-alphanumeric represents the following char
- ss << c;
- }
- } else if (strchr("^$.[()+{", c)) {
- // list of "metacharacters" from man pcrepattern
- // For prefix patterns ending in '.*' (ex. /^abc.*/) we can build exact index bounds.
- if (!multilineOK && (c == '.')) {
- c = *(regex++);
- if (c == '*' && *regex == 0) {
- *tightnessOut = IndexBoundsBuilder::EXACT;
- return ss;
- } else {
- c = *(regex--);
- }
- }
- r = ss;
- break;
- } else if (extended && c == '#') {
- // comment
- r = ss;
- break;
- } else if (extended && ctype::isSpace(c)) {
- continue;
- } else {
- // self-matching char
- ss << c;
- }
- }
-
- if (r.empty() && *regex == 0) {
- r = ss;
- *tightnessOut = r.empty() ? IndexBoundsBuilder::INEXACT_COVERED : IndexBoundsBuilder::EXACT;
- }
-
- return r;
+ auto [prefixStr, isExact] = analyze_regex::getRegexPrefixMatch(regex, flags);
+ *tightnessOut = isExact ? IndexBoundsBuilder::EXACT : IndexBoundsBuilder::INEXACT_COVERED;
+ return prefixStr;
}
-
-// static
void IndexBoundsBuilder::allValuesForField(const BSONElement& elt, OrderedIntervalList* out) {
// ARGH, BSONValue would make this shorter.
BSONObjBuilder bob;