SERVER-66015 Distinguish simple and non-simple regexes in SBE plan cache key

author: David Storch <david.storch@mongodb.com> 2022-05-03 00:23:39 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2022-05-03 01:29:24 +0000
commit: 2cee6cb397119b83fa45b53743e3bcea4106c5fc (patch)
tree: d4249a2c7197f2a3a65d3ea936f70fe416ae9a40 /src/mongo/db/query
parent: 0ba7cf78b7d59497eafd38b5aafeac29e8484cf1 (diff)
download: mongo-2cee6cb397119b83fa45b53743e3bcea4106c5fc.tar.gz
5 files changed, 252 insertions, 139 deletions
diff --git a/src/mongo/db/query/SConscript b/src/mongo/db/query/SConscript
index e141fdb2642..1f42baff54c 100644
--- a/src/mongo/db/query/SConscript
+++ b/src/mongo/db/query/SConscript
@@ -32,6 +32,7 @@ env.Library(
         "sort_pattern",
     ],
     LIBDEPS_PRIVATE=[
+        "common_query_enums_and_helpers",
     ],
 )
 
@@ -159,8 +160,9 @@ env.Library(
     target="common_query_enums_and_helpers",
     source=[
         "allowed_contexts.cpp",
+        "analyze_regex.cpp",
         "explain_options.cpp",
-        "explain_verbosity.idl"
+        "explain_verbosity.idl",
     ],
     LIBDEPS=[
         "$BUILD_DIR/mongo/base",
diff --git a/src/mongo/db/query/analyze_regex.cpp b/src/mongo/db/query/analyze_regex.cpp
new file mode 100644
index 00000000000..fc1296670cc
--- /dev/null
+++ b/src/mongo/db/query/analyze_regex.cpp
@@ -0,0 +1,180 @@
+/**
+ *    Copyright (C) 2022-present MongoDB, Inc.
+ *
+ *    This program is free software: you can redistribute it and/or modify
+ *    it under the terms of the Server Side Public License, version 1,
+ *    as published by MongoDB, Inc.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    Server Side Public License for more details.
+ *
+ *    You should have received a copy of the Server Side Public License
+ *    along with this program. If not, see
+ *    <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ *    As a special exception, the copyright holders give permission to link the
+ *    code of portions of this program with the OpenSSL library under certain
+ *    conditions as described in each individual source file and distribute
+ *    linked combinations including the program with the OpenSSL library. You
+ *    must comply with the Server Side Public License in all respects for
+ *    all of the code used other than as permitted herein. If you modify file(s)
+ *    with this exception, you may extend this exception to your version of the
+ *    file(s), but you are not obligated to do so. If you do not wish to do so,
+ *    delete this exception statement from your version. If you delete this
+ *    exception statement from all source files in the program, then also delete
+ *    it in the license file.
+ */
+
+#include "mongo/db/query/analyze_regex.h"
+
+#include <cstring>
+
+#include "mongo/util/ctype.h"
+#include "mongo/util/str.h"
+
+namespace mongo::analyze_regex {
+
+namespace {
+/**
+ * Returns true if 'str' contains a non-escaped pipe character '|' on a best-effort basis. This
+ * function reports no false negatives, but will return false positives. For example, a pipe
+ * character inside of a character class or the \Q...\E escape sequence has no special meaning but
+ * may still be reported by this function as being non-escaped.
+ */
+bool stringMayHaveUnescapedPipe(StringData str) {
+    if (str.size() > 0 && str[0] == '|') {
+        return true;
+    }
+    if (str.size() > 1 && str[1] == '|' && str[0] != '\\') {
+        return true;
+    }
+
+    for (size_t i = 2U; i < str.size(); ++i) {
+        auto probe = str[i];
+        auto prev = str[i - 1];
+        auto tail = str[i - 2];
+
+        // We consider the pipe to have a special meaning if it is not preceded by a backslash, or
+        // preceded by a backslash that is itself escaped.
+        if (probe == '|' && (prev != '\\' || (prev == '\\' && tail == '\\'))) {
+            return true;
+        }
+    }
+    return false;
+}
+}  // namespace
+
+std::pair<std::string, bool> getRegexPrefixMatch(const char* regex, const char* flags) {
+    bool multilineOK;
+    if (regex[0] == '\\' && regex[1] == 'A') {
+        multilineOK = true;
+        regex += 2;
+    } else if (regex[0] == '^') {
+        multilineOK = false;
+        regex += 1;
+    } else {
+        return {"", false};
+    }
+
+    // A regex with an unescaped pipe character is not considered a simple regex.
+    if (stringMayHaveUnescapedPipe(StringData(regex))) {
+        return {"", false};
+    }
+
+    bool extended = false;
+    while (*flags) {
+        switch (*(flags++)) {
+            case 'm':
+                // Multiline mode.
+                if (multilineOK)
+                    continue;
+                else
+                    return {"", false};
+            case 's':
+                // Single-line mode specified. This just changes the behavior of the '.' character
+                // to match every character instead of every character except '\n'.
+                continue;
+            case 'x':
+                // Extended free-spacing mode.
+                extended = true;
+                break;
+            default:
+                // Cannot use the index.
+                return {"", false};
+        }
+    }
+
+    str::stream ss;
+
+    std::string r = "";
+    while (*regex) {
+        char c = *(regex++);
+
+        if (c == '*' || c == '?') {
+            // These are the only two symbols that make the last char optional.
+            r = ss;
+            r = r.substr(0, r.size() - 1);
+            // Patterns like /^a?/ and /^a*/ can be implemented by scanning all the strings
+            // beginning with the prefix "a", but the regex must be reapplied. We cannot convert
+            // such regexes into exact bounds.
+            return {r, false};
+        } else if (c == '\\') {
+            c = *(regex++);
+            if (c == 'Q') {
+                // \Q...\E quotes everything inside.
+                while (*regex) {
+                    c = (*regex++);
+                    if (c == '\\' && (*regex == 'E')) {
+                        regex++;  // skip the 'E'
+                        break;    // go back to start of outer loop
+                    } else {
+                        ss << c;  // character should match itself
+                    }
+                }
+            } else if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') ||
+                       (c == '\0')) {
+                // Bail out on any of these escape sequences.
+                r = ss;
+                break;
+            } else {
+                // Slash followed by non-alphanumeric represents the following char.
+                ss << c;
+            }
+        } else if (strchr("^$.[()+{", c)) {
+            // List of "metacharacters" from man pcrepattern.
+            //
+            // For prefix patterns ending in '.*' (ex. /^abc.*/) we can build exact index bounds.
+            if (!multilineOK && (c == '.')) {
+                c = *(regex++);
+                if (c == '*' && *regex == 0) {
+                    return {ss, true};
+                } else {
+                    c = *(regex--);
+                }
+            }
+            r = ss;
+            break;
+        } else if (extended && c == '#') {
+            // Comment.
+            r = ss;
+            break;
+        } else if (extended && ctype::isSpace(c)) {
+            continue;
+        } else {
+            // Self-matching char.
+            ss << c;
+        }
+    }
+
+    bool isExactPrefixMatch = false;
+    if (r.empty() && *regex == 0) {
+        r = ss;
+        isExactPrefixMatch = !r.empty();
+    }
+
+    return {r, isExactPrefixMatch};
+}
+
+}  // namespace mongo::analyze_regex
diff --git a/src/mongo/db/query/analyze_regex.h b/src/mongo/db/query/analyze_regex.h
new file mode 100644
index 00000000000..5cce3597f13
--- /dev/null
+++ b/src/mongo/db/query/analyze_regex.h
@@ -0,0 +1,49 @@
+/**
+ *    Copyright (C) 2022-present MongoDB, Inc.
+ *
+ *    This program is free software: you can redistribute it and/or modify
+ *    it under the terms of the Server Side Public License, version 1,
+ *    as published by MongoDB, Inc.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    Server Side Public License for more details.
+ *
+ *    You should have received a copy of the Server Side Public License
+ *    along with this program. If not, see
+ *    <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ *    As a special exception, the copyright holders give permission to link the
+ *    code of portions of this program with the OpenSSL library under certain
+ *    conditions as described in each individual source file and distribute
+ *    linked combinations including the program with the OpenSSL library. You
+ *    must comply with the Server Side Public License in all respects for
+ *    all of the code used other than as permitted herein. If you modify file(s)
+ *    with this exception, you may extend this exception to your version of the
+ *    file(s), but you are not obligated to do so. If you do not wish to do so,
+ *    delete this exception statement from your version. If you delete this
+ *    exception statement from all source files in the program, then also delete
+ *    it in the license file.
+ */
+
+#pragma once
+
+#include <string>
+#include <utility>
+
+namespace mongo::analyze_regex {
+
+/**
+ * Analyzes the regular expression given by 'regex' and 'flags' and returns a pair containing the
+ * following:
+ *  - A (possibly empty) string which must be the prefix of all strings which match the regex. For
+ *  example, for /^bar?/ this would be the string "bar".
+ *  - A boolean which indicates whether the regex can be converted to simple prefix match. For
+ *  instance, /^bar/ means "starts with 'bar'" and would return the pair ("bar", true). On the other
+ *  hand, /^bar?/ does not match all strings that start with "bar" and therefore would return
+ *  ("bar", false).
+ */
+std::pair<std::string, bool> getRegexPrefixMatch(const char* regex, const char* flags);
+
+}  // namespace mongo::analyze_regex
diff --git a/src/mongo/db/query/canonical_query_encoder.cpp b/src/mongo/db/query/canonical_query_encoder.cpp
index c03d5ee2e88..494894ec097 100644
--- a/src/mongo/db/query/canonical_query_encoder.cpp
+++ b/src/mongo/db/query/canonical_query_encoder.cpp
@@ -41,6 +41,7 @@
 #include "mongo/db/matcher/expression_text_noop.h"
 #include "mongo/db/matcher/expression_where.h"
 #include "mongo/db/matcher/expression_where_noop.h"
+#include "mongo/db/query/analyze_regex.h"
 #include "mongo/db/query/projection.h"
 #include "mongo/db/query/query_feature_flags_gen.h"
 #include "mongo/db/query/query_knobs_gen.h"
@@ -93,6 +94,11 @@ const char kEncodeParamMarker = '?';
 // constant is typically encoded as a BSON type byte followed by a BSON value (without the
 // BSONElement's field name).
 const char kEncodeConstantLiteralMarker = ':';
+// Precedes a byte which encodes the bounds tightness associated with a predicate. The structure of
+// the plan (i.e. presence of filters) is affected by bounds tightness. Therefore, if different
+// parameter values can result in different tightnesses, this must be explicitly encoded into the
+// plan cache key.
+const char kEncodeBoundsTightnessDiscriminator = ':';
 
 /**
  * AppendChar provides the compiler with a type for a "appendChar(...)" member function.
@@ -750,6 +756,16 @@ public:
 
             encodeParamMarker(*sourceRegexParam);
             encodeParamMarker(*compiledRegexParam);
+
+            // Encode a discriminator so that a "simple" regex which is exactly convertible into
+            // index bounds has a different shape from a non-simple regex.
+            //
+            // We don't actually need to know the contents of the prefix string, so we ignore the
+            // first member of the pair.
+            auto [_, isExact] = analyze_regex::getRegexPrefixMatch(expr->getString().c_str(),
+                                                                   expr->getFlags().c_str());
+            _builder->appendChar(kEncodeBoundsTightnessDiscriminator);
+            _builder->appendChar(static_cast<char>(isExact));
         } else {
             tassert(6579301,
                     "If source param is not set in $regex expression compiled param must be unset "
diff --git a/src/mongo/db/query/index_bounds_builder.cpp b/src/mongo/db/query/index_bounds_builder.cpp
index bd82140b5c1..01e13b5f058 100644
--- a/src/mongo/db/query/index_bounds_builder.cpp
+++ b/src/mongo/db/query/index_bounds_builder.cpp
@@ -43,6 +43,7 @@
 #include "mongo/db/matcher/expression_geo.h"
 #include "mongo/db/matcher/expression_internal_bucket_geo_within.h"
 #include "mongo/db/matcher/expression_internal_expr_comparison.h"
+#include "mongo/db/query/analyze_regex.h"
 #include "mongo/db/query/collation/collation_index_key.h"
 #include "mongo/db/query/collation/collator_interface.h"
 #include "mongo/db/query/expression_index.h"
@@ -52,8 +53,6 @@
 #include "mongo/db/query/planner_wildcard_helpers.h"
 #include "mongo/db/query/query_knobs_gen.h"
 #include "mongo/logv2/log.h"
-#include "mongo/util/ctype.h"
-#include "mongo/util/str.h"
 #include "third_party/s2/s2cell.h"
 #include "third_party/s2/s2regioncoverer.h"
 
@@ -93,34 +92,6 @@ IndexBoundsBuilder::BoundsTightness getInequalityPredicateTightness(const Interv
                                                           : IndexBoundsBuilder::INEXACT_FETCH;
 }
 
-/**
- * Returns true if 'str' contains a non-escaped pipe character '|' on a best-effort basis. This
- * function reports no false negatives, but will return false positives. For example, a pipe
- * character inside of a character class or the \Q...\E escape sequence has no special meaning but
- * may still be reported by this function as being non-escaped.
- */
-bool stringMayHaveUnescapedPipe(StringData str) {
-    if (str.size() > 0 && str[0] == '|') {
-        return true;
-    }
-    if (str.size() > 1 && str[1] == '|' && str[0] != '\\') {
-        return true;
-    }
-
-    for (size_t i = 2U; i < str.size(); ++i) {
-        auto probe = str[i];
-        auto prev = str[i - 1];
-        auto tail = str[i - 2];
-
-        // We consider the pipe to have a special meaning if it is not preceded by a backslash, or
-        // preceded by a backslash that is itself escaped.
-        if (probe == '|' && (prev != '\\' || (prev == '\\' && tail == '\\'))) {
-            return true;
-        }
-    }
-    return false;
-}
-
 const BSONObj kUndefinedElementObj = BSON("" << BSONUndefined);
 const BSONObj kNullElementObj = BSON("" << BSONNULL);
 const BSONObj kEmptyArrayElementObj = BSON("" << BSONArray());
@@ -184,116 +155,11 @@ string IndexBoundsBuilder::simpleRegex(const char* regex,
         return "";
     }
 
-    *tightnessOut = IndexBoundsBuilder::INEXACT_COVERED;
-
-    bool multilineOK;
-    if (regex[0] == '\\' && regex[1] == 'A') {
-        multilineOK = true;
-        regex += 2;
-    } else if (regex[0] == '^') {
-        multilineOK = false;
-        regex += 1;
-    } else {
-        return "";
-    }
-
-    // A regex with an unescaped pipe character is not considered a simple regex.
-    if (stringMayHaveUnescapedPipe(StringData(regex))) {
-        return "";
-    }
-
-    bool extended = false;
-    while (*flags) {
-        switch (*(flags++)) {
-            case 'm':
-                // Multiline mode.
-                if (multilineOK)
-                    continue;
-                else
-                    return "";
-            case 's':
-                // Single-line mode specified. This just changes the behavior of the '.'
-                // character to match every character instead of every character except '\n'.
-                continue;
-            case 'x':
-                // Extended free-spacing mode.
-                extended = true;
-                break;
-            default:
-                // Cannot use the index.
-                return "";
-        }
-    }
-
-    str::stream ss;
-
-    string r = "";
-    while (*regex) {
-        char c = *(regex++);
-
-        if (c == '*' || c == '?') {
-            // These are the only two symbols that make the last char optional
-            r = ss;
-            r = r.substr(0, r.size() - 1);
-            return r;  // breaking here fails with /^a?/
-        } else if (c == '\\') {
-            c = *(regex++);
-            if (c == 'Q') {
-                // \Q...\E quotes everything inside
-                while (*regex) {
-                    c = (*regex++);
-                    if (c == '\\' && (*regex == 'E')) {
-                        regex++;  // skip the 'E'
-                        break;    // go back to start of outer loop
-                    } else {
-                        ss << c;  // character should match itself
-                    }
-                }
-            } else if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') ||
-                       (c == '\0')) {
-                // don't know what to do with these
-                r = ss;
-                break;
-            } else {
-                // slash followed by non-alphanumeric represents the following char
-                ss << c;
-            }
-        } else if (strchr("^$.[()+{", c)) {
-            // list of "metacharacters" from man pcrepattern
-            // For prefix patterns ending in '.*' (ex. /^abc.*/) we can build exact index bounds.
-            if (!multilineOK && (c == '.')) {
-                c = *(regex++);
-                if (c == '*' && *regex == 0) {
-                    *tightnessOut = IndexBoundsBuilder::EXACT;
-                    return ss;
-                } else {
-                    c = *(regex--);
-                }
-            }
-            r = ss;
-            break;
-        } else if (extended && c == '#') {
-            // comment
-            r = ss;
-            break;
-        } else if (extended && ctype::isSpace(c)) {
-            continue;
-        } else {
-            // self-matching char
-            ss << c;
-        }
-    }
-
-    if (r.empty() && *regex == 0) {
-        r = ss;
-        *tightnessOut = r.empty() ? IndexBoundsBuilder::INEXACT_COVERED : IndexBoundsBuilder::EXACT;
-    }
-
-    return r;
+    auto [prefixStr, isExact] = analyze_regex::getRegexPrefixMatch(regex, flags);
+    *tightnessOut = isExact ? IndexBoundsBuilder::EXACT : IndexBoundsBuilder::INEXACT_COVERED;
+    return prefixStr;
 }
 
-
-// static
 void IndexBoundsBuilder::allValuesForField(const BSONElement& elt, OrderedIntervalList* out) {
     // ARGH, BSONValue would make this shorter.
     BSONObjBuilder bob;
author	David Storch <david.storch@mongodb.com>	2022-05-03 00:23:39 +0000
committer	Evergreen Agent <no-reply@evergreen.mongodb.com>	2022-05-03 01:29:24 +0000
commit	2cee6cb397119b83fa45b53743e3bcea4106c5fc (patch)
tree	d4249a2c7197f2a3a65d3ea936f70fe416ae9a40 /src/mongo/db/query
parent	0ba7cf78b7d59497eafd38b5aafeac29e8484cf1 (diff)
download	mongo-2cee6cb397119b83fa45b53743e3bcea4106c5fc.tar.gz