diff options
author | Kyle Suarez <kyle.suarez@mongodb.com> | 2017-10-26 18:08:11 -0400 |
---|---|---|
committer | Kyle Suarez <kyle.suarez@mongodb.com> | 2017-10-26 18:08:34 -0400 |
commit | 260fd0c76599520d9d733874753a94a2db763538 (patch) | |
tree | 6d2e5c0c91ed2e279d3fa5cdef89dd687cf8514f /src/mongo/db | |
parent | 722136a8bff8208787bb97186d7b339bb5116ba3 (diff) | |
download | mongo-260fd0c76599520d9d733874753a94a2db763538.tar.gz |
SERVER-20432 allow some escaped | chars in regexes to use tight index bounds
Diffstat (limited to 'src/mongo/db')
-rw-r--r-- | src/mongo/db/query/index_bounds_builder.cpp | 50 | ||||
-rw-r--r-- | src/mongo/db/query/index_bounds_builder_test.cpp | 49 |
2 files changed, 85 insertions, 14 deletions
diff --git a/src/mongo/db/query/index_bounds_builder.cpp b/src/mongo/db/query/index_bounds_builder.cpp index cf94e16822c..344cb119877 100644 --- a/src/mongo/db/query/index_bounds_builder.cpp +++ b/src/mongo/db/query/index_bounds_builder.cpp @@ -61,6 +61,34 @@ IndexBoundsBuilder::BoundsTightness getInequalityPredicateTightness(const BSONEl : IndexBoundsBuilder::INEXACT_FETCH; } +/** + * Returns true if 'str' contains a non-escaped pipe character '|' on a best-effort basis. This + * function reports no false negatives, but will return false positives. For example, a pipe + * character inside of a character class or the \Q...\E escape sequence has no special meaning but + * may still be reported by this function as being non-escaped. + */ +bool stringMayHaveUnescapedPipe(StringData str) { + if (str.size() > 0 && str[0] == '|') { + return true; + } + if (str.size() > 1 && str[1] == '|' && str[0] != '\\') { + return true; + } + + for (size_t i = 2U; i < str.size(); ++i) { + auto probe = str[i]; + auto prev = str[i - 1]; + auto tail = str[i - 2]; + + // We consider the pipe to have a special meaning if it is not preceded by a backslash, or + // preceded by a backslash that is itself escaped. + if (probe == '|' && (prev != '\\' || (prev == '\\' && tail == '\\'))) { + return true; + } + } + return false; +} + } // namespace string IndexBoundsBuilder::simpleRegex(const char* regex, @@ -77,7 +105,6 @@ string IndexBoundsBuilder::simpleRegex(const char* regex, return ""; } - string r = ""; *tightnessOut = IndexBoundsBuilder::INEXACT_COVERED; bool multilineOK; @@ -88,42 +115,43 @@ string IndexBoundsBuilder::simpleRegex(const char* regex, multilineOK = false; regex += 1; } else { - return r; + return ""; } - // A regex with the "|" character is never considered a simple regular expression. - if (StringData(regex).find('|') != std::string::npos) { + // A regex with an unescaped pipe character is not considered a simple regex. + if (stringMayHaveUnescapedPipe(StringData(regex))) { return ""; } bool extended = false; while (*flags) { switch (*(flags++)) { - case 'm': // multiline + case 'm': + // Multiline mode. if (multilineOK) continue; else - return r; + return ""; case 's': // Single-line mode specified. This just changes the behavior of the '.' // character to match every character instead of every character except '\n'. continue; - case 'x': // extended + case 'x': + // Extended free-spacing mode. extended = true; break; default: - return r; // cant use index + // Cannot use the index. + return ""; } } mongoutils::str::stream ss; + string r = ""; while (*regex) { char c = *(regex++); - // We should have bailed out early above if '|' is in the regex. - invariant(c != '|'); - if (c == '*' || c == '?') { // These are the only two symbols that make the last char optional r = ss; diff --git a/src/mongo/db/query/index_bounds_builder_test.cpp b/src/mongo/db/query/index_bounds_builder_test.cpp index 264972d017f..8305f91be4b 100644 --- a/src/mongo/db/query/index_bounds_builder_test.cpp +++ b/src/mongo/db/query/index_bounds_builder_test.cpp @@ -1046,8 +1046,9 @@ TEST(SimpleRegexTest, RootedLiteralNestedEscapeEnd) { ASSERT_EQUALS(tightness, IndexBoundsBuilder::EXACT); } -// A regular expression with the "|" character is not considered simple. See SERVER-15235. -TEST(SimpleRegexTest, PipeCharacterDisallowed) { +// An anchored regular expression that uses the "|" operator is not considered "simple" and has +// non-tight index bounds. +TEST(SimpleRegexTest, PipeCharacterUsesInexactBounds) { IndexEntry testIndex = IndexEntry(BSONObj()); IndexBoundsBuilder::BoundsTightness tightness; string prefix = IndexBoundsBuilder::simpleRegex("^(a(a|$)|b", "", testIndex, &tightness); @@ -1055,7 +1056,7 @@ TEST(SimpleRegexTest, PipeCharacterDisallowed) { ASSERT_EQUALS(tightness, IndexBoundsBuilder::INEXACT_COVERED); } -TEST(SimpleRegexTest, PipeCharacterDisallowed2) { +TEST(SimpleRegexTest, PipeCharacterUsesInexactBoundsWithTwoPrefixes) { IndexEntry testIndex = IndexEntry(BSONObj()); IndexBoundsBuilder::BoundsTightness tightness; string prefix = IndexBoundsBuilder::simpleRegex("^(a(a|$)|^b", "", testIndex, &tightness); @@ -1063,6 +1064,48 @@ TEST(SimpleRegexTest, PipeCharacterDisallowed2) { ASSERT_EQUALS(tightness, IndexBoundsBuilder::INEXACT_COVERED); } +TEST(SimpleRegexTest, PipeCharacterPrecededByEscapedBackslashUsesInexactBounds) { + IndexEntry testIndex = IndexEntry(BSONObj()); + IndexBoundsBuilder::BoundsTightness tightness; + string prefix = IndexBoundsBuilder::simpleRegex(R"(^a\\|b)", "", testIndex, &tightness); + ASSERT_EQUALS(prefix, ""); + ASSERT_EQUALS(tightness, IndexBoundsBuilder::INEXACT_COVERED); + + prefix = IndexBoundsBuilder::simpleRegex(R"(^(foo\\|bar)\\|baz)", "", testIndex, &tightness); + ASSERT_EQUALS(prefix, ""); + ASSERT_EQUALS(tightness, IndexBoundsBuilder::INEXACT_COVERED); +} + +// However, a regular expression with an escaped pipe (that is, using no special meaning) can use +// exact index bounds. +TEST(SimpleRegexTest, PipeCharacterEscapedWithBackslashUsesExactBounds) { + IndexEntry testIndex = IndexEntry(BSONObj()); + IndexBoundsBuilder::BoundsTightness tightness; + string prefix = IndexBoundsBuilder::simpleRegex(R"(^a\|b)", "", testIndex, &tightness); + ASSERT_EQUALS(prefix, "a|b"); + ASSERT_EQUALS(tightness, IndexBoundsBuilder::EXACT); + + prefix = IndexBoundsBuilder::simpleRegex(R"(^\|1\|2\|\|)", "", testIndex, &tightness); + ASSERT_EQUALS(prefix, "|1|2||"); + ASSERT_EQUALS(tightness, IndexBoundsBuilder::EXACT); +} + +TEST(SimpleRegexTest, FalsePositiveOnPipeInQEEscapeSequenceUsesInexactBounds) { + IndexEntry testIndex = IndexEntry(BSONObj()); + IndexBoundsBuilder::BoundsTightness tightness; + string prefix = IndexBoundsBuilder::simpleRegex(R"(^\Q|\E)", "", testIndex, &tightness); + ASSERT_EQUALS(prefix, ""); + ASSERT_EQUALS(tightness, IndexBoundsBuilder::INEXACT_COVERED); +} + +TEST(SimpleRegexTest, FalsePositiveOnPipeInCharacterClassUsesInexactBounds) { + IndexEntry testIndex = IndexEntry(BSONObj()); + IndexBoundsBuilder::BoundsTightness tightness; + string prefix = IndexBoundsBuilder::simpleRegex(R"(^[|])", "", testIndex, &tightness); + ASSERT_EQUALS(prefix, ""); + ASSERT_EQUALS(tightness, IndexBoundsBuilder::INEXACT_COVERED); +} + // SERVER-9035 TEST(SimpleRegexTest, RootedSingleLineMode) { IndexEntry testIndex = IndexEntry(BSONObj()); |