diff options
author | Kyle Suarez <kyle.suarez@mongodb.com> | 2017-10-26 18:08:11 -0400 |
---|---|---|
committer | Kyle Suarez <kyle.suarez@mongodb.com> | 2017-10-26 18:08:34 -0400 |
commit | 260fd0c76599520d9d733874753a94a2db763538 (patch) | |
tree | 6d2e5c0c91ed2e279d3fa5cdef89dd687cf8514f | |
parent | 722136a8bff8208787bb97186d7b339bb5116ba3 (diff) | |
download | mongo-260fd0c76599520d9d733874753a94a2db763538.tar.gz |
SERVER-20432 allow some escaped | chars in regexes to use tight index bounds
-rw-r--r-- | jstests/core/index_bounds_pipe.js | 113 | ||||
-rw-r--r-- | src/mongo/db/query/index_bounds_builder.cpp | 50 | ||||
-rw-r--r-- | src/mongo/db/query/index_bounds_builder_test.cpp | 49 |
3 files changed, 198 insertions, 14 deletions
diff --git a/jstests/core/index_bounds_pipe.js b/jstests/core/index_bounds_pipe.js new file mode 100644 index 00000000000..e284d5c78f3 --- /dev/null +++ b/jstests/core/index_bounds_pipe.js @@ -0,0 +1,113 @@ +/** + * Tests the tightness of index bounds when attempting to match a regex that contains escaped and + * non-escaped pipe '|' characters. + */ +(function() { + 'use strict'; + + load('jstests/libs/analyze_plan.js'); + + const collName = 'index_bounds_pipe'; + const coll = db.getCollection(collName); + coll.drop(); + + assert.writeOK(coll.insert({_id: ''})); + assert.writeOK(coll.insert({_id: '\\|'})); + assert.writeOK(coll.insert({_id: 'a'})); + assert.writeOK(coll.insert({_id: 'a|b'})); + assert.writeOK(coll.insert({_id: 'b'})); + assert.writeOK(coll.insert({_id: '|'})); + + /** + * Asserts that a query on a field using 'params.regex' uses index bounds 'params.bounds' and + * returns results identical to 'params.results'. + * + * Also tests that a query using 'params.regex' will return documents with a field of type regex + * with an identical regular expression value. + */ + function assertIndexBoundsAndResult(params) { + const query = {_id: params.regex}; + const command = {find: collName, filter: query, projection: {_id: 1}, sort: {_id: 1}}; + const explain = db.runCommand({explain: command}); + assert.commandWorked(explain); + + // Check that the query uses correct index bounds. + const ixscan = getPlanStage(explain.queryPlanner.winningPlan, 'IXSCAN'); + assert.neq(ixscan, null, 'Plan unexpectedly missing IXSCAN stage: ' + tojson(explain)); + assert.eq(ixscan.indexBounds._id, + params.bounds, + 'Expected bounds of ' + tojson(params.bounds) + ' but got ' + + tojson(ixscan.indexBounds._id)); + + // Check that the query regex matches expected strings. + const results = db.runCommand(command); + assert.commandWorked(results); + assert.eq(results.cursor.firstBatch, + params.results, + 'Regex query ' + tojson(query) + ' returned incorrect results'); + + // Check that the query regex will exactly match identical regular expression objects. + const collRegexValue = db.getCollection(collName + params.regex); + collRegexValue.drop(); + assert.commandWorked(collRegexValue.createIndex({x: 1})); + + const doc = {_id: 0, x: params.regex}; + assert.writeOK(collRegexValue.insert(doc)); + + const regexQuery = {x: params.regex}; + assert.eq(collRegexValue.findOne(regexQuery), + doc, + 'Regex query ' + tojson(regexQuery) + + ' did not match document with identical regex value'); + } + + // An anchored regex that uses no special operators can use tight index bounds. + assertIndexBoundsAndResult( + {regex: /^a/, bounds: ['["a", "b")', '[/^a/, /^a/]'], results: [{_id: 'a'}, {_id: 'a|b'}]}); + assertIndexBoundsAndResult( + {regex: /^\\/, bounds: ['["\\", "]")', '[/^\\\\/, /^\\\\/]'], results: [{_id: '\\|'}]}); + + // An anchored regex using the alternation operator cannot use tight index bounds. + assertIndexBoundsAndResult({ + regex: /^a|b/, + bounds: ['["", {})', '[/^a|b/, /^a|b/]'], + results: [{_id: 'a'}, {_id: 'a|b'}, {_id: 'b'}] + }); + + // An anchored regex that uses an escaped pipe character can use tight index bounds. + assertIndexBoundsAndResult( + {regex: /^a\|/, bounds: ['["a|", "a}")', '[/^a\\|/, /^a\\|/]'], results: [{_id: 'a|b'}]}); + assertIndexBoundsAndResult( + {regex: /^\|/, bounds: ['["|", "}")', '[/^\\|/, /^\\|/]'], results: [{_id: '|'}]}); + + // A pipe character that is preceded by an escaped backslash is correctly interpreted as the + // alternation operator and cannot use tight index bounds. + assertIndexBoundsAndResult({ + regex: /^\\|b/, + bounds: ['["", {})', '[/^\\\\|b/, /^\\\\|b/]'], + results: [{_id: '\\|'}, {_id: 'a|b'}, {_id: 'b'}] + }); + assertIndexBoundsAndResult({ + regex: /^\\|^b/, + bounds: ['["", {})', '[/^\\\\|^b/, /^\\\\|^b/]'], + results: [{_id: '\\|'}, {_id: 'b'}] + }); + + // An escaped backslash immediately followed by an escaped pipe does not use tight index bounds. + assertIndexBoundsAndResult({ + regex: /^\\\|/, + bounds: ['["", {})', '[/^\\\\\\|/, /^\\\\\\|/]'], + results: [{_id: '\\|'}] + }); + + // A pipe escaped with the \Q...\E escape sequence does not use tight index bounds. + assertIndexBoundsAndResult( + {regex: /^\Q|\E/, bounds: ['["", {})', '[/^\\Q|\\E/, /^\\Q|\\E/]'], results: [{_id: '|'}]}); + + // An escaped pipe within \Q...\E can use tight index bounds. + assertIndexBoundsAndResult({ + regex: /^\Q\|\E/, + bounds: ['["\\|", "\\}")', '[/^\\Q\\|\\E/, /^\\Q\\|\\E/]'], + results: [{_id: '\\|'}] + }); +}()); diff --git a/src/mongo/db/query/index_bounds_builder.cpp b/src/mongo/db/query/index_bounds_builder.cpp index cf94e16822c..344cb119877 100644 --- a/src/mongo/db/query/index_bounds_builder.cpp +++ b/src/mongo/db/query/index_bounds_builder.cpp @@ -61,6 +61,34 @@ IndexBoundsBuilder::BoundsTightness getInequalityPredicateTightness(const BSONEl : IndexBoundsBuilder::INEXACT_FETCH; } +/** + * Returns true if 'str' contains a non-escaped pipe character '|' on a best-effort basis. This + * function reports no false negatives, but will return false positives. For example, a pipe + * character inside of a character class or the \Q...\E escape sequence has no special meaning but + * may still be reported by this function as being non-escaped. + */ +bool stringMayHaveUnescapedPipe(StringData str) { + if (str.size() > 0 && str[0] == '|') { + return true; + } + if (str.size() > 1 && str[1] == '|' && str[0] != '\\') { + return true; + } + + for (size_t i = 2U; i < str.size(); ++i) { + auto probe = str[i]; + auto prev = str[i - 1]; + auto tail = str[i - 2]; + + // We consider the pipe to have a special meaning if it is not preceded by a backslash, or + // preceded by a backslash that is itself escaped. + if (probe == '|' && (prev != '\\' || (prev == '\\' && tail == '\\'))) { + return true; + } + } + return false; +} + } // namespace string IndexBoundsBuilder::simpleRegex(const char* regex, @@ -77,7 +105,6 @@ string IndexBoundsBuilder::simpleRegex(const char* regex, return ""; } - string r = ""; *tightnessOut = IndexBoundsBuilder::INEXACT_COVERED; bool multilineOK; @@ -88,42 +115,43 @@ string IndexBoundsBuilder::simpleRegex(const char* regex, multilineOK = false; regex += 1; } else { - return r; + return ""; } - // A regex with the "|" character is never considered a simple regular expression. - if (StringData(regex).find('|') != std::string::npos) { + // A regex with an unescaped pipe character is not considered a simple regex. + if (stringMayHaveUnescapedPipe(StringData(regex))) { return ""; } bool extended = false; while (*flags) { switch (*(flags++)) { - case 'm': // multiline + case 'm': + // Multiline mode. if (multilineOK) continue; else - return r; + return ""; case 's': // Single-line mode specified. This just changes the behavior of the '.' // character to match every character instead of every character except '\n'. continue; - case 'x': // extended + case 'x': + // Extended free-spacing mode. extended = true; break; default: - return r; // cant use index + // Cannot use the index. + return ""; } } mongoutils::str::stream ss; + string r = ""; while (*regex) { char c = *(regex++); - // We should have bailed out early above if '|' is in the regex. - invariant(c != '|'); - if (c == '*' || c == '?') { // These are the only two symbols that make the last char optional r = ss; diff --git a/src/mongo/db/query/index_bounds_builder_test.cpp b/src/mongo/db/query/index_bounds_builder_test.cpp index 264972d017f..8305f91be4b 100644 --- a/src/mongo/db/query/index_bounds_builder_test.cpp +++ b/src/mongo/db/query/index_bounds_builder_test.cpp @@ -1046,8 +1046,9 @@ TEST(SimpleRegexTest, RootedLiteralNestedEscapeEnd) { ASSERT_EQUALS(tightness, IndexBoundsBuilder::EXACT); } -// A regular expression with the "|" character is not considered simple. See SERVER-15235. -TEST(SimpleRegexTest, PipeCharacterDisallowed) { +// An anchored regular expression that uses the "|" operator is not considered "simple" and has +// non-tight index bounds. +TEST(SimpleRegexTest, PipeCharacterUsesInexactBounds) { IndexEntry testIndex = IndexEntry(BSONObj()); IndexBoundsBuilder::BoundsTightness tightness; string prefix = IndexBoundsBuilder::simpleRegex("^(a(a|$)|b", "", testIndex, &tightness); @@ -1055,7 +1056,7 @@ TEST(SimpleRegexTest, PipeCharacterDisallowed) { ASSERT_EQUALS(tightness, IndexBoundsBuilder::INEXACT_COVERED); } -TEST(SimpleRegexTest, PipeCharacterDisallowed2) { +TEST(SimpleRegexTest, PipeCharacterUsesInexactBoundsWithTwoPrefixes) { IndexEntry testIndex = IndexEntry(BSONObj()); IndexBoundsBuilder::BoundsTightness tightness; string prefix = IndexBoundsBuilder::simpleRegex("^(a(a|$)|^b", "", testIndex, &tightness); @@ -1063,6 +1064,48 @@ TEST(SimpleRegexTest, PipeCharacterDisallowed2) { ASSERT_EQUALS(tightness, IndexBoundsBuilder::INEXACT_COVERED); } +TEST(SimpleRegexTest, PipeCharacterPrecededByEscapedBackslashUsesInexactBounds) { + IndexEntry testIndex = IndexEntry(BSONObj()); + IndexBoundsBuilder::BoundsTightness tightness; + string prefix = IndexBoundsBuilder::simpleRegex(R"(^a\\|b)", "", testIndex, &tightness); + ASSERT_EQUALS(prefix, ""); + ASSERT_EQUALS(tightness, IndexBoundsBuilder::INEXACT_COVERED); + + prefix = IndexBoundsBuilder::simpleRegex(R"(^(foo\\|bar)\\|baz)", "", testIndex, &tightness); + ASSERT_EQUALS(prefix, ""); + ASSERT_EQUALS(tightness, IndexBoundsBuilder::INEXACT_COVERED); +} + +// However, a regular expression with an escaped pipe (that is, using no special meaning) can use +// exact index bounds. +TEST(SimpleRegexTest, PipeCharacterEscapedWithBackslashUsesExactBounds) { + IndexEntry testIndex = IndexEntry(BSONObj()); + IndexBoundsBuilder::BoundsTightness tightness; + string prefix = IndexBoundsBuilder::simpleRegex(R"(^a\|b)", "", testIndex, &tightness); + ASSERT_EQUALS(prefix, "a|b"); + ASSERT_EQUALS(tightness, IndexBoundsBuilder::EXACT); + + prefix = IndexBoundsBuilder::simpleRegex(R"(^\|1\|2\|\|)", "", testIndex, &tightness); + ASSERT_EQUALS(prefix, "|1|2||"); + ASSERT_EQUALS(tightness, IndexBoundsBuilder::EXACT); +} + +TEST(SimpleRegexTest, FalsePositiveOnPipeInQEEscapeSequenceUsesInexactBounds) { + IndexEntry testIndex = IndexEntry(BSONObj()); + IndexBoundsBuilder::BoundsTightness tightness; + string prefix = IndexBoundsBuilder::simpleRegex(R"(^\Q|\E)", "", testIndex, &tightness); + ASSERT_EQUALS(prefix, ""); + ASSERT_EQUALS(tightness, IndexBoundsBuilder::INEXACT_COVERED); +} + +TEST(SimpleRegexTest, FalsePositiveOnPipeInCharacterClassUsesInexactBounds) { + IndexEntry testIndex = IndexEntry(BSONObj()); + IndexBoundsBuilder::BoundsTightness tightness; + string prefix = IndexBoundsBuilder::simpleRegex(R"(^[|])", "", testIndex, &tightness); + ASSERT_EQUALS(prefix, ""); + ASSERT_EQUALS(tightness, IndexBoundsBuilder::INEXACT_COVERED); +} + // SERVER-9035 TEST(SimpleRegexTest, RootedSingleLineMode) { IndexEntry testIndex = IndexEntry(BSONObj()); |