From 66334e9063743d14a309ab645d8c1b6b0fd95771 Mon Sep 17 00:00:00 2001 From: Arun Banala Date: Thu, 28 Nov 2019 16:45:10 +0000 Subject: SERVER-44617 $regexFind crash when one of the capture group doesn't match the input but pattern matches (cherry picked from commit 294ff7c1f6a3f79342aace87be524532d222da9f) --- jstests/aggregation/expressions/regex.js | 84 ++++++++++++++++++++++++++++++-- src/mongo/db/pipeline/expression.cpp | 42 ++++++++++++++-- 2 files changed, 117 insertions(+), 9 deletions(-) diff --git a/jstests/aggregation/expressions/regex.js b/jstests/aggregation/expressions/regex.js index bf128ec04cc..2dc400b0523 100644 --- a/jstests/aggregation/expressions/regex.js +++ b/jstests/aggregation/expressions/regex.js @@ -77,11 +77,13 @@ function testRegexAggException(inputObj, exceptionCode) { {"match": "mp", "idx": 2, "captures": ["mp", "p"]}, {"match": "mp", "idx": 10, "captures": ["mp", "p"]} ]); + // Regex in json syntax, with multiple captures and matches. testRegexFindAggForKey(0, {input: "$text", regex: /(m(p))/}, [ {"match": "mp", "idx": 2, "captures": ["mp", "p"]}, {"match": "mp", "idx": 10, "captures": ["mp", "p"]} ]); + // Verify no overlapping match sub-strings. assert.commandWorked(coll.insert({_id: 112, text: "aaaaa aaaa"})); testRegexFindAggForKey(112, {input: "$text", regex: /(aa)/}, [ @@ -94,6 +96,7 @@ function testRegexAggException(inputObj, exceptionCode) { {"match": "aaaa", "idx": 0, "captures": ["aa"]}, {"match": "aaaa", "idx": 6, "captures": ["aa"]} ]); + // Verify greedy match. testRegexFindAggForKey(112, {input: "$text", regex: /(a+)/}, [ {"match": "aaaaa", "idx": 0, "captures": ["aaaaa"]}, @@ -103,6 +106,7 @@ function testRegexAggException(inputObj, exceptionCode) { {"match": "aaaaa", "idx": 0, "captures": ["a"]}, {"match": "aaaa", "idx": 6, "captures": ["a"]}, ]); + // Verify lazy match. assert.commandWorked(coll.insert({_id: 113, text: "aaa aa"})); testRegexFindAggForKey(113, {input: "$text", regex: /(a+?)/}, [ @@ -152,6 +156,69 @@ function testRegexAggException(inputObj, exceptionCode) { {input: "$text", regex: /^(?:1|a)\,([0-9]+)/}, [{"match": "1,2", "idx": 0, "captures": ["2"]}]); + assert.commandWorked(coll.insert({_id: "capture_group", text: "ABCDXYZABCDXYZABCDXYZ"})); + + // Matching regex with a non-matching capture group. + testRegexFindAggForKey("capture_group", {input: "$text", regex: /((Z)|B)(CD)/}, [ + {"match": "BCD", "idx": 1, "captures": ["B", null, "CD"]}, + {"match": "BCD", "idx": 8, "captures": ["B", null, "CD"]}, + {"match": "BCD", "idx": 15, "captures": ["B", null, "CD"]} + ]); + + // Non-matching regex with a matching capture group. + testRegexFindAggForKey("capture_group", {input: "$text", regex: /EFG|(BC)E/}, []); + testRegexFindAggForKey("capture_group", {input: "$text", regex: /(BC)E/}, []); + + // Matching regex with zero matching capture groups because of optional quantifier. + testRegexFindAggForKey("capture_group", {input: "$text", regex: /BC(P(DE)?)?/}, [ + {"match": "BC", "idx": 1, "captures": [null, null]}, + {"match": "BC", "idx": 8, "captures": [null, null]}, + {"match": "BC", "idx": 15, "captures": [null, null]} + ]); + + // Matching regex with non-matching capture groups because of optional quantifier. + testRegexFindAggForKey("capture_group", {input: "$text", regex: /(BC)(P(DE)?)?/}, [ + {"match": "BC", "idx": 1, "captures": ["BC", null, null]}, + {"match": "BC", "idx": 8, "captures": ["BC", null, null]}, + {"match": "BC", "idx": 15, "captures": ["BC", null, null]} + ]); + testRegexFindAggForKey("capture_group", {input: "$text", regex: /(ABC)(D(XY)?)?Z/}, [ + {"match": "ABCDXYZ", "idx": 0, "captures": ["ABC", "DXY", "XY"]}, + {"match": "ABCDXYZ", "idx": 7, "captures": ["ABC", "DXY", "XY"]}, + {"match": "ABCDXYZ", "idx": 14, "captures": ["ABC", "DXY", "XY"]} + ]); + + // Greedy optional quantifier inside a lazy optional quantifier. + testRegexFindAggForKey("capture_group", {input: "$text", regex: /(ABC)(D(XY)?)??/}, [ + {"match": "ABC", "idx": 0, "captures": ["ABC", null, null]}, + {"match": "ABC", "idx": 7, "captures": ["ABC", null, null]}, + {"match": "ABC", "idx": 14, "captures": ["ABC", null, null]} + ]); + + // Lazy optional quantifier inside a greedy optional quantifier. + testRegexFindAggForKey("capture_group", {input: "$text", regex: /(ABC)(D(XY)??)?/}, [ + {"match": "ABCD", "idx": 0, "captures": ["ABC", "D", null]}, + {"match": "ABCD", "idx": 7, "captures": ["ABC", "D", null]}, + {"match": "ABCD", "idx": 14, "captures": ["ABC", "D", null]} + ]); + + // Everything is optional. + testRegexFindAggForKey("capture_group", {input: "$text", regex: /(ABC)?(XY)?/}, [ + {"match": "ABC", "idx": 0, "captures": ["ABC", null]}, + // Since everything is optional, every index position in the input is a match. + {"match": "", "idx": 3, "captures": [null, null]}, + {"match": "XY", "idx": 4, "captures": [null, "XY"]}, + {"match": "", "idx": 6, "captures": [null, null]}, + {"match": "ABC", "idx": 7, "captures": ["ABC", null]}, + {"match": "", "idx": 10, "captures": [null, null]}, + {"match": "XY", "idx": 11, "captures": [null, "XY"]}, + {"match": "", "idx": 13, "captures": [null, null]}, + {"match": "ABC", "idx": 14, "captures": ["ABC", null]}, + {"match": "", "idx": 17, "captures": [null, null]}, + {"match": "XY", "idx": 18, "captures": [null, "XY"]}, + {"match": "", "idx": 20, "captures": [null, null]}, + ]); + // Regex quantifier. assert.commandWorked(coll.insert({_id: 7, text: "abc12defgh345jklm"})); testRegexFindAggForKey( @@ -220,15 +287,23 @@ function testRegexAggException(inputObj, exceptionCode) { {input: "$text", regex: "$pattern"}, [{"match": "Text", "idx": 5, "captures": ["Te", "e"]}]); + assert.commandWorked(coll.insert({_id: "Empty input", text: ""})); + // Empty input matches empty regex. - testRegexFindAggForKey(0, {input: "", regex: ""}, [{"match": "", "idx": 0, "captures": []}]); - // Empty captures groups. + testRegexFindAggForKey( + "Empty input", {input: "$text", regex: ""}, [{"match": "", "idx": 0, "captures": []}]); + + // Empty capture groups. + testRegexFindAggForKey("Empty input", + {input: "$text", regex: "(missing)|()"}, + [{"match": "", "idx": 0, "captures": [null, ""]}]); testRegexFindAggForKey(0, {input: "bbbb", regex: "()"}, [ {"match": "", "idx": 0, "captures": [""]}, {"match": "", "idx": 1, "captures": [""]}, {"match": "", "idx": 2, "captures": [""]}, {"match": "", "idx": 3, "captures": [""]} ]); + // No matches. testRegexFindAggForKey(0, {input: "$text", regex: /foo/}, []); // Regex null. @@ -275,8 +350,9 @@ function testRegexAggException(inputObj, exceptionCode) { testRegexFindAggForKey( 3, {input: "$text", regex: /म/}, [{"match": "म", "idx": 0, "captures": []}]); // Unicode with capture group. - testRegexFindAggForKey( - 3, {input: "$text", regex: /(गो )/}, [{"match": "गो ", "idx": 3, "captures": ["गो "]}]); + testRegexFindAggForKey(3, + {input: "$text", regex: /(गो )|(missing)/}, + [{"match": "गो ", "idx": 3, "captures": ["गो ", null]}]); // Test that regexes support Unicode character properties. testRegexFindAggForKey(2, {input: "$text", regex: String.raw`\p{Hangul}`}, []); testRegexFindAggForKey(2, diff --git a/src/mongo/db/pipeline/expression.cpp b/src/mongo/db/pipeline/expression.cpp index 689dcf21e84..387ff0ab1cb 100644 --- a/src/mongo/db/pipeline/expression.cpp +++ b/src/mongo/db/pipeline/expression.cpp @@ -5862,13 +5862,14 @@ int ExpressionRegex::execute(RegexExecutionState* regexState) const { 0, // No need to overwrite the options set during pcre_compile. &(regexState->capturesBuffer.front()), regexState->capturesBuffer.size()); - // The 'execResult' will be (numCaptures + 1) if there is a match, -1 if there is no match, - // negative (other than -1) if there is an error during execution, and zero if capturesBuffer's - // capacity is not sufficient to hold all the results. The latter scenario should never occur. + // The 'execResult' will be -1 if there is no match, 0 < execResult <= (numCaptures + 1) + // depending on how many capture groups match, negative (other than -1) if there is an error + // during execution, and zero if capturesBuffer's capacity is not sufficient to hold all the + // results. The latter scenario should never occur. uassert(51156, str::stream() << "Error occurred while executing the regular expression in " << _opName << ". Result code: " << execResult, - execResult == -1 || execResult == (regexState->numCaptures + 1)); + execResult == -1 || (execResult > 0 && execResult <= (regexState->numCaptures + 1))); return execResult; } @@ -5884,9 +5885,35 @@ Value ExpressionRegex::nextMatch(RegexExecutionState* regexState) const { // calls. StringData input = *(regexState->input); + auto verifyBounds = [&input, this](auto startPos, auto limitPos, auto isCapture) { + // If a capture group was not matched, then the 'startPos' and 'limitPos' will both be -1. + // These bounds cannot occur for a match on the full string. + if (startPos == -1 || limitPos == -1) { + massert(31304, + str::stream() << "Unexpected error occurred while executing " << _opName + << ". startPos: " << startPos << ", limitPos: " << limitPos, + isCapture && startPos == -1 && limitPos == -1); + return; + } + + massert(31305, + str::stream() << "Unexpected error occurred while executing " << _opName + << ". startPos: " << startPos, + (startPos >= 0 && static_cast(startPos) <= input.size())); + massert(31306, + str::stream() << "Unexpected error occurred while executing " << _opName + << ". limitPos: " << limitPos, + (limitPos >= 0 && static_cast(limitPos) <= input.size())); + massert(31307, + str::stream() << "Unexpected error occurred while executing " << _opName + << ". startPos: " << startPos << ", limitPos: " << limitPos, + startPos <= limitPos); + }; + // The first and second entries of the 'capturesBuffer' will have the start and (end+1) indices // of the matched string, as byte offsets. '(limit - startIndex)' would be the length of the // captured string. + verifyBounds(regexState->capturesBuffer[0], regexState->capturesBuffer[1], false); const int matchStartByteIndex = regexState->capturesBuffer[0]; StringData matchedStr = input.substr(matchStartByteIndex, regexState->capturesBuffer[1] - matchStartByteIndex); @@ -5911,7 +5938,12 @@ Value ExpressionRegex::nextMatch(RegexExecutionState* regexState) const { for (int i = 0; i < regexState->numCaptures; ++i) { const int start = regexState->capturesBuffer[2 * (i + 1)]; const int limit = regexState->capturesBuffer[2 * (i + 1) + 1]; - captures.push_back(Value(input.substr(start, limit - start))); + verifyBounds(start, limit, true); + + // The 'start' and 'limit' will be set to -1, if the 'input' didn't match the current + // capture group. In this case we put a 'null' placeholder in place of the capture group. + captures.push_back(start == -1 && limit == -1 ? Value(BSONNULL) + : Value(input.substr(start, limit - start))); } MutableDocument match; -- cgit v1.2.1