diff options
-rw-r--r-- | jstests/aggregation/expressions/regex.js | 438 | ||||
-rw-r--r-- | jstests/aggregation/expressions/regexFind.js | 339 | ||||
-rw-r--r-- | src/mongo/db/pipeline/expression.cpp | 338 | ||||
-rw-r--r-- | src/mongo/db/pipeline/expression.h | 11 | ||||
-rw-r--r-- | src/mongo/db/pipeline/expression_test.cpp | 59 |
5 files changed, 728 insertions, 457 deletions
diff --git a/jstests/aggregation/expressions/regex.js b/jstests/aggregation/expressions/regex.js new file mode 100644 index 00000000000..31b04b54976 --- /dev/null +++ b/jstests/aggregation/expressions/regex.js @@ -0,0 +1,438 @@ +/* + * Tests for $regexFind and $regexFindAll aggregation expression. + */ +(function() { + 'use strict'; + load("jstests/aggregation/extras/utils.js"); // For assertErrorCode(). + const coll = db.regex_find_expr; + coll.drop(); + + function testRegex(expression, inputObj, expectedOutput) { + const result = + coll.aggregate([ + {"$project": {_id: 0, "matches": {[expression]: inputObj}}}, + {"$sort": {"matches": 1}} // Sort to ensure the documents are returned in a + // deterministic order for sharded clusters. + ]) + .toArray(); + assert.eq(result, expectedOutput); + } + function testRegexForKey(expression, key, inputObj, expectedMatchObj) { + const result = + coll.aggregate( + [{"$match": {"_id": key}}, {"$project": {"matches": {[expression]: inputObj}}}]) + .toArray(); + const expectedOutput = [{"_id": key, "matches": expectedMatchObj}]; + assert.eq(result, expectedOutput); + } + + /** + * This function validates the output against both $regexFind and $regexFindAll expressions. + */ + function testRegexFindAgg(inputObj, expectedOutputForFindAll) { + testRegex("$regexFindAll", inputObj, expectedOutputForFindAll); + + // For each of the output document, get first element from "matches" array. This will + // convert 'regexFindAll' output to 'regexFind' output. + const expectedOutputForFind = expectedOutputForFindAll.map( + (element) => ({matches: element.matches.length == 0 ? null : element.matches[0]})); + testRegex("$regexFind", inputObj, expectedOutputForFind); + } + + /** + * This function validates the output against both $regexFind and $regexFindAll expressions. + */ + function testRegexFindAggForKey(key, inputObj, expectedOutputForFindAll) { + testRegexForKey("$regexFindAll", key, inputObj, expectedOutputForFindAll); + const expectedOutputForFind = + expectedOutputForFindAll.length == 0 ? null : expectedOutputForFindAll[0]; + testRegexForKey("$regexFind", key, inputObj, expectedOutputForFind); + } + + /** + * This function validates the output against both $regexFind and $regexFindAll expressions. + */ + function testRegexAggException(inputObj, exceptionCode) { + assertErrorCode( + coll, [{"$project": {"matches": {"$regexFindAll": inputObj}}}], exceptionCode); + assertErrorCode(coll, [{"$project": {"matches": {"$regexFind": inputObj}}}], exceptionCode); + } + + (function testWithSingleMatch() { + // Regex in string notation, find with multiple captures and matches. + assert.commandWorked(coll.insert({_id: 0, text: "Simple Example "})); + testRegexFindAggForKey(0, {input: "$text", regex: "(m(p))"}, [ + {"match": "mp", "idx": 2, "captures": ["mp", "p"]}, + {"match": "mp", "idx": 10, "captures": ["mp", "p"]} + ]); + // Regex in json syntax, with multiple captures and matches. + testRegexFindAggForKey(0, {input: "$text", regex: /(m(p))/}, [ + {"match": "mp", "idx": 2, "captures": ["mp", "p"]}, + {"match": "mp", "idx": 10, "captures": ["mp", "p"]} + ]); + // Verify no overlapping match sub-strings. + assert.commandWorked(coll.insert({_id: 112, text: "aaaaa aaaa"})); + testRegexFindAggForKey(112, {input: "$text", regex: /(aa)/}, [ + {"match": "aa", "idx": 0, "captures": ["aa"]}, + {"match": "aa", "idx": 2, "captures": ["aa"]}, + {"match": "aa", "idx": 6, "captures": ["aa"]}, + {"match": "aa", "idx": 8, "captures": ["aa"]} + ]); + testRegexFindAggForKey(112, {input: "$text", regex: /(aa)+/}, [ + {"match": "aaaa", "idx": 0, "captures": ["aa"]}, + {"match": "aaaa", "idx": 6, "captures": ["aa"]} + ]); + // Verify greedy match. + testRegexFindAggForKey(112, {input: "$text", regex: /(a+)/}, [ + {"match": "aaaaa", "idx": 0, "captures": ["aaaaa"]}, + {"match": "aaaa", "idx": 6, "captures": ["aaaa"]}, + ]); + testRegexFindAggForKey(112, {input: "$text", regex: /(a)+/}, [ + {"match": "aaaaa", "idx": 0, "captures": ["a"]}, + {"match": "aaaa", "idx": 6, "captures": ["a"]}, + ]); + // Verify lazy match. + assert.commandWorked(coll.insert({_id: 113, text: "aaa aa"})); + testRegexFindAggForKey(113, {input: "$text", regex: /(a+?)/}, [ + {"match": "a", "idx": 0, "captures": ["a"]}, + {"match": "a", "idx": 1, "captures": ["a"]}, + {"match": "a", "idx": 2, "captures": ["a"]}, + {"match": "a", "idx": 4, "captures": ["a"]}, + {"match": "a", "idx": 5, "captures": ["a"]} + ]); + testRegexFindAggForKey(113, {input: "$text", regex: /(a*?)/}, [ + {"match": "", "idx": 0, "captures": [""]}, + {"match": "", "idx": 1, "captures": [""]}, + {"match": "", "idx": 2, "captures": [""]}, + {"match": "", "idx": 3, "captures": [""]}, + {"match": "", "idx": 4, "captures": [""]}, + {"match": "", "idx": 5, "captures": [""]} + ]); + + // Regex string groups within group. + testRegexFindAggForKey( + 0, + {input: "$text", regex: "((S)(i)(m)(p)(l)(e))"}, + [{"match": "Simple", "idx": 0, "captures": ["Simple", "S", "i", "m", "p", "l", "e"]}]); + testRegexFindAggForKey( + 0, + {input: "$text", regex: "(S)(i)(m)((p)(l)(e))"}, + [{"match": "Simple", "idx": 0, "captures": ["S", "i", "m", "ple", "p", "l", "e"]}]); + + // Regex email pattern. + assert.commandWorked( + coll.insert({_id: 1, text: "Some field text with email mongo@mongodb.com"})); + testRegexFindAggForKey( + 1, + {input: "$text", regex: "([a-zA-Z0-9._-]+)@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+"}, + [{"match": "mongo@mongodb.com", "idx": 27, "captures": ["mongo"]}]); + + // Regex digits. + assert.commandWorked(coll.insert({_id: 5, text: "Text with 02 digits"})); + testRegexFindAggForKey( + 5, {input: "$text", regex: /[0-9]+/}, [{"match": "02", "idx": 10, "captures": []}]); + testRegexFindAggForKey( + 5, {input: "$text", regex: /(\d+)/}, [{"match": "02", "idx": 10, "captures": ["02"]}]); + + // Regex a non-capture group. + assert.commandWorked(coll.insert({_id: 6, text: "1,2,3,4,5,6,7,8,9,10"})); + testRegexFindAggForKey(6, + {input: "$text", regex: /^(?:1|a)\,([0-9]+)/}, + [{"match": "1,2", "idx": 0, "captures": ["2"]}]); + + // Regex quantifier. + assert.commandWorked(coll.insert({_id: 7, text: "abc12defgh345jklm"})); + testRegexFindAggForKey( + 7, {input: "$text", regex: /[0-9]{3}/}, [{"match": "345", "idx": 10, "captures": []}]); + + // Regex case insensitive option. + assert.commandWorked(coll.insert({_id: 8, text: "This Is Camel Case"})); + testRegexFindAggForKey(8, {input: "$text", regex: /camel/}, []); + testRegexFindAggForKey( + 8, {input: "$text", regex: /camel/i}, [{"match": "Camel", "idx": 8, "captures": []}]); + testRegexFindAggForKey(8, + {input: "$text", regex: /camel/, options: "i"}, + [{"match": "Camel", "idx": 8, "captures": []}]); + testRegexFindAggForKey(8, + {input: "$text", regex: "camel", options: "i"}, + [{"match": "Camel", "idx": 8, "captures": []}]); + + // Regex multi line option. + assert.commandWorked(coll.insert({_id: 9, text: "Foo line1\nFoo line2\nFoo line3"})); + // Verify no match with options flag off. + testRegexFindAggForKey(9, {input: "$text", regex: /^Foo line\d$/}, []); + // Verify match when flag is on. + testRegexFindAggForKey(9, {input: "$text", regex: /(^Foo line\d$)/m}, [ + {"match": "Foo line1", "idx": 0, "captures": ["Foo line1"]}, + {"match": "Foo line2", "idx": 10, "captures": ["Foo line2"]}, + {"match": "Foo line3", "idx": 20, "captures": ["Foo line3"]} + ]); + + // Regex single line option. + testRegexFindAggForKey(9, {input: "$text", regex: "Foo.*line"}, [ + {"match": "Foo line", "idx": 0, "captures": []}, + {"match": "Foo line", "idx": 10, "captures": []}, + {"match": "Foo line", "idx": 20, "captures": []} + ]); + testRegexFindAggForKey( + 9, + {input: "$text", regex: "Foo.*line", options: "s"}, + [{"match": "Foo line1\nFoo line2\nFoo line", "idx": 0, "captures": []}]); + + // Regex extended option. + testRegexFindAggForKey(9, {input: "$text", regex: "F o o # a comment"}, []); + testRegexFindAggForKey(9, {input: "$text", regex: "F o o # a comment", options: "x"}, [ + {"match": "Foo", "idx": 0, "captures": []}, + {"match": "Foo", "idx": 10, "captures": []}, + {"match": "Foo", "idx": 20, "captures": []} + ]); + testRegexFindAggForKey( + 9, {input: "$text", regex: "F o o # a comment \n\n# ignored", options: "x"}, [ + {"match": "Foo", "idx": 0, "captures": []}, + {"match": "Foo", "idx": 10, "captures": []}, + {"match": "Foo", "idx": 20, "captures": []} + ]); + testRegexFindAggForKey(9, {input: "$text", regex: "(F o o) # a comment", options: "x"}, [ + {"match": "Foo", "idx": 0, "captures": ["Foo"]}, + {"match": "Foo", "idx": 10, "captures": ["Foo"]}, + {"match": "Foo", "idx": 20, "captures": ["Foo"]} + ]); + + // Regex pattern from a document field value. + assert.commandWorked( + coll.insert({_id: 10, text: "Simple Value Example", pattern: "(m(p))"})); + testRegexFindAggForKey(10, {input: "$text", regex: "$pattern"}, [ + {"match": "mp", "idx": 2, "captures": ["mp", "p"]}, + {"match": "mp", "idx": 16, "captures": ["mp", "p"]} + ]); + assert.commandWorked(coll.insert({_id: 11, text: "OtherText", pattern: /(T(e))xt$/})); + testRegexFindAggForKey(11, + {input: "$text", regex: "$pattern"}, + [{"match": "Text", "idx": 5, "captures": ["Te", "e"]}]); + + // Empty input matches empty regex. + testRegexFindAggForKey( + 0, {input: "", regex: ""}, [{"match": "", "idx": 0, "captures": []}]); + // Empty captures groups. + testRegexFindAggForKey(0, {input: "bbbb", regex: "()"}, [ + {"match": "", "idx": 0, "captures": [""]}, + {"match": "", "idx": 1, "captures": [""]}, + {"match": "", "idx": 2, "captures": [""]}, + {"match": "", "idx": 3, "captures": [""]} + ]); + // No matches. + testRegexFindAggForKey(0, {input: "$text", regex: /foo/}, []); + // Regex null. + testRegexFindAggForKey(0, {input: "$text", regex: null}, []); + // Regex not present. + testRegexFindAggForKey(0, {input: "$text"}, []); + // Input not present. + testRegexFindAggForKey(0, {regex: /valid/}, []); + // Input null. + testRegexFindAggForKey(0, {input: null, regex: /valid/}, []); + // Empty object. + testRegexFindAggForKey(0, {}, []); + })(); + + (function testWithStartOptions() { + coll.drop(); + assert.commandWorked(coll.insert({_id: 2, text: "cafétéria"})); + assert.commandWorked(coll.insert({_id: 3, text: "ab\ncd"})); + + // LIMIT_MATCH option to limit the number of comparisons PCRE does internally. + testRegexFindAggForKey(2, {input: "$text", regex: "(*LIMIT_MATCH=1)fé"}, []); + testRegexFindAggForKey(2, + {input: "$text", regex: "(*LIMIT_MATCH=3)(fé)"}, + [{"match": "fé", "idx": 2, "captures": ["fé"]}]); + + // (*LF) would change the feed system to UNIX like and (*CR) to windows like. So '\n' would + // match '.' with CR but not LF. + testRegexFindAggForKey(3, {input: "$text", regex: "(*LF)ab.cd"}, []); + testRegexFindAggForKey(3, + {input: "$text", regex: "(*CR)ab.cd"}, + [{"match": "ab\ncd", "idx": 0, "captures": []}]); + + // Multiple start options. + testRegexFindAggForKey(2, + {input: "$text", regex: String.raw `(*LIMIT_MATCH=5)(*UCP)^(\w+)`}, + [{"match": "cafétéria", "idx": 0, "captures": ["cafétéria"]}]); + testRegexFindAggForKey( + 2, {input: "$text", regex: String.raw `(*LIMIT_MATCH=1)(*UCP)^(\w+)`}, []); + })(); + + (function testWithUnicodeData() { + coll.drop(); + // Unicode index counting. + assert.commandWorked(coll.insert({_id: 2, text: "cafétéria"})); + assert.commandWorked(coll.insert({_id: 3, text: "मा०गो डीबि"})); + testRegexFindAggForKey( + 2, {input: "$text", regex: "té"}, [{"match": "té", "idx": 4, "captures": []}]); + testRegexFindAggForKey( + 3, {input: "$text", regex: /म/}, [{"match": "म", "idx": 0, "captures": []}]); + // Unicode with capture group. + testRegexFindAggForKey(3, + {input: "$text", regex: /(गो )/}, + [{"match": "गो ", "idx": 3, "captures": ["गो "]}]); + // Test that regexes support Unicode character properties. + testRegexFindAggForKey(2, {input: "$text", regex: String.raw `\p{Hangul}`}, []); + testRegexFindAggForKey(2, + {input: "$text", regex: String.raw `\p{Latin}+$`}, + [{"match": "cafétéria", "idx": 0, "captures": []}]); + // Test that the (*UTF) and (*UTF8) options are accepted for unicode characters. + assert.commandWorked(coll.insert({_id: 12, text: "༢༣༤༤༤༥12༥A"})); + testRegexFindAggForKey(12, {input: "$text", regex: "(*UTF8)༤"}, [ + {"match": "༤", "idx": 2, "captures": []}, + {"match": "༤", "idx": 3, "captures": []}, + {"match": "༤", "idx": 4, "captures": []} + ]); + testRegexFindAggForKey(12, {input: "$text", regex: "(*UTF)༤"}, [ + {"match": "༤", "idx": 2, "captures": []}, + {"match": "༤", "idx": 3, "captures": []}, + {"match": "༤", "idx": 4, "captures": []} + ]); + // For ASCII characters. + assert.commandWorked(coll.insert({_id: 4, text: "123444"})); + testRegexFindAggForKey(4, + {input: "$text", regex: "(*UTF8)(44)"}, + [{"match": "44", "idx": 3, "captures": ["44"]}]); + testRegexFindAggForKey(4, + {input: "$text", regex: "(*UTF)(44)"}, + [{"match": "44", "idx": 3, "captures": ["44"]}]); + + // When the (*UCP) option is specified, Unicode "word" characters are included in the '\w' + // character type. + testRegexFindAggForKey(12, + {input: "$text", regex: String.raw `(*UCP)^(\w+)`}, + [{"match": "༢༣༤༤༤༥12༥A", "idx": 0, "captures": ["༢༣༤༤༤༥12༥A"]}]); + // When the (*UCP) option is specified, [:digit:] becomes \p{N} and matches all Unicode + // decimal digit characters. + testRegexFindAggForKey(12, + {input: "$text", regex: "(*UCP)^[[:digit:]]+"}, + [{"match": "༢༣༤༤༤༥12༥", "idx": 0, "captures": []}]); + testRegexFindAggForKey(12, {input: "$text", regex: "(*UCP)[[:digit:]]+$"}, []); + // When the (*UCP) option is specified, [:alpha:] becomes \p{L} and matches all Unicode + // alphabetic characters. + assert.commandWorked(coll.insert({_id: 13, text: "박정수AB"})); + testRegexFindAggForKey(13, + {input: "$text", regex: String.raw `(*UCP)^[[:alpha:]]+`}, + [{"match": "박정수AB", "idx": 0, "captures": []}]); + + // No match when options are not set. + testRegexFindAggForKey(12, {input: "$text", regex: String.raw `^(\w+)`}, []); + testRegexFindAggForKey(12, {input: "$text", regex: "^[[:digit:]]"}, []); + testRegexFindAggForKey(2, {input: "$text", regex: "^[[:alpha:]]+$"}, []); + })(); + + (function testErrors() { + coll.drop(); + assert.commandWorked(coll.insert({text: "string"})); + // Null object. + testRegexAggException(null, 51103); + // Incorrect object parameter. + testRegexAggException("incorrect type", 51103); + // Test malformed regex. + testRegexAggException({input: "$text", regex: "[0-9"}, 51111); + testRegexAggException({regex: "[a-c"}, 51111); + // Malformed regex because start options not at the beginning. + testRegexAggException({input: "$text", regex: "^(*UCP)[[:alpha:]]+$"}, 51111); + testRegexAggException({input: "$text", regex: "((*UCP)[[:alpha:]]+$)"}, 51111); + // At least one of the 'input' field is not string. + assert.commandWorked(coll.insert({a: "string"})); + assert.commandWorked(coll.insert({a: {b: "object"}})); + testRegexAggException({input: "$a", regex: "valid"}, 51104); + testRegexAggException({input: "$a"}, 51104); + // 'regex' field is not string or regex. + testRegexAggException({input: "$text", regex: ["incorrect"]}, 51105); + // 'options' field is not string. + testRegexAggException({input: "$text", regex: "valid", options: 123}, 51106); + // Incorrect 'options' flag. + testRegexAggException({input: "$text", regex: "valid", options: 'a'}, 51108); + // 'options' are case-sensitive. + testRegexAggException({input: "$text", regex: "valid", options: "I"}, 51108); + // Options specified in both 'regex' and 'options'. + testRegexAggException({input: "$text", regex: /(m(p))/i, options: "i"}, 51107); + testRegexAggException({input: "$text", regex: /(m(p))/i, options: "x"}, 51107); + testRegexAggException({input: "$text", regex: /(m(p))/m, options: ""}, 51107); + // 'regex' as string with null characters. + testRegexAggException({input: "$text", regex: "sasd\0", options: "i"}, 51109); + testRegexAggException({regex: "sa\x00sd", options: "i"}, 51109); + // 'options' as string with null characters. + testRegexAggException({input: "$text", regex: /(m(p))/, options: "i\0"}, 51110); + testRegexAggException({input: "$text", options: "i\x00"}, 51110); + })(); + + (function testMultipleMatches() { + coll.drop(); + assert.commandWorked(coll.insert({a: "string1string2"})); + assert.commandWorked(coll.insert({a: "string3 string4"})); + // Both match. + testRegexFindAgg({input: "$a", regex: "(str.*?[0-9])"}, [ + { + "matches": [ + {"match": "string1", "idx": 0, "captures": ["string1"]}, + {"match": "string2", "idx": 7, "captures": ["string2"]} + ] + }, + { + "matches": [ + {"match": "string3", "idx": 0, "captures": ["string3"]}, + {"match": "string4", "idx": 8, "captures": ["string4"]} + ] + } + ]); + // Only one match. + testRegexFindAgg({input: "$a", regex: "(^.*[0-2]$)"}, [ + {"matches": []}, + {"matches": [{"match": "string1string2", "idx": 0, "captures": ["string1string2"]}]} + + ]); + // None match. + testRegexFindAgg({input: "$a", regex: "(^.*[5-9]$)"}, [{"matches": []}, {"matches": []}]); + })(); + + (function testInsideCondOperator() { + coll.drop(); + assert.commandWorked( + coll.insert({_id: 0, level: "Public Knowledge", info: "Company Name"})); + assert.commandWorked( + coll.insert({_id: 1, level: "Private Information", info: "Company Secret"})); + const expectedResults = + [{"_id": 0, "information": "Company Name"}, {"_id": 1, "information": "REDACTED"}]; + // For $regexFindAll. + let result = + coll.aggregate([{ + "$project": { + "information": { + "$cond": [ + { + "$eq": + [{"$regexFindAll": {input: "$level", regex: /public/i}}, []] + }, + "REDACTED", + "$info" + ] + } + } + }]) + .toArray(); + assert.eq(result, expectedResults); + // For $regexFind. + result = + coll.aggregate([{ + "$project": { + "information": { + "$cond": [ + { + "$eq": + [{"$regexFind": {input: "$level", regex: /public/i}}, null] + }, + "REDACTED", + "$info" + ] + } + } + }]) + .toArray(); + assert.eq(result, expectedResults); + })(); +}()); diff --git a/jstests/aggregation/expressions/regexFind.js b/jstests/aggregation/expressions/regexFind.js deleted file mode 100644 index e5743a453c5..00000000000 --- a/jstests/aggregation/expressions/regexFind.js +++ /dev/null @@ -1,339 +0,0 @@ -/* - * Tests for $regexFind aggregation expression. - */ -(function() { - 'use strict'; - - load("jstests/aggregation/extras/utils.js"); // For assertErrorCode(). - - const coll = db.regex_find_expr; - coll.drop(); - - function testRegexFindAgg(regexFind, expectedOutput) { - const result = - coll.aggregate([ - {"$project": {_id: 0, "matches": {"$regexFind": regexFind}}}, - {"$sort": {"matches": 1}} // Ensure that the documents are returned in a - // deterministic order for sharded clusters. - ]) - .toArray(); - assert.eq(result, expectedOutput); - } - function testRegexFindAggForKey(key, regexFind, expectedMatchObj) { - const result = coll.aggregate([ - {"$match": {"_id": key}}, - {"$project": {"matches": {"$regexFind": regexFind}}} - ]) - .toArray(); - const expectedOutput = [{"_id": key, "matches": expectedMatchObj}]; - assert.eq(result, expectedOutput); - } - function testRegexFindAggException(regexFind, exceptionCode) { - assertErrorCode( - coll, [{"$project": {"matches": {"$regexFind": regexFind}}}], exceptionCode); - } - - (function testWithSingleMatch() { - // Regex in string notation, find with multiple captures. - assert.commandWorked(coll.insert({_id: 0, text: "Simple Example"})); - testRegexFindAggForKey(0, - {input: "$text", regex: "(m(p))"}, - {"match": "mp", "idx": 2, "captures": ["mp", "p"]}); - // Regex in json syntax, with multiple captures. - testRegexFindAggForKey(0, {input: "$text", regex: /(S)(i)(m)(p)(l)(e) (Ex)(am)(p)(le)/}, { - "match": "Simple Example", - "idx": 0, - "captures": ["S", "i", "m", "p", "l", "e", "Ex", "am", "p", "le"] - }); - - // Regex string groups within group. - testRegexFindAggForKey( - 0, - {input: "$text", regex: "((S)(i)(m)(p)(l)(e))"}, - {"match": "Simple", "idx": 0, "captures": ["Simple", "S", "i", "m", "p", "l", "e"]}); - testRegexFindAggForKey( - 0, - {input: "$text", regex: "(S)(i)(m)((p)(l)(e))"}, - {"match": "Simple", "idx": 0, "captures": ["S", "i", "m", "ple", "p", "l", "e"]}); - - // Regex email pattern. - assert.commandWorked( - coll.insert({_id: 1, text: "Some field text with email mongo@mongodb.com"})); - testRegexFindAggForKey( - 1, - {input: "$text", regex: "([a-zA-Z0-9._-]+)@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+"}, - {"match": "mongo@mongodb.com", "idx": 27, "captures": ["mongo"]}); - - // Regex digits. - assert.commandWorked(coll.insert({_id: 5, text: "Text with 02 digits"})); - testRegexFindAggForKey( - 5, {input: "$text", regex: /[0-9]+/}, {"match": "02", "idx": 10, "captures": []}); - testRegexFindAggForKey( - 5, {input: "$text", regex: /(\d+)/}, {"match": "02", "idx": 10, "captures": ["02"]}); - - // Regex a non-capture group. - assert.commandWorked(coll.insert({_id: 6, text: "1,2,3,4,5,6,7,8,9,10"})); - testRegexFindAggForKey(6, - {input: "$text", regex: /^(?:1|a)\,([0-9]+)/}, - {"match": "1,2", "idx": 0, "captures": ["2"]}); - - // Regex quantifier. - assert.commandWorked(coll.insert({_id: 7, text: "abc12defgh345jklm"})); - testRegexFindAggForKey( - 7, {input: "$text", regex: /[0-9]{3}/}, {"match": "345", "idx": 10, "captures": []}); - - // Regex case insensitive option. - assert.commandWorked(coll.insert({_id: 8, text: "This Is Camel Case"})); - testRegexFindAggForKey(8, {input: "$text", regex: /camel/}, null); - testRegexFindAggForKey( - 8, {input: "$text", regex: /camel/i}, {"match": "Camel", "idx": 8, "captures": []}); - testRegexFindAggForKey(8, - {input: "$text", regex: /camel/, options: "i"}, - {"match": "Camel", "idx": 8, "captures": []}); - testRegexFindAggForKey(8, - {input: "$text", regex: "camel", options: "i"}, - {"match": "Camel", "idx": 8, "captures": []}); - - // Regex multi line option. - assert.commandWorked(coll.insert({_id: 9, text: "Foo line1\nFoo line2\nFoo line3"})); - // Verify no match with options flag off. - testRegexFindAggForKey(9, {input: "$text", regex: /^Foo line\d$/}, null); - // Verify match when flag is on. - testRegexFindAggForKey(9, - {input: "$text", regex: /^Foo line\d$/m}, - {"match": "Foo line1", "idx": 0, "captures": []}); - - // Regex single line option. - testRegexFindAggForKey(9, - {input: "$text", regex: "Foo.*line"}, - {"match": "Foo line", "idx": 0, "captures": []}); - testRegexFindAggForKey( - 9, - {input: "$text", regex: "Foo.*line", options: "s"}, - {"match": "Foo line1\nFoo line2\nFoo line", "idx": 0, "captures": []}); - - // Regex extended option. - testRegexFindAggForKey(9, {input: "$text", regex: "F o o # a comment"}, null); - testRegexFindAggForKey(9, - {input: "$text", regex: "F o o # a comment", options: "x"}, - {"match": "Foo", "idx": 0, "captures": []}); - testRegexFindAggForKey( - 9, - {input: "$text", regex: "F o o # a comment \n\n# ignored", options: "x"}, - {"match": "Foo", "idx": 0, "captures": []}); - testRegexFindAggForKey(9, - {input: "$text", regex: "(F o o) # a comment", options: "x"}, - {"match": "Foo", "idx": 0, "captures": ["Foo"]}); - - // Regex pattern from a document field value. - assert.commandWorked(coll.insert({_id: 10, text: "Simple Value", pattern: "(m(p))"})); - testRegexFindAggForKey(10, - {input: "$text", regex: "$pattern"}, - {"match": "mp", "idx": 2, "captures": ["mp", "p"]}); - assert.commandWorked(coll.insert({_id: 11, text: "OtherText", pattern: /(T(e))xt$/})); - testRegexFindAggForKey(11, - {input: "$text", regex: "$pattern"}, - {"match": "Text", "idx": 5, "captures": ["Te", "e"]}); - - // 'regex' as object with null characters. - assert.commandWorked(coll.insert({_id: 12, text: "Null\0 charac\0ters"})); - testRegexFindAggForKey(12, {input: "$text", regex: /((Null)(\0))( )(charac\0t)/}, { - "match": "Null\0 charac\0t", - "idx": 0, - "captures": ["Null\0", "Null", "\0", " ", "charac\0t"] - }); - testRegexFindAggForKey( - 12, - {input: "$text", regex: /(\x00)( )(charac\x00t)/}, - {"match": "\0 charac\x00t", "idx": 4, "captures": ["\x00", " ", "charac\0t"]}); - // 'regex' as string with escaped null characters. - testRegexFindAggForKey(12, - {input: "$text", regex: "l\\0 charac\\0ter.*$"}, - {"match": "l\0 charac\0ters", "idx": 3, "captures": []}); - // No match with null characters in input. - testRegexFindAggForKey(12, {input: "$text", regex: /Null c/}, null); - // No match with null characters in regex. - testRegexFindAggForKey(12, {input: "$text", regex: /Nul\0l/}, null); - - // No matches. - testRegexFindAggForKey(0, {input: "$text", regex: /foo/}, null); - // Regex null. - testRegexFindAggForKey(0, {input: "$text", regex: null}, null); - // Regex not present. - testRegexFindAggForKey(0, {input: "$text"}, null); - // Input not present. - testRegexFindAggForKey(0, {regex: /valid/}, null); - // Input null. - testRegexFindAggForKey(0, {input: null, regex: /valid/}, null); - // Empty object. - testRegexFindAggForKey(0, {}, null); - })(); - - (function testWithStartOptions() { - coll.drop(); - assert.commandWorked(coll.insert({_id: 2, text: "cafétéria"})); - assert.commandWorked(coll.insert({_id: 3, text: "ab\ncd"})); - - // LIMIT_MATCH option to limit the number of comparisons PCRE does internally. - testRegexFindAggForKey(2, {input: "$text", regex: "(*LIMIT_MATCH=1)fé"}, null); - testRegexFindAggForKey(2, - {input: "$text", regex: "(*LIMIT_MATCH=3)(fé)"}, - {"match": "fé", "idx": 2, "captures": ["fé"]}); - - // (*LF) will change the feed system to UNIX like and (*CR) to windows like. So '\n' would - // match '.' with CR but not LF. - testRegexFindAggForKey(3, {input: "$text", regex: "(*LF)ab.cd"}, null); - testRegexFindAggForKey(3, - {input: "$text", regex: "(*CR)ab.cd"}, - {"match": "ab\ncd", "idx": 0, "captures": []}); - - // Multiple start options. - testRegexFindAggForKey(2, - {input: "$text", regex: String.raw `(*LIMIT_MATCH=5)(*UCP)^(\w+)`}, - {"match": "cafétéria", "idx": 0, "captures": ["cafétéria"]}); - testRegexFindAggForKey( - 2, {input: "$text", regex: String.raw `(*LIMIT_MATCH=1)(*UCP)^(\w+)`}, null); - })(); - - (function testWithUnicodeData() { - coll.drop(); - // Unicode index counting. - assert.commandWorked(coll.insert({_id: 2, text: "cafétéria"})); - assert.commandWorked(coll.insert({_id: 3, text: "मा०गो डीबि"})); - testRegexFindAggForKey( - 2, {input: "$text", regex: "té"}, {"match": "té", "idx": 4, "captures": []}); - testRegexFindAggForKey( - 3, {input: "$text", regex: /म/}, {"match": "म", "idx": 0, "captures": []}); - // Unicode with capture group. - testRegexFindAggForKey(3, - {input: "$text", regex: /(गो )/}, - {"match": "गो ", "idx": 3, "captures": ["गो "]}); - // Test that regexes support Unicode character properties. - testRegexFindAggForKey(2, {input: "$text", regex: String.raw `\p{Hangul}`}, null); - testRegexFindAggForKey(2, - {input: "$text", regex: String.raw `\p{Latin}+$`}, - {"match": "cafétéria", "idx": 0, "captures": []}); - // Test that the (*UTF) and (*UTF8) options are accepted for unicode characters. - assert.commandWorked(coll.insert({_id: 12, text: "༢༣༤༤༤༥12༥A"})); - testRegexFindAggForKey( - 12, {input: "$text", regex: "(*UTF8)༤"}, {"match": "༤", "idx": 2, "captures": []}); - testRegexFindAggForKey( - 12, {input: "$text", regex: "(*UTF)༤"}, {"match": "༤", "idx": 2, "captures": []}); - // For ASCII characters. - assert.commandWorked(coll.insert({_id: 4, text: "123444"})); - testRegexFindAggForKey(4, - {input: "$text", regex: "(*UTF8)(44)"}, - {"match": "44", "idx": 3, "captures": ["44"]}); - testRegexFindAggForKey(4, - {input: "$text", regex: "(*UTF)(44)"}, - {"match": "44", "idx": 3, "captures": ["44"]}); - - // When the (*UCP) option is specified, Unicode "word" characters are included in the '\w' - // character type. - testRegexFindAggForKey(12, - {input: "$text", regex: String.raw `(*UCP)^(\w+)`}, - {"match": "༢༣༤༤༤༥12༥A", "idx": 0, "captures": ["༢༣༤༤༤༥12༥A"]}); - // When the (*UCP) option is specified, [:digit:] becomes \p{N} and matches all Unicode - // decimal digit characters. - testRegexFindAggForKey(12, - {input: "$text", regex: "(*UCP)^[[:digit:]]+"}, - {"match": "༢༣༤༤༤༥12༥", "idx": 0, "captures": []}); - testRegexFindAggForKey(12, {input: "$text", regex: "(*UCP)[[:digit:]]+$"}, null); - // When the (*UCP) option is specified, [:alpha:] becomes \p{L} and matches all Unicode - // alphabetic characters. - assert.commandWorked(coll.insert({_id: 13, text: "박정수AB"})); - testRegexFindAggForKey(13, - {input: "$text", regex: String.raw `(*UCP)^[[:alpha:]]+`}, - {"match": "박정수AB", "idx": 0, "captures": []}); - - // No match when options are not set. - testRegexFindAggForKey(12, {input: "$text", regex: String.raw `^(\w+)`}, null); - testRegexFindAggForKey(12, {input: "$text", regex: "^[[:digit:]]"}, null); - testRegexFindAggForKey(2, {input: "$text", regex: "^[[:alpha:]]+$"}, null); - })(); - - (function testErrors() { - coll.drop(); - assert.commandWorked(coll.insert({text: "string"})); - // Null object. - testRegexFindAggException(null, 51103); - // Incorrect object parameter. - testRegexFindAggException("incorrect type", 51103); - // Test malformed regex. - testRegexFindAggException({input: "$text", regex: "[0-9"}, 51111); - // Malformed regex because start options not at the beginning. - testRegexFindAggException({input: "$text", regex: "^(*UCP)[[:alpha:]]+$"}, 51111); - testRegexFindAggException({input: "$text", regex: "((*UCP)[[:alpha:]]+$)"}, 51111); - // At least one of the 'input' field is not string. - assert.commandWorked(coll.insert({a: "string"})); - assert.commandWorked(coll.insert({a: {b: "object"}})); - testRegexFindAggException({input: "$a", regex: "valid"}, 51104); - // 'regex' field is not string or regex. - testRegexFindAggException({input: "$text", regex: ["incorrect"]}, 51105); - // 'options' field is not string. - testRegexFindAggException({input: "$text", regex: "valid", options: 123}, 51106); - // Incorrect 'options' flag. - testRegexFindAggException({input: "$text", regex: "valid", options: 'a'}, 51108); - // 'options' are case-sensitive. - testRegexFindAggException({input: "$text", regex: "valid", options: "I"}, 51108); - // Options specified in both 'regex' and 'options'. - testRegexFindAggException({input: "$text", regex: /(m(p))/i, options: "i"}, 51107); - testRegexFindAggException({input: "$text", regex: /(m(p))/i, options: "x"}, 51107); - testRegexFindAggException({input: "$text", regex: /(m(p))/m, options: ""}, 51107); - // 'regex' as string with null characters. - testRegexFindAggException({input: "$text", regex: "sasd\0", options: "i"}, 51109); - testRegexFindAggException({input: "$text", regex: "sa\x00sd", options: "i"}, 51109); - // 'options' as string with null characters. - testRegexFindAggException({input: "$text", regex: /(m(p))/, options: "i\0"}, 51110); - testRegexFindAggException({input: "$text", regex: /(m(p))/, options: "i\x00"}, 51110); - - })(); - - (function testMultipleMatches() { - coll.drop(); - assert.commandWorked(coll.insert({a: "string1"})); - assert.commandWorked(coll.insert({a: "string2"})); - // Both match. - testRegexFindAgg({input: "$a", regex: "(^str.*)"}, [ - {"matches": {"match": "string1", "idx": 0, "captures": ["string1"]}}, - {"matches": {"match": "string2", "idx": 0, "captures": ["string2"]}} - ]); - // Only one match. - testRegexFindAgg({input: "$a", regex: "(^.*[0-1]$)"}, [ - {"matches": null}, - {"matches": {"match": "string1", "idx": 0, "captures": ["string1"]}} - ]); - // None match. - testRegexFindAgg({input: "$a", regex: "(^.*[3-9]$)"}, - [{"matches": null}, {"matches": null}]); - })(); - - (function testInsideCondOperator() { - coll.drop(); - assert.commandWorked( - coll.insert({_id: 0, level: "Public Knowledge", info: "Company Name"})); - assert.commandWorked( - coll.insert({_id: 1, level: "Private Information", info: "Company Secret"})); - - const result = - coll.aggregate([{ - "$project": { - "information": { - "$cond": [ - { - "$eq": - [{"$regexFind": {input: "$level", regex: /public/i}}, null] - }, - "REDACTED", - "$info" - ] - } - } - }]) - .toArray(); - assert.eq(result, [ - {"_id": 0, "information": "Company Name"}, - {"_id": 1, "information": "REDACTED"}, - ]); - })(); -}()); diff --git a/src/mongo/db/pipeline/expression.cpp b/src/mongo/db/pipeline/expression.cpp index d0d54cd7e14..f2bd565989e 100644 --- a/src/mongo/db/pipeline/expression.cpp +++ b/src/mongo/db/pipeline/expression.cpp @@ -5661,138 +5661,248 @@ Value ExpressionConvert::performConversion(BSONType targetType, Value inputValue namespace { -Value generateRegexCapturesAndMatches(StringData pattern, - const int numCaptures, - const pcrecpp::RE_Options& options, - StringData input, - int startBytePos, - int startCodePointPos) { - - const auto pcreOptions = options.all_options(); - // The first two-thirds of the vector is used to pass back captured substrings' start and limit - // indexes. The remaining third of the vector is used as workspace by pcre_exec() while matching - // capturing subpatterns, and is not available for passing back information. - const size_t sizeOfOVector = (1 + numCaptures) * 3; - const char* compile_error; - int eoffset; - - // The C++ interface pcreccp.h doesn't have a way to capture the matched string (or the index of - // the match). So we are using the C interface. First we compile all the regex options to - // generate pcre object, which will later be used to match against the input string. - pcre* pcre = pcre_compile(pattern.rawData(), pcreOptions, &compile_error, &eoffset, nullptr); - if (pcre == nullptr) { - uasserted(51111, str::stream() << "Invalid Regex: " << compile_error); - } - - // TODO: Evaluate the upper bound for this array and fail the request if numCaptures are higher - // than the limit (SERVER-37848). - std::vector<int> outVector(sizeOfOVector); - const int out = pcre_exec(pcre, - 0, - input.rawData(), - input.size(), - startBytePos, - 0, // No need to overwrite the options set during pcre_compile. - &outVector.front(), - sizeOfOVector); - (*pcre_free)(pcre); - // The 'out' parameter will be zero if outVector's size is not big enough to hold all the - // captures, which should never be the case. - invariant(out != 0); - - // No match. - if (out < 0) { - return Value(BSONNULL); +class RegexMatchHandler { +public: + RegexMatchHandler(const Value& inputExpr) : _pcre(nullptr), _nullish(false) { + _validateInputAndExtractElements(inputExpr); + _compile(regex_util::flags2PcreOptions(_options, false).all_options()); } - // The first and second entires of the outVector have the start and limit indices of the matched - // string. as byte offsets. - const int matchStartByteIndex = outVector[0]; - // We iterate through the input string's contents preceding the match index, in order to convert - // the byte offset to a code point offset. - for (int byteIx = startBytePos; byteIx < matchStartByteIndex; ++startCodePointPos) { - byteIx += getCodePointLength(input[byteIx]); + ~RegexMatchHandler() { + if (_pcre != nullptr) { + pcre_free(_pcre); + } } - StringData matchedStr = input.substr(outVector[0], outVector[1] - outVector[0]); - std::vector<Value> captures; - // The next 2 * numCaptures entries hold the start index and limit pairs, for each of the - // capture groups. We skip the first two elements and start iteration from 3rd element so that - // we only construct the strings for capture groups. - for (int i = 0; i < numCaptures; i++) { - const int start = outVector[2 * (i + 1)]; - const int limit = outVector[2 * (i + 1) + 1]; - captures.push_back(Value(input.substr(start, limit - start))); - } + /** + * The function will match '_input' string based on the regex pattern present in '_pcre'. If + * there is a match, the function will return a 'Value' object encapsulating the matched string, + * the code point index of the matched string and a vector representing all the captured + * substrings. The function will also update the parameters 'startBytePos' and + * 'startCodePointPos' to the corresponding new indices. If there is no match, the function will + * return null 'Value' object. + */ + Value nextMatch(int* startBytePos, int* startCodePointPos) { + invariant(startBytePos != nullptr && startCodePointPos != nullptr); + + // Use input as StringData throughout the function to avoid copying the string on 'substr' + // calls. + StringData input = _input; + int execResult = pcre_exec(_pcre, + 0, + input.rawData(), + input.size(), + *startBytePos, + 0, // No need to overwrite the options set during pcre_compile. + &_capturesBuffer.front(), + _capturesBuffer.size()); + // No match. + if (execResult < 0) { + return Value(BSONNULL); + } + // The 'execResult' will be zero if _capturesBuffer's size is not big enough to hold all + // the captures, which should never be the case. + invariant(execResult == _numCaptures + 1); + + // The first and second entries of the '_capturesBuffer' will have the start and limit + // indices of the matched string, as byte offsets. '(limit - startIndex)' would be the + // length of the captured string. + const int matchStartByteIndex = _capturesBuffer[0]; + StringData matchedStr = + input.substr(matchStartByteIndex, _capturesBuffer[1] - matchStartByteIndex); + // We iterate through the input string's contents preceding the match index, in order to + // convert the byte offset to a code point offset. + for (int byteIx = *startBytePos; byteIx < matchStartByteIndex; ++(*startCodePointPos)) { + byteIx += getCodePointLength(input[byteIx]); + } + // Set the start index for match to the new one. + *startBytePos = matchStartByteIndex; + + std::vector<Value> captures; + captures.reserve(_numCaptures); + // The next '2 * numCaptures' entries (after the first two entries) of '_capturesBuffer' + // will hold the start index and limit pairs, for each of the capture groups. We skip the + // first two elements and start iteration from 3rd element so that we only construct the + // strings for capture groups. + for (int i = 0; i < _numCaptures; ++i) { + const int start = _capturesBuffer[2 * (i + 1)]; + const int limit = _capturesBuffer[2 * (i + 1) + 1]; + captures.push_back(Value(input.substr(start, limit - start))); + } - MutableDocument match; - match.addField("match", Value(matchedStr)); - match.addField("idx", Value(startCodePointPos)); - match.addField("captures", Value(captures)); - return match.freezeToValue(); -} + MutableDocument match; + match.addField("match", Value(matchedStr)); + match.addField("idx", Value(*startCodePointPos)); + match.addField("captures", Value(captures)); + return match.freezeToValue(); + } -} // namespace + int numCaptures() { + return _numCaptures; + } -Value ExpressionRegexFind::evaluate(const Document& root) const { + bool nullish() { + return _nullish; + } - const Value expr = vpOperand[0]->evaluate(root); - uassert(51103, - str::stream() << "$regexFind expects an object of named arguments, but found type " - << expr.getType(), - !expr.nullish() && expr.getType() == BSONType::Object); - Value textInput = expr.getDocument().getField("input"); - Value regexPattern = expr.getDocument().getField("regex"); - Value regexOptions = expr.getDocument().getField("options"); - - uassert(51104, - "input field should be of type string", - textInput.nullish() || textInput.getType() == BSONType::String); - uassert(51105, - "regex field should be of type string or regex", - regexPattern.nullish() || regexPattern.getType() == BSONType::String || - regexPattern.getType() == BSONType::RegEx); - uassert(51106, - "options should be of type string", - regexOptions.nullish() || regexOptions.getType() == BSONType::String); - if (textInput.nullish() || regexPattern.nullish()) { - return Value(BSONNULL); + StringData getInput() { + return _input; } - StringData pattern, optionFlags; - // The 'regex' field can be a RegEx object with its own options/options specified separately... - if (regexPattern.getType() == BSONType::RegEx) { - StringData regexFlags = regexPattern.getRegexFlags(); - pattern = regexPattern.getRegex(); - uassert( - 51107, - str::stream() << "Found regex option(s) specified in both 'regex' and 'option' fields", - regexOptions.nullish() || regexFlags.empty()); - optionFlags = regexOptions.nullish() ? regexFlags : regexOptions.getStringData(); - } else { - // ... or it can be a string field with options specified separately. - pattern = regexPattern.getStringData(); +private: + RegexMatchHandler(const RegexMatchHandler&) = delete; + + void _compile(const int pcreOptions) { + const char* compile_error; + int eoffset; + // The C++ interface pcreccp.h doesn't have a way to capture the matched string (or the + // index of the match). So we are using the C interface. First we compile all the regex + // options to generate pcre object, which will later be used to match against the input + // string. + _pcre = pcre_compile(_pattern.c_str(), pcreOptions, &compile_error, &eoffset, nullptr); + if (_pcre == nullptr) { + uasserted(51111, str::stream() << "Invalid Regex: " << compile_error); + } + + // Calculate the number of capture groups present in '_pattern' and store in '_numCaptures'. + int pcre_retval = pcre_fullinfo(_pcre, NULL, PCRE_INFO_CAPTURECOUNT, &_numCaptures); + invariant(pcre_retval == 0); + + // The first two-thirds of the vector is used to pass back captured substrings' start and + // limit indexes. The remaining third of the vector is used as workspace by pcre_exec() + // while matching capturing subpatterns, and is not available for passing back information. + // TODO: Evaluate the upper bound for this array and fail the request if numCaptures are + // higher than the limit (SERVER-37848). + _capturesBuffer = std::vector<int>((1 + _numCaptures) * 3); + } + + void _validateInputAndExtractElements(const Value& inputExpr) { + uassert(51103, + str::stream() << "$regexFind expects an object of named arguments, but found type " + << inputExpr.getType(), + inputExpr.getType() == BSONType::Object); + Value textInput = inputExpr.getDocument().getField("input"); + Value regexPattern = inputExpr.getDocument().getField("regex"); + Value regexOptions = inputExpr.getDocument().getField("options"); + + uassert(51104, + "'input' field should be of type string", + textInput.nullish() || textInput.getType() == BSONType::String); + uassert(51105, + "'regex' field should be of type string or regex", + regexPattern.nullish() || regexPattern.getType() == BSONType::String || + regexPattern.getType() == BSONType::RegEx); + uassert(51106, + "'options' should be of type string", + regexOptions.nullish() || regexOptions.getType() == BSONType::String); + + // If either the text input or regex pattern is nullish, then we consider the operation as a + // whole nullish. + _nullish = textInput.nullish() || regexPattern.nullish(); + + if (textInput.getType() == BSONType::String) { + _input = textInput.getString(); + } + + // The 'regex' field can be a RegEx object and may have its own options... + if (regexPattern.getType() == BSONType::RegEx) { + StringData regexFlags = regexPattern.getRegexFlags(); + _pattern = regexPattern.getRegex(); + uassert(51107, + str::stream() + << "Found regex option(s) specified in both 'regex' and 'option' fields", + regexOptions.nullish() || regexFlags.empty()); + if (!regexFlags.empty()) { + _options = regexFlags.toString(); + } + } else if (regexPattern.getType() == BSONType::String) { + // ...or it can be a string field with options specified separately. + _pattern = regexPattern.getString(); + } + // If 'options' is non-null, we must extract and validate its contents even if + // 'regexPattern' is nullish. if (!regexOptions.nullish()) { - optionFlags = regexOptions.getStringData(); + _options = regexOptions.getString(); } - } - uassert(51109, - "Regular expression cannot contain an embedded null byte", - pattern.find('\0', 0) == string::npos); - uassert(51110, - "Regular expression options string cannot contain an embedded null byte", - optionFlags.find('\0', 0) == string::npos); + uassert(51109, + "Regular expression cannot contain an embedded null byte", + _pattern.find('\0', 0) == string::npos); + uassert(51110, + "Regular expression options string cannot contain an embedded null byte", + _options.find('\0', 0) == string::npos); + } + + pcre* _pcre; + // Number of capture groups present in '_pattern'. + int _numCaptures; + // Holds the start and limit indices of match and captures for the current match. + std::vector<int> _capturesBuffer; + std::string _input; + std::string _pattern; + std::string _options; + bool _nullish; +}; - pcrecpp::RE_Options opt = regex_util::flags2PcreOptions(optionFlags, false); - pcrecpp::RE regex(pattern.rawData(), opt); - return generateRegexCapturesAndMatches( - pattern, regex.NumberOfCapturingGroups(), opt, textInput.getStringData(), 0, 0); -} +} // namespace +Value ExpressionRegexFind::evaluate(const Document& root) const { + + RegexMatchHandler regex(vpOperand[0]->evaluate(root)); + if (regex.nullish()) { + return Value(BSONNULL); + } + int startByteIndex = 0, startCodePointIndex = 0; + return regex.nextMatch(&startByteIndex, &startCodePointIndex); +} REGISTER_EXPRESSION(regexFind, ExpressionRegexFind::parse); const char* ExpressionRegexFind::getOpName() const { return "$regexFind"; } +Value ExpressionRegexFindAll::evaluate(const Document& root) const { + + std::vector<Value> output; + RegexMatchHandler regex(vpOperand[0]->evaluate(root)); + if (regex.nullish()) { + return Value(output); + } + int startByteIndex = 0, startCodePointIndex = 0; + StringData input = regex.getInput(); + + // Using do...while loop because, when input is an empty string, we still want to see if there + // is a match. + do { + auto matchObj = regex.nextMatch(&startByteIndex, &startCodePointIndex); + if (matchObj.getType() == BSONType::jstNULL) { + break; + } + output.push_back(matchObj); + std::string matchStr = matchObj.getDocument().getField("match").getString(); + if (matchStr.empty()) { + // This would only happen if the regex matched an empty string. In this case, even if + // the character at startByteIndex matches the regex, we cannot return it since we are + // already returing an empty string starting at this index. So we move on to the next + // byte index. + startByteIndex += getCodePointLength(input[startByteIndex]); + ++startCodePointIndex; + continue; + } + // We don't want any overlapping sub-strings. So we move 'startByteIndex' to point to the + // byte after 'matchStr'. We move the code point index also correspondingly. + startByteIndex += matchStr.size(); + for (size_t byteIx = 0; byteIx < matchStr.size(); ++startCodePointIndex) { + byteIx += getCodePointLength(matchStr[byteIx]); + } + invariant(startByteIndex > 0 && startCodePointIndex > 0 && + startCodePointIndex <= startByteIndex); + } while (static_cast<size_t>(startByteIndex) < input.size()); + return Value(output); +} + +REGISTER_EXPRESSION(regexFindAll, ExpressionRegexFindAll::parse); +const char* ExpressionRegexFindAll::getOpName() const { + return "$regexFindAll"; +} + } // namespace mongo diff --git a/src/mongo/db/pipeline/expression.h b/src/mongo/db/pipeline/expression.h index 4aa91a67086..b0949cca3fc 100644 --- a/src/mongo/db/pipeline/expression.h +++ b/src/mongo/db/pipeline/expression.h @@ -2093,7 +2093,7 @@ private: boost::intrusive_ptr<Expression> _onNull; }; -class ExpressionRegexFind final : public ExpressionFixedArity<ExpressionRegexFind, 1> { +class ExpressionRegexFind : public ExpressionFixedArity<ExpressionRegexFind, 1> { public: explicit ExpressionRegexFind(const boost::intrusive_ptr<ExpressionContext>& expCtx) : ExpressionFixedArity<ExpressionRegexFind, 1>(expCtx) {} @@ -2101,4 +2101,13 @@ public: Value evaluate(const Document& root) const final; const char* getOpName() const final; }; + +class ExpressionRegexFindAll final : public ExpressionFixedArity<ExpressionRegexFindAll, 1> { +public: + explicit ExpressionRegexFindAll(const boost::intrusive_ptr<ExpressionContext>& expCtx) + : ExpressionFixedArity<ExpressionRegexFindAll, 1>(expCtx) {} + + Value evaluate(const Document& root) const final; + const char* getOpName() const final; +}; } diff --git a/src/mongo/db/pipeline/expression_test.cpp b/src/mongo/db/pipeline/expression_test.cpp index ca254af56c7..c9bc46a2c8b 100644 --- a/src/mongo/db/pipeline/expression_test.cpp +++ b/src/mongo/db/pipeline/expression_test.cpp @@ -5950,7 +5950,7 @@ TEST(GetComputedPathsTest, ExpressionMapNotConsideredRenameWithDottedInputPath) } // namespace GetComputedPathsTest -namespace ExpressionRegexFindTest { +namespace ExpressionRegexTest { TEST(ExpressionRegexFindTest, BasicTest) { Value input(fromjson("{input: 'asdf', regex: '^as' }")); @@ -5979,11 +5979,64 @@ TEST(ExpressionRegexFindTest, FailureCase) { intrusive_ptr<ExpressionContextForTest> expCtx(new ExpressionContextForTest()); ExpressionRegexFind regexF(expCtx); regexF.addOperand(ExpressionConstant::create(expCtx, input)); - ASSERT_THROWS(regexF.evaluate(Document()), DBException); + ASSERT_THROWS_CODE(regexF.evaluate(Document()), DBException, 51105); } +TEST(ExpressionRegexFindAllTest, MultipleMatches) { + Value input(fromjson("{input: 'a1b2c3', regex: '([a-c][1-3])' }")); + std::vector<Value> expectedOut = {Value(fromjson("{match: 'a1', idx:0, captures:['a1']}")), + Value(fromjson("{match: 'b2', idx:2, captures:['b2']}")), + Value(fromjson("{match: 'c3', idx:4, captures:['c3']}"))}; + intrusive_ptr<ExpressionContextForTest> expCtx(new ExpressionContextForTest()); + ExpressionRegexFindAll regexF(expCtx); + regexF.addOperand(ExpressionConstant::create(expCtx, input)); + Value output = regexF.evaluate(Document()); + ASSERT_VALUE_EQ(output, Value(expectedOut)); +} + +TEST(ExpressionRegexFindAllTest, NoMatch) { + Value input(fromjson("{input: 'a1b2c3', regex: 'ab' }")); + intrusive_ptr<ExpressionContextForTest> expCtx(new ExpressionContextForTest()); + ExpressionRegexFindAll regexF(expCtx); + regexF.addOperand(ExpressionConstant::create(expCtx, input)); + Value output = regexF.evaluate(Document()); + ASSERT_VALUE_EQ(output, Value(std::vector<Value>())); +} + +TEST(ExpressionRegexFindAllTest, FailureCase) { + Value input(fromjson("{input: 'FirstLine\\nSecondLine', regex: '[0-9'}")); + intrusive_ptr<ExpressionContextForTest> expCtx(new ExpressionContextForTest()); + ExpressionRegexFindAll regexF(expCtx); + regexF.addOperand(ExpressionConstant::create(expCtx, input)); + ASSERT_THROWS_CODE(regexF.evaluate(Document()), DBException, 51111); +} + +TEST(ExpressionRegexFindAllTest, InvalidUTF8InInput) { + std::string inputField = "1234 "; + // Append an invalid UTF-8 character. + inputField += static_cast<char>(0xE5); + inputField += " 1234"; + Value input(fromjson("{input: '" + inputField + "', regex: '[0-9]'}")); + intrusive_ptr<ExpressionContextForTest> expCtx(new ExpressionContextForTest()); + ExpressionRegexFindAll regexF(expCtx); + regexF.addOperand(ExpressionConstant::create(expCtx, input)); + // Verify no match if there is an invalid UTF-8 character in input. + ASSERT_VALUE_EQ(regexF.evaluate(Document()), Value(std::vector<Value>())); +} + +TEST(ExpressionRegexFindAllTest, InvalidUTF8InRegex) { + std::string regexField = "1234 "; + // Append an invalid UTF-8 character. + regexField += static_cast<char>(0xE5); + Value input(fromjson("{input: '123456', regex: '" + regexField + "'}")); + intrusive_ptr<ExpressionContextForTest> expCtx(new ExpressionContextForTest()); + ExpressionRegexFindAll regexF(expCtx); + regexF.addOperand(ExpressionConstant::create(expCtx, input)); + // Verify that PCRE will error if REGEX is not a valid UTF-8. + ASSERT_THROWS_CODE(regexF.evaluate(Document()), DBException, 51111); +} -} // namespace ExpressionRegexFindTest +} // namespace ExpressionRegexTest class All : public Suite { public: |