summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--jstests/aggregation/expressions/regex.js438
-rw-r--r--jstests/aggregation/expressions/regexFind.js339
-rw-r--r--src/mongo/db/pipeline/expression.cpp338
-rw-r--r--src/mongo/db/pipeline/expression.h11
-rw-r--r--src/mongo/db/pipeline/expression_test.cpp59
5 files changed, 728 insertions, 457 deletions
diff --git a/jstests/aggregation/expressions/regex.js b/jstests/aggregation/expressions/regex.js
new file mode 100644
index 00000000000..31b04b54976
--- /dev/null
+++ b/jstests/aggregation/expressions/regex.js
@@ -0,0 +1,438 @@
+/*
+ * Tests for the $regexFind and $regexFindAll aggregation expressions.
+ */
+(function() {
+ 'use strict';
+ load("jstests/aggregation/extras/utils.js"); // For assertErrorCode().
+ const coll = db.regex_find_expr;
+ coll.drop();
+
+ function testRegex(expression, inputObj, expectedOutput) {
+ const result =
+ coll.aggregate([
+ {"$project": {_id: 0, "matches": {[expression]: inputObj}}},
+ {"$sort": {"matches": 1}} // Sort to ensure the documents are returned in a
+ // deterministic order for sharded clusters.
+ ])
+ .toArray();
+ assert.eq(result, expectedOutput);
+ }
+ function testRegexForKey(expression, key, inputObj, expectedMatchObj) {
+ const result =
+ coll.aggregate(
+ [{"$match": {"_id": key}}, {"$project": {"matches": {[expression]: inputObj}}}])
+ .toArray();
+ const expectedOutput = [{"_id": key, "matches": expectedMatchObj}];
+ assert.eq(result, expectedOutput);
+ }
+
+ /**
+ * This function validates the output against both $regexFind and $regexFindAll expressions.
+ */
+ function testRegexFindAgg(inputObj, expectedOutputForFindAll) {
+ testRegex("$regexFindAll", inputObj, expectedOutputForFindAll);
+
+ // For each output document, take the first element of its "matches" array (or null if the
+ // array is empty). This converts the '$regexFindAll' output into the '$regexFind' output.
+ const expectedOutputForFind = expectedOutputForFindAll.map(
+ (element) => ({matches: element.matches.length == 0 ? null : element.matches[0]}));
+ testRegex("$regexFind", inputObj, expectedOutputForFind);
+ }
+
+ /**
+ * Validates the output of both $regexFind and $regexFindAll for the document whose _id is 'key'.
+ */
+ function testRegexFindAggForKey(key, inputObj, expectedOutputForFindAll) {
+ testRegexForKey("$regexFindAll", key, inputObj, expectedOutputForFindAll);
+ const expectedOutputForFind =
+ expectedOutputForFindAll.length == 0 ? null : expectedOutputForFindAll[0];
+ testRegexForKey("$regexFind", key, inputObj, expectedOutputForFind);
+ }
+
+ /**
+ * Validates that both $regexFind and $regexFindAll report 'exceptionCode' for 'inputObj'.
+ */
+ function testRegexAggException(inputObj, exceptionCode) {
+ assertErrorCode(
+ coll, [{"$project": {"matches": {"$regexFindAll": inputObj}}}], exceptionCode);
+ assertErrorCode(coll, [{"$project": {"matches": {"$regexFind": inputObj}}}], exceptionCode);
+ }
+
+ (function testWithSingleMatch() {
+ // Regex in string notation, find with multiple captures and matches.
+ assert.commandWorked(coll.insert({_id: 0, text: "Simple Example "}));
+ testRegexFindAggForKey(0, {input: "$text", regex: "(m(p))"}, [
+ {"match": "mp", "idx": 2, "captures": ["mp", "p"]},
+ {"match": "mp", "idx": 10, "captures": ["mp", "p"]}
+ ]);
+ // Regex in json syntax, with multiple captures and matches.
+ testRegexFindAggForKey(0, {input: "$text", regex: /(m(p))/}, [
+ {"match": "mp", "idx": 2, "captures": ["mp", "p"]},
+ {"match": "mp", "idx": 10, "captures": ["mp", "p"]}
+ ]);
+ // Verify no overlapping match sub-strings.
+ assert.commandWorked(coll.insert({_id: 112, text: "aaaaa aaaa"}));
+ testRegexFindAggForKey(112, {input: "$text", regex: /(aa)/}, [
+ {"match": "aa", "idx": 0, "captures": ["aa"]},
+ {"match": "aa", "idx": 2, "captures": ["aa"]},
+ {"match": "aa", "idx": 6, "captures": ["aa"]},
+ {"match": "aa", "idx": 8, "captures": ["aa"]}
+ ]);
+ testRegexFindAggForKey(112, {input: "$text", regex: /(aa)+/}, [
+ {"match": "aaaa", "idx": 0, "captures": ["aa"]},
+ {"match": "aaaa", "idx": 6, "captures": ["aa"]}
+ ]);
+ // Verify greedy match.
+ testRegexFindAggForKey(112, {input: "$text", regex: /(a+)/}, [
+ {"match": "aaaaa", "idx": 0, "captures": ["aaaaa"]},
+ {"match": "aaaa", "idx": 6, "captures": ["aaaa"]},
+ ]);
+ testRegexFindAggForKey(112, {input: "$text", regex: /(a)+/}, [
+ {"match": "aaaaa", "idx": 0, "captures": ["a"]},
+ {"match": "aaaa", "idx": 6, "captures": ["a"]},
+ ]);
+ // Verify lazy match.
+ assert.commandWorked(coll.insert({_id: 113, text: "aaa aa"}));
+ testRegexFindAggForKey(113, {input: "$text", regex: /(a+?)/}, [
+ {"match": "a", "idx": 0, "captures": ["a"]},
+ {"match": "a", "idx": 1, "captures": ["a"]},
+ {"match": "a", "idx": 2, "captures": ["a"]},
+ {"match": "a", "idx": 4, "captures": ["a"]},
+ {"match": "a", "idx": 5, "captures": ["a"]}
+ ]);
+ testRegexFindAggForKey(113, {input: "$text", regex: /(a*?)/}, [
+ {"match": "", "idx": 0, "captures": [""]},
+ {"match": "", "idx": 1, "captures": [""]},
+ {"match": "", "idx": 2, "captures": [""]},
+ {"match": "", "idx": 3, "captures": [""]},
+ {"match": "", "idx": 4, "captures": [""]},
+ {"match": "", "idx": 5, "captures": [""]}
+ ]);
+
+ // Regex string groups within group.
+ testRegexFindAggForKey(
+ 0,
+ {input: "$text", regex: "((S)(i)(m)(p)(l)(e))"},
+ [{"match": "Simple", "idx": 0, "captures": ["Simple", "S", "i", "m", "p", "l", "e"]}]);
+ testRegexFindAggForKey(
+ 0,
+ {input: "$text", regex: "(S)(i)(m)((p)(l)(e))"},
+ [{"match": "Simple", "idx": 0, "captures": ["S", "i", "m", "ple", "p", "l", "e"]}]);
+
+ // Regex email pattern.
+ assert.commandWorked(
+ coll.insert({_id: 1, text: "Some field text with email mongo@mongodb.com"}));
+ testRegexFindAggForKey(
+ 1,
+ {input: "$text", regex: "([a-zA-Z0-9._-]+)@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+"},
+ [{"match": "mongo@mongodb.com", "idx": 27, "captures": ["mongo"]}]);
+
+ // Regex digits.
+ assert.commandWorked(coll.insert({_id: 5, text: "Text with 02 digits"}));
+ testRegexFindAggForKey(
+ 5, {input: "$text", regex: /[0-9]+/}, [{"match": "02", "idx": 10, "captures": []}]);
+ testRegexFindAggForKey(
+ 5, {input: "$text", regex: /(\d+)/}, [{"match": "02", "idx": 10, "captures": ["02"]}]);
+
+ // Regex a non-capture group.
+ assert.commandWorked(coll.insert({_id: 6, text: "1,2,3,4,5,6,7,8,9,10"}));
+ testRegexFindAggForKey(6,
+ {input: "$text", regex: /^(?:1|a)\,([0-9]+)/},
+ [{"match": "1,2", "idx": 0, "captures": ["2"]}]);
+
+ // Regex quantifier.
+ assert.commandWorked(coll.insert({_id: 7, text: "abc12defgh345jklm"}));
+ testRegexFindAggForKey(
+ 7, {input: "$text", regex: /[0-9]{3}/}, [{"match": "345", "idx": 10, "captures": []}]);
+
+ // Regex case insensitive option.
+ assert.commandWorked(coll.insert({_id: 8, text: "This Is Camel Case"}));
+ testRegexFindAggForKey(8, {input: "$text", regex: /camel/}, []);
+ testRegexFindAggForKey(
+ 8, {input: "$text", regex: /camel/i}, [{"match": "Camel", "idx": 8, "captures": []}]);
+ testRegexFindAggForKey(8,
+ {input: "$text", regex: /camel/, options: "i"},
+ [{"match": "Camel", "idx": 8, "captures": []}]);
+ testRegexFindAggForKey(8,
+ {input: "$text", regex: "camel", options: "i"},
+ [{"match": "Camel", "idx": 8, "captures": []}]);
+
+ // Regex multi line option.
+ assert.commandWorked(coll.insert({_id: 9, text: "Foo line1\nFoo line2\nFoo line3"}));
+ // Verify no match with options flag off.
+ testRegexFindAggForKey(9, {input: "$text", regex: /^Foo line\d$/}, []);
+ // Verify match when flag is on.
+ testRegexFindAggForKey(9, {input: "$text", regex: /(^Foo line\d$)/m}, [
+ {"match": "Foo line1", "idx": 0, "captures": ["Foo line1"]},
+ {"match": "Foo line2", "idx": 10, "captures": ["Foo line2"]},
+ {"match": "Foo line3", "idx": 20, "captures": ["Foo line3"]}
+ ]);
+
+ // Regex single line option.
+ testRegexFindAggForKey(9, {input: "$text", regex: "Foo.*line"}, [
+ {"match": "Foo line", "idx": 0, "captures": []},
+ {"match": "Foo line", "idx": 10, "captures": []},
+ {"match": "Foo line", "idx": 20, "captures": []}
+ ]);
+ testRegexFindAggForKey(
+ 9,
+ {input: "$text", regex: "Foo.*line", options: "s"},
+ [{"match": "Foo line1\nFoo line2\nFoo line", "idx": 0, "captures": []}]);
+
+ // Regex extended option.
+ testRegexFindAggForKey(9, {input: "$text", regex: "F o o # a comment"}, []);
+ testRegexFindAggForKey(9, {input: "$text", regex: "F o o # a comment", options: "x"}, [
+ {"match": "Foo", "idx": 0, "captures": []},
+ {"match": "Foo", "idx": 10, "captures": []},
+ {"match": "Foo", "idx": 20, "captures": []}
+ ]);
+ testRegexFindAggForKey(
+ 9, {input: "$text", regex: "F o o # a comment \n\n# ignored", options: "x"}, [
+ {"match": "Foo", "idx": 0, "captures": []},
+ {"match": "Foo", "idx": 10, "captures": []},
+ {"match": "Foo", "idx": 20, "captures": []}
+ ]);
+ testRegexFindAggForKey(9, {input: "$text", regex: "(F o o) # a comment", options: "x"}, [
+ {"match": "Foo", "idx": 0, "captures": ["Foo"]},
+ {"match": "Foo", "idx": 10, "captures": ["Foo"]},
+ {"match": "Foo", "idx": 20, "captures": ["Foo"]}
+ ]);
+
+ // Regex pattern from a document field value.
+ assert.commandWorked(
+ coll.insert({_id: 10, text: "Simple Value Example", pattern: "(m(p))"}));
+ testRegexFindAggForKey(10, {input: "$text", regex: "$pattern"}, [
+ {"match": "mp", "idx": 2, "captures": ["mp", "p"]},
+ {"match": "mp", "idx": 16, "captures": ["mp", "p"]}
+ ]);
+ assert.commandWorked(coll.insert({_id: 11, text: "OtherText", pattern: /(T(e))xt$/}));
+ testRegexFindAggForKey(11,
+ {input: "$text", regex: "$pattern"},
+ [{"match": "Text", "idx": 5, "captures": ["Te", "e"]}]);
+
+ // Empty input matches empty regex.
+ testRegexFindAggForKey(
+ 0, {input: "", regex: ""}, [{"match": "", "idx": 0, "captures": []}]);
+ // Empty capture groups.
+ testRegexFindAggForKey(0, {input: "bbbb", regex: "()"}, [
+ {"match": "", "idx": 0, "captures": [""]},
+ {"match": "", "idx": 1, "captures": [""]},
+ {"match": "", "idx": 2, "captures": [""]},
+ {"match": "", "idx": 3, "captures": [""]}
+ ]);
+ // No matches.
+ testRegexFindAggForKey(0, {input: "$text", regex: /foo/}, []);
+ // Regex null.
+ testRegexFindAggForKey(0, {input: "$text", regex: null}, []);
+ // Regex not present.
+ testRegexFindAggForKey(0, {input: "$text"}, []);
+ // Input not present.
+ testRegexFindAggForKey(0, {regex: /valid/}, []);
+ // Input null.
+ testRegexFindAggForKey(0, {input: null, regex: /valid/}, []);
+ // Empty object.
+ testRegexFindAggForKey(0, {}, []);
+ })();
+
+ (function testWithStartOptions() {
+ coll.drop();
+ assert.commandWorked(coll.insert({_id: 2, text: "cafétéria"}));
+ assert.commandWorked(coll.insert({_id: 3, text: "ab\ncd"}));
+
+ // LIMIT_MATCH option to limit the number of comparisons PCRE does internally.
+ testRegexFindAggForKey(2, {input: "$text", regex: "(*LIMIT_MATCH=1)fé"}, []);
+ testRegexFindAggForKey(2,
+ {input: "$text", regex: "(*LIMIT_MATCH=3)(fé)"},
+ [{"match": "fé", "idx": 2, "captures": ["fé"]}]);
+
+ // (*LF) sets the newline convention to a lone line feed (UNIX-like) and (*CR) to a lone carriage
+ // return. Since '.' does not match a newline by default, it matches '\n' with CR but not LF.
+ testRegexFindAggForKey(3, {input: "$text", regex: "(*LF)ab.cd"}, []);
+ testRegexFindAggForKey(3,
+ {input: "$text", regex: "(*CR)ab.cd"},
+ [{"match": "ab\ncd", "idx": 0, "captures": []}]);
+
+ // Multiple start options.
+ testRegexFindAggForKey(2,
+ {input: "$text", regex: String.raw `(*LIMIT_MATCH=5)(*UCP)^(\w+)`},
+ [{"match": "cafétéria", "idx": 0, "captures": ["cafétéria"]}]);
+ testRegexFindAggForKey(
+ 2, {input: "$text", regex: String.raw `(*LIMIT_MATCH=1)(*UCP)^(\w+)`}, []);
+ })();
+
+ (function testWithUnicodeData() {
+ coll.drop();
+ // Unicode index counting.
+ assert.commandWorked(coll.insert({_id: 2, text: "cafétéria"}));
+ assert.commandWorked(coll.insert({_id: 3, text: "मा०गो डीबि"}));
+ testRegexFindAggForKey(
+ 2, {input: "$text", regex: "té"}, [{"match": "té", "idx": 4, "captures": []}]);
+ testRegexFindAggForKey(
+ 3, {input: "$text", regex: /म/}, [{"match": "म", "idx": 0, "captures": []}]);
+ // Unicode with capture group.
+ testRegexFindAggForKey(3,
+ {input: "$text", regex: /(गो )/},
+ [{"match": "गो ", "idx": 3, "captures": ["गो "]}]);
+ // Test that regexes support Unicode character properties.
+ testRegexFindAggForKey(2, {input: "$text", regex: String.raw `\p{Hangul}`}, []);
+ testRegexFindAggForKey(2,
+ {input: "$text", regex: String.raw `\p{Latin}+$`},
+ [{"match": "cafétéria", "idx": 0, "captures": []}]);
+ // Test that the (*UTF) and (*UTF8) options are accepted for unicode characters.
+ assert.commandWorked(coll.insert({_id: 12, text: "༢༣༤༤༤༥12༥A"}));
+ testRegexFindAggForKey(12, {input: "$text", regex: "(*UTF8)༤"}, [
+ {"match": "༤", "idx": 2, "captures": []},
+ {"match": "༤", "idx": 3, "captures": []},
+ {"match": "༤", "idx": 4, "captures": []}
+ ]);
+ testRegexFindAggForKey(12, {input: "$text", regex: "(*UTF)༤"}, [
+ {"match": "༤", "idx": 2, "captures": []},
+ {"match": "༤", "idx": 3, "captures": []},
+ {"match": "༤", "idx": 4, "captures": []}
+ ]);
+ // For ASCII characters.
+ assert.commandWorked(coll.insert({_id: 4, text: "123444"}));
+ testRegexFindAggForKey(4,
+ {input: "$text", regex: "(*UTF8)(44)"},
+ [{"match": "44", "idx": 3, "captures": ["44"]}]);
+ testRegexFindAggForKey(4,
+ {input: "$text", regex: "(*UTF)(44)"},
+ [{"match": "44", "idx": 3, "captures": ["44"]}]);
+
+ // When the (*UCP) option is specified, Unicode "word" characters are included in the '\w'
+ // character type.
+ testRegexFindAggForKey(12,
+ {input: "$text", regex: String.raw `(*UCP)^(\w+)`},
+ [{"match": "༢༣༤༤༤༥12༥A", "idx": 0, "captures": ["༢༣༤༤༤༥12༥A"]}]);
+ // When the (*UCP) option is specified, [:digit:] becomes \p{N} and matches all Unicode
+ // decimal digit characters.
+ testRegexFindAggForKey(12,
+ {input: "$text", regex: "(*UCP)^[[:digit:]]+"},
+ [{"match": "༢༣༤༤༤༥12༥", "idx": 0, "captures": []}]);
+ testRegexFindAggForKey(12, {input: "$text", regex: "(*UCP)[[:digit:]]+$"}, []);
+ // When the (*UCP) option is specified, [:alpha:] becomes \p{L} and matches all Unicode
+ // alphabetic characters.
+ assert.commandWorked(coll.insert({_id: 13, text: "박정수AB"}));
+ testRegexFindAggForKey(13,
+ {input: "$text", regex: String.raw `(*UCP)^[[:alpha:]]+`},
+ [{"match": "박정수AB", "idx": 0, "captures": []}]);
+
+ // No match when options are not set.
+ testRegexFindAggForKey(12, {input: "$text", regex: String.raw `^(\w+)`}, []);
+ testRegexFindAggForKey(12, {input: "$text", regex: "^[[:digit:]]"}, []);
+ testRegexFindAggForKey(2, {input: "$text", regex: "^[[:alpha:]]+$"}, []);
+ })();
+
+ (function testErrors() {
+ coll.drop();
+ assert.commandWorked(coll.insert({text: "string"}));
+ // Null object.
+ testRegexAggException(null, 51103);
+ // Incorrect object parameter.
+ testRegexAggException("incorrect type", 51103);
+ // Test malformed regex.
+ testRegexAggException({input: "$text", regex: "[0-9"}, 51111);
+ testRegexAggException({regex: "[a-c"}, 51111);
+ // Malformed regex because start options not at the beginning.
+ testRegexAggException({input: "$text", regex: "^(*UCP)[[:alpha:]]+$"}, 51111);
+ testRegexAggException({input: "$text", regex: "((*UCP)[[:alpha:]]+$)"}, 51111);
+ // At least one of the 'input' fields is not a string.
+ assert.commandWorked(coll.insert({a: "string"}));
+ assert.commandWorked(coll.insert({a: {b: "object"}}));
+ testRegexAggException({input: "$a", regex: "valid"}, 51104);
+ testRegexAggException({input: "$a"}, 51104);
+ // 'regex' field is not string or regex.
+ testRegexAggException({input: "$text", regex: ["incorrect"]}, 51105);
+ // 'options' field is not string.
+ testRegexAggException({input: "$text", regex: "valid", options: 123}, 51106);
+ // Incorrect 'options' flag.
+ testRegexAggException({input: "$text", regex: "valid", options: 'a'}, 51108);
+ // 'options' are case-sensitive.
+ testRegexAggException({input: "$text", regex: "valid", options: "I"}, 51108);
+ // Options specified in both 'regex' and 'options'.
+ testRegexAggException({input: "$text", regex: /(m(p))/i, options: "i"}, 51107);
+ testRegexAggException({input: "$text", regex: /(m(p))/i, options: "x"}, 51107);
+ testRegexAggException({input: "$text", regex: /(m(p))/m, options: ""}, 51107);
+ // 'regex' as string with null characters.
+ testRegexAggException({input: "$text", regex: "sasd\0", options: "i"}, 51109);
+ testRegexAggException({regex: "sa\x00sd", options: "i"}, 51109);
+ // 'options' as string with null characters.
+ testRegexAggException({input: "$text", regex: /(m(p))/, options: "i\0"}, 51110);
+ testRegexAggException({input: "$text", options: "i\x00"}, 51110);
+ })();
+
+ (function testMultipleMatches() {
+ coll.drop();
+ assert.commandWorked(coll.insert({a: "string1string2"}));
+ assert.commandWorked(coll.insert({a: "string3 string4"}));
+ // Both match.
+ testRegexFindAgg({input: "$a", regex: "(str.*?[0-9])"}, [
+ {
+ "matches": [
+ {"match": "string1", "idx": 0, "captures": ["string1"]},
+ {"match": "string2", "idx": 7, "captures": ["string2"]}
+ ]
+ },
+ {
+ "matches": [
+ {"match": "string3", "idx": 0, "captures": ["string3"]},
+ {"match": "string4", "idx": 8, "captures": ["string4"]}
+ ]
+ }
+ ]);
+ // Only one match.
+ testRegexFindAgg({input: "$a", regex: "(^.*[0-2]$)"}, [
+ {"matches": []},
+ {"matches": [{"match": "string1string2", "idx": 0, "captures": ["string1string2"]}]}
+
+ ]);
+ // None match.
+ testRegexFindAgg({input: "$a", regex: "(^.*[5-9]$)"}, [{"matches": []}, {"matches": []}]);
+ })();
+
+ (function testInsideCondOperator() {
+ coll.drop();
+ assert.commandWorked(
+ coll.insert({_id: 0, level: "Public Knowledge", info: "Company Name"}));
+ assert.commandWorked(
+ coll.insert({_id: 1, level: "Private Information", info: "Company Secret"}));
+ const expectedResults =
+ [{"_id": 0, "information": "Company Name"}, {"_id": 1, "information": "REDACTED"}];
+ // For $regexFindAll.
+ let result =
+ coll.aggregate([{
+ "$project": {
+ "information": {
+ "$cond": [
+ {
+ "$eq":
+ [{"$regexFindAll": {input: "$level", regex: /public/i}}, []]
+ },
+ "REDACTED",
+ "$info"
+ ]
+ }
+ }
+ }])
+ .toArray();
+ assert.eq(result, expectedResults);
+ // For $regexFind.
+ result =
+ coll.aggregate([{
+ "$project": {
+ "information": {
+ "$cond": [
+ {
+ "$eq":
+ [{"$regexFind": {input: "$level", regex: /public/i}}, null]
+ },
+ "REDACTED",
+ "$info"
+ ]
+ }
+ }
+ }])
+ .toArray();
+ assert.eq(result, expectedResults);
+ })();
+}());
diff --git a/jstests/aggregation/expressions/regexFind.js b/jstests/aggregation/expressions/regexFind.js
deleted file mode 100644
index e5743a453c5..00000000000
--- a/jstests/aggregation/expressions/regexFind.js
+++ /dev/null
@@ -1,339 +0,0 @@
-/*
- * Tests for $regexFind aggregation expression.
- */
-(function() {
- 'use strict';
-
- load("jstests/aggregation/extras/utils.js"); // For assertErrorCode().
-
- const coll = db.regex_find_expr;
- coll.drop();
-
- function testRegexFindAgg(regexFind, expectedOutput) {
- const result =
- coll.aggregate([
- {"$project": {_id: 0, "matches": {"$regexFind": regexFind}}},
- {"$sort": {"matches": 1}} // Ensure that the documents are returned in a
- // deterministic order for sharded clusters.
- ])
- .toArray();
- assert.eq(result, expectedOutput);
- }
- function testRegexFindAggForKey(key, regexFind, expectedMatchObj) {
- const result = coll.aggregate([
- {"$match": {"_id": key}},
- {"$project": {"matches": {"$regexFind": regexFind}}}
- ])
- .toArray();
- const expectedOutput = [{"_id": key, "matches": expectedMatchObj}];
- assert.eq(result, expectedOutput);
- }
- function testRegexFindAggException(regexFind, exceptionCode) {
- assertErrorCode(
- coll, [{"$project": {"matches": {"$regexFind": regexFind}}}], exceptionCode);
- }
-
- (function testWithSingleMatch() {
- // Regex in string notation, find with multiple captures.
- assert.commandWorked(coll.insert({_id: 0, text: "Simple Example"}));
- testRegexFindAggForKey(0,
- {input: "$text", regex: "(m(p))"},
- {"match": "mp", "idx": 2, "captures": ["mp", "p"]});
- // Regex in json syntax, with multiple captures.
- testRegexFindAggForKey(0, {input: "$text", regex: /(S)(i)(m)(p)(l)(e) (Ex)(am)(p)(le)/}, {
- "match": "Simple Example",
- "idx": 0,
- "captures": ["S", "i", "m", "p", "l", "e", "Ex", "am", "p", "le"]
- });
-
- // Regex string groups within group.
- testRegexFindAggForKey(
- 0,
- {input: "$text", regex: "((S)(i)(m)(p)(l)(e))"},
- {"match": "Simple", "idx": 0, "captures": ["Simple", "S", "i", "m", "p", "l", "e"]});
- testRegexFindAggForKey(
- 0,
- {input: "$text", regex: "(S)(i)(m)((p)(l)(e))"},
- {"match": "Simple", "idx": 0, "captures": ["S", "i", "m", "ple", "p", "l", "e"]});
-
- // Regex email pattern.
- assert.commandWorked(
- coll.insert({_id: 1, text: "Some field text with email mongo@mongodb.com"}));
- testRegexFindAggForKey(
- 1,
- {input: "$text", regex: "([a-zA-Z0-9._-]+)@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+"},
- {"match": "mongo@mongodb.com", "idx": 27, "captures": ["mongo"]});
-
- // Regex digits.
- assert.commandWorked(coll.insert({_id: 5, text: "Text with 02 digits"}));
- testRegexFindAggForKey(
- 5, {input: "$text", regex: /[0-9]+/}, {"match": "02", "idx": 10, "captures": []});
- testRegexFindAggForKey(
- 5, {input: "$text", regex: /(\d+)/}, {"match": "02", "idx": 10, "captures": ["02"]});
-
- // Regex a non-capture group.
- assert.commandWorked(coll.insert({_id: 6, text: "1,2,3,4,5,6,7,8,9,10"}));
- testRegexFindAggForKey(6,
- {input: "$text", regex: /^(?:1|a)\,([0-9]+)/},
- {"match": "1,2", "idx": 0, "captures": ["2"]});
-
- // Regex quantifier.
- assert.commandWorked(coll.insert({_id: 7, text: "abc12defgh345jklm"}));
- testRegexFindAggForKey(
- 7, {input: "$text", regex: /[0-9]{3}/}, {"match": "345", "idx": 10, "captures": []});
-
- // Regex case insensitive option.
- assert.commandWorked(coll.insert({_id: 8, text: "This Is Camel Case"}));
- testRegexFindAggForKey(8, {input: "$text", regex: /camel/}, null);
- testRegexFindAggForKey(
- 8, {input: "$text", regex: /camel/i}, {"match": "Camel", "idx": 8, "captures": []});
- testRegexFindAggForKey(8,
- {input: "$text", regex: /camel/, options: "i"},
- {"match": "Camel", "idx": 8, "captures": []});
- testRegexFindAggForKey(8,
- {input: "$text", regex: "camel", options: "i"},
- {"match": "Camel", "idx": 8, "captures": []});
-
- // Regex multi line option.
- assert.commandWorked(coll.insert({_id: 9, text: "Foo line1\nFoo line2\nFoo line3"}));
- // Verify no match with options flag off.
- testRegexFindAggForKey(9, {input: "$text", regex: /^Foo line\d$/}, null);
- // Verify match when flag is on.
- testRegexFindAggForKey(9,
- {input: "$text", regex: /^Foo line\d$/m},
- {"match": "Foo line1", "idx": 0, "captures": []});
-
- // Regex single line option.
- testRegexFindAggForKey(9,
- {input: "$text", regex: "Foo.*line"},
- {"match": "Foo line", "idx": 0, "captures": []});
- testRegexFindAggForKey(
- 9,
- {input: "$text", regex: "Foo.*line", options: "s"},
- {"match": "Foo line1\nFoo line2\nFoo line", "idx": 0, "captures": []});
-
- // Regex extended option.
- testRegexFindAggForKey(9, {input: "$text", regex: "F o o # a comment"}, null);
- testRegexFindAggForKey(9,
- {input: "$text", regex: "F o o # a comment", options: "x"},
- {"match": "Foo", "idx": 0, "captures": []});
- testRegexFindAggForKey(
- 9,
- {input: "$text", regex: "F o o # a comment \n\n# ignored", options: "x"},
- {"match": "Foo", "idx": 0, "captures": []});
- testRegexFindAggForKey(9,
- {input: "$text", regex: "(F o o) # a comment", options: "x"},
- {"match": "Foo", "idx": 0, "captures": ["Foo"]});
-
- // Regex pattern from a document field value.
- assert.commandWorked(coll.insert({_id: 10, text: "Simple Value", pattern: "(m(p))"}));
- testRegexFindAggForKey(10,
- {input: "$text", regex: "$pattern"},
- {"match": "mp", "idx": 2, "captures": ["mp", "p"]});
- assert.commandWorked(coll.insert({_id: 11, text: "OtherText", pattern: /(T(e))xt$/}));
- testRegexFindAggForKey(11,
- {input: "$text", regex: "$pattern"},
- {"match": "Text", "idx": 5, "captures": ["Te", "e"]});
-
- // 'regex' as object with null characters.
- assert.commandWorked(coll.insert({_id: 12, text: "Null\0 charac\0ters"}));
- testRegexFindAggForKey(12, {input: "$text", regex: /((Null)(\0))( )(charac\0t)/}, {
- "match": "Null\0 charac\0t",
- "idx": 0,
- "captures": ["Null\0", "Null", "\0", " ", "charac\0t"]
- });
- testRegexFindAggForKey(
- 12,
- {input: "$text", regex: /(\x00)( )(charac\x00t)/},
- {"match": "\0 charac\x00t", "idx": 4, "captures": ["\x00", " ", "charac\0t"]});
- // 'regex' as string with escaped null characters.
- testRegexFindAggForKey(12,
- {input: "$text", regex: "l\\0 charac\\0ter.*$"},
- {"match": "l\0 charac\0ters", "idx": 3, "captures": []});
- // No match with null characters in input.
- testRegexFindAggForKey(12, {input: "$text", regex: /Null c/}, null);
- // No match with null characters in regex.
- testRegexFindAggForKey(12, {input: "$text", regex: /Nul\0l/}, null);
-
- // No matches.
- testRegexFindAggForKey(0, {input: "$text", regex: /foo/}, null);
- // Regex null.
- testRegexFindAggForKey(0, {input: "$text", regex: null}, null);
- // Regex not present.
- testRegexFindAggForKey(0, {input: "$text"}, null);
- // Input not present.
- testRegexFindAggForKey(0, {regex: /valid/}, null);
- // Input null.
- testRegexFindAggForKey(0, {input: null, regex: /valid/}, null);
- // Empty object.
- testRegexFindAggForKey(0, {}, null);
- })();
-
- (function testWithStartOptions() {
- coll.drop();
- assert.commandWorked(coll.insert({_id: 2, text: "cafétéria"}));
- assert.commandWorked(coll.insert({_id: 3, text: "ab\ncd"}));
-
- // LIMIT_MATCH option to limit the number of comparisons PCRE does internally.
- testRegexFindAggForKey(2, {input: "$text", regex: "(*LIMIT_MATCH=1)fé"}, null);
- testRegexFindAggForKey(2,
- {input: "$text", regex: "(*LIMIT_MATCH=3)(fé)"},
- {"match": "fé", "idx": 2, "captures": ["fé"]});
-
- // (*LF) will change the feed system to UNIX like and (*CR) to windows like. So '\n' would
- // match '.' with CR but not LF.
- testRegexFindAggForKey(3, {input: "$text", regex: "(*LF)ab.cd"}, null);
- testRegexFindAggForKey(3,
- {input: "$text", regex: "(*CR)ab.cd"},
- {"match": "ab\ncd", "idx": 0, "captures": []});
-
- // Multiple start options.
- testRegexFindAggForKey(2,
- {input: "$text", regex: String.raw `(*LIMIT_MATCH=5)(*UCP)^(\w+)`},
- {"match": "cafétéria", "idx": 0, "captures": ["cafétéria"]});
- testRegexFindAggForKey(
- 2, {input: "$text", regex: String.raw `(*LIMIT_MATCH=1)(*UCP)^(\w+)`}, null);
- })();
-
- (function testWithUnicodeData() {
- coll.drop();
- // Unicode index counting.
- assert.commandWorked(coll.insert({_id: 2, text: "cafétéria"}));
- assert.commandWorked(coll.insert({_id: 3, text: "मा०गो डीबि"}));
- testRegexFindAggForKey(
- 2, {input: "$text", regex: "té"}, {"match": "té", "idx": 4, "captures": []});
- testRegexFindAggForKey(
- 3, {input: "$text", regex: /म/}, {"match": "म", "idx": 0, "captures": []});
- // Unicode with capture group.
- testRegexFindAggForKey(3,
- {input: "$text", regex: /(गो )/},
- {"match": "गो ", "idx": 3, "captures": ["गो "]});
- // Test that regexes support Unicode character properties.
- testRegexFindAggForKey(2, {input: "$text", regex: String.raw `\p{Hangul}`}, null);
- testRegexFindAggForKey(2,
- {input: "$text", regex: String.raw `\p{Latin}+$`},
- {"match": "cafétéria", "idx": 0, "captures": []});
- // Test that the (*UTF) and (*UTF8) options are accepted for unicode characters.
- assert.commandWorked(coll.insert({_id: 12, text: "༢༣༤༤༤༥12༥A"}));
- testRegexFindAggForKey(
- 12, {input: "$text", regex: "(*UTF8)༤"}, {"match": "༤", "idx": 2, "captures": []});
- testRegexFindAggForKey(
- 12, {input: "$text", regex: "(*UTF)༤"}, {"match": "༤", "idx": 2, "captures": []});
- // For ASCII characters.
- assert.commandWorked(coll.insert({_id: 4, text: "123444"}));
- testRegexFindAggForKey(4,
- {input: "$text", regex: "(*UTF8)(44)"},
- {"match": "44", "idx": 3, "captures": ["44"]});
- testRegexFindAggForKey(4,
- {input: "$text", regex: "(*UTF)(44)"},
- {"match": "44", "idx": 3, "captures": ["44"]});
-
- // When the (*UCP) option is specified, Unicode "word" characters are included in the '\w'
- // character type.
- testRegexFindAggForKey(12,
- {input: "$text", regex: String.raw `(*UCP)^(\w+)`},
- {"match": "༢༣༤༤༤༥12༥A", "idx": 0, "captures": ["༢༣༤༤༤༥12༥A"]});
- // When the (*UCP) option is specified, [:digit:] becomes \p{N} and matches all Unicode
- // decimal digit characters.
- testRegexFindAggForKey(12,
- {input: "$text", regex: "(*UCP)^[[:digit:]]+"},
- {"match": "༢༣༤༤༤༥12༥", "idx": 0, "captures": []});
- testRegexFindAggForKey(12, {input: "$text", regex: "(*UCP)[[:digit:]]+$"}, null);
- // When the (*UCP) option is specified, [:alpha:] becomes \p{L} and matches all Unicode
- // alphabetic characters.
- assert.commandWorked(coll.insert({_id: 13, text: "박정수AB"}));
- testRegexFindAggForKey(13,
- {input: "$text", regex: String.raw `(*UCP)^[[:alpha:]]+`},
- {"match": "박정수AB", "idx": 0, "captures": []});
-
- // No match when options are not set.
- testRegexFindAggForKey(12, {input: "$text", regex: String.raw `^(\w+)`}, null);
- testRegexFindAggForKey(12, {input: "$text", regex: "^[[:digit:]]"}, null);
- testRegexFindAggForKey(2, {input: "$text", regex: "^[[:alpha:]]+$"}, null);
- })();
-
- (function testErrors() {
- coll.drop();
- assert.commandWorked(coll.insert({text: "string"}));
- // Null object.
- testRegexFindAggException(null, 51103);
- // Incorrect object parameter.
- testRegexFindAggException("incorrect type", 51103);
- // Test malformed regex.
- testRegexFindAggException({input: "$text", regex: "[0-9"}, 51111);
- // Malformed regex because start options not at the beginning.
- testRegexFindAggException({input: "$text", regex: "^(*UCP)[[:alpha:]]+$"}, 51111);
- testRegexFindAggException({input: "$text", regex: "((*UCP)[[:alpha:]]+$)"}, 51111);
- // At least one of the 'input' field is not string.
- assert.commandWorked(coll.insert({a: "string"}));
- assert.commandWorked(coll.insert({a: {b: "object"}}));
- testRegexFindAggException({input: "$a", regex: "valid"}, 51104);
- // 'regex' field is not string or regex.
- testRegexFindAggException({input: "$text", regex: ["incorrect"]}, 51105);
- // 'options' field is not string.
- testRegexFindAggException({input: "$text", regex: "valid", options: 123}, 51106);
- // Incorrect 'options' flag.
- testRegexFindAggException({input: "$text", regex: "valid", options: 'a'}, 51108);
- // 'options' are case-sensitive.
- testRegexFindAggException({input: "$text", regex: "valid", options: "I"}, 51108);
- // Options specified in both 'regex' and 'options'.
- testRegexFindAggException({input: "$text", regex: /(m(p))/i, options: "i"}, 51107);
- testRegexFindAggException({input: "$text", regex: /(m(p))/i, options: "x"}, 51107);
- testRegexFindAggException({input: "$text", regex: /(m(p))/m, options: ""}, 51107);
- // 'regex' as string with null characters.
- testRegexFindAggException({input: "$text", regex: "sasd\0", options: "i"}, 51109);
- testRegexFindAggException({input: "$text", regex: "sa\x00sd", options: "i"}, 51109);
- // 'options' as string with null characters.
- testRegexFindAggException({input: "$text", regex: /(m(p))/, options: "i\0"}, 51110);
- testRegexFindAggException({input: "$text", regex: /(m(p))/, options: "i\x00"}, 51110);
-
- })();
-
- (function testMultipleMatches() {
- coll.drop();
- assert.commandWorked(coll.insert({a: "string1"}));
- assert.commandWorked(coll.insert({a: "string2"}));
- // Both match.
- testRegexFindAgg({input: "$a", regex: "(^str.*)"}, [
- {"matches": {"match": "string1", "idx": 0, "captures": ["string1"]}},
- {"matches": {"match": "string2", "idx": 0, "captures": ["string2"]}}
- ]);
- // Only one match.
- testRegexFindAgg({input: "$a", regex: "(^.*[0-1]$)"}, [
- {"matches": null},
- {"matches": {"match": "string1", "idx": 0, "captures": ["string1"]}}
- ]);
- // None match.
- testRegexFindAgg({input: "$a", regex: "(^.*[3-9]$)"},
- [{"matches": null}, {"matches": null}]);
- })();
-
- (function testInsideCondOperator() {
- coll.drop();
- assert.commandWorked(
- coll.insert({_id: 0, level: "Public Knowledge", info: "Company Name"}));
- assert.commandWorked(
- coll.insert({_id: 1, level: "Private Information", info: "Company Secret"}));
-
- const result =
- coll.aggregate([{
- "$project": {
- "information": {
- "$cond": [
- {
- "$eq":
- [{"$regexFind": {input: "$level", regex: /public/i}}, null]
- },
- "REDACTED",
- "$info"
- ]
- }
- }
- }])
- .toArray();
- assert.eq(result, [
- {"_id": 0, "information": "Company Name"},
- {"_id": 1, "information": "REDACTED"},
- ]);
- })();
-}());
diff --git a/src/mongo/db/pipeline/expression.cpp b/src/mongo/db/pipeline/expression.cpp
index d0d54cd7e14..f2bd565989e 100644
--- a/src/mongo/db/pipeline/expression.cpp
+++ b/src/mongo/db/pipeline/expression.cpp
@@ -5661,138 +5661,248 @@ Value ExpressionConvert::performConversion(BSONType targetType, Value inputValue
namespace {
-Value generateRegexCapturesAndMatches(StringData pattern,
- const int numCaptures,
- const pcrecpp::RE_Options& options,
- StringData input,
- int startBytePos,
- int startCodePointPos) {
-
- const auto pcreOptions = options.all_options();
- // The first two-thirds of the vector is used to pass back captured substrings' start and limit
- // indexes. The remaining third of the vector is used as workspace by pcre_exec() while matching
- // capturing subpatterns, and is not available for passing back information.
- const size_t sizeOfOVector = (1 + numCaptures) * 3;
- const char* compile_error;
- int eoffset;
-
- // The C++ interface pcreccp.h doesn't have a way to capture the matched string (or the index of
- // the match). So we are using the C interface. First we compile all the regex options to
- // generate pcre object, which will later be used to match against the input string.
- pcre* pcre = pcre_compile(pattern.rawData(), pcreOptions, &compile_error, &eoffset, nullptr);
- if (pcre == nullptr) {
- uasserted(51111, str::stream() << "Invalid Regex: " << compile_error);
- }
-
- // TODO: Evaluate the upper bound for this array and fail the request if numCaptures are higher
- // than the limit (SERVER-37848).
- std::vector<int> outVector(sizeOfOVector);
- const int out = pcre_exec(pcre,
- 0,
- input.rawData(),
- input.size(),
- startBytePos,
- 0, // No need to overwrite the options set during pcre_compile.
- &outVector.front(),
- sizeOfOVector);
- (*pcre_free)(pcre);
- // The 'out' parameter will be zero if outVector's size is not big enough to hold all the
- // captures, which should never be the case.
- invariant(out != 0);
-
- // No match.
- if (out < 0) {
- return Value(BSONNULL);
+class RegexMatchHandler {
+public:
+ RegexMatchHandler(const Value& inputExpr) : _pcre(nullptr), _nullish(false) {
+ _validateInputAndExtractElements(inputExpr);
+ _compile(regex_util::flags2PcreOptions(_options, false).all_options());
}
- // The first and second entires of the outVector have the start and limit indices of the matched
- // string. as byte offsets.
- const int matchStartByteIndex = outVector[0];
- // We iterate through the input string's contents preceding the match index, in order to convert
- // the byte offset to a code point offset.
- for (int byteIx = startBytePos; byteIx < matchStartByteIndex; ++startCodePointPos) {
- byteIx += getCodePointLength(input[byteIx]);
+ ~RegexMatchHandler() {
+ if (_pcre != nullptr) {
+ pcre_free(_pcre);
+ }
}
- StringData matchedStr = input.substr(outVector[0], outVector[1] - outVector[0]);
- std::vector<Value> captures;
- // The next 2 * numCaptures entries hold the start index and limit pairs, for each of the
- // capture groups. We skip the first two elements and start iteration from 3rd element so that
- // we only construct the strings for capture groups.
- for (int i = 0; i < numCaptures; i++) {
- const int start = outVector[2 * (i + 1)];
- const int limit = outVector[2 * (i + 1) + 1];
- captures.push_back(Value(input.substr(start, limit - start)));
- }
+ /**
+ * The function will match '_input' string based on the regex pattern present in '_pcre'. If
+ * there is a match, the function will return a 'Value' object encapsulating the matched string,
+ * the code point index of the matched string and a vector representing all the captured
+ * substrings. The function will also update the parameters 'startBytePos' and
+ * 'startCodePointPos' to the corresponding new indices. If there is no match, the function will
+ * return null 'Value' object.
+ */
+ Value nextMatch(int* startBytePos, int* startCodePointPos) {
+ invariant(startBytePos != nullptr && startCodePointPos != nullptr);
+
+ // Use input as StringData throughout the function to avoid copying the string on 'substr'
+ // calls.
+ StringData input = _input;
+ int execResult = pcre_exec(_pcre,
+ 0,
+ input.rawData(),
+ input.size(),
+ *startBytePos,
+ 0, // No need to overwrite the options set during pcre_compile.
+ &_capturesBuffer.front(),
+ _capturesBuffer.size());
+ // No match.
+ if (execResult < 0) {
+ return Value(BSONNULL);
+ }
+ // The 'execResult' will be zero if _capturesBuffer's size is not big enough to hold all
+ // the captures, which should never be the case.
+ invariant(execResult == _numCaptures + 1);
+
+ // The first and second entries of the '_capturesBuffer' will have the start and limit
+ // indices of the matched string, as byte offsets. '(limit - startIndex)' would be the
+ // length of the captured string.
+ const int matchStartByteIndex = _capturesBuffer[0];
+ StringData matchedStr =
+ input.substr(matchStartByteIndex, _capturesBuffer[1] - matchStartByteIndex);
+ // We iterate through the input string's contents preceding the match index, in order to
+ // convert the byte offset to a code point offset.
+ for (int byteIx = *startBytePos; byteIx < matchStartByteIndex; ++(*startCodePointPos)) {
+ byteIx += getCodePointLength(input[byteIx]);
+ }
+ // Set the start index for match to the new one.
+ *startBytePos = matchStartByteIndex;
+
+ std::vector<Value> captures;
+ captures.reserve(_numCaptures);
+ // The next '2 * numCaptures' entries (after the first two entries) of '_capturesBuffer'
+ // will hold the start index and limit pairs, for each of the capture groups. We skip the
+ // first two elements and start iteration from 3rd element so that we only construct the
+ // strings for capture groups.
+ for (int i = 0; i < _numCaptures; ++i) {
+ const int start = _capturesBuffer[2 * (i + 1)];
+ const int limit = _capturesBuffer[2 * (i + 1) + 1];
+ captures.push_back(Value(input.substr(start, limit - start)));
+ }
- MutableDocument match;
- match.addField("match", Value(matchedStr));
- match.addField("idx", Value(startCodePointPos));
- match.addField("captures", Value(captures));
- return match.freezeToValue();
-}
+ MutableDocument match;
+ match.addField("match", Value(matchedStr));
+ match.addField("idx", Value(*startCodePointPos));
+ match.addField("captures", Value(captures));
+ return match.freezeToValue();
+ }
-} // namespace
+ int numCaptures() {
+ return _numCaptures;
+ }
-Value ExpressionRegexFind::evaluate(const Document& root) const {
+ bool nullish() {
+ return _nullish;
+ }
- const Value expr = vpOperand[0]->evaluate(root);
- uassert(51103,
- str::stream() << "$regexFind expects an object of named arguments, but found type "
- << expr.getType(),
- !expr.nullish() && expr.getType() == BSONType::Object);
- Value textInput = expr.getDocument().getField("input");
- Value regexPattern = expr.getDocument().getField("regex");
- Value regexOptions = expr.getDocument().getField("options");
-
- uassert(51104,
- "input field should be of type string",
- textInput.nullish() || textInput.getType() == BSONType::String);
- uassert(51105,
- "regex field should be of type string or regex",
- regexPattern.nullish() || regexPattern.getType() == BSONType::String ||
- regexPattern.getType() == BSONType::RegEx);
- uassert(51106,
- "options should be of type string",
- regexOptions.nullish() || regexOptions.getType() == BSONType::String);
- if (textInput.nullish() || regexPattern.nullish()) {
- return Value(BSONNULL);
+ StringData getInput() {
+ return _input;
}
- StringData pattern, optionFlags;
- // The 'regex' field can be a RegEx object with its own options/options specified separately...
- if (regexPattern.getType() == BSONType::RegEx) {
- StringData regexFlags = regexPattern.getRegexFlags();
- pattern = regexPattern.getRegex();
- uassert(
- 51107,
- str::stream() << "Found regex option(s) specified in both 'regex' and 'option' fields",
- regexOptions.nullish() || regexFlags.empty());
- optionFlags = regexOptions.nullish() ? regexFlags : regexOptions.getStringData();
- } else {
- // ... or it can be a string field with options specified separately.
- pattern = regexPattern.getStringData();
+private:
+ RegexMatchHandler(const RegexMatchHandler&) = delete;
+
+ void _compile(const int pcreOptions) {
+ const char* compile_error;
+ int eoffset;
+ // The C++ interface pcrecpp.h doesn't have a way to capture the matched string (or the
+ // index of the match). So we are using the C interface. First we compile all the regex
+ // options to generate pcre object, which will later be used to match against the input
+ // string.
+ _pcre = pcre_compile(_pattern.c_str(), pcreOptions, &compile_error, &eoffset, nullptr);
+ if (_pcre == nullptr) {
+ uasserted(51111, str::stream() << "Invalid Regex: " << compile_error);
+ }
+
+ // Calculate the number of capture groups present in '_pattern' and store in '_numCaptures'.
+ int pcre_retval = pcre_fullinfo(_pcre, NULL, PCRE_INFO_CAPTURECOUNT, &_numCaptures);
+ invariant(pcre_retval == 0);
+
+ // The first two-thirds of the vector is used to pass back captured substrings' start and
+ // limit indexes. The remaining third of the vector is used as workspace by pcre_exec()
+ // while matching capturing subpatterns, and is not available for passing back information.
+ // TODO: Evaluate the upper bound for this array and fail the request if numCaptures are
+ // higher than the limit (SERVER-37848).
+ _capturesBuffer = std::vector<int>((1 + _numCaptures) * 3);
+ }
+
+ void _validateInputAndExtractElements(const Value& inputExpr) {
+ uassert(51103,
+ str::stream() << "$regexFind expects an object of named arguments, but found type "
+ << inputExpr.getType(),
+ inputExpr.getType() == BSONType::Object);
+ Value textInput = inputExpr.getDocument().getField("input");
+ Value regexPattern = inputExpr.getDocument().getField("regex");
+ Value regexOptions = inputExpr.getDocument().getField("options");
+
+ uassert(51104,
+ "'input' field should be of type string",
+ textInput.nullish() || textInput.getType() == BSONType::String);
+ uassert(51105,
+ "'regex' field should be of type string or regex",
+ regexPattern.nullish() || regexPattern.getType() == BSONType::String ||
+ regexPattern.getType() == BSONType::RegEx);
+ uassert(51106,
+ "'options' should be of type string",
+ regexOptions.nullish() || regexOptions.getType() == BSONType::String);
+
+ // If either the text input or regex pattern is nullish, then we consider the operation as a
+ // whole nullish.
+ _nullish = textInput.nullish() || regexPattern.nullish();
+
+ if (textInput.getType() == BSONType::String) {
+ _input = textInput.getString();
+ }
+
+ // The 'regex' field can be a RegEx object and may have its own options...
+ if (regexPattern.getType() == BSONType::RegEx) {
+ StringData regexFlags = regexPattern.getRegexFlags();
+ _pattern = regexPattern.getRegex();
+ uassert(51107,
+ str::stream()
+ << "Found regex option(s) specified in both 'regex' and 'option' fields",
+ regexOptions.nullish() || regexFlags.empty());
+ if (!regexFlags.empty()) {
+ _options = regexFlags.toString();
+ }
+ } else if (regexPattern.getType() == BSONType::String) {
+ // ...or it can be a string field with options specified separately.
+ _pattern = regexPattern.getString();
+ }
+ // If 'options' is non-null, we must extract and validate its contents even if
+ // 'regexPattern' is nullish.
if (!regexOptions.nullish()) {
- optionFlags = regexOptions.getStringData();
+ _options = regexOptions.getString();
}
- }
- uassert(51109,
- "Regular expression cannot contain an embedded null byte",
- pattern.find('\0', 0) == string::npos);
- uassert(51110,
- "Regular expression options string cannot contain an embedded null byte",
- optionFlags.find('\0', 0) == string::npos);
+ uassert(51109,
+ "Regular expression cannot contain an embedded null byte",
+ _pattern.find('\0', 0) == string::npos);
+ uassert(51110,
+ "Regular expression options string cannot contain an embedded null byte",
+ _options.find('\0', 0) == string::npos);
+ }
+
+ pcre* _pcre;
+ // Number of capture groups present in '_pattern'.
+ int _numCaptures;
+ // Holds the start and limit indices of match and captures for the current match.
+ std::vector<int> _capturesBuffer;
+ std::string _input;
+ std::string _pattern;
+ std::string _options;
+ bool _nullish;
+};
- pcrecpp::RE_Options opt = regex_util::flags2PcreOptions(optionFlags, false);
- pcrecpp::RE regex(pattern.rawData(), opt);
- return generateRegexCapturesAndMatches(
- pattern, regex.NumberOfCapturingGroups(), opt, textInput.getStringData(), 0, 0);
-}
+} // namespace
+Value ExpressionRegexFind::evaluate(const Document& root) const {
+
+ RegexMatchHandler regex(vpOperand[0]->evaluate(root));
+ if (regex.nullish()) {
+ return Value(BSONNULL);
+ }
+ int startByteIndex = 0, startCodePointIndex = 0;
+ return regex.nextMatch(&startByteIndex, &startCodePointIndex);
+}
REGISTER_EXPRESSION(regexFind, ExpressionRegexFind::parse);
const char* ExpressionRegexFind::getOpName() const {
return "$regexFind";
}
+Value ExpressionRegexFindAll::evaluate(const Document& root) const {
+
+ std::vector<Value> output;
+ RegexMatchHandler regex(vpOperand[0]->evaluate(root));
+ if (regex.nullish()) {
+ return Value(output);
+ }
+ int startByteIndex = 0, startCodePointIndex = 0;
+ StringData input = regex.getInput();
+
+ // Using do...while loop because, when input is an empty string, we still want to see if there
+ // is a match.
+ do {
+ auto matchObj = regex.nextMatch(&startByteIndex, &startCodePointIndex);
+ if (matchObj.getType() == BSONType::jstNULL) {
+ break;
+ }
+ output.push_back(matchObj);
+ std::string matchStr = matchObj.getDocument().getField("match").getString();
+ if (matchStr.empty()) {
+ // This would only happen if the regex matched an empty string. In this case, even if
+ // the character at startByteIndex matches the regex, we cannot return it since we are
+ // already returning an empty string starting at this index. So we move on to the next
+ // byte index.
+ startByteIndex += getCodePointLength(input[startByteIndex]);
+ ++startCodePointIndex;
+ continue;
+ }
+ // We don't want any overlapping sub-strings. So we move 'startByteIndex' to point to the
+ // byte after 'matchStr'. We move the code point index also correspondingly.
+ startByteIndex += matchStr.size();
+ for (size_t byteIx = 0; byteIx < matchStr.size(); ++startCodePointIndex) {
+ byteIx += getCodePointLength(matchStr[byteIx]);
+ }
+ invariant(startByteIndex > 0 && startCodePointIndex > 0 &&
+ startCodePointIndex <= startByteIndex);
+ } while (static_cast<size_t>(startByteIndex) < input.size());
+ return Value(output);
+}
+
+REGISTER_EXPRESSION(regexFindAll, ExpressionRegexFindAll::parse);
+const char* ExpressionRegexFindAll::getOpName() const {
+ return "$regexFindAll";
+}
+
} // namespace mongo
diff --git a/src/mongo/db/pipeline/expression.h b/src/mongo/db/pipeline/expression.h
index 4aa91a67086..b0949cca3fc 100644
--- a/src/mongo/db/pipeline/expression.h
+++ b/src/mongo/db/pipeline/expression.h
@@ -2093,7 +2093,7 @@ private:
boost::intrusive_ptr<Expression> _onNull;
};
-class ExpressionRegexFind final : public ExpressionFixedArity<ExpressionRegexFind, 1> {
+class ExpressionRegexFind : public ExpressionFixedArity<ExpressionRegexFind, 1> {
public:
explicit ExpressionRegexFind(const boost::intrusive_ptr<ExpressionContext>& expCtx)
: ExpressionFixedArity<ExpressionRegexFind, 1>(expCtx) {}
@@ -2101,4 +2101,13 @@ public:
Value evaluate(const Document& root) const final;
const char* getOpName() const final;
};
+
+class ExpressionRegexFindAll final : public ExpressionFixedArity<ExpressionRegexFindAll, 1> {
+public:
+ explicit ExpressionRegexFindAll(const boost::intrusive_ptr<ExpressionContext>& expCtx)
+ : ExpressionFixedArity<ExpressionRegexFindAll, 1>(expCtx) {}
+
+ Value evaluate(const Document& root) const final;
+ const char* getOpName() const final;
+};
}
diff --git a/src/mongo/db/pipeline/expression_test.cpp b/src/mongo/db/pipeline/expression_test.cpp
index ca254af56c7..c9bc46a2c8b 100644
--- a/src/mongo/db/pipeline/expression_test.cpp
+++ b/src/mongo/db/pipeline/expression_test.cpp
@@ -5950,7 +5950,7 @@ TEST(GetComputedPathsTest, ExpressionMapNotConsideredRenameWithDottedInputPath)
} // namespace GetComputedPathsTest
-namespace ExpressionRegexFindTest {
+namespace ExpressionRegexTest {
TEST(ExpressionRegexFindTest, BasicTest) {
Value input(fromjson("{input: 'asdf', regex: '^as' }"));
@@ -5979,11 +5979,64 @@ TEST(ExpressionRegexFindTest, FailureCase) {
intrusive_ptr<ExpressionContextForTest> expCtx(new ExpressionContextForTest());
ExpressionRegexFind regexF(expCtx);
regexF.addOperand(ExpressionConstant::create(expCtx, input));
- ASSERT_THROWS(regexF.evaluate(Document()), DBException);
+ ASSERT_THROWS_CODE(regexF.evaluate(Document()), DBException, 51105);
}
+TEST(ExpressionRegexFindAllTest, MultipleMatches) {
+ Value input(fromjson("{input: 'a1b2c3', regex: '([a-c][1-3])' }"));
+ std::vector<Value> expectedOut = {Value(fromjson("{match: 'a1', idx:0, captures:['a1']}")),
+ Value(fromjson("{match: 'b2', idx:2, captures:['b2']}")),
+ Value(fromjson("{match: 'c3', idx:4, captures:['c3']}"))};
+ intrusive_ptr<ExpressionContextForTest> expCtx(new ExpressionContextForTest());
+ ExpressionRegexFindAll regexF(expCtx);
+ regexF.addOperand(ExpressionConstant::create(expCtx, input));
+ Value output = regexF.evaluate(Document());
+ ASSERT_VALUE_EQ(output, Value(expectedOut));
+}
+
+TEST(ExpressionRegexFindAllTest, NoMatch) {
+ Value input(fromjson("{input: 'a1b2c3', regex: 'ab' }"));
+ intrusive_ptr<ExpressionContextForTest> expCtx(new ExpressionContextForTest());
+ ExpressionRegexFindAll regexF(expCtx);
+ regexF.addOperand(ExpressionConstant::create(expCtx, input));
+ Value output = regexF.evaluate(Document());
+ ASSERT_VALUE_EQ(output, Value(std::vector<Value>()));
+}
+
+TEST(ExpressionRegexFindAllTest, FailureCase) {
+ Value input(fromjson("{input: 'FirstLine\\nSecondLine', regex: '[0-9'}"));
+ intrusive_ptr<ExpressionContextForTest> expCtx(new ExpressionContextForTest());
+ ExpressionRegexFindAll regexF(expCtx);
+ regexF.addOperand(ExpressionConstant::create(expCtx, input));
+ ASSERT_THROWS_CODE(regexF.evaluate(Document()), DBException, 51111);
+}
+
+TEST(ExpressionRegexFindAllTest, InvalidUTF8InInput) {
+ std::string inputField = "1234 ";
+ // Append an invalid UTF-8 character.
+ inputField += static_cast<char>(0xE5);
+ inputField += " 1234";
+ Value input(fromjson("{input: '" + inputField + "', regex: '[0-9]'}"));
+ intrusive_ptr<ExpressionContextForTest> expCtx(new ExpressionContextForTest());
+ ExpressionRegexFindAll regexF(expCtx);
+ regexF.addOperand(ExpressionConstant::create(expCtx, input));
+ // Verify no match if there is an invalid UTF-8 character in input.
+ ASSERT_VALUE_EQ(regexF.evaluate(Document()), Value(std::vector<Value>()));
+}
+
+TEST(ExpressionRegexFindAllTest, InvalidUTF8InRegex) {
+ std::string regexField = "1234 ";
+ // Append an invalid UTF-8 character.
+ regexField += static_cast<char>(0xE5);
+ Value input(fromjson("{input: '123456', regex: '" + regexField + "'}"));
+ intrusive_ptr<ExpressionContextForTest> expCtx(new ExpressionContextForTest());
+ ExpressionRegexFindAll regexF(expCtx);
+ regexF.addOperand(ExpressionConstant::create(expCtx, input));
+ // Verify that PCRE will error if REGEX is not a valid UTF-8.
+ ASSERT_THROWS_CODE(regexF.evaluate(Document()), DBException, 51111);
+}
-} // namespace ExpressionRegexFindTest
+} // namespace ExpressionRegexTest
class All : public Suite {
public: