summary | refs | log | tree | commit | diff
path: root/jstests/core/regex_unicode.js
diff options
context:
space:
mode:
Diffstat (limited to 'jstests/core/regex_unicode.js')
-rw-r--r-- jstests/core/regex_unicode.js 227
1 files changed, 118 insertions, 109 deletions
diff --git a/jstests/core/regex_unicode.js b/jstests/core/regex_unicode.js
index 32a3d177831..2befd6f700c 100644
--- a/jstests/core/regex_unicode.js
+++ b/jstests/core/regex_unicode.js
@@ -2,113 +2,122 @@
* Test regexes with various Unicode options.
*/
(function() {
- "use strict";
-
- const coll = db.getCollection("regex_unicode");
- coll.drop();
-
- // Populate the collection with strings containing ASCII and non-ASCII characters.
- let docAllAscii = {_id: 0, text: "kyle"};
- let docNoAscii = {_id: 1, text: "박정수"};
- let docMixed = {_id: 2, text: "suárez"};
- [docAllAscii, docNoAscii, docMixed].forEach((doc) => assert.commandWorked(coll.insert(doc)));
-
- /**
- * Helper function that asserts that a find command with a filter on the "text" field using
- * 'regex' returns 'expected' when sorting by _id ascending.
- */
- function assertFindResultsEq(regex, expected) {
- const res = coll.find({text: {$regex: regex}}).sort({_id: 1}).toArray();
- const errfn =
- `Regex query "${regex}" returned ${tojson(res)} ` + `but expected ${tojson(expected)}`;
- assert.eq(res, expected, errfn);
- }
-
- // Sanity check on exact characters.
- assertFindResultsEq("y", [docAllAscii]);
- assertFindResultsEq("e", [docAllAscii, docMixed]);
- assertFindResultsEq("á", [docMixed]);
- assertFindResultsEq("정", [docNoAscii]);
-
- // Test that the (*UTF) and (*UTF8) options are accepted.
- assertFindResultsEq("(*UTF)e", [docAllAscii, docMixed]);
- assertFindResultsEq("(*UTF)á", [docMixed]);
- assertFindResultsEq("(*UTF)정", [docNoAscii]);
- assertFindResultsEq("(*UTF8)e", [docAllAscii, docMixed]);
- assertFindResultsEq("(*UTF8)á", [docMixed]);
- assertFindResultsEq("(*UTF8)정", [docNoAscii]);
-
- // Test that regexes support Unicode character properties.
- assertFindResultsEq(String.raw `\p{Latin}`, [docAllAscii, docMixed]);
- assertFindResultsEq(String.raw `^\p{Latin}+$`, [docAllAscii, docMixed]);
- assertFindResultsEq(String.raw `\p{Hangul}`, [docNoAscii]);
- assertFindResultsEq(String.raw `^\p{Hangul}+$`, [docNoAscii]);
- assertFindResultsEq(String.raw `^\p{L}+$`, [docAllAscii, docNoAscii, docMixed]);
- assertFindResultsEq(String.raw `^\p{Xan}+$`, [docAllAscii, docNoAscii, docMixed]);
-
- // Tests for the '\w' character type, which matches any "word" character. In the default mode,
- // characters outside of the ASCII code point range are excluded.
-
- // An unanchored regex should match the two documents that contain at least one ASCII character.
- assertFindResultsEq(String.raw `\w`, [docAllAscii, docMixed]);
-
- // This anchored regex will only match the document with exclusively ASCII characters, since the
- // Unicode character in the mixed document will prevent it from being considered all "word"
- // characters.
- assertFindResultsEq(String.raw `^\w+$`, [docAllAscii]);
-
- // When the (*UCP) option is specified, Unicode "word" characters are included in the '\w'
- // character type, so all three documents should match.
- assertFindResultsEq(String.raw `(*UCP)\w`, [docAllAscii, docNoAscii, docMixed]);
- assertFindResultsEq(String.raw `(*UCP)^\w+$`, [docAllAscii, docNoAscii, docMixed]);
-
- // By default, the [:alpha:] character class matches ASCII alphabetic characters.
- assertFindResultsEq("[[:alpha:]]", [docAllAscii, docMixed]);
- assertFindResultsEq("^[[:alpha:]]+$", [docAllAscii]);
-
- // When the (*UCP) option is specified, [:alpha:] becomes \p{L} and matches all Unicode
- // alphabetic characters.
- assertFindResultsEq("(*UCP)[[:alpha:]]", [docAllAscii, docNoAscii, docMixed]);
- assertFindResultsEq("(*UCP)^[[:alpha:]]+$", [docAllAscii, docNoAscii, docMixed]);
-
- // Drop the collection and repopulate it with numerical characters.
- coll.drop();
- docAllAscii = {_id: 0, text: "02191996"};
- docNoAscii = {_id: 1, text: "༢༣༤༥"};
- docMixed = {_id: 2, text: "9୩୪୬୯6"};
- [docAllAscii, docNoAscii, docMixed].forEach((doc) => assert.commandWorked(coll.insert(doc)));
-
- // Sanity check on exact characters.
- assertFindResultsEq("1", [docAllAscii]);
- assertFindResultsEq("9", [docAllAscii, docMixed]);
- assertFindResultsEq("୪", [docMixed]);
- assertFindResultsEq("༣", [docNoAscii]);
-
- // Test that the regexes are matched by the numeric Unicode character property.
- assertFindResultsEq(String.raw `^\p{N}+$`, [docAllAscii, docNoAscii, docMixed]);
- assertFindResultsEq(String.raw `^\p{Xan}+$`, [docAllAscii, docNoAscii, docMixed]);
-
- // Tests for the '\d' character type, which matches any "digit" character. In the default mode,
- // characters outside of the ASCII code point range are excluded.
- // An unanchored regex should match the two documents that contain at least one ASCII character.
- assertFindResultsEq(String.raw `\d`, [docAllAscii, docMixed]);
-
- // This anchored regex will only match the document with exclusively ASCII characters, since the
- // Unicode character in the mixed document will prevent it from being considered all "digit"
- // characters.
- assertFindResultsEq(String.raw `^\d+$`, [docAllAscii]);
-
- // When the (*UCP) option is specified, Unicode "digit" characters are included in the '\d'
- // character type, so all three documents should match.
- assertFindResultsEq(String.raw `(*UCP)\d`, [docAllAscii, docNoAscii, docMixed]);
- assertFindResultsEq(String.raw `(*UCP)^\d+$`, [docAllAscii, docNoAscii, docMixed]);
-
- // By default, the [:digit:] character class matches ASCII decimal digit characters.
- assertFindResultsEq("[[:digit:]]", [docAllAscii, docMixed]);
- assertFindResultsEq("^[[:digit:]]+$", [docAllAscii]);
-
- // When the (*UCP) option is specified, [:digit:] becomes \p{N} and matches all Unicode
- // decimal digit characters.
- assertFindResultsEq("(*UCP)[[:digit:]]", [docAllAscii, docNoAscii, docMixed]);
- assertFindResultsEq("(*UCP)^[[:digit:]]+$", [docAllAscii, docNoAscii, docMixed]);
+"use strict";
+
+const coll = db.getCollection("regex_unicode");
+// Drop any existing collection with this name so that only the documents inserted below are
+// present when the queries run.
+coll.drop();
+
+// Populate the collection with strings containing ASCII and non-ASCII characters.
+// The documents are declared with 'let' because they are reassigned later in this test, when
+// the collection is repopulated with numerical characters.
+let docAllAscii = {_id: 0, text: "kyle"};
+let docNoAscii = {_id: 1, text: "박정수"};
+let docMixed = {_id: 2, text: "suárez"};
+[docAllAscii, docNoAscii, docMixed].forEach((doc) => assert.commandWorked(coll.insert(doc)));
+
+/**
+ * Helper function that asserts that a find command with a filter on the "text" field using
+ * 'regex' returns 'expected' when sorting by _id ascending.
+ *
+ * @param {string} regex - pattern used as the $regex value of the filter on "text".
+ * @param {Object[]} expected - documents the query should return, in ascending _id order.
+ */
+function assertFindResultsEq(regex, expected) {
+ // Sort by _id so the result order is fixed and can be compared to 'expected' directly.
+ const res = coll.find({text: {$regex: regex}}).sort({_id: 1}).toArray();
+ // Despite its name, 'errfn' is the failure message string handed to assert.eq().
+ const errfn = `Regex query "${regex}" returned ${tojson(res)} ` +
+ `but expected ${tojson(expected)}`;
+ assert.eq(res, expected, errfn);
+}
+
+// Sanity check on exact characters.
+assertFindResultsEq("y", [docAllAscii]);
+assertFindResultsEq("e", [docAllAscii, docMixed]);
+assertFindResultsEq("á", [docMixed]);
+assertFindResultsEq("정", [docNoAscii]);
+
+// Test that the (*UTF) and (*UTF8) options are accepted. Each prefixed pattern is expected to
+// return the same documents as the corresponding unprefixed pattern above.
+assertFindResultsEq("(*UTF)e", [docAllAscii, docMixed]);
+assertFindResultsEq("(*UTF)á", [docMixed]);
+assertFindResultsEq("(*UTF)정", [docNoAscii]);
+assertFindResultsEq("(*UTF8)e", [docAllAscii, docMixed]);
+assertFindResultsEq("(*UTF8)á", [docMixed]);
+assertFindResultsEq("(*UTF8)정", [docNoAscii]);
+
+// Test that regexes support Unicode character properties. String.raw is used so that the
+// backslash in sequences such as '\p' is kept in the pattern string instead of being consumed
+// as a JavaScript escape. '\p{L}' matches any letter and '\p{Xan}' matches any alphanumeric
+// character, so both are expected to match all three documents.
+assertFindResultsEq(String.raw`\p{Latin}`, [docAllAscii, docMixed]);
+assertFindResultsEq(String.raw`^\p{Latin}+$`, [docAllAscii, docMixed]);
+assertFindResultsEq(String.raw`\p{Hangul}`, [docNoAscii]);
+assertFindResultsEq(String.raw`^\p{Hangul}+$`, [docNoAscii]);
+assertFindResultsEq(String.raw`^\p{L}+$`, [docAllAscii, docNoAscii, docMixed]);
+assertFindResultsEq(String.raw`^\p{Xan}+$`, [docAllAscii, docNoAscii, docMixed]);
+
+// Tests for the '\w' character type, which matches any "word" character. In the default mode,
+// characters outside of the ASCII code point range are excluded.
+
+// An unanchored regex should match the two documents that contain at least one ASCII character.
+assertFindResultsEq(String.raw`\w`, [docAllAscii, docMixed]);
+
+// This anchored regex will only match the document with exclusively ASCII characters, since the
+// Unicode character in the mixed document will prevent it from being considered all "word"
+// characters.
+assertFindResultsEq(String.raw`^\w+$`, [docAllAscii]);
+
+// When the (*UCP) option is specified, Unicode "word" characters are included in the '\w'
+// character type, so all three documents should match.
+assertFindResultsEq(String.raw`(*UCP)\w`, [docAllAscii, docNoAscii, docMixed]);
+assertFindResultsEq(String.raw`(*UCP)^\w+$`, [docAllAscii, docNoAscii, docMixed]);
+
+// By default, the [:alpha:] character class matches ASCII alphabetic characters.
+assertFindResultsEq("[[:alpha:]]", [docAllAscii, docMixed]);
+assertFindResultsEq("^[[:alpha:]]+$", [docAllAscii]);
+
+// When the (*UCP) option is specified, [:alpha:] becomes \p{L} and matches all Unicode
+// alphabetic characters.
+assertFindResultsEq("(*UCP)[[:alpha:]]", [docAllAscii, docNoAscii, docMixed]);
+assertFindResultsEq("(*UCP)^[[:alpha:]]+$", [docAllAscii, docNoAscii, docMixed]);
+
+// Drop the collection and repopulate it with numerical characters. As before, docAllAscii
+// contains only ASCII characters, docNoAscii contains only non-ASCII characters, and docMixed
+// contains both.
+coll.drop();
+docAllAscii = {
+ _id: 0,
+ text: "02191996"
+};
+docNoAscii = {
+ _id: 1,
+ text: "༢༣༤༥"
+};
+docMixed = {
+ _id: 2,
+ text: "9୩୪୬୯6"
+};
+[docAllAscii, docNoAscii, docMixed].forEach((doc) => assert.commandWorked(coll.insert(doc)));
+
+// Sanity check on exact characters.
+assertFindResultsEq("1", [docAllAscii]);
+assertFindResultsEq("9", [docAllAscii, docMixed]);
+assertFindResultsEq("୪", [docMixed]);
+assertFindResultsEq("༣", [docNoAscii]);
+
+// Test that the regexes are matched by the numeric Unicode character property.
+assertFindResultsEq(String.raw`^\p{N}+$`, [docAllAscii, docNoAscii, docMixed]);
+assertFindResultsEq(String.raw`^\p{Xan}+$`, [docAllAscii, docNoAscii, docMixed]);
+
+// Tests for the '\d' character type, which matches any "digit" character. In the default mode,
+// characters outside of the ASCII code point range are excluded.
+// An unanchored regex should match the two documents that contain at least one ASCII character.
+assertFindResultsEq(String.raw`\d`, [docAllAscii, docMixed]);
+
+// This anchored regex will only match the document with exclusively ASCII characters, since the
+// Unicode character in the mixed document will prevent it from being considered all "digit"
+// characters.
+assertFindResultsEq(String.raw`^\d+$`, [docAllAscii]);
+
+// When the (*UCP) option is specified, Unicode "digit" characters are included in the '\d'
+// character type, so all three documents should match.
+assertFindResultsEq(String.raw`(*UCP)\d`, [docAllAscii, docNoAscii, docMixed]);
+assertFindResultsEq(String.raw`(*UCP)^\d+$`, [docAllAscii, docNoAscii, docMixed]);
+
+// By default, the [:digit:] character class matches ASCII decimal digit characters.
+assertFindResultsEq("[[:digit:]]", [docAllAscii, docMixed]);
+assertFindResultsEq("^[[:digit:]]+$", [docAllAscii]);
+
+// When the (*UCP) option is specified, [:digit:] becomes \p{Nd} and matches all Unicode
+// decimal digit characters.
+assertFindResultsEq("(*UCP)[[:digit:]]", [docAllAscii, docNoAscii, docMixed]);
+assertFindResultsEq("(*UCP)^[[:digit:]]+$", [docAllAscii, docNoAscii, docMixed]);
}());