/** * Test regexes with various Unicode options. */ (function() { "use strict"; const coll = db.getCollection("regex_unicode"); coll.drop(); // Populate the collection with strings containing ASCII and non-ASCII characters. let docAllAscii = {_id: 0, text: "kyle"}; let docNoAscii = {_id: 1, text: "박정수"}; let docMixed = {_id: 2, text: "suárez"}; [docAllAscii, docNoAscii, docMixed].forEach((doc) => assert.commandWorked(coll.insert(doc))); /** * Helper function that asserts that a find command with a filter on the "text" field using * 'regex' returns 'expected' when sorting by _id ascending. */ function assertFindResultsEq(regex, expected) { const res = coll.find({text: {$regex: regex}}).sort({_id: 1}).toArray(); const errfn = `Regex query "${regex}" returned ${tojson(res)} ` + `but expected ${tojson(expected)}`; assert.eq(res, expected, errfn); } // Sanity check on exact characters. assertFindResultsEq("y", [docAllAscii]); assertFindResultsEq("e", [docAllAscii, docMixed]); assertFindResultsEq("á", [docMixed]); assertFindResultsEq("정", [docNoAscii]); // Test that the (*UTF) and (*UTF8) options are accepted. assertFindResultsEq("(*UTF)e", [docAllAscii, docMixed]); assertFindResultsEq("(*UTF)á", [docMixed]); assertFindResultsEq("(*UTF)정", [docNoAscii]); assertFindResultsEq("(*UTF8)e", [docAllAscii, docMixed]); assertFindResultsEq("(*UTF8)á", [docMixed]); assertFindResultsEq("(*UTF8)정", [docNoAscii]); // Test that regexes support Unicode character properties. assertFindResultsEq(String.raw`\p{Latin}`, [docAllAscii, docMixed]); assertFindResultsEq(String.raw`^\p{Latin}+$`, [docAllAscii, docMixed]); assertFindResultsEq(String.raw`\p{Hangul}`, [docNoAscii]); assertFindResultsEq(String.raw`^\p{Hangul}+$`, [docNoAscii]); assertFindResultsEq(String.raw`^\p{L}+$`, [docAllAscii, docNoAscii, docMixed]); assertFindResultsEq(String.raw`^\p{Xan}+$`, [docAllAscii, docNoAscii, docMixed]); // Tests for the '\w' character type, which matches any "word" character. In the default mode, // characters outside of the ASCII code point range are excluded. // An unanchored regex should match the two documents that contain at least one ASCII character. assertFindResultsEq(String.raw`\w`, [docAllAscii, docMixed]); // This anchored regex will only match the document with exclusively ASCII characters, since the // Unicode character in the mixed document will prevent it from being considered all "word" // characters. assertFindResultsEq(String.raw`^\w+$`, [docAllAscii]); // When the (*UCP) option is specified, Unicode "word" characters are included in the '\w' // character type, so all three documents should match. assertFindResultsEq(String.raw`(*UCP)\w`, [docAllAscii, docNoAscii, docMixed]); assertFindResultsEq(String.raw`(*UCP)^\w+$`, [docAllAscii, docNoAscii, docMixed]); // By default, the [:alpha:] character class matches ASCII alphabetic characters. assertFindResultsEq("[[:alpha:]]", [docAllAscii, docMixed]); assertFindResultsEq("^[[:alpha:]]+$", [docAllAscii]); // When the (*UCP) option is specified, [:alpha:] becomes \p{L} and matches all Unicode // alphabetic characters. assertFindResultsEq("(*UCP)[[:alpha:]]", [docAllAscii, docNoAscii, docMixed]); assertFindResultsEq("(*UCP)^[[:alpha:]]+$", [docAllAscii, docNoAscii, docMixed]); // Drop the collection and repopulate it with numerical characters. coll.drop(); docAllAscii = { _id: 0, text: "02191996" }; docNoAscii = { _id: 1, text: "༢༣༤༥" }; docMixed = { _id: 2, text: "9୩୪୬୯6" }; [docAllAscii, docNoAscii, docMixed].forEach((doc) => assert.commandWorked(coll.insert(doc))); // Sanity check on exact characters. assertFindResultsEq("1", [docAllAscii]); assertFindResultsEq("9", [docAllAscii, docMixed]); assertFindResultsEq("୪", [docMixed]); assertFindResultsEq("༣", [docNoAscii]); // Test that the regexes are matched by the numeric Unicode character property. assertFindResultsEq(String.raw`^\p{N}+$`, [docAllAscii, docNoAscii, docMixed]); assertFindResultsEq(String.raw`^\p{Xan}+$`, [docAllAscii, docNoAscii, docMixed]); // Tests for the '\d' character type, which matches any "digit" character. In the default mode, // characters outside of the ASCII code point range are excluded. // An unanchored regex should match the two documents that contain at least one ASCII character. assertFindResultsEq(String.raw`\d`, [docAllAscii, docMixed]); // This anchored regex will only match the document with exclusively ASCII characters, since the // Unicode character in the mixed document will prevent it from being considered all "digit" // characters. assertFindResultsEq(String.raw`^\d+$`, [docAllAscii]); // When the (*UCP) option is specified, Unicode "digit" characters are included in the '\d' // character type, so all three documents should match. assertFindResultsEq(String.raw`(*UCP)\d`, [docAllAscii, docNoAscii, docMixed]); assertFindResultsEq(String.raw`(*UCP)^\d+$`, [docAllAscii, docNoAscii, docMixed]); // By default, the [:digit:] character class matches ASCII decimal digit characters. assertFindResultsEq("[[:digit:]]", [docAllAscii, docMixed]); assertFindResultsEq("^[[:digit:]]+$", [docAllAscii]); // When the (*UCP) option is specified, [:digit:] becomes \p{N} and matches all Unicode // decimal digit characters. assertFindResultsEq("(*UCP)[[:digit:]]", [docAllAscii, docNoAscii, docMixed]); assertFindResultsEq("(*UCP)^[[:digit:]]+$", [docAllAscii, docNoAscii, docMixed]); }());