diff options
author | Mathias Stearn <mathias@10gen.com> | 2016-02-24 16:35:54 -0500 |
---|---|---|
committer | Mathias Stearn <mathias@10gen.com> | 2016-03-11 08:50:18 -0500 |
commit | 6c3157f126bb44ab275325e85de7abee5ce9ad6d (patch) | |
tree | 16189ba18d0febfdd564006c1f03f008527f4f41 /src/mongo | |
parent | 4a35c7184e188354793f16d27e2330b3b5ce7f8f (diff) | |
download | mongo-6c3157f126bb44ab275325e85de7abee5ce9ad6d.tar.gz |
SERVER-19936 Optimize FTS v3 phrase matching
Major changes:
* Use Booyer-Moore algorithm for searching rather than std::search
* All strings are kept in UTF8 rather than going to UTF32.
* Case folding and diacritic removal are done in a single pass.
* Optimize case folding and diacritic removal for ASCII codepoints.
* Combine functionality of codepointIsDiacritic() into
codepointRemoveDiacritics()
Diffstat (limited to 'src/mongo')
-rw-r--r-- | src/mongo/db/fts/fts_unicode_phrase_matcher.cpp | 3 | ||||
-rw-r--r-- | src/mongo/db/fts/unicode/codepoints.h | 4 | ||||
-rw-r--r-- | src/mongo/db/fts/unicode/codepoints_diacritic_map.cpp | 1546 | ||||
-rw-r--r-- | src/mongo/db/fts/unicode/codepoints_test.cpp | 32 | ||||
-rw-r--r-- | src/mongo/db/fts/unicode/gen_casefold_map.py | 45 | ||||
-rw-r--r-- | src/mongo/db/fts/unicode/gen_diacritic_map.py | 5 | ||||
-rw-r--r-- | src/mongo/db/fts/unicode/string.cpp | 162 | ||||
-rw-r--r-- | src/mongo/db/fts/unicode/string.h | 25 | ||||
-rw-r--r-- | src/mongo/db/fts/unicode/string_test.cpp | 132 |
9 files changed, 1842 insertions, 112 deletions
diff --git a/src/mongo/db/fts/fts_unicode_phrase_matcher.cpp b/src/mongo/db/fts/fts_unicode_phrase_matcher.cpp index c467b474be1..5280af593f0 100644 --- a/src/mongo/db/fts/fts_unicode_phrase_matcher.cpp +++ b/src/mongo/db/fts/fts_unicode_phrase_matcher.cpp @@ -57,8 +57,7 @@ bool UnicodeFTSPhraseMatcher::phraseMatches(const string& phrase, matchOptions |= unicode::String::kDiacriticSensitive; } - return unicode::String::substrMatch( - unicode::String(haystack), unicode::String(phrase), matchOptions, _caseFoldMode); + return unicode::String::substrMatch(haystack, phrase, matchOptions, _caseFoldMode); } } // namespace fts diff --git a/src/mongo/db/fts/unicode/codepoints.h b/src/mongo/db/fts/unicode/codepoints.h index 5b1e8e2b2b5..70520a00106 100644 --- a/src/mongo/db/fts/unicode/codepoints.h +++ b/src/mongo/db/fts/unicode/codepoints.h @@ -73,6 +73,10 @@ bool codepointIsDelimiter(char32_t codepoint, DelimiterListLanguage lang); * which code blocks are used), decomposing them to the NFD normalization form, removing any * combining marks, and renormalizing them to the NFC form. The result is a mapping from original * codepoint to a codepoint with no diacritics. + * + * Returns 0 if codepoint is a pure diacritic mark (ie if codepointIsDiacritic() would return true). + * You will need to distinguish this case from the input codepoint being 0 either by explicit + * checking or avoiding a call to this function if codepoint is in the ASCII range (<0x80). */ char32_t codepointRemoveDiacritics(char32_t codepoint); diff --git a/src/mongo/db/fts/unicode/codepoints_diacritic_map.cpp b/src/mongo/db/fts/unicode/codepoints_diacritic_map.cpp index c6b39d328b8..33893b135cd 100644 --- a/src/mongo/db/fts/unicode/codepoints_diacritic_map.cpp +++ b/src/mongo/db/fts/unicode/codepoints_diacritic_map.cpp @@ -35,6 +35,20 @@ namespace unicode { char32_t codepointRemoveDiacritics(char32_t codepoint) { switch (codepoint) { + case 0x5e: + return 0x0; + case 0x60: + return 0x0; + case 0xa8: + return 0x0; + case 0xaf: + return 0x0; + case 0xb4: + return 0x0; + case 0xb7: + return 0x0; + case 0xb8: + return 0x0; case 0xc0: return 0x41; case 0xc1: @@ -539,8 +553,364 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) { return 0x59; case 0x233: return 0x79; + case 0x2b0: + return 0x0; + case 0x2b1: + return 0x0; + case 0x2b2: + return 0x0; + case 0x2b3: + return 0x0; + case 0x2b4: + return 0x0; + case 0x2b5: + return 0x0; + case 0x2b6: + return 0x0; + case 0x2b7: + return 0x0; + case 0x2b8: + return 0x0; + case 0x2b9: + return 0x0; + case 0x2ba: + return 0x0; + case 0x2bb: + return 0x0; + case 0x2bc: + return 0x0; + case 0x2bd: + return 0x0; + case 0x2be: + return 0x0; + case 0x2bf: + return 0x0; + case 0x2c0: + return 0x0; + case 0x2c1: + return 0x0; + case 0x2c2: + return 0x0; + case 0x2c3: + return 0x0; + case 0x2c4: + return 0x0; + case 0x2c5: + return 0x0; + case 0x2c6: + return 0x0; + case 0x2c7: + return 0x0; + case 0x2c8: + return 0x0; + case 0x2c9: + return 0x0; + case 0x2ca: + return 0x0; + case 0x2cb: + return 0x0; + case 0x2cc: + return 0x0; + case 0x2cd: + return 0x0; + case 0x2ce: + return 0x0; + case 0x2cf: + return 0x0; + case 0x2d0: + return 0x0; + case 0x2d1: + return 0x0; + case 0x2d2: + return 0x0; + case 0x2d3: + return 0x0; + case 0x2d4: + return 0x0; + case 0x2d5: + return 0x0; + case 0x2d6: + return 0x0; + case 0x2d7: + return 0x0; + case 0x2d8: + return 0x0; + case 0x2d9: + return 0x0; + case 0x2da: + return 0x0; + case 0x2db: + return 0x0; + case 0x2dc: + return 0x0; + case 0x2dd: + return 0x0; + case 0x2de: + return 0x0; + case 0x2df: + return 0x0; + case 0x2e0: + return 0x0; + case 0x2e1: + return 0x0; + case 0x2e2: + return 0x0; + case 0x2e3: + return 0x0; + case 0x2e4: + return 0x0; + case 0x2e5: + return 0x0; + case 0x2e6: + return 0x0; + case 0x2e7: + return 0x0; + case 0x2e8: + return 0x0; + case 0x2e9: + return 0x0; + case 0x2ea: + return 0x0; + case 0x2eb: + return 0x0; + case 0x2ec: + return 0x0; + case 0x2ed: + return 0x0; + case 0x2ee: + return 0x0; + case 0x2ef: + return 0x0; + case 0x2f0: + return 0x0; + case 0x2f1: + return 0x0; + case 0x2f2: + return 0x0; + case 0x2f3: + return 0x0; + case 0x2f4: + return 0x0; + case 0x2f5: + return 0x0; + case 0x2f6: + return 0x0; + case 0x2f7: + return 0x0; + case 0x2f8: + return 0x0; + case 0x2f9: + return 0x0; + case 0x2fa: + return 0x0; + case 0x2fb: + return 0x0; + case 0x2fc: + return 0x0; + case 0x2fd: + return 0x0; + case 0x2fe: + return 0x0; + case 0x2ff: + return 0x0; + case 0x300: + return 0x0; + case 0x301: + return 0x0; + case 0x302: + return 0x0; + case 0x303: + return 0x0; + case 0x304: + return 0x0; + case 0x305: + return 0x0; + case 0x306: + return 0x0; + case 0x307: + return 0x0; + case 0x308: + return 0x0; + case 0x309: + return 0x0; + case 0x30a: + return 0x0; + case 0x30b: + return 0x0; + case 0x30c: + return 0x0; + case 0x30d: + return 0x0; + case 0x30e: + return 0x0; + case 0x30f: + return 0x0; + case 0x310: + return 0x0; + case 0x311: + return 0x0; + case 0x312: + return 0x0; + case 0x313: + return 0x0; + case 0x314: + return 0x0; + case 0x315: + return 0x0; + case 0x316: + return 0x0; + case 0x317: + return 0x0; + case 0x318: + return 0x0; + case 0x319: + return 0x0; + case 0x31a: + return 0x0; + case 0x31b: + return 0x0; + case 0x31c: + return 0x0; + case 0x31d: + return 0x0; + case 0x31e: + return 0x0; + case 0x31f: + return 0x0; + case 0x320: + return 0x0; + case 0x321: + return 0x0; + case 0x322: + return 0x0; + case 0x323: + return 0x0; + case 0x324: + return 0x0; + case 0x325: + return 0x0; + case 0x326: + return 0x0; + case 0x327: + return 0x0; + case 0x328: + return 0x0; + case 0x329: + return 0x0; + case 0x32a: + return 0x0; + case 0x32b: + return 0x0; + case 0x32c: + return 0x0; + case 0x32d: + return 0x0; + case 0x32e: + return 0x0; + case 0x32f: + return 0x0; + case 0x330: + return 0x0; + case 0x331: + return 0x0; + case 0x332: + return 0x0; + case 0x333: + return 0x0; + case 0x334: + return 0x0; + case 0x335: + return 0x0; + case 0x336: + return 0x0; + case 0x337: + return 0x0; + case 0x338: + return 0x0; + case 0x339: + return 0x0; + case 0x33a: + return 0x0; + case 0x33b: + return 0x0; + case 0x33c: + return 0x0; + case 0x33d: + return 0x0; + case 0x33e: + return 0x0; + case 0x33f: + return 0x0; + case 0x340: + return 0x0; + case 0x341: + return 0x0; + case 0x342: + return 0x0; + case 0x343: + return 0x0; + case 0x344: + return 0x0; + case 0x345: + return 0x0; + case 0x346: + return 0x0; + case 0x347: + return 0x0; + case 0x348: + return 0x0; + case 0x349: + return 0x0; + case 0x34a: + return 0x0; + case 0x34b: + return 0x0; + case 0x34c: + return 0x0; + case 0x34d: + return 0x0; + case 0x34e: + return 0x0; + case 0x350: + return 0x0; + case 0x351: + return 0x0; + case 0x352: + return 0x0; + case 0x353: + return 0x0; + case 0x354: + return 0x0; + case 0x355: + return 0x0; + case 0x356: + return 0x0; + case 0x357: + return 0x0; + case 0x35d: + return 0x0; + case 0x35e: + return 0x0; + case 0x35f: + return 0x0; + case 0x360: + return 0x0; + case 0x361: + return 0x0; + case 0x362: + return 0x0; + case 0x374: + return 0x0; + case 0x375: + return 0x0; + case 0x37a: + return 0x0; case 0x37e: return 0x3b; + case 0x384: + return 0x0; + case 0x385: + return 0x0; case 0x386: return 0x391; case 0x388: @@ -621,6 +991,16 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) { return 0x474; case 0x477: return 0x475; + case 0x483: + return 0x0; + case 0x484: + return 0x0; + case 0x485: + return 0x0; + case 0x486: + return 0x0; + case 0x487: + return 0x0; case 0x4c1: return 0x416; case 0x4c2: @@ -689,12 +1069,314 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) { return 0x42b; case 0x4f9: return 0x44b; + case 0x559: + return 0x0; + case 0x591: + return 0x0; + case 0x592: + return 0x0; + case 0x593: + return 0x0; + case 0x594: + return 0x0; + case 0x595: + return 0x0; + case 0x596: + return 0x0; + case 0x597: + return 0x0; + case 0x598: + return 0x0; + case 0x599: + return 0x0; + case 0x59a: + return 0x0; + case 0x59b: + return 0x0; + case 0x59c: + return 0x0; + case 0x59d: + return 0x0; + case 0x59e: + return 0x0; + case 0x59f: + return 0x0; + case 0x5a0: + return 0x0; + case 0x5a1: + return 0x0; + case 0x5a3: + return 0x0; + case 0x5a4: + return 0x0; + case 0x5a5: + return 0x0; + case 0x5a6: + return 0x0; + case 0x5a7: + return 0x0; + case 0x5a8: + return 0x0; + case 0x5a9: + return 0x0; + case 0x5aa: + return 0x0; + case 0x5ab: + return 0x0; + case 0x5ac: + return 0x0; + case 0x5ad: + return 0x0; + case 0x5ae: + return 0x0; + case 0x5af: + return 0x0; + case 0x5b0: + return 0x0; + case 0x5b1: + return 0x0; + case 0x5b2: + return 0x0; + case 0x5b3: + return 0x0; + case 0x5b4: + return 0x0; + case 0x5b5: + return 0x0; + case 0x5b6: + return 0x0; + case 0x5b7: + return 0x0; + case 0x5b8: + return 0x0; + case 0x5b9: + return 0x0; + case 0x5ba: + return 0x0; + case 0x5bb: + return 0x0; + case 0x5bc: + return 0x0; + case 0x5bd: + return 0x0; + case 0x5bf: + return 0x0; + case 0x5c1: + return 0x0; + case 0x5c2: + return 0x0; + case 0x5c4: + return 0x0; + case 0x64b: + return 0x0; + case 0x64c: + return 0x0; + case 0x64d: + return 0x0; + case 0x64e: + return 0x0; + case 0x64f: + return 0x0; + case 0x650: + return 0x0; + case 0x651: + return 0x0; + case 0x652: + return 0x0; + case 0x657: + return 0x0; + case 0x658: + return 0x0; + case 0x6df: + return 0x0; + case 0x6e0: + return 0x0; + case 0x6e5: + return 0x0; + case 0x6e6: + return 0x0; + case 0x6ea: + return 0x0; + case 0x6eb: + return 0x0; + case 0x6ec: + return 0x0; + case 0x730: + return 0x0; + case 0x731: + return 0x0; + case 0x732: + return 0x0; + case 0x733: + return 0x0; + case 0x734: + return 0x0; + case 0x735: + return 0x0; + case 0x736: + return 0x0; + case 0x737: + return 0x0; + case 0x738: + return 0x0; + case 0x739: + return 0x0; + case 0x73a: + return 0x0; + case 0x73b: + return 0x0; + case 0x73c: + return 0x0; + case 0x73d: + return 0x0; + case 0x73e: + return 0x0; + case 0x73f: + return 0x0; + case 0x740: + return 0x0; + case 0x741: + return 0x0; + case 0x742: + return 0x0; + case 0x743: + return 0x0; + case 0x744: + return 0x0; + case 0x745: + return 0x0; + case 0x746: + return 0x0; + case 0x747: + return 0x0; + case 0x748: + return 0x0; + case 0x749: + return 0x0; + case 0x74a: + return 0x0; + case 0x7a6: + return 0x0; + case 0x7a7: + return 0x0; + case 0x7a8: + return 0x0; + case 0x7a9: + return 0x0; + case 0x7aa: + return 0x0; + case 0x7ab: + return 0x0; + case 0x7ac: + return 0x0; + case 0x7ad: + return 0x0; + case 0x7ae: + return 0x0; + case 0x7af: + return 0x0; + case 0x7b0: + return 0x0; + case 0x7eb: + return 0x0; + case 0x7ec: + return 0x0; + case 0x7ed: + return 0x0; + case 0x7ee: + return 0x0; + case 0x7ef: + return 0x0; + case 0x7f0: + return 0x0; + case 0x7f1: + return 0x0; + case 0x7f2: + return 0x0; + case 0x7f3: + return 0x0; + case 0x7f4: + return 0x0; + case 0x7f5: + return 0x0; + case 0x818: + return 0x0; + case 0x819: + return 0x0; + case 0x8e3: + return 0x0; + case 0x8e4: + return 0x0; + case 0x8e5: + return 0x0; + case 0x8e6: + return 0x0; + case 0x8e7: + return 0x0; + case 0x8e8: + return 0x0; + case 0x8e9: + return 0x0; + case 0x8ea: + return 0x0; + case 0x8eb: + return 0x0; + case 0x8ec: + return 0x0; + case 0x8ed: + return 0x0; + case 0x8ee: + return 0x0; + case 0x8ef: + return 0x0; + case 0x8f0: + return 0x0; + case 0x8f1: + return 0x0; + case 0x8f2: + return 0x0; + case 0x8f3: + return 0x0; + case 0x8f4: + return 0x0; + case 0x8f5: + return 0x0; + case 0x8f6: + return 0x0; + case 0x8f7: + return 0x0; + case 0x8f8: + return 0x0; + case 0x8f9: + return 0x0; + case 0x8fa: + return 0x0; + case 0x8fb: + return 0x0; + case 0x8fc: + return 0x0; + case 0x8fd: + return 0x0; + case 0x8fe: + return 0x0; case 0x929: return 0x928; case 0x931: return 0x930; case 0x934: return 0x933; + case 0x93c: + return 0x0; + case 0x94d: + return 0x0; + case 0x951: + return 0x0; + case 0x952: + return 0x0; + case 0x953: + return 0x0; + case 0x954: + return 0x0; case 0x958: return 0x915; case 0x959: @@ -711,6 +1393,12 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) { return 0x92b; case 0x95f: return 0x92f; + case 0x971: + return 0x0; + case 0x9bc: + return 0x0; + case 0x9cd: + return 0x0; case 0x9dc: return 0x9a1; case 0x9dd: @@ -721,6 +1409,10 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) { return 0xa32; case 0xa36: return 0xa38; + case 0xa3c: + return 0x0; + case 0xa4d: + return 0x0; case 0xa59: return 0xa16; case 0xa5a: @@ -729,14 +1421,444 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) { return 0xa1c; case 0xa5e: return 0xa2b; + case 0xabc: + return 0x0; + case 0xacd: + return 0x0; + case 0xb3c: + return 0x0; + case 0xb4d: + return 0x0; case 0xb5c: return 0xb21; case 0xb5d: return 0xb22; + case 0xbcd: + return 0x0; + case 0xc4d: + return 0x0; + case 0xcbc: + return 0x0; + case 0xccd: + return 0x0; + case 0xd4d: + return 0x0; + case 0xdca: + return 0x0; case 0xdda: return 0xdd9; case 0xddd: return 0xddc; + case 0xe47: + return 0x0; + case 0xe48: + return 0x0; + case 0xe49: + return 0x0; + case 0xe4a: + return 0x0; + case 0xe4b: + return 0x0; + case 0xe4c: + return 0x0; + case 0xe4e: + return 0x0; + case 0xec8: + return 0x0; + case 0xec9: + return 0x0; + case 0xeca: + return 0x0; + case 0xecb: + return 0x0; + case 0xecc: + return 0x0; + case 0xf18: + return 0x0; + case 0xf19: + return 0x0; + case 0xf35: + return 0x0; + case 0xf37: + return 0x0; + case 0xf39: + return 0x0; + case 0xf3e: + return 0x0; + case 0xf3f: + return 0x0; + case 0xf82: + return 0x0; + case 0xf83: + return 0x0; + case 0xf84: + return 0x0; + case 0xf86: + return 0x0; + case 0xf87: + return 0x0; + case 0xfc6: + return 0x0; + case 0x1037: + return 0x0; + case 0x1039: + return 0x0; + case 0x103a: + return 0x0; + case 0x1087: + return 0x0; + case 0x1088: + return 0x0; + case 0x1089: + return 0x0; + case 0x108a: + return 0x0; + case 0x108b: + return 0x0; + case 0x108c: + return 0x0; + case 0x108d: + return 0x0; + case 0x108f: + return 0x0; + case 0x109a: + return 0x0; + case 0x109b: + return 0x0; + case 0x17c9: + return 0x0; + case 0x17ca: + return 0x0; + case 0x17cb: + return 0x0; + case 0x17cc: + return 0x0; + case 0x17cd: + return 0x0; + case 0x17ce: + return 0x0; + case 0x17cf: + return 0x0; + case 0x17d0: + return 0x0; + case 0x17d1: + return 0x0; + case 0x17d2: + return 0x0; + case 0x17d3: + return 0x0; + case 0x17dd: + return 0x0; + case 0x1939: + return 0x0; + case 0x193a: + return 0x0; + case 0x193b: + return 0x0; + case 0x1a75: + return 0x0; + case 0x1a76: + return 0x0; + case 0x1a77: + return 0x0; + case 0x1a78: + return 0x0; + case 0x1a79: + return 0x0; + case 0x1a7a: + return 0x0; + case 0x1a7b: + return 0x0; + case 0x1a7c: + return 0x0; + case 0x1a7f: + return 0x0; + case 0x1ab0: + return 0x0; + case 0x1ab1: + return 0x0; + case 0x1ab2: + return 0x0; + case 0x1ab3: + return 0x0; + case 0x1ab4: + return 0x0; + case 0x1ab5: + return 0x0; + case 0x1ab6: + return 0x0; + case 0x1ab7: + return 0x0; + case 0x1ab8: + return 0x0; + case 0x1ab9: + return 0x0; + case 0x1aba: + return 0x0; + case 0x1abb: + return 0x0; + case 0x1abc: + return 0x0; + case 0x1abd: + return 0x0; + case 0x1b34: + return 0x0; + case 0x1b44: + return 0x0; + case 0x1b6b: + return 0x0; + case 0x1b6c: + return 0x0; + case 0x1b6d: + return 0x0; + case 0x1b6e: + return 0x0; + case 0x1b6f: + return 0x0; + case 0x1b70: + return 0x0; + case 0x1b71: + return 0x0; + case 0x1b72: + return 0x0; + case 0x1b73: + return 0x0; + case 0x1baa: + return 0x0; + case 0x1bab: + return 0x0; + case 0x1c36: + return 0x0; + case 0x1c37: + return 0x0; + case 0x1c78: + return 0x0; + case 0x1c79: + return 0x0; + case 0x1c7a: + return 0x0; + case 0x1c7b: + return 0x0; + case 0x1c7c: + return 0x0; + case 0x1c7d: + return 0x0; + case 0x1cd0: + return 0x0; + case 0x1cd1: + return 0x0; + case 0x1cd2: + return 0x0; + case 0x1cd3: + return 0x0; + case 0x1cd4: + return 0x0; + case 0x1cd5: + return 0x0; + case 0x1cd6: + return 0x0; + case 0x1cd7: + return 0x0; + case 0x1cd8: + return 0x0; + case 0x1cd9: + return 0x0; + case 0x1cda: + return 0x0; + case 0x1cdb: + return 0x0; + case 0x1cdc: + return 0x0; + case 0x1cdd: + return 0x0; + case 0x1cde: + return 0x0; + case 0x1cdf: + return 0x0; + case 0x1ce0: + return 0x0; + case 0x1ce1: + return 0x0; + case 0x1ce2: + return 0x0; + case 0x1ce3: + return 0x0; + case 0x1ce4: + return 0x0; + case 0x1ce5: + return 0x0; + case 0x1ce6: + return 0x0; + case 0x1ce7: + return 0x0; + case 0x1ce8: + return 0x0; + case 0x1ced: + return 0x0; + case 0x1cf4: + return 0x0; + case 0x1cf8: + return 0x0; + case 0x1cf9: + return 0x0; + case 0x1d2c: + return 0x0; + case 0x1d2d: + return 0x0; + case 0x1d2e: + return 0x0; + case 0x1d2f: + return 0x0; + case 0x1d30: + return 0x0; + case 0x1d31: + return 0x0; + case 0x1d32: + return 0x0; + case 0x1d33: + return 0x0; + case 0x1d34: + return 0x0; + case 0x1d35: + return 0x0; + case 0x1d36: + return 0x0; + case 0x1d37: + return 0x0; + case 0x1d38: + return 0x0; + case 0x1d39: + return 0x0; + case 0x1d3a: + return 0x0; + case 0x1d3b: + return 0x0; + case 0x1d3c: + return 0x0; + case 0x1d3d: + return 0x0; + case 0x1d3e: + return 0x0; + case 0x1d3f: + return 0x0; + case 0x1d40: + return 0x0; + case 0x1d41: + return 0x0; + case 0x1d42: + return 0x0; + case 0x1d43: + return 0x0; + case 0x1d44: + return 0x0; + case 0x1d45: + return 0x0; + case 0x1d46: + return 0x0; + case 0x1d47: + return 0x0; + case 0x1d48: + return 0x0; + case 0x1d49: + return 0x0; + case 0x1d4a: + return 0x0; + case 0x1d4b: + return 0x0; + case 0x1d4c: + return 0x0; + case 0x1d4d: + return 0x0; + case 0x1d4e: + return 0x0; + case 0x1d4f: + return 0x0; + case 0x1d50: + return 0x0; + case 0x1d51: + return 0x0; + case 0x1d52: + return 0x0; + case 0x1d53: + return 0x0; + case 0x1d54: + return 0x0; + case 0x1d55: + return 0x0; + case 0x1d56: + return 0x0; + case 0x1d57: + return 0x0; + case 0x1d58: + return 0x0; + case 0x1d59: + return 0x0; + case 0x1d5a: + return 0x0; + case 0x1d5b: + return 0x0; + case 0x1d5c: + return 0x0; + case 0x1d5d: + return 0x0; + case 0x1d5e: + return 0x0; + case 0x1d5f: + return 0x0; + case 0x1d60: + return 0x0; + case 0x1d61: + return 0x0; + case 0x1d62: + return 0x0; + case 0x1d63: + return 0x0; + case 0x1d64: + return 0x0; + case 0x1d65: + return 0x0; + case 0x1d66: + return 0x0; + case 0x1d67: + return 0x0; + case 0x1d68: + return 0x0; + case 0x1d69: + return 0x0; + case 0x1d6a: + return 0x0; + case 0x1dc4: + return 0x0; + case 0x1dc5: + return 0x0; + case 0x1dc6: + return 0x0; + case 0x1dc7: + return 0x0; + case 0x1dc8: + return 0x0; + case 0x1dc9: + return 0x0; + case 0x1dca: + return 0x0; + case 0x1dcb: + return 0x0; + case 0x1dcc: + return 0x0; + case 0x1dcd: + return 0x0; + case 0x1dce: + return 0x0; + case 0x1dcf: + return 0x0; + case 0x1df5: + return 0x0; + case 0x1dfd: + return 0x0; + case 0x1dfe: + return 0x0; + case 0x1dff: + return 0x0; case 0x1e00: return 0x41; case 0x1e01: @@ -1575,8 +2697,16 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) { return 0x391; case 0x1fbc: return 0x391; + case 0x1fbd: + return 0x0; case 0x1fbe: return 0x3b9; + case 0x1fbf: + return 0x0; + case 0x1fc0: + return 0x0; + case 0x1fc1: + return 0x0; case 0x1fc2: return 0x3b7; case 0x1fc3: @@ -1597,6 +2727,12 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) { return 0x397; case 0x1fcc: return 0x397; + case 0x1fcd: + return 0x0; + case 0x1fce: + return 0x0; + case 0x1fcf: + return 0x0; case 0x1fd0: return 0x3b9; case 0x1fd1: @@ -1617,6 +2753,12 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) { return 0x399; case 0x1fdb: return 0x399; + case 0x1fdd: + return 0x0; + case 0x1fde: + return 0x0; + case 0x1fdf: + return 0x0; case 0x1fe0: return 0x3c5; case 0x1fe1: @@ -1643,6 +2785,12 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) { return 0x3a5; case 0x1fec: return 0x3a1; + case 0x1fed: + return 0x0; + case 0x1fee: + return 0x0; + case 0x1fef: + return 0x0; case 0x1ff2: return 0x3c9; case 0x1ff3: @@ -1663,6 +2811,10 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) { return 0x3a9; case 0x1ffc: return 0x3a9; + case 0x1ffd: + return 0x0; + case 0x1ffe: + return 0x0; case 0x2000: return 0x2002; case 0x2001: @@ -1767,6 +2919,26 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) { return 0x3009; case 0x2adc: return 0x2add; + case 0x2cef: + return 0x0; + case 0x2cf0: + return 0x0; + case 0x2cf1: + return 0x0; + case 0x2e2f: + return 0x0; + case 0x302a: + return 0x0; + case 0x302b: + return 0x0; + case 0x302c: + return 0x0; + case 0x302d: + return 0x0; + case 0x302e: + return 0x0; + case 0x302f: + return 0x0; case 0x304c: return 0x304b; case 0x304e: @@ -1819,6 +2991,14 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) { return 0x307b; case 0x3094: return 0x3046; + case 0x3099: + return 0x0; + case 0x309a: + return 0x0; + case 0x309b: + return 0x0; + case 0x309c: + return 0x0; case 0x309e: return 0x309d; case 0x30ac: @@ -1881,8 +3061,138 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) { return 0x30f1; case 0x30fa: return 0x30f2; + case 0x30fc: + return 0x0; case 0x30fe: return 0x30fd; + case 0xa66f: + return 0x0; + case 0xa67c: + return 0x0; + case 0xa67d: + return 0x0; + case 0xa67f: + return 0x0; + case 0xa69c: + return 0x0; + case 0xa69d: + return 0x0; + case 0xa6f0: + return 0x0; + case 0xa6f1: + return 0x0; + case 0xa717: + return 0x0; + case 0xa718: + return 0x0; + case 0xa719: + return 0x0; + case 0xa71a: + return 0x0; + case 0xa71b: + return 0x0; + case 0xa71c: + return 0x0; + case 0xa71d: + return 0x0; + case 0xa71e: + return 0x0; + case 0xa71f: + return 0x0; + case 0xa720: + return 0x0; + case 0xa721: + return 0x0; + case 0xa788: + return 0x0; + case 0xa7f8: + return 0x0; + case 0xa7f9: + return 0x0; + case 0xa8c4: + return 0x0; + case 0xa8e0: + return 0x0; + case 0xa8e1: + return 0x0; + case 0xa8e2: + return 0x0; + case 0xa8e3: + return 0x0; + case 0xa8e4: + return 0x0; + case 0xa8e5: + return 0x0; + case 0xa8e6: + return 0x0; + case 0xa8e7: + return 0x0; + case 0xa8e8: + return 0x0; + case 0xa8e9: + return 0x0; + case 0xa8ea: + return 0x0; + case 0xa8eb: + return 0x0; + case 0xa8ec: + return 0x0; + case 0xa8ed: + return 0x0; + case 0xa8ee: + return 0x0; + case 0xa8ef: + return 0x0; + case 0xa8f0: + return 0x0; + case 0xa8f1: + return 0x0; + case 0xa92b: + return 0x0; + case 0xa92c: + return 0x0; + case 0xa92d: + return 0x0; + case 0xa92e: + return 0x0; + case 0xa953: + return 0x0; + case 0xa9b3: + return 0x0; + case 0xa9c0: + return 0x0; + case 0xa9e5: + return 0x0; + case 0xaa7b: + return 0x0; + case 0xaa7c: + return 0x0; + case 0xaa7d: + return 0x0; + case 0xaabf: + return 0x0; + case 0xaac0: + return 0x0; + case 0xaac1: + return 0x0; + case 0xaac2: + return 0x0; + case 0xaaf6: + return 0x0; + case 0xab5b: + return 0x0; + case 0xab5c: + return 0x0; + case 0xab5d: + return 0x0; + case 0xab5e: + return 0x0; + case 0xab5f: + return 0x0; + case 0xabec: + return 0x0; + case 0xabed: + return 0x0; case 0xf900: return 0x8c48; case 0xf901: @@ -2805,6 +4115,8 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) { return 0x9f8e; case 0xfb1d: return 0x5d9; + case 0xfb1e: + return 0x0; case 0xfb1f: return 0x5f2; case 0xfb2a: @@ -2871,12 +4183,246 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) { return 0x5db; case 0xfb4e: return 0x5e4; + case 0xfe20: + return 0x0; + case 0xfe21: + return 0x0; + case 0xfe22: + return 0x0; + case 0xfe23: + return 0x0; + case 0xfe24: + return 0x0; + case 0xfe25: + return 0x0; + case 0xfe26: + return 0x0; + case 0xfe27: + return 0x0; + case 0xfe28: + return 0x0; + case 0xfe29: + return 0x0; + case 0xfe2a: + return 0x0; + case 0xfe2b: + return 0x0; + case 0xfe2c: + return 0x0; + case 0xfe2d: + return 0x0; + case 0xfe2e: + return 0x0; + case 0xfe2f: + return 0x0; + case 0xff3e: + return 0x0; + case 0xff40: + return 0x0; + case 0xff70: + return 0x0; + case 0xff9e: + return 0x0; + case 0xff9f: + return 0x0; + case 0xffe3: + return 0x0; + case 0x102e0: + return 0x0; + case 0x10ae5: + return 0x0; + case 0x10ae6: + return 0x0; case 0x1109a: return 0x11099; case 0x1109c: return 0x1109b; case 0x110ab: return 0x110a5; + case 0x110b9: + return 0x0; + case 0x110ba: + return 0x0; + case 0x11133: + return 0x0; + case 0x11134: + return 0x0; + case 0x11173: + return 0x0; + case 0x111c0: + return 0x0; + case 0x111ca: + return 0x0; + case 0x111cb: + return 0x0; + case 0x111cc: + return 0x0; + case 0x11235: + return 0x0; + case 0x11236: + return 0x0; + case 0x112e9: + return 0x0; + case 0x112ea: + return 0x0; + case 0x1133c: + return 0x0; + case 0x1134d: + return 0x0; + case 0x11366: + return 0x0; + case 0x11367: + return 0x0; + case 0x11368: + return 0x0; + case 0x11369: + return 0x0; + case 0x1136a: + return 0x0; + case 0x1136b: + return 0x0; + case 0x1136c: + return 0x0; + case 0x11370: + return 0x0; + case 0x11371: + return 0x0; + case 0x11372: + return 0x0; + case 0x11373: + return 0x0; + case 0x11374: + return 0x0; + case 0x114c2: + return 0x0; + case 0x114c3: + return 0x0; + case 0x115bf: + return 0x0; + case 0x115c0: + return 0x0; + case 0x1163f: + return 0x0; + case 0x116b6: + return 0x0; + case 0x116b7: + return 0x0; + case 0x1172b: + return 0x0; + case 0x16af0: + return 0x0; + case 0x16af1: + return 0x0; + case 0x16af2: + return 0x0; + case 0x16af3: + return 0x0; + case 0x16af4: + return 0x0; + case 0x16f8f: + return 0x0; + case 0x16f90: + return 0x0; + case 0x16f91: + return 0x0; + case 0x16f92: + return 0x0; + case 0x16f93: + return 0x0; + case 0x16f94: + return 0x0; + case 0x16f95: + return 0x0; + case 0x16f96: + return 0x0; + case 0x16f97: + return 0x0; + case 0x16f98: + return 0x0; + case 0x16f99: + return 0x0; + case 0x16f9a: + return 0x0; + case 0x16f9b: + return 0x0; + case 0x16f9c: + return 0x0; + case 0x16f9d: + return 0x0; + case 0x16f9e: + return 0x0; + case 0x16f9f: + return 0x0; + case 0x1d167: + return 0x0; + case 0x1d168: + return 0x0; + case 0x1d169: + return 0x0; + case 0x1d16d: + return 0x0; + case 0x1d16e: + return 0x0; + case 0x1d16f: + return 0x0; + case 0x1d170: + return 0x0; + case 0x1d171: + return 0x0; + case 0x1d172: + return 0x0; + case 0x1d17b: + return 0x0; + case 0x1d17c: + return 0x0; + case 0x1d17d: + return 0x0; + case 0x1d17e: + return 0x0; + case 0x1d17f: + return 0x0; + case 0x1d180: + return 0x0; + case 0x1d181: + return 0x0; + case 0x1d182: + return 0x0; + case 0x1d185: + return 0x0; + case 0x1d186: + return 0x0; + case 0x1d187: + return 0x0; + case 0x1d188: + return 0x0; + case 0x1d189: + return 0x0; + case 0x1d18a: + return 0x0; + case 0x1d18b: + return 0x0; + case 0x1d1aa: + return 0x0; + case 0x1d1ab: + return 0x0; + case 0x1d1ac: + return 0x0; + case 0x1d1ad: + return 0x0; + case 0x1e8d0: + return 0x0; + case 0x1e8d1: + return 0x0; + case 0x1e8d2: + return 0x0; + case 0x1e8d3: + return 0x0; + case 0x1e8d4: + return 0x0; + case 0x1e8d5: + return 0x0; + case 0x1e8d6: + return 0x0; case 0x2f800: return 0x4e3d; case 0x2f801: diff --git a/src/mongo/db/fts/unicode/codepoints_test.cpp b/src/mongo/db/fts/unicode/codepoints_test.cpp index 90510666cba..a68c41688f6 100644 --- a/src/mongo/db/fts/unicode/codepoints_test.cpp +++ b/src/mongo/db/fts/unicode/codepoints_test.cpp @@ -32,6 +32,8 @@ namespace mongo { namespace unicode { +const char32_t maxCP = 0x1FFFFF; // Highest valid codepoint. + /** * Above most of the arrays in this class are the UTF-32 character literals that correspond to the * codepoints in the array. @@ -39,14 +41,19 @@ namespace unicode { TEST(UnicodeCodepoints, Diacritics) { // There are no character literals for combining marks. - const char32_t marks[] = {0x0301, 0x0339, 0x1AB4, 0x1DC5, 0xA69D}; + const char32_t marks[] = {'^', '`', 0x0301, 0x0339, 0x1AB4, 0x1DC5, 0xA69D}; // const char32_t not_marks[] = {U'-', U'.', U'\'', U'*', U'm'}; const char32_t not_marks[] = {0x2D, 0x2E, 0x27, 0x2A, 0x6D}; - for (auto i = 0; i < 5; ++i) { - ASSERT(codepointIsDiacritic(marks[i])); - ASSERT_FALSE(codepointIsDiacritic(not_marks[i])); + for (auto cp : marks) { + ASSERT(codepointIsDiacritic(cp)); + ASSERT_EQ(codepointRemoveDiacritics(cp), char32_t(0)); + } + + for (auto cp : not_marks) { + ASSERT(!codepointIsDiacritic(cp)); + ASSERT_NE(codepointRemoveDiacritics(cp), char32_t(0)); } } @@ -77,6 +84,10 @@ TEST(UnicodeCodepoints, RemoveDiacritics) { for (auto i = 0; i < 5; ++i) { ASSERT_EQUALS(clean[i], codepointRemoveDiacritics(originals[i])); } + + for (char32_t cp = 0; cp <= maxCP; cp++) { + ASSERT_EQ(codepointRemoveDiacritics(cp) == 0, cp == 0 || codepointIsDiacritic(cp)); + } } TEST(UnicodeCodepoints, ToLower) { @@ -90,5 +101,18 @@ TEST(UnicodeCodepoints, ToLower) { } } +TEST(UnicodeCodepoints, ToLowerIsFixedPoint) { + for (char32_t cp = 0; cp <= maxCP; cp++) { + ASSERT_EQ(codepointToLower(cp), codepointToLower(codepointToLower(cp))); + } +} + +TEST(UnicodeCodepoints, RemoveDiacriticsIsFixedPoint) { + for (char32_t cp = 0; cp <= maxCP; cp++) { + ASSERT_EQ(codepointRemoveDiacritics(cp), + codepointRemoveDiacritics(codepointRemoveDiacritics(cp))); + } +} + } // namespace unicode } // namespace mongo diff --git a/src/mongo/db/fts/unicode/gen_casefold_map.py b/src/mongo/db/fts/unicode/gen_casefold_map.py index 7ab9b49f7a4..61249e5f0d7 100644 --- a/src/mongo/db/fts/unicode/gen_casefold_map.py +++ b/src/mongo/db/fts/unicode/gen_casefold_map.py @@ -7,11 +7,11 @@ from gen_helper import getCopyrightNotice, openNamespaces, closeNamespaces, \ include def generate(unicode_casefold_file, target): - """Generates a C++ source file that contains a Unicode case folding + """Generates a C++ source file that contains a Unicode case folding function. The case folding function contains a switch statement with cases for every - Unicode codepoint that has a case folding mapping. + Unicode codepoint that has a case folding mapping. """ out = open(target, "w") @@ -43,29 +43,46 @@ def generate(unicode_casefold_file, target): codepoint_mapping = int(values[2], 16) case_mappings[original_codepoint] = codepoint_mapping - out.write("""char32_t codepointToLower(char32_t codepoint, CaseFoldMode \ -mode) { - if (mode == CaseFoldMode::kTurkish) { - if (codepoint == 0x049) { // I -> ı - return 0x131; - } else if (codepoint == 0x130) { // İ -> i - return 0x069; - } + turkishMapping = { + 0x49: 0x131, # I -> ı + 0x130: 0x069, # İ -> i } - switch (codepoint) {\n""") + out.write( + """char32_t codepointToLower(char32_t codepoint, CaseFoldMode mode) { + if (codepoint <= 0x7f) { + if (codepoint >= 'A' && codepoint <= 'Z') { + return (mode == CaseFoldMode::kTurkish && codepoint == 'I') + ? 0x131 + : (codepoint | 0x20); // Set the ascii lowercase bit on the character. + } + return codepoint; + } + + switch (codepoint) {\n""") mappings_list = [] for mapping in case_mappings: mappings_list.append((mapping, case_mappings[mapping])) + # Make sure we include each mapping in turkishMapping in the cases below. This ensures we handle + # them even if we'd skip the letter in non-turkish mode. + for mapping in turkishMapping: + if mapping not in case_mappings: + mappings_list.append((mapping, mapping)) + sorted_mappings = sorted(mappings_list, key=lambda mapping: mapping[0]) for mapping in sorted_mappings: - out.write("\ - case " + str(hex(mapping[0])) + ": return " + \ - str(hex(mapping[1])) +";\n") + if mapping[0] <= 0x7f: + continue # ascii is special cased above. + + if mapping[0] in turkishMapping: + out.write("case 0x%x: return mode == CaseFoldMode::kTurkish ? 0x%x : 0x%x;\n" + % (mapping[0], turkishMapping[mapping[0]], mapping[1])) + else: + out.write("case 0x%x: return 0x%x;\n"%mapping) out.write("\ default: return codepoint;\n }\n}") diff --git a/src/mongo/db/fts/unicode/gen_diacritic_map.py b/src/mongo/db/fts/unicode/gen_diacritic_map.py index 84f57e82a3c..08cfa95cda2 100644 --- a/src/mongo/db/fts/unicode/gen_diacritic_map.py +++ b/src/mongo/db/fts/unicode/gen_diacritic_map.py @@ -55,6 +55,8 @@ def add_diacritic_mapping(codepoint): # Only use mappings where the final recomposed form is a single codepoint if (a != c and len(c) == 1): + assert c != '\0' # This is used to indicate the codepoint is a pure diacritic. + assert ord(c) not in diacritics diacritic_mappings[codepoint] = ord(c[0]) def add_diacritic_range(start, end): @@ -78,6 +80,9 @@ def generate(target): # Map diacritics from 0 to the maximum Unicode codepoint add_diacritic_range(0x0000, 0x10FFFF) + for diacritic in diacritics: + diacritic_mappings[diacritic] = 0 + out.write("""char32_t codepointRemoveDiacritics(char32_t codepoint) { switch (codepoint) {\n""") diff --git a/src/mongo/db/fts/unicode/string.cpp b/src/mongo/db/fts/unicode/string.cpp index 9c18a776787..0ea37d8f33f 100644 --- a/src/mongo/db/fts/unicode/string.cpp +++ b/src/mongo/db/fts/unicode/string.cpp @@ -29,7 +29,9 @@ #include "mongo/db/fts/unicode/string.h" #include <algorithm> +#include <boost/algorithm/searching/boyer_moore.hpp> +#include "mongo/platform/bits.h" #include "mongo/shell/linenoise_utf8.h" #include "mongo/util/assert_util.h" @@ -42,10 +44,6 @@ using linenoise_utf8::copyString8to32; using std::u32string; String::String(const StringData utf8_src) { - // Reserve space for underlying buffers to prevent excessive reallocations. - _outputBuf.reserve(utf8_src.size() * 4); - _data.reserve(utf8_src.size() * 4); - // Convert UTF-8 input to UTF-32 data. setData(utf8_src); } @@ -79,12 +77,6 @@ void String::setData(const StringData utf8_src) { _needsOutputConversion = true; } -String::String(u32string&& src) : _data(std::move(src)), _needsOutputConversion(true) { - // Reserve space for underlying buffers to prevent excessive reallocations. - _outputBuf.reserve(src.size() * 4); - _data.reserve(src.size() * 4); -} - std::string String::toString() { // _outputBuf is the target, resize it so that it's guaranteed to fit all of the input // characters, plus a null character if there isn't one. @@ -100,14 +92,6 @@ std::string String::toString() { return _outputBuf; } -size_t String::size() const { - return _data.size(); -} - -const char32_t& String::operator[](int i) const { - return _data[i]; -} - String String::substr(size_t pos, size_t len) const { unicode::String buf; substrToBuf(pos, len, buf); @@ -147,63 +131,133 @@ void String::substrToBuf(size_t pos, size_t len, String& buffer) const { void String::toLowerToBuf(CaseFoldMode mode, String& buffer) const { buffer._data.resize(_data.size()); - auto index = 0; + auto outIt = buffer._data.begin(); for (auto codepoint : _data) { - buffer._data[index++] = codepointToLower(codepoint, mode); + *outIt++ = codepointToLower(codepoint, mode); } buffer._needsOutputConversion = true; } void String::removeDiacriticsToBuf(String& buffer) const { buffer._data.resize(_data.size()); - auto index = 0; + auto outIt = buffer._data.begin(); for (auto codepoint : _data) { - if (!codepointIsDiacritic(codepoint)) { - buffer._data[index++] = codepointRemoveDiacritics(codepoint); + if (codepoint <= 0x7f) { + // ASCII only has two diacritics so they are hard-coded here. + if (codepoint != '^' && codepoint != '`') { + *outIt++ = codepoint; + } + } else if (auto clean = codepointRemoveDiacritics(codepoint)) { + *outIt++ = clean; + } else { + // codepoint was a pure diacritic mark, so skip it. } } - buffer._data.resize(index); + buffer._data.resize(outIt - buffer._data.begin()); buffer._needsOutputConversion = true; } -bool String::substrMatch(const String& str, - const String& find, +std::pair<std::unique_ptr<char[]>, char*> String::prepForSubstrMatch(StringData utf8, + SubstrMatchOptions options, + CaseFoldMode mode) { + // This function should only be called when casefolding or stripping diacritics. + dassert(!(options & kCaseSensitive) || !(options & kDiacriticSensitive)); + + // Allocate space for up to 2x growth which is the worst possible case for stripping diacritics + // and casefolding. Proof: the only case where 1 byte goes to >1 is 'I' in Turkish going to 2 + // bytes. The biggest codepoint is 4 bytes which is also 2x 2 bytes. This holds as long as we + // don't map a single code point to more than one. + std::unique_ptr<char[]> buffer(new char[utf8.size() * 2]); + auto outputIt = buffer.get(); + + for (auto inputIt = utf8.begin(), endIt = utf8.end(); inputIt != endIt;) { + const uint8_t firstByte = *inputIt++; + char32_t codepoint = 0; + if (firstByte <= 0x7f) { + // ASCII special case. Can use faster operations. + if ((!(options & kCaseSensitive)) && (firstByte >= 'A' && firstByte <= 'Z')) { + codepoint = (mode == CaseFoldMode::kTurkish && firstByte == 'I') + ? 0x131 // In Turkish, I -> ı (i with no dot). + : (firstByte | 0x20); // Set the ascii lowercase bit on the character. + } else { + // ASCII has two pure diacritics that should be skipped and no characters that + // change when removing diacritics. + if ((options & kDiacriticSensitive) || !(firstByte == '^' || firstByte == '`')) { + *outputIt++ = (firstByte); + } + continue; + } + } else { + // firstByte indicates that it is not an ASCII char. + int leadingOnes = countLeadingZeros64(~(uint64_t(firstByte) << (64 - 8))); + + // Only checking enough to ensure that this code doesn't crash in the face of malformed + // utf-8. We make no guarantees about what results will be returned in this case. + uassert(ErrorCodes::BadValue, + "text contains invalid UTF-8", + leadingOnes > 1 && leadingOnes <= 4 && inputIt + leadingOnes - 1 <= endIt); + + codepoint = firstByte & (0xff >> leadingOnes); // mask off the size indicator. + for (int subByteIx = 1; subByteIx < leadingOnes; subByteIx++) { + const uint8_t subByte = *inputIt++; + codepoint <<= 6; + codepoint |= subByte & 0x3f; // mask off continuation bits. + } + + if (!(options & kCaseSensitive)) { + codepoint = codepointToLower(codepoint, mode); + } + + if (!(options & kDiacriticSensitive)) { + codepoint = codepointRemoveDiacritics(codepoint); + if (!codepoint) + continue; // codepoint is a pure diacritic. + } + } + + // Back to utf-8. + if (codepoint <= 0x7f /* max 1-byte codepoint */) { + *outputIt++ = (codepoint); + } else if (codepoint <= 0x7ff /* max 2-byte codepoint*/) { + *outputIt++ = ((codepoint >> (6 * 1)) | 0xc0); // 2 leading 1s. + *outputIt++ = (((codepoint >> (6 * 0)) & 0x3f) | 0x80); + } else if (codepoint <= 0xffff /* max 3-byte codepoint*/) { + *outputIt++ = ((codepoint >> (6 * 2)) | 0xe0); // 3 leading 1s. + *outputIt++ = (((codepoint >> (6 * 1)) & 0x3f) | 0x80); + *outputIt++ = (((codepoint >> (6 * 0)) & 0x3f) | 0x80); + } else { + *outputIt++ = ((codepoint >> (6 * 3)) | 0xf0); // 4 leading 1s. + *outputIt++ = (((codepoint >> (6 * 2)) & 0x3f) | 0x80); + *outputIt++ = (((codepoint >> (6 * 1)) & 0x3f) | 0x80); + *outputIt++ = (((codepoint >> (6 * 0)) & 0x3f) | 0x80); + } + } + + return {std::move(buffer), outputIt}; +} + +bool String::substrMatch(const std::string& str, + const std::string& find, SubstrMatchOptions options, CaseFoldMode cfMode) { - // In Turkish, lowercasing needs to be applied first because the letter İ has a different case - // folding mapping than the letter I, but removing diacritics removes the dot from İ. if (cfMode == CaseFoldMode::kTurkish) { - String cleanStr = str.toLower(cfMode); - String cleanFind = find.toLower(cfMode); - return substrMatch(cleanStr, cleanFind, options | kCaseSensitive, CaseFoldMode::kNormal); + // Turkish comparisons are always case insensitive due to their handling of I/i. + options &= ~kCaseSensitive; } - if (options & kDiacriticSensitive) { - if (options & kCaseSensitive) { - // Case sensitive and diacritic sensitive. - return std::search(str._data.cbegin(), - str._data.cend(), - find._data.cbegin(), - find._data.cend(), - [&](char32_t c1, char32_t c2) { return (c1 == c2); }) != - str._data.cend(); - } - - // Case insensitive and diacritic sensitive. - return std::search(str._data.cbegin(), - str._data.cend(), - find._data.cbegin(), - find._data.cend(), - [&](char32_t c1, char32_t c2) { - return (codepointToLower(c1, cfMode) == - codepointToLower(c2, cfMode)); - }) != str._data.cend(); + if ((options & kCaseSensitive) && (options & kDiacriticSensitive)) { + // No transformation needed. Just do the search on the input strings. + return boost::algorithm::boyer_moore_search( + str.cbegin(), str.cend(), find.cbegin(), find.cend()) != str.cend(); } - String cleanStr = str.removeDiacritics(); - String cleanFind = find.removeDiacritics(); + auto haystack = prepForSubstrMatch(str, options, cfMode); + auto needle = prepForSubstrMatch(find, options, cfMode); - return substrMatch(cleanStr, cleanFind, options | kDiacriticSensitive, cfMode); + // Case sensitive and diacritic sensitive. + return boost::algorithm::boyer_moore_search( + haystack.first.get(), haystack.second, needle.first.get(), needle.second) != + haystack.second; } } // namespace unicode diff --git a/src/mongo/db/fts/unicode/string.h b/src/mongo/db/fts/unicode/string.h index c3355ee4f25..1ba6e46c27c 100644 --- a/src/mongo/db/fts/unicode/string.h +++ b/src/mongo/db/fts/unicode/string.h @@ -29,6 +29,7 @@ #pragma once #include <cstdint> +#include <memory> #include <string> #include "mongo/base/string_data.h" @@ -110,12 +111,16 @@ public: /** * Returns the number Unicode codepoints in the String. */ - size_t size() const; + size_t size() const { + return _data.size(); + } /** * Returns the Unicode codepoint at index i of the String. */ - const char32_t& operator[](int i) const; + const char32_t& operator[](int i) const { + return _data[i]; + } /** * Options for the substrMatch method. @@ -143,18 +148,22 @@ public: * the search is case insensitive, non-Turkish case folding is used unless the * CaseFoldMode::Turkish is passed to mode. */ - static bool substrMatch(const String& str, - const String& find, + static bool substrMatch(const std::string& str, + const std::string& find, SubstrMatchOptions options, CaseFoldMode mode = CaseFoldMode::kNormal); -private: /** - * Private constructor used by substr, toLower, and removeDiacritics to build a String from - * UTF-32 data. + * Strips diacritics and case-folds the utf8 input string, as needed to support options. + * + * Returns an owned buffer containing the output utf8 string and an end iterator for the string + * (points at the first byte after the string). */ - String(std::u32string&& src); + static std::pair<std::unique_ptr<char[]>, char*> prepForSubstrMatch(StringData utf8, + SubstrMatchOptions options, + CaseFoldMode mode); +private: /** * Helper method for converting a UTF-8 string to a UTF-32 string. */ diff --git a/src/mongo/db/fts/unicode/string_test.cpp b/src/mongo/db/fts/unicode/string_test.cpp index 9354f7bf25c..8dec9a7e8df 100644 --- a/src/mongo/db/fts/unicode/string_test.cpp +++ b/src/mongo/db/fts/unicode/string_test.cpp @@ -47,82 +47,146 @@ namespace unicode { using linenoise_utf8::copyString32to8; +namespace { +// Added to the end of strings to ensure the real content is tested with the vectored +// implementation. +const std::string filler(32, 'x'); + +/** + * Converts return from prepForSubstrMatch to a StringData that is only useful in the current + * expression. + */ +StringData toStringData(std::pair<std::unique_ptr<char[]>, char*> result) { + return {result.first.get(), size_t(result.second - result.first.get())}; +} + +auto kDiacriticSensitive = String::kDiacriticSensitive; +auto kCaseSensitive = String::kCaseSensitive; + +auto kTurkish = CaseFoldMode::kTurkish; +auto kNormal = CaseFoldMode::kNormal; +} + + +// Macro to preserve line numbers and arguments in error messages. +#define TEST_PREP_FOR_SUBSTR_MATCH(expected, input, options, caseFoldMode) \ + ASSERT_EQ(expected, toStringData(String::prepForSubstrMatch(input, options, caseFoldMode))); \ + ASSERT_EQ(expected + filler, \ + toStringData(String::prepForSubstrMatch(input + filler, options, caseFoldMode))) + TEST(UnicodeString, RemoveDiacritics) { + // Test all ascii chars. + for (unsigned char ch = 0; ch <= 0x7F; ch++) { + const auto input = std::string(1, ch); + const auto output = codepointIsDiacritic(ch) ? std::string() : std::string(1, ch); + if (ch) { // String's constructor doesn't handle embedded NUL bytes. + ASSERT_EQUALS(output, String(input).removeDiacritics().toString()); + } + TEST_PREP_FOR_SUBSTR_MATCH(output, input, kCaseSensitive, kNormal); + } + // NFC Normalized Text. - String test1 = String(UTF8("¿CUÁNTOS AÑOS TIENES TÚ?")); + const char test1[] = UTF8("¿CUÁNTOS AÑOS TIENES TÚ?"); // NFD Normalized Text ("Café"). const char test2[] = {'C', 'a', 'f', 'e', static_cast<char>(0xcc), static_cast<char>(0x81), 0}; - ASSERT_EQUALS(UTF8("¿CUANTOS ANOS TIENES TU?"), test1.removeDiacritics().toString()); + ASSERT_EQUALS(UTF8("¿CUANTOS ANOS TIENES TU?"), String(test1).removeDiacritics().toString()); ASSERT_EQUALS(UTF8("Cafe"), String(test2).removeDiacritics().toString()); + + TEST_PREP_FOR_SUBSTR_MATCH(UTF8("¿CUANTOS ANOS TIENES TU?"), test1, kCaseSensitive, kNormal); + TEST_PREP_FOR_SUBSTR_MATCH(UTF8("Cafe"), test2, kCaseSensitive, kNormal); } TEST(UnicodeString, CaseFolding) { - String test1 = String(UTF8("СКОЛЬКО ТЕБЕ ЛЕТ?")); - String test2 = String(UTF8("¿CUÁNTOS AÑOS TIENES TÚ?")); + // Test all ascii chars. + for (unsigned char ch = 0; ch <= 0x7F; ch++) { + const auto upper = std::string(1, ch); + const auto lower = std::string(1, std::tolower(ch)); + if (ch) { // String's constructor doesn't handle embedded NUL bytes. + ASSERT_EQUALS(lower, String(upper).toLower().toString()); + } + TEST_PREP_FOR_SUBSTR_MATCH(lower, upper, kDiacriticSensitive, kNormal); + } + + const char test1[] = UTF8("СКОЛЬКО ТЕБЕ ЛЕТ?"); + const char test2[] = UTF8("¿CUÁNTOS AÑOS TIENES TÚ?"); - ASSERT_EQUALS(UTF8("сколько тебе лет?"), test1.toLower().toString()); - ASSERT_EQUALS(UTF8("¿cuántos años tienes tú?"), test2.toLower().toString()); + ASSERT_EQUALS(UTF8("сколько тебе лет?"), String(test1).toLower().toString()); + ASSERT_EQUALS(UTF8("¿cuántos años tienes tú?"), String(test2).toLower().toString()); + + TEST_PREP_FOR_SUBSTR_MATCH(UTF8("сколько тебе лет?"), test1, kDiacriticSensitive, kNormal); + TEST_PREP_FOR_SUBSTR_MATCH( + UTF8("¿cuántos años tienes tú?"), test2, kDiacriticSensitive, kNormal); } TEST(UnicodeString, CaseFoldingTurkish) { - String test1 = String(UTF8("KAC YASINDASINIZ")); - String test2 = String(UTF8("KAC YASİNDASİNİZ")); + const char test1[] = UTF8("KAC YASINDASINIZ"); + const char test2[] = UTF8("KAC YASİNDASİNİZ"); + + ASSERT_EQUALS(UTF8("kac yasındasınız"), + String(test1).toLower(CaseFoldMode::kTurkish).toString()); + ASSERT_EQUALS(UTF8("kac yasindasiniz"), + String(test2).toLower(CaseFoldMode::kTurkish).toString()); - ASSERT_EQUALS(UTF8("kac yasındasınız"), test1.toLower(CaseFoldMode::kTurkish).toString()); - ASSERT_EQUALS(UTF8("kac yasindasiniz"), test2.toLower(CaseFoldMode::kTurkish).toString()); + TEST_PREP_FOR_SUBSTR_MATCH(UTF8("kac yasındasınız"), test1, kDiacriticSensitive, kTurkish); + TEST_PREP_FOR_SUBSTR_MATCH(UTF8("kac yasindasiniz"), test2, kDiacriticSensitive, kTurkish); } TEST(UnicodeString, CaseFoldingAndRemoveDiacritics) { // NFC Normalized Text. - String test1 = String(UTF8("Πόσο χρονών είσαι?")); - String test2 = String(UTF8("¿CUÁNTOS AÑOS TIENES TÚ?")); + const char test1[] = UTF8("Πόσο χρονών είσαι?"); + const char test2[] = UTF8("¿CUÁNTOS AÑOS TIENES TÚ?"); // NFD Normalized Text ("CAFÉ"). const char test3[] = {'C', 'A', 'F', 'E', static_cast<char>(0xcc), static_cast<char>(0x81), 0}; - ASSERT_EQUALS(UTF8("ποσο χρονων εισαι?"), test1.toLower().removeDiacritics().toString()); - ASSERT_EQUALS(UTF8("¿cuantos anos tienes tu?"), test2.toLower().removeDiacritics().toString()); + ASSERT_EQUALS(UTF8("ποσο χρονων εισαι?"), + String(test1).toLower().removeDiacritics().toString()); + ASSERT_EQUALS(UTF8("¿cuantos anos tienes tu?"), + String(test2).toLower().removeDiacritics().toString()); ASSERT_EQUALS(UTF8("cafe"), String(test3).toLower().removeDiacritics().toString()); + + TEST_PREP_FOR_SUBSTR_MATCH(UTF8("ποσο χρονων εισαι?"), test1, 0, kNormal); + TEST_PREP_FOR_SUBSTR_MATCH(UTF8("¿cuantos anos tienes tu?"), test2, 0, kNormal); + TEST_PREP_FOR_SUBSTR_MATCH(UTF8("cafe"), test3, 0, kNormal); } TEST(UnicodeString, SubstringMatch) { - String str = String(UTF8("Одумайся! Престол свой сохрани; И ярость укроти.")); + std::string str = UTF8("Одумайся! Престол свой сохрани; И ярость укроти."); // Case insensitive & diacritic insensitive. - ASSERT(String::substrMatch(str, String(UTF8("ПРЁСТОЛ СВОИ")), String::kNone)); - ASSERT_FALSE(String::substrMatch(str, String(UTF8("Престол сохрани")), String::kNone)); + ASSERT(String::substrMatch(str, UTF8("ПРЁСТОЛ СВОИ"), String::kNone)); + ASSERT_FALSE(String::substrMatch(str, UTF8("Престол сохрани"), String::kNone)); // Case sensitive & diacritic insensitive. - ASSERT(String::substrMatch(str, String(UTF8("Одумаися!")), String::kCaseSensitive)); - ASSERT_FALSE(String::substrMatch(str, String(UTF8("одумайся!")), String::kCaseSensitive)); + ASSERT(String::substrMatch(str, UTF8("Одумаися!"), String::kCaseSensitive)); + ASSERT_FALSE(String::substrMatch(str, UTF8("одумайся!"), String::kCaseSensitive)); // Case insensitive & diacritic sensitive. - ASSERT(String::substrMatch(str, String(UTF8("одумайся!")), String::kDiacriticSensitive)); - ASSERT_FALSE(String::substrMatch(str, String(UTF8("Одумаися!")), String::kDiacriticSensitive)); + ASSERT(String::substrMatch(str, UTF8("одумайся!"), String::kDiacriticSensitive)); + ASSERT_FALSE(String::substrMatch(str, UTF8("Одумаися!"), String::kDiacriticSensitive)); // Case sensitive & diacritic sensitive. ASSERT(String::substrMatch( - str, String(UTF8("Одумайся!")), String::kDiacriticSensitive | String::kCaseSensitive)); + str, UTF8("Одумайся!"), String::kDiacriticSensitive | String::kCaseSensitive)); ASSERT_FALSE(String::substrMatch( - str, String(UTF8("Одумаися!")), String::kDiacriticSensitive | String::kCaseSensitive)); + str, UTF8("Одумаися!"), String::kDiacriticSensitive | String::kCaseSensitive)); } TEST(UnicodeString, SubstringMatchTurkish) { - String str = String(UTF8("KAÇ YAŞINDASINIZ?")); + std::string str = UTF8("KAÇ YAŞINDASINIZ?"); // Case insensitive & diacritic insensitive. - ASSERT(String::substrMatch( - str, String(UTF8("yasındasınız")), String::kNone, CaseFoldMode::kTurkish)); - ASSERT_FALSE(String::substrMatch( - str, String(UTF8("yasindasiniz")), String::kNone, CaseFoldMode::kTurkish)); + ASSERT(String::substrMatch(str, UTF8("yasındasınız"), String::kNone, CaseFoldMode::kTurkish)); + ASSERT_FALSE( + String::substrMatch(str, UTF8("yasindasiniz"), String::kNone, CaseFoldMode::kTurkish)); // Case insensitive & diacritic sensitive. ASSERT(String::substrMatch( - str, String(UTF8("yaşındasınız")), String::kDiacriticSensitive, CaseFoldMode::kTurkish)); + str, UTF8("yaşındasınız"), String::kDiacriticSensitive, CaseFoldMode::kTurkish)); ASSERT_FALSE(String::substrMatch( - str, String(UTF8("yaşindasiniz")), String::kDiacriticSensitive, CaseFoldMode::kTurkish)); + str, UTF8("yaşindasiniz"), String::kDiacriticSensitive, CaseFoldMode::kTurkish)); } TEST(UnicodeString, BadUTF8) { @@ -153,6 +217,14 @@ TEST(UnicodeString, BadUTF8) { ASSERT_THROWS(String test2(invalid2), AssertionException); ASSERT_THROWS(String test3(invalid3), AssertionException); ASSERT_THROWS(String test4(invalid4), AssertionException); + + // preForSubstrMatch doesn't make any guarantees about behavior when fed invalid utf8. + // These calls are to ensure that they don't trigger any faults in sanitizing builds. + String::prepForSubstrMatch(invalid1, 0, kNormal); + String::prepForSubstrMatch(invalid2, 0, kNormal); + String::prepForSubstrMatch(invalid3, 0, kNormal); + + ASSERT_THROWS(String::prepForSubstrMatch(invalid4, 0, kNormal), AssertionException); } TEST(UnicodeString, UTF32ToUTF8) { |