summaryrefslogtreecommitdiff
path: root/src/mongo/db/fts/unicode
diff options
context:
space:
mode:
authorMathias Stearn <mathias@10gen.com>2016-02-24 16:35:54 -0500
committerMathias Stearn <mathias@10gen.com>2016-03-11 08:50:18 -0500
commit6c3157f126bb44ab275325e85de7abee5ce9ad6d (patch)
tree16189ba18d0febfdd564006c1f03f008527f4f41 /src/mongo/db/fts/unicode
parent4a35c7184e188354793f16d27e2330b3b5ce7f8f (diff)
downloadmongo-6c3157f126bb44ab275325e85de7abee5ce9ad6d.tar.gz
SERVER-19936 Optimize FTS v3 phrase matching
Major changes: * Use Booyer-Moore algorithm for searching rather than std::search * All strings are kept in UTF8 rather than going to UTF32. * Case folding and diacritic removal are done in a single pass. * Optimize case folding and diacritic removal for ASCII codepoints. * Combine functionality of codepointIsDiacritic() into codepointRemoveDiacritics()
Diffstat (limited to 'src/mongo/db/fts/unicode')
-rw-r--r--src/mongo/db/fts/unicode/codepoints.h4
-rw-r--r--src/mongo/db/fts/unicode/codepoints_diacritic_map.cpp1546
-rw-r--r--src/mongo/db/fts/unicode/codepoints_test.cpp32
-rw-r--r--src/mongo/db/fts/unicode/gen_casefold_map.py45
-rw-r--r--src/mongo/db/fts/unicode/gen_diacritic_map.py5
-rw-r--r--src/mongo/db/fts/unicode/string.cpp162
-rw-r--r--src/mongo/db/fts/unicode/string.h25
-rw-r--r--src/mongo/db/fts/unicode/string_test.cpp132
8 files changed, 1841 insertions, 110 deletions
diff --git a/src/mongo/db/fts/unicode/codepoints.h b/src/mongo/db/fts/unicode/codepoints.h
index 5b1e8e2b2b5..70520a00106 100644
--- a/src/mongo/db/fts/unicode/codepoints.h
+++ b/src/mongo/db/fts/unicode/codepoints.h
@@ -73,6 +73,10 @@ bool codepointIsDelimiter(char32_t codepoint, DelimiterListLanguage lang);
* which code blocks are used), decomposing them to the NFD normalization form, removing any
* combining marks, and renormalizing them to the NFC form. The result is a mapping from original
* codepoint to a codepoint with no diacritics.
+ *
+ * Returns 0 if codepoint is a pure diacritic mark (ie if codepointIsDiacritic() would return true).
+ * You will need to distinguish this case from the input codepoint being 0 either by explicit
+ * checking or avoiding a call to this function if codepoint is in the ASCII range (<0x80).
*/
char32_t codepointRemoveDiacritics(char32_t codepoint);
diff --git a/src/mongo/db/fts/unicode/codepoints_diacritic_map.cpp b/src/mongo/db/fts/unicode/codepoints_diacritic_map.cpp
index c6b39d328b8..33893b135cd 100644
--- a/src/mongo/db/fts/unicode/codepoints_diacritic_map.cpp
+++ b/src/mongo/db/fts/unicode/codepoints_diacritic_map.cpp
@@ -35,6 +35,20 @@ namespace unicode {
char32_t codepointRemoveDiacritics(char32_t codepoint) {
switch (codepoint) {
+ case 0x5e:
+ return 0x0;
+ case 0x60:
+ return 0x0;
+ case 0xa8:
+ return 0x0;
+ case 0xaf:
+ return 0x0;
+ case 0xb4:
+ return 0x0;
+ case 0xb7:
+ return 0x0;
+ case 0xb8:
+ return 0x0;
case 0xc0:
return 0x41;
case 0xc1:
@@ -539,8 +553,364 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) {
return 0x59;
case 0x233:
return 0x79;
+ case 0x2b0:
+ return 0x0;
+ case 0x2b1:
+ return 0x0;
+ case 0x2b2:
+ return 0x0;
+ case 0x2b3:
+ return 0x0;
+ case 0x2b4:
+ return 0x0;
+ case 0x2b5:
+ return 0x0;
+ case 0x2b6:
+ return 0x0;
+ case 0x2b7:
+ return 0x0;
+ case 0x2b8:
+ return 0x0;
+ case 0x2b9:
+ return 0x0;
+ case 0x2ba:
+ return 0x0;
+ case 0x2bb:
+ return 0x0;
+ case 0x2bc:
+ return 0x0;
+ case 0x2bd:
+ return 0x0;
+ case 0x2be:
+ return 0x0;
+ case 0x2bf:
+ return 0x0;
+ case 0x2c0:
+ return 0x0;
+ case 0x2c1:
+ return 0x0;
+ case 0x2c2:
+ return 0x0;
+ case 0x2c3:
+ return 0x0;
+ case 0x2c4:
+ return 0x0;
+ case 0x2c5:
+ return 0x0;
+ case 0x2c6:
+ return 0x0;
+ case 0x2c7:
+ return 0x0;
+ case 0x2c8:
+ return 0x0;
+ case 0x2c9:
+ return 0x0;
+ case 0x2ca:
+ return 0x0;
+ case 0x2cb:
+ return 0x0;
+ case 0x2cc:
+ return 0x0;
+ case 0x2cd:
+ return 0x0;
+ case 0x2ce:
+ return 0x0;
+ case 0x2cf:
+ return 0x0;
+ case 0x2d0:
+ return 0x0;
+ case 0x2d1:
+ return 0x0;
+ case 0x2d2:
+ return 0x0;
+ case 0x2d3:
+ return 0x0;
+ case 0x2d4:
+ return 0x0;
+ case 0x2d5:
+ return 0x0;
+ case 0x2d6:
+ return 0x0;
+ case 0x2d7:
+ return 0x0;
+ case 0x2d8:
+ return 0x0;
+ case 0x2d9:
+ return 0x0;
+ case 0x2da:
+ return 0x0;
+ case 0x2db:
+ return 0x0;
+ case 0x2dc:
+ return 0x0;
+ case 0x2dd:
+ return 0x0;
+ case 0x2de:
+ return 0x0;
+ case 0x2df:
+ return 0x0;
+ case 0x2e0:
+ return 0x0;
+ case 0x2e1:
+ return 0x0;
+ case 0x2e2:
+ return 0x0;
+ case 0x2e3:
+ return 0x0;
+ case 0x2e4:
+ return 0x0;
+ case 0x2e5:
+ return 0x0;
+ case 0x2e6:
+ return 0x0;
+ case 0x2e7:
+ return 0x0;
+ case 0x2e8:
+ return 0x0;
+ case 0x2e9:
+ return 0x0;
+ case 0x2ea:
+ return 0x0;
+ case 0x2eb:
+ return 0x0;
+ case 0x2ec:
+ return 0x0;
+ case 0x2ed:
+ return 0x0;
+ case 0x2ee:
+ return 0x0;
+ case 0x2ef:
+ return 0x0;
+ case 0x2f0:
+ return 0x0;
+ case 0x2f1:
+ return 0x0;
+ case 0x2f2:
+ return 0x0;
+ case 0x2f3:
+ return 0x0;
+ case 0x2f4:
+ return 0x0;
+ case 0x2f5:
+ return 0x0;
+ case 0x2f6:
+ return 0x0;
+ case 0x2f7:
+ return 0x0;
+ case 0x2f8:
+ return 0x0;
+ case 0x2f9:
+ return 0x0;
+ case 0x2fa:
+ return 0x0;
+ case 0x2fb:
+ return 0x0;
+ case 0x2fc:
+ return 0x0;
+ case 0x2fd:
+ return 0x0;
+ case 0x2fe:
+ return 0x0;
+ case 0x2ff:
+ return 0x0;
+ case 0x300:
+ return 0x0;
+ case 0x301:
+ return 0x0;
+ case 0x302:
+ return 0x0;
+ case 0x303:
+ return 0x0;
+ case 0x304:
+ return 0x0;
+ case 0x305:
+ return 0x0;
+ case 0x306:
+ return 0x0;
+ case 0x307:
+ return 0x0;
+ case 0x308:
+ return 0x0;
+ case 0x309:
+ return 0x0;
+ case 0x30a:
+ return 0x0;
+ case 0x30b:
+ return 0x0;
+ case 0x30c:
+ return 0x0;
+ case 0x30d:
+ return 0x0;
+ case 0x30e:
+ return 0x0;
+ case 0x30f:
+ return 0x0;
+ case 0x310:
+ return 0x0;
+ case 0x311:
+ return 0x0;
+ case 0x312:
+ return 0x0;
+ case 0x313:
+ return 0x0;
+ case 0x314:
+ return 0x0;
+ case 0x315:
+ return 0x0;
+ case 0x316:
+ return 0x0;
+ case 0x317:
+ return 0x0;
+ case 0x318:
+ return 0x0;
+ case 0x319:
+ return 0x0;
+ case 0x31a:
+ return 0x0;
+ case 0x31b:
+ return 0x0;
+ case 0x31c:
+ return 0x0;
+ case 0x31d:
+ return 0x0;
+ case 0x31e:
+ return 0x0;
+ case 0x31f:
+ return 0x0;
+ case 0x320:
+ return 0x0;
+ case 0x321:
+ return 0x0;
+ case 0x322:
+ return 0x0;
+ case 0x323:
+ return 0x0;
+ case 0x324:
+ return 0x0;
+ case 0x325:
+ return 0x0;
+ case 0x326:
+ return 0x0;
+ case 0x327:
+ return 0x0;
+ case 0x328:
+ return 0x0;
+ case 0x329:
+ return 0x0;
+ case 0x32a:
+ return 0x0;
+ case 0x32b:
+ return 0x0;
+ case 0x32c:
+ return 0x0;
+ case 0x32d:
+ return 0x0;
+ case 0x32e:
+ return 0x0;
+ case 0x32f:
+ return 0x0;
+ case 0x330:
+ return 0x0;
+ case 0x331:
+ return 0x0;
+ case 0x332:
+ return 0x0;
+ case 0x333:
+ return 0x0;
+ case 0x334:
+ return 0x0;
+ case 0x335:
+ return 0x0;
+ case 0x336:
+ return 0x0;
+ case 0x337:
+ return 0x0;
+ case 0x338:
+ return 0x0;
+ case 0x339:
+ return 0x0;
+ case 0x33a:
+ return 0x0;
+ case 0x33b:
+ return 0x0;
+ case 0x33c:
+ return 0x0;
+ case 0x33d:
+ return 0x0;
+ case 0x33e:
+ return 0x0;
+ case 0x33f:
+ return 0x0;
+ case 0x340:
+ return 0x0;
+ case 0x341:
+ return 0x0;
+ case 0x342:
+ return 0x0;
+ case 0x343:
+ return 0x0;
+ case 0x344:
+ return 0x0;
+ case 0x345:
+ return 0x0;
+ case 0x346:
+ return 0x0;
+ case 0x347:
+ return 0x0;
+ case 0x348:
+ return 0x0;
+ case 0x349:
+ return 0x0;
+ case 0x34a:
+ return 0x0;
+ case 0x34b:
+ return 0x0;
+ case 0x34c:
+ return 0x0;
+ case 0x34d:
+ return 0x0;
+ case 0x34e:
+ return 0x0;
+ case 0x350:
+ return 0x0;
+ case 0x351:
+ return 0x0;
+ case 0x352:
+ return 0x0;
+ case 0x353:
+ return 0x0;
+ case 0x354:
+ return 0x0;
+ case 0x355:
+ return 0x0;
+ case 0x356:
+ return 0x0;
+ case 0x357:
+ return 0x0;
+ case 0x35d:
+ return 0x0;
+ case 0x35e:
+ return 0x0;
+ case 0x35f:
+ return 0x0;
+ case 0x360:
+ return 0x0;
+ case 0x361:
+ return 0x0;
+ case 0x362:
+ return 0x0;
+ case 0x374:
+ return 0x0;
+ case 0x375:
+ return 0x0;
+ case 0x37a:
+ return 0x0;
case 0x37e:
return 0x3b;
+ case 0x384:
+ return 0x0;
+ case 0x385:
+ return 0x0;
case 0x386:
return 0x391;
case 0x388:
@@ -621,6 +991,16 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) {
return 0x474;
case 0x477:
return 0x475;
+ case 0x483:
+ return 0x0;
+ case 0x484:
+ return 0x0;
+ case 0x485:
+ return 0x0;
+ case 0x486:
+ return 0x0;
+ case 0x487:
+ return 0x0;
case 0x4c1:
return 0x416;
case 0x4c2:
@@ -689,12 +1069,314 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) {
return 0x42b;
case 0x4f9:
return 0x44b;
+ case 0x559:
+ return 0x0;
+ case 0x591:
+ return 0x0;
+ case 0x592:
+ return 0x0;
+ case 0x593:
+ return 0x0;
+ case 0x594:
+ return 0x0;
+ case 0x595:
+ return 0x0;
+ case 0x596:
+ return 0x0;
+ case 0x597:
+ return 0x0;
+ case 0x598:
+ return 0x0;
+ case 0x599:
+ return 0x0;
+ case 0x59a:
+ return 0x0;
+ case 0x59b:
+ return 0x0;
+ case 0x59c:
+ return 0x0;
+ case 0x59d:
+ return 0x0;
+ case 0x59e:
+ return 0x0;
+ case 0x59f:
+ return 0x0;
+ case 0x5a0:
+ return 0x0;
+ case 0x5a1:
+ return 0x0;
+ case 0x5a3:
+ return 0x0;
+ case 0x5a4:
+ return 0x0;
+ case 0x5a5:
+ return 0x0;
+ case 0x5a6:
+ return 0x0;
+ case 0x5a7:
+ return 0x0;
+ case 0x5a8:
+ return 0x0;
+ case 0x5a9:
+ return 0x0;
+ case 0x5aa:
+ return 0x0;
+ case 0x5ab:
+ return 0x0;
+ case 0x5ac:
+ return 0x0;
+ case 0x5ad:
+ return 0x0;
+ case 0x5ae:
+ return 0x0;
+ case 0x5af:
+ return 0x0;
+ case 0x5b0:
+ return 0x0;
+ case 0x5b1:
+ return 0x0;
+ case 0x5b2:
+ return 0x0;
+ case 0x5b3:
+ return 0x0;
+ case 0x5b4:
+ return 0x0;
+ case 0x5b5:
+ return 0x0;
+ case 0x5b6:
+ return 0x0;
+ case 0x5b7:
+ return 0x0;
+ case 0x5b8:
+ return 0x0;
+ case 0x5b9:
+ return 0x0;
+ case 0x5ba:
+ return 0x0;
+ case 0x5bb:
+ return 0x0;
+ case 0x5bc:
+ return 0x0;
+ case 0x5bd:
+ return 0x0;
+ case 0x5bf:
+ return 0x0;
+ case 0x5c1:
+ return 0x0;
+ case 0x5c2:
+ return 0x0;
+ case 0x5c4:
+ return 0x0;
+ case 0x64b:
+ return 0x0;
+ case 0x64c:
+ return 0x0;
+ case 0x64d:
+ return 0x0;
+ case 0x64e:
+ return 0x0;
+ case 0x64f:
+ return 0x0;
+ case 0x650:
+ return 0x0;
+ case 0x651:
+ return 0x0;
+ case 0x652:
+ return 0x0;
+ case 0x657:
+ return 0x0;
+ case 0x658:
+ return 0x0;
+ case 0x6df:
+ return 0x0;
+ case 0x6e0:
+ return 0x0;
+ case 0x6e5:
+ return 0x0;
+ case 0x6e6:
+ return 0x0;
+ case 0x6ea:
+ return 0x0;
+ case 0x6eb:
+ return 0x0;
+ case 0x6ec:
+ return 0x0;
+ case 0x730:
+ return 0x0;
+ case 0x731:
+ return 0x0;
+ case 0x732:
+ return 0x0;
+ case 0x733:
+ return 0x0;
+ case 0x734:
+ return 0x0;
+ case 0x735:
+ return 0x0;
+ case 0x736:
+ return 0x0;
+ case 0x737:
+ return 0x0;
+ case 0x738:
+ return 0x0;
+ case 0x739:
+ return 0x0;
+ case 0x73a:
+ return 0x0;
+ case 0x73b:
+ return 0x0;
+ case 0x73c:
+ return 0x0;
+ case 0x73d:
+ return 0x0;
+ case 0x73e:
+ return 0x0;
+ case 0x73f:
+ return 0x0;
+ case 0x740:
+ return 0x0;
+ case 0x741:
+ return 0x0;
+ case 0x742:
+ return 0x0;
+ case 0x743:
+ return 0x0;
+ case 0x744:
+ return 0x0;
+ case 0x745:
+ return 0x0;
+ case 0x746:
+ return 0x0;
+ case 0x747:
+ return 0x0;
+ case 0x748:
+ return 0x0;
+ case 0x749:
+ return 0x0;
+ case 0x74a:
+ return 0x0;
+ case 0x7a6:
+ return 0x0;
+ case 0x7a7:
+ return 0x0;
+ case 0x7a8:
+ return 0x0;
+ case 0x7a9:
+ return 0x0;
+ case 0x7aa:
+ return 0x0;
+ case 0x7ab:
+ return 0x0;
+ case 0x7ac:
+ return 0x0;
+ case 0x7ad:
+ return 0x0;
+ case 0x7ae:
+ return 0x0;
+ case 0x7af:
+ return 0x0;
+ case 0x7b0:
+ return 0x0;
+ case 0x7eb:
+ return 0x0;
+ case 0x7ec:
+ return 0x0;
+ case 0x7ed:
+ return 0x0;
+ case 0x7ee:
+ return 0x0;
+ case 0x7ef:
+ return 0x0;
+ case 0x7f0:
+ return 0x0;
+ case 0x7f1:
+ return 0x0;
+ case 0x7f2:
+ return 0x0;
+ case 0x7f3:
+ return 0x0;
+ case 0x7f4:
+ return 0x0;
+ case 0x7f5:
+ return 0x0;
+ case 0x818:
+ return 0x0;
+ case 0x819:
+ return 0x0;
+ case 0x8e3:
+ return 0x0;
+ case 0x8e4:
+ return 0x0;
+ case 0x8e5:
+ return 0x0;
+ case 0x8e6:
+ return 0x0;
+ case 0x8e7:
+ return 0x0;
+ case 0x8e8:
+ return 0x0;
+ case 0x8e9:
+ return 0x0;
+ case 0x8ea:
+ return 0x0;
+ case 0x8eb:
+ return 0x0;
+ case 0x8ec:
+ return 0x0;
+ case 0x8ed:
+ return 0x0;
+ case 0x8ee:
+ return 0x0;
+ case 0x8ef:
+ return 0x0;
+ case 0x8f0:
+ return 0x0;
+ case 0x8f1:
+ return 0x0;
+ case 0x8f2:
+ return 0x0;
+ case 0x8f3:
+ return 0x0;
+ case 0x8f4:
+ return 0x0;
+ case 0x8f5:
+ return 0x0;
+ case 0x8f6:
+ return 0x0;
+ case 0x8f7:
+ return 0x0;
+ case 0x8f8:
+ return 0x0;
+ case 0x8f9:
+ return 0x0;
+ case 0x8fa:
+ return 0x0;
+ case 0x8fb:
+ return 0x0;
+ case 0x8fc:
+ return 0x0;
+ case 0x8fd:
+ return 0x0;
+ case 0x8fe:
+ return 0x0;
case 0x929:
return 0x928;
case 0x931:
return 0x930;
case 0x934:
return 0x933;
+ case 0x93c:
+ return 0x0;
+ case 0x94d:
+ return 0x0;
+ case 0x951:
+ return 0x0;
+ case 0x952:
+ return 0x0;
+ case 0x953:
+ return 0x0;
+ case 0x954:
+ return 0x0;
case 0x958:
return 0x915;
case 0x959:
@@ -711,6 +1393,12 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) {
return 0x92b;
case 0x95f:
return 0x92f;
+ case 0x971:
+ return 0x0;
+ case 0x9bc:
+ return 0x0;
+ case 0x9cd:
+ return 0x0;
case 0x9dc:
return 0x9a1;
case 0x9dd:
@@ -721,6 +1409,10 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) {
return 0xa32;
case 0xa36:
return 0xa38;
+ case 0xa3c:
+ return 0x0;
+ case 0xa4d:
+ return 0x0;
case 0xa59:
return 0xa16;
case 0xa5a:
@@ -729,14 +1421,444 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) {
return 0xa1c;
case 0xa5e:
return 0xa2b;
+ case 0xabc:
+ return 0x0;
+ case 0xacd:
+ return 0x0;
+ case 0xb3c:
+ return 0x0;
+ case 0xb4d:
+ return 0x0;
case 0xb5c:
return 0xb21;
case 0xb5d:
return 0xb22;
+ case 0xbcd:
+ return 0x0;
+ case 0xc4d:
+ return 0x0;
+ case 0xcbc:
+ return 0x0;
+ case 0xccd:
+ return 0x0;
+ case 0xd4d:
+ return 0x0;
+ case 0xdca:
+ return 0x0;
case 0xdda:
return 0xdd9;
case 0xddd:
return 0xddc;
+ case 0xe47:
+ return 0x0;
+ case 0xe48:
+ return 0x0;
+ case 0xe49:
+ return 0x0;
+ case 0xe4a:
+ return 0x0;
+ case 0xe4b:
+ return 0x0;
+ case 0xe4c:
+ return 0x0;
+ case 0xe4e:
+ return 0x0;
+ case 0xec8:
+ return 0x0;
+ case 0xec9:
+ return 0x0;
+ case 0xeca:
+ return 0x0;
+ case 0xecb:
+ return 0x0;
+ case 0xecc:
+ return 0x0;
+ case 0xf18:
+ return 0x0;
+ case 0xf19:
+ return 0x0;
+ case 0xf35:
+ return 0x0;
+ case 0xf37:
+ return 0x0;
+ case 0xf39:
+ return 0x0;
+ case 0xf3e:
+ return 0x0;
+ case 0xf3f:
+ return 0x0;
+ case 0xf82:
+ return 0x0;
+ case 0xf83:
+ return 0x0;
+ case 0xf84:
+ return 0x0;
+ case 0xf86:
+ return 0x0;
+ case 0xf87:
+ return 0x0;
+ case 0xfc6:
+ return 0x0;
+ case 0x1037:
+ return 0x0;
+ case 0x1039:
+ return 0x0;
+ case 0x103a:
+ return 0x0;
+ case 0x1087:
+ return 0x0;
+ case 0x1088:
+ return 0x0;
+ case 0x1089:
+ return 0x0;
+ case 0x108a:
+ return 0x0;
+ case 0x108b:
+ return 0x0;
+ case 0x108c:
+ return 0x0;
+ case 0x108d:
+ return 0x0;
+ case 0x108f:
+ return 0x0;
+ case 0x109a:
+ return 0x0;
+ case 0x109b:
+ return 0x0;
+ case 0x17c9:
+ return 0x0;
+ case 0x17ca:
+ return 0x0;
+ case 0x17cb:
+ return 0x0;
+ case 0x17cc:
+ return 0x0;
+ case 0x17cd:
+ return 0x0;
+ case 0x17ce:
+ return 0x0;
+ case 0x17cf:
+ return 0x0;
+ case 0x17d0:
+ return 0x0;
+ case 0x17d1:
+ return 0x0;
+ case 0x17d2:
+ return 0x0;
+ case 0x17d3:
+ return 0x0;
+ case 0x17dd:
+ return 0x0;
+ case 0x1939:
+ return 0x0;
+ case 0x193a:
+ return 0x0;
+ case 0x193b:
+ return 0x0;
+ case 0x1a75:
+ return 0x0;
+ case 0x1a76:
+ return 0x0;
+ case 0x1a77:
+ return 0x0;
+ case 0x1a78:
+ return 0x0;
+ case 0x1a79:
+ return 0x0;
+ case 0x1a7a:
+ return 0x0;
+ case 0x1a7b:
+ return 0x0;
+ case 0x1a7c:
+ return 0x0;
+ case 0x1a7f:
+ return 0x0;
+ case 0x1ab0:
+ return 0x0;
+ case 0x1ab1:
+ return 0x0;
+ case 0x1ab2:
+ return 0x0;
+ case 0x1ab3:
+ return 0x0;
+ case 0x1ab4:
+ return 0x0;
+ case 0x1ab5:
+ return 0x0;
+ case 0x1ab6:
+ return 0x0;
+ case 0x1ab7:
+ return 0x0;
+ case 0x1ab8:
+ return 0x0;
+ case 0x1ab9:
+ return 0x0;
+ case 0x1aba:
+ return 0x0;
+ case 0x1abb:
+ return 0x0;
+ case 0x1abc:
+ return 0x0;
+ case 0x1abd:
+ return 0x0;
+ case 0x1b34:
+ return 0x0;
+ case 0x1b44:
+ return 0x0;
+ case 0x1b6b:
+ return 0x0;
+ case 0x1b6c:
+ return 0x0;
+ case 0x1b6d:
+ return 0x0;
+ case 0x1b6e:
+ return 0x0;
+ case 0x1b6f:
+ return 0x0;
+ case 0x1b70:
+ return 0x0;
+ case 0x1b71:
+ return 0x0;
+ case 0x1b72:
+ return 0x0;
+ case 0x1b73:
+ return 0x0;
+ case 0x1baa:
+ return 0x0;
+ case 0x1bab:
+ return 0x0;
+ case 0x1c36:
+ return 0x0;
+ case 0x1c37:
+ return 0x0;
+ case 0x1c78:
+ return 0x0;
+ case 0x1c79:
+ return 0x0;
+ case 0x1c7a:
+ return 0x0;
+ case 0x1c7b:
+ return 0x0;
+ case 0x1c7c:
+ return 0x0;
+ case 0x1c7d:
+ return 0x0;
+ case 0x1cd0:
+ return 0x0;
+ case 0x1cd1:
+ return 0x0;
+ case 0x1cd2:
+ return 0x0;
+ case 0x1cd3:
+ return 0x0;
+ case 0x1cd4:
+ return 0x0;
+ case 0x1cd5:
+ return 0x0;
+ case 0x1cd6:
+ return 0x0;
+ case 0x1cd7:
+ return 0x0;
+ case 0x1cd8:
+ return 0x0;
+ case 0x1cd9:
+ return 0x0;
+ case 0x1cda:
+ return 0x0;
+ case 0x1cdb:
+ return 0x0;
+ case 0x1cdc:
+ return 0x0;
+ case 0x1cdd:
+ return 0x0;
+ case 0x1cde:
+ return 0x0;
+ case 0x1cdf:
+ return 0x0;
+ case 0x1ce0:
+ return 0x0;
+ case 0x1ce1:
+ return 0x0;
+ case 0x1ce2:
+ return 0x0;
+ case 0x1ce3:
+ return 0x0;
+ case 0x1ce4:
+ return 0x0;
+ case 0x1ce5:
+ return 0x0;
+ case 0x1ce6:
+ return 0x0;
+ case 0x1ce7:
+ return 0x0;
+ case 0x1ce8:
+ return 0x0;
+ case 0x1ced:
+ return 0x0;
+ case 0x1cf4:
+ return 0x0;
+ case 0x1cf8:
+ return 0x0;
+ case 0x1cf9:
+ return 0x0;
+ case 0x1d2c:
+ return 0x0;
+ case 0x1d2d:
+ return 0x0;
+ case 0x1d2e:
+ return 0x0;
+ case 0x1d2f:
+ return 0x0;
+ case 0x1d30:
+ return 0x0;
+ case 0x1d31:
+ return 0x0;
+ case 0x1d32:
+ return 0x0;
+ case 0x1d33:
+ return 0x0;
+ case 0x1d34:
+ return 0x0;
+ case 0x1d35:
+ return 0x0;
+ case 0x1d36:
+ return 0x0;
+ case 0x1d37:
+ return 0x0;
+ case 0x1d38:
+ return 0x0;
+ case 0x1d39:
+ return 0x0;
+ case 0x1d3a:
+ return 0x0;
+ case 0x1d3b:
+ return 0x0;
+ case 0x1d3c:
+ return 0x0;
+ case 0x1d3d:
+ return 0x0;
+ case 0x1d3e:
+ return 0x0;
+ case 0x1d3f:
+ return 0x0;
+ case 0x1d40:
+ return 0x0;
+ case 0x1d41:
+ return 0x0;
+ case 0x1d42:
+ return 0x0;
+ case 0x1d43:
+ return 0x0;
+ case 0x1d44:
+ return 0x0;
+ case 0x1d45:
+ return 0x0;
+ case 0x1d46:
+ return 0x0;
+ case 0x1d47:
+ return 0x0;
+ case 0x1d48:
+ return 0x0;
+ case 0x1d49:
+ return 0x0;
+ case 0x1d4a:
+ return 0x0;
+ case 0x1d4b:
+ return 0x0;
+ case 0x1d4c:
+ return 0x0;
+ case 0x1d4d:
+ return 0x0;
+ case 0x1d4e:
+ return 0x0;
+ case 0x1d4f:
+ return 0x0;
+ case 0x1d50:
+ return 0x0;
+ case 0x1d51:
+ return 0x0;
+ case 0x1d52:
+ return 0x0;
+ case 0x1d53:
+ return 0x0;
+ case 0x1d54:
+ return 0x0;
+ case 0x1d55:
+ return 0x0;
+ case 0x1d56:
+ return 0x0;
+ case 0x1d57:
+ return 0x0;
+ case 0x1d58:
+ return 0x0;
+ case 0x1d59:
+ return 0x0;
+ case 0x1d5a:
+ return 0x0;
+ case 0x1d5b:
+ return 0x0;
+ case 0x1d5c:
+ return 0x0;
+ case 0x1d5d:
+ return 0x0;
+ case 0x1d5e:
+ return 0x0;
+ case 0x1d5f:
+ return 0x0;
+ case 0x1d60:
+ return 0x0;
+ case 0x1d61:
+ return 0x0;
+ case 0x1d62:
+ return 0x0;
+ case 0x1d63:
+ return 0x0;
+ case 0x1d64:
+ return 0x0;
+ case 0x1d65:
+ return 0x0;
+ case 0x1d66:
+ return 0x0;
+ case 0x1d67:
+ return 0x0;
+ case 0x1d68:
+ return 0x0;
+ case 0x1d69:
+ return 0x0;
+ case 0x1d6a:
+ return 0x0;
+ case 0x1dc4:
+ return 0x0;
+ case 0x1dc5:
+ return 0x0;
+ case 0x1dc6:
+ return 0x0;
+ case 0x1dc7:
+ return 0x0;
+ case 0x1dc8:
+ return 0x0;
+ case 0x1dc9:
+ return 0x0;
+ case 0x1dca:
+ return 0x0;
+ case 0x1dcb:
+ return 0x0;
+ case 0x1dcc:
+ return 0x0;
+ case 0x1dcd:
+ return 0x0;
+ case 0x1dce:
+ return 0x0;
+ case 0x1dcf:
+ return 0x0;
+ case 0x1df5:
+ return 0x0;
+ case 0x1dfd:
+ return 0x0;
+ case 0x1dfe:
+ return 0x0;
+ case 0x1dff:
+ return 0x0;
case 0x1e00:
return 0x41;
case 0x1e01:
@@ -1575,8 +2697,16 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) {
return 0x391;
case 0x1fbc:
return 0x391;
+ case 0x1fbd:
+ return 0x0;
case 0x1fbe:
return 0x3b9;
+ case 0x1fbf:
+ return 0x0;
+ case 0x1fc0:
+ return 0x0;
+ case 0x1fc1:
+ return 0x0;
case 0x1fc2:
return 0x3b7;
case 0x1fc3:
@@ -1597,6 +2727,12 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) {
return 0x397;
case 0x1fcc:
return 0x397;
+ case 0x1fcd:
+ return 0x0;
+ case 0x1fce:
+ return 0x0;
+ case 0x1fcf:
+ return 0x0;
case 0x1fd0:
return 0x3b9;
case 0x1fd1:
@@ -1617,6 +2753,12 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) {
return 0x399;
case 0x1fdb:
return 0x399;
+ case 0x1fdd:
+ return 0x0;
+ case 0x1fde:
+ return 0x0;
+ case 0x1fdf:
+ return 0x0;
case 0x1fe0:
return 0x3c5;
case 0x1fe1:
@@ -1643,6 +2785,12 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) {
return 0x3a5;
case 0x1fec:
return 0x3a1;
+ case 0x1fed:
+ return 0x0;
+ case 0x1fee:
+ return 0x0;
+ case 0x1fef:
+ return 0x0;
case 0x1ff2:
return 0x3c9;
case 0x1ff3:
@@ -1663,6 +2811,10 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) {
return 0x3a9;
case 0x1ffc:
return 0x3a9;
+ case 0x1ffd:
+ return 0x0;
+ case 0x1ffe:
+ return 0x0;
case 0x2000:
return 0x2002;
case 0x2001:
@@ -1767,6 +2919,26 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) {
return 0x3009;
case 0x2adc:
return 0x2add;
+ case 0x2cef:
+ return 0x0;
+ case 0x2cf0:
+ return 0x0;
+ case 0x2cf1:
+ return 0x0;
+ case 0x2e2f:
+ return 0x0;
+ case 0x302a:
+ return 0x0;
+ case 0x302b:
+ return 0x0;
+ case 0x302c:
+ return 0x0;
+ case 0x302d:
+ return 0x0;
+ case 0x302e:
+ return 0x0;
+ case 0x302f:
+ return 0x0;
case 0x304c:
return 0x304b;
case 0x304e:
@@ -1819,6 +2991,14 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) {
return 0x307b;
case 0x3094:
return 0x3046;
+ case 0x3099:
+ return 0x0;
+ case 0x309a:
+ return 0x0;
+ case 0x309b:
+ return 0x0;
+ case 0x309c:
+ return 0x0;
case 0x309e:
return 0x309d;
case 0x30ac:
@@ -1881,8 +3061,138 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) {
return 0x30f1;
case 0x30fa:
return 0x30f2;
+ case 0x30fc:
+ return 0x0;
case 0x30fe:
return 0x30fd;
+ case 0xa66f:
+ return 0x0;
+ case 0xa67c:
+ return 0x0;
+ case 0xa67d:
+ return 0x0;
+ case 0xa67f:
+ return 0x0;
+ case 0xa69c:
+ return 0x0;
+ case 0xa69d:
+ return 0x0;
+ case 0xa6f0:
+ return 0x0;
+ case 0xa6f1:
+ return 0x0;
+ case 0xa717:
+ return 0x0;
+ case 0xa718:
+ return 0x0;
+ case 0xa719:
+ return 0x0;
+ case 0xa71a:
+ return 0x0;
+ case 0xa71b:
+ return 0x0;
+ case 0xa71c:
+ return 0x0;
+ case 0xa71d:
+ return 0x0;
+ case 0xa71e:
+ return 0x0;
+ case 0xa71f:
+ return 0x0;
+ case 0xa720:
+ return 0x0;
+ case 0xa721:
+ return 0x0;
+ case 0xa788:
+ return 0x0;
+ case 0xa7f8:
+ return 0x0;
+ case 0xa7f9:
+ return 0x0;
+ case 0xa8c4:
+ return 0x0;
+ case 0xa8e0:
+ return 0x0;
+ case 0xa8e1:
+ return 0x0;
+ case 0xa8e2:
+ return 0x0;
+ case 0xa8e3:
+ return 0x0;
+ case 0xa8e4:
+ return 0x0;
+ case 0xa8e5:
+ return 0x0;
+ case 0xa8e6:
+ return 0x0;
+ case 0xa8e7:
+ return 0x0;
+ case 0xa8e8:
+ return 0x0;
+ case 0xa8e9:
+ return 0x0;
+ case 0xa8ea:
+ return 0x0;
+ case 0xa8eb:
+ return 0x0;
+ case 0xa8ec:
+ return 0x0;
+ case 0xa8ed:
+ return 0x0;
+ case 0xa8ee:
+ return 0x0;
+ case 0xa8ef:
+ return 0x0;
+ case 0xa8f0:
+ return 0x0;
+ case 0xa8f1:
+ return 0x0;
+ case 0xa92b:
+ return 0x0;
+ case 0xa92c:
+ return 0x0;
+ case 0xa92d:
+ return 0x0;
+ case 0xa92e:
+ return 0x0;
+ case 0xa953:
+ return 0x0;
+ case 0xa9b3:
+ return 0x0;
+ case 0xa9c0:
+ return 0x0;
+ case 0xa9e5:
+ return 0x0;
+ case 0xaa7b:
+ return 0x0;
+ case 0xaa7c:
+ return 0x0;
+ case 0xaa7d:
+ return 0x0;
+ case 0xaabf:
+ return 0x0;
+ case 0xaac0:
+ return 0x0;
+ case 0xaac1:
+ return 0x0;
+ case 0xaac2:
+ return 0x0;
+ case 0xaaf6:
+ return 0x0;
+ case 0xab5b:
+ return 0x0;
+ case 0xab5c:
+ return 0x0;
+ case 0xab5d:
+ return 0x0;
+ case 0xab5e:
+ return 0x0;
+ case 0xab5f:
+ return 0x0;
+ case 0xabec:
+ return 0x0;
+ case 0xabed:
+ return 0x0;
case 0xf900:
return 0x8c48;
case 0xf901:
@@ -2805,6 +4115,8 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) {
return 0x9f8e;
case 0xfb1d:
return 0x5d9;
+ case 0xfb1e:
+ return 0x0;
case 0xfb1f:
return 0x5f2;
case 0xfb2a:
@@ -2871,12 +4183,246 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) {
return 0x5db;
case 0xfb4e:
return 0x5e4;
+ case 0xfe20:
+ return 0x0;
+ case 0xfe21:
+ return 0x0;
+ case 0xfe22:
+ return 0x0;
+ case 0xfe23:
+ return 0x0;
+ case 0xfe24:
+ return 0x0;
+ case 0xfe25:
+ return 0x0;
+ case 0xfe26:
+ return 0x0;
+ case 0xfe27:
+ return 0x0;
+ case 0xfe28:
+ return 0x0;
+ case 0xfe29:
+ return 0x0;
+ case 0xfe2a:
+ return 0x0;
+ case 0xfe2b:
+ return 0x0;
+ case 0xfe2c:
+ return 0x0;
+ case 0xfe2d:
+ return 0x0;
+ case 0xfe2e:
+ return 0x0;
+ case 0xfe2f:
+ return 0x0;
+ case 0xff3e:
+ return 0x0;
+ case 0xff40:
+ return 0x0;
+ case 0xff70:
+ return 0x0;
+ case 0xff9e:
+ return 0x0;
+ case 0xff9f:
+ return 0x0;
+ case 0xffe3:
+ return 0x0;
+ case 0x102e0:
+ return 0x0;
+ case 0x10ae5:
+ return 0x0;
+ case 0x10ae6:
+ return 0x0;
case 0x1109a:
return 0x11099;
case 0x1109c:
return 0x1109b;
case 0x110ab:
return 0x110a5;
+ case 0x110b9:
+ return 0x0;
+ case 0x110ba:
+ return 0x0;
+ case 0x11133:
+ return 0x0;
+ case 0x11134:
+ return 0x0;
+ case 0x11173:
+ return 0x0;
+ case 0x111c0:
+ return 0x0;
+ case 0x111ca:
+ return 0x0;
+ case 0x111cb:
+ return 0x0;
+ case 0x111cc:
+ return 0x0;
+ case 0x11235:
+ return 0x0;
+ case 0x11236:
+ return 0x0;
+ case 0x112e9:
+ return 0x0;
+ case 0x112ea:
+ return 0x0;
+ case 0x1133c:
+ return 0x0;
+ case 0x1134d:
+ return 0x0;
+ case 0x11366:
+ return 0x0;
+ case 0x11367:
+ return 0x0;
+ case 0x11368:
+ return 0x0;
+ case 0x11369:
+ return 0x0;
+ case 0x1136a:
+ return 0x0;
+ case 0x1136b:
+ return 0x0;
+ case 0x1136c:
+ return 0x0;
+ case 0x11370:
+ return 0x0;
+ case 0x11371:
+ return 0x0;
+ case 0x11372:
+ return 0x0;
+ case 0x11373:
+ return 0x0;
+ case 0x11374:
+ return 0x0;
+ case 0x114c2:
+ return 0x0;
+ case 0x114c3:
+ return 0x0;
+ case 0x115bf:
+ return 0x0;
+ case 0x115c0:
+ return 0x0;
+ case 0x1163f:
+ return 0x0;
+ case 0x116b6:
+ return 0x0;
+ case 0x116b7:
+ return 0x0;
+ case 0x1172b:
+ return 0x0;
+ case 0x16af0:
+ return 0x0;
+ case 0x16af1:
+ return 0x0;
+ case 0x16af2:
+ return 0x0;
+ case 0x16af3:
+ return 0x0;
+ case 0x16af4:
+ return 0x0;
+ case 0x16f8f:
+ return 0x0;
+ case 0x16f90:
+ return 0x0;
+ case 0x16f91:
+ return 0x0;
+ case 0x16f92:
+ return 0x0;
+ case 0x16f93:
+ return 0x0;
+ case 0x16f94:
+ return 0x0;
+ case 0x16f95:
+ return 0x0;
+ case 0x16f96:
+ return 0x0;
+ case 0x16f97:
+ return 0x0;
+ case 0x16f98:
+ return 0x0;
+ case 0x16f99:
+ return 0x0;
+ case 0x16f9a:
+ return 0x0;
+ case 0x16f9b:
+ return 0x0;
+ case 0x16f9c:
+ return 0x0;
+ case 0x16f9d:
+ return 0x0;
+ case 0x16f9e:
+ return 0x0;
+ case 0x16f9f:
+ return 0x0;
+ case 0x1d167:
+ return 0x0;
+ case 0x1d168:
+ return 0x0;
+ case 0x1d169:
+ return 0x0;
+ case 0x1d16d:
+ return 0x0;
+ case 0x1d16e:
+ return 0x0;
+ case 0x1d16f:
+ return 0x0;
+ case 0x1d170:
+ return 0x0;
+ case 0x1d171:
+ return 0x0;
+ case 0x1d172:
+ return 0x0;
+ case 0x1d17b:
+ return 0x0;
+ case 0x1d17c:
+ return 0x0;
+ case 0x1d17d:
+ return 0x0;
+ case 0x1d17e:
+ return 0x0;
+ case 0x1d17f:
+ return 0x0;
+ case 0x1d180:
+ return 0x0;
+ case 0x1d181:
+ return 0x0;
+ case 0x1d182:
+ return 0x0;
+ case 0x1d185:
+ return 0x0;
+ case 0x1d186:
+ return 0x0;
+ case 0x1d187:
+ return 0x0;
+ case 0x1d188:
+ return 0x0;
+ case 0x1d189:
+ return 0x0;
+ case 0x1d18a:
+ return 0x0;
+ case 0x1d18b:
+ return 0x0;
+ case 0x1d1aa:
+ return 0x0;
+ case 0x1d1ab:
+ return 0x0;
+ case 0x1d1ac:
+ return 0x0;
+ case 0x1d1ad:
+ return 0x0;
+ case 0x1e8d0:
+ return 0x0;
+ case 0x1e8d1:
+ return 0x0;
+ case 0x1e8d2:
+ return 0x0;
+ case 0x1e8d3:
+ return 0x0;
+ case 0x1e8d4:
+ return 0x0;
+ case 0x1e8d5:
+ return 0x0;
+ case 0x1e8d6:
+ return 0x0;
case 0x2f800:
return 0x4e3d;
case 0x2f801:
diff --git a/src/mongo/db/fts/unicode/codepoints_test.cpp b/src/mongo/db/fts/unicode/codepoints_test.cpp
index 90510666cba..a68c41688f6 100644
--- a/src/mongo/db/fts/unicode/codepoints_test.cpp
+++ b/src/mongo/db/fts/unicode/codepoints_test.cpp
@@ -32,6 +32,8 @@
namespace mongo {
namespace unicode {
+const char32_t maxCP = 0x1FFFFF; // Highest valid codepoint.
+
/**
* Above most of the arrays in this class are the UTF-32 character literals that correspond to the
* codepoints in the array.
@@ -39,14 +41,19 @@ namespace unicode {
TEST(UnicodeCodepoints, Diacritics) {
// There are no character literals for combining marks.
- const char32_t marks[] = {0x0301, 0x0339, 0x1AB4, 0x1DC5, 0xA69D};
+ const char32_t marks[] = {'^', '`', 0x0301, 0x0339, 0x1AB4, 0x1DC5, 0xA69D};
// const char32_t not_marks[] = {U'-', U'.', U'\'', U'*', U'm'};
const char32_t not_marks[] = {0x2D, 0x2E, 0x27, 0x2A, 0x6D};
- for (auto i = 0; i < 5; ++i) {
- ASSERT(codepointIsDiacritic(marks[i]));
- ASSERT_FALSE(codepointIsDiacritic(not_marks[i]));
+ for (auto cp : marks) {
+ ASSERT(codepointIsDiacritic(cp));
+ ASSERT_EQ(codepointRemoveDiacritics(cp), char32_t(0));
+ }
+
+ for (auto cp : not_marks) {
+ ASSERT(!codepointIsDiacritic(cp));
+ ASSERT_NE(codepointRemoveDiacritics(cp), char32_t(0));
}
}
@@ -77,6 +84,10 @@ TEST(UnicodeCodepoints, RemoveDiacritics) {
for (auto i = 0; i < 5; ++i) {
ASSERT_EQUALS(clean[i], codepointRemoveDiacritics(originals[i]));
}
+
+ for (char32_t cp = 0; cp <= maxCP; cp++) {
+ ASSERT_EQ(codepointRemoveDiacritics(cp) == 0, cp == 0 || codepointIsDiacritic(cp));
+ }
}
TEST(UnicodeCodepoints, ToLower) {
@@ -90,5 +101,18 @@ TEST(UnicodeCodepoints, ToLower) {
}
}
+TEST(UnicodeCodepoints, ToLowerIsFixedPoint) {
+ for (char32_t cp = 0; cp <= maxCP; cp++) {
+ ASSERT_EQ(codepointToLower(cp), codepointToLower(codepointToLower(cp)));
+ }
+}
+
+TEST(UnicodeCodepoints, RemoveDiacriticsIsFixedPoint) {
+ for (char32_t cp = 0; cp <= maxCP; cp++) {
+ ASSERT_EQ(codepointRemoveDiacritics(cp),
+ codepointRemoveDiacritics(codepointRemoveDiacritics(cp)));
+ }
+}
+
} // namespace unicode
} // namespace mongo
diff --git a/src/mongo/db/fts/unicode/gen_casefold_map.py b/src/mongo/db/fts/unicode/gen_casefold_map.py
index 7ab9b49f7a4..61249e5f0d7 100644
--- a/src/mongo/db/fts/unicode/gen_casefold_map.py
+++ b/src/mongo/db/fts/unicode/gen_casefold_map.py
@@ -7,11 +7,11 @@ from gen_helper import getCopyrightNotice, openNamespaces, closeNamespaces, \
include
def generate(unicode_casefold_file, target):
- """Generates a C++ source file that contains a Unicode case folding
+ """Generates a C++ source file that contains a Unicode case folding
function.
The case folding function contains a switch statement with cases for every
- Unicode codepoint that has a case folding mapping.
+ Unicode codepoint that has a case folding mapping.
"""
out = open(target, "w")
@@ -43,29 +43,46 @@ def generate(unicode_casefold_file, target):
codepoint_mapping = int(values[2], 16)
case_mappings[original_codepoint] = codepoint_mapping
- out.write("""char32_t codepointToLower(char32_t codepoint, CaseFoldMode \
-mode) {
- if (mode == CaseFoldMode::kTurkish) {
- if (codepoint == 0x049) { // I -> ı
- return 0x131;
- } else if (codepoint == 0x130) { // İ -> i
- return 0x069;
- }
+ turkishMapping = {
+ 0x49: 0x131, # I -> ı
+ 0x130: 0x069, # İ -> i
}
- switch (codepoint) {\n""")
+ out.write(
+ """char32_t codepointToLower(char32_t codepoint, CaseFoldMode mode) {
+ if (codepoint <= 0x7f) {
+ if (codepoint >= 'A' && codepoint <= 'Z') {
+ return (mode == CaseFoldMode::kTurkish && codepoint == 'I')
+ ? 0x131
+ : (codepoint | 0x20); // Set the ascii lowercase bit on the character.
+ }
+ return codepoint;
+ }
+
+ switch (codepoint) {\n""")
mappings_list = []
for mapping in case_mappings:
mappings_list.append((mapping, case_mappings[mapping]))
+ # Make sure we include each mapping in turkishMapping in the cases below. This ensures we handle
+ # them even if we'd skip the letter in non-turkish mode.
+ for mapping in turkishMapping:
+ if mapping not in case_mappings:
+ mappings_list.append((mapping, mapping))
+
sorted_mappings = sorted(mappings_list, key=lambda mapping: mapping[0])
for mapping in sorted_mappings:
- out.write("\
- case " + str(hex(mapping[0])) + ": return " + \
- str(hex(mapping[1])) +";\n")
+ if mapping[0] <= 0x7f:
+ continue # ascii is special cased above.
+
+ if mapping[0] in turkishMapping:
+ out.write("case 0x%x: return mode == CaseFoldMode::kTurkish ? 0x%x : 0x%x;\n"
+ % (mapping[0], turkishMapping[mapping[0]], mapping[1]))
+ else:
+ out.write("case 0x%x: return 0x%x;\n"%mapping)
out.write("\
default: return codepoint;\n }\n}")
diff --git a/src/mongo/db/fts/unicode/gen_diacritic_map.py b/src/mongo/db/fts/unicode/gen_diacritic_map.py
index 84f57e82a3c..08cfa95cda2 100644
--- a/src/mongo/db/fts/unicode/gen_diacritic_map.py
+++ b/src/mongo/db/fts/unicode/gen_diacritic_map.py
@@ -55,6 +55,8 @@ def add_diacritic_mapping(codepoint):
# Only use mappings where the final recomposed form is a single codepoint
if (a != c and len(c) == 1):
+ assert c != '\0' # This is used to indicate the codepoint is a pure diacritic.
+ assert ord(c) not in diacritics
diacritic_mappings[codepoint] = ord(c[0])
def add_diacritic_range(start, end):
@@ -78,6 +80,9 @@ def generate(target):
# Map diacritics from 0 to the maximum Unicode codepoint
add_diacritic_range(0x0000, 0x10FFFF)
+ for diacritic in diacritics:
+ diacritic_mappings[diacritic] = 0
+
out.write("""char32_t codepointRemoveDiacritics(char32_t codepoint) {
switch (codepoint) {\n""")
diff --git a/src/mongo/db/fts/unicode/string.cpp b/src/mongo/db/fts/unicode/string.cpp
index 9c18a776787..0ea37d8f33f 100644
--- a/src/mongo/db/fts/unicode/string.cpp
+++ b/src/mongo/db/fts/unicode/string.cpp
@@ -29,7 +29,9 @@
#include "mongo/db/fts/unicode/string.h"
#include <algorithm>
+#include <boost/algorithm/searching/boyer_moore.hpp>
+#include "mongo/platform/bits.h"
#include "mongo/shell/linenoise_utf8.h"
#include "mongo/util/assert_util.h"
@@ -42,10 +44,6 @@ using linenoise_utf8::copyString8to32;
using std::u32string;
String::String(const StringData utf8_src) {
- // Reserve space for underlying buffers to prevent excessive reallocations.
- _outputBuf.reserve(utf8_src.size() * 4);
- _data.reserve(utf8_src.size() * 4);
-
// Convert UTF-8 input to UTF-32 data.
setData(utf8_src);
}
@@ -79,12 +77,6 @@ void String::setData(const StringData utf8_src) {
_needsOutputConversion = true;
}
-String::String(u32string&& src) : _data(std::move(src)), _needsOutputConversion(true) {
- // Reserve space for underlying buffers to prevent excessive reallocations.
- _outputBuf.reserve(src.size() * 4);
- _data.reserve(src.size() * 4);
-}
-
std::string String::toString() {
// _outputBuf is the target, resize it so that it's guaranteed to fit all of the input
// characters, plus a null character if there isn't one.
@@ -100,14 +92,6 @@ std::string String::toString() {
return _outputBuf;
}
-size_t String::size() const {
- return _data.size();
-}
-
-const char32_t& String::operator[](int i) const {
- return _data[i];
-}
-
String String::substr(size_t pos, size_t len) const {
unicode::String buf;
substrToBuf(pos, len, buf);
@@ -147,63 +131,133 @@ void String::substrToBuf(size_t pos, size_t len, String& buffer) const {
void String::toLowerToBuf(CaseFoldMode mode, String& buffer) const {
buffer._data.resize(_data.size());
- auto index = 0;
+ auto outIt = buffer._data.begin();
for (auto codepoint : _data) {
- buffer._data[index++] = codepointToLower(codepoint, mode);
+ *outIt++ = codepointToLower(codepoint, mode);
}
buffer._needsOutputConversion = true;
}
void String::removeDiacriticsToBuf(String& buffer) const {
buffer._data.resize(_data.size());
- auto index = 0;
+ auto outIt = buffer._data.begin();
for (auto codepoint : _data) {
- if (!codepointIsDiacritic(codepoint)) {
- buffer._data[index++] = codepointRemoveDiacritics(codepoint);
+ if (codepoint <= 0x7f) {
+ // ASCII only has two diacritics so they are hard-coded here.
+ if (codepoint != '^' && codepoint != '`') {
+ *outIt++ = codepoint;
+ }
+ } else if (auto clean = codepointRemoveDiacritics(codepoint)) {
+ *outIt++ = clean;
+ } else {
+ // codepoint was a pure diacritic mark, so skip it.
}
}
- buffer._data.resize(index);
+ buffer._data.resize(outIt - buffer._data.begin());
buffer._needsOutputConversion = true;
}
-bool String::substrMatch(const String& str,
- const String& find,
+std::pair<std::unique_ptr<char[]>, char*> String::prepForSubstrMatch(StringData utf8,
+ SubstrMatchOptions options,
+ CaseFoldMode mode) {
+ // This function should only be called when casefolding or stripping diacritics.
+ dassert(!(options & kCaseSensitive) || !(options & kDiacriticSensitive));
+
+ // Allocate space for up to 2x growth which is the worst possible case for stripping diacritics
+ // and casefolding. Proof: the only case where 1 byte goes to >1 is 'I' in Turkish going to 2
+ // bytes. The biggest codepoint is 4 bytes which is also 2x 2 bytes. This holds as long as we
+ // don't map a single code point to more than one.
+ std::unique_ptr<char[]> buffer(new char[utf8.size() * 2]);
+ auto outputIt = buffer.get();
+
+ for (auto inputIt = utf8.begin(), endIt = utf8.end(); inputIt != endIt;) {
+ const uint8_t firstByte = *inputIt++;
+ char32_t codepoint = 0;
+ if (firstByte <= 0x7f) {
+ // ASCII special case. Can use faster operations.
+ if ((!(options & kCaseSensitive)) && (firstByte >= 'A' && firstByte <= 'Z')) {
+ codepoint = (mode == CaseFoldMode::kTurkish && firstByte == 'I')
+ ? 0x131 // In Turkish, I -> ı (i with no dot).
+ : (firstByte | 0x20); // Set the ascii lowercase bit on the character.
+ } else {
+ // ASCII has two pure diacritics that should be skipped and no characters that
+ // change when removing diacritics.
+ if ((options & kDiacriticSensitive) || !(firstByte == '^' || firstByte == '`')) {
+ *outputIt++ = (firstByte);
+ }
+ continue;
+ }
+ } else {
+ // firstByte indicates that it is not an ASCII char.
+ int leadingOnes = countLeadingZeros64(~(uint64_t(firstByte) << (64 - 8)));
+
+ // Only checking enough to ensure that this code doesn't crash in the face of malformed
+ // utf-8. We make no guarantees about what results will be returned in this case.
+ uassert(ErrorCodes::BadValue,
+ "text contains invalid UTF-8",
+ leadingOnes > 1 && leadingOnes <= 4 && inputIt + leadingOnes - 1 <= endIt);
+
+ codepoint = firstByte & (0xff >> leadingOnes); // mask off the size indicator.
+ for (int subByteIx = 1; subByteIx < leadingOnes; subByteIx++) {
+ const uint8_t subByte = *inputIt++;
+ codepoint <<= 6;
+ codepoint |= subByte & 0x3f; // mask off continuation bits.
+ }
+
+ if (!(options & kCaseSensitive)) {
+ codepoint = codepointToLower(codepoint, mode);
+ }
+
+ if (!(options & kDiacriticSensitive)) {
+ codepoint = codepointRemoveDiacritics(codepoint);
+ if (!codepoint)
+ continue; // codepoint is a pure diacritic.
+ }
+ }
+
+ // Back to utf-8.
+ if (codepoint <= 0x7f /* max 1-byte codepoint */) {
+ *outputIt++ = (codepoint);
+ } else if (codepoint <= 0x7ff /* max 2-byte codepoint*/) {
+ *outputIt++ = ((codepoint >> (6 * 1)) | 0xc0); // 2 leading 1s.
+ *outputIt++ = (((codepoint >> (6 * 0)) & 0x3f) | 0x80);
+ } else if (codepoint <= 0xffff /* max 3-byte codepoint*/) {
+ *outputIt++ = ((codepoint >> (6 * 2)) | 0xe0); // 3 leading 1s.
+ *outputIt++ = (((codepoint >> (6 * 1)) & 0x3f) | 0x80);
+ *outputIt++ = (((codepoint >> (6 * 0)) & 0x3f) | 0x80);
+ } else {
+ *outputIt++ = ((codepoint >> (6 * 3)) | 0xf0); // 4 leading 1s.
+ *outputIt++ = (((codepoint >> (6 * 2)) & 0x3f) | 0x80);
+ *outputIt++ = (((codepoint >> (6 * 1)) & 0x3f) | 0x80);
+ *outputIt++ = (((codepoint >> (6 * 0)) & 0x3f) | 0x80);
+ }
+ }
+
+ return {std::move(buffer), outputIt};
+}
+
+bool String::substrMatch(const std::string& str,
+ const std::string& find,
SubstrMatchOptions options,
CaseFoldMode cfMode) {
- // In Turkish, lowercasing needs to be applied first because the letter İ has a different case
- // folding mapping than the letter I, but removing diacritics removes the dot from İ.
if (cfMode == CaseFoldMode::kTurkish) {
- String cleanStr = str.toLower(cfMode);
- String cleanFind = find.toLower(cfMode);
- return substrMatch(cleanStr, cleanFind, options | kCaseSensitive, CaseFoldMode::kNormal);
+ // Turkish comparisons are always case insensitive due to their handling of I/i.
+ options &= ~kCaseSensitive;
}
- if (options & kDiacriticSensitive) {
- if (options & kCaseSensitive) {
- // Case sensitive and diacritic sensitive.
- return std::search(str._data.cbegin(),
- str._data.cend(),
- find._data.cbegin(),
- find._data.cend(),
- [&](char32_t c1, char32_t c2) { return (c1 == c2); }) !=
- str._data.cend();
- }
-
- // Case insensitive and diacritic sensitive.
- return std::search(str._data.cbegin(),
- str._data.cend(),
- find._data.cbegin(),
- find._data.cend(),
- [&](char32_t c1, char32_t c2) {
- return (codepointToLower(c1, cfMode) ==
- codepointToLower(c2, cfMode));
- }) != str._data.cend();
+ if ((options & kCaseSensitive) && (options & kDiacriticSensitive)) {
+ // No transformation needed. Just do the search on the input strings.
+ return boost::algorithm::boyer_moore_search(
+ str.cbegin(), str.cend(), find.cbegin(), find.cend()) != str.cend();
}
- String cleanStr = str.removeDiacritics();
- String cleanFind = find.removeDiacritics();
+ auto haystack = prepForSubstrMatch(str, options, cfMode);
+ auto needle = prepForSubstrMatch(find, options, cfMode);
- return substrMatch(cleanStr, cleanFind, options | kDiacriticSensitive, cfMode);
+ // Case sensitive and diacritic sensitive.
+ return boost::algorithm::boyer_moore_search(
+ haystack.first.get(), haystack.second, needle.first.get(), needle.second) !=
+ haystack.second;
}
} // namespace unicode
diff --git a/src/mongo/db/fts/unicode/string.h b/src/mongo/db/fts/unicode/string.h
index c3355ee4f25..1ba6e46c27c 100644
--- a/src/mongo/db/fts/unicode/string.h
+++ b/src/mongo/db/fts/unicode/string.h
@@ -29,6 +29,7 @@
#pragma once
#include <cstdint>
+#include <memory>
#include <string>
#include "mongo/base/string_data.h"
@@ -110,12 +111,16 @@ public:
/**
* Returns the number Unicode codepoints in the String.
*/
- size_t size() const;
+ size_t size() const {
+ return _data.size();
+ }
/**
* Returns the Unicode codepoint at index i of the String.
*/
- const char32_t& operator[](int i) const;
+ const char32_t& operator[](int i) const {
+ return _data[i];
+ }
/**
* Options for the substrMatch method.
@@ -143,18 +148,22 @@ public:
* the search is case insensitive, non-Turkish case folding is used unless the
* CaseFoldMode::Turkish is passed to mode.
*/
- static bool substrMatch(const String& str,
- const String& find,
+ static bool substrMatch(const std::string& str,
+ const std::string& find,
SubstrMatchOptions options,
CaseFoldMode mode = CaseFoldMode::kNormal);
-private:
/**
- * Private constructor used by substr, toLower, and removeDiacritics to build a String from
- * UTF-32 data.
+ * Strips diacritics and case-folds the utf8 input string, as needed to support options.
+ *
+ * Returns an owned buffer containing the output utf8 string and an end iterator for the string
+ * (points at the first byte after the string).
*/
- String(std::u32string&& src);
+ static std::pair<std::unique_ptr<char[]>, char*> prepForSubstrMatch(StringData utf8,
+ SubstrMatchOptions options,
+ CaseFoldMode mode);
+private:
/**
* Helper method for converting a UTF-8 string to a UTF-32 string.
*/
diff --git a/src/mongo/db/fts/unicode/string_test.cpp b/src/mongo/db/fts/unicode/string_test.cpp
index 9354f7bf25c..8dec9a7e8df 100644
--- a/src/mongo/db/fts/unicode/string_test.cpp
+++ b/src/mongo/db/fts/unicode/string_test.cpp
@@ -47,82 +47,146 @@ namespace unicode {
using linenoise_utf8::copyString32to8;
+namespace {
+// Added to the end of strings to ensure the real content is tested with the vectored
+// implementation.
+const std::string filler(32, 'x');
+
+/**
+ * Converts return from prepForSubstrMatch to a StringData that is only useful in the current
+ * expression.
+ */
+StringData toStringData(std::pair<std::unique_ptr<char[]>, char*> result) {
+ return {result.first.get(), size_t(result.second - result.first.get())};
+}
+
+auto kDiacriticSensitive = String::kDiacriticSensitive;
+auto kCaseSensitive = String::kCaseSensitive;
+
+auto kTurkish = CaseFoldMode::kTurkish;
+auto kNormal = CaseFoldMode::kNormal;
+}
+
+
+// Macro to preserve line numbers and arguments in error messages.
+#define TEST_PREP_FOR_SUBSTR_MATCH(expected, input, options, caseFoldMode) \
+ ASSERT_EQ(expected, toStringData(String::prepForSubstrMatch(input, options, caseFoldMode))); \
+ ASSERT_EQ(expected + filler, \
+ toStringData(String::prepForSubstrMatch(input + filler, options, caseFoldMode)))
+
TEST(UnicodeString, RemoveDiacritics) {
+ // Test all ascii chars.
+ for (unsigned char ch = 0; ch <= 0x7F; ch++) {
+ const auto input = std::string(1, ch);
+ const auto output = codepointIsDiacritic(ch) ? std::string() : std::string(1, ch);
+ if (ch) { // String's constructor doesn't handle embedded NUL bytes.
+ ASSERT_EQUALS(output, String(input).removeDiacritics().toString());
+ }
+ TEST_PREP_FOR_SUBSTR_MATCH(output, input, kCaseSensitive, kNormal);
+ }
+
// NFC Normalized Text.
- String test1 = String(UTF8("¿CUÁNTOS AÑOS TIENES TÚ?"));
+ const char test1[] = UTF8("¿CUÁNTOS AÑOS TIENES TÚ?");
// NFD Normalized Text ("Café").
const char test2[] = {'C', 'a', 'f', 'e', static_cast<char>(0xcc), static_cast<char>(0x81), 0};
- ASSERT_EQUALS(UTF8("¿CUANTOS ANOS TIENES TU?"), test1.removeDiacritics().toString());
+ ASSERT_EQUALS(UTF8("¿CUANTOS ANOS TIENES TU?"), String(test1).removeDiacritics().toString());
ASSERT_EQUALS(UTF8("Cafe"), String(test2).removeDiacritics().toString());
+
+ TEST_PREP_FOR_SUBSTR_MATCH(UTF8("¿CUANTOS ANOS TIENES TU?"), test1, kCaseSensitive, kNormal);
+ TEST_PREP_FOR_SUBSTR_MATCH(UTF8("Cafe"), test2, kCaseSensitive, kNormal);
}
TEST(UnicodeString, CaseFolding) {
- String test1 = String(UTF8("СКОЛЬКО ТЕБЕ ЛЕТ?"));
- String test2 = String(UTF8("¿CUÁNTOS AÑOS TIENES TÚ?"));
+ // Test all ascii chars.
+ for (unsigned char ch = 0; ch <= 0x7F; ch++) {
+ const auto upper = std::string(1, ch);
+ const auto lower = std::string(1, std::tolower(ch));
+ if (ch) { // String's constructor doesn't handle embedded NUL bytes.
+ ASSERT_EQUALS(lower, String(upper).toLower().toString());
+ }
+ TEST_PREP_FOR_SUBSTR_MATCH(lower, upper, kDiacriticSensitive, kNormal);
+ }
+
+ const char test1[] = UTF8("СКОЛЬКО ТЕБЕ ЛЕТ?");
+ const char test2[] = UTF8("¿CUÁNTOS AÑOS TIENES TÚ?");
- ASSERT_EQUALS(UTF8("сколько тебе лет?"), test1.toLower().toString());
- ASSERT_EQUALS(UTF8("¿cuántos años tienes tú?"), test2.toLower().toString());
+ ASSERT_EQUALS(UTF8("сколько тебе лет?"), String(test1).toLower().toString());
+ ASSERT_EQUALS(UTF8("¿cuántos años tienes tú?"), String(test2).toLower().toString());
+
+ TEST_PREP_FOR_SUBSTR_MATCH(UTF8("сколько тебе лет?"), test1, kDiacriticSensitive, kNormal);
+ TEST_PREP_FOR_SUBSTR_MATCH(
+ UTF8("¿cuántos años tienes tú?"), test2, kDiacriticSensitive, kNormal);
}
TEST(UnicodeString, CaseFoldingTurkish) {
- String test1 = String(UTF8("KAC YASINDASINIZ"));
- String test2 = String(UTF8("KAC YASİNDASİNİZ"));
+ const char test1[] = UTF8("KAC YASINDASINIZ");
+ const char test2[] = UTF8("KAC YASİNDASİNİZ");
+
+ ASSERT_EQUALS(UTF8("kac yasındasınız"),
+ String(test1).toLower(CaseFoldMode::kTurkish).toString());
+ ASSERT_EQUALS(UTF8("kac yasindasiniz"),
+ String(test2).toLower(CaseFoldMode::kTurkish).toString());
- ASSERT_EQUALS(UTF8("kac yasındasınız"), test1.toLower(CaseFoldMode::kTurkish).toString());
- ASSERT_EQUALS(UTF8("kac yasindasiniz"), test2.toLower(CaseFoldMode::kTurkish).toString());
+ TEST_PREP_FOR_SUBSTR_MATCH(UTF8("kac yasındasınız"), test1, kDiacriticSensitive, kTurkish);
+ TEST_PREP_FOR_SUBSTR_MATCH(UTF8("kac yasindasiniz"), test2, kDiacriticSensitive, kTurkish);
}
TEST(UnicodeString, CaseFoldingAndRemoveDiacritics) {
// NFC Normalized Text.
- String test1 = String(UTF8("Πόσο χρονών είσαι?"));
- String test2 = String(UTF8("¿CUÁNTOS AÑOS TIENES TÚ?"));
+ const char test1[] = UTF8("Πόσο χρονών είσαι?");
+ const char test2[] = UTF8("¿CUÁNTOS AÑOS TIENES TÚ?");
// NFD Normalized Text ("CAFÉ").
const char test3[] = {'C', 'A', 'F', 'E', static_cast<char>(0xcc), static_cast<char>(0x81), 0};
- ASSERT_EQUALS(UTF8("ποσο χρονων εισαι?"), test1.toLower().removeDiacritics().toString());
- ASSERT_EQUALS(UTF8("¿cuantos anos tienes tu?"), test2.toLower().removeDiacritics().toString());
+ ASSERT_EQUALS(UTF8("ποσο χρονων εισαι?"),
+ String(test1).toLower().removeDiacritics().toString());
+ ASSERT_EQUALS(UTF8("¿cuantos anos tienes tu?"),
+ String(test2).toLower().removeDiacritics().toString());
ASSERT_EQUALS(UTF8("cafe"), String(test3).toLower().removeDiacritics().toString());
+
+ TEST_PREP_FOR_SUBSTR_MATCH(UTF8("ποσο χρονων εισαι?"), test1, 0, kNormal);
+ TEST_PREP_FOR_SUBSTR_MATCH(UTF8("¿cuantos anos tienes tu?"), test2, 0, kNormal);
+ TEST_PREP_FOR_SUBSTR_MATCH(UTF8("cafe"), test3, 0, kNormal);
}
TEST(UnicodeString, SubstringMatch) {
- String str = String(UTF8("Одумайся! Престол свой сохрани; И ярость укроти."));
+ std::string str = UTF8("Одумайся! Престол свой сохрани; И ярость укроти.");
// Case insensitive & diacritic insensitive.
- ASSERT(String::substrMatch(str, String(UTF8("ПРЁСТОЛ СВОИ")), String::kNone));
- ASSERT_FALSE(String::substrMatch(str, String(UTF8("Престол сохрани")), String::kNone));
+ ASSERT(String::substrMatch(str, UTF8("ПРЁСТОЛ СВОИ"), String::kNone));
+ ASSERT_FALSE(String::substrMatch(str, UTF8("Престол сохрани"), String::kNone));
// Case sensitive & diacritic insensitive.
- ASSERT(String::substrMatch(str, String(UTF8("Одумаися!")), String::kCaseSensitive));
- ASSERT_FALSE(String::substrMatch(str, String(UTF8("одумайся!")), String::kCaseSensitive));
+ ASSERT(String::substrMatch(str, UTF8("Одумаися!"), String::kCaseSensitive));
+ ASSERT_FALSE(String::substrMatch(str, UTF8("одумайся!"), String::kCaseSensitive));
// Case insensitive & diacritic sensitive.
- ASSERT(String::substrMatch(str, String(UTF8("одумайся!")), String::kDiacriticSensitive));
- ASSERT_FALSE(String::substrMatch(str, String(UTF8("Одумаися!")), String::kDiacriticSensitive));
+ ASSERT(String::substrMatch(str, UTF8("одумайся!"), String::kDiacriticSensitive));
+ ASSERT_FALSE(String::substrMatch(str, UTF8("Одумаися!"), String::kDiacriticSensitive));
// Case sensitive & diacritic sensitive.
ASSERT(String::substrMatch(
- str, String(UTF8("Одумайся!")), String::kDiacriticSensitive | String::kCaseSensitive));
+ str, UTF8("Одумайся!"), String::kDiacriticSensitive | String::kCaseSensitive));
ASSERT_FALSE(String::substrMatch(
- str, String(UTF8("Одумаися!")), String::kDiacriticSensitive | String::kCaseSensitive));
+ str, UTF8("Одумаися!"), String::kDiacriticSensitive | String::kCaseSensitive));
}
TEST(UnicodeString, SubstringMatchTurkish) {
- String str = String(UTF8("KAÇ YAŞINDASINIZ?"));
+ std::string str = UTF8("KAÇ YAŞINDASINIZ?");
// Case insensitive & diacritic insensitive.
- ASSERT(String::substrMatch(
- str, String(UTF8("yasındasınız")), String::kNone, CaseFoldMode::kTurkish));
- ASSERT_FALSE(String::substrMatch(
- str, String(UTF8("yasindasiniz")), String::kNone, CaseFoldMode::kTurkish));
+ ASSERT(String::substrMatch(str, UTF8("yasındasınız"), String::kNone, CaseFoldMode::kTurkish));
+ ASSERT_FALSE(
+ String::substrMatch(str, UTF8("yasindasiniz"), String::kNone, CaseFoldMode::kTurkish));
// Case insensitive & diacritic sensitive.
ASSERT(String::substrMatch(
- str, String(UTF8("yaşındasınız")), String::kDiacriticSensitive, CaseFoldMode::kTurkish));
+ str, UTF8("yaşındasınız"), String::kDiacriticSensitive, CaseFoldMode::kTurkish));
ASSERT_FALSE(String::substrMatch(
- str, String(UTF8("yaşindasiniz")), String::kDiacriticSensitive, CaseFoldMode::kTurkish));
+ str, UTF8("yaşindasiniz"), String::kDiacriticSensitive, CaseFoldMode::kTurkish));
}
TEST(UnicodeString, BadUTF8) {
@@ -153,6 +217,14 @@ TEST(UnicodeString, BadUTF8) {
ASSERT_THROWS(String test2(invalid2), AssertionException);
ASSERT_THROWS(String test3(invalid3), AssertionException);
ASSERT_THROWS(String test4(invalid4), AssertionException);
+
+ // preForSubstrMatch doesn't make any guarantees about behavior when fed invalid utf8.
+ // These calls are to ensure that they don't trigger any faults in sanitizing builds.
+ String::prepForSubstrMatch(invalid1, 0, kNormal);
+ String::prepForSubstrMatch(invalid2, 0, kNormal);
+ String::prepForSubstrMatch(invalid3, 0, kNormal);
+
+ ASSERT_THROWS(String::prepForSubstrMatch(invalid4, 0, kNormal), AssertionException);
}
TEST(UnicodeString, UTF32ToUTF8) {