SERVER-19936 Optimize FTS v3 phrase matching

Major changes: * Use Booyer-Moore algorithm for searching rather than std::search * All strings are kept in UTF8 rather than going to UTF32. * Case folding and diacritic removal are done in a single pass. * Optimize case folding and diacritic removal for ASCII codepoints. * Combine functionality of codepointIsDiacritic() into codepointRemoveDiacritics()
author: Mathias Stearn <mathias@10gen.com> 2016-02-24 16:35:54 -0500
committer: Mathias Stearn <mathias@10gen.com> 2016-03-11 08:50:18 -0500
commit: 6c3157f126bb44ab275325e85de7abee5ce9ad6d (patch)
tree: 16189ba18d0febfdd564006c1f03f008527f4f41 /src/mongo/db/fts/unicode
parent: 4a35c7184e188354793f16d27e2330b3b5ce7f8f (diff)
download: mongo-6c3157f126bb44ab275325e85de7abee5ce9ad6d.tar.gz
8 files changed, 1841 insertions, 110 deletions
diff --git a/src/mongo/db/fts/unicode/codepoints.h b/src/mongo/db/fts/unicode/codepoints.h
index 5b1e8e2b2b5..70520a00106 100644
--- a/src/mongo/db/fts/unicode/codepoints.h
+++ b/src/mongo/db/fts/unicode/codepoints.h
@@ -73,6 +73,10 @@ bool codepointIsDelimiter(char32_t codepoint, DelimiterListLanguage lang);
  * which code blocks are used), decomposing them to the NFD normalization form, removing any
  * combining marks, and renormalizing them to the NFC form. The result is a mapping from original
  * codepoint to a codepoint with no diacritics.
+ *
+ * Returns 0 if codepoint is a pure diacritic mark (ie if codepointIsDiacritic() would return true).
+ * You will need to distinguish this case from the input codepoint being 0 either by explicit
+ * checking or avoiding a call to this function if codepoint is in the ASCII range (<0x80).
  */
 char32_t codepointRemoveDiacritics(char32_t codepoint);
 
diff --git a/src/mongo/db/fts/unicode/codepoints_diacritic_map.cpp b/src/mongo/db/fts/unicode/codepoints_diacritic_map.cpp
index c6b39d328b8..33893b135cd 100644
--- a/src/mongo/db/fts/unicode/codepoints_diacritic_map.cpp
+++ b/src/mongo/db/fts/unicode/codepoints_diacritic_map.cpp
@@ -35,6 +35,20 @@ namespace unicode {
 
 char32_t codepointRemoveDiacritics(char32_t codepoint) {
     switch (codepoint) {
+        case 0x5e:
+            return 0x0;
+        case 0x60:
+            return 0x0;
+        case 0xa8:
+            return 0x0;
+        case 0xaf:
+            return 0x0;
+        case 0xb4:
+            return 0x0;
+        case 0xb7:
+            return 0x0;
+        case 0xb8:
+            return 0x0;
         case 0xc0:
             return 0x41;
         case 0xc1:
@@ -539,8 +553,364 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) {
             return 0x59;
         case 0x233:
             return 0x79;
+        case 0x2b0:
+            return 0x0;
+        case 0x2b1:
+            return 0x0;
+        case 0x2b2:
+            return 0x0;
+        case 0x2b3:
+            return 0x0;
+        case 0x2b4:
+            return 0x0;
+        case 0x2b5:
+            return 0x0;
+        case 0x2b6:
+            return 0x0;
+        case 0x2b7:
+            return 0x0;
+        case 0x2b8:
+            return 0x0;
+        case 0x2b9:
+            return 0x0;
+        case 0x2ba:
+            return 0x0;
+        case 0x2bb:
+            return 0x0;
+        case 0x2bc:
+            return 0x0;
+        case 0x2bd:
+            return 0x0;
+        case 0x2be:
+            return 0x0;
+        case 0x2bf:
+            return 0x0;
+        case 0x2c0:
+            return 0x0;
+        case 0x2c1:
+            return 0x0;
+        case 0x2c2:
+            return 0x0;
+        case 0x2c3:
+            return 0x0;
+        case 0x2c4:
+            return 0x0;
+        case 0x2c5:
+            return 0x0;
+        case 0x2c6:
+            return 0x0;
+        case 0x2c7:
+            return 0x0;
+        case 0x2c8:
+            return 0x0;
+        case 0x2c9:
+            return 0x0;
+        case 0x2ca:
+            return 0x0;
+        case 0x2cb:
+            return 0x0;
+        case 0x2cc:
+            return 0x0;
+        case 0x2cd:
+            return 0x0;
+        case 0x2ce:
+            return 0x0;
+        case 0x2cf:
+            return 0x0;
+        case 0x2d0:
+            return 0x0;
+        case 0x2d1:
+            return 0x0;
+        case 0x2d2:
+            return 0x0;
+        case 0x2d3:
+            return 0x0;
+        case 0x2d4:
+            return 0x0;
+        case 0x2d5:
+            return 0x0;
+        case 0x2d6:
+            return 0x0;
+        case 0x2d7:
+            return 0x0;
+        case 0x2d8:
+            return 0x0;
+        case 0x2d9:
+            return 0x0;
+        case 0x2da:
+            return 0x0;
+        case 0x2db:
+            return 0x0;
+        case 0x2dc:
+            return 0x0;
+        case 0x2dd:
+            return 0x0;
+        case 0x2de:
+            return 0x0;
+        case 0x2df:
+            return 0x0;
+        case 0x2e0:
+            return 0x0;
+        case 0x2e1:
+            return 0x0;
+        case 0x2e2:
+            return 0x0;
+        case 0x2e3:
+            return 0x0;
+        case 0x2e4:
+            return 0x0;
+        case 0x2e5:
+            return 0x0;
+        case 0x2e6:
+            return 0x0;
+        case 0x2e7:
+            return 0x0;
+        case 0x2e8:
+            return 0x0;
+        case 0x2e9:
+            return 0x0;
+        case 0x2ea:
+            return 0x0;
+        case 0x2eb:
+            return 0x0;
+        case 0x2ec:
+            return 0x0;
+        case 0x2ed:
+            return 0x0;
+        case 0x2ee:
+            return 0x0;
+        case 0x2ef:
+            return 0x0;
+        case 0x2f0:
+            return 0x0;
+        case 0x2f1:
+            return 0x0;
+        case 0x2f2:
+            return 0x0;
+        case 0x2f3:
+            return 0x0;
+        case 0x2f4:
+            return 0x0;
+        case 0x2f5:
+            return 0x0;
+        case 0x2f6:
+            return 0x0;
+        case 0x2f7:
+            return 0x0;
+        case 0x2f8:
+            return 0x0;
+        case 0x2f9:
+            return 0x0;
+        case 0x2fa:
+            return 0x0;
+        case 0x2fb:
+            return 0x0;
+        case 0x2fc:
+            return 0x0;
+        case 0x2fd:
+            return 0x0;
+        case 0x2fe:
+            return 0x0;
+        case 0x2ff:
+            return 0x0;
+        case 0x300:
+            return 0x0;
+        case 0x301:
+            return 0x0;
+        case 0x302:
+            return 0x0;
+        case 0x303:
+            return 0x0;
+        case 0x304:
+            return 0x0;
+        case 0x305:
+            return 0x0;
+        case 0x306:
+            return 0x0;
+        case 0x307:
+            return 0x0;
+        case 0x308:
+            return 0x0;
+        case 0x309:
+            return 0x0;
+        case 0x30a:
+            return 0x0;
+        case 0x30b:
+            return 0x0;
+        case 0x30c:
+            return 0x0;
+        case 0x30d:
+            return 0x0;
+        case 0x30e:
+            return 0x0;
+        case 0x30f:
+            return 0x0;
+        case 0x310:
+            return 0x0;
+        case 0x311:
+            return 0x0;
+        case 0x312:
+            return 0x0;
+        case 0x313:
+            return 0x0;
+        case 0x314:
+            return 0x0;
+        case 0x315:
+            return 0x0;
+        case 0x316:
+            return 0x0;
+        case 0x317:
+            return 0x0;
+        case 0x318:
+            return 0x0;
+        case 0x319:
+            return 0x0;
+        case 0x31a:
+            return 0x0;
+        case 0x31b:
+            return 0x0;
+        case 0x31c:
+            return 0x0;
+        case 0x31d:
+            return 0x0;
+        case 0x31e:
+            return 0x0;
+        case 0x31f:
+            return 0x0;
+        case 0x320:
+            return 0x0;
+        case 0x321:
+            return 0x0;
+        case 0x322:
+            return 0x0;
+        case 0x323:
+            return 0x0;
+        case 0x324:
+            return 0x0;
+        case 0x325:
+            return 0x0;
+        case 0x326:
+            return 0x0;
+        case 0x327:
+            return 0x0;
+        case 0x328:
+            return 0x0;
+        case 0x329:
+            return 0x0;
+        case 0x32a:
+            return 0x0;
+        case 0x32b:
+            return 0x0;
+        case 0x32c:
+            return 0x0;
+        case 0x32d:
+            return 0x0;
+        case 0x32e:
+            return 0x0;
+        case 0x32f:
+            return 0x0;
+        case 0x330:
+            return 0x0;
+        case 0x331:
+            return 0x0;
+        case 0x332:
+            return 0x0;
+        case 0x333:
+            return 0x0;
+        case 0x334:
+            return 0x0;
+        case 0x335:
+            return 0x0;
+        case 0x336:
+            return 0x0;
+        case 0x337:
+            return 0x0;
+        case 0x338:
+            return 0x0;
+        case 0x339:
+            return 0x0;
+        case 0x33a:
+            return 0x0;
+        case 0x33b:
+            return 0x0;
+        case 0x33c:
+            return 0x0;
+        case 0x33d:
+            return 0x0;
+        case 0x33e:
+            return 0x0;
+        case 0x33f:
+            return 0x0;
+        case 0x340:
+            return 0x0;
+        case 0x341:
+            return 0x0;
+        case 0x342:
+            return 0x0;
+        case 0x343:
+            return 0x0;
+        case 0x344:
+            return 0x0;
+        case 0x345:
+            return 0x0;
+        case 0x346:
+            return 0x0;
+        case 0x347:
+            return 0x0;
+        case 0x348:
+            return 0x0;
+        case 0x349:
+            return 0x0;
+        case 0x34a:
+            return 0x0;
+        case 0x34b:
+            return 0x0;
+        case 0x34c:
+            return 0x0;
+        case 0x34d:
+            return 0x0;
+        case 0x34e:
+            return 0x0;
+        case 0x350:
+            return 0x0;
+        case 0x351:
+            return 0x0;
+        case 0x352:
+            return 0x0;
+        case 0x353:
+            return 0x0;
+        case 0x354:
+            return 0x0;
+        case 0x355:
+            return 0x0;
+        case 0x356:
+            return 0x0;
+        case 0x357:
+            return 0x0;
+        case 0x35d:
+            return 0x0;
+        case 0x35e:
+            return 0x0;
+        case 0x35f:
+            return 0x0;
+        case 0x360:
+            return 0x0;
+        case 0x361:
+            return 0x0;
+        case 0x362:
+            return 0x0;
+        case 0x374:
+            return 0x0;
+        case 0x375:
+            return 0x0;
+        case 0x37a:
+            return 0x0;
         case 0x37e:
             return 0x3b;
+        case 0x384:
+            return 0x0;
+        case 0x385:
+            return 0x0;
         case 0x386:
             return 0x391;
         case 0x388:
@@ -621,6 +991,16 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) {
             return 0x474;
         case 0x477:
             return 0x475;
+        case 0x483:
+            return 0x0;
+        case 0x484:
+            return 0x0;
+        case 0x485:
+            return 0x0;
+        case 0x486:
+            return 0x0;
+        case 0x487:
+            return 0x0;
         case 0x4c1:
             return 0x416;
         case 0x4c2:
@@ -689,12 +1069,314 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) {
             return 0x42b;
         case 0x4f9:
             return 0x44b;
+        case 0x559:
+            return 0x0;
+        case 0x591:
+            return 0x0;
+        case 0x592:
+            return 0x0;
+        case 0x593:
+            return 0x0;
+        case 0x594:
+            return 0x0;
+        case 0x595:
+            return 0x0;
+        case 0x596:
+            return 0x0;
+        case 0x597:
+            return 0x0;
+        case 0x598:
+            return 0x0;
+        case 0x599:
+            return 0x0;
+        case 0x59a:
+            return 0x0;
+        case 0x59b:
+            return 0x0;
+        case 0x59c:
+            return 0x0;
+        case 0x59d:
+            return 0x0;
+        case 0x59e:
+            return 0x0;
+        case 0x59f:
+            return 0x0;
+        case 0x5a0:
+            return 0x0;
+        case 0x5a1:
+            return 0x0;
+        case 0x5a3:
+            return 0x0;
+        case 0x5a4:
+            return 0x0;
+        case 0x5a5:
+            return 0x0;
+        case 0x5a6:
+            return 0x0;
+        case 0x5a7:
+            return 0x0;
+        case 0x5a8:
+            return 0x0;
+        case 0x5a9:
+            return 0x0;
+        case 0x5aa:
+            return 0x0;
+        case 0x5ab:
+            return 0x0;
+        case 0x5ac:
+            return 0x0;
+        case 0x5ad:
+            return 0x0;
+        case 0x5ae:
+            return 0x0;
+        case 0x5af:
+            return 0x0;
+        case 0x5b0:
+            return 0x0;
+        case 0x5b1:
+            return 0x0;
+        case 0x5b2:
+            return 0x0;
+        case 0x5b3:
+            return 0x0;
+        case 0x5b4:
+            return 0x0;
+        case 0x5b5:
+            return 0x0;
+        case 0x5b6:
+            return 0x0;
+        case 0x5b7:
+            return 0x0;
+        case 0x5b8:
+            return 0x0;
+        case 0x5b9:
+            return 0x0;
+        case 0x5ba:
+            return 0x0;
+        case 0x5bb:
+            return 0x0;
+        case 0x5bc:
+            return 0x0;
+        case 0x5bd:
+            return 0x0;
+        case 0x5bf:
+            return 0x0;
+        case 0x5c1:
+            return 0x0;
+        case 0x5c2:
+            return 0x0;
+        case 0x5c4:
+            return 0x0;
+        case 0x64b:
+            return 0x0;
+        case 0x64c:
+            return 0x0;
+        case 0x64d:
+            return 0x0;
+        case 0x64e:
+            return 0x0;
+        case 0x64f:
+            return 0x0;
+        case 0x650:
+            return 0x0;
+        case 0x651:
+            return 0x0;
+        case 0x652:
+            return 0x0;
+        case 0x657:
+            return 0x0;
+        case 0x658:
+            return 0x0;
+        case 0x6df:
+            return 0x0;
+        case 0x6e0:
+            return 0x0;
+        case 0x6e5:
+            return 0x0;
+        case 0x6e6:
+            return 0x0;
+        case 0x6ea:
+            return 0x0;
+        case 0x6eb:
+            return 0x0;
+        case 0x6ec:
+            return 0x0;
+        case 0x730:
+            return 0x0;
+        case 0x731:
+            return 0x0;
+        case 0x732:
+            return 0x0;
+        case 0x733:
+            return 0x0;
+        case 0x734:
+            return 0x0;
+        case 0x735:
+            return 0x0;
+        case 0x736:
+            return 0x0;
+        case 0x737:
+            return 0x0;
+        case 0x738:
+            return 0x0;
+        case 0x739:
+            return 0x0;
+        case 0x73a:
+            return 0x0;
+        case 0x73b:
+            return 0x0;
+        case 0x73c:
+            return 0x0;
+        case 0x73d:
+            return 0x0;
+        case 0x73e:
+            return 0x0;
+        case 0x73f:
+            return 0x0;
+        case 0x740:
+            return 0x0;
+        case 0x741:
+            return 0x0;
+        case 0x742:
+            return 0x0;
+        case 0x743:
+            return 0x0;
+        case 0x744:
+            return 0x0;
+        case 0x745:
+            return 0x0;
+        case 0x746:
+            return 0x0;
+        case 0x747:
+            return 0x0;
+        case 0x748:
+            return 0x0;
+        case 0x749:
+            return 0x0;
+        case 0x74a:
+            return 0x0;
+        case 0x7a6:
+            return 0x0;
+        case 0x7a7:
+            return 0x0;
+        case 0x7a8:
+            return 0x0;
+        case 0x7a9:
+            return 0x0;
+        case 0x7aa:
+            return 0x0;
+        case 0x7ab:
+            return 0x0;
+        case 0x7ac:
+            return 0x0;
+        case 0x7ad:
+            return 0x0;
+        case 0x7ae:
+            return 0x0;
+        case 0x7af:
+            return 0x0;
+        case 0x7b0:
+            return 0x0;
+        case 0x7eb:
+            return 0x0;
+        case 0x7ec:
+            return 0x0;
+        case 0x7ed:
+            return 0x0;
+        case 0x7ee:
+            return 0x0;
+        case 0x7ef:
+            return 0x0;
+        case 0x7f0:
+            return 0x0;
+        case 0x7f1:
+            return 0x0;
+        case 0x7f2:
+            return 0x0;
+        case 0x7f3:
+            return 0x0;
+        case 0x7f4:
+            return 0x0;
+        case 0x7f5:
+            return 0x0;
+        case 0x818:
+            return 0x0;
+        case 0x819:
+            return 0x0;
+        case 0x8e3:
+            return 0x0;
+        case 0x8e4:
+            return 0x0;
+        case 0x8e5:
+            return 0x0;
+        case 0x8e6:
+            return 0x0;
+        case 0x8e7:
+            return 0x0;
+        case 0x8e8:
+            return 0x0;
+        case 0x8e9:
+            return 0x0;
+        case 0x8ea:
+            return 0x0;
+        case 0x8eb:
+            return 0x0;
+        case 0x8ec:
+            return 0x0;
+        case 0x8ed:
+            return 0x0;
+        case 0x8ee:
+            return 0x0;
+        case 0x8ef:
+            return 0x0;
+        case 0x8f0:
+            return 0x0;
+        case 0x8f1:
+            return 0x0;
+        case 0x8f2:
+            return 0x0;
+        case 0x8f3:
+            return 0x0;
+        case 0x8f4:
+            return 0x0;
+        case 0x8f5:
+            return 0x0;
+        case 0x8f6:
+            return 0x0;
+        case 0x8f7:
+            return 0x0;
+        case 0x8f8:
+            return 0x0;
+        case 0x8f9:
+            return 0x0;
+        case 0x8fa:
+            return 0x0;
+        case 0x8fb:
+            return 0x0;
+        case 0x8fc:
+            return 0x0;
+        case 0x8fd:
+            return 0x0;
+        case 0x8fe:
+            return 0x0;
         case 0x929:
             return 0x928;
         case 0x931:
             return 0x930;
         case 0x934:
             return 0x933;
+        case 0x93c:
+            return 0x0;
+        case 0x94d:
+            return 0x0;
+        case 0x951:
+            return 0x0;
+        case 0x952:
+            return 0x0;
+        case 0x953:
+            return 0x0;
+        case 0x954:
+            return 0x0;
         case 0x958:
             return 0x915;
         case 0x959:
@@ -711,6 +1393,12 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) {
             return 0x92b;
         case 0x95f:
             return 0x92f;
+        case 0x971:
+            return 0x0;
+        case 0x9bc:
+            return 0x0;
+        case 0x9cd:
+            return 0x0;
         case 0x9dc:
             return 0x9a1;
         case 0x9dd:
@@ -721,6 +1409,10 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) {
             return 0xa32;
         case 0xa36:
             return 0xa38;
+        case 0xa3c:
+            return 0x0;
+        case 0xa4d:
+            return 0x0;
         case 0xa59:
             return 0xa16;
         case 0xa5a:
@@ -729,14 +1421,444 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) {
             return 0xa1c;
         case 0xa5e:
             return 0xa2b;
+        case 0xabc:
+            return 0x0;
+        case 0xacd:
+            return 0x0;
+        case 0xb3c:
+            return 0x0;
+        case 0xb4d:
+            return 0x0;
         case 0xb5c:
             return 0xb21;
         case 0xb5d:
             return 0xb22;
+        case 0xbcd:
+            return 0x0;
+        case 0xc4d:
+            return 0x0;
+        case 0xcbc:
+            return 0x0;
+        case 0xccd:
+            return 0x0;
+        case 0xd4d:
+            return 0x0;
+        case 0xdca:
+            return 0x0;
         case 0xdda:
             return 0xdd9;
         case 0xddd:
             return 0xddc;
+        case 0xe47:
+            return 0x0;
+        case 0xe48:
+            return 0x0;
+        case 0xe49:
+            return 0x0;
+        case 0xe4a:
+            return 0x0;
+        case 0xe4b:
+            return 0x0;
+        case 0xe4c:
+            return 0x0;
+        case 0xe4e:
+            return 0x0;
+        case 0xec8:
+            return 0x0;
+        case 0xec9:
+            return 0x0;
+        case 0xeca:
+            return 0x0;
+        case 0xecb:
+            return 0x0;
+        case 0xecc:
+            return 0x0;
+        case 0xf18:
+            return 0x0;
+        case 0xf19:
+            return 0x0;
+        case 0xf35:
+            return 0x0;
+        case 0xf37:
+            return 0x0;
+        case 0xf39:
+            return 0x0;
+        case 0xf3e:
+            return 0x0;
+        case 0xf3f:
+            return 0x0;
+        case 0xf82:
+            return 0x0;
+        case 0xf83:
+            return 0x0;
+        case 0xf84:
+            return 0x0;
+        case 0xf86:
+            return 0x0;
+        case 0xf87:
+            return 0x0;
+        case 0xfc6:
+            return 0x0;
+        case 0x1037:
+            return 0x0;
+        case 0x1039:
+            return 0x0;
+        case 0x103a:
+            return 0x0;
+        case 0x1087:
+            return 0x0;
+        case 0x1088:
+            return 0x0;
+        case 0x1089:
+            return 0x0;
+        case 0x108a:
+            return 0x0;
+        case 0x108b:
+            return 0x0;
+        case 0x108c:
+            return 0x0;
+        case 0x108d:
+            return 0x0;
+        case 0x108f:
+            return 0x0;
+        case 0x109a:
+            return 0x0;
+        case 0x109b:
+            return 0x0;
+        case 0x17c9:
+            return 0x0;
+        case 0x17ca:
+            return 0x0;
+        case 0x17cb:
+            return 0x0;
+        case 0x17cc:
+            return 0x0;
+        case 0x17cd:
+            return 0x0;
+        case 0x17ce:
+            return 0x0;
+        case 0x17cf:
+            return 0x0;
+        case 0x17d0:
+            return 0x0;
+        case 0x17d1:
+            return 0x0;
+        case 0x17d2:
+            return 0x0;
+        case 0x17d3:
+            return 0x0;
+        case 0x17dd:
+            return 0x0;
+        case 0x1939:
+            return 0x0;
+        case 0x193a:
+            return 0x0;
+        case 0x193b:
+            return 0x0;
+        case 0x1a75:
+            return 0x0;
+        case 0x1a76:
+            return 0x0;
+        case 0x1a77:
+            return 0x0;
+        case 0x1a78:
+            return 0x0;
+        case 0x1a79:
+            return 0x0;
+        case 0x1a7a:
+            return 0x0;
+        case 0x1a7b:
+            return 0x0;
+        case 0x1a7c:
+            return 0x0;
+        case 0x1a7f:
+            return 0x0;
+        case 0x1ab0:
+            return 0x0;
+        case 0x1ab1:
+            return 0x0;
+        case 0x1ab2:
+            return 0x0;
+        case 0x1ab3:
+            return 0x0;
+        case 0x1ab4:
+            return 0x0;
+        case 0x1ab5:
+            return 0x0;
+        case 0x1ab6:
+            return 0x0;
+        case 0x1ab7:
+            return 0x0;
+        case 0x1ab8:
+            return 0x0;
+        case 0x1ab9:
+            return 0x0;
+        case 0x1aba:
+            return 0x0;
+        case 0x1abb:
+            return 0x0;
+        case 0x1abc:
+            return 0x0;
+        case 0x1abd:
+            return 0x0;
+        case 0x1b34:
+            return 0x0;
+        case 0x1b44:
+            return 0x0;
+        case 0x1b6b:
+            return 0x0;
+        case 0x1b6c:
+            return 0x0;
+        case 0x1b6d:
+            return 0x0;
+        case 0x1b6e:
+            return 0x0;
+        case 0x1b6f:
+            return 0x0;
+        case 0x1b70:
+            return 0x0;
+        case 0x1b71:
+            return 0x0;
+        case 0x1b72:
+            return 0x0;
+        case 0x1b73:
+            return 0x0;
+        case 0x1baa:
+            return 0x0;
+        case 0x1bab:
+            return 0x0;
+        case 0x1c36:
+            return 0x0;
+        case 0x1c37:
+            return 0x0;
+        case 0x1c78:
+            return 0x0;
+        case 0x1c79:
+            return 0x0;
+        case 0x1c7a:
+            return 0x0;
+        case 0x1c7b:
+            return 0x0;
+        case 0x1c7c:
+            return 0x0;
+        case 0x1c7d:
+            return 0x0;
+        case 0x1cd0:
+            return 0x0;
+        case 0x1cd1:
+            return 0x0;
+        case 0x1cd2:
+            return 0x0;
+        case 0x1cd3:
+            return 0x0;
+        case 0x1cd4:
+            return 0x0;
+        case 0x1cd5:
+            return 0x0;
+        case 0x1cd6:
+            return 0x0;
+        case 0x1cd7:
+            return 0x0;
+        case 0x1cd8:
+            return 0x0;
+        case 0x1cd9:
+            return 0x0;
+        case 0x1cda:
+            return 0x0;
+        case 0x1cdb:
+            return 0x0;
+        case 0x1cdc:
+            return 0x0;
+        case 0x1cdd:
+            return 0x0;
+        case 0x1cde:
+            return 0x0;
+        case 0x1cdf:
+            return 0x0;
+        case 0x1ce0:
+            return 0x0;
+        case 0x1ce1:
+            return 0x0;
+        case 0x1ce2:
+            return 0x0;
+        case 0x1ce3:
+            return 0x0;
+        case 0x1ce4:
+            return 0x0;
+        case 0x1ce5:
+            return 0x0;
+        case 0x1ce6:
+            return 0x0;
+        case 0x1ce7:
+            return 0x0;
+        case 0x1ce8:
+            return 0x0;
+        case 0x1ced:
+            return 0x0;
+        case 0x1cf4:
+            return 0x0;
+        case 0x1cf8:
+            return 0x0;
+        case 0x1cf9:
+            return 0x0;
+        case 0x1d2c:
+            return 0x0;
+        case 0x1d2d:
+            return 0x0;
+        case 0x1d2e:
+            return 0x0;
+        case 0x1d2f:
+            return 0x0;
+        case 0x1d30:
+            return 0x0;
+        case 0x1d31:
+            return 0x0;
+        case 0x1d32:
+            return 0x0;
+        case 0x1d33:
+            return 0x0;
+        case 0x1d34:
+            return 0x0;
+        case 0x1d35:
+            return 0x0;
+        case 0x1d36:
+            return 0x0;
+        case 0x1d37:
+            return 0x0;
+        case 0x1d38:
+            return 0x0;
+        case 0x1d39:
+            return 0x0;
+        case 0x1d3a:
+            return 0x0;
+        case 0x1d3b:
+            return 0x0;
+        case 0x1d3c:
+            return 0x0;
+        case 0x1d3d:
+            return 0x0;
+        case 0x1d3e:
+            return 0x0;
+        case 0x1d3f:
+            return 0x0;
+        case 0x1d40:
+            return 0x0;
+        case 0x1d41:
+            return 0x0;
+        case 0x1d42:
+            return 0x0;
+        case 0x1d43:
+            return 0x0;
+        case 0x1d44:
+            return 0x0;
+        case 0x1d45:
+            return 0x0;
+        case 0x1d46:
+            return 0x0;
+        case 0x1d47:
+            return 0x0;
+        case 0x1d48:
+            return 0x0;
+        case 0x1d49:
+            return 0x0;
+        case 0x1d4a:
+            return 0x0;
+        case 0x1d4b:
+            return 0x0;
+        case 0x1d4c:
+            return 0x0;
+        case 0x1d4d:
+            return 0x0;
+        case 0x1d4e:
+            return 0x0;
+        case 0x1d4f:
+            return 0x0;
+        case 0x1d50:
+            return 0x0;
+        case 0x1d51:
+            return 0x0;
+        case 0x1d52:
+            return 0x0;
+        case 0x1d53:
+            return 0x0;
+        case 0x1d54:
+            return 0x0;
+        case 0x1d55:
+            return 0x0;
+        case 0x1d56:
+            return 0x0;
+        case 0x1d57:
+            return 0x0;
+        case 0x1d58:
+            return 0x0;
+        case 0x1d59:
+            return 0x0;
+        case 0x1d5a:
+            return 0x0;
+        case 0x1d5b:
+            return 0x0;
+        case 0x1d5c:
+            return 0x0;
+        case 0x1d5d:
+            return 0x0;
+        case 0x1d5e:
+            return 0x0;
+        case 0x1d5f:
+            return 0x0;
+        case 0x1d60:
+            return 0x0;
+        case 0x1d61:
+            return 0x0;
+        case 0x1d62:
+            return 0x0;
+        case 0x1d63:
+            return 0x0;
+        case 0x1d64:
+            return 0x0;
+        case 0x1d65:
+            return 0x0;
+        case 0x1d66:
+            return 0x0;
+        case 0x1d67:
+            return 0x0;
+        case 0x1d68:
+            return 0x0;
+        case 0x1d69:
+            return 0x0;
+        case 0x1d6a:
+            return 0x0;
+        case 0x1dc4:
+            return 0x0;
+        case 0x1dc5:
+            return 0x0;
+        case 0x1dc6:
+            return 0x0;
+        case 0x1dc7:
+            return 0x0;
+        case 0x1dc8:
+            return 0x0;
+        case 0x1dc9:
+            return 0x0;
+        case 0x1dca:
+            return 0x0;
+        case 0x1dcb:
+            return 0x0;
+        case 0x1dcc:
+            return 0x0;
+        case 0x1dcd:
+            return 0x0;
+        case 0x1dce:
+            return 0x0;
+        case 0x1dcf:
+            return 0x0;
+        case 0x1df5:
+            return 0x0;
+        case 0x1dfd:
+            return 0x0;
+        case 0x1dfe:
+            return 0x0;
+        case 0x1dff:
+            return 0x0;
         case 0x1e00:
             return 0x41;
         case 0x1e01:
@@ -1575,8 +2697,16 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) {
             return 0x391;
         case 0x1fbc:
             return 0x391;
+        case 0x1fbd:
+            return 0x0;
         case 0x1fbe:
             return 0x3b9;
+        case 0x1fbf:
+            return 0x0;
+        case 0x1fc0:
+            return 0x0;
+        case 0x1fc1:
+            return 0x0;
         case 0x1fc2:
             return 0x3b7;
         case 0x1fc3:
@@ -1597,6 +2727,12 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) {
             return 0x397;
         case 0x1fcc:
             return 0x397;
+        case 0x1fcd:
+            return 0x0;
+        case 0x1fce:
+            return 0x0;
+        case 0x1fcf:
+            return 0x0;
         case 0x1fd0:
             return 0x3b9;
         case 0x1fd1:
@@ -1617,6 +2753,12 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) {
             return 0x399;
         case 0x1fdb:
             return 0x399;
+        case 0x1fdd:
+            return 0x0;
+        case 0x1fde:
+            return 0x0;
+        case 0x1fdf:
+            return 0x0;
         case 0x1fe0:
             return 0x3c5;
         case 0x1fe1:
@@ -1643,6 +2785,12 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) {
             return 0x3a5;
         case 0x1fec:
             return 0x3a1;
+        case 0x1fed:
+            return 0x0;
+        case 0x1fee:
+            return 0x0;
+        case 0x1fef:
+            return 0x0;
         case 0x1ff2:
             return 0x3c9;
         case 0x1ff3:
@@ -1663,6 +2811,10 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) {
             return 0x3a9;
         case 0x1ffc:
             return 0x3a9;
+        case 0x1ffd:
+            return 0x0;
+        case 0x1ffe:
+            return 0x0;
         case 0x2000:
             return 0x2002;
         case 0x2001:
@@ -1767,6 +2919,26 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) {
             return 0x3009;
         case 0x2adc:
             return 0x2add;
+        case 0x2cef:
+            return 0x0;
+        case 0x2cf0:
+            return 0x0;
+        case 0x2cf1:
+            return 0x0;
+        case 0x2e2f:
+            return 0x0;
+        case 0x302a:
+            return 0x0;
+        case 0x302b:
+            return 0x0;
+        case 0x302c:
+            return 0x0;
+        case 0x302d:
+            return 0x0;
+        case 0x302e:
+            return 0x0;
+        case 0x302f:
+            return 0x0;
         case 0x304c:
             return 0x304b;
         case 0x304e:
@@ -1819,6 +2991,14 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) {
             return 0x307b;
         case 0x3094:
             return 0x3046;
+        case 0x3099:
+            return 0x0;
+        case 0x309a:
+            return 0x0;
+        case 0x309b:
+            return 0x0;
+        case 0x309c:
+            return 0x0;
         case 0x309e:
             return 0x309d;
         case 0x30ac:
@@ -1881,8 +3061,138 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) {
             return 0x30f1;
         case 0x30fa:
             return 0x30f2;
+        case 0x30fc:
+            return 0x0;
         case 0x30fe:
             return 0x30fd;
+        case 0xa66f:
+            return 0x0;
+        case 0xa67c:
+            return 0x0;
+        case 0xa67d:
+            return 0x0;
+        case 0xa67f:
+            return 0x0;
+        case 0xa69c:
+            return 0x0;
+        case 0xa69d:
+            return 0x0;
+        case 0xa6f0:
+            return 0x0;
+        case 0xa6f1:
+            return 0x0;
+        case 0xa717:
+            return 0x0;
+        case 0xa718:
+            return 0x0;
+        case 0xa719:
+            return 0x0;
+        case 0xa71a:
+            return 0x0;
+        case 0xa71b:
+            return 0x0;
+        case 0xa71c:
+            return 0x0;
+        case 0xa71d:
+            return 0x0;
+        case 0xa71e:
+            return 0x0;
+        case 0xa71f:
+            return 0x0;
+        case 0xa720:
+            return 0x0;
+        case 0xa721:
+            return 0x0;
+        case 0xa788:
+            return 0x0;
+        case 0xa7f8:
+            return 0x0;
+        case 0xa7f9:
+            return 0x0;
+        case 0xa8c4:
+            return 0x0;
+        case 0xa8e0:
+            return 0x0;
+        case 0xa8e1:
+            return 0x0;
+        case 0xa8e2:
+            return 0x0;
+        case 0xa8e3:
+            return 0x0;
+        case 0xa8e4:
+            return 0x0;
+        case 0xa8e5:
+            return 0x0;
+        case 0xa8e6:
+            return 0x0;
+        case 0xa8e7:
+            return 0x0;
+        case 0xa8e8:
+            return 0x0;
+        case 0xa8e9:
+            return 0x0;
+        case 0xa8ea:
+            return 0x0;
+        case 0xa8eb:
+            return 0x0;
+        case 0xa8ec:
+            return 0x0;
+        case 0xa8ed:
+            return 0x0;
+        case 0xa8ee:
+            return 0x0;
+        case 0xa8ef:
+            return 0x0;
+        case 0xa8f0:
+            return 0x0;
+        case 0xa8f1:
+            return 0x0;
+        case 0xa92b:
+            return 0x0;
+        case 0xa92c:
+            return 0x0;
+        case 0xa92d:
+            return 0x0;
+        case 0xa92e:
+            return 0x0;
+        case 0xa953:
+            return 0x0;
+        case 0xa9b3:
+            return 0x0;
+        case 0xa9c0:
+            return 0x0;
+        case 0xa9e5:
+            return 0x0;
+        case 0xaa7b:
+            return 0x0;
+        case 0xaa7c:
+            return 0x0;
+        case 0xaa7d:
+            return 0x0;
+        case 0xaabf:
+            return 0x0;
+        case 0xaac0:
+            return 0x0;
+        case 0xaac1:
+            return 0x0;
+        case 0xaac2:
+            return 0x0;
+        case 0xaaf6:
+            return 0x0;
+        case 0xab5b:
+            return 0x0;
+        case 0xab5c:
+            return 0x0;
+        case 0xab5d:
+            return 0x0;
+        case 0xab5e:
+            return 0x0;
+        case 0xab5f:
+            return 0x0;
+        case 0xabec:
+            return 0x0;
+        case 0xabed:
+            return 0x0;
         case 0xf900:
             return 0x8c48;
         case 0xf901:
@@ -2805,6 +4115,8 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) {
             return 0x9f8e;
         case 0xfb1d:
             return 0x5d9;
+        case 0xfb1e:
+            return 0x0;
         case 0xfb1f:
             return 0x5f2;
         case 0xfb2a:
@@ -2871,12 +4183,246 @@ char32_t codepointRemoveDiacritics(char32_t codepoint) {
             return 0x5db;
         case 0xfb4e:
             return 0x5e4;
+        case 0xfe20:
+            return 0x0;
+        case 0xfe21:
+            return 0x0;
+        case 0xfe22:
+            return 0x0;
+        case 0xfe23:
+            return 0x0;
+        case 0xfe24:
+            return 0x0;
+        case 0xfe25:
+            return 0x0;
+        case 0xfe26:
+            return 0x0;
+        case 0xfe27:
+            return 0x0;
+        case 0xfe28:
+            return 0x0;
+        case 0xfe29:
+            return 0x0;
+        case 0xfe2a:
+            return 0x0;
+        case 0xfe2b:
+            return 0x0;
+        case 0xfe2c:
+            return 0x0;
+        case 0xfe2d:
+            return 0x0;
+        case 0xfe2e:
+            return 0x0;
+        case 0xfe2f:
+            return 0x0;
+        case 0xff3e:
+            return 0x0;
+        case 0xff40:
+            return 0x0;
+        case 0xff70:
+            return 0x0;
+        case 0xff9e:
+            return 0x0;
+        case 0xff9f:
+            return 0x0;
+        case 0xffe3:
+            return 0x0;
+        case 0x102e0:
+            return 0x0;
+        case 0x10ae5:
+            return 0x0;
+        case 0x10ae6:
+            return 0x0;
         case 0x1109a:
             return 0x11099;
         case 0x1109c:
             return 0x1109b;
         case 0x110ab:
             return 0x110a5;
+        case 0x110b9:
+            return 0x0;
+        case 0x110ba:
+            return 0x0;
+        case 0x11133:
+            return 0x0;
+        case 0x11134:
+            return 0x0;
+        case 0x11173:
+            return 0x0;
+        case 0x111c0:
+            return 0x0;
+        case 0x111ca:
+            return 0x0;
+        case 0x111cb:
+            return 0x0;
+        case 0x111cc:
+            return 0x0;
+        case 0x11235:
+            return 0x0;
+        case 0x11236:
+            return 0x0;
+        case 0x112e9:
+            return 0x0;
+        case 0x112ea:
+            return 0x0;
+        case 0x1133c:
+            return 0x0;
+        case 0x1134d:
+            return 0x0;
+        case 0x11366:
+            return 0x0;
+        case 0x11367:
+            return 0x0;
+        case 0x11368:
+            return 0x0;
+        case 0x11369:
+            return 0x0;
+        case 0x1136a:
+            return 0x0;
+        case 0x1136b:
+            return 0x0;
+        case 0x1136c:
+            return 0x0;
+        case 0x11370:
+            return 0x0;
+        case 0x11371:
+            return 0x0;
+        case 0x11372:
+            return 0x0;
+        case 0x11373:
+            return 0x0;
+        case 0x11374:
+            return 0x0;
+        case 0x114c2:
+            return 0x0;
+        case 0x114c3:
+            return 0x0;
+        case 0x115bf:
+            return 0x0;
+        case 0x115c0:
+            return 0x0;
+        case 0x1163f:
+            return 0x0;
+        case 0x116b6:
+            return 0x0;
+        case 0x116b7:
+            return 0x0;
+        case 0x1172b:
+            return 0x0;
+        case 0x16af0:
+            return 0x0;
+        case 0x16af1:
+            return 0x0;
+        case 0x16af2:
+            return 0x0;
+        case 0x16af3:
+            return 0x0;
+        case 0x16af4:
+            return 0x0;
+        case 0x16f8f:
+            return 0x0;
+        case 0x16f90:
+            return 0x0;
+        case 0x16f91:
+            return 0x0;
+        case 0x16f92:
+            return 0x0;
+        case 0x16f93:
+            return 0x0;
+        case 0x16f94:
+            return 0x0;
+        case 0x16f95:
+            return 0x0;
+        case 0x16f96:
+            return 0x0;
+        case 0x16f97:
+            return 0x0;
+        case 0x16f98:
+            return 0x0;
+        case 0x16f99:
+            return 0x0;
+        case 0x16f9a:
+            return 0x0;
+        case 0x16f9b:
+            return 0x0;
+        case 0x16f9c:
+            return 0x0;
+        case 0x16f9d:
+            return 0x0;
+        case 0x16f9e:
+            return 0x0;
+        case 0x16f9f:
+            return 0x0;
+        case 0x1d167:
+            return 0x0;
+        case 0x1d168:
+            return 0x0;
+        case 0x1d169:
+            return 0x0;
+        case 0x1d16d:
+            return 0x0;
+        case 0x1d16e:
+            return 0x0;
+        case 0x1d16f:
+            return 0x0;
+        case 0x1d170:
+            return 0x0;
+        case 0x1d171:
+            return 0x0;
+        case 0x1d172:
+            return 0x0;
+        case 0x1d17b:
+            return 0x0;
+        case 0x1d17c:
+            return 0x0;
+        case 0x1d17d:
+            return 0x0;
+        case 0x1d17e:
+            return 0x0;
+        case 0x1d17f:
+            return 0x0;
+        case 0x1d180:
+            return 0x0;
+        case 0x1d181:
+            return 0x0;
+        case 0x1d182:
+            return 0x0;
+        case 0x1d185:
+            return 0x0;
+        case 0x1d186:
+            return 0x0;
+        case 0x1d187:
+            return 0x0;
+        case 0x1d188:
+            return 0x0;
+        case 0x1d189:
+            return 0x0;
+        case 0x1d18a:
+            return 0x0;
+        case 0x1d18b:
+            return 0x0;
+        case 0x1d1aa:
+            return 0x0;
+        case 0x1d1ab:
+            return 0x0;
+        case 0x1d1ac:
+            return 0x0;
+        case 0x1d1ad:
+            return 0x0;
+        case 0x1e8d0:
+            return 0x0;
+        case 0x1e8d1:
+            return 0x0;
+        case 0x1e8d2:
+            return 0x0;
+        case 0x1e8d3:
+            return 0x0;
+        case 0x1e8d4:
+            return 0x0;
+        case 0x1e8d5:
+            return 0x0;
+        case 0x1e8d6:
+            return 0x0;
         case 0x2f800:
             return 0x4e3d;
         case 0x2f801:
diff --git a/src/mongo/db/fts/unicode/codepoints_test.cpp b/src/mongo/db/fts/unicode/codepoints_test.cpp
index 90510666cba..a68c41688f6 100644
--- a/src/mongo/db/fts/unicode/codepoints_test.cpp
+++ b/src/mongo/db/fts/unicode/codepoints_test.cpp
@@ -32,6 +32,8 @@
 namespace mongo {
 namespace unicode {
 
+const char32_t maxCP = 0x1FFFFF;  // Highest valid codepoint.
+
 /**
  * Above most of the arrays in this class are the UTF-32 character literals that correspond to the
  * codepoints in the array.
@@ -39,14 +41,19 @@ namespace unicode {
 
 TEST(UnicodeCodepoints, Diacritics) {
     // There are no character literals for combining marks.
-    const char32_t marks[] = {0x0301, 0x0339, 0x1AB4, 0x1DC5, 0xA69D};
+    const char32_t marks[] = {'^', '`', 0x0301, 0x0339, 0x1AB4, 0x1DC5, 0xA69D};
 
     // const char32_t not_marks[] = {U'-', U'.', U'\'', U'*', U'm'};
     const char32_t not_marks[] = {0x2D, 0x2E, 0x27, 0x2A, 0x6D};
 
-    for (auto i = 0; i < 5; ++i) {
-        ASSERT(codepointIsDiacritic(marks[i]));
-        ASSERT_FALSE(codepointIsDiacritic(not_marks[i]));
+    for (auto cp : marks) {
+        ASSERT(codepointIsDiacritic(cp));
+        ASSERT_EQ(codepointRemoveDiacritics(cp), char32_t(0));
+    }
+
+    for (auto cp : not_marks) {
+        ASSERT(!codepointIsDiacritic(cp));
+        ASSERT_NE(codepointRemoveDiacritics(cp), char32_t(0));
     }
 }
 
@@ -77,6 +84,10 @@ TEST(UnicodeCodepoints, RemoveDiacritics) {
     for (auto i = 0; i < 5; ++i) {
         ASSERT_EQUALS(clean[i], codepointRemoveDiacritics(originals[i]));
     }
+
+    for (char32_t cp = 0; cp <= maxCP; cp++) {
+        ASSERT_EQ(codepointRemoveDiacritics(cp) == 0, cp == 0 || codepointIsDiacritic(cp));
+    }
 }
 
 TEST(UnicodeCodepoints, ToLower) {
@@ -90,5 +101,18 @@ TEST(UnicodeCodepoints, ToLower) {
     }
 }
 
+TEST(UnicodeCodepoints, ToLowerIsFixedPoint) {
+    for (char32_t cp = 0; cp <= maxCP; cp++) {
+        ASSERT_EQ(codepointToLower(cp), codepointToLower(codepointToLower(cp)));
+    }
+}
+
+TEST(UnicodeCodepoints, RemoveDiacriticsIsFixedPoint) {
+    for (char32_t cp = 0; cp <= maxCP; cp++) {
+        ASSERT_EQ(codepointRemoveDiacritics(cp),
+                  codepointRemoveDiacritics(codepointRemoveDiacritics(cp)));
+    }
+}
+
 }  // namespace unicode
 }  // namespace mongo
diff --git a/src/mongo/db/fts/unicode/gen_casefold_map.py b/src/mongo/db/fts/unicode/gen_casefold_map.py
index 7ab9b49f7a4..61249e5f0d7 100644
--- a/src/mongo/db/fts/unicode/gen_casefold_map.py
+++ b/src/mongo/db/fts/unicode/gen_casefold_map.py
@@ -7,11 +7,11 @@ from gen_helper import getCopyrightNotice, openNamespaces, closeNamespaces, \
     include
 
 def generate(unicode_casefold_file, target):
-    """Generates a C++ source file that contains a Unicode case folding 
+    """Generates a C++ source file that contains a Unicode case folding
        function.
 
     The case folding function contains a switch statement with cases for every
-    Unicode codepoint that has a case folding mapping. 
+    Unicode codepoint that has a case folding mapping.
     """
     out = open(target, "w")
 
@@ -43,29 +43,46 @@ def generate(unicode_casefold_file, target):
             codepoint_mapping  = int(values[2], 16)
             case_mappings[original_codepoint] = codepoint_mapping
 
-    out.write("""char32_t codepointToLower(char32_t codepoint, CaseFoldMode \
-mode) { 
-    if (mode == CaseFoldMode::kTurkish) {
-        if (codepoint == 0x049) {  // I -> ı
-            return 0x131;
-        } else if (codepoint == 0x130) {  // İ -> i
-            return 0x069;
-        }
+    turkishMapping = {
+        0x49: 0x131,  # I -> ı
+        0x130: 0x069,   # İ -> i
     }
 
-    switch (codepoint) {\n""")
+    out.write(
+        """char32_t codepointToLower(char32_t codepoint, CaseFoldMode mode) {
+               if (codepoint <= 0x7f) {
+                    if (codepoint >= 'A' && codepoint <= 'Z') {
+                       return (mode == CaseFoldMode::kTurkish && codepoint == 'I')
+                              ? 0x131
+                              : (codepoint | 0x20); // Set the ascii lowercase bit on the character.
+                   }
+                   return codepoint;
+               }
+
+               switch (codepoint) {\n""")
 
     mappings_list = []
 
     for mapping in case_mappings:
         mappings_list.append((mapping, case_mappings[mapping]))
 
+    # Make sure we include each mapping in turkishMapping in the cases below. This ensures we handle
+    # them even if we'd skip the letter in non-turkish mode.
+    for mapping in turkishMapping:
+        if mapping not in case_mappings:
+            mappings_list.append((mapping, mapping))
+
     sorted_mappings = sorted(mappings_list, key=lambda mapping: mapping[0])
 
     for mapping in sorted_mappings:
-        out.write("\
-    case " + str(hex(mapping[0])) + ": return " + \
-        str(hex(mapping[1])) +";\n")
+        if mapping[0] <= 0x7f:
+            continue # ascii is special cased above.
+
+        if mapping[0] in turkishMapping:
+            out.write("case 0x%x: return mode == CaseFoldMode::kTurkish ? 0x%x : 0x%x;\n"
+                      % (mapping[0], turkishMapping[mapping[0]], mapping[1]))
+        else:
+            out.write("case 0x%x: return 0x%x;\n"%mapping)
 
     out.write("\
     default: return codepoint;\n    }\n}")
diff --git a/src/mongo/db/fts/unicode/gen_diacritic_map.py b/src/mongo/db/fts/unicode/gen_diacritic_map.py
index 84f57e82a3c..08cfa95cda2 100644
--- a/src/mongo/db/fts/unicode/gen_diacritic_map.py
+++ b/src/mongo/db/fts/unicode/gen_diacritic_map.py
@@ -55,6 +55,8 @@ def add_diacritic_mapping(codepoint):
 
     # Only use mappings where the final recomposed form is a single codepoint
     if (a != c and len(c) == 1):
+        assert c != '\0' # This is used to indicate the codepoint is a pure diacritic.
+        assert ord(c) not in diacritics
         diacritic_mappings[codepoint] = ord(c[0])
 
 def add_diacritic_range(start, end):
@@ -78,6 +80,9 @@ def generate(target):
     # Map diacritics from 0 to the maximum Unicode codepoint
     add_diacritic_range(0x0000, 0x10FFFF)
 
+    for diacritic in diacritics:
+        diacritic_mappings[diacritic] = 0
+
     out.write("""char32_t codepointRemoveDiacritics(char32_t codepoint) {
     switch (codepoint) {\n""")
 
diff --git a/src/mongo/db/fts/unicode/string.cpp b/src/mongo/db/fts/unicode/string.cpp
index 9c18a776787..0ea37d8f33f 100644
--- a/src/mongo/db/fts/unicode/string.cpp
+++ b/src/mongo/db/fts/unicode/string.cpp
@@ -29,7 +29,9 @@
 #include "mongo/db/fts/unicode/string.h"
 
 #include <algorithm>
+#include <boost/algorithm/searching/boyer_moore.hpp>
 
+#include "mongo/platform/bits.h"
 #include "mongo/shell/linenoise_utf8.h"
 #include "mongo/util/assert_util.h"
 
@@ -42,10 +44,6 @@ using linenoise_utf8::copyString8to32;
 using std::u32string;
 
 String::String(const StringData utf8_src) {
-    // Reserve space for underlying buffers to prevent excessive reallocations.
-    _outputBuf.reserve(utf8_src.size() * 4);
-    _data.reserve(utf8_src.size() * 4);
-
     // Convert UTF-8 input to UTF-32 data.
     setData(utf8_src);
 }
@@ -79,12 +77,6 @@ void String::setData(const StringData utf8_src) {
     _needsOutputConversion = true;
 }
 
-String::String(u32string&& src) : _data(std::move(src)), _needsOutputConversion(true) {
-    // Reserve space for underlying buffers to prevent excessive reallocations.
-    _outputBuf.reserve(src.size() * 4);
-    _data.reserve(src.size() * 4);
-}
-
 std::string String::toString() {
     // _outputBuf is the target, resize it so that it's guaranteed to fit all of the input
     // characters, plus a null character if there isn't one.
@@ -100,14 +92,6 @@ std::string String::toString() {
     return _outputBuf;
 }
 
-size_t String::size() const {
-    return _data.size();
-}
-
-const char32_t& String::operator[](int i) const {
-    return _data[i];
-}
-
 String String::substr(size_t pos, size_t len) const {
     unicode::String buf;
     substrToBuf(pos, len, buf);
@@ -147,63 +131,133 @@ void String::substrToBuf(size_t pos, size_t len, String& buffer) const {
 
 void String::toLowerToBuf(CaseFoldMode mode, String& buffer) const {
     buffer._data.resize(_data.size());
-    auto index = 0;
+    auto outIt = buffer._data.begin();
     for (auto codepoint : _data) {
-        buffer._data[index++] = codepointToLower(codepoint, mode);
+        *outIt++ = codepointToLower(codepoint, mode);
     }
     buffer._needsOutputConversion = true;
 }
 
 void String::removeDiacriticsToBuf(String& buffer) const {
     buffer._data.resize(_data.size());
-    auto index = 0;
+    auto outIt = buffer._data.begin();
     for (auto codepoint : _data) {
-        if (!codepointIsDiacritic(codepoint)) {
-            buffer._data[index++] = codepointRemoveDiacritics(codepoint);
+        if (codepoint <= 0x7f) {
+            // ASCII only has two diacritics so they are hard-coded here.
+            if (codepoint != '^' && codepoint != '`') {
+                *outIt++ = codepoint;
+            }
+        } else if (auto clean = codepointRemoveDiacritics(codepoint)) {
+            *outIt++ = clean;
+        } else {
+            // codepoint was a pure diacritic mark, so skip it.
         }
     }
-    buffer._data.resize(index);
+    buffer._data.resize(outIt - buffer._data.begin());
     buffer._needsOutputConversion = true;
 }
 
-bool String::substrMatch(const String& str,
-                         const String& find,
+std::pair<std::unique_ptr<char[]>, char*> String::prepForSubstrMatch(StringData utf8,
+                                                                     SubstrMatchOptions options,
+                                                                     CaseFoldMode mode) {
+    // This function should only be called when casefolding or stripping diacritics.
+    dassert(!(options & kCaseSensitive) || !(options & kDiacriticSensitive));
+
+    // Allocate space for up to 2x growth which is the worst possible case for stripping diacritics
+    // and casefolding. Proof: the only case where 1 byte goes to >1 is 'I' in Turkish going to 2
+    // bytes. The biggest codepoint is 4 bytes which is also 2x 2 bytes. This holds as long as we
+    // don't map a single code point to more than one.
+    std::unique_ptr<char[]> buffer(new char[utf8.size() * 2]);
+    auto outputIt = buffer.get();
+
+    for (auto inputIt = utf8.begin(), endIt = utf8.end(); inputIt != endIt;) {
+        const uint8_t firstByte = *inputIt++;
+        char32_t codepoint = 0;
+        if (firstByte <= 0x7f) {
+            // ASCII special case. Can use faster operations.
+            if ((!(options & kCaseSensitive)) && (firstByte >= 'A' && firstByte <= 'Z')) {
+                codepoint = (mode == CaseFoldMode::kTurkish && firstByte == 'I')
+                    ? 0x131                // In Turkish, I -> ı (i with no dot).
+                    : (firstByte | 0x20);  // Set the ascii lowercase bit on the character.
+            } else {
+                // ASCII has two pure diacritics that should be skipped and no characters that
+                // change when removing diacritics.
+                if ((options & kDiacriticSensitive) || !(firstByte == '^' || firstByte == '`')) {
+                    *outputIt++ = (firstByte);
+                }
+                continue;
+            }
+        } else {
+            // firstByte indicates that it is not an ASCII char.
+            int leadingOnes = countLeadingZeros64(~(uint64_t(firstByte) << (64 - 8)));
+
+            // Only checking enough to ensure that this code doesn't crash in the face of malformed
+            // utf-8. We make no guarantees about what results will be returned in this case.
+            uassert(ErrorCodes::BadValue,
+                    "text contains invalid UTF-8",
+                    leadingOnes > 1 && leadingOnes <= 4 && inputIt + leadingOnes - 1 <= endIt);
+
+            codepoint = firstByte & (0xff >> leadingOnes);  // mask off the size indicator.
+            for (int subByteIx = 1; subByteIx < leadingOnes; subByteIx++) {
+                const uint8_t subByte = *inputIt++;
+                codepoint <<= 6;
+                codepoint |= subByte & 0x3f;  // mask off continuation bits.
+            }
+
+            if (!(options & kCaseSensitive)) {
+                codepoint = codepointToLower(codepoint, mode);
+            }
+
+            if (!(options & kDiacriticSensitive)) {
+                codepoint = codepointRemoveDiacritics(codepoint);
+                if (!codepoint)
+                    continue;  // codepoint is a pure diacritic.
+            }
+        }
+
+        // Back to utf-8.
+        if (codepoint <= 0x7f /* max 1-byte codepoint */) {
+            *outputIt++ = (codepoint);
+        } else if (codepoint <= 0x7ff /* max 2-byte codepoint*/) {
+            *outputIt++ = ((codepoint >> (6 * 1)) | 0xc0);  // 2 leading 1s.
+            *outputIt++ = (((codepoint >> (6 * 0)) & 0x3f) | 0x80);
+        } else if (codepoint <= 0xffff /* max 3-byte codepoint*/) {
+            *outputIt++ = ((codepoint >> (6 * 2)) | 0xe0);  // 3 leading 1s.
+            *outputIt++ = (((codepoint >> (6 * 1)) & 0x3f) | 0x80);
+            *outputIt++ = (((codepoint >> (6 * 0)) & 0x3f) | 0x80);
+        } else {
+            *outputIt++ = ((codepoint >> (6 * 3)) | 0xf0);  // 4 leading 1s.
+            *outputIt++ = (((codepoint >> (6 * 2)) & 0x3f) | 0x80);
+            *outputIt++ = (((codepoint >> (6 * 1)) & 0x3f) | 0x80);
+            *outputIt++ = (((codepoint >> (6 * 0)) & 0x3f) | 0x80);
+        }
+    }
+
+    return {std::move(buffer), outputIt};
+}
+
+bool String::substrMatch(const std::string& str,
+                         const std::string& find,
                          SubstrMatchOptions options,
                          CaseFoldMode cfMode) {
-    // In Turkish, lowercasing needs to be applied first because the letter İ has a different case
-    // folding mapping than the letter I, but removing diacritics removes the dot from İ.
     if (cfMode == CaseFoldMode::kTurkish) {
-        String cleanStr = str.toLower(cfMode);
-        String cleanFind = find.toLower(cfMode);
-        return substrMatch(cleanStr, cleanFind, options | kCaseSensitive, CaseFoldMode::kNormal);
+        // Turkish comparisons are always case insensitive due to their handling of I/i.
+        options &= ~kCaseSensitive;
     }
 
-    if (options & kDiacriticSensitive) {
-        if (options & kCaseSensitive) {
-            // Case sensitive and diacritic sensitive.
-            return std::search(str._data.cbegin(),
-                               str._data.cend(),
-                               find._data.cbegin(),
-                               find._data.cend(),
-                               [&](char32_t c1, char32_t c2) { return (c1 == c2); }) !=
-                str._data.cend();
-        }
-
-        // Case insensitive and diacritic sensitive.
-        return std::search(str._data.cbegin(),
-                           str._data.cend(),
-                           find._data.cbegin(),
-                           find._data.cend(),
-                           [&](char32_t c1, char32_t c2) {
-                               return (codepointToLower(c1, cfMode) ==
-                                       codepointToLower(c2, cfMode));
-                           }) != str._data.cend();
+    if ((options & kCaseSensitive) && (options & kDiacriticSensitive)) {
+        // No transformation needed. Just do the search on the input strings.
+        return boost::algorithm::boyer_moore_search(
+                   str.cbegin(), str.cend(), find.cbegin(), find.cend()) != str.cend();
     }
 
-    String cleanStr = str.removeDiacritics();
-    String cleanFind = find.removeDiacritics();
+    auto haystack = prepForSubstrMatch(str, options, cfMode);
+    auto needle = prepForSubstrMatch(find, options, cfMode);
 
-    return substrMatch(cleanStr, cleanFind, options | kDiacriticSensitive, cfMode);
+    // Case sensitive and diacritic sensitive.
+    return boost::algorithm::boyer_moore_search(
+               haystack.first.get(), haystack.second, needle.first.get(), needle.second) !=
+        haystack.second;
 }
 
 }  // namespace unicode
diff --git a/src/mongo/db/fts/unicode/string.h b/src/mongo/db/fts/unicode/string.h
index c3355ee4f25..1ba6e46c27c 100644
--- a/src/mongo/db/fts/unicode/string.h
+++ b/src/mongo/db/fts/unicode/string.h
@@ -29,6 +29,7 @@
 #pragma once
 
 #include <cstdint>
+#include <memory>
 #include <string>
 
 #include "mongo/base/string_data.h"
@@ -110,12 +111,16 @@ public:
     /**
      * Returns the number Unicode codepoints in the String.
      */
-    size_t size() const;
+    size_t size() const {
+        return _data.size();
+    }
 
     /**
      * Returns the Unicode codepoint at index i of the String.
      */
-    const char32_t& operator[](int i) const;
+    const char32_t& operator[](int i) const {
+        return _data[i];
+    }
 
     /**
      * Options for the substrMatch method.
@@ -143,18 +148,22 @@ public:
      * the search is case insensitive, non-Turkish case folding is used unless the
      * CaseFoldMode::Turkish is passed to mode.
      */
-    static bool substrMatch(const String& str,
-                            const String& find,
+    static bool substrMatch(const std::string& str,
+                            const std::string& find,
                             SubstrMatchOptions options,
                             CaseFoldMode mode = CaseFoldMode::kNormal);
 
-private:
     /**
-     * Private constructor used by substr, toLower, and removeDiacritics to build a String from
-     * UTF-32 data.
+     * Strips diacritics and case-folds the utf8 input string, as needed to support options.
+     *
+     * Returns an owned buffer containing the output utf8 string and an end iterator for the string
+     * (points at the first byte after the string).
      */
-    String(std::u32string&& src);
+    static std::pair<std::unique_ptr<char[]>, char*> prepForSubstrMatch(StringData utf8,
+                                                                        SubstrMatchOptions options,
+                                                                        CaseFoldMode mode);
 
+private:
     /**
      * Helper method for converting a UTF-8 string to a UTF-32 string.
      */
diff --git a/src/mongo/db/fts/unicode/string_test.cpp b/src/mongo/db/fts/unicode/string_test.cpp
index 9354f7bf25c..8dec9a7e8df 100644
--- a/src/mongo/db/fts/unicode/string_test.cpp
+++ b/src/mongo/db/fts/unicode/string_test.cpp
@@ -47,82 +47,146 @@ namespace unicode {
 
 using linenoise_utf8::copyString32to8;
 
+namespace {
+// Added to the end of strings to ensure the real content is tested with the vectored
+// implementation.
+const std::string filler(32, 'x');
+
+/**
+ * Converts return from prepForSubstrMatch to a StringData that is only useful in the current
+ * expression.
+ */
+StringData toStringData(std::pair<std::unique_ptr<char[]>, char*> result) {
+    return {result.first.get(), size_t(result.second - result.first.get())};
+}
+
+auto kDiacriticSensitive = String::kDiacriticSensitive;
+auto kCaseSensitive = String::kCaseSensitive;
+
+auto kTurkish = CaseFoldMode::kTurkish;
+auto kNormal = CaseFoldMode::kNormal;
+}
+
+
+// Macro to preserve line numbers and arguments in error messages.
+#define TEST_PREP_FOR_SUBSTR_MATCH(expected, input, options, caseFoldMode)                       \
+    ASSERT_EQ(expected, toStringData(String::prepForSubstrMatch(input, options, caseFoldMode))); \
+    ASSERT_EQ(expected + filler,                                                                 \
+              toStringData(String::prepForSubstrMatch(input + filler, options, caseFoldMode)))
+
 TEST(UnicodeString, RemoveDiacritics) {
+    // Test all ascii chars.
+    for (unsigned char ch = 0; ch <= 0x7F; ch++) {
+        const auto input = std::string(1, ch);
+        const auto output = codepointIsDiacritic(ch) ? std::string() : std::string(1, ch);
+        if (ch) {  // String's constructor doesn't handle embedded NUL bytes.
+            ASSERT_EQUALS(output, String(input).removeDiacritics().toString());
+        }
+        TEST_PREP_FOR_SUBSTR_MATCH(output, input, kCaseSensitive, kNormal);
+    }
+
     // NFC Normalized Text.
-    String test1 = String(UTF8("¿CUÁNTOS AÑOS TIENES TÚ?"));
+    const char test1[] = UTF8("¿CUÁNTOS AÑOS TIENES TÚ?");
 
     // NFD Normalized Text ("Café").
     const char test2[] = {'C', 'a', 'f', 'e', static_cast<char>(0xcc), static_cast<char>(0x81), 0};
 
-    ASSERT_EQUALS(UTF8("¿CUANTOS ANOS TIENES TU?"), test1.removeDiacritics().toString());
+    ASSERT_EQUALS(UTF8("¿CUANTOS ANOS TIENES TU?"), String(test1).removeDiacritics().toString());
     ASSERT_EQUALS(UTF8("Cafe"), String(test2).removeDiacritics().toString());
+
+    TEST_PREP_FOR_SUBSTR_MATCH(UTF8("¿CUANTOS ANOS TIENES TU?"), test1, kCaseSensitive, kNormal);
+    TEST_PREP_FOR_SUBSTR_MATCH(UTF8("Cafe"), test2, kCaseSensitive, kNormal);
 }
 
 TEST(UnicodeString, CaseFolding) {
-    String test1 = String(UTF8("СКОЛЬКО ТЕБЕ ЛЕТ?"));
-    String test2 = String(UTF8("¿CUÁNTOS AÑOS TIENES TÚ?"));
+    // Test all ascii chars.
+    for (unsigned char ch = 0; ch <= 0x7F; ch++) {
+        const auto upper = std::string(1, ch);
+        const auto lower = std::string(1, std::tolower(ch));
+        if (ch) {  // String's constructor doesn't handle embedded NUL bytes.
+            ASSERT_EQUALS(lower, String(upper).toLower().toString());
+        }
+        TEST_PREP_FOR_SUBSTR_MATCH(lower, upper, kDiacriticSensitive, kNormal);
+    }
+
+    const char test1[] = UTF8("СКОЛЬКО ТЕБЕ ЛЕТ?");
+    const char test2[] = UTF8("¿CUÁNTOS AÑOS TIENES TÚ?");
 
-    ASSERT_EQUALS(UTF8("сколько тебе лет?"), test1.toLower().toString());
-    ASSERT_EQUALS(UTF8("¿cuántos años tienes tú?"), test2.toLower().toString());
+    ASSERT_EQUALS(UTF8("сколько тебе лет?"), String(test1).toLower().toString());
+    ASSERT_EQUALS(UTF8("¿cuántos años tienes tú?"), String(test2).toLower().toString());
+
+    TEST_PREP_FOR_SUBSTR_MATCH(UTF8("сколько тебе лет?"), test1, kDiacriticSensitive, kNormal);
+    TEST_PREP_FOR_SUBSTR_MATCH(
+        UTF8("¿cuántos años tienes tú?"), test2, kDiacriticSensitive, kNormal);
 }
 
 TEST(UnicodeString, CaseFoldingTurkish) {
-    String test1 = String(UTF8("KAC YASINDASINIZ"));
-    String test2 = String(UTF8("KAC YASİNDASİNİZ"));
+    const char test1[] = UTF8("KAC YASINDASINIZ");
+    const char test2[] = UTF8("KAC YASİNDASİNİZ");
+
+    ASSERT_EQUALS(UTF8("kac yasındasınız"),
+                  String(test1).toLower(CaseFoldMode::kTurkish).toString());
+    ASSERT_EQUALS(UTF8("kac yasindasiniz"),
+                  String(test2).toLower(CaseFoldMode::kTurkish).toString());
 
-    ASSERT_EQUALS(UTF8("kac yasındasınız"), test1.toLower(CaseFoldMode::kTurkish).toString());
-    ASSERT_EQUALS(UTF8("kac yasindasiniz"), test2.toLower(CaseFoldMode::kTurkish).toString());
+    TEST_PREP_FOR_SUBSTR_MATCH(UTF8("kac yasındasınız"), test1, kDiacriticSensitive, kTurkish);
+    TEST_PREP_FOR_SUBSTR_MATCH(UTF8("kac yasindasiniz"), test2, kDiacriticSensitive, kTurkish);
 }
 
 TEST(UnicodeString, CaseFoldingAndRemoveDiacritics) {
     // NFC Normalized Text.
-    String test1 = String(UTF8("Πόσο χρονών είσαι?"));
-    String test2 = String(UTF8("¿CUÁNTOS AÑOS TIENES TÚ?"));
+    const char test1[] = UTF8("Πόσο χρονών είσαι?");
+    const char test2[] = UTF8("¿CUÁNTOS AÑOS TIENES TÚ?");
 
     // NFD Normalized Text ("CAFÉ").
     const char test3[] = {'C', 'A', 'F', 'E', static_cast<char>(0xcc), static_cast<char>(0x81), 0};
 
-    ASSERT_EQUALS(UTF8("ποσο χρονων εισαι?"), test1.toLower().removeDiacritics().toString());
-    ASSERT_EQUALS(UTF8("¿cuantos anos tienes tu?"), test2.toLower().removeDiacritics().toString());
+    ASSERT_EQUALS(UTF8("ποσο χρονων εισαι?"),
+                  String(test1).toLower().removeDiacritics().toString());
+    ASSERT_EQUALS(UTF8("¿cuantos anos tienes tu?"),
+                  String(test2).toLower().removeDiacritics().toString());
     ASSERT_EQUALS(UTF8("cafe"), String(test3).toLower().removeDiacritics().toString());
+
+    TEST_PREP_FOR_SUBSTR_MATCH(UTF8("ποσο χρονων εισαι?"), test1, 0, kNormal);
+    TEST_PREP_FOR_SUBSTR_MATCH(UTF8("¿cuantos anos tienes tu?"), test2, 0, kNormal);
+    TEST_PREP_FOR_SUBSTR_MATCH(UTF8("cafe"), test3, 0, kNormal);
 }
 
 TEST(UnicodeString, SubstringMatch) {
-    String str = String(UTF8("Одумайся! Престол свой сохрани; И ярость укроти."));
+    std::string str = UTF8("Одумайся! Престол свой сохрани; И ярость укроти.");
 
     // Case insensitive & diacritic insensitive.
-    ASSERT(String::substrMatch(str, String(UTF8("ПРЁСТОЛ СВОИ")), String::kNone));
-    ASSERT_FALSE(String::substrMatch(str, String(UTF8("Престол сохрани")), String::kNone));
+    ASSERT(String::substrMatch(str, UTF8("ПРЁСТОЛ СВОИ"), String::kNone));
+    ASSERT_FALSE(String::substrMatch(str, UTF8("Престол сохрани"), String::kNone));
 
     // Case sensitive & diacritic insensitive.
-    ASSERT(String::substrMatch(str, String(UTF8("Одумаися!")), String::kCaseSensitive));
-    ASSERT_FALSE(String::substrMatch(str, String(UTF8("одумайся!")), String::kCaseSensitive));
+    ASSERT(String::substrMatch(str, UTF8("Одумаися!"), String::kCaseSensitive));
+    ASSERT_FALSE(String::substrMatch(str, UTF8("одумайся!"), String::kCaseSensitive));
 
     // Case insensitive & diacritic sensitive.
-    ASSERT(String::substrMatch(str, String(UTF8("одумайся!")), String::kDiacriticSensitive));
-    ASSERT_FALSE(String::substrMatch(str, String(UTF8("Одумаися!")), String::kDiacriticSensitive));
+    ASSERT(String::substrMatch(str, UTF8("одумайся!"), String::kDiacriticSensitive));
+    ASSERT_FALSE(String::substrMatch(str, UTF8("Одумаися!"), String::kDiacriticSensitive));
 
     // Case sensitive & diacritic sensitive.
     ASSERT(String::substrMatch(
-        str, String(UTF8("Одумайся!")), String::kDiacriticSensitive | String::kCaseSensitive));
+        str, UTF8("Одумайся!"), String::kDiacriticSensitive | String::kCaseSensitive));
     ASSERT_FALSE(String::substrMatch(
-        str, String(UTF8("Одумаися!")), String::kDiacriticSensitive | String::kCaseSensitive));
+        str, UTF8("Одумаися!"), String::kDiacriticSensitive | String::kCaseSensitive));
 }
 
 TEST(UnicodeString, SubstringMatchTurkish) {
-    String str = String(UTF8("KAÇ YAŞINDASINIZ?"));
+    std::string str = UTF8("KAÇ YAŞINDASINIZ?");
 
     // Case insensitive & diacritic insensitive.
-    ASSERT(String::substrMatch(
-        str, String(UTF8("yasındasınız")), String::kNone, CaseFoldMode::kTurkish));
-    ASSERT_FALSE(String::substrMatch(
-        str, String(UTF8("yasindasiniz")), String::kNone, CaseFoldMode::kTurkish));
+    ASSERT(String::substrMatch(str, UTF8("yasındasınız"), String::kNone, CaseFoldMode::kTurkish));
+    ASSERT_FALSE(
+        String::substrMatch(str, UTF8("yasindasiniz"), String::kNone, CaseFoldMode::kTurkish));
 
     // Case insensitive & diacritic sensitive.
     ASSERT(String::substrMatch(
-        str, String(UTF8("yaşındasınız")), String::kDiacriticSensitive, CaseFoldMode::kTurkish));
+        str, UTF8("yaşındasınız"), String::kDiacriticSensitive, CaseFoldMode::kTurkish));
     ASSERT_FALSE(String::substrMatch(
-        str, String(UTF8("yaşindasiniz")), String::kDiacriticSensitive, CaseFoldMode::kTurkish));
+        str, UTF8("yaşindasiniz"), String::kDiacriticSensitive, CaseFoldMode::kTurkish));
 }
 
 TEST(UnicodeString, BadUTF8) {
@@ -153,6 +217,14 @@ TEST(UnicodeString, BadUTF8) {
     ASSERT_THROWS(String test2(invalid2), AssertionException);
     ASSERT_THROWS(String test3(invalid3), AssertionException);
     ASSERT_THROWS(String test4(invalid4), AssertionException);
+
+    // preForSubstrMatch doesn't make any guarantees about behavior when fed invalid utf8.
+    // These calls are to ensure that they don't trigger any faults in sanitizing builds.
+    String::prepForSubstrMatch(invalid1, 0, kNormal);
+    String::prepForSubstrMatch(invalid2, 0, kNormal);
+    String::prepForSubstrMatch(invalid3, 0, kNormal);
+
+    ASSERT_THROWS(String::prepForSubstrMatch(invalid4, 0, kNormal), AssertionException);
 }
 
 TEST(UnicodeString, UTF32ToUTF8) {
author	Mathias Stearn <mathias@10gen.com>	2016-02-24 16:35:54 -0500
committer	Mathias Stearn <mathias@10gen.com>	2016-03-11 08:50:18 -0500
commit	6c3157f126bb44ab275325e85de7abee5ce9ad6d (patch)
tree	16189ba18d0febfdd564006c1f03f008527f4f41 /src/mongo/db/fts/unicode
parent	4a35c7184e188354793f16d27e2330b3b5ce7f8f (diff)
download	mongo-6c3157f126bb44ab275325e85de7abee5ce9ad6d.tar.gz