SERVER-19639 Add Unicode phrase matcher and tokenizer for FTS

author: Adam Chelminski <adam.chelminski@mongodb.com> 2015-07-24 18:15:20 -0400
committer: Adam Chelminski <adam.chelminski@mongodb.com> 2015-08-07 17:12:21 -0400
commit: 5c053bde2267fadd42fdf71ceea047cbc1480d6d (patch)
tree: 4ed81b79cb60ef34769401c639caf5514f7735dc /src/mongo/db/fts
parent: 326aa0029a29f06772665750400473db69945234 (diff)
download: mongo-5c053bde2267fadd42fdf71ceea047cbc1480d6d.tar.gz
20 files changed, 816 insertions, 48 deletions
diff --git a/src/mongo/db/fts/SConscript b/src/mongo/db/fts/SConscript
index 3a769baca98..3a98b50f8cc 100644
--- a/src/mongo/db/fts/SConscript
+++ b/src/mongo/db/fts/SConscript
@@ -42,6 +42,8 @@ baseEnv.Library('base', [
         'fts_language.cpp',
         'fts_basic_phrase_matcher.cpp',
         'fts_basic_tokenizer.cpp',
+        'fts_unicode_phrase_matcher.cpp',
+        'fts_unicode_tokenizer.cpp',
         'fts_util.cpp',
         'fts_element_iterator.cpp',
         'stemmer.cpp',
@@ -72,19 +74,22 @@ env.Library('ftsmongos', [
         ], LIBDEPS=["server_common"])
 
 
-env.CppUnitTest( "fts_index_format_test", "fts_index_format_test.cpp",
+env.CppUnitTest( "fts_basic_phrase_matcher_test", "fts_basic_phrase_matcher_test.cpp",
                  LIBDEPS=["base"] )
 
-env.CppUnitTest( "fts_stop_words_test", "stop_words_test.cpp",
+env.CppUnitTest( "fts_basic_tokenizer_test", "fts_basic_tokenizer_test.cpp",
                  LIBDEPS=["base"] )
 
-env.CppUnitTest( "fts_stemmer_test", "stemmer_test.cpp",
+env.CppUnitTest( "fts_element_iterator_test", "fts_element_iterator_test.cpp",
                  LIBDEPS=["base"] )
 
-env.CppUnitTest( "fts_basic_tokenizer_test", "fts_basic_tokenizer_test.cpp",
+env.CppUnitTest( "fts_index_format_test", "fts_index_format_test.cpp",
                  LIBDEPS=["base"] )
 
-env.CppUnitTest( "fts_tokenizer_test", "tokenizer_test.cpp",
+env.CppUnitTest( "fts_language_test", "fts_language_test.cpp",
+                 LIBDEPS=["base"] )
+
+env.CppUnitTest( "fts_matcher_test", "fts_matcher_test.cpp",
                  LIBDEPS=["base"] )
 
 env.CppUnitTest( "fts_query_test", "fts_query_test.cpp",
@@ -93,14 +98,17 @@ env.CppUnitTest( "fts_query_test", "fts_query_test.cpp",
 env.CppUnitTest( "fts_spec_test", "fts_spec_test.cpp",
                  LIBDEPS=["base"] )
 
-env.CppUnitTest( "fts_language_test", "fts_language_test.cpp",
+env.CppUnitTest( "fts_stemmer_test", "stemmer_test.cpp",
                  LIBDEPS=["base"] )
 
-env.CppUnitTest( "fts_matcher_test", "fts_matcher_test.cpp",
+env.CppUnitTest( "fts_stop_words_test", "stop_words_test.cpp",
                  LIBDEPS=["base"] )
 
-env.CppUnitTest( "fts_element_iterator_test", "fts_element_iterator_test.cpp",
+env.CppUnitTest( "fts_tokenizer_test", "tokenizer_test.cpp",
                  LIBDEPS=["base"] )
 
-env.CppUnitTest( "fts_basic_phrase_matcher_test", "fts_basic_phrase_matcher_test.cpp",
+env.CppUnitTest( "fts_unicode_phrase_matcher_test", "fts_unicode_phrase_matcher_test.cpp",
+                 LIBDEPS=["base"] )
+
+env.CppUnitTest( "fts_unicode_tokenizer_test", "fts_unicode_tokenizer_test.cpp",
                  LIBDEPS=["base"] )
diff --git a/src/mongo/db/fts/fts_basic_phrase_matcher.cpp b/src/mongo/db/fts/fts_basic_phrase_matcher.cpp
index e32174d3b15..da6acc0bbab 100644
--- a/src/mongo/db/fts/fts_basic_phrase_matcher.cpp
+++ b/src/mongo/db/fts/fts_basic_phrase_matcher.cpp
@@ -37,7 +37,7 @@ using std::string;
 
 bool BasicFTSPhraseMatcher::phraseMatches(const string& phrase,
                                           const string& haystack,
-                                          PhraseMatcherOptions options) const {
+                                          Options options) const {
     if (options & kCaseSensitive) {
         return haystack.find(phrase) != string::npos;
     }
diff --git a/src/mongo/db/fts/fts_basic_phrase_matcher.h b/src/mongo/db/fts/fts_basic_phrase_matcher.h
index c595b9c3bf9..5fd07d28cee 100644
--- a/src/mongo/db/fts/fts_basic_phrase_matcher.h
+++ b/src/mongo/db/fts/fts_basic_phrase_matcher.h
@@ -36,7 +36,8 @@ namespace fts {
 
 /**
  * A phrase matcher that looks for exact substring matches with optional ASCII-aware case
- * insensitivity.
+ * insensitivity. This phrase matcher does not implement the kDiacriticSensitive match option. All
+ * operations are inherently diacritic sensitive.
  */
 class BasicFTSPhraseMatcher final : public FTSPhraseMatcher {
     MONGO_DISALLOW_COPYING(BasicFTSPhraseMatcher);
@@ -46,7 +47,7 @@ public:
 
     bool phraseMatches(const std::string& phrase,
                        const std::string& haystack,
-                       PhraseMatcherOptions options) const final;
+                       Options options) const override;
 };
 
 }  // namespace fts
diff --git a/src/mongo/db/fts/fts_basic_phrase_matcher_test.cpp b/src/mongo/db/fts/fts_basic_phrase_matcher_test.cpp
index 9f193e836ad..14ca5b9d95a 100644
--- a/src/mongo/db/fts/fts_basic_phrase_matcher_test.cpp
+++ b/src/mongo/db/fts/fts_basic_phrase_matcher_test.cpp
@@ -44,7 +44,7 @@ TEST(FtsBasicPhraseMatcher, CaseInsensitive) {
     std::string nofind2 = "dolor velit";
 
     BasicFTSPhraseMatcher phraseMatcher;
-    FTSPhraseMatcher::PhraseMatcherOptions options = FTSPhraseMatcher::kNone;
+    FTSPhraseMatcher::Options options = FTSPhraseMatcher::kNone;
 
     ASSERT(phraseMatcher.phraseMatches(find1, str1, options));
     ASSERT(phraseMatcher.phraseMatches(find2, str2, options));
@@ -64,7 +64,7 @@ TEST(FtsBasicPhraseMatcher, CaseSensitive) {
     std::string nofind2 = "Irure dolor";
 
     BasicFTSPhraseMatcher phraseMatcher;
-    FTSPhraseMatcher::PhraseMatcherOptions options = FTSPhraseMatcher::kCaseSensitive;
+    FTSPhraseMatcher::Options options = FTSPhraseMatcher::kCaseSensitive;
 
     ASSERT(phraseMatcher.phraseMatches(find1, str1, options));
     ASSERT(phraseMatcher.phraseMatches(find2, str2, options));
diff --git a/src/mongo/db/fts/fts_basic_tokenizer.cpp b/src/mongo/db/fts/fts_basic_tokenizer.cpp
index 9fc41923d40..a053d21140a 100644
--- a/src/mongo/db/fts/fts_basic_tokenizer.cpp
+++ b/src/mongo/db/fts/fts_basic_tokenizer.cpp
@@ -74,11 +74,11 @@ bool BasicFTSTokenizer::moveNext() {
 
         // Stop words are case-sensitive so we need them to be lower cased to check
         // against the stop word list
-        if ((_options & FTSTokenizer::FilterStopWords) && _stopWords->isStopWord(word)) {
+        if ((_options & FTSTokenizer::kFilterStopWords) && _stopWords->isStopWord(word)) {
             continue;
         }
 
-        if (_options & FTSTokenizer::GenerateCaseSensitiveTokens) {
+        if (_options & FTSTokenizer::kGenerateCaseSensitiveTokens) {
             word = token.data.toString();
         }
 
diff --git a/src/mongo/db/fts/fts_basic_tokenizer.h b/src/mongo/db/fts/fts_basic_tokenizer.h
index 221de72bb8c..1206f494e57 100644
--- a/src/mongo/db/fts/fts_basic_tokenizer.h
+++ b/src/mongo/db/fts/fts_basic_tokenizer.h
@@ -52,18 +52,21 @@ class StopWords;
  *
  * For each word returns a stem version of a word optimized for full text indexing.
  * Optionally supports returning case sensitive search terms.
+ *
+ * BasicFTSTokenizer does not implement the kGenerateDiacriticSensitiveTokens option. All tokens
+ * generated by the BasicFTSTokenizer are inherently diacritic sensitive.
  */
-class BasicFTSTokenizer : public FTSTokenizer {
+class BasicFTSTokenizer final : public FTSTokenizer {
     MONGO_DISALLOW_COPYING(BasicFTSTokenizer);
 
 public:
     BasicFTSTokenizer(const FTSLanguage* language);
 
-    void reset(StringData document, Options options) final;
+    void reset(StringData document, Options options) override;
 
-    bool moveNext() final;
+    bool moveNext() override;
 
-    StringData get() const final;
+    StringData get() const override;
 
 private:
     const FTSLanguage* const _language;
diff --git a/src/mongo/db/fts/fts_basic_tokenizer_test.cpp b/src/mongo/db/fts/fts_basic_tokenizer_test.cpp
index 5feab67face..0359da805d3 100644
--- a/src/mongo/db/fts/fts_basic_tokenizer_test.cpp
+++ b/src/mongo/db/fts/fts_basic_tokenizer_test.cpp
@@ -39,7 +39,7 @@ std::vector<std::string> tokenizeString(const char* str, const char* language) {
 
     std::unique_ptr<FTSTokenizer> tokenizer(swl.getValue()->createTokenizer());
 
-    tokenizer->reset(str, FTSTokenizer::None);
+    tokenizer->reset(str, FTSTokenizer::kNone);
 
     std::vector<std::string> terms;
 
diff --git a/src/mongo/db/fts/fts_matcher.cpp b/src/mongo/db/fts/fts_matcher.cpp
index 52c67e337e9..a4b2a6e4638 100644
--- a/src/mongo/db/fts/fts_matcher.cpp
+++ b/src/mongo/db/fts/fts_matcher.cpp
@@ -82,8 +82,8 @@ bool FTSMatcher::_hasPositiveTerm_string(const FTSLanguage* language, const stri
     std::unique_ptr<FTSTokenizer> tokenizer(language->createTokenizer());
 
     tokenizer->reset(raw.c_str(),
-                     _query.getCaseSensitive() ? FTSTokenizer::GenerateCaseSensitiveTokens
-                                               : FTSTokenizer::None);
+                     _query.getCaseSensitive() ? FTSTokenizer::kGenerateCaseSensitiveTokens
+                                               : FTSTokenizer::kNone);
 
     while (tokenizer->moveNext()) {
         string word = tokenizer->get().toString();
@@ -115,8 +115,8 @@ bool FTSMatcher::_hasNegativeTerm_string(const FTSLanguage* language, const stri
     std::unique_ptr<FTSTokenizer> tokenizer(language->createTokenizer());
 
     tokenizer->reset(raw.c_str(),
-                     _query.getCaseSensitive() ? FTSTokenizer::GenerateCaseSensitiveTokens
-                                               : FTSTokenizer::None);
+                     _query.getCaseSensitive() ? FTSTokenizer::kGenerateCaseSensitiveTokens
+                                               : FTSTokenizer::kNone);
 
     while (tokenizer->moveNext()) {
         string word = tokenizer->get().toString();
diff --git a/src/mongo/db/fts/fts_phrase_matcher.h b/src/mongo/db/fts/fts_phrase_matcher.h
index ae7b8c8a9f9..7386852ebff 100644
--- a/src/mongo/db/fts/fts_phrase_matcher.h
+++ b/src/mongo/db/fts/fts_phrase_matcher.h
@@ -41,7 +41,7 @@ class FTSPhraseMatcher {
 public:
     virtual ~FTSPhraseMatcher() = default;
 
-    using PhraseMatcherOptions = uint8_t;
+    using Options = uint8_t;
 
     /**
      * Use no options.
@@ -54,11 +54,16 @@ public:
     static const int kCaseSensitive = 1 << 0;
 
     /**
+     * Do not remove diacritics (thus respecting them) as part of phrase matching.
+     */
+    static const int kDiacriticSensitive = 1 << 1;
+
+    /**
      * Does the string 'phrase' occur in the string 'haystack'?
      */
     virtual bool phraseMatches(const std::string& phrase,
                                const std::string& haystack,
-                               PhraseMatcherOptions options) const = 0;
+                               Options options) const = 0;
 };
 
 }  // namespace fts
diff --git a/src/mongo/db/fts/fts_query.cpp b/src/mongo/db/fts/fts_query.cpp
index 4fa332266a0..9fbf0e04978 100644
--- a/src/mongo/db/fts/fts_query.cpp
+++ b/src/mongo/db/fts/fts_query.cpp
@@ -133,7 +133,7 @@ Status FTSQuery::parse(const string& query,
 }
 
 void FTSQuery::_addTerms(FTSTokenizer* tokenizer, const string& sentence, bool negated) {
-    tokenizer->reset(sentence.c_str(), FTSTokenizer::FilterStopWords);
+    tokenizer->reset(sentence.c_str(), FTSTokenizer::kFilterStopWords);
 
     auto& activeTerms = negated ? _negatedTerms : _positiveTerms;
 
@@ -160,8 +160,7 @@ void FTSQuery::_addTerms(FTSTokenizer* tokenizer, const string& sentence, bool n
     }
 
     tokenizer->reset(sentence.c_str(),
-                     static_cast<FTSTokenizer::Options>(FTSTokenizer::FilterStopWords |
-                                                        FTSTokenizer::GenerateCaseSensitiveTokens));
+                     FTSTokenizer::kFilterStopWords | FTSTokenizer::kGenerateCaseSensitiveTokens);
 
     // If we want case-sensitivity, get the case-sensitive token
     while (tokenizer->moveNext()) {
diff --git a/src/mongo/db/fts/fts_spec.cpp b/src/mongo/db/fts/fts_spec.cpp
index eb7e018b522..1ec72152351 100644
--- a/src/mongo/db/fts/fts_spec.cpp
+++ b/src/mongo/db/fts/fts_spec.cpp
@@ -182,7 +182,7 @@ void FTSSpec::_scoreStringV2(FTSTokenizer* tokenizer,
 
     unsigned numTokens = 0;
 
-    tokenizer->reset(raw.rawData(), FTSTokenizer::FilterStopWords);
+    tokenizer->reset(raw.rawData(), FTSTokenizer::kFilterStopWords);
 
     while (tokenizer->moveNext()) {
         string term = tokenizer->get().toString();
diff --git a/src/mongo/db/fts/fts_tokenizer.h b/src/mongo/db/fts/fts_tokenizer.h
index 4c99506eae2..f6c71cd6c20 100644
--- a/src/mongo/db/fts/fts_tokenizer.h
+++ b/src/mongo/db/fts/fts_tokenizer.h
@@ -29,6 +29,8 @@
 
 #pragma once
 
+#include <cstdint>
+
 #include "mongo/base/disallow_copying.h"
 #include "mongo/base/string_data.h"
 
@@ -49,24 +51,29 @@ public:
     virtual ~FTSTokenizer() = default;
 
     /**
-     * Options for generating tokens
+     * Options for generating tokens.
+     */
+    using Options = uint8_t;
+
+    /**
+     * Default means lower cased, diacritics removed, and stop words are not filtered.
+     */
+    static const Options kNone = 0;
+
+    /**
+     * Do not lower case terms.
+     */
+    static const Options kGenerateCaseSensitiveTokens = 1 << 0;
+
+    /**
+     * Filter out stop words from return tokens.
+     */
+    static const Options kFilterStopWords = 1 << 1;
+
+    /**
+     * Do not remove diacritics from terms.
      */
-    enum Options {
-        /**
-         * Default means lower cased, and stop words are not filtered.
-         */
-        None = 0,
-
-        /**
-         * Do not lower case terms.
-         */
-        GenerateCaseSensitiveTokens = 1 << 0,
-
-        /**
-         * Filter out stop words from return tokens.
-         */
-        FilterStopWords = 1 << 1,
-    };
+    static const Options kGenerateDiacriticSensitiveTokens = 1 << 2;
 
     /**
      * Process a new document, and discards any previous results.
diff --git a/src/mongo/db/fts/fts_unicode_phrase_matcher.cpp b/src/mongo/db/fts/fts_unicode_phrase_matcher.cpp
new file mode 100644
index 00000000000..c467b474be1
--- /dev/null
+++ b/src/mongo/db/fts/fts_unicode_phrase_matcher.cpp
@@ -0,0 +1,65 @@
+/**
+ *    Copyright (C) 2015 MongoDB Inc.
+ *
+ *    This program is free software: you can redistribute it and/or  modify
+ *    it under the terms of the GNU Affero General Public License, version 3,
+ *    as published by the Free Software Foundation.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    GNU Affero General Public License for more details.
+ *
+ *    You should have received a copy of the GNU Affero General Public License
+ *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *    As a special exception, the copyright holders give permission to link the
+ *    code of portions of this program with the OpenSSL library under certain
+ *    conditions as described in each individual source file and distribute
+ *    linked combinations including the program with the OpenSSL library. You
+ *    must comply with the GNU Affero General Public License in all respects for
+ *    all of the code used other than as permitted herein. If you modify file(s)
+ *    with this exception, you may extend this exception to your version of the
+ *    file(s), but you are not obligated to do so. If you do not wish to do so,
+ *    delete this exception statement from your version. If you delete this
+ *    exception statement from all source files in the program, then also delete
+ *    it in the license file.
+ */
+
+#include "mongo/db/fts/fts_unicode_phrase_matcher.h"
+
+#include "mongo/db/fts/fts_language.h"
+#include "mongo/db/fts/unicode/string.h"
+
+namespace mongo {
+namespace fts {
+
+using std::string;
+
+UnicodeFTSPhraseMatcher::UnicodeFTSPhraseMatcher(const string& language) {
+    if (language == "turkish") {
+        _caseFoldMode = unicode::CaseFoldMode::kTurkish;
+    } else {
+        _caseFoldMode = unicode::CaseFoldMode::kNormal;
+    }
+}
+
+bool UnicodeFTSPhraseMatcher::phraseMatches(const string& phrase,
+                                            const string& haystack,
+                                            Options options) const {
+    unicode::String::SubstrMatchOptions matchOptions = unicode::String::kNone;
+
+    if (options & kCaseSensitive) {
+        matchOptions |= unicode::String::kCaseSensitive;
+    }
+
+    if (options & kDiacriticSensitive) {
+        matchOptions |= unicode::String::kDiacriticSensitive;
+    }
+
+    return unicode::String::substrMatch(
+        unicode::String(haystack), unicode::String(phrase), matchOptions, _caseFoldMode);
+}
+
+}  // namespace fts
+}  // namespace mongo
diff --git a/src/mongo/db/fts/fts_unicode_phrase_matcher.h b/src/mongo/db/fts/fts_unicode_phrase_matcher.h
new file mode 100644
index 00000000000..b584a7c7185
--- /dev/null
+++ b/src/mongo/db/fts/fts_unicode_phrase_matcher.h
@@ -0,0 +1,64 @@
+/**
+ *    Copyright (C) 2015 MongoDB Inc.
+ *
+ *    This program is free software: you can redistribute it and/or  modify
+ *    it under the terms of the GNU Affero General Public License, version 3,
+ *    as published by the Free Software Foundation.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    GNU Affero General Public License for more details.
+ *
+ *    You should have received a copy of the GNU Affero General Public License
+ *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *    As a special exception, the copyright holders give permission to link the
+ *    code of portions of this program with the OpenSSL library under certain
+ *    conditions as described in each individual source file and distribute
+ *    linked combinations including the program with the OpenSSL library. You
+ *    must comply with the GNU Affero General Public License in all respects for
+ *    all of the code used other than as permitted herein. If you modify file(s)
+ *    with this exception, you may extend this exception to your version of the
+ *    file(s), but you are not obligated to do so. If you do not wish to do so,
+ *    delete this exception statement from your version. If you delete this
+ *    exception statement from all source files in the program, then also delete
+ *    it in the license file.
+ */
+
+#pragma once
+
+#include "mongo/base/disallow_copying.h"
+#include "mongo/db/fts/fts_phrase_matcher.h"
+#include "mongo/db/fts/unicode/codepoints.h"
+
+namespace mongo {
+namespace fts {
+
+class FTSLanguage;
+
+/**
+ * UnicodeFTSPhraseMatcher
+ *
+ * A phrase matcher that looks for exact substring matches that ignore diacritics, and with UTF-8
+ * aware case folding if the phrase match is not specified as case sensitive. Optionally, the phrase
+ * matching can be diacritic sensitive if kDiacriticSensitive is set in the match options.
+ * Additionally, if the language string passed to the phrase matcher's constructor is "turkish"
+ * (uses the special I case fold mapping), the phrase matcher will take that into account.
+ */
+class UnicodeFTSPhraseMatcher final : public FTSPhraseMatcher {
+    MONGO_DISALLOW_COPYING(UnicodeFTSPhraseMatcher);
+
+public:
+    UnicodeFTSPhraseMatcher(const std::string& language);
+
+    bool phraseMatches(const std::string& phrase,
+                       const std::string& haystack,
+                       Options options) const override;
+
+private:
+    unicode::CaseFoldMode _caseFoldMode;
+};
+
+}  // namespace fts
+}  // namespace mongo
diff --git a/src/mongo/db/fts/fts_unicode_phrase_matcher_test.cpp b/src/mongo/db/fts/fts_unicode_phrase_matcher_test.cpp
new file mode 100644
index 00000000000..9fa63a61f14
--- /dev/null
+++ b/src/mongo/db/fts/fts_unicode_phrase_matcher_test.cpp
@@ -0,0 +1,137 @@
+/**
+ *    Copyright (C) 2015 MongoDB Inc.
+ *
+ *    This program is free software: you can redistribute it and/or  modify
+ *    it under the terms of the GNU Affero General Public License, version 3,
+ *    as published by the Free Software Foundation.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    GNU Affero General Public License for more details.
+ *
+ *    You should have received a copy of the GNU Affero General Public License
+ *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *    As a special exception, the copyright holders give permission to link the
+ *    code of portions of this program with the OpenSSL library under certain
+ *    conditions as described in each individual source file and distribute
+ *    linked combinations including the program with the OpenSSL library. You
+ *    must comply with the GNU Affero General Public License in all respects for
+ *    all of the code used other than as permitted herein. If you modify file(s)
+ *    with this exception, you may extend this exception to your version of the
+ *    file(s), but you are not obligated to do so. If you do not wish to do so,
+ *    delete this exception statement from your version. If you delete this
+ *    exception statement from all source files in the program, then also delete
+ *    it in the license file.
+ */
+
+#include "mongo/db/fts/fts_unicode_phrase_matcher.h"
+#include "mongo/unittest/unittest.h"
+
+namespace mongo {
+namespace fts {
+
+// Case insensitive & diacritic insensitive match.
+TEST(FtsUnicodePhraseMatcher, CaseAndDiacriticInsensitive) {
+    std::string str =
+        "El pingüino Wenceslao hizo kilómetros bajo exhaustiva lluvia y frío, añoraba";
+    std::string find1 = "pinguino wenceslao";
+    std::string find2 = "frio, anoraba";
+
+    std::string nofind1 = "bajo lluvia";
+    std::string nofind2 = "El Wenceslao";
+
+    UnicodeFTSPhraseMatcher phraseMatcher("spanish");
+    FTSPhraseMatcher::Options options = FTSPhraseMatcher::kNone;
+
+    ASSERT(phraseMatcher.phraseMatches(find1, str, options));
+    ASSERT(phraseMatcher.phraseMatches(find2, str, options));
+
+    ASSERT_FALSE(phraseMatcher.phraseMatches(nofind1, str, options));
+    ASSERT_FALSE(phraseMatcher.phraseMatches(nofind2, str, options));
+}
+
+// Case sensitive & diacritic insensitive match.
+TEST(FtsUnicodePhraseMatcher, CaseSensitiveAndDiacriticInsensitive) {
+    std::string str =
+        "El pingüino Wenceslao hizo kilómetros bajo exhaustiva lluvia y frío, añoraba";
+    std::string find1 = "pinguino Wenceslao";
+    std::string find2 = "El pinguino";
+
+    std::string nofind1 = "pinguino wenceslao";
+    std::string nofind2 = "el pinguino";
+
+    UnicodeFTSPhraseMatcher phraseMatcher("spanish");
+    FTSPhraseMatcher::Options options = FTSPhraseMatcher::kCaseSensitive;
+
+    ASSERT(phraseMatcher.phraseMatches(find1, str, options));
+    ASSERT(phraseMatcher.phraseMatches(find2, str, options));
+
+    ASSERT_FALSE(phraseMatcher.phraseMatches(nofind1, str, options));
+    ASSERT_FALSE(phraseMatcher.phraseMatches(nofind2, str, options));
+}
+
+// Case insensitive & diacritic sensitive match.
+TEST(FtsUnicodePhraseMatcher, CaseInsensitiveAndDiacriticSensitive) {
+    std::string str =
+        "El pingüino Wenceslao hizo kilómetros bajo exhaustiva lluvia y frío, añoraba";
+    std::string find1 = "HIZO KILÓMETROS";
+    std::string find2 = "el pingüino";
+
+    std::string nofind1 = "hizo kilometros";
+    std::string nofind2 = "pinguino";
+
+    UnicodeFTSPhraseMatcher phraseMatcher("spanish");
+    FTSPhraseMatcher::Options options = FTSPhraseMatcher::kDiacriticSensitive;
+
+    ASSERT(phraseMatcher.phraseMatches(find1, str, options));
+    ASSERT(phraseMatcher.phraseMatches(find2, str, options));
+
+    ASSERT_FALSE(phraseMatcher.phraseMatches(nofind1, str, options));
+    ASSERT_FALSE(phraseMatcher.phraseMatches(nofind2, str, options));
+}
+
+// Case sensitive & diacritic sensitive match.
+TEST(FtsUnicodePhraseMatcher, CaseAndDiacriticSensitive) {
+    std::string str =
+        "El pingüino Wenceslao hizo kilómetros bajo exhaustiva lluvia y frío, añoraba";
+    std::string find1 = "pingüino Wenceslao";
+    std::string find2 = "kilómetros bajo";
+
+    std::string nofind1 = "pinguino Wenceslao";
+    std::string nofind2 = "kilómetros BaJo";
+
+    UnicodeFTSPhraseMatcher phraseMatcher("spanish");
+    FTSPhraseMatcher::Options options =
+        FTSPhraseMatcher::kCaseSensitive | FTSPhraseMatcher::kDiacriticSensitive;
+
+    ASSERT(phraseMatcher.phraseMatches(find1, str, options));
+    ASSERT(phraseMatcher.phraseMatches(find2, str, options));
+
+    ASSERT_FALSE(phraseMatcher.phraseMatches(nofind1, str, options));
+    ASSERT_FALSE(phraseMatcher.phraseMatches(nofind2, str, options));
+}
+
+// Case insensitive & diacritic insensitive match, using the Turkish case fold mapping.
+TEST(FtsUnicodePhraseMatcher, CaseAndDiacriticInsensitiveTurkish) {
+    std::string str = "Pijamalı hasta yağız şoföre çabucak güvendi.";
+    std::string find1 = "PİJAMALI hasta";
+    std::string find2 = "YAGIZ sofore";
+
+    std::string nofind1 = "çabucak GÜVENDI";
+    std::string nofind2 = "yagiz sofore";
+
+    UnicodeFTSPhraseMatcher phraseMatcher("turkish");
+    FTSPhraseMatcher::Options options = FTSPhraseMatcher::kNone;
+
+    ASSERT(phraseMatcher.phraseMatches(find1, str, options));
+    ASSERT(phraseMatcher.phraseMatches(find2, str, options));
+
+    ASSERT_FALSE(phraseMatcher.phraseMatches(nofind1, str, options));
+    ASSERT_FALSE(phraseMatcher.phraseMatches(nofind2, str, options));
+}
+
+
+}  // namespace fts
+}  // namespace mongo
diff --git a/src/mongo/db/fts/fts_unicode_tokenizer.cpp b/src/mongo/db/fts/fts_unicode_tokenizer.cpp
new file mode 100644
index 00000000000..8cdce180dea
--- /dev/null
+++ b/src/mongo/db/fts/fts_unicode_tokenizer.cpp
@@ -0,0 +1,125 @@
+/**
+ *    Copyright (C) 2015 MongoDB Inc.
+ *
+ *    This program is free software: you can redistribute it and/or  modify
+ *    it under the terms of the GNU Affero General Public License, version 3,
+ *    as published by the Free Software Foundation.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    GNU Affero General Public License for more details.
+ *
+ *    You should have received a copy of the GNU Affero General Public License
+ *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *    As a special exception, the copyright holders give permission to link the
+ *    code of portions of this program with the OpenSSL library under certain
+ *    conditions as described in each individual source file and distribute
+ *    linked combinations including the program with the OpenSSL library. You
+ *    must comply with the GNU Affero General Public License in all respects for
+ *    all of the code used other than as permitted herein. If you modify file(s)
+ *    with this exception, you may extend this exception to your version of the
+ *    file(s), but you are not obligated to do so. If you do not wish to do so,
+ *    delete this exception statement from your version. If you delete this
+ *    exception statement from all source files in the program, then also delete
+ *    it in the license file.
+ */
+
+#include "mongo/platform/basic.h"
+
+#include "mongo/db/fts/fts_unicode_tokenizer.h"
+
+#include "mongo/db/fts/fts_query.h"
+#include "mongo/db/fts/fts_spec.h"
+#include "mongo/db/fts/stemmer.h"
+#include "mongo/db/fts/stop_words.h"
+#include "mongo/db/fts/tokenizer.h"
+#include "mongo/stdx/memory.h"
+#include "mongo/util/mongoutils/str.h"
+#include "mongo/util/stringutils.h"
+
+namespace mongo {
+namespace fts {
+
+using std::string;
+
+UnicodeFTSTokenizer::UnicodeFTSTokenizer(const FTSLanguage* language)
+    : _language(language), _stemmer(language), _stopWords(StopWords::getStopWords(language)) {
+    if (_language->str() == "english") {
+        _delimListLanguage = unicode::DelimiterListLanguage::kEnglish;
+    } else {
+        _delimListLanguage = unicode::DelimiterListLanguage::kNotEnglish;
+    }
+
+    if (_language->str() == "turkish") {
+        _caseFoldMode = unicode::CaseFoldMode::kTurkish;
+    } else {
+        _caseFoldMode = unicode::CaseFoldMode::kNormal;
+    }
+}
+
+void UnicodeFTSTokenizer::reset(StringData document, Options options) {
+    _options = options;
+    _pos = 0;
+    _document = unicode::String(document);
+
+    // Skip any leading delimiters (and handle the case where the document is entirely delimiters).
+    _skipDelimiters();
+}
+
+bool UnicodeFTSTokenizer::moveNext() {
+    while (true) {
+        if (_pos >= _document.size()) {
+            _stem = "";
+            return false;
+        }
+
+        // Traverse through non-delimiters and build the next token.
+        size_t start = _pos++;
+        while (_pos < _document.size() &&
+               (!unicode::codepointIsDelimiter(_document[_pos], _delimListLanguage))) {
+            ++_pos;
+        }
+        unicode::String token = _document.substr(start, _pos - start);
+
+        // Skip the delimiters before the next token.
+        _skipDelimiters();
+
+        // Stop words are case-sensitive and diacritic sensitive, so we need them to be lower cased
+        // but with diacritics not removed to check against the stop word list.
+        unicode::String word = token.toLower(_caseFoldMode);
+
+        if ((_options & kFilterStopWords) && _stopWords->isStopWord(word.toString())) {
+            continue;
+        }
+
+        if (_options & kGenerateCaseSensitiveTokens) {
+            word = token;
+        }
+
+        // The stemmer is diacritic sensitive, so stem the word before removing diacritics.
+        _stem = _stemmer.stem(word.toString());
+
+        if (!(_options & kGenerateDiacriticSensitiveTokens)) {
+            token.resetData(_stem);
+            _stem = token.removeDiacritics().toString();
+        }
+
+        return true;
+    }
+}
+
+StringData UnicodeFTSTokenizer::get() const {
+    return _stem;
+}
+
+void UnicodeFTSTokenizer::_skipDelimiters() {
+    while (_pos < _document.size() &&
+           unicode::codepointIsDelimiter(_document[_pos], _delimListLanguage)) {
+        ++_pos;
+    }
+}
+
+}  // namespace fts
+}  // namespace mongo
diff --git a/src/mongo/db/fts/fts_unicode_tokenizer.h b/src/mongo/db/fts/fts_unicode_tokenizer.h
new file mode 100644
index 00000000000..0312ffc300b
--- /dev/null
+++ b/src/mongo/db/fts/fts_unicode_tokenizer.h
@@ -0,0 +1,92 @@
+/**
+ *    Copyright (C) 2015 MongoDB Inc.
+ *
+ *    This program is free software: you can redistribute it and/or  modify
+ *    it under the terms of the GNU Affero General Public License, version 3,
+ *    as published by the Free Software Foundation.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    GNU Affero General Public License for more details.
+ *
+ *    You should have received a copy of the GNU Affero General Public License
+ *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *    As a special exception, the copyright holders give permission to link the
+ *    code of portions of this program with the OpenSSL library under certain
+ *    conditions as described in each individual source file and distribute
+ *    linked combinations including the program with the OpenSSL library. You
+ *    must comply with the GNU Affero General Public License in all respects for
+ *    all of the code used other than as permitted herein. If you modify file(s)
+ *    with this exception, you may extend this exception to your version of the
+ *    file(s), but you are not obligated to do so. If you do not wish to do so,
+ *    delete this exception statement from your version. If you delete this
+ *    exception statement from all source files in the program, then also delete
+ *    it in the license file.
+ */
+
+#pragma once
+
+#include "mongo/base/disallow_copying.h"
+#include "mongo/base/string_data.h"
+#include "mongo/db/fts/fts_tokenizer.h"
+#include "mongo/db/fts/stemmer.h"
+#include "mongo/db/fts/tokenizer.h"
+#include "mongo/db/fts/unicode/string.h"
+
+namespace mongo {
+namespace fts {
+
+class FTSLanguage;
+class StopWords;
+
+/**
+ * UnicodeFTSTokenizer
+ * An iterator over the words of a "document", where the words are delimited by a predefined set
+ * of Unicode delimiters (see gen_delimiter_list.py).
+ * Uses
+ * - A list of Unicode delimiters for tokenizing words (see gen_delimiter_list.py).
+ * - tolower from mongo::unicode, which supports UTF-8 simple and Turkish case folding
+ * - Stemmer (ie, Snowball Stemmer) to stem words.
+ * - Embedded stop word lists for each language in StopWord class
+ *
+ * For each word returns a stem version of a word optimized for full text indexing.
+ * Optionally supports returning case sensitive and/or diacritic sensitive search terms.
+ */
+class UnicodeFTSTokenizer final : public FTSTokenizer {
+    MONGO_DISALLOW_COPYING(UnicodeFTSTokenizer);
+
+public:
+    UnicodeFTSTokenizer(const FTSLanguage* language);
+
+    void reset(StringData document, Options options) override;
+
+    bool moveNext() override;  // Returns false once the document has been consumed.
+
+    StringData get() const override;  // The token generated by the last moveNext() call.
+
+private:
+    /**
+     * Helper that moves the tokenizer past all delimiters that shouldn't be considered part of
+     * tokens.
+     */
+    void _skipDelimiters();
+
+    unicode::DelimiterListLanguage _delimListLanguage;  // Selects the delimiter list to split on.
+    unicode::CaseFoldMode _caseFoldMode;  // Case folding mode handed to toLower().
+
+    const FTSLanguage* const _language;
+    const Stemmer _stemmer;  // Stems each token in moveNext().
+    const StopWords* const _stopWords;  // Consulted when kFilterStopWords is set.
+
+    unicode::String _document;  // Text that the tokens are extracted from.
+    size_t _pos;  // Index into _document of the next codepoint to process.
+
+    Options _options;  // Option flags checked by moveNext().
+
+    std::string _stem;  // Current token, as returned by get().
+};
+
+}  // namespace fts
+}  // namespace mongo
diff --git a/src/mongo/db/fts/fts_unicode_tokenizer_test.cpp b/src/mongo/db/fts/fts_unicode_tokenizer_test.cpp
new file mode 100644
index 00000000000..e73c9599682
--- /dev/null
+++ b/src/mongo/db/fts/fts_unicode_tokenizer_test.cpp
@@ -0,0 +1,244 @@
+/**
+ *    Copyright (C) 2015 MongoDB Inc.
+ *
+ *    This program is free software: you can redistribute it and/or  modify
+ *    it under the terms of the GNU Affero General Public License, version 3,
+ *    as published by the Free Software Foundation.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    GNU Affero General Public License for more details.
+ *
+ *    You should have received a copy of the GNU Affero General Public License
+ *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *    As a special exception, the copyright holders give permission to link the
+ *    code of portions of this program with the OpenSSL library under certain
+ *    conditions as described in each individual source file and distribute
+ *    linked combinations including the program with the OpenSSL library. You
+ *    must comply with the GNU Affero General Public License in all respects for
+ *    all of the code used other than as permitted herein. If you modify file(s)
+ *    with this exception, you may extend this exception to your version of the
+ *    file(s), but you are not obligated to do so. If you do not wish to do so,
+ *    delete this exception statement from your version. If you delete this
+ *    exception statement from all source files in the program, then also delete
+ *    it in the license file.
+ */
+
+#include "mongo/db/fts/fts_spec.h"
+#include "mongo/db/fts/fts_tokenizer.h"
+#include "mongo/db/fts/fts_unicode_tokenizer.h"
+#include "mongo/unittest/unittest.h"
+
+namespace mongo {
+namespace fts {
+
+// Test helper: runs a UnicodeFTSTokenizer for 'language' over 'str' and returns every term.
+std::vector<std::string> tokenizeString(const char* str,
+                                        const char* language,
+                                        FTSTokenizer::Options options) {
+    StatusWithFTSLanguage languageStatus = FTSLanguage::make(language, TEXT_INDEX_VERSION_2);
+    ASSERT_OK(languageStatus);
+
+    UnicodeFTSTokenizer tokenizer(languageStatus.getValue());
+    tokenizer.reset(str, options);
+
+    // Drain the tokenizer, keeping the generated terms in order.
+    std::vector<std::string> collected;
+    for (bool more = tokenizer.moveNext(); more; more = tokenizer.moveNext()) {
+        collected.push_back(tokenizer.get().toString());
+    }
+
+    return collected;
+}
+
+// Ensure punctuation is dropped and that the 's does not become a token of its own.
+TEST(FtsUnicodeTokenizer, English) {
+    const std::vector<std::string> tokens =
+        tokenizeString("Do you see Mark's dog running?", "english", FTSTokenizer::kNone);
+
+    ASSERT_EQUALS(6U, tokens.size());
+    ASSERT_EQUALS("do", tokens[0]);
+    ASSERT_EQUALS("you", tokens[1]);
+    ASSERT_EQUALS("see", tokens[2]);
+    ASSERT_EQUALS("mark", tokens[3]);
+    ASSERT_EQUALS("dog", tokens[4]);
+    ASSERT_EQUALS("run", tokens[5]);
+}
+
+// Leading and trailing delimiters around the document must not change the tokens that are
+// generated.
+TEST(FtsUnicodeTokenizer, EnglishLeadingAndTrailingDelimiters) {
+    const std::vector<std::string> tokens =
+        tokenizeString("  , Do you see Mark's dog running?   ", "english", FTSTokenizer::kNone);
+
+    ASSERT_EQUALS(6U, tokens.size());
+    ASSERT_EQUALS("do", tokens[0]);
+    ASSERT_EQUALS("you", tokens[1]);
+    ASSERT_EQUALS("see", tokens[2]);
+    ASSERT_EQUALS("mark", tokens[3]);
+    ASSERT_EQUALS("dog", tokens[4]);
+    ASSERT_EQUALS("run", tokens[5]);
+}
+
+// A document made up solely of delimiters must not produce any tokens.
+TEST(FtsUnicodeTokenizer, OnlyDelimiters) {
+    const std::vector<std::string> tokens = tokenizeString("   ", "english", FTSTokenizer::kNone);
+
+    ASSERT_EQUALS(0U, tokens.size());
+}
+
+// Ensure ASCII and non-ASCII punctuation is dropped and that the 'est is split off from "C'est".
+TEST(FtsUnicodeTokenizer, FrenchAndNonAsciiPunctuation) {
+    const std::vector<std::string> tokens = tokenizeString(
+        "Voyez-vous «le chien» de Mark courante? C'est bien!", "french", FTSTokenizer::kNone);
+
+    ASSERT_EQUALS(10U, tokens.size());
+    ASSERT_EQUALS("voi", tokens[0]);
+    ASSERT_EQUALS("vous", tokens[1]);
+    ASSERT_EQUALS("le", tokens[2]);
+    ASSERT_EQUALS("chien", tokens[3]);
+    ASSERT_EQUALS("de", tokens[4]);
+    ASSERT_EQUALS("mark", tokens[5]);
+    ASSERT_EQUALS("cour", tokens[6]);
+    ASSERT_EQUALS("c", tokens[7]);
+    ASSERT_EQUALS("est", tokens[8]);
+    ASSERT_EQUALS("bien", tokens[9]);
+}
+
+// Ensure stemming happens before diacritics are removed: accented forms stem differently.
+TEST(FtsUnicodeTokenizer, FrenchDiacriticStemming) {
+    const std::vector<std::string> tokens =
+        tokenizeString("parlames, parlates, parlerent, parlâmes, parlâtes, parlèrent",
+                       "french",
+                       FTSTokenizer::kNone);
+
+    ASSERT_EQUALS(6U, tokens.size());
+    ASSERT_EQUALS("parlam", tokens[0]);
+    ASSERT_EQUALS("parlat", tokens[1]);
+    ASSERT_EQUALS("parlerent", tokens[2]);
+    ASSERT_EQUALS("parl", tokens[3]);
+    ASSERT_EQUALS("parl", tokens[4]);
+    ASSERT_EQUALS("parl", tokens[5]);
+}
+
+// Ensure punctuation is filtered out and that the generated tokens are lowercased and free of
+// diacritics.
+TEST(FtsUnicodeTokenizer, Turkish) {
+    const std::vector<std::string> tokens = tokenizeString(
+        "KAÇ YAŞINDASIN SEN, VE SEN NEREDEN VARDIR?", "turkish", FTSTokenizer::kNone);
+
+    ASSERT_EQUALS(7U, tokens.size());
+    ASSERT_EQUALS("kac", tokens[0]);
+    ASSERT_EQUALS("yas", tokens[1]);
+    ASSERT_EQUALS("sen", tokens[2]);
+    ASSERT_EQUALS("ve", tokens[3]);
+    ASSERT_EQUALS("sen", tokens[4]);
+    ASSERT_EQUALS("nere", tokens[5]);
+    ASSERT_EQUALS("var", tokens[6]);
+}
+
+// Ensure punctuation and diacritics are removed and that the tokens keep their case.
+TEST(FtsUnicodeTokenizer, TurkishCaseSensitive) {
+    const std::vector<std::string> tokens =
+        tokenizeString("KAÇ YAŞINDASIN SEN, VE SEN NEREDEN VARDIR?",
+                       "turkish",
+                       FTSTokenizer::kGenerateCaseSensitiveTokens);
+
+    ASSERT_EQUALS(7U, tokens.size());
+    ASSERT_EQUALS("KAC", tokens[0]);
+    ASSERT_EQUALS("YASINDASIN", tokens[1]);
+    ASSERT_EQUALS("SEN", tokens[2]);
+    ASSERT_EQUALS("VE", tokens[3]);
+    ASSERT_EQUALS("SEN", tokens[4]);
+    ASSERT_EQUALS("NEREDEN", tokens[5]);
+    ASSERT_EQUALS("VARDIR", tokens[6]);
+}
+
+// Ensure punctuation is filtered out, that the generated tokens are lowercased, and that they
+// keep their diacritics.
+TEST(FtsUnicodeTokenizer, TurkishDiacriticSensitive) {
+    const std::vector<std::string> tokens =
+        tokenizeString("KAÇ YAŞINDASIN SEN, VE SEN NEREDEN VARDIR?",
+                       "turkish",
+                       FTSTokenizer::kGenerateDiacriticSensitiveTokens);
+
+    ASSERT_EQUALS(7U, tokens.size());
+    ASSERT_EQUALS("kaç", tokens[0]);
+    ASSERT_EQUALS("yaş", tokens[1]);
+    ASSERT_EQUALS("sen", tokens[2]);
+    ASSERT_EQUALS("ve", tokens[3]);
+    ASSERT_EQUALS("sen", tokens[4]);
+    ASSERT_EQUALS("nere", tokens[5]);
+    ASSERT_EQUALS("var", tokens[6]);
+}
+
+// Ensure punctuation is filtered out and that the generated tokens keep both their diacritics
+// and their case.
+TEST(FtsUnicodeTokenizer, TurkishDiacriticAndCaseSensitive) {
+    const std::vector<std::string> tokens =
+        tokenizeString("KAÇ YAŞINDASIN SEN, VE SEN NEREDEN VARDIR?",
+                       "turkish",
+                       FTSTokenizer::kGenerateDiacriticSensitiveTokens |
+                           FTSTokenizer::kGenerateCaseSensitiveTokens);
+
+    ASSERT_EQUALS(7U, tokens.size());
+    ASSERT_EQUALS("KAÇ", tokens[0]);
+    ASSERT_EQUALS("YAŞINDASIN", tokens[1]);
+    ASSERT_EQUALS("SEN", tokens[2]);
+    ASSERT_EQUALS("VE", tokens[3]);
+    ASSERT_EQUALS("SEN", tokens[4]);
+    ASSERT_EQUALS("NEREDEN", tokens[5]);
+    ASSERT_EQUALS("VARDIR", tokens[6]);
+}
+
+// Ensure punctuation and stop words are filtered out, and that the remaining tokens keep both
+// their diacritics and their case.
+TEST(FtsUnicodeTokenizer, TurkishDiacriticAndCaseSensitiveAndStopWords) {
+    const std::vector<std::string> tokens = tokenizeString(
+        "KAÇ YAŞINDASIN SEN, VE SEN NEREDEN VARDIR?",
+        "turkish",
+        FTSTokenizer::kGenerateDiacriticSensitiveTokens |
+            FTSTokenizer::kGenerateCaseSensitiveTokens | FTSTokenizer::kFilterStopWords);
+
+    ASSERT_EQUALS(4U, tokens.size());
+    ASSERT_EQUALS("KAÇ", tokens[0]);
+    ASSERT_EQUALS("YAŞINDASIN", tokens[1]);
+    ASSERT_EQUALS("NEREDEN", tokens[2]);
+    ASSERT_EQUALS("VARDIR", tokens[3]);
+}
+
+
+// Ensure a stop word is only filtered when its diacritics match: "etre" stays, "être" goes.
+TEST(FtsUnicodeTokenizer, FrenchStopWords) {
+    const std::vector<std::string> tokens =
+        tokenizeString("Je ne vais pas etre énervé. Je vais être excité.",
+                       "french",
+                       FTSTokenizer::kFilterStopWords);
+
+    ASSERT_EQUALS(5U, tokens.size());
+    ASSERT_EQUALS("vais", tokens[0]);
+    ASSERT_EQUALS("etre", tokens[1]);
+    ASSERT_EQUALS("enerv", tokens[2]);
+    ASSERT_EQUALS("vais", tokens[3]);
+    ASSERT_EQUALS("excit", tokens[4]);
+}
+
+// Ensure stop words need matching diacritics to be filtered and kept tokens retain theirs.
+TEST(FtsUnicodeTokenizer, FrenchStopWordsAndDiacriticSensitive) {
+    const std::vector<std::string> tokens = tokenizeString(
+        "Je ne vais pas etre énervé. Je vais être excité.",
+        "french",
+        FTSTokenizer::kFilterStopWords | FTSTokenizer::kGenerateDiacriticSensitiveTokens);
+
+    ASSERT_EQUALS(5U, tokens.size());
+    ASSERT_EQUALS("vais", tokens[0]);
+    ASSERT_EQUALS("etre", tokens[1]);
+    ASSERT_EQUALS("énerv", tokens[2]);
+    ASSERT_EQUALS("vais", tokens[3]);
+    ASSERT_EQUALS("excit", tokens[4]);
+}
+
+}  // namespace fts
+}  // namespace mongo
diff --git a/src/mongo/db/fts/unicode/string.cpp b/src/mongo/db/fts/unicode/string.cpp
index 24c6ff8027e..9f29749edbc 100644
--- a/src/mongo/db/fts/unicode/string.cpp
+++ b/src/mongo/db/fts/unicode/string.cpp
@@ -42,6 +42,14 @@ using linenoise_utf8::copyString8to32;
 using std::u32string;
 
 String::String(const StringData utf8_src) {
+    setData(utf8_src);  // Convert the UTF-8 input into the UTF-32 _data buffer.
+}
+
+void String::resetData(const StringData utf8_src) {
+    setData(utf8_src);  // Redo the conversion, overwriting the current contents of _data.
+}
+
+void String::setData(const StringData utf8_src) {
     // _data is the target, resize it so that it's guaranteed to fit all of the input characters,
     // plus a null character if there isn't one.
     _data.resize(utf8_src.size() + 1);
diff --git a/src/mongo/db/fts/unicode/string.h b/src/mongo/db/fts/unicode/string.h
index 1fa77af2f3f..ddfa6f93870 100644
--- a/src/mongo/db/fts/unicode/string.h
+++ b/src/mongo/db/fts/unicode/string.h
@@ -62,6 +62,11 @@ public:
     explicit String(StringData utf8_src);
 
     /**
+     * Reset the String with the new UTF-8 source data, reusing the underlying buffer when possible.
+     */
+    void resetData(const StringData utf8_src);
+
+    /**
      * Return a lowercased version of the String instance using the Unicode data in u_data.h.
      */
     String toLower(CaseFoldMode mode = CaseFoldMode::kNormal) const;
@@ -130,6 +135,11 @@ private:
     String(std::u32string&& src);
 
     /**
+     * Helper method for converting a UTF-8 string to a UTF-32 string.
+     */
+    void setData(const StringData utf8_src);
+
+    /**
      * The underlying UTF-32 data.
      */
     std::u32string _data;
author	Adam Chelminski <adam.chelminski@mongodb.com>	2015-07-24 18:15:20 -0400
committer	Adam Chelminski <adam.chelminski@mongodb.com>	2015-08-07 17:12:21 -0400
commit	5c053bde2267fadd42fdf71ceea047cbc1480d6d (patch)
tree	4ed81b79cb60ef34769401c639caf5514f7735dc /src/mongo/db/fts
parent	326aa0029a29f06772665750400473db69945234 (diff)
download	mongo-5c053bde2267fadd42fdf71ceea047cbc1480d6d.tar.gz