summaryrefslogtreecommitdiff
path: root/src/mongo/db/fts
diff options
context:
space:
mode:
authorAdam Chelminski <adam.chelminski@mongodb.com>2015-07-24 18:15:20 -0400
committerAdam Chelminski <adam.chelminski@mongodb.com>2015-08-07 17:12:21 -0400
commit5c053bde2267fadd42fdf71ceea047cbc1480d6d (patch)
tree4ed81b79cb60ef34769401c639caf5514f7735dc /src/mongo/db/fts
parent326aa0029a29f06772665750400473db69945234 (diff)
downloadmongo-5c053bde2267fadd42fdf71ceea047cbc1480d6d.tar.gz
SERVER-19639 Add Unicode phrase matcher and tokenizer for FTS
Diffstat (limited to 'src/mongo/db/fts')
-rw-r--r--src/mongo/db/fts/SConscript26
-rw-r--r--src/mongo/db/fts/fts_basic_phrase_matcher.cpp2
-rw-r--r--src/mongo/db/fts/fts_basic_phrase_matcher.h5
-rw-r--r--src/mongo/db/fts/fts_basic_phrase_matcher_test.cpp4
-rw-r--r--src/mongo/db/fts/fts_basic_tokenizer.cpp4
-rw-r--r--src/mongo/db/fts/fts_basic_tokenizer.h11
-rw-r--r--src/mongo/db/fts/fts_basic_tokenizer_test.cpp2
-rw-r--r--src/mongo/db/fts/fts_matcher.cpp8
-rw-r--r--src/mongo/db/fts/fts_phrase_matcher.h9
-rw-r--r--src/mongo/db/fts/fts_query.cpp5
-rw-r--r--src/mongo/db/fts/fts_spec.cpp2
-rw-r--r--src/mongo/db/fts/fts_tokenizer.h41
-rw-r--r--src/mongo/db/fts/fts_unicode_phrase_matcher.cpp65
-rw-r--r--src/mongo/db/fts/fts_unicode_phrase_matcher.h64
-rw-r--r--src/mongo/db/fts/fts_unicode_phrase_matcher_test.cpp137
-rw-r--r--src/mongo/db/fts/fts_unicode_tokenizer.cpp125
-rw-r--r--src/mongo/db/fts/fts_unicode_tokenizer.h92
-rw-r--r--src/mongo/db/fts/fts_unicode_tokenizer_test.cpp244
-rw-r--r--src/mongo/db/fts/unicode/string.cpp8
-rw-r--r--src/mongo/db/fts/unicode/string.h10
20 files changed, 816 insertions, 48 deletions
diff --git a/src/mongo/db/fts/SConscript b/src/mongo/db/fts/SConscript
index 3a769baca98..3a98b50f8cc 100644
--- a/src/mongo/db/fts/SConscript
+++ b/src/mongo/db/fts/SConscript
@@ -42,6 +42,8 @@ baseEnv.Library('base', [
'fts_language.cpp',
'fts_basic_phrase_matcher.cpp',
'fts_basic_tokenizer.cpp',
+ 'fts_unicode_phrase_matcher.cpp',
+ 'fts_unicode_tokenizer.cpp',
'fts_util.cpp',
'fts_element_iterator.cpp',
'stemmer.cpp',
@@ -72,19 +74,22 @@ env.Library('ftsmongos', [
], LIBDEPS=["server_common"])
-env.CppUnitTest( "fts_index_format_test", "fts_index_format_test.cpp",
+env.CppUnitTest( "fts_basic_phrase_matcher_test", "fts_basic_phrase_matcher_test.cpp",
LIBDEPS=["base"] )
-env.CppUnitTest( "fts_stop_words_test", "stop_words_test.cpp",
+env.CppUnitTest( "fts_basic_tokenizer_test", "fts_basic_tokenizer_test.cpp",
LIBDEPS=["base"] )
-env.CppUnitTest( "fts_stemmer_test", "stemmer_test.cpp",
+env.CppUnitTest( "fts_element_iterator_test", "fts_element_iterator_test.cpp",
LIBDEPS=["base"] )
-env.CppUnitTest( "fts_basic_tokenizer_test", "fts_basic_tokenizer_test.cpp",
+env.CppUnitTest( "fts_index_format_test", "fts_index_format_test.cpp",
LIBDEPS=["base"] )
-env.CppUnitTest( "fts_tokenizer_test", "tokenizer_test.cpp",
+env.CppUnitTest( "fts_language_test", "fts_language_test.cpp",
+ LIBDEPS=["base"] )
+
+env.CppUnitTest( "fts_matcher_test", "fts_matcher_test.cpp",
LIBDEPS=["base"] )
env.CppUnitTest( "fts_query_test", "fts_query_test.cpp",
@@ -93,14 +98,17 @@ env.CppUnitTest( "fts_query_test", "fts_query_test.cpp",
env.CppUnitTest( "fts_spec_test", "fts_spec_test.cpp",
LIBDEPS=["base"] )
-env.CppUnitTest( "fts_language_test", "fts_language_test.cpp",
+env.CppUnitTest( "fts_stemmer_test", "stemmer_test.cpp",
LIBDEPS=["base"] )
-env.CppUnitTest( "fts_matcher_test", "fts_matcher_test.cpp",
+env.CppUnitTest( "fts_stop_words_test", "stop_words_test.cpp",
LIBDEPS=["base"] )
-env.CppUnitTest( "fts_element_iterator_test", "fts_element_iterator_test.cpp",
+env.CppUnitTest( "fts_tokenizer_test", "tokenizer_test.cpp",
LIBDEPS=["base"] )
-env.CppUnitTest( "fts_basic_phrase_matcher_test", "fts_basic_phrase_matcher_test.cpp",
+env.CppUnitTest( "fts_unicode_phrase_matcher_test", "fts_unicode_phrase_matcher_test.cpp",
+ LIBDEPS=["base"] )
+
+env.CppUnitTest( "fts_unicode_tokenizer_test", "fts_unicode_tokenizer_test.cpp",
LIBDEPS=["base"] )
diff --git a/src/mongo/db/fts/fts_basic_phrase_matcher.cpp b/src/mongo/db/fts/fts_basic_phrase_matcher.cpp
index e32174d3b15..da6acc0bbab 100644
--- a/src/mongo/db/fts/fts_basic_phrase_matcher.cpp
+++ b/src/mongo/db/fts/fts_basic_phrase_matcher.cpp
@@ -37,7 +37,7 @@ using std::string;
bool BasicFTSPhraseMatcher::phraseMatches(const string& phrase,
const string& haystack,
- PhraseMatcherOptions options) const {
+ Options options) const {
if (options & kCaseSensitive) {
return haystack.find(phrase) != string::npos;
}
diff --git a/src/mongo/db/fts/fts_basic_phrase_matcher.h b/src/mongo/db/fts/fts_basic_phrase_matcher.h
index c595b9c3bf9..5fd07d28cee 100644
--- a/src/mongo/db/fts/fts_basic_phrase_matcher.h
+++ b/src/mongo/db/fts/fts_basic_phrase_matcher.h
@@ -36,7 +36,8 @@ namespace fts {
/**
* A phrase matcher that looks for exact substring matches with optional ASCII-aware case
- * insensitivity.
+ * insensitivity. This phrase matcher does not implement the kDiacriticSensitive match option. All
+ * operations are inherently diacritic sensitive.
*/
class BasicFTSPhraseMatcher final : public FTSPhraseMatcher {
MONGO_DISALLOW_COPYING(BasicFTSPhraseMatcher);
@@ -46,7 +47,7 @@ public:
bool phraseMatches(const std::string& phrase,
const std::string& haystack,
- PhraseMatcherOptions options) const final;
+ Options options) const override;
};
} // namespace fts
diff --git a/src/mongo/db/fts/fts_basic_phrase_matcher_test.cpp b/src/mongo/db/fts/fts_basic_phrase_matcher_test.cpp
index 9f193e836ad..14ca5b9d95a 100644
--- a/src/mongo/db/fts/fts_basic_phrase_matcher_test.cpp
+++ b/src/mongo/db/fts/fts_basic_phrase_matcher_test.cpp
@@ -44,7 +44,7 @@ TEST(FtsBasicPhraseMatcher, CaseInsensitive) {
std::string nofind2 = "dolor velit";
BasicFTSPhraseMatcher phraseMatcher;
- FTSPhraseMatcher::PhraseMatcherOptions options = FTSPhraseMatcher::kNone;
+ FTSPhraseMatcher::Options options = FTSPhraseMatcher::kNone;
ASSERT(phraseMatcher.phraseMatches(find1, str1, options));
ASSERT(phraseMatcher.phraseMatches(find2, str2, options));
@@ -64,7 +64,7 @@ TEST(FtsBasicPhraseMatcher, CaseSensitive) {
std::string nofind2 = "Irure dolor";
BasicFTSPhraseMatcher phraseMatcher;
- FTSPhraseMatcher::PhraseMatcherOptions options = FTSPhraseMatcher::kCaseSensitive;
+ FTSPhraseMatcher::Options options = FTSPhraseMatcher::kCaseSensitive;
ASSERT(phraseMatcher.phraseMatches(find1, str1, options));
ASSERT(phraseMatcher.phraseMatches(find2, str2, options));
diff --git a/src/mongo/db/fts/fts_basic_tokenizer.cpp b/src/mongo/db/fts/fts_basic_tokenizer.cpp
index 9fc41923d40..a053d21140a 100644
--- a/src/mongo/db/fts/fts_basic_tokenizer.cpp
+++ b/src/mongo/db/fts/fts_basic_tokenizer.cpp
@@ -74,11 +74,11 @@ bool BasicFTSTokenizer::moveNext() {
// Stop words are case-sensitive so we need them to be lower cased to check
// against the stop word list
- if ((_options & FTSTokenizer::FilterStopWords) && _stopWords->isStopWord(word)) {
+ if ((_options & FTSTokenizer::kFilterStopWords) && _stopWords->isStopWord(word)) {
continue;
}
- if (_options & FTSTokenizer::GenerateCaseSensitiveTokens) {
+ if (_options & FTSTokenizer::kGenerateCaseSensitiveTokens) {
word = token.data.toString();
}
diff --git a/src/mongo/db/fts/fts_basic_tokenizer.h b/src/mongo/db/fts/fts_basic_tokenizer.h
index 221de72bb8c..1206f494e57 100644
--- a/src/mongo/db/fts/fts_basic_tokenizer.h
+++ b/src/mongo/db/fts/fts_basic_tokenizer.h
@@ -52,18 +52,21 @@ class StopWords;
*
* For each word returns a stem version of a word optimized for full text indexing.
* Optionally supports returning case sensitive search terms.
+ *
+ * BasicFTSTokenizer does not implement the kGenerateDiacriticSensitiveTokens option. All tokens
+ * generated by the BasicFTSTokenizer are inherently diacritic sensitive.
*/
-class BasicFTSTokenizer : public FTSTokenizer {
+class BasicFTSTokenizer final : public FTSTokenizer {
MONGO_DISALLOW_COPYING(BasicFTSTokenizer);
public:
BasicFTSTokenizer(const FTSLanguage* language);
- void reset(StringData document, Options options) final;
+ void reset(StringData document, Options options) override;
- bool moveNext() final;
+ bool moveNext() override;
- StringData get() const final;
+ StringData get() const override;
private:
const FTSLanguage* const _language;
diff --git a/src/mongo/db/fts/fts_basic_tokenizer_test.cpp b/src/mongo/db/fts/fts_basic_tokenizer_test.cpp
index 5feab67face..0359da805d3 100644
--- a/src/mongo/db/fts/fts_basic_tokenizer_test.cpp
+++ b/src/mongo/db/fts/fts_basic_tokenizer_test.cpp
@@ -39,7 +39,7 @@ std::vector<std::string> tokenizeString(const char* str, const char* language) {
std::unique_ptr<FTSTokenizer> tokenizer(swl.getValue()->createTokenizer());
- tokenizer->reset(str, FTSTokenizer::None);
+ tokenizer->reset(str, FTSTokenizer::kNone);
std::vector<std::string> terms;
diff --git a/src/mongo/db/fts/fts_matcher.cpp b/src/mongo/db/fts/fts_matcher.cpp
index 52c67e337e9..a4b2a6e4638 100644
--- a/src/mongo/db/fts/fts_matcher.cpp
+++ b/src/mongo/db/fts/fts_matcher.cpp
@@ -82,8 +82,8 @@ bool FTSMatcher::_hasPositiveTerm_string(const FTSLanguage* language, const stri
std::unique_ptr<FTSTokenizer> tokenizer(language->createTokenizer());
tokenizer->reset(raw.c_str(),
- _query.getCaseSensitive() ? FTSTokenizer::GenerateCaseSensitiveTokens
- : FTSTokenizer::None);
+ _query.getCaseSensitive() ? FTSTokenizer::kGenerateCaseSensitiveTokens
+ : FTSTokenizer::kNone);
while (tokenizer->moveNext()) {
string word = tokenizer->get().toString();
@@ -115,8 +115,8 @@ bool FTSMatcher::_hasNegativeTerm_string(const FTSLanguage* language, const stri
std::unique_ptr<FTSTokenizer> tokenizer(language->createTokenizer());
tokenizer->reset(raw.c_str(),
- _query.getCaseSensitive() ? FTSTokenizer::GenerateCaseSensitiveTokens
- : FTSTokenizer::None);
+ _query.getCaseSensitive() ? FTSTokenizer::kGenerateCaseSensitiveTokens
+ : FTSTokenizer::kNone);
while (tokenizer->moveNext()) {
string word = tokenizer->get().toString();
diff --git a/src/mongo/db/fts/fts_phrase_matcher.h b/src/mongo/db/fts/fts_phrase_matcher.h
index ae7b8c8a9f9..7386852ebff 100644
--- a/src/mongo/db/fts/fts_phrase_matcher.h
+++ b/src/mongo/db/fts/fts_phrase_matcher.h
@@ -41,7 +41,7 @@ class FTSPhraseMatcher {
public:
virtual ~FTSPhraseMatcher() = default;
- using PhraseMatcherOptions = uint8_t;
+ using Options = uint8_t;
/**
* Use no options.
@@ -54,11 +54,16 @@ public:
static const int kCaseSensitive = 1 << 0;
/**
+ * Do not remove diacritics; treat them as significant during phrase matching.
+ */
+ static const int kDiacriticSensitive = 1 << 1;
+
+ /**
* Does the string 'phrase' occur in the string 'haystack'?
*/
virtual bool phraseMatches(const std::string& phrase,
const std::string& haystack,
- PhraseMatcherOptions options) const = 0;
+ Options options) const = 0;
};
} // namespace fts
diff --git a/src/mongo/db/fts/fts_query.cpp b/src/mongo/db/fts/fts_query.cpp
index 4fa332266a0..9fbf0e04978 100644
--- a/src/mongo/db/fts/fts_query.cpp
+++ b/src/mongo/db/fts/fts_query.cpp
@@ -133,7 +133,7 @@ Status FTSQuery::parse(const string& query,
}
void FTSQuery::_addTerms(FTSTokenizer* tokenizer, const string& sentence, bool negated) {
- tokenizer->reset(sentence.c_str(), FTSTokenizer::FilterStopWords);
+ tokenizer->reset(sentence.c_str(), FTSTokenizer::kFilterStopWords);
auto& activeTerms = negated ? _negatedTerms : _positiveTerms;
@@ -160,8 +160,7 @@ void FTSQuery::_addTerms(FTSTokenizer* tokenizer, const string& sentence, bool n
}
tokenizer->reset(sentence.c_str(),
- static_cast<FTSTokenizer::Options>(FTSTokenizer::FilterStopWords |
- FTSTokenizer::GenerateCaseSensitiveTokens));
+ FTSTokenizer::kFilterStopWords | FTSTokenizer::kGenerateCaseSensitiveTokens);
// If we want case-sensitivity, get the case-sensitive token
while (tokenizer->moveNext()) {
diff --git a/src/mongo/db/fts/fts_spec.cpp b/src/mongo/db/fts/fts_spec.cpp
index eb7e018b522..1ec72152351 100644
--- a/src/mongo/db/fts/fts_spec.cpp
+++ b/src/mongo/db/fts/fts_spec.cpp
@@ -182,7 +182,7 @@ void FTSSpec::_scoreStringV2(FTSTokenizer* tokenizer,
unsigned numTokens = 0;
- tokenizer->reset(raw.rawData(), FTSTokenizer::FilterStopWords);
+ tokenizer->reset(raw.rawData(), FTSTokenizer::kFilterStopWords);
while (tokenizer->moveNext()) {
string term = tokenizer->get().toString();
diff --git a/src/mongo/db/fts/fts_tokenizer.h b/src/mongo/db/fts/fts_tokenizer.h
index 4c99506eae2..f6c71cd6c20 100644
--- a/src/mongo/db/fts/fts_tokenizer.h
+++ b/src/mongo/db/fts/fts_tokenizer.h
@@ -29,6 +29,8 @@
#pragma once
+#include <cstdint>
+
#include "mongo/base/disallow_copying.h"
#include "mongo/base/string_data.h"
@@ -49,24 +51,29 @@ public:
virtual ~FTSTokenizer() = default;
/**
- * Options for generating tokens
+ * Options for generating tokens.
+ */
+ using Options = uint8_t;
+
+ /**
+ * Default means lower cased, diacritics removed, and stop words are not filtered.
+ */
+ static const Options kNone = 0;
+
+ /**
+ * Do not lower case terms.
+ */
+ static const Options kGenerateCaseSensitiveTokens = 1 << 0;
+
+ /**
+ * Filter out stop words from return tokens.
+ */
+ static const Options kFilterStopWords = 1 << 1;
+
+ /**
+ * Do not remove diacritics from terms.
*/
- enum Options {
- /**
- * Default means lower cased, and stop words are not filtered.
- */
- None = 0,
-
- /**
- * Do not lower case terms.
- */
- GenerateCaseSensitiveTokens = 1 << 0,
-
- /**
- * Filter out stop words from return tokens.
- */
- FilterStopWords = 1 << 1,
- };
+ static const Options kGenerateDiacriticSensitiveTokens = 1 << 2;
/**
* Process a new document, and discards any previous results.
diff --git a/src/mongo/db/fts/fts_unicode_phrase_matcher.cpp b/src/mongo/db/fts/fts_unicode_phrase_matcher.cpp
new file mode 100644
index 00000000000..c467b474be1
--- /dev/null
+++ b/src/mongo/db/fts/fts_unicode_phrase_matcher.cpp
@@ -0,0 +1,65 @@
+/**
+ * Copyright (C) 2015 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/db/fts/fts_unicode_phrase_matcher.h"
+
+#include "mongo/db/fts/fts_language.h"
+#include "mongo/db/fts/unicode/string.h"
+
+namespace mongo {
+namespace fts {
+
+using std::string;
+
+UnicodeFTSPhraseMatcher::UnicodeFTSPhraseMatcher(const string& language) {  // Picks the case-fold mode from the language name.
+ if (language == "turkish") {  // exact string comparison; only "turkish" selects the special mode
+ _caseFoldMode = unicode::CaseFoldMode::kTurkish;
+ } else {
+ _caseFoldMode = unicode::CaseFoldMode::kNormal;  // every other language uses normal case folding
+ }
+}
+
+bool UnicodeFTSPhraseMatcher::phraseMatches(const string& phrase,
+ const string& haystack,
+ Options options) const {  // Returns whether 'phrase' occurs in 'haystack' under 'options'.
+ unicode::String::SubstrMatchOptions matchOptions = unicode::String::kNone;  // start with no sensitivity flags
+
+ if (options & kCaseSensitive) {  // translate FTSPhraseMatcher flags into unicode::String flags
+ matchOptions |= unicode::String::kCaseSensitive;
+ }
+
+ if (options & kDiacriticSensitive) {
+ matchOptions |= unicode::String::kDiacriticSensitive;
+ }
+
+ return unicode::String::substrMatch(
+ unicode::String(haystack), unicode::String(phrase), matchOptions, _caseFoldMode);  // argument order: haystack first, then phrase
+}
+
+} // namespace fts
+} // namespace mongo
diff --git a/src/mongo/db/fts/fts_unicode_phrase_matcher.h b/src/mongo/db/fts/fts_unicode_phrase_matcher.h
new file mode 100644
index 00000000000..b584a7c7185
--- /dev/null
+++ b/src/mongo/db/fts/fts_unicode_phrase_matcher.h
@@ -0,0 +1,64 @@
+/**
+ * Copyright (C) 2015 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#pragma once
+
+#include "mongo/base/disallow_copying.h"
+#include "mongo/db/fts/fts_phrase_matcher.h"
+#include "mongo/db/fts/unicode/codepoints.h"
+
+namespace mongo {
+namespace fts {
+
+class FTSLanguage;
+
+/**
+ * UnicodeFTSPhraseMatcher
+ *
+ * A phrase matcher that looks for exact substring matches that ignore diacritics, and with UTF-8
+ * aware case folding if the phrase match is not specified as case sensitive. Optionally, the phrase
+ * matching can be diacritic sensitive if kDiacriticSensitive is passed in the match options.
+ * Additionally, if the language string passed to the phrase matcher's constructor is "turkish"
+ * (uses the special I case fold mapping), the phrase matcher will take that into account.
+ */
+class UnicodeFTSPhraseMatcher final : public FTSPhraseMatcher {
+ MONGO_DISALLOW_COPYING(UnicodeFTSPhraseMatcher);
+
+public:
+ UnicodeFTSPhraseMatcher(const std::string& language);  // 'language' selects the case-fold mode ("turkish" is special-cased)
+
+ bool phraseMatches(const std::string& phrase,
+ const std::string& haystack,
+ Options options) const override;  // 'options' is a bitmask of kCaseSensitive / kDiacriticSensitive
+
+private:
+ unicode::CaseFoldMode _caseFoldMode;  // assigned in the constructor; forwarded to the substring match
+};
+
+} // namespace fts
+} // namespace mongo
diff --git a/src/mongo/db/fts/fts_unicode_phrase_matcher_test.cpp b/src/mongo/db/fts/fts_unicode_phrase_matcher_test.cpp
new file mode 100644
index 00000000000..9fa63a61f14
--- /dev/null
+++ b/src/mongo/db/fts/fts_unicode_phrase_matcher_test.cpp
@@ -0,0 +1,137 @@
+/**
+ * Copyright (C) 2015 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/db/fts/fts_unicode_phrase_matcher.h"
+#include "mongo/unittest/unittest.h"
+
+namespace mongo {
+namespace fts {
+
+// Case insensitive & diacritic insensitive match.
+TEST(FtsUnicodePhraseMatcher, CaseAndDiacriticInsensitive) {
+ std::string str =
+ "El pingüino Wenceslao hizo kilómetros bajo exhaustiva lluvia y frío, añoraba";
+ std::string find1 = "pinguino wenceslao";  // lower-cased, unaccented form of "pingüino Wenceslao"
+ std::string find2 = "frio, anoraba";  // unaccented form of "frío, añoraba"
+
+ std::string nofind1 = "bajo lluvia";  // both words occur, but "exhaustiva" sits between them
+ std::string nofind2 = "El Wenceslao";  // both words occur, but "pingüino" sits between them
+
+ UnicodeFTSPhraseMatcher phraseMatcher("spanish");
+ FTSPhraseMatcher::Options options = FTSPhraseMatcher::kNone;  // no sensitivity flags set
+
+ ASSERT(phraseMatcher.phraseMatches(find1, str, options));
+ ASSERT(phraseMatcher.phraseMatches(find2, str, options));
+
+ ASSERT_FALSE(phraseMatcher.phraseMatches(nofind1, str, options));
+ ASSERT_FALSE(phraseMatcher.phraseMatches(nofind2, str, options));
+}
+
+// Case sensitive & diacritic insensitive match.
+TEST(FtsUnicodePhraseMatcher, CaseSensitiveAndDiacriticInsensitive) {
+ std::string str =
+ "El pingüino Wenceslao hizo kilómetros bajo exhaustiva lluvia y frío, añoraba";
+ std::string find1 = "pinguino Wenceslao";  // casing matches the text; diaeresis omitted
+ std::string find2 = "El pinguino";  // casing matches the text; diaeresis omitted
+
+ std::string nofind1 = "pinguino wenceslao";  // "wenceslao" differs in case from "Wenceslao"
+ std::string nofind2 = "el pinguino";  // "el" differs in case from "El"
+
+ UnicodeFTSPhraseMatcher phraseMatcher("spanish");
+ FTSPhraseMatcher::Options options = FTSPhraseMatcher::kCaseSensitive;
+
+ ASSERT(phraseMatcher.phraseMatches(find1, str, options));
+ ASSERT(phraseMatcher.phraseMatches(find2, str, options));
+
+ ASSERT_FALSE(phraseMatcher.phraseMatches(nofind1, str, options));
+ ASSERT_FALSE(phraseMatcher.phraseMatches(nofind2, str, options));
+}
+
+// Case insensitive & diacritic sensitive match.
+TEST(FtsUnicodePhraseMatcher, CaseInsensitiveAndDiacriticSensitive) {
+ std::string str =
+ "El pingüino Wenceslao hizo kilómetros bajo exhaustiva lluvia y frío, añoraba";
+ std::string find1 = "HIZO KILÓMETROS";  // upper-cased, but the accent is kept
+ std::string find2 = "el pingüino";  // "el" differs in case only; diaeresis is kept
+
+ std::string nofind1 = "hizo kilometros";  // missing the accent of "kilómetros"
+ std::string nofind2 = "pinguino";  // missing the diaeresis of "pingüino"
+
+ UnicodeFTSPhraseMatcher phraseMatcher("spanish");
+ FTSPhraseMatcher::Options options = FTSPhraseMatcher::kDiacriticSensitive;
+
+ ASSERT(phraseMatcher.phraseMatches(find1, str, options));
+ ASSERT(phraseMatcher.phraseMatches(find2, str, options));
+
+ ASSERT_FALSE(phraseMatcher.phraseMatches(nofind1, str, options));
+ ASSERT_FALSE(phraseMatcher.phraseMatches(nofind2, str, options));
+}
+
+// Case sensitive & diacritic sensitive match.
+TEST(FtsUnicodePhraseMatcher, CaseAndDiacriticSensitive) {
+ std::string str =
+ "El pingüino Wenceslao hizo kilómetros bajo exhaustiva lluvia y frío, añoraba";
+ std::string find1 = "pingüino Wenceslao";  // exact substring of the text
+ std::string find2 = "kilómetros bajo";  // exact substring of the text
+
+ std::string nofind1 = "pinguino Wenceslao";  // missing the diaeresis of "pingüino"
+ std::string nofind2 = "kilómetros BaJo";  // "BaJo" differs in case from "bajo"
+
+ UnicodeFTSPhraseMatcher phraseMatcher("spanish");
+ FTSPhraseMatcher::Options options =
+ FTSPhraseMatcher::kCaseSensitive | FTSPhraseMatcher::kDiacriticSensitive;  // both flags combined
+
+ ASSERT(phraseMatcher.phraseMatches(find1, str, options));
+ ASSERT(phraseMatcher.phraseMatches(find2, str, options));
+
+ ASSERT_FALSE(phraseMatcher.phraseMatches(nofind1, str, options));
+ ASSERT_FALSE(phraseMatcher.phraseMatches(nofind2, str, options));
+}
+
+// Case insensitive & diacritic insensitive match using Turkish case folding.
+TEST(FtsUnicodePhraseMatcher, CaseAndDiacriticInsensitiveTurkish) {
+ std::string str = "Pijamalı hasta yağız şoföre çabucak güvendi.";
+ std::string find1 = "PİJAMALI hasta";  // expected to match "Pijamalı": dotted "İ" pairs with "i", "I" with dotless "ı"
+ std::string find2 = "YAGIZ sofore";  // expected to match "yağız şoföre": "I" pairs with dotless "ı", other marks dropped
+
+ std::string nofind1 = "çabucak GÜVENDI";  // ends in "I", while the text has dotted "i" in "güvendi"
+ std::string nofind2 = "yagiz sofore";  // dotted "i", while the text has dotless "ı" in "yağız"
+
+ UnicodeFTSPhraseMatcher phraseMatcher("turkish");  // "turkish" selects CaseFoldMode::kTurkish
+ FTSPhraseMatcher::Options options = FTSPhraseMatcher::kNone;
+
+ ASSERT(phraseMatcher.phraseMatches(find1, str, options));
+ ASSERT(phraseMatcher.phraseMatches(find2, str, options));
+
+ ASSERT_FALSE(phraseMatcher.phraseMatches(nofind1, str, options));
+ ASSERT_FALSE(phraseMatcher.phraseMatches(nofind2, str, options));
+}
+
+
+} // namespace fts
+} // namespace mongo
diff --git a/src/mongo/db/fts/fts_unicode_tokenizer.cpp b/src/mongo/db/fts/fts_unicode_tokenizer.cpp
new file mode 100644
index 00000000000..8cdce180dea
--- /dev/null
+++ b/src/mongo/db/fts/fts_unicode_tokenizer.cpp
@@ -0,0 +1,125 @@
+/**
+ * Copyright (C) 2015 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/platform/basic.h"
+
+#include "mongo/db/fts/fts_unicode_tokenizer.h"
+
+#include "mongo/db/fts/fts_query.h"
+#include "mongo/db/fts/fts_spec.h"
+#include "mongo/db/fts/stemmer.h"
+#include "mongo/db/fts/stop_words.h"
+#include "mongo/db/fts/tokenizer.h"
+#include "mongo/stdx/memory.h"
+#include "mongo/util/mongoutils/str.h"
+#include "mongo/util/stringutils.h"
+
+namespace mongo {
+namespace fts {
+
+using std::string;
+
+UnicodeFTSTokenizer::UnicodeFTSTokenizer(const FTSLanguage* language)
+ : _language(language), _stemmer(language), _stopWords(StopWords::getStopWords(language)) {  // stemmer and stop-word list are looked up once per tokenizer
+ if (_language->str() == "english") {  // English gets its own delimiter list; every other language shares one
+ _delimListLanguage = unicode::DelimiterListLanguage::kEnglish;
+ } else {
+ _delimListLanguage = unicode::DelimiterListLanguage::kNotEnglish;
+ }
+
+ if (_language->str() == "turkish") {  // Turkish needs its own case-fold mode
+ _caseFoldMode = unicode::CaseFoldMode::kTurkish;
+ } else {
+ _caseFoldMode = unicode::CaseFoldMode::kNormal;
+ }
+}
+
+void UnicodeFTSTokenizer::reset(StringData document, Options options) {  // Begins tokenizing 'document' from its start with 'options'.
+ _options = options;
+ _pos = 0;
+ _document = unicode::String(document);
+
+ // Skip any leading delimiters (and handle the case where the document is entirely delimiters).
+ _skipDelimiters();  // afterwards _pos is on the first non-delimiter, or at _document.size()
+}
+
+bool UnicodeFTSTokenizer::moveNext() {  // Advances to the next token; returns false once the document is exhausted.
+ while (true) {
+ if (_pos >= _document.size()) {
+ _stem = "";  // get() returns an empty string from here on
+ return false;
+ }
+
+ // Traverse through non-delimiters and build the next token.
+ size_t start = _pos++;  // _pos is on a non-delimiter here (delimiters were already skipped), so consume it
+ while (_pos < _document.size() &&
+ (!unicode::codepointIsDelimiter(_document[_pos], _delimListLanguage))) {
+ ++_pos;
+ }
+ unicode::String token = _document.substr(start, _pos - start);
+
+ // Skip the delimiters before the next token.
+ _skipDelimiters();
+
+ // Stop words are case-sensitive and diacritic sensitive, so we need them to be lower cased
+ // but with diacritics not removed to check against the stop word list.
+ unicode::String word = token.toLower(_caseFoldMode);
+
+ if ((_options & kFilterStopWords) && _stopWords->isStopWord(word.toString())) {
+ continue;  // stop word filtered out: move on to the next token
+ }
+
+ if (_options & kGenerateCaseSensitiveTokens) {
+ word = token;  // keep the original casing instead of the lower-cased form
+ }
+
+ // The stemmer is diacritic sensitive, so stem the word before removing diacritics.
+ _stem = _stemmer.stem(word.toString());
+
+ if (!(_options & kGenerateDiacriticSensitiveTokens)) {
+ token.resetData(_stem);  // reuses 'token' as scratch; presumably replaces its contents with the stem - TODO confirm
+ _stem = token.removeDiacritics().toString();
+ }
+
+ return true;
+ }
+}
+
+StringData UnicodeFTSTokenizer::get() const {  // Returns the token stored by the most recent moveNext().
+ return _stem;
+}
+
+void UnicodeFTSTokenizer::_skipDelimiters() {  // Advances _pos to the next non-delimiter, or to the end of the document.
+ while (_pos < _document.size() &&
+ unicode::codepointIsDelimiter(_document[_pos], _delimListLanguage)) {
+ ++_pos;
+ }
+}
+
+} // namespace fts
+} // namespace mongo
diff --git a/src/mongo/db/fts/fts_unicode_tokenizer.h b/src/mongo/db/fts/fts_unicode_tokenizer.h
new file mode 100644
index 00000000000..0312ffc300b
--- /dev/null
+++ b/src/mongo/db/fts/fts_unicode_tokenizer.h
@@ -0,0 +1,92 @@
+/**
+ * Copyright (C) 2015 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#pragma once
+
+#include "mongo/base/disallow_copying.h"
+#include "mongo/base/string_data.h"
+#include "mongo/db/fts/fts_tokenizer.h"
+#include "mongo/db/fts/stemmer.h"
+#include "mongo/db/fts/tokenizer.h"
+#include "mongo/db/fts/unicode/string.h"
+
+namespace mongo {
+namespace fts {
+
+class FTSLanguage;
+class StopWords;
+
+/**
+ * UnicodeFTSTokenizer
+ * An iterator of "documents" where a document contains words delimited by a predefined set of
+ * Unicode delimiters (see gen_delimiter_list.py)
+ * Uses
+ * - A list of Unicode delimiters for tokenizing words (see gen_delimiter_list.py).
+ * - tolower from mongo::unicode, which supports UTF-8 simple and Turkish case folding
+ * - Stemmer (ie, Snowball Stemmer) to stem words.
+ * - Embedded stop word lists for each language in StopWords class
+ *
+ * For each word returns a stem version of a word optimized for full text indexing.
+ * Optionally supports returning case sensitive and/or diacritic sensitive search terms.
+ */
+class UnicodeFTSTokenizer final : public FTSTokenizer {
+ MONGO_DISALLOW_COPYING(UnicodeFTSTokenizer);
+
+public:
+ UnicodeFTSTokenizer(const FTSLanguage* language);  // 'language' selects delimiter list, case-fold mode, stemmer and stop words
+
+ void reset(StringData document, Options options) override;  // start tokenizing a new document
+
+ bool moveNext() override;  // advance to the next token; false when none remain
+
+ StringData get() const override;  // token produced by the last moveNext()
+
+private:
+ /**
+ * Helper that moves the tokenizer past all delimiters that shouldn't be considered part of
+ * tokens.
+ */
+ void _skipDelimiters();
+
+ unicode::DelimiterListLanguage _delimListLanguage;  // chosen in the constructor from the language name
+ unicode::CaseFoldMode _caseFoldMode;  // chosen in the constructor from the language name
+
+ const FTSLanguage* const _language;
+ const Stemmer _stemmer;
+ const StopWords* const _stopWords;
+
+ unicode::String _document;  // document currently being tokenized; assigned by reset()
+ size_t _pos;  // index into _document of the next unread position; assigned by reset()
+
+ Options _options;  // options passed to the last reset()
+
+ std::string _stem;  // current token, returned by get()
+};
+
+} // namespace fts
+} // namespace mongo
diff --git a/src/mongo/db/fts/fts_unicode_tokenizer_test.cpp b/src/mongo/db/fts/fts_unicode_tokenizer_test.cpp
new file mode 100644
index 00000000000..e73c9599682
--- /dev/null
+++ b/src/mongo/db/fts/fts_unicode_tokenizer_test.cpp
@@ -0,0 +1,244 @@
+/**
+ * Copyright (C) 2015 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/db/fts/fts_spec.h"
+#include "mongo/db/fts/fts_tokenizer.h"
+#include "mongo/db/fts/fts_unicode_tokenizer.h"
+#include "mongo/unittest/unittest.h"
+
+namespace mongo {
+namespace fts {
+
+// Tokenizes 'str' with a UnicodeFTSTokenizer built for 'language' (text index version 2) using
+// the given 'options', and returns the tokens in the order in which the tokenizer yields them.
+std::vector<std::string> tokenizeString(const char* str,
+                                        const char* language,
+                                        FTSTokenizer::Options options) {
+    StatusWithFTSLanguage languageStatus = FTSLanguage::make(language, TEXT_INDEX_VERSION_2);
+    ASSERT_OK(languageStatus);
+
+    UnicodeFTSTokenizer unicodeTokenizer(languageStatus.getValue());
+    unicodeTokenizer.reset(str, options);
+
+    std::vector<std::string> tokens;
+    while (unicodeTokenizer.moveNext()) {
+        tokens.push_back(unicodeTokenizer.get().toString());
+    }
+
+    return tokens;
+}
+
+// Punctuation must not survive tokenization, and the possessive 's must not be emitted as a
+// token of its own.
+TEST(FtsUnicodeTokenizer, English) {
+    const std::vector<std::string> expected = {"do", "you", "see", "mark", "dog", "run"};
+
+    const std::vector<std::string> actual =
+        tokenizeString("Do you see Mark's dog running?", "english", FTSTokenizer::kNone);
+
+    ASSERT_EQUALS(expected.size(), actual.size());
+    for (size_t i = 0; i < expected.size(); ++i) {
+        ASSERT_EQUALS(expected[i], actual[i]);
+    }
+}
+
+// Delimiters before the first word and after the last word must be skipped without affecting
+// the tokens that are produced for the words in between.
+TEST(FtsUnicodeTokenizer, EnglishLeadingAndTrailingDelimiters) {
+    // Expected tokens for the words between the surrounding delimiters, in document order.
+    const std::vector<std::string> expected = {"do", "you", "see", "mark", "dog", "run"};
+
+    const std::vector<std::string> actual =
+        tokenizeString(" , Do you see Mark's dog running? ", "english", FTSTokenizer::kNone);
+
+    ASSERT_EQUALS(expected.size(), actual.size());
+    for (size_t i = 0; i < expected.size(); ++i) {
+        ASSERT_EQUALS(expected[i], actual[i]);
+    }
+}
+
+// A document that consists solely of delimiters must not yield any tokens.
+TEST(FtsUnicodeTokenizer, OnlyDelimiters) {
+    const std::vector<std::string> tokens = tokenizeString(" ", "english", FTSTokenizer::kNone);
+
+    ASSERT_EQUALS(0U, tokens.size());
+}
+
+// Non-ASCII punctuation (here the guillemets) must be filtered out just like ASCII punctuation,
+// and the apostrophe must split "C'est" into the two tokens "c" and "est".
+TEST(FtsUnicodeTokenizer, FrenchAndNonAsciiPunctuation) {
+    // Expected tokens, in document order.
+    const std::vector<std::string> expected = {
+        "voi", "vous", "le", "chien", "de",
+        "mark", "cour", "c", "est", "bien",
+    };
+
+    const std::vector<std::string> actual = tokenizeString(
+        "Voyez-vous «le chien» de Mark courante? C'est bien!", "french", FTSTokenizer::kNone);
+
+    ASSERT_EQUALS(expected.size(), actual.size());
+    for (size_t i = 0; i < expected.size(); ++i) {
+        ASSERT_EQUALS(expected[i], actual[i]);
+    }
+}
+
+// Ensure stemming honors diacritics: only the accented forms are reduced to the stem "parl".
+TEST(FtsUnicodeTokenizer, FrenchDiacriticStemming) {
+    const std::vector<std::string> expected = {
+        "parlam", "parlat", "parlerent", "parl", "parl", "parl"};
+
+    const std::vector<std::string> actual =
+        tokenizeString("parlames, parlates, parlerent, parlâmes, parlâtes, parlèrent",
+                       "french",
+                       FTSTokenizer::kNone);
+
+    ASSERT_EQUALS(expected.size(), actual.size());
+    for (size_t i = 0; i < expected.size(); ++i) {
+        ASSERT_EQUALS(expected[i], actual[i]);
+    }
+}
+
+// With no options set, punctuation is filtered out and the tokens come back lowercased, stemmed
+// and without diacritics.
+TEST(FtsUnicodeTokenizer, Turkish) {
+    const std::vector<std::string> expected = {"kac", "yas", "sen", "ve", "sen", "nere", "var"};
+
+    const std::vector<std::string> actual =
+        tokenizeString("KAÇ YAŞINDASIN SEN, VE SEN NEREDEN VARDIR?",
+                       "turkish",
+                       FTSTokenizer::kNone);
+
+    ASSERT_EQUALS(expected.size(), actual.size());
+    for (size_t i = 0; i < expected.size(); ++i) {
+        ASSERT_EQUALS(expected[i], actual[i]);
+    }
+}
+
+// With kGenerateCaseSensitiveTokens, punctuation is filtered out and diacritics are removed, but
+// the tokens come back in their original case (e.g. "YASINDASIN" rather than "yas").
+TEST(FtsUnicodeTokenizer, TurkishCaseSensitive) {
+    const std::vector<std::string> expected = {
+        "KAC", "YASINDASIN", "SEN", "VE", "SEN", "NEREDEN", "VARDIR"};
+
+    const std::vector<std::string> actual =
+        tokenizeString("KAÇ YAŞINDASIN SEN, VE SEN NEREDEN VARDIR?",
+                       "turkish",
+                       FTSTokenizer::kGenerateCaseSensitiveTokens);
+
+    ASSERT_EQUALS(expected.size(), actual.size());
+    for (size_t i = 0; i < expected.size(); ++i) {
+        ASSERT_EQUALS(expected[i], actual[i]);
+    }
+}
+
+// With kGenerateDiacriticSensitiveTokens alone, punctuation is still filtered out and the
+// tokens are still lowercased, but they keep their diacritics ("kaç" and "yaş" rather than
+// "kac" and "yas").
+TEST(FtsUnicodeTokenizer, TurkishDiacriticSensitive) {
+    // Expected tokens, in document order.
+    const std::vector<std::string> expected = {"kaç", "yaş", "sen", "ve", "sen", "nere", "var"};
+
+    const std::vector<std::string> actual =
+        tokenizeString("KAÇ YAŞINDASIN SEN, VE SEN NEREDEN VARDIR?",
+                       "turkish",
+                       FTSTokenizer::kGenerateDiacriticSensitiveTokens);
+
+    ASSERT_EQUALS(expected.size(), actual.size());
+    for (size_t i = 0; i < expected.size(); ++i) {
+        ASSERT_EQUALS(expected[i], actual[i]);
+    }
+}
+
+// With both kGenerateDiacriticSensitiveTokens and kGenerateCaseSensitiveTokens set, punctuation
+// is still filtered out, but the tokens keep both their diacritics and their original case
+// (e.g. "KAÇ" and "YAŞINDASIN").
+TEST(FtsUnicodeTokenizer, TurkishDiacriticAndCaseSensitive) {
+    const std::vector<std::string> expected = {
+        "KAÇ", "YAŞINDASIN", "SEN", "VE", "SEN", "NEREDEN", "VARDIR"};
+
+    const std::vector<std::string> actual =
+        tokenizeString("KAÇ YAŞINDASIN SEN, VE SEN NEREDEN VARDIR?",
+                       "turkish",
+                       FTSTokenizer::kGenerateDiacriticSensitiveTokens |
+                           FTSTokenizer::kGenerateCaseSensitiveTokens);
+
+    ASSERT_EQUALS(expected.size(), actual.size());
+    for (size_t i = 0; i < expected.size(); ++i) {
+        ASSERT_EQUALS(expected[i], actual[i]);
+    }
+}
+
+// Ensure stop words are removed while the remaining tokens keep their case and diacritics.
+TEST(FtsUnicodeTokenizer, TurkishDiacriticAndCaseSensitiveAndStopWords) {
+    const std::vector<std::string> expected = {"KAÇ", "YAŞINDASIN", "NEREDEN", "VARDIR"};
+
+    const std::vector<std::string> actual = tokenizeString(
+        "KAÇ YAŞINDASIN SEN, VE SEN NEREDEN VARDIR?",
+        "turkish",
+        FTSTokenizer::kGenerateDiacriticSensitiveTokens |
+            FTSTokenizer::kGenerateCaseSensitiveTokens | FTSTokenizer::kFilterStopWords);
+
+    ASSERT_EQUALS(expected.size(), actual.size());
+    for (size_t i = 0; i < expected.size(); ++i) {
+        ASSERT_EQUALS(expected[i], actual[i]);
+    }
+}
+
+
+// Stop words are removed only when spelled with their diacritics: "être" goes, "etre" stays.
+TEST(FtsUnicodeTokenizer, FrenchStopWords) {
+    const std::vector<std::string> expected = {"vais", "etre", "enerv", "vais", "excit"};
+
+    const std::vector<std::string> actual =
+        tokenizeString("Je ne vais pas etre énervé. Je vais être excité.",
+                       "french",
+                       FTSTokenizer::kFilterStopWords);
+
+    ASSERT_EQUALS(expected.size(), actual.size());
+    for (size_t i = 0; i < expected.size(); ++i) {
+        ASSERT_EQUALS(expected[i], actual[i]);
+    }
+}
+
+// Stop words are removed only when spelled with their diacritics; kept tokens retain theirs.
+TEST(FtsUnicodeTokenizer, FrenchStopWordsAndDiacriticSensitive) {
+    const std::vector<std::string> expected = {"vais", "etre", "énerv", "vais", "excit"};
+
+    const std::vector<std::string> actual = tokenizeString(
+        "Je ne vais pas etre énervé. Je vais être excité.",
+        "french",
+        FTSTokenizer::kFilterStopWords | FTSTokenizer::kGenerateDiacriticSensitiveTokens);
+
+    ASSERT_EQUALS(expected.size(), actual.size());
+    for (size_t i = 0; i < expected.size(); ++i) {
+        ASSERT_EQUALS(expected[i], actual[i]);
+    }
+}
+
+} // namespace fts
+} // namespace mongo
diff --git a/src/mongo/db/fts/unicode/string.cpp b/src/mongo/db/fts/unicode/string.cpp
index 24c6ff8027e..9f29749edbc 100644
--- a/src/mongo/db/fts/unicode/string.cpp
+++ b/src/mongo/db/fts/unicode/string.cpp
@@ -42,6 +42,14 @@ using linenoise_utf8::copyString8to32;
using std::u32string;
+// Builds the String from UTF-8 input; the conversion itself is delegated to setData().
String::String(const StringData utf8_src) {
+ setData(utf8_src);
+}
+
+// Re-populates this String from 'utf8_src' by delegating to setData(), as the constructor does.
+void String::resetData(const StringData utf8_src) {
+ setData(utf8_src);
+}
+
+void String::setData(const StringData utf8_src) {
// _data is the target, resize it so that it's guaranteed to fit all of the input characters,
// plus a null character if there isn't one.
_data.resize(utf8_src.size() + 1);
diff --git a/src/mongo/db/fts/unicode/string.h b/src/mongo/db/fts/unicode/string.h
index 1fa77af2f3f..ddfa6f93870 100644
--- a/src/mongo/db/fts/unicode/string.h
+++ b/src/mongo/db/fts/unicode/string.h
@@ -62,6 +62,11 @@ public:
explicit String(StringData utf8_src);
/**
+ * Reset the String with the new UTF-8 source data, reusing the underlying buffer when possible.
+ * Accepts the same UTF-8 input as the constructor.
+ */
+ void resetData(const StringData utf8_src);
+
+ /**
* Return a lowercased version of the String instance using the Unicode data in u_data.h.
*/
String toLower(CaseFoldMode mode = CaseFoldMode::kNormal) const;
@@ -130,6 +135,11 @@ private:
String(std::u32string&& src);
/**
+ * Helper method for converting a UTF-8 string to a UTF-32 string. Used by both the constructor
+ * and resetData() to populate _data.
+ */
+ void setData(const StringData utf8_src);
+
+ /**
* The underlying UTF-32 data.
*/
std::u32string _data;