diff options
Diffstat (limited to 'src')
20 files changed, 816 insertions, 48 deletions
diff --git a/src/mongo/db/fts/SConscript b/src/mongo/db/fts/SConscript index 3a769baca98..3a98b50f8cc 100644 --- a/src/mongo/db/fts/SConscript +++ b/src/mongo/db/fts/SConscript @@ -42,6 +42,8 @@ baseEnv.Library('base', [ 'fts_language.cpp', 'fts_basic_phrase_matcher.cpp', 'fts_basic_tokenizer.cpp', + 'fts_unicode_phrase_matcher.cpp', + 'fts_unicode_tokenizer.cpp', 'fts_util.cpp', 'fts_element_iterator.cpp', 'stemmer.cpp', @@ -72,19 +74,22 @@ env.Library('ftsmongos', [ ], LIBDEPS=["server_common"]) -env.CppUnitTest( "fts_index_format_test", "fts_index_format_test.cpp", +env.CppUnitTest( "fts_basic_phrase_matcher_test", "fts_basic_phrase_matcher_test.cpp", LIBDEPS=["base"] ) -env.CppUnitTest( "fts_stop_words_test", "stop_words_test.cpp", +env.CppUnitTest( "fts_basic_tokenizer_test", "fts_basic_tokenizer_test.cpp", LIBDEPS=["base"] ) -env.CppUnitTest( "fts_stemmer_test", "stemmer_test.cpp", +env.CppUnitTest( "fts_element_iterator_test", "fts_element_iterator_test.cpp", LIBDEPS=["base"] ) -env.CppUnitTest( "fts_basic_tokenizer_test", "fts_basic_tokenizer_test.cpp", +env.CppUnitTest( "fts_index_format_test", "fts_index_format_test.cpp", LIBDEPS=["base"] ) -env.CppUnitTest( "fts_tokenizer_test", "tokenizer_test.cpp", +env.CppUnitTest( "fts_language_test", "fts_language_test.cpp", + LIBDEPS=["base"] ) + +env.CppUnitTest( "fts_matcher_test", "fts_matcher_test.cpp", LIBDEPS=["base"] ) env.CppUnitTest( "fts_query_test", "fts_query_test.cpp", @@ -93,14 +98,17 @@ env.CppUnitTest( "fts_query_test", "fts_query_test.cpp", env.CppUnitTest( "fts_spec_test", "fts_spec_test.cpp", LIBDEPS=["base"] ) -env.CppUnitTest( "fts_language_test", "fts_language_test.cpp", +env.CppUnitTest( "fts_stemmer_test", "stemmer_test.cpp", LIBDEPS=["base"] ) -env.CppUnitTest( "fts_matcher_test", "fts_matcher_test.cpp", +env.CppUnitTest( "fts_stop_words_test", "stop_words_test.cpp", LIBDEPS=["base"] ) -env.CppUnitTest( "fts_element_iterator_test", "fts_element_iterator_test.cpp", +env.CppUnitTest( 
"fts_tokenizer_test", "tokenizer_test.cpp", LIBDEPS=["base"] ) -env.CppUnitTest( "fts_basic_phrase_matcher_test", "fts_basic_phrase_matcher_test.cpp", +env.CppUnitTest( "fts_unicode_phrase_matcher_test", "fts_unicode_phrase_matcher_test.cpp", + LIBDEPS=["base"] ) + +env.CppUnitTest( "fts_unicode_tokenizer_test", "fts_unicode_tokenizer_test.cpp", LIBDEPS=["base"] ) diff --git a/src/mongo/db/fts/fts_basic_phrase_matcher.cpp b/src/mongo/db/fts/fts_basic_phrase_matcher.cpp index e32174d3b15..da6acc0bbab 100644 --- a/src/mongo/db/fts/fts_basic_phrase_matcher.cpp +++ b/src/mongo/db/fts/fts_basic_phrase_matcher.cpp @@ -37,7 +37,7 @@ using std::string; bool BasicFTSPhraseMatcher::phraseMatches(const string& phrase, const string& haystack, - PhraseMatcherOptions options) const { + Options options) const { if (options & kCaseSensitive) { return haystack.find(phrase) != string::npos; } diff --git a/src/mongo/db/fts/fts_basic_phrase_matcher.h b/src/mongo/db/fts/fts_basic_phrase_matcher.h index c595b9c3bf9..5fd07d28cee 100644 --- a/src/mongo/db/fts/fts_basic_phrase_matcher.h +++ b/src/mongo/db/fts/fts_basic_phrase_matcher.h @@ -36,7 +36,8 @@ namespace fts { /** * A phrase matcher that looks for exact substring matches with optional ASCII-aware case - * insensitivity. + * insensitivity. This phrase matcher does not implement the kDiacriticSensitive match option. All + * operations are inherently diacritic sensitive. 
*/ class BasicFTSPhraseMatcher final : public FTSPhraseMatcher { MONGO_DISALLOW_COPYING(BasicFTSPhraseMatcher); @@ -46,7 +47,7 @@ public: bool phraseMatches(const std::string& phrase, const std::string& haystack, - PhraseMatcherOptions options) const final; + Options options) const override; }; } // namespace fts diff --git a/src/mongo/db/fts/fts_basic_phrase_matcher_test.cpp b/src/mongo/db/fts/fts_basic_phrase_matcher_test.cpp index 9f193e836ad..14ca5b9d95a 100644 --- a/src/mongo/db/fts/fts_basic_phrase_matcher_test.cpp +++ b/src/mongo/db/fts/fts_basic_phrase_matcher_test.cpp @@ -44,7 +44,7 @@ TEST(FtsBasicPhraseMatcher, CaseInsensitive) { std::string nofind2 = "dolor velit"; BasicFTSPhraseMatcher phraseMatcher; - FTSPhraseMatcher::PhraseMatcherOptions options = FTSPhraseMatcher::kNone; + FTSPhraseMatcher::Options options = FTSPhraseMatcher::kNone; ASSERT(phraseMatcher.phraseMatches(find1, str1, options)); ASSERT(phraseMatcher.phraseMatches(find2, str2, options)); @@ -64,7 +64,7 @@ TEST(FtsBasicPhraseMatcher, CaseSensitive) { std::string nofind2 = "Irure dolor"; BasicFTSPhraseMatcher phraseMatcher; - FTSPhraseMatcher::PhraseMatcherOptions options = FTSPhraseMatcher::kCaseSensitive; + FTSPhraseMatcher::Options options = FTSPhraseMatcher::kCaseSensitive; ASSERT(phraseMatcher.phraseMatches(find1, str1, options)); ASSERT(phraseMatcher.phraseMatches(find2, str2, options)); diff --git a/src/mongo/db/fts/fts_basic_tokenizer.cpp b/src/mongo/db/fts/fts_basic_tokenizer.cpp index 9fc41923d40..a053d21140a 100644 --- a/src/mongo/db/fts/fts_basic_tokenizer.cpp +++ b/src/mongo/db/fts/fts_basic_tokenizer.cpp @@ -74,11 +74,11 @@ bool BasicFTSTokenizer::moveNext() { // Stop words are case-sensitive so we need them to be lower cased to check // against the stop word list - if ((_options & FTSTokenizer::FilterStopWords) && _stopWords->isStopWord(word)) { + if ((_options & FTSTokenizer::kFilterStopWords) && _stopWords->isStopWord(word)) { continue; } - if (_options & 
FTSTokenizer::GenerateCaseSensitiveTokens) { + if (_options & FTSTokenizer::kGenerateCaseSensitiveTokens) { word = token.data.toString(); } diff --git a/src/mongo/db/fts/fts_basic_tokenizer.h b/src/mongo/db/fts/fts_basic_tokenizer.h index 221de72bb8c..1206f494e57 100644 --- a/src/mongo/db/fts/fts_basic_tokenizer.h +++ b/src/mongo/db/fts/fts_basic_tokenizer.h @@ -52,18 +52,21 @@ class StopWords; * * For each word returns a stem version of a word optimized for full text indexing. * Optionally supports returning case sensitive search terms. + * + * BasicFTSTokenizer does not implement the kGenerateDiacriticSensitiveTokens option. All tokens + * generated by the BasicFTSTokenizer are inherently diacritic sensitive. */ -class BasicFTSTokenizer : public FTSTokenizer { +class BasicFTSTokenizer final : public FTSTokenizer { MONGO_DISALLOW_COPYING(BasicFTSTokenizer); public: BasicFTSTokenizer(const FTSLanguage* language); - void reset(StringData document, Options options) final; + void reset(StringData document, Options options) override; - bool moveNext() final; + bool moveNext() override; - StringData get() const final; + StringData get() const override; private: const FTSLanguage* const _language; diff --git a/src/mongo/db/fts/fts_basic_tokenizer_test.cpp b/src/mongo/db/fts/fts_basic_tokenizer_test.cpp index 5feab67face..0359da805d3 100644 --- a/src/mongo/db/fts/fts_basic_tokenizer_test.cpp +++ b/src/mongo/db/fts/fts_basic_tokenizer_test.cpp @@ -39,7 +39,7 @@ std::vector<std::string> tokenizeString(const char* str, const char* language) { std::unique_ptr<FTSTokenizer> tokenizer(swl.getValue()->createTokenizer()); - tokenizer->reset(str, FTSTokenizer::None); + tokenizer->reset(str, FTSTokenizer::kNone); std::vector<std::string> terms; diff --git a/src/mongo/db/fts/fts_matcher.cpp b/src/mongo/db/fts/fts_matcher.cpp index 52c67e337e9..a4b2a6e4638 100644 --- a/src/mongo/db/fts/fts_matcher.cpp +++ b/src/mongo/db/fts/fts_matcher.cpp @@ -82,8 +82,8 @@ bool
FTSMatcher::_hasPositiveTerm_string(const FTSLanguage* language, const stri std::unique_ptr<FTSTokenizer> tokenizer(language->createTokenizer()); tokenizer->reset(raw.c_str(), - _query.getCaseSensitive() ? FTSTokenizer::GenerateCaseSensitiveTokens - : FTSTokenizer::None); + _query.getCaseSensitive() ? FTSTokenizer::kGenerateCaseSensitiveTokens + : FTSTokenizer::kNone); while (tokenizer->moveNext()) { string word = tokenizer->get().toString(); @@ -115,8 +115,8 @@ bool FTSMatcher::_hasNegativeTerm_string(const FTSLanguage* language, const stri std::unique_ptr<FTSTokenizer> tokenizer(language->createTokenizer()); tokenizer->reset(raw.c_str(), - _query.getCaseSensitive() ? FTSTokenizer::GenerateCaseSensitiveTokens - : FTSTokenizer::None); + _query.getCaseSensitive() ? FTSTokenizer::kGenerateCaseSensitiveTokens + : FTSTokenizer::kNone); while (tokenizer->moveNext()) { string word = tokenizer->get().toString(); diff --git a/src/mongo/db/fts/fts_phrase_matcher.h b/src/mongo/db/fts/fts_phrase_matcher.h index ae7b8c8a9f9..7386852ebff 100644 --- a/src/mongo/db/fts/fts_phrase_matcher.h +++ b/src/mongo/db/fts/fts_phrase_matcher.h @@ -41,7 +41,7 @@ class FTSPhraseMatcher { public: virtual ~FTSPhraseMatcher() = default; - using PhraseMatcherOptions = uint8_t; + using Options = uint8_t; /** * Use no options. @@ -54,11 +54,16 @@ public: static const int kCaseSensitive = 1 << 0; /** + * Treat diacritics as significant (do not remove them) as part of phrase matching. + */ + static const int kDiacriticSensitive = 1 << 1; + + /** * Does the string 'phrase' occur in the string 'haystack'?
*/ virtual bool phraseMatches(const std::string& phrase, const std::string& haystack, - PhraseMatcherOptions options) const = 0; + Options options) const = 0; }; } // namespace fts diff --git a/src/mongo/db/fts/fts_query.cpp b/src/mongo/db/fts/fts_query.cpp index 4fa332266a0..9fbf0e04978 100644 --- a/src/mongo/db/fts/fts_query.cpp +++ b/src/mongo/db/fts/fts_query.cpp @@ -133,7 +133,7 @@ Status FTSQuery::parse(const string& query, } void FTSQuery::_addTerms(FTSTokenizer* tokenizer, const string& sentence, bool negated) { - tokenizer->reset(sentence.c_str(), FTSTokenizer::FilterStopWords); + tokenizer->reset(sentence.c_str(), FTSTokenizer::kFilterStopWords); auto& activeTerms = negated ? _negatedTerms : _positiveTerms; @@ -160,8 +160,7 @@ void FTSQuery::_addTerms(FTSTokenizer* tokenizer, const string& sentence, bool n } tokenizer->reset(sentence.c_str(), - static_cast<FTSTokenizer::Options>(FTSTokenizer::FilterStopWords | - FTSTokenizer::GenerateCaseSensitiveTokens)); + FTSTokenizer::kFilterStopWords | FTSTokenizer::kGenerateCaseSensitiveTokens); // If we want case-sensitivity, get the case-sensitive token while (tokenizer->moveNext()) { diff --git a/src/mongo/db/fts/fts_spec.cpp b/src/mongo/db/fts/fts_spec.cpp index eb7e018b522..1ec72152351 100644 --- a/src/mongo/db/fts/fts_spec.cpp +++ b/src/mongo/db/fts/fts_spec.cpp @@ -182,7 +182,7 @@ void FTSSpec::_scoreStringV2(FTSTokenizer* tokenizer, unsigned numTokens = 0; - tokenizer->reset(raw.rawData(), FTSTokenizer::FilterStopWords); + tokenizer->reset(raw.rawData(), FTSTokenizer::kFilterStopWords); while (tokenizer->moveNext()) { string term = tokenizer->get().toString(); diff --git a/src/mongo/db/fts/fts_tokenizer.h b/src/mongo/db/fts/fts_tokenizer.h index 4c99506eae2..f6c71cd6c20 100644 --- a/src/mongo/db/fts/fts_tokenizer.h +++ b/src/mongo/db/fts/fts_tokenizer.h @@ -29,6 +29,8 @@ #pragma once +#include <cstdint> + #include "mongo/base/disallow_copying.h" #include "mongo/base/string_data.h" @@ -49,24 +51,29 @@ public: 
virtual ~FTSTokenizer() = default; /** - * Options for generating tokens + * Options for generating tokens. + */ + using Options = uint8_t; + + /** + * Default means lower cased, diacritics removed, and stop words are not filtered. + */ + static const Options kNone = 0; + + /** + * Do not lower case terms. + */ + static const Options kGenerateCaseSensitiveTokens = 1 << 0; + + /** + * Filter out stop words from return tokens. + */ + static const Options kFilterStopWords = 1 << 1; + + /** + * Do not remove diacritics from terms. */ - enum Options { - /** - * Default means lower cased, and stop words are not filtered. - */ - None = 0, - - /** - * Do not lower case terms. - */ - GenerateCaseSensitiveTokens = 1 << 0, - - /** - * Filter out stop words from return tokens. - */ - FilterStopWords = 1 << 1, - }; + static const Options kGenerateDiacriticSensitiveTokens = 1 << 2; /** * Process a new document, and discards any previous results. diff --git a/src/mongo/db/fts/fts_unicode_phrase_matcher.cpp b/src/mongo/db/fts/fts_unicode_phrase_matcher.cpp new file mode 100644 index 00000000000..c467b474be1 --- /dev/null +++ b/src/mongo/db/fts/fts_unicode_phrase_matcher.cpp @@ -0,0 +1,65 @@ +/** + * Copyright (C) 2015 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/fts/fts_unicode_phrase_matcher.h" + +#include "mongo/db/fts/fts_language.h" +#include "mongo/db/fts/unicode/string.h" + +namespace mongo { +namespace fts { + +using std::string; + +UnicodeFTSPhraseMatcher::UnicodeFTSPhraseMatcher(const string& language) { + if (language == "turkish") { + _caseFoldMode = unicode::CaseFoldMode::kTurkish; + } else { + _caseFoldMode = unicode::CaseFoldMode::kNormal; + } +} + +bool UnicodeFTSPhraseMatcher::phraseMatches(const string& phrase, + const string& haystack, + Options options) const { + unicode::String::SubstrMatchOptions matchOptions = unicode::String::kNone; + + if (options & kCaseSensitive) { + matchOptions |= unicode::String::kCaseSensitive; + } + + if (options & kDiacriticSensitive) { + matchOptions |= unicode::String::kDiacriticSensitive; + } + + return unicode::String::substrMatch( + unicode::String(haystack), unicode::String(phrase), matchOptions, _caseFoldMode); +} + +} // namespace fts +} // namespace mongo diff --git a/src/mongo/db/fts/fts_unicode_phrase_matcher.h b/src/mongo/db/fts/fts_unicode_phrase_matcher.h new file mode 100644 index 00000000000..b584a7c7185 --- /dev/null +++ 
b/src/mongo/db/fts/fts_unicode_phrase_matcher.h @@ -0,0 +1,64 @@ +/** + * Copyright (C) 2015 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include "mongo/base/disallow_copying.h" +#include "mongo/db/fts/fts_phrase_matcher.h" +#include "mongo/db/fts/unicode/codepoints.h" + +namespace mongo { +namespace fts { + +class FTSLanguage; + +/** + * UnicodeFTSPhraseMatcher + * + * A phrase matcher that looks for exact substring matches that ignore diacritics, and with UTF-8 + * aware case folding if the phrase match is not specified as case sensitive. 
Optionally, the phrase + * matching can be diacritic sensitive if the kDiacriticSensitive option is passed. Additionally, if + * the language string passed to the phrase matcher's constructor is Turkish (uses the special I + * case fold mapping), the phrase matcher will take that into account. + */ +class UnicodeFTSPhraseMatcher final : public FTSPhraseMatcher { + MONGO_DISALLOW_COPYING(UnicodeFTSPhraseMatcher); + +public: + UnicodeFTSPhraseMatcher(const std::string& language); + + bool phraseMatches(const std::string& phrase, + const std::string& haystack, + Options options) const override; + +private: + unicode::CaseFoldMode _caseFoldMode; +}; + +} // namespace fts +} // namespace mongo diff --git a/src/mongo/db/fts/fts_unicode_phrase_matcher_test.cpp b/src/mongo/db/fts/fts_unicode_phrase_matcher_test.cpp new file mode 100644 index 00000000000..9fa63a61f14 --- /dev/null +++ b/src/mongo/db/fts/fts_unicode_phrase_matcher_test.cpp @@ -0,0 +1,137 @@ +/** + * Copyright (C) 2015 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library.
You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/fts/fts_unicode_phrase_matcher.h" +#include "mongo/unittest/unittest.h" + +namespace mongo { +namespace fts { + +// Case insensitive & diacritic insensitive match. +TEST(FtsUnicodePhraseMatcher, CaseAndDiacriticInsensitive) { + std::string str = + "El pingüino Wenceslao hizo kilómetros bajo exhaustiva lluvia y frío, añoraba"; + std::string find1 = "pinguino wenceslao"; + std::string find2 = "frio, anoraba"; + + std::string nofind1 = "bajo lluvia"; + std::string nofind2 = "El Wenceslao"; + + UnicodeFTSPhraseMatcher phraseMatcher("spanish"); + FTSPhraseMatcher::Options options = FTSPhraseMatcher::kNone; + + ASSERT(phraseMatcher.phraseMatches(find1, str, options)); + ASSERT(phraseMatcher.phraseMatches(find2, str, options)); + + ASSERT_FALSE(phraseMatcher.phraseMatches(nofind1, str, options)); + ASSERT_FALSE(phraseMatcher.phraseMatches(nofind2, str, options)); +} + +// Case sensitive & diacritic insensitive match. 
+TEST(FtsUnicodePhraseMatcher, CaseSensitiveAndDiacriticInsensitive) { + std::string str = + "El pingüino Wenceslao hizo kilómetros bajo exhaustiva lluvia y frío, añoraba"; + std::string find1 = "pinguino Wenceslao"; + std::string find2 = "El pinguino"; + + std::string nofind1 = "pinguino wenceslao"; + std::string nofind2 = "el pinguino"; + + UnicodeFTSPhraseMatcher phraseMatcher("spanish"); + FTSPhraseMatcher::Options options = FTSPhraseMatcher::kCaseSensitive; + + ASSERT(phraseMatcher.phraseMatches(find1, str, options)); + ASSERT(phraseMatcher.phraseMatches(find2, str, options)); + + ASSERT_FALSE(phraseMatcher.phraseMatches(nofind1, str, options)); + ASSERT_FALSE(phraseMatcher.phraseMatches(nofind2, str, options)); +} + +// Case insensitive & diacritic sensitive match. +TEST(FtsUnicodePhraseMatcher, CaseInsensitiveAndDiacriticSensitive) { + std::string str = + "El pingüino Wenceslao hizo kilómetros bajo exhaustiva lluvia y frío, añoraba"; + std::string find1 = "HIZO KILÓMETROS"; + std::string find2 = "el pingüino"; + + std::string nofind1 = "hizo kilometros"; + std::string nofind2 = "pinguino"; + + UnicodeFTSPhraseMatcher phraseMatcher("spanish"); + FTSPhraseMatcher::Options options = FTSPhraseMatcher::kDiacriticSensitive; + + ASSERT(phraseMatcher.phraseMatches(find1, str, options)); + ASSERT(phraseMatcher.phraseMatches(find2, str, options)); + + ASSERT_FALSE(phraseMatcher.phraseMatches(nofind1, str, options)); + ASSERT_FALSE(phraseMatcher.phraseMatches(nofind2, str, options)); +} + +// Case sensitive & diacritic sensitive match. 
+TEST(FtsUnicodePhraseMatcher, CaseAndDiacriticSensitive) { + std::string str = + "El pingüino Wenceslao hizo kilómetros bajo exhaustiva lluvia y frío, añoraba"; + std::string find1 = "pingüino Wenceslao"; + std::string find2 = "kilómetros bajo"; + + std::string nofind1 = "pinguino Wenceslao"; + std::string nofind2 = "kilómetros BaJo"; + + UnicodeFTSPhraseMatcher phraseMatcher("spanish"); + FTSPhraseMatcher::Options options = + FTSPhraseMatcher::kCaseSensitive | FTSPhraseMatcher::kDiacriticSensitive; + + ASSERT(phraseMatcher.phraseMatches(find1, str, options)); + ASSERT(phraseMatcher.phraseMatches(find2, str, options)); + + ASSERT_FALSE(phraseMatcher.phraseMatches(nofind1, str, options)); + ASSERT_FALSE(phraseMatcher.phraseMatches(nofind2, str, options)); +} + +// Case insensitive & diacritic insensitive match. +TEST(FtsUnicodePhraseMatcher, CaseAndDiacriticInsensitiveTurkish) { + std::string str = "Pijamalı hasta yağız şoföre çabucak güvendi."; + std::string find1 = "PİJAMALI hasta"; + std::string find2 = "YAGIZ sofore"; + + std::string nofind1 = "çabucak GÜVENDI"; + std::string nofind2 = "yagiz sofore"; + + UnicodeFTSPhraseMatcher phraseMatcher("turkish"); + FTSPhraseMatcher::Options options = FTSPhraseMatcher::kNone; + + ASSERT(phraseMatcher.phraseMatches(find1, str, options)); + ASSERT(phraseMatcher.phraseMatches(find2, str, options)); + + ASSERT_FALSE(phraseMatcher.phraseMatches(nofind1, str, options)); + ASSERT_FALSE(phraseMatcher.phraseMatches(nofind2, str, options)); +} + + +} // namespace fts +} // namespace mongo diff --git a/src/mongo/db/fts/fts_unicode_tokenizer.cpp b/src/mongo/db/fts/fts_unicode_tokenizer.cpp new file mode 100644 index 00000000000..8cdce180dea --- /dev/null +++ b/src/mongo/db/fts/fts_unicode_tokenizer.cpp @@ -0,0 +1,125 @@ +/** + * Copyright (C) 2015 MongoDB Inc. 
+ * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. 
+ */ + +#include "mongo/platform/basic.h" + +#include "mongo/db/fts/fts_unicode_tokenizer.h" + +#include "mongo/db/fts/fts_query.h" +#include "mongo/db/fts/fts_spec.h" +#include "mongo/db/fts/stemmer.h" +#include "mongo/db/fts/stop_words.h" +#include "mongo/db/fts/tokenizer.h" +#include "mongo/stdx/memory.h" +#include "mongo/util/mongoutils/str.h" +#include "mongo/util/stringutils.h" + +namespace mongo { +namespace fts { + +using std::string; + +UnicodeFTSTokenizer::UnicodeFTSTokenizer(const FTSLanguage* language) + : _language(language), _stemmer(language), _stopWords(StopWords::getStopWords(language)) { + if (_language->str() == "english") { + _delimListLanguage = unicode::DelimiterListLanguage::kEnglish; + } else { + _delimListLanguage = unicode::DelimiterListLanguage::kNotEnglish; + } + + if (_language->str() == "turkish") { + _caseFoldMode = unicode::CaseFoldMode::kTurkish; + } else { + _caseFoldMode = unicode::CaseFoldMode::kNormal; + } +} + +void UnicodeFTSTokenizer::reset(StringData document, Options options) { + _options = options; + _pos = 0; + _document = unicode::String(document); + + // Skip any leading delimiters (and handle the case where the document is entirely delimiters). + _skipDelimiters(); +} + +bool UnicodeFTSTokenizer::moveNext() { + while (true) { + if (_pos >= _document.size()) { + _stem = ""; + return false; + } + + // Traverse through non-delimiters and build the next token. + size_t start = _pos++; + while (_pos < _document.size() && + (!unicode::codepointIsDelimiter(_document[_pos], _delimListLanguage))) { + ++_pos; + } + unicode::String token = _document.substr(start, _pos - start); + + // Skip the delimiters before the next token. + _skipDelimiters(); + + // Stop words are case-sensitive and diacritic sensitive, so we need them to be lower cased + // but with diacritics not removed to check against the stop word list. 
+ unicode::String word = token.toLower(_caseFoldMode); + + if ((_options & kFilterStopWords) && _stopWords->isStopWord(word.toString())) { + continue; + } + + if (_options & kGenerateCaseSensitiveTokens) { + word = token; + } + + // The stemmer is diacritic sensitive, so stem the word before removing diacritics. + _stem = _stemmer.stem(word.toString()); + + if (!(_options & kGenerateDiacriticSensitiveTokens)) { + token.resetData(_stem); + _stem = token.removeDiacritics().toString(); + } + + return true; + } +} + +StringData UnicodeFTSTokenizer::get() const { + return _stem; +} + +void UnicodeFTSTokenizer::_skipDelimiters() { + while (_pos < _document.size() && + unicode::codepointIsDelimiter(_document[_pos], _delimListLanguage)) { + ++_pos; + } +} + +} // namespace fts +} // namespace mongo diff --git a/src/mongo/db/fts/fts_unicode_tokenizer.h b/src/mongo/db/fts/fts_unicode_tokenizer.h new file mode 100644 index 00000000000..0312ffc300b --- /dev/null +++ b/src/mongo/db/fts/fts_unicode_tokenizer.h @@ -0,0 +1,92 @@ +/** + * Copyright (C) 2015 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. 
You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include "mongo/base/disallow_copying.h" +#include "mongo/base/string_data.h" +#include "mongo/db/fts/fts_tokenizer.h" +#include "mongo/db/fts/stemmer.h" +#include "mongo/db/fts/tokenizer.h" +#include "mongo/db/fts/unicode/string.h" + +namespace mongo { +namespace fts { + +class FTSLanguage; +class StopWords; + +/** + * UnicodeFTSTokenizer + * An iterator of "documents" where a document contains words delimited by a predefined set of + * Unicode delimiters (see gen_delimiter_list.py) + * Uses + * - A list of Unicode delimiters for tokenizing words (see gen_delimiter_list.py). + * - tolower from mongo::unicode, which supports UTF-8 simple and Turkish case folding + * - Stemmer (ie, Snowball Stemmer) to stem words. + * - Embedded stop word lists for each language in StopWords class + * + * For each word returns a stem version of a word optimized for full text indexing. + * Optionally supports returning case sensitive and/or diacritic sensitive search terms. + */ +class UnicodeFTSTokenizer final : public FTSTokenizer { + MONGO_DISALLOW_COPYING(UnicodeFTSTokenizer); + +public: + UnicodeFTSTokenizer(const FTSLanguage* language); + + void reset(StringData document, Options options) override; + + bool moveNext() override; + + StringData get() const override; + +private: + /** + * Helper that moves the tokenizer past all delimiters that shouldn't be considered part of + * tokens.
+ */ + void _skipDelimiters(); + + unicode::DelimiterListLanguage _delimListLanguage; + unicode::CaseFoldMode _caseFoldMode; + + const FTSLanguage* const _language; + const Stemmer _stemmer; + const StopWords* const _stopWords; + + unicode::String _document; + size_t _pos; + + Options _options; + + std::string _stem; +}; + +} // namespace fts +} // namespace mongo diff --git a/src/mongo/db/fts/fts_unicode_tokenizer_test.cpp b/src/mongo/db/fts/fts_unicode_tokenizer_test.cpp new file mode 100644 index 00000000000..e73c9599682 --- /dev/null +++ b/src/mongo/db/fts/fts_unicode_tokenizer_test.cpp @@ -0,0 +1,244 @@ +/** + * Copyright (C) 2015 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. 
If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/fts/fts_spec.h" +#include "mongo/db/fts/fts_tokenizer.h" +#include "mongo/db/fts/fts_unicode_tokenizer.h" +#include "mongo/unittest/unittest.h" + +namespace mongo { +namespace fts { + +std::vector<std::string> tokenizeString(const char* str, + const char* language, + FTSTokenizer::Options options) { + StatusWithFTSLanguage swl = FTSLanguage::make(language, TEXT_INDEX_VERSION_2); + ASSERT_OK(swl); + + UnicodeFTSTokenizer tokenizer(swl.getValue()); + + tokenizer.reset(str, options); + + std::vector<std::string> terms; + + while (tokenizer.moveNext()) { + terms.push_back(tokenizer.get().toString()); + } + + return terms; +} + +// Ensure punctuation is filtered out of the indexed document and the 's is not a separate token. +TEST(FtsUnicodeTokenizer, English) { + std::vector<std::string> terms = + tokenizeString("Do you see Mark's dog running?", "english", FTSTokenizer::kNone); + + ASSERT_EQUALS(6U, terms.size()); + ASSERT_EQUALS("do", terms[0]); + ASSERT_EQUALS("you", terms[1]); + ASSERT_EQUALS("see", terms[2]); + ASSERT_EQUALS("mark", terms[3]); + ASSERT_EQUALS("dog", terms[4]); + ASSERT_EQUALS("run", terms[5]); +} + +// Ensure that the tokenization still works correctly when there are leading and/or trailing +// delimiters. +TEST(FtsUnicodeTokenizer, EnglishLeadingAndTrailingDelimiters) { + std::vector<std::string> terms = + tokenizeString(" , Do you see Mark's dog running? ", "english", FTSTokenizer::kNone); + + ASSERT_EQUALS(6U, terms.size()); + ASSERT_EQUALS("do", terms[0]); + ASSERT_EQUALS("you", terms[1]); + ASSERT_EQUALS("see", terms[2]); + ASSERT_EQUALS("mark", terms[3]); + ASSERT_EQUALS("dog", terms[4]); + ASSERT_EQUALS("run", terms[5]); +} + +// Ensure that strings containing only delimiters produce no tokens. 
+TEST(FtsUnicodeTokenizer, OnlyDelimiters) { + std::vector<std::string> terms = tokenizeString(" ", "english", FTSTokenizer::kNone); + + ASSERT_EQUALS(0U, terms.size()); +} + +// Ensure punctuation is filtered out of the indexed document and the 'est is separated. +TEST(FtsUnicodeTokenizer, FrenchAndNonAsciiPunctuation) { + std::vector<std::string> terms = tokenizeString( + "Voyez-vous «le chien» de Mark courante? C'est bien!", "french", FTSTokenizer::kNone); + + ASSERT_EQUALS(10U, terms.size()); + ASSERT_EQUALS("voi", terms[0]); + ASSERT_EQUALS("vous", terms[1]); + ASSERT_EQUALS("le", terms[2]); + ASSERT_EQUALS("chien", terms[3]); + ASSERT_EQUALS("de", terms[4]); + ASSERT_EQUALS("mark", terms[5]); + ASSERT_EQUALS("cour", terms[6]); + ASSERT_EQUALS("c", terms[7]); + ASSERT_EQUALS("est", terms[8]); + ASSERT_EQUALS("bien", terms[9]); +} + +// Ensure accented French verb forms stem to "parl" while their unaccented spellings do not. +TEST(FtsUnicodeTokenizer, FrenchDiacriticStemming) { + std::vector<std::string> terms = + tokenizeString("parlames, parlates, parlerent, parlâmes, parlâtes, parlèrent", + "french", + FTSTokenizer::kNone); + + ASSERT_EQUALS(6U, terms.size()); + ASSERT_EQUALS("parlam", terms[0]); + ASSERT_EQUALS("parlat", terms[1]); + ASSERT_EQUALS("parlerent", terms[2]); + ASSERT_EQUALS("parl", terms[3]); + ASSERT_EQUALS("parl", terms[4]); + ASSERT_EQUALS("parl", terms[5]); +} + +// Ensure punctuation is filtered out of the indexed document and that diacritics are not in the +// resulting tokens. 
+TEST(FtsUnicodeTokenizer, Turkish) { + std::vector<std::string> terms = tokenizeString( + "KAÇ YAŞINDASIN SEN, VE SEN NEREDEN VARDIR?", "turkish", FTSTokenizer::kNone); + + ASSERT_EQUALS(7U, terms.size()); + ASSERT_EQUALS("kac", terms[0]); + ASSERT_EQUALS("yas", terms[1]); + ASSERT_EQUALS("sen", terms[2]); + ASSERT_EQUALS("ve", terms[3]); + ASSERT_EQUALS("sen", terms[4]); + ASSERT_EQUALS("nere", terms[5]); + ASSERT_EQUALS("var", terms[6]); +} + +// Ensure punctuation is filtered out of the indexed document, that diacritics are not in the +// resulting tokens, and that the generated tokens are not lowercased. +TEST(FtsUnicodeTokenizer, TurkishCaseSensitive) { + std::vector<std::string> terms = tokenizeString("KAÇ YAŞINDASIN SEN, VE SEN NEREDEN VARDIR?", + "turkish", + FTSTokenizer::kGenerateCaseSensitiveTokens); + + ASSERT_EQUALS(7U, terms.size()); + ASSERT_EQUALS("KAC", terms[0]); + ASSERT_EQUALS("YASINDASIN", terms[1]); + ASSERT_EQUALS("SEN", terms[2]); + ASSERT_EQUALS("VE", terms[3]); + ASSERT_EQUALS("SEN", terms[4]); + ASSERT_EQUALS("NEREDEN", terms[5]); + ASSERT_EQUALS("VARDIR", terms[6]); +} + +// Ensure punctuation is filtered out of the indexed document, that diacritics are kept in the +// resulting tokens, and that the generated tokens are lowercased. +TEST(FtsUnicodeTokenizer, TurkishDiacriticSensitive) { + std::vector<std::string> terms = + tokenizeString("KAÇ YAŞINDASIN SEN, VE SEN NEREDEN VARDIR?", + "turkish", + FTSTokenizer::kGenerateDiacriticSensitiveTokens); + + ASSERT_EQUALS(7U, terms.size()); + ASSERT_EQUALS("kaç", terms[0]); + ASSERT_EQUALS("yaş", terms[1]); + ASSERT_EQUALS("sen", terms[2]); + ASSERT_EQUALS("ve", terms[3]); + ASSERT_EQUALS("sen", terms[4]); + ASSERT_EQUALS("nere", terms[5]); + ASSERT_EQUALS("var", terms[6]); +} + +// Ensure punctuation is filtered out of the indexed document, that diacritics are kept in the +// resulting tokens, and that the generated tokens are not lowercased. 
+TEST(FtsUnicodeTokenizer, TurkishDiacriticAndCaseSensitive) { + std::vector<std::string> terms = + tokenizeString("KAÇ YAŞINDASIN SEN, VE SEN NEREDEN VARDIR?", + "turkish", + FTSTokenizer::kGenerateDiacriticSensitiveTokens | + FTSTokenizer::kGenerateCaseSensitiveTokens); + + ASSERT_EQUALS(7U, terms.size()); + ASSERT_EQUALS("KAÇ", terms[0]); + ASSERT_EQUALS("YAŞINDASIN", terms[1]); + ASSERT_EQUALS("SEN", terms[2]); + ASSERT_EQUALS("VE", terms[3]); + ASSERT_EQUALS("SEN", terms[4]); + ASSERT_EQUALS("NEREDEN", terms[5]); + ASSERT_EQUALS("VARDIR", terms[6]); +} + +// Ensure punctuation and stop words are filtered out of the indexed document, that diacritics +// are in the resulting tokens, and that the generated tokens are not lowercased. +TEST(FtsUnicodeTokenizer, TurkishDiacriticAndCaseSensitiveAndStopWords) { + std::vector<std::string> terms = tokenizeString( + "KAÇ YAŞINDASIN SEN, VE SEN NEREDEN VARDIR?", + "turkish", + FTSTokenizer::kGenerateDiacriticSensitiveTokens | + FTSTokenizer::kGenerateCaseSensitiveTokens | FTSTokenizer::kFilterStopWords); + + ASSERT_EQUALS(4U, terms.size()); + ASSERT_EQUALS("KAÇ", terms[0]); + ASSERT_EQUALS("YAŞINDASIN", terms[1]); + ASSERT_EQUALS("NEREDEN", terms[2]); + ASSERT_EQUALS("VARDIR", terms[3]); +} + + +// Ensure that stop words are only removed if they contain the correct diacritics. +TEST(FtsUnicodeTokenizer, FrenchStopWords) { + std::vector<std::string> terms = + tokenizeString("Je ne vais pas etre énervé. Je vais être excité.", + "french", + FTSTokenizer::kFilterStopWords); + + ASSERT_EQUALS(5U, terms.size()); + ASSERT_EQUALS("vais", terms[0]); + ASSERT_EQUALS("etre", terms[1]); + ASSERT_EQUALS("enerv", terms[2]); + ASSERT_EQUALS("vais", terms[3]); + ASSERT_EQUALS("excit", terms[4]); +} + +// Same as FrenchStopWords, but the resulting tokens must also keep their diacritics. +TEST(FtsUnicodeTokenizer, FrenchStopWordsAndDiacriticSensitive) { + std::vector<std::string> terms = tokenizeString( + "Je ne vais pas etre énervé. 
Je vais être excité.", + "french", + FTSTokenizer::kFilterStopWords | FTSTokenizer::kGenerateDiacriticSensitiveTokens); + + ASSERT_EQUALS(5U, terms.size()); + ASSERT_EQUALS("vais", terms[0]); + ASSERT_EQUALS("etre", terms[1]); + ASSERT_EQUALS("énerv", terms[2]); + ASSERT_EQUALS("vais", terms[3]); + ASSERT_EQUALS("excit", terms[4]); +} + +} // namespace fts +} // namespace mongo diff --git a/src/mongo/db/fts/unicode/string.cpp b/src/mongo/db/fts/unicode/string.cpp index 24c6ff8027e..9f29749edbc 100644 --- a/src/mongo/db/fts/unicode/string.cpp +++ b/src/mongo/db/fts/unicode/string.cpp @@ -42,6 +42,14 @@ using linenoise_utf8::copyString8to32; using std::u32string; String::String(const StringData utf8_src) { + setData(utf8_src); +} + +void String::resetData(const StringData utf8_src) { + setData(utf8_src); +} + +void String::setData(const StringData utf8_src) { // _data is the target, resize it so that it's guaranteed to fit all of the input characters, // plus a null character if there isn't one. _data.resize(utf8_src.size() + 1); diff --git a/src/mongo/db/fts/unicode/string.h b/src/mongo/db/fts/unicode/string.h index 1fa77af2f3f..ddfa6f93870 100644 --- a/src/mongo/db/fts/unicode/string.h +++ b/src/mongo/db/fts/unicode/string.h @@ -62,6 +62,11 @@ public: explicit String(StringData utf8_src); /** + * Reset the String with the new UTF-8 source data, reusing the underlying buffer when possible. + */ + void resetData(const StringData utf8_src); + + /** * Return a lowercased version of the String instance using the Unicode data in u_data.h. */ String toLower(CaseFoldMode mode = CaseFoldMode::kNormal) const; @@ -130,6 +135,11 @@ private: String(std::u32string&& src); /** + * Helper method for converting a UTF-8 string to a UTF-32 string. + */ + void setData(const StringData utf8_src); + + /** * The underlying UTF-32 data. */ std::u32string _data; |