diff options
author | Adam Chelminski <adam.chelminski@mongodb.com> | 2015-07-29 15:05:21 -0400 |
---|---|---|
committer | Adam Chelminski <adam.chelminski@mongodb.com> | 2015-08-11 16:56:55 -0400 |
commit | 92eac3b57d8beaf063fced8839cd870f97826bb7 (patch) | |
tree | 0db84953876345d4725576538c14783cb81391e9 /src/mongo/db/fts | |
parent | 657343ccff986bd2f8c46fc7455db4238e8801d1 (diff) | |
download | mongo-92eac3b57d8beaf063fced8839cd870f97826bb7.tar.gz |
SERVER-19557 Add text index v3
Diffstat (limited to 'src/mongo/db/fts')
-rw-r--r-- | src/mongo/db/fts/fts_index_format.cpp | 8 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_language.cpp | 232 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_language.h | 38 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_language_test.cpp | 44 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_matcher.cpp | 38 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_matcher.h | 9 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_matcher_test.cpp | 22 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_query.cpp | 25 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_query.h | 9 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_query_test.cpp | 54 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_spec.cpp | 38 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_spec_test.cpp | 5 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_unicode_tokenizer_test.cpp | 5 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_util.h | 3 |
14 files changed, 348 insertions, 182 deletions
diff --git a/src/mongo/db/fts/fts_index_format.cpp b/src/mongo/db/fts/fts_index_format.cpp index f7110d80858..dcf96e25126 100644 --- a/src/mongo/db/fts/fts_index_format.cpp +++ b/src/mongo/db/fts/fts_index_format.cpp @@ -64,14 +64,14 @@ const size_t termKeyLength = termKeyPrefixLength + termKeySuffixLength; /** * Returns size of buffer required to store term in index key. * In version 1, terms are stored verbatim in key. - * In version 2, terms longer than 32 characters are hashed and combined + * In version 2 and above, terms longer than 32 characters are hashed and combined * with a prefix. */ int guessTermSize(const std::string& term, TextIndexVersion textIndexVersion) { if (TEXT_INDEX_VERSION_1 == textIndexVersion) { return term.size(); } else { - invariant(TEXT_INDEX_VERSION_2 == textIndexVersion); + invariant(TEXT_INDEX_VERSION_2 <= textIndexVersion); if (term.size() <= termKeyPrefixLength) { return term.size(); } @@ -184,9 +184,9 @@ void FTSIndexFormat::_appendIndexKey(BSONObjBuilder& b, b.append("", weight); } // See comments at the top of file for termKeyPrefixLength. - // Apply hash for text index version 2 to long terms (longer than 32 characters). + // Apply hash for text index version 2 and above to long terms (longer than 32 characters). else { - invariant(TEXT_INDEX_VERSION_2 == textIndexVersion); + invariant(TEXT_INDEX_VERSION_2 <= textIndexVersion); if (term.size() <= termKeyPrefixLength) { b.append("", term); } else { diff --git a/src/mongo/db/fts/fts_language.cpp b/src/mongo/db/fts/fts_language.cpp index 1180cfa17b1..b01e9de6508 100644 --- a/src/mongo/db/fts/fts_language.cpp +++ b/src/mongo/db/fts/fts_language.cpp @@ -35,6 +35,8 @@ #include "mongo/base/init.h" #include "mongo/db/fts/fts_basic_phrase_matcher.h" #include "mongo/db/fts/fts_basic_tokenizer.h" +#include "mongo/db/fts/fts_unicode_phrase_matcher.h" +#include "mongo/db/fts/fts_unicode_tokenizer.h" #include "mongo/stdx/memory.h" #include "mongo/util/assert_util.h" #include "mongo/util/mongoutils/str.h" @@ -70,48 +72,101 @@ struct LanguageStringCompare { } }; -// Lookup table from user language string (case-insensitive) to FTSLanguage. Populated -// by initializers in group FTSAllLanguagesRegistered and initializer -// FTSRegisterLanguageAliases. For use with TEXT_INDEX_VERSION_2 text indexes only. -typedef std::map<std::string, const FTSLanguage*, LanguageStringCompare> LanguageMapV2; -LanguageMapV2 languageMapV2; +// Lookup table from user language string (case-insensitive) to FTSLanguage. +// Populated by initializers in initializer FTSRegisterV2LanguagesAndLater and initializer +// FTSRegisterLanguageAliases. For use with TEXT_INDEX_VERSION_2 text indexes and above. +typedef std::map<std::string, const FTSLanguage*, LanguageStringCompare> LanguageMap; + +LanguageMap languageMapV3; +LanguageMap languageMapV2; // Like languageMapV2, but for use with TEXT_INDEX_VERSION_1 text indexes. // Case-sensitive by lookup key. -typedef std::map<StringData, const FTSLanguage*> LanguageMapV1; -LanguageMapV1 languageMapV1; +typedef std::map<StringData, const FTSLanguage*> LanguageMapLegacy; +LanguageMapLegacy languageMapV1; } -std::unique_ptr<FTSTokenizer> BasicFTSLanguage::createTokenizer() const { - return stdx::make_unique<BasicFTSTokenizer>(this); -} +MONGO_INITIALIZER_GROUP(FTSAllLanguagesRegistered, MONGO_NO_PREREQUISITES, MONGO_NO_DEPENDENTS); -const FTSPhraseMatcher& BasicFTSLanguage::getPhraseMatcher() const { - return _basicPhraseMatcher; -} +// FTS Language map. These languages are available with TEXT_INDEX_VERSION_2 and above. +// +// Parameters: +// - C++ unique identifier suffix +// - lower case string name +// - language alias +// +#define MONGO_FTS_LANGUAGE_LIST(MONGO_FTS_LANGUAGE_DECL) \ + MONGO_FTS_LANGUAGE_DECL(Danish, "danish", "da") \ + MONGO_FTS_LANGUAGE_DECL(Dutch, "dutch", "nl") \ + MONGO_FTS_LANGUAGE_DECL(English, "english", "en") \ + MONGO_FTS_LANGUAGE_DECL(Finnish, "finnish", "fi") \ + MONGO_FTS_LANGUAGE_DECL(French, "french", "fr") \ + MONGO_FTS_LANGUAGE_DECL(German, "german", "de") \ + MONGO_FTS_LANGUAGE_DECL(Hungarian, "hungarian", "hu") \ + MONGO_FTS_LANGUAGE_DECL(Italian, "italian", "it") \ + MONGO_FTS_LANGUAGE_DECL(Norwegian, "norwegian", "nb") \ + MONGO_FTS_LANGUAGE_DECL(Portuguese, "portuguese", "pt") \ + MONGO_FTS_LANGUAGE_DECL(Romanian, "romanian", "ro") \ + MONGO_FTS_LANGUAGE_DECL(Russian, "russian", "ru") \ + MONGO_FTS_LANGUAGE_DECL(Spanish, "spanish", "es") \ + MONGO_FTS_LANGUAGE_DECL(Swedish, "swedish", "sv") \ + MONGO_FTS_LANGUAGE_DECL(Turkish, "turkish", "tr") -MONGO_INITIALIZER_GROUP(FTSAllLanguagesRegistered, MONGO_NO_PREREQUISITES, MONGO_NO_DEPENDENTS); +// Declare compilation unit local language object. +// Must be declared statically as global language map only keeps a pointer to the language +// instance. // -// Register supported languages' canonical names for TEXT_INDEX_VERSION_2. +#define LANGUAGE_DECLV2(id, name, alias) BasicFTSLanguage language##id##V2; + +#define LANGUAGE_DECLV3(id, name, alias) UnicodeFTSLanguage language##id##V3(name); + +BasicFTSLanguage languageNoneV2; +MONGO_FTS_LANGUAGE_LIST(LANGUAGE_DECLV2); + +UnicodeFTSLanguage languageNoneV3("none"); +MONGO_FTS_LANGUAGE_LIST(LANGUAGE_DECLV3); + +// Registers each language and language aliases in the language map. // +#define LANGUAGE_INITV2(id, name, alias) \ + FTSLanguage::registerLanguage(name, TEXT_INDEX_VERSION_2, &language##id##V2); + +#define LANGUAGE_INITV3(id, name, alias) \ + FTSLanguage::registerLanguage(name, TEXT_INDEX_VERSION_3, &language##id##V3); -MONGO_FTS_LANGUAGE_DECLARE(languageNoneV2, "none", TEXT_INDEX_VERSION_2); -MONGO_FTS_LANGUAGE_DECLARE(languageDanishV2, "danish", TEXT_INDEX_VERSION_2); -MONGO_FTS_LANGUAGE_DECLARE(languageDutchV2, "dutch", TEXT_INDEX_VERSION_2); -MONGO_FTS_LANGUAGE_DECLARE(languageEnglishV2, "english", TEXT_INDEX_VERSION_2); -MONGO_FTS_LANGUAGE_DECLARE(languageFinnishV2, "finnish", TEXT_INDEX_VERSION_2); -MONGO_FTS_LANGUAGE_DECLARE(languageFrenchV2, "french", TEXT_INDEX_VERSION_2); -MONGO_FTS_LANGUAGE_DECLARE(languageGermanV2, "german", TEXT_INDEX_VERSION_2); -MONGO_FTS_LANGUAGE_DECLARE(languageHungarianV2, "hungarian", TEXT_INDEX_VERSION_2); -MONGO_FTS_LANGUAGE_DECLARE(languageItalianV2, "italian", TEXT_INDEX_VERSION_2); -MONGO_FTS_LANGUAGE_DECLARE(languageNorwegianV2, "norwegian", TEXT_INDEX_VERSION_2); -MONGO_FTS_LANGUAGE_DECLARE(languagePortugueseV2, "portuguese", TEXT_INDEX_VERSION_2); -MONGO_FTS_LANGUAGE_DECLARE(languageRomanianV2, "romanian", TEXT_INDEX_VERSION_2); -MONGO_FTS_LANGUAGE_DECLARE(languageRussianV2, "russian", TEXT_INDEX_VERSION_2); -MONGO_FTS_LANGUAGE_DECLARE(languageSpanishV2, "spanish", TEXT_INDEX_VERSION_2); -MONGO_FTS_LANGUAGE_DECLARE(languageSwedishV2, "swedish", TEXT_INDEX_VERSION_2); -MONGO_FTS_LANGUAGE_DECLARE(languageTurkishV2, "turkish", TEXT_INDEX_VERSION_2); +/** + * Registers each language in the language map. + */ +MONGO_INITIALIZER_GENERAL(FTSRegisterV2LanguagesAndLater, + MONGO_NO_PREREQUISITES, + ("FTSAllLanguagesRegistered")) +(::mongo::InitializerContext* context) { + FTSLanguage::registerLanguage("none", TEXT_INDEX_VERSION_2, &languageNoneV2); + MONGO_FTS_LANGUAGE_LIST(LANGUAGE_INITV2); + + FTSLanguage::registerLanguage("none", TEXT_INDEX_VERSION_3, &languageNoneV3); + MONGO_FTS_LANGUAGE_LIST(LANGUAGE_INITV3); + return Status::OK(); +} + +#define LANGUAGE_ALIASV2(id, name, alias) \ + FTSLanguage::registerLanguageAlias(&language##id##V2, alias, TEXT_INDEX_VERSION_2); + +#define LANGUAGE_ALIASV3(id, name, alias) \ + FTSLanguage::registerLanguageAlias(&language##id##V3, alias, TEXT_INDEX_VERSION_3); + +/** + * Registers each language alias in the language map. + */ +MONGO_INITIALIZER_WITH_PREREQUISITES(FTSRegisterLanguageAliases, ("FTSAllLanguagesRegistered")) +(InitializerContext* context) { + // Register language aliases for TEXT_INDEX_VERSION_2. + MONGO_FTS_LANGUAGE_LIST(LANGUAGE_ALIASV2); + // Register language aliases for TEXT_INDEX_VERSION_3. + MONGO_FTS_LANGUAGE_LIST(LANGUAGE_ALIASV3); + return Status::OK(); +} // // Register all Snowball language modules for TEXT_INDEX_VERSION_1. Note that only the full @@ -172,59 +227,39 @@ MONGO_FTS_LANGUAGE_DECLARE(languageTrV1, "tr", TEXT_INDEX_VERSION_1); MONGO_FTS_LANGUAGE_DECLARE(languageTurV1, "tur", TEXT_INDEX_VERSION_1); MONGO_FTS_LANGUAGE_DECLARE(languageTurkishV1, "turkish", TEXT_INDEX_VERSION_1); -MONGO_INITIALIZER_WITH_PREREQUISITES(FTSRegisterLanguageAliases, ("FTSAllLanguagesRegistered")) -(InitializerContext* context) { - // Register language aliases for TEXT_INDEX_VERSION_2. - FTSLanguage::registerLanguageAlias(&languageDanishV2, "da", TEXT_INDEX_VERSION_2); - FTSLanguage::registerLanguageAlias(&languageDutchV2, "nl", TEXT_INDEX_VERSION_2); - FTSLanguage::registerLanguageAlias(&languageEnglishV2, "en", TEXT_INDEX_VERSION_2); - FTSLanguage::registerLanguageAlias(&languageFinnishV2, "fi", TEXT_INDEX_VERSION_2); - FTSLanguage::registerLanguageAlias(&languageFrenchV2, "fr", TEXT_INDEX_VERSION_2); - FTSLanguage::registerLanguageAlias(&languageGermanV2, "de", TEXT_INDEX_VERSION_2); - FTSLanguage::registerLanguageAlias(&languageHungarianV2, "hu", TEXT_INDEX_VERSION_2); - FTSLanguage::registerLanguageAlias(&languageItalianV2, "it", TEXT_INDEX_VERSION_2); - FTSLanguage::registerLanguageAlias(&languageNorwegianV2, "nb", TEXT_INDEX_VERSION_2); - FTSLanguage::registerLanguageAlias(&languagePortugueseV2, "pt", TEXT_INDEX_VERSION_2); - FTSLanguage::registerLanguageAlias(&languageRomanianV2, "ro", TEXT_INDEX_VERSION_2); - FTSLanguage::registerLanguageAlias(&languageRussianV2, "ru", TEXT_INDEX_VERSION_2); - FTSLanguage::registerLanguageAlias(&languageSpanishV2, "es", TEXT_INDEX_VERSION_2); - FTSLanguage::registerLanguageAlias(&languageSwedishV2, "sv", TEXT_INDEX_VERSION_2); - FTSLanguage::registerLanguageAlias(&languageTurkishV2, "tr", TEXT_INDEX_VERSION_2); - return Status::OK(); -} - // static void FTSLanguage::registerLanguage(StringData languageName, TextIndexVersion textIndexVersion, FTSLanguage* language) { verify(!languageName.empty()); language->_canonicalName = languageName.toString(); - switch (textIndexVersion) { - case TEXT_INDEX_VERSION_2: - languageMapV2[languageName.toString()] = language; - return; - case TEXT_INDEX_VERSION_1: - verify(languageMapV1.find(languageName) == languageMapV1.end()); - languageMapV1[languageName] = language; - return; + + if (textIndexVersion >= TEXT_INDEX_VERSION_2) { + LanguageMap* languageMap = + (textIndexVersion == TEXT_INDEX_VERSION_3) ? &languageMapV3 : &languageMapV2; + (*languageMap)[languageName.toString()] = language; + } else { + // Legacy text index. + invariant(textIndexVersion == TEXT_INDEX_VERSION_1); + verify(languageMapV1.find(languageName) == languageMapV1.end()); + languageMapV1[languageName] = language; } - verify(false); } // static void FTSLanguage::registerLanguageAlias(const FTSLanguage* language, StringData alias, TextIndexVersion textIndexVersion) { - switch (textIndexVersion) { - case TEXT_INDEX_VERSION_2: - languageMapV2[alias.toString()] = language; - return; - case TEXT_INDEX_VERSION_1: - verify(languageMapV1.find(alias) == languageMapV1.end()); - languageMapV1[alias] = language; - return; + if (textIndexVersion >= TEXT_INDEX_VERSION_2) { + LanguageMap* languageMap = + (textIndexVersion == TEXT_INDEX_VERSION_3) ? &languageMapV3 : &languageMapV2; + (*languageMap)[alias.toString()] = language; + } else { + // Legacy text index. + invariant(textIndexVersion == TEXT_INDEX_VERSION_1); + verify(languageMapV1.find(alias) == languageMapV1.end()); + languageMapV1[alias] = language; } - verify(false); } FTSLanguage::FTSLanguage() : _canonicalName() {} @@ -236,31 +271,48 @@ const std::string& FTSLanguage::str() const { // static StatusWithFTSLanguage FTSLanguage::make(StringData langName, TextIndexVersion textIndexVersion) { - switch (textIndexVersion) { - case TEXT_INDEX_VERSION_2: { - LanguageMapV2::const_iterator it = languageMapV2.find(langName.toString()); - if (it == languageMapV2.end()) { - // TEXT_INDEX_VERSION_2 rejects unrecognized language strings. - Status status = Status(ErrorCodes::BadValue, - mongoutils::str::stream() << "unsupported language: \"" - << langName << "\""); - return StatusWithFTSLanguage(status); - } + if (textIndexVersion >= TEXT_INDEX_VERSION_2) { + LanguageMap* languageMap = + (textIndexVersion == TEXT_INDEX_VERSION_3) ? &languageMapV3 : &languageMapV2; + + LanguageMap::const_iterator it = languageMap->find(langName.toString()); - return StatusWithFTSLanguage(it->second); + if (it == languageMap->end()) { + // TEXT_INDEX_VERSION_2 and above reject unrecognized language strings. + Status status = Status(ErrorCodes::BadValue, + mongoutils::str::stream() + << "unsupported language: \"" << langName + << "\" for text index version " << textIndexVersion); + return StatusWithFTSLanguage(status); } - case TEXT_INDEX_VERSION_1: { - LanguageMapV1::const_iterator it = languageMapV1.find(langName); - if (it == languageMapV1.end()) { - // TEXT_INDEX_VERSION_1 treats unrecognized language strings as "none". - return StatusWithFTSLanguage(&languageNoneV1); - } - return StatusWithFTSLanguage(it->second); + + return StatusWithFTSLanguage(it->second); + } else { + // Legacy text index. + invariant(textIndexVersion == TEXT_INDEX_VERSION_1); + LanguageMapLegacy::const_iterator it = languageMapV1.find(langName); + if (it == languageMapV1.end()) { + // TEXT_INDEX_VERSION_1 treats unrecognized language strings as "none". + return StatusWithFTSLanguage(&languageNoneV1); } + return StatusWithFTSLanguage(it->second); } +} + +std::unique_ptr<FTSTokenizer> BasicFTSLanguage::createTokenizer() const { + return stdx::make_unique<BasicFTSTokenizer>(this); +} + +const FTSPhraseMatcher& BasicFTSLanguage::getPhraseMatcher() const { + return _basicPhraseMatcher; +} + +std::unique_ptr<FTSTokenizer> UnicodeFTSLanguage::createTokenizer() const { + return stdx::make_unique<UnicodeFTSTokenizer>(this); +} - verify(false); - return StatusWithFTSLanguage(Status::OK()); +const FTSPhraseMatcher& UnicodeFTSLanguage::getPhraseMatcher() const { + return _unicodePhraseMatcher; } } } diff --git a/src/mongo/db/fts/fts_language.h b/src/mongo/db/fts/fts_language.h index 6c986f5de6e..062a3255ba1 100644 --- a/src/mongo/db/fts/fts_language.h +++ b/src/mongo/db/fts/fts_language.h @@ -32,6 +32,7 @@ #include "mongo/db/fts/fts_basic_phrase_matcher.h" #include "mongo/db/fts/fts_phrase_matcher.h" +#include "mongo/db/fts/fts_unicode_phrase_matcher.h" #include "mongo/db/fts/fts_util.h" #include "mongo/base/status_with.h" @@ -43,6 +44,7 @@ namespace fts { class FTSTokenizer; +// Legacy language initialization. #define MONGO_FTS_LANGUAGE_DECLARE(language, name, version) \ BasicFTSLanguage language; \ MONGO_INITIALIZER_GENERAL(language, MONGO_NO_PREREQUISITES, ("FTSAllLanguagesRegistered")) \ @@ -57,7 +59,7 @@ class FTSTokenizer; * * Recommended usage: * - * StatusWithFTSLanguage swl = FTSLanguage::make( "en", TEXT_INDEX_VERSION_2 ); + * StatusWithFTSLanguage swl = FTSLanguage::make( "en", TEXT_INDEX_VERSION_3 ); * if ( !swl.getStatus().isOK() ) { * // Error. * } @@ -84,7 +86,7 @@ public: /** * Returns a new FTSTokenizer instance for this language. - * Lifetime is scoped to FTSLanguage (which are currently all process lifetime) + * Lifetime is scoped to FTSLanguage (which are currently all process lifetime). */ virtual std::unique_ptr<FTSTokenizer> createTokenizer() const = 0; @@ -94,10 +96,9 @@ public: virtual const FTSPhraseMatcher& getPhraseMatcher() const = 0; /** - * Register std::string 'languageName' as a new language with text index version + * Register std::string 'languageName' as a new language with the text index version * 'textIndexVersion'. Saves the resulting language to out-argument 'languageOut'. - * Subsequent calls to FTSLanguage::make() will recognize the newly-registered language - * string. + * Subsequent calls to FTSLanguage::make() will recognize the newly-registered language string. */ static void registerLanguage(StringData languageName, TextIndexVersion textIndexVersion, @@ -113,15 +114,15 @@ public: TextIndexVersion textIndexVersion); /** - * Return the FTSLanguage associated with the given language string. Returns an error - * Status if an invalid language std::string is passed. + * Return the FTSLanguage associated with the given language string and the given text index + * version. Returns an error Status if an invalid language std::string is passed. * - * For textIndexVersion=TEXT_INDEX_VERSION_2, language strings are + * For textIndexVersion >= TEXT_INDEX_VERSION_2, language strings are * case-insensitive, and need to be in one of the two following forms: * - English name, like "spanish". * - Two-letter code, like "es". * - * For textIndexVersion=TEXT_INDEX_VERSION_1, no validation or normalization of + * For textIndexVersion == TEXT_INDEX_VERSION_1, no validation or normalization of * language strings is performed. This is necessary to preserve indexing behavior for * documents with language strings like "en": for compatibility, text data in these * documents needs to be processed with the English stemmer and the empty stopword list @@ -137,7 +138,10 @@ private: typedef StatusWith<const FTSLanguage*> StatusWithFTSLanguage; - +/** + * FTSLanguage implementation that returns a BasicFTSTokenizer and BasicFTSPhraseMatcher for ASCII + * aware case folding in FTS. + */ class BasicFTSLanguage : public FTSLanguage { public: std::unique_ptr<FTSTokenizer> createTokenizer() const final; @@ -147,6 +151,20 @@ private: BasicFTSPhraseMatcher _basicPhraseMatcher; }; +/** + * FTSLanguage implementation that returns a UnicodeFTSTokenizer and UnicodeFTSPhraseMatcher for + * Unicode aware case folding and diacritic removal in FTS. + */ +class UnicodeFTSLanguage : public FTSLanguage { +public: + UnicodeFTSLanguage(const std::string& languageName) : _unicodePhraseMatcher(languageName) {} + std::unique_ptr<FTSTokenizer> createTokenizer() const final; + const FTSPhraseMatcher& getPhraseMatcher() const final; + +private: + UnicodeFTSPhraseMatcher _unicodePhraseMatcher; +}; + extern BasicFTSLanguage languagePorterV1; extern BasicFTSLanguage languageEnglishV2; extern BasicFTSLanguage languageFrenchV2; diff --git a/src/mongo/db/fts/fts_language_test.cpp b/src/mongo/db/fts/fts_language_test.cpp index c24f02ff7fd..87e37272850 100644 --- a/src/mongo/db/fts/fts_language_test.cpp +++ b/src/mongo/db/fts/fts_language_test.cpp @@ -37,6 +37,50 @@ namespace mongo { namespace fts { +// Positive tests for FTSLanguage::make() with TEXT_INDEX_VERSION_3. + +TEST(FTSLanguageV3, ExactLanguage) { + StatusWithFTSLanguage swl = FTSLanguage::make("spanish", TEXT_INDEX_VERSION_3); + ASSERT(swl.getStatus().isOK()); + ASSERT_EQUALS(swl.getValue()->str(), "spanish"); +} + +TEST(FTSLanguageV3, ExactCode) { + StatusWithFTSLanguage swl = FTSLanguage::make("es", TEXT_INDEX_VERSION_3); + ASSERT(swl.getStatus().isOK()); + ASSERT_EQUALS(swl.getValue()->str(), "spanish"); +} + +TEST(FTSLanguageV3, UpperCaseLanguage) { + StatusWithFTSLanguage swl = FTSLanguage::make("SPANISH", TEXT_INDEX_VERSION_3); + ASSERT(swl.getStatus().isOK()); + ASSERT_EQUALS(swl.getValue()->str(), "spanish"); +} + +TEST(FTSLanguageV3, UpperCaseCode) { + StatusWithFTSLanguage swl = FTSLanguage::make("ES", TEXT_INDEX_VERSION_3); + ASSERT(swl.getStatus().isOK()); + ASSERT_EQUALS(swl.getValue()->str(), "spanish"); +} + +TEST(FTSLanguageV3, NoneLanguage) { + StatusWithFTSLanguage swl = FTSLanguage::make("none", TEXT_INDEX_VERSION_3); + ASSERT(swl.getStatus().isOK()); + ASSERT_EQUALS(swl.getValue()->str(), "none"); +} + +// Negative tests for FTSLanguage::make() with TEXT_INDEX_VERSION_3. + +TEST(FTSLanguageV3, Empty) { + StatusWithFTSLanguage swl = FTSLanguage::make("", TEXT_INDEX_VERSION_3); + ASSERT(!swl.getStatus().isOK()); +} + +TEST(FTSLanguageV3, Unknown) { + StatusWithFTSLanguage swl = FTSLanguage::make("spanglish", TEXT_INDEX_VERSION_3); + ASSERT(!swl.getStatus().isOK()); +} + // Positive tests for FTSLanguage::make() with TEXT_INDEX_VERSION_2. TEST(FTSLanguageV2, ExactLanguage) { diff --git a/src/mongo/db/fts/fts_matcher.cpp b/src/mongo/db/fts/fts_matcher.cpp index a4b2a6e4638..7689f15acf3 100644 --- a/src/mongo/db/fts/fts_matcher.cpp +++ b/src/mongo/db/fts/fts_matcher.cpp @@ -80,10 +80,7 @@ bool FTSMatcher::hasPositiveTerm(const BSONObj& obj) const { bool FTSMatcher::_hasPositiveTerm_string(const FTSLanguage* language, const string& raw) const { std::unique_ptr<FTSTokenizer> tokenizer(language->createTokenizer()); - - tokenizer->reset(raw.c_str(), - _query.getCaseSensitive() ? FTSTokenizer::kGenerateCaseSensitiveTokens - : FTSTokenizer::kNone); + tokenizer->reset(raw.c_str(), _getTokenizerOptions()); while (tokenizer->moveNext()) { string word = tokenizer->get().toString(); @@ -113,10 +110,7 @@ bool FTSMatcher::hasNegativeTerm(const BSONObj& obj) const { bool FTSMatcher::_hasNegativeTerm_string(const FTSLanguage* language, const string& raw) const { std::unique_ptr<FTSTokenizer> tokenizer(language->createTokenizer()); - - tokenizer->reset(raw.c_str(), - _query.getCaseSensitive() ? FTSTokenizer::kGenerateCaseSensitiveTokens - : FTSTokenizer::kNone); + tokenizer->reset(raw.c_str(), _getTokenizerOptions()); while (tokenizer->moveNext()) { string word = tokenizer->get().toString(); @@ -153,16 +147,34 @@ bool FTSMatcher::_phraseMatch(const string& phrase, const BSONObj& obj) const { while (it.more()) { FTSIteratorValue val = it.next(); - if (val._language->getPhraseMatcher().phraseMatches(phrase, - val._text, - _query.getCaseSensitive() - ? FTSPhraseMatcher::kCaseSensitive - : FTSPhraseMatcher::kNone)) { + FTSPhraseMatcher::Options matcherOptions = FTSPhraseMatcher::kNone; + + if (_query.getCaseSensitive()) { + matcherOptions |= FTSPhraseMatcher::kCaseSensitive; + } + if (_query.getDiacriticSensitive()) { + matcherOptions |= FTSPhraseMatcher::kDiacriticSensitive; + } + + if (val._language->getPhraseMatcher().phraseMatches(phrase, val._text, matcherOptions)) { return true; } } return false; } + +FTSTokenizer::Options FTSMatcher::_getTokenizerOptions() const { + FTSTokenizer::Options tokenizerOptions = FTSTokenizer::kNone; + + if (_query.getCaseSensitive()) { + tokenizerOptions |= FTSTokenizer::kGenerateCaseSensitiveTokens; + } + if (_query.getDiacriticSensitive()) { + tokenizerOptions |= FTSTokenizer::kGenerateDiacriticSensitiveTokens; + } + + return tokenizerOptions; +} } } diff --git a/src/mongo/db/fts/fts_matcher.h b/src/mongo/db/fts/fts_matcher.h index 00fe8291c4d..45bfa360b09 100644 --- a/src/mongo/db/fts/fts_matcher.h +++ b/src/mongo/db/fts/fts_matcher.h @@ -32,6 +32,7 @@ #include "mongo/db/fts/fts_query.h" #include "mongo/db/fts/fts_spec.h" +#include "mongo/db/fts/fts_tokenizer.h" #include "mongo/db/fts/tokenizer.h" namespace mongo { @@ -81,7 +82,7 @@ private: * check. */ bool canSkipPositiveTermCheck() const { - return !_query.getCaseSensitive(); + return !_query.getCaseSensitive() && !_query.getDiacriticSensitive(); } /** @@ -101,6 +102,12 @@ private: */ bool _phraseMatch(const std::string& phrase, const BSONObj& obj) const; + /** + * Helper method that returns the tokenizer options that this matcher should use, based on the + * the query options. + */ + FTSTokenizer::Options _getTokenizerOptions() const; + // TODO These should be unowned pointers instead of owned copies. const FTSQuery _query; const FTSSpec _spec; diff --git a/src/mongo/db/fts/fts_matcher_test.cpp b/src/mongo/db/fts/fts_matcher_test.cpp index 13eb74609dc..246510a9e70 100644 --- a/src/mongo/db/fts/fts_matcher_test.cpp +++ b/src/mongo/db/fts/fts_matcher_test.cpp @@ -38,7 +38,7 @@ namespace fts { TEST(FTSMatcher, NegWild1) { FTSQuery q; - ASSERT_OK(q.parse("foo -bar", "english", false, TEXT_INDEX_VERSION_2)); + ASSERT_OK(q.parse("foo -bar", "english", false, false, TEXT_INDEX_VERSION_3)); FTSMatcher m(q, FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("$**" << "text"))))); @@ -52,7 +52,7 @@ TEST(FTSMatcher, NegWild1) { // Regression test for SERVER-11994. TEST(FTSMatcher, NegWild2) { FTSQuery q; - ASSERT_OK(q.parse("pizza -restaurant", "english", false, TEXT_INDEX_VERSION_2)); + ASSERT_OK(q.parse("pizza -restaurant", "english", false, false, TEXT_INDEX_VERSION_3)); FTSMatcher m(q, FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("$**" << "text"))))); @@ -65,7 +65,7 @@ TEST(FTSMatcher, NegWild2) { TEST(FTSMatcher, Phrase1) { FTSQuery q; - ASSERT_OK(q.parse("foo \"table top\"", "english", false, TEXT_INDEX_VERSION_2)); + ASSERT_OK(q.parse("foo \"table top\"", "english", false, false, TEXT_INDEX_VERSION_3)); FTSMatcher m(q, FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("$**" << "text"))))); @@ -87,7 +87,7 @@ TEST(FTSMatcher, Phrase1) { TEST(FTSMatcher, Phrase2) { FTSQuery q; - ASSERT_OK(q.parse("foo \"table top\"", "english", false, TEXT_INDEX_VERSION_2)); + ASSERT_OK(q.parse("foo \"table top\"", "english", false, false, TEXT_INDEX_VERSION_3)); FTSMatcher m(q, FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("x" << "text"))))); @@ -98,7 +98,7 @@ TEST(FTSMatcher, Phrase2) { // language. TEST(FTSMatcher, ParsesUsingDocLanguage) { FTSQuery q; - ASSERT_OK(q.parse("-glad", "none", false, TEXT_INDEX_VERSION_2)); + ASSERT_OK(q.parse("-glad", "none", false, false, TEXT_INDEX_VERSION_3)); FTSMatcher m(q, FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("x" << "text"))))); @@ -112,7 +112,7 @@ TEST(FTSMatcher, ParsesUsingDocLanguage) { // Test the matcher does not filter out stop words from positive terms TEST(FTSMatcher, MatcherDoesNotFilterStopWordsNeg) { FTSQuery q; - ASSERT_OK(q.parse("-the", "none", false, TEXT_INDEX_VERSION_2)); + ASSERT_OK(q.parse("-the", "none", false, false, TEXT_INDEX_VERSION_3)); FTSMatcher m(q, FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("x" << "text"))))); @@ -124,7 +124,7 @@ TEST(FTSMatcher, MatcherDoesNotFilterStopWordsNeg) { // Test the matcher does not filter out stop words from negative terms TEST(FTSMatcher, MatcherDoesNotFilterStopWordsPos) { FTSQuery q; - ASSERT_OK(q.parse("the", "none", false, TEXT_INDEX_VERSION_2)); + ASSERT_OK(q.parse("the", "none", false, false, TEXT_INDEX_VERSION_3)); FTSMatcher m(q, FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("x" << "text"))))); @@ -137,7 +137,7 @@ TEST(FTSMatcher, MatcherDoesNotFilterStopWordsPos) { // case-sensitive text query 'search'. static bool docHasPositiveTermWithCase(const std::string& doc, const std::string& search) { FTSQuery q; - ASSERT_OK(q.parse(search, "english", true, TEXT_INDEX_VERSION_2)); + ASSERT_OK(q.parse(search, "english", true, false, TEXT_INDEX_VERSION_3)); FTSMatcher m(q, FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("x" << "text"))))); @@ -164,7 +164,7 @@ TEST(FTSMatcher, HasPositiveTermCaseSensitive) { // case-sensitive text query 'search'. static bool docHasNegativeTermWithCase(const std::string& doc, const std::string& search) { FTSQuery q; - ASSERT_OK(q.parse(search, "english", true, TEXT_INDEX_VERSION_2)); + ASSERT_OK(q.parse(search, "english", true, false, TEXT_INDEX_VERSION_3)); FTSMatcher m(q, FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("x" << "text"))))); @@ -191,7 +191,7 @@ TEST(FTSMatcher, HasNegativeTermCaseSensitive) { // from case-sensitive text query 'search'. static bool docPositivePhrasesMatchWithCase(const std::string& doc, const std::string& search) { FTSQuery q; - ASSERT_OK(q.parse(search, "english", true, TEXT_INDEX_VERSION_2)); + ASSERT_OK(q.parse(search, "english", true, false, TEXT_INDEX_VERSION_3)); FTSMatcher m(q, FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("x" << "text"))))); @@ -214,7 +214,7 @@ TEST(FTSMatcher, PositivePhrasesMatchWithCase) { // from case-sensitive text query 'search'. static bool docNegativePhrasesMatchWithCase(const std::string& doc, const std::string& search) { FTSQuery q; - ASSERT_OK(q.parse(search, "english", true, TEXT_INDEX_VERSION_2)); + ASSERT_OK(q.parse(search, "english", true, false, TEXT_INDEX_VERSION_3)); FTSMatcher m(q, FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("x" << "text"))))); diff --git a/src/mongo/db/fts/fts_query.cpp b/src/mongo/db/fts/fts_query.cpp index 9fbf0e04978..f162481066b 100644 --- a/src/mongo/db/fts/fts_query.cpp +++ b/src/mongo/db/fts/fts_query.cpp @@ -50,10 +50,12 @@ using std::stringstream; using std::vector; const bool FTSQuery::caseSensitiveDefault = false; +const bool FTSQuery::diacriticSensitiveDefault = false; Status FTSQuery::parse(const string& query, StringData language, bool caseSensitive, + bool diacriticSensitive, TextIndexVersion textIndexVersion) { StatusWithFTSLanguage swl = FTSLanguage::make(language, textIndexVersion); if (!swl.getStatus().isOK()) { @@ -61,6 +63,7 @@ Status FTSQuery::parse(const string& query, } _language = swl.getValue(); _caseSensitive = caseSensitive; + _diacriticSensitive = diacriticSensitive; // Build a space delimited list of words to have the FtsTokenizer tokenize string positiveTermSentence; @@ -148,21 +151,29 @@ void FTSQuery::_addTerms(FTSTokenizer* tokenizer, const string& sentence, bool n } // Compute the string corresponding to 'token' that will be used for the matcher. - // For case-insensitive queries, this is the same string as 'boundsTerm' computed - // above. - if (!_caseSensitive) { + // For case and diacritic insensitive queries, this is the same string as 'boundsTerm' + // computed above. + if (!_caseSensitive && !_diacriticSensitive) { activeTerms.insert(word); } } - if (!_caseSensitive) { + if (!_caseSensitive && !_diacriticSensitive) { return; } - tokenizer->reset(sentence.c_str(), - FTSTokenizer::kFilterStopWords | FTSTokenizer::kGenerateCaseSensitiveTokens); + FTSTokenizer::Options newOptions = FTSTokenizer::kFilterStopWords; - // If we want case-sensitivity, get the case-sensitive token + if (_caseSensitive) { + newOptions |= FTSTokenizer::kGenerateCaseSensitiveTokens; + } + if (_diacriticSensitive) { + newOptions |= FTSTokenizer::kGenerateDiacriticSensitiveTokens; + } + + tokenizer->reset(sentence.c_str(), newOptions); + + // If we want case-sensitivity or diacritic sensitivity, get the correct token. while (tokenizer->moveNext()) { string word = tokenizer->get().toString(); diff --git a/src/mongo/db/fts/fts_query.h b/src/mongo/db/fts/fts_query.h index cac73425ffb..ea1882e4baf 100644 --- a/src/mongo/db/fts/fts_query.h +++ b/src/mongo/db/fts/fts_query.h @@ -48,10 +48,12 @@ public: // index version, since a query which doesn't specify a language and is against a // version 1 text index with a version 1 default language string needs to be parsed as // version 1 (see fts_language.cpp for a list of language strings specific to version - // 1). + // 1). Note that the diacritic sensitive option has no effect on FTS queries below index version + // 3. Status parse(const std::string& query, StringData language, bool caseSensitive, + bool diacriticSensitive, TextIndexVersion textIndexVersion); const std::set<std::string>& getPositiveTerms() const { @@ -77,6 +79,9 @@ public: bool getCaseSensitive() const { return _caseSensitive; } + bool getDiacriticSensitive() const { + return _diacriticSensitive; + } std::string toString() const; @@ -85,12 +90,14 @@ public: BSONObj toBSON() const; static const bool caseSensitiveDefault; + static const bool diacriticSensitiveDefault; private: void _addTerms(FTSTokenizer* tokenizer, const std::string& tokens, bool negated); const FTSLanguage* _language; bool _caseSensitive; + bool _diacriticSensitive; // Positive terms. std::set<std::string> _positiveTerms; diff --git a/src/mongo/db/fts/fts_query_test.cpp b/src/mongo/db/fts/fts_query_test.cpp index a4a841c7f16..bcf9e537142 100644 --- a/src/mongo/db/fts/fts_query_test.cpp +++ b/src/mongo/db/fts/fts_query_test.cpp @@ -37,7 +37,7 @@ namespace fts { TEST(FTSQuery, Basic1) { FTSQuery q; - ASSERT(q.parse("this is fun", "english", false, TEXT_INDEX_VERSION_2).isOK()); + ASSERT(q.parse("this is fun", "english", false, false, TEXT_INDEX_VERSION_3).isOK()); ASSERT_EQUALS(false, q.getCaseSensitive()); ASSERT_EQUALS(1U, q.getPositiveTerms().size()); @@ -50,7 +50,7 @@ TEST(FTSQuery, Basic1) { TEST(FTSQuery, ParsePunctuation) { FTSQuery q; - ASSERT(q.parse("hello.world", "english", false, TEXT_INDEX_VERSION_2).isOK()); + ASSERT(q.parse("hello.world", "english", false, false, TEXT_INDEX_VERSION_3).isOK()); ASSERT_EQUALS(false, q.getCaseSensitive()); ASSERT_EQUALS(2U, q.getPositiveTerms().size()); @@ -64,7 +64,7 @@ TEST(FTSQuery, ParsePunctuation) { TEST(FTSQuery, Neg1) { FTSQuery q; - ASSERT(q.parse("this is -really fun", "english", false, TEXT_INDEX_VERSION_2).isOK()); + ASSERT(q.parse("this is -really fun", "english", false, false, TEXT_INDEX_VERSION_3).isOK()); ASSERT_EQUALS(1U, q.getPositiveTerms().size()); ASSERT_EQUALS("fun", *q.getPositiveTerms().begin()); @@ -75,8 +75,8 @@ TEST(FTSQuery, Neg1) { TEST(FTSQuery, Phrase1) { FTSQuery q; - ASSERT( - q.parse("doing a \"phrase test\" for fun", "english", false, TEXT_INDEX_VERSION_2).isOK()); + ASSERT(q.parse("doing a \"phrase test\" for fun", "english", false, false, TEXT_INDEX_VERSION_3) + .isOK()); ASSERT_EQUALS(3U, q.getPositiveTerms().size()); ASSERT_EQUALS(0U, q.getNegatedTerms().size()); @@ -90,8 +90,8 @@ TEST(FTSQuery, Phrase1) { TEST(FTSQuery, Phrase2) { FTSQuery q; - ASSERT( - q.parse("doing a \"phrase-test\" for fun", "english", false, TEXT_INDEX_VERSION_2).isOK()); + ASSERT(q.parse("doing a \"phrase-test\" for fun", "english", false, false, TEXT_INDEX_VERSION_3) + .isOK()); ASSERT_EQUALS(1U, q.getPositivePhr().size()); ASSERT_EQUALS("phrase-test", q.getPositivePhr()[0]); } @@ -99,19 +99,20 @@ TEST(FTSQuery, Phrase2) { TEST(FTSQuery, NegPhrase1) { FTSQuery q; ASSERT( - q.parse("doing a -\"phrase test\" for fun", "english", false, TEXT_INDEX_VERSION_2).isOK()); + q.parse("doing a -\"phrase test\" for fun", "english", false, false, TEXT_INDEX_VERSION_3) + .isOK()); ASSERT_EQUALS("fun||||||phrase test", q.debugString()); } TEST(FTSQuery, CaseSensitiveOption) { FTSQuery q; - ASSERT(q.parse("this is fun", "english", true, TEXT_INDEX_VERSION_2).isOK()); + ASSERT(q.parse("this is fun", "english", true, false, TEXT_INDEX_VERSION_3).isOK()); ASSERT_EQUALS(true, q.getCaseSensitive()); } TEST(FTSQuery, CaseSensitivePositiveTerms) { FTSQuery q; - ASSERT(q.parse("This is Positively fun", "english", true, TEXT_INDEX_VERSION_2).isOK()); + ASSERT(q.parse("This is Positively fun", "english", true, false, TEXT_INDEX_VERSION_3).isOK()); ASSERT_EQUALS(2U, q.getTermsForBounds().size()); ASSERT_EQUALS(1, @@ -127,8 +128,8 @@ TEST(FTSQuery, CaseSensitivePositiveTerms) { TEST(FTSQuery, CaseSensitiveNegativeTerms) { FTSQuery q; - ASSERT( - q.parse("-This -is -Negatively -miserable", "english", true, TEXT_INDEX_VERSION_2).isOK()); + ASSERT(q.parse("-This -is -Negatively -miserable", "english", true, false, TEXT_INDEX_VERSION_3) + .isOK()); ASSERT_EQUALS(0U, q.getPositiveTerms().size()); ASSERT_EQUALS(0U, q.getTermsForBounds().size()); @@ -141,8 +142,8 @@ TEST(FTSQuery, CaseSensitiveNegativeTerms) { TEST(FTSQuery, CaseSensitivePositivePhrases) { FTSQuery q; - ASSERT( - q.parse("doing a \"Phrase Test\" for fun", "english", true, TEXT_INDEX_VERSION_2).isOK()); + ASSERT(q.parse("doing a \"Phrase Test\" for fun", "english", true, false, TEXT_INDEX_VERSION_3) + .isOK()); ASSERT_EQUALS(1U, q.getPositivePhr().size()); ASSERT_EQUALS(0U, q.getNegatedPhr().size()); @@ -151,8 +152,8 @@ TEST(FTSQuery, CaseSensitivePositivePhrases) { TEST(FTSQuery, CaseSensitiveNegativePhrases) { FTSQuery q; - ASSERT( - q.parse("doing a -\"Phrase Test\" for fun", "english", true, TEXT_INDEX_VERSION_2).isOK()); + ASSERT(q.parse("doing a -\"Phrase Test\" for fun", "english", true, false, TEXT_INDEX_VERSION_3) + .isOK()); ASSERT_EQUALS(0U, q.getPositivePhr().size()); ASSERT_EQUALS(1U, q.getNegatedPhr().size()); @@ -162,15 +163,16 @@ TEST(FTSQuery, CaseSensitiveNegativePhrases) { TEST(FTSQuery, Mix1) { FTSQuery q; ASSERT( - q.parse("\"industry\" -Melbourne -Physics", "english", false, TEXT_INDEX_VERSION_2).isOK()); + q.parse("\"industry\" -Melbourne -Physics", "english", false, false, TEXT_INDEX_VERSION_3) + .isOK()); ASSERT_EQUALS("industri||melbourn|physic||industry||", q.debugString()); } TEST(FTSQuery, NegPhrase2) { FTSQuery q1, q2, q3; - ASSERT(q1.parse("foo \"bar\"", "english", false, TEXT_INDEX_VERSION_2).isOK()); - ASSERT(q2.parse("foo \"-bar\"", "english", false, TEXT_INDEX_VERSION_2).isOK()); - ASSERT(q3.parse("foo \" -bar\"", "english", false, TEXT_INDEX_VERSION_2).isOK()); + ASSERT(q1.parse("foo \"bar\"", "english", false, false, TEXT_INDEX_VERSION_3).isOK()); + ASSERT(q2.parse("foo \"-bar\"", "english", false, false, TEXT_INDEX_VERSION_3).isOK()); + ASSERT(q3.parse("foo \" -bar\"", "english", false, false, TEXT_INDEX_VERSION_3).isOK()); ASSERT_EQUALS(2U, q1.getPositiveTerms().size()); ASSERT_EQUALS(2U, q2.getPositiveTerms().size()); @@ -191,9 +193,9 @@ TEST(FTSQuery, NegPhrase2) { TEST(FTSQuery, NegPhrase3) { FTSQuery q1, q2, q3; - ASSERT(q1.parse("foo -\"bar\"", "english", false, TEXT_INDEX_VERSION_2).isOK()); - ASSERT(q2.parse("foo -\"-bar\"", "english", false, TEXT_INDEX_VERSION_2).isOK()); - ASSERT(q3.parse("foo -\" -bar\"", "english", false, TEXT_INDEX_VERSION_2).isOK()); + ASSERT(q1.parse("foo -\"bar\"", "english", false, false, TEXT_INDEX_VERSION_3).isOK()); + ASSERT(q2.parse("foo -\"-bar\"", "english", false, false, TEXT_INDEX_VERSION_3).isOK()); + ASSERT(q3.parse("foo -\" -bar\"", "english", false, false, TEXT_INDEX_VERSION_3).isOK()); ASSERT_EQUALS(1U, q1.getPositiveTerms().size()); ASSERT_EQUALS(1U, q2.getPositiveTerms().size()); @@ -216,7 +218,7 @@ TEST(FTSQuery, NegPhrase3) { // stemmer and stopword list. TEST(FTSQuery, TextIndexVersion1LanguageEnglish) { FTSQuery q; - ASSERT(q.parse("the running", "english", false, TEXT_INDEX_VERSION_1).isOK()); + ASSERT(q.parse("the running", "english", false, false, TEXT_INDEX_VERSION_1).isOK()); ASSERT_EQUALS(1U, q.getPositiveTerms().size()); ASSERT_EQUALS("run", *q.getPositiveTerms().begin()); ASSERT_EQUALS(0U, q.getNegatedTerms().size()); @@ -228,7 +230,7 @@ TEST(FTSQuery, TextIndexVersion1LanguageEnglish) { // no stopword list. TEST(FTSQuery, TextIndexVersion1LanguageEng) { FTSQuery q; - ASSERT(q.parse("the running", "eng", false, TEXT_INDEX_VERSION_1).isOK()); + ASSERT(q.parse("the running", "eng", false, false, TEXT_INDEX_VERSION_1).isOK()); ASSERT_EQUALS(2U, q.getPositiveTerms().size()); ASSERT_EQUALS(1, std::count(q.getPositiveTerms().begin(), q.getPositiveTerms().end(), "the")); ASSERT_EQUALS(1, std::count(q.getPositiveTerms().begin(), q.getPositiveTerms().end(), "run")); @@ -241,7 +243,7 @@ TEST(FTSQuery, TextIndexVersion1LanguageEng) { // and no stopword list will be used. TEST(FTSQuery, TextIndexVersion1LanguageInvalid) { FTSQuery q; - ASSERT(q.parse("the running", "invalid", false, TEXT_INDEX_VERSION_1).isOK()); + ASSERT(q.parse("the running", "invalid", false, false, TEXT_INDEX_VERSION_1).isOK()); ASSERT_EQUALS(2U, q.getPositiveTerms().size()); ASSERT_EQUALS(1, std::count(q.getPositiveTerms().begin(), q.getPositiveTerms().end(), "the")); ASSERT_EQUALS(1, diff --git a/src/mongo/db/fts/fts_spec.cpp b/src/mongo/db/fts/fts_spec.cpp index 1ec72152351..5e5fbeaefa3 100644 --- a/src/mongo/db/fts/fts_spec.cpp +++ b/src/mongo/db/fts/fts_spec.cpp @@ -73,18 +73,26 @@ FTSSpec::FTSSpec(const BSONObj& indexInfo) { "found invalid spec for text index, expected number for textIndexVersion", textIndexVersionElt.isNumber()); - // We currently support TEXT_INDEX_VERSION_1 (deprecated) and TEXT_INDEX_VERSION_2. + // We currently support TEXT_INDEX_VERSION_1 (deprecated), TEXT_INDEX_VERSION_2, and + // TEXT_INDEX_VERSION_3. // Reject all other values. - massert(17364, - str::stream() << "attempt to use unsupported textIndexVersion " - << textIndexVersionElt.numberInt() << "; versions supported: " - << TEXT_INDEX_VERSION_2 << ", " << TEXT_INDEX_VERSION_1, - textIndexVersionElt.numberInt() == TEXT_INDEX_VERSION_2 || - textIndexVersionElt.numberInt() == TEXT_INDEX_VERSION_1); - - _textIndexVersion = (textIndexVersionElt.numberInt() == TEXT_INDEX_VERSION_2) - ? TEXT_INDEX_VERSION_2 - : TEXT_INDEX_VERSION_1; + switch (textIndexVersionElt.numberInt()) { + case TEXT_INDEX_VERSION_3: + _textIndexVersion = TEXT_INDEX_VERSION_3; + break; + case TEXT_INDEX_VERSION_2: + _textIndexVersion = TEXT_INDEX_VERSION_2; + break; + case TEXT_INDEX_VERSION_1: + _textIndexVersion = TEXT_INDEX_VERSION_1; + break; + default: + msgasserted(17364, + str::stream() << "attempt to use unsupported textIndexVersion " + << textIndexVersionElt.numberInt() + << "; versions supported: " << TEXT_INDEX_VERSION_3 << ", " + << TEXT_INDEX_VERSION_2 << ", " << TEXT_INDEX_VERSION_1); + } // Initialize _defaultLanguage. Note that the FTSLanguage constructor requires // textIndexVersion, since language parsing is version-specific. @@ -384,7 +392,7 @@ BSONObj FTSSpec::fixSpec(const BSONObj& spec) { } uassert(17264, "default_language is not valid", - FTSLanguage::make(default_language, TEXT_INDEX_VERSION_2).getStatus().isOK()); + FTSLanguage::make(default_language, TEXT_INDEX_VERSION_3).getStatus().isOK()); BSONElement language_override_elt = spec["language_override"]; string language_override(language_override_elt.str()); @@ -397,7 +405,7 @@ BSONObj FTSSpec::fixSpec(const BSONObj& spec) { } int version = -1; - int textIndexVersion = TEXT_INDEX_VERSION_2; + int textIndexVersion = TEXT_INDEX_VERSION_3; // default text index version BSONObjBuilder b; BSONObjIterator i(spec); @@ -421,7 +429,9 @@ BSONObj FTSSpec::fixSpec(const BSONObj& spec) { textIndexVersion = e.numberInt(); uassert(16730, str::stream() << "bad textIndexVersion: " << textIndexVersion, - textIndexVersion == TEXT_INDEX_VERSION_2); + textIndexVersion == TEXT_INDEX_VERSION_2 || + textIndexVersion == TEXT_INDEX_VERSION_3); // supported indexes + } else { b.append(e); } diff --git a/src/mongo/db/fts/fts_spec_test.cpp b/src/mongo/db/fts/fts_spec_test.cpp index c9f628a2b28..3bd7d93800b 100644 --- a/src/mongo/db/fts/fts_spec_test.cpp +++ b/src/mongo/db/fts/fts_spec_test.cpp @@ -174,8 +174,11 @@ TEST(FTSSpec, FixTextIndexVersion1) { assertFixSuccess("{key: {a: 'text'}, textIndexVersion: 2.0}}"); assertFixSuccess("{key: {a: 'text'}, textIndexVersion: NumberInt(2)}}"); assertFixSuccess("{key: {a: 'text'}, textIndexVersion: NumberLong(2)}}"); + assertFixSuccess("{key: {a: 'text'}, textIndexVersion: 3.0}"); + assertFixSuccess("{key: {a: 'text'}, textIndexVersion: NumberInt(3)}}"); + assertFixSuccess("{key: {a: 'text'}, textIndexVersion: NumberLong(3)}}"); - assertFixFailure("{key: {a: 'text'}, textIndexVersion: 3}"); + assertFixFailure("{key: {a: 'text'}, textIndexVersion: 4}"); assertFixFailure("{key: {a: 'text'}, textIndexVersion: '2'}"); assertFixFailure("{key: {a: 'text'}, textIndexVersion: {}}"); } diff --git a/src/mongo/db/fts/fts_unicode_tokenizer_test.cpp b/src/mongo/db/fts/fts_unicode_tokenizer_test.cpp index e73c9599682..1f0517d8575 100644 --- a/src/mongo/db/fts/fts_unicode_tokenizer_test.cpp +++ b/src/mongo/db/fts/fts_unicode_tokenizer_test.cpp @@ -26,8 +26,7 @@ * it in the license file. */ -#include "mongo/db/fts/fts_spec.h" -#include "mongo/db/fts/fts_tokenizer.h" +#include "mongo/db/fts/fts_language.h" #include "mongo/db/fts/fts_unicode_tokenizer.h" #include "mongo/unittest/unittest.h" @@ -37,7 +36,7 @@ namespace fts { std::vector<std::string> tokenizeString(const char* str, const char* language, FTSTokenizer::Options options) { - StatusWithFTSLanguage swl = FTSLanguage::make(language, TEXT_INDEX_VERSION_2); + StatusWithFTSLanguage swl = FTSLanguage::make(language, TEXT_INDEX_VERSION_3); ASSERT_OK(swl); UnicodeFTSTokenizer tokenizer(swl.getValue()); diff --git a/src/mongo/db/fts/fts_util.h b/src/mongo/db/fts/fts_util.h index a1377162443..b9fed70a8e8 100644 --- a/src/mongo/db/fts/fts_util.h +++ b/src/mongo/db/fts/fts_util.h @@ -43,7 +43,8 @@ extern const std::string INDEX_NAME; enum TextIndexVersion { TEXT_INDEX_VERSION_1 = 1, // Legacy index format. Deprecated. - TEXT_INDEX_VERSION_2 = 2 // Current index format. + TEXT_INDEX_VERSION_2 = 2, // Index format with ASCII support and murmur hashing. + TEXT_INDEX_VERSION_3 = 3, // Current index format with basic Unicode support. }; } } |