diff options
Diffstat (limited to 'src/mongo')
-rw-r--r-- | src/mongo/db/fts/fts_basic_tokenizer_test.cpp | 5 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_element_iterator_test.cpp | 16 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_language.cpp | 351 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_language.h | 87 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_language_test.cpp | 180 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_query_impl.cpp | 10 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_spec.cpp | 34 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_spec_legacy.cpp | 5 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_unicode_tokenizer_test.cpp | 5 | ||||
-rw-r--r-- | src/mongo/db/fts/stemmer_test.cpp | 13 | ||||
-rw-r--r-- | src/mongo/db/fts/stop_words_test.cpp | 3 | ||||
-rw-r--r-- | src/mongo/db/fts/tokenizer_test.cpp | 21 |
12 files changed, 252 insertions, 478 deletions
diff --git a/src/mongo/db/fts/fts_basic_tokenizer_test.cpp b/src/mongo/db/fts/fts_basic_tokenizer_test.cpp index 1207849d0ef..29ce8e9f7ee 100644 --- a/src/mongo/db/fts/fts_basic_tokenizer_test.cpp +++ b/src/mongo/db/fts/fts_basic_tokenizer_test.cpp @@ -36,10 +36,7 @@ namespace fts { std::vector<std::string> tokenizeString(const char* str, const char* language) { // To retrieve the FTSBasicTokenizer, use TEXT_INDEX_VERSION_2 - StatusWithFTSLanguage swl = FTSLanguage::make(language, TEXT_INDEX_VERSION_2); - ASSERT_OK(swl); - - std::unique_ptr<FTSTokenizer> tokenizer(swl.getValue()->createTokenizer()); + auto tokenizer = FTSLanguage::make(language, TEXT_INDEX_VERSION_2).createTokenizer(); tokenizer->reset(str, FTSTokenizer::kNone); diff --git a/src/mongo/db/fts/fts_element_iterator_test.cpp b/src/mongo/db/fts/fts_element_iterator_test.cpp index 9f5258d57bb..cdb0641f5b5 100644 --- a/src/mongo/db/fts/fts_element_iterator_test.cpp +++ b/src/mongo/db/fts/fts_element_iterator_test.cpp @@ -322,28 +322,28 @@ TEST(FTSElementIterator, LanguageOverrideV2) { FTSIteratorValue val = it.next(); ASSERT_EQUALS("walked", string(val._text)); ASSERT_EQUALS("english", val._language->str()); - ASSERT_EQUALS(val._language, FTSLanguage::make(val._language->str(), TEXT_INDEX_VERSION_2)); + ASSERT_EQUALS(val._language, &FTSLanguage::make(val._language->str(), TEXT_INDEX_VERSION_2)); ASSERT_EQUALS(1, val._weight); ASSERT(it.more()); val = it.next(); ASSERT_EQUALS("camminato", string(val._text)); ASSERT_EQUALS("italian", val._language->str()); - ASSERT_EQUALS(val._language, FTSLanguage::make(val._language->str(), TEXT_INDEX_VERSION_2)); + ASSERT_EQUALS(val._language, &FTSLanguage::make(val._language->str(), TEXT_INDEX_VERSION_2)); ASSERT_EQUALS(1, val._weight); ASSERT(it.more()); val = it.next(); ASSERT_EQUALS("ging", string(val._text)); ASSERT_EQUALS("german", val._language->str()); - ASSERT_EQUALS(val._language, FTSLanguage::make(val._language->str(), TEXT_INDEX_VERSION_2)); + 
ASSERT_EQUALS(val._language, &FTSLanguage::make(val._language->str(), TEXT_INDEX_VERSION_2)); ASSERT_EQUALS(1, val._weight); ASSERT(it.more()); val = it.next(); ASSERT_EQUALS("Feliz Año Nuevo!", string(val._text)); ASSERT_EQUALS("spanish", val._language->str()); - ASSERT_EQUALS(val._language, FTSLanguage::make(val._language->str(), TEXT_INDEX_VERSION_2)); + ASSERT_EQUALS(val._language, &FTSLanguage::make(val._language->str(), TEXT_INDEX_VERSION_2)); ASSERT_EQUALS(1, val._weight); } @@ -371,28 +371,28 @@ TEST(FTSElementIterator, LanguageOverrideV3) { FTSIteratorValue val = it.next(); ASSERT_EQUALS("walked", string(val._text)); ASSERT_EQUALS("english", val._language->str()); - ASSERT_EQUALS(val._language, FTSLanguage::make(val._language->str(), TEXT_INDEX_VERSION_3)); + ASSERT_EQUALS(val._language, &FTSLanguage::make(val._language->str(), TEXT_INDEX_VERSION_3)); ASSERT_EQUALS(1, val._weight); ASSERT(it.more()); val = it.next(); ASSERT_EQUALS("camminato", string(val._text)); ASSERT_EQUALS("italian", val._language->str()); - ASSERT_EQUALS(val._language, FTSLanguage::make(val._language->str(), TEXT_INDEX_VERSION_3)); + ASSERT_EQUALS(val._language, &FTSLanguage::make(val._language->str(), TEXT_INDEX_VERSION_3)); ASSERT_EQUALS(1, val._weight); ASSERT(it.more()); val = it.next(); ASSERT_EQUALS("ging", string(val._text)); ASSERT_EQUALS("german", val._language->str()); - ASSERT_EQUALS(val._language, FTSLanguage::make(val._language->str(), TEXT_INDEX_VERSION_3)); + ASSERT_EQUALS(val._language, &FTSLanguage::make(val._language->str(), TEXT_INDEX_VERSION_3)); ASSERT_EQUALS(1, val._weight); ASSERT(it.more()); val = it.next(); ASSERT_EQUALS("Feliz Año Nuevo!", string(val._text)); ASSERT_EQUALS("spanish", val._language->str()); - ASSERT_EQUALS(val._language, FTSLanguage::make(val._language->str(), TEXT_INDEX_VERSION_3)); + ASSERT_EQUALS(val._language, &FTSLanguage::make(val._language->str(), TEXT_INDEX_VERSION_3)); ASSERT_EQUALS(1, val._weight); } diff --git 
a/src/mongo/db/fts/fts_language.cpp b/src/mongo/db/fts/fts_language.cpp index ad88ccc18d4..9f5920fa4b1 100644 --- a/src/mongo/db/fts/fts_language.cpp +++ b/src/mongo/db/fts/fts_language.cpp @@ -29,63 +29,41 @@ #include "mongo/db/fts/fts_language.h" +#include <algorithm> +#include <fmt/format.h> +#include <map> #include <memory> #include <string> +#include <type_traits> +#include <utility> -#include "mongo/base/init.h" +#include "mongo/base/error_codes.h" +#include "mongo/base/string_data.h" #include "mongo/db/fts/fts_basic_phrase_matcher.h" #include "mongo/db/fts/fts_basic_tokenizer.h" #include "mongo/db/fts/fts_unicode_phrase_matcher.h" #include "mongo/db/fts/fts_unicode_tokenizer.h" #include "mongo/util/assert_util.h" -#include "mongo/util/str.h" -#include "mongo/util/string_map.h" -namespace mongo { - -namespace fts { +namespace mongo::fts { namespace { +using namespace fmt::literals; + /** * Case-insensitive StringData comparator. + * Returns true if a < b. */ struct LanguageStringCompare { - /** Returns true if lhs < rhs. */ - bool operator()(std::string lhs, std::string rhs) const { - size_t minSize = std::min(lhs.size(), rhs.size()); - - for (size_t x = 0; x < minSize; x++) { - char a = tolower(lhs[x]); - char b = tolower(rhs[x]); - if (a < b) { - return true; - } - if (a > b) { - return false; - } - } - - return lhs.size() < rhs.size(); + bool operator()(StringData a, StringData b) const { + return std::lexicographical_compare( + a.begin(), a.end(), b.begin(), b.end(), [](unsigned char a, unsigned char b) { + return std::tolower(a) < std::tolower(b); + }); } }; -// Lookup table from user language string (case-insensitive) to FTSLanguage. -// Populated by initializers in initializer FTSRegisterV2LanguagesAndLater and initializer -// FTSRegisterLanguageAliases. For use with TEXT_INDEX_VERSION_2 text indexes and above. 
-typedef std::map<std::string, const FTSLanguage*, LanguageStringCompare> LanguageMap; - -LanguageMap languageMapV3; -LanguageMap languageMapV2; - -// Like languageMapV2, but for use with TEXT_INDEX_VERSION_1 text indexes. -// Case-sensitive by lookup key. -typedef std::map<StringData, const FTSLanguage*> LanguageMapLegacy; -LanguageMapLegacy languageMapV1; -} // namespace - -MONGO_INITIALIZER_GROUP(FTSAllLanguagesRegistered, MONGO_NO_PREREQUISITES, MONGO_NO_DEPENDENTS); - // FTS Language map. These languages are available with TEXT_INDEX_VERSION_2 and above. // // Parameters: @@ -93,78 +71,27 @@ MONGO_INITIALIZER_GROUP(FTSAllLanguagesRegistered, MONGO_NO_PREREQUISITES, MONGO // - lower case string name // - language alias // -#define MONGO_FTS_LANGUAGE_LIST(MONGO_FTS_LANGUAGE_DECL) \ - MONGO_FTS_LANGUAGE_DECL(Danish, "danish", "da") \ - MONGO_FTS_LANGUAGE_DECL(Dutch, "dutch", "nl") \ - MONGO_FTS_LANGUAGE_DECL(English, "english", "en") \ - MONGO_FTS_LANGUAGE_DECL(Finnish, "finnish", "fi") \ - MONGO_FTS_LANGUAGE_DECL(French, "french", "fr") \ - MONGO_FTS_LANGUAGE_DECL(German, "german", "de") \ - MONGO_FTS_LANGUAGE_DECL(Hungarian, "hungarian", "hu") \ - MONGO_FTS_LANGUAGE_DECL(Italian, "italian", "it") \ - MONGO_FTS_LANGUAGE_DECL(Norwegian, "norwegian", "nb") \ - MONGO_FTS_LANGUAGE_DECL(Portuguese, "portuguese", "pt") \ - MONGO_FTS_LANGUAGE_DECL(Romanian, "romanian", "ro") \ - MONGO_FTS_LANGUAGE_DECL(Russian, "russian", "ru") \ - MONGO_FTS_LANGUAGE_DECL(Spanish, "spanish", "es") \ - MONGO_FTS_LANGUAGE_DECL(Swedish, "swedish", "sv") \ - MONGO_FTS_LANGUAGE_DECL(Turkish, "turkish", "tr") - - -// Declare compilation unit local language object. -// Must be declared statically as global language map only keeps a pointer to the language -// instance. 
-// -#define LANGUAGE_DECLV2(id, name, alias) BasicFTSLanguage language##id##V2; - -#define LANGUAGE_DECLV3(id, name, alias) UnicodeFTSLanguage language##id##V3(name); - -BasicFTSLanguage languageNoneV2; -MONGO_FTS_LANGUAGE_LIST(LANGUAGE_DECLV2); - -UnicodeFTSLanguage languageNoneV3("none"); -MONGO_FTS_LANGUAGE_LIST(LANGUAGE_DECLV3); - -// Registers each language and language aliases in the language map. -// -#define LANGUAGE_INITV2(id, name, alias) \ - FTSLanguage::registerLanguage(name, TEXT_INDEX_VERSION_2, &language##id##V2); - -#define LANGUAGE_INITV3(id, name, alias) \ - FTSLanguage::registerLanguage(name, TEXT_INDEX_VERSION_3, &language##id##V3); - -/** - * Registers each language in the language map. - */ -MONGO_INITIALIZER_GENERAL(FTSRegisterV2LanguagesAndLater, - MONGO_NO_PREREQUISITES, - ("FTSAllLanguagesRegistered")) -(::mongo::InitializerContext* context) { - FTSLanguage::registerLanguage("none", TEXT_INDEX_VERSION_2, &languageNoneV2); - MONGO_FTS_LANGUAGE_LIST(LANGUAGE_INITV2); - - FTSLanguage::registerLanguage("none", TEXT_INDEX_VERSION_3, &languageNoneV3); - MONGO_FTS_LANGUAGE_LIST(LANGUAGE_INITV3); - return Status::OK(); -} - -#define LANGUAGE_ALIASV2(id, name, alias) \ - FTSLanguage::registerLanguageAlias(&language##id##V2, alias, TEXT_INDEX_VERSION_2); - -#define LANGUAGE_ALIASV3(id, name, alias) \ - FTSLanguage::registerLanguageAlias(&language##id##V3, alias, TEXT_INDEX_VERSION_3); - -/** - * Registers each language alias in the language map. - */ -MONGO_INITIALIZER_WITH_PREREQUISITES(FTSRegisterLanguageAliases, ("FTSAllLanguagesRegistered")) -(InitializerContext* context) { - // Register language aliases for TEXT_INDEX_VERSION_2. - MONGO_FTS_LANGUAGE_LIST(LANGUAGE_ALIASV2); - // Register language aliases for TEXT_INDEX_VERSION_3. 
- MONGO_FTS_LANGUAGE_LIST(LANGUAGE_ALIASV3); - return Status::OK(); -} +struct { + StringData name; // - lower case string name + StringData alias; // - language alias (if nonempty) +} static constexpr kLanguagesV2V3[] = { + {"none"_sd, {}}, + {"danish"_sd, "da"_sd}, + {"dutch"_sd, "nl"_sd}, + {"english"_sd, "en"_sd}, + {"finnish"_sd, "fi"_sd}, + {"french"_sd, "fr"_sd}, + {"german"_sd, "de"_sd}, + {"hungarian"_sd, "hu"_sd}, + {"italian"_sd, "it"_sd}, + {"norwegian"_sd, "nb"_sd}, + {"portuguese"_sd, "pt"_sd}, + {"romanian"_sd, "ro"_sd}, + {"russian"_sd, "ru"_sd}, + {"spanish"_sd, "es"_sd}, + {"swedish"_sd, "sv"_sd}, + {"turkish"_sd, "tr"_sd}, +}; // // Register all Snowball language modules for TEXT_INDEX_VERSION_1. Note that only the full @@ -172,145 +99,105 @@ MONGO_INITIALIZER_WITH_PREREQUISITES(FTSRegisterLanguageAliases, ("FTSAllLanguag // TEXT_INDEX_VERSION_1 will generate the Danish stemmer and the empty stopword list). // -MONGO_FTS_LANGUAGE_DECLARE(languageNoneV1, "none", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageDaV1, "da", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageDanV1, "dan", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageDanishV1, "danish", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageDeV1, "de", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageDeuV1, "deu", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageDutV1, "dut", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageDutchV1, "dutch", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageEnV1, "en", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageEngV1, "eng", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageEnglishV1, "english", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageEsV1, "es", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageEslV1, "esl", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageFiV1, "fi", TEXT_INDEX_VERSION_1); 
-MONGO_FTS_LANGUAGE_DECLARE(languageFinV1, "fin", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageFinnishV1, "finnish", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageFrV1, "fr", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageFraV1, "fra", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageFreV1, "fre", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageFrenchV1, "french", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageGerV1, "ger", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageGermanV1, "german", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageHuV1, "hu", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageHunV1, "hun", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageHungarianV1, "hungarian", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageItV1, "it", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageItaV1, "ita", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageItalianV1, "italian", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageNlV1, "nl", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageNldV1, "nld", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageNoV1, "no", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageNorV1, "nor", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageNorwegianV1, "norwegian", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languagePorV1, "por", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languagePorterV1, "porter", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languagePortugueseV1, "portuguese", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languagePtV1, "pt", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageRoV1, "ro", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageRomanianV1, "romanian", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageRonV1, "ron", TEXT_INDEX_VERSION_1); 
-MONGO_FTS_LANGUAGE_DECLARE(languageRuV1, "ru", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageRumV1, "rum", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageRusV1, "rus", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageRussianV1, "russian", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageSpaV1, "spa", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageSpanishV1, "spanish", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageSvV1, "sv", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageSweV1, "swe", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageSwedishV1, "swedish", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageTrV1, "tr", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageTurV1, "tur", TEXT_INDEX_VERSION_1); -MONGO_FTS_LANGUAGE_DECLARE(languageTurkishV1, "turkish", TEXT_INDEX_VERSION_1); - -// static -void FTSLanguage::registerLanguage(StringData languageName, - TextIndexVersion textIndexVersion, - FTSLanguage* language) { - verify(!languageName.empty()); - language->_canonicalName = languageName.toString(); +struct { + StringData name; +} static constexpr kLanguagesV1[] = { + {"none"_sd}, {"da"_sd}, {"dan"_sd}, {"danish"_sd}, {"de"_sd}, + {"deu"_sd}, {"dut"_sd}, {"dutch"_sd}, {"en"_sd}, {"eng"_sd}, + {"english"_sd}, {"es"_sd}, {"esl"_sd}, {"fi"_sd}, {"fin"_sd}, + {"finnish"_sd}, {"fr"_sd}, {"fra"_sd}, {"fre"_sd}, {"french"_sd}, + {"ger"_sd}, {"german"_sd}, {"hu"_sd}, {"hun"_sd}, {"hungarian"_sd}, + {"it"_sd}, {"ita"_sd}, {"italian"_sd}, {"nl"_sd}, {"nld"_sd}, + {"no"_sd}, {"nor"_sd}, {"norwegian"_sd}, {"por"_sd}, {"porter"_sd}, + {"portuguese"_sd}, {"pt"_sd}, {"ro"_sd}, {"romanian"_sd}, {"ron"_sd}, + {"ru"_sd}, {"rum"_sd}, {"rus"_sd}, {"russian"_sd}, {"spa"_sd}, + {"spanish"_sd}, {"sv"_sd}, {"swe"_sd}, {"swedish"_sd}, {"tr"_sd}, + {"tur"_sd}, {"turkish"_sd}, +}; - if (textIndexVersion >= TEXT_INDEX_VERSION_2) { - LanguageMap* languageMap = - (textIndexVersion == 
TEXT_INDEX_VERSION_3) ? &languageMapV3 : &languageMapV2; - (*languageMap)[languageName.toString()] = language; - } else { - // Legacy text index. - invariant(textIndexVersion == TEXT_INDEX_VERSION_1); - verify(languageMapV1.find(languageName) == languageMapV1.end()); - languageMapV1[languageName] = language; +template <TextIndexVersion ver> +class LanguageRegistry { +public: + // For V3 and above, use UnicodeFTSLanguage. + using LanguageType = + std::conditional_t<(ver >= TEXT_INDEX_VERSION_3), UnicodeFTSLanguage, BasicFTSLanguage>; + + // For V2 and above, language names are case-insensitive. + using KeyCompare = + std::conditional_t<(ver >= TEXT_INDEX_VERSION_2), LanguageStringCompare, std::less<>>; + + void add(StringData name, StringData alias = {}) { + auto p = std::make_shared<const LanguageType>(std::string{name}); + _map[name.toString()] = p; + if (!alias.empty()) { + _map[alias.toString()] = p; + } } -} -// static -void FTSLanguage::registerLanguageAlias(const FTSLanguage* language, - StringData alias, - TextIndexVersion textIndexVersion) { - if (textIndexVersion >= TEXT_INDEX_VERSION_2) { - LanguageMap* languageMap = - (textIndexVersion == TEXT_INDEX_VERSION_3) ? &languageMapV3 : &languageMapV2; - (*languageMap)[alias.toString()] = language; - } else { - // Legacy text index. - invariant(textIndexVersion == TEXT_INDEX_VERSION_1); - verify(languageMapV1.find(alias) == languageMapV1.end()); - languageMapV1[alias] = language; + const LanguageType& make(StringData langName) const { + std::string nameStr{langName}; + auto it = _map.find(nameStr); + if (it == _map.end()) { + if constexpr (ver == TEXT_INDEX_VERSION_1) { + // v1 treats unrecognized language strings as "none". + return *_map.at("none"); + } else { + // v2 and above reject unrecognized language strings. 
+ uasserted(ErrorCodes::BadValue, + R"(unsupported language: "{}" for text index version {})"_format(langName, + ver)); + } + } + return *it->second; } -} -FTSLanguage::FTSLanguage() : _canonicalName() {} - -const std::string& FTSLanguage::str() const { - verify(!_canonicalName.empty()); - return _canonicalName; -} - -// static -StatusWithFTSLanguage FTSLanguage::make(StringData langName, TextIndexVersion textIndexVersion) { - if (textIndexVersion >= TEXT_INDEX_VERSION_2) { - LanguageMap* languageMap = - (textIndexVersion == TEXT_INDEX_VERSION_3) ? &languageMapV3 : &languageMapV2; +private: + std::map<std::string, std::shared_ptr<const LanguageType>, KeyCompare> _map; +}; - LanguageMap::const_iterator it = languageMap->find(langName.toString()); +// template <TextIndexVersion ver> +// LanguageRegistry<ver> languageRegistry; - if (it == languageMap->end()) { - // TEXT_INDEX_VERSION_2 and above reject unrecognized language strings. - Status status = - Status(ErrorCodes::BadValue, - str::stream() << "unsupported language: \"" << langName - << "\" for text index version " << textIndexVersion); - return StatusWithFTSLanguage(status); +template <TextIndexVersion ver> +const LanguageRegistry<ver>& getLanguageRegistry() { + static const auto instance = [] { + auto registry = new LanguageRegistry<ver>; + if constexpr (ver == TEXT_INDEX_VERSION_1) { + for (auto&& spec : kLanguagesV1) { + registry->add(spec.name); + } + } else if constexpr (ver == TEXT_INDEX_VERSION_2 || ver == TEXT_INDEX_VERSION_3) { + for (auto&& spec : kLanguagesV2V3) { + registry->add(spec.name, spec.alias); + } } + return registry; + }(); + return *instance; +} - return StatusWithFTSLanguage(it->second); - } else { - // Legacy text index. - invariant(textIndexVersion == TEXT_INDEX_VERSION_1); - LanguageMapLegacy::const_iterator it = languageMapV1.find(langName); - if (it == languageMapV1.end()) { - // TEXT_INDEX_VERSION_1 treats unrecognized language strings as "none". 
- return StatusWithFTSLanguage(&languageNoneV1); - } - return StatusWithFTSLanguage(it->second); +} // namespace + +const FTSLanguage& FTSLanguage::make(StringData langName, TextIndexVersion textIndexVersion) { + switch (textIndexVersion) { + case TEXT_INDEX_VERSION_1: + return getLanguageRegistry<TEXT_INDEX_VERSION_1>().make(langName); + case TEXT_INDEX_VERSION_2: + return getLanguageRegistry<TEXT_INDEX_VERSION_2>().make(langName); + case TEXT_INDEX_VERSION_3: + return getLanguageRegistry<TEXT_INDEX_VERSION_3>().make(langName); + case TEXT_INDEX_VERSION_INVALID: + break; } + uasserted(ErrorCodes::BadValue, "invalid TextIndexVersion"); } std::unique_ptr<FTSTokenizer> BasicFTSLanguage::createTokenizer() const { return std::make_unique<BasicFTSTokenizer>(this); } -const FTSPhraseMatcher& BasicFTSLanguage::getPhraseMatcher() const { - return _basicPhraseMatcher; -} - std::unique_ptr<FTSTokenizer> UnicodeFTSLanguage::createTokenizer() const { return std::make_unique<UnicodeFTSTokenizer>(this); } -const FTSPhraseMatcher& UnicodeFTSLanguage::getPhraseMatcher() const { - return _unicodePhraseMatcher; -} -} // namespace fts -} // namespace mongo +} // namespace mongo::fts diff --git a/src/mongo/db/fts/fts_language.h b/src/mongo/db/fts/fts_language.h index 8bdcd1aa5ce..74c2b2a8cb5 100644 --- a/src/mongo/db/fts/fts_language.h +++ b/src/mongo/db/fts/fts_language.h @@ -29,60 +29,54 @@ #pragma once +#include <memory> +#include <string> +#include <utility> + #include "mongo/base/status_with.h" #include "mongo/db/fts/fts_basic_phrase_matcher.h" #include "mongo/db/fts/fts_phrase_matcher.h" #include "mongo/db/fts/fts_unicode_phrase_matcher.h" #include "mongo/db/fts/fts_util.h" -#include <string> - namespace mongo { namespace fts { class FTSTokenizer; -// Legacy language initialization. 
-#define MONGO_FTS_LANGUAGE_DECLARE(language, name, version) \ - BasicFTSLanguage language; \ - MONGO_INITIALIZER_GENERAL(language, MONGO_NO_PREREQUISITES, ("FTSAllLanguagesRegistered")) \ - (::mongo::InitializerContext * context) { \ - FTSLanguage::registerLanguage(name, version, &language); \ - return Status::OK(); \ - } - /** * A FTSLanguage represents a language for a text-indexed document or a text search. * FTSLanguage objects are not copyable. * * Recommended usage: * - * StatusWithFTSLanguage swl = FTSLanguage::make( "en", TEXT_INDEX_VERSION_3 ); - * if ( !swl.getStatus().isOK() ) { + * const auto& language = FTSLanguage::make( "en", TEXT_INDEX_VERSION_3 ); + * if ( !lang.isOK() ) { * // Error. * } * else { - * const FTSLanguage* language = swl.getValue(); + * const FTSLanguage& language = swl.getValue(); * // Use language. * } */ class FTSLanguage { - // Use make() instead of copying. - FTSLanguage(const FTSLanguage&) = delete; - FTSLanguage& operator=(const FTSLanguage&) = delete; - public: - /** Create an uninitialized language. */ - FTSLanguage(); + FTSLanguage(std::string canonical, std::unique_ptr<FTSPhraseMatcher> phraseMatcher) + : _canonicalName{std::move(canonical)}, _phraseMatcher{std::move(phraseMatcher)} {} virtual ~FTSLanguage() {} + // Use make() instead of copying. + FTSLanguage(const FTSLanguage&) = delete; + FTSLanguage& operator=(const FTSLanguage&) = delete; + /** - * Returns the language as a std::string in canonical form (lowercased English name). It is - * an error to call str() on an uninitialized language. + * Returns the language in canonical form (lowercased English name). */ - const std::string& str() const; + const std::string& str() const { + return _canonicalName; + } /** * Returns a new FTSTokenizer instance for this language. @@ -93,29 +87,13 @@ public: /** * Returns a reference to the phrase matcher instance that this language owns. 
*/ - virtual const FTSPhraseMatcher& getPhraseMatcher() const = 0; - - /** - * Register std::string 'languageName' as a new language with the text index version - * 'textIndexVersion'. Saves the resulting language to out-argument 'languageOut'. - * Subsequent calls to FTSLanguage::make() will recognize the newly-registered language string. - */ - static void registerLanguage(StringData languageName, - TextIndexVersion textIndexVersion, - FTSLanguage* languageOut); - - /** - * Register 'alias' as an alias for 'language' with text index version - * 'textIndexVersion'. Subsequent calls to FTSLanguage::make() will recognize the - * newly-registered alias. - */ - static void registerLanguageAlias(const FTSLanguage* language, - StringData alias, - TextIndexVersion textIndexVersion); + const FTSPhraseMatcher& getPhraseMatcher() const { + return *_phraseMatcher; + } /** * Return the FTSLanguage associated with the given language string and the given text index - * version. Returns an error Status if an invalid language std::string is passed. + * version. Throws a DBException with ErrorCodes::BadValue if an invalid langName is passed. * * For textIndexVersion >= TEXT_INDEX_VERSION_2, language strings are * case-insensitive, and need to be in one of the two following forms: @@ -128,27 +106,22 @@ public: * documents needs to be processed with the English stemmer and the empty stopword list * (since "en" is recognized by Snowball but not the stopword processing logic). */ - static StatusWith<const FTSLanguage*> make(StringData langName, - TextIndexVersion textIndexVersion); + static const FTSLanguage& make(StringData langName, TextIndexVersion textIndexVersion); private: - // std::string representation of language in canonical form. 
std::string _canonicalName; + std::unique_ptr<FTSPhraseMatcher> _phraseMatcher; }; -typedef StatusWith<const FTSLanguage*> StatusWithFTSLanguage; - /** * FTSLanguage implementation that returns a BasicFTSTokenizer and BasicFTSPhraseMatcher for ASCII * aware case folding in FTS. */ class BasicFTSLanguage : public FTSLanguage { public: + explicit BasicFTSLanguage(const std::string& languageName) + : FTSLanguage(languageName, std::make_unique<BasicFTSPhraseMatcher>()) {} std::unique_ptr<FTSTokenizer> createTokenizer() const final; - const FTSPhraseMatcher& getPhraseMatcher() const final; - -private: - BasicFTSPhraseMatcher _basicPhraseMatcher; }; /** @@ -157,16 +130,10 @@ private: */ class UnicodeFTSLanguage : public FTSLanguage { public: - UnicodeFTSLanguage(const std::string& languageName) : _unicodePhraseMatcher(languageName) {} + explicit UnicodeFTSLanguage(const std::string& languageName) + : FTSLanguage(languageName, std::make_unique<UnicodeFTSPhraseMatcher>(languageName)) {} std::unique_ptr<FTSTokenizer> createTokenizer() const final; - const FTSPhraseMatcher& getPhraseMatcher() const final; - -private: - UnicodeFTSPhraseMatcher _unicodePhraseMatcher; }; -extern BasicFTSLanguage languagePorterV1; -extern BasicFTSLanguage languageEnglishV2; -extern BasicFTSLanguage languageFrenchV2; } // namespace fts } // namespace mongo diff --git a/src/mongo/db/fts/fts_language_test.cpp b/src/mongo/db/fts/fts_language_test.cpp index 29166d88319..a9e8fdd3a24 100644 --- a/src/mongo/db/fts/fts_language_test.cpp +++ b/src/mongo/db/fts/fts_language_test.cpp @@ -31,149 +31,51 @@ #include "mongo/db/fts/fts_spec.h" #include "mongo/platform/basic.h" #include "mongo/unittest/unittest.h" +#include "mongo/util/assert_util.h" namespace mongo { namespace fts { -// Positive tests for FTSLanguage::make() with TEXT_INDEX_VERSION_3. 
- -TEST(FTSLanguageV3, ExactLanguage) { - StatusWithFTSLanguage swl = FTSLanguage::make("spanish", TEXT_INDEX_VERSION_3); - ASSERT(swl.getStatus().isOK()); - ASSERT_EQUALS(swl.getValue()->str(), "spanish"); -} - -TEST(FTSLanguageV3, ExactCode) { - StatusWithFTSLanguage swl = FTSLanguage::make("es", TEXT_INDEX_VERSION_3); - ASSERT(swl.getStatus().isOK()); - ASSERT_EQUALS(swl.getValue()->str(), "spanish"); -} - -TEST(FTSLanguageV3, UpperCaseLanguage) { - StatusWithFTSLanguage swl = FTSLanguage::make("SPANISH", TEXT_INDEX_VERSION_3); - ASSERT(swl.getStatus().isOK()); - ASSERT_EQUALS(swl.getValue()->str(), "spanish"); -} - -TEST(FTSLanguageV3, UpperCaseCode) { - StatusWithFTSLanguage swl = FTSLanguage::make("ES", TEXT_INDEX_VERSION_3); - ASSERT(swl.getStatus().isOK()); - ASSERT_EQUALS(swl.getValue()->str(), "spanish"); -} - -TEST(FTSLanguageV3, NoneLanguage) { - StatusWithFTSLanguage swl = FTSLanguage::make("none", TEXT_INDEX_VERSION_3); - ASSERT(swl.getStatus().isOK()); - ASSERT_EQUALS(swl.getValue()->str(), "none"); -} - -// Negative tests for FTSLanguage::make() with TEXT_INDEX_VERSION_3. - -TEST(FTSLanguageV3, Empty) { - StatusWithFTSLanguage swl = FTSLanguage::make("", TEXT_INDEX_VERSION_3); - ASSERT(!swl.getStatus().isOK()); -} - -TEST(FTSLanguageV3, Unknown) { - StatusWithFTSLanguage swl = FTSLanguage::make("spanglish", TEXT_INDEX_VERSION_3); - ASSERT(!swl.getStatus().isOK()); -} - -// Positive tests for FTSLanguage::make() with TEXT_INDEX_VERSION_2. 
- -TEST(FTSLanguageV2, ExactLanguage) { - StatusWithFTSLanguage swl = FTSLanguage::make("spanish", TEXT_INDEX_VERSION_2); - ASSERT(swl.getStatus().isOK()); - ASSERT_EQUALS(swl.getValue()->str(), "spanish"); -} - -TEST(FTSLanguageV2, ExactCode) { - StatusWithFTSLanguage swl = FTSLanguage::make("es", TEXT_INDEX_VERSION_2); - ASSERT(swl.getStatus().isOK()); - ASSERT_EQUALS(swl.getValue()->str(), "spanish"); -} - -TEST(FTSLanguageV2, UpperCaseLanguage) { - StatusWithFTSLanguage swl = FTSLanguage::make("SPANISH", TEXT_INDEX_VERSION_2); - ASSERT(swl.getStatus().isOK()); - ASSERT_EQUALS(swl.getValue()->str(), "spanish"); -} - -TEST(FTSLanguageV2, UpperCaseCode) { - StatusWithFTSLanguage swl = FTSLanguage::make("ES", TEXT_INDEX_VERSION_2); - ASSERT(swl.getStatus().isOK()); - ASSERT_EQUALS(swl.getValue()->str(), "spanish"); -} - -TEST(FTSLanguageV2, NoneLanguage) { - StatusWithFTSLanguage swl = FTSLanguage::make("none", TEXT_INDEX_VERSION_2); - ASSERT(swl.getStatus().isOK()); - ASSERT_EQUALS(swl.getValue()->str(), "none"); -} - -// Negative tests for FTSLanguage::make() with TEXT_INDEX_VERSION_2. - -TEST(FTSLanguageV2, Unknown) { - StatusWithFTSLanguage swl = FTSLanguage::make("spanglish", TEXT_INDEX_VERSION_2); - ASSERT(!swl.getStatus().isOK()); -} - -TEST(FTSLanguageV2, Empty) { - StatusWithFTSLanguage swl = FTSLanguage::make("", TEXT_INDEX_VERSION_2); - ASSERT(!swl.getStatus().isOK()); -} - -// Positive tests for FTSLanguage::make() with TEXT_INDEX_VERSION_1. 
- -TEST(FTSLanguageV1, ExactLanguage) { - StatusWithFTSLanguage swl = FTSLanguage::make("spanish", TEXT_INDEX_VERSION_1); - ASSERT(swl.getStatus().isOK()); - ASSERT_EQUALS(swl.getValue()->str(), "spanish"); -} - -TEST(FTSLanguageV1, DeprecatedLanguage) { - StatusWithFTSLanguage swl = FTSLanguage::make("porter", TEXT_INDEX_VERSION_1); - ASSERT(swl.getStatus().isOK()); - ASSERT_EQUALS(swl.getValue()->str(), "porter"); -} - -TEST(FTSLanguageV1, StemmerOnlyLanguage1) { - StatusWithFTSLanguage swl = FTSLanguage::make("en", TEXT_INDEX_VERSION_1); - ASSERT(swl.getStatus().isOK()); - ASSERT_EQUALS(swl.getValue()->str(), "en"); -} - -TEST(FTSLanguageV1, StemmerOnlyLanguage2) { - StatusWithFTSLanguage swl = FTSLanguage::make("eng", TEXT_INDEX_VERSION_1); - ASSERT(swl.getStatus().isOK()); - ASSERT_EQUALS(swl.getValue()->str(), "eng"); -} - -TEST(FTSLanguageV1, NoneLanguage) { - StatusWithFTSLanguage swl = FTSLanguage::make("none", TEXT_INDEX_VERSION_1); - ASSERT(swl.getStatus().isOK()); - ASSERT_EQUALS(swl.getValue()->str(), "none"); -} - -// Negative tests for FTSLanguage::make() with TEXT_INDEX_VERSION_1. 
- -TEST(FTSLanguageV1, CaseSensitive) { - StatusWithFTSLanguage swl = FTSLanguage::make("SPANISH", TEXT_INDEX_VERSION_1); - ASSERT(swl.getStatus().isOK()); - ASSERT_EQUALS(swl.getValue()->str(), "none"); -} - -TEST(FTSLanguageV1, Unknown) { - StatusWithFTSLanguage swl = FTSLanguage::make("asdf", TEXT_INDEX_VERSION_1); - ASSERT(swl.getStatus().isOK()); - ASSERT_EQUALS(swl.getValue()->str(), "none"); -} - -TEST(FTSLanguageV1, Empty) { - StatusWithFTSLanguage swl = FTSLanguage::make("", TEXT_INDEX_VERSION_1); - ASSERT(swl.getStatus().isOK()); - ASSERT_EQUALS(swl.getValue()->str(), "none"); -} +namespace { + +using LanguageMakeException = mongo::ExceptionFor<ErrorCodes::BadValue>; + +TEST(FTSLanguageV3, Make) { + static constexpr auto kVer = TEXT_INDEX_VERSION_3; + ASSERT_EQUALS(FTSLanguage::make("spanish", kVer).str(), "spanish"); + ASSERT_EQUALS(FTSLanguage::make("es", kVer).str(), "spanish"); + ASSERT_EQUALS(FTSLanguage::make("SPANISH", kVer).str(), "spanish"); + ASSERT_EQUALS(FTSLanguage::make("ES", kVer).str(), "spanish"); + ASSERT_EQUALS(FTSLanguage::make("none", kVer).str(), "none"); + ASSERT_THROWS(FTSLanguage::make("", kVer), LanguageMakeException); + ASSERT_THROWS(FTSLanguage::make("spanglish", kVer), LanguageMakeException); +} + +TEST(FTSLanguageV2, Make) { + static constexpr auto kVer = TEXT_INDEX_VERSION_2; + ASSERT_EQUALS(FTSLanguage::make("spanish", kVer).str(), "spanish"); + ASSERT_EQUALS(FTSLanguage::make("es", kVer).str(), "spanish"); + ASSERT_EQUALS(FTSLanguage::make("SPANISH", kVer).str(), "spanish"); + ASSERT_EQUALS(FTSLanguage::make("ES", kVer).str(), "spanish"); + ASSERT_EQUALS(FTSLanguage::make("none", kVer).str(), "none"); + ASSERT_THROWS(FTSLanguage::make("spanglish", kVer), LanguageMakeException); + ASSERT_THROWS(FTSLanguage::make("", kVer), LanguageMakeException); +} + +TEST(FTSLanguageV1, Make) { + static constexpr auto kVer = TEXT_INDEX_VERSION_1; + ASSERT_EQUALS(FTSLanguage::make("spanish", kVer).str(), "spanish"); + 
ASSERT_EQUALS(FTSLanguage::make("porter", kVer).str(), "porter") << "deprecated"; + ASSERT_EQUALS(FTSLanguage::make("en", kVer).str(), "en"); + ASSERT_EQUALS(FTSLanguage::make("eng", kVer).str(), "eng"); + ASSERT_EQUALS(FTSLanguage::make("none", kVer).str(), "none"); + // Negative V1 tests + ASSERT_EQUALS(FTSLanguage::make("SPANISH", kVer).str(), "none") << "case sensitive"; + ASSERT_EQUALS(FTSLanguage::make("asdf", kVer).str(), "none") << "unknown"; + ASSERT_EQUALS(FTSLanguage::make("", kVer).str(), "none"); +} + +} // namespace } // namespace fts } // namespace mongo diff --git a/src/mongo/db/fts/fts_query_impl.cpp b/src/mongo/db/fts/fts_query_impl.cpp index a60ee888e66..e996a12862c 100644 --- a/src/mongo/db/fts/fts_query_impl.cpp +++ b/src/mongo/db/fts/fts_query_impl.cpp @@ -48,9 +48,11 @@ using std::stringstream; using std::vector; Status FTSQueryImpl::parse(TextIndexVersion textIndexVersion) { - StatusWithFTSLanguage ftsLanguage = FTSLanguage::make(getLanguage(), textIndexVersion); - if (!ftsLanguage.getStatus().isOK()) { - return ftsLanguage.getStatus(); + const FTSLanguage* ftsLanguage; + try { + ftsLanguage = &FTSLanguage::make(getLanguage(), textIndexVersion); + } catch (const DBException& e) { + return e.toStatus(); } // Build a space delimited list of words to have the FtsTokenizer tokenize @@ -128,7 +130,7 @@ Status FTSQueryImpl::parse(TextIndexVersion textIndexVersion) { } } - std::unique_ptr<FTSTokenizer> tokenizer(ftsLanguage.getValue()->createTokenizer()); + std::unique_ptr<FTSTokenizer> tokenizer = ftsLanguage->createTokenizer(); _addTerms(tokenizer.get(), positiveTermSentence, false); _addTerms(tokenizer.get(), negativeTermSentence, true); diff --git a/src/mongo/db/fts/fts_spec.cpp b/src/mongo/db/fts/fts_spec.cpp index c358ba4b679..aeaacd08e21 100644 --- a/src/mongo/db/fts/fts_spec.cpp +++ b/src/mongo/db/fts/fts_spec.cpp @@ -97,17 +97,17 @@ FTSSpec::FTSSpec(const BSONObj& indexInfo) { // Initialize _defaultLanguage. 
Note that the FTSLanguage constructor requires // textIndexVersion, since language parsing is version-specific. auto indexLanguage = indexInfo["default_language"].String(); - auto swl = FTSLanguage::make(indexLanguage, _textIndexVersion); - - // This can fail if the user originally created the text index under an instance of - // MongoDB that supports different languages then the current instance - // TODO: consder propagating the index ns to here to improve the error message - uassert(28682, - str::stream() << "Unrecognized language " << indexLanguage - << " found for text index. Verify mongod was started with the" - " correct options.", - swl.getStatus().isOK()); - _defaultLanguage = swl.getValue(); + try { + _defaultLanguage = &FTSLanguage::make(indexLanguage, _textIndexVersion); + } catch (const DBException& ex) { + // This can fail if the user originally created the text index under an instance of + // MongoDB that supports different languages then the current instance + // TODO: consder propagating the index ns to here to improve the error message + uasserted(28682, + str::stream() << "Unrecognized language " << indexLanguage + << " found for text index. 
Verify mongod was started with the" + " correct options."); + } _languageOverrideField = indexInfo["language_override"].valuestrsafe(); @@ -163,9 +163,11 @@ const FTSLanguage* FTSSpec::_getLanguageToUseV2(const BSONObj& userDoc, uassert(17261, "found language override field in document with non-string type", e.type() == mongo::String); - StatusWithFTSLanguage swl = FTSLanguage::make(e.String(), getTextIndexVersion()); - uassert(17262, "language override unsupported: " + e.String(), swl.getStatus().isOK()); - return swl.getValue(); + try { + return &FTSLanguage::make(e.String(), getTextIndexVersion()); + } catch (DBException& ex) { + uasserted(17262, "language override unsupported: " + e.String()); + } } void FTSSpec::scoreDocument(const BSONObj& obj, TermFrequencyMap* term_freqs) const { @@ -439,7 +441,9 @@ StatusWith<BSONObj> FTSSpec::fixSpec(const BSONObj& spec) { return {ErrorCodes::CannotCreateIndex, "default_language needs a string type"}; } - if (!FTSLanguage::make(default_language, TEXT_INDEX_VERSION_3).getStatus().isOK()) { + try { + FTSLanguage::make(default_language, TEXT_INDEX_VERSION_3); + } catch (DBException& ex) { return {ErrorCodes::CannotCreateIndex, "default_language is not valid"}; } diff --git a/src/mongo/db/fts/fts_spec_legacy.cpp b/src/mongo/db/fts/fts_spec_legacy.cpp index 1d58c1da750..06ed2e17088 100644 --- a/src/mongo/db/fts/fts_spec_legacy.cpp +++ b/src/mongo/db/fts/fts_spec_legacy.cpp @@ -55,9 +55,8 @@ const FTSLanguage& FTSSpec::_getLanguageToUseV1(const BSONObj& userDoc) const { if (e.type() == String) { const char* x = e.valuestrsafe(); if (strlen(x) > 0) { - StatusWithFTSLanguage swl = FTSLanguage::make(x, TEXT_INDEX_VERSION_1); - dassert(swl.isOK()); // make() w/ TEXT_INDEX_VERSION_1 guaranteed to not fail. - return *swl.getValue(); + // make() w/ TEXT_INDEX_VERSION_1 guaranteed to not fail. 
+ return FTSLanguage::make(x, TEXT_INDEX_VERSION_1); } } return *_defaultLanguage; diff --git a/src/mongo/db/fts/fts_unicode_tokenizer_test.cpp b/src/mongo/db/fts/fts_unicode_tokenizer_test.cpp index 7ed921b57d4..9499149ad19 100644 --- a/src/mongo/db/fts/fts_unicode_tokenizer_test.cpp +++ b/src/mongo/db/fts/fts_unicode_tokenizer_test.cpp @@ -39,10 +39,7 @@ namespace fts { std::vector<std::string> tokenizeString(const char* str, const char* language, FTSTokenizer::Options options) { - StatusWithFTSLanguage swl = FTSLanguage::make(language, TEXT_INDEX_VERSION_3); - ASSERT_OK(swl); - - UnicodeFTSTokenizer tokenizer(swl.getValue()); + UnicodeFTSTokenizer tokenizer(&FTSLanguage::make(language, TEXT_INDEX_VERSION_3)); tokenizer.reset(str, options); diff --git a/src/mongo/db/fts/stemmer_test.cpp b/src/mongo/db/fts/stemmer_test.cpp index be09fe34b8c..b95e0949f1f 100644 --- a/src/mongo/db/fts/stemmer_test.cpp +++ b/src/mongo/db/fts/stemmer_test.cpp @@ -36,14 +36,23 @@ namespace mongo { namespace fts { +namespace { +const FTSLanguage* languageEnglishV2() { + return &FTSLanguage::make("english", TEXT_INDEX_VERSION_2); +} +const FTSLanguage* languagePorterV1() { + return &FTSLanguage::make("porter", TEXT_INDEX_VERSION_1); +} +} // namespace + TEST(English, Stemmer1) { - Stemmer s(&languageEnglishV2); + Stemmer s(languageEnglishV2()); ASSERT_EQUALS("run", s.stem("running")); ASSERT_EQUALS("Run", s.stem("Running")); } TEST(English, Caps) { - Stemmer s(&languagePorterV1); + Stemmer s(languagePorterV1()); ASSERT_EQUALS("unit", s.stem("united")); ASSERT_EQUALS("Unite", s.stem("United")); } diff --git a/src/mongo/db/fts/stop_words_test.cpp b/src/mongo/db/fts/stop_words_test.cpp index f0fb8ec37b8..f35f350af35 100644 --- a/src/mongo/db/fts/stop_words_test.cpp +++ b/src/mongo/db/fts/stop_words_test.cpp @@ -37,7 +37,8 @@ namespace mongo { namespace fts { TEST(English, Basic1) { - const StopWords* englishStopWords = StopWords::getStopWords(&languageEnglishV2); + const FTSLanguage* lang = 
&FTSLanguage::make("english", TEXT_INDEX_VERSION_2); + const StopWords* englishStopWords = StopWords::getStopWords(lang); ASSERT(englishStopWords->isStopWord("the")); ASSERT(!englishStopWords->isStopWord("computer")); } diff --git a/src/mongo/db/fts/tokenizer_test.cpp b/src/mongo/db/fts/tokenizer_test.cpp index db61f3abc7d..db0a1c272af 100644 --- a/src/mongo/db/fts/tokenizer_test.cpp +++ b/src/mongo/db/fts/tokenizer_test.cpp @@ -36,13 +36,22 @@ namespace mongo { namespace fts { +namespace { +const FTSLanguage* languageEnglishV2() { + return &FTSLanguage::make("english", TEXT_INDEX_VERSION_2); +} +const FTSLanguage* languageFrenchV2() { + return &FTSLanguage::make("french", TEXT_INDEX_VERSION_2); +} +} // namespace + TEST(Tokenizer, Empty1) { - Tokenizer i(&languageEnglishV2, ""); + Tokenizer i(languageEnglishV2(), ""); ASSERT(!i.more()); } TEST(Tokenizer, Basic1) { - Tokenizer i(&languageEnglishV2, "blue red green"); + Tokenizer i(languageEnglishV2(), "blue red green"); ASSERT(i.more()); ASSERT_EQUALS(i.next().data.toString(), "blue"); @@ -57,7 +66,7 @@ TEST(Tokenizer, Basic1) { } TEST(Tokenizer, Basic2) { - Tokenizer i(&languageEnglishV2, "blue-red"); + Tokenizer i(languageEnglishV2(), "blue-red"); Token a = i.next(); Token b = i.next(); @@ -75,7 +84,7 @@ TEST(Tokenizer, Basic2) { } TEST(Tokenizer, Basic3) { - Tokenizer i(&languageEnglishV2, "blue -red"); + Tokenizer i(languageEnglishV2(), "blue -red"); Token a = i.next(); Token b = i.next(); @@ -97,7 +106,7 @@ TEST(Tokenizer, Basic3) { } TEST(Tokenizer, Quote1English) { - Tokenizer i(&languageEnglishV2, "eliot's car"); + Tokenizer i(languageEnglishV2(), "eliot's car"); Token a = i.next(); Token b = i.next(); @@ -107,7 +116,7 @@ TEST(Tokenizer, Quote1English) { } TEST(Tokenizer, Quote1French) { - Tokenizer i(&languageFrenchV2, "eliot's car"); + Tokenizer i(languageFrenchV2(), "eliot's car"); Token a = i.next(); Token b = i.next(); |