summary refs log tree commit diff
path: root/src/mongo/db/fts
diff options
context:
space:
mode:
Diffstat (limited to 'src/mongo/db/fts')
-rw-r--r-- src/mongo/db/fts/fts_basic_tokenizer_test.cpp 5
-rw-r--r-- src/mongo/db/fts/fts_element_iterator_test.cpp 16
-rw-r--r-- src/mongo/db/fts/fts_language.cpp 351
-rw-r--r-- src/mongo/db/fts/fts_language.h 87
-rw-r--r-- src/mongo/db/fts/fts_language_test.cpp 180
-rw-r--r-- src/mongo/db/fts/fts_query_impl.cpp 10
-rw-r--r-- src/mongo/db/fts/fts_spec.cpp 34
-rw-r--r-- src/mongo/db/fts/fts_spec_legacy.cpp 5
-rw-r--r-- src/mongo/db/fts/fts_unicode_tokenizer_test.cpp 5
-rw-r--r-- src/mongo/db/fts/stemmer_test.cpp 13
-rw-r--r-- src/mongo/db/fts/stop_words_test.cpp 3
-rw-r--r-- src/mongo/db/fts/tokenizer_test.cpp 21
12 files changed, 252 insertions, 478 deletions
diff --git a/src/mongo/db/fts/fts_basic_tokenizer_test.cpp b/src/mongo/db/fts/fts_basic_tokenizer_test.cpp
index 1207849d0ef..29ce8e9f7ee 100644
--- a/src/mongo/db/fts/fts_basic_tokenizer_test.cpp
+++ b/src/mongo/db/fts/fts_basic_tokenizer_test.cpp
@@ -36,10 +36,7 @@ namespace fts {
std::vector<std::string> tokenizeString(const char* str, const char* language) {
// To retrieve the FTSBasicTokenizer, use TEXT_INDEX_VERSION_2
- StatusWithFTSLanguage swl = FTSLanguage::make(language, TEXT_INDEX_VERSION_2);
- ASSERT_OK(swl);
-
- std::unique_ptr<FTSTokenizer> tokenizer(swl.getValue()->createTokenizer());
+ auto tokenizer = FTSLanguage::make(language, TEXT_INDEX_VERSION_2).createTokenizer();
tokenizer->reset(str, FTSTokenizer::kNone);
diff --git a/src/mongo/db/fts/fts_element_iterator_test.cpp b/src/mongo/db/fts/fts_element_iterator_test.cpp
index 9f5258d57bb..cdb0641f5b5 100644
--- a/src/mongo/db/fts/fts_element_iterator_test.cpp
+++ b/src/mongo/db/fts/fts_element_iterator_test.cpp
@@ -322,28 +322,28 @@ TEST(FTSElementIterator, LanguageOverrideV2) {
FTSIteratorValue val = it.next();
ASSERT_EQUALS("walked", string(val._text));
ASSERT_EQUALS("english", val._language->str());
- ASSERT_EQUALS(val._language, FTSLanguage::make(val._language->str(), TEXT_INDEX_VERSION_2));
+ ASSERT_EQUALS(val._language, &FTSLanguage::make(val._language->str(), TEXT_INDEX_VERSION_2));
ASSERT_EQUALS(1, val._weight);
ASSERT(it.more());
val = it.next();
ASSERT_EQUALS("camminato", string(val._text));
ASSERT_EQUALS("italian", val._language->str());
- ASSERT_EQUALS(val._language, FTSLanguage::make(val._language->str(), TEXT_INDEX_VERSION_2));
+ ASSERT_EQUALS(val._language, &FTSLanguage::make(val._language->str(), TEXT_INDEX_VERSION_2));
ASSERT_EQUALS(1, val._weight);
ASSERT(it.more());
val = it.next();
ASSERT_EQUALS("ging", string(val._text));
ASSERT_EQUALS("german", val._language->str());
- ASSERT_EQUALS(val._language, FTSLanguage::make(val._language->str(), TEXT_INDEX_VERSION_2));
+ ASSERT_EQUALS(val._language, &FTSLanguage::make(val._language->str(), TEXT_INDEX_VERSION_2));
ASSERT_EQUALS(1, val._weight);
ASSERT(it.more());
val = it.next();
ASSERT_EQUALS("Feliz Año Nuevo!", string(val._text));
ASSERT_EQUALS("spanish", val._language->str());
- ASSERT_EQUALS(val._language, FTSLanguage::make(val._language->str(), TEXT_INDEX_VERSION_2));
+ ASSERT_EQUALS(val._language, &FTSLanguage::make(val._language->str(), TEXT_INDEX_VERSION_2));
ASSERT_EQUALS(1, val._weight);
}
@@ -371,28 +371,28 @@ TEST(FTSElementIterator, LanguageOverrideV3) {
FTSIteratorValue val = it.next();
ASSERT_EQUALS("walked", string(val._text));
ASSERT_EQUALS("english", val._language->str());
- ASSERT_EQUALS(val._language, FTSLanguage::make(val._language->str(), TEXT_INDEX_VERSION_3));
+ ASSERT_EQUALS(val._language, &FTSLanguage::make(val._language->str(), TEXT_INDEX_VERSION_3));
ASSERT_EQUALS(1, val._weight);
ASSERT(it.more());
val = it.next();
ASSERT_EQUALS("camminato", string(val._text));
ASSERT_EQUALS("italian", val._language->str());
- ASSERT_EQUALS(val._language, FTSLanguage::make(val._language->str(), TEXT_INDEX_VERSION_3));
+ ASSERT_EQUALS(val._language, &FTSLanguage::make(val._language->str(), TEXT_INDEX_VERSION_3));
ASSERT_EQUALS(1, val._weight);
ASSERT(it.more());
val = it.next();
ASSERT_EQUALS("ging", string(val._text));
ASSERT_EQUALS("german", val._language->str());
- ASSERT_EQUALS(val._language, FTSLanguage::make(val._language->str(), TEXT_INDEX_VERSION_3));
+ ASSERT_EQUALS(val._language, &FTSLanguage::make(val._language->str(), TEXT_INDEX_VERSION_3));
ASSERT_EQUALS(1, val._weight);
ASSERT(it.more());
val = it.next();
ASSERT_EQUALS("Feliz Año Nuevo!", string(val._text));
ASSERT_EQUALS("spanish", val._language->str());
- ASSERT_EQUALS(val._language, FTSLanguage::make(val._language->str(), TEXT_INDEX_VERSION_3));
+ ASSERT_EQUALS(val._language, &FTSLanguage::make(val._language->str(), TEXT_INDEX_VERSION_3));
ASSERT_EQUALS(1, val._weight);
}
diff --git a/src/mongo/db/fts/fts_language.cpp b/src/mongo/db/fts/fts_language.cpp
index ad88ccc18d4..9f5920fa4b1 100644
--- a/src/mongo/db/fts/fts_language.cpp
+++ b/src/mongo/db/fts/fts_language.cpp
@@ -29,63 +29,41 @@
#include "mongo/db/fts/fts_language.h"
+#include <algorithm>
+#include <fmt/format.h>
+#include <map>
#include <memory>
#include <string>
+#include <type_traits>
+#include <utility>
-#include "mongo/base/init.h"
+#include "mongo/base/error_codes.h"
+#include "mongo/base/string_data.h"
#include "mongo/db/fts/fts_basic_phrase_matcher.h"
#include "mongo/db/fts/fts_basic_tokenizer.h"
#include "mongo/db/fts/fts_unicode_phrase_matcher.h"
#include "mongo/db/fts/fts_unicode_tokenizer.h"
#include "mongo/util/assert_util.h"
-#include "mongo/util/str.h"
-#include "mongo/util/string_map.h"
-namespace mongo {
-
-namespace fts {
+namespace mongo::fts {
namespace {
+using namespace fmt::literals;
+
/**
* Case-insensitive StringData comparator.
+ * Returns true if a < b.
*/
struct LanguageStringCompare {
- /** Returns true if lhs < rhs. */
- bool operator()(std::string lhs, std::string rhs) const {
- size_t minSize = std::min(lhs.size(), rhs.size());
-
- for (size_t x = 0; x < minSize; x++) {
- char a = tolower(lhs[x]);
- char b = tolower(rhs[x]);
- if (a < b) {
- return true;
- }
- if (a > b) {
- return false;
- }
- }
-
- return lhs.size() < rhs.size();
+ bool operator()(StringData a, StringData b) const {
+ return std::lexicographical_compare(
+ a.begin(), a.end(), b.begin(), b.end(), [](unsigned char a, unsigned char b) {
+ return std::tolower(a) < std::tolower(b);
+ });
}
};
-// Lookup table from user language string (case-insensitive) to FTSLanguage.
-// Populated by initializers in initializer FTSRegisterV2LanguagesAndLater and initializer
-// FTSRegisterLanguageAliases. For use with TEXT_INDEX_VERSION_2 text indexes and above.
-typedef std::map<std::string, const FTSLanguage*, LanguageStringCompare> LanguageMap;
-
-LanguageMap languageMapV3;
-LanguageMap languageMapV2;
-
-// Like languageMapV2, but for use with TEXT_INDEX_VERSION_1 text indexes.
-// Case-sensitive by lookup key.
-typedef std::map<StringData, const FTSLanguage*> LanguageMapLegacy;
-LanguageMapLegacy languageMapV1;
-} // namespace
-
-MONGO_INITIALIZER_GROUP(FTSAllLanguagesRegistered, MONGO_NO_PREREQUISITES, MONGO_NO_DEPENDENTS);
-
// FTS Language map. These languages are available with TEXT_INDEX_VERSION_2 and above.
//
// Parameters:
@@ -93,78 +71,27 @@ MONGO_INITIALIZER_GROUP(FTSAllLanguagesRegistered, MONGO_NO_PREREQUISITES, MONGO
// - lower case string name
// - language alias
//
-#define MONGO_FTS_LANGUAGE_LIST(MONGO_FTS_LANGUAGE_DECL) \
- MONGO_FTS_LANGUAGE_DECL(Danish, "danish", "da") \
- MONGO_FTS_LANGUAGE_DECL(Dutch, "dutch", "nl") \
- MONGO_FTS_LANGUAGE_DECL(English, "english", "en") \
- MONGO_FTS_LANGUAGE_DECL(Finnish, "finnish", "fi") \
- MONGO_FTS_LANGUAGE_DECL(French, "french", "fr") \
- MONGO_FTS_LANGUAGE_DECL(German, "german", "de") \
- MONGO_FTS_LANGUAGE_DECL(Hungarian, "hungarian", "hu") \
- MONGO_FTS_LANGUAGE_DECL(Italian, "italian", "it") \
- MONGO_FTS_LANGUAGE_DECL(Norwegian, "norwegian", "nb") \
- MONGO_FTS_LANGUAGE_DECL(Portuguese, "portuguese", "pt") \
- MONGO_FTS_LANGUAGE_DECL(Romanian, "romanian", "ro") \
- MONGO_FTS_LANGUAGE_DECL(Russian, "russian", "ru") \
- MONGO_FTS_LANGUAGE_DECL(Spanish, "spanish", "es") \
- MONGO_FTS_LANGUAGE_DECL(Swedish, "swedish", "sv") \
- MONGO_FTS_LANGUAGE_DECL(Turkish, "turkish", "tr")
-
-
-// Declare compilation unit local language object.
-// Must be declared statically as global language map only keeps a pointer to the language
-// instance.
-//
-#define LANGUAGE_DECLV2(id, name, alias) BasicFTSLanguage language##id##V2;
-
-#define LANGUAGE_DECLV3(id, name, alias) UnicodeFTSLanguage language##id##V3(name);
-
-BasicFTSLanguage languageNoneV2;
-MONGO_FTS_LANGUAGE_LIST(LANGUAGE_DECLV2);
-
-UnicodeFTSLanguage languageNoneV3("none");
-MONGO_FTS_LANGUAGE_LIST(LANGUAGE_DECLV3);
-
-// Registers each language and language aliases in the language map.
-//
-#define LANGUAGE_INITV2(id, name, alias) \
- FTSLanguage::registerLanguage(name, TEXT_INDEX_VERSION_2, &language##id##V2);
-
-#define LANGUAGE_INITV3(id, name, alias) \
- FTSLanguage::registerLanguage(name, TEXT_INDEX_VERSION_3, &language##id##V3);
-
-/**
- * Registers each language in the language map.
- */
-MONGO_INITIALIZER_GENERAL(FTSRegisterV2LanguagesAndLater,
- MONGO_NO_PREREQUISITES,
- ("FTSAllLanguagesRegistered"))
-(::mongo::InitializerContext* context) {
- FTSLanguage::registerLanguage("none", TEXT_INDEX_VERSION_2, &languageNoneV2);
- MONGO_FTS_LANGUAGE_LIST(LANGUAGE_INITV2);
-
- FTSLanguage::registerLanguage("none", TEXT_INDEX_VERSION_3, &languageNoneV3);
- MONGO_FTS_LANGUAGE_LIST(LANGUAGE_INITV3);
- return Status::OK();
-}
-
-#define LANGUAGE_ALIASV2(id, name, alias) \
- FTSLanguage::registerLanguageAlias(&language##id##V2, alias, TEXT_INDEX_VERSION_2);
-
-#define LANGUAGE_ALIASV3(id, name, alias) \
- FTSLanguage::registerLanguageAlias(&language##id##V3, alias, TEXT_INDEX_VERSION_3);
-
-/**
- * Registers each language alias in the language map.
- */
-MONGO_INITIALIZER_WITH_PREREQUISITES(FTSRegisterLanguageAliases, ("FTSAllLanguagesRegistered"))
-(InitializerContext* context) {
- // Register language aliases for TEXT_INDEX_VERSION_2.
- MONGO_FTS_LANGUAGE_LIST(LANGUAGE_ALIASV2);
- // Register language aliases for TEXT_INDEX_VERSION_3.
- MONGO_FTS_LANGUAGE_LIST(LANGUAGE_ALIASV3);
- return Status::OK();
-}
+struct {
+ StringData name; // - lower case string name
+ StringData alias; // - language alias (if nonempty)
+} static constexpr kLanguagesV2V3[] = {
+ {"none"_sd, {}},
+ {"danish"_sd, "da"_sd},
+ {"dutch"_sd, "nl"_sd},
+ {"english"_sd, "en"_sd},
+ {"finnish"_sd, "fi"_sd},
+ {"french"_sd, "fr"_sd},
+ {"german"_sd, "de"_sd},
+ {"hungarian"_sd, "hu"_sd},
+ {"italian"_sd, "it"_sd},
+ {"norwegian"_sd, "nb"_sd},
+ {"portuguese"_sd, "pt"_sd},
+ {"romanian"_sd, "ro"_sd},
+ {"russian"_sd, "ru"_sd},
+ {"spanish"_sd, "es"_sd},
+ {"swedish"_sd, "sv"_sd},
+ {"turkish"_sd, "tr"_sd},
+};
//
// Register all Snowball language modules for TEXT_INDEX_VERSION_1. Note that only the full
@@ -172,145 +99,105 @@ MONGO_INITIALIZER_WITH_PREREQUISITES(FTSRegisterLanguageAliases, ("FTSAllLanguag
// TEXT_INDEX_VERSION_1 will generate the Danish stemmer and the empty stopword list).
//
-MONGO_FTS_LANGUAGE_DECLARE(languageNoneV1, "none", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageDaV1, "da", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageDanV1, "dan", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageDanishV1, "danish", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageDeV1, "de", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageDeuV1, "deu", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageDutV1, "dut", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageDutchV1, "dutch", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageEnV1, "en", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageEngV1, "eng", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageEnglishV1, "english", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageEsV1, "es", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageEslV1, "esl", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageFiV1, "fi", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageFinV1, "fin", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageFinnishV1, "finnish", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageFrV1, "fr", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageFraV1, "fra", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageFreV1, "fre", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageFrenchV1, "french", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageGerV1, "ger", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageGermanV1, "german", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageHuV1, "hu", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageHunV1, "hun", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageHungarianV1, "hungarian", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageItV1, "it", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageItaV1, "ita", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageItalianV1, "italian", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageNlV1, "nl", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageNldV1, "nld", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageNoV1, "no", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageNorV1, "nor", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageNorwegianV1, "norwegian", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languagePorV1, "por", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languagePorterV1, "porter", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languagePortugueseV1, "portuguese", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languagePtV1, "pt", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageRoV1, "ro", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageRomanianV1, "romanian", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageRonV1, "ron", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageRuV1, "ru", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageRumV1, "rum", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageRusV1, "rus", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageRussianV1, "russian", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageSpaV1, "spa", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageSpanishV1, "spanish", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageSvV1, "sv", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageSweV1, "swe", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageSwedishV1, "swedish", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageTrV1, "tr", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageTurV1, "tur", TEXT_INDEX_VERSION_1);
-MONGO_FTS_LANGUAGE_DECLARE(languageTurkishV1, "turkish", TEXT_INDEX_VERSION_1);
-
-// static
-void FTSLanguage::registerLanguage(StringData languageName,
- TextIndexVersion textIndexVersion,
- FTSLanguage* language) {
- verify(!languageName.empty());
- language->_canonicalName = languageName.toString();
+struct {
+ StringData name;
+} static constexpr kLanguagesV1[] = {
+ {"none"_sd}, {"da"_sd}, {"dan"_sd}, {"danish"_sd}, {"de"_sd},
+ {"deu"_sd}, {"dut"_sd}, {"dutch"_sd}, {"en"_sd}, {"eng"_sd},
+ {"english"_sd}, {"es"_sd}, {"esl"_sd}, {"fi"_sd}, {"fin"_sd},
+ {"finnish"_sd}, {"fr"_sd}, {"fra"_sd}, {"fre"_sd}, {"french"_sd},
+ {"ger"_sd}, {"german"_sd}, {"hu"_sd}, {"hun"_sd}, {"hungarian"_sd},
+ {"it"_sd}, {"ita"_sd}, {"italian"_sd}, {"nl"_sd}, {"nld"_sd},
+ {"no"_sd}, {"nor"_sd}, {"norwegian"_sd}, {"por"_sd}, {"porter"_sd},
+ {"portuguese"_sd}, {"pt"_sd}, {"ro"_sd}, {"romanian"_sd}, {"ron"_sd},
+ {"ru"_sd}, {"rum"_sd}, {"rus"_sd}, {"russian"_sd}, {"spa"_sd},
+ {"spanish"_sd}, {"sv"_sd}, {"swe"_sd}, {"swedish"_sd}, {"tr"_sd},
+ {"tur"_sd}, {"turkish"_sd},
+};
- if (textIndexVersion >= TEXT_INDEX_VERSION_2) {
- LanguageMap* languageMap =
- (textIndexVersion == TEXT_INDEX_VERSION_3) ? &languageMapV3 : &languageMapV2;
- (*languageMap)[languageName.toString()] = language;
- } else {
- // Legacy text index.
- invariant(textIndexVersion == TEXT_INDEX_VERSION_1);
- verify(languageMapV1.find(languageName) == languageMapV1.end());
- languageMapV1[languageName] = language;
+template <TextIndexVersion ver>
+class LanguageRegistry {
+public:
+ // For V3 and above, use UnicodeFTSLanguage.
+ using LanguageType =
+ std::conditional_t<(ver >= TEXT_INDEX_VERSION_3), UnicodeFTSLanguage, BasicFTSLanguage>;
+
+ // For V2 and above, language names are case-insensitive.
+ using KeyCompare =
+ std::conditional_t<(ver >= TEXT_INDEX_VERSION_2), LanguageStringCompare, std::less<>>;
+
+ void add(StringData name, StringData alias = {}) {
+ auto p = std::make_shared<const LanguageType>(std::string{name});
+ _map[name.toString()] = p;
+ if (!alias.empty()) {
+ _map[alias.toString()] = p;
+ }
}
-}
-// static
-void FTSLanguage::registerLanguageAlias(const FTSLanguage* language,
- StringData alias,
- TextIndexVersion textIndexVersion) {
- if (textIndexVersion >= TEXT_INDEX_VERSION_2) {
- LanguageMap* languageMap =
- (textIndexVersion == TEXT_INDEX_VERSION_3) ? &languageMapV3 : &languageMapV2;
- (*languageMap)[alias.toString()] = language;
- } else {
- // Legacy text index.
- invariant(textIndexVersion == TEXT_INDEX_VERSION_1);
- verify(languageMapV1.find(alias) == languageMapV1.end());
- languageMapV1[alias] = language;
+ const LanguageType& make(StringData langName) const {
+ std::string nameStr{langName};
+ auto it = _map.find(nameStr);
+ if (it == _map.end()) {
+ if constexpr (ver == TEXT_INDEX_VERSION_1) {
+ // v1 treats unrecognized language strings as "none".
+ return *_map.at("none");
+ } else {
+ // v2 and above reject unrecognized language strings.
+ uasserted(ErrorCodes::BadValue,
+ R"(unsupported language: "{}" for text index version {})"_format(langName,
+ ver));
+ }
+ }
+ return *it->second;
}
-}
-FTSLanguage::FTSLanguage() : _canonicalName() {}
-
-const std::string& FTSLanguage::str() const {
- verify(!_canonicalName.empty());
- return _canonicalName;
-}
-
-// static
-StatusWithFTSLanguage FTSLanguage::make(StringData langName, TextIndexVersion textIndexVersion) {
- if (textIndexVersion >= TEXT_INDEX_VERSION_2) {
- LanguageMap* languageMap =
- (textIndexVersion == TEXT_INDEX_VERSION_3) ? &languageMapV3 : &languageMapV2;
+private:
+ std::map<std::string, std::shared_ptr<const LanguageType>, KeyCompare> _map;
+};
- LanguageMap::const_iterator it = languageMap->find(langName.toString());
+// template <TextIndexVersion ver>
+// LanguageRegistry<ver> languageRegistry;
- if (it == languageMap->end()) {
- // TEXT_INDEX_VERSION_2 and above reject unrecognized language strings.
- Status status =
- Status(ErrorCodes::BadValue,
- str::stream() << "unsupported language: \"" << langName
- << "\" for text index version " << textIndexVersion);
- return StatusWithFTSLanguage(status);
+template <TextIndexVersion ver>
+const LanguageRegistry<ver>& getLanguageRegistry() {
+ static const auto instance = [] {
+ auto registry = new LanguageRegistry<ver>;
+ if constexpr (ver == TEXT_INDEX_VERSION_1) {
+ for (auto&& spec : kLanguagesV1) {
+ registry->add(spec.name);
+ }
+ } else if constexpr (ver == TEXT_INDEX_VERSION_2 || ver == TEXT_INDEX_VERSION_3) {
+ for (auto&& spec : kLanguagesV2V3) {
+ registry->add(spec.name, spec.alias);
+ }
}
+ return registry;
+ }();
+ return *instance;
+}
- return StatusWithFTSLanguage(it->second);
- } else {
- // Legacy text index.
- invariant(textIndexVersion == TEXT_INDEX_VERSION_1);
- LanguageMapLegacy::const_iterator it = languageMapV1.find(langName);
- if (it == languageMapV1.end()) {
- // TEXT_INDEX_VERSION_1 treats unrecognized language strings as "none".
- return StatusWithFTSLanguage(&languageNoneV1);
- }
- return StatusWithFTSLanguage(it->second);
+} // namespace
+
+const FTSLanguage& FTSLanguage::make(StringData langName, TextIndexVersion textIndexVersion) {
+ switch (textIndexVersion) {
+ case TEXT_INDEX_VERSION_1:
+ return getLanguageRegistry<TEXT_INDEX_VERSION_1>().make(langName);
+ case TEXT_INDEX_VERSION_2:
+ return getLanguageRegistry<TEXT_INDEX_VERSION_2>().make(langName);
+ case TEXT_INDEX_VERSION_3:
+ return getLanguageRegistry<TEXT_INDEX_VERSION_3>().make(langName);
+ case TEXT_INDEX_VERSION_INVALID:
+ break;
}
+ uasserted(ErrorCodes::BadValue, "invalid TextIndexVersion");
}
std::unique_ptr<FTSTokenizer> BasicFTSLanguage::createTokenizer() const {
return std::make_unique<BasicFTSTokenizer>(this);
}
-const FTSPhraseMatcher& BasicFTSLanguage::getPhraseMatcher() const {
- return _basicPhraseMatcher;
-}
-
std::unique_ptr<FTSTokenizer> UnicodeFTSLanguage::createTokenizer() const {
return std::make_unique<UnicodeFTSTokenizer>(this);
}
-const FTSPhraseMatcher& UnicodeFTSLanguage::getPhraseMatcher() const {
- return _unicodePhraseMatcher;
-}
-} // namespace fts
-} // namespace mongo
+} // namespace mongo::fts
diff --git a/src/mongo/db/fts/fts_language.h b/src/mongo/db/fts/fts_language.h
index 8bdcd1aa5ce..74c2b2a8cb5 100644
--- a/src/mongo/db/fts/fts_language.h
+++ b/src/mongo/db/fts/fts_language.h
@@ -29,60 +29,54 @@
#pragma once
+#include <memory>
+#include <string>
+#include <utility>
+
#include "mongo/base/status_with.h"
#include "mongo/db/fts/fts_basic_phrase_matcher.h"
#include "mongo/db/fts/fts_phrase_matcher.h"
#include "mongo/db/fts/fts_unicode_phrase_matcher.h"
#include "mongo/db/fts/fts_util.h"
-#include <string>
-
namespace mongo {
namespace fts {
class FTSTokenizer;
-// Legacy language initialization.
-#define MONGO_FTS_LANGUAGE_DECLARE(language, name, version) \
- BasicFTSLanguage language; \
- MONGO_INITIALIZER_GENERAL(language, MONGO_NO_PREREQUISITES, ("FTSAllLanguagesRegistered")) \
- (::mongo::InitializerContext * context) { \
- FTSLanguage::registerLanguage(name, version, &language); \
- return Status::OK(); \
- }
-
/**
* A FTSLanguage represents a language for a text-indexed document or a text search.
* FTSLanguage objects are not copyable.
*
* Recommended usage:
*
- * StatusWithFTSLanguage swl = FTSLanguage::make( "en", TEXT_INDEX_VERSION_3 );
- * if ( !swl.getStatus().isOK() ) {
- * // Error.
- * }
- * else {
- * const FTSLanguage* language = swl.getValue();
- * // Use language.
- * }
+ * try {
+ *     const FTSLanguage& language = FTSLanguage::make( "en", TEXT_INDEX_VERSION_3 );
+ *     // Use language.
+ * }
+ * catch ( const DBException& ex ) {
+ *     // Error: make() throws (e.g. ErrorCodes::BadValue) instead of
+ *     // returning a Status.
+ * }
*/
class FTSLanguage {
- // Use make() instead of copying.
- FTSLanguage(const FTSLanguage&) = delete;
- FTSLanguage& operator=(const FTSLanguage&) = delete;
-
public:
- /** Create an uninitialized language. */
- FTSLanguage();
+ FTSLanguage(std::string canonical, std::unique_ptr<FTSPhraseMatcher> phraseMatcher)
+ : _canonicalName{std::move(canonical)}, _phraseMatcher{std::move(phraseMatcher)} {}
virtual ~FTSLanguage() {}
+ // Use make() instead of copying.
+ FTSLanguage(const FTSLanguage&) = delete;
+ FTSLanguage& operator=(const FTSLanguage&) = delete;
+
/**
- * Returns the language as a std::string in canonical form (lowercased English name). It is
- * an error to call str() on an uninitialized language.
+ * Returns the language in canonical form (lowercased English name).
*/
- const std::string& str() const;
+ const std::string& str() const {
+ return _canonicalName;
+ }
/**
* Returns a new FTSTokenizer instance for this language.
@@ -93,29 +87,13 @@ public:
/**
* Returns a reference to the phrase matcher instance that this language owns.
*/
- virtual const FTSPhraseMatcher& getPhraseMatcher() const = 0;
-
- /**
- * Register std::string 'languageName' as a new language with the text index version
- * 'textIndexVersion'. Saves the resulting language to out-argument 'languageOut'.
- * Subsequent calls to FTSLanguage::make() will recognize the newly-registered language string.
- */
- static void registerLanguage(StringData languageName,
- TextIndexVersion textIndexVersion,
- FTSLanguage* languageOut);
-
- /**
- * Register 'alias' as an alias for 'language' with text index version
- * 'textIndexVersion'. Subsequent calls to FTSLanguage::make() will recognize the
- * newly-registered alias.
- */
- static void registerLanguageAlias(const FTSLanguage* language,
- StringData alias,
- TextIndexVersion textIndexVersion);
+ const FTSPhraseMatcher& getPhraseMatcher() const {
+ return *_phraseMatcher;
+ }
/**
* Return the FTSLanguage associated with the given language string and the given text index
- * version. Returns an error Status if an invalid language std::string is passed.
+ * version. Throws DBException (ErrorCodes::BadValue) if an invalid langName is passed.
*
* For textIndexVersion >= TEXT_INDEX_VERSION_2, language strings are
* case-insensitive, and need to be in one of the two following forms:
@@ -128,27 +106,22 @@ public:
* documents needs to be processed with the English stemmer and the empty stopword list
* (since "en" is recognized by Snowball but not the stopword processing logic).
*/
- static StatusWith<const FTSLanguage*> make(StringData langName,
- TextIndexVersion textIndexVersion);
+ static const FTSLanguage& make(StringData langName, TextIndexVersion textIndexVersion);
private:
- // std::string representation of language in canonical form.
std::string _canonicalName;
+ std::unique_ptr<FTSPhraseMatcher> _phraseMatcher;
};
-typedef StatusWith<const FTSLanguage*> StatusWithFTSLanguage;
-
/**
* FTSLanguage implementation that returns a BasicFTSTokenizer and BasicFTSPhraseMatcher for ASCII
* aware case folding in FTS.
*/
class BasicFTSLanguage : public FTSLanguage {
public:
+ explicit BasicFTSLanguage(const std::string& languageName)
+ : FTSLanguage(languageName, std::make_unique<BasicFTSPhraseMatcher>()) {}
std::unique_ptr<FTSTokenizer> createTokenizer() const final;
- const FTSPhraseMatcher& getPhraseMatcher() const final;
-
-private:
- BasicFTSPhraseMatcher _basicPhraseMatcher;
};
/**
@@ -157,16 +130,10 @@ private:
*/
class UnicodeFTSLanguage : public FTSLanguage {
public:
- UnicodeFTSLanguage(const std::string& languageName) : _unicodePhraseMatcher(languageName) {}
+ explicit UnicodeFTSLanguage(const std::string& languageName)
+ : FTSLanguage(languageName, std::make_unique<UnicodeFTSPhraseMatcher>(languageName)) {}
std::unique_ptr<FTSTokenizer> createTokenizer() const final;
- const FTSPhraseMatcher& getPhraseMatcher() const final;
-
-private:
- UnicodeFTSPhraseMatcher _unicodePhraseMatcher;
};
-extern BasicFTSLanguage languagePorterV1;
-extern BasicFTSLanguage languageEnglishV2;
-extern BasicFTSLanguage languageFrenchV2;
} // namespace fts
} // namespace mongo
diff --git a/src/mongo/db/fts/fts_language_test.cpp b/src/mongo/db/fts/fts_language_test.cpp
index 29166d88319..a9e8fdd3a24 100644
--- a/src/mongo/db/fts/fts_language_test.cpp
+++ b/src/mongo/db/fts/fts_language_test.cpp
@@ -31,149 +31,51 @@
#include "mongo/db/fts/fts_spec.h"
#include "mongo/platform/basic.h"
#include "mongo/unittest/unittest.h"
+#include "mongo/util/assert_util.h"
namespace mongo {
namespace fts {
-// Positive tests for FTSLanguage::make() with TEXT_INDEX_VERSION_3.
-
-TEST(FTSLanguageV3, ExactLanguage) {
- StatusWithFTSLanguage swl = FTSLanguage::make("spanish", TEXT_INDEX_VERSION_3);
- ASSERT(swl.getStatus().isOK());
- ASSERT_EQUALS(swl.getValue()->str(), "spanish");
-}
-
-TEST(FTSLanguageV3, ExactCode) {
- StatusWithFTSLanguage swl = FTSLanguage::make("es", TEXT_INDEX_VERSION_3);
- ASSERT(swl.getStatus().isOK());
- ASSERT_EQUALS(swl.getValue()->str(), "spanish");
-}
-
-TEST(FTSLanguageV3, UpperCaseLanguage) {
- StatusWithFTSLanguage swl = FTSLanguage::make("SPANISH", TEXT_INDEX_VERSION_3);
- ASSERT(swl.getStatus().isOK());
- ASSERT_EQUALS(swl.getValue()->str(), "spanish");
-}
-
-TEST(FTSLanguageV3, UpperCaseCode) {
- StatusWithFTSLanguage swl = FTSLanguage::make("ES", TEXT_INDEX_VERSION_3);
- ASSERT(swl.getStatus().isOK());
- ASSERT_EQUALS(swl.getValue()->str(), "spanish");
-}
-
-TEST(FTSLanguageV3, NoneLanguage) {
- StatusWithFTSLanguage swl = FTSLanguage::make("none", TEXT_INDEX_VERSION_3);
- ASSERT(swl.getStatus().isOK());
- ASSERT_EQUALS(swl.getValue()->str(), "none");
-}
-
-// Negative tests for FTSLanguage::make() with TEXT_INDEX_VERSION_3.
-
-TEST(FTSLanguageV3, Empty) {
- StatusWithFTSLanguage swl = FTSLanguage::make("", TEXT_INDEX_VERSION_3);
- ASSERT(!swl.getStatus().isOK());
-}
-
-TEST(FTSLanguageV3, Unknown) {
- StatusWithFTSLanguage swl = FTSLanguage::make("spanglish", TEXT_INDEX_VERSION_3);
- ASSERT(!swl.getStatus().isOK());
-}
-
-// Positive tests for FTSLanguage::make() with TEXT_INDEX_VERSION_2.
-
-TEST(FTSLanguageV2, ExactLanguage) {
- StatusWithFTSLanguage swl = FTSLanguage::make("spanish", TEXT_INDEX_VERSION_2);
- ASSERT(swl.getStatus().isOK());
- ASSERT_EQUALS(swl.getValue()->str(), "spanish");
-}
-
-TEST(FTSLanguageV2, ExactCode) {
- StatusWithFTSLanguage swl = FTSLanguage::make("es", TEXT_INDEX_VERSION_2);
- ASSERT(swl.getStatus().isOK());
- ASSERT_EQUALS(swl.getValue()->str(), "spanish");
-}
-
-TEST(FTSLanguageV2, UpperCaseLanguage) {
- StatusWithFTSLanguage swl = FTSLanguage::make("SPANISH", TEXT_INDEX_VERSION_2);
- ASSERT(swl.getStatus().isOK());
- ASSERT_EQUALS(swl.getValue()->str(), "spanish");
-}
-
-TEST(FTSLanguageV2, UpperCaseCode) {
- StatusWithFTSLanguage swl = FTSLanguage::make("ES", TEXT_INDEX_VERSION_2);
- ASSERT(swl.getStatus().isOK());
- ASSERT_EQUALS(swl.getValue()->str(), "spanish");
-}
-
-TEST(FTSLanguageV2, NoneLanguage) {
- StatusWithFTSLanguage swl = FTSLanguage::make("none", TEXT_INDEX_VERSION_2);
- ASSERT(swl.getStatus().isOK());
- ASSERT_EQUALS(swl.getValue()->str(), "none");
-}
-
-// Negative tests for FTSLanguage::make() with TEXT_INDEX_VERSION_2.
-
-TEST(FTSLanguageV2, Unknown) {
- StatusWithFTSLanguage swl = FTSLanguage::make("spanglish", TEXT_INDEX_VERSION_2);
- ASSERT(!swl.getStatus().isOK());
-}
-
-TEST(FTSLanguageV2, Empty) {
- StatusWithFTSLanguage swl = FTSLanguage::make("", TEXT_INDEX_VERSION_2);
- ASSERT(!swl.getStatus().isOK());
-}
-
-// Positive tests for FTSLanguage::make() with TEXT_INDEX_VERSION_1.
-
-TEST(FTSLanguageV1, ExactLanguage) {
- StatusWithFTSLanguage swl = FTSLanguage::make("spanish", TEXT_INDEX_VERSION_1);
- ASSERT(swl.getStatus().isOK());
- ASSERT_EQUALS(swl.getValue()->str(), "spanish");
-}
-
-TEST(FTSLanguageV1, DeprecatedLanguage) {
- StatusWithFTSLanguage swl = FTSLanguage::make("porter", TEXT_INDEX_VERSION_1);
- ASSERT(swl.getStatus().isOK());
- ASSERT_EQUALS(swl.getValue()->str(), "porter");
-}
-
-TEST(FTSLanguageV1, StemmerOnlyLanguage1) {
- StatusWithFTSLanguage swl = FTSLanguage::make("en", TEXT_INDEX_VERSION_1);
- ASSERT(swl.getStatus().isOK());
- ASSERT_EQUALS(swl.getValue()->str(), "en");
-}
-
-TEST(FTSLanguageV1, StemmerOnlyLanguage2) {
- StatusWithFTSLanguage swl = FTSLanguage::make("eng", TEXT_INDEX_VERSION_1);
- ASSERT(swl.getStatus().isOK());
- ASSERT_EQUALS(swl.getValue()->str(), "eng");
-}
-
-TEST(FTSLanguageV1, NoneLanguage) {
- StatusWithFTSLanguage swl = FTSLanguage::make("none", TEXT_INDEX_VERSION_1);
- ASSERT(swl.getStatus().isOK());
- ASSERT_EQUALS(swl.getValue()->str(), "none");
-}
-
-// Negative tests for FTSLanguage::make() with TEXT_INDEX_VERSION_1.
-
-TEST(FTSLanguageV1, CaseSensitive) {
- StatusWithFTSLanguage swl = FTSLanguage::make("SPANISH", TEXT_INDEX_VERSION_1);
- ASSERT(swl.getStatus().isOK());
- ASSERT_EQUALS(swl.getValue()->str(), "none");
-}
-
-TEST(FTSLanguageV1, Unknown) {
- StatusWithFTSLanguage swl = FTSLanguage::make("asdf", TEXT_INDEX_VERSION_1);
- ASSERT(swl.getStatus().isOK());
- ASSERT_EQUALS(swl.getValue()->str(), "none");
-}
-
-TEST(FTSLanguageV1, Empty) {
- StatusWithFTSLanguage swl = FTSLanguage::make("", TEXT_INDEX_VERSION_1);
- ASSERT(swl.getStatus().isOK());
- ASSERT_EQUALS(swl.getValue()->str(), "none");
-}
+namespace {
+
+using LanguageMakeException = mongo::ExceptionFor<ErrorCodes::BadValue>;
+
+TEST(FTSLanguageV3, Make) {
+ static constexpr auto kVer = TEXT_INDEX_VERSION_3;
+ ASSERT_EQUALS(FTSLanguage::make("spanish", kVer).str(), "spanish");
+ ASSERT_EQUALS(FTSLanguage::make("es", kVer).str(), "spanish");
+ ASSERT_EQUALS(FTSLanguage::make("SPANISH", kVer).str(), "spanish");
+ ASSERT_EQUALS(FTSLanguage::make("ES", kVer).str(), "spanish");
+ ASSERT_EQUALS(FTSLanguage::make("none", kVer).str(), "none");
+ ASSERT_THROWS(FTSLanguage::make("", kVer), LanguageMakeException);
+ ASSERT_THROWS(FTSLanguage::make("spanglish", kVer), LanguageMakeException);
+}
+
+TEST(FTSLanguageV2, Make) {
+ static constexpr auto kVer = TEXT_INDEX_VERSION_2;
+ ASSERT_EQUALS(FTSLanguage::make("spanish", kVer).str(), "spanish");
+ ASSERT_EQUALS(FTSLanguage::make("es", kVer).str(), "spanish");
+ ASSERT_EQUALS(FTSLanguage::make("SPANISH", kVer).str(), "spanish");
+ ASSERT_EQUALS(FTSLanguage::make("ES", kVer).str(), "spanish");
+ ASSERT_EQUALS(FTSLanguage::make("none", kVer).str(), "none");
+ ASSERT_THROWS(FTSLanguage::make("spanglish", kVer), LanguageMakeException);
+ ASSERT_THROWS(FTSLanguage::make("", kVer), LanguageMakeException);
+}
+
+TEST(FTSLanguageV1, Make) {
+ static constexpr auto kVer = TEXT_INDEX_VERSION_1;
+ ASSERT_EQUALS(FTSLanguage::make("spanish", kVer).str(), "spanish");
+ ASSERT_EQUALS(FTSLanguage::make("porter", kVer).str(), "porter") << "deprecated";
+ ASSERT_EQUALS(FTSLanguage::make("en", kVer).str(), "en");
+ ASSERT_EQUALS(FTSLanguage::make("eng", kVer).str(), "eng");
+ ASSERT_EQUALS(FTSLanguage::make("none", kVer).str(), "none");
+ // Negative V1 tests
+ ASSERT_EQUALS(FTSLanguage::make("SPANISH", kVer).str(), "none") << "case sensitive";
+ ASSERT_EQUALS(FTSLanguage::make("asdf", kVer).str(), "none") << "unknown";
+ ASSERT_EQUALS(FTSLanguage::make("", kVer).str(), "none");
+}
+
+} // namespace
} // namespace fts
} // namespace mongo
diff --git a/src/mongo/db/fts/fts_query_impl.cpp b/src/mongo/db/fts/fts_query_impl.cpp
index a60ee888e66..e996a12862c 100644
--- a/src/mongo/db/fts/fts_query_impl.cpp
+++ b/src/mongo/db/fts/fts_query_impl.cpp
@@ -48,9 +48,11 @@ using std::stringstream;
using std::vector;
Status FTSQueryImpl::parse(TextIndexVersion textIndexVersion) {
- StatusWithFTSLanguage ftsLanguage = FTSLanguage::make(getLanguage(), textIndexVersion);
- if (!ftsLanguage.getStatus().isOK()) {
- return ftsLanguage.getStatus();
+ const FTSLanguage* ftsLanguage;
+ try {
+ ftsLanguage = &FTSLanguage::make(getLanguage(), textIndexVersion);
+ } catch (const DBException& e) {
+ return e.toStatus();
}
// Build a space delimited list of words to have the FtsTokenizer tokenize
@@ -128,7 +130,7 @@ Status FTSQueryImpl::parse(TextIndexVersion textIndexVersion) {
}
}
- std::unique_ptr<FTSTokenizer> tokenizer(ftsLanguage.getValue()->createTokenizer());
+ std::unique_ptr<FTSTokenizer> tokenizer = ftsLanguage->createTokenizer();
_addTerms(tokenizer.get(), positiveTermSentence, false);
_addTerms(tokenizer.get(), negativeTermSentence, true);
diff --git a/src/mongo/db/fts/fts_spec.cpp b/src/mongo/db/fts/fts_spec.cpp
index c358ba4b679..aeaacd08e21 100644
--- a/src/mongo/db/fts/fts_spec.cpp
+++ b/src/mongo/db/fts/fts_spec.cpp
@@ -97,17 +97,17 @@ FTSSpec::FTSSpec(const BSONObj& indexInfo) {
// Initialize _defaultLanguage. Note that the FTSLanguage constructor requires
// textIndexVersion, since language parsing is version-specific.
auto indexLanguage = indexInfo["default_language"].String();
- auto swl = FTSLanguage::make(indexLanguage, _textIndexVersion);
-
- // This can fail if the user originally created the text index under an instance of
- // MongoDB that supports different languages then the current instance
- // TODO: consder propagating the index ns to here to improve the error message
- uassert(28682,
- str::stream() << "Unrecognized language " << indexLanguage
- << " found for text index. Verify mongod was started with the"
- " correct options.",
- swl.getStatus().isOK());
- _defaultLanguage = swl.getValue();
+ try {
+ _defaultLanguage = &FTSLanguage::make(indexLanguage, _textIndexVersion);
+ } catch (const DBException& ex) {
+ // This can fail if the user originally created the text index under an instance of
+ // MongoDB that supports different languages than the current instance
+ // TODO: consider propagating the index ns to here to improve the error message
+ uasserted(28682,
+ str::stream() << "Unrecognized language " << indexLanguage
+ << " found for text index. Verify mongod was started with the"
+ " correct options.");
+ }
_languageOverrideField = indexInfo["language_override"].valuestrsafe();
@@ -163,9 +163,11 @@ const FTSLanguage* FTSSpec::_getLanguageToUseV2(const BSONObj& userDoc,
uassert(17261,
"found language override field in document with non-string type",
e.type() == mongo::String);
- StatusWithFTSLanguage swl = FTSLanguage::make(e.String(), getTextIndexVersion());
- uassert(17262, "language override unsupported: " + e.String(), swl.getStatus().isOK());
- return swl.getValue();
+ try {
+ return &FTSLanguage::make(e.String(), getTextIndexVersion());
+ } catch (DBException& ex) {
+ uasserted(17262, "language override unsupported: " + e.String());
+ }
}
void FTSSpec::scoreDocument(const BSONObj& obj, TermFrequencyMap* term_freqs) const {
@@ -439,7 +441,9 @@ StatusWith<BSONObj> FTSSpec::fixSpec(const BSONObj& spec) {
return {ErrorCodes::CannotCreateIndex, "default_language needs a string type"};
}
- if (!FTSLanguage::make(default_language, TEXT_INDEX_VERSION_3).getStatus().isOK()) {
+ try {
+ FTSLanguage::make(default_language, TEXT_INDEX_VERSION_3);
+ } catch (DBException& ex) {
return {ErrorCodes::CannotCreateIndex, "default_language is not valid"};
}
diff --git a/src/mongo/db/fts/fts_spec_legacy.cpp b/src/mongo/db/fts/fts_spec_legacy.cpp
index 1d58c1da750..06ed2e17088 100644
--- a/src/mongo/db/fts/fts_spec_legacy.cpp
+++ b/src/mongo/db/fts/fts_spec_legacy.cpp
@@ -55,9 +55,8 @@ const FTSLanguage& FTSSpec::_getLanguageToUseV1(const BSONObj& userDoc) const {
if (e.type() == String) {
const char* x = e.valuestrsafe();
if (strlen(x) > 0) {
- StatusWithFTSLanguage swl = FTSLanguage::make(x, TEXT_INDEX_VERSION_1);
- dassert(swl.isOK()); // make() w/ TEXT_INDEX_VERSION_1 guaranteed to not fail.
- return *swl.getValue();
+ // make() with TEXT_INDEX_VERSION_1 is guaranteed not to fail.
+ return FTSLanguage::make(x, TEXT_INDEX_VERSION_1);
}
}
return *_defaultLanguage;
diff --git a/src/mongo/db/fts/fts_unicode_tokenizer_test.cpp b/src/mongo/db/fts/fts_unicode_tokenizer_test.cpp
index 7ed921b57d4..9499149ad19 100644
--- a/src/mongo/db/fts/fts_unicode_tokenizer_test.cpp
+++ b/src/mongo/db/fts/fts_unicode_tokenizer_test.cpp
@@ -39,10 +39,7 @@ namespace fts {
std::vector<std::string> tokenizeString(const char* str,
const char* language,
FTSTokenizer::Options options) {
- StatusWithFTSLanguage swl = FTSLanguage::make(language, TEXT_INDEX_VERSION_3);
- ASSERT_OK(swl);
-
- UnicodeFTSTokenizer tokenizer(swl.getValue());
+ UnicodeFTSTokenizer tokenizer(&FTSLanguage::make(language, TEXT_INDEX_VERSION_3));
tokenizer.reset(str, options);
diff --git a/src/mongo/db/fts/stemmer_test.cpp b/src/mongo/db/fts/stemmer_test.cpp
index be09fe34b8c..b95e0949f1f 100644
--- a/src/mongo/db/fts/stemmer_test.cpp
+++ b/src/mongo/db/fts/stemmer_test.cpp
@@ -36,14 +36,23 @@
namespace mongo {
namespace fts {
+namespace {
+// Test-local accessors for the languages used by the stemmer tests. Each call looks the
+// language up through FTSLanguage::make() and returns the address of the result.
+// NOTE(review): assumes make() returns a reference to an object that outlives the call;
+// confirm against fts_language.h.
+
+// "english" under TEXT_INDEX_VERSION_2.
+const FTSLanguage* languageEnglishV2() {
+ return &FTSLanguage::make("english", TEXT_INDEX_VERSION_2);
+}
+// "porter" under TEXT_INDEX_VERSION_1.
+const FTSLanguage* languagePorterV1() {
+ return &FTSLanguage::make("porter", TEXT_INDEX_VERSION_1);
+}
+} // namespace
+
TEST(English, Stemmer1) {
- Stemmer s(&languageEnglishV2);
+ Stemmer s(languageEnglishV2());
ASSERT_EQUALS("run", s.stem("running"));
ASSERT_EQUALS("Run", s.stem("Running"));
}
TEST(English, Caps) {
- Stemmer s(&languagePorterV1);
+ Stemmer s(languagePorterV1());
ASSERT_EQUALS("unit", s.stem("united"));
ASSERT_EQUALS("Unite", s.stem("United"));
}
diff --git a/src/mongo/db/fts/stop_words_test.cpp b/src/mongo/db/fts/stop_words_test.cpp
index f0fb8ec37b8..f35f350af35 100644
--- a/src/mongo/db/fts/stop_words_test.cpp
+++ b/src/mongo/db/fts/stop_words_test.cpp
@@ -37,7 +37,8 @@ namespace mongo {
namespace fts {
TEST(English, Basic1) {
- const StopWords* englishStopWords = StopWords::getStopWords(&languageEnglishV2);
+ const FTSLanguage* lang = &FTSLanguage::make("english", TEXT_INDEX_VERSION_2);
+ const StopWords* englishStopWords = StopWords::getStopWords(lang);
ASSERT(englishStopWords->isStopWord("the"));
ASSERT(!englishStopWords->isStopWord("computer"));
}
diff --git a/src/mongo/db/fts/tokenizer_test.cpp b/src/mongo/db/fts/tokenizer_test.cpp
index db61f3abc7d..db0a1c272af 100644
--- a/src/mongo/db/fts/tokenizer_test.cpp
+++ b/src/mongo/db/fts/tokenizer_test.cpp
@@ -36,13 +36,22 @@
namespace mongo {
namespace fts {
+namespace {
+// Test-local accessors for the languages used by the tokenizer tests. Each call looks the
+// language up through FTSLanguage::make() and returns the address of the result.
+// NOTE(review): assumes make() returns a reference to an object that outlives the call;
+// confirm against fts_language.h.
+
+// "english" under TEXT_INDEX_VERSION_2.
+const FTSLanguage* languageEnglishV2() {
+ return &FTSLanguage::make("english", TEXT_INDEX_VERSION_2);
+}
+// "french" under TEXT_INDEX_VERSION_2.
+const FTSLanguage* languageFrenchV2() {
+ return &FTSLanguage::make("french", TEXT_INDEX_VERSION_2);
+}
+} // namespace
+
TEST(Tokenizer, Empty1) {
- Tokenizer i(&languageEnglishV2, "");
+ Tokenizer i(languageEnglishV2(), "");
ASSERT(!i.more());
}
TEST(Tokenizer, Basic1) {
- Tokenizer i(&languageEnglishV2, "blue red green");
+ Tokenizer i(languageEnglishV2(), "blue red green");
ASSERT(i.more());
ASSERT_EQUALS(i.next().data.toString(), "blue");
@@ -57,7 +66,7 @@ TEST(Tokenizer, Basic1) {
}
TEST(Tokenizer, Basic2) {
- Tokenizer i(&languageEnglishV2, "blue-red");
+ Tokenizer i(languageEnglishV2(), "blue-red");
Token a = i.next();
Token b = i.next();
@@ -75,7 +84,7 @@ TEST(Tokenizer, Basic2) {
}
TEST(Tokenizer, Basic3) {
- Tokenizer i(&languageEnglishV2, "blue -red");
+ Tokenizer i(languageEnglishV2(), "blue -red");
Token a = i.next();
Token b = i.next();
@@ -97,7 +106,7 @@ TEST(Tokenizer, Basic3) {
}
TEST(Tokenizer, Quote1English) {
- Tokenizer i(&languageEnglishV2, "eliot's car");
+ Tokenizer i(languageEnglishV2(), "eliot's car");
Token a = i.next();
Token b = i.next();
@@ -107,7 +116,7 @@ TEST(Tokenizer, Quote1English) {
}
TEST(Tokenizer, Quote1French) {
- Tokenizer i(&languageFrenchV2, "eliot's car");
+ Tokenizer i(languageFrenchV2(), "eliot's car");
Token a = i.next();
Token b = i.next();