summary refs log tree commit diff
path: root/src/mongo/db/fts
diff options
context:
space:
mode:
author Adam Chelminski <adam.chelminski@mongodb.com> 2015-07-29 15:05:21 -0400
committer Adam Chelminski <adam.chelminski@mongodb.com> 2015-08-11 16:56:55 -0400
commit 92eac3b57d8beaf063fced8839cd870f97826bb7 (patch)
tree 0db84953876345d4725576538c14783cb81391e9 /src/mongo/db/fts
parent 657343ccff986bd2f8c46fc7455db4238e8801d1 (diff)
download mongo-92eac3b57d8beaf063fced8839cd870f97826bb7.tar.gz
SERVER-19557 Add text index v3
Diffstat (limited to 'src/mongo/db/fts')
-rw-r--r-- src/mongo/db/fts/fts_index_format.cpp 8
-rw-r--r-- src/mongo/db/fts/fts_language.cpp 232
-rw-r--r-- src/mongo/db/fts/fts_language.h 38
-rw-r--r-- src/mongo/db/fts/fts_language_test.cpp 44
-rw-r--r-- src/mongo/db/fts/fts_matcher.cpp 38
-rw-r--r-- src/mongo/db/fts/fts_matcher.h 9
-rw-r--r-- src/mongo/db/fts/fts_matcher_test.cpp 22
-rw-r--r-- src/mongo/db/fts/fts_query.cpp 25
-rw-r--r-- src/mongo/db/fts/fts_query.h 9
-rw-r--r-- src/mongo/db/fts/fts_query_test.cpp 54
-rw-r--r-- src/mongo/db/fts/fts_spec.cpp 38
-rw-r--r-- src/mongo/db/fts/fts_spec_test.cpp 5
-rw-r--r-- src/mongo/db/fts/fts_unicode_tokenizer_test.cpp 5
-rw-r--r-- src/mongo/db/fts/fts_util.h 3
14 files changed, 348 insertions, 182 deletions
diff --git a/src/mongo/db/fts/fts_index_format.cpp b/src/mongo/db/fts/fts_index_format.cpp
index f7110d80858..dcf96e25126 100644
--- a/src/mongo/db/fts/fts_index_format.cpp
+++ b/src/mongo/db/fts/fts_index_format.cpp
@@ -64,14 +64,14 @@ const size_t termKeyLength = termKeyPrefixLength + termKeySuffixLength;
/**
* Returns size of buffer required to store term in index key.
* In version 1, terms are stored verbatim in key.
- * In version 2, terms longer than 32 characters are hashed and combined
+ * In version 2 and above, terms longer than 32 characters are hashed and combined
* with a prefix.
*/
int guessTermSize(const std::string& term, TextIndexVersion textIndexVersion) {
if (TEXT_INDEX_VERSION_1 == textIndexVersion) {
return term.size();
} else {
- invariant(TEXT_INDEX_VERSION_2 == textIndexVersion);
+ invariant(TEXT_INDEX_VERSION_2 <= textIndexVersion);
if (term.size() <= termKeyPrefixLength) {
return term.size();
}
@@ -184,9 +184,9 @@ void FTSIndexFormat::_appendIndexKey(BSONObjBuilder& b,
b.append("", weight);
}
// See comments at the top of file for termKeyPrefixLength.
- // Apply hash for text index version 2 to long terms (longer than 32 characters).
+ // Apply hash for text index version 2 and above to long terms (longer than 32 characters).
else {
- invariant(TEXT_INDEX_VERSION_2 == textIndexVersion);
+ invariant(TEXT_INDEX_VERSION_2 <= textIndexVersion);
if (term.size() <= termKeyPrefixLength) {
b.append("", term);
} else {
diff --git a/src/mongo/db/fts/fts_language.cpp b/src/mongo/db/fts/fts_language.cpp
index 1180cfa17b1..b01e9de6508 100644
--- a/src/mongo/db/fts/fts_language.cpp
+++ b/src/mongo/db/fts/fts_language.cpp
@@ -35,6 +35,8 @@
#include "mongo/base/init.h"
#include "mongo/db/fts/fts_basic_phrase_matcher.h"
#include "mongo/db/fts/fts_basic_tokenizer.h"
+#include "mongo/db/fts/fts_unicode_phrase_matcher.h"
+#include "mongo/db/fts/fts_unicode_tokenizer.h"
#include "mongo/stdx/memory.h"
#include "mongo/util/assert_util.h"
#include "mongo/util/mongoutils/str.h"
@@ -70,48 +72,101 @@ struct LanguageStringCompare {
}
};
-// Lookup table from user language string (case-insensitive) to FTSLanguage. Populated
-// by initializers in group FTSAllLanguagesRegistered and initializer
-// FTSRegisterLanguageAliases. For use with TEXT_INDEX_VERSION_2 text indexes only.
-typedef std::map<std::string, const FTSLanguage*, LanguageStringCompare> LanguageMapV2;
-LanguageMapV2 languageMapV2;
+// Lookup table from user language string (case-insensitive) to FTSLanguage.
+// Populated by initializer FTSRegisterV2LanguagesAndLater and initializer
+// FTSRegisterLanguageAliases. For use with TEXT_INDEX_VERSION_2 text indexes and above.
+typedef std::map<std::string, const FTSLanguage*, LanguageStringCompare> LanguageMap;
+
+LanguageMap languageMapV3;
+LanguageMap languageMapV2;
// Like languageMapV2, but for use with TEXT_INDEX_VERSION_1 text indexes.
// Case-sensitive by lookup key.
-typedef std::map<StringData, const FTSLanguage*> LanguageMapV1;
-LanguageMapV1 languageMapV1;
+typedef std::map<StringData, const FTSLanguage*> LanguageMapLegacy;
+LanguageMapLegacy languageMapV1;
}
-std::unique_ptr<FTSTokenizer> BasicFTSLanguage::createTokenizer() const {
- return stdx::make_unique<BasicFTSTokenizer>(this);
-}
+MONGO_INITIALIZER_GROUP(FTSAllLanguagesRegistered, MONGO_NO_PREREQUISITES, MONGO_NO_DEPENDENTS);
-const FTSPhraseMatcher& BasicFTSLanguage::getPhraseMatcher() const {
- return _basicPhraseMatcher;
-}
+// FTS Language map. These languages are available with TEXT_INDEX_VERSION_2 and above.
+//
+// Parameters:
+// - C++ unique identifier suffix
+// - lower case string name
+// - language alias
+//
+#define MONGO_FTS_LANGUAGE_LIST(MONGO_FTS_LANGUAGE_DECL) \
+ MONGO_FTS_LANGUAGE_DECL(Danish, "danish", "da") \
+ MONGO_FTS_LANGUAGE_DECL(Dutch, "dutch", "nl") \
+ MONGO_FTS_LANGUAGE_DECL(English, "english", "en") \
+ MONGO_FTS_LANGUAGE_DECL(Finnish, "finnish", "fi") \
+ MONGO_FTS_LANGUAGE_DECL(French, "french", "fr") \
+ MONGO_FTS_LANGUAGE_DECL(German, "german", "de") \
+ MONGO_FTS_LANGUAGE_DECL(Hungarian, "hungarian", "hu") \
+ MONGO_FTS_LANGUAGE_DECL(Italian, "italian", "it") \
+ MONGO_FTS_LANGUAGE_DECL(Norwegian, "norwegian", "nb") \
+ MONGO_FTS_LANGUAGE_DECL(Portuguese, "portuguese", "pt") \
+ MONGO_FTS_LANGUAGE_DECL(Romanian, "romanian", "ro") \
+ MONGO_FTS_LANGUAGE_DECL(Russian, "russian", "ru") \
+ MONGO_FTS_LANGUAGE_DECL(Spanish, "spanish", "es") \
+ MONGO_FTS_LANGUAGE_DECL(Swedish, "swedish", "sv") \
+ MONGO_FTS_LANGUAGE_DECL(Turkish, "turkish", "tr")
-MONGO_INITIALIZER_GROUP(FTSAllLanguagesRegistered, MONGO_NO_PREREQUISITES, MONGO_NO_DEPENDENTS);
+// Declare compilation unit local language objects.
+// Must be declared statically, as the global language map only keeps a pointer to the language
+// instance.
//
-// Register supported languages' canonical names for TEXT_INDEX_VERSION_2.
+#define LANGUAGE_DECLV2(id, name, alias) BasicFTSLanguage language##id##V2;
+
+#define LANGUAGE_DECLV3(id, name, alias) UnicodeFTSLanguage language##id##V3(name);
+
+BasicFTSLanguage languageNoneV2;
+MONGO_FTS_LANGUAGE_LIST(LANGUAGE_DECLV2);
+
+UnicodeFTSLanguage languageNoneV3("none");
+MONGO_FTS_LANGUAGE_LIST(LANGUAGE_DECLV3);
+
+// Registers each language and language aliases in the language map.
//
+#define LANGUAGE_INITV2(id, name, alias) \
+ FTSLanguage::registerLanguage(name, TEXT_INDEX_VERSION_2, &language##id##V2);
+
+#define LANGUAGE_INITV3(id, name, alias) \
+ FTSLanguage::registerLanguage(name, TEXT_INDEX_VERSION_3, &language##id##V3);
-MONGO_FTS_LANGUAGE_DECLARE(languageNoneV2, "none", TEXT_INDEX_VERSION_2);
-MONGO_FTS_LANGUAGE_DECLARE(languageDanishV2, "danish", TEXT_INDEX_VERSION_2);
-MONGO_FTS_LANGUAGE_DECLARE(languageDutchV2, "dutch", TEXT_INDEX_VERSION_2);
-MONGO_FTS_LANGUAGE_DECLARE(languageEnglishV2, "english", TEXT_INDEX_VERSION_2);
-MONGO_FTS_LANGUAGE_DECLARE(languageFinnishV2, "finnish", TEXT_INDEX_VERSION_2);
-MONGO_FTS_LANGUAGE_DECLARE(languageFrenchV2, "french", TEXT_INDEX_VERSION_2);
-MONGO_FTS_LANGUAGE_DECLARE(languageGermanV2, "german", TEXT_INDEX_VERSION_2);
-MONGO_FTS_LANGUAGE_DECLARE(languageHungarianV2, "hungarian", TEXT_INDEX_VERSION_2);
-MONGO_FTS_LANGUAGE_DECLARE(languageItalianV2, "italian", TEXT_INDEX_VERSION_2);
-MONGO_FTS_LANGUAGE_DECLARE(languageNorwegianV2, "norwegian", TEXT_INDEX_VERSION_2);
-MONGO_FTS_LANGUAGE_DECLARE(languagePortugueseV2, "portuguese", TEXT_INDEX_VERSION_2);
-MONGO_FTS_LANGUAGE_DECLARE(languageRomanianV2, "romanian", TEXT_INDEX_VERSION_2);
-MONGO_FTS_LANGUAGE_DECLARE(languageRussianV2, "russian", TEXT_INDEX_VERSION_2);
-MONGO_FTS_LANGUAGE_DECLARE(languageSpanishV2, "spanish", TEXT_INDEX_VERSION_2);
-MONGO_FTS_LANGUAGE_DECLARE(languageSwedishV2, "swedish", TEXT_INDEX_VERSION_2);
-MONGO_FTS_LANGUAGE_DECLARE(languageTurkishV2, "turkish", TEXT_INDEX_VERSION_2);
+/**
+ * Registers each language in the language map.
+ */
+MONGO_INITIALIZER_GENERAL(FTSRegisterV2LanguagesAndLater,
+ MONGO_NO_PREREQUISITES,
+ ("FTSAllLanguagesRegistered"))
+(::mongo::InitializerContext* context) {
+ FTSLanguage::registerLanguage("none", TEXT_INDEX_VERSION_2, &languageNoneV2);
+ MONGO_FTS_LANGUAGE_LIST(LANGUAGE_INITV2);
+
+ FTSLanguage::registerLanguage("none", TEXT_INDEX_VERSION_3, &languageNoneV3);
+ MONGO_FTS_LANGUAGE_LIST(LANGUAGE_INITV3);
+ return Status::OK();
+}
+
+#define LANGUAGE_ALIASV2(id, name, alias) \
+ FTSLanguage::registerLanguageAlias(&language##id##V2, alias, TEXT_INDEX_VERSION_2);
+
+#define LANGUAGE_ALIASV3(id, name, alias) \
+ FTSLanguage::registerLanguageAlias(&language##id##V3, alias, TEXT_INDEX_VERSION_3);
+
+/**
+ * Registers each language alias in the language map.
+ */
+MONGO_INITIALIZER_WITH_PREREQUISITES(FTSRegisterLanguageAliases, ("FTSAllLanguagesRegistered"))
+(InitializerContext* context) {
+ // Register language aliases for TEXT_INDEX_VERSION_2.
+ MONGO_FTS_LANGUAGE_LIST(LANGUAGE_ALIASV2);
+ // Register language aliases for TEXT_INDEX_VERSION_3.
+ MONGO_FTS_LANGUAGE_LIST(LANGUAGE_ALIASV3);
+ return Status::OK();
+}
//
// Register all Snowball language modules for TEXT_INDEX_VERSION_1. Note that only the full
@@ -172,59 +227,39 @@ MONGO_FTS_LANGUAGE_DECLARE(languageTrV1, "tr", TEXT_INDEX_VERSION_1);
MONGO_FTS_LANGUAGE_DECLARE(languageTurV1, "tur", TEXT_INDEX_VERSION_1);
MONGO_FTS_LANGUAGE_DECLARE(languageTurkishV1, "turkish", TEXT_INDEX_VERSION_1);
-MONGO_INITIALIZER_WITH_PREREQUISITES(FTSRegisterLanguageAliases, ("FTSAllLanguagesRegistered"))
-(InitializerContext* context) {
- // Register language aliases for TEXT_INDEX_VERSION_2.
- FTSLanguage::registerLanguageAlias(&languageDanishV2, "da", TEXT_INDEX_VERSION_2);
- FTSLanguage::registerLanguageAlias(&languageDutchV2, "nl", TEXT_INDEX_VERSION_2);
- FTSLanguage::registerLanguageAlias(&languageEnglishV2, "en", TEXT_INDEX_VERSION_2);
- FTSLanguage::registerLanguageAlias(&languageFinnishV2, "fi", TEXT_INDEX_VERSION_2);
- FTSLanguage::registerLanguageAlias(&languageFrenchV2, "fr", TEXT_INDEX_VERSION_2);
- FTSLanguage::registerLanguageAlias(&languageGermanV2, "de", TEXT_INDEX_VERSION_2);
- FTSLanguage::registerLanguageAlias(&languageHungarianV2, "hu", TEXT_INDEX_VERSION_2);
- FTSLanguage::registerLanguageAlias(&languageItalianV2, "it", TEXT_INDEX_VERSION_2);
- FTSLanguage::registerLanguageAlias(&languageNorwegianV2, "nb", TEXT_INDEX_VERSION_2);
- FTSLanguage::registerLanguageAlias(&languagePortugueseV2, "pt", TEXT_INDEX_VERSION_2);
- FTSLanguage::registerLanguageAlias(&languageRomanianV2, "ro", TEXT_INDEX_VERSION_2);
- FTSLanguage::registerLanguageAlias(&languageRussianV2, "ru", TEXT_INDEX_VERSION_2);
- FTSLanguage::registerLanguageAlias(&languageSpanishV2, "es", TEXT_INDEX_VERSION_2);
- FTSLanguage::registerLanguageAlias(&languageSwedishV2, "sv", TEXT_INDEX_VERSION_2);
- FTSLanguage::registerLanguageAlias(&languageTurkishV2, "tr", TEXT_INDEX_VERSION_2);
- return Status::OK();
-}
-
// static
void FTSLanguage::registerLanguage(StringData languageName,
TextIndexVersion textIndexVersion,
FTSLanguage* language) {
verify(!languageName.empty());
language->_canonicalName = languageName.toString();
- switch (textIndexVersion) {
- case TEXT_INDEX_VERSION_2:
- languageMapV2[languageName.toString()] = language;
- return;
- case TEXT_INDEX_VERSION_1:
- verify(languageMapV1.find(languageName) == languageMapV1.end());
- languageMapV1[languageName] = language;
- return;
+
+ if (textIndexVersion >= TEXT_INDEX_VERSION_2) {
+ LanguageMap* languageMap =
+ (textIndexVersion == TEXT_INDEX_VERSION_3) ? &languageMapV3 : &languageMapV2;
+ (*languageMap)[languageName.toString()] = language;
+ } else {
+ // Legacy text index.
+ invariant(textIndexVersion == TEXT_INDEX_VERSION_1);
+ verify(languageMapV1.find(languageName) == languageMapV1.end());
+ languageMapV1[languageName] = language;
}
- verify(false);
}
// static
void FTSLanguage::registerLanguageAlias(const FTSLanguage* language,
StringData alias,
TextIndexVersion textIndexVersion) {
- switch (textIndexVersion) {
- case TEXT_INDEX_VERSION_2:
- languageMapV2[alias.toString()] = language;
- return;
- case TEXT_INDEX_VERSION_1:
- verify(languageMapV1.find(alias) == languageMapV1.end());
- languageMapV1[alias] = language;
- return;
+ if (textIndexVersion >= TEXT_INDEX_VERSION_2) {
+ LanguageMap* languageMap =
+ (textIndexVersion == TEXT_INDEX_VERSION_3) ? &languageMapV3 : &languageMapV2;
+ (*languageMap)[alias.toString()] = language;
+ } else {
+ // Legacy text index.
+ invariant(textIndexVersion == TEXT_INDEX_VERSION_1);
+ verify(languageMapV1.find(alias) == languageMapV1.end());
+ languageMapV1[alias] = language;
}
- verify(false);
}
FTSLanguage::FTSLanguage() : _canonicalName() {}
@@ -236,31 +271,48 @@ const std::string& FTSLanguage::str() const {
// static
StatusWithFTSLanguage FTSLanguage::make(StringData langName, TextIndexVersion textIndexVersion) {
- switch (textIndexVersion) {
- case TEXT_INDEX_VERSION_2: {
- LanguageMapV2::const_iterator it = languageMapV2.find(langName.toString());
- if (it == languageMapV2.end()) {
- // TEXT_INDEX_VERSION_2 rejects unrecognized language strings.
- Status status = Status(ErrorCodes::BadValue,
- mongoutils::str::stream() << "unsupported language: \""
- << langName << "\"");
- return StatusWithFTSLanguage(status);
- }
+ if (textIndexVersion >= TEXT_INDEX_VERSION_2) {
+ LanguageMap* languageMap =
+ (textIndexVersion == TEXT_INDEX_VERSION_3) ? &languageMapV3 : &languageMapV2;
+
+ LanguageMap::const_iterator it = languageMap->find(langName.toString());
- return StatusWithFTSLanguage(it->second);
+ if (it == languageMap->end()) {
+ // TEXT_INDEX_VERSION_2 and above reject unrecognized language strings.
+ Status status = Status(ErrorCodes::BadValue,
+ mongoutils::str::stream()
+ << "unsupported language: \"" << langName
+ << "\" for text index version " << textIndexVersion);
+ return StatusWithFTSLanguage(status);
}
- case TEXT_INDEX_VERSION_1: {
- LanguageMapV1::const_iterator it = languageMapV1.find(langName);
- if (it == languageMapV1.end()) {
- // TEXT_INDEX_VERSION_1 treats unrecognized language strings as "none".
- return StatusWithFTSLanguage(&languageNoneV1);
- }
- return StatusWithFTSLanguage(it->second);
+
+ return StatusWithFTSLanguage(it->second);
+ } else {
+ // Legacy text index.
+ invariant(textIndexVersion == TEXT_INDEX_VERSION_1);
+ LanguageMapLegacy::const_iterator it = languageMapV1.find(langName);
+ if (it == languageMapV1.end()) {
+ // TEXT_INDEX_VERSION_1 treats unrecognized language strings as "none".
+ return StatusWithFTSLanguage(&languageNoneV1);
}
+ return StatusWithFTSLanguage(it->second);
}
+}
+
+std::unique_ptr<FTSTokenizer> BasicFTSLanguage::createTokenizer() const {
+ return stdx::make_unique<BasicFTSTokenizer>(this);
+}
+
+const FTSPhraseMatcher& BasicFTSLanguage::getPhraseMatcher() const {
+ return _basicPhraseMatcher;
+}
+
+std::unique_ptr<FTSTokenizer> UnicodeFTSLanguage::createTokenizer() const {
+ return stdx::make_unique<UnicodeFTSTokenizer>(this);
+}
- verify(false);
- return StatusWithFTSLanguage(Status::OK());
+const FTSPhraseMatcher& UnicodeFTSLanguage::getPhraseMatcher() const {
+ return _unicodePhraseMatcher;
}
}
}
diff --git a/src/mongo/db/fts/fts_language.h b/src/mongo/db/fts/fts_language.h
index 6c986f5de6e..062a3255ba1 100644
--- a/src/mongo/db/fts/fts_language.h
+++ b/src/mongo/db/fts/fts_language.h
@@ -32,6 +32,7 @@
#include "mongo/db/fts/fts_basic_phrase_matcher.h"
#include "mongo/db/fts/fts_phrase_matcher.h"
+#include "mongo/db/fts/fts_unicode_phrase_matcher.h"
#include "mongo/db/fts/fts_util.h"
#include "mongo/base/status_with.h"
@@ -43,6 +44,7 @@ namespace fts {
class FTSTokenizer;
+// Legacy language initialization.
#define MONGO_FTS_LANGUAGE_DECLARE(language, name, version) \
BasicFTSLanguage language; \
MONGO_INITIALIZER_GENERAL(language, MONGO_NO_PREREQUISITES, ("FTSAllLanguagesRegistered")) \
@@ -57,7 +59,7 @@ class FTSTokenizer;
*
* Recommended usage:
*
- * StatusWithFTSLanguage swl = FTSLanguage::make( "en", TEXT_INDEX_VERSION_2 );
+ * StatusWithFTSLanguage swl = FTSLanguage::make( "en", TEXT_INDEX_VERSION_3 );
* if ( !swl.getStatus().isOK() ) {
* // Error.
* }
@@ -84,7 +86,7 @@ public:
/**
* Returns a new FTSTokenizer instance for this language.
- * Lifetime is scoped to FTSLanguage (which are currently all process lifetime)
+ * Lifetime is scoped to FTSLanguage (which are currently all process lifetime).
*/
virtual std::unique_ptr<FTSTokenizer> createTokenizer() const = 0;
@@ -94,10 +96,9 @@ public:
virtual const FTSPhraseMatcher& getPhraseMatcher() const = 0;
/**
- * Register std::string 'languageName' as a new language with text index version
+ * Register std::string 'languageName' as a new language with the text index version
* 'textIndexVersion'. Saves the resulting language to out-argument 'languageOut'.
- * Subsequent calls to FTSLanguage::make() will recognize the newly-registered language
- * string.
+ * Subsequent calls to FTSLanguage::make() will recognize the newly-registered language string.
*/
static void registerLanguage(StringData languageName,
TextIndexVersion textIndexVersion,
@@ -113,15 +114,15 @@ public:
TextIndexVersion textIndexVersion);
/**
- * Return the FTSLanguage associated with the given language string. Returns an error
- * Status if an invalid language std::string is passed.
+ * Return the FTSLanguage associated with the given language string and the given text index
+ * version. Returns an error Status if an invalid language std::string is passed.
*
- * For textIndexVersion=TEXT_INDEX_VERSION_2, language strings are
+ * For textIndexVersion >= TEXT_INDEX_VERSION_2, language strings are
* case-insensitive, and need to be in one of the two following forms:
* - English name, like "spanish".
* - Two-letter code, like "es".
*
- * For textIndexVersion=TEXT_INDEX_VERSION_1, no validation or normalization of
+ * For textIndexVersion == TEXT_INDEX_VERSION_1, no validation or normalization of
* language strings is performed. This is necessary to preserve indexing behavior for
* documents with language strings like "en": for compatibility, text data in these
* documents needs to be processed with the English stemmer and the empty stopword list
@@ -137,7 +138,10 @@ private:
typedef StatusWith<const FTSLanguage*> StatusWithFTSLanguage;
-
+/**
+ * FTSLanguage implementation that returns a BasicFTSTokenizer and BasicFTSPhraseMatcher for ASCII
+ * aware case folding in FTS.
+ */
class BasicFTSLanguage : public FTSLanguage {
public:
std::unique_ptr<FTSTokenizer> createTokenizer() const final;
@@ -147,6 +151,20 @@ private:
BasicFTSPhraseMatcher _basicPhraseMatcher;
};
+/**
+ * FTSLanguage implementation that returns a UnicodeFTSTokenizer and UnicodeFTSPhraseMatcher for
+ * Unicode aware case folding and diacritic removal in FTS.
+ */
+class UnicodeFTSLanguage : public FTSLanguage {
+public:
+ UnicodeFTSLanguage(const std::string& languageName) : _unicodePhraseMatcher(languageName) {}
+ std::unique_ptr<FTSTokenizer> createTokenizer() const final;
+ const FTSPhraseMatcher& getPhraseMatcher() const final;
+
+private:
+ UnicodeFTSPhraseMatcher _unicodePhraseMatcher;
+};
+
extern BasicFTSLanguage languagePorterV1;
extern BasicFTSLanguage languageEnglishV2;
extern BasicFTSLanguage languageFrenchV2;
diff --git a/src/mongo/db/fts/fts_language_test.cpp b/src/mongo/db/fts/fts_language_test.cpp
index c24f02ff7fd..87e37272850 100644
--- a/src/mongo/db/fts/fts_language_test.cpp
+++ b/src/mongo/db/fts/fts_language_test.cpp
@@ -37,6 +37,50 @@ namespace mongo {
namespace fts {
+// Positive tests for FTSLanguage::make() with TEXT_INDEX_VERSION_3.
+
+TEST(FTSLanguageV3, ExactLanguage) {
+ StatusWithFTSLanguage swl = FTSLanguage::make("spanish", TEXT_INDEX_VERSION_3);
+ ASSERT(swl.getStatus().isOK());
+ ASSERT_EQUALS(swl.getValue()->str(), "spanish");
+}
+
+TEST(FTSLanguageV3, ExactCode) {
+ StatusWithFTSLanguage swl = FTSLanguage::make("es", TEXT_INDEX_VERSION_3);
+ ASSERT(swl.getStatus().isOK());
+ ASSERT_EQUALS(swl.getValue()->str(), "spanish");
+}
+
+TEST(FTSLanguageV3, UpperCaseLanguage) {
+ StatusWithFTSLanguage swl = FTSLanguage::make("SPANISH", TEXT_INDEX_VERSION_3);
+ ASSERT(swl.getStatus().isOK());
+ ASSERT_EQUALS(swl.getValue()->str(), "spanish");
+}
+
+TEST(FTSLanguageV3, UpperCaseCode) {
+ StatusWithFTSLanguage swl = FTSLanguage::make("ES", TEXT_INDEX_VERSION_3);
+ ASSERT(swl.getStatus().isOK());
+ ASSERT_EQUALS(swl.getValue()->str(), "spanish");
+}
+
+TEST(FTSLanguageV3, NoneLanguage) {
+ StatusWithFTSLanguage swl = FTSLanguage::make("none", TEXT_INDEX_VERSION_3);
+ ASSERT(swl.getStatus().isOK());
+ ASSERT_EQUALS(swl.getValue()->str(), "none");
+}
+
+// Negative tests for FTSLanguage::make() with TEXT_INDEX_VERSION_3.
+
+TEST(FTSLanguageV3, Empty) {
+ StatusWithFTSLanguage swl = FTSLanguage::make("", TEXT_INDEX_VERSION_3);
+ ASSERT(!swl.getStatus().isOK());
+}
+
+TEST(FTSLanguageV3, Unknown) {
+ StatusWithFTSLanguage swl = FTSLanguage::make("spanglish", TEXT_INDEX_VERSION_3);
+ ASSERT(!swl.getStatus().isOK());
+}
+
// Positive tests for FTSLanguage::make() with TEXT_INDEX_VERSION_2.
TEST(FTSLanguageV2, ExactLanguage) {
diff --git a/src/mongo/db/fts/fts_matcher.cpp b/src/mongo/db/fts/fts_matcher.cpp
index a4b2a6e4638..7689f15acf3 100644
--- a/src/mongo/db/fts/fts_matcher.cpp
+++ b/src/mongo/db/fts/fts_matcher.cpp
@@ -80,10 +80,7 @@ bool FTSMatcher::hasPositiveTerm(const BSONObj& obj) const {
bool FTSMatcher::_hasPositiveTerm_string(const FTSLanguage* language, const string& raw) const {
std::unique_ptr<FTSTokenizer> tokenizer(language->createTokenizer());
-
- tokenizer->reset(raw.c_str(),
- _query.getCaseSensitive() ? FTSTokenizer::kGenerateCaseSensitiveTokens
- : FTSTokenizer::kNone);
+ tokenizer->reset(raw.c_str(), _getTokenizerOptions());
while (tokenizer->moveNext()) {
string word = tokenizer->get().toString();
@@ -113,10 +110,7 @@ bool FTSMatcher::hasNegativeTerm(const BSONObj& obj) const {
bool FTSMatcher::_hasNegativeTerm_string(const FTSLanguage* language, const string& raw) const {
std::unique_ptr<FTSTokenizer> tokenizer(language->createTokenizer());
-
- tokenizer->reset(raw.c_str(),
- _query.getCaseSensitive() ? FTSTokenizer::kGenerateCaseSensitiveTokens
- : FTSTokenizer::kNone);
+ tokenizer->reset(raw.c_str(), _getTokenizerOptions());
while (tokenizer->moveNext()) {
string word = tokenizer->get().toString();
@@ -153,16 +147,34 @@ bool FTSMatcher::_phraseMatch(const string& phrase, const BSONObj& obj) const {
while (it.more()) {
FTSIteratorValue val = it.next();
- if (val._language->getPhraseMatcher().phraseMatches(phrase,
- val._text,
- _query.getCaseSensitive()
- ? FTSPhraseMatcher::kCaseSensitive
- : FTSPhraseMatcher::kNone)) {
+ FTSPhraseMatcher::Options matcherOptions = FTSPhraseMatcher::kNone;
+
+ if (_query.getCaseSensitive()) {
+ matcherOptions |= FTSPhraseMatcher::kCaseSensitive;
+ }
+ if (_query.getDiacriticSensitive()) {
+ matcherOptions |= FTSPhraseMatcher::kDiacriticSensitive;
+ }
+
+ if (val._language->getPhraseMatcher().phraseMatches(phrase, val._text, matcherOptions)) {
return true;
}
}
return false;
}
+
+FTSTokenizer::Options FTSMatcher::_getTokenizerOptions() const {
+ FTSTokenizer::Options tokenizerOptions = FTSTokenizer::kNone;
+
+ if (_query.getCaseSensitive()) {
+ tokenizerOptions |= FTSTokenizer::kGenerateCaseSensitiveTokens;
+ }
+ if (_query.getDiacriticSensitive()) {
+ tokenizerOptions |= FTSTokenizer::kGenerateDiacriticSensitiveTokens;
+ }
+
+ return tokenizerOptions;
+}
}
}
diff --git a/src/mongo/db/fts/fts_matcher.h b/src/mongo/db/fts/fts_matcher.h
index 00fe8291c4d..45bfa360b09 100644
--- a/src/mongo/db/fts/fts_matcher.h
+++ b/src/mongo/db/fts/fts_matcher.h
@@ -32,6 +32,7 @@
#include "mongo/db/fts/fts_query.h"
#include "mongo/db/fts/fts_spec.h"
+#include "mongo/db/fts/fts_tokenizer.h"
#include "mongo/db/fts/tokenizer.h"
namespace mongo {
@@ -81,7 +82,7 @@ private:
* check.
*/
bool canSkipPositiveTermCheck() const {
- return !_query.getCaseSensitive();
+ return !_query.getCaseSensitive() && !_query.getDiacriticSensitive();
}
/**
@@ -101,6 +102,12 @@ private:
*/
bool _phraseMatch(const std::string& phrase, const BSONObj& obj) const;
+ /**
+ * Helper method that returns the tokenizer options that this matcher should use, based on
+ * the query options.
+ */
+ FTSTokenizer::Options _getTokenizerOptions() const;
+
// TODO These should be unowned pointers instead of owned copies.
const FTSQuery _query;
const FTSSpec _spec;
diff --git a/src/mongo/db/fts/fts_matcher_test.cpp b/src/mongo/db/fts/fts_matcher_test.cpp
index 13eb74609dc..246510a9e70 100644
--- a/src/mongo/db/fts/fts_matcher_test.cpp
+++ b/src/mongo/db/fts/fts_matcher_test.cpp
@@ -38,7 +38,7 @@ namespace fts {
TEST(FTSMatcher, NegWild1) {
FTSQuery q;
- ASSERT_OK(q.parse("foo -bar", "english", false, TEXT_INDEX_VERSION_2));
+ ASSERT_OK(q.parse("foo -bar", "english", false, false, TEXT_INDEX_VERSION_3));
FTSMatcher m(q,
FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("$**"
<< "text")))));
@@ -52,7 +52,7 @@ TEST(FTSMatcher, NegWild1) {
// Regression test for SERVER-11994.
TEST(FTSMatcher, NegWild2) {
FTSQuery q;
- ASSERT_OK(q.parse("pizza -restaurant", "english", false, TEXT_INDEX_VERSION_2));
+ ASSERT_OK(q.parse("pizza -restaurant", "english", false, false, TEXT_INDEX_VERSION_3));
FTSMatcher m(q,
FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("$**"
<< "text")))));
@@ -65,7 +65,7 @@ TEST(FTSMatcher, NegWild2) {
TEST(FTSMatcher, Phrase1) {
FTSQuery q;
- ASSERT_OK(q.parse("foo \"table top\"", "english", false, TEXT_INDEX_VERSION_2));
+ ASSERT_OK(q.parse("foo \"table top\"", "english", false, false, TEXT_INDEX_VERSION_3));
FTSMatcher m(q,
FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("$**"
<< "text")))));
@@ -87,7 +87,7 @@ TEST(FTSMatcher, Phrase1) {
TEST(FTSMatcher, Phrase2) {
FTSQuery q;
- ASSERT_OK(q.parse("foo \"table top\"", "english", false, TEXT_INDEX_VERSION_2));
+ ASSERT_OK(q.parse("foo \"table top\"", "english", false, false, TEXT_INDEX_VERSION_3));
FTSMatcher m(q,
FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("x"
<< "text")))));
@@ -98,7 +98,7 @@ TEST(FTSMatcher, Phrase2) {
// language.
TEST(FTSMatcher, ParsesUsingDocLanguage) {
FTSQuery q;
- ASSERT_OK(q.parse("-glad", "none", false, TEXT_INDEX_VERSION_2));
+ ASSERT_OK(q.parse("-glad", "none", false, false, TEXT_INDEX_VERSION_3));
FTSMatcher m(q,
FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("x"
<< "text")))));
@@ -112,7 +112,7 @@ TEST(FTSMatcher, ParsesUsingDocLanguage) {
// Test the matcher does not filter out stop words from positive terms
TEST(FTSMatcher, MatcherDoesNotFilterStopWordsNeg) {
FTSQuery q;
- ASSERT_OK(q.parse("-the", "none", false, TEXT_INDEX_VERSION_2));
+ ASSERT_OK(q.parse("-the", "none", false, false, TEXT_INDEX_VERSION_3));
FTSMatcher m(q,
FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("x"
<< "text")))));
@@ -124,7 +124,7 @@ TEST(FTSMatcher, MatcherDoesNotFilterStopWordsNeg) {
// Test the matcher does not filter out stop words from negative terms
TEST(FTSMatcher, MatcherDoesNotFilterStopWordsPos) {
FTSQuery q;
- ASSERT_OK(q.parse("the", "none", false, TEXT_INDEX_VERSION_2));
+ ASSERT_OK(q.parse("the", "none", false, false, TEXT_INDEX_VERSION_3));
FTSMatcher m(q,
FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("x"
<< "text")))));
@@ -137,7 +137,7 @@ TEST(FTSMatcher, MatcherDoesNotFilterStopWordsPos) {
// case-sensitive text query 'search'.
static bool docHasPositiveTermWithCase(const std::string& doc, const std::string& search) {
FTSQuery q;
- ASSERT_OK(q.parse(search, "english", true, TEXT_INDEX_VERSION_2));
+ ASSERT_OK(q.parse(search, "english", true, false, TEXT_INDEX_VERSION_3));
FTSMatcher m(q,
FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("x"
<< "text")))));
@@ -164,7 +164,7 @@ TEST(FTSMatcher, HasPositiveTermCaseSensitive) {
// case-sensitive text query 'search'.
static bool docHasNegativeTermWithCase(const std::string& doc, const std::string& search) {
FTSQuery q;
- ASSERT_OK(q.parse(search, "english", true, TEXT_INDEX_VERSION_2));
+ ASSERT_OK(q.parse(search, "english", true, false, TEXT_INDEX_VERSION_3));
FTSMatcher m(q,
FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("x"
<< "text")))));
@@ -191,7 +191,7 @@ TEST(FTSMatcher, HasNegativeTermCaseSensitive) {
// from case-sensitive text query 'search'.
static bool docPositivePhrasesMatchWithCase(const std::string& doc, const std::string& search) {
FTSQuery q;
- ASSERT_OK(q.parse(search, "english", true, TEXT_INDEX_VERSION_2));
+ ASSERT_OK(q.parse(search, "english", true, false, TEXT_INDEX_VERSION_3));
FTSMatcher m(q,
FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("x"
<< "text")))));
@@ -214,7 +214,7 @@ TEST(FTSMatcher, PositivePhrasesMatchWithCase) {
// from case-sensitive text query 'search'.
static bool docNegativePhrasesMatchWithCase(const std::string& doc, const std::string& search) {
FTSQuery q;
- ASSERT_OK(q.parse(search, "english", true, TEXT_INDEX_VERSION_2));
+ ASSERT_OK(q.parse(search, "english", true, false, TEXT_INDEX_VERSION_3));
FTSMatcher m(q,
FTSSpec(FTSSpec::fixSpec(BSON("key" << BSON("x"
<< "text")))));
diff --git a/src/mongo/db/fts/fts_query.cpp b/src/mongo/db/fts/fts_query.cpp
index 9fbf0e04978..f162481066b 100644
--- a/src/mongo/db/fts/fts_query.cpp
+++ b/src/mongo/db/fts/fts_query.cpp
@@ -50,10 +50,12 @@ using std::stringstream;
using std::vector;
const bool FTSQuery::caseSensitiveDefault = false;
+const bool FTSQuery::diacriticSensitiveDefault = false;
Status FTSQuery::parse(const string& query,
StringData language,
bool caseSensitive,
+ bool diacriticSensitive,
TextIndexVersion textIndexVersion) {
StatusWithFTSLanguage swl = FTSLanguage::make(language, textIndexVersion);
if (!swl.getStatus().isOK()) {
@@ -61,6 +63,7 @@ Status FTSQuery::parse(const string& query,
}
_language = swl.getValue();
_caseSensitive = caseSensitive;
+ _diacriticSensitive = diacriticSensitive;
// Build a space delimited list of words to have the FtsTokenizer tokenize
string positiveTermSentence;
@@ -148,21 +151,29 @@ void FTSQuery::_addTerms(FTSTokenizer* tokenizer, const string& sentence, bool n
}
// Compute the string corresponding to 'token' that will be used for the matcher.
- // For case-insensitive queries, this is the same string as 'boundsTerm' computed
- // above.
- if (!_caseSensitive) {
+ // For case and diacritic insensitive queries, this is the same string as 'boundsTerm'
+ // computed above.
+ if (!_caseSensitive && !_diacriticSensitive) {
activeTerms.insert(word);
}
}
- if (!_caseSensitive) {
+ if (!_caseSensitive && !_diacriticSensitive) {
return;
}
- tokenizer->reset(sentence.c_str(),
- FTSTokenizer::kFilterStopWords | FTSTokenizer::kGenerateCaseSensitiveTokens);
+ FTSTokenizer::Options newOptions = FTSTokenizer::kFilterStopWords;
- // If we want case-sensitivity, get the case-sensitive token
+ if (_caseSensitive) {
+ newOptions |= FTSTokenizer::kGenerateCaseSensitiveTokens;
+ }
+ if (_diacriticSensitive) {
+ newOptions |= FTSTokenizer::kGenerateDiacriticSensitiveTokens;
+ }
+
+ tokenizer->reset(sentence.c_str(), newOptions);
+
+ // If we want case-sensitivity or diacritic sensitivity, get the correct token.
while (tokenizer->moveNext()) {
string word = tokenizer->get().toString();
diff --git a/src/mongo/db/fts/fts_query.h b/src/mongo/db/fts/fts_query.h
index cac73425ffb..ea1882e4baf 100644
--- a/src/mongo/db/fts/fts_query.h
+++ b/src/mongo/db/fts/fts_query.h
@@ -48,10 +48,12 @@ public:
// index version, since a query which doesn't specify a language and is against a
// version 1 text index with a version 1 default language string needs to be parsed as
// version 1 (see fts_language.cpp for a list of language strings specific to version
- // 1).
+ // 1). Note that the diacritic sensitive option has no effect on FTS queries below index version
+ // 3.
Status parse(const std::string& query,
StringData language,
bool caseSensitive,
+ bool diacriticSensitive,
TextIndexVersion textIndexVersion);
const std::set<std::string>& getPositiveTerms() const {
@@ -77,6 +79,9 @@ public:
bool getCaseSensitive() const {
return _caseSensitive;
}
+ bool getDiacriticSensitive() const {
+ return _diacriticSensitive;
+ }
std::string toString() const;
@@ -85,12 +90,14 @@ public:
BSONObj toBSON() const;
static const bool caseSensitiveDefault;
+ static const bool diacriticSensitiveDefault;
private:
void _addTerms(FTSTokenizer* tokenizer, const std::string& tokens, bool negated);
const FTSLanguage* _language;
bool _caseSensitive;
+ bool _diacriticSensitive;
// Positive terms.
std::set<std::string> _positiveTerms;
diff --git a/src/mongo/db/fts/fts_query_test.cpp b/src/mongo/db/fts/fts_query_test.cpp
index a4a841c7f16..bcf9e537142 100644
--- a/src/mongo/db/fts/fts_query_test.cpp
+++ b/src/mongo/db/fts/fts_query_test.cpp
@@ -37,7 +37,7 @@ namespace fts {
TEST(FTSQuery, Basic1) {
FTSQuery q;
- ASSERT(q.parse("this is fun", "english", false, TEXT_INDEX_VERSION_2).isOK());
+ ASSERT(q.parse("this is fun", "english", false, false, TEXT_INDEX_VERSION_3).isOK());
ASSERT_EQUALS(false, q.getCaseSensitive());
ASSERT_EQUALS(1U, q.getPositiveTerms().size());
@@ -50,7 +50,7 @@ TEST(FTSQuery, Basic1) {
TEST(FTSQuery, ParsePunctuation) {
FTSQuery q;
- ASSERT(q.parse("hello.world", "english", false, TEXT_INDEX_VERSION_2).isOK());
+ ASSERT(q.parse("hello.world", "english", false, false, TEXT_INDEX_VERSION_3).isOK());
ASSERT_EQUALS(false, q.getCaseSensitive());
ASSERT_EQUALS(2U, q.getPositiveTerms().size());
@@ -64,7 +64,7 @@ TEST(FTSQuery, ParsePunctuation) {
TEST(FTSQuery, Neg1) {
FTSQuery q;
- ASSERT(q.parse("this is -really fun", "english", false, TEXT_INDEX_VERSION_2).isOK());
+ ASSERT(q.parse("this is -really fun", "english", false, false, TEXT_INDEX_VERSION_3).isOK());
ASSERT_EQUALS(1U, q.getPositiveTerms().size());
ASSERT_EQUALS("fun", *q.getPositiveTerms().begin());
@@ -75,8 +75,8 @@ TEST(FTSQuery, Neg1) {
TEST(FTSQuery, Phrase1) {
FTSQuery q;
- ASSERT(
- q.parse("doing a \"phrase test\" for fun", "english", false, TEXT_INDEX_VERSION_2).isOK());
+ ASSERT(q.parse("doing a \"phrase test\" for fun", "english", false, false, TEXT_INDEX_VERSION_3)
+ .isOK());
ASSERT_EQUALS(3U, q.getPositiveTerms().size());
ASSERT_EQUALS(0U, q.getNegatedTerms().size());
@@ -90,8 +90,8 @@ TEST(FTSQuery, Phrase1) {
TEST(FTSQuery, Phrase2) {
FTSQuery q;
- ASSERT(
- q.parse("doing a \"phrase-test\" for fun", "english", false, TEXT_INDEX_VERSION_2).isOK());
+ ASSERT(q.parse("doing a \"phrase-test\" for fun", "english", false, false, TEXT_INDEX_VERSION_3)
+ .isOK());
ASSERT_EQUALS(1U, q.getPositivePhr().size());
ASSERT_EQUALS("phrase-test", q.getPositivePhr()[0]);
}
@@ -99,19 +99,20 @@ TEST(FTSQuery, Phrase2) {
TEST(FTSQuery, NegPhrase1) {
FTSQuery q;
ASSERT(
- q.parse("doing a -\"phrase test\" for fun", "english", false, TEXT_INDEX_VERSION_2).isOK());
+ q.parse("doing a -\"phrase test\" for fun", "english", false, false, TEXT_INDEX_VERSION_3)
+ .isOK());
ASSERT_EQUALS("fun||||||phrase test", q.debugString());
}
TEST(FTSQuery, CaseSensitiveOption) {
FTSQuery q;
- ASSERT(q.parse("this is fun", "english", true, TEXT_INDEX_VERSION_2).isOK());
+ ASSERT(q.parse("this is fun", "english", true, false, TEXT_INDEX_VERSION_3).isOK());
ASSERT_EQUALS(true, q.getCaseSensitive());
}
TEST(FTSQuery, CaseSensitivePositiveTerms) {
FTSQuery q;
- ASSERT(q.parse("This is Positively fun", "english", true, TEXT_INDEX_VERSION_2).isOK());
+ ASSERT(q.parse("This is Positively fun", "english", true, false, TEXT_INDEX_VERSION_3).isOK());
ASSERT_EQUALS(2U, q.getTermsForBounds().size());
ASSERT_EQUALS(1,
@@ -127,8 +128,8 @@ TEST(FTSQuery, CaseSensitivePositiveTerms) {
TEST(FTSQuery, CaseSensitiveNegativeTerms) {
FTSQuery q;
- ASSERT(
- q.parse("-This -is -Negatively -miserable", "english", true, TEXT_INDEX_VERSION_2).isOK());
+ ASSERT(q.parse("-This -is -Negatively -miserable", "english", true, false, TEXT_INDEX_VERSION_3)
+ .isOK());
ASSERT_EQUALS(0U, q.getPositiveTerms().size());
ASSERT_EQUALS(0U, q.getTermsForBounds().size());
@@ -141,8 +142,8 @@ TEST(FTSQuery, CaseSensitiveNegativeTerms) {
TEST(FTSQuery, CaseSensitivePositivePhrases) {
FTSQuery q;
- ASSERT(
- q.parse("doing a \"Phrase Test\" for fun", "english", true, TEXT_INDEX_VERSION_2).isOK());
+ ASSERT(q.parse("doing a \"Phrase Test\" for fun", "english", true, false, TEXT_INDEX_VERSION_3)
+ .isOK());
ASSERT_EQUALS(1U, q.getPositivePhr().size());
ASSERT_EQUALS(0U, q.getNegatedPhr().size());
@@ -151,8 +152,8 @@ TEST(FTSQuery, CaseSensitivePositivePhrases) {
TEST(FTSQuery, CaseSensitiveNegativePhrases) {
FTSQuery q;
- ASSERT(
- q.parse("doing a -\"Phrase Test\" for fun", "english", true, TEXT_INDEX_VERSION_2).isOK());
+ ASSERT(q.parse("doing a -\"Phrase Test\" for fun", "english", true, false, TEXT_INDEX_VERSION_3)
+ .isOK());
ASSERT_EQUALS(0U, q.getPositivePhr().size());
ASSERT_EQUALS(1U, q.getNegatedPhr().size());
@@ -162,15 +163,16 @@ TEST(FTSQuery, CaseSensitiveNegativePhrases) {
TEST(FTSQuery, Mix1) {
FTSQuery q;
ASSERT(
- q.parse("\"industry\" -Melbourne -Physics", "english", false, TEXT_INDEX_VERSION_2).isOK());
+ q.parse("\"industry\" -Melbourne -Physics", "english", false, false, TEXT_INDEX_VERSION_3)
+ .isOK());
ASSERT_EQUALS("industri||melbourn|physic||industry||", q.debugString());
}
TEST(FTSQuery, NegPhrase2) {
FTSQuery q1, q2, q3;
- ASSERT(q1.parse("foo \"bar\"", "english", false, TEXT_INDEX_VERSION_2).isOK());
- ASSERT(q2.parse("foo \"-bar\"", "english", false, TEXT_INDEX_VERSION_2).isOK());
- ASSERT(q3.parse("foo \" -bar\"", "english", false, TEXT_INDEX_VERSION_2).isOK());
+ ASSERT(q1.parse("foo \"bar\"", "english", false, false, TEXT_INDEX_VERSION_3).isOK());
+ ASSERT(q2.parse("foo \"-bar\"", "english", false, false, TEXT_INDEX_VERSION_3).isOK());
+ ASSERT(q3.parse("foo \" -bar\"", "english", false, false, TEXT_INDEX_VERSION_3).isOK());
ASSERT_EQUALS(2U, q1.getPositiveTerms().size());
ASSERT_EQUALS(2U, q2.getPositiveTerms().size());
@@ -191,9 +193,9 @@ TEST(FTSQuery, NegPhrase2) {
TEST(FTSQuery, NegPhrase3) {
FTSQuery q1, q2, q3;
- ASSERT(q1.parse("foo -\"bar\"", "english", false, TEXT_INDEX_VERSION_2).isOK());
- ASSERT(q2.parse("foo -\"-bar\"", "english", false, TEXT_INDEX_VERSION_2).isOK());
- ASSERT(q3.parse("foo -\" -bar\"", "english", false, TEXT_INDEX_VERSION_2).isOK());
+ ASSERT(q1.parse("foo -\"bar\"", "english", false, false, TEXT_INDEX_VERSION_3).isOK());
+ ASSERT(q2.parse("foo -\"-bar\"", "english", false, false, TEXT_INDEX_VERSION_3).isOK());
+ ASSERT(q3.parse("foo -\" -bar\"", "english", false, false, TEXT_INDEX_VERSION_3).isOK());
ASSERT_EQUALS(1U, q1.getPositiveTerms().size());
ASSERT_EQUALS(1U, q2.getPositiveTerms().size());
@@ -216,7 +218,7 @@ TEST(FTSQuery, NegPhrase3) {
// stemmer and stopword list.
TEST(FTSQuery, TextIndexVersion1LanguageEnglish) {
FTSQuery q;
- ASSERT(q.parse("the running", "english", false, TEXT_INDEX_VERSION_1).isOK());
+ ASSERT(q.parse("the running", "english", false, false, TEXT_INDEX_VERSION_1).isOK());
ASSERT_EQUALS(1U, q.getPositiveTerms().size());
ASSERT_EQUALS("run", *q.getPositiveTerms().begin());
ASSERT_EQUALS(0U, q.getNegatedTerms().size());
@@ -228,7 +230,7 @@ TEST(FTSQuery, TextIndexVersion1LanguageEnglish) {
// no stopword list.
TEST(FTSQuery, TextIndexVersion1LanguageEng) {
FTSQuery q;
- ASSERT(q.parse("the running", "eng", false, TEXT_INDEX_VERSION_1).isOK());
+ ASSERT(q.parse("the running", "eng", false, false, TEXT_INDEX_VERSION_1).isOK());
ASSERT_EQUALS(2U, q.getPositiveTerms().size());
ASSERT_EQUALS(1, std::count(q.getPositiveTerms().begin(), q.getPositiveTerms().end(), "the"));
ASSERT_EQUALS(1, std::count(q.getPositiveTerms().begin(), q.getPositiveTerms().end(), "run"));
@@ -241,7 +243,7 @@ TEST(FTSQuery, TextIndexVersion1LanguageEng) {
// and no stopword list will be used.
TEST(FTSQuery, TextIndexVersion1LanguageInvalid) {
FTSQuery q;
- ASSERT(q.parse("the running", "invalid", false, TEXT_INDEX_VERSION_1).isOK());
+ ASSERT(q.parse("the running", "invalid", false, false, TEXT_INDEX_VERSION_1).isOK());
ASSERT_EQUALS(2U, q.getPositiveTerms().size());
ASSERT_EQUALS(1, std::count(q.getPositiveTerms().begin(), q.getPositiveTerms().end(), "the"));
ASSERT_EQUALS(1,
diff --git a/src/mongo/db/fts/fts_spec.cpp b/src/mongo/db/fts/fts_spec.cpp
index 1ec72152351..5e5fbeaefa3 100644
--- a/src/mongo/db/fts/fts_spec.cpp
+++ b/src/mongo/db/fts/fts_spec.cpp
@@ -73,18 +73,26 @@ FTSSpec::FTSSpec(const BSONObj& indexInfo) {
"found invalid spec for text index, expected number for textIndexVersion",
textIndexVersionElt.isNumber());
- // We currently support TEXT_INDEX_VERSION_1 (deprecated) and TEXT_INDEX_VERSION_2.
+ // We currently support TEXT_INDEX_VERSION_1 (deprecated), TEXT_INDEX_VERSION_2, and
+ // TEXT_INDEX_VERSION_3.
// Reject all other values.
- massert(17364,
- str::stream() << "attempt to use unsupported textIndexVersion "
- << textIndexVersionElt.numberInt() << "; versions supported: "
- << TEXT_INDEX_VERSION_2 << ", " << TEXT_INDEX_VERSION_1,
- textIndexVersionElt.numberInt() == TEXT_INDEX_VERSION_2 ||
- textIndexVersionElt.numberInt() == TEXT_INDEX_VERSION_1);
-
- _textIndexVersion = (textIndexVersionElt.numberInt() == TEXT_INDEX_VERSION_2)
- ? TEXT_INDEX_VERSION_2
- : TEXT_INDEX_VERSION_1;
+ switch (textIndexVersionElt.numberInt()) {
+ case TEXT_INDEX_VERSION_3:
+ _textIndexVersion = TEXT_INDEX_VERSION_3;
+ break;
+ case TEXT_INDEX_VERSION_2:
+ _textIndexVersion = TEXT_INDEX_VERSION_2;
+ break;
+ case TEXT_INDEX_VERSION_1:
+ _textIndexVersion = TEXT_INDEX_VERSION_1;
+ break;
+ default:
+ msgasserted(17364,
+ str::stream() << "attempt to use unsupported textIndexVersion "
+ << textIndexVersionElt.numberInt()
+ << "; versions supported: " << TEXT_INDEX_VERSION_3 << ", "
+ << TEXT_INDEX_VERSION_2 << ", " << TEXT_INDEX_VERSION_1);
+ }
// Initialize _defaultLanguage. Note that the FTSLanguage constructor requires
// textIndexVersion, since language parsing is version-specific.
@@ -384,7 +392,7 @@ BSONObj FTSSpec::fixSpec(const BSONObj& spec) {
}
uassert(17264,
"default_language is not valid",
- FTSLanguage::make(default_language, TEXT_INDEX_VERSION_2).getStatus().isOK());
+ FTSLanguage::make(default_language, TEXT_INDEX_VERSION_3).getStatus().isOK());
BSONElement language_override_elt = spec["language_override"];
string language_override(language_override_elt.str());
@@ -397,7 +405,7 @@ BSONObj FTSSpec::fixSpec(const BSONObj& spec) {
}
int version = -1;
- int textIndexVersion = TEXT_INDEX_VERSION_2;
+ int textIndexVersion = TEXT_INDEX_VERSION_3; // default text index version
BSONObjBuilder b;
BSONObjIterator i(spec);
@@ -421,7 +429,9 @@ BSONObj FTSSpec::fixSpec(const BSONObj& spec) {
textIndexVersion = e.numberInt();
uassert(16730,
str::stream() << "bad textIndexVersion: " << textIndexVersion,
- textIndexVersion == TEXT_INDEX_VERSION_2);
+ textIndexVersion == TEXT_INDEX_VERSION_2 ||
+ textIndexVersion == TEXT_INDEX_VERSION_3); // supported indexes
+
} else {
b.append(e);
}
diff --git a/src/mongo/db/fts/fts_spec_test.cpp b/src/mongo/db/fts/fts_spec_test.cpp
index c9f628a2b28..3bd7d93800b 100644
--- a/src/mongo/db/fts/fts_spec_test.cpp
+++ b/src/mongo/db/fts/fts_spec_test.cpp
@@ -174,8 +174,11 @@ TEST(FTSSpec, FixTextIndexVersion1) {
assertFixSuccess("{key: {a: 'text'}, textIndexVersion: 2.0}}");
assertFixSuccess("{key: {a: 'text'}, textIndexVersion: NumberInt(2)}}");
assertFixSuccess("{key: {a: 'text'}, textIndexVersion: NumberLong(2)}}");
+ assertFixSuccess("{key: {a: 'text'}, textIndexVersion: 3.0}");
+ assertFixSuccess("{key: {a: 'text'}, textIndexVersion: NumberInt(3)}}");
+ assertFixSuccess("{key: {a: 'text'}, textIndexVersion: NumberLong(3)}}");
- assertFixFailure("{key: {a: 'text'}, textIndexVersion: 3}");
+ assertFixFailure("{key: {a: 'text'}, textIndexVersion: 4}");
assertFixFailure("{key: {a: 'text'}, textIndexVersion: '2'}");
assertFixFailure("{key: {a: 'text'}, textIndexVersion: {}}");
}
diff --git a/src/mongo/db/fts/fts_unicode_tokenizer_test.cpp b/src/mongo/db/fts/fts_unicode_tokenizer_test.cpp
index e73c9599682..1f0517d8575 100644
--- a/src/mongo/db/fts/fts_unicode_tokenizer_test.cpp
+++ b/src/mongo/db/fts/fts_unicode_tokenizer_test.cpp
@@ -26,8 +26,7 @@
* it in the license file.
*/
-#include "mongo/db/fts/fts_spec.h"
-#include "mongo/db/fts/fts_tokenizer.h"
+#include "mongo/db/fts/fts_language.h"
#include "mongo/db/fts/fts_unicode_tokenizer.h"
#include "mongo/unittest/unittest.h"
@@ -37,7 +36,7 @@ namespace fts {
std::vector<std::string> tokenizeString(const char* str,
const char* language,
FTSTokenizer::Options options) {
- StatusWithFTSLanguage swl = FTSLanguage::make(language, TEXT_INDEX_VERSION_2);
+ StatusWithFTSLanguage swl = FTSLanguage::make(language, TEXT_INDEX_VERSION_3);
ASSERT_OK(swl);
UnicodeFTSTokenizer tokenizer(swl.getValue());
diff --git a/src/mongo/db/fts/fts_util.h b/src/mongo/db/fts/fts_util.h
index a1377162443..b9fed70a8e8 100644
--- a/src/mongo/db/fts/fts_util.h
+++ b/src/mongo/db/fts/fts_util.h
@@ -43,7 +43,8 @@ extern const std::string INDEX_NAME;
enum TextIndexVersion {
TEXT_INDEX_VERSION_1 = 1, // Legacy index format. Deprecated.
- TEXT_INDEX_VERSION_2 = 2 // Current index format.
+ TEXT_INDEX_VERSION_2 = 2, // Index format with ASCII support and murmur hashing.
+ TEXT_INDEX_VERSION_3 = 3, // Current index format with basic Unicode support.
};
}
}