summary refs log tree commit diff
path: root/src/mongo
diff options
context:
space:
mode:
author Jason Rassi <rassi@10gen.com> 2014-01-28 14:44:11 -0500
committer Jason Rassi <rassi@10gen.com> 2014-01-28 15:49:46 -0500
commit 8869eab327d4ab783a9dc5dae54f2261fe692cfb (patch)
tree cafbced8be41b080f83b64271f518bd7b2169121 /src/mongo
parent 89d0788919b2c162155c5f32ea485abdab120390 (diff)
download mongo-8869eab327d4ab783a9dc5dae54f2261fe692cfb.tar.gz
SERVER-10906 Add support for legacy text indexes
FTSSpec now handles text indexes with index option {textIndexVersion:1}.
Diffstat (limited to 'src/mongo')
-rw-r--r-- src/mongo/db/exec/stagedebug_cmd.cpp 3
-rw-r--r-- src/mongo/db/fts/SConscript 1
-rw-r--r-- src/mongo/db/fts/fts_language.cpp 286
-rw-r--r-- src/mongo/db/fts/fts_language.h 80
-rw-r--r-- src/mongo/db/fts/fts_language_test.cpp 117
-rw-r--r-- src/mongo/db/fts/fts_query.cpp 16
-rw-r--r-- src/mongo/db/fts/fts_query.h 6
-rw-r--r-- src/mongo/db/fts/fts_spec.cpp 86
-rw-r--r-- src/mongo/db/fts/fts_spec.h 68
-rw-r--r-- src/mongo/db/fts/fts_spec_legacy.cpp 320
-rw-r--r-- src/mongo/db/fts/fts_spec_test.cpp 111
-rw-r--r-- src/mongo/db/fts/fts_util.h 6
-rw-r--r-- src/mongo/db/fts/generate_stop_words.py 6
-rw-r--r-- src/mongo/db/fts/stemmer.cpp 3
-rw-r--r-- src/mongo/db/fts/stemmer.h 2
-rw-r--r-- src/mongo/db/fts/stemmer_test.cpp 9
-rw-r--r-- src/mongo/db/fts/stop_words.cpp 15
-rw-r--r-- src/mongo/db/fts/stop_words.h 2
-rw-r--r-- src/mongo/db/fts/stop_words_test.cpp 4
-rw-r--r-- src/mongo/db/fts/tokenizer.cpp 5
-rw-r--r-- src/mongo/db/fts/tokenizer.h 2
-rw-r--r-- src/mongo/db/fts/tokenizer_test.cpp 19
-rw-r--r-- src/mongo/db/matcher/expression_parser_text.cpp 5
-rw-r--r-- src/mongo/db/query/stage_builder.cpp 6
24 files changed, 887 insertions, 291 deletions
diff --git a/src/mongo/db/exec/stagedebug_cmd.cpp b/src/mongo/db/exec/stagedebug_cmd.cpp
index dd3f53d9bb9..cfaea092033 100644
--- a/src/mongo/db/exec/stagedebug_cmd.cpp
+++ b/src/mongo/db/exec/stagedebug_cmd.cpp
@@ -373,7 +373,8 @@ namespace mongo {
params.spec = fam->getSpec();
- if (!params.query.parse(search, fam->getSpec().defaultLanguage().str()).isOK()) {
+ if (!params.query.parse(search,
+ fam->getSpec().defaultLanguage().str().c_str()).isOK()) {
return NULL;
}
diff --git a/src/mongo/db/fts/SConscript b/src/mongo/db/fts/SConscript
index 7ec267cc6ad..afd6e0386a9 100644
--- a/src/mongo/db/fts/SConscript
+++ b/src/mongo/db/fts/SConscript
@@ -29,6 +29,7 @@ env.Library('base', [
'fts_matcher.cpp',
'fts_query.cpp',
'fts_spec.cpp',
+ 'fts_spec_legacy.cpp',
'fts_language.cpp',
'fts_util.cpp',
'stemmer.cpp',
diff --git a/src/mongo/db/fts/fts_language.cpp b/src/mongo/db/fts/fts_language.cpp
index 61bd33cf93f..21474038f06 100644
--- a/src/mongo/db/fts/fts_language.cpp
+++ b/src/mongo/db/fts/fts_language.cpp
@@ -34,6 +34,7 @@
#include "mongo/base/init.h"
#include "mongo/util/assert_util.h"
+#include "mongo/util/mongoutils/str.h"
#include "mongo/util/string_map.h"
#include "mongo/util/stringutils.h"
@@ -43,110 +44,219 @@ namespace mongo {
namespace {
- // Supported languages in canonical form (English names, lowercased). Includes "none".
- const string LanguageNone( "none" );
- const string LanguageDanish( "danish" );
- const string LanguageDutch( "dutch" );
- const string LanguageEnglish( "english" );
- const string LanguageFinnish( "finnish" );
- const string LanguageFrench( "french" );
- const string LanguageGerman( "german" );
- const string LanguageHungarian( "hungarian" );
- const string LanguageItalian( "italian" );
- const string LanguageNorwegian( "norwegian" );
- const string LanguagePortuguese( "portuguese" );
- const string LanguageRomanian( "romanian" );
- const string LanguageRussian( "russian" );
- const string LanguageSpanish( "spanish" );
- const string LanguageSwedish( "swedish" );
- const string LanguageTurkish( "turkish" );
-
- // Map from lowercased user string to language string. Resolves any language aliases
- // (two-letter codes).
- typedef StringMap<std::string> LanguageMap;
- LanguageMap languageMap;
- }
+ /**
+ * Case-insensitive StringData comparator.
+ */
+ struct LanguageStringCompare {
+ /** Returns true if lhs < rhs. */
+ bool operator()( const StringData& lhs, const StringData& rhs ) const {
+ size_t minSize = std::min( lhs.size(), rhs.size() );
- MONGO_INITIALIZER( FTSLanguageMap )( InitializerContext* context ) {
- languageMap[LanguageNone] = LanguageNone;
-
- languageMap["da"] = LanguageDanish;
- languageMap[LanguageDanish] = LanguageDanish;
- languageMap["nl"] = LanguageDutch;
- languageMap[LanguageDutch] = LanguageDutch;
- languageMap["en"] = LanguageEnglish;
- languageMap[LanguageEnglish] = LanguageEnglish;
- languageMap["fi"] = LanguageFinnish;
- languageMap[LanguageFinnish] = LanguageFinnish;
- languageMap["fr"] = LanguageFrench;
- languageMap[LanguageFrench] = LanguageFrench;
- languageMap["de"] = LanguageGerman;
- languageMap[LanguageGerman] = LanguageGerman;
- languageMap["hu"] = LanguageHungarian;
- languageMap[LanguageHungarian] = LanguageHungarian;
- languageMap["it"] = LanguageItalian;
- languageMap[LanguageItalian] = LanguageItalian;
- languageMap["nb"] = LanguageNorwegian;
- languageMap[LanguageNorwegian] = LanguageNorwegian;
- languageMap["pt"] = LanguagePortuguese;
- languageMap[LanguagePortuguese] = LanguagePortuguese;
- languageMap["ro"] = LanguageRomanian;
- languageMap[LanguageRomanian] = LanguageRomanian;
- languageMap["ru"] = LanguageRussian;
- languageMap[LanguageRussian] = LanguageRussian;
- languageMap["es"] = LanguageSpanish;
- languageMap[LanguageSpanish] = LanguageSpanish;
- languageMap["sv"] = LanguageSwedish;
- languageMap[LanguageSwedish] = LanguageSwedish;
- languageMap["tr"] = LanguageTurkish;
- languageMap[LanguageTurkish] = LanguageTurkish;
- return Status::OK();
- }
+ for ( size_t x = 0; x < minSize; x++ ) {
+ char a = tolower( lhs[x] );
+ char b = tolower( rhs[x] );
+ if ( a < b ) {
+ return true;
+ }
+ if ( a > b ) {
+ return false;
+ }
+ }
- FTSLanguage::FTSLanguage()
- : _lang() {
- }
+ return lhs.size() < rhs.size();
+ }
+ };
- FTSLanguage::FTSLanguage( const FTSLanguage& other )
- : _lang( other._lang ) {
- }
+ // Lookup table from user language string (case-insensitive) to FTSLanguage. Populated
+ // by initializers in group FTSAllLanguagesRegistered and initializer
+ // FTSRegisterLanguageAliases. For use with TEXT_INDEX_VERSION_2 text indexes only.
+ typedef std::map<StringData, const FTSLanguage*, LanguageStringCompare> LanguageMapV2;
+ LanguageMapV2 languageMapV2;
- FTSLanguage& FTSLanguage::operator=( const FTSLanguage& other ) {
- _lang = other._lang;
- return *this;
+ // Like languageMapV2, but for use with TEXT_INDEX_VERSION_1 text indexes.
+ // Case-sensitive by lookup key.
+ typedef std::map<StringData, const FTSLanguage*> LanguageMapV1;
+ LanguageMapV1 languageMapV1;
}
- FTSLanguage::~FTSLanguage() {
- }
+ MONGO_INITIALIZER_GROUP( FTSAllLanguagesRegistered, MONGO_NO_PREREQUISITES,
+ MONGO_NO_DEPENDENTS );
- Status FTSLanguage::init( const std::string& lang ) {
- // Lowercase.
- std::string langLower = tolowerString( lang );
+ //
+ // Register supported languages' canonical names for TEXT_INDEX_VERSION_2.
+ //
- // Resolve language aliases.
- LanguageMap::const_iterator it = languageMap.find( langLower );
- if ( it == languageMap.end() ) {
- return Status( ErrorCodes::BadValue,
- "unsupported language: \"" + lang + "\"" );
- }
+ MONGO_FTS_LANGUAGE_DECLARE( languageNoneV2, "none", TEXT_INDEX_VERSION_2 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageDanishV2, "danish", TEXT_INDEX_VERSION_2 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageDutchV2, "dutch", TEXT_INDEX_VERSION_2 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageEnglishV2, "english", TEXT_INDEX_VERSION_2 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageFinnishV2, "finnish", TEXT_INDEX_VERSION_2 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageFrenchV2, "french", TEXT_INDEX_VERSION_2 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageGermanV2, "german", TEXT_INDEX_VERSION_2 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageHungarianV2, "hungarian", TEXT_INDEX_VERSION_2 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageItalianV2, "italian", TEXT_INDEX_VERSION_2 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageNorwegianV2, "norwegian", TEXT_INDEX_VERSION_2 );
+ MONGO_FTS_LANGUAGE_DECLARE( languagePortugueseV2, "portuguese", TEXT_INDEX_VERSION_2 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageRomanianV2, "romanian", TEXT_INDEX_VERSION_2 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageRussianV2, "russian", TEXT_INDEX_VERSION_2 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageSpanishV2, "spanish", TEXT_INDEX_VERSION_2 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageSwedishV2, "swedish", TEXT_INDEX_VERSION_2 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageTurkishV2, "turkish", TEXT_INDEX_VERSION_2 );
+
+ //
+ // Register all Snowball language modules for TEXT_INDEX_VERSION_1. Note that only the full
+ // names are recognized by the StopWords class (as such, the language string "dan" in
+ // TEXT_INDEX_VERSION_1 will generate the Danish stemmer and the empty stopword list).
+ //
+
+ MONGO_FTS_LANGUAGE_DECLARE( languageNoneV1, "none", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageDaV1, "da", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageDanV1, "dan", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageDanishV1, "danish", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageDeV1, "de", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageDeuV1, "deu", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageDutV1, "dut", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageDutchV1, "dutch", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageEnV1, "en", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageEngV1, "eng", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageEnglishV1, "english", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageEsV1, "es", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageEslV1, "esl", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageFiV1, "fi", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageFinV1, "fin", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageFinnishV1, "finnish", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageFrV1, "fr", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageFraV1, "fra", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageFreV1, "fre", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageFrenchV1, "french", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageGerV1, "ger", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageGermanV1, "german", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageHuV1, "hu", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageHunV1, "hun", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageHungarianV1, "hungarian", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageItV1, "it", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageItaV1, "ita", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageItalianV1, "italian", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageNlV1, "nl", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageNldV1, "nld", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageNoV1, "no", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageNorV1, "nor", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageNorwegianV1, "norwegian", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languagePorV1, "por", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languagePorterV1, "porter", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languagePortugueseV1, "portuguese", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languagePtV1, "pt", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageRoV1, "ro", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageRomanianV1, "romanian", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageRonV1, "ron", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageRuV1, "ru", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageRumV1, "rum", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageRusV1, "rus", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageRussianV1, "russian", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageSpaV1, "spa", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageSpanishV1, "spanish", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageSvV1, "sv", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageSweV1, "swe", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageSwedishV1, "swedish", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageTrV1, "tr", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageTurV1, "tur", TEXT_INDEX_VERSION_1 );
+ MONGO_FTS_LANGUAGE_DECLARE( languageTurkishV1, "turkish", TEXT_INDEX_VERSION_1 );
- _lang = StringData( it->second );
+ MONGO_INITIALIZER_WITH_PREREQUISITES( FTSRegisterLanguageAliases,
+ ( "FTSAllLanguagesRegistered" ) )
+ ( InitializerContext* context ) {
+ // Register language aliases for TEXT_INDEX_VERSION_2.
+ FTSLanguage::registerLanguageAlias( &languageDanishV2, "da", TEXT_INDEX_VERSION_2 );
+ FTSLanguage::registerLanguageAlias( &languageDutchV2, "nl", TEXT_INDEX_VERSION_2 );
+ FTSLanguage::registerLanguageAlias( &languageEnglishV2, "en", TEXT_INDEX_VERSION_2 );
+ FTSLanguage::registerLanguageAlias( &languageFinnishV2, "fi", TEXT_INDEX_VERSION_2 );
+ FTSLanguage::registerLanguageAlias( &languageFrenchV2, "fr", TEXT_INDEX_VERSION_2 );
+ FTSLanguage::registerLanguageAlias( &languageGermanV2, "de", TEXT_INDEX_VERSION_2 );
+ FTSLanguage::registerLanguageAlias( &languageHungarianV2, "hu", TEXT_INDEX_VERSION_2 );
+ FTSLanguage::registerLanguageAlias( &languageItalianV2, "it", TEXT_INDEX_VERSION_2 );
+ FTSLanguage::registerLanguageAlias( &languageNorwegianV2, "nb", TEXT_INDEX_VERSION_2 );
+ FTSLanguage::registerLanguageAlias( &languagePortugueseV2, "pt", TEXT_INDEX_VERSION_2 );
+ FTSLanguage::registerLanguageAlias( &languageRomanianV2, "ro", TEXT_INDEX_VERSION_2 );
+ FTSLanguage::registerLanguageAlias( &languageRussianV2, "ru", TEXT_INDEX_VERSION_2 );
+ FTSLanguage::registerLanguageAlias( &languageSpanishV2, "es", TEXT_INDEX_VERSION_2 );
+ FTSLanguage::registerLanguageAlias( &languageSwedishV2, "sv", TEXT_INDEX_VERSION_2 );
+ FTSLanguage::registerLanguageAlias( &languageTurkishV2, "tr", TEXT_INDEX_VERSION_2 );
return Status::OK();
}
- std::string FTSLanguage::str() const {
- verify( !_lang.empty() );
- return _lang.toString();
+ // static
+ void FTSLanguage::registerLanguage( const StringData& languageName,
+ TextIndexVersion textIndexVersion,
+ FTSLanguage* language ) {
+ verify( !languageName.empty() );
+ language->_canonicalName = languageName.toString();
+ switch ( textIndexVersion ) {
+ case TEXT_INDEX_VERSION_2:
+ verify( languageMapV2.find( languageName ) == languageMapV2.end() );
+ languageMapV2[ languageName ] = language;
+ return;
+ case TEXT_INDEX_VERSION_1:
+ verify( languageMapV1.find( languageName ) == languageMapV1.end() );
+ languageMapV1[ languageName ] = language;
+ return;
+ }
+ verify( false );
}
- StatusWithFTSLanguage FTSLanguage::makeFTSLanguage( const std::string& lang ) {
- FTSLanguage language;
- Status s = language.init( lang );
- if ( !s.isOK() ) {
- return StatusWithFTSLanguage( s );
+ // static
+ void FTSLanguage::registerLanguageAlias( const FTSLanguage* language,
+ const StringData& alias,
+ TextIndexVersion textIndexVersion ) {
+ switch ( textIndexVersion ) {
+ case TEXT_INDEX_VERSION_2:
+ verify( languageMapV2.find( alias ) == languageMapV2.end() );
+ languageMapV2[ alias ] = language;
+ return;
+ case TEXT_INDEX_VERSION_1:
+ verify( languageMapV1.find( alias ) == languageMapV1.end() );
+ languageMapV1[ alias ] = language;
+ return;
}
- return StatusWithFTSLanguage( language );
+ verify( false );
+ }
+
+ FTSLanguage::FTSLanguage() : _canonicalName() {
+ }
+
+ const std::string& FTSLanguage::str() const {
+ verify( !_canonicalName.empty() );
+ return _canonicalName;
}
+ // static
+ StatusWithFTSLanguage FTSLanguage::make( const StringData& langName,
+ TextIndexVersion textIndexVersion ) {
+ switch ( textIndexVersion ) {
+ case TEXT_INDEX_VERSION_2: {
+ LanguageMapV2::const_iterator it = languageMapV2.find( langName );
+ if ( it == languageMapV2.end() ) {
+ // TEXT_INDEX_VERSION_2 rejects unrecognized language strings.
+ Status status = Status( ErrorCodes::BadValue,
+ mongoutils::str::stream() <<
+ "unsupported language: \"" << langName <<
+ "\"" );
+ return StatusWithFTSLanguage( status );
+ }
+
+ return StatusWithFTSLanguage( it->second );
+ }
+ case TEXT_INDEX_VERSION_1: {
+ LanguageMapV1::const_iterator it = languageMapV1.find( langName );
+ if ( it == languageMapV1.end() ) {
+ // TEXT_INDEX_VERSION_1 treats unrecognized language strings as "none".
+ return StatusWithFTSLanguage( &languageNoneV1 );
+ }
+ return StatusWithFTSLanguage( it->second );
+ }
+ }
+
+ verify( false );
+ return StatusWithFTSLanguage( Status::OK() );
+ }
}
}
diff --git a/src/mongo/db/fts/fts_language.h b/src/mongo/db/fts/fts_language.h
index 9a1d7053ea8..3a7a471bc32 100644
--- a/src/mongo/db/fts/fts_language.h
+++ b/src/mongo/db/fts/fts_language.h
@@ -30,6 +30,7 @@
#pragma once
+#include "mongo/db/fts/fts_util.h"
#include "mongo/base/status_with.h"
#include <string>
@@ -38,55 +39,90 @@ namespace mongo {
namespace fts {
+ #define MONGO_FTS_LANGUAGE_DECLARE( language, name, version ) \
+ FTSLanguage language; \
+ MONGO_INITIALIZER_GENERAL( language, MONGO_NO_PREREQUISITES, \
+ ( "FTSAllLanguagesRegistered" ) ) \
+ ( ::mongo::InitializerContext* context ) { \
+ FTSLanguage::registerLanguage( name, version, &language ); \
+ return Status::OK(); \
+ }
+
/**
- * A FTSLanguage is a copyable glorified enum representing a language for a text-indexed
- * document or a text search. Example of suggested usage:
+ * An FTSLanguage represents a language for a text-indexed document or a text search.
+ * FTSLanguage objects are not copyable.
+ *
+ * Recommended usage:
*
- * StatusWithFTSLanguage swl = FTSLanguage::makeFTSLanguage( "en" );
+ * StatusWithFTSLanguage swl = FTSLanguage::make( "en", TEXT_INDEX_VERSION_2 );
* if ( !swl.getStatus().isOK() ) {
* // Error.
* }
* else {
- * const FTSLanguage language = swl.getValue();
+ * const FTSLanguage* language = swl.getValue();
* // Use language.
* }
*/
class FTSLanguage {
+ // Use make() instead of copying.
+ MONGO_DISALLOW_COPYING( FTSLanguage );
public:
/** Create an uninitialized language. */
FTSLanguage();
- ~FTSLanguage();
- FTSLanguage( const FTSLanguage& );
- FTSLanguage& operator=( const FTSLanguage & );
+ /**
+ * Returns the language as a string in canonical form (lowercased English name). It is
+ * an error to call str() on an uninitialized language.
+ */
+ const std::string& str() const;
/**
- * Initialize an FTSLanguage from a language string. Language strings are
- * case-insensitive, and can be in one of the two following forms:
- * - English name, like "spanish".
- * - Two-letter code, like "es".
- * Returns an error Status if an invalid language string is passed.
+ * Register string 'languageName' as a new language with text index version
+ * 'textIndexVersion'. Saves the resulting language to out-argument 'languageOut'.
+ * Subsequent calls to FTSLanguage::make() will recognize the newly-registered language
+ * string.
*/
- Status init( const std::string& lang );
+ static void registerLanguage( const StringData& languageName,
+ TextIndexVersion textIndexVersion,
+ FTSLanguage *languageOut );
/**
- * Returns the language as a string in canonical form (lowercased English name). It is
- * an error to call str() on an uninitialized language.
+ * Register 'alias' as an alias for 'language' with text index version
+ * 'textIndexVersion'. Subsequent calls to FTSLanguage::make() will recognize the
+ * newly-registered alias.
*/
- std::string str() const;
+ static void registerLanguageAlias( const FTSLanguage* language,
+ const StringData& alias,
+ TextIndexVersion textIndexVersion );
/**
- * Convenience method for creating an FTSLanguage out of a language string. Caller
- * must check getStatus().isOK() on return value.
+ * Return the FTSLanguage associated with the given language string. Returns an error
+ * Status if an invalid language string is passed.
+ *
+ * For textIndexVersion=TEXT_INDEX_VERSION_2, language strings are
+ * case-insensitive, and need to be in one of the two following forms:
+ * - English name, like "spanish".
+ * - Two-letter code, like "es".
+ *
+ * For textIndexVersion=TEXT_INDEX_VERSION_1, language strings are case-sensitive and
+ * are never rejected: an unrecognized string resolves to the "none" language rather
+ * than an error. Strings are not normalized either, so that documents with language
+ * strings like "en" keep their legacy behavior: English stemmer plus the empty
+ * stopword list (since "en" is recognized by Snowball but not by the stopword logic).
*/
- static StatusWith<const FTSLanguage> makeFTSLanguage( const std::string& lang );
+ static StatusWith<const FTSLanguage*> make( const StringData& langName,
+ TextIndexVersion textIndexVersion );
private:
- // Pointer to string representation of language. Not owned here.
- StringData _lang;
+ // String representation of language in canonical form.
+ std::string _canonicalName;
};
- typedef StatusWith<const FTSLanguage> StatusWithFTSLanguage;
+ typedef StatusWith<const FTSLanguage*> StatusWithFTSLanguage;
+
+ extern FTSLanguage languagePorterV1;
+ extern FTSLanguage languageEnglishV2;
+ extern FTSLanguage languageFrenchV2;
}
}
diff --git a/src/mongo/db/fts/fts_language_test.cpp b/src/mongo/db/fts/fts_language_test.cpp
index 5fdd9a4aa73..141bbce27d9 100644
--- a/src/mongo/db/fts/fts_language_test.cpp
+++ b/src/mongo/db/fts/fts_language_test.cpp
@@ -30,76 +30,107 @@
#include "mongo/pch.h"
#include "mongo/db/fts/fts_language.h"
+#include "mongo/db/fts/fts_spec.h"
#include "mongo/unittest/unittest.h"
namespace mongo {
namespace fts {
- // Positive tests for FTSLanguage::init() and FTSLanguage::str().
+ // Positive tests for FTSLanguage::make() with TEXT_INDEX_VERSION_2.
- TEST( FTSLanguage, ExactLanguage ) {
- FTSLanguage lang;
- Status s = lang.init( "spanish" );
- ASSERT( s.isOK() );
- ASSERT_EQUALS( lang.str(), "spanish" );
+ TEST( FTSLanguageV2, ExactLanguage ) {
+ StatusWithFTSLanguage swl = FTSLanguage::make( "spanish", TEXT_INDEX_VERSION_2 );
+ ASSERT( swl.getStatus().isOK() );
+ ASSERT_EQUALS( swl.getValue()->str(), "spanish" );
}
- TEST( FTSLanguage, ExactCode ) {
- FTSLanguage lang;
- Status s = lang.init( "es" );
- ASSERT( s.isOK() );
- ASSERT_EQUALS( lang.str(), "spanish" );
+ TEST( FTSLanguageV2, ExactCode ) {
+ StatusWithFTSLanguage swl = FTSLanguage::make( "es", TEXT_INDEX_VERSION_2 );
+ ASSERT( swl.getStatus().isOK() );
+ ASSERT_EQUALS( swl.getValue()->str(), "spanish" );
}
- TEST( FTSLanguage, UpperCaseLanguage ) {
- FTSLanguage lang;
- Status s = lang.init( "SPANISH" );
- ASSERT( s.isOK() );
- ASSERT_EQUALS( lang.str(), "spanish" );
+ TEST( FTSLanguageV2, UpperCaseLanguage ) {
+ StatusWithFTSLanguage swl = FTSLanguage::make( "SPANISH", TEXT_INDEX_VERSION_2 );
+ ASSERT( swl.getStatus().isOK() );
+ ASSERT_EQUALS( swl.getValue()->str(), "spanish" );
}
- TEST( FTSLanguage, UpperCaseCode ) {
- FTSLanguage lang;
- Status s = lang.init( "ES" );
- ASSERT( s.isOK() );
- ASSERT_EQUALS( lang.str(), "spanish" );
+ TEST( FTSLanguageV2, UpperCaseCode ) {
+ StatusWithFTSLanguage swl = FTSLanguage::make( "ES", TEXT_INDEX_VERSION_2 );
+ ASSERT( swl.getStatus().isOK() );
+ ASSERT_EQUALS( swl.getValue()->str(), "spanish" );
}
- TEST( FTSLanguage, NoneLanguage ) {
- FTSLanguage lang;
- Status s = lang.init( "none" );
- ASSERT( s.isOK() );
- ASSERT_EQUALS( lang.str(), "none" );
+ TEST( FTSLanguageV2, NoneLanguage ) {
+ StatusWithFTSLanguage swl = FTSLanguage::make( "none", TEXT_INDEX_VERSION_2 );
+ ASSERT( swl.getStatus().isOK() );
+ ASSERT_EQUALS( swl.getValue()->str(), "none" );
}
- // Negative tests for FTSLanguage::init() and FTSLanguage::str().
+ // Negative tests for FTSLanguage::make() with TEXT_INDEX_VERSION_2.
- TEST( FTSLanguage, Unknown ) {
- FTSLanguage lang;
- Status s = lang.init( "spanglish" );
- ASSERT( !s.isOK() );
+ TEST( FTSLanguageV2, Unknown ) {
+ StatusWithFTSLanguage swl = FTSLanguage::make( "spanglish", TEXT_INDEX_VERSION_2 );
+ ASSERT( !swl.getStatus().isOK() );
}
- TEST( FTSLanguage, Empty ) {
- FTSLanguage lang;
- Status s = lang.init( "" );
- ASSERT( !s.isOK() );
+ TEST( FTSLanguageV2, Empty ) {
+ StatusWithFTSLanguage swl = FTSLanguage::make( "", TEXT_INDEX_VERSION_2 );
+ ASSERT( !swl.getStatus().isOK() );
}
- // Positive tests for FTSLanguage::makeFTSLanguage().
+ // Positive tests for FTSLanguage::make() with TEXT_INDEX_VERSION_1.
- TEST( FTSLanguage, MakeFTSLanguage1 ) {
- StatusWithFTSLanguage swl = FTSLanguage::makeFTSLanguage( "english" );
+ TEST( FTSLanguageV1, ExactLanguage ) {
+ StatusWithFTSLanguage swl = FTSLanguage::make( "spanish", TEXT_INDEX_VERSION_1 );
ASSERT( swl.getStatus().isOK() );
- ASSERT_EQUALS( swl.getValue().str(), "english" );
+ ASSERT_EQUALS( swl.getValue()->str(), "spanish" );
}
- // Negative tests for FTSLanguage::makeFTSLanguage().
+ TEST( FTSLanguageV1, DeprecatedLanguage ) {
+ StatusWithFTSLanguage swl = FTSLanguage::make( "porter", TEXT_INDEX_VERSION_1 );
+ ASSERT( swl.getStatus().isOK() );
+ ASSERT_EQUALS( swl.getValue()->str(), "porter" );
+ }
- TEST( FTSLanguage, MakeFTSLanguage2 ) {
- StatusWithFTSLanguage swl = FTSLanguage::makeFTSLanguage( "onglish" );
- ASSERT( !swl.getStatus().isOK() );
+ TEST( FTSLanguageV1, StemmerOnlyLanguage1 ) {
+ StatusWithFTSLanguage swl = FTSLanguage::make( "en", TEXT_INDEX_VERSION_1 );
+ ASSERT( swl.getStatus().isOK() );
+ ASSERT_EQUALS( swl.getValue()->str(), "en" );
+ }
+
+ TEST( FTSLanguageV1, StemmerOnlyLanguage2 ) {
+ StatusWithFTSLanguage swl = FTSLanguage::make( "eng", TEXT_INDEX_VERSION_1 );
+ ASSERT( swl.getStatus().isOK() );
+ ASSERT_EQUALS( swl.getValue()->str(), "eng" );
+ }
+
+ TEST( FTSLanguageV1, NoneLanguage ) {
+ StatusWithFTSLanguage swl = FTSLanguage::make( "none", TEXT_INDEX_VERSION_1 );
+ ASSERT( swl.getStatus().isOK() );
+ ASSERT_EQUALS( swl.getValue()->str(), "none" );
+ }
+
+ // Negative tests for FTSLanguage::make() with TEXT_INDEX_VERSION_1.
+
+ TEST( FTSLanguageV1, CaseSensitive ) {
+ StatusWithFTSLanguage swl = FTSLanguage::make( "SPANISH", TEXT_INDEX_VERSION_1 );
+ ASSERT( swl.getStatus().isOK() );
+ ASSERT_EQUALS( swl.getValue()->str(), "none" );
+ }
+
+ TEST( FTSLanguageV1, Unknown ) {
+ StatusWithFTSLanguage swl = FTSLanguage::make( "asdf", TEXT_INDEX_VERSION_1 );
+ ASSERT( swl.getStatus().isOK() );
+ ASSERT_EQUALS( swl.getValue()->str(), "none" );
+ }
+
+ TEST( FTSLanguageV1, Empty ) {
+ StatusWithFTSLanguage swl = FTSLanguage::make( "", TEXT_INDEX_VERSION_1 );
+ ASSERT( swl.getStatus().isOK() );
+ ASSERT_EQUALS( swl.getValue()->str(), "none" );
}
}
diff --git a/src/mongo/db/fts/fts_query.cpp b/src/mongo/db/fts/fts_query.cpp
index 2f5e215ce64..bab78397614 100644
--- a/src/mongo/db/fts/fts_query.cpp
+++ b/src/mongo/db/fts/fts_query.cpp
@@ -31,6 +31,7 @@
#include "mongo/pch.h"
#include "mongo/db/fts/fts_query.h"
+#include "mongo/db/fts/fts_spec.h"
#include "mongo/db/fts/tokenizer.h"
#include "mongo/util/mongoutils/str.h"
#include "mongo/util/stringutils.h"
@@ -41,22 +42,23 @@ namespace mongo {
using namespace mongoutils;
- Status FTSQuery::parse(const string& query, const string& language) {
+ Status FTSQuery::parse(const string& query, const StringData& language) {
_search = query;
- Status status = _language.init( language );
- if ( !status.isOK() ) {
- return status;
+ StatusWithFTSLanguage swl = FTSLanguage::make( language, TEXT_INDEX_VERSION_2 );
+ if ( !swl.getStatus().isOK() ) {
+ return swl.getStatus();
}
+ _language = swl.getValue();
- const StopWords* stopWords = StopWords::getStopWords( _language );
- Stemmer stemmer( _language );
+ const StopWords* stopWords = StopWords::getStopWords( *_language );
+ Stemmer stemmer( *_language );
bool inNegation = false;
bool inPhrase = false;
unsigned quoteOffset = 0;
- Tokenizer i( _language, query );
+ Tokenizer i( *_language, query );
while ( i.more() ) {
Token t = i.next();
diff --git a/src/mongo/db/fts/fts_query.h b/src/mongo/db/fts/fts_query.h
index 4eec8d404c8..e7ee925b5ca 100644
--- a/src/mongo/db/fts/fts_query.h
+++ b/src/mongo/db/fts/fts_query.h
@@ -50,7 +50,7 @@ namespace mongo {
class FTSQuery {
public:
- Status parse(const string& query, const string& language);
+ Status parse(const string& query, const StringData& language);
const vector<string>& getTerms() const { return _terms; }
const unordered_set<string>& getNegatedTerms() const { return _negatedTerms; }
@@ -69,7 +69,7 @@ namespace mongo {
}
string getSearch() const { return _search; }
- const FTSLanguage getLanguage() const { return _language; }
+ const FTSLanguage& getLanguage() const { return *_language; }
string toString() const;
@@ -77,7 +77,7 @@ namespace mongo {
protected:
string _search;
- FTSLanguage _language;
+ const FTSLanguage* _language;
vector<string> _terms;
unordered_set<string> _negatedTerms;
vector<string> _phrases;
diff --git a/src/mongo/db/fts/fts_spec.cpp b/src/mongo/db/fts/fts_spec.cpp
index be74ae79bf1..fc692b097d6 100644
--- a/src/mongo/db/fts/fts_spec.cpp
+++ b/src/mongo/db/fts/fts_spec.cpp
@@ -30,8 +30,9 @@
#include "mongo/pch.h"
-#include "mongo/db/field_ref.h"
#include "mongo/db/fts/fts_spec.h"
+
+#include "mongo/db/field_ref.h"
#include "mongo/db/fts/fts_util.h"
#include "mongo/util/mongoutils/str.h"
#include "mongo/util/stringutils.h"
@@ -61,14 +62,37 @@ namespace mongo {
}
FTSSpec::FTSSpec( const BSONObj& indexInfo ) {
+ // indexInfo is a text index spec. Text index specs pass through fixSpec() before
+ // being saved to the system.indexes collection. fixSpec() enforces a schema, such that
+ // required fields must exist and be of the correct type (e.g. weights,
+ // textIndexVersion).
massert( 16739, "found invalid spec for text index",
indexInfo["weights"].isABSONObj() );
-
- Status status = _defaultLanguage.init( indexInfo["default_language"].String() );
- verify( status.isOK() );
+ BSONElement textIndexVersionElt = indexInfo["textIndexVersion"];
+ massert( 17367,
+ "found invalid spec for text index, expected number for textIndexVersion",
+ textIndexVersionElt.isNumber() );
+
+ // We currently support TEXT_INDEX_VERSION_1 (deprecated) and TEXT_INDEX_VERSION_2.
+ // Reject all other values.
+ massert( 17364,
+ str::stream() << "attempt to use unsupported textIndexVersion " <<
+ textIndexVersionElt.numberInt() << "; versions supported: " <<
+ TEXT_INDEX_VERSION_2 << ", " << TEXT_INDEX_VERSION_1,
+ textIndexVersionElt.numberInt() == TEXT_INDEX_VERSION_2 ||
+ textIndexVersionElt.numberInt() == TEXT_INDEX_VERSION_1 );
+
+ _textIndexVersion = ( textIndexVersionElt.numberInt() == TEXT_INDEX_VERSION_2 ) ?
+ TEXT_INDEX_VERSION_2 : TEXT_INDEX_VERSION_1;
+
+ // Initialize _defaultLanguage. Note that FTSLanguage::make() requires
+ // textIndexVersion, since language parsing is version-specific.
+ StatusWithFTSLanguage swl =
+ FTSLanguage::make( indexInfo["default_language"].String(), _textIndexVersion );
+ verify( swl.getStatus().isOK() ); // should not fail, since validated by fixSpec().
+ _defaultLanguage = swl.getValue();
_languageOverrideField = indexInfo["language_override"].valuestrsafe();
- verify( validateOverride( _languageOverrideField ) );
_wildcard = false;
@@ -116,8 +140,8 @@ namespace mongo {
}
}
- const FTSLanguage FTSSpec::getLanguageToUse( const BSONObj& userDoc,
- const FTSLanguage currentLanguage ) const {
+ const FTSLanguage& FTSSpec::_getLanguageToUseV2( const BSONObj& userDoc,
+ const FTSLanguage& currentLanguage ) const {
BSONElement e = userDoc[_languageOverrideField];
if ( e.eoo() ) {
return currentLanguage;
@@ -125,11 +149,11 @@ namespace mongo {
uassert( 17261,
"found language override field in document with non-string type",
e.type() == mongo::String );
- StatusWithFTSLanguage swl = FTSLanguage::makeFTSLanguage( e.String() );
+ StatusWithFTSLanguage swl = FTSLanguage::make( e.String(), TEXT_INDEX_VERSION_2 );
uassert( 17262,
"language override unsupported: " + e.String(),
swl.getStatus().isOK() );
- return swl.getValue();
+ return *swl.getValue();
}
@@ -147,11 +171,18 @@ namespace mongo {
}
void FTSSpec::scoreDocument( const BSONObj& obj,
- const FTSLanguage parentLanguage,
+ const FTSLanguage& parentLanguage,
const string& parentPath,
bool isArray,
TermFrequencyMap* term_freqs ) const {
- const FTSLanguage language = getLanguageToUse( obj, parentLanguage );
+
+ if ( _textIndexVersion == TEXT_INDEX_VERSION_1 ) {
+ dassert( parentPath == "" );
+ dassert( !isArray );
+ return _scoreDocumentV1( obj, term_freqs );
+ }
+
+ const FTSLanguage& language = _getLanguageToUseV2( obj, parentLanguage );
Stemmer stemmer( language );
Tools tools( language, &stemmer, StopWords::getStopWords( language ) );
@@ -209,7 +240,7 @@ namespace mongo {
case String:
// Only index strings on exact match or wildcard.
if ( exactMatch || wildcard() ) {
- _scoreString( tools, elem.valuestr(), term_freqs, weight );
+ _scoreStringV2( tools, elem.valuestr(), term_freqs, weight );
}
break;
case Object:
@@ -233,22 +264,10 @@ namespace mongo {
}
}
- namespace {
- struct ScoreHelperStruct {
- ScoreHelperStruct()
- : freq(0), count(0), exp(0){
- }
- double freq;
- double count;
- double exp;
- };
- typedef unordered_map<string,ScoreHelperStruct> ScoreHelperMap;
- }
-
- void FTSSpec::_scoreString( const Tools& tools,
- const StringData& raw,
- TermFrequencyMap* docScores,
- double weight ) const {
+ void FTSSpec::_scoreStringV2( const Tools& tools,
+ const StringData& raw,
+ TermFrequencyMap* docScores,
+ double weight ) const {
ScoreHelperMap terms;
@@ -335,6 +354,10 @@ namespace mongo {
}
BSONObj FTSSpec::fixSpec( const BSONObj& spec ) {
+ if ( spec["textIndexVersion"].numberInt() == TEXT_INDEX_VERSION_1 ) {
+ return _fixSpecV1( spec );
+ }
+
map<string,int> m;
BSONObj keyPattern;
@@ -477,7 +500,8 @@ namespace mongo {
}
uassert( 17264,
"default_language is not valid",
- FTSLanguage::makeFTSLanguage( default_language ).getStatus().isOK() );
+ FTSLanguage::make( default_language,
+ TEXT_INDEX_VERSION_2 ).getStatus().isOK() );
BSONElement language_override_elt = spec["language_override"];
string language_override( language_override_elt.str() );
@@ -492,7 +516,7 @@ namespace mongo {
}
int version = -1;
- int textIndexVersion = 2;
+ int textIndexVersion = TEXT_INDEX_VERSION_2;
BSONObjBuilder b;
BSONObjIterator i( spec );
@@ -523,7 +547,7 @@ namespace mongo {
textIndexVersion = e.numberInt();
uassert( 16730,
str::stream() << "bad textIndexVersion: " << textIndexVersion,
- textIndexVersion == 2 );
+ textIndexVersion == TEXT_INDEX_VERSION_2 );
}
else {
b.append( e );
diff --git a/src/mongo/db/fts/fts_spec.h b/src/mongo/db/fts/fts_spec.h
index 258ecf7407a..570303f181b 100644
--- a/src/mongo/db/fts/fts_spec.h
+++ b/src/mongo/db/fts/fts_spec.h
@@ -46,23 +46,33 @@ namespace mongo {
namespace fts {
extern const double MAX_WEIGHT;
+ extern const double MAX_WORD_WEIGHT;
typedef std::map<string,double> Weights; // TODO cool map
typedef unordered_map<string,double> TermFrequencyMap;
+ // Per-term accumulator filled in while a single string is being scored.
+ // All three fields start at zero. The field notes below describe how
+ // FTSSpec::_scoreStringV1 updates them.
+ struct ScoreHelperStruct {
+ ScoreHelperStruct()
+ : freq(0), count(0), exp(0){
+ }
+ double freq; // running sum of ( 1 / exp ), one contribution per occurrence
+ double count; // number of occurrences of the term
+ double exp; // set to 1 on the first occurrence, doubled on each later one
+ };
+ // Maps a term to its accumulator.
+ typedef unordered_map<string,ScoreHelperStruct> ScoreHelperMap;
class FTSSpec {
struct Tools {
- Tools( const FTSLanguage _language,
+ Tools( const FTSLanguage& _language,
const Stemmer* _stemmer,
const StopWords* _stopwords )
: language( _language )
, stemmer( _stemmer )
, stopwords( _stopwords ) {}
- const FTSLanguage language;
+ const FTSLanguage& language;
const Stemmer* stemmer;
const StopWords* stopwords;
};
@@ -71,7 +81,7 @@ namespace mongo {
FTSSpec( const BSONObj& indexInfo );
bool wildcard() const { return _wildcard; }
- const FTSLanguage defaultLanguage() const { return _defaultLanguage; }
+ const FTSLanguage& defaultLanguage() const { return *_defaultLanguage; }
const string& languageOverrideField() const { return _languageOverrideField; }
size_t numExtraBefore() const { return _extraBefore.size(); }
@@ -89,7 +99,7 @@ namespace mongo {
* - "term_freqs": out-parameter to store results
*/
void scoreDocument( const BSONObj& obj,
- const FTSLanguage parentLanguage,
+ const FTSLanguage& parentLanguage,
const string& parentPath,
bool isArray,
TermFrequencyMap* term_freqs ) const;
@@ -102,20 +112,56 @@ namespace mongo {
const Weights& weights() const { return _weights; }
static BSONObj fixSpec( const BSONObj& spec );
+
private:
+ //
+ // Helper methods. Invoked for TEXT_INDEX_VERSION_2 spec objects only.
+ //
+
/**
* Get the language override for the given BSON doc. If no language override is
* specified, returns currentLanguage.
*/
- const FTSLanguage getLanguageToUse( const BSONObj& userDoc,
- const FTSLanguage currentLanguage ) const;
+ const FTSLanguage& _getLanguageToUseV2( const BSONObj& userDoc,
+ const FTSLanguage& currentLanguage ) const;
+
+ /**
+ * Calculate the term scores for 'raw' and update 'term_freqs' with the result. Parses
+ * 'raw' using 'tools', and weights term scores based on 'weight'.
+ */
+ void _scoreStringV2( const Tools& tools,
+ const StringData& raw,
+ TermFrequencyMap* term_freqs,
+ double weight ) const;
+
+ //
+ // Deprecated helper methods. Invoked for TEXT_INDEX_VERSION_1 spec objects only.
+ //
+
+ void _scoreStringV1( const Tools& tools,
+ const StringData& raw,
+ TermFrequencyMap* docScores,
+ double weight ) const;
+
+ bool _weightV1( const StringData& field, double* out ) const;
+
+ void _scoreRecurseV1( const Tools& tools,
+ const BSONObj& obj,
+ TermFrequencyMap* term_freqs ) const;
+
+ void _scoreDocumentV1( const BSONObj& obj, TermFrequencyMap* term_freqs ) const;
+
+ const FTSLanguage& _getLanguageToUseV1( const BSONObj& userDoc ) const;
+
+ static BSONObj _fixSpecV1( const BSONObj& spec );
+
+ //
+ // Instance variables.
+ //
- void _scoreString( const Tools& tools,
- const StringData& raw,
- TermFrequencyMap* term_freqs,
- double weight ) const;
+ TextIndexVersion _textIndexVersion;
- FTSLanguage _defaultLanguage;
+ const FTSLanguage* _defaultLanguage;
string _languageOverrideField;
bool _wildcard;
diff --git a/src/mongo/db/fts/fts_spec_legacy.cpp b/src/mongo/db/fts/fts_spec_legacy.cpp
new file mode 100644
index 00000000000..556cac1e091
--- /dev/null
+++ b/src/mongo/db/fts/fts_spec_legacy.cpp
@@ -0,0 +1,320 @@
+/**
+ * Copyright (C) 2014 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/db/fts/fts_spec.h"
+
+#include "mongo/util/mongoutils/str.h"
+
+namespace mongo {
+
+ namespace fts {
+
+ //
+ // This file contains functionality specific to indexing documents from TEXT_INDEX_VERSION_1
+ // text indexes.
+ //
+
+ using namespace mongoutils;
+
+ namespace {
+     /** Appends the two text index key fields, "_fts" and then "_ftsx", to 'keyBuilder'. */
+     void _addFTSStuff( BSONObjBuilder* keyBuilder ) {
+         BSONObjBuilder& out = *keyBuilder;
+         out.append( "_fts", INDEX_NAME );
+         out.append( "_ftsx", 1 );
+     }
+ }
+
+ /**
+  * Picks the language used to index 'userDoc' for a TEXT_INDEX_VERSION_1 index.
+  * The document's language override field wins when it holds a non-empty string;
+  * in every other case the index's default language is returned.
+  */
+ const FTSLanguage& FTSSpec::_getLanguageToUseV1( const BSONObj& userDoc ) const {
+     BSONElement overrideElt = userDoc[_languageOverrideField];
+
+     // Missing field, or a field that is not a string: no override.
+     if ( overrideElt.type() != String ) {
+         return *_defaultLanguage;
+     }
+
+     // An empty string is treated the same as no override at all.
+     const char * languageName = overrideElt.valuestrsafe();
+     if ( languageName[0] == '\0' ) {
+         return *_defaultLanguage;
+     }
+
+     StatusWithFTSLanguage swl = FTSLanguage::make( languageName, TEXT_INDEX_VERSION_1 );
+     dassert( swl.isOK() ); // make() w/ TEXT_INDEX_VERSION_1 guaranteed to not fail.
+     return *swl.getValue();
+ }
+
+ /**
+  * Scores the terms of 'raw' and adds the results to 'docScores'.
+  *
+  * 'raw' is tokenized with the language in 'tools'. Only TEXT tokens are kept; each is
+  * lower-cased, dropped if it is a stop word, and stemmed. Every surviving term then
+  * contributes ( weight * freq * coeff * adjustment ) to its entry in 'docScores'.
+  */
+ void FTSSpec::_scoreStringV1( const Tools& tools,
+                               const StringData& raw,
+                               TermFrequencyMap* docScores,
+                               double weight ) const {
+
+     // Pass 1: collect per-term statistics for this string.
+     ScoreHelperMap termStats;
+     unsigned tokenCount = 0; // counts only the tokens that survive filtering
+
+     for ( Tokenizer tokenizer( tools.language, raw ); tokenizer.more(); ) {
+         Token token = tokenizer.next();
+         if ( token.type != Token::TEXT )
+             continue;
+
+         string term = token.data.toString();
+         makeLower( &term );
+         if ( tools.stopwords->isStopWord( term ) )
+             continue;
+         term = tools.stemmer->stem( term );
+
+         ScoreHelperStruct& stats = termStats[term];
+
+         // Each repeat of a term is worth half as much as the previous occurrence.
+         stats.exp = stats.exp ? stats.exp * 2 : 1;
+         stats.count += 1;
+         stats.freq += ( 1 / stats.exp );
+
+         ++tokenCount;
+     }
+
+     // Pass 2: fold the statistics into the caller's score map.
+     for ( ScoreHelperMap::const_iterator entry = termStats.begin();
+           entry != termStats.end();
+           ++entry ) {
+
+         const string& term = entry->first;
+         const ScoreHelperStruct& stats = entry->second;
+
+         // Adjust the weight as a function of term count relative to total field
+         // length: is this the only word, a frequently occurring term, or does it
+         // show up just once in a long block of text?
+         double coeff = ( 0.5 * stats.count / tokenCount ) + 0.5;
+
+         // A term identical to the whole (untokenized) field gets a small boost.
+         double adjustment = 1;
+         if ( raw.size() == term.length() && raw.equalCaseInsensitive( term ) )
+             adjustment += 0.1;
+
+         double& score = (*docScores)[term];
+         score += ( weight * stats.freq * coeff * adjustment );
+         verify( score <= MAX_WEIGHT );
+     }
+ }
+
+ /**
+  * Looks up the weight registered for 'field'.
+  * Returns true and stores the weight in '*out' when one exists; otherwise returns
+  * false and leaves '*out' untouched.
+  */
+ bool FTSSpec::_weightV1( const StringData& field, double* out ) const {
+     Weights::const_iterator match = _weights.find( field.toString() );
+     const bool found = ( match != _weights.end() );
+     if ( found ) {
+         *out = match->second;
+     }
+     return found;
+ }
+
+ /*
+  * Walks every element of 'obj' (a document, or a sub-object reached while recursing)
+  * and adds the scores of the strings it finds to 'term_freqs'.
+  *
+  * - Elements named after the language override field are ignored.
+  * - String elements are scored with the weight registered under the element's own
+  *   field name, or with weight 1 when no such weight exists.
+  * - Elements for which isABSONObj() holds are recursed into.
+  *
+  * @param tools, language, stemmer and stop words used to parse strings
+  * @param obj, object being parsed
+  * @param term_freqs, map <term,score> to be filled up
+  */
+ void FTSSpec::_scoreRecurseV1( const Tools& tools,
+                                const BSONObj& obj,
+                                TermFrequencyMap* term_freqs ) const {
+     for ( BSONObjIterator fields( obj ); fields.more(); ) {
+         BSONElement elt = fields.next();
+
+         if ( languageOverrideField() == elt.fieldName() )
+             continue;
+
+         if ( elt.type() == String ) {
+             // _weightV1() leaves 'fieldWeight' at 1 when the field has no weight.
+             double fieldWeight = 1;
+             _weightV1( elt.fieldName(), &fieldWeight );
+             _scoreStringV1( tools, elt.valuestr(), term_freqs, fieldWeight );
+             continue;
+         }
+
+         if ( elt.isABSONObj() ) {
+             _scoreRecurseV1( tools, elt.Obj(), term_freqs );
+         }
+     }
+ }
+
+ /**
+ * Entry point for scoring a document against a TEXT_INDEX_VERSION_1 index.
+ * Resolves the document's language, then either recurses over every field (wildcard
+ * index) or scores only the fields listed in _weights. Results are accumulated into
+ * 'term_freqs'.
+ */
+ void FTSSpec::_scoreDocumentV1( const BSONObj& obj,
+ TermFrequencyMap* term_freqs ) const {
+
+ const FTSLanguage& language = _getLanguageToUseV1( obj );
+
+ // 'tools' holds a pointer to 'stemmer', so both live for the whole call.
+ Stemmer stemmer(language);
+ Tools tools(language, &stemmer, StopWords::getStopWords( language ));
+
+ if ( wildcard() ) {
+ // if * is specified for weight, we can recurse over all fields.
+ _scoreRecurseV1(tools, obj, term_freqs);
+ return;
+ }
+
+ // otherwise, we need to remember the different weights for each field
+ // and act accordingly (in other words, call _scoreStringV1 per weighted field)
+ for ( Weights::const_iterator i = _weights.begin(); i != _weights.end(); i++ ) {
+ // NOTE(review): getFieldDottedOrArray presumably advances leftOverName past the
+ // path components it consumed, leaving the unresolved remainder -- confirm.
+ const char * leftOverName = i->first.c_str();
+ // element found for the (possibly dotted) field name
+ BSONElement e = obj.getFieldDottedOrArray(leftOverName);
+ // weight associated to name of field
+ double weight = i->second;
+
+ if ( e.eoo() ) {
+ // do nothing
+ }
+ else if ( e.type() == Array ) {
+ // Only the direct members of this array are examined; a member that is
+ // itself an object is resolved against the remaining path, if any.
+ BSONObjIterator j( e.Obj() );
+ while ( j.more() ) {
+ BSONElement x = j.next();
+ if ( leftOverName[0] && x.isABSONObj() )
+ x = x.Obj().getFieldDotted( leftOverName );
+ if ( x.type() == String )
+ _scoreStringV1( tools, x.valuestr(), term_freqs, weight );
+ }
+ }
+ else if ( e.type() == String ) {
+ _scoreStringV1( tools, e.valuestr(), term_freqs, weight );
+ }
+
+ }
+ }
+
+ /**
+  * Normalizes a user-supplied index spec for a TEXT_INDEX_VERSION_1 text index.
+  *
+  * - "key": fields whose value is the string "fts" or "text" are removed from the key
+  *   pattern and recorded with weight 1; the "_fts"/"_ftsx" pair is inserted once, at
+  *   the position of the first such field (or at the end if none was seen). Existing
+  *   "_fts"/"_ftsx" entries and all other key fields are copied through.
+  * - "weights": entries of a weights object are merged in (as ints); a weights value
+  *   equal to WILDCARD yields a single wildcard weight of 1.
+  * - "default_language" defaults to "english", "language_override" to "language".
+  * - All other fields are copied in their original order; "v" (if present) and
+  *   "textIndexVersion" are written last.
+  *
+  * Throws via uassert 17365 unless every weight is > 0 and < MAX_WORD_WEIGHT, and via
+  * uassert 17366 if the spec carries a textIndexVersion other than
+  * TEXT_INDEX_VERSION_1.
+  */
+ BSONObj FTSSpec::_fixSpecV1( const BSONObj& spec ) {
+     map<string,int> m;
+
+     BSONObj keyPattern;
+     {
+         BSONObjBuilder b;
+         bool addedFtsStuff = false;
+
+         BSONObjIterator i( spec["key"].Obj() );
+         while ( i.more() ) {
+             BSONElement e = i.next();
+             if ( str::equals( e.fieldName(), "_fts" ) ||
+                  str::equals( e.fieldName(), "_ftsx" ) ) {
+                 // The spec was already normalized once; keep its entries as they are.
+                 addedFtsStuff = true;
+                 b.append( e );
+             }
+             else if ( e.type() == String &&
+                       ( str::equals( "fts", e.valuestr() ) ||
+                         str::equals( "text", e.valuestr() ) ) ) {
+
+                 if ( !addedFtsStuff ) {
+                     _addFTSStuff( &b );
+                     addedFtsStuff = true;
+                 }
+
+                 m[e.fieldName()] = 1;
+             }
+             else {
+                 b.append( e );
+             }
+         }
+
+         if ( !addedFtsStuff )
+             _addFTSStuff( &b );
+
+         keyPattern = b.obj();
+     }
+
+     if ( spec["weights"].isABSONObj() ) {
+         BSONObjIterator i( spec["weights"].Obj() );
+         while ( i.more() ) {
+             BSONElement e = i.next();
+             m[e.fieldName()] = e.numberInt();
+         }
+     }
+     else if ( spec["weights"].str() == WILDCARD ) {
+         m[WILDCARD] = 1;
+     }
+
+     BSONObj weights;
+     {
+         BSONObjBuilder b;
+         for ( map<string,int>::iterator i = m.begin(); i != m.end(); ++i ) {
+             uassert( 17365, "score for word too high",
+                      i->second > 0 && i->second < MAX_WORD_WEIGHT );
+             b.append( i->first, i->second );
+         }
+         weights = b.obj();
+     }
+
+     string default_language(spec.getStringField("default_language"));
+     if ( default_language.empty() )
+         default_language = "english";
+
+     string language_override(spec.getStringField("language_override"));
+     if ( language_override.empty() )
+         language_override = "language";
+
+     int version = -1;
+     int textIndexVersion = TEXT_INDEX_VERSION_1;
+
+     BSONObjBuilder b;
+     BSONObjIterator i( spec );
+     while ( i.more() ) {
+         BSONElement e = i.next();
+         if ( str::equals( e.fieldName(), "key" ) ) {
+             b.append( "key", keyPattern );
+         }
+         else if ( str::equals( e.fieldName(), "weights" ) ) {
+             b.append( "weights", weights );
+             // Cleared so the field is not appended a second time below.
+             weights = BSONObj();
+         }
+         else if ( str::equals( e.fieldName(), "default_language" ) ) {
+             b.append( "default_language", default_language);
+             default_language = "";
+         }
+         else if ( str::equals( e.fieldName(), "language_override" ) ) {
+             b.append( "language_override", language_override);
+             language_override = "";
+         }
+         else if ( str::equals( e.fieldName(), "v" ) ) {
+             version = e.numberInt();
+         }
+         else if ( str::equals( e.fieldName(), "textIndexVersion" ) ) {
+             textIndexVersion = e.numberInt();
+             uassert( 17366,
+                      str::stream() << "bad textIndexVersion: " << textIndexVersion,
+                      textIndexVersion == TEXT_INDEX_VERSION_1 );
+         }
+         else {
+             b.append( e );
+         }
+     }
+
+     // Append whichever normalized fields the incoming spec did not already contain.
+     if ( !weights.isEmpty() )
+         b.append( "weights", weights );
+     if ( !default_language.empty() )
+         b.append( "default_language", default_language);
+     if ( !language_override.empty() )
+         b.append( "language_override", language_override);
+
+     if ( version >= 0 )
+         b.append( "v", version );
+
+     b.append( "textIndexVersion", textIndexVersion );
+
+     return b.obj();
+ }
+ }
+}
diff --git a/src/mongo/db/fts/fts_spec_test.cpp b/src/mongo/db/fts/fts_spec_test.cpp
index 73c53758229..8323ecc0cea 100644
--- a/src/mongo/db/fts/fts_spec_test.cpp
+++ b/src/mongo/db/fts/fts_spec_test.cpp
@@ -160,6 +160,9 @@ namespace mongo {
}
TEST( FTSSpec, FixTextIndexVersion1 ) {
+ assertFixSuccess("{key: {a: 'text'}, textIndexVersion: 1.0}}");
+ assertFixSuccess("{key: {a: 'text'}, textIndexVersion: NumberInt(1)}}");
+ assertFixSuccess("{key: {a: 'text'}, textIndexVersion: NumberLong(1)}}");
assertFixSuccess("{key: {a: 'text'}, textIndexVersion: 2.0}}");
assertFixSuccess("{key: {a: 'text'}, textIndexVersion: NumberInt(2)}}");
assertFixSuccess("{key: {a: 'text'}, textIndexVersion: NumberLong(2)}}");
@@ -178,7 +181,7 @@ namespace mongo {
TermFrequencyMap m;
spec.scoreDocument( BSON( "title" << "cat sat run" ),
- FTSLanguage::makeFTSLanguage( "english" ).getValue(),
+ spec.defaultLanguage(),
"",
false,
&m );
@@ -197,7 +200,7 @@ namespace mongo {
TermFrequencyMap m;
spec.scoreDocument( BSON( "title" << "cat sat run" << "text" << "cat book" ),
- FTSLanguage::makeFTSLanguage( "english" ).getValue(),
+ spec.defaultLanguage(),
"",
false,
&m );
@@ -220,7 +223,7 @@ namespace mongo {
TermFrequencyMap m;
spec.scoreDocument( BSON( "a" << BSON( "b" << "term" ) ),
- FTSLanguage::makeFTSLanguage( "english" ).getValue(),
+ spec.defaultLanguage(),
"",
false,
&m );
@@ -236,7 +239,7 @@ namespace mongo {
TermFrequencyMap m;
spec.scoreDocument( BSON( "title" << "cat sat sat run run run" ),
- FTSLanguage::makeFTSLanguage( "english" ).getValue(),
+ spec.defaultLanguage(),
"",
false,
&m );
@@ -308,11 +311,7 @@ namespace mongo {
// The following document matches {"a.b": {$type: 2}}, so "term" should be indexed.
BSONObj obj = fromjson("{a: [{b: ['term']}]}"); // indirectly nested arrays
TermFrequencyMap m;
- spec.scoreDocument( obj,
- FTSLanguage::makeFTSLanguage( "english" ).getValue(),
- "",
- false,
- &m );
+ spec.scoreDocument( obj, spec.defaultLanguage(), "", false, &m );
ASSERT_EQUALS( 1U, m.size() );
}
@@ -323,11 +322,7 @@ namespace mongo {
// The wildcard spec implies a full recursive traversal, so "term" should be indexed.
BSONObj obj = fromjson("{a: {b: [['term']]}}"); // directly nested arrays
TermFrequencyMap m;
- spec.scoreDocument( obj,
- FTSLanguage::makeFTSLanguage( "english" ).getValue(),
- "",
- false,
- &m );
+ spec.scoreDocument( obj, spec.defaultLanguage(), "", false, &m );
ASSERT_EQUALS( 1U, m.size() );
}
@@ -339,11 +334,7 @@ namespace mongo {
// indexed.
BSONObj obj = fromjson("{a: {b: [['term']]}}"); // directly nested arrays
TermFrequencyMap m;
- spec.scoreDocument( obj,
- FTSLanguage::makeFTSLanguage( "english" ).getValue(),
- "",
- false,
- &m );
+ spec.scoreDocument( obj, spec.defaultLanguage(), "", false, &m );
ASSERT_EQUALS( 0U, m.size() );
}
@@ -362,11 +353,7 @@ namespace mongo {
" }"
" }" );
- spec.scoreDocument( obj,
- FTSLanguage::makeFTSLanguage( "english" ).getValue(),
- "",
- false,
- &tfm );
+ spec.scoreDocument( obj, spec.defaultLanguage(), "", false, &tfm );
set<string> hits;
hits.insert("walk");
@@ -397,11 +384,7 @@ namespace mongo {
" }"
"}" );
- spec.scoreDocument( obj,
- FTSLanguage::makeFTSLanguage( "english" ).getValue(),
- "",
- false,
- &tfm );
+ spec.scoreDocument( obj, spec.defaultLanguage(), "", false, &tfm );
set<string> hits;
hits.insert("foredrag");
@@ -432,11 +415,7 @@ namespace mongo {
" } ]"
"}" );
- spec.scoreDocument( obj,
- FTSLanguage::makeFTSLanguage( "english" ).getValue(),
- "",
- false,
- &tfm );
+ spec.scoreDocument( obj, spec.defaultLanguage(), "", false, &tfm );
set<string> hits;
hits.insert("foredrag");
@@ -469,11 +448,7 @@ namespace mongo {
" }"
"}" );
- spec.scoreDocument( obj,
- FTSLanguage::makeFTSLanguage( "english" ).getValue(),
- "",
- false,
- &tfm );
+ spec.scoreDocument( obj, spec.defaultLanguage(), "", false, &tfm );
set<string> hits;
hits.insert("foredrag");
@@ -506,11 +481,7 @@ namespace mongo {
" }"
"}" );
- spec.scoreDocument( obj,
- FTSLanguage::makeFTSLanguage( "english" ).getValue(),
- "",
- false,
- &tfm );
+ spec.scoreDocument( obj, spec.defaultLanguage(), "", false, &tfm );
set<string> hits;
hits.insert("foredrag");
@@ -545,11 +516,7 @@ namespace mongo {
" }"
"}" );
- spec.scoreDocument( obj,
- FTSLanguage::makeFTSLanguage( "english" ).getValue(),
- "",
- false,
- &tfm );
+ spec.scoreDocument( obj, spec.defaultLanguage(), "", false, &tfm );
set<string> hits;
hits.insert("foredrag");
@@ -564,6 +531,52 @@ namespace mongo {
}
+ /**
+  * Test differences across textIndexVersion values in handling of nested arrays.
+  * The same document is scored against an 'a.b' text index of each version.
+  */
+ TEST( FTSSpec, TextIndexLegacyNestedArrays ) {
+     BSONObj obj = fromjson( "{a: [{b: ['hello']}]}" );
+
+     // textIndexVersion=1 FTSSpec objects do not index nested arrays.
+     {
+         BSONObj indexSpec = fromjson( "{key: {'a.b': 'text'}, textIndexVersion: 1}" );
+         FTSSpec spec( FTSSpec::fixSpec( indexSpec ) );
+         TermFrequencyMap tfm;
+         spec.scoreDocument( obj, spec.defaultLanguage(), "", false, &tfm );
+         ASSERT_EQUALS( 0U, tfm.size() );
+     }
+
+     // textIndexVersion=2 FTSSpec objects do index nested arrays.
+     {
+         BSONObj indexSpec = fromjson( "{key: {'a.b': 'text'}, textIndexVersion: 2}" );
+         FTSSpec spec( FTSSpec::fixSpec( indexSpec ) );
+         TermFrequencyMap tfm;
+         spec.scoreDocument( obj, spec.defaultLanguage(), "", false, &tfm );
+         ASSERT_EQUALS( 1U, tfm.size() );
+     }
+ }
+
+ /**
+  * Test differences across textIndexVersion values in handling of language annotations.
+  * The document carries the two-letter annotation 'EN' and the single word "the".
+  */
+ TEST( FTSSpec, TextIndexLegacyLanguageRecognition ) {
+     BSONObj obj = fromjson( "{a: 'the', language: 'EN'}" );
+
+     // textIndexVersion=1 FTSSpec objects treat two-letter language annotations as "none"
+     // for purposes of stopword processing.
+     {
+         BSONObj indexSpec = fromjson( "{key: {'a': 'text'}, textIndexVersion: 1}" );
+         FTSSpec spec( FTSSpec::fixSpec( indexSpec ) );
+         TermFrequencyMap tfm;
+         spec.scoreDocument( obj, spec.defaultLanguage(), "", false, &tfm );
+         ASSERT_EQUALS( 1U, tfm.size() ); // "the" not recognized as stopword
+     }
+
+     // textIndexVersion=2 FTSSpec objects recognize two-letter codes.
+     {
+         BSONObj indexSpec = fromjson( "{key: {'a': 'text'}, textIndexVersion: 2}" );
+         FTSSpec spec( FTSSpec::fixSpec( indexSpec ) );
+         TermFrequencyMap tfm;
+         spec.scoreDocument( obj, spec.defaultLanguage(), "", false, &tfm );
+         ASSERT_EQUALS( 0U, tfm.size() ); // "the" recognized as stopword
+     }
+ }
}
}
diff --git a/src/mongo/db/fts/fts_util.h b/src/mongo/db/fts/fts_util.h
index d16f605c16c..fe9ad0de341 100644
--- a/src/mongo/db/fts/fts_util.h
+++ b/src/mongo/db/fts/fts_util.h
@@ -44,6 +44,12 @@ namespace mongo {
extern const std::string WILDCARD;
extern const std::string INDEX_NAME;
+ /**
+ * Text index format versions, matching the numeric value of the "textIndexVersion"
+ * index option. The FTSSpec constructor rejects any value other than these two.
+ */
+ enum TextIndexVersion {
+ TEXT_INDEX_VERSION_1 = 1, // Legacy index format. Deprecated.
+ TEXT_INDEX_VERSION_2 = 2 // Current index format.
+ };
+
+
/**
* destructive!
*/
diff --git a/src/mongo/db/fts/generate_stop_words.py b/src/mongo/db/fts/generate_stop_words.py
index 2434e30fdc7..e0dc801ca92 100644
--- a/src/mongo/db/fts/generate_stop_words.py
+++ b/src/mongo/db/fts/generate_stop_words.py
@@ -10,13 +10,13 @@ def generate( header, source, language_files ):
out = open( header, "wb" )
out.write( """
#pragma once
-#include <map>
#include <set>
#include <string>
+#include "mongo/util/string_map.h"
namespace mongo {
namespace fts {
- void loadStopWordMap( std::map< std::string, std::set< std::string > >* m );
+ void loadStopWordMap( StringMap< std::set< std::string > >* m );
}
}
""" )
@@ -30,7 +30,7 @@ namespace fts {
namespace mongo {
namespace fts {
- void loadStopWordMap( std::map< std::string, std::set< std::string > >* m ) {
+ void loadStopWordMap( StringMap< std::set< std::string > >* m ) {
""" )
diff --git a/src/mongo/db/fts/stemmer.cpp b/src/mongo/db/fts/stemmer.cpp
index 5c4431f9712..a86cfda8015 100644
--- a/src/mongo/db/fts/stemmer.cpp
+++ b/src/mongo/db/fts/stemmer.cpp
@@ -32,12 +32,13 @@
#include <string>
#include "mongo/db/fts/stemmer.h"
+#include "mongo/util/mongoutils/str.h"
namespace mongo {
namespace fts {
- Stemmer::Stemmer( const FTSLanguage language ) {
+ Stemmer::Stemmer( const FTSLanguage& language ) {
_stemmer = NULL;
if ( language.str() != "none" )
_stemmer = sb_stemmer_new(language.str().c_str(), "UTF_8");
diff --git a/src/mongo/db/fts/stemmer.h b/src/mongo/db/fts/stemmer.h
index 9b06bda4f2e..fe028e2aba7 100644
--- a/src/mongo/db/fts/stemmer.h
+++ b/src/mongo/db/fts/stemmer.h
@@ -48,7 +48,7 @@ namespace mongo {
*/
class Stemmer {
public:
- Stemmer( const FTSLanguage language );
+ Stemmer( const FTSLanguage& language );
~Stemmer();
std::string stem( const StringData& word ) const;
diff --git a/src/mongo/db/fts/stemmer_test.cpp b/src/mongo/db/fts/stemmer_test.cpp
index 1833f20fe37..9037715d4da 100644
--- a/src/mongo/db/fts/stemmer_test.cpp
+++ b/src/mongo/db/fts/stemmer_test.cpp
@@ -31,16 +31,23 @@
#include "mongo/unittest/unittest.h"
+#include "mongo/db/fts/fts_spec.h"
#include "mongo/db/fts/stemmer.h"
namespace mongo {
namespace fts {
TEST( English, Stemmer1 ) {
- Stemmer s( FTSLanguage::makeFTSLanguage( "english" ).getValue() );
+ Stemmer s( languageEnglishV2 );
ASSERT_EQUALS( "run", s.stem( "running" ) );
ASSERT_EQUALS( "Run", s.stem( "Running" ) );
}
+ // Stems the same word in lower case and capitalized form with a stemmer built from
+ // languagePorterV1; the two inputs are expected to produce different stems.
+ TEST( English, Caps ) {
+ Stemmer s( languagePorterV1 );
+ ASSERT_EQUALS( "unit", s.stem( "united" ) );
+ ASSERT_EQUALS( "Unite", s.stem( "United" ) );
+ }
+
}
}
diff --git a/src/mongo/db/fts/stop_words.cpp b/src/mongo/db/fts/stop_words.cpp
index d858992f5ce..bc0240600c1 100644
--- a/src/mongo/db/fts/stop_words.cpp
+++ b/src/mongo/db/fts/stop_words.cpp
@@ -28,14 +28,13 @@
* it in the license file.
*/
-#include <map>
#include <set>
#include <string>
#include "mongo/db/fts/stop_words.h"
#include "mongo/base/init.h"
-#include "mongo/platform/unordered_map.h"
+#include "mongo/util/string_map.h"
@@ -43,10 +42,10 @@ namespace mongo {
namespace fts {
- void loadStopWordMap( std::map< std::string, std::set< std::string > >* m );
+ void loadStopWordMap( StringMap< std::set< std::string > >* m );
namespace {
- unordered_map<string,StopWords*> STOP_WORDS;
+ StringMap<StopWords*> STOP_WORDS;
StopWords* empty = NULL;
}
@@ -59,8 +58,8 @@ namespace mongo {
_words.insert( *i );
}
- const StopWords* StopWords::getStopWords( const FTSLanguage language ) {
- unordered_map<string,StopWords*>::const_iterator i = STOP_WORDS.find( language.str() );
+ const StopWords* StopWords::getStopWords( const FTSLanguage& language ) {
+ StringMap<StopWords*>::const_iterator i = STOP_WORDS.find( language.str() );
if ( i == STOP_WORDS.end() )
return empty;
return i->second;
@@ -70,9 +69,9 @@ namespace mongo {
MONGO_INITIALIZER(StopWords)(InitializerContext* context) {
empty = new StopWords();
- std::map< std::string, std::set< std::string > > raw;
+ StringMap< std::set< std::string > > raw;
loadStopWordMap( &raw );
- for ( std::map< std::string, std::set< std::string > >::const_iterator i = raw.begin();
+ for ( StringMap< std::set< std::string > >::const_iterator i = raw.begin();
i != raw.end();
++i ) {
STOP_WORDS[i->first] = new StopWords( i->second );
diff --git a/src/mongo/db/fts/stop_words.h b/src/mongo/db/fts/stop_words.h
index 24e433c6992..22ec22f3fa8 100644
--- a/src/mongo/db/fts/stop_words.h
+++ b/src/mongo/db/fts/stop_words.h
@@ -52,7 +52,7 @@ namespace mongo {
size_t numStopWords() const { return _words.size(); }
- static const StopWords* getStopWords( const FTSLanguage langauge );
+ // Returns the stop word list for 'language'.
+ static const StopWords* getStopWords( const FTSLanguage& language );
private:
~StopWords(){}
unordered_set<std::string> _words;
diff --git a/src/mongo/db/fts/stop_words_test.cpp b/src/mongo/db/fts/stop_words_test.cpp
index 4d6b78f7f6b..0edf4e2540c 100644
--- a/src/mongo/db/fts/stop_words_test.cpp
+++ b/src/mongo/db/fts/stop_words_test.cpp
@@ -28,6 +28,7 @@
* it in the license file.
*/
+#include "mongo/db/fts/fts_spec.h"
#include "mongo/db/fts/stop_words.h"
#include "mongo/unittest/unittest.h"
@@ -35,8 +36,7 @@ namespace mongo {
namespace fts {
TEST( English, Basic1 ) {
- FTSLanguage language = FTSLanguage::makeFTSLanguage( "english" ).getValue();
- const StopWords* englishStopWords = StopWords::getStopWords( language );
+ const StopWords* englishStopWords = StopWords::getStopWords( languageEnglishV2 );
ASSERT( englishStopWords->isStopWord( "the" ) );
ASSERT( !englishStopWords->isStopWord( "computer" ) );
}
diff --git a/src/mongo/db/fts/tokenizer.cpp b/src/mongo/db/fts/tokenizer.cpp
index 1a25898bd75..53580fca4be 100644
--- a/src/mongo/db/fts/tokenizer.cpp
+++ b/src/mongo/db/fts/tokenizer.cpp
@@ -31,15 +31,16 @@
#include <string>
#include "mongo/db/fts/tokenizer.h"
+#include "mongo/util/mongoutils/str.h"
#include "mongo/util/stringutils.h"
namespace mongo {
namespace fts {
- Tokenizer::Tokenizer( const FTSLanguage language, const StringData& str )
+ Tokenizer::Tokenizer( const FTSLanguage& language, const StringData& str )
: _pos(0), _raw( str ) {
- _english = language.str() == "english";
+ _english = ( language.str() == "english" );
_skipWhitespace();
_previousWhiteSpace = true;
}
diff --git a/src/mongo/db/fts/tokenizer.h b/src/mongo/db/fts/tokenizer.h
index 6930f7543f6..06e63f6aa2f 100644
--- a/src/mongo/db/fts/tokenizer.h
+++ b/src/mongo/db/fts/tokenizer.h
@@ -61,7 +61,7 @@ namespace mongo {
class Tokenizer {
public:
- Tokenizer( const FTSLanguage language, const StringData& str );
+ Tokenizer( const FTSLanguage& language, const StringData& str );
bool more() const;
Token next();
diff --git a/src/mongo/db/fts/tokenizer_test.cpp b/src/mongo/db/fts/tokenizer_test.cpp
index eac91987c61..29153a329a6 100644
--- a/src/mongo/db/fts/tokenizer_test.cpp
+++ b/src/mongo/db/fts/tokenizer_test.cpp
@@ -28,6 +28,7 @@
* it in the license file.
*/
+#include "mongo/db/fts/fts_spec.h"
#include "mongo/db/fts/tokenizer.h"
#include "mongo/unittest/unittest.h"
@@ -35,14 +36,12 @@ namespace mongo {
namespace fts {
TEST( Tokenizer, Empty1 ) {
- Tokenizer i( FTSLanguage::makeFTSLanguage( "english" ).getValue(),
- "" );
+ Tokenizer i( languageEnglishV2, "" );
ASSERT( !i.more() );
}
TEST( Tokenizer, Basic1 ) {
- Tokenizer i( FTSLanguage::makeFTSLanguage( "english" ).getValue(),
- "blue red green" );
+ Tokenizer i( languageEnglishV2, "blue red green" );
ASSERT( i.more() );
ASSERT_EQUALS( i.next().data.toString(), "blue" );
@@ -57,8 +56,7 @@ namespace mongo {
}
TEST( Tokenizer, Basic2 ) {
- Tokenizer i( FTSLanguage::makeFTSLanguage( "english" ).getValue(),
- "blue-red" );
+ Tokenizer i( languageEnglishV2, "blue-red" );
Token a = i.next();
Token b = i.next();
@@ -80,8 +78,7 @@ namespace mongo {
}
TEST( Tokenizer, Basic3 ) {
- Tokenizer i( FTSLanguage::makeFTSLanguage( "english" ).getValue(),
- "blue -red" );
+ Tokenizer i( languageEnglishV2, "blue -red" );
Token a = i.next();
Token b = i.next();
@@ -108,8 +105,7 @@ namespace mongo {
}
TEST( Tokenizer, Quote1English ) {
- Tokenizer i( FTSLanguage::makeFTSLanguage( "english" ).getValue(),
- "eliot's car" );
+ Tokenizer i( languageEnglishV2, "eliot's car" );
Token a = i.next();
Token b = i.next();
@@ -119,8 +115,7 @@ namespace mongo {
}
TEST( Tokenizer, Quote1French ) {
- Tokenizer i( FTSLanguage::makeFTSLanguage( "french" ).getValue(),
- "eliot's car" );
+ Tokenizer i( languageFrenchV2, "eliot's car" );
Token a = i.next();
Token b = i.next();
diff --git a/src/mongo/db/matcher/expression_parser_text.cpp b/src/mongo/db/matcher/expression_parser_text.cpp
index 3b2b6bd6ea0..86b16e71130 100644
--- a/src/mongo/db/matcher/expression_parser_text.cpp
+++ b/src/mongo/db/matcher/expression_parser_text.cpp
@@ -30,6 +30,7 @@
#include "mongo/base/init.h"
#include "mongo/db/fts/fts_language.h"
+#include "mongo/db/fts/fts_spec.h"
#include "mongo/db/jsobj.h"
#include "mongo/db/matcher/expression_parser.h"
#include "mongo/db/matcher/expression_text.h"
@@ -52,7 +53,9 @@ namespace mongo {
"$language needs a String" );
}
language = languageElt.String();
- if ( !fts::FTSLanguage::makeFTSLanguage( language ).getStatus().isOK() ) {
+ Status status =
+ fts::FTSLanguage::make( language, fts::TEXT_INDEX_VERSION_2 ).getStatus();
+ if ( !status.isOK() ) {
return StatusWithMatchExpression( ErrorCodes::BadValue,
"$language specifies unsupported language" );
}
diff --git a/src/mongo/db/query/stage_builder.cpp b/src/mongo/db/query/stage_builder.cpp
index 781e4eaff19..764675bdd5d 100644
--- a/src/mongo/db/query/stage_builder.cpp
+++ b/src/mongo/db/query/stage_builder.cpp
@@ -234,9 +234,9 @@ namespace mongo {
return NULL;
}
- string language = ("" == node->_language
- ? fam->getSpec().defaultLanguage().str()
- : node->_language);
+ StringData language = ("" == node->_language
+ ? fam->getSpec().defaultLanguage().str().c_str()
+ : node->_language);
FTSQuery ftsq;
Status parseStatus = ftsq.parse(node->_query, language);