diff options
Diffstat (limited to 'src/mongo/db')
24 files changed, 887 insertions, 291 deletions
diff --git a/src/mongo/db/exec/stagedebug_cmd.cpp b/src/mongo/db/exec/stagedebug_cmd.cpp index dd3f53d9bb9..cfaea092033 100644 --- a/src/mongo/db/exec/stagedebug_cmd.cpp +++ b/src/mongo/db/exec/stagedebug_cmd.cpp @@ -373,7 +373,8 @@ namespace mongo { params.spec = fam->getSpec(); - if (!params.query.parse(search, fam->getSpec().defaultLanguage().str()).isOK()) { + if (!params.query.parse(search, + fam->getSpec().defaultLanguage().str().c_str()).isOK()) { return NULL; } diff --git a/src/mongo/db/fts/SConscript b/src/mongo/db/fts/SConscript index 7ec267cc6ad..afd6e0386a9 100644 --- a/src/mongo/db/fts/SConscript +++ b/src/mongo/db/fts/SConscript @@ -29,6 +29,7 @@ env.Library('base', [ 'fts_matcher.cpp', 'fts_query.cpp', 'fts_spec.cpp', + 'fts_spec_legacy.cpp', 'fts_language.cpp', 'fts_util.cpp', 'stemmer.cpp', diff --git a/src/mongo/db/fts/fts_language.cpp b/src/mongo/db/fts/fts_language.cpp index 61bd33cf93f..21474038f06 100644 --- a/src/mongo/db/fts/fts_language.cpp +++ b/src/mongo/db/fts/fts_language.cpp @@ -34,6 +34,7 @@ #include "mongo/base/init.h" #include "mongo/util/assert_util.h" +#include "mongo/util/mongoutils/str.h" #include "mongo/util/string_map.h" #include "mongo/util/stringutils.h" @@ -43,110 +44,219 @@ namespace mongo { namespace { - // Supported languages in canonical form (English names, lowercased). Includes "none". - const string LanguageNone( "none" ); - const string LanguageDanish( "danish" ); - const string LanguageDutch( "dutch" ); - const string LanguageEnglish( "english" ); - const string LanguageFinnish( "finnish" ); - const string LanguageFrench( "french" ); - const string LanguageGerman( "german" ); - const string LanguageHungarian( "hungarian" ); - const string LanguageItalian( "italian" ); - const string LanguageNorwegian( "norwegian" ); - const string LanguagePortuguese( "portuguese" ); - const string LanguageRomanian( "romanian" ); - const string LanguageRussian( "russian" ); - const string LanguageSpanish( "spanish" ); - const string LanguageSwedish( "swedish" ); - const string LanguageTurkish( "turkish" ); - - // Map from lowercased user string to language string. Resolves any language aliases - // (two-letter codes). - typedef StringMap<std::string> LanguageMap; - LanguageMap languageMap; - } + /** + * Case-insensitive StringData comparator. + */ + struct LanguageStringCompare { + /** Returns true if lhs < rhs. */ + bool operator()( const StringData& lhs, const StringData& rhs ) const { + size_t minSize = std::min( lhs.size(), rhs.size() ); - MONGO_INITIALIZER( FTSLanguageMap )( InitializerContext* context ) { - languageMap[LanguageNone] = LanguageNone; - - languageMap["da"] = LanguageDanish; - languageMap[LanguageDanish] = LanguageDanish; - languageMap["nl"] = LanguageDutch; - languageMap[LanguageDutch] = LanguageDutch; - languageMap["en"] = LanguageEnglish; - languageMap[LanguageEnglish] = LanguageEnglish; - languageMap["fi"] = LanguageFinnish; - languageMap[LanguageFinnish] = LanguageFinnish; - languageMap["fr"] = LanguageFrench; - languageMap[LanguageFrench] = LanguageFrench; - languageMap["de"] = LanguageGerman; - languageMap[LanguageGerman] = LanguageGerman; - languageMap["hu"] = LanguageHungarian; - languageMap[LanguageHungarian] = LanguageHungarian; - languageMap["it"] = LanguageItalian; - languageMap[LanguageItalian] = LanguageItalian; - languageMap["nb"] = LanguageNorwegian; - languageMap[LanguageNorwegian] = LanguageNorwegian; - languageMap["pt"] = LanguagePortuguese; - languageMap[LanguagePortuguese] = LanguagePortuguese; - languageMap["ro"] = LanguageRomanian; - languageMap[LanguageRomanian] = LanguageRomanian; - languageMap["ru"] = LanguageRussian; - languageMap[LanguageRussian] = LanguageRussian; - languageMap["es"] = LanguageSpanish; - languageMap[LanguageSpanish] = LanguageSpanish; - languageMap["sv"] = LanguageSwedish; - languageMap[LanguageSwedish] = LanguageSwedish; - languageMap["tr"] = LanguageTurkish; - languageMap[LanguageTurkish] = LanguageTurkish; - return Status::OK(); - } + for ( size_t x = 0; x < minSize; x++ ) { + char a = tolower( lhs[x] ); + char b = tolower( rhs[x] ); + if ( a < b ) { + return true; + } + if ( a > b ) { + return false; + } + } - FTSLanguage::FTSLanguage() - : _lang() { - } + return lhs.size() < rhs.size(); + } + }; - FTSLanguage::FTSLanguage( const FTSLanguage& other ) - : _lang( other._lang ) { - } + // Lookup table from user language string (case-insensitive) to FTSLanguage. Populated + // by initializers in group FTSAllLanguagesRegistered and initializer + // FTSRegisterLanguageAliases. For use with TEXT_INDEX_VERSION_2 text indexes only. + typedef std::map<StringData, const FTSLanguage*, LanguageStringCompare> LanguageMapV2; + LanguageMapV2 languageMapV2; - FTSLanguage& FTSLanguage::operator=( const FTSLanguage& other ) { - _lang = other._lang; - return *this; + // Like languageMapV2, but for use with TEXT_INDEX_VERSION_1 text indexes. + // Case-sensitive by lookup key. + typedef std::map<StringData, const FTSLanguage*> LanguageMapV1; + LanguageMapV1 languageMapV1; } - FTSLanguage::~FTSLanguage() { - } + MONGO_INITIALIZER_GROUP( FTSAllLanguagesRegistered, MONGO_NO_PREREQUISITES, + MONGO_NO_DEPENDENTS ); - Status FTSLanguage::init( const std::string& lang ) { - // Lowercase. - std::string langLower = tolowerString( lang ); + // + // Register supported languages' canonical names for TEXT_INDEX_VERSION_2. + // - // Resolve language aliases. - LanguageMap::const_iterator it = languageMap.find( langLower ); - if ( it == languageMap.end() ) { - return Status( ErrorCodes::BadValue, - "unsupported language: \"" + lang + "\"" ); - } + MONGO_FTS_LANGUAGE_DECLARE( languageNoneV2, "none", TEXT_INDEX_VERSION_2 ); + MONGO_FTS_LANGUAGE_DECLARE( languageDanishV2, "danish", TEXT_INDEX_VERSION_2 ); + MONGO_FTS_LANGUAGE_DECLARE( languageDutchV2, "dutch", TEXT_INDEX_VERSION_2 ); + MONGO_FTS_LANGUAGE_DECLARE( languageEnglishV2, "english", TEXT_INDEX_VERSION_2 ); + MONGO_FTS_LANGUAGE_DECLARE( languageFinnishV2, "finnish", TEXT_INDEX_VERSION_2 ); + MONGO_FTS_LANGUAGE_DECLARE( languageFrenchV2, "french", TEXT_INDEX_VERSION_2 ); + MONGO_FTS_LANGUAGE_DECLARE( languageGermanV2, "german", TEXT_INDEX_VERSION_2 ); + MONGO_FTS_LANGUAGE_DECLARE( languageHungarianV2, "hungarian", TEXT_INDEX_VERSION_2 ); + MONGO_FTS_LANGUAGE_DECLARE( languageItalianV2, "italian", TEXT_INDEX_VERSION_2 ); + MONGO_FTS_LANGUAGE_DECLARE( languageNorwegianV2, "norwegian", TEXT_INDEX_VERSION_2 ); + MONGO_FTS_LANGUAGE_DECLARE( languagePortugueseV2, "portuguese", TEXT_INDEX_VERSION_2 ); + MONGO_FTS_LANGUAGE_DECLARE( languageRomanianV2, "romanian", TEXT_INDEX_VERSION_2 ); + MONGO_FTS_LANGUAGE_DECLARE( languageRussianV2, "russian", TEXT_INDEX_VERSION_2 ); + MONGO_FTS_LANGUAGE_DECLARE( languageSpanishV2, "spanish", TEXT_INDEX_VERSION_2 ); + MONGO_FTS_LANGUAGE_DECLARE( languageSwedishV2, "swedish", TEXT_INDEX_VERSION_2 ); + MONGO_FTS_LANGUAGE_DECLARE( languageTurkishV2, "turkish", TEXT_INDEX_VERSION_2 ); + + // + // Register all Snowball language modules for TEXT_INDEX_VERSION_1. Note that only the full + // names are recognized by the StopWords class (as such, the language string "dan" in + // TEXT_INDEX_VERSION_1 will generate the Danish stemmer and the empty stopword list). + // + + MONGO_FTS_LANGUAGE_DECLARE( languageNoneV1, "none", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageDaV1, "da", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageDanV1, "dan", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageDanishV1, "danish", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageDeV1, "de", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageDeuV1, "deu", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageDutV1, "dut", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageDutchV1, "dutch", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageEnV1, "en", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageEngV1, "eng", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageEnglishV1, "english", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageEsV1, "es", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageEslV1, "esl", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageFiV1, "fi", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageFinV1, "fin", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageFinnishV1, "finnish", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageFrV1, "fr", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageFraV1, "fra", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageFreV1, "fre", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageFrenchV1, "french", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageGerV1, "ger", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageGermanV1, "german", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageHuV1, "hu", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageHunV1, "hun", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageHungarianV1, "hungarian", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageItV1, "it", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageItaV1, "ita", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageItalianV1, "italian", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageNlV1, "nl", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageNldV1, "nld", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageNoV1, "no", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageNorV1, "nor", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageNorwegianV1, "norwegian", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languagePorV1, "por", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languagePorterV1, "porter", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languagePortugueseV1, "portuguese", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languagePtV1, "pt", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageRoV1, "ro", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageRomanianV1, "romanian", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageRonV1, "ron", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageRuV1, "ru", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageRumV1, "rum", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageRusV1, "rus", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageRussianV1, "russian", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageSpaV1, "spa", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageSpanishV1, "spanish", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageSvV1, "sv", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageSweV1, "swe", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageSwedishV1, "swedish", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageTrV1, "tr", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageTurV1, "tur", TEXT_INDEX_VERSION_1 ); + MONGO_FTS_LANGUAGE_DECLARE( languageTurkishV1, "turkish", TEXT_INDEX_VERSION_1 ); - _lang = StringData( it->second ); + MONGO_INITIALIZER_WITH_PREREQUISITES( FTSRegisterLanguageAliases, + ( "FTSAllLanguagesRegistered" ) ) + ( InitializerContext* context ) { + // Register language aliases for TEXT_INDEX_VERSION_2. + FTSLanguage::registerLanguageAlias( &languageDanishV2, "da", TEXT_INDEX_VERSION_2 ); + FTSLanguage::registerLanguageAlias( &languageDutchV2, "nl", TEXT_INDEX_VERSION_2 ); + FTSLanguage::registerLanguageAlias( &languageEnglishV2, "en", TEXT_INDEX_VERSION_2 ); + FTSLanguage::registerLanguageAlias( &languageFinnishV2, "fi", TEXT_INDEX_VERSION_2 ); + FTSLanguage::registerLanguageAlias( &languageFrenchV2, "fr", TEXT_INDEX_VERSION_2 ); + FTSLanguage::registerLanguageAlias( &languageGermanV2, "de", TEXT_INDEX_VERSION_2 ); + FTSLanguage::registerLanguageAlias( &languageHungarianV2, "hu", TEXT_INDEX_VERSION_2 ); + FTSLanguage::registerLanguageAlias( &languageItalianV2, "it", TEXT_INDEX_VERSION_2 ); + FTSLanguage::registerLanguageAlias( &languageNorwegianV2, "nb", TEXT_INDEX_VERSION_2 ); + FTSLanguage::registerLanguageAlias( &languagePortugueseV2, "pt", TEXT_INDEX_VERSION_2 ); + FTSLanguage::registerLanguageAlias( &languageRomanianV2, "ro", TEXT_INDEX_VERSION_2 ); + FTSLanguage::registerLanguageAlias( &languageRussianV2, "ru", TEXT_INDEX_VERSION_2 ); + FTSLanguage::registerLanguageAlias( &languageSpanishV2, "es", TEXT_INDEX_VERSION_2 ); + FTSLanguage::registerLanguageAlias( &languageSwedishV2, "sv", TEXT_INDEX_VERSION_2 ); + FTSLanguage::registerLanguageAlias( &languageTurkishV2, "tr", TEXT_INDEX_VERSION_2 ); return Status::OK(); } - std::string FTSLanguage::str() const { - verify( !_lang.empty() ); - return _lang.toString(); + // static + void FTSLanguage::registerLanguage( const StringData& languageName, + TextIndexVersion textIndexVersion, + FTSLanguage* language ) { + verify( !languageName.empty() ); + language->_canonicalName = languageName.toString(); + switch ( textIndexVersion ) { + case TEXT_INDEX_VERSION_2: + verify( languageMapV2.find( languageName ) == languageMapV2.end() ); + languageMapV2[ languageName ] = language; + return; + case TEXT_INDEX_VERSION_1: + verify( languageMapV1.find( languageName ) == languageMapV1.end() ); + languageMapV1[ languageName ] = language; + return; + } + verify( false ); } - StatusWithFTSLanguage FTSLanguage::makeFTSLanguage( const std::string& lang ) { - FTSLanguage language; - Status s = language.init( lang ); - if ( !s.isOK() ) { - return StatusWithFTSLanguage( s ); + // static + void FTSLanguage::registerLanguageAlias( const FTSLanguage* language, + const StringData& alias, + TextIndexVersion textIndexVersion ) { + switch ( textIndexVersion ) { + case TEXT_INDEX_VERSION_2: + verify( languageMapV2.find( alias ) == languageMapV2.end() ); + languageMapV2[ alias ] = language; + return; + case TEXT_INDEX_VERSION_1: + verify( languageMapV1.find( alias ) == languageMapV1.end() ); + languageMapV1[ alias ] = language; + return; } - return StatusWithFTSLanguage( language ); + verify( false ); + } + + FTSLanguage::FTSLanguage() : _canonicalName() { + } + + const std::string& FTSLanguage::str() const { + verify( !_canonicalName.empty() ); + return _canonicalName; } + // static + StatusWithFTSLanguage FTSLanguage::make( const StringData& langName, + TextIndexVersion textIndexVersion ) { + switch ( textIndexVersion ) { + case TEXT_INDEX_VERSION_2: { + LanguageMapV2::const_iterator it = languageMapV2.find( langName ); + if ( it == languageMapV2.end() ) { + // TEXT_INDEX_VERSION_2 rejects unrecognized language strings. + Status status = Status( ErrorCodes::BadValue, + mongoutils::str::stream() << + "unsupported language: \"" << langName << + "\"" ); + return StatusWithFTSLanguage( status ); + } + + return StatusWithFTSLanguage( it->second ); + } + case TEXT_INDEX_VERSION_1: { + LanguageMapV1::const_iterator it = languageMapV1.find( langName ); + if ( it == languageMapV1.end() ) { + // TEXT_INDEX_VERSION_1 treats unrecognized language strings as "none". + return StatusWithFTSLanguage( &languageNoneV1 ); + } + return StatusWithFTSLanguage( it->second ); + } + } + + verify( false ); + return StatusWithFTSLanguage( Status::OK() ); + } } } diff --git a/src/mongo/db/fts/fts_language.h b/src/mongo/db/fts/fts_language.h index 9a1d7053ea8..3a7a471bc32 100644 --- a/src/mongo/db/fts/fts_language.h +++ b/src/mongo/db/fts/fts_language.h @@ -30,6 +30,7 @@ #pragma once +#include "mongo/db/fts/fts_util.h" #include "mongo/base/status_with.h" #include <string> @@ -38,55 +39,90 @@ namespace mongo { namespace fts { + #define MONGO_FTS_LANGUAGE_DECLARE( language, name, version ) \ + FTSLanguage language; \ + MONGO_INITIALIZER_GENERAL( language, MONGO_NO_PREREQUISITES, \ + ( "FTSAllLanguagesRegistered" ) ) \ + ( ::mongo::InitializerContext* context ) { \ + FTSLanguage::registerLanguage( name, version, &language ); \ + return Status::OK(); \ + } + /** - * A FTSLanguage is a copyable glorified enum representing a language for a text-indexed - * document or a text search. Example of suggested usage: + * A FTSLanguage represents a language for a text-indexed document or a text search. + * FTSLanguage objects are not copyable. + * + * Recommended usage: * - * StatusWithFTSLanguage swl = FTSLanguage::makeFTSLanguage( "en" ); + * StatusWithFTSLanguage swl = FTSLanguage::make( "en", TEXT_INDEX_VERSION_2 ); * if ( !swl.getStatus().isOK() ) { * // Error. * } * else { - * const FTSLanguage language = swl.getValue(); + * const FTSLanguage* language = swl.getValue(); * // Use language. * } */ class FTSLanguage { + // Use make() instead of copying. + MONGO_DISALLOW_COPYING( FTSLanguage ); public: /** Create an uninitialized language. */ FTSLanguage(); - ~FTSLanguage(); - FTSLanguage( const FTSLanguage& ); - FTSLanguage& operator=( const FTSLanguage & ); + /** + * Returns the language as a string in canonical form (lowercased English name). It is + * an error to call str() on an uninitialized language. + */ + const std::string& str() const; /** - * Initialize an FTSLanguage from a language string. Language strings are - * case-insensitive, and can be in one of the two following forms: - * - English name, like "spanish". - * - Two-letter code, like "es". - * Returns an error Status if an invalid language string is passed. + * Register string 'languageName' as a new language with text index version + * 'textIndexVersion'. Saves the resulting language to out-argument 'languageOut'. + * Subsequent calls to FTSLanguage::make() will recognize the newly-registered language + * string. */ - Status init( const std::string& lang ); + static void registerLanguage( const StringData& languageName, + TextIndexVersion textIndexVersion, + FTSLanguage *languageOut ); /** - * Returns the language as a string in canonical form (lowercased English name). It is - * an error to call str() on an uninitialized language. + * Register 'alias' as an alias for 'language' with text index version + * 'textIndexVersion'. Subsequent calls to FTSLanguage::make() will recognize the + * newly-registered alias. */ - std::string str() const; + static void registerLanguageAlias( const FTSLanguage* language, + const StringData& alias, + TextIndexVersion textIndexVersion ); /** - * Convenience method for creating an FTSLanguage out of a language string. Caller - * must check getStatus().isOK() on return value. + * Return the FTSLanguage associated with the given language string. Returns an error + * Status if an invalid language string is passed. + * + * For textIndexVersion=TEXT_INDEX_VERSION_2, language strings are + * case-insensitive, and need to be in one of the two following forms: + * - English name, like "spanish". + * - Two-letter code, like "es". + * + * For textIndexVersion=TEXT_INDEX_VERSION_1, no validation or normalization of + * language strings is performed. This is necessary to preserve indexing behavior for + * documents with language strings like "en": for compatibility, text data in these + * documents needs to be processed with the English stemmer and the empty stopword list + * (since "en" is recognized by Snowball but not the stopword processing logic). */ - static StatusWith<const FTSLanguage> makeFTSLanguage( const std::string& lang ); + static StatusWith<const FTSLanguage*> make( const StringData& langName, + TextIndexVersion textIndexVersion ); private: - // Pointer to string representation of language. Not owned here. - StringData _lang; + // String representation of language in canonical form. + std::string _canonicalName; }; - typedef StatusWith<const FTSLanguage> StatusWithFTSLanguage; + typedef StatusWith<const FTSLanguage*> StatusWithFTSLanguage; + + extern FTSLanguage languagePorterV1; + extern FTSLanguage languageEnglishV2; + extern FTSLanguage languageFrenchV2; } } diff --git a/src/mongo/db/fts/fts_language_test.cpp b/src/mongo/db/fts/fts_language_test.cpp index 5fdd9a4aa73..141bbce27d9 100644 --- a/src/mongo/db/fts/fts_language_test.cpp +++ b/src/mongo/db/fts/fts_language_test.cpp @@ -30,76 +30,107 @@ #include "mongo/pch.h" #include "mongo/db/fts/fts_language.h" +#include "mongo/db/fts/fts_spec.h" #include "mongo/unittest/unittest.h" namespace mongo { namespace fts { - // Positive tests for FTSLanguage::init() and FTSLanguage::str(). + // Positive tests for FTSLanguage::make() with TEXT_INDEX_VERSION_2. - TEST( FTSLanguage, ExactLanguage ) { - FTSLanguage lang; - Status s = lang.init( "spanish" ); - ASSERT( s.isOK() ); - ASSERT_EQUALS( lang.str(), "spanish" ); + TEST( FTSLanguageV2, ExactLanguage ) { + StatusWithFTSLanguage swl = FTSLanguage::make( "spanish", TEXT_INDEX_VERSION_2 ); + ASSERT( swl.getStatus().isOK() ); + ASSERT_EQUALS( swl.getValue()->str(), "spanish" ); } - TEST( FTSLanguage, ExactCode ) { - FTSLanguage lang; - Status s = lang.init( "es" ); - ASSERT( s.isOK() ); - ASSERT_EQUALS( lang.str(), "spanish" ); + TEST( FTSLanguageV2, ExactCode ) { + StatusWithFTSLanguage swl = FTSLanguage::make( "es", TEXT_INDEX_VERSION_2 ); + ASSERT( swl.getStatus().isOK() ); + ASSERT_EQUALS( swl.getValue()->str(), "spanish" ); } - TEST( FTSLanguage, UpperCaseLanguage ) { - FTSLanguage lang; - Status s = lang.init( "SPANISH" ); - ASSERT( s.isOK() ); - ASSERT_EQUALS( lang.str(), "spanish" ); + TEST( FTSLanguageV2, UpperCaseLanguage ) { + StatusWithFTSLanguage swl = FTSLanguage::make( "SPANISH", TEXT_INDEX_VERSION_2 ); + ASSERT( swl.getStatus().isOK() ); + ASSERT_EQUALS( swl.getValue()->str(), "spanish" ); } - TEST( FTSLanguage, UpperCaseCode ) { - FTSLanguage lang; - Status s = lang.init( "ES" ); - ASSERT( s.isOK() ); - ASSERT_EQUALS( lang.str(), "spanish" ); + TEST( FTSLanguageV2, UpperCaseCode ) { + StatusWithFTSLanguage swl = FTSLanguage::make( "ES", TEXT_INDEX_VERSION_2 ); + ASSERT( swl.getStatus().isOK() ); + ASSERT_EQUALS( swl.getValue()->str(), "spanish" ); } - TEST( FTSLanguage, NoneLanguage ) { - FTSLanguage lang; - Status s = lang.init( "none" ); - ASSERT( s.isOK() ); - ASSERT_EQUALS( lang.str(), "none" ); + TEST( FTSLanguageV2, NoneLanguage ) { + StatusWithFTSLanguage swl = FTSLanguage::make( "none", TEXT_INDEX_VERSION_2 ); + ASSERT( swl.getStatus().isOK() ); + ASSERT_EQUALS( swl.getValue()->str(), "none" ); } - // Negative tests for FTSLanguage::init() and FTSLanguage::str(). + // Negative tests for FTSLanguage::make() with TEXT_INDEX_VERSION_2. - TEST( FTSLanguage, Unknown ) { - FTSLanguage lang; - Status s = lang.init( "spanglish" ); - ASSERT( !s.isOK() ); + TEST( FTSLanguageV2, Unknown ) { + StatusWithFTSLanguage swl = FTSLanguage::make( "spanglish", TEXT_INDEX_VERSION_2 ); + ASSERT( !swl.getStatus().isOK() ); } - TEST( FTSLanguage, Empty ) { - FTSLanguage lang; - Status s = lang.init( "" ); - ASSERT( !s.isOK() ); + TEST( FTSLanguageV2, Empty ) { + StatusWithFTSLanguage swl = FTSLanguage::make( "", TEXT_INDEX_VERSION_2 ); + ASSERT( !swl.getStatus().isOK() ); } - // Positive tests for FTSLanguage::makeFTSLanguage(). + // Positive tests for FTSLanguage::make() with TEXT_INDEX_VERSION_1. - TEST( FTSLanguage, MakeFTSLanguage1 ) { - StatusWithFTSLanguage swl = FTSLanguage::makeFTSLanguage( "english" ); + TEST( FTSLanguageV1, ExactLanguage ) { + StatusWithFTSLanguage swl = FTSLanguage::make( "spanish", TEXT_INDEX_VERSION_1 ); ASSERT( swl.getStatus().isOK() ); - ASSERT_EQUALS( swl.getValue().str(), "english" ); + ASSERT_EQUALS( swl.getValue()->str(), "spanish" ); } - // Negative tests for FTSLanguage::makeFTSLanguage(). + TEST( FTSLanguageV1, DeprecatedLanguage ) { + StatusWithFTSLanguage swl = FTSLanguage::make( "porter", TEXT_INDEX_VERSION_1 ); + ASSERT( swl.getStatus().isOK() ); + ASSERT_EQUALS( swl.getValue()->str(), "porter" ); + } - TEST( FTSLanguage, MakeFTSLanguage2 ) { - StatusWithFTSLanguage swl = FTSLanguage::makeFTSLanguage( "onglish" ); - ASSERT( !swl.getStatus().isOK() ); + TEST( FTSLanguageV1, StemmerOnlyLanguage1 ) { + StatusWithFTSLanguage swl = FTSLanguage::make( "en", TEXT_INDEX_VERSION_1 ); + ASSERT( swl.getStatus().isOK() ); + ASSERT_EQUALS( swl.getValue()->str(), "en" ); + } + + TEST( FTSLanguageV1, StemmerOnlyLanguage2 ) { + StatusWithFTSLanguage swl = FTSLanguage::make( "eng", TEXT_INDEX_VERSION_1 ); + ASSERT( swl.getStatus().isOK() ); + ASSERT_EQUALS( swl.getValue()->str(), "eng" ); + } + + TEST( FTSLanguageV1, NoneLanguage ) { + StatusWithFTSLanguage swl = FTSLanguage::make( "none", TEXT_INDEX_VERSION_1 ); + ASSERT( swl.getStatus().isOK() ); + ASSERT_EQUALS( swl.getValue()->str(), "none" ); + } + + // Negative tests for FTSLanguage::make() with TEXT_INDEX_VERSION_1. + + TEST( FTSLanguageV1, CaseSensitive ) { + StatusWithFTSLanguage swl = FTSLanguage::make( "SPANISH", TEXT_INDEX_VERSION_1 ); + ASSERT( swl.getStatus().isOK() ); + ASSERT_EQUALS( swl.getValue()->str(), "none" ); + } + + TEST( FTSLanguageV1, Unknown ) { + StatusWithFTSLanguage swl = FTSLanguage::make( "asdf", TEXT_INDEX_VERSION_1 ); + ASSERT( swl.getStatus().isOK() ); + ASSERT_EQUALS( swl.getValue()->str(), "none" ); + } + + TEST( FTSLanguageV1, Empty ) { + StatusWithFTSLanguage swl = FTSLanguage::make( "", TEXT_INDEX_VERSION_1 ); + ASSERT( swl.getStatus().isOK() ); + ASSERT_EQUALS( swl.getValue()->str(), "none" ); } } diff --git a/src/mongo/db/fts/fts_query.cpp b/src/mongo/db/fts/fts_query.cpp index 2f5e215ce64..bab78397614 100644 --- a/src/mongo/db/fts/fts_query.cpp +++ b/src/mongo/db/fts/fts_query.cpp @@ -31,6 +31,7 @@ #include "mongo/pch.h" #include "mongo/db/fts/fts_query.h" +#include "mongo/db/fts/fts_spec.h" #include "mongo/db/fts/tokenizer.h" #include "mongo/util/mongoutils/str.h" #include "mongo/util/stringutils.h" @@ -41,22 +42,23 @@ namespace mongo { using namespace mongoutils; - Status FTSQuery::parse(const string& query, const string& language) { + Status FTSQuery::parse(const string& query, const StringData& language) { _search = query; - Status status = _language.init( language ); - if ( !status.isOK() ) { - return status; + StatusWithFTSLanguage swl = FTSLanguage::make( language, TEXT_INDEX_VERSION_2 ); + if ( !swl.getStatus().isOK() ) { + return swl.getStatus(); } + _language = swl.getValue(); - const StopWords* stopWords = StopWords::getStopWords( _language ); - Stemmer stemmer( _language ); + const StopWords* stopWords = StopWords::getStopWords( *_language ); + Stemmer stemmer( *_language ); bool inNegation = false; bool inPhrase = false; unsigned quoteOffset = 0; - Tokenizer i( _language, query ); + Tokenizer i( *_language, query ); while ( i.more() ) { Token t = i.next(); diff --git a/src/mongo/db/fts/fts_query.h b/src/mongo/db/fts/fts_query.h index 4eec8d404c8..e7ee925b5ca 100644 --- a/src/mongo/db/fts/fts_query.h +++ b/src/mongo/db/fts/fts_query.h @@ -50,7 +50,7 @@ namespace mongo { class FTSQuery { public: - Status parse(const string& query, const string& language); + Status parse(const string& query, const StringData& language); const vector<string>& getTerms() const { return _terms; } const unordered_set<string>& getNegatedTerms() const { return _negatedTerms; } @@ -69,7 +69,7 @@ namespace mongo { } string getSearch() const { return _search; } - const FTSLanguage getLanguage() const { return _language; } + const FTSLanguage& getLanguage() const { return *_language; } string toString() const; @@ -77,7 +77,7 @@ namespace mongo { protected: string _search; - FTSLanguage _language; + const FTSLanguage* _language; vector<string> _terms; unordered_set<string> _negatedTerms; vector<string> _phrases; diff --git a/src/mongo/db/fts/fts_spec.cpp b/src/mongo/db/fts/fts_spec.cpp index be74ae79bf1..fc692b097d6 100644 --- a/src/mongo/db/fts/fts_spec.cpp +++ b/src/mongo/db/fts/fts_spec.cpp @@ -30,8 +30,9 @@ #include "mongo/pch.h" -#include "mongo/db/field_ref.h" #include "mongo/db/fts/fts_spec.h" + +#include "mongo/db/field_ref.h" #include "mongo/db/fts/fts_util.h" #include "mongo/util/mongoutils/str.h" #include "mongo/util/stringutils.h" @@ -61,14 +62,37 @@ namespace mongo { } FTSSpec::FTSSpec( const BSONObj& indexInfo ) { + // indexInfo is a text index spec. Text index specs pass through fixSpec() before + // being saved to the system.indexes collection. fixSpec() enforces a schema, such that + // required fields must exist and be of the correct type (e.g. weights, + // textIndexVersion). massert( 16739, "found invalid spec for text index", indexInfo["weights"].isABSONObj() ); - - Status status = _defaultLanguage.init( indexInfo["default_language"].String() ); - verify( status.isOK() ); + BSONElement textIndexVersionElt = indexInfo["textIndexVersion"]; + massert( 17367, + "found invalid spec for text index, expected number for textIndexVersion", + textIndexVersionElt.isNumber() ); + + // We currently support TEXT_INDEX_VERSION_1 (deprecated) and TEXT_INDEX_VERSION_2. + // Reject all other values. + massert( 17364, + str::stream() << "attempt to use unsupported textIndexVersion " << + textIndexVersionElt.numberInt() << "; versions supported: " << + TEXT_INDEX_VERSION_2 << ", " << TEXT_INDEX_VERSION_1, + textIndexVersionElt.numberInt() == TEXT_INDEX_VERSION_2 || + textIndexVersionElt.numberInt() == TEXT_INDEX_VERSION_1 ); + + _textIndexVersion = ( textIndexVersionElt.numberInt() == TEXT_INDEX_VERSION_2 ) ? + TEXT_INDEX_VERSION_2 : TEXT_INDEX_VERSION_1; + + // Initialize _defaultLanguage. Note that the FTSLanguage constructor requires + // textIndexVersion, since language parsing is version-specific. + StatusWithFTSLanguage swl = + FTSLanguage::make( indexInfo["default_language"].String(), _textIndexVersion ); + verify( swl.getStatus().isOK() ); // should not fail, since validated by fixSpec(). + _defaultLanguage = swl.getValue(); _languageOverrideField = indexInfo["language_override"].valuestrsafe(); - verify( validateOverride( _languageOverrideField ) ); _wildcard = false; @@ -116,8 +140,8 @@ namespace mongo { } } - const FTSLanguage FTSSpec::getLanguageToUse( const BSONObj& userDoc, - const FTSLanguage currentLanguage ) const { + const FTSLanguage& FTSSpec::_getLanguageToUseV2( const BSONObj& userDoc, + const FTSLanguage& currentLanguage ) const { BSONElement e = userDoc[_languageOverrideField]; if ( e.eoo() ) { return currentLanguage; @@ -125,11 +149,11 @@ namespace mongo { uassert( 17261, "found language override field in document with non-string type", e.type() == mongo::String ); - StatusWithFTSLanguage swl = FTSLanguage::makeFTSLanguage( e.String() ); + StatusWithFTSLanguage swl = FTSLanguage::make( e.String(), TEXT_INDEX_VERSION_2 ); uassert( 17262, "language override unsupported: " + e.String(), swl.getStatus().isOK() ); - return swl.getValue(); + return *swl.getValue(); } @@ -147,11 +171,18 @@ namespace mongo { } void FTSSpec::scoreDocument( const BSONObj& obj, - const FTSLanguage parentLanguage, + const FTSLanguage& parentLanguage, const string& parentPath, bool isArray, TermFrequencyMap* term_freqs ) const { - const FTSLanguage language = getLanguageToUse( obj, parentLanguage ); + + if ( _textIndexVersion == TEXT_INDEX_VERSION_1 ) { + dassert( parentPath == "" ); + dassert( !isArray ); + return _scoreDocumentV1( obj, term_freqs ); + } + + const FTSLanguage& language = _getLanguageToUseV2( obj, parentLanguage ); Stemmer stemmer( language ); Tools tools( language, &stemmer, StopWords::getStopWords( language ) ); @@ -209,7 +240,7 @@ namespace mongo { case String: // Only index strings on exact match or wildcard. if ( exactMatch || wildcard() ) { - _scoreString( tools, elem.valuestr(), term_freqs, weight ); + _scoreStringV2( tools, elem.valuestr(), term_freqs, weight ); } break; case Object: @@ -233,22 +264,10 @@ namespace mongo { } } - namespace { - struct ScoreHelperStruct { - ScoreHelperStruct() - : freq(0), count(0), exp(0){ - } - double freq; - double count; - double exp; - }; - typedef unordered_map<string,ScoreHelperStruct> ScoreHelperMap; - } - - void FTSSpec::_scoreString( const Tools& tools, - const StringData& raw, - TermFrequencyMap* docScores, - double weight ) const { + void FTSSpec::_scoreStringV2( const Tools& tools, + const StringData& raw, + TermFrequencyMap* docScores, + double weight ) const { ScoreHelperMap terms; @@ -335,6 +354,10 @@ namespace mongo { } BSONObj FTSSpec::fixSpec( const BSONObj& spec ) { + if ( spec["textIndexVersion"].numberInt() == TEXT_INDEX_VERSION_1 ) { + return _fixSpecV1( spec ); + } + map<string,int> m; BSONObj keyPattern; @@ -477,7 +500,8 @@ namespace mongo { } uassert( 17264, "default_language is not valid", - FTSLanguage::makeFTSLanguage( default_language ).getStatus().isOK() ); + FTSLanguage::make( default_language, + TEXT_INDEX_VERSION_2 ).getStatus().isOK() ); BSONElement language_override_elt = spec["language_override"]; string language_override( language_override_elt.str() ); @@ -492,7 +516,7 @@ namespace mongo { } int version = -1; - int textIndexVersion = 2; + int textIndexVersion = TEXT_INDEX_VERSION_2; BSONObjBuilder b; BSONObjIterator i( spec ); @@ -523,7 +547,7 @@ namespace mongo { textIndexVersion = e.numberInt(); uassert( 16730, str::stream() << "bad textIndexVersion: " << textIndexVersion, - textIndexVersion == 2 ); + textIndexVersion == TEXT_INDEX_VERSION_2 ); } else { b.append( e ); diff --git a/src/mongo/db/fts/fts_spec.h b/src/mongo/db/fts/fts_spec.h index 258ecf7407a..570303f181b 100644 --- a/src/mongo/db/fts/fts_spec.h +++ b/src/mongo/db/fts/fts_spec.h @@ -46,23 +46,33 @@ namespace mongo { namespace fts { extern const double MAX_WEIGHT; + extern const double MAX_WORD_WEIGHT; typedef std::map<string,double> Weights; // TODO cool map typedef unordered_map<string,double> TermFrequencyMap; + struct ScoreHelperStruct { + ScoreHelperStruct() + : freq(0), count(0), exp(0){ + } + double freq; + double count; + double exp; + }; + typedef unordered_map<string,ScoreHelperStruct> ScoreHelperMap; class FTSSpec { struct Tools { - Tools( const FTSLanguage _language, + Tools( const FTSLanguage& _language, const Stemmer* _stemmer, const StopWords* _stopwords ) : language( _language ) , stemmer( _stemmer ) , stopwords( _stopwords ) {} - const FTSLanguage language; + const FTSLanguage& language; const Stemmer* stemmer; const StopWords* stopwords; }; @@ -71,7 +81,7 @@ namespace mongo { FTSSpec( const BSONObj& indexInfo ); bool wildcard() const { return _wildcard; } - const FTSLanguage defaultLanguage() const { return _defaultLanguage; } + const FTSLanguage& defaultLanguage() const { return *_defaultLanguage; } const string& languageOverrideField() const { return _languageOverrideField; } size_t numExtraBefore() const { return _extraBefore.size(); } @@ -89,7 +99,7 @@ namespace mongo { * - "term_freqs": out-parameter to store results */ void scoreDocument( const BSONObj& obj, - const FTSLanguage parentLanguage, + const FTSLanguage& parentLanguage, const string& parentPath, bool isArray, TermFrequencyMap* term_freqs ) const; @@ -102,20 +112,56 @@ namespace mongo { const Weights& weights() const { return _weights; } static BSONObj fixSpec( const BSONObj& spec ); + private: + // + // Helper methods. Invoked for TEXT_INDEX_VERSION_2 spec objects only. + // + /** * Get the language override for the given BSON doc. If no language override is * specified, returns currentLanguage. */ - const FTSLanguage getLanguageToUse( const BSONObj& userDoc, - const FTSLanguage currentLanguage ) const; + const FTSLanguage& _getLanguageToUseV2( const BSONObj& userDoc, + const FTSLanguage& currentLanguage ) const; + + /** + * Calculate the term scores for 'raw' and update 'term_freqs' with the result. Parses + * 'raw' using 'tools', and weights term scores based on 'weight'. + */ + void _scoreStringV2( const Tools& tools, + const StringData& raw, + TermFrequencyMap* term_freqs, + double weight ) const; + + // + // Deprecated helper methods. Invoked for TEXT_INDEX_VERSION_1 spec objects only. + // + + void _scoreStringV1( const Tools& tools, + const StringData& raw, + TermFrequencyMap* docScores, + double weight ) const; + + bool _weightV1( const StringData& field, double* out ) const; + + void _scoreRecurseV1( const Tools& tools, + const BSONObj& obj, + TermFrequencyMap* term_freqs ) const; + + void _scoreDocumentV1( const BSONObj& obj, TermFrequencyMap* term_freqs ) const; + + const FTSLanguage& _getLanguageToUseV1( const BSONObj& userDoc ) const; + + static BSONObj _fixSpecV1( const BSONObj& spec ); + + // + // Instance variables. + // - void _scoreString( const Tools& tools, - const StringData& raw, - TermFrequencyMap* term_freqs, - double weight ) const; + TextIndexVersion _textIndexVersion; - FTSLanguage _defaultLanguage; + const FTSLanguage* _defaultLanguage; string _languageOverrideField; bool _wildcard; diff --git a/src/mongo/db/fts/fts_spec_legacy.cpp b/src/mongo/db/fts/fts_spec_legacy.cpp new file mode 100644 index 00000000000..556cac1e091 --- /dev/null +++ b/src/mongo/db/fts/fts_spec_legacy.cpp @@ -0,0 +1,320 @@ +/** + * Copyright (C) 2014 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/fts/fts_spec.h" + +#include "mongo/util/mongoutils/str.h" + +namespace mongo { + + namespace fts { + + // + // This file contains functionality specific to indexing documents from TEXT_INDEX_VERSION_1 + // text indexes. + // + + using namespace mongoutils; + + namespace { + void _addFTSStuff( BSONObjBuilder* b ) { + b->append( "_fts", INDEX_NAME ); + b->append( "_ftsx", 1 ); + } + } + + const FTSLanguage& FTSSpec::_getLanguageToUseV1( const BSONObj& userDoc ) const { + BSONElement e = userDoc[_languageOverrideField]; + if ( e.type() == String ) { + const char * x = e.valuestrsafe(); + if ( strlen( x ) > 0 ) { + StatusWithFTSLanguage swl = FTSLanguage::make( x, TEXT_INDEX_VERSION_1 ); + dassert( swl.isOK() ); // make() w/ TEXT_INDEX_VERSION_1 guaranteed to not fail. + return *swl.getValue(); + } + } + return *_defaultLanguage; + } + + void FTSSpec::_scoreStringV1( const Tools& tools, + const StringData& raw, + TermFrequencyMap* docScores, + double weight ) const { + + ScoreHelperMap terms; + + unsigned numTokens = 0; + + Tokenizer i( tools.language, raw ); + while ( i.more() ) { + Token t = i.next(); + if ( t.type != Token::TEXT ) + continue; + + string term = t.data.toString(); + makeLower( &term ); + if ( tools.stopwords->isStopWord( term ) ) + continue; + term = tools.stemmer->stem( term ); + + ScoreHelperStruct& data = terms[term]; + + if ( data.exp ) + data.exp *= 2; + else + data.exp = 1; + data.count += 1; + data.freq += ( 1 / data.exp ); + + numTokens++; + } + + for ( ScoreHelperMap::const_iterator i = terms.begin(); i != terms.end(); ++i ) { + + const string& term = i->first; + const ScoreHelperStruct& data = i->second; + + // in order to adjust weights as a function of term count as it + // relates to total field length. ie. is this the only word or + // a frequently occuring term? or does it only show up once in + // a long block of text? + + double coeff = ( 0.5 * data.count / numTokens ) + 0.5; + + // if term is identical to the raw form of the + // field (untokenized) give it a small boost. + double adjustment = 1; + if ( raw.size() == term.length() && raw.equalCaseInsensitive( term ) ) + adjustment += 0.1; + + double& score = (*docScores)[term]; + score += ( weight * data.freq * coeff * adjustment ); + verify( score <= MAX_WEIGHT ); + } + } + + bool FTSSpec::_weightV1( const StringData& field, double* out ) const { + Weights::const_iterator i = _weights.find( field.toString() ); + if ( i == _weights.end() ) + return false; + *out = i->second; + return true; + } + + /* + * Recurses over all fields of an obj (document in collection) + * and fills term,score map term_freqs + * @param tokenizer, tokenizer to tokenize a string into terms + * @param obj, object being parsed + * term_freqs, map <term,score> to be filled up + */ + void FTSSpec::_scoreRecurseV1( const Tools& tools, + const BSONObj& obj, + TermFrequencyMap* term_freqs ) const { + BSONObjIterator j( obj ); + while ( j.more() ) { + BSONElement x = j.next(); + + if ( languageOverrideField() == x.fieldName() ) + continue; + + if (x.type() == String) { + double w = 1; + _weightV1( x.fieldName(), &w ); + _scoreStringV1(tools, x.valuestr(), term_freqs, w); + } + else if ( x.isABSONObj() ) { + _scoreRecurseV1( tools, x.Obj(), term_freqs); + } + + } + } + + void FTSSpec::_scoreDocumentV1( const BSONObj& obj, + TermFrequencyMap* term_freqs ) const { + + const FTSLanguage& language = _getLanguageToUseV1( obj ); + + Stemmer stemmer(language); + Tools tools(language, &stemmer, StopWords::getStopWords( language )); + + if ( wildcard() ) { + // if * is specified for weight, we can recurse over all fields. + _scoreRecurseV1(tools, obj, term_freqs); + return; + } + + // otherwise, we need to remember the different weights for each field + // and act accordingly (in other words, call _score) + for ( Weights::const_iterator i = _weights.begin(); i != _weights.end(); i++ ) { + const char * leftOverName = i->first.c_str(); + // name of field + BSONElement e = obj.getFieldDottedOrArray(leftOverName); + // weight associated to name of field + double weight = i->second; + + if ( e.eoo() ) { + // do nothing + } + else if ( e.type() == Array ) { + BSONObjIterator j( e.Obj() ); + while ( j.more() ) { + BSONElement x = j.next(); + if ( leftOverName[0] && x.isABSONObj() ) + x = x.Obj().getFieldDotted( leftOverName ); + if ( x.type() == String ) + _scoreStringV1( tools, x.valuestr(), term_freqs, weight ); + } + } + else if ( e.type() == String ) { + _scoreStringV1( tools, e.valuestr(), term_freqs, weight ); + } + + } + } + + BSONObj FTSSpec::_fixSpecV1( const BSONObj& spec ) { + map<string,int> m; + + BSONObj keyPattern; + { + BSONObjBuilder b; + bool addedFtsStuff = false; + + BSONObjIterator i( spec["key"].Obj() ); + while ( i.more() ) { + BSONElement e = i.next(); + if ( str::equals( e.fieldName(), "_fts" ) || + str::equals( e.fieldName(), "_ftsx" ) ) { + addedFtsStuff = true; + b.append( e ); + } + else if ( e.type() == String && + ( str::equals( "fts", e.valuestr() ) || + str::equals( "text", e.valuestr() ) ) ) { + + if ( !addedFtsStuff ) { + _addFTSStuff( &b ); + addedFtsStuff = true; + } + + m[e.fieldName()] = 1; + } + else { + b.append( e ); + } + } + + if ( !addedFtsStuff ) + _addFTSStuff( &b ); + + keyPattern = b.obj(); + } + + if ( spec["weights"].isABSONObj() ) { + BSONObjIterator i( spec["weights"].Obj() ); + while ( i.more() ) { + BSONElement e = i.next(); + m[e.fieldName()] = e.numberInt(); + } + } + else if ( spec["weights"].str() == WILDCARD ) { + m[WILDCARD] = 1; + } + + BSONObj weights; + { + BSONObjBuilder b; + for ( map<string,int>::iterator i = m.begin(); i != m.end(); ++i ) { + uassert( 17365, "score for word too high", + i->second > 0 && i->second < MAX_WORD_WEIGHT ); + b.append( i->first, i->second ); + } + weights = b.obj(); + } + + string default_language(spec.getStringField("default_language")); + if ( default_language.empty() ) + default_language = "english"; + + string language_override(spec.getStringField("language_override")); + if ( language_override.empty() ) + language_override = "language"; + + int version = -1; + int textIndexVersion = 1; + + BSONObjBuilder b; + BSONObjIterator i( spec ); + while ( i.more() ) { + BSONElement e = i.next(); + if ( str::equals( e.fieldName(), "key" ) ) { + b.append( "key", keyPattern ); + } + else if ( str::equals( e.fieldName(), "weights" ) ) { + b.append( "weights", weights ); + weights = BSONObj(); + } + else if ( str::equals( e.fieldName(), "default_language" ) ) { + b.append( "default_language", default_language); + default_language = ""; + } + else if ( str::equals( e.fieldName(), "language_override" ) ) { + b.append( "language_override", language_override); + language_override = ""; + } + else if ( str::equals( e.fieldName(), "v" ) ) { + version = e.numberInt(); + } + else if ( str::equals( e.fieldName(), "textIndexVersion" ) ) { + textIndexVersion = e.numberInt(); + uassert( 17366, + str::stream() << "bad textIndexVersion: " << textIndexVersion, + textIndexVersion == 1 ); + } + else { + b.append( e ); + } + } + + if ( !weights.isEmpty() ) + b.append( "weights", weights ); + if ( !default_language.empty() ) + b.append( "default_language", default_language); + if ( !language_override.empty() ) + b.append( "language_override", language_override); + + if ( version >= 0 ) + b.append( "v", version ); + + b.append( "textIndexVersion", textIndexVersion ); + + return b.obj(); + } + } +} diff --git a/src/mongo/db/fts/fts_spec_test.cpp b/src/mongo/db/fts/fts_spec_test.cpp index 73c53758229..8323ecc0cea 100644 --- a/src/mongo/db/fts/fts_spec_test.cpp +++ b/src/mongo/db/fts/fts_spec_test.cpp @@ -160,6 +160,9 @@ namespace mongo { } TEST( FTSSpec, FixTextIndexVersion1 ) { + assertFixSuccess("{key: {a: 'text'}, textIndexVersion: 1.0}}"); + assertFixSuccess("{key: {a: 'text'}, textIndexVersion: NumberInt(1)}}"); + assertFixSuccess("{key: {a: 'text'}, textIndexVersion: NumberLong(1)}}"); assertFixSuccess("{key: {a: 'text'}, textIndexVersion: 2.0}}"); assertFixSuccess("{key: {a: 'text'}, textIndexVersion: NumberInt(2)}}"); assertFixSuccess("{key: {a: 'text'}, textIndexVersion: NumberLong(2)}}"); @@ -178,7 +181,7 @@ namespace mongo { TermFrequencyMap m; spec.scoreDocument( BSON( "title" << "cat sat run" ), - FTSLanguage::makeFTSLanguage( "english" ).getValue(), + spec.defaultLanguage(), "", false, &m ); @@ -197,7 +200,7 @@ namespace mongo { TermFrequencyMap m; spec.scoreDocument( BSON( "title" << "cat sat run" << "text" << "cat book" ), - FTSLanguage::makeFTSLanguage( "english" ).getValue(), + spec.defaultLanguage(), "", false, &m ); @@ -220,7 +223,7 @@ namespace mongo { TermFrequencyMap m; spec.scoreDocument( BSON( "a" << BSON( "b" << "term" ) ), - FTSLanguage::makeFTSLanguage( "english" ).getValue(), + spec.defaultLanguage(), "", false, &m ); @@ -236,7 +239,7 @@ namespace mongo { TermFrequencyMap m; spec.scoreDocument( BSON( "title" << "cat sat sat run run run" ), - FTSLanguage::makeFTSLanguage( "english" ).getValue(), + spec.defaultLanguage(), "", false, &m ); @@ -308,11 +311,7 @@ namespace mongo { // The following document matches {"a.b": {$type: 2}}, so "term" should be indexed. BSONObj obj = fromjson("{a: [{b: ['term']}]}"); // indirectly nested arrays TermFrequencyMap m; - spec.scoreDocument( obj, - FTSLanguage::makeFTSLanguage( "english" ).getValue(), - "", - false, - &m ); + spec.scoreDocument( obj, spec.defaultLanguage(), "", false, &m ); ASSERT_EQUALS( 1U, m.size() ); } @@ -323,11 +322,7 @@ namespace mongo { // The wildcard spec implies a full recursive traversal, so "term" should be indexed. BSONObj obj = fromjson("{a: {b: [['term']]}}"); // directly nested arrays TermFrequencyMap m; - spec.scoreDocument( obj, - FTSLanguage::makeFTSLanguage( "english" ).getValue(), - "", - false, - &m ); + spec.scoreDocument( obj, spec.defaultLanguage(), "", false, &m ); ASSERT_EQUALS( 1U, m.size() ); } @@ -339,11 +334,7 @@ namespace mongo { // indexed. BSONObj obj = fromjson("{a: {b: [['term']]}}"); // directly nested arrays TermFrequencyMap m; - spec.scoreDocument( obj, - FTSLanguage::makeFTSLanguage( "english" ).getValue(), - "", - false, - &m ); + spec.scoreDocument( obj, spec.defaultLanguage(), "", false, &m ); ASSERT_EQUALS( 0U, m.size() ); } @@ -362,11 +353,7 @@ namespace mongo { " }" " }" ); - spec.scoreDocument( obj, - FTSLanguage::makeFTSLanguage( "english" ).getValue(), - "", - false, - &tfm ); + spec.scoreDocument( obj, spec.defaultLanguage(), "", false, &tfm ); set<string> hits; hits.insert("walk"); @@ -397,11 +384,7 @@ namespace mongo { " }" "}" ); - spec.scoreDocument( obj, - FTSLanguage::makeFTSLanguage( "english" ).getValue(), - "", - false, - &tfm ); + spec.scoreDocument( obj, spec.defaultLanguage(), "", false, &tfm ); set<string> hits; hits.insert("foredrag"); @@ -432,11 +415,7 @@ namespace mongo { " } ]" "}" ); - spec.scoreDocument( obj, - FTSLanguage::makeFTSLanguage( "english" ).getValue(), - "", - false, - &tfm ); + spec.scoreDocument( obj, spec.defaultLanguage(), "", false, &tfm ); set<string> hits; hits.insert("foredrag"); @@ -469,11 +448,7 @@ namespace mongo { " }" "}" ); - spec.scoreDocument( obj, - FTSLanguage::makeFTSLanguage( "english" ).getValue(), - "", - false, - &tfm ); + spec.scoreDocument( obj, spec.defaultLanguage(), "", false, &tfm ); set<string> hits; hits.insert("foredrag"); @@ -506,11 +481,7 @@ namespace mongo { " }" "}" ); - spec.scoreDocument( obj, - FTSLanguage::makeFTSLanguage( "english" ).getValue(), - "", - false, - &tfm ); + spec.scoreDocument( obj, spec.defaultLanguage(), "", false, &tfm ); set<string> hits; hits.insert("foredrag"); @@ -545,11 +516,7 @@ namespace mongo { " }" "}" ); - spec.scoreDocument( obj, - FTSLanguage::makeFTSLanguage( "english" ).getValue(), - "", - false, - &tfm ); + spec.scoreDocument( obj, spec.defaultLanguage(), "", false, &tfm ); set<string> hits; hits.insert("foredrag"); @@ -564,6 +531,52 @@ namespace mongo { } + /** Test differences across textIndexVersion values in handling of nested arrays. */ + TEST( FTSSpec, TextIndexLegacyNestedArrays ) { + BSONObj obj = fromjson( "{a: [{b: ['hello']}]}" ); + + // textIndexVersion=1 FTSSpec objects do not index nested arrays. + { + BSONObj indexSpec = fromjson( "{key: {'a.b': 'text'}, textIndexVersion: 1}" ); + FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); + TermFrequencyMap tfm; + spec.scoreDocument( obj, spec.defaultLanguage(), "", false, &tfm ); + ASSERT_EQUALS( tfm.size(), 0U ); + } + + // textIndexVersion=2 FTSSpec objects do index nested arrays. + { + BSONObj indexSpec = fromjson( "{key: {'a.b': 'text'}, textIndexVersion: 2}" ); + FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); + TermFrequencyMap tfm; + spec.scoreDocument( obj, spec.defaultLanguage(), "", false, &tfm ); + ASSERT_EQUALS( tfm.size(), 1U ); + } + } + + /** Test differences across textIndexVersion values in handling of language annotations. */ + TEST( FTSSpec, TextIndexLegacyLanguageRecognition) { + BSONObj obj = fromjson( "{a: 'the', language: 'EN'}" ); + + // textIndexVersion=1 FTSSpec objects treat two-letter language annotations as "none" + // for purposes of stopword processing. + { + BSONObj indexSpec = fromjson( "{key: {'a': 'text'}, textIndexVersion: 1}" ); + FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); + TermFrequencyMap tfm; + spec.scoreDocument( obj, spec.defaultLanguage(), "", false, &tfm ); + ASSERT_EQUALS( tfm.size(), 1U ); // "the" not recognized as stopword + } + + // textIndexVersion=2 FTSSpec objects recognize two-letter codes. + { + BSONObj indexSpec = fromjson( "{key: {'a': 'text'}, textIndexVersion: 2}" ); + FTSSpec spec( FTSSpec::fixSpec( indexSpec ) ); + TermFrequencyMap tfm; + spec.scoreDocument( obj, spec.defaultLanguage(), "", false, &tfm ); + ASSERT_EQUALS( tfm.size(), 0U ); // "the" recognized as stopword + } + } } } diff --git a/src/mongo/db/fts/fts_util.h b/src/mongo/db/fts/fts_util.h index d16f605c16c..fe9ad0de341 100644 --- a/src/mongo/db/fts/fts_util.h +++ b/src/mongo/db/fts/fts_util.h @@ -44,6 +44,12 @@ namespace mongo { extern const std::string WILDCARD; extern const std::string INDEX_NAME; + enum TextIndexVersion { + TEXT_INDEX_VERSION_1 = 1, // Legacy index format. Deprecated. + TEXT_INDEX_VERSION_2 = 2 // Current index format. + }; + + /** * destructive! */ diff --git a/src/mongo/db/fts/generate_stop_words.py b/src/mongo/db/fts/generate_stop_words.py index 2434e30fdc7..e0dc801ca92 100644 --- a/src/mongo/db/fts/generate_stop_words.py +++ b/src/mongo/db/fts/generate_stop_words.py @@ -10,13 +10,13 @@ def generate( header, source, language_files ): out = open( header, "wb" ) out.write( """ #pragma once -#include <map> #include <set> #include <string> +#include "mongo/util/string_map.h" namespace mongo { namespace fts { - void loadStopWordMap( std::map< std::string, std::set< std::string > >* m ); + void loadStopWordMap( StringMap< std::set< std::string > >* m ); } } """ ) @@ -30,7 +30,7 @@ namespace fts { namespace mongo { namespace fts { - void loadStopWordMap( std::map< std::string, std::set< std::string > >* m ) { + void loadStopWordMap( StringMap< std::set< std::string > >* m ) { """ ) diff --git a/src/mongo/db/fts/stemmer.cpp b/src/mongo/db/fts/stemmer.cpp index 5c4431f9712..a86cfda8015 100644 --- a/src/mongo/db/fts/stemmer.cpp +++ b/src/mongo/db/fts/stemmer.cpp @@ -32,12 +32,13 @@ #include <string> #include "mongo/db/fts/stemmer.h" +#include "mongo/util/mongoutils/str.h" namespace mongo { namespace fts { - Stemmer::Stemmer( const FTSLanguage language ) { + Stemmer::Stemmer( const FTSLanguage& language ) { _stemmer = NULL; if ( language.str() != "none" ) _stemmer = sb_stemmer_new(language.str().c_str(), "UTF_8"); diff --git a/src/mongo/db/fts/stemmer.h b/src/mongo/db/fts/stemmer.h index 9b06bda4f2e..fe028e2aba7 100644 --- a/src/mongo/db/fts/stemmer.h +++ b/src/mongo/db/fts/stemmer.h @@ -48,7 +48,7 @@ namespace mongo { */ class Stemmer { public: - Stemmer( const FTSLanguage language ); + Stemmer( const FTSLanguage& language ); ~Stemmer(); std::string stem( const StringData& word ) const; diff --git a/src/mongo/db/fts/stemmer_test.cpp b/src/mongo/db/fts/stemmer_test.cpp index 1833f20fe37..9037715d4da 100644 --- a/src/mongo/db/fts/stemmer_test.cpp +++ b/src/mongo/db/fts/stemmer_test.cpp @@ -31,16 +31,23 @@ #include "mongo/unittest/unittest.h" +#include "mongo/db/fts/fts_spec.h" #include "mongo/db/fts/stemmer.h" namespace mongo { namespace fts { TEST( English, Stemmer1 ) { - Stemmer s( FTSLanguage::makeFTSLanguage( "english" ).getValue() ); + Stemmer s( languageEnglishV2 ); ASSERT_EQUALS( "run", s.stem( "running" ) ); ASSERT_EQUALS( "Run", s.stem( "Running" ) ); } + TEST( English, Caps ) { + Stemmer s( languagePorterV1 ); + ASSERT_EQUALS( "unit", s.stem( "united" ) ); + ASSERT_EQUALS( "Unite", s.stem( "United" ) ); + } + } } diff --git a/src/mongo/db/fts/stop_words.cpp b/src/mongo/db/fts/stop_words.cpp index d858992f5ce..bc0240600c1 100644 --- a/src/mongo/db/fts/stop_words.cpp +++ b/src/mongo/db/fts/stop_words.cpp @@ -28,14 +28,13 @@ * it in the license file. */ -#include <map> #include <set> #include <string> #include "mongo/db/fts/stop_words.h" #include "mongo/base/init.h" -#include "mongo/platform/unordered_map.h" +#include "mongo/util/string_map.h" @@ -43,10 +42,10 @@ namespace mongo { namespace fts { - void loadStopWordMap( std::map< std::string, std::set< std::string > >* m ); + void loadStopWordMap( StringMap< std::set< std::string > >* m ); namespace { - unordered_map<string,StopWords*> STOP_WORDS; + StringMap<StopWords*> STOP_WORDS; StopWords* empty = NULL; } @@ -59,8 +58,8 @@ namespace mongo { _words.insert( *i ); } - const StopWords* StopWords::getStopWords( const FTSLanguage language ) { - unordered_map<string,StopWords*>::const_iterator i = STOP_WORDS.find( language.str() ); + const StopWords* StopWords::getStopWords( const FTSLanguage& language ) { + StringMap<StopWords*>::const_iterator i = STOP_WORDS.find( language.str() ); if ( i == STOP_WORDS.end() ) return empty; return i->second; @@ -70,9 +69,9 @@ namespace mongo { MONGO_INITIALIZER(StopWords)(InitializerContext* context) { empty = new StopWords(); - std::map< std::string, std::set< std::string > > raw; + StringMap< std::set< std::string > > raw; loadStopWordMap( &raw ); - for ( std::map< std::string, std::set< std::string > >::const_iterator i = raw.begin(); + for ( StringMap< std::set< std::string > >::const_iterator i = raw.begin(); i != raw.end(); ++i ) { STOP_WORDS[i->first] = new StopWords( i->second ); diff --git a/src/mongo/db/fts/stop_words.h b/src/mongo/db/fts/stop_words.h index 24e433c6992..22ec22f3fa8 100644 --- a/src/mongo/db/fts/stop_words.h +++ b/src/mongo/db/fts/stop_words.h @@ -52,7 +52,7 @@ namespace mongo { size_t numStopWords() const { return _words.size(); } - static const StopWords* getStopWords( const FTSLanguage langauge ); + static const StopWords* getStopWords( const FTSLanguage& langauge ); private: ~StopWords(){} unordered_set<std::string> _words; diff --git a/src/mongo/db/fts/stop_words_test.cpp b/src/mongo/db/fts/stop_words_test.cpp index 4d6b78f7f6b..0edf4e2540c 100644 --- a/src/mongo/db/fts/stop_words_test.cpp +++ b/src/mongo/db/fts/stop_words_test.cpp @@ -28,6 +28,7 @@ * it in the license file. */ +#include "mongo/db/fts/fts_spec.h" #include "mongo/db/fts/stop_words.h" #include "mongo/unittest/unittest.h" @@ -35,8 +36,7 @@ namespace mongo { namespace fts { TEST( English, Basic1 ) { - FTSLanguage language = FTSLanguage::makeFTSLanguage( "english" ).getValue(); - const StopWords* englishStopWords = StopWords::getStopWords( language ); + const StopWords* englishStopWords = StopWords::getStopWords( languageEnglishV2 ); ASSERT( englishStopWords->isStopWord( "the" ) ); ASSERT( !englishStopWords->isStopWord( "computer" ) ); } diff --git a/src/mongo/db/fts/tokenizer.cpp b/src/mongo/db/fts/tokenizer.cpp index 1a25898bd75..53580fca4be 100644 --- a/src/mongo/db/fts/tokenizer.cpp +++ b/src/mongo/db/fts/tokenizer.cpp @@ -31,15 +31,16 @@ #include <string> #include "mongo/db/fts/tokenizer.h" +#include "mongo/util/mongoutils/str.h" #include "mongo/util/stringutils.h" namespace mongo { namespace fts { - Tokenizer::Tokenizer( const FTSLanguage language, const StringData& str ) + Tokenizer::Tokenizer( const FTSLanguage& language, const StringData& str ) : _pos(0), _raw( str ) { - _english = language.str() == "english"; + _english = ( language.str() == "english" ); _skipWhitespace(); _previousWhiteSpace = true; } diff --git a/src/mongo/db/fts/tokenizer.h b/src/mongo/db/fts/tokenizer.h index 6930f7543f6..06e63f6aa2f 100644 --- a/src/mongo/db/fts/tokenizer.h +++ b/src/mongo/db/fts/tokenizer.h @@ -61,7 +61,7 @@ namespace mongo { class Tokenizer { public: - Tokenizer( const FTSLanguage language, const StringData& str ); + Tokenizer( const FTSLanguage& language, const StringData& str ); bool more() const; Token next(); diff --git a/src/mongo/db/fts/tokenizer_test.cpp b/src/mongo/db/fts/tokenizer_test.cpp index eac91987c61..29153a329a6 100644 --- a/src/mongo/db/fts/tokenizer_test.cpp +++ b/src/mongo/db/fts/tokenizer_test.cpp @@ -28,6 +28,7 @@ * it in the license file. */ +#include "mongo/db/fts/fts_spec.h" #include "mongo/db/fts/tokenizer.h" #include "mongo/unittest/unittest.h" @@ -35,14 +36,12 @@ namespace mongo { namespace fts { TEST( Tokenizer, Empty1 ) { - Tokenizer i( FTSLanguage::makeFTSLanguage( "english" ).getValue(), - "" ); + Tokenizer i( languageEnglishV2, "" ); ASSERT( !i.more() ); } TEST( Tokenizer, Basic1 ) { - Tokenizer i( FTSLanguage::makeFTSLanguage( "english" ).getValue(), - "blue red green" ); + Tokenizer i( languageEnglishV2, "blue red green" ); ASSERT( i.more() ); ASSERT_EQUALS( i.next().data.toString(), "blue" ); @@ -57,8 +56,7 @@ namespace mongo { } TEST( Tokenizer, Basic2 ) { - Tokenizer i( FTSLanguage::makeFTSLanguage( "english" ).getValue(), - "blue-red" ); + Tokenizer i( languageEnglishV2, "blue-red" ); Token a = i.next(); Token b = i.next(); @@ -80,8 +78,7 @@ namespace mongo { } TEST( Tokenizer, Basic3 ) { - Tokenizer i( FTSLanguage::makeFTSLanguage( "english" ).getValue(), - "blue -red" ); + Tokenizer i( languageEnglishV2, "blue -red" ); Token a = i.next(); Token b = i.next(); @@ -108,8 +105,7 @@ namespace mongo { } TEST( Tokenizer, Quote1English ) { - Tokenizer i( FTSLanguage::makeFTSLanguage( "english" ).getValue(), - "eliot's car" ); + Tokenizer i( languageEnglishV2, "eliot's car" ); Token a = i.next(); Token b = i.next(); @@ -119,8 +115,7 @@ namespace mongo { } TEST( Tokenizer, Quote1French ) { - Tokenizer i( FTSLanguage::makeFTSLanguage( "french" ).getValue(), - "eliot's car" ); + Tokenizer i( languageFrenchV2, "eliot's car" ); Token a = i.next(); Token b = i.next(); diff --git a/src/mongo/db/matcher/expression_parser_text.cpp b/src/mongo/db/matcher/expression_parser_text.cpp index 3b2b6bd6ea0..86b16e71130 100644 --- a/src/mongo/db/matcher/expression_parser_text.cpp +++ b/src/mongo/db/matcher/expression_parser_text.cpp @@ -30,6 +30,7 @@ #include "mongo/base/init.h" #include "mongo/db/fts/fts_language.h" +#include "mongo/db/fts/fts_spec.h" #include "mongo/db/jsobj.h" #include "mongo/db/matcher/expression_parser.h" #include "mongo/db/matcher/expression_text.h" @@ -52,7 +53,9 @@ namespace mongo { "$language needs a String" ); } language = languageElt.String(); - if ( !fts::FTSLanguage::makeFTSLanguage( language ).getStatus().isOK() ) { + Status status = + fts::FTSLanguage::make( language, fts::TEXT_INDEX_VERSION_2 ).getStatus(); + if ( !status.isOK() ) { return StatusWithMatchExpression( ErrorCodes::BadValue, "$language specifies unsupported language" ); } diff --git a/src/mongo/db/query/stage_builder.cpp b/src/mongo/db/query/stage_builder.cpp index 781e4eaff19..764675bdd5d 100644 --- a/src/mongo/db/query/stage_builder.cpp +++ b/src/mongo/db/query/stage_builder.cpp @@ -234,9 +234,9 @@ namespace mongo { return NULL; } - string language = ("" == node->_language - ? fam->getSpec().defaultLanguage().str() - : node->_language); + StringData language = ("" == node->_language + ? fam->getSpec().defaultLanguage().str().c_str() + : node->_language); FTSQuery ftsq; Status parseStatus = ftsq.parse(node->_query, language); |