diff options
author | Jason Rassi <rassi@10gen.com> | 2013-11-13 03:46:19 -0500 |
---|---|---|
committer | Jason Rassi <rassi@10gen.com> | 2013-11-13 03:46:19 -0500 |
commit | 075ff3c615cc9c53aab272af32a180d9f1ceda4a (patch) | |
tree | a392a303536389756834a3909f88516155ac013f | |
parent | ef9a27f0feaa44ad4f897e49a52c9b755cf3d496 (diff) | |
download | mongo-075ff3c615cc9c53aab272af32a180d9f1ceda4a.tar.gz |
SERVER-9932 SERVER-10857 Text search language aliases/validation
Users can now specify a language by its two-letter code (for example,
"es" for "spanish"), and language strings are matched case-insensitively.
The following operations will now fail if they do not pass language
validation:
- Building a text index (if default_language fails validation or if
the target collection contains a document with a language that
fails validation).
- Inserting a document into a text-indexed collection (if it
specifies a language that fails validation).
- A text query or text command (if it specifies a language that
fails validation).
26 files changed, 567 insertions, 90 deletions
diff --git a/jstests/fts_mix.js b/jstests/fts_mix.js index 7acff093480..56da123cdc3 100644 --- a/jstests/fts_mix.js +++ b/jstests/fts_mix.js @@ -67,6 +67,13 @@ res = tc.runCommand( "text", { search: "magazine's" } ); res2 = tc.runCommand( "text", { search: "magazine" } ); assert.eq( getIDS( res ), getIDS( res2 ) ); +// -------------------------------------------- LANGUAGE ------------------------------------------- + +res = tc.runCommand( "text", { search: "member", language: "spanglish" } ); +assert.commandFailed( res ); +res = tc.runCommand( "text", { search: "member", language: "english" } ); +assert.commandWorked( res ); + // -------------------------------------------- LIMIT RESULTS -------------------------------------- // ensure limit limits results diff --git a/jstests/fts_querylang.js b/jstests/fts_querylang.js index ebc4e05c216..f03b1836853 100644 --- a/jstests/fts_querylang.js +++ b/jstests/fts_querylang.js @@ -66,7 +66,15 @@ assert.throws(function() { t.find({$or: [{$text: {$search: "content -irrelevant" // TODO Test invalid inputs for $text, $search, $language. -// TODO Test $language. +// Test $language. +cursor = t.find({$text: {$search: "contents", $language: "none"}}); +assert.eq(false, cursor.hasNext()); + +cursor = t.find({$text: {$search: "contents", $language: "EN"}}); +assert.eq(true, cursor.hasNext()); + +cursor = t.find({$text: {$search: "contents", $language: "spanglish"}}); +assert.throws(function() { cursor.next() }); // TODO Test $and of basic text query with geo expression. 
diff --git a/jstests/fts_spanish.js b/jstests/fts_spanish.js index 136eaf17ae1..cdf73343b5f 100644 --- a/jstests/fts_spanish.js +++ b/jstests/fts_spanish.js @@ -7,26 +7,25 @@ t.drop(); t.save( { _id: 1, title: "mi blog", text: "Este es un blog de prueba" } ); t.save( { _id: 2, title: "mi segundo post", text: "Este es un blog de prueba" } ); t.save( { _id: 3, title: "cuchillos son divertidos", text: "este es mi tercer blog stemmed" } ); -t.save( { _id: 4, language: "english", title: "My fourth blog", text: "This stemmed blog is in english" } ); +t.save( { _id: 4, language: "en", title: "My fourth blog", text: "This stemmed blog is in english" } ); // default weight is 1 // specify weights if you want a field to be more meaningull t.ensureIndex( { "title": "text", text: "text" }, { weights: { title: 10 }, - default_language: "spanish" } ); + default_language: "es" } ); res = t.runCommand( "text", { search: "blog" } ); assert.eq( 4, res.results.length ); assert.eq( [4], queryIDS( t, "stem" ) ); assert.eq( [3], queryIDS( t, "stemmed" ) ); -assert.eq( [4], queryIDS( t, "stemmed", null, { language : "english" } ) ); +assert.eq( [4], queryIDS( t, "stemmed", null, { language : "en" } ) ); assert.eq( [1,2], queryIDS( t, "prueba" ) ); +t.save( { _id: 5, language: "spanglish", title: "", text: "" } ); +assert( db.getLastError() ); - - - - - - +t.dropIndexes(); +t.ensureIndex( { "title": "text", text: "text" }, { default_language: "spanglish" } ); +assert( db.getLastError() ); diff --git a/src/mongo/db/exec/stagedebug_cmd.cpp b/src/mongo/db/exec/stagedebug_cmd.cpp index 261baf036c4..4c43a7b4e0f 100644 --- a/src/mongo/db/exec/stagedebug_cmd.cpp +++ b/src/mongo/db/exec/stagedebug_cmd.cpp @@ -375,7 +375,7 @@ namespace mongo { params.spec = fam->getSpec(); - if (!params.query.parse(search, fam->getSpec().defaultLanguage()).isOK()) { + if (!params.query.parse(search, fam->getSpec().defaultLanguage().str()).isOK()) { return NULL; } diff --git a/src/mongo/db/fts/SConscript 
b/src/mongo/db/fts/SConscript index 369c8f7892d..e1a1ebc2ec1 100644 --- a/src/mongo/db/fts/SConscript +++ b/src/mongo/db/fts/SConscript @@ -29,6 +29,7 @@ env.StaticLibrary('base', [ 'fts_matcher.cpp', 'fts_query.cpp', 'fts_spec.cpp', + 'fts_language.cpp', 'fts_util.cpp', 'stemmer.cpp', 'stop_words.cpp', @@ -74,6 +75,9 @@ env.CppUnitTest( "fts_query_test", "fts_query_test.cpp", env.CppUnitTest( "fts_spec_test", "fts_spec_test.cpp", LIBDEPS=["base"] ) +env.CppUnitTest( "fts_language_test", "fts_language_test.cpp", + LIBDEPS=["base"] ) + env.CppUnitTest( "fts_matcher_test", "fts_matcher_test.cpp", LIBDEPS=["base"] ) diff --git a/src/mongo/db/fts/fts_command_mongod.cpp b/src/mongo/db/fts/fts_command_mongod.cpp index 2a95e79038b..74022b7b9d4 100644 --- a/src/mongo/db/fts/fts_command_mongod.cpp +++ b/src/mongo/db/fts/fts_command_mongod.cpp @@ -107,7 +107,7 @@ namespace mongo { IndexDescriptor* descriptor = collection->getIndexCatalog()->getDescriptor(idxMatches[0]); auto_ptr<FTSAccessMethod> fam(new FTSAccessMethod(descriptor)); if ( language == "" ) { - language = fam->getSpec().defaultLanguage(); + language = fam->getSpec().defaultLanguage().str(); } Status s = fam->getSpec().getIndexPrefix( filter, &indexPrefix ); if ( !s.isOK() ) { diff --git a/src/mongo/db/fts/fts_language.cpp b/src/mongo/db/fts/fts_language.cpp new file mode 100644 index 00000000000..61bd33cf93f --- /dev/null +++ b/src/mongo/db/fts/fts_language.cpp @@ -0,0 +1,152 @@ +// fts_language.cpp + +/** + * Copyright (C) 2013 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. 
+ * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/fts/fts_language.h" + +#include <string> + +#include "mongo/base/init.h" +#include "mongo/util/assert_util.h" +#include "mongo/util/string_map.h" +#include "mongo/util/stringutils.h" + +namespace mongo { + + namespace fts { + + namespace { + + // Supported languages in canonical form (English names, lowercased). Includes "none". 
+ const string LanguageNone( "none" ); + const string LanguageDanish( "danish" ); + const string LanguageDutch( "dutch" ); + const string LanguageEnglish( "english" ); + const string LanguageFinnish( "finnish" ); + const string LanguageFrench( "french" ); + const string LanguageGerman( "german" ); + const string LanguageHungarian( "hungarian" ); + const string LanguageItalian( "italian" ); + const string LanguageNorwegian( "norwegian" ); + const string LanguagePortuguese( "portuguese" ); + const string LanguageRomanian( "romanian" ); + const string LanguageRussian( "russian" ); + const string LanguageSpanish( "spanish" ); + const string LanguageSwedish( "swedish" ); + const string LanguageTurkish( "turkish" ); + + // Map from lowercased user string to language string. Resolves any language aliases + // (two-letter codes). + typedef StringMap<std::string> LanguageMap; + LanguageMap languageMap; + } + + MONGO_INITIALIZER( FTSLanguageMap )( InitializerContext* context ) { + languageMap[LanguageNone] = LanguageNone; + + languageMap["da"] = LanguageDanish; + languageMap[LanguageDanish] = LanguageDanish; + languageMap["nl"] = LanguageDutch; + languageMap[LanguageDutch] = LanguageDutch; + languageMap["en"] = LanguageEnglish; + languageMap[LanguageEnglish] = LanguageEnglish; + languageMap["fi"] = LanguageFinnish; + languageMap[LanguageFinnish] = LanguageFinnish; + languageMap["fr"] = LanguageFrench; + languageMap[LanguageFrench] = LanguageFrench; + languageMap["de"] = LanguageGerman; + languageMap[LanguageGerman] = LanguageGerman; + languageMap["hu"] = LanguageHungarian; + languageMap[LanguageHungarian] = LanguageHungarian; + languageMap["it"] = LanguageItalian; + languageMap[LanguageItalian] = LanguageItalian; + languageMap["nb"] = LanguageNorwegian; + languageMap[LanguageNorwegian] = LanguageNorwegian; + languageMap["pt"] = LanguagePortuguese; + languageMap[LanguagePortuguese] = LanguagePortuguese; + languageMap["ro"] = LanguageRomanian; + languageMap[LanguageRomanian] = 
LanguageRomanian; + languageMap["ru"] = LanguageRussian; + languageMap[LanguageRussian] = LanguageRussian; + languageMap["es"] = LanguageSpanish; + languageMap[LanguageSpanish] = LanguageSpanish; + languageMap["sv"] = LanguageSwedish; + languageMap[LanguageSwedish] = LanguageSwedish; + languageMap["tr"] = LanguageTurkish; + languageMap[LanguageTurkish] = LanguageTurkish; + return Status::OK(); + } + + FTSLanguage::FTSLanguage() + : _lang() { + } + + FTSLanguage::FTSLanguage( const FTSLanguage& other ) + : _lang( other._lang ) { + } + + FTSLanguage& FTSLanguage::operator=( const FTSLanguage& other ) { + _lang = other._lang; + return *this; + } + + FTSLanguage::~FTSLanguage() { + } + + Status FTSLanguage::init( const std::string& lang ) { + // Lowercase. + std::string langLower = tolowerString( lang ); + + // Resolve language aliases. + LanguageMap::const_iterator it = languageMap.find( langLower ); + if ( it == languageMap.end() ) { + return Status( ErrorCodes::BadValue, + "unsupported language: \"" + lang + "\"" ); + } + + _lang = StringData( it->second ); + return Status::OK(); + } + + std::string FTSLanguage::str() const { + verify( !_lang.empty() ); + return _lang.toString(); + } + + StatusWithFTSLanguage FTSLanguage::makeFTSLanguage( const std::string& lang ) { + FTSLanguage language; + Status s = language.init( lang ); + if ( !s.isOK() ) { + return StatusWithFTSLanguage( s ); + } + return StatusWithFTSLanguage( language ); + } + + } +} diff --git a/src/mongo/db/fts/fts_language.h b/src/mongo/db/fts/fts_language.h new file mode 100644 index 00000000000..9a1d7053ea8 --- /dev/null +++ b/src/mongo/db/fts/fts_language.h @@ -0,0 +1,92 @@ +// fts_language.h + +/** + * Copyright (C) 2013 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include "mongo/base/status_with.h" + +#include <string> + +namespace mongo { + + namespace fts { + + /** + * A FTSLanguage is a copyable glorified enum representing a language for a text-indexed + * document or a text search. Example of suggested usage: + * + * StatusWithFTSLanguage swl = FTSLanguage::makeFTSLanguage( "en" ); + * if ( !swl.getStatus().isOK() ) { + * // Error. + * } + * else { + * const FTSLanguage language = swl.getValue(); + * // Use language. + * } + */ + class FTSLanguage { + public: + /** Create an uninitialized language. */ + FTSLanguage(); + + ~FTSLanguage(); + FTSLanguage( const FTSLanguage& ); + FTSLanguage& operator=( const FTSLanguage & ); + + /** + * Initialize an FTSLanguage from a language string. 
Language strings are + * case-insensitive, and can be in one of the two following forms: + * - English name, like "spanish". + * - Two-letter code, like "es". + * Returns an error Status if an invalid language string is passed. + */ + Status init( const std::string& lang ); + + /** + * Returns the language as a string in canonical form (lowercased English name). It is + * an error to call str() on an uninitialized language. + */ + std::string str() const; + + /** + * Convenience method for creating an FTSLanguage out of a language string. Caller + * must check getStatus().isOK() on return value. + */ + static StatusWith<const FTSLanguage> makeFTSLanguage( const std::string& lang ); + + private: + // Pointer to string representation of language. Not owned here. + StringData _lang; + }; + + typedef StatusWith<const FTSLanguage> StatusWithFTSLanguage; + + } +} diff --git a/src/mongo/db/fts/fts_language_test.cpp b/src/mongo/db/fts/fts_language_test.cpp new file mode 100644 index 00000000000..5fdd9a4aa73 --- /dev/null +++ b/src/mongo/db/fts/fts_language_test.cpp @@ -0,0 +1,106 @@ +// fts_language_test.cpp + +/** + * Copyright (C) 2013 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/pch.h" +#include "mongo/db/fts/fts_language.h" +#include "mongo/unittest/unittest.h" + +namespace mongo { + + namespace fts { + + // Positive tests for FTSLanguage::init() and FTSLanguage::str(). + + TEST( FTSLanguage, ExactLanguage ) { + FTSLanguage lang; + Status s = lang.init( "spanish" ); + ASSERT( s.isOK() ); + ASSERT_EQUALS( lang.str(), "spanish" ); + } + + TEST( FTSLanguage, ExactCode ) { + FTSLanguage lang; + Status s = lang.init( "es" ); + ASSERT( s.isOK() ); + ASSERT_EQUALS( lang.str(), "spanish" ); + } + + TEST( FTSLanguage, UpperCaseLanguage ) { + FTSLanguage lang; + Status s = lang.init( "SPANISH" ); + ASSERT( s.isOK() ); + ASSERT_EQUALS( lang.str(), "spanish" ); + } + + TEST( FTSLanguage, UpperCaseCode ) { + FTSLanguage lang; + Status s = lang.init( "ES" ); + ASSERT( s.isOK() ); + ASSERT_EQUALS( lang.str(), "spanish" ); + } + + TEST( FTSLanguage, NoneLanguage ) { + FTSLanguage lang; + Status s = lang.init( "none" ); + ASSERT( s.isOK() ); + ASSERT_EQUALS( lang.str(), "none" ); + } + + // Negative tests for FTSLanguage::init() and FTSLanguage::str(). 
+ + TEST( FTSLanguage, Unknown ) { + FTSLanguage lang; + Status s = lang.init( "spanglish" ); + ASSERT( !s.isOK() ); + } + + TEST( FTSLanguage, Empty ) { + FTSLanguage lang; + Status s = lang.init( "" ); + ASSERT( !s.isOK() ); + } + + // Positive tests for FTSLanguage::makeFTSLanguage(). + + TEST( FTSLanguage, MakeFTSLanguage1 ) { + StatusWithFTSLanguage swl = FTSLanguage::makeFTSLanguage( "english" ); + ASSERT( swl.getStatus().isOK() ); + ASSERT_EQUALS( swl.getValue().str(), "english" ); + } + + // Negative tests for FTSLanguage::makeFTSLanguage(). + + TEST( FTSLanguage, MakeFTSLanguage2 ) { + StatusWithFTSLanguage swl = FTSLanguage::makeFTSLanguage( "onglish" ); + ASSERT( !swl.getStatus().isOK() ); + } + + } +} diff --git a/src/mongo/db/fts/fts_query.cpp b/src/mongo/db/fts/fts_query.cpp index b03871cc9dd..2f5e215ce64 100644 --- a/src/mongo/db/fts/fts_query.cpp +++ b/src/mongo/db/fts/fts_query.cpp @@ -43,10 +43,13 @@ namespace mongo { Status FTSQuery::parse(const string& query, const string& language) { _search = query; - _language = language; + Status status = _language.init( language ); + if ( !status.isOK() ) { + return status; + } - const StopWords* stopWords = StopWords::getStopWords( language ); - Stemmer stemmer( language ); + const StopWords* stopWords = StopWords::getStopWords( _language ); + Stemmer stemmer( _language ); bool inNegation = false; bool inPhrase = false; diff --git a/src/mongo/db/fts/fts_query.h b/src/mongo/db/fts/fts_query.h index 1bb3a9e9e17..4eec8d404c8 100644 --- a/src/mongo/db/fts/fts_query.h +++ b/src/mongo/db/fts/fts_query.h @@ -69,7 +69,7 @@ namespace mongo { } string getSearch() const { return _search; } - string getLanguage() const { return _language; } + const FTSLanguage getLanguage() const { return _language; } string toString() const; @@ -77,7 +77,7 @@ namespace mongo { protected: string _search; - string _language; + FTSLanguage _language; vector<string> _terms; unordered_set<string> _negatedTerms; vector<string> _phrases; 
diff --git a/src/mongo/db/fts/fts_spec.cpp b/src/mongo/db/fts/fts_spec.cpp index d3339078628..6f88df3534d 100644 --- a/src/mongo/db/fts/fts_spec.cpp +++ b/src/mongo/db/fts/fts_spec.cpp @@ -45,15 +45,19 @@ namespace mongo { const double MAX_WEIGHT = 1000000000; const double MAX_WORD_WEIGHT = MAX_WEIGHT / 10000; + namespace { + // Default language. Used for new indexes. + const std::string moduleDefaultLanguage( "english" ); + } + FTSSpec::FTSSpec( const BSONObj& indexInfo ) { massert( 16739, "found invalid spec for text index", indexInfo["weights"].isABSONObj() ); - _defaultLanguage = indexInfo["default_language"].valuestrsafe(); - _languageOverrideField = indexInfo["language_override"].valuestrsafe(); + Status status = _defaultLanguage.init( indexInfo["default_language"].String() ); + verify( status.isOK() ); - if ( _defaultLanguage.size() == 0 ) - _defaultLanguage = "english"; + _languageOverrideField = indexInfo["language_override"].valuestrsafe(); if ( _languageOverrideField.size() == 0 ) _languageOverrideField = "language"; @@ -103,15 +107,20 @@ namespace mongo { } } - string FTSSpec::getLanguageToUse( const BSONObj& userDoc, - const string& currentLanguage ) const { + const FTSLanguage FTSSpec::getLanguageToUse( const BSONObj& userDoc, + const FTSLanguage currentLanguage ) const { BSONElement e = userDoc[_languageOverrideField]; - if ( e.type() == String ) { - const char * x = e.valuestrsafe(); - if ( strlen( x ) > 0 ) - return x; + if ( e.eoo() ) { + return currentLanguage; } - return currentLanguage; + uassert( 17261, + "found language override field in document with non-string type", + e.type() == mongo::String ); + StatusWithFTSLanguage swl = FTSLanguage::makeFTSLanguage( e.String() ); + uassert( 17262, + "language override unsupported: " + e.String(), + swl.getStatus().isOK() ); + return swl.getValue(); } @@ -129,11 +138,11 @@ namespace mongo { } void FTSSpec::scoreDocument( const BSONObj& obj, - const string& parentLanguage, + const FTSLanguage 
parentLanguage, const string& parentPath, bool isArray, TermFrequencyMap* term_freqs ) const { - string language = getLanguageToUse( obj, parentLanguage ); + const FTSLanguage language = getLanguageToUse( obj, parentLanguage ); Stemmer stemmer( language ); Tools tools( language, &stemmer, StopWords::getStopWords( language ) ); @@ -374,9 +383,19 @@ namespace mongo { weights = b.obj(); } - string default_language(spec.getStringField("default_language")); - if ( default_language.empty() ) - default_language = "english"; + BSONElement default_language_elt = spec["default_language"]; + string default_language( default_language_elt.str() ); + if ( default_language_elt.eoo() ) { + default_language = moduleDefaultLanguage; + } + else { + uassert( 17263, + "default_language needs a string type", + default_language_elt.type() == String ); + } + uassert( 17264, + "default_language is not valid", + FTSLanguage::makeFTSLanguage( default_language ).getStatus().isOK() ); string language_override(spec.getStringField("language_override")); if ( language_override.empty() ) diff --git a/src/mongo/db/fts/fts_spec.h b/src/mongo/db/fts/fts_spec.h index d13281eac9f..258ecf7407a 100644 --- a/src/mongo/db/fts/fts_spec.h +++ b/src/mongo/db/fts/fts_spec.h @@ -34,6 +34,7 @@ #include <vector> #include <string> +#include "mongo/db/fts/fts_language.h" #include "mongo/db/fts/fts_util.h" #include "mongo/db/fts/stemmer.h" #include "mongo/db/fts/stop_words.h" @@ -54,14 +55,14 @@ namespace mongo { class FTSSpec { struct Tools { - Tools( string _language, + Tools( const FTSLanguage _language, const Stemmer* _stemmer, const StopWords* _stopwords ) : language( _language ) , stemmer( _stemmer ) , stopwords( _stopwords ) {} - const std::string& language; + const FTSLanguage language; const Stemmer* stemmer; const StopWords* stopwords; }; @@ -70,7 +71,7 @@ namespace mongo { FTSSpec( const BSONObj& indexInfo ); bool wildcard() const { return _wildcard; } - const string& defaultLanguage() const { return 
_defaultLanguage; } + const FTSLanguage defaultLanguage() const { return _defaultLanguage; } const string& languageOverrideField() const { return _languageOverrideField; } size_t numExtraBefore() const { return _extraBefore.size(); } @@ -80,13 +81,6 @@ namespace mongo { const std::string& extraAfter( unsigned i ) const { return _extraAfter[i]; } /** - * Find a "language" field, if any, in a given BSON doc. If the language is not on the - * list of valid languages, return current. - */ - string getLanguageToUse( const BSONObj& userDoc, - const std::string& currentLanguage ) const; - - /** * Calculates term/score pairs for a BSONObj as applied to this spec. * - "obj": the BSONObj to traverse; can be a subdocument or array * - "parentLanguage": nearest enclosing document "language" spec for obj @@ -95,7 +89,7 @@ namespace mongo { * - "term_freqs": out-parameter to store results */ void scoreDocument( const BSONObj& obj, - const string& parentLanguage, + const FTSLanguage parentLanguage, const string& parentPath, bool isArray, TermFrequencyMap* term_freqs ) const; @@ -109,12 +103,19 @@ namespace mongo { static BSONObj fixSpec( const BSONObj& spec ); private: + /** + * Get the language override for the given BSON doc. If no language override is + * specified, returns currentLanguage. 
+ */ + const FTSLanguage getLanguageToUse( const BSONObj& userDoc, + const FTSLanguage currentLanguage ) const; + void _scoreString( const Tools& tools, const StringData& raw, TermFrequencyMap* term_freqs, double weight ) const; - string _defaultLanguage; + FTSLanguage _defaultLanguage; string _languageOverrideField; bool _wildcard; diff --git a/src/mongo/db/fts/fts_spec_test.cpp b/src/mongo/db/fts/fts_spec_test.cpp index b8bb292c777..1cbc9e61eeb 100644 --- a/src/mongo/db/fts/fts_spec_test.cpp +++ b/src/mongo/db/fts/fts_spec_test.cpp @@ -47,6 +47,29 @@ namespace mongo { ASSERT_EQUALS( fixed, fixed2 ); } + TEST( FTSSpec, DefaultLanguage1 ) { + BSONObj user = BSON( "key" << BSON( "text" << "fts" ) << + "default_language" << "spanish" ); + + try { + BSONObj fixed = FTSSpec::fixSpec( user ); + } + catch ( UserException& e ) { + ASSERT(false); + } + } + + TEST( FTSSpec, DefaultLanguage2 ) { + BSONObj user = BSON( "key" << BSON( "text" << "fts" ) << + "default_language" << "spanglish" ); + + try { + BSONObj fixed = FTSSpec::fixSpec( user ); + ASSERT(false); + } + catch ( UserException& e ) {} + } + TEST( FTSSpec, ScoreSingleField1 ) { BSONObj user = BSON( "key" << BSON( "title" << "fts" << "text" << "fts" ) << @@ -56,7 +79,10 @@ namespace mongo { TermFrequencyMap m; spec.scoreDocument( BSON( "title" << "cat sat run" ), - "english", "", false, &m ); + FTSLanguage::makeFTSLanguage( "english" ).getValue(), + "", + false, + &m ); ASSERT_EQUALS( 3U, m.size() ); ASSERT_EQUALS( m["cat"], m["sat"] ); ASSERT_EQUALS( m["cat"], m["run"] ); @@ -72,7 +98,10 @@ namespace mongo { TermFrequencyMap m; spec.scoreDocument( BSON( "title" << "cat sat run" << "text" << "cat book" ), - "english", "", false, &m ); + FTSLanguage::makeFTSLanguage( "english" ).getValue(), + "", + false, + &m ); ASSERT_EQUALS( 4U, m.size() ); ASSERT_EQUALS( m["sat"], m["run"] ); @@ -94,7 +123,10 @@ namespace mongo { TermFrequencyMap m; spec.scoreDocument( BSON( "title" << "cat sat sat run run run" ), - "english", 
"", false, &m ); + FTSLanguage::makeFTSLanguage( "english" ).getValue(), + "", + false, + &m ); ASSERT_EQUALS( 3U, m.size() ); ASSERT( m["cat"] > 0 ); ASSERT( m["sat"] > m["cat"] ); @@ -163,7 +195,11 @@ namespace mongo { // The following document matches {"a.b": {$type: 2}}, so "term" should be indexed. BSONObj obj = fromjson("{a: [{b: ['term']}]}"); // indirectly nested arrays TermFrequencyMap m; - spec.scoreDocument( obj, "english", "", false, &m ); + spec.scoreDocument( obj, + FTSLanguage::makeFTSLanguage( "english" ).getValue(), + "", + false, + &m ); ASSERT_EQUALS( 1U, m.size() ); } @@ -174,7 +210,11 @@ namespace mongo { // The wildcard spec implies a full recursive traversal, so "term" should be indexed. BSONObj obj = fromjson("{a: {b: [['term']]}}"); // directly nested arrays TermFrequencyMap m; - spec.scoreDocument( obj, "english", "", false, &m ); + spec.scoreDocument( obj, + FTSLanguage::makeFTSLanguage( "english" ).getValue(), + "", + false, + &m ); ASSERT_EQUALS( 1U, m.size() ); } @@ -186,7 +226,11 @@ namespace mongo { // indexed. 
BSONObj obj = fromjson("{a: {b: [['term']]}}"); // directly nested arrays TermFrequencyMap m; - spec.scoreDocument( obj, "english", "", false, &m ); + spec.scoreDocument( obj, + FTSLanguage::makeFTSLanguage( "english" ).getValue(), + "", + false, + &m ); ASSERT_EQUALS( 0U, m.size() ); } @@ -205,7 +249,11 @@ namespace mongo { " }" " }" ); - spec.scoreDocument( obj, "english", "", false, &tfm ); + spec.scoreDocument( obj, + FTSLanguage::makeFTSLanguage( "english" ).getValue(), + "", + false, + &tfm ); set<string> hits; hits.insert("walk"); @@ -236,7 +284,11 @@ namespace mongo { " }" "}" ); - spec.scoreDocument( obj, "english", "", false, &tfm ); + spec.scoreDocument( obj, + FTSLanguage::makeFTSLanguage( "english" ).getValue(), + "", + false, + &tfm ); set<string> hits; hits.insert("foredrag"); @@ -267,7 +319,11 @@ namespace mongo { " } ]" "}" ); - spec.scoreDocument( obj, "english", "", false, &tfm ); + spec.scoreDocument( obj, + FTSLanguage::makeFTSLanguage( "english" ).getValue(), + "", + false, + &tfm ); set<string> hits; hits.insert("foredrag"); @@ -300,7 +356,11 @@ namespace mongo { " }" "}" ); - spec.scoreDocument( obj, "english", "", false, &tfm ); + spec.scoreDocument( obj, + FTSLanguage::makeFTSLanguage( "english" ).getValue(), + "", + false, + &tfm ); set<string> hits; hits.insert("foredrag"); @@ -333,7 +393,11 @@ namespace mongo { " }" "}" ); - spec.scoreDocument( obj, "english", "", false, &tfm ); + spec.scoreDocument( obj, + FTSLanguage::makeFTSLanguage( "english" ).getValue(), + "", + false, + &tfm ); set<string> hits; hits.insert("foredrag"); @@ -368,7 +432,11 @@ namespace mongo { " }" "}" ); - spec.scoreDocument( obj, "english", "", false, &tfm ); + spec.scoreDocument( obj, + FTSLanguage::makeFTSLanguage( "english" ).getValue(), + "", + false, + &tfm ); set<string> hits; hits.insert("foredrag"); diff --git a/src/mongo/db/fts/stemmer.cpp b/src/mongo/db/fts/stemmer.cpp index 878bf1bf0dc..5c4431f9712 100644 --- a/src/mongo/db/fts/stemmer.cpp +++ 
b/src/mongo/db/fts/stemmer.cpp @@ -37,10 +37,10 @@ namespace mongo { namespace fts { - Stemmer::Stemmer( const string& language ) { + Stemmer::Stemmer( const FTSLanguage language ) { _stemmer = NULL; - if ( language != "none" ) - _stemmer = sb_stemmer_new(language.c_str(), "UTF_8"); + if ( language.str() != "none" ) + _stemmer = sb_stemmer_new(language.str().c_str(), "UTF_8"); } Stemmer::~Stemmer() { diff --git a/src/mongo/db/fts/stemmer.h b/src/mongo/db/fts/stemmer.h index 521f41490fa..9b06bda4f2e 100644 --- a/src/mongo/db/fts/stemmer.h +++ b/src/mongo/db/fts/stemmer.h @@ -34,6 +34,7 @@ #include <string> #include "mongo/base/string_data.h" +#include "mongo/db/fts/fts_language.h" #include "third_party/libstemmer_c/include/libstemmer.h" namespace mongo { @@ -47,7 +48,7 @@ namespace mongo { */ class Stemmer { public: - Stemmer( const std::string& language ); + Stemmer( const FTSLanguage language ); ~Stemmer(); std::string stem( const StringData& word ) const; diff --git a/src/mongo/db/fts/stemmer_test.cpp b/src/mongo/db/fts/stemmer_test.cpp index 8f8003fb3ee..1833f20fe37 100644 --- a/src/mongo/db/fts/stemmer_test.cpp +++ b/src/mongo/db/fts/stemmer_test.cpp @@ -37,18 +37,10 @@ namespace mongo { namespace fts { TEST( English, Stemmer1 ) { - Stemmer s( "english" ); + Stemmer s( FTSLanguage::makeFTSLanguage( "english" ).getValue() ); ASSERT_EQUALS( "run", s.stem( "running" ) ); ASSERT_EQUALS( "Run", s.stem( "Running" ) ); } - - TEST( English, Caps ) { - Stemmer s( "porter" ); - ASSERT_EQUALS( "unit", s.stem( "united" ) ); - ASSERT_EQUALS( "Unite", s.stem( "United" ) ); - } - - } } diff --git a/src/mongo/db/fts/stop_words.cpp b/src/mongo/db/fts/stop_words.cpp index ee22da9dad7..d858992f5ce 100644 --- a/src/mongo/db/fts/stop_words.cpp +++ b/src/mongo/db/fts/stop_words.cpp @@ -59,8 +59,8 @@ namespace mongo { _words.insert( *i ); } - const StopWords* StopWords::getStopWords( const std::string& langauge ) { - unordered_map<string,StopWords*>::const_iterator i = 
STOP_WORDS.find( langauge ); + const StopWords* StopWords::getStopWords( const FTSLanguage language ) { + unordered_map<string,StopWords*>::const_iterator i = STOP_WORDS.find( language.str() ); if ( i == STOP_WORDS.end() ) return empty; return i->second; diff --git a/src/mongo/db/fts/stop_words.h b/src/mongo/db/fts/stop_words.h index a2c9cf8ff69..24e433c6992 100644 --- a/src/mongo/db/fts/stop_words.h +++ b/src/mongo/db/fts/stop_words.h @@ -34,6 +34,7 @@ #include <set> #include <string> +#include "mongo/db/fts/fts_language.h" #include "mongo/platform/unordered_set.h" namespace mongo { @@ -51,7 +52,7 @@ namespace mongo { size_t numStopWords() const { return _words.size(); } - static const StopWords* getStopWords( const std::string& langauge ); + static const StopWords* getStopWords( const FTSLanguage langauge ); private: ~StopWords(){} unordered_set<std::string> _words; diff --git a/src/mongo/db/fts/stop_words_test.cpp b/src/mongo/db/fts/stop_words_test.cpp index a365480aa72..4d6b78f7f6b 100644 --- a/src/mongo/db/fts/stop_words_test.cpp +++ b/src/mongo/db/fts/stop_words_test.cpp @@ -35,9 +35,10 @@ namespace mongo { namespace fts { TEST( English, Basic1 ) { - const StopWords* english = StopWords::getStopWords( "english" ); - ASSERT( english->isStopWord( "the" ) ); - ASSERT( !english->isStopWord( "computer" ) ); + FTSLanguage language = FTSLanguage::makeFTSLanguage( "english" ).getValue(); + const StopWords* englishStopWords = StopWords::getStopWords( language ); + ASSERT( englishStopWords->isStopWord( "the" ) ); + ASSERT( !englishStopWords->isStopWord( "computer" ) ); } } diff --git a/src/mongo/db/fts/tokenizer.cpp b/src/mongo/db/fts/tokenizer.cpp index 050210a6e1b..1a25898bd75 100644 --- a/src/mongo/db/fts/tokenizer.cpp +++ b/src/mongo/db/fts/tokenizer.cpp @@ -37,9 +37,9 @@ namespace mongo { namespace fts { - Tokenizer::Tokenizer( const string& language, const StringData& str ) + Tokenizer::Tokenizer( const FTSLanguage language, const StringData& str ) : _pos(0), 
_raw( str ) { - _english = language == "english"; + _english = language.str() == "english"; _skipWhitespace(); _previousWhiteSpace = true; } diff --git a/src/mongo/db/fts/tokenizer.h b/src/mongo/db/fts/tokenizer.h index d820338b0b0..6930f7543f6 100644 --- a/src/mongo/db/fts/tokenizer.h +++ b/src/mongo/db/fts/tokenizer.h @@ -34,6 +34,7 @@ #include <string> #include "mongo/base/string_data.h" +#include "mongo/db/fts/fts_language.h" #include "mongo/platform/unordered_map.h" #include "mongo/platform/unordered_set.h" @@ -60,7 +61,7 @@ namespace mongo { class Tokenizer { public: - Tokenizer( const std::string& language, const StringData& str ); + Tokenizer( const FTSLanguage language, const StringData& str ); bool more() const; Token next(); diff --git a/src/mongo/db/fts/tokenizer_test.cpp b/src/mongo/db/fts/tokenizer_test.cpp index 5d60a50769e..eac91987c61 100644 --- a/src/mongo/db/fts/tokenizer_test.cpp +++ b/src/mongo/db/fts/tokenizer_test.cpp @@ -35,12 +35,14 @@ namespace mongo { namespace fts { TEST( Tokenizer, Empty1 ) { - Tokenizer i( "english", "" ); + Tokenizer i( FTSLanguage::makeFTSLanguage( "english" ).getValue(), + "" ); ASSERT( !i.more() ); } TEST( Tokenizer, Basic1 ) { - Tokenizer i( "english", "blue red green" ); + Tokenizer i( FTSLanguage::makeFTSLanguage( "english" ).getValue(), + "blue red green" ); ASSERT( i.more() ); ASSERT_EQUALS( i.next().data.toString(), "blue" ); @@ -55,7 +57,8 @@ namespace mongo { } TEST( Tokenizer, Basic2 ) { - Tokenizer i( "english", "blue-red" ); + Tokenizer i( FTSLanguage::makeFTSLanguage( "english" ).getValue(), + "blue-red" ); Token a = i.next(); Token b = i.next(); @@ -77,7 +80,8 @@ namespace mongo { } TEST( Tokenizer, Basic3 ) { - Tokenizer i( "english", "blue -red" ); + Tokenizer i( FTSLanguage::makeFTSLanguage( "english" ).getValue(), + "blue -red" ); Token a = i.next(); Token b = i.next(); @@ -104,7 +108,8 @@ namespace mongo { } TEST( Tokenizer, Quote1English ) { - Tokenizer i( "english", "eliot's car" ); + Tokenizer 
i( FTSLanguage::makeFTSLanguage( "english" ).getValue(), + "eliot's car" ); Token a = i.next(); Token b = i.next(); @@ -114,7 +119,8 @@ namespace mongo { } TEST( Tokenizer, Quote1French ) { - Tokenizer i( "french", "eliot's car" ); + Tokenizer i( FTSLanguage::makeFTSLanguage( "french" ).getValue(), + "eliot's car" ); Token a = i.next(); Token b = i.next(); diff --git a/src/mongo/db/matcher/expression_parser_text.cpp b/src/mongo/db/matcher/expression_parser_text.cpp index 45b44419197..1c36ad2ef44 100644 --- a/src/mongo/db/matcher/expression_parser_text.cpp +++ b/src/mongo/db/matcher/expression_parser_text.cpp @@ -30,6 +30,7 @@ #include "mongo/base/init.h" #include "mongo/db/client.h" +#include "mongo/db/fts/fts_language.h" #include "mongo/db/index/catalog_hack.h" #include "mongo/db/jsobj.h" #include "mongo/db/matcher/expression_parser.h" @@ -46,11 +47,19 @@ namespace mongo { return StatusWithMatchExpression( ErrorCodes::BadValue, "$search needs a String" ); } + string language = ""; BSONElement languageElt = queryObj["$language"]; - if ( !languageElt.eoo() && mongo::String != languageElt.type() ) { - return StatusWithMatchExpression( ErrorCodes::BadValue, "$language needs a String" ); + if ( !languageElt.eoo() ) { + if ( mongo::String != languageElt.type() ) { + return StatusWithMatchExpression( ErrorCodes::BadValue, + "$language needs a String" ); + } + language = languageElt.String(); + if ( !fts::FTSLanguage::makeFTSLanguage( language ).getStatus().isOK() ) { + return StatusWithMatchExpression( ErrorCodes::BadValue, + "$language specifies unsupported language" ); + } } - string language = ( !languageElt.eoo() ? languageElt.String() : "" ); string query = queryObj["$search"].String(); if ( queryObj.nFields() != ( languageElt.eoo() ? 
1 : 2 ) ) { diff --git a/src/mongo/db/matcher/expression_parser_text_test.cpp b/src/mongo/db/matcher/expression_parser_text_test.cpp index 5ce0523134c..b0d7166e5fe 100644 --- a/src/mongo/db/matcher/expression_parser_text_test.cpp +++ b/src/mongo/db/matcher/expression_parser_text_test.cpp @@ -39,7 +39,7 @@ namespace mongo { - TEST( MatchExpressionParserText, Text ) { + TEST( MatchExpressionParserText, Parse1 ) { BSONObj query = fromjson( "{$text:{$search:\"awesome\", $language:\"english\"}}" ); StatusWithMatchExpression result = MatchExpressionParser::parse( query ); @@ -52,4 +52,11 @@ namespace mongo { ASSERT_EQUALS( textExp->getQuery(), "awesome" ); ASSERT_EQUALS( textExp->getLanguage(), "english" ); } + + TEST( MatchExpressionParserText, Parse2 ) { + BSONObj query = fromjson( "{$text:{$search:\"awesome\", $language:\"spanglish\"}}" ); + + StatusWithMatchExpression result = MatchExpressionParser::parse( query ); + ASSERT_FALSE( result.isOK() ); + } } diff --git a/src/mongo/db/query/stage_builder.cpp b/src/mongo/db/query/stage_builder.cpp index 637fb1f3cf6..ed874a9f37c 100644 --- a/src/mongo/db/query/stage_builder.cpp +++ b/src/mongo/db/query/stage_builder.cpp @@ -215,7 +215,7 @@ namespace mongo { if (!s.isOK()) { return NULL; } string language = ("" == node->_language - ? fam->getSpec().defaultLanguage() + ? fam->getSpec().defaultLanguage().str() : node->_language); FTSQuery ftsq; |