summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJason Rassi <rassi@10gen.com>2013-11-13 03:46:19 -0500
committerJason Rassi <rassi@10gen.com>2013-11-13 03:46:19 -0500
commit075ff3c615cc9c53aab272af32a180d9f1ceda4a (patch)
treea392a303536389756834a3909f88516155ac013f
parentef9a27f0feaa44ad4f897e49a52c9b755cf3d496 (diff)
downloadmongo-075ff3c615cc9c53aab272af32a180d9f1ceda4a.tar.gz
SERVER-9932 SERVER-10857 Text search language aliases/validation
Users can now use two-letter language codes and can specify languages in mixed case.

The following operations will now fail if they do not pass language validation:
- Building a text index (if default_language fails validation or if the target collection contains a document with a language that fails validation).
- Inserting a document into a text-indexed collection (if it specifies a language that fails validation).
- A text query or text command (if it specifies a language that fails validation).
-rw-r--r--jstests/fts_mix.js7
-rw-r--r--jstests/fts_querylang.js10
-rw-r--r--jstests/fts_spanish.js17
-rw-r--r--src/mongo/db/exec/stagedebug_cmd.cpp2
-rw-r--r--src/mongo/db/fts/SConscript4
-rw-r--r--src/mongo/db/fts/fts_command_mongod.cpp2
-rw-r--r--src/mongo/db/fts/fts_language.cpp152
-rw-r--r--src/mongo/db/fts/fts_language.h92
-rw-r--r--src/mongo/db/fts/fts_language_test.cpp106
-rw-r--r--src/mongo/db/fts/fts_query.cpp9
-rw-r--r--src/mongo/db/fts/fts_query.h4
-rw-r--r--src/mongo/db/fts/fts_spec.cpp51
-rw-r--r--src/mongo/db/fts/fts_spec.h25
-rw-r--r--src/mongo/db/fts/fts_spec_test.cpp92
-rw-r--r--src/mongo/db/fts/stemmer.cpp6
-rw-r--r--src/mongo/db/fts/stemmer.h3
-rw-r--r--src/mongo/db/fts/stemmer_test.cpp10
-rw-r--r--src/mongo/db/fts/stop_words.cpp4
-rw-r--r--src/mongo/db/fts/stop_words.h3
-rw-r--r--src/mongo/db/fts/stop_words_test.cpp7
-rw-r--r--src/mongo/db/fts/tokenizer.cpp4
-rw-r--r--src/mongo/db/fts/tokenizer.h3
-rw-r--r--src/mongo/db/fts/tokenizer_test.cpp18
-rw-r--r--src/mongo/db/matcher/expression_parser_text.cpp15
-rw-r--r--src/mongo/db/matcher/expression_parser_text_test.cpp9
-rw-r--r--src/mongo/db/query/stage_builder.cpp2
26 files changed, 567 insertions, 90 deletions
diff --git a/jstests/fts_mix.js b/jstests/fts_mix.js
index 7acff093480..56da123cdc3 100644
--- a/jstests/fts_mix.js
+++ b/jstests/fts_mix.js
@@ -67,6 +67,13 @@ res = tc.runCommand( "text", { search: "magazine's" } );
res2 = tc.runCommand( "text", { search: "magazine" } );
assert.eq( getIDS( res ), getIDS( res2 ) );
+// -------------------------------------------- LANGUAGE -------------------------------------------
+
+res = tc.runCommand( "text", { search: "member", language: "spanglish" } );
+assert.commandFailed( res );
+res = tc.runCommand( "text", { search: "member", language: "english" } );
+assert.commandWorked( res );
+
// -------------------------------------------- LIMIT RESULTS --------------------------------------
// ensure limit limits results
diff --git a/jstests/fts_querylang.js b/jstests/fts_querylang.js
index ebc4e05c216..f03b1836853 100644
--- a/jstests/fts_querylang.js
+++ b/jstests/fts_querylang.js
@@ -66,7 +66,15 @@ assert.throws(function() { t.find({$or: [{$text: {$search: "content -irrelevant"
// TODO Test invalid inputs for $text, $search, $language.
-// TODO Test $language.
+// Test $language.
+cursor = t.find({$text: {$search: "contents", $language: "none"}});
+assert.eq(false, cursor.hasNext());
+
+cursor = t.find({$text: {$search: "contents", $language: "EN"}});
+assert.eq(true, cursor.hasNext());
+
+cursor = t.find({$text: {$search: "contents", $language: "spanglish"}});
+assert.throws(function() { cursor.next() });
// TODO Test $and of basic text query with geo expression.
diff --git a/jstests/fts_spanish.js b/jstests/fts_spanish.js
index 136eaf17ae1..cdf73343b5f 100644
--- a/jstests/fts_spanish.js
+++ b/jstests/fts_spanish.js
@@ -7,26 +7,25 @@ t.drop();
t.save( { _id: 1, title: "mi blog", text: "Este es un blog de prueba" } );
t.save( { _id: 2, title: "mi segundo post", text: "Este es un blog de prueba" } );
t.save( { _id: 3, title: "cuchillos son divertidos", text: "este es mi tercer blog stemmed" } );
-t.save( { _id: 4, language: "english", title: "My fourth blog", text: "This stemmed blog is in english" } );
+t.save( { _id: 4, language: "en", title: "My fourth blog", text: "This stemmed blog is in english" } );
// default weight is 1
// specify weights if you want a field to be more meaningful
t.ensureIndex( { "title": "text", text: "text" }, { weights: { title: 10 },
- default_language: "spanish" } );
+ default_language: "es" } );
res = t.runCommand( "text", { search: "blog" } );
assert.eq( 4, res.results.length );
assert.eq( [4], queryIDS( t, "stem" ) );
assert.eq( [3], queryIDS( t, "stemmed" ) );
-assert.eq( [4], queryIDS( t, "stemmed", null, { language : "english" } ) );
+assert.eq( [4], queryIDS( t, "stemmed", null, { language : "en" } ) );
assert.eq( [1,2], queryIDS( t, "prueba" ) );
+t.save( { _id: 5, language: "spanglish", title: "", text: "" } );
+assert( db.getLastError() );
-
-
-
-
-
-
+t.dropIndexes();
+t.ensureIndex( { "title": "text", text: "text" }, { default_language: "spanglish" } );
+assert( db.getLastError() );
diff --git a/src/mongo/db/exec/stagedebug_cmd.cpp b/src/mongo/db/exec/stagedebug_cmd.cpp
index 261baf036c4..4c43a7b4e0f 100644
--- a/src/mongo/db/exec/stagedebug_cmd.cpp
+++ b/src/mongo/db/exec/stagedebug_cmd.cpp
@@ -375,7 +375,7 @@ namespace mongo {
params.spec = fam->getSpec();
- if (!params.query.parse(search, fam->getSpec().defaultLanguage()).isOK()) {
+ if (!params.query.parse(search, fam->getSpec().defaultLanguage().str()).isOK()) {
return NULL;
}
diff --git a/src/mongo/db/fts/SConscript b/src/mongo/db/fts/SConscript
index 369c8f7892d..e1a1ebc2ec1 100644
--- a/src/mongo/db/fts/SConscript
+++ b/src/mongo/db/fts/SConscript
@@ -29,6 +29,7 @@ env.StaticLibrary('base', [
'fts_matcher.cpp',
'fts_query.cpp',
'fts_spec.cpp',
+ 'fts_language.cpp',
'fts_util.cpp',
'stemmer.cpp',
'stop_words.cpp',
@@ -74,6 +75,9 @@ env.CppUnitTest( "fts_query_test", "fts_query_test.cpp",
env.CppUnitTest( "fts_spec_test", "fts_spec_test.cpp",
LIBDEPS=["base"] )
+env.CppUnitTest( "fts_language_test", "fts_language_test.cpp",
+ LIBDEPS=["base"] )
+
env.CppUnitTest( "fts_matcher_test", "fts_matcher_test.cpp",
LIBDEPS=["base"] )
diff --git a/src/mongo/db/fts/fts_command_mongod.cpp b/src/mongo/db/fts/fts_command_mongod.cpp
index 2a95e79038b..74022b7b9d4 100644
--- a/src/mongo/db/fts/fts_command_mongod.cpp
+++ b/src/mongo/db/fts/fts_command_mongod.cpp
@@ -107,7 +107,7 @@ namespace mongo {
IndexDescriptor* descriptor = collection->getIndexCatalog()->getDescriptor(idxMatches[0]);
auto_ptr<FTSAccessMethod> fam(new FTSAccessMethod(descriptor));
if ( language == "" ) {
- language = fam->getSpec().defaultLanguage();
+ language = fam->getSpec().defaultLanguage().str();
}
Status s = fam->getSpec().getIndexPrefix( filter, &indexPrefix );
if ( !s.isOK() ) {
diff --git a/src/mongo/db/fts/fts_language.cpp b/src/mongo/db/fts/fts_language.cpp
new file mode 100644
index 00000000000..61bd33cf93f
--- /dev/null
+++ b/src/mongo/db/fts/fts_language.cpp
@@ -0,0 +1,152 @@
+// fts_language.cpp
+
+/**
+ * Copyright (C) 2013 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/db/fts/fts_language.h"
+
+#include <string>
+
+#include "mongo/base/init.h"
+#include "mongo/util/assert_util.h"
+#include "mongo/util/string_map.h"
+#include "mongo/util/stringutils.h"
+
+namespace mongo {
+
+ namespace fts {
+
+ namespace {
+
+ // Supported languages in canonical form (English names, lowercased). Includes "none".
+ const string LanguageNone( "none" );
+ const string LanguageDanish( "danish" );
+ const string LanguageDutch( "dutch" );
+ const string LanguageEnglish( "english" );
+ const string LanguageFinnish( "finnish" );
+ const string LanguageFrench( "french" );
+ const string LanguageGerman( "german" );
+ const string LanguageHungarian( "hungarian" );
+ const string LanguageItalian( "italian" );
+ const string LanguageNorwegian( "norwegian" );
+ const string LanguagePortuguese( "portuguese" );
+ const string LanguageRomanian( "romanian" );
+ const string LanguageRussian( "russian" );
+ const string LanguageSpanish( "spanish" );
+ const string LanguageSwedish( "swedish" );
+ const string LanguageTurkish( "turkish" );
+
+ // Map from lowercased user string to language string. Resolves any language aliases
+ // (two-letter codes).
+ typedef StringMap<std::string> LanguageMap;
+ LanguageMap languageMap;
+ }
+
+ MONGO_INITIALIZER( FTSLanguageMap )( InitializerContext* context ) {
+ languageMap[LanguageNone] = LanguageNone;
+
+ languageMap["da"] = LanguageDanish;
+ languageMap[LanguageDanish] = LanguageDanish;
+ languageMap["nl"] = LanguageDutch;
+ languageMap[LanguageDutch] = LanguageDutch;
+ languageMap["en"] = LanguageEnglish;
+ languageMap[LanguageEnglish] = LanguageEnglish;
+ languageMap["fi"] = LanguageFinnish;
+ languageMap[LanguageFinnish] = LanguageFinnish;
+ languageMap["fr"] = LanguageFrench;
+ languageMap[LanguageFrench] = LanguageFrench;
+ languageMap["de"] = LanguageGerman;
+ languageMap[LanguageGerman] = LanguageGerman;
+ languageMap["hu"] = LanguageHungarian;
+ languageMap[LanguageHungarian] = LanguageHungarian;
+ languageMap["it"] = LanguageItalian;
+ languageMap[LanguageItalian] = LanguageItalian;
+ languageMap["nb"] = LanguageNorwegian;
+ languageMap[LanguageNorwegian] = LanguageNorwegian;
+ languageMap["pt"] = LanguagePortuguese;
+ languageMap[LanguagePortuguese] = LanguagePortuguese;
+ languageMap["ro"] = LanguageRomanian;
+ languageMap[LanguageRomanian] = LanguageRomanian;
+ languageMap["ru"] = LanguageRussian;
+ languageMap[LanguageRussian] = LanguageRussian;
+ languageMap["es"] = LanguageSpanish;
+ languageMap[LanguageSpanish] = LanguageSpanish;
+ languageMap["sv"] = LanguageSwedish;
+ languageMap[LanguageSwedish] = LanguageSwedish;
+ languageMap["tr"] = LanguageTurkish;
+ languageMap[LanguageTurkish] = LanguageTurkish;
+ return Status::OK();
+ }
+
+ FTSLanguage::FTSLanguage()
+ : _lang() {
+ }
+
+ FTSLanguage::FTSLanguage( const FTSLanguage& other )
+ : _lang( other._lang ) {
+ }
+
+ FTSLanguage& FTSLanguage::operator=( const FTSLanguage& other ) {
+ _lang = other._lang;
+ return *this;
+ }
+
+ FTSLanguage::~FTSLanguage() {
+ }
+
+ Status FTSLanguage::init( const std::string& lang ) {
+ // Lowercase.
+ std::string langLower = tolowerString( lang );
+
+ // Resolve language aliases.
+ LanguageMap::const_iterator it = languageMap.find( langLower );
+ if ( it == languageMap.end() ) {
+ return Status( ErrorCodes::BadValue,
+ "unsupported language: \"" + lang + "\"" );
+ }
+
+ _lang = StringData( it->second );
+ return Status::OK();
+ }
+
+ std::string FTSLanguage::str() const {
+ verify( !_lang.empty() );
+ return _lang.toString();
+ }
+
+ StatusWithFTSLanguage FTSLanguage::makeFTSLanguage( const std::string& lang ) {
+ FTSLanguage language;
+ Status s = language.init( lang );
+ if ( !s.isOK() ) {
+ return StatusWithFTSLanguage( s );
+ }
+ return StatusWithFTSLanguage( language );
+ }
+
+ }
+}
diff --git a/src/mongo/db/fts/fts_language.h b/src/mongo/db/fts/fts_language.h
new file mode 100644
index 00000000000..9a1d7053ea8
--- /dev/null
+++ b/src/mongo/db/fts/fts_language.h
@@ -0,0 +1,92 @@
+// fts_language.h
+
+/**
+ * Copyright (C) 2013 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#pragma once
+
+#include "mongo/base/status_with.h"
+
+#include <string>
+
+namespace mongo {
+
+ namespace fts {
+
+ /**
+ * An FTSLanguage is a copyable glorified enum representing a language for a text-indexed
+ * document or a text search. Example of suggested usage:
+ *
+ * StatusWithFTSLanguage swl = FTSLanguage::makeFTSLanguage( "en" );
+ * if ( !swl.getStatus().isOK() ) {
+ * // Error.
+ * }
+ * else {
+ * const FTSLanguage language = swl.getValue();
+ * // Use language.
+ * }
+ */
+ class FTSLanguage {
+ public:
+ /** Create an uninitialized language. */
+ FTSLanguage();
+
+ ~FTSLanguage();
+ FTSLanguage( const FTSLanguage& );
+ FTSLanguage& operator=( const FTSLanguage & );
+
+ /**
+ * Initialize an FTSLanguage from a language string. Language strings are
+ * case-insensitive, and can be in one of the two following forms:
+ * - English name, like "spanish".
+ * - Two-letter code, like "es".
+ * Returns an error Status if an invalid language string is passed.
+ */
+ Status init( const std::string& lang );
+
+ /**
+ * Returns the language as a string in canonical form (lowercased English name). It is
+ * an error to call str() on an uninitialized language.
+ */
+ std::string str() const;
+
+ /**
+ * Convenience method for creating an FTSLanguage out of a language string. Caller
+ * must check getStatus().isOK() on return value.
+ */
+ static StatusWith<const FTSLanguage> makeFTSLanguage( const std::string& lang );
+
+ private:
+ // Pointer to string representation of language. Not owned here.
+ StringData _lang;
+ };
+
+ typedef StatusWith<const FTSLanguage> StatusWithFTSLanguage;
+
+ }
+}
diff --git a/src/mongo/db/fts/fts_language_test.cpp b/src/mongo/db/fts/fts_language_test.cpp
new file mode 100644
index 00000000000..5fdd9a4aa73
--- /dev/null
+++ b/src/mongo/db/fts/fts_language_test.cpp
@@ -0,0 +1,106 @@
+// fts_language_test.cpp
+
+/**
+ * Copyright (C) 2013 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/pch.h"
+#include "mongo/db/fts/fts_language.h"
+#include "mongo/unittest/unittest.h"
+
+namespace mongo {
+
+ namespace fts {
+
+ // Positive tests for FTSLanguage::init() and FTSLanguage::str().
+
+ TEST( FTSLanguage, ExactLanguage ) {
+ FTSLanguage lang;
+ Status s = lang.init( "spanish" );
+ ASSERT( s.isOK() );
+ ASSERT_EQUALS( lang.str(), "spanish" );
+ }
+
+ TEST( FTSLanguage, ExactCode ) {
+ FTSLanguage lang;
+ Status s = lang.init( "es" );
+ ASSERT( s.isOK() );
+ ASSERT_EQUALS( lang.str(), "spanish" );
+ }
+
+ TEST( FTSLanguage, UpperCaseLanguage ) {
+ FTSLanguage lang;
+ Status s = lang.init( "SPANISH" );
+ ASSERT( s.isOK() );
+ ASSERT_EQUALS( lang.str(), "spanish" );
+ }
+
+ TEST( FTSLanguage, UpperCaseCode ) {
+ FTSLanguage lang;
+ Status s = lang.init( "ES" );
+ ASSERT( s.isOK() );
+ ASSERT_EQUALS( lang.str(), "spanish" );
+ }
+
+ TEST( FTSLanguage, NoneLanguage ) {
+ FTSLanguage lang;
+ Status s = lang.init( "none" );
+ ASSERT( s.isOK() );
+ ASSERT_EQUALS( lang.str(), "none" );
+ }
+
+ // Negative tests for FTSLanguage::init().
+
+ TEST( FTSLanguage, Unknown ) {
+ FTSLanguage lang;
+ Status s = lang.init( "spanglish" );
+ ASSERT( !s.isOK() );
+ }
+
+ TEST( FTSLanguage, Empty ) {
+ FTSLanguage lang;
+ Status s = lang.init( "" );
+ ASSERT( !s.isOK() );
+ }
+
+ // Positive tests for FTSLanguage::makeFTSLanguage().
+
+ TEST( FTSLanguage, MakeFTSLanguage1 ) {
+ StatusWithFTSLanguage swl = FTSLanguage::makeFTSLanguage( "english" );
+ ASSERT( swl.getStatus().isOK() );
+ ASSERT_EQUALS( swl.getValue().str(), "english" );
+ }
+
+ // Negative tests for FTSLanguage::makeFTSLanguage().
+
+ TEST( FTSLanguage, MakeFTSLanguage2 ) {
+ StatusWithFTSLanguage swl = FTSLanguage::makeFTSLanguage( "onglish" );
+ ASSERT( !swl.getStatus().isOK() );
+ }
+
+ }
+}
diff --git a/src/mongo/db/fts/fts_query.cpp b/src/mongo/db/fts/fts_query.cpp
index b03871cc9dd..2f5e215ce64 100644
--- a/src/mongo/db/fts/fts_query.cpp
+++ b/src/mongo/db/fts/fts_query.cpp
@@ -43,10 +43,13 @@ namespace mongo {
Status FTSQuery::parse(const string& query, const string& language) {
_search = query;
- _language = language;
+ Status status = _language.init( language );
+ if ( !status.isOK() ) {
+ return status;
+ }
- const StopWords* stopWords = StopWords::getStopWords( language );
- Stemmer stemmer( language );
+ const StopWords* stopWords = StopWords::getStopWords( _language );
+ Stemmer stemmer( _language );
bool inNegation = false;
bool inPhrase = false;
diff --git a/src/mongo/db/fts/fts_query.h b/src/mongo/db/fts/fts_query.h
index 1bb3a9e9e17..4eec8d404c8 100644
--- a/src/mongo/db/fts/fts_query.h
+++ b/src/mongo/db/fts/fts_query.h
@@ -69,7 +69,7 @@ namespace mongo {
}
string getSearch() const { return _search; }
- string getLanguage() const { return _language; }
+ const FTSLanguage getLanguage() const { return _language; }
string toString() const;
@@ -77,7 +77,7 @@ namespace mongo {
protected:
string _search;
- string _language;
+ FTSLanguage _language;
vector<string> _terms;
unordered_set<string> _negatedTerms;
vector<string> _phrases;
diff --git a/src/mongo/db/fts/fts_spec.cpp b/src/mongo/db/fts/fts_spec.cpp
index d3339078628..6f88df3534d 100644
--- a/src/mongo/db/fts/fts_spec.cpp
+++ b/src/mongo/db/fts/fts_spec.cpp
@@ -45,15 +45,19 @@ namespace mongo {
const double MAX_WEIGHT = 1000000000;
const double MAX_WORD_WEIGHT = MAX_WEIGHT / 10000;
+ namespace {
+ // Default language. Used for new indexes.
+ const std::string moduleDefaultLanguage( "english" );
+ }
+
FTSSpec::FTSSpec( const BSONObj& indexInfo ) {
massert( 16739, "found invalid spec for text index",
indexInfo["weights"].isABSONObj() );
- _defaultLanguage = indexInfo["default_language"].valuestrsafe();
- _languageOverrideField = indexInfo["language_override"].valuestrsafe();
+ Status status = _defaultLanguage.init( indexInfo["default_language"].String() );
+ verify( status.isOK() );
- if ( _defaultLanguage.size() == 0 )
- _defaultLanguage = "english";
+ _languageOverrideField = indexInfo["language_override"].valuestrsafe();
if ( _languageOverrideField.size() == 0 )
_languageOverrideField = "language";
@@ -103,15 +107,20 @@ namespace mongo {
}
}
- string FTSSpec::getLanguageToUse( const BSONObj& userDoc,
- const string& currentLanguage ) const {
+ const FTSLanguage FTSSpec::getLanguageToUse( const BSONObj& userDoc,
+ const FTSLanguage currentLanguage ) const {
BSONElement e = userDoc[_languageOverrideField];
- if ( e.type() == String ) {
- const char * x = e.valuestrsafe();
- if ( strlen( x ) > 0 )
- return x;
+ if ( e.eoo() ) {
+ return currentLanguage;
}
- return currentLanguage;
+ uassert( 17261,
+ "found language override field in document with non-string type",
+ e.type() == mongo::String );
+ StatusWithFTSLanguage swl = FTSLanguage::makeFTSLanguage( e.String() );
+ uassert( 17262,
+ "language override unsupported: " + e.String(),
+ swl.getStatus().isOK() );
+ return swl.getValue();
}
@@ -129,11 +138,11 @@ namespace mongo {
}
void FTSSpec::scoreDocument( const BSONObj& obj,
- const string& parentLanguage,
+ const FTSLanguage parentLanguage,
const string& parentPath,
bool isArray,
TermFrequencyMap* term_freqs ) const {
- string language = getLanguageToUse( obj, parentLanguage );
+ const FTSLanguage language = getLanguageToUse( obj, parentLanguage );
Stemmer stemmer( language );
Tools tools( language, &stemmer, StopWords::getStopWords( language ) );
@@ -374,9 +383,19 @@ namespace mongo {
weights = b.obj();
}
- string default_language(spec.getStringField("default_language"));
- if ( default_language.empty() )
- default_language = "english";
+ BSONElement default_language_elt = spec["default_language"];
+ string default_language( default_language_elt.str() );
+ if ( default_language_elt.eoo() ) {
+ default_language = moduleDefaultLanguage;
+ }
+ else {
+ uassert( 17263,
+ "default_language needs a string type",
+ default_language_elt.type() == String );
+ }
+ uassert( 17264,
+ "default_language is not valid",
+ FTSLanguage::makeFTSLanguage( default_language ).getStatus().isOK() );
string language_override(spec.getStringField("language_override"));
if ( language_override.empty() )
diff --git a/src/mongo/db/fts/fts_spec.h b/src/mongo/db/fts/fts_spec.h
index d13281eac9f..258ecf7407a 100644
--- a/src/mongo/db/fts/fts_spec.h
+++ b/src/mongo/db/fts/fts_spec.h
@@ -34,6 +34,7 @@
#include <vector>
#include <string>
+#include "mongo/db/fts/fts_language.h"
#include "mongo/db/fts/fts_util.h"
#include "mongo/db/fts/stemmer.h"
#include "mongo/db/fts/stop_words.h"
@@ -54,14 +55,14 @@ namespace mongo {
class FTSSpec {
struct Tools {
- Tools( string _language,
+ Tools( const FTSLanguage _language,
const Stemmer* _stemmer,
const StopWords* _stopwords )
: language( _language )
, stemmer( _stemmer )
, stopwords( _stopwords ) {}
- const std::string& language;
+ const FTSLanguage language;
const Stemmer* stemmer;
const StopWords* stopwords;
};
@@ -70,7 +71,7 @@ namespace mongo {
FTSSpec( const BSONObj& indexInfo );
bool wildcard() const { return _wildcard; }
- const string& defaultLanguage() const { return _defaultLanguage; }
+ const FTSLanguage defaultLanguage() const { return _defaultLanguage; }
const string& languageOverrideField() const { return _languageOverrideField; }
size_t numExtraBefore() const { return _extraBefore.size(); }
@@ -80,13 +81,6 @@ namespace mongo {
const std::string& extraAfter( unsigned i ) const { return _extraAfter[i]; }
/**
- * Find a "language" field, if any, in a given BSON doc. If the language is not on the
- * list of valid languages, return current.
- */
- string getLanguageToUse( const BSONObj& userDoc,
- const std::string& currentLanguage ) const;
-
- /**
* Calculates term/score pairs for a BSONObj as applied to this spec.
* - "obj": the BSONObj to traverse; can be a subdocument or array
* - "parentLanguage": nearest enclosing document "language" spec for obj
@@ -95,7 +89,7 @@ namespace mongo {
* - "term_freqs": out-parameter to store results
*/
void scoreDocument( const BSONObj& obj,
- const string& parentLanguage,
+ const FTSLanguage parentLanguage,
const string& parentPath,
bool isArray,
TermFrequencyMap* term_freqs ) const;
@@ -109,12 +103,19 @@ namespace mongo {
static BSONObj fixSpec( const BSONObj& spec );
private:
+ /**
+ * Get the language override for the given BSON doc. If no language override is
+ * specified, returns currentLanguage.
+ */
+ const FTSLanguage getLanguageToUse( const BSONObj& userDoc,
+ const FTSLanguage currentLanguage ) const;
+
void _scoreString( const Tools& tools,
const StringData& raw,
TermFrequencyMap* term_freqs,
double weight ) const;
- string _defaultLanguage;
+ FTSLanguage _defaultLanguage;
string _languageOverrideField;
bool _wildcard;
diff --git a/src/mongo/db/fts/fts_spec_test.cpp b/src/mongo/db/fts/fts_spec_test.cpp
index b8bb292c777..1cbc9e61eeb 100644
--- a/src/mongo/db/fts/fts_spec_test.cpp
+++ b/src/mongo/db/fts/fts_spec_test.cpp
@@ -47,6 +47,29 @@ namespace mongo {
ASSERT_EQUALS( fixed, fixed2 );
}
+ TEST( FTSSpec, DefaultLanguage1 ) {
+ BSONObj user = BSON( "key" << BSON( "text" << "fts" ) <<
+ "default_language" << "spanish" );
+
+ try {
+ BSONObj fixed = FTSSpec::fixSpec( user );
+ }
+ catch ( UserException& e ) {
+ ASSERT(false);
+ }
+ }
+
+ TEST( FTSSpec, DefaultLanguage2 ) {
+ BSONObj user = BSON( "key" << BSON( "text" << "fts" ) <<
+ "default_language" << "spanglish" );
+
+ try {
+ BSONObj fixed = FTSSpec::fixSpec( user );
+ ASSERT(false);
+ }
+ catch ( UserException& e ) {}
+ }
+
TEST( FTSSpec, ScoreSingleField1 ) {
BSONObj user = BSON( "key" << BSON( "title" << "fts" <<
"text" << "fts" ) <<
@@ -56,7 +79,10 @@ namespace mongo {
TermFrequencyMap m;
spec.scoreDocument( BSON( "title" << "cat sat run" ),
- "english", "", false, &m );
+ FTSLanguage::makeFTSLanguage( "english" ).getValue(),
+ "",
+ false,
+ &m );
ASSERT_EQUALS( 3U, m.size() );
ASSERT_EQUALS( m["cat"], m["sat"] );
ASSERT_EQUALS( m["cat"], m["run"] );
@@ -72,7 +98,10 @@ namespace mongo {
TermFrequencyMap m;
spec.scoreDocument( BSON( "title" << "cat sat run" << "text" << "cat book" ),
- "english", "", false, &m );
+ FTSLanguage::makeFTSLanguage( "english" ).getValue(),
+ "",
+ false,
+ &m );
ASSERT_EQUALS( 4U, m.size() );
ASSERT_EQUALS( m["sat"], m["run"] );
@@ -94,7 +123,10 @@ namespace mongo {
TermFrequencyMap m;
spec.scoreDocument( BSON( "title" << "cat sat sat run run run" ),
- "english", "", false, &m );
+ FTSLanguage::makeFTSLanguage( "english" ).getValue(),
+ "",
+ false,
+ &m );
ASSERT_EQUALS( 3U, m.size() );
ASSERT( m["cat"] > 0 );
ASSERT( m["sat"] > m["cat"] );
@@ -163,7 +195,11 @@ namespace mongo {
// The following document matches {"a.b": {$type: 2}}, so "term" should be indexed.
BSONObj obj = fromjson("{a: [{b: ['term']}]}"); // indirectly nested arrays
TermFrequencyMap m;
- spec.scoreDocument( obj, "english", "", false, &m );
+ spec.scoreDocument( obj,
+ FTSLanguage::makeFTSLanguage( "english" ).getValue(),
+ "",
+ false,
+ &m );
ASSERT_EQUALS( 1U, m.size() );
}
@@ -174,7 +210,11 @@ namespace mongo {
// The wildcard spec implies a full recursive traversal, so "term" should be indexed.
BSONObj obj = fromjson("{a: {b: [['term']]}}"); // directly nested arrays
TermFrequencyMap m;
- spec.scoreDocument( obj, "english", "", false, &m );
+ spec.scoreDocument( obj,
+ FTSLanguage::makeFTSLanguage( "english" ).getValue(),
+ "",
+ false,
+ &m );
ASSERT_EQUALS( 1U, m.size() );
}
@@ -186,7 +226,11 @@ namespace mongo {
// indexed.
BSONObj obj = fromjson("{a: {b: [['term']]}}"); // directly nested arrays
TermFrequencyMap m;
- spec.scoreDocument( obj, "english", "", false, &m );
+ spec.scoreDocument( obj,
+ FTSLanguage::makeFTSLanguage( "english" ).getValue(),
+ "",
+ false,
+ &m );
ASSERT_EQUALS( 0U, m.size() );
}
@@ -205,7 +249,11 @@ namespace mongo {
" }"
" }" );
- spec.scoreDocument( obj, "english", "", false, &tfm );
+ spec.scoreDocument( obj,
+ FTSLanguage::makeFTSLanguage( "english" ).getValue(),
+ "",
+ false,
+ &tfm );
set<string> hits;
hits.insert("walk");
@@ -236,7 +284,11 @@ namespace mongo {
" }"
"}" );
- spec.scoreDocument( obj, "english", "", false, &tfm );
+ spec.scoreDocument( obj,
+ FTSLanguage::makeFTSLanguage( "english" ).getValue(),
+ "",
+ false,
+ &tfm );
set<string> hits;
hits.insert("foredrag");
@@ -267,7 +319,11 @@ namespace mongo {
" } ]"
"}" );
- spec.scoreDocument( obj, "english", "", false, &tfm );
+ spec.scoreDocument( obj,
+ FTSLanguage::makeFTSLanguage( "english" ).getValue(),
+ "",
+ false,
+ &tfm );
set<string> hits;
hits.insert("foredrag");
@@ -300,7 +356,11 @@ namespace mongo {
" }"
"}" );
- spec.scoreDocument( obj, "english", "", false, &tfm );
+ spec.scoreDocument( obj,
+ FTSLanguage::makeFTSLanguage( "english" ).getValue(),
+ "",
+ false,
+ &tfm );
set<string> hits;
hits.insert("foredrag");
@@ -333,7 +393,11 @@ namespace mongo {
" }"
"}" );
- spec.scoreDocument( obj, "english", "", false, &tfm );
+ spec.scoreDocument( obj,
+ FTSLanguage::makeFTSLanguage( "english" ).getValue(),
+ "",
+ false,
+ &tfm );
set<string> hits;
hits.insert("foredrag");
@@ -368,7 +432,11 @@ namespace mongo {
" }"
"}" );
- spec.scoreDocument( obj, "english", "", false, &tfm );
+ spec.scoreDocument( obj,
+ FTSLanguage::makeFTSLanguage( "english" ).getValue(),
+ "",
+ false,
+ &tfm );
set<string> hits;
hits.insert("foredrag");
diff --git a/src/mongo/db/fts/stemmer.cpp b/src/mongo/db/fts/stemmer.cpp
index 878bf1bf0dc..5c4431f9712 100644
--- a/src/mongo/db/fts/stemmer.cpp
+++ b/src/mongo/db/fts/stemmer.cpp
@@ -37,10 +37,10 @@ namespace mongo {
namespace fts {
- Stemmer::Stemmer( const string& language ) {
+ Stemmer::Stemmer( const FTSLanguage language ) {
_stemmer = NULL;
- if ( language != "none" )
- _stemmer = sb_stemmer_new(language.c_str(), "UTF_8");
+ if ( language.str() != "none" )
+ _stemmer = sb_stemmer_new(language.str().c_str(), "UTF_8");
}
Stemmer::~Stemmer() {
diff --git a/src/mongo/db/fts/stemmer.h b/src/mongo/db/fts/stemmer.h
index 521f41490fa..9b06bda4f2e 100644
--- a/src/mongo/db/fts/stemmer.h
+++ b/src/mongo/db/fts/stemmer.h
@@ -34,6 +34,7 @@
#include <string>
#include "mongo/base/string_data.h"
+#include "mongo/db/fts/fts_language.h"
#include "third_party/libstemmer_c/include/libstemmer.h"
namespace mongo {
@@ -47,7 +48,7 @@ namespace mongo {
*/
class Stemmer {
public:
- Stemmer( const std::string& language );
+ Stemmer( const FTSLanguage language );
~Stemmer();
std::string stem( const StringData& word ) const;
diff --git a/src/mongo/db/fts/stemmer_test.cpp b/src/mongo/db/fts/stemmer_test.cpp
index 8f8003fb3ee..1833f20fe37 100644
--- a/src/mongo/db/fts/stemmer_test.cpp
+++ b/src/mongo/db/fts/stemmer_test.cpp
@@ -37,18 +37,10 @@ namespace mongo {
namespace fts {
TEST( English, Stemmer1 ) {
- Stemmer s( "english" );
+ Stemmer s( FTSLanguage::makeFTSLanguage( "english" ).getValue() );
ASSERT_EQUALS( "run", s.stem( "running" ) );
ASSERT_EQUALS( "Run", s.stem( "Running" ) );
}
-
- TEST( English, Caps ) {
- Stemmer s( "porter" );
- ASSERT_EQUALS( "unit", s.stem( "united" ) );
- ASSERT_EQUALS( "Unite", s.stem( "United" ) );
- }
-
-
}
}
diff --git a/src/mongo/db/fts/stop_words.cpp b/src/mongo/db/fts/stop_words.cpp
index ee22da9dad7..d858992f5ce 100644
--- a/src/mongo/db/fts/stop_words.cpp
+++ b/src/mongo/db/fts/stop_words.cpp
@@ -59,8 +59,8 @@ namespace mongo {
_words.insert( *i );
}
- const StopWords* StopWords::getStopWords( const std::string& langauge ) {
- unordered_map<string,StopWords*>::const_iterator i = STOP_WORDS.find( langauge );
+ const StopWords* StopWords::getStopWords( const FTSLanguage language ) {
+ unordered_map<string,StopWords*>::const_iterator i = STOP_WORDS.find( language.str() );
if ( i == STOP_WORDS.end() )
return empty;
return i->second;
diff --git a/src/mongo/db/fts/stop_words.h b/src/mongo/db/fts/stop_words.h
index a2c9cf8ff69..24e433c6992 100644
--- a/src/mongo/db/fts/stop_words.h
+++ b/src/mongo/db/fts/stop_words.h
@@ -34,6 +34,7 @@
#include <set>
#include <string>
+#include "mongo/db/fts/fts_language.h"
#include "mongo/platform/unordered_set.h"
namespace mongo {
@@ -51,7 +52,7 @@ namespace mongo {
size_t numStopWords() const { return _words.size(); }
- static const StopWords* getStopWords( const std::string& langauge );
+ static const StopWords* getStopWords( const FTSLanguage langauge );
private:
~StopWords(){}
unordered_set<std::string> _words;
diff --git a/src/mongo/db/fts/stop_words_test.cpp b/src/mongo/db/fts/stop_words_test.cpp
index a365480aa72..4d6b78f7f6b 100644
--- a/src/mongo/db/fts/stop_words_test.cpp
+++ b/src/mongo/db/fts/stop_words_test.cpp
@@ -35,9 +35,10 @@ namespace mongo {
namespace fts {
TEST( English, Basic1 ) {
- const StopWords* english = StopWords::getStopWords( "english" );
- ASSERT( english->isStopWord( "the" ) );
- ASSERT( !english->isStopWord( "computer" ) );
+ FTSLanguage language = FTSLanguage::makeFTSLanguage( "english" ).getValue();
+ const StopWords* englishStopWords = StopWords::getStopWords( language );
+ ASSERT( englishStopWords->isStopWord( "the" ) );
+ ASSERT( !englishStopWords->isStopWord( "computer" ) );
}
}
diff --git a/src/mongo/db/fts/tokenizer.cpp b/src/mongo/db/fts/tokenizer.cpp
index 050210a6e1b..1a25898bd75 100644
--- a/src/mongo/db/fts/tokenizer.cpp
+++ b/src/mongo/db/fts/tokenizer.cpp
@@ -37,9 +37,9 @@ namespace mongo {
namespace fts {
- Tokenizer::Tokenizer( const string& language, const StringData& str )
+ Tokenizer::Tokenizer( const FTSLanguage language, const StringData& str )
: _pos(0), _raw( str ) {
- _english = language == "english";
+ _english = language.str() == "english";
_skipWhitespace();
_previousWhiteSpace = true;
}
diff --git a/src/mongo/db/fts/tokenizer.h b/src/mongo/db/fts/tokenizer.h
index d820338b0b0..6930f7543f6 100644
--- a/src/mongo/db/fts/tokenizer.h
+++ b/src/mongo/db/fts/tokenizer.h
@@ -34,6 +34,7 @@
#include <string>
#include "mongo/base/string_data.h"
+#include "mongo/db/fts/fts_language.h"
#include "mongo/platform/unordered_map.h"
#include "mongo/platform/unordered_set.h"
@@ -60,7 +61,7 @@ namespace mongo {
class Tokenizer {
public:
- Tokenizer( const std::string& language, const StringData& str );
+ Tokenizer( const FTSLanguage language, const StringData& str );
bool more() const;
Token next();
diff --git a/src/mongo/db/fts/tokenizer_test.cpp b/src/mongo/db/fts/tokenizer_test.cpp
index 5d60a50769e..eac91987c61 100644
--- a/src/mongo/db/fts/tokenizer_test.cpp
+++ b/src/mongo/db/fts/tokenizer_test.cpp
@@ -35,12 +35,14 @@ namespace mongo {
namespace fts {
TEST( Tokenizer, Empty1 ) {
- Tokenizer i( "english", "" );
+ Tokenizer i( FTSLanguage::makeFTSLanguage( "english" ).getValue(),
+ "" );
ASSERT( !i.more() );
}
TEST( Tokenizer, Basic1 ) {
- Tokenizer i( "english", "blue red green" );
+ Tokenizer i( FTSLanguage::makeFTSLanguage( "english" ).getValue(),
+ "blue red green" );
ASSERT( i.more() );
ASSERT_EQUALS( i.next().data.toString(), "blue" );
@@ -55,7 +57,8 @@ namespace mongo {
}
TEST( Tokenizer, Basic2 ) {
- Tokenizer i( "english", "blue-red" );
+ Tokenizer i( FTSLanguage::makeFTSLanguage( "english" ).getValue(),
+ "blue-red" );
Token a = i.next();
Token b = i.next();
@@ -77,7 +80,8 @@ namespace mongo {
}
TEST( Tokenizer, Basic3 ) {
- Tokenizer i( "english", "blue -red" );
+ Tokenizer i( FTSLanguage::makeFTSLanguage( "english" ).getValue(),
+ "blue -red" );
Token a = i.next();
Token b = i.next();
@@ -104,7 +108,8 @@ namespace mongo {
}
TEST( Tokenizer, Quote1English ) {
- Tokenizer i( "english", "eliot's car" );
+ Tokenizer i( FTSLanguage::makeFTSLanguage( "english" ).getValue(),
+ "eliot's car" );
Token a = i.next();
Token b = i.next();
@@ -114,7 +119,8 @@ namespace mongo {
}
TEST( Tokenizer, Quote1French ) {
- Tokenizer i( "french", "eliot's car" );
+ Tokenizer i( FTSLanguage::makeFTSLanguage( "french" ).getValue(),
+ "eliot's car" );
Token a = i.next();
Token b = i.next();
diff --git a/src/mongo/db/matcher/expression_parser_text.cpp b/src/mongo/db/matcher/expression_parser_text.cpp
index 45b44419197..1c36ad2ef44 100644
--- a/src/mongo/db/matcher/expression_parser_text.cpp
+++ b/src/mongo/db/matcher/expression_parser_text.cpp
@@ -30,6 +30,7 @@
#include "mongo/base/init.h"
#include "mongo/db/client.h"
+#include "mongo/db/fts/fts_language.h"
#include "mongo/db/index/catalog_hack.h"
#include "mongo/db/jsobj.h"
#include "mongo/db/matcher/expression_parser.h"
@@ -46,11 +47,19 @@ namespace mongo {
return StatusWithMatchExpression( ErrorCodes::BadValue, "$search needs a String" );
}
+ string language = "";
BSONElement languageElt = queryObj["$language"];
- if ( !languageElt.eoo() && mongo::String != languageElt.type() ) {
- return StatusWithMatchExpression( ErrorCodes::BadValue, "$language needs a String" );
+ if ( !languageElt.eoo() ) {
+ if ( mongo::String != languageElt.type() ) {
+ return StatusWithMatchExpression( ErrorCodes::BadValue,
+ "$language needs a String" );
+ }
+ language = languageElt.String();
+ if ( !fts::FTSLanguage::makeFTSLanguage( language ).getStatus().isOK() ) {
+ return StatusWithMatchExpression( ErrorCodes::BadValue,
+ "$language specifies unsupported language" );
+ }
}
- string language = ( !languageElt.eoo() ? languageElt.String() : "" );
string query = queryObj["$search"].String();
if ( queryObj.nFields() != ( languageElt.eoo() ? 1 : 2 ) ) {
diff --git a/src/mongo/db/matcher/expression_parser_text_test.cpp b/src/mongo/db/matcher/expression_parser_text_test.cpp
index 5ce0523134c..b0d7166e5fe 100644
--- a/src/mongo/db/matcher/expression_parser_text_test.cpp
+++ b/src/mongo/db/matcher/expression_parser_text_test.cpp
@@ -39,7 +39,7 @@
namespace mongo {
- TEST( MatchExpressionParserText, Text ) {
+ TEST( MatchExpressionParserText, Parse1 ) {
BSONObj query = fromjson( "{$text:{$search:\"awesome\", $language:\"english\"}}" );
StatusWithMatchExpression result = MatchExpressionParser::parse( query );
@@ -52,4 +52,11 @@ namespace mongo {
ASSERT_EQUALS( textExp->getQuery(), "awesome" );
ASSERT_EQUALS( textExp->getLanguage(), "english" );
}
+
+ TEST( MatchExpressionParserText, Parse2 ) {
+ BSONObj query = fromjson( "{$text:{$search:\"awesome\", $language:\"spanglish\"}}" );
+
+ StatusWithMatchExpression result = MatchExpressionParser::parse( query );
+ ASSERT_FALSE( result.isOK() );
+ }
}
diff --git a/src/mongo/db/query/stage_builder.cpp b/src/mongo/db/query/stage_builder.cpp
index 637fb1f3cf6..ed874a9f37c 100644
--- a/src/mongo/db/query/stage_builder.cpp
+++ b/src/mongo/db/query/stage_builder.cpp
@@ -215,7 +215,7 @@ namespace mongo {
if (!s.isOK()) { return NULL; }
string language = ("" == node->_language
- ? fam->getSpec().defaultLanguage()
+ ? fam->getSpec().defaultLanguage().str()
: node->_language);
FTSQuery ftsq;