summaryrefslogtreecommitdiff
path: root/src/mongo/db
diff options
context:
space:
mode:
Diffstat (limited to 'src/mongo/db')
-rw-r--r--src/mongo/db/fts/SConscript88
-rw-r--r--src/mongo/db/fts/fts_command.cpp93
-rw-r--r--src/mongo/db/fts/fts_command.h68
-rw-r--r--src/mongo/db/fts/fts_command_mongod.cpp159
-rw-r--r--src/mongo/db/fts/fts_command_mongos.cpp129
-rw-r--r--src/mongo/db/fts/fts_enabled.cpp28
-rw-r--r--src/mongo/db/fts/fts_enabled.h25
-rw-r--r--src/mongo/db/fts/fts_index.cpp96
-rw-r--r--src/mongo/db/fts/fts_index.h67
-rw-r--r--src/mongo/db/fts/fts_index_format.cpp119
-rw-r--r--src/mongo/db/fts/fts_index_format.h55
-rw-r--r--src/mongo/db/fts/fts_index_format_test.cpp96
-rw-r--r--src/mongo/db/fts/fts_matcher.cpp247
-rw-r--r--src/mongo/db/fts/fts_matcher.h67
-rw-r--r--src/mongo/db/fts/fts_matcher_test.cpp63
-rw-r--r--src/mongo/db/fts/fts_query.cpp173
-rw-r--r--src/mongo/db/fts/fts_query.h80
-rw-r--r--src/mongo/db/fts/fts_query_test.cpp73
-rw-r--r--src/mongo/db/fts/fts_search.cpp175
-rw-r--r--src/mongo/db/fts/fts_search.h103
-rw-r--r--src/mongo/db/fts/fts_spec.cpp395
-rw-r--r--src/mongo/db/fts/fts_spec.h108
-rw-r--r--src/mongo/db/fts/fts_spec_test.cpp139
-rw-r--r--src/mongo/db/fts/fts_util.cpp30
-rw-r--r--src/mongo/db/fts/fts_util.h112
-rw-r--r--src/mongo/db/fts/fts_util_test.cpp36
-rw-r--r--src/mongo/db/fts/generate_stop_words.py56
-rw-r--r--src/mongo/db/fts/stemmer.cpp58
-rw-r--r--src/mongo/db/fts/stemmer.h48
-rw-r--r--src/mongo/db/fts/stemmer_test.cpp42
-rw-r--r--src/mongo/db/fts/stop_words.cpp73
-rw-r--r--src/mongo/db/fts/stop_words.h50
-rw-r--r--src/mongo/db/fts/stop_words_danish.txt100
-rw-r--r--src/mongo/db/fts/stop_words_dutch.txt48
-rw-r--r--src/mongo/db/fts/stop_words_english.txt174
-rw-r--r--src/mongo/db/fts/stop_words_finnish.txt747
-rw-r--r--src/mongo/db/fts/stop_words_french.txt126
-rw-r--r--src/mongo/db/fts/stop_words_german.txt992
-rw-r--r--src/mongo/db/fts/stop_words_hungarian.txt35
-rw-r--r--src/mongo/db/fts/stop_words_italian.txt279
-rw-r--r--src/mongo/db/fts/stop_words_norwegian.txt119
-rw-r--r--src/mongo/db/fts/stop_words_portuguese.txt147
-rw-r--r--src/mongo/db/fts/stop_words_romanian.txt258
-rw-r--r--src/mongo/db/fts/stop_words_russian.txt421
-rw-r--r--src/mongo/db/fts/stop_words_spanish.txt177
-rw-r--r--src/mongo/db/fts/stop_words_swedish.txt386
-rw-r--r--src/mongo/db/fts/stop_words_test.cpp32
-rw-r--r--src/mongo/db/fts/stop_words_turkish.txt114
-rw-r--r--src/mongo/db/fts/tokenizer.cpp129
-rw-r--r--src/mongo/db/fts/tokenizer.h68
-rw-r--r--src/mongo/db/fts/tokenizer_test.cpp119
51 files changed, 7622 insertions, 0 deletions
diff --git a/src/mongo/db/fts/SConscript b/src/mongo/db/fts/SConscript
new file mode 100644
index 00000000000..efc1f10586f
--- /dev/null
+++ b/src/mongo/db/fts/SConscript
@@ -0,0 +1,88 @@
+# -*- mode: python -*-
+
+Import("env")
+
+stop_word_lanages = [
+ 'danish',
+ 'dutch',
+ 'english',
+ 'finnish',
+ 'french',
+ 'german',
+ 'hungarian',
+ 'italian',
+ 'norwegian',
+ 'portuguese',
+ 'romanian',
+ 'russian',
+ 'spanish',
+ 'swedish',
+ 'turkish',
+]
+
+env.Command( [ "stop_words_list.h", "stop_words_list.cpp"],
+ [ "generate_stop_words.py"] + [ 'stop_words_%s.txt' % x for x in stop_word_lanages ],
+ "$PYTHON $SOURCES $TARGETS" )
+
+# this is not awesome
+hack = env.Clone()
+hack.StaticLibrary( "stopwords", [ "stop_words_list.cpp" ] )
+if "-O3" in hack["CCFLAGS"]:
+ hack["CCFLAGS"] = hack["CCFLAGS"].remove( "-O3" )
+
+env.StaticLibrary('base', [
+ 'fts_index_format.cpp',
+ 'fts_matcher.cpp',
+ 'fts_query.cpp',
+ 'fts_spec.cpp',
+ 'fts_util.cpp',
+ 'stemmer.cpp',
+ 'stop_words.cpp',
+ 'tokenizer.cpp',
+ ], LIBDEPS=["stopwords",
+ "$BUILD_DIR/mongo/base/base",
+ "$BUILD_DIR/mongo/bson",
+ "$BUILD_DIR/mongo/platform/platform",
+ "$BUILD_DIR/third_party/libstemmer_c/stemmer"
+ ])
+
+env.StaticLibrary( 'server_common', [
+ 'fts_command.cpp',
+ 'fts_enabled.cpp'
+ ] )
+
+env.StaticLibrary('ftsmongod', [
+ 'fts_command_mongod.cpp',
+ 'fts_index.cpp',
+ 'fts_search.cpp',
+ ], LIBDEPS=["base","server_common"])
+
+
+env.StaticLibrary('ftsmongos', [
+ 'fts_command_mongos.cpp',
+ ], LIBDEPS=["server_common"])
+
+
+env.CppUnitTest( "fts_index_format_test", "fts_index_format_test.cpp",
+ LIBDEPS=["base"] )
+
+env.CppUnitTest( "fts_stop_words_test", "stop_words_test.cpp",
+ LIBDEPS=["base"] )
+
+env.CppUnitTest( "fts_stemmer_test", "stemmer_test.cpp",
+ LIBDEPS=["base"] )
+
+env.CppUnitTest( "fts_tokenizer_test", "tokenizer_test.cpp",
+ LIBDEPS=["base"] )
+
+env.CppUnitTest( "fts_query_test", "fts_query_test.cpp",
+ LIBDEPS=["base"] )
+
+env.CppUnitTest( "fts_spec_test", "fts_spec_test.cpp",
+ LIBDEPS=["base"] )
+
+env.CppUnitTest( "fts_matcher_test", "fts_matcher_test.cpp",
+ LIBDEPS=["base"] )
+
+env.CppUnitTest( "fts_util_test", "fts_util_test.cpp",
+ LIBDEPS=["base","$BUILD_DIR/mongo/mongohasher"] )
diff --git a/src/mongo/db/fts/fts_command.cpp b/src/mongo/db/fts/fts_command.cpp
new file mode 100644
index 00000000000..0cfdf29f8c6
--- /dev/null
+++ b/src/mongo/db/fts/fts_command.cpp
@@ -0,0 +1,93 @@
+// fts_command.cpp
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <string>
+#include <vector>
+
+#include "mongo/db/fts/fts_command.h"
+#include "mongo/db/fts/fts_enabled.h"
+#include "mongo/db/fts/fts_search.h"
+#include "mongo/db/fts/fts_util.h"
+#include "mongo/util/mongoutils/str.h"
+#include "mongo/util/timer.h"
+
+namespace mongo {
+
+ namespace fts {
+
+ using namespace mongoutils;
+
+ // Namespace-scope instance of the command, constructed during static initialization.
+ FTSCommand ftsCommand;
+
+ // Constructs the command, handing the name "text" to the Command base class.
+ FTSCommand::FTSCommand()
+ : Command( "text" ) {
+ }
+
+        /*
+         * Appends the privilege needed to run this command to *out:
+         * the "find" action on the namespace parsed from the command object.
+         * @param dbname, name of the database the command runs against
+         * @param cmdObj, the full command object
+         * @param out, vector that receives the required privilege
+         */
+        void FTSCommand::addRequiredPrivileges(const std::string& dbname,
+                                               const BSONObj& cmdObj,
+                                               std::vector<Privilege>* out) {
+            ActionSet requiredActions;
+            requiredActions.addAction(ActionType::find);
+
+            const Privilege findPrivilege(parseNs(dbname, cmdObj), requiredActions);
+            out->push_back(findPrivilege);
+        }
+
+
+        /*
+         * Validates and unpacks the command object, then hands the pieces to _run().
+         * @param dbname, name of the database the command runs against
+         * @param cmdObj, the full command; its first element names the collection
+         * @param options, passed through to _run()
+         * @param errmsg, set when the command is rejected
+         * @param result, builder for the command reply
+         * @param fromRepl, not used here
+         * @return false if text search is disabled or "search" is empty,
+         *         otherwise whatever _run() returns
+         */
+        bool FTSCommand::run(const string& dbname,
+                             BSONObj& cmdObj,
+                             int options,
+                             string& errmsg,
+                             BSONObjBuilder& result,
+                             bool fromRepl) {
+
+            if ( !isTextSearchEnabled() ) {
+                errmsg = "text search not enabled";
+                return false;
+            }
+
+            const string ns = dbname + "." + cmdObj.firstElement().String();
+
+            const string search = cmdObj["search"].valuestrsafe();
+            if ( search.empty() ) {
+                errmsg = "no search specified";
+                return false;
+            }
+
+            // an empty language means "not set"
+            const string language = cmdObj["language"].valuestrsafe();
+
+            // a limit of 0 (including a missing or non-numeric "limit") selects the default of 100
+            const int requestedLimit = cmdObj["limit"].numberInt();
+            const int limit = ( requestedLimit == 0 ) ? 100 : requestedLimit;
+
+            // filter and projection stay empty unless the command supplies objects for them
+            BSONObj filter;
+            if ( cmdObj["filter"].isABSONObj() ) {
+                filter = cmdObj["filter"].Obj();
+            }
+
+            BSONObj projection;
+            if ( cmdObj["projection"].isABSONObj() ) {
+                projection = cmdObj["projection"].Obj();
+            }
+
+            return _run( dbname, cmdObj, options,
+                         ns, search, language, limit, filter, projection, errmsg, result );
+        }
+
+
+ }
+
+
+}
diff --git a/src/mongo/db/fts/fts_command.h b/src/mongo/db/fts/fts_command.h
new file mode 100644
index 00000000000..cbd92758ecb
--- /dev/null
+++ b/src/mongo/db/fts/fts_command.h
@@ -0,0 +1,68 @@
+// fts_command.h
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "mongo/db/commands.h"
+
+namespace mongo {
+
+ namespace fts {
+
+ /*
+ * The "text" command. run() is shared; locktype() and _run() are only
+ * declared here and are defined separately in fts_command_mongod.cpp and
+ * fts_command_mongos.cpp.
+ */
+ class FTSCommand : public Command {
+ public:
+ FTSCommand();
+
+ bool slaveOk() const { return true; }
+ bool slaveOverrideOk() const { return true; }
+
+ // READ in fts_command_mongod.cpp, NONE in fts_command_mongos.cpp
+ LockType locktype() const;
+
+ // Adds the "find" action on the command's namespace to out.
+ void addRequiredPrivileges(const std::string& dbname,
+ const BSONObj& cmdObj,
+ std::vector<Privilege>* out);
+
+
+ // Checks that text search is enabled, unpacks cmdObj and calls _run().
+ bool run(const string& dbname,
+ BSONObj& cmdObj,
+ int options,
+ string& errmsg,
+ BSONObjBuilder& result,
+ bool fromRepl);
+
+ protected:
+ // Performs the search with the arguments run() extracted from cmdObj.
+ // Returns false and sets errmsg on failure.
+ bool _run( const string& dbName,
+ BSONObj& cmdObj,
+ int cmdOptions,
+ const string& ns,
+ const string& searchString,
+ string language, // "" for not-set
+ int limit,
+ BSONObj& filter,
+ BSONObj& projection,
+ string& errmsg,
+ BSONObjBuilder& result );
+ };
+
+ }
+
+}
+
diff --git a/src/mongo/db/fts/fts_command_mongod.cpp b/src/mongo/db/fts/fts_command_mongod.cpp
new file mode 100644
index 00000000000..cd38175c8e5
--- /dev/null
+++ b/src/mongo/db/fts/fts_command_mongod.cpp
@@ -0,0 +1,159 @@
+// fts_command_mongod.cpp
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "mongo/db/fts/fts_command.h"
+#include "mongo/db/fts/fts_search.h"
+#include "mongo/db/fts/fts_util.h"
+#include "mongo/db/pdfile.h"
+#include "mongo/db/projection.h"
+#include "mongo/util/mongoutils/str.h"
+#include "mongo/util/timer.h"
+
+namespace mongo {
+
+ namespace fts {
+
+ // The mongod implementation of the command asks for a READ lock.
+ Command::LockType FTSCommand::locktype() const {
+ return READ;
+ }
+
+        /*
+         * Runs a text search against the single text index of ns and writes the
+         * scored documents plus some statistics into result.
+         * @param dbname, name of db (not used by this implementation)
+         * @param cmdObj, object that contains entire command (not used by this implementation)
+         * @param cmdOptions, command options (not used by this implementation)
+         * @param ns, full namespace of the collection to search
+         * @param searchString, the search terms as given by the user
+         * @param language, language to parse the search with; "" selects the index default
+         * @param limit, passed to FTSSearch::go() as the result limit
+         * @param filter, additional query filter; also the source of the index prefix
+         * @param projection, optional projection applied to each returned document
+         * @param errmsg, reference to error message
+         * @param result, reference to builder for result
+         * @return true if successful, false otherwise
+         */
+        bool FTSCommand::_run(const string& dbname,
+                              BSONObj& cmdObj,
+                              int cmdOptions,
+                              const string& ns,
+                              const string& searchString,
+                              string language, // "" for not-set
+                              int limit,
+                              BSONObj& filter,
+                              BSONObj& projection,
+                              string& errmsg,
+                              BSONObjBuilder& result ) {
+
+            Timer comm;
+
+            scoped_ptr<Projection> pr;
+            if ( !projection.isEmpty() ) {
+                pr.reset( new Projection() );
+                pr->init( projection );
+            }
+
+            // priority queue for results
+            Results results;
+
+            NamespaceDetails * d = nsdetails( ns.c_str() );
+            if ( !d ) {
+                errmsg = "can't find ns";
+                return false;
+            }
+
+            // exactly one text index must exist on the collection
+            vector<int> idxMatches;
+            d->findIndexByType( INDEX_NAME, idxMatches );
+            if ( idxMatches.size() == 0 ) {
+                errmsg = str::stream() << "no text index for: " << ns;
+                return false;
+            }
+            if ( idxMatches.size() > 1 ) {
+                errmsg = str::stream() << "too many text index for: " << ns;
+                return false;
+            }
+
+            const IndexDetails& id = d->idx( idxMatches[0] );
+            BSONObj indexPrefix;
+
+            FTSIndex* ftsIndex = static_cast<FTSIndex*>(id.getSpec().getType());
+            if ( language == "" ) {
+                language = ftsIndex->getFtsSpec().defaultLanguage();
+            }
+
+            // The index prefix is handed to FTSSearch unconditionally below, so it has to
+            // be computed from the filter whether or not the caller named a language.
+            // (It used to be computed only when language was empty, which left the
+            // prefix empty for every search that specified a language.)
+            Status s = ftsIndex->getFtsSpec().getIndexPrefix( filter, &indexPrefix );
+            if ( !s.isOK() ) {
+                errmsg = s.toString();
+                return false;
+            }
+
+            FTSQuery query;
+            if ( !query.parse( searchString, language ).isOK() ) {
+                errmsg = "can't parse search";
+                return false;
+            }
+            result.append( "queryDebugString", query.debugString() );
+            result.append( "language", language );
+
+            FTSSearch search( d, id, indexPrefix, query, filter );
+            search.go( &results, limit );
+
+            // grab underlying container inside priority queue
+            vector<ScoredLocation> r( results.dangerous() );
+
+            // sort results by score (not always in correct order, especially w.r.t. multiterm)
+            sort( r.begin(), r.end() );
+
+            // build the results bson array shown to user
+            BSONArrayBuilder a( result.subarrayStart( "results" ) );
+
+            // running estimate of the reply size; starts at 1024 to leave headroom
+            // for the non-result fields
+            int BSONResultSize = 1024;
+
+            for ( unsigned n = 0; n < r.size(); n++ ) {
+                BSONObj obj = BSONObj::make(r[n].rec);
+                BSONObj toSendBack = obj;
+
+                if ( pr ) {
+                    toSendBack = pr->transform(obj);
+                }
+
+                // stop before the reply would exceed the maximum user object size
+                if ( ( BSONResultSize + toSendBack.objsize() ) >= BSONObjMaxUserSize ) {
+                    break;
+                }
+
+                BSONObjBuilder x( a.subobjStart() );
+                x.append( "score" , r[n].score );
+                x.append( "obj", toSendBack );
+
+                BSONObj xobj = x.done();
+                BSONResultSize += xobj.objsize();
+            }
+
+            a.done();
+
+            // returns some stats to the user
+            // NOTE(review): "n" reports r.size() even when the loop above stopped early
+            // because of the size cap -- confirm whether that is intended.
+            BSONObjBuilder bb( result.subobjStart( "stats" ) );
+            bb.appendNumber( "nscanned" , search.getKeysLookedAt() );
+            bb.appendNumber( "nscannedObjects" , search.getObjLookedAt() );
+            bb.appendNumber( "n" , r.size() );
+            bb.append( "timeMicros", (int)comm.micros() );
+            bb.done();
+
+            return true;
+        }
+ }
+
+}
diff --git a/src/mongo/db/fts/fts_command_mongos.cpp b/src/mongo/db/fts/fts_command_mongos.cpp
new file mode 100644
index 00000000000..04cc8a1b808
--- /dev/null
+++ b/src/mongo/db/fts/fts_command_mongos.cpp
@@ -0,0 +1,129 @@
+// fts_command_mongos.cpp
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "mongo/pch.h"
+
+#include "mongo/db/fts/fts_command.h"
+#include "mongo/s/strategy.h"
+
+
+namespace mongo {
+ namespace fts {
+
+        // One entry of a shard's "results" array together with its "score" field,
+        // read once at construction so sorting does not have to re-parse the object.
+        struct Scored {
+            Scored( BSONObj full )
+                : full( full ),
+                  score( full["score"].numberDouble() ) {
+            }
+
+            // Deliberately inverted: the entry with the higher score compares as
+            // "less", so sort() puts the best scores first.
+            bool operator<( const Scored& rhs ) const {
+                return score > rhs.score;
+            }
+
+            BSONObj full;
+            double score;
+        };
+
+
+ // all grid commands are designed not to lock, so the mongos
+ // implementation of the command asks for no lock at all
+ Command::LockType FTSCommand::locktype() const { return NONE; }
+
+        /*
+         * mongos implementation: sends the command to the shards through
+         * SHARDED->commandOp(), merges the per-shard "results" arrays by score and
+         * sums the per-shard statistics.
+         * @param dbName, cmdObj, cmdOptions, ns, filter are forwarded to commandOp()
+         * @param limit, the merged result list is cut off once n reaches this value
+         * @param errmsg, set when a shard reply is not ok
+         * @param result, builder for the merged reply
+         * @return false if any shard reports failure, true otherwise
+         */
+        bool FTSCommand::_run(const string& dbName,
+                              BSONObj& cmdObj,
+                              int cmdOptions,
+                              const string& ns,
+                              const string& searchString,
+                              string language, // "" for not-set
+                              int limit,
+                              BSONObj& filter,
+                              BSONObj& projection,
+                              string& errmsg,
+                              BSONObjBuilder& result ) {
+
+            Timer timer;
+
+            map<Shard, BSONObj> shardReplies;
+            SHARDED->commandOp( dbName, cmdObj, cmdOptions, ns, filter, shardReplies );
+
+            vector<Scored> merged;
+            long long totalScanned = 0;
+            long long totalScannedObjects = 0;
+
+            BSONObjBuilder perShardStats;
+
+            typedef map<Shard,BSONObj>::const_iterator ReplyIterator;
+            for ( ReplyIterator it = shardReplies.begin(); it != shardReplies.end(); ++it ) {
+                const Shard& shard = it->first;
+                BSONObj reply = it->second;
+
+                LOG(2) << "fts result for shard: " << shard << "\n" << reply << endl;
+
+                // one failing shard fails the whole command; its raw reply is returned
+                if ( !reply["ok"].trueValue() ) {
+                    errmsg = str::stream() << "failure on shard: " << shard.toString()
+                                           << ": " << reply["errmsg"];
+                    result.append( "rawresult", reply );
+                    return false;
+                }
+
+                if ( reply["stats"].isABSONObj() ) {
+                    BSONObj shardStats = reply["stats"].Obj();
+                    totalScanned += shardStats["nscanned"].numberLong();
+                    totalScannedObjects += shardStats["nscannedObjects"].numberLong();
+
+                    perShardStats.append( shard.getName(), shardStats );
+                }
+
+                if ( reply["results"].isABSONObj() ) {
+                    for ( BSONObjIterator hit( reply["results"].Obj() ); hit.more(); ) {
+                        merged.push_back( Scored( hit.next().Obj() ) );
+                    }
+                }
+            }
+
+            // Scored::operator< is inverted, so this orders by descending score
+            sort( merged.begin(), merged.end() );
+
+            long long n = 0;
+            {
+                BSONArrayBuilder arr( result.subarrayStart( "results" ) );
+                for ( vector<Scored>::const_iterator hit = merged.begin();
+                      hit != merged.end();
+                      ++hit ) {
+                    // append first, then test the limit
+                    arr.append( hit->full );
+                    ++n;
+                    if ( n >= limit )
+                        break;
+                }
+                arr.done();
+            }
+
+            {
+                BSONObjBuilder stats( result.subobjStart( "stats" ) );
+                stats.appendNumber( "nscanned", totalScanned );
+                stats.appendNumber( "nscannedObjects", totalScannedObjects );
+                stats.appendNumber( "n", n );
+                stats.append( "timeMicros", (int)timer.micros() );
+
+                stats.append( "shards", perShardStats.obj() );
+
+                stats.done();
+            }
+
+            return true;
+        }
+
+ // Namespace-scope instance of the command for mongos.
+ // NOTE(review): fts_command.cpp also defines a global FTSCommand (ftsCommand) and the
+ // SConscript links that file into ftsmongos via server_common, so mongos appears to
+ // construct two instances named "text" -- confirm whether this one is needed.
+ FTSCommand ftsCommandSharded;
+ }
+}
diff --git a/src/mongo/db/fts/fts_enabled.cpp b/src/mongo/db/fts/fts_enabled.cpp
new file mode 100644
index 00000000000..7a11e394f6a
--- /dev/null
+++ b/src/mongo/db/fts/fts_enabled.cpp
@@ -0,0 +1,28 @@
+// fts_enabled.cpp
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "mongo/db/server_parameters.h"
+
+namespace mongo {
+ namespace fts {
+ // NOTE(review): presumably declares a bool server parameter named
+ // textSearchEnabled whose default is false -- confirm against the macro.
+ MONGO_EXPORT_SERVER_PARAMETER( textSearchEnabled, bool, false );
+ // Returns the current value of textSearchEnabled.
+ bool isTextSearchEnabled() {
+ return textSearchEnabled;
+ }
+ }
+}
diff --git a/src/mongo/db/fts/fts_enabled.h b/src/mongo/db/fts/fts_enabled.h
new file mode 100644
index 00000000000..d3f733dc49f
--- /dev/null
+++ b/src/mongo/db/fts/fts_enabled.h
@@ -0,0 +1,25 @@
+// fts_enabled.h
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+namespace mongo {
+ namespace fts {
+ // Returns the value of the textSearchEnabled setting (defined in fts_enabled.cpp).
+ bool isTextSearchEnabled();
+ }
+}
diff --git a/src/mongo/db/fts/fts_index.cpp b/src/mongo/db/fts/fts_index.cpp
new file mode 100644
index 00000000000..04fafe12a83
--- /dev/null
+++ b/src/mongo/db/fts/fts_index.cpp
@@ -0,0 +1,96 @@
+// fts_index.cpp
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "mongo/pch.h"
+
+#include "mongo/base/init.h"
+#include "mongo/db/client.h"
+#include "mongo/db/fts/fts_enabled.h"
+#include "mongo/db/fts/fts_index.h"
+#include "mongo/db/fts/fts_index_format.h"
+#include "mongo/util/mongoutils/str.h"
+#include "mongo/util/stringutils.h"
+#include "mongo/util/timer.h"
+
+namespace mongo {
+
+ namespace fts {
+
+ using namespace mongoutils;
+
+ /*
+ * Builds the index type for a text index. The FTSSpec member is
+ * constructed from the index's info object (spec->info).
+ * @param plugin the index plugin for FTS
+ * @param spec the index specification
+ */
+ FTSIndex::FTSIndex( const IndexPlugin* plugin, const IndexSpec* spec )
+ : IndexType( plugin, spec ), _ftsSpec( spec->info ) {
+ }
+
+ // Adds the index keys for obj to keys; the work is delegated to FTSIndexFormat::getKeys().
+ void FTSIndex::getKeys( const BSONObj& obj, BSONObjSet& keys) const {
+ FTSIndexFormat::getKeys( _ftsSpec, obj, &keys );
+ }
+
+        /*
+         * Not supported for text indexes: the call always trips verify(0).
+         * The empty pointer returned afterwards only satisfies the signature.
+         */
+        shared_ptr<Cursor> FTSIndex::newCursor( const BSONObj& query,
+                                                const BSONObj& order,
+                                                int numWanted ) const {
+            verify(0);
+            return shared_ptr<Cursor>();
+        }
+
+
+ // Constructs the plugin, handing INDEX_NAME to the IndexPlugin base class.
+ FTSIndexPlugin::FTSIndexPlugin() : IndexPlugin( INDEX_NAME ) {}
+
+
+        /*
+         * Adjusts spec by appending information relative to the
+         * FTS Index (such as weights, index name, etc); the actual
+         * rewriting is done by FTSSpec::fixSpec().
+         * @param spec, specification object
+         * @return the adjusted specification
+         */
+        BSONObj FTSIndexPlugin::adjustIndexSpec( const BSONObj& spec ) const {
+            // The "enabled" check is applied only when the current client's description
+            // starts with "conn". Per the original author's note this is so that only
+            // users get the complaint: a text index that was created on a primary
+            // should still be indexed on the secondary as well.
+            StringData desc = cc().desc();
+            const bool describedAsConn = ( desc.find( "conn" ) == 0 );
+            if ( describedAsConn ) {
+                massert( 16633, "text search not enabled", isTextSearchEnabled() );
+            }
+
+            return FTSSpec::fixSpec( spec );
+        }
+
+ /*
+ * Generates an FTSIndex with a spec and this plugin.
+ * The object is allocated with new and returned as a raw pointer.
+ * @param spec, specification to be used
+ * @return the newly allocated FTSIndex
+ */
+ IndexType* FTSIndexPlugin::generate( const IndexSpec* spec ) const {
+ return new FTSIndex( this, spec );
+ }
+
+
+ // Plugin instance; allocated by the initializer below and not deleted anywhere in this file.
+ FTSIndexPlugin* ftsPlugin;
+ // Creates the plugin instance and reports success.
+ MONGO_INITIALIZER(FTSIndexPlugin)(InitializerContext* context) {
+ ftsPlugin = new FTSIndexPlugin();
+ return Status::OK();
+ }
+
+ }
+
+}
diff --git a/src/mongo/db/fts/fts_index.h b/src/mongo/db/fts/fts_index.h
new file mode 100644
index 00000000000..d9bf8a61b16
--- /dev/null
+++ b/src/mongo/db/fts/fts_index.h
@@ -0,0 +1,67 @@
+// fts_index.h
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include <map>
+#include <vector>
+
+#include "mongo/db/fts/fts_spec.h"
+#include "mongo/db/fts/fts_util.h"
+#include "mongo/db/fts/stemmer.h"
+#include "mongo/db/fts/stop_words.h"
+#include "mongo/db/fts/tokenizer.h"
+#include "mongo/db/index.h"
+
+namespace mongo {
+
+ namespace fts {
+
+ /*
+ * IndexType implementation for text indexes. It owns the FTSSpec built
+ * from the index's info object and uses it to generate index keys.
+ */
+ class FTSIndex : public IndexType {
+ public:
+
+ // index constructor, called when user enters ensureIndex command with fts flag
+ FTSIndex(const IndexPlugin *plugin, const IndexSpec* spec);
+
+ // Adds the index keys for obj to keys (see FTSIndexFormat::getKeys).
+ void getKeys( const BSONObj& obj, BSONObjSet& keys) const;
+
+ /* newCursor is pure Virtual in IndexType so it has to be redefined in FTSIndex.
+ The definition in fts_index.cpp always trips verify(0). */
+ shared_ptr<Cursor> newCursor( const BSONObj& query,
+ const BSONObj& order,
+ int numWanted ) const;
+
+ // Read-only access to the parsed text index specification.
+ const FTSSpec& getFtsSpec() const { return _ftsSpec; }
+
+ private:
+
+ FTSSpec _ftsSpec;
+ };
+
+
+ /*
+ * IndexPlugin for text indexes: creates FTSIndex objects and
+ * rewrites index specifications through FTSSpec::fixSpec().
+ */
+ class FTSIndexPlugin : public IndexPlugin {
+ public:
+ FTSIndexPlugin();
+
+ // Returns a newly allocated FTSIndex for spec.
+ IndexType* generate( const IndexSpec* spec ) const;
+
+ // Returns spec as rewritten by FTSSpec::fixSpec(); may assert that text search is enabled.
+ BSONObj adjustIndexSpec( const BSONObj& spec ) const;
+
+ };
+
+ } //namespace fts
+} //namespace mongo
diff --git a/src/mongo/db/fts/fts_index_format.cpp b/src/mongo/db/fts/fts_index_format.cpp
new file mode 100644
index 00000000000..b39b336d651
--- /dev/null
+++ b/src/mongo/db/fts/fts_index_format.cpp
@@ -0,0 +1,119 @@
+// fts_index_format.cpp
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "mongo/pch.h"
+
+#include "mongo/base/init.h"
+#include "mongo/db/fts/fts_index_format.h"
+
+namespace mongo {
+
+ namespace fts {
+
+ namespace {
+ // An object with a single null field named "", and that field as an element.
+ // Both are filled in by the initializer below; getKeys() substitutes nullElt
+ // for fields that are missing from a document.
+ BSONObj nullObj;
+ BSONElement nullElt;
+ }
+
+ // Builds nullObj and takes nullElt from it, so nullObj is kept alongside the element.
+ MONGO_INITIALIZER( FTSIndexFormat )( InitializerContext* context ) {
+ BSONObjBuilder b;
+ b.appendNull( "" );
+ nullObj = b.obj();
+ nullElt = nullObj.firstElement();
+ return Status::OK();
+ }
+
+        /*
+         * Generates one index key per distinct term of obj and inserts it into keys.
+         * Each key is laid out as: extra fields before, term, weight, extra fields after,
+         * all with empty field names.
+         * @param spec, the text index specification
+         * @param obj, the document being indexed
+         * @param keys, set that receives the generated keys
+         */
+        void FTSIndexFormat::getKeys( const FTSSpec& spec,
+                                      const BSONObj& obj,
+                                      BSONObjSet* keys ) {
+
+            // Gather the non FTS key elements; a field missing from the document is
+            // indexed as null. extraSize accumulates their sizes for the estimate below.
+            int extraSize = 0;
+
+            vector<BSONElement> leading;
+            for ( unsigned idx = 0; idx < spec.numExtraBefore(); ++idx ) {
+                BSONElement elt = obj.getFieldDotted( spec.extraBefore( idx ) );
+                leading.push_back( elt.eoo() ? nullElt : elt );
+                extraSize += leading.back().size();
+            }
+
+            vector<BSONElement> trailing;
+            for ( unsigned idx = 0; idx < spec.numExtraAfter(); ++idx ) {
+                BSONElement elt = obj.getFieldDotted( spec.extraAfter( idx ) );
+                trailing.push_back( elt.eoo() ? nullElt : elt );
+                extraSize += trailing.back().size();
+            }
+
+            TermFrequencyMap termScores;
+            spec.scoreDocument( obj, &termScores );
+
+            // create index keys from raw scores
+            // only 1 per string
+            typedef TermFrequencyMap::const_iterator TermIterator;
+            for ( TermIterator it = termScores.begin(); it != termScores.end(); ++it ) {
+
+                const string& term = it->first;
+                const double weight = it->second;
+
+                // guess the total size of the btree entry based on the size of the weight, term tuple
+                int guess =
+                    5 /* bson overhead */ +
+                    10 /* weight */ +
+                    8 /* term overhead */ +
+                    term.size() +
+                    extraSize;
+
+                BSONObjBuilder b(guess); // builds a BSON object with guess length.
+
+                for ( vector<BSONElement>::const_iterator e = leading.begin();
+                      e != leading.end();
+                      ++e ) {
+                    b.appendAs( *e, "" );
+                }
+
+                _appendIndexKey( b, weight, term );
+
+                for ( vector<BSONElement>::const_iterator e = trailing.begin();
+                      e != trailing.end();
+                      ++e ) {
+                    b.appendAs( *e, "" );
+                }
+
+                BSONObj res = b.obj();
+
+                // the estimate must never undershoot the real key size
+                verify( guess >= res.objsize() );
+
+                keys->insert( res );
+            }
+        }
+
+        /*
+         * Builds a single index key: every element of indexPrefix (re-added with an
+         * empty field name), followed by the term and its weight.
+         * @param weight, the weight of the term in the entry
+         * @param term, the string term in the entry
+         * @param indexPrefix, the fields that go in the index first
+         * @return the assembled key
+         */
+        BSONObj FTSIndexFormat::getIndexKey( double weight,
+                                             const string& term,
+                                             const BSONObj& indexPrefix ) {
+            BSONObjBuilder keyBuilder;
+
+            for ( BSONObjIterator prefixIt( indexPrefix ); prefixIt.more(); ) {
+                keyBuilder.appendAs( prefixIt.next(), "" );
+            }
+
+            _appendIndexKey( keyBuilder, weight, term );
+            return keyBuilder.obj();
+        }
+
+ // Appends the term and then its weight to b, both under the empty field name.
+ void FTSIndexFormat::_appendIndexKey( BSONObjBuilder& b, double weight, const string& term ) {
+ verify( weight >= 0 && weight <= MAX_WEIGHT ); // weight must lie within [0, MAX_WEIGHT]
+ b.append( "", term );
+ b.append( "", weight );
+ }
+ }
+}
diff --git a/src/mongo/db/fts/fts_index_format.h b/src/mongo/db/fts/fts_index_format.h
new file mode 100644
index 00000000000..eeb225e756f
--- /dev/null
+++ b/src/mongo/db/fts/fts_index_format.h
@@ -0,0 +1,55 @@
+// fts_index_format.h
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "mongo/db/fts/fts_spec.h"
+
+namespace mongo {
+
+ namespace fts {
+
+ // Static helpers that define the layout of the keys stored in a text index.
+ class FTSIndexFormat {
+ public:
+
+ /*
+ * Generates the index keys for a document and inserts them into keys
+ * @param spec, the text index specification
+ * @param document, the document being indexed
+ * @param keys, set that receives the generated keys
+ */
+ static void getKeys( const FTSSpec& spec,
+ const BSONObj& document,
+ BSONObjSet* keys );
+
+ /*
+ * Helper method to get return entry from the FTSIndex as a BSONObj:
+ * the prefix fields first, then the term, then the weight
+ * @param weight, the weight of the term in the entry
+ * @param term, the string term in the entry
+ * @param indexPrefix, the fields that go in the index first
+ */
+ static BSONObj getIndexKey( double weight,
+ const string& term,
+ const BSONObj& indexPrefix );
+
+ private:
+ /*
+ * Helper method that appends the term followed by its weight to an
+ * index key under construction
+ * @param b, reference to the BSONOBjBuilder
+ * @param weight, the weight of the term in the entry
+ * @param term, the string term in the entry
+ */
+ static void _appendIndexKey( BSONObjBuilder& b, double weight, const string& term );
+ };
+
+ }
+}
diff --git a/src/mongo/db/fts/fts_index_format_test.cpp b/src/mongo/db/fts/fts_index_format_test.cpp
new file mode 100644
index 00000000000..7b0f5b32f0a
--- /dev/null
+++ b/src/mongo/db/fts/fts_index_format_test.cpp
@@ -0,0 +1,96 @@
+// fts_index_format_test.cpp
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+#include "mongo/pch.h"
+
+#include "mongo/db/fts/fts_index_format.h"
+#include "mongo/unittest/unittest.h"
+
+namespace mongo {
+
+ namespace fts {
+
+ // A text index on "data" alone: the document "cat sat" must produce two keys,
+ // each with two fields of which the first is a string.
+ TEST( FTSIndexFormat, Simple1 ) {
+ FTSSpec spec( FTSSpec::fixSpec( BSON( "key" << BSON( "data" << "text" ) ) ) );
+ BSONObjSet keys;
+ FTSIndexFormat::getKeys( spec, BSON( "data" << "cat sat" ), &keys );
+
+ ASSERT_EQUALS( 2U, keys.size() );
+ for ( BSONObjSet::const_iterator i = keys.begin(); i != keys.end(); ++i ) {
+ BSONObj key = *i;
+ ASSERT_EQUALS( 2, key.nFields() );
+ ASSERT_EQUALS( String, key.firstElement().type() );
+ }
+ }
+
+ // Extra field declared after the text field: the key must be
+ // term, weight, then the value of x.
+ TEST( FTSIndexFormat, ExtraBack1 ) {
+ FTSSpec spec( FTSSpec::fixSpec( BSON( "key" << BSON( "data" << "text" <<
+ "x" << 1 ) ) ) );
+ BSONObjSet keys;
+ FTSIndexFormat::getKeys( spec, BSON( "data" << "cat" << "x" << 5 ), &keys );
+
+ ASSERT_EQUALS( 1U, keys.size() );
+ BSONObj key = *(keys.begin());
+ ASSERT_EQUALS( 3, key.nFields() );
+ BSONObjIterator i( key );
+ ASSERT_EQUALS( StringData("cat"), i.next().valuestr() );
+ ASSERT( i.next().numberDouble() > 0 );
+ ASSERT_EQUALS( 5, i.next().numberInt() );
+ }
+
+ /*
+ TEST( FTSIndexFormat, ExtraBackArray1 ) {
+ FTSSpec spec( FTSSpec::fixSpec( BSON( "key" << BSON( "data" << "text" <<
+ "x.y" << 1 ) ) ) );
+ BSONObjSet keys;
+ FTSIndexFormat::getKeys( spec,
+ BSON( "data" << "cat" <<
+ "x" << BSON_ARRAY( BSON( "y" << 1 ) <<
+ BSON( "y" << 2 ) ) ),
+ &keys );
+
+ ASSERT_EQUALS( 1U, keys.size() );
+ BSONObj key = *(keys.begin());
+ log() << "e: " << key << endl;
+ ASSERT_EQUALS( 3, key.nFields() );
+ BSONObjIterator i( key );
+ ASSERT_EQUALS( StringData("cat"), i.next().valuestr() );
+ ASSERT( i.next().numberDouble() > 0 );
+ ASSERT_EQUALS( 5, i.next().numberInt() );
+ }
+ */
+
+ // Extra field declared before the text field: the key must be
+ // the value of x, then term, then weight.
+ TEST( FTSIndexFormat, ExtraFront1 ) {
+ FTSSpec spec( FTSSpec::fixSpec( BSON( "key" << BSON( "x" << 1 <<
+ "data" << "text" ) ) ) );
+ BSONObjSet keys;
+ FTSIndexFormat::getKeys( spec, BSON( "data" << "cat" << "x" << 5 ), &keys );
+
+ ASSERT_EQUALS( 1U, keys.size() );
+ BSONObj key = *(keys.begin());
+ ASSERT_EQUALS( 3, key.nFields() );
+ BSONObjIterator i( key );
+ ASSERT_EQUALS( 5, i.next().numberInt() );
+ ASSERT_EQUALS( StringData("cat"), i.next().valuestr() );
+ ASSERT( i.next().numberDouble() > 0 );
+ }
+
+
+ }
+}
diff --git a/src/mongo/db/fts/fts_matcher.cpp b/src/mongo/db/fts/fts_matcher.cpp
new file mode 100644
index 00000000000..313fdd5be9e
--- /dev/null
+++ b/src/mongo/db/fts/fts_matcher.cpp
@@ -0,0 +1,247 @@
+// fts_matcher.cpp
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "mongo/pch.h"
+
+#include "mongo/db/fts/fts_matcher.h"
+
+namespace mongo {
+
+ namespace fts {
+
+
+ FTSMatcher::FTSMatcher( const FTSQuery& query, const FTSSpec& spec )
+ : _query( query ), // copied into the matcher
+ _spec( spec ), // copied into the matcher
+ _stemmer( query.getLanguage() ){ // stemmer is built for the query's language
+ }
+
+ /*
+ * Checks if the obj contains any of the query's negated terms, if so returns true, otherwise false
+ * @param obj, object to be checked
+ */
+ bool FTSMatcher::hasNegativeTerm(const BSONObj& obj ) const {
+ // called during search. deals with the case in which we have a term
+ // flagged for exclusion, i.e. "hello -world" we want to remove all
+ // results that include "world"
+
+ if ( _query.getNegatedTerms().size() == 0 ) // the query excludes nothing
+ return false;
+
+ if ( _spec.wildcard() ) {
+ return _hasNegativeTerm_recurse(obj);
+ }
+
+ /* otherwise look only at the fields that have a weight defined */
+ for ( Weights::const_iterator i = _spec.weights().begin();
+ i != _spec.weights().end();
+ i++ ) {
+ const char * leftOverName = i->first.c_str();
+ BSONElement e = obj.getFieldDottedOrArray(leftOverName); // presumably leaves the unconsumed path suffix in leftOverName - confirm
+
+ if ( e.type() == Array ) {
+ BSONObjIterator j( e.Obj() );
+ while ( j.more() ) {
+ BSONElement x = j.next();
+ if ( leftOverName[0] && x.isABSONObj() ) // path remainder left: resolve it inside the array member
+ x = x.Obj().getFieldDotted( leftOverName );
+ if ( x.type() == String ) // only string values are tokenized
+ if ( _hasNegativeTerm_string( x.String() ) )
+ return true;
+ }
+ }
+ else if ( e.type() == String ) {
+ if ( _hasNegativeTerm_string( e.String() ) )
+ return true;
+ }
+ }
+ return false;
+ }
+
+ bool FTSMatcher::_hasNegativeTerm_recurse(const BSONObj& obj ) const { // wildcard case: scans the string values found in obj
+ BSONObjIterator j( obj );
+ while ( j.more() ) {
+ BSONElement x = j.next();
+
+ if ( _spec.languageOverrideField() == x.fieldName()) // the language override field is not searched
+ continue;
+
+ if (x.type() == String) {
+ if ( _hasNegativeTerm_string( x.String() ) )
+ return true;
+ }
+ else if ( x.isABSONObj() ) {
+ BSONObjIterator k( x.Obj() ); // note: members of x are not compared against the language override field
+ while ( k.more() ) {
+ // strings are checked directly, nested objects/arrays are recursed into
+ BSONElement y = k.next();
+ if ( y.type() == String ) {
+ if ( _hasNegativeTerm_string( y.String() ) )
+ return true;
+ }
+ else if ( y.isABSONObj() ) {
+ if ( _hasNegativeTerm_recurse( y.Obj() ) )
+ return true;
+ }
+ }
+ }
+ }
+ return false;
+ }
+
+ /*
+ * Checks if any of the query's negated terms is in the tokenized string
+ * @param raw, the raw string to be tokenized
+ */
+ bool FTSMatcher::_hasNegativeTerm_string( const string& raw ) const {
+ // lower-case first, then stem: the same order FTSQuery::_addTerm applies to the negated terms
+ Tokenizer i( _query.getLanguage(), raw );
+ while ( i.more() ) {
+ Token t = i.next();
+ if ( t.type != Token::TEXT ) // delimiters carry no term
+ continue;
+ string word = _stemmer.stem( tolowerString( t.data.toString() ) );
+ if ( _query.getNegatedTerms().count( word ) > 0 )
+ return true;
+ }
+ return false;
+ }
+
+
+ bool FTSMatcher::phrasesMatch( const BSONObj& obj ) const {
+ // every phrase of the query has to be found in obj ...
+ const vector<string>& wanted = _query.getPhr();
+ for ( vector<string>::const_iterator p = wanted.begin(); p != wanted.end(); ++p ) {
+ if ( !phraseMatch( *p, obj ) )
+ return false;
+ }
+ // ... and none of the negated phrases may be found
+ const vector<string>& unwanted = _query.getNegatedPhr();
+ for ( vector<string>::const_iterator p = unwanted.begin(); p != unwanted.end(); ++p ) {
+ if ( phraseMatch( *p, obj ) )
+ return false;
+ }
+ return true;
+ }
+
+
+ /**
+ * Checks if phrase occurs in a searched string value of obj, returns true if so, false otherwise
+ * @param phrase, the string to be matched
+ * @param obj, document in the collection to match against
+ */
+ bool FTSMatcher::phraseMatch( const string& phrase, const BSONObj& obj ) const {
+
+ if ( _spec.wildcard() ) {
+ // case where everything is indexed (all fields)
+ return _phraseRecurse( phrase, obj );
+ }
+
+ for ( Weights::const_iterator i = _spec.weights().begin();
+ i != _spec.weights().end();
+ ++i ) {
+
+ // figure out what the indexed field is.. ie. is it "field" or "field.subfield" etc.
+ const char * leftOverName = i->first.c_str();
+ BSONElement e = obj.getFieldDottedOrArray(leftOverName); // presumably leaves the unconsumed path suffix in leftOverName - confirm
+
+ if ( e.type() == Array ) {
+ BSONObjIterator j( e.Obj() );
+ while ( j.more() ) {
+ BSONElement x = j.next();
+
+ if ( leftOverName[0] && x.isABSONObj() ) // path remainder left: resolve it inside the array member
+ x = x.Obj().getFieldDotted( leftOverName );
+
+ if ( x.type() == String ) // only string values are searched
+ if ( _phraseMatches( phrase, x.String() ) )
+ return true;
+ }
+ }
+ else if ( e.type() == String ) {
+ if ( _phraseMatches( phrase, e.String() ) )
+ return true;
+ }
+ }
+ return false;
+ }
+
+
+ /*
+ * Walks the string values of obj, descending into nested objects, to match against phrase
+ * @param phrase, string to be matched
+ * @param obj, object to be matched against
+ */
+ bool FTSMatcher::_phraseRecurse( const string& phrase, const BSONObj& obj ) const {
+ BSONObjIterator j( obj );
+ while ( j.more() ) {
+ BSONElement x = j.next();
+
+ if ( _spec.languageOverrideField() == x.fieldName() ) // the language override field is not searched
+ continue;
+
+ if ( x.type() == String ) {
+ if ( _phraseMatches( phrase, x.String() ) )
+ return true;
+ }
+ else if ( x.isABSONObj() ) {
+ BSONObjIterator k( x.Obj() ); // note: members of x are not compared against the language override field
+
+ while ( k.more() ) {
+
+ BSONElement y = k.next();
+
+ if ( y.type() == mongo::String ) {
+ if ( _phraseMatches( phrase, y.String() ) )
+ return true;
+ }
+ else if ( y.isABSONObj() ) { // deeper nesting is handled by recursion
+ if ( _phraseRecurse( phrase, y.Obj() ) )
+ return true;
+ }
+ }
+
+ }
+ }
+
+ return false;
+ }
+
+
+ /*
+ * Looks for phrase in a raw string, ignoring case (substring search)
+ * @param phrase, phrase to match
+ * @param haystack, raw string to be searched
+ */
+ bool FTSMatcher::_phraseMatches( const string& phrase, const string& haystack ) const {
+#ifdef _WIN32
+ // windows doesn't have strcasestr
+ // for now, doing something very slow, but correct
+ string p = phrase;
+ string h = haystack;
+ makeLower( &p );
+ makeLower( &h );
+ return strstr( h.c_str(), p.c_str() ) != NULL; // null test; "> 0" on a pointer is not valid C++
+#else
+ return strcasestr( haystack.c_str(), phrase.c_str() ) != NULL; // null test, see above
+#endif
+ }
+
+
+ }
+}
diff --git a/src/mongo/db/fts/fts_matcher.h b/src/mongo/db/fts/fts_matcher.h
new file mode 100644
index 00000000000..c5478d63b78
--- /dev/null
+++ b/src/mongo/db/fts/fts_matcher.h
@@ -0,0 +1,67 @@
+// fts_matcher.h
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "mongo/db/fts/fts_query.h"
+#include "mongo/db/fts/fts_spec.h"
+#include "mongo/db/fts/tokenizer.h"
+
+namespace mongo {
+
+ namespace fts {
+
+ class FTSMatcher { // checks documents against the negated terms and phrases of a query
+ public:
+ FTSMatcher( const FTSQuery& query, const FTSSpec& spec );
+
+ /**
+ * @return true if obj has a negated term
+ */
+ bool hasNegativeTerm(const BSONObj& obj ) const;
+
+ /**
+ * @return true if obj is ok by all phrases
+ * so all full phrases and no negated
+ */
+ bool phrasesMatch( const BSONObj& obj ) const;
+
+ bool phraseMatch( const string& phrase, const BSONObj& obj ) const; // true if phrase occurs in a searched string of obj
+
+ bool matchesNonTerm( const BSONObj& obj ) const { // combines both checks: no negated term and all phrase rules hold
+ return !hasNegativeTerm( obj ) && phrasesMatch( obj );
+ }
+
+ private:
+ bool _hasNegativeTerm_recurse(const BSONObj& obj ) const; // wildcard variant of hasNegativeTerm
+
+ /**
+ * @return true if raw has a negated term
+ */
+ bool _hasNegativeTerm_string( const string& raw ) const;
+
+ bool _phraseRecurse( const string& phrase, const BSONObj& obj ) const; // wildcard variant of phraseMatch
+ bool _phraseMatches( const string& phrase, const string& haystack ) const; // case-insensitive substring test
+
+ FTSQuery _query; // held by value (copy of the constructor argument)
+ FTSSpec _spec; // held by value (copy of the constructor argument)
+ Stemmer _stemmer; // constructed from the query language
+ };
+
+ }
+}
diff --git a/src/mongo/db/fts/fts_matcher_test.cpp b/src/mongo/db/fts/fts_matcher_test.cpp
new file mode 100644
index 00000000000..15369980885
--- /dev/null
+++ b/src/mongo/db/fts/fts_matcher_test.cpp
@@ -0,0 +1,63 @@
+// fts_matcher_test.cpp
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "mongo/pch.h"
+
+#include "mongo/db/fts/fts_matcher.h"
+#include "mongo/unittest/unittest.h"
+
+namespace mongo {
+ namespace fts {
+
+ TEST( FTSMatcher, NegWild1 ) { // negated term must be found in a nested field under a "$**" key pattern
+ FTSQuery q;
+ q.parse( "foo -bar", "english" );
+ FTSMatcher m( q,
+ FTSSpec( FTSSpec::fixSpec( BSON( "key" << BSON( "$**" << "fts" ) ) ) ) );
+
+ ASSERT( m.hasNegativeTerm( BSON( "x" << BSON( "y" << "bar" ) ) ) );
+ ASSERT( m.hasNegativeTerm( BSON( "x" << BSON( "y" << "bar" ) ) ) ); // NOTE(review): identical to the line above - a different document may have been intended
+ }
+
+ TEST( FTSMatcher, Phrase1 ) { // phrase matching under a "$**" key pattern
+ FTSQuery q;
+ q.parse( "foo \"table top\"", "english" );
+ FTSMatcher m( q,
+ FTSSpec( FTSSpec::fixSpec( BSON( "key" << BSON( "$**" << "fts" ) ) ) ) );
+
+ ASSERT( m.phraseMatch( "table top", BSON( "x" << "table top" ) ) ); // whole value equals the phrase
+ ASSERT( m.phraseMatch( "table top", BSON( "x" << " asd table top asd" ) ) ); // phrase inside a longer string
+ ASSERT( !m.phraseMatch( "table top", BSON( "x" << "tablz top" ) ) ); // one character differs
+ ASSERT( !m.phraseMatch( "table top", BSON( "x" << " asd tablz top asd" ) ) );
+
+ ASSERT( m.phrasesMatch( BSON( "x" << "table top" ) ) ); // uses the phrase parsed from the query
+ ASSERT( !m.phrasesMatch( BSON( "x" << "table a top" ) ) ); // words present but not adjacent
+
+ }
+
+ TEST( FTSMatcher, Phrase2 ) { // phrase matching when the named field holds an array of strings
+ FTSQuery q;
+ q.parse( "foo \"table top\"", "english" );
+ FTSMatcher m( q,
+ FTSSpec( FTSSpec::fixSpec( BSON( "key" << BSON( "x" << "fts" ) ) ) ) );
+ ASSERT( m.phraseMatch( "table top",
+ BSON( "x" << BSON_ARRAY( "table top" ) ) ) );
+ }
+
+ }
+}
diff --git a/src/mongo/db/fts/fts_query.cpp b/src/mongo/db/fts/fts_query.cpp
new file mode 100644
index 00000000000..0f32ba1afad
--- /dev/null
+++ b/src/mongo/db/fts/fts_query.cpp
@@ -0,0 +1,173 @@
+// fts_query.cpp
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "mongo/pch.h"
+
+#include "mongo/db/fts/fts_query.h"
+#include "mongo/db/fts/tokenizer.h"
+#include "mongo/util/mongoutils/str.h"
+#include "mongo/util/stringutils.h"
+
+namespace mongo {
+
+ namespace fts {
+
+ using namespace mongoutils;
+
+ Status FTSQuery::parse(const string& query, const string& language) { // splits query into terms, negated terms, phrases and negated phrases
+ _search = query;
+ _language = language;
+
+ const StopWords* stopWords = StopWords::getStopWords( language );
+ Stemmer stemmer( language );
+
+ bool inNegation = false; // set by a '-' token that has previousWhiteSpace set
+ bool inPhrase = false; // true between an opening and a closing '"'
+
+ str::stream phrase; // words of the phrase currently being collected
+
+ Tokenizer i( _language, query );
+ while ( i.more() ) {
+ Token t = i.next();
+
+ if ( t.type == Token::TEXT ) {
+ string s = t.data.toString();
+
+ if ( inPhrase ) { // phrase words are joined with single spaces
+ if ( phrase.ss.len() > 0 )
+ phrase << ' ';
+ phrase << s;
+ }
+
+ if ( inPhrase && inNegation ) {
+ // words of a negated phrase are not added as terms
+ }
+ else {
+ _addTerm( stopWords, stemmer, s, inNegation );
+ }
+
+ if ( inNegation && !inPhrase ) // outside a phrase a negation covers one word only
+ inNegation = false;
+ }
+ else if ( t.type == Token::DELIMITER ) {
+ char c = t.data[0];
+ if ( c == '-' ) {
+ if ( t.previousWhiteSpace ) // a '-' without previousWhiteSpace does not negate
+ inNegation = true;
+ }
+ else if ( c == '"' ) {
+ if ( inPhrase ) {
+ // end of a phrase: store it lower-cased in the matching list
+ if ( inNegation )
+ _negatedPhrases.push_back( tolowerString( phrase ) );
+ else
+ _phrases.push_back( tolowerString( phrase ) );
+ inNegation = false;
+ inPhrase = false;
+ }
+ else {
+ // start of a phrase
+ inPhrase = true;
+ phrase.ss.reset();
+ }
+ }
+ }
+ else {
+ abort(); // any other token type is treated as fatal
+ }
+ }
+
+ return Status::OK(); // the only status returned; a phrase with no closing quote is never stored
+ }
+
+ void FTSQuery::_addTerm( const StopWords* sw, Stemmer& stemmer, const string& term, bool negated ) {
+ const string lowered = tolowerString( term ); // the stop-word test sees the lower-cased, unstemmed form
+ if ( sw->isStopWord( lowered ) )
+ return;
+ const string stemmed = stemmer.stem( lowered );
+ if ( !negated )
+ _terms.push_back( stemmed );
+ else
+ _negatedTerms.insert( stemmed );
+ }
+
+ namespace {
+ // writes the members of s to ss in set order, with sep between neighbours
+ void _debugHelp( stringstream& ss, const set<string>& s, const string& sep ) {
+ set<string>::const_iterator it = s.begin();
+ if ( it == s.end() )
+ return;
+ ss << *it;
+ for ( ++it; it != s.end(); ++it )
+ ss << sep << *it;
+ }
+
+ // vector flavour: goes through a set, so the output is sorted and free of duplicates
+ void _debugHelp( stringstream& ss, const vector<string>& v, const string& sep ) {
+ const set<string> sorted( v.begin(), v.end() );
+ _debugHelp( ss, sorted, sep );
+ }
+
+ void _debugHelp( stringstream& ss, const unordered_set<string>& v, const string& sep ) {
+ const set<string> sorted( v.begin(), v.end() ); // sorted copy gives a stable output order
+ _debugHelp( ss, sorted, sep );
+ }
+
+ }
+
+ string FTSQuery::toString() const { // multi-line, labelled dump of the parsed query
+ stringstream ss;
+ ss << "FTSQuery\n";
+
+ ss << " terms: ";
+ _debugHelp( ss, getTerms(), ", " ); // _debugHelp goes through a std::set: sorted, duplicates dropped
+ ss << "\n";
+
+ ss << " negated terms: ";
+ _debugHelp( ss, getNegatedTerms(), ", " );
+ ss << "\n";
+
+ ss << " phrases: ";
+ _debugHelp( ss, getPhr(), ", " );
+ ss << "\n";
+
+ ss << " negated phrases: ";
+ _debugHelp( ss, getNegatedPhr(), ", " );
+ ss << "\n";
+
+ return ss.str();
+ }
+
+ string FTSQuery::debugString() const {
+ // layout: terms||negated terms||phrases||negated phrases, members joined by "|"
+ const string memberSep( "|" );
+ const string groupSep( "||" );
+ stringstream out;
+
+ _debugHelp( out, _terms, memberSep );
+ out << groupSep;
+ _debugHelp( out, _negatedTerms, memberSep );
+ out << groupSep;
+ _debugHelp( out, _phrases, memberSep );
+ out << groupSep;
+ _debugHelp( out, _negatedPhrases, memberSep );
+
+ return out.str();
+ }
+ }
+}
diff --git a/src/mongo/db/fts/fts_query.h b/src/mongo/db/fts/fts_query.h
new file mode 100644
index 00000000000..7022760b3a7
--- /dev/null
+++ b/src/mongo/db/fts/fts_query.h
@@ -0,0 +1,80 @@
+// fts_query.h
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "mongo/base/status.h"
+#include "mongo/db/fts/stemmer.h"
+#include "mongo/db/fts/stop_words.h"
+#include "mongo/platform/unordered_set.h"
+#include "mongo/util/stringutils.h"
+
+namespace mongo {
+
+ namespace fts {
+
+ using std::string;
+ using std::vector;
+ using std::set;
+
+ class FTSQuery { // parsed form of a search string: terms, negated terms, phrases, negated phrases
+
+ public:
+ Status parse(const string& query, const string& language); // fills the members below from query
+
+ const vector<string>& getTerms() const { return _terms; }
+ const unordered_set<string>& getNegatedTerms() const { return _negatedTerms; }
+
+ const vector<string>& getPhr() const { return _phrases; }
+ const vector<string>& getNegatedPhr() const { return _negatedPhrases; }
+
+ /**
+ * @return true if there are any negated terms, phrases or negated phrases
+ */
+ bool hasNonTermPieces() const {
+ return
+ _negatedTerms.size() > 0 ||
+ _phrases.size() > 0 ||
+ _negatedPhrases.size() > 0;
+ }
+
+ string getSearch() const { return _search; } // the query string as passed to parse()
+ string getLanguage() const { return _language; } // the language as passed to parse()
+
+ string toString() const; // multi-line, labelled dump
+
+ string debugString() const; // compact one-line form, groups separated by "||"
+
+ protected:
+ string _search;
+ string _language;
+ vector<string> _terms; // lower-cased and stemmed, stop words removed
+ unordered_set<string> _negatedTerms; // lower-cased and stemmed, stop words removed
+ vector<string> _phrases; // lower-cased, not stemmed
+ vector<string> _negatedPhrases; // lower-cased, not stemmed
+
+ private:
+ void _addTerm( const StopWords* sw, Stemmer& stemmer, const string& term, bool negated ); // adds one word to _terms or _negatedTerms
+ };
+
+ }
+}
+
diff --git a/src/mongo/db/fts/fts_query_test.cpp b/src/mongo/db/fts/fts_query_test.cpp
new file mode 100644
index 00000000000..92bd6ee222a
--- /dev/null
+++ b/src/mongo/db/fts/fts_query_test.cpp
@@ -0,0 +1,73 @@
+// fts_query_test.cpp
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+#include "mongo/db/fts/fts_query.h"
+#include "mongo/unittest/unittest.h"
+
+namespace mongo {
+ namespace fts {
+
+ TEST( FTSQuery, Basic1 ) { // plain words: only "fun" is expected to remain as a term
+ FTSQuery q;
+ ASSERT( q.parse( "this is fun", "english" ).isOK() );
+
+ ASSERT_EQUALS( 1U, q.getTerms().size() ); // "this" and "is" are presumably dropped as stop words
+ ASSERT_EQUALS( "fun", q.getTerms()[0] );
+ ASSERT_EQUALS( 0U, q.getNegatedTerms().size() );
+ ASSERT_EQUALS( 0U, q.getPhr().size() );
+ ASSERT_EQUALS( 0U, q.getNegatedPhr().size() );
+ }
+
+ TEST( FTSQuery, Neg1 ) { // a word prefixed with '-' becomes a negated term
+ FTSQuery q;
+ ASSERT( q.parse( "this is -really fun", "english" ).isOK() );
+
+ ASSERT_EQUALS( 1U, q.getTerms().size() ); // the negation does not carry over to "fun"
+ ASSERT_EQUALS( "fun", q.getTerms()[0] );
+ ASSERT_EQUALS( 1U, q.getNegatedTerms().size() );
+ ASSERT_EQUALS( "realli", *q.getNegatedTerms().begin() ); // negated terms are stored stemmed
+ }
+
+ TEST( FTSQuery, Phrase1 ) { // a quoted phrase is stored as a phrase and its words are also terms
+ FTSQuery q;
+ ASSERT( q.parse( "doing a \"phrase test\" for fun", "english" ).isOK() );
+
+ ASSERT_EQUALS( 3U, q.getTerms().size() ); // fun, phrase, test (see debugString below)
+ ASSERT_EQUALS( 0U, q.getNegatedTerms().size() );
+ ASSERT_EQUALS( 1U, q.getPhr().size() );
+ ASSERT_EQUALS( 0U, q.getNegatedPhr().size() );
+
+ ASSERT_EQUALS( "phrase test", q.getPhr()[0] );
+ ASSERT_EQUALS( "fun|phrase|test||||phrase test||", q.debugString() ); // terms||negated terms||phrases||negated phrases
+ }
+
+ TEST( FTSQuery, NegPhrase1 ) { // a '-' before a quoted phrase makes it a negated phrase
+ FTSQuery q;
+ ASSERT( q.parse( "doing a -\"phrase test\" for fun", "english" ).isOK() );
+ ASSERT_EQUALS( "fun||||||phrase test", q.debugString() ); // words of the negated phrase are not added as terms
+ }
+
+ TEST( FTSQuery, Mix1 ) { // one phrase followed by two negated words
+ FTSQuery q;
+ ASSERT( q.parse( "\"industry\" -Melbourne -Physics", "english" ).isOK() );
+ ASSERT_EQUALS( "industri||melbourn|physic||industry||", q.debugString() ); // terms are stemmed and lower-cased, the phrase is only lower-cased
+ }
+
+ }
+}
diff --git a/src/mongo/db/fts/fts_search.cpp b/src/mongo/db/fts/fts_search.cpp
new file mode 100644
index 00000000000..5686cb89ffb
--- /dev/null
+++ b/src/mongo/db/fts/fts_search.cpp
@@ -0,0 +1,175 @@
+// fts_search.cpp
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "mongo/pch.h"
+
+#include "mongo/db/btreecursor.h"
+#include "mongo/db/fts/fts_index_format.h"
+#include "mongo/db/fts/fts_search.h"
+#include "mongo/db/kill_current_op.h"
+#include "mongo/db/pdfile.h"
+
+namespace mongo {
+
+ namespace fts {
+
+ /*
+ * Constructor stores the search inputs; builds a CoveredIndexMatcher when filter is non-empty
+ * @param ns, namespace details, handed to the btree cursors in go()
+ * @param id, index details; the type of its spec is cast to FTSIndex
+ * @param indexPrefix, passed to FTSIndexFormat::getIndexKey when the cursor bounds are built
+ * @param query, parsed query
+ * @param filter, filter object
+ */
+ FTSSearch::FTSSearch( NamespaceDetails* ns,
+ const IndexDetails& id,
+ const BSONObj& indexPrefix,
+ const FTSQuery& query,
+ const BSONObj& filter )
+ : _ns( ns ),
+ _id( id ),
+ _fts( static_cast<FTSIndex*>(_id.getSpec().getType()) ),
+ _indexPrefix( indexPrefix ),
+ _query( query ),
+ _ftsMatcher( query, static_cast<FTSIndex*>(_id.getSpec().getType())->getFtsSpec() ) {
+
+ if ( !filter.isEmpty() ) // with an empty filter _matcher stays unset
+ _matcher.reset( new CoveredIndexMatcher( filter, _fts->keyPattern() ) );
+
+ _keysLookedAt = 0;
+ _objectsLookedAt = 0;
+ }
+
+ bool FTSSearch::_ok( Record* record ) const {
+ // BSONObj::make is only reached when the query has negated terms or phrases
+ const bool termsOnly = !_query.hasNonTermPieces();
+ return termsOnly || _ftsMatcher.matchesNonTerm( BSONObj::make( record ) );
+ }
+
+ /*
+ * GO: sets the tree cursors on each term in terms, processes the terms by advancing
+ * the terms cursors and storing the partial
+ * results and lastly calculates the top results
+ * @param results, the priority queue receiving the top results
+ * @param limit, maximum number of results kept in the queue; 0 keeps none
+ */
+ void FTSSearch::go(Results* results, unsigned limit ) {
+ vector< shared_ptr<BtreeCursor> > cursors;
+
+ for ( unsigned i = 0; i < _query.getTerms().size(); i++ ) {
+ const string& term = _query.getTerms()[i];
+ BSONObj min = FTSIndexFormat::getIndexKey( MAX_WEIGHT, term, _indexPrefix );
+ BSONObj max = FTSIndexFormat::getIndexKey( 0, term, _indexPrefix );
+ shared_ptr<BtreeCursor> c( BtreeCursor::make( _ns, _id, min, max, true, -1 ) );
+ cursors.push_back( c );
+ }
+
+ while ( !inShutdown() ) {
+ bool gotAny = false;
+ for ( unsigned i = 0; i < cursors.size(); i++ ) {
+ if ( cursors[i]->eof() )
+ continue;
+ gotAny = true;
+ _process( cursors[i].get() );
+ cursors[i]->advance();
+ }
+
+ if ( !gotAny ) // every cursor is exhausted
+ break;
+
+ RARELY killCurrentOp.checkForInterrupt();
+ }
+
+
+ // priority queue using a compare that grabs the lowest of two ScoredLocations by score.
+ for ( Scores::iterator i = _scores.begin(); i != _scores.end(); ++i ) {
+
+ if ( i->second < 0 ) // rejected by the filter in _process
+ continue;
+
+ // priority queue
+ if ( results->size() < limit ) { // case a: queue unfilled
+
+ if ( !_ok( i->first ) )
+ continue;
+
+ results->push( ScoredLocation( i->first, i->second ) );
+
+ }
+ else if ( !results->empty() && i->second > results->top().score ) { // case b: queue filled; top() needs a non-empty queue (limit 0)
+
+ if ( !_ok( i->first ) )
+ continue;
+
+ results->pop();
+ results->push( ScoredLocation( i->first, i->second ) );
+ }
+ else {
+ // else do nothing (case c)
+ }
+
+ }
+
+ }
+
+ /*
+ * Takes a cursor and updates the partial score for said cursor in _scores map
+ * @param cursor, btree cursor pointing to the current document to be scored
+ */
+ void FTSSearch::_process( BtreeCursor* cursor ) {
+ _keysLookedAt++;
+ BSONObj key = cursor->currKey();
+
+ BSONObjIterator i( key );
+ // keys start with the leading extra fields (FTSIndexFormat test ExtraFront1): step over them.
+ // assumes one leading key field per _indexPrefix field, as in the cursor bounds - confirm
+ for ( int skip = _indexPrefix.nFields(); skip > 0 && i.more(); --skip )
+ i.next();
+ BSONElement indexToken = i.next();
+ BSONElement scoreElement = i.next();
+ double score = scoreElement.number();
+ double& cur = _scores[(cursor->currLoc()).rec()]; // a record not seen before starts at 0
+
+ if ( cur < 0 ) {
+ // already been rejected
+ return;
+ }
+
+ if ( cur == 0 && _matcher.get() ) {
+ // we haven't seen this before and we have a matcher
+ MatchDetails d;
+ if ( !_matcher->matchesCurrent( cursor, &d ) ) {
+ cur = -1;
+ }
+
+ if ( d.hasLoadedRecord() )
+ _objectsLookedAt++;
+
+ if ( cur == -1 )
+ return;
+ }
+
+ if ( cur )
+ cur += score * (1 + 1 / score); // algebraically score + 1 for a non-zero score
+ else
+ cur += score;
+ }
+
+ }
+
+}
diff --git a/src/mongo/db/fts/fts_search.h b/src/mongo/db/fts/fts_search.h
new file mode 100644
index 00000000000..82e5b66f3b2
--- /dev/null
+++ b/src/mongo/db/fts/fts_search.h
@@ -0,0 +1,103 @@
+// fts_search.h
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include <map>
+#include <set>
+#include <vector>
+#include <queue>
+
+#include "mongo/base/disallow_copying.h"
+#include "mongo/db/fts/fts_index.h"
+#include "mongo/db/fts/fts_matcher.h"
+#include "mongo/db/fts/fts_query.h"
+#include "mongo/db/fts/fts_util.h"
+#include "mongo/db/matcher.h"
+
+namespace mongo {
+
+ class BtreeCursor;
+
+ namespace fts {
+
+ // priority queue template, for use when we're populating results
+ // vector returned to the user. extends the default priority_queue
+ // by providing read access to the underlying container, which should
+ // be used CAREFULLY: elements are in heap order and n is not range checked.
+ template <class T, class S, class C>
+ class a_priority_queue : public std::priority_queue<T, S, C> {
+ public:
+ // return a copy of the element at position n of the underlying container
+ T operator[](const int &n) const { return this->c[n]; }
+ // return a copy of the underlying container. called dangerous because it is.
+ S dangerous() const { return this->c; }
+ };
+
+ typedef a_priority_queue<ScoredLocation, vector<ScoredLocation>, ScoredLocationComp> Results;
+
+ class FTSSearch { // runs one parsed query against one text index and collects scored records
+ MONGO_DISALLOW_COPYING(FTSSearch);
+ public:
+
+ typedef std::map<Record*,double> Scores; // accumulated score per record; negative marks a record rejected by the filter
+
+ FTSSearch( NamespaceDetails* ns,
+ const IndexDetails& id,
+ const BSONObj& indexPrefix,
+ const FTSQuery& query,
+ const BSONObj& filter );
+
+
+ void go(Results* results, unsigned limit ); // scans the index and fills results with at most limit entries
+
+ const FTSIndex * getIndex() const { return _fts; }
+
+ long long getKeysLookedAt() const { return _keysLookedAt; } // index keys visited by _process
+ long long getObjLookedAt() const { return _objectsLookedAt; } // records loaded while applying the filter
+
+ private:
+
+ void _process( BtreeCursor* cursor ); // adds the current key's score to _scores
+
+ /**
+ * checks the non-index pieces
+ * i.e. phrases & negated terms
+ */
+ bool _ok( Record* record ) const;
+
+ NamespaceDetails* _ns;
+ const IndexDetails& _id; // held by reference: the referenced object must outlive this search
+ FTSIndex* _fts;
+ BSONObj _indexPrefix;
+ FTSQuery _query;
+ FTSMatcher _ftsMatcher;
+
+ scoped_ptr<CoveredIndexMatcher> _matcher; // only set when a non-empty filter was given
+
+ long long _keysLookedAt;
+ long long _objectsLookedAt;
+
+ Scores _scores;
+
+ };
+
+ } // namespace fts
+
+} // namespace mongo
+
diff --git a/src/mongo/db/fts/fts_spec.cpp b/src/mongo/db/fts/fts_spec.cpp
new file mode 100644
index 00000000000..ab541b6a7f4
--- /dev/null
+++ b/src/mongo/db/fts/fts_spec.cpp
@@ -0,0 +1,395 @@
+// fts_spec.cpp
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "mongo/pch.h"
+
+#include "mongo/db/fts/fts_spec.h"
+#include "mongo/db/fts/fts_util.h"
+#include "mongo/util/mongoutils/str.h"
+
+namespace mongo {
+
+ namespace fts {
+
+ using namespace mongoutils;
+
+ const double MAX_WEIGHT = 1000000000.0;
+
+
+ /**
+  * Builds the spec from an index description. Reads "default_language",
+  * "language_override", "weights" and "key" out of indexInfo; empty
+  * language settings fall back to "english" / "language".
+  * Malformed input trips verify().
+  */
+ FTSSpec::FTSSpec( const BSONObj& indexInfo )
+     : _defaultLanguage( indexInfo["default_language"].valuestrsafe() ),
+       _languageOverrideField( indexInfo["language_override"].valuestrsafe() ),
+       _wildcard( false ) {
+
+     if ( _defaultLanguage.empty() )
+         _defaultLanguage = "english";
+     if ( _languageOverrideField.empty() )
+         _languageOverrideField = "language";
+
+     // weights: every entry must be numeric; the WILDCARD entry only
+     // switches on wildcard mode, all others land in _weights
+     for ( BSONObjIterator weightIt( indexInfo["weights"].Obj() ); weightIt.more(); ) {
+         const BSONElement entry = weightIt.next();
+         verify( entry.isNumber() );
+
+         if ( WILDCARD == entry.fieldName() ) {
+             _wildcard = true;
+             continue;
+         }
+
+         const double value = entry.number();
+         _weights[ entry.fieldName() ] = value;
+         verify( value > 0 && value < MAX_WEIGHT );
+     }
+     verify( _wildcard || _weights.size() );
+
+     // key pattern: fields listed before the _fts/_ftsx pair go to
+     // _extraBefore, fields listed after it go to _extraAfter
+     const BSONObj keyPattern = indexInfo["key"].Obj();
+     verify( keyPattern.nFields() >= 2 );
+
+     bool seenFtsPair = false;
+     for ( BSONObjIterator keyIt( keyPattern ); keyIt.more(); ) {
+         const BSONElement part = keyIt.next();
+         const bool isFtsField = str::equals( part.fieldName(), "_fts" ) ||
+                                 str::equals( part.fieldName(), "_ftsx" );
+         if ( isFtsField )
+             seenFtsPair = true;
+         else if ( seenFtsPair )
+             _extraAfter.push_back( part.fieldName() );
+         else
+             _extraBefore.push_back( part.fieldName() );
+     }
+ }
+
+ /**
+  * Looks up the weight configured for `field`.
+  * @param out  receives the weight; left untouched when the field is unknown
+  * @return true when `field` has an entry in the weights map
+  */
+ bool FTSSpec::weight( const StringData& field, double* out ) const {
+     const Weights::const_iterator found = _weights.find( field.toString() );
+     const bool present = ( found != _weights.end() );
+     if ( present )
+         *out = found->second;
+     return present;
+ }
+
+ /**
+  * Picks the language for one document: the value of the language
+  * override field when it is a non-empty string, otherwise the index's
+  * default language.
+  */
+ string FTSSpec::getLanguageToUse( const BSONObj& userDoc ) const {
+     const BSONElement langElem = userDoc[_languageOverrideField];
+     if ( langElem.type() != String )
+         return _defaultLanguage;
+
+     const char * requested = langElem.valuestrsafe();
+     if ( requested[0] == '\0' )
+         return _defaultLanguage;
+     return requested;
+ }
+
+
+ /*
+  * Calculates the score for all terms in a document of a collection.
+  * In wildcard mode every string in the document is scored; otherwise only
+  * the fields listed in the weights map are, each with its own weight.
+  * @param obj, the document in the collection being parsed
+  * @param term_freqs, map<string,double> to fill up
+  */
+ void FTSSpec::scoreDocument( const BSONObj& obj, TermFrequencyMap* term_freqs ) const {
+
+     const string language = getLanguageToUse( obj );
+
+     Stemmer stemmer( language );
+     Tools tools( language );
+     tools.stemmer = &stemmer;
+     tools.stopwords = StopWords::getStopWords( language );
+
+     if ( wildcard() ) {
+         // a * weight means we simply walk over every field
+         _scoreRecurse( tools, obj, term_freqs );
+         return;
+     }
+
+     // explicit weights: score each configured field with its own weight
+     for ( Weights::const_iterator w = _weights.begin(); w != _weights.end(); ++w ) {
+         const double fieldWeight = w->second;
+
+         // NOTE(review): getFieldDottedOrArray appears to take this pointer by
+         // reference and leave it at the not-yet-consumed rest of the dotted
+         // path when it stops at an array (the array branch relies on that)
+         // -- confirm against its declaration.
+         const char * remainingPath = w->first.c_str();
+         const BSONElement found = obj.getFieldDottedOrArray( remainingPath );
+
+         if ( found.type() == String ) {
+             _scoreString( tools, found.valuestr(), term_freqs, fieldWeight );
+             continue;
+         }
+
+         if ( found.type() != Array )
+             continue; // missing field or a type we do not score
+
+         for ( BSONObjIterator items( found.Obj() ); items.more(); ) {
+             BSONElement item = items.next();
+             if ( remainingPath[0] && item.isABSONObj() )
+                 item = item.Obj().getFieldDotted( remainingPath );
+             if ( item.type() == String )
+                 _scoreString( tools, item.valuestr(), term_freqs, fieldWeight );
+         }
+     }
+ }
+
+
+ /*
+  * Recurses over all fields of an obj (document in collection)
+  * and fills term,score map term_freqs. The language override field is
+  * skipped; a string field uses its configured weight, or 1 if it has none.
+  * @param tools, language, stemmer and stop words to score with
+  * @param obj, object being parsed
+  * @param term_freqs, map <term,score> to be filled up
+  */
+ void FTSSpec::_scoreRecurse(const Tools& tools,
+                             const BSONObj& obj,
+                             TermFrequencyMap* term_freqs ) const {
+     for ( BSONObjIterator fields( obj ); fields.more(); ) {
+         const BSONElement field = fields.next();
+
+         if ( languageOverrideField() == field.fieldName() )
+             continue;
+
+         if ( field.type() == String ) {
+             double fieldWeight = 1;
+             weight( field.fieldName(), &fieldWeight ); // keeps 1 when unknown
+             _scoreString( tools, field.valuestr(), term_freqs, fieldWeight );
+             continue;
+         }
+
+         if ( field.isABSONObj() )
+             _scoreRecurse( tools, field.Obj(), term_freqs );
+     }
+ }
+
+ namespace {
+     // Per-term accumulator used by _scoreString while it walks one string.
+     struct ScoreHelperStruct {
+         ScoreHelperStruct()
+             : freq( 0 ),
+               count( 0 ),
+               exp( 0 ) {
+         }
+         double freq;  // sum of 1/exp over the occurrences seen so far
+         double count; // number of occurrences seen so far
+         double exp;   // 1 for the first occurrence, doubled for each further one
+     };
+     typedef unordered_map<string,ScoreHelperStruct> ScoreHelperMap;
+ }
+
+ /**
+  * Tokenizes `raw`, then adds the score of every surviving term to docScores.
+  * Each TEXT token is lower-cased and stemmed, and dropped if the stemmed
+  * form is a stop word. Repeated terms contribute 1, 1/2, 1/4, ... so
+  * repetition helps with diminishing returns.
+  * NOTE(review): the stop-word test runs on the STEMMED term -- confirm
+  * that the stop-word lists are meant to be matched that way.
+  */
+ void FTSSpec::_scoreString( const Tools& tools,
+                             const StringData& raw,
+                             TermFrequencyMap* docScores,
+                             double weight ) const {
+
+     ScoreHelperMap perTerm;
+     unsigned keptTokens = 0;
+
+     for ( Tokenizer tokens( tools.language, raw ); tokens.more(); ) {
+         const Token tok = tokens.next();
+         if ( tok.type != Token::TEXT )
+             continue;
+
+         string term = tok.data.toString();
+         makeLower( &term );
+         term = tools.stemmer->stem( term );
+         if ( tools.stopwords->isStopWord( term ) )
+             continue;
+
+         ScoreHelperStruct& acc = perTerm[term];
+
+         acc.exp = acc.exp ? acc.exp * 2 : 1;
+         acc.count += 1;
+         acc.freq += ( 1 / acc.exp );
+
+         ++keptTokens;
+     }
+
+     for ( ScoreHelperMap::const_iterator it = perTerm.begin(); it != perTerm.end(); ++it ) {
+
+         const string& term = it->first;
+         const ScoreHelperStruct& acc = it->second;
+
+         // scale by how much of the field this term makes up: is it the
+         // only word, a frequently occurring term, or a single hit in a
+         // long block of text? the result lies in (0.5, 1]
+         double coeff = ( 0.5 * acc.count / keptTokens ) + 0.5;
+
+         // small boost when the term is the entire (untokenized) field
+         double adjustment = 1;
+         const bool wholeField = raw.size() == term.length() &&
+                                 raw.equalCaseInsensitive( term );
+         if ( wholeField )
+             adjustment += 0.1;
+
+         double& score = (*docScores)[term];
+         score += ( weight * acc.freq * coeff * adjustment );
+         verify( score <= MAX_WEIGHT );
+     }
+ }
+
+ /**
+  * Given a query, pulls out the pieces (in order) that go in the index
+  * before the text part: one element per "extra before" field.
+  * @param query  the filter to take the values from
+  * @param out    receives the prefix object; left untouched on error
+  * @return BadValue when an "extra before" field is missing from the
+  *         query or is not a plain equality, OK otherwise
+  */
+ Status FTSSpec::getIndexPrefix( const BSONObj& query, BSONObj* out ) const {
+     if ( numExtraBefore() == 0 ) {
+         *out = BSONObj();
+         return Status::OK();
+     }
+
+     BSONObjBuilder b;
+     for ( unsigned i = 0; i < numExtraBefore(); i++ ) {
+         BSONElement e = query.getFieldDotted(extraBefore(i));
+
+         // an object whose first field is a comparison operator (e.g. $gt)
+         // is a range, not an equality
+         const bool usable = !e.eoo() &&
+             !( e.isABSONObj() && e.Obj().firstElement().getGtLtOp( -1 ) != -1 );
+
+         if ( !usable )
+             return Status( ErrorCodes::BadValue,
+                            str::stream()
+                            << "need to have an equality filter on: "
+                            << extraBefore(i) );
+
+         b.append( e );
+     }
+     *out = b.obj();
+     return Status::OK();
+ }
+
+ void _addFTSStuff( BSONObjBuilder* b ) { // appends the two synthetic key-pattern fields of a text index
+ b->append( "_fts", INDEX_NAME );
+ b->append( "_ftsx", 1 );
+ }
+
+ BSONObj FTSSpec::fixSpec( const BSONObj& spec ) { // rewrites a user index spec into the form the FTSSpec constructor reads
+ map<string,int> m; // field name -> weight, filled from the key pattern and from "weights"
+
+ BSONObj keyPattern;
+ {
+ BSONObjBuilder b;
+ bool addedFtsStuff = false;
+
+ BSONObjIterator i( spec["key"].Obj() );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ if ( str::equals( e.fieldName(), "_fts" ) ||
+ str::equals( e.fieldName(), "_ftsx" ) ) {
+ continue; // dropped here; re-added once by _addFTSStuff
+ }
+ else if ( e.type() == String &&
+ ( str::equals( "fts", e.valuestr() ) ||
+ str::equals( "text", e.valuestr() ) ) ) {
+
+ if ( !addedFtsStuff ) { // the first text field fixes where _fts/_ftsx sit in the key
+ _addFTSStuff( &b );
+ addedFtsStuff = true;
+ }
+
+ m[e.fieldName()] = 1; // text field gets weight 1 unless "weights" overrides it below
+ }
+ else {
+ b.append( e ); // ordinary key field: kept in place
+ }
+ }
+
+ if ( !addedFtsStuff )
+ _addFTSStuff( &b );
+
+ keyPattern = b.obj();
+ }
+
+ if ( spec["weights"].isABSONObj() ) {
+ BSONObjIterator i( spec["weights"].Obj() );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ m[e.fieldName()] = e.numberInt(); // NOTE(review): stored as int, so a fractional weight below 1 presumably becomes 0 -- confirm numberInt() semantics
+ }
+ }
+ else if ( spec["weights"].str() == WILDCARD ) {
+ m[WILDCARD] = 1;
+ }
+
+ BSONObj weights;
+ {
+ BSONObjBuilder b;
+ for ( map<string,int>::iterator i = m.begin(); i != m.end(); ++i )
+ b.append( i->first, i->second );
+ weights = b.obj();
+ }
+
+ string default_language(spec.getStringField("default_language"));
+ if ( default_language.empty() )
+ default_language = "english";
+
+ string language_override(spec.getStringField("language_override"));
+ if ( language_override.empty() )
+ language_override = "language";
+
+ int version = 0;
+
+ BSONObjBuilder b;
+ BSONObjIterator i( spec );
+ while ( i.more() ) { // copy the spec field by field, swapping in the normalized values at their original position
+ BSONElement e = i.next();
+ if ( str::equals( e.fieldName(), "key" ) ) {
+ b.append( "key", keyPattern );
+ }
+ else if ( str::equals( e.fieldName(), "weights" ) ) {
+ b.append( "weights", weights );
+ weights = BSONObj(); // emptied to mark "already written"
+ }
+ else if ( str::equals( e.fieldName(), "default_language" ) ) {
+ b.append( "default_language", default_language);
+ default_language = ""; // emptied to mark "already written"
+ }
+ else if ( str::equals( e.fieldName(), "language_override" ) ) {
+ b.append( "language_override", language_override);
+ language_override = ""; // emptied to mark "already written"
+ }
+ else if ( str::equals( e.fieldName(), "v" ) ) {
+ version = e.numberInt(); // remembered only; "v" is always written last
+ }
+ else {
+ b.append( e );
+ }
+ }
+
+ if ( !weights.isEmpty() ) // anything still set was absent from the input spec: append it now
+ b.append( "weights", weights );
+ if ( !default_language.empty() )
+ b.append( "default_language", default_language);
+ if ( !language_override.empty() )
+ b.append( "language_override", language_override);
+
+ b.append( "v", version );
+
+ return b.obj();
+
+ }
+
+ }
+}
diff --git a/src/mongo/db/fts/fts_spec.h b/src/mongo/db/fts/fts_spec.h
new file mode 100644
index 00000000000..e3ebf24f76b
--- /dev/null
+++ b/src/mongo/db/fts/fts_spec.h
@@ -0,0 +1,108 @@
+// fts_spec.h
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include <map>
+#include <vector>
+#include <string>
+
+#include "mongo/db/fts/fts_util.h"
+#include "mongo/db/fts/stemmer.h"
+#include "mongo/db/fts/stop_words.h"
+#include "mongo/db/fts/tokenizer.h"
+#include "mongo/platform/unordered_map.h"
+
+namespace mongo {
+
+ namespace fts {
+
+ extern const double MAX_WEIGHT;
+
+ typedef std::map<string,double> Weights; // TODO cool map
+
+ typedef unordered_map<string,double> TermFrequencyMap;
+
+
+ /**
+  * Parsed form of a text index description: languages, per-field weights
+  * and the non-text key fields that surround the text part of the key.
+  */
+ class FTSSpec {
+
+     // Everything needed to score strings in one language.
+     struct Tools {
+         Tools( string language )
+             : language( language ), stemmer( 0 ), stopwords( 0 ) {}
+         // Owned copy. This used to be a reference bound to the by-value
+         // constructor parameter, which dangled once the constructor returned.
+         const std::string language;
+         const Stemmer* stemmer;     // not owned; set by the caller
+         const StopWords* stopwords; // not owned; set by the caller
+     };
+
+ public:
+     FTSSpec( const BSONObj& indexInfo );
+
+     // true when the wildcard entry was present in the weights
+     bool wildcard() const { return _wildcard; }
+     const string& defaultLanguage() const { return _defaultLanguage; }
+     const string& languageOverrideField() const { return _languageOverrideField; }
+
+     // key-pattern fields listed before the text part; i is not range checked
+     size_t numExtraBefore() const { return _extraBefore.size(); }
+     const std::string& extraBefore( unsigned i ) const { return _extraBefore[i]; }
+
+     // key-pattern fields listed after the text part; i is not range checked
+     size_t numExtraAfter() const { return _extraAfter.size(); }
+     const std::string& extraAfter( unsigned i ) const { return _extraAfter[i]; }
+
+     /**
+      * @return the document's own language when its override field holds
+      *         a non-empty string, the default language otherwise
+      */
+     string getLanguageToUse( const BSONObj& userDoc ) const;
+
+     /**
+      * adds the score of every term found in obj to *scores
+      */
+     void scoreDocument( const BSONObj& obj, TermFrequencyMap* scores ) const;
+
+     /**
+      * given a query, pulls out the pieces (in order) that go in the index first
+      */
+     Status getIndexPrefix( const BSONObj& filter, BSONObj* out ) const;
+
+     const Weights& weights() const { return _weights; }
+
+     /**
+      * @param out - untouched if field isn't present
+      * @return if field is here
+      */
+     bool weight( const StringData& field, double* out ) const;
+
+     /**
+      * @return a normalized copy of a user supplied index spec
+      */
+     static BSONObj fixSpec( const BSONObj& spec );
+ private:
+     void _scoreRecurse(const Tools& tools,
+                        const BSONObj& obj,
+                        TermFrequencyMap* term_freqs ) const;
+
+     void _scoreString( const Tools& tools,
+                        const StringData& raw,
+                        TermFrequencyMap* term_freqs,
+                        double weight ) const;
+
+     string _defaultLanguage;
+     string _languageOverrideField;
+     bool _wildcard;
+
+     // _weights stores a mapping between the fields and the value as a double
+     // basically, how much should an occurrence of (query term) in (field) be worth
+     Weights _weights;
+
+     // other fields to index
+     std::vector<string> _extraBefore;
+     std::vector<string> _extraAfter;
+ };
+
+ }
+}
diff --git a/src/mongo/db/fts/fts_spec_test.cpp b/src/mongo/db/fts/fts_spec_test.cpp
new file mode 100644
index 00000000000..541bd4a56d8
--- /dev/null
+++ b/src/mongo/db/fts/fts_spec_test.cpp
@@ -0,0 +1,139 @@
+// fts_spec_test.cpp
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "mongo/pch.h"
+
+#include "mongo/db/fts/fts_spec.h"
+#include "mongo/unittest/unittest.h"
+
+namespace mongo {
+ namespace fts {
+
+ // fixSpec() is idempotent: fixing an already fixed spec changes nothing.
+ TEST( FTSSpec, Fix1 ) {
+     const BSONObj rawSpec = BSON( "key" << BSON( "title" << "fts" <<
+                                                  "text" << "fts" ) <<
+                                   "weights" << BSON( "title" << 10 ) );
+
+     const BSONObj once = FTSSpec::fixSpec( rawSpec );
+     const BSONObj twice = FTSSpec::fixSpec( once );
+     ASSERT_EQUALS( once, twice );
+ }
+
+ // Three distinct words in one field all get the same, positive score.
+ TEST( FTSSpec, ScoreSingleField1 ) {
+     const BSONObj rawSpec = BSON( "key" << BSON( "title" << "fts" <<
+                                                  "text" << "fts" ) <<
+                                   "weights" << BSON( "title" << 10 ) );
+
+     FTSSpec spec( FTSSpec::fixSpec( rawSpec ) );
+
+     TermFrequencyMap scores;
+     spec.scoreDocument( BSON( "title" << "cat sat run" ), &scores );
+     ASSERT_EQUALS( 3U, scores.size() );
+     ASSERT_EQUALS( scores["cat"], scores["sat"] );
+     ASSERT_EQUALS( scores["cat"], scores["run"] );
+     ASSERT( scores["cat"] > 0 );
+ }
+
+ // "title" is weighted 10 and "text" gets the implicit weight 1, so a word
+ // in both fields beats a title-only word, which beats a text-only word.
+ TEST( FTSSpec, ScoreMultipleField1 ) {
+     const BSONObj rawSpec = BSON( "key" << BSON( "title" << "fts" <<
+                                                  "text" << "fts" ) <<
+                                   "weights" << BSON( "title" << 10 ) );
+
+     FTSSpec spec( FTSSpec::fixSpec( rawSpec ) );
+
+     TermFrequencyMap scores;
+     spec.scoreDocument( BSON( "title" << "cat sat run"
+                               << "text" << "cat book" ),
+                         &scores );
+
+     ASSERT_EQUALS( 4U, scores.size() );
+     ASSERT_EQUALS( scores["sat"], scores["run"] );
+     ASSERT( scores["sat"] > 0 );
+
+     ASSERT( scores["cat"] > scores["sat"] );
+     ASSERT( scores["cat"] > scores["book"] );
+     ASSERT( scores["book"] > 0 );
+     ASSERT( scores["book"] < scores["sat"] );
+ }
+
+
+ // A word repeated more often in the same field scores strictly higher.
+ TEST( FTSSpec, ScoreRepeatWord ) {
+     const BSONObj rawSpec = BSON( "key" << BSON( "title" << "fts" <<
+                                                  "text" << "fts" ) <<
+                                   "weights" << BSON( "title" << 10 ) );
+
+     FTSSpec spec( FTSSpec::fixSpec( rawSpec ) );
+
+     TermFrequencyMap scores;
+     spec.scoreDocument( BSON( "title" << "cat sat sat run run run" ), &scores );
+     ASSERT_EQUALS( 3U, scores.size() );
+     ASSERT( scores["cat"] > 0 );           // once
+     ASSERT( scores["sat"] > scores["cat"] ); // twice
+     ASSERT( scores["run"] > scores["sat"] ); // three times
+ }
+
+ // A key with only a text field has no extra fields on either side.
+ TEST( FTSSpec, Extra1 ) {
+     const BSONObj rawSpec = BSON( "key" << BSON( "data" << "fts" ) );
+     FTSSpec spec( FTSSpec::fixSpec( rawSpec ) );
+     ASSERT_EQUALS( 0U, spec.numExtraBefore() );
+     ASSERT_EQUALS( 0U, spec.numExtraAfter() );
+ }
+
+ // A plain key field listed after the text field is an "extra after".
+ TEST( FTSSpec, Extra2 ) {
+     const BSONObj rawSpec = BSON( "key" << BSON( "data" << "fts" << "x" << 1 ) );
+     FTSSpec spec( FTSSpec::fixSpec( rawSpec ) );
+     ASSERT_EQUALS( 0U, spec.numExtraBefore() );
+     ASSERT_EQUALS( 1U, spec.numExtraAfter() );
+     ASSERT_EQUALS( StringData("x"), spec.extraAfter(0) );
+ }
+
+ // A plain key field listed before the text field is an "extra before",
+ // and getIndexPrefix() demands an equality on it in the query.
+ TEST( FTSSpec, Extra3 ) {
+     const BSONObj rawSpec = BSON( "key" << BSON( "x" << 1 << "data" << "fts" ) );
+     const BSONObj normalized = FTSSpec::fixSpec( rawSpec );
+
+     // the text field is replaced by the _fts/_ftsx pair, in place
+     ASSERT_EQUALS( BSON( "x" << 1 <<
+                          "_fts" << "text" <<
+                          "_ftsx" << 1 ),
+                    normalized["key"].Obj() );
+     ASSERT_EQUALS( BSON( "data" << 1 ),
+                    normalized["weights"].Obj() );
+
+     const BSONObj normalizedAgain = FTSSpec::fixSpec( normalized );
+     ASSERT_EQUALS( normalized, normalizedAgain );
+
+     FTSSpec spec( normalized );
+     ASSERT_EQUALS( 1U, spec.numExtraBefore() );
+     ASSERT_EQUALS( StringData("x"), spec.extraBefore(0) );
+     ASSERT_EQUALS( 0U, spec.numExtraAfter() );
+
+     BSONObj prefix;
+
+     // equality on x: accepted, unrelated fields are ignored
+     ASSERT( spec.getIndexPrefix( BSON( "x" << 2 ), &prefix ).isOK() );
+     ASSERT_EQUALS( BSON( "x" << 2 ), prefix );
+
+     ASSERT( spec.getIndexPrefix( BSON( "x" << 3 << "y" << 4 ), &prefix ).isOK() );
+     ASSERT_EQUALS( BSON( "x" << 3 ), prefix );
+
+     // range on x, or no x at all: rejected
+     ASSERT( !spec.getIndexPrefix( BSON( "x" << BSON( "$gt" << 5 ) ), &prefix ).isOK() );
+     ASSERT( !spec.getIndexPrefix( BSON( "y" << 4 ), &prefix ).isOK() );
+     ASSERT( !spec.getIndexPrefix( BSONObj(), &prefix ).isOK() );
+ }
+
+ }
+}
diff --git a/src/mongo/db/fts/fts_util.cpp b/src/mongo/db/fts/fts_util.cpp
new file mode 100644
index 00000000000..ace11b67409
--- /dev/null
+++ b/src/mongo/db/fts/fts_util.cpp
@@ -0,0 +1,30 @@
+// fts_util.cpp
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "mongo/db/fts/fts_util.h"
+
+namespace mongo {
+
+ namespace fts {
+
+ // value stored under "_fts" in the key pattern of a text index
+ const std::string INDEX_NAME( "text" );
+ // field name in "weights" that selects every field of a document
+ const std::string WILDCARD( "$**" );
+
+ }
+}
+
diff --git a/src/mongo/db/fts/fts_util.h b/src/mongo/db/fts/fts_util.h
new file mode 100644
index 00000000000..3df5a0c5ee2
--- /dev/null
+++ b/src/mongo/db/fts/fts_util.h
@@ -0,0 +1,112 @@
+// fts_util.h
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include <string>
+
+#include "mongo/db/hasher.h"
+#include "mongo/db/jsobj.h"
+#include "mongo/db/record.h"
+#include "mongo/util/unordered_fast_key_table.h"
+
+namespace mongo {
+
+ namespace fts {
+
+ extern const std::string WILDCARD;
+ extern const std::string INDEX_NAME;
+
+ /**
+  * Lower-cases *s in place, one byte at a time, using tolower().
+  * destructive!
+  */
+ inline void makeLower( std::string* s ) {
+     std::string::size_type sz = s->size();
+     for ( std::string::size_type i = 0; i < sz; i++ ) {
+         // tolower() is only defined for values representable as unsigned
+         // char (or EOF). A plain char holding a byte >= 0x80, as found in
+         // UTF-8 text, can be negative, so convert before the call.
+         const unsigned char byte = static_cast<unsigned char>( (*s)[i] );
+         (*s)[i] = static_cast<char>( tolower( byte ) );
+     }
+ }
+
+ /*
+  * ScoredLocation stores the total score for a document (record *) wrt a search
+  *
+  */
+ struct ScoredLocation {
+     ScoredLocation( Record* r, double sc )
+         : rec( r ),
+           score( sc ) {
+     }
+
+     Record* rec;
+     double score;
+
+     // Orders by DESCENDING score; equal scores fall back to comparing
+     // the record pointers.
+     bool operator<( const ScoredLocation& other ) const {
+         const bool scoresHigher = other.score < score;
+         const bool scoresLower = other.score > score;
+         if ( scoresHigher || scoresLower )
+             return scoresHigher;
+         return rec < other.rec;
+     }
+ };
+
+ // Comparison functor based on score only: true when lhs scores higher
+ // than rhs. Record pointers are not taken into account.
+ class ScoredLocationComp {
+ public:
+     bool operator() (const ScoredLocation& lhs, const ScoredLocation& rhs) const {
+         const bool lhsScoresHigher = lhs.score > rhs.score;
+         return lhsScoresHigher;
+     }
+ };
+
+ // Hash functor for BSONElement keys: BSONElementHasher::hash64 with seed 17.
+ struct _be_hash {
+     size_t operator()( const BSONElement& e ) const {
+         const int seed = 17;
+         return static_cast<size_t>( BSONElementHasher::hash64( e, seed ) );
+     }
+ };
+
+ // Equality functor for BSONElement keys; defers to BSONElement's operator==.
+ struct _be_equals {
+     bool operator()( const BSONElement& a, const BSONElement& b ) const {
+         const bool same = ( a == b );
+         return same;
+     }
+ };
+
+ // Turns a stored BSONObj back into an element: a BSONElement built from
+ // the raw data of the object's first element.
+ struct _be_convert {
+     BSONElement operator()( const BSONObj& o ) const {
+         const BSONElement& first = o.firstElement();
+         return BSONElement( first.rawdata() );
+     }
+ };
+
+ // Turns an element into the BSONObj form via BSONElement::wrap().
+ struct _be_convert_other {
+     BSONObj operator()( const BSONElement& e ) const {
+         const BSONObj wrapped = e.wrap();
+         return wrapped;
+     }
+ };
+
+ template< typename V > // V: mapped value type
+ class BSONElementMap : public UnorderedFastKeyTable<BSONElement,
+ BSONObj,
+ V,
+ _be_hash,
+ _be_equals,
+ _be_convert,
+ _be_convert_other > {
+ }; // adds nothing of its own: an UnorderedFastKeyTable wired up with the _be_* functors above
+
+
+ }
+}
+
diff --git a/src/mongo/db/fts/fts_util_test.cpp b/src/mongo/db/fts/fts_util_test.cpp
new file mode 100644
index 00000000000..7d959dca08a
--- /dev/null
+++ b/src/mongo/db/fts/fts_util_test.cpp
@@ -0,0 +1,36 @@
+// fts_util_test.cpp
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+#include "mongo/unittest/unittest.h"
+
+#include "mongo/db/fts/fts_util.h"
+
+namespace mongo {
+ namespace fts {
+
+ // A value stored under an element can be read back through the same element.
+ TEST( BSONElementMap, Simple1 ) {
+     BSONElementMap<double> table;
+
+     const BSONObj doc = BSON( "x" << 5 );
+     table[doc.firstElement()] = 5;
+     ASSERT_EQUALS( 5, table[doc.firstElement()] );
+ }
+
+ }
+}
diff --git a/src/mongo/db/fts/generate_stop_words.py b/src/mongo/db/fts/generate_stop_words.py
new file mode 100644
index 00000000000..5010fe702a5
--- /dev/null
+++ b/src/mongo/db/fts/generate_stop_words.py
@@ -0,0 +1,56 @@
+import sys
+
+def generate( header, source, language_files ):
+ print( "header: %s" % header )
+ print( "source: %s" % source )
+ print( "language_files:" )
+ for x in language_files:
+ print( "\t%s" % x )
+
+ out = open( header, "wb" )
+ out.write( """
+#pragma once
+#include <map>
+#include <set>
+#include <string>
+namespace mongo {
+namespace fts {
+
+ void loadStopWordMap( std::map< std::string, std::set< std::string > >* m );
+}
+}
+""" )
+ out.close()
+
+
+
+ out = open( source, "wb" )
+ out.write( '#include "%s"' % header.rpartition( "/" )[2].rpartition( "\\" )[2] )
+ out.write( """
+namespace mongo {
+namespace fts {
+
+ void loadStopWordMap( std::map< std::string, std::set< std::string > >* m ) {
+
+""" )
+
+ for l_file in language_files:
+ l = l_file.rpartition( "_" )[2].partition( "." )[0]
+
+ out.write( ' // %s\n' % l_file )
+ out.write( ' {\n' )
+ out.write( ' std::set< std::string >& l = (*m)["%s"];\n' % l )
+ for word in open( l_file, "rb" ):
+ out.write( ' l.insert( "%s" );\n' % word.strip() )
+ out.write( ' }\n' )
+ out.write( """
+ }
+} // namespace fts
+} // namespace mongo
+""" )
+
+
+if __name__ == "__main__":
+ generate( sys.argv[ len(sys.argv) - 2],
+ sys.argv[ len(sys.argv) - 1],
+ sys.argv[1:-2] )
diff --git a/src/mongo/db/fts/stemmer.cpp b/src/mongo/db/fts/stemmer.cpp
new file mode 100644
index 00000000000..c04d05c87ca
--- /dev/null
+++ b/src/mongo/db/fts/stemmer.cpp
@@ -0,0 +1,58 @@
+// stemmer.cpp
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <string>
+
+#include "mongo/db/fts/stemmer.h"
+
+namespace mongo {
+
+ namespace fts {
+
+ // Creates a UTF-8 libstemmer handle for `language`. The special name
+ // "none" creates no handle, which makes stem() a pass-through.
+ Stemmer::Stemmer( const string& language )
+     : _stemmer( language == "none"
+                 ? NULL
+                 : sb_stemmer_new(language.c_str(), "UTF_8") ) {
+ }
+
+ // Releases the libstemmer handle, if one was created.
+ Stemmer::~Stemmer() {
+     if ( !_stemmer )
+         return;
+     sb_stemmer_delete(_stemmer);
+     _stemmer = NULL;
+ }
+
+ // Returns the stem of `word`, or `word` unchanged when this object has
+ // no stemmer handle. Aborts the process if libstemmer returns NULL.
+ string Stemmer::stem( const StringData& word ) const {
+     if ( !_stemmer )
+         return word.toString();
+
+     const sb_symbol* input = (const sb_symbol*)word.rawData();
+     const sb_symbol* stemmed = sb_stemmer_stem( _stemmer, input, word.size() );
+
+     if ( stemmed == NULL ) {
+         // out of memory
+         abort();
+     }
+
+     // the result's length has to be asked for separately
+     return string( (const char*)(stemmed), sb_stemmer_length( _stemmer ) );
+ }
+
+ }
+
+}
diff --git a/src/mongo/db/fts/stemmer.h b/src/mongo/db/fts/stemmer.h
new file mode 100644
index 00000000000..d212cc01fce
--- /dev/null
+++ b/src/mongo/db/fts/stemmer.h
@@ -0,0 +1,48 @@
+// stemmer.h
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+#pragma once
+
+#include <string>
+
+#include "libstemmer.h"
+
+#include "mongo/base/string_data.h"
+
+namespace mongo {
+
+ namespace fts {
+
+ /**
+  * Wrapper around a libstemmer handle for one language.
+  * maintains case
+  * but works
+  * running/Running -> run/Run
+  *
+  * Owns the handle, so objects of this class cannot be copied.
+  */
+ class Stemmer {
+ public:
+     Stemmer( const std::string& language );
+     ~Stemmer();
+
+     // returns the stemmed form of word
+     std::string stem( const StringData& word ) const;
+ private:
+     // Declared but never defined: a copy would share _stemmer and both
+     // destructors would then free the same handle.
+     Stemmer( const Stemmer& );
+     Stemmer& operator=( const Stemmer& );
+
+     struct sb_stemmer* _stemmer;
+ };
+ }
+}
+
diff --git a/src/mongo/db/fts/stemmer_test.cpp b/src/mongo/db/fts/stemmer_test.cpp
new file mode 100644
index 00000000000..808b8141a64
--- /dev/null
+++ b/src/mongo/db/fts/stemmer_test.cpp
@@ -0,0 +1,42 @@
+// stemmer_test.cpp
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+#include "mongo/unittest/unittest.h"
+
+#include "mongo/db/fts/stemmer.h"
+
+namespace mongo {
+ namespace fts {
+
+ // The english stemmer strips "-ning" and keeps the case of the input.
+ TEST( English, Stemmer1 ) {
+     Stemmer english( "english" );
+     ASSERT_EQUALS( "run", english.stem( "running" ) );
+     ASSERT_EQUALS( "Run", english.stem( "Running" ) );
+ }
+
+
+ // With the porter stemmer a capitalised word stems differently from
+ // its lower-case form.
+ TEST( English, Caps ) {
+     Stemmer porter( "porter" );
+     ASSERT_EQUALS( "unit", porter.stem( "united" ) );
+     ASSERT_EQUALS( "Unite", porter.stem( "United" ) );
+ }
+
+
+ }
+}
diff --git a/src/mongo/db/fts/stop_words.cpp b/src/mongo/db/fts/stop_words.cpp
new file mode 100644
index 00000000000..0d664caf1bf
--- /dev/null
+++ b/src/mongo/db/fts/stop_words.cpp
@@ -0,0 +1,73 @@
+// stop_words.cpp
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <map>
+#include <set>
+#include <string>
+
+#include "mongo/db/fts/stop_words.h"
+
+#include "mongo/base/init.h"
+#include "mongo/platform/unordered_map.h"
+
+
+
+namespace mongo {
+
+ namespace fts {
+
+ void loadStopWordMap( std::map< std::string, std::set< std::string > >* m );
+
+ namespace {
+     // language name -> stop word set; filled once by the StopWords initializer
+     unordered_map<string,StopWords*> STOP_WORDS;
+     // returned for languages without a list; created by the same initializer
+     StopWords* empty = NULL;
+ }
+
+
+ // An empty set: no word is a stop word.
+ StopWords::StopWords() {}
+
+ // Copies every word of `words` into the internal hash set.
+ StopWords::StopWords( const std::set<std::string>& words ) {
+     _words.insert( words.begin(), words.end() );
+ }
+
+ // Returns the stop word set registered for `language`, or the shared
+ // empty set when the language has none.
+ const StopWords* StopWords::getStopWords( const std::string& language ) {
+     typedef unordered_map<string,StopWords*>::const_iterator Iter;
+     const Iter found = STOP_WORDS.find( language );
+     if ( found != STOP_WORDS.end() )
+         return found->second;
+     return empty;
+ }
+
+
+ // Startup hook: creates the shared empty set and one StopWords object
+ // per language returned by loadStopWordMap(). The objects are never freed.
+ MONGO_INITIALIZER(StopWords)(InitializerContext* context) {
+     typedef std::map< std::string, std::set< std::string > > RawMap;
+
+     empty = new StopWords();
+
+     RawMap raw;
+     loadStopWordMap( &raw );
+
+     for ( RawMap::const_iterator lang = raw.begin(); lang != raw.end(); ++lang )
+         STOP_WORDS[lang->first] = new StopWords( lang->second );
+
+     return Status::OK();
+ }
+
+ }
+
+}
diff --git a/src/mongo/db/fts/stop_words.h b/src/mongo/db/fts/stop_words.h
new file mode 100644
index 00000000000..5816afa560c
--- /dev/null
+++ b/src/mongo/db/fts/stop_words.h
@@ -0,0 +1,50 @@
+// stop_words.h
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+#pragma once
+
+#include <set>
+#include <string>
+
+#include "mongo/platform/unordered_set.h"
+
+namespace mongo {
+
+ namespace fts {
+
+ // Set of stop words for one language.
+ class StopWords {
+ public:
+     // empty set
+     StopWords();
+     // set holding a copy of every word in `words`
+     StopWords( const std::set<std::string>& words );
+
+     // exact match lookup; no case folding or stemming happens here
+     bool isStopWord( const std::string& word ) const {
+         return _words.find( word ) != _words.end();
+     }
+
+     size_t numStopWords() const { return _words.size(); }
+
+     // set for `language`, or an empty set for unknown languages
+     static const StopWords* getStopWords( const std::string& language );
+ private:
+     // private: code outside this class cannot destroy an instance
+     ~StopWords(){}
+     unordered_set<std::string> _words;
+ };
+
+ }
+}
+
diff --git a/src/mongo/db/fts/stop_words_danish.txt b/src/mongo/db/fts/stop_words_danish.txt
new file mode 100644
index 00000000000..1b3c2867fec
--- /dev/null
+++ b/src/mongo/db/fts/stop_words_danish.txt
@@ -0,0 +1,100 @@
+få
+intet
+som
+den
+forrige
+ni
+alle
+at
+ned
+et
+næsten
+fordi
+og
+jeres
+seks
+op
+har
+flere
+hvis
+hvem
+andre
+mens
+fem
+over
+da
+din
+deres
+for
+ny
+hvad
+fra
+kan
+kommer
+hvornår
+jeg
+denne
+end
+nogen
+meget
+man mand
+på
+store
+ind
+lav
+ud
+ej
+hvordan
+ingen
+to
+der
+se
+kom
+dig
+tre
+eneste
+dette
+hans
+hver
+før
+hendes
+andet
+syv
+hvilken
+hvor
+det
+de
+hvorfor
+god
+otte
+ikke
+han
+mig
+fleste
+ti
+i
+ene
+med
+til
+stor
+her
+lille
+mange
+du
+ses
+begge
+dog
+eller
+en
+nyt
+var
+hun
+enhver
+fire
+mere
+nær
+næste
+men
+noget
+lidt
+af
diff --git a/src/mongo/db/fts/stop_words_dutch.txt b/src/mongo/db/fts/stop_words_dutch.txt
new file mode 100644
index 00000000000..251822f9570
--- /dev/null
+++ b/src/mongo/db/fts/stop_words_dutch.txt
@@ -0,0 +1,48 @@
+hun
+dit
+zij
+van
+kan
+een
+zal
+wat
+dat
+in
+hij
+die
+zou
+bij
+met
+al
+ook
+is
+uit
+en
+hem
+zei
+heb
+mij
+was
+ik
+nog
+of
+zo
+we
+men
+wij
+ons
+als
+tot
+wel
+nu
+aan
+je
+er
+ze
+af
+hoe
+had
+te
+me
+het
+dan
diff --git a/src/mongo/db/fts/stop_words_english.txt b/src/mongo/db/fts/stop_words_english.txt
new file mode 100644
index 00000000000..fbb6c3063fd
--- /dev/null
+++ b/src/mongo/db/fts/stop_words_english.txt
@@ -0,0 +1,174 @@
+than
+during
+himself
+your
+theirs
+most
+same
+wouldn't
+at
+it
+here's
+their
+his
+an
+out
+between
+doesn't
+not
+those
+only
+yourself
+mustn't
+and
+shouldn't
+him
+you'll
+which
+more
+shan't
+after
+why
+up
+further
+over
+no
+its
+until
+them
+you
+don't
+few
+why's
+i've
+for
+ours
+some
+when's
+they've
+won't
+herself
+but
+she
+he'd
+how's
+were
+how
+we've
+because
+aren't
+should
+our
+each
+once
+they'd
+where
+above
+there
+or
+they'll
+be
+to
+are
+it's
+too
+itself
+what
+whom
+has
+they're
+had
+she'd
+these
+other
+when
+hasn't
+by
+we'll
+having
+then
+against
+he's
+as
+is
+that
+isn't
+below
+could
+wasn't
+who's
+ourselves
+so
+any
+he
+cannot
+weren't
+was
+my
+would
+we'd
+yourselves
+where's
+couldn't
+who
+didn't
+from
+i'm
+off
+have
+hers
+i
+am
+themselves
+of
+before
+i'll
+here
+while
+what's
+myself
+ought
+me
+the
+into
+about
+this
+do
+can't
+a
+her
+that's
+did
+very
+down
+you've
+we
+you're
+haven't
+on
+let's
+such
+they
+in
+with
+being
+doing
+she's
+yours
+hadn't
+nor
+both
+does
+own
+again
+there's
+he'll
+i'd
+under
+you'd
+through
+we're
+she'll
+been
+all
+if
diff --git a/src/mongo/db/fts/stop_words_finnish.txt b/src/mongo/db/fts/stop_words_finnish.txt
new file mode 100644
index 00000000000..0d898159660
--- /dev/null
+++ b/src/mongo/db/fts/stop_words_finnish.txt
@@ -0,0 +1,747 @@
+toistaiseksi
+aika
+tykö
+haluavat
+hyvinä
+lähekkäin
+menimme
+mahdollista
+silloin
+olette
+kaikkiaan
+alla
+haluaa
+joku
+jopa
+uuden
+haluat
+lähellä
+jolta
+yhtä
+sieltä
+oleva
+niistä
+pienelle
+samallasta
+pieni
+uudet
+avulla
+kahdessa
+edes
+aloitin
+koska
+pienin
+olisi
+esi
+paljon
+hyville
+toisaalta
+vaikeat
+kumpikin
+ensimmäiseksi
+jonka
+jotka
+kennessästä
+ensimmäistä
+joihin
+peräti
+kymmenen
+hänessä
+vaikean
+lähtien
+neljän
+olisin
+ette
+joutui
+jokin
+pienempi
+vieri
+kuka
+viime
+vaiheessa
+olisit
+neljää
+aikovat
+mikä
+kaikkea
+muiden
+niiden
+kumpikaan
+kenellä
+esillä
+joo
+menit
+joudutte
+olisivat
+ne
+sinua
+muun
+täältä
+mitään
+kohti
+usea
+ilmeisesti
+koskaan
+vähemmän
+menossa
+ajan
+liian
+omat
+tietysti
+jota
+kenen
+kahdella
+aikaa
+ensimmäiset
+parhaillaan
+muualla
+joukosta
+moi
+hyvät
+näissä
+kuin
+minut
+ainoa
+nro
+eräiden
+kanssasi
+seuraavat
+yhtäällä
+jonkin
+näissästä
+ehkä
+aloitamme
+jälkeen
+kanssa
+aloititte
+voisi
+viimeinen
+jolle
+toki
+myöskään
+sekä
+josta
+eli
+moni
+olli
+ellet
+alkuisin
+tulleet
+kaikkialle
+ainakin
+muita
+aikana
+toinen
+muuta
+keiltä
+nämä
+meidän
+tuhannen
+edemmäs
+mihin
+vierekkäin
+sinulle
+yhteyteen
+halusi
+etenkin
+täytyvät
+niitä
+mitä
+alta
+välillä
+seitsemän
+kumpainen
+hyviltä
+taa
+me
+mikään
+hän
+uudelleen
+omien
+kerta
+tuhat
+yksi
+tulisit
+tulimme
+joissa
+monesti
+paitsi
+heti
+aloitat
+joko
+eniten
+vuoden
+yhdessä
+suuri
+haluamme
+varten
+olisitte
+kenet
+entinen
+samaa
+vaikeille
+samallassa
+tänään
+edelleen
+läpi
+helposti
+täysin
+monta
+jolloin
+useimmiten
+ensin
+sinusta
+suuntaan
+omilta
+uuteen
+voidaan
+näissälle
+sisällä
+siitä
+kuitenkaan
+mones
+mukaan
+asiasta
+toisaalla
+tällöin
+että
+yksittäin
+erityisesti
+vaikea
+ellei
+moniaalla
+aluksi
+antoi
+tavoitteena
+verran
+runsaasti
+ovat
+menevät
+toista
+meni
+vastaan
+myös
+heitä
+menin
+kokonaan
+ja
+hänen
+suurin
+meille
+tosin
+hetkellä
+parhaiten
+ensimmäisenä
+kaksi
+tulemme
+alle
+vuosina
+ensimmäisen
+aloitan
+emme
+te
+heille
+uusia
+itse
+halunnut
+keille
+samallalta
+halusin
+aikoo
+minun
+menet
+näissähin
+meneet
+kovin
+jouduin
+toisen
+tämä
+halusit
+useita
+muassa
+heiltä
+tulette
+kahta
+jouduit
+haluton
+omia
+minne
+aloitettevaksi
+siten
+sinussa
+aloitettu
+voi
+suuren
+asioihin
+vaikeilta
+vähiten
+asiat
+vuosien
+ellen
+joilla
+jälleen
+suuret
+omalta
+tässä
+asian
+nuo
+menette
+kuten
+kolme
+voimme
+eri
+menivät
+tahansa
+vai
+kaiken
+joiden
+puolesta
+hyvissä
+kumpainenkaan
+kaikille
+yhtäälle
+toiseksi
+hänelle
+toisessa
+toisesta
+toiselle
+keillä
+lisäksi
+tuolloin
+muualle
+edessä
+älköön
+keneen
+mennyt
+aikoina
+myöhemmin
+annettu
+menemme
+aoua
+ensimmäisiä
+mikäli
+pieneen
+keinä
+annetteva
+vähän
+tuolla
+jää
+kannattaa
+jonne
+mukana
+kolmas
+häneen
+jouduitte
+joten
+tänne
+aloittivat
+olitte
+juuri
+keitten
+tullut
+paremmin
+sillä
+meillä
+avutta
+keneltä
+kahden
+jos
+omaan
+aion
+jo
+samaan
+sama
+joskus
+sinä
+kaikki
+yhtään
+omille
+sataa
+pian
+ihan
+eivät
+ylös
+keissä
+voin
+siinä
+lisää
+halutessa
+tämän
+omaa
+asioita
+kanssanne
+tapauksessa
+kenestä
+vaikeilla
+minä
+suurten
+kenelle
+tulisivat
+melkein
+kun
+kaikkia
+uutta
+lähinnä
+ole
+hyvä
+aiotte
+asia
+hyvää
+uusi
+tulisimme
+aikajen
+viimeisen
+mukaansa
+eteen
+hänestä
+omiksi
+heistä
+edestä
+kukin
+hyviksi
+moniaalle
+missä
+takaa
+omista
+joilta
+saakka
+onkin
+yhdeksän
+sitä
+ollut
+hänet
+antamatta
+ensimmäisinä
+avun
+yhteensä
+kolmen
+hyviä
+olemme
+muulloin
+kumpi
+yli
+sinun
+erittäin
+aiemmin
+ympäri
+voivat
+keiksi
+tulitte
+tästä
+siis
+perusteella
+sinut
+täten
+menen
+asti
+muu
+omalle
+haluamatta
+keitä
+apu
+menee
+yhä
+voit
+varsin
+neljä
+yhden
+uusinta
+tuolta
+yksin
+keskimäärin
+entistä
+sinne
+samat
+jotenkin
+eilen
+luo
+takana
+edellä
+useasti
+muutama
+he
+sitten
+sinulta
+pieneksi
+enää
+itsensä
+tulla
+joille
+jotenkuten
+kahdesta
+kuutta
+omissa
+parempi
+pakosti
+olla
+joutua
+aloitattivat
+silti
+kahdelta
+joudun
+viiden
+kehen
+paikoittain
+kukaan
+näiden
+aloitti
+olevan
+myöskin
+jouduimme
+mistä
+tähän
+takaisin
+eräät
+tai
+tulee
+siihen
+olimme
+molemmat
+kesken
+keneksi
+jotta
+toisella
+tätä
+jompikumpi
+ennen
+täytyy
+kolmesti
+voitte
+pieneltä
+en
+niin
+keistä
+jotain
+ensimmäisiksi
+esimerkiksi
+pienellä
+takia
+on
+kautta
+muka
+oli
+kai
+hänellä
+onko
+itseään
+joutumaan
+aina
+aivan
+aloitimme
+taemmas
+todella
+ensi
+tulisitte
+taas
+mutta
+vuotta
+nyt
+kenettä
+tule
+enemmän
+vain
+usein
+haluatte
+heissä
+kannalta
+avuksi
+tuonne
+toiseen
+kaikkialta
+hyviin
+olevat
+olisimme
+uudeksi
+tulen
+elleivät
+huomenna
+olleet
+moniaalta
+kahdeksan
+nopeasti
+joudumme
+heihin
+antaa
+johon
+ei
+hei
+omassa
+saman
+lähelle
+et
+keiden
+kanssani
+yhteen
+huolimatta
+häneltä
+alussa
+olivat
+aloitatte
+kuuden
+annettavaksi
+esiin
+näissältä
+joukossa
+halusimme
+tuntuu
+kenenä
+vuosi
+hyvien
+tällä
+tulevat
+joista
+läheltä
+tulin
+jokainen
+joutuivat
+vaikka
+asiaa
+halusivat
+päälle
+joukkoon
+lähemmäs
+joutuu
+joutuvat
+yhteydessä
+vaikeista
+nopeammin
+vuoksi
+toiselta
+koko
+hieman
+kauemmas
+kertaa
+muutaman
+kyse
+kahteen
+sen
+edeltä
+kuusi
+varsinkin
+toisensa
+jossa
+kaikilta
+mahdollisimman
+tavalla
+näin
+liki
+joka
+olin
+entisen
+aist
+oma
+yleensä
+nopeiten
+tulisi
+viisi
+vähintään
+hyvistä
+aikaisin
+kahdelle
+miksi
+olit
+joita
+halua
+asioiden
+miten
+aiomme
+heidän
+kaikkin
+ylemmäs
+näitä
+aloittaa
+kahdeksannen
+ketä
+vielä
+aikaisemmin
+puolestaan
+aloitit
+jolla
+myötä
+omaksi
+lähes
+kiitos
+hyvin
+samalla
+sijaan
+milloin
+sisäkkäin
+se
+olen
+noin
+alemmas
+kaikkialla
+muuten
+ellemme
+pienestä
+omiin
+olet
+aikaan
+tuskin
+yhtäältä
+siellä
+keittä
+kuinka
+vastakkain
+halusitte
+ilman
+sata
+siksi
+tuo
+tulet
+suuria
+muut
+satojen
+eräs
+kaikkien
+tulivat
+varmasti
+ohi
+muualta
+kyllä
+haluan
+kenessä
+ensimmäinen
+mikin
+tulisin
+oman
+joiksi
+edelle
+entisiä
+entisten
+kauan
+alas
+ainoat
+ellette
+suoraan
+eikä
+melko
+alusta
+ettei
+alkuun
+aloitettava
+monet
+ketkä
+uusien
+tulit
+kuitenkin
+aloittamatta
+vaan
+toisemme
+keihin
+viimeksi
+toisaalle
+muuanne
+ainakaan
+tänä
+oikein
+mennessä
+samoin
+kerran
+kanssaan
+sadam
+täällä
+kanssamme
+vasta
+vaikeissa
+älä
diff --git a/src/mongo/db/fts/stop_words_french.txt b/src/mongo/db/fts/stop_words_french.txt
new file mode 100644
index 00000000000..42502590cbc
--- /dev/null
+++ b/src/mongo/db/fts/stop_words_french.txt
@@ -0,0 +1,126 @@
+vu
+avoir
+sa
+étions
+faites
+ni
+quand
+ou
+tu
+tout
+et
+hors
+haut
+avec
+votre
+ton
+tes
+dedans
+comment
+il
+aussi
+vont
+fait
+soyez
+peu
+ils
+sans
+avant
+valeur
+été
+pas
+font
+mine
+encore
+sien
+le
+mais
+ci
+quels
+devrait
+mot
+droite
+ma
+pièce
+tels
+force
+dans
+qui
+trop
+ça
+juste
+au
+dehors
+eu
+cela
+voient
+leur
+notre
+plupart
+elles
+dos
+tous
+elle
+bon
+étaient
+nommés
+je
+ces
+nouveaux
+donc
+est
+pourquoi
+car
+son
+alors
+peut
+quel
+quelle
+pour
+essai
+ici
+sujet
+mon
+état
+depuis
+même
+que
+où
+parole
+être
+voie
+autre
+sont
+sur
+des
+début
+ta
+la
+deux
+ce
+doit
+quelles
+du
+ses
+là
+tandis
+personnes
+en
+par
+chaque
+parce
+fois
+si
+les
+nous
+mes
+vous
+aucuns
+seulement
+moins
+maintenant
+ceux
+tellement
+très
+comme
+sous
diff --git a/src/mongo/db/fts/stop_words_german.txt b/src/mongo/db/fts/stop_words_german.txt
new file mode 100644
index 00000000000..4fcf107963b
--- /dev/null
+++ b/src/mongo/db/fts/stop_words_german.txt
@@ -0,0 +1,992 @@
+kleiner
+muss
+zwanzig
+berichtete
+dort
+nun
+könnten
+fortsetzten
+seinen
+important
+gefiel
+sofort
+liegt
+wolle
+eben
+in
+dadurch
+sind
+geben
+etc
+wenngleich
+erhalten
+ihr
+solchen
+bezüglich
+bald
+befragte
+werde
+geehrten
+singt
+fragte
+jedem
+eines
+behalten
+wegen
+freies
+einseitig
+dasselbe
+befragen
+unmögliche
+weg
+nachhinein
+senkte
+ihrem
+drunter
+später
+wäre
+daraus
+ausdrückte
+sagt
+einmal
+inzwischen
+startet
+ganzem
+lesen
+vermag
+forderten
+fast
+wieviel
+alsbald
+damals
+gewollt
+mithin
+behielt
+erhielten
+irgendwen
+geteilt
+bearbeitete
+eröffnen
+bist
+daran
+will
+diese
+danke
+einst
+oft
+legte
+gleichwohl
+demnach
+begonnen
+reagieren
+derartig
+links
+dieser
+soweit
+worin
+möglichen
+folgender
+desto
+zahlreich
+wogegen
+sich
+jenen
+schnell
+meta
+wen
+rechts
+abgerufener
+muesste
+vorbei
+nacher
+nebenan
+schwierig
+senkt
+für
+soviel
+deswegen
+blieb
+veröffentlichtes
+dagegen
+veröffentlichen
+einem
+stets
+hast
+demselben
+wollen
+nötigenfalls
+sondern
+zugleich
+umso
+ins
+wolltet
+deine
+danach
+konkreter
+diesen
+abgerufene
+mancherorts
+befragten
+teilten
+tat
+übermorgen
+unse
+manche
+weshalb
+senken
+solltest
+woher
+siebte
+sofern
+so
+guten
+vollständig
+meiste
+doppelt
+veröffentlicht
+derselbe
+zuviel
+dem
+hinterher
+veröffentlicher
+ziehen
+mich
+anderem
+überallhin
+können
+lagen
+weiteres
+jedenfalls
+muessen
+eigenen
+beitragen
+gmbh
+keines
+pfui
+befragter
+falls
+ergänzten
+ergänze
+vielleicht
+hundert
+koennen
+entweder
+muesst
+durchaus
+sollt
+jemand
+solch
+wirklich
+fortsetzt
+sage
+derjenige
+angesetzt
+gleichzeitig
+derselben
+diesseits
+anderm
+gern
+keinen
+gehen
+könnt
+daher
+allerdings
+wohingegen
+steigen
+unserm
+unserem
+nämlich
+heraus
+wodurch
+obgleich
+bloss
+seit
+aufhören
+verrate
+je
+wohlweislich
+einführten
+tatsächlichen
+ganzes
+kommen
+waren
+alles
+komme
+einige
+dinge
+erst
+tragen
+machst
+direkten
+einiger
+anerkannter
+wären
+ganz
+woraufhin
+darauf
+jedoch
+daneben
+einführen
+längstens
+koennten
+beide
+such
+dorthin
+möglich
+ausser
+unterhalb
+vor
+darfst
+sicherlich
+geworden
+stattdessen
+beinahe
+gängig
+gibt
+sieht
+schätzte
+euren
+obwohl
+vom
+könnte
+vergangenes
+wolltest
+welchen
+wiewohl
+schreiber
+deshalb
+mochte
+unses
+aber
+tät
+manches
+eröffnet
+anders
+gängiges
+seines
+seitdem
+titel
+musste
+jenes
+unbedingt
+trotzdem
+sowohl
+kaum
+davon
+möchten
+gemacht
+veröffentlichten
+verraten
+sangen
+glücklicherweise
+somit
+gefallen
+eurer
+machte
+womit
+übrig
+ihren
+sollten
+weder
+heute
+regelmäßig
+meiner
+was
+legten
+vorher
+niemals
+dein
+derart
+schließlich
+außen
+noch
+zwar
+vieles
+wenig
+fünf
+anderes
+mussten
+schätzen
+sagten
+angesetze
+eigentlich
+wann
+deines
+letztes
+fordert
+einig
+meinem
+hiesige
+möchte
+bin
+möglicher
+und
+z.B.
+unten
+angesetzten
+sehr
+dessen
+ihrer
+beim
+weiterhin
+weiteren
+tun
+solcher
+hattet
+würden
+viel
+gängige
+an
+vermutlich
+müssten
+kommt
+lediglich
+zehn
+mehr
+wohin
+nie
+gedurft
+einerseits
+besonders
+teilen
+hätte
+darüberhinaus
+warst
+es
+allerlei
+ander
+unserer
+versorgtes
+benutzt
+hätt
+ausdrücken
+hab
+usw
+keine
+veröffentlichte
+stieg
+neues
+ab
+leicht
+allen
+könn
+fortsetzen
+keineswegs
+herein
+siehe
+waere
+derem
+möchtest
+überdies
+allgemein
+einseitiger
+überall
+möglicherweise
+solche
+obschon
+beitrugen
+gratulieren
+fordern
+denen
+versorgt
+mehrere
+haette
+ueber
+meisten
+wenige
+gesagt
+ausgenommen
+drauf
+einfach
+verrieten
+hätten
+leer
+euer
+unmöglich
+eingesetzt
+gefälligst
+zur
+dannen
+gänzlich
+anerkannt
+zumal
+derjenigen
+alle
+anfing
+nichts
+mir
+wird
+jedes
+hallo
+dafür
+spielen
+ausserdem
+einstmals
+ca.
+einseitigen
+außerhalb
+manchen
+ggf
+andern
+unsen
+wär
+selbst
+eröffne
+ob
+findest
+höchstens
+dich
+suchen
+mögen
+direkter
+sobald
+folglich
+einigen
+dunklen
+nicht
+ganzen
+bearbeiten
+bereits
+müssen
+jährigen
+steigt
+meist
+sicher
+seid
+geblieben
+info
+ein
+währenddessen
+derzeit
+finden
+ersten
+bei
+dabei
+nützt
+bestehen
+oberhalb
+wer
+seht
+nachdem
+wo
+wollte
+gratuliert
+wenigstens
+schreibens
+author
+bearbeite
+koennte
+dritte
+lichten
+sehen
+dreißig
+vorne
+eurem
+geehrte
+uns
+hinein
+ihnen
+daß
+konkrete
+irgendwie
+sollen
+natürlich
+also
+hin
+doch
+welches
+geb
+gar
+deinem
+ferner
+eher
+eine
+hiermit
+neben
+gefällt
+tatsächlicher
+beides
+eröffnetes
+dennoch
+etliche
+wurde
+brachten
+zwölf
+setzen
+zudem
+berichten
+müsste
+reagiert
+sollst
+hattest
+jederlei
+tut
+denselben
+euch
+auch
+konkretes
+gemocht
+stiegen
+kannst
+mache
+lag
+kein
+unsere
+ungefähr
+txt
+klaren
+außer
+weiß
+braucht
+tust
+dürfte
+nutzen
+dazu
+ergänzte
+warum
+konnte
+sog
+einführte
+morgen
+ihm
+genug
+durfte
+hatte
+eröffnete
+igitt
+neuen
+musst
+der
+wenn
+fortsetzte
+wessen
+insofern
+rund
+vorgestern
+irgendwas
+erhielt
+gewesen
+anderen
+geht
+angesetzter
+frei
+teile
+kleines
+keiner
+nächste
+gbr
+die
+etwa
+im
+wir
+zusammen
+bräuchte
+weiterem
+bleiben
+starteten
+davor
+liest
+durch
+niemand
+zum
+jeden
+einer
+seiten
+freier
+dank
+oben
+hat
+eigenes
+soll
+beträchtlich
+um
+sodaß
+gib
+erste
+völlig
+oder
+geehrter
+setzten
+fall
+aller
+aufzusuchen
+seither
+versorgten
+böden
+sonst
+nein
+machen
+zog
+danken
+einiges
+anerkanntes
+müßt
+setzt
+eins
+bedürfen
+gegeben
+konkret
+bietet
+voran
+unter
+gekonnt
+geteilte
+jähriges
+diejenige
+sagen
+bsp.
+folgende
+du
+allmählich
+dieses
+sogar
+ende
+folgendes
+abgerufen
+neue
+vorüber
+reagiere
+nirgendwo
+welche
+abgerufenes
+vergangener
+sein
+sowie
+welchem
+übel
+sie
+plötzlich
+müßte
+mancher
+dahin
+auf
+gängiger
+seiner
+bearbeiteten
+jährige
+machten
+solchen
+autor
+beiderlei
+aufgehört
+mal
+haben
+jener
+sei
+per
+sagtest
+legen
+unterbrach
+bekannt
+mindestens
+acht
+jenseits
+bedarf
+sooft
+habe
+da
+unsem
+zuerst
+versorge
+hinten
+eure
+letztlich
+beiden
+sechs
+mögliche
+werdet
+eures
+wollt
+bringen
+immer
+unterbrechen
+besser
+dies
+erneut
+dir
+war
+einigem
+drei
+findet
+letztens
+gängigen
+derer
+klares
+mußt
+schätzten
+margin
+besteht
+seine
+langsam
+einbauen
+sang
+drin
+nach
+weiterer
+zwischen
+ganze
+vielmals
+gesehen
+getragen
+klein
+indem
+frau
+teilte
+hinter
+genommen
+bekennen
+darum
+seinem
+ehe
+indessen
+begann
+hoch
+solange
+kleinen
+das
+berichteten
+sect
+ohne
+leider
+solchem
+nur
+durften
+vergangen
+bloß
+direkte
+unseren
+bekannte
+berichtet
+andernfalls
+verriet
+steige
+ergänzen
+zurück
+dieselben
+jede
+morgige
+jene
+kam
+trug
+nimm
+zieht
+letzten
+wieder
+zogen
+konnten
+willst
+allzu
+angefangen
+nehmen
+ganzer
+geehrt
+mann
+trage
+bis
+schlechter
+sehe
+mag
+unmöglichen
+werden
+unnötig
+dieselbe
+bedurfte
+gemäss
+bevor
+sämtliche
+anerkannte
+gab
+konkreten
+unwichtig
+wieso
+senke
+keinerlei
+senkten
+allem
+anfangen
+erster
+denn
+ich
+total
+gleich
+schreibe
+versorgen
+gestrige
+zu
+bekannter
+aufhörte
+irgendwer
+gekommen
+ist
+darf
+tausend
+manchmal
+zufolge
+andere
+vier
+hatten
+neuem
+fuer
+gewissermaßen
+unmöglicher
+jenem
+wem
+damit
+sonstwo
+nirgends
+sagte
+kann
+unser
+klar
+macht
+folgenden
+wichtig
+woraus
+nahm
+darüber
+jeder
+arbeiten
+einen
+wurden
+vielen
+tatsächlich
+trägt
+man
+soeben
+wohl
+ausdrückt
+ebenso
+muß
+wachen
+lassen
+letze
+zuletzt
+manchem
+immerhin
+diesem
+startete
+weitere
+nutzt
+statt
+etwas
+aus
+reagierte
+beginnen
+nimmt
+infolge
+brauchen
+hier
+dann
+nacht
+forderte
+bislang
+desselben
+über
+meines
+weiter
+ja
+rief
+aufgrund
+wollten
+den
+als
+vieler
+neun
+gehabt
+anderer
+gebracht
+irgendwo
+sollte
+andererseits
+dürfen
+heutige
+deiner
+selber
+meinen
+entsprechend
+dass
+wirst
+gratulierte
+bisher
+vermögen
+brachte
+setzte
+ihres
+weniger
+laut
+magst
+versorgte
+klare
+während
+darin
+schätzt
+gegen
+einigermaßen
+solches
+keinem
+außerdem
+am
+freie
+gestern
+anstatt
+fand
+wie
+vergangene
+welcher
+deinen
+schreiben
+jetzt
+ansetzen
+halb
+allein
+einseitige
+tatsächliches
+längst
+ebenfalls
+übrigens
+er
+mit
+unseres
+drüber
+sieben
+darunter
+jährig
+gute
+seite
+innen
+nimmer
+neuer
+singen
+erhält
+irgend
+künftig
+deren
+zwei
+einzig
+bzw
+getan
+version
+steht
+zeitweise
+irgendeine
+direkt
+schon
+innerhalb
+mein
+ihn
+viele
+dorther
+meine
+des
+würde
+nutzung
+ihre
+pro
+letztendlich
+von
+neu
+ähnlich
+tages
+sehrwohl
+weil
+koennt
diff --git a/src/mongo/db/fts/stop_words_hungarian.txt b/src/mongo/db/fts/stop_words_hungarian.txt
new file mode 100644
index 00000000000..abdfe8b7498
--- /dev/null
+++ b/src/mongo/db/fts/stop_words_hungarian.txt
@@ -0,0 +1,35 @@
+nem
+be
+mi
+igen
+a
+fel
+van
+ők
+csak
+hát
+én
+meg
+lesz
+szét
+az
+és
+ide
+ön
+le
+ki
+össze
+hogy
+mint
+te
+oda
+vagy
+ti
+vissza
+egy
+át
+de
+el
+rá
+volt
diff --git a/src/mongo/db/fts/stop_words_italian.txt b/src/mongo/db/fts/stop_words_italian.txt
new file mode 100644
index 00000000000..50ebb1c32c3
--- /dev/null
+++ b/src/mongo/db/fts/stop_words_italian.txt
@@ -0,0 +1,279 @@
+tuoi
+farò
+all
+stai
+avevamo
+avevo
+nelle
+fanno
+stesse
+dagli
+avesti
+stiamo
+abbiate
+ai
+facevi
+avevi
+dall
+sull
+sarebbero
+dalle
+abbiamo
+avessi
+stettero
+nella
+avrete
+del
+stessi
+stavate
+farai
+dalla
+tutti
+starò
+facemmo
+avesse
+tu
+stiano
+faceste
+dallo
+tutto
+avevate
+farete
+faceva
+nello
+e
+sulle
+loro
+sono
+se
+degli
+fu
+sarai
+il
+avrei
+avuto
+stessimo
+a
+sarete
+avrebbero
+stavano
+sulla
+in
+si
+farà
+c
+facessi
+avevano
+sul
+li
+fecero
+sia
+starebbe
+eravamo
+lo
+fui
+contro
+furono
+suoi
+steste
+avuti
+o
+starei
+faremmo
+staranno
+sullo
+avrò
+fece
+i
+della
+agli
+stesti
+avute
+la
+non
+saranno
+gli
+facessimo
+starà
+dal
+avendo
+coi
+delle
+faccio
+nei
+fareste
+stando
+negl
+su
+anche
+avuta
+da
+facesse
+le
+facciamo
+siate
+ebbero
+faranno
+facevate
+foste
+dello
+avrebbe
+facessero
+facevano
+feci
+al
+stemmo
+fai
+cui
+fummo
+quale
+avrà
+perché
+farebbe
+mi
+negli
+faccia
+di
+siete
+facciano
+sta
+stavo
+sarebbe
+faremo
+degl
+stavamo
+dov
+dei
+uno
+saremo
+starete
+faresti
+stavi
+staresti
+essendo
+facevamo
+nostro
+ad
+tra
+avresti
+alle
+starai
+nell
+fossero
+abbiano
+fosti
+nostri
+per
+ma
+stia
+saresti
+una
+sue
+dove
+siamo
+eravate
+starebbero
+facendo
+lei
+avreste
+agl
+alla
+sto
+stava
+nostre
+quelli
+stette
+hanno
+sua
+dagl
+quello
+voi
+staremmo
+vi
+nostra
+stareste
+sarò
+avessimo
+allo
+siano
+io
+come
+suo
+facciate
+saremmo
+ci
+quella
+era
+l
+avemmo
+miei
+sareste
+ed
+sui
+quanto
+avete
+un
+con
+ero
+vostre
+questo
+nel
+quanti
+più
+ha
+stiate
+quelle
+stetti
+col
+avremmo
+fosse
+questi
+noi
+tua
+ho
+mia
+farebbero
+erano
+vostra
+ebbi
+farei
+quante
+aveste
+abbia
+stessero
+staremo
+avessero
+eri
+avranno
+fossi
+queste
+hai
+sarei
+avrai
+tue
+chi
+sei
+stanno
+mie
+dell
+ebbe
+ne
+quanta
+dai
+avremo
+aveva
+vostro
+questa
+che
+fossimo
+sarà
+facevo
+vostri
+lui
+ti
+sugl
+tuo
+facesti
+sugli
+mio
diff --git a/src/mongo/db/fts/stop_words_norwegian.txt b/src/mongo/db/fts/stop_words_norwegian.txt
new file mode 100644
index 00000000000..daf5f27d9c8
--- /dev/null
+++ b/src/mongo/db/fts/stop_words_norwegian.txt
@@ -0,0 +1,119 @@
+som
+alle
+et
+være
+gjøre
+slik
+ha
+nå
+fordi
+og
+skulle
+sist
+hvis
+vil
+hvem
+andre
+slutt
+mens
+siden
+så
+over
+lage
+da
+din
+vite
+deres
+disse
+for
+hva
+hennes
+kunne
+ny
+ved
+fra
+lang
+mer
+kan
+er
+denne
+verdi
+riktig
+bruke
+meget
+opp
+må
+mye
+sant
+samme
+måte
+hvordan
+der
+ville
+uten
+eneste
+hans
+oss
+hver
+like
+tilstand
+arbeid
+hvilken
+få
+hvor
+folk
+det
+ut
+start
+gå
+hvorfor
+god
+tid
+ikke
+meg
+han
+stille
+bra
+først
+i
+ene
+før
+også
+enn
+rett
+navn
+lik
+makt
+med
+av
+til
+inn
+vår
+på
+her
+når
+mange
+du
+forsøke
+begge
+vi
+part
+eller
+hadde
+tilbake
+en
+var
+enhver
+si
+vært
+mest
+om
+gjorde
+men
+min
+punkt
+bort
+under
+nei
+innen
diff --git a/src/mongo/db/fts/stop_words_portuguese.txt b/src/mongo/db/fts/stop_words_portuguese.txt
new file mode 100644
index 00000000000..5b14cf26243
--- /dev/null
+++ b/src/mongo/db/fts/stop_words_portuguese.txt
@@ -0,0 +1,147 @@
+para
+o
+está
+como
+quem
+ou
+eles
+nosso
+este
+não
+tu
+muitos
+somente
+corrente
+parte
+quando
+todos
+por
+das
+estão
+têm
+poderá
+com
+qualquer
+fora
+nome
+estar
+pode
+também
+novo
+bem
+meu
+dois
+desde
+ver
+fez
+estado
+ser
+ir
+aquelas
+trabalhar
+primeiro
+iste
+usar
+teu
+muito
+mais
+tentar
+quieto
+aquele
+teve
+quê
+comprido
+ela
+irá
+tentaram
+deverá
+você
+estará
+são
+fará
+desligado
+tenho
+e
+podia
+então
+direita
+isto
+esteve
+tive
+tal
+eu
+debaixo
+dentro
+foi
+último
+caminho
+tentei
+diz
+valor
+fazer
+pessoas
+fazia
+tipo
+deve
+acerca
+pelo
+trabalho
+dos
+em
+veja
+aqueles
+uns
+devem
+alguns
+verdade
+nós
+uma
+tente
+bom
+mas
+conhecido
+algumas
+saber
+umas
+usa
+qual
+um
+porque
+seu
+estivemos
+inicio
+horas
+ambos
+cima
+outro
+maioria
+faz
+estive
+aquela
+os
+pegar
+estiveram
+ista
+sem
+mesmo
+povo
+apontar
+fim
+estes
+ligado
+atrás
+dizer
+maiorias
+iniciar
+aqui
+ele
+tempo
+enquanto
+ali
+tem
+antes
+verdadeiro
+onde
+agora
+cada
diff --git a/src/mongo/db/fts/stop_words_romanian.txt b/src/mongo/db/fts/stop_words_romanian.txt
new file mode 100644
index 00000000000..1a7cb994c86
--- /dev/null
+++ b/src/mongo/db/fts/stop_words_romanian.txt
@@ -0,0 +1,258 @@
+vreo
+acelea
+cita
+degraba
+lor
+alta
+tot
+ai
+dat
+x
+despre
+peste
+bine
+dar
+foarte
+z
+avea
+multi
+cit
+alt
+mai
+sa
+fie
+tu
+multe
+e
+orice
+dintr
+se
+g
+intr
+niste
+multa
+insa
+il
+fost
+a
+abia
+nimic
+sub
+acel
+in
+altceva
+si
+avem
+altfel
+c
+ea
+acest
+li
+parca
+fi
+dintre
+unele
+m
+acestei
+mare
+cel
+este
+pe
+atitia
+uneori
+acela
+iti
+astazi
+acestui
+o
+imi
+ele
+ceilalti
+pai
+fata
+noua
+sa-ti
+altul
+au
+i
+prin
+conform
+aceste
+anume
+azi
+k
+unul
+ala
+unei
+fara
+ei
+la
+aceeasi
+u
+inapoi
+acestea
+acesta
+catre
+sale
+asupra
+as
+aceea
+ba
+ale
+da
+le
+apoi
+aia
+suntem
+cum
+isi
+inainte
+s
+de
+cind
+cumva
+chiar
+acestia
+daca
+sunt
+care
+al
+numai
+cui
+sus
+tocmai
+prea
+cu
+mi
+eu
+doar
+niciodata
+exact
+putini
+aiurea
+tuturor
+celor
+astfel
+atunci
+citeva
+cat
+sau
+fel
+intre
+acolo
+nostri
+ma
+mult
+una
+ceea
+iar
+sintem
+ati
+din
+geaba
+sai
+caruia
+adica
+inca
+are
+aici
+ca
+ia
+nici
+d
+oricum
+asta
+carora
+face
+citiva
+voi
+unor
+f
+atat
+toata
+alaturi
+cea
+nu
+totusi
+ce
+altii
+acum
+sint
+capat
+mod
+deasupra
+cam
+vom
+b
+toate
+careia
+aceasta
+atit
+nimeni
+ii
+ci
+unde
+ul
+plus
+era
+sa-mi
+l
+spre
+dupa
+nou
+cele
+acea
+un
+incit
+n
+cei
+or
+va
+deci
+acelasi
+atatea
+h
+vor
+decit
+noi
+cineva
+desi
+ceva
+j
+ului
+atitea
+avut
+ar
+pina
+t
+atata
+unui
+el
+citi
+asa
+totul
+pentru
+atita
+v
+alti
+asemenea
+atatia
+te
+ne
+deja
+unii
+p
+atare
+cite
+cine
+cand
+toti
+vreun
+ori
+r
+alte
+lui
+ti
+ni
+aceia
+am
diff --git a/src/mongo/db/fts/stop_words_russian.txt b/src/mongo/db/fts/stop_words_russian.txt
new file mode 100644
index 00000000000..b44c0fc7011
--- /dev/null
+++ b/src/mongo/db/fts/stop_words_russian.txt
@@ -0,0 +1,421 @@
+четвертый
+многочисленное
+там
+ты
+обычно
+даром
+через
+из
+туда
+каждый
+начала
+алло
+он
+за
+мор
+вам
+долго
+только
+пока
+быть
+этим
+ими
+важные
+раз
+да
+теми
+никогда
+менее
+под
+раньше
+них
+прекрасно
+сама
+время
+семнадцать
+несколько
+люди
+чаще
+им
+действительно
+том
+десятый
+везде
+тою
+четырнадцать
+вся
+тринадцать
+какой
+такая
+внизу
+разве
+нее
+моя
+наш
+зато
+каждая
+ними
+моё
+почти
+другие
+отовсюду
+которых
+должно
+затем
+которые
+более
+важное
+давно
+рано
+всему
+может
+второй
+пятый
+тысяч
+кто
+тому
+тоже
+ваши
+нею
+меля
+нужно
+ни
+нх
+был
+твоё
+часто
+ею
+позже
+твоя
+вас
+двадцатый
+нибудь
+именно
+друго
+самими
+своих
+себе
+семь
+процентов
+одиннадцать
+этом
+нему
+сказать
+вдали
+всеми
+другой
+тобою
+хорошо
+сказала
+теперь
+были
+вверх
+двух
+неё
+говорит
+сегодня
+тех
+потому
+этого
+ну
+пятнадцать
+будете
+хоть
+сих
+занято
+года
+бы
+конечно
+восемнадцатый
+которой
+девять
+пожалуйста
+после
+этими
+мало
+впрочем
+без
+двенадцать
+было
+совсем
+этот
+так
+она
+непрерывно
+свою
+нет
+хотеть
+себя
+самого
+оба
+многочисленная
+назад
+бывь
+кого
+где
+будут
+буду
+ней
+можно
+всем
+ещё
+сам
+вами
+мне
+кругом
+мог
+шестнадцатый
+эта
+такие
+такой
+еще
+который
+мной
+самой
+ком
+то
+те
+восемнадцать
+весь
+занят
+оно
+сказал
+сначала
+которая
+нередко
+никуда
+много
+со
+ниже
+хотя
+которого
+других
+против
+однажды
+восемь
+будь
+наша
+ли
+нас
+особенно
+тебе
+четырнадцатый
+две
+иметь
+первый
+вы
+иногда
+кем
+человек
+самому
+во
+будет
+это
+три
+заняты
+этих
+пятнадцатый
+довольно
+чтоб
+все
+над
+их
+мы
+ту
+будто
+от
+всего
+что
+но
+при
+день
+всё
+собою
+ваш
+очень
+посреди
+говорил
+наше
+год
+опять
+третий
+четыре
+чего
+шесть
+кажется
+немного
+семнадцатый
+мой
+недавно
+ему
+дел
+далеко
+также
+такое
+того
+будем
+однако
+времени
+какая
+ваше
+бывает
+сами
+потом
+не
+уж
+надо
+занята
+низко
+когда
+вдруг
+пять
+году
+вон
+девятнадцать
+седьмой
+девятый
+тебя
+здесь
+этой
+шестнадцать
+миллионов
+стал
+самих
+снова
+чуть
+самим
+одной
+или
+ей
+саму
+мира
+каждое
+больше
+ваша
+была
+чтобы
+просто
+двадцать
+само
+суть
+на
+слишком
+эту
+твой
+жизнь
+тринадцатый
+мимо
+тут
+восьмой
+многочисленный
+тем
+её
+него
+вокруг
+об
+кроме
+недалеко
+лишь
+хочешь
+один
+наиболее
+пора
+мои
+как
+чем
+меня
+нами
+есть
+близко
+почему
+уже
+тобой
+мною
+его
+они
+девятнадцатый
+перед
+всех
+пор
+всею
+вот
+чему
+значит
+каждые
+уметь
+зачем
+нем
+между
+всюду
+своей
+двенадцатый
+важный
+около
+мож
+для
+можхо
+другая
+до
+эти
+самом
+даже
+кому
+всегда
+сейчас
+ее
+одиннадцатый
+наверху
+куда
+одного
+своего
+меньше
+важная
+свои
+по
+могут
+про
+тот
+тогда
+свое
+лучше
+сколько
+будешь
+наконец
+вниз
+всю
+спасибо
+многочисленные
+ведь
+наши
+два
+шестой
+имя
+же
+сеаой
+отсюда
+рядом
+лет
+собой
+другое
+дальше
+ничего
+мочь
+если
+та
+десять
+этому
+нельзя
+нам
diff --git a/src/mongo/db/fts/stop_words_spanish.txt b/src/mongo/db/fts/stop_words_spanish.txt
new file mode 100644
index 00000000000..04132dc6b66
--- /dev/null
+++ b/src/mongo/db/fts/stop_words_spanish.txt
@@ -0,0 +1,177 @@
+para
+tuyo
+usan
+primero
+desde
+sabes
+como
+aquellos
+largo
+ante
+podriais
+sin
+incluso
+un
+intento
+eras
+cierto
+una
+otro
+consigues
+ha
+bien
+tras
+alguna
+hacemos
+podrian
+tiempo
+por
+pero
+verdadera
+podrias
+somos
+fue
+muchos
+podeis
+modo
+intentas
+el
+trabajar
+bajo
+fin
+atras
+ultimo
+puedo
+hace
+ellas
+aquel
+intenta
+estado
+ir
+ser
+haces
+las
+tener
+entre
+vais
+cierta
+van
+usar
+intentan
+su
+nos
+trabajamos
+verdad
+estan
+trabajan
+estoy
+ellos
+empleas
+algunas
+también
+siendo
+muy
+solamente
+pueden
+ciertas
+yo
+tengo
+estamos
+unos
+hacen
+los
+empleo
+dentro
+sus
+valor
+sois
+vosotros
+eramos
+sabe
+tiene
+fui
+voy
+consiguen
+podemos
+esta
+trabajais
+entonces
+dos
+verdadero
+saben
+trabajas
+era
+vaya
+estaba
+usamos
+poder
+podriamos
+vosotras
+mio
+encima
+consigo
+usas
+teneis
+algún
+solo
+podria
+lo
+tienen
+conseguimos
+trabajo
+saber
+usa
+soy
+eran
+empleamos
+porque
+emplear
+es
+donde
+fueron
+ambos
+tenemos
+sabeis
+fuimos
+eres
+va
+empleais
+cuando
+haceis
+con
+mientras
+intentar
+aquellas
+conseguir
+puede
+emplean
+la
+algunos
+sobre
+vamos
+estais
+quien
+hacer
+ciertos
+aqui
+usais
+todo
+en
+intentais
+bastante
+uno
+sabemos
+unas
+si
+consigue
+trabaja
+alguno
+uso
+antes
+intentamos
+hago
+cual
+arriba
+por
+qué
+gueno
+nosotros
+cada
diff --git a/src/mongo/db/fts/stop_words_swedish.txt b/src/mongo/db/fts/stop_words_swedish.txt
new file mode 100644
index 00000000000..fb5ecc1d275
--- /dev/null
+++ b/src/mongo/db/fts/stop_words_swedish.txt
@@ -0,0 +1,386 @@
+bort
+ert
+gör
+likställda
+skall
+nittonde
+finns
+artonn
+flera
+kanske
+tills
+gick
+gjorde
+tjugoett
+fjorton
+siste
+hundra
+senare
+varit
+sjuttonde
+hundraen
+sina
+honom
+den
+nederst
+viktigare
+allas
+del
+vilket
+vad
+större
+möjligen
+åttio
+tolfte
+kommer
+heller
+inför
+viktigast
+sista
+gjort
+och
+tidigt
+mitt
+tio
+gäller
+fler
+komma
+fem
+tretton
+smått
+mittemot
+jämfört
+eller
+åtta
+vart
+mycket
+aderton
+enligt
+minst
+sjunde
+kommit
+möjligt
+ingenting
+liten
+goda
+längst
+när
+nummer
+femtionde
+mest
+adertonde
+över
+alltid
+från
+fyra
+det
+göra
+enkla
+nr
+förlåt
+varsågod
+under
+hon
+övermorgon
+sextionde
+något
+flesta
+fyrtio
+gått
+nittio
+gällt
+ute
+annat
+bland
+långsammare
+nästa
+in
+hennes
+kunna
+ursäkt
+fram
+dagar
+du
+innan
+varken
+elva
+två
+vill
+lätt
+mera
+inuti
+dig
+tillsammans
+kunnat
+femte
+som
+även
+någon
+kr
+vänstra
+åtminstone
+allt
+utanför
+nedersta
+annan
+noll
+nittionde
+får
+i
+gå
+tidig
+säga
+stora
+sig
+behövt
+åttionde
+genom
+helt
+varför
+behövde
+slutligen
+alltså
+fjortonde
+sitt
+tionde
+där
+delen
+imorgon
+kvar
+tjugotvå
+jag
+verkligen
+andra
+ettusen
+hellre
+mina
+inom
+vår
+tidigare
+sextonde
+kom
+han
+värre
+om
+högre
+sämst
+vilka
+bättre
+mindre
+första
+vårt
+ur
+tredje
+också
+finnas
+gälla
+möjlig
+legat
+behövas
+fjärde
+ut
+litet
+de
+har
+bakom
+deras
+små
+övre
+detta
+till
+hit
+ofta
+tjungo
+för
+sagt
+beslutat
+att
+min
+så
+blivit
+nio
+mellan
+vems
+olika
+nionde
+upp
+mot
+ditt
+nödvändigt
+dina
+länge
+hur
+haft
+femtio
+nitton
+efter
+framför
+idag
+inne
+beslutit
+dagarna
+tjugoen
+möjligtvis
+rätt
+därför
+på
+igår
+gärna
+samma
+stort
+dem
+sjuttionde
+igen
+tjugonde
+senast
+säger
+båda
+då
+ännu
+inga
+vidare
+tidigast
+längre
+hög
+godare
+störst
+bli
+sex
+inget
+trettionde
+din
+sexton
+hans
+vem
+våra
+tre
+går
+få
+åttonde
+nedre
+dagen
+sedan
+alla
+sist
+man
+långsammast
+bra
+ligga
+femton
+många
+henne
+femtonde
+helst
+tolv
+förra
+lika
+aldrig
+nu
+vid
+sjuttio
+nödvändigtvis
+långsam
+viktigt
+vi
+ibland
+ingen
+enkelt
+överst
+enkel
+viktig
+några
+med
+nödvändiga
+kunde
+höger
+tack
+dag
+ska
+fin
+fanns
+eftersom
+oss
+adjö
+era
+andras
+ja
+snart
+rakt
+bäst
+lättast
+ner
+vilken
+sextio
+följande
+bådas
+lilla
+hundraett
+måste
+utan
+trettio
+lättare
+långsamt
+men
+än
+ha
+god
+kan
+av
+skulle
+vänster
+sjutton
+sämre
+olikt
+varifrån
+var
+stor
+sju
+långt
+gott
+före
+tjugo
+någonting
+mig
+fyrtionde
+elfte
+inte
+genast
+likställd
+nej
+hade
+ligger
+bara
+borta
+trettonde
+sade
+beslut
+dock
+lite
+vara
+redan
+ned
+en
+fick
+tjugotre
+dess
+artonde
+knappast
+fått
+här
+högst
+tvåhundra
+ett
+blir
+sent
+sin
+sjätte
+nödvändig
+mer
+er
+ni
+blev
+nog
+godast
+dit
+oftast
+behöva
diff --git a/src/mongo/db/fts/stop_words_test.cpp b/src/mongo/db/fts/stop_words_test.cpp
new file mode 100644
index 00000000000..8d70600ce8e
--- /dev/null
+++ b/src/mongo/db/fts/stop_words_test.cpp
@@ -0,0 +1,32 @@
+// stop_words_test.cpp
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "mongo/db/fts/stop_words.h"
+#include "mongo/unittest/unittest.h"
+
+namespace mongo {
+ namespace fts {
+
+ TEST( English, Basic1 ) { // expects "the" to be an English stop word and "computer" not to be
+     const StopWords* const words = StopWords::getStopWords( "english" );
+     ASSERT( words->isStopWord( "the" ) );
+     ASSERT( !words->isStopWord( "computer" ) );
+ }
+
+ }
+}
diff --git a/src/mongo/db/fts/stop_words_turkish.txt b/src/mongo/db/fts/stop_words_turkish.txt
new file mode 100644
index 00000000000..66dea7e2dec
--- /dev/null
+++ b/src/mongo/db/fts/stop_words_turkish.txt
@@ -0,0 +1,114 @@
+mu
+onlar
+seksen
+ama
+trilyon
+buna
+bizim
+şeyden
+yirmi
+altı
+iki
+seni
+doksan
+dört
+bunun
+ki
+nereye
+altmış
+hem
+milyon
+kez
+otuz
+beş
+elli
+bizi
+da
+sekiz
+ve
+çok
+bu
+veya
+ya
+kırk
+onların
+ona
+bana
+yetmiş
+milyar
+şunu
+senden
+birşeyi
+dokuz
+yani
+kimi
+şeyler
+kim
+neden
+senin
+yedi
+niye
+üç
+şey
+mı
+tüm
+onlari
+bunda
+ise
+şundan
+hep
+şuna
+bin
+ben
+ondan
+kimden
+bazı
+belki
+ne
+bundan
+gibi
+de
+onlardan
+sizi
+sizin
+daha
+niçin
+şunda
+mi
+bunu
+beni
+ile
+şu
+şeyi
+sizden
+defa
+biz
+için
+dahi
+siz
+nerde
+kime
+birşey
+birkez
+her
+biri
+on
+mü
+diye
+acaba
+sen
+en
+hepsi
+bir
+bizden
+sanki
+benim
+nerede
+onu
+benden
+yüz
+birkaç
+çünkü
+nasıl
+hiç
+katrilyon
diff --git a/src/mongo/db/fts/tokenizer.cpp b/src/mongo/db/fts/tokenizer.cpp
new file mode 100644
index 00000000000..73f485901f6
--- /dev/null
+++ b/src/mongo/db/fts/tokenizer.cpp
@@ -0,0 +1,129 @@
+// tokenizer.cpp
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <string>
+
+#include "mongo/db/fts/tokenizer.h"
+#include "mongo/util/stringutils.h"
+
+namespace mongo {
+
+ namespace fts {
+
+ Tokenizer::Tokenizer( const string& language, const StringData& str )
+     : _pos( 0 ), _previousWhiteSpace( true ), _raw( str ),
+       _english( language == "english" ) {
+     // Drop leading whitespace; the first token still reports previousWhiteSpace == true.
+     _skipWhitespace();
+ }
+
+ bool Tokenizer::more() const { // true while _pos has not reached the end of the input
+     return _raw.size() > _pos;
+ }
+
+ Token Tokenizer::next() {
+     // Past the end of the input: hand back the INVALID sentinel token.
+     if ( !more() )
+         return Token( Token::INVALID, "", 0, false );
+     const unsigned begin = _pos;
+     const Token::Type kind = _type( _raw[begin] );
+     // Whitespace is skipped after construction and after every token, so it never leads one.
+     if ( kind == Token::WHITESPACE ) abort();
+     // A DELIMITER is exactly one character; a TEXT token extends over the whole TEXT run.
+     ++_pos;
+     while ( kind == Token::TEXT && more() && _type( _raw[_pos] ) == Token::TEXT )
+         ++_pos;
+     const StringData text = _raw.substr( begin, _pos - begin );
+     const bool hadSpaceBefore = _previousWhiteSpace;
+     _previousWhiteSpace = _skipWhitespace();
+     return Token( kind, text, begin, hadSpaceBefore );
+ }
+
+
+ bool Tokenizer::_skipWhitespace() { // returns true iff at least one character was skipped
+     const unsigned before = _pos;
+     for ( ; _pos < _raw.size(); ++_pos )
+         if ( _type( _raw[_pos] ) != Token::WHITESPACE ) break;
+     return _pos != before;
+ }
+
+
+ Token::Type Tokenizer::_type( char c ) const {
+     switch ( c ) {
+     // Space, tab and the ASCII line/page control characters.
+     case ' ':
+     case '\t':
+     case '\n':
+     case '\v':
+     case '\f':
+     case '\r':
+         return Token::WHITESPACE;
+     // An apostrophe stays inside English words ("eliot's"); for any other
+     // language it is classified as whitespace.
+     case '\'':
+         return _english ? Token::TEXT : Token::WHITESPACE;
+     // ASCII punctuation and symbols.
+     case '~':
+     case '`':
+
+     case '!':
+     case '@':
+     case '#':
+     case '$':
+     case '%':
+     case '^':
+     case '&':
+     case '*':
+     case '(':
+     case ')':
+
+     case '-':
+
+     case '=':
+     case '+':
+
+     case '[':
+     case ']':
+     case '{':
+     case '}':
+     case '|':
+     case '\\':
+
+     case ';':
+     case ':':
+
+     case '"':
+
+     case '<':
+     case '>':
+
+     case ',':
+     case '.':
+
+     case '/':
+     case '?':
+         return Token::DELIMITER;
+     // Every other character, including '_', digits and non-ASCII bytes, is text.
+     default:
+         return Token::TEXT;
+     }
+ }
+
+ }
+
+}
diff --git a/src/mongo/db/fts/tokenizer.h b/src/mongo/db/fts/tokenizer.h
new file mode 100644
index 00000000000..5b9a56ed8d6
--- /dev/null
+++ b/src/mongo/db/fts/tokenizer.h
@@ -0,0 +1,68 @@
+// tokenizer.h
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+#pragma once
+
+#include <string>
+
+#include "mongo/base/string_data.h"
+#include "mongo/platform/unordered_map.h"
+#include "mongo/platform/unordered_set.h"
+
+namespace mongo {
+
+ namespace fts {
+
+ struct Token { // one lexical unit handed out by Tokenizer::next()
+     enum Type { WHITESPACE, DELIMITER, TEXT, INVALID };
+     Token( Type type, const StringData& data, unsigned offset, bool previousWhiteSpace )
+         : type( type ), data( data ), offset( offset ),
+           previousWhiteSpace( previousWhiteSpace ) {}
+
+     // False only for the INVALID sentinel that next() returns at end of input.
+     bool ok() const { return INVALID != type; }
+
+     Type type;                // classification of the token
+     StringData data;          // the token's characters
+     unsigned offset;          // index of the token's first character in the input
+     bool previousWhiteSpace;  // whitespace preceded the token (always true for the first one)
+     // NOTE(review): data is presumably a non-owning view into the tokenized input - confirm.
+ };
+
+ class Tokenizer { // splits a string into TEXT and DELIMITER tokens, skipping whitespace
+ public:
+     // language: only "english" changes behavior (apostrophes stay inside words).
+     // str: text to tokenize; presumably a view whose characters must outlive this object.
+     Tokenizer( const std::string& language, const StringData& str );
+     bool more() const;  // true while input remains
+     Token next();       // the following token, or an INVALID token once input is exhausted
+
+ private:
+     Token::Type _type( char c ) const;  // classifies a single character
+     bool _skipWhitespace();             // advances _pos; true if anything was skipped
+
+     unsigned _pos;             // index of the next unread character in _raw
+     bool _previousWhiteSpace;  // whether whitespace preceded the upcoming token
+     const StringData _raw;     // by value: a reference would dangle when str is a temporary
+     bool _english;
+ };
+
+ }
+}
+
diff --git a/src/mongo/db/fts/tokenizer_test.cpp b/src/mongo/db/fts/tokenizer_test.cpp
new file mode 100644
index 00000000000..1502b2f4390
--- /dev/null
+++ b/src/mongo/db/fts/tokenizer_test.cpp
@@ -0,0 +1,119 @@
+// tokenizer_test.cpp
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "mongo/db/fts/tokenizer.h"
+#include "mongo/unittest/unittest.h"
+
+namespace mongo {
+ namespace fts {
+
+ TEST( Tokenizer, Empty1 ) { // an empty string offers no tokens
+     Tokenizer tokenizer( "english", "" );
+     ASSERT( !tokenizer.more() );
+ }
+
+ TEST( Tokenizer, Basic1 ) {
+     Tokenizer tokenizer( "english", "blue red green" );
+     // Whitespace-separated words come back one at a time, in input order.
+     ASSERT( tokenizer.more() );
+     ASSERT_EQUALS( tokenizer.next().data.toString(), "blue" );
+
+     ASSERT( tokenizer.more() );
+     ASSERT_EQUALS( tokenizer.next().data.toString(), "red" );
+
+     ASSERT( tokenizer.more() );
+     ASSERT_EQUALS( tokenizer.next().data.toString(), "green" );
+     // Nothing is left after the last word.
+     ASSERT( !tokenizer.more() );
+ }
+
+ TEST( Tokenizer, Basic2 ) {
+     Tokenizer tokenizer( "english", "blue-red" );
+     // A hyphen with no surrounding spaces yields word, delimiter, word, then INVALID.
+     Token word1 = tokenizer.next();
+     Token dash = tokenizer.next();
+     Token word2 = tokenizer.next();
+     Token past = tokenizer.next();
+
+     ASSERT_EQUALS( Token::TEXT, word1.type );
+     ASSERT_EQUALS( Token::DELIMITER, dash.type );
+     ASSERT_EQUALS( Token::TEXT, word2.type );
+     ASSERT_EQUALS( Token::INVALID, past.type );
+
+     ASSERT_EQUALS( "blue", word1.data.toString() );
+     ASSERT_EQUALS( "-", dash.data.toString() );
+     ASSERT_EQUALS( "red", word2.data.toString() );
+     // Only the first token counts as preceded by whitespace.
+     ASSERT( word1.previousWhiteSpace );
+     ASSERT( !dash.previousWhiteSpace );
+     ASSERT( !word2.previousWhiteSpace );
+ }
+
+ TEST( Tokenizer, Basic3 ) {
+     Tokenizer tokenizer( "english", "blue -red" );
+     // Same tokens as "blue-red", but the hyphen now follows a space.
+     Token word1 = tokenizer.next();
+     Token dash = tokenizer.next();
+     Token word2 = tokenizer.next();
+     Token past = tokenizer.next();
+
+     ASSERT_EQUALS( Token::TEXT, word1.type );
+     ASSERT_EQUALS( Token::DELIMITER, dash.type );
+     ASSERT_EQUALS( Token::TEXT, word2.type );
+     ASSERT_EQUALS( Token::INVALID, past.type );
+
+     ASSERT_EQUALS( "blue", word1.data.toString() );
+     ASSERT_EQUALS( "-", dash.data.toString() );
+     ASSERT_EQUALS( "red", word2.data.toString() );
+     // The space before '-' is reported on the delimiter token.
+     ASSERT( word1.previousWhiteSpace );
+     ASSERT( dash.previousWhiteSpace );
+     ASSERT( !word2.previousWhiteSpace );
+
+     // Offsets index into the original string, skipped whitespace included.
+     ASSERT_EQUALS( 0U, word1.offset );
+     ASSERT_EQUALS( 5U, dash.offset );
+     ASSERT_EQUALS( 6U, word2.offset );
+ }
+
+ TEST( Tokenizer, Quote1English ) {
+     Tokenizer tokenizer( "english", "eliot's car" );
+     // In English the apostrophe is part of the word.
+     Token possessive = tokenizer.next();
+     Token noun = tokenizer.next();
+
+     ASSERT_EQUALS( "eliot's", possessive.data.toString() );
+     ASSERT_EQUALS( "car", noun.data.toString() );
+ }
+
+ TEST( Tokenizer, Quote1French ) {
+     Tokenizer tokenizer( "french", "eliot's car" );
+     // Outside English the apostrophe separates words the way whitespace does.
+     Token head = tokenizer.next();
+     Token tail = tokenizer.next();
+     Token noun = tokenizer.next();
+
+     ASSERT_EQUALS( "eliot", head.data.toString() );
+     ASSERT_EQUALS( "s", tail.data.toString() );
+     ASSERT_EQUALS( "car", noun.data.toString() );
+ }
+
+ }
+}
+
+