diff options
Diffstat (limited to 'src/mongo')
51 files changed, 7622 insertions, 0 deletions
diff --git a/src/mongo/db/fts/SConscript b/src/mongo/db/fts/SConscript new file mode 100644 index 00000000000..efc1f10586f --- /dev/null +++ b/src/mongo/db/fts/SConscript @@ -0,0 +1,88 @@ +# -*- mode: python -*- + +Import("env") + +stop_word_languages = [ + 'danish', + 'dutch', + 'english', + 'finnish', + 'french', + 'german', + 'hungarian', + 'italian', + 'norwegian', + 'portuguese', + 'romanian', + 'russian', + 'spanish', + 'swedish', + 'turkish', +] + +env.Command( [ "stop_words_list.h", "stop_words_list.cpp"], + [ "generate_stop_words.py"] + [ 'stop_words_%s.txt' % x for x in stop_word_languages ], + "$PYTHON $SOURCES $TARGETS" ) + +# this is not awesome: stop_words_list.cpp is built from a cloned env with -O3 stripped (remove() edits the list in place and returns None) +hack = env.Clone() +hack.StaticLibrary( "stopwords", [ "stop_words_list.cpp" ] ) +if "-O3" in hack["CCFLAGS"]: + hack["CCFLAGS"].remove( "-O3" ) + +env.StaticLibrary('base', [ + 'fts_index_format.cpp', + 'fts_matcher.cpp', + 'fts_query.cpp', + 'fts_spec.cpp', + 'fts_util.cpp', + 'stemmer.cpp', + 'stop_words.cpp', + 'tokenizer.cpp', + ], LIBDEPS=["stopwords", + "$BUILD_DIR/mongo/base/base", + "$BUILD_DIR/mongo/bson", + "$BUILD_DIR/mongo/platform/platform", + "$BUILD_DIR/third_party/libstemmer_c/stemmer" + ]) + +env.StaticLibrary( 'server_common', [ + 'fts_command.cpp', + 'fts_enabled.cpp' + ] ) + +env.StaticLibrary('ftsmongod', [ + 'fts_command_mongod.cpp', + 'fts_index.cpp', + 'fts_search.cpp', + ], LIBDEPS=["base","server_common"]) + + +env.StaticLibrary('ftsmongos', [ + 'fts_command_mongos.cpp', + ], LIBDEPS=["server_common"]) + + +env.CppUnitTest( "fts_index_format_test", "fts_index_format_test.cpp", + LIBDEPS=["base"] ) + +env.CppUnitTest( "fts_stop_words_test", "stop_words_test.cpp", + LIBDEPS=["base"] ) + +env.CppUnitTest( "fts_stemmer_test", "stemmer_test.cpp", + LIBDEPS=["base"] ) + +env.CppUnitTest( "fts_tokenizer_test", "tokenizer_test.cpp", + LIBDEPS=["base"] ) + +env.CppUnitTest( "fts_query_test", "fts_query_test.cpp", + LIBDEPS=["base"] ) + +env.CppUnitTest( "fts_spec_test",
"fts_spec_test.cpp", + LIBDEPS=["base"] ) + +env.CppUnitTest( "fts_matcher_test", "fts_matcher_test.cpp", + LIBDEPS=["base"] ) + +env.CppUnitTest( "fts_util_test", "fts_util_test.cpp", + LIBDEPS=["base","$BUILD_DIR/mongo/mongohasher"] ) diff --git a/src/mongo/db/fts/fts_command.cpp b/src/mongo/db/fts/fts_command.cpp new file mode 100644 index 00000000000..0cfdf29f8c6 --- /dev/null +++ b/src/mongo/db/fts/fts_command.cpp @@ -0,0 +1,93 @@ +// fts_command.cpp + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/ + +#include <string> +#include <vector> + +#include "mongo/db/fts/fts_command.h" +#include "mongo/db/fts/fts_enabled.h" +#include "mongo/db/fts/fts_search.h" +#include "mongo/db/fts/fts_util.h" +#include "mongo/util/mongoutils/str.h" +#include "mongo/util/timer.h" + +namespace mongo { + + namespace fts { + + using namespace mongoutils; + + FTSCommand ftsCommand; + + FTSCommand::FTSCommand() + : Command( "text" ) { + } + + void FTSCommand::addRequiredPrivileges(const std::string& dbname, + const BSONObj& cmdObj, + std::vector<Privilege>* out) { + ActionSet actions; + actions.addAction(ActionType::find); + out->push_back(Privilege(parseNs(dbname, cmdObj), actions)); + } + + + bool FTSCommand::run(const string& dbname, + BSONObj& cmdObj, + int options, + string& errmsg, + BSONObjBuilder& result, + bool fromRepl) { + + if ( !isTextSearchEnabled() ) { + errmsg = "text search not enabled"; + return false; + } + + string ns = dbname + "." + cmdObj.firstElement().String(); + + string search = cmdObj["search"].valuestrsafe(); + if ( search.size() == 0 ) { + errmsg = "no search specified"; + return false; + } + + string language = cmdObj["language"].valuestrsafe(); + + int limit = cmdObj["limit"].numberInt(); + if (limit == 0) + limit = 100; + + BSONObj filter; + if ( cmdObj["filter"].isABSONObj() ) + filter = cmdObj["filter"].Obj(); + + BSONObj projection; + if (cmdObj["projection"].isABSONObj()) { + projection = cmdObj["projection"].Obj(); + } + + return _run( dbname, cmdObj, options, + ns, search, language, limit, filter, projection, errmsg, result ); + } + + + } + + +} diff --git a/src/mongo/db/fts/fts_command.h b/src/mongo/db/fts/fts_command.h new file mode 100644 index 00000000000..cbd92758ecb --- /dev/null +++ b/src/mongo/db/fts/fts_command.h @@ -0,0 +1,68 @@ +// fts_command.h + +/** +* Copyright (C) 2012 10gen Inc. 
+* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#pragma once + +#include <string> +#include <vector> + +#include "mongo/db/commands.h" + +namespace mongo { + + namespace fts { + + class FTSCommand : public Command { + public: + FTSCommand(); + + bool slaveOk() const { return true; } + bool slaveOverrideOk() const { return true; } + + LockType locktype() const; + + void addRequiredPrivileges(const std::string& dbname, + const BSONObj& cmdObj, + std::vector<Privilege>* out); + + + bool run(const string& dbname, + BSONObj& cmdObj, + int options, + string& errmsg, + BSONObjBuilder& result, + bool fromRepl); + + protected: + bool _run( const string& dbName, + BSONObj& cmdObj, + int cmdOptions, + const string& ns, + const string& searchString, + string language, // "" for not-set + int limit, + BSONObj& filter, + BSONObj& projection, + string& errmsg, + BSONObjBuilder& result ); + }; + + } + +} + diff --git a/src/mongo/db/fts/fts_command_mongod.cpp b/src/mongo/db/fts/fts_command_mongod.cpp new file mode 100644 index 00000000000..cd38175c8e5 --- /dev/null +++ b/src/mongo/db/fts/fts_command_mongod.cpp @@ -0,0 +1,159 @@ +// fts_command_mongod.cpp + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation.
+* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include <algorithm> +#include <string> +#include <vector> + +#include "mongo/db/fts/fts_command.h" +#include "mongo/db/fts/fts_search.h" +#include "mongo/db/fts/fts_util.h" +#include "mongo/db/pdfile.h" +#include "mongo/db/projection.h" +#include "mongo/util/mongoutils/str.h" +#include "mongo/util/timer.h" + +namespace mongo { + + namespace fts { + + Command::LockType FTSCommand::locktype() const { + return READ; + } + + /* + * Runs the command object cmdobj on the db with name dbname and puts result in result. + * @param dbname, name of db + * @param cmdobj, object that contains entire command + * @param options + * @param errmsg, reference to error message + * @param result, reference to builder for result + * @param fromRepl + * @return true if successful, false otherwise + */ + bool FTSCommand::_run(const string& dbname, + BSONObj& cmdObj, + int cmdOptions, + const string& ns, + const string& searchString, + string language, // "" for not-set + int limit, + BSONObj& filter, + BSONObj& projection, + string& errmsg, + BSONObjBuilder& result ) { + + Timer comm; + + scoped_ptr<Projection> pr; + if ( !projection.isEmpty() ) { + pr.reset( new Projection() ); + pr->init( projection ); + } + + // priority queue for results + Results results; + + NamespaceDetails * d = nsdetails( ns.c_str() ); + if ( !d ) { + errmsg = "can't find ns"; + return false; + } + + vector<int> idxMatches; + d->findIndexByType( INDEX_NAME, idxMatches ); + if ( idxMatches.size() == 0 ) { + errmsg = str::stream() << "no text index for: " << ns; + return false; + } + if ( idxMatches.size() > 
1 ) { + errmsg = str::stream() << "too many text index for: " << ns; + return false; + } + + const IndexDetails& id = d->idx( idxMatches[0] ); + BSONObj indexPrefix; + // the index prefix is computed from the filter, so it is needed whether or not the caller passed a language + FTSIndex* ftsIndex = static_cast<FTSIndex*>(id.getSpec().getType()); + if ( language == "" ) { + language = ftsIndex->getFtsSpec().defaultLanguage(); + } + Status s = ftsIndex->getFtsSpec().getIndexPrefix( filter, &indexPrefix ); + if ( !s.isOK() ) { + errmsg = s.toString(); + return false; + } + + + FTSQuery query; + if ( !query.parse( searchString, language ).isOK() ) { + errmsg = "can't parse search"; + return false; + } + result.append( "queryDebugString", query.debugString() ); + result.append( "language", language ); + + FTSSearch search( d, id, indexPrefix, query, filter ); + search.go( &results, limit ); + + // grab underlying container inside priority queue + vector<ScoredLocation> r( results.dangerous() ); + + // sort results by score (not always in correct order, especially w.r.t. multiterm) + sort( r.begin(), r.end() ); + + // build the results bson array shown to user + BSONArrayBuilder a( result.subarrayStart( "results" ) ); + + int BSONResultSize = 1024; + + for ( unsigned n = 0; n < r.size(); n++ ) { + BSONObj obj = BSONObj::make(r[n].rec); + BSONObj toSendBack = obj; + + if ( pr ) { + toSendBack = pr->transform(obj); + } + + if ( ( BSONResultSize + toSendBack.objsize() ) >= BSONObjMaxUserSize ) { + break; + } + + BSONObjBuilder x( a.subobjStart() ); + x.append( "score" , r[n].score ); + x.append( "obj", toSendBack ); + + BSONObj xobj = x.done(); + BSONResultSize += xobj.objsize(); + } + + a.done(); + + // returns some stats to the user + BSONObjBuilder bb( result.subobjStart( "stats" ) ); + bb.appendNumber( "nscanned" , search.getKeysLookedAt() ); + bb.appendNumber( "nscannedObjects" , search.getObjLookedAt() ); + bb.appendNumber( "n" , r.size() ); + bb.append( "timeMicros", (int)comm.micros() ); + bb.done(); + + return true; + } + } + +} diff --git
a/src/mongo/db/fts/fts_command_mongos.cpp b/src/mongo/db/fts/fts_command_mongos.cpp new file mode 100644 index 00000000000..04cc8a1b808 --- /dev/null +++ b/src/mongo/db/fts/fts_command_mongos.cpp @@ -0,0 +1,129 @@ +// fts_command_mongos.cpp + +/** + * Copyright (C) 2008 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <map> +#include <string> +#include <vector> + +#include "mongo/pch.h" + +#include "mongo/db/fts/fts_command.h" +#include "mongo/s/strategy.h" + + +namespace mongo { + namespace fts { + + struct Scored { + Scored( BSONObj full ) + : full( full ) { + score = full["score"].numberDouble(); + } + bool operator<( const Scored& other ) const { + return other.score < score; + } + BSONObj full; + double score; + }; + + + // all grid commands are designed not to lock + Command::LockType FTSCommand::locktype() const { return NONE; } + + bool FTSCommand::_run(const string& dbName, + BSONObj& cmdObj, + int cmdOptions, + const string& ns, + const string& searchString, + string language, // "" for not-set + int limit, + BSONObj& filter, + BSONObj& projection, + string& errmsg, + BSONObjBuilder& result ) { + + Timer timer; + + map<Shard, BSONObj> results; + SHARDED->commandOp( dbName, cmdObj, cmdOptions, ns, filter, results ); + + vector<Scored> all; + long long nscanned = 0; + long long nscannedObjects = 0; + + BSONObjBuilder shardStats; + + for ( map<Shard,BSONObj>::const_iterator 
i = results.begin(); i != results.end(); ++i ) { + BSONObj r = i->second; + + LOG(2) << "fts result for shard: " << i->first << "\n" << r << endl; + + if ( !r["ok"].trueValue() ) { + errmsg = str::stream() << "failure on shard: " << i->first.toString() + << ": " << r["errmsg"]; + result.append( "rawresult", r ); + return false; + } + + if ( r["stats"].isABSONObj() ) { + BSONObj x = r["stats"].Obj(); + nscanned += x["nscanned"].numberLong(); + nscannedObjects += x["nscannedObjects"].numberLong(); + + shardStats.append( i->first.getName(), x ); + } + + if ( r["results"].isABSONObj() ) { + BSONObjIterator j( r["results"].Obj() ); + while ( j.more() ) { + BSONElement e = j.next(); + all.push_back( Scored(e.Obj()) ); + } + } + } + + sort( all.begin(), all.end() ); + long long n = 0; + { + BSONArrayBuilder arr( result.subarrayStart( "results" ) ); + for ( unsigned i = 0; i < all.size(); i++ ) { + arr.append( all[i].full ); + if ( ++n >= limit ) + break; + } + arr.done(); + } + + { + BSONObjBuilder stats( result.subobjStart( "stats" ) ); + stats.appendNumber( "nscanned", nscanned ); + stats.appendNumber( "nscannedObjects", nscannedObjects ); + stats.appendNumber( "n", n ); + stats.append( "timeMicros", (int)timer.micros() ); + + stats.append( "shards", shardStats.obj() ); + + stats.done(); + } + + return true; + } + + FTSCommand ftsCommandSharded; + } +} diff --git a/src/mongo/db/fts/fts_enabled.cpp b/src/mongo/db/fts/fts_enabled.cpp new file mode 100644 index 00000000000..7a11e394f6a --- /dev/null +++ b/src/mongo/db/fts/fts_enabled.cpp @@ -0,0 +1,28 @@ +// fts_enabled.cpp + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation.
+* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "mongo/db/server_parameters.h" + +namespace mongo { + namespace fts { + MONGO_EXPORT_SERVER_PARAMETER( textSearchEnabled, bool, false ); + bool isTextSearchEnabled() { + return textSearchEnabled; + } + } +} diff --git a/src/mongo/db/fts/fts_enabled.h b/src/mongo/db/fts/fts_enabled.h new file mode 100644 index 00000000000..d3f733dc49f --- /dev/null +++ b/src/mongo/db/fts/fts_enabled.h @@ -0,0 +1,25 @@ +// fts_enabled.h + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#pragma once + +namespace mongo { + namespace fts { + bool isTextSearchEnabled(); + } +} diff --git a/src/mongo/db/fts/fts_index.cpp b/src/mongo/db/fts/fts_index.cpp new file mode 100644 index 00000000000..04fafe12a83 --- /dev/null +++ b/src/mongo/db/fts/fts_index.cpp @@ -0,0 +1,96 @@ +// fts_index.cpp + +/** +* Copyright (C) 2012 10gen Inc. 
+* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "mongo/pch.h" + +#include "mongo/base/init.h" +#include "mongo/db/client.h" +#include "mongo/db/fts/fts_enabled.h" +#include "mongo/db/fts/fts_index.h" +#include "mongo/db/fts/fts_index_format.h" +#include "mongo/util/mongoutils/str.h" +#include "mongo/util/stringutils.h" +#include "mongo/util/timer.h" + +namespace mongo { + + namespace fts { + + using namespace mongoutils; + + /* + * extrapolates the weights vector + * and extra information from the spec + * @param plugin the index plugin for FTS + * @param spec the index specification + */ + FTSIndex::FTSIndex( const IndexPlugin* plugin, const IndexSpec* spec ) + : IndexType( plugin, spec ), _ftsSpec( spec->info ) { + } + + void FTSIndex::getKeys( const BSONObj& obj, BSONObjSet& keys) const { + FTSIndexFormat::getKeys( _ftsSpec, obj, &keys ); + } + + shared_ptr<Cursor> FTSIndex::newCursor( const BSONObj& query, + const BSONObj& order, + int numWanted ) const { + shared_ptr<Cursor> c; + verify(0); + return c; + } + + + FTSIndexPlugin::FTSIndexPlugin() : IndexPlugin( INDEX_NAME ) {} + + + /* + * Adjusts spec by appending information relative to the + * FTS Index (such as weights, index name, etc) + * @param spec, specification object + * + */ + BSONObj FTSIndexPlugin::adjustIndexSpec( const BSONObj& spec ) const { + StringData desc = cc().desc(); + if ( desc.find( "conn" ) == 0 ) { + // this is to make 
sure we only complain for users + // if a text index does get created on a primary, + // we want it to be indexed on the secondary as well + massert( 16633, "text search not enabled", isTextSearchEnabled() ); + } + return FTSSpec::fixSpec( spec ); + } + + /* + * Generates an FTSIndex with a spec and this plugin + * @param spec, specification to be used + */ + IndexType* FTSIndexPlugin::generate( const IndexSpec* spec ) const { + return new FTSIndex( this, spec ); + } + + + FTSIndexPlugin* ftsPlugin; + MONGO_INITIALIZER(FTSIndexPlugin)(InitializerContext* context) { + ftsPlugin = new FTSIndexPlugin(); + return Status::OK(); + } + + } + +} diff --git a/src/mongo/db/fts/fts_index.h b/src/mongo/db/fts/fts_index.h new file mode 100644 index 00000000000..d9bf8a61b16 --- /dev/null +++ b/src/mongo/db/fts/fts_index.h @@ -0,0 +1,67 @@ +// fts_index.h + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/ + +#pragma once + +#include <map> +#include <vector> + +#include "mongo/db/fts/fts_spec.h" +#include "mongo/db/fts/fts_util.h" +#include "mongo/db/fts/stemmer.h" +#include "mongo/db/fts/stop_words.h" +#include "mongo/db/fts/tokenizer.h" +#include "mongo/db/index.h" + +namespace mongo { + + namespace fts { + + class FTSIndex : public IndexType { + public: + + // index constructor, called when user enters ensureIndex command with fts flag + FTSIndex(const IndexPlugin *plugin, const IndexSpec* spec); + + void getKeys( const BSONObj& obj, BSONObjSet& keys) const; + + /* newCursor is pure Virtual in IndexType so it has to be redefined in FTSIndex */ + shared_ptr<Cursor> newCursor( const BSONObj& query, + const BSONObj& order, + int numWanted ) const; + + const FTSSpec& getFtsSpec() const { return _ftsSpec; } + + private: + + FTSSpec _ftsSpec; + }; + + + class FTSIndexPlugin : public IndexPlugin { + public: + FTSIndexPlugin(); + + IndexType* generate( const IndexSpec* spec ) const; + + BSONObj adjustIndexSpec( const BSONObj& spec ) const; + + }; + + } //namespace fts +} //namespace mongo diff --git a/src/mongo/db/fts/fts_index_format.cpp b/src/mongo/db/fts/fts_index_format.cpp new file mode 100644 index 00000000000..b39b336d651 --- /dev/null +++ b/src/mongo/db/fts/fts_index_format.cpp @@ -0,0 +1,119 @@ +// fts_index_format.cpp + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#include "mongo/pch.h" + +#include "mongo/base/init.h" +#include "mongo/db/fts/fts_index_format.h" + +namespace mongo { + + namespace fts { + + namespace { + BSONObj nullObj; + BSONElement nullElt; + } + + MONGO_INITIALIZER( FTSIndexFormat )( InitializerContext* context ) { + BSONObjBuilder b; + b.appendNull( "" ); + nullObj = b.obj(); + nullElt = nullObj.firstElement(); + return Status::OK(); + } + + void FTSIndexFormat::getKeys( const FTSSpec& spec, + const BSONObj& obj, + BSONObjSet* keys ) { + + int extraSize = 0; + vector<BSONElement> extrasBefore; + vector<BSONElement> extrasAfter; + + // compute the non FTS key elements + for ( unsigned i = 0; i < spec.numExtraBefore(); i++ ) { + BSONElement e = obj.getFieldDotted(spec.extraBefore(i)); + if ( e.eoo() ) + e = nullElt; + extrasBefore.push_back(e); + extraSize += e.size(); + } + for ( unsigned i = 0; i < spec.numExtraAfter(); i++ ) { + BSONElement e = obj.getFieldDotted(spec.extraAfter(i)); + if ( e.eoo() ) + e = nullElt; + extrasAfter.push_back(e); + extraSize += e.size(); + } + + + TermFrequencyMap term_freqs; + spec.scoreDocument( obj, &term_freqs ); + + // create index keys from raw scores + // only 1 per string + for ( TermFrequencyMap::const_iterator i = term_freqs.begin(); + i != term_freqs.end(); + ++i ) { + + const string& term = i->first; + double weight = i->second; + + // guess the total size of the btree entry based on the size of the weight, term tuple + int guess = + 5 /* bson overhead */ + + 10 /* weight */ + + 8 /* term overhead */ + + term.size() + + extraSize; + + BSONObjBuilder b(guess); // builds a BSON object with guess length. 
+ for ( unsigned k = 0; k < extrasBefore.size(); k++ ) + b.appendAs( extrasBefore[k], "" ); + _appendIndexKey( b, weight, term ); + for ( unsigned k = 0; k < extrasAfter.size(); k++ ) + b.appendAs( extrasAfter[k], "" ); + BSONObj res = b.obj(); + + verify( guess >= res.objsize() ); + + keys->insert( res ); + } + } + + BSONObj FTSIndexFormat::getIndexKey( double weight, + const string& term, + const BSONObj& indexPrefix ) { + BSONObjBuilder b; + + BSONObjIterator i( indexPrefix ); + while ( i.more() ) + b.appendAs( i.next(), "" ); + + _appendIndexKey( b, weight, term ); + return b.obj(); + } + + void FTSIndexFormat::_appendIndexKey( BSONObjBuilder& b, double weight, const string& term ) { + verify( weight >= 0 && weight <= MAX_WEIGHT ); // FTSmaxweight = defined in fts_header + b.append( "", term ); + b.append( "", weight ); + } + } +} diff --git a/src/mongo/db/fts/fts_index_format.h b/src/mongo/db/fts/fts_index_format.h new file mode 100644 index 00000000000..eeb225e756f --- /dev/null +++ b/src/mongo/db/fts/fts_index_format.h @@ -0,0 +1,55 @@ +// fts_index_format.h + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#pragma once + +#include "mongo/db/fts/fts_spec.h" + +namespace mongo { + + namespace fts { + + class FTSIndexFormat { + public: + + static void getKeys( const FTSSpec& spec, + const BSONObj& document, + BSONObjSet* keys ); + + /* + * Helper method to get return entry from the FTSIndex as a BSONObj + * @param weight, the weight of the term in the entry + * @param term, the string term in the entry + * @param indexPrefix, the fields that go in the index first + */ + static BSONObj getIndexKey( double weight, + const string& term, + const BSONObj& indexPrefix ); + + private: + /* + * Helper method to get return entry from the FTSIndex as a BSONObj + * @param b, reference to the BSONOBjBuilder + * @param weight, the weight of the term in the entry + * @param term, the string term in the entry + */ + static void _appendIndexKey( BSONObjBuilder& b, double weight, const string& term ); + }; + + } +} diff --git a/src/mongo/db/fts/fts_index_format_test.cpp b/src/mongo/db/fts/fts_index_format_test.cpp new file mode 100644 index 00000000000..7b0f5b32f0a --- /dev/null +++ b/src/mongo/db/fts/fts_index_format_test.cpp @@ -0,0 +1,96 @@ +// fts_index_format_test.cpp + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + + +#include "mongo/pch.h" + +#include "mongo/db/fts/fts_index_format.h" +#include "mongo/unittest/unittest.h" + +namespace mongo { + + namespace fts { + + TEST( FTSIndexFormat, Simple1 ) { + FTSSpec spec( FTSSpec::fixSpec( BSON( "key" << BSON( "data" << "text" ) ) ) ); + BSONObjSet keys; + FTSIndexFormat::getKeys( spec, BSON( "data" << "cat sat" ), &keys ); + + ASSERT_EQUALS( 2U, keys.size() ); + for ( BSONObjSet::const_iterator i = keys.begin(); i != keys.end(); ++i ) { + BSONObj key = *i; + ASSERT_EQUALS( 2, key.nFields() ); + ASSERT_EQUALS( String, key.firstElement().type() ); + } + } + + TEST( FTSIndexFormat, ExtraBack1 ) { + FTSSpec spec( FTSSpec::fixSpec( BSON( "key" << BSON( "data" << "text" << + "x" << 1 ) ) ) ); + BSONObjSet keys; + FTSIndexFormat::getKeys( spec, BSON( "data" << "cat" << "x" << 5 ), &keys ); + + ASSERT_EQUALS( 1U, keys.size() ); + BSONObj key = *(keys.begin()); + ASSERT_EQUALS( 3, key.nFields() ); + BSONObjIterator i( key ); + ASSERT_EQUALS( StringData("cat"), i.next().valuestr() ); + ASSERT( i.next().numberDouble() > 0 ); + ASSERT_EQUALS( 5, i.next().numberInt() ); + } + + /* + TEST( FTSIndexFormat, ExtraBackArray1 ) { + FTSSpec spec( FTSSpec::fixSpec( BSON( "key" << BSON( "data" << "text" << + "x.y" << 1 ) ) ) ); + BSONObjSet keys; + FTSIndexFormat::getKeys( spec, + BSON( "data" << "cat" << + "x" << BSON_ARRAY( BSON( "y" << 1 ) << + BSON( "y" << 2 ) ) ), + &keys ); + + ASSERT_EQUALS( 1U, keys.size() ); + BSONObj key = *(keys.begin()); + log() << "e: " << key << endl; + ASSERT_EQUALS( 3, key.nFields() ); + BSONObjIterator i( key ); + ASSERT_EQUALS( StringData("cat"), i.next().valuestr() ); + ASSERT( i.next().numberDouble() > 0 ); + ASSERT_EQUALS( 5, i.next().numberInt() ); + } + */ + + TEST( FTSIndexFormat, ExtraFront1 ) { + FTSSpec spec( FTSSpec::fixSpec( BSON( "key" << BSON( "x" << 1 << + "data" << "text" ) ) ) ); + BSONObjSet keys; + FTSIndexFormat::getKeys( spec, BSON( "data" << "cat" << "x" << 5 ), &keys ); + + ASSERT_EQUALS( 
1U, keys.size() ); + BSONObj key = *(keys.begin()); + ASSERT_EQUALS( 3, key.nFields() ); + BSONObjIterator i( key ); + ASSERT_EQUALS( 5, i.next().numberInt() ); + ASSERT_EQUALS( StringData("cat"), i.next().valuestr() ); + ASSERT( i.next().numberDouble() > 0 ); + } + + + } +} diff --git a/src/mongo/db/fts/fts_matcher.cpp b/src/mongo/db/fts/fts_matcher.cpp new file mode 100644 index 00000000000..313fdd5be9e --- /dev/null +++ b/src/mongo/db/fts/fts_matcher.cpp @@ -0,0 +1,247 @@ +// fts_matcher.cpp + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "mongo/pch.h" + +#include "mongo/db/fts/fts_matcher.h" + +namespace mongo { + + namespace fts { + + + FTSMatcher::FTSMatcher( const FTSQuery& query, const FTSSpec& spec ) + : _query( query ), + _spec( spec ), + _stemmer( query.getLanguage() ){ + } + + /* + * Checks if the obj contains any of the negTerms, if so returns true, otherwise false + * @param obj, object to be checked + */ + bool FTSMatcher::hasNegativeTerm(const BSONObj& obj ) const { + // called during search. deals with the case in which we have a term + // flagged for exclusion, i.e. 
"hello -world" we want to remove all + // results that include "world" + + if ( _query.getNegatedTerms().size() == 0 ) + return false; + + if ( _spec.wildcard() ) { + return _hasNegativeTerm_recurse(obj); + } + + /* otherwise look at fields where weights are defined */ + for ( Weights::const_iterator i = _spec.weights().begin(); + i != _spec.weights().end(); + i++ ) { + const char * leftOverName = i->first.c_str(); + BSONElement e = obj.getFieldDottedOrArray(leftOverName); + + if ( e.type() == Array ) { + BSONObjIterator j( e.Obj() ); + while ( j.more() ) { + BSONElement x = j.next(); + if ( leftOverName[0] && x.isABSONObj() ) + x = x.Obj().getFieldDotted( leftOverName ); + if ( x.type() == String ) + if ( _hasNegativeTerm_string( x.String() ) ) + return true; + } + } + else if ( e.type() == String ) { + if ( _hasNegativeTerm_string( e.String() ) ) + return true; + } + } + return false; + } + + bool FTSMatcher::_hasNegativeTerm_recurse(const BSONObj& obj ) const { + BSONObjIterator j( obj ); + while ( j.more() ) { + BSONElement x = j.next(); + + if ( _spec.languageOverrideField() == x.fieldName()) + continue; + + if (x.type() == String) { + if ( _hasNegativeTerm_string( x.String() ) ) + return true; + } + else if ( x.isABSONObj() ) { + BSONObjIterator k( x.Obj() ); + while ( k.more() ) { + // check if k.next() is a obj/array or not + BSONElement y = k.next(); + if ( y.type() == String ) { + if ( _hasNegativeTerm_string( y.String() ) ) + return true; + } + else if ( y.isABSONObj() ) { + if ( _hasNegativeTerm_recurse( y.Obj() ) ) + return true; + } + } + } + } + return false; + } + + /* + * Checks if any of the negTerms is in the tokenized string + * @param raw, the raw string to be tokenized + */ + bool FTSMatcher::_hasNegativeTerm_string( const string& raw ) const { + + Tokenizer i( _query.getLanguage(), raw ); + while ( i.more() ) { + Token t = i.next(); + if ( t.type != Token::TEXT ) + continue; + string word = tolowerString( _stemmer.stem( t.data ) ); + if ( 
_query.getNegatedTerms().count( word ) > 0 ) + return true; + } + return false; + } + + + bool FTSMatcher::phrasesMatch( const BSONObj& obj ) const { + for (unsigned i = 0; i < _query.getPhr().size(); i++ ) { + if ( !phraseMatch( _query.getPhr()[i], obj ) ) { + return false; + } + } + + for (unsigned i = 0; i < _query.getNegatedPhr().size(); i++ ) { + if ( phraseMatch( _query.getNegatedPhr()[i], obj ) ) { + return false; + } + } + + return true; + } + + + /** + * Checks if phrase is exactly matched in obj, returns true if so, false otherwise + * @param phrase, the string to be matched + * @param obj, document in the collection to match against + */ + bool FTSMatcher::phraseMatch( const string& phrase, const BSONObj& obj ) const { + + if ( _spec.wildcard() ) { + // case where everything is indexed (all fields) + return _phraseRecurse( phrase, obj ); + } + + for ( Weights::const_iterator i = _spec.weights().begin(); + i != _spec.weights().end(); + ++i ) { + + // figure out what the indexed field is.. ie. is it "field" or "field.subfield" etc. 
+ const char * leftOverName = i->first.c_str(); + BSONElement e = obj.getFieldDottedOrArray(leftOverName); + + if ( e.type() == Array ) { + BSONObjIterator j( e.Obj() ); + while ( j.more() ) { + BSONElement x = j.next(); + + if ( leftOverName[0] && x.isABSONObj() ) + x = x.Obj().getFieldDotted( leftOverName ); + + if ( x.type() == String ) + if ( _phraseMatches( phrase, x.String() ) ) + return true; + } + } + else if ( e.type() == String ) { + if ( _phraseMatches( phrase, e.String() ) ) + return true; + } + } + return false; + } + + + /* + * Recurses over all fields in the obj to match against phrase + * @param phrase, string to be matched + * @param obj, object to matched against + */ + bool FTSMatcher::_phraseRecurse( const string& phrase, const BSONObj& obj ) const { + BSONObjIterator j( obj ); + while ( j.more() ) { + BSONElement x = j.next(); + + if ( _spec.languageOverrideField() == x.fieldName() ) + continue; + + if ( x.type() == String ) { + if ( _phraseMatches( phrase, x.String() ) ) + return true; + } + else if ( x.isABSONObj() ) { + BSONObjIterator k( x.Obj() ); + + while ( k.more() ) { + + BSONElement y = k.next(); + + if ( y.type() == mongo::String ) { + if ( _phraseMatches( phrase, y.String() ) ) + return true; + } + else if ( y.isABSONObj() ) { + if ( _phraseRecurse( phrase, y.Obj() ) ) + return true; + } + } + + } + } + + return false; + } + + + /* + * Looks for phrase in a raw string + * @param phrase, phrase to match + * @param raw, raw string to be parsed + */ + bool FTSMatcher::_phraseMatches( const string& phrase, const string& haystack ) const { +#ifdef _WIN32 + // windows doesn't have strcasestr + // for now, doing something very slow, bu correct + string p = phrase; + string h = haystack; + makeLower( &p ); + makeLower( &h ); + return strstr( h.c_str(), p.c_str() ) > 0; +#else + return strcasestr( haystack.c_str(), phrase.c_str() ) > 0; +#endif + } + + + } +} diff --git a/src/mongo/db/fts/fts_matcher.h b/src/mongo/db/fts/fts_matcher.h new 
file mode 100644 index 00000000000..c5478d63b78 --- /dev/null +++ b/src/mongo/db/fts/fts_matcher.h @@ -0,0 +1,67 @@ +// fts_matcher.h + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#pragma once + +#include "mongo/db/fts/fts_query.h" +#include "mongo/db/fts/fts_spec.h" +#include "mongo/db/fts/tokenizer.h" + +namespace mongo { + + namespace fts { + + class FTSMatcher { + public: + FTSMatcher( const FTSQuery& query, const FTSSpec& spec ); + + /** + * @return true if obj has a negated term + */ + bool hasNegativeTerm(const BSONObj& obj ) const; + + /** + * @return true if obj is ok by all phrases + * so all full phrases and no negated + */ + bool phrasesMatch( const BSONObj& obj ) const; + + bool phraseMatch( const string& phrase, const BSONObj& obj ) const; + + bool matchesNonTerm( const BSONObj& obj ) const { + return !hasNegativeTerm( obj ) && phrasesMatch( obj ); + } + + private: + bool _hasNegativeTerm_recurse(const BSONObj& obj ) const; + + /** + * @return true if raw has a negated term + */ + bool _hasNegativeTerm_string( const string& raw ) const; + + bool _phraseRecurse( const string& phrase, const BSONObj& obj ) const; + bool _phraseMatches( const string& phrase, const string& haystack ) const; + + FTSQuery _query; + FTSSpec _spec; + Stemmer _stemmer; + }; + + } +} diff --git a/src/mongo/db/fts/fts_matcher_test.cpp b/src/mongo/db/fts/fts_matcher_test.cpp new file 
mode 100644 index 00000000000..15369980885 --- /dev/null +++ b/src/mongo/db/fts/fts_matcher_test.cpp @@ -0,0 +1,63 @@ +// fts_matcher_test.cpp + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "mongo/pch.h" + +#include "mongo/db/fts/fts_matcher.h" +#include "mongo/unittest/unittest.h" + +namespace mongo { + namespace fts { + + TEST( FTSMatcher, NegWild1 ) { + FTSQuery q; + q.parse( "foo -bar", "english" ); + FTSMatcher m( q, + FTSSpec( FTSSpec::fixSpec( BSON( "key" << BSON( "$**" << "fts" ) ) ) ) ); + + ASSERT( m.hasNegativeTerm( BSON( "x" << BSON( "y" << "bar" ) ) ) ); + ASSERT( m.hasNegativeTerm( BSON( "x" << BSON( "y" << "bar" ) ) ) ); + } + + TEST( FTSMatcher, Phrase1 ) { + FTSQuery q; + q.parse( "foo \"table top\"", "english" ); + FTSMatcher m( q, + FTSSpec( FTSSpec::fixSpec( BSON( "key" << BSON( "$**" << "fts" ) ) ) ) ); + + ASSERT( m.phraseMatch( "table top", BSON( "x" << "table top" ) ) ); + ASSERT( m.phraseMatch( "table top", BSON( "x" << " asd table top asd" ) ) ); + ASSERT( !m.phraseMatch( "table top", BSON( "x" << "tablz top" ) ) ); + ASSERT( !m.phraseMatch( "table top", BSON( "x" << " asd tablz top asd" ) ) ); + + ASSERT( m.phrasesMatch( BSON( "x" << "table top" ) ) ); + ASSERT( !m.phrasesMatch( BSON( "x" << "table a top" ) ) ); + + } + + TEST( FTSMatcher, Phrase2 ) { + FTSQuery q; + q.parse( "foo \"table top\"", "english" ); + FTSMatcher m( q, + 
FTSSpec( FTSSpec::fixSpec( BSON( "key" << BSON( "x" << "fts" ) ) ) ) ); + ASSERT( m.phraseMatch( "table top", + BSON( "x" << BSON_ARRAY( "table top" ) ) ) ); + } + + } +} diff --git a/src/mongo/db/fts/fts_query.cpp b/src/mongo/db/fts/fts_query.cpp new file mode 100644 index 00000000000..0f32ba1afad --- /dev/null +++ b/src/mongo/db/fts/fts_query.cpp @@ -0,0 +1,173 @@ +// fts_query.cpp + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#include "mongo/pch.h" + +#include "mongo/db/fts/fts_query.h" +#include "mongo/db/fts/tokenizer.h" +#include "mongo/util/mongoutils/str.h" +#include "mongo/util/stringutils.h" + +namespace mongo { + + namespace fts { + + using namespace mongoutils; + + Status FTSQuery::parse(const string& query, const string& language) { + _search = query; + _language = language; + + const StopWords* stopWords = StopWords::getStopWords( language ); + Stemmer stemmer( language ); + + bool inNegation = false; + bool inPhrase = false; + + str::stream phrase; + + Tokenizer i( _language, query ); + while ( i.more() ) { + Token t = i.next(); + + if ( t.type == Token::TEXT ) { + string s = t.data.toString(); + + if ( inPhrase ) { + if ( phrase.ss.len() > 0 ) + phrase << ' '; + phrase << s; + } + + if ( inPhrase && inNegation ) { + // don't add term + } + else { + _addTerm( stopWords, stemmer, s, inNegation ); + } + + if ( inNegation && !inPhrase ) + inNegation = false; + } + else if ( t.type == Token::DELIMITER ) { + char c = t.data[0]; + if ( c == '-' ) { + if ( t.previousWhiteSpace ) + inNegation = true; + } + else if ( c == '"' ) { + if ( inPhrase ) { + // end of a phrase + if ( inNegation ) + _negatedPhrases.push_back( tolowerString( phrase ) ); + else + _phrases.push_back( tolowerString( phrase ) ); + inNegation = false; + inPhrase = false; + } + else { + // start of a phrase + inPhrase = true; + phrase.ss.reset(); + } + } + } + else { + abort(); + } + } + + return Status::OK(); + } + + void FTSQuery::_addTerm( const StopWords* sw, Stemmer& stemmer, const string& term, bool negated ) { + string word = tolowerString( term ); + if ( sw->isStopWord( word ) ) + return; + word = stemmer.stem( word ); + if ( negated ) + _negatedTerms.insert( word ); + else + _terms.push_back( word ); + } + + namespace { + void _debugHelp( stringstream& ss, const set<string>& s, const string& sep ) { + bool first = true; + for ( set<string>::const_iterator i = s.begin(); i != s.end(); ++i ) { + if ( 
first ) + first = false; + else + ss << sep; + ss << *i; + } + } + + void _debugHelp( stringstream& ss, const vector<string>& v, const string& sep ) { + set<string> s( v.begin(), v.end() ); + _debugHelp( ss, s, sep ); + } + + void _debugHelp( stringstream& ss, const unordered_set<string>& v, const string& sep ) { + set<string> s( v.begin(), v.end() ); + _debugHelp( ss, s, sep ); + } + + } + + string FTSQuery::toString() const { + stringstream ss; + ss << "FTSQuery\n"; + + ss << " terms: "; + _debugHelp( ss, getTerms(), ", " ); + ss << "\n"; + + ss << " negated terms: "; + _debugHelp( ss, getNegatedTerms(), ", " ); + ss << "\n"; + + ss << " phrases: "; + _debugHelp( ss, getPhr(), ", " ); + ss << "\n"; + + ss << " negated phrases: "; + _debugHelp( ss, getNegatedPhr(), ", " ); + ss << "\n"; + + return ss.str(); + } + + string FTSQuery::debugString() const { + stringstream ss; + + _debugHelp( ss, getTerms(), "|" ); + ss << "||"; + + _debugHelp( ss, getNegatedTerms(), "|" ); + ss << "||"; + + _debugHelp( ss, getPhr(), "|" ); + ss << "||"; + + _debugHelp( ss, getNegatedPhr(), "|" ); + + return ss.str(); + } + } +} diff --git a/src/mongo/db/fts/fts_query.h b/src/mongo/db/fts/fts_query.h new file mode 100644 index 00000000000..7022760b3a7 --- /dev/null +++ b/src/mongo/db/fts/fts_query.h @@ -0,0 +1,80 @@ +// fts_query.h + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#pragma once + +#include <string> +#include <vector> + +#include "mongo/base/status.h" +#include "mongo/db/fts/stemmer.h" +#include "mongo/db/fts/stop_words.h" +#include "mongo/platform/unordered_set.h" +#include "mongo/util/stringutils.h" + +namespace mongo { + + namespace fts { + + using std::string; + using std::vector; + using std::set; + + class FTSQuery { + + public: + Status parse(const string& query, const string& language); + + const vector<string>& getTerms() const { return _terms; } + const unordered_set<string>& getNegatedTerms() const { return _negatedTerms; } + + const vector<string>& getPhr() const { return _phrases; } + const vector<string>& getNegatedPhr() const { return _negatedPhrases; } + + /** + * @return true if any negations or phrase + or - + */ + bool hasNonTermPieces() const { + return + _negatedTerms.size() > 0 || + _phrases.size() > 0 || + _negatedPhrases.size() > 0; + } + + string getSearch() const { return _search; } + string getLanguage() const { return _language; } + + string toString() const; + + string debugString() const; + + protected: + string _search; + string _language; + vector<string> _terms; + unordered_set<string> _negatedTerms; + vector<string> _phrases; + vector<string> _negatedPhrases; + + private: + void _addTerm( const StopWords* sw, Stemmer& stemmer, const string& term, bool negated ); + }; + + } +} + diff --git a/src/mongo/db/fts/fts_query_test.cpp b/src/mongo/db/fts/fts_query_test.cpp new file mode 100644 index 00000000000..92bd6ee222a --- /dev/null +++ b/src/mongo/db/fts/fts_query_test.cpp @@ -0,0 +1,73 @@ +// fts_query_test.cpp + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. 
+* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + + +#include "mongo/db/fts/fts_query.h" +#include "mongo/unittest/unittest.h" + +namespace mongo { + namespace fts { + + TEST( FTSQuery, Basic1 ) { + FTSQuery q; + ASSERT( q.parse( "this is fun", "english" ).isOK() ); + + ASSERT_EQUALS( 1U, q.getTerms().size() ); + ASSERT_EQUALS( "fun", q.getTerms()[0] ); + ASSERT_EQUALS( 0U, q.getNegatedTerms().size() ); + ASSERT_EQUALS( 0U, q.getPhr().size() ); + ASSERT_EQUALS( 0U, q.getNegatedPhr().size() ); + } + + TEST( FTSQuery, Neg1 ) { + FTSQuery q; + ASSERT( q.parse( "this is -really fun", "english" ).isOK() ); + + ASSERT_EQUALS( 1U, q.getTerms().size() ); + ASSERT_EQUALS( "fun", q.getTerms()[0] ); + ASSERT_EQUALS( 1U, q.getNegatedTerms().size() ); + ASSERT_EQUALS( "realli", *q.getNegatedTerms().begin() ); + } + + TEST( FTSQuery, Phrase1 ) { + FTSQuery q; + ASSERT( q.parse( "doing a \"phrase test\" for fun", "english" ).isOK() ); + + ASSERT_EQUALS( 3U, q.getTerms().size() ); + ASSERT_EQUALS( 0U, q.getNegatedTerms().size() ); + ASSERT_EQUALS( 1U, q.getPhr().size() ); + ASSERT_EQUALS( 0U, q.getNegatedPhr().size() ); + + ASSERT_EQUALS( "phrase test", q.getPhr()[0] ); + ASSERT_EQUALS( "fun|phrase|test||||phrase test||", q.debugString() ); + } + + TEST( FTSQuery, NegPhrase1 ) { + FTSQuery q; + ASSERT( q.parse( "doing a -\"phrase test\" for fun", "english" ).isOK() ); + ASSERT_EQUALS( "fun||||||phrase test", q.debugString() ); + } + + TEST( FTSQuery, Mix1 ) { + FTSQuery q; + ASSERT( q.parse( "\"industry\" -Melbourne -Physics", "english" ).isOK() ); + ASSERT_EQUALS( "industri||melbourn|physic||industry||", q.debugString() ); + 
} + + } +} diff --git a/src/mongo/db/fts/fts_search.cpp b/src/mongo/db/fts/fts_search.cpp new file mode 100644 index 00000000000..5686cb89ffb --- /dev/null +++ b/src/mongo/db/fts/fts_search.cpp @@ -0,0 +1,175 @@ +// fts_search.cpp + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "mongo/pch.h" + +#include "mongo/db/btreecursor.h" +#include "mongo/db/fts/fts_index_format.h" +#include "mongo/db/fts/fts_search.h" +#include "mongo/db/kill_current_op.h" +#include "mongo/db/pdfile.h" + +namespace mongo { + + namespace fts { + + /* + * Constructor generates query and term dictionaries + * @param ns, namespace + * @param idxNum, index number + * @param search, query string + * @param language, language of the query + * @param filter, filter object + */ + FTSSearch::FTSSearch( NamespaceDetails* ns, + const IndexDetails& id, + const BSONObj& indexPrefix, + const FTSQuery& query, + const BSONObj& filter ) + : _ns( ns ), + _id( id ), + _fts( static_cast<FTSIndex*>(_id.getSpec().getType()) ), + _indexPrefix( indexPrefix ), + _query( query ), + _ftsMatcher( query, static_cast<FTSIndex*>(_id.getSpec().getType())->getFtsSpec() ) { + + if ( !filter.isEmpty() ) + _matcher.reset( new CoveredIndexMatcher( filter, _fts->keyPattern() ) ); + + _keysLookedAt = 0; + _objectsLookedAt = 0; + } + + bool FTSSearch::_ok( Record* record ) const { + if ( !_query.hasNonTermPieces() ) + return true; 
+ return _ftsMatcher.matchesNonTerm( BSONObj::make( record ) ); + } + + /* + * GO: sets the tree cursors on each term in terms, processes the terms by advancing + * the terms cursors and storing the partial + * results and lastly calculates the top results + * @param results, the priority queue containing the top results + * @param limit, number of results in the priority queue + */ + void FTSSearch::go(Results* results, unsigned limit ) { + vector< shared_ptr<BtreeCursor> > cursors; + + for ( unsigned i = 0; i < _query.getTerms().size(); i++ ) { + const string& term = _query.getTerms()[i]; + BSONObj min = FTSIndexFormat::getIndexKey( MAX_WEIGHT, term, _indexPrefix ); + BSONObj max = FTSIndexFormat::getIndexKey( 0, term, _indexPrefix ); + shared_ptr<BtreeCursor> c( BtreeCursor::make( _ns, _id, min, max, true, -1 ) ); + cursors.push_back( c ); + } + + while ( !inShutdown() ) { + bool gotAny = false; + for ( unsigned i = 0; i < cursors.size(); i++ ) { + if ( cursors[i]->eof() ) + continue; + gotAny = true; + _process( cursors[i].get() ); + cursors[i]->advance(); + } + + if ( !gotAny ) + break; + + RARELY killCurrentOp.checkForInterrupt(); + } + + + // priority queue using a compare that grabs the lowest of two ScoredLocations by score. 
+ for ( Scores::iterator i = _scores.begin(); i != _scores.end(); ++i ) { + + if ( i->second < 0 ) + continue; + + // priority queue + if ( results->size() < limit ) { // case a: queue unfilled + + if ( !_ok( i->first ) ) + continue; + + results->push( ScoredLocation( i->first, i->second ) ); + + } + else if ( i->second > results->top().score ) { // case b: queue filled + + if ( !_ok( i->first ) ) + continue; + + results->pop(); + results->push( ScoredLocation( i->first, i->second ) ); + } + else { + // else do nothing (case c) + } + + } + + } + + /* + * Takes a cursor and updates the partial score for said cursor in _scores map + * @param cursor, btree cursor pointing to the current document to be scored + */ + void FTSSearch::_process( BtreeCursor* cursor ) { + _keysLookedAt++; + + BSONObj key = cursor->currKey(); + + BSONObjIterator i( key ); + BSONElement indexToken = i.next(); + BSONElement scoreElement = i.next(); + + double score = scoreElement.number(); + + double& cur = _scores[(cursor->currLoc()).rec()]; + + if ( cur < 0 ) { + // already been rejected + return; + } + + if ( cur == 0 && _matcher.get() ) { + // we haven't seen this before and we have a matcher + MatchDetails d; + if ( !_matcher->matchesCurrent( cursor, &d ) ) { + cur = -1; + } + + if ( d.hasLoadedRecord() ) + _objectsLookedAt++; + + if ( cur == -1 ) + return; + } + + if ( cur ) + cur += score * (1 + 1 / score); + else + cur += score; + + } + + } + +} diff --git a/src/mongo/db/fts/fts_search.h b/src/mongo/db/fts/fts_search.h new file mode 100644 index 00000000000..82e5b66f3b2 --- /dev/null +++ b/src/mongo/db/fts/fts_search.h @@ -0,0 +1,103 @@ +// fts_search.h + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. 
+* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#pragma once + +#include <map> +#include <set> +#include <vector> +#include <queue> + +#include "mongo/base/disallow_copying.h" +#include "mongo/db/fts/fts_index.h" +#include "mongo/db/fts/fts_matcher.h" +#include "mongo/db/fts/fts_query.h" +#include "mongo/db/fts/fts_util.h" +#include "mongo/db/matcher.h" + +namespace mongo { + + class BtreeCursor; + + namespace fts { + + // priority queue template, for use when we're populating results + // vector returned to the user. extends the default priority_queue + // by providing direct access to the underlying vector, which should + // be used CAREFULLY because you can get into trouble.. + template <class T, class S, class C> + class a_priority_queue : public std::priority_queue<T, S, C> { + public: + // return the value of an element at position n when we call pq[n] + T operator[](const int &n) { return this->c[n]; } + // return underlying data structure. called dangerous because it is. 
+ S dangerous() { return this->c; } + }; + + typedef a_priority_queue<ScoredLocation, vector<ScoredLocation>, ScoredLocationComp> Results; + + class FTSSearch { + MONGO_DISALLOW_COPYING(FTSSearch); + public: + + typedef std::map<Record*,double> Scores; + + FTSSearch( NamespaceDetails* ns, + const IndexDetails& id, + const BSONObj& indexPrefix, + const FTSQuery& query, + const BSONObj& filter ); + + + void go(Results* results, unsigned limit ); + + const FTSIndex * getIndex() const { return _fts; } + + long long getKeysLookedAt() const { return _keysLookedAt; } + long long getObjLookedAt() const { return _objectsLookedAt; } + + private: + + void _process( BtreeCursor* cursor ); + + /** + * checks not index pieces + * i.e. prhases & negated terms + */ + bool _ok( Record* record ) const; + + NamespaceDetails* _ns; + const IndexDetails& _id; + FTSIndex* _fts; + BSONObj _indexPrefix; + FTSQuery _query; + FTSMatcher _ftsMatcher; + + scoped_ptr<CoveredIndexMatcher> _matcher; + + long long _keysLookedAt; + long long _objectsLookedAt; + + Scores _scores; + + }; + + } // namespace fts + +} // namespace mongo + diff --git a/src/mongo/db/fts/fts_spec.cpp b/src/mongo/db/fts/fts_spec.cpp new file mode 100644 index 00000000000..ab541b6a7f4 --- /dev/null +++ b/src/mongo/db/fts/fts_spec.cpp @@ -0,0 +1,395 @@ +// fts_spec.cpp + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#include "mongo/pch.h" + +#include "mongo/db/fts/fts_spec.h" +#include "mongo/db/fts/fts_util.h" +#include "mongo/util/mongoutils/str.h" + +namespace mongo { + + namespace fts { + + using namespace mongoutils; + + const double MAX_WEIGHT = 1000000000.0; + + + FTSSpec::FTSSpec( const BSONObj& indexInfo ) { + _defaultLanguage = indexInfo["default_language"].valuestrsafe(); + _languageOverrideField = indexInfo["language_override"].valuestrsafe(); + + if ( _defaultLanguage.size() == 0 ) + _defaultLanguage = "english"; + if ( _languageOverrideField.size() == 0 ) + _languageOverrideField = "language"; + + _wildcard = false; + + // in this block we fill in the _weights map + { + BSONObjIterator i( indexInfo["weights"].Obj() ); + while ( i.more() ) { + BSONElement e = i.next(); + verify( e.isNumber() ); + + if ( WILDCARD == e.fieldName() ) { + _wildcard = true; + } + else { + double num = e.number(); + _weights[ e.fieldName() ] = num; + verify( num > 0 && num < MAX_WEIGHT ); + } + } + verify( _wildcard || _weights.size() ); + } + + // extra information + { + BSONObj keyPattern = indexInfo["key"].Obj(); + verify( keyPattern.nFields() >= 2 ); + BSONObjIterator i( keyPattern ); + + bool passedFTS = false; + + while ( i.more() ) { + BSONElement e = i.next(); + if ( str::equals( e.fieldName(), "_fts" ) || + str::equals( e.fieldName(), "_ftsx" ) ) { + passedFTS = true; + continue; + } + + if ( passedFTS ) + _extraAfter.push_back( e.fieldName() ); + else + _extraBefore.push_back( e.fieldName() ); + } + + } + } + + bool FTSSpec::weight( const StringData& field, double* out ) const { + Weights::const_iterator i = _weights.find( field.toString() ); + if ( i == _weights.end() ) + return false; + *out = i->second; + return true; + } + + string FTSSpec::getLanguageToUse( const BSONObj& userDoc ) const { + BSONElement e = userDoc[_languageOverrideField]; + if ( e.type() == String ) { + const char * x = e.valuestrsafe(); + if ( strlen( x ) > 0 ) + return x; + } + return 
_defaultLanguage; + } + + + /* + * Calculates the score for all terms in a document of a collection + * @param obj, the document in the collection being parsed + * @param term_freqs, map<string,double> to fill up + */ + void FTSSpec::scoreDocument( const BSONObj& obj, TermFrequencyMap* term_freqs ) const { + + string language = getLanguageToUse( obj ); + + Stemmer stemmer(language); + Tools tools(language); + tools.stemmer = &stemmer; + tools.stopwords = StopWords::getStopWords( language ); + + if ( wildcard() ) { + // if * is specified for weight, we can recurse over all fields. + _scoreRecurse(tools, obj, term_freqs); + return; + } + + // otherwise, we need to remember the different weights for each field + // and act accordingly (in other words, call _score) + for ( Weights::const_iterator i = _weights.begin(); i != _weights.end(); i++ ) { + const char * leftOverName = i->first.c_str(); + // name of field + BSONElement e = obj.getFieldDottedOrArray(leftOverName); + // weight associated to name of field + double weight = i->second; + + if ( e.eoo() ) { + // do nothing + } + else if ( e.type() == Array ) { + BSONObjIterator j( e.Obj() ); + while ( j.more() ) { + BSONElement x = j.next(); + if ( leftOverName[0] && x.isABSONObj() ) + x = x.Obj().getFieldDotted( leftOverName ); + if ( x.type() == String ) + _scoreString( tools, x.valuestr(), term_freqs, weight ); + } + } + else if ( e.type() == String ) { + _scoreString( tools, e.valuestr(), term_freqs, weight ); + } + + } + } + + + /* + * Recurses over all fields of an obj (document in collection) + * and fills term,score map term_freqs + * @param tokenizer, tokenizer to tokenize a string into terms + * @param obj, object being parsed + * term_freqs, map <term,score> to be filled up + */ + void FTSSpec::_scoreRecurse(const Tools& tools, + const BSONObj& obj, + TermFrequencyMap* term_freqs ) const { + BSONObjIterator j( obj ); + while ( j.more() ) { + BSONElement x = j.next(); + + if ( languageOverrideField() == 
x.fieldName() ) + continue; + + if (x.type() == String) { + double w = 1; + weight( x.fieldName(), &w ); + _scoreString(tools, x.valuestr(), term_freqs, w); + } + else if ( x.isABSONObj() ) { + _scoreRecurse( tools, x.Obj(), term_freqs); + } + + } + } + + namespace { + struct ScoreHelperStruct { + ScoreHelperStruct() + : freq(0), count(0), exp(0){ + } + double freq; + double count; + double exp; + }; + typedef unordered_map<string,ScoreHelperStruct> ScoreHelperMap; + } + + void FTSSpec::_scoreString( const Tools& tools, + const StringData& raw, + TermFrequencyMap* docScores, + double weight ) const { + + ScoreHelperMap terms; + + unsigned numTokens = 0; + + Tokenizer i( tools.language, raw ); + while ( i.more() ) { + Token t = i.next(); + if ( t.type != Token::TEXT ) + continue; + + string term = t.data.toString(); + makeLower( &term ); + term = tools.stemmer->stem( term ); + if ( tools.stopwords->isStopWord( term ) ) + continue; + + ScoreHelperStruct& data = terms[term]; + + if ( data.exp ) + data.exp *= 2; + else + data.exp = 1; + data.count += 1; + data.freq += ( 1 / data.exp ); + + numTokens++; + } + + for ( ScoreHelperMap::const_iterator i = terms.begin(); i != terms.end(); ++i ) { + + const string& term = i->first; + const ScoreHelperStruct& data = i->second; + + // in order to adjust weights as a function of term count as it + // relates to total field length. ie. is this the only word or + // a frequently occuring term? or does it only show up once in + // a long block of text? + + double coeff = ( 0.5 * data.count / numTokens ) + 0.5; + + // if term is identical to the raw form of the + // field (untokenized) give it a small boost. 
+ double adjustment = 1; + if ( raw.size() == term.length() && raw.equalCaseInsensitive( term ) ) + adjustment += 0.1; + + double& score = (*docScores)[term]; + score += ( weight * data.freq * coeff * adjustment ); + verify( score <= MAX_WEIGHT ); + } + } + + Status FTSSpec::getIndexPrefix( const BSONObj& query, BSONObj* out ) const { + if ( numExtraBefore() == 0 ) { + *out = BSONObj(); + return Status::OK(); + } + + BSONObjBuilder b; + for ( unsigned i = 0; i < numExtraBefore(); i++ ) { + BSONElement e = query.getFieldDotted(extraBefore(i)); + if ( e.eoo() ) + return Status( ErrorCodes::BadValue, + str::stream() + << "need have an eaulity filter on: " + << extraBefore(i) ); + + if ( e.isABSONObj() && e.Obj().firstElement().getGtLtOp( -1 ) != -1 ) + return Status( ErrorCodes::BadValue, + str::stream() + << "need have an eaulity filter on: " + << extraBefore(i) ); + + b.append( e ); + } + *out = b.obj(); + return Status::OK(); + } + + void _addFTSStuff( BSONObjBuilder* b ) { + b->append( "_fts", INDEX_NAME ); + b->append( "_ftsx", 1 ); + } + + BSONObj FTSSpec::fixSpec( const BSONObj& spec ) { + map<string,int> m; + + BSONObj keyPattern; + { + BSONObjBuilder b; + bool addedFtsStuff = false; + + BSONObjIterator i( spec["key"].Obj() ); + while ( i.more() ) { + BSONElement e = i.next(); + if ( str::equals( e.fieldName(), "_fts" ) || + str::equals( e.fieldName(), "_ftsx" ) ) { + continue; + } + else if ( e.type() == String && + ( str::equals( "fts", e.valuestr() ) || + str::equals( "text", e.valuestr() ) ) ) { + + if ( !addedFtsStuff ) { + _addFTSStuff( &b ); + addedFtsStuff = true; + } + + m[e.fieldName()] = 1; + } + else { + b.append( e ); + } + } + + if ( !addedFtsStuff ) + _addFTSStuff( &b ); + + keyPattern = b.obj(); + } + + if ( spec["weights"].isABSONObj() ) { + BSONObjIterator i( spec["weights"].Obj() ); + while ( i.more() ) { + BSONElement e = i.next(); + m[e.fieldName()] = e.numberInt(); + } + } + else if ( spec["weights"].str() == WILDCARD ) { + m[WILDCARD] = 
1; + } + + BSONObj weights; + { + BSONObjBuilder b; + for ( map<string,int>::iterator i = m.begin(); i != m.end(); ++i ) + b.append( i->first, i->second ); + weights = b.obj(); + } + + string default_language(spec.getStringField("default_language")); + if ( default_language.empty() ) + default_language = "english"; + + string language_override(spec.getStringField("language_override")); + if ( language_override.empty() ) + language_override = "language"; + + int version = 0; + + BSONObjBuilder b; + BSONObjIterator i( spec ); + while ( i.more() ) { + BSONElement e = i.next(); + if ( str::equals( e.fieldName(), "key" ) ) { + b.append( "key", keyPattern ); + } + else if ( str::equals( e.fieldName(), "weights" ) ) { + b.append( "weights", weights ); + weights = BSONObj(); + } + else if ( str::equals( e.fieldName(), "default_language" ) ) { + b.append( "default_language", default_language); + default_language = ""; + } + else if ( str::equals( e.fieldName(), "language_override" ) ) { + b.append( "language_override", language_override); + language_override = ""; + } + else if ( str::equals( e.fieldName(), "v" ) ) { + version = e.numberInt(); + } + else { + b.append( e ); + } + } + + if ( !weights.isEmpty() ) + b.append( "weights", weights ); + if ( !default_language.empty() ) + b.append( "default_language", default_language); + if ( !language_override.empty() ) + b.append( "language_override", language_override); + + b.append( "v", version ); + + return b.obj(); + + } + + } +} diff --git a/src/mongo/db/fts/fts_spec.h b/src/mongo/db/fts/fts_spec.h new file mode 100644 index 00000000000..e3ebf24f76b --- /dev/null +++ b/src/mongo/db/fts/fts_spec.h @@ -0,0 +1,108 @@ +// fts_spec.h + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. 
+* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#pragma once + +#include <map> +#include <vector> +#include <string> + +#include "mongo/db/fts/fts_util.h" +#include "mongo/db/fts/stemmer.h" +#include "mongo/db/fts/stop_words.h" +#include "mongo/db/fts/tokenizer.h" +#include "mongo/platform/unordered_map.h" + +namespace mongo { + + namespace fts { + + extern const double MAX_WEIGHT; + + typedef std::map<string,double> Weights; // TODO cool map + + typedef unordered_map<string,double> TermFrequencyMap; + + + class FTSSpec { + + struct Tools { + Tools( string language ) + : language( language ){} + const std::string& language; + const Stemmer* stemmer; + const StopWords* stopwords; + }; + + public: + FTSSpec( const BSONObj& indexInfo ); + + bool wildcard() const { return _wildcard; } + const string& defaultLanguage() const { return _defaultLanguage; } + const string& languageOverrideField() const { return _languageOverrideField; } + + size_t numExtraBefore() const { return _extraBefore.size(); } + const std::string& extraBefore( unsigned i ) const { return _extraBefore[i]; } + + size_t numExtraAfter() const { return _extraAfter.size(); } + const std::string& extraAfter( unsigned i ) const { return _extraAfter[i]; } + + string getLanguageToUse( const BSONObj& userDoc ) const; + + void scoreDocument( const BSONObj& obj, TermFrequencyMap* scores ) const; + + /** + * given a query, pulls out the pieces (in order) that go in the index first + */ + Status getIndexPrefix( const BSONObj& filter, BSONObj* out ) const; + + const Weights& weights() const { return _weights; } + + /** + * @param out - untouched if field 
isn't present + * @return if field is here + */ + bool weight( const StringData& field, double* out ) const; + + + static BSONObj fixSpec( const BSONObj& spec ); + private: + void _scoreRecurse(const Tools& tools, + const BSONObj& obj, + TermFrequencyMap* term_freqs ) const; + + void _scoreString( const Tools& tools, + const StringData& raw, + TermFrequencyMap* term_freqs, + double weight ) const; + + string _defaultLanguage; + string _languageOverrideField; + bool _wildcard; + + // _weights stores a mapping between the fields and the value as a double + // basically, how much should an occurence of (query term) in (field) be worth + Weights _weights; + + // other fields to index + std::vector<string> _extraBefore; + std::vector<string> _extraAfter; + }; + + } +} diff --git a/src/mongo/db/fts/fts_spec_test.cpp b/src/mongo/db/fts/fts_spec_test.cpp new file mode 100644 index 00000000000..541bd4a56d8 --- /dev/null +++ b/src/mongo/db/fts/fts_spec_test.cpp @@ -0,0 +1,139 @@ +// fts_spec_test.cpp + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#include "mongo/pch.h" + +#include "mongo/db/fts/fts_spec.h" +#include "mongo/unittest/unittest.h" + +namespace mongo { + namespace fts { + + TEST( FTSSpec, Fix1 ) { + BSONObj user = BSON( "key" << BSON( "title" << "fts" << + "text" << "fts" ) << + "weights" << BSON( "title" << 10 ) ); + + BSONObj fixed = FTSSpec::fixSpec( user ); + BSONObj fixed2 = FTSSpec::fixSpec( fixed ); + ASSERT_EQUALS( fixed, fixed2 ); + } + + TEST( FTSSpec, ScoreSingleField1 ) { + BSONObj user = BSON( "key" << BSON( "title" << "fts" << + "text" << "fts" ) << + "weights" << BSON( "title" << 10 ) ); + + FTSSpec spec( FTSSpec::fixSpec( user ) ); + + TermFrequencyMap m; + spec.scoreDocument( BSON( "title" << "cat sat run" ), &m ); + ASSERT_EQUALS( 3U, m.size() ); + ASSERT_EQUALS( m["cat"], m["sat"] ); + ASSERT_EQUALS( m["cat"], m["run"] ); + ASSERT( m["cat"] > 0 ); + } + + TEST( FTSSpec, ScoreMultipleField1 ) { + BSONObj user = BSON( "key" << BSON( "title" << "fts" << + "text" << "fts" ) << + "weights" << BSON( "title" << 10 ) ); + + FTSSpec spec( FTSSpec::fixSpec( user ) ); + + TermFrequencyMap m; + spec.scoreDocument( BSON( "title" << "cat sat run" + << "text" << "cat book" ), + &m ); + + ASSERT_EQUALS( 4U, m.size() ); + ASSERT_EQUALS( m["sat"], m["run"] ); + ASSERT( m["sat"] > 0 ); + + ASSERT( m["cat"] > m["sat"] ); + ASSERT( m["cat"] > m["book"] ); + ASSERT( m["book"] > 0 ); + ASSERT( m["book"] < m["sat"] ); + } + + + TEST( FTSSpec, ScoreRepeatWord ) { + BSONObj user = BSON( "key" << BSON( "title" << "fts" << + "text" << "fts" ) << + "weights" << BSON( "title" << 10 ) ); + + FTSSpec spec( FTSSpec::fixSpec( user ) ); + + TermFrequencyMap m; + spec.scoreDocument( BSON( "title" << "cat sat sat run run run" ), &m ); + ASSERT_EQUALS( 3U, m.size() ); + ASSERT( m["cat"] > 0 ); + ASSERT( m["sat"] > m["cat"] ); + ASSERT( m["run"] > m["sat"] ); + + } + + TEST( FTSSpec, Extra1 ) { + BSONObj user = BSON( "key" << BSON( "data" << "fts" ) ); + FTSSpec spec( FTSSpec::fixSpec( user ) ); + 
ASSERT_EQUALS( 0U, spec.numExtraBefore() ); + ASSERT_EQUALS( 0U, spec.numExtraAfter() ); + } + + TEST( FTSSpec, Extra2 ) { + BSONObj user = BSON( "key" << BSON( "data" << "fts" << "x" << 1 ) ); + FTSSpec spec( FTSSpec::fixSpec( user ) ); + ASSERT_EQUALS( 0U, spec.numExtraBefore() ); + ASSERT_EQUALS( 1U, spec.numExtraAfter() ); + ASSERT_EQUALS( StringData("x"), spec.extraAfter(0) ); + } + + TEST( FTSSpec, Extra3 ) { + BSONObj user = BSON( "key" << BSON( "x" << 1 << "data" << "fts" ) ); + BSONObj fixed = FTSSpec::fixSpec( user ); + + ASSERT_EQUALS( BSON( "x" << 1 << + "_fts" << "text" << + "_ftsx" << 1 ), + fixed["key"].Obj() ); + ASSERT_EQUALS( BSON( "data" << 1 ), + fixed["weights"].Obj() ); + + BSONObj fixed2 = FTSSpec::fixSpec( fixed ); + ASSERT_EQUALS( fixed, fixed2 ); + + FTSSpec spec( fixed ); + ASSERT_EQUALS( 1U, spec.numExtraBefore() ); + ASSERT_EQUALS( StringData("x"), spec.extraBefore(0) ); + ASSERT_EQUALS( 0U, spec.numExtraAfter() ); + + BSONObj prefix; + + ASSERT( spec.getIndexPrefix( BSON( "x" << 2 ), &prefix ).isOK() ); + ASSERT_EQUALS( BSON( "x" << 2 ), prefix ); + + ASSERT( spec.getIndexPrefix( BSON( "x" << 3 << "y" << 4 ), &prefix ).isOK() ); + ASSERT_EQUALS( BSON( "x" << 3 ), prefix ); + + ASSERT( !spec.getIndexPrefix( BSON( "x" << BSON( "$gt" << 5 ) ), &prefix ).isOK() ); + ASSERT( !spec.getIndexPrefix( BSON( "y" << 4 ), &prefix ).isOK() ); + ASSERT( !spec.getIndexPrefix( BSONObj(), &prefix ).isOK() ); + } + + } +} diff --git a/src/mongo/db/fts/fts_util.cpp b/src/mongo/db/fts/fts_util.cpp new file mode 100644 index 00000000000..ace11b67409 --- /dev/null +++ b/src/mongo/db/fts/fts_util.cpp @@ -0,0 +1,30 @@ +// fts_util.cpp + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. 
+* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "mongo/db/fts/fts_util.h" + +namespace mongo { + + namespace fts { + + const std::string INDEX_NAME = "text"; + const std::string WILDCARD = "$**"; + + } +} + diff --git a/src/mongo/db/fts/fts_util.h b/src/mongo/db/fts/fts_util.h new file mode 100644 index 00000000000..3df5a0c5ee2 --- /dev/null +++ b/src/mongo/db/fts/fts_util.h @@ -0,0 +1,112 @@ +// fts_util.h + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#pragma once + +#include <string> + +#include "mongo/db/hasher.h" +#include "mongo/db/jsobj.h" +#include "mongo/db/record.h" +#include "mongo/util/unordered_fast_key_table.h" + +namespace mongo { + + namespace fts { + + extern const std::string WILDCARD; + extern const std::string INDEX_NAME; + + /** + * destructive! 
+ */ + inline void makeLower( std::string* s ) { + std::string::size_type sz = s->size(); + for ( std::string::size_type i = 0; i < sz; i++ ) + (*s)[i] = (char)tolower( (int)(*s)[i] ); + } + + /* + * ScoredLocation stores the total score for a document (record *) wrt a search + * + */ + struct ScoredLocation { + ScoredLocation( Record* r, double sc ) + : rec(r), score(sc) { + } + + Record* rec; + double score; + + bool operator<( const ScoredLocation& other ) const { + if ( other.score < score ) + return true; + if ( other.score > score ) + return false; + return rec < other.rec; + } + }; + + // scored location comparison is done based on score + class ScoredLocationComp { + public: + bool operator() (const ScoredLocation& lhs, const ScoredLocation& rhs) const { + return (lhs.score > rhs.score); + } + }; + + struct _be_hash { + size_t operator()( const BSONElement& e ) const { + return static_cast<size_t>( BSONElementHasher::hash64( e, 17 ) ); + } + }; + + struct _be_equals { + bool operator()( const BSONElement& a, const BSONElement& b ) const { + return a == b; + } + }; + + struct _be_convert { + BSONElement operator()( const BSONObj& o ) const { + const BSONElement& x = o.firstElement(); + BSONElement y( x.rawdata() ); + return y; + } + }; + + struct _be_convert_other { + BSONObj operator()( const BSONElement& e ) const { + return e.wrap(); + } + }; + + template< typename V > + class BSONElementMap : public UnorderedFastKeyTable<BSONElement, + BSONObj, + V, + _be_hash, + _be_equals, + _be_convert, + _be_convert_other > { + }; + + + } +} + diff --git a/src/mongo/db/fts/fts_util_test.cpp b/src/mongo/db/fts/fts_util_test.cpp new file mode 100644 index 00000000000..7d959dca08a --- /dev/null +++ b/src/mongo/db/fts/fts_util_test.cpp @@ -0,0 +1,36 @@ +// fts_util_test.cpp + +/** +* Copyright (C) 2012 10gen Inc. 
+* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + + +#include "mongo/unittest/unittest.h" + +#include "mongo/db/fts/fts_util.h" + +namespace mongo { + namespace fts { + + TEST( BSONElementMap, Simple1 ) { + BSONElementMap<double> m; + + BSONObj x = BSON( "x" << 5 ); + m[x.firstElement()] = 5; + ASSERT_EQUALS( 5, m[x.firstElement()] ); + } + + } +} diff --git a/src/mongo/db/fts/generate_stop_words.py b/src/mongo/db/fts/generate_stop_words.py new file mode 100644 index 00000000000..5010fe702a5 --- /dev/null +++ b/src/mongo/db/fts/generate_stop_words.py @@ -0,0 +1,56 @@ +import sys + +def generate( header, source, language_files ): + print( "header: %s" % header ) + print( "source: %s" % source ) + print( "language_files:" ) + for x in language_files: + print( "\t%s" % x ) + + out = open( header, "wb" ) + out.write( """ +#pragma once +#include <map> +#include <set> +#include <string> +namespace mongo { +namespace fts { + + void loadStopWordMap( std::map< std::string, std::set< std::string > >* m ); +} +} +""" ) + out.close() + + + + out = open( source, "wb" ) + out.write( '#include "%s"' % header.rpartition( "/" )[2].rpartition( "\\" )[2] ) + out.write( """ +namespace mongo { +namespace fts { + + void loadStopWordMap( std::map< std::string, std::set< std::string > >* m ) { + +""" ) + + for l_file in language_files: + l = l_file.rpartition( "_" )[2].partition( "." 
)[0] + + out.write( ' // %s\n' % l_file ) + out.write( ' {\n' ) + out.write( ' std::set< std::string >& l = (*m)["%s"];\n' % l ) + for word in open( l_file, "rb" ): + out.write( ' l.insert( "%s" );\n' % word.strip() ) + out.write( ' }\n' ) + out.write( """ + } +} // namespace fts +} // namespace mongo +""" ) + + +if __name__ == "__main__": + generate( sys.argv[ len(sys.argv) - 2], + sys.argv[ len(sys.argv) - 1], + sys.argv[1:-2] ) diff --git a/src/mongo/db/fts/stemmer.cpp b/src/mongo/db/fts/stemmer.cpp new file mode 100644 index 00000000000..c04d05c87ca --- /dev/null +++ b/src/mongo/db/fts/stemmer.cpp @@ -0,0 +1,58 @@ +// stemmer.cpp + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#include <string> + +#include "mongo/db/fts/stemmer.h" + +namespace mongo { + + namespace fts { + + Stemmer::Stemmer( const string& language ) { + _stemmer = NULL; + if ( language != "none" ) + _stemmer = sb_stemmer_new(language.c_str(), "UTF_8"); + } + + Stemmer::~Stemmer() { + if ( _stemmer ) { + sb_stemmer_delete(_stemmer); + _stemmer = NULL; + } + } + + string Stemmer::stem( const StringData& word ) const { + if ( !_stemmer ) + return word.toString(); + + const sb_symbol* sb_sym = sb_stemmer_stem( _stemmer, + (const sb_symbol*)word.rawData(), + word.size() ); + + if ( sb_sym == NULL ) { + // out of memory + abort(); + } + + return string( (const char*)(sb_sym), sb_stemmer_length( _stemmer ) ); + } + + } + +} diff --git a/src/mongo/db/fts/stemmer.h b/src/mongo/db/fts/stemmer.h new file mode 100644 index 00000000000..d212cc01fce --- /dev/null +++ b/src/mongo/db/fts/stemmer.h @@ -0,0 +1,48 @@ +// stemmer.h + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + + +#pragma once + +#include <string> + +#include "libstemmer.h" + +#include "mongo/base/string_data.h" + +namespace mongo { + + namespace fts { + + /** + * maintains case + * but works + * running/Running -> run/Run + */ + class Stemmer { + public: + Stemmer( const std::string& language ); + ~Stemmer(); + + std::string stem( const StringData& word ) const; + private: + struct sb_stemmer* _stemmer; + }; + } +} + diff --git a/src/mongo/db/fts/stemmer_test.cpp b/src/mongo/db/fts/stemmer_test.cpp new file mode 100644 index 00000000000..808b8141a64 --- /dev/null +++ b/src/mongo/db/fts/stemmer_test.cpp @@ -0,0 +1,42 @@ +// stemmer_test.cpp + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + + +#include "mongo/unittest/unittest.h" + +#include "mongo/db/fts/stemmer.h" + +namespace mongo { + namespace fts { + + TEST( English, Stemmer1 ) { + Stemmer s( "english" ); + ASSERT_EQUALS( "run", s.stem( "running" ) ); + ASSERT_EQUALS( "Run", s.stem( "Running" ) ); + } + + + TEST( English, Caps ) { + Stemmer s( "porter" ); + ASSERT_EQUALS( "unit", s.stem( "united" ) ); + ASSERT_EQUALS( "Unite", s.stem( "United" ) ); + } + + + } +} diff --git a/src/mongo/db/fts/stop_words.cpp b/src/mongo/db/fts/stop_words.cpp new file mode 100644 index 00000000000..0d664caf1bf --- /dev/null +++ b/src/mongo/db/fts/stop_words.cpp @@ -0,0 +1,73 @@ +// stop_words.cpp + +/** +* Copyright (C) 2012 10gen Inc. 
+* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include <map> +#include <set> +#include <string> + +#include "mongo/db/fts/stop_words.h" + +#include "mongo/base/init.h" +#include "mongo/platform/unordered_map.h" + + + +namespace mongo { + + namespace fts { + + void loadStopWordMap( std::map< std::string, std::set< std::string > >* m ); + + namespace { + unordered_map<string,StopWords*> STOP_WORDS; + StopWords* empty = NULL; + } + + + StopWords::StopWords(){ + } + + StopWords::StopWords( const std::set<std::string>& words ) { + for ( std::set<std::string>::const_iterator i = words.begin(); i != words.end(); ++i ) + _words.insert( *i ); + } + + const StopWords* StopWords::getStopWords( const std::string& langauge ) { + unordered_map<string,StopWords*>::const_iterator i = STOP_WORDS.find( langauge ); + if ( i == STOP_WORDS.end() ) + return empty; + return i->second; + } + + + MONGO_INITIALIZER(StopWords)(InitializerContext* context) { + empty = new StopWords(); + + std::map< std::string, std::set< std::string > > raw; + loadStopWordMap( &raw ); + for ( std::map< std::string, std::set< std::string > >::const_iterator i = raw.begin(); + i != raw.end(); + ++i ) { + STOP_WORDS[i->first] = new StopWords( i->second ); + } + return Status::OK(); + } + + } + +} diff --git a/src/mongo/db/fts/stop_words.h b/src/mongo/db/fts/stop_words.h new file mode 100644 index 00000000000..5816afa560c --- /dev/null +++ 
b/src/mongo/db/fts/stop_words.h @@ -0,0 +1,50 @@ +// stop_words.h + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + + +#pragma once + +#include <set> +#include <string> + +#include "mongo/platform/unordered_set.h" + +namespace mongo { + + namespace fts { + + class StopWords { + public: + StopWords(); + StopWords( const std::set<std::string>& words ); + + bool isStopWord( const std::string& word ) const { + return _words.count( word ) > 0; + } + + size_t numStopWords() const { return _words.size(); } + + static const StopWords* getStopWords( const std::string& langauge ); + private: + ~StopWords(){} + unordered_set<std::string> _words; + }; + + } +} + diff --git a/src/mongo/db/fts/stop_words_danish.txt b/src/mongo/db/fts/stop_words_danish.txt new file mode 100644 index 00000000000..1b3c2867fec --- /dev/null +++ b/src/mongo/db/fts/stop_words_danish.txt @@ -0,0 +1,100 @@ +få +intet +som +den +forrige +ni +alle +at +ned +et +næsten +fordi +og +jeres +seks +op +har +flere +hvis +hvem +andre +mens +fem +over +da +din +deres +for +ny +hvad +fra +kan +kommer +hvornår +jeg +denne +end +nogen +meget +man mand +på +store +ind +lav +ud +ej +hvordan +ingen +to +der +se +kom +dig +tre +eneste +dette +hans +hver +før +hendes +andet +syv +hvilken +hvor +det +de +hvorfor +god +otte +ikke +han +mig +fleste +ti +i +ene +med +til +stor +her +lille +mange +du +ses +begge +dog +eller +en +nyt +var 
+hun +enhver +fire +mere +nær +næste +men +noget +lidt +af diff --git a/src/mongo/db/fts/stop_words_dutch.txt b/src/mongo/db/fts/stop_words_dutch.txt new file mode 100644 index 00000000000..251822f9570 --- /dev/null +++ b/src/mongo/db/fts/stop_words_dutch.txt @@ -0,0 +1,48 @@ +hun +dit +zij +van +kan +een +zal +wat +dat +in +hij +die +zou +bij +met +al +ook +is +uit +en +hem +zei +heb +mij +was +ik +nog +of +zo +we +men +wij +ons +als +tot +wel +nu +aan +je +er +ze +af +hoe +had +te +me +het +dan diff --git a/src/mongo/db/fts/stop_words_english.txt b/src/mongo/db/fts/stop_words_english.txt new file mode 100644 index 00000000000..fbb6c3063fd --- /dev/null +++ b/src/mongo/db/fts/stop_words_english.txt @@ -0,0 +1,174 @@ +than +during +himself +your +theirs +most +same +wouldn't +at +it +here's +their +his +an +out +between +doesn't +not +those +only +yourself +mustn't +and +shouldn't +him +you'll +which +more +shan't +after +why +up +further +over +no +its +until +them +you +don't +few +why's +i've +for +ours +some +when's +they've +won't +herself +but +she +he'd +how's +were +how +we've +because +aren't +should +our +each +once +they'd +where +above +there +or +they'll +be +to +are +it's +too +itself +what +whom +has +they're +had +she'd +these +other +when +hasn't +by +we'll +having +then +against +he's +as +is +that +isn't +below +could +wasn't +who's +ourselves +so +any +he +cannot +weren't +was +my +would +we'd +yourselves +where's +couldn't +who +didn't +from +i'm +off +have +hers +i +am +themselves +of +before +i'll +here +while +what's +myself +ought +me +the +into +about +this +do +can't +a +her +that's +did +very +down +you've +we +you're +haven't +on +let's +such +they +in +with +being +doing +she's +yours +hadn't +nor +both +does +own +again +there's +he'll +i'd +under +you'd +through +we're +she'll +been +all +if diff --git a/src/mongo/db/fts/stop_words_finnish.txt b/src/mongo/db/fts/stop_words_finnish.txt new file mode 100644 index 
00000000000..0d898159660 --- /dev/null +++ b/src/mongo/db/fts/stop_words_finnish.txt @@ -0,0 +1,747 @@ +toistaiseksi +aika +tykö +haluavat +hyvinä +lähekkäin +menimme +mahdollista +silloin +olette +kaikkiaan +alla +haluaa +joku +jopa +uuden +haluat +lähellä +jolta +yhtä +sieltä +oleva +niistä +pienelle +samallasta +pieni +uudet +avulla +kahdessa +edes +aloitin +koska +pienin +olisi +esi +paljon +hyville +toisaalta +vaikeat +kumpikin +ensimmäiseksi +jonka +jotka +kennessästä +ensimmäistä +joihin +peräti +kymmenen +hänessä +vaikean +lähtien +neljän +olisin +ette +joutui +jokin +pienempi +vieri +kuka +viime +vaiheessa +olisit +neljää +aikovat +mikä +kaikkea +muiden +niiden +kumpikaan +kenellä +esillä +joo +menit +joudutte +olisivat +ne +sinua +muun +täältä +mitään +kohti +usea +ilmeisesti +koskaan +vähemmän +menossa +ajan +liian +omat +tietysti +jota +kenen +kahdella +aikaa +ensimmäiset +parhaillaan +muualla +joukosta +moi +hyvät +näissä +kuin +minut +ainoa +nro +eräiden +kanssasi +seuraavat +yhtäällä +jonkin +näissästä +ehkä +aloitamme +jälkeen +kanssa +aloititte +voisi +viimeinen +jolle +toki +myöskään +sekä +josta +eli +moni +olli +ellet +alkuisin +tulleet +kaikkialle +ainakin +muita +aikana +toinen +muuta +keiltä +nämä +meidän +tuhannen +edemmäs +mihin +vierekkäin +sinulle +yhteyteen +halusi +etenkin +täytyvät +niitä +mitä +alta +välillä +seitsemän +kumpainen +hyviltä +taa +me +mikään +hän +uudelleen +omien +kerta +tuhat +yksi +tulisit +tulimme +joissa +monesti +paitsi +heti +aloitat +joko +eniten +vuoden +yhdessä +suuri +haluamme +varten +olisitte +kenet +entinen +samaa +vaikeille +samallassa +tänään +edelleen +läpi +helposti +täysin +monta +jolloin +useimmiten +ensin +sinusta +suuntaan +omilta +uuteen +voidaan +näissälle +sisällä +siitä +kuitenkaan +mones +mukaan +asiasta +toisaalla +tällöin +että +yksittäin +erityisesti +vaikea +ellei +moniaalla +aluksi +antoi +tavoitteena +verran +runsaasti +ovat +menevät +toista +meni +vastaan +myös +heitä +menin +kokonaan 
+ja +hänen +suurin +meille +tosin +hetkellä +parhaiten +ensimmäisenä +kaksi +tulemme +alle +vuosina +ensimmäisen +aloitan +emme +te +heille +uusia +itse +halunnut +keille +samallalta +halusin +aikoo +minun +menet +näissähin +meneet +kovin +jouduin +toisen +tämä +halusit +useita +muassa +heiltä +tulette +kahta +jouduit +haluton +omia +minne +aloitettevaksi +siten +sinussa +aloitettu +voi +suuren +asioihin +vaikeilta +vähiten +asiat +vuosien +ellen +joilla +jälleen +suuret +omalta +tässä +asian +nuo +menette +kuten +kolme +voimme +eri +menivät +tahansa +vai +kaiken +joiden +puolesta +hyvissä +kumpainenkaan +kaikille +yhtäälle +toiseksi +hänelle +toisessa +toisesta +toiselle +keillä +lisäksi +tuolloin +muualle +edessä +älköön +keneen +mennyt +aikoina +myöhemmin +annettu +menemme +aoua +ensimmäisiä +mikäli +pieneen +keinä +annetteva +vähän +tuolla +jää +kannattaa +jonne +mukana +kolmas +häneen +jouduitte +joten +tänne +aloittivat +olitte +juuri +keitten +tullut +paremmin +sillä +meillä +avutta +keneltä +kahden +jos +omaan +aion +jo +samaan +sama +joskus +sinä +kaikki +yhtään +omille +sataa +pian +ihan +eivät +ylös +keissä +voin +siinä +lisää +halutessa +tämän +omaa +asioita +kanssanne +tapauksessa +kenestä +vaikeilla +minä +suurten +kenelle +tulisivat +melkein +kun +kaikkia +uutta +lähinnä +ole +hyvä +aiotte +asia +hyvää +uusi +tulisimme +aikajen +viimeisen +mukaansa +eteen +hänestä +omiksi +heistä +edestä +kukin +hyviksi +moniaalle +missä +takaa +omista +joilta +saakka +onkin +yhdeksän +sitä +ollut +hänet +antamatta +ensimmäisinä +avun +yhteensä +kolmen +hyviä +olemme +muulloin +kumpi +yli +sinun +erittäin +aiemmin +ympäri +voivat +keiksi +tulitte +tästä +siis +perusteella +sinut +täten +menen +asti +muu +omalle +haluamatta +keitä +apu +menee +yhä +voit +varsin +neljä +yhden +uusinta +tuolta +yksin +keskimäärin +entistä +sinne +samat +jotenkin +eilen +luo +takana +edellä +useasti +muutama +he +sitten +sinulta +pieneksi +enää +itsensä +tulla +joille +jotenkuten 
+kahdesta +kuutta +omissa +parempi +pakosti +olla +joutua +aloitattivat +silti +kahdelta +joudun +viiden +kehen +paikoittain +kukaan +näiden +aloitti +olevan +myöskin +jouduimme +mistä +tähän +takaisin +eräät +tai +tulee +siihen +olimme +molemmat +kesken +keneksi +jotta +toisella +tätä +jompikumpi +ennen +täytyy +kolmesti +voitte +pieneltä +en +niin +keistä +jotain +ensimmäisiksi +esimerkiksi +pienellä +takia +on +kautta +muka +oli +kai +hänellä +onko +itseään +joutumaan +aina +aivan +aloitimme +taemmas +todella +ensi +tulisitte +taas +mutta +vuotta +nyt +kenettä +tule +enemmän +vain +usein +haluatte +heissä +kannalta +avuksi +tuonne +toiseen +kaikkialta +hyviin +olevat +olisimme +uudeksi +tulen +elleivät +huomenna +olleet +moniaalta +kahdeksan +nopeasti +joudumme +heihin +antaa +johon +ei +hei +omassa +saman +lähelle +et +keiden +kanssani +yhteen +huolimatta +häneltä +alussa +olivat +aloitatte +kuuden +annettavaksi +esiin +näissältä +joukossa +halusimme +tuntuu +kenenä +vuosi +hyvien +tällä +tulevat +joista +läheltä +tulin +jokainen +joutuivat +vaikka +asiaa +halusivat +päälle +joukkoon +lähemmäs +joutuu +joutuvat +yhteydessä +vaikeista +nopeammin +vuoksi +toiselta +koko +hieman +kauemmas +kertaa +muutaman +kyse +kahteen +sen +edeltä +kuusi +varsinkin +toisensa +jossa +kaikilta +mahdollisimman +tavalla +näin +liki +joka +olin +entisen +aist +oma +yleensä +nopeiten +tulisi +viisi +vähintään +hyvistä +aikaisin +kahdelle +miksi +olit +joita +halua +asioiden +miten +aiomme +heidän +kaikkin +ylemmäs +näitä +aloittaa +kahdeksannen +ketä +vielä +aikaisemmin +puolestaan +aloitit +jolla +myötä +omaksi +lähes +kiitos +hyvin +samalla +sijaan +milloin +sisäkkäin +se +olen +noin +alemmas +kaikkialla +muuten +ellemme +pienestä +omiin +olet +aikaan +tuskin +yhtäältä +siellä +keittä +kuinka +vastakkain +halusitte +ilman +sata +siksi +tuo +tulet +suuria +muut +satojen +eräs +kaikkien +tulivat +varmasti +ohi +muualta +kyllä +haluan +kenessä +ensimmäinen +mikin +tulisin +oman 
+joiksi +edelle +entisiä +entisten +kauan +alas +ainoat +ellette +suoraan +eikä +melko +alusta +ettei +alkuun +aloitettava +monet +ketkä +uusien +tulit +kuitenkin +aloittamatta +vaan +toisemme +keihin +viimeksi +toisaalle +muuanne +ainakaan +tänä +oikein +mennessä +samoin +kerran +kanssaan +sadam +täällä +kanssamme +vasta +vaikeissa +älä diff --git a/src/mongo/db/fts/stop_words_french.txt b/src/mongo/db/fts/stop_words_french.txt new file mode 100644 index 00000000000..42502590cbc --- /dev/null +++ b/src/mongo/db/fts/stop_words_french.txt @@ -0,0 +1,126 @@ +vu +avoir +sa +étions +faites +ni +quand +ou +tu +tout +et +hors +haut +avec +votre +ton +tes +dedans +comment +il +aussi +vont +fait +soyez +peu +ils +sans +avant +valeur +été +pas +font +mine +encore +sien +le +mais +ci +quels +devrait +mot +droite +ma +pièce +tels +force +dans +qui +trop +ça +juste +au +dehors +eu +cela +voient +leur +notre +plupart +elles +dos +tous +elle +bon +étaient +nommés +je +ces +nouveaux +donc +est +pourquoi +car +son +alors +peut +quel +quelle +pour +essai +ici +sujet +mon +état +depuis +même +que +où +parole +être +voie +autre +sont +sur +des +début +ta +la +deux +ce +doit +quelles +du +ses +là +tandis +personnes +en +par +chaque +parce +fois +si +les +nous +mes +vous +aucuns +seulement +moins +maintenant +ceux +tellement +très +comme +sous diff --git a/src/mongo/db/fts/stop_words_german.txt b/src/mongo/db/fts/stop_words_german.txt new file mode 100644 index 00000000000..4fcf107963b --- /dev/null +++ b/src/mongo/db/fts/stop_words_german.txt @@ -0,0 +1,992 @@ +kleiner +muss +zwanzig +berichtete +dort +nun +könnten +fortsetzten +seinen +important +gefiel +sofort +liegt +wolle +eben +in +dadurch +sind +geben +etc +wenngleich +erhalten +ihr +solchen +bezüglich +bald +befragte +werde +geehrten +singt +fragte +jedem +eines +behalten +wegen +freies +einseitig +dasselbe +befragen +unmögliche +weg +nachhinein +senkte +ihrem +drunter +später +wäre +daraus +ausdrückte +sagt +einmal +inzwischen 
+startet +ganzem +lesen +vermag +forderten +fast +wieviel +alsbald +damals +gewollt +mithin +behielt +erhielten +irgendwen +geteilt +bearbeitete +eröffnen +bist +daran +will +diese +danke +einst +oft +legte +gleichwohl +demnach +begonnen +reagieren +derartig +links +dieser +soweit +worin +möglichen +folgender +desto +zahlreich +wogegen +sich +jenen +schnell +meta +wen +rechts +abgerufener +muesste +vorbei +nacher +nebenan +schwierig +senkt +für +soviel +deswegen +blieb +veröffentlichtes +dagegen +veröffentlichen +einem +stets +hast +demselben +wollen +nötigenfalls +sondern +zugleich +umso +ins +wolltet +deine +danach +konkreter +diesen +abgerufene +mancherorts +befragten +teilten +tat +übermorgen +unse +manche +weshalb +senken +solltest +woher +siebte +sofern +so +guten +vollständig +meiste +doppelt +veröffentlicht +derselbe +zuviel +dem +hinterher +veröffentlicher +ziehen +mich +anderem +überallhin +können +lagen +weiteres +jedenfalls +muessen +eigenen +beitragen +gmbh +keines +pfui +befragter +falls +ergänzten +ergänze +vielleicht +hundert +koennen +entweder +muesst +durchaus +sollt +jemand +solch +wirklich +fortsetzt +sage +derjenige +angesetzt +gleichzeitig +derselben +diesseits +anderm +gern +keinen +gehen +könnt +daher +allerdings +wohingegen +steigen +unserm +unserem +nämlich +heraus +wodurch +obgleich +bloss +seit +aufhören +verrate +je +wohlweislich +einführten +tatsächlichen +ganzes +kommen +waren +alles +komme +einige +dinge +erst +tragen +machst +direkten +einiger +anerkannter +wären +ganz +woraufhin +darauf +jedoch +daneben +einführen +längstens +koennten +beide +such +dorthin +möglich +ausser +unterhalb +vor +darfst +sicherlich +geworden +stattdessen +beinahe +gängig +gibt +sieht +schätzte +euren +obwohl +vom +könnte +vergangenes +wolltest +welchen +wiewohl +schreiber +deshalb +mochte +unses +aber +tät +manches +eröffnet +anders +gängiges +seines +seitdem +titel +musste +jenes +unbedingt +trotzdem +sowohl +kaum +davon +möchten +gemacht 
+veröffentlichten +verraten +sangen +glücklicherweise +somit +gefallen +eurer +machte +womit +übrig +ihren +sollten +weder +heute +regelmäßig +meiner +was +legten +vorher +niemals +dein +derart +schließlich +außen +noch +zwar +vieles +wenig +fünf +anderes +mussten +schätzen +sagten +angesetze +eigentlich +wann +deines +letztes +fordert +einig +meinem +hiesige +möchte +bin +möglicher +und +z.B. +unten +angesetzten +sehr +dessen +ihrer +beim +weiterhin +weiteren +tun +solcher +hattet +würden +viel +gängige +an +vermutlich +müssten +kommt +lediglich +zehn +mehr +wohin +nie +gedurft +einerseits +besonders +teilen +hätte +darüberhinaus +warst +es +allerlei +ander +unserer +versorgtes +benutzt +hätt +ausdrücken +hab +usw +keine +veröffentlichte +stieg +neues +ab +leicht +allen +könn +fortsetzen +keineswegs +herein +siehe +waere +derem +möchtest +überdies +allgemein +einseitiger +überall +möglicherweise +solche +obschon +beitrugen +gratulieren +fordern +denen +versorgt +mehrere +haette +ueber +meisten +wenige +gesagt +ausgenommen +drauf +einfach +verrieten +hätten +leer +euer +unmöglich +eingesetzt +gefälligst +zur +dannen +gänzlich +anerkannt +zumal +derjenigen +alle +anfing +nichts +mir +wird +jedes +hallo +dafür +spielen +ausserdem +einstmals +ca. 
+einseitigen +außerhalb +manchen +ggf +andern +unsen +wär +selbst +eröffne +ob +findest +höchstens +dich +suchen +mögen +direkter +sobald +folglich +einigen +dunklen +nicht +ganzen +bearbeiten +bereits +müssen +jährigen +steigt +meist +sicher +seid +geblieben +info +ein +währenddessen +derzeit +finden +ersten +bei +dabei +nützt +bestehen +oberhalb +wer +seht +nachdem +wo +wollte +gratuliert +wenigstens +schreibens +author +bearbeite +koennte +dritte +lichten +sehen +dreißig +vorne +eurem +geehrte +uns +hinein +ihnen +daß +konkrete +irgendwie +sollen +natürlich +also +hin +doch +welches +geb +gar +deinem +ferner +eher +eine +hiermit +neben +gefällt +tatsächlicher +beides +eröffnetes +dennoch +etliche +wurde +brachten +zwölf +setzen +zudem +berichten +müsste +reagiert +sollst +hattest +jederlei +tut +denselben +euch +auch +konkretes +gemocht +stiegen +kannst +mache +lag +kein +unsere +ungefähr +txt +klaren +außer +weiß +braucht +tust +dürfte +nutzen +dazu +ergänzte +warum +konnte +sog +einführte +morgen +ihm +genug +durfte +hatte +eröffnete +igitt +neuen +musst +der +wenn +fortsetzte +wessen +insofern +rund +vorgestern +irgendwas +erhielt +gewesen +anderen +geht +angesetzter +frei +teile +kleines +keiner +nächste +gbr +die +etwa +im +wir +zusammen +bräuchte +weiterem +bleiben +starteten +davor +liest +durch +niemand +zum +jeden +einer +seiten +freier +dank +oben +hat +eigenes +soll +beträchtlich +um +sodaß +gib +erste +völlig +oder +geehrter +setzten +fall +aller +aufzusuchen +seither +versorgten +böden +sonst +nein +machen +zog +danken +einiges +anerkanntes +müßt +setzt +eins +bedürfen +gegeben +konkret +bietet +voran +unter +gekonnt +geteilte +jähriges +diejenige +sagen +bsp. 
+folgende +du +allmählich +dieses +sogar +ende +folgendes +abgerufen +neue +vorüber +reagiere +nirgendwo +welche +abgerufenes +vergangener +sein +sowie +welchem +übel +sie +plötzlich +müßte +mancher +dahin +auf +gängiger +seiner +bearbeiteten +jährige +machten +solc hen +autor +beiderlei +aufgehört +mal +haben +jener +sei +per +sagtest +legen +unterbrach +bekannt +mindestens +acht +jenseits +bedarf +sooft +habe +da +unsem +zuerst +versorge +hinten +eure +letztlich +beiden +sechs +mögliche +werdet +eures +wollt +bringen +immer +unterbrechen +besser +dies +erneut +dir +war +einigem +drei +findet +letztens +gängigen +derer +klares +mußt +schätzten +margin +besteht +seine +langsam +einbaün +sang +drin +nach +weiterer +zwischen +ganze +vielmals +gesehen +getragen +klein +indem +frau +teilte +hinter +genommen +bekennen +darum +seinem +ehe +indessen +begann +hoch +solange +kleinen +das +berichteten +sect +ohne +leider +solchem +nur +durften +vergangen +bloß +direkte +unseren +bekannte +berichtet +andernfalls +verriet +steige +ergänzen +zurück +dieselben +jede +morgige +jene +kam +trug +nimm +zieht +letzten +wieder +zogen +konnten +willst +allzu +angefangen +nehmen +ganzer +geehrt +mann +trage +bis +schlechter +sehe +mag +unmöglichen +werden +unnötig +dieselbe +bedurfte +gemäss +bevor +sämtliche +anerkannte +gab +konkreten +unwichtig +wieso +senke +keinerlei +senkten +allem +anfangen +erster +denn +ich +total +gleich +schreibe +versorgen +gestrige +zu +bekannter +aufhörte +irgendwer +gekommen +ist +darf +tausend +manchmal +zufolge +andere +vier +hatten +neuem +fuer +gewissermaßen +unmöglicher +jenem +wem +damit +sonstwo +nirgends +sagte +kann +unser +klar +macht +folgenden +wichtig +woraus +nahm +darüber +jeder +arbeiten +einen +wurden +vielen +tatsächlich +trägt +man +soeben +wohl +ausdrückt +ebenso +muß +wachen +lassen +letze +zuletzt +manchem +immerhin +diesem +startete +weitere +nutzt +statt +etwas +aus +reagierte +beginnen +nimmt +infolge +brauchen +hier +dann +nacht 
+forderte +bislang +desselben +über +meines +weiter +ja +rief +aufgrund +wollten +den +als +vieler +neun +gehabt +anderer +gebracht +irgendwo +sollte +andererseits +dürfen +heutige +deiner +selber +meinen +entsprechend +dass +wirst +gratulierte +bisher +vermögen +brachte +setzte +ihres +weniger +laut +magst +versorgte +klare +während +darin +schätzt +gegen +einigermaßen +solches +keinem +außerdem +am +freie +gestern +anstatt +fand +wie +vergangene +welcher +deinen +schreiben +jetzt +ansetzen +halb +allein +einseitige +tatsächliches +längst +ebenfalls +übrigens +er +mit +unseres +drüber +sieben +darunter +jährig +gute +seite +innen +nimmer +neuer +singen +erhält +irgend +künftig +deren +zwei +einzig +bzw +getan +version +steht +zeitweise +irgendeine +direkt +schon +innerhalb +mein +ihn +viele +dorther +meine +des +würde +nutzung +ihre +pro +letztendlich +von +neu +ähnlich +tages +sehrwohl +weil +koennt diff --git a/src/mongo/db/fts/stop_words_hungarian.txt b/src/mongo/db/fts/stop_words_hungarian.txt new file mode 100644 index 00000000000..abdfe8b7498 --- /dev/null +++ b/src/mongo/db/fts/stop_words_hungarian.txt @@ -0,0 +1,35 @@ +nem +be +mi +igen +a +fel +van +õk +csak +hát +én +meg +lesz +szét +az +és +ide +ön +le +ki +össze +õ +hogy +mint +te +oda +vagy +ti +vissza +egy +át +de +el +rá +volt diff --git a/src/mongo/db/fts/stop_words_italian.txt b/src/mongo/db/fts/stop_words_italian.txt new file mode 100644 index 00000000000..50ebb1c32c3 --- /dev/null +++ b/src/mongo/db/fts/stop_words_italian.txt @@ -0,0 +1,279 @@ +tuoi +farò +all +stai +avevamo +avevo +nelle +fanno +stesse +dagli +avesti +stiamo +abbiate +ai +facevi +avevi +dall +sull +sarebbero +dalle +abbiamo +avessi +stettero +nella +avrete +del +stessi +stavate +farai +dalla +tutti +starò +facemmo +avesse +tu +stiano +faceste +dallo +tutto +avevate +farete +faceva +nello +e +sulle +loro +sono +se +degli +fu +sarai +il +avrei +avuto +stessimo +a +sarete +avrebbero +stavano +sulla +in +si +farà +c +facessi 
+avevano +sul +li +fecero +sia +starebbe +eravamo +lo +fui +contro +furono +suoi +steste +avuti +o +starei +faremmo +staranno +sullo +avrò +fece +i +della +agli +stesti +avute +la +non +saranno +gli +facessimo +starà +dal +avendo +coi +delle +faccio +nei +fareste +stando +negl +su +anche +avuta +da +facesse +le +facciamo +è +siate +ebbero +faranno +facevate +foste +dello +avrebbe +facessero +facevano +feci +al +stemmo +fai +cui +fummo +quale +avrà +perché +farebbe +mi +negli +faccia +di +siete +facciano +sta +stavo +sarebbe +faremo +degl +stavamo +dov +dei +uno +saremo +starete +faresti +stavi +staresti +essendo +facevamo +nostro +ad +tra +avresti +alle +starai +nell +fossero +abbiano +fosti +nostri +per +ma +stia +saresti +una +sue +dove +siamo +eravate +starebbero +facendo +lei +avreste +agl +alla +sto +stava +nostre +quelli +stette +hanno +sua +dagl +quello +voi +staremmo +vi +nostra +stareste +sarò +avessimo +allo +siano +io +come +suo +facciate +saremmo +ci +quella +era +l +avemmo +miei +sareste +ed +sui +quanto +avete +un +con +ero +vostre +questo +nel +quanti +più +ha +stiate +quelle +stetti +col +avremmo +fosse +questi +noi +tua +ho +mia +farebbero +erano +vostra +ebbi +farei +quante +aveste +abbia +stessero +staremo +avessero +eri +avranno +fossi +queste +hai +sarei +avrai +tue +chi +sei +stanno +mie +dell +ebbe +ne +quanta +dai +avremo +aveva +vostro +questa +che +fossimo +sarà +facevo +vostri +lui +ti +sugl +tuo +facesti +sugli +mio diff --git a/src/mongo/db/fts/stop_words_norwegian.txt b/src/mongo/db/fts/stop_words_norwegian.txt new file mode 100644 index 00000000000..daf5f27d9c8 --- /dev/null +++ b/src/mongo/db/fts/stop_words_norwegian.txt @@ -0,0 +1,119 @@ +som +alle +et +vÖre +gjÛre +slik +ha +nÅ +fordi +og +skulle +sist +hvis +vil +hvem +andre +slutt +mens +siden +sÅ +over +lage +da +din +vite +deres +disse +for +hva +Å +hennes +kunne +ny +ved +fra +lang +mer +kan +er +denne +verdi +riktig +bruke +meget +opp +mÅ +mye +sant +samme +mÅte +hvordan +der 
+ville +uten +eneste +hans +oss +hver +like +tilstand +arbeid +hvilken +fÅ +hvor +folk +det +ut +start +gÅ +hvorfor +god +tid +ikke +meg +han +stille +bra +fÛrst +i +ene +fÛr +ogsÅ +enn +rett +navn +lik +makt +med +av +til +inn +vÅr +pÅ +her +nÅr +mange +du +forsÛke +begge +vi +part +eller +hadde +tilbake +en +var +enhver +si +vÖrt +mest +om +gjorde +men +min +punkt +bort +under +nei +innen diff --git a/src/mongo/db/fts/stop_words_portuguese.txt b/src/mongo/db/fts/stop_words_portuguese.txt new file mode 100644 index 00000000000..5b14cf26243 --- /dev/null +++ b/src/mongo/db/fts/stop_words_portuguese.txt @@ -0,0 +1,147 @@ +para +o +está +como +quem +ou +eles +nosso +este +não +tu +muitos +somente +corrente +parte +quando +todos +por +das +estão +têm +poderá +com +qualquer +fora +nome +estar +pode +também +novo +bem +meu +dois +desde +ver +fez +estado +ser +ir +aquelas +trabalhar +promeiro +iste +usar +teu +é +muito +mais +tentar +quieto +aquele +teve +quê +comprido +ela +irá +tentaram +deverá +você +estará +são +fará +desligado +tenho +e +podia +então +direita +isto +esteve +tive +tal +eu +debaixo +dentro +foi +último +caminho +tentei +diz +valor +fazer +pessoas +fazia +tipo +deve +acerca +pelo +trabalho +dos +em +veja +aqueles +uns +devem +alguns +verdade +nós +uma +tente +bom +mas +conhecido +algmas +saber +umas +usa +qual +um +porque +seu +estivemos +inicio +horas +ambos +cima +outro +maioria +faz +estive +aquela +os +pegar +estiveram +ista +sem +mesmo +povo +apontar +fim +estes +ligado +atrás +dizer +maiorias +iniciar +aqui +ele +tempo +enquanto +ali +tem +antes +verdadeiro +onde +agora +cada diff --git a/src/mongo/db/fts/stop_words_romanian.txt b/src/mongo/db/fts/stop_words_romanian.txt new file mode 100644 index 00000000000..1a7cb994c86 --- /dev/null +++ b/src/mongo/db/fts/stop_words_romanian.txt @@ -0,0 +1,258 @@ +vreo +acelea +cita +degraba +lor +alta +tot +ai +dat +x +despre +peste +bine +dar +foarte +z +avea +multi +cit +alt +mai +sa +fie +tu +multe +e 
+orice +dintr +se +g +intr +niste +multa +insa +il +fost +a +abia +nimic +sub +acel +in +altceva +si +avem +altfel +c +ea +acest +li +parca +fi +dintre +unele +m +acestei +mare +cel +este +pe +atitia +uneori +acela +iti +astazi +acestui +o +imi +ele +ceilalti +pai +fata +noua +sa-ti +altul +au +i +prin +conform +aceste +anume +azi +k +unul +ala +unei +fara +ei +la +aceeasi +u +inapoi +acestea +acesta +catre +sale +asupra +as +aceea +ba +ale +da +le +apoi +aia +suntem +cum +isi +inainte +s +de +cind +cumva +chiar +acestia +daca +sunt +care +al +numai +cui +sus +tocmai +prea +cu +mi +eu +doar +niciodata +exact +putini +aiurea +tuturor +celor +astfel +atunci +citeva +cat +sau +fel +intre +acolo +nostri +ma +mult +una +ceea +iar +sintem +ati +din +geaba +sai +caruia +adica +inca +are +aici +ca +ia +nici +d +oricum +asta +carora +face +citiva +voi +unor +f +atat +toata +alaturi +cea +nu +totusi +ce +altii +acum +sint +capat +mod +deasupra +cam +vom +b +toate +careia +aceasta +atit +nimeni +ii +ci +unde +ul +plus +era +sa-mi +l +spre +dupa +nou +cele +acea +un +incit +n +cei +or +va +deci +acelasi +atatea +h +vor +decit +noi +cineva +desi +ceva +j +ului +atitea +avut +ar +pina +t +atata +unui +el +citi +asa +totul +pentru +atita +v +alti +asemenea +atatia +te +ne +deja +unii +p +atare +cite +cine +cand +toti +vreun +ori +r +alte +lui +ti +ni +aceia +am diff --git a/src/mongo/db/fts/stop_words_russian.txt b/src/mongo/db/fts/stop_words_russian.txt new file mode 100644 index 00000000000..b44c0fc7011 --- /dev/null +++ b/src/mongo/db/fts/stop_words_russian.txt @@ -0,0 +1,421 @@ +четвертый +многочисленное +там +ты +обычно +даром +через +из +туда +каждый +начала +алло +он +за +мор +вам +долго +только +пока +быть +этим +ими +важные +раз +да +теми +никогда +е +менее +под +раньше +них +прекрасно +сама +время +семнадцать +несколько +люди +чаще +им +действительно +том +десятый +везде +тою +четырнадцать +вся +тринадцать +какой +такая +внизу +разве +нее +моя +наш +зато +каждая +ними 
+моё +почти +другие +отовсюду +к +которых +должно +затем +которые +более +важное +давно +рано +всему +может +второй +пятый +тысяч +кто +тому +тоже +ваши +нею +меля +нужно +ни +нх +был +твоё +часто +ею +позже +твоя +вас +двадцатый +нибудь +именно +друго +самими +своих +себе +семь +процентов +одиннадцать +этом +нему +сказать +вдали +всеми +другой +тобою +хорошо +сказала +теперь +были +вверх +двух +неё +говорит +сегодня +тех +потому +этого +ну +пятнадцать +будете +хоть +сих +занято +года +бы +конечно +восемнадцатый +которой +девять +пожалуйста +после +этими +мало +впрочем +без +двенадцать +было +совсем +этот +так +она +непрерывно +свою +нет +хотеть +себя +самого +оба +многочисленная +назад +бывь +кого +где +будут +буду +ней +можно +всем +ещё +сам +вами +мне +кругом +мог +шестнадцатый +о +эта +такие +г +такой +еще +который +мной +самой +ком +то +те +восемнадцать +весь +занят +оно +сказал +сначала +с +которая +нередко +никуда +много +со +ниже +хотя +которого +других +против +однажды +восемь +будь +наша +ли +нас +особенно +тебе +четырнадцатый +две +иметь +первый +вы +иногда +кем +человек +самому +и +во +будет +это +три +заняты +этих +пятнадцатый +довольно +чтоб +все +над +их +мы +ту +будто +от +всего +что +но +при +день +всё +собою +ваш +очень +посреди +говорил +наше +год +опять +третий +четыре +чего +шесть +кажется +немного +семнадцатый +мой +недавно +ему +дел +в +далеко +также +такое +того +будем +однако +времени +какая +ваше +бывает +сами +я +потом +т +не +уж +надо +занята +низко +когда +вдруг +пять +году +вон +девятнадцать +седьмой +девятый +тебя +здесь +этой +шестнадцать +миллионов +стал +самих +снова +чуть +самим +одной +или +ей +саму +мира +каждое +больше +ваша +была +чтобы +просто +двадцать +само +суть +на +слишком +эту +твой +жизнь +тринадцатый +мимо +тут +восьмой +многочисленный +тем +её +него +вокруг +об +кроме +недалеко +лишь +хочешь +один +наиболее +пора +мои +как +м +чем +меня +нами +есть +близко +почему +уже +тобой +мною +его +они +девятнадцатый +у +перед 
+всех +пор +всею +вот +чему +значит +каждые +уметь +зачем +нем +между +всюду +своей +двенадцатый +важный +около +мож +для +можхо +другая +до +ж +эти +самом +даже +кому +всегда +сейчас +ее +одиннадцатый +наверху +куда +одного +своего +меньше +важная +свои +по +могут +про +тот +тогда +свое +лучше +сколько +будешь +наконец +вниз +всю +спасибо +многочисленные +ведь +наши +два +шестой +имя +же +сеаой +отсюда +рядом +а +лет +собой +другое +дальше +ничего +мочь +если +та +десять +этому +нельзя +нам diff --git a/src/mongo/db/fts/stop_words_spanish.txt b/src/mongo/db/fts/stop_words_spanish.txt new file mode 100644 index 00000000000..04132dc6b66 --- /dev/null +++ b/src/mongo/db/fts/stop_words_spanish.txt @@ -0,0 +1,177 @@ +para +tuyo +usan +primero desde +sabes +como +aquellos +largo +ante +podriais +sin +incluso +un +intento +eras +cierto +una +otro +consigues +ha +bien +tras +alguna +hacemos +podrian +tiempo +por +pero +verdadera +podrias +somos +fue +muchos +podeis +modo +intentas +el +trabajar +bajo +fin +atras +ultimo +puedo +hace +ellas +aquel +intenta +estado +ir +ser +haces +las +tener +entre +vais +cierta +van +usar +intentan +su +nos +trabajamos +verdad +estan +trabajan +estoy +ellos +empleas +algunas +también +siendo +muy +solamente +pueden +ciertas +yo +tengo +estamos +unos +hacen +los +empleo +dentro +sus +valor +sois +vosotros +eramos +sabe +tiene +fui +voy +consiguen +podemos +esta +trabajais +entonces +dos +verdadero +saben +trabajas +era +vaya +estaba +usamos +poder +podriamos +vosotras +mio +encima +consigo +usas +teneis +algún +solo +podria +lo +tienen +conseguimos +trabajo +saber +usa +soy +eran +ampleamos +porque +emplear +es +donde +fueron +ambos +tenemos +sabeis +fuimos +eres +va +empleais +cuando +haceis +con +mientras +intentar +aquellas +conseguir +puede +emplean +la +algunos +sobre +vamos +estais +quien +hacer +ciertos +aqui +usais +todo +en +intentais +bastante +uno +sabemos +unas +si +consigue +trabaja +alguno +uso +antes +intentamos +hago +cual 
+arriba +por qué +gueno +nosotros +cada diff --git a/src/mongo/db/fts/stop_words_swedish.txt b/src/mongo/db/fts/stop_words_swedish.txt new file mode 100644 index 00000000000..fb5ecc1d275 --- /dev/null +++ b/src/mongo/db/fts/stop_words_swedish.txt @@ -0,0 +1,386 @@ +bort +ert +gör +likställda +skall +nittonde +finns +artonn +flera +kanske +tills +gick +gjorde +tjugoett +fjorton +siste +hundra +senare +varit +sjuttonde +hundraen +sina +honom +den +nederst +viktigare +allas +del +vilket +vad +större +möjligen +åttio +tolfte +kommer +heller +inför +viktigast +sista +gjort +och +tidigt +mitt +tio +gäller +fler +komma +fem +tretton +smått +mittemot +jämfört +eller +åtta +vart +mycket +aderton +enligt +minst +sjunde +kommit +möjligt +ingenting +liten +goda +längst +när +nummer +femtionde +mest +adertonde +över +alltid +från +fyra +det +göra +enkla +nr +förlåt +varsågod +under +hon +övermorgon +sextionde +något +flesta +fyrtio +gått +nittio +gällt +ute +annat +bland +långsammare +nästa +in +hennes +kunna +ursäkt +fram +dagar +du +innan +varken +elva +två +vill +lätt +mera +inuti +dig +tillsammans +kunnat +femte +som +även +någon +kr +vänstra +åtminstone +allt +utanför +nedersta +annan +noll +nittionde +får +i +gå +tidig +säga +stora +sig +behövt +åttionde +genom +helt +varför +behövde +slutligen +alltså +fjortonde +sitt +tionde +där +delen +imorgon +kvar +tjugotvå +jag +verkligen +andra +ettusen +hellre +mina +inom +vår +tidigare +sextonde +kom +han +värre +om +högre +sämst +vilka +bättre +mindre +första +vårt +ur +tredje +också +finnas +gälla +möjlig +legat +behövas +fjärde +ut +litet +de +har +bakom +deras +små +övre +detta +till +hit +ofta +tjungo +för +sagt +beslutat +att +min +så +blivit +nio +mellan +vems +olika +nionde +upp +mot +ditt +nödvändigt +dina +länge +hur +haft +femtio +nitton +efter +framför +idag +inne +beslutit +dagarna +tjugoen +möjligtvis +rätt +därför +på +igår +gärna +samma +stort +dem +sjuttionde +igen +tjugonde +senast +säger +båda +då +ännu +inga 
+vidare +tidigast +längre +hög +godare +störst +bli +sex +inget +trettionde +din +sexton +hans +vem +våra +tre +går +få +åttonde +nedre +dagen +sedan +alla +sist +man +långsammast +bra +ligga +femton +många +henne +femtonde +helst +tolv +förra +lika +aldrig +nu +vid +sjuttio +nödvändigtvis +långsam +viktigt +vi +ibland +ingen +enkelt +överst +enkel +viktig +några +med +nödvändiga +kunde +höger +tack +dag +ska +fin +fanns +eftersom +oss +adjö +era +andras +ja +snart +rakt +bäst +lättast +ner +vilken +sextio +följande +bådas +lilla +hundraett +måste +utan +trettio +lättare +långsamt +men +än +ha +god +kan +av +skulle +vänster +sjutton +sämre +olikt +varifrån +var +stor +sju +långt +gott +före +tjugo +någonting +mig +fyrtionde +elfte +inte +genast +likställd +nej +hade +ligger +bara +borta +trettonde +sade +beslut +dock +lite +vara +redan +ned +en +fick +tjugotre +dess +artonde +knappast +fått +här +högst +tvåhundra +ett +blir +sent +sin +sjätte +nödvändig +mer +er +ni +blev +nog +godast +dit +oftast +behöva diff --git a/src/mongo/db/fts/stop_words_test.cpp b/src/mongo/db/fts/stop_words_test.cpp new file mode 100644 index 00000000000..8d70600ce8e --- /dev/null +++ b/src/mongo/db/fts/stop_words_test.cpp @@ -0,0 +1,32 @@ +// stop_words_test.cpp + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/
+
+#include "mongo/db/fts/stop_words.h"
+#include "mongo/unittest/unittest.h"
+
+namespace mongo {
+    namespace fts {
+        // "the" is on the English stop-word list; "computer" is not.
+        TEST( English, Basic1 ) {
+            const StopWords* const words = StopWords::getStopWords( "english" );
+            ASSERT( words->isStopWord( "the" ) );
+            ASSERT( !words->isStopWord( "computer" ) );
+        }
+
+    }
+}
diff --git a/src/mongo/db/fts/stop_words_turkish.txt b/src/mongo/db/fts/stop_words_turkish.txt
new file mode 100644
index 00000000000..66dea7e2dec
--- /dev/null
+++ b/src/mongo/db/fts/stop_words_turkish.txt
@@ -0,0 +1,114 @@
+mu
+onlar
+seksen
+ama
+trilyon
+buna
+bizim
+þeyden
+yirmi
+altý
+iki
+seni
+doksan
+dört
+bunun
+ki
+nereye
+altmýþ
+hem
+milyon
+kez
+otuz
+beþ
+elli
+bizi
+da
+sekiz
+ve
+çok
+bu
+veya
+ya
+kýrk
+onlarýn
+ona
+bana
+yetmiþ
+milyar
+þunu
+senden
+birþeyi
+dokuz
+yani
+kimi
+þeyler
+kim
+neden
+senin
+yedi
+niye
+üç
+þey
+mý
+tüm
+onlari
+bunda
+ise
+þundan
+hep
+þuna
+bin
+ben
+ondan
+kimden
+bazý
+belki
+ne
+bundan
+gibi
+de
+onlardan
+sizi
+sizin
+daha
+niçin
+þunda
+INSERmi
+bunu
+beni
+ile
+þu
+þeyi
+sizden
+defa
+biz
+için
+dahi
+siz
+nerde
+kime
+birþey
+birkez
+her
+biri
+on
+mü
+diye
+acaba
+sen
+en
+hepsi
+bir
+bizden
+sanki
+benim
+nerede
+onu
+benden
+yüz
+birkaç
+çünkü
+nasýl
+hiç
+katrilyon
diff --git a/src/mongo/db/fts/tokenizer.cpp b/src/mongo/db/fts/tokenizer.cpp
new file mode 100644
index 00000000000..73f485901f6
--- /dev/null
+++ b/src/mongo/db/fts/tokenizer.cpp
@@ -0,0 +1,129 @@
+// tokenizer.cpp
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <string>
+
+#include "mongo/db/fts/tokenizer.h"
+#include "mongo/util/stringutils.h"
+
+namespace mongo {
+
+    namespace fts {
+
+        /**
+         * Starts tokenizing 'str'. Apostrophes count as part of a word only when
+         * 'language' is "english". Leading whitespace is consumed immediately, and
+         * the first token always reports that whitespace preceded it.
+         */
+        Tokenizer::Tokenizer( const string& language, const StringData& str )
+            : _pos( 0 ),
+              _previousWhiteSpace( true ),
+              _raw( str ),
+              _english( language == "english" ) {
+            _skipWhitespace();
+        }
+
+        /**
+         * True while there is unread input left.
+         */
+        bool Tokenizer::more() const {
+            return _raw.size() > _pos;
+        }
+
+        /**
+         * Returns the next token, or an INVALID token once the input is used up.
+         * A TEXT token is a maximal run of text characters; a DELIMITER token is a
+         * single character. Whitespace following the token is skipped, and whether
+         * any was found is reported by the token returned from the following call.
+         */
+        Token Tokenizer::next() {
+            if ( !more() ) {
+                return Token( Token::INVALID, "", 0, false );
+            }
+
+            const unsigned begin = _pos++;
+            const Token::Type kind = _type( _raw[begin] );
+
+            // The constructor and every previous call leave _pos past any whitespace.
+            if ( kind == Token::WHITESPACE ) {
+                abort();
+            }
+
+            // Only TEXT tokens span more than one character.
+            if ( kind == Token::TEXT ) {
+                while ( _pos < _raw.size() && _type( _raw[_pos] ) == Token::TEXT ) {
+                    ++_pos;
+                }
+            }
+
+            const StringData text = _raw.substr( begin, _pos - begin );
+            const bool precededBySpace = _previousWhiteSpace;
+
+            // Skip trailing whitespace now so that more() only sees real tokens.
+            _previousWhiteSpace = _skipWhitespace();
+            return Token( kind, text, begin, precededBySpace );
+        }
+
+        /**
+         * Moves _pos past a run of whitespace. Returns true if anything was skipped.
+         */
+        bool Tokenizer::_skipWhitespace() {
+            const unsigned before = _pos;
+            for ( ; _pos < _raw.size(); ++_pos ) {
+                if ( _type( _raw[_pos] ) != Token::WHITESPACE ) {
+                    break;
+                }
+            }
+            return _pos > before;
+        }
+
+        /**
+         * Classifies a single character: the six whitespace characters listed below
+         * are WHITESPACE, the apostrophe is TEXT for English and WHITESPACE for any
+         * other language, the listed punctuation is DELIMITER, and everything else
+         * is TEXT.
+         */
+        Token::Type Tokenizer::_type( char c ) const {
+            switch ( c ) {
+            case ' ': case '\f': case '\v': case '\t': case '\r': case '\n':
+                return Token::WHITESPACE;
+
+            case '\'':
+                return _english ? Token::TEXT : Token::WHITESPACE;
+
+            case '~': case '`':
+            case '!': case '@': case '#': case '$': case '%':
+            case '^': case '&': case '*': case '(': case ')':
+            case '-':
+            case '=': case '+':
+            case '[': case ']': case '{': case '}': case '|': case '\\':
+            case ';': case ':':
+            case '"':
+            case '<': case '>':
+            case ',': case '.':
+            case '/': case '?':
+                return Token::DELIMITER;
+
+            default:
+                return Token::TEXT;
+            }
+        }
+
+    }
+
+}
diff --git
a/src/mongo/db/fts/tokenizer.h b/src/mongo/db/fts/tokenizer.h
new file mode 100644
index 00000000000..5b9a56ed8d6
--- /dev/null
+++ b/src/mongo/db/fts/tokenizer.h
@@ -0,0 +1,90 @@
+// tokenizer.h
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+#pragma once
+
+#include <string>
+
+#include "mongo/base/string_data.h"
+#include "mongo/platform/unordered_map.h"
+#include "mongo/platform/unordered_set.h"
+
+namespace mongo {
+
+    namespace fts {
+
+        /**
+         * One piece of input handed back by Tokenizer::next().
+         */
+        struct Token {
+            enum Type { WHITESPACE, DELIMITER, TEXT, INVALID };
+            Token( Type type, const StringData& data, unsigned offset, bool previousWhiteSpace )
+                : type( type ),
+                  data( data ),
+                  offset( offset ),
+                  previousWhiteSpace( previousWhiteSpace ) {}
+
+            /** True for every token type except INVALID. */
+            bool ok() const { return type != INVALID; }
+
+            Type type;
+            StringData data;           // the token's characters
+            unsigned offset;           // index of the first character within the input
+            bool previousWhiteSpace;   // whitespace preceded the token (always true for the first)
+        };
+
+        /**
+         * Splits a string into TEXT and DELIMITER tokens, skipping whitespace.
+         *
+         * NOTE: StringData is assumed to be a non-owning view, so the characters
+         * behind 'str' have to outlive the Tokenizer and the Tokens it returns.
+         */
+        class Tokenizer {
+        public:
+
+            /** 'language' selects language-specific rules; 'str' is the text to split. */
+            Tokenizer( const std::string& language, const StringData& str );
+
+            /** True while next() still has input to return. */
+            bool more() const;
+
+            /** Returns the next token, or one of type INVALID when the input is used up. */
+            Token next();
+
+        private:
+            /** Maps a single character to the token type it belongs to. */
+            Token::Type _type( char c ) const;
+
+            /** Advances past whitespace; returns true if any was skipped. */
+            bool _skipWhitespace();
+
+            unsigned _pos;              // index of the next unread character in _raw
+            bool _previousWhiteSpace;   // whether whitespace preceded the upcoming token
+
+            // Kept as a copy, not a reference: callers commonly pass a temporary
+            // StringData (for instance one converted from a string literal), and a
+            // reference member would dangle once the constructor returns.
+            const StringData _raw;
+
+            bool _english;              // apostrophes are part of words only when true
+        };
+
+    }
+}
+
diff --git a/src/mongo/db/fts/tokenizer_test.cpp b/src/mongo/db/fts/tokenizer_test.cpp
new file mode 100644
index 00000000000..1502b2f4390
--- /dev/null
+++ b/src/mongo/db/fts/tokenizer_test.cpp
@@ -0,0 +1,119 @@
+// 
tokenizer_test.cpp
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "mongo/db/fts/tokenizer.h"
+#include "mongo/unittest/unittest.h"
+
+namespace mongo {
+    namespace fts {
+
+        // Empty input: there is nothing to read.
+        TEST( Tokenizer, Empty1 ) {
+            Tokenizer tokenizer( "english", "" );
+            ASSERT( !tokenizer.more() );
+        }
+
+        // Words separated by single spaces come back one at a time, in order.
+        TEST( Tokenizer, Basic1 ) {
+            Tokenizer tokenizer( "english", "blue red green" );
+            ASSERT( tokenizer.more() );
+            ASSERT_EQUALS( tokenizer.next().data.toString(), "blue" );
+
+            ASSERT( tokenizer.more() );
+            ASSERT_EQUALS( tokenizer.next().data.toString(), "red" );
+
+            ASSERT( tokenizer.more() );
+            ASSERT_EQUALS( tokenizer.next().data.toString(), "green" );
+
+            ASSERT( !tokenizer.more() );
+        }
+
+        // A hyphen yields TEXT, DELIMITER, TEXT; only the first token reports whitespace.
+        TEST( Tokenizer, Basic2 ) {
+            Tokenizer tokenizer( "english", "blue-red" );
+            const Token first = tokenizer.next();
+            const Token dash = tokenizer.next();
+            const Token second = tokenizer.next();
+            const Token past = tokenizer.next();
+
+            ASSERT_EQUALS( Token::TEXT, first.type );
+            ASSERT_EQUALS( Token::DELIMITER, dash.type );
+            ASSERT_EQUALS( Token::TEXT, second.type );
+            ASSERT_EQUALS( Token::INVALID, past.type );
+
+            ASSERT_EQUALS( "blue", first.data.toString() );
+            ASSERT_EQUALS( "-", dash.data.toString() );
+            ASSERT_EQUALS( "red", second.data.toString() );
+
+            ASSERT( first.previousWhiteSpace );
+            ASSERT( !dash.previousWhiteSpace );
+            ASSERT( !second.previousWhiteSpace );
+        }
+
+        // A space before the hyphen is reported on the delimiter; offsets index the input.
+        TEST( Tokenizer, Basic3 ) {
+            Tokenizer tokenizer( "english", "blue -red" );
+            const Token first = tokenizer.next();
+            const Token dash = tokenizer.next();
+            const Token second = tokenizer.next();
+            const Token past = tokenizer.next();
+
+            ASSERT_EQUALS( Token::TEXT, first.type );
+            ASSERT_EQUALS( Token::DELIMITER, dash.type );
+            ASSERT_EQUALS( Token::TEXT, second.type );
+            ASSERT_EQUALS( Token::INVALID, past.type );
+
+            ASSERT_EQUALS( "blue", first.data.toString() );
+            ASSERT_EQUALS( "-", dash.data.toString() );
+            ASSERT_EQUALS( "red", second.data.toString() );
+
+            ASSERT( first.previousWhiteSpace );
+            ASSERT( dash.previousWhiteSpace );
+            ASSERT( !second.previousWhiteSpace );
+
+            ASSERT_EQUALS( 0U, first.offset );
+            ASSERT_EQUALS( 5U, dash.offset );
+            ASSERT_EQUALS( 6U, second.offset );
+        }
+
+        // English keeps the apostrophe inside the word.
+        TEST( Tokenizer, Quote1English ) {
+            Tokenizer tokenizer( "english", "eliot's car" );
+            const Token first = tokenizer.next();
+            const Token second = tokenizer.next();
+
+            ASSERT_EQUALS( "eliot's", first.data.toString() );
+            ASSERT_EQUALS( "car", second.data.toString() );
+        }
+
+        // Any other language splits the word at the apostrophe.
+        TEST( Tokenizer, Quote1French ) {
+            Tokenizer tokenizer( "french", "eliot's car" );
+            const Token first = tokenizer.next();
+            const Token second = tokenizer.next();
+            const Token third = tokenizer.next();
+
+            ASSERT_EQUALS( "eliot", first.data.toString() );
+            ASSERT_EQUALS( "s", second.data.toString() );
+            ASSERT_EQUALS( "car", third.data.toString() );
+        }
+
+    }
+}
+
+
|