Diffstat (limited to 'src/mongo')
 src/mongo/db/fts/SConscript                |  1
 src/mongo/db/fts/fts_basic_tokenizer.cpp   | 90
 src/mongo/db/fts/fts_basic_tokenizer.h     | 79
 src/mongo/db/fts/fts_language.cpp          |  6
 src/mongo/db/fts/fts_language.h            | 24
 src/mongo/db/fts/fts_matcher.cpp           | 31
 src/mongo/db/fts/fts_query.cpp             | 70
 src/mongo/db/fts/fts_query.h               |  3
 src/mongo/db/fts/fts_spec.cpp              | 22
 src/mongo/db/fts/fts_spec.h                |  2
 src/mongo/db/fts/fts_spec_legacy.cpp       |  6
 src/mongo/db/fts/fts_tokenizer.h           | 72
 src/mongo/db/fts/stemmer.cpp               |  6
 src/mongo/db/fts/stemmer.h                 |  2
 src/mongo/db/fts/stemmer_test.cpp          |  4
 src/mongo/db/fts/stop_words.cpp            | 15
 src/mongo/db/fts/stop_words.h              |  2
 src/mongo/db/fts/stop_words_test.cpp       |  2
 src/mongo/db/fts/tokenizer.cpp             |  4
 src/mongo/db/fts/tokenizer.h               |  2
 src/mongo/db/fts/tokenizer_test.cpp        | 12
21 files changed, 355 insertions, 100 deletions
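
The heart of this change is the new FTSTokenizer interface (fts_tokenizer.h) and its BasicFTSTokenizer implementation, which replace the ad-hoc Tokenizer/Stemmer/StopWords plumbing in the matcher, query parser, and spec scoring code. As a reading aid before the file-by-file diff, the sketch below shows how a caller is expected to drive that interface; the free function collectTerms and its surrounding boilerplate are invented for illustration and are not part of the patch, but the calls mirror the API added below.

    #include <memory>
    #include <string>
    #include <vector>

    #include "mongo/db/fts/fts_language.h"
    #include "mongo/db/fts/fts_tokenizer.h"

    namespace mongo {
    namespace fts {

        // Illustrative helper only: gather the indexing terms for one document.
        std::vector<std::string> collectTerms(const FTSLanguage* language,
                                              const std::string& text,
                                              bool caseSensitive) {
            // Each language hands out its own tokenizer implementation; the
            // BasicFTSLanguage subclass in this patch returns a BasicFTSTokenizer
            // (space tokenization + tolower + Snowball stemming + stop words).
            std::unique_ptr<FTSTokenizer> tokenizer = language->createTokenizer();

            // reset() starts a new document; a single tokenizer instance can be
            // reused across documents.
            tokenizer->reset(text.c_str(), caseSensitive);

            std::vector<std::string> terms;
            while (tokenizer->moveNext()) {
                // get() is only valid until the next moveNext(), so copy it out.
                terms.push_back(tokenizer->get().toString());
            }
            return terms;
        }

    }  // namespace fts
    }  // namespace mongo
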
diff --git a/src/mongo/db/fts/SConscript b/src/mongo/db/fts/SConscript index 6ccc070fd64..5a782014600 100644 --- a/src/mongo/db/fts/SConscript +++ b/src/mongo/db/fts/SConscript @@ -33,6 +33,7 @@ baseEnv.Library('base', [ 'fts_spec.cpp', 'fts_spec_legacy.cpp', 'fts_language.cpp', + 'fts_basic_tokenizer.cpp', 'fts_util.cpp', 'fts_element_iterator.cpp', 'stemmer.cpp', diff --git a/src/mongo/db/fts/fts_basic_tokenizer.cpp b/src/mongo/db/fts/fts_basic_tokenizer.cpp new file mode 100644 index 00000000000..296f473f144 --- /dev/null +++ b/src/mongo/db/fts/fts_basic_tokenizer.cpp @@ -0,0 +1,90 @@ +/** + * Copyright (C) 2015 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. 
+ */ + +#include "mongo/platform/basic.h" + +#include "mongo/db/fts/fts_basic_tokenizer.h" + +#include "mongo/db/fts/fts_query.h" +#include "mongo/db/fts/fts_spec.h" +#include "mongo/db/fts/stemmer.h" +#include "mongo/db/fts/stop_words.h" +#include "mongo/db/fts/tokenizer.h" +#include "mongo/stdx/memory.h" +#include "mongo/util/mongoutils/str.h" +#include "mongo/util/stringutils.h" + +namespace mongo { +namespace fts { + + using std::string; + + BasicFTSTokenizer::BasicFTSTokenizer(const FTSLanguage* language) + : _language(language), _stemmer(language), _stopWords(StopWords::getStopWords(language)) { + } + + void BasicFTSTokenizer::reset(const char* document, bool generateCaseSensitiveTokens) { + _generateCaseSensitiveTokens = generateCaseSensitiveTokens; + _tokenizer = stdx::make_unique<Tokenizer>(_language, document); + } + + bool BasicFTSTokenizer::moveNext() { + while (true) { + bool hasMore = _tokenizer->more(); + if (!hasMore) { + _stem = ""; + return false; + } + + Token token = _tokenizer->next(); + + string word = token.data.toString(); + + word = tolowerString(token.data); + + // Stop words are case-sensitive so we need them to be lower cased to check + // against the stop word list + if (_stopWords->isStopWord(word)) { + continue; + } + + if (_generateCaseSensitiveTokens) { + word = token.data.toString(); + } + + _stem = _stemmer.stem(word); + return true; + } + } + + StringData BasicFTSTokenizer::get() const { + return _stem; + } + +} // namespace fts +} // namespace mongo diff --git a/src/mongo/db/fts/fts_basic_tokenizer.h b/src/mongo/db/fts/fts_basic_tokenizer.h new file mode 100644 index 00000000000..fd59a4583fc --- /dev/null +++ b/src/mongo/db/fts/fts_basic_tokenizer.h @@ -0,0 +1,79 @@ +/** + * Copyright (C) 2015 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. 
+ */ + +#pragma once + +#include "mongo/base/disallow_copying.h" +#include "mongo/base/string_data.h" +#include "mongo/db/fts/fts_tokenizer.h" +#include "mongo/db/fts/stemmer.h" +#include "mongo/db/fts/tokenizer.h" + +namespace mongo { +namespace fts { + + class FTSLanguage; + class StopWords; + + /** + * BasicFTSTokenizer + * A iterator of "documents" where a document contains ASCII space (U+0020) delimited words. + * Uses + * - Tokenizer for tokenizing words via ASCII space (ie, U+0020 space). + * - tolower from the C standard libary to lower letters, ie, it only supports lower casing + * - ASCII letters (U+0000 - U+007F) + * - Stemmer (ie, Snowball Stemmer) to stem words. + * - Embeded stop word lists for each language in StopWord class + * + * For each word returns a stem version of a word optimized for full text indexing. + * Optionally supports returning case sensitive search terms. + */ + class BasicFTSTokenizer : public FTSTokenizer { + MONGO_DISALLOW_COPYING(BasicFTSTokenizer); + public: + BasicFTSTokenizer(const FTSLanguage* language); + + void reset(const char* document, bool generateCaseSensitiveTokens) override; + + bool moveNext() override; + + StringData get() const override; + + private: + const FTSLanguage* const _language; + const Stemmer _stemmer; + const StopWords* const _stopWords; + + std::unique_ptr<Tokenizer> _tokenizer; + bool _generateCaseSensitiveTokens; + + std::string _stem; + }; + +} // namespace fts +} // namespace mongo diff --git a/src/mongo/db/fts/fts_language.cpp b/src/mongo/db/fts/fts_language.cpp index edb2b7cf363..9164d157508 100644 --- a/src/mongo/db/fts/fts_language.cpp +++ b/src/mongo/db/fts/fts_language.cpp @@ -33,6 +33,8 @@ #include <string> #include "mongo/base/init.h" +#include "mongo/db/fts/fts_basic_tokenizer.h" +#include "mongo/stdx/memory.h" #include "mongo/util/assert_util.h" #include "mongo/util/mongoutils/str.h" #include "mongo/util/string_map.h" @@ -79,6 +81,10 @@ namespace mongo { LanguageMapV1 languageMapV1; } + std::unique_ptr<FTSTokenizer> BasicFTSLanguage::createTokenizer() const { + return stdx::make_unique<BasicFTSTokenizer>(this); + } + MONGO_INITIALIZER_GROUP( FTSAllLanguagesRegistered, MONGO_NO_PREREQUISITES, MONGO_NO_DEPENDENTS ); diff --git a/src/mongo/db/fts/fts_language.h b/src/mongo/db/fts/fts_language.h index 3a9acbbdd94..ce45e0b812a 100644 --- a/src/mongo/db/fts/fts_language.h +++ b/src/mongo/db/fts/fts_language.h @@ -39,8 +39,10 @@ namespace mongo { namespace fts { + class FTSTokenizer; + #define MONGO_FTS_LANGUAGE_DECLARE( language, name, version ) \ - FTSLanguage language; \ + BasicFTSLanguage language; \ MONGO_INITIALIZER_GENERAL( language, MONGO_NO_PREREQUISITES, \ ( "FTSAllLanguagesRegistered" ) ) \ ( ::mongo::InitializerContext* context ) { \ @@ -70,6 +72,8 @@ namespace mongo { /** Create an uninitialized language. */ FTSLanguage(); + virtual ~FTSLanguage() {} + /** * Returns the language as a std::string in canonical form (lowercased English name). It is * an error to call str() on an uninitialized language. @@ -77,6 +81,12 @@ namespace mongo { const std::string& str() const; /** + * Returns a new FTSTokenizer instance for this language. + * Lifetime is scoped to FTSLanguage (which are currently all process lifetime) + */ + virtual std::unique_ptr<FTSTokenizer> createTokenizer() const = 0; + + /** * Register std::string 'languageName' as a new language with text index version * 'textIndexVersion'. Saves the resulting language to out-argument 'languageOut'. 
* Subsequent calls to FTSLanguage::make() will recognize the newly-registered language @@ -120,9 +130,15 @@ namespace mongo { typedef StatusWith<const FTSLanguage*> StatusWithFTSLanguage; - extern FTSLanguage languagePorterV1; - extern FTSLanguage languageEnglishV2; - extern FTSLanguage languageFrenchV2; + + class BasicFTSLanguage : public FTSLanguage { + public: + std::unique_ptr<FTSTokenizer> createTokenizer() const override; + }; + + extern BasicFTSLanguage languagePorterV1; + extern BasicFTSLanguage languageEnglishV2; + extern BasicFTSLanguage languageFrenchV2; } } diff --git a/src/mongo/db/fts/fts_matcher.cpp b/src/mongo/db/fts/fts_matcher.cpp index 492dbdf7b7b..634bcf345cd 100644 --- a/src/mongo/db/fts/fts_matcher.cpp +++ b/src/mongo/db/fts/fts_matcher.cpp @@ -31,6 +31,7 @@ #include "mongo/platform/basic.h" #include "mongo/db/fts/fts_matcher.h" +#include "mongo/db/fts/fts_tokenizer.h" #include "mongo/db/fts/fts_element_iterator.h" #include "mongo/platform/strcasestr.h" @@ -96,15 +97,13 @@ namespace mongo { bool FTSMatcher::_hasPositiveTerm_string( const FTSLanguage* language, const string& raw ) const { - Tokenizer i( *language, raw ); - Stemmer stemmer( *language ); - while ( i.more() ) { - Token t = i.next(); - if ( t.type != Token::TEXT ) { - continue; - } - string word = stemmer.stem( _query.normalizeString( t.data ) ); - if ( _query.getPositiveTerms().count( word ) > 0 ) { + std::unique_ptr<FTSTokenizer> tokenizer(language->createTokenizer()); + + tokenizer->reset(raw.c_str(), _query.getCaseSensitive()); + + while (tokenizer->moveNext()) { + string word = tokenizer->get().toString(); + if (_query.getPositiveTerms().count(word) > 0) { return true; } } @@ -130,14 +129,12 @@ namespace mongo { bool FTSMatcher::_hasNegativeTerm_string( const FTSLanguage* language, const string& raw ) const { - Tokenizer i( *language, raw ); - Stemmer stemmer( *language ); - while ( i.more() ) { - Token t = i.next(); - if ( t.type != Token::TEXT ) { - continue; - } - string word = stemmer.stem( _query.normalizeString( t.data ) ); + std::unique_ptr<FTSTokenizer> tokenizer(language->createTokenizer()); + + tokenizer->reset(raw.c_str(), _query.getCaseSensitive()); + + while (tokenizer->moveNext()) { + string word = tokenizer->get().toString(); if ( _query.getNegatedTerms().count( word ) > 0 ) { return true; } diff --git a/src/mongo/db/fts/fts_query.cpp b/src/mongo/db/fts/fts_query.cpp index 9088719d11e..e05aa5693cc 100644 --- a/src/mongo/db/fts/fts_query.cpp +++ b/src/mongo/db/fts/fts_query.cpp @@ -33,6 +33,7 @@ #include "mongo/db/fts/fts_query.h" #include "mongo/db/fts/fts_spec.h" +#include "mongo/db/fts/fts_tokenizer.h" #include "mongo/db/fts/tokenizer.h" #include "mongo/util/mongoutils/str.h" #include "mongo/util/stringutils.h" @@ -59,15 +60,14 @@ namespace mongo { _language = swl.getValue(); _caseSensitive = caseSensitive; - const StopWords& stopWords = *StopWords::getStopWords( *_language ); - Stemmer stemmer( *_language ); + std::unique_ptr<FTSTokenizer> tokenizer(_language->createTokenizer()); bool inNegation = false; bool inPhrase = false; unsigned quoteOffset = 0; - Tokenizer i( *_language, query ); + Tokenizer i( _language, query ); while ( i.more() ) { Token t = i.next(); @@ -78,7 +78,7 @@ namespace mongo { // don't add term } else { - _addTerm( stopWords, stemmer, s, inNegation ); + _addTerm( tokenizer.get(), s, inNegation ); } if ( inNegation && !inPhrase ) @@ -122,44 +122,52 @@ namespace mongo { return Status::OK(); } - void FTSQuery::_addTerm( const StopWords& sw, - const Stemmer& stemmer, 
+ void FTSQuery::_addTerm( FTSTokenizer* tokenizer, const string& token, bool negated ) { - // Compute the string corresponding to 'token' that will be used for index bounds - // generation. - string boundsTerm = tolowerString( token ); - if ( sw.isStopWord( boundsTerm ) ) { - return; - } - boundsTerm = stemmer.stem( boundsTerm ); + tokenizer->reset(token.c_str(), false); + + auto& activeTerms = negated ? _negatedTerms : _positiveTerms; + + // First, get all the terms for indexing, ie, lower cased words + // If we are case-insensitive, we can also used this for positive, and negative terms + // Some terms may be expanded into multiple words in some non-English languages + while (tokenizer->moveNext()) { + + string word = tokenizer->get().toString(); + + if (!negated) { + _termsForBounds.insert(word); + } - // If the lowercased version of 'token' is a not a stop word, 'token' itself should also - // not be. - dassert( !sw.isStopWord( token ) ); - if ( !negated ) { - _termsForBounds.insert( boundsTerm ); + // Compute the string corresponding to 'token' that will be used for the matcher. + // For case-insensitive queries, this is the same string as 'boundsTerm' computed + // above. + if (!_caseSensitive) { + activeTerms.insert(word); + } } - // Compute the string corresponding to 'token' that will be used for the matcher. For - // case-insensitive queries, this is the same string as 'boundsTerm' computed above. - // However, for case-sensitive queries we need to re-stem the original token, since - // 'boundsTerm' is already lowercased but we need the original casing for an exact - // match. - const string& matcherTerm = _caseSensitive ? stemmer.stem( token ) : boundsTerm; - if ( negated ) { - _negatedTerms.insert( matcherTerm ); + if (!_caseSensitive) { + return; } - else { - _positiveTerms.insert( matcherTerm ); + + tokenizer->reset(token.c_str(), true); + + // If we want case-sensitivity, get the case-sensitive token + while (tokenizer->moveNext()) { + + string word = tokenizer->get().toString(); + + activeTerms.insert(word); } } - string FTSQuery::normalizeString( StringData str ) const { - if ( _caseSensitive ) { + string FTSQuery::normalizeString(StringData str) const { + if (_caseSensitive) { return str.toString(); } - return tolowerString( str ); + return tolowerString(str); } namespace { diff --git a/src/mongo/db/fts/fts_query.h b/src/mongo/db/fts/fts_query.h index 96317c926e5..f9ea7f2d1eb 100644 --- a/src/mongo/db/fts/fts_query.h +++ b/src/mongo/db/fts/fts_query.h @@ -79,8 +79,7 @@ namespace mongo { static const bool caseSensitiveDefault; private: - void _addTerm( const StopWords& sw, - const Stemmer& stemmer, + void _addTerm( FTSTokenizer* tokenizer, const std::string& token, bool negated ); diff --git a/src/mongo/db/fts/fts_spec.cpp b/src/mongo/db/fts/fts_spec.cpp index fdd9ecf7824..9e68835e83b 100644 --- a/src/mongo/db/fts/fts_spec.cpp +++ b/src/mongo/db/fts/fts_spec.cpp @@ -33,6 +33,7 @@ #include "mongo/db/field_ref.h" #include "mongo/db/fts/fts_element_iterator.h" +#include "mongo/db/fts/fts_tokenizer.h" #include "mongo/db/fts/fts_util.h" #include "mongo/util/mongoutils/str.h" #include "mongo/util/stringutils.h" @@ -167,13 +168,12 @@ namespace mongo { while ( it.more() ) { FTSIteratorValue val = it.next(); - Stemmer stemmer( *val._language ); - Tools tools( *val._language, &stemmer, StopWords::getStopWords( *val._language ) ); - _scoreStringV2( tools, val._text, term_freqs, val._weight ); + std::unique_ptr<FTSTokenizer> tokenizer(val._language->createTokenizer()); + 
_scoreStringV2( tokenizer.get(), val._text, term_freqs, val._weight ); } } - void FTSSpec::_scoreStringV2( const Tools& tools, + void FTSSpec::_scoreStringV2( FTSTokenizer* tokenizer, StringData raw, TermFrequencyMap* docScores, double weight ) const { @@ -182,18 +182,10 @@ namespace mongo { unsigned numTokens = 0; - Tokenizer i( tools.language, raw ); - while ( i.more() ) { - Token t = i.next(); - if ( t.type != Token::TEXT ) { - continue; - } + tokenizer->reset(raw.rawData(), false ); - string term = tolowerString( t.data ); - if ( tools.stopwords->isStopWord( term ) ) { - continue; - } - term = tools.stemmer->stem( term ); + while (tokenizer->moveNext()) { + string term = tokenizer->get().toString(); ScoreHelperStruct& data = terms[term]; diff --git a/src/mongo/db/fts/fts_spec.h b/src/mongo/db/fts/fts_spec.h index c35bc2c9010..0f17d825dcc 100644 --- a/src/mongo/db/fts/fts_spec.h +++ b/src/mongo/db/fts/fts_spec.h @@ -119,7 +119,7 @@ namespace mongo { * Calculate the term scores for 'raw' and update 'term_freqs' with the result. Parses * 'raw' using 'tools', and weights term scores based on 'weight'. */ - void _scoreStringV2( const Tools& tools, + void _scoreStringV2( FTSTokenizer* tokenizer, StringData raw, TermFrequencyMap* term_freqs, double weight ) const; diff --git a/src/mongo/db/fts/fts_spec_legacy.cpp b/src/mongo/db/fts/fts_spec_legacy.cpp index 69721fe2ae0..a2dc1dc2489 100644 --- a/src/mongo/db/fts/fts_spec_legacy.cpp +++ b/src/mongo/db/fts/fts_spec_legacy.cpp @@ -73,7 +73,7 @@ namespace mongo { unsigned numTokens = 0; - Tokenizer i( tools.language, raw ); + Tokenizer i( &tools.language, raw ); while ( i.more() ) { Token t = i.next(); if ( t.type != Token::TEXT ) @@ -162,8 +162,8 @@ namespace mongo { const FTSLanguage& language = _getLanguageToUseV1( obj ); - Stemmer stemmer(language); - Tools tools(language, &stemmer, StopWords::getStopWords( language )); + Stemmer stemmer(&language); + Tools tools(language, &stemmer, StopWords::getStopWords( &language )); if ( wildcard() ) { // if * is specified for weight, we can recurse over all fields. diff --git a/src/mongo/db/fts/fts_tokenizer.h b/src/mongo/db/fts/fts_tokenizer.h new file mode 100644 index 00000000000..65833aff0cb --- /dev/null +++ b/src/mongo/db/fts/fts_tokenizer.h @@ -0,0 +1,72 @@ +/** + * Copyright (C) 2015 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. 
If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + + +#pragma once + +#include "mongo/base/disallow_copying.h" +#include "mongo/base/string_data.h" + +namespace mongo { +namespace fts { + + class FTSLanguage; + class StopWords; + + /** + * FTSTokenizer + * A iterator of "documents" where a document contains space delimited words. + * For each word returns a stem or lemma version of a word optimized for full text indexing. + * Optionally supports returning case sensitive search terms. + */ + class FTSTokenizer { + public: + virtual ~FTSTokenizer() = default; + + /** + * Process a new document, and discards any previous results. + * May be called multiple times on an instance of an iterator. + */ + virtual void reset(const char* document, bool generateCaseSensitiveTokens) = 0; + + /** + * Moves to the next token in the iterator. + * Returns false when the iterator reaches end of the document. + */ + virtual bool moveNext() = 0; + + /** + * Returns stemmed form, normalized, and lowercased depending on the parameter + * to the reset method. + * Returned StringData is valid until next call to moveNext(). + */ + virtual StringData get() const = 0; + }; + +} // namespace fts +} // namespace mongo diff --git a/src/mongo/db/fts/stemmer.cpp b/src/mongo/db/fts/stemmer.cpp index 4a734dbe316..9353fccf297 100644 --- a/src/mongo/db/fts/stemmer.cpp +++ b/src/mongo/db/fts/stemmer.cpp @@ -40,10 +40,10 @@ namespace mongo { using std::string; - Stemmer::Stemmer( const FTSLanguage& language ) { + Stemmer::Stemmer( const FTSLanguage* language ) { _stemmer = NULL; - if ( language.str() != "none" ) - _stemmer = sb_stemmer_new(language.str().c_str(), "UTF_8"); + if ( language->str() != "none" ) + _stemmer = sb_stemmer_new(language->str().c_str(), "UTF_8"); } Stemmer::~Stemmer() { diff --git a/src/mongo/db/fts/stemmer.h b/src/mongo/db/fts/stemmer.h index 6abba8abddc..d6d76e64218 100644 --- a/src/mongo/db/fts/stemmer.h +++ b/src/mongo/db/fts/stemmer.h @@ -49,7 +49,7 @@ namespace mongo { class Stemmer { MONGO_DISALLOW_COPYING( Stemmer ); public: - Stemmer( const FTSLanguage& language ); + Stemmer( const FTSLanguage* language ); ~Stemmer(); std::string stem( StringData word ) const; diff --git a/src/mongo/db/fts/stemmer_test.cpp b/src/mongo/db/fts/stemmer_test.cpp index 9037715d4da..bef556bf2ad 100644 --- a/src/mongo/db/fts/stemmer_test.cpp +++ b/src/mongo/db/fts/stemmer_test.cpp @@ -38,13 +38,13 @@ namespace mongo { namespace fts { TEST( English, Stemmer1 ) { - Stemmer s( languageEnglishV2 ); + Stemmer s( &languageEnglishV2 ); ASSERT_EQUALS( "run", s.stem( "running" ) ); ASSERT_EQUALS( "Run", s.stem( "Running" ) ); } TEST( English, Caps ) { - Stemmer s( languagePorterV1 ); + Stemmer s( &languagePorterV1 ); ASSERT_EQUALS( "unit", s.stem( "united" ) ); ASSERT_EQUALS( "Unite", s.stem( "United" ) ); } diff --git a/src/mongo/db/fts/stop_words.cpp b/src/mongo/db/fts/stop_words.cpp index 66240a1ce2d..421bfae63db 100644 --- a/src/mongo/db/fts/stop_words.cpp +++ b/src/mongo/db/fts/stop_words.cpp @@ -28,7 +28,6 @@ * it in the license file. 
*/ -#include <boost/shared_ptr.hpp> #include <set> #include <string> @@ -37,18 +36,14 @@ #include "mongo/base/init.h" #include "mongo/util/string_map.h" - - namespace mongo { - using boost::shared_ptr; - namespace fts { void loadStopWordMap( StringMap< std::set< std::string > >* m ); namespace { - StringMap< boost::shared_ptr<StopWords> > STOP_WORDS; + StringMap< std::shared_ptr<StopWords> > StopWordsMap; StopWords empty; } @@ -61,9 +56,9 @@ namespace mongo { _words.insert( *i ); } - const StopWords* StopWords::getStopWords( const FTSLanguage& language ) { - StringMap< boost::shared_ptr<StopWords> >::const_iterator i = STOP_WORDS.find( language.str() ); - if ( i == STOP_WORDS.end() ) + const StopWords* StopWords::getStopWords( const FTSLanguage* language ) { + auto i = StopWordsMap.find( language->str() ); + if ( i == StopWordsMap.end() ) return ∅ return i->second.get(); } @@ -75,7 +70,7 @@ namespace mongo { for ( StringMap< std::set< std::string > >::const_iterator i = raw.begin(); i != raw.end(); ++i ) { - STOP_WORDS[i->first].reset(new StopWords( i->second )); + StopWordsMap[i->first].reset(new StopWords( i->second )); } return Status::OK(); } diff --git a/src/mongo/db/fts/stop_words.h b/src/mongo/db/fts/stop_words.h index 4789535ef4d..d989b4dcd32 100644 --- a/src/mongo/db/fts/stop_words.h +++ b/src/mongo/db/fts/stop_words.h @@ -53,7 +53,7 @@ namespace mongo { size_t numStopWords() const { return _words.size(); } - static const StopWords* getStopWords( const FTSLanguage& language ); + static const StopWords* getStopWords( const FTSLanguage* language ); private: unordered_set<std::string> _words; }; diff --git a/src/mongo/db/fts/stop_words_test.cpp b/src/mongo/db/fts/stop_words_test.cpp index 0edf4e2540c..248c4d93407 100644 --- a/src/mongo/db/fts/stop_words_test.cpp +++ b/src/mongo/db/fts/stop_words_test.cpp @@ -36,7 +36,7 @@ namespace mongo { namespace fts { TEST( English, Basic1 ) { - const StopWords* englishStopWords = StopWords::getStopWords( languageEnglishV2 ); + const StopWords* englishStopWords = StopWords::getStopWords( &languageEnglishV2 ); ASSERT( englishStopWords->isStopWord( "the" ) ); ASSERT( !englishStopWords->isStopWord( "computer" ) ); } diff --git a/src/mongo/db/fts/tokenizer.cpp b/src/mongo/db/fts/tokenizer.cpp index ee60f99d588..6896924ae31 100644 --- a/src/mongo/db/fts/tokenizer.cpp +++ b/src/mongo/db/fts/tokenizer.cpp @@ -38,9 +38,9 @@ namespace mongo { namespace fts { - Tokenizer::Tokenizer( const FTSLanguage& language, StringData str ) + Tokenizer::Tokenizer( const FTSLanguage* language, StringData str ) : _pos(0), _raw( str ) { - _english = ( language.str() == "english" ); + _english = ( language->str() == "english" ); _skipWhitespace(); _previousWhiteSpace = true; } diff --git a/src/mongo/db/fts/tokenizer.h b/src/mongo/db/fts/tokenizer.h index cd0d76a4f70..6e449124b3a 100644 --- a/src/mongo/db/fts/tokenizer.h +++ b/src/mongo/db/fts/tokenizer.h @@ -62,7 +62,7 @@ namespace mongo { MONGO_DISALLOW_COPYING( Tokenizer ); public: - Tokenizer( const FTSLanguage& language, StringData str ); + Tokenizer( const FTSLanguage* language, StringData str ); bool more() const; Token next(); diff --git a/src/mongo/db/fts/tokenizer_test.cpp b/src/mongo/db/fts/tokenizer_test.cpp index 29153a329a6..a6692f3456d 100644 --- a/src/mongo/db/fts/tokenizer_test.cpp +++ b/src/mongo/db/fts/tokenizer_test.cpp @@ -36,12 +36,12 @@ namespace mongo { namespace fts { TEST( Tokenizer, Empty1 ) { - Tokenizer i( languageEnglishV2, "" ); + Tokenizer i( &languageEnglishV2, "" ); ASSERT( !i.more() ); } 
TEST( Tokenizer, Basic1 ) { - Tokenizer i( languageEnglishV2, "blue red green" ); + Tokenizer i( &languageEnglishV2, "blue red green" ); ASSERT( i.more() ); ASSERT_EQUALS( i.next().data.toString(), "blue" ); @@ -56,7 +56,7 @@ namespace mongo { } TEST( Tokenizer, Basic2 ) { - Tokenizer i( languageEnglishV2, "blue-red" ); + Tokenizer i( &languageEnglishV2, "blue-red" ); Token a = i.next(); Token b = i.next(); @@ -78,7 +78,7 @@ namespace mongo { } TEST( Tokenizer, Basic3 ) { - Tokenizer i( languageEnglishV2, "blue -red" ); + Tokenizer i( &languageEnglishV2, "blue -red" ); Token a = i.next(); Token b = i.next(); @@ -105,7 +105,7 @@ namespace mongo { } TEST( Tokenizer, Quote1English ) { - Tokenizer i( languageEnglishV2, "eliot's car" ); + Tokenizer i( &languageEnglishV2, "eliot's car" ); Token a = i.next(); Token b = i.next(); @@ -115,7 +115,7 @@ namespace mongo { } TEST( Tokenizer, Quote1French ) { - Tokenizer i( languageFrenchV2, "eliot's car" ); + Tokenizer i( &languageFrenchV2, "eliot's car" ); Token a = i.next(); Token b = i.next(); |
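
A reading note on the fts_query.cpp hunk above: FTSQuery::_addTerm() now makes up to two passes over each query token. A case-insensitive pass always feeds the index-bounds terms (and, for case-insensitive queries, the matcher terms as well), and a second case-sensitive pass supplies the matcher terms when the query is case sensitive. The sketch below restates that control flow as a standalone function purely for readability; the function name and the std::set parameters are simplifications of the member function and member containers in the patch.

    #include <set>
    #include <string>

    #include "mongo/db/fts/fts_tokenizer.h"

    namespace mongo {
    namespace fts {

        // Condensed restatement of the new FTSQuery::_addTerm() logic.
        void addTerm(FTSTokenizer* tokenizer,
                     const std::string& token,
                     bool negated,
                     bool caseSensitive,
                     std::set<std::string>* positiveTerms,
                     std::set<std::string>* negatedTerms,
                     std::set<std::string>* termsForBounds) {
            std::set<std::string>* activeTerms = negated ? negatedTerms : positiveTerms;

            // Pass 1: lower-cased terms. These always drive index bounds, and for
            // case-insensitive queries they are also what the matcher compares.
            tokenizer->reset(token.c_str(), false /* generateCaseSensitiveTokens */);
            while (tokenizer->moveNext()) {
                std::string word = tokenizer->get().toString();
                if (!negated) {
                    termsForBounds->insert(word);
                }
                if (!caseSensitive) {
                    activeTerms->insert(word);
                }
            }

            if (!caseSensitive) {
                return;
            }

            // Pass 2: case-sensitive queries tokenize again with the original
            // casing so the matcher can require an exact-case match, while the
            // bounds terms above stay lower-cased for the index.
            tokenizer->reset(token.c_str(), true /* generateCaseSensitiveTokens */);
            while (tokenizer->moveNext()) {
                activeTerms->insert(tokenizer->get().toString());
            }
        }

    }  // namespace fts
    }  // namespace mongo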
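
Finally, the virtual FTSLanguage::createTokenizer() factory is the extension point this patch creates: each language subclass decides which FTSTokenizer its text indexes and queries use, and BasicFTSLanguage is simply the first such subclass. The sketch below shows how an alternative tokenizer could be plugged in later; WhitespaceFTSLanguage and WhitespaceFTSTokenizer are hypothetical names invented for illustration and do not exist in this patch (language registration is unchanged and not shown).

    #include <memory>
    #include <string>

    #include "mongo/base/string_data.h"
    #include "mongo/db/fts/fts_language.h"
    #include "mongo/db/fts/fts_tokenizer.h"
    #include "mongo/stdx/memory.h"
    #include "mongo/util/stringutils.h"

    namespace mongo {
    namespace fts {

        // Hypothetical tokenizer: splits on spaces, no stemming or stop-word
        // removal, lower-cases unless case-sensitive tokens are requested.
        class WhitespaceFTSTokenizer : public FTSTokenizer {
        public:
            void reset(const char* document, bool generateCaseSensitiveTokens) override {
                _document = document;
                _pos = 0;
                _caseSensitive = generateCaseSensitiveTokens;
            }

            bool moveNext() override {
                while (_pos < _document.size() && _document[_pos] == ' ') {
                    ++_pos;
                }
                if (_pos == _document.size()) {
                    _token.clear();
                    return false;
                }
                size_t start = _pos;
                while (_pos < _document.size() && _document[_pos] != ' ') {
                    ++_pos;
                }
                _token = _document.substr(start, _pos - start);
                if (!_caseSensitive) {
                    _token = tolowerString(_token);
                }
                return true;
            }

            StringData get() const override {
                return _token;
            }

        private:
            std::string _document;
            size_t _pos = 0;
            bool _caseSensitive = false;
            std::string _token;
        };

        // Hypothetical language subclass wiring the tokenizer in through the
        // factory method this patch adds to FTSLanguage.
        class WhitespaceFTSLanguage : public FTSLanguage {
        public:
            std::unique_ptr<FTSTokenizer> createTokenizer() const override {
                return stdx::make_unique<WhitespaceFTSTokenizer>();
            }
        };

    }  // namespace fts
    }  // namespace mongo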