diff options
-rw-r--r-- | src/mongo/db/fts/SConscript | 1 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_basic_tokenizer.cpp | 90 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_basic_tokenizer.h | 79 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_language.cpp | 6 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_language.h | 24 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_matcher.cpp | 31 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_query.cpp | 70 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_query.h | 3 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_spec.cpp | 22 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_spec.h | 2 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_spec_legacy.cpp | 6 | ||||
-rw-r--r-- | src/mongo/db/fts/fts_tokenizer.h | 72 | ||||
-rw-r--r-- | src/mongo/db/fts/stemmer.cpp | 6 | ||||
-rw-r--r-- | src/mongo/db/fts/stemmer.h | 2 | ||||
-rw-r--r-- | src/mongo/db/fts/stemmer_test.cpp | 4 | ||||
-rw-r--r-- | src/mongo/db/fts/stop_words.cpp | 15 | ||||
-rw-r--r-- | src/mongo/db/fts/stop_words.h | 2 | ||||
-rw-r--r-- | src/mongo/db/fts/stop_words_test.cpp | 2 | ||||
-rw-r--r-- | src/mongo/db/fts/tokenizer.cpp | 4 | ||||
-rw-r--r-- | src/mongo/db/fts/tokenizer.h | 2 | ||||
-rw-r--r-- | src/mongo/db/fts/tokenizer_test.cpp | 12 |
21 files changed, 100 insertions, 355 deletions
diff --git a/src/mongo/db/fts/SConscript b/src/mongo/db/fts/SConscript index 5a782014600..6ccc070fd64 100644 --- a/src/mongo/db/fts/SConscript +++ b/src/mongo/db/fts/SConscript @@ -33,7 +33,6 @@ baseEnv.Library('base', [ 'fts_spec.cpp', 'fts_spec_legacy.cpp', 'fts_language.cpp', - 'fts_basic_tokenizer.cpp', 'fts_util.cpp', 'fts_element_iterator.cpp', 'stemmer.cpp', diff --git a/src/mongo/db/fts/fts_basic_tokenizer.cpp b/src/mongo/db/fts/fts_basic_tokenizer.cpp deleted file mode 100644 index 296f473f144..00000000000 --- a/src/mongo/db/fts/fts_basic_tokenizer.cpp +++ /dev/null @@ -1,90 +0,0 @@ -/** - * Copyright (C) 2015 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#include "mongo/platform/basic.h" - -#include "mongo/db/fts/fts_basic_tokenizer.h" - -#include "mongo/db/fts/fts_query.h" -#include "mongo/db/fts/fts_spec.h" -#include "mongo/db/fts/stemmer.h" -#include "mongo/db/fts/stop_words.h" -#include "mongo/db/fts/tokenizer.h" -#include "mongo/stdx/memory.h" -#include "mongo/util/mongoutils/str.h" -#include "mongo/util/stringutils.h" - -namespace mongo { -namespace fts { - - using std::string; - - BasicFTSTokenizer::BasicFTSTokenizer(const FTSLanguage* language) - : _language(language), _stemmer(language), _stopWords(StopWords::getStopWords(language)) { - } - - void BasicFTSTokenizer::reset(const char* document, bool generateCaseSensitiveTokens) { - _generateCaseSensitiveTokens = generateCaseSensitiveTokens; - _tokenizer = stdx::make_unique<Tokenizer>(_language, document); - } - - bool BasicFTSTokenizer::moveNext() { - while (true) { - bool hasMore = _tokenizer->more(); - if (!hasMore) { - _stem = ""; - return false; - } - - Token token = _tokenizer->next(); - - string word = token.data.toString(); - - word = tolowerString(token.data); - - // Stop words are case-sensitive so we need them to be lower cased to check - // against the stop word list - if (_stopWords->isStopWord(word)) { - continue; - } - - if (_generateCaseSensitiveTokens) { - word = token.data.toString(); - } - - _stem = _stemmer.stem(word); - return true; - } - } - - StringData BasicFTSTokenizer::get() const { - return _stem; - } - -} // namespace fts -} // namespace mongo diff --git a/src/mongo/db/fts/fts_basic_tokenizer.h b/src/mongo/db/fts/fts_basic_tokenizer.h deleted file mode 100644 index fd59a4583fc..00000000000 --- a/src/mongo/db/fts/fts_basic_tokenizer.h +++ /dev/null @@ -1,79 +0,0 @@ -/** - * Copyright (C) 2015 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#pragma once - -#include "mongo/base/disallow_copying.h" -#include "mongo/base/string_data.h" -#include "mongo/db/fts/fts_tokenizer.h" -#include "mongo/db/fts/stemmer.h" -#include "mongo/db/fts/tokenizer.h" - -namespace mongo { -namespace fts { - - class FTSLanguage; - class StopWords; - - /** - * BasicFTSTokenizer - * A iterator of "documents" where a document contains ASCII space (U+0020) delimited words. - * Uses - * - Tokenizer for tokenizing words via ASCII space (ie, U+0020 space). - * - tolower from the C standard libary to lower letters, ie, it only supports lower casing - * - ASCII letters (U+0000 - U+007F) - * - Stemmer (ie, Snowball Stemmer) to stem words. - * - Embeded stop word lists for each language in StopWord class - * - * For each word returns a stem version of a word optimized for full text indexing. - * Optionally supports returning case sensitive search terms. - */ - class BasicFTSTokenizer : public FTSTokenizer { - MONGO_DISALLOW_COPYING(BasicFTSTokenizer); - public: - BasicFTSTokenizer(const FTSLanguage* language); - - void reset(const char* document, bool generateCaseSensitiveTokens) override; - - bool moveNext() override; - - StringData get() const override; - - private: - const FTSLanguage* const _language; - const Stemmer _stemmer; - const StopWords* const _stopWords; - - std::unique_ptr<Tokenizer> _tokenizer; - bool _generateCaseSensitiveTokens; - - std::string _stem; - }; - -} // namespace fts -} // namespace mongo diff --git a/src/mongo/db/fts/fts_language.cpp b/src/mongo/db/fts/fts_language.cpp index 9164d157508..edb2b7cf363 100644 --- a/src/mongo/db/fts/fts_language.cpp +++ b/src/mongo/db/fts/fts_language.cpp @@ -33,8 +33,6 @@ #include <string> #include "mongo/base/init.h" -#include "mongo/db/fts/fts_basic_tokenizer.h" -#include "mongo/stdx/memory.h" #include "mongo/util/assert_util.h" #include "mongo/util/mongoutils/str.h" #include "mongo/util/string_map.h" @@ -81,10 +79,6 @@ namespace mongo { LanguageMapV1 languageMapV1; } - std::unique_ptr<FTSTokenizer> BasicFTSLanguage::createTokenizer() const { - return stdx::make_unique<BasicFTSTokenizer>(this); - } - MONGO_INITIALIZER_GROUP( FTSAllLanguagesRegistered, MONGO_NO_PREREQUISITES, MONGO_NO_DEPENDENTS ); diff --git a/src/mongo/db/fts/fts_language.h b/src/mongo/db/fts/fts_language.h index ce45e0b812a..3a9acbbdd94 100644 --- a/src/mongo/db/fts/fts_language.h +++ b/src/mongo/db/fts/fts_language.h @@ -39,10 +39,8 @@ namespace mongo { namespace fts { - class FTSTokenizer; - #define MONGO_FTS_LANGUAGE_DECLARE( language, name, version ) \ - BasicFTSLanguage language; \ + FTSLanguage language; \ MONGO_INITIALIZER_GENERAL( language, MONGO_NO_PREREQUISITES, \ ( "FTSAllLanguagesRegistered" ) ) \ ( ::mongo::InitializerContext* context ) { \ @@ -72,8 +70,6 @@ namespace mongo { /** Create an uninitialized language. */ FTSLanguage(); - virtual ~FTSLanguage() {} - /** * Returns the language as a std::string in canonical form (lowercased English name). It is * an error to call str() on an uninitialized language. @@ -81,12 +77,6 @@ namespace mongo { const std::string& str() const; /** - * Returns a new FTSTokenizer instance for this language. - * Lifetime is scoped to FTSLanguage (which are currently all process lifetime) - */ - virtual std::unique_ptr<FTSTokenizer> createTokenizer() const = 0; - - /** * Register std::string 'languageName' as a new language with text index version * 'textIndexVersion'. Saves the resulting language to out-argument 'languageOut'. * Subsequent calls to FTSLanguage::make() will recognize the newly-registered language @@ -130,15 +120,9 @@ namespace mongo { typedef StatusWith<const FTSLanguage*> StatusWithFTSLanguage; - - class BasicFTSLanguage : public FTSLanguage { - public: - std::unique_ptr<FTSTokenizer> createTokenizer() const override; - }; - - extern BasicFTSLanguage languagePorterV1; - extern BasicFTSLanguage languageEnglishV2; - extern BasicFTSLanguage languageFrenchV2; + extern FTSLanguage languagePorterV1; + extern FTSLanguage languageEnglishV2; + extern FTSLanguage languageFrenchV2; } } diff --git a/src/mongo/db/fts/fts_matcher.cpp b/src/mongo/db/fts/fts_matcher.cpp index 634bcf345cd..492dbdf7b7b 100644 --- a/src/mongo/db/fts/fts_matcher.cpp +++ b/src/mongo/db/fts/fts_matcher.cpp @@ -31,7 +31,6 @@ #include "mongo/platform/basic.h" #include "mongo/db/fts/fts_matcher.h" -#include "mongo/db/fts/fts_tokenizer.h" #include "mongo/db/fts/fts_element_iterator.h" #include "mongo/platform/strcasestr.h" @@ -97,13 +96,15 @@ namespace mongo { bool FTSMatcher::_hasPositiveTerm_string( const FTSLanguage* language, const string& raw ) const { - std::unique_ptr<FTSTokenizer> tokenizer(language->createTokenizer()); - - tokenizer->reset(raw.c_str(), _query.getCaseSensitive()); - - while (tokenizer->moveNext()) { - string word = tokenizer->get().toString(); - if (_query.getPositiveTerms().count(word) > 0) { + Tokenizer i( *language, raw ); + Stemmer stemmer( *language ); + while ( i.more() ) { + Token t = i.next(); + if ( t.type != Token::TEXT ) { + continue; + } + string word = stemmer.stem( _query.normalizeString( t.data ) ); + if ( _query.getPositiveTerms().count( word ) > 0 ) { return true; } } @@ -129,12 +130,14 @@ namespace mongo { bool FTSMatcher::_hasNegativeTerm_string( const FTSLanguage* language, const string& raw ) const { - std::unique_ptr<FTSTokenizer> tokenizer(language->createTokenizer()); - - tokenizer->reset(raw.c_str(), _query.getCaseSensitive()); - - while (tokenizer->moveNext()) { - string word = tokenizer->get().toString(); + Tokenizer i( *language, raw ); + Stemmer stemmer( *language ); + while ( i.more() ) { + Token t = i.next(); + if ( t.type != Token::TEXT ) { + continue; + } + string word = stemmer.stem( _query.normalizeString( t.data ) ); if ( _query.getNegatedTerms().count( word ) > 0 ) { return true; } diff --git a/src/mongo/db/fts/fts_query.cpp b/src/mongo/db/fts/fts_query.cpp index e05aa5693cc..9088719d11e 100644 --- a/src/mongo/db/fts/fts_query.cpp +++ b/src/mongo/db/fts/fts_query.cpp @@ -33,7 +33,6 @@ #include "mongo/db/fts/fts_query.h" #include "mongo/db/fts/fts_spec.h" -#include "mongo/db/fts/fts_tokenizer.h" #include "mongo/db/fts/tokenizer.h" #include "mongo/util/mongoutils/str.h" #include "mongo/util/stringutils.h" @@ -60,14 +59,15 @@ namespace mongo { _language = swl.getValue(); _caseSensitive = caseSensitive; - std::unique_ptr<FTSTokenizer> tokenizer(_language->createTokenizer()); + const StopWords& stopWords = *StopWords::getStopWords( *_language ); + Stemmer stemmer( *_language ); bool inNegation = false; bool inPhrase = false; unsigned quoteOffset = 0; - Tokenizer i( _language, query ); + Tokenizer i( *_language, query ); while ( i.more() ) { Token t = i.next(); @@ -78,7 +78,7 @@ namespace mongo { // don't add term } else { - _addTerm( tokenizer.get(), s, inNegation ); + _addTerm( stopWords, stemmer, s, inNegation ); } if ( inNegation && !inPhrase ) @@ -122,52 +122,44 @@ namespace mongo { return Status::OK(); } - void FTSQuery::_addTerm( FTSTokenizer* tokenizer, + void FTSQuery::_addTerm( const StopWords& sw, + const Stemmer& stemmer, const string& token, bool negated ) { - tokenizer->reset(token.c_str(), false); - - auto& activeTerms = negated ? _negatedTerms : _positiveTerms; - - // First, get all the terms for indexing, ie, lower cased words - // If we are case-insensitive, we can also used this for positive, and negative terms - // Some terms may be expanded into multiple words in some non-English languages - while (tokenizer->moveNext()) { - - string word = tokenizer->get().toString(); - - if (!negated) { - _termsForBounds.insert(word); - } - - // Compute the string corresponding to 'token' that will be used for the matcher. - // For case-insensitive queries, this is the same string as 'boundsTerm' computed - // above. - if (!_caseSensitive) { - activeTerms.insert(word); - } - } - - if (!_caseSensitive) { + // Compute the string corresponding to 'token' that will be used for index bounds + // generation. + string boundsTerm = tolowerString( token ); + if ( sw.isStopWord( boundsTerm ) ) { return; } + boundsTerm = stemmer.stem( boundsTerm ); - tokenizer->reset(token.c_str(), true); - - // If we want case-sensitivity, get the case-sensitive token - while (tokenizer->moveNext()) { - - string word = tokenizer->get().toString(); + // If the lowercased version of 'token' is a not a stop word, 'token' itself should also + // not be. + dassert( !sw.isStopWord( token ) ); + if ( !negated ) { + _termsForBounds.insert( boundsTerm ); + } - activeTerms.insert(word); + // Compute the string corresponding to 'token' that will be used for the matcher. For + // case-insensitive queries, this is the same string as 'boundsTerm' computed above. + // However, for case-sensitive queries we need to re-stem the original token, since + // 'boundsTerm' is already lowercased but we need the original casing for an exact + // match. + const string& matcherTerm = _caseSensitive ? stemmer.stem( token ) : boundsTerm; + if ( negated ) { + _negatedTerms.insert( matcherTerm ); + } + else { + _positiveTerms.insert( matcherTerm ); } } - string FTSQuery::normalizeString(StringData str) const { - if (_caseSensitive) { + string FTSQuery::normalizeString( StringData str ) const { + if ( _caseSensitive ) { return str.toString(); } - return tolowerString(str); + return tolowerString( str ); } namespace { diff --git a/src/mongo/db/fts/fts_query.h b/src/mongo/db/fts/fts_query.h index f9ea7f2d1eb..96317c926e5 100644 --- a/src/mongo/db/fts/fts_query.h +++ b/src/mongo/db/fts/fts_query.h @@ -79,7 +79,8 @@ namespace mongo { static const bool caseSensitiveDefault; private: - void _addTerm( FTSTokenizer* tokenizer, + void _addTerm( const StopWords& sw, + const Stemmer& stemmer, const std::string& token, bool negated ); diff --git a/src/mongo/db/fts/fts_spec.cpp b/src/mongo/db/fts/fts_spec.cpp index 9e68835e83b..fdd9ecf7824 100644 --- a/src/mongo/db/fts/fts_spec.cpp +++ b/src/mongo/db/fts/fts_spec.cpp @@ -33,7 +33,6 @@ #include "mongo/db/field_ref.h" #include "mongo/db/fts/fts_element_iterator.h" -#include "mongo/db/fts/fts_tokenizer.h" #include "mongo/db/fts/fts_util.h" #include "mongo/util/mongoutils/str.h" #include "mongo/util/stringutils.h" @@ -168,12 +167,13 @@ namespace mongo { while ( it.more() ) { FTSIteratorValue val = it.next(); - std::unique_ptr<FTSTokenizer> tokenizer(val._language->createTokenizer()); - _scoreStringV2( tokenizer.get(), val._text, term_freqs, val._weight ); + Stemmer stemmer( *val._language ); + Tools tools( *val._language, &stemmer, StopWords::getStopWords( *val._language ) ); + _scoreStringV2( tools, val._text, term_freqs, val._weight ); } } - void FTSSpec::_scoreStringV2( FTSTokenizer* tokenizer, + void FTSSpec::_scoreStringV2( const Tools& tools, StringData raw, TermFrequencyMap* docScores, double weight ) const { @@ -182,10 +182,18 @@ namespace mongo { unsigned numTokens = 0; - tokenizer->reset(raw.rawData(), false ); + Tokenizer i( tools.language, raw ); + while ( i.more() ) { + Token t = i.next(); + if ( t.type != Token::TEXT ) { + continue; + } - while (tokenizer->moveNext()) { - string term = tokenizer->get().toString(); + string term = tolowerString( t.data ); + if ( tools.stopwords->isStopWord( term ) ) { + continue; + } + term = tools.stemmer->stem( term ); ScoreHelperStruct& data = terms[term]; diff --git a/src/mongo/db/fts/fts_spec.h b/src/mongo/db/fts/fts_spec.h index 0f17d825dcc..c35bc2c9010 100644 --- a/src/mongo/db/fts/fts_spec.h +++ b/src/mongo/db/fts/fts_spec.h @@ -119,7 +119,7 @@ namespace mongo { * Calculate the term scores for 'raw' and update 'term_freqs' with the result. Parses * 'raw' using 'tools', and weights term scores based on 'weight'. */ - void _scoreStringV2( FTSTokenizer* tokenizer, + void _scoreStringV2( const Tools& tools, StringData raw, TermFrequencyMap* term_freqs, double weight ) const; diff --git a/src/mongo/db/fts/fts_spec_legacy.cpp b/src/mongo/db/fts/fts_spec_legacy.cpp index a2dc1dc2489..69721fe2ae0 100644 --- a/src/mongo/db/fts/fts_spec_legacy.cpp +++ b/src/mongo/db/fts/fts_spec_legacy.cpp @@ -73,7 +73,7 @@ namespace mongo { unsigned numTokens = 0; - Tokenizer i( &tools.language, raw ); + Tokenizer i( tools.language, raw ); while ( i.more() ) { Token t = i.next(); if ( t.type != Token::TEXT ) @@ -162,8 +162,8 @@ namespace mongo { const FTSLanguage& language = _getLanguageToUseV1( obj ); - Stemmer stemmer(&language); - Tools tools(language, &stemmer, StopWords::getStopWords( &language )); + Stemmer stemmer(language); + Tools tools(language, &stemmer, StopWords::getStopWords( language )); if ( wildcard() ) { // if * is specified for weight, we can recurse over all fields. diff --git a/src/mongo/db/fts/fts_tokenizer.h b/src/mongo/db/fts/fts_tokenizer.h deleted file mode 100644 index 65833aff0cb..00000000000 --- a/src/mongo/db/fts/fts_tokenizer.h +++ /dev/null @@ -1,72 +0,0 @@ -/** - * Copyright (C) 2015 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - - -#pragma once - -#include "mongo/base/disallow_copying.h" -#include "mongo/base/string_data.h" - -namespace mongo { -namespace fts { - - class FTSLanguage; - class StopWords; - - /** - * FTSTokenizer - * A iterator of "documents" where a document contains space delimited words. - * For each word returns a stem or lemma version of a word optimized for full text indexing. - * Optionally supports returning case sensitive search terms. - */ - class FTSTokenizer { - public: - virtual ~FTSTokenizer() = default; - - /** - * Process a new document, and discards any previous results. - * May be called multiple times on an instance of an iterator. - */ - virtual void reset(const char* document, bool generateCaseSensitiveTokens) = 0; - - /** - * Moves to the next token in the iterator. - * Returns false when the iterator reaches end of the document. - */ - virtual bool moveNext() = 0; - - /** - * Returns stemmed form, normalized, and lowercased depending on the parameter - * to the reset method. - * Returned StringData is valid until next call to moveNext(). - */ - virtual StringData get() const = 0; - }; - -} // namespace fts -} // namespace mongo diff --git a/src/mongo/db/fts/stemmer.cpp b/src/mongo/db/fts/stemmer.cpp index 9353fccf297..4a734dbe316 100644 --- a/src/mongo/db/fts/stemmer.cpp +++ b/src/mongo/db/fts/stemmer.cpp @@ -40,10 +40,10 @@ namespace mongo { using std::string; - Stemmer::Stemmer( const FTSLanguage* language ) { + Stemmer::Stemmer( const FTSLanguage& language ) { _stemmer = NULL; - if ( language->str() != "none" ) - _stemmer = sb_stemmer_new(language->str().c_str(), "UTF_8"); + if ( language.str() != "none" ) + _stemmer = sb_stemmer_new(language.str().c_str(), "UTF_8"); } Stemmer::~Stemmer() { diff --git a/src/mongo/db/fts/stemmer.h b/src/mongo/db/fts/stemmer.h index d6d76e64218..6abba8abddc 100644 --- a/src/mongo/db/fts/stemmer.h +++ b/src/mongo/db/fts/stemmer.h @@ -49,7 +49,7 @@ namespace mongo { class Stemmer { MONGO_DISALLOW_COPYING( Stemmer ); public: - Stemmer( const FTSLanguage* language ); + Stemmer( const FTSLanguage& language ); ~Stemmer(); std::string stem( StringData word ) const; diff --git a/src/mongo/db/fts/stemmer_test.cpp b/src/mongo/db/fts/stemmer_test.cpp index bef556bf2ad..9037715d4da 100644 --- a/src/mongo/db/fts/stemmer_test.cpp +++ b/src/mongo/db/fts/stemmer_test.cpp @@ -38,13 +38,13 @@ namespace mongo { namespace fts { TEST( English, Stemmer1 ) { - Stemmer s( &languageEnglishV2 ); + Stemmer s( languageEnglishV2 ); ASSERT_EQUALS( "run", s.stem( "running" ) ); ASSERT_EQUALS( "Run", s.stem( "Running" ) ); } TEST( English, Caps ) { - Stemmer s( &languagePorterV1 ); + Stemmer s( languagePorterV1 ); ASSERT_EQUALS( "unit", s.stem( "united" ) ); ASSERT_EQUALS( "Unite", s.stem( "United" ) ); } diff --git a/src/mongo/db/fts/stop_words.cpp b/src/mongo/db/fts/stop_words.cpp index 421bfae63db..66240a1ce2d 100644 --- a/src/mongo/db/fts/stop_words.cpp +++ b/src/mongo/db/fts/stop_words.cpp @@ -28,6 +28,7 @@ * it in the license file. */ +#include <boost/shared_ptr.hpp> #include <set> #include <string> @@ -36,14 +37,18 @@ #include "mongo/base/init.h" #include "mongo/util/string_map.h" + + namespace mongo { + using boost::shared_ptr; + namespace fts { void loadStopWordMap( StringMap< std::set< std::string > >* m ); namespace { - StringMap< std::shared_ptr<StopWords> > StopWordsMap; + StringMap< boost::shared_ptr<StopWords> > STOP_WORDS; StopWords empty; } @@ -56,9 +61,9 @@ namespace mongo { _words.insert( *i ); } - const StopWords* StopWords::getStopWords( const FTSLanguage* language ) { - auto i = StopWordsMap.find( language->str() ); - if ( i == StopWordsMap.end() ) + const StopWords* StopWords::getStopWords( const FTSLanguage& language ) { + StringMap< boost::shared_ptr<StopWords> >::const_iterator i = STOP_WORDS.find( language.str() ); + if ( i == STOP_WORDS.end() ) return ∅ return i->second.get(); } @@ -70,7 +75,7 @@ namespace mongo { for ( StringMap< std::set< std::string > >::const_iterator i = raw.begin(); i != raw.end(); ++i ) { - StopWordsMap[i->first].reset(new StopWords( i->second )); + STOP_WORDS[i->first].reset(new StopWords( i->second )); } return Status::OK(); } diff --git a/src/mongo/db/fts/stop_words.h b/src/mongo/db/fts/stop_words.h index d989b4dcd32..4789535ef4d 100644 --- a/src/mongo/db/fts/stop_words.h +++ b/src/mongo/db/fts/stop_words.h @@ -53,7 +53,7 @@ namespace mongo { size_t numStopWords() const { return _words.size(); } - static const StopWords* getStopWords( const FTSLanguage* language ); + static const StopWords* getStopWords( const FTSLanguage& language ); private: unordered_set<std::string> _words; }; diff --git a/src/mongo/db/fts/stop_words_test.cpp b/src/mongo/db/fts/stop_words_test.cpp index 248c4d93407..0edf4e2540c 100644 --- a/src/mongo/db/fts/stop_words_test.cpp +++ b/src/mongo/db/fts/stop_words_test.cpp @@ -36,7 +36,7 @@ namespace mongo { namespace fts { TEST( English, Basic1 ) { - const StopWords* englishStopWords = StopWords::getStopWords( &languageEnglishV2 ); + const StopWords* englishStopWords = StopWords::getStopWords( languageEnglishV2 ); ASSERT( englishStopWords->isStopWord( "the" ) ); ASSERT( !englishStopWords->isStopWord( "computer" ) ); } diff --git a/src/mongo/db/fts/tokenizer.cpp b/src/mongo/db/fts/tokenizer.cpp index 6896924ae31..ee60f99d588 100644 --- a/src/mongo/db/fts/tokenizer.cpp +++ b/src/mongo/db/fts/tokenizer.cpp @@ -38,9 +38,9 @@ namespace mongo { namespace fts { - Tokenizer::Tokenizer( const FTSLanguage* language, StringData str ) + Tokenizer::Tokenizer( const FTSLanguage& language, StringData str ) : _pos(0), _raw( str ) { - _english = ( language->str() == "english" ); + _english = ( language.str() == "english" ); _skipWhitespace(); _previousWhiteSpace = true; } diff --git a/src/mongo/db/fts/tokenizer.h b/src/mongo/db/fts/tokenizer.h index 6e449124b3a..cd0d76a4f70 100644 --- a/src/mongo/db/fts/tokenizer.h +++ b/src/mongo/db/fts/tokenizer.h @@ -62,7 +62,7 @@ namespace mongo { MONGO_DISALLOW_COPYING( Tokenizer ); public: - Tokenizer( const FTSLanguage* language, StringData str ); + Tokenizer( const FTSLanguage& language, StringData str ); bool more() const; Token next(); diff --git a/src/mongo/db/fts/tokenizer_test.cpp b/src/mongo/db/fts/tokenizer_test.cpp index a6692f3456d..29153a329a6 100644 --- a/src/mongo/db/fts/tokenizer_test.cpp +++ b/src/mongo/db/fts/tokenizer_test.cpp @@ -36,12 +36,12 @@ namespace mongo { namespace fts { TEST( Tokenizer, Empty1 ) { - Tokenizer i( &languageEnglishV2, "" ); + Tokenizer i( languageEnglishV2, "" ); ASSERT( !i.more() ); } TEST( Tokenizer, Basic1 ) { - Tokenizer i( &languageEnglishV2, "blue red green" ); + Tokenizer i( languageEnglishV2, "blue red green" ); ASSERT( i.more() ); ASSERT_EQUALS( i.next().data.toString(), "blue" ); @@ -56,7 +56,7 @@ namespace mongo { } TEST( Tokenizer, Basic2 ) { - Tokenizer i( &languageEnglishV2, "blue-red" ); + Tokenizer i( languageEnglishV2, "blue-red" ); Token a = i.next(); Token b = i.next(); @@ -78,7 +78,7 @@ namespace mongo { } TEST( Tokenizer, Basic3 ) { - Tokenizer i( &languageEnglishV2, "blue -red" ); + Tokenizer i( languageEnglishV2, "blue -red" ); Token a = i.next(); Token b = i.next(); @@ -105,7 +105,7 @@ namespace mongo { } TEST( Tokenizer, Quote1English ) { - Tokenizer i( &languageEnglishV2, "eliot's car" ); + Tokenizer i( languageEnglishV2, "eliot's car" ); Token a = i.next(); Token b = i.next(); @@ -115,7 +115,7 @@ namespace mongo { } TEST( Tokenizer, Quote1French ) { - Tokenizer i( &languageFrenchV2, "eliot's car" ); + Tokenizer i( languageFrenchV2, "eliot's car" ); Token a = i.next(); Token b = i.next(); |