author     Geert Bosch <geert@mongodb.com>  2015-03-30 15:43:54 -0400
committer  Geert Bosch <geert@mongodb.com>  2015-03-30 15:43:54 -0400
commit     edc67399aef9bded106c0196d4af843f23a8acc9 (patch)
tree       dbb5cebdf54af86e9ee28ced798377579f393722
parent     465bb26c0fb0f4731f4dbb5e09e0a791177bbc64 (diff)
download   mongo-edc67399aef9bded106c0196d4af843f23a8acc9.tar.gz
Revert "FTS Tokenizer"
This reverts commit 0bed4262dac849788e6571dc404d5d261b9e1c8c.
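
Note: the hunks below restore the pre-FTSTokenizer call pattern. As a rough illustration (a sketch distilled from the fts_matcher.cpp hunk in this diff, not code added by the commit), the abstraction being reverted consumed terms through the FTSTokenizer interface:

    // Removed by this revert: tokenization behind the FTSTokenizer interface
    // (as used in FTSMatcher::_hasPositiveTerm_string).
    std::unique_ptr<FTSTokenizer> tokenizer(language->createTokenizer());
    tokenizer->reset(raw.c_str(), _query.getCaseSensitive());
    while (tokenizer->moveNext()) {
        std::string word = tokenizer->get().toString();
        // ... look the stemmed word up in the positive/negative term sets ...
    }

whereas the restored code drives Tokenizer, Stemmer, and the stop-word list directly:

    // Restored by this revert: explicit Tokenizer + Stemmer handling.
    Tokenizer i( *language, raw );
    Stemmer stemmer( *language );
    while ( i.more() ) {
        Token t = i.next();
        if ( t.type != Token::TEXT ) {
            continue;
        }
        std::string word = stemmer.stem( _query.normalizeString( t.data ) );
        // ... look the stemmed word up in the positive/negative term sets ...
    }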
-rw-r--r--  src/mongo/db/fts/SConscript                |   1
-rw-r--r--  src/mongo/db/fts/fts_basic_tokenizer.cpp   |  90
-rw-r--r--  src/mongo/db/fts/fts_basic_tokenizer.h     |  79
-rw-r--r--  src/mongo/db/fts/fts_language.cpp          |   6
-rw-r--r--  src/mongo/db/fts/fts_language.h            |  24
-rw-r--r--  src/mongo/db/fts/fts_matcher.cpp           |  31
-rw-r--r--  src/mongo/db/fts/fts_query.cpp             |  70
-rw-r--r--  src/mongo/db/fts/fts_query.h               |   3
-rw-r--r--  src/mongo/db/fts/fts_spec.cpp              |  22
-rw-r--r--  src/mongo/db/fts/fts_spec.h                |   2
-rw-r--r--  src/mongo/db/fts/fts_spec_legacy.cpp       |   6
-rw-r--r--  src/mongo/db/fts/fts_tokenizer.h           |  72
-rw-r--r--  src/mongo/db/fts/stemmer.cpp               |   6
-rw-r--r--  src/mongo/db/fts/stemmer.h                 |   2
-rw-r--r--  src/mongo/db/fts/stemmer_test.cpp          |   4
-rw-r--r--  src/mongo/db/fts/stop_words.cpp            |  15
-rw-r--r--  src/mongo/db/fts/stop_words.h              |   2
-rw-r--r--  src/mongo/db/fts/stop_words_test.cpp       |   2
-rw-r--r--  src/mongo/db/fts/tokenizer.cpp             |   4
-rw-r--r--  src/mongo/db/fts/tokenizer.h               |   2
-rw-r--r--  src/mongo/db/fts/tokenizer_test.cpp        |  12
21 files changed, 100 insertions(+), 355 deletions(-)
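
Besides removing the FTSTokenizer interface, the revert switches the helper classes back to taking FTSLanguage by reference rather than by pointer. A condensed view of the restored declarations, paraphrased from the headers touched below (comments added here for orientation only):

    // stemmer.h: the stemmer is constructed from a language reference again.
    Stemmer( const FTSLanguage& language );

    // stop_words.h: stop-word lookup keys off the language reference.
    static const StopWords* getStopWords( const FTSLanguage& language );

    // tokenizer.h: the low-level tokenizer likewise takes a reference.
    Tokenizer( const FTSLanguage& language, StringData str );

    // fts_language.h: the virtual factory removed together with FTSTokenizer:
    // virtual std::unique_ptr<FTSTokenizer> createTokenizer() const = 0;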
diff --git a/src/mongo/db/fts/SConscript b/src/mongo/db/fts/SConscript
index 5a782014600..6ccc070fd64 100644
--- a/src/mongo/db/fts/SConscript
+++ b/src/mongo/db/fts/SConscript
@@ -33,7 +33,6 @@ baseEnv.Library('base', [
'fts_spec.cpp',
'fts_spec_legacy.cpp',
'fts_language.cpp',
- 'fts_basic_tokenizer.cpp',
'fts_util.cpp',
'fts_element_iterator.cpp',
'stemmer.cpp',
diff --git a/src/mongo/db/fts/fts_basic_tokenizer.cpp b/src/mongo/db/fts/fts_basic_tokenizer.cpp
deleted file mode 100644
index 296f473f144..00000000000
--- a/src/mongo/db/fts/fts_basic_tokenizer.cpp
+++ /dev/null
@@ -1,90 +0,0 @@
-/**
- * Copyright (C) 2015 MongoDB Inc.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License, version 3,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * As a special exception, the copyright holders give permission to link the
- * code of portions of this program with the OpenSSL library under certain
- * conditions as described in each individual source file and distribute
- * linked combinations including the program with the OpenSSL library. You
- * must comply with the GNU Affero General Public License in all respects for
- * all of the code used other than as permitted herein. If you modify file(s)
- * with this exception, you may extend this exception to your version of the
- * file(s), but you are not obligated to do so. If you do not wish to do so,
- * delete this exception statement from your version. If you delete this
- * exception statement from all source files in the program, then also delete
- * it in the license file.
- */
-
-#include "mongo/platform/basic.h"
-
-#include "mongo/db/fts/fts_basic_tokenizer.h"
-
-#include "mongo/db/fts/fts_query.h"
-#include "mongo/db/fts/fts_spec.h"
-#include "mongo/db/fts/stemmer.h"
-#include "mongo/db/fts/stop_words.h"
-#include "mongo/db/fts/tokenizer.h"
-#include "mongo/stdx/memory.h"
-#include "mongo/util/mongoutils/str.h"
-#include "mongo/util/stringutils.h"
-
-namespace mongo {
-namespace fts {
-
- using std::string;
-
- BasicFTSTokenizer::BasicFTSTokenizer(const FTSLanguage* language)
- : _language(language), _stemmer(language), _stopWords(StopWords::getStopWords(language)) {
- }
-
- void BasicFTSTokenizer::reset(const char* document, bool generateCaseSensitiveTokens) {
- _generateCaseSensitiveTokens = generateCaseSensitiveTokens;
- _tokenizer = stdx::make_unique<Tokenizer>(_language, document);
- }
-
- bool BasicFTSTokenizer::moveNext() {
- while (true) {
- bool hasMore = _tokenizer->more();
- if (!hasMore) {
- _stem = "";
- return false;
- }
-
- Token token = _tokenizer->next();
-
- string word = token.data.toString();
-
- word = tolowerString(token.data);
-
- // Stop words are case-sensitive so we need them to be lower cased to check
- // against the stop word list
- if (_stopWords->isStopWord(word)) {
- continue;
- }
-
- if (_generateCaseSensitiveTokens) {
- word = token.data.toString();
- }
-
- _stem = _stemmer.stem(word);
- return true;
- }
- }
-
- StringData BasicFTSTokenizer::get() const {
- return _stem;
- }
-
-} // namespace fts
-} // namespace mongo
diff --git a/src/mongo/db/fts/fts_basic_tokenizer.h b/src/mongo/db/fts/fts_basic_tokenizer.h
deleted file mode 100644
index fd59a4583fc..00000000000
--- a/src/mongo/db/fts/fts_basic_tokenizer.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/**
- * Copyright (C) 2015 MongoDB Inc.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License, version 3,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * As a special exception, the copyright holders give permission to link the
- * code of portions of this program with the OpenSSL library under certain
- * conditions as described in each individual source file and distribute
- * linked combinations including the program with the OpenSSL library. You
- * must comply with the GNU Affero General Public License in all respects for
- * all of the code used other than as permitted herein. If you modify file(s)
- * with this exception, you may extend this exception to your version of the
- * file(s), but you are not obligated to do so. If you do not wish to do so,
- * delete this exception statement from your version. If you delete this
- * exception statement from all source files in the program, then also delete
- * it in the license file.
- */
-
-#pragma once
-
-#include "mongo/base/disallow_copying.h"
-#include "mongo/base/string_data.h"
-#include "mongo/db/fts/fts_tokenizer.h"
-#include "mongo/db/fts/stemmer.h"
-#include "mongo/db/fts/tokenizer.h"
-
-namespace mongo {
-namespace fts {
-
- class FTSLanguage;
- class StopWords;
-
- /**
- * BasicFTSTokenizer
- * A iterator of "documents" where a document contains ASCII space (U+0020) delimited words.
- * Uses
- * - Tokenizer for tokenizing words via ASCII space (ie, U+0020 space).
- * - tolower from the C standard libary to lower letters, ie, it only supports lower casing
- * - ASCII letters (U+0000 - U+007F)
- * - Stemmer (ie, Snowball Stemmer) to stem words.
- * - Embeded stop word lists for each language in StopWord class
- *
- * For each word returns a stem version of a word optimized for full text indexing.
- * Optionally supports returning case sensitive search terms.
- */
- class BasicFTSTokenizer : public FTSTokenizer {
- MONGO_DISALLOW_COPYING(BasicFTSTokenizer);
- public:
- BasicFTSTokenizer(const FTSLanguage* language);
-
- void reset(const char* document, bool generateCaseSensitiveTokens) override;
-
- bool moveNext() override;
-
- StringData get() const override;
-
- private:
- const FTSLanguage* const _language;
- const Stemmer _stemmer;
- const StopWords* const _stopWords;
-
- std::unique_ptr<Tokenizer> _tokenizer;
- bool _generateCaseSensitiveTokens;
-
- std::string _stem;
- };
-
-} // namespace fts
-} // namespace mongo
diff --git a/src/mongo/db/fts/fts_language.cpp b/src/mongo/db/fts/fts_language.cpp
index 9164d157508..edb2b7cf363 100644
--- a/src/mongo/db/fts/fts_language.cpp
+++ b/src/mongo/db/fts/fts_language.cpp
@@ -33,8 +33,6 @@
#include <string>
#include "mongo/base/init.h"
-#include "mongo/db/fts/fts_basic_tokenizer.h"
-#include "mongo/stdx/memory.h"
#include "mongo/util/assert_util.h"
#include "mongo/util/mongoutils/str.h"
#include "mongo/util/string_map.h"
@@ -81,10 +79,6 @@ namespace mongo {
LanguageMapV1 languageMapV1;
}
- std::unique_ptr<FTSTokenizer> BasicFTSLanguage::createTokenizer() const {
- return stdx::make_unique<BasicFTSTokenizer>(this);
- }
-
MONGO_INITIALIZER_GROUP( FTSAllLanguagesRegistered, MONGO_NO_PREREQUISITES,
MONGO_NO_DEPENDENTS );
diff --git a/src/mongo/db/fts/fts_language.h b/src/mongo/db/fts/fts_language.h
index ce45e0b812a..3a9acbbdd94 100644
--- a/src/mongo/db/fts/fts_language.h
+++ b/src/mongo/db/fts/fts_language.h
@@ -39,10 +39,8 @@ namespace mongo {
namespace fts {
- class FTSTokenizer;
-
#define MONGO_FTS_LANGUAGE_DECLARE( language, name, version ) \
- BasicFTSLanguage language; \
+ FTSLanguage language; \
MONGO_INITIALIZER_GENERAL( language, MONGO_NO_PREREQUISITES, \
( "FTSAllLanguagesRegistered" ) ) \
( ::mongo::InitializerContext* context ) { \
@@ -72,8 +70,6 @@ namespace mongo {
/** Create an uninitialized language. */
FTSLanguage();
- virtual ~FTSLanguage() {}
-
/**
* Returns the language as a std::string in canonical form (lowercased English name). It is
* an error to call str() on an uninitialized language.
@@ -81,12 +77,6 @@ namespace mongo {
const std::string& str() const;
/**
- * Returns a new FTSTokenizer instance for this language.
- * Lifetime is scoped to FTSLanguage (which are currently all process lifetime)
- */
- virtual std::unique_ptr<FTSTokenizer> createTokenizer() const = 0;
-
- /**
* Register std::string 'languageName' as a new language with text index version
* 'textIndexVersion'. Saves the resulting language to out-argument 'languageOut'.
* Subsequent calls to FTSLanguage::make() will recognize the newly-registered language
@@ -130,15 +120,9 @@ namespace mongo {
typedef StatusWith<const FTSLanguage*> StatusWithFTSLanguage;
-
- class BasicFTSLanguage : public FTSLanguage {
- public:
- std::unique_ptr<FTSTokenizer> createTokenizer() const override;
- };
-
- extern BasicFTSLanguage languagePorterV1;
- extern BasicFTSLanguage languageEnglishV2;
- extern BasicFTSLanguage languageFrenchV2;
+ extern FTSLanguage languagePorterV1;
+ extern FTSLanguage languageEnglishV2;
+ extern FTSLanguage languageFrenchV2;
}
}
diff --git a/src/mongo/db/fts/fts_matcher.cpp b/src/mongo/db/fts/fts_matcher.cpp
index 634bcf345cd..492dbdf7b7b 100644
--- a/src/mongo/db/fts/fts_matcher.cpp
+++ b/src/mongo/db/fts/fts_matcher.cpp
@@ -31,7 +31,6 @@
#include "mongo/platform/basic.h"
#include "mongo/db/fts/fts_matcher.h"
-#include "mongo/db/fts/fts_tokenizer.h"
#include "mongo/db/fts/fts_element_iterator.h"
#include "mongo/platform/strcasestr.h"
@@ -97,13 +96,15 @@ namespace mongo {
bool FTSMatcher::_hasPositiveTerm_string( const FTSLanguage* language,
const string& raw ) const {
- std::unique_ptr<FTSTokenizer> tokenizer(language->createTokenizer());
-
- tokenizer->reset(raw.c_str(), _query.getCaseSensitive());
-
- while (tokenizer->moveNext()) {
- string word = tokenizer->get().toString();
- if (_query.getPositiveTerms().count(word) > 0) {
+ Tokenizer i( *language, raw );
+ Stemmer stemmer( *language );
+ while ( i.more() ) {
+ Token t = i.next();
+ if ( t.type != Token::TEXT ) {
+ continue;
+ }
+ string word = stemmer.stem( _query.normalizeString( t.data ) );
+ if ( _query.getPositiveTerms().count( word ) > 0 ) {
return true;
}
}
@@ -129,12 +130,14 @@ namespace mongo {
bool FTSMatcher::_hasNegativeTerm_string( const FTSLanguage* language,
const string& raw ) const {
- std::unique_ptr<FTSTokenizer> tokenizer(language->createTokenizer());
-
- tokenizer->reset(raw.c_str(), _query.getCaseSensitive());
-
- while (tokenizer->moveNext()) {
- string word = tokenizer->get().toString();
+ Tokenizer i( *language, raw );
+ Stemmer stemmer( *language );
+ while ( i.more() ) {
+ Token t = i.next();
+ if ( t.type != Token::TEXT ) {
+ continue;
+ }
+ string word = stemmer.stem( _query.normalizeString( t.data ) );
if ( _query.getNegatedTerms().count( word ) > 0 ) {
return true;
}
diff --git a/src/mongo/db/fts/fts_query.cpp b/src/mongo/db/fts/fts_query.cpp
index e05aa5693cc..9088719d11e 100644
--- a/src/mongo/db/fts/fts_query.cpp
+++ b/src/mongo/db/fts/fts_query.cpp
@@ -33,7 +33,6 @@
#include "mongo/db/fts/fts_query.h"
#include "mongo/db/fts/fts_spec.h"
-#include "mongo/db/fts/fts_tokenizer.h"
#include "mongo/db/fts/tokenizer.h"
#include "mongo/util/mongoutils/str.h"
#include "mongo/util/stringutils.h"
@@ -60,14 +59,15 @@ namespace mongo {
_language = swl.getValue();
_caseSensitive = caseSensitive;
- std::unique_ptr<FTSTokenizer> tokenizer(_language->createTokenizer());
+ const StopWords& stopWords = *StopWords::getStopWords( *_language );
+ Stemmer stemmer( *_language );
bool inNegation = false;
bool inPhrase = false;
unsigned quoteOffset = 0;
- Tokenizer i( _language, query );
+ Tokenizer i( *_language, query );
while ( i.more() ) {
Token t = i.next();
@@ -78,7 +78,7 @@ namespace mongo {
// don't add term
}
else {
- _addTerm( tokenizer.get(), s, inNegation );
+ _addTerm( stopWords, stemmer, s, inNegation );
}
if ( inNegation && !inPhrase )
@@ -122,52 +122,44 @@ namespace mongo {
return Status::OK();
}
- void FTSQuery::_addTerm( FTSTokenizer* tokenizer,
+ void FTSQuery::_addTerm( const StopWords& sw,
+ const Stemmer& stemmer,
const string& token,
bool negated ) {
- tokenizer->reset(token.c_str(), false);
-
- auto& activeTerms = negated ? _negatedTerms : _positiveTerms;
-
- // First, get all the terms for indexing, ie, lower cased words
- // If we are case-insensitive, we can also used this for positive, and negative terms
- // Some terms may be expanded into multiple words in some non-English languages
- while (tokenizer->moveNext()) {
-
- string word = tokenizer->get().toString();
-
- if (!negated) {
- _termsForBounds.insert(word);
- }
-
- // Compute the string corresponding to 'token' that will be used for the matcher.
- // For case-insensitive queries, this is the same string as 'boundsTerm' computed
- // above.
- if (!_caseSensitive) {
- activeTerms.insert(word);
- }
- }
-
- if (!_caseSensitive) {
+ // Compute the string corresponding to 'token' that will be used for index bounds
+ // generation.
+ string boundsTerm = tolowerString( token );
+ if ( sw.isStopWord( boundsTerm ) ) {
return;
}
+ boundsTerm = stemmer.stem( boundsTerm );
- tokenizer->reset(token.c_str(), true);
-
- // If we want case-sensitivity, get the case-sensitive token
- while (tokenizer->moveNext()) {
-
- string word = tokenizer->get().toString();
+ // If the lowercased version of 'token' is a not a stop word, 'token' itself should also
+ // not be.
+ dassert( !sw.isStopWord( token ) );
+ if ( !negated ) {
+ _termsForBounds.insert( boundsTerm );
+ }
- activeTerms.insert(word);
+ // Compute the string corresponding to 'token' that will be used for the matcher. For
+ // case-insensitive queries, this is the same string as 'boundsTerm' computed above.
+ // However, for case-sensitive queries we need to re-stem the original token, since
+ // 'boundsTerm' is already lowercased but we need the original casing for an exact
+ // match.
+ const string& matcherTerm = _caseSensitive ? stemmer.stem( token ) : boundsTerm;
+ if ( negated ) {
+ _negatedTerms.insert( matcherTerm );
+ }
+ else {
+ _positiveTerms.insert( matcherTerm );
}
}
- string FTSQuery::normalizeString(StringData str) const {
- if (_caseSensitive) {
+ string FTSQuery::normalizeString( StringData str ) const {
+ if ( _caseSensitive ) {
return str.toString();
}
- return tolowerString(str);
+ return tolowerString( str );
}
namespace {
diff --git a/src/mongo/db/fts/fts_query.h b/src/mongo/db/fts/fts_query.h
index f9ea7f2d1eb..96317c926e5 100644
--- a/src/mongo/db/fts/fts_query.h
+++ b/src/mongo/db/fts/fts_query.h
@@ -79,7 +79,8 @@ namespace mongo {
static const bool caseSensitiveDefault;
private:
- void _addTerm( FTSTokenizer* tokenizer,
+ void _addTerm( const StopWords& sw,
+ const Stemmer& stemmer,
const std::string& token,
bool negated );
diff --git a/src/mongo/db/fts/fts_spec.cpp b/src/mongo/db/fts/fts_spec.cpp
index 9e68835e83b..fdd9ecf7824 100644
--- a/src/mongo/db/fts/fts_spec.cpp
+++ b/src/mongo/db/fts/fts_spec.cpp
@@ -33,7 +33,6 @@
#include "mongo/db/field_ref.h"
#include "mongo/db/fts/fts_element_iterator.h"
-#include "mongo/db/fts/fts_tokenizer.h"
#include "mongo/db/fts/fts_util.h"
#include "mongo/util/mongoutils/str.h"
#include "mongo/util/stringutils.h"
@@ -168,12 +167,13 @@ namespace mongo {
while ( it.more() ) {
FTSIteratorValue val = it.next();
- std::unique_ptr<FTSTokenizer> tokenizer(val._language->createTokenizer());
- _scoreStringV2( tokenizer.get(), val._text, term_freqs, val._weight );
+ Stemmer stemmer( *val._language );
+ Tools tools( *val._language, &stemmer, StopWords::getStopWords( *val._language ) );
+ _scoreStringV2( tools, val._text, term_freqs, val._weight );
}
}
- void FTSSpec::_scoreStringV2( FTSTokenizer* tokenizer,
+ void FTSSpec::_scoreStringV2( const Tools& tools,
StringData raw,
TermFrequencyMap* docScores,
double weight ) const {
@@ -182,10 +182,18 @@ namespace mongo {
unsigned numTokens = 0;
- tokenizer->reset(raw.rawData(), false );
+ Tokenizer i( tools.language, raw );
+ while ( i.more() ) {
+ Token t = i.next();
+ if ( t.type != Token::TEXT ) {
+ continue;
+ }
- while (tokenizer->moveNext()) {
- string term = tokenizer->get().toString();
+ string term = tolowerString( t.data );
+ if ( tools.stopwords->isStopWord( term ) ) {
+ continue;
+ }
+ term = tools.stemmer->stem( term );
ScoreHelperStruct& data = terms[term];
diff --git a/src/mongo/db/fts/fts_spec.h b/src/mongo/db/fts/fts_spec.h
index 0f17d825dcc..c35bc2c9010 100644
--- a/src/mongo/db/fts/fts_spec.h
+++ b/src/mongo/db/fts/fts_spec.h
@@ -119,7 +119,7 @@ namespace mongo {
* Calculate the term scores for 'raw' and update 'term_freqs' with the result. Parses
* 'raw' using 'tools', and weights term scores based on 'weight'.
*/
- void _scoreStringV2( FTSTokenizer* tokenizer,
+ void _scoreStringV2( const Tools& tools,
StringData raw,
TermFrequencyMap* term_freqs,
double weight ) const;
diff --git a/src/mongo/db/fts/fts_spec_legacy.cpp b/src/mongo/db/fts/fts_spec_legacy.cpp
index a2dc1dc2489..69721fe2ae0 100644
--- a/src/mongo/db/fts/fts_spec_legacy.cpp
+++ b/src/mongo/db/fts/fts_spec_legacy.cpp
@@ -73,7 +73,7 @@ namespace mongo {
unsigned numTokens = 0;
- Tokenizer i( &tools.language, raw );
+ Tokenizer i( tools.language, raw );
while ( i.more() ) {
Token t = i.next();
if ( t.type != Token::TEXT )
@@ -162,8 +162,8 @@ namespace mongo {
const FTSLanguage& language = _getLanguageToUseV1( obj );
- Stemmer stemmer(&language);
- Tools tools(language, &stemmer, StopWords::getStopWords( &language ));
+ Stemmer stemmer(language);
+ Tools tools(language, &stemmer, StopWords::getStopWords( language ));
if ( wildcard() ) {
// if * is specified for weight, we can recurse over all fields.
diff --git a/src/mongo/db/fts/fts_tokenizer.h b/src/mongo/db/fts/fts_tokenizer.h
deleted file mode 100644
index 65833aff0cb..00000000000
--- a/src/mongo/db/fts/fts_tokenizer.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/**
- * Copyright (C) 2015 MongoDB Inc.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License, version 3,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * As a special exception, the copyright holders give permission to link the
- * code of portions of this program with the OpenSSL library under certain
- * conditions as described in each individual source file and distribute
- * linked combinations including the program with the OpenSSL library. You
- * must comply with the GNU Affero General Public License in all respects for
- * all of the code used other than as permitted herein. If you modify file(s)
- * with this exception, you may extend this exception to your version of the
- * file(s), but you are not obligated to do so. If you do not wish to do so,
- * delete this exception statement from your version. If you delete this
- * exception statement from all source files in the program, then also delete
- * it in the license file.
- */
-
-
-#pragma once
-
-#include "mongo/base/disallow_copying.h"
-#include "mongo/base/string_data.h"
-
-namespace mongo {
-namespace fts {
-
- class FTSLanguage;
- class StopWords;
-
- /**
- * FTSTokenizer
- * A iterator of "documents" where a document contains space delimited words.
- * For each word returns a stem or lemma version of a word optimized for full text indexing.
- * Optionally supports returning case sensitive search terms.
- */
- class FTSTokenizer {
- public:
- virtual ~FTSTokenizer() = default;
-
- /**
- * Process a new document, and discards any previous results.
- * May be called multiple times on an instance of an iterator.
- */
- virtual void reset(const char* document, bool generateCaseSensitiveTokens) = 0;
-
- /**
- * Moves to the next token in the iterator.
- * Returns false when the iterator reaches end of the document.
- */
- virtual bool moveNext() = 0;
-
- /**
- * Returns stemmed form, normalized, and lowercased depending on the parameter
- * to the reset method.
- * Returned StringData is valid until next call to moveNext().
- */
- virtual StringData get() const = 0;
- };
-
-} // namespace fts
-} // namespace mongo
diff --git a/src/mongo/db/fts/stemmer.cpp b/src/mongo/db/fts/stemmer.cpp
index 9353fccf297..4a734dbe316 100644
--- a/src/mongo/db/fts/stemmer.cpp
+++ b/src/mongo/db/fts/stemmer.cpp
@@ -40,10 +40,10 @@ namespace mongo {
using std::string;
- Stemmer::Stemmer( const FTSLanguage* language ) {
+ Stemmer::Stemmer( const FTSLanguage& language ) {
_stemmer = NULL;
- if ( language->str() != "none" )
- _stemmer = sb_stemmer_new(language->str().c_str(), "UTF_8");
+ if ( language.str() != "none" )
+ _stemmer = sb_stemmer_new(language.str().c_str(), "UTF_8");
}
Stemmer::~Stemmer() {
diff --git a/src/mongo/db/fts/stemmer.h b/src/mongo/db/fts/stemmer.h
index d6d76e64218..6abba8abddc 100644
--- a/src/mongo/db/fts/stemmer.h
+++ b/src/mongo/db/fts/stemmer.h
@@ -49,7 +49,7 @@ namespace mongo {
class Stemmer {
MONGO_DISALLOW_COPYING( Stemmer );
public:
- Stemmer( const FTSLanguage* language );
+ Stemmer( const FTSLanguage& language );
~Stemmer();
std::string stem( StringData word ) const;
diff --git a/src/mongo/db/fts/stemmer_test.cpp b/src/mongo/db/fts/stemmer_test.cpp
index bef556bf2ad..9037715d4da 100644
--- a/src/mongo/db/fts/stemmer_test.cpp
+++ b/src/mongo/db/fts/stemmer_test.cpp
@@ -38,13 +38,13 @@ namespace mongo {
namespace fts {
TEST( English, Stemmer1 ) {
- Stemmer s( &languageEnglishV2 );
+ Stemmer s( languageEnglishV2 );
ASSERT_EQUALS( "run", s.stem( "running" ) );
ASSERT_EQUALS( "Run", s.stem( "Running" ) );
}
TEST( English, Caps ) {
- Stemmer s( &languagePorterV1 );
+ Stemmer s( languagePorterV1 );
ASSERT_EQUALS( "unit", s.stem( "united" ) );
ASSERT_EQUALS( "Unite", s.stem( "United" ) );
}
diff --git a/src/mongo/db/fts/stop_words.cpp b/src/mongo/db/fts/stop_words.cpp
index 421bfae63db..66240a1ce2d 100644
--- a/src/mongo/db/fts/stop_words.cpp
+++ b/src/mongo/db/fts/stop_words.cpp
@@ -28,6 +28,7 @@
* it in the license file.
*/
+#include <boost/shared_ptr.hpp>
#include <set>
#include <string>
@@ -36,14 +37,18 @@
#include "mongo/base/init.h"
#include "mongo/util/string_map.h"
+
+
namespace mongo {
+ using boost::shared_ptr;
+
namespace fts {
void loadStopWordMap( StringMap< std::set< std::string > >* m );
namespace {
- StringMap< std::shared_ptr<StopWords> > StopWordsMap;
+ StringMap< boost::shared_ptr<StopWords> > STOP_WORDS;
StopWords empty;
}
@@ -56,9 +61,9 @@ namespace mongo {
_words.insert( *i );
}
- const StopWords* StopWords::getStopWords( const FTSLanguage* language ) {
- auto i = StopWordsMap.find( language->str() );
- if ( i == StopWordsMap.end() )
+ const StopWords* StopWords::getStopWords( const FTSLanguage& language ) {
+ StringMap< boost::shared_ptr<StopWords> >::const_iterator i = STOP_WORDS.find( language.str() );
+ if ( i == STOP_WORDS.end() )
return &empty;
return i->second.get();
}
@@ -70,7 +75,7 @@ namespace mongo {
for ( StringMap< std::set< std::string > >::const_iterator i = raw.begin();
i != raw.end();
++i ) {
- StopWordsMap[i->first].reset(new StopWords( i->second ));
+ STOP_WORDS[i->first].reset(new StopWords( i->second ));
}
return Status::OK();
}
diff --git a/src/mongo/db/fts/stop_words.h b/src/mongo/db/fts/stop_words.h
index d989b4dcd32..4789535ef4d 100644
--- a/src/mongo/db/fts/stop_words.h
+++ b/src/mongo/db/fts/stop_words.h
@@ -53,7 +53,7 @@ namespace mongo {
size_t numStopWords() const { return _words.size(); }
- static const StopWords* getStopWords( const FTSLanguage* language );
+ static const StopWords* getStopWords( const FTSLanguage& language );
private:
unordered_set<std::string> _words;
};
diff --git a/src/mongo/db/fts/stop_words_test.cpp b/src/mongo/db/fts/stop_words_test.cpp
index 248c4d93407..0edf4e2540c 100644
--- a/src/mongo/db/fts/stop_words_test.cpp
+++ b/src/mongo/db/fts/stop_words_test.cpp
@@ -36,7 +36,7 @@ namespace mongo {
namespace fts {
TEST( English, Basic1 ) {
- const StopWords* englishStopWords = StopWords::getStopWords( &languageEnglishV2 );
+ const StopWords* englishStopWords = StopWords::getStopWords( languageEnglishV2 );
ASSERT( englishStopWords->isStopWord( "the" ) );
ASSERT( !englishStopWords->isStopWord( "computer" ) );
}
diff --git a/src/mongo/db/fts/tokenizer.cpp b/src/mongo/db/fts/tokenizer.cpp
index 6896924ae31..ee60f99d588 100644
--- a/src/mongo/db/fts/tokenizer.cpp
+++ b/src/mongo/db/fts/tokenizer.cpp
@@ -38,9 +38,9 @@ namespace mongo {
namespace fts {
- Tokenizer::Tokenizer( const FTSLanguage* language, StringData str )
+ Tokenizer::Tokenizer( const FTSLanguage& language, StringData str )
: _pos(0), _raw( str ) {
- _english = ( language->str() == "english" );
+ _english = ( language.str() == "english" );
_skipWhitespace();
_previousWhiteSpace = true;
}
diff --git a/src/mongo/db/fts/tokenizer.h b/src/mongo/db/fts/tokenizer.h
index 6e449124b3a..cd0d76a4f70 100644
--- a/src/mongo/db/fts/tokenizer.h
+++ b/src/mongo/db/fts/tokenizer.h
@@ -62,7 +62,7 @@ namespace mongo {
MONGO_DISALLOW_COPYING( Tokenizer );
public:
- Tokenizer( const FTSLanguage* language, StringData str );
+ Tokenizer( const FTSLanguage& language, StringData str );
bool more() const;
Token next();
diff --git a/src/mongo/db/fts/tokenizer_test.cpp b/src/mongo/db/fts/tokenizer_test.cpp
index a6692f3456d..29153a329a6 100644
--- a/src/mongo/db/fts/tokenizer_test.cpp
+++ b/src/mongo/db/fts/tokenizer_test.cpp
@@ -36,12 +36,12 @@ namespace mongo {
namespace fts {
TEST( Tokenizer, Empty1 ) {
- Tokenizer i( &languageEnglishV2, "" );
+ Tokenizer i( languageEnglishV2, "" );
ASSERT( !i.more() );
}
TEST( Tokenizer, Basic1 ) {
- Tokenizer i( &languageEnglishV2, "blue red green" );
+ Tokenizer i( languageEnglishV2, "blue red green" );
ASSERT( i.more() );
ASSERT_EQUALS( i.next().data.toString(), "blue" );
@@ -56,7 +56,7 @@ namespace mongo {
}
TEST( Tokenizer, Basic2 ) {
- Tokenizer i( &languageEnglishV2, "blue-red" );
+ Tokenizer i( languageEnglishV2, "blue-red" );
Token a = i.next();
Token b = i.next();
@@ -78,7 +78,7 @@ namespace mongo {
}
TEST( Tokenizer, Basic3 ) {
- Tokenizer i( &languageEnglishV2, "blue -red" );
+ Tokenizer i( languageEnglishV2, "blue -red" );
Token a = i.next();
Token b = i.next();
@@ -105,7 +105,7 @@ namespace mongo {
}
TEST( Tokenizer, Quote1English ) {
- Tokenizer i( &languageEnglishV2, "eliot's car" );
+ Tokenizer i( languageEnglishV2, "eliot's car" );
Token a = i.next();
Token b = i.next();
@@ -115,7 +115,7 @@ namespace mongo {
}
TEST( Tokenizer, Quote1French ) {
- Tokenizer i( &languageFrenchV2, "eliot's car" );
+ Tokenizer i( languageFrenchV2, "eliot's car" );
Token a = i.next();
Token b = i.next();