diff options
Diffstat (limited to 'src/mongo/db/fts/fts_spec.cpp')
-rw-r--r-- | src/mongo/db/fts/fts_spec.cpp | 22 |
1 file changed, 15 insertions, 7 deletions
diff --git a/src/mongo/db/fts/fts_spec.cpp b/src/mongo/db/fts/fts_spec.cpp index 9e68835e83b..fdd9ecf7824 100644 --- a/src/mongo/db/fts/fts_spec.cpp +++ b/src/mongo/db/fts/fts_spec.cpp @@ -33,7 +33,6 @@ #include "mongo/db/field_ref.h" #include "mongo/db/fts/fts_element_iterator.h" -#include "mongo/db/fts/fts_tokenizer.h" #include "mongo/db/fts/fts_util.h" #include "mongo/util/mongoutils/str.h" #include "mongo/util/stringutils.h" @@ -168,12 +167,13 @@ namespace mongo { while ( it.more() ) { FTSIteratorValue val = it.next(); - std::unique_ptr<FTSTokenizer> tokenizer(val._language->createTokenizer()); - _scoreStringV2( tokenizer.get(), val._text, term_freqs, val._weight ); + Stemmer stemmer( *val._language ); + Tools tools( *val._language, &stemmer, StopWords::getStopWords( *val._language ) ); + _scoreStringV2( tools, val._text, term_freqs, val._weight ); } } - void FTSSpec::_scoreStringV2( FTSTokenizer* tokenizer, + void FTSSpec::_scoreStringV2( const Tools& tools, StringData raw, TermFrequencyMap* docScores, double weight ) const { @@ -182,10 +182,18 @@ namespace mongo { unsigned numTokens = 0; - tokenizer->reset(raw.rawData(), false ); + Tokenizer i( tools.language, raw ); + while ( i.more() ) { + Token t = i.next(); + if ( t.type != Token::TEXT ) { + continue; + } - while (tokenizer->moveNext()) { - string term = tokenizer->get().toString(); + string term = tolowerString( t.data ); + if ( tools.stopwords->isStopWord( term ) ) { + continue; + } + term = tools.stemmer->stem( term ); ScoreHelperStruct& data = terms[term]; |