diff options
author | Mark Benvenuto <mark.benvenuto@mongodb.com> | 2015-04-01 14:34:39 -0400 |
---|---|---|
committer | Mark Benvenuto <mark.benvenuto@mongodb.com> | 2015-04-01 14:58:13 -0400 |
commit | 72598f750d732c08c98f5f578bf1335acd78e10e (patch) | |
tree | d80364b07b25210f5724ba6e6506650be657c74e /src/mongo/db/fts/fts_spec.cpp | |
parent | 3cf0c18aa2c56949fda47ab35570489d68965370 (diff) | |
download | mongo-72598f750d732c08c98f5f578bf1335acd78e10e.tar.gz |
SERVER-17520: Add support for pluggable FTS tokenizers
Diffstat (limited to 'src/mongo/db/fts/fts_spec.cpp')
-rw-r--r-- | src/mongo/db/fts/fts_spec.cpp | 22 |
1 file changed, 7 insertions, 15 deletions
```diff
diff --git a/src/mongo/db/fts/fts_spec.cpp b/src/mongo/db/fts/fts_spec.cpp
index fdd9ecf7824..9e68835e83b 100644
--- a/src/mongo/db/fts/fts_spec.cpp
+++ b/src/mongo/db/fts/fts_spec.cpp
@@ -33,6 +33,7 @@
 
 #include "mongo/db/field_ref.h"
 #include "mongo/db/fts/fts_element_iterator.h"
+#include "mongo/db/fts/fts_tokenizer.h"
 #include "mongo/db/fts/fts_util.h"
 #include "mongo/util/mongoutils/str.h"
 #include "mongo/util/stringutils.h"
@@ -167,13 +168,12 @@ namespace mongo {
 
             while ( it.more() ) {
                 FTSIteratorValue val = it.next();
-                Stemmer stemmer( *val._language );
-                Tools tools( *val._language, &stemmer, StopWords::getStopWords( *val._language ) );
-                _scoreStringV2( tools, val._text, term_freqs, val._weight );
+                std::unique_ptr<FTSTokenizer> tokenizer(val._language->createTokenizer());
+                _scoreStringV2( tokenizer.get(), val._text, term_freqs, val._weight );
             }
         }
 
-        void FTSSpec::_scoreStringV2( const Tools& tools,
+        void FTSSpec::_scoreStringV2( FTSTokenizer* tokenizer,
                                       StringData raw,
                                       TermFrequencyMap* docScores,
                                       double weight ) const {
@@ -182,18 +182,10 @@ namespace mongo {
 
             unsigned numTokens = 0;
 
-            Tokenizer i( tools.language, raw );
-            while ( i.more() ) {
-                Token t = i.next();
-                if ( t.type != Token::TEXT ) {
-                    continue;
-                }
+            tokenizer->reset(raw.rawData(), false );
 
-                string term = tolowerString( t.data );
-                if ( tools.stopwords->isStopWord( term ) ) {
-                    continue;
-                }
-                term = tools.stemmer->stem( term );
+            while (tokenizer->moveNext()) {
+                string term = tokenizer->get().toString();
 
                 ScoreHelperStruct& data = terms[term];
 
```