summaryrefslogtreecommitdiff
path: root/src/mongo
diff options
context:
space:
mode:
authorMark Benvenuto <mark.benvenuto@mongodb.com>2015-04-01 14:34:39 -0400
committerMark Benvenuto <mark.benvenuto@mongodb.com>2015-04-01 14:58:13 -0400
commit72598f750d732c08c98f5f578bf1335acd78e10e (patch)
treed80364b07b25210f5724ba6e6506650be657c74e /src/mongo
parent3cf0c18aa2c56949fda47ab35570489d68965370 (diff)
downloadmongo-72598f750d732c08c98f5f578bf1335acd78e10e.tar.gz
SERVER-17520: Add support for pluggable FTS tokenizers
Diffstat (limited to 'src/mongo')
-rw-r--r--src/mongo/db/fts/SConscript1
-rw-r--r--src/mongo/db/fts/fts_basic_tokenizer.cpp90
-rw-r--r--src/mongo/db/fts/fts_basic_tokenizer.h79
-rw-r--r--src/mongo/db/fts/fts_language.cpp6
-rw-r--r--src/mongo/db/fts/fts_language.h24
-rw-r--r--src/mongo/db/fts/fts_matcher.cpp31
-rw-r--r--src/mongo/db/fts/fts_query.cpp70
-rw-r--r--src/mongo/db/fts/fts_query.h3
-rw-r--r--src/mongo/db/fts/fts_spec.cpp22
-rw-r--r--src/mongo/db/fts/fts_spec.h2
-rw-r--r--src/mongo/db/fts/fts_spec_legacy.cpp6
-rw-r--r--src/mongo/db/fts/fts_tokenizer.h72
-rw-r--r--src/mongo/db/fts/stemmer.cpp6
-rw-r--r--src/mongo/db/fts/stemmer.h2
-rw-r--r--src/mongo/db/fts/stemmer_test.cpp4
-rw-r--r--src/mongo/db/fts/stop_words.cpp15
-rw-r--r--src/mongo/db/fts/stop_words.h2
-rw-r--r--src/mongo/db/fts/stop_words_test.cpp2
-rw-r--r--src/mongo/db/fts/tokenizer.cpp4
-rw-r--r--src/mongo/db/fts/tokenizer.h2
-rw-r--r--src/mongo/db/fts/tokenizer_test.cpp12
21 files changed, 355 insertions, 100 deletions
diff --git a/src/mongo/db/fts/SConscript b/src/mongo/db/fts/SConscript
index 6ccc070fd64..5a782014600 100644
--- a/src/mongo/db/fts/SConscript
+++ b/src/mongo/db/fts/SConscript
@@ -33,6 +33,7 @@ baseEnv.Library('base', [
'fts_spec.cpp',
'fts_spec_legacy.cpp',
'fts_language.cpp',
+ 'fts_basic_tokenizer.cpp',
'fts_util.cpp',
'fts_element_iterator.cpp',
'stemmer.cpp',
diff --git a/src/mongo/db/fts/fts_basic_tokenizer.cpp b/src/mongo/db/fts/fts_basic_tokenizer.cpp
new file mode 100644
index 00000000000..296f473f144
--- /dev/null
+++ b/src/mongo/db/fts/fts_basic_tokenizer.cpp
@@ -0,0 +1,90 @@
+/**
+ * Copyright (C) 2015 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/platform/basic.h"
+
+#include "mongo/db/fts/fts_basic_tokenizer.h"
+
+#include "mongo/db/fts/fts_query.h"
+#include "mongo/db/fts/fts_spec.h"
+#include "mongo/db/fts/stemmer.h"
+#include "mongo/db/fts/stop_words.h"
+#include "mongo/db/fts/tokenizer.h"
+#include "mongo/stdx/memory.h"
+#include "mongo/util/mongoutils/str.h"
+#include "mongo/util/stringutils.h"
+
+namespace mongo {
+namespace fts {
+
+ using std::string;
+
+ BasicFTSTokenizer::BasicFTSTokenizer(const FTSLanguage* language)
+ : _language(language), _stemmer(language), _stopWords(StopWords::getStopWords(language)) {
+ }
+
+ void BasicFTSTokenizer::reset(const char* document, bool generateCaseSensitiveTokens) {
+ _generateCaseSensitiveTokens = generateCaseSensitiveTokens;
+ _tokenizer = stdx::make_unique<Tokenizer>(_language, document);
+ }
+
+ bool BasicFTSTokenizer::moveNext() {
+ while (true) {
+ bool hasMore = _tokenizer->more();
+ if (!hasMore) {
+ _stem = "";
+ return false;
+ }
+
+ Token token = _tokenizer->next();
+
+ string word = token.data.toString();
+
+ word = tolowerString(token.data);
+
+ // The stop word list is stored lower cased, so the word must be lower cased
+ // before checking it against the stop word list
+ if (_stopWords->isStopWord(word)) {
+ continue;
+ }
+
+ if (_generateCaseSensitiveTokens) {
+ word = token.data.toString();
+ }
+
+ _stem = _stemmer.stem(word);
+ return true;
+ }
+ }
+
+ StringData BasicFTSTokenizer::get() const {
+ return _stem;
+ }
+
+} // namespace fts
+} // namespace mongo
diff --git a/src/mongo/db/fts/fts_basic_tokenizer.h b/src/mongo/db/fts/fts_basic_tokenizer.h
new file mode 100644
index 00000000000..fd59a4583fc
--- /dev/null
+++ b/src/mongo/db/fts/fts_basic_tokenizer.h
@@ -0,0 +1,79 @@
+/**
+ * Copyright (C) 2015 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#pragma once
+
+#include "mongo/base/disallow_copying.h"
+#include "mongo/base/string_data.h"
+#include "mongo/db/fts/fts_tokenizer.h"
+#include "mongo/db/fts/stemmer.h"
+#include "mongo/db/fts/tokenizer.h"
+
+namespace mongo {
+namespace fts {
+
+ class FTSLanguage;
+ class StopWords;
+
+ /**
+ * BasicFTSTokenizer
+ * An iterator of "documents" where a document contains ASCII space (U+0020) delimited words.
+ * Uses
+ * - Tokenizer for tokenizing words via ASCII space (ie, U+0020 space).
+ * - tolower from the C standard library to lowercase letters, ie, it only supports lower casing
+ * - ASCII letters (U+0000 - U+007F)
+ * - Stemmer (ie, Snowball Stemmer) to stem words.
+ * - Embedded stop word lists for each language in StopWord class
+ *
+ * For each word returns a stem version of a word optimized for full text indexing.
+ * Optionally supports returning case sensitive search terms.
+ */
+ class BasicFTSTokenizer : public FTSTokenizer {
+ MONGO_DISALLOW_COPYING(BasicFTSTokenizer);
+ public:
+ BasicFTSTokenizer(const FTSLanguage* language);
+
+ void reset(const char* document, bool generateCaseSensitiveTokens) override;
+
+ bool moveNext() override;
+
+ StringData get() const override;
+
+ private:
+ const FTSLanguage* const _language;
+ const Stemmer _stemmer;
+ const StopWords* const _stopWords;
+
+ std::unique_ptr<Tokenizer> _tokenizer;
+ bool _generateCaseSensitiveTokens;
+
+ std::string _stem;
+ };
+
+} // namespace fts
+} // namespace mongo
diff --git a/src/mongo/db/fts/fts_language.cpp b/src/mongo/db/fts/fts_language.cpp
index edb2b7cf363..9164d157508 100644
--- a/src/mongo/db/fts/fts_language.cpp
+++ b/src/mongo/db/fts/fts_language.cpp
@@ -33,6 +33,8 @@
#include <string>
#include "mongo/base/init.h"
+#include "mongo/db/fts/fts_basic_tokenizer.h"
+#include "mongo/stdx/memory.h"
#include "mongo/util/assert_util.h"
#include "mongo/util/mongoutils/str.h"
#include "mongo/util/string_map.h"
@@ -79,6 +81,10 @@ namespace mongo {
LanguageMapV1 languageMapV1;
}
+ std::unique_ptr<FTSTokenizer> BasicFTSLanguage::createTokenizer() const {
+ return stdx::make_unique<BasicFTSTokenizer>(this);
+ }
+
MONGO_INITIALIZER_GROUP( FTSAllLanguagesRegistered, MONGO_NO_PREREQUISITES,
MONGO_NO_DEPENDENTS );
diff --git a/src/mongo/db/fts/fts_language.h b/src/mongo/db/fts/fts_language.h
index 3a9acbbdd94..ce45e0b812a 100644
--- a/src/mongo/db/fts/fts_language.h
+++ b/src/mongo/db/fts/fts_language.h
@@ -39,8 +39,10 @@ namespace mongo {
namespace fts {
+ class FTSTokenizer;
+
#define MONGO_FTS_LANGUAGE_DECLARE( language, name, version ) \
- FTSLanguage language; \
+ BasicFTSLanguage language; \
MONGO_INITIALIZER_GENERAL( language, MONGO_NO_PREREQUISITES, \
( "FTSAllLanguagesRegistered" ) ) \
( ::mongo::InitializerContext* context ) { \
@@ -70,6 +72,8 @@ namespace mongo {
/** Create an uninitialized language. */
FTSLanguage();
+ virtual ~FTSLanguage() {}
+
/**
* Returns the language as a std::string in canonical form (lowercased English name). It is
* an error to call str() on an uninitialized language.
@@ -77,6 +81,12 @@ namespace mongo {
const std::string& str() const;
/**
+ * Returns a new FTSTokenizer instance for this language.
+ * Lifetime of the tokenizer is scoped to the FTSLanguage (currently all languages live for the process lifetime)
+ */
+ virtual std::unique_ptr<FTSTokenizer> createTokenizer() const = 0;
+
+ /**
* Register std::string 'languageName' as a new language with text index version
* 'textIndexVersion'. Saves the resulting language to out-argument 'languageOut'.
* Subsequent calls to FTSLanguage::make() will recognize the newly-registered language
@@ -120,9 +130,15 @@ namespace mongo {
typedef StatusWith<const FTSLanguage*> StatusWithFTSLanguage;
- extern FTSLanguage languagePorterV1;
- extern FTSLanguage languageEnglishV2;
- extern FTSLanguage languageFrenchV2;
+
+ class BasicFTSLanguage : public FTSLanguage {
+ public:
+ std::unique_ptr<FTSTokenizer> createTokenizer() const override;
+ };
+
+ extern BasicFTSLanguage languagePorterV1;
+ extern BasicFTSLanguage languageEnglishV2;
+ extern BasicFTSLanguage languageFrenchV2;
}
}
diff --git a/src/mongo/db/fts/fts_matcher.cpp b/src/mongo/db/fts/fts_matcher.cpp
index 492dbdf7b7b..634bcf345cd 100644
--- a/src/mongo/db/fts/fts_matcher.cpp
+++ b/src/mongo/db/fts/fts_matcher.cpp
@@ -31,6 +31,7 @@
#include "mongo/platform/basic.h"
#include "mongo/db/fts/fts_matcher.h"
+#include "mongo/db/fts/fts_tokenizer.h"
#include "mongo/db/fts/fts_element_iterator.h"
#include "mongo/platform/strcasestr.h"
@@ -96,15 +97,13 @@ namespace mongo {
bool FTSMatcher::_hasPositiveTerm_string( const FTSLanguage* language,
const string& raw ) const {
- Tokenizer i( *language, raw );
- Stemmer stemmer( *language );
- while ( i.more() ) {
- Token t = i.next();
- if ( t.type != Token::TEXT ) {
- continue;
- }
- string word = stemmer.stem( _query.normalizeString( t.data ) );
- if ( _query.getPositiveTerms().count( word ) > 0 ) {
+ std::unique_ptr<FTSTokenizer> tokenizer(language->createTokenizer());
+
+ tokenizer->reset(raw.c_str(), _query.getCaseSensitive());
+
+ while (tokenizer->moveNext()) {
+ string word = tokenizer->get().toString();
+ if (_query.getPositiveTerms().count(word) > 0) {
return true;
}
}
@@ -130,14 +129,12 @@ namespace mongo {
bool FTSMatcher::_hasNegativeTerm_string( const FTSLanguage* language,
const string& raw ) const {
- Tokenizer i( *language, raw );
- Stemmer stemmer( *language );
- while ( i.more() ) {
- Token t = i.next();
- if ( t.type != Token::TEXT ) {
- continue;
- }
- string word = stemmer.stem( _query.normalizeString( t.data ) );
+ std::unique_ptr<FTSTokenizer> tokenizer(language->createTokenizer());
+
+ tokenizer->reset(raw.c_str(), _query.getCaseSensitive());
+
+ while (tokenizer->moveNext()) {
+ string word = tokenizer->get().toString();
if ( _query.getNegatedTerms().count( word ) > 0 ) {
return true;
}
diff --git a/src/mongo/db/fts/fts_query.cpp b/src/mongo/db/fts/fts_query.cpp
index 9088719d11e..e05aa5693cc 100644
--- a/src/mongo/db/fts/fts_query.cpp
+++ b/src/mongo/db/fts/fts_query.cpp
@@ -33,6 +33,7 @@
#include "mongo/db/fts/fts_query.h"
#include "mongo/db/fts/fts_spec.h"
+#include "mongo/db/fts/fts_tokenizer.h"
#include "mongo/db/fts/tokenizer.h"
#include "mongo/util/mongoutils/str.h"
#include "mongo/util/stringutils.h"
@@ -59,15 +60,14 @@ namespace mongo {
_language = swl.getValue();
_caseSensitive = caseSensitive;
- const StopWords& stopWords = *StopWords::getStopWords( *_language );
- Stemmer stemmer( *_language );
+ std::unique_ptr<FTSTokenizer> tokenizer(_language->createTokenizer());
bool inNegation = false;
bool inPhrase = false;
unsigned quoteOffset = 0;
- Tokenizer i( *_language, query );
+ Tokenizer i( _language, query );
while ( i.more() ) {
Token t = i.next();
@@ -78,7 +78,7 @@ namespace mongo {
// don't add term
}
else {
- _addTerm( stopWords, stemmer, s, inNegation );
+ _addTerm( tokenizer.get(), s, inNegation );
}
if ( inNegation && !inPhrase )
@@ -122,44 +122,52 @@ namespace mongo {
return Status::OK();
}
- void FTSQuery::_addTerm( const StopWords& sw,
- const Stemmer& stemmer,
+ void FTSQuery::_addTerm( FTSTokenizer* tokenizer,
const string& token,
bool negated ) {
- // Compute the string corresponding to 'token' that will be used for index bounds
- // generation.
- string boundsTerm = tolowerString( token );
- if ( sw.isStopWord( boundsTerm ) ) {
- return;
- }
- boundsTerm = stemmer.stem( boundsTerm );
+ tokenizer->reset(token.c_str(), false);
+
+ auto& activeTerms = negated ? _negatedTerms : _positiveTerms;
+
+ // First, get all the terms for indexing, ie, lower cased words
+ // If we are case-insensitive, we can also use these for positive and negative terms
+ // Some terms may be expanded into multiple words in some non-English languages
+ while (tokenizer->moveNext()) {
+
+ string word = tokenizer->get().toString();
+
+ if (!negated) {
+ _termsForBounds.insert(word);
+ }
- // If the lowercased version of 'token' is a not a stop word, 'token' itself should also
- // not be.
- dassert( !sw.isStopWord( token ) );
- if ( !negated ) {
- _termsForBounds.insert( boundsTerm );
+ // Compute the string corresponding to 'token' that will be used for the matcher.
+ // For case-insensitive queries, this is the same lower cased string used for the
+ // index bounds above.
+ if (!_caseSensitive) {
+ activeTerms.insert(word);
+ }
}
- // Compute the string corresponding to 'token' that will be used for the matcher. For
- // case-insensitive queries, this is the same string as 'boundsTerm' computed above.
- // However, for case-sensitive queries we need to re-stem the original token, since
- // 'boundsTerm' is already lowercased but we need the original casing for an exact
- // match.
- const string& matcherTerm = _caseSensitive ? stemmer.stem( token ) : boundsTerm;
- if ( negated ) {
- _negatedTerms.insert( matcherTerm );
+ if (!_caseSensitive) {
+ return;
}
- else {
- _positiveTerms.insert( matcherTerm );
+
+ tokenizer->reset(token.c_str(), true);
+
+ // If we want case-sensitivity, get the case-sensitive token
+ while (tokenizer->moveNext()) {
+
+ string word = tokenizer->get().toString();
+
+ activeTerms.insert(word);
}
}
- string FTSQuery::normalizeString( StringData str ) const {
- if ( _caseSensitive ) {
+ string FTSQuery::normalizeString(StringData str) const {
+ if (_caseSensitive) {
return str.toString();
}
- return tolowerString( str );
+ return tolowerString(str);
}
namespace {
diff --git a/src/mongo/db/fts/fts_query.h b/src/mongo/db/fts/fts_query.h
index 96317c926e5..f9ea7f2d1eb 100644
--- a/src/mongo/db/fts/fts_query.h
+++ b/src/mongo/db/fts/fts_query.h
@@ -79,8 +79,7 @@ namespace mongo {
static const bool caseSensitiveDefault;
private:
- void _addTerm( const StopWords& sw,
- const Stemmer& stemmer,
+ void _addTerm( FTSTokenizer* tokenizer,
const std::string& token,
bool negated );
diff --git a/src/mongo/db/fts/fts_spec.cpp b/src/mongo/db/fts/fts_spec.cpp
index fdd9ecf7824..9e68835e83b 100644
--- a/src/mongo/db/fts/fts_spec.cpp
+++ b/src/mongo/db/fts/fts_spec.cpp
@@ -33,6 +33,7 @@
#include "mongo/db/field_ref.h"
#include "mongo/db/fts/fts_element_iterator.h"
+#include "mongo/db/fts/fts_tokenizer.h"
#include "mongo/db/fts/fts_util.h"
#include "mongo/util/mongoutils/str.h"
#include "mongo/util/stringutils.h"
@@ -167,13 +168,12 @@ namespace mongo {
while ( it.more() ) {
FTSIteratorValue val = it.next();
- Stemmer stemmer( *val._language );
- Tools tools( *val._language, &stemmer, StopWords::getStopWords( *val._language ) );
- _scoreStringV2( tools, val._text, term_freqs, val._weight );
+ std::unique_ptr<FTSTokenizer> tokenizer(val._language->createTokenizer());
+ _scoreStringV2( tokenizer.get(), val._text, term_freqs, val._weight );
}
}
- void FTSSpec::_scoreStringV2( const Tools& tools,
+ void FTSSpec::_scoreStringV2( FTSTokenizer* tokenizer,
StringData raw,
TermFrequencyMap* docScores,
double weight ) const {
@@ -182,18 +182,10 @@ namespace mongo {
unsigned numTokens = 0;
- Tokenizer i( tools.language, raw );
- while ( i.more() ) {
- Token t = i.next();
- if ( t.type != Token::TEXT ) {
- continue;
- }
+ tokenizer->reset(raw.rawData(), false );
- string term = tolowerString( t.data );
- if ( tools.stopwords->isStopWord( term ) ) {
- continue;
- }
- term = tools.stemmer->stem( term );
+ while (tokenizer->moveNext()) {
+ string term = tokenizer->get().toString();
ScoreHelperStruct& data = terms[term];
diff --git a/src/mongo/db/fts/fts_spec.h b/src/mongo/db/fts/fts_spec.h
index c35bc2c9010..0f17d825dcc 100644
--- a/src/mongo/db/fts/fts_spec.h
+++ b/src/mongo/db/fts/fts_spec.h
@@ -119,7 +119,7 @@ namespace mongo {
* Calculate the term scores for 'raw' and update 'term_freqs' with the result. Parses
* 'raw' using 'tools', and weights term scores based on 'weight'.
*/
- void _scoreStringV2( const Tools& tools,
+ void _scoreStringV2( FTSTokenizer* tokenizer,
StringData raw,
TermFrequencyMap* term_freqs,
double weight ) const;
diff --git a/src/mongo/db/fts/fts_spec_legacy.cpp b/src/mongo/db/fts/fts_spec_legacy.cpp
index 69721fe2ae0..a2dc1dc2489 100644
--- a/src/mongo/db/fts/fts_spec_legacy.cpp
+++ b/src/mongo/db/fts/fts_spec_legacy.cpp
@@ -73,7 +73,7 @@ namespace mongo {
unsigned numTokens = 0;
- Tokenizer i( tools.language, raw );
+ Tokenizer i( &tools.language, raw );
while ( i.more() ) {
Token t = i.next();
if ( t.type != Token::TEXT )
@@ -162,8 +162,8 @@ namespace mongo {
const FTSLanguage& language = _getLanguageToUseV1( obj );
- Stemmer stemmer(language);
- Tools tools(language, &stemmer, StopWords::getStopWords( language ));
+ Stemmer stemmer(&language);
+ Tools tools(language, &stemmer, StopWords::getStopWords( &language ));
if ( wildcard() ) {
// if * is specified for weight, we can recurse over all fields.
diff --git a/src/mongo/db/fts/fts_tokenizer.h b/src/mongo/db/fts/fts_tokenizer.h
new file mode 100644
index 00000000000..65833aff0cb
--- /dev/null
+++ b/src/mongo/db/fts/fts_tokenizer.h
@@ -0,0 +1,72 @@
+/**
+ * Copyright (C) 2015 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+
+#pragma once
+
+#include "mongo/base/disallow_copying.h"
+#include "mongo/base/string_data.h"
+
+namespace mongo {
+namespace fts {
+
+ class FTSLanguage;
+ class StopWords;
+
+ /**
+ * FTSTokenizer
+ * An iterator of "documents" where a document contains space delimited words.
+ * For each word returns a stem or lemma version of a word optimized for full text indexing.
+ * Optionally supports returning case sensitive search terms.
+ */
+ class FTSTokenizer {
+ public:
+ virtual ~FTSTokenizer() = default;
+
+ /**
+ * Process a new document, and discards any previous results.
+ * May be called multiple times on an instance of an iterator.
+ */
+ virtual void reset(const char* document, bool generateCaseSensitiveTokens) = 0;
+
+ /**
+ * Moves to the next token in the iterator.
+ * Returns false when the iterator reaches end of the document.
+ */
+ virtual bool moveNext() = 0;
+
+ /**
+ * Returns the stemmed, normalized form of the token; it is lowercased unless
+ * case-sensitive tokens were requested via the reset method.
+ * Returned StringData is valid until next call to moveNext().
+ */
+ virtual StringData get() const = 0;
+ };
+
+} // namespace fts
+} // namespace mongo
diff --git a/src/mongo/db/fts/stemmer.cpp b/src/mongo/db/fts/stemmer.cpp
index 4a734dbe316..9353fccf297 100644
--- a/src/mongo/db/fts/stemmer.cpp
+++ b/src/mongo/db/fts/stemmer.cpp
@@ -40,10 +40,10 @@ namespace mongo {
using std::string;
- Stemmer::Stemmer( const FTSLanguage& language ) {
+ Stemmer::Stemmer( const FTSLanguage* language ) {
_stemmer = NULL;
- if ( language.str() != "none" )
- _stemmer = sb_stemmer_new(language.str().c_str(), "UTF_8");
+ if ( language->str() != "none" )
+ _stemmer = sb_stemmer_new(language->str().c_str(), "UTF_8");
}
Stemmer::~Stemmer() {
diff --git a/src/mongo/db/fts/stemmer.h b/src/mongo/db/fts/stemmer.h
index 6abba8abddc..d6d76e64218 100644
--- a/src/mongo/db/fts/stemmer.h
+++ b/src/mongo/db/fts/stemmer.h
@@ -49,7 +49,7 @@ namespace mongo {
class Stemmer {
MONGO_DISALLOW_COPYING( Stemmer );
public:
- Stemmer( const FTSLanguage& language );
+ Stemmer( const FTSLanguage* language );
~Stemmer();
std::string stem( StringData word ) const;
diff --git a/src/mongo/db/fts/stemmer_test.cpp b/src/mongo/db/fts/stemmer_test.cpp
index 9037715d4da..bef556bf2ad 100644
--- a/src/mongo/db/fts/stemmer_test.cpp
+++ b/src/mongo/db/fts/stemmer_test.cpp
@@ -38,13 +38,13 @@ namespace mongo {
namespace fts {
TEST( English, Stemmer1 ) {
- Stemmer s( languageEnglishV2 );
+ Stemmer s( &languageEnglishV2 );
ASSERT_EQUALS( "run", s.stem( "running" ) );
ASSERT_EQUALS( "Run", s.stem( "Running" ) );
}
TEST( English, Caps ) {
- Stemmer s( languagePorterV1 );
+ Stemmer s( &languagePorterV1 );
ASSERT_EQUALS( "unit", s.stem( "united" ) );
ASSERT_EQUALS( "Unite", s.stem( "United" ) );
}
diff --git a/src/mongo/db/fts/stop_words.cpp b/src/mongo/db/fts/stop_words.cpp
index 66240a1ce2d..421bfae63db 100644
--- a/src/mongo/db/fts/stop_words.cpp
+++ b/src/mongo/db/fts/stop_words.cpp
@@ -28,7 +28,6 @@
* it in the license file.
*/
-#include <boost/shared_ptr.hpp>
#include <set>
#include <string>
@@ -37,18 +36,14 @@
#include "mongo/base/init.h"
#include "mongo/util/string_map.h"
-
-
namespace mongo {
- using boost::shared_ptr;
-
namespace fts {
void loadStopWordMap( StringMap< std::set< std::string > >* m );
namespace {
- StringMap< boost::shared_ptr<StopWords> > STOP_WORDS;
+ StringMap< std::shared_ptr<StopWords> > StopWordsMap;
StopWords empty;
}
@@ -61,9 +56,9 @@ namespace mongo {
_words.insert( *i );
}
- const StopWords* StopWords::getStopWords( const FTSLanguage& language ) {
- StringMap< boost::shared_ptr<StopWords> >::const_iterator i = STOP_WORDS.find( language.str() );
- if ( i == STOP_WORDS.end() )
+ const StopWords* StopWords::getStopWords( const FTSLanguage* language ) {
+ auto i = StopWordsMap.find( language->str() );
+ if ( i == StopWordsMap.end() )
return &empty;
return i->second.get();
}
@@ -75,7 +70,7 @@ namespace mongo {
for ( StringMap< std::set< std::string > >::const_iterator i = raw.begin();
i != raw.end();
++i ) {
- STOP_WORDS[i->first].reset(new StopWords( i->second ));
+ StopWordsMap[i->first].reset(new StopWords( i->second ));
}
return Status::OK();
}
diff --git a/src/mongo/db/fts/stop_words.h b/src/mongo/db/fts/stop_words.h
index 4789535ef4d..d989b4dcd32 100644
--- a/src/mongo/db/fts/stop_words.h
+++ b/src/mongo/db/fts/stop_words.h
@@ -53,7 +53,7 @@ namespace mongo {
size_t numStopWords() const { return _words.size(); }
- static const StopWords* getStopWords( const FTSLanguage& language );
+ static const StopWords* getStopWords( const FTSLanguage* language );
private:
unordered_set<std::string> _words;
};
diff --git a/src/mongo/db/fts/stop_words_test.cpp b/src/mongo/db/fts/stop_words_test.cpp
index 0edf4e2540c..248c4d93407 100644
--- a/src/mongo/db/fts/stop_words_test.cpp
+++ b/src/mongo/db/fts/stop_words_test.cpp
@@ -36,7 +36,7 @@ namespace mongo {
namespace fts {
TEST( English, Basic1 ) {
- const StopWords* englishStopWords = StopWords::getStopWords( languageEnglishV2 );
+ const StopWords* englishStopWords = StopWords::getStopWords( &languageEnglishV2 );
ASSERT( englishStopWords->isStopWord( "the" ) );
ASSERT( !englishStopWords->isStopWord( "computer" ) );
}
diff --git a/src/mongo/db/fts/tokenizer.cpp b/src/mongo/db/fts/tokenizer.cpp
index ee60f99d588..6896924ae31 100644
--- a/src/mongo/db/fts/tokenizer.cpp
+++ b/src/mongo/db/fts/tokenizer.cpp
@@ -38,9 +38,9 @@ namespace mongo {
namespace fts {
- Tokenizer::Tokenizer( const FTSLanguage& language, StringData str )
+ Tokenizer::Tokenizer( const FTSLanguage* language, StringData str )
: _pos(0), _raw( str ) {
- _english = ( language.str() == "english" );
+ _english = ( language->str() == "english" );
_skipWhitespace();
_previousWhiteSpace = true;
}
diff --git a/src/mongo/db/fts/tokenizer.h b/src/mongo/db/fts/tokenizer.h
index cd0d76a4f70..6e449124b3a 100644
--- a/src/mongo/db/fts/tokenizer.h
+++ b/src/mongo/db/fts/tokenizer.h
@@ -62,7 +62,7 @@ namespace mongo {
MONGO_DISALLOW_COPYING( Tokenizer );
public:
- Tokenizer( const FTSLanguage& language, StringData str );
+ Tokenizer( const FTSLanguage* language, StringData str );
bool more() const;
Token next();
diff --git a/src/mongo/db/fts/tokenizer_test.cpp b/src/mongo/db/fts/tokenizer_test.cpp
index 29153a329a6..a6692f3456d 100644
--- a/src/mongo/db/fts/tokenizer_test.cpp
+++ b/src/mongo/db/fts/tokenizer_test.cpp
@@ -36,12 +36,12 @@ namespace mongo {
namespace fts {
TEST( Tokenizer, Empty1 ) {
- Tokenizer i( languageEnglishV2, "" );
+ Tokenizer i( &languageEnglishV2, "" );
ASSERT( !i.more() );
}
TEST( Tokenizer, Basic1 ) {
- Tokenizer i( languageEnglishV2, "blue red green" );
+ Tokenizer i( &languageEnglishV2, "blue red green" );
ASSERT( i.more() );
ASSERT_EQUALS( i.next().data.toString(), "blue" );
@@ -56,7 +56,7 @@ namespace mongo {
}
TEST( Tokenizer, Basic2 ) {
- Tokenizer i( languageEnglishV2, "blue-red" );
+ Tokenizer i( &languageEnglishV2, "blue-red" );
Token a = i.next();
Token b = i.next();
@@ -78,7 +78,7 @@ namespace mongo {
}
TEST( Tokenizer, Basic3 ) {
- Tokenizer i( languageEnglishV2, "blue -red" );
+ Tokenizer i( &languageEnglishV2, "blue -red" );
Token a = i.next();
Token b = i.next();
@@ -105,7 +105,7 @@ namespace mongo {
}
TEST( Tokenizer, Quote1English ) {
- Tokenizer i( languageEnglishV2, "eliot's car" );
+ Tokenizer i( &languageEnglishV2, "eliot's car" );
Token a = i.next();
Token b = i.next();
@@ -115,7 +115,7 @@ namespace mongo {
}
TEST( Tokenizer, Quote1French ) {
- Tokenizer i( languageFrenchV2, "eliot's car" );
+ Tokenizer i( &languageFrenchV2, "eliot's car" );
Token a = i.next();
Token b = i.next();