summaryrefslogtreecommitdiff
path: root/src/mongo
diff options
context:
space:
mode:
authorMark Benvenuto <mark.benvenuto@mongodb.com>2015-04-01 14:34:39 -0400
committerMark Benvenuto <mark.benvenuto@mongodb.com>2015-04-01 14:58:13 -0400
commit72598f750d732c08c98f5f578bf1335acd78e10e (patch)
treed80364b07b25210f5724ba6e6506650be657c74e /src/mongo
parent3cf0c18aa2c56949fda47ab35570489d68965370 (diff)
downloadmongo-72598f750d732c08c98f5f578bf1335acd78e10e.tar.gz
SERVER-17520: Add support for pluggable FTS tokenizers
Diffstat (limited to 'src/mongo')
-rw-r--r--src/mongo/db/fts/SConscript1
-rw-r--r--src/mongo/db/fts/fts_basic_tokenizer.cpp90
-rw-r--r--src/mongo/db/fts/fts_basic_tokenizer.h79
-rw-r--r--src/mongo/db/fts/fts_language.cpp6
-rw-r--r--src/mongo/db/fts/fts_language.h24
-rw-r--r--src/mongo/db/fts/fts_matcher.cpp31
-rw-r--r--src/mongo/db/fts/fts_query.cpp70
-rw-r--r--src/mongo/db/fts/fts_query.h3
-rw-r--r--src/mongo/db/fts/fts_spec.cpp22
-rw-r--r--src/mongo/db/fts/fts_spec.h2
-rw-r--r--src/mongo/db/fts/fts_spec_legacy.cpp6
-rw-r--r--src/mongo/db/fts/fts_tokenizer.h72
-rw-r--r--src/mongo/db/fts/stemmer.cpp6
-rw-r--r--src/mongo/db/fts/stemmer.h2
-rw-r--r--src/mongo/db/fts/stemmer_test.cpp4
-rw-r--r--src/mongo/db/fts/stop_words.cpp15
-rw-r--r--src/mongo/db/fts/stop_words.h2
-rw-r--r--src/mongo/db/fts/stop_words_test.cpp2
-rw-r--r--src/mongo/db/fts/tokenizer.cpp4
-rw-r--r--src/mongo/db/fts/tokenizer.h2
-rw-r--r--src/mongo/db/fts/tokenizer_test.cpp12
21 files changed, 355 insertions, 100 deletions
diff --git a/src/mongo/db/fts/SConscript b/src/mongo/db/fts/SConscript
index 6ccc070fd64..5a782014600 100644
--- a/src/mongo/db/fts/SConscript
+++ b/src/mongo/db/fts/SConscript
@@ -33,6 +33,7 @@ baseEnv.Library('base', [
'fts_spec.cpp',
'fts_spec_legacy.cpp',
'fts_language.cpp',
+ 'fts_basic_tokenizer.cpp',
'fts_util.cpp',
'fts_element_iterator.cpp',
'stemmer.cpp',
diff --git a/src/mongo/db/fts/fts_basic_tokenizer.cpp b/src/mongo/db/fts/fts_basic_tokenizer.cpp
new file mode 100644
index 00000000000..296f473f144
--- /dev/null
+++ b/src/mongo/db/fts/fts_basic_tokenizer.cpp
@@ -0,0 +1,90 @@
+/**
+ * Copyright (C) 2015 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/platform/basic.h"
+
+#include "mongo/db/fts/fts_basic_tokenizer.h"
+
+#include "mongo/db/fts/fts_query.h"
+#include "mongo/db/fts/fts_spec.h"
+#include "mongo/db/fts/stemmer.h"
+#include "mongo/db/fts/stop_words.h"
+#include "mongo/db/fts/tokenizer.h"
+#include "mongo/stdx/memory.h"
+#include "mongo/util/mongoutils/str.h"
+#include "mongo/util/stringutils.h"
+
+namespace mongo {
+namespace fts {
+
+ using std::string;
+
+ BasicFTSTokenizer::BasicFTSTokenizer(const FTSLanguage* language)
+ : _language(language), _stemmer(language), _stopWords(StopWords::getStopWords(language)) {
+ }
+
+ void BasicFTSTokenizer::reset(const char* document, bool generateCaseSensitiveTokens) {
+ _generateCaseSensitiveTokens = generateCaseSensitiveTokens;
+ _tokenizer = stdx::make_unique<Tokenizer>(_language, document);
+ }
+
+ bool BasicFTSTokenizer::moveNext() {
+ while (true) {
+ bool hasMore = _tokenizer->more();
+ if (!hasMore) {
+ _stem = "";
+ return false;
+ }
+
+ Token token = _tokenizer->next();
+
+ string word = token.data.toString();
+
+ word = tolowerString(token.data);
+
+ // The stop word list is stored lower cased, so the word must be lower cased
+ // before checking it against the stop word list
+ if (_stopWords->isStopWord(word)) {
+ continue;
+ }
+
+ if (_generateCaseSensitiveTokens) {
+ word = token.data.toString();
+ }
+
+ _stem = _stemmer.stem(word);
+ return true;
+ }
+ }
+
+ StringData BasicFTSTokenizer::get() const {
+ return _stem;
+ }
+
+} // namespace fts
+} // namespace mongo
diff --git a/src/mongo/db/fts/fts_basic_tokenizer.h b/src/mongo/db/fts/fts_basic_tokenizer.h
new file mode 100644
index 00000000000..fd59a4583fc
--- /dev/null
+++ b/src/mongo/db/fts/fts_basic_tokenizer.h
@@ -0,0 +1,79 @@
+/**
+ * Copyright (C) 2015 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#pragma once
+
+#include "mongo/base/disallow_copying.h"
+#include "mongo/base/string_data.h"
+#include "mongo/db/fts/fts_tokenizer.h"
+#include "mongo/db/fts/stemmer.h"
+#include "mongo/db/fts/tokenizer.h"
+
+namespace mongo {
+namespace fts {
+
+ class FTSLanguage;
+ class StopWords;
+
+ /**
+ * BasicFTSTokenizer
+ * An iterator of "documents" where a document contains ASCII space (U+0020) delimited words.
+ * Uses
+ * - Tokenizer for tokenizing words via ASCII space (ie, U+0020 space).
+ * - tolower from the C standard library to lowercase letters, ie, it only supports lower casing
+ * - ASCII letters (U+0000 - U+007F)
+ * - Stemmer (ie, Snowball Stemmer) to stem words.
+ * - Embedded stop word lists for each language in StopWord class
+ *
+ * For each word returns a stem version of a word optimized for full text indexing.
+ * Optionally supports returning case sensitive search terms.
+ */
+ class BasicFTSTokenizer : public FTSTokenizer {
+ MONGO_DISALLOW_COPYING(BasicFTSTokenizer);
+ public:
+ BasicFTSTokenizer(const FTSLanguage* language);
+
+ void reset(const char* document, bool generateCaseSensitiveTokens) override;
+
+ bool moveNext() override;
+
+ StringData get() const override;
+
+ private:
+ const FTSLanguage* const _language;
+ const Stemmer _stemmer;
+ const StopWords* const _stopWords;
+
+ std::unique_ptr<Tokenizer> _tokenizer;
+ bool _generateCaseSensitiveTokens;
+
+ std::string _stem;
+ };
+
+} // namespace fts
+} // namespace mongo
diff --git a/src/mongo/db/fts/fts_language.cpp b/src/mongo/db/fts/fts_language.cpp
index edb2b7cf363..9164d157508 100644
--- a/src/mongo/db/fts/fts_language.cpp
+++ b/src/mongo/db/fts/fts_language.cpp
@@ -33,6 +33,8 @@
#include <string>
#include "mongo/base/init.h"
+#include "mongo/db/fts/fts_basic_tokenizer.h"
+#include "mongo/stdx/memory.h"
#include "mongo/util/assert_util.h"
#include "mongo/util/mongoutils/str.h"
#include "mongo/util/string_map.h"
@@ -79,6 +81,10 @@ namespace mongo {
LanguageMapV1 languageMapV1;
}
+ std::unique_ptr<FTSTokenizer> BasicFTSLanguage::createTokenizer() const {
+ return stdx::make_unique<BasicFTSTokenizer>(this);
+ }
+
MONGO_INITIALIZER_GROUP( FTSAllLanguagesRegistered, MONGO_NO_PREREQUISITES,
MONGO_NO_DEPENDENTS );
diff --git a/src/mongo/db/fts/fts_language.h b/src/mongo/db/fts/fts_language.h
index 3a9acbbdd94..ce45e0b812a 100644
--- a/src/mongo/db/fts/fts_language.h
+++ b/src/mongo/db/fts/fts_language.h
@@ -39,8 +39,10 @@ namespace mongo {
namespace fts {
+ class FTSTokenizer;
+
#define MONGO_FTS_LANGUAGE_DECLARE( language, name, version ) \
- FTSLanguage language; \
+ BasicFTSLanguage language; \
MONGO_INITIALIZER_GENERAL( language, MONGO_NO_PREREQUISITES, \
( "FTSAllLanguagesRegistered" ) ) \
( ::mongo::InitializerContext* context ) { \
@@ -70,6 +72,8 @@ namespace mongo {
/** Create an uninitialized language. */
FTSLanguage();
+ virtual ~FTSLanguage() {}
+
/**
* Returns the language as a std::string in canonical form (lowercased English name). It is
* an error to call str() on an uninitialized language.
@@ -77,6 +81,12 @@ namespace mongo {
const std::string& str() const;
/**
+ * Returns a new FTSTokenizer instance for this language.
+ * Lifetime of the tokenizer is scoped to the FTSLanguage (currently all languages live for the process lifetime)
+ */
+ virtual std::unique_ptr<FTSTokenizer> createTokenizer() const = 0;
+
+ /**
* Register std::string 'languageName' as a new language with text index version
* 'textIndexVersion'. Saves the resulting language to out-argument 'languageOut'.
* Subsequent calls to FTSLanguage::make() will recognize the newly-registered language
@@ -120,9 +130,15 @@ namespace mongo {
typedef StatusWith<const FTSLanguage*> StatusWithFTSLanguage;
- extern FTSLanguage languagePorterV1;
- extern FTSLanguage languageEnglishV2;
- extern FTSLanguage languageFrenchV2;
+
+ class BasicFTSLanguage : public FTSLanguage {
+ public:
+ std::unique_ptr<FTSTokenizer> createTokenizer() const override;
+ };
+
+ extern BasicFTSLanguage languagePorterV1;
+ extern BasicFTSLanguage languageEnglishV2;
+ extern BasicFTSLanguage languageFrenchV2;
}
}
diff --git a/src/mongo/db/fts/fts_matcher.cpp b/src/mongo/db/fts/fts_matcher.cpp
index 492dbdf7b7b..634bcf345cd 100644
--- a/src/mongo/db/fts/fts_matcher.cpp
+++ b/src/mongo/db/fts/fts_matcher.cpp
@@ -31,6 +31,7 @@
#include "mongo/platform/basic.h"
#include "mongo/db/fts/fts_matcher.h"
+#include "mongo/db/fts/fts_tokenizer.h"
#include "mongo/db/fts/fts_element_iterator.h"
#include "mongo/platform/strcasestr.h"
@@ -96,15 +97,13 @@ namespace mongo {
bool FTSMatcher::_hasPositiveTerm_string( const FTSLanguage* language,
const string& raw ) const {
- Tokenizer i( *language, raw );
- Stemmer stemmer( *language );
- while ( i.more() ) {
- Token t = i.next();
- if ( t.type != Token::TEXT ) {
- continue;
- }
- string word = stemmer.stem( _query.normalizeString( t.data ) );
- if ( _query.getPositiveTerms().count( word ) > 0 ) {
+ std::unique_ptr<FTSTokenizer> tokenizer(language->createTokenizer());
+
+ tokenizer->reset(raw.c_str(), _query.getCaseSensitive());
+
+ while (tokenizer->moveNext()) {
+ string word = tokenizer->get().toString();
+ if (_query.getPositiveTerms().count(word) > 0) {
return true;
}
}
@@ -130,14 +129,12 @@ namespace mongo {
bool FTSMatcher::_hasNegativeTerm_string( const FTSLanguage* language,
const string& raw ) const {
- Tokenizer i( *language, raw );
- Stemmer stemmer( *language );
- while ( i.more() ) {
- Token t = i.next();
- if ( t.type != Token::TEXT ) {
- continue;
- }
- string word = stemmer.stem( _query.normalizeString( t.data ) );
+ std::unique_ptr<FTSTokenizer> tokenizer(language->createTokenizer());
+
+ tokenizer->reset(raw.c_str(), _query.getCaseSensitive());
+
+ while (tokenizer->moveNext()) {
+ string word = tokenizer->get().toString();
if ( _query.getNegatedTerms().count( word ) > 0 ) {
return true;
}
diff --git a/src/mongo/db/fts/fts_query.cpp b/src/mongo/db/fts/fts_query.cpp
index 9088719d11e..e05aa5693cc 100644
--- a/src/mongo/db/fts/fts_query.cpp
+++ b/src/mongo/db/fts/fts_query.cpp
@@ -33,6 +33,7 @@
#include "mongo/db/fts/fts_query.h"
#include "mongo/db/fts/fts_spec.h"
+#include "mongo/db/fts/fts_tokenizer.h"
#include "mongo/db/fts/tokenizer.h"
#include "mongo/util/mongoutils/str.h"
#include "mongo/util/stringutils.h"
@@ -59,15 +60,14 @@ namespace mongo {
_language = swl.getValue();
_caseSensitive = caseSensitive;
- const StopWords& stopWords = *StopWords::getStopWords( *_language );
- Stemmer stemmer( *_language );
+ std::unique_ptr<FTSTokenizer> tokenizer(_language->createTokenizer());
bool inNegation = false;
bool inPhrase = false;
unsigned quoteOffset = 0;
- Tokenizer i( *_language, query );
+ Tokenizer i( _language, query );
while ( i.more() ) {
Token t = i.next();
@@ -78,7 +78,7 @@ namespace mongo {
// don't add term
}
else {
- _addTerm( stopWords, stemmer, s, inNegation );
+ _addTerm( tokenizer.get(), s, inNegation );
}
if ( inNegation && !inPhrase )
@@ -122,44 +122,52 @@ namespace mongo {
return Status::OK();
}
- void FTSQuery::_addTerm( const StopWords& sw,
- const Stemmer& stemmer,
+ void FTSQuery::_addTerm( FTSTokenizer* tokenizer,
const string& token,
bool negated ) {
- // Compute the string corresponding to 'token' that will be used for index bounds
- // generation.
- string boundsTerm = tolowerString( token );
- if ( sw.isStopWord( boundsTerm ) ) {
- return;
- }
- boundsTerm = stemmer.stem( boundsTerm );
+ tokenizer->reset(token.c_str(), false);
+
+ auto& activeTerms = negated ? _negatedTerms : _positiveTerms;
+
+ // First, get all the terms for indexing, ie, lower cased words
+ // If we are case-insensitive, we can also use these for positive and negative terms
+ // Some terms may be expanded into multiple words in some non-English languages
+ while (tokenizer->moveNext()) {
+
+ string word = tokenizer->get().toString();
+
+ if (!negated) {
+ _termsForBounds.insert(word);
+ }
- // If the lowercased version of 'token' is a not a stop word, 'token' itself should also
- // not be.
- dassert( !sw.isStopWord( token ) );
- if ( !negated ) {
- _termsForBounds.insert( boundsTerm );
+ // Compute the string corresponding to 'token' that will be used for the matcher.
+ // For case-insensitive queries, this is the same lower cased string used for the
+ // index bounds above.
+ if (!_caseSensitive) {
+ activeTerms.insert(word);
+ }
}
- // Compute the string corresponding to 'token' that will be used for the matcher. For
- // case-insensitive queries, this is the same string as 'boundsTerm' computed above.
- // However, for case-sensitive queries we need to re-stem the original token, since
- // 'boundsTerm' is already lowercased but we need the original casing for an exact
- // match.
- const string& matcherTerm = _caseSensitive ? stemmer.stem( token ) : boundsTerm;
- if ( negated ) {
- _negatedTerms.insert( matcherTerm );
+ if (!_caseSensitive) {
+ return;
}
- else {
- _positiveTerms.insert( matcherTerm );
+
+ tokenizer->reset(token.c_str(), true);
+
+ // If we want case-sensitivity, get the case-sensitive token
+ while (tokenizer->moveNext()) {
+
+ string word = tokenizer->get().toString();
+
+ activeTerms.insert(word);
}
}
- string FTSQuery::normalizeString( StringData str ) const {
- if ( _caseSensitive ) {
+ string FTSQuery::normalizeString(StringData str) const {
+ if (_caseSensitive) {
return str.toString();
}
- return tolowerString( str );
+ return tolowerString(str);
}
namespace {
diff --git a/src/mongo/db/fts/fts_query.h b/src/mongo/db/fts/fts_query.h
index 96317c926e5..f9ea7f2d1eb 100644
--- a/src/mongo/db/fts/fts_query.h
+++ b/src/mongo/db/fts/fts_query.h
@@ -79,8 +79,7 @@ namespace mongo {
static const bool caseSensitiveDefault;
private:
- void _addTerm( const StopWords& sw,
- const Stemmer& stemmer,
+ void _addTerm( FTSTokenizer* tokenizer,
const std::string& token,
bool negated );
diff --git a/src/mongo/db/fts/fts_spec.cpp b/src/mongo/db/fts/fts_spec.cpp
index fdd9ecf7824..9e68835e83b 100644
--- a/src/mongo/db/fts/fts_spec.cpp
+++ b/src/mongo/db/fts/fts_spec.cpp
@@ -33,6 +33,7 @@
#include "mongo/db/field_ref.h"
#include "mongo/db/fts/fts_element_iterator.h"
+#include "mongo/db/fts/fts_tokenizer.h"
#include "mongo/db/fts/fts_util.h"
#include "mongo/util/mongoutils/str.h"
#include "mongo/util/stringutils.h"
@@ -167,13 +168,12 @@ namespace mongo {
while ( it.more() ) {
FTSIteratorValue val = it.next();
- Stemmer stemmer( *val._language );
- Tools tools( *val._language, &stemmer, StopWords::getStopWords( *val._language ) );
- _scoreStringV2( tools, val._text, term_freqs, val._weight );
+ std::unique_ptr<FTSTokenizer> tokenizer(val._language->createTokenizer());
+ _scoreStringV2( tokenizer.get(), val._text, term_freqs, val._weight );
}
}
- void FTSSpec::_scoreStringV2( const Tools& tools,
+ void FTSSpec::_scoreStringV2( FTSTokenizer* tokenizer,
StringData raw,
TermFrequencyMap* docScores,
double weight ) const {
@@ -182,18 +182,10 @@ namespace mongo {
unsigned numTokens = 0;
- Tokenizer i( tools.language, raw );
- while ( i.more() ) {
- Token t = i.next();
- if ( t.type != Token::TEXT ) {
- continue;
- }
+ tokenizer->reset(raw.rawData(), false );
- string term = tolowerString( t.data );
- if ( tools.stopwords->isStopWord( term ) ) {
- continue;
- }
- term = tools.stemmer->stem( term );
+ while (tokenizer->moveNext()) {
+ string term = tokenizer->get().toString();
ScoreHelperStruct& data = terms[term];
diff --git a/src/mongo/db/fts/fts_spec.h b/src/mongo/db/fts/fts_spec.h
index c35bc2c9010..0f17d825dcc 100644
--- a/src/mongo/db/fts/fts_spec.h
+++ b/src/mongo/db/fts/fts_spec.h
@@ -119,7 +119,7 @@ namespace mongo {
* Calculate the term scores for 'raw' and update 'term_freqs' with the result. Parses
* 'raw' using 'tools', and weights term scores based on 'weight'.
*/
- void _scoreStringV2( const Tools& tools,
+ void _scoreStringV2( FTSTokenizer* tokenizer,
StringData raw,
TermFrequencyMap* term_freqs,
double weight ) const;
diff --git a/src/mongo/db/fts/fts_spec_legacy.cpp b/src/mongo/db/fts/fts_spec_legacy.cpp
index 69721fe2ae0..a2dc1dc2489 100644
--- a/src/mongo/db/fts/fts_spec_legacy.cpp
+++ b/src/mongo/db/fts/fts_spec_legacy.cpp
@@ -73,7 +73,7 @@ namespace mongo {
unsigned numTokens = 0;
- Tokenizer i( tools.language, raw );
+ Tokenizer i( &tools.language, raw );
while ( i.more() ) {
Token t = i.next();
if ( t.type != Token::TEXT )
@@ -162,8 +162,8 @@ namespace mongo {
const FTSLanguage& language = _getLanguageToUseV1( obj );
- Stemmer stemmer(language);
- Tools tools(language, &stemmer, StopWords::getStopWords( language ));
+ Stemmer stemmer(&language);
+ Tools tools(language, &stemmer, StopWords::getStopWords( &language ));
if ( wildcard() ) {
// if * is specified for weight, we can recurse over all fields.
diff --git a/src/mongo/db/fts/fts_tokenizer.h b/src/mongo/db/fts/fts_tokenizer.h
new file mode 100644
index 00000000000..65833aff0cb
--- /dev/null
+++ b/src/mongo/db/fts/fts_tokenizer.h
@@ -0,0 +1,72 @@
+/**
+ * Copyright (C) 2015 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+
+#pragma once
+
+#include "mongo/base/disallow_copying.h"
+#include "mongo/base/string_data.h"
+
+namespace mongo {
+namespace fts {
+
+ class FTSLanguage;
+ class StopWords;
+
+ /**
+ * FTSTokenizer
+ * An iterator of "documents" where a document contains space delimited words.
+ * For each word returns a stem or lemma version of a word optimized for full text indexing.
+ * Optionally supports returning case sensitive search terms.
+ */
+ class FTSTokenizer {
+ public:
+ virtual ~FTSTokenizer() = default;
+
+ /**
+ * Process a new document, and discards any previous results.
+ * May be called multiple times on an instance of an iterator.
+ */
+ virtual void reset(const char* document, bool generateCaseSensitiveTokens) = 0;
+
+ /**
+ * Moves to the next token in the iterator.
+ * Returns false when the iterator reaches end of the document.
+ */
+ virtual bool moveNext() = 0;
+
+ /**
+ * Returns the stemmed, normalized form of the token; it is lowercased unless
+ * case-sensitive tokens were requested via the reset method.
+ * Returned StringData is valid until next call to moveNext().
+ */
+ virtual StringData get() const = 0;
+ };
+
+} // namespace fts
+} // namespace mongo
diff --git a/src/mongo/db/fts/stemmer.cpp b/src/mongo/db/fts/stemmer.cpp
index 4a734dbe316..9353fccf297 100644
--- a/src/mongo/db/fts/stemmer.cpp
+++ b/src/mongo/db/fts/stemmer.cpp
@@ -40,10 +40,10 @@ namespace mongo {
using std::string;
- Stemmer::Stemmer( const FTSLanguage& language ) {
+ Stemmer::Stemmer( const FTSLanguage* language ) {
_stemmer = NULL;
- if ( language.str() != "none" )
- _stemmer = sb_stemmer_new(language.str().c_str(), "UTF_8");
+ if ( language->str() != "none" )
+ _stemmer = sb_stemmer_new(language->str().c_str(), "UTF_8");
}
Stemmer::~Stemmer() {
diff --git a/src/mongo/db/fts/stemmer.h b/src/mongo/db/fts/stemmer.h
index 6abba8abddc..d6d76e64218 100644
--- a/src/mongo/db/fts/stemmer.h
+++ b/src/mongo/db/fts/stemmer.h
@@ -49,7 +49,7 @@ namespace mongo {
class Stemmer {
MONGO_DISALLOW_COPYING( Stemmer );
public:
- Stemmer( const FTSLanguage& language );
+ Stemmer( const FTSLanguage* language );
~Stemmer();
std::string stem( StringData word ) const;
diff --git a/src/mongo/db/fts/stemmer_test.cpp b/src/mongo/db/fts/stemmer_test.cpp
index 9037715d4da..bef556bf2ad 100644
--- a/src/mongo/db/fts/stemmer_test.cpp
+++ b/src/mongo/db/fts/stemmer_test.cpp
@@ -38,13 +38,13 @@ namespace mongo {
namespace fts {
TEST( English, Stemmer1 ) {
- Stemmer s( languageEnglishV2 );
+ Stemmer s( &languageEnglishV2 );
ASSERT_EQUALS( "run", s.stem( "running" ) );
ASSERT_EQUALS( "Run", s.stem( "Running" ) );
}
TEST( English, Caps ) {
- Stemmer s( languagePorterV1 );
+ Stemmer s( &languagePorterV1 );
ASSERT_EQUALS( "unit", s.stem( "united" ) );
ASSERT_EQUALS( "Unite", s.stem( "United" ) );
}
diff --git a/src/mongo/db/fts/stop_words.cpp b/src/mongo/db/fts/stop_words.cpp
index 66240a1ce2d..421bfae63db 100644
--- a/src/mongo/db/fts/stop_words.cpp
+++ b/src/mongo/db/fts/stop_words.cpp
@@ -28,7 +28,6 @@
* it in the license file.
*/
-#include <boost/shared_ptr.hpp>
#include <set>
#include <string>
@@ -37,18 +36,14 @@
#include "mongo/base/init.h"
#include "mongo/util/string_map.h"
-
-
namespace mongo {
- using boost::shared_ptr;
-
namespace fts {
void loadStopWordMap( StringMap< std::set< std::string > >* m );
namespace {
- StringMap< boost::shared_ptr<StopWords> > STOP_WORDS;
+ StringMap< std::shared_ptr<StopWords> > StopWordsMap;
StopWords empty;
}
@@ -61,9 +56,9 @@ namespace mongo {
_words.insert( *i );
}
- const StopWords* StopWords::getStopWords( const FTSLanguage& language ) {
- StringMap< boost::shared_ptr<StopWords> >::const_iterator i = STOP_WORDS.find( language.str() );
- if ( i == STOP_WORDS.end() )
+ const StopWords* StopWords::getStopWords( const FTSLanguage* language ) {
+ auto i = StopWordsMap.find( language->str() );
+ if ( i == StopWordsMap.end() )
return &empty;
return i->second.get();
}
@@ -75,7 +70,7 @@ namespace mongo {
for ( StringMap< std::set< std::string > >::const_iterator i = raw.begin();
i != raw.end();
++i ) {
- STOP_WORDS[i->first].reset(new StopWords( i->second ));
+ StopWordsMap[i->first].reset(new StopWords( i->second ));
}
return Status::OK();
}
diff --git a/src/mongo/db/fts/stop_words.h b/src/mongo/db/fts/stop_words.h
index 4789535ef4d..d989b4dcd32 100644
--- a/src/mongo/db/fts/stop_words.h
+++ b/src/mongo/db/fts/stop_words.h
@@ -53,7 +53,7 @@ namespace mongo {
size_t numStopWords() const { return _words.size(); }
- static const StopWords* getStopWords( const FTSLanguage& language );
+ static const StopWords* getStopWords( const FTSLanguage* language );
private:
unordered_set<std::string> _words;
};
diff --git a/src/mongo/db/fts/stop_words_test.cpp b/src/mongo/db/fts/stop_words_test.cpp
index 0edf4e2540c..248c4d93407 100644
--- a/src/mongo/db/fts/stop_words_test.cpp
+++ b/src/mongo/db/fts/stop_words_test.cpp
@@ -36,7 +36,7 @@ namespace mongo {
namespace fts {
TEST( English, Basic1 ) {
- const StopWords* englishStopWords = StopWords::getStopWords( languageEnglishV2 );
+ const StopWords* englishStopWords = StopWords::getStopWords( &languageEnglishV2 );
ASSERT( englishStopWords->isStopWord( "the" ) );
ASSERT( !englishStopWords->isStopWord( "computer" ) );
}
diff --git a/src/mongo/db/fts/tokenizer.cpp b/src/mongo/db/fts/tokenizer.cpp
index ee60f99d588..6896924ae31 100644
--- a/src/mongo/db/fts/tokenizer.cpp
+++ b/src/mongo/db/fts/tokenizer.cpp
@@ -38,9 +38,9 @@ namespace mongo {
namespace fts {
- Tokenizer::Tokenizer( const FTSLanguage& language, StringData str )
+ Tokenizer::Tokenizer( const FTSLanguage* language, StringData str )
: _pos(0), _raw( str ) {
- _english = ( language.str() == "english" );
+ _english = ( language->str() == "english" );
_skipWhitespace();
_previousWhiteSpace = true;
}
diff --git a/src/mongo/db/fts/tokenizer.h b/src/mongo/db/fts/tokenizer.h
index cd0d76a4f70..6e449124b3a 100644
--- a/src/mongo/db/fts/tokenizer.h
+++ b/src/mongo/db/fts/tokenizer.h
@@ -62,7 +62,7 @@ namespace mongo {
MONGO_DISALLOW_COPYING( Tokenizer );
public:
- Tokenizer( const FTSLanguage& language, StringData str );
+ Tokenizer( const FTSLanguage* language, StringData str );
bool more() const;
Token next();
diff --git a/src/mongo/db/fts/tokenizer_test.cpp b/src/mongo/db/fts/tokenizer_test.cpp
index 29153a329a6..a6692f3456d 100644
--- a/src/mongo/db/fts/tokenizer_test.cpp
+++ b/src/mongo/db/fts/tokenizer_test.cpp
@@ -36,12 +36,12 @@ namespace mongo {
namespace fts {
TEST( Tokenizer, Empty1 ) {
- Tokenizer i( languageEnglishV2, "" );
+ Tokenizer i( &languageEnglishV2, "" );
ASSERT( !i.more() );
}
TEST( Tokenizer, Basic1 ) {
- Tokenizer i( languageEnglishV2, "blue red green" );
+ Tokenizer i( &languageEnglishV2, "blue red green" );
ASSERT( i.more() );
ASSERT_EQUALS( i.next().data.toString(), "blue" );
@@ -56,7 +56,7 @@ namespace mongo {
}
TEST( Tokenizer, Basic2 ) {
- Tokenizer i( languageEnglishV2, "blue-red" );
+ Tokenizer i( &languageEnglishV2, "blue-red" );
Token a = i.next();
Token b = i.next();
@@ -78,7 +78,7 @@ namespace mongo {
}
TEST( Tokenizer, Basic3 ) {
- Tokenizer i( languageEnglishV2, "blue -red" );
+ Tokenizer i( &languageEnglishV2, "blue -red" );
Token a = i.next();
Token b = i.next();
@@ -105,7 +105,7 @@ namespace mongo {
}
TEST( Tokenizer, Quote1English ) {
- Tokenizer i( languageEnglishV2, "eliot's car" );
+ Tokenizer i( &languageEnglishV2, "eliot's car" );
Token a = i.next();
Token b = i.next();
@@ -115,7 +115,7 @@ namespace mongo {
}
TEST( Tokenizer, Quote1French ) {
- Tokenizer i( languageFrenchV2, "eliot's car" );
+ Tokenizer i( &languageFrenchV2, "eliot's car" );
Token a = i.next();
Token b = i.next();