author     Geert Bosch <geert@mongodb.com>  2015-03-30 15:43:54 -0400
committer  Geert Bosch <geert@mongodb.com>  2015-03-30 15:43:54 -0400
commit     edc67399aef9bded106c0196d4af843f23a8acc9 (patch)
tree       dbb5cebdf54af86e9ee28ced798377579f393722
parent     465bb26c0fb0f4731f4dbb5e09e0a791177bbc64 (diff)
download   mongo-edc67399aef9bded106c0196d4af843f23a8acc9.tar.gz
Revert "FTS Tokenizer"
This reverts commit 0bed4262dac849788e6571dc404d5d261b9e1c8c.
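
Note: the hunks below restore the pre-FTSTokenizer call pattern. As a rough illustration (a sketch distilled from the fts_matcher.cpp hunk in this diff, not code added by the commit), the abstraction being reverted consumed terms through the FTSTokenizer interface:

    // Removed by this revert: tokenization behind the FTSTokenizer interface
    // (as used in FTSMatcher::_hasPositiveTerm_string).
    std::unique_ptr<FTSTokenizer> tokenizer(language->createTokenizer());
    tokenizer->reset(raw.c_str(), _query.getCaseSensitive());
    while (tokenizer->moveNext()) {
        std::string word = tokenizer->get().toString();
        // ... look the stemmed word up in the positive/negative term sets ...
    }

whereas the restored code drives Tokenizer, Stemmer, and the stop-word list directly:

    // Restored by this revert: explicit Tokenizer + Stemmer handling.
    Tokenizer i( *language, raw );
    Stemmer stemmer( *language );
    while ( i.more() ) {
        Token t = i.next();
        if ( t.type != Token::TEXT ) {
            continue;
        }
        std::string word = stemmer.stem( _query.normalizeString( t.data ) );
        // ... look the stemmed word up in the positive/negative term sets ...
    }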
-rw-r--r--  src/mongo/db/fts/SConscript                |   1
-rw-r--r--  src/mongo/db/fts/fts_basic_tokenizer.cpp   |  90
-rw-r--r--  src/mongo/db/fts/fts_basic_tokenizer.h     |  79
-rw-r--r--  src/mongo/db/fts/fts_language.cpp          |   6
-rw-r--r--  src/mongo/db/fts/fts_language.h            |  24
-rw-r--r--  src/mongo/db/fts/fts_matcher.cpp           |  31
-rw-r--r--  src/mongo/db/fts/fts_query.cpp             |  70
-rw-r--r--  src/mongo/db/fts/fts_query.h               |   3
-rw-r--r--  src/mongo/db/fts/fts_spec.cpp              |  22
-rw-r--r--  src/mongo/db/fts/fts_spec.h                |   2
-rw-r--r--  src/mongo/db/fts/fts_spec_legacy.cpp       |   6
-rw-r--r--  src/mongo/db/fts/fts_tokenizer.h           |  72
-rw-r--r--  src/mongo/db/fts/stemmer.cpp               |   6
-rw-r--r--  src/mongo/db/fts/stemmer.h                 |   2
-rw-r--r--  src/mongo/db/fts/stemmer_test.cpp          |   4
-rw-r--r--  src/mongo/db/fts/stop_words.cpp            |  15
-rw-r--r--  src/mongo/db/fts/stop_words.h              |   2
-rw-r--r--  src/mongo/db/fts/stop_words_test.cpp       |   2
-rw-r--r--  src/mongo/db/fts/tokenizer.cpp             |   4
-rw-r--r--  src/mongo/db/fts/tokenizer.h               |   2
-rw-r--r--  src/mongo/db/fts/tokenizer_test.cpp        |  12
21 files changed, 100 insertions(+), 355 deletions(-)
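
Besides removing the FTSTokenizer interface, the revert switches the helper classes back to taking FTSLanguage by reference rather than by pointer. A condensed view of the restored declarations, paraphrased from the headers touched below (comments added here for orientation only):

    // stemmer.h: the stemmer is constructed from a language reference again.
    Stemmer( const FTSLanguage& language );

    // stop_words.h: stop-word lookup keys off the language reference.
    static const StopWords* getStopWords( const FTSLanguage& language );

    // tokenizer.h: the low-level tokenizer likewise takes a reference.
    Tokenizer( const FTSLanguage& language, StringData str );

    // fts_language.h: the virtual factory removed together with FTSTokenizer:
    // virtual std::unique_ptr<FTSTokenizer> createTokenizer() const = 0;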
diff --git a/src/mongo/db/fts/SConscript b/src/mongo/db/fts/SConscript
index 5a782014600..6ccc070fd64 100644
--- a/src/mongo/db/fts/SConscript
+++ b/src/mongo/db/fts/SConscript
@@ -33,7 +33,6 @@ baseEnv.Library('base', [
'fts_spec.cpp',
'fts_spec_legacy.cpp',
'fts_language.cpp',
- 'fts_basic_tokenizer.cpp',
'fts_util.cpp',
'fts_element_iterator.cpp',
'stemmer.cpp',
diff --git a/src/mongo/db/fts/fts_basic_tokenizer.cpp b/src/mongo/db/fts/fts_basic_tokenizer.cpp
deleted file mode 100644
index 296f473f144..00000000000
--- a/src/mongo/db/fts/fts_basic_tokenizer.cpp
+++ /dev/null
@@ -1,90 +0,0 @@
-/**
- * Copyright (C) 2015 MongoDB Inc.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License, version 3,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * As a special exception, the copyright holders give permission to link the
- * code of portions of this program with the OpenSSL library under certain
- * conditions as described in each individual source file and distribute
- * linked combinations including the program with the OpenSSL library. You
- * must comply with the GNU Affero General Public License in all respects for
- * all of the code used other than as permitted herein. If you modify file(s)
- * with this exception, you may extend this exception to your version of the
- * file(s), but you are not obligated to do so. If you do not wish to do so,
- * delete this exception statement from your version. If you delete this
- * exception statement from all source files in the program, then also delete
- * it in the license file.
- */
-
-#include "mongo/platform/basic.h"
-
-#include "mongo/db/fts/fts_basic_tokenizer.h"
-
-#include "mongo/db/fts/fts_query.h"
-#include "mongo/db/fts/fts_spec.h"
-#include "mongo/db/fts/stemmer.h"
-#include "mongo/db/fts/stop_words.h"
-#include "mongo/db/fts/tokenizer.h"
-#include "mongo/stdx/memory.h"
-#include "mongo/util/mongoutils/str.h"
-#include "mongo/util/stringutils.h"
-
-namespace mongo {
-namespace fts {
-
- using std::string;
-
- BasicFTSTokenizer::BasicFTSTokenizer(const FTSLanguage* language)
- : _language(language), _stemmer(language), _stopWords(StopWords::getStopWords(language)) {
- }
-
- void BasicFTSTokenizer::reset(const char* document, bool generateCaseSensitiveTokens) {
- _generateCaseSensitiveTokens = generateCaseSensitiveTokens;
- _tokenizer = stdx::make_unique<Tokenizer>(_language, document);
- }
-
- bool BasicFTSTokenizer::moveNext() {
- while (true) {
- bool hasMore = _tokenizer->more();
- if (!hasMore) {
- _stem = "";
- return false;
- }
-
- Token token = _tokenizer->next();
-
- string word = token.data.toString();
-
- word = tolowerString(token.data);
-
- // Stop words are case-sensitive so we need them to be lower cased to check
- // against the stop word list
- if (_stopWords->isStopWord(word)) {
- continue;
- }
-
- if (_generateCaseSensitiveTokens) {
- word = token.data.toString();
- }
-
- _stem = _stemmer.stem(word);
- return true;
- }
- }
-
- StringData BasicFTSTokenizer::get() const {
- return _stem;
- }
-
-} // namespace fts
-} // namespace mongo
diff --git a/src/mongo/db/fts/fts_basic_tokenizer.h b/src/mongo/db/fts/fts_basic_tokenizer.h
deleted file mode 100644
index fd59a4583fc..00000000000
--- a/src/mongo/db/fts/fts_basic_tokenizer.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/**
- * Copyright (C) 2015 MongoDB Inc.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License, version 3,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * As a special exception, the copyright holders give permission to link the
- * code of portions of this program with the OpenSSL library under certain
- * conditions as described in each individual source file and distribute
- * linked combinations including the program with the OpenSSL library. You
- * must comply with the GNU Affero General Public License in all respects for
- * all of the code used other than as permitted herein. If you modify file(s)
- * with this exception, you may extend this exception to your version of the
- * file(s), but you are not obligated to do so. If you do not wish to do so,
- * delete this exception statement from your version. If you delete this
- * exception statement from all source files in the program, then also delete
- * it in the license file.
- */
-
-#pragma once
-
-#include "mongo/base/disallow_copying.h"
-#include "mongo/base/string_data.h"
-#include "mongo/db/fts/fts_tokenizer.h"
-#include "mongo/db/fts/stemmer.h"
-#include "mongo/db/fts/tokenizer.h"
-
-namespace mongo {
-namespace fts {
-
- class FTSLanguage;
- class StopWords;
-
- /**
- * BasicFTSTokenizer
- * A iterator of "documents" where a document contains ASCII space (U+0020) delimited words.
- * Uses
- * - Tokenizer for tokenizing words via ASCII space (ie, U+0020 space).
- * - tolower from the C standard libary to lower letters, ie, it only supports lower casing
- * - ASCII letters (U+0000 - U+007F)
- * - Stemmer (ie, Snowball Stemmer) to stem words.
- * - Embeded stop word lists for each language in StopWord class
- *
- * For each word returns a stem version of a word optimized for full text indexing.
- * Optionally supports returning case sensitive search terms.
- */
- class BasicFTSTokenizer : public FTSTokenizer {
- MONGO_DISALLOW_COPYING(BasicFTSTokenizer);
- public:
- BasicFTSTokenizer(const FTSLanguage* language);
-
- void reset(const char* document, bool generateCaseSensitiveTokens) override;
-
- bool moveNext() override;
-
- StringData get() const override;
-
- private:
- const FTSLanguage* const _language;
- const Stemmer _stemmer;
- const StopWords* const _stopWords;
-
- std::unique_ptr<Tokenizer> _tokenizer;
- bool _generateCaseSensitiveTokens;
-
- std::string _stem;
- };
-
-} // namespace fts
-} // namespace mongo
diff --git a/src/mongo/db/fts/fts_language.cpp b/src/mongo/db/fts/fts_language.cpp
index 9164d157508..edb2b7cf363 100644
--- a/src/mongo/db/fts/fts_language.cpp
+++ b/src/mongo/db/fts/fts_language.cpp
@@ -33,8 +33,6 @@
#include <string>
#include "mongo/base/init.h"
-#include "mongo/db/fts/fts_basic_tokenizer.h"
-#include "mongo/stdx/memory.h"
#include "mongo/util/assert_util.h"
#include "mongo/util/mongoutils/str.h"
#include "mongo/util/string_map.h"
@@ -81,10 +79,6 @@ namespace mongo {
LanguageMapV1 languageMapV1;
}
- std::unique_ptr<FTSTokenizer> BasicFTSLanguage::createTokenizer() const {
- return stdx::make_unique<BasicFTSTokenizer>(this);
- }
-
MONGO_INITIALIZER_GROUP( FTSAllLanguagesRegistered, MONGO_NO_PREREQUISITES,
MONGO_NO_DEPENDENTS );
diff --git a/src/mongo/db/fts/fts_language.h b/src/mongo/db/fts/fts_language.h
index ce45e0b812a..3a9acbbdd94 100644
--- a/src/mongo/db/fts/fts_language.h
+++ b/src/mongo/db/fts/fts_language.h
@@ -39,10 +39,8 @@ namespace mongo {
namespace fts {
- class FTSTokenizer;
-
#define MONGO_FTS_LANGUAGE_DECLARE( language, name, version ) \
- BasicFTSLanguage language; \
+ FTSLanguage language; \
MONGO_INITIALIZER_GENERAL( language, MONGO_NO_PREREQUISITES, \
( "FTSAllLanguagesRegistered" ) ) \
( ::mongo::InitializerContext* context ) { \
@@ -72,8 +70,6 @@ namespace mongo {
/** Create an uninitialized language. */
FTSLanguage();
- virtual ~FTSLanguage() {}
-
/**
* Returns the language as a std::string in canonical form (lowercased English name). It is
* an error to call str() on an uninitialized language.
@@ -81,12 +77,6 @@ namespace mongo {
const std::string& str() const;
/**
- * Returns a new FTSTokenizer instance for this language.
- * Lifetime is scoped to FTSLanguage (which are currently all process lifetime)
- */
- virtual std::unique_ptr<FTSTokenizer> createTokenizer() const = 0;
-
- /**
* Register std::string 'languageName' as a new language with text index version
* 'textIndexVersion'. Saves the resulting language to out-argument 'languageOut'.
* Subsequent calls to FTSLanguage::make() will recognize the newly-registered language
@@ -130,15 +120,9 @@ namespace mongo {
typedef StatusWith<const FTSLanguage*> StatusWithFTSLanguage;
-
- class BasicFTSLanguage : public FTSLanguage {
- public:
- std::unique_ptr<FTSTokenizer> createTokenizer() const override;
- };
-
- extern BasicFTSLanguage languagePorterV1;
- extern BasicFTSLanguage languageEnglishV2;
- extern BasicFTSLanguage languageFrenchV2;
+ extern FTSLanguage languagePorterV1;
+ extern FTSLanguage languageEnglishV2;
+ extern FTSLanguage languageFrenchV2;
}
}
diff --git a/src/mongo/db/fts/fts_matcher.cpp b/src/mongo/db/fts/fts_matcher.cpp
index 634bcf345cd..492dbdf7b7b 100644
--- a/src/mongo/db/fts/fts_matcher.cpp
+++ b/src/mongo/db/fts/fts_matcher.cpp
@@ -31,7 +31,6 @@
#include "mongo/platform/basic.h"
#include "mongo/db/fts/fts_matcher.h"
-#include "mongo/db/fts/fts_tokenizer.h"
#include "mongo/db/fts/fts_element_iterator.h"
#include "mongo/platform/strcasestr.h"
@@ -97,13 +96,15 @@ namespace mongo {
bool FTSMatcher::_hasPositiveTerm_string( const FTSLanguage* language,
const string& raw ) const {
- std::unique_ptr<FTSTokenizer> tokenizer(language->createTokenizer());
-
- tokenizer->reset(raw.c_str(), _query.getCaseSensitive());
-
- while (tokenizer->moveNext()) {
- string word = tokenizer->get().toString();
- if (_query.getPositiveTerms().count(word) > 0) {
+ Tokenizer i( *language, raw );
+ Stemmer stemmer( *language );
+ while ( i.more() ) {
+ Token t = i.next();
+ if ( t.type != Token::TEXT ) {
+ continue;
+ }
+ string word = stemmer.stem( _query.normalizeString( t.data ) );
+ if ( _query.getPositiveTerms().count( word ) > 0 ) {
return true;
}
}
@@ -129,12 +130,14 @@ namespace mongo {
bool FTSMatcher::_hasNegativeTerm_string( const FTSLanguage* language,
const string& raw ) const {
- std::unique_ptr<FTSTokenizer> tokenizer(language->createTokenizer());
-
- tokenizer->reset(raw.c_str(), _query.getCaseSensitive());
-
- while (tokenizer->moveNext()) {
- string word = tokenizer->get().toString();
+ Tokenizer i( *language, raw );
+ Stemmer stemmer( *language );
+ while ( i.more() ) {
+ Token t = i.next();
+ if ( t.type != Token::TEXT ) {
+ continue;
+ }
+ string word = stemmer.stem( _query.normalizeString( t.data ) );
if ( _query.getNegatedTerms().count( word ) > 0 ) {
return true;
}
diff --git a/src/mongo/db/fts/fts_query.cpp b/src/mongo/db/fts/fts_query.cpp
index e05aa5693cc..9088719d11e 100644
--- a/src/mongo/db/fts/fts_query.cpp
+++ b/src/mongo/db/fts/fts_query.cpp
@@ -33,7 +33,6 @@
#include "mongo/db/fts/fts_query.h"
#include "mongo/db/fts/fts_spec.h"
-#include "mongo/db/fts/fts_tokenizer.h"
#include "mongo/db/fts/tokenizer.h"
#include "mongo/util/mongoutils/str.h"
#include "mongo/util/stringutils.h"
@@ -60,14 +59,15 @@ namespace mongo {
_language = swl.getValue();
_caseSensitive = caseSensitive;
- std::unique_ptr<FTSTokenizer> tokenizer(_language->createTokenizer());
+ const StopWords& stopWords = *StopWords::getStopWords( *_language );
+ Stemmer stemmer( *_language );
bool inNegation = false;
bool inPhrase = false;
unsigned quoteOffset = 0;
- Tokenizer i( _language, query );
+ Tokenizer i( *_language, query );
while ( i.more() ) {
Token t = i.next();
@@ -78,7 +78,7 @@ namespace mongo {
// don't add term
}
else {
- _addTerm( tokenizer.get(), s, inNegation );
+ _addTerm( stopWords, stemmer, s, inNegation );
}
if ( inNegation && !inPhrase )
@@ -122,52 +122,44 @@ namespace mongo {
return Status::OK();
}
- void FTSQuery::_addTerm( FTSTokenizer* tokenizer,
+ void FTSQuery::_addTerm( const StopWords& sw,
+ const Stemmer& stemmer,
const string& token,
bool negated ) {
- tokenizer->reset(token.c_str(), false);
-
- auto& activeTerms = negated ? _negatedTerms : _positiveTerms;
-
- // First, get all the terms for indexing, ie, lower cased words
- // If we are case-insensitive, we can also used this for positive, and negative terms
- // Some terms may be expanded into multiple words in some non-English languages
- while (tokenizer->moveNext()) {
-
- string word = tokenizer->get().toString();
-
- if (!negated) {
- _termsForBounds.insert(word);
- }
-
- // Compute the string corresponding to 'token' that will be used for the matcher.
- // For case-insensitive queries, this is the same string as 'boundsTerm' computed
- // above.
- if (!_caseSensitive) {
- activeTerms.insert(word);
- }
- }
-
- if (!_caseSensitive) {
+ // Compute the string corresponding to 'token' that will be used for index bounds
+ // generation.
+ string boundsTerm = tolowerString( token );
+ if ( sw.isStopWord( boundsTerm ) ) {
return;
}
+ boundsTerm = stemmer.stem( boundsTerm );
- tokenizer->reset(token.c_str(), true);
-
- // If we want case-sensitivity, get the case-sensitive token
- while (tokenizer->moveNext()) {
-
- string word = tokenizer->get().toString();
+ // If the lowercased version of 'token' is a not a stop word, 'token' itself should also
+ // not be.
+ dassert( !sw.isStopWord( token ) );
+ if ( !negated ) {
+ _termsForBounds.insert( boundsTerm );
+ }
- activeTerms.insert(word);
+ // Compute the string corresponding to 'token' that will be used for the matcher. For
+ // case-insensitive queries, this is the same string as 'boundsTerm' computed above.
+ // However, for case-sensitive queries we need to re-stem the original token, since
+ // 'boundsTerm' is already lowercased but we need the original casing for an exact
+ // match.
+ const string& matcherTerm = _caseSensitive ? stemmer.stem( token ) : boundsTerm;
+ if ( negated ) {
+ _negatedTerms.insert( matcherTerm );
+ }
+ else {
+ _positiveTerms.insert( matcherTerm );
}
}
- string FTSQuery::normalizeString(StringData str) const {
- if (_caseSensitive) {
+ string FTSQuery::normalizeString( StringData str ) const {
+ if ( _caseSensitive ) {
return str.toString();
}
- return tolowerString(str);
+ return tolowerString( str );
}
namespace {
diff --git a/src/mongo/db/fts/fts_query.h b/src/mongo/db/fts/fts_query.h
index f9ea7f2d1eb..96317c926e5 100644
--- a/src/mongo/db/fts/fts_query.h
+++ b/src/mongo/db/fts/fts_query.h
@@ -79,7 +79,8 @@ namespace mongo {
static const bool caseSensitiveDefault;
private:
- void _addTerm( FTSTokenizer* tokenizer,
+ void _addTerm( const StopWords& sw,
+ const Stemmer& stemmer,
const std::string& token,
bool negated );
diff --git a/src/mongo/db/fts/fts_spec.cpp b/src/mongo/db/fts/fts_spec.cpp
index 9e68835e83b..fdd9ecf7824 100644
--- a/src/mongo/db/fts/fts_spec.cpp
+++ b/src/mongo/db/fts/fts_spec.cpp
@@ -33,7 +33,6 @@
#include "mongo/db/field_ref.h"
#include "mongo/db/fts/fts_element_iterator.h"
-#include "mongo/db/fts/fts_tokenizer.h"
#include "mongo/db/fts/fts_util.h"
#include "mongo/util/mongoutils/str.h"
#include "mongo/util/stringutils.h"
@@ -168,12 +167,13 @@ namespace mongo {
while ( it.more() ) {
FTSIteratorValue val = it.next();
- std::unique_ptr<FTSTokenizer> tokenizer(val._language->createTokenizer());
- _scoreStringV2( tokenizer.get(), val._text, term_freqs, val._weight );
+ Stemmer stemmer( *val._language );
+ Tools tools( *val._language, &stemmer, StopWords::getStopWords( *val._language ) );
+ _scoreStringV2( tools, val._text, term_freqs, val._weight );
}
}
- void FTSSpec::_scoreStringV2( FTSTokenizer* tokenizer,
+ void FTSSpec::_scoreStringV2( const Tools& tools,
StringData raw,
TermFrequencyMap* docScores,
double weight ) const {
@@ -182,10 +182,18 @@ namespace mongo {
unsigned numTokens = 0;
- tokenizer->reset(raw.rawData(), false );
+ Tokenizer i( tools.language, raw );
+ while ( i.more() ) {
+ Token t = i.next();
+ if ( t.type != Token::TEXT ) {
+ continue;
+ }
- while (tokenizer->moveNext()) {
- string term = tokenizer->get().toString();
+ string term = tolowerString( t.data );
+ if ( tools.stopwords->isStopWord( term ) ) {
+ continue;
+ }
+ term = tools.stemmer->stem( term );
ScoreHelperStruct& data = terms[term];
diff --git a/src/mongo/db/fts/fts_spec.h b/src/mongo/db/fts/fts_spec.h
index 0f17d825dcc..c35bc2c9010 100644
--- a/src/mongo/db/fts/fts_spec.h
+++ b/src/mongo/db/fts/fts_spec.h
@@ -119,7 +119,7 @@ namespace mongo {
* Calculate the term scores for 'raw' and update 'term_freqs' with the result. Parses
* 'raw' using 'tools', and weights term scores based on 'weight'.
*/
- void _scoreStringV2( FTSTokenizer* tokenizer,
+ void _scoreStringV2( const Tools& tools,
StringData raw,
TermFrequencyMap* term_freqs,
double weight ) const;
diff --git a/src/mongo/db/fts/fts_spec_legacy.cpp b/src/mongo/db/fts/fts_spec_legacy.cpp
index a2dc1dc2489..69721fe2ae0 100644
--- a/src/mongo/db/fts/fts_spec_legacy.cpp
+++ b/src/mongo/db/fts/fts_spec_legacy.cpp
@@ -73,7 +73,7 @@ namespace mongo {
unsigned numTokens = 0;
- Tokenizer i( &tools.language, raw );
+ Tokenizer i( tools.language, raw );
while ( i.more() ) {
Token t = i.next();
if ( t.type != Token::TEXT )
@@ -162,8 +162,8 @@ namespace mongo {
const FTSLanguage& language = _getLanguageToUseV1( obj );
- Stemmer stemmer(&language);
- Tools tools(language, &stemmer, StopWords::getStopWords( &language ));
+ Stemmer stemmer(language);
+ Tools tools(language, &stemmer, StopWords::getStopWords( language ));
if ( wildcard() ) {
// if * is specified for weight, we can recurse over all fields.
diff --git a/src/mongo/db/fts/fts_tokenizer.h b/src/mongo/db/fts/fts_tokenizer.h
deleted file mode 100644
index 65833aff0cb..00000000000
--- a/src/mongo/db/fts/fts_tokenizer.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/**
- * Copyright (C) 2015 MongoDB Inc.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License, version 3,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * As a special exception, the copyright holders give permission to link the
- * code of portions of this program with the OpenSSL library under certain
- * conditions as described in each individual source file and distribute
- * linked combinations including the program with the OpenSSL library. You
- * must comply with the GNU Affero General Public License in all respects for
- * all of the code used other than as permitted herein. If you modify file(s)
- * with this exception, you may extend this exception to your version of the
- * file(s), but you are not obligated to do so. If you do not wish to do so,
- * delete this exception statement from your version. If you delete this
- * exception statement from all source files in the program, then also delete
- * it in the license file.
- */
-
-
-#pragma once
-
-#include "mongo/base/disallow_copying.h"
-#include "mongo/base/string_data.h"
-
-namespace mongo {
-namespace fts {
-
- class FTSLanguage;
- class StopWords;
-
- /**
- * FTSTokenizer
- * A iterator of "documents" where a document contains space delimited words.
- * For each word returns a stem or lemma version of a word optimized for full text indexing.
- * Optionally supports returning case sensitive search terms.
- */
- class FTSTokenizer {
- public:
- virtual ~FTSTokenizer() = default;
-
- /**
- * Process a new document, and discards any previous results.
- * May be called multiple times on an instance of an iterator.
- */
- virtual void reset(const char* document, bool generateCaseSensitiveTokens) = 0;
-
- /**
- * Moves to the next token in the iterator.
- * Returns false when the iterator reaches end of the document.
- */
- virtual bool moveNext() = 0;
-
- /**
- * Returns stemmed form, normalized, and lowercased depending on the parameter
- * to the reset method.
- * Returned StringData is valid until next call to moveNext().
- */
- virtual StringData get() const = 0;
- };
-
-} // namespace fts
-} // namespace mongo
diff --git a/src/mongo/db/fts/stemmer.cpp b/src/mongo/db/fts/stemmer.cpp
index 9353fccf297..4a734dbe316 100644
--- a/src/mongo/db/fts/stemmer.cpp
+++ b/src/mongo/db/fts/stemmer.cpp
@@ -40,10 +40,10 @@ namespace mongo {
using std::string;
- Stemmer::Stemmer( const FTSLanguage* language ) {
+ Stemmer::Stemmer( const FTSLanguage& language ) {
_stemmer = NULL;
- if ( language->str() != "none" )
- _stemmer = sb_stemmer_new(language->str().c_str(), "UTF_8");
+ if ( language.str() != "none" )
+ _stemmer = sb_stemmer_new(language.str().c_str(), "UTF_8");
}
Stemmer::~Stemmer() {
diff --git a/src/mongo/db/fts/stemmer.h b/src/mongo/db/fts/stemmer.h
index d6d76e64218..6abba8abddc 100644
--- a/src/mongo/db/fts/stemmer.h
+++ b/src/mongo/db/fts/stemmer.h
@@ -49,7 +49,7 @@ namespace mongo {
class Stemmer {
MONGO_DISALLOW_COPYING( Stemmer );
public:
- Stemmer( const FTSLanguage* language );
+ Stemmer( const FTSLanguage& language );
~Stemmer();
std::string stem( StringData word ) const;
diff --git a/src/mongo/db/fts/stemmer_test.cpp b/src/mongo/db/fts/stemmer_test.cpp
index bef556bf2ad..9037715d4da 100644
--- a/src/mongo/db/fts/stemmer_test.cpp
+++ b/src/mongo/db/fts/stemmer_test.cpp
@@ -38,13 +38,13 @@ namespace mongo {
namespace fts {
TEST( English, Stemmer1 ) {
- Stemmer s( &languageEnglishV2 );
+ Stemmer s( languageEnglishV2 );
ASSERT_EQUALS( "run", s.stem( "running" ) );
ASSERT_EQUALS( "Run", s.stem( "Running" ) );
}
TEST( English, Caps ) {
- Stemmer s( &languagePorterV1 );
+ Stemmer s( languagePorterV1 );
ASSERT_EQUALS( "unit", s.stem( "united" ) );
ASSERT_EQUALS( "Unite", s.stem( "United" ) );
}
diff --git a/src/mongo/db/fts/stop_words.cpp b/src/mongo/db/fts/stop_words.cpp
index 421bfae63db..66240a1ce2d 100644
--- a/src/mongo/db/fts/stop_words.cpp
+++ b/src/mongo/db/fts/stop_words.cpp
@@ -28,6 +28,7 @@
* it in the license file.
*/
+#include <boost/shared_ptr.hpp>
#include <set>
#include <string>
@@ -36,14 +37,18 @@
#include "mongo/base/init.h"
#include "mongo/util/string_map.h"
+
+
namespace mongo {
+ using boost::shared_ptr;
+
namespace fts {
void loadStopWordMap( StringMap< std::set< std::string > >* m );
namespace {
- StringMap< std::shared_ptr<StopWords> > StopWordsMap;
+ StringMap< boost::shared_ptr<StopWords> > STOP_WORDS;
StopWords empty;
}
@@ -56,9 +61,9 @@ namespace mongo {
_words.insert( *i );
}
- const StopWords* StopWords::getStopWords( const FTSLanguage* language ) {
- auto i = StopWordsMap.find( language->str() );
- if ( i == StopWordsMap.end() )
+ const StopWords* StopWords::getStopWords( const FTSLanguage& language ) {
+ StringMap< boost::shared_ptr<StopWords> >::const_iterator i = STOP_WORDS.find( language.str() );
+ if ( i == STOP_WORDS.end() )
return &empty;
return i->second.get();
}
@@ -70,7 +75,7 @@ namespace mongo {
for ( StringMap< std::set< std::string > >::const_iterator i = raw.begin();
i != raw.end();
++i ) {
- StopWordsMap[i->first].reset(new StopWords( i->second ));
+ STOP_WORDS[i->first].reset(new StopWords( i->second ));
}
return Status::OK();
}
diff --git a/src/mongo/db/fts/stop_words.h b/src/mongo/db/fts/stop_words.h
index d989b4dcd32..4789535ef4d 100644
--- a/src/mongo/db/fts/stop_words.h
+++ b/src/mongo/db/fts/stop_words.h
@@ -53,7 +53,7 @@ namespace mongo {
size_t numStopWords() const { return _words.size(); }
- static const StopWords* getStopWords( const FTSLanguage* language );
+ static const StopWords* getStopWords( const FTSLanguage& language );
private:
unordered_set<std::string> _words;
};
diff --git a/src/mongo/db/fts/stop_words_test.cpp b/src/mongo/db/fts/stop_words_test.cpp
index 248c4d93407..0edf4e2540c 100644
--- a/src/mongo/db/fts/stop_words_test.cpp
+++ b/src/mongo/db/fts/stop_words_test.cpp
@@ -36,7 +36,7 @@ namespace mongo {
namespace fts {
TEST( English, Basic1 ) {
- const StopWords* englishStopWords = StopWords::getStopWords( &languageEnglishV2 );
+ const StopWords* englishStopWords = StopWords::getStopWords( languageEnglishV2 );
ASSERT( englishStopWords->isStopWord( "the" ) );
ASSERT( !englishStopWords->isStopWord( "computer" ) );
}
diff --git a/src/mongo/db/fts/tokenizer.cpp b/src/mongo/db/fts/tokenizer.cpp
index 6896924ae31..ee60f99d588 100644
--- a/src/mongo/db/fts/tokenizer.cpp
+++ b/src/mongo/db/fts/tokenizer.cpp
@@ -38,9 +38,9 @@ namespace mongo {
namespace fts {
- Tokenizer::Tokenizer( const FTSLanguage* language, StringData str )
+ Tokenizer::Tokenizer( const FTSLanguage& language, StringData str )
: _pos(0), _raw( str ) {
- _english = ( language->str() == "english" );
+ _english = ( language.str() == "english" );
_skipWhitespace();
_previousWhiteSpace = true;
}
diff --git a/src/mongo/db/fts/tokenizer.h b/src/mongo/db/fts/tokenizer.h
index 6e449124b3a..cd0d76a4f70 100644
--- a/src/mongo/db/fts/tokenizer.h
+++ b/src/mongo/db/fts/tokenizer.h
@@ -62,7 +62,7 @@ namespace mongo {
MONGO_DISALLOW_COPYING( Tokenizer );
public:
- Tokenizer( const FTSLanguage* language, StringData str );
+ Tokenizer( const FTSLanguage& language, StringData str );
bool more() const;
Token next();
diff --git a/src/mongo/db/fts/tokenizer_test.cpp b/src/mongo/db/fts/tokenizer_test.cpp
index a6692f3456d..29153a329a6 100644
--- a/src/mongo/db/fts/tokenizer_test.cpp
+++ b/src/mongo/db/fts/tokenizer_test.cpp
@@ -36,12 +36,12 @@ namespace mongo {
namespace fts {
TEST( Tokenizer, Empty1 ) {
- Tokenizer i( &languageEnglishV2, "" );
+ Tokenizer i( languageEnglishV2, "" );
ASSERT( !i.more() );
}
TEST( Tokenizer, Basic1 ) {
- Tokenizer i( &languageEnglishV2, "blue red green" );
+ Tokenizer i( languageEnglishV2, "blue red green" );
ASSERT( i.more() );
ASSERT_EQUALS( i.next().data.toString(), "blue" );
@@ -56,7 +56,7 @@ namespace mongo {
}
TEST( Tokenizer, Basic2 ) {
- Tokenizer i( &languageEnglishV2, "blue-red" );
+ Tokenizer i( languageEnglishV2, "blue-red" );
Token a = i.next();
Token b = i.next();
@@ -78,7 +78,7 @@ namespace mongo {
}
TEST( Tokenizer, Basic3 ) {
- Tokenizer i( &languageEnglishV2, "blue -red" );
+ Tokenizer i( languageEnglishV2, "blue -red" );
Token a = i.next();
Token b = i.next();
@@ -105,7 +105,7 @@ namespace mongo {
}
TEST( Tokenizer, Quote1English ) {
- Tokenizer i( &languageEnglishV2, "eliot's car" );
+ Tokenizer i( languageEnglishV2, "eliot's car" );
Token a = i.next();
Token b = i.next();
@@ -115,7 +115,7 @@ namespace mongo {
}
TEST( Tokenizer, Quote1French ) {
- Tokenizer i( &languageFrenchV2, "eliot's car" );
+ Tokenizer i( languageFrenchV2, "eliot's car" );
Token a = i.next();
Token b = i.next();