diff options
author | Mark Benvenuto <mark.benvenuto@mongodb.com> | 2015-04-01 14:34:39 -0400 |
---|---|---|
committer | Mark Benvenuto <mark.benvenuto@mongodb.com> | 2015-04-01 14:58:13 -0400 |
commit | 72598f750d732c08c98f5f578bf1335acd78e10e (patch) | |
tree | d80364b07b25210f5724ba6e6506650be657c74e /src/mongo/db/fts/fts_basic_tokenizer.cpp | |
parent | 3cf0c18aa2c56949fda47ab35570489d68965370 (diff) | |
download | mongo-72598f750d732c08c98f5f578bf1335acd78e10e.tar.gz |
SERVER-17520: Add support for pluggable FTS tokenizers
Diffstat (limited to 'src/mongo/db/fts/fts_basic_tokenizer.cpp')
-rw-r--r-- | src/mongo/db/fts/fts_basic_tokenizer.cpp | 90 |
1 files changed, 90 insertions, 0 deletions
diff --git a/src/mongo/db/fts/fts_basic_tokenizer.cpp b/src/mongo/db/fts/fts_basic_tokenizer.cpp new file mode 100644 index 00000000000..296f473f144 --- /dev/null +++ b/src/mongo/db/fts/fts_basic_tokenizer.cpp @@ -0,0 +1,90 @@ +/** + * Copyright (C) 2015 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/platform/basic.h" + +#include "mongo/db/fts/fts_basic_tokenizer.h" + +#include "mongo/db/fts/fts_query.h" +#include "mongo/db/fts/fts_spec.h" +#include "mongo/db/fts/stemmer.h" +#include "mongo/db/fts/stop_words.h" +#include "mongo/db/fts/tokenizer.h" +#include "mongo/stdx/memory.h" +#include "mongo/util/mongoutils/str.h" +#include "mongo/util/stringutils.h" + +namespace mongo { +namespace fts { + + using std::string; + + BasicFTSTokenizer::BasicFTSTokenizer(const FTSLanguage* language) + : _language(language), _stemmer(language), _stopWords(StopWords::getStopWords(language)) { + } + + void BasicFTSTokenizer::reset(const char* document, bool generateCaseSensitiveTokens) { + _generateCaseSensitiveTokens = generateCaseSensitiveTokens; + _tokenizer = stdx::make_unique<Tokenizer>(_language, document); + } + + bool BasicFTSTokenizer::moveNext() { + while (true) { + bool hasMore = _tokenizer->more(); + if (!hasMore) { + _stem = ""; + return false; + } + + Token token = _tokenizer->next(); + + string word = token.data.toString(); + + word = tolowerString(token.data); + + // Stop words are case-sensitive so we need them to be lower cased to check + // against the stop word list + if (_stopWords->isStopWord(word)) { + continue; + } + + if (_generateCaseSensitiveTokens) { + word = token.data.toString(); + } + + _stem = _stemmer.stem(word); + return true; + } + } + + StringData BasicFTSTokenizer::get() const { + return _stem; + } + +} // namespace fts +} // namespace mongo |