diff options
author | Eliot Horowitz <eliot@10gen.com> | 2012-12-25 12:08:28 -0500 |
---|---|---|
committer | Eliot Horowitz <eliot@10gen.com> | 2012-12-25 12:25:45 -0500 |
commit | f201972ecc87f099777e1c61f269998f4399caf4 (patch) | |
tree | e23f1743cf486acbef64bd825b00bd82bb573d95 /src/mongo/db/fts/tokenizer.h | |
parent | d2df300721805ace411b5d1a87cb4bf6d8a51ff3 (diff) | |
download | mongo-f201972ecc87f099777e1c61f269998f4399caf4.tar.gz |
SERVER-380: Experimental text search indexing
Diffstat (limited to 'src/mongo/db/fts/tokenizer.h')
-rw-r--r-- | src/mongo/db/fts/tokenizer.h | 68 |
1 files changed, 68 insertions, 0 deletions
diff --git a/src/mongo/db/fts/tokenizer.h b/src/mongo/db/fts/tokenizer.h new file mode 100644 index 00000000000..5b9a56ed8d6 --- /dev/null +++ b/src/mongo/db/fts/tokenizer.h @@ -0,0 +1,68 @@ +// tokenizer.h + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + + +#pragma once + +#include <string> + +#include "mongo/base/string_data.h" +#include "mongo/platform/unordered_map.h" +#include "mongo/platform/unordered_set.h" + +namespace mongo { + + namespace fts { + + struct Token { + enum Type { WHITESPACE, DELIMITER, TEXT, INVALID }; + Token( Type type, const StringData& data, unsigned offset, bool previousWhiteSpace ) + : type( type ), + data( data ), + offset( offset ), + previousWhiteSpace( previousWhiteSpace ) {} + + bool ok() const { return type != INVALID; } + + Type type; + StringData data; + unsigned offset; + bool previousWhiteSpace; + }; + + class Tokenizer { + public: + + Tokenizer( const std::string& language, const StringData& str ); + + bool more() const; + Token next(); + + private: + Token::Type _type( char c ) const; + bool _skipWhitespace(); + + unsigned _pos; + bool _previousWhiteSpace; + const StringData& _raw; + bool _english; + }; + + } +} + |