summaryrefslogtreecommitdiff
path: root/src/mongo/db/fts/tokenizer.h
diff options
context:
space:
mode:
authorEliot Horowitz <eliot@10gen.com>2012-12-25 12:08:28 -0500
committerEliot Horowitz <eliot@10gen.com>2012-12-25 12:25:45 -0500
commitf201972ecc87f099777e1c61f269998f4399caf4 (patch)
treee23f1743cf486acbef64bd825b00bd82bb573d95 /src/mongo/db/fts/tokenizer.h
parentd2df300721805ace411b5d1a87cb4bf6d8a51ff3 (diff)
downloadmongo-f201972ecc87f099777e1c61f269998f4399caf4.tar.gz
SERVER-380: Experimental text search indexing
Diffstat (limited to 'src/mongo/db/fts/tokenizer.h')
-rw-r--r--src/mongo/db/fts/tokenizer.h68
1 files changed, 68 insertions, 0 deletions
diff --git a/src/mongo/db/fts/tokenizer.h b/src/mongo/db/fts/tokenizer.h
new file mode 100644
index 00000000000..5b9a56ed8d6
--- /dev/null
+++ b/src/mongo/db/fts/tokenizer.h
@@ -0,0 +1,68 @@
+// tokenizer.h
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+#pragma once
+
+#include <string>
+
+#include "mongo/base/string_data.h"
+#include "mongo/platform/unordered_map.h"
+#include "mongo/platform/unordered_set.h"
+
+namespace mongo {
+
+ namespace fts {
+
+        /**
+         * A single unit handed out by the tokenizer: its category, the piece
+         * of text it covers, and bookkeeping about where it was found.
+         */
+        struct Token {
+            /** Category of a token. INVALID is the value for which ok() is false. */
+            enum Type { WHITESPACE, DELIMITER, TEXT, INVALID };
+
+            // Category of this token.
+            Type type;
+
+            // Text covered by this token.
+            // NOTE(review): StringData is presumably a non-owning view, so the
+            // buffer it refers to must outlive the Token -- confirm against
+            // mongo/base/string_data.h.
+            StringData data;
+
+            // Position value supplied by whoever builds the token.
+            // NOTE(review): presumably the offset of `data` within the input
+            // being tokenized -- confirm in tokenizer.cpp.
+            unsigned offset;
+
+            // Flag supplied by whoever builds the token.
+            // NOTE(review): presumably true when whitespace came immediately
+            // before this token -- confirm in tokenizer.cpp.
+            bool previousWhiteSpace;
+
+            /** Builds a token by storing the four arguments verbatim. */
+            Token( Type type, const StringData& data, unsigned offset, bool previousWhiteSpace )
+                : type( type ), data( data ), offset( offset ),
+                  previousWhiteSpace( previousWhiteSpace ) {}
+
+            /** @return true unless this token's type is INVALID. */
+            bool ok() const { return !( type == INVALID ); }
+        };
+
+        /**
+         * Hands out the contents of a string as a sequence of Token objects.
+         *
+         * Intended use, judging by the interface (the implementation lives in
+         * tokenizer.cpp): call next() while more() returns true.
+         *
+         * Lifetime: the Tokenizer keeps its own copy of the StringData it is
+         * given, so it is safe to construct it from a std::string or a string
+         * literal. NOTE(review): StringData itself is presumably a non-owning
+         * view, in which case the underlying character buffer must still
+         * outlive the Tokenizer and every Token it returns.
+         */
+        class Tokenizer {
+        public:
+
+            /**
+             * @param language name of the language the text is written in
+             * @param str      the text to tokenize
+             */
+            Tokenizer( const std::string& language, const StringData& str );
+
+            /** NOTE(review): presumably true while tokens remain -- see tokenizer.cpp. */
+            bool more() const;
+
+            /** NOTE(review): presumably returns the next token and advances -- see tokenizer.cpp. */
+            Token next();
+
+        private:
+            /** Maps a single character to a token category. */
+            Token::Type _type( char c ) const;
+            bool _skipWhitespace();
+
+            unsigned _pos;
+            bool _previousWhiteSpace;
+
+            // Held by value, NOT as `const StringData&`: the constructor
+            // argument is frequently a temporary StringData created by an
+            // implicit conversion from std::string / const char*, and that
+            // temporary is destroyed as soon as the constructor call finishes.
+            // A reference member would dangle from then on.
+            StringData _raw;
+
+            bool _english;
+        };
+
+ }
+}
+