SERVER-380: Experimental text search indexing

author: Eliot Horowitz <eliot@10gen.com> 2012-12-25 12:08:28 -0500
committer: Eliot Horowitz <eliot@10gen.com> 2012-12-25 12:25:45 -0500
commit: f201972ecc87f099777e1c61f269998f4399caf4 (patch)
tree: e23f1743cf486acbef64bd825b00bd82bb573d95 /src/mongo/db/fts/tokenizer.h
parent: d2df300721805ace411b5d1a87cb4bf6d8a51ff3 (diff)
download: mongo-f201972ecc87f099777e1c61f269998f4399caf4.tar.gz
1 files changed, 68 insertions, 0 deletions
diff --git a/src/mongo/db/fts/tokenizer.h b/src/mongo/db/fts/tokenizer.h
new file mode 100644
index 00000000000..5b9a56ed8d6
--- /dev/null
+++ b/src/mongo/db/fts/tokenizer.h
@@ -0,0 +1,68 @@
+// tokenizer.h
+
+/**
+*    Copyright (C) 2012 10gen Inc.
+*
+*    This program is free software: you can redistribute it and/or  modify
+*    it under the terms of the GNU Affero General Public License, version 3,
+*    as published by the Free Software Foundation.
+*
+*    This program is distributed in the hope that it will be useful,
+*    but WITHOUT ANY WARRANTY; without even the implied warranty of
+*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+*    GNU Affero General Public License for more details.
+*
+*    You should have received a copy of the GNU Affero General Public License
+*    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+#pragma once
+
+#include <string>
+
+#include "mongo/base/string_data.h"
+#include "mongo/platform/unordered_map.h"
+#include "mongo/platform/unordered_set.h"
+
+namespace mongo {
+
+    namespace fts {
+
+        /** One token: its Type, its text as a StringData, and two positional fields. */
+        struct Token {
+            enum Type { WHITESPACE, DELIMITER, TEXT, INVALID };
+            Token( Type type, const StringData& data, unsigned offset, bool previousWhiteSpace )
+                : type( type ), data( data ), offset( offset ),
+                  previousWhiteSpace( previousWhiteSpace ) {}
+
+            /** @return true for every type except INVALID. */
+            bool ok() const { return !( type == INVALID ); }
+
+            Type type;               // category of this token
+            StringData data;         // token text, stored as a StringData copy
+            unsigned offset;         // presumably the position in the tokenized input -- confirm
+            bool previousWhiteSpace; // presumably set when whitespace preceded the token -- confirm
+        };
+
+        /** Iterator-style tokenizer: more() / next() hand out Token objects one at a time. */
+        class Tokenizer {
+        public:
+            /** NOTE(review): whatever storage `str` refers to presumably must outlive this object. */
+            Tokenizer( const std::string& language, const StringData& str );
+            bool more() const;
+            Token next();
+
+        private:
+            Token::Type _type( char c ) const;
+            bool _skipWhitespace();
+
+            unsigned _pos;
+            bool _previousWhiteSpace;
+            const StringData _raw; // held by value: a reference member dangles if bound to a temporary
+            bool _english;
+        };
+
+    }
+}
+
author	Eliot Horowitz <eliot@10gen.com>	2012-12-25 12:08:28 -0500
committer	Eliot Horowitz <eliot@10gen.com>	2012-12-25 12:25:45 -0500
commit	f201972ecc87f099777e1c61f269998f4399caf4 (patch)
tree	e23f1743cf486acbef64bd825b00bd82bb573d95 /src/mongo/db/fts/tokenizer.h
parent	d2df300721805ace411b5d1a87cb4bf6d8a51ff3 (diff)
download	mongo-f201972ecc87f099777e1c61f269998f4399caf4.tar.gz