1 files changed, 88 insertions, 0 deletions
diff --git a/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardTokenizer.h b/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardTokenizer.h
new file mode 100644
index 000000000..d4195be81
--- /dev/null
+++ b/src/assistant/3rdparty/clucene/src/CLucene/analysis/standard/StandardTokenizer.h
@@ -0,0 +1,88 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+* 
+* Distributable under the terms of either the Apache License (Version 2.0) or 
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#ifndef _lucene_analysis_standard_StandardTokenizer
+#define _lucene_analysis_standard_StandardTokenizer
+
+#if defined(_LUCENE_PRAGMA_ONCE)
+# pragma once
+#endif
+
+#include "../AnalysisHeader.h"
+#include "../Analyzers.h"
+#include "StandardTokenizerConstants.h"
+#include "CLucene/util/StringBuffer.h"
+#include "CLucene/util/FastCharStream.h"
+#include "CLucene/util/Reader.h"
+
+
+CL_NS_DEF2(analysis,standard)
+
+/** A grammar-based tokenizer constructed with JavaCC.
+ *
+ * <p> This should be a good tokenizer for most European-language documents:
+ *
+ * <ul>
+ *   <li>Splits words at punctuation characters, removing punctuation. However, a 
+ *     dot that's not followed by whitespace is considered part of a token.
+ *   <li>Splits words at hyphens, unless there's a number in the token, in which case
+ *     the whole token is interpreted as a product number and is not split.
+ *   <li>Recognizes email addresses and internet hostnames as one token.
+ * </ul>
+ *
+ * <p>Many applications have specific tokenizer needs.  If this tokenizer does
+ * not suit your application, please consider copying this source code
+ * directory to your project and maintaining your own grammar-based tokenizer.
+ */
+  class StandardTokenizer: public Tokenizer {
+  private:
+    int32_t rdPos; // Read position: incremented by readChar, decremented by unReadChar.
+    int32_t tokenStart; // NOTE(review): presumably the start position of the current token - confirm.
+
+    // Advances by one character: increments rdPos and returns the character read.
+    int readChar();
+    // Retreats by one character: decrements rdPos.
+    void unReadChar();
+
+    // setToken centralizes token setup for auditing purposes. It appears to replace the
+	// retired declaration: Token* createToken(CL_NS(util)::StringBuffer* sb, TokenTypes tokenCode);
+    inline bool setToken(Token* t, CL_NS(util)::StringBuffer* sb, TokenTypes tokenCode);
+
+    bool ReadDotted(CL_NS(util)::StringBuffer* str, TokenTypes forcedType,Token* t);
+
+  public:
+	CL_NS(util)::FastCharStream* rd; // NOTE(review): presumably the stream over the constructor's Reader - confirm.
+
+    // Constructs a tokenizer that takes its input from the given Reader.
+    StandardTokenizer(CL_NS(util)::Reader* reader);
+
+    ~StandardTokenizer();
+
+    /** Reads the next token in the stream into token; returns false at
+    * end-of-stream. The token's type is set to an element of
+    * StandardTokenizerConstants::tokenImage. */
+    bool next(Token* token);
+
+    // Reads a number such as "1" or "1234.567", or an IP address such as "192.168.1.2".
+    bool ReadNumber(const TCHAR* previousNumber, const TCHAR prev, Token* t);
+
+    bool ReadAlphaNum(const TCHAR prev, Token* t);
+
+    // Reads an apostrophe-containing word.
+    bool ReadApostrophe(CL_NS(util)::StringBuffer* str, Token* t);
+
+    // Reads something@...; it may be a COMPANY name or an EMAIL address.
+    bool ReadAt(CL_NS(util)::StringBuffer* str, Token* t);
+
+    // Reads a COMPANY name such as AT&T.
+    bool ReadCompany(CL_NS(util)::StringBuffer* str, Token* t);
+    
+    // Reads CJK characters.
+    bool ReadCJK(const TCHAR prev, Token* t);
+  };
+
+CL_NS_END2
+#endif