diff options
Diffstat (limited to 'src/assistant/3rdparty/clucene/src/CLucene/analysis/AnalysisHeader.h')
-rw-r--r-- | src/assistant/3rdparty/clucene/src/CLucene/analysis/AnalysisHeader.h | 234 |
1 files changed, 234 insertions, 0 deletions
/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
*
* Distributable under the terms of either the Apache License (Version 2.0) or
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
#ifndef _lucene_analysis_AnalysisHeader_
#define _lucene_analysis_AnalysisHeader_

#if defined(_LUCENE_PRAGMA_ONCE)
# pragma once
#endif

#include "CLucene/util/Reader.h"

CL_NS_DEF(analysis)


/** A Token is an occurrence of a term from the text of a field. It consists of
* a term's text, the start and end offset of the term in the text of the field,
* and a type string.
*
* The start and end offsets permit applications to re-associate a token with
* its source text, e.g., to display highlighted query terms in a document
* browser, or to show matching text fragments in a KWIC (KeyWord In Context)
* display, etc.
*
* The type is an interned string, assigned by a lexical analyzer
* (a.k.a. tokenizer), naming the lexical or syntactic class that the token
* belongs to. For example an end of sentence marker token might be implemented
* with type "eos". The default token type is "word".
*/
class Token:LUCENE_BASE{
private:
    int32_t _startOffset;       // start in source text
    int32_t _endOffset;         // end in source text
    const TCHAR* _type;         // lexical type (interned string; not owned by this Token)
    int32_t positionIncrement;  // position relative to the previous token in the stream; see setPositionIncrement()
    size_t bufferTextLen;       // allocated capacity of the term-text buffer, in TCHARs

public:
    #ifndef LUCENE_TOKEN_WORD_LENGTH
    TCHAR* _termText; // the text of the term (heap-allocated; grown on demand via growBuffer)
    #else
    TCHAR _termText[LUCENE_TOKEN_WORD_LENGTH+1]; // the text of the term (fixed-size buffer, +1 for the terminator)
    #endif
    int32_t _termTextLen;            // current length of _termText, in TCHARs
    static const TCHAR* defaultType; // the default token type, "word"

    Token();
    ~Token();
    // Constructs a Token with the given text, start and end offsets, & type.
    Token(const TCHAR* text, const int32_t start, const int32_t end, const TCHAR* typ=defaultType);
    // Re-initializes this Token with the given text, offsets and type,
    // allowing a single Token instance to be reused across next() calls.
    void set(const TCHAR* text, const int32_t start, const int32_t end, const TCHAR* typ=defaultType);

    // Returns the allocated capacity of the term-text buffer, in TCHARs.
    size_t bufferLength(){ return bufferTextLen; }
    // Grows the term-text buffer so it can hold at least `size` TCHARs.
    void growBuffer(size_t size);

    /* Set the position increment. This determines the position of this
     * token relative to the previous Token in a TokenStream, used in
     * phrase searching.
     *
     * The default value is 1.
     *
     * Some common uses for this are:
     *
     * - Set it to zero to put multiple terms in the same position. This is
     * useful if, e.g., a word has multiple stems. Searches for phrases
     * including either stem will match. In this case, all but the first stem's
     * increment should be set to zero: the increment of the first instance
     * should be one. Repeating a token with an increment of zero can also be
     * used to boost the scores of matches on that token.
     *
     * - Set it to values greater than one to inhibit exact phrase matches.
     * If, for example, one does not want phrases to match across removed stop
     * words, then one could build a stop word filter that removes stop words and
     * also sets the increment to the number of stop words removed before each
     * non-stop word. Then exact phrase queries will only match when the terms
     * occur with no intervening stop words.
     */
    void setPositionIncrement(int32_t posIncr);
    // Returns the position increment (default 1); see setPositionIncrement().
    int32_t getPositionIncrement() const;
    // Returns the text of the term. The buffer is owned by this Token.
    const TCHAR* termText() const;
    // Returns the length of the term text, in TCHARs.
    size_t termTextLength();
    // Resets the cached term-text length (e.g. after the buffer was modified externally).
    void resetTermTextLen();
    // Replaces the term text with a copy of `txt`, growing the buffer if needed.
    void setText(const TCHAR* txt);

    /**
     * Returns this Token's starting offset, the position of the first character
     * corresponding to this token in the source text.
     *
     * Note that the difference between endOffset() and startOffset() may not be
     * equal to termText.length(), as the term text may have been altered by a
     * stemmer or some other filter.
     */
    int32_t startOffset() const { return _startOffset; }
    void setStartOffset(int32_t val){ _startOffset =val; }

    /**
     * Returns this Token's ending offset, one greater than the position of the
     * last character corresponding to this token in the source text.
     */
    int32_t endOffset() const { return _endOffset; }
    void setEndOffset(int32_t val){ _endOffset =val; }

    // Returns this Token's lexical type. Defaults to "word".
    const TCHAR* type() const { return _type; } ///<returns reference
    // Sets the lexical type. The string is referenced, not copied.
    void setType(const TCHAR* val) { _type = val; } ///<returns reference

    // Returns a newly allocated string representation of this Token; caller owns it.
    TCHAR* toString() const;

    ///Compares the Token for their order
    class OrderCompare:LUCENE_BASE, public CL_NS(util)::Compare::_base //<Token*>
    {
    public:
        // Strict-weak ordering predicate over Token pointers, for use with sorted containers.
        bool operator()( Token* t1, Token* t2 ) const;
    };
};

/**
* A TokenStream enumerates the sequence of tokens, either from
* fields of a document or from query text.
* <p>
* This is an abstract class. Concrete subclasses are:
* <ul>
* <li>{@link Tokenizer}, a TokenStream
* whose input is a Reader; and
* <li>{@link TokenFilter}, a TokenStream
* whose input is another TokenStream.
* </ul>
*/
class TokenStream:LUCENE_BASE {
public:
    /** Sets token to the next token in the stream, returns false at the EOS. */
    virtual bool next(Token* token) = 0;

    /** Releases resources associated with this stream. */
    virtual void close() = 0;

    virtual ~TokenStream(){
    }

    /* This is for backwards compatibility only. You should pass the token you want to fill
     * to next(), this will save a lot of object construction and destructions.
     * @deprecated. use next(token). Kept only to avoid breaking existing code.
     */
    _CL_DEPRECATED(next(Token)) Token* next();
};


/** An Analyzer builds TokenStreams, which analyze text. It thus represents a
 * policy for extracting index terms from text.
 * <p>
 * Typical implementations first build a Tokenizer, which breaks the stream of
 * characters from the Reader into raw Tokens. One or more TokenFilters may
 * then be applied to the output of the Tokenizer.
 * <p>
 * WARNING: You must override one of the methods defined by this class in your
 * subclass or the Analyzer will enter an infinite loop.
 */
class Analyzer:LUCENE_BASE{
public:
    /** Creates a TokenStream which tokenizes all the text in the provided
    Reader. Default implementation forwards to tokenStream(Reader) for
    compatibility with older version. Override to allow Analyzer to choose
    strategy based on document and/or field. Must be able to handle null
    field name for backward compatibility. */
    virtual TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader)=0;

    virtual ~Analyzer(){
    }

    /**
     * Invoked before indexing a Field instance if
     * terms have already been added to that field. This allows custom
     * analyzers to place an automatic position increment gap between
     * Field instances using the same field name. The default value
     * position increment gap is 0. With a 0 position increment gap and
     * the typical default token position increment of 1, all terms in a field,
     * including across Field instances, are in successive positions, allowing
     * exact PhraseQuery matches, for instance, across Field instance boundaries.
     *
     * @param fieldName Field name being indexed.
     * @return position increment gap, added to the next token emitted from {@link #tokenStream(TCHAR*, Reader*)}
     */
    virtual int32_t getPositionIncrementGap(const TCHAR* fieldName);
};


/** A Tokenizer is a TokenStream whose input is a Reader.
<p>
This is an abstract class.
*/
class Tokenizer:public TokenStream {
protected:
    /** The text source for this Tokenizer. */
    CL_NS(util)::Reader* input;

public:
    /** Construct a tokenizer with null input. */
    Tokenizer();
    /** Construct a token stream processing the given input. */
    Tokenizer(CL_NS(util)::Reader* _input);

    /** By default, closes the input Reader. */
    virtual void close();
    virtual ~Tokenizer();
};

/** A TokenFilter is a TokenStream whose input is another token stream.
<p>
This is an abstract class.
*/
class TokenFilter:public TokenStream {
protected:
    /** The source of tokens for this filter. */
    TokenStream* input;
    /** If true then input will be deleted in the destructor */
    bool deleteTokenStream;

    /** Construct a token stream filtering the given input.
     *
     * @param in The TokenStream to filter from
     * @param deleteTS If true, input will be deleted in the destructor
     */
    TokenFilter(TokenStream* in, bool deleteTS=false);
    virtual ~TokenFilter();
public:
    /** Close the input TokenStream. */
    void close();
};

CL_NS_END
#endif