1 files changed, 188 insertions, 0 deletions
diff --git a/storage/innobase/include/fts0tokenize.h b/storage/innobase/include/fts0tokenize.h
new file mode 100644
index 00000000000..15726aea1de
--- /dev/null
+++ b/storage/innobase/include/fts0tokenize.h
@@ -0,0 +1,188 @@
+/*****************************************************************************
+
+Copyright (c) 2014, 2015, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0tokenize.h
+Full Text Search plugin tokenizer, adapted from MyISAM
+
+Created 2014/11/17 Shaohua Wang
+***********************************************************************/
+
+#include "ft_global.h"
+#include "mysql/plugin_ftparser.h"
+#include "m_ctype.h"
+
+/* Macros and structs below are from ftdefs.h in MyISAM */
+/** True word char: ctype c has _MY_U, _MY_L or _MY_NMR set, or ch is '_' */
+#define true_word_char(c, ch) (((c) & (_MY_U | _MY_L | _MY_NMR)) || (ch) == '_')
+
+/** Misc word char check: constant 0, so no char counts as a misc word char */
+#define misc_word_char(X)	(0)
+
+/** Boolean search syntax: one operator char per index, see FTB_* below */
+static const char* fts_boolean_syntax = DEFAULT_FTB_SYNTAX;
+
+#define FTB_YES   (fts_boolean_syntax[0])	/* sets info->yesno = +1 */
+#define FTB_EGAL  (fts_boolean_syntax[1])	/* sets info->yesno = 0 */
+#define FTB_NO    (fts_boolean_syntax[2])	/* sets info->yesno = -1 */
+#define FTB_INC   (fts_boolean_syntax[3])	/* info->weight_adjust++ */
+#define FTB_DEC   (fts_boolean_syntax[4])	/* info->weight_adjust-- */
+#define FTB_LBR   (fts_boolean_syntax[5])	/* left bracket */
+#define FTB_RBR   (fts_boolean_syntax[6])	/* right bracket */
+#define FTB_NEG   (fts_boolean_syntax[7])	/* toggles info->wasign */
+#define FTB_TRUNC (fts_boolean_syntax[8])	/* after a word: sets info->trunc */
+#define FTB_LQUOT (fts_boolean_syntax[10])	/* opens a quote; index 9 is skipped */
+#define FTB_RQUOT (fts_boolean_syntax[11])	/* closes a quote */
+
+/** FTS query token, as filled in by fts_get_word() */
+typedef struct st_ft_word {
+        uchar* pos;     /*!< pointer to the first byte of the word */
+        uint   len;     /*!< word length in bytes */
+        double weight;  /*!< word weight, unused in innodb */
+} FT_WORD;
+
+/** Get the next token from [*start, end), recognising the boolean search
+operators; adapted from ft_get_word() (ft_parser.c) in MyISAM.
+Differences: a. code format changed; b. stopword processing removed.
+@param[in]	cs	charset
+@param[in,out]	start	in: where to start scanning; out: where to resume
+@param[in]	end	doc end pointer (passed by value, not modified)
+@param[in,out]	word	token; pos and len are set when a word is returned
+@param[in,out]	info	token info; quot and prev are read, the rest written
+@retval	0	eof (FT_TOKEN_EOF)
+@retval	1	word found (FT_TOKEN_WORD); len is 0 if no word char was left
+@retval	2	left bracket or opening quote (FT_TOKEN_LEFT_PAREN)
+@retval	3	right bracket or end of quote (FT_TOKEN_RIGHT_PAREN) */
+inline
+uchar
+fts_get_word(
+	const CHARSET_INFO*	cs,
+	uchar**			start,
+	uchar*			end,
+	FT_WORD*		word,
+	MYSQL_FTPARSER_BOOLEAN_INFO*
+				info)
+{
+	uchar*	doc = *start;
+	int	ctype;	/* char class bits filled in by cs->cset->ctype() */
+	uint	mwc;	/* trailing misc word chars; misc_word_char() is 0 */
+	uint	length;	/* chars scanned in the word; counted but never read */
+	int	mbl;	/* ctype() result; byte step is |mbl|, or 1 if it is 0 */
+
+	info->yesno = (FTB_YES ==' ') ? 1 : (info->quot != 0);
+	info->weight_adjust = info->wasign = 0;
+	info->type = FT_TOKEN_EOF;
+
+	while (doc < end) {	/* body always returns: at most one pass */
+		for (; doc < end;	/* skip non-word chars, handle operators */
+		     doc += (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) {
+			mbl = cs->cset->ctype(cs, &ctype, doc, end);
+
+			if (true_word_char(ctype, *doc)) {
+				break;
+			}
+
+			if (*doc == FTB_RQUOT && info->quot) {
+				*start = doc + 1;
+				info->type = FT_TOKEN_RIGHT_PAREN;
+
+				return(info->type);
+			}
+
+			if (!info->quot) {
+				if (*doc == FTB_LBR
+				    || *doc == FTB_RBR
+				    || *doc == FTB_LQUOT) {
+					/* param->prev=' '; */
+					*start = doc + 1;
+					if (*doc == FTB_LQUOT) {
+						info->quot = (char*)1;
+					}
+
+					info->type = (*doc == FTB_RBR ?
+						       FT_TOKEN_RIGHT_PAREN :
+						       FT_TOKEN_LEFT_PAREN);
+
+					return(info->type);
+				}
+
+				if (info->prev == ' ') {
+					if (*doc == FTB_YES) {
+						info->yesno = +1;
+						continue;
+					} else if (*doc == FTB_EGAL) {
+						info->yesno = 0;
+						continue;
+					} else if (*doc == FTB_NO) {
+						info->yesno = -1;
+						continue;
+					} else if (*doc == FTB_INC) {
+						info->weight_adjust++;
+						continue;
+					} else if (*doc == FTB_DEC) {
+						info->weight_adjust--;
+						continue;
+					} else if (*doc == FTB_NEG) {
+						info->wasign = !info->wasign;
+						continue;
+					}
+				}
+			}
+
+			info->prev = *doc;	/* not an operator: reset modifiers */
+			info->yesno = (FTB_YES == ' ') ? 1 : (info->quot != 0);
+			info->weight_adjust = info->wasign = 0;
+		}
+
+		mwc = length = 0;
+		for (word->pos = doc;	/* advance doc over the word chars */
+		     doc < end;
+		     length++, doc += (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) {
+			mbl = cs->cset->ctype(cs, &ctype, doc, end);
+
+			if (true_word_char(ctype, *doc)) {
+				mwc = 0;
+			} else if (!misc_word_char(*doc) || mwc) {
+				break;
+			} else {
+				mwc++;
+			}
+		}
+
+		/* Set prev to a word char: operators are only parsed after ' '. */
+		info->prev = 'A';
+		word->len = (uint)(doc-word->pos) - mwc;
+
+		if ((info->trunc = (doc < end && *doc == FTB_TRUNC))) {
+			doc++;
+		}
+
+		/* No stopword check here; an empty word is returned as a word too. */
+		*start = doc;
+		info->type = FT_TOKEN_WORD;
+
+		return(info->type);
+	}
+
+	if (info->quot) {	/* doc ended while still inside a quote */
+		*start = doc;
+		info->type = FT_TOKEN_RIGHT_PAREN;
+	}
+
+	return(info->type);
+}