summaryrefslogtreecommitdiff
path: root/storage/innobase/include/fts0tokenize.h
diff options
context:
space:
mode:
Diffstat (limited to 'storage/innobase/include/fts0tokenize.h')
-rw-r--r--storage/innobase/include/fts0tokenize.h188
1 files changed, 188 insertions, 0 deletions
diff --git a/storage/innobase/include/fts0tokenize.h b/storage/innobase/include/fts0tokenize.h
new file mode 100644
index 00000000000..15726aea1de
--- /dev/null
+++ b/storage/innobase/include/fts0tokenize.h
@@ -0,0 +1,188 @@
+/*****************************************************************************
+
+Copyright (c) 2014, 2015, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file fts/fts0tokenize.cc
+Full Text Search plugin tokenizer refer to MyISAM
+
+Created 2014/11/17 Shaohua Wang
+***********************************************************************/
+
+#include "ft_global.h"
+#include "mysql/plugin_ftparser.h"
+#include "m_ctype.h"
+
+/* Macros and structs below are from ftdefs.h in MyISAM */
+/** Check a char is true word */
+#define true_word_char(c, ch) ((c) & (_MY_U | _MY_L | _MY_NMR) || (ch) == '_')
+
+/** Check if a char is misc word */
+#define misc_word_char(X) 0
+
+/** Boolean search syntax */
+static const char* fts_boolean_syntax = DEFAULT_FTB_SYNTAX;
+
+#define FTB_YES (fts_boolean_syntax[0])
+#define FTB_EGAL (fts_boolean_syntax[1])
+#define FTB_NO (fts_boolean_syntax[2])
+#define FTB_INC (fts_boolean_syntax[3])
+#define FTB_DEC (fts_boolean_syntax[4])
+#define FTB_LBR (fts_boolean_syntax[5])
+#define FTB_RBR (fts_boolean_syntax[6])
+#define FTB_NEG (fts_boolean_syntax[7])
+#define FTB_TRUNC (fts_boolean_syntax[8])
+#define FTB_LQUOT (fts_boolean_syntax[10])
+#define FTB_RQUOT (fts_boolean_syntax[11])
+
+/** FTS query token */
+typedef struct st_ft_word {
+ uchar* pos; /*!< word start pointer */
+ uint len; /*!< word len */
+ double weight; /*!< word weight, unused in innodb */
+} FT_WORD;
+
+/** Tokenizer for ngram referring to ft_get_word(ft_parser.c) in MyISAM.
+Differences: a. code format changed; b. stopword processing removed.
+@param[in] cs charset
+@param[in,out] start doc start pointer
+@param[in,out] end doc end pointer
+@param[in,out] word token
+@param[in,out] info token info
+@retval 0 eof
+@retval 1 word found
+@retval 2 left bracket
+@retval 3 right bracket
+@retval 4 stopword found */
+inline
+uchar
+fts_get_word(
+ const CHARSET_INFO* cs,
+ uchar** start,
+ uchar* end,
+ FT_WORD* word,
+ MYSQL_FTPARSER_BOOLEAN_INFO*
+ info)
+{
+ uchar* doc = *start;
+ int ctype;
+ uint mwc;
+ uint length;
+ int mbl;
+
+ info->yesno = (FTB_YES ==' ') ? 1 : (info->quot != 0);
+ info->weight_adjust = info->wasign = 0;
+ info->type = FT_TOKEN_EOF;
+
+ while (doc < end) {
+ for (; doc < end;
+ doc += (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) {
+ mbl = cs->cset->ctype(cs, &ctype, doc, end);
+
+ if (true_word_char(ctype, *doc)) {
+ break;
+ }
+
+ if (*doc == FTB_RQUOT && info->quot) {
+ *start = doc + 1;
+ info->type = FT_TOKEN_RIGHT_PAREN;
+
+ return(info->type);
+ }
+
+ if (!info->quot) {
+ if (*doc == FTB_LBR
+ || *doc == FTB_RBR
+ || *doc == FTB_LQUOT) {
+ /* param->prev=' '; */
+ *start = doc + 1;
+ if (*doc == FTB_LQUOT) {
+ info->quot = (char*)1;
+ }
+
+ info->type = (*doc == FTB_RBR ?
+ FT_TOKEN_RIGHT_PAREN :
+ FT_TOKEN_LEFT_PAREN);
+
+ return(info->type);
+ }
+
+ if (info->prev == ' ') {
+ if (*doc == FTB_YES) {
+ info->yesno = +1;
+ continue;
+ } else if (*doc == FTB_EGAL) {
+ info->yesno = 0;
+ continue;
+ } else if (*doc == FTB_NO) {
+ info->yesno = -1;
+ continue;
+ } else if (*doc == FTB_INC) {
+ info->weight_adjust++;
+ continue;
+ } else if (*doc == FTB_DEC) {
+ info->weight_adjust--;
+ continue;
+ } else if (*doc == FTB_NEG) {
+ info->wasign = !info->wasign;
+ continue;
+ }
+ }
+ }
+
+ info->prev = *doc;
+ info->yesno = (FTB_YES == ' ') ? 1 : (info->quot != 0);
+ info->weight_adjust = info->wasign = 0;
+ }
+
+ mwc = length = 0;
+ for (word->pos = doc;
+ doc < end;
+ length++, doc += (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) {
+ mbl = cs->cset->ctype(cs, &ctype, doc, end);
+
+ if (true_word_char(ctype, *doc)) {
+ mwc = 0;
+ } else if (!misc_word_char(*doc) || mwc) {
+ break;
+ } else {
+ mwc++;
+ }
+ }
+
+ /* Be sure *prev is true_word_char. */
+ info->prev = 'A';
+ word->len = (uint)(doc-word->pos) - mwc;
+
+ if ((info->trunc = (doc < end && *doc == FTB_TRUNC))) {
+ doc++;
+ }
+
+ /* We don't check stopword here. */
+ *start = doc;
+ info->type = FT_TOKEN_WORD;
+
+ return(info->type);
+ }
+
+ if (info->quot) {
+ *start = doc;
+ info->type = FT_TOKEN_RIGHT_PAREN;
+ }
+
+ return(info->type);
+}