summaryrefslogtreecommitdiff
path: root/storage/xtradb/fts/fts0ast.cc
diff options
context:
space:
mode:
Diffstat (limited to 'storage/xtradb/fts/fts0ast.cc')
-rw-r--r--storage/xtradb/fts/fts0ast.cc632
1 files changed, 632 insertions, 0 deletions
diff --git a/storage/xtradb/fts/fts0ast.cc b/storage/xtradb/fts/fts0ast.cc
new file mode 100644
index 00000000000..3a03fc63303
--- /dev/null
+++ b/storage/xtradb/fts/fts0ast.cc
@@ -0,0 +1,632 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file fts/fts0ast.cc
+Full Text Search parser helper file.
+
+Created 2007/3/16 Sunny Bains.
+***********************************************************************/
+
+#include "mem0mem.h"
+#include "fts0ast.h"
+#include "fts0pars.h"
+#include "fts0fts.h"
+
+/* The FTS ast visit pass. */
+enum fts_ast_visit_pass_t {
+ FTS_PASS_FIRST, /*!< First visit pass,
+ process operators excluding
+ FTS_EXIST and FTS_IGNORE */
+ FTS_PASS_EXIST, /*!< Exist visit pass,
+ process operator FTS_EXIST */
+ FTS_PASS_IGNORE /*!< Ignore visit pass,
+ process operator FTS_IGNORE */
+};
+
+/******************************************************************//**
+Create an empty fts_ast_node_t.
+@return Create a new node */
+static
+fts_ast_node_t*
+fts_ast_node_create(void)
+/*=====================*/
+{
+ fts_ast_node_t* node;
+
+ node = (fts_ast_node_t*) ut_malloc(sizeof(*node));
+ memset(node, 0x0, sizeof(*node));
+
+ return(node);
+}
+
+/******************************************************************//**
+Create a operator fts_ast_node_t.
+@return new node */
+UNIV_INTERN
+fts_ast_node_t*
+fts_ast_create_node_oper(
+/*=====================*/
+ void* arg, /*!< in: ast state instance */
+ fts_ast_oper_t oper) /*!< in: ast operator */
+{
+ fts_ast_node_t* node = fts_ast_node_create();
+
+ node->type = FTS_AST_OPER;
+ node->oper = oper;
+
+ fts_ast_state_add_node((fts_ast_state_t*) arg, node);
+
+ return(node);
+}
+
+/******************************************************************//**
+This function takes ownership of the ptr and is responsible
+for free'ing it
+@return new node or a node list with tokenized words */
+UNIV_INTERN
+fts_ast_node_t*
+fts_ast_create_node_term(
+/*=====================*/
+ void* arg, /*!< in: ast state instance */
+ const char* ptr) /*!< in: ast term string */
+{
+ fts_ast_state_t* state = static_cast<fts_ast_state_t*>(arg);
+ ulint len = strlen(ptr);
+ ulint cur_pos = 0;
+ fts_ast_node_t* node = NULL;
+ fts_ast_node_t* node_list = NULL;
+ fts_ast_node_t* first_node = NULL;
+
+ /* Scan the incoming string and filter out any "non-word" characters */
+ while (cur_pos < len) {
+ fts_string_t str;
+ ulint offset;
+ ulint cur_len;
+
+ cur_len = innobase_mysql_fts_get_token(
+ state->charset,
+ reinterpret_cast<const byte*>(ptr) + cur_pos,
+ reinterpret_cast<const byte*>(ptr) + len, &str, &offset);
+
+ if (cur_len == 0) {
+ break;
+ }
+
+ cur_pos += cur_len;
+
+ if (str.f_n_char > 0) {
+ /* If the subsequent term (after the first one)'s size
+ is less than fts_min_token_size, we shall ignore
+ that. This is to make consistent with MyISAM behavior */
+ if (first_node && (str.f_n_char < fts_min_token_size)) {
+ continue;
+ }
+
+ node = fts_ast_node_create();
+
+ node->type = FTS_AST_TERM;
+
+ node->term.ptr = static_cast<byte*>(ut_malloc(
+ str.f_len + 1));
+ memcpy(node->term.ptr, str.f_str, str.f_len);
+ node->term.ptr[str.f_len] = '\0';
+
+ fts_ast_state_add_node(
+ static_cast<fts_ast_state_t*>(arg), node);
+
+ if (first_node) {
+ /* There is more than one word, create
+ a list to organize them */
+ if (!node_list) {
+ node_list = fts_ast_create_node_list(
+ static_cast<fts_ast_state_t*>(
+ arg),
+ first_node);
+ }
+
+ fts_ast_add_node(node_list, node);
+ } else {
+ first_node = node;
+ }
+ }
+ }
+
+ return((node_list != NULL) ? node_list : first_node);
+}
+
+/******************************************************************//**
+This function takes ownership of the ptr and is responsible
+for free'ing it.
+@return new node */
+UNIV_INTERN
+fts_ast_node_t*
+fts_ast_create_node_text(
+/*=====================*/
+ void* arg, /*!< in: ast state instance */
+ const char* ptr) /*!< in: ast text string */
+{
+ ulint len = strlen(ptr);
+ fts_ast_node_t* node = NULL;
+
+
+ ut_ad(len >= 1);
+
+ if (len <= 2) {
+ /* There is a way to directly supply null terminator
+ in the query string (by using 0x220022) and get here,
+ and certainly it would not make a valid query text */
+ ut_ad(ptr[0] == '\"');
+
+ if (len == 2) {
+ ut_ad(ptr[1] == '\"');
+ }
+
+ return(NULL);
+ }
+
+ node = fts_ast_node_create();
+
+ /*!< We ignore the actual quotes "" */
+ len -= 2;
+
+ node->type = FTS_AST_TEXT;
+ node->text.ptr = static_cast<byte*>(ut_malloc(len + 1));
+
+ /*!< Skip copying the first quote */
+ memcpy(node->text.ptr, ptr + 1, len);
+ node->text.ptr[len] = 0;
+ node->text.distance = ULINT_UNDEFINED;
+
+ fts_ast_state_add_node((fts_ast_state_t*) arg, node);
+
+ return(node);
+}
+
+/******************************************************************//**
+This function takes ownership of the expr and is responsible
+for free'ing it.
+@return new node */
+UNIV_INTERN
+fts_ast_node_t*
+fts_ast_create_node_list(
+/*=====================*/
+ void* arg, /*!< in: ast state instance */
+ fts_ast_node_t* expr) /*!< in: ast expr instance */
+{
+ fts_ast_node_t* node = fts_ast_node_create();
+
+ node->type = FTS_AST_LIST;
+ node->list.head = node->list.tail = expr;
+
+ fts_ast_state_add_node((fts_ast_state_t*) arg, node);
+
+ return(node);
+}
+
+/******************************************************************//**
+Create a sub-expression list node. This function takes ownership of
+expr and is responsible for deleting it.
+@return new node */
+UNIV_INTERN
+fts_ast_node_t*
+fts_ast_create_node_subexp_list(
+/*============================*/
+ void* arg, /*!< in: ast state instance */
+ fts_ast_node_t* expr) /*!< in: ast expr instance */
+{
+ fts_ast_node_t* node = fts_ast_node_create();
+
+ node->type = FTS_AST_SUBEXP_LIST;
+ node->list.head = node->list.tail = expr;
+
+ fts_ast_state_add_node((fts_ast_state_t*) arg, node);
+
+ return(node);
+}
+
+/******************************************************************//**
+Free an expr list node elements. */
+static
+void
+fts_ast_free_list(
+/*==============*/
+ fts_ast_node_t* node) /*!< in: ast node to free */
+{
+ ut_a(node->type == FTS_AST_LIST
+ || node->type == FTS_AST_SUBEXP_LIST);
+
+ for (node = node->list.head;
+ node != NULL;
+ node = fts_ast_free_node(node)) {
+
+ /*!< No op */
+ }
+}
+
+/********************************************************************//**
+Free a fts_ast_node_t instance.
+@return next node to free */
+UNIV_INTERN
+fts_ast_node_t*
+fts_ast_free_node(
+/*==============*/
+ fts_ast_node_t* node) /*!< in: the node to free */
+{
+ fts_ast_node_t* next_node;
+
+ switch (node->type) {
+ case FTS_AST_TEXT:
+ if (node->text.ptr) {
+ ut_free(node->text.ptr);
+ node->text.ptr = NULL;
+ }
+ break;
+
+ case FTS_AST_TERM:
+ if (node->term.ptr) {
+ ut_free(node->term.ptr);
+ node->term.ptr = NULL;
+ }
+ break;
+
+ case FTS_AST_LIST:
+ case FTS_AST_SUBEXP_LIST:
+ fts_ast_free_list(node);
+ node->list.head = node->list.tail = NULL;
+ break;
+
+ case FTS_AST_OPER:
+ break;
+
+ default:
+ ut_error;
+ }
+
+ /*!< Get next node before freeing the node itself */
+ next_node = node->next;
+
+ ut_free(node);
+
+ return(next_node);
+}
+
+/******************************************************************//**
+This AST takes ownership of the expr and is responsible
+for free'ing it.
+@return in param "list" */
+UNIV_INTERN
+fts_ast_node_t*
+fts_ast_add_node(
+/*=============*/
+ fts_ast_node_t* node, /*!< in: list instance */
+ fts_ast_node_t* elem) /*!< in: node to add to list */
+{
+ if (!elem) {
+ return(NULL);
+ }
+
+ ut_a(!elem->next);
+ ut_a(node->type == FTS_AST_LIST
+ || node->type == FTS_AST_SUBEXP_LIST);
+
+ if (!node->list.head) {
+ ut_a(!node->list.tail);
+
+ node->list.head = node->list.tail = elem;
+ } else {
+ ut_a(node->list.tail);
+
+ node->list.tail->next = elem;
+ node->list.tail = elem;
+ }
+
+ return(node);
+}
+
+/******************************************************************//**
+For tracking node allocations, in case there is an error during
+parsing. */
+UNIV_INTERN
+void
+fts_ast_state_add_node(
+/*===================*/
+ fts_ast_state_t*state, /*!< in: ast instance */
+ fts_ast_node_t* node) /*!< in: node to add to ast */
+{
+ if (!state->list.head) {
+ ut_a(!state->list.tail);
+
+ state->list.head = state->list.tail = node;
+ } else {
+ state->list.tail->next_alloc = node;
+ state->list.tail = node;
+ }
+}
+
+/******************************************************************//**
+Set the wildcard attribute of a term. */
+UNIV_INTERN
+void
+fts_ast_term_set_wildcard(
+/*======================*/
+ fts_ast_node_t* node) /*!< in/out: set attribute of
+ a term node */
+{
+ if (!node) {
+ return;
+ }
+
+ /* If it's a node list, the wildcard should be set to the tail node*/
+ if (node->type == FTS_AST_LIST) {
+ ut_ad(node->list.tail != NULL);
+ node = node->list.tail;
+ }
+
+ ut_a(node->type == FTS_AST_TERM);
+ ut_a(!node->term.wildcard);
+
+ node->term.wildcard = TRUE;
+}
+
+/******************************************************************//**
+Set the proximity attribute of a text node. */
+UNIV_INTERN
+void
+fts_ast_term_set_distance(
+/*======================*/
+ fts_ast_node_t* node, /*!< in/out: text node */
+ ulint distance) /*!< in: the text proximity
+ distance */
+{
+ ut_a(node->type == FTS_AST_TEXT);
+ ut_a(node->text.distance == ULINT_UNDEFINED);
+
+ node->text.distance = distance;
+}
+
+/******************************************************************//**
+Free node and expr allocations. */
+UNIV_INTERN
+void
+fts_ast_state_free(
+/*===============*/
+ fts_ast_state_t*state) /*!< in: ast state to free */
+{
+ fts_ast_node_t* node = state->list.head;
+
+ /* Free the nodes that were allocated during parsing. */
+ while (node) {
+ fts_ast_node_t* next = node->next_alloc;
+
+ if (node->type == FTS_AST_TEXT && node->text.ptr) {
+ ut_free(node->text.ptr);
+ node->text.ptr = NULL;
+ } else if (node->type == FTS_AST_TERM && node->term.ptr) {
+ ut_free(node->term.ptr);
+ node->term.ptr = NULL;
+ }
+
+ ut_free(node);
+ node = next;
+ }
+
+ state->root = state->list.head = state->list.tail = NULL;
+}
+
+/******************************************************************//**
+Print an ast node. */
+UNIV_INTERN
+void
+fts_ast_node_print(
+/*===============*/
+ fts_ast_node_t* node) /*!< in: ast node to print */
+{
+ switch (node->type) {
+ case FTS_AST_TEXT:
+ printf("TEXT: %s\n", node->text.ptr);
+ break;
+
+ case FTS_AST_TERM:
+ printf("TERM: %s\n", node->term.ptr);
+ break;
+
+ case FTS_AST_LIST:
+ printf("LIST: ");
+ node = node->list.head;
+
+ while (node) {
+ fts_ast_node_print(node);
+ node = node->next;
+ }
+ break;
+
+ case FTS_AST_SUBEXP_LIST:
+ printf("SUBEXP_LIST: ");
+ node = node->list.head;
+
+ while (node) {
+ fts_ast_node_print(node);
+ node = node->next;
+ }
+ case FTS_AST_OPER:
+ printf("OPER: %d\n", node->oper);
+ break;
+
+ default:
+ ut_error;
+ }
+}
+
+/******************************************************************//**
+Traverse the AST - in-order traversal, except for the FTX_EXIST and FTS_IGNORE
+nodes, which will be ignored in the first pass of each level, and visited in a
+second and third pass after all other nodes in the same level are visited.
+@return DB_SUCCESS if all went well */
+UNIV_INTERN
+dberr_t
+fts_ast_visit(
+/*==========*/
+ fts_ast_oper_t oper, /*!< in: current operator */
+ fts_ast_node_t* node, /*!< in: current root node */
+ fts_ast_callback visitor, /*!< in: callback function */
+ void* arg, /*!< in: arg for callback */
+ bool* has_ignore) /*!< out: true, if the operator
+ was ignored during processing,
+ currently we ignore FTS_EXIST
+ and FTS_IGNORE operators */
+{
+ dberr_t error = DB_SUCCESS;
+ fts_ast_node_t* oper_node = NULL;
+ fts_ast_node_t* start_node;
+ bool revisit = false;
+ bool will_be_ignored = false;
+ fts_ast_visit_pass_t visit_pass = FTS_PASS_FIRST;
+
+ start_node = node->list.head;
+
+ ut_a(node->type == FTS_AST_LIST
+ || node->type == FTS_AST_SUBEXP_LIST);
+
+ if (oper == FTS_EXIST_SKIP) {
+ visit_pass = FTS_PASS_EXIST;
+ } else if (oper == FTS_IGNORE_SKIP) {
+ visit_pass = FTS_PASS_IGNORE;
+ }
+
+ /* In the first pass of the tree, at the leaf level of the
+ tree, FTS_EXIST and FTS_IGNORE operation will be ignored.
+ It will be repeated at the level above the leaf level.
+
+ The basic idea here is that when we encounter FTS_EXIST or
+ FTS_IGNORE, we will change the operator node into FTS_EXIST_SKIP
+ or FTS_IGNORE_SKIP, and term node & text node with the operators
+ is ignored in the first pass. We have two passes during the revisit:
+ We process nodes with FTS_EXIST_SKIP in the exist pass, and then
+ process nodes with FTS_IGNORE_SKIP in the ignore pass.
+
+ The order should be restrictly followed, or we will get wrong results.
+ For example, we have a query 'a +b -c d +e -f'.
+ first pass: process 'a' and 'd' by union;
+ exist pass: process '+b' and '+e' by intersection;
+ ignore pass: process '-c' and '-f' by difference. */
+
+ for (node = node->list.head;
+ node && (error == DB_SUCCESS);
+ node = node->next) {
+
+ switch(node->type) {
+ case FTS_AST_LIST:
+ if (visit_pass != FTS_PASS_FIRST) {
+ break;
+ }
+
+ error = fts_ast_visit(oper, node, visitor,
+ arg, &will_be_ignored);
+
+ /* If will_be_ignored is set to true, then
+ we encountered and ignored a FTS_EXIST or FTS_IGNORE
+ operator. */
+ if (will_be_ignored) {
+ revisit = true;
+ /* Remember oper for list in case '-abc&def',
+ ignored oper is from previous node of list.*/
+ node->oper = oper;
+ }
+
+ break;
+
+ case FTS_AST_SUBEXP_LIST:
+ if (visit_pass != FTS_PASS_FIRST) {
+ break;
+ }
+
+ error = fts_ast_visit_sub_exp(node, visitor, arg);
+ break;
+
+ case FTS_AST_OPER:
+ oper = node->oper;
+ oper_node = node;
+
+ /* Change the operator for revisit */
+ if (oper == FTS_EXIST) {
+ oper_node->oper = FTS_EXIST_SKIP;
+ } else if (oper == FTS_IGNORE) {
+ oper_node->oper = FTS_IGNORE_SKIP;
+ }
+
+ break;
+
+ default:
+ if (node->visited) {
+ continue;
+ }
+
+ ut_a(oper == FTS_NONE || !oper_node
+ || oper_node->oper == oper
+ || oper_node->oper == FTS_EXIST_SKIP
+ || oper_node->oper == FTS_IGNORE_SKIP);
+
+ if (oper== FTS_EXIST || oper == FTS_IGNORE) {
+ *has_ignore = true;
+ continue;
+ }
+
+ /* Process leaf node accroding to its pass.*/
+ if (oper == FTS_EXIST_SKIP
+ && visit_pass == FTS_PASS_EXIST) {
+ error = visitor(FTS_EXIST, node, arg);
+ node->visited = true;
+ } else if (oper == FTS_IGNORE_SKIP
+ && visit_pass == FTS_PASS_IGNORE) {
+ error = visitor(FTS_IGNORE, node, arg);
+ node->visited = true;
+ } else if (visit_pass == FTS_PASS_FIRST) {
+ error = visitor(oper, node, arg);
+ node->visited = true;
+ }
+ }
+ }
+
+ if (revisit) {
+ /* Exist pass processes the skipped FTS_EXIST operation. */
+ for (node = start_node;
+ node && error == DB_SUCCESS;
+ node = node->next) {
+
+ if (node->type == FTS_AST_LIST
+ && node->oper != FTS_IGNORE) {
+ error = fts_ast_visit(FTS_EXIST_SKIP, node,
+ visitor, arg, &will_be_ignored);
+ }
+ }
+
+ /* Ignore pass processes the skipped FTS_IGNORE operation. */
+ for (node = start_node;
+ node && error == DB_SUCCESS;
+ node = node->next) {
+
+ if (node->type == FTS_AST_LIST) {
+ error = fts_ast_visit(FTS_IGNORE_SKIP, node,
+ visitor, arg, &will_be_ignored);
+ }
+ }
+ }
+
+ return(error);
+}