diff options
Diffstat (limited to 'storage/xtradb/fts/fts0ast.cc')
-rw-r--r-- | storage/xtradb/fts/fts0ast.cc | 632 |
1 files changed, 632 insertions, 0 deletions
diff --git a/storage/xtradb/fts/fts0ast.cc b/storage/xtradb/fts/fts0ast.cc new file mode 100644 index 00000000000..3a03fc63303 --- /dev/null +++ b/storage/xtradb/fts/fts0ast.cc @@ -0,0 +1,632 @@ +/***************************************************************************** + +Copyright (c) 2007, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file fts/fts0ast.cc +Full Text Search parser helper file. + +Created 2007/3/16 Sunny Bains. +***********************************************************************/ + +#include "mem0mem.h" +#include "fts0ast.h" +#include "fts0pars.h" +#include "fts0fts.h" + +/* The FTS ast visit pass. */ +enum fts_ast_visit_pass_t { + FTS_PASS_FIRST, /*!< First visit pass, + process operators excluding + FTS_EXIST and FTS_IGNORE */ + FTS_PASS_EXIST, /*!< Exist visit pass, + process operator FTS_EXIST */ + FTS_PASS_IGNORE /*!< Ignore visit pass, + process operator FTS_IGNORE */ +}; + +/******************************************************************//** +Create an empty fts_ast_node_t. +@return Create a new node */ +static +fts_ast_node_t* +fts_ast_node_create(void) +/*=====================*/ +{ + fts_ast_node_t* node; + + node = (fts_ast_node_t*) ut_malloc(sizeof(*node)); + memset(node, 0x0, sizeof(*node)); + + return(node); +} + +/******************************************************************//** +Create a operator fts_ast_node_t. +@return new node */ +UNIV_INTERN +fts_ast_node_t* +fts_ast_create_node_oper( +/*=====================*/ + void* arg, /*!< in: ast state instance */ + fts_ast_oper_t oper) /*!< in: ast operator */ +{ + fts_ast_node_t* node = fts_ast_node_create(); + + node->type = FTS_AST_OPER; + node->oper = oper; + + fts_ast_state_add_node((fts_ast_state_t*) arg, node); + + return(node); +} + +/******************************************************************//** +This function takes ownership of the ptr and is responsible +for free'ing it +@return new node or a node list with tokenized words */ +UNIV_INTERN +fts_ast_node_t* +fts_ast_create_node_term( +/*=====================*/ + void* arg, /*!< in: ast state instance */ + const char* ptr) /*!< in: ast term string */ +{ + fts_ast_state_t* state = static_cast<fts_ast_state_t*>(arg); + ulint len = strlen(ptr); + ulint cur_pos = 0; + fts_ast_node_t* node = NULL; + fts_ast_node_t* node_list = NULL; + fts_ast_node_t* first_node = NULL; + + /* Scan the incoming string and filter out any "non-word" characters */ + while (cur_pos < len) { + fts_string_t str; + ulint offset; + ulint cur_len; + + cur_len = innobase_mysql_fts_get_token( + state->charset, + reinterpret_cast<const byte*>(ptr) + cur_pos, + reinterpret_cast<const byte*>(ptr) + len, &str, &offset); + + if (cur_len == 0) { + break; + } + + cur_pos += cur_len; + + if (str.f_n_char > 0) { + /* If the subsequent term (after the first one)'s size + is less than fts_min_token_size, we shall ignore + that. This is to make consistent with MyISAM behavior */ + if (first_node && (str.f_n_char < fts_min_token_size)) { + continue; + } + + node = fts_ast_node_create(); + + node->type = FTS_AST_TERM; + + node->term.ptr = static_cast<byte*>(ut_malloc( + str.f_len + 1)); + memcpy(node->term.ptr, str.f_str, str.f_len); + node->term.ptr[str.f_len] = '\0'; + + fts_ast_state_add_node( + static_cast<fts_ast_state_t*>(arg), node); + + if (first_node) { + /* There is more than one word, create + a list to organize them */ + if (!node_list) { + node_list = fts_ast_create_node_list( + static_cast<fts_ast_state_t*>( + arg), + first_node); + } + + fts_ast_add_node(node_list, node); + } else { + first_node = node; + } + } + } + + return((node_list != NULL) ? node_list : first_node); +} + +/******************************************************************//** +This function takes ownership of the ptr and is responsible +for free'ing it. +@return new node */ +UNIV_INTERN +fts_ast_node_t* +fts_ast_create_node_text( +/*=====================*/ + void* arg, /*!< in: ast state instance */ + const char* ptr) /*!< in: ast text string */ +{ + ulint len = strlen(ptr); + fts_ast_node_t* node = NULL; + + + ut_ad(len >= 1); + + if (len <= 2) { + /* There is a way to directly supply null terminator + in the query string (by using 0x220022) and get here, + and certainly it would not make a valid query text */ + ut_ad(ptr[0] == '\"'); + + if (len == 2) { + ut_ad(ptr[1] == '\"'); + } + + return(NULL); + } + + node = fts_ast_node_create(); + + /*!< We ignore the actual quotes "" */ + len -= 2; + + node->type = FTS_AST_TEXT; + node->text.ptr = static_cast<byte*>(ut_malloc(len + 1)); + + /*!< Skip copying the first quote */ + memcpy(node->text.ptr, ptr + 1, len); + node->text.ptr[len] = 0; + node->text.distance = ULINT_UNDEFINED; + + fts_ast_state_add_node((fts_ast_state_t*) arg, node); + + return(node); +} + +/******************************************************************//** +This function takes ownership of the expr and is responsible +for free'ing it. +@return new node */ +UNIV_INTERN +fts_ast_node_t* +fts_ast_create_node_list( +/*=====================*/ + void* arg, /*!< in: ast state instance */ + fts_ast_node_t* expr) /*!< in: ast expr instance */ +{ + fts_ast_node_t* node = fts_ast_node_create(); + + node->type = FTS_AST_LIST; + node->list.head = node->list.tail = expr; + + fts_ast_state_add_node((fts_ast_state_t*) arg, node); + + return(node); +} + +/******************************************************************//** +Create a sub-expression list node. This function takes ownership of +expr and is responsible for deleting it. +@return new node */ +UNIV_INTERN +fts_ast_node_t* +fts_ast_create_node_subexp_list( +/*============================*/ + void* arg, /*!< in: ast state instance */ + fts_ast_node_t* expr) /*!< in: ast expr instance */ +{ + fts_ast_node_t* node = fts_ast_node_create(); + + node->type = FTS_AST_SUBEXP_LIST; + node->list.head = node->list.tail = expr; + + fts_ast_state_add_node((fts_ast_state_t*) arg, node); + + return(node); +} + +/******************************************************************//** +Free an expr list node elements. */ +static +void +fts_ast_free_list( +/*==============*/ + fts_ast_node_t* node) /*!< in: ast node to free */ +{ + ut_a(node->type == FTS_AST_LIST + || node->type == FTS_AST_SUBEXP_LIST); + + for (node = node->list.head; + node != NULL; + node = fts_ast_free_node(node)) { + + /*!< No op */ + } +} + +/********************************************************************//** +Free a fts_ast_node_t instance. +@return next node to free */ +UNIV_INTERN +fts_ast_node_t* +fts_ast_free_node( +/*==============*/ + fts_ast_node_t* node) /*!< in: the node to free */ +{ + fts_ast_node_t* next_node; + + switch (node->type) { + case FTS_AST_TEXT: + if (node->text.ptr) { + ut_free(node->text.ptr); + node->text.ptr = NULL; + } + break; + + case FTS_AST_TERM: + if (node->term.ptr) { + ut_free(node->term.ptr); + node->term.ptr = NULL; + } + break; + + case FTS_AST_LIST: + case FTS_AST_SUBEXP_LIST: + fts_ast_free_list(node); + node->list.head = node->list.tail = NULL; + break; + + case FTS_AST_OPER: + break; + + default: + ut_error; + } + + /*!< Get next node before freeing the node itself */ + next_node = node->next; + + ut_free(node); + + return(next_node); +} + +/******************************************************************//** +This AST takes ownership of the expr and is responsible +for free'ing it. +@return in param "list" */ +UNIV_INTERN +fts_ast_node_t* +fts_ast_add_node( +/*=============*/ + fts_ast_node_t* node, /*!< in: list instance */ + fts_ast_node_t* elem) /*!< in: node to add to list */ +{ + if (!elem) { + return(NULL); + } + + ut_a(!elem->next); + ut_a(node->type == FTS_AST_LIST + || node->type == FTS_AST_SUBEXP_LIST); + + if (!node->list.head) { + ut_a(!node->list.tail); + + node->list.head = node->list.tail = elem; + } else { + ut_a(node->list.tail); + + node->list.tail->next = elem; + node->list.tail = elem; + } + + return(node); +} + +/******************************************************************//** +For tracking node allocations, in case there is an error during +parsing. */ +UNIV_INTERN +void +fts_ast_state_add_node( +/*===================*/ + fts_ast_state_t*state, /*!< in: ast instance */ + fts_ast_node_t* node) /*!< in: node to add to ast */ +{ + if (!state->list.head) { + ut_a(!state->list.tail); + + state->list.head = state->list.tail = node; + } else { + state->list.tail->next_alloc = node; + state->list.tail = node; + } +} + +/******************************************************************//** +Set the wildcard attribute of a term. */ +UNIV_INTERN +void +fts_ast_term_set_wildcard( +/*======================*/ + fts_ast_node_t* node) /*!< in/out: set attribute of + a term node */ +{ + if (!node) { + return; + } + + /* If it's a node list, the wildcard should be set to the tail node*/ + if (node->type == FTS_AST_LIST) { + ut_ad(node->list.tail != NULL); + node = node->list.tail; + } + + ut_a(node->type == FTS_AST_TERM); + ut_a(!node->term.wildcard); + + node->term.wildcard = TRUE; +} + +/******************************************************************//** +Set the proximity attribute of a text node. */ +UNIV_INTERN +void +fts_ast_term_set_distance( +/*======================*/ + fts_ast_node_t* node, /*!< in/out: text node */ + ulint distance) /*!< in: the text proximity + distance */ +{ + ut_a(node->type == FTS_AST_TEXT); + ut_a(node->text.distance == ULINT_UNDEFINED); + + node->text.distance = distance; +} + +/******************************************************************//** +Free node and expr allocations. */ +UNIV_INTERN +void +fts_ast_state_free( +/*===============*/ + fts_ast_state_t*state) /*!< in: ast state to free */ +{ + fts_ast_node_t* node = state->list.head; + + /* Free the nodes that were allocated during parsing. */ + while (node) { + fts_ast_node_t* next = node->next_alloc; + + if (node->type == FTS_AST_TEXT && node->text.ptr) { + ut_free(node->text.ptr); + node->text.ptr = NULL; + } else if (node->type == FTS_AST_TERM && node->term.ptr) { + ut_free(node->term.ptr); + node->term.ptr = NULL; + } + + ut_free(node); + node = next; + } + + state->root = state->list.head = state->list.tail = NULL; +} + +/******************************************************************//** +Print an ast node. */ +UNIV_INTERN +void +fts_ast_node_print( +/*===============*/ + fts_ast_node_t* node) /*!< in: ast node to print */ +{ + switch (node->type) { + case FTS_AST_TEXT: + printf("TEXT: %s\n", node->text.ptr); + break; + + case FTS_AST_TERM: + printf("TERM: %s\n", node->term.ptr); + break; + + case FTS_AST_LIST: + printf("LIST: "); + node = node->list.head; + + while (node) { + fts_ast_node_print(node); + node = node->next; + } + break; + + case FTS_AST_SUBEXP_LIST: + printf("SUBEXP_LIST: "); + node = node->list.head; + + while (node) { + fts_ast_node_print(node); + node = node->next; + } + case FTS_AST_OPER: + printf("OPER: %d\n", node->oper); + break; + + default: + ut_error; + } +} + +/******************************************************************//** +Traverse the AST - in-order traversal, except for the FTX_EXIST and FTS_IGNORE +nodes, which will be ignored in the first pass of each level, and visited in a +second and third pass after all other nodes in the same level are visited. +@return DB_SUCCESS if all went well */ +UNIV_INTERN +dberr_t +fts_ast_visit( +/*==========*/ + fts_ast_oper_t oper, /*!< in: current operator */ + fts_ast_node_t* node, /*!< in: current root node */ + fts_ast_callback visitor, /*!< in: callback function */ + void* arg, /*!< in: arg for callback */ + bool* has_ignore) /*!< out: true, if the operator + was ignored during processing, + currently we ignore FTS_EXIST + and FTS_IGNORE operators */ +{ + dberr_t error = DB_SUCCESS; + fts_ast_node_t* oper_node = NULL; + fts_ast_node_t* start_node; + bool revisit = false; + bool will_be_ignored = false; + fts_ast_visit_pass_t visit_pass = FTS_PASS_FIRST; + + start_node = node->list.head; + + ut_a(node->type == FTS_AST_LIST + || node->type == FTS_AST_SUBEXP_LIST); + + if (oper == FTS_EXIST_SKIP) { + visit_pass = FTS_PASS_EXIST; + } else if (oper == FTS_IGNORE_SKIP) { + visit_pass = FTS_PASS_IGNORE; + } + + /* In the first pass of the tree, at the leaf level of the + tree, FTS_EXIST and FTS_IGNORE operation will be ignored. + It will be repeated at the level above the leaf level. + + The basic idea here is that when we encounter FTS_EXIST or + FTS_IGNORE, we will change the operator node into FTS_EXIST_SKIP + or FTS_IGNORE_SKIP, and term node & text node with the operators + is ignored in the first pass. We have two passes during the revisit: + We process nodes with FTS_EXIST_SKIP in the exist pass, and then + process nodes with FTS_IGNORE_SKIP in the ignore pass. + + The order should be restrictly followed, or we will get wrong results. + For example, we have a query 'a +b -c d +e -f'. + first pass: process 'a' and 'd' by union; + exist pass: process '+b' and '+e' by intersection; + ignore pass: process '-c' and '-f' by difference. */ + + for (node = node->list.head; + node && (error == DB_SUCCESS); + node = node->next) { + + switch(node->type) { + case FTS_AST_LIST: + if (visit_pass != FTS_PASS_FIRST) { + break; + } + + error = fts_ast_visit(oper, node, visitor, + arg, &will_be_ignored); + + /* If will_be_ignored is set to true, then + we encountered and ignored a FTS_EXIST or FTS_IGNORE + operator. */ + if (will_be_ignored) { + revisit = true; + /* Remember oper for list in case '-abc&def', + ignored oper is from previous node of list.*/ + node->oper = oper; + } + + break; + + case FTS_AST_SUBEXP_LIST: + if (visit_pass != FTS_PASS_FIRST) { + break; + } + + error = fts_ast_visit_sub_exp(node, visitor, arg); + break; + + case FTS_AST_OPER: + oper = node->oper; + oper_node = node; + + /* Change the operator for revisit */ + if (oper == FTS_EXIST) { + oper_node->oper = FTS_EXIST_SKIP; + } else if (oper == FTS_IGNORE) { + oper_node->oper = FTS_IGNORE_SKIP; + } + + break; + + default: + if (node->visited) { + continue; + } + + ut_a(oper == FTS_NONE || !oper_node + || oper_node->oper == oper + || oper_node->oper == FTS_EXIST_SKIP + || oper_node->oper == FTS_IGNORE_SKIP); + + if (oper== FTS_EXIST || oper == FTS_IGNORE) { + *has_ignore = true; + continue; + } + + /* Process leaf node accroding to its pass.*/ + if (oper == FTS_EXIST_SKIP + && visit_pass == FTS_PASS_EXIST) { + error = visitor(FTS_EXIST, node, arg); + node->visited = true; + } else if (oper == FTS_IGNORE_SKIP + && visit_pass == FTS_PASS_IGNORE) { + error = visitor(FTS_IGNORE, node, arg); + node->visited = true; + } else if (visit_pass == FTS_PASS_FIRST) { + error = visitor(oper, node, arg); + node->visited = true; + } + } + } + + if (revisit) { + /* Exist pass processes the skipped FTS_EXIST operation. */ + for (node = start_node; + node && error == DB_SUCCESS; + node = node->next) { + + if (node->type == FTS_AST_LIST + && node->oper != FTS_IGNORE) { + error = fts_ast_visit(FTS_EXIST_SKIP, node, + visitor, arg, &will_be_ignored); + } + } + + /* Ignore pass processes the skipped FTS_IGNORE operation. */ + for (node = start_node; + node && error == DB_SUCCESS; + node = node->next) { + + if (node->type == FTS_AST_LIST) { + error = fts_ast_visit(FTS_IGNORE_SKIP, node, + visitor, arg, &will_be_ignored); + } + } + } + + return(error); +} |