diff options
Diffstat (limited to 'storage/innobase/fts/fts0ast.cc')
-rw-r--r-- | storage/innobase/fts/fts0ast.cc | 232 |
1 files changed, 185 insertions, 47 deletions
diff --git a/storage/innobase/fts/fts0ast.cc b/storage/innobase/fts/fts0ast.cc index 972f5acf461..3a03fc63303 100644 --- a/storage/innobase/fts/fts0ast.cc +++ b/storage/innobase/fts/fts0ast.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2007, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2007, 2013, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -26,6 +26,18 @@ Created 2007/3/16 Sunny Bains. #include "mem0mem.h" #include "fts0ast.h" #include "fts0pars.h" +#include "fts0fts.h" + +/* The FTS ast visit pass. */ +enum fts_ast_visit_pass_t { + FTS_PASS_FIRST, /*!< First visit pass, + process operators excluding + FTS_EXIST and FTS_IGNORE */ + FTS_PASS_EXIST, /*!< Exist visit pass, + process operator FTS_EXIST */ + FTS_PASS_IGNORE /*!< Ignore visit pass, + process operator FTS_IGNORE */ +}; /******************************************************************//** Create an empty fts_ast_node_t. @@ -66,7 +78,7 @@ fts_ast_create_node_oper( /******************************************************************//** This function takes ownership of the ptr and is responsible for free'ing it -@return new node */ +@return new node or a node list with tokenized words */ UNIV_INTERN fts_ast_node_t* fts_ast_create_node_term( @@ -74,17 +86,68 @@ fts_ast_create_node_term( void* arg, /*!< in: ast state instance */ const char* ptr) /*!< in: ast term string */ { - ulint len = strlen(ptr); - fts_ast_node_t* node = fts_ast_node_create(); + fts_ast_state_t* state = static_cast<fts_ast_state_t*>(arg); + ulint len = strlen(ptr); + ulint cur_pos = 0; + fts_ast_node_t* node = NULL; + fts_ast_node_t* node_list = NULL; + fts_ast_node_t* first_node = NULL; + + /* Scan the incoming string and filter out any "non-word" characters */ + while (cur_pos < len) { + fts_string_t str; + ulint offset; + ulint cur_len; + + cur_len = innobase_mysql_fts_get_token( + state->charset, + reinterpret_cast<const byte*>(ptr) + cur_pos, + reinterpret_cast<const byte*>(ptr) + len, &str, &offset); + + if (cur_len == 0) { + break; + } - node->type = FTS_AST_TERM; + cur_pos += cur_len; - node->term.ptr = static_cast<byte*>(ut_malloc(len + 1)); - memcpy(node->term.ptr, ptr, len + 1); + if (str.f_n_char > 0) { + /* If the subsequent term (after the first one)'s size + is less than fts_min_token_size, we shall ignore + that. This is to make consistent with MyISAM behavior */ + if (first_node && (str.f_n_char < fts_min_token_size)) { + continue; + } - fts_ast_state_add_node((fts_ast_state_t*) arg, node); + node = fts_ast_node_create(); - return(node); + node->type = FTS_AST_TERM; + + node->term.ptr = static_cast<byte*>(ut_malloc( + str.f_len + 1)); + memcpy(node->term.ptr, str.f_str, str.f_len); + node->term.ptr[str.f_len] = '\0'; + + fts_ast_state_add_node( + static_cast<fts_ast_state_t*>(arg), node); + + if (first_node) { + /* There is more than one word, create + a list to organize them */ + if (!node_list) { + node_list = fts_ast_create_node_list( + static_cast<fts_ast_state_t*>( + arg), + first_node); + } + + fts_ast_add_node(node_list, node); + } else { + first_node = node; + } + } + } + + return((node_list != NULL) ? node_list : first_node); } /******************************************************************//** @@ -101,11 +164,19 @@ fts_ast_create_node_text( ulint len = strlen(ptr); fts_ast_node_t* node = NULL; - ut_ad(len >= 2); - if (len == 2) { + ut_ad(len >= 1); + + if (len <= 2) { + /* There is a way to directly supply null terminator + in the query string (by using 0x220022) and get here, + and certainly it would not make a valid query text */ ut_ad(ptr[0] == '\"'); - ut_ad(ptr[1] == '\"'); + + if (len == 2) { + ut_ad(ptr[1] == '\"'); + } + return(NULL); } @@ -297,6 +368,16 @@ fts_ast_term_set_wildcard( fts_ast_node_t* node) /*!< in/out: set attribute of a term node */ { + if (!node) { + return; + } + + /* If it's a node list, the wildcard should be set to the tail node*/ + if (node->type == FTS_AST_LIST) { + ut_ad(node->list.tail != NULL); + node = node->list.tail; + } + ut_a(node->type == FTS_AST_TERM); ut_a(!node->term.wildcard); @@ -393,9 +474,9 @@ fts_ast_node_print( } /******************************************************************//** -Traverse the AST - in-order traversal, except for the FTS_IGNORE -nodes, which will be ignored in the first pass of each level, and -visited in a second pass after all other nodes in the same level are visited. +Traverse the AST - in-order traversal, except for the FTX_EXIST and FTS_IGNORE +nodes, which will be ignored in the first pass of each level, and visited in a +second and third pass after all other nodes in the same level are visited. @return DB_SUCCESS if all went well */ UNIV_INTERN dberr_t @@ -407,85 +488,142 @@ fts_ast_visit( void* arg, /*!< in: arg for callback */ bool* has_ignore) /*!< out: true, if the operator was ignored during processing, - currently we only ignore - FTS_IGNORE operator */ + currently we ignore FTS_EXIST + and FTS_IGNORE operators */ { dberr_t error = DB_SUCCESS; fts_ast_node_t* oper_node = NULL; fts_ast_node_t* start_node; bool revisit = false; bool will_be_ignored = false; + fts_ast_visit_pass_t visit_pass = FTS_PASS_FIRST; start_node = node->list.head; ut_a(node->type == FTS_AST_LIST || node->type == FTS_AST_SUBEXP_LIST); + if (oper == FTS_EXIST_SKIP) { + visit_pass = FTS_PASS_EXIST; + } else if (oper == FTS_IGNORE_SKIP) { + visit_pass = FTS_PASS_IGNORE; + } + /* In the first pass of the tree, at the leaf level of the - tree, FTS_IGNORE operation will be ignored. It will be - repeated at the level above the leaf level */ + tree, FTS_EXIST and FTS_IGNORE operation will be ignored. + It will be repeated at the level above the leaf level. + + The basic idea here is that when we encounter FTS_EXIST or + FTS_IGNORE, we will change the operator node into FTS_EXIST_SKIP + or FTS_IGNORE_SKIP, and term node & text node with the operators + is ignored in the first pass. We have two passes during the revisit: + We process nodes with FTS_EXIST_SKIP in the exist pass, and then + process nodes with FTS_IGNORE_SKIP in the ignore pass. + + The order should be restrictly followed, or we will get wrong results. + For example, we have a query 'a +b -c d +e -f'. + first pass: process 'a' and 'd' by union; + exist pass: process '+b' and '+e' by intersection; + ignore pass: process '-c' and '-f' by difference. */ + for (node = node->list.head; node && (error == DB_SUCCESS); node = node->next) { - if (node->type == FTS_AST_LIST) { + switch(node->type) { + case FTS_AST_LIST: + if (visit_pass != FTS_PASS_FIRST) { + break; + } + error = fts_ast_visit(oper, node, visitor, arg, &will_be_ignored); /* If will_be_ignored is set to true, then - we encountered and ignored a FTS_IGNORE operator, - and a second pass is needed to process FTS_IGNORE - operator */ + we encountered and ignored a FTS_EXIST or FTS_IGNORE + operator. */ if (will_be_ignored) { revisit = true; + /* Remember oper for list in case '-abc&def', + ignored oper is from previous node of list.*/ + node->oper = oper; + } + + break; + + case FTS_AST_SUBEXP_LIST: + if (visit_pass != FTS_PASS_FIRST) { + break; } - } else if (node->type == FTS_AST_SUBEXP_LIST) { + error = fts_ast_visit_sub_exp(node, visitor, arg); - } else if (node->type == FTS_AST_OPER) { + break; + + case FTS_AST_OPER: oper = node->oper; oper_node = node; - } else { + + /* Change the operator for revisit */ + if (oper == FTS_EXIST) { + oper_node->oper = FTS_EXIST_SKIP; + } else if (oper == FTS_IGNORE) { + oper_node->oper = FTS_IGNORE_SKIP; + } + + break; + + default: if (node->visited) { continue; } ut_a(oper == FTS_NONE || !oper_node - || oper_node->oper == oper); + || oper_node->oper == oper + || oper_node->oper == FTS_EXIST_SKIP + || oper_node->oper == FTS_IGNORE_SKIP); - if (oper == FTS_IGNORE) { + if (oper== FTS_EXIST || oper == FTS_IGNORE) { *has_ignore = true; - /* Change the operator to FTS_IGNORE_SKIP, - so that it is processed in the second pass */ - oper_node->oper = FTS_IGNORE_SKIP; continue; } - if (oper == FTS_IGNORE_SKIP) { - /* This must be the second pass, now we process - the FTS_IGNORE operator */ - visitor(FTS_IGNORE, node, arg); - } else { - visitor(oper, node, arg); + /* Process leaf node accroding to its pass.*/ + if (oper == FTS_EXIST_SKIP + && visit_pass == FTS_PASS_EXIST) { + error = visitor(FTS_EXIST, node, arg); + node->visited = true; + } else if (oper == FTS_IGNORE_SKIP + && visit_pass == FTS_PASS_IGNORE) { + error = visitor(FTS_IGNORE, node, arg); + node->visited = true; + } else if (visit_pass == FTS_PASS_FIRST) { + error = visitor(oper, node, arg); + node->visited = true; } - - node->visited = true; } } - /* Second pass to process the skipped FTS_IGNORE operation. - It is only performed at the level above leaf level */ if (revisit) { + /* Exist pass processes the skipped FTS_EXIST operation. */ + for (node = start_node; + node && error == DB_SUCCESS; + node = node->next) { + + if (node->type == FTS_AST_LIST + && node->oper != FTS_IGNORE) { + error = fts_ast_visit(FTS_EXIST_SKIP, node, + visitor, arg, &will_be_ignored); + } + } + + /* Ignore pass processes the skipped FTS_IGNORE operation. */ for (node = start_node; node && error == DB_SUCCESS; node = node->next) { if (node->type == FTS_AST_LIST) { - /* In this pass, it will process all those - operators ignored in the first pass, and those - whose operators are set to FTS_IGNORE_SKIP */ - error = fts_ast_visit( - oper, node, visitor, arg, - &will_be_ignored); + error = fts_ast_visit(FTS_IGNORE_SKIP, node, + visitor, arg, &will_be_ignored); } } } |