summaryrefslogtreecommitdiff
path: root/storage/innobase/fts/fts0que.cc
diff options
context:
space:
mode:
Diffstat (limited to 'storage/innobase/fts/fts0que.cc')
-rw-r--r--storage/innobase/fts/fts0que.cc988
1 files changed, 560 insertions, 428 deletions
diff --git a/storage/innobase/fts/fts0que.cc b/storage/innobase/fts/fts0que.cc
index 26bd0378aed..dee7c59a58b 100644
--- a/storage/innobase/fts/fts0que.cc
+++ b/storage/innobase/fts/fts0que.cc
@@ -24,7 +24,9 @@ Created 2007/03/27 Sunny Bains
Completed 2011/7/10 Sunny and Jimmy Yang
*******************************************************/
-#include "dict0dict.h" /* dict_table_get_n_rows() */
+#include "ha_prototypes.h"
+
+#include "dict0dict.h"
#include "ut0rbt.h"
#include "row0sel.h"
#include "fts0fts.h"
@@ -32,14 +34,15 @@ Completed 2011/7/10 Sunny and Jimmy Yang
#include "fts0ast.h"
#include "fts0pars.h"
#include "fts0types.h"
-#include "ha_prototypes.h"
-#include <ctype.h>
+#include "fts0plugin.h"
+#include "ut0new.h"
-#ifndef UNIV_NONINL
+#ifdef UNIV_NONINL
#include "fts0types.ic"
#include "fts0vlc.ic"
#endif
+#include <iomanip>
#include <vector>
#define FTS_ELEM(t, n, i, j) (t[(i) * n + (j)])
@@ -59,7 +62,7 @@ Completed 2011/7/10 Sunny and Jimmy Yang
// FIXME: Need to have a generic iterator that traverses the ilist.
-typedef std::vector<fts_string_t> word_vector_t;
+typedef std::vector<fts_string_t, ut_allocator<fts_string_t> > word_vector_t;
struct fts_word_freq_t;
@@ -71,6 +74,7 @@ struct fts_query_t {
dict_index_t* index; /*!< The FTS index to search */
/*!< FTS auxiliary common table def */
+
fts_table_t fts_common_table;
fts_table_t fts_index_table;/*!< FTS auxiliary index table def */
@@ -144,7 +148,18 @@ struct fts_query_t {
document, its elements are of type
fts_word_freq_t */
+ ib_rbt_t* wildcard_words; /*!< words with wildcard */
+
bool multi_exist; /*!< multiple FTS_EXIST oper */
+
+ st_mysql_ftparser* parser; /*!< fts plugin parser */
+
+ /** limit value for the fts query */
+ ulonglong limit;
+
+ /** number of docs fetched by query. This is to restrict the
+ result with limit value */
+ ulonglong n_docs;
};
/** For phrase matching, first we collect the documents and the positions
@@ -178,7 +193,7 @@ struct fts_select_t {
the FTS index */
};
-typedef std::vector<ulint> pos_vector_t;
+typedef std::vector<ulint, ut_allocator<ulint> > pos_vector_t;
/** structure defines a set of ranges for original documents, each of which
has a minimum position and maximum position. Text in such range should
@@ -197,22 +212,54 @@ struct fts_proximity_t {
/** The match positions and tokesn to match */
struct fts_phrase_t {
- ibool found; /*!< Match result */
-
- const fts_match_t*
- match; /*!< Positions within text */
-
- const ib_vector_t*
- tokens; /*!< Tokens to match */
-
- ulint distance; /*!< For matching on proximity
- distance. Can be 0 for exact match */
- CHARSET_INFO* charset; /*!< Phrase match charset */
- mem_heap_t* heap; /*!< Heap for word processing */
- ulint zip_size; /*!< row zip size */
- fts_proximity_t*proximity_pos; /*!< position info for proximity
- search verification. Records the min
- and max position of words matched */
+ fts_phrase_t(const dict_table_t* table)
+ :
+ found(false),
+ match(NULL),
+ tokens(NULL),
+ distance(0),
+ charset(NULL),
+ heap(NULL),
+ page_size(dict_table_page_size(table)),
+ proximity_pos(NULL),
+ parser(NULL)
+ {
+ }
+
+ /** Match result */
+ ibool found;
+
+ /** Positions within text */
+ const fts_match_t* match;
+
+ /** Tokens to match */
+ const ib_vector_t* tokens;
+
+ /** For matching on proximity distance. Can be 0 for exact match */
+ ulint distance;
+
+ /** Phrase match charset */
+ CHARSET_INFO* charset;
+
+ /** Heap for word processing */
+ mem_heap_t* heap;
+
+ /** Row page size */
+ const page_size_t page_size;
+
+ /** Position info for proximity search verification. Records the
+ min and max position of words matched */
+ fts_proximity_t* proximity_pos;
+
+ /** FTS plugin parser */
+ st_mysql_ftparser* parser;
+};
+
+/** Paramter passed to fts phrase match by parser */
+struct fts_phrase_param_t {
+ fts_phrase_t* phrase; /*!< Match phrase instance */
+ ulint token_index; /*!< Index of token to match next */
+ mem_heap_t* heap; /*!< Heap for word processing */
};
/** For storing the frequncy of a word/term in a document */
@@ -395,7 +442,7 @@ fts_query_lcs(
ulint r = len_p1;
ulint c = len_p2;
ulint size = (r + 1) * (c + 1) * sizeof(ulint);
- ulint* table = (ulint*) ut_malloc(size);
+ ulint* table = (ulint*) ut_malloc_nokey(size);
/* Traverse the table backwards, from the last row to the first and
also from the last column to the first. We compute the smaller
@@ -442,7 +489,7 @@ fts_query_lcs(
/*******************************************************************//**
Compare two fts_ranking_t instance on their rank value and doc ids in
descending order on the rank and ascending order on doc id.
-@return 0 if p1 == p2, < 0 if p1 < p2, > 0 if p1 > p2 */
+@return 0 if p1 == p2, < 0 if p1 < p2, > 0 if p1 > p2 */
static
int
fts_query_compare_rank(
@@ -469,67 +516,6 @@ fts_query_compare_rank(
return(1);
}
-#ifdef FTS_UTF8_DEBUG
-/*******************************************************************//**
-Convert string to lowercase.
-@return lower case string, callers responsibility to delete using
-ut_free() */
-static
-byte*
-fts_tolower(
-/*========*/
- const byte* src, /*!< in: src string */
- ulint len) /*!< in: src string length */
-{
- fts_string_t str;
- byte* lc_str = ut_malloc(len + 1);
-
- str.f_len = len;
- str.f_str = lc_str;
-
- memcpy(str.f_str, src, len);
-
- /* Make sure the last byte is NUL terminated */
- str.f_str[len] = '\0';
-
- fts_utf8_tolower(&str);
-
- return(lc_str);
-}
-
-/*******************************************************************//**
-Do a case insensitive search. Doesn't check for NUL byte end marker
-only relies on len. Convert str2 to lower case before comparing.
-@return 0 if p1 == p2, < 0 if p1 < p2, > 0 if p1 > p2 */
-static
-int
-fts_utf8_strcmp(
-/*============*/
- const fts_string_t*
- str1, /*!< in: should be lower case*/
-
- fts_string_t* str2) /*!< in: any case. We will use the length
- of this string during compare as it
- should be the min of the two strings */
-{
- byte b = str2->f_str[str2->f_len];
-
- ut_a(str2->f_len <= str1->f_len);
-
- /* We need to write a NUL byte at the end of the string because the
- string is converted to lowercase by a MySQL function which doesn't
- care about the length. */
- str2->f_str[str2->f_len] = 0;
-
- fts_utf8_tolower(str2);
-
- /* Restore the value we replaced above. */
- str2->f_str[str2->f_len] = b;
-
- return(memcmp(str1->f_str, str2->f_str, str2->f_len));
-}
-#endif
-
/*******************************************************************//**
Create words in ranking */
static
@@ -593,11 +579,7 @@ fts_ranking_words_add(
pos = rbt_size(query->word_map);
- new_word.f_str = static_cast<byte*>(mem_heap_alloc(query->heap,
- word->f_len + 1));
- memcpy(new_word.f_str, word->f_str, word->f_len);
- new_word.f_str[word->f_len] = 0;
- new_word.f_len = word->f_len;
+ fts_string_dup(&new_word, word, query->heap);
new_word.f_n_char = pos;
rbt_add_node(query->word_map, &parent, &new_word);
@@ -684,11 +666,7 @@ fts_query_add_word_freq(
memset(&word_freq, 0, sizeof(word_freq));
- word_freq.word.f_str = static_cast<byte*>(
- mem_heap_alloc(query->heap, word->f_len + 1));
- memcpy(word_freq.word.f_str, word->f_str, word->f_len);
- word_freq.word.f_str[word->f_len] = 0;
- word_freq.word.f_len = word->f_len;
+ fts_string_dup(&word_freq.word, word, query->heap);
word_freq.doc_count = 0;
@@ -1142,8 +1120,12 @@ fts_query_difference(
ut_a(query->oper == FTS_IGNORE);
#ifdef FTS_INTERNAL_DIAG_PRINT
- fprintf(stderr, "DIFFERENCE: Searching: '%.*s'\n",
- (int) token->f_len, token->f_str);
+ {
+ ib::info out;
+ out << "DIFFERENCE: Searching: '";
+ out.write(token->f_str, token->f_len);
+ out << "'";
+ }
#endif
if (query->doc_ids) {
@@ -1233,8 +1215,12 @@ fts_query_intersect(
ut_a(query->oper == FTS_EXIST);
#ifdef FTS_INTERNAL_DIAG_PRINT
- fprintf(stderr, "INTERSECT: Searching: '%.*s'\n",
- (int) token->f_len, token->f_str);
+ {
+ ib::info out;
+ out << "INTERSECT: Searching: '";
+ out.write(token->f_str, token->f_len);
+ out << "'";
+ }
#endif
/* If the words set is not empty and multi exist is true,
@@ -1415,8 +1401,12 @@ fts_query_union(
query->oper == FTS_NEGATE || query->oper == FTS_INCR_RATING);
#ifdef FTS_INTERNAL_DIAG_PRINT
- fprintf(stderr, "UNION: Searching: '%.*s'\n",
- (int) token->f_len, token->f_str);
+ {
+ ib::info out;
+ out << "UNION: Searching: '";
+ out.write(token->f_str, token->f_len);
+ out << "'";
+ }
#endif
if (query->doc_ids) {
@@ -1427,10 +1417,6 @@ fts_query_union(
return(query->error);
}
- /* Single '%' would confuse parser in pars_like_rebind(). In addition,
- our wildcard search only supports prefix search */
- ut_ad(*token->f_str != '%');
-
fts_query_cache(query, token);
/* Setup the callback args for filtering and
@@ -1626,18 +1612,17 @@ fts_query_match_phrase_terms(
const fts_string_t* token;
int result;
ulint ret;
- ulint offset;
ret = innobase_mysql_fts_get_token(
- phrase->charset, ptr, (byte*) end,
- &match, &offset);
+ phrase->charset, ptr,
+ const_cast<byte*>(end), &match);
if (match.f_len > 0) {
/* Get next token to match. */
token = static_cast<const fts_string_t*>(
ib_vector_get_const(tokens, i));
- fts_utf8_string_dup(&cmp_str, &match, heap);
+ fts_string_dup(&cmp_str, &match, heap);
result = innobase_fts_text_case_cmp(
phrase->charset, token, &cmp_str);
@@ -1718,12 +1703,11 @@ fts_proximity_is_word_in_range(
while (cur_pos <= proximity_pos->max_pos[i]) {
ulint len;
fts_string_t str;
- ulint offset = 0;
len = innobase_mysql_fts_get_token(
phrase->charset,
start + cur_pos,
- start + total_len, &str, &offset);
+ start + total_len, &str);
if (len == 0) {
break;
@@ -1753,6 +1737,103 @@ fts_proximity_is_word_in_range(
}
/*****************************************************************//**
+FTS plugin parser 'myql_add_word' callback function for phrase match
+Refer to 'st_mysql_ftparser_param' for more detail.
+@return 0 if match, or return non-zero */
+static
+int
+fts_query_match_phrase_add_word_for_parser(
+/*=======================================*/
+ MYSQL_FTPARSER_PARAM* param, /*!< in: parser param */
+ const char* word, /*!< in: token */
+ int word_len, /*!< in: token length */
+ MYSQL_FTPARSER_BOOLEAN_INFO* info) /*!< in: token info */
+{
+ fts_phrase_param_t* phrase_param;
+ fts_phrase_t* phrase;
+ const ib_vector_t* tokens;
+ fts_string_t match;
+ fts_string_t cmp_str;
+ const fts_string_t* token;
+ int result;
+ mem_heap_t* heap;
+
+ phrase_param = static_cast<fts_phrase_param_t*>(param->mysql_ftparam);
+ heap = phrase_param->heap;
+ phrase = phrase_param->phrase;
+ tokens = phrase->tokens;
+
+ /* In case plugin parser doesn't check return value */
+ if (phrase_param->token_index == ib_vector_size(tokens)) {
+ return(1);
+ }
+
+ match.f_str = (uchar *)(word);
+ match.f_len = word_len;
+ match.f_n_char = fts_get_token_size(phrase->charset, word, word_len);
+
+ if (match.f_len > 0) {
+ /* Get next token to match. */
+ ut_a(phrase_param->token_index < ib_vector_size(tokens));
+ token = static_cast<const fts_string_t*>(
+ ib_vector_get_const(tokens, phrase_param->token_index));
+
+ fts_string_dup(&cmp_str, &match, heap);
+
+ result = innobase_fts_text_case_cmp(
+ phrase->charset, token, &cmp_str);
+
+ if (result == 0) {
+ phrase_param->token_index++;
+ } else {
+ return(1);
+ }
+ }
+
+ /* Can't be greater than the number of elements. */
+ ut_a(phrase_param->token_index <= ib_vector_size(tokens));
+
+ /* This is the case for multiple words. */
+ if (phrase_param->token_index == ib_vector_size(tokens)) {
+ phrase->found = TRUE;
+ }
+
+ return(static_cast<int>(phrase->found));
+}
+
+/*****************************************************************//**
+Check whether the terms in the phrase match the text.
+@return TRUE if matched else FALSE */
+static
+ibool
+fts_query_match_phrase_terms_by_parser(
+/*===================================*/
+ fts_phrase_param_t* phrase_param, /* in/out: phrase param */
+ st_mysql_ftparser* parser, /* in: plugin fts parser */
+ byte* text, /* in: text to check */
+ ulint len) /* in: text length */
+{
+ MYSQL_FTPARSER_PARAM param;
+
+ ut_a(parser);
+
+ /* Set paramters for param */
+ param.mysql_parse = fts_tokenize_document_internal;
+ param.mysql_add_word = fts_query_match_phrase_add_word_for_parser;
+ param.mysql_ftparam = phrase_param;
+ param.cs = phrase_param->phrase->charset;
+ param.doc = reinterpret_cast<char*>(text);
+ param.length = static_cast<int>(len);
+ param.mode= MYSQL_FTPARSER_WITH_STOPWORDS;
+
+ PARSER_INIT(parser, &param);
+ parser->parse(&param);
+ PARSER_DEINIT(parser, &param);
+
+ return(phrase_param->phrase->found);
+}
+
+/*****************************************************************//**
Callback function to fetch and search the document.
@return TRUE if matched else FALSE */
static
@@ -1786,11 +1867,7 @@ fts_query_match_phrase(
for (i = phrase->match->start; i < ib_vector_size(positions); ++i) {
ulint pos;
- fts_string_t match;
- fts_string_t cmp_str;
byte* ptr = start;
- ulint ret;
- ulint offset;
pos = *(ulint*) ib_vector_get_const(positions, i);
@@ -1807,39 +1884,60 @@ fts_query_match_phrase(
searched field to adjust the doc position when search
phrases. */
pos -= prev_len;
- ptr = match.f_str = start + pos;
+ ptr = start + pos;
/* Within limits ? */
if (ptr >= end) {
break;
}
- ret = innobase_mysql_fts_get_token(
- phrase->charset, start + pos, (byte*) end,
- &match, &offset);
+ if (phrase->parser) {
+ fts_phrase_param_t phrase_param;
- if (match.f_len == 0) {
- break;
- }
+ phrase_param.phrase = phrase;
+ phrase_param.token_index = 0;
+ phrase_param.heap = heap;
- fts_utf8_string_dup(&cmp_str, &match, heap);
+ if (fts_query_match_phrase_terms_by_parser(
+ &phrase_param,
+ phrase->parser,
+ ptr,
+ (end - ptr))) {
+ break;
+ }
+ } else {
+ fts_string_t match;
+ fts_string_t cmp_str;
+ ulint ret;
- if (innobase_fts_text_case_cmp(
- phrase->charset, first, &cmp_str) == 0) {
+ match.f_str = ptr;
+ ret = innobase_mysql_fts_get_token(
+ phrase->charset, start + pos,
+ const_cast<byte*>(end), &match);
- /* This is the case for the single word
- in the phrase. */
- if (ib_vector_size(phrase->tokens) == 1) {
- phrase->found = TRUE;
+ if (match.f_len == 0) {
break;
}
- ptr += ret;
+ fts_string_dup(&cmp_str, &match, heap);
- /* Match the remaining terms in the phrase. */
- if (fts_query_match_phrase_terms(phrase, &ptr,
- end, heap)) {
- break;
+ if (innobase_fts_text_case_cmp(
+ phrase->charset, first, &cmp_str) == 0) {
+
+ /* This is the case for the single word
+ in the phrase. */
+ if (ib_vector_size(phrase->tokens) == 1) {
+ phrase->found = TRUE;
+ break;
+ }
+
+ ptr += ret;
+
+ /* Match the remaining terms in the phrase. */
+ if (fts_query_match_phrase_terms(phrase, &ptr,
+ end, heap)) {
+ break;
+ }
}
}
}
@@ -1915,9 +2013,9 @@ fts_query_fetch_document(
if (dfield_is_ext(dfield)) {
data = btr_copy_externally_stored_field(
- &cur_len, data, phrase->zip_size,
- dfield_get_len(dfield), phrase->heap,
- NULL);
+ &cur_len, data, phrase->page_size,
+ dfield_get_len(dfield), phrase->heap
+ );
} else {
cur_len = dfield_get_len(dfield);
}
@@ -2032,13 +2130,22 @@ fts_query_find_term(
fts_select_t select;
doc_id_t match_doc_id;
trx_t* trx = query->trx;
+ char table_name[MAX_FULL_NAME_LEN];
trx->op_info = "fetching FTS index matching nodes";
if (*graph) {
info = (*graph)->info;
} else {
+ ulint selected;
+
info = pars_info_create();
+
+ selected = fts_select_index(*word->f_str);
+ query->fts_index_table.suffix = fts_get_suffix(selected);
+
+ fts_get_table_name(&query->fts_index_table, table_name);
+ pars_info_bind_id(info, true, "index_table_name", table_name);
}
select.found = FALSE;
@@ -2057,11 +2164,6 @@ fts_query_find_term(
fts_bind_doc_id(info, "max_doc_id", &match_doc_id);
if (!*graph) {
- ulint selected;
-
- selected = fts_select_index(*word->f_str);
-
- query->fts_index_table.suffix = fts_get_suffix(selected);
*graph = fts_parse_sql(
&query->fts_index_table,
@@ -2069,10 +2171,10 @@ fts_query_find_term(
"DECLARE FUNCTION my_func;\n"
"DECLARE CURSOR c IS"
" SELECT doc_count, ilist\n"
- " FROM \"%s\"\n"
- " WHERE word LIKE :word AND "
- " first_doc_id <= :min_doc_id AND "
- " last_doc_id >= :max_doc_id\n"
+ " FROM $index_table_name\n"
+ " WHERE word LIKE :word AND"
+ " first_doc_id <= :min_doc_id AND"
+ " last_doc_id >= :max_doc_id\n"
" ORDER BY first_doc_id;\n"
"BEGIN\n"
"\n"
@@ -2086,24 +2188,22 @@ fts_query_find_term(
"CLOSE c;");
}
- for(;;) {
+ for (;;) {
error = fts_eval_sql(trx, *graph);
if (error == DB_SUCCESS) {
break; /* Exit the loop. */
} else {
- ut_print_timestamp(stderr);
if (error == DB_LOCK_WAIT_TIMEOUT) {
- fprintf(stderr, " InnoDB: Warning: lock wait "
- "timeout reading FTS index. "
- "Retrying!\n");
+ ib::warn() << "lock wait timeout reading FTS"
+ " index. Retrying!";
trx->error_state = DB_SUCCESS;
} else {
- fprintf(stderr, " InnoDB: Error: %lu "
- "while reading FTS index.\n", error);
+ ib::error() << error
+ << " while reading FTS index.";
break; /* Exit the loop. */
}
@@ -2168,6 +2268,7 @@ fts_query_total_docs_containing_term(
que_t* graph;
ulint selected;
trx_t* trx = query->trx;
+ char table_name[MAX_FULL_NAME_LEN]
trx->op_info = "fetching FTS index document count";
@@ -2182,14 +2283,18 @@ fts_query_total_docs_containing_term(
query->fts_index_table.suffix = fts_get_suffix(selected);
+ fts_get_table_name(&query->fts_index_table, table_name);
+
+ pars_info_bind_id(info, true, "index_table_name", table_name);
+
graph = fts_parse_sql(
&query->fts_index_table,
info,
"DECLARE FUNCTION my_func;\n"
"DECLARE CURSOR c IS"
" SELECT doc_count\n"
- " FROM %s\n"
- " WHERE word = :word "
+ " FROM $index_table_name\n"
+ " WHERE word = :word"
" ORDER BY first_doc_id;\n"
"BEGIN\n"
"\n"
@@ -2202,24 +2307,22 @@ fts_query_total_docs_containing_term(
"END LOOP;\n"
"CLOSE c;");
- for(;;) {
+ for (;;) {
error = fts_eval_sql(trx, graph);
if (error == DB_SUCCESS) {
break; /* Exit the loop. */
} else {
- ut_print_timestamp(stderr);
if (error == DB_LOCK_WAIT_TIMEOUT) {
- fprintf(stderr, " InnoDB: Warning: lock wait "
- "timeout reading FTS index. "
- "Retrying!\n");
+ ib::warn() << "lock wait timeout reading FTS"
+ " index. Retrying!";
trx->error_state = DB_SUCCESS;
} else {
- fprintf(stderr, " InnoDB: Error: %lu "
- "while reading FTS index.\n", error);
+ ib::error() << error
+ << " while reading FTS index.";
break; /* Exit the loop. */
}
@@ -2247,6 +2350,7 @@ fts_query_terms_in_document(
que_t* graph;
doc_id_t read_doc_id;
trx_t* trx = query->trx;
+ char table_name[MAX_FULL_NAME_LEN];
trx->op_info = "fetching FTS document term count";
@@ -2262,15 +2366,19 @@ fts_query_terms_in_document(
query->fts_index_table.suffix = "DOC_ID";
+ fts_get_table_name(&query->fts_index_table, table_name);
+
+ pars_info_bind_id(info, true, "index_table_name", table_name);
+
graph = fts_parse_sql(
&query->fts_index_table,
info,
"DECLARE FUNCTION my_func;\n"
"DECLARE CURSOR c IS"
" SELECT count\n"
- " FROM \"%s\"\n"
- " WHERE doc_id = :doc_id "
- "BEGIN\n"
+ " FROM $index_table_name\n"
+ " WHERE doc_id = :doc_id"
+ " BEGIN\n"
"\n"
"OPEN c;\n"
"WHILE 1 = 1 LOOP\n"
@@ -2281,25 +2389,22 @@ fts_query_terms_in_document(
"END LOOP;\n"
"CLOSE c;");
- for(;;) {
+ for (;;) {
error = fts_eval_sql(trx, graph);
if (error == DB_SUCCESS) {
break; /* Exit the loop. */
} else {
- ut_print_timestamp(stderr);
if (error == DB_LOCK_WAIT_TIMEOUT) {
- fprintf(stderr, " InnoDB: Warning: lock wait "
- "timeout reading FTS doc id table. "
- "Retrying!\n");
+ ib::warn() << "lock wait timeout reading FTS"
+ " doc id table. Retrying!";
trx->error_state = DB_SUCCESS;
} else {
- fprintf(stderr, " InnoDB: Error: %lu "
- "while reading FTS doc id table.\n",
- error);
+ ib::error() << error << " while reading FTS"
+ " doc id table.";
break; /* Exit the loop. */
}
@@ -2323,20 +2428,18 @@ fts_query_match_document(
fts_get_doc_t* get_doc, /*!< in: table and prepared statements */
fts_match_t* match, /*!< in: doc id and positions */
ulint distance, /*!< in: proximity distance */
+ st_mysql_ftparser* parser, /*!< in: fts plugin parser */
ibool* found) /*!< out: TRUE if phrase found */
{
dberr_t error;
- fts_phrase_t phrase;
-
- memset(&phrase, 0x0, sizeof(phrase));
+ fts_phrase_t phrase(get_doc->index_cache->index->table);
phrase.match = match; /* Positions to match */
phrase.tokens = tokens; /* Tokens to match */
phrase.distance = distance;
phrase.charset = get_doc->index_cache->charset;
- phrase.zip_size = dict_table_zip_size(
- get_doc->index_cache->index->table);
phrase.heap = mem_heap_create(512);
+ phrase.parser = parser;
*found = phrase.found = FALSE;
@@ -2345,9 +2448,8 @@ fts_query_match_document(
fts_query_fetch_document, &phrase);
if (error != DB_SUCCESS) {
- ut_print_timestamp(stderr);
- fprintf(stderr, "InnoDB: Error: (%s) matching document.\n",
- ut_strerr(error));
+ ib::error() << "(" << ut_strerr(error)
+ << ") matching document.";
} else {
*found = phrase.found;
}
@@ -2370,23 +2472,21 @@ fts_query_is_in_proximity_range(
fts_proximity_t* qualified_pos) /*!< in: position info for
qualified ranges */
{
- fts_get_doc_t get_doc;
- fts_cache_t* cache = query->index->table->fts->cache;
- dberr_t err;
- fts_phrase_t phrase;
+ fts_get_doc_t get_doc;
+ fts_cache_t* cache = query->index->table->fts->cache;
+ dberr_t err;
memset(&get_doc, 0x0, sizeof(get_doc));
- memset(&phrase, 0x0, sizeof(phrase));
rw_lock_x_lock(&cache->lock);
get_doc.index_cache = fts_find_index_cache(cache, query->index);
rw_lock_x_unlock(&cache->lock);
ut_a(get_doc.index_cache != NULL);
+ fts_phrase_t phrase(get_doc.index_cache->index->table);
+
phrase.distance = query->distance;
phrase.charset = get_doc.index_cache->charset;
- phrase.zip_size = dict_table_zip_size(
- get_doc.index_cache->index->table);
phrase.heap = mem_heap_create(512);
phrase.proximity_pos = qualified_pos;
phrase.found = FALSE;
@@ -2396,9 +2496,8 @@ fts_query_is_in_proximity_range(
fts_query_fetch_document, &phrase);
if (err != DB_SUCCESS) {
- ib_logf(IB_LOG_LEVEL_ERROR,
- "Error: (%s) in verification phase of proximity "
- "search", ut_strerr(err));
+ ib::error() << "(" << ut_strerr(err) << ") in verification"
+ " phase of proximity search";
}
/* Free the prepared statement. */
@@ -2449,8 +2548,7 @@ fts_query_search_phrase(
rw_lock_x_unlock(&cache->lock);
#ifdef FTS_INTERNAL_DIAG_PRINT
- ut_print_timestamp(stderr);
- fprintf(stderr, " Start phrase search\n");
+ ib::info() << "Start phrase search";
#endif
/* Read the document from disk and do the actual
@@ -2468,8 +2566,8 @@ fts_query_search_phrase(
if (match->doc_id != 0) {
query->error = fts_query_match_document(
- orig_tokens, &get_doc,
- match, query->distance, &found);
+ orig_tokens, &get_doc, match,
+ query->distance, query->parser, &found);
if (query->error == DB_SUCCESS && found) {
ulint z;
@@ -2501,57 +2599,77 @@ func_exit:
return(query->error);
}
-/*****************************************************************//**
-Text/Phrase search.
-@return DB_SUCCESS or error code */
-static MY_ATTRIBUTE((nonnull, warn_unused_result))
-dberr_t
-fts_query_phrase_search(
-/*====================*/
- fts_query_t* query, /*!< in: query instance */
- const fts_string_t* phrase) /*!< in: token to search */
+/** Split the phrase into tokens
+@param[in,out] query query instance
+@param[in] node query node to search
+@param[in,out] tokens token vector
+@param[in,out] orig_tokens original node tokens include stopword
+@param[in,out] heap mem heap */
+static
+void
+fts_query_phrase_split(
+ fts_query_t* query,
+ const fts_ast_node_t* node,
+ ib_vector_t* tokens,
+ ib_vector_t* orig_tokens,
+ mem_heap_t* heap)
{
- ib_vector_t* tokens;
- ib_vector_t* orig_tokens;
- mem_heap_t* heap = mem_heap_create(sizeof(fts_string_t));
- ulint len = phrase->f_len;
+ fts_string_t phrase;
+ ulint len = 0;
ulint cur_pos = 0;
- ib_alloc_t* heap_alloc;
- ulint num_token;
- CHARSET_INFO* charset;
-
- charset = query->fts_index_table.charset;
-
- heap_alloc = ib_heap_allocator_create(heap);
-
- tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4);
- orig_tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4);
+ fts_ast_node_t* term_node = NULL;
- if (query->distance != ULINT_UNDEFINED && query->distance > 0) {
- query->flags = FTS_PROXIMITY;
+ if (node->type == FTS_AST_TEXT) {
+ phrase.f_str = node->text.ptr->str;
+ phrase.f_len = node->text.ptr->len;
+ len = phrase.f_len;
} else {
- query->flags = FTS_PHRASE;
+ ut_ad(node->type == FTS_AST_PARSER_PHRASE_LIST);
+ phrase.f_str = NULL;
+ phrase.f_len = 0;
+ term_node = node->list.head;
}
- /* Split the phrase into tokens. */
- while (cur_pos < len) {
+ while (true) {
fts_cache_t* cache = query->index->table->fts->cache;
- ib_rbt_bound_t parent;
- ulint offset;
ulint cur_len;
fts_string_t result_str;
- cur_len = innobase_mysql_fts_get_token(
- charset,
- reinterpret_cast<const byte*>(phrase->f_str) + cur_pos,
- reinterpret_cast<const byte*>(phrase->f_str) + len,
- &result_str, &offset);
+ if (node->type == FTS_AST_TEXT) {
+ if (cur_pos >= len) {
+ break;
+ }
- if (cur_len == 0) {
- break;
- }
+ cur_len = innobase_mysql_fts_get_token(
+ query->fts_index_table.charset,
+ reinterpret_cast<const byte*>(phrase.f_str)
+ + cur_pos,
+ reinterpret_cast<const byte*>(phrase.f_str)
+ + len,
+ &result_str);
- cur_pos += cur_len;
+ if (cur_len == 0) {
+ break;
+ }
+
+ cur_pos += cur_len;
+ } else {
+ ut_ad(node->type == FTS_AST_PARSER_PHRASE_LIST);
+ /* Term node in parser phrase list */
+ if (term_node == NULL) {
+ break;
+ }
+
+ ut_a(term_node->type == FTS_AST_TERM);
+ result_str.f_str = term_node->term.ptr->str;
+ result_str.f_len = term_node->term.ptr->len;
+ result_str.f_n_char = fts_get_token_size(
+ query->fts_index_table.charset,
+ reinterpret_cast<char*>(result_str.f_str),
+ result_str.f_len);
+
+ term_node = term_node->next;
+ }
if (result_str.f_n_char == 0) {
continue;
@@ -2559,19 +2677,13 @@ fts_query_phrase_search(
fts_string_t* token = static_cast<fts_string_t*>(
ib_vector_push(tokens, NULL));
+ fts_string_dup(token, &result_str, heap);
- token->f_str = static_cast<byte*>(
- mem_heap_alloc(heap, result_str.f_len + 1));
- ut_memcpy(token->f_str, result_str.f_str, result_str.f_len);
-
- token->f_len = result_str.f_len;
- token->f_str[token->f_len] = 0;
-
- if (cache->stopword_info.cached_stopword
- && rbt_search(cache->stopword_info.cached_stopword,
- &parent, token) != 0
- && result_str.f_n_char >= fts_min_token_size
- && result_str.f_n_char <= fts_max_token_size) {
+ if (fts_check_token(
+ &result_str,
+ cache->stopword_info.cached_stopword,
+ query->index->is_ngram,
+ query->fts_index_table.charset)) {
/* Add the word to the RB tree so that we can
calculate it's frequencey within a document. */
fts_query_add_word_freq(query, token);
@@ -2590,6 +2702,37 @@ fts_query_phrase_search(
orig_token->f_len = token->f_len;
}
}
+}
+
+/*****************************************************************//**
+Text/Phrase search.
+@return DB_SUCCESS or error code */
+static MY_ATTRIBUTE((warn_unused_result))
+dberr_t
+fts_query_phrase_search(
+/*====================*/
+ fts_query_t* query, /*!< in: query instance */
+ const fts_ast_node_t* node) /*!< in: node to search */
+{
+ ib_vector_t* tokens;
+ ib_vector_t* orig_tokens;
+ mem_heap_t* heap = mem_heap_create(sizeof(fts_string_t));
+ ib_alloc_t* heap_alloc;
+ ulint num_token;
+
+ heap_alloc = ib_heap_allocator_create(heap);
+
+ tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4);
+ orig_tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4);
+
+ if (query->distance != ULINT_UNDEFINED && query->distance > 0) {
+ query->flags = FTS_PROXIMITY;
+ } else {
+ query->flags = FTS_PHRASE;
+ }
+
+ /* Split the phrase into tokens. */
+ fts_query_phrase_split(query, node, tokens, orig_tokens, heap);
num_token = ib_vector_size(tokens);
if (num_token > MAX_PROXIMITY_ITEM) {
@@ -2787,7 +2930,7 @@ fts_query_execute(
/*****************************************************************//**
Create a wildcard string. It's the responsibility of the caller to
-free the byte* pointer. It's allocated using ut_malloc().
+free the byte* pointer. It's allocated using ut_malloc_nokey().
@return ptr to allocated memory */
static
byte*
@@ -2808,7 +2951,7 @@ fts_query_get_token(
if (node->term.wildcard) {
- token->f_str = static_cast<byte*>(ut_malloc(str_len + 2));
+ token->f_str = static_cast<byte*>(ut_malloc_nokey(str_len + 2));
token->f_len = str_len + 1;
memcpy(token->f_str, node->term.ptr->str, str_len);
@@ -2846,8 +2989,7 @@ fts_query_visitor(
switch (node->type) {
case FTS_AST_TEXT:
- token.f_str = node->text.ptr->str;
- token.f_len = node->text.ptr->len;
+ case FTS_AST_PARSER_PHRASE_LIST:
if (query->oper == FTS_EXIST) {
ut_ad(query->intersection == NULL);
@@ -2863,7 +3005,7 @@ fts_query_visitor(
/* Force collection of doc ids and the positions. */
query->collect_positions = TRUE;
- query->error = fts_query_phrase_search(query, &token);
+ query->error = fts_query_phrase_search(query, node);
query->collect_positions = FALSE;
@@ -2879,6 +3021,20 @@ fts_query_visitor(
token.f_str = node->term.ptr->str;
token.f_len = node->term.ptr->len;
+ /* Collect wildcard words for QUERY EXPANSION. */
+ if (node->term.wildcard && query->wildcard_words != NULL) {
+ ib_rbt_bound_t parent;
+
+ if (rbt_search(query->wildcard_words, &parent, &token)
+ != 0) {
+ fts_string_t word;
+
+ fts_string_dup(&word, &token, query->heap);
+ rbt_add_node(query->wildcard_words, &parent,
+ &word);
+ }
+ }
+
/* Add the word to our RB tree that will be used to
calculate this terms per document frequency. */
fts_query_add_word_freq(query, &token);
@@ -2889,6 +3045,7 @@ fts_query_visitor(
if (ptr) {
ut_free(ptr);
}
+
break;
case FTS_AST_SUBEXP_LIST:
@@ -2910,8 +3067,7 @@ fts_query_visitor(
Process (nested) sub-expression, create a new result set to store the
sub-expression result by processing nodes under current sub-expression
list. Merge the sub-expression result with that of parent expression list.
-@return DB_SUCCESS if all well */
-UNIV_INTERN
+@return DB_SUCCESS if all go well */
dberr_t
fts_ast_visit_sub_exp(
/*==================*/
@@ -3060,6 +3216,11 @@ fts_query_filter_doc_ids(
ulint decoded = 0;
ib_rbt_t* doc_freqs = word_freq->doc_freqs;
+ if (query->limit != ULONG_UNDEFINED
+ && query->n_docs >= query->limit) {
+ return(DB_SUCCESS);
+ }
+
/* Decode the ilist and add the doc ids to the query doc_id set. */
while (decoded < len) {
ulint freq = 0;
@@ -3147,11 +3308,17 @@ fts_query_filter_doc_ids(
/* Add the word to the document's matched RB tree. */
fts_query_add_word_to_document(query, doc_id, word);
}
+
+ if (query->limit != ULONG_UNDEFINED
+ && query->limit <= ++query->n_docs) {
+ goto func_exit;
+ }
}
/* Some sanity checks. */
ut_a(doc_id == node->last_doc_id);
+func_exit:
if (query->total_size > fts_result_cache_limit) {
return(DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
} else {
@@ -3180,8 +3347,9 @@ fts_query_read_node(
byte buf[FTS_MAX_WORD_LEN + 1];
dberr_t error = DB_SUCCESS;
- ut_a(query->cur_node->type == FTS_AST_TERM ||
- query->cur_node->type == FTS_AST_TEXT);
+ ut_a(query->cur_node->type == FTS_AST_TERM
+ || query->cur_node->type == FTS_AST_TEXT
+ || query->cur_node->type == FTS_AST_PARSER_PHRASE_LIST);
memset(&node, 0, sizeof(node));
term.f_str = buf;
@@ -3191,6 +3359,7 @@ fts_query_read_node(
to assign the frequency on search string behalf. */
if (query->cur_node->type == FTS_AST_TERM
&& query->cur_node->term.wildcard) {
+
term.f_len = query->cur_node->term.ptr->len;
ut_ad(FTS_MAX_WORD_LEN >= term.f_len);
memcpy(term.f_str, query->cur_node->term.ptr->str, term.f_len);
@@ -3344,11 +3513,11 @@ fts_query_calculate_idf(
}
if (fts_enable_diag_print) {
- fprintf(stderr,"'%s' -> " UINT64PF "/" UINT64PF
- " %6.5lf\n",
- word_freq->word.f_str,
- query->total_docs, word_freq->doc_count,
- word_freq->idf);
+ ib::info() << "'" << word_freq->word.f_str << "' -> "
+ << query->total_docs << "/"
+ << word_freq->doc_count << " "
+ << std::setw(6) << std::setprecision(5)
+ << word_freq->idf;
}
}
}
@@ -3477,9 +3646,8 @@ fts_query_prepare_result(
DBUG_ENTER("fts_query_prepare_result");
if (result == NULL) {
- result = static_cast<fts_result_t*>(ut_malloc(sizeof(*result)));
-
- memset(result, 0x0, sizeof(*result));
+ result = static_cast<fts_result_t*>(
+ ut_zalloc_nokey(sizeof(*result)));
result->rankings_by_id = rbt_create(
sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
@@ -3605,8 +3773,8 @@ fts_query_get_result(
result = fts_query_prepare_result(query, result);
} else {
/* Create an empty result instance. */
- result = static_cast<fts_result_t*>(ut_malloc(sizeof(*result)));
- memset(result, 0, sizeof(*result));
+ result = static_cast<fts_result_t*>(
+ ut_zalloc_nokey(sizeof(*result)));
}
DBUG_RETURN(result);
@@ -3657,14 +3825,18 @@ fts_query_free(
rbt_free(query->word_freqs);
}
+ if (query->wildcard_words != NULL) {
+ rbt_free(query->wildcard_words);
+ }
+
ut_a(!query->intersection);
if (query->word_map) {
rbt_free(query->word_map);
}
- if (query->word_vector) {
- delete query->word_vector;
+ if (query->word_vector != NULL) {
+ UT_DELETE(query->word_vector);
}
if (query->heap) {
@@ -3675,7 +3847,8 @@ fts_query_free(
}
/*****************************************************************//**
-Parse the query using flex/bison. */
+Parse the query using flex/bison or plugin parser.
+@return parse tree node. */
static
fts_ast_node_t*
fts_query_parse(
@@ -3691,12 +3864,24 @@ fts_query_parse(
memset(&state, 0x0, sizeof(state));
- /* Setup the scanner to use, this depends on the mode flag. */
- state.lexer = fts_lexer_create(mode, query_str, query_len);
state.charset = query->fts_index_table.charset;
- error = fts_parse(&state);
- fts_lexer_free(state.lexer);
- state.lexer = NULL;
+
+ DBUG_EXECUTE_IF("fts_instrument_query_disable_parser",
+ query->parser = NULL;);
+
+ if (query->parser) {
+ state.root = state.cur_node =
+ fts_ast_create_node_list(&state, NULL);
+ error = fts_parse_by_parser(mode, query_str, query_len,
+ query->parser, &state);
+ } else {
+ /* Setup the scanner to use, this depends on the mode flag. */
+ state.lexer = fts_lexer_create(mode, query_str, query_len);
+ state.charset = query->fts_index_table.charset;
+ error = fts_parse(&state);
+ fts_lexer_free(state.lexer);
+ state.lexer = NULL;
+ }
/* Error during parsing ? */
if (error) {
@@ -3704,6 +3889,10 @@ fts_query_parse(
fts_ast_state_free(&state);
} else {
query->root = state.root;
+
+ if (fts_enable_diag_print && query->root != NULL) {
+ fts_ast_node_print(query->root);
+ }
}
DBUG_RETURN(state.root);
@@ -3733,108 +3922,29 @@ fts_query_can_optimize(
}
}
-/*******************************************************************//**
-Pre-process the query string
-1) make it lower case
-2) in boolean mode, if there is '-' or '+' that is immediately proceeded
-and followed by valid word, make it a space
-@return the processed string */
-static
-byte*
-fts_query_str_preprocess(
-/*=====================*/
- const byte* query_str, /*!< in: FTS query */
- ulint query_len, /*!< in: FTS query string len */
- ulint *result_len, /*!< out: result string length */
- CHARSET_INFO* charset, /*!< in: string charset */
- bool boolean_mode) /*!< in: is boolean mode */
-{
- ulint cur_pos = 0;
- ulint str_len;
- byte* str_ptr;
- bool in_phrase = false;
-
- /* Convert the query string to lower case before parsing. We own
- the ut_malloc'ed result and so remember to free it before return. */
-
- str_len = query_len * charset->casedn_multiply + 1;
- str_ptr = static_cast<byte*>(ut_malloc(str_len));
-
- *result_len = innobase_fts_casedn_str(
- charset, const_cast<char*>(reinterpret_cast<const char*>(
- query_str)), query_len,
- reinterpret_cast<char*>(str_ptr), str_len);
-
- ut_ad(*result_len < str_len);
-
- str_ptr[*result_len] = 0;
-
- /* If it is boolean mode, no need to check for '-/+' */
- if (!boolean_mode) {
- return(str_ptr);
- }
-
- /* Otherwise, we travese the string to find any '-/+' that are
- immediately proceeded and followed by valid search word.
- NOTE: we should not do so for CJK languages, this should
- be taken care of in our CJK implementation */
- while (cur_pos < *result_len) {
- fts_string_t str;
- ulint offset;
- ulint cur_len;
-
- cur_len = innobase_mysql_fts_get_token(
- charset, str_ptr + cur_pos, str_ptr + *result_len,
- &str, &offset);
-
- if (cur_len == 0 || str.f_str == NULL) {
- /* No valid word found */
- break;
- }
-
- /* Check if we are in a phrase, if so, no need to do
- replacement of '-/+'. */
- for (byte* ptr = str_ptr + cur_pos; ptr < str.f_str; ptr++) {
- if ((char) (*ptr) == '"' ) {
- in_phrase = !in_phrase;
- }
- }
-
- /* Find those are not leading '-/+' and also not in a phrase */
- if (cur_pos > 0 && str.f_str - str_ptr - cur_pos == 1
- && !in_phrase) {
- char* last_op = reinterpret_cast<char*>(
- str_ptr + cur_pos);
-
- if (*last_op == '-' || *last_op == '+') {
- *last_op = ' ';
- }
- }
-
- cur_pos += cur_len;
- }
-
- return(str_ptr);
-}
-
-/*******************************************************************//**
-FTS Query entry point.
+/** FTS Query entry point.
+@param[in] trx transaction
+@param[in] index fts index to search
+@param[in] flags FTS search mode
+@param[in] query_str FTS query
+@param[in] query_len FTS query string len in bytes
+@param[in,out] result result doc ids
+@param[in] limit limit value
@return DB_SUCCESS if successful otherwise error code */
-UNIV_INTERN
dberr_t
fts_query(
-/*======*/
- trx_t* trx, /*!< in: transaction */
- dict_index_t* index, /*!< in: The FTS index to search */
- uint flags, /*!< in: FTS search mode */
- const byte* query_str, /*!< in: FTS query */
- ulint query_len, /*!< in: FTS query string len
- in bytes */
- fts_result_t** result) /*!< in/out: result doc ids */
+ trx_t* trx,
+ dict_index_t* index,
+ uint flags,
+ const byte* query_str,
+ ulint query_len,
+ fts_result_t** result,
+ ulonglong limit)
{
fts_query_t query;
dberr_t error = DB_SUCCESS;
byte* lc_query_str;
+ ulint lc_query_str_len;
ulint result_len;
bool boolean_mode;
trx_t* query_trx;
@@ -3859,7 +3969,7 @@ fts_query(
query.fts_common_table.type = FTS_COMMON_TABLE;
query.fts_common_table.table_id = index->table->id;
- query.fts_common_table.parent = index->table->name;
+ query.fts_common_table.parent = index->table->name.m_name;
query.fts_common_table.table = index->table;
charset = fts_index_get_charset(index);
@@ -3867,26 +3977,33 @@ fts_query(
query.fts_index_table.type = FTS_INDEX_TABLE;
query.fts_index_table.index_id = index->id;
query.fts_index_table.table_id = index->table->id;
- query.fts_index_table.parent = index->table->name;
+ query.fts_index_table.parent = index->table->name.m_name;
query.fts_index_table.charset = charset;
query.fts_index_table.table = index->table;
query.word_map = rbt_create_arg_cmp(
- sizeof(fts_string_t), innobase_fts_text_cmp,
- (void *) charset);
- query.word_vector = new word_vector_t;
+ sizeof(fts_string_t), innobase_fts_text_cmp, (void*)charset);
+ query.word_vector = UT_NEW_NOKEY(word_vector_t());
query.error = DB_SUCCESS;
/* Setup the RB tree that will be used to collect per term
statistics. */
query.word_freqs = rbt_create_arg_cmp(
- sizeof(fts_word_freq_t), innobase_fts_text_cmp,
+ sizeof(fts_word_freq_t), innobase_fts_text_cmp,
(void*) charset);
+ if (flags & FTS_EXPAND) {
+ query.wildcard_words = rbt_create_arg_cmp(
+ sizeof(fts_string_t), innobase_fts_text_cmp, (void *)charset);
+ }
+
query.total_size += SIZEOF_RBT_CREATE;
query.total_docs = dict_table_get_n_rows(index->table);
+ query.limit = limit;
+
+ query.n_docs = 0;
#ifdef FTS_DOC_STATS_DEBUG
if (ft_enable_diag_print) {
error = fts_get_total_word_count(
@@ -3896,8 +4013,8 @@ fts_query(
goto func_exit;
}
- fprintf(stderr, "Total docs: " UINT64PF " Total words: %lu\n",
- query.total_docs, query.total_words);
+ ib::info() << "Total docs: " << query.total_docs
+ << " Total words: " << query.total_words;
}
#endif /* FTS_DOC_STATS_DEBUG */
@@ -3928,12 +4045,11 @@ fts_query(
/* Sort the vector so that we can do a binary search over the ids. */
ib_vector_sort(query.deleted->doc_ids, fts_update_doc_id_cmp);
-#if 0
/* Convert the query string to lower case before parsing. We own
the ut_malloc'ed result and so remember to free it before return. */
lc_query_str_len = query_len * charset->casedn_multiply + 1;
- lc_query_str = static_cast<byte*>(ut_malloc(lc_query_str_len));
+ lc_query_str = static_cast<byte*>(ut_malloc_nokey(lc_query_str_len));
result_len = innobase_fts_casedn_str(
charset, (char*) query_str, query_len,
@@ -3943,16 +4059,12 @@ fts_query(
lc_query_str[result_len] = 0;
-#endif
-
- lc_query_str = fts_query_str_preprocess(
- query_str, query_len, &result_len, charset, boolean_mode);
-
query.heap = mem_heap_create(128);
/* Create the rb tree for the doc id (current) set. */
query.doc_ids = rbt_create(
sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
+ query.parser = index->parser;
query.total_size += SIZEOF_RBT_CREATE;
@@ -3967,6 +4079,19 @@ fts_query(
fts_result_cache_limit = 2048;
);
+ /* Optimisation is allowed for limit value
+ when
+ i) No ranking involved
+ ii) Only FTS Union operations involved. */
+ if (query.limit != ULONG_UNDEFINED
+ && !fts_ast_node_check_union(ast)) {
+ query.limit = ULONG_UNDEFINED;
+ }
+
+ DBUG_EXECUTE_IF("fts_union_limit_off",
+ query.limit = ULONG_UNDEFINED;
+ );
+
/* Traverse the Abstract Syntax Tree (AST) and execute
the query. */
query.error = fts_ast_visit(
@@ -3995,29 +4120,28 @@ fts_query(
} else {
/* still return an empty result set */
*result = static_cast<fts_result_t*>(
- ut_malloc(sizeof(**result)));
- memset(*result, 0, sizeof(**result));
+ ut_zalloc_nokey(sizeof(**result)));
}
ut_free(lc_query_str);
if (fts_enable_diag_print && (*result)) {
ulint diff_time = ut_time_ms() - start_time_ms;
- fprintf(stderr, "FTS Search Processing time: %ld secs:"
- " %ld millisec: row(s) %d \n",
- diff_time / 1000, diff_time % 1000,
- (*result)->rankings_by_id
- ? (int) rbt_size((*result)->rankings_by_id)
- : -1);
+
+ ib::info() << "FTS Search Processing time: "
+ << diff_time / 1000 << " secs: " << diff_time % 1000
+ << " millisec: row(s) "
+ << ((*result)->rankings_by_id
+ ? rbt_size((*result)->rankings_by_id)
+ : -1);
/* Log memory consumption & result size */
- ib_logf(IB_LOG_LEVEL_INFO,
- "Full Search Memory: "
- "%lu (bytes), Row: %lu .",
- query.total_size,
- (*result)->rankings_by_id
- ? rbt_size((*result)->rankings_by_id)
- : 0);
+ ib::info() << "Full Search Memory: " << query.total_size
+ << " (bytes), Row: "
+ << ((*result)->rankings_by_id
+ ? rbt_size((*result)->rankings_by_id)
+ : 0)
+ << ".";
}
func_exit:
@@ -4030,7 +4154,6 @@ func_exit:
/*****************************************************************//**
FTS Query free result, returned by fts_query(). */
-
void
fts_query_free_result(
/*==================*/
@@ -4053,7 +4176,6 @@ fts_query_free_result(
/*****************************************************************//**
FTS Query sort result, returned by fts_query() on fts_ranking_t::rank. */
-
void
fts_query_sort_result_on_rank(
/*==========================*/
@@ -4089,7 +4211,6 @@ fts_query_sort_result_on_rank(
result->rankings_by_rank = ranked;
}
-#ifdef UNIV_DEBUG
/*******************************************************************//**
A debug function to print result doc_id set. */
static
@@ -4107,18 +4228,16 @@ fts_print_doc_id(
fts_ranking_t* ranking;
ranking = rbt_value(fts_ranking_t, node);
- ib_logf(IB_LOG_LEVEL_INFO, "doc_ids info, doc_id: %ld \n",
- (ulint) ranking->doc_id);
+ ib::info() << "doc_ids info, doc_id: " << ranking->doc_id;
ulint pos = 0;
fts_string_t word;
while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
- ib_logf(IB_LOG_LEVEL_INFO, "doc_ids info, value: %s \n", word.f_str);
+ ib::info() << "doc_ids info, value: " << word.f_str;
}
}
}
-#endif
/*************************************************************//**
This function implements a simple "blind" query expansion search:
@@ -4158,19 +4277,20 @@ fts_expand_query(
(void*) index_cache->charset);
result_doc.charset = index_cache->charset;
+ result_doc.parser = index_cache->index->parser;
+ result_doc.is_ngram = index_cache->index->is_ngram;
query->total_size += SIZEOF_RBT_CREATE;
-#ifdef UNIV_DEBUG
- fts_print_doc_id(query);
-#endif
+
+ if (fts_enable_diag_print) {
+ fts_print_doc_id(query);
+ }
for (node = rbt_first(query->doc_ids);
node;
node = rbt_next(query->doc_ids, node)) {
fts_ranking_t* ranking;
- ulint pos;
- fts_string_t word;
ulint prev_token_size;
ulint estimate_size;
@@ -4189,24 +4309,6 @@ fts_expand_query(
fts_query_expansion_fetch_doc,
&result_doc);
- /* Remove words that have already been searched in the
- first pass */
- pos = 0;
- while (fts_ranking_words_get_next(query, ranking, &pos,
- &word)) {
- ibool ret;
-
- ret = rbt_delete(result_doc.tokens, &word);
-
- /* The word must exist in the doc we found */
- if (!ret) {
- ib_logf(IB_LOG_LEVEL_ERROR, "Did not "
- "find word %s in doc %ld for query "
- "expansion search.\n", word.f_str,
- (ulint) ranking->doc_id);
- }
- }
-
/* Estimate memory used, see fts_process_token and fts_token_t.
We ignore token size here. */
estimate_size = (rbt_size(result_doc.tokens) - prev_token_size)
@@ -4220,6 +4322,30 @@ fts_expand_query(
}
}
+ /* Remove words that have already been searched in the first pass */
+ for (ulint i = 0; i < query->word_vector->size(); i++) {
+ fts_string_t word = query->word_vector->at(i);
+ ib_rbt_bound_t parent;
+
+ if (query->wildcard_words
+ && rbt_search(query->wildcard_words, &parent, &word) == 0) {
+ /* If it's a wildcard word, remove words having
+ it as prefix. */
+ while (rbt_search_cmp(result_doc.tokens,
+ &parent, &word, NULL,
+ innobase_fts_text_cmp_prefix)
+ == 0) {
+ ut_free(rbt_remove_node(result_doc.tokens,
+ parent.last));
+ }
+ } else {
+ /* We don't check return value, because the word may
+ have been deleted by a previous wildcard word as its
+ prefix, e.g. ('g * good'). */
+ rbt_delete(result_doc.tokens, &word);
+ }
+ }
+
/* Search the table the second time with expanded search list */
for (token_node = rbt_first(result_doc.tokens);
token_node;
@@ -4227,6 +4353,12 @@ fts_expand_query(
fts_token_t* mytoken;
mytoken = rbt_value(fts_token_t, token_node);
+ /* '%' in the end is treated as prefix search,
+ it can cause assert failure, so we skip it. */
+ if (mytoken->text.f_str[mytoken->text.f_len - 1] == '%') {
+ continue;
+ }
+
ut_ad(mytoken->text.f_str[mytoken->text.f_len] == 0);
fts_query_add_word_freq(query, &mytoken->text);
error = fts_query_union(query, &mytoken->text);