diff options
author | Thirunarayanan Balathandayuthapani <thiru@mariadb.com> | 2018-10-16 11:38:11 +0530 |
---|---|---|
committer | Thirunarayanan Balathandayuthapani <thiru@mariadb.com> | 2018-10-16 11:38:11 +0530 |
commit | 72033f367f4d6b51eea773acba1e0031f98e05b8 (patch) | |
tree | 07c28210f68b4bd708aee9ff361b47c147ad1d8d | |
parent | fd680ef66346208811d77fa4c2c938430234f469 (diff) | |
download | mariadb-git-72033f367f4d6b51eea773acba1e0031f98e05b8.tar.gz |
MDEV-12547: InnoDB FULLTEXT index has too strict innodb_ft_result_cache_limit max limit [bb-10.0-MDEV-12547]
- Removed f_n_char from fts_string_t. Instead, InnoDB calculates
the number of characters when it is needed.
-rw-r--r-- | storage/innobase/fts/fts0ast.cc | 9 | ||||
-rw-r--r-- | storage/innobase/fts/fts0fts.cc | 24 | ||||
-rw-r--r-- | storage/innobase/fts/fts0que.cc | 30 | ||||
-rw-r--r-- | storage/innobase/handler/ha_innodb.cc | 31 | ||||
-rw-r--r-- | storage/innobase/handler/i_s.cc | 14 | ||||
-rw-r--r-- | storage/innobase/include/fts0fts.h | 26 | ||||
-rw-r--r-- | storage/innobase/include/fts0types.ic | 1 | ||||
-rw-r--r-- | storage/innobase/include/ha_prototypes.h | 27 | ||||
-rw-r--r-- | storage/innobase/row/row0ftsort.cc | 10 | ||||
-rw-r--r-- | storage/xtradb/fts/fts0ast.cc | 9 | ||||
-rw-r--r-- | storage/xtradb/fts/fts0fts.cc | 25 | ||||
-rw-r--r-- | storage/xtradb/fts/fts0que.cc | 33 | ||||
-rw-r--r-- | storage/xtradb/handler/ha_innodb.cc | 31 | ||||
-rw-r--r-- | storage/xtradb/handler/i_s.cc | 14 | ||||
-rw-r--r-- | storage/xtradb/include/fts0fts.h | 27 | ||||
-rw-r--r-- | storage/xtradb/include/fts0types.ic | 1 | ||||
-rw-r--r-- | storage/xtradb/include/ha_prototypes.h | 26 | ||||
-rw-r--r-- | storage/xtradb/row/row0ftsort.cc | 10 |
18 files changed, 181 insertions, 167 deletions
diff --git a/storage/innobase/fts/fts0ast.cc b/storage/innobase/fts/fts0ast.cc index 4b36152cf62..df735d30c13 100644 --- a/storage/innobase/fts/fts0ast.cc +++ b/storage/innobase/fts/fts0ast.cc @@ -100,12 +100,13 @@ fts_ast_create_node_term( fts_string_t str; ulint offset; ulint cur_len; + ulint n_chars = 0; cur_len = innobase_mysql_fts_get_token( state->charset, reinterpret_cast<const byte*>(ptr->str) + cur_pos, reinterpret_cast<const byte*>(ptr->str) + len, - &str, &offset); + &str, &offset, &n_chars); if (cur_len == 0) { break; @@ -113,13 +114,13 @@ fts_ast_create_node_term( cur_pos += cur_len; - if (str.f_n_char > 0) { + if (n_chars > 0) { /* If the subsequent term (after the first one)'s size is less than fts_min_token_size or the term is greater than fts_max_token_size, we shall ignore that. This is to make consistent with MyISAM behavior */ - if ((first_node && (str.f_n_char < fts_min_token_size)) - || str.f_n_char > fts_max_token_size) { + if ((first_node && (n_chars < fts_min_token_size)) + || n_chars > fts_max_token_size) { continue; } diff --git a/storage/innobase/fts/fts0fts.cc b/storage/innobase/fts/fts0fts.cc index 4891e572741..10e4d2ee03b 100644 --- a/storage/innobase/fts/fts0fts.cc +++ b/storage/innobase/fts/fts0fts.cc @@ -363,8 +363,6 @@ fts_load_default_stopword( stop_words = stopword_info->cached_stopword; - str.f_n_char = 0; - for (ulint i = 0; fts_default_stopword[i]; ++i) { char* word; fts_tokenizer_word_t new_word; @@ -418,7 +416,6 @@ fts_read_stopword( /* We only need to read the first column */ dfield = que_node_get_val(exp); - str.f_n_char = 0; str.f_str = static_cast<byte*>(dfield_get_data(dfield)); str.f_len = dfield_get_len(dfield); @@ -436,7 +433,6 @@ fts_read_stopword( memcpy(new_word.text.f_str, str.f_str, str.f_len); - new_word.text.f_n_char = 0; new_word.text.f_len = str.f_len; new_word.text.f_str[str.f_len] = 0; @@ -2527,7 +2523,6 @@ fts_get_max_cache_size( /* We set the length of value to the max bytes it can hold. 
This information is used by the callback that reads the value. */ - value.f_n_char = 0; value.f_len = FTS_MAX_CONFIG_VALUE_LEN; value.f_str = ut_malloc(value.f_len + 1); @@ -2597,7 +2592,6 @@ fts_get_total_word_count( /* We set the length of value to the max bytes it can hold. This information is used by the callback that reads the value. */ - value.f_n_char = 0; value.f_len = FTS_MAX_CONFIG_VALUE_LEN; value.f_str = static_cast<byte*>(ut_malloc(value.f_len + 1)); @@ -3259,8 +3253,6 @@ fts_query_expansion_fetch_doc( exp = que_node_get_next(exp); continue; } else { - doc.text.f_n_char = 0; - doc.text.f_str = static_cast<byte*>( dfield_get_data(dfield)); @@ -4705,6 +4697,7 @@ fts_process_token( fts_string_t str; ulint offset = 0; fts_doc_t* result_doc; + ulint n_chars = 0; /* Determine where to save the result. */ result_doc = (result) ? result : doc; @@ -4712,13 +4705,13 @@ fts_process_token( /* The length of a string in characters is set here only. */ ret = innobase_mysql_fts_get_token( doc->charset, doc->text.f_str + start_pos, - doc->text.f_str + doc->text.f_len, &str, &offset); + doc->text.f_str + doc->text.f_len, &str, &offset, &n_chars); /* Ignore string whose character number is less than "fts_min_token_size" or more than "fts_max_token_size" */ - if (str.f_n_char >= fts_min_token_size - && str.f_n_char <= fts_max_token_size) { + if (n_chars >= fts_min_token_size + && n_chars <= fts_max_token_size) { mem_heap_t* heap; fts_string_t t_str; @@ -4728,8 +4721,6 @@ fts_process_token( heap = static_cast<mem_heap_t*>(result_doc->self_heap->arg); - t_str.f_n_char = str.f_n_char; - t_str.f_len = str.f_len * doc->charset->casedn_multiply + 1; t_str.f_str = static_cast<byte*>( @@ -4757,13 +4748,12 @@ fts_process_token( new_token.text.f_len = newlen; new_token.text.f_str = t_str.f_str; - new_token.text.f_n_char = t_str.f_n_char; new_token.positions = ib_vector_create( result_doc->self_heap, sizeof(ulint), 32); - ut_a(new_token.text.f_n_char >= fts_min_token_size); - 
ut_a(new_token.text.f_n_char <= fts_max_token_size); + ut_a(n_chars >= fts_min_token_size); + ut_a(n_chars <= fts_max_token_size); parent.last = rbt_add_node( result_doc->tokens, &parent, &new_token); @@ -7408,7 +7398,6 @@ fts_load_stopword( if (reload) { /* Fetch the stopword table name from FTS config table */ - str.f_n_char = 0; str.f_str = str_buffer; str.f_len = sizeof(str_buffer) - 1; @@ -7433,7 +7422,6 @@ fts_load_stopword( /* Save the stopword table name to the configure table */ if (!reload) { - str.f_n_char = 0; str.f_str = (byte*) stopword_to_use; str.f_len = ut_strlen(stopword_to_use); diff --git a/storage/innobase/fts/fts0que.cc b/storage/innobase/fts/fts0que.cc index 7983181c23a..62456871dcf 100644 --- a/storage/innobase/fts/fts0que.cc +++ b/storage/innobase/fts/fts0que.cc @@ -583,10 +583,18 @@ fts_ranking_words_add( /* We use ib_rbt to simulate a map, f_n_char means position. */ if (rbt_search(query->word_map, &parent, word) == 0) { - fts_string_t* result_word; + CHARSET_INFO* charset = query->fts_index_table.charset; + + for (ulint i = 0; i < query->word_vector->size(); i++) { + if (!my_strcasecmp( + charset, + (const char*) query->word_vector->at(i).f_str, + (const char*) word->f_str)) { + pos = i; + break; + } + } - result_word = rbt_value(fts_string_t, parent.last); - pos = result_word->f_n_char; ut_ad(pos < rbt_size(query->word_map)); } else { /* Add the word to map. 
*/ @@ -599,7 +607,6 @@ fts_ranking_words_add( memcpy(new_word.f_str, word->f_str, word->f_len); new_word.f_str[word->f_len] = 0; new_word.f_len = word->f_len; - new_word.f_n_char = pos; rbt_add_node(query->word_map, &parent, &new_word); ut_ad(rbt_validate(query->word_map)); @@ -1720,11 +1727,12 @@ fts_proximity_is_word_in_range( ulint len; fts_string_t str; ulint offset = 0; + ulint n_chars = 0; len = innobase_mysql_fts_get_token( phrase->charset, start + cur_pos, - start + total_len, &str, &offset); + start + total_len, &str, &offset, &n_chars); if (len == 0) { break; @@ -1734,7 +1742,7 @@ fts_proximity_is_word_in_range( cur_pos += len; /* Record the number of words */ - if (str.f_n_char > 0) { + if (n_chars > 0) { n_word++; } @@ -2540,12 +2548,13 @@ fts_query_phrase_search( ulint offset; ulint cur_len; fts_string_t result_str; + ulint n_chars = 0; cur_len = innobase_mysql_fts_get_token( charset, reinterpret_cast<const byte*>(phrase->f_str) + cur_pos, reinterpret_cast<const byte*>(phrase->f_str) + len, - &result_str, &offset); + &result_str, &offset, &n_chars); if (cur_len == 0) { break; @@ -2553,7 +2562,7 @@ fts_query_phrase_search( cur_pos += cur_len; - if (result_str.f_n_char == 0) { + if (n_chars == 0) { continue; } @@ -2570,8 +2579,8 @@ fts_query_phrase_search( if (cache->stopword_info.cached_stopword && rbt_search(cache->stopword_info.cached_stopword, &parent, token) != 0 - && result_str.f_n_char >= fts_min_token_size - && result_str.f_n_char <= fts_max_token_size) { + && n_chars >= fts_min_token_size + && n_chars <= fts_max_token_size) { /* Add the word to the RB tree so that we can calculate it's frequencey within a document. 
*/ fts_query_add_word_freq(query, token); @@ -2840,7 +2849,6 @@ fts_query_visitor( DBUG_ENTER("fts_query_visitor"); DBUG_PRINT("fts", ("nodetype: %s", fts_ast_node_type_get(node->type))); - token.f_n_char = 0; query->oper = oper; query->cur_node = node; diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 2092cd113a5..e79e3516804 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -5986,29 +5986,31 @@ innobase_fts_casedn_str( #define misc_word_char(X) 0 -/*************************************************************//** -Get the next token from the given string and store it in *token. +/** Get the next token from the given string and store it in *token. It is mostly copied from MyISAM's doc parsing function ft_simple_get_word() +@param[in] cs Character set +@param[in] start start of tex +@param[in] end one character past end of text +@param[out] token token's text +@param[out] offset offset to token, measured as characters from 'start' +@param[out] n_chars number of characters existed @return length of string processed */ UNIV_INTERN ulint innobase_mysql_fts_get_token( -/*=========================*/ - CHARSET_INFO* cs, /*!< in: Character set */ - const byte* start, /*!< in: start of text */ - const byte* end, /*!< in: one character past end of - text */ - fts_string_t* token, /*!< out: token's text */ - ulint* offset) /*!< out: offset to token, - measured as characters from - 'start' */ + CHARSET_INFO* cs, + const byte* start, + const byte* end, + fts_string_t* token, + ulint* offset, + ulint* n_chars) { int mbl; const uchar* doc = start; ut_a(cs); - token->f_n_char = token->f_len = 0; + token->f_len = 0; token->f_str = NULL; for (;;) { @@ -6054,7 +6056,10 @@ innobase_mysql_fts_get_token( } token->f_len = (uint) (doc - token->f_str) - mwc; - token->f_n_char = length; + + if (n_chars != NULL) { + *n_chars = length; + } return(doc - start); } diff --git 
a/storage/innobase/handler/i_s.cc b/storage/innobase/handler/i_s.cc index de164e42273..461eb0596d0 100644 --- a/storage/innobase/handler/i_s.cc +++ b/storage/innobase/handler/i_s.cc @@ -3268,7 +3268,6 @@ i_s_fts_index_cache_fill_one_index( fields = table->field; index_charset = index_cache->charset; - conv_str->f_n_char = 0; int ret = 0; @@ -3282,15 +3281,15 @@ i_s_fts_index_cache_fill_one_index( /* Convert word from index charset to system_charset_info */ if (index_charset->cset != system_charset_info->cset) { - conv_str->f_n_char = my_convert( + ulint n_chars = my_convert( reinterpret_cast<char*>(conv_str->f_str), static_cast<uint32>(conv_str->f_len), system_charset_info, reinterpret_cast<char*>(word->text.f_str), static_cast<uint32>(word->text.f_len), index_charset, &dummy_errors); - ut_ad(conv_str->f_n_char <= conv_str->f_len); - conv_str->f_str[conv_str->f_n_char] = 0; + ut_ad(n_chars <= conv_str->f_len); + conv_str->f_str[n_chars] = 0; word_str = reinterpret_cast<char*>(conv_str->f_str); } else { word_str = reinterpret_cast<char*>(word->text.f_str); @@ -3650,15 +3649,15 @@ i_s_fts_index_table_fill_one_fetch( /* Convert word from index charset to system_charset_info */ if (index_charset->cset != system_charset_info->cset) { - conv_str->f_n_char = my_convert( + ulint n_chars = my_convert( reinterpret_cast<char*>(conv_str->f_str), static_cast<uint32>(conv_str->f_len), system_charset_info, reinterpret_cast<char*>(word->text.f_str), static_cast<uint32>(word->text.f_len), index_charset, &dummy_errors); - ut_ad(conv_str->f_n_char <= conv_str->f_len); - conv_str->f_str[conv_str->f_n_char] = 0; + ut_ad(n_chars <= conv_str->f_len); + conv_str->f_str[n_chars] = 0; word_str = reinterpret_cast<char*>(conv_str->f_str); } else { word_str = reinterpret_cast<char*>(word->text.f_str); @@ -3750,7 +3749,6 @@ i_s_fts_index_table_fill_one_index( word.f_str = NULL; word.f_len = 0; - word.f_n_char = 0; index_charset = fts_index_get_charset(index); diff --git 
a/storage/innobase/include/fts0fts.h b/storage/innobase/include/fts0fts.h index ce30a17c4b4..987db972fc3 100644 --- a/storage/innobase/include/fts0fts.h +++ b/storage/innobase/include/fts0fts.h @@ -236,7 +236,6 @@ struct fts_string_t { byte* f_str; /*!< string, not necessary terminated in any way */ ulint f_len; /*!< Length of the string in bytes */ - ulint f_n_char; /*!< Number of characters */ }; /** Query ranked doc ids. */ @@ -915,20 +914,23 @@ innobase_fts_text_cmp_prefix( const void* p1, /*!< in: key */ const void* p2); /*!< in: node */ -/*************************************************************//** -Get the next token from the given string and store it in *token. */ +/** Get the next token from the given string and store it in *token. +@param[in] cs Character set +@param[in] start start of tex +@param[in] end one character past end of text +@param[out] token token's text +@param[out] offset offset to token, measured as characters from 'start' +@param[out] n_chars number of characters existed +@return length of string processed */ extern ulint innobase_mysql_fts_get_token( -/*=========================*/ - CHARSET_INFO* charset, /*!< in: Character set */ - const byte* start, /*!< in: start of text */ - const byte* end, /*!< in: one character past - end of text */ - fts_string_t* token, /*!< out: token's text */ - ulint* offset); /*!< out: offset to token, - measured as characters from - 'start' */ + CHARSET_INFO* cs, + const byte* start, + const byte* end, + fts_string_t* token, + ulint* offset, + ulint* n_chars); /*********************************************************************//** Fetch COUNT(*) from specified table. 
diff --git a/storage/innobase/include/fts0types.ic b/storage/innobase/include/fts0types.ic index f0dfd023a70..964897320a3 100644 --- a/storage/innobase/include/fts0types.ic +++ b/storage/innobase/include/fts0types.ic @@ -52,7 +52,6 @@ fts_utf8_string_dup( dst->f_len = src->f_len; dst->f_str[src->f_len] = 0; - dst->f_n_char = src->f_n_char; } /******************************************************************//** diff --git a/storage/innobase/include/ha_prototypes.h b/storage/innobase/include/ha_prototypes.h index 488ed0257a7..dfba4d56485 100644 --- a/storage/innobase/include/ha_prototypes.h +++ b/storage/innobase/include/ha_prototypes.h @@ -386,20 +386,25 @@ innobase_close_thd( /*===============*/ THD* thd); /*!< in: MySQL thread handle for which to close the connection */ -/*************************************************************//** -Get the next token from the given string and store it in *token. */ + +/** Get the next token from the given string and store it in *token. +It is mostly copied from MyISAM's doc parsing function ft_simple_get_word() +@param[in] cs Character set +@param[in] start start of tex +@param[in] end one character past end of text +@param[out] token token's text +@param[out] offset offset to token, measured as characters from 'start' +@param[out] n_chars number of characters existed +@return length of string processed */ UNIV_INTERN ulint innobase_mysql_fts_get_token( -/*=========================*/ - CHARSET_INFO* charset, /*!< in: Character set */ - const byte* start, /*!< in: start of text */ - const byte* end, /*!< in: one character past end of - text */ - fts_string_t* token, /*!< out: token's text */ - ulint* offset); /*!< out: offset to token, - measured as characters from - 'start' */ + CHARSET_INFO* cs, + const byte* start, + const byte* end, + fts_string_t* token, + ulint* offset, + ulint* n_chars=NULL); /******************************************************************//** compare two character string case insensitively 
according to their charset. */ diff --git a/storage/innobase/row/row0ftsort.cc b/storage/innobase/row/row0ftsort.cc index 757e268c3a7..4609508b023 100644 --- a/storage/innobase/row/row0ftsort.cc +++ b/storage/innobase/row/row0ftsort.cc @@ -385,7 +385,6 @@ row_merge_fts_doc_tokenize( ulint data_size[FTS_NUM_AUX_INDEX]; ulint n_tuple[FTS_NUM_AUX_INDEX]; - t_str.f_n_char = 0; t_ctx->buf_used = 0; memset(n_tuple, 0, FTS_NUM_AUX_INDEX * sizeof(ulint)); @@ -400,17 +399,19 @@ row_merge_fts_doc_tokenize( ulint offset = 0; ulint cur_len; doc_id_t write_doc_id; + ulint n_chars = 0; inc = innobase_mysql_fts_get_token( doc->charset, doc->text.f_str + i, - doc->text.f_str + doc->text.f_len, &str, &offset); + doc->text.f_str + doc->text.f_len, &str, &offset, + &n_chars); ut_a(inc > 0); /* Ignore string whose character number is less than "fts_min_token_size" or more than "fts_max_token_size" */ - if (str.f_n_char < fts_min_token_size - || str.f_n_char > fts_max_token_size) { + if (n_chars < fts_min_token_size + || n_chars > fts_max_token_size) { t_ctx->processed_len += inc; continue; @@ -1083,7 +1084,6 @@ row_fts_insert_tuple( /* Get the first field for the tokenized word */ dfield = dtuple_get_nth_field(dtuple, 0); - token_word.f_n_char = 0; token_word.f_len = dfield->len; token_word.f_str = static_cast<byte*>(dfield_get_data(dfield)); diff --git a/storage/xtradb/fts/fts0ast.cc b/storage/xtradb/fts/fts0ast.cc index 4b36152cf62..df735d30c13 100644 --- a/storage/xtradb/fts/fts0ast.cc +++ b/storage/xtradb/fts/fts0ast.cc @@ -100,12 +100,13 @@ fts_ast_create_node_term( fts_string_t str; ulint offset; ulint cur_len; + ulint n_chars = 0; cur_len = innobase_mysql_fts_get_token( state->charset, reinterpret_cast<const byte*>(ptr->str) + cur_pos, reinterpret_cast<const byte*>(ptr->str) + len, - &str, &offset); + &str, &offset, &n_chars); if (cur_len == 0) { break; @@ -113,13 +114,13 @@ fts_ast_create_node_term( cur_pos += cur_len; - if (str.f_n_char > 0) { + if (n_chars > 0) { /* If the 
subsequent term (after the first one)'s size is less than fts_min_token_size or the term is greater than fts_max_token_size, we shall ignore that. This is to make consistent with MyISAM behavior */ - if ((first_node && (str.f_n_char < fts_min_token_size)) - || str.f_n_char > fts_max_token_size) { + if ((first_node && (n_chars < fts_min_token_size)) + || n_chars > fts_max_token_size) { continue; } diff --git a/storage/xtradb/fts/fts0fts.cc b/storage/xtradb/fts/fts0fts.cc index e2a479bf0ae..b13c741bd37 100644 --- a/storage/xtradb/fts/fts0fts.cc +++ b/storage/xtradb/fts/fts0fts.cc @@ -363,8 +363,6 @@ fts_load_default_stopword( stop_words = stopword_info->cached_stopword; - str.f_n_char = 0; - for (ulint i = 0; fts_default_stopword[i]; ++i) { char* word; fts_tokenizer_word_t new_word; @@ -418,7 +416,6 @@ fts_read_stopword( /* We only need to read the first column */ dfield = que_node_get_val(exp); - str.f_n_char = 0; str.f_str = static_cast<byte*>(dfield_get_data(dfield)); str.f_len = dfield_get_len(dfield); @@ -436,7 +433,6 @@ fts_read_stopword( memcpy(new_word.text.f_str, str.f_str, str.f_len); - new_word.text.f_n_char = 0; new_word.text.f_len = str.f_len; new_word.text.f_str[str.f_len] = 0; @@ -2527,7 +2523,6 @@ fts_get_max_cache_size( /* We set the length of value to the max bytes it can hold. This information is used by the callback that reads the value. */ - value.f_n_char = 0; value.f_len = FTS_MAX_CONFIG_VALUE_LEN; value.f_str = ut_malloc(value.f_len + 1); @@ -2597,7 +2592,6 @@ fts_get_total_word_count( /* We set the length of value to the max bytes it can hold. This information is used by the callback that reads the value. 
*/ - value.f_n_char = 0; value.f_len = FTS_MAX_CONFIG_VALUE_LEN; value.f_str = static_cast<byte*>(ut_malloc(value.f_len + 1)); @@ -3259,8 +3253,6 @@ fts_query_expansion_fetch_doc( exp = que_node_get_next(exp); continue; } else { - doc.text.f_n_char = 0; - doc.text.f_str = static_cast<byte*>( dfield_get_data(dfield)); @@ -4705,6 +4697,7 @@ fts_process_token( fts_string_t str; ulint offset = 0; fts_doc_t* result_doc; + ulint n_chars = 0; /* Determine where to save the result. */ result_doc = (result) ? result : doc; @@ -4712,13 +4705,14 @@ fts_process_token( /* The length of a string in characters is set here only. */ ret = innobase_mysql_fts_get_token( doc->charset, doc->text.f_str + start_pos, - doc->text.f_str + doc->text.f_len, &str, &offset); + doc->text.f_str + doc->text.f_len, &str, &offset, + &n_chars); /* Ignore string whose character number is less than "fts_min_token_size" or more than "fts_max_token_size" */ - if (str.f_n_char >= fts_min_token_size - && str.f_n_char <= fts_max_token_size) { + if (n_chars >= fts_min_token_size + && n_chars <= fts_max_token_size) { mem_heap_t* heap; fts_string_t t_str; @@ -4728,8 +4722,6 @@ fts_process_token( heap = static_cast<mem_heap_t*>(result_doc->self_heap->arg); - t_str.f_n_char = str.f_n_char; - t_str.f_len = str.f_len * doc->charset->casedn_multiply + 1; t_str.f_str = static_cast<byte*>( @@ -4757,13 +4749,12 @@ fts_process_token( new_token.text.f_len = newlen; new_token.text.f_str = t_str.f_str; - new_token.text.f_n_char = t_str.f_n_char; new_token.positions = ib_vector_create( result_doc->self_heap, sizeof(ulint), 32); - ut_a(new_token.text.f_n_char >= fts_min_token_size); - ut_a(new_token.text.f_n_char <= fts_max_token_size); + ut_a(n_chars >= fts_min_token_size); + ut_a(n_chars <= fts_max_token_size); parent.last = rbt_add_node( result_doc->tokens, &parent, &new_token); @@ -7408,7 +7399,6 @@ fts_load_stopword( if (reload) { /* Fetch the stopword table name from FTS config table */ - str.f_n_char = 0; str.f_str = 
str_buffer; str.f_len = sizeof(str_buffer) - 1; @@ -7433,7 +7423,6 @@ fts_load_stopword( /* Save the stopword table name to the configure table */ if (!reload) { - str.f_n_char = 0; str.f_str = (byte*) stopword_to_use; str.f_len = ut_strlen(stopword_to_use); diff --git a/storage/xtradb/fts/fts0que.cc b/storage/xtradb/fts/fts0que.cc index b9ad43c626a..dd1aa968bcd 100644 --- a/storage/xtradb/fts/fts0que.cc +++ b/storage/xtradb/fts/fts0que.cc @@ -583,10 +583,18 @@ fts_ranking_words_add( /* We use ib_rbt to simulate a map, f_n_char means position. */ if (rbt_search(query->word_map, &parent, word) == 0) { - fts_string_t* result_word; + CHARSET_INFO* charset = query->fts_index_table.charset; + + for (ulint i = 0; i < query->word_vector->size(); i++) { + if (!my_strcasecmp( + charset, + (const char*) query->word_vector->at(i).f_str, + (const char*) word->f_str)) { + pos = i; + break; + } + } - result_word = rbt_value(fts_string_t, parent.last); - pos = result_word->f_n_char; ut_ad(pos < rbt_size(query->word_map)); } else { /* Add the word to map. 
*/ @@ -599,7 +607,6 @@ fts_ranking_words_add( memcpy(new_word.f_str, word->f_str, word->f_len); new_word.f_str[word->f_len] = 0; new_word.f_len = word->f_len; - new_word.f_n_char = pos; rbt_add_node(query->word_map, &parent, &new_word); ut_ad(rbt_validate(query->word_map)); @@ -1740,11 +1747,12 @@ fts_proximity_is_word_in_range( ulint len; fts_string_t str; ulint offset = 0; + ulint n_chars = 0; len = innobase_mysql_fts_get_token( phrase->charset, start + cur_pos, - start + total_len, &str, &offset); + start + total_len, &str, &offset, &n_chars); if (len == 0) { break; @@ -1754,7 +1762,7 @@ fts_proximity_is_word_in_range( cur_pos += len; /* Record the number of words */ - if (str.f_n_char > 0) { + if (n_chars > 0) { n_word++; } @@ -2560,12 +2568,13 @@ fts_query_phrase_search( ulint offset; ulint cur_len; fts_string_t result_str; + ulint n_chars = 0; cur_len = innobase_mysql_fts_get_token( charset, reinterpret_cast<const byte*>(phrase->f_str) + cur_pos, reinterpret_cast<const byte*>(phrase->f_str) + len, - &result_str, &offset); + &result_str, &offset, &n_chars); if (cur_len == 0) { break; @@ -2573,7 +2582,7 @@ fts_query_phrase_search( cur_pos += cur_len; - if (result_str.f_n_char == 0) { + if (n_chars == 0) { continue; } @@ -2590,8 +2599,8 @@ fts_query_phrase_search( if (cache->stopword_info.cached_stopword && rbt_search(cache->stopword_info.cached_stopword, &parent, token) != 0 - && result_str.f_n_char >= fts_min_token_size - && result_str.f_n_char <= fts_max_token_size) { + && n_chars >= fts_min_token_size + && n_chars <= fts_max_token_size) { /* Add the word to the RB tree so that we can calculate it's frequencey within a document. 
*/ fts_query_add_word_freq(query, token); @@ -2860,7 +2869,6 @@ fts_query_visitor( DBUG_ENTER("fts_query_visitor"); DBUG_PRINT("fts", ("nodetype: %s", fts_ast_node_type_get(node->type))); - token.f_n_char = 0; query->oper = oper; query->cur_node = node; @@ -3814,10 +3822,11 @@ fts_query_str_preprocess( fts_string_t str; ulint offset; ulint cur_len; + ulint n_chars = 0; cur_len = innobase_mysql_fts_get_token( charset, str_ptr + cur_pos, str_ptr + *result_len, - &str, &offset); + &str, &offset, &n_chars); if (cur_len == 0 || str.f_str == NULL) { /* No valid word found */ diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc index 6bc4c76f88e..a369f59064d 100644 --- a/storage/xtradb/handler/ha_innodb.cc +++ b/storage/xtradb/handler/ha_innodb.cc @@ -6674,29 +6674,31 @@ innobase_fts_casedn_str( #define misc_word_char(X) 0 -/*************************************************************//** -Get the next token from the given string and store it in *token. +/** Get the next token from the given string and store it in *token. 
It is mostly copied from MyISAM's doc parsing function ft_simple_get_word() +@param[in] cs Character set +@param[in] start start of tex +@param[in] end one character past end of text +@param[out] token token's text +@param[out] offset offset to token, measured as characters from 'start' +@param[out] n_chars number of characters existed @return length of string processed */ UNIV_INTERN ulint innobase_mysql_fts_get_token( -/*=========================*/ - CHARSET_INFO* cs, /*!< in: Character set */ - const byte* start, /*!< in: start of text */ - const byte* end, /*!< in: one character past end of - text */ - fts_string_t* token, /*!< out: token's text */ - ulint* offset) /*!< out: offset to token, - measured as characters from - 'start' */ + CHARSET_INFO* cs, + const byte* start, + const byte* end, + fts_string_t* token, + ulint* offset, + ulint* n_chars) { int mbl; const uchar* doc = start; ut_a(cs); - token->f_n_char = token->f_len = 0; + token->f_len = 0; token->f_str = NULL; for (;;) { @@ -6742,7 +6744,10 @@ innobase_mysql_fts_get_token( } token->f_len = (uint) (doc - token->f_str) - mwc; - token->f_n_char = length; + + if (n_chars != NULL) { + *n_chars = length; + } return(doc - start); } diff --git a/storage/xtradb/handler/i_s.cc b/storage/xtradb/handler/i_s.cc index b864830657e..9623fc7b670 100644 --- a/storage/xtradb/handler/i_s.cc +++ b/storage/xtradb/handler/i_s.cc @@ -3261,7 +3261,6 @@ i_s_fts_index_cache_fill_one_index( fields = table->field; index_charset = index_cache->charset; - conv_str->f_n_char = 0; int ret = 0; @@ -3275,15 +3274,15 @@ i_s_fts_index_cache_fill_one_index( /* Convert word from index charset to system_charset_info */ if (index_charset->cset != system_charset_info->cset) { - conv_str->f_n_char = my_convert( + ulint n_chars = my_convert( reinterpret_cast<char*>(conv_str->f_str), static_cast<uint32>(conv_str->f_len), system_charset_info, reinterpret_cast<char*>(word->text.f_str), static_cast<uint32>(word->text.f_len), index_charset, 
&dummy_errors); - ut_ad(conv_str->f_n_char <= conv_str->f_len); - conv_str->f_str[conv_str->f_n_char] = 0; + ut_ad(n_chars <= conv_str->f_len); + conv_str->f_str[n_chars] = 0; word_str = reinterpret_cast<char*>(conv_str->f_str); } else { word_str = reinterpret_cast<char*>(word->text.f_str); @@ -3642,15 +3641,15 @@ i_s_fts_index_table_fill_one_fetch( /* Convert word from index charset to system_charset_info */ if (index_charset->cset != system_charset_info->cset) { - conv_str->f_n_char = my_convert( + ulint n_chars = my_convert( reinterpret_cast<char*>(conv_str->f_str), static_cast<uint32>(conv_str->f_len), system_charset_info, reinterpret_cast<char*>(word->text.f_str), static_cast<uint32>(word->text.f_len), index_charset, &dummy_errors); - ut_ad(conv_str->f_n_char <= conv_str->f_len); - conv_str->f_str[conv_str->f_n_char] = 0; + ut_ad(n_chars <= conv_str->f_len); + conv_str->f_str[n_chars] = 0; word_str = reinterpret_cast<char*>(conv_str->f_str); } else { word_str = reinterpret_cast<char*>(word->text.f_str); @@ -3742,7 +3741,6 @@ i_s_fts_index_table_fill_one_index( word.f_str = NULL; word.f_len = 0; - word.f_n_char = 0; index_charset = fts_index_get_charset(index); diff --git a/storage/xtradb/include/fts0fts.h b/storage/xtradb/include/fts0fts.h index ce30a17c4b4..1d43e9b1ce5 100644 --- a/storage/xtradb/include/fts0fts.h +++ b/storage/xtradb/include/fts0fts.h @@ -236,7 +236,6 @@ struct fts_string_t { byte* f_str; /*!< string, not necessary terminated in any way */ ulint f_len; /*!< Length of the string in bytes */ - ulint f_n_char; /*!< Number of characters */ }; /** Query ranked doc ids. */ @@ -915,20 +914,24 @@ innobase_fts_text_cmp_prefix( const void* p1, /*!< in: key */ const void* p2); /*!< in: node */ -/*************************************************************//** -Get the next token from the given string and store it in *token. */ +/** Get the next token from the given string and store it in *token. 
+It is mostly copied from MyISAM's doc parsing function ft_simple_get_word() +@param[in] cs Character set +@param[in] start start of tex +@param[in] end one character past end of text +@param[out] token token's text +@param[out] offset offset to token, measured as characters from 'start' +@param[out] n_chars number of characters existed +@return length of string processed */ extern ulint innobase_mysql_fts_get_token( -/*=========================*/ - CHARSET_INFO* charset, /*!< in: Character set */ - const byte* start, /*!< in: start of text */ - const byte* end, /*!< in: one character past - end of text */ - fts_string_t* token, /*!< out: token's text */ - ulint* offset); /*!< out: offset to token, - measured as characters from - 'start' */ + CHARSET_INFO* cs, + const byte* start, + const byte* end, + fts_string_t* token, + ulint* offset, + ulint* n_chars); /*********************************************************************//** Fetch COUNT(*) from specified table. diff --git a/storage/xtradb/include/fts0types.ic b/storage/xtradb/include/fts0types.ic index f0dfd023a70..964897320a3 100644 --- a/storage/xtradb/include/fts0types.ic +++ b/storage/xtradb/include/fts0types.ic @@ -52,7 +52,6 @@ fts_utf8_string_dup( dst->f_len = src->f_len; dst->f_str[src->f_len] = 0; - dst->f_n_char = src->f_n_char; } /******************************************************************//** diff --git a/storage/xtradb/include/ha_prototypes.h b/storage/xtradb/include/ha_prototypes.h index 1dfbfe7c8fb..467a61622be 100644 --- a/storage/xtradb/include/ha_prototypes.h +++ b/storage/xtradb/include/ha_prototypes.h @@ -422,20 +422,24 @@ innobase_close_thd( /*===============*/ THD* thd); /*!< in: MySQL thread handle for which to close the connection */ -/*************************************************************//** -Get the next token from the given string and store it in *token. */ + +/** Get the next token from the given string and store it in *token. 
+@param[in] cs Character set +@param[in] start start of tex +@param[in] end one character past end of text +@param[out] token token's text +@param[out] offset offset to token, measured as characters from 'start' +@param[out] n_chars number of characters existed +@return length of string processed */ UNIV_INTERN ulint innobase_mysql_fts_get_token( -/*=========================*/ - CHARSET_INFO* charset, /*!< in: Character set */ - const byte* start, /*!< in: start of text */ - const byte* end, /*!< in: one character past end of - text */ - fts_string_t* token, /*!< out: token's text */ - ulint* offset); /*!< out: offset to token, - measured as characters from - 'start' */ + CHARSET_INFO* cs, + const byte* start, + const byte* end, + fts_string_t* token, + ulint* offset, + ulint* n_chars=NULL); /******************************************************************//** compare two character string case insensitively according to their charset. */ diff --git a/storage/xtradb/row/row0ftsort.cc b/storage/xtradb/row/row0ftsort.cc index bb9821d4484..4ed38be8b6c 100644 --- a/storage/xtradb/row/row0ftsort.cc +++ b/storage/xtradb/row/row0ftsort.cc @@ -388,7 +388,6 @@ row_merge_fts_doc_tokenize( ulint data_size[FTS_NUM_AUX_INDEX]; ulint n_tuple[FTS_NUM_AUX_INDEX]; - t_str.f_n_char = 0; t_ctx->buf_used = 0; memset(n_tuple, 0, FTS_NUM_AUX_INDEX * sizeof(ulint)); @@ -403,17 +402,19 @@ row_merge_fts_doc_tokenize( ulint offset = 0; ulint cur_len; doc_id_t write_doc_id; + ulint n_chars = 0; inc = innobase_mysql_fts_get_token( doc->charset, doc->text.f_str + i, - doc->text.f_str + doc->text.f_len, &str, &offset); + doc->text.f_str + doc->text.f_len, &str, &offset, + &n_chars); ut_a(inc > 0); /* Ignore string whose character number is less than "fts_min_token_size" or more than "fts_max_token_size" */ - if (str.f_n_char < fts_min_token_size - || str.f_n_char > fts_max_token_size) { + if (n_chars < fts_min_token_size + || n_chars > fts_max_token_size) { t_ctx->processed_len += inc; 
continue; @@ -1086,7 +1087,6 @@ row_fts_insert_tuple( /* Get the first field for the tokenized word */ dfield = dtuple_get_nth_field(dtuple, 0); - token_word.f_n_char = 0; token_word.f_len = dfield->len; token_word.f_str = static_cast<byte*>(dfield_get_data(dfield)); |