summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorThirunarayanan Balathandayuthapani <thiru@mariadb.com>2018-10-16 11:38:11 +0530
committerThirunarayanan Balathandayuthapani <thiru@mariadb.com>2018-10-16 11:38:11 +0530
commit72033f367f4d6b51eea773acba1e0031f98e05b8 (patch)
tree07c28210f68b4bd708aee9ff361b47c147ad1d8d
parentfd680ef66346208811d77fa4c2c938430234f469 (diff)
downloadmariadb-git-72033f367f4d6b51eea773acba1e0031f98e05b8.tar.gz
MDEV-12547: InnoDB FULLTEXT index has too strict innodb_ft_result_cache_limit max limitbb-10.0-MDEV-12547
- Removed f_n_char from fts_string_t. Instead, InnoDB calculates the number of characters when it is needed.
-rw-r--r--storage/innobase/fts/fts0ast.cc9
-rw-r--r--storage/innobase/fts/fts0fts.cc24
-rw-r--r--storage/innobase/fts/fts0que.cc30
-rw-r--r--storage/innobase/handler/ha_innodb.cc31
-rw-r--r--storage/innobase/handler/i_s.cc14
-rw-r--r--storage/innobase/include/fts0fts.h26
-rw-r--r--storage/innobase/include/fts0types.ic1
-rw-r--r--storage/innobase/include/ha_prototypes.h27
-rw-r--r--storage/innobase/row/row0ftsort.cc10
-rw-r--r--storage/xtradb/fts/fts0ast.cc9
-rw-r--r--storage/xtradb/fts/fts0fts.cc25
-rw-r--r--storage/xtradb/fts/fts0que.cc33
-rw-r--r--storage/xtradb/handler/ha_innodb.cc31
-rw-r--r--storage/xtradb/handler/i_s.cc14
-rw-r--r--storage/xtradb/include/fts0fts.h27
-rw-r--r--storage/xtradb/include/fts0types.ic1
-rw-r--r--storage/xtradb/include/ha_prototypes.h26
-rw-r--r--storage/xtradb/row/row0ftsort.cc10
18 files changed, 181 insertions, 167 deletions
diff --git a/storage/innobase/fts/fts0ast.cc b/storage/innobase/fts/fts0ast.cc
index 4b36152cf62..df735d30c13 100644
--- a/storage/innobase/fts/fts0ast.cc
+++ b/storage/innobase/fts/fts0ast.cc
@@ -100,12 +100,13 @@ fts_ast_create_node_term(
fts_string_t str;
ulint offset;
ulint cur_len;
+ ulint n_chars = 0;
cur_len = innobase_mysql_fts_get_token(
state->charset,
reinterpret_cast<const byte*>(ptr->str) + cur_pos,
reinterpret_cast<const byte*>(ptr->str) + len,
- &str, &offset);
+ &str, &offset, &n_chars);
if (cur_len == 0) {
break;
@@ -113,13 +114,13 @@ fts_ast_create_node_term(
cur_pos += cur_len;
- if (str.f_n_char > 0) {
+ if (n_chars > 0) {
/* If the subsequent term (after the first one)'s size
is less than fts_min_token_size or the term is greater
than fts_max_token_size, we shall ignore that. This is
to make consistent with MyISAM behavior */
- if ((first_node && (str.f_n_char < fts_min_token_size))
- || str.f_n_char > fts_max_token_size) {
+ if ((first_node && (n_chars < fts_min_token_size))
+ || n_chars > fts_max_token_size) {
continue;
}
diff --git a/storage/innobase/fts/fts0fts.cc b/storage/innobase/fts/fts0fts.cc
index 4891e572741..10e4d2ee03b 100644
--- a/storage/innobase/fts/fts0fts.cc
+++ b/storage/innobase/fts/fts0fts.cc
@@ -363,8 +363,6 @@ fts_load_default_stopword(
stop_words = stopword_info->cached_stopword;
- str.f_n_char = 0;
-
for (ulint i = 0; fts_default_stopword[i]; ++i) {
char* word;
fts_tokenizer_word_t new_word;
@@ -418,7 +416,6 @@ fts_read_stopword(
/* We only need to read the first column */
dfield = que_node_get_val(exp);
- str.f_n_char = 0;
str.f_str = static_cast<byte*>(dfield_get_data(dfield));
str.f_len = dfield_get_len(dfield);
@@ -436,7 +433,6 @@ fts_read_stopword(
memcpy(new_word.text.f_str, str.f_str, str.f_len);
- new_word.text.f_n_char = 0;
new_word.text.f_len = str.f_len;
new_word.text.f_str[str.f_len] = 0;
@@ -2527,7 +2523,6 @@ fts_get_max_cache_size(
/* We set the length of value to the max bytes it can hold. This
information is used by the callback that reads the value. */
- value.f_n_char = 0;
value.f_len = FTS_MAX_CONFIG_VALUE_LEN;
value.f_str = ut_malloc(value.f_len + 1);
@@ -2597,7 +2592,6 @@ fts_get_total_word_count(
/* We set the length of value to the max bytes it can hold. This
information is used by the callback that reads the value. */
- value.f_n_char = 0;
value.f_len = FTS_MAX_CONFIG_VALUE_LEN;
value.f_str = static_cast<byte*>(ut_malloc(value.f_len + 1));
@@ -3259,8 +3253,6 @@ fts_query_expansion_fetch_doc(
exp = que_node_get_next(exp);
continue;
} else {
- doc.text.f_n_char = 0;
-
doc.text.f_str = static_cast<byte*>(
dfield_get_data(dfield));
@@ -4705,6 +4697,7 @@ fts_process_token(
fts_string_t str;
ulint offset = 0;
fts_doc_t* result_doc;
+ ulint n_chars = 0;
/* Determine where to save the result. */
result_doc = (result) ? result : doc;
@@ -4712,13 +4705,13 @@ fts_process_token(
/* The length of a string in characters is set here only. */
ret = innobase_mysql_fts_get_token(
doc->charset, doc->text.f_str + start_pos,
- doc->text.f_str + doc->text.f_len, &str, &offset);
+ doc->text.f_str + doc->text.f_len, &str, &offset, &n_chars);
/* Ignore string whose character number is less than
"fts_min_token_size" or more than "fts_max_token_size" */
- if (str.f_n_char >= fts_min_token_size
- && str.f_n_char <= fts_max_token_size) {
+ if (n_chars >= fts_min_token_size
+ && n_chars <= fts_max_token_size) {
mem_heap_t* heap;
fts_string_t t_str;
@@ -4728,8 +4721,6 @@ fts_process_token(
heap = static_cast<mem_heap_t*>(result_doc->self_heap->arg);
- t_str.f_n_char = str.f_n_char;
-
t_str.f_len = str.f_len * doc->charset->casedn_multiply + 1;
t_str.f_str = static_cast<byte*>(
@@ -4757,13 +4748,12 @@ fts_process_token(
new_token.text.f_len = newlen;
new_token.text.f_str = t_str.f_str;
- new_token.text.f_n_char = t_str.f_n_char;
new_token.positions = ib_vector_create(
result_doc->self_heap, sizeof(ulint), 32);
- ut_a(new_token.text.f_n_char >= fts_min_token_size);
- ut_a(new_token.text.f_n_char <= fts_max_token_size);
+ ut_a(n_chars >= fts_min_token_size);
+ ut_a(n_chars <= fts_max_token_size);
parent.last = rbt_add_node(
result_doc->tokens, &parent, &new_token);
@@ -7408,7 +7398,6 @@ fts_load_stopword(
if (reload) {
/* Fetch the stopword table name from FTS config
table */
- str.f_n_char = 0;
str.f_str = str_buffer;
str.f_len = sizeof(str_buffer) - 1;
@@ -7433,7 +7422,6 @@ fts_load_stopword(
/* Save the stopword table name to the configure
table */
if (!reload) {
- str.f_n_char = 0;
str.f_str = (byte*) stopword_to_use;
str.f_len = ut_strlen(stopword_to_use);
diff --git a/storage/innobase/fts/fts0que.cc b/storage/innobase/fts/fts0que.cc
index 7983181c23a..62456871dcf 100644
--- a/storage/innobase/fts/fts0que.cc
+++ b/storage/innobase/fts/fts0que.cc
@@ -583,10 +583,18 @@ fts_ranking_words_add(
/* We use ib_rbt to simulate a map, f_n_char means position. */
if (rbt_search(query->word_map, &parent, word) == 0) {
- fts_string_t* result_word;
+ CHARSET_INFO* charset = query->fts_index_table.charset;
+
+ for (ulint i = 0; i < query->word_vector->size(); i++) {
+ if (!my_strcasecmp(
+ charset,
+ (const char*) query->word_vector->at(i).f_str,
+ (const char*) word->f_str)) {
+ pos = i;
+ break;
+ }
+ }
- result_word = rbt_value(fts_string_t, parent.last);
- pos = result_word->f_n_char;
ut_ad(pos < rbt_size(query->word_map));
} else {
/* Add the word to map. */
@@ -599,7 +607,6 @@ fts_ranking_words_add(
memcpy(new_word.f_str, word->f_str, word->f_len);
new_word.f_str[word->f_len] = 0;
new_word.f_len = word->f_len;
- new_word.f_n_char = pos;
rbt_add_node(query->word_map, &parent, &new_word);
ut_ad(rbt_validate(query->word_map));
@@ -1720,11 +1727,12 @@ fts_proximity_is_word_in_range(
ulint len;
fts_string_t str;
ulint offset = 0;
+ ulint n_chars = 0;
len = innobase_mysql_fts_get_token(
phrase->charset,
start + cur_pos,
- start + total_len, &str, &offset);
+ start + total_len, &str, &offset, &n_chars);
if (len == 0) {
break;
@@ -1734,7 +1742,7 @@ fts_proximity_is_word_in_range(
cur_pos += len;
/* Record the number of words */
- if (str.f_n_char > 0) {
+ if (n_chars > 0) {
n_word++;
}
@@ -2540,12 +2548,13 @@ fts_query_phrase_search(
ulint offset;
ulint cur_len;
fts_string_t result_str;
+ ulint n_chars = 0;
cur_len = innobase_mysql_fts_get_token(
charset,
reinterpret_cast<const byte*>(phrase->f_str) + cur_pos,
reinterpret_cast<const byte*>(phrase->f_str) + len,
- &result_str, &offset);
+ &result_str, &offset, &n_chars);
if (cur_len == 0) {
break;
@@ -2553,7 +2562,7 @@ fts_query_phrase_search(
cur_pos += cur_len;
- if (result_str.f_n_char == 0) {
+ if (n_chars == 0) {
continue;
}
@@ -2570,8 +2579,8 @@ fts_query_phrase_search(
if (cache->stopword_info.cached_stopword
&& rbt_search(cache->stopword_info.cached_stopword,
&parent, token) != 0
- && result_str.f_n_char >= fts_min_token_size
- && result_str.f_n_char <= fts_max_token_size) {
+ && n_chars >= fts_min_token_size
+ && n_chars <= fts_max_token_size) {
/* Add the word to the RB tree so that we can
calculate it's frequencey within a document. */
fts_query_add_word_freq(query, token);
@@ -2840,7 +2849,6 @@ fts_query_visitor(
DBUG_ENTER("fts_query_visitor");
DBUG_PRINT("fts", ("nodetype: %s", fts_ast_node_type_get(node->type)));
- token.f_n_char = 0;
query->oper = oper;
query->cur_node = node;
diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
index 2092cd113a5..e79e3516804 100644
--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@@ -5986,29 +5986,31 @@ innobase_fts_casedn_str(
#define misc_word_char(X) 0
-/*************************************************************//**
-Get the next token from the given string and store it in *token.
+/** Get the next token from the given string and store it in *token.
It is mostly copied from MyISAM's doc parsing function ft_simple_get_word()
+@param[in] cs Character set
+@param[in] start start of text
+@param[in] end one character past end of text
+@param[out] token token's text
+@param[out] offset offset to token, measured as characters from 'start'
+@param[out] n_chars number of characters in the token
@return length of string processed */
UNIV_INTERN
ulint
innobase_mysql_fts_get_token(
-/*=========================*/
- CHARSET_INFO* cs, /*!< in: Character set */
- const byte* start, /*!< in: start of text */
- const byte* end, /*!< in: one character past end of
- text */
- fts_string_t* token, /*!< out: token's text */
- ulint* offset) /*!< out: offset to token,
- measured as characters from
- 'start' */
+ CHARSET_INFO* cs,
+ const byte* start,
+ const byte* end,
+ fts_string_t* token,
+ ulint* offset,
+ ulint* n_chars)
{
int mbl;
const uchar* doc = start;
ut_a(cs);
- token->f_n_char = token->f_len = 0;
+ token->f_len = 0;
token->f_str = NULL;
for (;;) {
@@ -6054,7 +6056,10 @@ innobase_mysql_fts_get_token(
}
token->f_len = (uint) (doc - token->f_str) - mwc;
- token->f_n_char = length;
+
+ if (n_chars != NULL) {
+ *n_chars = length;
+ }
return(doc - start);
}
diff --git a/storage/innobase/handler/i_s.cc b/storage/innobase/handler/i_s.cc
index de164e42273..461eb0596d0 100644
--- a/storage/innobase/handler/i_s.cc
+++ b/storage/innobase/handler/i_s.cc
@@ -3268,7 +3268,6 @@ i_s_fts_index_cache_fill_one_index(
fields = table->field;
index_charset = index_cache->charset;
- conv_str->f_n_char = 0;
int ret = 0;
@@ -3282,15 +3281,15 @@ i_s_fts_index_cache_fill_one_index(
/* Convert word from index charset to system_charset_info */
if (index_charset->cset != system_charset_info->cset) {
- conv_str->f_n_char = my_convert(
+ ulint n_chars = my_convert(
reinterpret_cast<char*>(conv_str->f_str),
static_cast<uint32>(conv_str->f_len),
system_charset_info,
reinterpret_cast<char*>(word->text.f_str),
static_cast<uint32>(word->text.f_len),
index_charset, &dummy_errors);
- ut_ad(conv_str->f_n_char <= conv_str->f_len);
- conv_str->f_str[conv_str->f_n_char] = 0;
+ ut_ad(n_chars <= conv_str->f_len);
+ conv_str->f_str[n_chars] = 0;
word_str = reinterpret_cast<char*>(conv_str->f_str);
} else {
word_str = reinterpret_cast<char*>(word->text.f_str);
@@ -3650,15 +3649,15 @@ i_s_fts_index_table_fill_one_fetch(
/* Convert word from index charset to system_charset_info */
if (index_charset->cset != system_charset_info->cset) {
- conv_str->f_n_char = my_convert(
+ ulint n_chars = my_convert(
reinterpret_cast<char*>(conv_str->f_str),
static_cast<uint32>(conv_str->f_len),
system_charset_info,
reinterpret_cast<char*>(word->text.f_str),
static_cast<uint32>(word->text.f_len),
index_charset, &dummy_errors);
- ut_ad(conv_str->f_n_char <= conv_str->f_len);
- conv_str->f_str[conv_str->f_n_char] = 0;
+ ut_ad(n_chars <= conv_str->f_len);
+ conv_str->f_str[n_chars] = 0;
word_str = reinterpret_cast<char*>(conv_str->f_str);
} else {
word_str = reinterpret_cast<char*>(word->text.f_str);
@@ -3750,7 +3749,6 @@ i_s_fts_index_table_fill_one_index(
word.f_str = NULL;
word.f_len = 0;
- word.f_n_char = 0;
index_charset = fts_index_get_charset(index);
diff --git a/storage/innobase/include/fts0fts.h b/storage/innobase/include/fts0fts.h
index ce30a17c4b4..987db972fc3 100644
--- a/storage/innobase/include/fts0fts.h
+++ b/storage/innobase/include/fts0fts.h
@@ -236,7 +236,6 @@ struct fts_string_t {
byte* f_str; /*!< string, not necessary terminated in
any way */
ulint f_len; /*!< Length of the string in bytes */
- ulint f_n_char; /*!< Number of characters */
};
/** Query ranked doc ids. */
@@ -915,20 +914,23 @@ innobase_fts_text_cmp_prefix(
const void* p1, /*!< in: key */
const void* p2); /*!< in: node */
-/*************************************************************//**
-Get the next token from the given string and store it in *token. */
+/** Get the next token from the given string and store it in *token.
+@param[in] cs Character set
+@param[in] start start of text
+@param[in] end one character past end of text
+@param[out] token token's text
+@param[out] offset offset to token, measured as characters from 'start'
+@param[out] n_chars number of characters in the token
+@return length of string processed */
extern
ulint
innobase_mysql_fts_get_token(
-/*=========================*/
- CHARSET_INFO* charset, /*!< in: Character set */
- const byte* start, /*!< in: start of text */
- const byte* end, /*!< in: one character past
- end of text */
- fts_string_t* token, /*!< out: token's text */
- ulint* offset); /*!< out: offset to token,
- measured as characters from
- 'start' */
+ CHARSET_INFO* cs,
+ const byte* start,
+ const byte* end,
+ fts_string_t* token,
+ ulint* offset,
+ ulint* n_chars);
/*********************************************************************//**
Fetch COUNT(*) from specified table.
diff --git a/storage/innobase/include/fts0types.ic b/storage/innobase/include/fts0types.ic
index f0dfd023a70..964897320a3 100644
--- a/storage/innobase/include/fts0types.ic
+++ b/storage/innobase/include/fts0types.ic
@@ -52,7 +52,6 @@ fts_utf8_string_dup(
dst->f_len = src->f_len;
dst->f_str[src->f_len] = 0;
- dst->f_n_char = src->f_n_char;
}
/******************************************************************//**
diff --git a/storage/innobase/include/ha_prototypes.h b/storage/innobase/include/ha_prototypes.h
index 488ed0257a7..dfba4d56485 100644
--- a/storage/innobase/include/ha_prototypes.h
+++ b/storage/innobase/include/ha_prototypes.h
@@ -386,20 +386,25 @@ innobase_close_thd(
/*===============*/
THD* thd); /*!< in: MySQL thread handle for
which to close the connection */
-/*************************************************************//**
-Get the next token from the given string and store it in *token. */
+
+/** Get the next token from the given string and store it in *token.
+It is mostly copied from MyISAM's doc parsing function ft_simple_get_word()
+@param[in] cs Character set
+@param[in] start start of text
+@param[in] end one character past end of text
+@param[out] token token's text
+@param[out] offset offset to token, measured as characters from 'start'
+@param[out] n_chars number of characters in the token
+@return length of string processed */
UNIV_INTERN
ulint
innobase_mysql_fts_get_token(
-/*=========================*/
- CHARSET_INFO* charset, /*!< in: Character set */
- const byte* start, /*!< in: start of text */
- const byte* end, /*!< in: one character past end of
- text */
- fts_string_t* token, /*!< out: token's text */
- ulint* offset); /*!< out: offset to token,
- measured as characters from
- 'start' */
+ CHARSET_INFO* cs,
+ const byte* start,
+ const byte* end,
+ fts_string_t* token,
+ ulint* offset,
+ ulint* n_chars=NULL);
/******************************************************************//**
compare two character string case insensitively according to their charset. */
diff --git a/storage/innobase/row/row0ftsort.cc b/storage/innobase/row/row0ftsort.cc
index 757e268c3a7..4609508b023 100644
--- a/storage/innobase/row/row0ftsort.cc
+++ b/storage/innobase/row/row0ftsort.cc
@@ -385,7 +385,6 @@ row_merge_fts_doc_tokenize(
ulint data_size[FTS_NUM_AUX_INDEX];
ulint n_tuple[FTS_NUM_AUX_INDEX];
- t_str.f_n_char = 0;
t_ctx->buf_used = 0;
memset(n_tuple, 0, FTS_NUM_AUX_INDEX * sizeof(ulint));
@@ -400,17 +399,19 @@ row_merge_fts_doc_tokenize(
ulint offset = 0;
ulint cur_len;
doc_id_t write_doc_id;
+ ulint n_chars = 0;
inc = innobase_mysql_fts_get_token(
doc->charset, doc->text.f_str + i,
- doc->text.f_str + doc->text.f_len, &str, &offset);
+ doc->text.f_str + doc->text.f_len, &str, &offset,
+ &n_chars);
ut_a(inc > 0);
/* Ignore string whose character number is less than
"fts_min_token_size" or more than "fts_max_token_size" */
- if (str.f_n_char < fts_min_token_size
- || str.f_n_char > fts_max_token_size) {
+ if (n_chars < fts_min_token_size
+ || n_chars > fts_max_token_size) {
t_ctx->processed_len += inc;
continue;
@@ -1083,7 +1084,6 @@ row_fts_insert_tuple(
/* Get the first field for the tokenized word */
dfield = dtuple_get_nth_field(dtuple, 0);
- token_word.f_n_char = 0;
token_word.f_len = dfield->len;
token_word.f_str = static_cast<byte*>(dfield_get_data(dfield));
diff --git a/storage/xtradb/fts/fts0ast.cc b/storage/xtradb/fts/fts0ast.cc
index 4b36152cf62..df735d30c13 100644
--- a/storage/xtradb/fts/fts0ast.cc
+++ b/storage/xtradb/fts/fts0ast.cc
@@ -100,12 +100,13 @@ fts_ast_create_node_term(
fts_string_t str;
ulint offset;
ulint cur_len;
+ ulint n_chars = 0;
cur_len = innobase_mysql_fts_get_token(
state->charset,
reinterpret_cast<const byte*>(ptr->str) + cur_pos,
reinterpret_cast<const byte*>(ptr->str) + len,
- &str, &offset);
+ &str, &offset, &n_chars);
if (cur_len == 0) {
break;
@@ -113,13 +114,13 @@ fts_ast_create_node_term(
cur_pos += cur_len;
- if (str.f_n_char > 0) {
+ if (n_chars > 0) {
/* If the subsequent term (after the first one)'s size
is less than fts_min_token_size or the term is greater
than fts_max_token_size, we shall ignore that. This is
to make consistent with MyISAM behavior */
- if ((first_node && (str.f_n_char < fts_min_token_size))
- || str.f_n_char > fts_max_token_size) {
+ if ((first_node && (n_chars < fts_min_token_size))
+ || n_chars > fts_max_token_size) {
continue;
}
diff --git a/storage/xtradb/fts/fts0fts.cc b/storage/xtradb/fts/fts0fts.cc
index e2a479bf0ae..b13c741bd37 100644
--- a/storage/xtradb/fts/fts0fts.cc
+++ b/storage/xtradb/fts/fts0fts.cc
@@ -363,8 +363,6 @@ fts_load_default_stopword(
stop_words = stopword_info->cached_stopword;
- str.f_n_char = 0;
-
for (ulint i = 0; fts_default_stopword[i]; ++i) {
char* word;
fts_tokenizer_word_t new_word;
@@ -418,7 +416,6 @@ fts_read_stopword(
/* We only need to read the first column */
dfield = que_node_get_val(exp);
- str.f_n_char = 0;
str.f_str = static_cast<byte*>(dfield_get_data(dfield));
str.f_len = dfield_get_len(dfield);
@@ -436,7 +433,6 @@ fts_read_stopword(
memcpy(new_word.text.f_str, str.f_str, str.f_len);
- new_word.text.f_n_char = 0;
new_word.text.f_len = str.f_len;
new_word.text.f_str[str.f_len] = 0;
@@ -2527,7 +2523,6 @@ fts_get_max_cache_size(
/* We set the length of value to the max bytes it can hold. This
information is used by the callback that reads the value. */
- value.f_n_char = 0;
value.f_len = FTS_MAX_CONFIG_VALUE_LEN;
value.f_str = ut_malloc(value.f_len + 1);
@@ -2597,7 +2592,6 @@ fts_get_total_word_count(
/* We set the length of value to the max bytes it can hold. This
information is used by the callback that reads the value. */
- value.f_n_char = 0;
value.f_len = FTS_MAX_CONFIG_VALUE_LEN;
value.f_str = static_cast<byte*>(ut_malloc(value.f_len + 1));
@@ -3259,8 +3253,6 @@ fts_query_expansion_fetch_doc(
exp = que_node_get_next(exp);
continue;
} else {
- doc.text.f_n_char = 0;
-
doc.text.f_str = static_cast<byte*>(
dfield_get_data(dfield));
@@ -4705,6 +4697,7 @@ fts_process_token(
fts_string_t str;
ulint offset = 0;
fts_doc_t* result_doc;
+ ulint n_chars = 0;
/* Determine where to save the result. */
result_doc = (result) ? result : doc;
@@ -4712,13 +4705,14 @@ fts_process_token(
/* The length of a string in characters is set here only. */
ret = innobase_mysql_fts_get_token(
doc->charset, doc->text.f_str + start_pos,
- doc->text.f_str + doc->text.f_len, &str, &offset);
+ doc->text.f_str + doc->text.f_len, &str, &offset,
+ &n_chars);
/* Ignore string whose character number is less than
"fts_min_token_size" or more than "fts_max_token_size" */
- if (str.f_n_char >= fts_min_token_size
- && str.f_n_char <= fts_max_token_size) {
+ if (n_chars >= fts_min_token_size
+ && n_chars <= fts_max_token_size) {
mem_heap_t* heap;
fts_string_t t_str;
@@ -4728,8 +4722,6 @@ fts_process_token(
heap = static_cast<mem_heap_t*>(result_doc->self_heap->arg);
- t_str.f_n_char = str.f_n_char;
-
t_str.f_len = str.f_len * doc->charset->casedn_multiply + 1;
t_str.f_str = static_cast<byte*>(
@@ -4757,13 +4749,12 @@ fts_process_token(
new_token.text.f_len = newlen;
new_token.text.f_str = t_str.f_str;
- new_token.text.f_n_char = t_str.f_n_char;
new_token.positions = ib_vector_create(
result_doc->self_heap, sizeof(ulint), 32);
- ut_a(new_token.text.f_n_char >= fts_min_token_size);
- ut_a(new_token.text.f_n_char <= fts_max_token_size);
+ ut_a(n_chars >= fts_min_token_size);
+ ut_a(n_chars <= fts_max_token_size);
parent.last = rbt_add_node(
result_doc->tokens, &parent, &new_token);
@@ -7408,7 +7399,6 @@ fts_load_stopword(
if (reload) {
/* Fetch the stopword table name from FTS config
table */
- str.f_n_char = 0;
str.f_str = str_buffer;
str.f_len = sizeof(str_buffer) - 1;
@@ -7433,7 +7423,6 @@ fts_load_stopword(
/* Save the stopword table name to the configure
table */
if (!reload) {
- str.f_n_char = 0;
str.f_str = (byte*) stopword_to_use;
str.f_len = ut_strlen(stopword_to_use);
diff --git a/storage/xtradb/fts/fts0que.cc b/storage/xtradb/fts/fts0que.cc
index b9ad43c626a..dd1aa968bcd 100644
--- a/storage/xtradb/fts/fts0que.cc
+++ b/storage/xtradb/fts/fts0que.cc
@@ -583,10 +583,18 @@ fts_ranking_words_add(
/* We use ib_rbt to simulate a map, f_n_char means position. */
if (rbt_search(query->word_map, &parent, word) == 0) {
- fts_string_t* result_word;
+ CHARSET_INFO* charset = query->fts_index_table.charset;
+
+ for (ulint i = 0; i < query->word_vector->size(); i++) {
+ if (!my_strcasecmp(
+ charset,
+ (const char*) query->word_vector->at(i).f_str,
+ (const char*) word->f_str)) {
+ pos = i;
+ break;
+ }
+ }
- result_word = rbt_value(fts_string_t, parent.last);
- pos = result_word->f_n_char;
ut_ad(pos < rbt_size(query->word_map));
} else {
/* Add the word to map. */
@@ -599,7 +607,6 @@ fts_ranking_words_add(
memcpy(new_word.f_str, word->f_str, word->f_len);
new_word.f_str[word->f_len] = 0;
new_word.f_len = word->f_len;
- new_word.f_n_char = pos;
rbt_add_node(query->word_map, &parent, &new_word);
ut_ad(rbt_validate(query->word_map));
@@ -1740,11 +1747,12 @@ fts_proximity_is_word_in_range(
ulint len;
fts_string_t str;
ulint offset = 0;
+ ulint n_chars = 0;
len = innobase_mysql_fts_get_token(
phrase->charset,
start + cur_pos,
- start + total_len, &str, &offset);
+ start + total_len, &str, &offset, &n_chars);
if (len == 0) {
break;
@@ -1754,7 +1762,7 @@ fts_proximity_is_word_in_range(
cur_pos += len;
/* Record the number of words */
- if (str.f_n_char > 0) {
+ if (n_chars > 0) {
n_word++;
}
@@ -2560,12 +2568,13 @@ fts_query_phrase_search(
ulint offset;
ulint cur_len;
fts_string_t result_str;
+ ulint n_chars = 0;
cur_len = innobase_mysql_fts_get_token(
charset,
reinterpret_cast<const byte*>(phrase->f_str) + cur_pos,
reinterpret_cast<const byte*>(phrase->f_str) + len,
- &result_str, &offset);
+ &result_str, &offset, &n_chars);
if (cur_len == 0) {
break;
@@ -2573,7 +2582,7 @@ fts_query_phrase_search(
cur_pos += cur_len;
- if (result_str.f_n_char == 0) {
+ if (n_chars == 0) {
continue;
}
@@ -2590,8 +2599,8 @@ fts_query_phrase_search(
if (cache->stopword_info.cached_stopword
&& rbt_search(cache->stopword_info.cached_stopword,
&parent, token) != 0
- && result_str.f_n_char >= fts_min_token_size
- && result_str.f_n_char <= fts_max_token_size) {
+ && n_chars >= fts_min_token_size
+ && n_chars <= fts_max_token_size) {
/* Add the word to the RB tree so that we can
calculate it's frequencey within a document. */
fts_query_add_word_freq(query, token);
@@ -2860,7 +2869,6 @@ fts_query_visitor(
DBUG_ENTER("fts_query_visitor");
DBUG_PRINT("fts", ("nodetype: %s", fts_ast_node_type_get(node->type)));
- token.f_n_char = 0;
query->oper = oper;
query->cur_node = node;
@@ -3814,10 +3822,11 @@ fts_query_str_preprocess(
fts_string_t str;
ulint offset;
ulint cur_len;
+ ulint n_chars = 0;
cur_len = innobase_mysql_fts_get_token(
charset, str_ptr + cur_pos, str_ptr + *result_len,
- &str, &offset);
+ &str, &offset, &n_chars);
if (cur_len == 0 || str.f_str == NULL) {
/* No valid word found */
diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc
index 6bc4c76f88e..a369f59064d 100644
--- a/storage/xtradb/handler/ha_innodb.cc
+++ b/storage/xtradb/handler/ha_innodb.cc
@@ -6674,29 +6674,31 @@ innobase_fts_casedn_str(
#define misc_word_char(X) 0
-/*************************************************************//**
-Get the next token from the given string and store it in *token.
+/** Get the next token from the given string and store it in *token.
It is mostly copied from MyISAM's doc parsing function ft_simple_get_word()
+@param[in] cs Character set
+@param[in] start start of text
+@param[in] end one character past end of text
+@param[out] token token's text
+@param[out] offset offset to token, measured as characters from 'start'
+@param[out] n_chars number of characters in the token
@return length of string processed */
UNIV_INTERN
ulint
innobase_mysql_fts_get_token(
-/*=========================*/
- CHARSET_INFO* cs, /*!< in: Character set */
- const byte* start, /*!< in: start of text */
- const byte* end, /*!< in: one character past end of
- text */
- fts_string_t* token, /*!< out: token's text */
- ulint* offset) /*!< out: offset to token,
- measured as characters from
- 'start' */
+ CHARSET_INFO* cs,
+ const byte* start,
+ const byte* end,
+ fts_string_t* token,
+ ulint* offset,
+ ulint* n_chars)
{
int mbl;
const uchar* doc = start;
ut_a(cs);
- token->f_n_char = token->f_len = 0;
+ token->f_len = 0;
token->f_str = NULL;
for (;;) {
@@ -6742,7 +6744,10 @@ innobase_mysql_fts_get_token(
}
token->f_len = (uint) (doc - token->f_str) - mwc;
- token->f_n_char = length;
+
+ if (n_chars != NULL) {
+ *n_chars = length;
+ }
return(doc - start);
}
diff --git a/storage/xtradb/handler/i_s.cc b/storage/xtradb/handler/i_s.cc
index b864830657e..9623fc7b670 100644
--- a/storage/xtradb/handler/i_s.cc
+++ b/storage/xtradb/handler/i_s.cc
@@ -3261,7 +3261,6 @@ i_s_fts_index_cache_fill_one_index(
fields = table->field;
index_charset = index_cache->charset;
- conv_str->f_n_char = 0;
int ret = 0;
@@ -3275,15 +3274,15 @@ i_s_fts_index_cache_fill_one_index(
/* Convert word from index charset to system_charset_info */
if (index_charset->cset != system_charset_info->cset) {
- conv_str->f_n_char = my_convert(
+ ulint n_chars = my_convert(
reinterpret_cast<char*>(conv_str->f_str),
static_cast<uint32>(conv_str->f_len),
system_charset_info,
reinterpret_cast<char*>(word->text.f_str),
static_cast<uint32>(word->text.f_len),
index_charset, &dummy_errors);
- ut_ad(conv_str->f_n_char <= conv_str->f_len);
- conv_str->f_str[conv_str->f_n_char] = 0;
+ ut_ad(n_chars <= conv_str->f_len);
+ conv_str->f_str[n_chars] = 0;
word_str = reinterpret_cast<char*>(conv_str->f_str);
} else {
word_str = reinterpret_cast<char*>(word->text.f_str);
@@ -3642,15 +3641,15 @@ i_s_fts_index_table_fill_one_fetch(
/* Convert word from index charset to system_charset_info */
if (index_charset->cset != system_charset_info->cset) {
- conv_str->f_n_char = my_convert(
+ ulint n_chars = my_convert(
reinterpret_cast<char*>(conv_str->f_str),
static_cast<uint32>(conv_str->f_len),
system_charset_info,
reinterpret_cast<char*>(word->text.f_str),
static_cast<uint32>(word->text.f_len),
index_charset, &dummy_errors);
- ut_ad(conv_str->f_n_char <= conv_str->f_len);
- conv_str->f_str[conv_str->f_n_char] = 0;
+ ut_ad(n_chars <= conv_str->f_len);
+ conv_str->f_str[n_chars] = 0;
word_str = reinterpret_cast<char*>(conv_str->f_str);
} else {
word_str = reinterpret_cast<char*>(word->text.f_str);
@@ -3742,7 +3741,6 @@ i_s_fts_index_table_fill_one_index(
word.f_str = NULL;
word.f_len = 0;
- word.f_n_char = 0;
index_charset = fts_index_get_charset(index);
diff --git a/storage/xtradb/include/fts0fts.h b/storage/xtradb/include/fts0fts.h
index ce30a17c4b4..1d43e9b1ce5 100644
--- a/storage/xtradb/include/fts0fts.h
+++ b/storage/xtradb/include/fts0fts.h
@@ -236,7 +236,6 @@ struct fts_string_t {
byte* f_str; /*!< string, not necessary terminated in
any way */
ulint f_len; /*!< Length of the string in bytes */
- ulint f_n_char; /*!< Number of characters */
};
/** Query ranked doc ids. */
@@ -915,20 +914,24 @@ innobase_fts_text_cmp_prefix(
const void* p1, /*!< in: key */
const void* p2); /*!< in: node */
-/*************************************************************//**
-Get the next token from the given string and store it in *token. */
+/** Get the next token from the given string and store it in *token.
+It is mostly copied from MyISAM's doc parsing function ft_simple_get_word()
+@param[in] cs Character set
+@param[in] start start of text
+@param[in] end one character past end of text
+@param[out] token token's text
+@param[out] offset offset to token, measured as characters from 'start'
+@param[out] n_chars number of characters in the token
+@return length of string processed */
extern
ulint
innobase_mysql_fts_get_token(
-/*=========================*/
- CHARSET_INFO* charset, /*!< in: Character set */
- const byte* start, /*!< in: start of text */
- const byte* end, /*!< in: one character past
- end of text */
- fts_string_t* token, /*!< out: token's text */
- ulint* offset); /*!< out: offset to token,
- measured as characters from
- 'start' */
+ CHARSET_INFO* cs,
+ const byte* start,
+ const byte* end,
+ fts_string_t* token,
+ ulint* offset,
+ ulint* n_chars);
/*********************************************************************//**
Fetch COUNT(*) from specified table.
diff --git a/storage/xtradb/include/fts0types.ic b/storage/xtradb/include/fts0types.ic
index f0dfd023a70..964897320a3 100644
--- a/storage/xtradb/include/fts0types.ic
+++ b/storage/xtradb/include/fts0types.ic
@@ -52,7 +52,6 @@ fts_utf8_string_dup(
dst->f_len = src->f_len;
dst->f_str[src->f_len] = 0;
- dst->f_n_char = src->f_n_char;
}
/******************************************************************//**
diff --git a/storage/xtradb/include/ha_prototypes.h b/storage/xtradb/include/ha_prototypes.h
index 1dfbfe7c8fb..467a61622be 100644
--- a/storage/xtradb/include/ha_prototypes.h
+++ b/storage/xtradb/include/ha_prototypes.h
@@ -422,20 +422,24 @@ innobase_close_thd(
/*===============*/
THD* thd); /*!< in: MySQL thread handle for
which to close the connection */
-/*************************************************************//**
-Get the next token from the given string and store it in *token. */
+
+/** Get the next token from the given string and store it in *token.
+@param[in] cs Character set
+@param[in] start start of text
+@param[in] end one character past end of text
+@param[out] token token's text
+@param[out] offset offset to token, measured as characters from 'start'
+@param[out] n_chars number of characters in the token
+@return length of string processed */
UNIV_INTERN
ulint
innobase_mysql_fts_get_token(
-/*=========================*/
- CHARSET_INFO* charset, /*!< in: Character set */
- const byte* start, /*!< in: start of text */
- const byte* end, /*!< in: one character past end of
- text */
- fts_string_t* token, /*!< out: token's text */
- ulint* offset); /*!< out: offset to token,
- measured as characters from
- 'start' */
+ CHARSET_INFO* cs,
+ const byte* start,
+ const byte* end,
+ fts_string_t* token,
+ ulint* offset,
+ ulint* n_chars=NULL);
/******************************************************************//**
compare two character string case insensitively according to their charset. */
diff --git a/storage/xtradb/row/row0ftsort.cc b/storage/xtradb/row/row0ftsort.cc
index bb9821d4484..4ed38be8b6c 100644
--- a/storage/xtradb/row/row0ftsort.cc
+++ b/storage/xtradb/row/row0ftsort.cc
@@ -388,7 +388,6 @@ row_merge_fts_doc_tokenize(
ulint data_size[FTS_NUM_AUX_INDEX];
ulint n_tuple[FTS_NUM_AUX_INDEX];
- t_str.f_n_char = 0;
t_ctx->buf_used = 0;
memset(n_tuple, 0, FTS_NUM_AUX_INDEX * sizeof(ulint));
@@ -403,17 +402,19 @@ row_merge_fts_doc_tokenize(
ulint offset = 0;
ulint cur_len;
doc_id_t write_doc_id;
+ ulint n_chars = 0;
inc = innobase_mysql_fts_get_token(
doc->charset, doc->text.f_str + i,
- doc->text.f_str + doc->text.f_len, &str, &offset);
+ doc->text.f_str + doc->text.f_len, &str, &offset,
+ &n_chars);
ut_a(inc > 0);
/* Ignore string whose character number is less than
"fts_min_token_size" or more than "fts_max_token_size" */
- if (str.f_n_char < fts_min_token_size
- || str.f_n_char > fts_max_token_size) {
+ if (n_chars < fts_min_token_size
+ || n_chars > fts_max_token_size) {
t_ctx->processed_len += inc;
continue;
@@ -1086,7 +1087,6 @@ row_fts_insert_tuple(
/* Get the first field for the tokenized word */
dfield = dtuple_get_nth_field(dtuple, 0);
- token_word.f_n_char = 0;
token_word.f_len = dfield->len;
token_word.f_str = static_cast<byte*>(dfield_get_data(dfield));