summaryrefslogtreecommitdiff
path: root/storage/innobase/fts/fts0que.cc
diff options
context:
space:
mode:
authorSergei Golubchik <sergii@pisem.net>2014-02-01 09:33:26 +0100
committerSergei Golubchik <sergii@pisem.net>2014-02-01 09:33:26 +0100
commit27d45e46968e4bd623565688a997b83b0a5cc1a8 (patch)
tree8df9c87f8fd3855d81e3ae46078d34609668e63a /storage/innobase/fts/fts0que.cc
parent27fbb637d36324992b270f0dc0472807ffa4ebc2 (diff)
downloadmariadb-git-27d45e46968e4bd623565688a997b83b0a5cc1a8.tar.gz
MDEV-5574 Set AUTO_INCREMENT below max value of column.
Update InnoDB to 5.6.14 Apply MySQL-5.6 hack for MySQL Bug#16434374 Move Aria-only HA_RTREE_INDEX from my_base.h to maria_def.h (breaks an assert in InnoDB) Fix InnoDB memory leak
Diffstat (limited to 'storage/innobase/fts/fts0que.cc')
-rw-r--r--storage/innobase/fts/fts0que.cc1039
1 files changed, 748 insertions, 291 deletions
diff --git a/storage/innobase/fts/fts0que.cc b/storage/innobase/fts/fts0que.cc
index 5c757b4f176..72901d193eb 100644
--- a/storage/innobase/fts/fts0que.cc
+++ b/storage/innobase/fts/fts0que.cc
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 2007, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2007, 2013, Oracle and/or its affiliates. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -40,6 +40,10 @@ Completed 2011/7/10 Sunny and Jimmy Yang
#include "fts0vlc.ic"
#endif
+#include <string>
+#include <vector>
+#include <map>
+
#define FTS_ELEM(t, n, i, j) (t[(i) * n + (j)])
#define RANK_DOWNGRADE (-1.0F)
@@ -50,13 +54,20 @@ FIXME, this limitation can be removed easily. Need to see
if we want to enforce such limitation */
#define MAX_PROXIMITY_ITEM 128
+/* Memory used by rbt itself for create and node add */
+#define SIZEOF_RBT_CREATE sizeof(ib_rbt_t) + sizeof(ib_rbt_node_t) * 2
+#define SIZEOF_RBT_NODE_ADD sizeof(ib_rbt_node_t)
+
+/*Initial byte length for 'words' in fts_ranking_t */
+#define RANKING_WORDS_INIT_LEN 4
+
/* Coeffecient to use for normalize relevance ranking. */
static const double FTS_NORMALIZE_COEFF = 0.0115F;
// FIXME: Need to have a generic iterator that traverses the ilist.
-/* For parsing the search phrase */
-static const char* FTS_PHRASE_DELIMITER = "\t ";
+typedef std::map<std::string, ulint> word_map_t;
+typedef std::vector<std::string> word_vector_t;
struct fts_word_freq_t;
@@ -72,6 +83,8 @@ struct fts_query_t {
fts_table_t fts_index_table;/*!< FTS auxiliary index table def */
+ ulint total_size; /*!< total memory size used by query */
+
fts_doc_ids_t* deleted; /*!< Deleted doc ids that need to be
filtered from the output */
@@ -79,6 +92,12 @@ struct fts_query_t {
fts_ast_node_t* cur_node; /*!< Current tree node */
+ word_map_t* word_map; /*!< Matched word map for
+ searching by word*/
+
+ word_vector_t* word_vector; /*!< Matched word vector for
+ searching by index */
+
ib_rbt_t* doc_ids; /*!< The current set of matching
doc ids, elements are of
type fts_ranking_t */
@@ -113,7 +132,7 @@ struct fts_query_t {
doc_id_t upper_doc_id; /*!< Highest doc id in doc_ids */
- ibool boolean_mode; /*!< TRUE if boolean mode query */
+ bool boolean_mode; /*!< TRUE if boolean mode query */
ib_vector_t* matched; /*!< Array of matching documents
(fts_match_t) to search for a phrase */
@@ -133,9 +152,7 @@ struct fts_query_t {
document, its elements are of type
fts_word_freq_t */
- ibool inited; /*!< Flag to test whether the query
- processing has started or not */
- ibool multi_exist; /*!< multiple FTS_EXIST oper */
+ bool multi_exist; /*!< multiple FTS_EXIST oper */
};
/** For phrase matching, first we collect the documents and the positions
@@ -237,7 +254,7 @@ fts_query_index_fetch_nodes(
Read and filter nodes.
@return fts_node_t instance */
static
-void
+dberr_t
fts_query_filter_doc_ids(
/*=====================*/
fts_query_t* query, /*!< in: query instance */
@@ -318,7 +335,7 @@ static
ulint
fts_query_terms_in_document(
/*========================*/
- /*!< out: DB_SUCCESS if all went well
+ /*!< out: DB_SUCCESS if all go well
else error code */
fts_query_t* query, /*!< in: FTS query state */
doc_id_t doc_id, /*!< in: the word to check */
@@ -430,22 +447,6 @@ fts_query_lcs(
#endif
/*******************************************************************//**
-Compare two byte* arrays.
-@return 0 if p1 == p2, < 0 if p1 < p2, > 0 if p1 > p2 */
-static
-int
-fts_query_strcmp(
-/*=============*/
- const void* p1, /*!< in: pointer to elem */
- const void* p2) /*!< in: pointer to elem */
-{
- void* temp = const_cast<void*>(p2);
-
- return(strcmp(static_cast<const char*>(p1),
- *(static_cast <char**>(temp))));
-}
-
-/*******************************************************************//**
Compare two fts_ranking_t instance on their rank value and doc ids in
descending order on the rank and ascending order on doc id.
@return 0 if p1 == p2, < 0 if p1 < p2, > 0 if p1 > p2 */
@@ -537,6 +538,127 @@ fts_utf8_strcmp(
#endif
/*******************************************************************//**
+Create words in ranking */
+static
+void
+fts_ranking_words_create(
+/*=====================*/
+ fts_query_t* query, /*!< in: query instance */
+ fts_ranking_t* ranking) /*!< in: ranking instance */
+{
+ ranking->words = static_cast<byte*>(
+ mem_heap_zalloc(query->heap, RANKING_WORDS_INIT_LEN));
+ ranking->words_len = RANKING_WORDS_INIT_LEN;
+}
+
+/*
+The optimization here is using a char array(bitmap) to replace words rb tree
+in fts_ranking_t.
+
+It can save lots of memory except in some cases of QUERY EXPANSION.
+
+'word_map' is used as a word dictionary, in which the key is a word, the value
+is a number. In 'fts_ranking_words_add', we first check if the word is in 'word_map'.
+if not, we add it into 'word_map', and give it a position(actually a number).
+then we set the corresponding bit to '1' at the position in the char array 'words'.
+
+'word_vector' is a useful backup of 'word_map', and we can get a word by its position,
+more quickly than searching by value in 'word_map'. we use 'word_vector'
+in 'fts_query_calculate_ranking' and 'fts_expand_query'. In the two functions, we need
+to scan the bitmap 'words', and get a word when a bit is '1', then we get word_freq
+by the word.
+*/
+
+/*******************************************************************//**
+Add a word into ranking */
+static
+void
+fts_ranking_words_add(
+/*==================*/
+ fts_query_t* query, /*!< in: query instance */
+ fts_ranking_t* ranking, /*!< in: ranking instance */
+ const char* word) /*!< in: term/word to add */
+{
+ ulint pos;
+ ulint byte_offset;
+ ulint bit_offset;
+ word_map_t::iterator it;
+
+ /* Note: we suppose the word map and vector are append-only */
+ /* Check if need to add it to word map */
+ it = query->word_map->lower_bound(word);
+ if (it != query->word_map->end()
+ && !query->word_map->key_comp()(word, it->first)) {
+ pos = it->second;
+ } else {
+ pos = query->word_map->size();
+ query->word_map->insert(it,
+ std::pair<std::string, ulint>(word, pos));
+
+ query->word_vector->push_back(word);
+ }
+
+ /* Check words len */
+ byte_offset = pos / CHAR_BIT;
+ if (byte_offset >= ranking->words_len) {
+ byte* words = ranking->words;
+ ulint words_len = ranking->words_len;
+
+ while (byte_offset >= words_len) {
+ words_len *= 2;
+ }
+
+ ranking->words = static_cast<byte*>(
+ mem_heap_zalloc(query->heap, words_len));
+ ut_memcpy(ranking->words, words, ranking->words_len);
+ ranking->words_len = words_len;
+ }
+
+ /* Set ranking words */
+ ut_ad(byte_offset < ranking->words_len);
+ bit_offset = pos % CHAR_BIT;
+ ranking->words[byte_offset] |= 1 << bit_offset;
+}
+
+/*******************************************************************//**
+Get a word from a ranking
+@return true if it's successful */
+static
+bool
+fts_ranking_words_get_next(
+/*=======================*/
+ const fts_query_t* query, /*!< in: query instance */
+ fts_ranking_t* ranking,/*!< in: ranking instance */
+ ulint* pos, /*!< in/out: word start pos */
+ byte** word) /*!< in/out: term/word to add */
+{
+ bool ret = false;
+ ulint max_pos = ranking->words_len * CHAR_BIT;
+
+ /* Search for next word */
+ while (*pos < max_pos) {
+ ulint byte_offset = *pos / CHAR_BIT;
+ ulint bit_offset = *pos % CHAR_BIT;
+
+ if (ranking->words[byte_offset] & (1 << bit_offset)) {
+ ret = true;
+ break;
+ }
+
+ *pos += 1;
+ };
+
+ /* Get next word from word vector */
+ if (ret) {
+ ut_ad(*pos < query->word_vector->size());
+ *word = (byte*)query->word_vector->at((size_t)*pos).c_str();
+ *pos += 1;
+ }
+
+ return ret;
+}
+
+/*******************************************************************//**
Add a word if it doesn't exist, to the term freq RB tree. We store
a pointer to the word that is passed in as the argument.
@return pointer to word */
@@ -569,6 +691,11 @@ fts_query_add_word_freq(
parent.last = rbt_add_node(
query->word_freqs, &parent, &word_freq);
+
+ query->total_size += len
+ + SIZEOF_RBT_CREATE
+ + SIZEOF_RBT_NODE_ADD
+ + sizeof(fts_word_freq_t);
}
return(rbt_value(fts_word_freq_t, parent.last));
@@ -581,6 +708,7 @@ static
fts_doc_freq_t*
fts_query_add_doc_freq(
/*===================*/
+ fts_query_t* query, /*!< in: query instance */
ib_rbt_t* doc_freqs, /*!< in: rb tree of fts_doc_freq_t */
doc_id_t doc_id) /*!< in: doc id to add */
{
@@ -596,6 +724,9 @@ fts_query_add_doc_freq(
doc_freq.doc_id = doc_id;
parent.last = rbt_add_node(doc_freqs, &parent, &doc_freq);
+
+ query->total_size += SIZEOF_RBT_NODE_ADD
+ + sizeof(fts_doc_freq_t);
}
return(rbt_value(fts_doc_freq_t, parent.last));
@@ -625,9 +756,12 @@ fts_query_union_doc_id(
ranking.rank = rank;
ranking.doc_id = doc_id;
- ranking.words = rbt_create(sizeof(byte*), fts_query_strcmp);
+ fts_ranking_words_create(query, &ranking);
rbt_add_node(query->doc_ids, &parent, &ranking);
+
+ query->total_size += SIZEOF_RBT_NODE_ADD
+ + sizeof(fts_ranking_t) + RANKING_WORDS_INIT_LEN;
}
}
@@ -648,13 +782,12 @@ fts_query_remove_doc_id(
/* Check if the doc id is deleted and it's in our set. */
if (fts_bsearch(array, 0, size, doc_id) < 0
&& rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
-
- fts_ranking_t* ranking;
-
- ranking = rbt_value(fts_ranking_t, parent.last);
- rbt_free(ranking->words);
-
ut_free(rbt_remove_node(query->doc_ids, parent.last));
+
+ ut_ad(query->total_size >
+ SIZEOF_RBT_NODE_ADD + sizeof(fts_ranking_t));
+ query->total_size -= SIZEOF_RBT_NODE_ADD
+ + sizeof(fts_ranking_t);
}
}
@@ -712,57 +845,69 @@ fts_query_intersect_doc_id(
ib_rbt_bound_t parent;
ulint size = ib_vector_size(query->deleted->doc_ids);
fts_update_t* array = (fts_update_t*) query->deleted->doc_ids->data;
- fts_ranking_t* ranking;
+ fts_ranking_t* ranking= NULL;
+
+ /* There are three types of intersect:
+ 1. '+a': doc_ids is empty, add doc into intersect if it matches 'a'.
+ 2. 'a +b': docs match 'a' is in doc_ids, add doc into intersect
+ if it matches 'b'. if the doc is also in doc_ids, then change the
+ doc's rank, and add 'a' in doc's words.
+ 3. '+a +b': docs matching '+a' is in doc_ids, add doc into intsersect
+ if it matches 'b' and it's in doc_ids.(multi_exist = true). */
/* Check if the doc id is deleted and it's in our set */
if (fts_bsearch(array, 0, size, doc_id) < 0) {
- /* If this is the first FTS_EXIST we encountered, all of its
- value must be in intersect list */
- if (!query->multi_exist) {
- fts_ranking_t new_ranking;
-
- if (rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
- ranking = rbt_value(fts_ranking_t, parent.last);
- rank += (ranking->rank > 0)
- ? ranking->rank : RANK_UPGRADE;
- if (rank >= 1.0F) {
- rank = 1.0F;
- }
- }
-
- new_ranking.rank = rank;
- new_ranking.doc_id = doc_id;
- new_ranking.words = rbt_create(
- sizeof(byte*), fts_query_strcmp);
- ranking = &new_ranking;
+ fts_ranking_t new_ranking;
- if (rbt_search(query->intersection, &parent,
- ranking) != 0) {
- rbt_add_node(query->intersection,
- &parent, ranking);
+ if (rbt_search(query->doc_ids, &parent, &doc_id) != 0) {
+ if (query->multi_exist) {
+ return;
} else {
- rbt_free(new_ranking.words);
+ new_ranking.words = NULL;
}
} else {
+ ranking = rbt_value(fts_ranking_t, parent.last);
- if (rbt_search(query->doc_ids, &parent, &doc_id) != 0) {
+ /* We've just checked the doc id before */
+ if (ranking->words == NULL) {
+ ut_ad(rbt_search(query->intersection, &parent,
+ ranking) == 0);
return;
}
- ranking = rbt_value(fts_ranking_t, parent.last);
+ /* Merge rank */
+ rank += ranking->rank;
+ if (rank >= 1.0F) {
+ rank = 1.0F;
+ } else if (rank <= -1.0F) {
+ rank = -1.0F;
+ }
+
+ /* Take words */
+ new_ranking.words = ranking->words;
+ new_ranking.words_len = ranking->words_len;
+ }
- ranking->rank = rank;
+ new_ranking.rank = rank;
+ new_ranking.doc_id = doc_id;
- if (ranking->words != NULL
- && rbt_search(query->intersection, &parent,
- ranking) != 0) {
- rbt_add_node(query->intersection, &parent,
- ranking);
+ if (rbt_search(query->intersection, &parent,
+ &new_ranking) != 0) {
+ if (new_ranking.words == NULL) {
+ fts_ranking_words_create(query, &new_ranking);
+ query->total_size += RANKING_WORDS_INIT_LEN;
+ } else {
/* Note that the intersection has taken
ownership of the ranking data. */
ranking->words = NULL;
}
+
+ rbt_add_node(query->intersection,
+ &parent, &new_ranking);
+
+ query->total_size += SIZEOF_RBT_NODE_ADD
+ + sizeof(fts_ranking_t);
}
}
}
@@ -773,6 +918,7 @@ static
void
fts_query_free_doc_ids(
/*===================*/
+ fts_query_t* query, /*!< in: query instance */
ib_rbt_t* doc_ids) /*!< in: rb tree to free */
{
const ib_rbt_node_t* node;
@@ -784,14 +930,21 @@ fts_query_free_doc_ids(
ranking = rbt_value(fts_ranking_t, node);
if (ranking->words) {
- rbt_free(ranking->words);
ranking->words = NULL;
}
ut_free(rbt_remove_node(doc_ids, node));
+
+ ut_ad(query->total_size >
+ SIZEOF_RBT_NODE_ADD + sizeof(fts_ranking_t));
+ query->total_size -= SIZEOF_RBT_NODE_ADD
+ + sizeof(fts_ranking_t);
}
rbt_free(doc_ids);
+
+ ut_ad(query->total_size > SIZEOF_RBT_CREATE);
+ query->total_size -= SIZEOF_RBT_CREATE;
}
/*******************************************************************//**
@@ -808,6 +961,10 @@ fts_query_add_word_to_document(
ib_rbt_bound_t parent;
fts_ranking_t* ranking = NULL;
+ if (query->flags == FTS_OPT_RANKING) {
+ return;
+ }
+
/* First we search the intersection RB tree as it could have
taken ownership of the words rb tree instance. */
if (query->intersection
@@ -823,23 +980,7 @@ fts_query_add_word_to_document(
}
if (ranking != NULL) {
- ulint len;
- byte* term;
-
- len = ut_strlen((char*) word) + 1;
-
- term = static_cast<byte*>(mem_heap_alloc(query->heap, len));
-
- /* Need to copy the NUL character too. */
- memcpy(term, (char*) word, len);
-
- /* The current set must have ownership of the RB tree. */
- ut_a(ranking->words != NULL);
-
- /* If the word doesn't exist in the words "list" we add it. */
- if (rbt_search(ranking->words, &parent, term) != 0) {
- rbt_add_node(ranking->words, &parent, &term);
- }
+ fts_ranking_words_add(query, ranking, (char*)word);
}
}
@@ -874,9 +1015,9 @@ fts_query_check_node(
word_freqs = rbt_value(fts_word_freq_t, parent.last);
- fts_query_filter_doc_ids(
- query, token->f_str, word_freqs, node,
- node->ilist, ilist_size, TRUE);
+ query->error = fts_query_filter_doc_ids(
+ query, token->f_str, word_freqs, node,
+ node->ilist, ilist_size, TRUE);
}
}
@@ -940,10 +1081,14 @@ fts_cache_find_wildcard(
fts_word_freq_t,
freq_parent.last);
- fts_query_filter_doc_ids(
+ query->error = fts_query_filter_doc_ids(
query, srch_text.f_str,
word_freqs, node,
node->ilist, node->ilist_size, TRUE);
+
+ if (query->error != DB_SUCCESS) {
+ return(0);
+ }
}
num_word++;
@@ -976,7 +1121,7 @@ cont_search:
/*****************************************************************//**
Set difference.
-@return DB_SUCCESS if all went well */
+@return DB_SUCCESS if all go well */
static __attribute__((nonnull, warn_unused_result))
dberr_t
fts_query_difference(
@@ -1007,6 +1152,7 @@ fts_query_difference(
const fts_index_cache_t*index_cache;
que_t* graph = NULL;
fts_cache_t* cache = table->fts->cache;
+ dberr_t error;
rw_lock_x_lock(&cache->lock);
@@ -1023,7 +1169,8 @@ fts_query_difference(
} else {
nodes = fts_cache_find_word(index_cache, token);
- for (i = 0; nodes && i < ib_vector_size(nodes); ++i) {
+ for (i = 0; nodes && i < ib_vector_size(nodes)
+ && query->error == DB_SUCCESS; ++i) {
const fts_node_t* node;
node = static_cast<const fts_node_t*>(
@@ -1035,14 +1182,26 @@ fts_query_difference(
rw_lock_x_unlock(&cache->lock);
+ /* error is passed by 'query->error' */
+ if (query->error != DB_SUCCESS) {
+ ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
+ return(query->error);
+ }
+
/* Setup the callback args for filtering and
consolidating the ilist. */
fetch.read_arg = query;
fetch.read_record = fts_query_index_fetch_nodes;
- query->error = fts_index_fetch_nodes(
+ error = fts_index_fetch_nodes(
trx, &graph, &query->fts_index_table, token, &fetch);
+ /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
+ ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
+ if (error != DB_SUCCESS) {
+ query->error = error;
+ }
+
fts_que_graph_free(graph);
}
@@ -1054,7 +1213,7 @@ fts_query_difference(
/*****************************************************************//**
Intersect the token doc ids with the current set.
-@return DB_SUCCESS if all went well */
+@return DB_SUCCESS if all go well */
static __attribute__((nonnull, warn_unused_result))
dberr_t
fts_query_intersect(
@@ -1062,7 +1221,6 @@ fts_query_intersect(
fts_query_t* query, /*!< in: query instance */
const fts_string_t* token) /*!< in: the token to search */
{
- ulint n_doc_ids = 0;
trx_t* trx = query->trx;
dict_table_t* table = query->index->table;
@@ -1073,41 +1231,28 @@ fts_query_intersect(
(int) token->f_len, token->f_str);
#endif
- if (!query->inited) {
-
- ut_a(rbt_empty(query->doc_ids));
-
- /* Since this is the first time we need to convert this
- intersection query into a union query. Otherwise we
- will end up with an empty set. */
- query->oper = FTS_NONE;
- query->inited = TRUE;
- }
-
- if (query->doc_ids) {
- n_doc_ids = rbt_size(query->doc_ids);
- }
-
- /* If the words set is not empty or this is the first time. */
-
- if (!rbt_empty(query->doc_ids) || query->oper == FTS_NONE) {
+ /* If the words set is not empty and multi exist is true,
+ we know the intersection set is empty in advance. */
+ if (!(rbt_empty(query->doc_ids) && query->multi_exist)) {
+ ulint n_doc_ids = 0;
ulint i;
fts_fetch_t fetch;
const ib_vector_t* nodes;
const fts_index_cache_t*index_cache;
que_t* graph = NULL;
fts_cache_t* cache = table->fts->cache;
+ dberr_t error;
ut_a(!query->intersection);
- /* Only if this is not the first time. */
- if (query->oper != FTS_NONE) {
+ n_doc_ids = rbt_size(query->doc_ids);
- /* Create the rb tree that will hold the doc ids of
- the intersection. */
- query->intersection = rbt_create(
- sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
- }
+ /* Create the rb tree that will hold the doc ids of
+ the intersection. */
+ query->intersection = rbt_create(
+ sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
+
+ query->total_size += SIZEOF_RBT_CREATE;
/* This is to avoid decompressing the ilist if the
node's ilist doc ids are out of range. */
@@ -1144,7 +1289,8 @@ fts_query_intersect(
} else {
nodes = fts_cache_find_word(index_cache, token);
- for (i = 0; nodes && i < ib_vector_size(nodes); ++i) {
+ for (i = 0; nodes && i < ib_vector_size(nodes)
+ && query->error == DB_SUCCESS; ++i) {
const fts_node_t* node;
node = static_cast<const fts_node_t*>(
@@ -1156,48 +1302,48 @@ fts_query_intersect(
rw_lock_x_unlock(&cache->lock);
+ /* error is passed by 'query->error' */
+ if (query->error != DB_SUCCESS) {
+ ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
+ return(query->error);
+ }
+
/* Setup the callback args for filtering and
consolidating the ilist. */
fetch.read_arg = query;
fetch.read_record = fts_query_index_fetch_nodes;
- query->error = fts_index_fetch_nodes(
+ error = fts_index_fetch_nodes(
trx, &graph, &query->fts_index_table, token, &fetch);
+ /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
+ ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
+ if (error != DB_SUCCESS) {
+ query->error = error;
+ }
+
fts_que_graph_free(graph);
if (query->error == DB_SUCCESS) {
- if (query->oper == FTS_EXIST) {
-
- /* The size can't increase. */
- ut_a(rbt_size(query->doc_ids) <= n_doc_ids);
- }
-
/* Make the intesection (rb tree) the current doc id
set and free the old set. */
- if (query->intersection) {
- fts_query_free_doc_ids(query->doc_ids);
- query->doc_ids = query->intersection;
- query->intersection = NULL;
- }
+ fts_query_free_doc_ids(query, query->doc_ids);
+ query->doc_ids = query->intersection;
+ query->intersection = NULL;
- /* Reset the set operation to intersect. */
- query->oper = FTS_EXIST;
+ ut_a(!query->multi_exist || (query->multi_exist
+ && rbt_size(query->doc_ids) <= n_doc_ids));
}
}
- if (!query->multi_exist) {
- query->multi_exist = TRUE;
- }
-
return(query->error);
}
/*****************************************************************//**
Query index cache.
-@return DB_SUCCESS if all went well */
+@return DB_SUCCESS if all go well */
static
-ulint
+dberr_t
fts_query_cache(
/*============*/
fts_query_t* query, /*!< in/out: query instance */
@@ -1227,7 +1373,8 @@ fts_query_cache(
nodes = fts_cache_find_word(index_cache, token);
- for (i = 0; nodes && i < ib_vector_size(nodes); ++i) {
+ for (i = 0; nodes && i < ib_vector_size(nodes)
+ && query->error == DB_SUCCESS; ++i) {
const fts_node_t* node;
node = static_cast<const fts_node_t*>(
@@ -1239,12 +1386,12 @@ fts_query_cache(
rw_lock_x_unlock(&cache->lock);
- return(DB_SUCCESS);
+ return(query->error);
}
/*****************************************************************//**
Set union.
-@return DB_SUCCESS if all went well */
+@return DB_SUCCESS if all go well */
static __attribute__((nonnull, warn_unused_result))
dberr_t
fts_query_union(
@@ -1256,6 +1403,7 @@ fts_query_union(
ulint n_doc_ids = 0;
trx_t* trx = query->trx;
que_t* graph = NULL;
+ dberr_t error;
ut_a(query->oper == FTS_NONE || query->oper == FTS_DECR_RATING ||
query->oper == FTS_NEGATE || query->oper == FTS_INCR_RATING);
@@ -1265,8 +1413,6 @@ fts_query_union(
(int) token->f_len, token->f_str);
#endif
- query->error = DB_SUCCESS;
-
if (query->doc_ids) {
n_doc_ids = rbt_size(query->doc_ids);
}
@@ -1287,9 +1433,15 @@ fts_query_union(
fetch.read_record = fts_query_index_fetch_nodes;
/* Read the nodes from disk. */
- query->error = fts_index_fetch_nodes(
+ error = fts_index_fetch_nodes(
trx, &graph, &query->fts_index_table, token, &fetch);
+ /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
+ ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
+ if (error != DB_SUCCESS) {
+ query->error = error;
+ }
+
fts_que_graph_free(graph);
if (query->error == DB_SUCCESS) {
@@ -1302,22 +1454,17 @@ fts_query_union(
if (query->doc_ids) {
n_doc_ids = rbt_size(query->doc_ids) - n_doc_ids;
}
-
- /* In case there were no matching docs then we reset the
- state, otherwise intersection will not be able to detect
- that it's being called for the first time. */
- if (!rbt_empty(query->doc_ids)) {
- query->inited = TRUE;
- }
}
return(query->error);
}
/*****************************************************************//**
-Depending upon the current query operator process the doc id. */
+Depending upon the current query operator process the doc id.
+return DB_SUCCESS if all go well
+or return DB_FTS_EXCEED_RESULT_CACHE_LIMIT */
static
-void
+dberr_t
fts_query_process_doc_id(
/*=====================*/
fts_query_t* query, /*!< in: query instance */
@@ -1325,6 +1472,10 @@ fts_query_process_doc_id(
fts_rank_t rank) /*!< in: if non-zero, it is the
rank associated with the doc_id */
{
+ if (query->flags == FTS_OPT_RANKING) {
+ return(DB_SUCCESS);
+ }
+
switch (query->oper) {
case FTS_NONE:
fts_query_union_doc_id(query, doc_id, rank);
@@ -1355,12 +1506,18 @@ fts_query_process_doc_id(
default:
ut_error;
}
+
+ if (query->total_size > fts_result_cache_limit) {
+ return(DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
+ } else {
+ return(DB_SUCCESS);
+ }
}
/*****************************************************************//**
Merge two result sets. */
static
-void
+dberr_t
fts_merge_doc_ids(
/*==============*/
fts_query_t* query, /*!< in,out: query instance */
@@ -1377,25 +1534,42 @@ fts_merge_doc_ids(
query->intersection = rbt_create(
sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
+
+ query->total_size += SIZEOF_RBT_CREATE;
}
/* Merge the elements to the result set. */
for (node = rbt_first(doc_ids); node; node = rbt_next(doc_ids, node)) {
fts_ranking_t* ranking;
+ ulint pos = 0;
+ byte* word = NULL;
ranking = rbt_value(fts_ranking_t, node);
- fts_query_process_doc_id(
- query, ranking->doc_id, ranking->rank);
+ query->error = fts_query_process_doc_id(
+ query, ranking->doc_id, ranking->rank);
+
+ if (query->error != DB_SUCCESS) {
+ return(query->error);
+ }
+
+ /* Merge words. Don't need to take operator into account. */
+ ut_a(ranking->words);
+ while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
+ fts_query_add_word_to_document(query, ranking->doc_id,
+ word);
+ }
}
/* If it is an intersection operation, reset query->doc_ids
to query->intersection and free the old result list. */
if (query->oper == FTS_EXIST && query->intersection != NULL) {
- fts_query_free_doc_ids(query->doc_ids);
+ fts_query_free_doc_ids(query, query->doc_ids);
query->doc_ids = query->intersection;
query->intersection = NULL;
}
+
+ return(DB_SUCCESS);
}
/*****************************************************************//**
@@ -1827,7 +2001,7 @@ fts_query_select(
/********************************************************************
Read the rows from the FTS index, that match word and where the
doc id is between first and last doc id.
-@return DB_SUCCESS if all went well else error code */
+@return DB_SUCCESS if all go well else error code */
static __attribute__((nonnull, warn_unused_result))
dberr_t
fts_query_find_term(
@@ -1967,7 +2141,7 @@ fts_query_sum(
/********************************************************************
Calculate the total documents that contain a particular word (term).
-@return DB_SUCCESS if all went well else error code */
+@return DB_SUCCESS if all go well else error code */
static __attribute__((nonnull, warn_unused_result))
dberr_t
fts_query_total_docs_containing_term(
@@ -2046,7 +2220,7 @@ fts_query_total_docs_containing_term(
/********************************************************************
Get the total number of words in a documents.
-@return DB_SUCCESS if all went well else error code */
+@return DB_SUCCESS if all go well else error code */
static __attribute__((nonnull, warn_unused_result))
dberr_t
fts_query_terms_in_document(
@@ -2233,8 +2407,14 @@ static __attribute__((nonnull, warn_unused_result))
dberr_t
fts_query_search_phrase(
/*====================*/
- fts_query_t* query, /*!< in: query instance */
- ib_vector_t* tokens) /*!< in: tokens to search */
+ fts_query_t* query, /*!< in: query instance */
+ ib_vector_t* orig_tokens, /*!< in: tokens to search,
+ with any stopwords in the
+ original phrase */
+ ib_vector_t* tokens) /*!< in: tokens that does
+ not include stopwords and
+ can be used to calculate
+ ranking */
{
ulint i;
fts_get_doc_t get_doc;
@@ -2275,14 +2455,18 @@ fts_query_search_phrase(
if (match->doc_id != 0) {
query->error = fts_query_match_document(
- tokens, &get_doc,
+ orig_tokens, &get_doc,
match, query->distance, &found);
if (query->error == DB_SUCCESS && found) {
ulint z;
- fts_query_process_doc_id(query,
+ query->error = fts_query_process_doc_id(query,
match->doc_id, 0);
+ if (query->error != DB_SUCCESS) {
+ goto func_exit;
+ }
+
for (z = 0; z < ib_vector_size(tokens); z++) {
fts_string_t* token;
token = static_cast<fts_string_t*>(
@@ -2295,6 +2479,7 @@ fts_query_search_phrase(
}
}
+func_exit:
/* Free the prepared statement. */
if (get_doc.get_document_graph) {
fts_que_graph_free(get_doc.get_document_graph);
@@ -2314,17 +2499,21 @@ fts_query_phrase_search(
fts_query_t* query, /*!< in: query instance */
const fts_string_t* phrase) /*!< in: token to search */
{
- char* src;
- char* state; /* strtok_r internal state */
ib_vector_t* tokens;
+ ib_vector_t* orig_tokens;
mem_heap_t* heap = mem_heap_create(sizeof(fts_string_t));
- char* utf8 = strdup((char*) phrase->f_str);
+ ulint len = phrase->f_len;
+ ulint cur_pos = 0;
ib_alloc_t* heap_alloc;
ulint num_token;
+ CHARSET_INFO* charset;
+
+ charset = query->fts_index_table.charset;
heap_alloc = ib_heap_allocator_create(heap);
tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4);
+ orig_tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4);
if (query->distance != ULINT_UNDEFINED && query->distance > 0) {
query->flags = FTS_PROXIMITY;
@@ -2333,26 +2522,65 @@ fts_query_phrase_search(
}
/* Split the phrase into tokens. */
- for (src = utf8; /* No op */; src = NULL) {
+ while (cur_pos < len) {
+ fts_cache_t* cache = query->index->table->fts->cache;
+ ib_rbt_bound_t parent;
+ ulint offset;
+ ulint cur_len;
+ fts_string_t result_str;
+
+ cur_len = innobase_mysql_fts_get_token(
+ charset,
+ reinterpret_cast<const byte*>(phrase->f_str) + cur_pos,
+ reinterpret_cast<const byte*>(phrase->f_str) + len,
+ &result_str, &offset);
+
+ if (cur_len == 0) {
+ break;
+ }
+
+ cur_pos += cur_len;
+
+ if (result_str.f_n_char == 0) {
+ continue;
+ }
+
fts_string_t* token = static_cast<fts_string_t*>(
ib_vector_push(tokens, NULL));
- token->f_str = (byte*) strtok_r(
- src, FTS_PHRASE_DELIMITER, &state);
+ token->f_str = static_cast<byte*>(
+ mem_heap_alloc(heap, result_str.f_len + 1));
+ ut_memcpy(token->f_str, result_str.f_str, result_str.f_len);
- if (token->f_str) {
+ token->f_len = result_str.f_len;
+ token->f_str[token->f_len] = 0;
+
+ if (cache->stopword_info.cached_stopword
+ && rbt_search(cache->stopword_info.cached_stopword,
+ &parent, token) != 0
+ && result_str.f_n_char >= fts_min_token_size
+ && result_str.f_n_char <= fts_max_token_size) {
/* Add the word to the RB tree so that we can
calculate it's frequencey within a document. */
fts_query_add_word_freq(query, token->f_str);
-
- token->f_len = ut_strlen((char*) token->f_str);
} else {
ib_vector_pop(tokens);
- break;
+ }
+
+ /* we will start to store all words including stopwords
+ in the "orig_tokens" vector, but skip any leading words
+ that are stopwords */
+ if (!ib_vector_is_empty(tokens)) {
+ fts_string_t* orig_token = static_cast<fts_string_t*>(
+ ib_vector_push(orig_tokens, NULL));
+
+ orig_token->f_str = token->f_str;
+ orig_token->f_len = token->f_len;
}
}
num_token = ib_vector_size(tokens);
+ ut_ad(ib_vector_size(orig_tokens) >= num_token);
/* Ignore empty strings. */
if (num_token > 0) {
@@ -2362,19 +2590,7 @@ fts_query_phrase_search(
fts_ast_oper_t oper = query->oper;
que_t* graph = NULL;
ulint i;
-
- /* Create the rb tree for storing the words read form disk. */
- if (!query->inited) {
-
- /* Since this is the first time, we need to convert
- this intersection query into a union query. Otherwise
- we will end up with an empty set. */
- if (query->oper == FTS_EXIST) {
- query->oper = FTS_NONE;
- }
-
- query->inited = TRUE;
- }
+ dberr_t error;
/* Create the vector for storing matching document ids
and the positions of the first token of the phrase. */
@@ -2422,10 +2638,16 @@ fts_query_phrase_search(
query->matched = query->match_array[i];
}
- fts_index_fetch_nodes(
+ error = fts_index_fetch_nodes(
trx, &graph, &query->fts_index_table,
token, &fetch);
+ /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
+ ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
+ if (error != DB_SUCCESS) {
+ query->error = error;
+ }
+
fts_que_graph_free(graph);
graph = NULL;
@@ -2438,12 +2660,15 @@ fts_query_phrase_search(
/* If any of the token can't be found,
no need to continue match */
- if (ib_vector_is_empty(query->match_array[i])) {
+ if (ib_vector_is_empty(query->match_array[i])
+ || query->error != DB_SUCCESS) {
goto func_exit;
}
}
- if (num_token == 1
+ /* Just a single word, no need to fetch the original
+ documents to do phrase matching */
+ if (ib_vector_size(orig_tokens) == 1
&& !ib_vector_is_empty(query->match_array[0])) {
fts_match_t* match;
ulint n_matched;
@@ -2455,8 +2680,11 @@ fts_query_phrase_search(
ib_vector_get(
query->match_array[0], i));
- fts_query_process_doc_id(
- query, match->doc_id, 0);
+ query->error = fts_query_process_doc_id(
+ query, match->doc_id, 0);
+ if (query->error != DB_SUCCESS) {
+ goto func_exit;
+ }
fts_query_add_word_to_document(
query, match->doc_id, token->f_str);
@@ -2484,18 +2712,21 @@ fts_query_phrase_search(
/* Read the actual text in and search for the phrase. */
if (matched) {
- query->error = DB_SUCCESS;
+ ut_ad(query->error == DB_SUCCESS);
query->error = fts_query_search_phrase(
- query, tokens);
+ query, orig_tokens, tokens);
}
}
/* Restore original operation. */
query->oper = oper;
+
+ if (query->error != DB_SUCCESS) {
+ goto func_exit;
+ }
}
func_exit:
- free(utf8);
mem_heap_free(heap);
/* Don't need it anymore. */
@@ -2506,7 +2737,7 @@ func_exit:
/*****************************************************************//**
Find the word and evaluate.
-@return DB_SUCCESS if all went well */
+@return DB_SUCCESS if all go well */
static __attribute__((nonnull, warn_unused_result))
dberr_t
fts_query_execute(
@@ -2578,7 +2809,7 @@ fts_query_get_token(
/*****************************************************************//**
Visit every node of the AST. */
static
-ulint
+dberr_t
fts_query_visitor(
/*==============*/
fts_ast_oper_t oper, /*!< in: current operator */
@@ -2602,11 +2833,8 @@ fts_query_visitor(
token.f_str = node->text.ptr;
token.f_len = ut_strlen((char*) token.f_str);
- /* "first second third" is treated as first & second
- & third. Create the rb tree that will hold the doc ids
- of the intersection. */
- if (!query->intersection && query->oper == FTS_EXIST) {
-
+ if (query->oper == FTS_EXIST) {
+ ut_ad(query->intersection == NULL);
query->intersection = rbt_create(
sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
}
@@ -2621,10 +2849,8 @@ fts_query_visitor(
query->collect_positions = FALSE;
- /* Make the intesection (rb tree) the current doc id
- set and free the old set. */
- if (query->intersection) {
- fts_query_free_doc_ids(query->doc_ids);
+ if (query->oper == FTS_EXIST) {
+ fts_query_free_doc_ids(query, query->doc_ids);
query->doc_ids = query->intersection;
query->intersection = NULL;
}
@@ -2649,6 +2875,10 @@ fts_query_visitor(
ut_error;
}
+ if (query->oper == FTS_EXIST) {
+ query->multi_exist = true;
+ }
+
return(query->error);
}
@@ -2656,7 +2886,7 @@ fts_query_visitor(
Process (nested) sub-expression, create a new result set to store the
sub-expression result by processing nodes under current sub-expression
list. Merge the sub-expression result with that of parent expression list.
-@return DB_SUCCESS if all went well */
+@return DB_SUCCESS if all well */
UNIV_INTERN
dberr_t
fts_ast_visit_sub_exp(
@@ -2670,8 +2900,8 @@ fts_ast_visit_sub_exp(
ib_rbt_t* parent_doc_ids;
ib_rbt_t* subexpr_doc_ids;
dberr_t error = DB_SUCCESS;
- ibool inited = query->inited;
bool will_be_ignored = false;
+ bool multi_exist;
ut_a(node->type == FTS_AST_SUBEXP_LIST);
@@ -2691,45 +2921,34 @@ fts_ast_visit_sub_exp(
query->doc_ids = rbt_create(sizeof(fts_ranking_t),
fts_ranking_doc_id_cmp);
- /* Reset the query start flag because the sub-expression result
- set is independent of any previous results. The state flag
- reset is needed for not making an intersect operation on an empty
- set in the first call to fts_query_intersect() for the first term. */
- query->inited = FALSE;
+ query->total_size += SIZEOF_RBT_CREATE;
+ multi_exist = query->multi_exist;
+ query->multi_exist = false;
/* Process nodes in current sub-expression and store its
result set in query->doc_ids we created above. */
error = fts_ast_visit(FTS_NONE, node->next, visitor,
arg, &will_be_ignored);
/* Reinstate parent node state and prepare for merge. */
- query->inited = inited;
+ query->multi_exist = multi_exist;
query->oper = cur_oper;
subexpr_doc_ids = query->doc_ids;
/* Restore current result set. */
query->doc_ids = parent_doc_ids;
- if (query->oper == FTS_EXIST && !query->inited) {
- ut_a(rbt_empty(query->doc_ids));
- /* Since this is the first time we need to convert this
- intersection query into a union query. Otherwise we
- will end up with an empty set. */
- query->oper = FTS_NONE;
- query->inited = TRUE;
- }
-
/* Merge the sub-expression result with the parent result set. */
if (error == DB_SUCCESS && !rbt_empty(subexpr_doc_ids)) {
- fts_merge_doc_ids(query, subexpr_doc_ids);
+ error = fts_merge_doc_ids(query, subexpr_doc_ids);
}
if (query->oper == FTS_EXIST) {
- query->multi_exist = TRUE;
+ query->multi_exist = true;
}
/* Free current result set. Result already merged into parent. */
- fts_query_free_doc_ids(subexpr_doc_ids);
+ fts_query_free_doc_ids(query, subexpr_doc_ids);
return(error);
}
@@ -2808,9 +3027,10 @@ fts_query_find_doc_id(
/*****************************************************************//**
Read and filter nodes.
-@return fts_node_t instance */
+@return DB_SUCCESS if all go well,
+or return DB_FTS_EXCEED_RESULT_CACHE_LIMIT */
static
-void
+dberr_t
fts_query_filter_doc_ids(
/*=====================*/
fts_query_t* query, /*!< in: query instance */
@@ -2863,6 +3083,10 @@ fts_query_filter_doc_ids(
parent container. */
match->positions = ib_vector_create(
heap_alloc, sizeof(ulint), 64);
+
+ query->total_size += sizeof(fts_match_t)
+ + sizeof(ib_vector_t)
+ + sizeof(ulint) * 64;
}
/* Unpack the positions within the document. */
@@ -2888,7 +3112,7 @@ fts_query_filter_doc_ids(
/* Add the doc id to the doc freq rb tree, if the doc id
doesn't exist it will be created. */
- doc_freq = fts_query_add_doc_freq(doc_freqs, doc_id);
+ doc_freq = fts_query_add_doc_freq(query, doc_freqs, doc_id);
/* Avoid duplicating frequency tally. */
if (doc_freq->freq == 0) {
@@ -2904,21 +3128,29 @@ fts_query_filter_doc_ids(
/* We simply collect the matching documents and the
positions here and match later. */
if (!query->collect_positions) {
+ /* We ignore error here and will check it later */
fts_query_process_doc_id(query, doc_id, 0);
- }
- /* Add the word to the document's matched RB tree. */
- fts_query_add_word_to_document(query, doc_id, word);
+ /* Add the word to the document's matched RB tree. */
+ fts_query_add_word_to_document(query, doc_id, word);
+ }
}
/* Some sanity checks. */
ut_a(doc_id == node->last_doc_id);
+
+ if (query->total_size > fts_result_cache_limit) {
+ return(DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
+ } else {
+ return(DB_SUCCESS);
+ }
}
/*****************************************************************//**
-Read the FTS INDEX row. */
+Read the FTS INDEX row.
+@return DB_SUCCESS if all go well. */
static
-void
+dberr_t
fts_query_read_node(
/*================*/
fts_query_t* query, /*!< in: query instance */
@@ -2932,6 +3164,7 @@ fts_query_read_node(
fts_word_freq_t* word_freq;
ibool skip = FALSE;
byte term[FTS_MAX_WORD_LEN + 1];
+ dberr_t error = DB_SUCCESS;
ut_a(query->cur_node->type == FTS_AST_TERM ||
query->cur_node->type == FTS_AST_TEXT);
@@ -3005,9 +3238,9 @@ fts_query_read_node(
case 4: /* ILIST */
- fts_query_filter_doc_ids(
- query, word_freq->word, word_freq,
- &node, data, len, FALSE);
+ error = fts_query_filter_doc_ids(
+ query, word_freq->word, word_freq,
+ &node, data, len, FALSE);
break;
@@ -3021,6 +3254,8 @@ fts_query_read_node(
ut_a(i == 5);
}
+
+ return error;
}
/*****************************************************************//**
@@ -3047,9 +3282,15 @@ fts_query_index_fetch_nodes(
ut_a(dfield_len <= FTS_MAX_WORD_LEN);
- fts_query_read_node(query, &key, que_node_get_next(exp));
+ /* Note: we pass error out by 'query->error' */
+ query->error = fts_query_read_node(query, &key, que_node_get_next(exp));
- return(TRUE);
+ if (query->error != DB_SUCCESS) {
+ ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
+ return(FALSE);
+ } else {
+ return(TRUE);
+ }
}
/*****************************************************************//**
@@ -3107,27 +3348,22 @@ fts_query_calculate_ranking(
const fts_query_t* query, /*!< in: query state */
fts_ranking_t* ranking) /*!< in: Document to rank */
{
- const ib_rbt_node_t* node;
+ ulint pos = 0;
+ byte* word = NULL;
/* At this stage, ranking->rank should not exceed the 1.0
bound */
ut_ad(ranking->rank <= 1.0 && ranking->rank >= -1.0);
+ ut_ad(query->word_map->size() == query->word_vector->size());
- for (node = rbt_first(ranking->words);
- node;
- node = rbt_first(ranking->words)) {
-
+ while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
int ret;
- const byte* word;
- const byte** wordp;
ib_rbt_bound_t parent;
double weight;
fts_doc_freq_t* doc_freq;
fts_word_freq_t* word_freq;
- wordp = rbt_value(const byte*, node);
- word = *wordp;
-
+ ut_ad(word != NULL);
ret = rbt_search(query->word_freqs, &parent, word);
/* It must exist. */
@@ -3146,8 +3382,6 @@ fts_query_calculate_ranking(
weight = (double) doc_freq->freq * word_freq->idf;
ranking->rank += (fts_rank_t) (weight * word_freq->idf);
-
- ut_free(rbt_remove_node(ranking->words, node));
}
}
@@ -3157,6 +3391,7 @@ static
void
fts_query_add_ranking(
/*==================*/
+ fts_query_t* query, /*!< in: query state */
ib_rbt_t* ranking_tree, /*!< in: ranking tree */
const fts_ranking_t* new_ranking) /*!< in: ranking of a document */
{
@@ -3173,6 +3408,9 @@ fts_query_add_ranking(
ut_a(ranking->words == NULL);
} else {
rbt_add_node(ranking_tree, &parent, new_ranking);
+
+ query->total_size += SIZEOF_RBT_NODE_ADD
+ + sizeof(fts_ranking_t);
}
}
@@ -3213,14 +3451,13 @@ static
fts_result_t*
fts_query_prepare_result(
/*=====================*/
- const fts_query_t* query, /*!< in: Query state */
- fts_result_t* result) /*!< in: result this can contain
- data from a previous search on
- another FTS index */
+ fts_query_t* query, /*!< in: Query state */
+ fts_result_t* result) /*!< in: result this can contain
+ data from a previous search on
+ another FTS index */
{
const ib_rbt_node_t* node;
-
- ut_a(rbt_size(query->doc_ids) > 0);
+ bool result_is_null = false;
if (result == NULL) {
result = static_cast<fts_result_t*>(ut_malloc(sizeof(*result)));
@@ -3229,8 +3466,55 @@ fts_query_prepare_result(
result->rankings_by_id = rbt_create(
sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
+
+ query->total_size += sizeof(fts_result_t) + SIZEOF_RBT_CREATE;
+ result_is_null = true;
+ }
+
+ if (query->flags == FTS_OPT_RANKING) {
+ fts_word_freq_t* word_freq;
+ ulint size = ib_vector_size(query->deleted->doc_ids);
+ fts_update_t* array =
+ (fts_update_t*) query->deleted->doc_ids->data;
+
+ node = rbt_first(query->word_freqs);
+ ut_ad(node);
+ word_freq = rbt_value(fts_word_freq_t, node);
+
+ for (node = rbt_first(word_freq->doc_freqs);
+ node;
+ node = rbt_next(word_freq->doc_freqs, node)) {
+ fts_doc_freq_t* doc_freq;
+ fts_ranking_t ranking;
+
+ doc_freq = rbt_value(fts_doc_freq_t, node);
+
+ /* Don't put deleted docs into result */
+ if (fts_bsearch(array, 0, size, doc_freq->doc_id)
+ >= 0) {
+ continue;
+ }
+
+ ranking.doc_id = doc_freq->doc_id;
+ ranking.rank = doc_freq->freq * word_freq->idf
+ * word_freq->idf;
+ ranking.words = NULL;
+
+ fts_query_add_ranking(query, result->rankings_by_id,
+ &ranking);
+
+ if (query->total_size > fts_result_cache_limit) {
+ query->error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT;
+ fts_query_free_result(result);
+ return(NULL);
+ }
+ }
+
+ return(result);
}
+ ut_a(rbt_size(query->doc_ids) > 0);
+
for (node = rbt_first(query->doc_ids);
node;
node = rbt_next(query->doc_ids, node)) {
@@ -3245,11 +3529,24 @@ fts_query_prepare_result(
// different FTS indexes.
/* We don't need these anymore free the resources. */
- ut_a(rbt_empty(ranking->words));
- rbt_free(ranking->words);
ranking->words = NULL;
- fts_query_add_ranking(result->rankings_by_id, ranking);
+ if (!result_is_null) {
+ fts_query_add_ranking(query, result->rankings_by_id, ranking);
+
+ if (query->total_size > fts_result_cache_limit) {
+ query->error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT;
+ fts_query_free_result(result);
+ return(NULL);
+ }
+ }
+ }
+
+ if (result_is_null) {
+ /* Use doc_ids directly */
+ rbt_free(result->rankings_by_id);
+ result->rankings_by_id = query->doc_ids;
+ query->doc_ids = NULL;
}
return(result);
@@ -3261,10 +3558,10 @@ static
fts_result_t*
fts_query_get_result(
/*=================*/
- const fts_query_t* query, /*!< in: query instance */
+ fts_query_t* query, /*!< in: query instance */
fts_result_t* result) /*!< in: result */
{
- if (rbt_size(query->doc_ids) > 0) {
+ if (rbt_size(query->doc_ids) > 0 || query->flags == FTS_OPT_RANKING) {
/* Copy the doc ids to the result. */
result = fts_query_prepare_result(query, result);
} else {
@@ -3298,7 +3595,7 @@ fts_query_free(
}
if (query->doc_ids) {
- fts_query_free_doc_ids(query->doc_ids);
+ fts_query_free_doc_ids(query, query->doc_ids);
}
if (query->word_freqs) {
@@ -3327,6 +3624,14 @@ fts_query_free(
mem_heap_free(query->heap);
}
+ if (query->word_map) {
+ delete query->word_map;
+ }
+
+ if (query->word_vector) {
+ delete query->word_vector;
+ }
+
memset(query, 0, sizeof(*query));
}
@@ -3342,12 +3647,13 @@ fts_query_parse(
{
int error;
fts_ast_state_t state;
- ibool mode = query->boolean_mode;
+ bool mode = query->boolean_mode;
memset(&state, 0x0, sizeof(state));
/* Setup the scanner to use, this depends on the mode flag. */
state.lexer = fts_lexer_create(mode, query_str, query_len);
+ state.charset = query->fts_index_table.charset;
error = fts_parse(&state);
fts_lexer_free(state.lexer);
state.lexer = NULL;
@@ -3363,6 +3669,112 @@ fts_query_parse(
return(state.root);
}
+/*******************************************************************//**
+FTS Query optimization
+Set FTS_OPT_RANKING if it is a simple term query */
+static
+void
+fts_query_can_optimize(
+/*===================*/
+ fts_query_t* query, /*!< in/out: query instance */
+ uint flags) /*!< In: FTS search mode */
+{
+ fts_ast_node_t* node = query->root;
+
+ if (flags & FTS_EXPAND) {
+ return;
+ }
+
+ /* Check if it has only a term without oper */
+ ut_ad(node->type == FTS_AST_LIST);
+ node = node->list.head;
+ if (node != NULL && node->type == FTS_AST_TERM && node->next == NULL) {
+ query->flags = FTS_OPT_RANKING;
+ }
+}
+
+/*******************************************************************//**
+Pre-process the query string
+1) make it lower case
+2) in boolean mode, if there is '-' or '+' that is immediately proceeded
+and followed by valid word, make it a space
+@return the processed string */
+static
+byte*
+fts_query_str_preprocess(
+/*=====================*/
+ const byte* query_str, /*!< in: FTS query */
+ ulint query_len, /*!< in: FTS query string len */
+ ulint *result_len, /*!< out: result string length */
+ CHARSET_INFO* charset, /*!< in: string charset */
+ bool boolean_mode) /*!< in: is boolean mode */
+{
+ ulint cur_pos = 0;
+ ulint str_len;
+ byte* str_ptr;
+ bool in_phrase = false;
+
+ /* Convert the query string to lower case before parsing. We own
+ the ut_malloc'ed result and so remember to free it before return. */
+
+ str_len = query_len * charset->casedn_multiply + 1;
+ str_ptr = static_cast<byte*>(ut_malloc(str_len));
+
+ *result_len = innobase_fts_casedn_str(
+ charset, const_cast<char*>(reinterpret_cast<const char*>(
+ query_str)), query_len,
+ reinterpret_cast<char*>(str_ptr), str_len);
+
+ ut_ad(*result_len < str_len);
+
+ str_ptr[*result_len] = 0;
+
+ /* If it is boolean mode, no need to check for '-/+' */
+ if (!boolean_mode) {
+ return(str_ptr);
+ }
+
+ /* Otherwise, we travese the string to find any '-/+' that are
+ immediately proceeded and followed by valid search word.
+ NOTE: we should not do so for CJK languages, this should
+ be taken care of in our CJK implementation */
+ while (cur_pos < *result_len) {
+ fts_string_t str;
+ ulint offset;
+ ulint cur_len;
+
+ cur_len = innobase_mysql_fts_get_token(
+ charset, str_ptr + cur_pos, str_ptr + *result_len,
+ &str, &offset);
+
+ if (cur_len == 0) {
+ break;
+ }
+
+ /* Check if we are in a phrase, if so, no need to do
+ replacement of '-/+'. */
+ for (byte* ptr = str_ptr + cur_pos; ptr < str.f_str; ptr++) {
+ if ((char) (*ptr) == '"' ) {
+ in_phrase = !in_phrase;
+ }
+ }
+
+ /* Find those are not leading '-/+' and also not in a phrase */
+ if (cur_pos > 0 && str.f_str - str_ptr - cur_pos == 1
+ && !in_phrase) {
+ char* last_op = reinterpret_cast<char*>(
+ str_ptr + cur_pos);
+
+ if (*last_op == '-' || *last_op == '+') {
+ *last_op = ' ';
+ }
+ }
+
+ cur_pos += cur_len;
+ }
+
+ return(str_ptr);
+}
/*******************************************************************//**
FTS Query entry point.
@@ -3382,9 +3794,8 @@ fts_query(
fts_query_t query;
dberr_t error = DB_SUCCESS;
byte* lc_query_str;
- ulint lc_query_str_len;
ulint result_len;
- ibool boolean_mode;
+ bool boolean_mode;
trx_t* query_trx;
CHARSET_INFO* charset;
ulint start_time_ms;
@@ -3401,7 +3812,6 @@ fts_query(
query.trx = query_trx;
query.index = index;
- query.inited = FALSE;
query.boolean_mode = boolean_mode;
query.deleted = fts_doc_ids_create();
query.cur_node = NULL;
@@ -3418,6 +3828,9 @@ fts_query(
query.fts_index_table.parent = index->table->name;
query.fts_index_table.charset = charset;
+ query.word_map = new word_map_t;
+ query.word_vector = new word_vector_t;
+ query.error = DB_SUCCESS;
/* Setup the RB tree that will be used to collect per term
statistics. */
@@ -3425,6 +3838,8 @@ fts_query(
sizeof(fts_word_freq_t), innobase_fts_string_cmp,
(void*) charset);
+ query.total_size += SIZEOF_RBT_CREATE;
+
query.total_docs = dict_table_get_n_rows(index->table);
#ifdef FTS_DOC_STATS_DEBUG
@@ -3467,6 +3882,7 @@ fts_query(
/* Sort the vector so that we can do a binary search over the ids. */
ib_vector_sort(query.deleted->doc_ids, fts_update_doc_id_cmp);
+#if 0
/* Convert the query string to lower case before parsing. We own
the ut_malloc'ed result and so remember to free it before return. */
@@ -3481,16 +3897,30 @@ fts_query(
lc_query_str[result_len] = 0;
+#endif
+
+ lc_query_str = fts_query_str_preprocess(
+ query_str, query_len, &result_len, charset, boolean_mode);
+
query.heap = mem_heap_create(128);
/* Create the rb tree for the doc id (current) set. */
query.doc_ids = rbt_create(
sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
+ query.total_size += SIZEOF_RBT_CREATE;
+
/* Parse the input query string. */
if (fts_query_parse(&query, lc_query_str, result_len)) {
fts_ast_node_t* ast = query.root;
+ /* Optimize query to check if it's a single term */
+ fts_query_can_optimize(&query, flags);
+
+ DBUG_EXECUTE_IF("fts_instrument_result_cache_limit",
+ fts_result_cache_limit = 2048;
+ );
+
/* Traverse the Abstract Syntax Tree (AST) and execute
the query. */
query.error = fts_ast_visit(
@@ -3500,11 +3930,13 @@ fts_query(
/* If query expansion is requested, extend the search
with first search pass result */
if (query.error == DB_SUCCESS && (flags & FTS_EXPAND)) {
- query.error = fts_expand_query(index, &query);
+ query.error = fts_expand_query(index, &query);
}
/* Calculate the inverse document frequency of the terms. */
- fts_query_calculate_idf(&query);
+ if (query.error == DB_SUCCESS) {
+ fts_query_calculate_idf(&query);
+ }
/* Copy the result from the query state, so that we can
return it to the caller. */
@@ -3530,6 +3962,15 @@ fts_query(
(*result)->rankings_by_id
? (int) rbt_size((*result)->rankings_by_id)
: -1);
+
+ /* Log memory consumption & result size */
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Full Search Memory: "
+ "%lu (bytes), Row: %lu .",
+ query.total_size,
+ (*result)->rankings_by_id
+ ? rbt_size((*result)->rankings_by_id)
+ : 0);
}
func_exit:
@@ -3608,30 +4049,24 @@ static
void
fts_print_doc_id(
/*=============*/
- ib_rbt_t* doc_ids) /*!< in : tree that stores doc_ids.*/
+ fts_query_t* query) /*!< in : tree that stores doc_ids.*/
{
const ib_rbt_node_t* node;
- const ib_rbt_node_t* node_word;
/* Iterate each member of the doc_id set */
- for (node = rbt_first(doc_ids);
+ for (node = rbt_first(query->doc_ids);
node;
- node = rbt_next(doc_ids, node)) {
+ node = rbt_next(query->doc_ids, node)) {
fts_ranking_t* ranking;
ranking = rbt_value(fts_ranking_t, node);
fprintf(stderr, "doc_ids info, doc_id: %ld \n",
(ulint) ranking->doc_id);
- for (node_word = rbt_first(ranking->words);
- node_word;
- node_word = rbt_next(ranking->words, node_word)) {
-
- const byte** value;
-
- value = rbt_value(const byte*, node_word);
-
- fprintf(stderr, "doc_ids info, value: %s \n", *value);
+ ulint pos = 0;
+ byte* value = NULL;
+ while (fts_ranking_words_get_next(query, ranking, &pos, &value)) {
+ fprintf(stderr, "doc_ids info, value: %s \n", value);
}
}
}
@@ -3676,8 +4111,9 @@ fts_expand_query(
result_doc.charset = index_cache->charset;
+ query->total_size += SIZEOF_RBT_CREATE;
#ifdef UNIV_DEBUG
- fts_print_doc_id(query->doc_ids);
+ fts_print_doc_id(query);
#endif
for (node = rbt_first(query->doc_ids);
@@ -3685,7 +4121,12 @@ fts_expand_query(
node = rbt_next(query->doc_ids, node)) {
fts_ranking_t* ranking;
- const ib_rbt_node_t* node_word;
+ ulint pos;
+ byte* word;
+ ulint prev_token_size;
+ ulint estimate_size;
+
+ prev_token_size = rbt_size(result_doc.tokens);
ranking = rbt_value(fts_ranking_t, node);
@@ -3702,16 +4143,15 @@ fts_expand_query(
/* Remove words that have already been searched in the
first pass */
- for (node_word = rbt_first(ranking->words);
- node_word;
- node_word = rbt_next(ranking->words, node_word)) {
+ pos = 0;
+ word = NULL;
+ while (fts_ranking_words_get_next(query, ranking, &pos,
+ &word)) {
fts_string_t str;
ibool ret;
- const byte** strp;
- strp = rbt_value(const byte*, node_word);
/* FIXME: We are discarding a const qualifier here. */
- str.f_str = (byte*) *strp;
+ str.f_str = word;
str.f_len = ut_strlen((const char*) str.f_str);
ret = rbt_delete(result_doc.tokens, &str);
@@ -3723,6 +4163,18 @@ fts_expand_query(
(ulint) ranking->doc_id);
}
}
+
+ /* Estimate memory used, see fts_process_token and fts_token_t.
+ We ignore token size here. */
+ estimate_size = (rbt_size(result_doc.tokens) - prev_token_size)
+ * (SIZEOF_RBT_NODE_ADD + sizeof(fts_token_t)
+ + sizeof(ib_vector_t) + sizeof(ulint) * 32);
+ query->total_size += estimate_size;
+
+ if (query->total_size > fts_result_cache_limit) {
+ error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT;
+ goto func_exit;
+ }
}
/* Search the table the second time with expanded search list */
@@ -3740,6 +4192,7 @@ fts_expand_query(
}
}
+func_exit:
fts_doc_free(&result_doc);
return(error);
@@ -3857,8 +4310,12 @@ fts_phrase_or_proximity_search(
if (fts_query_is_in_proximity_range(
query, match, &qualified_pos)) {
/* If so, mark we find a matching doc */
- fts_query_process_doc_id(
+ query->error = fts_query_process_doc_id(
query, match[0]->doc_id, 0);
+ if (query->error != DB_SUCCESS) {
+ matched = FALSE;
+ goto func_exit;
+ }
matched = TRUE;
for (ulint z = 0; z < num_token; z++) {