diff options
author | Sergei Golubchik <sergii@pisem.net> | 2014-02-01 09:33:26 +0100 |
---|---|---|
committer | Sergei Golubchik <sergii@pisem.net> | 2014-02-01 09:33:26 +0100 |
commit | 27d45e46968e4bd623565688a997b83b0a5cc1a8 (patch) | |
tree | 8df9c87f8fd3855d81e3ae46078d34609668e63a /storage/innobase/fts/fts0que.cc | |
parent | 27fbb637d36324992b270f0dc0472807ffa4ebc2 (diff) | |
download | mariadb-git-27d45e46968e4bd623565688a997b83b0a5cc1a8.tar.gz |
MDEV-5574 Set AUTO_INCREMENT below max value of column.
Update InnoDB to 5.6.14
Apply MySQL-5.6 hack for MySQL Bug#16434374
Move Aria-only HA_RTREE_INDEX from my_base.h to maria_def.h (breaks an assert in InnoDB)
Fix InnoDB memory leak
Diffstat (limited to 'storage/innobase/fts/fts0que.cc')
-rw-r--r-- | storage/innobase/fts/fts0que.cc | 1039 |
1 files changed, 748 insertions, 291 deletions
diff --git a/storage/innobase/fts/fts0que.cc b/storage/innobase/fts/fts0que.cc index 5c757b4f176..72901d193eb 100644 --- a/storage/innobase/fts/fts0que.cc +++ b/storage/innobase/fts/fts0que.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2007, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2007, 2013, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -40,6 +40,10 @@ Completed 2011/7/10 Sunny and Jimmy Yang #include "fts0vlc.ic" #endif +#include <string> +#include <vector> +#include <map> + #define FTS_ELEM(t, n, i, j) (t[(i) * n + (j)]) #define RANK_DOWNGRADE (-1.0F) @@ -50,13 +54,20 @@ FIXME, this limitation can be removed easily. Need to see if we want to enforce such limitation */ #define MAX_PROXIMITY_ITEM 128 +/* Memory used by rbt itself for create and node add */ +#define SIZEOF_RBT_CREATE sizeof(ib_rbt_t) + sizeof(ib_rbt_node_t) * 2 +#define SIZEOF_RBT_NODE_ADD sizeof(ib_rbt_node_t) + +/*Initial byte length for 'words' in fts_ranking_t */ +#define RANKING_WORDS_INIT_LEN 4 + /* Coeffecient to use for normalize relevance ranking. */ static const double FTS_NORMALIZE_COEFF = 0.0115F; // FIXME: Need to have a generic iterator that traverses the ilist. -/* For parsing the search phrase */ -static const char* FTS_PHRASE_DELIMITER = "\t "; +typedef std::map<std::string, ulint> word_map_t; +typedef std::vector<std::string> word_vector_t; struct fts_word_freq_t; @@ -72,6 +83,8 @@ struct fts_query_t { fts_table_t fts_index_table;/*!< FTS auxiliary index table def */ + ulint total_size; /*!< total memory size used by query */ + fts_doc_ids_t* deleted; /*!< Deleted doc ids that need to be filtered from the output */ @@ -79,6 +92,12 @@ struct fts_query_t { fts_ast_node_t* cur_node; /*!< Current tree node */ + word_map_t* word_map; /*!< Matched word map for + searching by word*/ + + word_vector_t* word_vector; /*!< Matched word vector for + searching by index */ + ib_rbt_t* doc_ids; /*!< The current set of matching doc ids, elements are of type fts_ranking_t */ @@ -113,7 +132,7 @@ struct fts_query_t { doc_id_t upper_doc_id; /*!< Highest doc id in doc_ids */ - ibool boolean_mode; /*!< TRUE if boolean mode query */ + bool boolean_mode; /*!< TRUE if boolean mode query */ ib_vector_t* matched; /*!< Array of matching documents (fts_match_t) to search for a phrase */ @@ -133,9 +152,7 @@ struct fts_query_t { document, its elements are of type fts_word_freq_t */ - ibool inited; /*!< Flag to test whether the query - processing has started or not */ - ibool multi_exist; /*!< multiple FTS_EXIST oper */ + bool multi_exist; /*!< multiple FTS_EXIST oper */ }; /** For phrase matching, first we collect the documents and the positions @@ -237,7 +254,7 @@ fts_query_index_fetch_nodes( Read and filter nodes. @return fts_node_t instance */ static -void +dberr_t fts_query_filter_doc_ids( /*=====================*/ fts_query_t* query, /*!< in: query instance */ @@ -318,7 +335,7 @@ static ulint fts_query_terms_in_document( /*========================*/ - /*!< out: DB_SUCCESS if all went well + /*!< out: DB_SUCCESS if all go well else error code */ fts_query_t* query, /*!< in: FTS query state */ doc_id_t doc_id, /*!< in: the word to check */ @@ -430,22 +447,6 @@ fts_query_lcs( #endif /*******************************************************************//** -Compare two byte* arrays. -@return 0 if p1 == p2, < 0 if p1 < p2, > 0 if p1 > p2 */ -static -int -fts_query_strcmp( -/*=============*/ - const void* p1, /*!< in: pointer to elem */ - const void* p2) /*!< in: pointer to elem */ -{ - void* temp = const_cast<void*>(p2); - - return(strcmp(static_cast<const char*>(p1), - *(static_cast <char**>(temp)))); -} - -/*******************************************************************//** Compare two fts_ranking_t instance on their rank value and doc ids in descending order on the rank and ascending order on doc id. @return 0 if p1 == p2, < 0 if p1 < p2, > 0 if p1 > p2 */ @@ -537,6 +538,127 @@ fts_utf8_strcmp( #endif /*******************************************************************//** +Create words in ranking */ +static +void +fts_ranking_words_create( +/*=====================*/ + fts_query_t* query, /*!< in: query instance */ + fts_ranking_t* ranking) /*!< in: ranking instance */ +{ + ranking->words = static_cast<byte*>( + mem_heap_zalloc(query->heap, RANKING_WORDS_INIT_LEN)); + ranking->words_len = RANKING_WORDS_INIT_LEN; +} + +/* +The optimization here is using a char array(bitmap) to replace words rb tree +in fts_ranking_t. + +It can save lots of memory except in some cases of QUERY EXPANSION. + +'word_map' is used as a word dictionary, in which the key is a word, the value +is a number. In 'fts_ranking_words_add', we first check if the word is in 'word_map'. +if not, we add it into 'word_map', and give it a position(actually a number). +then we set the corresponding bit to '1' at the position in the char array 'words'. + +'word_vector' is a useful backup of 'word_map', and we can get a word by its position, +more quickly than searching by value in 'word_map'. we use 'word_vector' +in 'fts_query_calculate_ranking' and 'fts_expand_query'. In the two functions, we need +to scan the bitmap 'words', and get a word when a bit is '1', then we get word_freq +by the word. +*/ + +/*******************************************************************//** +Add a word into ranking */ +static +void +fts_ranking_words_add( +/*==================*/ + fts_query_t* query, /*!< in: query instance */ + fts_ranking_t* ranking, /*!< in: ranking instance */ + const char* word) /*!< in: term/word to add */ +{ + ulint pos; + ulint byte_offset; + ulint bit_offset; + word_map_t::iterator it; + + /* Note: we suppose the word map and vector are append-only */ + /* Check if need to add it to word map */ + it = query->word_map->lower_bound(word); + if (it != query->word_map->end() + && !query->word_map->key_comp()(word, it->first)) { + pos = it->second; + } else { + pos = query->word_map->size(); + query->word_map->insert(it, + std::pair<std::string, ulint>(word, pos)); + + query->word_vector->push_back(word); + } + + /* Check words len */ + byte_offset = pos / CHAR_BIT; + if (byte_offset >= ranking->words_len) { + byte* words = ranking->words; + ulint words_len = ranking->words_len; + + while (byte_offset >= words_len) { + words_len *= 2; + } + + ranking->words = static_cast<byte*>( + mem_heap_zalloc(query->heap, words_len)); + ut_memcpy(ranking->words, words, ranking->words_len); + ranking->words_len = words_len; + } + + /* Set ranking words */ + ut_ad(byte_offset < ranking->words_len); + bit_offset = pos % CHAR_BIT; + ranking->words[byte_offset] |= 1 << bit_offset; +} + +/*******************************************************************//** +Get a word from a ranking +@return true if it's successful */ +static +bool +fts_ranking_words_get_next( +/*=======================*/ + const fts_query_t* query, /*!< in: query instance */ + fts_ranking_t* ranking,/*!< in: ranking instance */ + ulint* pos, /*!< in/out: word start pos */ + byte** word) /*!< in/out: term/word to add */ +{ + bool ret = false; + ulint max_pos = ranking->words_len * CHAR_BIT; + + /* Search for next word */ + while (*pos < max_pos) { + ulint byte_offset = *pos / CHAR_BIT; + ulint bit_offset = *pos % CHAR_BIT; + + if (ranking->words[byte_offset] & (1 << bit_offset)) { + ret = true; + break; + } + + *pos += 1; + }; + + /* Get next word from word vector */ + if (ret) { + ut_ad(*pos < query->word_vector->size()); + *word = (byte*)query->word_vector->at((size_t)*pos).c_str(); + *pos += 1; + } + + return ret; +} + +/*******************************************************************//** Add a word if it doesn't exist, to the term freq RB tree. We store a pointer to the word that is passed in as the argument. @return pointer to word */ @@ -569,6 +691,11 @@ fts_query_add_word_freq( parent.last = rbt_add_node( query->word_freqs, &parent, &word_freq); + + query->total_size += len + + SIZEOF_RBT_CREATE + + SIZEOF_RBT_NODE_ADD + + sizeof(fts_word_freq_t); } return(rbt_value(fts_word_freq_t, parent.last)); @@ -581,6 +708,7 @@ static fts_doc_freq_t* fts_query_add_doc_freq( /*===================*/ + fts_query_t* query, /*!< in: query instance */ ib_rbt_t* doc_freqs, /*!< in: rb tree of fts_doc_freq_t */ doc_id_t doc_id) /*!< in: doc id to add */ { @@ -596,6 +724,9 @@ fts_query_add_doc_freq( doc_freq.doc_id = doc_id; parent.last = rbt_add_node(doc_freqs, &parent, &doc_freq); + + query->total_size += SIZEOF_RBT_NODE_ADD + + sizeof(fts_doc_freq_t); } return(rbt_value(fts_doc_freq_t, parent.last)); @@ -625,9 +756,12 @@ fts_query_union_doc_id( ranking.rank = rank; ranking.doc_id = doc_id; - ranking.words = rbt_create(sizeof(byte*), fts_query_strcmp); + fts_ranking_words_create(query, &ranking); rbt_add_node(query->doc_ids, &parent, &ranking); + + query->total_size += SIZEOF_RBT_NODE_ADD + + sizeof(fts_ranking_t) + RANKING_WORDS_INIT_LEN; } } @@ -648,13 +782,12 @@ fts_query_remove_doc_id( /* Check if the doc id is deleted and it's in our set. */ if (fts_bsearch(array, 0, size, doc_id) < 0 && rbt_search(query->doc_ids, &parent, &doc_id) == 0) { - - fts_ranking_t* ranking; - - ranking = rbt_value(fts_ranking_t, parent.last); - rbt_free(ranking->words); - ut_free(rbt_remove_node(query->doc_ids, parent.last)); + + ut_ad(query->total_size > + SIZEOF_RBT_NODE_ADD + sizeof(fts_ranking_t)); + query->total_size -= SIZEOF_RBT_NODE_ADD + + sizeof(fts_ranking_t); } } @@ -712,57 +845,69 @@ fts_query_intersect_doc_id( ib_rbt_bound_t parent; ulint size = ib_vector_size(query->deleted->doc_ids); fts_update_t* array = (fts_update_t*) query->deleted->doc_ids->data; - fts_ranking_t* ranking; + fts_ranking_t* ranking= NULL; + + /* There are three types of intersect: + 1. '+a': doc_ids is empty, add doc into intersect if it matches 'a'. + 2. 'a +b': docs match 'a' is in doc_ids, add doc into intersect + if it matches 'b'. if the doc is also in doc_ids, then change the + doc's rank, and add 'a' in doc's words. + 3. '+a +b': docs matching '+a' is in doc_ids, add doc into intsersect + if it matches 'b' and it's in doc_ids.(multi_exist = true). */ /* Check if the doc id is deleted and it's in our set */ if (fts_bsearch(array, 0, size, doc_id) < 0) { - /* If this is the first FTS_EXIST we encountered, all of its - value must be in intersect list */ - if (!query->multi_exist) { - fts_ranking_t new_ranking; - - if (rbt_search(query->doc_ids, &parent, &doc_id) == 0) { - ranking = rbt_value(fts_ranking_t, parent.last); - rank += (ranking->rank > 0) - ? ranking->rank : RANK_UPGRADE; - if (rank >= 1.0F) { - rank = 1.0F; - } - } - - new_ranking.rank = rank; - new_ranking.doc_id = doc_id; - new_ranking.words = rbt_create( - sizeof(byte*), fts_query_strcmp); - ranking = &new_ranking; + fts_ranking_t new_ranking; - if (rbt_search(query->intersection, &parent, - ranking) != 0) { - rbt_add_node(query->intersection, - &parent, ranking); + if (rbt_search(query->doc_ids, &parent, &doc_id) != 0) { + if (query->multi_exist) { + return; } else { - rbt_free(new_ranking.words); + new_ranking.words = NULL; } } else { + ranking = rbt_value(fts_ranking_t, parent.last); - if (rbt_search(query->doc_ids, &parent, &doc_id) != 0) { + /* We've just checked the doc id before */ + if (ranking->words == NULL) { + ut_ad(rbt_search(query->intersection, &parent, + ranking) == 0); return; } - ranking = rbt_value(fts_ranking_t, parent.last); + /* Merge rank */ + rank += ranking->rank; + if (rank >= 1.0F) { + rank = 1.0F; + } else if (rank <= -1.0F) { + rank = -1.0F; + } + + /* Take words */ + new_ranking.words = ranking->words; + new_ranking.words_len = ranking->words_len; + } - ranking->rank = rank; + new_ranking.rank = rank; + new_ranking.doc_id = doc_id; - if (ranking->words != NULL - && rbt_search(query->intersection, &parent, - ranking) != 0) { - rbt_add_node(query->intersection, &parent, - ranking); + if (rbt_search(query->intersection, &parent, + &new_ranking) != 0) { + if (new_ranking.words == NULL) { + fts_ranking_words_create(query, &new_ranking); + query->total_size += RANKING_WORDS_INIT_LEN; + } else { /* Note that the intersection has taken ownership of the ranking data. */ ranking->words = NULL; } + + rbt_add_node(query->intersection, + &parent, &new_ranking); + + query->total_size += SIZEOF_RBT_NODE_ADD + + sizeof(fts_ranking_t); } } } @@ -773,6 +918,7 @@ static void fts_query_free_doc_ids( /*===================*/ + fts_query_t* query, /*!< in: query instance */ ib_rbt_t* doc_ids) /*!< in: rb tree to free */ { const ib_rbt_node_t* node; @@ -784,14 +930,21 @@ fts_query_free_doc_ids( ranking = rbt_value(fts_ranking_t, node); if (ranking->words) { - rbt_free(ranking->words); ranking->words = NULL; } ut_free(rbt_remove_node(doc_ids, node)); + + ut_ad(query->total_size > + SIZEOF_RBT_NODE_ADD + sizeof(fts_ranking_t)); + query->total_size -= SIZEOF_RBT_NODE_ADD + + sizeof(fts_ranking_t); } rbt_free(doc_ids); + + ut_ad(query->total_size > SIZEOF_RBT_CREATE); + query->total_size -= SIZEOF_RBT_CREATE; } /*******************************************************************//** @@ -808,6 +961,10 @@ fts_query_add_word_to_document( ib_rbt_bound_t parent; fts_ranking_t* ranking = NULL; + if (query->flags == FTS_OPT_RANKING) { + return; + } + /* First we search the intersection RB tree as it could have taken ownership of the words rb tree instance. */ if (query->intersection @@ -823,23 +980,7 @@ fts_query_add_word_to_document( } if (ranking != NULL) { - ulint len; - byte* term; - - len = ut_strlen((char*) word) + 1; - - term = static_cast<byte*>(mem_heap_alloc(query->heap, len)); - - /* Need to copy the NUL character too. */ - memcpy(term, (char*) word, len); - - /* The current set must have ownership of the RB tree. */ - ut_a(ranking->words != NULL); - - /* If the word doesn't exist in the words "list" we add it. */ - if (rbt_search(ranking->words, &parent, term) != 0) { - rbt_add_node(ranking->words, &parent, &term); - } + fts_ranking_words_add(query, ranking, (char*)word); } } @@ -874,9 +1015,9 @@ fts_query_check_node( word_freqs = rbt_value(fts_word_freq_t, parent.last); - fts_query_filter_doc_ids( - query, token->f_str, word_freqs, node, - node->ilist, ilist_size, TRUE); + query->error = fts_query_filter_doc_ids( + query, token->f_str, word_freqs, node, + node->ilist, ilist_size, TRUE); } } @@ -940,10 +1081,14 @@ fts_cache_find_wildcard( fts_word_freq_t, freq_parent.last); - fts_query_filter_doc_ids( + query->error = fts_query_filter_doc_ids( query, srch_text.f_str, word_freqs, node, node->ilist, node->ilist_size, TRUE); + + if (query->error != DB_SUCCESS) { + return(0); + } } num_word++; @@ -976,7 +1121,7 @@ cont_search: /*****************************************************************//** Set difference. -@return DB_SUCCESS if all went well */ +@return DB_SUCCESS if all go well */ static __attribute__((nonnull, warn_unused_result)) dberr_t fts_query_difference( @@ -1007,6 +1152,7 @@ fts_query_difference( const fts_index_cache_t*index_cache; que_t* graph = NULL; fts_cache_t* cache = table->fts->cache; + dberr_t error; rw_lock_x_lock(&cache->lock); @@ -1023,7 +1169,8 @@ fts_query_difference( } else { nodes = fts_cache_find_word(index_cache, token); - for (i = 0; nodes && i < ib_vector_size(nodes); ++i) { + for (i = 0; nodes && i < ib_vector_size(nodes) + && query->error == DB_SUCCESS; ++i) { const fts_node_t* node; node = static_cast<const fts_node_t*>( @@ -1035,14 +1182,26 @@ fts_query_difference( rw_lock_x_unlock(&cache->lock); + /* error is passed by 'query->error' */ + if (query->error != DB_SUCCESS) { + ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT); + return(query->error); + } + /* Setup the callback args for filtering and consolidating the ilist. */ fetch.read_arg = query; fetch.read_record = fts_query_index_fetch_nodes; - query->error = fts_index_fetch_nodes( + error = fts_index_fetch_nodes( trx, &graph, &query->fts_index_table, token, &fetch); + /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */ + ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS)); + if (error != DB_SUCCESS) { + query->error = error; + } + fts_que_graph_free(graph); } @@ -1054,7 +1213,7 @@ fts_query_difference( /*****************************************************************//** Intersect the token doc ids with the current set. -@return DB_SUCCESS if all went well */ +@return DB_SUCCESS if all go well */ static __attribute__((nonnull, warn_unused_result)) dberr_t fts_query_intersect( @@ -1062,7 +1221,6 @@ fts_query_intersect( fts_query_t* query, /*!< in: query instance */ const fts_string_t* token) /*!< in: the token to search */ { - ulint n_doc_ids = 0; trx_t* trx = query->trx; dict_table_t* table = query->index->table; @@ -1073,41 +1231,28 @@ fts_query_intersect( (int) token->f_len, token->f_str); #endif - if (!query->inited) { - - ut_a(rbt_empty(query->doc_ids)); - - /* Since this is the first time we need to convert this - intersection query into a union query. Otherwise we - will end up with an empty set. */ - query->oper = FTS_NONE; - query->inited = TRUE; - } - - if (query->doc_ids) { - n_doc_ids = rbt_size(query->doc_ids); - } - - /* If the words set is not empty or this is the first time. */ - - if (!rbt_empty(query->doc_ids) || query->oper == FTS_NONE) { + /* If the words set is not empty and multi exist is true, + we know the intersection set is empty in advance. */ + if (!(rbt_empty(query->doc_ids) && query->multi_exist)) { + ulint n_doc_ids = 0; ulint i; fts_fetch_t fetch; const ib_vector_t* nodes; const fts_index_cache_t*index_cache; que_t* graph = NULL; fts_cache_t* cache = table->fts->cache; + dberr_t error; ut_a(!query->intersection); - /* Only if this is not the first time. */ - if (query->oper != FTS_NONE) { + n_doc_ids = rbt_size(query->doc_ids); - /* Create the rb tree that will hold the doc ids of - the intersection. */ - query->intersection = rbt_create( - sizeof(fts_ranking_t), fts_ranking_doc_id_cmp); - } + /* Create the rb tree that will hold the doc ids of + the intersection. */ + query->intersection = rbt_create( + sizeof(fts_ranking_t), fts_ranking_doc_id_cmp); + + query->total_size += SIZEOF_RBT_CREATE; /* This is to avoid decompressing the ilist if the node's ilist doc ids are out of range. */ @@ -1144,7 +1289,8 @@ fts_query_intersect( } else { nodes = fts_cache_find_word(index_cache, token); - for (i = 0; nodes && i < ib_vector_size(nodes); ++i) { + for (i = 0; nodes && i < ib_vector_size(nodes) + && query->error == DB_SUCCESS; ++i) { const fts_node_t* node; node = static_cast<const fts_node_t*>( @@ -1156,48 +1302,48 @@ fts_query_intersect( rw_lock_x_unlock(&cache->lock); + /* error is passed by 'query->error' */ + if (query->error != DB_SUCCESS) { + ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT); + return(query->error); + } + /* Setup the callback args for filtering and consolidating the ilist. */ fetch.read_arg = query; fetch.read_record = fts_query_index_fetch_nodes; - query->error = fts_index_fetch_nodes( + error = fts_index_fetch_nodes( trx, &graph, &query->fts_index_table, token, &fetch); + /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */ + ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS)); + if (error != DB_SUCCESS) { + query->error = error; + } + fts_que_graph_free(graph); if (query->error == DB_SUCCESS) { - if (query->oper == FTS_EXIST) { - - /* The size can't increase. */ - ut_a(rbt_size(query->doc_ids) <= n_doc_ids); - } - /* Make the intesection (rb tree) the current doc id set and free the old set. */ - if (query->intersection) { - fts_query_free_doc_ids(query->doc_ids); - query->doc_ids = query->intersection; - query->intersection = NULL; - } + fts_query_free_doc_ids(query, query->doc_ids); + query->doc_ids = query->intersection; + query->intersection = NULL; - /* Reset the set operation to intersect. */ - query->oper = FTS_EXIST; + ut_a(!query->multi_exist || (query->multi_exist + && rbt_size(query->doc_ids) <= n_doc_ids)); } } - if (!query->multi_exist) { - query->multi_exist = TRUE; - } - return(query->error); } /*****************************************************************//** Query index cache. -@return DB_SUCCESS if all went well */ +@return DB_SUCCESS if all go well */ static -ulint +dberr_t fts_query_cache( /*============*/ fts_query_t* query, /*!< in/out: query instance */ @@ -1227,7 +1373,8 @@ fts_query_cache( nodes = fts_cache_find_word(index_cache, token); - for (i = 0; nodes && i < ib_vector_size(nodes); ++i) { + for (i = 0; nodes && i < ib_vector_size(nodes) + && query->error == DB_SUCCESS; ++i) { const fts_node_t* node; node = static_cast<const fts_node_t*>( @@ -1239,12 +1386,12 @@ fts_query_cache( rw_lock_x_unlock(&cache->lock); - return(DB_SUCCESS); + return(query->error); } /*****************************************************************//** Set union. -@return DB_SUCCESS if all went well */ +@return DB_SUCCESS if all go well */ static __attribute__((nonnull, warn_unused_result)) dberr_t fts_query_union( @@ -1256,6 +1403,7 @@ fts_query_union( ulint n_doc_ids = 0; trx_t* trx = query->trx; que_t* graph = NULL; + dberr_t error; ut_a(query->oper == FTS_NONE || query->oper == FTS_DECR_RATING || query->oper == FTS_NEGATE || query->oper == FTS_INCR_RATING); @@ -1265,8 +1413,6 @@ fts_query_union( (int) token->f_len, token->f_str); #endif - query->error = DB_SUCCESS; - if (query->doc_ids) { n_doc_ids = rbt_size(query->doc_ids); } @@ -1287,9 +1433,15 @@ fts_query_union( fetch.read_record = fts_query_index_fetch_nodes; /* Read the nodes from disk. */ - query->error = fts_index_fetch_nodes( + error = fts_index_fetch_nodes( trx, &graph, &query->fts_index_table, token, &fetch); + /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */ + ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS)); + if (error != DB_SUCCESS) { + query->error = error; + } + fts_que_graph_free(graph); if (query->error == DB_SUCCESS) { @@ -1302,22 +1454,17 @@ fts_query_union( if (query->doc_ids) { n_doc_ids = rbt_size(query->doc_ids) - n_doc_ids; } - - /* In case there were no matching docs then we reset the - state, otherwise intersection will not be able to detect - that it's being called for the first time. */ - if (!rbt_empty(query->doc_ids)) { - query->inited = TRUE; - } } return(query->error); } /*****************************************************************//** -Depending upon the current query operator process the doc id. */ +Depending upon the current query operator process the doc id. +return DB_SUCCESS if all go well +or return DB_FTS_EXCEED_RESULT_CACHE_LIMIT */ static -void +dberr_t fts_query_process_doc_id( /*=====================*/ fts_query_t* query, /*!< in: query instance */ @@ -1325,6 +1472,10 @@ fts_query_process_doc_id( fts_rank_t rank) /*!< in: if non-zero, it is the rank associated with the doc_id */ { + if (query->flags == FTS_OPT_RANKING) { + return(DB_SUCCESS); + } + switch (query->oper) { case FTS_NONE: fts_query_union_doc_id(query, doc_id, rank); @@ -1355,12 +1506,18 @@ fts_query_process_doc_id( default: ut_error; } + + if (query->total_size > fts_result_cache_limit) { + return(DB_FTS_EXCEED_RESULT_CACHE_LIMIT); + } else { + return(DB_SUCCESS); + } } /*****************************************************************//** Merge two result sets. */ static -void +dberr_t fts_merge_doc_ids( /*==============*/ fts_query_t* query, /*!< in,out: query instance */ @@ -1377,25 +1534,42 @@ fts_merge_doc_ids( query->intersection = rbt_create( sizeof(fts_ranking_t), fts_ranking_doc_id_cmp); + + query->total_size += SIZEOF_RBT_CREATE; } /* Merge the elements to the result set. */ for (node = rbt_first(doc_ids); node; node = rbt_next(doc_ids, node)) { fts_ranking_t* ranking; + ulint pos = 0; + byte* word = NULL; ranking = rbt_value(fts_ranking_t, node); - fts_query_process_doc_id( - query, ranking->doc_id, ranking->rank); + query->error = fts_query_process_doc_id( + query, ranking->doc_id, ranking->rank); + + if (query->error != DB_SUCCESS) { + return(query->error); + } + + /* Merge words. Don't need to take operator into account. */ + ut_a(ranking->words); + while (fts_ranking_words_get_next(query, ranking, &pos, &word)) { + fts_query_add_word_to_document(query, ranking->doc_id, + word); + } } /* If it is an intersection operation, reset query->doc_ids to query->intersection and free the old result list. */ if (query->oper == FTS_EXIST && query->intersection != NULL) { - fts_query_free_doc_ids(query->doc_ids); + fts_query_free_doc_ids(query, query->doc_ids); query->doc_ids = query->intersection; query->intersection = NULL; } + + return(DB_SUCCESS); } /*****************************************************************//** @@ -1827,7 +2001,7 @@ fts_query_select( /******************************************************************** Read the rows from the FTS index, that match word and where the doc id is between first and last doc id. -@return DB_SUCCESS if all went well else error code */ +@return DB_SUCCESS if all go well else error code */ static __attribute__((nonnull, warn_unused_result)) dberr_t fts_query_find_term( @@ -1967,7 +2141,7 @@ fts_query_sum( /******************************************************************** Calculate the total documents that contain a particular word (term). -@return DB_SUCCESS if all went well else error code */ +@return DB_SUCCESS if all go well else error code */ static __attribute__((nonnull, warn_unused_result)) dberr_t fts_query_total_docs_containing_term( @@ -2046,7 +2220,7 @@ fts_query_total_docs_containing_term( /******************************************************************** Get the total number of words in a documents. -@return DB_SUCCESS if all went well else error code */ +@return DB_SUCCESS if all go well else error code */ static __attribute__((nonnull, warn_unused_result)) dberr_t fts_query_terms_in_document( @@ -2233,8 +2407,14 @@ static __attribute__((nonnull, warn_unused_result)) dberr_t fts_query_search_phrase( /*====================*/ - fts_query_t* query, /*!< in: query instance */ - ib_vector_t* tokens) /*!< in: tokens to search */ + fts_query_t* query, /*!< in: query instance */ + ib_vector_t* orig_tokens, /*!< in: tokens to search, + with any stopwords in the + original phrase */ + ib_vector_t* tokens) /*!< in: tokens that does + not include stopwords and + can be used to calculate + ranking */ { ulint i; fts_get_doc_t get_doc; @@ -2275,14 +2455,18 @@ fts_query_search_phrase( if (match->doc_id != 0) { query->error = fts_query_match_document( - tokens, &get_doc, + orig_tokens, &get_doc, match, query->distance, &found); if (query->error == DB_SUCCESS && found) { ulint z; - fts_query_process_doc_id(query, + query->error = fts_query_process_doc_id(query, match->doc_id, 0); + if (query->error != DB_SUCCESS) { + goto func_exit; + } + for (z = 0; z < ib_vector_size(tokens); z++) { fts_string_t* token; token = static_cast<fts_string_t*>( @@ -2295,6 +2479,7 @@ fts_query_search_phrase( } } +func_exit: /* Free the prepared statement. */ if (get_doc.get_document_graph) { fts_que_graph_free(get_doc.get_document_graph); @@ -2314,17 +2499,21 @@ fts_query_phrase_search( fts_query_t* query, /*!< in: query instance */ const fts_string_t* phrase) /*!< in: token to search */ { - char* src; - char* state; /* strtok_r internal state */ ib_vector_t* tokens; + ib_vector_t* orig_tokens; mem_heap_t* heap = mem_heap_create(sizeof(fts_string_t)); - char* utf8 = strdup((char*) phrase->f_str); + ulint len = phrase->f_len; + ulint cur_pos = 0; ib_alloc_t* heap_alloc; ulint num_token; + CHARSET_INFO* charset; + + charset = query->fts_index_table.charset; heap_alloc = ib_heap_allocator_create(heap); tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4); + orig_tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4); if (query->distance != ULINT_UNDEFINED && query->distance > 0) { query->flags = FTS_PROXIMITY; @@ -2333,26 +2522,65 @@ fts_query_phrase_search( } /* Split the phrase into tokens. */ - for (src = utf8; /* No op */; src = NULL) { + while (cur_pos < len) { + fts_cache_t* cache = query->index->table->fts->cache; + ib_rbt_bound_t parent; + ulint offset; + ulint cur_len; + fts_string_t result_str; + + cur_len = innobase_mysql_fts_get_token( + charset, + reinterpret_cast<const byte*>(phrase->f_str) + cur_pos, + reinterpret_cast<const byte*>(phrase->f_str) + len, + &result_str, &offset); + + if (cur_len == 0) { + break; + } + + cur_pos += cur_len; + + if (result_str.f_n_char == 0) { + continue; + } + fts_string_t* token = static_cast<fts_string_t*>( ib_vector_push(tokens, NULL)); - token->f_str = (byte*) strtok_r( - src, FTS_PHRASE_DELIMITER, &state); + token->f_str = static_cast<byte*>( + mem_heap_alloc(heap, result_str.f_len + 1)); + ut_memcpy(token->f_str, result_str.f_str, result_str.f_len); - if (token->f_str) { + token->f_len = result_str.f_len; + token->f_str[token->f_len] = 0; + + if (cache->stopword_info.cached_stopword + && rbt_search(cache->stopword_info.cached_stopword, + &parent, token) != 0 + && result_str.f_n_char >= fts_min_token_size + && result_str.f_n_char <= fts_max_token_size) { /* Add the word to the RB tree so that we can calculate it's frequencey within a document. */ fts_query_add_word_freq(query, token->f_str); - - token->f_len = ut_strlen((char*) token->f_str); } else { ib_vector_pop(tokens); - break; + } + + /* we will start to store all words including stopwords + in the "orig_tokens" vector, but skip any leading words + that are stopwords */ + if (!ib_vector_is_empty(tokens)) { + fts_string_t* orig_token = static_cast<fts_string_t*>( + ib_vector_push(orig_tokens, NULL)); + + orig_token->f_str = token->f_str; + orig_token->f_len = token->f_len; } } num_token = ib_vector_size(tokens); + ut_ad(ib_vector_size(orig_tokens) >= num_token); /* Ignore empty strings. */ if (num_token > 0) { @@ -2362,19 +2590,7 @@ fts_query_phrase_search( fts_ast_oper_t oper = query->oper; que_t* graph = NULL; ulint i; - - /* Create the rb tree for storing the words read form disk. */ - if (!query->inited) { - - /* Since this is the first time, we need to convert - this intersection query into a union query. Otherwise - we will end up with an empty set. */ - if (query->oper == FTS_EXIST) { - query->oper = FTS_NONE; - } - - query->inited = TRUE; - } + dberr_t error; /* Create the vector for storing matching document ids and the positions of the first token of the phrase. */ @@ -2422,10 +2638,16 @@ fts_query_phrase_search( query->matched = query->match_array[i]; } - fts_index_fetch_nodes( + error = fts_index_fetch_nodes( trx, &graph, &query->fts_index_table, token, &fetch); + /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */ + ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS)); + if (error != DB_SUCCESS) { + query->error = error; + } + fts_que_graph_free(graph); graph = NULL; @@ -2438,12 +2660,15 @@ fts_query_phrase_search( /* If any of the token can't be found, no need to continue match */ - if (ib_vector_is_empty(query->match_array[i])) { + if (ib_vector_is_empty(query->match_array[i]) + || query->error != DB_SUCCESS) { goto func_exit; } } - if (num_token == 1 + /* Just a single word, no need to fetch the original + documents to do phrase matching */ + if (ib_vector_size(orig_tokens) == 1 && !ib_vector_is_empty(query->match_array[0])) { fts_match_t* match; ulint n_matched; @@ -2455,8 +2680,11 @@ fts_query_phrase_search( ib_vector_get( query->match_array[0], i)); - fts_query_process_doc_id( - query, match->doc_id, 0); + query->error = fts_query_process_doc_id( + query, match->doc_id, 0); + if (query->error != DB_SUCCESS) { + goto func_exit; + } fts_query_add_word_to_document( query, match->doc_id, token->f_str); @@ -2484,18 +2712,21 @@ fts_query_phrase_search( /* Read the actual text in and search for the phrase. */ if (matched) { - query->error = DB_SUCCESS; + ut_ad(query->error == DB_SUCCESS); query->error = fts_query_search_phrase( - query, tokens); + query, orig_tokens, tokens); } } /* Restore original operation. */ query->oper = oper; + + if (query->error != DB_SUCCESS) { + goto func_exit; + } } func_exit: - free(utf8); mem_heap_free(heap); /* Don't need it anymore. */ @@ -2506,7 +2737,7 @@ func_exit: /*****************************************************************//** Find the word and evaluate. -@return DB_SUCCESS if all went well */ +@return DB_SUCCESS if all go well */ static __attribute__((nonnull, warn_unused_result)) dberr_t fts_query_execute( @@ -2578,7 +2809,7 @@ fts_query_get_token( /*****************************************************************//** Visit every node of the AST. */ static -ulint +dberr_t fts_query_visitor( /*==============*/ fts_ast_oper_t oper, /*!< in: current operator */ @@ -2602,11 +2833,8 @@ fts_query_visitor( token.f_str = node->text.ptr; token.f_len = ut_strlen((char*) token.f_str); - /* "first second third" is treated as first & second - & third. Create the rb tree that will hold the doc ids - of the intersection. */ - if (!query->intersection && query->oper == FTS_EXIST) { - + if (query->oper == FTS_EXIST) { + ut_ad(query->intersection == NULL); query->intersection = rbt_create( sizeof(fts_ranking_t), fts_ranking_doc_id_cmp); } @@ -2621,10 +2849,8 @@ fts_query_visitor( query->collect_positions = FALSE; - /* Make the intesection (rb tree) the current doc id - set and free the old set. */ - if (query->intersection) { - fts_query_free_doc_ids(query->doc_ids); + if (query->oper == FTS_EXIST) { + fts_query_free_doc_ids(query, query->doc_ids); query->doc_ids = query->intersection; query->intersection = NULL; } @@ -2649,6 +2875,10 @@ fts_query_visitor( ut_error; } + if (query->oper == FTS_EXIST) { + query->multi_exist = true; + } + return(query->error); } @@ -2656,7 +2886,7 @@ fts_query_visitor( Process (nested) sub-expression, create a new result set to store the sub-expression result by processing nodes under current sub-expression list. Merge the sub-expression result with that of parent expression list. -@return DB_SUCCESS if all went well */ +@return DB_SUCCESS if all well */ UNIV_INTERN dberr_t fts_ast_visit_sub_exp( @@ -2670,8 +2900,8 @@ fts_ast_visit_sub_exp( ib_rbt_t* parent_doc_ids; ib_rbt_t* subexpr_doc_ids; dberr_t error = DB_SUCCESS; - ibool inited = query->inited; bool will_be_ignored = false; + bool multi_exist; ut_a(node->type == FTS_AST_SUBEXP_LIST); @@ -2691,45 +2921,34 @@ fts_ast_visit_sub_exp( query->doc_ids = rbt_create(sizeof(fts_ranking_t), fts_ranking_doc_id_cmp); - /* Reset the query start flag because the sub-expression result - set is independent of any previous results. The state flag - reset is needed for not making an intersect operation on an empty - set in the first call to fts_query_intersect() for the first term. */ - query->inited = FALSE; + query->total_size += SIZEOF_RBT_CREATE; + multi_exist = query->multi_exist; + query->multi_exist = false; /* Process nodes in current sub-expression and store its result set in query->doc_ids we created above. */ error = fts_ast_visit(FTS_NONE, node->next, visitor, arg, &will_be_ignored); /* Reinstate parent node state and prepare for merge. */ - query->inited = inited; + query->multi_exist = multi_exist; query->oper = cur_oper; subexpr_doc_ids = query->doc_ids; /* Restore current result set. */ query->doc_ids = parent_doc_ids; - if (query->oper == FTS_EXIST && !query->inited) { - ut_a(rbt_empty(query->doc_ids)); - /* Since this is the first time we need to convert this - intersection query into a union query. Otherwise we - will end up with an empty set. */ - query->oper = FTS_NONE; - query->inited = TRUE; - } - /* Merge the sub-expression result with the parent result set. */ if (error == DB_SUCCESS && !rbt_empty(subexpr_doc_ids)) { - fts_merge_doc_ids(query, subexpr_doc_ids); + error = fts_merge_doc_ids(query, subexpr_doc_ids); } if (query->oper == FTS_EXIST) { - query->multi_exist = TRUE; + query->multi_exist = true; } /* Free current result set. Result already merged into parent. */ - fts_query_free_doc_ids(subexpr_doc_ids); + fts_query_free_doc_ids(query, subexpr_doc_ids); return(error); } @@ -2808,9 +3027,10 @@ fts_query_find_doc_id( /*****************************************************************//** Read and filter nodes. -@return fts_node_t instance */ +@return DB_SUCCESS if all go well, +or return DB_FTS_EXCEED_RESULT_CACHE_LIMIT */ static -void +dberr_t fts_query_filter_doc_ids( /*=====================*/ fts_query_t* query, /*!< in: query instance */ @@ -2863,6 +3083,10 @@ fts_query_filter_doc_ids( parent container. */ match->positions = ib_vector_create( heap_alloc, sizeof(ulint), 64); + + query->total_size += sizeof(fts_match_t) + + sizeof(ib_vector_t) + + sizeof(ulint) * 64; } /* Unpack the positions within the document. */ @@ -2888,7 +3112,7 @@ fts_query_filter_doc_ids( /* Add the doc id to the doc freq rb tree, if the doc id doesn't exist it will be created. */ - doc_freq = fts_query_add_doc_freq(doc_freqs, doc_id); + doc_freq = fts_query_add_doc_freq(query, doc_freqs, doc_id); /* Avoid duplicating frequency tally. */ if (doc_freq->freq == 0) { @@ -2904,21 +3128,29 @@ fts_query_filter_doc_ids( /* We simply collect the matching documents and the positions here and match later. */ if (!query->collect_positions) { + /* We ignore error here and will check it later */ fts_query_process_doc_id(query, doc_id, 0); - } - /* Add the word to the document's matched RB tree. */ - fts_query_add_word_to_document(query, doc_id, word); + /* Add the word to the document's matched RB tree. */ + fts_query_add_word_to_document(query, doc_id, word); + } } /* Some sanity checks. */ ut_a(doc_id == node->last_doc_id); + + if (query->total_size > fts_result_cache_limit) { + return(DB_FTS_EXCEED_RESULT_CACHE_LIMIT); + } else { + return(DB_SUCCESS); + } } /*****************************************************************//** -Read the FTS INDEX row. */ +Read the FTS INDEX row. +@return DB_SUCCESS if all go well. */ static -void +dberr_t fts_query_read_node( /*================*/ fts_query_t* query, /*!< in: query instance */ @@ -2932,6 +3164,7 @@ fts_query_read_node( fts_word_freq_t* word_freq; ibool skip = FALSE; byte term[FTS_MAX_WORD_LEN + 1]; + dberr_t error = DB_SUCCESS; ut_a(query->cur_node->type == FTS_AST_TERM || query->cur_node->type == FTS_AST_TEXT); @@ -3005,9 +3238,9 @@ fts_query_read_node( case 4: /* ILIST */ - fts_query_filter_doc_ids( - query, word_freq->word, word_freq, - &node, data, len, FALSE); + error = fts_query_filter_doc_ids( + query, word_freq->word, word_freq, + &node, data, len, FALSE); break; @@ -3021,6 +3254,8 @@ fts_query_read_node( ut_a(i == 5); } + + return error; } /*****************************************************************//** @@ -3047,9 +3282,15 @@ fts_query_index_fetch_nodes( ut_a(dfield_len <= FTS_MAX_WORD_LEN); - fts_query_read_node(query, &key, que_node_get_next(exp)); + /* Note: we pass error out by 'query->error' */ + query->error = fts_query_read_node(query, &key, que_node_get_next(exp)); - return(TRUE); + if (query->error != DB_SUCCESS) { + ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT); + return(FALSE); + } else { + return(TRUE); + } } /*****************************************************************//** @@ -3107,27 +3348,22 @@ fts_query_calculate_ranking( const fts_query_t* query, /*!< in: query state */ fts_ranking_t* ranking) /*!< in: Document to rank */ { - const ib_rbt_node_t* node; + ulint pos = 0; + byte* word = NULL; /* At this stage, ranking->rank should not exceed the 1.0 bound */ ut_ad(ranking->rank <= 1.0 && ranking->rank >= -1.0); + ut_ad(query->word_map->size() == query->word_vector->size()); - for (node = rbt_first(ranking->words); - node; - node = rbt_first(ranking->words)) { - + while (fts_ranking_words_get_next(query, ranking, &pos, &word)) { int ret; - const byte* word; - const byte** wordp; ib_rbt_bound_t parent; double weight; fts_doc_freq_t* doc_freq; fts_word_freq_t* word_freq; - wordp = rbt_value(const byte*, node); - word = *wordp; - + ut_ad(word != NULL); ret = rbt_search(query->word_freqs, &parent, word); /* It must exist. */ @@ -3146,8 +3382,6 @@ fts_query_calculate_ranking( weight = (double) doc_freq->freq * word_freq->idf; ranking->rank += (fts_rank_t) (weight * word_freq->idf); - - ut_free(rbt_remove_node(ranking->words, node)); } } @@ -3157,6 +3391,7 @@ static void fts_query_add_ranking( /*==================*/ + fts_query_t* query, /*!< in: query state */ ib_rbt_t* ranking_tree, /*!< in: ranking tree */ const fts_ranking_t* new_ranking) /*!< in: ranking of a document */ { @@ -3173,6 +3408,9 @@ fts_query_add_ranking( ut_a(ranking->words == NULL); } else { rbt_add_node(ranking_tree, &parent, new_ranking); + + query->total_size += SIZEOF_RBT_NODE_ADD + + sizeof(fts_ranking_t); } } @@ -3213,14 +3451,13 @@ static fts_result_t* fts_query_prepare_result( /*=====================*/ - const fts_query_t* query, /*!< in: Query state */ - fts_result_t* result) /*!< in: result this can contain - data from a previous search on - another FTS index */ + fts_query_t* query, /*!< in: Query state */ + fts_result_t* result) /*!< in: result this can contain + data from a previous search on + another FTS index */ { const ib_rbt_node_t* node; - - ut_a(rbt_size(query->doc_ids) > 0); + bool result_is_null = false; if (result == NULL) { result = static_cast<fts_result_t*>(ut_malloc(sizeof(*result))); @@ -3229,8 +3466,55 @@ fts_query_prepare_result( result->rankings_by_id = rbt_create( sizeof(fts_ranking_t), fts_ranking_doc_id_cmp); + + query->total_size += sizeof(fts_result_t) + SIZEOF_RBT_CREATE; + result_is_null = true; + } + + if (query->flags == FTS_OPT_RANKING) { + fts_word_freq_t* word_freq; + ulint size = ib_vector_size(query->deleted->doc_ids); + fts_update_t* array = + (fts_update_t*) query->deleted->doc_ids->data; + + node = rbt_first(query->word_freqs); + ut_ad(node); + word_freq = rbt_value(fts_word_freq_t, node); + + for (node = rbt_first(word_freq->doc_freqs); + node; + node = rbt_next(word_freq->doc_freqs, node)) { + fts_doc_freq_t* doc_freq; + fts_ranking_t ranking; + + doc_freq = rbt_value(fts_doc_freq_t, node); + + /* Don't put deleted docs into result */ + if (fts_bsearch(array, 0, size, doc_freq->doc_id) + >= 0) { + continue; + } + + ranking.doc_id = doc_freq->doc_id; + ranking.rank = doc_freq->freq * word_freq->idf + * word_freq->idf; + ranking.words = NULL; + + fts_query_add_ranking(query, result->rankings_by_id, + &ranking); + + if (query->total_size > fts_result_cache_limit) { + query->error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT; + fts_query_free_result(result); + return(NULL); + } + } + + return(result); } + ut_a(rbt_size(query->doc_ids) > 0); + for (node = rbt_first(query->doc_ids); node; node = rbt_next(query->doc_ids, node)) { @@ -3245,11 +3529,24 @@ fts_query_prepare_result( // different FTS indexes. /* We don't need these anymore free the resources. */ - ut_a(rbt_empty(ranking->words)); - rbt_free(ranking->words); ranking->words = NULL; - fts_query_add_ranking(result->rankings_by_id, ranking); + if (!result_is_null) { + fts_query_add_ranking(query, result->rankings_by_id, ranking); + + if (query->total_size > fts_result_cache_limit) { + query->error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT; + fts_query_free_result(result); + return(NULL); + } + } + } + + if (result_is_null) { + /* Use doc_ids directly */ + rbt_free(result->rankings_by_id); + result->rankings_by_id = query->doc_ids; + query->doc_ids = NULL; } return(result); @@ -3261,10 +3558,10 @@ static fts_result_t* fts_query_get_result( /*=================*/ - const fts_query_t* query, /*!< in: query instance */ + fts_query_t* query, /*!< in: query instance */ fts_result_t* result) /*!< in: result */ { - if (rbt_size(query->doc_ids) > 0) { + if (rbt_size(query->doc_ids) > 0 || query->flags == FTS_OPT_RANKING) { /* Copy the doc ids to the result. */ result = fts_query_prepare_result(query, result); } else { @@ -3298,7 +3595,7 @@ fts_query_free( } if (query->doc_ids) { - fts_query_free_doc_ids(query->doc_ids); + fts_query_free_doc_ids(query, query->doc_ids); } if (query->word_freqs) { @@ -3327,6 +3624,14 @@ fts_query_free( mem_heap_free(query->heap); } + if (query->word_map) { + delete query->word_map; + } + + if (query->word_vector) { + delete query->word_vector; + } + memset(query, 0, sizeof(*query)); } @@ -3342,12 +3647,13 @@ fts_query_parse( { int error; fts_ast_state_t state; - ibool mode = query->boolean_mode; + bool mode = query->boolean_mode; memset(&state, 0x0, sizeof(state)); /* Setup the scanner to use, this depends on the mode flag. */ state.lexer = fts_lexer_create(mode, query_str, query_len); + state.charset = query->fts_index_table.charset; error = fts_parse(&state); fts_lexer_free(state.lexer); state.lexer = NULL; @@ -3363,6 +3669,112 @@ fts_query_parse( return(state.root); } +/*******************************************************************//** +FTS Query optimization +Set FTS_OPT_RANKING if it is a simple term query */ +static +void +fts_query_can_optimize( +/*===================*/ + fts_query_t* query, /*!< in/out: query instance */ + uint flags) /*!< In: FTS search mode */ +{ + fts_ast_node_t* node = query->root; + + if (flags & FTS_EXPAND) { + return; + } + + /* Check if it has only a term without oper */ + ut_ad(node->type == FTS_AST_LIST); + node = node->list.head; + if (node != NULL && node->type == FTS_AST_TERM && node->next == NULL) { + query->flags = FTS_OPT_RANKING; + } +} + +/*******************************************************************//** +Pre-process the query string +1) make it lower case +2) in boolean mode, if there is '-' or '+' that is immediately proceeded +and followed by valid word, make it a space +@return the processed string */ +static +byte* +fts_query_str_preprocess( +/*=====================*/ + const byte* query_str, /*!< in: FTS query */ + ulint query_len, /*!< in: FTS query string len */ + ulint *result_len, /*!< out: result string length */ + CHARSET_INFO* charset, /*!< in: string charset */ + bool boolean_mode) /*!< in: is boolean mode */ +{ + ulint cur_pos = 0; + ulint str_len; + byte* str_ptr; + bool in_phrase = false; + + /* Convert the query string to lower case before parsing. We own + the ut_malloc'ed result and so remember to free it before return. */ + + str_len = query_len * charset->casedn_multiply + 1; + str_ptr = static_cast<byte*>(ut_malloc(str_len)); + + *result_len = innobase_fts_casedn_str( + charset, const_cast<char*>(reinterpret_cast<const char*>( + query_str)), query_len, + reinterpret_cast<char*>(str_ptr), str_len); + + ut_ad(*result_len < str_len); + + str_ptr[*result_len] = 0; + + /* If it is boolean mode, no need to check for '-/+' */ + if (!boolean_mode) { + return(str_ptr); + } + + /* Otherwise, we travese the string to find any '-/+' that are + immediately proceeded and followed by valid search word. + NOTE: we should not do so for CJK languages, this should + be taken care of in our CJK implementation */ + while (cur_pos < *result_len) { + fts_string_t str; + ulint offset; + ulint cur_len; + + cur_len = innobase_mysql_fts_get_token( + charset, str_ptr + cur_pos, str_ptr + *result_len, + &str, &offset); + + if (cur_len == 0) { + break; + } + + /* Check if we are in a phrase, if so, no need to do + replacement of '-/+'. */ + for (byte* ptr = str_ptr + cur_pos; ptr < str.f_str; ptr++) { + if ((char) (*ptr) == '"' ) { + in_phrase = !in_phrase; + } + } + + /* Find those are not leading '-/+' and also not in a phrase */ + if (cur_pos > 0 && str.f_str - str_ptr - cur_pos == 1 + && !in_phrase) { + char* last_op = reinterpret_cast<char*>( + str_ptr + cur_pos); + + if (*last_op == '-' || *last_op == '+') { + *last_op = ' '; + } + } + + cur_pos += cur_len; + } + + return(str_ptr); +} /*******************************************************************//** FTS Query entry point. @@ -3382,9 +3794,8 @@ fts_query( fts_query_t query; dberr_t error = DB_SUCCESS; byte* lc_query_str; - ulint lc_query_str_len; ulint result_len; - ibool boolean_mode; + bool boolean_mode; trx_t* query_trx; CHARSET_INFO* charset; ulint start_time_ms; @@ -3401,7 +3812,6 @@ fts_query( query.trx = query_trx; query.index = index; - query.inited = FALSE; query.boolean_mode = boolean_mode; query.deleted = fts_doc_ids_create(); query.cur_node = NULL; @@ -3418,6 +3828,9 @@ fts_query( query.fts_index_table.parent = index->table->name; query.fts_index_table.charset = charset; + query.word_map = new word_map_t; + query.word_vector = new word_vector_t; + query.error = DB_SUCCESS; /* Setup the RB tree that will be used to collect per term statistics. */ @@ -3425,6 +3838,8 @@ fts_query( sizeof(fts_word_freq_t), innobase_fts_string_cmp, (void*) charset); + query.total_size += SIZEOF_RBT_CREATE; + query.total_docs = dict_table_get_n_rows(index->table); #ifdef FTS_DOC_STATS_DEBUG @@ -3467,6 +3882,7 @@ fts_query( /* Sort the vector so that we can do a binary search over the ids. */ ib_vector_sort(query.deleted->doc_ids, fts_update_doc_id_cmp); +#if 0 /* Convert the query string to lower case before parsing. We own the ut_malloc'ed result and so remember to free it before return. */ @@ -3481,16 +3897,30 @@ fts_query( lc_query_str[result_len] = 0; +#endif + + lc_query_str = fts_query_str_preprocess( + query_str, query_len, &result_len, charset, boolean_mode); + query.heap = mem_heap_create(128); /* Create the rb tree for the doc id (current) set. */ query.doc_ids = rbt_create( sizeof(fts_ranking_t), fts_ranking_doc_id_cmp); + query.total_size += SIZEOF_RBT_CREATE; + /* Parse the input query string. */ if (fts_query_parse(&query, lc_query_str, result_len)) { fts_ast_node_t* ast = query.root; + /* Optimize query to check if it's a single term */ + fts_query_can_optimize(&query, flags); + + DBUG_EXECUTE_IF("fts_instrument_result_cache_limit", + fts_result_cache_limit = 2048; + ); + /* Traverse the Abstract Syntax Tree (AST) and execute the query. */ query.error = fts_ast_visit( @@ -3500,11 +3930,13 @@ fts_query( /* If query expansion is requested, extend the search with first search pass result */ if (query.error == DB_SUCCESS && (flags & FTS_EXPAND)) { - query.error = fts_expand_query(index, &query); + query.error = fts_expand_query(index, &query); } /* Calculate the inverse document frequency of the terms. */ - fts_query_calculate_idf(&query); + if (query.error == DB_SUCCESS) { + fts_query_calculate_idf(&query); + } /* Copy the result from the query state, so that we can return it to the caller. */ @@ -3530,6 +3962,15 @@ fts_query( (*result)->rankings_by_id ? (int) rbt_size((*result)->rankings_by_id) : -1); + + /* Log memory consumption & result size */ + ib_logf(IB_LOG_LEVEL_INFO, + "Full Search Memory: " + "%lu (bytes), Row: %lu .", + query.total_size, + (*result)->rankings_by_id + ? rbt_size((*result)->rankings_by_id) + : 0); } func_exit: @@ -3608,30 +4049,24 @@ static void fts_print_doc_id( /*=============*/ - ib_rbt_t* doc_ids) /*!< in : tree that stores doc_ids.*/ + fts_query_t* query) /*!< in : tree that stores doc_ids.*/ { const ib_rbt_node_t* node; - const ib_rbt_node_t* node_word; /* Iterate each member of the doc_id set */ - for (node = rbt_first(doc_ids); + for (node = rbt_first(query->doc_ids); node; - node = rbt_next(doc_ids, node)) { + node = rbt_next(query->doc_ids, node)) { fts_ranking_t* ranking; ranking = rbt_value(fts_ranking_t, node); fprintf(stderr, "doc_ids info, doc_id: %ld \n", (ulint) ranking->doc_id); - for (node_word = rbt_first(ranking->words); - node_word; - node_word = rbt_next(ranking->words, node_word)) { - - const byte** value; - - value = rbt_value(const byte*, node_word); - - fprintf(stderr, "doc_ids info, value: %s \n", *value); + ulint pos = 0; + byte* value = NULL; + while (fts_ranking_words_get_next(query, ranking, &pos, &value)) { + fprintf(stderr, "doc_ids info, value: %s \n", value); } } } @@ -3676,8 +4111,9 @@ fts_expand_query( result_doc.charset = index_cache->charset; + query->total_size += SIZEOF_RBT_CREATE; #ifdef UNIV_DEBUG - fts_print_doc_id(query->doc_ids); + fts_print_doc_id(query); #endif for (node = rbt_first(query->doc_ids); @@ -3685,7 +4121,12 @@ fts_expand_query( node = rbt_next(query->doc_ids, node)) { fts_ranking_t* ranking; - const ib_rbt_node_t* node_word; + ulint pos; + byte* word; + ulint prev_token_size; + ulint estimate_size; + + prev_token_size = rbt_size(result_doc.tokens); ranking = rbt_value(fts_ranking_t, node); @@ -3702,16 +4143,15 @@ fts_expand_query( /* Remove words that have already been searched in the first pass */ - for (node_word = rbt_first(ranking->words); - node_word; - node_word = rbt_next(ranking->words, node_word)) { + pos = 0; + word = NULL; + while (fts_ranking_words_get_next(query, ranking, &pos, + &word)) { fts_string_t str; ibool ret; - const byte** strp; - strp = rbt_value(const byte*, node_word); /* FIXME: We are discarding a const qualifier here. */ - str.f_str = (byte*) *strp; + str.f_str = word; str.f_len = ut_strlen((const char*) str.f_str); ret = rbt_delete(result_doc.tokens, &str); @@ -3723,6 +4163,18 @@ fts_expand_query( (ulint) ranking->doc_id); } } + + /* Estimate memory used, see fts_process_token and fts_token_t. + We ignore token size here. */ + estimate_size = (rbt_size(result_doc.tokens) - prev_token_size) + * (SIZEOF_RBT_NODE_ADD + sizeof(fts_token_t) + + sizeof(ib_vector_t) + sizeof(ulint) * 32); + query->total_size += estimate_size; + + if (query->total_size > fts_result_cache_limit) { + error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT; + goto func_exit; + } } /* Search the table the second time with expanded search list */ @@ -3740,6 +4192,7 @@ fts_expand_query( } } +func_exit: fts_doc_free(&result_doc); return(error); @@ -3857,8 +4310,12 @@ fts_phrase_or_proximity_search( if (fts_query_is_in_proximity_range( query, match, &qualified_pos)) { /* If so, mark we find a matching doc */ - fts_query_process_doc_id( + query->error = fts_query_process_doc_id( query, match[0]->doc_id, 0); + if (query->error != DB_SUCCESS) { + matched = FALSE; + goto func_exit; + } matched = TRUE; for (ulint z = 0; z < num_token; z++) { |