/*****************************************************************************

Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2017, 2021, MariaDB Corporation.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA

*****************************************************************************/

/******************************************************************//**
@file include/fts0types.h
Full text search types file

Created 2007-03-27 Sunny Bains
*******************************************************/

#ifndef INNOBASE_FTS0TYPES_H
#define INNOBASE_FTS0TYPES_H

#include "fts0fts.h"
#include "fut0fut.h"
#include "pars0pars.h"
#include "que0types.h"
#include "ut0byte.h"
#include "ut0rbt.h"

/** Types used within FTS. Forward declarations only; fts_node_t is
defined further down in this header, fts_que_t is not defined here. */
struct fts_que_t;
struct fts_node_t;

/** Callbacks used within FTS. */

/** SQL callback type: an alias of pars_user_func_cb_t. */
typedef pars_user_func_cb_t fts_sql_callback;

/** Filter callback type: returns nothing and is called with
(void*, fts_node_t*, void*, ulint len).
NOTE(review): the meaning of the two void* arguments and of len is not
documented in this header; confirm against the callers. */
typedef void (*fts_filter)(void*, fts_node_t*, void*, ulint len);

/** Statistics relevant to a particular document, used during retrieval. */
struct fts_doc_stats_t {
	doc_id_t	doc_id;		/*!< Document id */
	ulint		word_count;	/*!< Total words in the document */
};

/** It's main purpose is to store the SQL prepared statements that
are required to retrieve a document from the database.
*/
struct fts_get_doc_t {
	fts_index_cache_t*
			index_cache;	/*!< The index cache instance */

	/** Parsed sql statement */
	que_t*		get_document_graph;

	fts_cache_t*	cache;		/*!< The parent cache */
};

/** Since we can have multiple FTS indexes on a table, we keep a
per index cache of words etc. */
struct fts_index_cache_t {
	dict_index_t*	index;		/*!< The FTS index instance */

	ib_rbt_t*	words;		/*!< Nodes; indexed by fts_string_t*,
					cells are fts_tokenizer_word_t*.*/

	ib_vector_t*	doc_stats;	/*!< Array of the fts_doc_stats_t
					contained in the memory buffer.
					Must be in sorted order (ascending).
					The ideal choice is an rb tree but
					the rb tree imposes a space overhead
					that we can do without */

	que_t**		ins_graph;	/*!< Insert query graphs */

	que_t**		sel_graph;	/*!< Select query graphs */

	CHARSET_INFO*	charset;	/*!< charset */
};

/** Stop word control information. */
struct fts_stopword_t {
	ulint		status;		/*!< Status of the stopword tree */

	ib_alloc_t*	heap;		/*!< The memory allocator to use */

	ib_rbt_t*	cached_stopword; /*!< This stores all active
					stopwords */

	CHARSET_INFO*	charset;	/*!< charset for stopword */
};

/** The SYNC state of the cache. There is one instance of this struct
associated with each ADD thread. */
struct fts_sync_t {
	trx_t*		trx;		/*!< The transaction used for SYNCing
					the cache to disk */

	dict_table_t*	table;		/*!< Table with FTS index(es) */

	ulint		max_cache_size;	/*!< Max size in bytes of the cache */

	ibool		cache_full;	/*!< flag, when true it indicates that
					we need to sync the cache to disk */

	ulint		lower_index;	/*!< the start index of the doc id
					vector from where to start adding
					documents to the FTS cache */

	ulint		upper_index;	/*!< max index of the doc id vector to
					add to the FTS cache */

	ibool		interrupted;	/*!< TRUE if SYNC was interrupted */

	doc_id_t	min_doc_id;	/*!< The smallest doc id added to the
					cache. It should be equal to
					doc_ids[lower_index] */

	doc_id_t	max_doc_id;	/*!< The doc id at which the cache was
					noted as being full, we use this to
					set the upper_limit field.
					NOTE(review): this struct has no field
					named upper_limit; presumably
					upper_index is meant - confirm */

	time_t		start_time;	/*!< SYNC start time; only used if
					fts_enable_diag_print */

	bool		in_progress;	/*!< flag whether sync is in
					progress.*/

	bool		unlock_cache;	/*!< flag whether unlock cache when
					write fts node */

	/** condition variable for in_progress; used with
	table->fts->cache->lock */
	pthread_cond_t	cond;
};

/** The cache for the FTS system. It is a memory-based inverted index
that new entries are added to, until it grows over the configured maximum
size, at which time its contents are written to the INDEX table. */
struct fts_cache_t {
	/** lock protecting all access to the memory buffer */
	mysql_mutex_t	lock;

	/** cache initialization */
	mysql_mutex_t	init_lock;

	/** protection for deleted_doc_ids */
	mysql_mutex_t	deleted_lock;

	/** protection for DOC_ID */
	mysql_mutex_t	doc_id_lock;

	ib_vector_t*	deleted_doc_ids; /*!< Array of deleted doc ids, each
					element is of type fts_update_t */

	ib_vector_t*	indexes;	/*!< We store the stats and inverted
					index for the individual FTS indexes
					in this vector. Each element is
					an instance of fts_index_cache_t */

	ib_vector_t*	get_docs;	/*!< information required to read
					the document from the table. Each
					element is of type fts_doc_t.
					NOTE(review): the description reads
					like fts_get_doc_t rather than
					fts_doc_t - confirm the element type
					where this vector is created */

	size_t		total_size;	/*!< total size consumed by the ilist
					field of all nodes. SYNC is run
					whenever this gets too big */

	fts_sync_t*	sync;		/*!< sync structure to sync data to
					disk */

	ib_alloc_t*	sync_heap;	/*!< The heap allocator, for indexes
					and deleted_doc_ids, ie. transient
					objects, they are recreated after
					a SYNC is completed */

	ib_alloc_t*	self_heap;	/*!< This heap is the heap out of
					which an instance of the cache itself
					was created. Objects created using
					this heap will last for the lifetime
					of the cache */

	doc_id_t	next_doc_id;	/*!< Next doc id */

	doc_id_t	synced_doc_id;	/*!< Doc ID sync-ed to CONFIG table */

	doc_id_t	first_doc_id;	/*!< first doc id since this table
					was opened */

	ulint		deleted;	/*!< Number of doc ids deleted since
					last optimized. This variable is
					covered by deleted_lock */

	ulint		added;		/*!< Number of doc ids added since last
					optimized. This variable is covered by
					deleted_lock as well */

	fts_stopword_t	stopword_info;	/*!< Cached stopwords for the FTS */

	mem_heap_t*	cache_heap;	/*!< Cache Heap */
};

/** Columns of the FTS auxiliary INDEX table */
struct fts_node_t {
	doc_id_t	first_doc_id;	/*!< First document id in ilist. */

	doc_id_t	last_doc_id;	/*!< Last document id in ilist. */

	byte*		ilist;		/*!< Binary list of documents & word
					positions the token appears in.
					TODO: For now, these are simply
					ut_malloc'd, but if testing shows
					that they waste memory unacceptably, a
					special memory allocator will have
					to be written */

	ulint		doc_count;	/*!< Number of doc ids in ilist */

	ulint		ilist_size;	/*!< Used size of ilist in bytes. */

	ulint		ilist_size_alloc;
					/*!< Allocated size of ilist in
					bytes */

	bool		synced;		/*!< flag whether the node is synced */
};

/** A tokenizer word. Contains information about one word. */
struct fts_tokenizer_word_t {
	fts_string_t	text;		/*!< Token text. */

	ib_vector_t*	nodes;		/*!< Word node ilists, each element is
					of type fts_node_t */
};

/** Word text plus its array of nodes as on disk in FTS index */
struct fts_word_t {
	fts_string_t	text;		/*!< Word value in UTF-8 */

	ib_vector_t*	nodes;		/*!< Nodes read from disk */

	ib_alloc_t*	heap_alloc;	/*!< For handling all allocations */
};

/** Callback for reading and filtering nodes that are read from FTS index */
struct fts_fetch_t {
	void*		read_arg;	/*!< Arg for the sql_callback */

	fts_sql_callback
			read_record;	/*!< Callback for reading index
					record */

	size_t		total_memory;	/*!< Total memory used */
};

/** For horizontally splitting an FTS auxiliary index */
struct fts_index_selector_t {
	ulint		value;		/*!< Character value at which
					to split */

	const char*	suffix;		/*!< FTS aux index suffix */
};

/** This type represents a single document. */
struct fts_doc_t {
	fts_string_t	text;		/*!< document text */

	ibool		found;		/*!< TRUE if the document was found
					successfully in the database */

	ib_rbt_t*	tokens;		/*!< This is filled when the document
					is tokenized. Tokens; indexed by
					fts_string_t*, cells are of type
					fts_token_t* */

	ib_alloc_t*	self_heap;	/*!< An instance of this type is
					allocated from this heap along
					with any objects that have the
					same lifespan, most notably
					the vector of token positions */

	CHARSET_INFO*	charset;	/*!< Document's charset info */

	st_mysql_ftparser*
			parser;		/*!< fts plugin parser */

	ib_rbt_t*	stopwords;	/*!< Stopwords */
};

/** A token and its positions within a document. */
struct fts_token_t {
	fts_string_t	text;		/*!< token text */

	ib_vector_t*	positions;	/*!< an array of the positions the
					token is found in; each item is
					actually an ulint. */
};

/** It's defined in fts/fts0fts.c */
extern const fts_index_selector_t fts_index_selector[];

/******************************************************************//**
Compare two fts_trx_row_t instances doc_ids.
*/
UNIV_INLINE
int
fts_trx_row_doc_id_cmp(
/*===================*/
					/*!< out: < 0 if n1 < n2,
					0 if n1 == n2,
					> 0 if n1 > n2; n1 and n2 are
					presumably the doc ids reached
					through p1 and p2 (the definition
					is not in this header) */
	const void*	p1,		/*!< in: id1 */
	const void*	p2);		/*!< in: id2 */

/******************************************************************//**
Compare two fts_ranking_t instances' doc_ids. */
UNIV_INLINE
int
fts_ranking_doc_id_cmp(
/*===================*/
					/*!< out: < 0 if n1 < n2,
					0 if n1 == n2,
					> 0 if n1 > n2; n1 and n2 are
					presumably the doc ids reached
					through p1 and p2 (the definition
					is not in this header) */
	const void*	p1,		/*!< in: id1 */
	const void*	p2);		/*!< in: id2 */

/******************************************************************//**
Compare two doc_ids. */
UNIV_INLINE
int
fts_doc_id_cmp(
/*==================*/
					/*!< out: < 0 if n1 < n2,
					0 if n1 == n2,
					> 0 if n1 > n2; n1 and n2 are
					presumably the doc ids reached
					through p1 and p2 (the definition
					is not in this header) */
	const void*	p1,		/*!< in: id1 */
	const void*	p2);		/*!< in: id2 */

/******************************************************************//**
Decode and return the integer that was encoded using our VLC scheme.*/
UNIV_INLINE
ulint
fts_decode_vlc(
/*===========*/
					/*!< out: value decoded */
	byte**		ptr);		/*!< in: ptr to decode from, this ptr
					is incremented by the number of bytes
					decoded */

/******************************************************************//**
Duplicate a string. The function returns nothing; the duplicate is
delivered through dst. */
UNIV_INLINE
void
fts_string_dup(
/*===========*/
	fts_string_t*		dst,	/*!< in: dup to here */
	const fts_string_t*	src,	/*!< in: src string */
	mem_heap_t*		heap);	/*!< in: heap to use */

/******************************************************************//**
Return length of val if it were encoded using our VLC scheme. */
UNIV_INLINE
ulint
fts_get_encoded_len(
/*================*/
					/*!< out: length of value
					encoded, in bytes */
	ulint		val);		/*!< in: value to encode */

/******************************************************************//**
Encode an integer using our VLC scheme and return the length in bytes.
*/
UNIV_INLINE
ulint
fts_encode_int(
/*===========*/
					/*!< out: length of value
					encoded, in bytes */
	ulint		val,		/*!< in: value to encode */
	byte*		buf);		/*!< in: buffer, must have
					enough space; going by its
					description, fts_get_encoded_len(val)
					gives the number of bytes needed */

/******************************************************************//**
Get the selected FTS aux INDEX suffix. */
UNIV_INLINE
const char*
fts_get_suffix(
/*===========*/
					/*!< out: the suffix string.
					NOTE(review): presumably taken from
					fts_index_selector[selected] -
					confirm in the definition */
	ulint		selected);	/*!< in: selected index */

/** Select the FTS auxiliary index for the given character.
@param[in]	cs	charset
@param[in]	str	string
@param[in]	len	string length in bytes
@return the index to use for the string */
UNIV_INLINE
ulint
fts_select_index(
	const CHARSET_INFO*	cs,
	const byte*		str,
	ulint			len);

/* NOTE(review): the .ic files presumably hold the definitions of the
UNIV_INLINE functions declared above - confirm. */
#include "fts0types.ic"
#include "fts0vlc.ic"

#endif /* INNOBASE_FTS0TYPES_H */