summaryrefslogtreecommitdiff
path: root/storage/innobase/include/fts0types.h
diff options
context:
space:
mode:
Diffstat (limited to 'storage/innobase/include/fts0types.h')
-rw-r--r--storage/innobase/include/fts0types.h481
1 files changed, 481 insertions, 0 deletions
diff --git a/storage/innobase/include/fts0types.h b/storage/innobase/include/fts0types.h
new file mode 100644
index 00000000000..5b28f2c9473
--- /dev/null
+++ b/storage/innobase/include/fts0types.h
@@ -0,0 +1,481 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0types.h
+Full text search types file
+
+Created 2007-03-27 Sunny Bains
+*******************************************************/
+
+#ifndef INNOBASE_FTS0TYPES_H
+#define INNOBASE_FTS0TYPES_H
+
+#include "que0types.h"
+#include "ut0byte.h"
+#include "fut0fut.h"
+#include "ut0rbt.h"
+#include "fts0fts.h"
+
+/** Types (aliases) used within FTS. */
+typedef struct fts_que_struct fts_que_t;
+typedef struct fts_node_struct fts_node_t;
+typedef struct fts_word_struct fts_word_t;
+typedef struct fts_fetch_struct fts_fetch_t;
+typedef struct fts_update_struct fts_update_t;
+typedef struct fts_get_doc_struct fts_get_doc_t;
+typedef struct fts_utf8_str_struct fts_utf8_str_t;
+typedef struct fts_doc_stats_struct fts_doc_stats_t;
+typedef struct fts_tokenizer_word_struct fts_tokenizer_word_t;
+typedef struct fts_index_selector_struct fts_index_selector_t;
+
+/** Callbacks used within FTS. */
+typedef pars_user_func_cb_t fts_sql_callback;
+typedef void (*fts_filter)(void*, fts_node_t*, void*, ulint len);
+
+/** Statistics relevant to a particular document, used during retrieval. */
+struct fts_doc_stats_struct {
+ doc_id_t doc_id; /*!< Document id */
+ ulint word_count; /*!< Total words in the document */
+};
+
+/** It's main purpose is to store the SQL prepared statements that
+are required to retrieve a document from the database. */
+struct fts_get_doc_struct {
+ fts_index_cache_t*
+ index_cache; /*!< The index cache instance */
+
+ /*!< Parsed sql statement */
+ que_t* get_document_graph;
+};
+
+/** Since we can have multiple FTS indexes on a table, we keep a
+per index cache of words etc. */
+struct fts_index_cache_struct {
+ dict_index_t* index; /*!< The FTS index instance */
+
+ ib_rbt_t* words; /*!< Nodes; indexed by fts_string_t*,
+ cells are fts_tokenizer_word_t*.*/
+
+ ib_vector_t* doc_stats; /*!< Array of the fts_doc_stats_t
+ contained in the memory buffer.
+ Must be in sorted order (ascending).
+ The ideal choice is an rb tree but
+ the rb tree imposes a space overhead
+ that we can do without */
+
+ que_t** ins_graph; /*!< Insert query graphs */
+
+ que_t** sel_graph; /*!< Select query graphs */
+ CHARSET_INFO* charset; /*!< charset */
+};
+
+/** For supporting the tracking of updates on multiple FTS indexes we need
+to track which FTS indexes need to be updated. For INSERT and DELETE we
+update all fts indexes. */
+struct fts_update_struct {
+ doc_id_t doc_id; /*!< The doc id affected */
+
+ ib_vector_t* fts_indexes; /*!< The FTS indexes that need to be
+ updated. A NULL value means all
+ indexes need to be updated. This
+ vector is not allocated on the heap
+ and so must be freed explicitly,
+ when we are done with it */
+};
+
+/** Stop word control infotmation. */
+struct fts_stopword_struct {
+ ulint status; /*!< Status of the stopword tree */
+ ib_alloc_t* heap; /*!< The memory allocator to use */
+ ib_rbt_t* cached_stopword;/*!< This stores all active stopwords */
+ CHARSET_INFO* charset; /*!< charset for stopword */
+};
+
+/** The SYNC state of the cache. There is one instance of this struct
+associated with each ADD thread. */
+struct fts_sync_struct {
+ trx_t* trx; /*!< The transaction used for SYNCing
+ the cache to disk */
+ dict_table_t* table; /*!< Table with FTS index(es) */
+ ulint max_cache_size; /*!< Max size in bytes of the cache */
+ ibool cache_full; /*!< flag, when true it indicates that
+ we need to sync the cache to disk */
+ ulint lower_index; /*!< the start index of the doc id
+ vector from where to start adding
+ documents to the FTS cache */
+ ulint upper_index; /*!< max index of the doc id vector to
+ add to the FTS cache */
+ ibool interrupted; /*!< TRUE if SYNC was interrupted */
+ doc_id_t min_doc_id; /*!< The smallest doc id added to the
+ cache. It should equal to
+ doc_ids[lower_index] */
+ doc_id_t max_doc_id; /*!< The doc id at which the cache was
+ noted as being full, we use this to
+ set the upper_limit field */
+ ib_time_t start_time; /*!< SYNC start time */
+};
+
+typedef struct fts_sync_struct fts_sync_t;
+
+/** The cache for the FTS system. It is a memory-based inverted index
+that new entries are added to, until it grows over the configured maximum
+size, at which time its contents are written to the INDEX table. */
+struct fts_cache_struct {
+ rw_lock_t lock; /*!< lock protecting all access to the
+ memory buffer. FIXME: this needs to
+ be our new upgrade-capable rw-lock */
+
+ rw_lock_t init_lock; /*!< lock used for the cache
+ intialization, it has different
+ SYNC level as above cache lock */
+
+ mutex_t optimize_lock; /*!< Lock for OPTIMIZE */
+
+ mutex_t deleted_lock; /*!< Lock covering deleted_doc_ids */
+
+ mutex_t doc_id_lock; /*!< Lock covering Doc ID */
+
+ ib_vector_t* deleted_doc_ids;/*!< Array of deleted doc ids, each
+ element is of type fts_update_t */
+
+ ib_vector_t* indexes; /*!< We store the stats and inverted
+ index for the individual FTS indexes
+ in this vector. Each element is
+ an instance of fts_index_cache_t */
+
+ ib_vector_t* get_docs; /*!< information required to read
+ the document from the table. Each
+ element is of type fts_doc_t */
+
+ ulint total_size; /*!< total size consumed by the ilist
+ field of all nodes. SYNC is run
+ whenever this gets too big */
+ fts_sync_t* sync; /*!< sync structure to sync data to
+ disk */
+ ib_alloc_t* sync_heap; /*!< The heap allocator, for indexes
+ and deleted_doc_ids, ie. transient
+ objects, they are recreated after
+ a SYNC is completed */
+
+
+ ib_alloc_t* self_heap; /*!< This heap is the heap out of
+ which an instance of the cache itself
+ was created. Objects created using
+ this heap will last for the lifetime
+ of the cache */
+
+ doc_id_t next_doc_id; /*!< Next doc id */
+
+ doc_id_t synced_doc_id; /*!< Doc ID sync-ed to CONFIG table */
+
+ doc_id_t first_doc_id; /*!< first doc id since this table
+ was opened */
+
+ ulint deleted; /*!< Number of doc ids deleted since
+ last optimized. This variable is
+ covered by deleted_lock */
+
+ ulint added; /*!< Number of doc ids added since last
+ optimized. This variable is covered by
+ the deleted lock */
+
+ fts_stopword_t stopword_info; /*!< Cached stopwords for the FTS */
+ mem_heap_t* cache_heap; /*!< Cache Heap */
+};
+
+/** Columns of the FTS auxiliary INDEX table */
+struct fts_node_struct {
+ doc_id_t first_doc_id; /*!< First document id in ilist. */
+
+ doc_id_t last_doc_id; /*!< Last document id in ilist. */
+
+ byte* ilist; /*!< Binary list of documents & word
+ positions the token appears in.
+ TODO: For now, these are simply
+ ut_malloc'd, but if testing shows
+ that they waste memory unacceptably, a
+ special memory allocator will have
+ to be written */
+
+ ulint doc_count; /*!< Number of doc ids in ilist */
+
+ ulint ilist_size; /*!< Used size of ilist in bytes. */
+
+ ulint ilist_size_alloc;
+ /*!< Allocated size of ilist in
+ bytes */
+};
+
+/** A tokenizer word. Contains information about one word. */
+struct fts_tokenizer_word_struct {
+ fts_string_t text; /*!< Token text. */
+
+ ib_vector_t* nodes; /*!< Word node ilists, each element is
+ of type fts_node_t */
+};
+
+/** Word text plus it's array of nodes as on disk in FTS index */
+struct fts_word_struct {
+ fts_string_t text; /*!< Word value in UTF-8 */
+ ib_vector_t* nodes; /*!< Nodes read from disk */
+
+ ib_alloc_t* heap_alloc; /*!< For handling all allocations */
+};
+
+/** Callback for reading and filtering nodes that are read from FTS index */
+struct fts_fetch_struct {
+ void* read_arg; /*!< Arg for the sql_callback */
+
+ fts_sql_callback
+ read_record; /*!< Callback for reading index
+ record */
+};
+
+/** For horizontally splitting an FTS auxiliary index */
+struct fts_index_selector_struct {
+ ulint value; /*!< Character value at which
+ to split */
+
+ const char* suffix; /*!< FTS aux index suffix */
+};
+
+/** This type represents a single document. */
+struct fts_doc_struct {
+ fts_string_t text; /*!< document text */
+
+ ibool found; /*!< TRUE if the document was found
+ successfully in the database */
+
+ ib_rbt_t* tokens; /*!< This is filled when the document
+ is tokenized. Tokens; indexed by
+ fts_string_t*, cells are of type
+ fts_token_t* */
+
+ ib_alloc_t* self_heap; /*!< An instance of this type is
+ allocated from this heap along
+ with any objects that have the
+ same lifespan, most notably
+ the vector of token positions */
+ CHARSET_INFO* charset; /*!< Document's charset info */
+};
+
+/** A token and its positions within a document. */
+struct fts_token_struct {
+ fts_string_t text; /*!< token text */
+
+ ib_vector_t* positions; /*!< an array of the positions the
+ token is found in; each item is
+ actually an ulint. */
+};
+
+/** It's defined in fts/fts0fts.c */
+extern const fts_index_selector_t fts_index_selector[];
+
+/******************************************************************//**
+Compare two UTF-8 strings. */
+UNIV_INLINE
+int
+fts_utf8_string_cmp(
+/*================*/
+ /*!< out:
+ < 0 if n1 < n2,
+ 0 if n1 == n2,
+ > 0 if n1 > n2 */
+ const void* p1, /*!< in: key */
+ const void* p2); /*!< in: node */
+
+/******************************************************************//**
+Compare two UTF-8 strings, and return match (0) if
+passed in "key" value equals or is the prefix of the "node" value. */
+UNIV_INLINE
+int
+fts_utf8_string_cmp_prefix(
+/*=======================*/
+ /*!< out:
+ < 0 if n1 < n2,
+ 0 if n1 == n2,
+ > 0 if n1 > n2 */
+ const void* p1, /*!< in: key */
+ const void* p2); /*!< in: node */
+
+/******************************************************************//**
+Compare two fts_trx_row_t instances doc_ids. */
+UNIV_INLINE
+int
+fts_trx_row_doc_id_cmp(
+/*===================*/
+ /*!< out:
+ < 0 if n1 < n2,
+ 0 if n1 == n2,
+ > 0 if n1 > n2 */
+ const void* p1, /*!< in: id1 */
+ const void* p2); /*!< in: id2 */
+
+/******************************************************************//**
+Compare two fts_ranking_t instances doc_ids. */
+UNIV_INLINE
+int
+fts_ranking_doc_id_cmp(
+/*===================*/
+ /*!< out:
+ < 0 if n1 < n2,
+ 0 if n1 == n2,
+ > 0 if n1 > n2 */
+ const void* p1, /*!< in: id1 */
+ const void* p2); /*!< in: id2 */
+
+/******************************************************************//**
+Compare two fts_update_t instances doc_ids. */
+UNIV_INLINE
+int
+fts_update_doc_id_cmp(
+/*==================*/
+ /*!< out:
+ < 0 if n1 < n2,
+ 0 if n1 == n2,
+ > 0 if n1 > n2 */
+ const void* p1, /*!< in: id1 */
+ const void* p2); /*!< in: id2 */
+
+/******************************************************************//**
+Decode and return the integer that was encoded using our VLC scheme.*/
+UNIV_INLINE
+ulint
+fts_decode_vlc(
+/*===========*/
+ /*!< out: value decoded */
+ byte** ptr); /*!< in: ptr to decode from, this ptr is
+ incremented by the number of bytes decoded */
+
+/******************************************************************//**
+Duplicate an UTF-8 string. */
+UNIV_INLINE
+void
+fts_utf8_string_dup(
+/*================*/
+ /*!< out:
+ < 0 if n1 < n2,
+ 0 if n1 == n2,
+ > 0 if n1 > n2 */
+ fts_string_t* dst, /*!< in: dup to here */
+ const fts_string_t* src, /*!< in: src string */
+ mem_heap_t* heap); /*!< in: heap to use */
+
+/******************************************************************//**
+Return length of val if it were encoded using our VLC scheme. */
+UNIV_INLINE
+ulint
+fts_get_encoded_len(
+/*================*/
+ /*!< out: length of value
+ encoded, in bytes */
+ ulint val); /*!< in: value to encode */
+
+/******************************************************************//**
+Encode an integer using our VLC scheme and return the length in bytes. */
+UNIV_INLINE
+ulint
+fts_encode_int(
+/*===========*/
+ /*!< out: length of value
+ encoded, in bytes */
+ ulint val, /*!< in: value to encode */
+ byte* buf); /*!< in: buffer, must have
+ enough space */
+
+/******************************************************************//**
+Decode a UTF-8 character.
+
+http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf:
+
+ Scalar Value 1st Byte 2nd Byte 3rd Byte 4th Byte
+00000000 0xxxxxxx 0xxxxxxx
+00000yyy yyxxxxxx 110yyyyy 10xxxxxx
+zzzzyyyy yyxxxxxx 1110zzzz 10yyyyyy 10xxxxxx
+000uuuzz zzzzyyyy yyxxxxxx 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx
+
+This function decodes UTF-8 sequences up to 6 bytes (31 bits).
+
+On error *ptr will point to the first byte that was not correctly
+decoded. This will hopefully help in resyncing the input. */
+UNIV_INLINE
+ulint
+fts_utf8_decode(
+/*============*/
+ /*!< out: UTF8_ERROR if *ptr
+ did not point to a valid
+ UTF-8 sequence, or the
+ Unicode code point. */
+ const byte** ptr); /*!< in/out: pointer to
+ UTF-8 string. The
+ pointer is advanced to
+ the start of the next
+ character. */
+
+/******************************************************************//**
+Lowercase an UTF-8 string. */
+UNIV_INLINE
+void
+fts_utf8_tolower(
+/*=============*/
+ fts_string_t* str); /*!< in: string */
+
+/******************************************************************//**
+Get the selected FTS aux INDEX suffix. */
+UNIV_INLINE
+const char*
+fts_get_suffix(
+/*===========*/
+ ulint selected); /*!< in: selected index */
+
+/********************************************************************
+Get the number of index selectors. */
+UNIV_INLINE
+ulint
+fts_get_n_selectors(void);
+/*=====================*/
+
+/******************************************************************//**
+Select the FTS auxiliary index for the given string.
+@return the index to use for the string */
+UNIV_INLINE
+ulint
+fts_select_index(
+/*=============*/
+ const CHARSET_INFO* cs, /*!< Charset */
+ const byte* str, /*!< in: word string */
+ ulint len); /*!< in: string length */
+
+/********************************************************************
+Select the next FTS auxiliary index for the given character.
+@return the next index to use for character */
+UNIV_INLINE
+ulint
+fts_select_next_index(
+/*==================*/
+ const CHARSET_INFO* cs, /*!< Charset */
+ const byte* str, /*!< in: string */
+ ulint len); /*!< in: string length */
+
+#ifndef UNIV_NONINL
+#include "fts0types.ic"
+#include "fts0vlc.ic"
+#endif
+
+#endif /* INNOBASE_FTS0TYPES_H */