diff options
Diffstat (limited to 'storage/innobase/include/fts0types.ic')
-rw-r--r-- | storage/innobase/include/fts0types.ic | 427 |
1 files changed, 427 insertions, 0 deletions
diff --git a/storage/innobase/include/fts0types.ic b/storage/innobase/include/fts0types.ic new file mode 100644 index 00000000000..2734a331a86 --- /dev/null +++ b/storage/innobase/include/fts0types.ic @@ -0,0 +1,427 @@ +/***************************************************************************** + +Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fts0types.ic +Full text search types. + +Created 2007-03-27 Sunny Bains +*******************************************************/ + +#ifndef INNOBASE_FTS0TYPES_IC +#define INNOBASE_FTS0TYPES_IC + +#include <ctype.h> + +#include "rem0cmp.h" +#include "ha_prototypes.h" + +extern const ulint UTF8_ERROR; + +/* Determine if a UTF-8 continuation byte is valid. */ +#define fts_utf8_is_valid(b) (((b) & 0xC0) == 0x80) + +/******************************************************************//** +Compare two fts_trx_table_t instances. +@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ +UNIV_INLINE +int +fts_trx_table_cmp( +/*==============*/ + const void* p1, /*!< in: id1 */ + const void* p2) /*!< in: id2 */ +{ + const dict_table_t* table1 = (*(const fts_trx_table_t**) p1)->table; + const dict_table_t* table2 = (*(const fts_trx_table_t**) p2)->table; + + return((table1->id > table2->id) + ? 1 + : (table1->id == table2->id) + ? 0 + : -1); +} + +/******************************************************************//** +Compare a table id with a fts_trx_table_t table id. +@return < 0 if n1 < n2, 0 if n1 == n2,> 0 if n1 > n2 */ +UNIV_INLINE +int +fts_trx_table_id_cmp( +/*=================*/ + const void* p1, /*!< in: id1 */ + const void* p2) /*!< in: id2 */ +{ + const ullint* table_id = (const ullint*) p1; + const dict_table_t* table2 = (*(const fts_trx_table_t**) p2)->table; + + return((*table_id > table2->id) + ? 1 + : (*table_id == table2->id) + ? 0 + : -1); +} + +/******************************************************************//** +Duplicate an UTF-8 string. +@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ +UNIV_INLINE +void +fts_utf8_string_dup( +/*================*/ + fts_string_t* dst, /*!< in: dup to here */ + const fts_string_t* src, /*!< in: src string */ + mem_heap_t* heap) /*!< in: heap to use */ +{ + dst->f_str = (byte*) mem_heap_dup(heap, src->f_str, src->f_len + 1); + + dst->f_len = src->f_len; + dst->f_str[src->f_len] = 0; + dst->f_n_char = src->f_n_char; +} + +/******************************************************************//** +Compare two fts_trx_row_t doc_ids. +@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ +UNIV_INLINE +int +fts_trx_row_doc_id_cmp( +/*===================*/ + const void* p1, /*!< in: id1 */ + const void* p2) /*!< in: id2 */ +{ + const fts_trx_row_t* tr1 = (const fts_trx_row_t*) p1; + const fts_trx_row_t* tr2 = (const fts_trx_row_t*) p2; + + return((int)(tr1->doc_id - tr2->doc_id)); +} + +/******************************************************************//** +Compare two fts_ranking_t doc_ids. +@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ +UNIV_INLINE +int +fts_ranking_doc_id_cmp( +/*===================*/ + const void* p1, /*!< in: id1 */ + const void* p2) /*!< in: id2 */ +{ + const fts_ranking_t* rk1 = (const fts_ranking_t*) p1; + const fts_ranking_t* rk2 = (const fts_ranking_t*) p2; + + return((int)(rk1->doc_id - rk2->doc_id)); +} + +/******************************************************************//** +Compare two fts_update_t doc_ids. +@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ +UNIV_INLINE +int +fts_update_doc_id_cmp( +/*==================*/ + const void* p1, /*!< in: id1 */ + const void* p2) /*!< in: id2 */ +{ + const fts_update_t* up1 = (const fts_update_t*) p1; + const fts_update_t* up2 = (const fts_update_t*) p2; + + return((int)(up1->doc_id - up2->doc_id)); +} + + +/******************************************************************//** +Lowercase an UTF-8 string. */ +UNIV_INLINE +void +fts_utf8_tolower( +/*=============*/ + fts_string_t* str) /*!< in: string */ +{ + innobase_casedn_str((char*) str->f_str); +} + +/******************************************************************//** +Compare two UTF-8 strings. +@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ +UNIV_INLINE +int +fts_utf8_string_cmp( +/*================*/ + const void* p1, /*!< in: key */ + const void* p2) /*!< in: node */ +{ + const fts_string_t* s1 = (const fts_string_t*) p1; + const fts_string_t* s2 = (const fts_string_t*) p2; + + return(cmp_data_data_slow_varchar( + s1->f_str, s1->f_len, s2->f_str, s2->f_len)); +} + +/******************************************************************//** +Compare two UTF-8 strings, and return match (0) if +passed in "key" value equals or is the prefix of the "node" value. +@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ +UNIV_INLINE +int +fts_utf8_string_cmp_prefix( +/*=======================*/ + const void* p1, /*!< in: key */ + const void* p2) /*!< in: node */ +{ + int result; + ulint len; + + const fts_string_t* s1 = (const fts_string_t*) p1; + const fts_string_t* s2 = (const fts_string_t*) p2; + + len = ut_min(s1->f_len, s2->f_len); + + result = cmp_data_data_slow_varchar(s1->f_str, len, s2->f_str, len); + + if (result) { + return(result); + } + + if (s1->f_len > s2->f_len) { + return(1); + } + + return(0); +} + +/******************************************************************//** +Decode a UTF-8 character. + +http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf: + + Scalar Value 1st Byte 2nd Byte 3rd Byte 4th Byte +00000000 0xxxxxxx 0xxxxxxx +00000yyy yyxxxxxx 110yyyyy 10xxxxxx +zzzzyyyy yyxxxxxx 1110zzzz 10yyyyyy 10xxxxxx +000uuuzz zzzzyyyy yyxxxxxx 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx + +This function decodes UTF-8 sequences up to 6 bytes (31 bits). + +On error *ptr will point to the first byte that was not correctly +decoded. This will hopefully help in resyncing the input. +@return UTF8_ERROR if *ptr did not point to a valid +UTF-8 sequence, or the Unicode code point. */ +UNIV_INLINE +ulint +fts_utf8_decode( +/*============*/ + const byte** ptr) /*!< in/out: pointer to + UTF-8 string. The + pointer is advanced to + the start of the next + character. */ +{ + const byte* p = *ptr; + ulint ch = *p++; +#ifdef UNIV_DEBUG + ulint min_ch; +#endif /* UNIV_DEBUG */ + + if (UNIV_LIKELY(ch < 0x80)) { + /* 0xxxxxxx */ + } else if (UNIV_UNLIKELY(ch < 0xC0)) { + /* A continuation byte cannot start a code. */ + goto err_exit; + } else if (ch < 0xE0) { + /* 110yyyyy 10xxxxxx */ + ch &= 0x1F; + ut_d(min_ch = 0x80); + goto get1; + } else if (ch < 0xF0) { + /* 1110zzzz 10yyyyyy 10xxxxxx */ + ch &= 0x0F; + ut_d(min_ch = 0x800); + goto get2; + } else if (ch < 0xF8) { + /* 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx */ + ch &= 0x07; + ut_d(min_ch = 0x10000); + goto get3; + } else if (ch < 0xFC) { + /* 111110tt 10uuuuuu 10zzzzzz 10yyyyyy 10xxxxxx */ + ch &= 0x03; + ut_d(min_ch = 0x200000); + goto get4; + } else if (ch < 0xFE) { + /* 1111110s 10tttttt 10uuuuuu 10zzzzzz 10yyyyyy 10xxxxxx */ + ut_d(min_ch = 0x4000000); + if (!fts_utf8_is_valid(*p)) { + goto err_exit; + } + ch <<= 6; + ch |= (*p++) & 0x3F; +get4: + if (!fts_utf8_is_valid(*p)) { + goto err_exit; + } + ch <<= 6; + ch |= (*p++) & 0x3F; +get3: + if (!fts_utf8_is_valid(*p)) { + goto err_exit; + } + ch <<= 6; + ch |= (*p++) & 0x3F; +get2: + if (!fts_utf8_is_valid(*p)) { + goto err_exit; + } + ch <<= 6; + ch |= (*p++) & 0x3F; +get1: + if (!fts_utf8_is_valid(*p)) { + goto err_exit; + } + ch <<= 6; + ch |= (*p++) & 0x3F; + + /* The following is needed in the 6-byte case + when ulint is wider than 32 bits. */ + ch &= 0xFFFFFFFF; + + /* The code positions U+D800 to U+DFFF (UTF-16 surrogate pairs) + and U+FFFE and U+FFFF cannot occur in valid UTF-8. */ + + if ( (ch >= 0xD800 && ch <= 0xDFFF) +#ifdef UNIV_DEBUG + || ch < min_ch +#endif /* UNIV_DEBUG */ + || ch == 0xFFFE || ch == 0xFFFF) { + + ch = UTF8_ERROR; + } + } else { +err_exit: + ch = UTF8_ERROR; + } + + *ptr = p; + + return(ch); +} + +/******************************************************************//** +Get the first character's code position for FTS index partition */ +extern +ulint +innobase_strnxfrm( +/*==============*/ + const CHARSET_INFO* cs, /*!< in: Character set */ + const uchar* p2, /*!< in: string */ + const ulint len2); /*!< in: string length */ + +/******************************************************************//** +Select the FTS auxiliary index for the given character. +@return the index to use for the string */ +UNIV_INLINE +ulint +fts_select_index( +/*=============*/ + const CHARSET_INFO* cs, /*!< in: Charset */ + const byte* str, /*!< in: string */ + ulint len) /*!< in: string length */ +{ + ulint selected = 0; + ulint value = innobase_strnxfrm(cs, str, len); + + while (fts_index_selector[selected].value != 0) { + + if (fts_index_selector[selected].value == value) { + + return(selected); + + } else if (fts_index_selector[selected].value > value) { + + return(selected > 0 ? selected - 1 : 0); + } + + ++selected; + } + + ut_ad(selected > 1); + + return(selected - 1); +} + +/******************************************************************//** +Select the next FTS auxiliary index for the given character. +@return the next index to use for character */ +UNIV_INLINE +ulint +fts_select_next_index( +/*==================*/ + const CHARSET_INFO* cs, /*!< in: Charset */ + const byte* str, /*!< in: string */ + ulint len) /*!< in: string length */ +{ + ulint selected = 0; + ulint value = innobase_strnxfrm(cs, str, len); + + while (fts_index_selector[selected].value != 0) { + + if (fts_index_selector[selected].value == value) { + + return(selected + 1); + + } else if (fts_index_selector[selected].value > value) { + + return(selected); + } + + ++selected; + } + + ut_ad(selected > 0); + + return((ulint) selected); +} + +/******************************************************************//** +Return the selected FTS aux index suffix. */ +UNIV_INLINE +const char* +fts_get_suffix( +/*===========*/ + ulint selected) /*!< in: selected index */ +{ + return(fts_index_selector[selected].suffix); +} + +/******************************************************************//** +Get the number of index selectors. +@return The number of selectors */ +UNIV_INLINE +ulint +fts_get_n_selectors(void) +/*=====================*/ +{ + ulint i = 0; + + // FIXME: This is a hack + while (fts_index_selector[i].value != 0) { + ++i; + } + + return(i); +} + +#endif /* INNOBASE_FTS0TYPES_IC */ |