diff options
Diffstat (limited to 'storage/innobase/include/fts0types.ic')
-rw-r--r-- | storage/innobase/include/fts0types.ic | 319 |
1 files changed, 82 insertions, 237 deletions
diff --git a/storage/innobase/include/fts0types.ic b/storage/innobase/include/fts0types.ic index 5e29cf6d8c5..e388d6257f6 100644 --- a/storage/innobase/include/fts0types.ic +++ b/storage/innobase/include/fts0types.ic @@ -1,6 +1,7 @@ /***************************************************************************** -Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2007, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -26,23 +27,13 @@ Created 2007-03-27 Sunny Bains #ifndef INNOBASE_FTS0TYPES_IC #define INNOBASE_FTS0TYPES_IC -#include <ctype.h> - -#include "rem0cmp.h" -#include "ha_prototypes.h" - -extern const ulint UTF8_ERROR; - -/* Determine if a UTF-8 continuation byte is valid. */ -#define fts_utf8_is_valid(b) (((b) & 0xC0) == 0x80) - /******************************************************************//** -Duplicate an UTF-8 string. +Duplicate a string. @return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ UNIV_INLINE void -fts_utf8_string_dup( -/*================*/ +fts_string_dup( +/*===========*/ fts_string_t* dst, /*!< in: dup to here */ const fts_string_t* src, /*!< in: src string */ mem_heap_t* heap) /*!< in: heap to use */ @@ -103,183 +94,6 @@ fts_update_doc_id_cmp( return((int)(up1->doc_id - up2->doc_id)); } - -/******************************************************************//** -Lowercase an UTF-8 string. */ -UNIV_INLINE -void -fts_utf8_tolower( -/*=============*/ - fts_string_t* str) /*!< in: string */ -{ - innobase_casedn_str((char*) str->f_str); -} - -/******************************************************************//** -Compare two UTF-8 strings. -@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ -UNIV_INLINE -int -fts_utf8_string_cmp( -/*================*/ - const void* p1, /*!< in: key */ - const void* p2) /*!< in: node */ -{ - const fts_string_t* s1 = (const fts_string_t*) p1; - const fts_string_t* s2 = (const fts_string_t*) p2; - - return(cmp_data_data_slow_varchar( - s1->f_str, s1->f_len, s2->f_str, s2->f_len)); -} - -/******************************************************************//** -Compare two UTF-8 strings, and return match (0) if -passed in "key" value equals or is the prefix of the "node" value. -@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ -UNIV_INLINE -int -fts_utf8_string_cmp_prefix( -/*=======================*/ - const void* p1, /*!< in: key */ - const void* p2) /*!< in: node */ -{ - int result; - ulint len; - - const fts_string_t* s1 = (const fts_string_t*) p1; - const fts_string_t* s2 = (const fts_string_t*) p2; - - len = ut_min(s1->f_len, s2->f_len); - - result = cmp_data_data_slow_varchar(s1->f_str, len, s2->f_str, len); - - if (result) { - return(result); - } - - if (s1->f_len > s2->f_len) { - return(1); - } - - return(0); -} - -/******************************************************************//** -Decode a UTF-8 character. - -http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf: - - Scalar Value 1st Byte 2nd Byte 3rd Byte 4th Byte -00000000 0xxxxxxx 0xxxxxxx -00000yyy yyxxxxxx 110yyyyy 10xxxxxx -zzzzyyyy yyxxxxxx 1110zzzz 10yyyyyy 10xxxxxx -000uuuzz zzzzyyyy yyxxxxxx 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx - -This function decodes UTF-8 sequences up to 6 bytes (31 bits). - -On error *ptr will point to the first byte that was not correctly -decoded. This will hopefully help in resyncing the input. -@return UTF8_ERROR if *ptr did not point to a valid -UTF-8 sequence, or the Unicode code point. */ -UNIV_INLINE -ulint -fts_utf8_decode( -/*============*/ - const byte** ptr) /*!< in/out: pointer to - UTF-8 string. The - pointer is advanced to - the start of the next - character. */ -{ - const byte* p = *ptr; - ulint ch = *p++; -#ifdef UNIV_DEBUG - ulint min_ch; -#endif /* UNIV_DEBUG */ - - if (UNIV_LIKELY(ch < 0x80)) { - /* 0xxxxxxx */ - } else if (UNIV_UNLIKELY(ch < 0xC0)) { - /* A continuation byte cannot start a code. */ - goto err_exit; - } else if (ch < 0xE0) { - /* 110yyyyy 10xxxxxx */ - ch &= 0x1F; - ut_d(min_ch = 0x80); - goto get1; - } else if (ch < 0xF0) { - /* 1110zzzz 10yyyyyy 10xxxxxx */ - ch &= 0x0F; - ut_d(min_ch = 0x800); - goto get2; - } else if (ch < 0xF8) { - /* 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx */ - ch &= 0x07; - ut_d(min_ch = 0x10000); - goto get3; - } else if (ch < 0xFC) { - /* 111110tt 10uuuuuu 10zzzzzz 10yyyyyy 10xxxxxx */ - ch &= 0x03; - ut_d(min_ch = 0x200000); - goto get4; - } else if (ch < 0xFE) { - /* 1111110s 10tttttt 10uuuuuu 10zzzzzz 10yyyyyy 10xxxxxx */ - ut_d(min_ch = 0x4000000); - if (!fts_utf8_is_valid(*p)) { - goto err_exit; - } - ch <<= 6; - ch |= (*p++) & 0x3F; -get4: - if (!fts_utf8_is_valid(*p)) { - goto err_exit; - } - ch <<= 6; - ch |= (*p++) & 0x3F; -get3: - if (!fts_utf8_is_valid(*p)) { - goto err_exit; - } - ch <<= 6; - ch |= (*p++) & 0x3F; -get2: - if (!fts_utf8_is_valid(*p)) { - goto err_exit; - } - ch <<= 6; - ch |= (*p++) & 0x3F; -get1: - if (!fts_utf8_is_valid(*p)) { - goto err_exit; - } - ch <<= 6; - ch |= (*p++) & 0x3F; - - /* The following is needed in the 6-byte case - when ulint is wider than 32 bits. */ - ch &= 0xFFFFFFFF; - - /* The code positions U+D800 to U+DFFF (UTF-16 surrogate pairs) - and U+FFFE and U+FFFF cannot occur in valid UTF-8. */ - - if ( (ch >= 0xD800 && ch <= 0xDFFF) -#ifdef UNIV_DEBUG - || ch < min_ch -#endif /* UNIV_DEBUG */ - || ch == 0xFFFE || ch == 0xFFFF) { - - ch = UTF8_ERROR; - } - } else { -err_exit: - ch = UTF8_ERROR; - } - - *ptr = p; - - return(ch); -} - /******************************************************************//** Get the first character's code position for FTS index partition */ extern @@ -290,16 +104,38 @@ innobase_strnxfrm( const uchar* p2, /*!< in: string */ const ulint len2); /*!< in: string length */ -/******************************************************************//** -Select the FTS auxiliary index for the given character. -@return the index to use for the string */ +/** Check if fts index charset is cjk +@param[in] cs charset +@retval true if the charset is cjk +@retval false if not. */ +inline bool fts_is_charset_cjk(const CHARSET_INFO* cs) +{ + switch (cs->number) { + case 24: /* my_charset_gb2312_chinese_ci */ + case 28: /* my_charset_gbk_chinese_ci */ + case 1: /* my_charset_big5_chinese_ci */ + case 12: /* my_charset_ujis_japanese_ci */ + case 13: /* my_charset_sjis_japanese_ci */ + case 95: /* my_charset_cp932_japanese_ci */ + case 97: /* my_charset_eucjpms_japanese_ci */ + case 19: /* my_charset_euckr_korean_ci */ + return true; + default: + return false; + } +} + +/** Select the FTS auxiliary index for the given character by range. +@param[in] cs charset +@param[in] str string +@param[in] len string length +@retval the index to use for the string */ UNIV_INLINE ulint -fts_select_index( -/*=============*/ - const CHARSET_INFO* cs, /*!< in: Charset */ - const byte* str, /*!< in: string */ - ulint len) /*!< in: string length */ +fts_select_index_by_range( + const CHARSET_INFO* cs, + const byte* str, + ulint len) { ulint selected = 0; ulint value = innobase_strnxfrm(cs, str, len); @@ -323,37 +159,64 @@ fts_select_index( return(selected - 1); } -/******************************************************************//** -Select the next FTS auxiliary index for the given character. -@return the next index to use for character */ +/** Select the FTS auxiliary index for the given character by hash. +@param[in] cs charset +@param[in] str string +@param[in] len string length +@retval the index to use for the string */ UNIV_INLINE ulint -fts_select_next_index( -/*==================*/ - const CHARSET_INFO* cs, /*!< in: Charset */ - const byte* str, /*!< in: string */ - ulint len) /*!< in: string length */ +fts_select_index_by_hash( + const CHARSET_INFO* cs, + const byte* str, + ulint len) { - ulint selected = 0; - ulint value = innobase_strnxfrm(cs, str, len); + int char_len; + ulong nr1 = 1; + ulong nr2 = 4; - while (fts_index_selector[selected].value != 0) { + ut_ad(!(str == NULL && len > 0)); - if (fts_index_selector[selected].value == value) { + if (str == NULL || len == 0) { + return 0; + } - return(selected + 1); + /* Get the first char */ + /* JAN: TODO: MySQL 5.7 had + char_len = my_mbcharlen_ptr(cs, reinterpret_cast<const char*>(str), + reinterpret_cast<const char*>(str + len)); + */ + char_len = cs->cset->charlen(cs, str, str+len); - } else if (fts_index_selector[selected].value > value) { + ut_ad(static_cast<ulint>(char_len) <= len); - return(selected); - } + /* Get collation hash code */ + cs->coll->hash_sort(cs, str, char_len, &nr1, &nr2); - ++selected; - } + return(nr1 % FTS_NUM_AUX_INDEX); +} - ut_ad(selected > 0); +/** Select the FTS auxiliary index for the given character. +@param[in] cs charset +@param[in] str string +@param[in] len string length in bytes +@retval the index to use for the string */ +UNIV_INLINE +ulint +fts_select_index( + const CHARSET_INFO* cs, + const byte* str, + ulint len) +{ + ulint selected; - return((ulint) selected); + if (fts_is_charset_cjk(cs)) { + selected = fts_select_index_by_hash(cs, str, len); + } else { + selected = fts_select_index_by_range(cs, str, len); + } + + return(selected); } /******************************************************************//** @@ -367,22 +230,4 @@ fts_get_suffix( return(fts_index_selector[selected].suffix); } -/******************************************************************//** -Get the number of index selectors. -@return The number of selectors */ -UNIV_INLINE -ulint -fts_get_n_selectors(void) -/*=====================*/ -{ - ulint i = 0; - - // FIXME: This is a hack - while (fts_index_selector[i].value != 0) { - ++i; - } - - return(i); -} - #endif /* INNOBASE_FTS0TYPES_IC */ |