summaryrefslogtreecommitdiff
path: root/storage/innobase/include/fts0types.ic
diff options
context:
space:
mode:
Diffstat (limited to 'storage/innobase/include/fts0types.ic')
-rw-r--r--storage/innobase/include/fts0types.ic319
1 files changed, 82 insertions, 237 deletions
diff --git a/storage/innobase/include/fts0types.ic b/storage/innobase/include/fts0types.ic
index 5e29cf6d8c5..e388d6257f6 100644
--- a/storage/innobase/include/fts0types.ic
+++ b/storage/innobase/include/fts0types.ic
@@ -1,6 +1,7 @@
/*****************************************************************************
-Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2007, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -26,23 +27,13 @@ Created 2007-03-27 Sunny Bains
#ifndef INNOBASE_FTS0TYPES_IC
#define INNOBASE_FTS0TYPES_IC
-#include <ctype.h>
-
-#include "rem0cmp.h"
-#include "ha_prototypes.h"
-
-extern const ulint UTF8_ERROR;
-
-/* Determine if a UTF-8 continuation byte is valid. */
-#define fts_utf8_is_valid(b) (((b) & 0xC0) == 0x80)
-
/******************************************************************//**
-Duplicate an UTF-8 string.
+Duplicate a string.
@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
UNIV_INLINE
void
-fts_utf8_string_dup(
-/*================*/
+fts_string_dup(
+/*===========*/
fts_string_t* dst, /*!< in: dup to here */
const fts_string_t* src, /*!< in: src string */
mem_heap_t* heap) /*!< in: heap to use */
@@ -103,183 +94,6 @@ fts_update_doc_id_cmp(
return((int)(up1->doc_id - up2->doc_id));
}
-
-/******************************************************************//**
-Lowercase an UTF-8 string. */
-UNIV_INLINE
-void
-fts_utf8_tolower(
-/*=============*/
- fts_string_t* str) /*!< in: string */
-{
- innobase_casedn_str((char*) str->f_str);
-}
-
-/******************************************************************//**
-Compare two UTF-8 strings.
-@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
-UNIV_INLINE
-int
-fts_utf8_string_cmp(
-/*================*/
- const void* p1, /*!< in: key */
- const void* p2) /*!< in: node */
-{
- const fts_string_t* s1 = (const fts_string_t*) p1;
- const fts_string_t* s2 = (const fts_string_t*) p2;
-
- return(cmp_data_data_slow_varchar(
- s1->f_str, s1->f_len, s2->f_str, s2->f_len));
-}
-
-/******************************************************************//**
-Compare two UTF-8 strings, and return match (0) if
-passed in "key" value equals or is the prefix of the "node" value.
-@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
-UNIV_INLINE
-int
-fts_utf8_string_cmp_prefix(
-/*=======================*/
- const void* p1, /*!< in: key */
- const void* p2) /*!< in: node */
-{
- int result;
- ulint len;
-
- const fts_string_t* s1 = (const fts_string_t*) p1;
- const fts_string_t* s2 = (const fts_string_t*) p2;
-
- len = ut_min(s1->f_len, s2->f_len);
-
- result = cmp_data_data_slow_varchar(s1->f_str, len, s2->f_str, len);
-
- if (result) {
- return(result);
- }
-
- if (s1->f_len > s2->f_len) {
- return(1);
- }
-
- return(0);
-}
-
-/******************************************************************//**
-Decode a UTF-8 character.
-
-http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf:
-
- Scalar Value 1st Byte 2nd Byte 3rd Byte 4th Byte
-00000000 0xxxxxxx 0xxxxxxx
-00000yyy yyxxxxxx 110yyyyy 10xxxxxx
-zzzzyyyy yyxxxxxx 1110zzzz 10yyyyyy 10xxxxxx
-000uuuzz zzzzyyyy yyxxxxxx 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx
-
-This function decodes UTF-8 sequences up to 6 bytes (31 bits).
-
-On error *ptr will point to the first byte that was not correctly
-decoded. This will hopefully help in resyncing the input.
-@return UTF8_ERROR if *ptr did not point to a valid
-UTF-8 sequence, or the Unicode code point. */
-UNIV_INLINE
-ulint
-fts_utf8_decode(
-/*============*/
- const byte** ptr) /*!< in/out: pointer to
- UTF-8 string. The
- pointer is advanced to
- the start of the next
- character. */
-{
- const byte* p = *ptr;
- ulint ch = *p++;
-#ifdef UNIV_DEBUG
- ulint min_ch;
-#endif /* UNIV_DEBUG */
-
- if (UNIV_LIKELY(ch < 0x80)) {
- /* 0xxxxxxx */
- } else if (UNIV_UNLIKELY(ch < 0xC0)) {
- /* A continuation byte cannot start a code. */
- goto err_exit;
- } else if (ch < 0xE0) {
- /* 110yyyyy 10xxxxxx */
- ch &= 0x1F;
- ut_d(min_ch = 0x80);
- goto get1;
- } else if (ch < 0xF0) {
- /* 1110zzzz 10yyyyyy 10xxxxxx */
- ch &= 0x0F;
- ut_d(min_ch = 0x800);
- goto get2;
- } else if (ch < 0xF8) {
- /* 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx */
- ch &= 0x07;
- ut_d(min_ch = 0x10000);
- goto get3;
- } else if (ch < 0xFC) {
- /* 111110tt 10uuuuuu 10zzzzzz 10yyyyyy 10xxxxxx */
- ch &= 0x03;
- ut_d(min_ch = 0x200000);
- goto get4;
- } else if (ch < 0xFE) {
- /* 1111110s 10tttttt 10uuuuuu 10zzzzzz 10yyyyyy 10xxxxxx */
- ut_d(min_ch = 0x4000000);
- if (!fts_utf8_is_valid(*p)) {
- goto err_exit;
- }
- ch <<= 6;
- ch |= (*p++) & 0x3F;
-get4:
- if (!fts_utf8_is_valid(*p)) {
- goto err_exit;
- }
- ch <<= 6;
- ch |= (*p++) & 0x3F;
-get3:
- if (!fts_utf8_is_valid(*p)) {
- goto err_exit;
- }
- ch <<= 6;
- ch |= (*p++) & 0x3F;
-get2:
- if (!fts_utf8_is_valid(*p)) {
- goto err_exit;
- }
- ch <<= 6;
- ch |= (*p++) & 0x3F;
-get1:
- if (!fts_utf8_is_valid(*p)) {
- goto err_exit;
- }
- ch <<= 6;
- ch |= (*p++) & 0x3F;
-
- /* The following is needed in the 6-byte case
- when ulint is wider than 32 bits. */
- ch &= 0xFFFFFFFF;
-
- /* The code positions U+D800 to U+DFFF (UTF-16 surrogate pairs)
- and U+FFFE and U+FFFF cannot occur in valid UTF-8. */
-
- if ( (ch >= 0xD800 && ch <= 0xDFFF)
-#ifdef UNIV_DEBUG
- || ch < min_ch
-#endif /* UNIV_DEBUG */
- || ch == 0xFFFE || ch == 0xFFFF) {
-
- ch = UTF8_ERROR;
- }
- } else {
-err_exit:
- ch = UTF8_ERROR;
- }
-
- *ptr = p;
-
- return(ch);
-}
-
/******************************************************************//**
Get the first character's code position for FTS index partition */
extern
@@ -290,16 +104,38 @@ innobase_strnxfrm(
const uchar* p2, /*!< in: string */
const ulint len2); /*!< in: string length */
-/******************************************************************//**
-Select the FTS auxiliary index for the given character.
-@return the index to use for the string */
+/** Check if fts index charset is cjk
+@param[in] cs charset
+@retval true if the charset is cjk
+@retval false if not. */
+inline bool fts_is_charset_cjk(const CHARSET_INFO* cs)
+{
+ switch (cs->number) {
+ case 24: /* my_charset_gb2312_chinese_ci */
+ case 28: /* my_charset_gbk_chinese_ci */
+ case 1: /* my_charset_big5_chinese_ci */
+ case 12: /* my_charset_ujis_japanese_ci */
+ case 13: /* my_charset_sjis_japanese_ci */
+ case 95: /* my_charset_cp932_japanese_ci */
+ case 97: /* my_charset_eucjpms_japanese_ci */
+ case 19: /* my_charset_euckr_korean_ci */
+ return true;
+ default:
+ return false;
+ }
+}
+
+/** Select the FTS auxiliary index for the given character by range.
+@param[in] cs charset
+@param[in] str string
+@param[in] len string length
+@retval the index to use for the string */
UNIV_INLINE
ulint
-fts_select_index(
-/*=============*/
- const CHARSET_INFO* cs, /*!< in: Charset */
- const byte* str, /*!< in: string */
- ulint len) /*!< in: string length */
+fts_select_index_by_range(
+ const CHARSET_INFO* cs,
+ const byte* str,
+ ulint len)
{
ulint selected = 0;
ulint value = innobase_strnxfrm(cs, str, len);
@@ -323,37 +159,64 @@ fts_select_index(
return(selected - 1);
}
-/******************************************************************//**
-Select the next FTS auxiliary index for the given character.
-@return the next index to use for character */
+/** Select the FTS auxiliary index for the given character by hash.
+@param[in] cs charset
+@param[in] str string
+@param[in] len string length
+@retval the index to use for the string */
UNIV_INLINE
ulint
-fts_select_next_index(
-/*==================*/
- const CHARSET_INFO* cs, /*!< in: Charset */
- const byte* str, /*!< in: string */
- ulint len) /*!< in: string length */
+fts_select_index_by_hash(
+ const CHARSET_INFO* cs,
+ const byte* str,
+ ulint len)
{
- ulint selected = 0;
- ulint value = innobase_strnxfrm(cs, str, len);
+ int char_len;
+ ulong nr1 = 1;
+ ulong nr2 = 4;
- while (fts_index_selector[selected].value != 0) {
+ ut_ad(!(str == NULL && len > 0));
- if (fts_index_selector[selected].value == value) {
+ if (str == NULL || len == 0) {
+ return 0;
+ }
- return(selected + 1);
+ /* Get the first char */
+ /* JAN: TODO: MySQL 5.7 had
+ char_len = my_mbcharlen_ptr(cs, reinterpret_cast<const char*>(str),
+ reinterpret_cast<const char*>(str + len));
+ */
+ char_len = cs->cset->charlen(cs, str, str+len);
- } else if (fts_index_selector[selected].value > value) {
+ ut_ad(static_cast<ulint>(char_len) <= len);
- return(selected);
- }
+ /* Get collation hash code */
+ cs->coll->hash_sort(cs, str, char_len, &nr1, &nr2);
- ++selected;
- }
+ return(nr1 % FTS_NUM_AUX_INDEX);
+}
- ut_ad(selected > 0);
+/** Select the FTS auxiliary index for the given character.
+@param[in] cs charset
+@param[in] str string
+@param[in] len string length in bytes
+@retval the index to use for the string */
+UNIV_INLINE
+ulint
+fts_select_index(
+ const CHARSET_INFO* cs,
+ const byte* str,
+ ulint len)
+{
+ ulint selected;
- return((ulint) selected);
+ if (fts_is_charset_cjk(cs)) {
+ selected = fts_select_index_by_hash(cs, str, len);
+ } else {
+ selected = fts_select_index_by_range(cs, str, len);
+ }
+
+ return(selected);
}
/******************************************************************//**
@@ -367,22 +230,4 @@ fts_get_suffix(
return(fts_index_selector[selected].suffix);
}
-/******************************************************************//**
-Get the number of index selectors.
-@return The number of selectors */
-UNIV_INLINE
-ulint
-fts_get_n_selectors(void)
-/*=====================*/
-{
- ulint i = 0;
-
- // FIXME: This is a hack
- while (fts_index_selector[i].value != 0) {
- ++i;
- }
-
- return(i);
-}
-
#endif /* INNOBASE_FTS0TYPES_IC */