1 files changed, 82 insertions, 237 deletions
diff --git a/storage/innobase/include/fts0types.ic b/storage/innobase/include/fts0types.ic
index 5e29cf6d8c5..e388d6257f6 100644
--- a/storage/innobase/include/fts0types.ic
+++ b/storage/innobase/include/fts0types.ic
@@ -1,6 +1,7 @@
 /*****************************************************************************
 
-Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2007, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -26,23 +27,13 @@ Created 2007-03-27 Sunny Bains
 #ifndef INNOBASE_FTS0TYPES_IC
 #define INNOBASE_FTS0TYPES_IC
 
-#include <ctype.h>
-
-#include "rem0cmp.h"
-#include "ha_prototypes.h"
-
-extern const ulint UTF8_ERROR;
-
-/* Determine if a UTF-8 continuation byte is valid. */
-#define fts_utf8_is_valid(b) (((b) & 0xC0) == 0x80)
-
 /******************************************************************//**
-Duplicate an UTF-8 string.
+Duplicate a string.
 @return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
 UNIV_INLINE
 void
-fts_utf8_string_dup(
-/*================*/
+fts_string_dup(
+/*===========*/
 	fts_string_t*		dst,		/*!< in: dup to here */
 	const fts_string_t*	src,		/*!< in: src string */
 	mem_heap_t*		heap)		/*!< in: heap to use */
@@ -103,183 +94,6 @@ fts_update_doc_id_cmp(
 	return((int)(up1->doc_id - up2->doc_id));
 }
 
-
-/******************************************************************//**
-Lowercase an UTF-8 string. */
-UNIV_INLINE
-void
-fts_utf8_tolower(
-/*=============*/
-	fts_string_t*	str)			/*!< in: string */
-{
-	innobase_casedn_str((char*) str->f_str);
-}
-
-/******************************************************************//**
-Compare two UTF-8 strings.
-@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
-UNIV_INLINE
-int
-fts_utf8_string_cmp(
-/*================*/
-	const void*	p1,			/*!< in: key */
-	const void*	p2)			/*!< in: node */
-{
-	const fts_string_t* s1 = (const fts_string_t*) p1;
-	const fts_string_t* s2 = (const fts_string_t*) p2;
-
-	return(cmp_data_data_slow_varchar(
-		s1->f_str, s1->f_len, s2->f_str, s2->f_len));
-}
-
-/******************************************************************//**
-Compare two UTF-8 strings, and return match (0) if
-passed in "key" value equals or is the prefix of the "node" value.
-@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
-UNIV_INLINE
-int
-fts_utf8_string_cmp_prefix(
-/*=======================*/
-	const void*	p1,			/*!< in: key */
-	const void*	p2)			/*!< in: node */
-{
-	int	result;
-	ulint	len;
-
-	const fts_string_t* s1 = (const fts_string_t*) p1;
-	const fts_string_t* s2 = (const fts_string_t*) p2;
-
-	len = ut_min(s1->f_len, s2->f_len);
-
-	result = cmp_data_data_slow_varchar(s1->f_str, len, s2->f_str, len);
-
-	if (result) {
-		return(result);
-	}
-
-	if (s1->f_len > s2->f_len) {
-		return(1);
-	}
-
-	return(0);
-}
-
-/******************************************************************//**
-Decode a UTF-8 character.
-
-http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf:
-
- Scalar Value              1st Byte 2nd Byte 3rd Byte 4th Byte
-00000000 0xxxxxxx          0xxxxxxx
-00000yyy yyxxxxxx          110yyyyy 10xxxxxx
-zzzzyyyy yyxxxxxx          1110zzzz 10yyyyyy 10xxxxxx
-000uuuzz zzzzyyyy yyxxxxxx 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx
-
-This function decodes UTF-8 sequences up to 6 bytes (31 bits).
-
-On error *ptr will point to the first byte that was not correctly
-decoded. This will hopefully help in resyncing the input.
-@return UTF8_ERROR if *ptr did not point to a valid
-UTF-8 sequence, or the Unicode code point. */
-UNIV_INLINE
-ulint
-fts_utf8_decode(
-/*============*/
-	const byte**	ptr)			/*!< in/out: pointer to
-						UTF-8 string. The
-						pointer is advanced to
-						the start of the next
-						character. */
-{
-	const byte*	p = *ptr;
-	ulint		ch = *p++;
-#ifdef UNIV_DEBUG
-	ulint		min_ch;
-#endif /* UNIV_DEBUG */
-
-	if (UNIV_LIKELY(ch < 0x80)) {
-		/* 0xxxxxxx */
-	} else if (UNIV_UNLIKELY(ch < 0xC0)) {
-		/* A continuation byte cannot start a code. */
-		goto err_exit;
-	} else if (ch < 0xE0) {
-		/* 110yyyyy 10xxxxxx */
-		ch &= 0x1F;
-		ut_d(min_ch = 0x80);
-		goto get1;
-	} else if (ch < 0xF0) {
-		/* 1110zzzz 10yyyyyy 10xxxxxx */
-		ch &= 0x0F;
-		ut_d(min_ch = 0x800);
-		goto get2;
-	} else if (ch < 0xF8) {
-		/* 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx */
-		ch &= 0x07;
-		ut_d(min_ch = 0x10000);
-		goto get3;
-	} else if (ch < 0xFC) {
-		/* 111110tt 10uuuuuu 10zzzzzz 10yyyyyy 10xxxxxx */
-		ch &= 0x03;
-		ut_d(min_ch = 0x200000);
-		goto get4;
-	} else if (ch < 0xFE) {
-		/* 1111110s 10tttttt 10uuuuuu 10zzzzzz 10yyyyyy 10xxxxxx */
-		ut_d(min_ch = 0x4000000);
-		if (!fts_utf8_is_valid(*p)) {
-			goto err_exit;
-		}
-		ch <<= 6;
-		ch |= (*p++) & 0x3F;
-get4:
-		if (!fts_utf8_is_valid(*p)) {
-			goto err_exit;
-		}
-		ch <<= 6;
-		ch |= (*p++) & 0x3F;
-get3:
-		if (!fts_utf8_is_valid(*p)) {
-			goto err_exit;
-		}
-		ch <<= 6;
-		ch |= (*p++) & 0x3F;
-get2:
-		if (!fts_utf8_is_valid(*p)) {
-			goto err_exit;
-		}
-		ch <<= 6;
-		ch |= (*p++) & 0x3F;
-get1:
-		if (!fts_utf8_is_valid(*p)) {
-			goto err_exit;
-		}
-		ch <<= 6;
-		ch |= (*p++) & 0x3F;
-
-		/* The following is needed in the 6-byte case
-		when ulint is wider than 32 bits. */
-		ch &= 0xFFFFFFFF;
-
-		/* The code positions U+D800 to U+DFFF (UTF-16 surrogate pairs)
-		and U+FFFE and U+FFFF cannot occur in valid UTF-8. */
-
-		if ( (ch >= 0xD800 && ch <= 0xDFFF)
-#ifdef UNIV_DEBUG
-		     || ch < min_ch
-#endif /* UNIV_DEBUG */
-		     || ch == 0xFFFE || ch == 0xFFFF) {
-
-			ch = UTF8_ERROR;
-		}
-	} else {
-err_exit:
-		ch = UTF8_ERROR;
-	}
-
-	*ptr = p;
-
-	return(ch);
-}
-
 /******************************************************************//**
 Get the first character's code position for FTS index partition */
 extern
@@ -290,16 +104,38 @@ innobase_strnxfrm(
         const uchar*		p2,	/*!< in: string */
         const ulint		len2);	/*!< in: string length */
 
-/******************************************************************//**
-Select the FTS auxiliary index for the given character.
-@return the index to use for the string */
+/** Check whether the FTS index charset is one of the CJK charsets.
+@param[in]	cs	charset; dereferenced unconditionally, must not be NULL
+@retval	true	if cs->number matches one of the collation ids listed below
+@retval	false	for every other collation id */
+inline bool fts_is_charset_cjk(const CHARSET_INFO* cs)
+{
+	switch (cs->number) {
+	case 24: /* my_charset_gb2312_chinese_ci */
+	case 28: /* my_charset_gbk_chinese_ci */
+	case 1: /* my_charset_big5_chinese_ci */
+	case 12: /* my_charset_ujis_japanese_ci */
+	case 13: /* my_charset_sjis_japanese_ci */
+	case 95: /* my_charset_cp932_japanese_ci */
+	case 97: /* my_charset_eucjpms_japanese_ci */
+	case 19: /* my_charset_euckr_korean_ci */
+		return true;
+	default: /* NOTE(review): other collations of these charsets are not listed - confirm intended */
+		return false;
+	}
+}
+
+/** Select the FTS auxiliary index for the given character by range.
+@param[in]	cs	charset
+@param[in]	str	string
+@param[in]	len	string length
+@return the index to use for the string */
 UNIV_INLINE
 ulint
-fts_select_index(
-/*=============*/
-	const CHARSET_INFO*	cs,	/*!< in: Charset */
-	const byte*		str,	/*!< in: string */
-	ulint			len)	/*!< in: string length */
+fts_select_index_by_range(
+	const CHARSET_INFO*	cs,
+	const byte*		str,
+	ulint			len)
 {
 	ulint			selected = 0;
 	ulint			value = innobase_strnxfrm(cs, str, len);
@@ -323,37 +159,64 @@ fts_select_index(
 	return(selected - 1);
 }
 
-/******************************************************************//**
-Select the next FTS auxiliary index for the given character.
-@return the next index to use for character */
+/** Select the FTS auxiliary index by hashing the first character of str.
+@param[in]	cs	charset; its charlen() and hash_sort() callbacks are used
+@param[in]	str	string; NULL is accepted only together with len == 0
+@param[in]	len	string length in bytes
+@return nr1 from hash_sort() modulo FTS_NUM_AUX_INDEX; 0 if str is NULL or len is 0 */
 UNIV_INLINE
 ulint
-fts_select_next_index(
-/*==================*/
-	const CHARSET_INFO*	cs,	/*!< in: Charset */
-	const byte*		str,	/*!< in: string */
-	ulint			len)	/*!< in: string length */
+fts_select_index_by_hash(
+	const CHARSET_INFO*	cs,
+	const byte*		str,
+	ulint			len)
 {
-	ulint		selected = 0;
-	ulint		value = innobase_strnxfrm(cs, str, len);
+	int	char_len;
+	ulong	nr1 = 1;
+	ulong	nr2 = 4;
 
-	while (fts_index_selector[selected].value != 0) {
+	ut_ad(!(str == NULL && len > 0));
 
-		if (fts_index_selector[selected].value == value) {
+	if (str == NULL || len == 0) {
+		return 0;
+	}
 
-			return(selected + 1);
+	/* Get the byte length of the first character.
+	NOTE(review): char_len is a signed int and is only checked by the
+	ut_ad() below; confirm charlen() cannot return <= 0 for this input.
+	JAN: TODO: MySQL 5.7 used my_mbcharlen_ptr(cs, str, str + len) here
+	(with both pointers cast to const char*). */
+	char_len = cs->cset->charlen(cs, str, str+len);
 
-		} else if (fts_index_selector[selected].value > value) {
+	ut_ad(static_cast<ulint>(char_len) <= len);
 
-			return(selected);
-		}
+	/* Hash only the first char_len bytes with the collation's hash_sort(). */
+	cs->coll->hash_sort(cs, str, char_len, &nr1, &nr2);
 
-		++selected;
-	}
+	return(nr1 % FTS_NUM_AUX_INDEX);
+}
 
-	ut_ad(selected > 0);
+/** Select the FTS auxiliary index for a string, dispatching on the charset.
+@param[in]	cs	charset; CJK charsets select by hash, all others by range
+@param[in]	str	string
+@param[in]	len	string length in bytes
+@return the index to use for the string */
+UNIV_INLINE
+ulint
+fts_select_index(
+	const CHARSET_INFO*	cs,
+	const byte*		str,
+	ulint			len)
+{
+	ulint	selected;
 
-	return((ulint) selected);
+	if (fts_is_charset_cjk(cs)) {
+		selected = fts_select_index_by_hash(cs, str, len);
+	} else {
+		selected = fts_select_index_by_range(cs, str, len);
+	}
+
+	return(selected);
 }
 
 /******************************************************************//**
@@ -367,22 +230,4 @@ fts_get_suffix(
 	return(fts_index_selector[selected].suffix);
 }
 
-/******************************************************************//**
-Get the number of index selectors.
-@return The number of selectors */
-UNIV_INLINE
-ulint
-fts_get_n_selectors(void)
-/*=====================*/
-{
-	ulint	i = 0;
-
-	// FIXME: This is a hack
-	while (fts_index_selector[i].value != 0) {
-		++i;
-	}
-
-	return(i);
-}
-
 #endif /* INNOBASE_FTS0TYPES_IC */