summary | refs | log | tree | commit | diff
path: root/strings/ctype-utf8.c
diff options
context:
space:
mode:
Diffstat (limited to 'strings/ctype-utf8.c')
-rw-r--r-- strings/ctype-utf8.c 410
1 files changed, 105 insertions, 305 deletions
diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c
index c525ee97b65..4ddb086b734 100644
--- a/strings/ctype-utf8.c
+++ b/strings/ctype-utf8.c
@@ -26,78 +26,10 @@
#define EILSEQ ENOENT
#endif
-/* Detect special bytes and sequences */
-#define IS_CONTINUATION_BYTE(c) (((uchar) (c) ^ 0x80) < 0x40)
-/*
- Check MB2 character assuming that b0 is alredy known to be >= 0xC2.
- Use this macro if the caller already checked b0 for:
- - an MB1 character
- - an unused gap between MB1 and MB2HEAD
-*/
-#define IS_UTF8MB2_STEP2(b0,b1) (((uchar) (b0) < 0xE0) && \
- IS_CONTINUATION_BYTE((uchar) b1))
-
-/*
- Check MB3 character assuming that b0 is already known to be
- in the valid MB3HEAD range [0xE0..0xEF].
-*/
-#define IS_UTF8MB3_STEP2(b0,b1,b2) (IS_CONTINUATION_BYTE(b1) && \
- IS_CONTINUATION_BYTE(b2) && \
- ((uchar) b0 >= 0xe1 || (uchar) b1 >= 0xa0))
-
-/*
- Check MB3 character assuming that b0 is already known to be >= 0xE0,
- but is not checked for the high end 0xF0 yet.
- Use this macro if the caller already checked b0 for:
- - an MB1 character
- - an unused gap between MB1 and MB2HEAD
- - an MB2HEAD
-*/
-#define IS_UTF8MB3_STEP3(b0,b1,b2) (((uchar) (b0) < 0xF0) && \
- IS_UTF8MB3_STEP2(b0,b1,b2))
-
-/*
- UTF-8 quick four-byte mask:
- 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
- Encoding allows to encode U+00010000..U+001FFFFF
-
- The maximum character defined in the Unicode standard is U+0010FFFF.
- Higher characters U+00110000..U+001FFFFF are not used.
-
- 11110000.10010000.10xxxxxx.10xxxxxx == F0.90.80.80 == U+00010000 (min)
- 11110100.10001111.10111111.10111111 == F4.8F.BF.BF == U+0010FFFF (max)
-
- Valid codes:
- [F0][90..BF][80..BF][80..BF]
- [F1][80..BF][80..BF][80..BF]
- [F2][80..BF][80..BF][80..BF]
- [F3][80..BF][80..BF][80..BF]
- [F4][80..8F][80..BF][80..BF]
-*/
+#include "ctype-utf8.h"
+#include "ctype-unidata.h"
-/*
- Check MB4 character assuming that b0 is already
- known to be in the range [0xF0..0xF4]
-*/
-#define IS_UTF8MB4_STEP2(b0,b1,b2,b3) (IS_CONTINUATION_BYTE(b1) && \
- IS_CONTINUATION_BYTE(b2) && \
- IS_CONTINUATION_BYTE(b3) && \
- (b0 >= 0xf1 || b1 >= 0x90) && \
- (b0 <= 0xf3 || b1 <= 0x8F))
-#define IS_UTF8MB4_STEP3(b0,b1,b2,b3) (((uchar) (b0) < 0xF5) && \
- IS_UTF8MB4_STEP2(b0,b1,b2,b3))
-
-/* Convert individual bytes to Unicode code points */
-#define UTF8MB2_CODE(b0,b1) (((my_wc_t) ((uchar) b0 & 0x1f) << 6) |\
- ((my_wc_t) ((uchar) b1 ^ 0x80)))
-#define UTF8MB3_CODE(b0,b1,b2) (((my_wc_t) ((uchar) b0 & 0x0f) << 12) |\
- ((my_wc_t) ((uchar) b1 ^ 0x80) << 6) |\
- ((my_wc_t) ((uchar) b2 ^ 0x80)))
-#define UTF8MB4_CODE(b0,b1,b2,b3) (((my_wc_t) ((uchar) b0 & 0x07) << 18) |\
- ((my_wc_t) ((uchar) b1 ^ 0x80) << 12) |\
- ((my_wc_t) ((uchar) b2 ^ 0x80) << 6) |\
- (my_wc_t) ((uchar) b3 ^ 0x80))
/* Definitions for strcoll.ic */
#define IS_MB1_CHAR(x) ((uchar) (x) < 0x80)
@@ -180,7 +112,7 @@ int my_valid_mbcharlen_utf8mb3(const uchar *s, const uchar *e)
#include "my_uctype.h"
-static MY_UNICASE_CHARACTER plane00[]={
+MY_UNICASE_CHARACTER my_unicase_default_page00[]={
{0x0000,0x0000,0x0000}, {0x0001,0x0001,0x0001},
{0x0002,0x0002,0x0002}, {0x0003,0x0003,0x0003},
{0x0004,0x0004,0x0004}, {0x0005,0x0005,0x0005},
@@ -313,7 +245,7 @@ static MY_UNICASE_CHARACTER plane00[]={
/*
- Almost similar to plane00, but maps sorting order
+ Almost similar to my_unicase_default_page00, but maps sorting order
for U+00DF to 0x00DF instead of 0x0053.
*/
static MY_UNICASE_CHARACTER plane00_mysql500[]={
@@ -1759,9 +1691,10 @@ static MY_UNICASE_CHARACTER planeFF[]={
};
-static MY_UNICASE_CHARACTER *my_unicase_pages_default[256]=
+MY_UNICASE_CHARACTER *my_unicase_default_pages[256]=
{
- plane00, plane01, plane02, plane03, plane04, plane05, NULL, NULL,
+ my_unicase_default_page00,
+ plane01, plane02, plane03, plane04, plane05, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, plane1E, plane1F,
@@ -1798,8 +1731,8 @@ static MY_UNICASE_CHARACTER *my_unicase_pages_default[256]=
MY_UNICASE_INFO my_unicase_default=
{
- 0xFFFF,
- my_unicase_pages_default
+ MY_UNICASE_INFO_DEFAULT_MAXCHAR,
+ my_unicase_default_pages
};
@@ -4479,9 +4412,7 @@ int my_wildcmp_unicode_impl(CHARSET_INFO *cs,
int result= -1; /* Not found, using wildcards */
my_wc_t s_wc, w_wc;
int scan;
- int (*mb_wc)(CHARSET_INFO *, my_wc_t *,
- const uchar *, const uchar *);
- mb_wc= cs->cset->mb_wc;
+ my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
if (my_string_stack_guard && my_string_stack_guard(recurse_level))
return 1;
@@ -4509,12 +4440,12 @@ int my_wildcmp_unicode_impl(CHARSET_INFO *cs,
wildstr+= scan;
escaped= 1;
}
-
+
if ((scan= mb_wc(cs, &s_wc, (const uchar*)str,
(const uchar*)str_end)) <= 0)
return 1;
str+= scan;
-
+
if (!escaped && w_wc == (my_wc_t) w_one)
{
result= 1; /* Found an anchor char */
@@ -4532,86 +4463,84 @@ int my_wildcmp_unicode_impl(CHARSET_INFO *cs,
if (wildstr == wildend)
return (str != str_end); /* Match if both are at end */
}
-
-
+
if (w_wc == (my_wc_t) w_many)
{ /* Found w_many */
-
/* Remove any '%' and '_' from the wild search string */
for ( ; wildstr != wildend ; )
{
if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr,
(const uchar*)wildend)) <= 0)
return 1;
-
- if (w_wc == (my_wc_t)w_many)
+
+ if (w_wc == (my_wc_t) w_many)
{
wildstr+= scan;
continue;
}
-
- if (w_wc == (my_wc_t)w_one)
+
+ if (w_wc == (my_wc_t) w_one)
{
wildstr+= scan;
if ((scan= mb_wc(cs, &s_wc, (const uchar*)str,
- (const uchar*)str_end)) <=0)
+ (const uchar*)str_end)) <= 0)
return 1;
str+= scan;
continue;
}
break; /* Not a wild character */
}
-
+
if (wildstr == wildend)
return 0; /* Ok if w_many is last */
-
+
if (str == str_end)
return -1;
-
+
if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr,
- (const uchar*)wildend)) <=0)
+ (const uchar*)wildend)) <= 0)
return 1;
wildstr+= scan;
-
- if (w_wc == (my_wc_t)escape)
+
+ if (w_wc == (my_wc_t) escape)
{
if (wildstr < wildend)
{
if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr,
- (const uchar*)wildend)) <=0)
+ (const uchar*)wildend)) <= 0)
return 1;
wildstr+= scan;
}
}
-
+
while (1)
{
/* Skip until the first character from wildstr is found */
while (str != str_end)
{
if ((scan= mb_wc(cs, &s_wc, (const uchar*)str,
- (const uchar*)str_end)) <=0)
+ (const uchar*)str_end)) <= 0)
return 1;
if (weights)
{
my_tosort_unicode(weights, &s_wc, cs->state);
my_tosort_unicode(weights, &w_wc, cs->state);
}
-
+
if (s_wc == w_wc)
break;
str+= scan;
}
if (str == str_end)
return -1;
-
+
str+= scan;
result= my_wildcmp_unicode_impl(cs, str, str_end, wildstr, wildend,
escape, w_one, w_many,
weights, recurse_level + 1);
if (result <= 0)
return result;
- }
+ }
}
}
return (str != str_end ? 1 : 0);
@@ -4650,7 +4579,7 @@ my_wildcmp_unicode(CHARSET_INFO *cs,
@return Result length
*/
-static size_t
+size_t
my_strxfrm_pad_nweights_unicode(uchar *str, uchar *strend, size_t nweights)
{
uchar *str0;
@@ -4679,7 +4608,7 @@ my_strxfrm_pad_nweights_unicode(uchar *str, uchar *strend, size_t nweights)
@return Result length
*/
-static size_t
+size_t
my_strxfrm_pad_unicode(uchar *str, uchar *strend)
{
uchar *str0= str;
@@ -4694,95 +4623,6 @@ my_strxfrm_pad_unicode(uchar *str, uchar *strend)
}
-size_t my_strnxfrm_unicode_internal(CHARSET_INFO *cs,
- uchar *dst, uchar *de, uint *nweights,
- const uchar *src, const uchar *se)
-{
- my_wc_t UNINIT_VAR(wc);
- int res;
- uchar *dst0= dst;
- MY_UNICASE_INFO *uni_plane= (cs->state & MY_CS_BINSORT) ?
- NULL : cs->caseinfo;
-
- DBUG_ASSERT(src || !se);
-
- for (; dst < de && *nweights; (*nweights)--)
- {
- if ((res= cs->cset->mb_wc(cs, &wc, src, se)) <= 0)
- break;
- src+= res;
-
- if (uni_plane)
- my_tosort_unicode(uni_plane, &wc, cs->state);
-
- *dst++= (uchar) (wc >> 8);
- if (dst < de)
- *dst++= (uchar) (wc & 0xFF);
- }
- return dst - dst0;
-}
-
-
-/*
- Store sorting weights using 2 bytes per character.
-
- This function is shared between
- - utf8mb3_general_ci, utf8_bin, ucs2_general_ci, ucs2_bin
- which support BMP only (U+0000..U+FFFF).
- - utf8mb4_general_ci, utf16_general_ci, utf32_general_ci,
- which map all supplementary characters to weight 0xFFFD.
-*/
-size_t
-my_strnxfrm_unicode(CHARSET_INFO *cs,
- uchar *dst, size_t dstlen, uint nweights,
- const uchar *src, size_t srclen, uint flags)
-{
- uchar *dst0= dst;
- uchar *de= dst + dstlen;
- dst+= my_strnxfrm_unicode_internal(cs, dst, de, &nweights,
- src, src + srclen);
- DBUG_ASSERT(dst <= de); /* Safety */
-
- if (dst < de && nweights && (flags & MY_STRXFRM_PAD_WITH_SPACE))
- dst+= my_strxfrm_pad_nweights_unicode(dst, de, nweights);
-
- my_strxfrm_desc_and_reverse(dst0, dst, flags, 0);
-
- if ((flags & MY_STRXFRM_PAD_TO_MAXLEN) && dst < de)
- dst+= my_strxfrm_pad_unicode(dst, de);
- return dst - dst0;
-}
-
-
-size_t
-my_strnxfrm_unicode_nopad(CHARSET_INFO *cs,
- uchar *dst, size_t dstlen, uint nweights,
- const uchar *src, size_t srclen, uint flags)
-{
- uchar *dst0= dst;
- uchar *de= dst + dstlen;
- dst+= my_strnxfrm_unicode_internal(cs, dst, de, &nweights,
- src, src + srclen);
- DBUG_ASSERT(dst <= de); /* Safety */
-
- if (dst < de && nweights && (flags & MY_STRXFRM_PAD_WITH_SPACE))
- {
- size_t len= de - dst;
- set_if_smaller(len, nweights * 2);
- memset(dst, 0x00, len);
- dst+= len;
- }
-
- my_strxfrm_desc_and_reverse(dst0, dst, flags, 0);
-
- if ((flags & MY_STRXFRM_PAD_TO_MAXLEN) && dst < de)
- {
- memset(dst, 0x00, de - dst);
- dst= de;
- }
- return dst - dst0;
-}
-
/*
For BMP-only collations that use 2 bytes per weight.
*/
@@ -4981,42 +4821,7 @@ static const uchar to_upper_utf8[] = {
static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
my_wc_t * pwc, const uchar *s, const uchar *e)
{
- uchar c;
-
- if (s >= e)
- return MY_CS_TOOSMALL;
-
- c= s[0];
- if (c < 0x80)
- {
- *pwc = c;
- return 1;
- }
- else if (c < 0xc2)
- return MY_CS_ILSEQ;
- else if (c < 0xe0)
- {
- if (s+2 > e) /* We need 2 characters */
- return MY_CS_TOOSMALL2;
-
- if (!(IS_CONTINUATION_BYTE(s[1])))
- return MY_CS_ILSEQ;
-
- *pwc= UTF8MB2_CODE(c, s[1]);
- return 2;
- }
- else if (c < 0xf0)
- {
- if (s+3 > e) /* We need 3 characters */
- return MY_CS_TOOSMALL3;
-
- if (!IS_UTF8MB3_STEP2(c, s[1], s[2]))
- return MY_CS_ILSEQ;
-
- *pwc= UTF8MB3_CODE(c, s[1], s[2]);
- return 3;
- }
- return MY_CS_ILSEQ;
+ return my_mb_wc_utf8mb3_quick(pwc, s, e);
}
@@ -5312,7 +5117,7 @@ int my_strcasecmp_utf8(CHARSET_INFO *cs, const char *s, const char *t)
It represents a single byte character.
Convert it into weight according to collation.
*/
- s_wc= plane00[(uchar) s[0]].tolower;
+ s_wc= my_unicase_default_page00[(uchar) s[0]].tolower;
s++;
}
else
@@ -5354,7 +5159,7 @@ int my_strcasecmp_utf8(CHARSET_INFO *cs, const char *s, const char *t)
if ((uchar) t[0] < 128)
{
/* Convert single byte character into weight */
- t_wc= plane00[(uchar) t[0]].tolower;
+ t_wc= my_unicase_default_page00[(uchar) t[0]].tolower;
t++;
}
else
@@ -5417,14 +5222,14 @@ int my_charlen_utf8(CHARSET_INFO *cs __attribute__((unused)),
static inline int my_weight_mb1_utf8_general_ci(uchar b)
{
- return (int) plane00[b & 0xFF].sort;
+ return (int) my_unicase_default_page00[b & 0xFF].sort;
}
static inline int my_weight_mb2_utf8_general_ci(uchar b0, uchar b1)
{
my_wc_t wc= UTF8MB2_CODE(b0, b1);
- MY_UNICASE_CHARACTER *page= my_unicase_pages_default[wc >> 8];
+ MY_UNICASE_CHARACTER *page= my_unicase_default_pages[wc >> 8];
return (int) (page ? page[wc & 0xFF].sort : wc);
}
@@ -5432,16 +5237,23 @@ static inline int my_weight_mb2_utf8_general_ci(uchar b0, uchar b1)
static inline int my_weight_mb3_utf8_general_ci(uchar b0, uchar b1, uchar b2)
{
my_wc_t wc= UTF8MB3_CODE(b0, b1, b2);
- MY_UNICASE_CHARACTER *page= my_unicase_pages_default[wc >> 8];
+ MY_UNICASE_CHARACTER *page= my_unicase_default_pages[wc >> 8];
return (int) (page ? page[wc & 0xFF].sort : wc);
}
-#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8_general_ci
-#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
-#define WEIGHT_MB1(x) my_weight_mb1_utf8_general_ci(x)
-#define WEIGHT_MB2(x,y) my_weight_mb2_utf8_general_ci(x,y)
-#define WEIGHT_MB3(x,y,z) my_weight_mb3_utf8_general_ci(x,y,z)
+#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8_general_ci
+#define DEFINE_STRNXFRM_UNICODE
+#define DEFINE_STRNXFRM_UNICODE_NOPAD
+#define MY_MB_WC(cs, pwc, s, e) my_mb_wc_utf8mb3_quick(pwc, s, e)
+#define OPTIMIZE_ASCII 1
+#define UNICASE_MAXCHAR MY_UNICASE_INFO_DEFAULT_MAXCHAR
+#define UNICASE_PAGE0 my_unicase_default_page00
+#define UNICASE_PAGES my_unicase_default_pages
+#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
+#define WEIGHT_MB1(x) my_weight_mb1_utf8_general_ci(x)
+#define WEIGHT_MB2(x,y) my_weight_mb2_utf8_general_ci(x,y)
+#define WEIGHT_MB3(x,y,z) my_weight_mb3_utf8_general_ci(x,y,z)
#include "strcoll.ic"
@@ -5477,19 +5289,28 @@ my_weight_mb3_utf8_general_mysql500_ci(uchar b0, uchar b1, uchar b2)
}
-#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8_general_mysql500_ci
-#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
-#define WEIGHT_MB1(x) my_weight_mb1_utf8_general_mysql500_ci(x)
-#define WEIGHT_MB2(x,y) my_weight_mb2_utf8_general_mysql500_ci(x,y)
-#define WEIGHT_MB3(x,y,z) my_weight_mb3_utf8_general_mysql500_ci(x,y,z)
+#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8_general_mysql500_ci
+#define DEFINE_STRNXFRM_UNICODE
+#define MY_MB_WC(cs, pwc, s, e) my_mb_wc_utf8mb3_quick(pwc, s, e)
+#define OPTIMIZE_ASCII 1
+#define UNICASE_MAXCHAR MY_UNICASE_INFO_DEFAULT_MAXCHAR
+#define UNICASE_PAGE0 plane00_mysql500
+#define UNICASE_PAGES my_unicase_pages_mysql500
+#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
+#define WEIGHT_MB1(x) my_weight_mb1_utf8_general_mysql500_ci(x)
+#define WEIGHT_MB2(x,y) my_weight_mb2_utf8_general_mysql500_ci(x,y)
+#define WEIGHT_MB3(x,y,z) my_weight_mb3_utf8_general_mysql500_ci(x,y,z)
#include "strcoll.ic"
-#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8_bin
-#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
-#define WEIGHT_MB1(x) ((int) (uchar) (x))
-#define WEIGHT_MB2(x,y) ((int) UTF8MB2_CODE(x,y))
-#define WEIGHT_MB3(x,y,z) ((int) UTF8MB3_CODE(x,y,z))
+#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8_bin
+#define DEFINE_STRNXFRM_UNICODE_BIN2
+#define MY_MB_WC(cs, pwc, s, e) my_mb_wc_utf8mb3_quick(pwc, s, e)
+#define OPTIMIZE_ASCII 1
+#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
+#define WEIGHT_MB1(x) ((int) (uchar) (x))
+#define WEIGHT_MB2(x,y) ((int) UTF8MB2_CODE(x,y))
+#define WEIGHT_MB3(x,y,z) ((int) UTF8MB3_CODE(x,y,z))
#include "strcoll.ic"
@@ -5538,7 +5359,7 @@ static MY_COLLATION_HANDLER my_collation_utf8_general_ci_handler =
NULL, /* init */
my_strnncoll_utf8_general_ci,
my_strnncollsp_utf8_general_ci,
- my_strnxfrm_unicode,
+ my_strnxfrm_utf8_general_ci,
my_strnxfrmlen_unicode,
my_like_range_mb,
my_wildcmp_utf8,
@@ -5554,7 +5375,7 @@ static MY_COLLATION_HANDLER my_collation_utf8_general_mysql500_ci_handler =
NULL, /* init */
my_strnncoll_utf8_general_mysql500_ci,
my_strnncollsp_utf8_general_mysql500_ci,
- my_strnxfrm_unicode,
+ my_strnxfrm_utf8_general_mysql500_ci,
my_strnxfrmlen_unicode,
my_like_range_mb,
my_wildcmp_utf8,
@@ -5570,7 +5391,7 @@ static MY_COLLATION_HANDLER my_collation_utf8_bin_handler =
NULL, /* init */
my_strnncoll_utf8_bin,
my_strnncollsp_utf8_bin,
- my_strnxfrm_unicode,
+ my_strnxfrm_utf8_bin,
my_strnxfrmlen_unicode,
my_like_range_mb,
my_wildcmp_mb_bin,
@@ -5586,7 +5407,7 @@ static MY_COLLATION_HANDLER my_collation_utf8_general_nopad_ci_handler =
NULL, /* init */
my_strnncoll_utf8_general_ci,
my_strnncollsp_utf8_general_nopad_ci,
- my_strnxfrm_unicode_nopad,
+ my_strnxfrm_nopad_utf8_general_ci,
my_strnxfrmlen_unicode,
my_like_range_mb,
my_wildcmp_utf8,
@@ -5602,7 +5423,7 @@ static MY_COLLATION_HANDLER my_collation_utf8_nopad_bin_handler =
NULL, /* init */
my_strnncoll_utf8_bin,
my_strnncollsp_utf8_nopad_bin,
- my_strnxfrm_unicode_nopad,
+ my_strnxfrm_nopad_utf8_bin,
my_strnxfrmlen_unicode,
my_like_range_mb,
my_wildcmp_mb_bin,
@@ -5931,7 +5752,7 @@ static MY_COLLATION_HANDLER my_collation_cs_handler =
NULL, /* init */
my_strnncoll_utf8_cs,
my_strnncollsp_utf8_cs,
- my_strnxfrm_unicode,
+ my_strnxfrm_utf8_general_ci,
my_strnxfrmlen_unicode,
my_like_range_simple,
my_wildcmp_mb,
@@ -7216,13 +7037,30 @@ my_charlen_filename(CHARSET_INFO *cs, const uchar *str, const uchar *end)
#undef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
/* my_well_formed_char_length_filename */
+#define MY_FUNCTION_NAME(x) my_ ## x ## _filename
+#define DEFINE_STRNNCOLL 0
+#define DEFINE_STRNXFRM_UNICODE
+#define MY_MB_WC(cs, pwc, s, e) my_mb_wc_filename(cs, pwc, s, e)
+#define OPTIMIZE_ASCII 0
+#define UNICASE_MAXCHAR MY_UNICASE_INFO_DEFAULT_MAXCHAR
+#define UNICASE_PAGE0 my_unicase_default_page00
+#define UNICASE_PAGES my_unicase_default_pages
+
+/*
+#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
+#define WEIGHT_MB1(x) my_weight_mb1_utf8_general_ci(x)
+#define WEIGHT_MB2(x,y) my_weight_mb2_utf8_general_ci(x,y)
+#define WEIGHT_MB3(x,y,z) my_weight_mb3_utf8_general_ci(x,y,z)
+*/
+#include "strcoll.ic"
+
static MY_COLLATION_HANDLER my_collation_filename_handler =
{
NULL, /* init */
my_strnncoll_simple,
my_strnncollsp_simple,
- my_strnxfrm_unicode,
+ my_strnxfrm_filename,
my_strnxfrmlen_unicode,
my_like_range_mb,
my_wildcmp_utf8,
@@ -7379,52 +7217,7 @@ static int
my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
my_wc_t * pwc, const uchar *s, const uchar *e)
{
- uchar c;
-
- if (s >= e)
- return MY_CS_TOOSMALL;
-
- c= s[0];
- if (c < 0x80)
- {
- *pwc= c;
- return 1;
- }
- else if (c < 0xc2)
- return MY_CS_ILSEQ;
- else if (c < 0xe0)
- {
- if (s + 2 > e) /* We need 2 characters */
- return MY_CS_TOOSMALL2;
-
- if (!(IS_CONTINUATION_BYTE(s[1])))
- return MY_CS_ILSEQ;
-
- *pwc= UTF8MB2_CODE(c, s[1]);
- return 2;
- }
- else if (c < 0xf0)
- {
- if (s + 3 > e) /* We need 3 characters */
- return MY_CS_TOOSMALL3;
-
- if (!IS_UTF8MB3_STEP2(c, s[1], s[2]))
- return MY_CS_ILSEQ;
-
- *pwc= UTF8MB3_CODE(c, s[1], s[2]);
- return 3;
- }
- else if (c < 0xf5)
- {
- if (s + 4 > e) /* We need 4 characters */
- return MY_CS_TOOSMALL4;
-
- if (!IS_UTF8MB4_STEP2(c, s[1], s[2], s[3]))
- return MY_CS_ILSEQ;
- *pwc= UTF8MB4_CODE(c, s[1], s[2], s[3]);
- return 4;
- }
- return MY_CS_ILSEQ;
+ return my_mb_wc_utf8mb4_quick(pwc, s, e);
}
@@ -7756,7 +7549,7 @@ my_strcasecmp_utf8mb4(CHARSET_INFO *cs, const char *s, const char *t)
It represents a single byte character.
Convert it into weight according to collation.
*/
- s_wc= plane00[(uchar) s[0]].tolower;
+ s_wc= my_unicase_default_page00[(uchar) s[0]].tolower;
s++;
}
else
@@ -7780,7 +7573,7 @@ my_strcasecmp_utf8mb4(CHARSET_INFO *cs, const char *s, const char *t)
if ((uchar) t[0] < 128)
{
/* Convert single byte character into weight */
- t_wc= plane00[(uchar) t[0]].tolower;
+ t_wc= my_unicase_default_page00[(uchar) t[0]].tolower;
t++;
}
else
@@ -7851,6 +7644,13 @@ my_charlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8mb4_general_ci
+#define DEFINE_STRNXFRM_UNICODE
+#define DEFINE_STRNXFRM_UNICODE_NOPAD
+#define MY_MB_WC(cs, pwc, s, e) my_mb_wc_utf8mb4_quick(pwc, s, e)
+#define OPTIMIZE_ASCII 1
+#define UNICASE_MAXCHAR MY_UNICASE_INFO_DEFAULT_MAXCHAR
+#define UNICASE_PAGE0 my_unicase_default_page00
+#define UNICASE_PAGES my_unicase_default_pages
#define IS_MB4_CHAR(b0,b1,b2,b3) IS_UTF8MB4_STEP3(b0,b1,b2,b3)
#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
#define WEIGHT_MB1(b0) my_weight_mb1_utf8_general_ci(b0)
@@ -7901,7 +7701,7 @@ static MY_COLLATION_HANDLER my_collation_utf8mb4_general_ci_handler=
NULL, /* init */
my_strnncoll_utf8mb4_general_ci,
my_strnncollsp_utf8mb4_general_ci,
- my_strnxfrm_unicode,
+ my_strnxfrm_utf8mb4_general_ci,
my_strnxfrmlen_unicode,
my_like_range_mb,
my_wildcmp_utf8mb4,
@@ -7933,7 +7733,7 @@ static MY_COLLATION_HANDLER my_collation_utf8mb4_general_nopad_ci_handler=
NULL, /* init */
my_strnncoll_utf8mb4_general_ci,
my_strnncollsp_utf8mb4_general_nopad_ci,
- my_strnxfrm_unicode_nopad,
+ my_strnxfrm_nopad_utf8mb4_general_ci,
my_strnxfrmlen_unicode,
my_like_range_mb,
my_wildcmp_utf8mb4,