diff options
author | Alexander Nozdrin <alik@sun.com> | 2010-08-31 17:54:26 +0400 |
---|---|---|
committer | Alexander Nozdrin <alik@sun.com> | 2010-08-31 17:54:26 +0400 |
commit | 9e4928af69a01d6ef91e1c0ed1fb2d836959b773 (patch) | |
tree | 9cd301b009b02b51bfaeaa62de95ffc1fd48712a /strings | |
parent | 8c8080adfb9dfd2244562d785de7928388959f64 (diff) | |
download | mariadb-git-9e4928af69a01d6ef91e1c0ed1fb2d836959b773.tar.gz |
Bug#55980 Character sets: supplementary character _bin ordering is wrong
Problem:
- ORDER BY for utf8mb4_bin, utf16_bin and utf32_bin returned
results in the wrong order, because old functions
(supporting only the BMP range) were used to handle these collations.
- Additionally, utf16_bin did not sort supplementary characters
between U+D7FF and U+E000, as the WL#1213 specification requires.
include/m_ctype.h:
Adding prototypes.
mysql-test/include/ctype_filesort2.inc:
Adding a new shared test file.
mysql-test/t/ctype_utf8mb4.test:
Adding tests.
strings/ctype-ucs2.c:
- Fixing my_strnncoll[sp]_utf16_bin to compare
binary representation instead of code points,
so that columns with indexes sort correctly.
- Fixing my_collation_utf32_bin_handler and
my_collation_utf16_bin_handler to use the new
functions.
strings/ctype-utf8.c:
- Adding my_strnxfrm[len]_unicode_full_bin()
to handle utf8mb4_bin, utf16_bin and utf32_bin,
using 3 bytes per weight.
This function also performs special reordering in case of utf16_bin.
- Fixing my_collation_utf8mb4_bin_handler to use the
new function.
Diffstat (limited to 'strings')
-rw-r--r-- | strings/ctype-ucs2.c | 12 | ||||
-rw-r--r-- | strings/ctype-utf8.c | 76 |
2 files changed, 79 insertions, 9 deletions
diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c index d3b0b93a939..ecfac3170d1 100644 --- a/strings/ctype-ucs2.c +++ b/strings/ctype-ucs2.c @@ -1469,7 +1469,7 @@ my_strnncoll_utf16_bin(CHARSET_INFO *cs, } if (s_wc != t_wc) { - return s_wc > t_wc ? 1 : -1; + return my_bincmp(s, s + s_res, t, t + t_res); } s+= s_res; @@ -1511,7 +1511,7 @@ my_strnncollsp_utf16_bin(CHARSET_INFO *cs, if (s_wc != t_wc) { - return s_wc > t_wc ? 1 : -1; + return my_bincmp(s, s + s_res, t, t + t_res); } s+= s_res; @@ -1684,8 +1684,8 @@ static MY_COLLATION_HANDLER my_collation_utf16_bin_handler = NULL, /* init */ my_strnncoll_utf16_bin, my_strnncollsp_utf16_bin, - my_strnxfrm_unicode, - my_strnxfrmlen_simple, + my_strnxfrm_unicode_full_bin, + my_strnxfrmlen_unicode_full_bin, my_like_range_utf16, my_wildcmp_utf16_bin, my_strcasecmp_mb2_or_mb4, @@ -2711,8 +2711,8 @@ static MY_COLLATION_HANDLER my_collation_utf32_bin_handler = NULL, /* init */ my_strnncoll_utf32_bin, my_strnncollsp_utf32_bin, - my_strnxfrm_unicode, - my_strnxfrmlen_utf32, + my_strnxfrm_unicode_full_bin, + my_strnxfrmlen_unicode_full_bin, my_like_range_utf32, my_wildcmp_utf32_bin, my_strcasecmp_mb2_or_mb4, diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c index ace39130c12..76fff72290b 100644 --- a/strings/ctype-utf8.c +++ b/strings/ctype-utf8.c @@ -1893,7 +1893,13 @@ my_wildcmp_unicode(CHARSET_INFO *cs, /* - This function is shared between utf8mb3/utf8mb4/ucs2/utf16/utf32 + Store sorting weights using 2 bytes per character. + + This function is shared between + - utf8mb3_general_ci, utf8_bin, ucs2_general_ci, ucs2_bin + which support BMP only (U+0000..U+FFFF). + - utf8mb4_general_ci, utf16_general_ci, utf32_general_ci, + which map all supplementary characters to weight 0xFFFD. */ size_t my_strnxfrm_unicode(CHARSET_INFO *cs, @@ -1937,6 +1943,70 @@ my_strnxfrm_unicode(CHARSET_INFO *cs, } +/* + Store sorting weights using 3 bytes per character. + This function is shared between utf8mb4_bin, utf16_bin, utf32_bin. 
+*/ +size_t +my_strnxfrm_unicode_full_bin(CHARSET_INFO *cs, + uchar *dst, size_t dstlen, + const uchar *src, size_t srclen) +{ + my_wc_t wc; + uchar *de= dst + dstlen; + uchar *de_beg= de - 2; /* The beginning of the last chunk */ + const uchar *se = src + srclen; + + LINT_INIT(wc); + DBUG_ASSERT(src); + DBUG_ASSERT(cs->state & MY_CS_BINSORT); + + while (dst < de_beg) + { + int res; + if ((res= cs->cset->mb_wc(cs, &wc, src, se)) <= 0) + break; + src+= res; + if (cs->mbminlen == 2) /* utf16_bin */ + { + /* + Reorder code points to weights as follows: + U+0000..U+D7FF -> [00][00][00]..[00][D7][FF] BMP part #1 + U+10000..U+10FFFF -> [01][00][00]..[10][FF][FF] Supplementary + U+E000..U+FFFF -> [20][E0][00]..[20][FF][FF] BMP part #2 + */ + if (wc >= 0xE000 && wc <= 0xFFFF) + wc+= 0x200000; + } + *dst++= (uchar) (wc >> 16); + *dst++= (uchar) ((wc >> 8) & 0xFF); + *dst++= (uchar) (wc & 0xFF); + } + + while (dst < de_beg) /* Fill the tail with keys for space character */ + { + *dst++= 0x00; + *dst++= 0x00; + *dst++= 0x20; + } + + /* Clear the last one or two bytes, if "dstlen" was not divisible by 3 */ + if (dst < de) + { + *dst++= 0x00; + if (dst < de) + *dst= 0x00; + } + + return dstlen; +} + + +size_t +my_strnxfrmlen_unicode_full_bin(CHARSET_INFO *cs, size_t len) +{ + return ((len + 3) / cs->mbmaxlen) * 3; +} #endif /* HAVE_UNIDATA */ @@ -5067,8 +5137,8 @@ static MY_COLLATION_HANDLER my_collation_utf8mb4_bin_handler = NULL, /* init */ my_strnncoll_mb_bin, my_strnncollsp_mb_bin, - my_strnxfrm_unicode, - my_strnxfrmlen_utf8mb4, + my_strnxfrm_unicode_full_bin, + my_strnxfrmlen_unicode_full_bin, my_like_range_mb, my_wildcmp_mb_bin, my_strcasecmp_mb_bin, |