summaryrefslogtreecommitdiff
path: root/strings
diff options
context:
space:
mode:
author: Alexander Nozdrin <alik@sun.com> 2010-08-31 17:54:26 +0400
committer: Alexander Nozdrin <alik@sun.com> 2010-08-31 17:54:26 +0400
commit: 9e4928af69a01d6ef91e1c0ed1fb2d836959b773 (patch)
tree: 9cd301b009b02b51bfaeaa62de95ffc1fd48712a /strings
parent: 8c8080adfb9dfd2244562d785de7928388959f64 (diff)
download: mariadb-git-9e4928af69a01d6ef91e1c0ed1fb2d836959b773.tar.gz
Bug#55980 Character sets: supplementary character _bin ordering is wrong
Problem: - ORDER BY for utf8mb4_bin, utf16_bin and utf32_bin returned results in the wrong order, because old functions (supporting only the BMP range) were used to handle these collations. - Additionally, utf16_bin did not sort supplementary characters between U+D7FF and U+E000, as the WL#1213 specification requires. include/m_ctype.h: Adding prototypes. mysql-test/include/ctype_filesort2.inc: Adding a new shared test file. mysql-test/t/ctype_utf8mb4.test: Adding tests. strings/ctype-ucs2.c: - Fixing my_strnncoll[sp]_utf16_bin to compare the binary representation instead of code points, so that columns with indexes sort correctly. - Fixing my_collation_handler_utf32_bin and my_collation_handler_utf16_bin to use the new functions. strings/ctype-utf8.c: - Adding my_strnxfrm[len]_unicode_full_bin() to handle utf8mb4_bin, utf16_bin and utf32_bin, using 3 bytes per weight. This function also performs special reordering in the case of utf16_bin. - Fixing the my_collation_utf8mb4_bin handler to use the new function.
Diffstat (limited to 'strings')
-rw-r--r-- strings/ctype-ucs2.c 12
-rw-r--r-- strings/ctype-utf8.c 76
2 files changed, 79 insertions, 9 deletions
diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c
index d3b0b93a939..ecfac3170d1 100644
--- a/strings/ctype-ucs2.c
+++ b/strings/ctype-ucs2.c
@@ -1469,7 +1469,7 @@ my_strnncoll_utf16_bin(CHARSET_INFO *cs,
}
if (s_wc != t_wc)
{
- return s_wc > t_wc ? 1 : -1;
+ return my_bincmp(s, s + s_res, t, t + t_res);
}
s+= s_res;
@@ -1511,7 +1511,7 @@ my_strnncollsp_utf16_bin(CHARSET_INFO *cs,
if (s_wc != t_wc)
{
- return s_wc > t_wc ? 1 : -1;
+ return my_bincmp(s, s + s_res, t, t + t_res);
}
s+= s_res;
@@ -1684,8 +1684,8 @@ static MY_COLLATION_HANDLER my_collation_utf16_bin_handler =
NULL, /* init */
my_strnncoll_utf16_bin,
my_strnncollsp_utf16_bin,
- my_strnxfrm_unicode,
- my_strnxfrmlen_simple,
+ my_strnxfrm_unicode_full_bin,
+ my_strnxfrmlen_unicode_full_bin,
my_like_range_utf16,
my_wildcmp_utf16_bin,
my_strcasecmp_mb2_or_mb4,
@@ -2711,8 +2711,8 @@ static MY_COLLATION_HANDLER my_collation_utf32_bin_handler =
NULL, /* init */
my_strnncoll_utf32_bin,
my_strnncollsp_utf32_bin,
- my_strnxfrm_unicode,
- my_strnxfrmlen_utf32,
+ my_strnxfrm_unicode_full_bin,
+ my_strnxfrmlen_unicode_full_bin,
my_like_range_utf32,
my_wildcmp_utf32_bin,
my_strcasecmp_mb2_or_mb4,
diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c
index ace39130c12..76fff72290b 100644
--- a/strings/ctype-utf8.c
+++ b/strings/ctype-utf8.c
@@ -1893,7 +1893,13 @@ my_wildcmp_unicode(CHARSET_INFO *cs,
/*
- This function is shared between utf8mb3/utf8mb4/ucs2/utf16/utf32
+ Store sorting weights using 2 bytes per character.
+
+ This function is shared between
+ - utf8mb3_general_ci, utf8_bin, ucs2_general_ci, ucs2_bin
+ which support BMP only (U+0000..U+FFFF).
+ - utf8mb4_general_ci, utf16_general_ci, utf32_general_ci,
+ which map all supplementary characters to weight 0xFFFD.
*/
size_t
my_strnxfrm_unicode(CHARSET_INFO *cs,
@@ -1937,6 +1943,70 @@ my_strnxfrm_unicode(CHARSET_INFO *cs,
}
+/*
+ Store sorting weights using 3 bytes per character.
+ This function is shared between utf8mb4_bin, utf16_bin, utf32_bin.
+*/
+size_t
+my_strnxfrm_unicode_full_bin(CHARSET_INFO *cs,
+ uchar *dst, size_t dstlen,
+ const uchar *src, size_t srclen)
+{
+ my_wc_t wc;
+ uchar *de= dst + dstlen;
+ uchar *de_beg= de - 2; /* The beginning of the last chunk */
+ const uchar *se = src + srclen;
+
+ LINT_INIT(wc);
+ DBUG_ASSERT(src);
+ DBUG_ASSERT(cs->state & MY_CS_BINSORT);
+
+ while (dst < de_beg)
+ {
+ int res;
+ if ((res= cs->cset->mb_wc(cs, &wc, src, se)) <= 0)
+ break;
+ src+= res;
+ if (cs->mbminlen == 2) /* utf16_bin */
+ {
+ /*
+ Reorder code points to weights as follows:
+ U+0000..U+D7FF -> [00][00][00]..[00][D7][FF] BMP part #1
+ U+10000..U+10FFFF -> [01][00][00]..[10][FF][FF] Supplementary
+ U+E000..U+FFFF -> [20][E0][00]..[20][FF][FF] BMP part #2
+ */
+ if (wc >= 0xE000 && wc <= 0xFFFF)
+ wc+= 0x200000;
+ }
+ *dst++= (uchar) (wc >> 16);
+ *dst++= (uchar) ((wc >> 8) & 0xFF);
+ *dst++= (uchar) (wc & 0xFF);
+ }
+
+ while (dst < de_beg) /* Fill the tail with keys for space character */
+ {
+ *dst++= 0x00;
+ *dst++= 0x00;
+ *dst++= 0x20;
+ }
+
+ /* Clear the last one or two bytes, if "dstlen" was not divisible by 3 */
+ if (dst < de)
+ {
+ *dst++= 0x00;
+ if (dst < de)
+ *dst= 0x00;
+ }
+
+ return dstlen;
+}
+
+
+size_t
+my_strnxfrmlen_unicode_full_bin(CHARSET_INFO *cs, size_t len)
+{
+ return ((len + 3) / cs->mbmaxlen) * 3;
+}
#endif /* HAVE_UNIDATA */
@@ -5067,8 +5137,8 @@ static MY_COLLATION_HANDLER my_collation_utf8mb4_bin_handler =
NULL, /* init */
my_strnncoll_mb_bin,
my_strnncollsp_mb_bin,
- my_strnxfrm_unicode,
- my_strnxfrmlen_utf8mb4,
+ my_strnxfrm_unicode_full_bin,
+ my_strnxfrmlen_unicode_full_bin,
my_like_range_mb,
my_wildcmp_mb_bin,
my_strcasecmp_mb_bin,