Bug#55980 Character sets: supplementary character _bin ordering is wrong

Problem: - ORDER BY for utf8mb4_bin, utf16_bin and utf32_bin returned results in the wrong order, because old functions (supporting only the BMP range) were used to handle these collations. - Additionally, utf16_bin did not sort supplementary characters between U+D7FF and U+E000, as the WL#1213 specification requires. include/m_ctype.h: Adding prototypes. mysql-test/include/ctype_filesort2.inc: Adding a new shared test file. mysql-test/t/ctype_utf8mb4.test: Adding tests. strings/ctype-ucs2.c: - Fixing my_strnncoll[sp]_utf16_bin to compare the binary representation instead of code points, to make columns with indexes sort correctly. - Fixing my_collation_utf32_bin_handler and my_collation_utf16_bin_handler to use the new functions. strings/ctype-utf8.c: - Adding my_strnxfrm[len]_unicode_full_bin() to handle utf8mb4_bin, utf16_bin and utf32_bin, using 3 bytes per weight. This function also performs special reordering in the case of utf16_bin. - Fixing my_collation_utf8mb4_bin_handler to use the new functions.
author: Alexander Nozdrin <alik@sun.com> 2010-08-31 17:54:26 +0400
committer: Alexander Nozdrin <alik@sun.com> 2010-08-31 17:54:26 +0400
commit: 9e4928af69a01d6ef91e1c0ed1fb2d836959b773 (patch)
tree: 9cd301b009b02b51bfaeaa62de95ffc1fd48712a /strings
parent: 8c8080adfb9dfd2244562d785de7928388959f64 (diff)
download: mariadb-git-9e4928af69a01d6ef91e1c0ed1fb2d836959b773.tar.gz
2 files changed, 79 insertions, 9 deletions
diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c
index d3b0b93a939..ecfac3170d1 100644
--- a/strings/ctype-ucs2.c
+++ b/strings/ctype-ucs2.c
@@ -1469,7 +1469,7 @@ my_strnncoll_utf16_bin(CHARSET_INFO *cs,
     }
     if (s_wc != t_wc)
     {
-      return  s_wc > t_wc ? 1 : -1;
+      return  my_bincmp(s, s + s_res, t, t + t_res);
     }
     
     s+= s_res;
@@ -1511,7 +1511,7 @@ my_strnncollsp_utf16_bin(CHARSET_INFO *cs,
 
     if (s_wc != t_wc)
     {
-      return s_wc > t_wc ? 1 : -1;
+      return my_bincmp(s, s + s_res, t, t + t_res);
     }
 
     s+= s_res;
@@ -1684,8 +1684,8 @@ static MY_COLLATION_HANDLER my_collation_utf16_bin_handler =
   NULL,                /* init */
   my_strnncoll_utf16_bin,
   my_strnncollsp_utf16_bin,
-  my_strnxfrm_unicode,
-  my_strnxfrmlen_simple,
+  my_strnxfrm_unicode_full_bin,
+  my_strnxfrmlen_unicode_full_bin,
   my_like_range_utf16,
   my_wildcmp_utf16_bin,
   my_strcasecmp_mb2_or_mb4,
@@ -2711,8 +2711,8 @@ static MY_COLLATION_HANDLER my_collation_utf32_bin_handler =
   NULL, /* init */
   my_strnncoll_utf32_bin,
   my_strnncollsp_utf32_bin,
-  my_strnxfrm_unicode,
-  my_strnxfrmlen_utf32,
+  my_strnxfrm_unicode_full_bin,
+  my_strnxfrmlen_unicode_full_bin,
   my_like_range_utf32,
   my_wildcmp_utf32_bin,
   my_strcasecmp_mb2_or_mb4,
diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c
index ace39130c12..76fff72290b 100644
--- a/strings/ctype-utf8.c
+++ b/strings/ctype-utf8.c
@@ -1893,7 +1893,13 @@ my_wildcmp_unicode(CHARSET_INFO *cs,
 
 
 /*
-  This function is shared between utf8mb3/utf8mb4/ucs2/utf16/utf32
+  Store sorting weights using 2 bytes per character.
+
+  This function is shared between
+  - utf8mb3_general_ci, utf8_bin, ucs2_general_ci, ucs2_bin
+    which support BMP only (U+0000..U+FFFF).
+  - utf8mb4_general_ci, utf16_general_ci, utf32_general_ci,
+    which map all supplementary characters to weight 0xFFFD.
 */
 size_t
 my_strnxfrm_unicode(CHARSET_INFO *cs,
@@ -1937,6 +1943,70 @@ my_strnxfrm_unicode(CHARSET_INFO *cs,
 }
 
 
+/*
+  Store sorting weights using 3 bytes per character; pad the tail with the
+  weight of space and return dstlen. Shared by utf8mb4_bin/utf16_bin/utf32_bin.
+*/
+size_t
+my_strnxfrm_unicode_full_bin(CHARSET_INFO *cs,
+                             uchar *dst, size_t dstlen,
+                             const uchar *src, size_t srclen)
+{
+  my_wc_t wc;
+  uchar *de= dst + dstlen;
+  uchar *de_beg= dstlen >= 2 ? de - 2 : dst; /* clamped: never before dst */
+  const uchar *se = src + srclen;
+
+  LINT_INIT(wc);
+  DBUG_ASSERT(src);
+  DBUG_ASSERT(cs->state & MY_CS_BINSORT);
+
+  while (dst < de_beg) /* i.e. at least 3 bytes are left in dst */
+  {
+    int res;
+    if ((res= cs->cset->mb_wc(cs, &wc, src, se)) <= 0)
+      break; /* end of input or undecodable sequence: pad the rest */
+    src+= res;
+    if (cs->mbminlen == 2) /* utf16_bin */
+    {
+      /*
+        Reorder code points to weights as follows:
+        U+0000..U+D7FF    -> [00][00][00]..[00][D7][FF] BMP part #1
+        U+10000..U+10FFFF -> [01][00][00]..[10][FF][FF] Supplementary
+        U+E000..U+FFFF    -> [20][E0][00]..[20][FF][FF] BMP part #2
+      */
+      if (wc >= 0xE000 && wc <= 0xFFFF)
+        wc+= 0x200000;
+    }
+    *dst++= (uchar) (wc >> 16);          /* weight is stored big-endian */
+    *dst++= (uchar) ((wc >> 8) & 0xFF);
+    *dst++= (uchar) (wc & 0xFF);
+  }
+
+  while (dst < de_beg) /* Fill the tail with keys for space character */
+  {
+    *dst++= 0x00;
+    *dst++= 0x00;
+    *dst++= 0x20;
+  }
+
+  /* Clear the last one or two bytes, if "dstlen" was not divisible by 3 */
+  if (dst < de)
+  {
+    *dst++= 0x00;
+    if (dst < de)
+      *dst= 0x00;
+  }
+
+  return dstlen;
+}
+
+
+size_t
+my_strnxfrmlen_unicode_full_bin(CHARSET_INFO *cs, size_t len)
+{
+  return 3 * ((len + 3) / cs->mbmaxlen); /* 3 bytes per mbmaxlen-byte unit */
+}
 #endif /* HAVE_UNIDATA */
 
 
@@ -5067,8 +5137,8 @@ static MY_COLLATION_HANDLER my_collation_utf8mb4_bin_handler =
     NULL,		/* init */
     my_strnncoll_mb_bin,
     my_strnncollsp_mb_bin,
-    my_strnxfrm_unicode,
-    my_strnxfrmlen_utf8mb4,
+    my_strnxfrm_unicode_full_bin,
+    my_strnxfrmlen_unicode_full_bin,
     my_like_range_mb,
     my_wildcmp_mb_bin,
     my_strcasecmp_mb_bin,
author	Alexander Nozdrin <alik@sun.com>	2010-08-31 17:54:26 +0400
committer	Alexander Nozdrin <alik@sun.com>	2010-08-31 17:54:26 +0400
commit	9e4928af69a01d6ef91e1c0ed1fb2d836959b773 (patch)
tree	9cd301b009b02b51bfaeaa62de95ffc1fd48712a /strings
parent	8c8080adfb9dfd2244562d785de7928388959f64 (diff)
download	mariadb-git-9e4928af69a01d6ef91e1c0ed1fb2d836959b773.tar.gz