summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlexander Barkov <bar@mariadb.org>2015-07-07 09:15:58 +0400
committerAlexander Barkov <bar@mariadb.org>2015-07-07 09:15:58 +0400
commite4f8cea35627a8383d5d2d9b985038c960c0b19f (patch)
treebfc3f99f0c38c692f673463c2e43b1d5ce2f59a6
parenta5f4412bd44a16ba4d7ed31194716c0b59eecfeb (diff)
downloadmariadb-git-e4f8cea35627a8383d5d2d9b985038c960c0b19f.tar.gz
MDEV-8419 utf32: compare broken bytes as "greater than any non-broken character"
-rw-r--r--mysql-test/r/ctype_utf32.result18
-rw-r--r--mysql-test/t/ctype_utf32.test17
-rw-r--r--strings/ctype-ucs2.c263
-rw-r--r--unittest/strings/strings-t.c54
4 files changed, 120 insertions, 232 deletions
diff --git a/mysql-test/r/ctype_utf32.result b/mysql-test/r/ctype_utf32.result
index 0ec89a50c0f..df7b77b18a5 100644
--- a/mysql-test/r/ctype_utf32.result
+++ b/mysql-test/r/ctype_utf32.result
@@ -2206,3 +2206,21 @@ DEALLOCATE PREPARE stmt;
#
# End of 10.0 tests
#
+#
+# Start of 10.1 tests
+#
+#
+# MDEV-8419 utf32: compare broken bytes as "greater than any non-broken character"
+#
+CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET utf32, KEY(a));
+INSERT INTO t1 VALUES (0x10000),(0x10001),(0x10002);
+SELECT COUNT(DISTINCT a) FROM t1;
+COUNT(DISTINCT a)
+1
+DROP TABLE t1;
+SELECT _utf32 0x10001=_utf32 0x10002;
+_utf32 0x10001=_utf32 0x10002
+1
+#
+# End of 10.1 tests
+#
diff --git a/mysql-test/t/ctype_utf32.test b/mysql-test/t/ctype_utf32.test
index e6583f990ca..a75ac72b67f 100644
--- a/mysql-test/t/ctype_utf32.test
+++ b/mysql-test/t/ctype_utf32.test
@@ -956,3 +956,20 @@ DEALLOCATE PREPARE stmt;
--echo # End of 10.0 tests
--echo #
+--echo #
+--echo # Start of 10.1 tests
+--echo #
+
+--echo #
+--echo # MDEV-8419 utf32: compare broken bytes as "greater than any non-broken character"
+--echo #
+# Make sure that all non-BMP characters are compared as equal
+CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET utf32, KEY(a));
+INSERT INTO t1 VALUES (0x10000),(0x10001),(0x10002);
+SELECT COUNT(DISTINCT a) FROM t1;
+DROP TABLE t1;
+SELECT _utf32 0x10001=_utf32 0x10002;
+
+--echo #
+--echo # End of 10.1 tests
+--echo #
diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c
index 41f6a90506a..02adc1492c4 100644
--- a/strings/ctype-ucs2.c
+++ b/strings/ctype-ucs2.c
@@ -1892,6 +1892,34 @@ struct charset_info_st my_charset_utf16le_bin=
*/
#define IS_UTF32_MBHEAD4(b0,b1) (!(b0) && ((uchar) (b1) <= 0x10))
+#define IS_MB4_CHAR(b0,b1,b2,b3) (IS_UTF32_MBHEAD4(b0,b1))
+
+#define MY_UTF32_WC4(b0,b1,b2,b3) ((b0 << 24) + (b1 << 16) + (b2 << 8) + (b3))
+
+static inline int my_weight_utf32_general_ci(uchar b0, uchar b1,
+ uchar b2, uchar b3)
+{
+ my_wc_t wc= MY_UTF32_WC4(b0, b1, b2, b3);
+ if (wc <= 0xFFFF)
+ {
+ MY_UNICASE_CHARACTER *page= my_unicase_default.page[wc >> 8];
+ return (int) (page ? page[wc & 0xFF].sort : wc);
+ }
+ return MY_CS_REPLACEMENT_CHARACTER;
+}
+#define MY_FUNCTION_NAME(x) my_ ## x ## _utf32_general_ci
+#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
+#define WEIGHT_MB4(b0,b1,b2,b3) my_weight_utf32_general_ci(b0, b1, b2, b3)
+#include "strcoll.ic"
+
+#define MY_FUNCTION_NAME(x) my_ ## x ## _utf32_bin
+#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
+#define WEIGHT_MB4(b0,b1,b2,b3) ((int) MY_UTF32_WC4(b0, b1, b2, b3))
+#include "strcoll.ic"
+
+#undef IS_MB2_CHAR
+#undef IS_MB4_CHAR
+
static int
my_utf32_uni(CHARSET_INFO *cs __attribute__((unused)),
@@ -1899,7 +1927,7 @@ my_utf32_uni(CHARSET_INFO *cs __attribute__((unused)),
{
if (s + 4 > e)
return MY_CS_TOOSMALL4;
- *pwc= (s[0] << 24) + (s[1] << 16) + (s[2] << 8) + (s[3]);
+ *pwc= MY_UTF32_WC4(s[0], s[1], s[2], s[3]);
return *pwc > 0x10FFFF ? MY_CS_ILSEQ : 4;
}
@@ -2029,144 +2057,6 @@ my_casedn_utf32(CHARSET_INFO *cs, char *src, size_t srclen,
}
-static int
-my_strnncoll_utf32(CHARSET_INFO *cs,
- const uchar *s, size_t slen,
- const uchar *t, size_t tlen,
- my_bool t_is_prefix)
-{
- my_wc_t UNINIT_VAR(s_wc),UNINIT_VAR(t_wc);
- const uchar *se= s + slen;
- const uchar *te= t + tlen;
- MY_UNICASE_INFO *uni_plane= cs->caseinfo;
-
- while (s < se && t < te)
- {
- int s_res= my_utf32_uni(cs, &s_wc, s, se);
- int t_res= my_utf32_uni(cs, &t_wc, t, te);
-
- if ( s_res <= 0 || t_res <= 0)
- {
- /* Incorrect string, compare by char value */
- return my_bincmp(s, se, t, te);
- }
-
- my_tosort_utf32(uni_plane, &s_wc);
- my_tosort_utf32(uni_plane, &t_wc);
-
- if (s_wc != t_wc)
- {
- return s_wc > t_wc ? 1 : -1;
- }
-
- s+= s_res;
- t+= t_res;
- }
- return (int) (t_is_prefix ? (t - te) : ((se - s) - (te - t)));
-}
-
-
-/**
- Compare strings, discarding end space
-
- If one string is shorter as the other, then we space extend the other
- so that the strings have equal length.
-
- This will ensure that the following things hold:
-
- "a" == "a "
- "a\0" < "a"
- "a\0" < "a "
-
- @param cs Character set pinter.
- @param a First string to compare.
- @param a_length Length of 'a'.
- @param b Second string to compare.
- @param b_length Length of 'b'.
-
- IMPLEMENTATION
-
- @return Comparison result.
- @retval Negative number, if a less than b.
- @retval 0, if a is equal to b
- @retval Positive number, if a > b
-*/
-
-
-static int
-my_strnncollsp_utf32(CHARSET_INFO *cs,
- const uchar *s, size_t slen,
- const uchar *t, size_t tlen,
- my_bool diff_if_only_endspace_difference)
-{
- int res;
- my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc);
- const uchar *se= s + slen, *te= t + tlen;
- MY_UNICASE_INFO *uni_plane= cs->caseinfo;
-
- DBUG_ASSERT((slen % 4) == 0);
- DBUG_ASSERT((tlen % 4) == 0);
-
-#ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE
- diff_if_only_endspace_difference= FALSE;
-#endif
-
- while ( s < se && t < te )
- {
- int s_res= my_utf32_uni(cs, &s_wc, s, se);
- int t_res= my_utf32_uni(cs, &t_wc, t, te);
-
- if ( s_res <= 0 || t_res <= 0 )
- {
- /* Incorrect string, compare bytewise */
- return my_bincmp(s, se, t, te);
- }
-
- my_tosort_utf32(uni_plane, &s_wc);
- my_tosort_utf32(uni_plane, &t_wc);
-
- if ( s_wc != t_wc )
- {
- return s_wc > t_wc ? 1 : -1;
- }
-
- s+= s_res;
- t+= t_res;
- }
-
- slen= (size_t) (se - s);
- tlen= (size_t) (te - t);
- res= 0;
-
- if (slen != tlen)
- {
- int s_res, swap= 1;
- if (diff_if_only_endspace_difference)
- res= 1; /* Assume 's' is bigger */
- if (slen < tlen)
- {
- slen= tlen;
- s= t;
- se= te;
- swap= -1;
- res= -res;
- }
-
- for ( ; s < se; s+= s_res)
- {
- if ((s_res= my_utf32_uni(cs, &s_wc, s, se)) < 0)
- {
- DBUG_ASSERT(0);
- return 0;
- }
- if (s_wc != ' ')
- return (s_wc < ' ') ? -swap : swap;
- }
- }
- return res;
-}
-
-
static uint
my_ismbchar_utf32(CHARSET_INFO *cs __attribute__((unused)),
const char *b,
@@ -2578,97 +2468,6 @@ my_wildcmp_utf32_bin(CHARSET_INFO *cs,
}
-static int
-my_strnncoll_utf32_bin(CHARSET_INFO *cs,
- const uchar *s, size_t slen,
- const uchar *t, size_t tlen,
- my_bool t_is_prefix)
-{
- my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc);
- const uchar *se= s + slen;
- const uchar *te= t + tlen;
-
- while (s < se && t < te)
- {
- int s_res= my_utf32_uni(cs, &s_wc, s, se);
- int t_res= my_utf32_uni(cs, &t_wc, t, te);
-
- if (s_res <= 0 || t_res <= 0)
- {
- /* Incorrect string, compare by char value */
- return my_bincmp(s, se, t, te);
- }
- if (s_wc != t_wc)
- {
- return s_wc > t_wc ? 1 : -1;
- }
-
- s+= s_res;
- t+= t_res;
- }
- return (int) (t_is_prefix ? (t-te) : ((se - s) - (te - t)));
-}
-
-
-static inline my_wc_t
-my_utf32_get(const uchar *s)
-{
- return
- ((my_wc_t) s[0] << 24) +
- ((my_wc_t) s[1] << 16) +
- ((my_wc_t) s[2] << 8) +
- s[3];
-}
-
-
-static int
-my_strnncollsp_utf32_bin(CHARSET_INFO *cs __attribute__((unused)),
- const uchar *s, size_t slen,
- const uchar *t, size_t tlen,
- my_bool diff_if_only_endspace_difference
- __attribute__((unused)))
-{
- const uchar *se, *te;
- size_t minlen;
-
- DBUG_ASSERT((slen % 4) == 0);
- DBUG_ASSERT((tlen % 4) == 0);
-
- se= s + slen;
- te= t + tlen;
-
- for (minlen= MY_MIN(slen, tlen); minlen; minlen-= 4)
- {
- my_wc_t s_wc= my_utf32_get(s);
- my_wc_t t_wc= my_utf32_get(t);
- if (s_wc != t_wc)
- return s_wc > t_wc ? 1 : -1;
-
- s+= 4;
- t+= 4;
- }
-
- if (slen != tlen)
- {
- int swap= 1;
- if (slen < tlen)
- {
- s= t;
- se= te;
- swap= -1;
- }
-
- for ( ; s < se ; s+= 4)
- {
- my_wc_t s_wc= my_utf32_get(s);
- if (s_wc != ' ')
- return (s_wc < ' ') ? -swap : swap;
- }
- }
- return 0;
-}
-
-
static size_t
my_scan_utf32(CHARSET_INFO *cs,
const char *str, const char *end, int sequence_type)
@@ -2696,8 +2495,8 @@ my_scan_utf32(CHARSET_INFO *cs,
static MY_COLLATION_HANDLER my_collation_utf32_general_ci_handler =
{
NULL, /* init */
- my_strnncoll_utf32,
- my_strnncollsp_utf32,
+ my_strnncoll_utf32_general_ci,
+ my_strnncollsp_utf32_general_ci,
my_strnxfrm_unicode,
my_strnxfrmlen_unicode,
my_like_range_generic,
diff --git a/unittest/strings/strings-t.c b/unittest/strings/strings-t.c
index 65a7f1e1155..fe595a5c303 100644
--- a/unittest/strings/strings-t.c
+++ b/unittest/strings/strings-t.c
@@ -537,6 +537,55 @@ static STRNNCOLL_PARAM strcoll_utf16le_general_ci[]=
{CSTR("\x00\xD8\x00\xDC"), CSTR("\xFF\xDB\xFF\xDF"), 0},/* Non-BMP MB4 vs non-BMP MB4 */
{CSTR("\x00\x00"), CSTR("\x00\xD8\x01\xDC"), -1},/* U+0000 vs non-BMP MB4 */
{CSTR("\x00\x00"), CSTR("\xFF\xDB\xFF\xDF"), -1},/* U+0000 vs non-BMP MB4 */
+
+ {NULL, 0, NULL, 0, 0}
+};
+
+
+static STRNNCOLL_PARAM strcoll_utf32_common[]=
+{
+ /* Minimum character: U+0000 == _utf32 0x00000000 */
+ {CSTR("\x00\x00\x00\x00"), CSTR("\x00"), -1}, /* MB4 vs incomplete MB4 */
+ {CSTR("\x00\x00\x00\x00"), CSTR("\xFF"), -1}, /* MB4 vs incomplete MB4 */
+ {CSTR("\x00\x00\x00\x00"), CSTR("\x00\x00"), -1}, /* MB4 vs incomplete MB4 */
+ {CSTR("\x00\x00\x00\x00"), CSTR("\x00\x00\x00"),-1}, /* MB4 vs incomplete MB4 */
+ {CSTR("\x00\x00\x00\x00"), CSTR("\x00\x20\x00\x00"),-1},/* MB4 vs broken MB4 */
+ {CSTR("\x00\x00\x00\x00"), CSTR("\xFF\xFF\xFF\xFF"),-1},/* MB4 vs broken MB4 */
+
+ /* Minimum non-BMP character: U+10000 == _utf32 0x00010000 */
+ {CSTR("\x00\x01\x00\x00"), CSTR("\x00"), -1}, /* MB4 vs incomplete MB4 */
+ {CSTR("\x00\x01\x00\x00"), CSTR("\xFF"), -1}, /* MB4 vs incomplete MB4 */
+ {CSTR("\x00\x01\x00\x00"), CSTR("\x00\x00"), -1}, /* MB4 vs incomplete MB4 */
+ {CSTR("\x00\x01\x00\x00"), CSTR("\x00\x00\x00"),-1}, /* MB4 vs incomplete MB4 */
+ {CSTR("\x00\x01\x00\x00"), CSTR("\x00\x20\x00\x00"),-1},/* MB4 vs broken MB4 */
+ {CSTR("\x00\x01\x00\x00"), CSTR("\xFF\xFF\xFF\xFF"),-1},/* MB4 vs broken MB4 */
+
+ /* Maximum character: U+10FFFF == _utf32 0x0010FFFF */
+ {CSTR("\x00\x10\xFF\xFF"), CSTR("\x00"), -1}, /* MB4 vs incomplete MB4 */
+ {CSTR("\x00\x10\xFF\xFF"), CSTR("\xFF"), -1}, /* MB4 vs incomplete MB4 */
+ {CSTR("\x00\x10\xFF\xFF"), CSTR("\x00\x00"), -1}, /* MB4 vs incomplete MB4 */
+ {CSTR("\x00\x10\xFF\xFF"), CSTR("\x00\x00\x00"), -1}, /* MB4 vs incomplete MB4 */
+ {CSTR("\x00\x10\xFF\xFF"), CSTR("\x20\x00\x00\x00"),-1},/* MB4 vs broken MB3 */
+ {CSTR("\x00\x10\xFF\xFF"), CSTR("\xFF\xFF\xFF\xFF"),-1},/* MB4 vs broken MB4 */
+
+
+ /* Broken MB4 vs incomplete/broken MB3 */
+ {CSTR("\x00\x20\x00\x00"), CSTR("\x00"), 1}, /* Broken MB4 vs incomplete MB4 */
+ {CSTR("\x00\x20\x00\x00"), CSTR("\x00\x00"), 1}, /* Broken MB4 vs incomplete MB4 */
+ {CSTR("\x00\x20\x00\x00"), CSTR("\x00\x00\x00"), 1}, /* Broken MB4 vs incomplete MB4 */
+ {CSTR("\x00\x20\x00\x00"), CSTR("\x00\x20\x00\x01"),-1},/* Broken MB4 vs broken MB4 */
+
+ {NULL, 0, NULL, 0, 0}
+};
+
+
+static STRNNCOLL_PARAM strcoll_utf32_general_ci[]=
+{
+ /* Two non-BMP characters are compared as equal */
+ {CSTR("\x00\x01\x00\x00"), CSTR("\x00\x01\x00\x01"), 0}, /* non-BMP MB4 vs non-BMP MB4 */
+ {CSTR("\x00\x00\x00\x00"), CSTR("\x00\x01\x00\x00"), -1}, /* U+0000 vs non-BMP MB4 */
+ {CSTR("\x00\x00\x00\x00"), CSTR("\x00\x01\x00\x01"), -1}, /* U+0000 vs non-BMP MB4 */
+
{NULL, 0, NULL, 0, 0}
};
@@ -688,6 +737,11 @@ test_strcollsp()
failed+= strcollsp(&my_charset_utf16le_bin, strcoll_utf16le_space);
failed+= strcollsp(&my_charset_utf16le_bin, strcoll_utf16le_common);
#endif
+#ifdef HAVE_CHARSET_utf32
+ failed+= strcollsp(&my_charset_utf32_general_ci, strcoll_utf32_common);
+ failed+= strcollsp(&my_charset_utf32_general_ci, strcoll_utf32_general_ci);
+ failed+= strcollsp(&my_charset_utf32_bin, strcoll_utf32_common);
+#endif
#ifdef HAVE_CHARSET_utf8
failed+= strcollsp(&my_charset_utf8_general_ci, strcoll_utf8mb3_common);
failed+= strcollsp(&my_charset_utf8_general_mysql500_ci, strcoll_utf8mb3_common);