diff options
author | Alexander Barkov <bar@mariadb.org> | 2015-07-07 09:15:58 +0400 |
---|---|---|
committer | Alexander Barkov <bar@mariadb.org> | 2015-07-07 09:15:58 +0400 |
commit | e4f8cea35627a8383d5d2d9b985038c960c0b19f (patch) | |
tree | bfc3f99f0c38c692f673463c2e43b1d5ce2f59a6 | |
parent | a5f4412bd44a16ba4d7ed31194716c0b59eecfeb (diff) | |
download | mariadb-git-e4f8cea35627a8383d5d2d9b985038c960c0b19f.tar.gz |
MDEV-8419 utf32: compare broken bytes as "greater than any non-broken character"
-rw-r--r-- | mysql-test/r/ctype_utf32.result | 18 | ||||
-rw-r--r-- | mysql-test/t/ctype_utf32.test | 17 | ||||
-rw-r--r-- | strings/ctype-ucs2.c | 263 | ||||
-rw-r--r-- | unittest/strings/strings-t.c | 54 |
4 files changed, 120 insertions, 232 deletions
diff --git a/mysql-test/r/ctype_utf32.result b/mysql-test/r/ctype_utf32.result index 0ec89a50c0f..df7b77b18a5 100644 --- a/mysql-test/r/ctype_utf32.result +++ b/mysql-test/r/ctype_utf32.result @@ -2206,3 +2206,21 @@ DEALLOCATE PREPARE stmt; # # End of 10.0 tests # +# +# Start of 10.1 tests +# +# +# MDEV-8419 utf32: compare broken bytes as "greater than any non-broken character" +# +CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET utf32, KEY(a)); +INSERT INTO t1 VALUES (0x10000),(0x10001),(0x10002); +SELECT COUNT(DISTINCT a) FROM t1; +COUNT(DISTINCT a) +1 +DROP TABLE t1; +SELECT _utf32 0x10001=_utf32 0x10002; +_utf32 0x10001=_utf32 0x10002 +1 +# +# End of 10.1 tests +# diff --git a/mysql-test/t/ctype_utf32.test b/mysql-test/t/ctype_utf32.test index e6583f990ca..a75ac72b67f 100644 --- a/mysql-test/t/ctype_utf32.test +++ b/mysql-test/t/ctype_utf32.test @@ -956,3 +956,20 @@ DEALLOCATE PREPARE stmt; --echo # End of 10.0 tests --echo # +--echo # +--echo # Start of 10.1 tests +--echo # + +--echo # +--echo # MDEV-8419 utf32: compare broken bytes as "greater than any non-broken character" +--echo # +# Make sure that all non-BMP characters are compared as equal +CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET utf32, KEY(a)); +INSERT INTO t1 VALUES (0x10000),(0x10001),(0x10002); +SELECT COUNT(DISTINCT a) FROM t1; +DROP TABLE t1; +SELECT _utf32 0x10001=_utf32 0x10002; + +--echo # +--echo # End of 10.1 tests +--echo # diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c index 41f6a90506a..02adc1492c4 100644 --- a/strings/ctype-ucs2.c +++ b/strings/ctype-ucs2.c @@ -1892,6 +1892,34 @@ struct charset_info_st my_charset_utf16le_bin= */ #define IS_UTF32_MBHEAD4(b0,b1) (!(b0) && ((uchar) (b1) <= 0x10)) +#define IS_MB4_CHAR(b0,b1,b2,b3) (IS_UTF32_MBHEAD4(b0,b1)) + +#define MY_UTF32_WC4(b0,b1,b2,b3) ((b0 << 24) + (b1 << 16) + (b2 << 8) + (b3)) + +static inline int my_weight_utf32_general_ci(uchar b0, uchar b1, + uchar b2, uchar b3) +{ + my_wc_t wc= MY_UTF32_WC4(b0, b1, b2, b3); + if (wc <= 0xFFFF) + { + MY_UNICASE_CHARACTER *page= my_unicase_default.page[wc >> 8]; + return (int) (page ? page[wc & 0xFF].sort : wc); + } + return MY_CS_REPLACEMENT_CHARACTER; +} +#define MY_FUNCTION_NAME(x) my_ ## x ## _utf32_general_ci +#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) +#define WEIGHT_MB4(b0,b1,b2,b3) my_weight_utf32_general_ci(b0, b1, b2, b3) +#include "strcoll.ic" + +#define MY_FUNCTION_NAME(x) my_ ## x ## _utf32_bin +#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) +#define WEIGHT_MB4(b0,b1,b2,b3) ((int) MY_UTF32_WC4(b0, b1, b2, b3)) +#include "strcoll.ic" + +#undef IS_MB2_CHAR +#undef IS_MB4_CHAR + static int my_utf32_uni(CHARSET_INFO *cs __attribute__((unused)), @@ -1899,7 +1927,7 @@ my_utf32_uni(CHARSET_INFO *cs __attribute__((unused)), { if (s + 4 > e) return MY_CS_TOOSMALL4; - *pwc= (s[0] << 24) + (s[1] << 16) + (s[2] << 8) + (s[3]); + *pwc= MY_UTF32_WC4(s[0], s[1], s[2], s[3]); return *pwc > 0x10FFFF ? MY_CS_ILSEQ : 4; } @@ -2029,144 +2057,6 @@ my_casedn_utf32(CHARSET_INFO *cs, char *src, size_t srclen, } -static int -my_strnncoll_utf32(CHARSET_INFO *cs, - const uchar *s, size_t slen, - const uchar *t, size_t tlen, - my_bool t_is_prefix) -{ - my_wc_t UNINIT_VAR(s_wc),UNINIT_VAR(t_wc); - const uchar *se= s + slen; - const uchar *te= t + tlen; - MY_UNICASE_INFO *uni_plane= cs->caseinfo; - - while (s < se && t < te) - { - int s_res= my_utf32_uni(cs, &s_wc, s, se); - int t_res= my_utf32_uni(cs, &t_wc, t, te); - - if ( s_res <= 0 || t_res <= 0) - { - /* Incorrect string, compare by char value */ - return my_bincmp(s, se, t, te); - } - - my_tosort_utf32(uni_plane, &s_wc); - my_tosort_utf32(uni_plane, &t_wc); - - if (s_wc != t_wc) - { - return s_wc > t_wc ? 1 : -1; - } - - s+= s_res; - t+= t_res; - } - return (int) (t_is_prefix ? (t - te) : ((se - s) - (te - t))); -} - - -/** - Compare strings, discarding end space - - If one string is shorter as the other, then we space extend the other - so that the strings have equal length. - - This will ensure that the following things hold: - - "a" == "a " - "a\0" < "a" - "a\0" < "a " - - @param cs Character set pinter. - @param a First string to compare. - @param a_length Length of 'a'. - @param b Second string to compare. - @param b_length Length of 'b'. - - IMPLEMENTATION - - @return Comparison result. - @retval Negative number, if a less than b. - @retval 0, if a is equal to b - @retval Positive number, if a > b -*/ - - -static int -my_strnncollsp_utf32(CHARSET_INFO *cs, - const uchar *s, size_t slen, - const uchar *t, size_t tlen, - my_bool diff_if_only_endspace_difference) -{ - int res; - my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc); - const uchar *se= s + slen, *te= t + tlen; - MY_UNICASE_INFO *uni_plane= cs->caseinfo; - - DBUG_ASSERT((slen % 4) == 0); - DBUG_ASSERT((tlen % 4) == 0); - -#ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE - diff_if_only_endspace_difference= FALSE; -#endif - - while ( s < se && t < te ) - { - int s_res= my_utf32_uni(cs, &s_wc, s, se); - int t_res= my_utf32_uni(cs, &t_wc, t, te); - - if ( s_res <= 0 || t_res <= 0 ) - { - /* Incorrect string, compare bytewise */ - return my_bincmp(s, se, t, te); - } - - my_tosort_utf32(uni_plane, &s_wc); - my_tosort_utf32(uni_plane, &t_wc); - - if ( s_wc != t_wc ) - { - return s_wc > t_wc ? 1 : -1; - } - - s+= s_res; - t+= t_res; - } - - slen= (size_t) (se - s); - tlen= (size_t) (te - t); - res= 0; - - if (slen != tlen) - { - int s_res, swap= 1; - if (diff_if_only_endspace_difference) - res= 1; /* Assume 's' is bigger */ - if (slen < tlen) - { - slen= tlen; - s= t; - se= te; - swap= -1; - res= -res; - } - - for ( ; s < se; s+= s_res) - { - if ((s_res= my_utf32_uni(cs, &s_wc, s, se)) < 0) - { - DBUG_ASSERT(0); - return 0; - } - if (s_wc != ' ') - return (s_wc < ' ') ? -swap : swap; - } - } - return res; -} - - static uint my_ismbchar_utf32(CHARSET_INFO *cs __attribute__((unused)), const char *b, @@ -2578,97 +2468,6 @@ my_wildcmp_utf32_bin(CHARSET_INFO *cs, } -static int -my_strnncoll_utf32_bin(CHARSET_INFO *cs, - const uchar *s, size_t slen, - const uchar *t, size_t tlen, - my_bool t_is_prefix) -{ - my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc); - const uchar *se= s + slen; - const uchar *te= t + tlen; - - while (s < se && t < te) - { - int s_res= my_utf32_uni(cs, &s_wc, s, se); - int t_res= my_utf32_uni(cs, &t_wc, t, te); - - if (s_res <= 0 || t_res <= 0) - { - /* Incorrect string, compare by char value */ - return my_bincmp(s, se, t, te); - } - if (s_wc != t_wc) - { - return s_wc > t_wc ? 1 : -1; - } - - s+= s_res; - t+= t_res; - } - return (int) (t_is_prefix ? (t-te) : ((se - s) - (te - t))); -} - - -static inline my_wc_t -my_utf32_get(const uchar *s) -{ - return - ((my_wc_t) s[0] << 24) + - ((my_wc_t) s[1] << 16) + - ((my_wc_t) s[2] << 8) + - s[3]; -} - - -static int -my_strnncollsp_utf32_bin(CHARSET_INFO *cs __attribute__((unused)), - const uchar *s, size_t slen, - const uchar *t, size_t tlen, - my_bool diff_if_only_endspace_difference - __attribute__((unused))) -{ - const uchar *se, *te; - size_t minlen; - - DBUG_ASSERT((slen % 4) == 0); - DBUG_ASSERT((tlen % 4) == 0); - - se= s + slen; - te= t + tlen; - - for (minlen= MY_MIN(slen, tlen); minlen; minlen-= 4) - { - my_wc_t s_wc= my_utf32_get(s); - my_wc_t t_wc= my_utf32_get(t); - if (s_wc != t_wc) - return s_wc > t_wc ? 1 : -1; - - s+= 4; - t+= 4; - } - - if (slen != tlen) - { - int swap= 1; - if (slen < tlen) - { - s= t; - se= te; - swap= -1; - } - - for ( ; s < se ; s+= 4) - { - my_wc_t s_wc= my_utf32_get(s); - if (s_wc != ' ') - return (s_wc < ' ') ? -swap : swap; - } - } - return 0; -} - - static size_t my_scan_utf32(CHARSET_INFO *cs, const char *str, const char *end, int sequence_type) @@ -2696,8 +2495,8 @@ my_scan_utf32(CHARSET_INFO *cs, static MY_COLLATION_HANDLER my_collation_utf32_general_ci_handler = { NULL, /* init */ - my_strnncoll_utf32, - my_strnncollsp_utf32, + my_strnncoll_utf32_general_ci, + my_strnncollsp_utf32_general_ci, my_strnxfrm_unicode, my_strnxfrmlen_unicode, my_like_range_generic, diff --git a/unittest/strings/strings-t.c b/unittest/strings/strings-t.c index 65a7f1e1155..fe595a5c303 100644 --- a/unittest/strings/strings-t.c +++ b/unittest/strings/strings-t.c @@ -537,6 +537,55 @@ static STRNNCOLL_PARAM strcoll_utf16le_general_ci[]= {CSTR("\x00\xD8\x00\xDC"), CSTR("\xFF\xDB\xFF\xDF"), 0},/* Non-BMP MB4 vs non-BMP MB4 */ {CSTR("\x00\x00"), CSTR("\x00\xD8\x01\xDC"), -1},/* U+0000 vs non-BMP MB4 */ {CSTR("\x00\x00"), CSTR("\xFF\xDB\xFF\xDF"), -1},/* U+0000 vs non-BMP MB4 */ + + {NULL, 0, NULL, 0, 0} +}; + + +static STRNNCOLL_PARAM strcoll_utf32_common[]= +{ + /* Minimum character: U+0000 == _utf32 0x00000000 */ + {CSTR("\x00\x00\x00\x00"), CSTR("\x00"), -1}, /* MB4 vs incomplete MB4 */ + {CSTR("\x00\x00\x00\x00"), CSTR("\xFF"), -1}, /* MB4 vs incomplete MB4 */ + {CSTR("\x00\x00\x00\x00"), CSTR("\x00\x00"), -1}, /* MB4 vs incomplete MB4 */ + {CSTR("\x00\x00\x00\x00"), CSTR("\x00\x00\x00"),-1}, /* MB4 vs incomplete MB4 */ + {CSTR("\x00\x00\x00\x00"), CSTR("\x00\x20\x00\x00"),-1},/* MB4 vs broken MB4 */ + {CSTR("\x00\x00\x00\x00"), CSTR("\xFF\xFF\xFF\xFF"),-1},/* MB4 vs broken MB4 */ + + /* Minimum non-BMP character: U+10000 == _utf32 0x00010000 */ + {CSTR("\x00\x01\x00\x00"), CSTR("\x00"), -1}, /* MB4 vs incomplete MB4 */ + {CSTR("\x00\x01\x00\x00"), CSTR("\xFF"), -1}, /* MB4 vs incomplete MB4 */ + {CSTR("\x00\x01\x00\x00"), CSTR("\x00\x00"), -1}, /* MB4 vs incomplete MB4 */ + {CSTR("\x00\x01\x00\x00"), CSTR("\x00\x00\x00"),-1}, /* MB4 vs incomplete MB4 */ + {CSTR("\x00\x01\x00\x00"), CSTR("\x00\x20\x00\x00"),-1},/* MB4 vs broken MB4 */ + {CSTR("\x00\x01\x00\x00"), CSTR("\xFF\xFF\xFF\xFF"),-1},/* MB4 vs broken MB4 */ + + /* Maximum character: U+10FFFF == _utf32 0x0010FFFF */ + {CSTR("\x00\x10\xFF\xFF"), CSTR("\x00"), -1}, /* MB4 vs incomplete MB4 */ + {CSTR("\x00\x10\xFF\xFF"), CSTR("\xFF"), -1}, /* MB4 vs incomplete MB4 */ + {CSTR("\x00\x10\xFF\xFF"), CSTR("\x00\x00"), -1}, /* MB4 vs incomplete MB4 */ + {CSTR("\x00\x10\xFF\xFF"), CSTR("\x00\x00\x00"), -1}, /* MB4 vs incomplete MB4 */ + {CSTR("\x00\x10\xFF\xFF"), CSTR("\x20\x00\x00\x00"),-1},/* MB4 vs broken MB3 */ + {CSTR("\x00\x10\xFF\xFF"), CSTR("\xFF\xFF\xFF\xFF"),-1},/* MB4 vs broken MB4 */ + + + /* Broken MB4 vs incomplete/broken MB3 */ + {CSTR("\x00\x20\x00\x00"), CSTR("\x00"), 1}, /* Broken MB4 vs incomplete MB4 */ + {CSTR("\x00\x20\x00\x00"), CSTR("\x00\x00"), 1}, /* Broken MB4 vs incomplete MB4 */ + {CSTR("\x00\x20\x00\x00"), CSTR("\x00\x00\x00"), 1}, /* Broken MB4 vs incomplete MB4 */ + {CSTR("\x00\x20\x00\x00"), CSTR("\x00\x20\x00\x01"),-1},/* Broken MB4 vs broken MB4 */ + + {NULL, 0, NULL, 0, 0} +}; + + +static STRNNCOLL_PARAM strcoll_utf32_general_ci[]= +{ + /* Two non-BMP characters are compared as equal */ + {CSTR("\x00\x01\x00\x00"), CSTR("\x00\x01\x00\x01"), 0}, /* non-BMP MB4 vs non-BMP MB4 */ + {CSTR("\x00\x00\x00\x00"), CSTR("\x00\x01\x00\x00"), -1}, /* U+0000 vs non-BMP MB4 */ + {CSTR("\x00\x00\x00\x00"), CSTR("\x00\x01\x00\x01"), -1}, /* U+0000 vs non-BMP MB4 */ + {NULL, 0, NULL, 0, 0} }; @@ -688,6 +737,11 @@ test_strcollsp() failed+= strcollsp(&my_charset_utf16le_bin, strcoll_utf16le_space); failed+= strcollsp(&my_charset_utf16le_bin, strcoll_utf16le_common); #endif +#ifdef HAVE_CHARSET_utf32 + failed+= strcollsp(&my_charset_utf32_general_ci, strcoll_utf32_common); + failed+= strcollsp(&my_charset_utf32_general_ci, strcoll_utf32_general_ci); + failed+= strcollsp(&my_charset_utf32_bin, strcoll_utf32_common); +#endif #ifdef HAVE_CHARSET_utf8 failed+= strcollsp(&my_charset_utf8_general_ci, strcoll_utf8mb3_common); failed+= strcollsp(&my_charset_utf8_general_mysql500_ci, strcoll_utf8mb3_common); |