From b2e324a21fbf481bccd15f9ee4bab819afefbed6 Mon Sep 17 00:00:00 2001 From: Alexander Barkov Date: Mon, 6 Jul 2015 15:50:56 +0400 Subject: MDEV-8416 ucs2: compare broken bytes as "greater than any non-broken character" MDEV-8418 utf16: compare broken bytes as "greater than any non-broken character" --- unittest/strings/strings-t.c | 132 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 124 insertions(+), 8 deletions(-) (limited to 'unittest') diff --git a/unittest/strings/strings-t.c b/unittest/strings/strings-t.c index 4e9ca820981..51537e624f9 100644 --- a/unittest/strings/strings-t.c +++ b/unittest/strings/strings-t.c @@ -149,7 +149,7 @@ typedef struct A1A1 - MB2 or 8BIT+8BIT E0E0 - MB2 */ -STRNNCOLL_PARAM strcoll_mb2_common[]= +static STRNNCOLL_PARAM strcoll_mb2_common[]= { /* Compare two good sequences */ {CSTR(""), CSTR(""), 0}, @@ -210,7 +210,7 @@ STRNNCOLL_PARAM strcoll_mb2_common[]= /* For character sets that have good mb2 characters A1A1 and F9FE */ -STRNNCOLL_PARAM strcoll_mb2_A1A1_mb2_F9FE[]= +static STRNNCOLL_PARAM strcoll_mb2_A1A1_mb2_F9FE[]= { /* Compare two good characters */ {CSTR(""), CSTR("\xF9\xFE"), -1}, @@ -246,7 +246,7 @@ STRNNCOLL_PARAM strcoll_mb2_A1A1_mb2_F9FE[]= A1A1 - a good mb2 character F9FE - a bad sequence */ -STRNNCOLL_PARAM strcoll_mb2_A1A1_bad_F9FE[]= +static STRNNCOLL_PARAM strcoll_mb2_A1A1_bad_F9FE[]= { /* Compare a good character to an illegal or an incomplete sequence */ {CSTR(""), CSTR("\xF9\xFE"), -1}, @@ -283,7 +283,7 @@ STRNNCOLL_PARAM strcoll_mb2_A1A1_bad_F9FE[]= F9 - ILSEQ or H2 F9FE - a bad sequence (ILSEQ+XX or H2+ILSEQ) */ -STRNNCOLL_PARAM strcoll_mb1_A1_bad_F9FE[]= +static STRNNCOLL_PARAM strcoll_mb1_A1_bad_F9FE[]= { /* Compare two good characters */ {CSTR(""), CSTR("\xA1"), -1}, @@ -323,7 +323,7 @@ STRNNCOLL_PARAM strcoll_mb1_A1_bad_F9FE[]= and sort in this order: 8181 < A1 < E0E0 */ -STRNNCOLL_PARAM strcoll_8181_A1_E0E0[]= +static STRNNCOLL_PARAM strcoll_8181_A1_E0E0[]= { {CSTR("\x81\x81"), CSTR("\xA1"), 
-1}, {CSTR("\x81\x81"), CSTR("\xE0\xE0"), -1}, @@ -336,7 +336,7 @@ STRNNCOLL_PARAM strcoll_8181_A1_E0E0[]= /* A shared test for eucjpms and ujis. */ -STRNNCOLL_PARAM strcoll_ujis[]= +static STRNNCOLL_PARAM strcoll_ujis[]= { {CSTR("\x8E\xA1"), CSTR("\x8E"), -1}, /* Good MB2 vs incomplete MB2 */ {CSTR("\x8E\xA1"), CSTR("\x8F\xA1"), -1}, /* Good MB2 vs incomplete MB3 */ @@ -347,7 +347,7 @@ STRNNCOLL_PARAM strcoll_ujis[]= }; -STRNNCOLL_PARAM strcoll_utf8mb3_common[]= +static STRNNCOLL_PARAM strcoll_utf8mb3_common[]= { {CSTR("\xC0"), CSTR("\xC1"), -1}, /* Unused byte vs unused byte */ {CSTR("\xC0"), CSTR("\xFF"), -1}, /* Unused byte vs unused byte */ @@ -369,7 +369,7 @@ STRNNCOLL_PARAM strcoll_utf8mb3_common[]= }; -STRNNCOLL_PARAM strcoll_utf8mb4_common[]= +static STRNNCOLL_PARAM strcoll_utf8mb4_common[]= { /* Minimum four-byte character: U+10000 == _utf8 0xF0908080 */ {CSTR("\xF0\x90\x80\x80"), CSTR("\xC0"), -1}, /* MB4 vs unused byte */ @@ -412,6 +412,101 @@ STRNNCOLL_PARAM strcoll_utf8mb4_common[]= }; +static STRNNCOLL_PARAM strcoll_ucs2_common[]= +{ + {CSTR("\xC0"), CSTR("\xC1"), -1}, /* Incomplete MB2 vs incomplete MB2 */ + {CSTR("\xC0"), CSTR("\xFF"), -1}, /* Incomplete MB2 vs incomplete MB2 */ + {CSTR("\xC2\xA1"), CSTR("\xC0"), -1}, /* MB2 vs incomplete MB2 */ + {CSTR("\xC2\xA1"), CSTR("\xC2"), -1}, /* MB2 vs incomplete MB2 */ + {CSTR("\xC2\xA0"), CSTR("\xC2\xA1"), -1}, /* MB2 vs MB2 */ + {CSTR("\xC2\xA1"), CSTR("\xC2\xA2"), -1}, /* MB2 vs MB2 */ + + {CSTR("\xFF\xFF"), CSTR("\x00"),-1}, /* MB2 vs incomplete */ + {CSTR("\xFF\xFF\xFF\xFF"), CSTR("\x00"),-1}, /* MB2+MB2 vs incomplete */ + {CSTR("\xFF\xFF\xFF\xFF"), CSTR("\x00\x00\x00"), 1},/* MB2+MB2 vs MB2+incomplete */ + + {NULL, 0, NULL, 0, 0} +}; + + +/* Tests that involve comparison to SPACE (explicit, or padded) */ +static STRNNCOLL_PARAM strcoll_ucs2_space[]= +{ + {CSTR("\x00\x1F"), CSTR("\x00\x20"), -1}, /* MB2 vs MB2 */ + {CSTR("\x00\x20"), CSTR("\x00\x21"), -1}, /* MB2 vs MB2 */ + {CSTR("\x00\x1F"), 
CSTR(""), -1}, /* MB2 vs empty */ + {CSTR("\x00\x20"), CSTR(""), 0}, /* MB2 vs empty */ + {CSTR("\x00\x21"), CSTR(""), 1}, /* MB2 vs empty */ + + {NULL, 0, NULL, 0, 0} +}; + + +/* Tests that involve comparison to SPACE (explicit, or padded) */ +static STRNNCOLL_PARAM strcoll_utf16le_space[]= +{ + {CSTR("\x1F\x00"), CSTR("\x20\x00"), -1}, /* MB2 vs MB2 */ + {CSTR("\x20\x00"), CSTR("\x21\x00"), -1}, /* MB2 vs MB2 */ + {CSTR("\x1F\x00"), CSTR(""), -1}, /* MB2 vs empty */ + {CSTR("\x20\x00"), CSTR(""), 0}, /* MB2 vs empty */ + {CSTR("\x21\x00"), CSTR(""), 1}, /* MB2 vs empty */ + + {NULL, 0, NULL, 0, 0} +}; + + +static STRNNCOLL_PARAM strcoll_utf16_common[]= +{ + /* Minimum four-byte character: U+10000 == _utf16 0xD800DC00 */ + {CSTR("\xD8\x00\xDC\x00"), CSTR("\xC0"), -1},/* MB4 vs incomplete MB2 */ + {CSTR("\xD8\x00\xDC\x00"), CSTR("\xC2"), -1},/* MB4 vs incomplete MB2 */ + {CSTR("\xD8\x00\xDC\x00"), CSTR("\xD8\x00\xDB\x00"),-1},/* MB4 vs broken MB4 */ + {CSTR("\xD8\x00\xDC\x00"), CSTR("\xD8\x00\xE0\x00"),-1},/* MB4 vs broken MB4 */ + {CSTR("\xD8\x00\xDC\x00"), CSTR("\xDC\x00"), -1},/* MB4 vs broken MB2 */ + {CSTR("\xD8\x00\xDC\x00"), CSTR("\xD8\x00\xDC"), -1},/* MB4 vs incomplete MB4 */ + + /* Maximum four-byte character: U+10FFFF == _utf16 0xDBFFDFFF */ + {CSTR("\xDB\xFF\xDF\xFF"), CSTR("\xC0"), -1},/* MB4 vs incomplete MB2 */ + {CSTR("\xDB\xFF\xDF\xFF"), CSTR("\xC2"), -1},/* MB4 vs incomplete MB2 */ + {CSTR("\xDB\xFF\xDF\xFF"), CSTR("\xD8\x00\xDB\x00"),-1},/* MB4 vs broken MB4 */ + {CSTR("\xDB\xFF\xDF\xFF"), CSTR("\xD8\x00\xE0\x00"),-1},/* MB4 vs broken MB4 */ + {CSTR("\xDB\xFF\xDF\xFF"), CSTR("\xDC\x00"), -1},/* MB4 vs broken MB2 */ + {CSTR("\xDB\xFF\xDF\xFF"), CSTR("\xDC\xFF\xDF"), -1},/* MB4 vs incomplete MB4 */ + + /* MB4 vs MB4, and broken MB4 vs broken MB4 */ + {CSTR("\xD8\x00\xDC\x00"), CSTR("\xD8\x00\xDC\x01"),-1},/* MB4 vs MB4 */ + {CSTR("\xDB\xFF\xE0\xFE"), CSTR("\xDB\xFF\xE0\xFF"),-1},/* Broken MB4 vs broken MB4 */ + + {NULL, 0, NULL, 0, 0} +}; + + +static 
STRNNCOLL_PARAM strcoll_utf16le_common[]= +{ + /* Minimum four-byte character: U+10000 == _utf16le 0x00D800DC */ + {CSTR("\x00\xD8\x00\xDC"), CSTR("\xC0"), -1},/* MB4 vs incomplete MB2 */ + {CSTR("\x00\xD8\x00\xDC"), CSTR("\xC2"), -1},/* MB4 vs incomplete MB2 */ + {CSTR("\x00\xD8\x00\xDC"), CSTR("\x00\xD8\x00\xDB"),-1},/* MB4 vs broken MB4 */ + {CSTR("\x00\xD8\x00\xDC"), CSTR("\x00\xD8\x00\xD0"),-1},/* MB4 vs broken MB4 */ + {CSTR("\x00\xD8\x00\xDC"), CSTR("\x00\xDC"), -1},/* MB4 vs broken MB2 */ + {CSTR("\x00\xD8\x00\xDC"), CSTR("\x00\xD8\x00"), -1},/* MB4 vs incomplete MB4 */ + + /* Maximum four-byte character: U+10FFFF == _utf16le 0xFFDBFFDF */ + {CSTR("\xFF\xDB\xFF\xDF"), CSTR("\xC0"), -1},/* MB4 vs incomplete MB2 */ + {CSTR("\xFF\xDB\xFF\xDF"), CSTR("\xC2"), -1},/* MB4 vs incomplete MB2 */ + {CSTR("\xFF\xDB\xFF\xDF"), CSTR("\x00\xD8\x00\xDB"),-1},/* MB4 vs broken MB4 */ + {CSTR("\xFF\xDB\xFF\xDF"), CSTR("\x00\xD8\x00\xE0"),-1},/* MB4 vs broken MB4 */ + {CSTR("\xFF\xDB\xFF\xDF"), CSTR("\x00\xDC"), -1},/* MB4 vs broken MB2 */ + {CSTR("\xFF\xDB\xFF\xDF"), CSTR("\xFF\xDC\x00"), -1},/* MB4 vs incomplete MB4 */ + + /* MB4 vs MB4, and broken MB4 vs broken MB4 */ + {CSTR("\x00\xD8\x00\xDC"), CSTR("\x00\xD8\x01\xDC"),-1},/* MB4 vs MB4 */ + {CSTR("\xFF\xDB\xFE\xE0"), CSTR("\xFF\xDB\xFF\xE0"),-1},/* Broken MB4 vs broken MB4 */ + + {NULL, 0, NULL, 0, 0} +}; + + static void str2hex(char *dst, size_t dstlen, const char *src, size_t srclen) { @@ -528,6 +623,12 @@ test_strcollsp() failed+= strcollsp(&my_charset_sjis_japanese_ci, strcoll_8181_A1_E0E0); failed+= strcollsp(&my_charset_sjis_bin, strcoll_8181_A1_E0E0); #endif +#ifdef HAVE_CHARSET_ucs2 + failed+= strcollsp(&my_charset_ucs2_general_ci, strcoll_ucs2_common); + failed+= strcollsp(&my_charset_ucs2_general_ci, strcoll_ucs2_space); + failed+= strcollsp(&my_charset_ucs2_bin, strcoll_ucs2_common); + failed+= strcollsp(&my_charset_ucs2_bin, strcoll_ucs2_space); +#endif #ifdef HAVE_CHARSET_ujis failed+= 
strcollsp(&my_charset_ujis_japanese_ci, strcoll_mb2_common); failed+= strcollsp(&my_charset_ujis_bin, strcoll_mb2_common); @@ -536,6 +637,21 @@ test_strcollsp() failed+= strcollsp(&my_charset_ujis_japanese_ci, strcoll_ujis); failed+= strcollsp(&my_charset_ujis_bin, strcoll_ujis); #endif +#ifdef HAVE_CHARSET_utf16 + failed+= strcollsp(&my_charset_utf16_general_ci, strcoll_ucs2_common); + failed+= strcollsp(&my_charset_utf16_general_ci, strcoll_ucs2_space); + failed+= strcollsp(&my_charset_utf16_general_ci, strcoll_utf16_common); + failed+= strcollsp(&my_charset_utf16_bin, strcoll_ucs2_common); + failed+= strcollsp(&my_charset_utf16_bin, strcoll_ucs2_space); + failed+= strcollsp(&my_charset_utf16_bin, strcoll_utf16_common); + + failed+= strcollsp(&my_charset_utf16le_general_ci,strcoll_ucs2_common); + failed+= strcollsp(&my_charset_utf16le_general_ci,strcoll_utf16le_space); + failed+= strcollsp(&my_charset_utf16le_general_ci,strcoll_utf16le_common); + failed+= strcollsp(&my_charset_utf16le_bin, strcoll_ucs2_common); + failed+= strcollsp(&my_charset_utf16le_bin, strcoll_utf16le_space); + failed+= strcollsp(&my_charset_utf16le_bin, strcoll_utf16le_common); +#endif #ifdef HAVE_CHARSET_utf8 failed+= strcollsp(&my_charset_utf8_general_ci, strcoll_utf8mb3_common); failed+= strcollsp(&my_charset_utf8_general_mysql500_ci, strcoll_utf8mb3_common); -- cgit v1.2.1