summaryrefslogtreecommitdiff
path: root/unittest
diff options
context:
space:
mode:
authorAlexander Barkov <bar@mariadb.org>2015-06-26 13:40:28 +0400
committerAlexander Barkov <bar@mariadb.org>2015-06-26 13:40:28 +0400
commit4f828a1cac9a9c378a2a9f3c3ef0710eaf11ce02 (patch)
treef4da132264de74b64df5035bfec50c2bb80d987b /unittest
parentd535728165acb2eb55140bb70fa44c458d1ccc06 (diff)
downloadmariadb-git-4f828a1cac9a9c378a2a9f3c3ef0710eaf11ce02.tar.gz
MDEV-8214 Asian MB2 charsets: compare broken bytes as "greater than any non-broken character"
Diffstat (limited to 'unittest')
-rw-r--r--unittest/strings/strings-t.c357
1 files changed, 356 insertions, 1 deletions
diff --git a/unittest/strings/strings-t.c b/unittest/strings/strings-t.c
index 6baef0417a8..c7824d07047 100644
--- a/unittest/strings/strings-t.c
+++ b/unittest/strings/strings-t.c
@@ -95,11 +95,361 @@ static CHARSET_INFO *charset_list[]=
};
+typedef struct /* One strnncollsp() test case: two byte strings and the expected result */
+{
+ const char *a; /* First string; NULL marks the end of a test array */
+ size_t alen; /* Length of a in bytes */
+ const char *b; /* Second string */
+ size_t blen; /* Length of b in bytes */
+ int res; /* Expected result of comparing a to b; only its sign is checked */
+} STRNNCOLL_PARAM;
+
+/*
+ Expand a string literal into two initializers: the literal itself and
+ its length in bytes without the terminating NUL (sizeof minus one).
+ The argument has to be a literal or a char array, because sizeof is
+ applied to it directly.
+*/
+#define CSTR(x) (x),(sizeof(x)-1)
+
+/*
+ Byte sequence types used in the tests:
+ 8BIT - an 8-bit byte (>= 0x80) which makes a single-byte character
+ MB2 - two bytes that make a valid character
+ H2 - a byte which is a valid MB2 head byte
+ T2 - a byte which is a valid MB2 tail byte
+ ILSEQ - a byte which makes an illegal sequence
+ H2+ILSEQ - a sequence that starts with a valid H2 byte,
+ but not followed by a valid T2 byte.
+
+ Charset H2 T2 8BIT
+ ------- ---------------- --------------- --------
+ big5 [A1..F9] [40..7E,A1..FE]
+ euckr [81..FE] [41..5A,61..7A,81..FE]
+ gb2312 [A1..F7] [A1..FE]
+ gbk [81..FE] [40..7E,80..FE]
+
+ cp932 [81..9F,E0..FC] [40..7E,80..FC] [A1..DF]
+ sjis [81..9F,E0..FC] [40..7E,80..FC] [A1..DF]
+
+
+ Essential byte sequences in various character sets:
+
+ Sequence big5 cp932 euckr gb2312 gbk sjis
+ -------- ---- ----- ----- ------ --- ----
+ 80 ILSEQ ILSEQ ILSEQ ILSEQ ILSEQ ILSEQ
+ 81 ILSEQ H2 H2 ILSEQ H2 H2
+ A1 H2 8BIT H2 H2 H2 8BIT
+ A1A1 MB2 8BIT+8BIT MB2 MB2 MB2 8BIT+8BIT
+ E0E0 MB2 MB2 MB2 MB2 MB2 MB2
+ F9FE MB2 H2+ILSEQ MB2 ILSEQ+T2 MB2 H2+ILSEQ
+*/
+
+
+/*
+ Test cases shared by every character set exercised in test_strcollsp(),
+ i.e. character sets that have the following byte sequences:
+ 80 - ILSEQ
+ 81 - ILSEQ or H2
+ F9 - ILSEQ or H2
+ A1A1 - MB2 or 8BIT+8BIT
+ E0E0 - MB2
+
+ Each entry is {a, alen, b, blen, expected sign of comparing a to b};
+ CSTR() supplies a string together with its length.
+ The array is terminated by an entry whose a is NULL.
+*/
+STRNNCOLL_PARAM strcoll_mb2_common[]=
+{
+ /* Compare two good sequences */
+ {CSTR(""), CSTR(""), 0},
+ {CSTR(""), CSTR(" "), 0},
+ {CSTR(""), CSTR("A"), -1},
+ {CSTR(""), CSTR("a"), -1},
+ {CSTR(""), CSTR("\xA1\xA1"), -1},
+ {CSTR(""), CSTR("\xE0\xE0"), -1},
+
+ {CSTR(" "), CSTR(""), 0},
+ {CSTR(" "), CSTR(" "), 0},
+ {CSTR(" "), CSTR("A"), -1},
+ {CSTR(" "), CSTR("a"), -1},
+ {CSTR(" "), CSTR("\xA1\xA1"), -1},
+ {CSTR(" "), CSTR("\xE0\xE0"), -1},
+
+ {CSTR("a"), CSTR(""), 1},
+ {CSTR("a"), CSTR(" "), 1},
+ {CSTR("a"), CSTR("a"), 0},
+ {CSTR("a"), CSTR("\xA1\xA1"), -1},
+ {CSTR("a"), CSTR("\xE0\xE0"), -1},
+
+ {CSTR("\xA1\xA1"), CSTR("\xA1\xA1"), 0},
+ {CSTR("\xA1\xA1"), CSTR("\xE0\xE0"), -1},
+
+ /* Compare a good character to an illegal or an incomplete sequence */
+ {CSTR(""), CSTR("\x80"), -1},
+ {CSTR(""), CSTR("\x81"), -1},
+ {CSTR(""), CSTR("\xF9"), -1},
+
+ {CSTR(" "), CSTR("\x80"), -1},
+ {CSTR(" "), CSTR("\x81"), -1},
+ {CSTR(" "), CSTR("\xF9"), -1},
+
+ {CSTR("a"), CSTR("\x80"), -1},
+ {CSTR("a"), CSTR("\x81"), -1},
+ {CSTR("a"), CSTR("\xF9"), -1},
+
+ {CSTR("\xA1\xA1"), CSTR("\x80"), -1},
+ {CSTR("\xA1\xA1"), CSTR("\x81"), -1},
+ {CSTR("\xA1\xA1"), CSTR("\xF9"), -1},
+
+ {CSTR("\xE0\xE0"), CSTR("\x80"), -1},
+ {CSTR("\xE0\xE0"), CSTR("\x81"), -1},
+ {CSTR("\xE0\xE0"), CSTR("\xF9"), -1},
+
+ /* Compare two bad/incomplete sequences */
+ {CSTR("\x80"), CSTR("\x80"), 0},
+ {CSTR("\x80"), CSTR("\x81"), -1},
+ {CSTR("\x80"), CSTR("\xF9"), -1},
+ {CSTR("\x81"), CSTR("\x81"), 0},
+ {CSTR("\x81"), CSTR("\xF9"), -1},
+
+ {NULL, 0, NULL, 0, 0}
+};
+
+
+/*
+ For character sets that have good mb2 characters A1A1 and F9FE.
+ test_strcollsp() runs this table for big5, euckr and gbk.
+
+ Each entry is {a, alen, b, blen, expected sign of comparing a to b}.
+ The array is terminated by an entry whose a is NULL.
+*/
+STRNNCOLL_PARAM strcoll_mb2_A1A1_mb2_F9FE[]=
+{
+ /* Compare two good characters */
+ {CSTR(""), CSTR("\xF9\xFE"), -1},
+ {CSTR(" "), CSTR("\xF9\xFE"), -1},
+ {CSTR("a") , CSTR("\xF9\xFE"), -1},
+ {CSTR("\xA1\xA1"), CSTR("\xF9\xFE"), -1},
+ {CSTR("\xF9\xFE"), CSTR("\xF9\xFE"), 0},
+
+ /* Compare a good character to an illegal or an incomplete sequence */
+ {CSTR(""), CSTR("\xA1"), -1},
+ {CSTR(""), CSTR("\xF9"), -1},
+ {CSTR("a"), CSTR("\xA1"), -1},
+ {CSTR("a"), CSTR("\xF9"), -1},
+
+ {CSTR("\xA1\xA1"), CSTR("\xA1"), -1},
+ {CSTR("\xA1\xA1"), CSTR("\xF9"), -1},
+
+ {CSTR("\xF9\xFE"), CSTR("\x80"), -1},
+ {CSTR("\xF9\xFE"), CSTR("\x81"), -1},
+ {CSTR("\xF9\xFE"), CSTR("\xA1"), -1},
+ {CSTR("\xF9\xFE"), CSTR("\xF9"), -1},
+
+ /* Compare two bad/incomplete sequences */
+ {CSTR("\x80"), CSTR("\xA1"), -1},
+ {CSTR("\x80"), CSTR("\xF9"), -1},
+
+ {NULL, 0, NULL, 0, 0}
+};
+
+
+/*
+ For character sets that have:
+ A1A1 - a good mb2 character
+ F9FE - a bad sequence
+ test_strcollsp() runs this table for gb2312.
+
+ Each entry is {a, alen, b, blen, expected sign of comparing a to b}.
+ The array is terminated by an entry whose a is NULL.
+*/
+STRNNCOLL_PARAM strcoll_mb2_A1A1_bad_F9FE[]=
+{
+ /* Compare a good character to an illegal or an incomplete sequence */
+ {CSTR(""), CSTR("\xF9\xFE"), -1},
+ {CSTR(" "), CSTR("\xF9\xFE"), -1},
+ {CSTR("a") , CSTR("\xF9\xFE"), -1},
+ {CSTR("\xA1\xA1"), CSTR("\xF9\xFE"), -1},
+
+ {CSTR(""), CSTR("\xA1"), -1},
+ {CSTR(""), CSTR("\xF9"), -1},
+ {CSTR("a"), CSTR("\xA1"), -1},
+ {CSTR("a"), CSTR("\xF9"), -1},
+
+ {CSTR("\xA1\xA1"), CSTR("\xA1"), -1},
+ {CSTR("\xA1\xA1"), CSTR("\xF9"), -1},
+
+ /* Compare two bad/incomplete sequences */
+ {CSTR("\xF9\xFE"), CSTR("\x80"), 1},
+ {CSTR("\xF9\xFE"), CSTR("\x81"), 1},
+ {CSTR("\xF9\xFE"), CSTR("\xA1"), 1},
+ {CSTR("\xF9\xFE"), CSTR("\xF9"), 1},
+ {CSTR("\x80"), CSTR("\xA1"), -1},
+ {CSTR("\x80"), CSTR("\xF9"), -1},
+ {CSTR("\xF9\xFE"), CSTR("\xF9\xFE"), 0},
+
+ {NULL, 0, NULL, 0, 0}
+};
+
+
+/*
+ For character sets that have:
+ 80 - ILSEQ or H2
+ 81 - ILSEQ or H2
+ A1 - 8BIT
+ F9 - ILSEQ or H2
+ F9FE - a bad sequence (ILSEQ+XX or H2+ILSEQ)
+ test_strcollsp() runs this table for cp932 and sjis.
+
+ Each entry is {a, alen, b, blen, expected sign of comparing a to b}.
+ The array is terminated by an entry whose a is NULL.
+
+ NOTE(review): the summary of byte sequences earlier in this file lists
+ 80 as ILSEQ in every character set, so the ILSEQ or H2 wording for 80
+ above looks broader than needed - confirm.
+ NOTE(review): the section labels inside the table are approximate.
+ Several entries under the good-versus-bad label compare F9FE, itself a
+ bad sequence, with 80, 81 or F9 - confirm whether regrouping is wanted.
+*/
+STRNNCOLL_PARAM strcoll_mb1_A1_bad_F9FE[]=
+{
+ /* Compare two good characters */
+ {CSTR(""), CSTR("\xA1"), -1},
+ {CSTR("\xA1\xA1"), CSTR("\xA1"), 1},
+
+ /* Compare a good character to an illegal or an incomplete sequence */
+ {CSTR(""), CSTR("\xF9"), -1},
+ {CSTR(""), CSTR("\xF9\xFE"), -1},
+ {CSTR(" "), CSTR("\xF9\xFE"), -1},
+ {CSTR("a"), CSTR("\xF9\xFE"), -1},
+ {CSTR("a"), CSTR("\xA1"), -1},
+ {CSTR("a"), CSTR("\xF9"), -1},
+
+ {CSTR("\xA1\xA1"), CSTR("\xF9"), -1},
+ {CSTR("\xA1\xA1"), CSTR("\xF9\xFE"), -1},
+
+ {CSTR("\xF9\xFE"), CSTR("\x80"), 1},
+ {CSTR("\xF9\xFE"), CSTR("\x81"), 1},
+ {CSTR("\xF9\xFE"), CSTR("\xA1"), 1},
+ {CSTR("\xF9\xFE"), CSTR("\xF9"), 1},
+
+ {CSTR("\x80"), CSTR("\xA1"), 1},
+
+ /* Compare two bad/incomplete sequences */
+ {CSTR("\x80"), CSTR("\xF9"), -1},
+ {CSTR("\xF9\xFE"), CSTR("\xF9\xFE"), 0},
+
+ {NULL, 0, NULL, 0, 0}
+};
+
+
+/*
+ For character sets (e.g. cp932 and sjis) that have:
+ 8181 - a valid MB2 character
+ A1 - a valid 8BIT character
+ E0E0 - a valid MB2 character
+ and sort in this order:
+ 8181 < A1 < E0E0
+
+ Each entry is {a, alen, b, blen, expected sign of comparing a to b}.
+ The array is terminated by an entry whose a is NULL.
+*/
+STRNNCOLL_PARAM strcoll_8181_A1_E0E0[]=
+{
+ {CSTR("\x81\x81"), CSTR("\xA1"), -1},
+ {CSTR("\x81\x81"), CSTR("\xE0\xE0"), -1},
+ {CSTR("\xA1"), CSTR("\xE0\xE0"), -1},
+
+ {NULL, 0, NULL, 0, 0}
+};
+
+/*
+ Write the first srclen bytes of src into dst as upper-case hexadecimal,
+ two digits per byte, as a NUL-terminated string.
+ dst is set to the empty string before the loop starts.
+ Conversion stops silently once three or fewer bytes remain in dst
+ (loop condition dst + 3 < dstend), so long inputs are truncated.
+
+ dst - output buffer
+ dstlen - size of dst in bytes
+ src - input bytes
+ srclen - number of input bytes
+
+ NOTE(review): the initial NUL store happens before any size check, so
+ dstlen equal to 0 would still write one byte, and dst + 3 is formed
+ before it is compared with dstend. The only callers visible here pass
+ sizeof of a 64-byte array, where neither is a problem.
+*/
+static void
+str2hex(char *dst, size_t dstlen, const char *src, size_t srclen)
+{
+ char *dstend= dst + dstlen;
+ const char *srcend= src + srclen;
+ for (*dst= '\0' ; dst + 3 < dstend && src < srcend; )
+ {
+ sprintf(dst, "%02X", (unsigned char) src[0]); /* two hex digits plus NUL */
+ dst+=2; /* step onto the NUL so that the next byte overwrites it */
+ src++;
+ }
+}
+
+
+/*
+ Check whether two comparison results are semantically equal:
+ both are negative, both are positive, or both are zero.
+ Only the signs are compared; the magnitudes are ignored.
+
+ Returns 1 if the signs agree, 0 otherwise.
+*/
+static int
+eqres(int ares, int bres)
+{
+ return (ares < 0 && bres < 0) ||
+ (ares > 0 && bres > 0) ||
+ (ares == 0 && bres == 0);
+}
+
+/*
+ Run every test case of a STRNNCOLL_PARAM array through
+ cs->coll->strnncollsp() and compare the sign of each result with the
+ expected one. A case that passes is then repeated with the two strings
+ swapped, expecting the opposite sign. One diag() line is printed per
+ case, plus one more when the reverse comparison fails.
+
+ cs - collation to test
+ param - test cases; iteration stops at the first entry whose a is NULL
+
+ Returns the number of failed cases. A case is counted at most once,
+ because the reverse comparison is attempted only after the forward
+ comparison matched.
+
+ NOTE(review): the meaning of the trailing 0 argument passed to
+ strnncollsp() is defined by the collation handler interface, which is
+ not visible here.
+*/
+static int
+strcollsp(CHARSET_INFO *cs, const STRNNCOLL_PARAM *param)
+{
+ int failed= 0;
+ const STRNNCOLL_PARAM *p;
+ diag("%-20s %-10s %-10s %10s %10s", "Collation", "a", "b", "ExpectSign", "Actual"); /* column header */
+ for (p= param; p->a; p++) /* stop at the NULL sentinel entry */
+ {
+ char ahex[64], bhex[64]; /* hex dumps of a and b for the log line */
+ int res= cs->coll->strnncollsp(cs, (uchar *) p->a, p->alen,
+ (uchar *) p->b, p->blen, 0);
+ str2hex(ahex, sizeof(ahex), p->a, p->alen);
+ str2hex(bhex, sizeof(bhex), p->b, p->blen);
+ diag("%-20s %-10s %-10s %10d %10d%s",
+ cs->name, ahex, bhex, p->res, res,
+ eqres(res, p->res) ? "" : " FAILED");
+ if (!eqres(res, p->res))
+ {
+ failed++;
+ }
+ else
+ {
+ /* Test in reverse order: swapping the arguments must flip the sign */
+ res= cs->coll->strnncollsp(cs, (uchar *) p->b, p->blen,
+ (uchar *) p->a, p->alen, 0);
+ if (!eqres(res, -p->res))
+ {
+ diag("Comparison in reverse order failed. Expected %d, got %d",
+ -p->res, res);
+ failed++;
+ }
+ }
+ }
+ return failed;
+}
+
+/*
+ Run the strnncollsp() test tables against the _ci and _bin collations
+ of each Asian multi-byte character set that is compiled in. Every
+ character set is guarded by its own HAVE_CHARSET_xxx macro.
+
+ Tables used per character set:
+ big5, euckr, gbk - strcoll_mb2_common, strcoll_mb2_A1A1_mb2_F9FE
+ gb2312 - strcoll_mb2_common, strcoll_mb2_A1A1_bad_F9FE
+ cp932, sjis - strcoll_mb2_common, strcoll_mb1_A1_bad_F9FE,
+ strcoll_8181_A1_E0E0
+
+ Returns the total number of failed cases over all collations, which is
+ 0 when none of the HAVE_CHARSET_xxx macros is defined.
+
+ NOTE(review): the definition uses an empty parameter list; writing
+ (void) would give it a prototype - consider changing.
+*/
+static int
+test_strcollsp()
+{
+ int failed= 0;
+#ifdef HAVE_CHARSET_big5
+ failed+= strcollsp(&my_charset_big5_chinese_ci, strcoll_mb2_common);
+ failed+= strcollsp(&my_charset_big5_chinese_ci, strcoll_mb2_A1A1_mb2_F9FE);
+ failed+= strcollsp(&my_charset_big5_bin, strcoll_mb2_common);
+ failed+= strcollsp(&my_charset_big5_bin, strcoll_mb2_A1A1_mb2_F9FE);
+#endif
+#ifdef HAVE_CHARSET_cp932
+ failed+= strcollsp(&my_charset_cp932_japanese_ci, strcoll_mb2_common);
+ failed+= strcollsp(&my_charset_cp932_japanese_ci, strcoll_mb1_A1_bad_F9FE);
+ failed+= strcollsp(&my_charset_cp932_bin, strcoll_mb2_common);
+ failed+= strcollsp(&my_charset_cp932_bin, strcoll_mb1_A1_bad_F9FE);
+ failed+= strcollsp(&my_charset_cp932_japanese_ci, strcoll_8181_A1_E0E0);
+ failed+= strcollsp(&my_charset_cp932_bin, strcoll_8181_A1_E0E0);
+#endif
+#ifdef HAVE_CHARSET_euckr
+ failed+= strcollsp(&my_charset_euckr_korean_ci, strcoll_mb2_common);
+ failed+= strcollsp(&my_charset_euckr_korean_ci, strcoll_mb2_A1A1_mb2_F9FE);
+ failed+= strcollsp(&my_charset_euckr_bin, strcoll_mb2_common);
+ failed+= strcollsp(&my_charset_euckr_bin, strcoll_mb2_A1A1_mb2_F9FE);
+#endif
+#ifdef HAVE_CHARSET_gb2312
+ failed+= strcollsp(&my_charset_gb2312_chinese_ci, strcoll_mb2_common);
+ failed+= strcollsp(&my_charset_gb2312_chinese_ci, strcoll_mb2_A1A1_bad_F9FE);
+ failed+= strcollsp(&my_charset_gb2312_bin, strcoll_mb2_common);
+ failed+= strcollsp(&my_charset_gb2312_bin, strcoll_mb2_A1A1_bad_F9FE);
+#endif
+#ifdef HAVE_CHARSET_gbk
+ failed+= strcollsp(&my_charset_gbk_chinese_ci, strcoll_mb2_common);
+ failed+= strcollsp(&my_charset_gbk_chinese_ci, strcoll_mb2_A1A1_mb2_F9FE);
+ failed+= strcollsp(&my_charset_gbk_bin, strcoll_mb2_common);
+ failed+= strcollsp(&my_charset_gbk_bin, strcoll_mb2_A1A1_mb2_F9FE);
+#endif
+#ifdef HAVE_CHARSET_sjis
+ failed+= strcollsp(&my_charset_sjis_japanese_ci, strcoll_mb2_common);
+ failed+= strcollsp(&my_charset_sjis_bin, strcoll_mb2_common);
+ failed+= strcollsp(&my_charset_sjis_japanese_ci, strcoll_mb1_A1_bad_F9FE);
+ failed+= strcollsp(&my_charset_sjis_bin, strcoll_mb1_A1_bad_F9FE);
+ failed+= strcollsp(&my_charset_sjis_japanese_ci, strcoll_8181_A1_E0E0);
+ failed+= strcollsp(&my_charset_sjis_bin, strcoll_8181_A1_E0E0);
+#endif
+ return failed;
+}
+
+
int main()
{
size_t i, failed= 0;
- plan(1);
+ plan(2);
diag("Testing my_like_range_xxx() functions");
for (i= 0; i < array_elements(charset_list); i++)
@@ -112,5 +462,10 @@ int main()
}
}
ok(failed == 0, "Testing my_like_range_xxx() functions");
+
+ diag("Testing cs->coll->strnncollsp()");
+ failed= test_strcollsp();
+ ok(failed == 0, "Testing cs->coll->strnncollsp()");
+
return exit_status();
}