diff options
author | Alexander Barkov <bar@mariadb.org> | 2015-06-26 13:40:28 +0400 |
---|---|---|
committer | Alexander Barkov <bar@mariadb.org> | 2015-06-26 13:40:28 +0400 |
commit | 4f828a1cac9a9c378a2a9f3c3ef0710eaf11ce02 (patch) | |
tree | f4da132264de74b64df5035bfec50c2bb80d987b | |
parent | d535728165acb2eb55140bb70fa44c458d1ccc06 (diff) | |
download | mariadb-git-4f828a1cac9a9c378a2a9f3c3ef0710eaf11ce02.tar.gz |
MDEV-8214 Asian MB2 charsets: compare broken bytes as "greater than any non-broken character"
-rw-r--r-- | strings/ctype-big5.c | 123 | ||||
-rw-r--r-- | strings/ctype-cp932.c | 136 | ||||
-rw-r--r-- | strings/ctype-euc_kr.c | 49 | ||||
-rw-r--r-- | strings/ctype-gb2312.c | 45 | ||||
-rw-r--r-- | strings/ctype-gbk.c | 124 | ||||
-rw-r--r-- | strings/ctype-mb.ic | 2 | ||||
-rw-r--r-- | strings/ctype-sjis.c | 136 | ||||
-rw-r--r-- | strings/strcoll.ic | 231 | ||||
-rw-r--r-- | unittest/strings/strings-t.c | 357 |
9 files changed, 830 insertions, 373 deletions
diff --git a/strings/ctype-big5.c b/strings/ctype-big5.c index eda81c0c4d3..925398a4d82 100644 --- a/strings/ctype-big5.c +++ b/strings/ctype-big5.c @@ -49,6 +49,7 @@ #define big5tail(e) ((uchar)(e&0xff)) #define MY_FUNCTION_NAME(x) my_ ## x ## _big5 +#define IS_MB1_CHAR(x) ((uchar) (x) < 0x80) #define IS_MB2_CHAR(x,y) (isbig5head(x) && isbig5tail(y)) #define DEFINE_ASIAN_ROUTINES #include "ctype-mb.ic" @@ -849,89 +850,6 @@ static uint16 big5strokexfrm(uint16 i) } - -static int my_strnncoll_big5_internal(const uchar **a_res, - const uchar **b_res, size_t length) -{ - const uchar *a= *a_res, *b= *b_res; - - while (length--) - { - if ((length > 0) && isbig5code(*a,*(a+1)) && isbig5code(*b, *(b+1))) - { - if (*a != *b || *(a+1) != *(b+1)) - return ((int) big5code(*a,*(a+1)) - - (int) big5code(*b,*(b+1))); - a+= 2; - b+= 2; - length--; - } - else if (sort_order_big5[*a++] != - sort_order_big5[*b++]) - return ((int) sort_order_big5[a[-1]] - - (int) sort_order_big5[b[-1]]); - } - *a_res= a; - *b_res= b; - return 0; -} - - -/* Compare strings */ - -static int my_strnncoll_big5(CHARSET_INFO *cs __attribute__((unused)), - const uchar *a, size_t a_length, - const uchar *b, size_t b_length, - my_bool b_is_prefix) -{ - size_t length= MY_MIN(a_length, b_length); - int res= my_strnncoll_big5_internal(&a, &b, length); - return res ? res : (int)((b_is_prefix ? 
length : a_length) - b_length); -} - - -/* compare strings, ignore end space */ - -static int my_strnncollsp_big5(CHARSET_INFO * cs __attribute__((unused)), - const uchar *a, size_t a_length, - const uchar *b, size_t b_length, - my_bool diff_if_only_endspace_difference) -{ - size_t length= MY_MIN(a_length, b_length); - int res= my_strnncoll_big5_internal(&a, &b, length); - -#ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE - diff_if_only_endspace_difference= 0; -#endif - - if (!res && a_length != b_length) - { - const uchar *end; - int swap= 1; - if (diff_if_only_endspace_difference) - res= 1; /* Assume 'a' is bigger */ - /* - Check the next not space character of the longer key. If it's < ' ', - then it's smaller than the other key. - */ - if (a_length < b_length) - { - /* put longer key in a */ - a_length= b_length; - a= b; - swap= -1; /* swap sign of result */ - res= -res; - } - for (end= a + a_length-length; a < end ; a++) - { - if (*a != ' ') - return (*a < ' ') ? -swap : swap; - } - } - return res; -} - - static size_t my_strnxfrm_big5(CHARSET_INFO *cs, uchar *dst, size_t dstlen, uint nweights, @@ -6853,11 +6771,23 @@ my_mb_wc_big5(CHARSET_INFO *cs __attribute__((unused)), } -static MY_COLLATION_HANDLER my_collation_big5_chinese_ci_handler = +#define MY_FUNCTION_NAME(x) my_ ## x ## _big5_chinese_ci +#define WEIGHT_MB1(x) (sort_order_big5[(uchar) (x)]) +#define WEIGHT_MB2(x,y) (big5code(x, y)) +#include "strcoll.ic" + + +#define MY_FUNCTION_NAME(x) my_ ## x ## _big5_bin +#define WEIGHT_MB1(x) ((uchar) (x)) +#define WEIGHT_MB2(x,y) (big5code(x, y)) +#include "strcoll.ic" + + +static MY_COLLATION_HANDLER my_collation_handler_big5_chinese_ci= { NULL, /* init */ - my_strnncoll_big5, - my_strnncollsp_big5, + my_strnncoll_big5_chinese_ci, + my_strnncollsp_big5_chinese_ci, my_strnxfrm_big5, my_strnxfrmlen_simple, my_like_range_mb, @@ -6868,6 +6798,23 @@ static MY_COLLATION_HANDLER my_collation_big5_chinese_ci_handler = my_propagate_simple }; + +static 
MY_COLLATION_HANDLER my_collation_handler_big5_bin= +{ + NULL, /* init */ + my_strnncoll_big5_bin, + my_strnncollsp_big5_bin, + my_strnxfrm_mb, + my_strnxfrmlen_simple, + my_like_range_mb, + my_wildcmp_mb_bin, + my_strcasecmp_mb_bin, + my_instr_mb, + my_hash_sort_mb_bin, + my_propagate_simple +}; + + static MY_CHARSET_HANDLER my_charset_big5_handler= { NULL, /* init */ @@ -6931,7 +6878,7 @@ struct charset_info_st my_charset_big5_chinese_ci= 1, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_big5_handler, - &my_collation_big5_chinese_ci_handler + &my_collation_handler_big5_chinese_ci }; @@ -6964,7 +6911,7 @@ struct charset_info_st my_charset_big5_bin= 1, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_big5_handler, - &my_collation_mb_bin_handler + &my_collation_handler_big5_bin }; diff --git a/strings/ctype-cp932.c b/strings/ctype-cp932.c index 2e26a98bf05..7a4abfa39d1 100644 --- a/strings/ctype-cp932.c +++ b/strings/ctype-cp932.c @@ -185,6 +185,7 @@ static const uchar sort_order_cp932[]= #define MY_FUNCTION_NAME(x) my_ ## x ## _cp932 #define IS_8BIT_CHAR(x) iscp932kata(x) +#define IS_MB1_CHAR(x) ((uchar) (x) < 0x80 || iscp932kata(x)) #define IS_MB2_CHAR(x,y) (iscp932head(x) && iscp932tail(y)) #define DEFINE_ASIAN_ROUTINES #include "ctype-mb.ic" @@ -1717,90 +1718,6 @@ MY_UNICASE_INFO my_caseinfo_cp932= my_caseinfo_pages_cp932 }; -static int my_strnncoll_cp932_internal(CHARSET_INFO *cs, - const uchar **a_res, size_t a_length, - const uchar **b_res, size_t b_length) -{ - const uchar *a= *a_res, *b= *b_res; - const uchar *a_end= a + a_length; - const uchar *b_end= b + b_length; - while (a < a_end && b < b_end) - { - if (ismbchar_cp932(cs,(char*) a, (char*) a_end) && - ismbchar_cp932(cs,(char*) b, (char*) b_end)) - { - uint a_char= cp932code(*a, *(a+1)); - uint b_char= cp932code(*b, *(b+1)); - if (a_char != b_char) - return a_char - b_char; - a += 2; - b += 2; - } else - { - if (sort_order_cp932[(uchar)*a] != 
sort_order_cp932[(uchar)*b]) - return sort_order_cp932[(uchar)*a] - sort_order_cp932[(uchar)*b]; - a++; - b++; - } - } - *a_res= a; - *b_res= b; - return 0; -} - - -static int my_strnncoll_cp932(CHARSET_INFO *cs __attribute__((unused)), - const uchar *a, size_t a_length, - const uchar *b, size_t b_length, - my_bool b_is_prefix) -{ - int res= my_strnncoll_cp932_internal(cs, &a, a_length, &b, b_length); - if (b_is_prefix && a_length > b_length) - a_length= b_length; - return res ? res : (int) (a_length - b_length); -} - - -static int my_strnncollsp_cp932(CHARSET_INFO *cs __attribute__((unused)), - const uchar *a, size_t a_length, - const uchar *b, size_t b_length, - my_bool diff_if_only_endspace_difference - __attribute__((unused))) -{ - const uchar *a_end= a + a_length; - const uchar *b_end= b + b_length; - int res= my_strnncoll_cp932_internal(cs, &a, a_length, &b, b_length); - -#ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE - diff_if_only_endspace_difference= 0; -#endif - - if (!res && (a != a_end || b != b_end)) - { - int swap= 1; - if (diff_if_only_endspace_difference) - res= 1; /* Assume 'a' is bigger */ - /* - Check the next not space character of the longer key. If it's < ' ', - then it's smaller than the other key. - */ - if (a == a_end) - { - /* put shorter key in a */ - a_end= b_end; - a= b; - swap= -1; /* swap sign of result */ - res= -res; - } - for (; a < a_end ; a++) - { - if (*a != (uchar) ' ') - return (*a < (uchar) ' ') ? -swap : swap; - } - } - return res; -} - static const uint16 cp932_to_unicode[65536]= { @@ -34720,15 +34637,36 @@ size_t my_numcells_cp932(CHARSET_INFO *cs __attribute__((unused)), } -static MY_COLLATION_HANDLER my_collation_ci_handler = +/* + cp932_japanese_ci and cp932_bin sort character blocks in this order: + 1. [00..7F] - 7BIT characters (ASCII) + 2. [81..9F][40..7E,80..FC] - MB2 characters, part1 + 3. [A1..DF] - 8BIT characters (Kana) + 4. 
[E0..FC][40..7E,80..FC] - MB2 characters, part2 +*/ +#define MY_FUNCTION_NAME(x) my_ ## x ## _cp932_japanese_ci +#define WEIGHT_PAD_SPACE (256 * (int) ' ') +#define WEIGHT_MB1(x) (256 * (int) sort_order_cp932[(uchar) (x)]) +#define WEIGHT_MB2(x,y) (cp932code(x, y)) +#include "strcoll.ic" + + +#define MY_FUNCTION_NAME(x) my_ ## x ## _cp932_bin +#define WEIGHT_PAD_SPACE (256 * (int) ' ') +#define WEIGHT_MB1(x) (256 * (int) (uchar) (x)) +#define WEIGHT_MB2(x,y) (cp932code(x, y)) +#include "strcoll.ic" + + +static MY_COLLATION_HANDLER my_collation_handler_cp932_japanese_ci= { - NULL, /* init */ - my_strnncoll_cp932, - my_strnncollsp_cp932, + NULL, /* init */ + my_strnncoll_cp932_japanese_ci, + my_strnncollsp_cp932_japanese_ci, my_strnxfrm_mb, my_strnxfrmlen_simple, my_like_range_mb, - my_wildcmp_mb, /* wildcmp */ + my_wildcmp_mb, my_strcasecmp_8bit, my_instr_mb, my_hash_sort_simple, @@ -34736,6 +34674,22 @@ static MY_COLLATION_HANDLER my_collation_ci_handler = }; +static MY_COLLATION_HANDLER my_collation_handler_cp932_bin= +{ + NULL, /* init */ + my_strnncoll_cp932_bin, + my_strnncollsp_cp932_bin, + my_strnxfrm_mb, + my_strnxfrmlen_simple, + my_like_range_mb, + my_wildcmp_mb_bin, + my_strcasecmp_mb_bin, + my_instr_mb, + my_hash_sort_mb_bin, + my_propagate_simple +}; + + static MY_CHARSET_HANDLER my_charset_handler= { NULL, /* init */ @@ -34800,7 +34754,7 @@ struct charset_info_st my_charset_cp932_japanese_ci= 1, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_handler, - &my_collation_ci_handler + &my_collation_handler_cp932_japanese_ci }; struct charset_info_st my_charset_cp932_bin= @@ -34832,7 +34786,7 @@ struct charset_info_st my_charset_cp932_bin= 1, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_handler, - &my_collation_mb_bin_handler + &my_collation_handler_cp932_bin }; #endif diff --git a/strings/ctype-euc_kr.c b/strings/ctype-euc_kr.c index a2c95bf77c8..f4d4b445bb2 100644 --- a/strings/ctype-euc_kr.c 
+++ b/strings/ctype-euc_kr.c @@ -201,8 +201,10 @@ static const uchar sort_order_euc_kr[]= iseuc_kr_tail2(c) || \ iseuc_kr_tail3(c)) +#define euckrcode(c,d) (((uchar)(c) <<8) | (uchar)(d)) #define MY_FUNCTION_NAME(x) my_ ## x ## _euckr +#define IS_MB1_CHAR(x) ((uchar) (x) < 0x80) #define IS_MB2_CHAR(x,y) (iseuc_kr_head(x) && iseuc_kr_tail(y)) #define DEFINE_ASIAN_ROUTINES #include "ctype-mb.ic" @@ -9938,21 +9940,50 @@ my_mb_wc_euc_kr(CHARSET_INFO *cs __attribute__((unused)), } -static MY_COLLATION_HANDLER my_collation_ci_handler = +#define MY_FUNCTION_NAME(x) my_ ## x ## _euckr_korean_ci +#define WEIGHT_MB1(x) (sort_order_euc_kr[(uchar) (x)]) +#define WEIGHT_MB2(x,y) (euckrcode(x, y)) +#include "strcoll.ic" + + +#define MY_FUNCTION_NAME(x) my_ ## x ## _euckr_bin +#define WEIGHT_MB1(x) ((uchar) (x)) +#define WEIGHT_MB2(x,y) (euckrcode(x, y)) +#include "strcoll.ic" + + +static MY_COLLATION_HANDLER my_collation_handler_euckr_korean_ci= { - NULL, /* init */ - my_strnncoll_simple, /* strnncoll */ - my_strnncollsp_simple, - my_strnxfrm_mb, /* strnxfrm */ + NULL, /* init */ + my_strnncoll_euckr_korean_ci, + my_strnncollsp_euckr_korean_ci, + my_strnxfrm_mb, my_strnxfrmlen_simple, - my_like_range_mb, /* like_range */ - my_wildcmp_mb, /* wildcmp */ + my_like_range_mb, + my_wildcmp_mb, my_strcasecmp_mb, my_instr_mb, my_hash_sort_simple, my_propagate_simple }; + +static MY_COLLATION_HANDLER my_collation_handler_euckr_bin= +{ + NULL, /* init */ + my_strnncoll_euckr_bin, + my_strnncollsp_euckr_bin, + my_strnxfrm_mb, + my_strnxfrmlen_simple, + my_like_range_mb, + my_wildcmp_mb_bin, + my_strcasecmp_mb_bin, + my_instr_mb, + my_hash_sort_mb_bin, + my_propagate_simple +}; + + static MY_CHARSET_HANDLER my_charset_handler= { NULL, /* init */ @@ -10017,7 +10048,7 @@ struct charset_info_st my_charset_euckr_korean_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_handler, - &my_collation_ci_handler + &my_collation_handler_euckr_korean_ci }; @@ -10050,7 
+10081,7 @@ struct charset_info_st my_charset_euckr_bin= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_handler, - &my_collation_mb_bin_handler + &my_collation_handler_euckr_bin }; #endif diff --git a/strings/ctype-gb2312.c b/strings/ctype-gb2312.c index 129e8edb966..e986584d356 100644 --- a/strings/ctype-gb2312.c +++ b/strings/ctype-gb2312.c @@ -163,9 +163,11 @@ static const uchar sort_order_gb2312[]= #define isgb2312head(c) (0xa1<=(uchar)(c) && (uchar)(c)<=0xf7) #define isgb2312tail(c) (0xa1<=(uchar)(c) && (uchar)(c)<=0xfe) +#define gb2312code(c,d) (((uchar)(c) <<8) | (uchar)(d)) #define MY_FUNCTION_NAME(x) my_ ## x ## _gb2312 +#define IS_MB1_CHAR(x) ((uchar) (x) < 0x80) #define IS_MB2_CHAR(x,y) (isgb2312head(x) && isgb2312tail(y)) #define DEFINE_ASIAN_ROUTINES #include "ctype-mb.ic" @@ -6341,11 +6343,23 @@ my_mb_wc_gb2312(CHARSET_INFO *cs __attribute__((unused)), } -static MY_COLLATION_HANDLER my_collation_ci_handler = +#define MY_FUNCTION_NAME(x) my_ ## x ## _gb2312_chinese_ci +#define WEIGHT_MB1(x) (sort_order_gb2312[(uchar) (x)]) +#define WEIGHT_MB2(x,y) (gb2312code(x, y)) +#include "strcoll.ic" + + +#define MY_FUNCTION_NAME(x) my_ ## x ## _gb2312_bin +#define WEIGHT_MB1(x) ((uchar) (x)) +#define WEIGHT_MB2(x,y) (gb2312code(x, y)) +#include "strcoll.ic" + + +static MY_COLLATION_HANDLER my_collation_handler_gb2312_chinese_ci= { - NULL, /* init */ - my_strnncoll_simple, /* strnncoll */ - my_strnncollsp_simple, + NULL, /* init */ + my_strnncoll_gb2312_chinese_ci, + my_strnncollsp_gb2312_chinese_ci, my_strnxfrm_mb, /* strnxfrm */ my_strnxfrmlen_simple, my_like_range_mb, /* like_range */ @@ -6356,6 +6370,24 @@ static MY_COLLATION_HANDLER my_collation_ci_handler = my_propagate_simple }; + +static MY_COLLATION_HANDLER my_collation_handler_gb2312_bin= +{ + NULL, /* init */ + my_strnncoll_gb2312_bin, + my_strnncollsp_gb2312_bin, + my_strnxfrm_mb, + my_strnxfrmlen_simple, + my_like_range_mb, + my_wildcmp_mb_bin, + 
my_strcasecmp_mb_bin, + my_instr_mb, + my_hash_sort_mb_bin, + my_propagate_simple +}; + + + static MY_CHARSET_HANDLER my_charset_handler= { NULL, /* init */ @@ -6420,9 +6452,10 @@ struct charset_info_st my_charset_gb2312_chinese_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_handler, - &my_collation_ci_handler + &my_collation_handler_gb2312_chinese_ci }; + struct charset_info_st my_charset_gb2312_bin= { 86,0,0, /* number */ @@ -6452,7 +6485,7 @@ struct charset_info_st my_charset_gb2312_bin= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_handler, - &my_collation_mb_bin_handler + &my_collation_handler_gb2312_bin }; #endif diff --git a/strings/ctype-gbk.c b/strings/ctype-gbk.c index b3bd1efb6c4..2d4dbaf202a 100644 --- a/strings/ctype-gbk.c +++ b/strings/ctype-gbk.c @@ -44,6 +44,7 @@ #define gbktail(e) ((uchar)(e&0xff)) #define MY_FUNCTION_NAME(x) my_ ## x ## _gbk +#define IS_MB1_CHAR(x) ((uchar) (x) < 0x80) #define IS_MB2_CHAR(x,y) (isgbkhead(x) && isgbktail(y)) #define DEFINE_ASIAN_ROUTINES #include "ctype-mb.ic" @@ -3450,87 +3451,6 @@ static uint16 gbksortorder(uint16 i) } -int my_strnncoll_gbk_internal(const uchar **a_res, const uchar **b_res, - size_t length) -{ - const uchar *a= *a_res, *b= *b_res; - uint a_char,b_char; - - while (length--) - { - if ((length > 0) && isgbkcode(*a,*(a+1)) && isgbkcode(*b, *(b+1))) - { - a_char= gbkcode(*a,*(a+1)); - b_char= gbkcode(*b,*(b+1)); - if (a_char != b_char) - return ((int) gbksortorder((uint16) a_char) - - (int) gbksortorder((uint16) b_char)); - a+= 2; - b+= 2; - length--; - } - else if (sort_order_gbk[*a++] != sort_order_gbk[*b++]) - return ((int) sort_order_gbk[a[-1]] - - (int) sort_order_gbk[b[-1]]); - } - *a_res= a; - *b_res= b; - return 0; -} - - - -int my_strnncoll_gbk(CHARSET_INFO *cs __attribute__((unused)), - const uchar *a, size_t a_length, - const uchar *b, size_t b_length, - my_bool b_is_prefix) -{ - size_t length= MY_MIN(a_length, 
b_length); - int res= my_strnncoll_gbk_internal(&a, &b, length); - return res ? res : (int) ((b_is_prefix ? length : a_length) - b_length); -} - - -static int my_strnncollsp_gbk(CHARSET_INFO * cs __attribute__((unused)), - const uchar *a, size_t a_length, - const uchar *b, size_t b_length, - my_bool diff_if_only_endspace_difference) -{ - size_t length= MY_MIN(a_length, b_length); - int res= my_strnncoll_gbk_internal(&a, &b, length); - -#ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE - diff_if_only_endspace_difference= 0; -#endif - - if (!res && a_length != b_length) - { - const uchar *end; - int swap= 1; - if (diff_if_only_endspace_difference) - res= 1; /* Assume 'a' is bigger */ - /* - Check the next not space character of the longer key. If it's < ' ', - then it's smaller than the other key. - */ - if (a_length < b_length) - { - /* put shorter key in a */ - a_length= b_length; - a= b; - swap= -1; /* swap sign of result */ - res= -res; - } - for (end= a + a_length-length; a < end ; a++) - { - if (*a != ' ') - return (*a < ' ') ? 
-swap : swap; - } - } - return res; -} - - static size_t my_strnxfrm_gbk(CHARSET_INFO *cs, uchar *dst, size_t dstlen, uint nweights, @@ -10735,11 +10655,23 @@ my_mb_wc_gbk(CHARSET_INFO *cs __attribute__((unused)), } -static MY_COLLATION_HANDLER my_collation_ci_handler = +#define MY_FUNCTION_NAME(x) my_ ## x ## _gbk_chinese_ci +#define WEIGHT_MB1(x) (sort_order_gbk[(uchar) (x)]) +#define WEIGHT_MB2(x,y) (gbksortorder(gbkcode(x,y))) +#include "strcoll.ic" + + +#define MY_FUNCTION_NAME(x) my_ ## x ## _gbk_bin +#define WEIGHT_MB1(x) ((uchar) (x)) +#define WEIGHT_MB2(x,y) (gbkcode(x,y)) +#include "strcoll.ic" + + +static MY_COLLATION_HANDLER my_collation_handler_gbk_chinese_ci= { - NULL, /* init */ - my_strnncoll_gbk, - my_strnncollsp_gbk, + NULL, /* init */ + my_strnncoll_gbk_chinese_ci, + my_strnncollsp_gbk_chinese_ci, my_strnxfrm_gbk, my_strnxfrmlen_simple, my_like_range_mb, @@ -10750,6 +10682,24 @@ static MY_COLLATION_HANDLER my_collation_ci_handler = my_propagate_simple }; + +static MY_COLLATION_HANDLER my_collation_handler_gbk_bin= +{ + NULL, /* init */ + my_strnncoll_gbk_bin, + my_strnncollsp_gbk_bin, + my_strnxfrm_mb, + my_strnxfrmlen_simple, + my_like_range_mb, + my_wildcmp_mb_bin, + my_strcasecmp_mb_bin, + my_instr_mb, + my_hash_sort_mb_bin, + my_propagate_simple +}; + + + static MY_CHARSET_HANDLER my_charset_handler= { NULL, /* init */ @@ -10814,7 +10764,7 @@ struct charset_info_st my_charset_gbk_chinese_ci= 1, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_handler, - &my_collation_ci_handler + &my_collation_handler_gbk_chinese_ci }; struct charset_info_st my_charset_gbk_bin= @@ -10846,7 +10796,7 @@ struct charset_info_st my_charset_gbk_bin= 1, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_handler, - &my_collation_mb_bin_handler + &my_collation_handler_gbk_bin }; diff --git a/strings/ctype-mb.ic b/strings/ctype-mb.ic index 55094535d5e..0a9c47090fe 100644 --- a/strings/ctype-mb.ic +++ 
b/strings/ctype-mb.ic @@ -256,3 +256,5 @@ MY_FUNCTION_NAME(well_formed_char_length)(CHARSET_INFO *cs __attribute__((unused return nchars0 - nchars; } #endif /* DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN */ + +#undef MY_FUNCTION_NAME diff --git a/strings/ctype-sjis.c b/strings/ctype-sjis.c index bbf0026cf2b..57e674f47a6 100644 --- a/strings/ctype-sjis.c +++ b/strings/ctype-sjis.c @@ -186,6 +186,7 @@ static const uchar sort_order_sjis[]= #define MY_FUNCTION_NAME(x) my_ ## x ## _sjis #define IS_8BIT_CHAR(x) issjiskata(x) +#define IS_MB1_CHAR(x) ((uchar) (x) < 0x80 || issjiskata(x)) #define IS_MB2_CHAR(x,y) (issjishead(x) && issjistail(y)) #define DEFINE_ASIAN_ROUTINES #include "ctype-mb.ic" @@ -1088,90 +1089,6 @@ static MY_UNICASE_INFO my_caseinfo_sjis= }; -static int my_strnncoll_sjis_internal(CHARSET_INFO *cs, - const uchar **a_res, size_t a_length, - const uchar **b_res, size_t b_length) -{ - const uchar *a= *a_res, *b= *b_res; - const uchar *a_end= a + a_length; - const uchar *b_end= b + b_length; - while (a < a_end && b < b_end) - { - if (ismbchar_sjis(cs,(char*) a, (char*) a_end) && - ismbchar_sjis(cs,(char*) b, (char*) b_end)) - { - uint a_char= sjiscode(*a, *(a+1)); - uint b_char= sjiscode(*b, *(b+1)); - if (a_char != b_char) - return (int) a_char - (int) b_char; - a += 2; - b += 2; - } else - { - if (sort_order_sjis[(uchar)*a] != sort_order_sjis[(uchar)*b]) - return sort_order_sjis[(uchar)*a] - sort_order_sjis[(uchar)*b]; - a++; - b++; - } - } - *a_res= a; - *b_res= b; - return 0; -} - - -static int my_strnncoll_sjis(CHARSET_INFO *cs __attribute__((unused)), - const uchar *a, size_t a_length, - const uchar *b, size_t b_length, - my_bool b_is_prefix) -{ - int res= my_strnncoll_sjis_internal(cs, &a, a_length, &b, b_length); - if (b_is_prefix && a_length > b_length) - a_length= b_length; - return res ? 
res : (int) (a_length - b_length); -} - - -static int my_strnncollsp_sjis(CHARSET_INFO *cs __attribute__((unused)), - const uchar *a, size_t a_length, - const uchar *b, size_t b_length, - my_bool diff_if_only_endspace_difference) -{ - const uchar *a_end= a + a_length, *b_end= b + b_length; - int res= my_strnncoll_sjis_internal(cs, &a, a_length, &b, b_length); - -#ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE - diff_if_only_endspace_difference= 0; -#endif - - if (!res && (a != a_end || b != b_end)) - { - int swap= 1; - if (diff_if_only_endspace_difference) - res= 1; /* Assume 'a' is bigger */ - /* - Check the next not space character of the longer key. If it's < ' ', - then it's smaller than the other key. - */ - if (a == a_end) - { - /* put shorter key in a */ - a_end= b_end; - a= b; - swap= -1; /* swap sign of result */ - res= -res; - } - for (; a < a_end ; a++) - { - if (*a != ' ') - return (*a < ' ') ? -swap : swap; - } - } - return res; -} - - - /* SJIS->Unicode conversion table */ static uint16 sjis_to_unicode[65536]= { @@ -34099,15 +34016,36 @@ size_t my_numcells_sjis(CHARSET_INFO *cs __attribute__((unused)), } -static MY_COLLATION_HANDLER my_collation_ci_handler = +/* + sjis_japanese_ci and sjis_bin sort character blocks in this order: + 1. [00..7F] - 7BIT characters (ASCII) + 2. [81..9F][40..7E,80..FC] - MB2 characters, part1 + 3. [A1..DF] - 8BIT characters (Kana) + 4. 
[E0..FC][40..7E,80..FC] - MB2 characters, part2 +*/ +#define MY_FUNCTION_NAME(x) my_ ## x ## _sjis_japanese_ci +#define WEIGHT_PAD_SPACE (256 * (int) ' ') +#define WEIGHT_MB1(x) (256 * (int) sort_order_sjis[(uchar) (x)]) +#define WEIGHT_MB2(x,y) (sjiscode(x, y)) +#include "strcoll.ic" + + +#define MY_FUNCTION_NAME(x) my_ ## x ## _sjis_bin +#define WEIGHT_PAD_SPACE (256 * (int) ' ') +#define WEIGHT_MB1(x) (256 * (int) (uchar) (x)) +#define WEIGHT_MB2(x,y) (sjiscode(x, y)) +#include "strcoll.ic" + + +static MY_COLLATION_HANDLER my_collation_handler_sjis_japanese_ci= { - NULL, /* init */ - my_strnncoll_sjis, - my_strnncollsp_sjis, + NULL, /* init */ + my_strnncoll_sjis_japanese_ci, + my_strnncollsp_sjis_japanese_ci, my_strnxfrm_mb, my_strnxfrmlen_simple, my_like_range_mb, - my_wildcmp_mb, /* wildcmp */ + my_wildcmp_mb, my_strcasecmp_8bit, my_instr_mb, my_hash_sort_simple, @@ -34115,6 +34053,22 @@ static MY_COLLATION_HANDLER my_collation_ci_handler = }; +static MY_COLLATION_HANDLER my_collation_handler_sjis_bin= +{ + NULL, /* init */ + my_strnncoll_sjis_bin, + my_strnncollsp_sjis_bin, + my_strnxfrm_mb, + my_strnxfrmlen_simple, + my_like_range_mb, + my_wildcmp_mb_bin, + my_strcasecmp_mb_bin, + my_instr_mb, + my_hash_sort_mb_bin, + my_propagate_simple +}; + + static MY_CHARSET_HANDLER my_charset_handler= { NULL, /* init */ @@ -34179,7 +34133,7 @@ struct charset_info_st my_charset_sjis_japanese_ci= 1, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_handler, - &my_collation_ci_handler + &my_collation_handler_sjis_japanese_ci }; struct charset_info_st my_charset_sjis_bin= @@ -34211,7 +34165,7 @@ struct charset_info_st my_charset_sjis_bin= 1, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_handler, - &my_collation_mb_bin_handler + &my_collation_handler_sjis_bin }; #endif diff --git a/strings/strcoll.ic b/strings/strcoll.ic new file mode 100644 index 00000000000..f230c4f7411 --- /dev/null +++ b/strings/strcoll.ic 
@@ -0,0 +1,231 @@ +/* + Copyright (c) 2015, MariaDB Foundation + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ + + +#ifndef MY_FUNCTION_NAME +#error MY_FUNCTION_NAME is not defined +#endif + + +/* + The weight for automatically padded spaces when comparing strings with + the PAD SPACE property. + Should normally be equal to the weight of a regular space. +*/ +#ifndef WEIGHT_PAD_SPACE +#define WEIGHT_PAD_SPACE (' ') +#endif + + +/* + Weight of an illegal byte, must follow these rules: + 1. Must be greater than weight of any normal character in the collation. + 2. Two different bad bytes must have different weights and must be + compared in their binary order. + + Depends on mbmaxlen of the character set, as well as how the collation + sorts various single-byte and multi-byte character blocks. + + The macro below is the default definition, it is suitable for mbmaxlen=2 + character sets that sort all multi-byte characters after all single-byte + characters: big5, euckr, gb2312, gbk. + + All mbmaxlen>2 character sets must provide their own definitions. + All collations that have a more complex order (than just MB1 followed by MB2) + must also provide their own definitions (see definitions for + cp932_japanese_ci and sjis_japanese_ci as examples of a more complex order). 
+*/
+#ifndef WEIGHT_ILSEQ
+#define WEIGHT_ILSEQ(x) (0xFF00 + (x))
+#endif
+
+
+/**
+  Scan a valid character, or a bad byte, or an auto-padded space
+  from a string and calculate the weight of the scanned sequence.
+
+  A byte that is neither a single byte character nor the start of a valid
+  two-byte character (this includes a lone byte at the very end of the
+  string) is scanned as one bad byte with the weight WEIGHT_ILSEQ(byte).
+
+  @param [OUT] weight - the weight is returned here
+  @param str          - the string
+  @param end          - the end of the string
+  @return             - the number of bytes scanned:
+                        0 - the string has ended, the weight returned is
+                            WEIGHT_PAD_SPACE
+                        1 - a single byte character or a bad byte
+                        2 - a valid two-byte character
+
+  The including source file must define the following macros:
+  IS_MB1_CHAR(x)
+  IS_MB2_CHAR(x,y)
+  WEIGHT_PAD_SPACE
+  WEIGHT_MB1(x)
+  WEIGHT_MB2(x,y)
+  WEIGHT_ILSEQ(x)
+*/
+static inline uint
+MY_FUNCTION_NAME(scan_weight)(int *weight, const uchar *str, const uchar *end)
+{
+  if (str >= end)
+  {
+    *weight= WEIGHT_PAD_SPACE;
+    return 0;
+  }
+
+  if (IS_MB1_CHAR(*str))
+  {
+    *weight= WEIGHT_MB1(*str);           /* A valid single byte character*/
+    return 1;
+  }
+
+  if (str + 2 > end)                     /* The string ended unexpectedly */
+    goto bad;                            /* Treat as a bad byte */
+
+  if (IS_MB2_CHAR(str[0], str[1]))
+  {
+    *weight= WEIGHT_MB2(str[0], str[1]);
+    return 2;                            /* A valid two-byte character */
+  }
+
+bad:
+  *weight= WEIGHT_ILSEQ(str[0]);         /* Bad byte */
+  return 1;
+}
+
+
+/**
+  Compare two strings according to the collation,
+  without handling the PAD SPACE property.
+
+  Note, cs->coll->strnncoll() is usually used to compare identifiers.
+  Perhaps we should eventually (in 10.2?) create a new collation
+  my_charset_utf8_general_ci_no_pad and have only one comparison function
+  in MY_COLLATION_HANDLER.
+
+  @param cs          - the character set and collation
+  @param a           - the left string
+  @param a_length    - the length of the left string
+  @param b           - the right string
+  @param b_length    - the length of the right string
+  @param b_is_prefix - if the caller wants to check if "b" is a prefix of "a"
+  @return            - the comparison result:
+                       <0 - "a" is smaller than "b"
+                        0 - the strings are equal, or "b" is a prefix of "a"
+                            and b_is_prefix is set
+                       >0 - "a" is greater than "b"
+*/
+static int
+MY_FUNCTION_NAME(strnncoll)(CHARSET_INFO *cs __attribute__((unused)),
+                            const uchar *a, size_t a_length,
+                            const uchar *b, size_t b_length,
+                            my_bool b_is_prefix)
+{
+  const uchar *a_end= a + a_length;
+  const uchar *b_end= b + b_length;
+  for ( ; ; )
+  {
+    int a_weight, b_weight, res;
+    uint a_wlen= MY_FUNCTION_NAME(scan_weight)(&a_weight, a, a_end);
+    uint b_wlen= MY_FUNCTION_NAME(scan_weight)(&b_weight, b, b_end);
+    /*
+      a_wlen  b_wlen Comment
+      ------  ------ -------
+      0       0      Strings ended simultaneously, "a" and "b" are equal.
+      0       >0     "a" is a prefix of "b", so "a" is smaller.
+      >0      0      "b" is a prefix of "a", check b_is_prefix.
+      >0      >0     Two weights were scanned, check weight difference.
+
+      In the two "prefix" cases the sign is returned explicitly rather than
+      derived from the scanned weight: a weight can be 0 (e.g. a 0x00 byte
+      when WEIGHT_MB1(x) is defined as 256 * x), and returning -0 or 0 would
+      report a string and its longer continuation as equal.
+    */
+    if (!a_wlen)
+      return b_wlen ? -1 : 0;
+
+    if (!b_wlen)
+      return b_is_prefix ? 0 : 1;
+
+    if ((res= (a_weight - b_weight)))
+      return res;
+    /*
+      None of the strings has ended yet.
+    */
+    DBUG_ASSERT(a < a_end);
+    DBUG_ASSERT(b < b_end);
+    a+= a_wlen;
+    b+= b_wlen;
+  }
+  DBUG_ASSERT(0);
+  return 0;
+}
+
+
+/**
+  Compare two strings according to the collation, with PAD SPACE handling.
+
+  The shorter string is treated as if it were extended with spaces:
+  once a string has ended, scan_weight() keeps returning WEIGHT_PAD_SPACE
+  for it, so trailing spaces of the longer string compare as equal.
+
+  @param cs          - the character set and collation
+  @param a           - the left string
+  @param a_length    - the length of the left string
+  @param b           - the right string
+  @param b_length    - the length of the right string
+  @param diff_if_only_endspace_difference - not used in the code.
+                       TODO: this should be eventually removed (in 10.2?)
+  @return            - the comparison result: the difference of the first
+                       two weights that differ, or 0 if both strings ended
+                       without a difference
+*/
+
+static int
+MY_FUNCTION_NAME(strnncollsp)(CHARSET_INFO *cs __attribute__((unused)),
+                              const uchar *a, size_t a_length,
+                              const uchar *b, size_t b_length,
+                              my_bool diff_if_only_endspace_difference
+                              __attribute__((unused)))
+{
+  const uchar *a_end= a + a_length;
+  const uchar *b_end= b + b_length;
+  for ( ; ; )
+  {
+    int a_weight, b_weight, res;
+    uint a_wlen= MY_FUNCTION_NAME(scan_weight)(&a_weight, a, a_end);
+    uint b_wlen= MY_FUNCTION_NAME(scan_weight)(&b_weight, b, b_end);
+    if ((res= (a_weight - b_weight)))
+    {
+      /*
+        Got two different weights. Each weight can be generated by either of:
+        - a real character
+        - a bad byte sequence or an incomplete byte sequence
+        - an auto-generated trailing space (PAD SPACE)
+        It does not matter how exactly each weight was generated.
+        Just return the weight difference.
+      */
+      return res;
+    }
+    if (!a_wlen && !b_wlen)
+    {
+      /*
+        Got two auto-generated trailing spaces, i.e.
+        both strings have now ended, so they are equal.
+      */
+      DBUG_ASSERT(a == a_end);
+      DBUG_ASSERT(b == b_end);
+      return 0;
+    }
+    /*
+      At least one of the strings has not ended yet, continue comparison.
+    */
+    DBUG_ASSERT(a < a_end || b < b_end);
+    a+= a_wlen;
+    b+= b_wlen;
+  }
+  DBUG_ASSERT(0);
+  return 0;
+}
+
+/*
+  We usually include this file at least two times from the same source file,
+  for the _ci and the _bin collations. Prepare for the second inclusion.
+*/
+#undef MY_FUNCTION_NAME
+#undef WEIGHT_ILSEQ
+#undef WEIGHT_MB1
+#undef WEIGHT_MB2
+#undef WEIGHT_PAD_SPACE
diff --git a/unittest/strings/strings-t.c b/unittest/strings/strings-t.c
index 6baef0417a8..c7824d07047 100644
--- a/unittest/strings/strings-t.c
+++ b/unittest/strings/strings-t.c
@@ -95,11 +95,361 @@ static CHARSET_INFO *charset_list[]=
 };
+typedef struct
+{
+  const char *a;    /* the left string */
+  size_t alen;      /* length of "a" in bytes */
+  const char *b;    /* the right string */
+  size_t blen;      /* length of "b" in bytes */
+  int res;          /* expected result; only its sign is checked (see eqres) */
+} STRNNCOLL_PARAM;
+
+/* Expand a string literal to "pointer, length without the trailing NUL" */
+#define CSTR(x) (x),(sizeof(x)-1)
+
+/*
+  Byte sequence types used in the tests:
+  8BIT     - an 8-bit byte (>=0x80) which makes a single byte character
+  MB2      - two bytes that make a valid character
+  H2       - a byte which is a valid MB2 head byte
+  T2       - a byte which is a valid MB2 tail byte
+  ILSEQ    - a byte which makes an illegal sequence
+  H2+ILSEQ - a sequence that starts with a valid H2 byte,
+             but not followed by a valid T2 byte.
+
+  Charset  H2               T2                      8BIT
+  -------  ---------------- ---------------         --------
+  big5     [A1..F9]         [40..7E,A1..FE]
+  euckr    [81..FE]         [41..5A,61..7A,81..FE]
+  gb2312   [A1..F7]         [A1..FE]
+  gbk      [81..FE]         [40..7E,80..FE]
+
+  cp932    [81..9F,E0..FC]  [40..7E,80..FC]         [A1..DF]
+  sjis     [81..9F,E0..FC]  [40..7E,80..FC]         [A1..DF]
+
+
+  Essential byte sequences in various character sets:
+
+  Sequence  big5   cp932      euckr  gb2312    gbk    sjis
+  --------  ----   -----      -----  ------    ---    ----
+  80        ILSEQ  ILSEQ      ILSEQ  ILSEQ     ILSEQ  ILSEQ
+  81        ILSEQ  H2         H2     ILSEQ     H2     H2
+  A1        H2     8BIT       H2     H2        H2     8BIT
+  A1A1      MB2    8BIT+8BIT  MB2    MB2       MB2    8BIT+8BIT
+  E0E0      MB2    MB2        MB2    MB2       MB2    MB2
+  F9FE      MB2    H2+ILSEQ   MB2    ILSEQ+T2  MB2    H2+ILSEQ
+*/
+
+
+/*
+  For character sets that have the following byte sequences:
+  80   - ILSEQ
+  81   - ILSEQ or H2
+  F9   - ILSEQ or H2
+  A1A1 - MB2 or 8BIT+8BIT
+  E0E0 - MB2
+*/
+STRNNCOLL_PARAM strcoll_mb2_common[]=
+{
+  /* Compare two good sequences */
+  {CSTR(""), CSTR(""), 0},
+  {CSTR(""), CSTR(" "), 0},
+  {CSTR(""), CSTR("A"), -1},
+  {CSTR(""), CSTR("a"), -1},
+  {CSTR(""), CSTR("\xA1\xA1"), -1},
+  {CSTR(""), CSTR("\xE0\xE0"), -1},
+
+  {CSTR(" "), CSTR(""), 0},
+  {CSTR(" "), CSTR(" "), 0},
+  {CSTR(" "), CSTR("A"), -1},
+  {CSTR(" "), CSTR("a"), -1},
+  {CSTR(" "), CSTR("\xA1\xA1"), -1},
+  {CSTR(" "), CSTR("\xE0\xE0"), -1},
+
+  {CSTR("a"), CSTR(""), 1},
+  {CSTR("a"), CSTR(" "), 1},
+  {CSTR("a"), CSTR("a"), 0},
+  {CSTR("a"), CSTR("\xA1\xA1"), -1},
+  {CSTR("a"), CSTR("\xE0\xE0"), -1},
+
+  {CSTR("\xA1\xA1"), CSTR("\xA1\xA1"), 0},
+  {CSTR("\xA1\xA1"), CSTR("\xE0\xE0"), -1},
+
+  /* Compare a good character to an illegal or an incomplete sequence */
+  {CSTR(""), CSTR("\x80"), -1},
+  {CSTR(""), CSTR("\x81"), -1},
+  {CSTR(""), CSTR("\xF9"), -1},
+
+  {CSTR(" "), CSTR("\x80"), -1},
+  {CSTR(" "), CSTR("\x81"), -1},
+  {CSTR(" "), CSTR("\xF9"), -1},
+
+  {CSTR("a"), CSTR("\x80"), -1},
+  {CSTR("a"), CSTR("\x81"), -1},
+  {CSTR("a"), CSTR("\xF9"), -1},
+
+  {CSTR("\xA1\xA1"), CSTR("\x80"), -1},
+  {CSTR("\xA1\xA1"), CSTR("\x81"), -1},
+  {CSTR("\xA1\xA1"), CSTR("\xF9"), -1},
+
+  {CSTR("\xE0\xE0"), CSTR("\x80"), -1},
+  {CSTR("\xE0\xE0"), CSTR("\x81"), -1},
+  {CSTR("\xE0\xE0"), CSTR("\xF9"), -1},
+
+  /* Compare two bad/incomplete sequences */
+  {CSTR("\x80"), CSTR("\x80"), 0},
+  {CSTR("\x80"), CSTR("\x81"), -1},
+  {CSTR("\x80"), CSTR("\xF9"), -1},
+  {CSTR("\x81"), CSTR("\x81"), 0},
+  {CSTR("\x81"), CSTR("\xF9"), -1},
+
+  {NULL, 0, NULL, 0, 0}
+};
+
+
+/*
+  For character sets that have good mb2 characters A1A1 and F9FE
+*/
+STRNNCOLL_PARAM strcoll_mb2_A1A1_mb2_F9FE[]=
+{
+  /* Compare two good characters */
+  {CSTR(""), CSTR("\xF9\xFE"), -1},
+  {CSTR(" "), CSTR("\xF9\xFE"), -1},
+  {CSTR("a") , CSTR("\xF9\xFE"), -1},
+  {CSTR("\xA1\xA1"), CSTR("\xF9\xFE"), -1},
+  {CSTR("\xF9\xFE"), CSTR("\xF9\xFE"), 0},
+
+  /* Compare a good character to an illegal or an incomplete sequence */
+  {CSTR(""), CSTR("\xA1"), -1},
+  {CSTR(""), CSTR("\xF9"), -1},
+  {CSTR("a"), CSTR("\xA1"), -1},
+  {CSTR("a"), CSTR("\xF9"), -1},
+
+  {CSTR("\xA1\xA1"), CSTR("\xA1"), -1},
+  {CSTR("\xA1\xA1"), CSTR("\xF9"), -1},
+
+  {CSTR("\xF9\xFE"), CSTR("\x80"), -1},
+  {CSTR("\xF9\xFE"), CSTR("\x81"), -1},
+  {CSTR("\xF9\xFE"), CSTR("\xA1"), -1},
+  {CSTR("\xF9\xFE"), CSTR("\xF9"), -1},
+
+  /* Compare two bad/incomplete sequences */
+  {CSTR("\x80"), CSTR("\xA1"), -1},
+  {CSTR("\x80"), CSTR("\xF9"), -1},
+
+  {NULL, 0, NULL, 0, 0}
+};
+
+
+/*
+  For character sets that have:
+  A1A1 - a good mb2 character
+  F9FE - a bad sequence
+*/
+STRNNCOLL_PARAM strcoll_mb2_A1A1_bad_F9FE[]=
+{
+  /* Compare a good character to an illegal or an incomplete sequence */
+  {CSTR(""), CSTR("\xF9\xFE"), -1},
+  {CSTR(" "), CSTR("\xF9\xFE"), -1},
+  {CSTR("a") , CSTR("\xF9\xFE"), -1},
+  {CSTR("\xA1\xA1"), CSTR("\xF9\xFE"), -1},
+
+  {CSTR(""), CSTR("\xA1"), -1},
+  {CSTR(""), CSTR("\xF9"), -1},
+  {CSTR("a"), CSTR("\xA1"), -1},
+  {CSTR("a"), CSTR("\xF9"), -1},
+
+  {CSTR("\xA1\xA1"), CSTR("\xA1"), -1},
+  {CSTR("\xA1\xA1"), CSTR("\xF9"), -1},
+
+  /* Compare two bad/incomplete sequences */
+  {CSTR("\xF9\xFE"), CSTR("\x80"), 1},
+  {CSTR("\xF9\xFE"), CSTR("\x81"), 1},
+  {CSTR("\xF9\xFE"), CSTR("\xA1"), 1},
+  {CSTR("\xF9\xFE"), CSTR("\xF9"), 1},
+  {CSTR("\x80"), CSTR("\xA1"), -1},
+  {CSTR("\x80"), CSTR("\xF9"), -1},
+  {CSTR("\xF9\xFE"), CSTR("\xF9\xFE"), 0},
+
+  {NULL, 0, NULL, 0, 0}
+};
+
+
+/*
+  For character sets that have:
+  80   - ILSEQ or H2
+  81   - ILSEQ or H2
+  A1   - 8BIT
+  F9   - ILSEQ or H2
+  F9FE - a bad sequence (ILSEQ+XX or H2+ILSEQ)
+*/
+STRNNCOLL_PARAM strcoll_mb1_A1_bad_F9FE[]=
+{
+  /* Compare two good characters */
+  {CSTR(""), CSTR("\xA1"), -1},
+  {CSTR("\xA1\xA1"), CSTR("\xA1"), 1},
+
+  /* Compare a good character to an illegal or an incomplete sequence */
+  {CSTR(""), CSTR("\xF9"), -1},
+  {CSTR(""), CSTR("\xF9\xFE"), -1},
+  {CSTR(" "), CSTR("\xF9\xFE"), -1},
+  {CSTR("a"), CSTR("\xF9\xFE"), -1},
+  {CSTR("a"), CSTR("\xA1"), -1},
+  {CSTR("a"), CSTR("\xF9"), -1},
+
+  {CSTR("\xA1\xA1"), CSTR("\xF9"), -1},
+  {CSTR("\xA1\xA1"), CSTR("\xF9\xFE"), -1},
+
+  {CSTR("\xF9\xFE"), CSTR("\x80"), 1},
+  {CSTR("\xF9\xFE"), CSTR("\x81"), 1},
+  {CSTR("\xF9\xFE"), CSTR("\xA1"), 1},
+  {CSTR("\xF9\xFE"), CSTR("\xF9"), 1},
+
+  {CSTR("\x80"), CSTR("\xA1"), 1},
+
+  /* Compare two bad/incomplete sequences */
+  {CSTR("\x80"), CSTR("\xF9"), -1},
+  {CSTR("\xF9\xFE"), CSTR("\xF9\xFE"), 0},
+
+  {NULL, 0, NULL, 0, 0}
+};
+
+
+/*
+  For character sets (e.g. cp932 and sjis) that have:
+  8181 - a valid MB2 character
+  A1   - a valid 8BIT character
+  E0E0 - a valid MB2 character
+  and sort in this order:
+  8181 < A1 < E0E0
+*/
+STRNNCOLL_PARAM strcoll_8181_A1_E0E0[]=
+{
+  {CSTR("\x81\x81"), CSTR("\xA1"), -1},
+  {CSTR("\x81\x81"), CSTR("\xE0\xE0"), -1},
+  {CSTR("\xA1"), CSTR("\xE0\xE0"), -1},
+
+  {NULL, 0, NULL, 0, 0}
+};
+
+/* Print src as two upper-case hex digits per byte into dst (for diag) */
+static void
+str2hex(char *dst, size_t dstlen, const char *src, size_t srclen)
+{
+  char *dstend= dst + dstlen;
+  const char *srcend= src + srclen;
+  /*
+    Start with an empty string, so dst is terminated even if srclen is 0.
+    Continue only while more than 3 bytes are left in dst: every sprintf()
+    writes two digits and a terminating NUL.
+  */
+  for (*dst= '\0' ; dst + 3 < dstend && src < srcend; )
+  {
+    sprintf(dst, "%02X", (unsigned char) src[0]);
+    dst+=2;
+    src++;
+  }
+}
+
+
+/*
+  Check if the two comparison results are semantically equal:
+  both are negative, both are positive, or both are zero.
+*/
+static int
+eqres(int ares, int bres)
+{
+  return (ares < 0 && bres < 0) ||
+         (ares > 0 && bres > 0) ||
+         (ares == 0 && bres == 0);
+}
+
+/* Run one table (ended by a NULL "a") through cs; return the failure count */
+static int
+strcollsp(CHARSET_INFO *cs, const STRNNCOLL_PARAM *param)
+{
+  int failed= 0;
+  const STRNNCOLL_PARAM *p;
+  diag("%-20s %-10s %-10s %10s %10s", "Collation", "a", "b", "ExpectSign", "Actual");
+  for (p= param; p->a; p++)
+  {
+    char ahex[64], bhex[64];
+    int res= cs->coll->strnncollsp(cs, (uchar *) p->a, p->alen,
+                                       (uchar *) p->b, p->blen, 0);
+    str2hex(ahex, sizeof(ahex), p->a, p->alen);
+    str2hex(bhex, sizeof(bhex), p->b, p->blen);
+    diag("%-20s %-10s %-10s %10d %10d%s",
+         cs->name, ahex, bhex, p->res, res,
+         eqres(res, p->res) ? "" : " FAILED");
+    if (!eqres(res, p->res))
+    {
+      failed++;
+    }
+    else
+    {
+      /* Test in reverse order: the sign of the result must be the opposite */
+      res= cs->coll->strnncollsp(cs, (uchar *) p->b, p->blen,
+                                     (uchar *) p->a, p->alen, 0);
+      if (!eqres(res, -p->res))
+      {
+        diag("Comparison in reverse order failed. Expected %d, got %d",
+             -p->res, res);
+        failed++;
+      }
+    }
+  }
+  return failed;
+}
+
+/* Run the tables that apply to every compiled-in charset; return failures */
+static int
+test_strcollsp()
+{
+  int failed= 0;
+#ifdef HAVE_CHARSET_big5
+  failed+= strcollsp(&my_charset_big5_chinese_ci, strcoll_mb2_common);
+  failed+= strcollsp(&my_charset_big5_chinese_ci, strcoll_mb2_A1A1_mb2_F9FE);
+  failed+= strcollsp(&my_charset_big5_bin, strcoll_mb2_common);
+  failed+= strcollsp(&my_charset_big5_bin, strcoll_mb2_A1A1_mb2_F9FE);
+#endif
+#ifdef HAVE_CHARSET_cp932
+  failed+= strcollsp(&my_charset_cp932_japanese_ci, strcoll_mb2_common);
+  failed+= strcollsp(&my_charset_cp932_japanese_ci, strcoll_mb1_A1_bad_F9FE);
+  failed+= strcollsp(&my_charset_cp932_bin, strcoll_mb2_common);
+  failed+= strcollsp(&my_charset_cp932_bin, strcoll_mb1_A1_bad_F9FE);
+  failed+= strcollsp(&my_charset_cp932_japanese_ci, strcoll_8181_A1_E0E0);
+  failed+= strcollsp(&my_charset_cp932_bin, strcoll_8181_A1_E0E0);
+#endif
+#ifdef HAVE_CHARSET_euckr
+  failed+= strcollsp(&my_charset_euckr_korean_ci, strcoll_mb2_common);
+  failed+= strcollsp(&my_charset_euckr_korean_ci, strcoll_mb2_A1A1_mb2_F9FE);
+  failed+= strcollsp(&my_charset_euckr_bin, strcoll_mb2_common);
+  failed+= strcollsp(&my_charset_euckr_bin, strcoll_mb2_A1A1_mb2_F9FE);
+#endif
+#ifdef HAVE_CHARSET_gb2312
+  failed+= strcollsp(&my_charset_gb2312_chinese_ci, strcoll_mb2_common);
+  failed+= strcollsp(&my_charset_gb2312_chinese_ci, strcoll_mb2_A1A1_bad_F9FE);
+  failed+= strcollsp(&my_charset_gb2312_bin, strcoll_mb2_common);
+  failed+= strcollsp(&my_charset_gb2312_bin, strcoll_mb2_A1A1_bad_F9FE);
+#endif
+#ifdef HAVE_CHARSET_gbk
+  failed+= strcollsp(&my_charset_gbk_chinese_ci, strcoll_mb2_common);
+  failed+= strcollsp(&my_charset_gbk_chinese_ci, strcoll_mb2_A1A1_mb2_F9FE);
+  failed+= strcollsp(&my_charset_gbk_bin, strcoll_mb2_common);
+  failed+= strcollsp(&my_charset_gbk_bin, strcoll_mb2_A1A1_mb2_F9FE);
+#endif
+#ifdef HAVE_CHARSET_sjis
+  failed+= strcollsp(&my_charset_sjis_japanese_ci, strcoll_mb2_common);
+  failed+= strcollsp(&my_charset_sjis_bin, strcoll_mb2_common);
+  failed+= strcollsp(&my_charset_sjis_japanese_ci, strcoll_mb1_A1_bad_F9FE);
+  failed+= strcollsp(&my_charset_sjis_bin, strcoll_mb1_A1_bad_F9FE);
+  failed+= strcollsp(&my_charset_sjis_japanese_ci, strcoll_8181_A1_E0E0);
+  failed+= strcollsp(&my_charset_sjis_bin, strcoll_8181_A1_E0E0);
+#endif
+  return failed;
+}
+
+
 int main()
 {
   size_t i, failed= 0;
-  plan(1);
+  plan(2);
   diag("Testing my_like_range_xxx() functions");
   for (i= 0; i < array_elements(charset_list); i++)
@@ -112,5 +462,10 @@ int main()
     }
   }
   ok(failed == 0, "Testing my_like_range_xxx() functions");
+
+  diag("Testing cs->coll->strnncollsp()");
+  failed= test_strcollsp();
+  ok(failed == 0, "Testing cs->coll->strnncollsp()");
+
   return exit_status();
 }
|