summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlexander Barkov <bar@mariadb.org>2015-06-26 13:40:28 +0400
committerAlexander Barkov <bar@mariadb.org>2015-06-26 13:40:28 +0400
commit4f828a1cac9a9c378a2a9f3c3ef0710eaf11ce02 (patch)
treef4da132264de74b64df5035bfec50c2bb80d987b
parentd535728165acb2eb55140bb70fa44c458d1ccc06 (diff)
downloadmariadb-git-4f828a1cac9a9c378a2a9f3c3ef0710eaf11ce02.tar.gz
MDEV-8214 Asian MB2 charsets: compare broken bytes as "greater than any non-broken character"
-rw-r--r--strings/ctype-big5.c123
-rw-r--r--strings/ctype-cp932.c136
-rw-r--r--strings/ctype-euc_kr.c49
-rw-r--r--strings/ctype-gb2312.c45
-rw-r--r--strings/ctype-gbk.c124
-rw-r--r--strings/ctype-mb.ic2
-rw-r--r--strings/ctype-sjis.c136
-rw-r--r--strings/strcoll.ic231
-rw-r--r--unittest/strings/strings-t.c357
9 files changed, 830 insertions, 373 deletions
diff --git a/strings/ctype-big5.c b/strings/ctype-big5.c
index eda81c0c4d3..925398a4d82 100644
--- a/strings/ctype-big5.c
+++ b/strings/ctype-big5.c
@@ -49,6 +49,7 @@
#define big5tail(e) ((uchar)(e&0xff))
#define MY_FUNCTION_NAME(x) my_ ## x ## _big5
+#define IS_MB1_CHAR(x) ((uchar) (x) < 0x80)
#define IS_MB2_CHAR(x,y) (isbig5head(x) && isbig5tail(y))
#define DEFINE_ASIAN_ROUTINES
#include "ctype-mb.ic"
@@ -849,89 +850,6 @@ static uint16 big5strokexfrm(uint16 i)
}
-
-static int my_strnncoll_big5_internal(const uchar **a_res,
- const uchar **b_res, size_t length)
-{
- const uchar *a= *a_res, *b= *b_res;
-
- while (length--)
- {
- if ((length > 0) && isbig5code(*a,*(a+1)) && isbig5code(*b, *(b+1)))
- {
- if (*a != *b || *(a+1) != *(b+1))
- return ((int) big5code(*a,*(a+1)) -
- (int) big5code(*b,*(b+1)));
- a+= 2;
- b+= 2;
- length--;
- }
- else if (sort_order_big5[*a++] !=
- sort_order_big5[*b++])
- return ((int) sort_order_big5[a[-1]] -
- (int) sort_order_big5[b[-1]]);
- }
- *a_res= a;
- *b_res= b;
- return 0;
-}
-
-
-/* Compare strings */
-
-static int my_strnncoll_big5(CHARSET_INFO *cs __attribute__((unused)),
- const uchar *a, size_t a_length,
- const uchar *b, size_t b_length,
- my_bool b_is_prefix)
-{
- size_t length= MY_MIN(a_length, b_length);
- int res= my_strnncoll_big5_internal(&a, &b, length);
- return res ? res : (int)((b_is_prefix ? length : a_length) - b_length);
-}
-
-
-/* compare strings, ignore end space */
-
-static int my_strnncollsp_big5(CHARSET_INFO * cs __attribute__((unused)),
- const uchar *a, size_t a_length,
- const uchar *b, size_t b_length,
- my_bool diff_if_only_endspace_difference)
-{
- size_t length= MY_MIN(a_length, b_length);
- int res= my_strnncoll_big5_internal(&a, &b, length);
-
-#ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE
- diff_if_only_endspace_difference= 0;
-#endif
-
- if (!res && a_length != b_length)
- {
- const uchar *end;
- int swap= 1;
- if (diff_if_only_endspace_difference)
- res= 1; /* Assume 'a' is bigger */
- /*
- Check the next not space character of the longer key. If it's < ' ',
- then it's smaller than the other key.
- */
- if (a_length < b_length)
- {
- /* put longer key in a */
- a_length= b_length;
- a= b;
- swap= -1; /* swap sign of result */
- res= -res;
- }
- for (end= a + a_length-length; a < end ; a++)
- {
- if (*a != ' ')
- return (*a < ' ') ? -swap : swap;
- }
- }
- return res;
-}
-
-
static size_t
my_strnxfrm_big5(CHARSET_INFO *cs,
uchar *dst, size_t dstlen, uint nweights,
@@ -6853,11 +6771,23 @@ my_mb_wc_big5(CHARSET_INFO *cs __attribute__((unused)),
}
-static MY_COLLATION_HANDLER my_collation_big5_chinese_ci_handler =
+#define MY_FUNCTION_NAME(x) my_ ## x ## _big5_chinese_ci
+#define WEIGHT_MB1(x) (sort_order_big5[(uchar) (x)])
+#define WEIGHT_MB2(x,y) (big5code(x, y))
+#include "strcoll.ic"
+
+
+#define MY_FUNCTION_NAME(x) my_ ## x ## _big5_bin
+#define WEIGHT_MB1(x) ((uchar) (x))
+#define WEIGHT_MB2(x,y) (big5code(x, y))
+#include "strcoll.ic"
+
+
+static MY_COLLATION_HANDLER my_collation_handler_big5_chinese_ci=
{
NULL, /* init */
- my_strnncoll_big5,
- my_strnncollsp_big5,
+ my_strnncoll_big5_chinese_ci,
+ my_strnncollsp_big5_chinese_ci,
my_strnxfrm_big5,
my_strnxfrmlen_simple,
my_like_range_mb,
@@ -6868,6 +6798,23 @@ static MY_COLLATION_HANDLER my_collation_big5_chinese_ci_handler =
my_propagate_simple
};
+
+static MY_COLLATION_HANDLER my_collation_handler_big5_bin=
+{
+ NULL, /* init */
+ my_strnncoll_big5_bin,
+ my_strnncollsp_big5_bin,
+ my_strnxfrm_mb,
+ my_strnxfrmlen_simple,
+ my_like_range_mb,
+ my_wildcmp_mb_bin,
+ my_strcasecmp_mb_bin,
+ my_instr_mb,
+ my_hash_sort_mb_bin,
+ my_propagate_simple
+};
+
+
static MY_CHARSET_HANDLER my_charset_big5_handler=
{
NULL, /* init */
@@ -6931,7 +6878,7 @@ struct charset_info_st my_charset_big5_chinese_ci=
1, /* escape_with_backslash_is_dangerous */
1, /* levels_for_order */
&my_charset_big5_handler,
- &my_collation_big5_chinese_ci_handler
+ &my_collation_handler_big5_chinese_ci
};
@@ -6964,7 +6911,7 @@ struct charset_info_st my_charset_big5_bin=
1, /* escape_with_backslash_is_dangerous */
1, /* levels_for_order */
&my_charset_big5_handler,
- &my_collation_mb_bin_handler
+ &my_collation_handler_big5_bin
};
diff --git a/strings/ctype-cp932.c b/strings/ctype-cp932.c
index 2e26a98bf05..7a4abfa39d1 100644
--- a/strings/ctype-cp932.c
+++ b/strings/ctype-cp932.c
@@ -185,6 +185,7 @@ static const uchar sort_order_cp932[]=
#define MY_FUNCTION_NAME(x) my_ ## x ## _cp932
#define IS_8BIT_CHAR(x) iscp932kata(x)
+#define IS_MB1_CHAR(x) ((uchar) (x) < 0x80 || iscp932kata(x))
#define IS_MB2_CHAR(x,y) (iscp932head(x) && iscp932tail(y))
#define DEFINE_ASIAN_ROUTINES
#include "ctype-mb.ic"
@@ -1717,90 +1718,6 @@ MY_UNICASE_INFO my_caseinfo_cp932=
my_caseinfo_pages_cp932
};
-static int my_strnncoll_cp932_internal(CHARSET_INFO *cs,
- const uchar **a_res, size_t a_length,
- const uchar **b_res, size_t b_length)
-{
- const uchar *a= *a_res, *b= *b_res;
- const uchar *a_end= a + a_length;
- const uchar *b_end= b + b_length;
- while (a < a_end && b < b_end)
- {
- if (ismbchar_cp932(cs,(char*) a, (char*) a_end) &&
- ismbchar_cp932(cs,(char*) b, (char*) b_end))
- {
- uint a_char= cp932code(*a, *(a+1));
- uint b_char= cp932code(*b, *(b+1));
- if (a_char != b_char)
- return a_char - b_char;
- a += 2;
- b += 2;
- } else
- {
- if (sort_order_cp932[(uchar)*a] != sort_order_cp932[(uchar)*b])
- return sort_order_cp932[(uchar)*a] - sort_order_cp932[(uchar)*b];
- a++;
- b++;
- }
- }
- *a_res= a;
- *b_res= b;
- return 0;
-}
-
-
-static int my_strnncoll_cp932(CHARSET_INFO *cs __attribute__((unused)),
- const uchar *a, size_t a_length,
- const uchar *b, size_t b_length,
- my_bool b_is_prefix)
-{
- int res= my_strnncoll_cp932_internal(cs, &a, a_length, &b, b_length);
- if (b_is_prefix && a_length > b_length)
- a_length= b_length;
- return res ? res : (int) (a_length - b_length);
-}
-
-
-static int my_strnncollsp_cp932(CHARSET_INFO *cs __attribute__((unused)),
- const uchar *a, size_t a_length,
- const uchar *b, size_t b_length,
- my_bool diff_if_only_endspace_difference
- __attribute__((unused)))
-{
- const uchar *a_end= a + a_length;
- const uchar *b_end= b + b_length;
- int res= my_strnncoll_cp932_internal(cs, &a, a_length, &b, b_length);
-
-#ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE
- diff_if_only_endspace_difference= 0;
-#endif
-
- if (!res && (a != a_end || b != b_end))
- {
- int swap= 1;
- if (diff_if_only_endspace_difference)
- res= 1; /* Assume 'a' is bigger */
- /*
- Check the next not space character of the longer key. If it's < ' ',
- then it's smaller than the other key.
- */
- if (a == a_end)
- {
- /* put shorter key in a */
- a_end= b_end;
- a= b;
- swap= -1; /* swap sign of result */
- res= -res;
- }
- for (; a < a_end ; a++)
- {
- if (*a != (uchar) ' ')
- return (*a < (uchar) ' ') ? -swap : swap;
- }
- }
- return res;
-}
-
static const uint16 cp932_to_unicode[65536]=
{
@@ -34720,15 +34637,36 @@ size_t my_numcells_cp932(CHARSET_INFO *cs __attribute__((unused)),
}
-static MY_COLLATION_HANDLER my_collation_ci_handler =
+/*
+ cp932_japanese_ci and cp932_bin sort character blocks in this order:
+ 1. [00..7F] - 7BIT characters (ASCII)
+ 2. [81..9F][40..7E,80..FC] - MB2 characters, part1
+ 3. [A1..DF] - 8BIT characters (Kana)
+ 4. [E0..FC][40..7E,80..FC] - MB2 characters, part2
+*/
+#define MY_FUNCTION_NAME(x) my_ ## x ## _cp932_japanese_ci
+#define WEIGHT_PAD_SPACE (256 * (int) ' ')
+#define WEIGHT_MB1(x) (256 * (int) sort_order_cp932[(uchar) (x)])
+#define WEIGHT_MB2(x,y) (cp932code(x, y))
+#include "strcoll.ic"
+
+
+#define MY_FUNCTION_NAME(x) my_ ## x ## _cp932_bin
+#define WEIGHT_PAD_SPACE (256 * (int) ' ')
+#define WEIGHT_MB1(x) (256 * (int) (uchar) (x))
+#define WEIGHT_MB2(x,y) (cp932code(x, y))
+#include "strcoll.ic"
+
+
+static MY_COLLATION_HANDLER my_collation_handler_cp932_japanese_ci=
{
- NULL, /* init */
- my_strnncoll_cp932,
- my_strnncollsp_cp932,
+ NULL, /* init */
+ my_strnncoll_cp932_japanese_ci,
+ my_strnncollsp_cp932_japanese_ci,
my_strnxfrm_mb,
my_strnxfrmlen_simple,
my_like_range_mb,
- my_wildcmp_mb, /* wildcmp */
+ my_wildcmp_mb,
my_strcasecmp_8bit,
my_instr_mb,
my_hash_sort_simple,
@@ -34736,6 +34674,22 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
};
+static MY_COLLATION_HANDLER my_collation_handler_cp932_bin=
+{
+ NULL, /* init */
+ my_strnncoll_cp932_bin,
+ my_strnncollsp_cp932_bin,
+ my_strnxfrm_mb,
+ my_strnxfrmlen_simple,
+ my_like_range_mb,
+ my_wildcmp_mb_bin,
+ my_strcasecmp_mb_bin,
+ my_instr_mb,
+ my_hash_sort_mb_bin,
+ my_propagate_simple
+};
+
+
static MY_CHARSET_HANDLER my_charset_handler=
{
NULL, /* init */
@@ -34800,7 +34754,7 @@ struct charset_info_st my_charset_cp932_japanese_ci=
1, /* escape_with_backslash_is_dangerous */
1, /* levels_for_order */
&my_charset_handler,
- &my_collation_ci_handler
+ &my_collation_handler_cp932_japanese_ci
};
struct charset_info_st my_charset_cp932_bin=
@@ -34832,7 +34786,7 @@ struct charset_info_st my_charset_cp932_bin=
1, /* escape_with_backslash_is_dangerous */
1, /* levels_for_order */
&my_charset_handler,
- &my_collation_mb_bin_handler
+ &my_collation_handler_cp932_bin
};
#endif
diff --git a/strings/ctype-euc_kr.c b/strings/ctype-euc_kr.c
index a2c95bf77c8..f4d4b445bb2 100644
--- a/strings/ctype-euc_kr.c
+++ b/strings/ctype-euc_kr.c
@@ -201,8 +201,10 @@ static const uchar sort_order_euc_kr[]=
iseuc_kr_tail2(c) || \
iseuc_kr_tail3(c))
+#define euckrcode(c,d) (((uchar)(c) <<8) | (uchar)(d))
#define MY_FUNCTION_NAME(x) my_ ## x ## _euckr
+#define IS_MB1_CHAR(x) ((uchar) (x) < 0x80)
#define IS_MB2_CHAR(x,y) (iseuc_kr_head(x) && iseuc_kr_tail(y))
#define DEFINE_ASIAN_ROUTINES
#include "ctype-mb.ic"
@@ -9938,21 +9940,50 @@ my_mb_wc_euc_kr(CHARSET_INFO *cs __attribute__((unused)),
}
-static MY_COLLATION_HANDLER my_collation_ci_handler =
+#define MY_FUNCTION_NAME(x) my_ ## x ## _euckr_korean_ci
+#define WEIGHT_MB1(x) (sort_order_euc_kr[(uchar) (x)])
+#define WEIGHT_MB2(x,y) (euckrcode(x, y))
+#include "strcoll.ic"
+
+
+#define MY_FUNCTION_NAME(x) my_ ## x ## _euckr_bin
+#define WEIGHT_MB1(x) ((uchar) (x))
+#define WEIGHT_MB2(x,y) (euckrcode(x, y))
+#include "strcoll.ic"
+
+
+static MY_COLLATION_HANDLER my_collation_handler_euckr_korean_ci=
{
- NULL, /* init */
- my_strnncoll_simple, /* strnncoll */
- my_strnncollsp_simple,
- my_strnxfrm_mb, /* strnxfrm */
+ NULL, /* init */
+ my_strnncoll_euckr_korean_ci,
+ my_strnncollsp_euckr_korean_ci,
+ my_strnxfrm_mb,
my_strnxfrmlen_simple,
- my_like_range_mb, /* like_range */
- my_wildcmp_mb, /* wildcmp */
+ my_like_range_mb,
+ my_wildcmp_mb,
my_strcasecmp_mb,
my_instr_mb,
my_hash_sort_simple,
my_propagate_simple
};
+
+static MY_COLLATION_HANDLER my_collation_handler_euckr_bin=
+{
+ NULL, /* init */
+ my_strnncoll_euckr_bin,
+ my_strnncollsp_euckr_bin,
+ my_strnxfrm_mb,
+ my_strnxfrmlen_simple,
+ my_like_range_mb,
+ my_wildcmp_mb_bin,
+ my_strcasecmp_mb_bin,
+ my_instr_mb,
+ my_hash_sort_mb_bin,
+ my_propagate_simple
+};
+
+
static MY_CHARSET_HANDLER my_charset_handler=
{
NULL, /* init */
@@ -10017,7 +10048,7 @@ struct charset_info_st my_charset_euckr_korean_ci=
0, /* escape_with_backslash_is_dangerous */
1, /* levels_for_order */
&my_charset_handler,
- &my_collation_ci_handler
+ &my_collation_handler_euckr_korean_ci
};
@@ -10050,7 +10081,7 @@ struct charset_info_st my_charset_euckr_bin=
0, /* escape_with_backslash_is_dangerous */
1, /* levels_for_order */
&my_charset_handler,
- &my_collation_mb_bin_handler
+ &my_collation_handler_euckr_bin
};
#endif
diff --git a/strings/ctype-gb2312.c b/strings/ctype-gb2312.c
index 129e8edb966..e986584d356 100644
--- a/strings/ctype-gb2312.c
+++ b/strings/ctype-gb2312.c
@@ -163,9 +163,11 @@ static const uchar sort_order_gb2312[]=
#define isgb2312head(c) (0xa1<=(uchar)(c) && (uchar)(c)<=0xf7)
#define isgb2312tail(c) (0xa1<=(uchar)(c) && (uchar)(c)<=0xfe)
+#define gb2312code(c,d) (((uchar)(c) <<8) | (uchar)(d))
#define MY_FUNCTION_NAME(x) my_ ## x ## _gb2312
+#define IS_MB1_CHAR(x) ((uchar) (x) < 0x80)
#define IS_MB2_CHAR(x,y) (isgb2312head(x) && isgb2312tail(y))
#define DEFINE_ASIAN_ROUTINES
#include "ctype-mb.ic"
@@ -6341,11 +6343,23 @@ my_mb_wc_gb2312(CHARSET_INFO *cs __attribute__((unused)),
}
-static MY_COLLATION_HANDLER my_collation_ci_handler =
+#define MY_FUNCTION_NAME(x) my_ ## x ## _gb2312_chinese_ci
+#define WEIGHT_MB1(x) (sort_order_gb2312[(uchar) (x)])
+#define WEIGHT_MB2(x,y) (gb2312code(x, y))
+#include "strcoll.ic"
+
+
+#define MY_FUNCTION_NAME(x) my_ ## x ## _gb2312_bin
+#define WEIGHT_MB1(x) ((uchar) (x))
+#define WEIGHT_MB2(x,y) (gb2312code(x, y))
+#include "strcoll.ic"
+
+
+static MY_COLLATION_HANDLER my_collation_handler_gb2312_chinese_ci=
{
- NULL, /* init */
- my_strnncoll_simple, /* strnncoll */
- my_strnncollsp_simple,
+ NULL, /* init */
+ my_strnncoll_gb2312_chinese_ci,
+ my_strnncollsp_gb2312_chinese_ci,
my_strnxfrm_mb, /* strnxfrm */
my_strnxfrmlen_simple,
my_like_range_mb, /* like_range */
@@ -6356,6 +6370,24 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
my_propagate_simple
};
+
+static MY_COLLATION_HANDLER my_collation_handler_gb2312_bin=
+{
+ NULL, /* init */
+ my_strnncoll_gb2312_bin,
+ my_strnncollsp_gb2312_bin,
+ my_strnxfrm_mb,
+ my_strnxfrmlen_simple,
+ my_like_range_mb,
+ my_wildcmp_mb_bin,
+ my_strcasecmp_mb_bin,
+ my_instr_mb,
+ my_hash_sort_mb_bin,
+ my_propagate_simple
+};
+
+
+
static MY_CHARSET_HANDLER my_charset_handler=
{
NULL, /* init */
@@ -6420,9 +6452,10 @@ struct charset_info_st my_charset_gb2312_chinese_ci=
0, /* escape_with_backslash_is_dangerous */
1, /* levels_for_order */
&my_charset_handler,
- &my_collation_ci_handler
+ &my_collation_handler_gb2312_chinese_ci
};
+
struct charset_info_st my_charset_gb2312_bin=
{
86,0,0, /* number */
@@ -6452,7 +6485,7 @@ struct charset_info_st my_charset_gb2312_bin=
0, /* escape_with_backslash_is_dangerous */
1, /* levels_for_order */
&my_charset_handler,
- &my_collation_mb_bin_handler
+ &my_collation_handler_gb2312_bin
};
#endif
diff --git a/strings/ctype-gbk.c b/strings/ctype-gbk.c
index b3bd1efb6c4..2d4dbaf202a 100644
--- a/strings/ctype-gbk.c
+++ b/strings/ctype-gbk.c
@@ -44,6 +44,7 @@
#define gbktail(e) ((uchar)(e&0xff))
#define MY_FUNCTION_NAME(x) my_ ## x ## _gbk
+#define IS_MB1_CHAR(x) ((uchar) (x) < 0x80)
#define IS_MB2_CHAR(x,y) (isgbkhead(x) && isgbktail(y))
#define DEFINE_ASIAN_ROUTINES
#include "ctype-mb.ic"
@@ -3450,87 +3451,6 @@ static uint16 gbksortorder(uint16 i)
}
-int my_strnncoll_gbk_internal(const uchar **a_res, const uchar **b_res,
- size_t length)
-{
- const uchar *a= *a_res, *b= *b_res;
- uint a_char,b_char;
-
- while (length--)
- {
- if ((length > 0) && isgbkcode(*a,*(a+1)) && isgbkcode(*b, *(b+1)))
- {
- a_char= gbkcode(*a,*(a+1));
- b_char= gbkcode(*b,*(b+1));
- if (a_char != b_char)
- return ((int) gbksortorder((uint16) a_char) -
- (int) gbksortorder((uint16) b_char));
- a+= 2;
- b+= 2;
- length--;
- }
- else if (sort_order_gbk[*a++] != sort_order_gbk[*b++])
- return ((int) sort_order_gbk[a[-1]] -
- (int) sort_order_gbk[b[-1]]);
- }
- *a_res= a;
- *b_res= b;
- return 0;
-}
-
-
-
-int my_strnncoll_gbk(CHARSET_INFO *cs __attribute__((unused)),
- const uchar *a, size_t a_length,
- const uchar *b, size_t b_length,
- my_bool b_is_prefix)
-{
- size_t length= MY_MIN(a_length, b_length);
- int res= my_strnncoll_gbk_internal(&a, &b, length);
- return res ? res : (int) ((b_is_prefix ? length : a_length) - b_length);
-}
-
-
-static int my_strnncollsp_gbk(CHARSET_INFO * cs __attribute__((unused)),
- const uchar *a, size_t a_length,
- const uchar *b, size_t b_length,
- my_bool diff_if_only_endspace_difference)
-{
- size_t length= MY_MIN(a_length, b_length);
- int res= my_strnncoll_gbk_internal(&a, &b, length);
-
-#ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE
- diff_if_only_endspace_difference= 0;
-#endif
-
- if (!res && a_length != b_length)
- {
- const uchar *end;
- int swap= 1;
- if (diff_if_only_endspace_difference)
- res= 1; /* Assume 'a' is bigger */
- /*
- Check the next not space character of the longer key. If it's < ' ',
- then it's smaller than the other key.
- */
- if (a_length < b_length)
- {
- /* put shorter key in a */
- a_length= b_length;
- a= b;
- swap= -1; /* swap sign of result */
- res= -res;
- }
- for (end= a + a_length-length; a < end ; a++)
- {
- if (*a != ' ')
- return (*a < ' ') ? -swap : swap;
- }
- }
- return res;
-}
-
-
static size_t
my_strnxfrm_gbk(CHARSET_INFO *cs,
uchar *dst, size_t dstlen, uint nweights,
@@ -10735,11 +10655,23 @@ my_mb_wc_gbk(CHARSET_INFO *cs __attribute__((unused)),
}
-static MY_COLLATION_HANDLER my_collation_ci_handler =
+#define MY_FUNCTION_NAME(x) my_ ## x ## _gbk_chinese_ci
+#define WEIGHT_MB1(x) (sort_order_gbk[(uchar) (x)])
+#define WEIGHT_MB2(x,y) (gbksortorder(gbkcode(x,y)))
+#include "strcoll.ic"
+
+
+#define MY_FUNCTION_NAME(x) my_ ## x ## _gbk_bin
+#define WEIGHT_MB1(x) ((uchar) (x))
+#define WEIGHT_MB2(x,y) (gbkcode(x,y))
+#include "strcoll.ic"
+
+
+static MY_COLLATION_HANDLER my_collation_handler_gbk_chinese_ci=
{
- NULL, /* init */
- my_strnncoll_gbk,
- my_strnncollsp_gbk,
+ NULL, /* init */
+ my_strnncoll_gbk_chinese_ci,
+ my_strnncollsp_gbk_chinese_ci,
my_strnxfrm_gbk,
my_strnxfrmlen_simple,
my_like_range_mb,
@@ -10750,6 +10682,24 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
my_propagate_simple
};
+
+static MY_COLLATION_HANDLER my_collation_handler_gbk_bin=
+{
+ NULL, /* init */
+ my_strnncoll_gbk_bin,
+ my_strnncollsp_gbk_bin,
+ my_strnxfrm_mb,
+ my_strnxfrmlen_simple,
+ my_like_range_mb,
+ my_wildcmp_mb_bin,
+ my_strcasecmp_mb_bin,
+ my_instr_mb,
+ my_hash_sort_mb_bin,
+ my_propagate_simple
+};
+
+
+
static MY_CHARSET_HANDLER my_charset_handler=
{
NULL, /* init */
@@ -10814,7 +10764,7 @@ struct charset_info_st my_charset_gbk_chinese_ci=
1, /* escape_with_backslash_is_dangerous */
1, /* levels_for_order */
&my_charset_handler,
- &my_collation_ci_handler
+ &my_collation_handler_gbk_chinese_ci
};
struct charset_info_st my_charset_gbk_bin=
@@ -10846,7 +10796,7 @@ struct charset_info_st my_charset_gbk_bin=
1, /* escape_with_backslash_is_dangerous */
1, /* levels_for_order */
&my_charset_handler,
- &my_collation_mb_bin_handler
+ &my_collation_handler_gbk_bin
};
diff --git a/strings/ctype-mb.ic b/strings/ctype-mb.ic
index 55094535d5e..0a9c47090fe 100644
--- a/strings/ctype-mb.ic
+++ b/strings/ctype-mb.ic
@@ -256,3 +256,5 @@ MY_FUNCTION_NAME(well_formed_char_length)(CHARSET_INFO *cs __attribute__((unused
return nchars0 - nchars;
}
#endif /* DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN */
+
+#undef MY_FUNCTION_NAME
diff --git a/strings/ctype-sjis.c b/strings/ctype-sjis.c
index bbf0026cf2b..57e674f47a6 100644
--- a/strings/ctype-sjis.c
+++ b/strings/ctype-sjis.c
@@ -186,6 +186,7 @@ static const uchar sort_order_sjis[]=
#define MY_FUNCTION_NAME(x) my_ ## x ## _sjis
#define IS_8BIT_CHAR(x) issjiskata(x)
+#define IS_MB1_CHAR(x) ((uchar) (x) < 0x80 || issjiskata(x))
#define IS_MB2_CHAR(x,y) (issjishead(x) && issjistail(y))
#define DEFINE_ASIAN_ROUTINES
#include "ctype-mb.ic"
@@ -1088,90 +1089,6 @@ static MY_UNICASE_INFO my_caseinfo_sjis=
};
-static int my_strnncoll_sjis_internal(CHARSET_INFO *cs,
- const uchar **a_res, size_t a_length,
- const uchar **b_res, size_t b_length)
-{
- const uchar *a= *a_res, *b= *b_res;
- const uchar *a_end= a + a_length;
- const uchar *b_end= b + b_length;
- while (a < a_end && b < b_end)
- {
- if (ismbchar_sjis(cs,(char*) a, (char*) a_end) &&
- ismbchar_sjis(cs,(char*) b, (char*) b_end))
- {
- uint a_char= sjiscode(*a, *(a+1));
- uint b_char= sjiscode(*b, *(b+1));
- if (a_char != b_char)
- return (int) a_char - (int) b_char;
- a += 2;
- b += 2;
- } else
- {
- if (sort_order_sjis[(uchar)*a] != sort_order_sjis[(uchar)*b])
- return sort_order_sjis[(uchar)*a] - sort_order_sjis[(uchar)*b];
- a++;
- b++;
- }
- }
- *a_res= a;
- *b_res= b;
- return 0;
-}
-
-
-static int my_strnncoll_sjis(CHARSET_INFO *cs __attribute__((unused)),
- const uchar *a, size_t a_length,
- const uchar *b, size_t b_length,
- my_bool b_is_prefix)
-{
- int res= my_strnncoll_sjis_internal(cs, &a, a_length, &b, b_length);
- if (b_is_prefix && a_length > b_length)
- a_length= b_length;
- return res ? res : (int) (a_length - b_length);
-}
-
-
-static int my_strnncollsp_sjis(CHARSET_INFO *cs __attribute__((unused)),
- const uchar *a, size_t a_length,
- const uchar *b, size_t b_length,
- my_bool diff_if_only_endspace_difference)
-{
- const uchar *a_end= a + a_length, *b_end= b + b_length;
- int res= my_strnncoll_sjis_internal(cs, &a, a_length, &b, b_length);
-
-#ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE
- diff_if_only_endspace_difference= 0;
-#endif
-
- if (!res && (a != a_end || b != b_end))
- {
- int swap= 1;
- if (diff_if_only_endspace_difference)
- res= 1; /* Assume 'a' is bigger */
- /*
- Check the next not space character of the longer key. If it's < ' ',
- then it's smaller than the other key.
- */
- if (a == a_end)
- {
- /* put shorter key in a */
- a_end= b_end;
- a= b;
- swap= -1; /* swap sign of result */
- res= -res;
- }
- for (; a < a_end ; a++)
- {
- if (*a != ' ')
- return (*a < ' ') ? -swap : swap;
- }
- }
- return res;
-}
-
-
-
/* SJIS->Unicode conversion table */
static uint16 sjis_to_unicode[65536]=
{
@@ -34099,15 +34016,36 @@ size_t my_numcells_sjis(CHARSET_INFO *cs __attribute__((unused)),
}
-static MY_COLLATION_HANDLER my_collation_ci_handler =
+/*
+ sjis_japanese_ci and sjis_bin sort character blocks in this order:
+ 1. [00..7F] - 7BIT characters (ASCII)
+ 2. [81..9F][40..7E,80..FC] - MB2 characters, part1
+ 3. [A1..DF] - 8BIT characters (Kana)
+ 4. [E0..FC][40..7E,80..FC] - MB2 characters, part2
+*/
+#define MY_FUNCTION_NAME(x) my_ ## x ## _sjis_japanese_ci
+#define WEIGHT_PAD_SPACE (256 * (int) ' ')
+#define WEIGHT_MB1(x) (256 * (int) sort_order_sjis[(uchar) (x)])
+#define WEIGHT_MB2(x,y) (sjiscode(x, y))
+#include "strcoll.ic"
+
+
+#define MY_FUNCTION_NAME(x) my_ ## x ## _sjis_bin
+#define WEIGHT_PAD_SPACE (256 * (int) ' ')
+#define WEIGHT_MB1(x) (256 * (int) (uchar) (x))
+#define WEIGHT_MB2(x,y) (sjiscode(x, y))
+#include "strcoll.ic"
+
+
+static MY_COLLATION_HANDLER my_collation_handler_sjis_japanese_ci=
{
- NULL, /* init */
- my_strnncoll_sjis,
- my_strnncollsp_sjis,
+ NULL, /* init */
+ my_strnncoll_sjis_japanese_ci,
+ my_strnncollsp_sjis_japanese_ci,
my_strnxfrm_mb,
my_strnxfrmlen_simple,
my_like_range_mb,
- my_wildcmp_mb, /* wildcmp */
+ my_wildcmp_mb,
my_strcasecmp_8bit,
my_instr_mb,
my_hash_sort_simple,
@@ -34115,6 +34053,22 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
};
+static MY_COLLATION_HANDLER my_collation_handler_sjis_bin=
+{
+ NULL, /* init */
+ my_strnncoll_sjis_bin,
+ my_strnncollsp_sjis_bin,
+ my_strnxfrm_mb,
+ my_strnxfrmlen_simple,
+ my_like_range_mb,
+ my_wildcmp_mb_bin,
+ my_strcasecmp_mb_bin,
+ my_instr_mb,
+ my_hash_sort_mb_bin,
+ my_propagate_simple
+};
+
+
static MY_CHARSET_HANDLER my_charset_handler=
{
NULL, /* init */
@@ -34179,7 +34133,7 @@ struct charset_info_st my_charset_sjis_japanese_ci=
1, /* escape_with_backslash_is_dangerous */
1, /* levels_for_order */
&my_charset_handler,
- &my_collation_ci_handler
+ &my_collation_handler_sjis_japanese_ci
};
struct charset_info_st my_charset_sjis_bin=
@@ -34211,7 +34165,7 @@ struct charset_info_st my_charset_sjis_bin=
1, /* escape_with_backslash_is_dangerous */
1, /* levels_for_order */
&my_charset_handler,
- &my_collation_mb_bin_handler
+ &my_collation_handler_sjis_bin
};
#endif
diff --git a/strings/strcoll.ic b/strings/strcoll.ic
new file mode 100644
index 00000000000..f230c4f7411
--- /dev/null
+++ b/strings/strcoll.ic
@@ -0,0 +1,231 @@
+/*
+ Copyright (c) 2015, MariaDB Foundation
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+
+#ifndef MY_FUNCTION_NAME
+#error MY_FUNCTION_NAME is not defined
+#endif
+
+
+/*
+ The weight for automatically padded spaces when comparing strings with
+ the PAD SPACE property.
+ Should normally be equal to the weight of a regular space.
+*/
+#ifndef WEIGHT_PAD_SPACE
+#define WEIGHT_PAD_SPACE (' ')
+#endif
+
+
+/*
+ Weight of an illegal byte, must follow these rules:
+ 1. Must be greater than weight of any normal character in the collation.
+ 2. Two different bad bytes must have different weights and must be
+ compared in their binary order.
+
+ Depends on mbmaxlen of the character set, as well as how the collation
+ sorts various single-byte and multi-byte character blocks.
+
+ The macro below is the default definition, it is suitable for mbmaxlen=2
+ character sets that sort all multi-byte characters after all single-byte
+ characters: big5, euckr, gb2312, gbk.
+
+ All mbmaxlen>2 character sets must provide their own definitions.
+ All collations that have a more complex order (than just MB1 followed by MB2)
+ must also provide their own definitions (see definitions for
+ cp932_japanese_ci and sjis_japanese_ci as examples of a more complex order).
+*/
+#ifndef WEIGHT_ILSEQ
+#define WEIGHT_ILSEQ(x) (0xFF00 + (x))
+#endif
+
+
+/**
+ Scan a valid character, or a bad byte, or an auto-padded space
+ from a string and calculate the weight of the scanned sequence.
+
+ @param [OUT] weight - the weight is returned here
+ @param str - the string
+ @param end - the end of the string
+ @return - the number of bytes scanned
+
+ The including source file must define the following macros:
+ IS_MB1_CHAR(x)
+ IS_MB2_CHAR(x,y)
+ WEIGHT_PAD_SPACE
+ WEIGHT_MB1(x)
+ WEIGHT_MB2(x,y)
+ WEIGHT_ILSEQ(x)
+*/
+static inline uint
+MY_FUNCTION_NAME(scan_weight)(int *weight, const uchar *str, const uchar *end)
+{
+  /* Nothing left to read: report the weight of an implicit padding space */
+  if (str >= end)
+  {
+    *weight= WEIGHT_PAD_SPACE;
+    return 0;
+  }
+
+  /* A well-formed single-byte character */
+  if (IS_MB1_CHAR(str[0]))
+  {
+    *weight= WEIGHT_MB1(str[0]);
+    return 1;
+  }
+
+  /* A well-formed two-byte character: both bytes must be inside the string */
+  if (str + 2 <= end && IS_MB2_CHAR(str[0], str[1]))
+  {
+    *weight= WEIGHT_MB2(str[0], str[1]);
+    return 2;
+  }
+
+  /* A bad byte, or a head byte cut off by the end of the string */
+  *weight= WEIGHT_ILSEQ(str[0]);
+  return 1;
+}
+
+
+/**
+ Compare two strings according to the collation,
+ without handling the PAD SPACE property.
+
+ Note, cs->coll->strnncoll() is usually used to compare identifiers.
+ Perhaps we should eventually (in 10.2?) create a new collation
+ my_charset_utf8_general_ci_no_pad and have only one comparison function
+ in MY_COLLATION_HANDLER.
+
+ @param cs - the character set and collation
+ @param a - the left string
+ @param a_length - the length of the left string
+ @param b - the right string
+ @param b_length - the length of the right string
+ @param b_is_prefix - if the caller wants to check if "b" is a prefix of "a"
+ @return - the comparison result
+*/
+static int
+MY_FUNCTION_NAME(strnncoll)(CHARSET_INFO *cs __attribute__((unused)),
+                            const uchar *a, size_t a_length,
+                            const uchar *b, size_t b_length,
+                            my_bool b_is_prefix)
+{
+  const uchar *a_end= a + a_length;
+  const uchar *b_end= b + b_length;
+  for ( ; ; )
+  {
+    int a_weight, b_weight, res;
+    uint a_wlen= MY_FUNCTION_NAME(scan_weight)(&a_weight, a, a_end);
+    uint b_wlen= MY_FUNCTION_NAME(scan_weight)(&b_weight, b, b_end);
+    /*
+      a_wlen  b_wlen Comment
+      ------  ------ -------
+      0       0      Strings ended simultaneously, "a" and "b" are equal.
+      0       >0     "a" is a prefix of "b", so "a" is smaller: return -1.
+      >0      0      "b" is a prefix of "a": 0 if b_is_prefix, otherwise +1.
+      >0      >0     Two weights were scanned, check weight difference.
+    */
+    if (!a_wlen)
+      return b_wlen ? -1 : 0;       /* Not -b_weight: a weight can be 0 */
+
+    if (!b_wlen)
+      return b_is_prefix ? 0 : 1;   /* Not a_weight: a weight can be 0 */
+
+    if ((res= (a_weight - b_weight)))
+      return res;
+    /*
+      Neither of the strings has ended yet, and the weights are equal.
+    */
+    DBUG_ASSERT(a < a_end);
+    DBUG_ASSERT(b < b_end);
+    a+= a_wlen;
+    b+= b_wlen;
+  }
+  DBUG_ASSERT(0);
+  return 0;
+}
+
+
+/**
+ Compare two strings according to the collation, with PAD SPACE handling.
+
+ @param cs - the character set and collation
+ @param a - the left string
+ @param a_length - the length of the left string
+ @param b - the right string
+ @param b_length - the length of the right string
+ @param diff_if_only_endspace_difference - not used in the code.
+ TODO: this should be eventually removed (in 10.2?)
+ @return - the comparison result
+*/
+
+static int
+MY_FUNCTION_NAME(strnncollsp)(CHARSET_INFO *cs __attribute__((unused)),
+                              const uchar *a, size_t a_length,
+                              const uchar *b, size_t b_length,
+                              my_bool diff_if_only_endspace_difference
+                              __attribute__((unused)))
+{
+  const uchar *a_end= a + a_length;
+  const uchar *b_end= b + b_length;
+
+  while (1)
+  {
+    int wa, wb;
+    uint na= MY_FUNCTION_NAME(scan_weight)(&wa, a, a_end);
+    uint nb= MY_FUNCTION_NAME(scan_weight)(&wb, b, b_end);
+
+    /*
+      Each weight comes from a real character, from a bad or incomplete
+      byte sequence, or from an implicit trailing space (PAD SPACE).
+      Where a weight came from is irrelevant here:
+      two different weights decide the comparison right away.
+    */
+    if (wa != wb)
+      return wa - wb;
+
+    /*
+      Equal weights with nothing scanned on either side: both strings
+      are exhausted (two implicit spaces met), so they are equal.
+    */
+    if (na == 0 && nb == 0)
+    {
+      DBUG_ASSERT(a == a_end);
+      DBUG_ASSERT(b == b_end);
+      return 0;
+    }
+
+    /* At least one string still has data: step over what was scanned */
+    DBUG_ASSERT(a < a_end || b < b_end);
+    a+= na;
+    b+= nb;
+  }
+
+  /* Not reached: the loop above is only left through a return */
+  DBUG_ASSERT(0);
+  return 0;
+}
+
+/*
+ We usually include this file at least two times from the same source file,
+ for the _ci and the _bin collations. Prepare for the second inclusion.
+*/
+#undef MY_FUNCTION_NAME
+#undef WEIGHT_ILSEQ
+#undef WEIGHT_MB1
+#undef WEIGHT_MB2
+#undef WEIGHT_PAD_SPACE
diff --git a/unittest/strings/strings-t.c b/unittest/strings/strings-t.c
index 6baef0417a8..c7824d07047 100644
--- a/unittest/strings/strings-t.c
+++ b/unittest/strings/strings-t.c
@@ -95,11 +95,361 @@ static CHARSET_INFO *charset_list[]=
};
+typedef struct /* One strnncollsp() test case: two strings and the expected result */
+{
+ const char *a; /* The left string; NULL in the entry that terminates a test list */
+ size_t alen; /* The length of the left string, in bytes */
+ const char *b; /* The right string */
+ size_t blen; /* The length of the right string, in bytes */
+ int res; /* The expected result; only its sign is checked (see eqres()) */
+} STRNNCOLL_PARAM;
+
+
+#define CSTR(x) (x),(sizeof(x)-1) /* Two initializers: x and its size without the trailing '\0' */
+
+/*
+ Byte sequence types used in the tests:
+ 8BIT - an 8-bit byte (>= 0x80) which makes a single-byte character
+ MB2 - two bytes that make a valid character
+ H2 - a byte which is a valid MB2 head byte
+ T2 - a byte which is a valid MB2 tail byte
+ ILSEQ - a byte which makes an illegal sequence
+ H2+ILSEQ - a sequence that starts with a valid H2 byte,
+ but not followed by a valid T2 byte.
+
+ Charset H2 T2 8BIT
+ ------- ---------------- --------------- --------
+ big5 [A1..F9] [40..7E,A1..FE]
+ euckr [81..FE] [41..5A,61..7A,81..FE]
+ gb2312 [A1..F7] [A1..FE]
+ gbk [81..FE] [40..7E,80..FE]
+
+ cp932 [81..9F,E0..FC] [40..7E,80..FC] [A1..DF]
+ sjis [81..9F,E0..FC] [40..7E,80..FC] [A1..DF]
+
+
+ Essential byte sequences in various character sets:
+
+ Sequence big5 cp932 euckr gb2312 gbk sjis
+ -------- ---- ----- ----- ------ --- ----
+ 80 ILSEQ ILSEQ ILSEQ ILSEQ ILSEQ ILSEQ
+ 81 ILSEQ H2 H2 ILSEQ H2 H2
+ A1 H2 8BIT H2 H2 H2 8BIT
+ A1A1 MB2 8BIT+8BIT MB2 MB2 MB2 8BIT+8BIT
+ E0E0 MB2 MB2 MB2 MB2 MB2 MB2
+ F9FE MB2 H2+ILSEQ MB2 ILSEQ+T2 MB2 H2+ILSEQ
+*/
+
+
+/*
+ Test cases that are run for every tested collation, i.e.
+ for character sets that have the following byte sequences:
+ 80 - ILSEQ
+ 81 - ILSEQ or H2
+ F9 - ILSEQ or H2
+ A1A1 - MB2 or 8BIT+8BIT
+ E0E0 - MB2
+ Each entry is {a, alen, b, blen, expected result}.
+ The entry with a == NULL terminates the list.
+*/
+STRNNCOLL_PARAM strcoll_mb2_common[]=
+{
+ /*
+ Compare two good sequences.
+ Note, the empty string and a single space are expected to be equal.
+ */
+ {CSTR(""), CSTR(""), 0},
+ {CSTR(""), CSTR(" "), 0},
+ {CSTR(""), CSTR("A"), -1},
+ {CSTR(""), CSTR("a"), -1},
+ {CSTR(""), CSTR("\xA1\xA1"), -1},
+ {CSTR(""), CSTR("\xE0\xE0"), -1},
+
+ {CSTR(" "), CSTR(""), 0},
+ {CSTR(" "), CSTR(" "), 0},
+ {CSTR(" "), CSTR("A"), -1},
+ {CSTR(" "), CSTR("a"), -1},
+ {CSTR(" "), CSTR("\xA1\xA1"), -1},
+ {CSTR(" "), CSTR("\xE0\xE0"), -1},
+
+ {CSTR("a"), CSTR(""), 1},
+ {CSTR("a"), CSTR(" "), 1},
+ {CSTR("a"), CSTR("a"), 0},
+ {CSTR("a"), CSTR("\xA1\xA1"), -1},
+ {CSTR("a"), CSTR("\xE0\xE0"), -1},
+
+ {CSTR("\xA1\xA1"), CSTR("\xA1\xA1"), 0},
+ {CSTR("\xA1\xA1"), CSTR("\xE0\xE0"), -1},
+
+ /*
+ Compare a good character to an illegal or an incomplete sequence.
+ The good character is always on the left side and
+ is expected to be smaller than the bad sequence.
+ */
+ {CSTR(""), CSTR("\x80"), -1},
+ {CSTR(""), CSTR("\x81"), -1},
+ {CSTR(""), CSTR("\xF9"), -1},
+
+ {CSTR(" "), CSTR("\x80"), -1},
+ {CSTR(" "), CSTR("\x81"), -1},
+ {CSTR(" "), CSTR("\xF9"), -1},
+
+ {CSTR("a"), CSTR("\x80"), -1},
+ {CSTR("a"), CSTR("\x81"), -1},
+ {CSTR("a"), CSTR("\xF9"), -1},
+
+ {CSTR("\xA1\xA1"), CSTR("\x80"), -1},
+ {CSTR("\xA1\xA1"), CSTR("\x81"), -1},
+ {CSTR("\xA1\xA1"), CSTR("\xF9"), -1},
+
+ {CSTR("\xE0\xE0"), CSTR("\x80"), -1},
+ {CSTR("\xE0\xE0"), CSTR("\x81"), -1},
+ {CSTR("\xE0\xE0"), CSTR("\xF9"), -1},
+
+ /*
+ Compare two bad/incomplete sequences.
+ In these cases the expected order follows the byte values.
+ */
+ {CSTR("\x80"), CSTR("\x80"), 0},
+ {CSTR("\x80"), CSTR("\x81"), -1},
+ {CSTR("\x80"), CSTR("\xF9"), -1},
+ {CSTR("\x81"), CSTR("\x81"), 0},
+ {CSTR("\x81"), CSTR("\xF9"), -1},
+
+ {NULL, 0, NULL, 0, 0}
+};
+
+
+/*
+ For character sets that have good mb2 characters A1A1 and F9FE
+ (run for the big5, euckr and gbk collations).
+ Each entry is {a, alen, b, blen, expected result}.
+ The entry with a == NULL terminates the list.
+*/
+STRNNCOLL_PARAM strcoll_mb2_A1A1_mb2_F9FE[]=
+{
+ /* Compare two good characters (or the empty string to a good character) */
+ {CSTR(""), CSTR("\xF9\xFE"), -1},
+ {CSTR(" "), CSTR("\xF9\xFE"), -1},
+ {CSTR("a") , CSTR("\xF9\xFE"), -1},
+ {CSTR("\xA1\xA1"), CSTR("\xF9\xFE"), -1},
+ {CSTR("\xF9\xFE"), CSTR("\xF9\xFE"), 0},
+
+ /*
+ Compare a good character to an illegal or an incomplete sequence.
+ The good character is on the left side and is expected to be smaller.
+ */
+ {CSTR(""), CSTR("\xA1"), -1},
+ {CSTR(""), CSTR("\xF9"), -1},
+ {CSTR("a"), CSTR("\xA1"), -1},
+ {CSTR("a"), CSTR("\xF9"), -1},
+
+ {CSTR("\xA1\xA1"), CSTR("\xA1"), -1},
+ {CSTR("\xA1\xA1"), CSTR("\xF9"), -1},
+
+ {CSTR("\xF9\xFE"), CSTR("\x80"), -1},
+ {CSTR("\xF9\xFE"), CSTR("\x81"), -1},
+ {CSTR("\xF9\xFE"), CSTR("\xA1"), -1},
+ {CSTR("\xF9\xFE"), CSTR("\xF9"), -1},
+
+ /* Compare two bad/incomplete sequences */
+ {CSTR("\x80"), CSTR("\xA1"), -1},
+ {CSTR("\x80"), CSTR("\xF9"), -1},
+
+ {NULL, 0, NULL, 0, 0}
+};
+
+
+/*
+ For character sets that have:
+ A1A1 - a good mb2 character
+ F9FE - a bad sequence
+ (run for the gb2312 collations).
+ Each entry is {a, alen, b, blen, expected result}.
+ The entry with a == NULL terminates the list.
+*/
+STRNNCOLL_PARAM strcoll_mb2_A1A1_bad_F9FE[]=
+{
+ /*
+ Compare a good character to an illegal or an incomplete sequence.
+ The good character is on the left side and is expected to be smaller.
+ */
+ {CSTR(""), CSTR("\xF9\xFE"), -1},
+ {CSTR(" "), CSTR("\xF9\xFE"), -1},
+ {CSTR("a") , CSTR("\xF9\xFE"), -1},
+ {CSTR("\xA1\xA1"), CSTR("\xF9\xFE"), -1},
+
+ {CSTR(""), CSTR("\xA1"), -1},
+ {CSTR(""), CSTR("\xF9"), -1},
+ {CSTR("a"), CSTR("\xA1"), -1},
+ {CSTR("a"), CSTR("\xF9"), -1},
+
+ {CSTR("\xA1\xA1"), CSTR("\xA1"), -1},
+ {CSTR("\xA1\xA1"), CSTR("\xF9"), -1},
+
+ /*
+ Compare two bad/incomplete sequences.
+ The two-byte bad sequence F9FE is expected to be greater
+ than any of the single bad bytes, including its own prefix F9.
+ */
+ {CSTR("\xF9\xFE"), CSTR("\x80"), 1},
+ {CSTR("\xF9\xFE"), CSTR("\x81"), 1},
+ {CSTR("\xF9\xFE"), CSTR("\xA1"), 1},
+ {CSTR("\xF9\xFE"), CSTR("\xF9"), 1},
+ {CSTR("\x80"), CSTR("\xA1"), -1},
+ {CSTR("\x80"), CSTR("\xF9"), -1},
+ {CSTR("\xF9\xFE"), CSTR("\xF9\xFE"), 0},
+
+ {NULL, 0, NULL, 0, 0}
+};
+
+
+/*
+ For character sets that have:
+ 80 - ILSEQ or H2
+ 81 - ILSEQ or H2
+ A1 - 8BIT
+ F9 - ILSEQ or H2
+ F9FE - a bad sequence (ILSEQ+XX or H2+ILSEQ)
+ (run for the cp932 and sjis collations).
+ Each entry is {a, alen, b, blen, expected result}.
+ The entry with a == NULL terminates the list.
+*/
+STRNNCOLL_PARAM strcoll_mb1_A1_bad_F9FE[]=
+{
+ /*
+ Compare two good characters.
+ A1 is a single byte character here, so A1A1 is two characters
+ and is expected to be greater than a single A1.
+ */
+ {CSTR(""), CSTR("\xA1"), -1},
+ {CSTR("\xA1\xA1"), CSTR("\xA1"), 1},
+
+ /*
+ Compare a good character to an illegal or an incomplete sequence.
+ NOTE: this group also contains pairs of other kinds:
+ "a" vs A1 compares two good characters (A1 is 8BIT),
+ and F9FE vs 80, 81, F9 compares two bad/incomplete sequences.
+ */
+ {CSTR(""), CSTR("\xF9"), -1},
+ {CSTR(""), CSTR("\xF9\xFE"), -1},
+ {CSTR(" "), CSTR("\xF9\xFE"), -1},
+ {CSTR("a"), CSTR("\xF9\xFE"), -1},
+ {CSTR("a"), CSTR("\xA1"), -1},
+ {CSTR("a"), CSTR("\xF9"), -1},
+
+ {CSTR("\xA1\xA1"), CSTR("\xF9"), -1},
+ {CSTR("\xA1\xA1"), CSTR("\xF9\xFE"), -1},
+
+ {CSTR("\xF9\xFE"), CSTR("\x80"), 1},
+ {CSTR("\xF9\xFE"), CSTR("\x81"), 1},
+ {CSTR("\xF9\xFE"), CSTR("\xA1"), 1},
+ {CSTR("\xF9\xFE"), CSTR("\xF9"), 1},
+
+ {CSTR("\x80"), CSTR("\xA1"), 1},
+
+ /* Compare two bad/incomplete sequences */
+ {CSTR("\x80"), CSTR("\xF9"), -1},
+ {CSTR("\xF9\xFE"), CSTR("\xF9\xFE"), 0},
+
+ {NULL, 0, NULL, 0, 0}
+};
+
+
+/*
+ For character sets (e.g. cp932 and sjis) that have:
+ 8181 - a valid MB2 character
+ A1 - a valid 8BIT character
+ E0E0 - a valid MB2 character
+ and sort in this order:
+ 8181 < A1 < E0E0
+ Each entry is {a, alen, b, blen, expected result}.
+ The entry with a == NULL terminates the list.
+*/
+STRNNCOLL_PARAM strcoll_8181_A1_E0E0[]=
+{
+ {CSTR("\x81\x81"), CSTR("\xA1"), -1},
+ {CSTR("\x81\x81"), CSTR("\xE0\xE0"), -1},
+ {CSTR("\xA1"), CSTR("\xE0\xE0"), -1},
+
+ {NULL, 0, NULL, 0, 0}
+};
+
+
+/*
+  Convert a byte string to its upper case hexadecimal notation,
+  two digits per source byte, e.g. "\xA1\xFE" -> "A1FE".
+
+  The result is always '\0'-terminated, unless dstlen is 0,
+  in which case nothing is written at all. The output is silently
+  truncated on a byte boundary when the buffer is too small: a source
+  byte is converted only while more than 3 bytes are left in dst.
+
+  @param dst     - the destination buffer
+  @param dstlen  - the size of the destination buffer, in bytes
+  @param src     - the source bytes (can contain '\0' bytes)
+  @param srclen  - the number of the source bytes
+*/
+static void
+str2hex(char *dst, size_t dstlen, const char *src, size_t srclen)
+{
+  static const char hex_digits[]= "0123456789ABCDEF";
+  const char *srcend= src + srclen;
+
+  if (!dstlen)
+    return;                 /* No room even for the terminating '\0' */
+
+  /*
+    Track the remaining space as a number rather than comparing
+    "dst + 3" to the buffer end: for a buffer shorter than 3 bytes
+    that pointer would go beyond one-past-the-end of the buffer,
+    which is undefined behaviour.
+  */
+  for (*dst= '\0'; dstlen > 3 && src < srcend; dst+= 2, dstlen-= 2, src++)
+  {
+    unsigned char byte= (unsigned char) src[0];
+    dst[0]= hex_digits[byte >> 4];
+    dst[1]= hex_digits[byte & 0x0F];
+    dst[2]= '\0';
+  }
+}
+
+
+/*
+  Check if two comparison results are semantically equal,
+  i.e. if they have the same sign: both are negative,
+  both are positive, or both are zero.
+
+  @return 1 if the signs are the same, 0 otherwise
+*/
+static int
+eqres(int ares, int bres)
+{
+  /* Reduce each result to -1, 0 or +1, then compare the signs */
+  int asign= (ares > 0) - (ares < 0);
+  int bsign= (bres > 0) - (bres < 0);
+  return asign == bsign;
+}
+
+
+/*
+  Run a list of test cases against cs->coll->strnncollsp().
+
+  Every test case is checked in the direct order (a vs b) first.
+  If the sign of the result is as expected, the reverse order
+  (b vs a) is checked against the negated expected result.
+  A line per test case is reported through diag().
+
+  @param cs     - the collation to test
+  @param param  - the test cases; the entry with a == NULL ends the list
+  @return       - the number of the test cases that failed
+*/
+static int
+strcollsp(CHARSET_INFO *cs, const STRNNCOLL_PARAM *param)
+{
+  const STRNNCOLL_PARAM *p;
+  int failed= 0;
+  diag("%-20s %-10s %-10s %10s %10s", "Collation", "a", "b", "ExpectSign", "Actual");
+  for (p= param; p->a; p++)
+  {
+    char ahex[64], bhex[64];
+    int reverse, direct_ok;
+    int direct= cs->coll->strnncollsp(cs, (uchar *) p->a, p->alen,
+                                      (uchar *) p->b, p->blen, 0);
+    str2hex(ahex, sizeof(ahex), p->a, p->alen);
+    str2hex(bhex, sizeof(bhex), p->b, p->blen);
+    direct_ok= eqres(direct, p->res);
+    diag("%-20s %-10s %-10s %10d %10d%s",
+         cs->name, ahex, bhex, p->res, direct,
+         direct_ok ? "" : " FAILED");
+    if (!direct_ok)
+    {
+      /* The direct order failed, the reverse order is not tested */
+      failed++;
+      continue;
+    }
+
+    /* Swapping the arguments must flip the sign of the result */
+    reverse= cs->coll->strnncollsp(cs, (uchar *) p->b, p->blen,
+                                   (uchar *) p->a, p->alen, 0);
+    if (eqres(reverse, -p->res))
+      continue;
+    diag("Comparison in reverse order failed. Expected %d, got %d",
+         -p->res, reverse);
+    failed++;
+  }
+  return failed;
+}
+
+
+/*
+  Run the strnncollsp() test lists for the _ci and the _bin collations
+  of every character set enabled through the HAVE_CHARSET_xxx macros.
+
+  @return - the total number of the failed test cases
+*/
+static int
+test_strcollsp()
+{
+  int failed= 0;
+#ifdef HAVE_CHARSET_big5
+  {
+    CHARSET_INFO *ci= &my_charset_big5_chinese_ci;
+    CHARSET_INFO *bin= &my_charset_big5_bin;
+    failed+= strcollsp(ci, strcoll_mb2_common);
+    failed+= strcollsp(ci, strcoll_mb2_A1A1_mb2_F9FE);
+    failed+= strcollsp(bin, strcoll_mb2_common);
+    failed+= strcollsp(bin, strcoll_mb2_A1A1_mb2_F9FE);
+  }
+#endif
+#ifdef HAVE_CHARSET_cp932
+  {
+    CHARSET_INFO *ci= &my_charset_cp932_japanese_ci;
+    CHARSET_INFO *bin= &my_charset_cp932_bin;
+    failed+= strcollsp(ci, strcoll_mb2_common);
+    failed+= strcollsp(ci, strcoll_mb1_A1_bad_F9FE);
+    failed+= strcollsp(bin, strcoll_mb2_common);
+    failed+= strcollsp(bin, strcoll_mb1_A1_bad_F9FE);
+    failed+= strcollsp(ci, strcoll_8181_A1_E0E0);
+    failed+= strcollsp(bin, strcoll_8181_A1_E0E0);
+  }
+#endif
+#ifdef HAVE_CHARSET_euckr
+  {
+    CHARSET_INFO *ci= &my_charset_euckr_korean_ci;
+    CHARSET_INFO *bin= &my_charset_euckr_bin;
+    failed+= strcollsp(ci, strcoll_mb2_common);
+    failed+= strcollsp(ci, strcoll_mb2_A1A1_mb2_F9FE);
+    failed+= strcollsp(bin, strcoll_mb2_common);
+    failed+= strcollsp(bin, strcoll_mb2_A1A1_mb2_F9FE);
+  }
+#endif
+#ifdef HAVE_CHARSET_gb2312
+  {
+    CHARSET_INFO *ci= &my_charset_gb2312_chinese_ci;
+    CHARSET_INFO *bin= &my_charset_gb2312_bin;
+    failed+= strcollsp(ci, strcoll_mb2_common);
+    failed+= strcollsp(ci, strcoll_mb2_A1A1_bad_F9FE);
+    failed+= strcollsp(bin, strcoll_mb2_common);
+    failed+= strcollsp(bin, strcoll_mb2_A1A1_bad_F9FE);
+  }
+#endif
+#ifdef HAVE_CHARSET_gbk
+  {
+    CHARSET_INFO *ci= &my_charset_gbk_chinese_ci;
+    CHARSET_INFO *bin= &my_charset_gbk_bin;
+    failed+= strcollsp(ci, strcoll_mb2_common);
+    failed+= strcollsp(ci, strcoll_mb2_A1A1_mb2_F9FE);
+    failed+= strcollsp(bin, strcoll_mb2_common);
+    failed+= strcollsp(bin, strcoll_mb2_A1A1_mb2_F9FE);
+  }
+#endif
+#ifdef HAVE_CHARSET_sjis
+  {
+    CHARSET_INFO *ci= &my_charset_sjis_japanese_ci;
+    CHARSET_INFO *bin= &my_charset_sjis_bin;
+    failed+= strcollsp(ci, strcoll_mb2_common);
+    failed+= strcollsp(bin, strcoll_mb2_common);
+    failed+= strcollsp(ci, strcoll_mb1_A1_bad_F9FE);
+    failed+= strcollsp(bin, strcoll_mb1_A1_bad_F9FE);
+    failed+= strcollsp(ci, strcoll_8181_A1_E0E0);
+    failed+= strcollsp(bin, strcoll_8181_A1_E0E0);
+  }
+#endif
+  return failed;
+}
+
+
int main()
{
size_t i, failed= 0;
- plan(1);
+ plan(2);
diag("Testing my_like_range_xxx() functions");
for (i= 0; i < array_elements(charset_list); i++)
@@ -112,5 +462,10 @@ int main()
}
}
ok(failed == 0, "Testing my_like_range_xxx() functions");
+
+ diag("Testing cs->coll->strnncollsp()");
+ failed= test_strcollsp();
+ ok(failed == 0, "Testing cs->coll->strnncollsp()");
+
return exit_status();
}