diff options
author | unknown <monty@mysql.com> | 2004-07-07 11:29:39 +0300 |
---|---|---|
committer | unknown <monty@mysql.com> | 2004-07-07 11:29:39 +0300 |
commit | 11b8987313aab1eb8d9f829731fa0dcd83bf62f5 (patch) | |
tree | ca8974710746bb69b04a49bc64f938d0b1a8420a /strings | |
parent | c92670d80b3674fa025e373e0498eccbb407f873 (diff) | |
parent | 9890cd9a9eb37083d13370947fa50d64e21e54ff (diff) | |
download | mariadb-git-11b8987313aab1eb8d9f829731fa0dcd83bf62f5.tar.gz |
Merge with 4.1.3-beta
BitKeeper/etc/ignore:
auto-union
BitKeeper/etc/logging_ok:
auto-union
VC++Files/libmysqld/libmysqld.dsp:
Auto merged
VC++Files/sql/mysqld.dsp:
Auto merged
client/mysql.cc:
Auto merged
client/mysqlbinlog.cc:
Auto merged
client/mysqltest.c:
Auto merged
include/config-netware.h:
Auto merged
include/my_base.h:
Auto merged
include/my_global.h:
Auto merged
include/my_sys.h:
Auto merged
include/mysql_com.h:
Auto merged
include/sql_state.h:
Auto merged
innobase/include/row0mysql.h:
Auto merged
innobase/row/row0sel.c:
Auto merged
libmysql/libmysql.c:
Auto merged
libmysqld/lib_sql.cc:
Auto merged
myisam/mi_check.c:
Auto merged
mysql-test/r/bdb.result:
Auto merged
mysql-test/r/connect.result:
Auto merged
mysql-test/r/ctype_ucs.result:
Auto merged
mysql-test/r/derived.result:
Auto merged
mysql-test/r/func_group.result:
Auto merged
mysql-test/r/func_like.result:
Auto merged
mysql-test/r/func_sapdb.result:
Auto merged
mysql-test/r/func_time.result:
Auto merged
mysql-test/r/insert.result:
Auto merged
mysql-test/r/insert_select.result:
Auto merged
mysql-test/r/join_outer.result:
Auto merged
mysql-test/r/key.result:
Auto merged
mysql-test/r/multi_update.result:
Auto merged
mysql-test/r/mysqldump.result:
Auto merged
mysql-test/r/null.result:
Auto merged
mysql-test/r/null_key.result:
Auto merged
mysql-test/r/query_cache.result:
Auto merged
mysql-test/r/rpl_rotate_logs.result:
Auto merged
mysql-test/r/rpl_server_id1.result:
Auto merged
mysql-test/r/rpl_until.result:
Auto merged
mysql-test/r/select.result:
Auto merged
mysql-test/r/show_check.result:
Auto merged
mysql-test/r/subselect.result:
Auto merged
mysql-test/r/system_mysql_db.result:
Auto merged
mysql-test/r/union.result:
Auto merged
mysql-test/r/variables.result:
Auto merged
mysql-test/t/multi_update.test:
Auto merged
mysql-test/t/mysqlbinlog.test:
Auto merged
mysql-test/t/rpl000015.test:
Auto merged
mysql-test/t/subselect.test:
Auto merged
mysql-test/t/variables.test:
Auto merged
mysys/mf_iocache2.c:
Auto merged
mysys/my_bitmap.c:
Auto merged
mysys/my_pthread.c:
Auto merged
netware/Makefile.am:
Auto merged
netware/my_manage.c:
Auto merged
netware/mysql_test_run.c:
Auto merged
netware/BUILD/compile-linux-tools:
Auto merged
netware/BUILD/compile-netware-standard:
Auto merged
netware/BUILD/mwenv:
Auto merged
netware/BUILD/nwbootstrap:
Auto merged
scripts/make_binary_distribution.sh:
Auto merged
scripts/mysql_install_db.sh:
Auto merged
sql/ha_berkeley.cc:
Auto merged
sql/ha_berkeley.h:
Auto merged
sql/ha_heap.h:
Auto merged
sql/item.cc:
Auto merged
sql/item.h:
Auto merged
sql/item_cmpfunc.cc:
Auto merged
sql/item_cmpfunc.h:
Auto merged
sql/item_create.cc:
Auto merged
sql/item_create.h:
Auto merged
sql/item_func.h:
Auto merged
sql/item_subselect.cc:
Auto merged
sql/item_sum.cc:
Auto merged
sql/item_sum.h:
Auto merged
sql/item_timefunc.h:
Auto merged
sql/lex.h:
Auto merged
sql/mysql_priv.h:
Auto merged
sql/net_serv.cc:
Auto merged
sql/protocol.cc:
Auto merged
sql/protocol.h:
Auto merged
sql/records.cc:
Auto merged
sql/repl_failsafe.cc:
Auto merged
sql/set_var.cc:
Auto merged
sql/sql_acl.cc:
Auto merged
sql/sql_acl.h:
Auto merged
sql/sql_base.cc:
Auto merged
sql/sql_cache.cc:
Auto merged
sql/sql_delete.cc:
Auto merged
sql/sql_derived.cc:
Auto merged
sql/sql_load.cc:
Auto merged
sql/sql_show.cc:
Auto merged
sql/sql_string.cc:
Auto merged
sql/sql_update.cc:
Auto merged
sql/structs.h:
Auto merged
sql-common/client.c:
Auto merged
configure.in:
Merge with 4.1
include/mysqld_error.h:
New errors from 4.1
libmysqld/Makefile.am:
Merge with 4.1
myisam/myisamchk.c:
Merge with 4.1
myisam/myisamdef.h:
Merge with 4.1
myisam/sort.c:
Merge with 4.1
mysql-test/r/mysqlbinlog.result:
Merge with 4.1
mysql-test/r/range.result:
Merge with 4.1
mysql-test/r/rpl_flush_log_loop.result:
Merge with 4.1
mysql-test/r/rpl_replicate_do.result:
Merge with 4.1
mysql-test/r/rpl_temporary.result:
Merge with 4.1
mysql-test/r/rpl_user_variables.result:
Merge with 4.1
mysql-test/t/func_time.test:
Merge with 4.1
scripts/mysql_create_system_tables.sh:
Merge with 4.1
scripts/mysql_fix_privilege_tables.sql:
Merge with 4.1
sql/Makefile.am:
Merge with 4.1
sql/filesort.cc:
Merge with 4.1
sql/ha_innodb.cc:
Merge with 4.1
sql/ha_innodb.h:
Merge with 4.1
sql/ha_myisam.cc:
Merge with 4.1
sql/handler.cc:
Merge with 4.1
sql/handler.h:
Merge with 4.1
sql/item_func.cc:
Merge with 4.1
sql/item_timefunc.cc:
Merge with 4.1
sql/log.cc:
Merge with 4.1
sql/log_event.cc:
Merge with 4.1
sql/mysqld.cc:
Merge with 4.1
sql/opt_range.cc:
Merge with 4.1
sql/opt_range.h:
Merge with 4.1
sql/share/czech/errmsg.txt:
Merge with 4.1
Updated english error messages
sql/share/danish/errmsg.txt:
Merge with 4.1
sql/share/dutch/errmsg.txt:
Merge with 4.1
sql/share/english/errmsg.txt:
Merge with 4.1
sql/share/estonian/errmsg.txt:
Merge with 4.1
sql/share/french/errmsg.txt:
Merge with 4.1
sql/share/german/errmsg.txt:
Merge with 4.1
sql/share/greek/errmsg.txt:
Merge with 4.1
sql/share/hungarian/errmsg.txt:
Merge with 4.1
sql/share/italian/errmsg.txt:
Merge with 4.1
sql/share/japanese/errmsg.txt:
Merge with 4.1
sql/share/korean/errmsg.txt:
Merge with 4.1
sql/share/norwegian-ny/errmsg.txt:
Merge with 4.1
sql/share/norwegian/errmsg.txt:
Merge with 4.1
sql/share/polish/errmsg.txt:
Merge with 4.1
sql/share/portuguese/errmsg.txt:
Merge with 4.1
sql/share/romanian/errmsg.txt:
Merge with 4.1
sql/share/russian/errmsg.txt:
Merge with 4.1
sql/share/serbian/errmsg.txt:
Merge with 4.1
sql/share/slovak/errmsg.txt:
Merge with 4.1
sql/share/spanish/errmsg.txt:
Merge with 4.1
sql/share/swedish/errmsg.txt:
Merge with 4.1
sql/share/ukrainian/errmsg.txt:
Merge with 4.1
sql/slave.cc:
Merge with 4.1
sql/sql_class.cc:
Merge with 4.1
sql/sql_class.h:
Merge with 4.1
sql/sql_db.cc:
Merge with 4.1
sql/sql_insert.cc:
Merge with 4.1
sql/sql_lex.cc:
Merge with 4.1
sql/sql_lex.h:
Merge with 4.1
sql/sql_parse.cc:
Merge with 4.1 tree
Changed // comments to /* */
sql/sql_prepare.cc:
Merge with 4.1
sql/sql_select.cc:
Merge with 4.1
sql/sql_table.cc:
Merge with 4.1
sql/sql_yacc.yy:
Merge with 4.1
sql/table.h:
Merge with 4.1
tests/client_test.c:
Merge with 4.1
Diffstat (limited to 'strings')
-rw-r--r-- | strings/ctype-big5.c | 26 | ||||
-rw-r--r-- | strings/ctype-bin.c | 28 | ||||
-rw-r--r-- | strings/ctype-czech.c | 14 | ||||
-rw-r--r-- | strings/ctype-euc_kr.c | 18 | ||||
-rw-r--r-- | strings/ctype-extra.c | 27 | ||||
-rw-r--r-- | strings/ctype-gb2312.c | 22 | ||||
-rw-r--r-- | strings/ctype-gbk.c | 29 | ||||
-rw-r--r-- | strings/ctype-latin1.c | 29 | ||||
-rw-r--r-- | strings/ctype-mb.c | 20 | ||||
-rw-r--r-- | strings/ctype-simple.c | 134 | ||||
-rw-r--r-- | strings/ctype-sjis.c | 27 | ||||
-rw-r--r-- | strings/ctype-tis620.c | 26 | ||||
-rw-r--r-- | strings/ctype-uca.c | 1785 | ||||
-rw-r--r-- | strings/ctype-ucs2.c | 52 | ||||
-rw-r--r-- | strings/ctype-ujis.c | 26 | ||||
-rw-r--r-- | strings/ctype-utf8.c | 56 | ||||
-rw-r--r-- | strings/ctype-win1250ch.c | 21 | ||||
-rw-r--r-- | strings/ctype.c | 55 | ||||
-rw-r--r-- | strings/int2str.c | 139 | ||||
-rw-r--r-- | strings/longlong2str-x86.s | 4 | ||||
-rw-r--r-- | strings/longlong2str.c | 10 | ||||
-rw-r--r-- | strings/my_strtoll10.c | 2 | ||||
-rw-r--r-- | strings/my_vsnprintf.c | 3 |
23 files changed, 2290 insertions, 263 deletions
diff --git a/strings/ctype-big5.c b/strings/ctype-big5.c index 2071759ddae..ff53f61c053 100644 --- a/strings/ctype-big5.c +++ b/strings/ctype-big5.c @@ -251,11 +251,12 @@ static int my_strnncoll_big5_internal(const uchar **a_res, static int my_strnncoll_big5(CHARSET_INFO *cs __attribute__((unused)), const uchar *a, uint a_length, - const uchar *b, uint b_length) + const uchar *b, uint b_length, + my_bool b_is_prefix) { uint length= min(a_length, b_length); int res= my_strnncoll_big5_internal(&a, &b, length); - return res ? res : (int) (a_length - b_length); + return res ? res : (int)((b_is_prefix ? length : a_length) - b_length); } @@ -402,7 +403,7 @@ static my_bool my_like_range_big5(CHARSET_INFO *cs __attribute__((unused)), } if (*ptr == escape && ptr+1 != end) { - ptr++; /* Skipp escape */ + ptr++; /* Skip escape */ *min_str++= *max_str++ = *ptr; continue; } @@ -6269,6 +6270,7 @@ my_mb_wc_big5(CHARSET_INFO *cs __attribute__((unused)), static MY_COLLATION_HANDLER my_collation_big5_chinese_ci_handler = { + NULL, /* init */ my_strnncoll_big5, my_strnncollsp_big5, my_strnxfrm_big5, @@ -6281,6 +6283,7 @@ static MY_COLLATION_HANDLER my_collation_big5_chinese_ci_handler = static MY_CHARSET_HANDLER my_charset_big5_handler= { + NULL, /* init */ ismbchar_big5, mbcharlen_big5, my_numchars_mb, @@ -6297,7 +6300,6 @@ static MY_CHARSET_HANDLER my_charset_big5_handler= my_long10_to_str_8bit, my_longlong10_to_str_8bit, my_fill_8bit, - my_strntol_8bit, my_strntoul_8bit, my_strntoll_8bit, @@ -6313,20 +6315,22 @@ CHARSET_INFO my_charset_big5_chinese_ci= "big5", /* cs name */ "big5_chinese_ci", /* name */ "", /* comment */ + NULL, /* tailoring */ ctype_big5, to_lower_big5, to_upper_big5, sort_order_big5, + NULL, /* contractions */ NULL, /* sort_order_big*/ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - "", - "", + NULL, /* state_map */ + NULL, /* ident_map */ 1, /* strxfrm_multiply */ 1, /* mbminlen */ 2, /* mbmaxlen */ 0, /* min_sort_char */ - 0, /* max_sort_char */ + 255, /* 
max_sort_char */ &my_charset_big5_handler, &my_collation_big5_chinese_ci_handler }; @@ -6339,20 +6343,22 @@ CHARSET_INFO my_charset_big5_bin= "big5", /* cs name */ "big5_bin", /* name */ "", /* comment */ + NULL, /* tailoring */ ctype_big5, to_lower_big5, to_upper_big5, sort_order_big5, + NULL, /* contractions */ NULL, /* sort_order_big*/ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - "", - "", + NULL, /* state_map */ + NULL, /* ident_map */ 1, /* strxfrm_multiply */ 1, /* mbminlen */ 2, /* mbmaxlen */ 0, /* min_sort_char */ - 0, /* max_sort_char */ + 255, /* max_sort_char */ &my_charset_big5_handler, &my_collation_mb_bin_handler }; diff --git a/strings/ctype-bin.c b/strings/ctype-bin.c index 7cac8c7c337..cc83471f264 100644 --- a/strings/ctype-bin.c +++ b/strings/ctype-bin.c @@ -91,10 +91,20 @@ static uchar bin_char_array[] = static int my_strnncoll_binary(CHARSET_INFO * cs __attribute__((unused)), const uchar *s, uint slen, - const uchar *t, uint tlen) + const uchar *t, uint tlen, + my_bool t_is_prefix) { - int cmp= memcmp(s,t,min(slen,tlen)); - return cmp ? cmp : (int) (slen - tlen); + uint len=min(slen,tlen); + int cmp= memcmp(s,t,len); + return cmp ? cmp : (int)((t_is_prefix ? 
len : slen) - tlen); +} + + +static int my_strnncollsp_binary(CHARSET_INFO * cs __attribute__((unused)), + const uchar *s, uint slen, + const uchar *t, uint tlen) +{ + return my_strnncoll_binary(cs,s,slen,t,tlen,0); } @@ -333,8 +343,9 @@ skip: MY_COLLATION_HANDLER my_collation_8bit_bin_handler = { + NULL, /* init */ my_strnncoll_binary, - my_strnncoll_binary, + my_strnncollsp_binary, my_strnxfrm_bin, my_like_range_simple, my_wildcmp_bin, @@ -346,6 +357,7 @@ MY_COLLATION_HANDLER my_collation_8bit_bin_handler = static MY_CHARSET_HANDLER my_charset_handler= { + NULL, /* init */ NULL, /* ismbchar */ my_mbcharlen_8bit, /* mbcharlen */ my_numchars_8bit, @@ -378,15 +390,17 @@ CHARSET_INFO my_charset_bin = "binary", /* cs name */ "binary", /* name */ "", /* comment */ + NULL, /* tailoring */ ctype_bin, /* ctype */ bin_char_array, /* to_lower */ bin_char_array, /* to_upper */ bin_char_array, /* sort_order */ + NULL, /* contractions */ + NULL, /* sort_order_big*/ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - NULL, /* sort_order_big*/ - "", - "", + NULL, /* state_map */ + NULL, /* ident_map */ 1, /* strxfrm_multiply */ 1, /* mbminlen */ 1, /* mbmaxlen */ diff --git a/strings/ctype-czech.c b/strings/ctype-czech.c index 2eb2fac46e9..6f9e9f74d35 100644 --- a/strings/ctype-czech.c +++ b/strings/ctype-czech.c @@ -242,12 +242,16 @@ while (1) \ static int my_strnncoll_czech(CHARSET_INFO *cs __attribute__((unused)), const uchar * s1, uint len1, - const uchar * s2, uint len2) + const uchar * s2, uint len2, + my_bool s2_is_prefix) { int v1, v2; const uchar * p1, * p2, * store1, * store2; int pass1 = 0, pass2 = 0; + if (s2_is_prefix && len1 > len2) + len1=len2; + p1 = s1; p2 = s2; store1 = s1; store2 = s2; @@ -276,7 +280,7 @@ int my_strnncollsp_czech(CHARSET_INFO * cs, { for ( ; slen && s[slen-1] == ' ' ; slen--); for ( ; tlen && t[tlen-1] == ' ' ; tlen--); - return my_strnncoll_czech(cs,s,slen,t,tlen); + return my_strnncoll_czech(cs,s,slen,t,tlen,0); } @@ -572,6 +576,7 @@ static 
MY_UNI_IDX idx_uni_8859_2[]={ static MY_COLLATION_HANDLER my_collation_latin2_czech_ci_handler = { + NULL, /* init */ my_strnncoll_czech, my_strnncollsp_czech, my_strnxfrm_czech, @@ -589,14 +594,17 @@ CHARSET_INFO my_charset_latin2_czech_ci = "latin2", /* cs name */ "latin2_czech_cs", /* name */ "", /* comment */ + NULL, /* tailoring */ ctype_czech, to_lower_czech, to_upper_czech, sort_order_czech, + NULL, /* contractions */ NULL, /* sort_order_big*/ tab_8859_2_uni, /* tab_to_uni */ idx_uni_8859_2, /* tab_from_uni */ - "","", + NULL, /* state_map */ + NULL, /* ident_map */ 4, /* strxfrm_multiply */ 1, /* mbminlen */ 1, /* mbmaxlen */ diff --git a/strings/ctype-euc_kr.c b/strings/ctype-euc_kr.c index 8f955c15a73..fd8659a181c 100644 --- a/strings/ctype-euc_kr.c +++ b/strings/ctype-euc_kr.c @@ -8637,6 +8637,7 @@ my_mb_wc_euc_kr(CHARSET_INFO *cs __attribute__((unused)), static MY_COLLATION_HANDLER my_collation_ci_handler = { + NULL, /* init */ my_strnncoll_simple, /* strnncoll */ my_strnncollsp_simple, my_strnxfrm_simple, /* strnxfrm */ @@ -8649,6 +8650,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler = static MY_CHARSET_HANDLER my_charset_handler= { + NULL, /* init */ ismbchar_euc_kr, mbcharlen_euc_kr, my_numchars_mb, @@ -8681,20 +8683,22 @@ CHARSET_INFO my_charset_euckr_korean_ci= "euckr", /* cs name */ "euckr_korean_ci", /* name */ "", /* comment */ + NULL, /* tailoring */ ctype_euc_kr, to_lower_euc_kr, to_upper_euc_kr, sort_order_euc_kr, + NULL, /* contractions */ NULL, /* sort_order_big*/ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - "", - "", + NULL, /* state_map */ + NULL, /* ident_map */ 1, /* strxfrm_multiply */ 1, /* mbminlen */ 2, /* mbmaxlen */ 0, /* min_sort_char */ - 0, /* max_sort_char */ + 255, /* max_sort_char */ &my_charset_handler, &my_collation_ci_handler }; @@ -8707,20 +8711,22 @@ CHARSET_INFO my_charset_euckr_bin= "euckr", /* cs name */ "euckr_bin", /* name */ "", /* comment */ + NULL, /* tailoring */ ctype_euc_kr, to_lower_euc_kr, 
to_upper_euc_kr, sort_order_euc_kr, + NULL, /* contractions */ NULL, /* sort_order_big*/ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - "", - "", + NULL, /* state_map */ + NULL, /* ident_map */ 1, /* strxfrm_multiply */ 1, /* mbminlen */ 2, /* mbmaxlen */ 0, /* min_sort_char */ - 0, /* max_sort_char */ + 255, /* max_sort_char */ &my_charset_handler, &my_collation_mb_bin_handler }; diff --git a/strings/ctype-extra.c b/strings/ctype-extra.c index 51a9531fbf5..3672dcd0b33 100644 --- a/strings/ctype-extra.c +++ b/strings/ctype-extra.c @@ -24,20 +24,23 @@ CHARSET_INFO compiled_charsets[] = { NullS, /* cs name */ NullS, /* name */ NullS, /* comment */ - NULL, - NULL, - NULL, - NULL, + NULL, /* tailoring */ + NULL, /* ctype */ + NULL, /* to_lower */ + NULL, /* to_upper */ + NULL, /* sort_order */ + NULL, /* contractions */ NULL, /* sort_order_big*/ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - "","", - 0, - 0, - 0, - 0, - 0, - NULL, - NULL + NULL, /* state_map */ + NULL, /* ident_map */ + 0, /* strxfrm_mul */ + 0, /* mbminlen */ + 0, /* mbmaxlen */ + 0, /* min_sort_ord */ + 0, /* max_sort_ord */ + NULL, /* cset handler */ + NULL /* coll handler */ } }; diff --git a/strings/ctype-gb2312.c b/strings/ctype-gb2312.c index b76511fc4f3..b9f61256717 100644 --- a/strings/ctype-gb2312.c +++ b/strings/ctype-gb2312.c @@ -5688,6 +5688,7 @@ my_mb_wc_gb2312(CHARSET_INFO *cs __attribute__((unused)), static MY_COLLATION_HANDLER my_collation_ci_handler = { + NULL, /* init */ my_strnncoll_simple, /* strnncoll */ my_strnncollsp_simple, my_strnxfrm_simple, /* strnxfrm */ @@ -5700,6 +5701,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler = static MY_CHARSET_HANDLER my_charset_handler= { + NULL, /* init */ ismbchar_gb2312, mbcharlen_gb2312, my_numchars_mb, @@ -5732,20 +5734,22 @@ CHARSET_INFO my_charset_gb2312_chinese_ci= "gb2312", /* cs name */ "gb2312_chinese_ci",/* name */ "", /* comment */ + NULL, /* tailoring */ ctype_gb2312, to_lower_gb2312, to_upper_gb2312, 
sort_order_gb2312, + NULL, /* contractions */ + NULL, /* sort_order_big*/ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - NULL, /* sort_order_big*/ - "", - "", + NULL, /* state_map */ + NULL, /* ident_map */ 1, /* strxfrm_multiply */ 1, /* mbminlen */ 2, /* mbmaxlen */ 0, /* min_sort_char */ - 0, /* max_sort_char */ + 255, /* max_sort_char */ &my_charset_handler, &my_collation_ci_handler }; @@ -5757,20 +5761,22 @@ CHARSET_INFO my_charset_gb2312_bin= "gb2312", /* cs name */ "gb2312_bin", /* name */ "", /* comment */ + NULL, /* tailoring */ ctype_gb2312, to_lower_gb2312, to_upper_gb2312, sort_order_gb2312, + NULL, /* contractions */ + NULL, /* sort_order_big*/ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - NULL, /* sort_order_big*/ - "", - "", + NULL, /* state_map */ + NULL, /* ident_map */ 1, /* strxfrm_multiply */ 1, /* mbminlen */ 2, /* mbmaxlen */ 0, /* min_sort_char */ - 0, /* max_sort_char */ + 255, /* max_sort_char */ &my_charset_handler, &my_collation_mb_bin_handler }; diff --git a/strings/ctype-gbk.c b/strings/ctype-gbk.c index cc0f226d01c..2ef75e27d9a 100644 --- a/strings/ctype-gbk.c +++ b/strings/ctype-gbk.c @@ -2614,11 +2614,12 @@ int my_strnncoll_gbk_internal(const uchar **a_res, const uchar **b_res, int my_strnncoll_gbk(CHARSET_INFO *cs __attribute__((unused)), const uchar *a, uint a_length, - const uchar *b, uint b_length) + const uchar *b, uint b_length, + my_bool b_is_prefix) { uint length= min(a_length, b_length); int res= my_strnncoll_gbk_internal(&a, &b, length); - return res ? res : (int) (a_length - b_length); + return res ? res : (int) ((b_is_prefix ? 
length : a_length) - b_length); } @@ -2715,7 +2716,7 @@ static my_bool my_like_range_gbk(CHARSET_INFO *cs __attribute__((unused)), } if (*ptr == escape && ptr+1 != end) { - ptr++; /* Skipp escape */ + ptr++; /* Skip escape */ *min_str++= *max_str++ = *ptr; continue; } @@ -9918,6 +9919,7 @@ my_mb_wc_gbk(CHARSET_INFO *cs __attribute__((unused)), static MY_COLLATION_HANDLER my_collation_ci_handler = { + NULL, /* init */ my_strnncoll_gbk, my_strnncollsp_gbk, my_strnxfrm_gbk, @@ -9930,6 +9932,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler = static MY_CHARSET_HANDLER my_charset_handler= { + NULL, /* init */ ismbchar_gbk, mbcharlen_gbk, my_numchars_mb, @@ -9962,20 +9965,22 @@ CHARSET_INFO my_charset_gbk_chinese_ci= "gbk", /* cs name */ "gbk_chinese_ci", /* name */ "", /* comment */ + NULL, /* tailoring */ ctype_gbk, to_lower_gbk, to_upper_gbk, sort_order_gbk, + NULL, /* contractions */ + NULL, /* sort_order_big*/ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - NULL, /* sort_order_big*/ - "", - "", + NULL, /* state_map */ + NULL, /* ident_map */ 1, /* strxfrm_multiply */ 1, /* mbminlen */ 2, /* mbmaxlen */ 0, /* min_sort_char */ - 0, /* max_sort_char */ + 255, /* max_sort_char */ &my_charset_handler, &my_collation_ci_handler }; @@ -9987,20 +9992,22 @@ CHARSET_INFO my_charset_gbk_bin= "gbk", /* cs name */ "gbk_bin", /* name */ "", /* comment */ + NULL, /* tailoring */ ctype_gbk, to_lower_gbk, to_upper_gbk, sort_order_gbk, + NULL, /* contractions */ + NULL, /* sort_order_big*/ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - NULL, /* sort_order_big*/ - "", - "", + NULL, /* state_map */ + NULL, /* ident_map */ 1, /* strxfrm_multiply */ 1, /* mbminlen */ 2, /* mbmaxlen */ 0, /* min_sort_char */ - 0, /* max_sort_char */ + 255, /* max_sort_char */ &my_charset_handler, &my_collation_mb_bin_handler }; diff --git a/strings/ctype-latin1.c b/strings/ctype-latin1.c index 0b439964c7c..652794fa84d 100644 --- a/strings/ctype-latin1.c +++ b/strings/ctype-latin1.c @@ -380,6 
+380,7 @@ int my_wc_mb_latin1(CHARSET_INFO *cs __attribute__((unused)), static MY_CHARSET_HANDLER my_charset_handler= { + NULL, /* init */ NULL, my_mbcharlen_8bit, my_numchars_8bit, @@ -412,19 +413,22 @@ CHARSET_INFO my_charset_latin1= "latin1", /* cs name */ "latin1_swedish_ci", /* name */ "", /* comment */ + NULL, /* tailoring */ ctype_latin1, to_lower_latin1, to_upper_latin1, sort_order_latin1, + NULL, /* contractions */ NULL, /* sort_order_big*/ cs_to_uni, /* tab_to_uni */ NULL, /* tab_from_uni */ - "","", + NULL, /* state_map */ + NULL, /* ident_map */ 1, /* strxfrm_multiply */ 1, /* mbminlen */ 1, /* mbmaxlen */ 0, /* min_sort_char */ - 0, /* max_sort_char */ + 255, /* max_sort_char */ &my_charset_handler, &my_collation_8bit_simple_ci_handler }; @@ -525,7 +529,8 @@ uchar combo2map[]={ static int my_strnncoll_latin1_de(CHARSET_INFO *cs __attribute__((unused)), const uchar *a, uint a_length, - const uchar *b, uint b_length) + const uchar *b, uint b_length, + my_bool b_is_prefix) { const uchar *a_end= a + a_length; const uchar *b_end= b + b_length; @@ -558,7 +563,7 @@ static int my_strnncoll_latin1_de(CHARSET_INFO *cs __attribute__((unused)), A simple test of string lengths won't work -- we test to see which string ran out first */ - return ((a < a_end || a_extend) ? 1 : + return ((a < a_end || a_extend) ? (b_is_prefix ? 0 : 1) : (b < b_end || b_extend) ? 
-1 : 0); } @@ -672,6 +677,7 @@ void my_hash_sort_latin1_de(CHARSET_INFO *cs __attribute__((unused)), static MY_COLLATION_HANDLER my_collation_german2_ci_handler= { + NULL, /* init */ my_strnncoll_latin1_de, my_strnncollsp_latin1_de, my_strnxfrm_latin1_de, @@ -690,19 +696,22 @@ CHARSET_INFO my_charset_latin1_german2_ci= "latin1", /* cs name */ "latin1_german2_ci", /* name */ "", /* comment */ + NULL, /* tailoring */ ctype_latin1, to_lower_latin1, to_upper_latin1, sort_order_latin1_de, + NULL, /* contractions */ NULL, /* sort_order_big*/ cs_to_uni, /* tab_to_uni */ NULL, /* tab_from_uni */ - "","", + NULL, /* state_map */ + NULL, /* ident_map */ 2, /* strxfrm_multiply */ 1, /* mbminlen */ 1, /* mbmaxlen */ 0, /* min_sort_char */ - 0, /* max_sort_char */ + 247, /* max_sort_char */ &my_charset_handler, &my_collation_german2_ci_handler }; @@ -715,20 +724,22 @@ CHARSET_INFO my_charset_latin1_bin= "latin1", /* cs name */ "latin1_bin", /* name */ "", /* comment */ + NULL, /* tailoring */ ctype_latin1, to_lower_latin1, to_upper_latin1, sort_order_latin1_de, + NULL, /* contractions */ NULL, /* sort_order_big*/ cs_to_uni, /* tab_to_uni */ NULL, /* tab_from_uni */ - "", - "", + NULL, /* state_map */ + NULL, /* ident_map */ 1, /* strxfrm_multiply */ 1, /* mbminlen */ 1, /* mbmaxlen */ 0, /* min_sort_char */ - 0, /* max_sort_char */ + 255, /* max_sort_char */ &my_charset_handler, &my_collation_8bit_bin_handler }; diff --git a/strings/ctype-mb.c b/strings/ctype-mb.c index 9b02cd3b3da..7b0dadcfa19 100644 --- a/strings/ctype-mb.c +++ b/strings/ctype-mb.c @@ -322,7 +322,7 @@ uint my_instr_mb(CHARSET_INFO *cs, int mblen; if (!cs->coll->strnncoll(cs, (unsigned char*) b, s_length, - (unsigned char*) s, s_length)) + (unsigned char*) s, s_length, 0)) { if (nmatch) { @@ -352,10 +352,19 @@ uint my_instr_mb(CHARSET_INFO *cs, static int my_strnncoll_mb_bin(CHARSET_INFO * cs __attribute__((unused)), const uchar *s, uint slen, - const uchar *t, uint tlen) + const uchar *t, uint tlen, + my_bool 
t_is_prefix) { - int cmp= memcmp(s,t,min(slen,tlen)); - return cmp ? cmp : (int) (slen - tlen); + uint len=min(slen,tlen); + int cmp= memcmp(s,t,len); + return cmp ? cmp : (int) ((t_is_prefix ? len : slen) - tlen); +} + +static int my_strnncollsp_mb_bin(CHARSET_INFO * cs __attribute__((unused)), + const uchar *s, uint slen, + const uchar *t, uint tlen) +{ + return my_strnncoll_mb_bin(cs,s,slen,t,tlen,0); } @@ -512,8 +521,9 @@ static int my_wildcmp_mb_bin(CHARSET_INFO *cs, MY_COLLATION_HANDLER my_collation_mb_bin_handler = { + NULL, /* init */ my_strnncoll_mb_bin, - my_strnncoll_mb_bin, + my_strnncollsp_mb_bin, my_strnxfrm_mb_bin, my_like_range_simple, my_wildcmp_mb_bin, diff --git a/strings/ctype-simple.c b/strings/ctype-simple.c index ba1fc1c424a..8e295b9e13e 100644 --- a/strings/ctype-simple.c +++ b/strings/ctype-simple.c @@ -20,7 +20,6 @@ #include <errno.h> #include "stdarg.h" -#include "assert.h" int my_strnxfrm_simple(CHARSET_INFO * cs, @@ -47,16 +46,19 @@ int my_strnxfrm_simple(CHARSET_INFO * cs, } int my_strnncoll_simple(CHARSET_INFO * cs, const uchar *s, uint slen, - const uchar *t, uint tlen) + const uchar *t, uint tlen, + my_bool t_is_prefix) { int len = ( slen > tlen ) ? 
tlen : slen; uchar *map= cs->sort_order; + if (t_is_prefix && slen > tlen) + slen=tlen; while (len--) { if (map[*s++] != map[*t++]) return ((int) map[s[-1]] - (int) map[t[-1]]); } - return (int) (slen-tlen); + return (int) (slen - tlen); } @@ -1143,8 +1145,133 @@ skip: } +typedef struct +{ + int nchars; + MY_UNI_IDX uidx; +} uni_idx; + +#define PLANE_SIZE 0x100 +#define PLANE_NUM 0x100 +#define PLANE_NUMBER(x) (((x)>>8) % PLANE_NUM) + +static int pcmp(const void * f, const void * s) +{ + const uni_idx *F= (const uni_idx*) f; + const uni_idx *S= (const uni_idx*) s; + int res; + + if (!(res=((S->nchars)-(F->nchars)))) + res=((F->uidx.from)-(S->uidx.to)); + return res; +} + +static my_bool create_fromuni(CHARSET_INFO *cs, void *(*alloc)(uint)) +{ + uni_idx idx[PLANE_NUM]; + int i,n; + + /* Clear plane statistics */ + bzero(idx,sizeof(idx)); + + /* Count number of characters in each plane */ + for (i=0; i< 0x100; i++) + { + uint16 wc=cs->tab_to_uni[i]; + int pl= PLANE_NUMBER(wc); + + if (wc || !i) + { + if (!idx[pl].nchars) + { + idx[pl].uidx.from=wc; + idx[pl].uidx.to=wc; + }else + { + idx[pl].uidx.from=wc<idx[pl].uidx.from?wc:idx[pl].uidx.from; + idx[pl].uidx.to=wc>idx[pl].uidx.to?wc:idx[pl].uidx.to; + } + idx[pl].nchars++; + } + } + + /* Sort planes in descending order */ + qsort(&idx,PLANE_NUM,sizeof(uni_idx),&pcmp); + + for (i=0; i < PLANE_NUM; i++) + { + int ch,numchars; + + /* Skip empty plane */ + if (!idx[i].nchars) + break; + + numchars=idx[i].uidx.to-idx[i].uidx.from+1; + if (!(idx[i].uidx.tab=(uchar*) alloc(numchars * sizeof(*idx[i].uidx.tab)))) + return TRUE; + + bzero(idx[i].uidx.tab,numchars*sizeof(*idx[i].uidx.tab)); + + for (ch=1; ch < PLANE_SIZE; ch++) + { + uint16 wc=cs->tab_to_uni[ch]; + if (wc >= idx[i].uidx.from && wc <= idx[i].uidx.to && wc) + { + int ofs= wc - idx[i].uidx.from; + idx[i].uidx.tab[ofs]= ch; + } + } + } + + /* Allocate and fill reverse table for each plane */ + n=i; + if (!(cs->tab_from_uni= (MY_UNI_IDX*) 
alloc(sizeof(MY_UNI_IDX)*(n+1)))) + return TRUE; + + for (i=0; i< n; i++) + cs->tab_from_uni[i]= idx[i].uidx; + + /* Set end-of-list marker */ + bzero(&cs->tab_from_uni[i],sizeof(MY_UNI_IDX)); + return FALSE; +} + +static my_bool my_cset_init_8bit(CHARSET_INFO *cs, void *(*alloc)(uint)) +{ + return create_fromuni(cs, alloc); +} + +static void set_max_sort_char(CHARSET_INFO *cs) +{ + uchar max_char; + uint i; + + if (!cs->sort_order) + return; + + max_char=cs->sort_order[(uchar) cs->max_sort_char]; + for (i= 0; i < 256; i++) + { + if ((uchar) cs->sort_order[i] > max_char) + { + max_char=(uchar) cs->sort_order[i]; + cs->max_sort_char= i; + } + } +} + +static my_bool my_coll_init_simple(CHARSET_INFO *cs, + void *(*alloc)(uint) __attribute__((unused))) +{ + set_max_sort_char(cs); + return FALSE; +} + + + MY_CHARSET_HANDLER my_charset_8bit_handler= { + my_cset_init_8bit, NULL, /* ismbchar */ my_mbcharlen_8bit, /* mbcharlen */ my_numchars_8bit, @@ -1171,6 +1298,7 @@ MY_CHARSET_HANDLER my_charset_8bit_handler= MY_COLLATION_HANDLER my_collation_8bit_simple_ci_handler = { + my_coll_init_simple, /* init */ my_strnncoll_simple, my_strnncollsp_simple, my_strnxfrm_simple, diff --git a/strings/ctype-sjis.c b/strings/ctype-sjis.c index 72666175a1f..5fd005f842e 100644 --- a/strings/ctype-sjis.c +++ b/strings/ctype-sjis.c @@ -232,9 +232,12 @@ static int my_strnncoll_sjis_internal(CHARSET_INFO *cs, static int my_strnncoll_sjis(CHARSET_INFO *cs __attribute__((unused)), const uchar *a, uint a_length, - const uchar *b, uint b_length) + const uchar *b, uint b_length, + my_bool b_is_prefix) { int res= my_strnncoll_sjis_internal(cs, &a, a_length, &b, b_length); + if (b_is_prefix && a_length > b_length) + a_length= b_length; return res ? 
res : (int) (a_length - b_length); } @@ -4534,6 +4537,7 @@ my_mb_wc_sjis(CHARSET_INFO *cs __attribute__((unused)), static MY_COLLATION_HANDLER my_collation_ci_handler = { + NULL, /* init */ my_strnncoll_sjis, my_strnncollsp_sjis, my_strnxfrm_sjis, @@ -4547,6 +4551,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler = static MY_CHARSET_HANDLER my_charset_handler= { + NULL, /* init */ ismbchar_sjis, mbcharlen_sjis, my_numchars_mb, @@ -4579,20 +4584,22 @@ CHARSET_INFO my_charset_sjis_japanese_ci= "sjis", /* cs name */ "sjis_japanese_ci", /* name */ "", /* comment */ + NULL, /* tailoring */ ctype_sjis, to_lower_sjis, to_upper_sjis, sort_order_sjis, + NULL, /* contractions */ + NULL, /* sort_order_big*/ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - NULL, /* sort_order_big*/ - "", - "", + NULL, /* state_map */ + NULL, /* ident_map */ 1, /* strxfrm_multiply */ 1, /* mbminlen */ 2, /* mbmaxlen */ 0, /* min_sort_char */ - 0, /* max_sort_char */ + 255, /* max_sort_char */ &my_charset_handler, &my_collation_ci_handler }; @@ -4604,20 +4611,22 @@ CHARSET_INFO my_charset_sjis_bin= "sjis", /* cs name */ "sjis_bin", /* name */ "", /* comment */ + NULL, /* tailoring */ ctype_sjis, to_lower_sjis, to_upper_sjis, sort_order_sjis, + NULL, /* contractions */ + NULL, /* sort_order_big*/ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - NULL, /* sort_order_big*/ - "", - "", + NULL, /* state_map */ + NULL, /* ident_map */ 1, /* strxfrm_multiply */ 1, /* mbminlen */ 2, /* mbmaxlen */ 0, /* min_sort_char */ - 0, /* max_sort_char */ + 255, /* max_sort_char */ &my_charset_handler, &my_collation_mb_bin_handler }; diff --git a/strings/ctype-tis620.c b/strings/ctype-tis620.c index 1b6b1edc8b9..c7d859a6ead 100644 --- a/strings/ctype-tis620.c +++ b/strings/ctype-tis620.c @@ -529,12 +529,16 @@ static uint thai2sortable(uchar *tstr, uint len) static int my_strnncoll_tis620(CHARSET_INFO *cs __attribute__((unused)), const uchar * s1, uint len1, - const uchar * s2, uint len2) + const uchar * 
s2, uint len2, + my_bool s2_is_prefix) { uchar buf[80] ; uchar *tc1, *tc2; int i; + if (s2_is_prefix && len1 > len2) + len1= len2; + tc1= buf; if ((len1 + len2 +2) > (int) sizeof(buf)) tc1= (uchar*) malloc(len1+len2); @@ -671,7 +675,7 @@ my_bool my_like_range_tis620(CHARSET_INFO *cs __attribute__((unused)), { if (*ptr == escape && ptr+1 != end) { - ptr++; /* Skipp escape */ + ptr++; /* Skip escape */ *min_str++ = *max_str++ = *ptr; continue; } @@ -906,6 +910,7 @@ int my_wc_mb_tis620(CHARSET_INFO *cs __attribute__((unused)), static MY_COLLATION_HANDLER my_collation_ci_handler = { + NULL, /* init */ my_strnncoll_tis620, my_strnncollsp_tis620, my_strnxfrm_tis620, @@ -918,6 +923,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler = static MY_CHARSET_HANDLER my_charset_handler= { + NULL, /* init */ NULL, /* ismbchar */ my_mbcharlen_8bit, /* mbcharlen */ my_numchars_8bit, @@ -951,15 +957,17 @@ CHARSET_INFO my_charset_tis620_thai_ci= "tis620", /* cs name */ "tis620_thai_ci", /* name */ "", /* comment */ + NULL, /* tailoring */ ctype_tis620, to_lower_tis620, to_upper_tis620, sort_order_tis620, + NULL, /* contractions */ + NULL, /* sort_order_big*/ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - NULL, /* sort_order_big*/ - "", - "", + NULL, /* state_map */ + NULL, /* ident_map */ 4, /* strxfrm_multiply */ 1, /* mbminlen */ 1, /* mbmaxlen */ @@ -976,15 +984,17 @@ CHARSET_INFO my_charset_tis620_bin= "tis620", /* cs name */ "tis620_bin", /* name */ "", /* comment */ + NULL, /* tailoring */ ctype_tis620, to_lower_tis620, to_upper_tis620, sort_order_tis620, + NULL, /* contractions */ + NULL, /* sort_order_big*/ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - NULL, /* sort_order_big*/ - "", - "", + NULL, /* state_map */ + NULL, /* ident_map */ 1, /* strxfrm_multiply */ 1, /* mbminlen */ 1, /* mbmaxlen */ diff --git a/strings/ctype-uca.c b/strings/ctype-uca.c index 81073d47554..5bb710946b1 100644 --- a/strings/ctype-uca.c +++ b/strings/ctype-uca.c @@ -19,14 +19,15 @@ UCA 
(Unicode Collation Algorithm) support. Written by Alexander Barkov <bar@mysql.com> - Currently supports only subset of the full UCA. - + Currently supports only subset of the full UCA: - Only Primary level key comparison + - Basic Latin letters contraction is implemented - Variable weighting is done for Non-ignorable option + + Features that are not implemented yet: - No Normalization Form D is done + No decomposition is done + No Thai/Lao ordering is done - - No contraction is done - No combining marks processing is done */ @@ -36,8 +37,6 @@ #include "m_ctype.h" -#ifdef HAVE_CHARSET_ucs2 - #define MY_UCA_NPAGES 256 #define MY_UCA_NCHARS 256 #define MY_UCA_CMASK 255 @@ -6521,6 +6520,139 @@ NULL ,page0F9data,page0FAdata,page0FBdata, page0FCdata,page0FDdata,page0FEdata,page0FFdata }; +/* + Some sources treat LETTER A WITH DIAERESIS (00E4,00C4) + secondary greater than LETTER AE (00E6,00C6). + http://www.evertype.com/alphabets/icelandic.pdf + http://developer.mimer.com/collations/charts/icelandic.htm + + Other sources do not provide any special rules + for LETTER A WITH DIAERESIS: + http://www.omniglot.com/writing/icelandic.htm + http://en.wikipedia.org/wiki/Icelandic_alphabet + http://oss.software.ibm.com/icu/charts/collation/is.html + + Let's go the first way. +*/ + +static const char icelandic[]= + "& A < \\u00E1 <<< \\u00C1 " + "& D < \\u00F0 <<< \\u00D0 " + "& E < \\u00E9 <<< \\u00C9 " + "& I < \\u00ED <<< \\u00CD " + "& O < \\u00F3 <<< \\u00D3 " + "& U < \\u00FA <<< \\u00DA " + "& Y < \\u00FD <<< \\u00DD " + "& Z < \\u00FE <<< \\u00DE " + "< \\u00E6 <<< \\u00C6 << \\u00E4 <<< \\u00C4 " + "< \\u00F6 <<< \\u00D6 << \\u00F8 <<< \\u00D8 " + "< \\u00E5 <<< \\u00C5 "; + +/* + Some sources treat I and Y primary different. + Other sources treat I and Y the same on primary level. + We'll go the first way. 
+*/ + +static const char latvian[]= + "& C < \\u010D <<< \\u010C " + "& G < \\u0123 <<< \\u0122 " + "& I < \\u0079 <<< \\u0059 " + "& K < \\u0137 <<< \\u0136 " + "& L < \\u013C <<< \\u013B " + "& N < \\u0146 <<< \\u0145 " + "& R < \\u0157 <<< \\u0156 " + "& S < \\u0161 <<< \\u0160 " + "& Z < \\u017E <<< \\u017D "; + + +static const char romanian[]= + "& A < \\u0103 <<< \\u0102 < \\u00E2 <<< \\u00C2 " + "& I < \\u00EE <<< \\u00CE " + "& S < \\u0219 <<< \\u0218 << \\u015F <<< \\u015E " + "& T < \\u021B <<< \\u021A << \\u0163 <<< \\u0162 "; + +static const char slovenian[]= + "& C < \\u010D <<< \\u010C " + "& S < \\u0161 <<< \\u0160 " + "& Z < \\u017E <<< \\u017D "; + + +static const char polish[]= + "& A < \\u0105 <<< \\u0104 " + "& C < \\u0107 <<< \\u0106 " + "& E < \\u0119 <<< \\u0118 " + "& L < \\u0142 <<< \\u0141 " + "& N < \\u0144 <<< \\u0143 " + "& O < \\u00F3 <<< \\u00D3 " + "& S < \\u015B <<< \\u015A " + "& Z < \\u017A <<< \\u0179 < \\u017C <<< \\u017B"; + +static const char estonian[]= + "& S < \\u0161 <<< \\u0160 " + " < \\u007A <<< \\u005A " + " < \\u017E <<< \\u017D " + "& W < \\u00F5 <<< \\u00D5 " + "< \\u00E4 <<< \\u00C4 " + "< \\u00F6 <<< \\u00D6 " + "< \\u00FC <<< \\u00DC "; + +static const char spanish[]= "& N < \\u00F1 <<< \\u00D1 "; + +/* + Some sources treat V and W as similar on primary level. + We'll treat V and W as different on primary level. 
+*/ + +static const char swedish[]= + "& Y <<\\u00FC <<< \\u00DC " + "& Z < \\u00E5 <<< \\u00C5 " + "< \\u00E4 <<< \\u00C4 << \\u00E6 <<< \\u00C6 " + "< \\u00F6 <<< \\u00D6 << \\u00F8 <<< \\u00D8 "; + +static const char turkish[]= + "& C < \\u00E7 <<< \\u00C7 " + "& G < \\u011F <<< \\u011E " + "& H < \\u0131 <<< \\u0049 " + "& O < \\u00F6 <<< \\u00D6 " + "& S < \\u015F <<< \\u015E " + "& U < \\u00FC <<< \\u00DC "; + + +static const char czech[]= + "& C < \\u010D <<< \\u010C " + "& H < ch <<< Ch <<< CH" + "& R < \\u0159 <<< \\u0158" + "& S < \\u0161 <<< \\u0160" + "& Z < \\u017E <<< \\u017D"; + +static const char danish[]= /* Also good for Norwegian */ + "& Y << \\u00FC <<< \\u00DC << \\u0171 <<< \\u0170" + "& Z < \\u00E6 <<< \\u00C6 << \\u00E4 <<< \\u00C4" + " < \\u00F8 <<< \\u00D8 << \\u00F6 <<< \\u00D6 << \\u0151 <<< \\u0150" + " < \\u00E5 <<< \\u00C5 << aa <<< Aa <<< AA"; + +static const char lithuanian[]= + "& C << ch <<< Ch <<< CH< \\u010D <<< \\u010C" + "& E << \\u0119 <<< \\u0118 << \\u0117 <<< \\u0116" + "& I << y <<< Y" + "& S < \\u0161 <<< \\u0160" + "& Z < \\u017E <<< \\u017D"; + +static const char slovak[]= + "& A < \\u00E4 <<< \\u00C4" + "& C < \\u010D <<< \\u010C" + "& H < ch <<< Ch <<< CH" + "& O < \\u00F4 <<< \\u00D4" + "& S < \\u0161 <<< \\u0160" + "& Z < \\u017E <<< \\u017D"; + +static const char spanish2[]= /* Also good for Asturian and Galician */ + "&C < ch <<< Ch <<< CH" + "&L < ll <<< Ll <<< LL" + "&N < \\u00F1 <<< \\u00D1" + "&R << rr <<< Rr <<< RR"; + /* Unicode Collation Algorithm: @@ -6535,12 +6667,28 @@ typedef struct my_uca_scanner_st const uchar *send; /* End of the input string */ uchar *uca_length; uint16 **uca_weight; + uint16 *contractions; uint16 implicit[2]; int page; int code; + CHARSET_INFO *cs; } my_uca_scanner; +/* + Charset dependent scanner part, to optimize + some character sets. 
+*/ +typedef struct my_uca_scanner_handler_st +{ + void (*init)(my_uca_scanner *scanner, CHARSET_INFO *cs, + const uchar *str, uint length); + int (*next)(my_uca_scanner *scanner); +} my_uca_scanner_handler; + +static uint16 nochar[]= {0}; + +#ifdef HAVE_CHARSET_ucs2 /* Initialize collation weight scanner @@ -6552,15 +6700,15 @@ typedef struct my_uca_scanner_st length Length of the string. NOTES: + Optimized for UCS2 RETURN N/A */ -static uint16 nochar[]= {0}; -static void my_uca_scanner_init(my_uca_scanner *scanner, - CHARSET_INFO *cs __attribute__((unused)), - const uchar *str, uint length) +static void my_uca_scanner_init_ucs2(my_uca_scanner *scanner, + CHARSET_INFO *cs __attribute__((unused)), + const uchar *str, uint length) { /* Note, no need to initialize scanner->wbeg */ scanner->sbeg= str; @@ -6568,6 +6716,7 @@ static void my_uca_scanner_init(my_uca_scanner *scanner, scanner->wbeg= nochar; scanner->uca_length= cs->sort_order; scanner->uca_weight= cs->sort_order_big; + scanner->contractions= cs->contractions; } @@ -6580,6 +6729,8 @@ static void my_uca_scanner_init(my_uca_scanner *scanner, scanner Address of a previously initialized scanner structure NOTES: + Optimized for UCS2 + Checks if the current character's weight string has been fully scanned, if no, then returns the next weight for this character, else scans the next character and returns its first weight. 
@@ -6610,7 +6761,7 @@ static void my_uca_scanner_init(my_uca_scanner *scanner, Or -1 on error (END-OF-STRING or ILLEGAL MULTIBYTE SEQUENCE) */ -static int my_uca_scanner_next(my_uca_scanner *scanner) +static int my_uca_scanner_next_ucs2(my_uca_scanner *scanner) { /* @@ -6634,6 +6785,22 @@ static int my_uca_scanner_next(my_uca_scanner *scanner) scanner->code= (unsigned char)scanner->sbeg[1]; scanner->sbeg+= 2; + if (scanner->contractions && (scanner->sbeg <= scanner->send)) + { + int cweight; + + if (!scanner->page && !scanner->sbeg[0] && + (scanner->sbeg[1] > 0x40) && (scanner->sbeg[1] < 0x80) && + (scanner->code > 0x40) && (scanner->code < 0x80) && + (cweight= scanner->contractions[(scanner->code-0x40)*0x40+scanner->sbeg[1]-0x40])) + { + scanner->implicit[0]= 0; + scanner->wbeg= scanner->implicit; + scanner->sbeg+=2; + return cweight; + } + } + if (!ucaw[scanner->page]) goto implicit; scanner->wbeg= ucaw[scanner->page] + scanner->code * ucal[scanner->page]; @@ -6660,6 +6827,111 @@ implicit: return scanner->page; } +static my_uca_scanner_handler my_ucs2_uca_scanner_handler= +{ + my_uca_scanner_init_ucs2, + my_uca_scanner_next_ucs2 +}; + +#endif + + +/* + The same two functions for any character set +*/ +static void my_uca_scanner_init_any(my_uca_scanner *scanner, + CHARSET_INFO *cs __attribute__((unused)), + const uchar *str, uint length) +{ + /* Note, no needs to initialize scanner->wbeg */ + scanner->sbeg= str; + scanner->send= str + length; + scanner->wbeg= nochar; + scanner->uca_length= cs->sort_order; + scanner->uca_weight= cs->sort_order_big; + scanner->contractions= cs->contractions; + scanner->cs= cs; +} + +static int my_uca_scanner_next_any(my_uca_scanner *scanner) +{ + + /* + Check if the weights for the previous character have been + already fully scanned. If yes, then get the next character and + initialize wbeg and wlength to its weight string. 
+ */ + + if (scanner->wbeg[0]) + return *scanner->wbeg++; + + do + { + uint16 **ucaw= scanner->uca_weight; + uchar *ucal= scanner->uca_length; + my_wc_t wc; + int mblen; + + if (((mblen= scanner->cs->cset->mb_wc(scanner->cs, &wc, + scanner->sbeg, scanner->send)) < 0)) + return -1; + + scanner->page= wc >> 8; + scanner->code= wc & 0xFF; + scanner->sbeg+= mblen; + + if (scanner->contractions && !scanner->page && + (scanner->code > 0x40) && (scanner->code < 0x80)) + { + uint page1, code1, cweight; + + if (((mblen= scanner->cs->cset->mb_wc(scanner->cs, &wc, + scanner->sbeg, + scanner->send)) >=0) && + (!(page1= (wc >> 8))) && + ((code1= (wc & 0xFF)) > 0x40) && + (code1 < 0x80) && + (cweight= scanner->contractions[(scanner->code-0x40)*0x40 + code1-0x40])) + { + scanner->implicit[0]= 0; + scanner->wbeg= scanner->implicit; + scanner->sbeg+= mblen; + return cweight; + } + } + + if (!ucaw[scanner->page]) + goto implicit; + scanner->wbeg= ucaw[scanner->page] + scanner->code * ucal[scanner->page]; + } while (!scanner->wbeg[0]); + + return *scanner->wbeg++; + +implicit: + + scanner->code= (scanner->page << 8) + scanner->code; + scanner->implicit[0]= (scanner->code & 0x7FFF) | 0x8000; + scanner->implicit[1]= 0; + scanner->wbeg= scanner->implicit; + + scanner->page= scanner->page >> 7; + + if (scanner->code >= 0x3400 && scanner->code <= 0x4DB5) + scanner->page+= 0xFB80; + else if (scanner->code >= 0x4E00 && scanner->code <= 0x9FA5) + scanner->page+= 0xFB40; + else + scanner->page+= 0xFBC0; + + return scanner->page; +} + + +static my_uca_scanner_handler my_any_uca_scanner_handler= +{ + my_uca_scanner_init_any, + my_uca_scanner_next_any +}; /* Compares two strings according to the collation @@ -6703,28 +6975,28 @@ implicit: */ static int my_strnncoll_uca(CHARSET_INFO *cs, + my_uca_scanner_handler *scanner_handler, const uchar *s, uint slen, - const uchar *t, uint tlen) + const uchar *t, uint tlen, + my_bool t_is_prefix) { my_uca_scanner sscanner; my_uca_scanner tscanner; int 
s_res; int t_res; - my_uca_scanner_init(&sscanner, cs, s, slen); - my_uca_scanner_init(&tscanner, cs, t, tlen); + scanner_handler->init(&sscanner, cs, s, slen); + scanner_handler->init(&tscanner, cs, t, tlen); do { - s_res= my_uca_scanner_next(&sscanner); - t_res= my_uca_scanner_next(&tscanner); + s_res= scanner_handler->next(&sscanner); + t_res= scanner_handler->next(&tscanner); } while ( s_res == t_res && s_res >0); - return ( s_res - t_res ); + return (t_is_prefix && t_res < 0) ? 0 : (s_res - t_res); } - - /* Compares two strings according to the collation, ignoring trailing spaces. @@ -6749,8 +7021,9 @@ static int my_strnncoll_uca(CHARSET_INFO *cs, */ static int my_strnncollsp_uca(CHARSET_INFO *cs, - const uchar *s, uint slen, - const uchar *t, uint tlen) + my_uca_scanner_handler *scanner_handler, + const uchar *s, uint slen, + const uchar *t, uint tlen) { my_uca_scanner sscanner; my_uca_scanner tscanner; @@ -6760,19 +7033,18 @@ static int my_strnncollsp_uca(CHARSET_INFO *cs, slen= cs->cset->lengthsp(cs, (char*) s, slen); tlen= cs->cset->lengthsp(cs, (char*) t, tlen); - my_uca_scanner_init(&sscanner, cs, s, slen); - my_uca_scanner_init(&tscanner, cs, t, tlen); + scanner_handler->init(&sscanner, cs, s, slen); + scanner_handler->init(&tscanner, cs, t, tlen); do { - s_res= my_uca_scanner_next(&sscanner); - t_res= my_uca_scanner_next(&tscanner); + s_res= scanner_handler->next(&sscanner); + t_res= scanner_handler->next(&tscanner); } while ( s_res == t_res && s_res >0); return ( s_res - t_res ); } - /* Calculates hash value for the given string, according to the collation, and ignoring trailing spaces. 
@@ -6797,6 +7069,7 @@ static int my_strnncollsp_uca(CHARSET_INFO *cs, */ static void my_hash_sort_uca(CHARSET_INFO *cs, + my_uca_scanner_handler *scanner_handler, const uchar *s, uint slen, ulong *n1, ulong *n2) { @@ -6804,9 +7077,9 @@ static void my_hash_sort_uca(CHARSET_INFO *cs, my_uca_scanner scanner; slen= cs->cset->lengthsp(cs, (char*) s, slen); - my_uca_scanner_init(&scanner, cs, s, slen); + scanner_handler->init(&scanner, cs, s, slen); - while ((s_res= my_uca_scanner_next(&scanner)) >0) + while ((s_res= scanner_handler->next(&scanner)) >0) { n1[0]^= (((n1[0] & 63)+n2[0])*(s_res >> 8))+ (n1[0] << 8); n2[0]+=3; @@ -6848,24 +7121,28 @@ static void my_hash_sort_uca(CHARSET_INFO *cs, */ static int my_strnxfrm_uca(CHARSET_INFO *cs, - uchar *dst, uint dstlen, - const uchar *src, uint srclen) + my_uca_scanner_handler *scanner_handler, + uchar *dst, uint dstlen, + const uchar *src, uint srclen) { uchar *de = dst + dstlen; const uchar *dst_orig = dst; int s_res; my_uca_scanner scanner; - my_uca_scanner_init(&scanner, cs, src, srclen); + scanner_handler->init(&scanner, cs, src, srclen); - while (dst < de && (s_res= my_uca_scanner_next(&scanner)) >0) + while (dst < de && (s_res= scanner_handler->next(&scanner)) >0) { dst[0]= s_res >> 8; dst[1]= s_res & 0xFF; dst+= 2; } + for ( ; dst < de; *dst++='\0'); return dst - dst_orig; } + + /* This function compares if two characters are the same. The sign +1 or -1 does not matter. The only @@ -7036,34 +7313,660 @@ int my_wildcmp_uca(CHARSET_INFO *cs, } +/* + Collation language is implemented according to + subset of ICU Collation Customization (tailorings): + http://oss.software.ibm.com/icu/userguide/Collate_Customization.html + + Collation language elements: + Delimiters: + space - skipped + + <char> := A-Z | a-z | \uXXXX + + Shift command: + <shift> := & - reset at this letter. + + Diff command: + <d1> := < - Identifies a primary difference. + <d2> := << - Identifies a secondary difference. 
+ <d3> := <<< - Idenfifies a tertiary difference. + + + Collation rules: + <ruleset> := <rule> { <ruleset> } + + <rule> := <d1> <string> + | <d2> <string> + | <d3> <string> + | <shift> <char> + + <string> := <char> [ <string> ] + + An example, Polish collation: + + &A < \u0105 <<< \u0104 + &C < \u0107 <<< \u0106 + &E < \u0119 <<< \u0118 + &L < \u0142 <<< \u0141 + &N < \u0144 <<< \u0143 + &O < \u00F3 <<< \u00D3 + &S < \u015B <<< \u015A + &Z < \u017A <<< \u017B +*/ + + +typedef enum my_coll_lexem_num_en +{ + MY_COLL_LEXEM_EOF = 0, + MY_COLL_LEXEM_DIFF = 1, + MY_COLL_LEXEM_SHIFT = 4, + MY_COLL_LEXEM_CHAR = 5, + MY_COLL_LEXEM_ERROR = 6 +} my_coll_lexem_num; + + +typedef struct my_coll_lexem_st +{ + const char *beg; + const char *end; + const char *prev; + int diff; + int code; +} MY_COLL_LEXEM; + + +/* + Initialize collation rule lexical anilizer + + SYNOPSIS + my_coll_lexem_init + lexem Lex analizer to init + str Const string to parse + strend End of the string + USAGE + + RETURN VALUES + N/A +*/ + +static void my_coll_lexem_init(MY_COLL_LEXEM *lexem, + const char *str, const char *strend) +{ + lexem->beg= str; + lexem->prev= str; + lexem->end= strend; + lexem->diff= 0; + lexem->code= 0; +} + + +/* + Print collation customization expression parse error, with context. 
+ + SYNOPSIS + my_coll_lexem_print_error + lexem Lex analizer to take context from + errstr sting to write error to + errsize errstr size + txt error message + USAGE + + RETURN VALUES + N/A +*/ + +static void my_coll_lexem_print_error(MY_COLL_LEXEM *lexem, + char *errstr, size_t errsize, + const char *txt) +{ + char tail[30]; + size_t len= lexem->end - lexem->prev; + strmake (tail, lexem->prev, min(len, sizeof(tail)-1)); + errstr[errsize-1]= '\0'; + my_snprintf(errstr,errsize-1,"%s at '%s'", txt, tail); +} + + +/* + Convert a hex digit into its numeric value + + SYNOPSIS + ch2x + ch hex digit to convert + USAGE + + RETURN VALUES + an integer value in the range 0..15 + -1 on error +*/ + +static int ch2x(int ch) +{ + if (ch >= '0' && ch <= '9') + return ch - '0'; + + if (ch >= 'a' && ch <= 'f') + return 10 + ch - 'a'; + + if (ch >= 'A' && ch <= 'F') + return 10 + ch - 'A'; + + return -1; +} + + +/* + Collation language lexical parser: + Scans the next lexem. + + SYNOPSIS + my_coll_lexem_next + lexem Lex analizer, previously initialized by + my_coll_lexem_init. + USAGE + Call this function in a loop + + RETURN VALUES + Lexem number: eof, diff, shift, char or error. 
+*/ + +static my_coll_lexem_num my_coll_lexem_next(MY_COLL_LEXEM *lexem) +{ + const char *beg; + my_coll_lexem_num rc; + + for (beg= lexem->beg ; beg < lexem->end ; beg++) + { + if (*beg == ' ' || *beg == '\t' || *beg == '\r' || *beg == '\n') + continue; + + if (*beg == '&') + { + beg++; + rc= MY_COLL_LEXEM_SHIFT; + goto ex; + } + + if (beg[0] == '<') + { + for (beg++, lexem->diff= 1; + (beg < lexem->end) && + (*beg == '<') && (lexem->diff<3); + beg++, lexem->diff++); + rc= MY_COLL_LEXEM_DIFF; + goto ex; + } + + if ((*beg >= 'a' && *beg <= 'z') || (*beg >= 'A' && *beg <= 'Z')) + { + lexem->code= *beg++; + rc= MY_COLL_LEXEM_CHAR; + goto ex; + } + + if ((*beg == '\\') && (beg+2 < lexem->end) && (beg[1] == 'u')) + { + int ch; + + beg+= 2; + lexem->code= 0; + while ((beg < lexem->end) && ((ch= ch2x(beg[0])) >= 0)) + { + lexem->code= (lexem->code << 4) + ch; + beg++; + } + rc= MY_COLL_LEXEM_CHAR; + goto ex; + } + + rc= MY_COLL_LEXEM_ERROR; + goto ex; + } + rc= MY_COLL_LEXEM_EOF; + +ex: + lexem->prev= lexem->beg; + lexem->beg= beg; + return rc; +} + + +/* + Collation rule item +*/ + +typedef struct my_coll_rule_item_st +{ + uint base; /* Base character */ + uint curr[2]; /* Current character */ + int diff[3]; /* Primary, Secondary and Tertiary difference */ +} MY_COLL_RULE; + + +/* + Collation language syntax parser. + Uses lexical parser. + + SYNOPSIS + my_coll_rule_parse + rule Collation rule list to load to. + str A string containin collation language expression. + strend End of the string. + USAGE + + RETURN VALUES + A positive number means the number of rules loaded. + -1 means ERROR, e.g. too many items, syntax error, etc. 
+*/ + +static int my_coll_rule_parse(MY_COLL_RULE *rule, size_t mitems, + const char *str, const char *strend, + char *errstr, size_t errsize) +{ + MY_COLL_LEXEM lexem; + my_coll_lexem_num lexnum; + my_coll_lexem_num prevlexnum= MY_COLL_LEXEM_ERROR; + MY_COLL_RULE item; + int state= 0; + size_t nitems= 0; + + /* Init all variables */ + errstr[0]= '\0'; + bzero(&item, sizeof(item)); + my_coll_lexem_init(&lexem, str, strend); + + while ((lexnum= my_coll_lexem_next(&lexem))) + { + if (lexnum == MY_COLL_LEXEM_ERROR) + { + my_coll_lexem_print_error(&lexem,errstr,errsize-1,"Unknown character"); + return -1; + } + + switch (state) { + case 0: + if (lexnum != MY_COLL_LEXEM_SHIFT) + { + my_coll_lexem_print_error(&lexem,errstr,errsize-1,"& expected"); + return -1; + } + prevlexnum= lexnum; + state= 2; + continue; + + case 1: + if (lexnum != MY_COLL_LEXEM_SHIFT && lexnum != MY_COLL_LEXEM_DIFF) + { + my_coll_lexem_print_error(&lexem,errstr,errsize-1,"& or < expected"); + return -1; + } + prevlexnum= lexnum; + state= 2; + continue; + + case 2: + if (lexnum != MY_COLL_LEXEM_CHAR) + { + my_coll_lexem_print_error(&lexem,errstr,errsize-1,"character expected"); + return -1; + } + + if (prevlexnum == MY_COLL_LEXEM_SHIFT) + { + item.base= lexem.code; + item.diff[0]= 0; + item.diff[1]= 0; + item.diff[2]= 0; + } + else if (prevlexnum == MY_COLL_LEXEM_DIFF) + { + MY_COLL_LEXEM savlex; + savlex= lexem; + item.curr[0]= lexem.code; + if ((lexnum= my_coll_lexem_next(&lexem)) == MY_COLL_LEXEM_CHAR) + { + item.curr[1]= lexem.code; + } + else + { + item.curr[1]= 0; + lexem=savlex; /* Restore previous parser state */ + } + if (lexem.diff == 3) + { + item.diff[2]++; + } + else if (lexem.diff == 2) + { + item.diff[1]++; + item.diff[2]= 0; + } + else if (lexem.diff == 1) + { + item.diff[0]++; + item.diff[1]= 0; + item.diff[2]= 0; + } + if (nitems >= mitems) + { + my_coll_lexem_print_error(&lexem,errstr,errsize-1,"Too many rules"); + return -1; + } + rule[nitems++]= item; + } + else + { + 
my_coll_lexem_print_error(&lexem,errstr,errsize-1,"Should never happen"); + return -1; + } + state= 1; + continue; + } + } + return (size_t) nitems; +} + +#define MY_MAX_COLL_RULE 64 + +/* + This function copies an UCS2 collation from + the default Unicode Collation Algorithm (UCA) + weights applying tailorings, i.e. a set of + alternative weights for some characters. + + The default UCA weights are stored in uca_weight/uca_length. + They consist of 256 pages, 256 character each. + + If a page is not overwritten by tailoring rules, + it is copies as is from UCA as is. + + If a page contains some overwritten characters, it is + allocated. Untouched characters are copied from the + default weights. +*/ + +static my_bool create_tailoring(CHARSET_INFO *cs, void *(*alloc)(uint)) +{ + MY_COLL_RULE rule[MY_MAX_COLL_RULE]; + char errstr[128]; + uchar *newlengths; + uint16 **newweights; + const uchar *deflengths= uca_length; + uint16 **defweights= uca_weight; + int rc, i; + int ncontractions= 0; + + if (!cs->tailoring) + return 1; + + /* Parse ICU Collation Customization expression */ + if ((rc= my_coll_rule_parse(rule, MY_MAX_COLL_RULE, + cs->tailoring, + cs->tailoring + strlen(cs->tailoring), + errstr, sizeof(errstr))) < 0) + { + /* + TODO: add error message reporting. + printf("Error: %d '%s'\n", rc, errstr); + */ + return 1; + } + + if (!(newweights= (uint16**) (*alloc)(256*sizeof(uint16*)))) + return 1; + bzero(newweights, 256*sizeof(uint16*)); + + if (!(newlengths= (uchar*) (*alloc)(256))) + return 1; + + memcpy(newlengths, deflengths, 256); + + /* + Calculate maximum lenghts for the pages + which will be overwritten. 
+ */ + for (i=0; i < rc; i++) + { + if (!rule[i].curr[1]) /* If not a contraction */ + { + uint pageb= (rule[i].base >> 8) & 0xFF; + uint pagec= (rule[i].curr[0] >> 8) & 0xFF; + + if (newlengths[pagec] < deflengths[pageb]) + newlengths[pagec]= deflengths[pageb]; + } + else + ncontractions++; + } + + for (i=0; i < rc; i++) + { + uint pageb= (rule[i].base >> 8) & 0xFF; + uint pagec= (rule[i].curr[0] >> 8) & 0xFF; + uint chb, chc; + + if (rule[i].curr[1]) /* Skip contraction */ + continue; + + if (!newweights[pagec]) + { + /* Alloc new page and copy the default UCA weights */ + uint size= 256*newlengths[pagec]*sizeof(uint16); + + if (!(newweights[pagec]= (uint16*) (*alloc)(size))) + return 1; + bzero((void*) newweights[pagec], size); + + for (chc=0 ; chc < 256; chc++) + { + memcpy(newweights[pagec] + chc*newlengths[pagec], + defweights[pagec] + chc*deflengths[pagec], + deflengths[pagec]*sizeof(uint16)); + } + } + + /* + Aply the alternative rule: + shift to the base character and primary difference. 
+ */ + chc= rule[i].curr[0] & 0xFF; + chb= rule[i].base & 0xFF; + memcpy(newweights[pagec] + chc*newlengths[pagec], + defweights[pageb] + chb*deflengths[pageb], + deflengths[pageb]*sizeof(uint16)); + /* Apply primary difference */ + newweights[pagec][chc*newlengths[pagec]]+= rule[i].diff[0]; + } + + /* Copy non-overwritten pages from the default UCA weights */ + for (i= 0; i < 256 ; i++) + { + if (!newweights[i]) + newweights[i]= defweights[i]; + } + + cs->sort_order= newlengths; + cs->sort_order_big= newweights; + cs->contractions= NULL; + + /* Now process contractions */ + if (ncontractions) + { + uint size= 0x40*0x40*sizeof(uint16); /* 8K, for basic latin letter only */ + if (!(cs->contractions= (uint16*) (*alloc)(size))) + return 1; + bzero((void*)cs->contractions, size); + for (i=0; i < rc; i++) + { + if (rule[i].curr[1]) + { + uint pageb= (rule[i].base >> 8) & 0xFF; + uint chb= rule[i].base & 0xFF; + uint16 *offsb= defweights[pageb] + chb*deflengths[pageb]; + uint offsc; + + if (offsb[1] || + rule[i].curr[0] < 0x40 || rule[i].curr[0] > 0x7f || + rule[i].curr[1] < 0x40 || rule[i].curr[1] > 0x7f) + { + /* + TODO: add error reporting; + We support only basic latin letters contractions at this point. + Also, We don't support contractions with weight longer than one. + Otherwise, we'd need much more memory. + */ + return 1; + } + offsc= (rule[i].curr[0]-0x40)*0x40+(rule[i].curr[1]-0x40); + + /* Copy base weight applying primary difference */ + cs->contractions[offsc]= offsb[0] + rule[i].diff[0]; + } + } + } + return 0; +} + + +/* + Universal CHARSET_INFO compatible wrappers + for the above internal functions. + Should work for any character set. 
+*/ + +static my_bool my_coll_init_uca(CHARSET_INFO *cs, void *(*alloc)(uint)) +{ + return create_tailoring(cs, alloc); +} + +static int my_strnncoll_any_uca(CHARSET_INFO *cs, + const uchar *s, uint slen, + const uchar *t, uint tlen, + my_bool t_is_prefix) +{ + return my_strnncoll_uca(cs, &my_any_uca_scanner_handler, + s, slen, t, tlen, t_is_prefix); +} + +static int my_strnncollsp_any_uca(CHARSET_INFO *cs, + const uchar *s, uint slen, + const uchar *t, uint tlen) +{ + return my_strnncollsp_uca(cs, &my_any_uca_scanner_handler, + s, slen, t, tlen); +} + +static void my_hash_sort_any_uca(CHARSET_INFO *cs, + const uchar *s, uint slen, + ulong *n1, ulong *n2) +{ + my_hash_sort_uca(cs, &my_any_uca_scanner_handler, s, slen, n1, n2); +} + +static int my_strnxfrm_any_uca(CHARSET_INFO *cs, + uchar *dst, uint dstlen, + const uchar *src, uint srclen) +{ + return my_strnxfrm_uca(cs, &my_any_uca_scanner_handler, + dst, dstlen, src, srclen); +} + + +#ifdef HAVE_CHARSET_ucs2 +/* + UCS2 optimized CHARSET_INFO compatible wrappers. 
+*/ +static int my_strnncoll_ucs2_uca(CHARSET_INFO *cs, + const uchar *s, uint slen, + const uchar *t, uint tlen, + my_bool t_is_prefix) +{ + return my_strnncoll_uca(cs, &my_ucs2_uca_scanner_handler, + s, slen, t, tlen, t_is_prefix); +} + +static int my_strnncollsp_ucs2_uca(CHARSET_INFO *cs, + const uchar *s, uint slen, + const uchar *t, uint tlen) +{ + return my_strnncollsp_uca(cs, &my_ucs2_uca_scanner_handler, + s, slen, t, tlen); +} + +static void my_hash_sort_ucs2_uca(CHARSET_INFO *cs, + const uchar *s, uint slen, + ulong *n1, ulong *n2) +{ + my_hash_sort_uca(cs, &my_ucs2_uca_scanner_handler, s, slen, n1, n2); +} + +static int my_strnxfrm_ucs2_uca(CHARSET_INFO *cs, + uchar *dst, uint dstlen, + const uchar *src, uint srclen) +{ + return my_strnxfrm_uca(cs, &my_ucs2_uca_scanner_handler, + dst, dstlen, src, srclen); +} + MY_COLLATION_HANDLER my_collation_ucs2_uca_handler = { - my_strnncoll_uca, - my_strnncollsp_uca, - my_strnxfrm_uca, + my_coll_init_uca, /* init */ + my_strnncoll_ucs2_uca, + my_strnncollsp_ucs2_uca, + my_strnxfrm_ucs2_uca, my_like_range_simple, my_wildcmp_uca, NULL, my_instr_mb, - my_hash_sort_uca + my_hash_sort_ucs2_uca }; CHARSET_INFO my_charset_ucs2_general_uca= { - 45,0,0, /* number */ - MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONTEXT, + 128,0,0, /* number */ + MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, "ucs2", /* cs name */ - "ucs2_general_uca", /* name */ + "ucs2_unicode_ci", /* name */ "", /* comment */ + "", /* tailoring */ NULL, /* ctype */ NULL, /* to_lower */ NULL, /* to_upper */ uca_length, /* sort_order */ + NULL, /* contractions */ uca_weight, /* sort_order_big*/ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - "", - "", + NULL, /* state_map */ + NULL, /* ident_map */ + 8, /* strxfrm_multiply */ + 2, /* mbminlen */ + 2, /* mbmaxlen */ + 9, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + &my_charset_ucs2_handler, + &my_collation_ucs2_uca_handler +}; + +CHARSET_INFO my_charset_ucs2_icelandic_uca_ci= +{ + 129,0,0, /* 
number */ + MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, + "ucs2", /* cs name */ + "ucs2_icelandic_ci",/* name */ + "", /* comment */ + icelandic, /* tailoring */ + NULL, /* ctype */ + NULL, /* to_lower */ + NULL, /* to_upper */ + NULL, /* sort_order */ + NULL, /* contractions */ + NULL, /* sort_order_big*/ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + NULL, /* state_map */ + NULL, /* ident_map */ 8, /* strxfrm_multiply */ 2, /* mbminlen */ 2, /* mbmaxlen */ @@ -7073,5 +7976,807 @@ CHARSET_INFO my_charset_ucs2_general_uca= &my_collation_ucs2_uca_handler }; +CHARSET_INFO my_charset_ucs2_latvian_uca_ci= +{ + 130,0,0, /* number */ + MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, + "ucs2", /* cs name */ + "ucs2_latvian_ci", /* name */ + "", /* comment */ + latvian, /* tailoring */ + NULL, /* ctype */ + NULL, /* to_lower */ + NULL, /* to_upper */ + NULL, /* sort_order */ + NULL, /* contractions */ + NULL, /* sort_order_big*/ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + NULL, /* state_map */ + NULL, /* ident_map */ + 8, /* strxfrm_multiply */ + 2, /* mbminlen */ + 2, /* mbmaxlen */ + 9, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + &my_charset_ucs2_handler, + &my_collation_ucs2_uca_handler +}; +CHARSET_INFO my_charset_ucs2_romanian_uca_ci= +{ + 131,0,0, /* number */ + MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, + "ucs2", /* cs name */ + "ucs2_romanian_ci", /* name */ + "", /* comment */ + romanian, /* tailoring */ + NULL, /* ctype */ + NULL, /* to_lower */ + NULL, /* to_upper */ + NULL, /* sort_order */ + NULL, /* contractions */ + NULL, /* sort_order_big*/ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + NULL, /* state_map */ + NULL, /* ident_map */ + 8, /* strxfrm_multiply */ + 2, /* mbminlen */ + 2, /* mbmaxlen */ + 9, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + &my_charset_ucs2_handler, + &my_collation_ucs2_uca_handler +}; + +CHARSET_INFO my_charset_ucs2_slovenian_uca_ci= +{ + 132,0,0, /* number */ + 
MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, + "ucs2", /* cs name */ + "ucs2_slovenian_ci",/* name */ + "", /* comment */ + slovenian, /* tailoring */ + NULL, /* ctype */ + NULL, /* to_lower */ + NULL, /* to_upper */ + NULL, /* sort_order */ + NULL, /* contractions */ + NULL, /* sort_order_big*/ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + NULL, /* state_map */ + NULL, /* ident_map */ + 8, /* strxfrm_multiply */ + 2, /* mbminlen */ + 2, /* mbmaxlen */ + 9, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + &my_charset_ucs2_handler, + &my_collation_ucs2_uca_handler +}; + +CHARSET_INFO my_charset_ucs2_polish_uca_ci= +{ + 133,0,0, /* number */ + MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, + "ucs2", /* cs name */ + "ucs2_polish_ci", /* name */ + "", /* comment */ + polish, /* tailoring */ + NULL, /* ctype */ + NULL, /* to_lower */ + NULL, /* to_upper */ + NULL, /* sort_order */ + NULL, /* contractions */ + NULL, /* sort_order_big*/ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + NULL, /* state_map */ + NULL, /* ident_map */ + 8, /* strxfrm_multiply */ + 2, /* mbminlen */ + 2, /* mbmaxlen */ + 9, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + &my_charset_ucs2_handler, + &my_collation_ucs2_uca_handler +}; + +CHARSET_INFO my_charset_ucs2_estonian_uca_ci= +{ + 134,0,0, /* number */ + MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, + "ucs2", /* cs name */ + "ucs2_estonian_ci", /* name */ + "", /* comment */ + estonian, /* tailoring */ + NULL, /* ctype */ + NULL, /* to_lower */ + NULL, /* to_upper */ + NULL, /* sort_order */ + NULL, /* contractions */ + NULL, /* sort_order_big*/ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + NULL, /* state_map */ + NULL, /* ident_map */ + 8, /* strxfrm_multiply */ + 2, /* mbminlen */ + 2, /* mbmaxlen */ + 9, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + &my_charset_ucs2_handler, + &my_collation_ucs2_uca_handler +}; + +CHARSET_INFO my_charset_ucs2_spanish_uca_ci= +{ + 135,0,0, /* number */ + 
MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, + "ucs2", /* cs name */ + "ucs2_spanish_ci", /* name */ + "", /* comment */ + spanish, /* tailoring */ + NULL, /* ctype */ + NULL, /* to_lower */ + NULL, /* to_upper */ + NULL, /* sort_order */ + NULL, /* contractions */ + NULL, /* sort_order_big*/ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + NULL, /* state_map */ + NULL, /* ident_map */ + 8, /* strxfrm_multiply */ + 2, /* mbminlen */ + 2, /* mbmaxlen */ + 9, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + &my_charset_ucs2_handler, + &my_collation_ucs2_uca_handler +}; + +CHARSET_INFO my_charset_ucs2_swedish_uca_ci= +{ + 136,0,0, /* number */ + MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, + "ucs2", /* cs name */ + "ucs2_swedish_ci", /* name */ + "", /* comment */ + swedish, /* tailoring */ + NULL, /* ctype */ + NULL, /* to_lower */ + NULL, /* to_upper */ + NULL, /* sort_order */ + NULL, /* contractions */ + NULL, /* sort_order_big*/ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + NULL, /* state_map */ + NULL, /* ident_map */ + 8, /* strxfrm_multiply */ + 2, /* mbminlen */ + 2, /* mbmaxlen */ + 9, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + &my_charset_ucs2_handler, + &my_collation_ucs2_uca_handler +}; + +CHARSET_INFO my_charset_ucs2_turkish_uca_ci= +{ + 137,0,0, /* number */ + MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, + "ucs2", /* cs name */ + "ucs2_turkish_ci", /* name */ + "", /* comment */ + turkish, /* tailoring */ + NULL, /* ctype */ + NULL, /* to_lower */ + NULL, /* to_upper */ + NULL, /* sort_order */ + NULL, /* contractions */ + NULL, /* sort_order_big*/ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + NULL, /* state_map */ + NULL, /* ident_map */ + 8, /* strxfrm_multiply */ + 2, /* mbminlen */ + 2, /* mbmaxlen */ + 9, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + &my_charset_ucs2_handler, + &my_collation_ucs2_uca_handler +}; + +CHARSET_INFO my_charset_ucs2_czech_uca_ci= +{ + 138,0,0, /* number */ + 
MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, + "ucs2", /* cs name */ + "ucs2_czech_ci", /* name */ + "", /* comment */ + czech, /* tailoring */ + NULL, /* ctype */ + NULL, /* to_lower */ + NULL, /* to_upper */ + NULL, /* sort_order */ + NULL, /* contractions */ + NULL, /* sort_order_big*/ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + NULL, /* state_map */ + NULL, /* ident_map */ + 8, /* strxfrm_multiply */ + 2, /* mbminlen */ + 2, /* mbmaxlen */ + 9, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + &my_charset_ucs2_handler, + &my_collation_ucs2_uca_handler +}; + + +CHARSET_INFO my_charset_ucs2_danish_uca_ci= +{ + 139,0,0, /* number */ + MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, + "ucs2", /* cs name */ + "ucs2_danish_ci", /* name */ + "", /* comment */ + danish, /* tailoring */ + NULL, /* ctype */ + NULL, /* to_lower */ + NULL, /* to_upper */ + NULL, /* sort_order */ + NULL, /* contractions */ + NULL, /* sort_order_big*/ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + NULL, /* state_map */ + NULL, /* ident_map */ + 8, /* strxfrm_multiply */ + 2, /* mbminlen */ + 2, /* mbmaxlen */ + 9, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + &my_charset_ucs2_handler, + &my_collation_ucs2_uca_handler +}; + +CHARSET_INFO my_charset_ucs2_lithuanian_uca_ci= +{ + 140,0,0, /* number */ + MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, + "ucs2", /* cs name */ + "ucs2_lithuanian_ci",/* name */ + "", /* comment */ + lithuanian, /* tailoring */ + NULL, /* ctype */ + NULL, /* to_lower */ + NULL, /* to_upper */ + NULL, /* sort_order */ + NULL, /* contractions */ + NULL, /* sort_order_big*/ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + NULL, /* state_map */ + NULL, /* ident_map */ + 8, /* strxfrm_multiply */ + 2, /* mbminlen */ + 2, /* mbmaxlen */ + 9, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + &my_charset_ucs2_handler, + &my_collation_ucs2_uca_handler +}; + +CHARSET_INFO my_charset_ucs2_slovak_uca_ci= +{ + 141,0,0, /* number */ + 
MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, + "ucs2", /* cs name */ + "ucs2_slovak_ci", /* name */ + "", /* comment */ + slovak, /* tailoring */ + NULL, /* ctype */ + NULL, /* to_lower */ + NULL, /* to_upper */ + NULL, /* sort_order */ + NULL, /* contractions */ + NULL, /* sort_order_big*/ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + NULL, /* state_map */ + NULL, /* ident_map */ + 8, /* strxfrm_multiply */ + 2, /* mbminlen */ + 2, /* mbmaxlen */ + 9, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + &my_charset_ucs2_handler, + &my_collation_ucs2_uca_handler +}; + +CHARSET_INFO my_charset_ucs2_spanish2_uca_ci= +{ + 142,0,0, /* number */ + MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, + "ucs2", /* cs name */ + "ucs2_spanish2_ci", /* name */ + "", /* comment */ + spanish2, /* tailoring */ + NULL, /* ctype */ + NULL, /* to_lower */ + NULL, /* to_upper */ + NULL, /* sort_order */ + NULL, /* contractions */ + NULL, /* sort_order_big*/ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + NULL, /* state_map */ + NULL, /* ident_map */ + 8, /* strxfrm_multiply */ + 2, /* mbminlen */ + 2, /* mbmaxlen */ + 9, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + &my_charset_ucs2_handler, + &my_collation_ucs2_uca_handler +}; + +#endif + + +#ifdef HAVE_CHARSET_utf8 +MY_COLLATION_HANDLER my_collation_any_uca_handler = +{ + my_coll_init_uca, /* init */ + my_strnncoll_any_uca, + my_strnncollsp_any_uca, + my_strnxfrm_any_uca, + my_like_range_simple, + my_wildcmp_uca, + NULL, + my_instr_mb, + my_hash_sort_any_uca +}; + +/* + We consider bytes with code more than 127 as a letter. + This garantees that word boundaries work fine with regular + expressions. Note, there is no need to mark byte 255 as a + letter, it is illegal byte in UTF8. 
+*/ +static uchar ctype_utf8[] = { + 0, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 40, 40, 40, 40, 40, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 72, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 132,132,132,132,132,132,132,132,132,132, 16, 16, 16, 16, 16, 16, + 16,129,129,129,129,129,129, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 16, 16, 16, 16, 16, + 16,130,130,130,130,130,130, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 16, 16, 16, 16, 32, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0 +}; + +extern MY_CHARSET_HANDLER my_charset_utf8_handler; + +CHARSET_INFO my_charset_utf8_general_uca_ci= +{ + 192,0,0, /* number */ + MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, + "utf8", /* cs name */ + "utf8_unicode_ci", /* name */ + "", /* comment */ + "", /* tailoring */ + ctype_utf8, /* ctype */ + NULL, /* to_lower */ + NULL, /* to_upper */ + uca_length, /* sort_order */ + NULL, /* contractions */ + uca_weight, /* sort_order_big*/ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + NULL, /* state_map */ + NULL, /* ident_map */ + 8, /* strxfrm_multiply */ + 1, /* mbminlen */ + 3, /* mbmaxlen */ + 9, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + &my_charset_utf8_handler, + &my_collation_any_uca_handler +}; + + +CHARSET_INFO my_charset_utf8_icelandic_uca_ci= +{ + 193,0,0, /* number */ + MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, + "utf8", /* cs name */ + "utf8_icelandic_ci",/* name */ + "", /* comment */ + icelandic, /* tailoring */ + ctype_utf8, /* ctype */ + NULL, /* to_lower */ + NULL, /* to_upper */ + NULL, /* sort_order */ + NULL, /* 
contractions */ + NULL, /* sort_order_big*/ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + NULL, /* state_map */ + NULL, /* ident_map */ + 8, /* strxfrm_multiply */ + 1, /* mbminlen */ + 2, /* mbmaxlen */ + 9, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + &my_charset_utf8_handler, + &my_collation_any_uca_handler +}; + +CHARSET_INFO my_charset_utf8_latvian_uca_ci= +{ + 194,0,0, /* number */ + MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, + "utf8", /* cs name */ + "utf8_latvian_ci", /* name */ + "", /* comment */ + latvian, /* tailoring */ + ctype_utf8, /* ctype */ + NULL, /* to_lower */ + NULL, /* to_upper */ + NULL, /* sort_order */ + NULL, /* contractions */ + NULL, /* sort_order_big*/ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + NULL, /* state_map */ + NULL, /* ident_map */ + 8, /* strxfrm_multiply */ + 1, /* mbminlen */ + 2, /* mbmaxlen */ + 9, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + &my_charset_utf8_handler, + &my_collation_any_uca_handler +}; + +CHARSET_INFO my_charset_utf8_romanian_uca_ci= +{ + 195,0,0, /* number */ + MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, + "utf8", /* cs name */ + "utf8_romanian_ci", /* name */ + "", /* comment */ + romanian, /* tailoring */ + ctype_utf8, /* ctype */ + NULL, /* to_lower */ + NULL, /* to_upper */ + NULL, /* sort_order */ + NULL, /* contractions */ + NULL, /* sort_order_big*/ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + NULL, /* state_map */ + NULL, /* ident_map */ + 8, /* strxfrm_multiply */ + 1, /* mbminlen */ + 2, /* mbmaxlen */ + 9, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + &my_charset_utf8_handler, + &my_collation_any_uca_handler +}; + +CHARSET_INFO my_charset_utf8_slovenian_uca_ci= +{ + 196,0,0, /* number */ + MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, + "utf8", /* cs name */ + "utf8_slovenian_ci",/* name */ + "", /* comment */ + slovenian, /* tailoring */ + ctype_utf8, /* ctype */ + NULL, /* to_lower */ + NULL, /* to_upper */ + NULL, /* sort_order */ + NULL, /* 
contractions */ + NULL, /* sort_order_big*/ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + NULL, /* state_map */ + NULL, /* ident_map */ + 8, /* strxfrm_multiply */ + 1, /* mbminlen */ + 2, /* mbmaxlen */ + 9, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + &my_charset_utf8_handler, + &my_collation_any_uca_handler +}; + +CHARSET_INFO my_charset_utf8_polish_uca_ci= +{ + 197,0,0, /* number */ + MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, + "utf8", /* cs name */ + "utf8_polish_ci", /* name */ + "", /* comment */ + polish, /* tailoring */ + ctype_utf8, /* ctype */ + NULL, /* to_lower */ + NULL, /* to_upper */ + NULL, /* sort_order */ + NULL, /* contractions */ + NULL, /* sort_order_big*/ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + NULL, /* state_map */ + NULL, /* ident_map */ + 8, /* strxfrm_multiply */ + 1, /* mbminlen */ + 2, /* mbmaxlen */ + 9, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + &my_charset_utf8_handler, + &my_collation_any_uca_handler +}; + +CHARSET_INFO my_charset_utf8_estonian_uca_ci= +{ + 198,0,0, /* number */ + MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, + "utf8", /* cs name */ + "utf8_estonian_ci", /* name */ + "", /* comment */ + estonian, /* tailoring */ + ctype_utf8, /* ctype */ + NULL, /* to_lower */ + NULL, /* to_upper */ + NULL, /* sort_order */ + NULL, /* contractions */ + NULL, /* sort_order_big*/ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + NULL, /* state_map */ + NULL, /* ident_map */ + 8, /* strxfrm_multiply */ + 1, /* mbminlen */ + 2, /* mbmaxlen */ + 9, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + &my_charset_utf8_handler, + &my_collation_any_uca_handler +}; + +CHARSET_INFO my_charset_utf8_spanish_uca_ci= +{ + 199,0,0, /* number */ + MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, + "utf8", /* cs name */ + "utf8_spanish_ci", /* name */ + "", /* comment */ + spanish, /* tailoring */ + ctype_utf8, /* ctype */ + NULL, /* to_lower */ + NULL, /* to_upper */ + NULL, /* sort_order */ + NULL, /* 
contractions */ + NULL, /* sort_order_big*/ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + NULL, /* state_map */ + NULL, /* ident_map */ + 8, /* strxfrm_multiply */ + 1, /* mbminlen */ + 2, /* mbmaxlen */ + 9, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + &my_charset_utf8_handler, + &my_collation_any_uca_handler +}; + +CHARSET_INFO my_charset_utf8_swedish_uca_ci= +{ + 200,0,0, /* number */ + MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, + "utf8", /* cs name */ + "utf8_swedish_ci", /* name */ + "", /* comment */ + swedish, /* tailoring */ + ctype_utf8, /* ctype */ + NULL, /* to_lower */ + NULL, /* to_upper */ + NULL, /* sort_order */ + NULL, /* contractions */ + NULL, /* sort_order_big*/ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + NULL, /* state_map */ + NULL, /* ident_map */ + 8, /* strxfrm_multiply */ + 1, /* mbminlen */ + 2, /* mbmaxlen */ + 9, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + &my_charset_utf8_handler, + &my_collation_any_uca_handler +}; + +CHARSET_INFO my_charset_utf8_turkish_uca_ci= +{ + 201,0,0, /* number */ + MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, + "utf8", /* cs name */ + "utf8_turkish_ci", /* name */ + "", /* comment */ + turkish, /* tailoring */ + ctype_utf8, /* ctype */ + NULL, /* to_lower */ + NULL, /* to_upper */ + NULL, /* sort_order */ + NULL, /* contractions */ + NULL, /* sort_order_big*/ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + NULL, /* state_map */ + NULL, /* ident_map */ + 8, /* strxfrm_multiply */ + 1, /* mbminlen */ + 2, /* mbmaxlen */ + 9, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + &my_charset_utf8_handler, + &my_collation_any_uca_handler +}; + +CHARSET_INFO my_charset_utf8_czech_uca_ci= +{ + 202,0,0, /* number */ + MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, + "utf8", /* cs name */ + "utf8_czech_ci", /* name */ + "", /* comment */ + czech, /* tailoring */ + ctype_utf8, /* ctype */ + NULL, /* to_lower */ + NULL, /* to_upper */ + NULL, /* sort_order */ + NULL, /* contractions 
*/ + NULL, /* sort_order_big*/ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + NULL, /* state_map */ + NULL, /* ident_map */ + 8, /* strxfrm_multiply */ + 1, /* mbminlen */ + 2, /* mbmaxlen */ + 9, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + &my_charset_utf8_handler, + &my_collation_any_uca_handler +}; + + +CHARSET_INFO my_charset_utf8_danish_uca_ci= +{ + 203,0,0, /* number */ + MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, + "utf8", /* cs name */ + "utf8_danish_ci", /* name */ + "", /* comment */ + danish, /* tailoring */ + ctype_utf8, /* ctype */ + NULL, /* to_lower */ + NULL, /* to_upper */ + NULL, /* sort_order */ + NULL, /* contractions */ + NULL, /* sort_order_big*/ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + NULL, /* state_map */ + NULL, /* ident_map */ + 8, /* strxfrm_multiply */ + 1, /* mbminlen */ + 2, /* mbmaxlen */ + 9, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + &my_charset_utf8_handler, + &my_collation_any_uca_handler +}; + +CHARSET_INFO my_charset_utf8_lithuanian_uca_ci= +{ + 204,0,0, /* number */ + MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, + "utf8", /* cs name */ + "utf8_lithuanian_ci",/* name */ + "", /* comment */ + lithuanian, /* tailoring */ + ctype_utf8, /* ctype */ + NULL, /* to_lower */ + NULL, /* to_upper */ + NULL, /* sort_order */ + NULL, /* contractions */ + NULL, /* sort_order_big*/ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + NULL, /* state_map */ + NULL, /* ident_map */ + 8, /* strxfrm_multiply */ + 1, /* mbminlen */ + 2, /* mbmaxlen */ + 9, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + &my_charset_utf8_handler, + &my_collation_any_uca_handler +}; + +CHARSET_INFO my_charset_utf8_slovak_uca_ci= +{ + 205,0,0, /* number */ + MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, + "utf8", /* cs name */ + "utf8_slovak_ci", /* name */ + "", /* comment */ + slovak, /* tailoring */ + ctype_utf8, /* ctype */ + NULL, /* to_lower */ + NULL, /* to_upper */ + NULL, /* sort_order */ + NULL, /* contractions */ 
+ NULL, /* sort_order_big*/ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + NULL, /* state_map */ + NULL, /* ident_map */ + 8, /* strxfrm_multiply */ + 1, /* mbminlen */ + 2, /* mbmaxlen */ + 9, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + &my_charset_utf8_handler, + &my_collation_any_uca_handler +}; + +CHARSET_INFO my_charset_utf8_spanish2_uca_ci= +{ + 206,0,0, /* number */ + MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, + "utf8", /* cs name */ + "utf8_spanish2_ci", /* name */ + "", /* comment */ + spanish2, /* tailoring */ + ctype_utf8, /* ctype */ + NULL, /* to_lower */ + NULL, /* to_upper */ + NULL, /* sort_order */ + NULL, /* contractions */ + NULL, /* sort_order_big*/ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + NULL, /* state_map */ + NULL, /* ident_map */ + 8, /* strxfrm_multiply */ + 1, /* mbminlen */ + 2, /* mbmaxlen */ + 9, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + &my_charset_utf8_handler, + &my_collation_any_uca_handler +}; #endif diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c index 67340fdd4f4..20a5ff58d3a 100644 --- a/strings/ctype-ucs2.c +++ b/strings/ctype-ucs2.c @@ -182,7 +182,8 @@ static void my_casedn_str_ucs2(CHARSET_INFO *cs __attribute__((unused)), static int my_strnncoll_ucs2(CHARSET_INFO *cs, const uchar *s, uint slen, - const uchar *t, uint tlen) + const uchar *t, uint tlen, + my_bool t_is_prefix) { int s_res,t_res; my_wc_t s_wc,t_wc; @@ -213,7 +214,14 @@ static int my_strnncoll_ucs2(CHARSET_INFO *cs, s+=s_res; t+=t_res; } - return ( (se-s) - (te-t) ); + return t_is_prefix ? 
t-te : ((se-s) - (te-t)); +} + +static int my_strnncollsp_ucs2(CHARSET_INFO *cs, + const uchar *s, uint slen, + const uchar *t, uint tlen) +{ + return my_strnncoll_ucs2(cs,s,slen,t,tlen,0); } @@ -312,7 +320,6 @@ static int my_mbcharlen_ucs2(CHARSET_INFO *cs __attribute__((unused)) , #include <m_string.h> #include <stdarg.h> -#include <assert.h> static int my_vsnprintf_ucs2(char *dst, uint n, const char* fmt, va_list ap) { @@ -1224,8 +1231,9 @@ int my_wildcmp_ucs2_bin(CHARSET_INFO *cs, static int my_strnncoll_ucs2_bin(CHARSET_INFO *cs, - const uchar *s, uint slen, - const uchar *t, uint tlen) + const uchar *s, uint slen, + const uchar *t, uint tlen, + my_bool t_is_prefix) { int s_res,t_res; my_wc_t s_wc,t_wc; @@ -1250,7 +1258,14 @@ int my_strnncoll_ucs2_bin(CHARSET_INFO *cs, s+=s_res; t+=t_res; } - return ( (se-s) - (te-t) ); + return t_is_prefix ? t-te : ((se-s) - (te-t)); +} + +static int my_strnncollsp_ucs2_bin(CHARSET_INFO *cs, + const uchar *s, uint slen, + const uchar *t, uint tlen) +{ + return my_strnncoll_ucs2_bin(cs,s,slen,t,tlen,0); } @@ -1373,8 +1388,9 @@ my_bool my_like_range_ucs2(CHARSET_INFO *cs, static MY_COLLATION_HANDLER my_collation_ucs2_general_ci_handler = { + NULL, /* init */ my_strnncoll_ucs2, - my_strnncoll_ucs2, + my_strnncollsp_ucs2, my_strnxfrm_ucs2, my_like_range_ucs2, my_wildcmp_ucs2_ci, @@ -1386,8 +1402,9 @@ static MY_COLLATION_HANDLER my_collation_ucs2_general_ci_handler = static MY_COLLATION_HANDLER my_collation_ucs2_bin_handler = { + NULL, /* init */ my_strnncoll_ucs2_bin, - my_strnncoll_ucs2_bin, + my_strnncollsp_ucs2_bin, my_strnxfrm_ucs2_bin, my_like_range_simple, my_wildcmp_ucs2_bin, @@ -1399,6 +1416,7 @@ static MY_COLLATION_HANDLER my_collation_ucs2_bin_handler = MY_CHARSET_HANDLER my_charset_ucs2_handler= { + NULL, /* init */ my_ismbchar_ucs2, /* ismbchar */ my_mbcharlen_ucs2, /* mbcharlen */ my_numchars_ucs2, @@ -1427,19 +1445,21 @@ MY_CHARSET_HANDLER my_charset_ucs2_handler= CHARSET_INFO my_charset_ucs2_general_ci= { 35,0,0, 
/* number */ - MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONTEXT, + MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_STRNXFRM|MY_CS_UNICODE, "ucs2", /* cs name */ "ucs2_general_ci", /* name */ "", /* comment */ + NULL, /* tailoring */ ctype_ucs2, /* ctype */ to_lower_ucs2, /* to_lower */ to_upper_ucs2, /* to_upper */ to_upper_ucs2, /* sort_order */ + NULL, /* contractions */ + NULL, /* sort_order_big*/ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - NULL, /* sort_order_big*/ - "", - "", + NULL, /* state_map */ + NULL, /* ident_map */ 1, /* strxfrm_multiply */ 2, /* mbminlen */ 2, /* mbmaxlen */ @@ -1452,19 +1472,21 @@ CHARSET_INFO my_charset_ucs2_general_ci= CHARSET_INFO my_charset_ucs2_bin= { 90,0,0, /* number */ - MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_UNICODE|MY_CS_NONTEXT, + MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_UNICODE, "ucs2", /* cs name */ "ucs2_bin", /* name */ "", /* comment */ + NULL, /* tailoring */ ctype_ucs2, /* ctype */ to_lower_ucs2, /* to_lower */ to_upper_ucs2, /* to_upper */ to_upper_ucs2, /* sort_order */ + NULL, /* contractions */ NULL, /* sort_order_big*/ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - "", - "", + NULL, /* state_map */ + NULL, /* ident_map */ 1, /* strxfrm_multiply */ 2, /* mbminlen */ 2, /* mbmaxlen */ diff --git a/strings/ctype-ujis.c b/strings/ctype-ujis.c index fd3692553be..3f53a07f527 100644 --- a/strings/ctype-ujis.c +++ b/strings/ctype-ujis.c @@ -8423,6 +8423,7 @@ my_wc_mb_euc_jp(CHARSET_INFO *c,my_wc_t wc, unsigned char *s, unsigned char *e) static MY_COLLATION_HANDLER my_collation_ci_handler = { + NULL, /* init */ my_strnncoll_simple,/* strnncoll */ my_strnncollsp_simple, my_strnxfrm_simple, /* strnxfrm */ @@ -8435,14 +8436,15 @@ static MY_COLLATION_HANDLER my_collation_ci_handler = static MY_CHARSET_HANDLER my_charset_handler= { + NULL, /* init */ ismbchar_ujis, mbcharlen_ujis, my_numchars_mb, my_charpos_mb, my_well_formed_len_mb, my_lengthsp_8bit, - my_mb_wc_euc_jp, /* mb_wc */ - my_wc_mb_euc_jp, /* wc_mb */ + 
my_mb_wc_euc_jp, /* mb_wc */ + my_wc_mb_euc_jp, /* wc_mb */ my_caseup_str_mb, my_casedn_str_mb, my_caseup_mb, @@ -8468,20 +8470,22 @@ CHARSET_INFO my_charset_ujis_japanese_ci= "ujis", /* cs name */ "ujis_japanese_ci", /* name */ "", /* comment */ + NULL, /* tailoring */ ctype_ujis, to_lower_ujis, to_upper_ujis, sort_order_ujis, + NULL, /* sort_order_big*/ + NULL, /* contractions */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - NULL, /* sort_order_big*/ - "", - "", + NULL, /* state_map */ + NULL, /* ident_map */ 1, /* strxfrm_multiply */ 1, /* mbminlen */ 3, /* mbmaxlen */ 0, /* min_sort_char */ - 0, /* max_sort_char */ + 255, /* max_sort_char */ &my_charset_handler, &my_collation_ci_handler }; @@ -8494,20 +8498,22 @@ CHARSET_INFO my_charset_ujis_bin= "ujis", /* cs name */ "ujis_bin", /* name */ "", /* comment */ + NULL, /* tailoring */ ctype_ujis, to_lower_ujis, to_upper_ujis, sort_order_ujis, + NULL, /* contractions */ + NULL, /* sort_order_big*/ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - NULL, /* sort_order_big*/ - "", - "", + NULL, /* state_map */ + NULL, /* ident_map */ 1, /* strxfrm_multiply */ 1, /* mbminlen */ 3, /* mbmaxlen */ 0, /* min_sort_char */ - 0, /* max_sort_char */ + 255, /* max_sort_char */ &my_charset_handler, &my_collation_mb_bin_handler }; diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c index 29d2c5d1358..bf2d8a17fb4 100644 --- a/strings/ctype-utf8.c +++ b/strings/ctype-utf8.c @@ -1524,8 +1524,12 @@ MY_UNICASE_INFO *uni_plane[256]={ #ifdef HAVE_CHARSET_utf8 -/* These arrays are taken from usa7 implementation */ - +/* + We consider bytes with code more than 127 as a letter. + This garantees that word boundaries work fine with regular + expressions. Note, there is no need to mark byte 255 as a + letter, it is illegal byte in UTF8. 
+*/ static uchar ctype_utf8[] = { 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 40, 40, 40, 40, 40, 32, 32, @@ -1536,16 +1540,18 @@ static uchar ctype_utf8[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 16, 16, 16, 16, 16, 16,130,130,130,130,130,130, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 16, 16, 16, 16, 32, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0 }; +/* The below are taken from usa7 implementation */ + static uchar to_lower_utf8[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, @@ -1802,7 +1808,8 @@ static void my_casedn_str_utf8(CHARSET_INFO *cs, char * s) static int my_strnncoll_utf8(CHARSET_INFO *cs, const uchar *s, uint slen, - const uchar *t, uint tlen) + const uchar *t, uint tlen, + my_bool t_is_prefix) { int s_res,t_res; my_wc_t s_wc,t_wc; @@ -1833,7 +1840,7 @@ static int my_strnncoll_utf8(CHARSET_INFO *cs, s+=s_res; t+=t_res; } - return ( (se-s) - (te-t) ); + return t_is_prefix ? 
t-te : ((se-s) - (te-t)); } @@ -2039,6 +2046,7 @@ static int my_mbcharlen_utf8(CHARSET_INFO *cs __attribute__((unused)) , uint c) static MY_COLLATION_HANDLER my_collation_ci_handler = { + NULL, /* init */ my_strnncoll_utf8, my_strnncollsp_utf8, my_strnxfrm_utf8, @@ -2049,8 +2057,9 @@ static MY_COLLATION_HANDLER my_collation_ci_handler = my_hash_sort_utf8 }; -static MY_CHARSET_HANDLER my_charset_handler= +MY_CHARSET_HANDLER my_charset_utf8_handler= { + NULL, /* init */ my_ismbchar_utf8, my_mbcharlen_utf8, my_numchars_mb, @@ -2084,21 +2093,23 @@ CHARSET_INFO my_charset_utf8_general_ci= "utf8", /* cs name */ "utf8_general_ci", /* name */ "", /* comment */ + NULL, /* tailoring */ ctype_utf8, /* ctype */ to_lower_utf8, /* to_lower */ to_upper_utf8, /* to_upper */ to_upper_utf8, /* sort_order */ + NULL, /* contractions */ + NULL, /* sort_order_big*/ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - NULL, /* sort_order_big*/ - "", - "", + NULL, /* state_map */ + NULL, /* ident_map */ 1, /* strxfrm_multiply */ 1, /* mbminlen */ 3, /* mbmaxlen */ 0, /* min_sort_char */ - 0, /* max_sort_char */ - &my_charset_handler, + 255, /* max_sort_char */ + &my_charset_utf8_handler, &my_collation_ci_handler }; @@ -2110,27 +2121,28 @@ CHARSET_INFO my_charset_utf8_bin= "utf8", /* cs name */ "utf8_bin", /* name */ "", /* comment */ + NULL, /* tailoring */ ctype_utf8, /* ctype */ to_lower_utf8, /* to_lower */ to_upper_utf8, /* to_upper */ to_upper_utf8, /* sort_order */ + NULL, /* contractions */ + NULL, /* sort_order_big*/ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - NULL, /* sort_order_big*/ - "", - "", + NULL, /* state_map */ + NULL, /* ident_map */ 1, /* strxfrm_multiply */ 1, /* mbminlen */ 3, /* mbmaxlen */ 0, /* min_sort_char */ - 0, /* max_sort_char */ - &my_charset_handler, + 255, /* max_sort_char */ + &my_charset_utf8_handler, &my_collation_mb_bin_handler }; #ifdef MY_TEST_UTF8 - #include <stdio.h> static void test_mb(CHARSET_INFO *cs, uchar *s) @@ -2162,7 +2174,7 @@ int 
main() test_mb(cs,(uchar*)str); - pr1;2cintf("orig :'%s'\n",str); + printf("orig :'%s'\n",str); my_caseup_utf8(cs,str,15); printf("caseup :'%s'\n",str); diff --git a/strings/ctype-win1250ch.c b/strings/ctype-win1250ch.c index 2eefb570170..b4dbda3e8ed 100644 --- a/strings/ctype-win1250ch.c +++ b/strings/ctype-win1250ch.c @@ -448,20 +448,25 @@ static struct wordvalue doubles[] = { static int my_strnncoll_win1250ch(CHARSET_INFO *cs __attribute__((unused)), const uchar * s1, uint len1, - const uchar * s2, uint len2) + const uchar * s2, uint len2, + my_bool s2_is_prefix) { int v1, v2; const uchar * p1, * p2; int pass1 = 0, pass2 = 0; int diff; + if (s2_is_prefix && len1 > len2) + len1=len2; + p1 = s1; p2 = s2; - do { + do + { NEXT_CMP_VALUE(s1, p1, pass1, v1, (int)len1); NEXT_CMP_VALUE(s2, p2, pass2, v2, (int)len2); - diff = v1 - v2; - if (diff != 0) return diff; + if ((diff = v1 - v2)) + return diff; } while (v1); return 0; } @@ -478,7 +483,7 @@ int my_strnncollsp_win1250ch(CHARSET_INFO * cs, { for ( ; slen && s[slen-1] == ' ' ; slen--); for ( ; tlen && t[tlen-1] == ' ' ; tlen--); - return my_strnncoll_win1250ch(cs,s,slen,t,tlen); + return my_strnncoll_win1250ch(cs,s,slen,t,tlen,0); } @@ -605,6 +610,7 @@ my_like_range_win1250ch(CHARSET_INFO *cs __attribute__((unused)), static MY_COLLATION_HANDLER my_collation_czech_ci_handler = { + NULL, /* init */ my_strnncoll_win1250ch, my_strnncollsp_win1250ch, my_strnxfrm_win1250ch, @@ -623,14 +629,17 @@ CHARSET_INFO my_charset_cp1250_czech_ci = "cp1250", /* cs name */ "cp1250_czech_cs", /* name */ "", /* comment */ + NULL, /* tailoring */ ctype_win1250ch, to_lower_win1250ch, to_upper_win1250ch, sort_order_win1250ch, + NULL, /* contractions */ NULL, /* sort_order_big*/ tab_cp1250_uni, /* tab_to_uni */ idx_uni_cp1250, /* tab_from_uni */ - "","", + NULL, /* state_map */ + NULL, /* ident_map */ 2, /* strxfrm_multiply */ 1, /* mbminlen */ 1, /* mbmaxlen */ diff --git a/strings/ctype.c b/strings/ctype.c index cbd13111b70..4454d3c45e1 
100644 --- a/strings/ctype.c +++ b/strings/ctype.c @@ -22,6 +22,23 @@ #endif +/* + + This files implements routines which parse XML based + character set and collation description files. + + Unicode collations are encoded according to + + Unicode Technical Standard #35 + Locale Data Markup Language (LDML) + http://www.unicode.org/reports/tr35/ + + and converted into ICU string according to + + Collation Customization + http://oss.software.ibm.com/icu/userguide/Collate_Customization.html + +*/ static char *mstr(char *str,const char *src,uint l1,uint l2) { @@ -54,6 +71,11 @@ struct my_cs_file_section_st #define _CS_PRIMARY_ID 15 #define _CS_BINARY_ID 16 #define _CS_CSDESCRIPT 17 +#define _CS_RESET 18 +#define _CS_DIFF1 19 +#define _CS_DIFF2 20 +#define _CS_DIFF3 21 + static struct my_cs_file_section_st sec[] = { @@ -83,6 +105,10 @@ static struct my_cs_file_section_st sec[] = {_CS_ORDER, "charsets.charset.collation.order"}, {_CS_FLAG, "charsets.charset.collation.flag"}, {_CS_COLLMAP, "charsets.charset.collation.map"}, + {_CS_RESET, "charsets.charset.collation.rules.reset"}, + {_CS_DIFF1, "charsets.charset.collation.rules.p"}, + {_CS_DIFF2, "charsets.charset.collation.rules.s"}, + {_CS_DIFF3, "charsets.charset.collation.rules.t"}, {0, NULL} }; @@ -98,6 +124,7 @@ static struct my_cs_file_section_st * cs_file_sec(const char *attr, uint len) } #define MY_CS_CSDESCR_SIZE 64 +#define MY_CS_TAILORING_SIZE 128 typedef struct my_cs_file_info { @@ -109,6 +136,8 @@ typedef struct my_cs_file_info uchar sort_order[MY_CS_SORT_ORDER_TABLE_SIZE]; uint16 tab_to_uni[MY_CS_TO_UNI_TABLE_SIZE]; char comment[MY_CS_CSDESCR_SIZE]; + char tailoring[MY_CS_TAILORING_SIZE]; + size_t tailoring_length; CHARSET_INFO cs; int (*add_collation)(CHARSET_INFO *cs); } MY_CHARSET_LOADER; @@ -156,9 +185,11 @@ static int cs_enter(MY_XML_PARSER *st,const char *attr, uint len) struct my_cs_file_section_st *s= cs_file_sec(attr,len); if ( s && (s->state == _CS_CHARSET)) - { bzero(&i->cs,sizeof(i->cs)); - } + + 
if (s && (s->state == _CS_COLLATION)) + i->tailoring_length= 0; + return MY_XML_OK; } @@ -242,6 +273,26 @@ static int cs_value(MY_XML_PARSER *st,const char *attr, uint len) fill_uchar(i->ctype,MY_CS_CTYPE_TABLE_SIZE,attr,len); i->cs.ctype=i->ctype; break; + case _CS_RESET: + case _CS_DIFF1: + case _CS_DIFF2: + case _CS_DIFF3: + { + /* + Convert collation description from + Locale Data Markup Language (LDML) + into ICU Collation Customization expression. + */ + char arg[16]; + const char *cmd[]= {"&","<","<<","<<<"}; + i->cs.tailoring= i->tailoring; + mstr(arg,attr,len,sizeof(arg)-1); + if (i->tailoring_length + 20 < sizeof(i->tailoring)) + { + char *dst= i->tailoring_length + i->tailoring; + i->tailoring_length+= sprintf(dst," %s %s",cmd[state-_CS_RESET],arg); + } + } } return MY_XML_OK; } diff --git a/strings/int2str.c b/strings/int2str.c index 38e8a5182a3..be86e9735ab 100644 --- a/strings/int2str.c +++ b/strings/int2str.c @@ -14,82 +14,95 @@ along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -/* - Defines: int2str(), itoa(), ltoa() - - int2str(dst, radix, val) - converts the (long) integer "val" to character form and moves it to - the destination string "dst" followed by a terminating NUL. The - result is normally a pointer to this NUL character, but if the radix - is dud the result will be NullS and nothing will be changed. - - If radix is -2..-36, val is taken to be SIGNED. - If radix is 2.. 36, val is taken to be UNSIGNED. - That is, val is signed if and only if radix is. You will normally - use radix -10 only through itoa and ltoa, for radix 2, 8, or 16 - unsigned is what you generally want. - - _dig_vec is public just in case someone has a use for it. - The definitions of itoa and ltoa are actually macros in m_string.h, - but this is where the code is. - - Note: The standard itoa() returns a pointer to the argument, when int2str - returns the pointer to the end-null. 
- itoa assumes that 10 -base numbers are allways signed and other arn't. -*/ - #include <my_global.h> #include "m_string.h" -char NEAR _dig_vec[] = +/* + _dig_vec arrays are public because they are used in several outer places. +*/ +char NEAR _dig_vec_upper[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"; +char NEAR _dig_vec_lower[] = + "0123456789abcdefghijklmnopqrstuvwxyz"; -char *int2str(register long int val, register char *dst, register int radix) +/* + Convert integer to its string representation in given scale of notation. + + SYNOPSIS + int2str() + val - value to convert + dst - points to buffer where string representation should be stored + radix - radix of scale of notation + upcase - set to 1 if we should use upper-case digits + + DESCRIPTION + Converts the (long) integer value to its character form and moves it to + the destination buffer followed by a terminating NUL. + If radix is -2..-36, val is taken to be SIGNED, if radix is 2..36, val is + taken to be UNSIGNED. That is, val is signed if and only if radix is. + All other radixes treated as bad and nothing will be changed in this case. + + For conversion to decimal representation (radix is -10 or 10) one can use + optimized int10_to_str() function. + + RETURN VALUE + Pointer to ending NUL character or NullS if radix is bad. +*/ + +char * +int2str(register long int val, register char *dst, register int radix, + int upcase) { char buffer[65]; register char *p; long int new_val; + char *dig_vec= upcase ? _dig_vec_upper : _dig_vec_lower; - if (radix < 0) { - if (radix < -36 || radix > -2) return NullS; - if (val < 0) { + if (radix < 0) + { + if (radix < -36 || radix > -2) + return NullS; + if (val < 0) + { *dst++ = '-'; val = -val; } radix = -radix; - } else { - if (radix > 36 || radix < 2) return NullS; } - /* The slightly contorted code which follows is due to the - fact that few machines directly support unsigned long / and %. - Certainly the VAX C compiler generates a subroutine call. 
In - the interests of efficiency (hollow laugh) I let this happen - for the first digit only; after that "val" will be in range so - that signed integer division will do. Sorry 'bout that. - CHECK THE CODE PRODUCED BY YOUR C COMPILER. The first % and / - should be unsigned, the second % and / signed, but C compilers - tend to be extraordinarily sensitive to minor details of style. - This works on a VAX, that's all I claim for it. - */ + else if (radix > 36 || radix < 2) + return NullS; + + /* + The slightly contorted code which follows is due to the fact that + few machines directly support unsigned long / and %. Certainly + the VAX C compiler generates a subroutine call. In the interests + of efficiency (hollow laugh) I let this happen for the first digit + only; after that "val" will be in range so that signed integer + division will do. Sorry 'bout that. CHECK THE CODE PRODUCED BY + YOUR C COMPILER. The first % and / should be unsigned, the second + % and / signed, but C compilers tend to be extraordinarily + sensitive to minor details of style. This works on a VAX, that's + all I claim for it. + */ p = &buffer[sizeof(buffer)-1]; *p = '\0'; new_val=(ulong) val / (ulong) radix; - *--p = _dig_vec[(uchar) ((ulong) val- (ulong) new_val*(ulong) radix)]; + *--p = dig_vec[(uchar) ((ulong) val- (ulong) new_val*(ulong) radix)]; val = new_val; #ifdef HAVE_LDIV while (val != 0) { ldiv_t res; res=ldiv(val,radix); - *--p = _dig_vec[res.rem]; + *--p = dig_vec[res.rem]; val= res.quot; } #else while (val != 0) { new_val=val/radix; - *--p = _dig_vec[(uchar) (val-new_val*radix)]; + *--p = dig_vec[(uchar) (val-new_val*radix)]; val= new_val; } #endif @@ -99,8 +112,21 @@ char *int2str(register long int val, register char *dst, register int radix) /* - This is a faster version of the above optimized for the normal case of - radix 10 / -10 + Converts integer to its string representation in decimal notation. 
+ + SYNOPSIS + int10_to_str() + val - value to convert + dst - points to buffer where string representation should be stored + radix - flag that shows whenever val should be taken as signed or not + + DESCRIPTION + This is version of int2str() function which is optimized for normal case + of radix 10/-10. It takes only sign of radix parameter into account and + not its absolute value. + + RETURN VALUE + Pointer to ending NUL character. */ char *int10_to_str(long int val,char *dst,int radix) @@ -133,22 +159,3 @@ char *int10_to_str(long int val,char *dst,int radix) while ((*dst++ = *p++) != 0) ; return dst-1; } - - -#ifdef USE_MY_ITOA - - /* Change to less general itoa interface */ - -char *my_itoa(int val, char *dst, int radix) -{ - VOID(int2str((long) val,dst,(radix == 10 ? -10 : radix))); - return dst; -} - -char *my_ltoa(long int val, char *dst, int radix) -{ - VOID(int2str((long) val,dst,(radix == 10 ? -10 : radix))); - return dst; -} - -#endif diff --git a/strings/longlong2str-x86.s b/strings/longlong2str-x86.s index 8476bf49b75..fcc57810224 100644 --- a/strings/longlong2str-x86.s +++ b/strings/longlong2str-x86.s @@ -83,7 +83,7 @@ longlong2str: divl %ebx decl %ecx movl %eax,%esi # quotent in ebp:esi - movb _dig_vec(%edx),%al # al is faster than dl + movb _dig_vec_upper(%edx),%al # al is faster than dl movb %al,(%ecx) # store value in buff .align 4 .L155: @@ -93,7 +93,7 @@ longlong2str: jl .L153 je .L10_mov # Ready movl %esi,%eax - movl $_dig_vec,%ebp + movl $_dig_vec_upper,%ebp .align 4 .L154: # Do rest with integer precision diff --git a/strings/longlong2str.c b/strings/longlong2str.c index a991c57b4d9..096531095db 100644 --- a/strings/longlong2str.c +++ b/strings/longlong2str.c @@ -43,8 +43,6 @@ #if defined(HAVE_LONG_LONG) && !defined(longlong2str) && !defined(HAVE_LONGLONG2STR) -extern char NEAR _dig_vec[]; - /* This assumes that longlong multiplication is faster than longlong division. 
*/ @@ -81,14 +79,14 @@ char *longlong2str(longlong val,char *dst,int radix) { ulonglong quo=(ulonglong) val/(uint) radix; uint rem= (uint) (val- quo* (uint) radix); - *--p = _dig_vec[rem]; + *--p = _dig_vec_upper[rem]; val= quo; } long_val= (long) val; while (long_val != 0) { long quo= long_val/radix; - *--p = _dig_vec[(uchar) (long_val - quo*radix)]; + *--p = _dig_vec_upper[(uchar) (long_val - quo*radix)]; long_val= quo; } while ((*dst++ = *p++) != 0) ; @@ -126,14 +124,14 @@ char *longlong10_to_str(longlong val,char *dst,int radix) { ulonglong quo=(ulonglong) val/(uint) 10; uint rem= (uint) (val- quo* (uint) 10); - *--p = _dig_vec[rem]; + *--p = _dig_vec_upper[rem]; val= quo; } long_val= (long) val; while (long_val != 0) { long quo= long_val/10; - *--p = _dig_vec[(uchar) (long_val - quo*10)]; + *--p = _dig_vec_upper[(uchar) (long_val - quo*10)]; long_val= quo; } while ((*dst++ = *p++) != 0) ; diff --git a/strings/my_strtoll10.c b/strings/my_strtoll10.c index 493d0d63de2..5217564087c 100644 --- a/strings/my_strtoll10.c +++ b/strings/my_strtoll10.c @@ -15,7 +15,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <my_global.h> -#include <my_sys.h> +#include <my_sys.h> /* Needed for MY_ERRNO_ERANGE */ #include <m_string.h> #undef ULONGLONG_MAX diff --git a/strings/my_vsnprintf.c b/strings/my_vsnprintf.c index d9d80263d31..784c4762724 100644 --- a/strings/my_vsnprintf.c +++ b/strings/my_vsnprintf.c @@ -18,7 +18,6 @@ #include <m_string.h> #include <stdarg.h> #include <m_ctype.h> -#include <assert.h> /* Limited snprintf() implementations @@ -118,7 +117,7 @@ int my_vsnprintf(char *to, size_t n, const char* fmt, va_list ap) if (*fmt== 'u') store_end= int10_to_str(larg, store_start, 10); else - store_end= int2str(larg, store_start, 16); + store_end= int2str(larg, store_start, 16, 0); if ((res_length= (uint) (store_end - store_start)) > to_length) break; /* num doesn't fit in output */ /* If %#d syntax was used, we have to 
pre-zero/pre-space the string */ |