diff options
author | Alexander Barkov <bar@mnogosearch.org> | 2013-10-02 15:04:07 +0400 |
---|---|---|
committer | Alexander Barkov <bar@mnogosearch.org> | 2013-10-02 15:04:07 +0400 |
commit | 0b6c4bb34f99b8f4023fd0bef25a1b714f96b699 (patch) | |
tree | 87e5f83097f30c9fb7e30928800bcc92690f6bbd /strings | |
parent | 9538bbfce9055f99529adb461d101b7b236eb5a3 (diff) | |
download | mariadb-git-0b6c4bb34f99b8f4023fd0bef25a1b714f96b699.tar.gz |
MDEV-4928 Merge collation customization improvements
Merging the following MySQL-5.6 changes:
- WL#5624: Collation customization improvements
http://dev.mysql.com/worklog/task/?id=5624
- WL#4013: Unicode german2 collation
http://dev.mysql.com/worklog/task/?id=4013
- Bug#62429 XML: ExtractValue, UpdateXML max arg length 127 chars
http://bugs.mysql.com/bug.php?id=62429
(required by WL#5624)
Diffstat (limited to 'strings')
-rw-r--r-- | strings/CMakeLists.txt | 3 | ||||
-rw-r--r-- | strings/conf_to_src.c | 41 | ||||
-rw-r--r-- | strings/ctype-big5.c | 25 | ||||
-rw-r--r-- | strings/ctype-bin.c | 7 | ||||
-rw-r--r-- | strings/ctype-cp932.c | 34 | ||||
-rw-r--r-- | strings/ctype-czech.c | 5 | ||||
-rw-r--r-- | strings/ctype-euc_kr.c | 31 | ||||
-rw-r--r-- | strings/ctype-eucjpms.c | 45 | ||||
-rw-r--r-- | strings/ctype-extra.c | 310 | ||||
-rw-r--r-- | strings/ctype-gb2312.c | 29 | ||||
-rw-r--r-- | strings/ctype-gbk.c | 31 | ||||
-rw-r--r-- | strings/ctype-latin1.c | 15 | ||||
-rw-r--r-- | strings/ctype-mb.c | 34 | ||||
-rw-r--r-- | strings/ctype-simple.c | 25 | ||||
-rw-r--r-- | strings/ctype-sjis.c | 29 | ||||
-rw-r--r-- | strings/ctype-tis620.c | 10 | ||||
-rw-r--r-- | strings/ctype-uca.c | 3245 | ||||
-rw-r--r-- | strings/ctype-ucs2.c | 169 | ||||
-rw-r--r-- | strings/ctype-ujis.c | 47 | ||||
-rw-r--r-- | strings/ctype-utf8.c | 257 | ||||
-rw-r--r-- | strings/ctype-win1250ch.c | 5 | ||||
-rw-r--r-- | strings/ctype.c | 594 | ||||
-rw-r--r-- | strings/str_alloc.c | 6 | ||||
-rw-r--r-- | strings/xml.c | 98 |
24 files changed, 3468 insertions, 1627 deletions
diff --git a/strings/CMakeLists.txt b/strings/CMakeLists.txt index 2747374058d..1b26b3d962a 100644 --- a/strings/CMakeLists.txt +++ b/strings/CMakeLists.txt @@ -32,3 +32,6 @@ ENDIF() # Avoid dependencies on perschema data defined in mysys ADD_DEFINITIONS(-DDISABLE_MYSQL_THREAD_H) ADD_CONVENIENCE_LIBRARY(strings ${STRINGS_SOURCES}) + +ADD_EXECUTABLE(conf_to_src EXCLUDE_FROM_ALL conf_to_src.c) +TARGET_LINK_LIBRARIES(conf_to_src strings) diff --git a/strings/conf_to_src.c b/strings/conf_to_src.c index 7dfc76e5417..8d25ac8e7ed 100644 --- a/strings/conf_to_src.c +++ b/strings/conf_to_src.c @@ -145,12 +145,35 @@ static int add_collation(struct charset_info_st *cs) } +static void +default_reporter(enum loglevel level __attribute__ ((unused)), + const char *format __attribute__ ((unused)), + ...) +{ +} + + +static void +my_charset_loader_init(MY_CHARSET_LOADER *loader) +{ + loader->error[0]= '\0'; + loader->once_alloc= malloc; + loader->malloc= malloc; + loader->realloc= realloc; + loader->free= free; + loader->reporter= default_reporter; + loader->add_collation= add_collation; +} + + static int my_read_charset_file(const char *filename) { char buf[MAX_BUF]; int fd; uint len; + MY_CHARSET_LOADER loader; + my_charset_loader_init(&loader); if ((fd=open(filename,O_RDONLY)) < 0) { fprintf(stderr,"Can't open '%s'\n",filename); @@ -161,14 +184,10 @@ static int my_read_charset_file(const char *filename) DBUG_ASSERT(len < MAX_BUF); close(fd); - if (my_parse_charset_xml(buf,len,add_collation)) + if (my_parse_charset_xml(&loader, buf, len)) { -#if 0 - printf("ERROR at line %d pos %d '%s'\n", - my_xml_error_lineno(&p)+1, - my_xml_error_pos(&p), - my_xml_error_string(&p)); -#endif + fprintf(stderr, "Error while parsing '%s': %s\n", filename, loader.error); + exit(1); } return FALSE; @@ -207,8 +226,7 @@ void dispcset(FILE *f,CHARSET_INFO *cs) fprintf(f," sort_order_%s, /* sort_order */\n",cs->name); else fprintf(f," NULL, /* sort_order */\n"); - fprintf(f," NULL, /* contractions */\n"); - fprintf(f," NULL, /* sort_order_big*/\n"); + fprintf(f," NULL, /* uca */\n"); fprintf(f," to_uni_%s, /* to_uni */\n",cs->name); } else @@ -221,13 +239,12 @@ void dispcset(FILE *f,CHARSET_INFO *cs) fprintf(f," NULL, /* lower */\n"); fprintf(f," NULL, /* upper */\n"); fprintf(f," NULL, /* sort order */\n"); - fprintf(f," NULL, /* contractions */\n"); - fprintf(f," NULL, /* sort_order_big*/\n"); + fprintf(f," NULL, /* uca */\n"); fprintf(f," NULL, /* to_uni */\n"); } fprintf(f," NULL, /* from_uni */\n"); - fprintf(f," my_unicase_default, /* caseinfo */\n"); + fprintf(f," &my_unicase_default, /* caseinfo */\n"); fprintf(f," NULL, /* state map */\n"); fprintf(f," NULL, /* ident map */\n"); fprintf(f," 1, /* strxfrm_multiply*/\n"); diff --git a/strings/ctype-big5.c b/strings/ctype-big5.c index f77e705525c..cf9fc339280 100644 --- a/strings/ctype-big5.c +++ b/strings/ctype-big5.c @@ -177,7 +177,7 @@ static const uchar sort_order_big5[]= }; -static MY_UNICASE_INFO cA2[256]= +static MY_UNICASE_CHARACTER cA2[256]= { /* A200-A20F */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -370,7 +370,7 @@ static MY_UNICASE_INFO cA2[256]= }; -static MY_UNICASE_INFO cA3[256]= +static MY_UNICASE_CHARACTER cA3[256]= { /* A300-A30F */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -563,7 +563,7 @@ static MY_UNICASE_INFO cA3[256]= }; -static MY_UNICASE_INFO cC7[256]= +static MY_UNICASE_CHARACTER cC7[256]= { /* C700-C70F */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -756,7 +756,7 @@ static MY_UNICASE_INFO cC7[256]= }; -static MY_UNICASE_INFO *my_caseinfo_big5[256]= +static MY_UNICASE_CHARACTER *my_caseinfo_pages_big5[256]= { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 0 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, @@ -793,6 +793,13 @@ static MY_UNICASE_INFO *my_caseinfo_big5[256]= }; +static MY_UNICASE_INFO my_caseinfo_big5= +{ + 0xFFFF, + my_caseinfo_pages_big5 +}; + + static uint16 big5strokexfrm(uint16 i) { if ((i == 0xA440) || (i == 0xA441)) return 0xA440; @@ -6926,11 +6933,10 @@ struct charset_info_st my_charset_big5_chinese_ci= to_lower_big5, to_upper_big5, sort_order_big5, - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_caseinfo_big5, /* caseinfo */ + &my_caseinfo_big5, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 1, /* strxfrm_multiply */ @@ -6959,11 +6965,10 @@ struct charset_info_st my_charset_big5_bin= to_lower_big5, to_upper_big5, NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_caseinfo_big5, /* caseinfo */ + &my_caseinfo_big5, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 1, /* strxfrm_multiply */ diff --git a/strings/ctype-bin.c b/strings/ctype-bin.c index 2363a235550..76e8da25fc2 100644 --- a/strings/ctype-bin.c +++ b/strings/ctype-bin.c @@ -69,7 +69,7 @@ static const uchar bin_char_array[] = static my_bool my_coll_init_8bit_bin(struct charset_info_st *cs, - void *(*alloc)(size_t) __attribute__((unused))) + MY_CHARSET_LOADER *loader __attribute__((unused))) { cs->max_sort_char=255; return FALSE; @@ -571,11 +571,10 @@ struct charset_info_st my_charset_bin = bin_char_array, /* to_lower */ bin_char_array, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 1, /* strxfrm_multiply */ diff --git a/strings/ctype-cp932.c b/strings/ctype-cp932.c index b7847e4509e..946cf4253d8 100644 --- a/strings/ctype-cp932.c +++ b/strings/ctype-cp932.c @@ -197,7 +197,7 @@ static uint mbcharlen_cp932(CHARSET_INFO *cs __attribute__((unused)),uint c) #define cp932code(c,d) ((((uint) (uchar)(c)) << 8) | (uint) (uchar) (d)) -static MY_UNICASE_INFO c81[256]= +static MY_UNICASE_CHARACTER c81[256]= { /* 8100-810F */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -407,7 +407,7 @@ static MY_UNICASE_INFO c81[256]= }; -static MY_UNICASE_INFO c82[256]= +static MY_UNICASE_CHARACTER c82[256]= { /* 8200-820F */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -615,7 +615,7 @@ static MY_UNICASE_INFO c82[256]= }; -static MY_UNICASE_INFO c83[256]= +static MY_UNICASE_CHARACTER c83[256]= { /* 8300-830F */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -825,7 +825,7 @@ static MY_UNICASE_INFO c83[256]= }; -static MY_UNICASE_INFO c84[256]= +static MY_UNICASE_CHARACTER c84[256]= { /* 8400-840F */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -1035,7 +1035,7 @@ static MY_UNICASE_INFO c84[256]= }; -static MY_UNICASE_INFO c87[256]= +static MY_UNICASE_CHARACTER c87[256]= { /* 8700-870F */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -1245,7 +1245,7 @@ static MY_UNICASE_INFO c87[256]= }; -static MY_UNICASE_INFO cEE[256]= +static MY_UNICASE_CHARACTER cEE[256]= { /* EE00-EE0F */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -1456,7 +1456,7 @@ static MY_UNICASE_INFO cEE[256]= }; -static MY_UNICASE_INFO cFA[256]= +static MY_UNICASE_CHARACTER cFA[256]= { /* FA00-FA0F */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -1666,7 +1666,7 @@ static MY_UNICASE_INFO cFA[256]= }; -static MY_UNICASE_INFO *my_caseinfo_cp932[256]= +static MY_UNICASE_CHARACTER *my_caseinfo_pages_cp932[256]= { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 0 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, @@ -1703,7 +1703,13 @@ static MY_UNICASE_INFO *my_caseinfo_cp932[256]= }; -static int my_strnncoll_cp932_internal(CHARSET_INFO *cs, +MY_UNICASE_INFO my_caseinfo_cp932= +{ + 0xFFFF, + my_caseinfo_pages_cp932 +}; + +static int my_strnncoll_cp932_internal(const CHARSET_INFO *cs, const uchar **a_res, size_t a_length, const uchar **b_res, size_t b_length) { @@ -34834,11 +34840,10 @@ struct charset_info_st my_charset_cp932_japanese_ci= to_lower_cp932, to_upper_cp932, sort_order_cp932, - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_caseinfo_cp932, /* caseinfo */ + &my_caseinfo_cp932, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 1, /* strxfrm_multiply */ @@ -34866,11 +34871,10 @@ struct charset_info_st my_charset_cp932_bin= to_lower_cp932, to_upper_cp932, NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_caseinfo_cp932, /* caseinfo */ + &my_caseinfo_cp932, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 1, /* strxfrm_multiply */ diff --git a/strings/ctype-czech.c b/strings/ctype-czech.c index 36fc6b36f0c..1418edfecb3 100644 --- a/strings/ctype-czech.c +++ b/strings/ctype-czech.c @@ -613,11 +613,10 @@ struct charset_info_st my_charset_latin2_czech_ci = to_lower_czech, to_upper_czech, sort_order_czech, - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ tab_8859_2_uni, /* tab_to_uni */ idx_uni_8859_2, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 4, /* strxfrm_multiply */ diff --git a/strings/ctype-euc_kr.c b/strings/ctype-euc_kr.c index 3caa1977c0b..66b8b090241 100644 --- a/strings/ctype-euc_kr.c +++ b/strings/ctype-euc_kr.c @@ -216,7 +216,7 @@ static uint mbcharlen_euc_kr(CHARSET_INFO *cs __attribute__((unused)),uint c) } -static MY_UNICASE_INFO cA3[256]= +static MY_UNICASE_CHARACTER cA3[256]= { {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, /* xx00 */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -421,7 +421,7 @@ static MY_UNICASE_INFO cA3[256]= }; -static MY_UNICASE_INFO cA5[256]= +static MY_UNICASE_CHARACTER cA5[256]= { {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, /* xx00 */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -626,7 +626,7 @@ static MY_UNICASE_INFO cA5[256]= }; -static MY_UNICASE_INFO cA7[256]= +static MY_UNICASE_CHARACTER cA7[256]= { {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, /* xx00 */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -831,7 +831,7 @@ static MY_UNICASE_INFO cA7[256]= }; -static MY_UNICASE_INFO cA8[256]= +static MY_UNICASE_CHARACTER cA8[256]= { {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, /* xx00 */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -1036,7 +1036,7 @@ static MY_UNICASE_INFO cA8[256]= }; -static MY_UNICASE_INFO cA9[256]= +static MY_UNICASE_CHARACTER cA9[256]= { {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, /* xx00 */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -1241,7 +1241,7 @@ static MY_UNICASE_INFO cA9[256]= }; -static MY_UNICASE_INFO cAC[256]= +static MY_UNICASE_CHARACTER cAC[256]= { {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, /* xx00 */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -1446,7 +1446,7 @@ static MY_UNICASE_INFO cAC[256]= }; -static MY_UNICASE_INFO *my_caseinfo_euckr[256]= +static MY_UNICASE_CHARACTER *my_caseinfo_pages_euckr[256]= { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 0 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, @@ -1483,6 +1483,13 @@ static MY_UNICASE_INFO *my_caseinfo_euckr[256]= }; +static MY_UNICASE_INFO my_caseinfo_euckr= +{ + 0xFFFF, + my_caseinfo_pages_euckr +}; + + /* page 0 0x8141-0xC8FE */ static const uint16 tab_ksc5601_uni0[]={ 0xAC02,0xAC03,0xAC05,0xAC06,0xAC0B,0xAC0C,0xAC0D,0xAC0E, @@ -10016,11 +10023,10 @@ struct charset_info_st my_charset_euckr_korean_ci= to_lower_euc_kr, to_upper_euc_kr, sort_order_euc_kr, - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_caseinfo_euckr, /* caseinfo */ + &my_caseinfo_euckr, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 1, /* strxfrm_multiply */ @@ -10049,11 +10055,10 @@ struct charset_info_st my_charset_euckr_bin= to_lower_euc_kr, to_upper_euc_kr, NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_caseinfo_euckr, /* caseinfo */ + &my_caseinfo_euckr, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 1, /* strxfrm_multiply */ diff --git a/strings/ctype-eucjpms.c b/strings/ctype-eucjpms.c index fe6bb744706..59a9a43c0f5 100644 --- a/strings/ctype-eucjpms.c +++ b/strings/ctype-eucjpms.c @@ -203,7 +203,7 @@ static uint mbcharlen_eucjpms(CHARSET_INFO *cs __attribute__((unused)),uint c) /* Case info pages for JIS-X-0208 range */ -static MY_UNICASE_INFO cA2[256]= +static MY_UNICASE_CHARACTER cA2[256]= { {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, /* xx00 */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -324,7 +324,7 @@ static MY_UNICASE_INFO cA2[256]= }; -static MY_UNICASE_INFO cA3[256]= +static MY_UNICASE_CHARACTER cA3[256]= { {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, /* xx00 */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -445,7 +445,7 @@ static MY_UNICASE_INFO cA3[256]= }; -static MY_UNICASE_INFO cA6[256]= +static MY_UNICASE_CHARACTER cA6[256]= { {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, /* xx00 */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -566,7 +566,7 @@ static MY_UNICASE_INFO cA6[256]= }; -static MY_UNICASE_INFO cA7[256]= +static MY_UNICASE_CHARACTER cA7[256]= { {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, /* xx00 */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -687,7 +687,7 @@ static MY_UNICASE_INFO cA7[256]= }; -static MY_UNICASE_INFO cAD[256]= +static MY_UNICASE_CHARACTER cAD[256]= { {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, /* xx00 */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -810,7 +810,7 @@ static MY_UNICASE_INFO cAD[256]= /* Case info pages for JIS-X-0212 range */ -static MY_UNICASE_INFO c8FA6[256]= +static MY_UNICASE_CHARACTER c8FA6[256]= { {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, /* xx00 */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -931,7 +931,7 @@ static MY_UNICASE_INFO c8FA6[256]= }; -static MY_UNICASE_INFO c8FA7[256]= +static MY_UNICASE_CHARACTER c8FA7[256]= { {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, /* xx00 */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -1052,7 +1052,7 @@ static MY_UNICASE_INFO c8FA7[256]= }; -static MY_UNICASE_INFO c8FA9[256]= +static MY_UNICASE_CHARACTER c8FA9[256]= { {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, /* xx00 */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -1173,7 +1173,7 @@ static MY_UNICASE_INFO c8FA9[256]= }; -static MY_UNICASE_INFO c8FAA[256]= +static MY_UNICASE_CHARACTER c8FAA[256]= { {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, /* xx00 */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -1294,7 +1294,7 @@ static MY_UNICASE_INFO c8FAA[256]= }; -static MY_UNICASE_INFO c8FAB[256]= +static MY_UNICASE_CHARACTER c8FAB[256]= { {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, /* xx00 */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -1415,7 +1415,7 @@ static MY_UNICASE_INFO c8FAB[256]= }; -static MY_UNICASE_INFO c8FF3[256]= +static MY_UNICASE_CHARACTER c8FF3[256]= { {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, /* xx00 */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -1536,7 +1536,7 @@ static MY_UNICASE_INFO c8FF3[256]= }; -static MY_UNICASE_INFO c8FF4[256]= +static MY_UNICASE_CHARACTER c8FF4[256]= { {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, /* xx00 */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -1657,7 +1657,7 @@ static MY_UNICASE_INFO c8FF4[256]= }; -static MY_UNICASE_INFO *my_caseinfo_eucjpms[512]= +static MY_UNICASE_CHARACTER *my_caseinfo_pages_eucjpms[512]= { /* JIS-X-0208 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 0 */ @@ -1729,7 +1729,14 @@ static MY_UNICASE_INFO *my_caseinfo_eucjpms[512]= }; -static const uint16 jisx0208_eucjpms_to_unicode[65536]= +static MY_UNICASE_INFO my_caseinfo_eucjpms= +{ + 0x0FFFF, + my_caseinfo_pages_eucjpms +}; + + +static uint16 jisx0208_eucjpms_to_unicode[65536]= { 0x0000, 0x0001, 0x0002, 0x0003, /* 0000 */ 0x0004, 0x0005, 0x0006, 0x0007, @@ -67559,11 +67566,10 @@ struct charset_info_st my_charset_eucjpms_japanese_ci= to_lower_eucjpms, to_upper_eucjpms, sort_order_eucjpms, - NULL, /* sort_order_big*/ - NULL, /* contractions */ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_caseinfo_eucjpms,/* caseinfo */ + &my_caseinfo_eucjpms,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 1, /* strxfrm_multiply */ @@ -67592,11 +67598,10 @@ struct charset_info_st my_charset_eucjpms_bin= to_lower_eucjpms, to_upper_eucjpms, NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_caseinfo_eucjpms,/* caseinfo */ + &my_caseinfo_eucjpms,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 1, /* strxfrm_multiply */ diff --git a/strings/ctype-extra.c b/strings/ctype-extra.c index e0499c6f2e3..addeeba8ba0 100644 --- a/strings/ctype-extra.c +++ b/strings/ctype-extra.c @@ -6616,11 +6616,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_dec8_swedish_ci, /* lower */ to_upper_dec8_swedish_ci, /* upper */ sort_order_dec8_swedish_ci, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_dec8_swedish_ci, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -6649,11 +6648,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_cp850_general_ci, /* lower */ to_upper_cp850_general_ci, /* upper */ sort_order_cp850_general_ci, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_cp850_general_ci, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -6682,11 +6680,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_latin1_german1_ci, /* lower */ to_upper_latin1_german1_ci, /* upper */ sort_order_latin1_german1_ci, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_latin1_german1_ci, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -6715,11 +6712,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_hp8_english_ci, /* lower */ to_upper_hp8_english_ci, /* upper */ sort_order_hp8_english_ci, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_hp8_english_ci, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -6748,11 +6744,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_koi8r_general_ci, /* lower */ to_upper_koi8r_general_ci, /* upper */ sort_order_koi8r_general_ci, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_koi8r_general_ci, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -6781,11 +6776,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_latin2_general_ci, /* lower */ to_upper_latin2_general_ci, /* upper */ sort_order_latin2_general_ci, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_latin2_general_ci, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -6814,11 +6808,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_swe7_swedish_ci, /* lower */ to_upper_swe7_swedish_ci, /* upper */ sort_order_swe7_swedish_ci, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_swe7_swedish_ci, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -6847,11 +6840,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_ascii_general_ci, /* lower */ to_upper_ascii_general_ci, /* upper */ sort_order_ascii_general_ci, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_ascii_general_ci, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -6880,11 +6872,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_cp1251_bulgarian_ci, /* lower */ to_upper_cp1251_bulgarian_ci, /* upper */ sort_order_cp1251_bulgarian_ci, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_cp1251_bulgarian_ci, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -6913,11 +6904,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_latin1_danish_ci, /* lower */ to_upper_latin1_danish_ci, /* upper */ sort_order_latin1_danish_ci, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_latin1_danish_ci, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -6946,11 +6936,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_hebrew_general_ci, /* lower */ to_upper_hebrew_general_ci, /* upper */ sort_order_hebrew_general_ci, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_hebrew_general_ci, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -6979,11 +6968,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_latin7_estonian_cs, /* lower */ to_upper_latin7_estonian_cs, /* upper */ sort_order_latin7_estonian_cs, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_latin7_estonian_cs, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -7012,11 +7000,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_latin2_hungarian_ci, /* lower */ to_upper_latin2_hungarian_ci, /* upper */ sort_order_latin2_hungarian_ci, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_latin2_hungarian_ci, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -7045,11 +7032,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_koi8u_general_ci, /* lower */ to_upper_koi8u_general_ci, /* upper */ sort_order_koi8u_general_ci, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_koi8u_general_ci, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -7078,11 +7064,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_cp1251_ukrainian_ci, /* lower */ to_upper_cp1251_ukrainian_ci, /* upper */ sort_order_cp1251_ukrainian_ci, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_cp1251_ukrainian_ci, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -7111,11 +7096,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_greek_general_ci, /* lower */ to_upper_greek_general_ci, /* upper */ sort_order_greek_general_ci, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_greek_general_ci, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -7144,11 +7128,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_cp1250_general_ci, /* lower */ to_upper_cp1250_general_ci, /* upper */ sort_order_cp1250_general_ci, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_cp1250_general_ci, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -7177,11 +7160,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_latin2_croatian_ci, /* lower */ to_upper_latin2_croatian_ci, /* upper */ sort_order_latin2_croatian_ci, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_latin2_croatian_ci, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -7210,11 +7192,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_cp1257_lithuanian_ci, /* lower */ to_upper_cp1257_lithuanian_ci, /* upper */ sort_order_cp1257_lithuanian_ci, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_cp1257_lithuanian_ci, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -7243,11 +7224,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_latin5_turkish_ci, /* lower */ to_upper_latin5_turkish_ci, /* upper */ sort_order_latin5_turkish_ci, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_latin5_turkish_ci, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -7276,11 +7256,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_armscii8_general_ci, /* lower */ to_upper_armscii8_general_ci, /* upper */ sort_order_armscii8_general_ci, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_armscii8_general_ci, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -7309,11 +7288,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_cp866_general_ci, /* lower */ to_upper_cp866_general_ci, /* upper */ sort_order_cp866_general_ci, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_cp866_general_ci, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -7342,11 +7320,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_keybcs2_general_ci, /* lower */ to_upper_keybcs2_general_ci, /* upper */ sort_order_keybcs2_general_ci, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_keybcs2_general_ci, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -7375,11 +7352,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_macce_general_ci, /* lower */ to_upper_macce_general_ci, /* upper */ sort_order_macce_general_ci, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_macce_general_ci, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -7408,11 +7384,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_macroman_general_ci, /* lower */ to_upper_macroman_general_ci, /* upper */ sort_order_macroman_general_ci, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_macroman_general_ci, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -7441,11 +7416,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_cp852_general_ci, /* lower */ to_upper_cp852_general_ci, /* upper */ sort_order_cp852_general_ci, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_cp852_general_ci, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -7474,11 +7448,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_latin7_general_ci, /* lower */ to_upper_latin7_general_ci, /* upper */ sort_order_latin7_general_ci, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_latin7_general_ci, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -7507,11 +7480,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_latin7_general_cs, /* lower */ to_upper_latin7_general_cs, /* upper */ sort_order_latin7_general_cs, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_latin7_general_cs, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -7540,11 +7512,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_macce_bin, /* lower */ to_upper_macce_bin, /* upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_macce_bin, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -7573,11 +7544,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_cp1250_croatian_ci, /* lower */ to_upper_cp1250_croatian_ci, /* upper */ sort_order_cp1250_croatian_ci, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_cp1250_croatian_ci, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -7606,11 +7576,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_latin1_general_ci, /* lower */ to_upper_latin1_general_ci, /* upper */ sort_order_latin1_general_ci, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_latin1_general_ci, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -7639,11 +7608,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_latin1_general_cs, /* lower */ to_upper_latin1_general_cs, /* upper */ sort_order_latin1_general_cs, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_latin1_general_cs, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -7672,11 +7640,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_cp1251_bin, /* lower */ to_upper_cp1251_bin, /* upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_cp1251_bin, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -7705,11 +7672,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_cp1251_general_ci, /* lower */ to_upper_cp1251_general_ci, /* upper */ sort_order_cp1251_general_ci, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_cp1251_general_ci, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -7738,11 +7704,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_cp1251_general_cs, /* lower */ to_upper_cp1251_general_cs, /* upper */ sort_order_cp1251_general_cs, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_cp1251_general_cs, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -7771,11 +7736,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_macroman_bin, /* lower */ to_upper_macroman_bin, /* upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_macroman_bin, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -7804,11 +7768,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_cp1256_general_ci, /* lower */ to_upper_cp1256_general_ci, /* upper */ sort_order_cp1256_general_ci, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_cp1256_general_ci, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -7837,11 +7800,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_cp1257_bin, /* lower */ to_upper_cp1257_bin, /* upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_cp1257_bin, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -7870,11 +7832,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_cp1257_general_ci, /* lower */ to_upper_cp1257_general_ci, /* upper */ sort_order_cp1257_general_ci, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_cp1257_general_ci, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -7903,11 +7864,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_armscii8_bin, /* lower */ to_upper_armscii8_bin, /* upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_armscii8_bin, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -7936,11 +7896,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_ascii_bin, /* lower */ to_upper_ascii_bin, /* upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_ascii_bin, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -7969,11 +7928,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_cp1250_bin, /* lower */ to_upper_cp1250_bin, /* upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_cp1250_bin, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -8002,11 +7960,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_cp1256_bin, /* lower */ to_upper_cp1256_bin, /* upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_cp1256_bin, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -8035,11 +7992,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_cp866_bin, /* lower */ to_upper_cp866_bin, /* upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_cp866_bin, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -8068,11 +8024,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_dec8_bin, /* lower */ to_upper_dec8_bin, /* upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_dec8_bin, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -8101,11 +8056,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_greek_bin, /* lower */ to_upper_greek_bin, /* upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_greek_bin, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -8134,11 +8088,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_hebrew_bin, /* lower */ to_upper_hebrew_bin, /* upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_hebrew_bin, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -8167,11 +8120,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_hp8_bin, /* lower */ to_upper_hp8_bin, /* upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_hp8_bin, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -8200,11 +8152,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_keybcs2_bin, /* lower */ to_upper_keybcs2_bin, /* upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_keybcs2_bin, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -8233,11 +8184,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_koi8r_bin, /* lower */ to_upper_koi8r_bin, /* upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_koi8r_bin, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -8266,11 +8216,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_koi8u_bin, /* lower */ to_upper_koi8u_bin, /* upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_koi8u_bin, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -8299,11 +8248,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_latin2_bin, /* lower */ to_upper_latin2_bin, /* upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_latin2_bin, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -8332,11 +8280,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_latin5_bin, /* lower */ to_upper_latin5_bin, /* upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_latin5_bin, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -8365,11 +8312,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_latin7_bin, /* lower */ to_upper_latin7_bin, /* upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_latin7_bin, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -8398,11 +8344,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_cp850_bin, /* lower */ to_upper_cp850_bin, /* upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_cp850_bin, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -8431,11 +8376,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_cp852_bin, /* lower */ to_upper_cp852_bin, /* upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_cp852_bin, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -8464,11 +8408,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_swe7_bin, /* lower */ to_upper_swe7_bin, /* upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_swe7_bin, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -8497,11 +8440,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_geostd8_general_ci, /* lower */ to_upper_geostd8_general_ci, /* upper */ sort_order_geostd8_general_ci, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_geostd8_general_ci, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -8530,11 +8472,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_geostd8_bin, /* lower */ to_upper_geostd8_bin, /* upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_geostd8_bin, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -8563,11 +8504,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_latin1_spanish_ci, /* lower */ to_upper_latin1_spanish_ci, /* upper */ sort_order_latin1_spanish_ci, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_latin1_spanish_ci, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -8596,11 +8536,10 @@ struct charset_info_st compiled_charsets[] = { to_lower_cp1250_polish_ci, /* lower */ to_upper_cp1250_polish_ci, /* upper */ sort_order_cp1250_polish_ci, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ to_uni_cp1250_polish_ci, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ @@ -8628,11 +8567,10 @@ struct charset_info_st compiled_charsets[] = { NULL, /* lower */ NULL, /* upper */ NULL, /* sort order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* to_uni */ NULL, /* from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state map */ NULL, /* ident map */ 1, /* strxfrm_multiply*/ diff --git a/strings/ctype-gb2312.c b/strings/ctype-gb2312.c index b27ea67059d..74be52a5c6d 100644 --- a/strings/ctype-gb2312.c +++ b/strings/ctype-gb2312.c @@ -177,7 +177,7 @@ static uint mbcharlen_gb2312(CHARSET_INFO *cs __attribute__((unused)),uint c) } -static MY_UNICASE_INFO cA2[256]= +static MY_UNICASE_CHARACTER cA2[256]= { {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, /* xx00 */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -298,7 +298,7 @@ static MY_UNICASE_INFO cA2[256]= }; -static MY_UNICASE_INFO cA3[256]= +static MY_UNICASE_CHARACTER cA3[256]= { {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, /* xx00 */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -419,7 +419,7 @@ static MY_UNICASE_INFO cA3[256]= }; -static MY_UNICASE_INFO cA6[256]= +static MY_UNICASE_CHARACTER cA6[256]= { {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, /* xx00 */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -540,7 +540,7 @@ static MY_UNICASE_INFO cA6[256]= }; -static MY_UNICASE_INFO cA7[256]= +static MY_UNICASE_CHARACTER cA7[256]= { {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, /* xx00 */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -661,7 +661,7 @@ static MY_UNICASE_INFO cA7[256]= }; -static MY_UNICASE_INFO cA8[256]= +static MY_UNICASE_CHARACTER cA8[256]= { {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, /* xx00 */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -782,7 +782,7 @@ static MY_UNICASE_INFO cA8[256]= }; -static MY_UNICASE_INFO *my_caseinfo_gb2312[256]= +static MY_UNICASE_CHARACTER *my_caseinfo_pages_gb2312[256]= { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 0 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, @@ -819,6 +819,13 @@ static MY_UNICASE_INFO *my_caseinfo_gb2312[256]= }; +static MY_UNICASE_INFO my_caseinfo_gb2312= +{ + 0xFFFF, + my_caseinfo_pages_gb2312 +}; + + /* page 0 0x2121-0x2658 */ static const uint16 tab_gb2312_uni0[]={ 0x3000,0x3001,0x3002,0x30FB,0x02C9,0x02C7,0x00A8,0x3003, @@ -6419,11 +6426,10 @@ struct charset_info_st my_charset_gb2312_chinese_ci= to_lower_gb2312, to_upper_gb2312, sort_order_gb2312, - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_caseinfo_gb2312, /* caseinfo */ + &my_caseinfo_gb2312,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 1, /* strxfrm_multiply */ @@ -6451,11 +6457,10 @@ struct charset_info_st my_charset_gb2312_bin= to_lower_gb2312, to_upper_gb2312, NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_caseinfo_gb2312, /* caseinfo */ + &my_caseinfo_gb2312,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 1, /* strxfrm_multiply */ diff --git a/strings/ctype-gbk.c b/strings/ctype-gbk.c index e21c406d2a9..dd617fd8548 100644 --- a/strings/ctype-gbk.c +++ b/strings/ctype-gbk.c @@ -136,7 +136,8 @@ static const uchar to_upper_gbk[]= (uchar) '\370',(uchar) '\371',(uchar) '\372',(uchar) '\373',(uchar) '\374',(uchar) '\375',(uchar) '\376',(uchar) '\377', }; -static MY_UNICASE_INFO cA2[256]= + +static MY_UNICASE_CHARACTER cA2[256]= { {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, /* xx00 */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -340,7 +341,7 @@ static MY_UNICASE_INFO cA2[256]= {0xA2FF,0xA2FF,0xA2FF} }; -static MY_UNICASE_INFO cA3[256]= +static MY_UNICASE_CHARACTER cA3[256]= { {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, /* xx00 */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -545,7 +546,7 @@ static MY_UNICASE_INFO cA3[256]= }; -static MY_UNICASE_INFO cA6[256]= +static MY_UNICASE_CHARACTER cA6[256]= { {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, /* xx00 */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -750,7 +751,7 @@ static MY_UNICASE_INFO cA6[256]= }; -static MY_UNICASE_INFO cA7[256]= +static MY_UNICASE_CHARACTER cA7[256]= { {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, /* xx00 */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -955,7 +956,7 @@ static MY_UNICASE_INFO cA7[256]= }; -static MY_UNICASE_INFO *my_caseinfo_gbk[256]= +static MY_UNICASE_CHARACTER *my_caseinfo_pages_gbk[256]= { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 0 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, @@ -991,7 +992,15 @@ static MY_UNICASE_INFO *my_caseinfo_gbk[256]= NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL }; -static const uchar sort_order_gbk[]= + +static MY_UNICASE_INFO my_caseinfo_gbk= +{ + 0xFFFF, + my_caseinfo_pages_gbk +}; + + +static uchar sort_order_gbk[]= { '\000','\001','\002','\003','\004','\005','\006','\007', '\010','\011','\012','\013','\014','\015','\016','\017', @@ -10809,11 +10818,10 @@ struct charset_info_st my_charset_gbk_chinese_ci= to_lower_gbk, to_upper_gbk, sort_order_gbk, - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_caseinfo_gbk, /* caseinfo */ + &my_caseinfo_gbk, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 1, /* strxfrm_multiply */ @@ -10841,11 +10849,10 @@ struct charset_info_st my_charset_gbk_bin= to_lower_gbk, to_upper_gbk, NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_caseinfo_gbk, /* caseinfo */ + &my_caseinfo_gbk, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 1, /* strxfrm_multiply */ diff --git a/strings/ctype-latin1.c b/strings/ctype-latin1.c index f8455344498..759997dae2d 100644 --- a/strings/ctype-latin1.c +++ b/strings/ctype-latin1.c @@ -437,11 +437,10 @@ struct charset_info_st my_charset_latin1= to_lower_latin1, to_upper_latin1, sort_order_latin1, - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ cs_to_uni, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 1, /* strxfrm_multiply */ @@ -736,11 +735,10 @@ struct charset_info_st my_charset_latin1_german2_ci= to_lower_latin1, to_upper_latin1, sort_order_latin1_de, - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ cs_to_uni, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 2, /* strxfrm_multiply */ @@ -769,11 +767,10 @@ struct charset_info_st my_charset_latin1_bin= to_lower_latin1, to_upper_latin1, NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ cs_to_uni, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 1, /* strxfrm_multiply */ diff --git a/strings/ctype-mb.c b/strings/ctype-mb.c index 8c7de1d16c7..c5c8fd92842 100644 --- a/strings/ctype-mb.c +++ b/strings/ctype-mb.c @@ -62,11 +62,11 @@ size_t my_casedn_str_mb(CHARSET_INFO * cs, char *str) } -static inline MY_UNICASE_INFO* -get_case_info_for_ch(CHARSET_INFO *cs, uint page, uint offs) +static inline MY_UNICASE_CHARACTER* +get_case_info_for_ch(const CHARSET_INFO *cs, uint page, uint offs) { - MY_UNICASE_INFO *p; - return cs->caseinfo ? ((p= cs->caseinfo[page]) ? &p[offs] : NULL) : NULL; + MY_UNICASE_CHARACTER *p; + return cs->caseinfo && (p= cs->caseinfo->page[page]) ? &p[offs] : NULL; } @@ -89,7 +89,7 @@ size_t my_caseup_mb(CHARSET_INFO * cs, char *src, size_t srclen, { if ((l=my_ismbchar(cs, src, srcend))) { - MY_UNICASE_INFO *ch; + MY_UNICASE_CHARACTER *ch; if ((ch= get_case_info_for_ch(cs, (uchar) src[0], (uchar) src[1]))) { *src++= ch->toupper >> 8; @@ -124,7 +124,7 @@ size_t my_casedn_mb(CHARSET_INFO * cs, char *src, size_t srclen, { if ((l= my_ismbchar(cs, src, srcend))) { - MY_UNICASE_INFO *ch; + MY_UNICASE_CHARACTER *ch; if ((ch= get_case_info_for_ch(cs, (uchar) src[0], (uchar) src[1]))) { *src++= ch->tolower >> 8; @@ -168,7 +168,7 @@ my_casefold_mb_varlen(CHARSET_INFO *cs, size_t mblen= my_ismbchar(cs, src, srcend); if (mblen) { - MY_UNICASE_INFO *ch; + MY_UNICASE_CHARACTER *ch; if ((ch= get_case_info_for_ch(cs, (uchar) src[0], (uchar) src[1]))) { int code= is_upper ? ch->toupper : ch->tolower; @@ -696,7 +696,7 @@ my_bool my_like_range_mb(CHARSET_INFO *cs, char *min_end= min_str + res_length; char *max_end= max_str + res_length; size_t maxcharlen= res_length / cs->mbmaxlen; - my_bool have_contractions= my_cs_have_contractions(cs); + const MY_CONTRACTIONS *contractions= my_charset_get_contractions(cs, 0); for (; ptr != end && min_str != min_end && maxcharlen ; maxcharlen--) { @@ -764,8 +764,8 @@ fill_max_and_min: 'ab\min\min\min\min' and 'ab\max\max\max\max'. */ - if (have_contractions && ptr + 1 < end && - my_cs_can_be_contraction_head(cs, (uchar) *ptr)) + if (contractions && ptr + 1 < end && + my_uca_can_be_contraction_head(contractions, (uchar) *ptr)) { /* Ptr[0] is a contraction head. */ @@ -787,8 +787,8 @@ fill_max_and_min: is not a contraction, then we put only ptr[0], and continue with ptr[1] on the next loop. */ - if (my_cs_can_be_contraction_tail(cs, (uchar) ptr[1]) && - my_cs_contraction2_weight(cs, (uchar) ptr[0], (uchar) ptr[1])) + if (my_uca_can_be_contraction_tail(contractions, (uchar) ptr[1]) && + my_uca_contraction2_weight(contractions, (uchar) ptr[0], ptr[1])) { /* Contraction found */ if (maxcharlen == 1 || min_str + 1 >= min_end) @@ -853,7 +853,7 @@ my_like_range_generic(CHARSET_INFO *cs, char *max_end= max_str + res_length; size_t charlen= res_length / cs->mbmaxlen; size_t res_length_diff; - my_bool have_contractions= my_cs_have_contractions(cs); + const MY_CONTRACTIONS *contractions= my_charset_get_contractions(cs, 0); for ( ; charlen > 0; charlen--) { @@ -921,8 +921,8 @@ my_like_range_generic(CHARSET_INFO *cs, goto pad_min_max; } - if (have_contractions && - my_cs_can_be_contraction_head(cs, wc) && + if (contractions && + my_uca_can_be_contraction_head(contractions, wc) && (res= cs->cset->mb_wc(cs, &wc2, (uchar*) ptr, (uchar*) end)) > 0) { const uint16 *weight; @@ -933,8 +933,8 @@ my_like_range_generic(CHARSET_INFO *cs, goto pad_min_max; } - if (my_cs_can_be_contraction_tail(cs, wc2) && - (weight= my_cs_contraction2_weight(cs, wc, wc2)) && weight[0]) + if (my_uca_can_be_contraction_tail(contractions, wc2) && + (weight= my_uca_contraction2_weight(contractions, wc, wc2)) && weight[0]) { /* Contraction found */ if (charlen == 1) diff --git a/strings/ctype-simple.c b/strings/ctype-simple.c index f084ff9949a..91a9df9d50b 100644 --- a/strings/ctype-simple.c +++ b/strings/ctype-simple.c @@ -1163,12 +1163,12 @@ static int pcmp(const void * f, const void * s) return res; } -static my_bool create_fromuni(struct charset_info_st *cs, - void *(*alloc)(size_t)) +static my_bool +create_fromuni(struct charset_info_st *cs, + MY_CHARSET_LOADER *loader) { uni_idx idx[PLANE_NUM]; int i,n; - struct my_uni_idx_st *tab_from_uni; /* Check that Unicode map is loaded. @@ -1217,7 +1217,8 @@ static my_bool create_fromuni(struct charset_info_st *cs, numchars=idx[i].uidx.to-idx[i].uidx.from+1; if (!(idx[i].uidx.tab= tab= (uchar*) - alloc(numchars * sizeof(*idx[i].uidx.tab)))) + (loader->once_alloc) (numchars * + sizeof(*idx[i].uidx.tab)))) return TRUE; bzero(tab,numchars*sizeof(*tab)); @@ -1235,25 +1236,25 @@ static my_bool create_fromuni(struct charset_info_st *cs, /* Allocate and fill reverse table for each plane */ n=i; - if (!(cs->tab_from_uni= tab_from_uni= (struct my_uni_idx_st*) - alloc(sizeof(MY_UNI_IDX)*(n+1)))) + if (!(cs->tab_from_uni= (MY_UNI_IDX *) + (loader->once_alloc)(sizeof(MY_UNI_IDX) * (n + 1)))) return TRUE; for (i=0; i< n; i++) - tab_from_uni[i]= idx[i].uidx; + ((struct my_uni_idx_st*)cs->tab_from_uni)[i]= idx[i].uidx; /* Set end-of-list marker */ - bzero(&tab_from_uni[i],sizeof(MY_UNI_IDX)); + bzero((char*) &cs->tab_from_uni[i],sizeof(MY_UNI_IDX)); return FALSE; } -static my_bool my_cset_init_8bit(struct charset_info_st *cs, - void *(*alloc)(size_t)) +static my_bool +my_cset_init_8bit(struct charset_info_st *cs, MY_CHARSET_LOADER *loader) { cs->caseup_multiply= 1; cs->casedn_multiply= 1; cs->pad_char= ' '; - return create_fromuni(cs, alloc); + return create_fromuni(cs, loader); } static void set_max_sort_char(struct charset_info_st *cs) @@ -1276,7 +1277,7 @@ static void set_max_sort_char(struct charset_info_st *cs) } static my_bool my_coll_init_simple(struct charset_info_st *cs, - void *(*alloc)(size_t) __attribute__((unused))) + MY_CHARSET_LOADER *loader __attribute__((unused))) { set_max_sort_char(cs); return FALSE; diff --git a/strings/ctype-sjis.c b/strings/ctype-sjis.c index 50756799a56..2c3d2b34dab 100644 --- a/strings/ctype-sjis.c +++ b/strings/ctype-sjis.c @@ -197,7 +197,7 @@ static uint mbcharlen_sjis(CHARSET_INFO *cs __attribute__((unused)),uint c) #define sjiscode(c,d) ((((uint) (uchar)(c)) << 8) | (uint) (uchar) (d)) -static MY_UNICASE_INFO c81[256]= +static MY_UNICASE_CHARACTER c81[256]= { /* 8100-810F */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -407,7 +407,7 @@ static MY_UNICASE_INFO c81[256]= }; -static MY_UNICASE_INFO c82[256]= +static MY_UNICASE_CHARACTER c82[256]= { /* 8200-820F */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -615,7 +615,7 @@ static MY_UNICASE_INFO c82[256]= }; -static MY_UNICASE_INFO c83[256]= +static MY_UNICASE_CHARACTER c83[256]= { /* 8300-830F */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -825,7 +825,7 @@ static MY_UNICASE_INFO c83[256]= }; -static MY_UNICASE_INFO c84[256]= +static MY_UNICASE_CHARACTER c84[256]= { /* 8400-840F */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -1035,7 +1035,7 @@ static MY_UNICASE_INFO c84[256]= }; -static MY_UNICASE_INFO *my_caseinfo_sjis[256]= +static MY_UNICASE_CHARACTER *my_caseinfo_pages_sjis[256]= { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 0 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, @@ -1072,7 +1072,14 @@ static MY_UNICASE_INFO *my_caseinfo_sjis[256]= }; -static int my_strnncoll_sjis_internal(CHARSET_INFO *cs, +static MY_UNICASE_INFO my_caseinfo_sjis= +{ + 0xFFFF, + my_caseinfo_pages_sjis +}; + + +static int my_strnncoll_sjis_internal(const CHARSET_INFO *cs, const uchar **a_res, size_t a_length, const uchar **b_res, size_t b_length) { @@ -34204,11 +34211,10 @@ struct charset_info_st my_charset_sjis_japanese_ci= to_lower_sjis, to_upper_sjis, sort_order_sjis, - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_caseinfo_sjis, /* caseinfo */ + &my_caseinfo_sjis, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 1, /* strxfrm_multiply */ @@ -34236,11 +34242,10 @@ struct charset_info_st my_charset_sjis_bin= to_lower_sjis, to_upper_sjis, NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_caseinfo_sjis, /* caseinfo */ + &my_caseinfo_sjis, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 1, /* strxfrm_multiply */ diff --git a/strings/ctype-tis620.c b/strings/ctype-tis620.c index d84d43a67bd..d0b4f9b8862 100644 --- a/strings/ctype-tis620.c +++ b/strings/ctype-tis620.c @@ -894,11 +894,10 @@ struct charset_info_st my_charset_tis620_thai_ci= to_lower_tis620, to_upper_tis620, sort_order_tis620, - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 4, /* strxfrm_multiply */ @@ -926,11 +925,10 @@ struct charset_info_st my_charset_tis620_bin= to_lower_tis620, to_upper_tis620, NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 1, /* strxfrm_multiply */ diff --git a/strings/ctype-uca.c b/strings/ctype-uca.c index 109a233d62e..5d52cb7e517 100644 --- a/strings/ctype-uca.c +++ b/strings/ctype-uca.c @@ -46,7 +46,6 @@ #define MY_UCA_NCHARS 256 #define MY_UCA_CMASK 255 #define MY_UCA_PSHIFT 8 -#define MAX_UCA_CHAR_WITH_EXPLICIT_WEIGHT 0xFFFF static const uint16 page000data[]= { /* 0000 (4 weights per char) */ 0x0000,0x0000,0x0000,0x0000, 0x0000,0x0000,0x0000,0x0000, @@ -6526,6 +6525,59 @@ NULL ,page0F9data,page0FAdata,page0FBdata, page0FCdata,page0FDdata,page0FEdata,page0FFdata }; + +MY_UCA_INFO my_uca_v400= +{ + { + { + 0xFFFF, /* maxchar */ + (uchar *) uca_length, + (uint16 **) uca_weight, + { /* Contractions: */ + 0, /* nitems */ + NULL, /* item */ + NULL /* flags */ + } + }, + }, + + /* Logical positions */ + 0x0009, /* first_non_ignorable p != ignore */ + 0xA48C, /* last_non_ignorable Not a CJK and not UNASSIGNED */ + + 0x0332, /* first_primary_ignorable p == 0 */ + 0x20EA, /* last_primary_ignorable */ + + 0x0000, /* first_secondary_ignorable p,s == 0 */ + 0xFE73, /* last_secondary_ignorable p,s == 0 */ + + 0x0000, /* first_tertiary_ignorable p,s,t == 0 */ + 0xFE73, /* last_tertiary_ignorable p,s,t == 0 */ + + 0x0000, /* first_trailing */ + 0x0000, /* last_trailing */ + + 0x0009, /* first_variable */ + 0x2183, /* last_variable */ +}; + +/******************************************************/ + +#define MY_UCA_CMASK 255 +#define MY_UCA_PSHIFT 8 + + +/******************************************************/ + +/* + German Phonebook +*/ +static const char german2[]= + "&AE << \\u00E6 <<< \\u00C6 << \\u00E4 <<< \\u00C4 " + "&OE << \\u0153 <<< \\u0152 << \\u00F6 <<< \\u00D6 " + "&UE << \\u00FC <<< \\u00DC "; + + /* Some sources treat LETTER A WITH DIARESIS (00E4,00C4) secondary greater than LETTER AE (00E6,00C6). @@ -6686,7 +6738,13 @@ static const char persian[]= "& \\u0642 < \\u06A9 < \\u0643" "& \\u0648 < \\u0647 < \\u0629 < \\u06C0 < \\u06CC < \\u0649 < \\u064A" "& \\uFE80 < \\uFE81 < \\uFE82 < \\uFE8D < \\uFE8E < \\uFB50 < \\uFB51" - " < \\uFE80 < \\uFE83 < \\uFE84 < \\uFE87 < \\uFE88 < \\uFE85" + " < \\uFE80 " + /* + FE80 appears both in reset and shift. + We need to break the rule here and reset to *new* FE80 again, + so weight for FE83 is calculated as P[FE80]+1, not as P[FE80]+8. + */ + " & \\uFE80 < \\uFE83 < \\uFE84 < \\uFE87 < \\uFE88 < \\uFE85" " < \\uFE86 < \\u0689 < \\u068A" "& \\uFEAE < \\uFDFC" "& \\uFED8 < \\uFB8E < \\uFB8F < \\uFB90 < \\uFB91 < \\uFED9 < \\uFEDA" @@ -6747,7 +6805,6 @@ static const char sinhala[]= static const char croatian[]= - "&C < \\u010D <<< \\u010C < \\u0107 <<< \\u0106 " "&D < d\\u017E <<< \\u01C6 <<< D\\u017E <<< \\u01C5 <<< D\\u017D <<< \\u01C4 " " < \\u0111 <<< \\u0110 " @@ -6755,7 +6812,6 @@ static const char croatian[]= "&N < nj <<< \\u01CC <<< Nj <<< \\u01CB <<< NJ <<< \\u01CA " "&S < \\u0161 <<< \\u0160 " "&Z < \\u017E <<< \\u017D"; - /* Unicode Collation Algorithm: Collation element (weight) scanner, @@ -6767,9 +6823,7 @@ typedef struct my_uca_scanner_st const uint16 *wbeg; /* Beginning of the current weight string */ const uchar *sbeg; /* Beginning of the input string */ const uchar *send; /* End of the input string */ - const uchar *uca_length; - const uint16 * const *uca_weight; - const MY_CONTRACTIONS *contractions; + const MY_UCA_WEIGHT_LEVEL *level; uint16 implicit[2]; int page; int code; @@ -6782,51 +6836,81 @@ typedef struct my_uca_scanner_st */ typedef struct my_uca_scanner_handler_st { - void (*init)(my_uca_scanner *scanner, CHARSET_INFO *cs, + void (*init)(my_uca_scanner *scanner, CHARSET_INFO *cs, + const MY_UCA_WEIGHT_LEVEL *level, const uchar *str, size_t length); int (*next)(my_uca_scanner *scanner); } my_uca_scanner_handler; static const uint16 nochar[]= {0,0}; + +#define MY_UCA_CNT_FLAG_SIZE 4096 +#define MY_UCA_CNT_FLAG_MASK 4095 + +#define MY_UCA_CNT_HEAD 1 +#define MY_UCA_CNT_TAIL 2 +#define MY_UCA_CNT_MID1 4 +#define MY_UCA_CNT_MID2 8 +#define MY_UCA_CNT_MID3 16 +#define MY_UCA_CNT_MID4 32 + +#define MY_UCA_PREVIOUS_CONTEXT_HEAD 64 +#define MY_UCA_PREVIOUS_CONTEXT_TAIL 128 + /********** Helper functions to handle contraction ************/ /** Mark a character as a contraction part - @cs Pointer to CHARSET_INFO data - @wc Unicode code point - @flag flag: "is contraction head", "is contraction tail" + @param uca Pointer to UCA data + @param wc Unicode code point + @param flag flag: "is contraction head", "is contraction tail" */ -static void -my_uca_add_contraction_flag(CHARSET_INFO *cs, my_wc_t wc, int flag) +static inline void +my_uca_add_contraction_flag(MY_CONTRACTIONS *list, my_wc_t wc, int flag) { - cs->contractions->flags[wc & MY_UCA_CNT_FLAG_MASK]|= flag; + list->flags[wc & MY_UCA_CNT_FLAG_MASK]|= flag; } /** Add a new contraction into contraction list - @cs Pointer to CHARSET_INFO data - @wc Unicode code points of the characters - @len Number of characters + @param uca Pointer to UCA data + @param wc Unicode code points of the characters + @param len Number of characters @return New contraction @retval Pointer to a newly added contraction */ static MY_CONTRACTION * -my_uca_add_contraction(struct charset_info_st *cs, - my_wc_t *wc, int len __attribute__((unused))) +my_uca_add_contraction(MY_CONTRACTIONS *list, my_wc_t *wc, size_t len, + my_bool with_context) { - MY_CONTRACTIONS *list= (MY_CONTRACTIONS*) cs->contractions; MY_CONTRACTION *next= &list->item[list->nitems]; - DBUG_ASSERT(len == 2); /* We currently support only contraction2 */ - next->ch[0]= wc[0]; - next->ch[1]= wc[1]; + size_t i; + /* + Contraction is always at least 2 characters. + Contraction is never longer than MY_UCA_MAX_CONTRACTION, + which is guaranteed by using my_coll_rule_expand() with proper limit. + */ + DBUG_ASSERT(len > 1 && len <= MY_UCA_MAX_CONTRACTION); + for (i= 0; i < len; i++) + { + /* + We don't support contractions with U+0000. + my_coll_rule_expand() guarantees there're no U+0000 in a contraction. + */ + DBUG_ASSERT(wc[i] != 0); + next->ch[i]= wc[i]; + } + if (i < MY_UCA_MAX_CONTRACTION) + next->ch[i]= 0; /* Add end-of-line marker */ + next->with_context= with_context; list->nitems++; return next; } @@ -6835,9 +6919,9 @@ my_uca_add_contraction(struct charset_info_st *cs, /** Allocate and initialize memory for contraction list and flags - @cs Pointer to CHARSET_INFO data - @alloc Memory allocation function (typically points to my_alloc_once) - @n Number of contractions + @param uca Pointer to UCA data + @param alloc Memory allocation function (typically points to my_alloc_once) + @param n Number of contractions @return Error code @retval 0 - memory allocated successfully @@ -6845,171 +6929,318 @@ my_uca_add_contraction(struct charset_info_st *cs, */ static my_bool -my_uca_alloc_contractions(struct charset_info_st *cs, - void *(*alloc)(size_t), size_t n) +my_uca_alloc_contractions(MY_CONTRACTIONS *contractions, + MY_CHARSET_LOADER *loader, size_t n) { uint size= n * sizeof(MY_CONTRACTION); - MY_CONTRACTIONS *contractions; - - if (!(cs->contractions= contractions= (*alloc)(sizeof(MY_CONTRACTIONS)))) - return 1; - bzero(contractions, sizeof(MY_CONTRACTIONS)); - if (!(contractions->item= (*alloc)(size)) || - !(contractions->flags= (char*) (*alloc)(MY_UCA_CNT_FLAG_SIZE))) + if (!(contractions->item= (loader->once_alloc)(size)) || + !(contractions->flags= (char *) (loader->once_alloc)(MY_UCA_CNT_FLAG_SIZE))) return 1; - bzero(contractions->item, size); - bzero(contractions->flags, MY_UCA_CNT_FLAG_SIZE); + memset(contractions->item, 0, size); + memset(contractions->flags, 0, MY_UCA_CNT_FLAG_SIZE); return 0; } -#ifdef HAVE_CHARSET_ucs2 -/* - Initialize collation weight scanner +/** + Return UCA contraction data for a CHARSET_INFO structure. - SYNOPSIS: - my_uca_scanner_init() - scanner Pointer to an initialized scanner structure - cs Character set + collation information - str Beginning of the string - length Length of the string. - - NOTES: - Optimized for UCS2 + @param cs Pointer to CHARSET_INFO structure + @retval Pointer to contraction data + @retval NULL, if this collation does not have UCA contraction +*/ - RETURN - N/A +const MY_CONTRACTIONS * +my_charset_get_contractions(const CHARSET_INFO *cs, int level) +{ + return (cs->uca != NULL) && (cs->uca->level[level].contractions.nitems > 0) ? + &cs->uca->level[level].contractions : NULL; +} + + +/** + Check if UCA level data has contractions (static version) + Static quick version of my_uca_have_contractions(), + optimized for performance purposes, also marked as "inline". + + @param level Pointer to UCA level data + + @return Flags indicating if UCA with contractions + @retval 0 - no contractions + @retval 1 - there are some contractions */ -static void my_uca_scanner_init_ucs2(my_uca_scanner *scanner, - CHARSET_INFO *cs, - const uchar *str, size_t length) +static inline my_bool +my_uca_have_contractions_quick(const MY_UCA_WEIGHT_LEVEL *level) { - scanner->wbeg= nochar; - if (length) + return (level->contractions.nitems > 0); +} + + + +/** + Check if a character can be contraction head + + @param c Pointer to UCA contraction data + @param wc Code point + + @retval 0 - cannot be contraction head + @retval 1 - can be contraction head +*/ + +my_bool +my_uca_can_be_contraction_head(const MY_CONTRACTIONS *c, my_wc_t wc) +{ + return c->flags[wc & MY_UCA_CNT_FLAG_MASK] & MY_UCA_CNT_HEAD; +} + + +/** + Check if a character can be contraction tail + + @param c Pointer to UCA contraction data + @param wc Code point + + @retval 0 - cannot be contraction tail + @retval 1 - can be contraction tail +*/ + +my_bool +my_uca_can_be_contraction_tail(const MY_CONTRACTIONS *c, my_wc_t wc) +{ + return c->flags[wc & MY_UCA_CNT_FLAG_MASK] & MY_UCA_CNT_TAIL; +} + + +/** + Check if a character can be contraction part + + @param c Pointer to UCA contraction data + @param wc Code point + + @retval 0 - cannot be contraction part + @retval 1 - can be contraction part +*/ + +static inline my_bool +my_uca_can_be_contraction_part(const MY_CONTRACTIONS *c, my_wc_t wc, int flag) +{ + return c->flags[wc & MY_UCA_CNT_FLAG_MASK] & flag; +} + + +/** + Find a contraction consisting of two characters and return its weight array + + @param list Pointer to UCA contraction data + @param wc1 First character + @param wc2 Second character + + @return Weight array + @retval NULL - no contraction found + @retval ptr - contraction weight array +*/ + +uint16 * +my_uca_contraction2_weight(const MY_CONTRACTIONS *list, my_wc_t wc1, my_wc_t wc2) +{ + MY_CONTRACTION *c, *last; + for (c= list->item, last= c + list->nitems; c < last; c++) { - scanner->sbeg= str; - scanner->send= str + length - 2; - scanner->uca_length= cs->sort_order; - scanner->uca_weight= cs->sort_order_big; - scanner->contractions= cs->contractions; - scanner->cs= cs; - return; + if (c->ch[0] == wc1 && c->ch[1] == wc2 && c->ch[2] == 0) + { + return c->weight; + } } + return NULL; +} - /* - Sometimes this function is called with - str=NULL and length=0, which should be - considered as an empty string. - - The above initialization is unsafe for such cases, - because scanner->send is initialized to (NULL-2), which is 0xFFFFFFFE. - Then we fall into an endless loop in my_uca_scanner_next_ucs2(). - - Do special initialization for the case when length=0. - Initialize scanner->sbeg to an address greater than scanner->send. - Next call of my_uca_scanner_next_ucs2() will correctly return with -1. - */ - scanner->sbeg= (uchar*) &nochar[1]; - scanner->send= (uchar*) &nochar[0]; + +/** + Check if a character can be previous context head + + @param list Pointer to UCA contraction data + @param wc Code point + + @return + @retval FALSE - cannot be previous context head + @retval TRUE - can be previous context head +*/ + +static inline my_bool +my_uca_can_be_previous_context_head(const MY_CONTRACTIONS *list, my_wc_t wc) +{ + return list->flags[wc & MY_UCA_CNT_FLAG_MASK] & MY_UCA_PREVIOUS_CONTEXT_HEAD; } -/* - Read next collation element (weight), i.e. converts - a stream of characters into a stream of their weights. - - SYNOPSIS: - my_uca_scanner_next() - scanner Address of a previously initialized scanner strucuture - - NOTES: - Optimized for UCS2 - - Checks if the current character's weight string has been fully scanned, - if no, then returns the next weight for this character, - else scans the next character and returns its first weight. +/** + Check if a character can be previois context tail - Each character can have number weights from 0 to 8. - - Some characters do not have weights at all, 0 weights. - It means they are ignored during comparison. - - Examples: - 1. 0x0001 START OF HEADING, has no weights, ignored, does - not produce any weights. - 2. 0x0061 LATIN SMALL LETTER A, has one weight. - 0x0E33 will be returned - 3. 0x00DF LATIN SMALL LETTER SHARP S, aka SZ ligature, - has two weights. It will return 0x0FEA twice for two - consequent calls. - 4. 0x247D PATENTHESIZED NUMBER TEN, has four weights, - this function will return these numbers in four - consequent calls: 0x0288, 0x0E2A, 0x0E29, 0x0289 - 5. A string consisting of the above characters: - 0x0001 0x0061 0x00DF 0x247D - will return the following weights, one weight per call: - 0x0E33 0x0FEA 0x0FEA 0x0288, 0x0E2A, 0x0E29, 0x0289 - - RETURN - Next weight, a number between 0x0000 and 0xFFFF - Or -1 on error (END-OF-STRING or ILLEGAL MULTIBYTE SEQUENCE) + @param uca Pointer to UCA contraction data + @param wc Code point + + @return + @retval FALSE - cannot be contraction tail + @retval TRUE - can be contraction tail */ -static int my_uca_scanner_next_ucs2(my_uca_scanner *scanner) +static inline my_bool +my_uca_can_be_previous_context_tail(const MY_CONTRACTIONS *list, my_wc_t wc) { - - /* - Check if the weights for the previous character have been - already fully scanned. If yes, then get the next character and - initialize wbeg and wlength to its weight string. - */ - - if (scanner->wbeg[0]) - return *scanner->wbeg++; - - do + return list->flags[wc & MY_UCA_CNT_FLAG_MASK] & MY_UCA_PREVIOUS_CONTEXT_TAIL; +} + + +/** + Compare two wide character strings, wide analog to strncmp(). + + @param a Pointer to the first string + @param b Pointer to the second string + @param len Length of the strings + + @return + @retval 0 - strings are equal + @retval non-zero - strings are different +*/ + +static int +my_wmemcmp(my_wc_t *a, my_wc_t *b, size_t len) +{ + return memcmp(a, b, len * sizeof(my_wc_t)); +} + + +/** + Check if a string is a contraction, + and return its weight array on success. + + @param list Pointer to UCA contraction data + @param wc Pointer to wide string + @param len String length + + @return Weight array + @retval NULL - Input string is not a known contraction + @retval ptr - contraction weight array +*/ + +static inline uint16 * +my_uca_contraction_weight(const MY_CONTRACTIONS *list, my_wc_t *wc, size_t len) +{ + MY_CONTRACTION *c, *last; + for (c= list->item, last= c + list->nitems; c < last; c++) { - const uint16 *const *ucaw= scanner->uca_weight; - const uchar *ucal= scanner->uca_length; - - if (scanner->sbeg > scanner->send) - return -1; - - scanner->page= (uchar)scanner->sbeg[0]; - scanner->code= (uchar)scanner->sbeg[1]; - scanner->sbeg+= 2; - - if (scanner->contractions && (scanner->sbeg <= scanner->send)) + if ((len == MY_UCA_MAX_CONTRACTION || c->ch[len] == 0) && + !c->with_context && + !my_wmemcmp(c->ch, wc, len)) + return c->weight; + } + return NULL; +} + + +/** + Find a contraction in the input stream and return its weight array + + Scan input characters while their flags tell that they can be + a contraction part. Then try to find real contraction among the + candidates, starting from the longest. + + @param scanner Pointer to UCA scanner + @param[OUT] *wc Where to store the scanned string + + @return Weight array + @retval NULL - no contraction found + @retval ptr - contraction weight array +*/ + +static uint16 * +my_uca_scanner_contraction_find(my_uca_scanner *scanner, my_wc_t *wc) +{ + size_t clen= 1; + int flag; + const uchar *s, *beg[MY_UCA_MAX_CONTRACTION]; + memset(beg, 0, sizeof(beg)); + + /* Scan all contraction candidates */ + for (s= scanner->sbeg, flag= MY_UCA_CNT_MID1; + clen < MY_UCA_MAX_CONTRACTION; + flag<<= 1) + { + int mblen; + if ((mblen= scanner->cs->cset->mb_wc(scanner->cs, &wc[clen], + s, scanner->send)) <= 0) + break; + beg[clen]= s= s + mblen; + if (!my_uca_can_be_contraction_part(&scanner->level->contractions, + wc[clen++], flag)) + break; + } + + /* Find among candidates the longest real contraction */ + for ( ; clen > 1; clen--) + { + uint16 *cweight; + if (my_uca_can_be_contraction_tail(&scanner->level->contractions, + wc[clen - 1]) && + (cweight= my_uca_contraction_weight(&scanner->level->contractions, + wc, clen))) { - my_wc_t wc1= ((scanner->page << 8) | scanner->code); - - if (my_cs_can_be_contraction_head(scanner->cs, wc1)) - { - const uint16 *cweight; - my_wc_t wc2= (((my_wc_t) scanner->sbeg[0]) << 8) | scanner->sbeg[1]; - if (my_cs_can_be_contraction_tail(scanner->cs, wc2) && - (cweight= my_cs_contraction2_weight(scanner->cs, - scanner->code, - scanner->sbeg[1]))) - { - scanner->implicit[0]= 0; - scanner->wbeg= scanner->implicit; - scanner->sbeg+=2; - return *cweight; - } - } + scanner->wbeg= cweight + 1; + scanner->sbeg= beg[clen - 1]; + return cweight; } - - if (!ucaw[scanner->page]) - goto implicit; - scanner->wbeg= ucaw[scanner->page] + scanner->code * ucal[scanner->page]; - } while (!scanner->wbeg[0]); - - return *scanner->wbeg++; + } + + return NULL; /* No contractions were found */ +} + + +/** + Find weight for contraction with previous context + and return its weight array. + + @param scanner Pointer to UCA scanner + @param wc0 Previous character + @param wc1 Current character + + @return Weight array + @retval NULL - no contraction with context found + @retval ptr - contraction weight array +*/ + +static uint16 * +my_uca_previous_context_find(my_uca_scanner *scanner, + my_wc_t wc0, my_wc_t wc1) +{ + const MY_CONTRACTIONS *list= &scanner->level->contractions; + MY_CONTRACTION *c, *last; + for (c= list->item, last= c + list->nitems; c < last; c++) + { + if (c->with_context && wc0 == c->ch[0] && wc1 == c->ch[1]) + { + scanner->wbeg= c->weight + 1; + return c->weight; + } + } + return NULL; +} + +/****************************************************************/ + + +/** + Return implicit UCA weight + Used for characters that do not have assigned UCA weights. -implicit: + @param scanner UCA weight scanner + @return The leading implicit weight. +*/ + +static inline int +my_uca_scanner_next_implicit(my_uca_scanner *scanner) +{ scanner->code= (scanner->page << 8) + scanner->code; scanner->implicit[0]= (scanner->code & 0x7FFF) | 0x8000; scanner->implicit[1]= 0; @@ -7027,112 +7258,101 @@ implicit: return scanner->page; } -static my_uca_scanner_handler my_ucs2_uca_scanner_handler= -{ - my_uca_scanner_init_ucs2, - my_uca_scanner_next_ucs2 -}; - -#endif /* HAVE_CHARSET_ucs2 */ - /* The same two functions for any character set */ -static void my_uca_scanner_init_any(my_uca_scanner *scanner, - CHARSET_INFO *cs __attribute__((unused)), - const uchar *str, size_t length) +static void +my_uca_scanner_init_any(my_uca_scanner *scanner, + CHARSET_INFO *cs, + const MY_UCA_WEIGHT_LEVEL *level, + const uchar *str, size_t length) { /* Note, no needs to initialize scanner->wbeg */ scanner->sbeg= str; scanner->send= str + length; scanner->wbeg= nochar; - scanner->uca_length= cs->sort_order; - scanner->uca_weight= cs->sort_order_big; - scanner->contractions= cs->contractions; + scanner->level= level; scanner->cs= cs; } static int my_uca_scanner_next_any(my_uca_scanner *scanner) { - /* Check if the weights for the previous character have been already fully scanned. If yes, then get the next character and initialize wbeg and wlength to its weight string. */ - - if (scanner->wbeg[0]) - return *scanner->wbeg++; - - do + + if (scanner->wbeg[0]) /* More weights left from the previous step: */ + return *scanner->wbeg++; /* return the next weight from expansion */ + + do { - const uint16 *const *ucaw= scanner->uca_weight; - const uchar *ucal= scanner->uca_length; - my_wc_t wc; - int mb_len; - - if (((mb_len= scanner->cs->cset->mb_wc(scanner->cs, &wc, + const uint16 *wpage; + my_wc_t wc[MY_UCA_MAX_CONTRACTION]; + int mblen; + + /* Get next character */ + if (((mblen= scanner->cs->cset->mb_wc(scanner->cs, wc, scanner->sbeg, scanner->send)) <= 0)) return -1; - - scanner->sbeg+= mb_len; - if (wc > MAX_UCA_CHAR_WITH_EXPLICIT_WEIGHT) + + scanner->sbeg+= mblen; + if (wc[0] > scanner->level->maxchar) { /* Return 0xFFFD as weight for all characters outside BMP */ scanner->wbeg= nochar; return 0xFFFD; } - else - { - scanner->page= wc >> 8; - scanner->code= wc & 0xFF; - } - - if (my_cs_have_contractions(scanner->cs) && - my_cs_can_be_contraction_head(scanner->cs, wc)) + + if (my_uca_have_contractions_quick(scanner->level)) { - my_wc_t wc2; - const uint16 *cweight; - - if (((mb_len= scanner->cs->cset->mb_wc(scanner->cs, &wc2, - scanner->sbeg, - scanner->send)) >=0) && - my_cs_can_be_contraction_tail(scanner->cs, wc2) && - (cweight= my_cs_contraction2_weight(scanner->cs, wc, wc2))) + uint16 *cweight; + /* + If we have scanned a character which can have previous context, + and there were some more characters already before, + then reconstruct codepoint of the previous character + from "page" and "code" into w[1], and verify that {wc[1], wc[0]} + together form a real previous context pair. + Note, we support only 2-character long sequences with previous + context at the moment. CLDR does not have longer sequences. + */ + if (my_uca_can_be_previous_context_tail(&scanner->level->contractions, + wc[0]) && + scanner->wbeg != nochar && /* if not the very first character */ + my_uca_can_be_previous_context_head(&scanner->level->contractions, + (wc[1]= ((scanner->page << 8) + + scanner->code))) && + (cweight= my_uca_previous_context_find(scanner, wc[1], wc[0]))) { - scanner->implicit[0]= 0; - scanner->wbeg= scanner->implicit; - scanner->sbeg+= mb_len; + scanner->page= scanner->code= 0; /* Clear for the next character */ return *cweight; } + else if (my_uca_can_be_contraction_head(&scanner->level->contractions, + wc[0])) + { + /* Check if w[0] starts a contraction */ + if ((cweight= my_uca_scanner_contraction_find(scanner, wc))) + return *cweight; + } } - - if (!ucaw[scanner->page]) - goto implicit; - scanner->wbeg= ucaw[scanner->page] + scanner->code * ucal[scanner->page]; - } while (!scanner->wbeg[0]); - + + /* Process single character */ + scanner->page= wc[0] >> 8; + scanner->code= wc[0] & 0xFF; + + /* If weight page for w[0] does not exist, then calculate algoritmically */ + if (!(wpage= scanner->level->weights[scanner->page])) + return my_uca_scanner_next_implicit(scanner); + + /* Calculate pointer to w[0]'s weight, using page and offset */ + scanner->wbeg= wpage + + scanner->code * scanner->level->lengths[scanner->page]; + } while (!scanner->wbeg[0]); /* Skip ignorable characters */ + return *scanner->wbeg++; - -implicit: - - scanner->code= (scanner->page << 8) + scanner->code; - scanner->implicit[0]= (scanner->code & 0x7FFF) | 0x8000; - scanner->implicit[1]= 0; - scanner->wbeg= scanner->implicit; - - scanner->page= scanner->page >> 7; - - if (scanner->code >= 0x3400 && scanner->code <= 0x4DB5) - scanner->page+= 0xFB80; - else if (scanner->code >= 0x4E00 && scanner->code <= 0x9FA5) - scanner->page+= 0xFB40; - else - scanner->page+= 0xFBC0; - - return scanner->page; } @@ -7142,7 +7362,6 @@ static my_uca_scanner_handler my_any_uca_scanner_handler= my_uca_scanner_next_any }; - /* Compares two strings according to the collation @@ -7195,8 +7414,8 @@ static int my_strnncoll_uca(CHARSET_INFO *cs, int s_res; int t_res; - scanner_handler->init(&sscanner, cs, s, slen); - scanner_handler->init(&tscanner, cs, t, tlen); + scanner_handler->init(&sscanner, cs, &cs->uca->level[0], s, slen); + scanner_handler->init(&tscanner, cs, &cs->uca->level[0], t, tlen); do { @@ -7207,6 +7426,38 @@ static int my_strnncoll_uca(CHARSET_INFO *cs, return (t_is_prefix && t_res < 0) ? 0 : (s_res - t_res); } + +static inline int +my_space_weight(const CHARSET_INFO *cs) /* W3-TODO */ +{ + return cs->uca->level[0].weights[0][0x20 * cs->uca->level[0].lengths[0]]; +} + + +/** + Helper function: + Find address of weights of the given character. + + @param weights UCA weight array + @param lengths UCA length array + @param ch character Unicode code point + + @return Weight array + @retval pointer to weight array for the given character, + or NULL if this page does not have implicit weights. +*/ + +static inline uint16 * +my_char_weight_addr(const MY_UCA_WEIGHT_LEVEL *level, uint wc) +{ + uint page, ofst; + return wc > level->maxchar ? NULL : + (level->weights[page= (wc >> 8)] ? + level->weights[page] + (ofst= (wc & 0xFF)) * level->lengths[page] : + NULL); +} + + /* Compares two strings according to the collation, ignoring trailing spaces. @@ -7268,8 +7519,8 @@ static int my_strnncollsp_uca(CHARSET_INFO *cs, diff_if_only_endspace_difference= 0; #endif - scanner_handler->init(&sscanner, cs, s, slen); - scanner_handler->init(&tscanner, cs, t, tlen); + scanner_handler->init(&sscanner, cs, &cs->uca->level[0], s, slen); + scanner_handler->init(&tscanner, cs, &cs->uca->level[0], t, tlen); do { @@ -7280,7 +7531,7 @@ static int my_strnncollsp_uca(CHARSET_INFO *cs, if (s_res > 0 && t_res < 0) { /* Calculate weight for SPACE character */ - t_res= cs->sort_order_big[0][0x20 * cs->sort_order[0]]; + t_res= my_space_weight(cs); /* compare the first string to spaces */ do @@ -7295,7 +7546,7 @@ static int my_strnncollsp_uca(CHARSET_INFO *cs, if (s_res < 0 && t_res > 0) { /* Calculate weight for SPACE character */ - s_res= cs->sort_order_big[0][0x20 * cs->sort_order[0]]; + s_res= my_space_weight(cs); /* compare the second string to spaces */ do @@ -7342,7 +7593,7 @@ static void my_hash_sort_uca(CHARSET_INFO *cs, my_uca_scanner scanner; slen= cs->cset->lengthsp(cs, (char*) s, slen); - scanner_handler->init(&scanner, cs, s, slen); + scanner_handler->init(&scanner, cs, &cs->uca->level[0], s, slen); while ((s_res= scanner_handler->next(&scanner)) >0) { @@ -7393,7 +7644,7 @@ static size_t my_strnxfrm_uca(CHARSET_INFO *cs, uchar *de= dst + (dstlen & (size_t) ~1); /* add even length for easier code */ int s_res; my_uca_scanner scanner; - scanner_handler->init(&scanner, cs, src, srclen); + scanner_handler->init(&scanner, cs, &cs->uca->level[0], src, srclen); while (dst < de && (s_res= scanner_handler->next(&scanner)) >0) { @@ -7401,7 +7652,7 @@ static size_t my_strnxfrm_uca(CHARSET_INFO *cs, dst[1]= s_res & 0xFF; dst+= 2; } - s_res= cs->sort_order_big[0][0x20 * cs->sort_order[0]]; + s_res= my_space_weight(cs); while (dst < de) { dst[0]= s_res >> 8; @@ -7416,33 +7667,6 @@ static size_t my_strnxfrm_uca(CHARSET_INFO *cs, -/** - Helper function: - Find address of weights of the given character. - - @param weights UCA weight array - @param lengths UCA length array - @param ch character Unicode code point - - @return Weight array - @retval pointer to weight array for the given character, - or NULL if this page does not have implicit weights. -*/ - -static inline const uint16 * -my_char_weight_addr(CHARSET_INFO *cs, uint wc) -{ - uint page, ofst; - const uchar *ucal= cs->sort_order; - const uint16 * const *ucaw= cs->sort_order_big; - - return wc > MAX_UCA_CHAR_WITH_EXPLICIT_WEIGHT ? NULL : - (ucaw[page= (wc >> 8)] ? - ucaw[page] + (ofst= (wc & 0xFF)) * ucal[page] : - NULL); -} - - /* This function compares if two characters are the same. The sign +1 or -1 does not matter. The only @@ -7454,8 +7678,8 @@ my_char_weight_addr(CHARSET_INFO *cs, uint wc) static int my_uca_charcmp(CHARSET_INFO *cs, my_wc_t wc1, my_wc_t wc2) { size_t length1, length2; - const uint16 *weight1= my_char_weight_addr(cs, wc1); - const uint16 *weight2= my_char_weight_addr(cs, wc2); + const uint16 *weight1= my_char_weight_addr(&cs->uca->level[0], wc1); + const uint16 *weight2= my_char_weight_addr(&cs->uca->level[0], wc2); if (!weight1 || !weight2) return wc1 != wc2; @@ -7465,8 +7689,8 @@ static int my_uca_charcmp(CHARSET_INFO *cs, my_wc_t wc1, my_wc_t wc2) return 1; /* Thoroughly compare all weights */ - length1= cs->sort_order[wc1 >> MY_UCA_PSHIFT]; - length2= cs->sort_order[wc2 >> MY_UCA_PSHIFT]; + length1= cs->uca->level[0].lengths[wc1 >> MY_UCA_PSHIFT]; /* W3-TODO */ + length2= cs->uca->level[0].lengths[wc2 >> MY_UCA_PSHIFT]; if (length1 > length2) return memcmp((const void*)weight1, (const void*)weight2, length2*2) ? @@ -7632,7 +7856,7 @@ int my_wildcmp_uca(CHARSET_INFO *cs, /* Collation language is implemented according to subset of ICU Collation Customization (tailorings): - http://oss.software.ibm.com/icu/userguide/Collate_Customization.html + http://icu.sourceforge.net/userguide/Collate_Customization.html Collation language elements: Delimiters: @@ -7674,16 +7898,47 @@ int my_wildcmp_uca(CHARSET_INFO *cs, typedef enum my_coll_lexem_num_en { - MY_COLL_LEXEM_EOF = 0, - MY_COLL_LEXEM_DIFF = 1, - MY_COLL_LEXEM_SHIFT = 4, - MY_COLL_LEXEM_CHAR = 5, - MY_COLL_LEXEM_ERROR = 6 + MY_COLL_LEXEM_EOF = 0, + MY_COLL_LEXEM_SHIFT = 1, + MY_COLL_LEXEM_RESET = 4, + MY_COLL_LEXEM_CHAR = 5, + MY_COLL_LEXEM_ERROR = 6, + MY_COLL_LEXEM_OPTION = 7, + MY_COLL_LEXEM_EXTEND = 8, + MY_COLL_LEXEM_CONTEXT = 9, } my_coll_lexem_num; +/** + Convert collation customization lexem to string, + for nice error reporting + + @param term lexem code + + @return lexem name +*/ + +static const char * +my_coll_lexem_num_to_str(my_coll_lexem_num term) +{ + switch (term) + { + case MY_COLL_LEXEM_EOF: return "EOF"; + case MY_COLL_LEXEM_SHIFT: return "Shift"; + case MY_COLL_LEXEM_RESET: return "&"; + case MY_COLL_LEXEM_CHAR: return "Character"; + case MY_COLL_LEXEM_OPTION: return "Bracket option"; + case MY_COLL_LEXEM_EXTEND: return "/"; + case MY_COLL_LEXEM_CONTEXT:return "|"; + case MY_COLL_LEXEM_ERROR: return "ERROR"; + } + return NULL; +} + + typedef struct my_coll_lexem_st { + my_coll_lexem_num term; const char *beg; const char *end; const char *prev; @@ -7717,6 +7972,27 @@ static void my_coll_lexem_init(MY_COLL_LEXEM *lexem, } +/** + Compare lexem to string with length + + @param lexem lexem + @param pattern string + @param patternlen string length + + @return + @retval 0 if lexem is equal to string, non-0 otherwise. +*/ + +static int +lex_cmp(MY_COLL_LEXEM *lexem, const char *pattern, size_t patternlen) +{ + size_t lexemlen= lexem->beg - lexem->prev; + if (lexemlen < patternlen) + return 1; /* Not a prefix */ + return strncasecmp(lexem->prev, pattern, patternlen); +} + + /* Print collation customization expression parse error, with context. @@ -7740,7 +8016,8 @@ static void my_coll_lexem_print_error(MY_COLL_LEXEM *lexem, size_t len= lexem->end - lexem->prev; strmake (tail, lexem->prev, (size_t) MY_MIN(len, sizeof(tail)-1)); errstr[errsize-1]= '\0'; - my_snprintf(errstr,errsize-1,"%s at '%s'", txt, tail); + my_snprintf(errstr, errsize - 1, + "%s at '%s'", txt[0] ? txt : "Syntax error", tail); } @@ -7791,44 +8068,75 @@ static my_coll_lexem_num my_coll_lexem_next(MY_COLL_LEXEM *lexem) { const char *beg; my_coll_lexem_num rc; - + for (beg= lexem->beg ; beg < lexem->end ; beg++) { - if (*beg == ' ' || *beg == '\t' || *beg == '\r' || *beg == '\n') - continue; - - if (*beg == '&') + switch (*beg) { + case ' ': + case '\t': + case '\r': + case '\n': + continue; + + case '[': /* Bracket expression, e.g. "[optimize [a-z]]" */ + { + size_t nbrackets; /* Indicates nested recursion level */ + for (beg++, nbrackets= 1 ; beg < lexem->end; beg++) + { + if (*beg == '[') /* Enter nested bracket expression */ + nbrackets++; + else if (*beg == ']') + { + if (--nbrackets == 0) + { + rc= MY_COLL_LEXEM_OPTION; + beg++; + goto ex; + } + } + } + rc= MY_COLL_LEXEM_ERROR; + goto ex; + } + + case '&': beg++; - rc= MY_COLL_LEXEM_SHIFT; + rc= MY_COLL_LEXEM_RESET; goto ex; - } - - if (beg[0] == '=') - { + + case '=': beg++; - rc= MY_COLL_LEXEM_DIFF; + lexem->diff= 0; + rc= MY_COLL_LEXEM_SHIFT; goto ex; - } - - if (beg[0] == '<') - { - for (beg++, lexem->diff= 1; - (beg < lexem->end) && - (*beg == '<') && (lexem->diff<3); - beg++, lexem->diff++); - rc= MY_COLL_LEXEM_DIFF; + + case '/': + beg++; + rc= MY_COLL_LEXEM_EXTEND; goto ex; - } - - if ((*beg >= 'a' && *beg <= 'z') || (*beg >= 'A' && *beg <= 'Z')) - { - lexem->code= *beg++; - rc= MY_COLL_LEXEM_CHAR; + + case '|': + beg++; + rc= MY_COLL_LEXEM_CONTEXT; goto ex; + + case '<': /* Shift: '<' or '<<' or '<<<' or '<<<<' */ + { + /* Scan up to 3 additional '<' characters */ + for (beg++, lexem->diff= 1; + (beg < lexem->end) && (*beg == '<') && (lexem->diff <= 3); + beg++, lexem->diff++); + rc= MY_COLL_LEXEM_SHIFT; + goto ex; + } + default: + break; } - - if ((*beg == '\\') && (beg+2 < lexem->end) && (beg[1] == 'u')) + + /* Escaped character, e.g. \u1234 */ + if ((*beg == '\\') && (beg + 2 < lexem->end) && + (beg[1] == 'u') && my_isxdigit(&my_charset_utf8_general_ci, beg[2])) { int ch; @@ -7842,15 +8150,43 @@ static my_coll_lexem_num my_coll_lexem_next(MY_COLL_LEXEM *lexem) rc= MY_COLL_LEXEM_CHAR; goto ex; } - + + /* + Unescaped single byte character: + allow printable ASCII range except SPACE and + special characters parsed above []<&/|= + */ + if (*beg >= 0x21 && *beg <= 0x7E) + { + lexem->code= *beg++; + rc= MY_COLL_LEXEM_CHAR; + goto ex; + } + + if (((uchar) *beg) > 0x7F) /* Unescaped multibyte character */ + { + CHARSET_INFO *cs= &my_charset_utf8_general_ci; + my_wc_t wc; + int nbytes= cs->cset->mb_wc(cs, &wc, + (uchar *) beg, (uchar *) lexem->end); + if (nbytes > 0) + { + rc= MY_COLL_LEXEM_CHAR; + beg+= nbytes; + lexem->code= (int) wc; + goto ex; + } + } + rc= MY_COLL_LEXEM_ERROR; goto ex; } rc= MY_COLL_LEXEM_EOF; - + ex: lexem->prev= lexem->beg; lexem->beg= beg; + lexem->term= rc; return rc; } @@ -7859,142 +8195,1149 @@ ex: Collation rule item */ +#define MY_UCA_MAX_EXPANSION 6 /* Maximum expansion length */ + typedef struct my_coll_rule_item_st { - my_wc_t base; /* Base character */ - my_wc_t curr[2]; /* Current character */ - int diff[3]; /* Primary, Secondary and Tertiary difference */ + my_wc_t base[MY_UCA_MAX_EXPANSION]; /* Base character */ + my_wc_t curr[MY_UCA_MAX_CONTRACTION]; /* Current character */ + int diff[4]; /* Primary, Secondary, Tertiary, Quaternary difference */ + size_t before_level; /* "reset before" indicator */ + my_bool with_context; } MY_COLL_RULE; +/** + Return length of a 0-terminated wide string, analog to strnlen(). + + @param s Pointer to wide string + @param maxlen Mamixum string length + + @return string length, or maxlen if no '\0' is met. +*/ +static size_t +my_wstrnlen(my_wc_t *s, size_t maxlen) +{ + size_t i; + for (i= 0; i < maxlen; i++) + { + if (s[i] == 0) + return i; + } + return maxlen; +} + + +/** + Return length of the "reset" string of a rule. + + @param r Collation customization rule + + @return Length of r->base +*/ + +static inline size_t +my_coll_rule_reset_length(MY_COLL_RULE *r) +{ + return my_wstrnlen(r->base, MY_UCA_MAX_EXPANSION); +} + + +/** + Return length of the "shift" string of a rule. + + @param r Collation customization rule + + @return Length of r->base +*/ + +static inline size_t +my_coll_rule_shift_length(MY_COLL_RULE *r) +{ + return my_wstrnlen(r->curr, MY_UCA_MAX_CONTRACTION); +} + + +/** + Append new character to the end of a 0-terminated wide string. + + @param wc Wide string + @param limit Maximum possible result length + @param code Character to add + + @return 1 if character was added, 0 if string was too long +*/ + +static int +my_coll_rule_expand(my_wc_t *wc, size_t limit, my_wc_t code) +{ + size_t i; + for (i= 0; i < limit; i++) + { + if (wc[i] == 0) + { + wc[i]= code; + return 1; + } + } + return 0; +} + + +/** + Initialize collation customization rule + + @param wc Rule +*/ + +static void +my_coll_rule_reset(MY_COLL_RULE *r) +{ + memset(r, 0, sizeof(*r)); +} + + +/* + Shift methods: + Simple: "&B < C" : weight('C') = weight('B') + 1 + Expand: weght('C') = { weight('B'), weight(last_non_ignorable) + 1 } +*/ +typedef enum +{ + my_shift_method_simple= 0, + my_shift_method_expand +} my_coll_shift_method; + + +typedef struct my_coll_rules_st +{ + uint version; /* Unicode version, e.g. 400 or 520 */ + MY_UCA_INFO *uca; /* Unicode weight data */ + size_t nrules; /* Number of rules in the rule array */ + size_t mrules; /* Number of allocated rules */ + MY_COLL_RULE *rule; /* Rule array */ + MY_CHARSET_LOADER *loader; + my_coll_shift_method shift_after_method; +} MY_COLL_RULES; + + +/** + Realloc rule array to a new size. + Reallocate memory for 128 additional rules at once, + to reduce the number of reallocs, which is important + for long tailorings (e.g. for East Asian collations). + + @param rules Rule container + @param n new number of rules + + @return 0 on success, -1 on error. +*/ + +static int +my_coll_rules_realloc(MY_COLL_RULES *rules, size_t n) +{ + if (rules->nrules < rules->mrules || + (rules->rule= rules->loader->realloc(rules->rule, + sizeof(MY_COLL_RULE) * + (rules->mrules= n + 128)))) + return 0; + return -1; +} + + +/** + Append one new rule to a rule array + + @param rules Rule container + @param rule New rule to add + + @return 0 on success, -1 on error. +*/ + +static int +my_coll_rules_add(MY_COLL_RULES *rules, MY_COLL_RULE *rule) +{ + if (my_coll_rules_realloc(rules, rules->nrules + 1)) + return -1; + rules->rule[rules->nrules++]= rule[0]; + return 0; +} + + +/** + Apply difference at level + + @param r Rule + @param level Level (0,1,2,3,4) +*/ + +static void +my_coll_rule_shift_at_level(MY_COLL_RULE *r, int level) +{ + switch (level) + { + case 4: /* Quaternary difference */ + r->diff[3]++; + break; + case 3: /* Tertiary difference */ + r->diff[2]++; + r->diff[3]= 0; + break; + case 2: /* Secondary difference */ + r->diff[1]++; + r->diff[2]= r->diff[3]= 0; + break; + case 1: /* Primary difference */ + r->diff[0]++; + r->diff[1]= r->diff[2]= r->diff[3]= 0; + break; + case 0: + /* Do nothing for '=': use the previous offsets for all levels */ + break; + default: + DBUG_ASSERT(0); + } +} + + +typedef struct my_coll_rule_parser_st +{ + MY_COLL_LEXEM tok[2]; /* Current token and next token for look-ahead */ + MY_COLL_RULE rule; /* Currently parsed rule */ + MY_COLL_RULES *rules; /* Rule list pointer */ + char errstr[128]; /* Error message */ +} MY_COLL_RULE_PARSER; + + +/** + Current parser token + + @param p Collation customization parser + + @return Pointer to the current token +*/ + +static MY_COLL_LEXEM * +my_coll_parser_curr(MY_COLL_RULE_PARSER *p) +{ + return &p->tok[0]; +} + + +/** + Next parser token, to look ahead. + + @param p Collation customization parser + + @return Pointer to the next token +*/ + +static MY_COLL_LEXEM * +my_coll_parser_next(MY_COLL_RULE_PARSER *p) +{ + return &p->tok[1]; +} + + +/** + Scan one token from the input stream + + @param p Collation customization parser + + @return 1, for convenience, to use in logical expressions easier. +*/ +static int +my_coll_parser_scan(MY_COLL_RULE_PARSER *p) +{ + my_coll_parser_curr(p)[0]= my_coll_parser_next(p)[0]; + my_coll_lexem_next(my_coll_parser_next(p)); + return 1; +} + + +/** + Initialize collation customization parser + + @param p Collation customization parser + @param rules Where to store rules + @param str Beginning of a collation customization sting + @param str_end End of the collation customizations string +*/ + +static void +my_coll_parser_init(MY_COLL_RULE_PARSER *p, + MY_COLL_RULES *rules, + const char *str, const char *str_end) +{ + /* + Initialize parser to the input buffer and scan two tokens, + to make the current token and the next token known. + */ + memset(p, 0, sizeof(*p)); + p->rules= rules; + p->errstr[0]= '\0'; + my_coll_lexem_init(my_coll_parser_curr(p), str, str_end); + my_coll_lexem_next(my_coll_parser_curr(p)); + my_coll_parser_next(p)[0]= my_coll_parser_curr(p)[0]; + my_coll_lexem_next(my_coll_parser_next(p)); +} + + +/** + Display error when an unexpected token found + + @param p Collation customization parser + @param term Which lexem was expected + + @return 0, to use in "return" and boolean expressions. +*/ + +static int +my_coll_parser_expected_error(MY_COLL_RULE_PARSER *p, my_coll_lexem_num term) +{ + my_snprintf(p->errstr, sizeof(p->errstr), + "%s expected", my_coll_lexem_num_to_str(term)); + return 0; +} + + +/** + Display error when a too long character sequence is met + + @param p Collation customization parser + @param name Which kind of sequence: contraction, expansion, etc. + + @return 0, to use in "return" and boolean expressions. +*/ + +static int +my_coll_parser_too_long_error(MY_COLL_RULE_PARSER *p, const char *name) +{ + my_snprintf(p->errstr, sizeof(p->errstr), "%s is too long", name); + return 0; +} + + +/** + Scan the given lexem from input stream, or display "expected" error. + + @param p Collation customization parser + @param term Which lexem is expected. + + @return + @retval 0 if the required term was not found. + @retval 1 if the required term was found. +*/ +static int +my_coll_parser_scan_term(MY_COLL_RULE_PARSER *p, my_coll_lexem_num term) +{ + if (my_coll_parser_curr(p)->term != term) + return my_coll_parser_expected_error(p, term); + return my_coll_parser_scan(p); +} + + +/* + In the following code we have a few functions to parse + various collation customization non-terminal symbols. + Unlike our usual coding convension, they return + - 0 on "error" (when the rule was not scanned) and + - 1 on "success"(when the rule was scanned). + This is done intentionally to make body of the functions look easier + and repeat the grammar of the rules in straightforward manner. + For example: + + // <x> ::= <y> | <z> + int parse_x() { return parse_y() || parser_z(); } + + // <x> ::= <y> <z> + int parse_x() { return parse_y() && parser_z(); } + + Using 1 on "not found" and 0 on "found" in the parser code would + make the code more error prone and harder to read because + of having to use inverse boolean logic. +*/ + + +/** + Scan a collation setting in brakets, for example UCA version. + + @param p Collation customization parser + + @return + @retval 0 if setting was scanned. + @retval 1 if setting was not scanned. +*/ + +static int +my_coll_parser_scan_setting(MY_COLL_RULE_PARSER *p) +{ + MY_COLL_RULES *rules= p->rules; + MY_COLL_LEXEM *lexem= my_coll_parser_curr(p); + + if (!lex_cmp(lexem, C_STRING_WITH_LEN("[version 4.0.0]"))) + { + rules->version= 400; + rules->uca= &my_uca_v400; + } +#if RESOLVE_CONFLICTS_WITH_MARIA_AND_MYSQL_COLLATION_IDS + else if (!lex_cmp(lexem, C_STRING_WITH_LEN("[version 5.2.0]"))) + { + rules->version= 520; + rules->uca= &my_uca_v520; + } +#endif + else if (!lex_cmp(lexem, C_STRING_WITH_LEN("[shift-after-method expand]"))) + { + rules->shift_after_method= my_shift_method_expand; + } + else if (!lex_cmp(lexem, C_STRING_WITH_LEN("[shift-after-method simple]"))) + { + rules->shift_after_method= my_shift_method_simple; + } + else + { + return 0; + } + return my_coll_parser_scan(p); +} + + +/** + Scan multiple collation settings + + @param p Collation customization parser + + @return + @retval 0 if no settings were scanned. + @retval 1 if one or more settings were scanned. +*/ + +static int +my_coll_parser_scan_settings(MY_COLL_RULE_PARSER *p) +{ + /* Scan collation setting or special purpose command */ + while (my_coll_parser_curr(p)->term == MY_COLL_LEXEM_OPTION) + { + if (!my_coll_parser_scan_setting(p)) + return 0; + } + return 1; +} + + +/** + Scan [before xxx] reset option + + @param p Collation customization parser + + @return + @retval 0 if reset option was not scanned. + @retval 1 if reset option was scanned. +*/ + +static int +my_coll_parser_scan_reset_before(MY_COLL_RULE_PARSER *p) +{ + MY_COLL_LEXEM *lexem= my_coll_parser_curr(p); + if (!lex_cmp(lexem, C_STRING_WITH_LEN("[before primary]")) || + !lex_cmp(lexem, C_STRING_WITH_LEN("[before 1]"))) + { + p->rule.before_level= 1; + } + else if (!lex_cmp(lexem, C_STRING_WITH_LEN("[before secondary]")) || + !lex_cmp(lexem, C_STRING_WITH_LEN("[before 2]"))) + { + p->rule.before_level= 2; + } + else if (!lex_cmp(lexem, C_STRING_WITH_LEN("[before tertiary]")) || + !lex_cmp(lexem, C_STRING_WITH_LEN("[before 3]"))) + { + p->rule.before_level= 3; + } + else if (!lex_cmp(lexem, C_STRING_WITH_LEN("[before quaternary]")) || + !lex_cmp(lexem, C_STRING_WITH_LEN("[before 4]"))) + { + p->rule.before_level= 4; + } + else + { + p->rule.before_level= 0; + return 0; /* Don't scan thr next character */ + } + return my_coll_parser_scan(p); +} + + +/** + Scan logical position and add to the wide string. + + @param p Collation customization parser + @param pwc Wide string to add code to + @param limit The result string cannot be longer than 'limit' characters + + @return + @retval 0 if logical position was not scanned. + @retval 1 if logical position was scanned. +*/ + +static int +my_coll_parser_scan_logical_position(MY_COLL_RULE_PARSER *p, + my_wc_t *pwc, size_t limit) +{ + MY_COLL_RULES *rules= p->rules; + MY_COLL_LEXEM *lexem= my_coll_parser_curr(p); + + if (!lex_cmp(lexem, C_STRING_WITH_LEN("[first non-ignorable]"))) + lexem->code= rules->uca->first_non_ignorable; + else if (!lex_cmp(lexem, C_STRING_WITH_LEN("[last non-ignorable]"))) + lexem->code= rules->uca->last_non_ignorable; + else if (!lex_cmp(lexem, C_STRING_WITH_LEN("[first primary ignorable]"))) + lexem->code= rules->uca->first_primary_ignorable; + else if (!lex_cmp(lexem, C_STRING_WITH_LEN("[last primary ignorable]"))) + lexem->code= rules->uca->last_primary_ignorable; + else if (!lex_cmp(lexem, C_STRING_WITH_LEN("[first secondary ignorable]"))) + lexem->code= rules->uca->first_secondary_ignorable; + else if (!lex_cmp(lexem, C_STRING_WITH_LEN("[last secondary ignorable]"))) + lexem->code= rules->uca->last_secondary_ignorable; + else if (!lex_cmp(lexem, C_STRING_WITH_LEN("[first tertiary ignorable]"))) + lexem->code= rules->uca->first_tertiary_ignorable; + else if (!lex_cmp(lexem, C_STRING_WITH_LEN("[last tertiary ignorable]"))) + lexem->code= rules->uca->last_tertiary_ignorable; + else if (!lex_cmp(lexem, C_STRING_WITH_LEN("[first trailing]"))) + lexem->code= rules->uca->first_trailing; + else if (!lex_cmp(lexem, C_STRING_WITH_LEN("[last trailing]"))) + lexem->code= rules->uca->last_trailing; + else if (!lex_cmp(lexem, C_STRING_WITH_LEN("[first variable]"))) + lexem->code= rules->uca->first_variable; + else if (!lex_cmp(lexem, C_STRING_WITH_LEN("[last variable]"))) + lexem->code= rules->uca->last_variable; + else + return 0; /* Don't scan the next token */ + + if (!my_coll_rule_expand(pwc, limit, lexem->code)) + { + /* + Logical position can not be in a contraction, + so the above call should never fail. + Let's assert in debug version and print + a nice error message in production version. + */ + DBUG_ASSERT(0); + return my_coll_parser_too_long_error(p, "Logical position"); + } + return my_coll_parser_scan(p); +} + + +/** + Scan character list + + <character list> ::= CHAR [ CHAR... ] + + @param p Collation customization parser + @param pwc Character string to add code to + @param limit The result string cannot be longer than 'limit' characters + @param name E.g. "contraction", "expansion" + + @return + @retval 0 if character sequence was not scanned. + @retval 1 if character sequence was scanned. +*/ + +static int +my_coll_parser_scan_character_list(MY_COLL_RULE_PARSER *p, + my_wc_t *pwc, size_t limit, + const char *name) +{ + if (my_coll_parser_curr(p)->term != MY_COLL_LEXEM_CHAR) + return my_coll_parser_expected_error(p, MY_COLL_LEXEM_CHAR); + + if (!my_coll_rule_expand(pwc, limit, my_coll_parser_curr(p)->code)) + return my_coll_parser_too_long_error(p, name); + + if (!my_coll_parser_scan_term(p, MY_COLL_LEXEM_CHAR)) + return 0; + + while (my_coll_parser_curr(p)->term == MY_COLL_LEXEM_CHAR) + { + if (!my_coll_rule_expand(pwc, limit, my_coll_parser_curr(p)->code)) + return my_coll_parser_too_long_error(p, name); + my_coll_parser_scan(p); + } + return 1; +} + + +/** + Scan reset sequence + + <reset sequence> ::= + [ <reset before option> ] <character list> + | [ <reset before option> ] <logical reset position> + + @param p Collation customization parser + + @return + @retval 0 if reset sequence was not scanned. + @retval 1 if reset sequence was scanned. +*/ + +static int +my_coll_parser_scan_reset_sequence(MY_COLL_RULE_PARSER *p) +{ + my_coll_rule_reset(&p->rule); + + /* Scan "[before x]" option, if exists */ + if (my_coll_parser_curr(p)->term == MY_COLL_LEXEM_OPTION) + my_coll_parser_scan_reset_before(p); + + /* Try logical reset position */ + if (my_coll_parser_curr(p)->term == MY_COLL_LEXEM_OPTION) + { + if (!my_coll_parser_scan_logical_position(p, p->rule.base, 1)) + return 0; + } + else + { + /* Scan single reset character or expansion */ + if (!my_coll_parser_scan_character_list(p, p->rule.base, + MY_UCA_MAX_EXPANSION, "Expansion")) + return 0; + } + + if (p->rules->shift_after_method == my_shift_method_expand || + p->rule.before_level == 1) /* Apply "before primary" option */ + { + /* + Suppose we have this rule: &B[before primary] < C + i.e. we need to put C before B, but after A, so + the result order is: A < C < B. + + Let primary weight of B be [BBBB]. + + We cannot just use [BBBB-1] as weight for C: + DUCET does not have enough unused weights between any two characters, + so using [BBBB-1] will likely make C equal to the previous character, + which is A, so we'll get this order instead of the desired: A = C < B. + + To guarantee that that C is sorted after A, we'll use expansion + with a kind of "biggest possible character". + As "biggest possible character" we'll use "last_non_ignorable": + + We'll compose weight for C as: [BBBB-1][MMMM+1] + where [MMMM] is weight for "last_non_ignorable". + + We also do the same trick for "reset after" if the collation + option says so. E.g. for the rules "&B < C", weight for + C will be calculated as: [BBBB][MMMM+1] + + At this point we only need to store codepoints + 'B' and 'last_non_ignorable'. Actual weights for 'C' + will be calculated according to the above formula later, + in create_tailoring(). + */ + if (!my_coll_rule_expand(p->rule.base, MY_UCA_MAX_EXPANSION, + p->rules->uca->last_non_ignorable)) + return my_coll_parser_too_long_error(p, "Expansion"); + } + return 1; +} + + +/** + Scan shift sequence + + <shift sequence> ::= + <character list> [ / <character list> ] + | <character list> [ | <character list> ] + + @param p Collation customization parser + + @return + @retval 0 if shift sequence was not scanned. + @retval 1 if shift sequence was scanned. +*/ + +static int +my_coll_parser_scan_shift_sequence(MY_COLL_RULE_PARSER *p) +{ + MY_COLL_RULE before_extend; + + memset(&p->rule.curr, 0, sizeof(p->rule.curr)); + + /* Scan single shift character or contraction */ + if (!my_coll_parser_scan_character_list(p, p->rule.curr, + MY_UCA_MAX_CONTRACTION, + "Contraction")) + return 0; + + before_extend= p->rule; /* Remember the part before "/" */ + + /* Append the part after "/" as expansion */ + if (my_coll_parser_curr(p)->term == MY_COLL_LEXEM_EXTEND) + { + my_coll_parser_scan(p); + if (!my_coll_parser_scan_character_list(p, p->rule.base, + MY_UCA_MAX_EXPANSION, + "Expansion")) + return 0; + } + else if (my_coll_parser_curr(p)->term == MY_COLL_LEXEM_CONTEXT) + { + /* + We support 2-character long context sequences only: + one character is the previous context, plus the current character. + It's OK as Unicode's CLDR does not have longer examples. + */ + my_coll_parser_scan(p); + p->rule.with_context= TRUE; + if (!my_coll_parser_scan_character_list(p, p->rule.curr + 1, 1, "context")) + return 0; + } + + /* Add rule to the rule list */ + if (my_coll_rules_add(p->rules, &p->rule)) + return 0; + + p->rule= before_extend; /* Restore to the state before "/" */ + + return 1; +} + + +/** + Scan shift operator + + <shift> ::= < | << | <<< | <<<< | = + + @param p Collation customization parser + + @return + @retval 0 if shift operator was not scanned. + @retval 1 if shift operator was scanned. +*/ +static int +my_coll_parser_scan_shift(MY_COLL_RULE_PARSER *p) +{ + if (my_coll_parser_curr(p)->term == MY_COLL_LEXEM_SHIFT) + { + my_coll_rule_shift_at_level(&p->rule, my_coll_parser_curr(p)->diff); + return my_coll_parser_scan(p); + } + return 0; +} + + +/** + Scan one rule: reset followed by a number of shifts + + <rule> ::= + & <reset sequence> + <shift> <shift sequence> + [ { <shift> <shift sequence> }... ] + + @param p Collation customization parser + + @return + @retval 0 if rule was not scanned. + @retval 1 if rule was scanned. +*/ +static int +my_coll_parser_scan_rule(MY_COLL_RULE_PARSER *p) +{ + if (!my_coll_parser_scan_term(p, MY_COLL_LEXEM_RESET) || + !my_coll_parser_scan_reset_sequence(p)) + return 0; + + /* Scan the first required shift command */ + if (!my_coll_parser_scan_shift(p)) + return my_coll_parser_expected_error(p, MY_COLL_LEXEM_SHIFT); + + /* Scan the first shift sequence */ + if (!my_coll_parser_scan_shift_sequence(p)) + return 0; + + /* Scan subsequent shift rules */ + while (my_coll_parser_scan_shift(p)) + { + if (!my_coll_parser_scan_shift_sequence(p)) + return 0; + } + return 1; +} + + +/** + Scan collation customization: settings followed by rules + + <collation customization> ::= + [ <setting> ... ] + [ <rule>... ] + + @param p Collation customization parser + + @return + @retval 0 if collation customozation expression was not scanned. + @retval 1 if collation customization expression was scanned. +*/ + +static int +my_coll_parser_exec(MY_COLL_RULE_PARSER *p) +{ + if (!my_coll_parser_scan_settings(p)) + return 0; + + while (my_coll_parser_curr(p)->term == MY_COLL_LEXEM_RESET) + { + if (!my_coll_parser_scan_rule(p)) + return 0; + } + /* Make sure no unparsed input data left */ + return my_coll_parser_scan_term(p, MY_COLL_LEXEM_EOF); +} + + /* Collation language syntax parser. Uses lexical parser. - - SYNOPSIS - my_coll_rule_parse - rule Collation rule list to load to. - str A string containin collation language expression. - str_end End of the string. - USAGE - - RETURN VALUES - A positive number means the number of rules loaded. - -1 means ERROR, e.g. too many items, syntax error, etc. + + @param rules Collation rule list to load to. + @param str A string with collation customization. + @param str_end End of the string. + + @return + @retval 0 on success + @retval 1 on error */ -static int my_coll_rule_parse(MY_COLL_RULE *rule, size_t mitems, - const char *str, const char *str_end, - char *errstr, size_t errsize) +static int +my_coll_rule_parse(MY_COLL_RULES *rules, + const char *str, const char *str_end) { - MY_COLL_LEXEM lexem; - my_coll_lexem_num lexnum; - my_coll_lexem_num prevlexnum= MY_COLL_LEXEM_ERROR; - MY_COLL_RULE item; - int state= 0; - size_t nitems= 0; + MY_COLL_RULE_PARSER p; + + my_coll_parser_init(&p, rules, str, str_end); + + if (!my_coll_parser_exec(&p)) + { + my_coll_lexem_print_error(my_coll_parser_curr(&p), + rules->loader->error, + sizeof(rules->loader->error) - 1, + p.errstr); + return 1; + } + return 0; +} + + +/** + Helper function: + Copies UCA weights for a given "uint" string + to the given location. - /* Init all variables */ - errstr[0]= '\0'; - bzero(&item, sizeof(item)); - my_coll_lexem_init(&lexem, str, str_end); + @src_uca source UCA weight data + @dst_uca destination UCA weight data + @to destination address + @to_length size of destination + @str qide string + @len string length - while ((lexnum= my_coll_lexem_next(&lexem))) + @return number of weights put +*/ + +static size_t +my_char_weight_put(MY_UCA_WEIGHT_LEVEL *dst, + uint16 *to, size_t to_length, + my_wc_t *str, size_t len) +{ + size_t count; + if (!to_length) + return 0; + to_length--; /* Without trailing zero */ + + for (count= 0; len; ) { - if (lexnum == MY_COLL_LEXEM_ERROR) + size_t chlen; + const uint16 *from= NULL; + + for (chlen= len; chlen > 1; chlen--) { - my_coll_lexem_print_error(&lexem,errstr,errsize-1,"Unknown character"); - return -1; - } - - switch (state) { - case 0: - if (lexnum != MY_COLL_LEXEM_SHIFT) - { - my_coll_lexem_print_error(&lexem,errstr,errsize-1,"& expected"); - return -1; - } - prevlexnum= lexnum; - state= 2; - continue; - - case 1: - if (lexnum != MY_COLL_LEXEM_SHIFT && lexnum != MY_COLL_LEXEM_DIFF) + if ((from= my_uca_contraction_weight(&dst->contractions, str, chlen))) { - my_coll_lexem_print_error(&lexem,errstr,errsize-1,"& or < expected"); - return -1; + str+= chlen; + len-= chlen; + break; } - prevlexnum= lexnum; - state= 2; - continue; - - case 2: - if (lexnum != MY_COLL_LEXEM_CHAR) + } + + if (!from) + { + from= my_char_weight_addr(dst, *str); + str++; + len--; + } + + for ( ; from && *from && count < to_length; ) + { + *to++= *from++; + count++; + } + } + + *to= 0; + return count; +} + + +/** + Alloc new page and copy the default UCA weights + @param loader - Character set loader + @param src_uca - Default UCA data to copy from + @param dst_uca - UCA data to copy weights to + @param page - page number + + @return + @retval FALSE on success + @retval TRUE on error +*/ +static my_bool +my_uca_copy_page(MY_CHARSET_LOADER *loader, + const MY_UCA_WEIGHT_LEVEL *src, + MY_UCA_WEIGHT_LEVEL *dst, + size_t page) +{ + uint chc, size= 256 * dst->lengths[page] * sizeof(uint16); + if (!(dst->weights[page]= (uint16 *) (loader->once_alloc)(size))) + return TRUE; + + DBUG_ASSERT(src->lengths[page] <= dst->lengths[page]); + memset(dst->weights[page], 0, size); + for (chc=0 ; chc < 256; chc++) + { + memcpy(dst->weights[page] + chc * dst->lengths[page], + src->weights[page] + chc * src->lengths[page], + src->lengths[page] * sizeof(uint16)); + } + return FALSE; +} + + +static my_bool +apply_shift(MY_CHARSET_LOADER *loader, + MY_COLL_RULES *rules, MY_COLL_RULE *r, int level, + uint16 *to, size_t nweights) +{ + /* Apply level difference. */ + if (nweights) + { + to[nweights - 1]+= r->diff[level]; + if (r->before_level == 1) /* Apply "&[before primary]" */ + { + if (nweights >= 2) { - my_coll_lexem_print_error(&lexem,errstr,errsize-1,"character expected"); - return -1; + to[nweights - 2]--; /* Reset before */ + if (rules->shift_after_method == my_shift_method_expand) + { + /* + Special case. Don't let characters shifted after X + and before next(X) intermix to each other. + + For example: + "[shift-after-method expand] &0 < a &[before primary]1 < A". + I.e. we reorder 'a' after '0', and then 'A' before '1'. + 'a' must be sorted before 'A'. + + Note, there are no real collations in CLDR which shift + after and before two neighbourgh characters. We need this + just in case. Reserving 4096 (0x1000) weights for such + cases is perfectly enough. + */ + to[nweights - 1]+= 0x1000; /* W3-TODO: const may vary on levels 2,3*/ + } } - - if (prevlexnum == MY_COLL_LEXEM_SHIFT) + else { - item.base= lexem.code; - item.diff[0]= 0; - item.diff[1]= 0; - item.diff[2]= 0; + my_snprintf(loader->error, sizeof(loader->error), + "Can't reset before " + "a primary ignorable character U+%04lX", r->base[0]); + return TRUE; } - else if (prevlexnum == MY_COLL_LEXEM_DIFF) + } + } + else + { + /* Shift to an ignorable character, e.g.: & \u0000 < \u0001 */ + DBUG_ASSERT(to[0] == 0); + to[0]= r->diff[level]; + } + return FALSE; +} + + +static my_bool +apply_one_rule(MY_CHARSET_LOADER *loader, + MY_COLL_RULES *rules, MY_COLL_RULE *r, int level, + MY_UCA_WEIGHT_LEVEL *dst) +{ + size_t nweights; + size_t nreset= my_coll_rule_reset_length(r); /* Length of reset sequence */ + size_t nshift= my_coll_rule_shift_length(r); /* Length of shift sequence */ + uint16 *to; + + if (nshift >= 2) /* Contraction */ + { + size_t i; + int flag; + MY_CONTRACTIONS *contractions= &dst->contractions; + /* Add HEAD, MID and TAIL flags for the contraction parts */ + my_uca_add_contraction_flag(contractions, r->curr[0], + r->with_context ? + MY_UCA_PREVIOUS_CONTEXT_HEAD : + MY_UCA_CNT_HEAD); + for (i= 1, flag= MY_UCA_CNT_MID1; i < nshift - 1; i++, flag<<= 1) + my_uca_add_contraction_flag(contractions, r->curr[i], flag); + my_uca_add_contraction_flag(contractions, r->curr[i], + r->with_context ? + MY_UCA_PREVIOUS_CONTEXT_TAIL : + MY_UCA_CNT_TAIL); + /* Add new contraction to the contraction list */ + to= my_uca_add_contraction(contractions, r->curr, nshift, + r->with_context)->weight; + /* Store weights of the "reset to" character */ + dst->contractions.nitems--; /* Temporarily hide - it's incomplete */ + nweights= my_char_weight_put(dst, to, MY_UCA_MAX_WEIGHT_SIZE, + r->base, nreset); + dst->contractions.nitems++; /* Activate, now it's complete */ + } + else + { + my_wc_t pagec= (r->curr[0] >> 8); + DBUG_ASSERT(dst->weights[pagec]); + to= my_char_weight_addr(dst, r->curr[0]); + /* Store weights of the "reset to" character */ + nweights= my_char_weight_put(dst, to, dst->lengths[pagec], r->base, nreset); + } + + /* Apply level difference. */ + return apply_shift(loader, rules, r, level, to, nweights); +} + + +/** + Check if collation rules are valid, + i.e. characters are not outside of the collation suported range. +*/ +static int +check_rules(MY_CHARSET_LOADER *loader, + const MY_COLL_RULES *rules, + const MY_UCA_WEIGHT_LEVEL *dst, const MY_UCA_WEIGHT_LEVEL *src) +{ + const MY_COLL_RULE *r, *rlast; + for (r= rules->rule, rlast= rules->rule + rules->nrules; r < rlast; r++) + { + if (r->curr[0] > dst->maxchar) + { + my_snprintf(loader->error, sizeof(loader->error), + "Shift character out of range: u%04X", (uint) r->curr[0]); + return TRUE; + } + else if (r->base[0] > src->maxchar) + { + my_snprintf(loader->error, sizeof(loader->error), + "Reset character out of range: u%04X", (uint) r->base[0]); + return TRUE; + } + } + return FALSE; +} + + +static my_bool +init_weight_level(MY_CHARSET_LOADER *loader, MY_COLL_RULES *rules, int level, + MY_UCA_WEIGHT_LEVEL *dst, const MY_UCA_WEIGHT_LEVEL *src) +{ + MY_COLL_RULE *r, *rlast; + int ncontractions= 0; + size_t i, npages= (src->maxchar + 1) / 256; + + dst->maxchar= src->maxchar; + + if (check_rules(loader, rules, dst, src)) + return TRUE; + + /* Allocate memory for pages and their lengths */ + if (!(dst->lengths= (uchar *) (loader->once_alloc)(npages)) || + !(dst->weights= (uint16 **) (loader->once_alloc)(npages * + sizeof(uint16 *)))) + return TRUE; + + /* Copy pages lengths and page pointers from the default UCA weights */ + memcpy(dst->lengths, src->lengths, npages); + memcpy(dst->weights, src->weights, npages * sizeof(uint16 *)); + + /* + Calculate maximum lenghts for the pages which will be overwritten. + Mark pages that will be otherwriten as NULL. + We'll allocate their own memory. + */ + for (r= rules->rule, rlast= rules->rule + rules->nrules; r < rlast; r++) + { + if (!r->curr[1]) /* If not a contraction */ + { + uint pagec= (r->curr[0] >> 8); + if (r->base[1]) /* Expansion */ { - MY_COLL_LEXEM savlex; - savlex= lexem; - item.curr[0]= lexem.code; - if ((lexnum= my_coll_lexem_next(&lexem)) == MY_COLL_LEXEM_CHAR) - { - item.curr[1]= lexem.code; - } - else - { - item.curr[1]= 0; - lexem=savlex; /* Restore previous parser state */ - } - if (lexem.diff == 3) - { - item.diff[2]++; - } - else if (lexem.diff == 2) - { - item.diff[1]++; - item.diff[2]= 0; - } - else if (lexem.diff == 1) - { - item.diff[0]++; - item.diff[1]= 0; - item.diff[2]= 0; - } - else if (lexem.diff == 0) - { - item.diff[0]= item.diff[1]= item.diff[2]= 0; - } - if (nitems >= mitems) - { - my_coll_lexem_print_error(&lexem,errstr,errsize-1,"Too many rules"); - return -1; - } - rule[nitems++]= item; + /* Reserve space for maximum possible length */ + dst->lengths[pagec]= MY_UCA_MAX_WEIGHT_SIZE; } else { - my_coll_lexem_print_error(&lexem,errstr,errsize-1,"Should never happen"); - return -1; + uint pageb= (r->base[0] >> 8); + if (dst->lengths[pagec] < src->lengths[pageb]) + dst->lengths[pagec]= src->lengths[pageb]; } - state= 1; - continue; + dst->weights[pagec]= NULL; /* Mark that we'll overwrite this page */ } + else + ncontractions++; + } + + /* Allocate pages that we'll overwrite and copy default weights */ + for (i= 0; i < npages; i++) + { + my_bool rc; + /* + Don't touch pages with lengths[i]==0, they have implicit weights + calculated algorithmically. + */ + if (!dst->weights[i] && dst->lengths[i] && + (rc= my_uca_copy_page(loader, src, dst, i))) + return rc; + } + + if (ncontractions) + { + if (my_uca_alloc_contractions(&dst->contractions, loader, ncontractions)) + return TRUE; } - return (int) nitems; + + /* + Preparatory step is done at this point. + Now we have memory allocated for the pages that we'll overwrite, + and for contractions, including previous context contractions. + Also, for the pages that we'll overwrite, we have copied default weights. + Now iterate through the rules, overwrite weights for the characters + that appear in the rules, and put all contractions into contraction list. + */ + for (r= rules->rule; r < rlast; r++) + { + if (apply_one_rule(loader, rules, r, level, dst)) + return TRUE; + } + return FALSE; } -#define MY_MAX_COLL_RULE 128 /* This function copies an UCS2 collation from @@ -8013,145 +9356,65 @@ static int my_coll_rule_parse(MY_COLL_RULE *rule, size_t mitems, default weights. */ -static my_bool create_tailoring(struct charset_info_st *cs, - void *(*alloc)(size_t)) -{ - MY_COLL_RULE rule[MY_MAX_COLL_RULE]; - MY_COLL_RULE *r, *rfirst, *rlast; - char errstr[128]; - uchar *newlengths; - uint16 **newweights; - const uchar *deflengths= uca_length; - const uint16 *const *defweights= uca_weight; - int rc, i; - int ncontractions= 0; - +static my_bool +create_tailoring(struct charset_info_st *cs, MY_CHARSET_LOADER *loader) +{ + MY_COLL_RULES rules; + MY_UCA_INFO new_uca, *src_uca= NULL; + int rc= 0; + + *loader->error= '\0'; + if (!cs->tailoring) - return 1; - + return 0; /* Ok to add a collation without tailoring */ + + memset(&rules, 0, sizeof(rules)); + rules.loader= loader; + rules.uca= cs->uca ? cs->uca : &my_uca_v400; /* For logical positions, etc */ + memset(&new_uca, 0, sizeof(new_uca)); + /* Parse ICU Collation Customization expression */ - if ((rc= my_coll_rule_parse(rule, MY_MAX_COLL_RULE, + if ((rc= my_coll_rule_parse(&rules, cs->tailoring, - cs->tailoring + strlen(cs->tailoring), - errstr, sizeof(errstr))) < 0) + cs->tailoring + strlen(cs->tailoring)))) + goto ex; + +#if RESOLVE_CONFLICT_WITH_MYSQL_AND_MARIA_COLLATION_IDS + if (rules.version == 520) /* Unicode-5.2.0 requested */ { - /* - TODO: add error message reporting. - printf("Error: %d '%s'\n", rc, errstr); - */ - return 1; + src_uca= &my_uca_v520; + cs->caseinfo= &my_unicase_unicode520; } - - rfirst= rule; - rlast= rule + rc; - - if (!cs->caseinfo) - cs->caseinfo= my_unicase_default; - - if (!(newweights= (uint16**) (*alloc)(256*sizeof(uint16*)))) - return 1; - bzero(newweights, 256*sizeof(uint16*)); - - if (!(newlengths= (uchar*) (*alloc)(256))) - return 1; - - memcpy(newlengths, deflengths, 256); - - /* - Calculate maximum lenghts for the pages - which will be overwritten. - */ - for (i=0; i < rc; i++) + else +#endif + if (rules.version == 400) /* Unicode-4.0.0 requested */ { - /* check if the shift or the reset characters are out of range */ - if (rule[i].curr[0] > MAX_UCA_CHAR_WITH_EXPLICIT_WEIGHT || - rule[i].base > MAX_UCA_CHAR_WITH_EXPLICIT_WEIGHT) - return 1; - - if (!rule[i].curr[1]) /* If not a contraction */ - { - uint pageb= (rule[i].base >> 8) & 0xFF; - uint pagec= (rule[i].curr[0] >> 8) & 0xFF; - - if (newlengths[pagec] < deflengths[pageb]) - newlengths[pagec]= deflengths[pageb]; - } - else - ncontractions++; + src_uca= &my_uca_v400; + cs->caseinfo= &my_unicase_default; } - - for (i=0; i < rc; i++) + else /* No Unicode version specified */ { - uint pageb= (rule[i].base >> 8) & 0xFF; - uint pagec= (rule[i].curr[0] >> 8) & 0xFF; - uint chb, chc; - - if (rule[i].curr[1]) /* Skip contraction */ - continue; - - if (!newweights[pagec]) - { - /* Alloc new page and copy the default UCA weights */ - uint size= 256*newlengths[pagec]*sizeof(uint16); - - if (!(newweights[pagec]= (uint16*) (*alloc)(size))) - return 1; - bzero((void*) newweights[pagec], size); - - for (chc=0 ; chc < 256; chc++) - { - memcpy(newweights[pagec] + chc*newlengths[pagec], - defweights[pagec] + chc*deflengths[pagec], - deflengths[pagec]*sizeof(uint16)); - } - } - - /* - Aply the alternative rule: - shift to the base character and primary difference. - */ - chc= rule[i].curr[0] & 0xFF; - chb= rule[i].base & 0xFF; - memcpy(newweights[pagec] + chc*newlengths[pagec], - defweights[pageb] + chb*deflengths[pageb], - deflengths[pageb]*sizeof(uint16)); - /* Apply primary difference */ - newweights[pagec][chc*newlengths[pagec]]+= rule[i].diff[0]; + src_uca= cs->uca ? cs->uca : &my_uca_v400; + if (!cs->caseinfo) + cs->caseinfo= &my_unicase_default; } - - /* Copy non-overwritten pages from the default UCA weights */ - for (i= 0; i < 256 ; i++) - { - if (!newweights[i]) - ((const uint16**) newweights)[i]= defweights[i]; - } - - cs->sort_order= newlengths; - cs->sort_order_big= (const uint16**) newweights; - cs->contractions= NULL; - - /* Now process contractions */ - if (ncontractions) + + if ((rc= init_weight_level(loader, &rules, 0, + &new_uca.level[0], &src_uca->level[0]))) + goto ex; + + if (!(cs->uca= (MY_UCA_INFO *) (loader->once_alloc)(sizeof(MY_UCA_INFO)))) { - if (my_uca_alloc_contractions(cs, alloc, ncontractions)) - return 1; - for (r= rfirst; r < rlast; r++) - { - uint16 *to; - if (r->curr[1]) /* Contraction */ - { - /* Mark both letters as "is contraction part" */ - my_uca_add_contraction_flag(cs, r->curr[0], MY_UCA_CNT_HEAD); - my_uca_add_contraction_flag(cs, r->curr[1], MY_UCA_CNT_TAIL); - to= my_uca_add_contraction(cs, r->curr, 2)->weight; - /* Copy weight from the reset character */ - to[0]= my_char_weight_addr(cs, r->base)[0]; - /* Apply primary difference */ - to[0]+= r->diff[0]; - } - } + rc= 1; + goto ex; } - return 0; + cs->uca[0]= new_uca; + +ex: + (loader->free)(rules.rule); + if (rc != 0 && loader->error[0]) + loader->reporter(ERROR_LEVEL, "%s", loader->error); + return rc; } @@ -8161,12 +9424,14 @@ static my_bool create_tailoring(struct charset_info_st *cs, Should work for any character set. */ -static my_bool my_coll_init_uca(struct charset_info_st *cs, - void *(*alloc)(size_t)) +static my_bool +my_coll_init_uca(struct charset_info_st *cs, MY_CHARSET_LOADER *loader) { cs->pad_char= ' '; cs->ctype= my_charset_utf8_unicode_ci.ctype; - return create_tailoring(cs, alloc); + if (!cs->caseinfo) + cs->caseinfo= &my_unicase_default; + return create_tailoring(cs, loader); } static int my_strnncoll_any_uca(CHARSET_INFO *cs, @@ -8213,7 +9478,7 @@ static int my_strnncoll_ucs2_uca(CHARSET_INFO *cs, const uchar *t, size_t tlen, my_bool t_is_prefix) { - return my_strnncoll_uca(cs, &my_ucs2_uca_scanner_handler, + return my_strnncoll_uca(cs, &my_any_uca_scanner_handler, s, slen, t, tlen, t_is_prefix); } @@ -8222,7 +9487,7 @@ static int my_strnncollsp_ucs2_uca(CHARSET_INFO *cs, const uchar *t, size_t tlen, my_bool diff_if_only_endspace_difference) { - return my_strnncollsp_uca(cs, &my_ucs2_uca_scanner_handler, + return my_strnncollsp_uca(cs, &my_any_uca_scanner_handler, s, slen, t, tlen, diff_if_only_endspace_difference); } @@ -8231,14 +9496,14 @@ static void my_hash_sort_ucs2_uca(CHARSET_INFO *cs, const uchar *s, size_t slen, ulong *n1, ulong *n2) { - my_hash_sort_uca(cs, &my_ucs2_uca_scanner_handler, s, slen, n1, n2); + my_hash_sort_uca(cs, &my_any_uca_scanner_handler, s, slen, n1, n2); } static size_t my_strnxfrm_ucs2_uca(CHARSET_INFO *cs, uchar *dst, size_t dstlen, const uchar *src, size_t srclen) { - return my_strnxfrm_uca(cs, &my_ucs2_uca_scanner_handler, + return my_strnxfrm_uca(cs, &my_any_uca_scanner_handler, dst, dstlen, src, srclen); } @@ -8268,12 +9533,11 @@ struct charset_info_st my_charset_ucs2_unicode_ci= NULL, /* ctype */ NULL, /* to_lower */ NULL, /* to_upper */ - uca_length, /* sort_order */ - NULL, /* contractions */ - uca_weight, /* sort_order_big*/ + NULL, /* sort_order */ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -8301,11 +9565,10 @@ struct charset_info_st my_charset_ucs2_icelandic_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -8333,11 +9596,10 @@ struct charset_info_st my_charset_ucs2_latvian_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -8365,11 +9627,10 @@ struct charset_info_st my_charset_ucs2_romanian_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -8397,11 +9658,10 @@ struct charset_info_st my_charset_ucs2_slovenian_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -8429,11 +9689,10 @@ struct charset_info_st my_charset_ucs2_polish_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -8461,11 +9720,10 @@ struct charset_info_st my_charset_ucs2_estonian_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -8493,11 +9751,10 @@ struct charset_info_st my_charset_ucs2_spanish_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -8525,11 +9782,10 @@ struct charset_info_st my_charset_ucs2_swedish_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -8557,11 +9813,10 @@ struct charset_info_st my_charset_ucs2_turkish_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_turkish, /* caseinfo */ + &my_unicase_turkish,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -8589,11 +9844,10 @@ struct charset_info_st my_charset_ucs2_czech_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -8622,11 +9876,10 @@ struct charset_info_st my_charset_ucs2_danish_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -8654,11 +9907,10 @@ struct charset_info_st my_charset_ucs2_lithuanian_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -8686,11 +9938,10 @@ struct charset_info_st my_charset_ucs2_slovak_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -8718,11 +9969,10 @@ struct charset_info_st my_charset_ucs2_spanish2_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -8751,11 +10001,10 @@ struct charset_info_st my_charset_ucs2_roman_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -8784,11 +10033,10 @@ struct charset_info_st my_charset_ucs2_persian_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -8817,11 +10065,10 @@ struct charset_info_st my_charset_ucs2_esperanto_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -8850,11 +10097,10 @@ struct charset_info_st my_charset_ucs2_hungarian_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -8882,11 +10128,43 @@ struct charset_info_st my_charset_ucs2_sinhala_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + &my_unicase_default, /* caseinfo */ + NULL, /* state_map */ + NULL, /* ident_map */ + 8, /* strxfrm_multiply */ + 1, /* caseup_multiply */ + 1, /* casedn_multiply */ + 2, /* mbminlen */ + 2, /* mbmaxlen */ + 9, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + ' ', /* pad char */ + 0, /* escape_with_backslash_is_dangerous */ + &my_charset_ucs2_handler, + &my_collation_ucs2_uca_handler +}; + + + +struct charset_info_st my_charset_ucs2_german2_uca_ci= +{ + 148,0,0, /* number */ + MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII, + "ucs2", /* csname */ + "ucs2_german2_ci", /* name */ + "", /* comment */ + german2, /* tailoring */ + NULL, /* ctype */ + NULL, /* to_lower */ + NULL, /* to_upper */ + NULL, /* sort_order */ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -8914,11 +10192,10 @@ struct charset_info_st my_charset_ucs2_croatian_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -8934,6 +10211,7 @@ struct charset_info_st my_charset_ucs2_croatian_uca_ci= &my_collation_ucs2_uca_handler }; + #endif @@ -8981,7 +10259,7 @@ static uchar ctype_utf8[] = { extern MY_CHARSET_HANDLER my_charset_utf8_handler; -#define MY_CS_UTF8MB3_UCA_FLAGS (MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE) +#define MY_CS_UTF8MB3_UCA_FLAGS (MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE) struct charset_info_st my_charset_utf8_unicode_ci= { @@ -8994,12 +10272,11 @@ struct charset_info_st my_charset_utf8_unicode_ci= ctype_utf8, /* ctype */ NULL, /* to_lower */ NULL, /* to_upper */ - uca_length, /* sort_order */ - NULL, /* contractions */ - uca_weight, /* sort_order_big*/ + NULL, /* sort_order */ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -9019,7 +10296,7 @@ struct charset_info_st my_charset_utf8_unicode_ci= struct charset_info_st my_charset_utf8_icelandic_uca_ci= { 193,0,0, /* number */ - MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, + MY_CS_UTF8MB3_UCA_FLAGS,/* flags */ "utf8", /* cs name */ "utf8_icelandic_ci",/* name */ "", /* comment */ @@ -9028,11 +10305,10 @@ struct charset_info_st my_charset_utf8_icelandic_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -9060,11 +10336,10 @@ struct charset_info_st my_charset_utf8_latvian_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -9092,11 +10367,10 @@ struct charset_info_st my_charset_utf8_romanian_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -9124,11 +10398,10 @@ struct charset_info_st my_charset_utf8_slovenian_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -9156,11 +10429,10 @@ struct charset_info_st my_charset_utf8_polish_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -9188,11 +10460,10 @@ struct charset_info_st my_charset_utf8_estonian_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -9220,11 +10491,10 @@ struct charset_info_st my_charset_utf8_spanish_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -9252,11 +10522,10 @@ struct charset_info_st my_charset_utf8_swedish_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -9284,11 +10553,10 @@ struct charset_info_st my_charset_utf8_turkish_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_turkish, /* caseinfo */ + &my_unicase_turkish,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -9316,11 +10584,10 @@ struct charset_info_st my_charset_utf8_czech_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -9349,11 +10616,10 @@ struct charset_info_st my_charset_utf8_danish_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -9381,11 +10647,10 @@ struct charset_info_st my_charset_utf8_lithuanian_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -9413,11 +10678,10 @@ struct charset_info_st my_charset_utf8_slovak_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -9445,11 +10709,10 @@ struct charset_info_st my_charset_utf8_spanish2_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -9477,11 +10740,10 @@ struct charset_info_st my_charset_utf8_roman_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -9509,11 +10771,10 @@ struct charset_info_st my_charset_utf8_persian_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -9541,11 +10802,10 @@ struct charset_info_st my_charset_utf8_esperanto_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -9573,11 +10833,10 @@ struct charset_info_st my_charset_utf8_hungarian_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -9605,11 +10864,42 @@ struct charset_info_st my_charset_utf8_sinhala_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + &my_unicase_default, /* caseinfo */ + NULL, /* state_map */ + NULL, /* ident_map */ + 8, /* strxfrm_multiply */ + 1, /* caseup_multiply */ + 1, /* casedn_multiply */ + 1, /* mbminlen */ + 3, /* mbmaxlen */ + 9, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + ' ', /* pad char */ + 0, /* escape_with_backslash_is_dangerous */ + &my_charset_utf8_handler, + &my_collation_any_uca_handler +}; + + +struct charset_info_st my_charset_utf8_german2_uca_ci= +{ + 212,0,0, /* number */ + MY_CS_UTF8MB3_UCA_FLAGS,/* flags */ + MY_UTF8MB3, /* cs name */ + MY_UTF8MB3 "_german2_ci",/* name */ + "", /* comment */ + german2, /* tailoring */ + ctype_utf8, /* ctype */ + NULL, /* to_lower */ + NULL, /* to_upper */ + NULL, /* sort_order */ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -9627,36 +10917,36 @@ struct charset_info_st my_charset_utf8_sinhala_uca_ci= struct charset_info_st my_charset_utf8_croatian_uca_ci= { - 213,0,0, /* number */ - MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, - "utf8", /* cs name */ - "utf8_croatian_ci", /* name */ - "", /* comment */ - croatian, /* tailoring */ - ctype_utf8, /* ctype */ - NULL, /* to_lower */ - NULL, /* to_upper */ - NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ - NULL, /* tab_to_uni */ - NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ - NULL, /* state_map */ - NULL, /* ident_map */ - 8, /* strxfrm_multiply */ - 1, /* caseup_multiply */ - 1, /* casedn_multiply */ - 1, /* mbminlen */ - 3, /* mbmaxlen */ - 9, /* min_sort_char */ - 0xFFFF, /* max_sort_char */ - ' ', /* pad char */ - 0, /* escape_with_backslash_is_dangerous */ + 213,0,0, /* number */ + MY_CS_UTF8MB3_UCA_FLAGS,/* flags */ + MY_UTF8MB3, /* cs name */ + MY_UTF8MB3 "_croatian_ci",/* name */ + "", /* comment */ + croatian, /* tailoring */ + ctype_utf8, /* ctype */ + NULL, /* to_lower */ + NULL, /* to_upper */ + NULL, /* sort_order */ + NULL, /* uca */ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + &my_unicase_default, /* caseinfo */ + NULL, /* state_map */ + NULL, /* ident_map */ + 8, /* strxfrm_multiply */ + 1, /* caseup_multiply */ + 1, /* casedn_multiply */ + 1, /* mbminlen */ + 3, /* mbmaxlen */ + 9, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + ' ', /* pad char */ + 0, /* escape_with_backslash_is_dangerous */ &my_charset_utf8_handler, &my_collation_any_uca_handler }; + #endif /* HAVE_CHARSET_utf8 */ @@ -9677,12 +10967,11 @@ struct charset_info_st my_charset_utf8mb4_unicode_ci= ctype_utf8, /* ctype */ NULL, /* to_lower */ NULL, /* to_upper */ - uca_length, /* sort_order */ - NULL, /* contractions */ - uca_weight, /* sort_order_big*/ + NULL, /* sort_order */ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -9711,11 +11000,10 @@ struct charset_info_st my_charset_utf8mb4_icelandic_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -9743,11 +11031,10 @@ struct charset_info_st my_charset_utf8mb4_latvian_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -9775,11 +11062,10 @@ struct charset_info_st my_charset_utf8mb4_romanian_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -9807,11 +11093,10 @@ struct charset_info_st my_charset_utf8mb4_slovenian_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -9839,11 +11124,10 @@ struct charset_info_st my_charset_utf8mb4_polish_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -9871,11 +11155,10 @@ struct charset_info_st my_charset_utf8mb4_estonian_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -9903,11 +11186,10 @@ struct charset_info_st my_charset_utf8mb4_spanish_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -9935,11 +11217,10 @@ struct charset_info_st my_charset_utf8mb4_swedish_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -9967,11 +11248,10 @@ struct charset_info_st my_charset_utf8mb4_turkish_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_turkish, /* caseinfo */ + &my_unicase_turkish, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -9999,11 +11279,10 @@ struct charset_info_st my_charset_utf8mb4_czech_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -10032,11 +11311,10 @@ struct charset_info_st my_charset_utf8mb4_danish_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -10064,11 +11342,10 @@ struct charset_info_st my_charset_utf8mb4_lithuanian_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -10096,11 +11373,10 @@ struct charset_info_st my_charset_utf8mb4_slovak_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -10128,11 +11404,10 @@ struct charset_info_st my_charset_utf8mb4_spanish2_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -10160,11 +11435,10 @@ struct charset_info_st my_charset_utf8mb4_roman_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -10192,11 +11466,10 @@ struct charset_info_st my_charset_utf8mb4_persian_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -10224,11 +11497,10 @@ struct charset_info_st my_charset_utf8mb4_esperanto_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -10256,11 +11528,10 @@ struct charset_info_st my_charset_utf8mb4_hungarian_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -10288,11 +11559,41 @@ struct charset_info_st my_charset_utf8mb4_sinhala_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ + NULL, /* state_map */ + NULL, /* ident_map */ + 8, /* strxfrm_multiply */ + 1, /* caseup_multiply */ + 1, /* casedn_multiply */ + 1, /* mbminlen */ + 4, /* mbmaxlen */ + 9, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + ' ', /* pad char */ + 0, /* escape_with_backslash_is_dangerous */ + &my_charset_utf8mb4_handler, + &my_collation_any_uca_handler +}; + +struct charset_info_st my_charset_utf8mb4_german2_uca_ci= +{ + 244,0,0, /* number */ + MY_CS_UTF8MB4_UCA_FLAGS,/* state */ + MY_UTF8MB4, /* csname */ + MY_UTF8MB4 "_german2_ci",/* name */ + "", /* comment */ + german2, /* tailoring */ + ctype_utf8, /* ctype */ + NULL, /* to_lower */ + NULL, /* to_upper */ + NULL, /* sort_order */ + NULL, /* uca */ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -10320,11 +11621,10 @@ struct charset_info_st my_charset_utf8mb4_croatian_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -10375,12 +11675,11 @@ struct charset_info_st my_charset_utf32_unicode_ci= NULL, /* ctype */ NULL, /* to_lower */ NULL, /* to_upper */ - uca_length, /* sort_order */ - NULL, /* contractions */ - uca_weight, /* sort_order_big*/ + NULL, /* sort_order */ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -10409,11 +11708,10 @@ struct charset_info_st my_charset_utf32_icelandic_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -10441,11 +11739,10 @@ struct charset_info_st my_charset_utf32_latvian_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -10473,11 +11770,10 @@ struct charset_info_st my_charset_utf32_romanian_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -10505,11 +11801,10 @@ struct charset_info_st my_charset_utf32_slovenian_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -10537,11 +11832,10 @@ struct charset_info_st my_charset_utf32_polish_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -10569,11 +11863,10 @@ struct charset_info_st my_charset_utf32_estonian_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -10601,11 +11894,10 @@ struct charset_info_st my_charset_utf32_spanish_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -10633,11 +11925,10 @@ struct charset_info_st my_charset_utf32_swedish_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -10665,11 +11956,10 @@ struct charset_info_st my_charset_utf32_turkish_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_turkish, /* caseinfo */ + &my_unicase_turkish, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -10697,11 +11987,10 @@ struct charset_info_st my_charset_utf32_czech_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -10730,11 +12019,10 @@ struct charset_info_st my_charset_utf32_danish_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -10762,11 +12050,10 @@ struct charset_info_st my_charset_utf32_lithuanian_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -10794,11 +12081,10 @@ struct charset_info_st my_charset_utf32_slovak_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -10826,11 +12112,10 @@ struct charset_info_st my_charset_utf32_spanish2_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -10858,11 +12143,10 @@ struct charset_info_st my_charset_utf32_roman_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -10890,11 +12174,10 @@ struct charset_info_st my_charset_utf32_persian_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -10922,11 +12205,10 @@ struct charset_info_st my_charset_utf32_esperanto_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -10954,11 +12236,10 @@ struct charset_info_st my_charset_utf32_hungarian_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -10986,11 +12267,41 @@ struct charset_info_st my_charset_utf32_sinhala_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ + NULL, /* state_map */ + NULL, /* ident_map */ + 8, /* strxfrm_multiply */ + 1, /* caseup_multiply */ + 1, /* casedn_multiply */ + 4, /* mbminlen */ + 4, /* mbmaxlen */ + 9, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + ' ', /* pad char */ + 0, /* escape_with_backslash_is_dangerous */ + &my_charset_utf32_handler, + &my_collation_utf32_uca_handler +}; + +struct charset_info_st my_charset_utf32_german2_uca_ci= +{ + 180,0,0, /* number */ + MY_CS_UTF32_UCA_FLAGS,/* state */ + "utf32", /* csname */ + "utf32_german2_ci", /* name */ + "", /* comment */ + german2, /* tailoring */ + NULL, /* ctype */ + NULL, /* to_lower */ + NULL, /* to_upper */ + NULL, /* sort_order */ + NULL, /* uca */ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -11009,8 +12320,8 @@ struct charset_info_st my_charset_utf32_sinhala_uca_ci= struct charset_info_st my_charset_utf32_croatian_uca_ci= { 214,0,0, /* number */ - MY_CS_UTF32_UCA_FLAGS /* state */, - "utf32", /* cs name */ + MY_CS_UTF32_UCA_FLAGS,/* state */ + "utf32", /* csname */ "utf32_croatian_ci", /* name */ "", /* comment */ croatian, /* tailoring */ @@ -11018,11 +12329,10 @@ struct charset_info_st my_charset_utf32_croatian_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -11037,6 +12347,7 @@ struct charset_info_st my_charset_utf32_croatian_uca_ci= &my_charset_utf32_handler, &my_collation_utf32_uca_handler }; + #endif /* HAVE_CHARSET_utf32 */ @@ -11073,12 +12384,11 @@ struct charset_info_st my_charset_utf16_unicode_ci= NULL, /* ctype */ NULL, /* to_lower */ NULL, /* to_upper */ - uca_length, /* sort_order */ - NULL, /* contractions */ - uca_weight, /* sort_order_big*/ + NULL, /* sort_order */ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -11107,11 +12417,10 @@ struct charset_info_st my_charset_utf16_icelandic_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -11139,11 +12448,10 @@ struct charset_info_st my_charset_utf16_latvian_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -11171,11 +12479,10 @@ struct charset_info_st my_charset_utf16_romanian_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -11203,11 +12510,10 @@ struct charset_info_st my_charset_utf16_slovenian_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -11235,11 +12541,10 @@ struct charset_info_st my_charset_utf16_polish_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -11267,11 +12572,10 @@ struct charset_info_st my_charset_utf16_estonian_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -11299,11 +12603,10 @@ struct charset_info_st my_charset_utf16_spanish_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -11331,11 +12634,10 @@ struct charset_info_st my_charset_utf16_swedish_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -11363,11 +12665,10 @@ struct charset_info_st my_charset_utf16_turkish_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_turkish, /* caseinfo */ + &my_unicase_turkish, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -11395,11 +12696,10 @@ struct charset_info_st my_charset_utf16_czech_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -11428,11 +12728,10 @@ struct charset_info_st my_charset_utf16_danish_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -11460,11 +12759,10 @@ struct charset_info_st my_charset_utf16_lithuanian_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -11492,11 +12790,10 @@ struct charset_info_st my_charset_utf16_slovak_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -11524,11 +12821,10 @@ struct charset_info_st my_charset_utf16_spanish2_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -11556,11 +12852,10 @@ struct charset_info_st my_charset_utf16_roman_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -11588,11 +12883,10 @@ struct charset_info_st my_charset_utf16_persian_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -11620,11 +12914,10 @@ struct charset_info_st my_charset_utf16_esperanto_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -11652,11 +12945,10 @@ struct charset_info_st my_charset_utf16_hungarian_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default,/* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -11684,11 +12976,10 @@ struct charset_info_st my_charset_utf16_sinhala_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default,/* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 8, /* strxfrm_multiply */ @@ -11704,114 +12995,72 @@ struct charset_info_st my_charset_utf16_sinhala_uca_ci= &my_collation_utf16_uca_handler }; -struct charset_info_st my_charset_utf16_croatian_uca_ci= +struct charset_info_st my_charset_utf16_german2_uca_ci= { - 215,0,0, /* number */ - MY_CS_UTF16_UCA_FLAGS /* state */, - "utf16", /* cs name */ - "utf16_croatian_ci", /* name */ - "", /* comment */ - croatian, /* tailoring */ - NULL, /* ctype */ - NULL, /* to_lower */ - NULL, /* to_upper */ - NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ - NULL, /* tab_to_uni */ - NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ - NULL, /* state_map */ - NULL, /* ident_map */ - 8, /* strxfrm_multiply */ - 1, /* caseup_multiply */ - 1, /* casedn_multiply */ - 2, /* mbminlen */ - 4, /* mbmaxlen */ - 9, /* min_sort_char */ - 0xFFFF, /* max_sort_char */ - ' ', /* pad char */ - 0, /* escape_with_backslash_is_dangerous */ + 121,0,0, /* number */ + MY_CS_UTF16_UCA_FLAGS,/* state */ + "utf16", /* cs name */ + "utf16_german2_ci",/* name */ + "", /* comment */ + german2, /* tailoring */ + NULL, /* ctype */ + NULL, /* to_lower */ + NULL, /* to_upper */ + NULL, /* sort_order */ + NULL, /* uca */ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + &my_unicase_default,/* caseinfo */ + NULL, /* state_map */ + NULL, /* ident_map */ + 8, /* strxfrm_multiply */ + 1, /* caseup_multiply */ + 1, /* casedn_multiply */ + 2, /* mbminlen */ + 4, /* mbmaxlen */ + 9, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + ' ', /* pad char */ + 0, /* escape_with_backslash_is_dangerous */ &my_charset_utf16_handler, &my_collation_utf16_uca_handler }; -#endif /* HAVE_CHARSET_utf16 */ - -#endif /* HAVE_UCA_COLLATIONS */ - -/** - Check if UCA data has contractions (public version) - - @cs Pointer to CHARSET_INFO data - @retval 0 - no contraction, 1 - have contractions. -*/ - -my_bool -my_cs_have_contractions(CHARSET_INFO *cs) -{ - return cs->contractions != NULL; -} - -/** - Check if a character can be contraction head - - @cs Pointer to CHARSET_INFO data - @wc Code point - - @retval 0 - cannot be contraction head - @retval 1 - can be contraction head -*/ - -my_bool -my_cs_can_be_contraction_head(CHARSET_INFO *cs, my_wc_t wc) +struct charset_info_st my_charset_utf16_croatian_uca_ci= { - return cs->contractions->flags[wc & MY_UCA_CNT_FLAG_MASK] & MY_UCA_CNT_HEAD; -} - - -/** - Check if a character can be contraction tail - - @cs Pointer to CHARSET_INFO data - @wc Code point - - @retval 0 - cannot be contraction tail - @retval 1 - can be contraction tail -*/ + 215,0,0, /* number */ + MY_CS_UTF16_UCA_FLAGS,/* state */ + "utf16", /* cs name */ + "utf16_croatian_ci",/* name */ + "", /* comment */ + croatian, /* tailoring */ + NULL, /* ctype */ + NULL, /* to_lower */ + NULL, /* to_upper */ + NULL, /* sort_order */ + NULL, /* uca */ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + &my_unicase_default,/* caseinfo */ + NULL, /* state_map */ + NULL, /* ident_map */ + 8, /* strxfrm_multiply */ + 1, /* caseup_multiply */ + 1, /* casedn_multiply */ + 2, /* mbminlen */ + 4, /* mbmaxlen */ + 9, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + ' ', /* pad char */ + 0, /* escape_with_backslash_is_dangerous */ + &my_charset_utf16_handler, + &my_collation_utf16_uca_handler +}; -my_bool -my_cs_can_be_contraction_tail(CHARSET_INFO *cs, my_wc_t wc) -{ - return cs->contractions->flags[wc & MY_UCA_CNT_FLAG_MASK] & MY_UCA_CNT_TAIL; -} +#endif /* HAVE_CHARSET_utf16 */ -/** - Find a contraction and return its weight array - - @cs Pointer to CHARSET data - @wc1 First character - @wc2 Second character - - @return Weight array - @retval NULL - no contraction found - @retval ptr - contraction weight array -*/ -const uint16 * -my_cs_contraction2_weight(CHARSET_INFO *cs, my_wc_t wc1, my_wc_t wc2) -{ - const MY_CONTRACTIONS *list= cs->contractions; - const MY_CONTRACTION *c, *last; - for (c= list->item, last= &list->item[list->nitems]; c < last; c++) - { - if (c->ch[0] == wc1 && c->ch[1] == wc2) - { - return c->weight; - } - } - return NULL; -} +#endif /* HAVE_UCA_COLLATIONS */ diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c index 26f15584bcd..a5845a26917 100644 --- a/strings/ctype-ucs2.c +++ b/strings/ctype-ucs2.c @@ -1161,31 +1161,31 @@ my_uni_utf16(CHARSET_INFO *cs __attribute__((unused)), static inline void -my_tolower_utf16(MY_UNICASE_INFO * const* uni_plane, my_wc_t *wc) +my_tolower_utf16(MY_UNICASE_INFO *uni_plane, my_wc_t *wc) { - uint page= *wc >> 8; - if (page < 256 && uni_plane[page]) - *wc= uni_plane[page][*wc & 0xFF].tolower; + MY_UNICASE_CHARACTER *page; + if ((*wc <= uni_plane->maxchar) && (page= uni_plane->page[*wc >> 8])) + *wc= page[*wc & 0xFF].tolower; } static inline void -my_toupper_utf16(MY_UNICASE_INFO * const* uni_plane, my_wc_t *wc) +my_toupper_utf16(MY_UNICASE_INFO *uni_plane, my_wc_t *wc) { - uint page= *wc >> 8; - if (page < 256 && uni_plane[page]) - *wc= uni_plane[page][*wc & 0xFF].toupper; + MY_UNICASE_CHARACTER *page; + if ((*wc <= uni_plane->maxchar) && (page= uni_plane->page[*wc >> 8])) + *wc= page[*wc & 0xFF].toupper; } static inline void -my_tosort_utf16(MY_UNICASE_INFO * const* uni_plane, my_wc_t *wc) +my_tosort_utf16(MY_UNICASE_INFO *uni_plane, my_wc_t *wc) { - uint page= *wc >> 8; - if (page < 256) + if (*wc <= uni_plane->maxchar) { - if (uni_plane[page]) - *wc= uni_plane[page][*wc & 0xFF].sort; + MY_UNICASE_CHARACTER *page; + if ((page= uni_plane->page[*wc >> 8])) + *wc= page[*wc & 0xFF].sort; } else { @@ -1194,6 +1194,7 @@ my_tosort_utf16(MY_UNICASE_INFO * const* uni_plane, my_wc_t *wc) } + static size_t my_caseup_utf16(CHARSET_INFO *cs, char *src, size_t srclen, char *dst __attribute__((unused)), @@ -1204,7 +1205,7 @@ my_caseup_utf16(CHARSET_INFO *cs, char *src, size_t srclen, my_charset_conv_wc_mb wc_mb= cs->cset->wc_mb; int res; char *srcend= src + srclen; - MY_UNICASE_INFO *const *uni_plane= cs->caseinfo; + MY_UNICASE_INFO *uni_plane= cs->caseinfo; DBUG_ASSERT(src == dst && srclen == dstlen); while ((src < srcend) && @@ -1227,7 +1228,7 @@ my_hash_sort_utf16(CHARSET_INFO *cs, const uchar *s, size_t slen, my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc; int res; const uchar *e= s + cs->cset->lengthsp(cs, (const char *) s, slen); - MY_UNICASE_INFO *const *uni_plane= cs->caseinfo; + MY_UNICASE_INFO *uni_plane= cs->caseinfo; while ((s < e) && (res= mb_wc(cs, &wc, (uchar *) s, (uchar *) e)) > 0) { @@ -1251,7 +1252,7 @@ my_casedn_utf16(CHARSET_INFO *cs, char *src, size_t srclen, my_charset_conv_wc_mb wc_mb= cs->cset->wc_mb; int res; char *srcend= src + srclen; - MY_UNICASE_INFO *const *uni_plane= cs->caseinfo; + MY_UNICASE_INFO *uni_plane= cs->caseinfo; DBUG_ASSERT(src == dst && srclen == dstlen); while ((src < srcend) && @@ -1277,7 +1278,7 @@ my_strnncoll_utf16(CHARSET_INFO *cs, my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc; const uchar *se= s + slen; const uchar *te= t + tlen; - MY_UNICASE_INFO *const *uni_plane= cs->caseinfo; + MY_UNICASE_INFO *uni_plane= cs->caseinfo; while (s < se && t < te) { @@ -1341,7 +1342,7 @@ my_strnncollsp_utf16(CHARSET_INFO *cs, my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc); my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc; const uchar *se= s + slen, *te= t + tlen; - MY_UNICASE_INFO *const *uni_plane= cs->caseinfo; + MY_UNICASE_INFO *uni_plane= cs->caseinfo; DBUG_ASSERT((slen % 2) == 0); DBUG_ASSERT((tlen % 2) == 0); @@ -1483,7 +1484,7 @@ my_wildcmp_utf16_ci(CHARSET_INFO *cs, const char *wildstr,const char *wildend, int escape, int w_one, int w_many) { - MY_UNICASE_INFO *const *uni_plane= cs->caseinfo; + MY_UNICASE_INFO *uni_plane= cs->caseinfo; return my_wildcmp_unicode(cs, str, str_end, wildstr, wildend, escape, w_one, w_many, uni_plane); } @@ -1695,11 +1696,10 @@ struct charset_info_st my_charset_utf16_general_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 1, /* strxfrm_multiply */ @@ -1728,11 +1728,10 @@ struct charset_info_st my_charset_utf16_bin= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 1, /* strxfrm_multiply */ @@ -1864,11 +1863,10 @@ struct charset_info_st my_charset_utf16le_general_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 1, /* strxfrm_multiply */ @@ -1897,11 +1895,10 @@ struct charset_info_st my_charset_utf16le_bin= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 1, /* strxfrm_multiply */ @@ -1950,31 +1947,31 @@ my_uni_utf32(CHARSET_INFO *cs __attribute__((unused)), static inline void -my_tolower_utf32(MY_UNICASE_INFO * const* uni_plane, my_wc_t *wc) +my_tolower_utf32(MY_UNICASE_INFO *uni_plane, my_wc_t *wc) { - uint page= *wc >> 8; - if (page < 256 && uni_plane[page]) - *wc= uni_plane[page][*wc & 0xFF].tolower; + MY_UNICASE_CHARACTER *page; + if ((*wc <= uni_plane->maxchar) && (page= uni_plane->page[*wc >> 8])) + *wc= page[*wc & 0xFF].tolower; } static inline void -my_toupper_utf32(MY_UNICASE_INFO * const* uni_plane, my_wc_t *wc) +my_toupper_utf32(MY_UNICASE_INFO *uni_plane, my_wc_t *wc) { - uint page= *wc >> 8; - if (page < 256 && uni_plane[page]) - *wc= uni_plane[page][*wc & 0xFF].toupper; + MY_UNICASE_CHARACTER *page; + if ((*wc <= uni_plane->maxchar) && (page= uni_plane->page[*wc >> 8])) + *wc= page[*wc & 0xFF].toupper; } static inline void -my_tosort_utf32(MY_UNICASE_INFO *const* uni_plane, my_wc_t *wc) +my_tosort_utf32(MY_UNICASE_INFO *uni_plane, my_wc_t *wc) { - uint page= *wc >> 8; - if (page < 256) + if (*wc <= uni_plane->maxchar) { - if (uni_plane[page]) - *wc= uni_plane[page][*wc & 0xFF].sort; + MY_UNICASE_CHARACTER *page; + if ((page= uni_plane->page[*wc >> 8])) + *wc= page[*wc & 0xFF].sort; } else { @@ -1991,7 +1988,7 @@ my_caseup_utf32(CHARSET_INFO *cs, char *src, size_t srclen, my_wc_t wc; int res; char *srcend= src + srclen; - MY_UNICASE_INFO *const *uni_plane= cs->caseinfo; + MY_UNICASE_INFO *uni_plane= cs->caseinfo; DBUG_ASSERT(src == dst && srclen == dstlen); while ((src < srcend) && @@ -2021,7 +2018,7 @@ my_hash_sort_utf32(CHARSET_INFO *cs, const uchar *s, size_t slen, my_wc_t wc; int res; const uchar *e= s + slen; - MY_UNICASE_INFO *const *uni_plane= cs->caseinfo; + MY_UNICASE_INFO *uni_plane= cs->caseinfo; /* Skip trailing spaces */ while (e > s + 3 && e[-1] == ' ' && !e[-2] && !e[-3] && !e[-4]) @@ -2047,7 +2044,7 @@ my_casedn_utf32(CHARSET_INFO *cs, char *src, size_t srclen, my_wc_t wc; int res; char *srcend= src + srclen; - MY_UNICASE_INFO *const *uni_plane= cs->caseinfo; + MY_UNICASE_INFO *uni_plane= cs->caseinfo; DBUG_ASSERT(src == dst && srclen == dstlen); while ((res= my_utf32_uni(cs, &wc, (uchar*) src, (uchar*) srcend)) > 0) @@ -2070,7 +2067,7 @@ my_strnncoll_utf32(CHARSET_INFO *cs, my_wc_t UNINIT_VAR(s_wc),UNINIT_VAR(t_wc); const uchar *se= s + slen; const uchar *te= t + tlen; - MY_UNICASE_INFO *const *uni_plane= cs->caseinfo; + MY_UNICASE_INFO *uni_plane= cs->caseinfo; while (s < se && t < te) { @@ -2134,7 +2131,7 @@ my_strnncollsp_utf32(CHARSET_INFO *cs, int res; my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc); const uchar *se= s + slen, *te= t + tlen; - MY_UNICASE_INFO *const *uni_plane= cs->caseinfo; + MY_UNICASE_INFO *uni_plane= cs->caseinfo; DBUG_ASSERT((slen % 4) == 0); DBUG_ASSERT((tlen % 4) == 0); @@ -2582,7 +2579,7 @@ my_wildcmp_utf32_ci(CHARSET_INFO *cs, const char *wildstr, const char *wildend, int escape, int w_one, int w_many) { - MY_UNICASE_INFO *const *uni_plane= cs->caseinfo; + MY_UNICASE_INFO *uni_plane= cs->caseinfo; return my_wildcmp_unicode(cs, str, str_end, wildstr, wildend, escape, w_one, w_many, uni_plane); } @@ -2790,11 +2787,10 @@ struct charset_info_st my_charset_utf32_general_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 1, /* strxfrm_multiply */ @@ -2823,11 +2819,10 @@ struct charset_info_st my_charset_utf32_bin= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 1, /* strxfrm_multiply */ @@ -2934,32 +2929,29 @@ static int my_uni_ucs2(CHARSET_INFO *cs __attribute__((unused)) , static inline void -my_tolower_ucs2(MY_UNICASE_INFO *const *uni_plane, my_wc_t *wc) +my_tolower_ucs2(MY_UNICASE_INFO *uni_plane, my_wc_t *wc) { - uint page= *wc >> 8; - DBUG_ASSERT(page < 256); - if (uni_plane[page]) - *wc= uni_plane[page][*wc & 0xFF].tolower; + MY_UNICASE_CHARACTER *page; + if ((page= uni_plane->page[(*wc >> 8) & 0xFF])) + *wc= page[*wc & 0xFF].tolower; } static inline void -my_toupper_ucs2(MY_UNICASE_INFO *const *uni_plane, my_wc_t *wc) +my_toupper_ucs2(MY_UNICASE_INFO *uni_plane, my_wc_t *wc) { - uint page= *wc >> 8; - DBUG_ASSERT(page < 256); - if (uni_plane[page]) - *wc= uni_plane[page][*wc & 0xFF].toupper; + MY_UNICASE_CHARACTER *page; + if ((page= uni_plane->page[(*wc >> 8) & 0xFF])) + *wc= page[*wc & 0xFF].toupper; } static inline void -my_tosort_ucs2(MY_UNICASE_INFO *const *uni_plane, my_wc_t *wc) +my_tosort_ucs2(MY_UNICASE_INFO *uni_plane, my_wc_t *wc) { - uint page= *wc >> 8; - DBUG_ASSERT(page < 256); - if (uni_plane[page]) - *wc= uni_plane[page][*wc & 0xFF].sort; + MY_UNICASE_CHARACTER *page; + if ((page= uni_plane->page[(*wc >> 8) & 0xFF])) + *wc= page[*wc & 0xFF].sort; } static size_t my_caseup_ucs2(CHARSET_INFO *cs, char *src, size_t srclen, @@ -2969,7 +2961,7 @@ static size_t my_caseup_ucs2(CHARSET_INFO *cs, char *src, size_t srclen, my_wc_t wc; int res; char *srcend= src + srclen; - MY_UNICASE_INFO *const *uni_plane= cs->caseinfo; + MY_UNICASE_INFO *uni_plane= cs->caseinfo; DBUG_ASSERT(src == dst && srclen == dstlen); while ((src < srcend) && @@ -2990,7 +2982,7 @@ static void my_hash_sort_ucs2(CHARSET_INFO *cs, const uchar *s, size_t slen, my_wc_t wc; int res; const uchar *e=s+slen; - MY_UNICASE_INFO *const *uni_plane= cs->caseinfo; + MY_UNICASE_INFO *uni_plane= cs->caseinfo; while (e > s+1 && e[-1] == ' ' && e[-2] == '\0') e-= 2; @@ -3014,7 +3006,7 @@ static size_t my_casedn_ucs2(CHARSET_INFO *cs, char *src, size_t srclen, my_wc_t wc; int res; char *srcend= src + srclen; - MY_UNICASE_INFO *const *uni_plane= cs->caseinfo; + MY_UNICASE_INFO *uni_plane= cs->caseinfo; DBUG_ASSERT(src == dst && srclen == dstlen); while ((src < srcend) && @@ -3062,7 +3054,7 @@ static int my_strnncoll_ucs2(CHARSET_INFO *cs, my_wc_t UNINIT_VAR(s_wc),UNINIT_VAR(t_wc); const uchar *se=s+slen; const uchar *te=t+tlen; - MY_UNICASE_INFO *const *uni_plane= cs->caseinfo; + MY_UNICASE_INFO *uni_plane= cs->caseinfo; while ( s < se && t < te ) { @@ -3124,7 +3116,7 @@ static int my_strnncollsp_ucs2(CHARSET_INFO *cs __attribute__((unused)), { const uchar *se, *te; size_t minlen; - MY_UNICASE_INFO *const *uni_plane= cs->caseinfo; + MY_UNICASE_INFO *uni_plane= cs->caseinfo; /* extra safety to make sure the lengths are even numbers */ slen&= ~1; @@ -3135,11 +3127,11 @@ static int my_strnncollsp_ucs2(CHARSET_INFO *cs __attribute__((unused)), for (minlen= MY_MIN(slen, tlen); minlen; minlen-= 2) { - int s_wc = uni_plane[s[0]] ? (int) uni_plane[s[0]][s[1]].sort : - (((int) s[0]) << 8) + (int) s[1]; + int s_wc = uni_plane->page[s[0]] ? (int) uni_plane->page[s[0]][s[1]].sort : + (((int) s[0]) << 8) + (int) s[1]; - int t_wc = uni_plane[t[0]] ? (int) uni_plane[t[0]][t[1]].sort : - (((int) t[0]) << 8) + (int) t[1]; + int t_wc = uni_plane->page[t[0]] ? (int) uni_plane->page[t[0]][t[1]].sort : + (((int) t[0]) << 8) + (int) t[1]; if ( s_wc != t_wc ) return s_wc > t_wc ? 1 : -1; @@ -3220,7 +3212,7 @@ int my_wildcmp_ucs2_ci(CHARSET_INFO *cs, const char *wildstr,const char *wildend, int escape, int w_one, int w_many) { - MY_UNICASE_INFO *const *uni_plane= cs->caseinfo; + MY_UNICASE_INFO *uni_plane= cs->caseinfo; return my_wildcmp_unicode(cs,str,str_end,wildstr,wildend, escape,w_one,w_many,uni_plane); } @@ -3412,11 +3404,10 @@ struct charset_info_st my_charset_ucs2_general_ci= to_lower_ucs2, /* to_lower */ to_upper_ucs2, /* to_upper */ to_upper_ucs2, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 1, /* strxfrm_multiply */ @@ -3445,11 +3436,10 @@ struct charset_info_st my_charset_ucs2_general_mysql500_ci= to_lower_ucs2, /* to_lower */ to_upper_ucs2, /* to_upper */ to_upper_ucs2, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big */ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_mysql500, /* caseinfo */ + &my_unicase_mysql500, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 1, /* strxfrm_multiply */ @@ -3478,11 +3468,10 @@ struct charset_info_st my_charset_ucs2_bin= to_lower_ucs2, /* to_lower */ to_upper_ucs2, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 1, /* strxfrm_multiply */ diff --git a/strings/ctype-ujis.c b/strings/ctype-ujis.c index 2743efc4087..0f405825830 100644 --- a/strings/ctype-ujis.c +++ b/strings/ctype-ujis.c @@ -65988,7 +65988,7 @@ my_wc_mb_euc_jp(CHARSET_INFO *cs __attribute__((unused)), /* Case info pages for JIS-X-0208 range */ -static MY_UNICASE_INFO cA2[256]= +static MY_UNICASE_CHARACTER cA2[256]= { {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, /* xx00 */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -66109,7 +66109,7 @@ static MY_UNICASE_INFO cA2[256]= }; -static MY_UNICASE_INFO cA3[256]= +static MY_UNICASE_CHARACTER cA3[256]= { {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, /* xx00 */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -66230,7 +66230,7 @@ static MY_UNICASE_INFO cA3[256]= }; -static MY_UNICASE_INFO cA6[256]= +static MY_UNICASE_CHARACTER cA6[256]= { {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, /* xx00 */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -66351,7 +66351,7 @@ static MY_UNICASE_INFO cA6[256]= }; -static MY_UNICASE_INFO cA7[256]= +static MY_UNICASE_CHARACTER cA7[256]= { {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, /* xx00 */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -66473,7 +66473,7 @@ static MY_UNICASE_INFO cA7[256]= /* Case info pages for JIS-X-0212 range */ -static MY_UNICASE_INFO c8FA6[]= +static MY_UNICASE_CHARACTER c8FA6[]= { {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, /* xx00 */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -66594,7 +66594,7 @@ static MY_UNICASE_INFO c8FA6[]= }; -static MY_UNICASE_INFO c8FA7[]= +static MY_UNICASE_CHARACTER c8FA7[]= { {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, /* xx00 */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -66715,7 +66715,7 @@ static MY_UNICASE_INFO c8FA7[]= }; -static MY_UNICASE_INFO c8FA9[]= +static MY_UNICASE_CHARACTER c8FA9[]= { {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, /* xx00 */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -66836,7 +66836,7 @@ static MY_UNICASE_INFO c8FA9[]= }; -static MY_UNICASE_INFO c8FAA[]= +static MY_UNICASE_CHARACTER c8FAA[]= { {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, /* xx00 */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -66957,7 +66957,7 @@ static MY_UNICASE_INFO c8FAA[]= }; -static MY_UNICASE_INFO c8FAB[]= +static MY_UNICASE_CHARACTER c8FAB[]= { {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, /* xx00 */ {0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, @@ -67078,7 +67078,7 @@ static MY_UNICASE_INFO c8FAB[]= }; -static MY_UNICASE_INFO *my_caseinfo_ujis[512]= +static MY_UNICASE_CHARACTER *my_caseinfo_pages_ujis[512]= { /* JIS-X-0208 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 0 */ @@ -67148,6 +67148,15 @@ static MY_UNICASE_INFO *my_caseinfo_ujis[512]= NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* F */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL }; + +static MY_UNICASE_INFO my_caseinfo_ujis= +{ + 0x0FFFF, + my_caseinfo_pages_ujis +}; + + + #endif /* HAVE_CHARSET_ujis */ @@ -67158,11 +67167,11 @@ static MY_UNICASE_INFO *my_caseinfo_ujis[512]= UJIS and EUCJPMS share the same UPPER/LOWER functions. */ -static MY_UNICASE_INFO* +static MY_UNICASE_CHARACTER* get_case_info_for_ch(CHARSET_INFO *cs, uint plane, uint page, uint offs) { - MY_UNICASE_INFO *p; - return (p= cs->caseinfo[page + plane * 256]) ? &p[offs & 0xFF] : NULL; + MY_UNICASE_CHARACTER *p; + return (p= cs->caseinfo->page[page + plane * 256]) ? &p[offs & 0xFF] : NULL; } @@ -67183,7 +67192,7 @@ my_casefold_ujis(CHARSET_INFO *cs, size_t mblen= my_ismbchar(cs, src, srcend); if (mblen) { - MY_UNICASE_INFO *ch; + MY_UNICASE_CHARACTER *ch; ch= (mblen == 2) ? get_case_info_for_ch(cs, 0, (uchar) src[0], (uchar) src[1]) : get_case_info_for_ch(cs, 1, (uchar) src[1], (uchar) src[2]); @@ -67304,11 +67313,10 @@ struct charset_info_st my_charset_ujis_japanese_ci= to_lower_ujis, to_upper_ujis, sort_order_ujis, - NULL, /* sort_order_big*/ - NULL, /* contractions */ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_caseinfo_ujis, /* caseinfo */ + &my_caseinfo_ujis, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 1, /* strxfrm_multiply */ @@ -67337,11 +67345,10 @@ struct charset_info_st my_charset_ujis_bin= to_lower_ujis, to_upper_ujis, NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_caseinfo_ujis, /* caseinfo */ + &my_caseinfo_ujis, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 1, /* strxfrm_multiply */ diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c index fe25f288d5f..ae891b43d37 100644 --- a/strings/ctype-utf8.c +++ b/strings/ctype-utf8.c @@ -60,7 +60,7 @@ #include "my_uctype.h" -static MY_UNICASE_INFO plane00[]={ +static MY_UNICASE_CHARACTER plane00[]={ {0x0000,0x0000,0x0000}, {0x0001,0x0001,0x0001}, {0x0002,0x0002,0x0002}, {0x0003,0x0003,0x0003}, {0x0004,0x0004,0x0004}, {0x0005,0x0005,0x0005}, @@ -196,7 +196,7 @@ static MY_UNICASE_INFO plane00[]={ Almost similar to plane00, but maps sorting order for U+00DF to 0x00DF instead of 0x0053. */ -static MY_UNICASE_INFO plane00_mysql500[]={ +static MY_UNICASE_CHARACTER plane00_mysql500[]={ {0x0000,0x0000,0x0000}, {0x0001,0x0001,0x0001}, {0x0002,0x0002,0x0002}, {0x0003,0x0003,0x0003}, {0x0004,0x0004,0x0004}, {0x0005,0x0005,0x0005}, @@ -328,7 +328,7 @@ static MY_UNICASE_INFO plane00_mysql500[]={ }; -static MY_UNICASE_INFO plane01[]={ +static MY_UNICASE_CHARACTER plane01[]={ {0x0100,0x0101,0x0041}, {0x0100,0x0101,0x0041}, {0x0102,0x0103,0x0041}, {0x0102,0x0103,0x0041}, {0x0104,0x0105,0x0041}, {0x0104,0x0105,0x0041}, @@ -459,7 +459,7 @@ static MY_UNICASE_INFO plane01[]={ {0x01FE,0x01FF,0x00D8}, {0x01FE,0x01FF,0x00D8} }; -static MY_UNICASE_INFO plane02[]={ +static MY_UNICASE_CHARACTER plane02[]={ {0x0200,0x0201,0x0041}, {0x0200,0x0201,0x0041}, {0x0202,0x0203,0x0041}, {0x0202,0x0203,0x0041}, {0x0204,0x0205,0x0045}, {0x0204,0x0205,0x0045}, @@ -590,7 +590,7 @@ static MY_UNICASE_INFO plane02[]={ {0x02FE,0x02FE,0x02FE}, {0x02FF,0x02FF,0x02FF} }; -static MY_UNICASE_INFO plane03[]={ +static MY_UNICASE_CHARACTER plane03[]={ {0x0300,0x0300,0x0300}, {0x0301,0x0301,0x0301}, {0x0302,0x0302,0x0302}, {0x0303,0x0303,0x0303}, {0x0304,0x0304,0x0304}, {0x0305,0x0305,0x0305}, @@ -721,7 +721,7 @@ static MY_UNICASE_INFO plane03[]={ {0x03FE,0x03FE,0x03FE}, {0x03FF,0x03FF,0x03FF} }; -static MY_UNICASE_INFO plane04[]={ +static MY_UNICASE_CHARACTER plane04[]={ {0x0400,0x0450,0x0415}, {0x0401,0x0451,0x0415}, {0x0402,0x0452,0x0402}, {0x0403,0x0453,0x0413}, {0x0404,0x0454,0x0404}, {0x0405,0x0455,0x0405}, @@ -852,7 +852,7 @@ static MY_UNICASE_INFO plane04[]={ {0x04FE,0x04FE,0x04FE}, {0x04FF,0x04FF,0x04FF} }; -static MY_UNICASE_INFO plane05[]={ +static MY_UNICASE_CHARACTER plane05[]={ {0x0500,0x0500,0x0500}, {0x0501,0x0501,0x0501}, {0x0502,0x0502,0x0502}, {0x0503,0x0503,0x0503}, {0x0504,0x0504,0x0504}, {0x0505,0x0505,0x0505}, @@ -983,7 +983,7 @@ static MY_UNICASE_INFO plane05[]={ {0x05FE,0x05FE,0x05FE}, {0x05FF,0x05FF,0x05FF} }; -static MY_UNICASE_INFO plane1E[]={ +static MY_UNICASE_CHARACTER plane1E[]={ {0x1E00,0x1E01,0x0041}, {0x1E00,0x1E01,0x0041}, {0x1E02,0x1E03,0x0042}, {0x1E02,0x1E03,0x0042}, {0x1E04,0x1E05,0x0042}, {0x1E04,0x1E05,0x0042}, @@ -1114,7 +1114,7 @@ static MY_UNICASE_INFO plane1E[]={ {0x1EFE,0x1EFE,0x1EFE}, {0x1EFF,0x1EFF,0x1EFF} }; -static MY_UNICASE_INFO plane1F[]={ +static MY_UNICASE_CHARACTER plane1F[]={ {0x1F08,0x1F00,0x0391}, {0x1F09,0x1F01,0x0391}, {0x1F0A,0x1F02,0x0391}, {0x1F0B,0x1F03,0x0391}, {0x1F0C,0x1F04,0x0391}, {0x1F0D,0x1F05,0x0391}, @@ -1245,7 +1245,7 @@ static MY_UNICASE_INFO plane1F[]={ {0x1FFE,0x1FFE,0x1FFE}, {0x1FFF,0x1FFF,0x1FFF} }; -static MY_UNICASE_INFO plane21[]={ +static MY_UNICASE_CHARACTER plane21[]={ {0x2100,0x2100,0x2100}, {0x2101,0x2101,0x2101}, {0x2102,0x2102,0x2102}, {0x2103,0x2103,0x2103}, {0x2104,0x2104,0x2104}, {0x2105,0x2105,0x2105}, @@ -1376,7 +1376,7 @@ static MY_UNICASE_INFO plane21[]={ {0x21FE,0x21FE,0x21FE}, {0x21FF,0x21FF,0x21FF} }; -static MY_UNICASE_INFO plane24[]={ +static MY_UNICASE_CHARACTER plane24[]={ {0x2400,0x2400,0x2400}, {0x2401,0x2401,0x2401}, {0x2402,0x2402,0x2402}, {0x2403,0x2403,0x2403}, {0x2404,0x2404,0x2404}, {0x2405,0x2405,0x2405}, @@ -1507,7 +1507,7 @@ static MY_UNICASE_INFO plane24[]={ {0x24FE,0x24FE,0x24FE}, {0x24FF,0x24FF,0x24FF} }; -static MY_UNICASE_INFO planeFF[]={ +static MY_UNICASE_CHARACTER planeFF[]={ {0xFF00,0xFF00,0xFF00}, {0xFF01,0xFF01,0xFF01}, {0xFF02,0xFF02,0xFF02}, {0xFF03,0xFF03,0xFF03}, {0xFF04,0xFF04,0xFF04}, {0xFF05,0xFF05,0xFF05}, @@ -1638,7 +1638,9 @@ static MY_UNICASE_INFO planeFF[]={ {0xFFFE,0xFFFE,0xFFFE}, {0xFFFF,0xFFFF,0xFFFF} }; -MY_UNICASE_INFO *const my_unicase_default[256]={ + +static MY_UNICASE_CHARACTER *my_unicase_pages_default[256]= +{ plane00, plane01, plane02, plane03, plane04, plane05, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, @@ -1671,14 +1673,20 @@ MY_UNICASE_INFO *const my_unicase_default[256]={ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, planeFF +}; + +MY_UNICASE_INFO my_unicase_default= +{ + 0xFFFF, + my_unicase_pages_default }; /* Reproduce old utf8_general_ci behaviour before we fixed Bug#27877. */ -MY_UNICASE_INFO *const my_unicase_mysql500[256]={ +MY_UNICASE_CHARACTER *my_unicase_pages_mysql500[256]={ plane00_mysql500, plane01, plane02, plane03, plane04, plane05, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, @@ -1716,6 +1724,13 @@ MY_UNICASE_INFO *const my_unicase_mysql500[256]={ }; +MY_UNICASE_INFO my_unicase_mysql500= +{ + 0xFFFF, + my_unicase_pages_mysql500 +}; + + /* Turkish lower/upper mapping: 1. LOWER(0x0049 LATIN CAPITAL LETTER I) -> @@ -1724,7 +1739,7 @@ MY_UNICASE_INFO *const my_unicase_mysql500[256]={ 0x0130 LATIN CAPITAL LETTER I WITH DOT ABOVE */ -static MY_UNICASE_INFO turk00[]= +static MY_UNICASE_CHARACTER turk00[]= { {0x0000,0x0000,0x0000}, {0x0001,0x0001,0x0001}, {0x0002,0x0002,0x0002}, {0x0003,0x0003,0x0003}, @@ -1858,7 +1873,7 @@ static MY_UNICASE_INFO turk00[]= -MY_UNICASE_INFO *const my_unicase_turkish[256]= +static MY_UNICASE_CHARACTER *my_unicase_pages_turkish[256]= { turk00, plane01, plane02, plane03, plane04, plane05, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, @@ -1895,14 +1910,23 @@ MY_UNICASE_INFO *const my_unicase_turkish[256]= }; +MY_UNICASE_INFO my_unicase_turkish= +{ + 0xFFFF, + my_unicase_pages_turkish +}; + + static inline void -my_tosort_unicode(MY_UNICASE_INFO * const* uni_plane, my_wc_t *wc) +my_tosort_unicode(MY_UNICASE_INFO *uni_plane, my_wc_t *wc, uint flags) { - int page= *wc >> 8; - if (page < 256) + if (*wc <= uni_plane->maxchar) { - if (uni_plane[page]) - *wc= uni_plane[page][*wc & 0xFF].sort; + MY_UNICASE_CHARACTER *page; + if ((page= uni_plane->page[*wc >> 8])) + *wc= (flags & MY_CS_LOWER_SORT) ? + page[*wc & 0xFF].tolower : + page[*wc & 0xFF].sort; } else { @@ -1925,7 +1949,7 @@ int my_wildcmp_unicode_impl(CHARSET_INFO *cs, const char *str,const char *str_end, const char *wildstr,const char *wildend, int escape, int w_one, int w_many, - MY_UNICASE_INFO *const *weights, int recurse_level) + MY_UNICASE_INFO *weights, int recurse_level) { int result= -1; /* Not found, using wildcards */ my_wc_t s_wc, w_wc; @@ -1974,8 +1998,8 @@ int my_wildcmp_unicode_impl(CHARSET_INFO *cs, { if (weights) { - my_tosort_unicode(weights, &s_wc); - my_tosort_unicode(weights, &w_wc); + my_tosort_unicode(weights, &s_wc, cs->state); + my_tosort_unicode(weights, &w_wc, cs->state); } if (s_wc != w_wc) return 1; /* No match */ @@ -2045,8 +2069,8 @@ int my_wildcmp_unicode_impl(CHARSET_INFO *cs, return 1; if (weights) { - my_tosort_unicode(weights, &s_wc); - my_tosort_unicode(weights, &w_wc); + my_tosort_unicode(weights, &s_wc, cs->state); + my_tosort_unicode(weights, &w_wc, cs->state); } if (s_wc == w_wc) @@ -2074,7 +2098,7 @@ my_wildcmp_unicode(CHARSET_INFO *cs, const char *str,const char *str_end, const char *wildstr,const char *wildend, int escape, int w_one, int w_many, - MY_UNICASE_INFO *const *weights) + MY_UNICASE_INFO *weights) { return my_wildcmp_unicode_impl(cs, str, str_end, wildstr, wildend, @@ -2099,7 +2123,7 @@ my_strnxfrm_unicode(CHARSET_INFO *cs, uchar *de= dst + dstlen; uchar *de_beg= de - 1; const uchar *se = src + srclen; - MY_UNICASE_INFO * const*uni_plane= (cs->state & MY_CS_BINSORT) ? + MY_UNICASE_INFO *uni_plane= (cs->state & MY_CS_BINSORT) ? NULL : cs->caseinfo; DBUG_ASSERT(src); @@ -2110,7 +2134,7 @@ my_strnxfrm_unicode(CHARSET_INFO *cs, src+=res; if (uni_plane) - my_tosort_unicode(uni_plane, &wc); + my_tosort_unicode(uni_plane, &wc, cs->state); *dst++= (uchar) (wc >> 8); if (dst < de) @@ -2476,20 +2500,45 @@ static int my_uni_utf8_no_range(CHARSET_INFO *cs __attribute__((unused)), } +static inline void +my_tolower_utf8mb3(MY_UNICASE_INFO *uni_plane, my_wc_t *wc) +{ + MY_UNICASE_CHARACTER *page; + if ((page= uni_plane->page[(*wc >> 8) & 0xFF])) + *wc= page[*wc & 0xFF].tolower; +} + + +static inline void +my_toupper_utf8mb3(MY_UNICASE_INFO *uni_plane, my_wc_t *wc) +{ + MY_UNICASE_CHARACTER *page; + if ((page= uni_plane->page[(*wc >> 8) & 0xFF])) + *wc= page[*wc & 0xFF].toupper; +} + + +static inline void +my_tosort_utf8mb3(MY_UNICASE_INFO *uni_plane, my_wc_t *wc) +{ + MY_UNICASE_CHARACTER *page; + if ((page= uni_plane->page[(*wc >> 8) & 0xFF])) + *wc= page[*wc & 0xFF].sort; +} + static size_t my_caseup_utf8(CHARSET_INFO *cs, char *src, size_t srclen, char *dst, size_t dstlen) { my_wc_t wc; int srcres, dstres; char *srcend= src + srclen, *dstend= dst + dstlen, *dst0= dst; - MY_UNICASE_INFO *const *uni_plane= cs->caseinfo; + MY_UNICASE_INFO *uni_plane= cs->caseinfo; DBUG_ASSERT(src != dst || cs->caseup_multiply == 1); while ((src < srcend) && (srcres= my_utf8_uni(cs, &wc, (uchar *) src, (uchar*) srcend)) > 0) { - int plane= (wc>>8) & 0xFF; - wc= uni_plane[plane] ? uni_plane[plane][wc & 0xFF].toupper : wc; + my_toupper_utf8mb3(uni_plane, &wc); if ((dstres= my_uni_utf8(cs, wc, (uchar*) dst, (uchar*) dstend)) <= 0) break; src+= srcres; @@ -2505,7 +2554,7 @@ static void my_hash_sort_utf8(CHARSET_INFO *cs, const uchar *s, size_t slen, my_wc_t wc; int res; const uchar *e=s+slen; - MY_UNICASE_INFO *const *uni_plane= cs->caseinfo; + MY_UNICASE_INFO *uni_plane= cs->caseinfo; /* Remove end space. We have to do this to be able to compare @@ -2516,8 +2565,7 @@ static void my_hash_sort_utf8(CHARSET_INFO *cs, const uchar *s, size_t slen, while ((s < e) && (res=my_utf8_uni(cs,&wc, (uchar *)s, (uchar*)e))>0 ) { - int plane = (wc>>8) & 0xFF; - wc = uni_plane[plane] ? uni_plane[plane][wc & 0xFF].sort : wc; + my_tosort_unicode(uni_plane, &wc, cs->state); n1[0]^= (((n1[0] & 63)+n2[0])*(wc & 0xFF))+ (n1[0] << 8); n2[0]+=3; n1[0]^= (((n1[0] & 63)+n2[0])*(wc >> 8))+ (n1[0] << 8); @@ -2532,14 +2580,13 @@ static size_t my_caseup_str_utf8(CHARSET_INFO *cs, char *src) my_wc_t wc; int srcres, dstres; char *dst= src, *dst0= src; - MY_UNICASE_INFO *const *uni_plane= cs->caseinfo; + MY_UNICASE_INFO *uni_plane= cs->caseinfo; DBUG_ASSERT(cs->caseup_multiply == 1); while (*src && (srcres= my_utf8_uni_no_range(cs, &wc, (uchar *) src)) > 0) { - int plane= (wc>>8) & 0xFF; - wc= uni_plane[plane] ? uni_plane[plane][wc & 0xFF].toupper : wc; + my_toupper_utf8mb3(uni_plane, &wc); if ((dstres= my_uni_utf8_no_range(cs, wc, (uchar*) dst)) <= 0) break; src+= srcres; @@ -2556,14 +2603,13 @@ static size_t my_casedn_utf8(CHARSET_INFO *cs, char *src, size_t srclen, my_wc_t wc; int srcres, dstres; char *srcend= src + srclen, *dstend= dst + dstlen, *dst0= dst; - MY_UNICASE_INFO *const *uni_plane= cs->caseinfo; + MY_UNICASE_INFO *uni_plane= cs->caseinfo; DBUG_ASSERT(src != dst || cs->casedn_multiply == 1); while ((src < srcend) && (srcres= my_utf8_uni(cs, &wc, (uchar*) src, (uchar*)srcend)) > 0) { - int plane= (wc>>8) & 0xFF; - wc= uni_plane[plane] ? uni_plane[plane][wc & 0xFF].tolower : wc; + my_tolower_utf8mb3(uni_plane, &wc); if ((dstres= my_uni_utf8(cs, wc, (uchar*) dst, (uchar*) dstend)) <= 0) break; src+= srcres; @@ -2578,14 +2624,13 @@ static size_t my_casedn_str_utf8(CHARSET_INFO *cs, char *src) my_wc_t wc; int srcres, dstres; char *dst= src, *dst0= src; - MY_UNICASE_INFO *const *uni_plane= cs->caseinfo; + MY_UNICASE_INFO *uni_plane= cs->caseinfo; DBUG_ASSERT(cs->casedn_multiply == 1); while (*src && (srcres= my_utf8_uni_no_range(cs, &wc, (uchar *) src)) > 0) { - int plane= (wc>>8) & 0xFF; - wc= uni_plane[plane] ? uni_plane[plane][wc & 0xFF].tolower : wc; + my_tolower_utf8mb3(uni_plane, &wc); if ((dstres= my_uni_utf8_no_range(cs, wc, (uchar*) dst)) <= 0) break; src+= srcres; @@ -2621,11 +2666,10 @@ static int my_strnncoll_utf8(CHARSET_INFO *cs, my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc); const uchar *se=s+slen; const uchar *te=t+tlen; - MY_UNICASE_INFO *const *uni_plane= cs->caseinfo; + MY_UNICASE_INFO *uni_plane= cs->caseinfo; while ( s < se && t < te ) { - int plane; s_res=my_utf8_uni(cs,&s_wc, s, se); t_res=my_utf8_uni(cs,&t_wc, t, te); @@ -2635,10 +2679,9 @@ static int my_strnncoll_utf8(CHARSET_INFO *cs, return bincmp(s, se, t, te); } - plane=(s_wc>>8) & 0xFF; - s_wc = uni_plane[plane] ? uni_plane[plane][s_wc & 0xFF].sort : s_wc; - plane=(t_wc>>8) & 0xFF; - t_wc = uni_plane[plane] ? uni_plane[plane][t_wc & 0xFF].sort : t_wc; + my_tosort_unicode(uni_plane, &s_wc, cs->state); + my_tosort_unicode(uni_plane, &t_wc, cs->state); + if ( s_wc != t_wc ) { return s_wc > t_wc ? 1 : -1; @@ -2690,7 +2733,7 @@ static int my_strnncollsp_utf8(CHARSET_INFO *cs, int s_res, t_res, res; my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc); const uchar *se= s+slen, *te= t+tlen; - MY_UNICASE_INFO *const *uni_plane= cs->caseinfo; + MY_UNICASE_INFO *uni_plane= cs->caseinfo; #ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE diff_if_only_endspace_difference= 0; @@ -2698,7 +2741,6 @@ static int my_strnncollsp_utf8(CHARSET_INFO *cs, while ( s < se && t < te ) { - int plane; s_res=my_utf8_uni(cs,&s_wc, s, se); t_res=my_utf8_uni(cs,&t_wc, t, te); @@ -2708,10 +2750,9 @@ static int my_strnncollsp_utf8(CHARSET_INFO *cs, return bincmp(s, se, t, te); } - plane=(s_wc>>8) & 0xFF; - s_wc = uni_plane[plane] ? uni_plane[plane][s_wc & 0xFF].sort : s_wc; - plane=(t_wc>>8) & 0xFF; - t_wc = uni_plane[plane] ? uni_plane[plane][t_wc & 0xFF].sort : t_wc; + my_tosort_unicode(uni_plane, &s_wc, cs->state); + my_tosort_unicode(uni_plane, &t_wc, cs->state); + if ( s_wc != t_wc ) { return s_wc > t_wc ? 1 : -1; @@ -2778,7 +2819,7 @@ static int my_strnncollsp_utf8(CHARSET_INFO *cs, static int my_strcasecmp_utf8(CHARSET_INFO *cs, const char *s, const char *t) { - MY_UNICASE_INFO *const *uni_plane= cs->caseinfo; + MY_UNICASE_INFO *uni_plane= cs->caseinfo; while (s[0] && t[0]) { my_wc_t s_wc,t_wc; @@ -2795,7 +2836,7 @@ int my_strcasecmp_utf8(CHARSET_INFO *cs, const char *s, const char *t) } else { - int plane, res; + int res; /* Scan a multibyte character. @@ -2823,8 +2864,7 @@ int my_strcasecmp_utf8(CHARSET_INFO *cs, const char *s, const char *t) s+= res; /* Convert Unicode code into weight according to collation */ - plane=(s_wc>>8) & 0xFF; - s_wc = uni_plane[plane] ? uni_plane[plane][s_wc & 0xFF].tolower : s_wc; + my_tolower_utf8mb3(uni_plane, &s_wc); } @@ -2838,15 +2878,13 @@ int my_strcasecmp_utf8(CHARSET_INFO *cs, const char *s, const char *t) } else { - int plane; int res=my_utf8_uni(cs,&t_wc, (const uchar*)t, (const uchar*) t + 3); if (res <= 0) return strcmp(s, t); t+= res; /* Convert code into weight */ - plane=(t_wc>>8) & 0xFF; - t_wc = uni_plane[plane] ? uni_plane[plane][t_wc & 0xFF].tolower : t_wc; + my_tolower_utf8mb3(uni_plane, &t_wc); } /* Now we have two weights, let's compare them */ @@ -2863,7 +2901,7 @@ int my_wildcmp_utf8(CHARSET_INFO *cs, const char *wildstr,const char *wildend, int escape, int w_one, int w_many) { - MY_UNICASE_INFO *const *uni_plane= cs->caseinfo; + MY_UNICASE_INFO *uni_plane= cs->caseinfo; return my_wildcmp_unicode(cs,str,str_end,wildstr,wildend, escape,w_one,w_many,uni_plane); } @@ -2966,11 +3004,10 @@ struct charset_info_st my_charset_utf8_general_ci= to_lower_utf8, /* to_lower */ to_upper_utf8, /* to_upper */ to_upper_utf8, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 1, /* strxfrm_multiply */ @@ -2999,11 +3036,10 @@ struct charset_info_st my_charset_utf8_general_mysql500_ci= to_lower_utf8, /* to_lower */ to_upper_utf8, /* to_upper */ to_upper_utf8, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big */ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_mysql500, /* caseinfo */ + &my_unicase_mysql500, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 1, /* strxfrm_multiply */ @@ -3032,11 +3068,10 @@ struct charset_info_st my_charset_utf8_bin= to_lower_utf8, /* to_lower */ to_upper_utf8, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 1, /* strxfrm_multiply */ @@ -3117,7 +3152,7 @@ static int my_strnncollsp_utf8_cs(CHARSET_INFO *cs, const uchar *se= s + slen; const uchar *te= t + tlen; int save_diff= 0; - MY_UNICASE_INFO *const *uni_plane= cs->caseinfo; + MY_UNICASE_INFO *uni_plane= cs->caseinfo; #ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE diff_if_only_endspace_difference= 0; @@ -3125,7 +3160,6 @@ static int my_strnncollsp_utf8_cs(CHARSET_INFO *cs, while ( s < se && t < te ) { - int plane; s_res=my_utf8_uni(cs,&s_wc, s, se); t_res=my_utf8_uni(cs,&t_wc, t, te); @@ -3139,10 +3173,10 @@ static int my_strnncollsp_utf8_cs(CHARSET_INFO *cs, { save_diff = ((int)s_wc) - ((int)t_wc); } - plane=(s_wc>>8) & 0xFF; - s_wc = uni_plane[plane] ? uni_plane[plane][s_wc & 0xFF].sort : s_wc; - plane=(t_wc>>8) & 0xFF; - t_wc = uni_plane[plane] ? uni_plane[plane][t_wc & 0xFF].sort : t_wc; + + my_tosort_unicode(uni_plane, &s_wc, cs->state); + my_tosort_unicode(uni_plane, &t_wc, cs->state); + if ( s_wc != t_wc ) { return ((int) s_wc) - ((int) t_wc); @@ -4521,11 +4555,10 @@ struct charset_info_st my_charset_filename= to_lower_utf8, /* to_lower */ to_upper_utf8, /* to_upper */ to_upper_utf8, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 1, /* strxfrm_multiply */ @@ -4887,20 +4920,26 @@ my_wc_mb_utf8mb4_no_range(CHARSET_INFO *cs __attribute__((unused)), static inline void -my_tolower_utf8mb4(MY_UNICASE_INFO * const* uni_plane, my_wc_t *wc) +my_tolower_utf8mb4(MY_UNICASE_INFO *uni_plane, my_wc_t *wc) { - int page= *wc >> 8; - if (page < 256 && uni_plane[page]) - *wc= uni_plane[page][*wc & 0xFF].tolower; + if (*wc <= uni_plane->maxchar) + { + MY_UNICASE_CHARACTER *page; + if ((page= uni_plane->page[(*wc >> 8)])) + *wc= page[*wc & 0xFF].tolower; + } } static inline void -my_toupper_utf8mb4(MY_UNICASE_INFO * const* uni_plane, my_wc_t *wc) +my_toupper_utf8mb4(MY_UNICASE_INFO *uni_plane, my_wc_t *wc) { - int page= *wc >> 8; - if (page < 256 && uni_plane[page]) - *wc= uni_plane[page][*wc & 0xFF].toupper; + if (*wc <= uni_plane->maxchar) + { + MY_UNICASE_CHARACTER *page; + if ((page= uni_plane->page[(*wc >> 8)])) + *wc= page[*wc & 0xFF].toupper; + } } @@ -4911,7 +4950,7 @@ my_caseup_utf8mb4(CHARSET_INFO *cs, char *src, size_t srclen, my_wc_t wc; int srcres, dstres; char *srcend= src + srclen, *dstend= dst + dstlen, *dst0= dst; - MY_UNICASE_INFO * const* uni_plane= cs->caseinfo; + MY_UNICASE_INFO *uni_plane= cs->caseinfo; DBUG_ASSERT(src != dst || cs->caseup_multiply == 1); while ((src < srcend) && @@ -4943,7 +4982,7 @@ my_hash_sort_utf8mb4(CHARSET_INFO *cs, const uchar *s, size_t slen, my_wc_t wc; int res; const uchar *e= s + slen; - MY_UNICASE_INFO * const* uni_plane= cs->caseinfo; + MY_UNICASE_INFO *uni_plane= cs->caseinfo; /* Remove end space. We do this to be able to compare @@ -4954,7 +4993,7 @@ my_hash_sort_utf8mb4(CHARSET_INFO *cs, const uchar *s, size_t slen, while ((res= my_mb_wc_utf8mb4(cs, &wc, (uchar*) s, (uchar*) e)) > 0) { - my_tosort_unicode(uni_plane, &wc); + my_tosort_unicode(uni_plane, &wc, cs->state); my_hash_add(n1, n2, (uint) (wc & 0xFF)); my_hash_add(n1, n2, (uint) (wc >> 8) & 0xFF); if (wc > 0xFFFF) @@ -4979,7 +5018,7 @@ my_caseup_str_utf8mb4(CHARSET_INFO *cs, char *src) my_wc_t wc; int srcres, dstres; char *dst= src, *dst0= src; - MY_UNICASE_INFO * const* uni_plane= cs->caseinfo; + MY_UNICASE_INFO *uni_plane= cs->caseinfo; DBUG_ASSERT(cs->caseup_multiply == 1); while (*src && @@ -5004,7 +5043,7 @@ my_casedn_utf8mb4(CHARSET_INFO *cs, my_wc_t wc; int srcres, dstres; char *srcend= src + srclen, *dstend= dst + dstlen, *dst0= dst; - MY_UNICASE_INFO * const* uni_plane= cs->caseinfo; + MY_UNICASE_INFO *uni_plane= cs->caseinfo; DBUG_ASSERT(src != dst || cs->casedn_multiply == 1); while ((src < srcend) && @@ -5027,7 +5066,7 @@ my_casedn_str_utf8mb4(CHARSET_INFO *cs, char *src) my_wc_t wc; int srcres, dstres; char *dst= src, *dst0= src; - MY_UNICASE_INFO * const* uni_plane= cs->caseinfo; + MY_UNICASE_INFO *uni_plane= cs->caseinfo; DBUG_ASSERT(cs->casedn_multiply == 1); while (*src && @@ -5069,7 +5108,7 @@ my_strnncoll_utf8mb4(CHARSET_INFO *cs, my_wc_t s_wc,t_wc; const uchar *se= s + slen; const uchar *te= t + tlen; - MY_UNICASE_INFO * const* uni_plane= cs->caseinfo; + MY_UNICASE_INFO *uni_plane= cs->caseinfo; LINT_INIT(s_wc); LINT_INIT(t_wc); @@ -5084,9 +5123,9 @@ my_strnncoll_utf8mb4(CHARSET_INFO *cs, return bincmp_utf8mb4(s, se, t, te); } - my_tosort_unicode(uni_plane, &s_wc); - my_tosort_unicode(uni_plane, &t_wc); - + my_tosort_unicode(uni_plane, &s_wc, cs->state); + my_tosort_unicode(uni_plane, &t_wc, cs->state); + if ( s_wc != t_wc ) { return s_wc > t_wc ? 1 : -1; @@ -5136,7 +5175,7 @@ my_strnncollsp_utf8mb4(CHARSET_INFO *cs, int res; my_wc_t s_wc, t_wc; const uchar *se= s + slen, *te= t + tlen; - MY_UNICASE_INFO * const* uni_plane= cs->caseinfo; + MY_UNICASE_INFO *uni_plane= cs->caseinfo; LINT_INIT(s_wc); LINT_INIT(t_wc); @@ -5155,8 +5194,8 @@ my_strnncollsp_utf8mb4(CHARSET_INFO *cs, return bincmp_utf8mb4(s, se, t, te); } - my_tosort_unicode(uni_plane, &s_wc); - my_tosort_unicode(uni_plane, &t_wc); + my_tosort_unicode(uni_plane, &s_wc, cs->state); + my_tosort_unicode(uni_plane, &t_wc, cs->state); if ( s_wc != t_wc ) { @@ -5220,7 +5259,7 @@ my_strnncollsp_utf8mb4(CHARSET_INFO *cs, static int my_strcasecmp_utf8mb4(CHARSET_INFO *cs, const char *s, const char *t) { - MY_UNICASE_INFO * const* uni_plane= cs->caseinfo; + MY_UNICASE_INFO *uni_plane= cs->caseinfo; while (s[0] && t[0]) { my_wc_t s_wc,t_wc; @@ -5399,11 +5438,10 @@ struct charset_info_st my_charset_utf8mb4_general_ci= to_lower_utf8mb4, /* to_lower */ to_upper_utf8mb4, /* to_upper */ to_upper_utf8mb4, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 1, /* strxfrm_multiply */ @@ -5432,11 +5470,10 @@ struct charset_info_st my_charset_utf8mb4_bin= to_lower_utf8mb4, /* to_lower */ to_upper_utf8mb4, /* to_upper */ NULL, /* sort_order */ - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default,/* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 1, /* strxfrm_multiply */ diff --git a/strings/ctype-win1250ch.c b/strings/ctype-win1250ch.c index 8fd15ebddb2..d1cd51a5d8d 100644 --- a/strings/ctype-win1250ch.c +++ b/strings/ctype-win1250ch.c @@ -690,11 +690,10 @@ struct charset_info_st my_charset_cp1250_czech_ci = to_lower_win1250ch, to_upper_win1250ch, sort_order_win1250ch, - NULL, /* contractions */ - NULL, /* sort_order_big*/ + NULL, /* uca */ tab_cp1250_uni, /* tab_to_uni */ idx_uni_cp1250, /* tab_from_uni */ - my_unicase_default, /* caseinfo */ + &my_unicase_default, /* caseinfo */ NULL, /* state_map */ NULL, /* ident_map */ 2, /* strxfrm_multiply */ diff --git a/strings/ctype.c b/strings/ctype.c index b71d7dee4c4..43e9b290b3e 100644 --- a/strings/ctype.c +++ b/strings/ctype.c @@ -38,6 +38,18 @@ */ + +/* + Avoid using my_snprintf + We cannot use my_snprintf() here, because ctype.o is + used to build conf_to_src, which must require minimun + dependency. +*/ + +#undef my_snprinf +#define my_snprintf "We cannot use my_snprintf in this file" + + int (*my_string_stack_guard)(int)= NULL; static char *mstr(char *str,const char *src,size_t l1,size_t l2) @@ -71,11 +83,75 @@ struct my_cs_file_section_st #define _CS_PRIMARY_ID 15 #define _CS_BINARY_ID 16 #define _CS_CSDESCRIPT 17 -#define _CS_RESET 18 -#define _CS_DIFF1 19 -#define _CS_DIFF2 20 -#define _CS_DIFF3 21 -#define _CS_IDENTICAL 22 + + +/* Special purpose commands */ +#define _CS_UCA_VERSION 100 +#define _CS_CL_SUPPRESS_CONTRACTIONS 101 +#define _CS_CL_OPTIMIZE 102 +#define _CS_CL_SHIFT_AFTER_METHOD 103 + + +/* Collation Settings */ +#define _CS_ST_SETTINGS 200 +#define _CS_ST_STRENGTH 201 +#define _CS_ST_ALTERNATE 202 +#define _CS_ST_BACKWARDS 203 +#define _CS_ST_NORMALIZATION 204 +#define _CS_ST_CASE_LEVEL 205 +#define _CS_ST_CASE_FIRST 206 +#define _CS_ST_HIRAGANA_QUATERNARY 207 +#define _CS_ST_NUMERIC 208 +#define _CS_ST_VARIABLE_TOP 209 +#define _CS_ST_MATCH_BOUNDARIES 210 +#define _CS_ST_MATCH_STYLE 211 + + +/* Rules */ +#define _CS_RULES 300 +#define _CS_RESET 301 +#define _CS_DIFF1 302 +#define _CS_DIFF2 303 +#define _CS_DIFF3 304 +#define _CS_DIFF4 305 +#define _CS_IDENTICAL 306 + +/* Rules: Expansions */ +#define _CS_EXP_X 320 +#define _CS_EXP_EXTEND 321 +#define _CS_EXP_DIFF1 322 +#define _CS_EXP_DIFF2 323 +#define _CS_EXP_DIFF3 324 +#define _CS_EXP_DIFF4 325 +#define _CS_EXP_IDENTICAL 326 + +/* Rules: Abbreviating Ordering Specifications */ +#define _CS_A_DIFF1 351 +#define _CS_A_DIFF2 352 +#define _CS_A_DIFF3 353 +#define _CS_A_DIFF4 354 +#define _CS_A_IDENTICAL 355 + +/* Rules: previous context */ +#define _CS_CONTEXT 370 + +/* Rules: Placing Characters Before Others*/ +#define _CS_RESET_BEFORE 380 + +/* Rules: Logical Reset Positions */ +#define _CS_RESET_FIRST_PRIMARY_IGNORABLE 401 +#define _CS_RESET_LAST_PRIMARY_IGNORABLE 402 +#define _CS_RESET_FIRST_SECONDARY_IGNORABLE 403 +#define _CS_RESET_LAST_SECONDARY_IGNORABLE 404 +#define _CS_RESET_FIRST_TERTIARY_IGNORABLE 405 +#define _CS_RESET_LAST_TERTIARY_IGNORABLE 406 +#define _CS_RESET_FIRST_TRAILING 407 +#define _CS_RESET_LAST_TRAILING 408 +#define _CS_RESET_FIRST_VARIABLE 409 +#define _CS_RESET_LAST_VARIABLE 410 +#define _CS_RESET_FIRST_NON_IGNORABLE 411 +#define _CS_RESET_LAST_NON_IGNORABLE 412 + static const struct my_cs_file_section_st sec[] = @@ -85,6 +161,8 @@ static const struct my_cs_file_section_st sec[] = {_CS_MISC, "xml/encoding"}, {_CS_MISC, "charsets"}, {_CS_MISC, "charsets/max-id"}, + {_CS_MISC, "charsets/copyright"}, + {_CS_MISC, "charsets/description"}, {_CS_CHARSET, "charsets/charset"}, {_CS_PRIMARY_ID, "charsets/charset/primary-id"}, {_CS_BINARY_ID, "charsets/charset/binary-id"}, @@ -106,11 +184,72 @@ static const struct my_cs_file_section_st sec[] = {_CS_ORDER, "charsets/charset/collation/order"}, {_CS_FLAG, "charsets/charset/collation/flag"}, {_CS_COLLMAP, "charsets/charset/collation/map"}, - {_CS_RESET, "charsets/charset/collation/rules/reset"}, - {_CS_DIFF1, "charsets/charset/collation/rules/p"}, - {_CS_DIFF2, "charsets/charset/collation/rules/s"}, - {_CS_DIFF3, "charsets/charset/collation/rules/t"}, - {_CS_IDENTICAL, "charsets/charset/collation/rules/i"}, + + /* Special purpose commands */ + {_CS_UCA_VERSION, "charsets/charset/collation/version"}, + {_CS_CL_SUPPRESS_CONTRACTIONS, "charsets/charset/collation/suppress_contractions"}, + {_CS_CL_OPTIMIZE, "charsets/charset/collation/optimize"}, + {_CS_CL_SHIFT_AFTER_METHOD, "charsets/charset/collation/shift-after-method"}, + + /* Collation Settings */ + {_CS_ST_SETTINGS, "charsets/charset/collation/settings"}, + {_CS_ST_STRENGTH, "charsets/charset/collation/settings/strength"}, + {_CS_ST_ALTERNATE, "charsets/charset/collation/settings/alternate"}, + {_CS_ST_BACKWARDS, "charsets/charset/collation/settings/backwards"}, + {_CS_ST_NORMALIZATION, "charsets/charset/collation/settings/normalization"}, + {_CS_ST_CASE_LEVEL, "charsets/charset/collation/settings/caseLevel"}, + {_CS_ST_CASE_FIRST, "charsets/charset/collation/settings/caseFirst"}, + {_CS_ST_HIRAGANA_QUATERNARY, "charsets/charset/collation/settings/hiraganaQuaternary"}, + {_CS_ST_NUMERIC, "charsets/charset/collation/settings/numeric"}, + {_CS_ST_VARIABLE_TOP, "charsets/charset/collation/settings/variableTop"}, + {_CS_ST_MATCH_BOUNDARIES, "charsets/charset/collation/settings/match-boundaries"}, + {_CS_ST_MATCH_STYLE, "charsets/charset/collation/settings/match-style"}, + + /* Rules */ + {_CS_RULES, "charsets/charset/collation/rules"}, + {_CS_RESET, "charsets/charset/collation/rules/reset"}, + {_CS_DIFF1, "charsets/charset/collation/rules/p"}, + {_CS_DIFF2, "charsets/charset/collation/rules/s"}, + {_CS_DIFF3, "charsets/charset/collation/rules/t"}, + {_CS_DIFF4, "charsets/charset/collation/rules/q"}, + {_CS_IDENTICAL, "charsets/charset/collation/rules/i"}, + + /* Rules: expansions */ + {_CS_EXP_X, "charsets/charset/collation/rules/x"}, + {_CS_EXP_EXTEND, "charsets/charset/collation/rules/x/extend"}, + {_CS_EXP_DIFF1, "charsets/charset/collation/rules/x/p"}, + {_CS_EXP_DIFF2, "charsets/charset/collation/rules/x/s"}, + {_CS_EXP_DIFF3, "charsets/charset/collation/rules/x/t"}, + {_CS_EXP_DIFF4, "charsets/charset/collation/rules/x/q"}, + {_CS_EXP_IDENTICAL, "charsets/charset/collation/rules/x/i"}, + + /* Rules: previous context */ + {_CS_CONTEXT, "charsets/charset/collation/rules/x/context"}, + + /* Rules: Abbreviating Ordering Specifications */ + {_CS_A_DIFF1, "charsets/charset/collation/rules/pc"}, + {_CS_A_DIFF2, "charsets/charset/collation/rules/sc"}, + {_CS_A_DIFF3, "charsets/charset/collation/rules/tc"}, + {_CS_A_DIFF4, "charsets/charset/collation/rules/qc"}, + {_CS_A_IDENTICAL, "charsets/charset/collation/rules/ic"}, + + /* Rules: Placing Characters Before Others*/ + {_CS_RESET_BEFORE, "charsets/charset/collation/rules/reset/before"}, + + /* Rules: Logical Reset Positions */ + {_CS_RESET_FIRST_NON_IGNORABLE, "charsets/charset/collation/rules/reset/first_non_ignorable"}, + {_CS_RESET_LAST_NON_IGNORABLE, "charsets/charset/collation/rules/reset/last_non_ignorable"}, + {_CS_RESET_FIRST_PRIMARY_IGNORABLE, "charsets/charset/collation/rules/reset/first_primary_ignorable"}, + {_CS_RESET_LAST_PRIMARY_IGNORABLE, "charsets/charset/collation/rules/reset/last_primary_ignorable"}, + {_CS_RESET_FIRST_SECONDARY_IGNORABLE, "charsets/charset/collation/rules/reset/first_secondary_ignorable"}, + {_CS_RESET_LAST_SECONDARY_IGNORABLE, "charsets/charset/collation/rules/reset/last_secondary_ignorable"}, + {_CS_RESET_FIRST_TERTIARY_IGNORABLE, "charsets/charset/collation/rules/reset/first_tertiary_ignorable"}, + {_CS_RESET_LAST_TERTIARY_IGNORABLE, "charsets/charset/collation/rules/reset/last_tertiary_ignorable"}, + {_CS_RESET_FIRST_TRAILING, "charsets/charset/collation/rules/reset/first_trailing"}, + {_CS_RESET_LAST_TRAILING, "charsets/charset/collation/rules/reset/last_trailing"}, + {_CS_RESET_FIRST_VARIABLE, "charsets/charset/collation/rules/reset/first_variable"}, + {_CS_RESET_LAST_VARIABLE, "charsets/charset/collation/rules/reset/last_variable"}, + {0, NULL} }; @@ -120,14 +259,16 @@ static const struct my_cs_file_section_st const struct my_cs_file_section_st *s; for (s=sec; s->str; s++) { - if (!strncmp(attr,s->str,len)) + if (!strncmp(attr, s->str, len) && s->str[len] == 0) return s; } return NULL; } #define MY_CS_CSDESCR_SIZE 64 -#define MY_CS_TAILORING_SIZE 1024 +#define MY_CS_TAILORING_SIZE 32*1024 +#define MY_CS_UCA_VERSION_SIZE 64 +#define MY_CS_CONTEXT_SIZE 64 typedef struct my_cs_file_info { @@ -139,12 +280,59 @@ typedef struct my_cs_file_info uchar sort_order[MY_CS_SORT_ORDER_TABLE_SIZE]; uint16 tab_to_uni[MY_CS_TO_UNI_TABLE_SIZE]; char comment[MY_CS_CSDESCR_SIZE]; - char tailoring[MY_CS_TAILORING_SIZE]; + char *tailoring; size_t tailoring_length; + size_t tailoring_alloced_length; + char context[MY_CS_CONTEXT_SIZE]; struct charset_info_st cs; - int (*add_collation)(struct charset_info_st *cs); -} MY_CHARSET_LOADER; + MY_CHARSET_LOADER *loader; +} MY_CHARSET_FILE; + + +static void +my_charset_file_reset_charset(MY_CHARSET_FILE *i) +{ + memset(&i->cs, 0, sizeof(i->cs)); +} + +static void +my_charset_file_reset_collation(MY_CHARSET_FILE *i) +{ + i->tailoring_length= 0; + i->context[0]= '\0'; +} + + +static void +my_charset_file_init(MY_CHARSET_FILE *i) +{ + my_charset_file_reset_charset(i); + my_charset_file_reset_collation(i); + i->tailoring= NULL; + i->tailoring_alloced_length= 0; +} + + +static void +my_charset_file_free(MY_CHARSET_FILE *i) +{ + i->loader->free(i->tailoring); +} + + +static int +my_charset_file_tailoring_realloc(MY_CHARSET_FILE *i, size_t newlen) +{ + if (i->tailoring_alloced_length > newlen || + (i->tailoring= i->loader->realloc(i->tailoring, + (i->tailoring_alloced_length= + (newlen + 32*1024))))) + { + return MY_XML_OK; + } + return MY_XML_ERROR; +} static int fill_uchar(uchar *a,uint size,const char *str, size_t len) @@ -182,17 +370,119 @@ static int fill_uint16(uint16 *a,uint size,const char *str, size_t len) } + + +static int +tailoring_append(MY_XML_PARSER *st, + const char *fmt, size_t len, const char *attr) +{ + struct my_cs_file_info *i= (struct my_cs_file_info *) st->user_data; + size_t newlen= i->tailoring_length + len + 64; /* 64 for format */ + if (MY_XML_OK == my_charset_file_tailoring_realloc(i, newlen)) + { + char *dst= i->tailoring + i->tailoring_length; + sprintf(dst, fmt, (int) len, attr); + i->tailoring_length+= strlen(dst); + return MY_XML_OK; + } + return MY_XML_ERROR; +} + + +static int +tailoring_append2(MY_XML_PARSER *st, + const char *fmt, + size_t len1, const char *attr1, + size_t len2, const char *attr2) +{ + struct my_cs_file_info *i= (struct my_cs_file_info *) st->user_data; + size_t newlen= i->tailoring_length + len1 + len2 + 64; /* 64 for format */ + if (MY_XML_OK == my_charset_file_tailoring_realloc(i, newlen)) + { + char *dst= i->tailoring + i->tailoring_length; + sprintf(dst, fmt, (int) len1, attr1, (int) len2, attr2); + i->tailoring_length+= strlen(dst); + return MY_XML_OK; + } + return MY_XML_ERROR; +} + + +static size_t +scan_one_character(const char *s, const char *e, my_wc_t *wc) +{ + CHARSET_INFO *cs= &my_charset_utf8_general_ci; + if (s >= e) + return 0; + + /* Escape sequence: \uXXXX */ + if (s[0] == '\\' && s + 2 < e && s[1] == 'u' && my_isxdigit(cs, s[2])) + { + size_t len= 3; /* We have at least one digit */ + for (s+= 3; s < e && my_isxdigit(cs, s[0]); s++, len++) + { + } + wc[0]= 0; + return len; + } + else if (s[0] > 0) /* 7-bit character */ + { + wc[0]= 0; + return 1; + } + else /* Non-escaped character */ + { + int rc= cs->cset->mb_wc(cs, wc, (uchar *) s, (uchar *) e); + if (rc > 0) + return (size_t) rc; + } + return 0; +} + + +static int +tailoring_append_abbreviation(MY_XML_PARSER *st, + const char *fmt, size_t len, const char *attr) +{ + size_t clen; + const char *attrend= attr + len; + my_wc_t wc; + + for ( ; (clen= scan_one_character(attr, attrend, &wc)) > 0; attr+= clen) + { + DBUG_ASSERT(attr < attrend); + if (tailoring_append(st, fmt, clen, attr) != MY_XML_OK) + return MY_XML_ERROR; + } + return MY_XML_OK; +} + + static int cs_enter(MY_XML_PARSER *st,const char *attr, size_t len) { struct my_cs_file_info *i= (struct my_cs_file_info *)st->user_data; const struct my_cs_file_section_st *s= cs_file_sec(attr,len); + int state= s ? s->state : 0; - if ( s && (s->state == _CS_CHARSET)) - bzero(&i->cs,sizeof(i->cs)); - - if (s && (s->state == _CS_COLLATION)) - i->tailoring_length= 0; + switch (state) { + case 0: + i->loader->reporter(WARNING_LEVEL, "Unknown LDML tag: '%.*s'", len, attr); + break; + + case _CS_CHARSET: + my_charset_file_reset_charset(i); + break; + + case _CS_COLLATION: + my_charset_file_reset_collation(i); + break; + case _CS_RESET: + return tailoring_append(st, " &", 0, NULL); + + default: + break; + } return MY_XML_OK; } @@ -206,8 +496,60 @@ static int cs_leave(MY_XML_PARSER *st,const char *attr, size_t len) switch(state){ case _CS_COLLATION: - rc= i->add_collation ? i->add_collation(&i->cs) : MY_XML_OK; + if (i->tailoring_length) + i->cs.tailoring= i->tailoring; + rc= i->loader->add_collation ? i->loader->add_collation(&i->cs) : MY_XML_OK; break; + + /* Rules: Logical Reset Positions */ + case _CS_RESET_FIRST_NON_IGNORABLE: + rc= tailoring_append(st, "[first non-ignorable]", 0, NULL); + break; + + case _CS_RESET_LAST_NON_IGNORABLE: + rc= tailoring_append(st, "[last non-ignorable]", 0, NULL); + break; + + case _CS_RESET_FIRST_PRIMARY_IGNORABLE: + rc= tailoring_append(st, "[first primary ignorable]", 0, NULL); + break; + + case _CS_RESET_LAST_PRIMARY_IGNORABLE: + rc= tailoring_append(st, "[last primary ignorable]", 0, NULL); + break; + + case _CS_RESET_FIRST_SECONDARY_IGNORABLE: + rc= tailoring_append(st, "[first secondary ignorable]", 0, NULL); + break; + + case _CS_RESET_LAST_SECONDARY_IGNORABLE: + rc= tailoring_append(st, "[last secondary ignorable]", 0, NULL); + break; + + case _CS_RESET_FIRST_TERTIARY_IGNORABLE: + rc= tailoring_append(st, "[first tertiary ignorable]", 0, NULL); + break; + + case _CS_RESET_LAST_TERTIARY_IGNORABLE: + rc= tailoring_append(st, "[last tertiary ignorable]", 0, NULL); + break; + + case _CS_RESET_FIRST_TRAILING: + rc= tailoring_append(st, "[first trailing]", 0, NULL); + break; + + case _CS_RESET_LAST_TRAILING: + rc= tailoring_append(st, "[last trailing]", 0, NULL); + break; + + case _CS_RESET_FIRST_VARIABLE: + rc= tailoring_append(st, "[first variable]", 0, NULL); + break; + + case _CS_RESET_LAST_VARIABLE: + rc= tailoring_append(st, "[last variable]", 0, NULL); + break; + default: rc=MY_XML_OK; } @@ -215,14 +557,40 @@ static int cs_leave(MY_XML_PARSER *st,const char *attr, size_t len) } +static const char *diff_fmt[5]= +{ + "<%.*s", + "<<%.*s", + "<<<%.*s", + "<<<<%.*s", + "=%.*s" +}; + + +static const char *context_diff_fmt[5]= +{ + "<%.*s|%.*s", + "<<%.*s|%.*s", + "<<<%.*s|%.*s", + "<<<<%.*s|%.*s", + "=%.*s|%.*s" +}; + + static int cs_value(MY_XML_PARSER *st,const char *attr, size_t len) { struct my_cs_file_info *i= (struct my_cs_file_info *)st->user_data; const struct my_cs_file_section_st *s; - int state= (int)((s= cs_file_sec(st->attr, strlen(st->attr))) ? s->state : - 0); - + int state= (int)((s= cs_file_sec(st->attr.start, + st->attr.end - st->attr.start)) ? + s->state : 0); + int rc= MY_XML_OK; + switch (state) { + case _CS_MISC: + case _CS_FAMILY: + case _CS_ORDER: + break; case _CS_ID: i->cs.number= strtol(attr,(char**)NULL,10); break; @@ -269,47 +637,185 @@ static int cs_value(MY_XML_PARSER *st,const char *attr, size_t len) fill_uchar(i->ctype,MY_CS_CTYPE_TABLE_SIZE,attr,len); i->cs.ctype=i->ctype; break; + + /* Special purpose commands */ + case _CS_UCA_VERSION: + rc= tailoring_append(st, "[version %.*s]", len, attr); + break; + + case _CS_CL_SUPPRESS_CONTRACTIONS: + rc= tailoring_append(st, "[suppress contractions %.*s]", len, attr); + break; + + case _CS_CL_OPTIMIZE: + rc= tailoring_append(st, "[optimize %.*s]", len, attr); + break; + + case _CS_CL_SHIFT_AFTER_METHOD: + rc= tailoring_append(st, "[shift-after-method %.*s]", len, attr); + break; + + /* Collation Settings */ + case _CS_ST_STRENGTH: + /* 1, 2, 3, 4, 5, or primary, secondary, tertiary, quaternary, identical */ + rc= tailoring_append(st, "[strength %.*s]", len, attr); + break; + + case _CS_ST_ALTERNATE: + /* non-ignorable, shifted */ + rc= tailoring_append(st, "[alternate %.*s]", len, attr); + break; + + case _CS_ST_BACKWARDS: + /* on, off, 2 */ + rc= tailoring_append(st, "[backwards %.*s]", len, attr); + break; + + case _CS_ST_NORMALIZATION: + /* + TODO for WL#896: check collations for normalization: vi.xml + We want precomposed characters work well at this point. + */ + /* on, off */ + rc= tailoring_append(st, "[normalization %.*s]", len, attr); + break; + + case _CS_ST_CASE_LEVEL: + /* on, off */ + rc= tailoring_append(st, "[caseLevel %.*s]", len, attr); + break; + + case _CS_ST_CASE_FIRST: + /* upper, lower, off */ + rc= tailoring_append(st, "[caseFirst %.*s]", len, attr); + break; + + case _CS_ST_HIRAGANA_QUATERNARY: + /* on, off */ + rc= tailoring_append(st, "[hiraganaQ %.*s]", len, attr); + break; + + case _CS_ST_NUMERIC: + /* on, off */ + rc= tailoring_append(st, "[numeric %.*s]", len, attr); + break; + + case _CS_ST_VARIABLE_TOP: + /* TODO for WL#896: check value format */ + rc= tailoring_append(st, "[variableTop %.*s]", len, attr); + break; + + case _CS_ST_MATCH_BOUNDARIES: + /* none, whole-character, whole-word */ + rc= tailoring_append(st, "[match-boundaries %.*s]", len, attr); + break; + + case _CS_ST_MATCH_STYLE: + /* minimal, medial, maximal */ + rc= tailoring_append(st, "[match-style %.*s]", len, attr); + break; + + + /* Rules */ case _CS_RESET: + rc= tailoring_append(st, "%.*s", len, attr); + break; + case _CS_DIFF1: case _CS_DIFF2: case _CS_DIFF3: + case _CS_DIFF4: case _CS_IDENTICAL: + rc= tailoring_append(st, diff_fmt[state - _CS_DIFF1], len, attr); + break; + + + /* Rules: Expansion */ + case _CS_EXP_EXTEND: + rc= tailoring_append(st, " / %.*s", len, attr); + break; + + case _CS_EXP_DIFF1: + case _CS_EXP_DIFF2: + case _CS_EXP_DIFF3: + case _CS_EXP_DIFF4: + case _CS_EXP_IDENTICAL: + if (i->context[0]) { - /* - Convert collation description from - Locale Data Markup Language (LDML) - into ICU Collation Customization expression. - */ - char arg[16]; - const char *cmd[]= {"&","<","<<","<<<","="}; - i->cs.tailoring= i->tailoring; - mstr(arg,attr,len,sizeof(arg)-1); - if (i->tailoring_length + 20 < sizeof(i->tailoring)) - { - char *dst= i->tailoring_length + i->tailoring; - i->tailoring_length+= sprintf(dst," %s %s",cmd[state-_CS_RESET],arg); - } + rc= tailoring_append2(st, context_diff_fmt[state - _CS_EXP_DIFF1], + strlen(i->context), i->context, len, attr); + i->context[0]= 0; } + else + rc= tailoring_append(st, diff_fmt[state - _CS_EXP_DIFF1], len, attr); + break; + + /* Rules: Context */ + case _CS_CONTEXT: + if (len < sizeof(i->context) + 1) + { + memcpy(i->context, attr, len); + i->context[len]= '\0'; + } + break; + + /* Rules: Abbreviating Ordering Specifications */ + case _CS_A_DIFF1: + case _CS_A_DIFF2: + case _CS_A_DIFF3: + case _CS_A_DIFF4: + case _CS_A_IDENTICAL: + rc= tailoring_append_abbreviation(st, diff_fmt[state - _CS_A_DIFF1], len, attr); + break; + + /* Rules: Placing Characters Before Others */ + case _CS_RESET_BEFORE: + /* + TODO for WL#896: Add this check into text customization parser: + It is an error if the strength of the before relation is not identical + to the relation after the reset. We'll need this for WL#896. + */ + rc= tailoring_append(st, "[before %.*s]", len, attr); + break; + + + default: + break; } - return MY_XML_OK; + + return rc; } -my_bool my_parse_charset_xml(const char *buf, size_t len, - int (*add_collation)(struct charset_info_st *cs)) +my_bool +my_parse_charset_xml(MY_CHARSET_LOADER *loader, const char *buf, size_t len) { MY_XML_PARSER p; - struct my_cs_file_info i; + struct my_cs_file_info info; my_bool rc; + my_charset_file_init(&info); my_xml_parser_create(&p); my_xml_set_enter_handler(&p,cs_enter); my_xml_set_value_handler(&p,cs_value); my_xml_set_leave_handler(&p,cs_leave); - i.add_collation= add_collation; - my_xml_set_user_data(&p,(void*)&i); + info.loader= loader; + my_xml_set_user_data(&p, (void *) &info); rc= (my_xml_parse(&p,buf,len) == MY_XML_OK) ? FALSE : TRUE; my_xml_parser_free(&p); + my_charset_file_free(&info); + if (rc != MY_XML_OK) + { + const char *errstr= my_xml_error_string(&p); + if (sizeof(loader->error) > 32 + strlen(errstr)) + { + /* We cannot use my_snprintf() here. See previous comment. */ + sprintf(loader->error, "at line %d pos %d: %s", + my_xml_error_lineno(&p)+1, + (int) my_xml_error_pos(&p), + my_xml_error_string(&p)); + } + } return rc; } diff --git a/strings/str_alloc.c b/strings/str_alloc.c index 17139e7b773..91246603f2e 100644 --- a/strings/str_alloc.c +++ b/strings/str_alloc.c @@ -31,5 +31,11 @@ static void my_str_free_default(void *ptr) free(ptr); } +void *my_str_realloc_default(void *ptr, size_t size) +{ + return realloc(ptr, size); +} + void *(*my_str_malloc)(size_t)= &my_str_malloc_default; void (*my_str_free)(void *)= &my_str_free_default; +void *(*my_str_realloc)(void *, size_t)= &my_str_realloc_default; diff --git a/strings/xml.c b/strings/xml.c index 3b2c278f553..8073b881a47 100644 --- a/strings/xml.c +++ b/strings/xml.c @@ -15,6 +15,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA */ #include "strings_def.h" +#include "m_string.h" #include "my_xml.h" @@ -207,25 +208,71 @@ static int my_xml_value(MY_XML_PARSER *st, const char *str, size_t len) } -static int my_xml_enter(MY_XML_PARSER *st, const char *str, size_t len) +/** + Ensure the attr buffer is wide enough to hold the new value + + Expand and/or allocate dynamic buffer as needed to hold the concatenated + path and the terminating zero. + + @attr st the parser instance + @attr len the length of the attribute to be added + @return state + @retval 1 failed + @retval 0 success +*/ +static int my_xml_attr_ensure_space(MY_XML_PARSER *st, size_t len) { - if ((size_t) (st->attrend-st->attr+len+1) > sizeof(st->attr)) + size_t ofs= st->attr.end - st->attr.start; + len++; // Add terminating zero. + if (ofs + len > st->attr.buffer_size) { - sprintf(st->errstr,"To deep XML"); - return MY_XML_ERROR; + st->attr.buffer_size= (SIZE_T_MAX - len) / 2 > st->attr.buffer_size ? + st->attr.buffer_size * 2 + len : SIZE_T_MAX; + + if (!st->attr.buffer) + { + st->attr.buffer= (char *) my_str_malloc(st->attr.buffer_size); + if (st->attr.buffer) + memcpy(st->attr.buffer, st->attr.static_buffer, ofs + 1 /*term. zero */); + } + else + st->attr.buffer= (char *) my_str_realloc(st->attr.buffer, + st->attr.buffer_size); + st->attr.start= st->attr.buffer; + st->attr.end= st->attr.start + ofs; + + return st->attr.buffer ? MY_XML_OK : MY_XML_ERROR; } - if (st->attrend > st->attr) + return MY_XML_OK; +} + + +/** rewind the attr buffer to initial state */ +static void my_xml_attr_rewind(MY_XML_PARSER *p) +{ + /* keep the buffer already allocated */ + p->attr.end= p->attr.start; +} + + +static int my_xml_enter(MY_XML_PARSER *st, const char *str, size_t len) +{ + if (my_xml_attr_ensure_space(st, len + 1 /* the separator char */)) + return MY_XML_ERROR; + + if (st->attr.end > st->attr.start) { - st->attrend[0]= '/'; - st->attrend++; + st->attr.end[0]= '/'; + st->attr.end++; } - memcpy(st->attrend,str,len); - st->attrend+=len; - st->attrend[0]='\0'; + memcpy(st->attr.end, str, len); + st->attr.end+= len; + st->attr.end[0]= '\0'; if (st->flags & MY_XML_FLAG_RELATIVE_NAMES) return st->enter ? st->enter(st, str, len) : MY_XML_OK; else - return st->enter ? st->enter(st,st->attr,st->attrend-st->attr) : MY_XML_OK; + return st->enter ? + st->enter(st, st->attr.start, st->attr.end - st->attr.start) : MY_XML_OK; } @@ -246,8 +293,8 @@ static int my_xml_leave(MY_XML_PARSER *p, const char *str, size_t slen) int rc; /* Find previous '/' or beginning */ - for (e=p->attrend; (e>p->attr) && (e[0] != '/') ; e--); - glen = (size_t) ((e[0] == '/') ? (p->attrend-e-1) : p->attrend-e); + for (e= p->attr.end; (e > p->attr.start) && (e[0] != '/') ; e--); + glen= (size_t) ((e[0] == '/') ? (p->attr.end - e - 1) : p->attr.end - e); if (str && (slen != glen)) { @@ -265,11 +312,12 @@ static int my_xml_leave(MY_XML_PARSER *p, const char *str, size_t slen) if (p->flags & MY_XML_FLAG_RELATIVE_NAMES) rc= p->leave_xml ? p->leave_xml(p, str, slen) : MY_XML_OK; else - rc= (p->leave_xml ? p->leave_xml(p,p->attr,p->attrend-p->attr) : + rc= (p->leave_xml ? + p->leave_xml(p, p->attr.start, p->attr.end - p->attr.start) : MY_XML_OK); *e='\0'; - p->attrend=e; + p->attr.end= e; return rc; } @@ -277,7 +325,9 @@ static int my_xml_leave(MY_XML_PARSER *p, const char *str, size_t slen) int my_xml_parse(MY_XML_PARSER *p,const char *str, size_t len) { - p->attrend=p->attr; + + my_xml_attr_rewind(p); + p->beg=str; p->cur=str; p->end=str+len; @@ -432,7 +482,7 @@ gt: } } - if (p->attr[0]) + if (p->attr.start[0]) { sprintf(p->errstr,"unexpected END-OF-INPUT"); return MY_XML_ERROR; @@ -443,12 +493,22 @@ gt: void my_xml_parser_create(MY_XML_PARSER *p) { - bzero((void*)p,sizeof(p[0])); + memset(p, 0, sizeof(p[0])); + /* + Use static buffer while it's sufficient. + */ + p->attr.start= p->attr.end= p->attr.static_buffer; + p->attr.buffer_size= sizeof(p->attr.static_buffer); } -void my_xml_parser_free(MY_XML_PARSER *p __attribute__((unused))) +void my_xml_parser_free(MY_XML_PARSER *p) { + if (p->attr.buffer) + { + my_str_free(p->attr.buffer); + p->attr.buffer= NULL; + } } |