From 13b47b5e4340169906698054a27ac91bc76d6211 Mon Sep 17 00:00:00 2001 From: unknown Date: Tue, 29 Mar 2005 14:48:47 +0500 Subject: Adding Cybozu's patch. Not active by default. One need to pass -DHAVE_CYBOZU_COLLATION to activate it. mysys/charset-def.c: Adding Cybozu's patch. Not active by default. strings/ctype-utf8.c: Adding Cybozu's patch. Not active by default. --- mysys/charset-def.c | 6 ++ strings/ctype-utf8.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 172 insertions(+) diff --git a/mysys/charset-def.c b/mysys/charset-def.c index 3278566788c..c7fa0ffd8e0 100644 --- a/mysys/charset-def.c +++ b/mysys/charset-def.c @@ -62,6 +62,9 @@ extern CHARSET_INFO my_charset_utf8_slovak_uca_ci; extern CHARSET_INFO my_charset_utf8_spanish2_uca_ci; extern CHARSET_INFO my_charset_utf8_roman_uca_ci; extern CHARSET_INFO my_charset_utf8_persian_uca_ci; +#ifdef HAVE_CYBOZU_COLLATION +extern CHARSET_INFO my_charset_utf8_general_cs; +#endif #endif #endif /* HAVE_UCA_COLLATIONS */ @@ -146,6 +149,9 @@ my_bool init_compiled_charsets(myf flags __attribute__((unused))) #ifdef HAVE_CHARSET_utf8 add_compiled_collation(&my_charset_utf8_general_ci); add_compiled_collation(&my_charset_utf8_bin); +#ifdef HAVE_CYBOZU_COLLATION + add_compiled_collation(&my_charset_utf8_general_cs); +#endif #ifdef HAVE_UCA_COLLATIONS add_compiled_collation(&my_charset_utf8_general_uca_ci); add_compiled_collation(&my_charset_utf8_icelandic_uca_ci); diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c index 69371aa38c2..4d2bff5e89f 100644 --- a/strings/ctype-utf8.c +++ b/strings/ctype-utf8.c @@ -2380,6 +2380,172 @@ CHARSET_INFO my_charset_utf8_bin= &my_collation_mb_bin_handler }; +#ifdef HAVE_CYBOZU_COLLATION + +/* + * These functions bacically do the same as their original, except + * that they return 0 only when two comparing unicode strings are + * strictly the same in case-sensitive way. See "save_diff" local + * variable to what they actually do. + */ + +static int my_strnncoll_utf8_cs(CHARSET_INFO *cs, + const uchar *s, uint slen, + const uchar *t, uint tlen, + my_bool t_is_prefix) +{ + int s_res,t_res; + my_wc_t s_wc,t_wc; + const uchar *se=s+slen; + const uchar *te=t+tlen; + int save_diff = 0; + int diff; + + while ( s < se && t < te ) + { + int plane; + s_res=my_utf8_uni(cs,&s_wc, s, se); + t_res=my_utf8_uni(cs,&t_wc, t, te); + + if ( s_res <= 0 || t_res <= 0 ) + + { + /* Incorrect string, compare by char value */ + return ((int)s[0]-(int)t[0]); + } + + if ( save_diff == 0 ) + { + save_diff = ((int)s_wc) - ((int)t_wc); + } + plane=(s_wc>>8) & 0xFF; + s_wc = uni_plane[plane] ? uni_plane[plane][s_wc & 0xFF].sort : s_wc; + plane=(t_wc>>8) & 0xFF; + t_wc = uni_plane[plane] ? uni_plane[plane][t_wc & 0xFF].sort : t_wc; + if ( s_wc != t_wc ) + { + return ((int) s_wc) - ((int) t_wc); + } + + s+=s_res; + t+=t_res; + } + diff = ( (se-s) - (te-t) ); + return t_is_prefix ? t-te : ((diff == 0) ? save_diff : diff); +} + +static int my_strnncollsp_utf8_cs(CHARSET_INFO *cs, + const uchar *s, uint slen, + const uchar *t, uint tlen) +{ + int s_res,t_res; + my_wc_t s_wc,t_wc; + const uchar *se= s+slen; + const uchar *te= t+tlen; + int save_diff = 0; + + while ( s < se && t < te ) + { + int plane; + s_res=my_utf8_uni(cs,&s_wc, s, se); + t_res=my_utf8_uni(cs,&t_wc, t, te); + + if ( s_res <= 0 || t_res <= 0 ) + { + /* Incorrect string, compare by char value */ + return ((int)s[0]-(int)t[0]); + } + + if ( save_diff == 0 ) + { + save_diff = ((int)s_wc) - ((int)t_wc); + } + plane=(s_wc>>8) & 0xFF; + s_wc = uni_plane[plane] ? uni_plane[plane][s_wc & 0xFF].sort : s_wc; + plane=(t_wc>>8) & 0xFF; + t_wc = uni_plane[plane] ? uni_plane[plane][t_wc & 0xFF].sort : t_wc; + if ( s_wc != t_wc ) + { + return ((int) s_wc) - ((int) t_wc); + } + + s+=s_res; + t+=t_res; + } + + slen= se-s; + tlen= te-t; + + if (slen != tlen) + { + int swap= 0; + if (slen < tlen) + { + slen= tlen; + s= t; + se= te; + swap= -1; + } + /* + This following loop uses the fact that in UTF-8 + all multibyte characters are greater than space, + and all multibyte head characters are greater than + space. It means if we meet a character greater + than space, it always means that the longer string + is greater. So we can reuse the same loop from the + 8bit version, without having to process full multibute + sequences. + */ + for ( ; s < se; s++) + { + if (*s != ' ') + return ((int)*s - (int) ' ') ^ swap; + } + } + return save_diff; +} + +static MY_COLLATION_HANDLER my_collation_cs_handler = +{ + NULL, /* init */ + my_strnncoll_utf8_cs, + my_strnncollsp_utf8_cs, + my_strnxfrm_utf8, + my_like_range_simple, + my_wildcmp_mb, + my_strcasecmp_utf8, + my_instr_mb, + my_hash_sort_utf8 +}; + +CHARSET_INFO my_charset_utf8_general_cs= +{ + 254,0,0, /* number */ + MY_CS_COMPILED|MY_CS_UNICODE, /* state */ + "utf8", /* cs name */ + "utf8_general_cs", /* name */ + "", /* comment */ + NULL, /* tailoring */ + ctype_utf8, /* ctype */ + to_lower_utf8, /* to_lower */ + to_upper_utf8, /* to_upper */ + to_upper_utf8, /* sort_order */ + NULL, /* contractions */ + NULL, /* sort_order_big*/ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + NULL, /* state_map */ + NULL, /* ident_map */ + 1, /* strxfrm_multiply */ + 1, /* mbminlen */ + 3, /* mbmaxlen */ + 0, /* min_sort_char */ + 255, /* max_sort_char */ + &my_charset_utf8_handler, + &my_collation_cs_handler +}; +#endif /* Cybozu Hack */ + #ifdef MY_TEST_UTF8 #include -- cgit v1.2.1