summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--mysys/charset-def.c6
-rw-r--r--strings/ctype-utf8.c166
2 files changed, 172 insertions, 0 deletions
diff --git a/mysys/charset-def.c b/mysys/charset-def.c
index 3278566788c..c7fa0ffd8e0 100644
--- a/mysys/charset-def.c
+++ b/mysys/charset-def.c
@@ -62,6 +62,9 @@ extern CHARSET_INFO my_charset_utf8_slovak_uca_ci;
extern CHARSET_INFO my_charset_utf8_spanish2_uca_ci;
extern CHARSET_INFO my_charset_utf8_roman_uca_ci;
extern CHARSET_INFO my_charset_utf8_persian_uca_ci;
+#ifdef HAVE_CYBOZU_COLLATION
+extern CHARSET_INFO my_charset_utf8_general_cs;
+#endif
#endif
#endif /* HAVE_UCA_COLLATIONS */
@@ -146,6 +149,9 @@ my_bool init_compiled_charsets(myf flags __attribute__((unused)))
#ifdef HAVE_CHARSET_utf8
add_compiled_collation(&my_charset_utf8_general_ci);
add_compiled_collation(&my_charset_utf8_bin);
+#ifdef HAVE_CYBOZU_COLLATION
+ add_compiled_collation(&my_charset_utf8_general_cs);
+#endif
#ifdef HAVE_UCA_COLLATIONS
add_compiled_collation(&my_charset_utf8_general_uca_ci);
add_compiled_collation(&my_charset_utf8_icelandic_uca_ci);
diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c
index 69371aa38c2..4d2bff5e89f 100644
--- a/strings/ctype-utf8.c
+++ b/strings/ctype-utf8.c
@@ -2380,6 +2380,172 @@ CHARSET_INFO my_charset_utf8_bin=
&my_collation_mb_bin_handler
};
+#ifdef HAVE_CYBOZU_COLLATION
+
+/*
+ * These functions bacically do the same as their original, except
+ * that they return 0 only when two comparing unicode strings are
+ * strictly the same in case-sensitive way. See "save_diff" local
+ * variable to what they actually do.
+ */
+
+static int my_strnncoll_utf8_cs(CHARSET_INFO *cs,
+ const uchar *s, uint slen,
+ const uchar *t, uint tlen,
+ my_bool t_is_prefix)
+{
+ int s_res,t_res;
+ my_wc_t s_wc,t_wc;
+ const uchar *se=s+slen;
+ const uchar *te=t+tlen;
+ int save_diff = 0;
+ int diff;
+
+ while ( s < se && t < te )
+ {
+ int plane;
+ s_res=my_utf8_uni(cs,&s_wc, s, se);
+ t_res=my_utf8_uni(cs,&t_wc, t, te);
+
+ if ( s_res <= 0 || t_res <= 0 )
+
+ {
+ /* Incorrect string, compare by char value */
+ return ((int)s[0]-(int)t[0]);
+ }
+
+ if ( save_diff == 0 )
+ {
+ save_diff = ((int)s_wc) - ((int)t_wc);
+ }
+ plane=(s_wc>>8) & 0xFF;
+ s_wc = uni_plane[plane] ? uni_plane[plane][s_wc & 0xFF].sort : s_wc;
+ plane=(t_wc>>8) & 0xFF;
+ t_wc = uni_plane[plane] ? uni_plane[plane][t_wc & 0xFF].sort : t_wc;
+ if ( s_wc != t_wc )
+ {
+ return ((int) s_wc) - ((int) t_wc);
+ }
+
+ s+=s_res;
+ t+=t_res;
+ }
+ diff = ( (se-s) - (te-t) );
+ return t_is_prefix ? t-te : ((diff == 0) ? save_diff : diff);
+}
+
+static int my_strnncollsp_utf8_cs(CHARSET_INFO *cs,
+ const uchar *s, uint slen,
+ const uchar *t, uint tlen)
+{
+ int s_res,t_res;
+ my_wc_t s_wc,t_wc;
+ const uchar *se= s+slen;
+ const uchar *te= t+tlen;
+ int save_diff = 0;
+
+ while ( s < se && t < te )
+ {
+ int plane;
+ s_res=my_utf8_uni(cs,&s_wc, s, se);
+ t_res=my_utf8_uni(cs,&t_wc, t, te);
+
+ if ( s_res <= 0 || t_res <= 0 )
+ {
+ /* Incorrect string, compare by char value */
+ return ((int)s[0]-(int)t[0]);
+ }
+
+ if ( save_diff == 0 )
+ {
+ save_diff = ((int)s_wc) - ((int)t_wc);
+ }
+ plane=(s_wc>>8) & 0xFF;
+ s_wc = uni_plane[plane] ? uni_plane[plane][s_wc & 0xFF].sort : s_wc;
+ plane=(t_wc>>8) & 0xFF;
+ t_wc = uni_plane[plane] ? uni_plane[plane][t_wc & 0xFF].sort : t_wc;
+ if ( s_wc != t_wc )
+ {
+ return ((int) s_wc) - ((int) t_wc);
+ }
+
+ s+=s_res;
+ t+=t_res;
+ }
+
+ slen= se-s;
+ tlen= te-t;
+
+ if (slen != tlen)
+ {
+ int swap= 0;
+ if (slen < tlen)
+ {
+ slen= tlen;
+ s= t;
+ se= te;
+ swap= -1;
+ }
+ /*
+ This following loop uses the fact that in UTF-8
+ all multibyte characters are greater than space,
+ and all multibyte head characters are greater than
+ space. It means if we meet a character greater
+ than space, it always means that the longer string
+ is greater. So we can reuse the same loop from the
+ 8bit version, without having to process full multibute
+ sequences.
+ */
+ for ( ; s < se; s++)
+ {
+ if (*s != ' ')
+ return ((int)*s - (int) ' ') ^ swap;
+ }
+ }
+ return save_diff;
+}
+
+static MY_COLLATION_HANDLER my_collation_cs_handler =
+{
+ NULL, /* init */
+ my_strnncoll_utf8_cs,
+ my_strnncollsp_utf8_cs,
+ my_strnxfrm_utf8,
+ my_like_range_simple,
+ my_wildcmp_mb,
+ my_strcasecmp_utf8,
+ my_instr_mb,
+ my_hash_sort_utf8
+};
+
+CHARSET_INFO my_charset_utf8_general_cs=
+{
+ 254,0,0, /* number */
+ MY_CS_COMPILED|MY_CS_UNICODE, /* state */
+ "utf8", /* cs name */
+ "utf8_general_cs", /* name */
+ "", /* comment */
+ NULL, /* tailoring */
+ ctype_utf8, /* ctype */
+ to_lower_utf8, /* to_lower */
+ to_upper_utf8, /* to_upper */
+ to_upper_utf8, /* sort_order */
+ NULL, /* contractions */
+ NULL, /* sort_order_big*/
+ NULL, /* tab_to_uni */
+ NULL, /* tab_from_uni */
+ NULL, /* state_map */
+ NULL, /* ident_map */
+ 1, /* strxfrm_multiply */
+ 1, /* mbminlen */
+ 3, /* mbmaxlen */
+ 0, /* min_sort_char */
+ 255, /* max_sort_char */
+ &my_charset_utf8_handler,
+ &my_collation_cs_handler
+};
+#endif /* Cybozu Hack */
+
#ifdef MY_TEST_UTF8
#include <stdio.h>