summary refs log tree commit diff
path: root/strings
diff options
context:
space:
mode:
Diffstat (limited to 'strings')
-rw-r--r-- strings/ctype-utf8.c 323
1 files changed, 166 insertions, 157 deletions
diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c
index 7c3baac3c39..5e339725b1a 100644
--- a/strings/ctype-utf8.c
+++ b/strings/ctype-utf8.c
@@ -1,15 +1,15 @@
/* Copyright (C) 2000 MySQL AB
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public
License as published by the Free Software Foundation; either
version 2 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
-
+
You should have received a copy of the GNU Library General Public
License along with this library; if not, write to the Free
Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
@@ -1524,7 +1524,7 @@ MY_UNICASE_INFO *uni_plane[256]={
#ifdef HAVE_CHARSET_utf8
-/*
+/*
We consider bytes with code more than 127 as a letter.
This guarantees that word boundaries work fine with regular
expressions. Note, there is no need to mark byte 255 as a
@@ -1590,99 +1590,108 @@ static uchar to_upper_utf8[] = {
240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255
};
+static inline int bincmp(const uchar *s, const uchar *se,
+ const uchar *t, const uchar *te)
+{
+ int slen=se-s, tlen=te-t;
+ int len=min(slen,tlen);
+ int cmp= memcmp(s,t,len);
+ return cmp ? cmp : slen-tlen;
+}
+
static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
- my_wc_t * pwc, const uchar *s, const uchar *e)
+ my_wc_t * pwc, const uchar *s, const uchar *e)
{
unsigned char c;
-
+
if (s >= e)
return MY_CS_TOOFEW(0);
c= s[0];
- if (c < 0x80)
+ if (c < 0x80)
{
*pwc = c;
return 1;
- }
- else if (c < 0xc2)
+ }
+ else if (c < 0xc2)
return MY_CS_ILSEQ;
- else if (c < 0xe0)
+ else if (c < 0xe0)
{
- if (s+2 > e) /* We need 2 characters */
+ if (s+2 > e) /* We need 2 characters */
return MY_CS_TOOFEW(0);
-
+
if (!((s[1] ^ 0x80) < 0x40))
return MY_CS_ILSEQ;
-
+
*pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80);
return 2;
- }
- else if (c < 0xf0)
+ }
+ else if (c < 0xf0)
{
if (s+3 > e) /* We need 3 characters */
return MY_CS_TOOFEW(0);
-
+
if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && (c >= 0xe1 || s[1] >= 0xa0)))
return MY_CS_ILSEQ;
-
- *pwc = ((my_wc_t) (c & 0x0f) << 12) |
- ((my_wc_t) (s[1] ^ 0x80) << 6) |
+
+ *pwc = ((my_wc_t) (c & 0x0f) << 12) |
+ ((my_wc_t) (s[1] ^ 0x80) << 6) |
(my_wc_t) (s[2] ^ 0x80);
-
+
return 3;
- }
+ }
#ifdef UNICODE_32BIT
- else if (c < 0xf8 && sizeof(my_wc_t)*8 >= 32)
+ else if (c < 0xf8 && sizeof(my_wc_t)*8 >= 32)
{
if (s+4 > e) /* We need 4 characters */
return MY_CS_TOOFEW(0);
-
- if (!((s[1] ^ 0x80) < 0x40 &&
- (s[2] ^ 0x80) < 0x40 &&
- (s[3] ^ 0x80) < 0x40 &&
+
+ if (!((s[1] ^ 0x80) < 0x40 &&
+ (s[2] ^ 0x80) < 0x40 &&
+ (s[3] ^ 0x80) < 0x40 &&
(c >= 0xf1 || s[1] >= 0x90)))
return MY_CS_ILSEQ;
-
- *pwc = ((my_wc_t) (c & 0x07) << 18) |
- ((my_wc_t) (s[1] ^ 0x80) << 12) |
- ((my_wc_t) (s[2] ^ 0x80) << 6) |
+
+ *pwc = ((my_wc_t) (c & 0x07) << 18) |
+ ((my_wc_t) (s[1] ^ 0x80) << 12) |
+ ((my_wc_t) (s[2] ^ 0x80) << 6) |
(my_wc_t) (s[3] ^ 0x80);
-
+
return 4;
}
- else if (c < 0xfc && sizeof(my_wc_t)*8 >= 32)
+ else if (c < 0xfc && sizeof(my_wc_t)*8 >= 32)
{
if (s+5 >e) /* We need 5 characters */
return MY_CS_TOOFEW(0);
-
- if (!((s[1] ^ 0x80) < 0x40 &&
- (s[2] ^ 0x80) < 0x40 &&
- (s[3] ^ 0x80) < 0x40 &&
- (s[4] ^ 0x80) < 0x40 &&
+
+ if (!((s[1] ^ 0x80) < 0x40 &&
+ (s[2] ^ 0x80) < 0x40 &&
+ (s[3] ^ 0x80) < 0x40 &&
+ (s[4] ^ 0x80) < 0x40 &&
(c >= 0xf9 || s[1] >= 0x88)))
return MY_CS_ILSEQ;
-
- *pwc = ((my_wc_t) (c & 0x03) << 24) |
+
+ *pwc = ((my_wc_t) (c & 0x03) << 24) |
((my_wc_t) (s[1] ^ 0x80) << 18) |
((my_wc_t) (s[2] ^ 0x80) << 12) |
((my_wc_t) (s[3] ^ 0x80) << 6) |
(my_wc_t) (s[4] ^ 0x80);
return 5;
- }
- else if (c < 0xfe && sizeof(my_wc_t)*8 >= 32)
+ }
+ else if (c < 0xfe && sizeof(my_wc_t)*8 >= 32)
{
if ( s+6 >e ) /* We need 6 characters */
return MY_CS_TOOFEW(0);
-
- if (!((s[1] ^ 0x80) < 0x40 &&
- (s[2] ^ 0x80) < 0x40 &&
- (s[3] ^ 0x80) < 0x40 &&
- (s[4] ^ 0x80) < 0x40 &&
- (s[5] ^ 0x80) < 0x40 &&
+
+ if (!((s[1] ^ 0x80) < 0x40 &&
+ (s[2] ^ 0x80) < 0x40 &&
+ (s[3] ^ 0x80) < 0x40 &&
+ (s[4] ^ 0x80) < 0x40 &&
+ (s[5] ^ 0x80) < 0x40 &&
(c >= 0xfd || s[1] >= 0x84)))
return MY_CS_ILSEQ;
-
+
*pwc = ((my_wc_t) (c & 0x01) << 30)
| ((my_wc_t) (s[1] ^ 0x80) << 24)
| ((my_wc_t) (s[2] ^ 0x80) << 18)
@@ -1702,12 +1711,12 @@ static int my_uni_utf8 (CHARSET_INFO *cs __attribute__((unused)) ,
if (r >= e)
return MY_CS_TOOSMALL;
-
- if (wc < 0x80)
+
+ if (wc < 0x80)
count = 1;
- else if (wc < 0x800)
+ else if (wc < 0x800)
count = 2;
- else if (wc < 0x10000)
+ else if (wc < 0x10000)
count = 3;
#ifdef UNICODE_32BIT
else if (wc < 0x200000)
@@ -1718,15 +1727,15 @@ static int my_uni_utf8 (CHARSET_INFO *cs __attribute__((unused)) ,
count = 6;
#endif
else return MY_CS_ILUNI;
-
- /*
- e is a character after the string r, not the last character of it.
+
+ /*
+ e is a character after the string r, not the last character of it.
Because of that, the check is (r+count > e), not (r+count-1 > e).
*/
- if ( r+count > e )
+ if ( r+count > e )
return MY_CS_TOOSMALL;
-
- switch (count) {
+
+ switch (count) {
/* Fall through all cases!!! */
#ifdef UNICODE_32BIT
case 6: r[5] = (uchar) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0x4000000;
@@ -1806,8 +1815,8 @@ static void my_casedn_str_utf8(CHARSET_INFO *cs, char * s)
}
-static int my_strnncoll_utf8(CHARSET_INFO *cs,
- const uchar *s, uint slen,
+static int my_strnncoll_utf8(CHARSET_INFO *cs,
+ const uchar *s, uint slen,
const uchar *t, uint tlen,
my_bool t_is_prefix)
{
@@ -1821,13 +1830,13 @@ static int my_strnncoll_utf8(CHARSET_INFO *cs,
int plane;
s_res=my_utf8_uni(cs,&s_wc, s, se);
t_res=my_utf8_uni(cs,&t_wc, t, te);
-
+
if ( s_res <= 0 || t_res <= 0 )
{
- /* Incorrect string, compare by char value */
- return ((int)s[0]-(int)t[0]);
+ /* Incorrect string, compare byte by byte value */
+ return bincmp(s, se, t, te);
}
-
+
plane=(s_wc>>8) & 0xFF;
s_wc = uni_plane[plane] ? uni_plane[plane][s_wc & 0xFF].sort : s_wc;
plane=(t_wc>>8) & 0xFF;
@@ -1836,7 +1845,7 @@ static int my_strnncoll_utf8(CHARSET_INFO *cs,
{
return ((int) s_wc) - ((int) t_wc);
}
-
+
s+=s_res;
t+=t_res;
}
@@ -1850,11 +1859,11 @@ static int my_strnncoll_utf8(CHARSET_INFO *cs,
SYNOPSIS
my_strnncollsp_utf8()
- cs character set handler
- a First string to compare
- a_length Length of 'a'
- b Second string to compare
- b_length Length of 'b'
+ cs character set handler
+ a First string to compare
+ a_length Length of 'a'
+ b Second string to compare
+ b_length Length of 'b'
IMPLEMENTATION
If one string is shorter than the other, then we space extend the other
@@ -1867,32 +1876,32 @@ static int my_strnncoll_utf8(CHARSET_INFO *cs,
"a\0" < "a "
RETURN
- < 0 a < b
- = 0 a == b
- > 0 a > b
+ < 0 a < b
+ = 0 a == b
+ > 0 a > b
*/
-static int my_strnncollsp_utf8(CHARSET_INFO *cs,
- const uchar *s, uint slen,
- const uchar *t, uint tlen)
+static int my_strnncollsp_utf8(CHARSET_INFO *cs,
+ const uchar *s, uint slen,
+ const uchar *t, uint tlen)
{
int s_res,t_res;
my_wc_t s_wc,t_wc;
const uchar *se= s+slen;
const uchar *te= t+tlen;
-
+
while ( s < se && t < te )
{
int plane;
s_res=my_utf8_uni(cs,&s_wc, s, se);
t_res=my_utf8_uni(cs,&t_wc, t, te);
-
+
if ( s_res <= 0 || t_res <= 0 )
{
- /* Incorrect string, compare by char value */
- return ((int)s[0]-(int)t[0]);
+ /* Incorrect string, compare byte by byte value */
+ return bincmp(s, se, t, te);
}
-
+
plane=(s_wc>>8) & 0xFF;
s_wc = uni_plane[plane] ? uni_plane[plane][s_wc & 0xFF].sort : s_wc;
plane=(t_wc>>8) & 0xFF;
@@ -1901,14 +1910,14 @@ static int my_strnncollsp_utf8(CHARSET_INFO *cs,
{
return ((int) s_wc) - ((int) t_wc);
}
-
+
s+=s_res;
t+=t_res;
}
-
+
slen= se-s;
tlen= te-t;
-
+
if (slen != tlen)
{
int swap= 0;
@@ -1940,35 +1949,35 @@ static int my_strnncollsp_utf8(CHARSET_INFO *cs,
static int my_strncasecmp_utf8(CHARSET_INFO *cs,
- const char *s, const char *t, uint len)
+ const char *s, const char *t, uint len)
{
int s_res,t_res;
my_wc_t s_wc,t_wc;
const char *se=s+len;
const char *te=t+len;
-
+
while ( s < se && t < te )
{
int plane;
-
+
s_res=my_utf8_uni(cs,&s_wc, (const uchar*)s, (const uchar*)se);
t_res=my_utf8_uni(cs,&t_wc, (const uchar*)t, (const uchar*)te);
-
+
if ( s_res <= 0 || t_res <= 0 )
{
- /* Incorrect string, compare by char value */
- return ((int)s[0]-(int)t[0]);
+ /* Incorrect string, compare byte by byte value */
+ return bincmp(s, se, t, te);
}
-
+
plane=(s_wc>>8) & 0xFF;
s_wc = uni_plane[plane] ? uni_plane[plane][s_wc & 0xFF].tolower : s_wc;
plane=(t_wc>>8) & 0xFF;
t_wc = uni_plane[plane] ? uni_plane[plane][t_wc & 0xFF].tolower : t_wc;
-
+
if ( s_wc != t_wc )
return ((int) s_wc) - ((int) t_wc);
-
+
s+=s_res;
t+=t_res;
}
@@ -1983,9 +1992,9 @@ static int my_strcasecmp_utf8(CHARSET_INFO *cs, const char *s, const char *t)
return my_strncasecmp_utf8(cs, s, t, len);
}
-static int my_strnxfrm_utf8(CHARSET_INFO *cs,
- uchar *dst, uint dstlen,
- const uchar *src, uint srclen)
+static int my_strnxfrm_utf8(CHARSET_INFO *cs,
+ uchar *dst, uint dstlen,
+ const uchar *src, uint srclen)
{
my_wc_t wc;
int res;
@@ -2002,10 +2011,10 @@ static int my_strnxfrm_utf8(CHARSET_INFO *cs,
}
src+=res;
srclen-=res;
-
+
plane=(wc>>8) & 0xFF;
wc = uni_plane[plane] ? uni_plane[plane][wc & 0xFF].sort : wc;
-
+
if ((res=my_uni_utf8(cs,wc,dst,de)) <0)
{
break;
@@ -2026,18 +2035,18 @@ static int my_mbcharlen_utf8(CHARSET_INFO *cs __attribute__((unused)) , uint c)
{
if (c < 0x80)
return 1;
- else if (c < 0xc2)
+ else if (c < 0xc2)
return 0; /* Illegal mb head */
- else if (c < 0xe0)
+ else if (c < 0xe0)
return 2;
- else if (c < 0xf0)
+ else if (c < 0xf0)
return 3;
#ifdef UNICODE_32BIT
- else if (c < 0xf8)
+ else if (c < 0xf8)
return 4;
- else if (c < 0xfc)
+ else if (c < 0xfc)
return 5;
- else if (c < 0xfe)
+ else if (c < 0xfe)
return 6;
#endif
return 0; /* Illegal mb head */;
@@ -2046,7 +2055,7 @@ static int my_mbcharlen_utf8(CHARSET_INFO *cs __attribute__((unused)) , uint c)
static MY_COLLATION_HANDLER my_collation_ci_handler =
{
- NULL, /* init */
+ NULL, /* init */
my_strnncoll_utf8,
my_strnncollsp_utf8,
my_strnxfrm_utf8,
@@ -2059,7 +2068,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
MY_CHARSET_HANDLER my_charset_utf8_handler=
{
- NULL, /* init */
+ NULL, /* init */
my_ismbchar_utf8,
my_mbcharlen_utf8,
my_numchars_mb,
@@ -2089,27 +2098,27 @@ MY_CHARSET_HANDLER my_charset_utf8_handler=
CHARSET_INFO my_charset_utf8_general_ci=
{
- 33,0,0, /* number */
- MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_STRNXFRM|MY_CS_UNICODE, /* state */
- "utf8", /* cs name */
- "utf8_general_ci", /* name */
- "", /* comment */
- NULL, /* tailoring */
- ctype_utf8, /* ctype */
- to_lower_utf8, /* to_lower */
- to_upper_utf8, /* to_upper */
- to_upper_utf8, /* sort_order */
- NULL, /* contractions */
- NULL, /* sort_order_big*/
- NULL, /* tab_to_uni */
- NULL, /* tab_from_uni */
- NULL, /* state_map */
- NULL, /* ident_map */
- 1, /* strxfrm_multiply */
- 1, /* mbminlen */
- 3, /* mbmaxlen */
- 0, /* min_sort_char */
- 255, /* max_sort_char */
+ 33,0,0, /* number */
+ MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_STRNXFRM|MY_CS_UNICODE, /* state */
+ "utf8", /* cs name */
+ "utf8_general_ci", /* name */
+ "", /* comment */
+ NULL, /* tailoring */
+ ctype_utf8, /* ctype */
+ to_lower_utf8, /* to_lower */
+ to_upper_utf8, /* to_upper */
+ to_upper_utf8, /* sort_order */
+ NULL, /* contractions */
+ NULL, /* sort_order_big*/
+ NULL, /* tab_to_uni */
+ NULL, /* tab_from_uni */
+ NULL, /* state_map */
+ NULL, /* ident_map */
+ 1, /* strxfrm_multiply */
+ 1, /* mbminlen */
+ 3, /* mbmaxlen */
+ 0, /* min_sort_char */
+ 255, /* max_sort_char */
&my_charset_utf8_handler,
&my_collation_ci_handler
};
@@ -2117,27 +2126,27 @@ CHARSET_INFO my_charset_utf8_general_ci=
CHARSET_INFO my_charset_utf8_bin=
{
- 83,0,0, /* number */
- MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_UNICODE, /* state */
- "utf8", /* cs name */
- "utf8_bin", /* name */
- "", /* comment */
- NULL, /* tailoring */
- ctype_utf8, /* ctype */
- to_lower_utf8, /* to_lower */
- to_upper_utf8, /* to_upper */
- NULL, /* sort_order */
- NULL, /* contractions */
- NULL, /* sort_order_big*/
- NULL, /* tab_to_uni */
- NULL, /* tab_from_uni */
- NULL, /* state_map */
- NULL, /* ident_map */
- 1, /* strxfrm_multiply */
- 1, /* mbminlen */
- 3, /* mbmaxlen */
- 0, /* min_sort_char */
- 255, /* max_sort_char */
+ 83,0,0, /* number */
+ MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_UNICODE, /* state */
+ "utf8", /* cs name */
+ "utf8_bin", /* name */
+ "", /* comment */
+ NULL, /* tailoring */
+ ctype_utf8, /* ctype */
+ to_lower_utf8, /* to_lower */
+ to_upper_utf8, /* to_upper */
+ NULL, /* sort_order */
+ NULL, /* contractions */
+ NULL, /* sort_order_big*/
+ NULL, /* tab_to_uni */
+ NULL, /* tab_from_uni */
+ NULL, /* state_map */
+ NULL, /* ident_map */
+ 1, /* strxfrm_multiply */
+ 1, /* mbminlen */
+ 3, /* mbmaxlen */
+ 0, /* min_sort_char */
+ 255, /* max_sort_char */
&my_charset_utf8_handler,
&my_collation_mb_bin_handler
};
@@ -2155,8 +2164,8 @@ static void test_mb(CHARSET_INFO *cs, uchar *s)
int len=my_mbcharlen_utf8(cs,*s);
while(len--)
{
- printf("%c",*s);
- s++;
+ printf("%c",*s);
+ s++;
}
printf("\n");
}
@@ -2172,23 +2181,23 @@ int main()
{
char str[1024]=" utf8 test проба ПЕРА по-РУССКИ";
CHARSET_INFO *cs;
-
+
test_mb(cs,(uchar*)str);
-
+
printf("orig :'%s'\n",str);
-
+
my_caseup_utf8(cs,str,15);
printf("caseup :'%s'\n",str);
-
+
my_caseup_str_utf8(cs,str);
printf("caseup_str:'%s'\n",str);
-
+
my_casedn_utf8(cs,str,15);
printf("casedn :'%s'\n",str);
-
+
my_casedn_str_utf8(cs,str);
printf("casedn_str:'%s'\n",str);
-
+
return 0;
}