summaryrefslogtreecommitdiff
path: root/strings
diff options
context:
space:
mode:
authorunknown <bar@bar.intranet.mysql.r18.ru>2004-03-26 18:14:39 +0400
committerunknown <bar@bar.intranet.mysql.r18.ru>2004-03-26 18:14:39 +0400
commit63e1d22f8f46966c13d88a4f2e9acd7fa3e9c9b6 (patch)
tree0473522f9282da1daba81a8db274d031c52e0fe8 /strings
parent403948cbb3a27da905269857616c60c8fc4675ba (diff)
downloadmariadb-git-63e1d22f8f46966c13d88a4f2e9acd7fa3e9c9b6.tar.gz
UTF8 now process space as PAD character correctly.
Diffstat (limited to 'strings')
-rw-r--r--strings/ctype-utf8.c96
1 files changed, 88 insertions, 8 deletions
diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c
index 886ecfbd0c9..82787f2b65f 100644
--- a/strings/ctype-utf8.c
+++ b/strings/ctype-utf8.c
@@ -1837,18 +1837,98 @@ static int my_strnncoll_utf8(CHARSET_INFO *cs,
}
+
/*
- TODO: Has to be fixed as strnncollsp in ctype-simple
+ Compare strings, discarding end space
+
+ SYNOPSIS
+ my_strnncollsp_utf8()
+ cs character set handler
+ a First string to compare
+ a_length Length of 'a'
+ b Second string to compare
+ b_length Length of 'b'
+
+ IMPLEMENTATION
+ If one string is shorter as the other, then we space extend the other
+ so that the strings have equal length.
+
+ This will ensure that the following things hold:
+
+ "a" == "a "
+ "a\0" < "a"
+ "a\0" < "a "
+
+ RETURN
+ < 0 a < b
+ = 0 a == b
+ > 0 a > b
*/
-static
-int my_strnncollsp_utf8(CHARSET_INFO * cs,
- const uchar *s, uint slen,
- const uchar *t, uint tlen)
+static int my_strnncollsp_utf8(CHARSET_INFO *cs,
+ const uchar *s, uint slen,
+ const uchar *t, uint tlen)
{
- for ( ; slen && s[slen-1] == ' ' ; slen--);
- for ( ; tlen && t[tlen-1] == ' ' ; tlen--);
- return my_strnncoll_utf8(cs,s,slen,t,tlen);
+ int s_res,t_res;
+ my_wc_t s_wc,t_wc;
+ const uchar *se= s+slen;
+ const uchar *te= t+tlen;
+
+ while ( s < se && t < te )
+ {
+ int plane;
+ s_res=my_utf8_uni(cs,&s_wc, s, se);
+ t_res=my_utf8_uni(cs,&t_wc, t, te);
+
+ if ( s_res <= 0 || t_res <= 0 )
+ {
+ /* Incorrect string, compare by char value */
+ return ((int)s[0]-(int)t[0]);
+ }
+
+ plane=(s_wc>>8) & 0xFF;
+ s_wc = uni_plane[plane] ? uni_plane[plane][s_wc & 0xFF].sort : s_wc;
+ plane=(t_wc>>8) & 0xFF;
+ t_wc = uni_plane[plane] ? uni_plane[plane][t_wc & 0xFF].sort : t_wc;
+ if ( s_wc != t_wc )
+ {
+ return ((int) s_wc) - ((int) t_wc);
+ }
+
+ s+=s_res;
+ t+=t_res;
+ }
+
+ slen= se-s;
+ tlen= te-t;
+
+ if (slen != tlen)
+ {
+ int swap= 0;
+ if (slen < tlen)
+ {
+ slen= tlen;
+ s= t;
+ se= te;
+ swap= -1;
+ }
+ /*
+ This following loop uses the fact that in UTF-8
+ all multibyte characters are greater than space,
+ and all multibyte head characters are greater than
+ space. It means if we meet a character greater
+ than space, it always means that the longer string
+ is greater. So we can reuse the same loop from the
+ 8bit version, without having to process full multibute
+ sequences.
+ */
+ for ( ; s < se; s++)
+ {
+ if (*s != ' ')
+ return ((int)*s - (int) ' ') ^ swap;
+ }
+ }
+ return 0;
}