ctype-utf8.c:

A faster UTF8 null-terminated string implementation. It is used for identifier comparison, so it's quite critical.
author: bar@mysql.com <> 2004-10-20 18:07:59 +0500
committer: bar@mysql.com <> 2004-10-20 18:07:59 +0500
commit: ed2a655a20d45e165cfc3ee904fce84d2e9b46b5 (patch)
tree: 8fda8fbf818ad79785bcfbb1d49f020213fa8326 /strings
parent: 0130f4669a67cb64dc22dad6a70dbdf7838bd3fb (diff)
download: mariadb-git-ed2a655a20d45e165cfc3ee904fce84d2e9b46b5.tar.gz
1 files changed, 87 insertions, 33 deletions
diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c
index f5d40fb8ded..c08a1c0acfb 100644
--- a/strings/ctype-utf8.c
+++ b/strings/ctype-utf8.c
@@ -2103,49 +2103,103 @@ static int my_strnncollsp_utf8(CHARSET_INFO *cs,
 }
 
 
-static int my_strncasecmp_utf8(CHARSET_INFO *cs,
-                const char *s, const char *t,  uint len)
-{
-  int s_res,t_res;
-  my_wc_t s_wc,t_wc;
-  const char *se=s+len;
-  const char *te=t+len;
-
-  while ( s < se && t < te )
-  {
-    int plane;
+/*
+  Compare 0-terminated UTF8 strings.
 
-    s_res=my_utf8_uni(cs,&s_wc, (const uchar*)s, (const uchar*)se);
-    t_res=my_utf8_uni(cs,&t_wc, (const uchar*)t, (const uchar*)te);
+  SYNOPSIS
+    my_strcasecmp_utf8()
+    cs                  character set handler
+    s                   First 0-terminated string to compare
+    t                   Second 0-terminated string to compare
 
-    if ( s_res <= 0 || t_res <= 0 )
-    {
-      /* Incorrect string, compare byte by byte value */
-      return bincmp(s, se, t, te);
-    }
+  IMPLEMENTATION
 
-    plane=(s_wc>>8) & 0xFF;
-    s_wc = uni_plane[plane] ? uni_plane[plane][s_wc & 0xFF].tolower : s_wc;
+  RETURN
+    - negative number if s < t
+    - positive number if s > t
+    - 0 is the strings are equal
+*/
 
-    plane=(t_wc>>8) & 0xFF;
-    t_wc = uni_plane[plane] ? uni_plane[plane][t_wc & 0xFF].tolower : t_wc;
+static
+int my_strcasecmp_utf8(CHARSET_INFO *cs, const char *s, const char *t)
+{
+  while (s[0] && t[0])
+  {
+    my_wc_t s_wc,t_wc;
 
+    if (s[0] >= 0)
+    {
+      /* 
+        s[0] is between 0 and 127.
+        It represents a single byte character.
+        Convert it into weight according to collation.
+      */
+      s_wc= plane00[(uchar) s[0]].tolower;
+      s++;
+    }
+    else
+    {
+      int plane, res;
+      
+      /*
+        Scan a multibyte character.
+
+        In the future it is worth to write a special version of my_utf8_uni()
+        for 0-terminated strings which will not take in account length. Now
+        we call the regular version of my_utf8_uni() with s+3 in the
+        last argument. s+3 is enough to scan any multibyte sequence.
+
+        Calling the regular version of my_utf8_uni is safe for 0-terminated
+        strings: we will never lose the end of the string:
+        If we have 0 character in the middle of a multibyte sequence,
+        then my_utf8_uni will always return a negative number, so the
+        loop with finish.
+      */
+      
+      res= my_utf8_uni(cs,&s_wc, (const uchar*)s, (const uchar*) s + 3);
+      
+      /* 
+         In the case of wrong multibyte sequence we will
+         call strcmp() for byte-to-byte comparison.
+      */
+      if (res <= 0)
+        return strcmp(s, t);
+      s+= res;
+      
+      /* Convert Unicode code into weight according to collation */
+      plane=(s_wc>>8) & 0xFF;
+      s_wc = uni_plane[plane] ? uni_plane[plane][s_wc & 0xFF].tolower : s_wc;
+    }
+    
+    
+    /* Do the same for the second string */
+    
+    if (t[0] >= 0)
+    {
+      /* Convert single byte character into weight */
+      t_wc= plane00[(uchar) t[0]].tolower;
+      t++;
+    }
+    else
+    {
+      int plane;
+      int res=my_utf8_uni(cs,&t_wc, (const uchar*)t, (const uchar*) t + 3);
+      if (res <= 0)
+        return strcmp(s, t);
+      t+= res;
+      
+      /* Convert code into weight */
+      plane=(t_wc>>8) & 0xFF;
+      t_wc = uni_plane[plane] ? uni_plane[plane][t_wc & 0xFF].tolower : t_wc;
+    }
+    
+    /* Now we have two weights, let's compare them */
     if ( s_wc != t_wc )
       return  ((int) s_wc) - ((int) t_wc);
-
-    s+=s_res;
-    t+=t_res;
   }
-  return ( (se-s) - (te-t) );
+  return ((int)(uchar)s[0]) - ((int) (uchar) t[0]);
 }
 
-static int my_strcasecmp_utf8(CHARSET_INFO *cs, const char *s, const char *t)
-{
-  uint s_len=strlen(s);
-  uint t_len=strlen(t);
-  uint len = (s_len > t_len) ? s_len : t_len;
-  return  my_strncasecmp_utf8(cs, s, t, len);
-}
 
 static
 int my_wildcmp_utf8(CHARSET_INFO *cs,
author	bar@mysql.com <>	2004-10-20 18:07:59 +0500
committer	bar@mysql.com <>	2004-10-20 18:07:59 +0500
commit	ed2a655a20d45e165cfc3ee904fce84d2e9b46b5 (patch)
tree	8fda8fbf818ad79785bcfbb1d49f020213fa8326 /strings
parent	0130f4669a67cb64dc22dad6a70dbdf7838bd3fb (diff)
download	mariadb-git-ed2a655a20d45e165cfc3ee904fce84d2e9b46b5.tar.gz