merge with 4.1

BitKeeper/etc/ignore: auto-union BitKeeper/etc/logging_ok: auto-union BitKeeper/triggers/post-commit: Auto merged Docs/Support/texi2html: Auto merged Makefile.am: Auto merged client/Makefile.am: Auto merged client/mysql.cc: Auto merged client/mysqldump.c: Auto merged include/my_base.h: Auto merged include/my_global.h: Auto merged include/my_pthread.h: Auto merged include/my_sys.h: Auto merged include/my_time.h: Auto merged include/mysql.h: Auto merged include/mysql_com.h: Auto merged innobase/buf/buf0buf.c: Auto merged innobase/include/row0mysql.h: Auto merged innobase/row/row0sel.c: Auto merged libmysql/libmysql.c: Auto merged libmysqld/examples/Makefile.am: Auto merged myisam/mi_check.c: Auto merged mysql-test/include/ps_modify.inc: Auto merged mysql-test/install_test_db.sh: Auto merged mysql-test/r/alter_table.result: Auto merged mysql-test/r/auto_increment.result: Auto merged mysql-test/r/bdb.result: Auto merged mysql-test/r/ctype_latin1_de.result: Auto merged mysql-test/r/ctype_recoding.result: Auto merged mysql-test/r/fulltext.result: Auto merged mysql-test/r/func_gconcat.result: Auto merged mysql-test/r/func_group.result: Auto merged mysql-test/r/func_if.result: Auto merged mysql-test/t/derived.test: Auto merged mysql-test/t/insert.test: merge with 4.1 Fixed test case to not use 'if exists' when it shouldn't mysql-test/t/range.test: merge with 4.1 Added missing drop table sql/ha_ndbcluster.cc: merge with 4.1 Simple optimization: use max() instead of ? 
: sql/item_func.cc: merge with 4.1 (Added back old variable names for easier merges) sql/opt_range.cc: merge with 4.1 Removed argument 'parent_alloc' from QUICK_RANGE_SELECT as this was not used Added assert if using QUICK_GROUP_MIN_MAX_SELECT with parent_alloc as the init() function can't handle this Changed back get_quick_select_for_ref() to use its own alloc root because this function may be called several times for one query sql/sql_handler.cc: merge with 4.1 change variable 'err' to 'error' as same function had a label named 'err' sql/sql_update.cc: Use multi-update code from 5.0 instead of 4.1 We will fix the locking code shortly in 5.0 to be faster than in 4.1
author: unknown <monty@mysql.com> 2004-10-29 19:26:52 +0300
committer: unknown <monty@mysql.com> 2004-10-29 19:26:52 +0300
commit: f095274fe8c3d3394d6c0ce0a68f4bea04311999 (patch)
tree: 23bcc9a71fe7237887a111b158e30f5a6bb665d3 /strings
parent: f41bba8c6156a7adf4c67dfa75e16112767a5d3c (diff)
parent: 5be6c328f5a9f78f37176bbbd88a538fa3b65fe9 (diff)
download: mariadb-git-f095274fe8c3d3394d6c0ce0a68f4bea04311999.tar.gz
17 files changed, 1083 insertions, 283 deletions
diff --git a/strings/CHARSET_INFO.txt b/strings/CHARSET_INFO.txt
new file mode 100644
index 00000000000..f7a10f95880
--- /dev/null
+++ b/strings/CHARSET_INFO.txt
@@ -0,0 +1,230 @@
+
+CHARSET_INFO
+============
+A structure containing data for charset+collation pair implementation. 
+
+Virtual functions which use this data are collected
+into separate structures MY_CHARSET_HANDLER and
+MY_COLLATION_HANDLER.
+
+
+typedef struct charset_info_st
+{
+  uint      number;
+  uint      primary_number;
+  uint      binary_number;
+  uint      state;
+
+  const char *csname;
+  const char *name;
+  const char *comment;
+
+  uchar    *ctype;
+  uchar    *to_lower;
+  uchar    *to_upper;
+  uchar    *sort_order;
+
+  uint16      *tab_to_uni;
+  MY_UNI_IDX  *tab_from_uni;
+
+  uchar state_map[256];
+  uchar ident_map[256];
+
+  uint      strxfrm_multiply;
+  uint      mbminlen;
+  uint      mbmaxlen;
+  char      max_sort_char; /* For LIKE optimization */
+
+  MY_CHARSET_HANDLER *cset;
+  MY_COLLATION_HANDLER *coll;
+
+} CHARSET_INFO;
+
+
+CHARSET_INFO fields description:
+===============================
+
+
+Numbers (identifiers)
+---------------------
+
+number - an ID uniquely identifying this charset+collation pair.
+
+primary_number - ID of a charset+collation pair, which consists
+of the same character set and the default collation of this
+character set. Not really used now. Intended to optimize some
+parts of the code where we need to find the default collation
+using its non-default counterpart for the given character set.
+
+binary_number - ID of a charset+collation pair, which consists
+of the same character set and the binary collation of this
+character set. Not really used now. 
+
+Names
+-----
+
+  csname  - name of the character set for this charset+collation pair.
+  name    - name of the collation for this charset+collation pair.
+  comment - a text comment, displayed in "Description" column of
+            SHOW CHARACTER SET output.
+
+Conversion tables
+-----------------
+  
+  ctype      - pointer to array[257] of "type of characters"
+               bit mask for each character, e.g. if a
+               character is a digit or a letter or a separator, etc.
+
+               Monty 2004-10-21:
+                 If you look at the macros, we use ctype[(char)+1].
+                 ctype[0] is traditionally in most ctype libraries
+                 reserved for EOF (-1). The idea is that you can use
+                 the result from fgetc() directly with ctype[]. As
+                 we have to be compatible with external ctype[] versions,
+                 it's better to do it the same way as they do...
+
+  to_lower   - pointer to array[256] used in LCASE()
+  to_upper   - pointer to array[256] used in UCASE()
+  sort_order - pointer to array[256] used for strings comparison
+
+
+
+Unicode conversion data
+-----------------------
+For 8bit character sets:
+
+tab_to_uni  : array[256] of charset->Unicode translation
+tab_from_uni: a structure for Unicode->charset translation
+
+Non-8 bit charsets have their own structures per charset
+hidden in the corresponding ctype-xxx.c file and don't use
+tab_to_uni and tab_from_uni tables.
+
+
+Parser maps
+-----------
+state_map[]
+ident_map[]
+
+ These maps are used to quickly identify whether a character is
+part of an identifier, a digit, a special character,
+or part of another SQL language lexical item.
+
+Probably can be combined with ctype array in the future.
+But for some reasons these two arrays are used in the parser,
+while a separate ctype[] array is used in the other part of the
+code, like fulltext, etc.
+
+
+Misc fields
+-----------
+
+  strxfrm_multiply - how many times a sort key (i.e. a string
+                     which can be passed into memcmp() for comparison)
+                     can be longer than the original string. 
+                     Usually it is 1. For some complex
+                     collations it can be bigger. For example
+                     in latin1_german2_ci, a sort key can be up to
+                     twice as long as the original string.
+                     e.g. Letter 'A' with two dots above is
+                     substituted with 'AE'. 
+  mbminlen         - minimum multibyte sequence length.
+                     Now always 1 except ucs2. For ucs2
+                     it is 2.
+  mbmaxlen         - maximum multibyte sequence length.
+                     1 for 8bit charsets. Can be also 2 or 3.
+
+
+
+MY_CHARSET_HANDLER
+==================
+
+MY_CHARSET_HANDLER is a collection of character-set
+related routines. It is defined in m_ctype.h and has the
+following set of functions:
+
+Multibyte routines
+------------------
+ismbchar()  - detects if the given string is a multibyte sequence
+mbcharlen() - returns length of multibyte sequence starting with
+              the given character
+numchars()  - returns number of characters in the given string, e.g.
+              in SQL function CHAR_LENGTH().
+charpos()   - calculates the offset of the given position in the string.
+              Used in SQL functions LEFT(), RIGHT(), SUBSTRING(), 
+              INSERT()
+
+well_formed_length()
+            - finds the length of the correctly formed multibyte beginning.
+              Used in INSERTs to cut a beginning of the given string
+              which is
+              a) "well formed" according to the given character set.
+              b)  can fit into the given data type
+              Terminates the string in the good position, taking into account
+              multibyte character boundaries.
+
+lengthsp()  - returns the length of the given string without trailing spaces.
+
+
+Unicode conversion routines
+---------------------------
+mb_wc       - converts the leading multibyte sequence into its Unicode code.
+wc_mb       - converts the given Unicode code into a multibyte sequence.
+
+
+Case and sort conversion
+------------------------
+caseup_str  - converts the given 0-terminated string into the upper case
+casedn_str  - converts the given 0-terminated string into the lower case
+caseup      - converts the given string into the upper case using length
+casedn      - converts the given string into the lower case using length
+
+Number-to-string conversion routines
+------------------------------------
+snprintf()
+long10_to_str()
+longlong10_to_str()
+
+The names are pretty self-descriptive.
+
+String padding routines
+-----------------------
+fill()     - writes the given Unicode value into the given string
+             with the given length. Used to pad the string, usually
+             with space character, according to the given charset.
+
+String-to-number conversion routines
+------------------------------------
+strntol()
+strntoul()
+strntoll()
+strntoull()
+strntod()
+
+These functions do almost the same thing as their
+STDLIB counterparts, but also:
+  - accept length instead of 0-terminator
+  - and are character set dependent
+
+Simple scanner routines
+-----------------------
+scan()    - to skip leading spaces in the given string.
+            Used when a string value is inserted into a numeric field.
+
+
+
+MY_COLLATION_HANDLER
+====================
+strnncoll()   - compares two strings according to the given collation
+strnncollsp() - like the above but ignores trailing spaces
+strnxfrm()    - makes a sort key suitable for memcmp() corresponding
+                to the given string
+like_range()  - creates a LIKE range, for optimizer
+wildcmp()     - wildcard comparison, for LIKE
+strcasecmp()  - 0-terminated string comparison
+instr()       - finds the first occurrence of a substring in the string
+hash_sort()   - calculates hash value taking into account
+                the collation rules, e.g. case-insensitivity, 
+                accent sensitivity, etc.
+
+ 
+\ No newline at end of file
diff --git a/strings/Makefile.am b/strings/Makefile.am
index 31b5195d5cb..f8fcfbc5ea3 100644
--- a/strings/Makefile.am
+++ b/strings/Makefile.am
@@ -57,7 +57,7 @@ EXTRA_DIST =		ctype-big5.c ctype-czech.c ctype-euc_kr.c ctype-win1250ch.c \
 			t_ctype.h
 
 libmystrings_a_LIBADD=
-conf_to_src_SOURCES = conf_to_src.c xml.c ctype.c
+conf_to_src_SOURCES = conf_to_src.c xml.c ctype.c bcmp.c
 conf_to_src_LDADD=
 #force static linking of conf_to_src - essential when linking against
 #custom installation of libc
diff --git a/strings/ctype-big5.c b/strings/ctype-big5.c
index 3f35f7504ac..8345c53202c 100644
--- a/strings/ctype-big5.c
+++ b/strings/ctype-big5.c
@@ -6290,7 +6290,7 @@ static MY_CHARSET_HANDLER my_charset_big5_handler=
   my_charpos_mb,
   my_well_formed_len_mb,
   my_lengthsp_8bit,
-  my_numcells_mb,
+  my_numcells_8bit,
   my_mb_wc_big5,	/* mb_wc       */
   my_wc_mb_big5,	/* wc_mb       */
   my_caseup_str_mb,
@@ -6306,6 +6306,7 @@ static MY_CHARSET_HANDLER my_charset_big5_handler=
   my_strntoll_8bit,
   my_strntoull_8bit,
   my_strntod_8bit,
+  my_strtoll10_8bit,
   my_scan_8bit
 };
 
diff --git a/strings/ctype-bin.c b/strings/ctype-bin.c
index 42dc0ab086d..7d17f62c8d0 100644
--- a/strings/ctype-bin.c
+++ b/strings/ctype-bin.c
@@ -465,6 +465,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
     my_strntoll_8bit,
     my_strntoull_8bit,
     my_strntod_8bit,
+    my_strtoll10_8bit,
     my_scan_8bit
 };
 
diff --git a/strings/ctype-euc_kr.c b/strings/ctype-euc_kr.c
index 43a50b0dfbe..ee792d9c3e4 100644
--- a/strings/ctype-euc_kr.c
+++ b/strings/ctype-euc_kr.c
@@ -8657,7 +8657,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
   my_charpos_mb,
   my_well_formed_len_mb,
   my_lengthsp_8bit,
-  my_numcells_mb,
+  my_numcells_8bit,
   my_mb_wc_euc_kr,	/* mb_wc   */
   my_wc_mb_euc_kr,	/* wc_mb   */
   my_caseup_str_mb,
@@ -8673,6 +8673,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
   my_strntoll_8bit,
   my_strntoull_8bit,
   my_strntod_8bit,
+  my_strtoll10_8bit,
   my_scan_8bit
 };
 
diff --git a/strings/ctype-gb2312.c b/strings/ctype-gb2312.c
index 8d97ac9ca1d..f17cc94723f 100644
--- a/strings/ctype-gb2312.c
+++ b/strings/ctype-gb2312.c
@@ -5708,7 +5708,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
   my_charpos_mb,
   my_well_formed_len_mb,
   my_lengthsp_8bit,
-  my_numcells_mb,
+  my_numcells_8bit,
   my_mb_wc_gb2312,	/* mb_wc      */
   my_wc_mb_gb2312,	/* wc_mb      */
   my_caseup_str_mb,
@@ -5724,6 +5724,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
   my_strntoll_8bit,
   my_strntoull_8bit,
   my_strntod_8bit,
+  my_strtoll10_8bit,
   my_scan_8bit
 };
 
diff --git a/strings/ctype-gbk.c b/strings/ctype-gbk.c
index 9400fb08f2b..0be56e8d946 100644
--- a/strings/ctype-gbk.c
+++ b/strings/ctype-gbk.c
@@ -9939,7 +9939,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
   my_charpos_mb,
   my_well_formed_len_mb,
   my_lengthsp_8bit,
-  my_numcells_mb,
+  my_numcells_8bit,
   my_mb_wc_gbk,
   my_wc_mb_gbk,
   my_caseup_str_mb,
@@ -9955,6 +9955,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
   my_strntoll_8bit,
   my_strntoull_8bit,
   my_strntod_8bit,
+  my_strtoll10_8bit,
   my_scan_8bit
 };
 
diff --git a/strings/ctype-latin1.c b/strings/ctype-latin1.c
index aea517811ab..5f1850b7772 100644
--- a/strings/ctype-latin1.c
+++ b/strings/ctype-latin1.c
@@ -403,6 +403,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
     my_strntoll_8bit,
     my_strntoull_8bit,
     my_strntod_8bit,
+    my_strtoll10_8bit,
     my_scan_8bit
 };
 
diff --git a/strings/ctype-mb.c b/strings/ctype-mb.c
index 2548a68ab19..7d81766c4cb 100644
--- a/strings/ctype-mb.c
+++ b/strings/ctype-mb.c
@@ -123,8 +123,7 @@ int my_strcasecmp_mb(CHARSET_INFO * cs,const char *s, const char *t)
 **	 1 if matched with wildcard
 */
 
-#define INC_PTR(cs,A,B) A+=((use_mb_flag && \
-                          my_ismbchar(cs,A,B)) ? my_ismbchar(cs,A,B) : 1)
+#define INC_PTR(cs,A,B) A+=(my_ismbchar(cs,A,B) ? my_ismbchar(cs,A,B) : 1)
 
 #define likeconv(s,A) (uchar) (s)->sort_order[(uchar) (A)]
 
@@ -135,8 +134,6 @@ int my_wildcmp_mb(CHARSET_INFO *cs,
 {
   int result= -1;				/* Not found, using wildcards */
 
-  bool use_mb_flag=use_mb(cs);
-
   while (wildstr != wildend)
   {
     while (*wildstr != w_many && *wildstr != w_one)
@@ -144,8 +141,7 @@ int my_wildcmp_mb(CHARSET_INFO *cs,
       int l;
       if (*wildstr == escape && wildstr+1 != wildend)
 	wildstr++;
-      if (use_mb_flag &&
-          (l = my_ismbchar(cs, wildstr, wildend)))
+      if ((l = my_ismbchar(cs, wildstr, wildend)))
       {
 	  if (str+l > str_end || memcmp(str, wildstr, l) != 0)
 	      return 1;
@@ -200,41 +196,30 @@ int my_wildcmp_mb(CHARSET_INFO *cs,
 	cmp= *++wildstr;
 	
       mb=wildstr;
-      LINT_INIT(mblen);
-      if (use_mb_flag)
-        mblen = my_ismbchar(cs, wildstr, wildend);
+      mblen= my_ismbchar(cs, wildstr, wildend);
       INC_PTR(cs,wildstr,wildend);		/* This is compared trough cmp */
       cmp=likeconv(cs,cmp);   
       do
       {
-        if (use_mb_flag)
-	{
-          for (;;)
+        for (;;)
+        {
+          if (str >= str_end)
+            return -1;
+          if (mblen)
           {
-            if (str >= str_end)
-              return -1;
-            if (mblen)
+            if (str+mblen <= str_end && memcmp(str, mb, mblen) == 0)
             {
-              if (str+mblen <= str_end && memcmp(str, mb, mblen) == 0)
-              {
-                str += mblen;
-                break;
-              }
-            }
-            else if (!my_ismbchar(cs, str, str_end) &&
-                     likeconv(cs,*str) == cmp)
-            {
-              str++;
+              str += mblen;
               break;
             }
-            INC_PTR(cs,str, str_end);
           }
-	}
-        else
-        {
-          while (str != str_end && likeconv(cs,*str) != cmp)
+          else if (!my_ismbchar(cs, str, str_end) &&
+                   likeconv(cs,*str) == cmp)
+          {
             str++;
-          if (str++ == str_end) return (-1);
+            break;
+          }
+          INC_PTR(cs,str, str_end);
         }
 	{
 	  int tmp=my_wildcmp_mb(cs,str,str_end,wildstr,wildend,escape,w_one,
@@ -458,6 +443,97 @@ static void my_hash_sort_mb_bin(CHARSET_INFO *cs __attribute__((unused)),
   }
 }
 
+/*
+** Calculate min_str and max_str that ranges a LIKE string.
+** Arguments:
+** ptr		Pointer to LIKE string.
+** ptr_length	Length of LIKE string.
+** escape	Escape character in LIKE.  (Normally '\').
+**		All escape characters should be removed from min_str and max_str
+** res_length	Length of min_str and max_str.
+** min_str	Smallest case sensitive string that ranges LIKE.
+**		Should be space padded to res_length.
+** max_str	Largest case sensitive string that ranges LIKE.
+**		Normally padded with the biggest character sort value.
+**
+** The function should return 0 if ok and 1 if the LIKE string can't be
+** optimized !
+*/
+
+my_bool my_like_range_mb(CHARSET_INFO *cs,
+			 const char *ptr,uint ptr_length,
+			 pbool escape, pbool w_one, pbool w_many,
+			 uint res_length,
+			 char *min_str,char *max_str,
+			 uint *min_length,uint *max_length)
+{
+  const char *end=ptr+ptr_length;
+  char *min_org=min_str;
+  char *min_end=min_str+res_length;
+  char *max_end=max_str+res_length;
+
+  for (; ptr != end && min_str != min_end ; ptr++)
+  {
+    if (*ptr == escape && ptr+1 != end)
+    {
+      ptr++;					/* Skip escape */
+      *min_str++= *max_str++ = *ptr;
+      continue;
+    }
+    if (*ptr == w_one || *ptr == w_many)	/* '_' and '%' in SQL */
+    {
+      char buf[10];
+      uint buflen;
+      uint charlen= my_charpos(cs, min_org, min_str, res_length/cs->mbmaxlen);
+      
+      if (charlen < (uint) (min_str - min_org))
+        min_str= min_org + charlen;
+      
+      /* Write min key  */
+      *min_length= (uint) (min_str - min_org);
+      *max_length=res_length;
+      do
+      {
+	*min_str++= (char) cs->min_sort_char;
+      } while (min_str != min_end);
+      
+      /* 
+        Write max key: create a buffer with multibyte
+        representation of the max_sort_char character,
+        and copy it into max_str in a loop. 
+      */
+      buflen= cs->cset->wc_mb(cs, cs->max_sort_char, (uchar*) buf,
+                              (uchar*) buf + sizeof(buf));
+      DBUG_ASSERT(buflen > 0);
+      do
+      {
+        if ((max_str + buflen) <= max_end)
+        {
+          /* Enough space for max character */
+          memcpy(max_str, buf, buflen);
+          max_str+= buflen;
+        }
+        else
+        {
+          /* 
+            There is no space for whole multibyte
+            character, then add trailing spaces.
+          */
+          
+	  *max_str++= ' ';
+	}
+      } while (max_str != max_end);
+      return 0;
+    }
+    *min_str++= *max_str++ = *ptr;
+  }
+  *min_length= *max_length = (uint) (min_str - min_org);
+
+  while (min_str != min_end)
+    *min_str++ = *max_str++ = ' ';	/* Because if key compression */
+  return 0;
+}
+
 static int my_wildcmp_mb_bin(CHARSET_INFO *cs,
 		  const char *str,const char *str_end,
 		  const char *wildstr,const char *wildend,
@@ -465,8 +541,6 @@ static int my_wildcmp_mb_bin(CHARSET_INFO *cs,
 {
   int result= -1;				/* Not found, using wildcards */
 
-  bool use_mb_flag=use_mb(cs);
-
   while (wildstr != wildend)
   {
     while (*wildstr != w_many && *wildstr != w_one)
@@ -474,8 +548,7 @@ static int my_wildcmp_mb_bin(CHARSET_INFO *cs,
       int l;
       if (*wildstr == escape && wildstr+1 != wildend)
 	wildstr++;
-      if (use_mb_flag &&
-          (l = my_ismbchar(cs, wildstr, wildend)))
+      if ((l = my_ismbchar(cs, wildstr, wildend)))
       {
 	  if (str+l > str_end || memcmp(str, wildstr, l) != 0)
 	      return 1;
@@ -530,42 +603,31 @@ static int my_wildcmp_mb_bin(CHARSET_INFO *cs,
 	cmp= *++wildstr;
 	
       mb=wildstr;
-      LINT_INIT(mblen);
-      if (use_mb_flag)
-        mblen = my_ismbchar(cs, wildstr, wildend);
+      mblen= my_ismbchar(cs, wildstr, wildend);
       INC_PTR(cs,wildstr,wildend);		/* This is compared trough cmp */
       do
       {
-        if (use_mb_flag)
-	{
-          for (;;)
+        for (;;)
+        {
+          if (str >= str_end)
+            return -1;
+          if (mblen)
           {
-            if (str >= str_end)
-              return -1;
-            if (mblen)
-            {
-              if (str+mblen <= str_end && memcmp(str, mb, mblen) == 0)
-              {
-                str += mblen;
-                break;
-              }
-            }
-            else if (!my_ismbchar(cs, str, str_end) && *str == cmp)
+            if (str+mblen <= str_end && memcmp(str, mb, mblen) == 0)
             {
-              str++;
+              str += mblen;
               break;
             }
-            INC_PTR(cs,str, str_end);
           }
-	}
-        else
-        {
-          while (str != str_end && *str != cmp)
+          else if (!my_ismbchar(cs, str, str_end) && *str == cmp)
+          {
             str++;
-          if (str++ == str_end) return (-1);
+            break;
+          }
+          INC_PTR(cs,str, str_end);
         }
 	{
-	  int tmp=my_wildcmp_mb(cs,str,str_end,wildstr,wildend,escape,w_one,w_many);
+	  int tmp=my_wildcmp_mb_bin(cs,str,str_end,wildstr,wildend,escape,w_one,w_many);
 	  if (tmp <= 0)
 	    return (tmp);
 	}
diff --git a/strings/ctype-simple.c b/strings/ctype-simple.c
index 84bfcb0b171..a019665a235 100644
--- a/strings/ctype-simple.c
+++ b/strings/ctype-simple.c
@@ -27,8 +27,7 @@ int my_strnxfrm_simple(CHARSET_INFO * cs,
                        const uchar *src, uint srclen)
 {
   uchar *map= cs->sort_order;
-  DBUG_ASSERT(len >= srclen);
-  len= min(len,srclen);
+  set_if_smaller(len, srclen);
   if (dest != src)
   {
     const uchar *end;
@@ -1284,6 +1283,12 @@ static my_bool my_coll_init_simple(CHARSET_INFO *cs,
 }
 
 
+longlong my_strtoll10_8bit(CHARSET_INFO *cs __attribute__((unused)),
+                           const char *nptr, char **endptr, int *error)
+{
+  return my_strtoll10(nptr, endptr, error);
+}
+
 
 MY_CHARSET_HANDLER my_charset_8bit_handler=
 {
@@ -1310,6 +1315,7 @@ MY_CHARSET_HANDLER my_charset_8bit_handler=
     my_strntoll_8bit,
     my_strntoull_8bit,
     my_strntod_8bit,
+    my_strtoll10_8bit,
     my_scan_8bit
 };
 
diff --git a/strings/ctype-sjis.c b/strings/ctype-sjis.c
index b4cfee0f24a..4176ff2e538 100644
--- a/strings/ctype-sjis.c
+++ b/strings/ctype-sjis.c
@@ -4534,6 +4534,70 @@ my_mb_wc_sjis(CHARSET_INFO *cs  __attribute__((unused)),
   return 2;
 }
 
+static
+uint my_numcells_sjis(CHARSET_INFO *cs __attribute__((unused)),
+                      const char *str, const char *strend)
+{
+  uint clen= 0;
+  const unsigned char *b= (const unsigned char *) str;
+  const unsigned char *e= (const unsigned char *) strend;
+  
+  for (clen= 0; b < e; )
+  {
+    if (*b >= 0xA1 && *b <= 0xDF)
+    {
+      clen++;
+      b++;
+    }
+    else if (*b > 0x7F)
+    {
+      clen+= 2;
+      b+= 2;
+    }
+    else
+    {
+      clen++;
+      b++;
+    }
+  }
+  return clen;
+}
+
+/*
+  Return the byte length of the well-formed SJIS prefix of [b, e) holding
+  at most "pos" characters; CP932 additional characters are also accepted.
+*/
+static
+uint my_well_formed_len_sjis(CHARSET_INFO *cs __attribute__((unused)),
+                             const char *b, const char *e, uint pos)
+{
+  const char *b0= b;
+  while (pos-- && b < e)         /* consume one character per iteration */
+  {
+    /*
+      Cast to int8 for extra safety.
+      "char" can be unsigned by default
+      on some platforms.
+    */
+    if (((int8)b[0]) >= 0)
+    {
+      /* Single byte character */
+      b+= 1;
+    }
+    else  if (issjishead((uchar)*b) && (e-b)>1 && issjistail((uchar)b[1]))
+    {
+      /* Double byte character */
+      b+= 2;
+    }
+    else
+    {
+      /* Wrong byte sequence */
+      break;
+    }
+  }
+  return b - b0;
+}
+
 
 static MY_COLLATION_HANDLER my_collation_ci_handler =
 {
@@ -4556,9 +4620,9 @@ static MY_CHARSET_HANDLER my_charset_handler=
   mbcharlen_sjis,
   my_numchars_mb,
   my_charpos_mb,
-  my_well_formed_len_mb,
+  my_well_formed_len_sjis,
   my_lengthsp_8bit,
-  my_numcells_mb,
+  my_numcells_sjis,
   my_mb_wc_sjis,	/* mb_wc */
   my_wc_mb_sjis,	/* wc_mb */
   my_caseup_str_8bit,
@@ -4574,6 +4638,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
   my_strntoll_8bit,
   my_strntoull_8bit,
   my_strntod_8bit,
+  my_strtoll10_8bit,
   my_scan_8bit
 };
 
diff --git a/strings/ctype-tis620.c b/strings/ctype-tis620.c
index 420c5b5582e..a2ba4783591 100644
--- a/strings/ctype-tis620.c
+++ b/strings/ctype-tis620.c
@@ -946,6 +946,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
     my_strntoll_8bit,
     my_strntoull_8bit,
     my_strntod_8bit,
+    my_strtoll10_8bit,
     my_scan_8bit
 };
 
diff --git a/strings/ctype-uca.c b/strings/ctype-uca.c
index cecc3be5045..91af7af0c54 100644
--- a/strings/ctype-uca.c
+++ b/strings/ctype-uca.c
@@ -6658,6 +6658,42 @@ static const char roman[]= /* i.e. Classical Latin */
     "& V << u <<< U ";
 
 /*
+  Persian collation support was provided by 
+  Jody McIntyre <mysql@modernduck.com>
+  
+  To: internals@lists.mysql.com
+  Subject: Persian UTF8 collation support
+  Date: 17.08.2004
+  
+  Contraction is not implemented.  Some implementations do perform
+  contraction but others do not, and it is able to sort all my test
+  strings correctly.
+  
+  Jody.
+*/
+static const char persian[]=
+    "& \\u066D < \\u064E < \\uFE76 < \\uFE77 < \\u0650 < \\uFE7A < \\uFE7B"
+             " < \\u064F < \\uFE78 < \\uFE79 < \\u064B < \\uFE70 < \\uFE71"
+             " < \\u064D < \\uFE74 < \\u064C < \\uFE72"
+    "& \\uFE7F < \\u0653 < \\u0654 < \\u0655 < \\u0670"
+    "& \\u0669 < \\u0622 < \\u0627 < \\u0671 < \\u0621 < \\u0623 < \\u0625"
+             " < \\u0624 < \\u0626"
+    "& \\u0642 < \\u06A9 < \\u0643"
+    "& \\u0648 < \\u0647 < \\u0629 < \\u06C0 < \\u06CC < \\u0649 < \\u064A"
+    "& \\uFE80 < \\uFE81 < \\uFE82 < \\uFE8D < \\uFE8E < \\uFB50 < \\uFB51"
+             " < \\uFE80 < \\uFE83 < \\uFE84 < \\uFE87 < \\uFE88 < \\uFE85"
+             " < \\uFE86 < \\u0689 < \\u068A"
+    "& \\uFEAE < \\uFDFC"
+    "& \\uFED8 < \\uFB8E < \\uFB8F < \\uFB90 < \\uFB91 < \\uFED9 < \\uFEDA"
+             " < \\uFEDB < \\uFEDC"
+    "& \\uFEEE < \\uFEE9 < \\uFEEA < \\uFEEB < \\uFEEC < \\uFE93 < \\uFE94"
+             " < \\uFBA4 < \\uFBA5 < \\uFBFC < \\uFBFD < \\uFBFE < \\uFBFF"
+             " < \\uFEEF < \\uFEF0 < \\uFEF1 < \\uFEF2 < \\uFEF3 < \\uFEF4"
+             " < \\uFEF5 < \\uFEF6 < \\uFEF7 < \\uFEF8 < \\uFEF9 < \\uFEFA"
+             " < \\uFEFB < \\uFEFC";
+
+
+/*
   Unicode Collation Algorithm:
   Collation element (weight) scanner, 
   for consequent scan of collations
@@ -6876,7 +6912,8 @@ static int my_uca_scanner_next_any(my_uca_scanner *scanner)
     int mblen;
     
     if (((mblen= scanner->cs->cset->mb_wc(scanner->cs, &wc, 
-                                          scanner->sbeg, scanner->send)) < 0))
+                                          scanner->sbeg,
+                                          scanner->send)) <= 0))
       return -1;
     
     scanner->page= wc >> 8;
@@ -7015,6 +7052,28 @@ static int my_strnncoll_uca(CHARSET_INFO *cs,
   NOTES:
     Works exactly the same with my_strnncoll_uca(),
     but ignores trailing spaces.
+
+    In the while() comparison these situations are possible:
+    1. (s_res>0) and (t_res>0) and (s_res == t_res)
+       Weights are the same so far, continue comparison
+    2. (s_res>0) and (t_res>0) and (s_res!=t_res)
+       A difference has been found, return.
+    3. (s_res>0) and (t_res<0)
+       We have reached the end of the second string, or found
+       an illegal multibyte sequence in the second string.
+       Compare the first string to an infinite array of
+       space characters until difference is found, or until
+       the end of the first string.
+    4. (s_res<0) and (t_res>0)   
+       We have reached the end of the first string, or found
+       an illegal multibyte sequence in the first string.
+       Compare the second string to an infinite array of
+       space characters until difference is found or until
+       the end of the second string.
+    5. (s_res<0) and (t_res<0)
+       Both scanners returned -1. It means we have reached
+       the end-of-string or an illegal sequence in both strings
+       at the same time. Return 0, strings are equal.
   
   RETURN
     Difference between two strings, according to the collation:
@@ -7033,9 +7092,6 @@ static int my_strnncollsp_uca(CHARSET_INFO *cs,
   int s_res;
   int t_res;
   
-  slen= cs->cset->lengthsp(cs, (char*) s, slen);
-  tlen= cs->cset->lengthsp(cs, (char*) t, tlen);
-  
   scanner_handler->init(&sscanner, cs, s, slen);
   scanner_handler->init(&tscanner, cs, t, tlen);
   
@@ -7044,6 +7100,36 @@ static int my_strnncollsp_uca(CHARSET_INFO *cs,
     s_res= scanner_handler->next(&sscanner);
     t_res= scanner_handler->next(&tscanner);
   } while ( s_res == t_res && s_res >0);
+
+  if (s_res > 0 && t_res < 0)
+  { 
+    /* Calculate weight for SPACE character */
+    t_res= cs->sort_order_big[0][0x20 * cs->sort_order[0]];
+      
+    /* compare the first string to spaces */
+    do
+    {
+      if (s_res != t_res)
+        return (s_res - t_res);
+      s_res= scanner_handler->next(&sscanner);
+    } while (s_res > 0);
+    return 0;
+  }
+    
+  if (s_res < 0 && t_res > 0)
+  {
+    /* Calculate weight for SPACE character */
+    s_res= cs->sort_order_big[0][0x20 * cs->sort_order[0]];
+      
+    /* compare the second string to spaces */
+    do
+    {
+      if (s_res != t_res)
+        return (s_res - t_res);
+      t_res= scanner_handler->next(&tscanner);
+    } while (t_res > 0);
+    return 0;
+  }
   
   return ( s_res - t_res );
 }
@@ -7670,7 +7756,7 @@ static int my_coll_rule_parse(MY_COLL_RULE *rule, size_t mitems,
   return (size_t) nitems;
 }
 
-#define MY_MAX_COLL_RULE 64
+#define MY_MAX_COLL_RULE 128
 
 /*
   This function copies an UCS2 collation from
@@ -7918,7 +8004,7 @@ MY_COLLATION_HANDLER my_collation_ucs2_uca_handler =
     my_strnncoll_ucs2_uca,
     my_strnncollsp_ucs2_uca,
     my_strnxfrm_ucs2_uca,
-    my_like_range_simple,
+    my_like_range_ucs2,
     my_wildcmp_uca,
     NULL,
     my_instr_mb,
@@ -8359,6 +8445,35 @@ CHARSET_INFO my_charset_ucs2_roman_uca_ci=
     &my_collation_ucs2_uca_handler
 };
 
+
+CHARSET_INFO my_charset_ucs2_persian_uca_ci=
+{
+    144,0,0,		/* number       */
+    MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE,
+    "ucs2",		/* cs name    */
+    "ucs2_persian_ci",	/* name         */
+    "",			/* comment      */
+    persian,		/* tailoring    */
+    NULL,		/* ctype        */
+    NULL,		/* to_lower     */
+    NULL,		/* to_upper     */
+    NULL,		/* sort_order   */
+    NULL,		/* contractions */
+    NULL,		/* sort_order_big*/
+    NULL,		/* tab_to_uni   */
+    NULL,		/* tab_from_uni */
+    NULL,		/* state_map    */
+    NULL,		/* ident_map    */
+    8,			/* strxfrm_multiply */
+    2,			/* mbminlen     */
+    2,			/* mbmaxlen     */
+    9,			/* min_sort_char */
+    0xFFFF,		/* max_sort_char */
+    &my_charset_ucs2_handler,
+    &my_collation_ucs2_uca_handler
+};
+
+
 #endif
 
 
@@ -8369,7 +8484,7 @@ MY_COLLATION_HANDLER my_collation_any_uca_handler =
     my_strnncoll_any_uca,
     my_strnncollsp_any_uca,
     my_strnxfrm_any_uca,
-    my_like_range_simple,
+    my_like_range_mb,
     my_wildcmp_uca,
     NULL,
     my_instr_mb,
@@ -8837,4 +8952,32 @@ CHARSET_INFO my_charset_utf8_roman_uca_ci=
     &my_charset_utf8_handler,
     &my_collation_any_uca_handler
 };
+
+/*
+  Compiled-in description of the "utf8_persian_ci" collation of the
+  "utf8" character set (id 208). It uses the "persian" tailoring, the
+  utf8 ctype table and the generic ("any") UCA collation handler.
+
+  mbmaxlen is 3, not 2: max_sort_char is 0xFFFF and characters from
+  U+0800 to U+FFFF take three bytes in UTF-8. This also agrees with
+  my_charset_utf8_general_ci, which is declared with mbmaxlen 3.
+*/
+CHARSET_INFO my_charset_utf8_persian_uca_ci=
+{
+    208,0,0,		/* number       */
+    MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE,
+    "utf8",		/* cs name    */
+    "utf8_persian_ci",	/* name         */
+    "",			/* comment      */
+    persian,		/* tailoring    */
+    ctype_utf8,		/* ctype        */
+    NULL,		/* to_lower     */
+    NULL,		/* to_upper     */
+    NULL,		/* sort_order   */
+    NULL,		/* contractions */
+    NULL,		/* sort_order_big*/
+    NULL,		/* tab_to_uni   */
+    NULL,		/* tab_from_uni */
+    NULL,		/* state_map    */
+    NULL,		/* ident_map    */
+    8,			/* strxfrm_multiply */
+    1,			/* mbminlen     */
+    3,			/* mbmaxlen     */
+    9,			/* min_sort_char */
+    0xFFFF,		/* max_sort_char */
+    &my_charset_utf8_handler,
+    &my_collation_any_uca_handler
+};
+
 #endif
diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c
index c6e55ee8f0e..851c2044f47 100644
--- a/strings/ctype-ucs2.c
+++ b/strings/ctype-ucs2.c
@@ -18,6 +18,7 @@
 /* UCS2 support. Written by Alexander Barkov <bar@mysql.com> */
 
 #include <my_global.h>
+#include <my_sys.h>
 #include "m_string.h"
 #include "m_ctype.h"
 #include <errno.h>
@@ -852,7 +853,6 @@ bs:
   return (negative ? -((longlong) res) : (longlong) res);
 }
 
-
 double      my_strntod_ucs2(CHARSET_INFO *cs __attribute__((unused)),
 			   char *nptr, uint length, 
 			   char **endptr, int *err)
@@ -1000,6 +1000,188 @@ cnv:
 }
 
 
+/* Constants used by my_strtoll10_ucs2() below */
+
+/* ulonglong value with all bits set */
+#undef  ULONGLONG_MAX
+#define ULONGLONG_MAX		(~(ulonglong) 0)
+/* 2^63 as ulonglong: largest magnitude accepted for a negative number */
+#define MAX_NEGATIVE_NUMBER	((ulonglong) LL(0x8000000000000000))
+/* Number of digits collected in each of the first two digit groups */
+#define INIT_CNT  9
+/* 10^9, 10^10 and 10^11: used to shift the first digit group */
+#define LFACTOR   ULL(1000000000)
+#define LFACTOR1  ULL(10000000000)
+#define LFACTOR2  ULL(100000000000)
+
+/* lfactor[n] == 10^n for n= 0..8 */
+static unsigned long lfactor[9]=
+{
+  1L, 10L, 100L, 1000L, 10000L, 100000L, 1000000L, 10000000L, 100000000L
+};
+
+
+/*
+  Convert a string of decimal digits stored as UCS2 to a longlong.
+
+  SYNOPSIS
+    my_strtoll10_ucs2()
+    cs                  Character set handler (unused)
+    nptr                Start of the string. Every character takes two
+                        bytes; spaces, tabs, signs and digits are only
+                        recognized when their first byte is 0
+    endptr              In:  *endptr is the end of the string; an odd
+                             trailing byte is ignored. If endptr is NULL
+                             no conversion is done, as 0-terminated UCS2
+                             strings are not supported
+                        Out: first byte after the converted number, or
+                             nptr if there was no number to convert
+    error               Out: 0 on success, -1 if the number had a '-'
+                             sign, MY_ERRNO_ERANGE on overflow,
+                             MY_ERRNO_EDOM if there was no number
+
+  IMPLEMENTATION
+    Leading spaces and tabs and one optional sign are skipped. The
+    digits are then collected in three groups: up to 9 digits in i,
+    up to 9 more digits in j and the last 1 or 2 digits in k.
+
+  RETURN
+    The converted value. On overflow LONGLONG_MIN is returned for a
+    negative number and (longlong) ULONGLONG_MAX otherwise.
+    0 if there was no number to convert.
+*/
+
+longlong my_strtoll10_ucs2(CHARSET_INFO *cs __attribute__((unused)),
+                           const char *nptr, char **endptr, int *error)
+{
+  const char *s, *end, *start, *n_end, *true_end;
+  unsigned char c;
+  unsigned long i, j, k;
+  ulonglong li;
+  int negative;
+  ulong cutoff, cutoff2, cutoff3;
+
+  s= nptr;
+  /* If fixed length string */
+  if (endptr)
+  {
+    /* Make sure string length is even */
+    end= s + ((*endptr - s) / 2) * 2;
+    while (s < end && !s[0] && (s[1] == ' ' || s[1] == '\t'))
+      s+= 2;
+    if (s == end)
+      goto no_conv;
+  }
+  else
+  {
+     /* We don't support null terminated strings in UCS2 */
+     goto no_conv;
+  }
+
+  /* Check for a sign. */
+  negative= 0;
+  if (!s[0] && s[1] == '-')
+  {
+    *error= -1;                                 /* Mark as negative number */
+    negative= 1;
+    s+= 2;
+    if (s == end)
+      goto no_conv;
+    cutoff=  MAX_NEGATIVE_NUMBER / LFACTOR2;
+    cutoff2= (MAX_NEGATIVE_NUMBER % LFACTOR2) / 100;
+    cutoff3=  MAX_NEGATIVE_NUMBER % 100;
+  }
+  else
+  {
+    *error= 0;
+    if (!s[0] && s[1] == '+')
+    {
+      s+= 2;
+      if (s == end)
+        goto no_conv;
+    }
+    cutoff=  ULONGLONG_MAX / LFACTOR2;
+    cutoff2= ULONGLONG_MAX % LFACTOR2 / 100;
+    cutoff3=  ULONGLONG_MAX % 100;
+  }
+
+  /* Handle case where we have a lot of pre-zero */
+  if (!s[0] && s[1] == '0')
+  {
+    i= 0;
+    do
+    {
+      s+= 2;
+      if (s == end)
+        goto end_i;                             /* Return 0 */
+    }
+    while (!s[0] && s[1] == '0');
+    n_end= s + 2 * INIT_CNT;
+  }
+  else
+  {
+    /* Read first digit to check that it's a valid number */
+    if (s[0] || (c= (s[1]-'0')) > 9)
+      goto no_conv;
+    i= c;
+    s+= 2;
+    n_end= s + 2 * (INIT_CNT-1);
+  }
+
+  /* Handle first 9 digits and store them in i */
+  if (n_end > end)
+    n_end= end;
+  for (; s != n_end ; s+= 2)
+  {
+    if (s[0] || (c= (s[1]-'0')) > 9)
+      goto end_i;
+    i= i*10+c;
+  }
+  if (s == end)
+    goto end_i;
+
+  /* Handle next 9 digits and store them in j */
+  j= 0;
+  start= s;                             /* Used to know how much to shift i */
+  n_end= true_end= s + 2 * INIT_CNT;
+  if (n_end > end)
+    n_end= end;
+  do
+  {
+    if (s[0] || (c= (s[1]-'0')) > 9)
+      goto end_i_and_j;
+    j= j*10+c;
+    s+= 2;
+  } while (s != n_end);
+  if (s == end)
+  {
+    if (s != true_end)
+      goto end_i_and_j;
+    goto end3;
+  }
+  if (s[0] || (c= (s[1]-'0')) > 9)
+    goto end3;
+
+  /* Handle the next 1 or 2 digits and store them in k */
+  k=c;
+  s+= 2;
+  if (s == end || s[0] || (c= (s[1]-'0')) > 9)
+    goto end4;
+  k= k*10+c;
+  s+= 2;
+  *endptr= (char*) s;
+
+  /* number string should have ended here */
+  if (s != end && !s[0] && (c= (s[1]-'0')) <= 9)
+    goto overflow;
+
+  /*
+    Check that we didn't get an overflow with the last digit.
+    The three groups are compared most significant first: k only
+    decides when both i and j are exactly at their cutoff values.
+  */
+  if (i > cutoff || (i == cutoff && (j > cutoff2 || (j == cutoff2 &&
+                                                     k > cutoff3))))
+    goto overflow;
+  li=i*LFACTOR2+ (ulonglong) j*100 + k;
+  return (longlong) li;
+
+overflow:                                       /* *endptr is set here */
+  *error= MY_ERRNO_ERANGE;
+  return negative ? LONGLONG_MIN : (longlong) ULONGLONG_MAX;
+
+end_i:
+  *endptr= (char*) s;
+  return (negative ? ((longlong) -(long) i) : (longlong) i);
+
+end_i_and_j:
+  /* j holds (s-start)/2 digits; shift i by that many decimal places */
+  li= (ulonglong) i * lfactor[(uint) (s-start) / 2] + j;
+  *endptr= (char*) s;
+  return (negative ? -((longlong) li) : (longlong) li);
+
+end3:
+  li=(ulonglong) i*LFACTOR+ (ulonglong) j;
+  *endptr= (char*) s;
+  return (negative ? -((longlong) li) : (longlong) li);
+
+end4:
+  li=(ulonglong) i*LFACTOR1+ (ulonglong) j * 10 + k;
+  *endptr= (char*) s;
+  if (negative)
+  {
+   if (li > MAX_NEGATIVE_NUMBER)
+     goto overflow;
+   return -((longlong) li);
+  }
+  return (longlong) li;
+
+no_conv:
+  /* There was no number to convert.  */
+  *error= MY_ERRNO_EDOM;
+  /* endptr is NULL when we get here for a 0-terminated string */
+  if (endptr)
+    *endptr= (char *) nptr;
+  return 0;
+}
+
+
 static
 uint my_numchars_ucs2(CHARSET_INFO *cs __attribute__((unused)),
 		      const char *b, const char *e)
@@ -1049,172 +1231,14 @@ uint my_lengthsp_ucs2(CHARSET_INFO *cs __attribute__((unused)),
 }
 
 
-/*
-** Compare string against string with wildcard
-**	0 if matched
-**	-1 if not matched with wildcard
-**	 1 if matched with wildcard
-*/
-
-static
-int my_wildcmp_ucs2(CHARSET_INFO *cs,
-		    const char *str,const char *str_end,
-		    const char *wildstr,const char *wildend,
-		    int escape, int w_one, int w_many,
-		    MY_UNICASE_INFO **weights)
-{
-  int result= -1;			/* Not found, using wildcards */
-  my_wc_t s_wc, w_wc;
-  int scan, plane;
-  
-  while (wildstr != wildend)
-  {
-    
-    while (1)
-    {
-      scan= my_ucs2_uni(cs,&w_wc, (const uchar*)wildstr,
-			(const uchar*)wildend);
-      if (scan <= 0)
-        return 1;
-      
-      if (w_wc ==  (my_wc_t)escape)
-      {
-        wildstr+= scan;
-        scan= my_ucs2_uni(cs,&w_wc, (const uchar*)wildstr,
-			  (const uchar*)wildend);
-        if (scan <= 0)
-          return 1;
-      }
-      
-      if (w_wc == (my_wc_t)w_many)
-      {
-        result= 1;				/* Found an anchor char */
-        break;
-      }
-      
-      wildstr+= scan;
-      scan= my_ucs2_uni(cs, &s_wc, (const uchar*)str, (const uchar*)str_end);
-      if (scan <=0)
-        return 1;
-      str+= scan;
-      
-      if (w_wc == (my_wc_t)w_one)
-      {
-        result= 1;				/* Found an anchor char */
-      }
-      else
-      {
-        if (weights)
-        {
-          plane=(s_wc>>8) & 0xFF;
-          s_wc = weights[plane] ? weights[plane][s_wc & 0xFF].sort : s_wc;
-          plane=(w_wc>>8) & 0xFF;
-          w_wc = weights[plane] ? weights[plane][w_wc & 0xFF].sort : w_wc;
-        }
-        if (s_wc != w_wc)
-          return 1;				/* No match */
-      }
-      if (wildstr == wildend)
-	return (str != str_end);		/* Match if both are at end */
-    }
-    
-    
-    if (w_wc == (my_wc_t)w_many)
-    {						/* Found w_many */
-    
-      /* Remove any '%' and '_' from the wild search string */
-      for ( ; wildstr != wildend ; )
-      {
-        scan= my_ucs2_uni(cs,&w_wc, (const uchar*)wildstr,
-			  (const uchar*)wildend);
-        if (scan <= 0)
-          return 1;
-        
-	if (w_wc == (my_wc_t)w_many)
-	{
-	  wildstr+= scan;
-	  continue;
-	} 
-	
-	if (w_wc == (my_wc_t)w_one)
-	{
-	  wildstr+= scan;
-	  scan= my_ucs2_uni(cs, &s_wc, (const uchar*)str,
-			    (const uchar*)str_end);
-          if (scan <=0)
-            return 1;
-          str+= scan;
-	  continue;
-	}
-	break;					/* Not a wild character */
-      }
-      
-      if (wildstr == wildend)
-	return 0;				/* Ok if w_many is last */
-      
-      if (str == str_end)
-	return -1;
-      
-      scan= my_ucs2_uni(cs,&w_wc, (const uchar*)wildstr,
-			(const uchar*)wildend);
-      if (scan <= 0)
-        return 1;
-      
-      if (w_wc ==  (my_wc_t)escape)
-      {
-        wildstr+= scan;
-        scan= my_ucs2_uni(cs,&w_wc, (const uchar*)wildstr,
-			  (const uchar*)wildend);
-        if (scan <= 0)
-          return 1;
-      }
-      
-      while (1)
-      {
-        /* Skip until the first character from wildstr is found */
-        while (str != str_end)
-        {
-          scan= my_ucs2_uni(cs,&s_wc, (const uchar*)str,
-			    (const uchar*)str_end);
-          if (scan <= 0)
-            return 1;
-          if (weights)
-          {
-            plane=(s_wc>>8) & 0xFF;
-            s_wc = weights[plane] ? weights[plane][s_wc & 0xFF].sort : s_wc;
-            plane=(w_wc>>8) & 0xFF;
-            w_wc = weights[plane] ? weights[plane][w_wc & 0xFF].sort : w_wc;
-          }
-          
-          if (s_wc == w_wc)
-            break;
-          str+= scan;
-        }
-        if (str == str_end)
-          return -1;
-        
-        result= my_wildcmp_ucs2(cs,str,str_end,wildstr,wildend,escape,
-                                w_one,w_many,weights);
-        
-        if (result <= 0)
-          return result;
-        
-        str+= scan;
-      } 
-    }
-  }
-  return (str != str_end ? 1 : 0);
-}
-
-
 static
 int my_wildcmp_ucs2_ci(CHARSET_INFO *cs,
 		    const char *str,const char *str_end,
 		    const char *wildstr,const char *wildend,
 		    int escape, int w_one, int w_many)
 {
-  return my_wildcmp_ucs2(cs,str,str_end,wildstr,wildend,
-                         escape,w_one,w_many,uni_plane); 
+  /* Wildcard compare through the shared matcher, using uni_plane weights */
+  return my_wildcmp_unicode(cs,str,str_end,wildstr,wildend,
+                            escape,w_one,w_many,uni_plane); 
 }
 
 
@@ -1224,8 +1248,8 @@ int my_wildcmp_ucs2_bin(CHARSET_INFO *cs,
 		    const char *wildstr,const char *wildend,
 		    int escape, int w_one, int w_many)
 {
-  return my_wildcmp_ucs2(cs,str,str_end,wildstr,wildend,
-                         escape,w_one,w_many,NULL); 
+  return my_wildcmp_unicode(cs,str,str_end,wildstr,wildend,
+                            escape,w_one,w_many,NULL); 
 }
 
 
@@ -1345,10 +1369,10 @@ my_bool my_like_range_ucs2(CHARSET_INFO *cs,
     }
     if (ptr[0] == '\0' && ptr[1] == w_one)	/* '_' in SQL */
     {
-      *min_str++= (char) cs->min_sort_char >> 8;
-      *min_str++= (char) cs->min_sort_char & 255;
-      *max_str++= (char) cs->max_sort_char >> 8;
-      *max_str++= (char) cs->max_sort_char & 255;
+      *min_str++= (char) (cs->min_sort_char >> 8);
+      *min_str++= (char) (cs->min_sort_char & 255);
+      *max_str++= (char) (cs->max_sort_char >> 8);
+      *max_str++= (char) (cs->max_sort_char & 255);
       continue;
     }
     if (ptr[0] == '\0' && ptr[1] == w_many)	/* '%' in SQL */
@@ -1358,8 +1382,8 @@ my_bool my_like_range_ucs2(CHARSET_INFO *cs,
       do {
         *min_str++ = 0;
 	*min_str++ = 0;
-	*max_str++ = (char) cs->max_sort_char >>8;
-	*max_str++ = (char) cs->max_sort_char & 255;
+	*max_str++ = (char) (cs->max_sort_char >> 8);
+	*max_str++ = (char) (cs->max_sort_char & 255);
       } while (min_str + 1 < min_end);
       return 0;
     }
@@ -1439,6 +1463,7 @@ MY_CHARSET_HANDLER my_charset_ucs2_handler=
     my_strntoll_ucs2,
     my_strntoull_ucs2,
     my_strntod_ucs2,
+    my_strtoll10_ucs2,
     my_scan_8bit
 };
 
diff --git a/strings/ctype-ujis.c b/strings/ctype-ujis.c
index 37c26a3bbc4..94673a20795 100644
--- a/strings/ctype-ujis.c
+++ b/strings/ctype-ujis.c
@@ -8252,6 +8252,40 @@ my_jisx0212_uni_onechar(int code){
   [xA1-xFE][xA1-xFE]		# JIS X 0208:1997 (two bytes/char)
 */
 
+/*
+  Count the display cells occupied by the string [str, strend).
+
+  The first byte of every character decides how many bytes are skipped
+  and how many cells are counted:
+    0x8E                  2 bytes, 1 cell
+    0x8F                  3 bytes, 2 cells
+    other byte >= 0x80    2 bytes, 2 cells
+    byte < 0x80           1 byte,  1 cell
+  The following bytes of a character are not examined.
+*/
+static
+uint my_numcells_eucjp(CHARSET_INFO *cs __attribute__((unused)),
+                       const char *str, const char *strend)
+{
+  const unsigned char *pos= (const unsigned char *) str;
+  const unsigned char *limit= (const unsigned char *) strend;
+  uint cells= 0;
+
+  while (pos < limit)
+  {
+    uint width, nbytes;
+
+    switch (*pos) {
+    case 0x8E:
+      width= 1;
+      nbytes= 2;
+      break;
+    case 0x8F:
+      width= 2;
+      nbytes= 3;
+      break;
+    default:
+      /* Any other byte with the high bit set starts a 2 byte character */
+      width= nbytes= (*pos & 0x80) ? 2 : 1;
+      break;
+    }
+    cells+= width;
+    pos+= nbytes;
+  }
+  return cells;
+}
+
 static int
 my_mb_wc_euc_jp(CHARSET_INFO *cs,my_wc_t *pwc, const uchar *s, const uchar *e)
 {
@@ -8443,7 +8477,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
     my_charpos_mb,
     my_well_formed_len_mb,
     my_lengthsp_8bit,
-    my_numcells_mb,
+    my_numcells_eucjp,
     my_mb_wc_euc_jp,	/* mb_wc       */
     my_wc_mb_euc_jp,	/* wc_mb       */
     my_caseup_str_mb,
@@ -8459,6 +8493,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
     my_strntoll_8bit,
     my_strntoull_8bit,
     my_strntod_8bit,
+    my_strtoll10_8bit,
     my_scan_8bit
 };
 
diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c
index 5e339725b1a..b3097649158 100644
--- a/strings/ctype-utf8.c
+++ b/strings/ctype-utf8.c
@@ -1518,6 +1518,161 @@ MY_UNICASE_INFO *uni_plane[256]={
 
 };
 
+
+/*
+** Compare string against string with wildcard
+** This function is used in UTF8 and UCS2
+**
+** Characters of both strings are decoded with cs->cset->mb_wc.
+** 'escape', 'w_one' and 'w_many' are compared against the decoded
+** characters of wildstr. If 'weights' is not NULL, characters are
+** compared by weights[high byte][low byte].sort when the plane for the
+** high byte exists, otherwise by their code.
+**
+**	 0 if matched
+**	-1 if not matched: str ended while characters were still needed
+**	   after a w_many wildcard
+**	 1 if not matched for any other reason, including a character
+**	   that mb_wc could not decode
+*/
+
+int my_wildcmp_unicode(CHARSET_INFO *cs,
+		       const char *str,const char *str_end,
+		       const char *wildstr,const char *wildend,
+		       int escape, int w_one, int w_many,
+		       MY_UNICASE_INFO **weights)
+{
+  int result= -1;			/* Not found, using wildcards */
+  my_wc_t s_wc, w_wc;
+  int scan, plane;
+  int (*mb_wc)(struct charset_info_st *cs, my_wc_t *wc,
+               const unsigned char *s,const unsigned char *e);
+  mb_wc= cs->cset->mb_wc;
+  
+  while (wildstr != wildend)
+  {
+    /* Match character by character until a w_many is found */
+    while (1)
+    {
+      if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr,
+                       (const uchar*)wildend)) <= 0)
+        return 1;
+      
+      if (w_wc ==  (my_wc_t)escape)
+      {
+        wildstr+= scan;
+        if ((scan= mb_wc(cs,&w_wc, (const uchar*)wildstr,
+                         (const uchar*)wildend)) <= 0)
+          return 1;
+      }
+      
+      if (w_wc == (my_wc_t)w_many)
+      {
+        result= 1;				/* Found an anchor char */
+        break;
+      }
+      
+      wildstr+= scan;
+      if ((scan= mb_wc(cs, &s_wc, (const uchar*)str,
+                       (const uchar*)str_end)) <=0)
+        return 1;
+      str+= scan;
+      
+      if (w_wc == (my_wc_t)w_one)
+      {
+        result= 1;				/* Found an anchor char */
+      }
+      else
+      {
+        if (weights)
+        {
+          plane=(s_wc>>8) & 0xFF;
+          s_wc = weights[plane] ? weights[plane][s_wc & 0xFF].sort : s_wc;
+          plane=(w_wc>>8) & 0xFF;
+          w_wc = weights[plane] ? weights[plane][w_wc & 0xFF].sort : w_wc;
+        }
+        if (s_wc != w_wc)
+          return 1;				/* No match */
+      }
+      if (wildstr == wildend)
+	return (str != str_end);		/* Match if both are at end */
+    }
+    
+    
+    if (w_wc == (my_wc_t)w_many)
+    {						/* Found w_many */
+    
+      /* Remove any '%' and '_' from the wild search string */
+      for ( ; wildstr != wildend ; )
+      {
+        if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr,
+                         (const uchar*)wildend)) <= 0)
+          return 1;
+        
+	if (w_wc == (my_wc_t)w_many)
+	{
+	  wildstr+= scan;
+	  continue;
+	} 
+	
+	if (w_wc == (my_wc_t)w_one)
+	{
+	  wildstr+= scan;
+          if ((scan= mb_wc(cs, &s_wc, (const uchar*)str,
+                           (const uchar*)str_end)) <=0)
+            return 1;
+          str+= scan;
+	  continue;
+	}
+	break;					/* Not a wild character */
+      }
+      
+      if (wildstr == wildend)
+	return 0;				/* Ok if w_many is last */
+      
+      if (str == str_end)
+	return -1;
+      
+      /* w_wc becomes the first character that must follow the w_many */
+      if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr,
+                       (const uchar*)wildend)) <=0)
+        return 1;
+      
+      if (w_wc ==  (my_wc_t)escape)
+      {
+        wildstr+= scan;
+        if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr,
+                         (const uchar*)wildend)) <=0)
+          return 1;
+      }
+      
+      while (1)
+      {
+        /* Skip until the first character from wildstr is found */
+        while (str != str_end)
+        {
+          if ((scan= mb_wc(cs, &s_wc, (const uchar*)str,
+                           (const uchar*)str_end)) <=0)
+            return 1;
+          if (weights)
+          {
+            plane=(s_wc>>8) & 0xFF;
+            s_wc = weights[plane] ? weights[plane][s_wc & 0xFF].sort : s_wc;
+            plane=(w_wc>>8) & 0xFF;
+            w_wc = weights[plane] ? weights[plane][w_wc & 0xFF].sort : w_wc;
+          }
+          
+          if (s_wc == w_wc)
+            break;
+          str+= scan;
+        }
+        if (str == str_end)
+          return -1;
+        
+        /* Try to match the rest of the pattern from this position */
+        result= my_wildcmp_unicode(cs, str, str_end, wildstr, wildend,
+                                   escape, w_one, w_many,
+                                   weights);
+        
+        if (result <= 0)
+          return result;
+        
+        str+= scan;
+      } 
+    }
+  }
+  return (str != str_end ? 1 : 0);
+}
+
 #endif
 
 
@@ -1948,50 +2103,120 @@ static int my_strnncollsp_utf8(CHARSET_INFO *cs,
 }
 
 
-static int my_strncasecmp_utf8(CHARSET_INFO *cs,
-                const char *s, const char *t,  uint len)
-{
-  int s_res,t_res;
-  my_wc_t s_wc,t_wc;
-  const char *se=s+len;
-  const char *te=t+len;
+/*
+  Compare 0-terminated UTF8 strings.
 
-  while ( s < se && t < te )
-  {
-    int plane;
+  SYNOPSIS
+    my_strcasecmp_utf8()
+    cs                  character set handler
+    s                   First 0-terminated string to compare
+    t                   Second 0-terminated string to compare
 
-    s_res=my_utf8_uni(cs,&s_wc, (const uchar*)s, (const uchar*)se);
-    t_res=my_utf8_uni(cs,&t_wc, (const uchar*)t, (const uchar*)te);
+  IMPLEMENTATION
+    Both strings are scanned one character at a time. Every character
+    is converted to its "tolower" value (plane00 for single byte
+    characters, uni_plane for multibyte ones) and these values are
+    compared. If a wrong multibyte sequence is found in either string,
+    the rest is compared with strcmp(), starting in both strings at the
+    character position where the wrong sequence was found.
 
-    if ( s_res <= 0 || t_res <= 0 )
+  RETURN
+    - negative number if s < t
+    - positive number if s > t
+    - 0 if the strings are equal
+*/
+
+static
+int my_strcasecmp_utf8(CHARSET_INFO *cs, const char *s, const char *t)
+{
+  while (s[0] && t[0])
+  {
+    my_wc_t s_wc,t_wc;
+    /* Start of the current character of s, for the strcmp() fallback */
+    const char *s_start= s;
+    
+    /*
+      Cast to int8 for extra safety.
+      char can be unsigned by default
+      on some platforms.
+    */
+    if (((int8)s[0]) >= 0)
     {
-      /* Incorrect string, compare byte by byte value */
-      return bincmp(s, se, t, te);
+      /* 
+        s[0] is between 0 and 127.
+        It represents a single byte character.
+        Convert it into weight according to collation.
+      */
+      s_wc= plane00[(uchar) s[0]].tolower;
+      s++;
     }
-
-    plane=(s_wc>>8) & 0xFF;
-    s_wc = uni_plane[plane] ? uni_plane[plane][s_wc & 0xFF].tolower : s_wc;
-
-    plane=(t_wc>>8) & 0xFF;
-    t_wc = uni_plane[plane] ? uni_plane[plane][t_wc & 0xFF].tolower : t_wc;
-
+    else
+    {
+      int plane, res;
+      
+      /*
+        Scan a multibyte character.
+
+        In the future it is worth to write a special version of my_utf8_uni()
+        for 0-terminated strings which will not take in account length. Now
+        we call the regular version of my_utf8_uni() with s+3 in the
+        last argument. s+3 is enough to scan any multibyte sequence.
+
+        Calling the regular version of my_utf8_uni is safe for 0-terminated
+        strings: we will never lose the end of the string:
+        If we have 0 character in the middle of a multibyte sequence,
+        then my_utf8_uni will always return a negative number, so the
+        loop will finish.
+      */
+      
+      res= my_utf8_uni(cs,&s_wc, (const uchar*)s, (const uchar*) s + 3);
+      
+      /* 
+         In the case of wrong multibyte sequence we will
+         call strcmp() for byte-to-byte comparison.
+      */
+      if (res <= 0)
+        return strcmp(s, t);
+      s+= res;
+      
+      /* Convert Unicode code into weight according to collation */
+      plane=(s_wc>>8) & 0xFF;
+      s_wc = uni_plane[plane] ? uni_plane[plane][s_wc & 0xFF].tolower : s_wc;
+    }
+    
+    
+    /* Do the same for the second string */
+    
+    if (((int8)t[0]) >= 0)
+    {
+      /* Convert single byte character into weight */
+      t_wc= plane00[(uchar) t[0]].tolower;
+      t++;
+    }
+    else
+    {
+      int plane;
+      int res=my_utf8_uni(cs,&t_wc, (const uchar*)t, (const uchar*) t + 3);
+      /*
+        s has already been moved past its current character, so the
+        byte-to-byte comparison must start from s_start to compare
+        the same character position in both strings.
+      */
+      if (res <= 0)
+        return strcmp(s_start, t);
+      t+= res;
+      
+      /* Convert code into weight */
+      plane=(t_wc>>8) & 0xFF;
+      t_wc = uni_plane[plane] ? uni_plane[plane][t_wc & 0xFF].tolower : t_wc;
+    }
+    
+    /* Now we have two weights, let's compare them */
     if ( s_wc != t_wc )
       return  ((int) s_wc) - ((int) t_wc);
-
-    s+=s_res;
-    t+=t_res;
   }
-  return ( (se-s) - (te-t) );
+  return ((int)(uchar)s[0]) - ((int) (uchar) t[0]);
 }
 
-static int my_strcasecmp_utf8(CHARSET_INFO *cs, const char *s, const char *t)
+
+static
+int my_wildcmp_utf8(CHARSET_INFO *cs,
+		    const char *str,const char *str_end,
+		    const char *wildstr,const char *wildend,
+		    int escape, int w_one, int w_many)
 {
-  uint s_len=strlen(s);
-  uint t_len=strlen(t);
-  uint len = (s_len > t_len) ? s_len : t_len;
-  return  my_strncasecmp_utf8(cs, s, t, len);
+  /* Wildcard compare through the shared matcher, using uni_plane weights */
+  return my_wildcmp_unicode(cs,str,str_end,wildstr,wildend,
+                            escape,w_one,w_many,uni_plane); 
 }
 
+
 static int my_strnxfrm_utf8(CHARSET_INFO *cs,
                             uchar *dst, uint dstlen,
                             const uchar *src, uint srclen)
@@ -2059,8 +2284,8 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
     my_strnncoll_utf8,
     my_strnncollsp_utf8,
     my_strnxfrm_utf8,
-    my_like_range_simple,
-    my_wildcmp_mb,
+    my_like_range_mb,
+    my_wildcmp_utf8,
     my_strcasecmp_utf8,
     my_instr_mb,
     my_hash_sort_utf8
@@ -2091,6 +2316,7 @@ MY_CHARSET_HANDLER my_charset_utf8_handler=
     my_strntoll_8bit,
     my_strntoull_8bit,
     my_strntod_8bit,
+    my_strtoll10_8bit,
     my_scan_8bit
 };
 
@@ -2118,7 +2344,7 @@ CHARSET_INFO my_charset_utf8_general_ci=
     1,                  /* mbminlen     */
     3,                  /* mbmaxlen     */
     0,                  /* min_sort_char */
-    255,                /* max_sort_char */
+    0xFFFF,             /* max_sort_char */
     &my_charset_utf8_handler,
     &my_collation_ci_handler
 };
diff --git a/strings/xml.c b/strings/xml.c
index 7d7839e1603..6ba52ea41a8 100644
--- a/strings/xml.c
+++ b/strings/xml.c
@@ -81,10 +81,11 @@ static int my_xml_scan(MY_XML_PARSER *p,MY_XML_ATTR *a)
   a->beg=p->cur;
   a->end=p->cur;
   
-  if (!memcmp(p->cur,"<!--",4))
+  if (!bcmp(p->cur,"<!--",4))
   {
-    for( ; (p->cur < p->end) && memcmp(p->cur, "-->", 3); p->cur++);
-    if(!memcmp(p->cur, "-->", 3))
+    for( ; (p->cur < p->end) && bcmp(p->cur, "-->", 3); p->cur++)
+    {}
+    if (!bcmp(p->cur, "-->", 3))
       p->cur+=3;
     a->end=p->cur;
     lex=MY_XML_COMMENT;
author	unknown <monty@mysql.com>	2004-10-29 19:26:52 +0300
committer	unknown <monty@mysql.com>	2004-10-29 19:26:52 +0300
commit	f095274fe8c3d3394d6c0ce0a68f4bea04311999 (patch)
tree	23bcc9a71fe7237887a111b158e30f5a6bb665d3 /strings
parent	f41bba8c6156a7adf4c67dfa75e16112767a5d3c (diff)
parent	5be6c328f5a9f78f37176bbbd88a538fa3b65fe9 (diff)
download	mariadb-git-f095274fe8c3d3394d6c0ce0a68f4bea04311999.tar.gz