MDEV-6566 Different INSERT behaviour on bad bytes with and without character set conversion

author: Alexander Barkov <bar@mariadb.org> 2015-03-13 16:51:36 +0400
committer: Alexander Barkov <bar@mariadb.org> 2015-03-13 16:51:36 +0400
commit: 197afb413fcc9f06b5e5e6ef41ce980d108b354f (patch)
tree: 7052fbaa1bf1af1c5c849e8fda4a3a790af09b25 /strings
parent: 702fba1511c90ea9c72b6c00122e0f31a05237b4 (diff)
download: mariadb-git-197afb413fcc9f06b5e5e6ef41ce980d108b354f.tar.gz
16 files changed, 569 insertions, 103 deletions
diff --git a/strings/ctype-big5.c b/strings/ctype-big5.c
index d631bd0a34e..eda81c0c4d3 100644
--- a/strings/ctype-big5.c
+++ b/strings/ctype-big5.c
@@ -50,7 +50,7 @@
 
 #define MY_FUNCTION_NAME(x)   my_ ## x ## _big5
 #define IS_MB2_CHAR(x,y)      (isbig5head(x) && isbig5tail(y))
-#define WELL_FORMED_LEN
+#define DEFINE_ASIAN_ROUTINES
 #include "ctype-mb.ic"
 
 
@@ -6843,6 +6843,9 @@ my_mb_wc_big5(CHARSET_INFO *cs __attribute__((unused)),
   if (s+2>e)
     return MY_CS_TOOSMALL2;
 
+  if (!IS_MB2_CHAR(hi, s[1]))
+    return MY_CS_ILSEQ;
+
   if (!(pwc[0]=func_big5_uni_onechar((hi<<8)+s[1])))
     return -2;
   
@@ -6894,7 +6897,9 @@ static MY_CHARSET_HANDLER my_charset_big5_handler=
   my_strtoll10_8bit,
   my_strntoull10rnd_8bit,
   my_scan_8bit,
-  my_copy_abort_mb,
+  my_charlen_big5,
+  my_well_formed_char_length_big5,
+  my_copy_fix_mb,
 };
 
 struct charset_info_st my_charset_big5_chinese_ci=
diff --git a/strings/ctype-bin.c b/strings/ctype-bin.c
index 6b53b34159a..95f31038ee6 100644
--- a/strings/ctype-bin.c
+++ b/strings/ctype-bin.c
@@ -549,6 +549,8 @@ static MY_CHARSET_HANDLER my_charset_handler=
   my_strtoll10_8bit,
   my_strntoull10rnd_8bit,
   my_scan_8bit,
+  my_charlen_8bit,
+  my_well_formed_char_length_8bit,
   my_copy_8bit,
 };
 
diff --git a/strings/ctype-cp932.c b/strings/ctype-cp932.c
index 13129a6a874..2e26a98bf05 100644
--- a/strings/ctype-cp932.c
+++ b/strings/ctype-cp932.c
@@ -186,7 +186,7 @@ static const uchar sort_order_cp932[]=
 #define MY_FUNCTION_NAME(x)   my_ ## x ## _cp932
 #define IS_8BIT_CHAR(x)       iscp932kata(x)
 #define IS_MB2_CHAR(x,y)      (iscp932head(x) && iscp932tail(y))
-#define WELL_FORMED_LEN
+#define DEFINE_ASIAN_ROUTINES
 #include "ctype-mb.ic"
 
 
@@ -34765,7 +34765,9 @@ static MY_CHARSET_HANDLER my_charset_handler=
   my_strtoll10_8bit,
   my_strntoull10rnd_8bit,
   my_scan_8bit,
-  my_copy_abort_mb,
+  my_charlen_cp932,
+  my_well_formed_char_length_cp932,
+  my_copy_fix_mb,
 };
 
 
diff --git a/strings/ctype-euc_kr.c b/strings/ctype-euc_kr.c
index eab9539ad45..a2c95bf77c8 100644
--- a/strings/ctype-euc_kr.c
+++ b/strings/ctype-euc_kr.c
@@ -204,7 +204,7 @@ static const uchar sort_order_euc_kr[]=
 
 #define MY_FUNCTION_NAME(x)   my_ ## x ## _euckr
 #define IS_MB2_CHAR(x,y)      (iseuc_kr_head(x) && iseuc_kr_tail(y))
-#define WELL_FORMED_LEN
+#define DEFINE_ASIAN_ROUTINES
 #include "ctype-mb.ic"
 
 
@@ -9928,6 +9928,9 @@ my_mb_wc_euc_kr(CHARSET_INFO *cs __attribute__((unused)),
   if (s+2>e)
     return MY_CS_TOOSMALL2;
   
+  if (!IS_MB2_CHAR(hi, s[1]))
+    return MY_CS_ILSEQ;
+  
   if (!(pwc[0]=func_ksc5601_uni_onechar((hi<<8)+s[1])))
     return -2;
   
@@ -9979,7 +9982,9 @@ static MY_CHARSET_HANDLER my_charset_handler=
   my_strtoll10_8bit,
   my_strntoull10rnd_8bit,
   my_scan_8bit,
-  my_copy_abort_mb,
+  my_charlen_euckr,
+  my_well_formed_char_length_euckr,
+  my_copy_fix_mb,
 };
 
 
diff --git a/strings/ctype-eucjpms.c b/strings/ctype-eucjpms.c
index 52873c2f87e..827feda927b 100644
--- a/strings/ctype-eucjpms.c
+++ b/strings/ctype-eucjpms.c
@@ -198,7 +198,7 @@ static const uchar sort_order_eucjpms[]=
 #define IS_MB2_KATA(x,y)      (iseucjpms_ss2(x) && iskata(y))
 #define IS_MB2_CHAR(x,y)      (IS_MB2_KATA(x,y) || IS_MB2_JIS(x,y))
 #define IS_MB3_CHAR(x,y,z)    (iseucjpms_ss3(x) && IS_MB2_JIS(y,z))
-#define WELL_FORMED_LEN
+#define DEFINE_ASIAN_ROUTINES
 #include "ctype-mb.ic"
 
 
@@ -67511,7 +67511,9 @@ static MY_CHARSET_HANDLER my_charset_handler=
     my_strtoll10_8bit,
     my_strntoull10rnd_8bit,
     my_scan_8bit,
-    my_copy_abort_mb,
+    my_charlen_eucjpms,
+    my_well_formed_char_length_eucjpms,
+    my_copy_fix_mb,
 };
 
 
diff --git a/strings/ctype-gb2312.c b/strings/ctype-gb2312.c
index a4268b8fd68..129e8edb966 100644
--- a/strings/ctype-gb2312.c
+++ b/strings/ctype-gb2312.c
@@ -167,7 +167,7 @@ static const uchar sort_order_gb2312[]=
 
 #define MY_FUNCTION_NAME(x)   my_ ## x ## _gb2312
 #define IS_MB2_CHAR(x,y)      (isgb2312head(x) && isgb2312tail(y))
-#define WELL_FORMED_LEN
+#define DEFINE_ASIAN_ROUTINES
 #include "ctype-mb.ic"
 
 
@@ -6330,7 +6330,10 @@ my_mb_wc_gb2312(CHARSET_INFO *cs  __attribute__((unused)),
   
   if (s+2>e)
     return MY_CS_TOOSMALL2;
-  
+
+  if (!IS_MB2_CHAR(hi, s[1]))  
+    return MY_CS_ILSEQ;
+
   if (!(pwc[0]=func_gb2312_uni_onechar(((hi<<8)+s[1])&0x7F7F)))
     return -2;
   
@@ -6382,7 +6385,9 @@ static MY_CHARSET_HANDLER my_charset_handler=
   my_strtoll10_8bit,
   my_strntoull10rnd_8bit,
   my_scan_8bit,
-  my_copy_abort_mb,
+  my_charlen_gb2312,
+  my_well_formed_char_length_gb2312,
+  my_copy_fix_mb,
 };
 
 
diff --git a/strings/ctype-gbk.c b/strings/ctype-gbk.c
index 392fdb487b6..b3bd1efb6c4 100644
--- a/strings/ctype-gbk.c
+++ b/strings/ctype-gbk.c
@@ -45,7 +45,7 @@
 
 #define MY_FUNCTION_NAME(x)   my_ ## x ## _gbk
 #define IS_MB2_CHAR(x,y)      (isgbkhead(x) && isgbktail(y))
-#define WELL_FORMED_LEN
+#define DEFINE_ASIAN_ROUTINES
 #include "ctype-mb.ic"
 
 
@@ -10724,6 +10724,9 @@ my_mb_wc_gbk(CHARSET_INFO *cs __attribute__((unused)),
   if (s+2>e)
     return MY_CS_TOOSMALL2;
     
+  if (!IS_MB2_CHAR(hi, s[1]))
+    return MY_CS_ILSEQ;
+  
   if (!(pwc[0]=func_gbk_uni_onechar( (hi<<8) + s[1])))
     return -2;
   
@@ -10776,7 +10779,9 @@ static MY_CHARSET_HANDLER my_charset_handler=
   my_strtoll10_8bit,
   my_strntoull10rnd_8bit,
   my_scan_8bit,
-  my_copy_abort_mb,
+  my_charlen_gbk,
+  my_well_formed_char_length_gbk,
+  my_copy_fix_mb,
 };
 
 
diff --git a/strings/ctype-latin1.c b/strings/ctype-latin1.c
index 099f03460ce..bc51911dceb 100644
--- a/strings/ctype-latin1.c
+++ b/strings/ctype-latin1.c
@@ -422,6 +422,8 @@ static MY_CHARSET_HANDLER my_charset_handler=
     my_strtoll10_8bit,
     my_strntoull10rnd_8bit,
     my_scan_8bit,
+    my_charlen_8bit,
+    my_well_formed_char_length_8bit,
     my_copy_8bit,
 };
 
diff --git a/strings/ctype-mb.c b/strings/ctype-mb.c
index fc41563324a..5947c3d4f4a 100644
--- a/strings/ctype-mb.c
+++ b/strings/ctype-mb.c
@@ -424,25 +424,95 @@ size_t my_well_formed_len_mb(CHARSET_INFO *cs, const char *b, const char *e,
 
 
 /*
-  Copy a multi-byte string. Abort if a bad byte sequence was found.
-  Note more than "nchars" characters are copied.
+  Append a badly formed piece of string.
+  Bad bytes are fixed to '?'.
+  
+  @param to        The destination string
+  @param to_end    The end of the destination string
+  @param from      The source string
+  @param from_end  The end of the source string
+  @param nchars    Write not more than "nchars" characters.
+  @param status    Copying status, must be previously initialized,
+                   e.g. using well_formed_char_length() on the original
+                   full source string.
 */
+static size_t
+my_append_fix_badly_formed_tail(CHARSET_INFO *cs,
+                                char *to, char *to_end,
+                                const char *from, const char *from_end,
+                                size_t nchars,
+                                MY_STRCOPY_STATUS *status)
+{
+  char *to0= to;
+
+  for ( ; nchars; nchars--)
+  {
+    int chlen;
+    if ((chlen= cs->cset->charlen(cs, (const uchar*) from,
+                                      (const uchar *) from_end)) > 0)
+    {
+      /* Found a valid character */         /* chlen == 1..MBMAXLEN  */
+      DBUG_ASSERT(chlen <= (int) cs->mbmaxlen);
+      if (to + chlen > to_end)
+        goto end;                           /* Does not fit to "to" */
+      memcpy(to, from, (size_t) chlen);
+      from+= chlen;
+      to+= chlen;
+      continue;
+    }
+    if (chlen == MY_CS_ILSEQ)              /* chlen == 0 */
+    {
+      DBUG_ASSERT(from < from_end);  /* Shouldn't get MY_CS_ILSEQ if empty */
+      goto bad;
+    }
+    /* Got an incomplete character */       /* chlen == MY_CS_TOOSMALLXXX  */
+    DBUG_ASSERT(chlen >= MY_CS_TOOSMALL6); 
+    DBUG_ASSERT(chlen <= MY_CS_TOOSMALL);
+    if (from >= from_end)                   
+      break;                                /* End of the source string    */
+bad:
+    /* Bad byte sequence, or incomplete character found */
+    if (!status->m_well_formed_error_pos)
+      status->m_well_formed_error_pos= from;
+
+    if ((chlen= cs->cset->wc_mb(cs, '?', (uchar*) to, (uchar *) to_end)) <= 0)
+      break; /* Question mark does not fit into the destination */
+    to+= chlen;
+    from++;
+  }
+end:
+  status->m_source_end_pos= from;
+  return to - to0;
+}
+
+
 size_t
-my_copy_abort_mb(CHARSET_INFO *cs,
-                 char *dst, size_t dst_length,
-                 const char *src, size_t src_length,
-                 size_t nchars, MY_STRCOPY_STATUS *status)
+my_copy_fix_mb(CHARSET_INFO *cs,
+               char *dst, size_t dst_length,
+               const char *src, size_t src_length,
+               size_t nchars, MY_STRCOPY_STATUS *status)
 {
-  int well_formed_error;
-  size_t res;
+  size_t well_formed_nchars;
+  size_t well_formed_length;
+  size_t fixed_length;
 
   set_if_smaller(src_length, dst_length);
-  res= cs->cset->well_formed_len(cs, src, src + src_length,
-                                 nchars, &well_formed_error);
-  memmove(dst, src, res);
-  status->m_source_end_pos= src + res;
-  status->m_well_formed_error_pos= well_formed_error ? src + res : NULL;
-  return res;
+  well_formed_nchars= cs->cset->well_formed_char_length(cs,
+                                                        src, src + src_length,
+                                                        nchars, status);
+  DBUG_ASSERT(well_formed_nchars <= nchars);
+  memmove(dst, src, (well_formed_length= status->m_source_end_pos - src));
+  if (!status->m_well_formed_error_pos)
+    return well_formed_length;
+
+  fixed_length= my_append_fix_badly_formed_tail(cs,
+                                                dst + well_formed_length,
+                                                dst + dst_length,
+                                                src + well_formed_length,
+                                                src + src_length,
+                                                nchars - well_formed_nchars,
+                                                status);
+  return well_formed_length + fixed_length;
 }
 
 
diff --git a/strings/ctype-mb.ic b/strings/ctype-mb.ic
index 70cc89c9af0..55094535d5e 100644
--- a/strings/ctype-mb.ic
+++ b/strings/ctype-mb.ic
@@ -29,7 +29,70 @@
 #endif
 
 
-#ifdef WELL_FORMED_LEN
+#ifdef DEFINE_ASIAN_ROUTINES
+#define DEFINE_WELL_FORMED_LEN
+#define DEFINE_WELL_FORMED_CHAR_LENGTH
+#define DEFINE_CHARLEN
+#endif
+
+
+#ifdef DEFINE_CHARLEN
+/**
+  Returns length of the left-most character of a string.
+  @param cs - charset with mbminlen==1 and mbmaxlen<=4
+  @param b  - the beginning of the string
+  @param e  - the end of the string
+
+  @return   MY_CS_ILSEQ         if a bad byte sequence was found
+  @return   MY_CS_TOOSMALL(N)   if the string ended unexpectedly
+  @return   >0                  if a valid character was found
+*/
+static int
+MY_FUNCTION_NAME(charlen)(CHARSET_INFO *cs __attribute__((unused)),
+                          const uchar *b, const uchar *e)
+{
+  DBUG_ASSERT(cs->mbminlen == 1);
+  DBUG_ASSERT(cs->mbmaxlen <= 4);
+
+  if (b >= e)
+    return MY_CS_TOOSMALL;
+  if ((uchar) b[0] < 128)
+    return 1; /* Single byte ASCII character */
+
+#ifdef IS_8BIT_CHAR
+  if (IS_8BIT_CHAR(b[0]))
+  {      
+    /* Single byte non-ASCII character, e.g. half width kana in sjis */
+    return 1;
+  }
+#endif
+
+  if (b + 2 > e)
+    return MY_CS_TOOSMALLN(2);
+  if (IS_MB2_CHAR(b[0], b[1]))
+    return 2; /* Double byte character */
+
+#ifdef IS_MB3_CHAR
+  if (b + 3 > e)
+    return MY_CS_TOOSMALLN(3);
+  if (IS_MB3_CHAR(b[0], b[1], b[2]))
+    return 3; /* Three-byte character */
+#endif
+
+#ifdef IS_MB4_CHAR
+  if (b + 4 > e)
+    return MY_CS_TOOSMALLN(4);
+  if (IS_MB4_CHAR(b[0], b[1], b[2], b[3]))
+    return 4; /* Four-byte character */
+#endif
+
+  /* Wrong byte sequence */
+  return MY_CS_ILSEQ;
+}
+#endif /* DEFINE_CHARLEN */
+
+
+#ifdef DEFINE_WELL_FORMED_LEN
 /**
   Returns well formed length of a character string with
   variable character length for character sets with:
@@ -91,4 +154,105 @@ MY_FUNCTION_NAME(well_formed_len)(CHARSET_INFO *cs __attribute__((unused)),
   return b - b0;
 }
 
-#endif /* WELL_FORMED_LEN */
+#endif /* DEFINE_WELL_FORMED_LEN */
+
+
+
+#ifdef DEFINE_WELL_FORMED_CHAR_LENGTH
+/**
+  Returns well formed length of a string 
+  measured in characters (rather than in bytes).
+  Version for character sets that define IS_MB?_CHAR(), e.g. big5.
+*/
+static size_t
+MY_FUNCTION_NAME(well_formed_char_length)(CHARSET_INFO *cs __attribute__((unused)),
+                                          const char *b, const char *e,
+                                          size_t nchars,
+                                          MY_STRCOPY_STATUS *status)
+{
+  size_t nchars0= nchars;
+  for ( ; b < e && nchars ; nchars--)
+  {
+    if ((uchar) b[0] < 128)
+    {
+      b++; /* Single byte ASCII character */
+      continue;
+    }
+
+    if (b + 2 <= e && IS_MB2_CHAR(b[0], b[1]))
+    {
+      b+= 2; /* Double byte character */
+      continue;
+    }
+
+#ifdef IS_MB3_CHAR
+    if (b + 3 <= e && IS_MB3_CHAR(b[0], b[1], b[2]))
+    {
+      b+= 3; /* Three-byte character */
+      continue;
+    }
+#endif
+
+#ifdef IS_MB4_CHAR
+    if (b + 4 <= e && IS_MB4_CHAR(b[0], b[1], b[2], b[3]))
+    {
+      b+= 4; /* Four-byte character */
+      continue;
+    }
+#endif
+
+#ifdef IS_8BIT_CHAR
+    if (IS_8BIT_CHAR(b[0]))
+    {      
+      b++; /* Single byte non-ASCII character, e.g. half width kana in sjis */
+      continue;
+    }
+#endif
+
+    /* Wrong byte sequence */
+    status->m_source_end_pos= status->m_well_formed_error_pos= b;
+    return nchars0 - nchars;
+  }
+  status->m_source_end_pos= b;
+  status->m_well_formed_error_pos= NULL;
+  return nchars0 - nchars;
+}
+#endif /* DEFINE_WELL_FORMED_CHAR_LENGTH */
+
+
+#ifdef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
+#ifndef CHARLEN
+#error CHARLEN is not defined
+#endif
+/**
+  Returns well formed length of a string 
+  measured in characters (rather than in bytes).
+  Version for character sets that define CHARLEN(), e.g. utf8.
+  CHARLEN(cs,b,e) must use the same return code convention that mb_wc() does:
+  - a positive number in the range [1-mbmaxlen] if a valid
+    single-byte or multi-byte character was found
+  - MY_CS_ILSEQ (0) on a bad byte sequence
+  - MY_CS_TOOSMALLxx if the incoming sequence is incomplete
+*/
+static size_t
+MY_FUNCTION_NAME(well_formed_char_length)(CHARSET_INFO *cs __attribute__((unused)),
+                                          const char *b, const char *e,
+                                          size_t nchars,
+                                          MY_STRCOPY_STATUS *status)
+{
+  size_t nchars0= nchars;
+  int chlen;
+  for ( ; nchars ; nchars--, b+= chlen)
+  {
+    if ((chlen= CHARLEN(cs, (uchar*) b, (uchar*) e)) <= 0)
+    {
+      status->m_well_formed_error_pos= b < e ? b : NULL;
+      status->m_source_end_pos= b;
+      return nchars0 - nchars;
+    }
+  }
+  status->m_well_formed_error_pos= NULL;
+  status->m_source_end_pos= b;
+  return nchars0 - nchars;
+}
+#endif /* DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN */
diff --git a/strings/ctype-simple.c b/strings/ctype-simple.c
index b010c528979..d7a1b3f33b4 100644
--- a/strings/ctype-simple.c
+++ b/strings/ctype-simple.c
@@ -248,6 +248,13 @@ int my_strcasecmp_8bit(CHARSET_INFO * cs,const char *s, const char *t)
 }
 
 
+int my_charlen_8bit(CHARSET_INFO *cs __attribute__((unused)),
+                    const uchar *str, const uchar *end)
+{
+  return str >= end ? MY_CS_TOOSMALL : 1;
+}
+
+
 int my_mb_wc_8bit(CHARSET_INFO *cs,my_wc_t *wc,
 		  const uchar *str,
 		  const uchar *end __attribute__((unused)))
@@ -1108,6 +1115,19 @@ size_t my_well_formed_len_8bit(CHARSET_INFO *cs __attribute__((unused)),
 }
 
 
+size_t
+my_well_formed_char_length_8bit(CHARSET_INFO *cs __attribute__((unused)),
+                                const char *start, const char *end,
+                                size_t nchars, MY_STRCOPY_STATUS *status)
+{
+  size_t nbytes= (size_t) (end - start);
+  size_t res= MY_MIN(nbytes, nchars);
+  status->m_well_formed_error_pos= NULL;
+  status->m_source_end_pos= start + res;
+  return res;
+}
+
+
 /*
   Copy a 8-bit string. Not more than "nchars" character are copied.
 */
@@ -1906,6 +1926,8 @@ MY_CHARSET_HANDLER my_charset_8bit_handler=
     my_strtoll10_8bit,
     my_strntoull10rnd_8bit,
     my_scan_8bit,
+    my_charlen_8bit,
+    my_well_formed_char_length_8bit,
     my_copy_8bit,
 };
 
diff --git a/strings/ctype-sjis.c b/strings/ctype-sjis.c
index 432e2e5e823..bbf0026cf2b 100644
--- a/strings/ctype-sjis.c
+++ b/strings/ctype-sjis.c
@@ -187,7 +187,7 @@ static const uchar sort_order_sjis[]=
 #define MY_FUNCTION_NAME(x)   my_ ## x ## _sjis
 #define IS_8BIT_CHAR(x)       issjiskata(x)
 #define IS_MB2_CHAR(x,y)      (issjishead(x) && issjistail(y))
-#define WELL_FORMED_LEN
+#define DEFINE_ASIAN_ROUTINES
 #include "ctype-mb.ic"
 
 
@@ -34144,7 +34144,9 @@ static MY_CHARSET_HANDLER my_charset_handler=
   my_strtoll10_8bit,
   my_strntoull10rnd_8bit,
   my_scan_8bit,
-  my_copy_abort_mb,
+  my_charlen_sjis,
+  my_well_formed_char_length_sjis,
+  my_copy_fix_mb,
 };
 
 
diff --git a/strings/ctype-tis620.c b/strings/ctype-tis620.c
index 343fb812e20..6537b380ab3 100644
--- a/strings/ctype-tis620.c
+++ b/strings/ctype-tis620.c
@@ -886,6 +886,8 @@ static MY_CHARSET_HANDLER my_charset_handler=
     my_strtoll10_8bit,
     my_strntoull10rnd_8bit,
     my_scan_8bit,
+    my_charlen_8bit,
+    my_well_formed_char_length_8bit,
     my_copy_8bit,
 };
 
diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c
index 8f234e9e3a8..d1441a4d3a5 100644
--- a/strings/ctype-ucs2.c
+++ b/strings/ctype-ucs2.c
@@ -92,62 +92,107 @@ my_strcasecmp_mb2_or_mb4(CHARSET_INFO *cs __attribute__((unused)),
 }
 
 
+typedef enum
+{
+  MY_CHAR_COPY_OK=       0, /* The character was OK */
+  MY_CHAR_COPY_ERROR=    1, /* The character was not Ok, and could not fix */
+  MY_CHAR_COPY_FIXED=    2  /* The character was not Ok, was fixed to '?' */
+} my_char_copy_status_t;
+
+
 /*
-  Copy an UCS2/UTF16/UTF32 string.
-  Not more that "nchars" characters are copied.
+  Copies an incomplete character, left-padding it with 0x00 bytes.
+  
+  @param cs           Character set
+  @param dst          The destination string
+  @param dst_length   Space available in dst
+  @param src          The source string
+  @param src_length   Length of src
+  @param nchars       Copy not more than nchars characters.
+                      The "nchars" parameter of the caller.
+                      Only 0 and non-0 are important here.
+  @param fix          What to do if zero-padding did not produce a valid
+                      character:
+                      - FALSE - exit with error.
+                      - TRUE  - try to put '?' instead.
+  
+  @return  MY_CHAR_COPY_OK     if after zero-padding got a valid character.
+                               cs->mbmaxlen bytes were written to "dst".
+  @return  MY_CHAR_COPY_FIXED  if after zero-padding did not get a valid
+                               character, but wrote '?' to the destination
+                               string instead.
+                               cs->mbminlen bytes were written to "dst".
+  @return  MY_CHAR_COPY_ERROR  If failed and nothing was written to "dst".
+                               Possible reasons:
+                               - dst_length was too short
+                               - nchars was 0
+                               - the character after padding appeared not
+                                 to be valid, and could not fix it to '?'.
+*/
+static my_char_copy_status_t
+my_copy_incomplete_char(CHARSET_INFO *cs,
+                        char *dst, size_t dst_length,
+                        const char *src, size_t src_length,
+                        size_t nchars, my_bool fix)
+{
+  size_t pad_length;
+  size_t src_offset= src_length % cs->mbminlen;
+  if (dst_length < cs->mbminlen || !nchars)
+    return MY_CHAR_COPY_ERROR;
+
+  pad_length= cs->mbminlen - src_offset;
+  bzero(dst, pad_length);
+  memmove(dst + pad_length, src, src_offset);
+  /*
+    In some cases left zero-padding can create an incorrect character.
+    For example:
+      INSERT INTO t1 (utf32_column) VALUES (0x110000);
+    We'll pad the value to 0x00110000, which is a wrong UTF32 sequence!
+    The valid characters range is limited to 0x00000000..0x0010FFFF.
+    
+    Make sure we didn't pad to an incorrect character.
+  */
+  if (cs->cset->charlen(cs, (uchar *) dst, (uchar *) dst + cs->mbminlen) ==
+      (int) cs->mbminlen)
+    return MY_CHAR_COPY_OK;
 
-  UCS2/UTF16/UTF32 may need to prepend zero some bytes,
-  e.g. when copying from a BINARY source:
-  INSERT INTO t1 (ucs2_column) VALUES (0x01);
-  0x01 -> 0x0001
+  if (fix &&
+      cs->cset->wc_mb(cs, '?', (uchar *) dst, (uchar *) dst + cs->mbminlen) ==
+      (int) cs->mbminlen)
+    return MY_CHAR_COPY_FIXED;
+
+  return MY_CHAR_COPY_ERROR;
+}
+
+
+/*
+  Copy an UCS2/UTF16/UTF32 string, fix bad characters.
 */
 static size_t
-my_copy_abort_mb2_or_mb4(CHARSET_INFO *cs,
-                         char *dst, size_t dst_length,
-                         const char *src, size_t src_length,
-                         size_t nchars, MY_STRCOPY_STATUS *status)
+my_copy_fix_mb2_or_mb4(CHARSET_INFO *cs,
+                       char *dst, size_t dst_length,
+                       const char *src, size_t src_length,
+                       size_t nchars, MY_STRCOPY_STATUS *status)
 {
-  size_t src_offset;
-
-  if ((src_offset= (src_length % cs->mbminlen)))
-  {
-    int well_formed_error;
-    size_t pad_length;
-    if (dst_length < cs->mbminlen || !nchars)
-    {
-      status->m_source_end_pos= status->m_well_formed_error_pos= src;
-      return 0;
-    }
-
-    pad_length= cs->mbminlen - src_offset;
-    bzero(dst, pad_length);
-    memmove(dst + pad_length, src, src_offset);
-    /*
-      In some cases left zero-padding can create an incorrect character.
-      For example:
-        INSERT INTO t1 (utf32_column) VALUES (0x110000);
-      We'll pad the value to 0x00110000, which is a wrong UTF32 sequence!
-      The valid characters range is limited to 0x00000000..0x0010FFFF.
-      
-      Make sure we didn't pad to an incorrect character.
-    */
-    if (cs->cset->well_formed_len(cs,
-                                  dst, dst + cs->mbminlen, 1,
-                                  &well_formed_error) != cs->mbminlen)
-    {
-      status->m_source_end_pos= status->m_well_formed_error_pos= src;
-      return 0;
-    }
-    nchars--;
-    src+= src_offset;
-    src_length-= src_offset;
-    dst+= cs->mbminlen;
-    dst_length-= cs->mbminlen;
-    return
-      cs->mbminlen /* The left-padded character */ +
-      my_copy_abort_mb(cs, dst, dst_length, src, src_length, nchars, status);
+  size_t length2, src_offset= src_length % cs->mbminlen;
+  my_char_copy_status_t padstatus;
+  
+  if (!src_offset)
+    return  my_copy_fix_mb(cs, dst, dst_length,
+                               src, src_length, nchars, status);
+  if ((padstatus= my_copy_incomplete_char(cs, dst, dst_length,
+                                          src, src_length, nchars, TRUE)) ==
+      MY_CHAR_COPY_ERROR)
+  {
+    status->m_source_end_pos= status->m_well_formed_error_pos= src;
+    return 0;
   }
-  return  my_copy_abort_mb(cs, dst, dst_length, src, src_length, nchars, status);
+  length2= my_copy_fix_mb(cs, dst + cs->mbminlen, dst_length - cs->mbminlen,
+                          src + src_offset, src_length - src_offset,
+                          nchars - 1, status);
+  if (padstatus == MY_CHAR_COPY_FIXED)
+    status->m_well_formed_error_pos= src;
+  return cs->mbminlen /* The left-padded character */ + length2;
 }
 
 
@@ -1475,6 +1520,24 @@ my_ismbchar_utf16(CHARSET_INFO *cs, const char *b, const char *e)
 }
 
 
+static int
+my_charlen_utf16(CHARSET_INFO *cs, const uchar *str, const uchar *end)
+{
+  my_wc_t wc;
+  return cs->cset->mb_wc(cs, &wc, str, end);
+}
+
+
+#define MY_FUNCTION_NAME(x)       my_ ## x ## _utf16
+#define CHARLEN(cs,str,end)       my_charlen_utf16(cs,str,end)
+#define DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
+#include "ctype-mb.ic"
+#undef MY_FUNCTION_NAME
+#undef CHARLEN
+#undef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
+/* Defines my_well_formed_char_length_utf16 */
+
+
 static uint
 my_mbcharlen_utf16(CHARSET_INFO *cs  __attribute__((unused)),
                    uint c __attribute__((unused)))
@@ -1742,7 +1805,9 @@ MY_CHARSET_HANDLER my_charset_utf16_handler=
   my_strtoll10_mb2,
   my_strntoull10rnd_mb2_or_mb4,
   my_scan_mb2,
-  my_copy_abort_mb2_or_mb4,
+  my_charlen_utf16,
+  my_well_formed_char_length_utf16,
+  my_copy_fix_mb2_or_mb4,
 };
 
 
@@ -1912,7 +1977,9 @@ static MY_CHARSET_HANDLER my_charset_utf16le_handler=
   my_strtoll10_mb2,
   my_strntoull10rnd_mb2_or_mb4,
   my_scan_mb2,
-  my_copy_abort_mb2_or_mb4,
+  my_charlen_utf16,
+  my_well_formed_char_length_utf16,
+  my_copy_fix_mb2_or_mb4,
 };
 
 
@@ -1987,6 +2054,13 @@ struct charset_info_st my_charset_utf16le_bin=
 
 #ifdef HAVE_CHARSET_utf32
 
+/*
+  Check if b0 and b1 start a valid UTF32 four-byte sequence.
+  Don't accept characters greater than U+10FFFF.
+*/
+#define IS_UTF32_MBHEAD4(b0,b1) (!(b0) && ((uchar) (b1) <= 0x10))
+
+
 static int
 my_utf32_uni(CHARSET_INFO *cs __attribute__((unused)),
              my_wc_t *pwc, const uchar *s, const uchar *e)
@@ -1994,7 +2068,7 @@ my_utf32_uni(CHARSET_INFO *cs __attribute__((unused)),
   if (s + 4 > e)
     return MY_CS_TOOSMALL4;
   *pwc= (s[0] << 24) + (s[1] << 16) + (s[2] << 8) + (s[3]);
-  return 4;
+  return *pwc > 0x10FFFF ? MY_CS_ILSEQ : 4;
 }
 
 
@@ -2004,7 +2078,10 @@ my_uni_utf32(CHARSET_INFO *cs __attribute__((unused)),
 {
   if (s + 4 > e) 
     return MY_CS_TOOSMALL4;
-  
+
+  if (wc > 0x10FFFF)  
+    return MY_CS_ILUNI;
+
   s[0]= (uchar) (wc >> 24);
   s[1]= (uchar) (wc >> 16) & 0xFF;
   s[2]= (uchar) (wc >> 8)  & 0xFF;
@@ -2263,10 +2340,29 @@ my_ismbchar_utf32(CHARSET_INFO *cs __attribute__((unused)),
                   const char *b,
                   const char *e)
 {
-  return b + 4 > e ? 0 : 4;
+  return b + 4 > e || !IS_UTF32_MBHEAD4(b[0], b[1]) ? 0 : 4;
 }
 
 
+static int
+my_charlen_utf32(CHARSET_INFO *cs __attribute__((unused)),
+                 const uchar *b, const uchar *e)
+{
+  return b + 4 > e ? MY_CS_TOOSMALL4 :
+         IS_UTF32_MBHEAD4(b[0], b[1]) ? 4 : MY_CS_ILSEQ;
+}
+
+
+#define MY_FUNCTION_NAME(x)       my_ ## x ## _utf32
+#define CHARLEN(cs,str,end)       my_charlen_utf32(cs,str,end)
+#define DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
+#include "ctype-mb.ic"
+#undef MY_FUNCTION_NAME
+#undef CHARLEN
+#undef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
+/* Defines my_well_formed_char_length_utf32 */
+
+
 static uint
 my_mbcharlen_utf32(CHARSET_INFO *cs  __attribute__((unused)) , 
                    uint c __attribute__((unused)))
@@ -2579,8 +2675,7 @@ my_well_formed_len_utf32(CHARSET_INFO *cs __attribute__((unused)),
   }
   for (; b < e; b+= 4)
   {
-    /* Don't accept characters greater than U+10FFFF */
-    if (b[0] || (uchar) b[1] > 0x10)
+    if (!IS_UTF32_MBHEAD4(b[0], b[1]))
     {
       *error= 1;
       return b - b0;
@@ -2827,7 +2922,9 @@ MY_CHARSET_HANDLER my_charset_utf32_handler=
   my_strtoll10_utf32,
   my_strntoull10rnd_mb2_or_mb4,
   my_scan_utf32,
-  my_copy_abort_mb2_or_mb4,
+  my_charlen_utf32,
+  my_well_formed_char_length_utf32,
+  my_copy_fix_mb2_or_mb4,
 };
 
 
@@ -2961,6 +3058,14 @@ static const uchar to_upper_ucs2[] = {
 };
 
 
+static int
+my_charlen_ucs2(CHARSET_INFO *cs __attribute__((unused)),
+		const uchar *s, const uchar *e)
+{
+  return s + 2 > e ? MY_CS_TOOSMALLN(2) : 2;
+}
+
+
 static int my_ucs2_uni(CHARSET_INFO *cs __attribute__((unused)),
 		       my_wc_t * pwc, const uchar *s, const uchar *e)
 {
@@ -3264,6 +3369,31 @@ size_t my_well_formed_len_ucs2(CHARSET_INFO *cs __attribute__((unused)),
 }
 
 
+static size_t
+my_well_formed_char_length_ucs2(CHARSET_INFO *cs __attribute__((unused)),
+                                const char *b, const char *e,
+                                size_t nchars, MY_STRCOPY_STATUS *status)
+{
+  size_t length= e - b;
+  if (nchars * 2 <= length)
+  {
+    status->m_well_formed_error_pos= NULL;
+    status->m_source_end_pos= b + (nchars * 2);
+    return nchars;
+  }
+  if (length % 2)
+  {
+    status->m_well_formed_error_pos= status->m_source_end_pos= e - 1;
+  }
+  else
+  {
+    status->m_well_formed_error_pos= NULL;
+    status->m_source_end_pos= e;
+  }
+  return length / 2;
+}
+
+
 static
 int my_wildcmp_ucs2_ci(CHARSET_INFO *cs,
 		    const char *str,const char *str_end,
@@ -3446,7 +3576,9 @@ MY_CHARSET_HANDLER my_charset_ucs2_handler=
     my_strtoll10_mb2,
     my_strntoull10rnd_mb2_or_mb4,
     my_scan_mb2,
-    my_copy_abort_mb2_or_mb4,
+    my_charlen_ucs2,
+    my_well_formed_char_length_ucs2,
+    my_copy_fix_mb2_or_mb4,
 };
 
 
diff --git a/strings/ctype-ujis.c b/strings/ctype-ujis.c
index 99f5be3fa38..cb000a2afa0 100644
--- a/strings/ctype-ujis.c
+++ b/strings/ctype-ujis.c
@@ -197,7 +197,7 @@ static const uchar sort_order_ujis[]=
 #define IS_MB2_KATA(x,y)      (isujis_ss2(x)    && iskata(y))
 #define IS_MB2_CHAR(x, y)     (IS_MB2_KATA(x,y) || IS_MB2_JIS(x,y))
 #define IS_MB3_CHAR(x, y, z)  (isujis_ss3(x)    && IS_MB2_JIS(y,z))
-#define WELL_FORMED_LEN
+#define DEFINE_ASIAN_ROUTINES
 #include "ctype-mb.ic"
 
 
@@ -67255,7 +67255,9 @@ static MY_CHARSET_HANDLER my_charset_handler=
     my_strtoll10_8bit,
     my_strntoull10rnd_8bit,
     my_scan_8bit,
-    my_copy_abort_mb,
+    my_charlen_ujis,
+    my_well_formed_char_length_ujis,
+    my_copy_fix_mb,
 };
 
 
diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c
index 1116228f706..56824aac59e 100644
--- a/strings/ctype-utf8.c
+++ b/strings/ctype-utf8.c
@@ -5446,8 +5446,8 @@ int my_wildcmp_utf8(CHARSET_INFO *cs,
 
 
 static
-int my_valid_mbcharlen_utf8(CHARSET_INFO *cs __attribute__((unused)),
-                            const uchar *s, const uchar *e)
+int my_charlen_utf8(CHARSET_INFO *cs __attribute__((unused)),
+                    const uchar *s, const uchar *e)
 {
   uchar c;
 
@@ -5515,7 +5515,7 @@ my_well_formed_len_utf8(CHARSET_INFO *cs, const char *b, const char *e,
   {
     int mb_len;
 
-    if ((mb_len= my_valid_mbcharlen_utf8(cs, (uchar*) b, (uchar*) e)) <= 0)
+    if ((mb_len= my_charlen_utf8(cs, (uchar*) b, (uchar*) e)) <= 0)
     {
       *error= b < e ? 1 : 0;
       break;
@@ -5526,9 +5526,20 @@ my_well_formed_len_utf8(CHARSET_INFO *cs, const char *b, const char *e,
   return (size_t) (b - b_start);
 }
 
+
+#define MY_FUNCTION_NAME(x)       my_ ## x ## _utf8
+#define CHARLEN(cs,str,end)       my_charlen_utf8(cs,str,end)
+#define DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
+#include "ctype-mb.ic"
+#undef MY_FUNCTION_NAME
+#undef CHARLEN
+#undef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
+/* Defines my_well_formed_char_length_utf8 */
+
+
 static uint my_ismbchar_utf8(CHARSET_INFO *cs,const char *b, const char *e)
 {
-  int  res= my_valid_mbcharlen_utf8(cs, (const uchar*)b, (const uchar*)e);
+  int  res= my_charlen_utf8(cs, (const uchar*) b, (const uchar*) e);
   return (res>1) ? res : 0;
 }
 
@@ -5615,7 +5626,9 @@ MY_CHARSET_HANDLER my_charset_utf8_handler=
     my_strtoll10_8bit,
     my_strntoull10rnd_8bit,
     my_scan_8bit,
-    my_copy_abort_mb,
+    my_charlen_utf8,
+    my_well_formed_char_length_utf8,
+    my_copy_fix_mb,
 };
 
 
@@ -7125,6 +7138,24 @@ my_wc_mb_filename(CHARSET_INFO *cs __attribute__((unused)),
 }
 
 
+static int
+my_charlen_filename(CHARSET_INFO *cs, const uchar *str, const uchar *end)
+{
+  my_wc_t wc;
+  return cs->cset->mb_wc(cs, &wc, str, end);
+}
+
+
+#define MY_FUNCTION_NAME(x)       my_ ## x ## _filename
+#define CHARLEN(cs,str,end)       my_charlen_filename(cs,str,end)
+#define DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
+#include "ctype-mb.ic"
+#undef MY_FUNCTION_NAME
+#undef CHARLEN
+#undef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
+/* Defines my_well_formed_char_length_filename */
+
+
 static MY_COLLATION_HANDLER my_collation_filename_handler =
 {
     NULL,               /* init */
@@ -7169,7 +7200,9 @@ static MY_CHARSET_HANDLER my_charset_filename_handler=
     my_strtoll10_8bit,
     my_strntoull10rnd_8bit,
     my_scan_8bit,
-    my_copy_abort_mb,
+    my_charlen_filename,
+    my_well_formed_char_length_filename,
+    my_copy_fix_mb,
 };
 
 
@@ -7954,8 +7987,8 @@ my_wildcmp_utf8mb4(CHARSET_INFO *cs,
 
 
 static int
-my_valid_mbcharlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
-                           const uchar *s, const uchar *e)
+my_charlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
+                   const uchar *s, const uchar *e)
 {
   uchar c;
 
@@ -8015,7 +8048,7 @@ size_t my_well_formed_len_utf8mb4(CHARSET_INFO *cs,
   {
     int mb_len;
 
-    if ((mb_len= my_valid_mbcharlen_utf8mb4(cs, (uchar*) b, (uchar*) e)) <= 0)
+    if ((mb_len= my_charlen_utf8mb4(cs, (uchar*) b, (uchar*) e)) <= 0)
     {
       *error= b < e ? 1 : 0;
       break;
@@ -8027,10 +8060,19 @@ size_t my_well_formed_len_utf8mb4(CHARSET_INFO *cs,
 }
 
 
+#define MY_FUNCTION_NAME(x)       my_ ## x ## _utf8mb4
+#define CHARLEN(cs,str,end)       my_charlen_utf8mb4(cs,str,end)
+#define DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
+#include "ctype-mb.ic"
+#undef MY_FUNCTION_NAME
+#undef CHARLEN
+#undef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
+/* Defines my_well_formed_char_length_utf8mb4 */
+
 static uint
 my_ismbchar_utf8mb4(CHARSET_INFO *cs, const char *b, const char *e)
 {
-  int res= my_valid_mbcharlen_utf8mb4(cs, (const uchar*)b, (const uchar*)e);
+  int res= my_charlen_utf8mb4(cs, (const uchar*) b, (const uchar*) e);
   return (res > 1) ? res : 0;
 }
 
@@ -8113,7 +8155,9 @@ MY_CHARSET_HANDLER my_charset_utf8mb4_handler=
   my_strtoll10_8bit,
   my_strntoull10rnd_8bit,
   my_scan_8bit,
-  my_copy_abort_mb,
+  my_charlen_utf8mb4,
+  my_well_formed_char_length_utf8mb4,
+  my_copy_fix_mb,
 };
author	Alexander Barkov <bar@mariadb.org>	2015-03-13 16:51:36 +0400
committer	Alexander Barkov <bar@mariadb.org>	2015-03-13 16:51:36 +0400
commit	197afb413fcc9f06b5e5e6ef41ce980d108b354f (patch)
tree	7052fbaa1bf1af1c5c849e8fda4a3a790af09b25 /strings
parent	702fba1511c90ea9c72b6c00122e0f31a05237b4 (diff)
download	mariadb-git-197afb413fcc9f06b5e5e6ef41ce980d108b354f.tar.gz