summary | refs | log | tree | commit | diff
path: root/strings
diff options
context:
space:
mode:
author: Alexander Barkov <bar@mariadb.org> 2015-03-13 16:51:36 +0400
committer: Alexander Barkov <bar@mariadb.org> 2015-03-13 16:51:36 +0400
commit: 197afb413fcc9f06b5e5e6ef41ce980d108b354f (patch)
tree: 7052fbaa1bf1af1c5c849e8fda4a3a790af09b25 /strings
parent: 702fba1511c90ea9c72b6c00122e0f31a05237b4 (diff)
download: mariadb-git-197afb413fcc9f06b5e5e6ef41ce980d108b354f.tar.gz
MDEV-6566 Different INSERT behaviour on bad bytes with and without character set conversion
Diffstat (limited to 'strings')
-rw-r--r-- strings/ctype-big5.c 9
-rw-r--r-- strings/ctype-bin.c 2
-rw-r--r-- strings/ctype-cp932.c 6
-rw-r--r-- strings/ctype-euc_kr.c 9
-rw-r--r-- strings/ctype-eucjpms.c 6
-rw-r--r-- strings/ctype-gb2312.c 11
-rw-r--r-- strings/ctype-gbk.c 9
-rw-r--r-- strings/ctype-latin1.c 2
-rw-r--r-- strings/ctype-mb.c 98
-rw-r--r-- strings/ctype-mb.ic 168
-rw-r--r-- strings/ctype-simple.c 22
-rw-r--r-- strings/ctype-sjis.c 6
-rw-r--r-- strings/ctype-tis620.c 2
-rw-r--r-- strings/ctype-ucs2.c 250
-rw-r--r-- strings/ctype-ujis.c 6
-rw-r--r-- strings/ctype-utf8.c 66
16 files changed, 569 insertions, 103 deletions
diff --git a/strings/ctype-big5.c b/strings/ctype-big5.c
index d631bd0a34e..eda81c0c4d3 100644
--- a/strings/ctype-big5.c
+++ b/strings/ctype-big5.c
@@ -50,7 +50,7 @@
#define MY_FUNCTION_NAME(x) my_ ## x ## _big5
#define IS_MB2_CHAR(x,y) (isbig5head(x) && isbig5tail(y))
-#define WELL_FORMED_LEN
+#define DEFINE_ASIAN_ROUTINES
#include "ctype-mb.ic"
@@ -6843,6 +6843,9 @@ my_mb_wc_big5(CHARSET_INFO *cs __attribute__((unused)),
if (s+2>e)
return MY_CS_TOOSMALL2;
+ if (!IS_MB2_CHAR(hi, s[1]))
+ return MY_CS_ILSEQ;
+
if (!(pwc[0]=func_big5_uni_onechar((hi<<8)+s[1])))
return -2;
@@ -6894,7 +6897,9 @@ static MY_CHARSET_HANDLER my_charset_big5_handler=
my_strtoll10_8bit,
my_strntoull10rnd_8bit,
my_scan_8bit,
- my_copy_abort_mb,
+ my_charlen_big5,
+ my_well_formed_char_length_big5,
+ my_copy_fix_mb,
};
struct charset_info_st my_charset_big5_chinese_ci=
diff --git a/strings/ctype-bin.c b/strings/ctype-bin.c
index 6b53b34159a..95f31038ee6 100644
--- a/strings/ctype-bin.c
+++ b/strings/ctype-bin.c
@@ -549,6 +549,8 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_strtoll10_8bit,
my_strntoull10rnd_8bit,
my_scan_8bit,
+ my_charlen_8bit,
+ my_well_formed_char_length_8bit,
my_copy_8bit,
};
diff --git a/strings/ctype-cp932.c b/strings/ctype-cp932.c
index 13129a6a874..2e26a98bf05 100644
--- a/strings/ctype-cp932.c
+++ b/strings/ctype-cp932.c
@@ -186,7 +186,7 @@ static const uchar sort_order_cp932[]=
#define MY_FUNCTION_NAME(x) my_ ## x ## _cp932
#define IS_8BIT_CHAR(x) iscp932kata(x)
#define IS_MB2_CHAR(x,y) (iscp932head(x) && iscp932tail(y))
-#define WELL_FORMED_LEN
+#define DEFINE_ASIAN_ROUTINES
#include "ctype-mb.ic"
@@ -34765,7 +34765,9 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_strtoll10_8bit,
my_strntoull10rnd_8bit,
my_scan_8bit,
- my_copy_abort_mb,
+ my_charlen_cp932,
+ my_well_formed_char_length_cp932,
+ my_copy_fix_mb,
};
diff --git a/strings/ctype-euc_kr.c b/strings/ctype-euc_kr.c
index eab9539ad45..a2c95bf77c8 100644
--- a/strings/ctype-euc_kr.c
+++ b/strings/ctype-euc_kr.c
@@ -204,7 +204,7 @@ static const uchar sort_order_euc_kr[]=
#define MY_FUNCTION_NAME(x) my_ ## x ## _euckr
#define IS_MB2_CHAR(x,y) (iseuc_kr_head(x) && iseuc_kr_tail(y))
-#define WELL_FORMED_LEN
+#define DEFINE_ASIAN_ROUTINES
#include "ctype-mb.ic"
@@ -9928,6 +9928,9 @@ my_mb_wc_euc_kr(CHARSET_INFO *cs __attribute__((unused)),
if (s+2>e)
return MY_CS_TOOSMALL2;
+ if (!IS_MB2_CHAR(hi, s[1]))
+ return MY_CS_ILSEQ;
+
if (!(pwc[0]=func_ksc5601_uni_onechar((hi<<8)+s[1])))
return -2;
@@ -9979,7 +9982,9 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_strtoll10_8bit,
my_strntoull10rnd_8bit,
my_scan_8bit,
- my_copy_abort_mb,
+ my_charlen_euckr,
+ my_well_formed_char_length_euckr,
+ my_copy_fix_mb,
};
diff --git a/strings/ctype-eucjpms.c b/strings/ctype-eucjpms.c
index 52873c2f87e..827feda927b 100644
--- a/strings/ctype-eucjpms.c
+++ b/strings/ctype-eucjpms.c
@@ -198,7 +198,7 @@ static const uchar sort_order_eucjpms[]=
#define IS_MB2_KATA(x,y) (iseucjpms_ss2(x) && iskata(y))
#define IS_MB2_CHAR(x,y) (IS_MB2_KATA(x,y) || IS_MB2_JIS(x,y))
#define IS_MB3_CHAR(x,y,z) (iseucjpms_ss3(x) && IS_MB2_JIS(y,z))
-#define WELL_FORMED_LEN
+#define DEFINE_ASIAN_ROUTINES
#include "ctype-mb.ic"
@@ -67511,7 +67511,9 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_strtoll10_8bit,
my_strntoull10rnd_8bit,
my_scan_8bit,
- my_copy_abort_mb,
+ my_charlen_eucjpms,
+ my_well_formed_char_length_eucjpms,
+ my_copy_fix_mb,
};
diff --git a/strings/ctype-gb2312.c b/strings/ctype-gb2312.c
index a4268b8fd68..129e8edb966 100644
--- a/strings/ctype-gb2312.c
+++ b/strings/ctype-gb2312.c
@@ -167,7 +167,7 @@ static const uchar sort_order_gb2312[]=
#define MY_FUNCTION_NAME(x) my_ ## x ## _gb2312
#define IS_MB2_CHAR(x,y) (isgb2312head(x) && isgb2312tail(y))
-#define WELL_FORMED_LEN
+#define DEFINE_ASIAN_ROUTINES
#include "ctype-mb.ic"
@@ -6330,7 +6330,10 @@ my_mb_wc_gb2312(CHARSET_INFO *cs __attribute__((unused)),
if (s+2>e)
return MY_CS_TOOSMALL2;
-
+
+ if (!IS_MB2_CHAR(hi, s[1]))
+ return MY_CS_ILSEQ;
+
if (!(pwc[0]=func_gb2312_uni_onechar(((hi<<8)+s[1])&0x7F7F)))
return -2;
@@ -6382,7 +6385,9 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_strtoll10_8bit,
my_strntoull10rnd_8bit,
my_scan_8bit,
- my_copy_abort_mb,
+ my_charlen_gb2312,
+ my_well_formed_char_length_gb2312,
+ my_copy_fix_mb,
};
diff --git a/strings/ctype-gbk.c b/strings/ctype-gbk.c
index 392fdb487b6..b3bd1efb6c4 100644
--- a/strings/ctype-gbk.c
+++ b/strings/ctype-gbk.c
@@ -45,7 +45,7 @@
#define MY_FUNCTION_NAME(x) my_ ## x ## _gbk
#define IS_MB2_CHAR(x,y) (isgbkhead(x) && isgbktail(y))
-#define WELL_FORMED_LEN
+#define DEFINE_ASIAN_ROUTINES
#include "ctype-mb.ic"
@@ -10724,6 +10724,9 @@ my_mb_wc_gbk(CHARSET_INFO *cs __attribute__((unused)),
if (s+2>e)
return MY_CS_TOOSMALL2;
+ if (!IS_MB2_CHAR(hi, s[1]))
+ return MY_CS_ILSEQ;
+
if (!(pwc[0]=func_gbk_uni_onechar( (hi<<8) + s[1])))
return -2;
@@ -10776,7 +10779,9 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_strtoll10_8bit,
my_strntoull10rnd_8bit,
my_scan_8bit,
- my_copy_abort_mb,
+ my_charlen_gbk,
+ my_well_formed_char_length_gbk,
+ my_copy_fix_mb,
};
diff --git a/strings/ctype-latin1.c b/strings/ctype-latin1.c
index 099f03460ce..bc51911dceb 100644
--- a/strings/ctype-latin1.c
+++ b/strings/ctype-latin1.c
@@ -422,6 +422,8 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_strtoll10_8bit,
my_strntoull10rnd_8bit,
my_scan_8bit,
+ my_charlen_8bit,
+ my_well_formed_char_length_8bit,
my_copy_8bit,
};
diff --git a/strings/ctype-mb.c b/strings/ctype-mb.c
index fc41563324a..5947c3d4f4a 100644
--- a/strings/ctype-mb.c
+++ b/strings/ctype-mb.c
@@ -424,25 +424,95 @@ size_t my_well_formed_len_mb(CHARSET_INFO *cs, const char *b, const char *e,
/*
- Copy a multi-byte string. Abort if a bad byte sequence was found.
- Note more than "nchars" characters are copied.
+ Append a badly formed piece of string.
+ Bad bytes are fixed to '?'.
+
+ @param to The destination string
+ @param to_end The end of the destination string
+ @param from The source string
+ @param from_end The end of the source string
+ @param nchars Write not more than "nchars" characters.
+ @param status Copying status, must be previously initialized,
+ e.g. using well_formed_char_length() on the original
+ full source string.
*/
+static size_t
+my_append_fix_badly_formed_tail(CHARSET_INFO *cs,
+ char *to, char *to_end,
+ const char *from, const char *from_end,
+ size_t nchars,
+ MY_STRCOPY_STATUS *status)
+{
+ char *to0= to;
+
+ for ( ; nchars; nchars--)
+ {
+ int chlen;
+ if ((chlen= cs->cset->charlen(cs, (const uchar*) from,
+ (const uchar *) from_end)) > 0)
+ {
+ /* Found a valid character */ /* chlen == 1..MBMAXLEN */
+ DBUG_ASSERT(chlen <= (int) cs->mbmaxlen);
+ if (to + chlen > to_end)
+ goto end; /* Does not fit to "to" */
+ memcpy(to, from, (size_t) chlen);
+ from+= chlen;
+ to+= chlen;
+ continue;
+ }
+ if (chlen == MY_CS_ILSEQ) /* chlen == 0 */
+ {
+ DBUG_ASSERT(from < from_end); /* Shouldn't get MY_CS_ILSEQ if empty */
+ goto bad;
+ }
+ /* Got an incomplete character */ /* chlen == MY_CS_TOOSMALLXXX */
+ DBUG_ASSERT(chlen >= MY_CS_TOOSMALL6);
+ DBUG_ASSERT(chlen <= MY_CS_TOOSMALL);
+ if (from >= from_end)
+ break; /* End of the source string */
+bad:
+ /* Bad byte sequence, or incomplete character found */
+ if (!status->m_well_formed_error_pos)
+ status->m_well_formed_error_pos= from;
+
+ if ((chlen= cs->cset->wc_mb(cs, '?', (uchar*) to, (uchar *) to_end)) <= 0)
+ break; /* Question mark does not fit into the destination */
+ to+= chlen;
+ from++;
+ }
+end:
+ status->m_source_end_pos= from;
+ return to - to0;
+}
+
+
size_t
-my_copy_abort_mb(CHARSET_INFO *cs,
- char *dst, size_t dst_length,
- const char *src, size_t src_length,
- size_t nchars, MY_STRCOPY_STATUS *status)
+my_copy_fix_mb(CHARSET_INFO *cs,
+ char *dst, size_t dst_length,
+ const char *src, size_t src_length,
+ size_t nchars, MY_STRCOPY_STATUS *status)
{
- int well_formed_error;
- size_t res;
+ size_t well_formed_nchars;
+ size_t well_formed_length;
+ size_t fixed_length;
set_if_smaller(src_length, dst_length);
- res= cs->cset->well_formed_len(cs, src, src + src_length,
- nchars, &well_formed_error);
- memmove(dst, src, res);
- status->m_source_end_pos= src + res;
- status->m_well_formed_error_pos= well_formed_error ? src + res : NULL;
- return res;
+ well_formed_nchars= cs->cset->well_formed_char_length(cs,
+ src, src + src_length,
+ nchars, status);
+ DBUG_ASSERT(well_formed_nchars <= nchars);
+ memmove(dst, src, (well_formed_length= status->m_source_end_pos - src));
+ if (!status->m_well_formed_error_pos)
+ return well_formed_length;
+
+ fixed_length= my_append_fix_badly_formed_tail(cs,
+ dst + well_formed_length,
+ dst + dst_length,
+ src + well_formed_length,
+ src + src_length,
+ nchars - well_formed_nchars,
+ status);
+ return well_formed_length + fixed_length;
}
diff --git a/strings/ctype-mb.ic b/strings/ctype-mb.ic
index 70cc89c9af0..55094535d5e 100644
--- a/strings/ctype-mb.ic
+++ b/strings/ctype-mb.ic
@@ -29,7 +29,70 @@
#endif
-#ifdef WELL_FORMED_LEN
+#ifdef DEFINE_ASIAN_ROUTINES
+#define DEFINE_WELL_FORMED_LEN
+#define DEFINE_WELL_FORMED_CHAR_LENGTH
+#define DEFINE_CHARLEN
+#endif
+
+
+#ifdef DEFINE_CHARLEN
+/**
+ Returns length of the left-most character of a string.
+ @param cs - charset with mbminlen==1 and mbmaxlen<=4
+ @param b - the beginning of the string
+ @param e - the end of the string
+
+ @return MY_CS_ILSEQ if a bad byte sequence was found
+ @return MY_CS_TOOSMALL(N) if the string ended unexpectedly
+ @return >0 if a valid character was found
+*/
+static int
+MY_FUNCTION_NAME(charlen)(CHARSET_INFO *cs __attribute__((unused)),
+ const uchar *b, const uchar *e)
+{
+ DBUG_ASSERT(cs->mbminlen == 1);
+ DBUG_ASSERT(cs->mbmaxlen <= 4);
+
+ if (b >= e)
+ return MY_CS_TOOSMALL;
+ if ((uchar) b[0] < 128)
+ return 1; /* Single byte ASCII character */
+
+#ifdef IS_8BIT_CHAR
+ if (IS_8BIT_CHAR(b[0]))
+ {
+ /* Single byte non-ASCII character, e.g. half width kana in sjis */
+ return 1;
+ }
+#endif
+
+ if (b + 2 > e)
+ return MY_CS_TOOSMALLN(2);
+ if (IS_MB2_CHAR(b[0], b[1]))
+ return 2; /* Double byte character */
+
+#ifdef IS_MB3_CHAR
+ if (b + 3 > e)
+ return MY_CS_TOOSMALLN(3);
+ if (IS_MB3_CHAR(b[0], b[1], b[2]))
+ return 3; /* Three-byte character */
+#endif
+
+#ifdef IS_MB4_CHAR
+ if (b + 4 > e)
+ return MY_CS_TOOSMALLN(4);
+ if (IS_MB4_CHAR(b[0], b[1], b[2], b[3]))
+ return 4; /* Four-byte character */
+#endif
+
+ /* Wrong byte sequence */
+ return MY_CS_ILSEQ;
+}
+#endif /* DEFINE_CHARLEN */
+
+
+#ifdef DEFINE_WELL_FORMED_LEN
/**
Returns well formed length of a character string with
variable character length for character sets with:
@@ -91,4 +154,105 @@ MY_FUNCTION_NAME(well_formed_len)(CHARSET_INFO *cs __attribute__((unused)),
return b - b0;
}
-#endif /* WELL_FORMED_LEN */
+#endif /* DEFINE_WELL_FORMED_LEN */
+
+
+
+#ifdef DEFINE_WELL_FORMED_CHAR_LENGTH
+/**
+ Returns well formed length of a string
+ measured in characters (rather than in bytes).
+ Version for character sets that define IS_MB?_CHAR(), e.g. big5.
+*/
+static size_t
+MY_FUNCTION_NAME(well_formed_char_length)(CHARSET_INFO *cs __attribute__((unused)),
+ const char *b, const char *e,
+ size_t nchars,
+ MY_STRCOPY_STATUS *status)
+{
+ size_t nchars0= nchars;
+ for ( ; b < e && nchars ; nchars--)
+ {
+ if ((uchar) b[0] < 128)
+ {
+ b++; /* Single byte ASCII character */
+ continue;
+ }
+
+ if (b + 2 <= e && IS_MB2_CHAR(b[0], b[1]))
+ {
+ b+= 2; /* Double byte character */
+ continue;
+ }
+
+#ifdef IS_MB3_CHAR
+ if (b + 3 <= e && IS_MB3_CHAR(b[0], b[1], b[2]))
+ {
+ b+= 3; /* Three-byte character */
+ continue;
+ }
+#endif
+
+#ifdef IS_MB4_CHAR
+ if (b + 4 <= e && IS_MB4_CHAR(b[0], b[1], b[2], b[3]))
+ {
+ b+= 4; /* Four-byte character */
+ continue;
+ }
+#endif
+
+#ifdef IS_8BIT_CHAR
+ if (IS_8BIT_CHAR(b[0]))
+ {
+ b++; /* Single byte non-ASCII character, e.g. half width kana in sjis */
+ continue;
+ }
+#endif
+
+ /* Wrong byte sequence */
+ status->m_source_end_pos= status->m_well_formed_error_pos= b;
+ return nchars0 - nchars;
+ }
+ status->m_source_end_pos= b;
+ status->m_well_formed_error_pos= NULL;
+ return nchars0 - nchars;
+}
+#endif /* DEFINE_WELL_FORMED_CHAR_LENGTH */
+
+
+#ifdef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
+#ifndef CHARLEN
+#error CHARLEN is not defined
+#endif
+/**
+ Returns well formed length of a string
+ measured in characters (rather than in bytes).
+ Version for character sets that define CHARLEN(), e.g. utf8.
+ CHARLEN(cs,b,e) must use the same return code convention that mb_wc() does:
+ - a positive number in the range [1-mbmaxlen] if a valid
+ single-byte or multi-byte character was found
+ - MY_CS_ILSEQ (0) on a bad byte sequence
+ - MY_CS_TOOSMALLxx if the incoming sequence is incomplete
+*/
+static size_t
+MY_FUNCTION_NAME(well_formed_char_length)(CHARSET_INFO *cs __attribute__((unused)),
+ const char *b, const char *e,
+ size_t nchars,
+ MY_STRCOPY_STATUS *status)
+{
+ size_t nchars0= nchars;
+ int chlen;
+ for ( ; nchars ; nchars--, b+= chlen)
+ {
+ if ((chlen= CHARLEN(cs, (uchar*) b, (uchar*) e)) <= 0)
+ {
+ status->m_well_formed_error_pos= b < e ? b : NULL;
+ status->m_source_end_pos= b;
+ return nchars0 - nchars;
+ }
+ }
+ status->m_well_formed_error_pos= NULL;
+ status->m_source_end_pos= b;
+ return nchars0 - nchars;
+}
+#endif /* DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN */
diff --git a/strings/ctype-simple.c b/strings/ctype-simple.c
index b010c528979..d7a1b3f33b4 100644
--- a/strings/ctype-simple.c
+++ b/strings/ctype-simple.c
@@ -248,6 +248,13 @@ int my_strcasecmp_8bit(CHARSET_INFO * cs,const char *s, const char *t)
}
+int my_charlen_8bit(CHARSET_INFO *cs __attribute__((unused)),
+ const uchar *str, const uchar *end)
+{
+ return str >= end ? MY_CS_TOOSMALL : 1;
+}
+
+
int my_mb_wc_8bit(CHARSET_INFO *cs,my_wc_t *wc,
const uchar *str,
const uchar *end __attribute__((unused)))
@@ -1108,6 +1115,19 @@ size_t my_well_formed_len_8bit(CHARSET_INFO *cs __attribute__((unused)),
}
+size_t
+my_well_formed_char_length_8bit(CHARSET_INFO *cs __attribute__((unused)),
+ const char *start, const char *end,
+ size_t nchars, MY_STRCOPY_STATUS *status)
+{
+ size_t nbytes= (size_t) (end - start);
+ size_t res= MY_MIN(nbytes, nchars);
+ status->m_well_formed_error_pos= NULL;
+ status->m_source_end_pos= start + res;
+ return res;
+}
+
+
/*
Copy a 8-bit string. Not more than "nchars" character are copied.
*/
@@ -1906,6 +1926,8 @@ MY_CHARSET_HANDLER my_charset_8bit_handler=
my_strtoll10_8bit,
my_strntoull10rnd_8bit,
my_scan_8bit,
+ my_charlen_8bit,
+ my_well_formed_char_length_8bit,
my_copy_8bit,
};
diff --git a/strings/ctype-sjis.c b/strings/ctype-sjis.c
index 432e2e5e823..bbf0026cf2b 100644
--- a/strings/ctype-sjis.c
+++ b/strings/ctype-sjis.c
@@ -187,7 +187,7 @@ static const uchar sort_order_sjis[]=
#define MY_FUNCTION_NAME(x) my_ ## x ## _sjis
#define IS_8BIT_CHAR(x) issjiskata(x)
#define IS_MB2_CHAR(x,y) (issjishead(x) && issjistail(y))
-#define WELL_FORMED_LEN
+#define DEFINE_ASIAN_ROUTINES
#include "ctype-mb.ic"
@@ -34144,7 +34144,9 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_strtoll10_8bit,
my_strntoull10rnd_8bit,
my_scan_8bit,
- my_copy_abort_mb,
+ my_charlen_sjis,
+ my_well_formed_char_length_sjis,
+ my_copy_fix_mb,
};
diff --git a/strings/ctype-tis620.c b/strings/ctype-tis620.c
index 343fb812e20..6537b380ab3 100644
--- a/strings/ctype-tis620.c
+++ b/strings/ctype-tis620.c
@@ -886,6 +886,8 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_strtoll10_8bit,
my_strntoull10rnd_8bit,
my_scan_8bit,
+ my_charlen_8bit,
+ my_well_formed_char_length_8bit,
my_copy_8bit,
};
diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c
index 8f234e9e3a8..d1441a4d3a5 100644
--- a/strings/ctype-ucs2.c
+++ b/strings/ctype-ucs2.c
@@ -92,62 +92,107 @@ my_strcasecmp_mb2_or_mb4(CHARSET_INFO *cs __attribute__((unused)),
}
+typedef enum
+{
+ MY_CHAR_COPY_OK= 0, /* The character was OK */
+ MY_CHAR_COPY_ERROR= 1, /* The character was not Ok, and could not fix */
+ MY_CHAR_COPY_FIXED= 2 /* The character was not Ok, was fixed to '?' */
+} my_char_copy_status_t;
+
+
/*
- Copy an UCS2/UTF16/UTF32 string.
- Not more that "nchars" characters are copied.
+ Copies an incomplete character, left-padding it with 0x00 bytes.
+
+ @param cs Character set
+ @param dst The destination string
+ @param dst_length Space available in dst
+ @param src The source string
+ @param src_length Length of src
+ @param nchars Copy not more than nchars characters.
+ The "nchars" parameter of the caller.
+ Only 0 and non-0 are important here.
+ @param fix What to do if after zero-padding didn't get a valid
+ character:
+ - FALSE - exit with error.
+ - TRUE - try to put '?' instead.
+
+ @return MY_CHAR_COPY_OK if after zero-padding got a valid character.
+ cs->mbminlen bytes were written to "dst".
+ @return MY_CHAR_COPY_FIXED if after zero-padding did not get a valid
+ character, but wrote '?' to the destination
+ string instead.
+ cs->mbminlen bytes were written to "dst".
+ @return MY_CHAR_COPY_ERROR If failed and nothing was written to "dst".
+ Possible reasons:
+ - dst_length was too short
+ - nchars was 0
+ - the character after padding appeared not
+ to be valid, and could not fix it to '?'.
+*/
+static my_char_copy_status_t
+my_copy_incomplete_char(CHARSET_INFO *cs,
+ char *dst, size_t dst_length,
+ const char *src, size_t src_length,
+ size_t nchars, my_bool fix)
+{
+ size_t pad_length;
+ size_t src_offset= src_length % cs->mbminlen;
+ if (dst_length < cs->mbminlen || !nchars)
+ return MY_CHAR_COPY_ERROR;
+
+ pad_length= cs->mbminlen - src_offset;
+ bzero(dst, pad_length);
+ memmove(dst + pad_length, src, src_offset);
+ /*
+ In some cases left zero-padding can create an incorrect character.
+ For example:
+ INSERT INTO t1 (utf32_column) VALUES (0x110000);
+ We'll pad the value to 0x00110000, which is a wrong UTF32 sequence!
+ The valid characters range is limited to 0x00000000..0x0010FFFF.
+
+ Make sure we didn't pad to an incorrect character.
+ */
+ if (cs->cset->charlen(cs, (uchar *) dst, (uchar *) dst + cs->mbminlen) ==
+ (int) cs->mbminlen)
+ return MY_CHAR_COPY_OK;
- UCS2/UTF16/UTF32 may need to prepend zero some bytes,
- e.g. when copying from a BINARY source:
- INSERT INTO t1 (ucs2_column) VALUES (0x01);
- 0x01 -> 0x0001
+ if (fix &&
+ cs->cset->wc_mb(cs, '?', (uchar *) dst, (uchar *) dst + cs->mbminlen) ==
+ (int) cs->mbminlen)
+ return MY_CHAR_COPY_FIXED;
+
+ return MY_CHAR_COPY_ERROR;
+}
+
+
+/*
+ Copy an UCS2/UTF16/UTF32 string, fix bad characters.
*/
static size_t
-my_copy_abort_mb2_or_mb4(CHARSET_INFO *cs,
- char *dst, size_t dst_length,
- const char *src, size_t src_length,
- size_t nchars, MY_STRCOPY_STATUS *status)
+my_copy_fix_mb2_or_mb4(CHARSET_INFO *cs,
+ char *dst, size_t dst_length,
+ const char *src, size_t src_length,
+ size_t nchars, MY_STRCOPY_STATUS *status)
{
- size_t src_offset;
-
- if ((src_offset= (src_length % cs->mbminlen)))
- {
- int well_formed_error;
- size_t pad_length;
- if (dst_length < cs->mbminlen || !nchars)
- {
- status->m_source_end_pos= status->m_well_formed_error_pos= src;
- return 0;
- }
-
- pad_length= cs->mbminlen - src_offset;
- bzero(dst, pad_length);
- memmove(dst + pad_length, src, src_offset);
- /*
- In some cases left zero-padding can create an incorrect character.
- For example:
- INSERT INTO t1 (utf32_column) VALUES (0x110000);
- We'll pad the value to 0x00110000, which is a wrong UTF32 sequence!
- The valid characters range is limited to 0x00000000..0x0010FFFF.
-
- Make sure we didn't pad to an incorrect character.
- */
- if (cs->cset->well_formed_len(cs,
- dst, dst + cs->mbminlen, 1,
- &well_formed_error) != cs->mbminlen)
- {
- status->m_source_end_pos= status->m_well_formed_error_pos= src;
- return 0;
- }
- nchars--;
- src+= src_offset;
- src_length-= src_offset;
- dst+= cs->mbminlen;
- dst_length-= cs->mbminlen;
- return
- cs->mbminlen /* The left-padded character */ +
- my_copy_abort_mb(cs, dst, dst_length, src, src_length, nchars, status);
+ size_t length2, src_offset= src_length % cs->mbminlen;
+ my_char_copy_status_t padstatus;
+
+ if (!src_offset)
+ return my_copy_fix_mb(cs, dst, dst_length,
+ src, src_length, nchars, status);
+ if ((padstatus= my_copy_incomplete_char(cs, dst, dst_length,
+ src, src_length, nchars, TRUE)) ==
+ MY_CHAR_COPY_ERROR)
+ {
+ status->m_source_end_pos= status->m_well_formed_error_pos= src;
+ return 0;
}
- return my_copy_abort_mb(cs, dst, dst_length, src, src_length, nchars, status);
+ length2= my_copy_fix_mb(cs, dst + cs->mbminlen, dst_length - cs->mbminlen,
+ src + src_offset, src_length - src_offset,
+ nchars - 1, status);
+ if (padstatus == MY_CHAR_COPY_FIXED)
+ status->m_well_formed_error_pos= src;
+ return cs->mbminlen /* The left-padded character */ + length2;
}
@@ -1475,6 +1520,24 @@ my_ismbchar_utf16(CHARSET_INFO *cs, const char *b, const char *e)
}
+static int
+my_charlen_utf16(CHARSET_INFO *cs, const uchar *str, const uchar *end)
+{
+ my_wc_t wc;
+ return cs->cset->mb_wc(cs, &wc, str, end);
+}
+
+
+#define MY_FUNCTION_NAME(x) my_ ## x ## _utf16
+#define CHARLEN(cs,str,end) my_charlen_utf16(cs,str,end)
+#define DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
+#include "ctype-mb.ic"
+#undef MY_FUNCTION_NAME
+#undef CHARLEN
+#undef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
+/* Defines my_well_formed_char_length_utf16 */
+
+
static uint
my_mbcharlen_utf16(CHARSET_INFO *cs __attribute__((unused)),
uint c __attribute__((unused)))
@@ -1742,7 +1805,9 @@ MY_CHARSET_HANDLER my_charset_utf16_handler=
my_strtoll10_mb2,
my_strntoull10rnd_mb2_or_mb4,
my_scan_mb2,
- my_copy_abort_mb2_or_mb4,
+ my_charlen_utf16,
+ my_well_formed_char_length_utf16,
+ my_copy_fix_mb2_or_mb4,
};
@@ -1912,7 +1977,9 @@ static MY_CHARSET_HANDLER my_charset_utf16le_handler=
my_strtoll10_mb2,
my_strntoull10rnd_mb2_or_mb4,
my_scan_mb2,
- my_copy_abort_mb2_or_mb4,
+ my_charlen_utf16,
+ my_well_formed_char_length_utf16,
+ my_copy_fix_mb2_or_mb4,
};
@@ -1987,6 +2054,13 @@ struct charset_info_st my_charset_utf16le_bin=
#ifdef HAVE_CHARSET_utf32
+/*
+ Check if b0 and b1 start a valid UTF32 four-byte sequence.
+ Don't accept characters greater than U+10FFFF.
+*/
+#define IS_UTF32_MBHEAD4(b0,b1) (!(b0) && ((uchar) (b1) <= 0x10))
+
+
static int
my_utf32_uni(CHARSET_INFO *cs __attribute__((unused)),
my_wc_t *pwc, const uchar *s, const uchar *e)
@@ -1994,7 +2068,7 @@ my_utf32_uni(CHARSET_INFO *cs __attribute__((unused)),
if (s + 4 > e)
return MY_CS_TOOSMALL4;
*pwc= (s[0] << 24) + (s[1] << 16) + (s[2] << 8) + (s[3]);
- return 4;
+ return *pwc > 0x10FFFF ? MY_CS_ILSEQ : 4;
}
@@ -2004,7 +2078,10 @@ my_uni_utf32(CHARSET_INFO *cs __attribute__((unused)),
{
if (s + 4 > e)
return MY_CS_TOOSMALL4;
-
+
+ if (wc > 0x10FFFF)
+ return MY_CS_ILUNI;
+
s[0]= (uchar) (wc >> 24);
s[1]= (uchar) (wc >> 16) & 0xFF;
s[2]= (uchar) (wc >> 8) & 0xFF;
@@ -2263,10 +2340,29 @@ my_ismbchar_utf32(CHARSET_INFO *cs __attribute__((unused)),
const char *b,
const char *e)
{
- return b + 4 > e ? 0 : 4;
+ return b + 4 > e || !IS_UTF32_MBHEAD4(b[0], b[1]) ? 0 : 4;
}
+static int
+my_charlen_utf32(CHARSET_INFO *cs __attribute__((unused)),
+ const uchar *b, const uchar *e)
+{
+ return b + 4 > e ? MY_CS_TOOSMALL4 :
+ IS_UTF32_MBHEAD4(b[0], b[1]) ? 4 : MY_CS_ILSEQ;
+}
+
+
+#define MY_FUNCTION_NAME(x) my_ ## x ## _utf32
+#define CHARLEN(cs,str,end) my_charlen_utf32(cs,str,end)
+#define DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
+#include "ctype-mb.ic"
+#undef MY_FUNCTION_NAME
+#undef CHARLEN
+#undef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
+/* Defines my_well_formed_char_length_utf32 */
+
+
static uint
my_mbcharlen_utf32(CHARSET_INFO *cs __attribute__((unused)) ,
uint c __attribute__((unused)))
@@ -2579,8 +2675,7 @@ my_well_formed_len_utf32(CHARSET_INFO *cs __attribute__((unused)),
}
for (; b < e; b+= 4)
{
- /* Don't accept characters greater than U+10FFFF */
- if (b[0] || (uchar) b[1] > 0x10)
+ if (!IS_UTF32_MBHEAD4(b[0], b[1]))
{
*error= 1;
return b - b0;
@@ -2827,7 +2922,9 @@ MY_CHARSET_HANDLER my_charset_utf32_handler=
my_strtoll10_utf32,
my_strntoull10rnd_mb2_or_mb4,
my_scan_utf32,
- my_copy_abort_mb2_or_mb4,
+ my_charlen_utf32,
+ my_well_formed_char_length_utf32,
+ my_copy_fix_mb2_or_mb4,
};
@@ -2961,6 +3058,14 @@ static const uchar to_upper_ucs2[] = {
};
+static int
+my_charlen_ucs2(CHARSET_INFO *cs __attribute__((unused)),
+ const uchar *s, const uchar *e)
+{
+ return s + 2 > e ? MY_CS_TOOSMALLN(2) : 2;
+}
+
+
static int my_ucs2_uni(CHARSET_INFO *cs __attribute__((unused)),
my_wc_t * pwc, const uchar *s, const uchar *e)
{
@@ -3264,6 +3369,31 @@ size_t my_well_formed_len_ucs2(CHARSET_INFO *cs __attribute__((unused)),
}
+static size_t
+my_well_formed_char_length_ucs2(CHARSET_INFO *cs __attribute__((unused)),
+ const char *b, const char *e,
+ size_t nchars, MY_STRCOPY_STATUS *status)
+{
+ size_t length= e - b;
+ if (nchars * 2 <= length)
+ {
+ status->m_well_formed_error_pos= NULL;
+ status->m_source_end_pos= b + (nchars * 2);
+ return nchars;
+ }
+ if (length % 2)
+ {
+ status->m_well_formed_error_pos= status->m_source_end_pos= e - 1;
+ }
+ else
+ {
+ status->m_well_formed_error_pos= NULL;
+ status->m_source_end_pos= e;
+ }
+ return length / 2;
+}
+
+
static
int my_wildcmp_ucs2_ci(CHARSET_INFO *cs,
const char *str,const char *str_end,
@@ -3446,7 +3576,9 @@ MY_CHARSET_HANDLER my_charset_ucs2_handler=
my_strtoll10_mb2,
my_strntoull10rnd_mb2_or_mb4,
my_scan_mb2,
- my_copy_abort_mb2_or_mb4,
+ my_charlen_ucs2,
+ my_well_formed_char_length_ucs2,
+ my_copy_fix_mb2_or_mb4,
};
diff --git a/strings/ctype-ujis.c b/strings/ctype-ujis.c
index 99f5be3fa38..cb000a2afa0 100644
--- a/strings/ctype-ujis.c
+++ b/strings/ctype-ujis.c
@@ -197,7 +197,7 @@ static const uchar sort_order_ujis[]=
#define IS_MB2_KATA(x,y) (isujis_ss2(x) && iskata(y))
#define IS_MB2_CHAR(x, y) (IS_MB2_KATA(x,y) || IS_MB2_JIS(x,y))
#define IS_MB3_CHAR(x, y, z) (isujis_ss3(x) && IS_MB2_JIS(y,z))
-#define WELL_FORMED_LEN
+#define DEFINE_ASIAN_ROUTINES
#include "ctype-mb.ic"
@@ -67255,7 +67255,9 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_strtoll10_8bit,
my_strntoull10rnd_8bit,
my_scan_8bit,
- my_copy_abort_mb,
+ my_charlen_ujis,
+ my_well_formed_char_length_ujis,
+ my_copy_fix_mb,
};
diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c
index 1116228f706..56824aac59e 100644
--- a/strings/ctype-utf8.c
+++ b/strings/ctype-utf8.c
@@ -5446,8 +5446,8 @@ int my_wildcmp_utf8(CHARSET_INFO *cs,
static
-int my_valid_mbcharlen_utf8(CHARSET_INFO *cs __attribute__((unused)),
- const uchar *s, const uchar *e)
+int my_charlen_utf8(CHARSET_INFO *cs __attribute__((unused)),
+ const uchar *s, const uchar *e)
{
uchar c;
@@ -5515,7 +5515,7 @@ my_well_formed_len_utf8(CHARSET_INFO *cs, const char *b, const char *e,
{
int mb_len;
- if ((mb_len= my_valid_mbcharlen_utf8(cs, (uchar*) b, (uchar*) e)) <= 0)
+ if ((mb_len= my_charlen_utf8(cs, (uchar*) b, (uchar*) e)) <= 0)
{
*error= b < e ? 1 : 0;
break;
@@ -5526,9 +5526,20 @@ my_well_formed_len_utf8(CHARSET_INFO *cs, const char *b, const char *e,
return (size_t) (b - b_start);
}
+
+#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8
+#define CHARLEN(cs,str,end) my_charlen_utf8(cs,str,end)
+#define DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
+#include "ctype-mb.ic"
+#undef MY_FUNCTION_NAME
+#undef CHARLEN
+#undef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
+/* my_well_formed_char_length_utf8 */
+
+
static uint my_ismbchar_utf8(CHARSET_INFO *cs,const char *b, const char *e)
{
- int res= my_valid_mbcharlen_utf8(cs, (const uchar*)b, (const uchar*)e);
+ int res= my_charlen_utf8(cs, (const uchar*) b, (const uchar*) e);
return (res>1) ? res : 0;
}
@@ -5615,7 +5626,9 @@ MY_CHARSET_HANDLER my_charset_utf8_handler=
my_strtoll10_8bit,
my_strntoull10rnd_8bit,
my_scan_8bit,
- my_copy_abort_mb,
+ my_charlen_utf8,
+ my_well_formed_char_length_utf8,
+ my_copy_fix_mb,
};
@@ -7125,6 +7138,24 @@ my_wc_mb_filename(CHARSET_INFO *cs __attribute__((unused)),
}
+static int
+my_charlen_filename(CHARSET_INFO *cs, const uchar *str, const uchar *end)
+{
+ my_wc_t wc;
+ return cs->cset->mb_wc(cs, &wc, str, end);
+}
+
+
+#define MY_FUNCTION_NAME(x) my_ ## x ## _filename
+#define CHARLEN(cs,str,end) my_charlen_filename(cs,str,end)
+#define DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
+#include "ctype-mb.ic"
+#undef MY_FUNCTION_NAME
+#undef CHARLEN
+#undef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
+/* my_well_formed_char_length_filename */
+
+
static MY_COLLATION_HANDLER my_collation_filename_handler =
{
NULL, /* init */
@@ -7169,7 +7200,9 @@ static MY_CHARSET_HANDLER my_charset_filename_handler=
my_strtoll10_8bit,
my_strntoull10rnd_8bit,
my_scan_8bit,
- my_copy_abort_mb,
+ my_charlen_filename,
+ my_well_formed_char_length_filename,
+ my_copy_fix_mb,
};
@@ -7954,8 +7987,8 @@ my_wildcmp_utf8mb4(CHARSET_INFO *cs,
static int
-my_valid_mbcharlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
- const uchar *s, const uchar *e)
+my_charlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
+ const uchar *s, const uchar *e)
{
uchar c;
@@ -8015,7 +8048,7 @@ size_t my_well_formed_len_utf8mb4(CHARSET_INFO *cs,
{
int mb_len;
- if ((mb_len= my_valid_mbcharlen_utf8mb4(cs, (uchar*) b, (uchar*) e)) <= 0)
+ if ((mb_len= my_charlen_utf8mb4(cs, (uchar*) b, (uchar*) e)) <= 0)
{
*error= b < e ? 1 : 0;
break;
@@ -8027,10 +8060,19 @@ size_t my_well_formed_len_utf8mb4(CHARSET_INFO *cs,
}
+#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8mb4
+#define CHARLEN(cs,str,end) my_charlen_utf8mb4(cs,str,end)
+#define DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
+#include "ctype-mb.ic"
+#undef MY_FUNCTION_NAME
+#undef CHARLEN
+#undef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
+/* my_well_formed_char_length_utf8mb4 */
+
static uint
my_ismbchar_utf8mb4(CHARSET_INFO *cs, const char *b, const char *e)
{
- int res= my_valid_mbcharlen_utf8mb4(cs, (const uchar*)b, (const uchar*)e);
+ int res= my_charlen_utf8mb4(cs, (const uchar*) b, (const uchar*) e);
return (res > 1) ? res : 0;
}
@@ -8113,7 +8155,9 @@ MY_CHARSET_HANDLER my_charset_utf8mb4_handler=
my_strtoll10_8bit,
my_strntoull10rnd_8bit,
my_scan_8bit,
- my_copy_abort_mb,
+ my_charlen_utf8mb4,
+ my_well_formed_char_length_utf8mb4,
+ my_copy_fix_mb,
};