diff options
author | Alexander Barkov <bar@mariadb.org> | 2015-03-13 16:51:36 +0400 |
---|---|---|
committer | Alexander Barkov <bar@mariadb.org> | 2015-03-13 16:51:36 +0400 |
commit | 197afb413fcc9f06b5e5e6ef41ce980d108b354f (patch) | |
tree | 7052fbaa1bf1af1c5c849e8fda4a3a790af09b25 /strings | |
parent | 702fba1511c90ea9c72b6c00122e0f31a05237b4 (diff) | |
download | mariadb-git-197afb413fcc9f06b5e5e6ef41ce980d108b354f.tar.gz |
MDEV-6566 Different INSERT behaviour on bad bytes with and without character set conversion
Diffstat (limited to 'strings')
-rw-r--r-- | strings/ctype-big5.c | 9 | ||||
-rw-r--r-- | strings/ctype-bin.c | 2 | ||||
-rw-r--r-- | strings/ctype-cp932.c | 6 | ||||
-rw-r--r-- | strings/ctype-euc_kr.c | 9 | ||||
-rw-r--r-- | strings/ctype-eucjpms.c | 6 | ||||
-rw-r--r-- | strings/ctype-gb2312.c | 11 | ||||
-rw-r--r-- | strings/ctype-gbk.c | 9 | ||||
-rw-r--r-- | strings/ctype-latin1.c | 2 | ||||
-rw-r--r-- | strings/ctype-mb.c | 98 | ||||
-rw-r--r-- | strings/ctype-mb.ic | 168 | ||||
-rw-r--r-- | strings/ctype-simple.c | 22 | ||||
-rw-r--r-- | strings/ctype-sjis.c | 6 | ||||
-rw-r--r-- | strings/ctype-tis620.c | 2 | ||||
-rw-r--r-- | strings/ctype-ucs2.c | 250 | ||||
-rw-r--r-- | strings/ctype-ujis.c | 6 | ||||
-rw-r--r-- | strings/ctype-utf8.c | 66 |
16 files changed, 569 insertions, 103 deletions
diff --git a/strings/ctype-big5.c b/strings/ctype-big5.c index d631bd0a34e..eda81c0c4d3 100644 --- a/strings/ctype-big5.c +++ b/strings/ctype-big5.c @@ -50,7 +50,7 @@ #define MY_FUNCTION_NAME(x) my_ ## x ## _big5 #define IS_MB2_CHAR(x,y) (isbig5head(x) && isbig5tail(y)) -#define WELL_FORMED_LEN +#define DEFINE_ASIAN_ROUTINES #include "ctype-mb.ic" @@ -6843,6 +6843,9 @@ my_mb_wc_big5(CHARSET_INFO *cs __attribute__((unused)), if (s+2>e) return MY_CS_TOOSMALL2; + if (!IS_MB2_CHAR(hi, s[1])) + return MY_CS_ILSEQ; + if (!(pwc[0]=func_big5_uni_onechar((hi<<8)+s[1]))) return -2; @@ -6894,7 +6897,9 @@ static MY_CHARSET_HANDLER my_charset_big5_handler= my_strtoll10_8bit, my_strntoull10rnd_8bit, my_scan_8bit, - my_copy_abort_mb, + my_charlen_big5, + my_well_formed_char_length_big5, + my_copy_fix_mb, }; struct charset_info_st my_charset_big5_chinese_ci= diff --git a/strings/ctype-bin.c b/strings/ctype-bin.c index 6b53b34159a..95f31038ee6 100644 --- a/strings/ctype-bin.c +++ b/strings/ctype-bin.c @@ -549,6 +549,8 @@ static MY_CHARSET_HANDLER my_charset_handler= my_strtoll10_8bit, my_strntoull10rnd_8bit, my_scan_8bit, + my_charlen_8bit, + my_well_formed_char_length_8bit, my_copy_8bit, }; diff --git a/strings/ctype-cp932.c b/strings/ctype-cp932.c index 13129a6a874..2e26a98bf05 100644 --- a/strings/ctype-cp932.c +++ b/strings/ctype-cp932.c @@ -186,7 +186,7 @@ static const uchar sort_order_cp932[]= #define MY_FUNCTION_NAME(x) my_ ## x ## _cp932 #define IS_8BIT_CHAR(x) iscp932kata(x) #define IS_MB2_CHAR(x,y) (iscp932head(x) && iscp932tail(y)) -#define WELL_FORMED_LEN +#define DEFINE_ASIAN_ROUTINES #include "ctype-mb.ic" @@ -34765,7 +34765,9 @@ static MY_CHARSET_HANDLER my_charset_handler= my_strtoll10_8bit, my_strntoull10rnd_8bit, my_scan_8bit, - my_copy_abort_mb, + my_charlen_cp932, + my_well_formed_char_length_cp932, + my_copy_fix_mb, }; diff --git a/strings/ctype-euc_kr.c b/strings/ctype-euc_kr.c index eab9539ad45..a2c95bf77c8 100644 --- a/strings/ctype-euc_kr.c +++ 
b/strings/ctype-euc_kr.c @@ -204,7 +204,7 @@ static const uchar sort_order_euc_kr[]= #define MY_FUNCTION_NAME(x) my_ ## x ## _euckr #define IS_MB2_CHAR(x,y) (iseuc_kr_head(x) && iseuc_kr_tail(y)) -#define WELL_FORMED_LEN +#define DEFINE_ASIAN_ROUTINES #include "ctype-mb.ic" @@ -9928,6 +9928,9 @@ my_mb_wc_euc_kr(CHARSET_INFO *cs __attribute__((unused)), if (s+2>e) return MY_CS_TOOSMALL2; + if (!IS_MB2_CHAR(hi, s[1])) + return MY_CS_ILSEQ; + if (!(pwc[0]=func_ksc5601_uni_onechar((hi<<8)+s[1]))) return -2; @@ -9979,7 +9982,9 @@ static MY_CHARSET_HANDLER my_charset_handler= my_strtoll10_8bit, my_strntoull10rnd_8bit, my_scan_8bit, - my_copy_abort_mb, + my_charlen_euckr, + my_well_formed_char_length_euckr, + my_copy_fix_mb, }; diff --git a/strings/ctype-eucjpms.c b/strings/ctype-eucjpms.c index 52873c2f87e..827feda927b 100644 --- a/strings/ctype-eucjpms.c +++ b/strings/ctype-eucjpms.c @@ -198,7 +198,7 @@ static const uchar sort_order_eucjpms[]= #define IS_MB2_KATA(x,y) (iseucjpms_ss2(x) && iskata(y)) #define IS_MB2_CHAR(x,y) (IS_MB2_KATA(x,y) || IS_MB2_JIS(x,y)) #define IS_MB3_CHAR(x,y,z) (iseucjpms_ss3(x) && IS_MB2_JIS(y,z)) -#define WELL_FORMED_LEN +#define DEFINE_ASIAN_ROUTINES #include "ctype-mb.ic" @@ -67511,7 +67511,9 @@ static MY_CHARSET_HANDLER my_charset_handler= my_strtoll10_8bit, my_strntoull10rnd_8bit, my_scan_8bit, - my_copy_abort_mb, + my_charlen_eucjpms, + my_well_formed_char_length_eucjpms, + my_copy_fix_mb, }; diff --git a/strings/ctype-gb2312.c b/strings/ctype-gb2312.c index a4268b8fd68..129e8edb966 100644 --- a/strings/ctype-gb2312.c +++ b/strings/ctype-gb2312.c @@ -167,7 +167,7 @@ static const uchar sort_order_gb2312[]= #define MY_FUNCTION_NAME(x) my_ ## x ## _gb2312 #define IS_MB2_CHAR(x,y) (isgb2312head(x) && isgb2312tail(y)) -#define WELL_FORMED_LEN +#define DEFINE_ASIAN_ROUTINES #include "ctype-mb.ic" @@ -6330,7 +6330,10 @@ my_mb_wc_gb2312(CHARSET_INFO *cs __attribute__((unused)), if (s+2>e) return MY_CS_TOOSMALL2; - + + if (!IS_MB2_CHAR(hi, 
s[1])) + return MY_CS_ILSEQ; + if (!(pwc[0]=func_gb2312_uni_onechar(((hi<<8)+s[1])&0x7F7F))) return -2; @@ -6382,7 +6385,9 @@ static MY_CHARSET_HANDLER my_charset_handler= my_strtoll10_8bit, my_strntoull10rnd_8bit, my_scan_8bit, - my_copy_abort_mb, + my_charlen_gb2312, + my_well_formed_char_length_gb2312, + my_copy_fix_mb, }; diff --git a/strings/ctype-gbk.c b/strings/ctype-gbk.c index 392fdb487b6..b3bd1efb6c4 100644 --- a/strings/ctype-gbk.c +++ b/strings/ctype-gbk.c @@ -45,7 +45,7 @@ #define MY_FUNCTION_NAME(x) my_ ## x ## _gbk #define IS_MB2_CHAR(x,y) (isgbkhead(x) && isgbktail(y)) -#define WELL_FORMED_LEN +#define DEFINE_ASIAN_ROUTINES #include "ctype-mb.ic" @@ -10724,6 +10724,9 @@ my_mb_wc_gbk(CHARSET_INFO *cs __attribute__((unused)), if (s+2>e) return MY_CS_TOOSMALL2; + if (!IS_MB2_CHAR(hi, s[1])) + return MY_CS_ILSEQ; + if (!(pwc[0]=func_gbk_uni_onechar( (hi<<8) + s[1]))) return -2; @@ -10776,7 +10779,9 @@ static MY_CHARSET_HANDLER my_charset_handler= my_strtoll10_8bit, my_strntoull10rnd_8bit, my_scan_8bit, - my_copy_abort_mb, + my_charlen_gbk, + my_well_formed_char_length_gbk, + my_copy_fix_mb, }; diff --git a/strings/ctype-latin1.c b/strings/ctype-latin1.c index 099f03460ce..bc51911dceb 100644 --- a/strings/ctype-latin1.c +++ b/strings/ctype-latin1.c @@ -422,6 +422,8 @@ static MY_CHARSET_HANDLER my_charset_handler= my_strtoll10_8bit, my_strntoull10rnd_8bit, my_scan_8bit, + my_charlen_8bit, + my_well_formed_char_length_8bit, my_copy_8bit, }; diff --git a/strings/ctype-mb.c b/strings/ctype-mb.c index fc41563324a..5947c3d4f4a 100644 --- a/strings/ctype-mb.c +++ b/strings/ctype-mb.c @@ -424,25 +424,95 @@ size_t my_well_formed_len_mb(CHARSET_INFO *cs, const char *b, const char *e, /* - Copy a multi-byte string. Abort if a bad byte sequence was found. - Note more than "nchars" characters are copied. + Append a badly formed piece of string. + Bad bytes are fixed to '?'. 
+ + @param to The destination string + @param to_end The end of the destination string + @param from The source string + @param from_end The end of the source string + @param nchars Write not more than "nchars" characters. + @param status Copying status, must be previously initialized, + e.g. using well_formed_char_length() on the original + full source string. */ +static size_t +my_append_fix_badly_formed_tail(CHARSET_INFO *cs, + char *to, char *to_end, + const char *from, const char *from_end, + size_t nchars, + MY_STRCOPY_STATUS *status) +{ + char *to0= to; + + for ( ; nchars; nchars--) + { + int chlen; + if ((chlen= cs->cset->charlen(cs, (const uchar*) from, + (const uchar *) from_end)) > 0) + { + /* Found a valid character */ /* chlen == 1..MBMAXLEN */ + DBUG_ASSERT(chlen <= (int) cs->mbmaxlen); + if (to + chlen > to_end) + goto end; /* Does not fit to "to" */ + memcpy(to, from, (size_t) chlen); + from+= chlen; + to+= chlen; + continue; + } + if (chlen == MY_CS_ILSEQ) /* chlen == 0 */ + { + DBUG_ASSERT(from < from_end); /* Shouldn't get MY_CS_ILSEQ if empty */ + goto bad; + } + /* Got an incomplete character */ /* chlen == MY_CS_TOOSMALLXXX */ + DBUG_ASSERT(chlen >= MY_CS_TOOSMALL6); + DBUG_ASSERT(chlen <= MY_CS_TOOSMALL); + if (from >= from_end) + break; /* End of the source string */ +bad: + /* Bad byte sequence, or incomplete character found */ + if (!status->m_well_formed_error_pos) + status->m_well_formed_error_pos= from; + + if ((chlen= cs->cset->wc_mb(cs, '?', (uchar*) to, (uchar *) to_end)) <= 0) + break; /* Question mark does not fit into the destination */ + to+= chlen; + from++; + } +end: + status->m_source_end_pos= from; + return to - to0; +} + + size_t -my_copy_abort_mb(CHARSET_INFO *cs, - char *dst, size_t dst_length, - const char *src, size_t src_length, - size_t nchars, MY_STRCOPY_STATUS *status) +my_copy_fix_mb(CHARSET_INFO *cs, + char *dst, size_t dst_length, + const char *src, size_t src_length, + size_t nchars, MY_STRCOPY_STATUS *status) { 
- int well_formed_error; - size_t res; + size_t well_formed_nchars; + size_t well_formed_length; + size_t fixed_length; set_if_smaller(src_length, dst_length); - res= cs->cset->well_formed_len(cs, src, src + src_length, - nchars, &well_formed_error); - memmove(dst, src, res); - status->m_source_end_pos= src + res; - status->m_well_formed_error_pos= well_formed_error ? src + res : NULL; - return res; + well_formed_nchars= cs->cset->well_formed_char_length(cs, + src, src + src_length, + nchars, status); + DBUG_ASSERT(well_formed_nchars <= nchars); + memmove(dst, src, (well_formed_length= status->m_source_end_pos - src)); + if (!status->m_well_formed_error_pos) + return well_formed_length; + + fixed_length= my_append_fix_badly_formed_tail(cs, + dst + well_formed_length, + dst + dst_length, + src + well_formed_length, + src + src_length, + nchars - well_formed_nchars, + status); + return well_formed_length + fixed_length; } diff --git a/strings/ctype-mb.ic b/strings/ctype-mb.ic index 70cc89c9af0..55094535d5e 100644 --- a/strings/ctype-mb.ic +++ b/strings/ctype-mb.ic @@ -29,7 +29,70 @@ #endif -#ifdef WELL_FORMED_LEN +#ifdef DEFINE_ASIAN_ROUTINES +#define DEFINE_WELL_FORMED_LEN +#define DEFINE_WELL_FORMED_CHAR_LENGTH +#define DEFINE_CHARLEN +#endif + + +#ifdef DEFINE_CHARLEN +/** + Returns length of the left-most character of a string. 
+ @param cs - charset with mbminlen==1 and mbmaxlen<=4 + @param b - the beginning of the string + @param e - the end of the string + + @return MY_CS_ILSEQ if a bad byte sequence was found + @return MY_CS_TOOSMALL(N) if the string ended unexpectedly + @return >0 if a valid character was found +*/ +static int +MY_FUNCTION_NAME(charlen)(CHARSET_INFO *cs __attribute__((unused)), + const uchar *b, const uchar *e) +{ + DBUG_ASSERT(cs->mbminlen == 1); + DBUG_ASSERT(cs->mbmaxlen <= 4); + + if (b >= e) + return MY_CS_TOOSMALL; + if ((uchar) b[0] < 128) + return 1; /* Single byte ASCII character */ + +#ifdef IS_8BIT_CHAR + if (IS_8BIT_CHAR(b[0])) + { + /* Single byte non-ASCII character, e.g. half width kana in sjis */ + return 1; + } +#endif + + if (b + 2 > e) + return MY_CS_TOOSMALLN(2); + if (IS_MB2_CHAR(b[0], b[1])) + return 2; /* Double byte character */ + +#ifdef IS_MB3_CHAR + if (b + 3 > e) + return MY_CS_TOOSMALLN(3); + if (IS_MB3_CHAR(b[0], b[1], b[2])) + return 3; /* Three-byte character */ +#endif + +#ifdef IS_MB4_CHAR + if (b + 4 > e) + return MY_CS_TOOSMALLN(4); + if (IS_MB4_CHAR(b[0], b[1], b[2], b[3])) + return 4; /* Four-byte character */ +#endif + + /* Wrong byte sequence */ + return MY_CS_ILSEQ; +} +#endif /* DEFINE_CHARLEN */ + + +#ifdef DEFINE_WELL_FORMED_LEN /** Returns well formed length of a character string with variable character length for character sets with: @@ -91,4 +154,105 @@ MY_FUNCTION_NAME(well_formed_len)(CHARSET_INFO *cs __attribute__((unused)), return b - b0; } -#endif /* WELL_FORMED_LEN */ +#endif /* DEFINE_WELL_FORMED_LEN */ + + + +#ifdef DEFINE_WELL_FORMED_CHAR_LENGTH +/** + Returns well formed length of a string + measured in characters (rather than in bytes). + Version for character sets that define IS_MB?_CHAR(), e.g. big5. 
+*/ +static size_t +MY_FUNCTION_NAME(well_formed_char_length)(CHARSET_INFO *cs __attribute__((unused)), + const char *b, const char *e, + size_t nchars, + MY_STRCOPY_STATUS *status) +{ + size_t nchars0= nchars; + for ( ; b < e && nchars ; nchars--) + { + if ((uchar) b[0] < 128) + { + b++; /* Single byte ASCII character */ + continue; + } + + if (b + 2 <= e && IS_MB2_CHAR(b[0], b[1])) + { + b+= 2; /* Double byte character */ + continue; + } + +#ifdef IS_MB3_CHAR + if (b + 3 <= e && IS_MB3_CHAR(b[0], b[1], b[2])) + { + b+= 3; /* Three-byte character */ + continue; + } +#endif + +#ifdef IS_MB4_CHAR + if (b + 4 <= e && IS_MB4_CHAR(b[0], b[1], b[2], b[3])) + { + b+= 4; /* Four-byte character */ + continue; + } +#endif + +#ifdef IS_8BIT_CHAR + if (IS_8BIT_CHAR(b[0])) + { + b++; /* Single byte non-ASCII character, e.g. half width kana in sjis */ + continue; + } +#endif + + /* Wrong byte sequence */ + status->m_source_end_pos= status->m_well_formed_error_pos= b; + return nchars0 - nchars; + } + status->m_source_end_pos= b; + status->m_well_formed_error_pos= NULL; + return nchars0 - nchars; +} +#endif /* DEFINE_WELL_FORMED_CHAR_LENGTH */ + + +#ifdef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN +#ifndef CHARLEN +#error CHARLEN is not defined +#endif +/** + Returns well formed length of a string + measured in characters (rather than in bytes). + Version for character sets that define CHARLEN(), e.g. utf8. 
+ CHARLEN(cs,b,e) must use the same return code convention that mb_wc() does: + - a positive number in the range [1-mbmaxlen] if a valid + single-byte or multi-byte character was found + - MY_CS_ILSEQ (0) on a bad byte sequence + - MY_CS_TOOSMALLxx if the incoming sequence is incomplete +*/ +static size_t +MY_FUNCTION_NAME(well_formed_char_length)(CHARSET_INFO *cs __attribute__((unused)), + const char *b, const char *e, + size_t nchars, + MY_STRCOPY_STATUS *status) +{ + size_t nchars0= nchars; + int chlen; + for ( ; nchars ; nchars--, b+= chlen) + { + if ((chlen= CHARLEN(cs, (uchar*) b, (uchar*) e)) <= 0) + { + status->m_well_formed_error_pos= b < e ? b : NULL; + status->m_source_end_pos= b; + return nchars0 - nchars; + } + } + status->m_well_formed_error_pos= NULL; + status->m_source_end_pos= b; + return nchars0 - nchars; +} +#endif /* DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN */ diff --git a/strings/ctype-simple.c b/strings/ctype-simple.c index b010c528979..d7a1b3f33b4 100644 --- a/strings/ctype-simple.c +++ b/strings/ctype-simple.c @@ -248,6 +248,13 @@ int my_strcasecmp_8bit(CHARSET_INFO * cs,const char *s, const char *t) } +int my_charlen_8bit(CHARSET_INFO *cs __attribute__((unused)), + const uchar *str, const uchar *end) +{ + return str >= end ? MY_CS_TOOSMALL : 1; +} + + int my_mb_wc_8bit(CHARSET_INFO *cs,my_wc_t *wc, const uchar *str, const uchar *end __attribute__((unused))) @@ -1108,6 +1115,19 @@ size_t my_well_formed_len_8bit(CHARSET_INFO *cs __attribute__((unused)), } +size_t +my_well_formed_char_length_8bit(CHARSET_INFO *cs __attribute__((unused)), + const char *start, const char *end, + size_t nchars, MY_STRCOPY_STATUS *status) +{ + size_t nbytes= (size_t) (end - start); + size_t res= MY_MIN(nbytes, nchars); + status->m_well_formed_error_pos= NULL; + status->m_source_end_pos= start + res; + return res; +} + + /* Copy a 8-bit string. Not more than "nchars" character are copied. 
*/ @@ -1906,6 +1926,8 @@ MY_CHARSET_HANDLER my_charset_8bit_handler= my_strtoll10_8bit, my_strntoull10rnd_8bit, my_scan_8bit, + my_charlen_8bit, + my_well_formed_char_length_8bit, my_copy_8bit, }; diff --git a/strings/ctype-sjis.c b/strings/ctype-sjis.c index 432e2e5e823..bbf0026cf2b 100644 --- a/strings/ctype-sjis.c +++ b/strings/ctype-sjis.c @@ -187,7 +187,7 @@ static const uchar sort_order_sjis[]= #define MY_FUNCTION_NAME(x) my_ ## x ## _sjis #define IS_8BIT_CHAR(x) issjiskata(x) #define IS_MB2_CHAR(x,y) (issjishead(x) && issjistail(y)) -#define WELL_FORMED_LEN +#define DEFINE_ASIAN_ROUTINES #include "ctype-mb.ic" @@ -34144,7 +34144,9 @@ static MY_CHARSET_HANDLER my_charset_handler= my_strtoll10_8bit, my_strntoull10rnd_8bit, my_scan_8bit, - my_copy_abort_mb, + my_charlen_sjis, + my_well_formed_char_length_sjis, + my_copy_fix_mb, }; diff --git a/strings/ctype-tis620.c b/strings/ctype-tis620.c index 343fb812e20..6537b380ab3 100644 --- a/strings/ctype-tis620.c +++ b/strings/ctype-tis620.c @@ -886,6 +886,8 @@ static MY_CHARSET_HANDLER my_charset_handler= my_strtoll10_8bit, my_strntoull10rnd_8bit, my_scan_8bit, + my_charlen_8bit, + my_well_formed_char_length_8bit, my_copy_8bit, }; diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c index 8f234e9e3a8..d1441a4d3a5 100644 --- a/strings/ctype-ucs2.c +++ b/strings/ctype-ucs2.c @@ -92,62 +92,107 @@ my_strcasecmp_mb2_or_mb4(CHARSET_INFO *cs __attribute__((unused)), } +typedef enum +{ + MY_CHAR_COPY_OK= 0, /* The character was OK */ + MY_CHAR_COPY_ERROR= 1, /* The character was not OK, and could not fix */ + MY_CHAR_COPY_FIXED= 2 /* The character was not OK, was fixed to '?' */ +} my_char_copy_status_t; + + /* - Copy an UCS2/UTF16/UTF32 string. - Not more that "nchars" characters are copied. + Copies an incomplete character, left-padding it with 0x00 bytes. 
+ + @param cs Character set + @param dst The destination string + @param dst_length Space available in dst + @param src The source string + @param src_length Length of src + @param nchars Copy not more than nchars characters. + The "nchars" parameter of the caller. + Only 0 and non-0 are important here. + @param fix What to do if after zero-padding didn't get a valid + character: + - FALSE - exit with error. + - TRUE - try to put '?' instead. + + @return MY_CHAR_COPY_OK if after zero-padding got a valid character. + cs->mbmaxlen bytes were written to "dst". + @return MY_CHAR_COPY_FIXED if after zero-padding did not get a valid + character, but wrote '?' to the destination + string instead. + cs->mbminlen bytes were written to "dst". + @return MY_CHAR_COPY_ERROR If failed and nothing was written to "dst". + Possible reasons: + - dst_length was too short + - nchars was 0 + - the character after padding appeared not + to be valid, and could not fix it to '?'. +*/ +static my_char_copy_status_t +my_copy_incomplete_char(CHARSET_INFO *cs, + char *dst, size_t dst_length, + const char *src, size_t src_length, + size_t nchars, my_bool fix) +{ + size_t pad_length; + size_t src_offset= src_length % cs->mbminlen; + if (dst_length < cs->mbminlen || !nchars) + return MY_CHAR_COPY_ERROR; + + pad_length= cs->mbminlen - src_offset; + bzero(dst, pad_length); + memmove(dst + pad_length, src, src_offset); + /* + In some cases left zero-padding can create an incorrect character. + For example: + INSERT INTO t1 (utf32_column) VALUES (0x110000); + We'll pad the value to 0x00110000, which is a wrong UTF32 sequence! + The valid characters range is limited to 0x00000000..0x0010FFFF. + + Make sure we didn't pad to an incorrect character. + */ + if (cs->cset->charlen(cs, (uchar *) dst, (uchar *) dst + cs->mbminlen) == + (int) cs->mbminlen) + return MY_CHAR_COPY_OK; - UCS2/UTF16/UTF32 may need to prepend zero some bytes, - e.g. 
when copying from a BINARY source: - INSERT INTO t1 (ucs2_column) VALUES (0x01); - 0x01 -> 0x0001 + if (fix && + cs->cset->wc_mb(cs, '?', (uchar *) dst, (uchar *) dst + cs->mbminlen) == + (int) cs->mbminlen) + return MY_CHAR_COPY_FIXED; + + return MY_CHAR_COPY_ERROR; +} + + +/* + Copy an UCS2/UTF16/UTF32 string, fix bad characters. */ static size_t -my_copy_abort_mb2_or_mb4(CHARSET_INFO *cs, - char *dst, size_t dst_length, - const char *src, size_t src_length, - size_t nchars, MY_STRCOPY_STATUS *status) +my_copy_fix_mb2_or_mb4(CHARSET_INFO *cs, + char *dst, size_t dst_length, + const char *src, size_t src_length, + size_t nchars, MY_STRCOPY_STATUS *status) { - size_t src_offset; - - if ((src_offset= (src_length % cs->mbminlen))) - { - int well_formed_error; - size_t pad_length; - if (dst_length < cs->mbminlen || !nchars) - { - status->m_source_end_pos= status->m_well_formed_error_pos= src; - return 0; - } - - pad_length= cs->mbminlen - src_offset; - bzero(dst, pad_length); - memmove(dst + pad_length, src, src_offset); - /* - In some cases left zero-padding can create an incorrect character. - For example: - INSERT INTO t1 (utf32_column) VALUES (0x110000); - We'll pad the value to 0x00110000, which is a wrong UTF32 sequence! - The valid characters range is limited to 0x00000000..0x0010FFFF. - - Make sure we didn't pad to an incorrect character. 
- */ - if (cs->cset->well_formed_len(cs, - dst, dst + cs->mbminlen, 1, - &well_formed_error) != cs->mbminlen) - { - status->m_source_end_pos= status->m_well_formed_error_pos= src; - return 0; - } - nchars--; - src+= src_offset; - src_length-= src_offset; - dst+= cs->mbminlen; - dst_length-= cs->mbminlen; - return - cs->mbminlen /* The left-padded character */ + - my_copy_abort_mb(cs, dst, dst_length, src, src_length, nchars, status); + size_t length2, src_offset= src_length % cs->mbminlen; + my_char_copy_status_t padstatus; + + if (!src_offset) + return my_copy_fix_mb(cs, dst, dst_length, + src, src_length, nchars, status); + if ((padstatus= my_copy_incomplete_char(cs, dst, dst_length, + src, src_length, nchars, TRUE)) == + MY_CHAR_COPY_ERROR) + { + status->m_source_end_pos= status->m_well_formed_error_pos= src; + return 0; } - return my_copy_abort_mb(cs, dst, dst_length, src, src_length, nchars, status); + length2= my_copy_fix_mb(cs, dst + cs->mbminlen, dst_length - cs->mbminlen, + src + src_offset, src_length - src_offset, + nchars - 1, status); + if (padstatus == MY_CHAR_COPY_FIXED) + status->m_well_formed_error_pos= src; + return cs->mbminlen /* The left-padded character */ + length2; } @@ -1475,6 +1520,24 @@ my_ismbchar_utf16(CHARSET_INFO *cs, const char *b, const char *e) } +static int +my_charlen_utf16(CHARSET_INFO *cs, const uchar *str, const uchar *end) +{ + my_wc_t wc; + return cs->cset->mb_wc(cs, &wc, str, end); +} + + +#define MY_FUNCTION_NAME(x) my_ ## x ## _utf16 +#define CHARLEN(cs,str,end) my_charlen_utf16(cs,str,end) +#define DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN +#include "ctype-mb.ic" +#undef MY_FUNCTION_NAME +#undef CHARLEN +#undef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN +/* Defines my_well_formed_char_length_utf16 */ + + static uint my_mbcharlen_utf16(CHARSET_INFO *cs __attribute__((unused)), uint c __attribute__((unused))) @@ -1742,7 +1805,9 @@ MY_CHARSET_HANDLER my_charset_utf16_handler= my_strtoll10_mb2, 
my_strntoull10rnd_mb2_or_mb4, my_scan_mb2, - my_copy_abort_mb2_or_mb4, + my_charlen_utf16, + my_well_formed_char_length_utf16, + my_copy_fix_mb2_or_mb4, }; @@ -1912,7 +1977,9 @@ static MY_CHARSET_HANDLER my_charset_utf16le_handler= my_strtoll10_mb2, my_strntoull10rnd_mb2_or_mb4, my_scan_mb2, - my_copy_abort_mb2_or_mb4, + my_charlen_utf16, + my_well_formed_char_length_utf16, + my_copy_fix_mb2_or_mb4, }; @@ -1987,6 +2054,13 @@ struct charset_info_st my_charset_utf16le_bin= #ifdef HAVE_CHARSET_utf32 +/* + Check if b0 and b1 start a valid UTF32 four-byte sequence. + Don't accept characters greater than U+10FFFF. +*/ +#define IS_UTF32_MBHEAD4(b0,b1) (!(b0) && ((uchar) (b1) <= 0x10)) + + static int my_utf32_uni(CHARSET_INFO *cs __attribute__((unused)), my_wc_t *pwc, const uchar *s, const uchar *e) @@ -1994,7 +2068,7 @@ my_utf32_uni(CHARSET_INFO *cs __attribute__((unused)), if (s + 4 > e) return MY_CS_TOOSMALL4; *pwc= (s[0] << 24) + (s[1] << 16) + (s[2] << 8) + (s[3]); - return 4; + return *pwc > 0x10FFFF ? MY_CS_ILSEQ : 4; } @@ -2004,7 +2078,10 @@ my_uni_utf32(CHARSET_INFO *cs __attribute__((unused)), { if (s + 4 > e) return MY_CS_TOOSMALL4; - + + if (wc > 0x10FFFF) + return MY_CS_ILUNI; + s[0]= (uchar) (wc >> 24); s[1]= (uchar) (wc >> 16) & 0xFF; s[2]= (uchar) (wc >> 8) & 0xFF; @@ -2263,10 +2340,29 @@ my_ismbchar_utf32(CHARSET_INFO *cs __attribute__((unused)), const char *b, const char *e) { - return b + 4 > e ? 0 : 4; + return b + 4 > e || !IS_UTF32_MBHEAD4(b[0], b[1]) ? 0 : 4; } +static int +my_charlen_utf32(CHARSET_INFO *cs __attribute__((unused)), + const uchar *b, const uchar *e) +{ + return b + 4 > e ? MY_CS_TOOSMALL4 : + IS_UTF32_MBHEAD4(b[0], b[1]) ? 
4 : MY_CS_ILSEQ; +} + + +#define MY_FUNCTION_NAME(x) my_ ## x ## _utf32 +#define CHARLEN(cs,str,end) my_charlen_utf32(cs,str,end) +#define DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN +#include "ctype-mb.ic" +#undef MY_FUNCTION_NAME +#undef CHARLEN +#undef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN +/* Defines my_well_formed_char_length_utf32 */ + + static uint my_mbcharlen_utf32(CHARSET_INFO *cs __attribute__((unused)) , uint c __attribute__((unused))) @@ -2579,8 +2675,7 @@ my_well_formed_len_utf32(CHARSET_INFO *cs __attribute__((unused)), } for (; b < e; b+= 4) { - /* Don't accept characters greater than U+10FFFF */ - if (b[0] || (uchar) b[1] > 0x10) + if (!IS_UTF32_MBHEAD4(b[0], b[1])) { *error= 1; return b - b0; @@ -2827,7 +2922,9 @@ MY_CHARSET_HANDLER my_charset_utf32_handler= my_strtoll10_utf32, my_strntoull10rnd_mb2_or_mb4, my_scan_utf32, - my_copy_abort_mb2_or_mb4, + my_charlen_utf32, + my_well_formed_char_length_utf32, + my_copy_fix_mb2_or_mb4, }; @@ -2961,6 +3058,14 @@ static const uchar to_upper_ucs2[] = { }; +static int +my_charlen_ucs2(CHARSET_INFO *cs __attribute__((unused)), + const uchar *s, const uchar *e) +{ + return s + 2 > e ? 
MY_CS_TOOSMALLN(2) : 2; +} + + static int my_ucs2_uni(CHARSET_INFO *cs __attribute__((unused)), my_wc_t * pwc, const uchar *s, const uchar *e) { @@ -3264,6 +3369,31 @@ size_t my_well_formed_len_ucs2(CHARSET_INFO *cs __attribute__((unused)), } +static size_t +my_well_formed_char_length_ucs2(CHARSET_INFO *cs __attribute__((unused)), + const char *b, const char *e, + size_t nchars, MY_STRCOPY_STATUS *status) +{ + size_t length= e - b; + if (nchars * 2 <= length) + { + status->m_well_formed_error_pos= NULL; + status->m_source_end_pos= b + (nchars * 2); + return nchars; + } + if (length % 2) + { + status->m_well_formed_error_pos= status->m_source_end_pos= e - 1; + } + else + { + status->m_well_formed_error_pos= NULL; + status->m_source_end_pos= e; + } + return length / 2; +} + + static int my_wildcmp_ucs2_ci(CHARSET_INFO *cs, const char *str,const char *str_end, @@ -3446,7 +3576,9 @@ MY_CHARSET_HANDLER my_charset_ucs2_handler= my_strtoll10_mb2, my_strntoull10rnd_mb2_or_mb4, my_scan_mb2, - my_copy_abort_mb2_or_mb4, + my_charlen_ucs2, + my_well_formed_char_length_ucs2, + my_copy_fix_mb2_or_mb4, }; diff --git a/strings/ctype-ujis.c b/strings/ctype-ujis.c index 99f5be3fa38..cb000a2afa0 100644 --- a/strings/ctype-ujis.c +++ b/strings/ctype-ujis.c @@ -197,7 +197,7 @@ static const uchar sort_order_ujis[]= #define IS_MB2_KATA(x,y) (isujis_ss2(x) && iskata(y)) #define IS_MB2_CHAR(x, y) (IS_MB2_KATA(x,y) || IS_MB2_JIS(x,y)) #define IS_MB3_CHAR(x, y, z) (isujis_ss3(x) && IS_MB2_JIS(y,z)) -#define WELL_FORMED_LEN +#define DEFINE_ASIAN_ROUTINES #include "ctype-mb.ic" @@ -67255,7 +67255,9 @@ static MY_CHARSET_HANDLER my_charset_handler= my_strtoll10_8bit, my_strntoull10rnd_8bit, my_scan_8bit, - my_copy_abort_mb, + my_charlen_ujis, + my_well_formed_char_length_ujis, + my_copy_fix_mb, }; diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c index 1116228f706..56824aac59e 100644 --- a/strings/ctype-utf8.c +++ b/strings/ctype-utf8.c @@ -5446,8 +5446,8 @@ int 
my_wildcmp_utf8(CHARSET_INFO *cs, static -int my_valid_mbcharlen_utf8(CHARSET_INFO *cs __attribute__((unused)), - const uchar *s, const uchar *e) +int my_charlen_utf8(CHARSET_INFO *cs __attribute__((unused)), + const uchar *s, const uchar *e) { uchar c; @@ -5515,7 +5515,7 @@ my_well_formed_len_utf8(CHARSET_INFO *cs, const char *b, const char *e, { int mb_len; - if ((mb_len= my_valid_mbcharlen_utf8(cs, (uchar*) b, (uchar*) e)) <= 0) + if ((mb_len= my_charlen_utf8(cs, (uchar*) b, (uchar*) e)) <= 0) { *error= b < e ? 1 : 0; break; @@ -5526,9 +5526,20 @@ my_well_formed_len_utf8(CHARSET_INFO *cs, const char *b, const char *e, return (size_t) (b - b_start); } + +#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8 +#define CHARLEN(cs,str,end) my_charlen_utf8(cs,str,end) +#define DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN +#include "ctype-mb.ic" +#undef MY_FUNCTION_NAME +#undef CHARLEN +#undef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN +/* my_well_formed_char_length_utf8 */ + + static uint my_ismbchar_utf8(CHARSET_INFO *cs,const char *b, const char *e) { - int res= my_valid_mbcharlen_utf8(cs, (const uchar*)b, (const uchar*)e); + int res= my_charlen_utf8(cs, (const uchar*) b, (const uchar*) e); return (res>1) ? 
res : 0; } @@ -5615,7 +5626,9 @@ MY_CHARSET_HANDLER my_charset_utf8_handler= my_strtoll10_8bit, my_strntoull10rnd_8bit, my_scan_8bit, - my_copy_abort_mb, + my_charlen_utf8, + my_well_formed_char_length_utf8, + my_copy_fix_mb, }; @@ -7125,6 +7138,24 @@ my_wc_mb_filename(CHARSET_INFO *cs __attribute__((unused)), } +static int +my_charlen_filename(CHARSET_INFO *cs, const uchar *str, const uchar *end) +{ + my_wc_t wc; + return cs->cset->mb_wc(cs, &wc, str, end); +} + + +#define MY_FUNCTION_NAME(x) my_ ## x ## _filename +#define CHARLEN(cs,str,end) my_charlen_filename(cs,str,end) +#define DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN +#include "ctype-mb.ic" +#undef MY_FUNCTION_NAME +#undef CHARLEN +#undef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN +/* my_well_formed_char_length_filename */ + + static MY_COLLATION_HANDLER my_collation_filename_handler = { NULL, /* init */ @@ -7169,7 +7200,9 @@ static MY_CHARSET_HANDLER my_charset_filename_handler= my_strtoll10_8bit, my_strntoull10rnd_8bit, my_scan_8bit, - my_copy_abort_mb, + my_charlen_filename, + my_well_formed_char_length_filename, + my_copy_fix_mb, }; @@ -7954,8 +7987,8 @@ my_wildcmp_utf8mb4(CHARSET_INFO *cs, static int -my_valid_mbcharlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), - const uchar *s, const uchar *e) +my_charlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), + const uchar *s, const uchar *e) { uchar c; @@ -8015,7 +8048,7 @@ size_t my_well_formed_len_utf8mb4(CHARSET_INFO *cs, { int mb_len; - if ((mb_len= my_valid_mbcharlen_utf8mb4(cs, (uchar*) b, (uchar*) e)) <= 0) + if ((mb_len= my_charlen_utf8mb4(cs, (uchar*) b, (uchar*) e)) <= 0) { *error= b < e ? 
1 : 0; break; @@ -8027,10 +8060,19 @@ size_t my_well_formed_len_utf8mb4(CHARSET_INFO *cs, } +#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8mb4 +#define CHARLEN(cs,str,end) my_charlen_utf8mb4(cs,str,end) +#define DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN +#include "ctype-mb.ic" +#undef MY_FUNCTION_NAME +#undef CHARLEN +#undef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN +/* my_well_formed_char_length_utf8mb4 */ + static uint my_ismbchar_utf8mb4(CHARSET_INFO *cs, const char *b, const char *e) { - int res= my_valid_mbcharlen_utf8mb4(cs, (const uchar*)b, (const uchar*)e); + int res= my_charlen_utf8mb4(cs, (const uchar*) b, (const uchar*) e); return (res > 1) ? res : 0; } @@ -8113,7 +8155,9 @@ MY_CHARSET_HANDLER my_charset_utf8mb4_handler= my_strtoll10_8bit, my_strntoull10rnd_8bit, my_scan_8bit, - my_copy_abort_mb, + my_charlen_utf8mb4, + my_well_formed_char_length_utf8mb4, + my_copy_fix_mb, }; |