diff options
42 files changed, 1899 insertions, 252 deletions
diff --git a/include/m_ctype.h b/include/m_ctype.h index f08efb461b7..7f4ccee2a3e 100644 --- a/include/m_ctype.h +++ b/include/m_ctype.h @@ -444,22 +444,64 @@ struct my_charset_handler_st size_t (*scan)(CHARSET_INFO *, const char *b, const char *e, int sq); - /* Copying routines */ + /* String copying routines and helpers for them */ /* - copy_abort() - copy a string, abort if a bad byte sequence was found. + charlen() - calculate length of the left-most character in bytes. + @param cs Character set + @param str The beginning of the string + @param end The end of the string + + @return MY_CS_ILSEQ if a bad byte sequence was found. + @return MY_CS_TOOSMALLN(x) if the string ended unexpectedly. + @return a positive number in the range 1..mbmaxlen, + if a valid character was found. + */ + int (*charlen)(CHARSET_INFO *cs, const uchar *str, const uchar *end); + /* + well_formed_char_length() - returns character length of a string. + + @param cs Character set + @param str The beginning of the string + @param end The end of the string + @param nchars Not more than "nchars" left-most characters are checked. + @param status[OUT] Additional statistics are returned here. + "status" can be uninitialized before the call, + and it is fully initialized after the call. + + status->m_source_end_pos is set to the position where reading stopped. + + If a bad byte sequence is found, the function returns immediately and + status->m_well_formed_error_pos is set to the position where a bad byte + sequence was found. + + status->m_well_formed_error_pos is set to NULL if no bad bytes were found. + If status->m_well_formed_error_pos is NULL after the call, that means: + - either the function reached the end of the string, + - or all "nchars" characters were read. + The caller can check status->m_source_end_pos to detect which of these two + happened. 
+ */ + size_t (*well_formed_char_length)(CHARSET_INFO *cs, + const char *str, const char *end, + size_t nchars, + MY_STRCOPY_STATUS *status); + + /* + copy_fix() - copy a string, replace bad bytes with '?'. Not more than "nchars" characters are copied. status->m_source_end_pos is set to a position in the range - between "src" and "src + src_length". + between "src" and "src + src_length", where reading stopped. status->m_well_formed_error_pos is set to NULL if the string in the range "src" and "status->m_source_end_pos" was well formed, - or is set to "src + src_length" otherwise. + or is set to a position between "src" and "src + src_length" where + the leftmost bad byte sequence was found. */ - size_t (*copy_abort)(CHARSET_INFO *, - char *dst, size_t dst_length, - const char *src, size_t src_length, - size_t nchars, MY_STRCOPY_STATUS *status); + size_t (*copy_fix)(CHARSET_INFO *, + char *dst, size_t dst_length, + const char *src, size_t src_length, + size_t nchars, MY_STRCOPY_STATUS *status); }; extern MY_CHARSET_HANDLER my_charset_8bit_handler; @@ -596,10 +638,10 @@ size_t my_copy_8bit(CHARSET_INFO *, char *dst, size_t dst_length, const char *src, size_t src_length, size_t nchars, MY_STRCOPY_STATUS *); -size_t my_copy_abort_mb(CHARSET_INFO *cs, - char *dst, size_t dst_length, - const char *src, size_t src_length, - size_t nchars, MY_STRCOPY_STATUS *); +size_t my_copy_fix_mb(CHARSET_INFO *cs, + char *dst, size_t dst_length, + const char *src, size_t src_length, + size_t nchars, MY_STRCOPY_STATUS *); /* Functions for 8bit */ extern size_t my_caseup_str_8bit(CHARSET_INFO *, char *); @@ -691,6 +733,11 @@ size_t my_numcells_8bit(CHARSET_INFO *, const char *b, const char *e); size_t my_charpos_8bit(CHARSET_INFO *, const char *b, const char *e, size_t pos); size_t my_well_formed_len_8bit(CHARSET_INFO *, const char *b, const char *e, size_t pos, int *error); +size_t my_well_formed_char_length_8bit(CHARSET_INFO *cs, + const char *b, const char *e, + size_t nchars, + 
MY_STRCOPY_STATUS *status); +int my_charlen_8bit(CHARSET_INFO *, const uchar *str, const uchar *end); uint my_mbcharlen_8bit(CHARSET_INFO *, uint c); diff --git a/mysql-test/r/ctype_big5.result b/mysql-test/r/ctype_big5.result index 175bbf0f09f..d18c2a00c6f 100644 --- a/mysql-test/r/ctype_big5.result +++ b/mysql-test/r/ctype_big5.result @@ -597,7 +597,7 @@ Warning 1366 Incorrect string value: '\x80\' for column 'a' at row 61 Warning 1366 Incorrect string value: '\x80]' for column 'a' at row 62 Warning 1366 Incorrect string value: '\x80^' for column 'a' at row 63 Warning 1366 Incorrect string value: '\x80_' for column 'a' at row 64 -SELECT COUNT(*) FROM t1 WHERE a<>''; +SELECT COUNT(*) FROM t1 WHERE a<>'?'; COUNT(*) 13973 SELECT code, hex(upper(a)), hex(lower(a)),a, upper(a), lower(a) FROM t1 WHERE hex(a)<>hex(upper(a)) OR hex(a)<>hex(lower(a)); diff --git a/mysql-test/r/ctype_cp932_binlog_stm.result b/mysql-test/r/ctype_cp932_binlog_stm.result index 0e6ae25a395..fd920223091 100644 --- a/mysql-test/r/ctype_cp932_binlog_stm.result +++ b/mysql-test/r/ctype_cp932_binlog_stm.result @@ -165,7 +165,7 @@ Warning 1366 Incorrect string value: '\x80_' for column 'a' at row 64 SELECT COUNT(*) FROM t1; COUNT(*) 14623 -SELECT COUNT(*) FROM t1 WHERE a<>'' AND OCTET_LENGTH(a)=1; +SELECT COUNT(*) FROM t1 WHERE a<>'?' 
AND OCTET_LENGTH(a)=1; COUNT(*) 63 SELECT COUNT(*) FROM t1 WHERE a<>'' AND OCTET_LENGTH(a)=2; diff --git a/mysql-test/r/ctype_eucjpms.result b/mysql-test/r/ctype_eucjpms.result index a1232c115e9..49d86c18a3d 100644 --- a/mysql-test/r/ctype_eucjpms.result +++ b/mysql-test/r/ctype_eucjpms.result @@ -10101,6 +10101,9 @@ COUNT(*) 56959 SELECT COUNT(*) FROM t1 WHERE a<>''; COUNT(*) +56959 +SELECT COUNT(*) FROM t1 WHERE a<>'' AND a<>'?'; +COUNT(*) 17735 SELECT COUNT(*) FROM t1 WHERE a<>'' AND OCTET_LENGTH(a)=2; COUNT(*) @@ -33632,7 +33635,7 @@ CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET eucjpms); INSERT INTO t1 VALUES (0x8EA0); SELECT HEX(a), CHAR_LENGTH(a) FROM t1; HEX(a) CHAR_LENGTH(a) - 0 +3F3F 2 DROP TABLE t1; SELECT _eucjpms 0x8EA0; ERROR HY000: Invalid eucjpms character string: '8EA0' diff --git a/mysql-test/r/ctype_euckr.result b/mysql-test/r/ctype_euckr.result index dcb68cfe60b..0ee63bb76b2 100644 --- a/mysql-test/r/ctype_euckr.result +++ b/mysql-test/r/ctype_euckr.result @@ -407,12 +407,12 @@ Warnings: Warning 1366 Incorrect string value: '\xA1\xFF' for column 's1' at row 1 select hex(s1), hex(convert(s1 using utf8)) from t1 order by binary s1; hex(s1) hex(convert(s1 using utf8)) - - - - - - +3F3F 3F3F +3F3F 3F3F +3F40 3F40 +3F5B 3F5B +3F60 3F60 +3F7B 3F7B A141 ECA2A5 A15A ECA381 A161 ECA382 @@ -445,7 +445,7 @@ FROM t1 t11, t1 t12 WHERE t11.a >= 0x81 AND t11.a <= 0xFE AND t12.a >= 0x41 AND t12.a <= 0xFE ORDER BY t11.a, t12.a; -SELECT s as bad_code FROM t2 WHERE a='' ORDER BY s; +SELECT s as bad_code FROM t2 WHERE a='?' 
ORDER BY s; bad_code 815B 815C @@ -1959,7 +1959,7 @@ FE7D FE7E FE7F FE80 -DELETE FROM t2 WHERE a=''; +DELETE FROM t2 WHERE a='?'; ALTER TABLE t2 ADD u VARCHAR(1) CHARACTER SET utf8, ADD a2 VARCHAR(1) CHARACTER SET euckr; UPDATE t2 SET u=a, a2=u; SELECT s as unassigned_code FROM t2 WHERE u='?'; @@ -24492,7 +24492,7 @@ Warning 1366 Incorrect string value: '\x80\' for column 'a' at row 61 Warning 1366 Incorrect string value: '\x80]' for column 'a' at row 62 Warning 1366 Incorrect string value: '\x80^' for column 'a' at row 63 Warning 1366 Incorrect string value: '\x80_' for column 'a' at row 64 -SELECT COUNT(*) FROM t1 WHERE a<>''; +SELECT COUNT(*) FROM t1 WHERE a<>'?'; COUNT(*) 22428 SELECT code, hex(upper(a)), hex(lower(a)),a, upper(a), lower(a) FROM t1 WHERE hex(a)<>hex(upper(a)) OR hex(a)<>hex(lower(a)); diff --git a/mysql-test/r/ctype_gb2312.result b/mysql-test/r/ctype_gb2312.result index 5db6e2d3035..ceecb7786b0 100644 --- a/mysql-test/r/ctype_gb2312.result +++ b/mysql-test/r/ctype_gb2312.result @@ -553,7 +553,7 @@ Warning 1366 Incorrect string value: '\x80\' for column 'a' at row 61 Warning 1366 Incorrect string value: '\x80]' for column 'a' at row 62 Warning 1366 Incorrect string value: '\x80^' for column 'a' at row 63 Warning 1366 Incorrect string value: '\x80_' for column 'a' at row 64 -SELECT COUNT(*) FROM t1 WHERE a<>''; +SELECT COUNT(*) FROM t1 WHERE a<>'?'; COUNT(*) 8178 SELECT code, hex(upper(a)), hex(lower(a)),a, upper(a), lower(a) FROM t1 WHERE hex(a)<>hex(upper(a)) OR hex(a)<>hex(lower(a)); diff --git a/mysql-test/r/ctype_gbk.result b/mysql-test/r/ctype_gbk.result index c5d997b0213..55561cfa289 100644 --- a/mysql-test/r/ctype_gbk.result +++ b/mysql-test/r/ctype_gbk.result @@ -573,7 +573,7 @@ Warning 1366 Incorrect string value: '\x80\' for column 'a' at row 61 Warning 1366 Incorrect string value: '\x80]' for column 'a' at row 62 Warning 1366 Incorrect string value: '\x80^' for column 'a' at row 63 Warning 1366 Incorrect string value: '\x80_' for 
column 'a' at row 64 -SELECT COUNT(*) FROM t1 WHERE a<>''; +SELECT COUNT(*) FROM t1 WHERE a<>'?'; COUNT(*) 23940 SELECT code, hex(upper(a)), hex(lower(a)),a, upper(a), lower(a) FROM t1 WHERE hex(a)<>hex(upper(a)) OR hex(a)<>hex(lower(a)); @@ -4946,3 +4946,814 @@ DROP TABLE t1; # # End of 10.0 tests # +# +# Start of 10.1 tests +# +# +# MDEV-6566 Different INSERT behaviour on bad bytes with and without character set conversion +# +CREATE TABLE t1 ( +id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, +b VARBINARY(16), +type SET('ascii','bad','head','tail','mb2','unassigned') +); +INSERT INTO t1 (b, type) VALUES (0x40, 'ascii,tail'); +INSERT INTO t1 (b, type) VALUES (0x80, 'tail'); +INSERT INTO t1 (b, type) VALUES (0x81, 'head,tail'); +INSERT INTO t1 (b, type) VALUES (0xFF, 'bad'); +INSERT INTO t1 (b, type) VALUES (0xA140, 'mb2,unassigned'); +INSERT INTO t1 (b, type) VALUES (0xA1A3, 'mb2'); +INSERT INTO t1 (b, type) VALUES (0xFE40, 'mb2'); +CREATE TABLE t2 AS SELECT +CONCAT(t1.b,t2.b) AS b, +t1.type AS type1, +t2.type AS type2, +CONCAT('[',t1.type,'][',t2.type,']') AS comment +FROM t1, t1 t2; +CREATE TABLE t3 +( +b VARBINARY(16), +c VARCHAR(16) CHARACTER SET gbk, +comment VARCHAR(128) +); +# +# A combination of two valid characters, should give no warnings +# +INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2 +WHERE +(FIND_IN_SET('ascii',type1) OR FIND_IN_SET('mb2',type1)) AND +(FIND_IN_SET('ascii',type2) OR FIND_IN_SET('mb2',type2)) +ORDER BY b; +SELECT COUNT(*) FROM t3; +COUNT(*) +16 +SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b; +HEX(c) comment +4040 [ascii,tail][ascii,tail] +40A140 [ascii,tail][mb2,unassigned] +40A1A3 [ascii,tail][mb2] +40FE40 [ascii,tail][mb2] +A14040 [mb2,unassigned][ascii,tail] +A140A140 [mb2,unassigned][mb2,unassigned] +A140A1A3 [mb2,unassigned][mb2] +A140FE40 [mb2,unassigned][mb2] +A1A340 [mb2][ascii,tail] +A1A3A140 [mb2][mb2,unassigned] +A1A3A1A3 [mb2][mb2] +A1A3FE40 [mb2][mb2] +FE4040 [mb2][ascii,tail] +FE40A140 [mb2][mb2,unassigned] 
+FE40A1A3 [mb2][mb2] +FE40FE40 [mb2][mb2] +SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b; +HEX(c) HEX(b) comment +DELETE FROM t2 WHERE b IN (SELECT b FROM t3); +DELETE FROM t3; +# +# Sequences that start with a tail or a bad byte, +# or end with a bad byte, all should be fixed. +# +INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2 +WHERE type1='tail' OR type1='bad' OR type2='bad' +ORDER BY b; +Warnings: +Warning 1366 Incorrect string value: '\xFF' for column 'c' at row 1 +Warning 1366 Incorrect string value: '\x80@' for column 'c' at row 2 +Warning 1366 Incorrect string value: '\x80\x80' for column 'c' at row 3 +Warning 1366 Incorrect string value: '\x80\x81' for column 'c' at row 4 +Warning 1366 Incorrect string value: '\x80\xA1@' for column 'c' at row 5 +Warning 1366 Incorrect string value: '\x80\xA1\xA3' for column 'c' at row 6 +Warning 1366 Incorrect string value: '\x80\xFE@' for column 'c' at row 7 +Warning 1366 Incorrect string value: '\x80\xFF' for column 'c' at row 8 +Warning 1366 Incorrect string value: '\x81\xFF' for column 'c' at row 9 +Warning 1366 Incorrect string value: '\xFF' for column 'c' at row 10 +Warning 1366 Incorrect string value: '\xFF' for column 'c' at row 11 +Warning 1366 Incorrect string value: '\xFF' for column 'c' at row 12 +Warning 1366 Incorrect string value: '\xFF@' for column 'c' at row 13 +Warning 1366 Incorrect string value: '\xFF\x80' for column 'c' at row 14 +Warning 1366 Incorrect string value: '\xFF\x81' for column 'c' at row 15 +Warning 1366 Incorrect string value: '\xFF\xA1@' for column 'c' at row 16 +Warning 1366 Incorrect string value: '\xFF\xA1\xA3' for column 'c' at row 17 +Warning 1366 Incorrect string value: '\xFF\xFE@' for column 'c' at row 18 +Warning 1366 Incorrect string value: '\xFF\xFF' for column 'c' at row 19 +SELECT COUNT(*) FROM t3; +COUNT(*) +19 +SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b; +HEX(c) comment +SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b; +HEX(c) HEX(b) 
comment +403F 40FF [ascii,tail][bad] +3F40 8040 [tail][ascii,tail] +3F3F 8080 [tail][tail] +3F3F 8081 [tail][head,tail] +3FA140 80A140 [tail][mb2,unassigned] +3FA1A3 80A1A3 [tail][mb2] +3FFE40 80FE40 [tail][mb2] +3F3F 80FF [tail][bad] +3F3F 81FF [head,tail][bad] +A1403F A140FF [mb2,unassigned][bad] +A1A33F A1A3FF [mb2][bad] +FE403F FE40FF [mb2][bad] +3F40 FF40 [bad][ascii,tail] +3F3F FF80 [bad][tail] +3F3F FF81 [bad][head,tail] +3FA140 FFA140 [bad][mb2,unassigned] +3FA1A3 FFA1A3 [bad][mb2] +3FFE40 FFFE40 [bad][mb2] +3F3F FFFF [bad][bad] +DELETE FROM t2 WHERE b IN (SELECT b FROM t3); +DELETE FROM t3; +# +# Sequences that start with an ASCII or an MB2 character, +# followed by a non-ASCII tail, all should be fixed. +# +INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2 +WHERE (FIND_IN_SET('mb2',type1) OR FIND_IN_SET('ascii',type1)) +AND (FIND_IN_SET('tail',type2) AND NOT FIND_IN_SET('ascii',type2)) +ORDER BY b; +Warnings: +Warning 1366 Incorrect string value: '\x80' for column 'c' at row 1 +Warning 1366 Incorrect string value: '\x81' for column 'c' at row 2 +Warning 1366 Incorrect string value: '\x80' for column 'c' at row 3 +Warning 1366 Incorrect string value: '\x81' for column 'c' at row 4 +Warning 1366 Incorrect string value: '\x80' for column 'c' at row 5 +Warning 1366 Incorrect string value: '\x81' for column 'c' at row 6 +Warning 1366 Incorrect string value: '\x80' for column 'c' at row 7 +Warning 1366 Incorrect string value: '\x81' for column 'c' at row 8 +SELECT COUNT(*) FROM t3; +COUNT(*) +8 +SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b; +HEX(c) comment +SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b; +HEX(c) HEX(b) comment +403F 4080 [ascii,tail][tail] +403F 4081 [ascii,tail][head,tail] +A1403F A14080 [mb2,unassigned][tail] +A1403F A14081 [mb2,unassigned][head,tail] +A1A33F A1A380 [mb2][tail] +A1A33F A1A381 [mb2][head,tail] +FE403F FE4080 [mb2][tail] +FE403F FE4081 [mb2][head,tail] +DELETE FROM t2 WHERE b IN (SELECT b FROM t3); 
+DELETE FROM t3; +# +# Other sequences +# +INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2 ORDER BY b; +Warnings: +Warning 1366 Incorrect string value: '\xA3' for column 'c' at row 5 +SELECT COUNT(*) FROM t3; +COUNT(*) +6 +SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b; +HEX(c) comment +8140 [head,tail][ascii,tail] +8180 [head,tail][tail] +8181 [head,tail][head,tail] +81A140 [head,tail][mb2,unassigned] +81FE40 [head,tail][mb2] +SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b; +HEX(c) HEX(b) comment +81A13F 81A1A3 [head,tail][mb2] +DELETE FROM t3; +DROP TABLE t3; +DROP TABLE t2; +CREATE TABLE t2 AS SELECT +CONCAT(t1.b,t2.b,t3.b) AS b, +t1.type AS type1, +t2.type AS type2, +t3.type AS type3, +CONCAT('[',t1.type,'][',t2.type,'][',t3.type,']') AS comment +FROM t1, t1 t2,t1 t3; +SELECT COUNT(*) FROM t2; +COUNT(*) +343 +CREATE TABLE t3 +( +b VARBINARY(16), +c VARCHAR(16) CHARACTER SET gbk, +comment VARCHAR(128) +); +# +# A combination of three valid characters, should give no warnings +# +INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2 +WHERE +(FIND_IN_SET('ascii',type1) OR FIND_IN_SET('mb2',type1)) AND +(FIND_IN_SET('ascii',type2) OR FIND_IN_SET('mb2',type2)) AND +(FIND_IN_SET('ascii',type3) OR FIND_IN_SET('mb2',type3)) +ORDER BY b; +SELECT COUNT(*) FROM t3; +COUNT(*) +64 +SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b; +HEX(c) comment +404040 [ascii,tail][ascii,tail][ascii,tail] +4040A140 [ascii,tail][ascii,tail][mb2,unassigned] +4040A1A3 [ascii,tail][ascii,tail][mb2] +4040FE40 [ascii,tail][ascii,tail][mb2] +40A14040 [ascii,tail][mb2,unassigned][ascii,tail] +40A140A140 [ascii,tail][mb2,unassigned][mb2,unassigned] +40A140A1A3 [ascii,tail][mb2,unassigned][mb2] +40A140FE40 [ascii,tail][mb2,unassigned][mb2] +40A1A340 [ascii,tail][mb2][ascii,tail] +40A1A3A140 [ascii,tail][mb2][mb2,unassigned] +40A1A3A1A3 [ascii,tail][mb2][mb2] +40A1A3FE40 [ascii,tail][mb2][mb2] +40FE4040 [ascii,tail][mb2][ascii,tail] +40FE40A140 
[ascii,tail][mb2][mb2,unassigned] +40FE40A1A3 [ascii,tail][mb2][mb2] +40FE40FE40 [ascii,tail][mb2][mb2] +A1404040 [mb2,unassigned][ascii,tail][ascii,tail] +A14040A140 [mb2,unassigned][ascii,tail][mb2,unassigned] +A14040A1A3 [mb2,unassigned][ascii,tail][mb2] +A14040FE40 [mb2,unassigned][ascii,tail][mb2] +A140A14040 [mb2,unassigned][mb2,unassigned][ascii,tail] +A140A140A140 [mb2,unassigned][mb2,unassigned][mb2,unassigned] +A140A140A1A3 [mb2,unassigned][mb2,unassigned][mb2] +A140A140FE40 [mb2,unassigned][mb2,unassigned][mb2] +A140A1A340 [mb2,unassigned][mb2][ascii,tail] +A140A1A3A140 [mb2,unassigned][mb2][mb2,unassigned] +A140A1A3A1A3 [mb2,unassigned][mb2][mb2] +A140A1A3FE40 [mb2,unassigned][mb2][mb2] +A140FE4040 [mb2,unassigned][mb2][ascii,tail] +A140FE40A140 [mb2,unassigned][mb2][mb2,unassigned] +A140FE40A1A3 [mb2,unassigned][mb2][mb2] +A140FE40FE40 [mb2,unassigned][mb2][mb2] +A1A34040 [mb2][ascii,tail][ascii,tail] +A1A340A140 [mb2][ascii,tail][mb2,unassigned] +A1A340A1A3 [mb2][ascii,tail][mb2] +A1A340FE40 [mb2][ascii,tail][mb2] +A1A3A14040 [mb2][mb2,unassigned][ascii,tail] +A1A3A140A140 [mb2][mb2,unassigned][mb2,unassigned] +A1A3A140A1A3 [mb2][mb2,unassigned][mb2] +A1A3A140FE40 [mb2][mb2,unassigned][mb2] +A1A3A1A340 [mb2][mb2][ascii,tail] +A1A3A1A3A140 [mb2][mb2][mb2,unassigned] +A1A3A1A3A1A3 [mb2][mb2][mb2] +A1A3A1A3FE40 [mb2][mb2][mb2] +A1A3FE4040 [mb2][mb2][ascii,tail] +A1A3FE40A140 [mb2][mb2][mb2,unassigned] +A1A3FE40A1A3 [mb2][mb2][mb2] +A1A3FE40FE40 [mb2][mb2][mb2] +FE404040 [mb2][ascii,tail][ascii,tail] +FE4040A140 [mb2][ascii,tail][mb2,unassigned] +FE4040A1A3 [mb2][ascii,tail][mb2] +FE4040FE40 [mb2][ascii,tail][mb2] +FE40A14040 [mb2][mb2,unassigned][ascii,tail] +FE40A140A140 [mb2][mb2,unassigned][mb2,unassigned] +FE40A140A1A3 [mb2][mb2,unassigned][mb2] +FE40A140FE40 [mb2][mb2,unassigned][mb2] +FE40A1A340 [mb2][mb2][ascii,tail] +FE40A1A3A140 [mb2][mb2][mb2,unassigned] +FE40A1A3A1A3 [mb2][mb2][mb2] +FE40A1A3FE40 [mb2][mb2][mb2] +FE40FE4040 
[mb2][mb2][ascii,tail] +FE40FE40A140 [mb2][mb2][mb2,unassigned] +FE40FE40A1A3 [mb2][mb2][mb2] +FE40FE40FE40 [mb2][mb2][mb2] +SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b; +HEX(c) HEX(b) comment +DELETE FROM t2 WHERE b IN (SELECT b FROM t3); +DELETE FROM t3; +# +# Sequences that start with a tail or a bad byte, +# or have a bad byte, all should be fixed. +# +INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2 +WHERE type1='tail' OR type1='bad' OR type2='bad' OR type3='bad' +ORDER BY b; +Warnings: +Warning 1366 Incorrect string value: '\xFF' for column 'c' at row 1 +Warning 1366 Incorrect string value: '\x80\xFF' for column 'c' at row 2 +Warning 1366 Incorrect string value: '\x81\xFF' for column 'c' at row 3 +Warning 1366 Incorrect string value: '\xFF' for column 'c' at row 4 +Warning 1366 Incorrect string value: '\xFF' for column 'c' at row 5 +Warning 1366 Incorrect string value: '\xFF' for column 'c' at row 6 +Warning 1366 Incorrect string value: '\xFF@' for column 'c' at row 7 +Warning 1366 Incorrect string value: '\xFF\x80' for column 'c' at row 8 +Warning 1366 Incorrect string value: '\xFF\x81' for column 'c' at row 9 +Warning 1366 Incorrect string value: '\xFF\xA1@' for column 'c' at row 10 +Warning 1366 Incorrect string value: '\xFF\xA1\xA3' for column 'c' at row 11 +Warning 1366 Incorrect string value: '\xFF\xFE@' for column 'c' at row 12 +Warning 1366 Incorrect string value: '\xFF\xFF' for column 'c' at row 13 +Warning 1366 Incorrect string value: '\x80@@' for column 'c' at row 14 +Warning 1366 Incorrect string value: '\x80@\x80' for column 'c' at row 15 +Warning 1366 Incorrect string value: '\x80@\x81' for column 'c' at row 16 +Warning 1366 Incorrect string value: '\x80@\xA1@' for column 'c' at row 17 +Warning 1366 Incorrect string value: '\x80@\xA1\xA3' for column 'c' at row 18 +Warning 1366 Incorrect string value: '\x80@\xFE@' for column 'c' at row 19 +Warning 1366 Incorrect string value: '\x80@\xFF' for column 'c' at row 20 +Warning 
1366 Incorrect string value: '\x80\x80@' for column 'c' at row 21 +Warning 1366 Incorrect string value: '\x80\x80\x80' for column 'c' at row 22 +Warning 1366 Incorrect string value: '\x80\x80\x81' for column 'c' at row 23 +Warning 1366 Incorrect string value: '\x80\x80\xA1@' for column 'c' at row 24 +Warning 1366 Incorrect string value: '\x80\x80\xA1\xA3' for column 'c' at row 25 +Warning 1366 Incorrect string value: '\x80\x80\xFE@' for column 'c' at row 26 +Warning 1366 Incorrect string value: '\x80\x80\xFF' for column 'c' at row 27 +Warning 1366 Incorrect string value: '\x80\x81@' for column 'c' at row 28 +Warning 1366 Incorrect string value: '\x80\x81\x80' for column 'c' at row 29 +Warning 1366 Incorrect string value: '\x80\x81\x81' for column 'c' at row 30 +Warning 1366 Incorrect string value: '\x80\x81\xA1@' for column 'c' at row 31 +Warning 1366 Incorrect string value: '\x80\x81\xA1\xA3' for column 'c' at row 32 +Warning 1366 Incorrect string value: '\x80\x81\xFE@' for column 'c' at row 33 +Warning 1366 Incorrect string value: '\x80\x81\xFF' for column 'c' at row 34 +Warning 1366 Incorrect string value: '\x80\xA1@@' for column 'c' at row 35 +Warning 1366 Incorrect string value: '\x80\xA1@\x80' for column 'c' at row 36 +Warning 1366 Incorrect string value: '\x80\xA1@\x81' for column 'c' at row 37 +Warning 1366 Incorrect string value: '\x80\xA1@\xA1@' for column 'c' at row 38 +Warning 1366 Incorrect string value: '\x80\xA1@\xA1\xA3' for column 'c' at row 39 +Warning 1366 Incorrect string value: '\x80\xA1@\xFE@' for column 'c' at row 40 +Warning 1366 Incorrect string value: '\x80\xA1@\xFF' for column 'c' at row 41 +Warning 1366 Incorrect string value: '\x80\xA1\xA3@' for column 'c' at row 42 +Warning 1366 Incorrect string value: '\x80\xA1\xA3\x80' for column 'c' at row 43 +Warning 1366 Incorrect string value: '\x80\xA1\xA3\x81' for column 'c' at row 44 +Warning 1366 Incorrect string value: '\x80\xA1\xA3\xA1@' for column 'c' at row 45 +Warning 1366 Incorrect 
string value: '\x80\xA1\xA3\xA1\xA3' for column 'c' at row 46 +Warning 1366 Incorrect string value: '\x80\xA1\xA3\xFE@' for column 'c' at row 47 +Warning 1366 Incorrect string value: '\x80\xA1\xA3\xFF' for column 'c' at row 48 +Warning 1366 Incorrect string value: '\x80\xFE@@' for column 'c' at row 49 +Warning 1366 Incorrect string value: '\x80\xFE@\x80' for column 'c' at row 50 +Warning 1366 Incorrect string value: '\x80\xFE@\x81' for column 'c' at row 51 +Warning 1366 Incorrect string value: '\x80\xFE@\xA1@' for column 'c' at row 52 +Warning 1366 Incorrect string value: '\x80\xFE@\xA1\xA3' for column 'c' at row 53 +Warning 1366 Incorrect string value: '\x80\xFE@\xFE@' for column 'c' at row 54 +Warning 1366 Incorrect string value: '\x80\xFE@\xFF' for column 'c' at row 55 +Warning 1366 Incorrect string value: '\x80\xFF@' for column 'c' at row 56 +Warning 1366 Incorrect string value: '\x80\xFF\x80' for column 'c' at row 57 +Warning 1366 Incorrect string value: '\x80\xFF\x81' for column 'c' at row 58 +Warning 1366 Incorrect string value: '\x80\xFF\xA1@' for column 'c' at row 59 +Warning 1366 Incorrect string value: '\x80\xFF\xA1\xA3' for column 'c' at row 60 +Warning 1366 Incorrect string value: '\x80\xFF\xFE@' for column 'c' at row 61 +Warning 1366 Incorrect string value: '\x80\xFF\xFF' for column 'c' at row 62 +Warning 1366 Incorrect string value: '\xFF' for column 'c' at row 63 +Warning 1366 Incorrect string value: '\xFF' for column 'c' at row 64 +SELECT COUNT(*) FROM t3; +COUNT(*) +163 +SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b; +HEX(c) comment +SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b; +HEX(c) HEX(b) comment +40403F 4040FF [ascii,tail][ascii,tail][bad] +403F3F 4080FF [ascii,tail][tail][bad] +403F3F 4081FF [ascii,tail][head,tail][bad] +40A1403F 40A140FF [ascii,tail][mb2,unassigned][bad] +40A1A33F 40A1A3FF [ascii,tail][mb2][bad] +40FE403F 40FE40FF [ascii,tail][mb2][bad] +403F40 40FF40 [ascii,tail][bad][ascii,tail] +403F3F 40FF80 
[ascii,tail][bad][tail] +403F3F 40FF81 [ascii,tail][bad][head,tail] +403FA140 40FFA140 [ascii,tail][bad][mb2,unassigned] +403FA1A3 40FFA1A3 [ascii,tail][bad][mb2] +403FFE40 40FFFE40 [ascii,tail][bad][mb2] +403F3F 40FFFF [ascii,tail][bad][bad] +3F4040 804040 [tail][ascii,tail][ascii,tail] +3F403F 804080 [tail][ascii,tail][tail] +3F403F 804081 [tail][ascii,tail][head,tail] +3F40A140 8040A140 [tail][ascii,tail][mb2,unassigned] +3F40A1A3 8040A1A3 [tail][ascii,tail][mb2] +3F40FE40 8040FE40 [tail][ascii,tail][mb2] +3F403F 8040FF [tail][ascii,tail][bad] +3F3F40 808040 [tail][tail][ascii,tail] +3F3F3F 808080 [tail][tail][tail] +3F3F3F 808081 [tail][tail][head,tail] +3F3FA140 8080A140 [tail][tail][mb2,unassigned] +3F3FA1A3 8080A1A3 [tail][tail][mb2] +3F3FFE40 8080FE40 [tail][tail][mb2] +3F3F3F 8080FF [tail][tail][bad] +3F8140 808140 [tail][head,tail][ascii,tail] +3F8180 808180 [tail][head,tail][tail] +3F8181 808181 [tail][head,tail][head,tail] +3F81A140 8081A140 [tail][head,tail][mb2,unassigned] +3F81A13F 8081A1A3 [tail][head,tail][mb2] +3F81FE40 8081FE40 [tail][head,tail][mb2] +3F3F3F 8081FF [tail][head,tail][bad] +3FA14040 80A14040 [tail][mb2,unassigned][ascii,tail] +3FA1403F 80A14080 [tail][mb2,unassigned][tail] +3FA1403F 80A14081 [tail][mb2,unassigned][head,tail] +3FA140A140 80A140A140 [tail][mb2,unassigned][mb2,unassigned] +3FA140A1A3 80A140A1A3 [tail][mb2,unassigned][mb2] +3FA140FE40 80A140FE40 [tail][mb2,unassigned][mb2] +3FA1403F 80A140FF [tail][mb2,unassigned][bad] +3FA1A340 80A1A340 [tail][mb2][ascii,tail] +3FA1A33F 80A1A380 [tail][mb2][tail] +3FA1A33F 80A1A381 [tail][mb2][head,tail] +3FA1A3A140 80A1A3A140 [tail][mb2][mb2,unassigned] +3FA1A3A1A3 80A1A3A1A3 [tail][mb2][mb2] +3FA1A3FE40 80A1A3FE40 [tail][mb2][mb2] +3FA1A33F 80A1A3FF [tail][mb2][bad] +3FFE4040 80FE4040 [tail][mb2][ascii,tail] +3FFE403F 80FE4080 [tail][mb2][tail] +3FFE403F 80FE4081 [tail][mb2][head,tail] +3FFE40A140 80FE40A140 [tail][mb2][mb2,unassigned] +3FFE40A1A3 80FE40A1A3 [tail][mb2][mb2] 
+3FFE40FE40 80FE40FE40 [tail][mb2][mb2] +3FFE403F 80FE40FF [tail][mb2][bad] +3F3F40 80FF40 [tail][bad][ascii,tail] +3F3F3F 80FF80 [tail][bad][tail] +3F3F3F 80FF81 [tail][bad][head,tail] +3F3FA140 80FFA140 [tail][bad][mb2,unassigned] +3F3FA1A3 80FFA1A3 [tail][bad][mb2] +3F3FFE40 80FFFE40 [tail][bad][mb2] +3F3F3F 80FFFF [tail][bad][bad] +81403F 8140FF [head,tail][ascii,tail][bad] +81803F 8180FF [head,tail][tail][bad] +81813F 8181FF [head,tail][head,tail][bad] +81A1403F 81A140FF [head,tail][mb2,unassigned][bad] +81A13F3F 81A1A3FF [head,tail][mb2][bad] +81FE403F 81FE40FF [head,tail][mb2][bad] +3F3F40 81FF40 [head,tail][bad][ascii,tail] +3F3F3F 81FF80 [head,tail][bad][tail] +3F3F3F 81FF81 [head,tail][bad][head,tail] +3F3FA140 81FFA140 [head,tail][bad][mb2,unassigned] +3F3FA1A3 81FFA1A3 [head,tail][bad][mb2] +3F3FFE40 81FFFE40 [head,tail][bad][mb2] +3F3F3F 81FFFF [head,tail][bad][bad] +A140403F A14040FF [mb2,unassigned][ascii,tail][bad] +A1403F3F A14080FF [mb2,unassigned][tail][bad] +A1403F3F A14081FF [mb2,unassigned][head,tail][bad] +A140A1403F A140A140FF [mb2,unassigned][mb2,unassigned][bad] +A140A1A33F A140A1A3FF [mb2,unassigned][mb2][bad] +A140FE403F A140FE40FF [mb2,unassigned][mb2][bad] +A1403F40 A140FF40 [mb2,unassigned][bad][ascii,tail] +A1403F3F A140FF80 [mb2,unassigned][bad][tail] +A1403F3F A140FF81 [mb2,unassigned][bad][head,tail] +A1403FA140 A140FFA140 [mb2,unassigned][bad][mb2,unassigned] +A1403FA1A3 A140FFA1A3 [mb2,unassigned][bad][mb2] +A1403FFE40 A140FFFE40 [mb2,unassigned][bad][mb2] +A1403F3F A140FFFF [mb2,unassigned][bad][bad] +A1A3403F A1A340FF [mb2][ascii,tail][bad] +A1A33F3F A1A380FF [mb2][tail][bad] +A1A33F3F A1A381FF [mb2][head,tail][bad] +A1A3A1403F A1A3A140FF [mb2][mb2,unassigned][bad] +A1A3A1A33F A1A3A1A3FF [mb2][mb2][bad] +A1A3FE403F A1A3FE40FF [mb2][mb2][bad] +A1A33F40 A1A3FF40 [mb2][bad][ascii,tail] +A1A33F3F A1A3FF80 [mb2][bad][tail] +A1A33F3F A1A3FF81 [mb2][bad][head,tail] +A1A33FA140 A1A3FFA140 [mb2][bad][mb2,unassigned] +A1A33FA1A3 
A1A3FFA1A3 [mb2][bad][mb2] +A1A33FFE40 A1A3FFFE40 [mb2][bad][mb2] +A1A33F3F A1A3FFFF [mb2][bad][bad] +FE40403F FE4040FF [mb2][ascii,tail][bad] +FE403F3F FE4080FF [mb2][tail][bad] +FE403F3F FE4081FF [mb2][head,tail][bad] +FE40A1403F FE40A140FF [mb2][mb2,unassigned][bad] +FE40A1A33F FE40A1A3FF [mb2][mb2][bad] +FE40FE403F FE40FE40FF [mb2][mb2][bad] +FE403F40 FE40FF40 [mb2][bad][ascii,tail] +FE403F3F FE40FF80 [mb2][bad][tail] +FE403F3F FE40FF81 [mb2][bad][head,tail] +FE403FA140 FE40FFA140 [mb2][bad][mb2,unassigned] +FE403FA1A3 FE40FFA1A3 [mb2][bad][mb2] +FE403FFE40 FE40FFFE40 [mb2][bad][mb2] +FE403F3F FE40FFFF [mb2][bad][bad] +3F4040 FF4040 [bad][ascii,tail][ascii,tail] +3F403F FF4080 [bad][ascii,tail][tail] +3F403F FF4081 [bad][ascii,tail][head,tail] +3F40A140 FF40A140 [bad][ascii,tail][mb2,unassigned] +3F40A1A3 FF40A1A3 [bad][ascii,tail][mb2] +3F40FE40 FF40FE40 [bad][ascii,tail][mb2] +3F403F FF40FF [bad][ascii,tail][bad] +3F3F40 FF8040 [bad][tail][ascii,tail] +3F3F3F FF8080 [bad][tail][tail] +3F3F3F FF8081 [bad][tail][head,tail] +3F3FA140 FF80A140 [bad][tail][mb2,unassigned] +3F3FA1A3 FF80A1A3 [bad][tail][mb2] +3F3FFE40 FF80FE40 [bad][tail][mb2] +3F3F3F FF80FF [bad][tail][bad] +3F8140 FF8140 [bad][head,tail][ascii,tail] +3F8180 FF8180 [bad][head,tail][tail] +3F8181 FF8181 [bad][head,tail][head,tail] +3F81A140 FF81A140 [bad][head,tail][mb2,unassigned] +3F81A13F FF81A1A3 [bad][head,tail][mb2] +3F81FE40 FF81FE40 [bad][head,tail][mb2] +3F3F3F FF81FF [bad][head,tail][bad] +3FA14040 FFA14040 [bad][mb2,unassigned][ascii,tail] +3FA1403F FFA14080 [bad][mb2,unassigned][tail] +3FA1403F FFA14081 [bad][mb2,unassigned][head,tail] +3FA140A140 FFA140A140 [bad][mb2,unassigned][mb2,unassigned] +3FA140A1A3 FFA140A1A3 [bad][mb2,unassigned][mb2] +3FA140FE40 FFA140FE40 [bad][mb2,unassigned][mb2] +3FA1403F FFA140FF [bad][mb2,unassigned][bad] +3FA1A340 FFA1A340 [bad][mb2][ascii,tail] +3FA1A33F FFA1A380 [bad][mb2][tail] +3FA1A33F FFA1A381 [bad][mb2][head,tail] +3FA1A3A140 FFA1A3A140 
[bad][mb2][mb2,unassigned] +3FA1A3A1A3 FFA1A3A1A3 [bad][mb2][mb2] +3FA1A3FE40 FFA1A3FE40 [bad][mb2][mb2] +3FA1A33F FFA1A3FF [bad][mb2][bad] +3FFE4040 FFFE4040 [bad][mb2][ascii,tail] +3FFE403F FFFE4080 [bad][mb2][tail] +3FFE403F FFFE4081 [bad][mb2][head,tail] +3FFE40A140 FFFE40A140 [bad][mb2][mb2,unassigned] +3FFE40A1A3 FFFE40A1A3 [bad][mb2][mb2] +3FFE40FE40 FFFE40FE40 [bad][mb2][mb2] +3FFE403F FFFE40FF [bad][mb2][bad] +3F3F40 FFFF40 [bad][bad][ascii,tail] +3F3F3F FFFF80 [bad][bad][tail] +3F3F3F FFFF81 [bad][bad][head,tail] +3F3FA140 FFFFA140 [bad][bad][mb2,unassigned] +3F3FA1A3 FFFFA1A3 [bad][bad][mb2] +3F3FFE40 FFFFFE40 [bad][bad][mb2] +3F3F3F FFFFFF [bad][bad][bad] +DELETE FROM t2 WHERE b IN (SELECT b FROM t3); +DELETE FROM t3; +# +# Sequences that start with an ASCII or an MB2 character, +# followed by a pure non-ASCII tail, all should be fixed. +# +INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2 +WHERE (FIND_IN_SET('mb2',type1) OR FIND_IN_SET('ascii',type1)) +AND type2='tail' +ORDER BY b; +Warnings: +Warning 1366 Incorrect string value: '\x80@' for column 'c' at row 1 +Warning 1366 Incorrect string value: '\x80\x80' for column 'c' at row 2 +Warning 1366 Incorrect string value: '\x80\x81' for column 'c' at row 3 +Warning 1366 Incorrect string value: '\x80\xA1@' for column 'c' at row 4 +Warning 1366 Incorrect string value: '\x80\xA1\xA3' for column 'c' at row 5 +Warning 1366 Incorrect string value: '\x80\xFE@' for column 'c' at row 6 +Warning 1366 Incorrect string value: '\x80@' for column 'c' at row 7 +Warning 1366 Incorrect string value: '\x80\x80' for column 'c' at row 8 +Warning 1366 Incorrect string value: '\x80\x81' for column 'c' at row 9 +Warning 1366 Incorrect string value: '\x80\xA1@' for column 'c' at row 10 +Warning 1366 Incorrect string value: '\x80\xA1\xA3' for column 'c' at row 11 +Warning 1366 Incorrect string value: '\x80\xFE@' for column 'c' at row 12 +Warning 1366 Incorrect string value: '\x80@' for column 'c' at row 13 +Warning 1366 
Incorrect string value: '\x80\x80' for column 'c' at row 14 +Warning 1366 Incorrect string value: '\x80\x81' for column 'c' at row 15 +Warning 1366 Incorrect string value: '\x80\xA1@' for column 'c' at row 16 +Warning 1366 Incorrect string value: '\x80\xA1\xA3' for column 'c' at row 17 +Warning 1366 Incorrect string value: '\x80\xFE@' for column 'c' at row 18 +Warning 1366 Incorrect string value: '\x80@' for column 'c' at row 19 +Warning 1366 Incorrect string value: '\x80\x80' for column 'c' at row 20 +Warning 1366 Incorrect string value: '\x80\x81' for column 'c' at row 21 +Warning 1366 Incorrect string value: '\x80\xA1@' for column 'c' at row 22 +Warning 1366 Incorrect string value: '\x80\xA1\xA3' for column 'c' at row 23 +Warning 1366 Incorrect string value: '\x80\xFE@' for column 'c' at row 24 +SELECT COUNT(*) FROM t3; +COUNT(*) +24 +SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b; +HEX(c) comment +SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b; +HEX(c) HEX(b) comment +403F40 408040 [ascii,tail][tail][ascii,tail] +403F3F 408080 [ascii,tail][tail][tail] +403F3F 408081 [ascii,tail][tail][head,tail] +403FA140 4080A140 [ascii,tail][tail][mb2,unassigned] +403FA1A3 4080A1A3 [ascii,tail][tail][mb2] +403FFE40 4080FE40 [ascii,tail][tail][mb2] +A1403F40 A1408040 [mb2,unassigned][tail][ascii,tail] +A1403F3F A1408080 [mb2,unassigned][tail][tail] +A1403F3F A1408081 [mb2,unassigned][tail][head,tail] +A1403FA140 A14080A140 [mb2,unassigned][tail][mb2,unassigned] +A1403FA1A3 A14080A1A3 [mb2,unassigned][tail][mb2] +A1403FFE40 A14080FE40 [mb2,unassigned][tail][mb2] +A1A33F40 A1A38040 [mb2][tail][ascii,tail] +A1A33F3F A1A38080 [mb2][tail][tail] +A1A33F3F A1A38081 [mb2][tail][head,tail] +A1A33FA140 A1A380A140 [mb2][tail][mb2,unassigned] +A1A33FA1A3 A1A380A1A3 [mb2][tail][mb2] +A1A33FFE40 A1A380FE40 [mb2][tail][mb2] +FE403F40 FE408040 [mb2][tail][ascii,tail] +FE403F3F FE408080 [mb2][tail][tail] +FE403F3F FE408081 [mb2][tail][head,tail] +FE403FA140 FE4080A140 
[mb2][tail][mb2,unassigned] +FE403FA1A3 FE4080A1A3 [mb2][tail][mb2] +FE403FFE40 FE4080FE40 [mb2][tail][mb2] +DELETE FROM t2 WHERE b IN (SELECT b FROM t3); +DELETE FROM t3; +# +# Sequences that consist of two ASCII or MB2 characters, +# followed by a pure non-ASCII tail, all should be fixed. +# +INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2 +WHERE (FIND_IN_SET('mb2',type1) OR FIND_IN_SET('ascii',type1)) AND +(FIND_IN_SET('mb2',type2) OR FIND_IN_SET('ascii',type2)) AND +type3='tail' +ORDER BY b; +Warnings: +Warning 1366 Incorrect string value: '\x80' for column 'c' at row 1 +Warning 1366 Incorrect string value: '\x80' for column 'c' at row 2 +Warning 1366 Incorrect string value: '\x80' for column 'c' at row 3 +Warning 1366 Incorrect string value: '\x80' for column 'c' at row 4 +Warning 1366 Incorrect string value: '\x80' for column 'c' at row 5 +Warning 1366 Incorrect string value: '\x80' for column 'c' at row 6 +Warning 1366 Incorrect string value: '\x80' for column 'c' at row 7 +Warning 1366 Incorrect string value: '\x80' for column 'c' at row 8 +Warning 1366 Incorrect string value: '\x80' for column 'c' at row 9 +Warning 1366 Incorrect string value: '\x80' for column 'c' at row 10 +Warning 1366 Incorrect string value: '\x80' for column 'c' at row 11 +Warning 1366 Incorrect string value: '\x80' for column 'c' at row 12 +Warning 1366 Incorrect string value: '\x80' for column 'c' at row 13 +Warning 1366 Incorrect string value: '\x80' for column 'c' at row 14 +Warning 1366 Incorrect string value: '\x80' for column 'c' at row 15 +Warning 1366 Incorrect string value: '\x80' for column 'c' at row 16 +SELECT COUNT(*) FROM t3; +COUNT(*) +16 +SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b; +HEX(c) comment +SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b; +HEX(c) HEX(b) comment +40403F 404080 [ascii,tail][ascii,tail][tail] +40A1403F 40A14080 [ascii,tail][mb2,unassigned][tail] +40A1A33F 40A1A380 [ascii,tail][mb2][tail] +40FE403F 40FE4080 
[ascii,tail][mb2][tail] +A140403F A1404080 [mb2,unassigned][ascii,tail][tail] +A140A1403F A140A14080 [mb2,unassigned][mb2,unassigned][tail] +A140A1A33F A140A1A380 [mb2,unassigned][mb2][tail] +A140FE403F A140FE4080 [mb2,unassigned][mb2][tail] +A1A3403F A1A34080 [mb2][ascii,tail][tail] +A1A3A1403F A1A3A14080 [mb2][mb2,unassigned][tail] +A1A3A1A33F A1A3A1A380 [mb2][mb2][tail] +A1A3FE403F A1A3FE4080 [mb2][mb2][tail] +FE40403F FE404080 [mb2][ascii,tail][tail] +FE40A1403F FE40A14080 [mb2][mb2,unassigned][tail] +FE40A1A33F FE40A1A380 [mb2][mb2][tail] +FE40FE403F FE40FE4080 [mb2][mb2][tail] +DELETE FROM t2 WHERE b IN (SELECT b FROM t3); +DELETE FROM t3; +# +# Sequences that consist of two MB2 characters, +# followed by a non-ASCII head or tail, all should be fixed. +# +INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2 +WHERE FIND_IN_SET('mb2',type1) AND FIND_IN_SET('mb2',type2) +AND NOT FIND_IN_SET('ascii',type3) +AND NOT FIND_IN_SET('mb2',type3) +ORDER BY b; +Warnings: +Warning 1366 Incorrect string value: '\x81' for column 'c' at row 1 +Warning 1366 Incorrect string value: '\x81' for column 'c' at row 2 +Warning 1366 Incorrect string value: '\x81' for column 'c' at row 3 +Warning 1366 Incorrect string value: '\x81' for column 'c' at row 4 +Warning 1366 Incorrect string value: '\x81' for column 'c' at row 5 +Warning 1366 Incorrect string value: '\x81' for column 'c' at row 6 +Warning 1366 Incorrect string value: '\x81' for column 'c' at row 7 +Warning 1366 Incorrect string value: '\x81' for column 'c' at row 8 +Warning 1366 Incorrect string value: '\x81' for column 'c' at row 9 +SELECT COUNT(*) FROM t3; +COUNT(*) +9 +SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b; +HEX(c) comment +SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b; +HEX(c) HEX(b) comment +A140A1403F A140A14081 [mb2,unassigned][mb2,unassigned][head,tail] +A140A1A33F A140A1A381 [mb2,unassigned][mb2][head,tail] +A140FE403F A140FE4081 [mb2,unassigned][mb2][head,tail] +A1A3A1403F A1A3A14081 
[mb2][mb2,unassigned][head,tail] +A1A3A1A33F A1A3A1A381 [mb2][mb2][head,tail] +A1A3FE403F A1A3FE4081 [mb2][mb2][head,tail] +FE40A1403F FE40A14081 [mb2][mb2,unassigned][head,tail] +FE40A1A33F FE40A1A381 [mb2][mb2][head,tail] +FE40FE403F FE40FE4081 [mb2][mb2][head,tail] +DELETE FROM t2 WHERE b IN (SELECT b FROM t3); +DELETE FROM t3; +# +# Sequences that consist of head + tail + MB2 should go without warnings +# +INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2 +WHERE FIND_IN_SET('head',type1) +AND FIND_IN_SET('tail',type2) +AND FIND_IN_SET('mb2',type3) +ORDER BY b; +SELECT COUNT(*) FROM t3; +COUNT(*) +9 +SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b; +HEX(c) comment +8140A140 [head,tail][ascii,tail][mb2,unassigned] +8140A1A3 [head,tail][ascii,tail][mb2] +8140FE40 [head,tail][ascii,tail][mb2] +8180A140 [head,tail][tail][mb2,unassigned] +8180A1A3 [head,tail][tail][mb2] +8180FE40 [head,tail][tail][mb2] +8181A140 [head,tail][head,tail][mb2,unassigned] +8181A1A3 [head,tail][head,tail][mb2] +8181FE40 [head,tail][head,tail][mb2] +SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b; +HEX(c) HEX(b) comment +DELETE FROM t2 WHERE b IN (SELECT b FROM t3); +DELETE FROM t3; +# +# Sequences that consist of (ascii or mb2) + head + tail should go without warnings +# +INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2 +WHERE (FIND_IN_SET('ascii',type1) OR FIND_IN_SET('mb2',type1)) +AND FIND_IN_SET('head',type2) +AND FIND_IN_SET('tail',type3) +ORDER BY b; +SELECT COUNT(*) FROM t3; +COUNT(*) +12 +SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b; +HEX(c) comment +408140 [ascii,tail][head,tail][ascii,tail] +408180 [ascii,tail][head,tail][tail] +408181 [ascii,tail][head,tail][head,tail] +A1408140 [mb2,unassigned][head,tail][ascii,tail] +A1408180 [mb2,unassigned][head,tail][tail] +A1408181 [mb2,unassigned][head,tail][head,tail] +A1A38140 [mb2][head,tail][ascii,tail] +A1A38180 [mb2][head,tail][tail] +A1A38181 [mb2][head,tail][head,tail] +FE408140 
[mb2][head,tail][ascii,tail] +FE408180 [mb2][head,tail][tail] +FE408181 [mb2][head,tail][head,tail] +SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b; +HEX(c) HEX(b) comment +DELETE FROM t2 WHERE b IN (SELECT b FROM t3); +DELETE FROM t3; +INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2 ORDER BY b; +Warnings: +Warning 1366 Incorrect string value: '\x81' for column 'c' at row 1 +Warning 1366 Incorrect string value: '\xA3' for column 'c' at row 3 +Warning 1366 Incorrect string value: '\x81' for column 'c' at row 5 +Warning 1366 Incorrect string value: '\x81' for column 'c' at row 6 +Warning 1366 Incorrect string value: '\x81' for column 'c' at row 7 +Warning 1366 Incorrect string value: '\x80' for column 'c' at row 9 +Warning 1366 Incorrect string value: '\x81' for column 'c' at row 10 +Warning 1366 Incorrect string value: '\x80' for column 'c' at row 12 +Warning 1366 Incorrect string value: '\x81' for column 'c' at row 13 +Warning 1366 Incorrect string value: '\x80' for column 'c' at row 15 +Warning 1366 Incorrect string value: '\x81' for column 'c' at row 16 +Warning 1366 Incorrect string value: '\x80' for column 'c' at row 18 +Warning 1366 Incorrect string value: '\x81' for column 'c' at row 19 +Warning 1366 Incorrect string value: '\xA3' for column 'c' at row 27 +Warning 1366 Incorrect string value: '\x80' for column 'c' at row 30 +Warning 1366 Incorrect string value: '\x81' for column 'c' at row 31 +Warning 1366 Incorrect string value: '\x81' for column 'c' at row 35 +Warning 1366 Incorrect string value: '\xA3' for column 'c' at row 37 +Warning 1366 Incorrect string value: '\x81' for column 'c' at row 39 +Warning 1366 Incorrect string value: '\xA3' for column 'c' at row 41 +Warning 1366 Incorrect string value: '\x81' for column 'c' at row 43 +Warning 1366 Incorrect string value: '\xA3' for column 'c' at row 45 +SELECT COUNT(*) FROM t3; +COUNT(*) +46 +SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b; +HEX(c) comment +4081A140 
[ascii,tail][head,tail][mb2,unassigned] +4081FE40 [ascii,tail][head,tail][mb2] +814040 [head,tail][ascii,tail][ascii,tail] +818040 [head,tail][tail][ascii,tail] +818140 [head,tail][head,tail][ascii,tail] +81A14040 [head,tail][mb2,unassigned][ascii,tail] +81A140A140 [head,tail][mb2,unassigned][mb2,unassigned] +81A140A1A3 [head,tail][mb2,unassigned][mb2] +81A140FE40 [head,tail][mb2,unassigned][mb2] +81A1A340 [head,tail][mb2][ascii,tail] +81A1A380 [head,tail][mb2][tail] +81A1A381 [head,tail][mb2][head,tail] +81A1A3A140 [head,tail][mb2][mb2,unassigned] +81A1A3FE40 [head,tail][mb2][mb2] +81FE4040 [head,tail][mb2][ascii,tail] +81FE40A140 [head,tail][mb2][mb2,unassigned] +81FE40A1A3 [head,tail][mb2][mb2] +81FE40FE40 [head,tail][mb2][mb2] +A14081A140 [mb2,unassigned][head,tail][mb2,unassigned] +A14081FE40 [mb2,unassigned][head,tail][mb2] +A1A381A140 [mb2][head,tail][mb2,unassigned] +A1A381FE40 [mb2][head,tail][mb2] +FE4081A140 [mb2][head,tail][mb2,unassigned] +FE4081FE40 [mb2][head,tail][mb2] +SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b; +HEX(c) HEX(b) comment +40403F 404081 [ascii,tail][ascii,tail][head,tail] +4081A13F 4081A1A3 [ascii,tail][head,tail][mb2] +40A1403F 40A14081 [ascii,tail][mb2,unassigned][head,tail] +40A1A33F 40A1A381 [ascii,tail][mb2][head,tail] +40FE403F 40FE4081 [ascii,tail][mb2][head,tail] +81403F 814080 [head,tail][ascii,tail][tail] +81403F 814081 [head,tail][ascii,tail][head,tail] +81803F 818080 [head,tail][tail][tail] +81803F 818081 [head,tail][tail][head,tail] +81813F 818180 [head,tail][head,tail][tail] +81813F 818181 [head,tail][head,tail][head,tail] +81A1403F 81A14080 [head,tail][mb2,unassigned][tail] +81A1403F 81A14081 [head,tail][mb2,unassigned][head,tail] +81A1A3A13F 81A1A3A1A3 [head,tail][mb2][mb2] +81FE403F 81FE4080 [head,tail][mb2][tail] +81FE403F 81FE4081 [head,tail][mb2][head,tail] +A140403F A1404081 [mb2,unassigned][ascii,tail][head,tail] +A14081A13F A14081A1A3 [mb2,unassigned][head,tail][mb2] +A1A3403F A1A34081 
[mb2][ascii,tail][head,tail] +A1A381A13F A1A381A1A3 [mb2][head,tail][mb2] +FE40403F FE404081 [mb2][ascii,tail][head,tail] +FE4081A13F FE4081A1A3 [mb2][head,tail][mb2] +DROP TABLE t3; +DROP TABLE t2; +DROP TABLE t1; +# +# END OF MDEV-6566 Different INSERT behaviour on bad bytes with and without character set conversion +# +# +# End of 10.1 tests +# diff --git a/mysql-test/r/ctype_sjis.result b/mysql-test/r/ctype_sjis.result index 48456c16705..b4ef6f8c7e5 100644 --- a/mysql-test/r/ctype_sjis.result +++ b/mysql-test/r/ctype_sjis.result @@ -477,7 +477,7 @@ Warning 1366 Incorrect string value: '\x80_' for column 'a' at row 64 SELECT COUNT(*) FROM t1; COUNT(*) 14623 -SELECT COUNT(*) FROM t1 WHERE a<>'' AND OCTET_LENGTH(a)=1; +SELECT COUNT(*) FROM t1 WHERE a<>'?' AND OCTET_LENGTH(a)=1; COUNT(*) 63 SELECT COUNT(*) FROM t1 WHERE a<>'' AND OCTET_LENGTH(a)=2; diff --git a/mysql-test/r/ctype_ujis.result b/mysql-test/r/ctype_ujis.result index 413ab4efe31..4074d98c00d 100644 --- a/mysql-test/r/ctype_ujis.result +++ b/mysql-test/r/ctype_ujis.result @@ -2626,7 +2626,7 @@ Warning 1366 Incorrect string value: '\x80_' for column 'a' at row 64 SELECT COUNT(*) FROM t1; COUNT(*) 44671 -SELECT COUNT(*) FROM t1 WHERE a<>''; +SELECT COUNT(*) FROM t1 WHERE a<>'?'; COUNT(*) 17735 SELECT COUNT(*) FROM t1 WHERE a<>'' AND OCTET_LENGTH(a)=2; @@ -25938,7 +25938,7 @@ CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET ujis); INSERT INTO t1 VALUES (0x8EA0); SELECT HEX(a), CHAR_LENGTH(a) FROM t1; HEX(a) CHAR_LENGTH(a) - 0 +3F3F 2 DROP TABLE t1; SELECT _ujis 0x8EA0; ERROR HY000: Invalid ujis character string: '8EA0' diff --git a/mysql-test/r/ctype_utf8.result b/mysql-test/r/ctype_utf8.result index 4b23b010c79..2779ea5fa0f 100644 --- a/mysql-test/r/ctype_utf8.result +++ b/mysql-test/r/ctype_utf8.result @@ -225,7 +225,7 @@ Warnings: Warning 1366 Incorrect string value: '\xFF' for column 's1' at row 1 select hex(s1) from t1; hex(s1) -41 +413F drop table t1; create table t1 (s1 varchar(10) character set utf8); 
insert into t1 values (0x41FF); @@ -233,7 +233,7 @@ Warnings: Warning 1366 Incorrect string value: '\xFF' for column 's1' at row 1 select hex(s1) from t1; hex(s1) -41 +413F drop table t1; create table t1 (s1 text character set utf8); insert into t1 values (0x41FF); @@ -241,7 +241,7 @@ Warnings: Warning 1366 Incorrect string value: '\xFF' for column 's1' at row 1 select hex(s1) from t1; hex(s1) -41 +413F drop table t1; create table t1 (a text character set utf8, primary key(a(371))); ERROR 42000: Specified key was too long; max key length is 1000 bytes diff --git a/mysql-test/r/ctype_utf8mb4.result b/mysql-test/r/ctype_utf8mb4.result index 0dc94e90454..d8f4eb32132 100644 --- a/mysql-test/r/ctype_utf8mb4.result +++ b/mysql-test/r/ctype_utf8mb4.result @@ -225,7 +225,7 @@ Warnings: Warning 1366 Incorrect string value: '\xFF' for column 's1' at row 1 select hex(s1) from t1; hex(s1) -41 +413F drop table t1; create table t1 (s1 varchar(10) character set utf8mb4); insert into t1 values (0x41FF); @@ -233,7 +233,7 @@ Warnings: Warning 1366 Incorrect string value: '\xFF' for column 's1' at row 1 select hex(s1) from t1; hex(s1) -41 +413F drop table t1; create table t1 (s1 text character set utf8mb4); insert into t1 values (0x41FF); @@ -241,7 +241,7 @@ Warnings: Warning 1366 Incorrect string value: '\xFF' for column 's1' at row 1 select hex(s1) from t1; hex(s1) -41 +413F drop table t1; create table t1 (a text character set utf8mb4, primary key(a(371))); ERROR 42000: Specified key was too long; max key length is 1000 bytes @@ -2327,7 +2327,7 @@ select hex(utf8mb4) from t1; hex(utf8mb4) F0908080 F0BFBFBF - +3F delete from t1; Testing [F2..F3][80..BF][80..BF][80..BF] insert into t1 values (0xF2808080); @@ -2347,7 +2347,7 @@ select hex(utf8mb4) from t1; hex(utf8mb4) F4808080 F48F8080 - +3F drop table t1; # # Check strnxfrm() with odd length @@ -2472,45 +2472,45 @@ F3A087AFEA9DA8 F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480EA9DA8 
EFB9ABF09D849EF09D859EF09D859EF09D8480F09D859FEFB9ABEFB9ABF09D85A0EFB9ABEA9DA8 F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480EA9DA8 -EA9DA8 +3F3F3F3FEA9DA8 SELECT HEX(CONCAT(utf8mb4_encoding, utf8mb3_encoding)) FROM t1,t2; HEX(CONCAT(utf8mb4_encoding, utf8mb3_encoding)) F09D8480EA9DA8 F09D8480EFB9AB -F09D8480 +F09D84803F3F3F3F F09D849EEA9DA8 F09D849EEFB9AB -F09D849E +F09D849E3F3F3F3F F09D859EEA9DA8 F09D859EEFB9AB -F09D859E +F09D859E3F3F3F3F F09D878FEA9DA8 F09D878FEFB9AB -F09D878F +F09D878F3F3F3F3F F09D9C9FEA9DA8 F09D9C9FEFB9AB -F09D9C9F +F09D9C9F3F3F3F3F F09D9E9FEA9DA8 F09D9E9FEFB9AB -F09D9E9F +F09D9E9F3F3F3F3F F48FBFBFEA9DA8 F48FBFBFEFB9AB -F48FBFBF +F48FBFBF3F3F3F3F F3A087AFEA9DA8 F3A087AFEFB9AB -F3A087AF +F3A087AF3F3F3F3F F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480EA9DA8 F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480EFB9AB -F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480 +F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D84803F3F3F3F EFB9ABF09D849EF09D859EF09D859EF09D8480F09D859FEFB9ABEFB9ABF09D85A0EFB9ABEA9DA8 EFB9ABF09D849EF09D859EF09D859EF09D8480F09D859FEFB9ABEFB9ABF09D85A0EFB9ABEFB9AB -EFB9ABF09D849EF09D859EF09D859EF09D8480F09D859FEFB9ABEFB9ABF09D85A0EFB9AB +EFB9ABF09D849EF09D859EF09D859EF09D8480F09D859FEFB9ABEFB9ABF09D85A0EFB9AB3F3F3F3F F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480EA9DA8 F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480EFB9AB -F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480 -EA9DA8 -EFB9AB - +F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D84803F3F3F3F +3F3F3F3FEA9DA8 +3F3F3F3FEFB9AB +3F3F3F3F3F3F3F3F SELECT count(*) FROM t1, t2 WHERE t1.utf8mb4_encoding > t2.utf8mb3_encoding; count(*) @@ -2547,7 +2547,7 @@ u_decimal hex(utf8mb4_encoding) utf8mb4_encoding 
119070 3F3F3F3F3F3F3F3F3F3F ?????????? 65131 EFB9AB3F3F3F3F3FEFB9ABEFB9AB3FEFB9AB ﹫?????﹫﹫?﹫ 119070 3F3F3F3F3F3F3F3F3F3F ?????????? -1114111 +1114111 3F3F3F3F ???? ALTER TABLE t2 CONVERT TO CHARACTER SET utf8mb4; SHOW CREATE TABLE t2; Table Create Table @@ -2559,7 +2559,7 @@ SELECT u_decimal,hex(utf8mb3_encoding) FROM t2; u_decimal hex(utf8mb3_encoding) 42856 EA9DA8 65131 EFB9AB -1114111 +1114111 3F3F3F3F ALTER TABLE t2 CONVERT TO CHARACTER SET utf8mb3; SHOW CREATE TABLE t2; Table Create Table @@ -2571,7 +2571,7 @@ SELECT u_decimal,hex(utf8mb3_encoding) FROM t2; u_decimal hex(utf8mb3_encoding) 42856 EA9DA8 65131 EFB9AB -1114111 +1114111 3F3F3F3F ALTER TABLE t1 MODIFY utf8mb4_encoding VARCHAR(10) CHARACTER SET utf8mb3; SHOW CREATE TABLE t1; Table Create Table @@ -2592,7 +2592,7 @@ u_decimal hex(utf8mb4_encoding) 119070 3F3F3F3F3F3F3F3F3F3F 65131 EFB9AB3F3F3F3F3FEFB9ABEFB9AB3FEFB9AB 119070 3F3F3F3F3F3F3F3F3F3F -1114111 +1114111 3F3F3F3F ALTER TABLE t1 MODIFY utf8mb4_encoding VARCHAR(10) CHARACTER SET utf8mb4; SHOW CREATE TABLE t1; Table Create Table @@ -2613,7 +2613,7 @@ u_decimal hex(utf8mb4_encoding) 119070 3F3F3F3F3F3F3F3F3F3F 65131 EFB9AB3F3F3F3F3FEFB9ABEFB9AB3FEFB9AB 119070 3F3F3F3F3F3F3F3F3F3F -1114111 +1114111 3F3F3F3F ALTER TABLE t2 MODIFY utf8mb3_encoding VARCHAR(10) CHARACTER SET utf8mb4; SHOW CREATE TABLE t2; Table Create Table @@ -2625,7 +2625,7 @@ SELECT u_decimal,hex(utf8mb3_encoding) FROM t2; u_decimal hex(utf8mb3_encoding) 42856 EA9DA8 65131 EFB9AB -1114111 +1114111 3F3F3F3F DROP TABLE IF EXISTS t3; CREATE TABLE t3 ( u_decimal int NOT NULL, @@ -3306,5 +3306,53 @@ DFFFFFDFFFFF9CFFFF9DFFFF9EFFFF # End of 5.6 tests # # +# Start of 10.0 tests +# +# +# MDEV-6566 Different INSERT behaviour on bad bytes with and without character set conversion +# +# +# This test sets session character set to 3-byte utf8, +# but then sends a 4-byte sequence (which is wrong for 3-byte utf8). +# It should be replaced to four question marks: '????' in both columns +# (i.e. 
four unknown bytes are replaced to four question marks), +# then the rest of the string should be stored, so we get 'a ???? b'. +# +SET NAMES utf8; +CREATE TABLE t1 ( +a VARCHAR(32) CHARACTER SET utf8mb4, +b VARCHAR(32) CHARACTER SET utf8 +); +INSERT INTO t1 SELECT 'a 😁 b', 'a 😁 b'; +Warnings: +Warning 1366 Incorrect string value: '\xF0\x9F\x98\x81 b' for column 'a' at row 1 +Warning 1366 Incorrect string value: '\xF0\x9F\x98\x81 b' for column 'b' at row 1 +SELECT * FROM t1; +a b +a ???? b a ???? b +DROP TABLE t1; +# +# This test sets session character set to 4-byte utf8, +# then normally sends a 4-byte sequence. +# It should be stored AS IS into the utf8mb4 column (a), +# and should be replaced to a single question mark in the utf8 column (b) +# (i.e. one character that cannot be converted is replaced to one question mark). +# +SET NAMES utf8mb4; +CREATE TABLE t1 ( +a VARCHAR(32) CHARACTER SET utf8mb4, +b VARCHAR(32) CHARACTER SET utf8 +); +INSERT INTO t1 SELECT 'a 😁 b', 'a 😁 b'; +Warnings: +Warning 1366 Incorrect string value: '\xF0\x9F\x98\x81 b' for column 'b' at row 1 +SELECT * FROM t1; +a b +a 😁 b a ? 
b +DROP TABLE t1; +# +# End of 10.0 tests +# +# # End of tests # diff --git a/mysql-test/r/ctype_utf8mb4_heap.result b/mysql-test/r/ctype_utf8mb4_heap.result index 57d29a24fd0..7f5125ae2ba 100644 --- a/mysql-test/r/ctype_utf8mb4_heap.result +++ b/mysql-test/r/ctype_utf8mb4_heap.result @@ -225,7 +225,7 @@ Warnings: Warning 1366 Incorrect string value: '\xFF' for column 's1' at row 1 select hex(s1) from t1; hex(s1) -41 +413F drop table t1; create table t1 (s1 varchar(10) character set utf8mb4) engine heap; insert into t1 values (0x41FF); @@ -233,7 +233,7 @@ Warnings: Warning 1366 Incorrect string value: '\xFF' for column 's1' at row 1 select hex(s1) from t1; hex(s1) -41 +413F drop table t1; CREATE TABLE t1 ( a varchar(10) ) CHARACTER SET utf8mb4 ENGINE heap; INSERT INTO t1 VALUES ( 'test' ); @@ -2157,7 +2157,7 @@ Warnings: Warning 1366 Incorrect string value: '\xF0\x8F\x80\x80' for column 'utf8mb4' at row 1 select hex(utf8mb4) from t1; hex(utf8mb4) - +3F F0908080 F0BFBFBF delete from t1; @@ -2177,7 +2177,7 @@ Warnings: Warning 1366 Incorrect string value: '\xF4\x90\x80\x80' for column 'utf8mb4' at row 1 select hex(utf8mb4) from t1; hex(utf8mb4) - +3F F4808080 F48F8080 drop table t1; @@ -2274,7 +2274,7 @@ Warning 1366 Incorrect string value: '\xF4\x8F\xBF\xBD' for column 'utf8mb3_enco UPDATE t2 SET utf8mb3_encoding= _utf8mb4 x'ea9da8' where u_decimal= 42856; SELECT HEX(CONCAT(utf8mb4_encoding, _utf8 x'ea9da8')) FROM t1; HEX(CONCAT(utf8mb4_encoding, _utf8 x'ea9da8')) -EA9DA8 +3F3F3F3FEA9DA8 EFB9ABF09D849EF09D859EF09D859EF09D8480F09D859FEFB9ABEFB9ABF09D85A0EFB9ABEA9DA8 F09D8480EA9DA8 F09D849EEA9DA8 @@ -2288,40 +2288,40 @@ F3A087AFEA9DA8 F48FBFBFEA9DA8 SELECT HEX(CONCAT(utf8mb4_encoding, utf8mb3_encoding)) FROM t1,t2; HEX(CONCAT(utf8mb4_encoding, utf8mb3_encoding)) - -EA9DA8 -EFB9AB -EFB9ABF09D849EF09D859EF09D859EF09D8480F09D859FEFB9ABEFB9ABF09D85A0EFB9AB +3F3F3F3F3F3F3F3F +3F3F3F3FEA9DA8 +3F3F3F3FEFB9AB 
+EFB9ABF09D849EF09D859EF09D859EF09D8480F09D859FEFB9ABEFB9ABF09D85A0EFB9AB3F3F3F3F EFB9ABF09D849EF09D859EF09D859EF09D8480F09D859FEFB9ABEFB9ABF09D85A0EFB9ABEA9DA8 EFB9ABF09D849EF09D859EF09D859EF09D8480F09D859FEFB9ABEFB9ABF09D85A0EFB9ABEFB9AB -F09D8480 +F09D84803F3F3F3F F09D8480EA9DA8 F09D8480EFB9AB -F09D849E +F09D849E3F3F3F3F F09D849EEA9DA8 F09D849EEFB9AB -F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480 -F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480 +F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D84803F3F3F3F +F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D84803F3F3F3F F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480EA9DA8 F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480EA9DA8 F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480EFB9AB F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480EFB9AB -F09D859E +F09D859E3F3F3F3F F09D859EEA9DA8 F09D859EEFB9AB -F09D878F +F09D878F3F3F3F3F F09D878FEA9DA8 F09D878FEFB9AB -F09D9C9F +F09D9C9F3F3F3F3F F09D9C9FEA9DA8 F09D9C9FEFB9AB -F09D9E9F +F09D9E9F3F3F3F3F F09D9E9FEA9DA8 F09D9E9FEFB9AB -F3A087AF +F3A087AF3F3F3F3F F3A087AFEA9DA8 F3A087AFEFB9AB -F48FBFBF +F48FBFBF3F3F3F3F F48FBFBFEA9DA8 F48FBFBFEFB9AB SELECT count(*) FROM t1, t2 @@ -2337,8 +2337,8 @@ t1 CREATE TABLE `t1` ( ) ENGINE=MEMORY DEFAULT CHARSET=utf8 SELECT u_decimal,hex(utf8mb4_encoding),utf8mb4_encoding FROM t1; u_decimal hex(utf8mb4_encoding) utf8mb4_encoding -1114111 1114111 3F ? +1114111 3F3F3F3F ???? 119040 3F ? 119070 3F ? 119070 3F3F3F3F3F3F3F3F3F3F ?????????? 
@@ -2358,7 +2358,7 @@ t2 CREATE TABLE `t2` ( ) ENGINE=MEMORY DEFAULT CHARSET=utf8mb4 SELECT u_decimal,hex(utf8mb3_encoding) FROM t2; u_decimal hex(utf8mb3_encoding) -1114111 +1114111 3F3F3F3F 42856 EA9DA8 65131 EFB9AB ALTER TABLE t2 CONVERT TO CHARACTER SET utf8mb3; @@ -2370,7 +2370,7 @@ t2 CREATE TABLE `t2` ( ) ENGINE=MEMORY DEFAULT CHARSET=utf8 SELECT u_decimal,hex(utf8mb3_encoding) FROM t2; u_decimal hex(utf8mb3_encoding) -1114111 +1114111 3F3F3F3F 42856 EA9DA8 65131 EFB9AB ALTER TABLE t1 MODIFY utf8mb4_encoding VARCHAR(10) CHARACTER SET utf8mb3; @@ -2382,8 +2382,8 @@ t1 CREATE TABLE `t1` ( ) ENGINE=MEMORY DEFAULT CHARSET=utf8 SELECT u_decimal,hex(utf8mb4_encoding) FROM t1; u_decimal hex(utf8mb4_encoding) -1114111 1114111 3F +1114111 3F3F3F3F 119040 3F 119070 3F 119070 3F3F3F3F3F3F3F3F3F3F @@ -2403,8 +2403,8 @@ t1 CREATE TABLE `t1` ( ) ENGINE=MEMORY DEFAULT CHARSET=utf8 SELECT u_decimal,hex(utf8mb4_encoding) FROM t1; u_decimal hex(utf8mb4_encoding) -1114111 1114111 3F +1114111 3F3F3F3F 119040 3F 119070 3F 119070 3F3F3F3F3F3F3F3F3F3F @@ -2424,7 +2424,7 @@ t2 CREATE TABLE `t2` ( ) ENGINE=MEMORY DEFAULT CHARSET=utf8 SELECT u_decimal,hex(utf8mb3_encoding) FROM t2; u_decimal hex(utf8mb3_encoding) -1114111 +1114111 3F3F3F3F 42856 EA9DA8 65131 EFB9AB DROP TABLE IF EXISTS t3; diff --git a/mysql-test/r/ctype_utf8mb4_innodb.result b/mysql-test/r/ctype_utf8mb4_innodb.result index ba03a3f66e6..053e6de8fe1 100644 --- a/mysql-test/r/ctype_utf8mb4_innodb.result +++ b/mysql-test/r/ctype_utf8mb4_innodb.result @@ -225,7 +225,7 @@ Warnings: Warning 1366 Incorrect string value: '\xFF' for column 's1' at row 1 select hex(s1) from t1; hex(s1) -41 +413F drop table t1; create table t1 (s1 varchar(10) character set utf8mb4) engine InnoDB; insert into t1 values (0x41FF); @@ -233,7 +233,7 @@ Warnings: Warning 1366 Incorrect string value: '\xFF' for column 's1' at row 1 select hex(s1) from t1; hex(s1) -41 +413F drop table t1; create table t1 (s1 text character set utf8mb4) engine InnoDB; 
insert into t1 values (0x41FF); @@ -241,7 +241,7 @@ Warnings: Warning 1366 Incorrect string value: '\xFF' for column 's1' at row 1 select hex(s1) from t1; hex(s1) -41 +413F drop table t1; create table t1 (a text character set utf8mb4, primary key(a(371))) engine InnoDB; ERROR 42000: Specified key was too long; max key length is 767 bytes @@ -2285,7 +2285,7 @@ Warnings: Warning 1366 Incorrect string value: '\xF0\x8F\x80\x80' for column 'utf8mb4' at row 1 select hex(utf8mb4) from t1; hex(utf8mb4) - +3F F0908080 F0BFBFBF delete from t1; @@ -2305,7 +2305,7 @@ Warnings: Warning 1366 Incorrect string value: '\xF4\x90\x80\x80' for column 'utf8mb4' at row 1 select hex(utf8mb4) from t1; hex(utf8mb4) - +3F F4808080 F48F8080 drop table t1; @@ -2421,7 +2421,7 @@ Warning 1366 Incorrect string value: '\xF4\x8F\xBF\xBD' for column 'utf8mb3_enco UPDATE t2 SET utf8mb3_encoding= _utf8mb4 x'ea9da8' where u_decimal= 42856; SELECT HEX(CONCAT(utf8mb4_encoding, _utf8 x'ea9da8')) FROM t1; HEX(CONCAT(utf8mb4_encoding, _utf8 x'ea9da8')) -EA9DA8 +3F3F3F3FEA9DA8 EFB9ABF09D849EF09D859EF09D859EF09D8480F09D859FEFB9ABEFB9ABF09D85A0EFB9ABEA9DA8 F09D8480EA9DA8 F09D849EEA9DA8 @@ -2435,40 +2435,40 @@ F3A087AFEA9DA8 F48FBFBFEA9DA8 SELECT HEX(CONCAT(utf8mb4_encoding, utf8mb3_encoding)) FROM t1,t2; HEX(CONCAT(utf8mb4_encoding, utf8mb3_encoding)) - -EA9DA8 -EFB9AB -EFB9ABF09D849EF09D859EF09D859EF09D8480F09D859FEFB9ABEFB9ABF09D85A0EFB9AB +3F3F3F3F3F3F3F3F +3F3F3F3FEA9DA8 +3F3F3F3FEFB9AB +EFB9ABF09D849EF09D859EF09D859EF09D8480F09D859FEFB9ABEFB9ABF09D85A0EFB9AB3F3F3F3F EFB9ABF09D849EF09D859EF09D859EF09D8480F09D859FEFB9ABEFB9ABF09D85A0EFB9ABEA9DA8 EFB9ABF09D849EF09D859EF09D859EF09D8480F09D859FEFB9ABEFB9ABF09D85A0EFB9ABEFB9AB -F09D8480 +F09D84803F3F3F3F F09D8480EA9DA8 F09D8480EFB9AB -F09D849E +F09D849E3F3F3F3F F09D849EEA9DA8 F09D849EEFB9AB -F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480 -F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480 
+F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D84803F3F3F3F +F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D84803F3F3F3F F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480EA9DA8 F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480EA9DA8 F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480EFB9AB F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480EFB9AB -F09D859E +F09D859E3F3F3F3F F09D859EEA9DA8 F09D859EEFB9AB -F09D878F +F09D878F3F3F3F3F F09D878FEA9DA8 F09D878FEFB9AB -F09D9C9F +F09D9C9F3F3F3F3F F09D9C9FEA9DA8 F09D9C9FEFB9AB -F09D9E9F +F09D9E9F3F3F3F3F F09D9E9FEA9DA8 F09D9E9FEFB9AB -F3A087AF +F3A087AF3F3F3F3F F3A087AFEA9DA8 F3A087AFEFB9AB -F48FBFBF +F48FBFBF3F3F3F3F F48FBFBFEA9DA8 F48FBFBFEFB9AB SELECT count(*) FROM t1, t2 @@ -2484,8 +2484,8 @@ t1 CREATE TABLE `t1` ( ) ENGINE=InnoDB DEFAULT CHARSET=utf8 SELECT u_decimal,hex(utf8mb4_encoding),utf8mb4_encoding FROM t1; u_decimal hex(utf8mb4_encoding) utf8mb4_encoding -1114111 1114111 3F ? +1114111 3F3F3F3F ???? 119040 3F ? 119070 3F ? 119070 3F3F3F3F3F3F3F3F3F3F ?????????? 
@@ -2505,7 +2505,7 @@ t2 CREATE TABLE `t2` ( ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 SELECT u_decimal,hex(utf8mb3_encoding) FROM t2; u_decimal hex(utf8mb3_encoding) -1114111 +1114111 3F3F3F3F 42856 EA9DA8 65131 EFB9AB ALTER TABLE t2 CONVERT TO CHARACTER SET utf8mb3; @@ -2517,7 +2517,7 @@ t2 CREATE TABLE `t2` ( ) ENGINE=InnoDB DEFAULT CHARSET=utf8 SELECT u_decimal,hex(utf8mb3_encoding) FROM t2; u_decimal hex(utf8mb3_encoding) -1114111 +1114111 3F3F3F3F 42856 EA9DA8 65131 EFB9AB ALTER TABLE t1 MODIFY utf8mb4_encoding VARCHAR(10) CHARACTER SET utf8mb3; @@ -2529,8 +2529,8 @@ t1 CREATE TABLE `t1` ( ) ENGINE=InnoDB DEFAULT CHARSET=utf8 SELECT u_decimal,hex(utf8mb4_encoding) FROM t1; u_decimal hex(utf8mb4_encoding) -1114111 1114111 3F +1114111 3F3F3F3F 119040 3F 119070 3F 119070 3F3F3F3F3F3F3F3F3F3F @@ -2550,8 +2550,8 @@ t1 CREATE TABLE `t1` ( ) ENGINE=InnoDB DEFAULT CHARSET=utf8 SELECT u_decimal,hex(utf8mb4_encoding) FROM t1; u_decimal hex(utf8mb4_encoding) -1114111 1114111 3F +1114111 3F3F3F3F 119040 3F 119070 3F 119070 3F3F3F3F3F3F3F3F3F3F @@ -2571,7 +2571,7 @@ t2 CREATE TABLE `t2` ( ) ENGINE=InnoDB DEFAULT CHARSET=utf8 SELECT u_decimal,hex(utf8mb3_encoding) FROM t2; u_decimal hex(utf8mb3_encoding) -1114111 +1114111 3F3F3F3F 42856 EA9DA8 65131 EFB9AB DROP TABLE IF EXISTS t3; diff --git a/mysql-test/r/ctype_utf8mb4_myisam.result b/mysql-test/r/ctype_utf8mb4_myisam.result index c4ff8e0a882..5bfdfe8ca71 100644 --- a/mysql-test/r/ctype_utf8mb4_myisam.result +++ b/mysql-test/r/ctype_utf8mb4_myisam.result @@ -225,7 +225,7 @@ Warnings: Warning 1366 Incorrect string value: '\xFF' for column 's1' at row 1 select hex(s1) from t1; hex(s1) -41 +413F drop table t1; create table t1 (s1 varchar(10) character set utf8mb4) engine MyISAM; insert into t1 values (0x41FF); @@ -233,7 +233,7 @@ Warnings: Warning 1366 Incorrect string value: '\xFF' for column 's1' at row 1 select hex(s1) from t1; hex(s1) -41 +413F drop table t1; create table t1 (s1 text character set utf8mb4) engine MyISAM; 
insert into t1 values (0x41FF); @@ -241,7 +241,7 @@ Warnings: Warning 1366 Incorrect string value: '\xFF' for column 's1' at row 1 select hex(s1) from t1; hex(s1) -41 +413F drop table t1; create table t1 (a text character set utf8mb4, primary key(a(371))) engine MyISAM; ERROR 42000: Specified key was too long; max key length is 1000 bytes @@ -2285,7 +2285,7 @@ Warnings: Warning 1366 Incorrect string value: '\xF0\x8F\x80\x80' for column 'utf8mb4' at row 1 select hex(utf8mb4) from t1; hex(utf8mb4) - +3F F0908080 F0BFBFBF delete from t1; @@ -2305,7 +2305,7 @@ Warnings: Warning 1366 Incorrect string value: '\xF4\x90\x80\x80' for column 'utf8mb4' at row 1 select hex(utf8mb4) from t1; hex(utf8mb4) - +3F F4808080 F48F8080 drop table t1; @@ -2421,7 +2421,7 @@ Warning 1366 Incorrect string value: '\xF4\x8F\xBF\xBD' for column 'utf8mb3_enco UPDATE t2 SET utf8mb3_encoding= _utf8mb4 x'ea9da8' where u_decimal= 42856; SELECT HEX(CONCAT(utf8mb4_encoding, _utf8 x'ea9da8')) FROM t1; HEX(CONCAT(utf8mb4_encoding, _utf8 x'ea9da8')) -EA9DA8 +3F3F3F3FEA9DA8 EFB9ABF09D849EF09D859EF09D859EF09D8480F09D859FEFB9ABEFB9ABF09D85A0EFB9ABEA9DA8 F09D8480EA9DA8 F09D849EEA9DA8 @@ -2435,40 +2435,40 @@ F3A087AFEA9DA8 F48FBFBFEA9DA8 SELECT HEX(CONCAT(utf8mb4_encoding, utf8mb3_encoding)) FROM t1,t2; HEX(CONCAT(utf8mb4_encoding, utf8mb3_encoding)) - -EA9DA8 -EFB9AB -EFB9ABF09D849EF09D859EF09D859EF09D8480F09D859FEFB9ABEFB9ABF09D85A0EFB9AB +3F3F3F3F3F3F3F3F +3F3F3F3FEA9DA8 +3F3F3F3FEFB9AB +EFB9ABF09D849EF09D859EF09D859EF09D8480F09D859FEFB9ABEFB9ABF09D85A0EFB9AB3F3F3F3F EFB9ABF09D849EF09D859EF09D859EF09D8480F09D859FEFB9ABEFB9ABF09D85A0EFB9ABEA9DA8 EFB9ABF09D849EF09D859EF09D859EF09D8480F09D859FEFB9ABEFB9ABF09D85A0EFB9ABEFB9AB -F09D8480 +F09D84803F3F3F3F F09D8480EA9DA8 F09D8480EFB9AB -F09D849E +F09D849E3F3F3F3F F09D849EEA9DA8 F09D849EEFB9AB -F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480 -F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480 
+F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D84803F3F3F3F +F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D84803F3F3F3F F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480EA9DA8 F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480EA9DA8 F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480EFB9AB F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480EFB9AB -F09D859E +F09D859E3F3F3F3F F09D859EEA9DA8 F09D859EEFB9AB -F09D878F +F09D878F3F3F3F3F F09D878FEA9DA8 F09D878FEFB9AB -F09D9C9F +F09D9C9F3F3F3F3F F09D9C9FEA9DA8 F09D9C9FEFB9AB -F09D9E9F +F09D9E9F3F3F3F3F F09D9E9FEA9DA8 F09D9E9FEFB9AB -F3A087AF +F3A087AF3F3F3F3F F3A087AFEA9DA8 F3A087AFEFB9AB -F48FBFBF +F48FBFBF3F3F3F3F F48FBFBFEA9DA8 F48FBFBFEFB9AB SELECT count(*) FROM t1, t2 @@ -2484,8 +2484,8 @@ t1 CREATE TABLE `t1` ( ) ENGINE=MyISAM DEFAULT CHARSET=utf8 SELECT u_decimal,hex(utf8mb4_encoding),utf8mb4_encoding FROM t1; u_decimal hex(utf8mb4_encoding) utf8mb4_encoding -1114111 1114111 3F ? +1114111 3F3F3F3F ???? 119040 3F ? 119070 3F ? 119070 3F3F3F3F3F3F3F3F3F3F ?????????? 
@@ -2505,7 +2505,7 @@ t2 CREATE TABLE `t2` ( ) ENGINE=MyISAM DEFAULT CHARSET=utf8mb4 SELECT u_decimal,hex(utf8mb3_encoding) FROM t2; u_decimal hex(utf8mb3_encoding) -1114111 +1114111 3F3F3F3F 42856 EA9DA8 65131 EFB9AB ALTER TABLE t2 CONVERT TO CHARACTER SET utf8mb3; @@ -2517,7 +2517,7 @@ t2 CREATE TABLE `t2` ( ) ENGINE=MyISAM DEFAULT CHARSET=utf8 SELECT u_decimal,hex(utf8mb3_encoding) FROM t2; u_decimal hex(utf8mb3_encoding) -1114111 +1114111 3F3F3F3F 42856 EA9DA8 65131 EFB9AB ALTER TABLE t1 MODIFY utf8mb4_encoding VARCHAR(10) CHARACTER SET utf8mb3; @@ -2529,8 +2529,8 @@ t1 CREATE TABLE `t1` ( ) ENGINE=MyISAM DEFAULT CHARSET=utf8 SELECT u_decimal,hex(utf8mb4_encoding) FROM t1; u_decimal hex(utf8mb4_encoding) -1114111 1114111 3F +1114111 3F3F3F3F 119040 3F 119070 3F 119070 3F3F3F3F3F3F3F3F3F3F @@ -2550,8 +2550,8 @@ t1 CREATE TABLE `t1` ( ) ENGINE=MyISAM DEFAULT CHARSET=utf8 SELECT u_decimal,hex(utf8mb4_encoding) FROM t1; u_decimal hex(utf8mb4_encoding) -1114111 1114111 3F +1114111 3F3F3F3F 119040 3F 119070 3F 119070 3F3F3F3F3F3F3F3F3F3F @@ -2571,7 +2571,7 @@ t2 CREATE TABLE `t2` ( ) ENGINE=MyISAM DEFAULT CHARSET=utf8 SELECT u_decimal,hex(utf8mb3_encoding) FROM t2; u_decimal hex(utf8mb3_encoding) -1114111 +1114111 3F3F3F3F 42856 EA9DA8 65131 EFB9AB DROP TABLE IF EXISTS t3; diff --git a/mysql-test/suite/funcs_2/include/check_charset.inc b/mysql-test/suite/funcs_2/include/check_charset.inc index df4a58d0eeb..0242d4390ac 100644 --- a/mysql-test/suite/funcs_2/include/check_charset.inc +++ b/mysql-test/suite/funcs_2/include/check_charset.inc @@ -22,13 +22,15 @@ SHOW TABLE STATUS LIKE 't1'; --disable_warnings --disable_query_log +ALTER TABLE test.t1 ADD code VARCHAR(16) NOT NULL; let $1= 221; while ($1) { - eval INSERT INTO test.t1 VALUES(CHAR(254-$1)); + eval INSERT INTO test.t1 VALUES(CHAR(254-$1), HEX(254-$1)); dec $1; } DELETE FROM test.t1 WHERE CHAR_LENGTH(a) <> 1; +DELETE FROM test.t1 WHERE a='?' 
AND code<>'3F'; --enable_query_log --enable_warnings diff --git a/mysql-test/suite/innodb/r/innodb-update-insert.result b/mysql-test/suite/innodb/r/innodb-update-insert.result index cd0fed101ab..034a63bca6c 100644 --- a/mysql-test/suite/innodb/r/innodb-update-insert.result +++ b/mysql-test/suite/innodb/r/innodb-update-insert.result @@ -30,7 +30,7 @@ Warnings: Warning 1366 Incorrect string value: '\xA3' for column 'f1' at row 1 select f1 from t1; f1 - +? update t1 set f1=0x6a; update t1 set f3=repeat(0xb1,8103); update t1 set f1=0x4a; @@ -39,5 +39,5 @@ Warnings: Warning 1366 Incorrect string value: '\x82' for column 'f1' at row 1 select f1 from t1; f1 - +? drop table t1; diff --git a/mysql-test/t/ctype_big5.test b/mysql-test/t/ctype_big5.test index 5c0bdff4633..46bb29514ff 100644 --- a/mysql-test/t/ctype_big5.test +++ b/mysql-test/t/ctype_big5.test @@ -121,7 +121,7 @@ DROP TEMPORARY TABLE head, tail; SHOW CREATE TABLE t1; SELECT COUNT(*) FROM t1; UPDATE t1 SET a=unhex(code) ORDER BY code; -SELECT COUNT(*) FROM t1 WHERE a<>''; +SELECT COUNT(*) FROM t1 WHERE a<>'?'; # # Display all characters that have upper or lower case mapping. # diff --git a/mysql-test/t/ctype_cp932_binlog_stm.test b/mysql-test/t/ctype_cp932_binlog_stm.test index 304c9f5d05c..1b92006c949 100644 --- a/mysql-test/t/ctype_cp932_binlog_stm.test +++ b/mysql-test/t/ctype_cp932_binlog_stm.test @@ -99,7 +99,7 @@ DROP TEMPORARY TABLE head, tail; SHOW CREATE TABLE t1; UPDATE t1 SET a=unhex(code) ORDER BY code; SELECT COUNT(*) FROM t1; -SELECT COUNT(*) FROM t1 WHERE a<>'' AND OCTET_LENGTH(a)=1; +SELECT COUNT(*) FROM t1 WHERE a<>'?' AND OCTET_LENGTH(a)=1; SELECT COUNT(*) FROM t1 WHERE a<>'' AND OCTET_LENGTH(a)=2; # # Display all characters that have upper or lower case mapping. 
diff --git a/mysql-test/t/ctype_eucjpms.test b/mysql-test/t/ctype_eucjpms.test index 49ca81850ed..2dd806ed027 100644 --- a/mysql-test/t/ctype_eucjpms.test +++ b/mysql-test/t/ctype_eucjpms.test @@ -446,6 +446,7 @@ SHOW CREATE TABLE t1; UPDATE t1 SET a=unhex(code) ORDER BY code; SELECT COUNT(*) FROM t1; SELECT COUNT(*) FROM t1 WHERE a<>''; +SELECT COUNT(*) FROM t1 WHERE a<>'' AND a<>'?'; SELECT COUNT(*) FROM t1 WHERE a<>'' AND OCTET_LENGTH(a)=2; SELECT * FROM t1 WHERE CHAR_LENGTH(a)=2; SELECT COUNT(*) FROM t1 WHERE a<>'' AND OCTET_LENGTH(a)=3; diff --git a/mysql-test/t/ctype_euckr.test b/mysql-test/t/ctype_euckr.test index 155b8ebed00..33b3e96cba8 100644 --- a/mysql-test/t/ctype_euckr.test +++ b/mysql-test/t/ctype_euckr.test @@ -95,8 +95,8 @@ WHERE t11.a >= 0x81 AND t11.a <= 0xFE AND t12.a >= 0x41 AND t12.a <= 0xFE ORDER BY t11.a, t12.a; --enable_warnings -SELECT s as bad_code FROM t2 WHERE a='' ORDER BY s; -DELETE FROM t2 WHERE a=''; +SELECT s as bad_code FROM t2 WHERE a='?' ORDER BY s; +DELETE FROM t2 WHERE a='?'; ALTER TABLE t2 ADD u VARCHAR(1) CHARACTER SET utf8, ADD a2 VARCHAR(1) CHARACTER SET euckr; --disable_warnings UPDATE t2 SET u=a, a2=u; @@ -145,7 +145,7 @@ ORDER BY head, tail; DROP TEMPORARY TABLE head, tail; SHOW CREATE TABLE t1; UPDATE t1 SET a=unhex(code) ORDER BY code; -SELECT COUNT(*) FROM t1 WHERE a<>''; +SELECT COUNT(*) FROM t1 WHERE a<>'?'; # # Display all characters that have upper or lower case mapping. # diff --git a/mysql-test/t/ctype_gb2312.test b/mysql-test/t/ctype_gb2312.test index e3dd448f54c..3ca6941705c 100644 --- a/mysql-test/t/ctype_gb2312.test +++ b/mysql-test/t/ctype_gb2312.test @@ -69,7 +69,7 @@ ORDER BY head, tail; DROP TEMPORARY TABLE head, tail; SHOW CREATE TABLE t1; UPDATE t1 SET a=unhex(code) ORDER BY code; -SELECT COUNT(*) FROM t1 WHERE a<>''; +SELECT COUNT(*) FROM t1 WHERE a<>'?'; # # Display all characters that have upper or lower case mapping. 
# diff --git a/mysql-test/t/ctype_gbk.test b/mysql-test/t/ctype_gbk.test index d44009b6109..d98be88326e 100644 --- a/mysql-test/t/ctype_gbk.test +++ b/mysql-test/t/ctype_gbk.test @@ -104,7 +104,7 @@ ORDER BY head, tail; DROP TEMPORARY TABLE head, tail; SHOW CREATE TABLE t1; UPDATE t1 SET a=unhex(code) ORDER BY code; -SELECT COUNT(*) FROM t1 WHERE a<>''; +SELECT COUNT(*) FROM t1 WHERE a<>'?'; # # Display all characters that have upper or lower case mapping. # @@ -203,3 +203,228 @@ SET NAMES gbk; --echo # --echo # End of 10.0 tests --echo # + + +--echo # +--echo # Start of 10.1 tests +--echo # + +--echo # +--echo # MDEV-6566 Different INSERT behaviour on bad bytes with and without character set conversion +--echo # + +CREATE TABLE t1 ( + id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, + b VARBINARY(16), + type SET('ascii','bad','head','tail','mb2','unassigned') +); +INSERT INTO t1 (b, type) VALUES (0x40, 'ascii,tail'); +INSERT INTO t1 (b, type) VALUES (0x80, 'tail'); +INSERT INTO t1 (b, type) VALUES (0x81, 'head,tail'); +INSERT INTO t1 (b, type) VALUES (0xFF, 'bad'); +INSERT INTO t1 (b, type) VALUES (0xA140, 'mb2,unassigned'); +INSERT INTO t1 (b, type) VALUES (0xA1A3, 'mb2'); +INSERT INTO t1 (b, type) VALUES (0xFE40, 'mb2'); +CREATE TABLE t2 AS SELECT + CONCAT(t1.b,t2.b) AS b, + t1.type AS type1, + t2.type AS type2, + CONCAT('[',t1.type,'][',t2.type,']') AS comment +FROM t1, t1 t2; + +CREATE TABLE t3 +( + b VARBINARY(16), + c VARCHAR(16) CHARACTER SET gbk, + comment VARCHAR(128) +); +--echo # +--echo # A combination of two valid characters, should give no warnings +--echo # +INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2 +WHERE + (FIND_IN_SET('ascii',type1) OR FIND_IN_SET('mb2',type1)) AND + (FIND_IN_SET('ascii',type2) OR FIND_IN_SET('mb2',type2)) +ORDER BY b; +SELECT COUNT(*) FROM t3; +SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b; +SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b; +DELETE FROM t2 WHERE b IN (SELECT b FROM t3); +DELETE FROM t3; + 
+--echo # +--echo # Sequences that start with a tail or a bad byte, +--echo # or end with a bad byte, all should be fixed. +--echo # +INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2 +WHERE type1='tail' OR type1='bad' OR type2='bad' +ORDER BY b; +SELECT COUNT(*) FROM t3; +SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b; +SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b; +DELETE FROM t2 WHERE b IN (SELECT b FROM t3); +DELETE FROM t3; + +--echo # +--echo # Sequences that start with an ASCII or an MB2 character, +--echo # followed by a non-ASCII tail, all should be fixed. +--echo # +INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2 +WHERE (FIND_IN_SET('mb2',type1) OR FIND_IN_SET('ascii',type1)) + AND (FIND_IN_SET('tail',type2) AND NOT FIND_IN_SET('ascii',type2)) +ORDER BY b; +SELECT COUNT(*) FROM t3; +SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b; +SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b; +DELETE FROM t2 WHERE b IN (SELECT b FROM t3); +DELETE FROM t3; + +--echo # +--echo # Other sequences +--echo # +INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2 ORDER BY b; +SELECT COUNT(*) FROM t3; +SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b; +SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b; +DELETE FROM t3; +DROP TABLE t3; +DROP TABLE t2; + +CREATE TABLE t2 AS SELECT + CONCAT(t1.b,t2.b,t3.b) AS b, + t1.type AS type1, + t2.type AS type2, + t3.type AS type3, + CONCAT('[',t1.type,'][',t2.type,'][',t3.type,']') AS comment +FROM t1, t1 t2,t1 t3; +SELECT COUNT(*) FROM t2; + +CREATE TABLE t3 +( + b VARBINARY(16), + c VARCHAR(16) CHARACTER SET gbk, + comment VARCHAR(128) +); + +--echo # +--echo # A combination of three valid characters, should give no warnings +--echo # +INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2 +WHERE + (FIND_IN_SET('ascii',type1) OR FIND_IN_SET('mb2',type1)) AND + (FIND_IN_SET('ascii',type2) OR FIND_IN_SET('mb2',type2)) AND + (FIND_IN_SET('ascii',type3) OR FIND_IN_SET('mb2',type3)) 
+ORDER BY b; +SELECT COUNT(*) FROM t3; +SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b; +SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b; +DELETE FROM t2 WHERE b IN (SELECT b FROM t3); +DELETE FROM t3; + +--echo # +--echo # Sequences that start with a tail or a bad byte, +--echo # or have a bad byte, all should be fixed. +--echo # +INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2 +WHERE type1='tail' OR type1='bad' OR type2='bad' OR type3='bad' +ORDER BY b; +SELECT COUNT(*) FROM t3; +SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b; +SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b; +DELETE FROM t2 WHERE b IN (SELECT b FROM t3); +DELETE FROM t3; + +--echo # +--echo # Sequences that start with an ASCII or an MB2 character, +--echo # followed by a pure non-ASCII tail, all should be fixed. +--echo # +INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2 +WHERE (FIND_IN_SET('mb2',type1) OR FIND_IN_SET('ascii',type1)) + AND type2='tail' +ORDER BY b; +SELECT COUNT(*) FROM t3; +SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b; +SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b; +DELETE FROM t2 WHERE b IN (SELECT b FROM t3); +DELETE FROM t3; + +--echo # +--echo # Sequences that consist of two ASCII or MB2 characters, +--echo # followed by a pure non-ASCII tail, all should be fixed. +--echo # +INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2 +WHERE (FIND_IN_SET('mb2',type1) OR FIND_IN_SET('ascii',type1)) AND + (FIND_IN_SET('mb2',type2) OR FIND_IN_SET('ascii',type2)) AND + type3='tail' +ORDER BY b; +SELECT COUNT(*) FROM t3; +SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b; +SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b; +DELETE FROM t2 WHERE b IN (SELECT b FROM t3); +DELETE FROM t3; + + +--echo # +--echo # Sequences that consist of two MB2 characters, +--echo # followed by a non-ASCII head or tail, all should be fixed. 
+--echo # +INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2 +WHERE FIND_IN_SET('mb2',type1) AND FIND_IN_SET('mb2',type2) + AND NOT FIND_IN_SET('ascii',type3) + AND NOT FIND_IN_SET('mb2',type3) +ORDER BY b; +SELECT COUNT(*) FROM t3; +SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b; +SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b; +DELETE FROM t2 WHERE b IN (SELECT b FROM t3); +DELETE FROM t3; + + +--echo # +--echo # Sequences that consist of head + tail + MB2 should go without warnings +--echo # +INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2 +WHERE FIND_IN_SET('head',type1) + AND FIND_IN_SET('tail',type2) + AND FIND_IN_SET('mb2',type3) +ORDER BY b; +SELECT COUNT(*) FROM t3; +SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b; +SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b; +DELETE FROM t2 WHERE b IN (SELECT b FROM t3); +DELETE FROM t3; + +--echo # +--echo # Sequences that consist of (ascii or mb2) + head + tail should go without warnings +--echo # +INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2 +WHERE (FIND_IN_SET('ascii',type1) OR FIND_IN_SET('mb2',type1)) + AND FIND_IN_SET('head',type2) + AND FIND_IN_SET('tail',type3) +ORDER BY b; +SELECT COUNT(*) FROM t3; +SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b; +SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b; +DELETE FROM t2 WHERE b IN (SELECT b FROM t3); +DELETE FROM t3; + + +#--echo # +#--echo # Other sequences +#--echo # +INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2 ORDER BY b; +SELECT COUNT(*) FROM t3; +SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b; +SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b; + +DROP TABLE t3; +DROP TABLE t2; +DROP TABLE t1; + +--echo # +--echo # END OF MDEV-6566 Different INSERT behaviour on bad bytes with and without character set conversion +--echo # + +--echo # +--echo # End of 10.1 tests +--echo # diff --git a/mysql-test/t/ctype_sjis.test b/mysql-test/t/ctype_sjis.test index 
ae110b20cb2..2777cf6a035 100644 --- a/mysql-test/t/ctype_sjis.test +++ b/mysql-test/t/ctype_sjis.test @@ -145,7 +145,7 @@ DROP TEMPORARY TABLE head, tail; SHOW CREATE TABLE t1; UPDATE t1 SET a=unhex(code) ORDER BY code; SELECT COUNT(*) FROM t1; -SELECT COUNT(*) FROM t1 WHERE a<>'' AND OCTET_LENGTH(a)=1; +SELECT COUNT(*) FROM t1 WHERE a<>'?' AND OCTET_LENGTH(a)=1; SELECT COUNT(*) FROM t1 WHERE a<>'' AND OCTET_LENGTH(a)=2; # # Display all characters that have upper or lower case mapping. diff --git a/mysql-test/t/ctype_ujis.test b/mysql-test/t/ctype_ujis.test index 48dc0e63058..94fc7ffe4c0 100644 --- a/mysql-test/t/ctype_ujis.test +++ b/mysql-test/t/ctype_ujis.test @@ -1276,7 +1276,7 @@ SHOW CREATE TABLE t1; UPDATE t1 SET a=unhex(code) ORDER BY code; SELECT COUNT(*) FROM t1; -SELECT COUNT(*) FROM t1 WHERE a<>''; +SELECT COUNT(*) FROM t1 WHERE a<>'?'; SELECT COUNT(*) FROM t1 WHERE a<>'' AND OCTET_LENGTH(a)=2; SELECT COUNT(*) FROM t1 WHERE a<>'' AND OCTET_LENGTH(a)=3; # diff --git a/mysql-test/t/ctype_utf8mb4.test b/mysql-test/t/ctype_utf8mb4.test index 7a3c67bb417..232dd8fcb5d 100644 --- a/mysql-test/t/ctype_utf8mb4.test +++ b/mysql-test/t/ctype_utf8mb4.test @@ -1832,6 +1832,50 @@ set @@collation_connection=utf8mb4_bin; --echo # End of 5.6 tests --echo # +--echo # +--echo # Start of 10.0 tests +--echo # + +--echo # +--echo # MDEV-6566 Different INSERT behaviour on bad bytes with and without character set conversion +--echo # + +--echo # +--echo # This test sets session character set to 3-byte utf8, +--echo # but then sends a 4-byte sequence (which is wrong for 3-byte utf8). +--echo # It should be replaced to four question marks: '????' in both columns +--echo # (i.e. four unknown bytes are replaced to four question marks), +--echo # then the rest of the string should be stored, so we get 'a ???? b'. 
+--echo # +SET NAMES utf8; +CREATE TABLE t1 ( + a VARCHAR(32) CHARACTER SET utf8mb4, + b VARCHAR(32) CHARACTER SET utf8 +); +INSERT INTO t1 SELECT 'a 😁 b', 'a 😁 b'; +SELECT * FROM t1; +DROP TABLE t1; + +--echo # +--echo # This test sets session character set to 4-byte utf8, +--echo # then normally sends a 4-byte sequence. +--echo # It should be stored AS IS into the utf8mb4 column (a), +--echo # and should be replaced to a single question mark in the utf8 column (b) +--echo # (i.e. one character that cannot be converted is replaced to one question mark). +--echo # + +SET NAMES utf8mb4; +CREATE TABLE t1 ( + a VARCHAR(32) CHARACTER SET utf8mb4, + b VARCHAR(32) CHARACTER SET utf8 +); +INSERT INTO t1 SELECT 'a 😁 b', 'a 😁 b'; +SELECT * FROM t1; +DROP TABLE t1; + +--echo # +--echo # End of 10.0 tests +--echo # --echo # --echo # End of tests diff --git a/sql/sql_string.cc b/sql/sql_string.cc index 9fb462e9a9d..a0b63956ed0 100644 --- a/sql/sql_string.cc +++ b/sql/sql_string.cc @@ -922,8 +922,8 @@ String_copier::well_formed_copy(CHARSET_INFO *to_cs, my_charset_same(from_cs, to_cs)) { m_cannot_convert_error_pos= NULL; - return to_cs->cset->copy_abort(to_cs, to, to_length, from, from_length, - nchars, this); + return to_cs->cset->copy_fix(to_cs, to, to_length, from, from_length, + nchars, this); } else { diff --git a/strings/ctype-big5.c b/strings/ctype-big5.c index d631bd0a34e..eda81c0c4d3 100644 --- a/strings/ctype-big5.c +++ b/strings/ctype-big5.c @@ -50,7 +50,7 @@ #define MY_FUNCTION_NAME(x) my_ ## x ## _big5 #define IS_MB2_CHAR(x,y) (isbig5head(x) && isbig5tail(y)) -#define WELL_FORMED_LEN +#define DEFINE_ASIAN_ROUTINES #include "ctype-mb.ic" @@ -6843,6 +6843,9 @@ my_mb_wc_big5(CHARSET_INFO *cs __attribute__((unused)), if (s+2>e) return MY_CS_TOOSMALL2; + if (!IS_MB2_CHAR(hi, s[1])) + return MY_CS_ILSEQ; + if (!(pwc[0]=func_big5_uni_onechar((hi<<8)+s[1]))) return -2; @@ -6894,7 +6897,9 @@ static MY_CHARSET_HANDLER my_charset_big5_handler= my_strtoll10_8bit, 
my_strntoull10rnd_8bit, my_scan_8bit, - my_copy_abort_mb, + my_charlen_big5, + my_well_formed_char_length_big5, + my_copy_fix_mb, }; struct charset_info_st my_charset_big5_chinese_ci= diff --git a/strings/ctype-bin.c b/strings/ctype-bin.c index 6b53b34159a..95f31038ee6 100644 --- a/strings/ctype-bin.c +++ b/strings/ctype-bin.c @@ -549,6 +549,8 @@ static MY_CHARSET_HANDLER my_charset_handler= my_strtoll10_8bit, my_strntoull10rnd_8bit, my_scan_8bit, + my_charlen_8bit, + my_well_formed_char_length_8bit, my_copy_8bit, }; diff --git a/strings/ctype-cp932.c b/strings/ctype-cp932.c index 13129a6a874..2e26a98bf05 100644 --- a/strings/ctype-cp932.c +++ b/strings/ctype-cp932.c @@ -186,7 +186,7 @@ static const uchar sort_order_cp932[]= #define MY_FUNCTION_NAME(x) my_ ## x ## _cp932 #define IS_8BIT_CHAR(x) iscp932kata(x) #define IS_MB2_CHAR(x,y) (iscp932head(x) && iscp932tail(y)) -#define WELL_FORMED_LEN +#define DEFINE_ASIAN_ROUTINES #include "ctype-mb.ic" @@ -34765,7 +34765,9 @@ static MY_CHARSET_HANDLER my_charset_handler= my_strtoll10_8bit, my_strntoull10rnd_8bit, my_scan_8bit, - my_copy_abort_mb, + my_charlen_cp932, + my_well_formed_char_length_cp932, + my_copy_fix_mb, }; diff --git a/strings/ctype-euc_kr.c b/strings/ctype-euc_kr.c index eab9539ad45..a2c95bf77c8 100644 --- a/strings/ctype-euc_kr.c +++ b/strings/ctype-euc_kr.c @@ -204,7 +204,7 @@ static const uchar sort_order_euc_kr[]= #define MY_FUNCTION_NAME(x) my_ ## x ## _euckr #define IS_MB2_CHAR(x,y) (iseuc_kr_head(x) && iseuc_kr_tail(y)) -#define WELL_FORMED_LEN +#define DEFINE_ASIAN_ROUTINES #include "ctype-mb.ic" @@ -9928,6 +9928,9 @@ my_mb_wc_euc_kr(CHARSET_INFO *cs __attribute__((unused)), if (s+2>e) return MY_CS_TOOSMALL2; + if (!IS_MB2_CHAR(hi, s[1])) + return MY_CS_ILSEQ; + if (!(pwc[0]=func_ksc5601_uni_onechar((hi<<8)+s[1]))) return -2; @@ -9979,7 +9982,9 @@ static MY_CHARSET_HANDLER my_charset_handler= my_strtoll10_8bit, my_strntoull10rnd_8bit, my_scan_8bit, - my_copy_abort_mb, + my_charlen_euckr, + 
my_well_formed_char_length_euckr, + my_copy_fix_mb, }; diff --git a/strings/ctype-eucjpms.c b/strings/ctype-eucjpms.c index 52873c2f87e..827feda927b 100644 --- a/strings/ctype-eucjpms.c +++ b/strings/ctype-eucjpms.c @@ -198,7 +198,7 @@ static const uchar sort_order_eucjpms[]= #define IS_MB2_KATA(x,y) (iseucjpms_ss2(x) && iskata(y)) #define IS_MB2_CHAR(x,y) (IS_MB2_KATA(x,y) || IS_MB2_JIS(x,y)) #define IS_MB3_CHAR(x,y,z) (iseucjpms_ss3(x) && IS_MB2_JIS(y,z)) -#define WELL_FORMED_LEN +#define DEFINE_ASIAN_ROUTINES #include "ctype-mb.ic" @@ -67511,7 +67511,9 @@ static MY_CHARSET_HANDLER my_charset_handler= my_strtoll10_8bit, my_strntoull10rnd_8bit, my_scan_8bit, - my_copy_abort_mb, + my_charlen_eucjpms, + my_well_formed_char_length_eucjpms, + my_copy_fix_mb, }; diff --git a/strings/ctype-gb2312.c b/strings/ctype-gb2312.c index a4268b8fd68..129e8edb966 100644 --- a/strings/ctype-gb2312.c +++ b/strings/ctype-gb2312.c @@ -167,7 +167,7 @@ static const uchar sort_order_gb2312[]= #define MY_FUNCTION_NAME(x) my_ ## x ## _gb2312 #define IS_MB2_CHAR(x,y) (isgb2312head(x) && isgb2312tail(y)) -#define WELL_FORMED_LEN +#define DEFINE_ASIAN_ROUTINES #include "ctype-mb.ic" @@ -6330,7 +6330,10 @@ my_mb_wc_gb2312(CHARSET_INFO *cs __attribute__((unused)), if (s+2>e) return MY_CS_TOOSMALL2; - + + if (!IS_MB2_CHAR(hi, s[1])) + return MY_CS_ILSEQ; + if (!(pwc[0]=func_gb2312_uni_onechar(((hi<<8)+s[1])&0x7F7F))) return -2; @@ -6382,7 +6385,9 @@ static MY_CHARSET_HANDLER my_charset_handler= my_strtoll10_8bit, my_strntoull10rnd_8bit, my_scan_8bit, - my_copy_abort_mb, + my_charlen_gb2312, + my_well_formed_char_length_gb2312, + my_copy_fix_mb, }; diff --git a/strings/ctype-gbk.c b/strings/ctype-gbk.c index 392fdb487b6..b3bd1efb6c4 100644 --- a/strings/ctype-gbk.c +++ b/strings/ctype-gbk.c @@ -45,7 +45,7 @@ #define MY_FUNCTION_NAME(x) my_ ## x ## _gbk #define IS_MB2_CHAR(x,y) (isgbkhead(x) && isgbktail(y)) -#define WELL_FORMED_LEN +#define DEFINE_ASIAN_ROUTINES #include "ctype-mb.ic" @@ 
-10724,6 +10724,9 @@ my_mb_wc_gbk(CHARSET_INFO *cs __attribute__((unused)), if (s+2>e) return MY_CS_TOOSMALL2; + if (!IS_MB2_CHAR(hi, s[1])) + return MY_CS_ILSEQ; + if (!(pwc[0]=func_gbk_uni_onechar( (hi<<8) + s[1]))) return -2; @@ -10776,7 +10779,9 @@ static MY_CHARSET_HANDLER my_charset_handler= my_strtoll10_8bit, my_strntoull10rnd_8bit, my_scan_8bit, - my_copy_abort_mb, + my_charlen_gbk, + my_well_formed_char_length_gbk, + my_copy_fix_mb, }; diff --git a/strings/ctype-latin1.c b/strings/ctype-latin1.c index 099f03460ce..bc51911dceb 100644 --- a/strings/ctype-latin1.c +++ b/strings/ctype-latin1.c @@ -422,6 +422,8 @@ static MY_CHARSET_HANDLER my_charset_handler= my_strtoll10_8bit, my_strntoull10rnd_8bit, my_scan_8bit, + my_charlen_8bit, + my_well_formed_char_length_8bit, my_copy_8bit, }; diff --git a/strings/ctype-mb.c b/strings/ctype-mb.c index fc41563324a..5947c3d4f4a 100644 --- a/strings/ctype-mb.c +++ b/strings/ctype-mb.c @@ -424,25 +424,95 @@ size_t my_well_formed_len_mb(CHARSET_INFO *cs, const char *b, const char *e, /* - Copy a multi-byte string. Abort if a bad byte sequence was found. - Note more than "nchars" characters are copied. + Append a badly formed piece of string. + Bad bytes are fixed to '?'. + + @param to The destination string + @param to_end The end of the destination string + @param from The source string + @param from_end The end of the source string + @param nchars Write not more than "nchars" characters. + @param status Copying status, must be previously initialized, + e.g. using well_formed_char_length() on the original + full source string. 
*/ +static size_t +my_append_fix_badly_formed_tail(CHARSET_INFO *cs, + char *to, char *to_end, + const char *from, const char *from_end, + size_t nchars, + MY_STRCOPY_STATUS *status) +{ + char *to0= to; + + for ( ; nchars; nchars--) + { + int chlen; + if ((chlen= cs->cset->charlen(cs, (const uchar*) from, + (const uchar *) from_end)) > 0) + { + /* Found a valid character */ /* chlen == 1..MBMAXLEN */ + DBUG_ASSERT(chlen <= (int) cs->mbmaxlen); + if (to + chlen > to_end) + goto end; /* Does not fit to "to" */ + memcpy(to, from, (size_t) chlen); + from+= chlen; + to+= chlen; + continue; + } + if (chlen == MY_CS_ILSEQ) /* chlen == 0 */ + { + DBUG_ASSERT(from < from_end); /* Shouldn't get MY_CS_ILSEQ if empty */ + goto bad; + } + /* Got an incomplete character */ /* chlen == MY_CS_TOOSMALLXXX */ + DBUG_ASSERT(chlen >= MY_CS_TOOSMALL6); + DBUG_ASSERT(chlen <= MY_CS_TOOSMALL); + if (from >= from_end) + break; /* End of the source string */ +bad: + /* Bad byte sequence, or incomplete character found */ + if (!status->m_well_formed_error_pos) + status->m_well_formed_error_pos= from; + + if ((chlen= cs->cset->wc_mb(cs, '?', (uchar*) to, (uchar *) to_end)) <= 0) + break; /* Question mark does not fit into the destination */ + to+= chlen; + from++; + } +end: + status->m_source_end_pos= from; + return to - to0; +} + + size_t -my_copy_abort_mb(CHARSET_INFO *cs, - char *dst, size_t dst_length, - const char *src, size_t src_length, - size_t nchars, MY_STRCOPY_STATUS *status) +my_copy_fix_mb(CHARSET_INFO *cs, + char *dst, size_t dst_length, + const char *src, size_t src_length, + size_t nchars, MY_STRCOPY_STATUS *status) { - int well_formed_error; - size_t res; + size_t well_formed_nchars; + size_t well_formed_length; + size_t fixed_length; set_if_smaller(src_length, dst_length); - res= cs->cset->well_formed_len(cs, src, src + src_length, - nchars, &well_formed_error); - memmove(dst, src, res); - status->m_source_end_pos= src + res; - status->m_well_formed_error_pos= 
well_formed_error ? src + res : NULL; - return res; + well_formed_nchars= cs->cset->well_formed_char_length(cs, + src, src + src_length, + nchars, status); + DBUG_ASSERT(well_formed_nchars <= nchars); + memmove(dst, src, (well_formed_length= status->m_source_end_pos - src)); + if (!status->m_well_formed_error_pos) + return well_formed_length; + + fixed_length= my_append_fix_badly_formed_tail(cs, + dst + well_formed_length, + dst + dst_length, + src + well_formed_length, + src + src_length, + nchars - well_formed_nchars, + status); + return well_formed_length + fixed_length; } diff --git a/strings/ctype-mb.ic b/strings/ctype-mb.ic index 70cc89c9af0..55094535d5e 100644 --- a/strings/ctype-mb.ic +++ b/strings/ctype-mb.ic @@ -29,7 +29,70 @@ #endif -#ifdef WELL_FORMED_LEN +#ifdef DEFINE_ASIAN_ROUTINES +#define DEFINE_WELL_FORMED_LEN +#define DEFINE_WELL_FORMED_CHAR_LENGTH +#define DEFINE_CHARLEN +#endif + + +#ifdef DEFINE_CHARLEN +/** + Returns length of the left-most character of a string. + @param cs - charset with mbminlen==1 and mbmaxlen<=4 + @param b - the beginning of the string + @param e - the end of the string + + @return MY_CS_ILSEQ if a bad byte sequence was found + @return MY_CS_TOOSMALL(N) if the string ended unexpectedly + @return >0 if a valid character was found +*/ +static int +MY_FUNCTION_NAME(charlen)(CHARSET_INFO *cs __attribute__((unused)), + const uchar *b, const uchar *e) +{ + DBUG_ASSERT(cs->mbminlen == 1); + DBUG_ASSERT(cs->mbmaxlen <= 4); + + if (b >= e) + return MY_CS_TOOSMALL; + if ((uchar) b[0] < 128) + return 1; /* Single byte ASCII character */ + +#ifdef IS_8BIT_CHAR + if (IS_8BIT_CHAR(b[0])) + { + /* Single byte non-ASCII character, e.g. 
half width kana in sjis */ + return 1; + } +#endif + + if (b + 2 > e) + return MY_CS_TOOSMALLN(2); + if (IS_MB2_CHAR(b[0], b[1])) + return 2; /* Double byte character */ + +#ifdef IS_MB3_CHAR + if (b + 3 > e) + return MY_CS_TOOSMALLN(3); + if (IS_MB3_CHAR(b[0], b[1], b[2])) + return 3; /* Three-byte character */ +#endif + +#ifdef IS_MB4_CHAR + if (b + 4 > e) + return MY_CS_TOOSMALLN(4); + if (IS_MB4_CHAR(b[0], b[1], b[2], b[3])) + return 4; /* Four-byte character */ +#endif + + /* Wrong byte sequence */ + return MY_CS_ILSEQ; +} +#endif /* DEFINE_WELL_FORMED_LEN */ + + +#ifdef DEFINE_WELL_FORMED_LEN /** Returns well formed length of a character string with variable character length for character sets with: @@ -91,4 +154,105 @@ MY_FUNCTION_NAME(well_formed_len)(CHARSET_INFO *cs __attribute__((unused)), return b - b0; } -#endif /* WELL_FORMED_LEN */ +#endif /* DEFINE_WELL_FORMED_LEN */ + + + +#ifdef DEFINE_WELL_FORMED_CHAR_LENGTH +/** + Returns well formed length of a string + measured in characters (rather than in bytes). + Version for character sets that define IS_MB?_CHAR(), e.g. big5. +*/ +static size_t +MY_FUNCTION_NAME(well_formed_char_length)(CHARSET_INFO *cs __attribute__((unused)), + const char *b, const char *e, + size_t nchars, + MY_STRCOPY_STATUS *status) +{ + size_t nchars0= nchars; + for ( ; b < e && nchars ; nchars--) + { + if ((uchar) b[0] < 128) + { + b++; /* Single byte ASCII character */ + continue; + } + + if (b + 2 <= e && IS_MB2_CHAR(b[0], b[1])) + { + b+= 2; /* Double byte character */ + continue; + } + +#ifdef IS_MB3_CHAR + if (b + 3 <= e && IS_MB3_CHAR(b[0], b[1], b[2])) + { + b+= 3; /* Three-byte character */ + continue; + } +#endif + +#ifdef IS_MB4_CHAR + if (b + 4 <= e && IS_MB4_CHAR(b[0], b[1], b[2], b[3])) + { + b+= 4; /* Four-byte character */ + continue; + } +#endif + +#ifdef IS_8BIT_CHAR + if (IS_8BIT_CHAR(b[0])) + { + b++; /* Single byte non-ASCII character, e.g. 
half width kana in sjis */ + continue; + } +#endif + + /* Wrong byte sequence */ + status->m_source_end_pos= status->m_well_formed_error_pos= b; + return nchars0 - nchars; + } + status->m_source_end_pos= b; + status->m_well_formed_error_pos= NULL; + return nchars0 - nchars; +} +#endif /* DEFINE_WELL_FORMED_CHAR_LENGTH */ + + +#ifdef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN +#ifndef CHARLEN +#error CHARLEN is not defined +#endif +/** + Returns well formed length of a string + measured in characters (rather than in bytes). + Version for character sets that define CHARLEN(), e.g. utf8. + CHARLEN(cs,b,e) must use the same return code convension that mb_wc() does: + - a positive number in the range [1-mbmaxlen] if a valid + single-byte or multi-byte character was found + - MY_CS_ILSEQ (0) on a bad byte sequence + - MY_CS_TOOSMALLxx if the incoming sequence is incomplete +*/ +static size_t +MY_FUNCTION_NAME(well_formed_char_length)(CHARSET_INFO *cs __attribute__((unused)), + const char *b, const char *e, + size_t nchars, + MY_STRCOPY_STATUS *status) +{ + size_t nchars0= nchars; + int chlen; + for ( ; nchars ; nchars--, b+= chlen) + { + if ((chlen= CHARLEN(cs, (uchar*) b, (uchar*) e)) <= 0) + { + status->m_well_formed_error_pos= b < e ? b : NULL; + status->m_source_end_pos= b; + return nchars0 - nchars; + } + } + status->m_well_formed_error_pos= NULL; + status->m_source_end_pos= b; + return nchars0 - nchars; +} +#endif /* DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN */ diff --git a/strings/ctype-simple.c b/strings/ctype-simple.c index b010c528979..d7a1b3f33b4 100644 --- a/strings/ctype-simple.c +++ b/strings/ctype-simple.c @@ -248,6 +248,13 @@ int my_strcasecmp_8bit(CHARSET_INFO * cs,const char *s, const char *t) } +int my_charlen_8bit(CHARSET_INFO *cs __attribute__((unused)), + const uchar *str, const uchar *end) +{ + return str >= end ? 
MY_CS_TOOSMALL : 1; +} + + int my_mb_wc_8bit(CHARSET_INFO *cs,my_wc_t *wc, const uchar *str, const uchar *end __attribute__((unused))) @@ -1108,6 +1115,19 @@ size_t my_well_formed_len_8bit(CHARSET_INFO *cs __attribute__((unused)), } +size_t +my_well_formed_char_length_8bit(CHARSET_INFO *cs __attribute__((unused)), + const char *start, const char *end, + size_t nchars, MY_STRCOPY_STATUS *status) +{ + size_t nbytes= (size_t) (end - start); + size_t res= MY_MIN(nbytes, nchars); + status->m_well_formed_error_pos= NULL; + status->m_source_end_pos= start + res; + return res; +} + + /* Copy a 8-bit string. Not more than "nchars" character are copied. */ @@ -1906,6 +1926,8 @@ MY_CHARSET_HANDLER my_charset_8bit_handler= my_strtoll10_8bit, my_strntoull10rnd_8bit, my_scan_8bit, + my_charlen_8bit, + my_well_formed_char_length_8bit, my_copy_8bit, }; diff --git a/strings/ctype-sjis.c b/strings/ctype-sjis.c index 432e2e5e823..bbf0026cf2b 100644 --- a/strings/ctype-sjis.c +++ b/strings/ctype-sjis.c @@ -187,7 +187,7 @@ static const uchar sort_order_sjis[]= #define MY_FUNCTION_NAME(x) my_ ## x ## _sjis #define IS_8BIT_CHAR(x) issjiskata(x) #define IS_MB2_CHAR(x,y) (issjishead(x) && issjistail(y)) -#define WELL_FORMED_LEN +#define DEFINE_ASIAN_ROUTINES #include "ctype-mb.ic" @@ -34144,7 +34144,9 @@ static MY_CHARSET_HANDLER my_charset_handler= my_strtoll10_8bit, my_strntoull10rnd_8bit, my_scan_8bit, - my_copy_abort_mb, + my_charlen_sjis, + my_well_formed_char_length_sjis, + my_copy_fix_mb, }; diff --git a/strings/ctype-tis620.c b/strings/ctype-tis620.c index 343fb812e20..6537b380ab3 100644 --- a/strings/ctype-tis620.c +++ b/strings/ctype-tis620.c @@ -886,6 +886,8 @@ static MY_CHARSET_HANDLER my_charset_handler= my_strtoll10_8bit, my_strntoull10rnd_8bit, my_scan_8bit, + my_charlen_8bit, + my_well_formed_char_length_8bit, my_copy_8bit, }; diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c index 8f234e9e3a8..d1441a4d3a5 100644 --- a/strings/ctype-ucs2.c +++ b/strings/ctype-ucs2.c 
@@ -92,62 +92,107 @@ my_strcasecmp_mb2_or_mb4(CHARSET_INFO *cs __attribute__((unused)), } +typedef enum +{ + MY_CHAR_COPY_OK= 0, /* The character was OK */ + MY_CHAR_COPY_ERROR= 1, /* The character was not Ok, and could not fix */ + MY_CHAR_COPY_FIXED= 2 /* The character was not Ok, was fixed to '?' */ +} my_char_copy_status_t; + + /* - Copy an UCS2/UTF16/UTF32 string. - Not more that "nchars" characters are copied. + Copies an incomplete character, left-padding it with 0x00 bytes. + + @param cs Character set + @param dst The destination string + @param dst_length Space available in dst + @param src The source string + @param src_length Length of src + @param nchars Copy not more than nchars characters. + The "nchars" parameter of the caller. + Only 0 and non-0 are important here. + @param fix What to do if after zero-padding didn't get a valid + character: + - FALSE - exit with error. + - TRUE - try to put '?' instead. + + @return MY_CHAR_COPY_OK if after zero-padding got a valid character. + cs->mbminlen bytes were written to "dst". + @return MY_CHAR_COPY_FIXED if after zero-padding did not get a valid + character, but wrote '?' to the destination + string instead. + cs->mbminlen bytes were written to "dst". + @return MY_CHAR_COPY_ERROR If failed and nothing was written to "dst". + Possible reasons: + - dst_length was too short + - nchars was 0 + - the character after padding appeared not + to be valid, and could not fix it to '?'. +*/ +static my_char_copy_status_t +my_copy_incomplete_char(CHARSET_INFO *cs, + char *dst, size_t dst_length, + const char *src, size_t src_length, + size_t nchars, my_bool fix) +{ + size_t pad_length; + size_t src_offset= src_length % cs->mbminlen; + if (dst_length < cs->mbminlen || !nchars) + return MY_CHAR_COPY_ERROR; + + pad_length= cs->mbminlen - src_offset; + bzero(dst, pad_length); + memmove(dst + pad_length, src, src_offset); + /* + In some cases left zero-padding can create an incorrect character. 
+ For example: + INSERT INTO t1 (utf32_column) VALUES (0x110000); + We'll pad the value to 0x00110000, which is a wrong UTF32 sequence! + The valid characters range is limited to 0x00000000..0x0010FFFF. + + Make sure we didn't pad to an incorrect character. + */ + if (cs->cset->charlen(cs, (uchar *) dst, (uchar *) dst + cs->mbminlen) == + (int) cs->mbminlen) + return MY_CHAR_COPY_OK; - UCS2/UTF16/UTF32 may need to prepend zero some bytes, - e.g. when copying from a BINARY source: - INSERT INTO t1 (ucs2_column) VALUES (0x01); - 0x01 -> 0x0001 + if (fix && + cs->cset->wc_mb(cs, '?', (uchar *) dst, (uchar *) dst + cs->mbminlen) == + (int) cs->mbminlen) + return MY_CHAR_COPY_FIXED; + + return MY_CHAR_COPY_ERROR; +} + + +/* + Copy an UCS2/UTF16/UTF32 string, fix bad characters. */ static size_t -my_copy_abort_mb2_or_mb4(CHARSET_INFO *cs, - char *dst, size_t dst_length, - const char *src, size_t src_length, - size_t nchars, MY_STRCOPY_STATUS *status) +my_copy_fix_mb2_or_mb4(CHARSET_INFO *cs, + char *dst, size_t dst_length, + const char *src, size_t src_length, + size_t nchars, MY_STRCOPY_STATUS *status) { - size_t src_offset; - - if ((src_offset= (src_length % cs->mbminlen))) - { - int well_formed_error; - size_t pad_length; - if (dst_length < cs->mbminlen || !nchars) - { - status->m_source_end_pos= status->m_well_formed_error_pos= src; - return 0; - } - - pad_length= cs->mbminlen - src_offset; - bzero(dst, pad_length); - memmove(dst + pad_length, src, src_offset); - /* - In some cases left zero-padding can create an incorrect character. - For example: - INSERT INTO t1 (utf32_column) VALUES (0x110000); - We'll pad the value to 0x00110000, which is a wrong UTF32 sequence! - The valid characters range is limited to 0x00000000..0x0010FFFF. - - Make sure we didn't pad to an incorrect character. 
- */ - if (cs->cset->well_formed_len(cs, - dst, dst + cs->mbminlen, 1, - &well_formed_error) != cs->mbminlen) - { - status->m_source_end_pos= status->m_well_formed_error_pos= src; - return 0; - } - nchars--; - src+= src_offset; - src_length-= src_offset; - dst+= cs->mbminlen; - dst_length-= cs->mbminlen; - return - cs->mbminlen /* The left-padded character */ + - my_copy_abort_mb(cs, dst, dst_length, src, src_length, nchars, status); + size_t length2, src_offset= src_length % cs->mbminlen; + my_char_copy_status_t padstatus; + + if (!src_offset) + return my_copy_fix_mb(cs, dst, dst_length, + src, src_length, nchars, status); + if ((padstatus= my_copy_incomplete_char(cs, dst, dst_length, + src, src_length, nchars, TRUE)) == + MY_CHAR_COPY_ERROR) + { + status->m_source_end_pos= status->m_well_formed_error_pos= src; + return 0; } - return my_copy_abort_mb(cs, dst, dst_length, src, src_length, nchars, status); + length2= my_copy_fix_mb(cs, dst + cs->mbminlen, dst_length - cs->mbminlen, + src + src_offset, src_length - src_offset, + nchars - 1, status); + if (padstatus == MY_CHAR_COPY_FIXED) + status->m_well_formed_error_pos= src; + return cs->mbminlen /* The left-padded character */ + length2; } @@ -1475,6 +1520,24 @@ my_ismbchar_utf16(CHARSET_INFO *cs, const char *b, const char *e) } +static int +my_charlen_utf16(CHARSET_INFO *cs, const uchar *str, const uchar *end) +{ + my_wc_t wc; + return cs->cset->mb_wc(cs, &wc, str, end); +} + + +#define MY_FUNCTION_NAME(x) my_ ## x ## _utf16 +#define CHARLEN(cs,str,end) my_charlen_utf16(cs,str,end) +#define DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN +#include "ctype-mb.ic" +#undef MY_FUNCTION_NAME +#undef CHARLEN +#undef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN +/* Defines my_well_formed_char_length_utf16 */ + + static uint my_mbcharlen_utf16(CHARSET_INFO *cs __attribute__((unused)), uint c __attribute__((unused))) @@ -1742,7 +1805,9 @@ MY_CHARSET_HANDLER my_charset_utf16_handler= my_strtoll10_mb2, 
my_strntoull10rnd_mb2_or_mb4, my_scan_mb2, - my_copy_abort_mb2_or_mb4, + my_charlen_utf16, + my_well_formed_char_length_utf16, + my_copy_fix_mb2_or_mb4, }; @@ -1912,7 +1977,9 @@ static MY_CHARSET_HANDLER my_charset_utf16le_handler= my_strtoll10_mb2, my_strntoull10rnd_mb2_or_mb4, my_scan_mb2, - my_copy_abort_mb2_or_mb4, + my_charlen_utf16, + my_well_formed_char_length_utf16, + my_copy_fix_mb2_or_mb4, }; @@ -1987,6 +2054,13 @@ struct charset_info_st my_charset_utf16le_bin= #ifdef HAVE_CHARSET_utf32 +/* + Check if b0 and b1 start a valid UTF32 four-byte sequence. + Don't accept characters greater than U+10FFFF. +*/ +#define IS_UTF32_MBHEAD4(b0,b1) (!(b0) && ((uchar) (b1) <= 0x10)) + + static int my_utf32_uni(CHARSET_INFO *cs __attribute__((unused)), my_wc_t *pwc, const uchar *s, const uchar *e) @@ -1994,7 +2068,7 @@ my_utf32_uni(CHARSET_INFO *cs __attribute__((unused)), if (s + 4 > e) return MY_CS_TOOSMALL4; *pwc= (s[0] << 24) + (s[1] << 16) + (s[2] << 8) + (s[3]); - return 4; + return *pwc > 0x10FFFF ? MY_CS_ILSEQ : 4; } @@ -2004,7 +2078,10 @@ my_uni_utf32(CHARSET_INFO *cs __attribute__((unused)), { if (s + 4 > e) return MY_CS_TOOSMALL4; - + + if (wc > 0x10FFFF) + return MY_CS_ILUNI; + s[0]= (uchar) (wc >> 24); s[1]= (uchar) (wc >> 16) & 0xFF; s[2]= (uchar) (wc >> 8) & 0xFF; @@ -2263,10 +2340,29 @@ my_ismbchar_utf32(CHARSET_INFO *cs __attribute__((unused)), const char *b, const char *e) { - return b + 4 > e ? 0 : 4; + return b + 4 > e || !IS_UTF32_MBHEAD4(b[0], b[1]) ? 0 : 4; } +static int +my_charlen_utf32(CHARSET_INFO *cs __attribute__((unused)), + const uchar *b, const uchar *e) +{ + return b + 4 > e ? MY_CS_TOOSMALL4 : + IS_UTF32_MBHEAD4(b[0], b[1]) ? 
4 : MY_CS_ILSEQ; +} + + +#define MY_FUNCTION_NAME(x) my_ ## x ## _utf32 +#define CHARLEN(cs,str,end) my_charlen_utf32(cs,str,end) +#define DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN +#include "ctype-mb.ic" +#undef MY_FUNCTION_NAME +#undef CHARLEN +#undef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN +/* Defines my_well_formed_char_length_utf32 */ + + static uint my_mbcharlen_utf32(CHARSET_INFO *cs __attribute__((unused)) , uint c __attribute__((unused))) @@ -2579,8 +2675,7 @@ my_well_formed_len_utf32(CHARSET_INFO *cs __attribute__((unused)), } for (; b < e; b+= 4) { - /* Don't accept characters greater than U+10FFFF */ - if (b[0] || (uchar) b[1] > 0x10) + if (!IS_UTF32_MBHEAD4(b[0], b[1])) { *error= 1; return b - b0; @@ -2827,7 +2922,9 @@ MY_CHARSET_HANDLER my_charset_utf32_handler= my_strtoll10_utf32, my_strntoull10rnd_mb2_or_mb4, my_scan_utf32, - my_copy_abort_mb2_or_mb4, + my_charlen_utf32, + my_well_formed_char_length_utf32, + my_copy_fix_mb2_or_mb4, }; @@ -2961,6 +3058,14 @@ static const uchar to_upper_ucs2[] = { }; +static int +my_charlen_ucs2(CHARSET_INFO *cs __attribute__((unused)), + const uchar *s, const uchar *e) +{ + return s + 2 > e ? 
MY_CS_TOOSMALLN(2) : 2; +} + + static int my_ucs2_uni(CHARSET_INFO *cs __attribute__((unused)), my_wc_t * pwc, const uchar *s, const uchar *e) { @@ -3264,6 +3369,31 @@ size_t my_well_formed_len_ucs2(CHARSET_INFO *cs __attribute__((unused)), } +static size_t +my_well_formed_char_length_ucs2(CHARSET_INFO *cs __attribute__((unused)), + const char *b, const char *e, + size_t nchars, MY_STRCOPY_STATUS *status) +{ + size_t length= e - b; + if (nchars * 2 <= length) + { + status->m_well_formed_error_pos= NULL; + status->m_source_end_pos= b + (nchars * 2); + return nchars; + } + if (length % 2) + { + status->m_well_formed_error_pos= status->m_source_end_pos= e - 1; + } + else + { + status->m_well_formed_error_pos= NULL; + status->m_source_end_pos= e; + } + return length / 2; +} + + static int my_wildcmp_ucs2_ci(CHARSET_INFO *cs, const char *str,const char *str_end, @@ -3446,7 +3576,9 @@ MY_CHARSET_HANDLER my_charset_ucs2_handler= my_strtoll10_mb2, my_strntoull10rnd_mb2_or_mb4, my_scan_mb2, - my_copy_abort_mb2_or_mb4, + my_charlen_ucs2, + my_well_formed_char_length_ucs2, + my_copy_fix_mb2_or_mb4, }; diff --git a/strings/ctype-ujis.c b/strings/ctype-ujis.c index 99f5be3fa38..cb000a2afa0 100644 --- a/strings/ctype-ujis.c +++ b/strings/ctype-ujis.c @@ -197,7 +197,7 @@ static const uchar sort_order_ujis[]= #define IS_MB2_KATA(x,y) (isujis_ss2(x) && iskata(y)) #define IS_MB2_CHAR(x, y) (IS_MB2_KATA(x,y) || IS_MB2_JIS(x,y)) #define IS_MB3_CHAR(x, y, z) (isujis_ss3(x) && IS_MB2_JIS(y,z)) -#define WELL_FORMED_LEN +#define DEFINE_ASIAN_ROUTINES #include "ctype-mb.ic" @@ -67255,7 +67255,9 @@ static MY_CHARSET_HANDLER my_charset_handler= my_strtoll10_8bit, my_strntoull10rnd_8bit, my_scan_8bit, - my_copy_abort_mb, + my_charlen_ujis, + my_well_formed_char_length_ujis, + my_copy_fix_mb, }; diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c index 1116228f706..56824aac59e 100644 --- a/strings/ctype-utf8.c +++ b/strings/ctype-utf8.c @@ -5446,8 +5446,8 @@ int 
my_wildcmp_utf8(CHARSET_INFO *cs, static -int my_valid_mbcharlen_utf8(CHARSET_INFO *cs __attribute__((unused)), - const uchar *s, const uchar *e) +int my_charlen_utf8(CHARSET_INFO *cs __attribute__((unused)), + const uchar *s, const uchar *e) { uchar c; @@ -5515,7 +5515,7 @@ my_well_formed_len_utf8(CHARSET_INFO *cs, const char *b, const char *e, { int mb_len; - if ((mb_len= my_valid_mbcharlen_utf8(cs, (uchar*) b, (uchar*) e)) <= 0) + if ((mb_len= my_charlen_utf8(cs, (uchar*) b, (uchar*) e)) <= 0) { *error= b < e ? 1 : 0; break; @@ -5526,9 +5526,20 @@ my_well_formed_len_utf8(CHARSET_INFO *cs, const char *b, const char *e, return (size_t) (b - b_start); } + +#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8 +#define CHARLEN(cs,str,end) my_charlen_utf8(cs,str,end) +#define DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN +#include "ctype-mb.ic" +#undef MY_FUNCTION_NAME +#undef CHARLEN +#undef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN +/* my_well_formed_char_length_utf8 */ + + static uint my_ismbchar_utf8(CHARSET_INFO *cs,const char *b, const char *e) { - int res= my_valid_mbcharlen_utf8(cs, (const uchar*)b, (const uchar*)e); + int res= my_charlen_utf8(cs, (const uchar*) b, (const uchar*) e); return (res>1) ? 
res : 0; } @@ -5615,7 +5626,9 @@ MY_CHARSET_HANDLER my_charset_utf8_handler= my_strtoll10_8bit, my_strntoull10rnd_8bit, my_scan_8bit, - my_copy_abort_mb, + my_charlen_utf8, + my_well_formed_char_length_utf8, + my_copy_fix_mb, }; @@ -7125,6 +7138,24 @@ my_wc_mb_filename(CHARSET_INFO *cs __attribute__((unused)), } +static int +my_charlen_filename(CHARSET_INFO *cs, const uchar *str, const uchar *end) +{ + my_wc_t wc; + return cs->cset->mb_wc(cs, &wc, str, end); +} + + +#define MY_FUNCTION_NAME(x) my_ ## x ## _filename +#define CHARLEN(cs,str,end) my_charlen_filename(cs,str,end) +#define DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN +#include "ctype-mb.ic" +#undef MY_FUNCTION_NAME +#undef CHARLEN +#undef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN +/* my_well_formed_char_length_filename */ + + static MY_COLLATION_HANDLER my_collation_filename_handler = { NULL, /* init */ @@ -7169,7 +7200,9 @@ static MY_CHARSET_HANDLER my_charset_filename_handler= my_strtoll10_8bit, my_strntoull10rnd_8bit, my_scan_8bit, - my_copy_abort_mb, + my_charlen_filename, + my_well_formed_char_length_filename, + my_copy_fix_mb, }; @@ -7954,8 +7987,8 @@ my_wildcmp_utf8mb4(CHARSET_INFO *cs, static int -my_valid_mbcharlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), - const uchar *s, const uchar *e) +my_charlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), + const uchar *s, const uchar *e) { uchar c; @@ -8015,7 +8048,7 @@ size_t my_well_formed_len_utf8mb4(CHARSET_INFO *cs, { int mb_len; - if ((mb_len= my_valid_mbcharlen_utf8mb4(cs, (uchar*) b, (uchar*) e)) <= 0) + if ((mb_len= my_charlen_utf8mb4(cs, (uchar*) b, (uchar*) e)) <= 0) { *error= b < e ? 
1 : 0; break; @@ -8027,10 +8060,19 @@ size_t my_well_formed_len_utf8mb4(CHARSET_INFO *cs, } +#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8mb4 +#define CHARLEN(cs,str,end) my_charlen_utf8mb4(cs,str,end) +#define DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN +#include "ctype-mb.ic" +#undef MY_FUNCTION_NAME +#undef CHARLEN +#undef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN +/* my_well_formed_char_length_utf8mb4 */ + static uint my_ismbchar_utf8mb4(CHARSET_INFO *cs, const char *b, const char *e) { - int res= my_valid_mbcharlen_utf8mb4(cs, (const uchar*)b, (const uchar*)e); + int res= my_charlen_utf8mb4(cs, (const uchar*) b, (const uchar*) e); return (res > 1) ? res : 0; } @@ -8113,7 +8155,9 @@ MY_CHARSET_HANDLER my_charset_utf8mb4_handler= my_strtoll10_8bit, my_strntoull10rnd_8bit, my_scan_8bit, - my_copy_abort_mb, + my_charlen_utf8mb4, + my_well_formed_char_length_utf8mb4, + my_copy_fix_mb, }; |