summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/m_ctype.h71
-rw-r--r--mysql-test/r/ctype_big5.result2
-rw-r--r--mysql-test/r/ctype_cp932_binlog_stm.result2
-rw-r--r--mysql-test/r/ctype_eucjpms.result5
-rw-r--r--mysql-test/r/ctype_euckr.result18
-rw-r--r--mysql-test/r/ctype_gb2312.result2
-rw-r--r--mysql-test/r/ctype_gbk.result813
-rw-r--r--mysql-test/r/ctype_sjis.result2
-rw-r--r--mysql-test/r/ctype_ujis.result4
-rw-r--r--mysql-test/r/ctype_utf8.result6
-rw-r--r--mysql-test/r/ctype_utf8mb4.result100
-rw-r--r--mysql-test/r/ctype_utf8mb4_heap.result50
-rw-r--r--mysql-test/r/ctype_utf8mb4_innodb.result52
-rw-r--r--mysql-test/r/ctype_utf8mb4_myisam.result52
-rw-r--r--mysql-test/suite/funcs_2/include/check_charset.inc4
-rw-r--r--mysql-test/suite/innodb/r/innodb-update-insert.result4
-rw-r--r--mysql-test/t/ctype_big5.test2
-rw-r--r--mysql-test/t/ctype_cp932_binlog_stm.test2
-rw-r--r--mysql-test/t/ctype_eucjpms.test1
-rw-r--r--mysql-test/t/ctype_euckr.test6
-rw-r--r--mysql-test/t/ctype_gb2312.test2
-rw-r--r--mysql-test/t/ctype_gbk.test227
-rw-r--r--mysql-test/t/ctype_sjis.test2
-rw-r--r--mysql-test/t/ctype_ujis.test2
-rw-r--r--mysql-test/t/ctype_utf8mb4.test44
-rw-r--r--sql/sql_string.cc4
-rw-r--r--strings/ctype-big5.c9
-rw-r--r--strings/ctype-bin.c2
-rw-r--r--strings/ctype-cp932.c6
-rw-r--r--strings/ctype-euc_kr.c9
-rw-r--r--strings/ctype-eucjpms.c6
-rw-r--r--strings/ctype-gb2312.c11
-rw-r--r--strings/ctype-gbk.c9
-rw-r--r--strings/ctype-latin1.c2
-rw-r--r--strings/ctype-mb.c98
-rw-r--r--strings/ctype-mb.ic168
-rw-r--r--strings/ctype-simple.c22
-rw-r--r--strings/ctype-sjis.c6
-rw-r--r--strings/ctype-tis620.c2
-rw-r--r--strings/ctype-ucs2.c250
-rw-r--r--strings/ctype-ujis.c6
-rw-r--r--strings/ctype-utf8.c66
42 files changed, 1899 insertions, 252 deletions
diff --git a/include/m_ctype.h b/include/m_ctype.h
index f08efb461b7..7f4ccee2a3e 100644
--- a/include/m_ctype.h
+++ b/include/m_ctype.h
@@ -444,22 +444,64 @@ struct my_charset_handler_st
size_t (*scan)(CHARSET_INFO *, const char *b, const char *e,
int sq);
- /* Copying routines */
+ /* String copying routines and helpers for them */
/*
- copy_abort() - copy a string, abort if a bad byte sequence was found.
+ charlen() - calculate length of the left-most character in bytes.
+ @param cs Character set
+ @param str The beginning of the string
+ @param end The end of the string
+
+ @return MY_CS_ILSEQ if a bad byte sequence was found.
+ @return MY_CS_TOOSMALLN(x) if the string ended unexpectedly.
+ @return a positive number in the range 1..mbmaxlen,
+ if a valid character was found.
+ */
+ int (*charlen)(CHARSET_INFO *cs, const uchar *str, const uchar *end);
+ /*
+ well_formed_char_length() - returns character length of a string.
+
+ @param cs Character set
+ @param str The beginning of the string
+ @param end The end of the string
+ @param nchars Not more than "nchars" left-most characters are checked.
+ @param status[OUT] Additional statistics are returned here.
+ "status" can be uninitialized before the call,
+ and it is fully initialized after the call.
+
+ status->m_source_end_pos is set to the position where reading stopped.
+
+ If a bad byte sequence is found, the function returns immediately and
+ status->m_well_formed_error_pos is set to the position where a bad byte
+ sequence was found.
+
+ status->m_well_formed_error_pos is set to NULL if no bad bytes were found.
+ If status->m_well_formed_error_pos is NULL after the call, that means:
+ - either the function reached the end of the string,
+ - or all "nchars" characters were read.
+ The caller can check status->m_source_end_pos to detect which of these two
+ happened.
+ */
+ size_t (*well_formed_char_length)(CHARSET_INFO *cs,
+ const char *str, const char *end,
+ size_t nchars,
+ MY_STRCOPY_STATUS *status);
+
+ /*
+ copy_fix() - copy a string, replace bad bytes with '?'.
Not more than "nchars" characters are copied.
status->m_source_end_pos is set to a position in the range
- between "src" and "src + src_length".
+ between "src" and "src + src_length", where reading stopped.
status->m_well_formed_error_pos is set to NULL if the string
in the range "src" and "status->m_source_end_pos" was well formed,
- or is set to "src + src_length" otherwise.
+ or is set to a position between "src" and "src + src_length" where
+ the leftmost bad byte sequence was found.
*/
- size_t (*copy_abort)(CHARSET_INFO *,
- char *dst, size_t dst_length,
- const char *src, size_t src_length,
- size_t nchars, MY_STRCOPY_STATUS *status);
+ size_t (*copy_fix)(CHARSET_INFO *,
+ char *dst, size_t dst_length,
+ const char *src, size_t src_length,
+ size_t nchars, MY_STRCOPY_STATUS *status);
};
extern MY_CHARSET_HANDLER my_charset_8bit_handler;
@@ -596,10 +638,10 @@ size_t my_copy_8bit(CHARSET_INFO *,
char *dst, size_t dst_length,
const char *src, size_t src_length,
size_t nchars, MY_STRCOPY_STATUS *);
-size_t my_copy_abort_mb(CHARSET_INFO *cs,
- char *dst, size_t dst_length,
- const char *src, size_t src_length,
- size_t nchars, MY_STRCOPY_STATUS *);
+size_t my_copy_fix_mb(CHARSET_INFO *cs,
+ char *dst, size_t dst_length,
+ const char *src, size_t src_length,
+ size_t nchars, MY_STRCOPY_STATUS *);
/* Functions for 8bit */
extern size_t my_caseup_str_8bit(CHARSET_INFO *, char *);
@@ -691,6 +733,11 @@ size_t my_numcells_8bit(CHARSET_INFO *, const char *b, const char *e);
size_t my_charpos_8bit(CHARSET_INFO *, const char *b, const char *e, size_t pos);
size_t my_well_formed_len_8bit(CHARSET_INFO *, const char *b, const char *e,
size_t pos, int *error);
+size_t my_well_formed_char_length_8bit(CHARSET_INFO *cs,
+ const char *b, const char *e,
+ size_t nchars,
+ MY_STRCOPY_STATUS *status);
+int my_charlen_8bit(CHARSET_INFO *, const uchar *str, const uchar *end);
uint my_mbcharlen_8bit(CHARSET_INFO *, uint c);
diff --git a/mysql-test/r/ctype_big5.result b/mysql-test/r/ctype_big5.result
index 175bbf0f09f..d18c2a00c6f 100644
--- a/mysql-test/r/ctype_big5.result
+++ b/mysql-test/r/ctype_big5.result
@@ -597,7 +597,7 @@ Warning 1366 Incorrect string value: '\x80\' for column 'a' at row 61
Warning 1366 Incorrect string value: '\x80]' for column 'a' at row 62
Warning 1366 Incorrect string value: '\x80^' for column 'a' at row 63
Warning 1366 Incorrect string value: '\x80_' for column 'a' at row 64
-SELECT COUNT(*) FROM t1 WHERE a<>'';
+SELECT COUNT(*) FROM t1 WHERE a<>'?';
COUNT(*)
13973
SELECT code, hex(upper(a)), hex(lower(a)),a, upper(a), lower(a) FROM t1 WHERE hex(a)<>hex(upper(a)) OR hex(a)<>hex(lower(a));
diff --git a/mysql-test/r/ctype_cp932_binlog_stm.result b/mysql-test/r/ctype_cp932_binlog_stm.result
index 0e6ae25a395..fd920223091 100644
--- a/mysql-test/r/ctype_cp932_binlog_stm.result
+++ b/mysql-test/r/ctype_cp932_binlog_stm.result
@@ -165,7 +165,7 @@ Warning 1366 Incorrect string value: '\x80_' for column 'a' at row 64
SELECT COUNT(*) FROM t1;
COUNT(*)
14623
-SELECT COUNT(*) FROM t1 WHERE a<>'' AND OCTET_LENGTH(a)=1;
+SELECT COUNT(*) FROM t1 WHERE a<>'?' AND OCTET_LENGTH(a)=1;
COUNT(*)
63
SELECT COUNT(*) FROM t1 WHERE a<>'' AND OCTET_LENGTH(a)=2;
diff --git a/mysql-test/r/ctype_eucjpms.result b/mysql-test/r/ctype_eucjpms.result
index a1232c115e9..49d86c18a3d 100644
--- a/mysql-test/r/ctype_eucjpms.result
+++ b/mysql-test/r/ctype_eucjpms.result
@@ -10101,6 +10101,9 @@ COUNT(*)
56959
SELECT COUNT(*) FROM t1 WHERE a<>'';
COUNT(*)
+56959
+SELECT COUNT(*) FROM t1 WHERE a<>'' AND a<>'?';
+COUNT(*)
17735
SELECT COUNT(*) FROM t1 WHERE a<>'' AND OCTET_LENGTH(a)=2;
COUNT(*)
@@ -33632,7 +33635,7 @@ CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET eucjpms);
INSERT INTO t1 VALUES (0x8EA0);
SELECT HEX(a), CHAR_LENGTH(a) FROM t1;
HEX(a) CHAR_LENGTH(a)
- 0
+3F3F 2
DROP TABLE t1;
SELECT _eucjpms 0x8EA0;
ERROR HY000: Invalid eucjpms character string: '8EA0'
diff --git a/mysql-test/r/ctype_euckr.result b/mysql-test/r/ctype_euckr.result
index dcb68cfe60b..0ee63bb76b2 100644
--- a/mysql-test/r/ctype_euckr.result
+++ b/mysql-test/r/ctype_euckr.result
@@ -407,12 +407,12 @@ Warnings:
Warning 1366 Incorrect string value: '\xA1\xFF' for column 's1' at row 1
select hex(s1), hex(convert(s1 using utf8)) from t1 order by binary s1;
hex(s1) hex(convert(s1 using utf8))
-
-
-
-
-
-
+3F3F 3F3F
+3F3F 3F3F
+3F40 3F40
+3F5B 3F5B
+3F60 3F60
+3F7B 3F7B
A141 ECA2A5
A15A ECA381
A161 ECA382
@@ -445,7 +445,7 @@ FROM t1 t11, t1 t12
WHERE t11.a >= 0x81 AND t11.a <= 0xFE
AND t12.a >= 0x41 AND t12.a <= 0xFE
ORDER BY t11.a, t12.a;
-SELECT s as bad_code FROM t2 WHERE a='' ORDER BY s;
+SELECT s as bad_code FROM t2 WHERE a='?' ORDER BY s;
bad_code
815B
815C
@@ -1959,7 +1959,7 @@ FE7D
FE7E
FE7F
FE80
-DELETE FROM t2 WHERE a='';
+DELETE FROM t2 WHERE a='?';
ALTER TABLE t2 ADD u VARCHAR(1) CHARACTER SET utf8, ADD a2 VARCHAR(1) CHARACTER SET euckr;
UPDATE t2 SET u=a, a2=u;
SELECT s as unassigned_code FROM t2 WHERE u='?';
@@ -24492,7 +24492,7 @@ Warning 1366 Incorrect string value: '\x80\' for column 'a' at row 61
Warning 1366 Incorrect string value: '\x80]' for column 'a' at row 62
Warning 1366 Incorrect string value: '\x80^' for column 'a' at row 63
Warning 1366 Incorrect string value: '\x80_' for column 'a' at row 64
-SELECT COUNT(*) FROM t1 WHERE a<>'';
+SELECT COUNT(*) FROM t1 WHERE a<>'?';
COUNT(*)
22428
SELECT code, hex(upper(a)), hex(lower(a)),a, upper(a), lower(a) FROM t1 WHERE hex(a)<>hex(upper(a)) OR hex(a)<>hex(lower(a));
diff --git a/mysql-test/r/ctype_gb2312.result b/mysql-test/r/ctype_gb2312.result
index 5db6e2d3035..ceecb7786b0 100644
--- a/mysql-test/r/ctype_gb2312.result
+++ b/mysql-test/r/ctype_gb2312.result
@@ -553,7 +553,7 @@ Warning 1366 Incorrect string value: '\x80\' for column 'a' at row 61
Warning 1366 Incorrect string value: '\x80]' for column 'a' at row 62
Warning 1366 Incorrect string value: '\x80^' for column 'a' at row 63
Warning 1366 Incorrect string value: '\x80_' for column 'a' at row 64
-SELECT COUNT(*) FROM t1 WHERE a<>'';
+SELECT COUNT(*) FROM t1 WHERE a<>'?';
COUNT(*)
8178
SELECT code, hex(upper(a)), hex(lower(a)),a, upper(a), lower(a) FROM t1 WHERE hex(a)<>hex(upper(a)) OR hex(a)<>hex(lower(a));
diff --git a/mysql-test/r/ctype_gbk.result b/mysql-test/r/ctype_gbk.result
index c5d997b0213..55561cfa289 100644
--- a/mysql-test/r/ctype_gbk.result
+++ b/mysql-test/r/ctype_gbk.result
@@ -573,7 +573,7 @@ Warning 1366 Incorrect string value: '\x80\' for column 'a' at row 61
Warning 1366 Incorrect string value: '\x80]' for column 'a' at row 62
Warning 1366 Incorrect string value: '\x80^' for column 'a' at row 63
Warning 1366 Incorrect string value: '\x80_' for column 'a' at row 64
-SELECT COUNT(*) FROM t1 WHERE a<>'';
+SELECT COUNT(*) FROM t1 WHERE a<>'?';
COUNT(*)
23940
SELECT code, hex(upper(a)), hex(lower(a)),a, upper(a), lower(a) FROM t1 WHERE hex(a)<>hex(upper(a)) OR hex(a)<>hex(lower(a));
@@ -4946,3 +4946,814 @@ DROP TABLE t1;
#
# End of 10.0 tests
#
+#
+# Start of 10.1 tests
+#
+#
+# MDEV-6566 Different INSERT behaviour on bad bytes with and without character set conversion
+#
+CREATE TABLE t1 (
+id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
+b VARBINARY(16),
+type SET('ascii','bad','head','tail','mb2','unassigned')
+);
+INSERT INTO t1 (b, type) VALUES (0x40, 'ascii,tail');
+INSERT INTO t1 (b, type) VALUES (0x80, 'tail');
+INSERT INTO t1 (b, type) VALUES (0x81, 'head,tail');
+INSERT INTO t1 (b, type) VALUES (0xFF, 'bad');
+INSERT INTO t1 (b, type) VALUES (0xA140, 'mb2,unassigned');
+INSERT INTO t1 (b, type) VALUES (0xA1A3, 'mb2');
+INSERT INTO t1 (b, type) VALUES (0xFE40, 'mb2');
+CREATE TABLE t2 AS SELECT
+CONCAT(t1.b,t2.b) AS b,
+t1.type AS type1,
+t2.type AS type2,
+CONCAT('[',t1.type,'][',t2.type,']') AS comment
+FROM t1, t1 t2;
+CREATE TABLE t3
+(
+b VARBINARY(16),
+c VARCHAR(16) CHARACTER SET gbk,
+comment VARCHAR(128)
+);
+#
+# A combination of two valid characters, should give no warnings
+#
+INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2
+WHERE
+(FIND_IN_SET('ascii',type1) OR FIND_IN_SET('mb2',type1)) AND
+(FIND_IN_SET('ascii',type2) OR FIND_IN_SET('mb2',type2))
+ORDER BY b;
+SELECT COUNT(*) FROM t3;
+COUNT(*)
+16
+SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b;
+HEX(c) comment
+4040 [ascii,tail][ascii,tail]
+40A140 [ascii,tail][mb2,unassigned]
+40A1A3 [ascii,tail][mb2]
+40FE40 [ascii,tail][mb2]
+A14040 [mb2,unassigned][ascii,tail]
+A140A140 [mb2,unassigned][mb2,unassigned]
+A140A1A3 [mb2,unassigned][mb2]
+A140FE40 [mb2,unassigned][mb2]
+A1A340 [mb2][ascii,tail]
+A1A3A140 [mb2][mb2,unassigned]
+A1A3A1A3 [mb2][mb2]
+A1A3FE40 [mb2][mb2]
+FE4040 [mb2][ascii,tail]
+FE40A140 [mb2][mb2,unassigned]
+FE40A1A3 [mb2][mb2]
+FE40FE40 [mb2][mb2]
+SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b;
+HEX(c) HEX(b) comment
+DELETE FROM t2 WHERE b IN (SELECT b FROM t3);
+DELETE FROM t3;
+#
+# Sequences that start with a tail or a bad byte,
+# or end with a bad byte, all should be fixed.
+#
+INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2
+WHERE type1='tail' OR type1='bad' OR type2='bad'
+ORDER BY b;
+Warnings:
+Warning 1366 Incorrect string value: '\xFF' for column 'c' at row 1
+Warning 1366 Incorrect string value: '\x80@' for column 'c' at row 2
+Warning 1366 Incorrect string value: '\x80\x80' for column 'c' at row 3
+Warning 1366 Incorrect string value: '\x80\x81' for column 'c' at row 4
+Warning 1366 Incorrect string value: '\x80\xA1@' for column 'c' at row 5
+Warning 1366 Incorrect string value: '\x80\xA1\xA3' for column 'c' at row 6
+Warning 1366 Incorrect string value: '\x80\xFE@' for column 'c' at row 7
+Warning 1366 Incorrect string value: '\x80\xFF' for column 'c' at row 8
+Warning 1366 Incorrect string value: '\x81\xFF' for column 'c' at row 9
+Warning 1366 Incorrect string value: '\xFF' for column 'c' at row 10
+Warning 1366 Incorrect string value: '\xFF' for column 'c' at row 11
+Warning 1366 Incorrect string value: '\xFF' for column 'c' at row 12
+Warning 1366 Incorrect string value: '\xFF@' for column 'c' at row 13
+Warning 1366 Incorrect string value: '\xFF\x80' for column 'c' at row 14
+Warning 1366 Incorrect string value: '\xFF\x81' for column 'c' at row 15
+Warning 1366 Incorrect string value: '\xFF\xA1@' for column 'c' at row 16
+Warning 1366 Incorrect string value: '\xFF\xA1\xA3' for column 'c' at row 17
+Warning 1366 Incorrect string value: '\xFF\xFE@' for column 'c' at row 18
+Warning 1366 Incorrect string value: '\xFF\xFF' for column 'c' at row 19
+SELECT COUNT(*) FROM t3;
+COUNT(*)
+19
+SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b;
+HEX(c) comment
+SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b;
+HEX(c) HEX(b) comment
+403F 40FF [ascii,tail][bad]
+3F40 8040 [tail][ascii,tail]
+3F3F 8080 [tail][tail]
+3F3F 8081 [tail][head,tail]
+3FA140 80A140 [tail][mb2,unassigned]
+3FA1A3 80A1A3 [tail][mb2]
+3FFE40 80FE40 [tail][mb2]
+3F3F 80FF [tail][bad]
+3F3F 81FF [head,tail][bad]
+A1403F A140FF [mb2,unassigned][bad]
+A1A33F A1A3FF [mb2][bad]
+FE403F FE40FF [mb2][bad]
+3F40 FF40 [bad][ascii,tail]
+3F3F FF80 [bad][tail]
+3F3F FF81 [bad][head,tail]
+3FA140 FFA140 [bad][mb2,unassigned]
+3FA1A3 FFA1A3 [bad][mb2]
+3FFE40 FFFE40 [bad][mb2]
+3F3F FFFF [bad][bad]
+DELETE FROM t2 WHERE b IN (SELECT b FROM t3);
+DELETE FROM t3;
+#
+# Sequences that start with an ASCII or an MB2 character,
+# followed by a non-ASCII tail, all should be fixed.
+#
+INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2
+WHERE (FIND_IN_SET('mb2',type1) OR FIND_IN_SET('ascii',type1))
+AND (FIND_IN_SET('tail',type2) AND NOT FIND_IN_SET('ascii',type2))
+ORDER BY b;
+Warnings:
+Warning 1366 Incorrect string value: '\x80' for column 'c' at row 1
+Warning 1366 Incorrect string value: '\x81' for column 'c' at row 2
+Warning 1366 Incorrect string value: '\x80' for column 'c' at row 3
+Warning 1366 Incorrect string value: '\x81' for column 'c' at row 4
+Warning 1366 Incorrect string value: '\x80' for column 'c' at row 5
+Warning 1366 Incorrect string value: '\x81' for column 'c' at row 6
+Warning 1366 Incorrect string value: '\x80' for column 'c' at row 7
+Warning 1366 Incorrect string value: '\x81' for column 'c' at row 8
+SELECT COUNT(*) FROM t3;
+COUNT(*)
+8
+SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b;
+HEX(c) comment
+SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b;
+HEX(c) HEX(b) comment
+403F 4080 [ascii,tail][tail]
+403F 4081 [ascii,tail][head,tail]
+A1403F A14080 [mb2,unassigned][tail]
+A1403F A14081 [mb2,unassigned][head,tail]
+A1A33F A1A380 [mb2][tail]
+A1A33F A1A381 [mb2][head,tail]
+FE403F FE4080 [mb2][tail]
+FE403F FE4081 [mb2][head,tail]
+DELETE FROM t2 WHERE b IN (SELECT b FROM t3);
+DELETE FROM t3;
+#
+# Other sequences
+#
+INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2 ORDER BY b;
+Warnings:
+Warning 1366 Incorrect string value: '\xA3' for column 'c' at row 5
+SELECT COUNT(*) FROM t3;
+COUNT(*)
+6
+SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b;
+HEX(c) comment
+8140 [head,tail][ascii,tail]
+8180 [head,tail][tail]
+8181 [head,tail][head,tail]
+81A140 [head,tail][mb2,unassigned]
+81FE40 [head,tail][mb2]
+SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b;
+HEX(c) HEX(b) comment
+81A13F 81A1A3 [head,tail][mb2]
+DELETE FROM t3;
+DROP TABLE t3;
+DROP TABLE t2;
+CREATE TABLE t2 AS SELECT
+CONCAT(t1.b,t2.b,t3.b) AS b,
+t1.type AS type1,
+t2.type AS type2,
+t3.type AS type3,
+CONCAT('[',t1.type,'][',t2.type,'][',t3.type,']') AS comment
+FROM t1, t1 t2,t1 t3;
+SELECT COUNT(*) FROM t2;
+COUNT(*)
+343
+CREATE TABLE t3
+(
+b VARBINARY(16),
+c VARCHAR(16) CHARACTER SET gbk,
+comment VARCHAR(128)
+);
+#
+# A combination of three valid characters, should give no warnings
+#
+INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2
+WHERE
+(FIND_IN_SET('ascii',type1) OR FIND_IN_SET('mb2',type1)) AND
+(FIND_IN_SET('ascii',type2) OR FIND_IN_SET('mb2',type2)) AND
+(FIND_IN_SET('ascii',type3) OR FIND_IN_SET('mb2',type3))
+ORDER BY b;
+SELECT COUNT(*) FROM t3;
+COUNT(*)
+64
+SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b;
+HEX(c) comment
+404040 [ascii,tail][ascii,tail][ascii,tail]
+4040A140 [ascii,tail][ascii,tail][mb2,unassigned]
+4040A1A3 [ascii,tail][ascii,tail][mb2]
+4040FE40 [ascii,tail][ascii,tail][mb2]
+40A14040 [ascii,tail][mb2,unassigned][ascii,tail]
+40A140A140 [ascii,tail][mb2,unassigned][mb2,unassigned]
+40A140A1A3 [ascii,tail][mb2,unassigned][mb2]
+40A140FE40 [ascii,tail][mb2,unassigned][mb2]
+40A1A340 [ascii,tail][mb2][ascii,tail]
+40A1A3A140 [ascii,tail][mb2][mb2,unassigned]
+40A1A3A1A3 [ascii,tail][mb2][mb2]
+40A1A3FE40 [ascii,tail][mb2][mb2]
+40FE4040 [ascii,tail][mb2][ascii,tail]
+40FE40A140 [ascii,tail][mb2][mb2,unassigned]
+40FE40A1A3 [ascii,tail][mb2][mb2]
+40FE40FE40 [ascii,tail][mb2][mb2]
+A1404040 [mb2,unassigned][ascii,tail][ascii,tail]
+A14040A140 [mb2,unassigned][ascii,tail][mb2,unassigned]
+A14040A1A3 [mb2,unassigned][ascii,tail][mb2]
+A14040FE40 [mb2,unassigned][ascii,tail][mb2]
+A140A14040 [mb2,unassigned][mb2,unassigned][ascii,tail]
+A140A140A140 [mb2,unassigned][mb2,unassigned][mb2,unassigned]
+A140A140A1A3 [mb2,unassigned][mb2,unassigned][mb2]
+A140A140FE40 [mb2,unassigned][mb2,unassigned][mb2]
+A140A1A340 [mb2,unassigned][mb2][ascii,tail]
+A140A1A3A140 [mb2,unassigned][mb2][mb2,unassigned]
+A140A1A3A1A3 [mb2,unassigned][mb2][mb2]
+A140A1A3FE40 [mb2,unassigned][mb2][mb2]
+A140FE4040 [mb2,unassigned][mb2][ascii,tail]
+A140FE40A140 [mb2,unassigned][mb2][mb2,unassigned]
+A140FE40A1A3 [mb2,unassigned][mb2][mb2]
+A140FE40FE40 [mb2,unassigned][mb2][mb2]
+A1A34040 [mb2][ascii,tail][ascii,tail]
+A1A340A140 [mb2][ascii,tail][mb2,unassigned]
+A1A340A1A3 [mb2][ascii,tail][mb2]
+A1A340FE40 [mb2][ascii,tail][mb2]
+A1A3A14040 [mb2][mb2,unassigned][ascii,tail]
+A1A3A140A140 [mb2][mb2,unassigned][mb2,unassigned]
+A1A3A140A1A3 [mb2][mb2,unassigned][mb2]
+A1A3A140FE40 [mb2][mb2,unassigned][mb2]
+A1A3A1A340 [mb2][mb2][ascii,tail]
+A1A3A1A3A140 [mb2][mb2][mb2,unassigned]
+A1A3A1A3A1A3 [mb2][mb2][mb2]
+A1A3A1A3FE40 [mb2][mb2][mb2]
+A1A3FE4040 [mb2][mb2][ascii,tail]
+A1A3FE40A140 [mb2][mb2][mb2,unassigned]
+A1A3FE40A1A3 [mb2][mb2][mb2]
+A1A3FE40FE40 [mb2][mb2][mb2]
+FE404040 [mb2][ascii,tail][ascii,tail]
+FE4040A140 [mb2][ascii,tail][mb2,unassigned]
+FE4040A1A3 [mb2][ascii,tail][mb2]
+FE4040FE40 [mb2][ascii,tail][mb2]
+FE40A14040 [mb2][mb2,unassigned][ascii,tail]
+FE40A140A140 [mb2][mb2,unassigned][mb2,unassigned]
+FE40A140A1A3 [mb2][mb2,unassigned][mb2]
+FE40A140FE40 [mb2][mb2,unassigned][mb2]
+FE40A1A340 [mb2][mb2][ascii,tail]
+FE40A1A3A140 [mb2][mb2][mb2,unassigned]
+FE40A1A3A1A3 [mb2][mb2][mb2]
+FE40A1A3FE40 [mb2][mb2][mb2]
+FE40FE4040 [mb2][mb2][ascii,tail]
+FE40FE40A140 [mb2][mb2][mb2,unassigned]
+FE40FE40A1A3 [mb2][mb2][mb2]
+FE40FE40FE40 [mb2][mb2][mb2]
+SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b;
+HEX(c) HEX(b) comment
+DELETE FROM t2 WHERE b IN (SELECT b FROM t3);
+DELETE FROM t3;
+#
+# Sequences that start with a tail or a bad byte,
+# or have a bad byte, all should be fixed.
+#
+INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2
+WHERE type1='tail' OR type1='bad' OR type2='bad' OR type3='bad'
+ORDER BY b;
+Warnings:
+Warning 1366 Incorrect string value: '\xFF' for column 'c' at row 1
+Warning 1366 Incorrect string value: '\x80\xFF' for column 'c' at row 2
+Warning 1366 Incorrect string value: '\x81\xFF' for column 'c' at row 3
+Warning 1366 Incorrect string value: '\xFF' for column 'c' at row 4
+Warning 1366 Incorrect string value: '\xFF' for column 'c' at row 5
+Warning 1366 Incorrect string value: '\xFF' for column 'c' at row 6
+Warning 1366 Incorrect string value: '\xFF@' for column 'c' at row 7
+Warning 1366 Incorrect string value: '\xFF\x80' for column 'c' at row 8
+Warning 1366 Incorrect string value: '\xFF\x81' for column 'c' at row 9
+Warning 1366 Incorrect string value: '\xFF\xA1@' for column 'c' at row 10
+Warning 1366 Incorrect string value: '\xFF\xA1\xA3' for column 'c' at row 11
+Warning 1366 Incorrect string value: '\xFF\xFE@' for column 'c' at row 12
+Warning 1366 Incorrect string value: '\xFF\xFF' for column 'c' at row 13
+Warning 1366 Incorrect string value: '\x80@@' for column 'c' at row 14
+Warning 1366 Incorrect string value: '\x80@\x80' for column 'c' at row 15
+Warning 1366 Incorrect string value: '\x80@\x81' for column 'c' at row 16
+Warning 1366 Incorrect string value: '\x80@\xA1@' for column 'c' at row 17
+Warning 1366 Incorrect string value: '\x80@\xA1\xA3' for column 'c' at row 18
+Warning 1366 Incorrect string value: '\x80@\xFE@' for column 'c' at row 19
+Warning 1366 Incorrect string value: '\x80@\xFF' for column 'c' at row 20
+Warning 1366 Incorrect string value: '\x80\x80@' for column 'c' at row 21
+Warning 1366 Incorrect string value: '\x80\x80\x80' for column 'c' at row 22
+Warning 1366 Incorrect string value: '\x80\x80\x81' for column 'c' at row 23
+Warning 1366 Incorrect string value: '\x80\x80\xA1@' for column 'c' at row 24
+Warning 1366 Incorrect string value: '\x80\x80\xA1\xA3' for column 'c' at row 25
+Warning 1366 Incorrect string value: '\x80\x80\xFE@' for column 'c' at row 26
+Warning 1366 Incorrect string value: '\x80\x80\xFF' for column 'c' at row 27
+Warning 1366 Incorrect string value: '\x80\x81@' for column 'c' at row 28
+Warning 1366 Incorrect string value: '\x80\x81\x80' for column 'c' at row 29
+Warning 1366 Incorrect string value: '\x80\x81\x81' for column 'c' at row 30
+Warning 1366 Incorrect string value: '\x80\x81\xA1@' for column 'c' at row 31
+Warning 1366 Incorrect string value: '\x80\x81\xA1\xA3' for column 'c' at row 32
+Warning 1366 Incorrect string value: '\x80\x81\xFE@' for column 'c' at row 33
+Warning 1366 Incorrect string value: '\x80\x81\xFF' for column 'c' at row 34
+Warning 1366 Incorrect string value: '\x80\xA1@@' for column 'c' at row 35
+Warning 1366 Incorrect string value: '\x80\xA1@\x80' for column 'c' at row 36
+Warning 1366 Incorrect string value: '\x80\xA1@\x81' for column 'c' at row 37
+Warning 1366 Incorrect string value: '\x80\xA1@\xA1@' for column 'c' at row 38
+Warning 1366 Incorrect string value: '\x80\xA1@\xA1\xA3' for column 'c' at row 39
+Warning 1366 Incorrect string value: '\x80\xA1@\xFE@' for column 'c' at row 40
+Warning 1366 Incorrect string value: '\x80\xA1@\xFF' for column 'c' at row 41
+Warning 1366 Incorrect string value: '\x80\xA1\xA3@' for column 'c' at row 42
+Warning 1366 Incorrect string value: '\x80\xA1\xA3\x80' for column 'c' at row 43
+Warning 1366 Incorrect string value: '\x80\xA1\xA3\x81' for column 'c' at row 44
+Warning 1366 Incorrect string value: '\x80\xA1\xA3\xA1@' for column 'c' at row 45
+Warning 1366 Incorrect string value: '\x80\xA1\xA3\xA1\xA3' for column 'c' at row 46
+Warning 1366 Incorrect string value: '\x80\xA1\xA3\xFE@' for column 'c' at row 47
+Warning 1366 Incorrect string value: '\x80\xA1\xA3\xFF' for column 'c' at row 48
+Warning 1366 Incorrect string value: '\x80\xFE@@' for column 'c' at row 49
+Warning 1366 Incorrect string value: '\x80\xFE@\x80' for column 'c' at row 50
+Warning 1366 Incorrect string value: '\x80\xFE@\x81' for column 'c' at row 51
+Warning 1366 Incorrect string value: '\x80\xFE@\xA1@' for column 'c' at row 52
+Warning 1366 Incorrect string value: '\x80\xFE@\xA1\xA3' for column 'c' at row 53
+Warning 1366 Incorrect string value: '\x80\xFE@\xFE@' for column 'c' at row 54
+Warning 1366 Incorrect string value: '\x80\xFE@\xFF' for column 'c' at row 55
+Warning 1366 Incorrect string value: '\x80\xFF@' for column 'c' at row 56
+Warning 1366 Incorrect string value: '\x80\xFF\x80' for column 'c' at row 57
+Warning 1366 Incorrect string value: '\x80\xFF\x81' for column 'c' at row 58
+Warning 1366 Incorrect string value: '\x80\xFF\xA1@' for column 'c' at row 59
+Warning 1366 Incorrect string value: '\x80\xFF\xA1\xA3' for column 'c' at row 60
+Warning 1366 Incorrect string value: '\x80\xFF\xFE@' for column 'c' at row 61
+Warning 1366 Incorrect string value: '\x80\xFF\xFF' for column 'c' at row 62
+Warning 1366 Incorrect string value: '\xFF' for column 'c' at row 63
+Warning 1366 Incorrect string value: '\xFF' for column 'c' at row 64
+SELECT COUNT(*) FROM t3;
+COUNT(*)
+163
+SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b;
+HEX(c) comment
+SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b;
+HEX(c) HEX(b) comment
+40403F 4040FF [ascii,tail][ascii,tail][bad]
+403F3F 4080FF [ascii,tail][tail][bad]
+403F3F 4081FF [ascii,tail][head,tail][bad]
+40A1403F 40A140FF [ascii,tail][mb2,unassigned][bad]
+40A1A33F 40A1A3FF [ascii,tail][mb2][bad]
+40FE403F 40FE40FF [ascii,tail][mb2][bad]
+403F40 40FF40 [ascii,tail][bad][ascii,tail]
+403F3F 40FF80 [ascii,tail][bad][tail]
+403F3F 40FF81 [ascii,tail][bad][head,tail]
+403FA140 40FFA140 [ascii,tail][bad][mb2,unassigned]
+403FA1A3 40FFA1A3 [ascii,tail][bad][mb2]
+403FFE40 40FFFE40 [ascii,tail][bad][mb2]
+403F3F 40FFFF [ascii,tail][bad][bad]
+3F4040 804040 [tail][ascii,tail][ascii,tail]
+3F403F 804080 [tail][ascii,tail][tail]
+3F403F 804081 [tail][ascii,tail][head,tail]
+3F40A140 8040A140 [tail][ascii,tail][mb2,unassigned]
+3F40A1A3 8040A1A3 [tail][ascii,tail][mb2]
+3F40FE40 8040FE40 [tail][ascii,tail][mb2]
+3F403F 8040FF [tail][ascii,tail][bad]
+3F3F40 808040 [tail][tail][ascii,tail]
+3F3F3F 808080 [tail][tail][tail]
+3F3F3F 808081 [tail][tail][head,tail]
+3F3FA140 8080A140 [tail][tail][mb2,unassigned]
+3F3FA1A3 8080A1A3 [tail][tail][mb2]
+3F3FFE40 8080FE40 [tail][tail][mb2]
+3F3F3F 8080FF [tail][tail][bad]
+3F8140 808140 [tail][head,tail][ascii,tail]
+3F8180 808180 [tail][head,tail][tail]
+3F8181 808181 [tail][head,tail][head,tail]
+3F81A140 8081A140 [tail][head,tail][mb2,unassigned]
+3F81A13F 8081A1A3 [tail][head,tail][mb2]
+3F81FE40 8081FE40 [tail][head,tail][mb2]
+3F3F3F 8081FF [tail][head,tail][bad]
+3FA14040 80A14040 [tail][mb2,unassigned][ascii,tail]
+3FA1403F 80A14080 [tail][mb2,unassigned][tail]
+3FA1403F 80A14081 [tail][mb2,unassigned][head,tail]
+3FA140A140 80A140A140 [tail][mb2,unassigned][mb2,unassigned]
+3FA140A1A3 80A140A1A3 [tail][mb2,unassigned][mb2]
+3FA140FE40 80A140FE40 [tail][mb2,unassigned][mb2]
+3FA1403F 80A140FF [tail][mb2,unassigned][bad]
+3FA1A340 80A1A340 [tail][mb2][ascii,tail]
+3FA1A33F 80A1A380 [tail][mb2][tail]
+3FA1A33F 80A1A381 [tail][mb2][head,tail]
+3FA1A3A140 80A1A3A140 [tail][mb2][mb2,unassigned]
+3FA1A3A1A3 80A1A3A1A3 [tail][mb2][mb2]
+3FA1A3FE40 80A1A3FE40 [tail][mb2][mb2]
+3FA1A33F 80A1A3FF [tail][mb2][bad]
+3FFE4040 80FE4040 [tail][mb2][ascii,tail]
+3FFE403F 80FE4080 [tail][mb2][tail]
+3FFE403F 80FE4081 [tail][mb2][head,tail]
+3FFE40A140 80FE40A140 [tail][mb2][mb2,unassigned]
+3FFE40A1A3 80FE40A1A3 [tail][mb2][mb2]
+3FFE40FE40 80FE40FE40 [tail][mb2][mb2]
+3FFE403F 80FE40FF [tail][mb2][bad]
+3F3F40 80FF40 [tail][bad][ascii,tail]
+3F3F3F 80FF80 [tail][bad][tail]
+3F3F3F 80FF81 [tail][bad][head,tail]
+3F3FA140 80FFA140 [tail][bad][mb2,unassigned]
+3F3FA1A3 80FFA1A3 [tail][bad][mb2]
+3F3FFE40 80FFFE40 [tail][bad][mb2]
+3F3F3F 80FFFF [tail][bad][bad]
+81403F 8140FF [head,tail][ascii,tail][bad]
+81803F 8180FF [head,tail][tail][bad]
+81813F 8181FF [head,tail][head,tail][bad]
+81A1403F 81A140FF [head,tail][mb2,unassigned][bad]
+81A13F3F 81A1A3FF [head,tail][mb2][bad]
+81FE403F 81FE40FF [head,tail][mb2][bad]
+3F3F40 81FF40 [head,tail][bad][ascii,tail]
+3F3F3F 81FF80 [head,tail][bad][tail]
+3F3F3F 81FF81 [head,tail][bad][head,tail]
+3F3FA140 81FFA140 [head,tail][bad][mb2,unassigned]
+3F3FA1A3 81FFA1A3 [head,tail][bad][mb2]
+3F3FFE40 81FFFE40 [head,tail][bad][mb2]
+3F3F3F 81FFFF [head,tail][bad][bad]
+A140403F A14040FF [mb2,unassigned][ascii,tail][bad]
+A1403F3F A14080FF [mb2,unassigned][tail][bad]
+A1403F3F A14081FF [mb2,unassigned][head,tail][bad]
+A140A1403F A140A140FF [mb2,unassigned][mb2,unassigned][bad]
+A140A1A33F A140A1A3FF [mb2,unassigned][mb2][bad]
+A140FE403F A140FE40FF [mb2,unassigned][mb2][bad]
+A1403F40 A140FF40 [mb2,unassigned][bad][ascii,tail]
+A1403F3F A140FF80 [mb2,unassigned][bad][tail]
+A1403F3F A140FF81 [mb2,unassigned][bad][head,tail]
+A1403FA140 A140FFA140 [mb2,unassigned][bad][mb2,unassigned]
+A1403FA1A3 A140FFA1A3 [mb2,unassigned][bad][mb2]
+A1403FFE40 A140FFFE40 [mb2,unassigned][bad][mb2]
+A1403F3F A140FFFF [mb2,unassigned][bad][bad]
+A1A3403F A1A340FF [mb2][ascii,tail][bad]
+A1A33F3F A1A380FF [mb2][tail][bad]
+A1A33F3F A1A381FF [mb2][head,tail][bad]
+A1A3A1403F A1A3A140FF [mb2][mb2,unassigned][bad]
+A1A3A1A33F A1A3A1A3FF [mb2][mb2][bad]
+A1A3FE403F A1A3FE40FF [mb2][mb2][bad]
+A1A33F40 A1A3FF40 [mb2][bad][ascii,tail]
+A1A33F3F A1A3FF80 [mb2][bad][tail]
+A1A33F3F A1A3FF81 [mb2][bad][head,tail]
+A1A33FA140 A1A3FFA140 [mb2][bad][mb2,unassigned]
+A1A33FA1A3 A1A3FFA1A3 [mb2][bad][mb2]
+A1A33FFE40 A1A3FFFE40 [mb2][bad][mb2]
+A1A33F3F A1A3FFFF [mb2][bad][bad]
+FE40403F FE4040FF [mb2][ascii,tail][bad]
+FE403F3F FE4080FF [mb2][tail][bad]
+FE403F3F FE4081FF [mb2][head,tail][bad]
+FE40A1403F FE40A140FF [mb2][mb2,unassigned][bad]
+FE40A1A33F FE40A1A3FF [mb2][mb2][bad]
+FE40FE403F FE40FE40FF [mb2][mb2][bad]
+FE403F40 FE40FF40 [mb2][bad][ascii,tail]
+FE403F3F FE40FF80 [mb2][bad][tail]
+FE403F3F FE40FF81 [mb2][bad][head,tail]
+FE403FA140 FE40FFA140 [mb2][bad][mb2,unassigned]
+FE403FA1A3 FE40FFA1A3 [mb2][bad][mb2]
+FE403FFE40 FE40FFFE40 [mb2][bad][mb2]
+FE403F3F FE40FFFF [mb2][bad][bad]
+3F4040 FF4040 [bad][ascii,tail][ascii,tail]
+3F403F FF4080 [bad][ascii,tail][tail]
+3F403F FF4081 [bad][ascii,tail][head,tail]
+3F40A140 FF40A140 [bad][ascii,tail][mb2,unassigned]
+3F40A1A3 FF40A1A3 [bad][ascii,tail][mb2]
+3F40FE40 FF40FE40 [bad][ascii,tail][mb2]
+3F403F FF40FF [bad][ascii,tail][bad]
+3F3F40 FF8040 [bad][tail][ascii,tail]
+3F3F3F FF8080 [bad][tail][tail]
+3F3F3F FF8081 [bad][tail][head,tail]
+3F3FA140 FF80A140 [bad][tail][mb2,unassigned]
+3F3FA1A3 FF80A1A3 [bad][tail][mb2]
+3F3FFE40 FF80FE40 [bad][tail][mb2]
+3F3F3F FF80FF [bad][tail][bad]
+3F8140 FF8140 [bad][head,tail][ascii,tail]
+3F8180 FF8180 [bad][head,tail][tail]
+3F8181 FF8181 [bad][head,tail][head,tail]
+3F81A140 FF81A140 [bad][head,tail][mb2,unassigned]
+3F81A13F FF81A1A3 [bad][head,tail][mb2]
+3F81FE40 FF81FE40 [bad][head,tail][mb2]
+3F3F3F FF81FF [bad][head,tail][bad]
+3FA14040 FFA14040 [bad][mb2,unassigned][ascii,tail]
+3FA1403F FFA14080 [bad][mb2,unassigned][tail]
+3FA1403F FFA14081 [bad][mb2,unassigned][head,tail]
+3FA140A140 FFA140A140 [bad][mb2,unassigned][mb2,unassigned]
+3FA140A1A3 FFA140A1A3 [bad][mb2,unassigned][mb2]
+3FA140FE40 FFA140FE40 [bad][mb2,unassigned][mb2]
+3FA1403F FFA140FF [bad][mb2,unassigned][bad]
+3FA1A340 FFA1A340 [bad][mb2][ascii,tail]
+3FA1A33F FFA1A380 [bad][mb2][tail]
+3FA1A33F FFA1A381 [bad][mb2][head,tail]
+3FA1A3A140 FFA1A3A140 [bad][mb2][mb2,unassigned]
+3FA1A3A1A3 FFA1A3A1A3 [bad][mb2][mb2]
+3FA1A3FE40 FFA1A3FE40 [bad][mb2][mb2]
+3FA1A33F FFA1A3FF [bad][mb2][bad]
+3FFE4040 FFFE4040 [bad][mb2][ascii,tail]
+3FFE403F FFFE4080 [bad][mb2][tail]
+3FFE403F FFFE4081 [bad][mb2][head,tail]
+3FFE40A140 FFFE40A140 [bad][mb2][mb2,unassigned]
+3FFE40A1A3 FFFE40A1A3 [bad][mb2][mb2]
+3FFE40FE40 FFFE40FE40 [bad][mb2][mb2]
+3FFE403F FFFE40FF [bad][mb2][bad]
+3F3F40 FFFF40 [bad][bad][ascii,tail]
+3F3F3F FFFF80 [bad][bad][tail]
+3F3F3F FFFF81 [bad][bad][head,tail]
+3F3FA140 FFFFA140 [bad][bad][mb2,unassigned]
+3F3FA1A3 FFFFA1A3 [bad][bad][mb2]
+3F3FFE40 FFFFFE40 [bad][bad][mb2]
+3F3F3F FFFFFF [bad][bad][bad]
+DELETE FROM t2 WHERE b IN (SELECT b FROM t3);
+DELETE FROM t3;
+#
+# Sequences that start with an ASCII or an MB2 character,
+# followed by a pure non-ASCII tail, all should be fixed.
+#
+INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2
+WHERE (FIND_IN_SET('mb2',type1) OR FIND_IN_SET('ascii',type1))
+AND type2='tail'
+ORDER BY b;
+Warnings:
+Warning 1366 Incorrect string value: '\x80@' for column 'c' at row 1
+Warning 1366 Incorrect string value: '\x80\x80' for column 'c' at row 2
+Warning 1366 Incorrect string value: '\x80\x81' for column 'c' at row 3
+Warning 1366 Incorrect string value: '\x80\xA1@' for column 'c' at row 4
+Warning 1366 Incorrect string value: '\x80\xA1\xA3' for column 'c' at row 5
+Warning 1366 Incorrect string value: '\x80\xFE@' for column 'c' at row 6
+Warning 1366 Incorrect string value: '\x80@' for column 'c' at row 7
+Warning 1366 Incorrect string value: '\x80\x80' for column 'c' at row 8
+Warning 1366 Incorrect string value: '\x80\x81' for column 'c' at row 9
+Warning 1366 Incorrect string value: '\x80\xA1@' for column 'c' at row 10
+Warning 1366 Incorrect string value: '\x80\xA1\xA3' for column 'c' at row 11
+Warning 1366 Incorrect string value: '\x80\xFE@' for column 'c' at row 12
+Warning 1366 Incorrect string value: '\x80@' for column 'c' at row 13
+Warning 1366 Incorrect string value: '\x80\x80' for column 'c' at row 14
+Warning 1366 Incorrect string value: '\x80\x81' for column 'c' at row 15
+Warning 1366 Incorrect string value: '\x80\xA1@' for column 'c' at row 16
+Warning 1366 Incorrect string value: '\x80\xA1\xA3' for column 'c' at row 17
+Warning 1366 Incorrect string value: '\x80\xFE@' for column 'c' at row 18
+Warning 1366 Incorrect string value: '\x80@' for column 'c' at row 19
+Warning 1366 Incorrect string value: '\x80\x80' for column 'c' at row 20
+Warning 1366 Incorrect string value: '\x80\x81' for column 'c' at row 21
+Warning 1366 Incorrect string value: '\x80\xA1@' for column 'c' at row 22
+Warning 1366 Incorrect string value: '\x80\xA1\xA3' for column 'c' at row 23
+Warning 1366 Incorrect string value: '\x80\xFE@' for column 'c' at row 24
+SELECT COUNT(*) FROM t3;
+COUNT(*)
+24
+SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b;
+HEX(c) comment
+SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b;
+HEX(c) HEX(b) comment
+403F40 408040 [ascii,tail][tail][ascii,tail]
+403F3F 408080 [ascii,tail][tail][tail]
+403F3F 408081 [ascii,tail][tail][head,tail]
+403FA140 4080A140 [ascii,tail][tail][mb2,unassigned]
+403FA1A3 4080A1A3 [ascii,tail][tail][mb2]
+403FFE40 4080FE40 [ascii,tail][tail][mb2]
+A1403F40 A1408040 [mb2,unassigned][tail][ascii,tail]
+A1403F3F A1408080 [mb2,unassigned][tail][tail]
+A1403F3F A1408081 [mb2,unassigned][tail][head,tail]
+A1403FA140 A14080A140 [mb2,unassigned][tail][mb2,unassigned]
+A1403FA1A3 A14080A1A3 [mb2,unassigned][tail][mb2]
+A1403FFE40 A14080FE40 [mb2,unassigned][tail][mb2]
+A1A33F40 A1A38040 [mb2][tail][ascii,tail]
+A1A33F3F A1A38080 [mb2][tail][tail]
+A1A33F3F A1A38081 [mb2][tail][head,tail]
+A1A33FA140 A1A380A140 [mb2][tail][mb2,unassigned]
+A1A33FA1A3 A1A380A1A3 [mb2][tail][mb2]
+A1A33FFE40 A1A380FE40 [mb2][tail][mb2]
+FE403F40 FE408040 [mb2][tail][ascii,tail]
+FE403F3F FE408080 [mb2][tail][tail]
+FE403F3F FE408081 [mb2][tail][head,tail]
+FE403FA140 FE4080A140 [mb2][tail][mb2,unassigned]
+FE403FA1A3 FE4080A1A3 [mb2][tail][mb2]
+FE403FFE40 FE4080FE40 [mb2][tail][mb2]
+DELETE FROM t2 WHERE b IN (SELECT b FROM t3);
+DELETE FROM t3;
+#
+# Sequences that consist of two ASCII or MB2 characters,
+# followed by a pure non-ASCII tail, all should be fixed.
+#
+INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2
+WHERE (FIND_IN_SET('mb2',type1) OR FIND_IN_SET('ascii',type1)) AND
+(FIND_IN_SET('mb2',type2) OR FIND_IN_SET('ascii',type2)) AND
+type3='tail'
+ORDER BY b;
+Warnings:
+Warning 1366 Incorrect string value: '\x80' for column 'c' at row 1
+Warning 1366 Incorrect string value: '\x80' for column 'c' at row 2
+Warning 1366 Incorrect string value: '\x80' for column 'c' at row 3
+Warning 1366 Incorrect string value: '\x80' for column 'c' at row 4
+Warning 1366 Incorrect string value: '\x80' for column 'c' at row 5
+Warning 1366 Incorrect string value: '\x80' for column 'c' at row 6
+Warning 1366 Incorrect string value: '\x80' for column 'c' at row 7
+Warning 1366 Incorrect string value: '\x80' for column 'c' at row 8
+Warning 1366 Incorrect string value: '\x80' for column 'c' at row 9
+Warning 1366 Incorrect string value: '\x80' for column 'c' at row 10
+Warning 1366 Incorrect string value: '\x80' for column 'c' at row 11
+Warning 1366 Incorrect string value: '\x80' for column 'c' at row 12
+Warning 1366 Incorrect string value: '\x80' for column 'c' at row 13
+Warning 1366 Incorrect string value: '\x80' for column 'c' at row 14
+Warning 1366 Incorrect string value: '\x80' for column 'c' at row 15
+Warning 1366 Incorrect string value: '\x80' for column 'c' at row 16
+SELECT COUNT(*) FROM t3;
+COUNT(*)
+16
+SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b;
+HEX(c) comment
+SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b;
+HEX(c) HEX(b) comment
+40403F 404080 [ascii,tail][ascii,tail][tail]
+40A1403F 40A14080 [ascii,tail][mb2,unassigned][tail]
+40A1A33F 40A1A380 [ascii,tail][mb2][tail]
+40FE403F 40FE4080 [ascii,tail][mb2][tail]
+A140403F A1404080 [mb2,unassigned][ascii,tail][tail]
+A140A1403F A140A14080 [mb2,unassigned][mb2,unassigned][tail]
+A140A1A33F A140A1A380 [mb2,unassigned][mb2][tail]
+A140FE403F A140FE4080 [mb2,unassigned][mb2][tail]
+A1A3403F A1A34080 [mb2][ascii,tail][tail]
+A1A3A1403F A1A3A14080 [mb2][mb2,unassigned][tail]
+A1A3A1A33F A1A3A1A380 [mb2][mb2][tail]
+A1A3FE403F A1A3FE4080 [mb2][mb2][tail]
+FE40403F FE404080 [mb2][ascii,tail][tail]
+FE40A1403F FE40A14080 [mb2][mb2,unassigned][tail]
+FE40A1A33F FE40A1A380 [mb2][mb2][tail]
+FE40FE403F FE40FE4080 [mb2][mb2][tail]
+DELETE FROM t2 WHERE b IN (SELECT b FROM t3);
+DELETE FROM t3;
+#
+# Sequences that consist of two MB2 characters,
+# followed by a non-ASCII head or tail, all should be fixed.
+#
+INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2
+WHERE FIND_IN_SET('mb2',type1) AND FIND_IN_SET('mb2',type2)
+AND NOT FIND_IN_SET('ascii',type3)
+AND NOT FIND_IN_SET('mb2',type3)
+ORDER BY b;
+Warnings:
+Warning 1366 Incorrect string value: '\x81' for column 'c' at row 1
+Warning 1366 Incorrect string value: '\x81' for column 'c' at row 2
+Warning 1366 Incorrect string value: '\x81' for column 'c' at row 3
+Warning 1366 Incorrect string value: '\x81' for column 'c' at row 4
+Warning 1366 Incorrect string value: '\x81' for column 'c' at row 5
+Warning 1366 Incorrect string value: '\x81' for column 'c' at row 6
+Warning 1366 Incorrect string value: '\x81' for column 'c' at row 7
+Warning 1366 Incorrect string value: '\x81' for column 'c' at row 8
+Warning 1366 Incorrect string value: '\x81' for column 'c' at row 9
+SELECT COUNT(*) FROM t3;
+COUNT(*)
+9
+SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b;
+HEX(c) comment
+SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b;
+HEX(c) HEX(b) comment
+A140A1403F A140A14081 [mb2,unassigned][mb2,unassigned][head,tail]
+A140A1A33F A140A1A381 [mb2,unassigned][mb2][head,tail]
+A140FE403F A140FE4081 [mb2,unassigned][mb2][head,tail]
+A1A3A1403F A1A3A14081 [mb2][mb2,unassigned][head,tail]
+A1A3A1A33F A1A3A1A381 [mb2][mb2][head,tail]
+A1A3FE403F A1A3FE4081 [mb2][mb2][head,tail]
+FE40A1403F FE40A14081 [mb2][mb2,unassigned][head,tail]
+FE40A1A33F FE40A1A381 [mb2][mb2][head,tail]
+FE40FE403F FE40FE4081 [mb2][mb2][head,tail]
+DELETE FROM t2 WHERE b IN (SELECT b FROM t3);
+DELETE FROM t3;
+#
+# Sequences that consist of head + tail + MB2 should go without warnings
+#
+INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2
+WHERE FIND_IN_SET('head',type1)
+AND FIND_IN_SET('tail',type2)
+AND FIND_IN_SET('mb2',type3)
+ORDER BY b;
+SELECT COUNT(*) FROM t3;
+COUNT(*)
+9
+SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b;
+HEX(c) comment
+8140A140 [head,tail][ascii,tail][mb2,unassigned]
+8140A1A3 [head,tail][ascii,tail][mb2]
+8140FE40 [head,tail][ascii,tail][mb2]
+8180A140 [head,tail][tail][mb2,unassigned]
+8180A1A3 [head,tail][tail][mb2]
+8180FE40 [head,tail][tail][mb2]
+8181A140 [head,tail][head,tail][mb2,unassigned]
+8181A1A3 [head,tail][head,tail][mb2]
+8181FE40 [head,tail][head,tail][mb2]
+SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b;
+HEX(c) HEX(b) comment
+DELETE FROM t2 WHERE b IN (SELECT b FROM t3);
+DELETE FROM t3;
+#
+# Sequences that consist of (ascii or mb2) + head + tail should go without warnings
+#
+INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2
+WHERE (FIND_IN_SET('ascii',type1) OR FIND_IN_SET('mb2',type1))
+AND FIND_IN_SET('head',type2)
+AND FIND_IN_SET('tail',type3)
+ORDER BY b;
+SELECT COUNT(*) FROM t3;
+COUNT(*)
+12
+SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b;
+HEX(c) comment
+408140 [ascii,tail][head,tail][ascii,tail]
+408180 [ascii,tail][head,tail][tail]
+408181 [ascii,tail][head,tail][head,tail]
+A1408140 [mb2,unassigned][head,tail][ascii,tail]
+A1408180 [mb2,unassigned][head,tail][tail]
+A1408181 [mb2,unassigned][head,tail][head,tail]
+A1A38140 [mb2][head,tail][ascii,tail]
+A1A38180 [mb2][head,tail][tail]
+A1A38181 [mb2][head,tail][head,tail]
+FE408140 [mb2][head,tail][ascii,tail]
+FE408180 [mb2][head,tail][tail]
+FE408181 [mb2][head,tail][head,tail]
+SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b;
+HEX(c) HEX(b) comment
+DELETE FROM t2 WHERE b IN (SELECT b FROM t3);
+DELETE FROM t3;
+INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2 ORDER BY b;
+Warnings:
+Warning 1366 Incorrect string value: '\x81' for column 'c' at row 1
+Warning 1366 Incorrect string value: '\xA3' for column 'c' at row 3
+Warning 1366 Incorrect string value: '\x81' for column 'c' at row 5
+Warning 1366 Incorrect string value: '\x81' for column 'c' at row 6
+Warning 1366 Incorrect string value: '\x81' for column 'c' at row 7
+Warning 1366 Incorrect string value: '\x80' for column 'c' at row 9
+Warning 1366 Incorrect string value: '\x81' for column 'c' at row 10
+Warning 1366 Incorrect string value: '\x80' for column 'c' at row 12
+Warning 1366 Incorrect string value: '\x81' for column 'c' at row 13
+Warning 1366 Incorrect string value: '\x80' for column 'c' at row 15
+Warning 1366 Incorrect string value: '\x81' for column 'c' at row 16
+Warning 1366 Incorrect string value: '\x80' for column 'c' at row 18
+Warning 1366 Incorrect string value: '\x81' for column 'c' at row 19
+Warning 1366 Incorrect string value: '\xA3' for column 'c' at row 27
+Warning 1366 Incorrect string value: '\x80' for column 'c' at row 30
+Warning 1366 Incorrect string value: '\x81' for column 'c' at row 31
+Warning 1366 Incorrect string value: '\x81' for column 'c' at row 35
+Warning 1366 Incorrect string value: '\xA3' for column 'c' at row 37
+Warning 1366 Incorrect string value: '\x81' for column 'c' at row 39
+Warning 1366 Incorrect string value: '\xA3' for column 'c' at row 41
+Warning 1366 Incorrect string value: '\x81' for column 'c' at row 43
+Warning 1366 Incorrect string value: '\xA3' for column 'c' at row 45
+SELECT COUNT(*) FROM t3;
+COUNT(*)
+46
+SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b;
+HEX(c) comment
+4081A140 [ascii,tail][head,tail][mb2,unassigned]
+4081FE40 [ascii,tail][head,tail][mb2]
+814040 [head,tail][ascii,tail][ascii,tail]
+818040 [head,tail][tail][ascii,tail]
+818140 [head,tail][head,tail][ascii,tail]
+81A14040 [head,tail][mb2,unassigned][ascii,tail]
+81A140A140 [head,tail][mb2,unassigned][mb2,unassigned]
+81A140A1A3 [head,tail][mb2,unassigned][mb2]
+81A140FE40 [head,tail][mb2,unassigned][mb2]
+81A1A340 [head,tail][mb2][ascii,tail]
+81A1A380 [head,tail][mb2][tail]
+81A1A381 [head,tail][mb2][head,tail]
+81A1A3A140 [head,tail][mb2][mb2,unassigned]
+81A1A3FE40 [head,tail][mb2][mb2]
+81FE4040 [head,tail][mb2][ascii,tail]
+81FE40A140 [head,tail][mb2][mb2,unassigned]
+81FE40A1A3 [head,tail][mb2][mb2]
+81FE40FE40 [head,tail][mb2][mb2]
+A14081A140 [mb2,unassigned][head,tail][mb2,unassigned]
+A14081FE40 [mb2,unassigned][head,tail][mb2]
+A1A381A140 [mb2][head,tail][mb2,unassigned]
+A1A381FE40 [mb2][head,tail][mb2]
+FE4081A140 [mb2][head,tail][mb2,unassigned]
+FE4081FE40 [mb2][head,tail][mb2]
+SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b;
+HEX(c) HEX(b) comment
+40403F 404081 [ascii,tail][ascii,tail][head,tail]
+4081A13F 4081A1A3 [ascii,tail][head,tail][mb2]
+40A1403F 40A14081 [ascii,tail][mb2,unassigned][head,tail]
+40A1A33F 40A1A381 [ascii,tail][mb2][head,tail]
+40FE403F 40FE4081 [ascii,tail][mb2][head,tail]
+81403F 814080 [head,tail][ascii,tail][tail]
+81403F 814081 [head,tail][ascii,tail][head,tail]
+81803F 818080 [head,tail][tail][tail]
+81803F 818081 [head,tail][tail][head,tail]
+81813F 818180 [head,tail][head,tail][tail]
+81813F 818181 [head,tail][head,tail][head,tail]
+81A1403F 81A14080 [head,tail][mb2,unassigned][tail]
+81A1403F 81A14081 [head,tail][mb2,unassigned][head,tail]
+81A1A3A13F 81A1A3A1A3 [head,tail][mb2][mb2]
+81FE403F 81FE4080 [head,tail][mb2][tail]
+81FE403F 81FE4081 [head,tail][mb2][head,tail]
+A140403F A1404081 [mb2,unassigned][ascii,tail][head,tail]
+A14081A13F A14081A1A3 [mb2,unassigned][head,tail][mb2]
+A1A3403F A1A34081 [mb2][ascii,tail][head,tail]
+A1A381A13F A1A381A1A3 [mb2][head,tail][mb2]
+FE40403F FE404081 [mb2][ascii,tail][head,tail]
+FE4081A13F FE4081A1A3 [mb2][head,tail][mb2]
+DROP TABLE t3;
+DROP TABLE t2;
+DROP TABLE t1;
+#
+# END OF MDEV-6566 Different INSERT behaviour on bad bytes with and without character set conversion
+#
+#
+# End of 10.1 tests
+#
diff --git a/mysql-test/r/ctype_sjis.result b/mysql-test/r/ctype_sjis.result
index 48456c16705..b4ef6f8c7e5 100644
--- a/mysql-test/r/ctype_sjis.result
+++ b/mysql-test/r/ctype_sjis.result
@@ -477,7 +477,7 @@ Warning 1366 Incorrect string value: '\x80_' for column 'a' at row 64
SELECT COUNT(*) FROM t1;
COUNT(*)
14623
-SELECT COUNT(*) FROM t1 WHERE a<>'' AND OCTET_LENGTH(a)=1;
+SELECT COUNT(*) FROM t1 WHERE a<>'?' AND OCTET_LENGTH(a)=1;
COUNT(*)
63
SELECT COUNT(*) FROM t1 WHERE a<>'' AND OCTET_LENGTH(a)=2;
diff --git a/mysql-test/r/ctype_ujis.result b/mysql-test/r/ctype_ujis.result
index 413ab4efe31..4074d98c00d 100644
--- a/mysql-test/r/ctype_ujis.result
+++ b/mysql-test/r/ctype_ujis.result
@@ -2626,7 +2626,7 @@ Warning 1366 Incorrect string value: '\x80_' for column 'a' at row 64
SELECT COUNT(*) FROM t1;
COUNT(*)
44671
-SELECT COUNT(*) FROM t1 WHERE a<>'';
+SELECT COUNT(*) FROM t1 WHERE a<>'?';
COUNT(*)
17735
SELECT COUNT(*) FROM t1 WHERE a<>'' AND OCTET_LENGTH(a)=2;
@@ -25938,7 +25938,7 @@ CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET ujis);
INSERT INTO t1 VALUES (0x8EA0);
SELECT HEX(a), CHAR_LENGTH(a) FROM t1;
HEX(a) CHAR_LENGTH(a)
- 0
+3F3F 2
DROP TABLE t1;
SELECT _ujis 0x8EA0;
ERROR HY000: Invalid ujis character string: '8EA0'
diff --git a/mysql-test/r/ctype_utf8.result b/mysql-test/r/ctype_utf8.result
index 4b23b010c79..2779ea5fa0f 100644
--- a/mysql-test/r/ctype_utf8.result
+++ b/mysql-test/r/ctype_utf8.result
@@ -225,7 +225,7 @@ Warnings:
Warning 1366 Incorrect string value: '\xFF' for column 's1' at row 1
select hex(s1) from t1;
hex(s1)
-41
+413F
drop table t1;
create table t1 (s1 varchar(10) character set utf8);
insert into t1 values (0x41FF);
@@ -233,7 +233,7 @@ Warnings:
Warning 1366 Incorrect string value: '\xFF' for column 's1' at row 1
select hex(s1) from t1;
hex(s1)
-41
+413F
drop table t1;
create table t1 (s1 text character set utf8);
insert into t1 values (0x41FF);
@@ -241,7 +241,7 @@ Warnings:
Warning 1366 Incorrect string value: '\xFF' for column 's1' at row 1
select hex(s1) from t1;
hex(s1)
-41
+413F
drop table t1;
create table t1 (a text character set utf8, primary key(a(371)));
ERROR 42000: Specified key was too long; max key length is 1000 bytes
diff --git a/mysql-test/r/ctype_utf8mb4.result b/mysql-test/r/ctype_utf8mb4.result
index 0dc94e90454..d8f4eb32132 100644
--- a/mysql-test/r/ctype_utf8mb4.result
+++ b/mysql-test/r/ctype_utf8mb4.result
@@ -225,7 +225,7 @@ Warnings:
Warning 1366 Incorrect string value: '\xFF' for column 's1' at row 1
select hex(s1) from t1;
hex(s1)
-41
+413F
drop table t1;
create table t1 (s1 varchar(10) character set utf8mb4);
insert into t1 values (0x41FF);
@@ -233,7 +233,7 @@ Warnings:
Warning 1366 Incorrect string value: '\xFF' for column 's1' at row 1
select hex(s1) from t1;
hex(s1)
-41
+413F
drop table t1;
create table t1 (s1 text character set utf8mb4);
insert into t1 values (0x41FF);
@@ -241,7 +241,7 @@ Warnings:
Warning 1366 Incorrect string value: '\xFF' for column 's1' at row 1
select hex(s1) from t1;
hex(s1)
-41
+413F
drop table t1;
create table t1 (a text character set utf8mb4, primary key(a(371)));
ERROR 42000: Specified key was too long; max key length is 1000 bytes
@@ -2327,7 +2327,7 @@ select hex(utf8mb4) from t1;
hex(utf8mb4)
F0908080
F0BFBFBF
-
+3F
delete from t1;
Testing [F2..F3][80..BF][80..BF][80..BF]
insert into t1 values (0xF2808080);
@@ -2347,7 +2347,7 @@ select hex(utf8mb4) from t1;
hex(utf8mb4)
F4808080
F48F8080
-
+3F
drop table t1;
#
# Check strnxfrm() with odd length
@@ -2472,45 +2472,45 @@ F3A087AFEA9DA8
F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480EA9DA8
EFB9ABF09D849EF09D859EF09D859EF09D8480F09D859FEFB9ABEFB9ABF09D85A0EFB9ABEA9DA8
F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480EA9DA8
-EA9DA8
+3F3F3F3FEA9DA8
SELECT HEX(CONCAT(utf8mb4_encoding, utf8mb3_encoding)) FROM t1,t2;
HEX(CONCAT(utf8mb4_encoding, utf8mb3_encoding))
F09D8480EA9DA8
F09D8480EFB9AB
-F09D8480
+F09D84803F3F3F3F
F09D849EEA9DA8
F09D849EEFB9AB
-F09D849E
+F09D849E3F3F3F3F
F09D859EEA9DA8
F09D859EEFB9AB
-F09D859E
+F09D859E3F3F3F3F
F09D878FEA9DA8
F09D878FEFB9AB
-F09D878F
+F09D878F3F3F3F3F
F09D9C9FEA9DA8
F09D9C9FEFB9AB
-F09D9C9F
+F09D9C9F3F3F3F3F
F09D9E9FEA9DA8
F09D9E9FEFB9AB
-F09D9E9F
+F09D9E9F3F3F3F3F
F48FBFBFEA9DA8
F48FBFBFEFB9AB
-F48FBFBF
+F48FBFBF3F3F3F3F
F3A087AFEA9DA8
F3A087AFEFB9AB
-F3A087AF
+F3A087AF3F3F3F3F
F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480EA9DA8
F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480EFB9AB
-F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480
+F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D84803F3F3F3F
EFB9ABF09D849EF09D859EF09D859EF09D8480F09D859FEFB9ABEFB9ABF09D85A0EFB9ABEA9DA8
EFB9ABF09D849EF09D859EF09D859EF09D8480F09D859FEFB9ABEFB9ABF09D85A0EFB9ABEFB9AB
-EFB9ABF09D849EF09D859EF09D859EF09D8480F09D859FEFB9ABEFB9ABF09D85A0EFB9AB
+EFB9ABF09D849EF09D859EF09D859EF09D8480F09D859FEFB9ABEFB9ABF09D85A0EFB9AB3F3F3F3F
F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480EA9DA8
F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480EFB9AB
-F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480
-EA9DA8
-EFB9AB
-
+F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D84803F3F3F3F
+3F3F3F3FEA9DA8
+3F3F3F3FEFB9AB
+3F3F3F3F3F3F3F3F
SELECT count(*) FROM t1, t2
WHERE t1.utf8mb4_encoding > t2.utf8mb3_encoding;
count(*)
@@ -2547,7 +2547,7 @@ u_decimal hex(utf8mb4_encoding) utf8mb4_encoding
119070 3F3F3F3F3F3F3F3F3F3F ??????????
65131 EFB9AB3F3F3F3F3FEFB9ABEFB9AB3FEFB9AB ﹫?????﹫﹫?﹫
119070 3F3F3F3F3F3F3F3F3F3F ??????????
-1114111
+1114111 3F3F3F3F ????
ALTER TABLE t2 CONVERT TO CHARACTER SET utf8mb4;
SHOW CREATE TABLE t2;
Table Create Table
@@ -2559,7 +2559,7 @@ SELECT u_decimal,hex(utf8mb3_encoding) FROM t2;
u_decimal hex(utf8mb3_encoding)
42856 EA9DA8
65131 EFB9AB
-1114111
+1114111 3F3F3F3F
ALTER TABLE t2 CONVERT TO CHARACTER SET utf8mb3;
SHOW CREATE TABLE t2;
Table Create Table
@@ -2571,7 +2571,7 @@ SELECT u_decimal,hex(utf8mb3_encoding) FROM t2;
u_decimal hex(utf8mb3_encoding)
42856 EA9DA8
65131 EFB9AB
-1114111
+1114111 3F3F3F3F
ALTER TABLE t1 MODIFY utf8mb4_encoding VARCHAR(10) CHARACTER SET utf8mb3;
SHOW CREATE TABLE t1;
Table Create Table
@@ -2592,7 +2592,7 @@ u_decimal hex(utf8mb4_encoding)
119070 3F3F3F3F3F3F3F3F3F3F
65131 EFB9AB3F3F3F3F3FEFB9ABEFB9AB3FEFB9AB
119070 3F3F3F3F3F3F3F3F3F3F
-1114111
+1114111 3F3F3F3F
ALTER TABLE t1 MODIFY utf8mb4_encoding VARCHAR(10) CHARACTER SET utf8mb4;
SHOW CREATE TABLE t1;
Table Create Table
@@ -2613,7 +2613,7 @@ u_decimal hex(utf8mb4_encoding)
119070 3F3F3F3F3F3F3F3F3F3F
65131 EFB9AB3F3F3F3F3FEFB9ABEFB9AB3FEFB9AB
119070 3F3F3F3F3F3F3F3F3F3F
-1114111
+1114111 3F3F3F3F
ALTER TABLE t2 MODIFY utf8mb3_encoding VARCHAR(10) CHARACTER SET utf8mb4;
SHOW CREATE TABLE t2;
Table Create Table
@@ -2625,7 +2625,7 @@ SELECT u_decimal,hex(utf8mb3_encoding) FROM t2;
u_decimal hex(utf8mb3_encoding)
42856 EA9DA8
65131 EFB9AB
-1114111
+1114111 3F3F3F3F
DROP TABLE IF EXISTS t3;
CREATE TABLE t3 (
u_decimal int NOT NULL,
@@ -3306,5 +3306,53 @@ DFFFFFDFFFFF9CFFFF9DFFFF9EFFFF
# End of 5.6 tests
#
#
+# Start of 10.0 tests
+#
+#
+# MDEV-6566 Different INSERT behaviour on bad bytes with and without character set conversion
+#
+#
+# This test sets session character set to 3-byte utf8,
+# but then sends a 4-byte sequence (which is wrong for 3-byte utf8).
+# It should be replaced to four question marks: '????' in both columns
+# (i.e. four unknown bytes are replaced to four question marks),
+# then the rest of the string should be stored, so we get 'a ???? b'.
+#
+SET NAMES utf8;
+CREATE TABLE t1 (
+a VARCHAR(32) CHARACTER SET utf8mb4,
+b VARCHAR(32) CHARACTER SET utf8
+);
+INSERT INTO t1 SELECT 'a 😁 b', 'a 😁 b';
+Warnings:
+Warning 1366 Incorrect string value: '\xF0\x9F\x98\x81 b' for column 'a' at row 1
+Warning 1366 Incorrect string value: '\xF0\x9F\x98\x81 b' for column 'b' at row 1
+SELECT * FROM t1;
+a b
+a ???? b a ???? b
+DROP TABLE t1;
+#
+# This test sets session character set to 4-byte utf8,
+# then normally sends a 4-byte sequence.
+# It should be stored AS IS into the utf8mb4 column (a),
+# and should be replaced to a single question mark in the utf8 column (b)
+# (i.e. one character that cannot be converted is replaced to one question mark).
+#
+SET NAMES utf8mb4;
+CREATE TABLE t1 (
+a VARCHAR(32) CHARACTER SET utf8mb4,
+b VARCHAR(32) CHARACTER SET utf8
+);
+INSERT INTO t1 SELECT 'a 😁 b', 'a 😁 b';
+Warnings:
+Warning 1366 Incorrect string value: '\xF0\x9F\x98\x81 b' for column 'b' at row 1
+SELECT * FROM t1;
+a b
+a 😁 b a ? b
+DROP TABLE t1;
+#
+# End of 10.0 tests
+#
+#
# End of tests
#
diff --git a/mysql-test/r/ctype_utf8mb4_heap.result b/mysql-test/r/ctype_utf8mb4_heap.result
index 57d29a24fd0..7f5125ae2ba 100644
--- a/mysql-test/r/ctype_utf8mb4_heap.result
+++ b/mysql-test/r/ctype_utf8mb4_heap.result
@@ -225,7 +225,7 @@ Warnings:
Warning 1366 Incorrect string value: '\xFF' for column 's1' at row 1
select hex(s1) from t1;
hex(s1)
-41
+413F
drop table t1;
create table t1 (s1 varchar(10) character set utf8mb4) engine heap;
insert into t1 values (0x41FF);
@@ -233,7 +233,7 @@ Warnings:
Warning 1366 Incorrect string value: '\xFF' for column 's1' at row 1
select hex(s1) from t1;
hex(s1)
-41
+413F
drop table t1;
CREATE TABLE t1 ( a varchar(10) ) CHARACTER SET utf8mb4 ENGINE heap;
INSERT INTO t1 VALUES ( 'test' );
@@ -2157,7 +2157,7 @@ Warnings:
Warning 1366 Incorrect string value: '\xF0\x8F\x80\x80' for column 'utf8mb4' at row 1
select hex(utf8mb4) from t1;
hex(utf8mb4)
-
+3F
F0908080
F0BFBFBF
delete from t1;
@@ -2177,7 +2177,7 @@ Warnings:
Warning 1366 Incorrect string value: '\xF4\x90\x80\x80' for column 'utf8mb4' at row 1
select hex(utf8mb4) from t1;
hex(utf8mb4)
-
+3F
F4808080
F48F8080
drop table t1;
@@ -2274,7 +2274,7 @@ Warning 1366 Incorrect string value: '\xF4\x8F\xBF\xBD' for column 'utf8mb3_enco
UPDATE t2 SET utf8mb3_encoding= _utf8mb4 x'ea9da8' where u_decimal= 42856;
SELECT HEX(CONCAT(utf8mb4_encoding, _utf8 x'ea9da8')) FROM t1;
HEX(CONCAT(utf8mb4_encoding, _utf8 x'ea9da8'))
-EA9DA8
+3F3F3F3FEA9DA8
EFB9ABF09D849EF09D859EF09D859EF09D8480F09D859FEFB9ABEFB9ABF09D85A0EFB9ABEA9DA8
F09D8480EA9DA8
F09D849EEA9DA8
@@ -2288,40 +2288,40 @@ F3A087AFEA9DA8
F48FBFBFEA9DA8
SELECT HEX(CONCAT(utf8mb4_encoding, utf8mb3_encoding)) FROM t1,t2;
HEX(CONCAT(utf8mb4_encoding, utf8mb3_encoding))
-
-EA9DA8
-EFB9AB
-EFB9ABF09D849EF09D859EF09D859EF09D8480F09D859FEFB9ABEFB9ABF09D85A0EFB9AB
+3F3F3F3F3F3F3F3F
+3F3F3F3FEA9DA8
+3F3F3F3FEFB9AB
+EFB9ABF09D849EF09D859EF09D859EF09D8480F09D859FEFB9ABEFB9ABF09D85A0EFB9AB3F3F3F3F
EFB9ABF09D849EF09D859EF09D859EF09D8480F09D859FEFB9ABEFB9ABF09D85A0EFB9ABEA9DA8
EFB9ABF09D849EF09D859EF09D859EF09D8480F09D859FEFB9ABEFB9ABF09D85A0EFB9ABEFB9AB
-F09D8480
+F09D84803F3F3F3F
F09D8480EA9DA8
F09D8480EFB9AB
-F09D849E
+F09D849E3F3F3F3F
F09D849EEA9DA8
F09D849EEFB9AB
-F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480
-F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480
+F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D84803F3F3F3F
+F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D84803F3F3F3F
F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480EA9DA8
F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480EA9DA8
F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480EFB9AB
F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480EFB9AB
-F09D859E
+F09D859E3F3F3F3F
F09D859EEA9DA8
F09D859EEFB9AB
-F09D878F
+F09D878F3F3F3F3F
F09D878FEA9DA8
F09D878FEFB9AB
-F09D9C9F
+F09D9C9F3F3F3F3F
F09D9C9FEA9DA8
F09D9C9FEFB9AB
-F09D9E9F
+F09D9E9F3F3F3F3F
F09D9E9FEA9DA8
F09D9E9FEFB9AB
-F3A087AF
+F3A087AF3F3F3F3F
F3A087AFEA9DA8
F3A087AFEFB9AB
-F48FBFBF
+F48FBFBF3F3F3F3F
F48FBFBFEA9DA8
F48FBFBFEFB9AB
SELECT count(*) FROM t1, t2
@@ -2337,8 +2337,8 @@ t1 CREATE TABLE `t1` (
) ENGINE=MEMORY DEFAULT CHARSET=utf8
SELECT u_decimal,hex(utf8mb4_encoding),utf8mb4_encoding FROM t1;
u_decimal hex(utf8mb4_encoding) utf8mb4_encoding
-1114111
1114111 3F ?
+1114111 3F3F3F3F ????
119040 3F ?
119070 3F ?
119070 3F3F3F3F3F3F3F3F3F3F ??????????
@@ -2358,7 +2358,7 @@ t2 CREATE TABLE `t2` (
) ENGINE=MEMORY DEFAULT CHARSET=utf8mb4
SELECT u_decimal,hex(utf8mb3_encoding) FROM t2;
u_decimal hex(utf8mb3_encoding)
-1114111
+1114111 3F3F3F3F
42856 EA9DA8
65131 EFB9AB
ALTER TABLE t2 CONVERT TO CHARACTER SET utf8mb3;
@@ -2370,7 +2370,7 @@ t2 CREATE TABLE `t2` (
) ENGINE=MEMORY DEFAULT CHARSET=utf8
SELECT u_decimal,hex(utf8mb3_encoding) FROM t2;
u_decimal hex(utf8mb3_encoding)
-1114111
+1114111 3F3F3F3F
42856 EA9DA8
65131 EFB9AB
ALTER TABLE t1 MODIFY utf8mb4_encoding VARCHAR(10) CHARACTER SET utf8mb3;
@@ -2382,8 +2382,8 @@ t1 CREATE TABLE `t1` (
) ENGINE=MEMORY DEFAULT CHARSET=utf8
SELECT u_decimal,hex(utf8mb4_encoding) FROM t1;
u_decimal hex(utf8mb4_encoding)
-1114111
1114111 3F
+1114111 3F3F3F3F
119040 3F
119070 3F
119070 3F3F3F3F3F3F3F3F3F3F
@@ -2403,8 +2403,8 @@ t1 CREATE TABLE `t1` (
) ENGINE=MEMORY DEFAULT CHARSET=utf8
SELECT u_decimal,hex(utf8mb4_encoding) FROM t1;
u_decimal hex(utf8mb4_encoding)
-1114111
1114111 3F
+1114111 3F3F3F3F
119040 3F
119070 3F
119070 3F3F3F3F3F3F3F3F3F3F
@@ -2424,7 +2424,7 @@ t2 CREATE TABLE `t2` (
) ENGINE=MEMORY DEFAULT CHARSET=utf8
SELECT u_decimal,hex(utf8mb3_encoding) FROM t2;
u_decimal hex(utf8mb3_encoding)
-1114111
+1114111 3F3F3F3F
42856 EA9DA8
65131 EFB9AB
DROP TABLE IF EXISTS t3;
diff --git a/mysql-test/r/ctype_utf8mb4_innodb.result b/mysql-test/r/ctype_utf8mb4_innodb.result
index ba03a3f66e6..053e6de8fe1 100644
--- a/mysql-test/r/ctype_utf8mb4_innodb.result
+++ b/mysql-test/r/ctype_utf8mb4_innodb.result
@@ -225,7 +225,7 @@ Warnings:
Warning 1366 Incorrect string value: '\xFF' for column 's1' at row 1
select hex(s1) from t1;
hex(s1)
-41
+413F
drop table t1;
create table t1 (s1 varchar(10) character set utf8mb4) engine InnoDB;
insert into t1 values (0x41FF);
@@ -233,7 +233,7 @@ Warnings:
Warning 1366 Incorrect string value: '\xFF' for column 's1' at row 1
select hex(s1) from t1;
hex(s1)
-41
+413F
drop table t1;
create table t1 (s1 text character set utf8mb4) engine InnoDB;
insert into t1 values (0x41FF);
@@ -241,7 +241,7 @@ Warnings:
Warning 1366 Incorrect string value: '\xFF' for column 's1' at row 1
select hex(s1) from t1;
hex(s1)
-41
+413F
drop table t1;
create table t1 (a text character set utf8mb4, primary key(a(371))) engine InnoDB;
ERROR 42000: Specified key was too long; max key length is 767 bytes
@@ -2285,7 +2285,7 @@ Warnings:
Warning 1366 Incorrect string value: '\xF0\x8F\x80\x80' for column 'utf8mb4' at row 1
select hex(utf8mb4) from t1;
hex(utf8mb4)
-
+3F
F0908080
F0BFBFBF
delete from t1;
@@ -2305,7 +2305,7 @@ Warnings:
Warning 1366 Incorrect string value: '\xF4\x90\x80\x80' for column 'utf8mb4' at row 1
select hex(utf8mb4) from t1;
hex(utf8mb4)
-
+3F
F4808080
F48F8080
drop table t1;
@@ -2421,7 +2421,7 @@ Warning 1366 Incorrect string value: '\xF4\x8F\xBF\xBD' for column 'utf8mb3_enco
UPDATE t2 SET utf8mb3_encoding= _utf8mb4 x'ea9da8' where u_decimal= 42856;
SELECT HEX(CONCAT(utf8mb4_encoding, _utf8 x'ea9da8')) FROM t1;
HEX(CONCAT(utf8mb4_encoding, _utf8 x'ea9da8'))
-EA9DA8
+3F3F3F3FEA9DA8
EFB9ABF09D849EF09D859EF09D859EF09D8480F09D859FEFB9ABEFB9ABF09D85A0EFB9ABEA9DA8
F09D8480EA9DA8
F09D849EEA9DA8
@@ -2435,40 +2435,40 @@ F3A087AFEA9DA8
F48FBFBFEA9DA8
SELECT HEX(CONCAT(utf8mb4_encoding, utf8mb3_encoding)) FROM t1,t2;
HEX(CONCAT(utf8mb4_encoding, utf8mb3_encoding))
-
-EA9DA8
-EFB9AB
-EFB9ABF09D849EF09D859EF09D859EF09D8480F09D859FEFB9ABEFB9ABF09D85A0EFB9AB
+3F3F3F3F3F3F3F3F
+3F3F3F3FEA9DA8
+3F3F3F3FEFB9AB
+EFB9ABF09D849EF09D859EF09D859EF09D8480F09D859FEFB9ABEFB9ABF09D85A0EFB9AB3F3F3F3F
EFB9ABF09D849EF09D859EF09D859EF09D8480F09D859FEFB9ABEFB9ABF09D85A0EFB9ABEA9DA8
EFB9ABF09D849EF09D859EF09D859EF09D8480F09D859FEFB9ABEFB9ABF09D85A0EFB9ABEFB9AB
-F09D8480
+F09D84803F3F3F3F
F09D8480EA9DA8
F09D8480EFB9AB
-F09D849E
+F09D849E3F3F3F3F
F09D849EEA9DA8
F09D849EEFB9AB
-F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480
-F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480
+F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D84803F3F3F3F
+F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D84803F3F3F3F
F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480EA9DA8
F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480EA9DA8
F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480EFB9AB
F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480EFB9AB
-F09D859E
+F09D859E3F3F3F3F
F09D859EEA9DA8
F09D859EEFB9AB
-F09D878F
+F09D878F3F3F3F3F
F09D878FEA9DA8
F09D878FEFB9AB
-F09D9C9F
+F09D9C9F3F3F3F3F
F09D9C9FEA9DA8
F09D9C9FEFB9AB
-F09D9E9F
+F09D9E9F3F3F3F3F
F09D9E9FEA9DA8
F09D9E9FEFB9AB
-F3A087AF
+F3A087AF3F3F3F3F
F3A087AFEA9DA8
F3A087AFEFB9AB
-F48FBFBF
+F48FBFBF3F3F3F3F
F48FBFBFEA9DA8
F48FBFBFEFB9AB
SELECT count(*) FROM t1, t2
@@ -2484,8 +2484,8 @@ t1 CREATE TABLE `t1` (
) ENGINE=InnoDB DEFAULT CHARSET=utf8
SELECT u_decimal,hex(utf8mb4_encoding),utf8mb4_encoding FROM t1;
u_decimal hex(utf8mb4_encoding) utf8mb4_encoding
-1114111
1114111 3F ?
+1114111 3F3F3F3F ????
119040 3F ?
119070 3F ?
119070 3F3F3F3F3F3F3F3F3F3F ??????????
@@ -2505,7 +2505,7 @@ t2 CREATE TABLE `t2` (
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
SELECT u_decimal,hex(utf8mb3_encoding) FROM t2;
u_decimal hex(utf8mb3_encoding)
-1114111
+1114111 3F3F3F3F
42856 EA9DA8
65131 EFB9AB
ALTER TABLE t2 CONVERT TO CHARACTER SET utf8mb3;
@@ -2517,7 +2517,7 @@ t2 CREATE TABLE `t2` (
) ENGINE=InnoDB DEFAULT CHARSET=utf8
SELECT u_decimal,hex(utf8mb3_encoding) FROM t2;
u_decimal hex(utf8mb3_encoding)
-1114111
+1114111 3F3F3F3F
42856 EA9DA8
65131 EFB9AB
ALTER TABLE t1 MODIFY utf8mb4_encoding VARCHAR(10) CHARACTER SET utf8mb3;
@@ -2529,8 +2529,8 @@ t1 CREATE TABLE `t1` (
) ENGINE=InnoDB DEFAULT CHARSET=utf8
SELECT u_decimal,hex(utf8mb4_encoding) FROM t1;
u_decimal hex(utf8mb4_encoding)
-1114111
1114111 3F
+1114111 3F3F3F3F
119040 3F
119070 3F
119070 3F3F3F3F3F3F3F3F3F3F
@@ -2550,8 +2550,8 @@ t1 CREATE TABLE `t1` (
) ENGINE=InnoDB DEFAULT CHARSET=utf8
SELECT u_decimal,hex(utf8mb4_encoding) FROM t1;
u_decimal hex(utf8mb4_encoding)
-1114111
1114111 3F
+1114111 3F3F3F3F
119040 3F
119070 3F
119070 3F3F3F3F3F3F3F3F3F3F
@@ -2571,7 +2571,7 @@ t2 CREATE TABLE `t2` (
) ENGINE=InnoDB DEFAULT CHARSET=utf8
SELECT u_decimal,hex(utf8mb3_encoding) FROM t2;
u_decimal hex(utf8mb3_encoding)
-1114111
+1114111 3F3F3F3F
42856 EA9DA8
65131 EFB9AB
DROP TABLE IF EXISTS t3;
diff --git a/mysql-test/r/ctype_utf8mb4_myisam.result b/mysql-test/r/ctype_utf8mb4_myisam.result
index c4ff8e0a882..5bfdfe8ca71 100644
--- a/mysql-test/r/ctype_utf8mb4_myisam.result
+++ b/mysql-test/r/ctype_utf8mb4_myisam.result
@@ -225,7 +225,7 @@ Warnings:
Warning 1366 Incorrect string value: '\xFF' for column 's1' at row 1
select hex(s1) from t1;
hex(s1)
-41
+413F
drop table t1;
create table t1 (s1 varchar(10) character set utf8mb4) engine MyISAM;
insert into t1 values (0x41FF);
@@ -233,7 +233,7 @@ Warnings:
Warning 1366 Incorrect string value: '\xFF' for column 's1' at row 1
select hex(s1) from t1;
hex(s1)
-41
+413F
drop table t1;
create table t1 (s1 text character set utf8mb4) engine MyISAM;
insert into t1 values (0x41FF);
@@ -241,7 +241,7 @@ Warnings:
Warning 1366 Incorrect string value: '\xFF' for column 's1' at row 1
select hex(s1) from t1;
hex(s1)
-41
+413F
drop table t1;
create table t1 (a text character set utf8mb4, primary key(a(371))) engine MyISAM;
ERROR 42000: Specified key was too long; max key length is 1000 bytes
@@ -2285,7 +2285,7 @@ Warnings:
Warning 1366 Incorrect string value: '\xF0\x8F\x80\x80' for column 'utf8mb4' at row 1
select hex(utf8mb4) from t1;
hex(utf8mb4)
-
+3F
F0908080
F0BFBFBF
delete from t1;
@@ -2305,7 +2305,7 @@ Warnings:
Warning 1366 Incorrect string value: '\xF4\x90\x80\x80' for column 'utf8mb4' at row 1
select hex(utf8mb4) from t1;
hex(utf8mb4)
-
+3F
F4808080
F48F8080
drop table t1;
@@ -2421,7 +2421,7 @@ Warning 1366 Incorrect string value: '\xF4\x8F\xBF\xBD' for column 'utf8mb3_enco
UPDATE t2 SET utf8mb3_encoding= _utf8mb4 x'ea9da8' where u_decimal= 42856;
SELECT HEX(CONCAT(utf8mb4_encoding, _utf8 x'ea9da8')) FROM t1;
HEX(CONCAT(utf8mb4_encoding, _utf8 x'ea9da8'))
-EA9DA8
+3F3F3F3FEA9DA8
EFB9ABF09D849EF09D859EF09D859EF09D8480F09D859FEFB9ABEFB9ABF09D85A0EFB9ABEA9DA8
F09D8480EA9DA8
F09D849EEA9DA8
@@ -2435,40 +2435,40 @@ F3A087AFEA9DA8
F48FBFBFEA9DA8
SELECT HEX(CONCAT(utf8mb4_encoding, utf8mb3_encoding)) FROM t1,t2;
HEX(CONCAT(utf8mb4_encoding, utf8mb3_encoding))
-
-EA9DA8
-EFB9AB
-EFB9ABF09D849EF09D859EF09D859EF09D8480F09D859FEFB9ABEFB9ABF09D85A0EFB9AB
+3F3F3F3F3F3F3F3F
+3F3F3F3FEA9DA8
+3F3F3F3FEFB9AB
+EFB9ABF09D849EF09D859EF09D859EF09D8480F09D859FEFB9ABEFB9ABF09D85A0EFB9AB3F3F3F3F
EFB9ABF09D849EF09D859EF09D859EF09D8480F09D859FEFB9ABEFB9ABF09D85A0EFB9ABEA9DA8
EFB9ABF09D849EF09D859EF09D859EF09D8480F09D859FEFB9ABEFB9ABF09D85A0EFB9ABEFB9AB
-F09D8480
+F09D84803F3F3F3F
F09D8480EA9DA8
F09D8480EFB9AB
-F09D849E
+F09D849E3F3F3F3F
F09D849EEA9DA8
F09D849EEFB9AB
-F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480
-F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480
+F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D84803F3F3F3F
+F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D84803F3F3F3F
F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480EA9DA8
F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480EA9DA8
F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480EFB9AB
F09D849EF09D859EF09D859EF09D8480F09D859FF09D859FF09D859FF09D85A0F09D85A0F09D8480EFB9AB
-F09D859E
+F09D859E3F3F3F3F
F09D859EEA9DA8
F09D859EEFB9AB
-F09D878F
+F09D878F3F3F3F3F
F09D878FEA9DA8
F09D878FEFB9AB
-F09D9C9F
+F09D9C9F3F3F3F3F
F09D9C9FEA9DA8
F09D9C9FEFB9AB
-F09D9E9F
+F09D9E9F3F3F3F3F
F09D9E9FEA9DA8
F09D9E9FEFB9AB
-F3A087AF
+F3A087AF3F3F3F3F
F3A087AFEA9DA8
F3A087AFEFB9AB
-F48FBFBF
+F48FBFBF3F3F3F3F
F48FBFBFEA9DA8
F48FBFBFEFB9AB
SELECT count(*) FROM t1, t2
@@ -2484,8 +2484,8 @@ t1 CREATE TABLE `t1` (
) ENGINE=MyISAM DEFAULT CHARSET=utf8
SELECT u_decimal,hex(utf8mb4_encoding),utf8mb4_encoding FROM t1;
u_decimal hex(utf8mb4_encoding) utf8mb4_encoding
-1114111
1114111 3F ?
+1114111 3F3F3F3F ????
119040 3F ?
119070 3F ?
119070 3F3F3F3F3F3F3F3F3F3F ??????????
@@ -2505,7 +2505,7 @@ t2 CREATE TABLE `t2` (
) ENGINE=MyISAM DEFAULT CHARSET=utf8mb4
SELECT u_decimal,hex(utf8mb3_encoding) FROM t2;
u_decimal hex(utf8mb3_encoding)
-1114111
+1114111 3F3F3F3F
42856 EA9DA8
65131 EFB9AB
ALTER TABLE t2 CONVERT TO CHARACTER SET utf8mb3;
@@ -2517,7 +2517,7 @@ t2 CREATE TABLE `t2` (
) ENGINE=MyISAM DEFAULT CHARSET=utf8
SELECT u_decimal,hex(utf8mb3_encoding) FROM t2;
u_decimal hex(utf8mb3_encoding)
-1114111
+1114111 3F3F3F3F
42856 EA9DA8
65131 EFB9AB
ALTER TABLE t1 MODIFY utf8mb4_encoding VARCHAR(10) CHARACTER SET utf8mb3;
@@ -2529,8 +2529,8 @@ t1 CREATE TABLE `t1` (
) ENGINE=MyISAM DEFAULT CHARSET=utf8
SELECT u_decimal,hex(utf8mb4_encoding) FROM t1;
u_decimal hex(utf8mb4_encoding)
-1114111
1114111 3F
+1114111 3F3F3F3F
119040 3F
119070 3F
119070 3F3F3F3F3F3F3F3F3F3F
@@ -2550,8 +2550,8 @@ t1 CREATE TABLE `t1` (
) ENGINE=MyISAM DEFAULT CHARSET=utf8
SELECT u_decimal,hex(utf8mb4_encoding) FROM t1;
u_decimal hex(utf8mb4_encoding)
-1114111
1114111 3F
+1114111 3F3F3F3F
119040 3F
119070 3F
119070 3F3F3F3F3F3F3F3F3F3F
@@ -2571,7 +2571,7 @@ t2 CREATE TABLE `t2` (
) ENGINE=MyISAM DEFAULT CHARSET=utf8
SELECT u_decimal,hex(utf8mb3_encoding) FROM t2;
u_decimal hex(utf8mb3_encoding)
-1114111
+1114111 3F3F3F3F
42856 EA9DA8
65131 EFB9AB
DROP TABLE IF EXISTS t3;
diff --git a/mysql-test/suite/funcs_2/include/check_charset.inc b/mysql-test/suite/funcs_2/include/check_charset.inc
index df4a58d0eeb..0242d4390ac 100644
--- a/mysql-test/suite/funcs_2/include/check_charset.inc
+++ b/mysql-test/suite/funcs_2/include/check_charset.inc
@@ -22,13 +22,15 @@ SHOW TABLE STATUS LIKE 't1';
--disable_warnings
--disable_query_log
+ALTER TABLE test.t1 ADD code VARCHAR(16) NOT NULL;
let $1= 221;
while ($1)
{
- eval INSERT INTO test.t1 VALUES(CHAR(254-$1));
+ eval INSERT INTO test.t1 VALUES(CHAR(254-$1), HEX(254-$1));
dec $1;
}
DELETE FROM test.t1 WHERE CHAR_LENGTH(a) <> 1;
+DELETE FROM test.t1 WHERE a='?' AND code<>'3F';
--enable_query_log
--enable_warnings
diff --git a/mysql-test/suite/innodb/r/innodb-update-insert.result b/mysql-test/suite/innodb/r/innodb-update-insert.result
index cd0fed101ab..034a63bca6c 100644
--- a/mysql-test/suite/innodb/r/innodb-update-insert.result
+++ b/mysql-test/suite/innodb/r/innodb-update-insert.result
@@ -30,7 +30,7 @@ Warnings:
Warning 1366 Incorrect string value: '\xA3' for column 'f1' at row 1
select f1 from t1;
f1
-
+?
update t1 set f1=0x6a;
update t1 set f3=repeat(0xb1,8103);
update t1 set f1=0x4a;
@@ -39,5 +39,5 @@ Warnings:
Warning 1366 Incorrect string value: '\x82' for column 'f1' at row 1
select f1 from t1;
f1
-
+?
drop table t1;
diff --git a/mysql-test/t/ctype_big5.test b/mysql-test/t/ctype_big5.test
index 5c0bdff4633..46bb29514ff 100644
--- a/mysql-test/t/ctype_big5.test
+++ b/mysql-test/t/ctype_big5.test
@@ -121,7 +121,7 @@ DROP TEMPORARY TABLE head, tail;
SHOW CREATE TABLE t1;
SELECT COUNT(*) FROM t1;
UPDATE t1 SET a=unhex(code) ORDER BY code;
-SELECT COUNT(*) FROM t1 WHERE a<>'';
+SELECT COUNT(*) FROM t1 WHERE a<>'?';
#
# Display all characters that have upper or lower case mapping.
#
diff --git a/mysql-test/t/ctype_cp932_binlog_stm.test b/mysql-test/t/ctype_cp932_binlog_stm.test
index 304c9f5d05c..1b92006c949 100644
--- a/mysql-test/t/ctype_cp932_binlog_stm.test
+++ b/mysql-test/t/ctype_cp932_binlog_stm.test
@@ -99,7 +99,7 @@ DROP TEMPORARY TABLE head, tail;
SHOW CREATE TABLE t1;
UPDATE t1 SET a=unhex(code) ORDER BY code;
SELECT COUNT(*) FROM t1;
-SELECT COUNT(*) FROM t1 WHERE a<>'' AND OCTET_LENGTH(a)=1;
+SELECT COUNT(*) FROM t1 WHERE a<>'?' AND OCTET_LENGTH(a)=1;
SELECT COUNT(*) FROM t1 WHERE a<>'' AND OCTET_LENGTH(a)=2;
#
# Display all characters that have upper or lower case mapping.
diff --git a/mysql-test/t/ctype_eucjpms.test b/mysql-test/t/ctype_eucjpms.test
index 49ca81850ed..2dd806ed027 100644
--- a/mysql-test/t/ctype_eucjpms.test
+++ b/mysql-test/t/ctype_eucjpms.test
@@ -446,6 +446,7 @@ SHOW CREATE TABLE t1;
UPDATE t1 SET a=unhex(code) ORDER BY code;
SELECT COUNT(*) FROM t1;
SELECT COUNT(*) FROM t1 WHERE a<>'';
+SELECT COUNT(*) FROM t1 WHERE a<>'' AND a<>'?';
SELECT COUNT(*) FROM t1 WHERE a<>'' AND OCTET_LENGTH(a)=2;
SELECT * FROM t1 WHERE CHAR_LENGTH(a)=2;
SELECT COUNT(*) FROM t1 WHERE a<>'' AND OCTET_LENGTH(a)=3;
diff --git a/mysql-test/t/ctype_euckr.test b/mysql-test/t/ctype_euckr.test
index 155b8ebed00..33b3e96cba8 100644
--- a/mysql-test/t/ctype_euckr.test
+++ b/mysql-test/t/ctype_euckr.test
@@ -95,8 +95,8 @@ WHERE t11.a >= 0x81 AND t11.a <= 0xFE
AND t12.a >= 0x41 AND t12.a <= 0xFE
ORDER BY t11.a, t12.a;
--enable_warnings
-SELECT s as bad_code FROM t2 WHERE a='' ORDER BY s;
-DELETE FROM t2 WHERE a='';
+SELECT s as bad_code FROM t2 WHERE a='?' ORDER BY s;
+DELETE FROM t2 WHERE a='?';
ALTER TABLE t2 ADD u VARCHAR(1) CHARACTER SET utf8, ADD a2 VARCHAR(1) CHARACTER SET euckr;
--disable_warnings
UPDATE t2 SET u=a, a2=u;
@@ -145,7 +145,7 @@ ORDER BY head, tail;
DROP TEMPORARY TABLE head, tail;
SHOW CREATE TABLE t1;
UPDATE t1 SET a=unhex(code) ORDER BY code;
-SELECT COUNT(*) FROM t1 WHERE a<>'';
+SELECT COUNT(*) FROM t1 WHERE a<>'?';
#
# Display all characters that have upper or lower case mapping.
#
diff --git a/mysql-test/t/ctype_gb2312.test b/mysql-test/t/ctype_gb2312.test
index e3dd448f54c..3ca6941705c 100644
--- a/mysql-test/t/ctype_gb2312.test
+++ b/mysql-test/t/ctype_gb2312.test
@@ -69,7 +69,7 @@ ORDER BY head, tail;
DROP TEMPORARY TABLE head, tail;
SHOW CREATE TABLE t1;
UPDATE t1 SET a=unhex(code) ORDER BY code;
-SELECT COUNT(*) FROM t1 WHERE a<>'';
+SELECT COUNT(*) FROM t1 WHERE a<>'?';
#
# Display all characters that have upper or lower case mapping.
#
diff --git a/mysql-test/t/ctype_gbk.test b/mysql-test/t/ctype_gbk.test
index d44009b6109..d98be88326e 100644
--- a/mysql-test/t/ctype_gbk.test
+++ b/mysql-test/t/ctype_gbk.test
@@ -104,7 +104,7 @@ ORDER BY head, tail;
DROP TEMPORARY TABLE head, tail;
SHOW CREATE TABLE t1;
UPDATE t1 SET a=unhex(code) ORDER BY code;
-SELECT COUNT(*) FROM t1 WHERE a<>'';
+SELECT COUNT(*) FROM t1 WHERE a<>'?';
#
# Display all characters that have upper or lower case mapping.
#
@@ -203,3 +203,228 @@ SET NAMES gbk;
--echo #
--echo # End of 10.0 tests
--echo #
+
+
+--echo #
+--echo # Start of 10.1 tests
+--echo #
+
+--echo #
+--echo # MDEV-6566 Different INSERT behaviour on bad bytes with and without character set conversion
+--echo #
+
+CREATE TABLE t1 (
+ id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
+ b VARBINARY(16),
+ type SET('ascii','bad','head','tail','mb2','unassigned')
+);
+INSERT INTO t1 (b, type) VALUES (0x40, 'ascii,tail');
+INSERT INTO t1 (b, type) VALUES (0x80, 'tail');
+INSERT INTO t1 (b, type) VALUES (0x81, 'head,tail');
+INSERT INTO t1 (b, type) VALUES (0xFF, 'bad');
+INSERT INTO t1 (b, type) VALUES (0xA140, 'mb2,unassigned');
+INSERT INTO t1 (b, type) VALUES (0xA1A3, 'mb2');
+INSERT INTO t1 (b, type) VALUES (0xFE40, 'mb2');
+CREATE TABLE t2 AS SELECT
+ CONCAT(t1.b,t2.b) AS b,
+ t1.type AS type1,
+ t2.type AS type2,
+ CONCAT('[',t1.type,'][',t2.type,']') AS comment
+FROM t1, t1 t2;
+
+CREATE TABLE t3
+(
+ b VARBINARY(16),
+ c VARCHAR(16) CHARACTER SET gbk,
+ comment VARCHAR(128)
+);
+--echo #
+--echo # A combination of two valid characters, should give no warnings
+--echo #
+INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2
+WHERE
+ (FIND_IN_SET('ascii',type1) OR FIND_IN_SET('mb2',type1)) AND
+ (FIND_IN_SET('ascii',type2) OR FIND_IN_SET('mb2',type2))
+ORDER BY b;
+SELECT COUNT(*) FROM t3;
+SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b;
+SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b;
+DELETE FROM t2 WHERE b IN (SELECT b FROM t3);
+DELETE FROM t3;
+
+--echo #
+--echo # Sequences that start with a tail or a bad byte,
+--echo # or end with a bad byte, all should be fixed.
+--echo #
+INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2
+WHERE type1='tail' OR type1='bad' OR type2='bad'
+ORDER BY b;
+SELECT COUNT(*) FROM t3;
+SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b;
+SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b;
+DELETE FROM t2 WHERE b IN (SELECT b FROM t3);
+DELETE FROM t3;
+
+--echo #
+--echo # Sequences that start with an ASCII or an MB2 character,
+--echo # followed by a non-ASCII tail, all should be fixed.
+--echo #
+INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2
+WHERE (FIND_IN_SET('mb2',type1) OR FIND_IN_SET('ascii',type1))
+ AND (FIND_IN_SET('tail',type2) AND NOT FIND_IN_SET('ascii',type2))
+ORDER BY b;
+SELECT COUNT(*) FROM t3;
+SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b;
+SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b;
+DELETE FROM t2 WHERE b IN (SELECT b FROM t3);
+DELETE FROM t3;
+
+--echo #
+--echo # Other sequences
+--echo #
+INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2 ORDER BY b;
+SELECT COUNT(*) FROM t3;
+SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b;
+SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b;
+DELETE FROM t3;
+DROP TABLE t3;
+DROP TABLE t2;
+
+CREATE TABLE t2 AS SELECT
+ CONCAT(t1.b,t2.b,t3.b) AS b,
+ t1.type AS type1,
+ t2.type AS type2,
+ t3.type AS type3,
+ CONCAT('[',t1.type,'][',t2.type,'][',t3.type,']') AS comment
+FROM t1, t1 t2,t1 t3;
+SELECT COUNT(*) FROM t2;
+
+CREATE TABLE t3
+(
+ b VARBINARY(16),
+ c VARCHAR(16) CHARACTER SET gbk,
+ comment VARCHAR(128)
+);
+
+--echo #
+--echo # A combination of three valid characters, should give no warnings
+--echo #
+INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2
+WHERE
+ (FIND_IN_SET('ascii',type1) OR FIND_IN_SET('mb2',type1)) AND
+ (FIND_IN_SET('ascii',type2) OR FIND_IN_SET('mb2',type2)) AND
+ (FIND_IN_SET('ascii',type3) OR FIND_IN_SET('mb2',type3))
+ORDER BY b;
+SELECT COUNT(*) FROM t3;
+SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b;
+SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b;
+DELETE FROM t2 WHERE b IN (SELECT b FROM t3);
+DELETE FROM t3;
+
+--echo #
+--echo # Sequences that start with a tail or a bad byte,
+--echo # or have a bad byte, all should be fixed.
+--echo #
+INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2
+WHERE type1='tail' OR type1='bad' OR type2='bad' OR type3='bad'
+ORDER BY b;
+SELECT COUNT(*) FROM t3;
+SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b;
+SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b;
+DELETE FROM t2 WHERE b IN (SELECT b FROM t3);
+DELETE FROM t3;
+
+--echo #
+--echo # Sequences that start with an ASCII or an MB2 character,
+--echo # followed by a pure non-ASCII tail, all should be fixed.
+--echo #
+INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2
+WHERE (FIND_IN_SET('mb2',type1) OR FIND_IN_SET('ascii',type1))
+ AND type2='tail'
+ORDER BY b;
+SELECT COUNT(*) FROM t3;
+SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b;
+SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b;
+DELETE FROM t2 WHERE b IN (SELECT b FROM t3);
+DELETE FROM t3;
+
+--echo #
+--echo # Sequences that consist of two ASCII or MB2 characters,
+--echo # followed by a pure non-ASCII tail, all should be fixed.
+--echo #
+INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2
+WHERE (FIND_IN_SET('mb2',type1) OR FIND_IN_SET('ascii',type1)) AND
+ (FIND_IN_SET('mb2',type2) OR FIND_IN_SET('ascii',type2)) AND
+ type3='tail'
+ORDER BY b;
+SELECT COUNT(*) FROM t3;
+SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b;
+SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b;
+DELETE FROM t2 WHERE b IN (SELECT b FROM t3);
+DELETE FROM t3;
+
+
+--echo #
+--echo # Sequences that consist of two MB2 characters,
+--echo # followed by a non-ASCII head or tail, all should be fixed.
+--echo #
+INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2
+WHERE FIND_IN_SET('mb2',type1) AND FIND_IN_SET('mb2',type2)
+ AND NOT FIND_IN_SET('ascii',type3)
+ AND NOT FIND_IN_SET('mb2',type3)
+ORDER BY b;
+SELECT COUNT(*) FROM t3;
+SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b;
+SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b;
+DELETE FROM t2 WHERE b IN (SELECT b FROM t3);
+DELETE FROM t3;
+
+
+--echo #
+--echo # Sequences that consist of head + tail + MB2 should go without warnings
+--echo #
+INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2
+WHERE FIND_IN_SET('head',type1)
+ AND FIND_IN_SET('tail',type2)
+ AND FIND_IN_SET('mb2',type3)
+ORDER BY b;
+SELECT COUNT(*) FROM t3;
+SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b;
+SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b;
+DELETE FROM t2 WHERE b IN (SELECT b FROM t3);
+DELETE FROM t3;
+
+--echo #
+--echo # Sequences that consist of (ascii or mb2) + head + tail should go without warnings
+--echo #
+INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2
+WHERE (FIND_IN_SET('ascii',type1) OR FIND_IN_SET('mb2',type1))
+ AND FIND_IN_SET('head',type2)
+ AND FIND_IN_SET('tail',type3)
+ORDER BY b;
+SELECT COUNT(*) FROM t3;
+SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b;
+SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b;
+DELETE FROM t2 WHERE b IN (SELECT b FROM t3);
+DELETE FROM t3;
+
+
+#--echo #
+#--echo # Other sequences
+#--echo #
+INSERT INTO t3 (b,c,comment) SELECT b,b,comment FROM t2 ORDER BY b;
+SELECT COUNT(*) FROM t3;
+SELECT HEX(c),comment FROM t3 WHERE b=c ORDER BY b;
+SELECT HEX(c),HEX(b),comment FROM t3 WHERE b<>c ORDER BY b;
+
+DROP TABLE t3;
+DROP TABLE t2;
+DROP TABLE t1;
+
+--echo #
+--echo # END OF MDEV-6566 Different INSERT behaviour on bad bytes with and without character set conversion
+--echo #
+
+--echo #
+--echo # End of 10.1 tests
+--echo #
diff --git a/mysql-test/t/ctype_sjis.test b/mysql-test/t/ctype_sjis.test
index ae110b20cb2..2777cf6a035 100644
--- a/mysql-test/t/ctype_sjis.test
+++ b/mysql-test/t/ctype_sjis.test
@@ -145,7 +145,7 @@ DROP TEMPORARY TABLE head, tail;
SHOW CREATE TABLE t1;
UPDATE t1 SET a=unhex(code) ORDER BY code;
SELECT COUNT(*) FROM t1;
-SELECT COUNT(*) FROM t1 WHERE a<>'' AND OCTET_LENGTH(a)=1;
+SELECT COUNT(*) FROM t1 WHERE a<>'?' AND OCTET_LENGTH(a)=1;
SELECT COUNT(*) FROM t1 WHERE a<>'' AND OCTET_LENGTH(a)=2;
#
# Display all characters that have upper or lower case mapping.
diff --git a/mysql-test/t/ctype_ujis.test b/mysql-test/t/ctype_ujis.test
index 48dc0e63058..94fc7ffe4c0 100644
--- a/mysql-test/t/ctype_ujis.test
+++ b/mysql-test/t/ctype_ujis.test
@@ -1276,7 +1276,7 @@ SHOW CREATE TABLE t1;
UPDATE t1 SET a=unhex(code) ORDER BY code;
SELECT COUNT(*) FROM t1;
-SELECT COUNT(*) FROM t1 WHERE a<>'';
+SELECT COUNT(*) FROM t1 WHERE a<>'?';
SELECT COUNT(*) FROM t1 WHERE a<>'' AND OCTET_LENGTH(a)=2;
SELECT COUNT(*) FROM t1 WHERE a<>'' AND OCTET_LENGTH(a)=3;
#
diff --git a/mysql-test/t/ctype_utf8mb4.test b/mysql-test/t/ctype_utf8mb4.test
index 7a3c67bb417..232dd8fcb5d 100644
--- a/mysql-test/t/ctype_utf8mb4.test
+++ b/mysql-test/t/ctype_utf8mb4.test
@@ -1832,6 +1832,50 @@ set @@collation_connection=utf8mb4_bin;
--echo # End of 5.6 tests
--echo #
+--echo #
+--echo # Start of 10.0 tests
+--echo #
+
+--echo #
+--echo # MDEV-6566 Different INSERT behaviour on bad bytes with and without character set conversion
+--echo #
+
+--echo #
+--echo # This test sets session character set to 3-byte utf8,
+--echo # but then sends a 4-byte sequence (which is wrong for 3-byte utf8).
+--echo # It should be replaced to four question marks: '????' in both columns
+--echo # (i.e. four unknown bytes are replaced to four question marks),
+--echo # then the rest of the string should be stored, so we get 'a ???? b'.
+--echo #
+SET NAMES utf8;
+CREATE TABLE t1 (
+ a VARCHAR(32) CHARACTER SET utf8mb4,
+ b VARCHAR(32) CHARACTER SET utf8
+);
+INSERT INTO t1 SELECT 'a 😁 b', 'a 😁 b';
+SELECT * FROM t1;
+DROP TABLE t1;
+
+--echo #
+--echo # This test sets session character set to 4-byte utf8,
+--echo # then normally sends a 4-byte sequence.
+--echo # It should be stored AS IS into the utf8mb4 column (a),
+--echo # and should be replaced to a single question mark in the utf8 column (b)
+--echo # (i.e. one character that cannot be converted is replaced to one question mark).
+--echo #
+
+SET NAMES utf8mb4;
+CREATE TABLE t1 (
+ a VARCHAR(32) CHARACTER SET utf8mb4,
+ b VARCHAR(32) CHARACTER SET utf8
+);
+INSERT INTO t1 SELECT 'a 😁 b', 'a 😁 b';
+SELECT * FROM t1;
+DROP TABLE t1;
+
+--echo #
+--echo # End of 10.0 tests
+--echo #
--echo #
--echo # End of tests
diff --git a/sql/sql_string.cc b/sql/sql_string.cc
index 9fb462e9a9d..a0b63956ed0 100644
--- a/sql/sql_string.cc
+++ b/sql/sql_string.cc
@@ -922,8 +922,8 @@ String_copier::well_formed_copy(CHARSET_INFO *to_cs,
my_charset_same(from_cs, to_cs))
{
m_cannot_convert_error_pos= NULL;
- return to_cs->cset->copy_abort(to_cs, to, to_length, from, from_length,
- nchars, this);
+ return to_cs->cset->copy_fix(to_cs, to, to_length, from, from_length,
+ nchars, this);
}
else
{
diff --git a/strings/ctype-big5.c b/strings/ctype-big5.c
index d631bd0a34e..eda81c0c4d3 100644
--- a/strings/ctype-big5.c
+++ b/strings/ctype-big5.c
@@ -50,7 +50,7 @@
#define MY_FUNCTION_NAME(x) my_ ## x ## _big5
#define IS_MB2_CHAR(x,y) (isbig5head(x) && isbig5tail(y))
-#define WELL_FORMED_LEN
+#define DEFINE_ASIAN_ROUTINES
#include "ctype-mb.ic"
@@ -6843,6 +6843,9 @@ my_mb_wc_big5(CHARSET_INFO *cs __attribute__((unused)),
if (s+2>e)
return MY_CS_TOOSMALL2;
+ if (!IS_MB2_CHAR(hi, s[1]))
+ return MY_CS_ILSEQ;
+
if (!(pwc[0]=func_big5_uni_onechar((hi<<8)+s[1])))
return -2;
@@ -6894,7 +6897,9 @@ static MY_CHARSET_HANDLER my_charset_big5_handler=
my_strtoll10_8bit,
my_strntoull10rnd_8bit,
my_scan_8bit,
- my_copy_abort_mb,
+ my_charlen_big5,
+ my_well_formed_char_length_big5,
+ my_copy_fix_mb,
};
struct charset_info_st my_charset_big5_chinese_ci=
diff --git a/strings/ctype-bin.c b/strings/ctype-bin.c
index 6b53b34159a..95f31038ee6 100644
--- a/strings/ctype-bin.c
+++ b/strings/ctype-bin.c
@@ -549,6 +549,8 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_strtoll10_8bit,
my_strntoull10rnd_8bit,
my_scan_8bit,
+ my_charlen_8bit,
+ my_well_formed_char_length_8bit,
my_copy_8bit,
};
diff --git a/strings/ctype-cp932.c b/strings/ctype-cp932.c
index 13129a6a874..2e26a98bf05 100644
--- a/strings/ctype-cp932.c
+++ b/strings/ctype-cp932.c
@@ -186,7 +186,7 @@ static const uchar sort_order_cp932[]=
#define MY_FUNCTION_NAME(x) my_ ## x ## _cp932
#define IS_8BIT_CHAR(x) iscp932kata(x)
#define IS_MB2_CHAR(x,y) (iscp932head(x) && iscp932tail(y))
-#define WELL_FORMED_LEN
+#define DEFINE_ASIAN_ROUTINES
#include "ctype-mb.ic"
@@ -34765,7 +34765,9 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_strtoll10_8bit,
my_strntoull10rnd_8bit,
my_scan_8bit,
- my_copy_abort_mb,
+ my_charlen_cp932,
+ my_well_formed_char_length_cp932,
+ my_copy_fix_mb,
};
diff --git a/strings/ctype-euc_kr.c b/strings/ctype-euc_kr.c
index eab9539ad45..a2c95bf77c8 100644
--- a/strings/ctype-euc_kr.c
+++ b/strings/ctype-euc_kr.c
@@ -204,7 +204,7 @@ static const uchar sort_order_euc_kr[]=
#define MY_FUNCTION_NAME(x) my_ ## x ## _euckr
#define IS_MB2_CHAR(x,y) (iseuc_kr_head(x) && iseuc_kr_tail(y))
-#define WELL_FORMED_LEN
+#define DEFINE_ASIAN_ROUTINES
#include "ctype-mb.ic"
@@ -9928,6 +9928,9 @@ my_mb_wc_euc_kr(CHARSET_INFO *cs __attribute__((unused)),
if (s+2>e)
return MY_CS_TOOSMALL2;
+ if (!IS_MB2_CHAR(hi, s[1]))
+ return MY_CS_ILSEQ;
+
if (!(pwc[0]=func_ksc5601_uni_onechar((hi<<8)+s[1])))
return -2;
@@ -9979,7 +9982,9 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_strtoll10_8bit,
my_strntoull10rnd_8bit,
my_scan_8bit,
- my_copy_abort_mb,
+ my_charlen_euckr,
+ my_well_formed_char_length_euckr,
+ my_copy_fix_mb,
};
diff --git a/strings/ctype-eucjpms.c b/strings/ctype-eucjpms.c
index 52873c2f87e..827feda927b 100644
--- a/strings/ctype-eucjpms.c
+++ b/strings/ctype-eucjpms.c
@@ -198,7 +198,7 @@ static const uchar sort_order_eucjpms[]=
#define IS_MB2_KATA(x,y) (iseucjpms_ss2(x) && iskata(y))
#define IS_MB2_CHAR(x,y) (IS_MB2_KATA(x,y) || IS_MB2_JIS(x,y))
#define IS_MB3_CHAR(x,y,z) (iseucjpms_ss3(x) && IS_MB2_JIS(y,z))
-#define WELL_FORMED_LEN
+#define DEFINE_ASIAN_ROUTINES
#include "ctype-mb.ic"
@@ -67511,7 +67511,9 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_strtoll10_8bit,
my_strntoull10rnd_8bit,
my_scan_8bit,
- my_copy_abort_mb,
+ my_charlen_eucjpms,
+ my_well_formed_char_length_eucjpms,
+ my_copy_fix_mb,
};
diff --git a/strings/ctype-gb2312.c b/strings/ctype-gb2312.c
index a4268b8fd68..129e8edb966 100644
--- a/strings/ctype-gb2312.c
+++ b/strings/ctype-gb2312.c
@@ -167,7 +167,7 @@ static const uchar sort_order_gb2312[]=
#define MY_FUNCTION_NAME(x) my_ ## x ## _gb2312
#define IS_MB2_CHAR(x,y) (isgb2312head(x) && isgb2312tail(y))
-#define WELL_FORMED_LEN
+#define DEFINE_ASIAN_ROUTINES
#include "ctype-mb.ic"
@@ -6330,7 +6330,10 @@ my_mb_wc_gb2312(CHARSET_INFO *cs __attribute__((unused)),
if (s+2>e)
return MY_CS_TOOSMALL2;
-
+
+ if (!IS_MB2_CHAR(hi, s[1]))
+ return MY_CS_ILSEQ;
+
if (!(pwc[0]=func_gb2312_uni_onechar(((hi<<8)+s[1])&0x7F7F)))
return -2;
@@ -6382,7 +6385,9 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_strtoll10_8bit,
my_strntoull10rnd_8bit,
my_scan_8bit,
- my_copy_abort_mb,
+ my_charlen_gb2312,
+ my_well_formed_char_length_gb2312,
+ my_copy_fix_mb,
};
diff --git a/strings/ctype-gbk.c b/strings/ctype-gbk.c
index 392fdb487b6..b3bd1efb6c4 100644
--- a/strings/ctype-gbk.c
+++ b/strings/ctype-gbk.c
@@ -45,7 +45,7 @@
#define MY_FUNCTION_NAME(x) my_ ## x ## _gbk
#define IS_MB2_CHAR(x,y) (isgbkhead(x) && isgbktail(y))
-#define WELL_FORMED_LEN
+#define DEFINE_ASIAN_ROUTINES
#include "ctype-mb.ic"
@@ -10724,6 +10724,9 @@ my_mb_wc_gbk(CHARSET_INFO *cs __attribute__((unused)),
if (s+2>e)
return MY_CS_TOOSMALL2;
+ if (!IS_MB2_CHAR(hi, s[1]))
+ return MY_CS_ILSEQ;
+
if (!(pwc[0]=func_gbk_uni_onechar( (hi<<8) + s[1])))
return -2;
@@ -10776,7 +10779,9 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_strtoll10_8bit,
my_strntoull10rnd_8bit,
my_scan_8bit,
- my_copy_abort_mb,
+ my_charlen_gbk,
+ my_well_formed_char_length_gbk,
+ my_copy_fix_mb,
};
diff --git a/strings/ctype-latin1.c b/strings/ctype-latin1.c
index 099f03460ce..bc51911dceb 100644
--- a/strings/ctype-latin1.c
+++ b/strings/ctype-latin1.c
@@ -422,6 +422,8 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_strtoll10_8bit,
my_strntoull10rnd_8bit,
my_scan_8bit,
+ my_charlen_8bit,
+ my_well_formed_char_length_8bit,
my_copy_8bit,
};
diff --git a/strings/ctype-mb.c b/strings/ctype-mb.c
index fc41563324a..5947c3d4f4a 100644
--- a/strings/ctype-mb.c
+++ b/strings/ctype-mb.c
@@ -424,25 +424,95 @@ size_t my_well_formed_len_mb(CHARSET_INFO *cs, const char *b, const char *e,
/*
- Copy a multi-byte string. Abort if a bad byte sequence was found.
- Note more than "nchars" characters are copied.
+ Append a badly formed piece of string.
+ Bad bytes are fixed to '?'.
+
+ @param to The destination string
+ @param to_end The end of the destination string
+ @param from The source string
+ @param from_end The end of the source string
+ @param nchars Write not more than "nchars" characters.
+ @param status Copying status, must be previously initialized,
+ e.g. using well_formed_char_length() on the original
+ full source string.
*/
+static size_t
+my_append_fix_badly_formed_tail(CHARSET_INFO *cs,
+ char *to, char *to_end,
+ const char *from, const char *from_end,
+ size_t nchars,
+ MY_STRCOPY_STATUS *status)
+{
+ char *to0= to; /* Start of the output, used to compute the returned length */
+
+ for ( ; nchars; nchars--) /* One iteration per character written: valid or '?' */
+ {
+ int chlen;
+ if ((chlen= cs->cset->charlen(cs, (const uchar*) from,
+ (const uchar *) from_end)) > 0)
+ {
+ /* Found a valid character */ /* chlen == 1..MBMAXLEN */
+ DBUG_ASSERT(chlen <= (int) cs->mbmaxlen);
+ if (to + chlen > to_end)
+ goto end; /* The character does not fit into the destination */
+ memcpy(to, from, (size_t) chlen);
+ from+= chlen;
+ to+= chlen;
+ continue;
+ }
+ if (chlen == MY_CS_ILSEQ) /* chlen == 0 */
+ {
+ DBUG_ASSERT(from < from_end); /* Shouldn't get MY_CS_ILSEQ if empty */
+ goto bad;
+ }
+ /* Got an incomplete character */ /* chlen == MY_CS_TOOSMALLXXX */
+ DBUG_ASSERT(chlen >= MY_CS_TOOSMALL6);
+ DBUG_ASSERT(chlen <= MY_CS_TOOSMALL);
+ if (from >= from_end)
+ break; /* End of the source string */
+bad:
+ /* Bad byte sequence, or incomplete character found */
+ if (!status->m_well_formed_error_pos)
+ status->m_well_formed_error_pos= from; /* Keep only the first error position */
+
+ if ((chlen= cs->cset->wc_mb(cs, '?', (uchar*) to, (uchar *) to_end)) <= 0)
+ break; /* Question mark does not fit into the destination */
+ to+= chlen;
+ from++; /* Exactly one bad source byte is consumed per '?' written */
+ }
+end:
+ status->m_source_end_pos= from; /* First source byte that was not consumed */
+ return to - to0; /* Number of bytes appended to the destination */
+}
+
+
size_t
-my_copy_abort_mb(CHARSET_INFO *cs,
- char *dst, size_t dst_length,
- const char *src, size_t src_length,
- size_t nchars, MY_STRCOPY_STATUS *status)
+my_copy_fix_mb(CHARSET_INFO *cs,
+ char *dst, size_t dst_length,
+ const char *src, size_t src_length,
+ size_t nchars, MY_STRCOPY_STATUS *status)
{
- int well_formed_error;
- size_t res;
+ size_t well_formed_nchars; /* Characters in the well-formed prefix */
+ size_t well_formed_length; /* Bytes in the well-formed prefix */
+ size_t fixed_length; /* Bytes written while fixing the badly formed tail */
set_if_smaller(src_length, dst_length);
- res= cs->cset->well_formed_len(cs, src, src + src_length,
- nchars, &well_formed_error);
- memmove(dst, src, res);
- status->m_source_end_pos= src + res;
- status->m_well_formed_error_pos= well_formed_error ? src + res : NULL;
- return res;
+ well_formed_nchars= cs->cset->well_formed_char_length(cs,
+ src, src + src_length,
+ nchars, status); /* "status" is read below, so this call is expected to fill it */
+ DBUG_ASSERT(well_formed_nchars <= nchars);
+ memmove(dst, src, (well_formed_length= status->m_source_end_pos - src)); /* Copy the well-formed prefix as is */
+ if (!status->m_well_formed_error_pos)
+ return well_formed_length; /* No badly formed sequence was reported: nothing to fix */
+
+ fixed_length= my_append_fix_badly_formed_tail(cs, /* Continue after the prefix, writing '?' for bad bytes */
+ dst + well_formed_length,
+ dst + dst_length,
+ src + well_formed_length,
+ src + src_length,
+ nchars - well_formed_nchars, /* Characters still allowed */
+ status);
+ return well_formed_length + fixed_length;
}
diff --git a/strings/ctype-mb.ic b/strings/ctype-mb.ic
index 70cc89c9af0..55094535d5e 100644
--- a/strings/ctype-mb.ic
+++ b/strings/ctype-mb.ic
@@ -29,7 +29,70 @@
#endif
-#ifdef WELL_FORMED_LEN
+#ifdef DEFINE_ASIAN_ROUTINES
+#define DEFINE_WELL_FORMED_LEN
+#define DEFINE_WELL_FORMED_CHAR_LENGTH
+#define DEFINE_CHARLEN
+#endif
+
+
+#ifdef DEFINE_CHARLEN
+/**
+ Returns length of the left-most character of a string.
+ @param cs - charset with mbminlen==1 and mbmaxlen<=4
+ @param b - the beginning of the string
+ @param e - the end of the string
+
+ @return MY_CS_ILSEQ if a bad byte sequence was found
+ @return MY_CS_TOOSMALL(N) if the string ended unexpectedly
+ @return >0 if a valid character was found
+*/
+static int
+MY_FUNCTION_NAME(charlen)(CHARSET_INFO *cs __attribute__((unused)),
+ const uchar *b, const uchar *e)
+{
+ DBUG_ASSERT(cs->mbminlen == 1);
+ DBUG_ASSERT(cs->mbmaxlen <= 4);
+
+ if (b >= e)
+ return MY_CS_TOOSMALL;
+ if ((uchar) b[0] < 128)
+ return 1; /* Single byte ASCII character */
+
+#ifdef IS_8BIT_CHAR
+ if (IS_8BIT_CHAR(b[0]))
+ {
+ /* Single byte non-ASCII character, e.g. half width kana in sjis */
+ return 1;
+ }
+#endif
+
+ if (b + 2 > e)
+ return MY_CS_TOOSMALLN(2);
+ if (IS_MB2_CHAR(b[0], b[1]))
+ return 2; /* Double byte character */
+
+#ifdef IS_MB3_CHAR
+ if (b + 3 > e)
+ return MY_CS_TOOSMALLN(3);
+ if (IS_MB3_CHAR(b[0], b[1], b[2]))
+ return 3; /* Three-byte character */
+#endif
+
+#ifdef IS_MB4_CHAR
+ if (b + 4 > e)
+ return MY_CS_TOOSMALLN(4);
+ if (IS_MB4_CHAR(b[0], b[1], b[2], b[3]))
+ return 4; /* Four-byte character */
+#endif
+
+ /* Wrong byte sequence */
+ return MY_CS_ILSEQ;
+}
+#endif /* DEFINE_CHARLEN */
+
+
+#ifdef DEFINE_WELL_FORMED_LEN
/**
Returns well formed length of a character string with
variable character length for character sets with:
@@ -91,4 +154,105 @@ MY_FUNCTION_NAME(well_formed_len)(CHARSET_INFO *cs __attribute__((unused)),
return b - b0;
}
-#endif /* WELL_FORMED_LEN */
+#endif /* DEFINE_WELL_FORMED_LEN */
+
+
+
+#ifdef DEFINE_WELL_FORMED_CHAR_LENGTH
+/**
+ Returns well formed length of a string
+ measured in characters (rather than in bytes).
+ Version for character sets that define IS_MB?_CHAR(), e.g. big5.
+*/
+static size_t
+MY_FUNCTION_NAME(well_formed_char_length)(CHARSET_INFO *cs __attribute__((unused)),
+ const char *b, const char *e,
+ size_t nchars,
+ MY_STRCOPY_STATUS *status)
+{
+ size_t nchars0= nchars;
+ for ( ; b < e && nchars ; nchars--)
+ {
+ if ((uchar) b[0] < 128)
+ {
+ b++; /* Single byte ASCII character */
+ continue;
+ }
+
+ if (b + 2 <= e && IS_MB2_CHAR(b[0], b[1]))
+ {
+ b+= 2; /* Double byte character */
+ continue;
+ }
+
+#ifdef IS_MB3_CHAR
+ if (b + 3 <= e && IS_MB3_CHAR(b[0], b[1], b[2]))
+ {
+ b+= 3; /* Three-byte character */
+ continue;
+ }
+#endif
+
+#ifdef IS_MB4_CHAR
+ if (b + 4 <= e && IS_MB4_CHAR(b[0], b[1], b[2], b[3]))
+ {
+ b+= 4; /* Four-byte character */
+ continue;
+ }
+#endif
+
+#ifdef IS_8BIT_CHAR
+ if (IS_8BIT_CHAR(b[0]))
+ {
+ b++; /* Single byte non-ASCII character, e.g. half width kana in sjis */
+ continue;
+ }
+#endif
+
+ /* Wrong byte sequence */
+ status->m_source_end_pos= status->m_well_formed_error_pos= b;
+ return nchars0 - nchars;
+ }
+ status->m_source_end_pos= b;
+ status->m_well_formed_error_pos= NULL;
+ return nchars0 - nchars;
+}
+#endif /* DEFINE_WELL_FORMED_CHAR_LENGTH */
+
+
+#ifdef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
+#ifndef CHARLEN
+#error CHARLEN is not defined
+#endif
+/**
+ Returns well formed length of a string
+ measured in characters (rather than in bytes).
+ Version for character sets that define CHARLEN(), e.g. utf8.
+ CHARLEN(cs,b,e) must use the same return code convention that mb_wc() does:
+ - a positive number in the range [1-mbmaxlen] if a valid
+ single-byte or multi-byte character was found
+ - MY_CS_ILSEQ (0) on a bad byte sequence
+ - MY_CS_TOOSMALLxx if the incoming sequence is incomplete
+*/
+static size_t
+MY_FUNCTION_NAME(well_formed_char_length)(CHARSET_INFO *cs __attribute__((unused)),
+ const char *b, const char *e,
+ size_t nchars,
+ MY_STRCOPY_STATUS *status)
+{
+ size_t nchars0= nchars;
+ int chlen;
+ for ( ; nchars ; nchars--, b+= chlen)
+ {
+ if ((chlen= CHARLEN(cs, (uchar*) b, (uchar*) e)) <= 0)
+ {
+ status->m_well_formed_error_pos= b < e ? b : NULL;
+ status->m_source_end_pos= b;
+ return nchars0 - nchars;
+ }
+ }
+ status->m_well_formed_error_pos= NULL;
+ status->m_source_end_pos= b;
+ return nchars0 - nchars;
+}
+#endif /* DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN */
diff --git a/strings/ctype-simple.c b/strings/ctype-simple.c
index b010c528979..d7a1b3f33b4 100644
--- a/strings/ctype-simple.c
+++ b/strings/ctype-simple.c
@@ -248,6 +248,13 @@ int my_strcasecmp_8bit(CHARSET_INFO * cs,const char *s, const char *t)
}
+int my_charlen_8bit(CHARSET_INFO *cs __attribute__((unused)),
+ const uchar *str, const uchar *end)
+{
+ return str >= end ? MY_CS_TOOSMALL : 1;
+}
+
+
int my_mb_wc_8bit(CHARSET_INFO *cs,my_wc_t *wc,
const uchar *str,
const uchar *end __attribute__((unused)))
@@ -1108,6 +1115,19 @@ size_t my_well_formed_len_8bit(CHARSET_INFO *cs __attribute__((unused)),
}
+size_t
+my_well_formed_char_length_8bit(CHARSET_INFO *cs __attribute__((unused)),
+ const char *start, const char *end,
+ size_t nchars, MY_STRCOPY_STATUS *status)
+{
+ size_t nbytes= (size_t) (end - start);
+ size_t res= MY_MIN(nbytes, nchars);
+ status->m_well_formed_error_pos= NULL;
+ status->m_source_end_pos= start + res;
+ return res;
+}
+
+
/*
Copy a 8-bit string. Not more than "nchars" character are copied.
*/
@@ -1906,6 +1926,8 @@ MY_CHARSET_HANDLER my_charset_8bit_handler=
my_strtoll10_8bit,
my_strntoull10rnd_8bit,
my_scan_8bit,
+ my_charlen_8bit,
+ my_well_formed_char_length_8bit,
my_copy_8bit,
};
diff --git a/strings/ctype-sjis.c b/strings/ctype-sjis.c
index 432e2e5e823..bbf0026cf2b 100644
--- a/strings/ctype-sjis.c
+++ b/strings/ctype-sjis.c
@@ -187,7 +187,7 @@ static const uchar sort_order_sjis[]=
#define MY_FUNCTION_NAME(x) my_ ## x ## _sjis
#define IS_8BIT_CHAR(x) issjiskata(x)
#define IS_MB2_CHAR(x,y) (issjishead(x) && issjistail(y))
-#define WELL_FORMED_LEN
+#define DEFINE_ASIAN_ROUTINES
#include "ctype-mb.ic"
@@ -34144,7 +34144,9 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_strtoll10_8bit,
my_strntoull10rnd_8bit,
my_scan_8bit,
- my_copy_abort_mb,
+ my_charlen_sjis,
+ my_well_formed_char_length_sjis,
+ my_copy_fix_mb,
};
diff --git a/strings/ctype-tis620.c b/strings/ctype-tis620.c
index 343fb812e20..6537b380ab3 100644
--- a/strings/ctype-tis620.c
+++ b/strings/ctype-tis620.c
@@ -886,6 +886,8 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_strtoll10_8bit,
my_strntoull10rnd_8bit,
my_scan_8bit,
+ my_charlen_8bit,
+ my_well_formed_char_length_8bit,
my_copy_8bit,
};
diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c
index 8f234e9e3a8..d1441a4d3a5 100644
--- a/strings/ctype-ucs2.c
+++ b/strings/ctype-ucs2.c
@@ -92,62 +92,107 @@ my_strcasecmp_mb2_or_mb4(CHARSET_INFO *cs __attribute__((unused)),
}
+typedef enum
+{
+ MY_CHAR_COPY_OK= 0, /* The character was OK */
+ MY_CHAR_COPY_ERROR= 1, /* The character was not OK, and could not be fixed */
+ MY_CHAR_COPY_FIXED= 2 /* The character was not OK, was fixed to '?' */
+} my_char_copy_status_t;
+
+
/*
- Copy an UCS2/UTF16/UTF32 string.
- Not more that "nchars" characters are copied.
+ Copies an incomplete character, left-padding it with 0x00 bytes.
+
+ @param cs Character set
+ @param dst The destination string
+ @param dst_length Space available in dst
+ @param src The source string
+ @param src_length Length of src
+ @param nchars Copy not more than nchars characters.
+ The "nchars" parameter of the caller.
+ Only 0 and non-0 are important here.
+ @param fix What to do if zero-padding did not produce a valid
+ character:
+ - FALSE - exit with error.
+ - TRUE - try to put '?' instead.
+
+ @return MY_CHAR_COPY_OK if after zero-padding got a valid character.
+ cs->mbminlen bytes were written to "dst".
+ @return MY_CHAR_COPY_FIXED if after zero-padding did not get a valid
+ character, but wrote '?' to the destination
+ string instead.
+ cs->mbminlen bytes were written to "dst".
+ @return MY_CHAR_COPY_ERROR If failed and nothing was written to "dst".
+ Possible reasons:
+ - dst_length was too short
+ - nchars was 0
+ - the character after padding appeared not
+ to be valid, and could not fix it to '?'.
+*/
+static my_char_copy_status_t
+my_copy_incomplete_char(CHARSET_INFO *cs,
+ char *dst, size_t dst_length,
+ const char *src, size_t src_length,
+ size_t nchars, my_bool fix)
+{
+ size_t pad_length;
+ size_t src_offset= src_length % cs->mbminlen;
+ if (dst_length < cs->mbminlen || !nchars)
+ return MY_CHAR_COPY_ERROR;
+
+ pad_length= cs->mbminlen - src_offset;
+ bzero(dst, pad_length);
+ memmove(dst + pad_length, src, src_offset);
+ /*
+ In some cases left zero-padding can create an incorrect character.
+ For example:
+ INSERT INTO t1 (utf32_column) VALUES (0x110000);
+ We'll pad the value to 0x00110000, which is a wrong UTF32 sequence!
+ The valid characters range is limited to 0x00000000..0x0010FFFF.
+
+ Make sure we didn't pad to an incorrect character.
+ */
+ if (cs->cset->charlen(cs, (uchar *) dst, (uchar *) dst + cs->mbminlen) ==
+ (int) cs->mbminlen)
+ return MY_CHAR_COPY_OK;
- UCS2/UTF16/UTF32 may need to prepend zero some bytes,
- e.g. when copying from a BINARY source:
- INSERT INTO t1 (ucs2_column) VALUES (0x01);
- 0x01 -> 0x0001
+ if (fix &&
+ cs->cset->wc_mb(cs, '?', (uchar *) dst, (uchar *) dst + cs->mbminlen) ==
+ (int) cs->mbminlen)
+ return MY_CHAR_COPY_FIXED;
+
+ return MY_CHAR_COPY_ERROR;
+}
+
+
+/*
+ Copy an UCS2/UTF16/UTF32 string, fix bad characters.
*/
static size_t
-my_copy_abort_mb2_or_mb4(CHARSET_INFO *cs,
- char *dst, size_t dst_length,
- const char *src, size_t src_length,
- size_t nchars, MY_STRCOPY_STATUS *status)
+my_copy_fix_mb2_or_mb4(CHARSET_INFO *cs,
+ char *dst, size_t dst_length,
+ const char *src, size_t src_length,
+ size_t nchars, MY_STRCOPY_STATUS *status)
{
- size_t src_offset;
-
- if ((src_offset= (src_length % cs->mbminlen)))
- {
- int well_formed_error;
- size_t pad_length;
- if (dst_length < cs->mbminlen || !nchars)
- {
- status->m_source_end_pos= status->m_well_formed_error_pos= src;
- return 0;
- }
-
- pad_length= cs->mbminlen - src_offset;
- bzero(dst, pad_length);
- memmove(dst + pad_length, src, src_offset);
- /*
- In some cases left zero-padding can create an incorrect character.
- For example:
- INSERT INTO t1 (utf32_column) VALUES (0x110000);
- We'll pad the value to 0x00110000, which is a wrong UTF32 sequence!
- The valid characters range is limited to 0x00000000..0x0010FFFF.
-
- Make sure we didn't pad to an incorrect character.
- */
- if (cs->cset->well_formed_len(cs,
- dst, dst + cs->mbminlen, 1,
- &well_formed_error) != cs->mbminlen)
- {
- status->m_source_end_pos= status->m_well_formed_error_pos= src;
- return 0;
- }
- nchars--;
- src+= src_offset;
- src_length-= src_offset;
- dst+= cs->mbminlen;
- dst_length-= cs->mbminlen;
- return
- cs->mbminlen /* The left-padded character */ +
- my_copy_abort_mb(cs, dst, dst_length, src, src_length, nchars, status);
+ size_t length2, src_offset= src_length % cs->mbminlen;
+ my_char_copy_status_t padstatus;
+
+ if (!src_offset)
+ return my_copy_fix_mb(cs, dst, dst_length,
+ src, src_length, nchars, status);
+ if ((padstatus= my_copy_incomplete_char(cs, dst, dst_length,
+ src, src_length, nchars, TRUE)) ==
+ MY_CHAR_COPY_ERROR)
+ {
+ status->m_source_end_pos= status->m_well_formed_error_pos= src;
+ return 0;
}
- return my_copy_abort_mb(cs, dst, dst_length, src, src_length, nchars, status);
+ length2= my_copy_fix_mb(cs, dst + cs->mbminlen, dst_length - cs->mbminlen,
+ src + src_offset, src_length - src_offset,
+ nchars - 1, status);
+ if (padstatus == MY_CHAR_COPY_FIXED)
+ status->m_well_formed_error_pos= src;
+ return cs->mbminlen /* The left-padded character */ + length2;
}
@@ -1475,6 +1520,24 @@ my_ismbchar_utf16(CHARSET_INFO *cs, const char *b, const char *e)
}
+static int
+my_charlen_utf16(CHARSET_INFO *cs, const uchar *str, const uchar *end)
+{
+ my_wc_t wc;
+ return cs->cset->mb_wc(cs, &wc, str, end);
+}
+
+
+#define MY_FUNCTION_NAME(x) my_ ## x ## _utf16
+#define CHARLEN(cs,str,end) my_charlen_utf16(cs,str,end)
+#define DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
+#include "ctype-mb.ic"
+#undef MY_FUNCTION_NAME
+#undef CHARLEN
+#undef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
+/* Defines my_well_formed_char_length_utf16 */
+
+
static uint
my_mbcharlen_utf16(CHARSET_INFO *cs __attribute__((unused)),
uint c __attribute__((unused)))
@@ -1742,7 +1805,9 @@ MY_CHARSET_HANDLER my_charset_utf16_handler=
my_strtoll10_mb2,
my_strntoull10rnd_mb2_or_mb4,
my_scan_mb2,
- my_copy_abort_mb2_or_mb4,
+ my_charlen_utf16,
+ my_well_formed_char_length_utf16,
+ my_copy_fix_mb2_or_mb4,
};
@@ -1912,7 +1977,9 @@ static MY_CHARSET_HANDLER my_charset_utf16le_handler=
my_strtoll10_mb2,
my_strntoull10rnd_mb2_or_mb4,
my_scan_mb2,
- my_copy_abort_mb2_or_mb4,
+ my_charlen_utf16,
+ my_well_formed_char_length_utf16,
+ my_copy_fix_mb2_or_mb4,
};
@@ -1987,6 +2054,13 @@ struct charset_info_st my_charset_utf16le_bin=
#ifdef HAVE_CHARSET_utf32
+/*
+ Check if b0 and b1 start a valid UTF32 four-byte sequence.
+ Don't accept characters greater than U+10FFFF.
+*/
+#define IS_UTF32_MBHEAD4(b0,b1) (!(b0) && ((uchar) (b1) <= 0x10))
+
+
static int
my_utf32_uni(CHARSET_INFO *cs __attribute__((unused)),
my_wc_t *pwc, const uchar *s, const uchar *e)
@@ -1994,7 +2068,7 @@ my_utf32_uni(CHARSET_INFO *cs __attribute__((unused)),
if (s + 4 > e)
return MY_CS_TOOSMALL4;
*pwc= (s[0] << 24) + (s[1] << 16) + (s[2] << 8) + (s[3]);
- return 4;
+ return *pwc > 0x10FFFF ? MY_CS_ILSEQ : 4;
}
@@ -2004,7 +2078,10 @@ my_uni_utf32(CHARSET_INFO *cs __attribute__((unused)),
{
if (s + 4 > e)
return MY_CS_TOOSMALL4;
-
+
+ if (wc > 0x10FFFF)
+ return MY_CS_ILUNI;
+
s[0]= (uchar) (wc >> 24);
s[1]= (uchar) (wc >> 16) & 0xFF;
s[2]= (uchar) (wc >> 8) & 0xFF;
@@ -2263,10 +2340,29 @@ my_ismbchar_utf32(CHARSET_INFO *cs __attribute__((unused)),
const char *b,
const char *e)
{
- return b + 4 > e ? 0 : 4;
+ return b + 4 > e || !IS_UTF32_MBHEAD4(b[0], b[1]) ? 0 : 4;
}
+static int
+my_charlen_utf32(CHARSET_INFO *cs __attribute__((unused)),
+ const uchar *b, const uchar *e)
+{
+ return b + 4 > e ? MY_CS_TOOSMALL4 :
+ IS_UTF32_MBHEAD4(b[0], b[1]) ? 4 : MY_CS_ILSEQ;
+}
+
+
+#define MY_FUNCTION_NAME(x) my_ ## x ## _utf32
+#define CHARLEN(cs,str,end) my_charlen_utf32(cs,str,end)
+#define DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
+#include "ctype-mb.ic"
+#undef MY_FUNCTION_NAME
+#undef CHARLEN
+#undef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
+/* Defines my_well_formed_char_length_utf32 */
+
+
static uint
my_mbcharlen_utf32(CHARSET_INFO *cs __attribute__((unused)) ,
uint c __attribute__((unused)))
@@ -2579,8 +2675,7 @@ my_well_formed_len_utf32(CHARSET_INFO *cs __attribute__((unused)),
}
for (; b < e; b+= 4)
{
- /* Don't accept characters greater than U+10FFFF */
- if (b[0] || (uchar) b[1] > 0x10)
+ if (!IS_UTF32_MBHEAD4(b[0], b[1]))
{
*error= 1;
return b - b0;
@@ -2827,7 +2922,9 @@ MY_CHARSET_HANDLER my_charset_utf32_handler=
my_strtoll10_utf32,
my_strntoull10rnd_mb2_or_mb4,
my_scan_utf32,
- my_copy_abort_mb2_or_mb4,
+ my_charlen_utf32,
+ my_well_formed_char_length_utf32,
+ my_copy_fix_mb2_or_mb4,
};
@@ -2961,6 +3058,14 @@ static const uchar to_upper_ucs2[] = {
};
+static int
+my_charlen_ucs2(CHARSET_INFO *cs __attribute__((unused)),
+ const uchar *s, const uchar *e)
+{
+ return s + 2 > e ? MY_CS_TOOSMALLN(2) : 2;
+}
+
+
static int my_ucs2_uni(CHARSET_INFO *cs __attribute__((unused)),
my_wc_t * pwc, const uchar *s, const uchar *e)
{
@@ -3264,6 +3369,31 @@ size_t my_well_formed_len_ucs2(CHARSET_INFO *cs __attribute__((unused)),
}
+static size_t
+my_well_formed_char_length_ucs2(CHARSET_INFO *cs __attribute__((unused)),
+ const char *b, const char *e,
+ size_t nchars, MY_STRCOPY_STATUS *status)
+{
+ size_t length= e - b;
+ if (nchars * 2 <= length)
+ {
+ status->m_well_formed_error_pos= NULL;
+ status->m_source_end_pos= b + (nchars * 2);
+ return nchars;
+ }
+ if (length % 2)
+ {
+ status->m_well_formed_error_pos= status->m_source_end_pos= e - 1;
+ }
+ else
+ {
+ status->m_well_formed_error_pos= NULL;
+ status->m_source_end_pos= e;
+ }
+ return length / 2;
+}
+
+
static
int my_wildcmp_ucs2_ci(CHARSET_INFO *cs,
const char *str,const char *str_end,
@@ -3446,7 +3576,9 @@ MY_CHARSET_HANDLER my_charset_ucs2_handler=
my_strtoll10_mb2,
my_strntoull10rnd_mb2_or_mb4,
my_scan_mb2,
- my_copy_abort_mb2_or_mb4,
+ my_charlen_ucs2,
+ my_well_formed_char_length_ucs2,
+ my_copy_fix_mb2_or_mb4,
};
diff --git a/strings/ctype-ujis.c b/strings/ctype-ujis.c
index 99f5be3fa38..cb000a2afa0 100644
--- a/strings/ctype-ujis.c
+++ b/strings/ctype-ujis.c
@@ -197,7 +197,7 @@ static const uchar sort_order_ujis[]=
#define IS_MB2_KATA(x,y) (isujis_ss2(x) && iskata(y))
#define IS_MB2_CHAR(x, y) (IS_MB2_KATA(x,y) || IS_MB2_JIS(x,y))
#define IS_MB3_CHAR(x, y, z) (isujis_ss3(x) && IS_MB2_JIS(y,z))
-#define WELL_FORMED_LEN
+#define DEFINE_ASIAN_ROUTINES
#include "ctype-mb.ic"
@@ -67255,7 +67255,9 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_strtoll10_8bit,
my_strntoull10rnd_8bit,
my_scan_8bit,
- my_copy_abort_mb,
+ my_charlen_ujis,
+ my_well_formed_char_length_ujis,
+ my_copy_fix_mb,
};
diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c
index 1116228f706..56824aac59e 100644
--- a/strings/ctype-utf8.c
+++ b/strings/ctype-utf8.c
@@ -5446,8 +5446,8 @@ int my_wildcmp_utf8(CHARSET_INFO *cs,
static
-int my_valid_mbcharlen_utf8(CHARSET_INFO *cs __attribute__((unused)),
- const uchar *s, const uchar *e)
+int my_charlen_utf8(CHARSET_INFO *cs __attribute__((unused)),
+ const uchar *s, const uchar *e)
{
uchar c;
@@ -5515,7 +5515,7 @@ my_well_formed_len_utf8(CHARSET_INFO *cs, const char *b, const char *e,
{
int mb_len;
- if ((mb_len= my_valid_mbcharlen_utf8(cs, (uchar*) b, (uchar*) e)) <= 0)
+ if ((mb_len= my_charlen_utf8(cs, (uchar*) b, (uchar*) e)) <= 0)
{
*error= b < e ? 1 : 0;
break;
@@ -5526,9 +5526,20 @@ my_well_formed_len_utf8(CHARSET_INFO *cs, const char *b, const char *e,
return (size_t) (b - b_start);
}
+
+#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8
+#define CHARLEN(cs,str,end) my_charlen_utf8(cs,str,end)
+#define DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
+#include "ctype-mb.ic"
+#undef MY_FUNCTION_NAME
+#undef CHARLEN
+#undef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
+/* my_well_formed_char_length_utf8 */
+
+
static uint my_ismbchar_utf8(CHARSET_INFO *cs,const char *b, const char *e)
{
- int res= my_valid_mbcharlen_utf8(cs, (const uchar*)b, (const uchar*)e);
+ int res= my_charlen_utf8(cs, (const uchar*) b, (const uchar*) e);
return (res>1) ? res : 0;
}
@@ -5615,7 +5626,9 @@ MY_CHARSET_HANDLER my_charset_utf8_handler=
my_strtoll10_8bit,
my_strntoull10rnd_8bit,
my_scan_8bit,
- my_copy_abort_mb,
+ my_charlen_utf8,
+ my_well_formed_char_length_utf8,
+ my_copy_fix_mb,
};
@@ -7125,6 +7138,24 @@ my_wc_mb_filename(CHARSET_INFO *cs __attribute__((unused)),
}
+static int
+my_charlen_filename(CHARSET_INFO *cs, const uchar *str, const uchar *end)
+{
+ my_wc_t wc;
+ return cs->cset->mb_wc(cs, &wc, str, end);
+}
+
+
+#define MY_FUNCTION_NAME(x) my_ ## x ## _filename
+#define CHARLEN(cs,str,end) my_charlen_filename(cs,str,end)
+#define DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
+#include "ctype-mb.ic"
+#undef MY_FUNCTION_NAME
+#undef CHARLEN
+#undef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
+/* my_well_formed_char_length_filename */
+
+
static MY_COLLATION_HANDLER my_collation_filename_handler =
{
NULL, /* init */
@@ -7169,7 +7200,9 @@ static MY_CHARSET_HANDLER my_charset_filename_handler=
my_strtoll10_8bit,
my_strntoull10rnd_8bit,
my_scan_8bit,
- my_copy_abort_mb,
+ my_charlen_filename,
+ my_well_formed_char_length_filename,
+ my_copy_fix_mb,
};
@@ -7954,8 +7987,8 @@ my_wildcmp_utf8mb4(CHARSET_INFO *cs,
static int
-my_valid_mbcharlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
- const uchar *s, const uchar *e)
+my_charlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
+ const uchar *s, const uchar *e)
{
uchar c;
@@ -8015,7 +8048,7 @@ size_t my_well_formed_len_utf8mb4(CHARSET_INFO *cs,
{
int mb_len;
- if ((mb_len= my_valid_mbcharlen_utf8mb4(cs, (uchar*) b, (uchar*) e)) <= 0)
+ if ((mb_len= my_charlen_utf8mb4(cs, (uchar*) b, (uchar*) e)) <= 0)
{
*error= b < e ? 1 : 0;
break;
@@ -8027,10 +8060,19 @@ size_t my_well_formed_len_utf8mb4(CHARSET_INFO *cs,
}
+#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8mb4
+#define CHARLEN(cs,str,end) my_charlen_utf8mb4(cs,str,end)
+#define DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
+#include "ctype-mb.ic"
+#undef MY_FUNCTION_NAME
+#undef CHARLEN
+#undef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
+/* my_well_formed_char_length_utf8mb4 */
+
static uint
my_ismbchar_utf8mb4(CHARSET_INFO *cs, const char *b, const char *e)
{
- int res= my_valid_mbcharlen_utf8mb4(cs, (const uchar*)b, (const uchar*)e);
+ int res= my_charlen_utf8mb4(cs, (const uchar*) b, (const uchar*) e);
return (res > 1) ? res : 0;
}
@@ -8113,7 +8155,9 @@ MY_CHARSET_HANDLER my_charset_utf8mb4_handler=
my_strtoll10_8bit,
my_strntoull10rnd_8bit,
my_scan_8bit,
- my_copy_abort_mb,
+ my_charlen_utf8mb4,
+ my_well_formed_char_length_utf8mb4,
+ my_copy_fix_mb,
};