summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/m_ctype.h3
-rw-r--r--mysql-test/r/ctype_utf8mb4.result23
-rw-r--r--mysql-test/r/loaddata.result3
-rw-r--r--mysql-test/std_data/loaddata/mdev-11343.txt12
-rw-r--r--mysql-test/t/ctype_utf8mb4.test11
-rw-r--r--mysql-test/t/loaddata.test1
-rw-r--r--sql/sql_load.cc189
-rw-r--r--sql/sql_string.h1
8 files changed, 196 insertions, 47 deletions
diff --git a/include/m_ctype.h b/include/m_ctype.h
index 5994816cbfc..8d9838fdde2 100644
--- a/include/m_ctype.h
+++ b/include/m_ctype.h
@@ -180,6 +180,9 @@ extern MY_UNI_CTYPE my_uni_ctype[256];
/* A helper macros for "need at least n bytes" */
#define MY_CS_TOOSMALLN(n) (-100-(n))
+#define MY_CS_IS_TOOSMALL(rc) ((rc) >= MY_CS_TOOSMALL6 && (rc) <= MY_CS_TOOSMALL)
+
+
#define MY_SEQ_INTTAIL 1
#define MY_SEQ_SPACES 2
diff --git a/mysql-test/r/ctype_utf8mb4.result b/mysql-test/r/ctype_utf8mb4.result
index 38814bc36d6..50382b5d5ca 100644
--- a/mysql-test/r/ctype_utf8mb4.result
+++ b/mysql-test/r/ctype_utf8mb4.result
@@ -3356,5 +3356,28 @@ DFFFFFDFFFFF9CFFFF9DFFFF9EFFFF
# End of 5.6 tests
#
#
+# Start of 10.0 tests
+#
+#
+# MDEV-11343 LOAD DATA INFILE fails to load data with an escape character followed by a multi-byte character
+#
+CREATE TABLE t1 (a TEXT CHARACTER SET utf8mb4);
+LOAD DATA INFILE '../../std_data/loaddata/mdev-11343.txt' INTO TABLE t1 CHARACTER SET utf8mb4;
+SELECT HEX(a) FROM t1;
+HEX(a)
+C3A4
+C3A478
+78C3A4
+78C3A478
+EA99A0
+EA99A078
+78EA99A0
+78EA99A078
+F09F988E
+F09F988E78
+78F09F988E
+78F09F988E78
+DROP TABLE t1;
+#
# End of tests
#
diff --git a/mysql-test/r/loaddata.result b/mysql-test/r/loaddata.result
index 2f2a3579eec..12462305dc8 100644
--- a/mysql-test/r/loaddata.result
+++ b/mysql-test/r/loaddata.result
@@ -552,7 +552,8 @@ CREATE DATABASE d2 CHARSET utf8;
USE d2;
CREATE TABLE t1 (val TEXT);
LOAD DATA INFILE '../../std_data/bug20683959loaddata.txt' INTO TABLE t1;
-ERROR HY000: Invalid utf8 character string: 'ร"RT @niouzechun: \9058\221A'
+Warnings:
+Warning 1366 Incorrect string value: '\xF5\x80\x81\xAE\xE7\xB9...' for column 'val' at row 1
DROP TABLE d1.t1, d2.t1;
DROP DATABASE d1;
DROP DATABASE d2;
diff --git a/mysql-test/std_data/loaddata/mdev-11343.txt b/mysql-test/std_data/loaddata/mdev-11343.txt
new file mode 100644
index 00000000000..dded1215ffa
--- /dev/null
+++ b/mysql-test/std_data/loaddata/mdev-11343.txt
@@ -0,0 +1,12 @@
+\รค
+\รคx
+x\รค
+x\รคx
+\๊™ 
+\๊™ x
+x\๊™ 
+x\๊™ x
+\๐Ÿ˜Ž
+\๐Ÿ˜Žx
+x\๐Ÿ˜Ž
+x\๐Ÿ˜Žx
diff --git a/mysql-test/t/ctype_utf8mb4.test b/mysql-test/t/ctype_utf8mb4.test
index 6b876cc5eba..cf1c103137e 100644
--- a/mysql-test/t/ctype_utf8mb4.test
+++ b/mysql-test/t/ctype_utf8mb4.test
@@ -1864,6 +1864,17 @@ set @@collation_connection=utf8mb4_bin;
--echo # End of 5.6 tests
--echo #
+--echo #
+--echo # Start of 10.0 tests
+--echo #
+
+--echo #
+--echo # MDEV-11343 LOAD DATA INFILE fails to load data with an escape character followed by a multi-byte character
+--echo #
+CREATE TABLE t1 (a TEXT CHARACTER SET utf8mb4);
+LOAD DATA INFILE '../../std_data/loaddata/mdev-11343.txt' INTO TABLE t1 CHARACTER SET utf8mb4;
+SELECT HEX(a) FROM t1;
+DROP TABLE t1;
--echo #
--echo # End of tests
diff --git a/mysql-test/t/loaddata.test b/mysql-test/t/loaddata.test
index 7d0f3852b66..9f2aafc8efd 100644
--- a/mysql-test/t/loaddata.test
+++ b/mysql-test/t/loaddata.test
@@ -675,7 +675,6 @@ SELECT HEX(val) FROM t1;
CREATE DATABASE d2 CHARSET utf8;
USE d2;
CREATE TABLE t1 (val TEXT);
---error ER_INVALID_CHARACTER_STRING
LOAD DATA INFILE '../../std_data/bug20683959loaddata.txt' INTO TABLE t1;
DROP TABLE d1.t1, d2.t1;
diff --git a/sql/sql_load.cc b/sql/sql_load.cc
index af4b25185d0..51a284964e1 100644
--- a/sql/sql_load.cc
+++ b/sql/sql_load.cc
@@ -79,6 +79,81 @@ class READ_INFO {
NET *io_net;
int level; /* for load xml */
+
+#if MYSQL_VERSION_ID >= 100200
+#error This 10.0 and 10.1 specific fix should be removed in 10.2.
+#error Fix read_mbtail() to use my_charlen() instead of my_charlen_tmp()
+#else
+ int my_charlen_tmp(CHARSET_INFO *cs, const char *str, const char *end)
+ {
+ my_wc_t wc;
+ return cs->cset->mb_wc(cs, &wc, (const uchar *) str, (const uchar *) end);
+ }
+
+ /**
+ Read a tail of a multi-byte character.
+ The first byte of the character is assumed to be already
+ read from the file and appended to "str".
+
+ @returns true - if EOF happened unexpectedly
+ @returns false - no EOF happened: found a good multi-byte character,
+ or a bad byte sequence
+
+ Note:
+ The return value depends only on EOF:
+ - read_mbtail() returns "false" is a good character was read, but also
+ - read_mbtail() returns "false" if an incomplete byte sequence was found
+ and no EOF happened.
+
+ For example, suppose we have an ujis file with bytes 0x8FA10A, where:
+ - 0x8FA1 is an incomplete prefix of a 3-byte character
+ (it should be [8F][A1-FE][A1-FE] to make a full 3-byte character)
+ - 0x0A is a line demiliter
+ This file has some broken data, the trailing [A1-FE] is missing.
+
+ In this example it works as follows:
+ - 0x8F is read from the file and put into "data" before the call
+ for read_mbtail()
+ - 0xA1 is read from the file and put into "data" by read_mbtail()
+ - 0x0A is kept in the read queue, so the next read iteration after
+ the current read_mbtail() call will normally find it and recognize as
+ a line delimiter
+ - the current call for read_mbtail() returns "false",
+ because no EOF happened
+ */
+ bool read_mbtail(String *str)
+ {
+ int chlen;
+ if ((chlen= my_charlen_tmp(read_charset, str->end() - 1, str->end())) == 1)
+ return false; // Single byte character found
+ for (uint32 length0= str->length() - 1 ; MY_CS_IS_TOOSMALL(chlen); )
+ {
+ int chr= GET;
+ if (chr == my_b_EOF)
+ {
+ DBUG_PRINT("info", ("read_mbtail: chlen=%d; unexpected EOF", chlen));
+ return true; // EOF
+ }
+ str->append(chr);
+ chlen= my_charlen_tmp(read_charset, str->ptr() + length0, str->end());
+ if (chlen == MY_CS_ILSEQ)
+ {
+ /**
+ It has been an incomplete (but a valid) sequence so far,
+ but the last byte turned it into a bad byte sequence.
+ Unget the very last byte.
+ */
+ str->length(str->length() - 1);
+ PUSH(chr);
+ DBUG_PRINT("info", ("read_mbtail: ILSEQ"));
+ return false; // Bad byte sequence
+ }
+ }
+ DBUG_PRINT("info", ("read_mbtail: chlen=%d", chlen));
+ return false; // Good multi-byte character
+ }
+#endif
+
public:
bool error,line_cuted,found_null,enclosed;
uchar *row_start, /* Found row starts here */
@@ -1474,6 +1549,54 @@ inline int READ_INFO::terminator(const uchar *ptr,uint length)
}
+/**
+ Read a field.
+
+ The data in the loaded file was presumably escaped using
+ - either select_export::send_data() OUTFILE
+ - or mysql_real_escape_string()
+ using the same character set with the one specified in the current
+ "LOAD DATA INFILE ... CHARACTER SET ..." (or the default LOAD character set).
+
+ Note, non-escaped multi-byte characters are scanned as a single entity.
+ This is needed to correctly distinguish between:
+ - 0x5C as an escape character versus
+ - 0x5C as the second byte in a multi-byte sequence (big5, cp932, gbk, sjis)
+
+ Parts of escaped multi-byte characters are scanned on different loop
+ iterations. See the comment about 0x5C handling in select_export::send_data()
+ in sql_class.cc.
+
+ READ_INFO::read_field() does not check wellformedness.
+ Raising wellformedness errors or warnings in READ_INFO::read_field()
+ would be wrong, as the data after unescaping can go into a BLOB field,
+ or into a TEXT/VARCHAR field of a different character set.
+ The loop below only makes sure to revert escaping made by
+ select_export::send_data() or mysql_real_escape_string().
+ Wellformedness is checked later, during Field::store(str,length,cs) time.
+
+ Note, in some cases users can supply data which did not go through
+ escaping properly. For example, utf8 "\<C3><A4>"
+ (backslash followed by LATIN SMALL LETTER A WITH DIAERESIS)
+ is improperly escaped data that could not be generated by
+ select_export::send_data() / mysql_real_escape_string():
+ - either there should be two backslashes: "\\<C3><A4>"
+ - or there should be no backslashes at all: "<C3><A4>"
+ "\<C3>" and "<A4> are scanned on two different loop iterations and
+ store "<C3><A4>" into the field.
+
+ Note, adding useless escapes before multi-byte characters like in the
+ example above is safe in case of utf8, but is not safe in case of
+ character sets that have escape_with_backslash_is_dangerous==TRUE,
+ such as big5, cp932, gbk, sjis. This can lead to mis-interpretation of the
+ data. Suppose we have a big5 character "<EE><5C>" followed by <30> (digit 0).
+ If we add an extra escape before this sequence, then we'll get
+ <5C><EE><5C><30>. The first loop iteration will turn <5C><EE> into <EE>.
+ The second loop iteration will turn <5C><30> into <30>.
+ So the program that generates a dump file for further use with LOAD DATA
+ must make sure to use escapes properly.
+*/
+
int READ_INFO::read_field()
{
int chr,found_enclosed_char;
@@ -1510,7 +1633,8 @@ int READ_INFO::read_field()
for (;;)
{
- while ( to < end_of_buff)
+ // Make sure we have enough space for the longest multi-byte character.
+ while ( to + read_charset->mbmaxlen < end_of_buff)
{
chr = GET;
if (chr == my_b_EOF)
@@ -1598,52 +1722,27 @@ int READ_INFO::read_field()
}
}
#ifdef USE_MB
- uint ml= my_mbcharlen(read_charset, chr);
- if (ml == 0)
- {
- *to= '\0';
- my_error(ER_INVALID_CHARACTER_STRING, MYF(0),
- read_charset->csname, buffer);
- error= true;
- return 1;
- }
-
- if (ml > 1 &&
- to + ml <= end_of_buff)
- {
- uchar* p= to;
- *to++ = chr;
-
- for (uint i= 1; i < ml; i++)
- {
- chr= GET;
- if (chr == my_b_EOF)
- {
- /*
- Need to back up the bytes already ready from illformed
- multi-byte char
- */
- to-= i;
- goto found_eof;
- }
- *to++ = chr;
- }
- if (my_ismbchar(read_charset,
- (const char *)p,
- (const char *)to))
- continue;
- for (uint i= 0; i < ml; i++)
- PUSH(*--to);
- chr= GET;
- }
- else if (ml > 1)
- {
- // Buffer is too small, exit while loop, and reallocate.
- PUSH(chr);
- break;
- }
#endif
*to++ = (uchar) chr;
+#if MYSQL_VERSION_ID >= 100200
+#error This 10.0 and 10.1 specific fix should be removed in 10.2
+#else
+ if (my_mbcharlen(read_charset, (uchar) chr) > 1)
+ {
+ /*
+ A known MBHEAD found. Try to scan the full multi-byte character.
+ Otherwise, a possible following second byte 0x5C would be
+ mis-interpreted as an escape on the next iteration.
+ (Important for big5, gbk, sjis, cp932).
+ */
+ String tmp((char *) to - 1, read_charset->mbmaxlen, read_charset);
+ tmp.length(1);
+ bool eof= read_mbtail(&tmp);
+ to+= tmp.length() - 1;
+ if (eof)
+ goto found_eof;
+ }
+#endif
}
/*
** We come here if buffer is too small. Enlarge it and continue
diff --git a/sql/sql_string.h b/sql/sql_string.h
index c287f051d98..557d14a79f8 100644
--- a/sql/sql_string.h
+++ b/sql/sql_string.h
@@ -136,6 +136,7 @@ public:
inline bool is_empty() const { return (str_length == 0); }
inline void mark_as_const() { Alloced_length= 0;}
inline const char *ptr() const { return Ptr; }
+ inline const char *end() const { return Ptr + str_length; }
inline char *c_ptr()
{
DBUG_ASSERT(!alloced || !Ptr || !Alloced_length ||