diff options
-rw-r--r-- | include/m_ctype.h | 3 | ||||
-rw-r--r-- | mysql-test/r/ctype_utf8mb4.result | 23 | ||||
-rw-r--r-- | mysql-test/r/loaddata.result | 3 | ||||
-rw-r--r-- | mysql-test/std_data/loaddata/mdev-11343.txt | 12 | ||||
-rw-r--r-- | mysql-test/t/ctype_utf8mb4.test | 11 | ||||
-rw-r--r-- | mysql-test/t/loaddata.test | 1 | ||||
-rw-r--r-- | sql/sql_load.cc | 189 | ||||
-rw-r--r-- | sql/sql_string.h | 1 |
8 files changed, 196 insertions, 47 deletions
diff --git a/include/m_ctype.h b/include/m_ctype.h index 5994816cbfc..8d9838fdde2 100644 --- a/include/m_ctype.h +++ b/include/m_ctype.h @@ -180,6 +180,9 @@ extern MY_UNI_CTYPE my_uni_ctype[256]; /* A helper macros for "need at least n bytes" */ #define MY_CS_TOOSMALLN(n) (-100-(n)) +#define MY_CS_IS_TOOSMALL(rc) ((rc) >= MY_CS_TOOSMALL6 && (rc) <= MY_CS_TOOSMALL) + + #define MY_SEQ_INTTAIL 1 #define MY_SEQ_SPACES 2 diff --git a/mysql-test/r/ctype_utf8mb4.result b/mysql-test/r/ctype_utf8mb4.result index 38814bc36d6..50382b5d5ca 100644 --- a/mysql-test/r/ctype_utf8mb4.result +++ b/mysql-test/r/ctype_utf8mb4.result @@ -3356,5 +3356,28 @@ DFFFFFDFFFFF9CFFFF9DFFFF9EFFFF # End of 5.6 tests # # +# Start of 10.0 tests +# +# +# MDEV-11343 LOAD DATA INFILE fails to load data with an escape character followed by a multi-byte character +# +CREATE TABLE t1 (a TEXT CHARACTER SET utf8mb4); +LOAD DATA INFILE '../../std_data/loaddata/mdev-11343.txt' INTO TABLE t1 CHARACTER SET utf8mb4; +SELECT HEX(a) FROM t1; +HEX(a) +C3A4 +C3A478 +78C3A4 +78C3A478 +EA99A0 +EA99A078 +78EA99A0 +78EA99A078 +F09F988E +F09F988E78 +78F09F988E +78F09F988E78 +DROP TABLE t1; +# # End of tests # diff --git a/mysql-test/r/loaddata.result b/mysql-test/r/loaddata.result index 2f2a3579eec..12462305dc8 100644 --- a/mysql-test/r/loaddata.result +++ b/mysql-test/r/loaddata.result @@ -552,7 +552,8 @@ CREATE DATABASE d2 CHARSET utf8; USE d2; CREATE TABLE t1 (val TEXT); LOAD DATA INFILE '../../std_data/bug20683959loaddata.txt' INTO TABLE t1; -ERROR HY000: Invalid utf8 character string: 'ร"RT @niouzechun: \9058\221A' +Warnings: +Warning 1366 Incorrect string value: '\xF5\x80\x81\xAE\xE7\xB9...' 
for column 'val' at row 1 DROP TABLE d1.t1, d2.t1; DROP DATABASE d1; DROP DATABASE d2; diff --git a/mysql-test/std_data/loaddata/mdev-11343.txt b/mysql-test/std_data/loaddata/mdev-11343.txt new file mode 100644 index 00000000000..dded1215ffa --- /dev/null +++ b/mysql-test/std_data/loaddata/mdev-11343.txt @@ -0,0 +1,12 @@ +\รค +\รคx +x\รค +x\รคx +\๊ +\๊ x +x\๊ +x\๊ x +\๐ +\๐x +x\๐ +x\๐x diff --git a/mysql-test/t/ctype_utf8mb4.test b/mysql-test/t/ctype_utf8mb4.test index 6b876cc5eba..cf1c103137e 100644 --- a/mysql-test/t/ctype_utf8mb4.test +++ b/mysql-test/t/ctype_utf8mb4.test @@ -1864,6 +1864,17 @@ set @@collation_connection=utf8mb4_bin; --echo # End of 5.6 tests --echo # +--echo # +--echo # Start of 10.0 tests +--echo # + +--echo # +--echo # MDEV-11343 LOAD DATA INFILE fails to load data with an escape character followed by a multi-byte character +--echo # +CREATE TABLE t1 (a TEXT CHARACTER SET utf8mb4); +LOAD DATA INFILE '../../std_data/loaddata/mdev-11343.txt' INTO TABLE t1 CHARACTER SET utf8mb4; +SELECT HEX(a) FROM t1; +DROP TABLE t1; --echo # --echo # End of tests diff --git a/mysql-test/t/loaddata.test b/mysql-test/t/loaddata.test index 7d0f3852b66..9f2aafc8efd 100644 --- a/mysql-test/t/loaddata.test +++ b/mysql-test/t/loaddata.test @@ -675,7 +675,6 @@ SELECT HEX(val) FROM t1; CREATE DATABASE d2 CHARSET utf8; USE d2; CREATE TABLE t1 (val TEXT); ---error ER_INVALID_CHARACTER_STRING LOAD DATA INFILE '../../std_data/bug20683959loaddata.txt' INTO TABLE t1; DROP TABLE d1.t1, d2.t1; diff --git a/sql/sql_load.cc b/sql/sql_load.cc index af4b25185d0..51a284964e1 100644 --- a/sql/sql_load.cc +++ b/sql/sql_load.cc @@ -79,6 +79,81 @@ class READ_INFO { NET *io_net; int level; /* for load xml */ + +#if MYSQL_VERSION_ID >= 100200 +#error This 10.0 and 10.1 specific fix should be removed in 10.2. 
+#error Fix read_mbtail() to use my_charlen() instead of my_charlen_tmp() +#else + int my_charlen_tmp(CHARSET_INFO *cs, const char *str, const char *end) + { + my_wc_t wc; + return cs->cset->mb_wc(cs, &wc, (const uchar *) str, (const uchar *) end); + } + + /** + Read a tail of a multi-byte character. + The first byte of the character is assumed to be already + read from the file and appended to "str". + + @returns true - if EOF happened unexpectedly + @returns false - no EOF happened: found a good multi-byte character, + or a bad byte sequence + + Note: + The return value depends only on EOF: + - read_mbtail() returns "false" if a good character was read, but also + - read_mbtail() returns "false" if an incomplete byte sequence was found + and no EOF happened. + + For example, suppose we have a ujis file with bytes 0x8FA10A, where: + - 0x8FA1 is an incomplete prefix of a 3-byte character + (it should be [8F][A1-FE][A1-FE] to make a full 3-byte character) + - 0x0A is a line delimiter + This file has some broken data, the trailing [A1-FE] is missing. 
+ + In this example it works as follows: + - 0x8F is read from the file and put into "str" before the call + for read_mbtail() + - 0xA1 is read from the file and put into "str" by read_mbtail() + - 0x0A is kept in the read queue, so the next read iteration after + the current read_mbtail() call will normally find it and recognize it as + a line delimiter + - the current call for read_mbtail() returns "false", + because no EOF happened + */ + bool read_mbtail(String *str) + { + int chlen; + if ((chlen= my_charlen_tmp(read_charset, str->end() - 1, str->end())) == 1) + return false; // Single byte character found + for (uint32 length0= str->length() - 1 ; MY_CS_IS_TOOSMALL(chlen); ) + { + int chr= GET; + if (chr == my_b_EOF) + { + DBUG_PRINT("info", ("read_mbtail: chlen=%d; unexpected EOF", chlen)); + return true; // EOF + } + str->append(chr); + chlen= my_charlen_tmp(read_charset, str->ptr() + length0, str->end()); + if (chlen == MY_CS_ILSEQ) + { + /** + It has been an incomplete (but a valid) sequence so far, + but the last byte turned it into a bad byte sequence. + Unget the very last byte. + */ + str->length(str->length() - 1); + PUSH(chr); + DBUG_PRINT("info", ("read_mbtail: ILSEQ")); + return false; // Bad byte sequence + } + } + DBUG_PRINT("info", ("read_mbtail: chlen=%d", chlen)); + return false; // Good multi-byte character + } +#endif + public: bool error,line_cuted,found_null,enclosed; uchar *row_start, /* Found row starts here */ @@ -1474,6 +1549,54 @@ inline int READ_INFO::terminator(const uchar *ptr,uint length) } +/** + Read a field. + + The data in the loaded file was presumably escaped using + - either select_export::send_data() OUTFILE + - or mysql_real_escape_string() + using the same character set with the one specified in the current + "LOAD DATA INFILE ... CHARACTER SET ..." (or the default LOAD character set). + + Note, non-escaped multi-byte characters are scanned as a single entity. 
+ This is needed to correctly distinguish between: + - 0x5C as an escape character versus + - 0x5C as the second byte in a multi-byte sequence (big5, cp932, gbk, sjis) + + Parts of escaped multi-byte characters are scanned on different loop + iterations. See the comment about 0x5C handling in select_export::send_data() + in sql_class.cc. + + READ_INFO::read_field() does not check wellformedness. + Raising wellformedness errors or warnings in READ_INFO::read_field() + would be wrong, as the data after unescaping can go into a BLOB field, + or into a TEXT/VARCHAR field of a different character set. + The loop below only makes sure to revert escaping made by + select_export::send_data() or mysql_real_escape_string(). + Wellformedness is checked later, during Field::store(str,length,cs) time. + + Note, in some cases users can supply data which did not go through + escaping properly. For example, utf8 "\<C3><A4>" + (backslash followed by LATIN SMALL LETTER A WITH DIAERESIS) + is improperly escaped data that could not be generated by + select_export::send_data() / mysql_real_escape_string(): + - either there should be two backslashes: "\\<C3><A4>" + - or there should be no backslashes at all: "<C3><A4>" + "\<C3>" and "<A4>" are scanned on two different loop iterations and + store "<C3><A4>" into the field. + + Note, adding useless escapes before multi-byte characters like in the + example above is safe in case of utf8, but is not safe in case of + character sets that have escape_with_backslash_is_dangerous==TRUE, + such as big5, cp932, gbk, sjis. This can lead to mis-interpretation of the + data. Suppose we have a big5 character "<EE><5C>" followed by <30> (digit 0). + If we add an extra escape before this sequence, then we'll get + <5C><EE><5C><30>. The first loop iteration will turn <5C><EE> into <EE>. + The second loop iteration will turn <5C><30> into <30>. 
+ So the program that generates a dump file for further use with LOAD DATA + must make sure to use escapes properly. +*/ + int READ_INFO::read_field() { int chr,found_enclosed_char; @@ -1510,7 +1633,8 @@ int READ_INFO::read_field() for (;;) { - while ( to < end_of_buff) + // Make sure we have enough space for the longest multi-byte character. + while ( to + read_charset->mbmaxlen < end_of_buff) { chr = GET; if (chr == my_b_EOF) @@ -1598,52 +1722,27 @@ int READ_INFO::read_field() } } #ifdef USE_MB - uint ml= my_mbcharlen(read_charset, chr); - if (ml == 0) - { - *to= '\0'; - my_error(ER_INVALID_CHARACTER_STRING, MYF(0), - read_charset->csname, buffer); - error= true; - return 1; - } - - if (ml > 1 && - to + ml <= end_of_buff) - { - uchar* p= to; - *to++ = chr; - - for (uint i= 1; i < ml; i++) - { - chr= GET; - if (chr == my_b_EOF) - { - /* - Need to back up the bytes already ready from illformed - multi-byte char - */ - to-= i; - goto found_eof; - } - *to++ = chr; - } - if (my_ismbchar(read_charset, - (const char *)p, - (const char *)to)) - continue; - for (uint i= 0; i < ml; i++) - PUSH(*--to); - chr= GET; - } - else if (ml > 1) - { - // Buffer is too small, exit while loop, and reallocate. - PUSH(chr); - break; - } #endif *to++ = (uchar) chr; +#if MYSQL_VERSION_ID >= 100200 +#error This 10.0 and 10.1 specific fix should be removed in 10.2 +#else + if (my_mbcharlen(read_charset, (uchar) chr) > 1) + { + /* + A known MBHEAD found. Try to scan the full multi-byte character. + Otherwise, a possible following second byte 0x5C would be + mis-interpreted as an escape on the next iteration. + (Important for big5, gbk, sjis, cp932). + */ + String tmp((char *) to - 1, read_charset->mbmaxlen, read_charset); + tmp.length(1); + bool eof= read_mbtail(&tmp); + to+= tmp.length() - 1; + if (eof) + goto found_eof; + } +#endif } /* ** We come here if buffer is too small. 
Enlarge it and continue diff --git a/sql/sql_string.h b/sql/sql_string.h index c287f051d98..557d14a79f8 100644 --- a/sql/sql_string.h +++ b/sql/sql_string.h @@ -136,6 +136,7 @@ public: inline bool is_empty() const { return (str_length == 0); } inline void mark_as_const() { Alloced_length= 0;} inline const char *ptr() const { return Ptr; } + inline const char *end() const { return Ptr + str_length; } inline char *c_ptr() { DBUG_ASSERT(!alloced || !Ptr || !Alloced_length || |