diff options
author | Sergei Golubchik <serg@mariadb.org> | 2016-12-11 09:53:42 +0100 |
---|---|---|
committer | Sergei Golubchik <serg@mariadb.org> | 2016-12-11 09:53:42 +0100 |
commit | 2f20d297f8ea731d845bb220e680ad10c7a927bc (patch) | |
tree | 9bd18ef1ab766422ba4c51b4ab189e259955a2d0 /sql/sql_load.cc | |
parent | a629b5172e96c96c414fca70fffd64c80f2f7e8f (diff) | |
parent | eb4f2e063c341d9f3644339c68cb01679e782001 (diff) | |
download | mariadb-git-2f20d297f8ea731d845bb220e680ad10c7a927bc.tar.gz |
Merge branch '10.0' into 10.1
Diffstat (limited to 'sql/sql_load.cc')
-rw-r--r-- | sql/sql_load.cc | 192 |
1 files changed, 146 insertions, 46 deletions
diff --git a/sql/sql_load.cc b/sql/sql_load.cc index c2c97a37633..630c1e0d21e 100644 --- a/sql/sql_load.cc +++ b/sql/sql_load.cc @@ -79,6 +79,81 @@ class READ_INFO { NET *io_net; int level; /* for load xml */ + +#if MYSQL_VERSION_ID >= 100200 +#error This 10.0 and 10.1 specific fix should be removed in 10.2. +#error Fix read_mbtail() to use my_charlen() instead of my_charlen_tmp() +#else + int my_charlen_tmp(CHARSET_INFO *cs, const char *str, const char *end) + { + my_wc_t wc; + return cs->cset->mb_wc(cs, &wc, (const uchar *) str, (const uchar *) end); + } + + /** + Read a tail of a multi-byte character. + The first byte of the character is assumed to be already + read from the file and appended to "str". + + @returns true - if EOF happened unexpectedly + @returns false - no EOF happened: found a good multi-byte character, + or a bad byte sequence + + Note: + The return value depends only on EOF: + - read_mbtail() returns "false" if a good character was read, but also + - read_mbtail() returns "false" if an incomplete byte sequence was found + and no EOF happened. + + For example, suppose we have an ujis file with bytes 0x8FA10A, where: + - 0x8FA1 is an incomplete prefix of a 3-byte character + (it should be [8F][A1-FE][A1-FE] to make a full 3-byte character) + - 0x0A is a line delimiter + This file has some broken data, the trailing [A1-FE] is missing. 
+ + In this example it works as follows: + - 0x8F is read from the file and put into "data" before the call + for read_mbtail() + - 0xA1 is read from the file and put into "data" by read_mbtail() + - 0x0A is kept in the read queue, so the next read iteration after + the current read_mbtail() call will normally find it and recognize as + a line delimiter + - the current call for read_mbtail() returns "false", + because no EOF happened + */ + bool read_mbtail(String *str) + { + int chlen; + if ((chlen= my_charlen_tmp(read_charset, str->end() - 1, str->end())) == 1) + return false; // Single byte character found + for (uint32 length0= str->length() - 1 ; MY_CS_IS_TOOSMALL(chlen); ) + { + int chr= GET; + if (chr == my_b_EOF) + { + DBUG_PRINT("info", ("read_mbtail: chlen=%d; unexpected EOF", chlen)); + return true; // EOF + } + str->append(chr); + chlen= my_charlen_tmp(read_charset, str->ptr() + length0, str->end()); + if (chlen == MY_CS_ILSEQ) + { + /** + It has been an incomplete (but a valid) sequence so far, + but the last byte turned it into a bad byte sequence. + Unget the very last byte. + */ + str->length(str->length() - 1); + PUSH(chr); + DBUG_PRINT("info", ("read_mbtail: ILSEQ")); + return false; // Bad byte sequence + } + } + DBUG_PRINT("info", ("read_mbtail: chlen=%d", chlen)); + return false; // Good multi-byte character + } +#endif + public: bool error,line_cuted,found_null,enclosed; uchar *row_start, /* Found row starts here */ @@ -514,7 +589,8 @@ int mysql_load(THD *thd,sql_exchange *ex,TABLE_LIST *table_list, *enclosed, skip_lines, ignore); thd_proc_info(thd, "End bulk insert"); - thd_progress_next_stage(thd); + if (!error) + thd_progress_next_stage(thd); if (thd->locked_tables_mode <= LTM_LOCK_TABLES && table->file->ha_end_bulk_insert() && !error) { @@ -1449,6 +1525,54 @@ inline int READ_INFO::terminator(const uchar *ptr,uint length) } +/** + Read a field. 
+ + The data in the loaded file was presumably escaped using + - either select_export::send_data() OUTFILE + - or mysql_real_escape_string() + using the same character set as the one specified in the current + "LOAD DATA INFILE ... CHARACTER SET ..." (or the default LOAD character set). + + Note, non-escaped multi-byte characters are scanned as a single entity. + This is needed to correctly distinguish between: + - 0x5C as an escape character versus + - 0x5C as the second byte in a multi-byte sequence (big5, cp932, gbk, sjis) + + Parts of escaped multi-byte characters are scanned on different loop + iterations. See the comment about 0x5C handling in select_export::send_data() + in sql_class.cc. + + READ_INFO::read_field() does not check wellformedness. + Raising wellformedness errors or warnings in READ_INFO::read_field() + would be wrong, as the data after unescaping can go into a BLOB field, + or into a TEXT/VARCHAR field of a different character set. + The loop below only makes sure to revert escaping made by + select_export::send_data() or mysql_real_escape_string(). + Wellformedness is checked later, during Field::store(str,length,cs) time. + + Note, in some cases users can supply data which did not go through + escaping properly. For example, utf8 "\<C3><A4>" + (backslash followed by LATIN SMALL LETTER A WITH DIAERESIS) + is improperly escaped data that could not be generated by + select_export::send_data() / mysql_real_escape_string(): + - either there should be two backslashes: "\\<C3><A4>" + - or there should be no backslashes at all: "<C3><A4>" + "\<C3>" and "<A4>" are scanned on two different loop iterations and + store "<C3><A4>" into the field. + + Note, adding useless escapes before multi-byte characters like in the + example above is safe in case of utf8, but is not safe in case of + character sets that have escape_with_backslash_is_dangerous==TRUE, + such as big5, cp932, gbk, sjis. This can lead to mis-interpretation of the + data. 
Suppose we have a big5 character "<EE><5C>" followed by <30> (digit 0). + If we add an extra escape before this sequence, then we'll get + <5C><EE><5C><30>. The first loop iteration will turn <5C><EE> into <EE>. + The second loop iteration will turn <5C><30> into <30>. + So the program that generates a dump file for further use with LOAD DATA + must make sure to use escapes properly. +*/ + int READ_INFO::read_field() { int chr,found_enclosed_char; @@ -1485,7 +1609,8 @@ int READ_INFO::read_field() for (;;) { - while ( to < end_of_buff) + // Make sure we have enough space for the longest multi-byte character. + while ( to + read_charset->mbmaxlen < end_of_buff) { chr = GET; if (chr == my_b_EOF) @@ -1573,52 +1698,27 @@ int READ_INFO::read_field() } } #ifdef USE_MB - uint ml= my_mbcharlen(read_charset, chr); - if (ml == 0) - { - *to= '\0'; - my_error(ER_INVALID_CHARACTER_STRING, MYF(0), - read_charset->csname, buffer); - error= true; - return 1; - } - - if (ml > 1 && - to + ml <= end_of_buff) - { - uchar* p= to; - *to++ = chr; - - for (uint i= 1; i < ml; i++) - { - chr= GET; - if (chr == my_b_EOF) - { - /* - Need to back up the bytes already ready from illformed - multi-byte char - */ - to-= i; - goto found_eof; - } - *to++ = chr; - } - if (my_ismbchar(read_charset, - (const char *)p, - (const char *)to)) - continue; - for (uint i= 0; i < ml; i++) - PUSH(*--to); - chr= GET; - } - else if (ml > 1) - { - // Buffer is too small, exit while loop, and reallocate. - PUSH(chr); - break; - } #endif *to++ = (uchar) chr; +#if MYSQL_VERSION_ID >= 100200 +#error This 10.0 and 10.1 specific fix should be removed in 10.2 +#else + if (my_mbcharlen(read_charset, (uchar) chr) > 1) + { + /* + A known MBHEAD found. Try to scan the full multi-byte character. + Otherwise, a possible following second byte 0x5C would be + mis-interpreted as an escape on the next iteration. + (Important for big5, gbk, sjis, cp932). 
+ */ + String tmp((char *) to - 1, read_charset->mbmaxlen, read_charset); + tmp.length(1); + bool eof= read_mbtail(&tmp); + to+= tmp.length() - 1; + if (eof) + goto found_eof; + } +#endif } /* ** We come here if buffer is too small. Enlarge it and continue |