diff options
author | Alexander Barkov <bar@mariadb.org> | 2016-03-31 14:22:25 +0400 |
---|---|---|
committer | Alexander Barkov <bar@mariadb.org> | 2016-03-31 14:22:25 +0400 |
commit | 3fc6a8b832fd152f1fbabff08273e0223c0ff0ab (patch) | |
tree | 0144fb4e5187b92a422adffe6de9249081398f7d /sql | |
parent | 1d73005bf357a607423f858482c52ecb4712607d (diff) | |
download | mariadb-git-3fc6a8b832fd152f1fbabff08273e0223c0ff0ab.tar.gz |
MDEV-9811 LOAD DATA INFILE does not work well with gbk in some cases
MDEV-9824 LOAD DATA does not work with multi-byte strings in LINES TERMINATED BY when IGNORE is specified
Diffstat (limited to 'sql')
-rw-r--r-- | sql/sql_load.cc | 93 |
1 files changed, 72 insertions, 21 deletions
diff --git a/sql/sql_load.cc b/sql/sql_load.cc index d43eb884abd..f1c29203f3e 100644 --- a/sql/sql_load.cc +++ b/sql/sql_load.cc @@ -79,6 +79,14 @@ class READ_INFO { NET *io_net; int level; /* for load xml */ + bool getbyte(char *to) + { + int chr= GET; + if (chr == my_b_EOF) + return (eof= true); + *to= chr; + return false; + } public: bool error,line_cuted,found_null,enclosed; uchar *row_start, /* Found row starts here */ @@ -1706,33 +1714,76 @@ int READ_INFO::next_line() return 0; // No lines for (;;) { - int chr = GET; -#ifdef USE_MB - if (my_mbcharlen(read_charset, chr) > 1) - { - for (uint i=1; - chr != my_b_EOF && i<my_mbcharlen(read_charset, chr); - i++) - chr = GET; - if (chr == escape_char) - continue; - } -#endif - if (chr == my_b_EOF) - { - eof=1; - return 1; + int chlen; + char buf[MY_CS_MBMAXLEN]; + + if (getbyte(&buf[0])) + return 1; // EOF + + if (use_mb(read_charset) && + (chlen= my_charlen(read_charset, buf, buf + 1)) != 1) + { + uint i; + for (i= 1; MY_CS_IS_TOOSMALL(chlen); ) + { + DBUG_ASSERT(i < sizeof(buf)); + DBUG_ASSERT(chlen != 1); + if (getbyte(&buf[i++])) + return 1; // EOF + chlen= my_charlen(read_charset, buf, buf + i); + } + + /* + Either a complete multi-byte sequence, + or a broken byte sequence was found. + Check if the sequence is a prefix of the "LINES TERMINATED BY" string. + */ + if ((uchar) buf[0] == line_term_char && i <= line_term_length && + !memcmp(buf, line_term_ptr, i)) + { + if (line_term_length == i) + { + /* + We found a "LINES TERMINATED BY" string that consists + of a single multi-byte character. + */ + return 0; + } + /* + buf[] is a prefix of "LINES TERMINATED BY". + Now check the suffix. Length of the suffix of line_term_ptr + that still needs to be checked is (line_term_length - i). + Note, READ_INFO::terminator() assumes that the leftmost byte of the + argument is already scanned from the file and is checked to + be a known prefix (e.g. against line_term_char). + So we need to pass one extra byte. + */ + if (terminator(line_term_ptr + i - 1, line_term_length - i + 1)) + return 0; + } + /* + Here we have a good multi-byte sequence or a broken byte sequence, + and the sequence is not equal to "LINES TERMINATED BY". + No needs to check for escape_char, because: + - multi-byte escape characters in "FIELDS ESCAPED BY" are not + supported and are rejected at parse time. + - broken single-byte sequences are not recognized as escapes, + they are considered to be a part of the data and are converted to + question marks. + */ + line_cuted= true; + continue; } - if (chr == escape_char) + if (buf[0] == escape_char) { - line_cuted=1; + line_cuted= true; if (GET == my_b_EOF) - return 1; + return 1; continue; } - if (chr == line_term_char && terminator(line_term_ptr,line_term_length)) + if (buf[0] == line_term_char && terminator(line_term_ptr,line_term_length)) return 0; - line_cuted=1; + line_cuted= true; } } |