summaryrefslogtreecommitdiff
path: root/sql/sql_load.cc
diff options
context:
space:
mode:
authorAlexander Barkov <bar@mariadb.org>2016-03-31 14:22:25 +0400
committerAlexander Barkov <bar@mariadb.org>2016-03-31 14:22:25 +0400
commit3fc6a8b832fd152f1fbabff08273e0223c0ff0ab (patch)
tree0144fb4e5187b92a422adffe6de9249081398f7d /sql/sql_load.cc
parent1d73005bf357a607423f858482c52ecb4712607d (diff)
downloadmariadb-git-3fc6a8b832fd152f1fbabff08273e0223c0ff0ab.tar.gz
MDEV-9811 LOAD DATA INFILE does not work well with gbk in some cases
MDEV-9824 LOAD DATA does not work with multi-byte strings in LINES TERMINATED BY when IGNORE is specified
Diffstat (limited to 'sql/sql_load.cc')
-rw-r--r--sql/sql_load.cc93
1 files changed, 72 insertions, 21 deletions
diff --git a/sql/sql_load.cc b/sql/sql_load.cc
index d43eb884abd..f1c29203f3e 100644
--- a/sql/sql_load.cc
+++ b/sql/sql_load.cc
@@ -79,6 +79,14 @@ class READ_INFO {
NET *io_net;
int level; /* for load xml */
+ bool getbyte(char *to) // Fetch one value from GET into *to; returns true at end of input, false otherwise.
+ {
+ int chr= GET; // held as int so it can be compared with my_b_EOF before narrowing to char
+ if (chr == my_b_EOF)
+ return (eof= true); // set eof and report end of input to the caller
+ *to= chr; // not EOF: store the value as a char
+ return false;
+ }
public:
bool error,line_cuted,found_null,enclosed;
uchar *row_start, /* Found row starts here */
@@ -1706,33 +1714,76 @@ int READ_INFO::next_line()
return 0; // No lines
for (;;)
{
- int chr = GET;
-#ifdef USE_MB
- if (my_mbcharlen(read_charset, chr) > 1)
- {
- for (uint i=1;
- chr != my_b_EOF && i<my_mbcharlen(read_charset, chr);
- i++)
- chr = GET;
- if (chr == escape_char)
- continue;
- }
-#endif
- if (chr == my_b_EOF)
- {
- eof=1;
- return 1;
+ int chlen;
+ char buf[MY_CS_MBMAXLEN];
+
+ if (getbyte(&buf[0]))
+ return 1; // EOF
+
+ if (use_mb(read_charset) &&
+ (chlen= my_charlen(read_charset, buf, buf + 1)) != 1)
+ {
+ uint i;
+ for (i= 1; MY_CS_IS_TOOSMALL(chlen); )
+ {
+ DBUG_ASSERT(i < sizeof(buf));
+ DBUG_ASSERT(chlen != 1);
+ if (getbyte(&buf[i++]))
+ return 1; // EOF
+ chlen= my_charlen(read_charset, buf, buf + i);
+ }
+
+ /*
+ Either a complete multi-byte sequence,
+ or a broken byte sequence was found.
+ Check if the sequence is a prefix of the "LINES TERMINATED BY" string.
+ */
+ if ((uchar) buf[0] == line_term_char && i <= line_term_length &&
+ !memcmp(buf, line_term_ptr, i))
+ {
+ if (line_term_length == i)
+ {
+ /*
+ We found a "LINES TERMINATED BY" string that consists
+ of a single multi-byte character.
+ */
+ return 0;
+ }
+ /*
+ buf[] is a prefix of "LINES TERMINATED BY".
+ Now check the suffix. Length of the suffix of line_term_ptr
+ that still needs to be checked is (line_term_length - i).
+ Note, READ_INFO::terminator() assumes that the leftmost byte of the
+ argument is already scanned from the file and is checked to
+ be a known prefix (e.g. against line_term_char).
+ So we need to pass one extra byte.
+ */
+ if (terminator(line_term_ptr + i - 1, line_term_length - i + 1))
+ return 0;
+ }
+ /*
+ Here we have a good multi-byte sequence or a broken byte sequence,
+ and the sequence is not equal to "LINES TERMINATED BY".
+ No need to check for escape_char, because:
+ - multi-byte escape characters in "FIELDS ESCAPED BY" are not
+ supported and are rejected at parse time.
+ - broken single-byte sequences are not recognized as escapes,
+ they are considered to be a part of the data and are converted to
+ question marks.
+ */
+ line_cuted= true;
+ continue;
}
- if (chr == escape_char)
+ if (buf[0] == escape_char)
{
- line_cuted=1;
+ line_cuted= true;
if (GET == my_b_EOF)
- return 1;
+ return 1;
continue;
}
- if (chr == line_term_char && terminator(line_term_ptr,line_term_length))
+ if (buf[0] == line_term_char && terminator(line_term_ptr,line_term_length))
return 0;
- line_cuted=1;
+ line_cuted= true;
}
}