diff options
author | Sergei Petrunia <psergey@askmonty.org> | 2016-04-07 00:54:39 +0300 |
---|---|---|
committer | Sergei Petrunia <psergey@askmonty.org> | 2016-04-07 00:54:39 +0300 |
commit | 59e5f5b47e1f12a1426319a905dbc8cc55219c0d (patch) | |
tree | b2095faf431949d31e2ea69200bea27a8cf2f629 /sql/sql_load.cc | |
parent | 306de8a927916db98c67fa338b5a275735f78240 (diff) | |
parent | 89b744eb6c2484412f476a53087cea7bf28dc917 (diff) | |
download | mariadb-git-59e5f5b47e1f12a1426319a905dbc8cc55219c0d.tar.gz |
Merge branch '10.2' into bb-10.2-mdev9543
- Make Window Functions errors use MariaDB's extra error range.
- Fix a trivial bug in check_error_mesg
Diffstat (limited to 'sql/sql_load.cc')
-rw-r--r-- | sql/sql_load.cc | 459 |
1 files changed, 256 insertions, 203 deletions
diff --git a/sql/sql_load.cc b/sql/sql_load.cc index 94b0fe72ac3..a4044dd0d59 100644 --- a/sql/sql_load.cc +++ b/sql/sql_load.cc @@ -61,24 +61,128 @@ XML_TAG::XML_TAG(int l, String f, String v) } +/* + Field and line terminators must be interpreted as sequence of unsigned char. + Otherwise, non-ascii terminators will be negative on some platforms, + and positive on others (depending on the implementation of char). +*/ +class Term_string +{ + const uchar *m_ptr; + uint m_length; + int m_initial_byte; +public: + Term_string(const String &str) : + m_ptr(static_cast<const uchar*>(static_cast<const void*>(str.ptr()))), + m_length(str.length()), + m_initial_byte((uchar) (str.length() ? str.ptr()[0] : INT_MAX)) + { } + void set(const uchar *str, uint length, int initial_byte) + { + m_ptr= str; + m_length= length; + m_initial_byte= initial_byte; + } + void reset() { set(NULL, 0, INT_MAX); } + const uchar *ptr() const { return m_ptr; } + uint length() const { return m_length; } + int initial_byte() const { return m_initial_byte; } + bool eq(const Term_string &other) const + { + return length() == other.length() && !memcmp(ptr(), other.ptr(), length()); + } +}; + + #define GET (stack_pos != stack ? 
*--stack_pos : my_b_get(&cache)) #define PUSH(A) *(stack_pos++)=(A) class READ_INFO { File file; - uchar *buffer, /* Buffer for read text */ - *end_of_buff; /* Data in bufferts ends here */ - uint buff_length, /* Length of buffert */ - max_length; /* Max length of row */ - const uchar *field_term_ptr,*line_term_ptr; - const char *line_start_ptr,*line_start_end; - uint field_term_length,line_term_length,enclosed_length; - int field_term_char,line_term_char,enclosed_char,escape_char; + String data; /* Read buffer */ + uint fixed_length; /* Length of the fixed length record */ + uint max_length; /* Max length of row */ + Term_string m_field_term; /* FIELDS TERMINATED BY 'string' */ + Term_string m_line_term; /* LINES TERMINATED BY 'string' */ + Term_string m_line_start; /* LINES STARTING BY 'string' */ + int enclosed_char,escape_char; int *stack,*stack_pos; bool found_end_of_line,start_of_line,eof; NET *io_net; int level; /* for load xml */ + bool getbyte(char *to) + { + int chr= GET; + if (chr == my_b_EOF) + return (eof= true); + *to= chr; + return false; + } + + /** + Read a tail of a multi-byte character. + The first byte of the character is assumed to be already + read from the file and appended to "str". + + @returns true - if EOF happened unexpectedly + @returns false - no EOF happened: found a good multi-byte character, + or a bad byte sequence + + Note: + The return value depends only on EOF: + - read_mbtail() returns "false" if a good character was read, but also + - read_mbtail() returns "false" if an incomplete byte sequence was found + and no EOF happened. + + For example, suppose we have an ujis file with bytes 0x8FA10A, where: + - 0x8FA1 is an incomplete prefix of a 3-byte character + (it should be [8F][A1-FE][A1-FE] to make a full 3-byte character) + - 0x0A is a line delimiter + This file has some broken data, the trailing [A1-FE] is missing. 
+ + In this example it works as follows: + - 0x8F is read from the file and put into "data" before the call + for read_mbtail() + - 0xA1 is read from the file and put into "data" by read_mbtail() + - 0x0A is kept in the read queue, so the next read iteration after + the current read_mbtail() call will normally find it and recognize as + a line delimiter + - the current call for read_mbtail() returns "false", + because no EOF happened + */ + bool read_mbtail(String *str) + { + int chlen; + if ((chlen= my_charlen(read_charset, str->end() - 1, str->end())) == 1) + return false; // Single byte character found + for (uint32 length0= str->length() - 1 ; MY_CS_IS_TOOSMALL(chlen); ) + { + int chr= GET; + if (chr == my_b_EOF) + { + DBUG_PRINT("info", ("read_mbtail: chlen=%d; unexpected EOF", chlen)); + return true; // EOF + } + str->append(chr); + chlen= my_charlen(read_charset, str->ptr() + length0, str->end()); + if (chlen == MY_CS_ILSEQ) + { + /** + It has been an incomplete (but a valid) sequence so far, + but the last byte turned it into a bad byte sequence. + Unget the very last byte. 
+ */ + str->length(str->length() - 1); + PUSH(chr); + DBUG_PRINT("info", ("read_mbtail: ILSEQ")); + return false; // Bad byte sequence + } + } + DBUG_PRINT("info", ("read_mbtail: chlen=%d", chlen)); + return false; // Good multi-byte character + } + public: bool error,line_cuted,found_null,enclosed; uchar *row_start, /* Found row starts here */ @@ -94,7 +198,11 @@ public: int read_fixed_length(void); int next_line(void); char unescape(char chr); - int terminator(const uchar *ptr, uint length); + bool terminator(const uchar *ptr, uint length); + bool terminator(const Term_string &str) + { return terminator(str.ptr(), str.length()); } + bool terminator(int chr, const Term_string &str) + { return str.initial_byte() == chr && terminator(str); } bool find_start_of_fields(); /* load xml */ List<XML_TAG> taglist; @@ -1344,63 +1452,40 @@ READ_INFO::READ_INFO(THD *thd, File file_par, uint tot_length, CHARSET_INFO *cs, String &field_term, String &line_start, String &line_term, String &enclosed_par, int escape, bool get_it_from_net, bool is_fifo) - :file(file_par), buffer(NULL), buff_length(tot_length), escape_char(escape), - found_end_of_line(false), eof(false), + :file(file_par), fixed_length(tot_length), + m_field_term(field_term), m_line_term(line_term), m_line_start(line_start), + escape_char(escape), found_end_of_line(false), eof(false), error(false), line_cuted(false), found_null(false), read_charset(cs) { + data.set_thread_specific(); /* Field and line terminators must be interpreted as sequence of unsigned char. Otherwise, non-ascii terminators will be negative on some platforms, and positive on others (depending on the implementation of char). 
*/ - field_term_ptr= - static_cast<const uchar*>(static_cast<const void*>(field_term.ptr())); - field_term_length= field_term.length(); - line_term_ptr= - static_cast<const uchar*>(static_cast<const void*>(line_term.ptr())); - line_term_length= line_term.length(); level= 0; /* for load xml */ - if (line_start.length() == 0) - { - line_start_ptr=0; - start_of_line= 0; - } - else - { - line_start_ptr= line_start.ptr(); - line_start_end=line_start_ptr+line_start.length(); - start_of_line= 1; - } + start_of_line= line_start.length() != 0; /* If field_terminator == line_terminator, don't use line_terminator */ - if (field_term_length == line_term_length && - !memcmp(field_term_ptr,line_term_ptr,field_term_length)) - { - line_term_length=0; - line_term_ptr= NULL; - } - enclosed_char= (enclosed_length=enclosed_par.length()) ? - (uchar) enclosed_par[0] : INT_MAX; - field_term_char= field_term_length ? field_term_ptr[0] : INT_MAX; - line_term_char= line_term_length ? line_term_ptr[0] : INT_MAX; + if (m_field_term.eq(m_line_term)) + m_line_term.reset(); + enclosed_char= enclosed_par.length() ? (uchar) enclosed_par[0] : INT_MAX; /* Set of a stack for unget if long terminators */ - uint length= MY_MAX(cs->mbmaxlen, MY_MAX(field_term_length, line_term_length)) + 1; + uint length= MY_MAX(cs->mbmaxlen, MY_MAX(m_field_term.length(), + m_line_term.length())) + 1; set_if_bigger(length,line_start.length()); stack= stack_pos= (int*) thd->alloc(sizeof(int) * length); - if (!(buffer=(uchar*) my_malloc(buff_length+1,MYF(MY_THREAD_SPECIFIC)))) + if (data.reserve(tot_length)) error=1; /* purecov: inspected */ else { - end_of_buff=buffer+buff_length; if (init_io_cache(&cache,(get_it_from_net) ? -1 : file, 0, (get_it_from_net) ? READ_NET : (is_fifo ? 
READ_FIFO : READ_CACHE),0L,1, MYF(MY_WME | MY_THREAD_SPECIFIC))) { - my_free(buffer); /* purecov: inspected */ - buffer= NULL; error=1; } else @@ -1423,7 +1508,6 @@ READ_INFO::READ_INFO(THD *thd, File file_par, uint tot_length, CHARSET_INFO *cs, READ_INFO::~READ_INFO() { ::end_io_cache(&cache); - my_free(buffer); List_iterator<XML_TAG> xmlit(taglist); XML_TAG *t; while ((t= xmlit++)) @@ -1431,7 +1515,7 @@ READ_INFO::~READ_INFO() } -inline int READ_INFO::terminator(const uchar *ptr,uint length) +inline bool READ_INFO::terminator(const uchar *ptr, uint length) { int chr=0; // Keep gcc happy uint i; @@ -1443,18 +1527,17 @@ inline int READ_INFO::terminator(const uchar *ptr,uint length) } } if (i == length) - return 1; + return true; PUSH(chr); while (i-- > 1) PUSH(*--ptr); - return 0; + return false; } int READ_INFO::read_field() { int chr,found_enclosed_char; - uchar *to,*new_buffer; found_null=0; if (found_end_of_line) @@ -1473,11 +1556,11 @@ int READ_INFO::read_field() found_end_of_line=eof=1; return 1; } - to=buffer; + data.length(0); if (chr == enclosed_char) { found_enclosed_char=enclosed_char; - *to++=(uchar) chr; // If error + data.append(chr); // If error } else { @@ -1487,7 +1570,8 @@ int READ_INFO::read_field() for (;;) { - while ( to < end_of_buff) + // Make sure we have enough space for the longest multi-byte character. 
+ while (data.length() + read_charset->mbmaxlen <= data.alloced_length()) { chr = GET; if (chr == my_b_EOF) @@ -1496,7 +1580,7 @@ int READ_INFO::read_field() { if ((chr=GET) == my_b_EOF) { - *to++= (uchar) escape_char; + data.append(escape_char); goto found_eof; } /* @@ -1508,24 +1592,24 @@ int READ_INFO::read_field() */ if (escape_char != enclosed_char || chr == escape_char) { - *to++ = (uchar) unescape((char) chr); + data.append(unescape((char) chr)); continue; } PUSH(chr); chr= escape_char; } #ifdef ALLOW_LINESEPARATOR_IN_STRINGS - if (chr == line_term_char) + if (chr == m_line_term.initial_byte()) #else - if (chr == line_term_char && found_enclosed_char == INT_MAX) + if (chr == m_line_term.initial_byte() && found_enclosed_char == INT_MAX) #endif { - if (terminator(line_term_ptr,line_term_length)) + if (terminator(m_line_term)) { // Maybe unexpected linefeed enclosed=0; found_end_of_line=1; - row_start=buffer; - row_end= to; + row_start= (uchar *) data.ptr(); + row_end= (uchar *) data.end(); return 0; } } @@ -1533,27 +1617,24 @@ int READ_INFO::read_field() { if ((chr=GET) == found_enclosed_char) { // Remove dupplicated - *to++ = (uchar) chr; + data.append(chr); continue; } // End of enclosed field if followed by field_term or line_term - if (chr == my_b_EOF || - (chr == line_term_char && terminator(line_term_ptr, - line_term_length))) + if (chr == my_b_EOF || terminator(chr, m_line_term)) { /* Maybe unexpected linefeed */ enclosed=1; found_end_of_line=1; - row_start=buffer+1; - row_end= to; + row_start= (uchar *) data.ptr() + 1; + row_end= (uchar *) data.end(); return 0; } - if (chr == field_term_char && - terminator(field_term_ptr,field_term_length)) + if (terminator(chr, m_field_term)) { enclosed=1; - row_start=buffer+1; - row_end= to; + row_start= (uchar *) data.ptr() + 1; + row_end= (uchar *) data.end(); return 0; } /* @@ -1564,68 +1645,33 @@ int READ_INFO::read_field() /* copy the found term character to 'to' */ chr= found_enclosed_char; } - else if (chr == 
field_term_char && found_enclosed_char == INT_MAX) + else if (chr == m_field_term.initial_byte() && + found_enclosed_char == INT_MAX) { - if (terminator(field_term_ptr,field_term_length)) + if (terminator(m_field_term)) { enclosed=0; - row_start=buffer; - row_end= to; + row_start= (uchar *) data.ptr(); + row_end= (uchar *) data.end(); return 0; } } -#ifdef USE_MB - if (my_mbcharlen(read_charset, chr) > 1 && - to + my_mbcharlen(read_charset, chr) <= end_of_buff) - { - uchar* p= to; - int ml, i; - *to++ = chr; - - ml= my_mbcharlen(read_charset, chr); - - for (i= 1; i < ml; i++) - { - chr= GET; - if (chr == my_b_EOF) - { - /* - Need to back up the bytes already ready from illformed - multi-byte char - */ - to-= i; - goto found_eof; - } - *to++ = chr; - } - if (my_ismbchar(read_charset, - (const char *)p, - (const char *)to)) - continue; - for (i= 0; i < ml; i++) - PUSH(*--to); - chr= GET; - } -#endif - *to++ = (uchar) chr; + data.append(chr); + if (use_mb(read_charset) && read_mbtail(&data)) + goto found_eof; } /* ** We come here if buffer is too small. 
Enlarge it and continue */ - if (!(new_buffer=(uchar*) my_realloc((char*) buffer,buff_length+1+IO_SIZE, - MYF(MY_WME | MY_THREAD_SPECIFIC)))) - return (error=1); - to=new_buffer + (to-buffer); - buffer=new_buffer; - buff_length+=IO_SIZE; - end_of_buff=buffer+buff_length; + if (data.reserve(IO_SIZE)) + return (error= 1); } found_eof: enclosed=0; found_end_of_line=eof=1; - row_start=buffer; - row_end=to; + row_start= (uchar *) data.ptr(); + row_end= (uchar *) data.end(); return 0; } @@ -1647,7 +1693,6 @@ found_eof: int READ_INFO::read_fixed_length() { int chr; - uchar *to; if (found_end_of_line) return 1; // One have to call next_line @@ -1658,8 +1703,7 @@ int READ_INFO::read_fixed_length() return 1; } - to=row_start=buffer; - while (to < end_of_buff) + for (data.length(0); data.length() < fixed_length ; ) { if ((chr=GET) == my_b_EOF) goto found_eof; @@ -1667,105 +1711,129 @@ int READ_INFO::read_fixed_length() { if ((chr=GET) == my_b_EOF) { - *to++= (uchar) escape_char; + data.append(escape_char); goto found_eof; } - *to++ =(uchar) unescape((char) chr); + data.append((uchar) unescape((char) chr)); continue; } - if (chr == line_term_char) - { - if (terminator(line_term_ptr,line_term_length)) - { // Maybe unexpected linefeed - found_end_of_line=1; - row_end= to; - return 0; - } + if (terminator(chr, m_line_term)) + { // Maybe unexpected linefeed + found_end_of_line= true; + break; } - *to++ = (uchar) chr; + data.append(chr); } - row_end=to; // Found full line + row_start= (uchar *) data.ptr(); + row_end= (uchar *) data.end(); // Found full line return 0; found_eof: found_end_of_line=eof=1; - row_start=buffer; - row_end=to; - return to == buffer ? 1 : 0; + row_start= (uchar *) data.ptr(); + row_end= (uchar *) data.end(); + return data.length() == 0 ? 
1 : 0; } int READ_INFO::next_line() { line_cuted=0; - start_of_line= line_start_ptr != 0; + start_of_line= m_line_start.length() != 0; if (found_end_of_line || eof) { found_end_of_line=0; return eof; } found_end_of_line=0; - if (!line_term_length) + if (!m_line_term.length()) return 0; // No lines for (;;) { - int chr = GET; -#ifdef USE_MB - if (my_mbcharlen(read_charset, chr) > 1) - { - for (uint i=1; - chr != my_b_EOF && i<my_mbcharlen(read_charset, chr); - i++) - chr = GET; - if (chr == escape_char) - continue; - } -#endif - if (chr == my_b_EOF) - { - eof=1; - return 1; + int chlen; + char buf[MY_CS_MBMAXLEN]; + + if (getbyte(&buf[0])) + return 1; // EOF + + if (use_mb(read_charset) && + (chlen= my_charlen(read_charset, buf, buf + 1)) != 1) + { + uint i; + for (i= 1; MY_CS_IS_TOOSMALL(chlen); ) + { + DBUG_ASSERT(i < sizeof(buf)); + DBUG_ASSERT(chlen != 1); + if (getbyte(&buf[i++])) + return 1; // EOF + chlen= my_charlen(read_charset, buf, buf + i); + } + + /* + Either a complete multi-byte sequence, + or a broken byte sequence was found. + Check if the sequence is a prefix of the "LINES TERMINATED BY" string. + */ + if ((uchar) buf[0] == m_line_term.initial_byte() && + i <= m_line_term.length() && + !memcmp(buf, m_line_term.ptr(), i)) + { + if (m_line_term.length() == i) + { + /* + We found a "LINES TERMINATED BY" string that consists + of a single multi-byte character. + */ + return 0; + } + /* + buf[] is a prefix of "LINES TERMINATED BY". + Now check the suffix. Length of the suffix of line_term_ptr + that still needs to be checked is (line_term_length - i). + Note, READ_INFO::terminator() assumes that the leftmost byte of the + argument is already scanned from the file and is checked to + be a known prefix (e.g. against line_term.initial_char()). + So we need to pass one extra byte. 
+ */ + if (terminator(m_line_term.ptr() + i - 1, + m_line_term.length() - i + 1)) + return 0; + } + /* + Here we have a good multi-byte sequence or a broken byte sequence, + and the sequence is not equal to "LINES TERMINATED BY". + No needs to check for escape_char, because: + - multi-byte escape characters in "FIELDS ESCAPED BY" are not + supported and are rejected at parse time. + - broken single-byte sequences are not recognized as escapes, + they are considered to be a part of the data and are converted to + question marks. + */ + line_cuted= true; + continue; } - if (chr == escape_char) + if (buf[0] == escape_char) { - line_cuted=1; + line_cuted= true; if (GET == my_b_EOF) - return 1; + return 1; continue; } - if (chr == line_term_char && terminator(line_term_ptr,line_term_length)) + if (terminator(buf[0], m_line_term)) return 0; - line_cuted=1; + line_cuted= true; } } bool READ_INFO::find_start_of_fields() { - int chr; - try_again: - do - { - if ((chr=GET) == my_b_EOF) - { - found_end_of_line=eof=1; - return 1; - } - } while ((char) chr != line_start_ptr[0]); - for (const char *ptr=line_start_ptr+1 ; ptr != line_start_end ; ptr++) + for (int chr= GET ; chr != my_b_EOF ; chr= GET) { - chr=GET; // Eof will be checked later - if ((char) chr != *ptr) - { // Can't be line_start - PUSH(chr); - while (--ptr != line_start_ptr) - { // Restart with next char - PUSH( *ptr); - } - goto try_again; - } + if (terminator(chr, m_line_start)) + return false; } - return 0; + return (found_end_of_line= eof= true); } @@ -1846,26 +1914,8 @@ int READ_INFO::read_value(int delim, String *val) int chr; String tmp; - for (chr= GET; my_tospace(chr) != delim && chr != my_b_EOF;) + for (chr= GET; my_tospace(chr) != delim && chr != my_b_EOF; chr= GET) { -#ifdef USE_MB - if (my_mbcharlen(read_charset, chr) > 1) - { - DBUG_PRINT("read_xml",("multi byte")); - int i, ml= my_mbcharlen(read_charset, chr); - for (i= 1; i < ml; i++) - { - val->append(chr); - /* - Don't use my_tospace() in the 
middle of a multi-byte character - TODO: check that the multi-byte sequence is valid. - */ - chr= GET; - if (chr == my_b_EOF) - return chr; - } - } -#endif if(chr == '&') { tmp.length(0); @@ -1885,8 +1935,11 @@ int READ_INFO::read_value(int delim, String *val) } } else + { val->append(chr); - chr= GET; + if (use_mb(read_charset) && read_mbtail(val)) + return my_b_EOF; + } } return my_tospace(chr); } @@ -1955,11 +2008,11 @@ int READ_INFO::read_xml(THD *thd) } // row tag should be in ROWS IDENTIFIED BY '<row>' - stored in line_term - if((tag.length() == line_term_length -2) && - (memcmp(tag.ptr(), line_term_ptr + 1, tag.length()) == 0)) + if((tag.length() == m_line_term.length() - 2) && + (memcmp(tag.ptr(), m_line_term.ptr() + 1, tag.length()) == 0)) { DBUG_PRINT("read_xml", ("start-of-row: %i %s %s", - level,tag.c_ptr_safe(), line_term_ptr)); + level,tag.c_ptr_safe(), m_line_term.ptr())); } if(chr == ' ' || chr == '>') @@ -2026,8 +2079,8 @@ int READ_INFO::read_xml(THD *thd) chr= my_tospace(GET); } - if((tag.length() == line_term_length -2) && - (memcmp(tag.ptr(), line_term_ptr + 1, tag.length()) == 0)) + if((tag.length() == m_line_term.length() - 2) && + (memcmp(tag.ptr(), m_line_term.ptr() + 1, tag.length()) == 0)) { DBUG_PRINT("read_xml", ("found end-of-row %i %s", level, tag.c_ptr_safe())); |