Merge branch '10.0' into 10.1

author: Sergei Golubchik <serg@mariadb.org> 2016-12-11 09:53:42 +0100
committer: Sergei Golubchik <serg@mariadb.org> 2016-12-11 09:53:42 +0100
commit: 2f20d297f8ea731d845bb220e680ad10c7a927bc (patch)
tree: 9bd18ef1ab766422ba4c51b4ab189e259955a2d0 /sql/sql_load.cc
parent: a629b5172e96c96c414fca70fffd64c80f2f7e8f (diff)
parent: eb4f2e063c341d9f3644339c68cb01679e782001 (diff)
download: mariadb-git-2f20d297f8ea731d845bb220e680ad10c7a927bc.tar.gz
1 files changed, 146 insertions, 46 deletions
diff --git a/sql/sql_load.cc b/sql/sql_load.cc
index c2c97a37633..630c1e0d21e 100644
--- a/sql/sql_load.cc
+++ b/sql/sql_load.cc
@@ -79,6 +79,81 @@ class READ_INFO {
   NET *io_net;
   int level; /* for load xml */
 
+
+#if MYSQL_VERSION_ID >= 100200
+#error This 10.0 and 10.1 specific fix should be removed in 10.2.
+#error Fix read_mbtail() to use my_charlen() instead of my_charlen_tmp()
+#else
+  int my_charlen_tmp(CHARSET_INFO *cs, const char *str, const char *end)
+  { // Temporary 10.0/10.1 helper: returns what cs->cset->mb_wc() reports for [str, end)
+    my_wc_t wc; // Receives the decoded character; unused, only the return code matters
+    return cs->cset->mb_wc(cs, &wc, (const uchar *) str, (const uchar *) end);
+  }
+
+  /**
+    Read a tail of a multi-byte character.
+    The first byte of the character is assumed to be already
+    read from the file and appended to "str".
+
+    @returns  true  - if EOF happened unexpectedly
+    @returns  false - no EOF happened: found a good multi-byte character,
+                                       or a bad byte sequence
+
+    Note:
+    The return value depends only on EOF:
+    - read_mbtail() returns "false" if a good character was read, but also
+    - read_mbtail() returns "false" if an incomplete byte sequence was found
+      and no EOF happened.
+
+    For example, suppose we have an ujis file with bytes 0x8FA10A, where:
+    - 0x8FA1 is an incomplete prefix of a 3-byte character
+      (it should be [8F][A1-FE][A1-FE] to make a full 3-byte character)
+    - 0x0A is a line delimiter
+    This file has some broken data, the trailing [A1-FE] is missing.
+
+    In this example it works as follows:
+    - 0x8F is read from the file and put into "str" before the call
+      for read_mbtail()
+    - 0xA1 is read from the file and put into "str" by read_mbtail()
+    - 0x0A is kept in the read queue, so the next read iteration after
+      the current read_mbtail() call will normally find it and recognize it as
+      a line delimiter
+    - the current call for read_mbtail() returns "false",
+      because no EOF happened
+  */
+  bool read_mbtail(String *str)
+  {
+    int chlen; // Result of the most recent my_charlen_tmp() probe
+    if ((chlen= my_charlen_tmp(read_charset, str->end() - 1, str->end())) == 1)
+      return false; // Single byte character found
+    for (uint32 length0= str->length() - 1 ; MY_CS_IS_TOOSMALL(chlen); )
+    { // length0 is the offset of the character's first byte inside "str"
+      int chr= GET; // Next input byte, or my_b_EOF
+      if (chr == my_b_EOF)
+      {
+        DBUG_PRINT("info", ("read_mbtail: chlen=%d; unexpected EOF", chlen));
+        return true; // EOF
+      }
+      str->append(chr); // Tentatively accept the byte, then re-probe from the first byte
+      chlen= my_charlen_tmp(read_charset, str->ptr() + length0, str->end());
+      if (chlen == MY_CS_ILSEQ)
+      {
+        /**
+          It has been an incomplete (but a valid) sequence so far,
+          but the last byte turned it into a bad byte sequence.
+          Unget the very last byte.
+        */
+        str->length(str->length() - 1);
+        PUSH(chr); // Give the byte back so the caller's next read sees it
+        DBUG_PRINT("info", ("read_mbtail: ILSEQ"));
+        return false; // Bad byte sequence
+      }
+    }
+    DBUG_PRINT("info", ("read_mbtail: chlen=%d", chlen));
+    return false; // No EOF; chlen is no longer "too small" (normally a good multi-byte character)
+  }
+#endif
+
 public:
   bool error,line_cuted,found_null,enclosed;
   uchar	*row_start,			/* Found row starts here */
@@ -514,7 +589,8 @@ int mysql_load(THD *thd,sql_exchange *ex,TABLE_LIST *table_list,
 			    *enclosed, skip_lines, ignore);
 
     thd_proc_info(thd, "End bulk insert");
-    thd_progress_next_stage(thd);
+    if (!error)
+      thd_progress_next_stage(thd);
     if (thd->locked_tables_mode <= LTM_LOCK_TABLES &&
         table->file->ha_end_bulk_insert() && !error)
     {
@@ -1449,6 +1525,54 @@ inline int READ_INFO::terminator(const uchar *ptr,uint length)
 }
 
 
+/**
+  Read a field.
+
+  The data in the loaded file was presumably escaped using
+  - either select_export::send_data() OUTFILE
+  - or mysql_real_escape_string()
+  using the same character set as the one specified in the current
+  "LOAD DATA INFILE ... CHARACTER SET ..." (or the default LOAD character set).
+
+  Note, non-escaped multi-byte characters are scanned as a single entity.
+  This is needed to correctly distinguish between:
+  - 0x5C as an escape character versus
+  - 0x5C as the second byte in a multi-byte sequence (big5, cp932, gbk, sjis)
+
+  Parts of escaped multi-byte characters are scanned on different loop
+  iterations. See the comment about 0x5C handling in select_export::send_data()
+  in sql_class.cc.
+
+  READ_INFO::read_field() does not check wellformedness.
+  Raising wellformedness errors or warnings in READ_INFO::read_field()
+  would be wrong, as the data after unescaping can go into a BLOB field,
+  or into a TEXT/VARCHAR field of a different character set.
+  The loop below only makes sure to revert escaping made by
+  select_export::send_data() or mysql_real_escape_string().
+  Wellformedness is checked later, during Field::store(str,length,cs) time.
+
+  Note, in some cases users can supply data which did not go through
+  escaping properly. For example, utf8 "\<C3><A4>"
+  (backslash followed by LATIN SMALL LETTER A WITH DIAERESIS)
+  is improperly escaped data that could not be generated by
+  select_export::send_data() / mysql_real_escape_string():
+  - either there should be two backslashes:   "\\<C3><A4>"
+  - or there should be no backslashes at all: "<C3><A4>"
+  "\<C3>" and "<A4>" are scanned on two different loop iterations and
+  store "<C3><A4>" into the field.
+
+  Note, adding useless escapes before multi-byte characters like in the
+  example above is safe in case of utf8, but is not safe in case of
+  character sets that have escape_with_backslash_is_dangerous==TRUE,
+  such as big5, cp932, gbk, sjis. This can lead to mis-interpretation of the
+  data. Suppose we have a big5 character "<EE><5C>" followed by <30> (digit 0).
+  If we add an extra escape before this sequence, then we'll get
+  <5C><EE><5C><30>. The first loop iteration will turn <5C><EE> into <EE>.
+  The second loop iteration will turn <5C><30> into <30>.
+  So the program that generates a dump file for further use with LOAD DATA
+  must make sure to use escapes properly.
+*/
+
 int READ_INFO::read_field()
 {
   int chr,found_enclosed_char;
@@ -1485,7 +1609,8 @@ int READ_INFO::read_field()
 
   for (;;)
   {
-    while ( to < end_of_buff)
+    // Make sure we have enough space for the longest multi-byte character.
+    while ( to + read_charset->mbmaxlen < end_of_buff)
     {
       chr = GET;
       if (chr == my_b_EOF)
@@ -1573,52 +1698,27 @@ int READ_INFO::read_field()
 	}
       }
 #ifdef USE_MB
-        uint ml= my_mbcharlen(read_charset, chr);
-        if (ml == 0)
-        {
-          *to= '\0';
-          my_error(ER_INVALID_CHARACTER_STRING, MYF(0),
-                   read_charset->csname, buffer);
-          error= true;
-          return 1;
-        }
-
-        if (ml > 1 &&
-            to + ml <= end_of_buff)
-        {
-          uchar* p= to;
-          *to++ = chr;
-
-          for (uint i= 1; i < ml; i++)
-          {
-            chr= GET;
-            if (chr == my_b_EOF)
-            {
-              /*
-                Need to back up the bytes already ready from illformed
-                multi-byte char 
-              */
-              to-= i;
-              goto found_eof;
-            }
-            *to++ = chr;
-          }
-          if (my_ismbchar(read_charset,
-                        (const char *)p,
-                        (const char *)to))
-            continue;
-          for (uint i= 0; i < ml; i++)
-            PUSH(*--to);
-          chr= GET;
-        }
-        else if (ml > 1)
-        {
-          // Buffer is too small, exit while loop, and reallocate.
-          PUSH(chr);
-          break;
-        }
 #endif
       *to++ = (uchar) chr;
+#if MYSQL_VERSION_ID >= 100200
+#error This 10.0 and 10.1 specific fix should be removed in 10.2
+#else
+      if (my_mbcharlen(read_charset, (uchar) chr) > 1)
+      {
+        /*
+          A known MBHEAD found. Try to scan the full multi-byte character.
+          Otherwise, a possible following second byte 0x5C would be
+          mis-interpreted as an escape on the next iteration.
+          (Important for big5, gbk, sjis, cp932).
+        */
+        String tmp((char *) to - 1, read_charset->mbmaxlen, read_charset);
+        tmp.length(1);
+        bool eof= read_mbtail(&tmp);
+        to+= tmp.length() - 1;
+        if (eof)
+          goto found_eof;
+      }
+#endif
     }
     /*
     ** We come here if buffer is too small. Enlarge it and continue
author	Sergei Golubchik <serg@mariadb.org>	2016-12-11 09:53:42 +0100
committer	Sergei Golubchik <serg@mariadb.org>	2016-12-11 09:53:42 +0100
commit	2f20d297f8ea731d845bb220e680ad10c7a927bc (patch)
tree	9bd18ef1ab766422ba4c51b4ab189e259955a2d0 /sql/sql_load.cc
parent	a629b5172e96c96c414fca70fffd64c80f2f7e8f (diff)
parent	eb4f2e063c341d9f3644339c68cb01679e782001 (diff)
download	mariadb-git-2f20d297f8ea731d845bb220e680ad10c7a927bc.tar.gz