Bug#40814 CSV engine does not parse \X characters when they occur in unquoted fields

When a .CSV file for a table in the CSV engine contains \X characters as part of unquoted fields, e.g. 2,naraya\nan, the \n is not interpreted as a newline (it is, however, interpreted as a newline in a quoted field). The old algorithm copied the entire value for an unquoted field without parsing the \X characters. The new algorithm adds the capability to handle \X characters in the unquoted fields of a .CSV file. mysql-test/r/csv.result: Bug#40814 CSV engine does not parse \X characters when they occur in unquoted fields. Contains additional test output corresponding to the new tests added. mysql-test/t/csv.test: Bug#40814 CSV engine does not parse \X characters when they occur in unquoted fields. Contains additional tests for testing the behaviour of the CSV storage engine when the fields are not enclosed in quotes and contain \X characters. storage/csv/ha_tina.cc: Bug#40814 CSV engine does not parse \X characters when they occur in unquoted fields. Changes the parsing logic of the rows in a CSV file to parse \X characters that might be present in the unquoted fields.
author: V Narayanan <v.narayanan@sun.com> 2009-12-03 17:18:43 +0530
committer: V Narayanan <v.narayanan@sun.com> 2009-12-03 17:18:43 +0530
commit: a5aa3b3c919ab53cdcaa657a6446051348371245 (patch)
tree: 93acd035717949e4567c97f47e5ade78fe2f641c /storage/csv
parent: 40ec012c905be0262ba5c36bbccfa0db0105e31f (diff)
download: mariadb-git-a5aa3b3c919ab53cdcaa657a6446051348371245.tar.gz
1 files changed, 65 insertions, 8 deletions
diff --git a/storage/csv/ha_tina.cc b/storage/csv/ha_tina.cc
index 9cc0f1e607b..fac78986563 100644
--- a/storage/csv/ha_tina.cc
+++ b/storage/csv/ha_tina.cc
@@ -614,6 +614,33 @@ int ha_tina::find_current_row(uchar *buf)
 
   memset(buf, 0, table->s->null_bytes);
 
+  /*
+    Parse the line obtained using the following algorithm
+   
+    BEGIN
+      1) Store the EOL (end of line) for the current row
+      2) While not all the fields in the current query have been
+         filled
+         2.1) If the current character is a quote
+              2.1.1) While EOL has not been reached
+                     a) If end of current field is reached, move
+                        to next field and jump to step 2.3
+                     b) If the current character is a \\, handle
+                        \\n, \\r, \\, \\"
+                     c) else append the current character into the buffer
+                        before checking that EOL has not been reached.
+          2.2) If the current character is not a quote
+               2.2.1) While EOL has not been reached
+                      a) If the end of field has been reached move to the
+                         next field and jump to step 2.3
+                      b) If the current character is a \\, handle
+                         \\n, \\r, \\, \\"
+                      c) else append the current character into the buffer
+                         before checking that EOL has not been reached.
+          2.3) Store the current field value and jump to 2)
+    TERMINATE
+  */  
+
   for (Field **field=table->field ; *field ; field++)
   {
     char curr_char;
@@ -622,19 +649,23 @@ int ha_tina::find_current_row(uchar *buf)
     if (curr_offset >= end_offset)
       goto err;
     curr_char= file_buff->get_value(curr_offset);
+    /* Handle the case where the first character is a quote */
     if (curr_char == '"')
     {
-      curr_offset++; // Incrementpast the first quote
+      /* Increment past the first quote */
+      curr_offset++;
 
-      for(; curr_offset < end_offset; curr_offset++)
+      /* Loop through the row to extract the values for the current field */
+      for ( ; curr_offset < end_offset; curr_offset++)
       {
         curr_char= file_buff->get_value(curr_offset);
-        // Need to convert line feeds!
+        /* check for end of the current field */
         if (curr_char == '"' &&
             (curr_offset == end_offset - 1 ||
              file_buff->get_value(curr_offset + 1) == ','))
         {
-          curr_offset+= 2; // Move past the , and the "
+          /* Move past the , and the " */
+          curr_offset+= 2;
           break;
         }
         if (curr_char == '\\' && curr_offset != (end_offset - 1))
@@ -656,7 +687,7 @@ int ha_tina::find_current_row(uchar *buf)
         else // ordinary symbol
         {
           /*
-            We are at final symbol and no last quote was found =>
+            If we are at final symbol and no last quote was found =>
             we are working with a damaged file.
           */
           if (curr_offset == end_offset - 1)
@@ -667,15 +698,41 @@ int ha_tina::find_current_row(uchar *buf)
     }
     else 
     {
-      for(; curr_offset < end_offset; curr_offset++)
+      for ( ; curr_offset < end_offset; curr_offset++)
       {
         curr_char= file_buff->get_value(curr_offset);
+        /* Move past the ,*/
         if (curr_char == ',')
         {
-          curr_offset++;       // Skip the ,
+          curr_offset++;
           break;
         }
-        buffer.append(curr_char);
+        if (curr_char == '\\' && curr_offset != (end_offset - 1))
+        {
+          curr_offset++;
+          curr_char= file_buff->get_value(curr_offset);
+          if (curr_char == 'r')
+            buffer.append('\r');
+          else if (curr_char == 'n' )
+            buffer.append('\n');
+          else if (curr_char == '\\' || curr_char == '"')
+            buffer.append(curr_char);
+          else  /* This could only happen with an externally created file */
+          {
+            buffer.append('\\');
+            buffer.append(curr_char);
+          }
+        }
+        else
+        {
+          /*
+             If we are at the final symbol and it is a quote in an
+             unquoted field => we are working with a damaged field.
+          */
+          if (curr_offset == end_offset - 1 && curr_char == '"')
+            goto err;
+          buffer.append(curr_char);
+        }
       }
     }
author	V Narayanan <v.narayanan@sun.com>	2009-12-03 17:18:43 +0530
committer	V Narayanan <v.narayanan@sun.com>	2009-12-03 17:18:43 +0530
commit	a5aa3b3c919ab53cdcaa657a6446051348371245 (patch)
tree	93acd035717949e4567c97f47e5ade78fe2f641c /storage/csv
parent	40ec012c905be0262ba5c36bbccfa0db0105e31f (diff)
download	mariadb-git-a5aa3b3c919ab53cdcaa657a6446051348371245.tar.gz