Bug#20854 XML functions: wrong result in ExtractValue

mysql-test/r/xml.result: - Adding test case - Fixing error message mysql-test/t/xml.test: Adding test case sql/item_xmlfunc.cc: For grammar rules with loops like: AdditiveExpr ::= MultiplicativeExpr ('+' MultiplicativeExpr)* If we scanned '+' and then met an error when parsing MultiplicativeExpr, then we should fully stop parsing - without trying to apply any other rules. Fix: add "error" member into MY_XPATH structure, and make my_xpath_parse_term() never return success once error is set. strings/xml.c: Adding my_xml_ctype map for flags, indicating whether a character is a space character, is a valid identifier start character, is a valid identifier body character. Using this map to properly scan identifiers. Also, using this map to scan spaces faster (instead of strchr).
author: unknown <bar@mysql.com/bar.intranet.mysql.r18.ru> 2006-09-14 11:47:19 +0500
committer: unknown <bar@mysql.com/bar.intranet.mysql.r18.ru> 2006-09-14 11:47:19 +0500
commit: 32ede45dd416e3ddbcf26ed0704cd86886f5bbd4 (patch)
tree: dc4747cdb58f41bf99ee390562be5094d0e63af3
parent: 9e89ea6fa8734c2538a416b29fc3120447f6b733 (diff)
download: mariadb-git-32ede45dd416e3ddbcf26ed0704cd86886f5bbd4.tar.gz
4 files changed, 152 insertions, 16 deletions
diff --git a/mysql-test/r/xml.result b/mysql-test/r/xml.result
index 780be01d7ce..efe7d14095d 100644
--- a/mysql-test/r/xml.result
+++ b/mysql-test/r/xml.result
@@ -570,7 +570,7 @@ select extractvalue('<a>a<b>B</b></a>','a|/b');
 extractvalue('<a>a<b>B</b></a>','a|/b')
 a
 select extractvalue('<a>A</a>','/<a>');
-ERROR HY000: XPATH syntax error: '<a>'
+ERROR HY000: XPATH error: comparison of two nodesets is not supported: '<a>'
 select extractvalue('<a><b>b</b><b!>b!</b!></a>','//b!');
 ERROR HY000: XPATH syntax error: '!'
 select extractvalue('<a>A<b>B<c>C</c></b></a>','/a/descendant::*');
@@ -710,3 +710,29 @@ Data
 select extractValue('<foo><foo.bar>Data</foo.bar><something>Otherdata</something></foo>','/foo/something');
 extractValue('<foo><foo.bar>Data</foo.bar><something>Otherdata</something></foo>','/foo/something')
 Otherdata
+select extractValue('<zot><tim0><01>10:39:15</01><02>140</02></tim0></zot>','/zot/tim0/02');
+ERROR HY000: XPATH syntax error: '02'
+select extractValue('<zot><tim0><01>10:39:15</01><02>140</02></tim0></zot>','//*');
+extractValue('<zot><tim0><01>10:39:15</01><02>140</02></tim0></zot>','//*')
+NULL
+Warnings:
+Warning	1512	Incorrect XML value: 'parse error at line 1 pos 13: unknown token unexpected (ident or '/' wanted)'
+select extractValue('<.>test</.>','//*');
+extractValue('<.>test</.>','//*')
+NULL
+Warnings:
+Warning	1512	Incorrect XML value: 'parse error at line 1 pos 2: unknown token unexpected (ident or '/' wanted)'
+select extractValue('<->test</->','//*');
+extractValue('<->test</->','//*')
+NULL
+Warnings:
+Warning	1512	Incorrect XML value: 'parse error at line 1 pos 2: unknown token unexpected (ident or '/' wanted)'
+select extractValue('<:>test</:>','//*');
+extractValue('<:>test</:>','//*')
+test
+select extractValue('<_>test</_>','//*');
+extractValue('<_>test</_>','//*')
+test
+select extractValue('<x.-_:>test</x.-_:>','//*');
+extractValue('<x.-_:>test</x.-_:>','//*')
+test
diff --git a/mysql-test/t/xml.test b/mysql-test/t/xml.test
index d510a61f04d..3347573b4b7 100644
--- a/mysql-test/t/xml.test
+++ b/mysql-test/t/xml.test
@@ -360,3 +360,19 @@ select extractValue('<ns:element xmlns:ns="myns">a</ns:element>','/ns:element/@x
 #
 select extractValue('<foo><foo.bar>Data</foo.bar><something>Otherdata</something></foo>','/foo/foo.bar');
 select extractValue('<foo><foo.bar>Data</foo.bar><something>Otherdata</something></foo>','/foo/something');
+
+#
+# Bug#20854 XML functions: wrong result in ExtractValue
+#
+--error 1105
+select extractValue('<zot><tim0><01>10:39:15</01><02>140</02></tim0></zot>','/zot/tim0/02');
+select extractValue('<zot><tim0><01>10:39:15</01><02>140</02></tim0></zot>','//*');
+# dot and dash are bad identifier start characters
+select extractValue('<.>test</.>','//*');
+select extractValue('<->test</->','//*');
+# colon is a good identifier start character
+select extractValue('<:>test</:>','//*');
+# underscore is a good identifier start character
+select extractValue('<_>test</_>','//*');
+# dot, dash, underscore and colon are good identifier middle characters
+select extractValue('<x.-_:>test</x.-_:>','//*');
diff --git a/sql/item_xmlfunc.cc b/sql/item_xmlfunc.cc
index dfa2d2a7325..44a2b690bac 100644
--- a/sql/item_xmlfunc.cc
+++ b/sql/item_xmlfunc.cc
@@ -105,6 +105,7 @@ typedef struct my_xpath_st
   String *context_cache; /* last context provider                     */
   String *pxml;          /* Parsed XML, an array of MY_XML_NODE       */
   CHARSET_INFO *cs;      /* character set/collation string comparison */
+  int error;
 } MY_XPATH;
 
 
@@ -913,7 +914,9 @@ static Item *eq_func_reverse(int oper, Item *a, Item *b)
   RETURN
     The newly created item.
 */
-static Item *create_comparator(MY_XPATH *xpath, int oper, Item *a, Item *b)
+static Item *create_comparator(MY_XPATH *xpath,
+                               int oper, MY_XPATH_LEX *context,
+                               Item *a, Item *b)
 {
   if (a->type() != Item::XPATH_NODESET &&
       b->type() != Item::XPATH_NODESET)
@@ -923,6 +926,13 @@ static Item *create_comparator(MY_XPATH *xpath, int oper, Item *a, Item *b)
   else if (a->type() == Item::XPATH_NODESET &&
            b->type() == Item::XPATH_NODESET)
   {
+    uint len= context->end - context->beg;
+    set_if_bigger(len, 32);
+    my_printf_error(ER_UNKNOWN_ERROR,
+                    "XPATH error: "
+                    "comparison of two nodesets is not supported: '%.*s'",
+                    MYF(0), len, context->beg);
+
     return 0; // TODO: Comparison of two nodesets
   }
   else
@@ -1430,7 +1440,7 @@ my_xpath_lex_scan(MY_XPATH *xpath,
 static int
 my_xpath_parse_term(MY_XPATH *xpath, int term)
 {
-  if (xpath->lasttok.term == term)
+  if (xpath->lasttok.term == term && !xpath->error)
   {
     xpath->prevtok= xpath->lasttok;
     my_xpath_lex_scan(xpath, &xpath->lasttok,
@@ -1558,8 +1568,9 @@ static int my_xpath_parse_AbsoluteLocationPath(MY_XPATH *xpath)
     return my_xpath_parse_RelativeLocationPath(xpath);
   }
 
-  return my_xpath_parse_term(xpath, MY_XPATH_LEX_EOF) ||
-         my_xpath_parse_RelativeLocationPath(xpath);
+  my_xpath_parse_RelativeLocationPath(xpath);
+ 
+  return (xpath->error == 0);
 }
 
 
@@ -1596,7 +1607,10 @@ static int my_xpath_parse_RelativeLocationPath(MY_XPATH *xpath)
                                                              "*", 1,
                                                              xpath->pxml, 1);
     if (!my_xpath_parse_Step(xpath))
+    {
+      xpath->error= 1;
       return 0;
+    }
   }
   return 1;
 }
@@ -1633,10 +1647,16 @@ my_xpath_parse_AxisSpecifier_NodeTest_opt_Predicate_list(MY_XPATH *xpath)
     xpath->context_cache= context_cache;
 
     if(!my_xpath_parse_PredicateExpr(xpath))
+    {
+      xpath->error= 1;
       return 0;
+    }
 
     if (!my_xpath_parse_term(xpath, MY_XPATH_LEX_RB))
+    {
+      xpath->error= 1;
       return 0;
+    }
 
     xpath->item= nodeset2bool(xpath, xpath->item);
 
@@ -1893,7 +1913,10 @@ static int my_xpath_parse_UnionExpr(MY_XPATH *xpath)
     
     if (!my_xpath_parse_PathExpr(xpath)
         || xpath->item->type() != Item::XPATH_NODESET)
+    {
+      xpath->error= 1;
       return 0;
+    }
     xpath->item= new Item_nodeset_func_union(prev, xpath->item, xpath->pxml);
   }
   return 1;
@@ -1929,6 +1952,7 @@ static int my_xpath_parse_PathExpr(MY_XPATH *xpath)
 {
   return my_xpath_parse_LocationPath(xpath) || 
          my_xpath_parse_FilterExpr_opt_slashes_RelativeLocationPath(xpath);
+         
 }
 
 
@@ -1975,7 +1999,10 @@ static int my_xpath_parse_OrExpr(MY_XPATH *xpath)
   {
     Item *prev= xpath->item;
     if (!my_xpath_parse_AndExpr(xpath))
+    {
       return 0;
+      xpath->error= 1;
+    }
     xpath->item= new Item_cond_or(nodeset2bool(xpath, prev),
                                   nodeset2bool(xpath, xpath->item));
   }
@@ -2003,7 +2030,10 @@ static int my_xpath_parse_AndExpr(MY_XPATH *xpath)
   {
     Item *prev= xpath->item;
     if (!my_xpath_parse_EqualityExpr(xpath))
+    {
+      xpath->error= 1;
       return 0;
+    }
 
     xpath->item= new Item_cond_and(nodeset2bool(xpath,prev), 
                                    nodeset2bool(xpath,xpath->item));
@@ -2057,17 +2087,26 @@ static int my_xpath_parse_EqualityOperator(MY_XPATH *xpath)
 }
 static int my_xpath_parse_EqualityExpr(MY_XPATH *xpath)
 {
+  MY_XPATH_LEX operator_context;
   if (!my_xpath_parse_RelationalExpr(xpath))
     return 0;
+
+  operator_context= xpath->lasttok;
   while (my_xpath_parse_EqualityOperator(xpath))
   {
     Item *prev= xpath->item;
     int oper= xpath->extra;
     if (!my_xpath_parse_RelationalExpr(xpath))
+    {
+      xpath->error= 1;
       return 0;
+    }
 
-    if (!(xpath->item= create_comparator(xpath, oper, prev, xpath->item)))
+    if (!(xpath->item= create_comparator(xpath, oper, &operator_context,
+                                         prev, xpath->item)))
       return 0;
+
+    operator_context= xpath->lasttok;
   }
   return 1;
 }
@@ -2109,18 +2148,25 @@ static int my_xpath_parse_RelationalOperator(MY_XPATH *xpath)
 }
 static int my_xpath_parse_RelationalExpr(MY_XPATH *xpath)
 {
+  MY_XPATH_LEX operator_context;
   if (!my_xpath_parse_AdditiveExpr(xpath))
     return 0;
+  operator_context= xpath->lasttok;
   while (my_xpath_parse_RelationalOperator(xpath))
   {
     Item *prev= xpath->item;
     int oper= xpath->extra;
 
     if (!my_xpath_parse_AdditiveExpr(xpath))
+    {
+      xpath->error= 1;
       return 0;
+    }
 
-    if (!(xpath->item= create_comparator(xpath, oper, prev, xpath->item)))
+    if (!(xpath->item= create_comparator(xpath, oper, &operator_context,
+                                         prev, xpath->item)))
       return 0;
+    operator_context= xpath->lasttok;
   }
   return 1;
 }
@@ -2153,7 +2199,10 @@ static int my_xpath_parse_AdditiveExpr(MY_XPATH *xpath)
     int oper= xpath->prevtok.term;
     Item *prev= xpath->item;
     if (!my_xpath_parse_MultiplicativeExpr(xpath))
+    {
+      xpath->error= 1;
       return 0;
+    }
 
     if (oper == MY_XPATH_LEX_PLUS)
       xpath->item= new Item_func_plus(prev, xpath->item);
@@ -2198,7 +2247,10 @@ static int my_xpath_parse_MultiplicativeExpr(MY_XPATH *xpath)
     int oper= xpath->prevtok.term;
     Item *prev= xpath->item;
     if (!my_xpath_parse_UnaryExpr(xpath))
+    {
+      xpath->error= 1;
       return 0;
+    }
     switch (oper)
     {
       case MY_XPATH_LEX_ASTERISK:
diff --git a/strings/xml.c b/strings/xml.c
index 51649dcb343..7f7c531d051 100644
--- a/strings/xml.c
+++ b/strings/xml.c
@@ -19,6 +19,7 @@
 #include "my_xml.h"
 
 
+#define MY_XML_UNKNOWN  'U'
 #define MY_XML_EOF	'E'
 #define MY_XML_STRING	'S'
 #define MY_XML_IDENT	'I'
@@ -39,6 +40,46 @@ typedef struct xml_attr_st
 } MY_XML_ATTR;
 
 
+/*
+  XML ctype:
+*/
+#define	MY_XML_ID0  0x01 /* Identifier initial character */
+#define	MY_XML_ID1  0x02 /* Identifier medial  character */
+#define	MY_XML_SPC  0x08 /* Spacing character */
+
+
+/*
+ http://www.w3.org/TR/REC-xml/ 
+ [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' |
+                  CombiningChar | Extender
+ [5] Name ::= (Letter | '_' | ':') (NameChar)*
+*/
+
+static char my_xml_ctype[256]=
+{
+/*00*/  0,0,0,0,0,0,0,0,0,8,8,0,0,8,0,0,
+/*10*/  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+/*20*/  8,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0,  /*  !"#$%&'()*+,-./ */
+/*30*/  2,2,2,2,2,2,2,2,2,2,3,0,0,0,0,0,  /* 0123456789:;<=>? */
+/*40*/  0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,  /* @ABCDEFGHIJKLMNO */
+/*50*/  3,3,3,3,3,3,3,3,3,3,3,0,0,0,0,3,  /* PQRSTUVWXYZ[\]^_ */
+/*60*/  0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,  /* `abcdefghijklmno */
+/*70*/  3,3,3,3,3,3,3,3,3,3,3,0,0,0,0,0,  /* pqrstuvwxyz{|}~  */
+/*80*/  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+/*90*/  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+/*A0*/  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+/*B0*/  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+/*C0*/  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+/*D0*/  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+/*E0*/  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+/*F0*/  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
+};
+
+#define my_xml_is_space(c)  (my_xml_ctype[(uchar) (c)] & MY_XML_SPC)
+#define my_xml_is_id0(c)    (my_xml_ctype[(uchar) (c)] & MY_XML_ID0)
+#define my_xml_is_id1(c)    (my_xml_ctype[(uchar) (c)] & MY_XML_ID1)
+
+
 static const char *lex2str(int lex)
 {
   switch(lex)
@@ -56,13 +97,13 @@ static const char *lex2str(int lex)
     case MY_XML_QUESTION: return "'?'";
     case MY_XML_EXCLAM:   return "'!'";
   }
-  return "UNKNOWN";
+  return "unknown token";
 }
 
 static void my_xml_norm_text(MY_XML_ATTR *a)
 {
-  for ( ; (a->beg < a->end) && strchr(" \t\r\n",a->beg[0]) ; a->beg++ );
-  for ( ; (a->beg < a->end) && strchr(" \t\r\n",a->end[-1]) ; a->end-- );
+  for ( ; (a->beg < a->end) && my_xml_is_space(a->beg[0]) ; a->beg++ );
+  for ( ; (a->beg < a->end) && my_xml_is_space(a->end[-1]) ; a->end-- );
 }
 
 
@@ -70,7 +111,7 @@ static int my_xml_scan(MY_XML_PARSER *p,MY_XML_ATTR *a)
 {
   int lex;
   
-  for(  ; ( p->cur < p->end) && strchr(" \t\r\n",p->cur[0]) ;  p->cur++);
+  for(  ; ( p->cur < p->end) && my_xml_is_space(p->cur[0]) ;  p->cur++);
   
   if (p->cur >= p->end)
   {
@@ -124,16 +165,17 @@ static int my_xml_scan(MY_XML_PARSER *p,MY_XML_ATTR *a)
       my_xml_norm_text(a);
     lex=MY_XML_STRING;
   }
-  else
+  else if (my_xml_is_id0(p->cur[0]))
   {
-    for(;
-	(p->cur < p->end) && !strchr("?'\"=/<> \t\r\n", p->cur[0]);
-	p->cur++)
-    {}
+    p->cur++;
+    while (p->cur < p->end && my_xml_is_id1(p->cur[0]))
+      p->cur++;
     a->end=p->cur;
     my_xml_norm_text(a);
     lex=MY_XML_IDENT;
   }
+  else
+    lex= MY_XML_UNKNOWN;
 
 #if 0
   printf("LEX=%s[%d]\n",lex2str(lex),a->end-a->beg);
author	unknown <bar@mysql.com/bar.intranet.mysql.r18.ru>	2006-09-14 11:47:19 +0500
committer	unknown <bar@mysql.com/bar.intranet.mysql.r18.ru>	2006-09-14 11:47:19 +0500
commit	32ede45dd416e3ddbcf26ed0704cd86886f5bbd4 (patch)
tree	dc4747cdb58f41bf99ee390562be5094d0e63af3
parent	9e89ea6fa8734c2538a416b29fc3120447f6b733 (diff)
download	mariadb-git-32ede45dd416e3ddbcf26ed0704cd86886f5bbd4.tar.gz