summary refs log tree commit diff
path: root/strings/xml.c
diff options
context:
space:
mode:
Diffstat (limited to 'strings/xml.c')
-rw-r--r-- strings/xml.c 228
1 files changed, 170 insertions, 58 deletions
diff --git a/strings/xml.c b/strings/xml.c
index 986bd0e157f..3ad955bbabd 100644
--- a/strings/xml.c
+++ b/strings/xml.c
@@ -1,5 +1,4 @@
-/* Copyright (c) 2003-2006 MySQL AB, 2009 Sun Microsystems, Inc.
- Use is subject to license terms.
+/* Copyright (c) 2003, 2011, Oracle and/or its affiliates. All rights reserved.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -12,13 +11,15 @@
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
- Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+*/
#include "my_global.h"
#include "m_string.h"
#include "my_xml.h"
+#define MY_XML_UNKNOWN 'U'
#define MY_XML_EOF 'E'
#define MY_XML_STRING 'S'
#define MY_XML_IDENT 'I'
@@ -30,6 +31,7 @@
#define MY_XML_TEXT 'T'
#define MY_XML_QUESTION '?'
#define MY_XML_EXCLAM '!'
+#define MY_XML_CDATA 'D'
typedef struct xml_attr_st
{
@@ -38,13 +40,54 @@ typedef struct xml_attr_st
} MY_XML_ATTR;
+/*
+ XML ctype:
+*/
+#define MY_XML_ID0 0x01 /* Identifier initial character */
+#define MY_XML_ID1 0x02 /* Identifier medial character */
+#define MY_XML_SPC 0x08 /* Spacing character */
+
+
+/*
+ http://www.w3.org/TR/REC-xml/
+ [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' |
+ CombiningChar | Extender
+ [5] Name ::= (Letter | '_' | ':') (NameChar)*
+*/
+
+static char my_xml_ctype[256]=
+{
+/*00*/ 0,0,0,0,0,0,0,0,0,8,8,0,0,8,0,0,
+/*10*/ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+/*20*/ 8,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0, /* !"#$%&'()*+,-./ */
+/*30*/ 2,2,2,2,2,2,2,2,2,2,3,0,0,0,0,0, /* 0123456789:;<=>? */
+/*40*/ 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* @ABCDEFGHIJKLMNO */
+/*50*/ 3,3,3,3,3,3,3,3,3,3,3,0,0,0,0,3, /* PQRSTUVWXYZ[\]^_ */
+/*60*/ 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* `abcdefghijklmno */
+/*70*/ 3,3,3,3,3,3,3,3,3,3,3,0,0,0,0,0, /* pqrstuvwxyz{|}~ */
+/*80*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+/*90*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+/*A0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+/*B0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+/*C0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+/*D0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+/*E0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+/*F0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
+};
+
+#define my_xml_is_space(c) (my_xml_ctype[(uchar) (c)] & MY_XML_SPC)
+#define my_xml_is_id0(c) (my_xml_ctype[(uchar) (c)] & MY_XML_ID0)
+#define my_xml_is_id1(c) (my_xml_ctype[(uchar) (c)] & MY_XML_ID1)
+
+
static const char *lex2str(int lex)
{
switch(lex)
{
- case MY_XML_EOF: return "EOF";
+ case MY_XML_EOF: return "END-OF-INPUT";
case MY_XML_STRING: return "STRING";
case MY_XML_IDENT: return "IDENT";
+ case MY_XML_CDATA: return "CDATA";
case MY_XML_EQ: return "'='";
case MY_XML_LT: return "'<'";
case MY_XML_GT: return "'>'";
@@ -54,13 +97,20 @@ static const char *lex2str(int lex)
case MY_XML_QUESTION: return "'?'";
case MY_XML_EXCLAM: return "'!'";
}
- return "UNKNOWN";
+ return "unknown token";
}
static void my_xml_norm_text(MY_XML_ATTR *a)
{
- for ( ; (a->beg < a->end) && strchr(" \t\r\n",a->beg[0]) ; a->beg++ );
- for ( ; (a->beg < a->end) && strchr(" \t\r\n",a->end[-1]) ; a->end-- );
+ for ( ; (a->beg < a->end) && my_xml_is_space(a->beg[0]) ; a->beg++ );
+ for ( ; (a->beg < a->end) && my_xml_is_space(a->end[-1]) ; a->end-- );
+}
+
+
+static inline my_bool
+my_xml_parser_prefix_cmp(MY_XML_PARSER *p, const char *s, size_t slen)
+{
+ return (p->cur + slen > p->end) || memcmp(p->cur, s, slen);
}
@@ -68,7 +118,7 @@ static int my_xml_scan(MY_XML_PARSER *p,MY_XML_ATTR *a)
{
int lex;
- for( ; ( p->cur < p->end) && strchr(" \t\r\n",p->cur[0]) ; p->cur++);
+ for (; ( p->cur < p->end) && my_xml_is_space(p->cur[0]) ; p->cur++);
if (p->cur >= p->end)
{
@@ -81,15 +131,33 @@ static int my_xml_scan(MY_XML_PARSER *p,MY_XML_ATTR *a)
a->beg=p->cur;
a->end=p->cur;
- if ((p->end - p->cur > 3) && !bcmp(p->cur,"<!--",4))
+ if (!my_xml_parser_prefix_cmp(p, C_STRING_WITH_LEN("<!--")))
{
- for( ; (p->cur < p->end) && bcmp(p->cur, "-->", 3); p->cur++)
- {}
- if (!bcmp(p->cur, "-->", 3))
- p->cur+=3;
+ for (; p->cur < p->end; p->cur++)
+ {
+ if (!my_xml_parser_prefix_cmp(p, C_STRING_WITH_LEN("-->")))
+ {
+ p->cur+= 3;
+ break;
+ }
+ }
a->end=p->cur;
lex=MY_XML_COMMENT;
}
+ else if (!my_xml_parser_prefix_cmp(p, C_STRING_WITH_LEN("<![CDATA[")))
+ {
+ p->cur+= 9;
+ for (; p->cur < p->end - 2 ; p->cur++)
+ {
+ if (p->cur[0] == ']' && p->cur[1] == ']' && p->cur[2] == '>')
+ {
+ p->cur+= 3;
+ a->end= p->cur;
+ break;
+ }
+ }
+ lex= MY_XML_CDATA;
+ }
else if (strchr("?=/<>!",p->cur[0]))
{
p->cur++;
@@ -98,25 +166,32 @@ static int my_xml_scan(MY_XML_PARSER *p,MY_XML_ATTR *a)
}
else if ( (p->cur[0] == '"') || (p->cur[0] == '\'') )
{
+ /*
+ "string" or 'string' found.
+ Scan until the closing quote/doublequote, or until the END-OF-INPUT.
+ */
p->cur++;
- for( ; ( p->cur < p->end ) && (p->cur[0] != a->beg[0]); p->cur++)
+ for (; ( p->cur < p->end ) && (p->cur[0] != a->beg[0]); p->cur++)
{}
a->end=p->cur;
- if (a->beg[0] == p->cur[0])p->cur++;
+ if (p->cur < p->end) /* Closing quote or doublequote has been found */
+ p->cur++;
a->beg++;
- my_xml_norm_text(a);
+ if (!(p->flags & MY_XML_FLAG_SKIP_TEXT_NORMALIZATION))
+ my_xml_norm_text(a);
lex=MY_XML_STRING;
}
- else
+ else if (my_xml_is_id0(p->cur[0]))
{
- for(;
- (p->cur < p->end) && !strchr("?'\"=/<> \t\r\n", p->cur[0]);
- p->cur++)
- {}
+ p->cur++;
+ while (p->cur < p->end && my_xml_is_id1(p->cur[0]))
+ p->cur++;
a->end=p->cur;
my_xml_norm_text(a);
lex=MY_XML_IDENT;
}
+ else
+ lex= MY_XML_UNKNOWN;
#if 0
printf("LEX=%s[%d]\n",lex2str(lex),a->end-a->beg);
@@ -127,32 +202,35 @@ ret:
}
-static int my_xml_value(MY_XML_PARSER *st, const char *str, uint len)
+static int my_xml_value(MY_XML_PARSER *st, const char *str, size_t len)
{
return (st->value) ? (st->value)(st,str,len) : MY_XML_OK;
}
-static int my_xml_enter(MY_XML_PARSER *st, const char *str, uint len)
+static int my_xml_enter(MY_XML_PARSER *st, const char *str, size_t len)
{
- if ((uint) (st->attrend-st->attr+len+1) > sizeof(st->attr))
+ if ((size_t) (st->attrend-st->attr+len+1) > sizeof(st->attr))
{
sprintf(st->errstr,"To deep XML");
return MY_XML_ERROR;
}
if (st->attrend > st->attr)
{
- st->attrend[0]='.';
+ st->attrend[0]= '/';
st->attrend++;
}
memcpy(st->attrend,str,len);
st->attrend+=len;
st->attrend[0]='\0';
- return st->enter ? st->enter(st,st->attr, (uint) (st->attrend - st->attr)) : MY_XML_OK;
+ if (st->flags & MY_XML_FLAG_RELATIVE_NAMES)
+ return st->enter ? st->enter(st, str, len) : MY_XML_OK;
+ else
+ return st->enter ? st->enter(st,st->attr,st->attrend-st->attr) : MY_XML_OK;
}
-static void mstr(char *s,const char *src,uint l1, uint l2)
+static void mstr(char *s,const char *src,size_t l1, size_t l2)
{
l1 = l1<l2 ? l1 : l2;
memcpy(s,src,l1);
@@ -160,27 +238,36 @@ static void mstr(char *s,const char *src,uint l1, uint l2)
}
-static int my_xml_leave(MY_XML_PARSER *p, const char *str, uint slen)
+static int my_xml_leave(MY_XML_PARSER *p, const char *str, size_t slen)
{
char *e;
- uint glen;
+ size_t glen;
char s[32];
char g[32];
int rc;
-
- /* Find previous '.' or beginning */
- for( e=p->attrend; (e>p->attr) && (e[0] != '.') ; e--);
- glen = (uint) ((e[0] == '.') ? (p->attrend-e-1) : p->attrend-e);
+
+ /* Find previous '/' or beginning */
+ for (e=p->attrend; (e>p->attr) && (e[0] != '/') ; e--);
+ glen = (size_t) ((e[0] == '/') ? (p->attrend-e-1) : p->attrend-e);
if (str && (slen != glen))
{
mstr(s,str,sizeof(s)-1,slen);
- mstr(g,e+1,sizeof(g)-1,glen),
- sprintf(p->errstr,"'</%s>' unexpected ('</%s>' wanted)",s,g);
+ if (glen)
+ {
+ mstr(g,e+1,sizeof(g)-1,glen),
+ sprintf(p->errstr,"'</%s>' unexpected ('</%s>' wanted)",s,g);
+ }
+ else
+ sprintf(p->errstr,"'</%s>' unexpected (END-OF-INPUT wanted)", s);
return MY_XML_ERROR;
}
- rc = p->leave_xml ? p->leave_xml(p,p->attr, (uint) (p->attrend - p->attr)) : MY_XML_OK;
+ if (p->flags & MY_XML_FLAG_RELATIVE_NAMES)
+ rc= p->leave_xml ? p->leave_xml(p, str, slen) : MY_XML_OK;
+ else
+ rc= (p->leave_xml ? p->leave_xml(p,p->attr,p->attrend-p->attr) :
+ MY_XML_OK);
*e='\0';
p->attrend=e;
@@ -189,7 +276,7 @@ static int my_xml_leave(MY_XML_PARSER *p, const char *str, uint slen)
}
-int my_xml_parse(MY_XML_PARSER *p,const char *str, uint len)
+int my_xml_parse(MY_XML_PARSER *p,const char *str, size_t len)
{
p->attrend=p->attr;
p->beg=str;
@@ -208,7 +295,13 @@ int my_xml_parse(MY_XML_PARSER *p,const char *str, uint len)
lex=my_xml_scan(p,&a);
if (MY_XML_COMMENT == lex)
+ continue;
+
+ if (lex == MY_XML_CDATA)
{
+ a.beg+= 9;
+ a.end-= 3;
+ my_xml_value(p, a.beg, (size_t) (a.end-a.beg));
continue;
}
@@ -218,10 +311,10 @@ int my_xml_parse(MY_XML_PARSER *p,const char *str, uint len)
{
if (MY_XML_IDENT != (lex=my_xml_scan(p,&a)))
{
- sprintf(p->errstr,"1: %s unexpected (ident wanted)",lex2str(lex));
+ sprintf(p->errstr,"%s unexpected (ident wanted)",lex2str(lex));
return MY_XML_ERROR;
}
- if (MY_XML_OK != my_xml_leave(p,a.beg,(uint) (a.end-a.beg)))
+ if (MY_XML_OK != my_xml_leave(p,a.beg,(size_t) (a.end-a.beg)))
return MY_XML_ERROR;
lex=my_xml_scan(p,&a);
goto gt;
@@ -240,18 +333,19 @@ int my_xml_parse(MY_XML_PARSER *p,const char *str, uint len)
if (MY_XML_IDENT == lex)
{
- if (MY_XML_OK != my_xml_enter(p,a.beg,(uint) (a.end-a.beg)))
+ p->current_node_type= MY_XML_NODE_TAG;
+ if (MY_XML_OK != my_xml_enter(p,a.beg,(size_t) (a.end-a.beg)))
return MY_XML_ERROR;
}
else
{
- sprintf(p->errstr,"3: %s unexpected (ident or '/' wanted)",
+ sprintf(p->errstr,"%s unexpected (ident or '/' wanted)",
lex2str(lex));
return MY_XML_ERROR;
}
while ((MY_XML_IDENT == (lex=my_xml_scan(p,&a))) ||
- (MY_XML_STRING == lex))
+ ((MY_XML_STRING == lex && exclam)))
{
MY_XML_ATTR b;
if (MY_XML_EQ == (lex=my_xml_scan(p,&b)))
@@ -259,24 +353,35 @@ int my_xml_parse(MY_XML_PARSER *p,const char *str, uint len)
lex=my_xml_scan(p,&b);
if ( (lex == MY_XML_IDENT) || (lex == MY_XML_STRING) )
{
- if ((MY_XML_OK != my_xml_enter(p,a.beg,(uint) (a.end-a.beg))) ||
- (MY_XML_OK != my_xml_value(p,b.beg,(uint) (b.end-b.beg))) ||
- (MY_XML_OK != my_xml_leave(p,a.beg,(uint) (a.end-a.beg))))
+ p->current_node_type= MY_XML_NODE_ATTR;
+ if ((MY_XML_OK != my_xml_enter(p,a.beg,(size_t) (a.end-a.beg))) ||
+ (MY_XML_OK != my_xml_value(p,b.beg,(size_t) (b.end-b.beg))) ||
+ (MY_XML_OK != my_xml_leave(p,a.beg,(size_t) (a.end-a.beg))))
return MY_XML_ERROR;
}
else
{
- sprintf(p->errstr,"4: %s unexpected (ident or string wanted)",
+ sprintf(p->errstr,"%s unexpected (ident or string wanted)",
lex2str(lex));
return MY_XML_ERROR;
}
}
- else if ((MY_XML_STRING == lex) || (MY_XML_IDENT == lex))
+ else if (MY_XML_IDENT == lex)
{
- if ((MY_XML_OK != my_xml_enter(p,a.beg,(uint) (a.end-a.beg))) ||
- (MY_XML_OK != my_xml_leave(p,a.beg,(uint) (a.end-a.beg))))
+ p->current_node_type= MY_XML_NODE_ATTR;
+ if ((MY_XML_OK != my_xml_enter(p,a.beg,(size_t) (a.end-a.beg))) ||
+ (MY_XML_OK != my_xml_leave(p,a.beg,(size_t) (a.end-a.beg))))
return MY_XML_ERROR;
}
+ else if ((MY_XML_STRING == lex) && exclam)
+ {
+ /*
+ We are in <!DOCTYPE>, e.g.
+ <!DOCTYPE name SYSTEM "SystemLiteral">
+ <!DOCTYPE name PUBLIC "PubidLiteral" "SystemLiteral">
+ Just skip "SystemLiteral" and "PubidLiteral"
+ */
+ }
else
break;
}
@@ -293,7 +398,7 @@ gt:
{
if (lex != MY_XML_QUESTION)
{
- sprintf(p->errstr,"6: %s unexpected ('?' wanted)",lex2str(lex));
+ sprintf(p->errstr,"%s unexpected ('?' wanted)",lex2str(lex));
return MY_XML_ERROR;
}
if (MY_XML_OK != my_xml_leave(p,NULL,0))
@@ -309,7 +414,7 @@ gt:
if (lex != MY_XML_GT)
{
- sprintf(p->errstr,"5: %s unexpected ('>' wanted)",lex2str(lex));
+ sprintf(p->errstr,"%s unexpected ('>' wanted)",lex2str(lex));
return MY_XML_ERROR;
}
}
@@ -319,13 +424,20 @@ gt:
for ( ; (p->cur < p->end) && (p->cur[0] != '<') ; p->cur++);
a.end=p->cur;
- my_xml_norm_text(&a);
+ if (!(p->flags & MY_XML_FLAG_SKIP_TEXT_NORMALIZATION))
+ my_xml_norm_text(&a);
if (a.beg != a.end)
{
- my_xml_value(p,a.beg,(uint) (a.end-a.beg));
+ my_xml_value(p,a.beg,(size_t) (a.end-a.beg));
}
}
}
+
+ if (p->attr[0])
+ {
+ sprintf(p->errstr,"unexpected END-OF-INPUT");
+ return MY_XML_ERROR;
+ }
return MY_XML_OK;
}
@@ -343,14 +455,14 @@ void my_xml_parser_free(MY_XML_PARSER *p __attribute__((unused)))
void my_xml_set_value_handler(MY_XML_PARSER *p,
int (*action)(MY_XML_PARSER *p, const char *s,
- uint l))
+ size_t l))
{
p->value=action;
}
void my_xml_set_enter_handler(MY_XML_PARSER *p,
int (*action)(MY_XML_PARSER *p, const char *s,
- uint l))
+ size_t l))
{
p->enter=action;
}
@@ -358,7 +470,7 @@ void my_xml_set_enter_handler(MY_XML_PARSER *p,
void my_xml_set_leave_handler(MY_XML_PARSER *p,
int (*action)(MY_XML_PARSER *p, const char *s,
- uint l))
+ size_t l))
{
p->leave_xml=action;
}
@@ -376,7 +488,7 @@ const char *my_xml_error_string(MY_XML_PARSER *p)
}
-uint my_xml_error_pos(MY_XML_PARSER *p)
+size_t my_xml_error_pos(MY_XML_PARSER *p)
{
const char *beg=p->beg;
const char *s;
@@ -385,7 +497,7 @@ uint my_xml_error_pos(MY_XML_PARSER *p)
if (s[0] == '\n')
beg=s;
}
- return (uint) (p->cur-beg);
+ return (size_t) (p->cur-beg);
}
uint my_xml_error_lineno(MY_XML_PARSER *p)