diff options
Diffstat (limited to 'strings/xml.c')
-rw-r--r-- | strings/xml.c | 228 |
1 files changed, 170 insertions, 58 deletions
diff --git a/strings/xml.c b/strings/xml.c index 986bd0e157f..3ad955bbabd 100644 --- a/strings/xml.c +++ b/strings/xml.c @@ -1,5 +1,4 @@ -/* Copyright (c) 2003-2006 MySQL AB, 2009 Sun Microsystems, Inc. - Use is subject to license terms. +/* Copyright (c) 2003, 2011, Oracle and/or its affiliates. All rights reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -12,13 +11,15 @@ You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software - Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ #include "my_global.h" #include "m_string.h" #include "my_xml.h" +#define MY_XML_UNKNOWN 'U' #define MY_XML_EOF 'E' #define MY_XML_STRING 'S' #define MY_XML_IDENT 'I' @@ -30,6 +31,7 @@ #define MY_XML_TEXT 'T' #define MY_XML_QUESTION '?' #define MY_XML_EXCLAM '!' +#define MY_XML_CDATA 'D' typedef struct xml_attr_st { @@ -38,13 +40,54 @@ typedef struct xml_attr_st } MY_XML_ATTR; +/* + XML ctype: +*/ +#define MY_XML_ID0 0x01 /* Identifier initial character */ +#define MY_XML_ID1 0x02 /* Identifier medial character */ +#define MY_XML_SPC 0x08 /* Spacing character */ + + +/* + http://www.w3.org/TR/REC-xml/ + [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | + CombiningChar | Extender + [5] Name ::= (Letter | '_' | ':') (NameChar)* +*/ + +static char my_xml_ctype[256]= +{ +/*00*/ 0,0,0,0,0,0,0,0,0,8,8,0,0,8,0,0, +/*10*/ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +/*20*/ 8,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0, /* !"#$%&'()*+,-./ */ +/*30*/ 2,2,2,2,2,2,2,2,2,2,3,0,0,0,0,0, /* 0123456789:;<=>? */ +/*40*/ 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* @ABCDEFGHIJKLMNO */ +/*50*/ 3,3,3,3,3,3,3,3,3,3,3,0,0,0,0,3, /* PQRSTUVWXYZ[\]^_ */ +/*60*/ 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* `abcdefghijklmno */ +/*70*/ 3,3,3,3,3,3,3,3,3,3,3,0,0,0,0,0, /* pqrstuvwxyz{|}~ */ +/*80*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, +/*90*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, +/*A0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, +/*B0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, +/*C0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, +/*D0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, +/*E0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, +/*F0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3 +}; + +#define my_xml_is_space(c) (my_xml_ctype[(uchar) (c)] & MY_XML_SPC) +#define my_xml_is_id0(c) (my_xml_ctype[(uchar) (c)] & MY_XML_ID0) +#define my_xml_is_id1(c) (my_xml_ctype[(uchar) (c)] & MY_XML_ID1) + + static const char *lex2str(int lex) { switch(lex) { - case MY_XML_EOF: return "EOF"; + case MY_XML_EOF: return "END-OF-INPUT"; case MY_XML_STRING: return "STRING"; case MY_XML_IDENT: return "IDENT"; + case MY_XML_CDATA: return "CDATA"; case MY_XML_EQ: return "'='"; case MY_XML_LT: return "'<'"; case MY_XML_GT: return "'>'"; @@ -54,13 +97,20 @@ static const char *lex2str(int lex) case MY_XML_QUESTION: return "'?'"; case MY_XML_EXCLAM: return "'!'"; } - return "UNKNOWN"; + return "unknown token"; } static void my_xml_norm_text(MY_XML_ATTR *a) { - for ( ; (a->beg < a->end) && strchr(" \t\r\n",a->beg[0]) ; a->beg++ ); - for ( ; (a->beg < a->end) && strchr(" \t\r\n",a->end[-1]) ; a->end-- ); + for ( ; (a->beg < a->end) && my_xml_is_space(a->beg[0]) ; a->beg++ ); + for ( ; (a->beg < a->end) && my_xml_is_space(a->end[-1]) ; a->end-- ); +} + + +static inline my_bool +my_xml_parser_prefix_cmp(MY_XML_PARSER *p, const char *s, size_t slen) +{ + return (p->cur + slen > p->end) || memcmp(p->cur, s, slen); } @@ -68,7 +118,7 @@ static int my_xml_scan(MY_XML_PARSER *p,MY_XML_ATTR *a) { int lex; - for( ; ( p->cur < p->end) && strchr(" \t\r\n",p->cur[0]) ; p->cur++); + for (; ( p->cur < p->end) && my_xml_is_space(p->cur[0]) ; p->cur++); if (p->cur >= p->end) { @@ -81,15 +131,33 @@ static int my_xml_scan(MY_XML_PARSER *p,MY_XML_ATTR *a) a->beg=p->cur; a->end=p->cur; - if ((p->end - p->cur > 3) && !bcmp(p->cur,"<!--",4)) + if (!my_xml_parser_prefix_cmp(p, C_STRING_WITH_LEN("<!--"))) { - for( ; (p->cur < p->end) && bcmp(p->cur, "-->", 3); p->cur++) - {} - if (!bcmp(p->cur, "-->", 3)) - p->cur+=3; + for (; p->cur < p->end; p->cur++) + { + if (!my_xml_parser_prefix_cmp(p, C_STRING_WITH_LEN("-->"))) + { + p->cur+= 3; + break; + } + } a->end=p->cur; lex=MY_XML_COMMENT; } + else if (!my_xml_parser_prefix_cmp(p, C_STRING_WITH_LEN("<![CDATA["))) + { + p->cur+= 9; + for (; p->cur < p->end - 2 ; p->cur++) + { + if (p->cur[0] == ']' && p->cur[1] == ']' && p->cur[2] == '>') + { + p->cur+= 3; + a->end= p->cur; + break; + } + } + lex= MY_XML_CDATA; + } else if (strchr("?=/<>!",p->cur[0])) { p->cur++; @@ -98,25 +166,32 @@ static int my_xml_scan(MY_XML_PARSER *p,MY_XML_ATTR *a) } else if ( (p->cur[0] == '"') || (p->cur[0] == '\'') ) { + /* + "string" or 'string' found. + Scan until the closing quote/doublequote, or until the END-OF-INPUT. + */ p->cur++; - for( ; ( p->cur < p->end ) && (p->cur[0] != a->beg[0]); p->cur++) + for (; ( p->cur < p->end ) && (p->cur[0] != a->beg[0]); p->cur++) {} a->end=p->cur; - if (a->beg[0] == p->cur[0])p->cur++; + if (p->cur < p->end) /* Closing quote or doublequote has been found */ + p->cur++; a->beg++; - my_xml_norm_text(a); + if (!(p->flags & MY_XML_FLAG_SKIP_TEXT_NORMALIZATION)) + my_xml_norm_text(a); lex=MY_XML_STRING; } - else + else if (my_xml_is_id0(p->cur[0])) { - for(; - (p->cur < p->end) && !strchr("?'\"=/<> \t\r\n", p->cur[0]); - p->cur++) - {} + p->cur++; + while (p->cur < p->end && my_xml_is_id1(p->cur[0])) + p->cur++; a->end=p->cur; my_xml_norm_text(a); lex=MY_XML_IDENT; } + else + lex= MY_XML_UNKNOWN; #if 0 printf("LEX=%s[%d]\n",lex2str(lex),a->end-a->beg); @@ -127,32 +202,35 @@ ret: } -static int my_xml_value(MY_XML_PARSER *st, const char *str, uint len) +static int my_xml_value(MY_XML_PARSER *st, const char *str, size_t len) { return (st->value) ? (st->value)(st,str,len) : MY_XML_OK; } -static int my_xml_enter(MY_XML_PARSER *st, const char *str, uint len) +static int my_xml_enter(MY_XML_PARSER *st, const char *str, size_t len) { - if ((uint) (st->attrend-st->attr+len+1) > sizeof(st->attr)) + if ((size_t) (st->attrend-st->attr+len+1) > sizeof(st->attr)) { sprintf(st->errstr,"To deep XML"); return MY_XML_ERROR; } if (st->attrend > st->attr) { - st->attrend[0]='.'; + st->attrend[0]= '/'; st->attrend++; } memcpy(st->attrend,str,len); st->attrend+=len; st->attrend[0]='\0'; - return st->enter ? st->enter(st,st->attr, (uint) (st->attrend - st->attr)) : MY_XML_OK; + if (st->flags & MY_XML_FLAG_RELATIVE_NAMES) + return st->enter ? st->enter(st, str, len) : MY_XML_OK; + else + return st->enter ? st->enter(st,st->attr,st->attrend-st->attr) : MY_XML_OK; } -static void mstr(char *s,const char *src,uint l1, uint l2) +static void mstr(char *s,const char *src,size_t l1, size_t l2) { l1 = l1<l2 ? l1 : l2; memcpy(s,src,l1); @@ -160,27 +238,36 @@ static void mstr(char *s,const char *src,uint l1, uint l2) } -static int my_xml_leave(MY_XML_PARSER *p, const char *str, uint slen) +static int my_xml_leave(MY_XML_PARSER *p, const char *str, size_t slen) { char *e; - uint glen; + size_t glen; char s[32]; char g[32]; int rc; - - /* Find previous '.' or beginning */ - for( e=p->attrend; (e>p->attr) && (e[0] != '.') ; e--); - glen = (uint) ((e[0] == '.') ? (p->attrend-e-1) : p->attrend-e); + + /* Find previous '/' or beginning */ + for (e=p->attrend; (e>p->attr) && (e[0] != '/') ; e--); + glen = (size_t) ((e[0] == '/') ? (p->attrend-e-1) : p->attrend-e); if (str && (slen != glen)) { mstr(s,str,sizeof(s)-1,slen); - mstr(g,e+1,sizeof(g)-1,glen), - sprintf(p->errstr,"'</%s>' unexpected ('</%s>' wanted)",s,g); + if (glen) + { + mstr(g,e+1,sizeof(g)-1,glen), + sprintf(p->errstr,"'</%s>' unexpected ('</%s>' wanted)",s,g); + } + else + sprintf(p->errstr,"'</%s>' unexpected (END-OF-INPUT wanted)", s); return MY_XML_ERROR; } - rc = p->leave_xml ? p->leave_xml(p,p->attr, (uint) (p->attrend - p->attr)) : MY_XML_OK; + if (p->flags & MY_XML_FLAG_RELATIVE_NAMES) + rc= p->leave_xml ? p->leave_xml(p, str, slen) : MY_XML_OK; + else + rc= (p->leave_xml ? p->leave_xml(p,p->attr,p->attrend-p->attr) : + MY_XML_OK); *e='\0'; p->attrend=e; @@ -189,7 +276,7 @@ static int my_xml_leave(MY_XML_PARSER *p, const char *str, uint slen) } -int my_xml_parse(MY_XML_PARSER *p,const char *str, uint len) +int my_xml_parse(MY_XML_PARSER *p,const char *str, size_t len) { p->attrend=p->attr; p->beg=str; @@ -208,7 +295,13 @@ int my_xml_parse(MY_XML_PARSER *p,const char *str, uint len) lex=my_xml_scan(p,&a); if (MY_XML_COMMENT == lex) + continue; + + if (lex == MY_XML_CDATA) { + a.beg+= 9; + a.end-= 3; + my_xml_value(p, a.beg, (size_t) (a.end-a.beg)); continue; } @@ -218,10 +311,10 @@ int my_xml_parse(MY_XML_PARSER *p,const char *str, uint len) { if (MY_XML_IDENT != (lex=my_xml_scan(p,&a))) { - sprintf(p->errstr,"1: %s unexpected (ident wanted)",lex2str(lex)); + sprintf(p->errstr,"%s unexpected (ident wanted)",lex2str(lex)); return MY_XML_ERROR; } - if (MY_XML_OK != my_xml_leave(p,a.beg,(uint) (a.end-a.beg))) + if (MY_XML_OK != my_xml_leave(p,a.beg,(size_t) (a.end-a.beg))) return MY_XML_ERROR; lex=my_xml_scan(p,&a); goto gt; @@ -240,18 +333,19 @@ int my_xml_parse(MY_XML_PARSER *p,const char *str, uint len) if (MY_XML_IDENT == lex) { - if (MY_XML_OK != my_xml_enter(p,a.beg,(uint) (a.end-a.beg))) + p->current_node_type= MY_XML_NODE_TAG; + if (MY_XML_OK != my_xml_enter(p,a.beg,(size_t) (a.end-a.beg))) return MY_XML_ERROR; } else { - sprintf(p->errstr,"3: %s unexpected (ident or '/' wanted)", + sprintf(p->errstr,"%s unexpected (ident or '/' wanted)", lex2str(lex)); return MY_XML_ERROR; } while ((MY_XML_IDENT == (lex=my_xml_scan(p,&a))) || - (MY_XML_STRING == lex)) + ((MY_XML_STRING == lex && exclam))) { MY_XML_ATTR b; if (MY_XML_EQ == (lex=my_xml_scan(p,&b))) @@ -259,24 +353,35 @@ int my_xml_parse(MY_XML_PARSER *p,const char *str, uint len) lex=my_xml_scan(p,&b); if ( (lex == MY_XML_IDENT) || (lex == MY_XML_STRING) ) { - if ((MY_XML_OK != my_xml_enter(p,a.beg,(uint) (a.end-a.beg))) || - (MY_XML_OK != my_xml_value(p,b.beg,(uint) (b.end-b.beg))) || - (MY_XML_OK != my_xml_leave(p,a.beg,(uint) (a.end-a.beg)))) + p->current_node_type= MY_XML_NODE_ATTR; + if ((MY_XML_OK != my_xml_enter(p,a.beg,(size_t) (a.end-a.beg))) || + (MY_XML_OK != my_xml_value(p,b.beg,(size_t) (b.end-b.beg))) || + (MY_XML_OK != my_xml_leave(p,a.beg,(size_t) (a.end-a.beg)))) return MY_XML_ERROR; } else { - sprintf(p->errstr,"4: %s unexpected (ident or string wanted)", + sprintf(p->errstr,"%s unexpected (ident or string wanted)", lex2str(lex)); return MY_XML_ERROR; } } - else if ((MY_XML_STRING == lex) || (MY_XML_IDENT == lex)) + else if (MY_XML_IDENT == lex) { - if ((MY_XML_OK != my_xml_enter(p,a.beg,(uint) (a.end-a.beg))) || - (MY_XML_OK != my_xml_leave(p,a.beg,(uint) (a.end-a.beg)))) + p->current_node_type= MY_XML_NODE_ATTR; + if ((MY_XML_OK != my_xml_enter(p,a.beg,(size_t) (a.end-a.beg))) || + (MY_XML_OK != my_xml_leave(p,a.beg,(size_t) (a.end-a.beg)))) return MY_XML_ERROR; } + else if ((MY_XML_STRING == lex) && exclam) + { + /* + We are in <!DOCTYPE>, e.g. + <!DOCTYPE name SYSTEM "SystemLiteral"> + <!DOCTYPE name PUBLIC "PublidLiteral" "SystemLiteral"> + Just skip "SystemLiteral" and "PublicidLiteral" + */ + } else break; } @@ -293,7 +398,7 @@ gt: { if (lex != MY_XML_QUESTION) { - sprintf(p->errstr,"6: %s unexpected ('?' wanted)",lex2str(lex)); + sprintf(p->errstr,"%s unexpected ('?' wanted)",lex2str(lex)); return MY_XML_ERROR; } if (MY_XML_OK != my_xml_leave(p,NULL,0)) @@ -309,7 +414,7 @@ gt: if (lex != MY_XML_GT) { - sprintf(p->errstr,"5: %s unexpected ('>' wanted)",lex2str(lex)); + sprintf(p->errstr,"%s unexpected ('>' wanted)",lex2str(lex)); return MY_XML_ERROR; } } @@ -319,13 +424,20 @@ gt: for ( ; (p->cur < p->end) && (p->cur[0] != '<') ; p->cur++); a.end=p->cur; - my_xml_norm_text(&a); + if (!(p->flags & MY_XML_FLAG_SKIP_TEXT_NORMALIZATION)) + my_xml_norm_text(&a); if (a.beg != a.end) { - my_xml_value(p,a.beg,(uint) (a.end-a.beg)); + my_xml_value(p,a.beg,(size_t) (a.end-a.beg)); } } } + + if (p->attr[0]) + { + sprintf(p->errstr,"unexpected END-OF-INPUT"); + return MY_XML_ERROR; + } return MY_XML_OK; } @@ -343,14 +455,14 @@ void my_xml_parser_free(MY_XML_PARSER *p __attribute__((unused))) void my_xml_set_value_handler(MY_XML_PARSER *p, int (*action)(MY_XML_PARSER *p, const char *s, - uint l)) + size_t l)) { p->value=action; } void my_xml_set_enter_handler(MY_XML_PARSER *p, int (*action)(MY_XML_PARSER *p, const char *s, - uint l)) + size_t l)) { p->enter=action; } @@ -358,7 +470,7 @@ void my_xml_set_enter_handler(MY_XML_PARSER *p, void my_xml_set_leave_handler(MY_XML_PARSER *p, int (*action)(MY_XML_PARSER *p, const char *s, - uint l)) + size_t l)) { p->leave_xml=action; } @@ -376,7 +488,7 @@ const char *my_xml_error_string(MY_XML_PARSER *p) } -uint my_xml_error_pos(MY_XML_PARSER *p) +size_t my_xml_error_pos(MY_XML_PARSER *p) { const char *beg=p->beg; const char *s; @@ -385,7 +497,7 @@ uint my_xml_error_pos(MY_XML_PARSER *p) if (s[0] == '\n') beg=s; } - return (uint) (p->cur-beg); + return (size_t) (p->cur-beg); } uint my_xml_error_lineno(MY_XML_PARSER *p) |