diff options
Diffstat (limited to 'libfsm/xmlscan.rl')
-rw-r--r-- | libfsm/xmlscan.rl | 315 |
1 files changed, 315 insertions, 0 deletions
diff --git a/libfsm/xmlscan.rl b/libfsm/xmlscan.rl new file mode 100644 index 00000000..4e9ee4e2 --- /dev/null +++ b/libfsm/xmlscan.rl @@ -0,0 +1,315 @@ +/* + * Copyright 2001-2007 Adrian Thurston <thurston@colm.net> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */

#include <iostream>
#include <string.h>
#include <stdlib.h>
#include "vector.h"
#include "xmlparse.h"

using std::istream;
using std::cout;
using std::cerr;
using std::endl;

%%{
	machine XmlScanner;
	write data;
}%%

/* Tag-name lookup. Only the declarations are visible here; the definitions
 * are expected to come from elsewhere in the build. */
class Perfect_Hash
{
private:
	static inline unsigned int hash (const char *str, unsigned int len);

public:
	static struct XMLTagHashPair *in_word_set (const char *str, unsigned int len);
};

/*
 * Construct a scanner reading from input. fileName is kept only for the
 * location prefix printed by error(). Line and column counters start at 1 and
 * the buffer cursors start out empty (p == pe == 0), so the first call to
 * scan() immediately refills the buffer.
 */
XmlScanner::XmlScanner( const char *fileName, istream &input ) :
	fileName(fileName),
	input(input),
	curline(1),
	curcol(1),
	p(0), pe(0),
	done(false),
	data(0), data_len(0),
	value(0)
{
	/* Emit the Ragel state initialization for the machine. */
	%%{
		machine XmlScanner;
		write init;
	}%%
}

/* Token codes returned by XmlScanner::scan(). TK_NO_TOKEN is the internal
 * "nothing matched yet" value and should never reach a caller. */
#define TK_NO_TOKEN (-1)
#define TK_ERR 1
#define TK_SPACE 2
#define TK_EOF 3
#define TK_OpenTag 4
#define TK_CloseTag 5

/* Record the matched token and where its text starts. Expands to two
 * statements, so it must only be used inside a braced block. */
#define ret_tok( _tok ) token = (_tok); data = ts

/*
 * Move the id/value pointers of every recorded attribute marker back by
 * distance bytes. scan() calls this after it has shifted a partially scanned
 * token to the front of the buffer, so the markers keep pointing at the same
 * characters.
 */
void XmlScanner::adjustAttrPointers( int distance )
{
	for ( AttrMkList::Iter attr = attrMkList; attr.lte(); attr++ ) {
		attr->id -= distance;
		attr->value -= distance;
	}
}

/* There is no claim that this is a proper XML parser, but it is good
 * enough for our purposes. */
%%{
	machine XmlScanner;

	# Column and line tracking. colup runs on every character of a token;
	# NL resets the column and bumps the line counter.
	action colup { curcol++; }
	action start_tok { token_col = curcol; token_line = curline; }
	NL = '\n' @{ curcol = 0; curline++; };

	WS = [\r\t ] | NL;
	id = [_a-zA-Z][_a-zA-Z0-9]*;
	literal = '"' ( [^"] | NL )* '"';

	# Attribute identifiers.
	action start_attr_id { attr_id_start = p; }
	action leave_attr_id { attr_id_len = p - attr_id_start; }

	attr_id = id >start_attr_id %leave_attr_id;

	# Attribute values. The recorded span includes both surrounding quotes;
	# xml_parse strips them when it copies the value out.
	action start_attr_value { attr_value_start = p; }
	action leave_attr_value
	{
		attr_value_len = p - attr_value_start;

		AttrMarker newAttr;
		newAttr.id = attr_id_start;
		newAttr.idLen = attr_id_len;
		newAttr.value = attr_value_start;
		newAttr.valueLen = attr_value_len;
		attrMkList.append( newAttr );
	}

	attr_value = literal >start_attr_value %leave_attr_value;

	# Attribute list.
	attribute = attr_id WS* '=' WS* attr_value WS*;

	# Tag identifiers.
	action tag_id_start { tag_id_start = p; }
	action leave_tag_id { tag_id_len = p - tag_id_start; }

	tag_id = id >tag_id_start %leave_tag_id;

	main := |*
		# Tags
		( '<' WS* tag_id ( WS+ attribute* )? '>' ) >start_tok $colup
			=> { ret_tok( TK_OpenTag ); fbreak; };

		( '<' WS* '/' WS* tag_id WS* '>' ) >start_tok $colup
			=> { ret_tok( TK_CloseTag ); fbreak; };

		# Data in between tags.
		( [^<&\0] | NL ) $colup
			=> { buffer.append( *p ); };

		# Specials. The three predefined entities are decoded into the
		# character they stand for. An ampersand that does not start one
		# of these entities matches no pattern and is a scanner error.
		"&amp;" $colup
			=> { buffer.append( '&' ); };
		"&lt;" $colup
			=> { buffer.append( '<' ); };
		"&gt;" $colup
			=> { buffer.append( '>' ); };

		# EOF. scan() feeds a single zero byte once the input is exhausted.
		0 >start_tok => { ret_tok( TK_EOF ); fbreak; };

	*|;
}%%

/*
 * Scan the next tag token from the input.
 *
 * Character data seen before the tag is accumulated in buffer, with entities
 * decoded. The attributes of an open tag are recorded in attrMkList as
 * pointers into the scan buffer. Both are reset at the start of each call.
 *
 * Returns one of:
 *   TK_OpenTag / TK_CloseTag  a tag was matched; tag_id_start and tag_id_len
 *                             describe its name, data and data_len its text.
 *   TK_EOF                    the end of the input was reached.
 *   TK_ERR                    the machine entered its error state.
 *   TK_SPACE                  one token filled the whole buffer.
 */
int XmlScanner::scan( )
{
	int token = TK_NO_TOKEN;
	int space = 0, readlen = 0;
	char *attr_id_start = 0;
	char *attr_value_start = 0;
	int attr_id_len = 0;
	int attr_value_len = 0;

	attrMkList.empty();
	buffer.clear();

	while ( 1 ) {
		if ( p == pe ) {
			//printf("scanner: need more data\n");

			if ( ts == 0 )
				have = 0;
			else {
				/* There is data that needs to be shifted over. */
				//printf("scanner: buffer broken mid token\n");
				have = pe - ts;
				memmove( buf, ts, have );

				/* Every pointer into the old token position has to
				 * move back by the same amount. */
				int distance = ts - buf;
				te -= distance;
				tag_id_start -= distance;
				attr_id_start -= distance;
				attr_value_start -= distance;
				adjustAttrPointers( distance );
				ts = buf;
			}

			p = buf + have;
			space = XML_BUFSIZE - have;

			if ( space == 0 ) {
				/* We filled up the buffer trying to scan a token. */
				return TK_SPACE;
			}

			if ( done ) {
				/* Input is exhausted: feed the zero byte that the
				 * EOF pattern of the machine matches. */
				//printf("scanner: end of file\n");
				p[0] = 0;
				readlen = 1;
			}
			else {
				input.read( p, space );
				readlen = input.gcount();

				/* Any non-good state ends the input, not only EOF. A
				 * stream that has failed without reaching EOF would
				 * otherwise return zero bytes on every pass and this
				 * loop would never terminate. */
				if ( !input.good() ) {
					//printf("scanner: setting done flag\n");
					done = true;
				}
			}

			pe = p + readlen;
		}

		%% write exec;

		if ( cs == XmlScanner_error )
			return TK_ERR;

		if ( token != TK_NO_TOKEN ) {
			data_len = p - data;
			return token;
		}
	}
}

/*
 * Drive scanner until end of input or the first scanner failure, turning
 * each tag token into a heap-allocated XMLTag and handing it to parser.
 *
 * input, fileName, outputActive and wantComplete are not referenced in this
 * function body; the scanner reads from the stream it was constructed with.
 *
 * For tags whose name is found by Perfect_Hash::in_word_set, an open tag gets
 * a copy of its attributes (values without the surrounding quotes) and a
 * close tag of one of the listed kinds gets a copy of the character data
 * collected before it. Unrecognized tags are passed on with a null tag id.
 *
 * Always returns 0. Scanner failures are reported through scanner.error(),
 * which increments gblErrorCount.
 */
int xml_parse( std::istream &input, const char *fileName,
		bool outputActive, bool wantComplete,
		XmlScanner &scanner, XmlParser &parser )
{
	while ( 1 ) {
		int token = scanner.scan();
		if ( token == TK_NO_TOKEN ) {
			cerr << "xmlscan: internal error: scanner returned NO_TOKEN" << endl;
			exit(1);
		}
		else if ( token == TK_EOF ) {
			parser.token( XmlParser_tk_eof, scanner.token_col, scanner.token_line );
			break;
		}
		else if ( token == TK_ERR ) {
			scanner.error() << "scanner error" << endl;
			break;
		}
		else if ( token == TK_SPACE ) {
			scanner.error() << "scanner is out of buffer space" << endl;
			break;
		}
		else {
			/* All other tokens are either open or close tags. */
			XMLTagHashPair *tagId = Perfect_Hash::in_word_set(
					scanner.tag_id_start, scanner.tag_id_len );

			XMLTag *tag = new XMLTag( tagId, token == TK_OpenTag ?
					XMLTag::Open : XMLTag::Close );

			if ( tagId != 0 ) {
				/* Get attributes for open tags. */
				if ( token == TK_OpenTag && scanner.attrMkList.length() > 0 ) {
					tag->attrList = new AttrList;
					for ( AttrMkList::Iter attr = scanner.attrMkList;
							attr.lte(); attr++ )
					{
						Attribute newAttr;
						newAttr.id = new char[attr->idLen+1];
						memcpy( newAttr.id, attr->id, attr->idLen );
						newAttr.id[attr->idLen] = 0;

						/* Exclude the surrounding quotes. valueLen counts
						 * both of them, so the text is valueLen-2 bytes
						 * plus one for the terminator. */
						newAttr.value = new char[attr->valueLen-1];
						memcpy( newAttr.value, attr->value+1, attr->valueLen-2 );
						newAttr.value[attr->valueLen-2] = 0;

						tag->attrList->append( newAttr );
					}
				}

				/* Get content for closing tags. */
				if ( token == TK_CloseTag ) {
					switch ( tagId->id ) {
					case TAG_host: case TAG_arg:
					case TAG_t: case TAG_alphtype:
					case TAG_text: case TAG_goto:
					case TAG_call: case TAG_next:
					case TAG_entry: case TAG_set_tokend:
					case TAG_set_act: case TAG_start_state:
					case TAG_error_state: case TAG_state_actions:
					case TAG_action_table: case TAG_cond_space:
					case TAG_c: case TAG_ex: case TAG_eof_t:
						tag->content = new char[scanner.buffer.length+1];
						memcpy( tag->content, scanner.buffer.data,
								scanner.buffer.length );
						tag->content[scanner.buffer.length] = 0;
						break;
					}
				}
			}

			#if 0
			cerr << "parser_driver: " << (tag->type == XMLTag::Open ? "open" : "close") <<
					": " << (tag->tagId != 0 ? tag->tagId->name : "<unknown>") << endl;
			if ( tag->attrList != 0 ) {
				for ( AttrList::Iter attr = *tag->attrList; attr.lte(); attr++ )
					cerr << " " << attr->id << ": " << attr->value << endl;
			}
			if ( tag->content != 0 )
				cerr << " content: " << tag->content << endl;
			#endif

			parser.token( tag, scanner.token_col, scanner.token_line );
		}
	}

	return 0;
}

/*
 * Count one error in gblErrorCount, print the "file:line:col: " prefix for
 * the current scan position to cerr and return cerr so the caller can append
 * the message text.
 */
std::ostream &XmlScanner::error()
{
	gblErrorCount += 1;
	cerr << fileName << ":" << curline << ":" << curcol << ": ";
	return cerr;
}