summaryrefslogtreecommitdiff
path: root/libfsm/xmlscan.rl
diff options
context:
space:
mode:
Diffstat (limited to 'libfsm/xmlscan.rl')
-rw-r--r--libfsm/xmlscan.rl315
1 files changed, 315 insertions, 0 deletions
diff --git a/libfsm/xmlscan.rl b/libfsm/xmlscan.rl
new file mode 100644
index 00000000..4e9ee4e2
--- /dev/null
+++ b/libfsm/xmlscan.rl
@@ -0,0 +1,315 @@
+/*
+ * Copyright 2001-2007 Adrian Thurston <thurston@colm.net>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <iostream>
+#include <string.h>
+#include "vector.h"
+#include "xmlparse.h"
+
+using std::istream;
+using std::cout;
+using std::cerr;
+using std::endl;
+
+/* Have Ragel emit the static data for the XmlScanner machine (its tables and
+ * state constants such as XmlScanner_error, which scan() tests) at this
+ * point in the generated file. */
+%%{
+ machine XmlScanner;
+ write data;
+}%%
+
+/* Lookup interface mapping a tag name to its XMLTagHashPair entry. Only the
+ * declarations appear here; the definitions live outside this file.
+ * NOTE(review): the naming suggests gperf-generated code -- confirm. */
+class Perfect_Hash
+{
+private:
+ static inline unsigned int hash (const char *str, unsigned int len);
+
+public:
+ /* Called by xml_parse() with a tag name and its length. The caller tests
+  * the result against 0, so a null return presumably means "unknown tag". */
+ static struct XMLTagHashPair *in_word_set (const char *str, unsigned int len);
+};
+
+/* Construct a scanner over the given input stream.
+ *
+ * fileName is kept for use in the prefix printed by error(). Position
+ * tracking starts at line 1, column 1; the buffer cursors p/pe and the
+ * data/value pointers start out null, so the first call to scan() sees
+ * p == pe and begins by reading from the stream. */
+XmlScanner::XmlScanner( const char *fileName, istream &input ) :
+ fileName(fileName),
+ input(input),
+ curline(1),
+ curcol(1),
+ p(0), pe(0),
+ done(false),
+ data(0), data_len(0),
+ value(0)
+{
+ /* Ragel-generated initialization of the machine state for XmlScanner. */
+ %%{
+ machine XmlScanner;
+ write init;
+ }%%
+}
+
+#define TK_NO_TOKEN (-1)
+#define TK_ERR 1
+#define TK_SPACE 2
+#define TK_EOF 3
+#define TK_OpenTag 4
+#define TK_CloseTag 5
+
+#define ret_tok( _tok ) token = (_tok); data = ts
+
+/* Move the id and value pointers of every recorded attribute marker back by
+ * `distance` characters. scan() calls this after it has shifted the pending
+ * token text toward the front of the buffer, so the markers keep referring
+ * to the same characters. */
+void XmlScanner::adjustAttrPointers( int distance )
+{
+    AttrMkList::Iter marker = attrMkList;
+    while ( marker.lte() ) {
+        marker->id -= distance;
+        marker->value -= distance;
+        marker++;
+    }
+}
+
+/* There is no claim that this is a proper XML parser, but it is good
+ * enough for our purposes. */
+%%{
+ machine XmlScanner;
+
+ # Position bookkeeping. colup bumps the column counter, start_tok snapshots
+ # the current position as the start of a token, and the action attached to
+ # NL resets the column and advances the line counter.
+ action colup { curcol++; }
+ action start_tok { token_col = curcol; token_line = curline; }
+ NL = '\n' @{ curcol = 0; curline++; };
+
+ WS = [\r\t ] | NL;
+ id = [_a-zA-Z][_a-zA-Z0-9]*;
+ # A double-quoted string; there is no escape syntax for an embedded quote.
+ # NL is listed alongside [^"] so that its line-counting action is attached
+ # to newlines inside the literal.
+ literal = '"' ( [^"] | NL )* '"';
+
+ # Attribute identifiers: remember where the name starts on entry and
+ # compute its length on leaving.
+ action start_attr_id { attr_id_start = p; }
+ action leave_attr_id { attr_id_len = p - attr_id_start; }
+
+ attr_id = id >start_attr_id %leave_attr_id;
+
+ # Attribute values. The span measured here covers the whole literal,
+ # including both quote characters (xml_parse strips them later). On
+ # leaving the value, the id/value spans are appended to attrMkList as one
+ # AttrMarker.
+ action start_attr_value { attr_value_start = p; }
+ action leave_attr_value
+ {
+ attr_value_len = p - attr_value_start;
+
+ AttrMarker newAttr;
+ newAttr.id = attr_id_start;
+ newAttr.idLen = attr_id_len;
+ newAttr.value = attr_value_start;
+ newAttr.valueLen = attr_value_len;
+ attrMkList.append( newAttr );
+ }
+
+ attr_value = literal >start_attr_value %leave_attr_value;
+
+ # Attribute list: name = "value", with optional whitespace around '=' and
+ # after the value.
+ attribute = attr_id WS* '=' WS* attr_value WS*;
+
+ # Tag identifiers: record the start pointer and length of the tag name.
+ action tag_id_start { tag_id_start = p; }
+ action leave_tag_id { tag_id_len = p - tag_id_start; }
+
+ tag_id = id >tag_id_start %leave_tag_id;
+
+ # The scanner proper. Tag and EOF patterns set a token via ret_tok and
+ # fbreak out of the generated code so scan() can return; character data
+ # and entity patterns only accumulate into buffer and keep scanning.
+ main := |*
+ # Tags: opening tag with optional attributes, then closing tag.
+ ( '<' WS* tag_id ( WS+ attribute* )? '>' ) >start_tok $colup
+ => { ret_tok( TK_OpenTag ); fbreak; };
+
+ ( '<' WS* '/' WS* tag_id WS* '>' ) >start_tok $colup
+ => { ret_tok( TK_CloseTag ); fbreak; };
+
+ # Data in between tags: any single character other than '<', '&' and NUL
+ # is appended to buffer.
+ ( [^<&\0] | NL ) $colup
+ => { buffer.append( *p ); };
+
+ # Specials: the three recognised entities are appended to buffer as the
+ # character they stand for.
+ "&amp;" $colup
+ => { buffer.append( '&' ); };
+ "&lt;" $colup
+ => { buffer.append( '<' ); };
+ "&gt;" $colup
+ => { buffer.append( '>' ); };
+
+ # EOF: scan() feeds a single NUL character once the input is exhausted.
+ 0 >start_tok => { ret_tok( TK_EOF ); fbreak; };
+
+ *|;
+}%%
+
+/* Scan forward to the next token.
+ *
+ * Clears the attribute marker list and the content buffer, then alternates
+ * between refilling the input buffer and running the Ragel-generated machine
+ * until a token is produced.
+ *
+ * Returns:
+ *   TK_OpenTag / TK_CloseTag / TK_EOF  the token set by a scanner action;
+ *                                      data/data_len then describe its text.
+ *   TK_ERR    the machine entered its error state.
+ *   TK_SPACE  a single token did not fit into the XML_BUFSIZE buffer.
+ */
+int XmlScanner::scan( )
+{
+    int token = TK_NO_TOKEN;
+    int space = 0, readlen = 0;
+
+    /* Scratch state written by the attribute actions of the machine. */
+    char *attr_id_start = 0;
+    char *attr_value_start = 0;
+    int attr_id_len = 0;
+    int attr_value_len = 0;
+
+    /* NOTE(review): empty() is presumably the list's "remove all elements"
+     * operation rather than a query -- confirm against vector.h. */
+    attrMkList.empty();
+    buffer.clear();
+
+    while ( 1 ) {
+        if ( p == pe ) {
+            //printf("scanner: need more data\n");
+
+            if ( ts == 0 )
+                have = 0;
+            else {
+                /* A token is in progress: move its text to the front of the
+                 * buffer and rebase every pointer that refers into it. */
+                //printf("scanner: buffer broken mid token\n");
+                have = pe - ts;
+                memmove( buf, ts, have );
+
+                int distance = ts - buf;
+                te -= distance;
+                tag_id_start -= distance;
+                attr_id_start -= distance;
+                attr_value_start -= distance;
+                adjustAttrPointers( distance );
+                ts = buf;
+            }
+
+            p = buf + have;
+            space = XML_BUFSIZE - have;
+
+            if ( space == 0 ) {
+                /* We filled up the buffer trying to scan a token. */
+                return TK_SPACE;
+            }
+
+            if ( done ) {
+                /* Input exhausted: feed the NUL that the EOF pattern of the
+                 * machine matches. */
+                //printf("scanner: end of file\n");
+                p[0] = 0;
+                readlen = 1;
+            }
+            else {
+                input.read( p, space );
+                readlen = input.gcount();
+
+                /* Treat any non-good stream state as end of input. Testing
+                 * eof() alone would spin forever on a stream that has failed
+                 * without reaching EOF (e.g. a file that could not be
+                 * opened): every read would return 0 bytes and `done` would
+                 * never be set. A short read at EOF sets eofbit as well, so
+                 * the normal end-of-file path is unchanged. */
+                if ( !input.good() ) {
+                    //printf("scanner: setting done flag\n");
+                    done = true;
+                }
+            }
+
+            pe = p + readlen;
+        }
+
+        %% write exec;
+
+        if ( cs == XmlScanner_error )
+            return TK_ERR;
+
+        if ( token != TK_NO_TOKEN ) {
+            /* ret_tok pointed data at the token start; p is where the
+             * machine stopped. */
+            data_len = p - data;
+            return token;
+        }
+    }
+}
+
+/* Drive the scanner and feed the parser until end of input or a scanner
+ * error.
+ *
+ * Each open/close tag returned by scanner.scan() is turned into a
+ * heap-allocated XMLTag and handed to parser.token() together with the
+ * token's column and line. For tags found by Perfect_Hash::in_word_set, open
+ * tags get a copy of their attributes and the close tags listed in the
+ * switch below get a copy of the text accumulated in scanner.buffer. Tags
+ * that are not found are passed on with a null tagId and nothing else.
+ *
+ * The input, fileName, outputActive and wantComplete parameters are not
+ * referenced in this function body. Scanner failures are reported through
+ * scanner.error(); the function returns 0 in every case.
+ */
+int xml_parse( std::istream &input, const char *fileName,
+        bool outputActive, bool wantComplete,
+        XmlScanner &scanner, XmlParser &parser )
+{
+    while ( 1 ) {
+        int token = scanner.scan();
+        if ( token == TK_NO_TOKEN ) {
+            cerr << "xmlscan: internal error: scanner returned NO_TOKEN" << endl;
+            exit(1);
+        }
+        else if ( token == TK_EOF ) {
+            parser.token( XmlParser_tk_eof, scanner.token_col, scanner.token_line );
+            break;
+        }
+        else if ( token == TK_ERR ) {
+            scanner.error() << "scanner error" << endl;
+            break;
+        }
+        else if ( token == TK_SPACE ) {
+            scanner.error() << "scanner is out of buffer space" << endl;
+            break;
+        }
+        else {
+            /* All other tokens are either open or close tags. */
+            XMLTagHashPair *tagId = Perfect_Hash::in_word_set(
+                    scanner.tag_id_start, scanner.tag_id_len );
+
+            XMLTag *tag = new XMLTag( tagId, token == TK_OpenTag ?
+                    XMLTag::Open : XMLTag::Close );
+
+            if ( tagId != 0 ) {
+                /* Get attributes for open tags. */
+                if ( token == TK_OpenTag && scanner.attrMkList.length() > 0 ) {
+                    tag->attrList = new AttrList;
+                    for ( AttrMkList::Iter attr = scanner.attrMkList;
+                            attr.lte(); attr++ )
+                    {
+                        Attribute newAttr;
+                        newAttr.id = new char[attr->idLen+1];
+                        memcpy( newAttr.id, attr->id, attr->idLen );
+                        newAttr.id[attr->idLen] = 0;
+
+                        /* Exclude the surrounding quotes: valueLen counts
+                         * both of them, so the text is valueLen-2 characters
+                         * plus one for the terminator. */
+                        newAttr.value = new char[attr->valueLen-1];
+                        memcpy( newAttr.value, attr->value+1, attr->valueLen-2 );
+                        newAttr.value[attr->valueLen-2] = 0;
+
+                        tag->attrList->append( newAttr );
+                    }
+                }
+
+                /* Get content for closing tags. */
+                if ( token == TK_CloseTag ) {
+                    switch ( tagId->id ) {
+                        case TAG_host: case TAG_arg:
+                        case TAG_t: case TAG_alphtype:
+                        case TAG_text: case TAG_goto:
+                        case TAG_call: case TAG_next:
+                        case TAG_entry: case TAG_set_tokend:
+                        case TAG_set_act: case TAG_start_state:
+                        case TAG_error_state: case TAG_state_actions:
+                        case TAG_action_table: case TAG_cond_space:
+                        case TAG_c: case TAG_ex: case TAG_eof_t:
+                            tag->content = new char[scanner.buffer.length+1];
+                            memcpy( tag->content, scanner.buffer.data,
+                                    scanner.buffer.length );
+                            tag->content[scanner.buffer.length] = 0;
+                            break;
+                    }
+                }
+            }
+
+            #if 0
+            cerr << "parser_driver: " << (tag->type == XMLTag::Open ? "open" : "close") <<
+                    ": " << (tag->tagId != 0 ? tag->tagId->name : "<unknown>") << endl;
+            if ( tag->attrList != 0 ) {
+                for ( AttrList::Iter attr = *tag->attrList; attr.lte(); attr++ )
+                    cerr << " " << attr->id << ": " << attr->value << endl;
+            }
+            if ( tag->content != 0 )
+                cerr << " content: " << tag->content << endl;
+            #endif
+
+            parser.token( tag, scanner.token_col, scanner.token_line );
+        }
+    }
+
+    return 0;
+}
+
+std::ostream &XmlScanner::error()
+{
+ gblErrorCount += 1;
+ cerr << fileName << ":" << curline << ":" << curcol << ": ";
+ return cerr;
+}