1 files changed, 315 insertions, 0 deletions
diff --git a/libfsm/xmlscan.rl b/libfsm/xmlscan.rl
new file mode 100644
index 00000000..4e9ee4e2
--- /dev/null
+++ b/libfsm/xmlscan.rl
@@ -0,0 +1,315 @@
+/*
+ * Copyright 2001-2007 Adrian Thurston <thurston@colm.net>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <iostream>
+#include <string.h>
+#include "vector.h"
+#include "xmlparse.h"
+
+using std::istream;
+using std::cout;
+using std::cerr;
+using std::endl;
+/* Ragel directive: emit the static data generated for the XmlScanner machine. */
+%%{
+	machine XmlScanner;
+	write data;
+}%%
+/* Tag-name lookup, declared only (no definitions appear in this file). xml_parse() passes in_word_set a tag name and length and treats a null result as an unknown tag. */
+class Perfect_Hash
+{
+private:
+	static inline unsigned int hash (const char *str, unsigned int len);
+
+public:
+	static struct XMLTagHashPair *in_word_set (const char *str, unsigned int len);
+};
+/* Set up a scanner that reads from input; fileName is what error() prints as the location prefix. */
+XmlScanner::XmlScanner( const char *fileName, istream &input ) : 
+	fileName(fileName),
+	input(input), 
+	curline(1), 
+	curcol(1),
+	p(0), pe(0), 		/* equal pointers make the first scan() call fill the buffer */
+	done(false),		/* set by scan() once input reports end of file */
+	data(0), data_len(0),
+	value(0)
+{
+	%%{
+		machine XmlScanner;
+		write init;
+	}%%
+}
+
+#define TK_NO_TOKEN (-1)
+#define TK_ERR 1
+#define TK_SPACE 2
+#define TK_EOF 3
+#define TK_OpenTag 4
+#define TK_CloseTag 5
+
+#define ret_tok( _tok ) token = (_tok); data = ts
+/* Move every recorded attribute marker back by distance bytes; scan() calls this after sliding a partial token to the front of buf. */
+void XmlScanner::adjustAttrPointers( int distance )
+{
+	for ( AttrMkList::Iter marker = attrMkList; marker.lte(); marker++ ) {
+		marker->value -= distance;
+		marker->id -= distance;
+	}
+}
+
+/* There is no claim that this is a proper XML parser, but it is good
+ * enough for our purposes. */
+%%{
+	machine XmlScanner;
+	# colup advances the column, start_tok notes where a token begins, NL starts a new line.
+	action colup { curcol++; }
+	action start_tok { token_col = curcol; token_line = curline; }
+	NL = '\n' @{ curcol = 0; curline++; };
+	# Basic character classes. literal names NL explicitly so the line-counting action also fires inside quoted values.
+	WS = [\r\t ] | NL;
+	id = [_a-zA-Z][_a-zA-Z0-9]*;
+	literal = '"' ( [^"] | NL )* '"';
+
+	# Attribute identifiers.
+	action start_attr_id { attr_id_start = p; }
+	action leave_attr_id { attr_id_len = p - attr_id_start; }
+
+	attr_id = id >start_attr_id %leave_attr_id;
+
+	# Attribute values
+	action start_attr_value { attr_value_start = p; }
+	action leave_attr_value
+	{
+		attr_value_len = p - attr_value_start;
+		/* Remember the id and value spans as pointers and lengths; no text is copied at this point. */
+		AttrMarker newAttr;
+		newAttr.id = attr_id_start;
+		newAttr.idLen = attr_id_len;
+		newAttr.value = attr_value_start;
+		newAttr.valueLen = attr_value_len;
+		attrMkList.append( newAttr );
+	}
+	# The quotes of the literal are part of the marked value span.
+	attr_value = literal >start_attr_value %leave_attr_value;
+
+	# Attribute list. 
+	attribute = attr_id WS* '=' WS* attr_value WS*;
+
+	# Tag identifiers.
+	action tag_id_start { tag_id_start = p; }
+	action leave_tag_id { tag_id_len = p - tag_id_start; }
+
+	tag_id = id >tag_id_start %leave_tag_id;
+	# Token rules. Tag and EOF rules set the token and stop the scanner with fbreak; text and entity rules only append to buffer.
+	main := |*
+		# Tags
+		( '<' WS* tag_id ( WS+ attribute* )? '>' ) >start_tok $colup 
+			=> { ret_tok( TK_OpenTag ); fbreak; };
+
+		( '<' WS* '/' WS* tag_id WS* '>' ) >start_tok $colup 
+			=> { ret_tok( TK_CloseTag ); fbreak; };
+
+		# Data in between tags.
+		( [^<&\0] | NL ) $colup 
+			=> { buffer.append( *p ); };
+
+		# Specials.
+		"&amp;" $colup
+			=> { buffer.append( '&' ); };
+		"&lt;" $colup
+			=> { buffer.append( '<' ); };
+		"&gt;" $colup
+			=> { buffer.append( '>' ); };
+		
+		# EOF
+		0 >start_tok => { ret_tok( TK_EOF ); fbreak; };
+
+	*|;
+}%%
+/* Return the next token code: TK_OpenTag, TK_CloseTag or TK_EOF from a rule, TK_ERR when the machine is in its error state, TK_SPACE when buf fills up mid-token. */
+int XmlScanner::scan( )
+{
+	int token = TK_NO_TOKEN;
+	int space = 0, readlen = 0;
+	char *attr_id_start = 0;
+	char *attr_value_start = 0;
+	int attr_id_len = 0;
+	int attr_value_len = 0;
+	/* NOTE(review): empty() presumably discards the markers left by the previous call - confirm against the list type. */
+	attrMkList.empty();
+	buffer.clear();
+
+	while ( 1 ) {
+		if ( p == pe ) {
+			//printf("scanner: need more data\n");
+			/* With ts unset nothing in the buffer is kept; otherwise the bytes from ts onward are preserved. */
+			if ( ts == 0 )
+				have = 0;
+			else {
+				/* There is data that needs to be shifted over. */
+				//printf("scanner: buffer broken mid token\n");
+				have = pe - ts;
+				memmove( buf, ts, have );
+				/* Pointers into the moved data must shift back by the same distance. */
+				int distance = ts - buf;
+				te -= distance;
+				tag_id_start -= distance;
+				attr_id_start -= distance;
+				attr_value_start -= distance;
+				adjustAttrPointers( distance );
+				ts = buf;
+			}
+			/* New input is placed directly after whatever was kept. */
+			p = buf + have;
+			space = XML_BUFSIZE - have;
+
+			if ( space == 0 ) {
+				/* We filled up the buffer trying to scan a token. */
+				return TK_SPACE;
+			}
+			/* Once input is exhausted, feed a single 0 byte; the machine has a rule that turns it into TK_EOF. */
+			if ( done ) {
+				//printf("scanner: end of file\n");
+				p[0] = 0;
+				readlen = 1;
+			}
+			else {
+				input.read( p, space );
+				readlen = input.gcount();
+				if ( input.eof() ) {
+					//printf("scanner: setting done flag\n");
+					done = 1;
+				}
+			}
+
+			pe = p + readlen;
+		}
+		/* Ragel directive: the generated scanning code for the machine is emitted here. */
+		%% write exec;
+
+		if ( cs == XmlScanner_error )
+			return TK_ERR;
+		/* A rule ran ret_tok, which set data to the token start; otherwise loop around for more input. */
+		if ( token != TK_NO_TOKEN ) {
+			data_len = p - data;
+			return token;
+		}
+	}
+}
+
+int xml_parse( std::istream &input, const char *fileName, 
+		bool outputActive, bool wantComplete, 
+		XmlScanner &scanner, XmlParser &parser )
+{
+	while ( 1 ) {
+		int token = scanner.scan();
+		if ( token == TK_NO_TOKEN ) {
+			cerr << "xmlscan: interal error: scanner returned NO_TOKEN" << endl;
+			exit(1);
+		}
+		else if ( token == TK_EOF ) {
+			parser.token( XmlParser_tk_eof, scanner.token_col, scanner.token_line );
+			break;
+		}
+		else if ( token == TK_ERR ) {
+			scanner.error() << "scanner error" << endl;
+			break;
+		}
+		else if ( token == TK_SPACE ) {
+			scanner.error() << "scanner is out of buffer space" << endl;
+			break;
+		}
+		else {
+			/* All other tokens are either open or close tags. */
+			XMLTagHashPair *tagId = Perfect_Hash::in_word_set( 
+					scanner.tag_id_start, scanner.tag_id_len );
+
+			XMLTag *tag = new XMLTag( tagId, token == TK_OpenTag ? 
+					XMLTag::Open : XMLTag::Close );
+
+			if ( tagId != 0 ) {
+				/* Get attributes for open tags. */
+				if ( token == TK_OpenTag && scanner.attrMkList.length() > 0 ) {
+					tag->attrList = new AttrList;
+					for ( AttrMkList::Iter attr = scanner.attrMkList; 
+							attr.lte(); attr++ )
+					{
+						Attribute newAttr;
+						newAttr.id = new char[attr->idLen+1];
+						memcpy( newAttr.id, attr->id, attr->idLen );
+						newAttr.id[attr->idLen] = 0;
+
+						/* Exclude the surrounding quotes. */
+						newAttr.value = new char[attr->valueLen-1];
+						memcpy( newAttr.value, attr->value+1, attr->valueLen-2 );
+						newAttr.value[attr->valueLen-2] = 0;
+
+						tag->attrList->append( newAttr );
+					}
+				}
+
+				/* Get content for closing tags. */
+				if ( token == TK_CloseTag ) {
+					switch ( tagId->id ) {
+					case TAG_host: case TAG_arg:
+					case TAG_t: case TAG_alphtype:
+					case TAG_text: case TAG_goto:
+					case TAG_call: case TAG_next:
+					case TAG_entry: case TAG_set_tokend:
+					case TAG_set_act: case TAG_start_state:
+					case TAG_error_state: case TAG_state_actions: 
+					case TAG_action_table: case TAG_cond_space: 
+					case TAG_c: case TAG_ex: case TAG_eof_t:
+						tag->content = new char[scanner.buffer.length+1];
+						memcpy( tag->content, scanner.buffer.data,
+								scanner.buffer.length );
+						tag->content[scanner.buffer.length] = 0;
+						break;
+					}
+				}
+			}
+
+			#if 0
+			cerr << "parser_driver: " << (tag->type == XMLTag::Open ? "open" : "close") <<
+					": " << (tag->tagId != 0 ? tag->tagId->name : "<unknown>") << endl;
+			if ( tag->attrList != 0 ) {
+				for ( AttrList::Iter attr = *tag->attrList; attr.lte(); attr++ )
+					cerr << "    " << attr->id << ": " << attr->value << endl;
+			}
+			if ( tag->content != 0 )
+				cerr << "    content: " << tag->content << endl;
+			#endif
+
+			parser.token( tag, scanner.token_col, scanner.token_line );
+		}
+	}
+
+	return 0;
+}
+
+std::ostream &XmlScanner::error()
+{
+	gblErrorCount += 1;
+	cerr << fileName << ":" << curline << ":" << curcol << ": ";
+	return cerr;
+}