moved source files into commit repository

author: Adrian Thurston <thurston@colm.net> 2020-03-14 15:29:52 +0200
committer: Adrian Thurston <thurston@colm.net> 2020-03-14 15:29:52 +0200
commit: f653735830d537715f2885bd832cf04851d35401 (patch)
tree: 95e6551e39407543366d4f49aedf7b78c6e8bbe1 /src/lmscan.rl
parent: bcc54d5df10cf425e7134b06f70d7ffe1abee4e4 (diff)
download: colm-f653735830d537715f2885bd832cf04851d35401.tar.gz
1 files changed, 637 insertions, 0 deletions
diff --git a/src/lmscan.rl b/src/lmscan.rl
new file mode 100644
index 00000000..231e2689
--- /dev/null
+++ b/src/lmscan.rl
@@ -0,0 +1,637 @@
+/*
+ * Copyright 2006-2012 Adrian Thurston <thurston@colm.net>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <iostream>
+#include <fstream>
+#include <string.h>
+
+#include "global.h"
+#include "lmscan.h"
+#include "lmparse.h"
+#include "parsedata.h"
+#include "avltree.h"
+#include "vector.h"
+
+//#define PRINT_TOKENS
+
+using std::ifstream;
+using std::istream;
+using std::ostream;
+using std::cout;
+using std::cerr;
+using std::endl;
+
+%%{
+	machine section_parse;
+	alphtype int;
+	write data;
+}%%
+
+void ColmScanner::sectionParseInit()
+{	/* Put the section_parse machine into its start state. The write init
+	 * directive below is expanded by Ragel into the initialisation code
+	 * for the machine declared just above in this file. */
+	%% write init;
+}
+
+/*
+ * Count one more scan error and hand back the error stream, already primed
+ * with a "file:line:column: " prefix, so the caller can append the message.
+ */
+ostream &ColmScanner::scan_error()
+{
+	++gblErrorCount;
+	return cerr << fileName << ":" << line << ":" << column << ": ";
+}
+
+/*
+ * Report whether inclFileName already appears on the include stack. Names
+ * are compared with strcmp, so only an identical spelling counts as a match.
+ */
+bool ColmScanner::recursiveInclude( const char *inclFileName )
+{
+	bool onStack = false;
+	IncludeStack::Iter entry = includeStack;
+	while ( entry.lte() && !onStack ) {
+		if ( strcmp( entry->fileName, inclFileName ) == 0 )
+			onStack = true;
+		else
+			entry++;
+	}
+	return onStack;
+}
+
+/*
+ * Advance the column counter over the text just matched and forget the last
+ * newline position. The distance is measured up to te, starting at the most
+ * recent newline (lastnl) when one was recorded, otherwise at the token
+ * start ts.
+ */
+void ColmScanner::updateCol()
+{
+	char *measureFrom = ( lastnl != 0 ) ? lastnl : ts;
+	column += te - measureFrom;
+	lastnl = 0;
+}
+
+/*
+ * Send a token whose text is the single character c. The text is passed on
+ * as the one-byte range that starts at the local copy of c.
+ */
+void ColmScanner::token( int type, char c )
+{
+	char *first = &c;
+	token( type, first, first + 1 );
+}
+
+/*
+ * Send a token that carries no text: both ends of the data range are null.
+ */
+void ColmScanner::token( int type )
+{
+	char *noData = 0;
+	token( type, noData, noData );
+}
+
+/*
+ * True when path begins with a forward slash. Only the first character is
+ * examined, so the empty string is not absolute. path must not be null.
+ */
+bool isAbsolutePath( const char *path )
+{
+	const char leading = *path;
+	return leading == '/';
+}
+
+/*
+ * Try each candidate path in turn and return a stream for the first one that
+ * opens.
+ *
+ * pathChecks: null-terminated array of candidate file names.
+ * found:      set to the index of the path that opened, or to -1 if none did.
+ *
+ * Returns a heap-allocated ifstream that the caller must delete, or 0 when no
+ * candidate could be opened.
+ */
+ifstream *ColmScanner::tryOpenInclude( char **pathChecks, long &found )
+{
+	ifstream *inFile = new ifstream;
+
+	for ( char **check = pathChecks; *check != 0; check++ ) {
+		/* A failed open leaves failbit set on the stream. Standard libraries
+		 * that predate C++11 do not reset it on a later successful open,
+		 * which would make the file that was found read as empty, so clear
+		 * the state before every attempt. */
+		inFile->clear();
+		inFile->open( *check );
+		if ( inFile->is_open() ) {
+			found = check - pathChecks;
+			return inFile;
+		}
+	}
+
+	found = -1;
+	delete inFile;
+	return 0;
+}
+
+/*
+ * Build the list of file names to try when resolving an include.
+ *
+ * thisFileName: name of the file that contains the include statement.
+ * fileName:     the include target as written.
+ *
+ * An absolute target yields just that name. Otherwise the candidates are the
+ * target relative to the directory part of thisFileName (or the bare target
+ * when thisFileName has no directory part), followed by the target under each
+ * entry of includePaths, in order.
+ *
+ * Returns an array allocated with new[] and terminated by a null pointer. An
+ * entry that is the bare target is a strdup copy; every composed path is
+ * allocated with new[]. Nothing in this function frees them.
+ */
+char **ColmScanner::makeIncludePathChecks( const char *thisFileName, const char *fileName )
+{
+	char **checks = 0;
+	long nextCheck = 0;
+	long length = strlen(fileName);
+
+	if ( isAbsolutePath( fileName ) ) {
+		/* Absolute path: it is the only candidate. */
+		checks = new char*[2];
+		checks[nextCheck++] = strdup(fileName);
+	}
+	else {
+		/* Search from the location of the current file. */
+		checks = new char *[2 + includePaths.length()];
+		const char *lastSlash = strrchr( thisFileName, '/' );
+		if ( lastSlash == 0 ) {
+			/* The current file has no directory part, so the target is
+			 * tried as written. The copy is made only here, where it is
+			 * actually handed out, so it is never left unreferenced. */
+			checks[nextCheck++] = strdup(fileName);
+		}
+		else {
+			/* Directory of the current file (including the slash)
+			 * followed by the target. */
+			long givenPathLen = (lastSlash - thisFileName) + 1;
+			long checklen = givenPathLen + length;
+			char *check = new char[checklen+1];
+			memcpy( check, thisFileName, givenPathLen );
+			memcpy( check+givenPathLen, fileName, length );
+			check[checklen] = 0;
+			checks[nextCheck++] = check;
+		}
+
+		/* Search from the include paths given on the command line. */
+		for ( ArgsVector::Iter incp = includePaths; incp.lte(); incp++ ) {
+			long pathLen = strlen( *incp );
+			long checkLen = pathLen + 1 + length;
+			char *check = new char[checkLen+1];
+			memcpy( check, *incp, pathLen );
+			check[pathLen] = '/';
+			memcpy( check+pathLen+1, fileName, length );
+			check[checkLen] = 0;
+			checks[nextCheck++] = check;
+		}
+	}
+
+	/* Null terminator expected by tryOpenInclude. */
+	checks[nextCheck] = 0;
+	return checks;
+}
+
+
+%%{
+	machine section_parse;
+	import "lmparse.h";
+
+	# Clear the captured word and literal before an include target is read.
+	action clear_words { word = lit = 0; word_len = lit_len = 0; }
+
+	# Capture the text of the token just received as the include literal.
+	action store_lit { lit = tokdata; lit_len = toklen; }
+
+	action mach_err { scan_error() << "bad machine statement" << endl; }
+	action incl_err { scan_error() << "bad include statement" << endl; }
+	action write_err { scan_error() << "bad write statement" << endl; }
+
+	# Resolve the file named by an include statement and scan it with a nested
+	# scanner that feeds the same parser.
+	action handle_include
+	{
+		String src( lit, lit_len );
+		String fileName;
+		bool unused;
+
+		/* Need a location. The local fileName declared above shadows the
+		 * member and is still empty at this point, so the member is named
+		 * explicitly to record the file this include statement is in. */
+		InputLoc here;
+		here.fileName = this->fileName;
+		here.line = line;
+		here.col = column;
+
+		/* Presumably fills the local fileName from the literal text in
+		 * src; it is used as the include target from here on. */
+		prepareLitString( fileName, unused, src, here );
+
+		/* NOTE(review): neither the checks array nor its entries are freed
+		 * anywhere in this action. */
+		char **checks = makeIncludePathChecks( this->fileName, fileName );
+
+		/* Open the input file for reading. */
+		long found = 0;
+		ifstream *inFile = tryOpenInclude( checks, found );
+		if ( inFile == 0 ) {
+			scan_error() << "include: could not open " <<
+					fileName << " for reading" << endl;
+		}
+		else if ( recursiveInclude( checks[found] ) ) {
+			/* The file is already on the include stack. Report it and do
+			 * not scan it: carrying on would scan the same file again,
+			 * recursively. */
+			scan_error() << "include: this is a recursive include operation" << endl;
+			delete inFile;
+		}
+		else {
+			/* Keep the file on the include stack for the duration of the
+			 * nested scan so that includes inside it can be checked. */
+			includeStack.append( IncludeStackItem( checks[found] ) );
+
+			ColmScanner *scanner = new ColmScanner( fileName, *inFile, parser, includeDepth+1 );
+			scanner->scan();
+			delete inFile;
+
+			/* Remove the last element (len-1) */
+			includeStack.remove( -1 );
+
+			delete scanner;
+		}
+	}
+
+	include_target = 
+		TK_Literal >clear_words @store_lit;
+
+	include_stmt =
+		( KW_Include include_target ) @handle_include
+		<>err incl_err <>eof incl_err;
+
+	# Pass a token that is not part of an include statement on to the parser,
+	# together with the location it was seen at.
+	action handle_token
+	{
+//	cout << Parser_lelNames[type] << " ";
+//	if ( start != 0 ) {
+//		cout.write( start, end-start );
+//	}
+//	cout << endl;
+
+		InputLoc loc;
+
+		#ifdef PRINT_TOKENS
+		cerr << "scanner:" << line << ":" << column << 
+				": sending token to the parser " << Parser_lelNames[*p];
+		cerr << " " << toklen;
+		if ( tokdata != 0 )
+			cerr << " " << tokdata;
+		cerr << endl;
+		#endif
+
+		loc.fileName = fileName;
+		loc.line = line;
+		loc.col = column;
+
+		/* The line counter has already been bumped for a token whose text
+		 * ends in a newline, so report the line the newline was on. The
+		 * length check keeps the index in range for an empty text. */
+		if ( tokdata != 0 && toklen > 0 && tokdata[toklen-1] == '\n' )
+			loc.line -= 1;
+
+		parser->token( loc, type, tokdata, toklen );
+	}
+
+	# Catch everything else.
+	everything_else = ^( KW_Include ) @handle_token;
+
+	main := ( 
+		include_stmt |
+		everything_else
+	)*;
+}%%
+
+void ColmScanner::token( int type, char *start, char *end )
+{	/* Run one token through the section_parse machine, then advance the
+	 * column. type is the token id; start and end delimit its text, and a
+	 * null start means the token has no text.
+	 * NOTE(review): the copy made in tokdata is not released here;
+	 * presumably ownership passes to the parser - confirm. */
+	char *tokdata = 0;
+	int toklen = 0;
+	int *p = &type;	/* the machine input is the single int held in type */
+	int *pe = &type + 1;
+	int *eof = 0;	/* null, so p never equals eof during this run */
+
+	if ( start != 0 ) {	/* take a null-terminated copy of the token text */
+		toklen = end-start;
+		tokdata = new char[toklen+1];
+		memcpy( tokdata, start, toklen );
+		tokdata[toklen] = 0;
+	}
+
+	%%{
+		machine section_parse;
+		write exec;
+	}%%
+
+	updateCol();
+}
+
+void ColmScanner::endSection( )
+{
+	/* Meant to execute the eof actions for the section parser. Nothing is
+	 * implemented yet, so calling this currently does nothing. */
+	/* Probably use: token( -1 ); */
+}
+
+%%{
+	machine lmscan;
+
+	# This is sent by the driver code.
+	EOF = 0;
+	
+	action inc_nl { 
+		lastnl = p; 
+		column = 0;
+		line++;
+	}
+	NL = '\n' @inc_nl;
+
+	# Identifiers, numbers, commetns, and other common things.
+	ident = ( alpha | '_' ) ( alpha |digit |'_' )*;
+	number = digit+;
+	hex_number = '0x' [0-9a-fA-F]+;
+
+	# These literal forms are common to C-like host code and ragel.
+	s_literal = "'" ([^'\\] | NL | '\\' (any | NL))* "'";
+	d_literal = '"' ([^"\\] | NL | '\\' (any | NL))* '"';
+
+	whitespace = [ \t] | NL;
+	pound_comment = '#' [^\n]* NL;
+
+	or_literal := |*
+		# Escape sequences in OR expressions.
+		'\\0' => { token( TK_ReChar, '\0' ); /* each escape is sent as the character it names */ };
+		'\\a' => { token( TK_ReChar, '\a' ); };
+		'\\b' => { token( TK_ReChar, '\b' ); };
+		'\\t' => { token( TK_ReChar, '\t' ); };
+		'\\n' => { token( TK_ReChar, '\n' ); };
+		'\\v' => { token( TK_ReChar, '\v' ); };
+		'\\f' => { token( TK_ReChar, '\f' ); };
+		'\\r' => { token( TK_ReChar, '\r' ); };
+		'\\\n' => { updateCol(); /* backslash-newline: no token, only the column is updated */ };
+		'\\' any => { token( TK_ReChar, ts+1, te ); /* any other escape: send the character after the backslash */ };
+
+		# Range dash in an OR expression.
+		'-' => { token( TK_Dash, 0, 0 ); };
+
+		# Terminate an OR expression.
+		']'	=> { token( TK_SqClose ); fret; /* go back to the scanner that called this one */ };
+
+		EOF => {	/* the driver appends a 0 byte once the input is exhausted */
+			scan_error() << "unterminated OR literal" << endl;
+		};
+
+		# Characters in an OR expression.
+		[^\]] => { token( TK_ReChar, ts, te ); };
+
+	*|;
+
+	regular_type := |*
+		# Identifiers.
+		ident => { token( TK_Word, ts, te ); } ;
+
+		# Numbers
+		number => { token( TK_UInt, ts, te ); };
+		hex_number => { token( TK_Hex, ts, te ); };
+
+		# Literals, with optionals.
+		( s_literal | d_literal ) [i]? 
+			=> { token( TK_Literal, ts, te ); /* the text keeps its quotes and any trailing i */ };
+
+		'[' => { token( TK_SqOpen ); fcall or_literal; /* the set body is scanned by or_literal */ };
+		'[^' => { token( TK_SqOpenNeg ); fcall or_literal; };
+
+		'/' => { token( '/'); fret; /* a slash ends this scanner and returns to its caller */ };
+
+		# Ignore.
+		pound_comment => { updateCol(); };
+
+		'..' => { token( TK_DotDot ); };
+		'**' => { token( TK_StarStar ); };
+		'--' => { token( TK_DashDash ); };
+
+		':>'  => { token( TK_ColonGt ); };
+		':>>' => { token( TK_ColonGtGt ); };
+		'<:'  => { token( TK_LtColon ); };
+
+		# Whitespace other than newline.
+		[ \t\r]+ => { updateCol(); };
+
+		# If we are in a single line machine then newline may end the spec.
+		NL => { updateCol(); };
+
+		# Consume eof.
+		EOF;
+
+		any => { token( *ts ); /* any other character is sent with itself as the token type */ } ;
+	*|;
+
+	literal_pattern := |*
+		'\\' '0' { litBuf.append( '\0' ); /* recognised escapes add the character they name to litBuf */ };
+		'\\' 'a' { litBuf.append( '\a' ); };
+		'\\' 'b' { litBuf.append( '\b' ); };
+		'\\' 't' { litBuf.append( '\t' ); };
+		'\\' 'n' { litBuf.append( '\n' ); };
+		'\\' 'v' { litBuf.append( '\v' ); };
+		'\\' 'f' { litBuf.append( '\f' ); };
+		'\\' 'r' { litBuf.append( '\r' ); };
+
+		'\\' any {
+			litBuf.append( ts[1] );	/* any other escape: keep the character after the backslash */
+		};
+		'"' => {	/* closing quote: send buffered text if any, then the quote, then return to the caller */
+			if ( litBuf.length > 0 ) {
+				token( TK_LitPat, litBuf.data, litBuf.data+litBuf.length );
+				litBuf.clear();
+			}
+			token( '"' );
+			fret;
+		};
+		NL => {	/* a newline also ends the literal; it is sent as the last character of the text */
+			litBuf.append( '\n' );
+			token( TK_LitPat, litBuf.data, litBuf.data+litBuf.length );
+			litBuf.clear();
+			token( '"' );
+			fret;
+		};
+		'[' => { /* send buffered text if any, then scan the bracketed part with main */
+			if ( litBuf.length > 0 ) {
+				token( TK_LitPat, litBuf.data, litBuf.data+litBuf.length );
+				litBuf.clear();
+			}
+			token( '[' );
+			fcall main;
+		};
+		any => { /* ordinary character: add it to the buffered text */
+			litBuf.append( *ts );
+		};
+	*|;
+
+	# Parser definitions. This is the top-level scanner. When two patterns
+	# match the same text the one listed first wins, which is why the keywords
+	# come before the general identifier pattern.
+	main := |*
+		'lex' => { token( KW_Lex ); };
+		'commit' => { token( KW_Commit ); };
+		'token' => { token( KW_Token ); };
+		'literal' => { token( KW_Literal ); };
+		'rl' => { token( KW_Rl ); };
+		'def' => { token( KW_Def ); };
+		'ignore' => { token( KW_Ignore ); };
+		'construct' => { token( KW_Construct ); };
+		'cons' => { token( KW_Construct ); };
+		'new' => { token( KW_New ); };
+		'if' => { token( KW_If ); };
+		'reject' => { token( KW_Reject ); };
+		'while' => { token( KW_While ); };
+		'else' => { token( KW_Else ); };
+		'elsif' => { token( KW_Elsif ); };
+		'match' => { token( KW_Match ); };
+		'for' => { token( KW_For ); };
+		'iter' => { token( KW_Iter ); };
+		'prints' => { token( KW_PrintStream ); };
+		'print' => { token( KW_Print ); };
+		'print_xml_ac' => { token( KW_PrintXMLAC ); };
+		'print_xml' => { token( KW_PrintXML ); };
+		'namespace' => { token( KW_Namespace ); };
+		'end' => { token( KW_End ); };
+		'map' => { token( KW_Map ); };
+		'list' => { token( KW_List ); };
+		'vector' => { token( KW_Vector ); };
+		'accum' => { token( KW_Parser ); };
+		'parser' => { token( KW_Parser ); };
+		'return' => { token( KW_Return ); };
+		'break' => { token( KW_Break ); };
+		'yield' => { token( KW_Yield ); };
+		'typeid' => { token( KW_TypeId ); };
+		'make_token' => { token( KW_MakeToken ); };
+		'make_tree' => { token( KW_MakeTree ); };
+		'reducefirst' => { token( KW_ReduceFirst ); };
+		'in' => { token( KW_In ); };
+		'nil' => { token( KW_Nil ); };
+		'true' => { token( KW_True ); };
+		'false' => { token( KW_False ); };
+		'parse' => { token( KW_Parse ); };
+		'parse_stop' => { token( KW_ParseStop ); };
+		'global' => { token( KW_Global ); };
+		'export' => { token( KW_Export ); };
+		'ptr' => { token( KW_Ptr ); };
+		'ref' => { token( KW_Ref ); };
+		'deref' => { token( KW_Deref ); };
+		'require' => { token( KW_Require ); };
+		'preeof' => { token( KW_Preeof ); };
+		'left' => { token( KW_Left ); };
+		'right' => { token( KW_Right ); };
+		'nonassoc' => { token( KW_Nonassoc ); };
+		'prec' => { token( KW_Prec ); };
+		'include' => { token( KW_Include ); };
+		'context' => { token( KW_Context ); };
+		'alias' => { token( KW_Alias ); };
+		'send' => { token( KW_Send ); };
+		'ni' => { token( KW_Ni ); };
+
+		# Identifiers.
+		ident => { token( TK_Word, ts, te ); } ;
+
+		number => { token( TK_Number, ts, te ); };
+
+		# A slash is always sent as a token. When the parser has asked for it
+		# (enterRl), what follows is scanned by regular_type until that
+		# scanner returns.
+		'/' => { 
+			token( '/' ); 
+			if ( parser->enterRl )
+				fcall regular_type;
+		};
+
+		# A tilde starts a literal that runs to the end of the line. It is
+		# sent as a quote, the text after the tilde (newline included), and
+		# another quote.
+		"~" [^\n]* NL => { 
+			token( '"' );
+			token( TK_LitPat, ts+1, te );
+			token( '"' );
+		};
+
+		# Single-quoted literal, ended by the closing quote or by a newline.
+		"'" ([^'\\\n] | '\\' (any | NL))* ( "'" | NL ) => {
+			token( TK_Literal, ts, te );
+		};
+
+		# A double quote opens a literal pattern; its body is scanned by
+		# literal_pattern, with the text collected in litBuf.
+		'"' => { 
+			token( '"' );
+			litBuf.clear(); 
+			fcall literal_pattern;
+		};
+
+		# An opening bracket starts a nested run of this same scanner.
+		'[' => { 
+			token( '[' ); 
+			fcall main;
+		};
+
+		# A closing bracket ends a nested run. At the outermost level (empty
+		# call stack) it is only sent as a token.
+		']' => {
+			token( ']' );
+			if ( top > 0 )
+				fret;
+		};
+
+		# Ignore.
+		pound_comment => { updateCol(); };
+
+		'=>' => { token( TK_DoubleArrow ); };
+		'==' => { token( TK_DoubleEql ); };
+		'!=' => { token( TK_NotEql ); };
+		'::' => { token( TK_DoubleColon ); };
+		'<=' => { token( TK_LessEql ); };
+		'>=' => { token( TK_GrtrEql ); };
+		'->' => { token( TK_RightArrow ); };
+		'&&' => { token( TK_AmpAmp ); };
+		'||' => { token( TK_BarBar ); };
+		'<<' => { token( TK_LtLt ); };
+		
+		# The slash is not listed here: it is always taken by the earlier
+		# slash pattern.
+		( '+' | '-' | '*' | '(' | ')' | '@' | '$' | '^' ) => { token( *ts ); };
+
+
+		# Whitespace other than newline.
+		[ \t\r]+ => { updateCol(); };
+		NL => { updateCol(); };
+
+		# Consume eof.
+		EOF;
+
+		# Any other character is sent with itself as the token type.
+		any => { token( *ts ); } ;
+	*|;
+}%%
+
+%% write data;
+
+/*
+ * Run the lmscan machine over the whole of the input stream.
+ *
+ * Input is read in chunks into a heap buffer that starts at 8 bytes and is
+ * doubled whenever a single token fills it. After each run of the machine,
+ * the text of a token still in progress (ts is set) is moved to the front of
+ * the buffer and the next read is appended after it. When a read returns no
+ * data a single 0 byte is fed to the machine as the end-of-input marker and
+ * the loop ends.
+ *
+ * A machine failure is reported through scan_error and ends the process.
+ *
+ * The locals cs, act, top, stack, p, pe and eof are referenced by the code
+ * Ragel generates for the write directives, so their names must not change.
+ */
+void ColmScanner::scan()
+{
+	int bufsize = 8;
+	char *buf = new char[bufsize];
+	const char last_char = 0;
+	int cs, act, have = 0;
+	int top, stack[32];
+	bool execute = true;
+
+	sectionParseInit();
+	%% write init;
+
+	while ( execute ) {
+		char *p = buf + have;
+		int space = bufsize - have;
+
+		if ( space == 0 ) {
+			/* We filled up the buffer trying to scan a token. Grow it. */
+			bufsize = bufsize * 2;
+			char *newbuf = new char[bufsize];
+
+			/* Recompute p and space. */
+			p = newbuf + have;
+			space = bufsize - have;
+
+			/* Patch up pointers possibly in use. lastnl also points into
+			 * the buffer and is read by updateCol, so it has to follow
+			 * the data or the column would be computed from a pointer
+			 * into the freed buffer. */
+			if ( ts != 0 )
+				ts = newbuf + ( ts - buf );
+			te = newbuf + ( te - buf );
+			if ( lastnl != 0 )
+				lastnl = newbuf + ( lastnl - buf );
+
+			/* Copy the new buffer in. */
+			memcpy( newbuf, buf, have );
+			delete[] buf;
+			buf = newbuf;
+		}
+
+		input.read( p, space );
+		int len = input.gcount();
+
+		/* If we see eof then append the EOF char. */
+		if ( len == 0 ) {
+			p[0] = last_char, len = 1;
+			execute = false;
+		}
+
+		char *pe = p + len;
+		char *eof = 0;
+		%% write exec;
+
+		/* Check if we failed. */
+		if ( cs == lmscan_error ) {
+			/* Machine failed before finding a token. I'm not yet sure if this
+			 * is reachable. */
+			scan_error() << "colm scanner error (metalanguage)" << endl;
+			exit(1);
+		}
+
+		/* Decide if we need to preserve anything. */
+		char *preserve = ts;
+
+		/* Now set up the prefix. */
+		if ( preserve == 0 )
+			have = 0;
+		else {
+			/* There is data that needs to be shifted over. Every pointer
+			 * into the buffer moves back by the same distance, lastnl
+			 * included, so that the column arithmetic stays consistent. */
+			have = pe - preserve;
+			memmove( buf, preserve, have );
+			unsigned int shiftback = preserve - buf;
+			if ( ts != 0 )
+				ts -= shiftback;
+			te -= shiftback;
+			if ( lastnl != 0 )
+				lastnl -= shiftback;
+		}
+	}
+	delete[] buf;
+}
+
+/*
+ * Tell the parser that input has ended by sending it the ColmParser_tk_eof
+ * token. The location uses the placeholder file name "<EOF>", the current
+ * line and column 1; the token carries no text.
+ */
+void ColmScanner::eof()
+{
+	InputLoc eofLoc;
+	eofLoc.col = 1;
+	eofLoc.line = line;
+	eofLoc.fileName = "<EOF>";
+	parser->token( eofLoc, ColmParser_tk_eof, 0, 0 );
+}
author	Adrian Thurston <thurston@colm.net>	2020-03-14 15:29:52 +0200
committer	Adrian Thurston <thurston@colm.net>	2020-03-14 15:29:52 +0200
commit	f653735830d537715f2885bd832cf04851d35401 (patch)
tree	95e6551e39407543366d4f49aedf7b78c6e8bbe1 /src/lmscan.rl
parent	bcc54d5df10cf425e7134b06f70d7ffe1abee4e4 (diff)
download	colm-f653735830d537715f2885bd832cf04851d35401.tar.gz