summary refs log tree commit diff
path: root/src/lmscan.rl
diff options
context:
space:
mode:
author Adrian Thurston <thurston@colm.net> 2020-03-14 15:29:52 +0200
committer Adrian Thurston <thurston@colm.net> 2020-03-14 15:29:52 +0200
commit f653735830d537715f2885bd832cf04851d35401 (patch)
tree 95e6551e39407543366d4f49aedf7b78c6e8bbe1 /src/lmscan.rl
parent bcc54d5df10cf425e7134b06f70d7ffe1abee4e4 (diff)
download colm-f653735830d537715f2885bd832cf04851d35401.tar.gz
moved source files into commit repository
Diffstat (limited to 'src/lmscan.rl')
-rw-r--r-- src/lmscan.rl 637
1 files changed, 637 insertions, 0 deletions
diff --git a/src/lmscan.rl b/src/lmscan.rl
new file mode 100644
index 00000000..231e2689
--- /dev/null
+++ b/src/lmscan.rl
@@ -0,0 +1,637 @@
+/*
+ * Copyright 2006-2012 Adrian Thurston <thurston@colm.net>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <iostream>
+#include <fstream>
+#include <string.h>
+
+#include "global.h"
+#include "lmscan.h"
+#include "lmparse.h"
+#include "parsedata.h"
+#include "avltree.h"
+#include "vector.h"
+
+//#define PRINT_TOKENS
+
+using std::ifstream;
+using std::istream;
+using std::ostream;
+using std::cout;
+using std::cerr;
+using std::endl;
+
+%%{
+ machine section_parse;
+ alphtype int;
+ write data;
+}%%
+
+void ColmScanner::sectionParseInit()
+{
+ %% write init;
+}
+
+/*
+ * Count one more error in gblErrorCount and write the "file:line:column: "
+ * prefix to cerr. The stream is returned so the caller can append the
+ * message text.
+ */
+ostream &ColmScanner::scan_error()
+{
+	++gblErrorCount;
+
+	ostream &out = cerr;
+	out << fileName << ":" << line << ":" << column << ": ";
+	return out;
+}
+
+/*
+ * Report whether inclFileName matches the file name of any entry currently
+ * on includeStack. Names are compared with strcmp, so the match is on the
+ * exact spelling of the path.
+ */
+bool ColmScanner::recursiveInclude( const char *inclFileName )
+{
+	IncludeStack::Iter entry = includeStack;
+	while ( entry.lte() ) {
+		const bool sameFile = strcmp( entry->fileName, inclFileName ) == 0;
+		if ( sameFile )
+			return true;
+		entry++;
+	}
+	return false;
+}
+
+/*
+ * Advance the column counter past the token that was just matched. When a
+ * newline was recorded inside the token (lastnl is set) only the characters
+ * after that newline are counted, otherwise the whole token from ts to te
+ * is. The newline marker is cleared afterwards.
+ */
+void ColmScanner::updateCol()
+{
+	char *countFrom = ( lastnl != 0 ) ? lastnl : ts;
+	column += te - countFrom;
+	lastnl = 0;
+}
+
+/* Send a token whose text is the single character c. */
+void ColmScanner::token( int type, char c )
+{
+	char *first = &c;
+	token( type, first, first + 1 );
+}
+
+/* Send a token that carries no text of its own. */
+void ColmScanner::token( int type )
+{
+	char *noText = 0;
+	token( type, noText, noText );
+}
+
+/* A path counts as absolute when its first character is a forward slash. */
+bool isAbsolutePath( const char *path )
+{
+	const char leading = *path;
+	return leading == '/';
+}
+
+/*
+ * Try each path in the null-terminated array pathChecks, in order, and stop
+ * at the first one that can be opened.
+ *
+ * On success the open stream is returned (allocated with new, so the caller
+ * deletes it) and found receives the index of the path that opened. When no
+ * path opens, 0 is returned and found is set to -1.
+ */
+ifstream *ColmScanner::tryOpenInclude( char **pathChecks, long &found )
+{
+	ifstream *stream = new ifstream;
+
+	for ( char **candidate = pathChecks; *candidate != 0; candidate++ ) {
+		stream->open( *candidate );
+		if ( stream->is_open() ) {
+			found = candidate - pathChecks;
+			return stream;
+		}
+	}
+
+	/* Nothing could be opened. */
+	delete stream;
+	found = -1;
+	return 0;
+}
+
+/*
+ * Build the list of paths to try when opening an included file.
+ *
+ * thisFileName is the file containing the include statement and fileName is
+ * the name written in that statement. An absolute fileName is the only
+ * candidate. Otherwise the candidates are, in order: fileName relative to
+ * the directory of thisFileName (or fileName as given when thisFileName has
+ * no directory part), then fileName under each entry of includePaths.
+ *
+ * Returns a null-terminated array allocated with new[]. Entries that are a
+ * plain copy of fileName come from strdup, composed paths come from new[].
+ * Nothing in this function releases the array or its entries.
+ */
+char **ColmScanner::makeIncludePathChecks( const char *thisFileName, const char *fileName )
+{
+	char **checks = 0;
+	long nextCheck = 0;
+	long length = strlen(fileName);
+
+	/* Absolute path? */
+	if ( isAbsolutePath( fileName ) ) {
+		checks = new char*[2];
+		checks[nextCheck++] = strdup(fileName);
+	}
+	else {
+		/* Search from the location of the current file. One slot for that
+		 * candidate, one per include path and one for the terminator. */
+		checks = new char *[2 + includePaths.length()];
+		const char *lastSlash = strrchr( thisFileName, '/' );
+		if ( lastSlash == 0 ) {
+			/* Duplicate only when the copy is actually stored, so no copy
+			 * is left unreferenced on the other paths through here. */
+			checks[nextCheck++] = strdup(fileName);
+		}
+		else {
+			/* Keep the directory part including its trailing slash. */
+			long givenPathLen = (lastSlash - thisFileName) + 1;
+			long checklen = givenPathLen + length;
+			char *check = new char[checklen+1];
+			memcpy( check, thisFileName, givenPathLen );
+			memcpy( check+givenPathLen, fileName, length );
+			check[checklen] = 0;
+			checks[nextCheck++] = check;
+		}
+
+		/* Search from the include paths given on the command line. */
+		for ( ArgsVector::Iter incp = includePaths; incp.lte(); incp++ ) {
+			long pathLen = strlen( *incp );
+			long checkLen = pathLen + 1 + length;
+			char *check = new char[checkLen+1];
+			memcpy( check, *incp, pathLen );
+			check[pathLen] = '/';
+			memcpy( check+pathLen+1, fileName, length );
+			check[checkLen] = 0;
+			checks[nextCheck++] = check;
+		}
+	}
+
+	checks[nextCheck] = 0;
+	return checks;
+}
+
+
+%%{
+	# Token-level machine run by token() over each token id. It intercepts
+	# include statements and forwards every other token to the parser.
+	machine section_parse;
+	import "lmparse.h";
+
+	action clear_words { word = lit = 0; word_len = lit_len = 0; }
+	action store_lit { lit = tokdata; lit_len = toklen; }
+
+	action mach_err { scan_error() << "bad machine statement" << endl; }
+	action incl_err { scan_error() << "bad include statement" << endl; }
+	action write_err { scan_error() << "bad write statement" << endl; }
+
+	# Runs when an include keyword followed by its literal has been seen.
+	# Resolves the literal to a file, scans that file with a nested scanner
+	# and sends its tokens to the same parser.
+	action handle_include
+	{
+		String src( lit, lit_len );
+		String fileName;
+		bool unused;
+
+		/* Location of the include statement. The local fileName declared
+		 * above hides the member and is still empty here, so the member
+		 * must be named explicitly. */
+		InputLoc here;
+		here.fileName = this->fileName;
+		here.line = line;
+		here.col = column;
+
+		prepareLitString( fileName, unused, src, here );
+		char **checks = makeIncludePathChecks( this->fileName, fileName );
+
+		/* Open the input file for reading. */
+		long found = 0;
+		ifstream *inFile = tryOpenInclude( checks, found );
+		if ( inFile == 0 ) {
+			scan_error() << "include: could not open " <<
+					fileName << " for reading" << endl;
+		}
+		else if ( recursiveInclude( checks[found] ) ) {
+			/* The file is already being included further up. Report it and
+			 * skip the include instead of descending into it again. */
+			scan_error() << "include: this is a recursive include operation" << endl;
+			delete inFile;
+		}
+		else {
+			/* Record the file for the duration of the nested scan so that
+			 * includes below it can be checked against it. */
+			includeStack.append( IncludeStackItem( checks[found] ) );
+
+			ColmScanner *scanner = new ColmScanner( fileName, *inFile, parser, includeDepth+1 );
+			scanner->scan();
+			delete inFile;
+
+			/* Remove the last element (len-1) */
+			includeStack.remove( -1 );
+
+			delete scanner;
+		}
+	}
+
+	include_target =
+		TK_Literal >clear_words @store_lit;
+
+	include_stmt =
+		( KW_Include include_target ) @handle_include
+		<>err incl_err <>eof incl_err;
+
+	# Forwards one token, with its source location, to the parser.
+	action handle_token
+	{
+		InputLoc loc;
+
+		#ifdef PRINT_TOKENS
+		cerr << "scanner:" << line << ":" << column <<
+				": sending token to the parser " << Parser_lelNames[*p];
+		cerr << " " << toklen;
+		if ( tokdata != 0 )
+			cerr << " " << tokdata;
+		cerr << endl;
+		#endif
+
+		loc.fileName = fileName;
+		loc.line = line;
+		loc.col = column;
+
+		/* A token ending in a newline has already bumped the line counter,
+		 * so report it on the line where it started. The length check keeps
+		 * the index in range for an empty token. */
+		if ( tokdata != 0 && toklen > 0 && tokdata[toklen-1] == '\n' )
+			loc.line -= 1;
+
+		parser->token( loc, type, tokdata, toklen );
+	}
+
+	# Catch everything else.
+	everything_else = ^( KW_Include ) @handle_token;
+
+	main := (
+		include_stmt |
+		everything_else
+	)*;
+}%%
+
+/*
+ * Run the section_parse machine over one token. When start is non-null the
+ * text from start up to (not including) end is copied into a freshly
+ * allocated, null-terminated buffer that the machine actions see as tokdata
+ * and toklen. The column counter is advanced afterwards.
+ */
+void ColmScanner::token( int type, char *start, char *end )
+{
+	/* The generated code reads these locals by name. The input is the
+	 * single token id held in type. */
+	int *p = &type;
+	int *pe = p + 1;
+	int *eof = 0;
+
+	char *tokdata = 0;
+	int toklen = 0;
+	if ( start != 0 ) {
+		toklen = end - start;
+		tokdata = new char[toklen + 1];
+		memcpy( tokdata, start, toklen );
+		tokdata[toklen] = 0;
+	}
+
+	%%{
+		machine section_parse;
+		write exec;
+	}%%
+
+	updateCol();
+}
+
+void ColmScanner::endSection( )
+{
+ /* TODO: execute the eof actions for the section parser; this is currently a no-op. */
+ /* Probably use: token( -1 ); */
+}
+
+%%{
+	# Character-level scanner. It splits the input into tokens and hands
+	# each one to token().
+	machine lmscan;
+
+	# This is sent by the driver code.
+	EOF = 0;
+
+	# fcall pushes onto the fixed-size stack array declared by the function
+	# that hosts this machine. Stop with an error instead of writing past
+	# the end of that array when constructs are nested too deeply.
+	prepush {
+		if ( top == static_cast<int>( sizeof(stack) / sizeof(stack[0]) ) ) {
+			scan_error() << "nesting is too deep" << endl;
+			exit(1);
+		}
+	}
+
+	action inc_nl {
+		lastnl = p;
+		column = 0;
+		line++;
+	}
+	NL = '\n' @inc_nl;
+
+	# Identifiers, numbers, comments, and other common things.
+	ident = ( alpha | '_' ) ( alpha |digit |'_' )*;
+	number = digit+;
+	hex_number = '0x' [0-9a-fA-F]+;
+
+	# These literal forms are common to C-like host code and ragel.
+	s_literal = "'" ([^'\\] | NL | '\\' (any | NL))* "'";
+	d_literal = '"' ([^"\\] | NL | '\\' (any | NL))* '"';
+
+	whitespace = [ \t] | NL;
+	pound_comment = '#' [^\n]* NL;
+
+	or_literal := |*
+		# Escape sequences in OR expressions.
+		'\\0' => { token( TK_ReChar, '\0' ); };
+		'\\a' => { token( TK_ReChar, '\a' ); };
+		'\\b' => { token( TK_ReChar, '\b' ); };
+		'\\t' => { token( TK_ReChar, '\t' ); };
+		'\\n' => { token( TK_ReChar, '\n' ); };
+		'\\v' => { token( TK_ReChar, '\v' ); };
+		'\\f' => { token( TK_ReChar, '\f' ); };
+		'\\r' => { token( TK_ReChar, '\r' ); };
+		'\\\n' => { updateCol(); };
+		'\\' any => { token( TK_ReChar, ts+1, te ); };
+
+		# Range dash in an OR expression.
+		'-' => { token( TK_Dash, 0, 0 ); };
+
+		# Terminate an OR expression.
+		']' => { token( TK_SqClose ); fret; };
+
+		EOF => {
+			scan_error() << "unterminated OR literal" << endl;
+		};
+
+		# Characters in an OR expression.
+		[^\]] => { token( TK_ReChar, ts, te ); };
+
+	*|;
+
+	regular_type := |*
+		# Identifiers.
+		ident => { token( TK_Word, ts, te ); } ;
+
+		# Numbers
+		number => { token( TK_UInt, ts, te ); };
+		hex_number => { token( TK_Hex, ts, te ); };
+
+		# Literals, with optionals.
+		( s_literal | d_literal ) [i]?
+			=> { token( TK_Literal, ts, te ); };
+
+		'[' => { token( TK_SqOpen ); fcall or_literal; };
+		'[^' => { token( TK_SqOpenNeg ); fcall or_literal; };
+
+		'/' => { token( '/'); fret; };
+
+		# Ignore.
+		pound_comment => { updateCol(); };
+
+		'..' => { token( TK_DotDot ); };
+		'**' => { token( TK_StarStar ); };
+		'--' => { token( TK_DashDash ); };
+
+		':>' => { token( TK_ColonGt ); };
+		':>>' => { token( TK_ColonGtGt ); };
+		'<:' => { token( TK_LtColon ); };
+
+		# Whitespace other than newline.
+		[ \t\r]+ => { updateCol(); };
+
+		# If we are in a single line machine then newline may end the spec.
+		NL => { updateCol(); };
+
+		# Consume eof.
+		EOF;
+
+		any => { token( *ts ); } ;
+	*|;
+
+	literal_pattern := |*
+		'\\' '0' { litBuf.append( '\0' ); };
+		'\\' 'a' { litBuf.append( '\a' ); };
+		'\\' 'b' { litBuf.append( '\b' ); };
+		'\\' 't' { litBuf.append( '\t' ); };
+		'\\' 'n' { litBuf.append( '\n' ); };
+		'\\' 'v' { litBuf.append( '\v' ); };
+		'\\' 'f' { litBuf.append( '\f' ); };
+		'\\' 'r' { litBuf.append( '\r' ); };
+
+		'\\' any {
+			litBuf.append( ts[1] );
+		};
+		'"' => {
+			if ( litBuf.length > 0 ) {
+				token( TK_LitPat, litBuf.data, litBuf.data+litBuf.length );
+				litBuf.clear();
+			}
+			token( '"' );
+			fret;
+		};
+		NL => {
+			litBuf.append( '\n' );
+			token( TK_LitPat, litBuf.data, litBuf.data+litBuf.length );
+			litBuf.clear();
+			token( '"' );
+			fret;
+		};
+		'[' => {
+			if ( litBuf.length > 0 ) {
+				token( TK_LitPat, litBuf.data, litBuf.data+litBuf.length );
+				litBuf.clear();
+			}
+			token( '[' );
+			fcall main;
+		};
+		any => {
+			litBuf.append( *ts );
+		};
+	*|;
+
+	# Parser definitions.
+	main := |*
+		'lex' => { token( KW_Lex ); };
+		'commit' => { token( KW_Commit ); };
+		'token' => { token( KW_Token ); };
+		'literal' => { token( KW_Literal ); };
+		'rl' => { token( KW_Rl ); };
+		'def' => { token( KW_Def ); };
+		'ignore' => { token( KW_Ignore ); };
+		'construct' => { token( KW_Construct ); };
+		'cons' => { token( KW_Construct ); };
+		'new' => { token( KW_New ); };
+		'if' => { token( KW_If ); };
+		'reject' => { token( KW_Reject ); };
+		'while' => { token( KW_While ); };
+		'else' => { token( KW_Else ); };
+		'elsif' => { token( KW_Elsif ); };
+		'match' => { token( KW_Match ); };
+		'for' => { token( KW_For ); };
+		'iter' => { token( KW_Iter ); };
+		'prints' => { token( KW_PrintStream ); };
+		'print' => { token( KW_Print ); };
+		'print_xml_ac' => { token( KW_PrintXMLAC ); };
+		'print_xml' => { token( KW_PrintXML ); };
+		'namespace' => { token( KW_Namespace ); };
+		'end' => { token( KW_End ); };
+		'map' => { token( KW_Map ); };
+		'list' => { token( KW_List ); };
+		'vector' => { token( KW_Vector ); };
+		'accum' => { token( KW_Parser ); };
+		'parser' => { token( KW_Parser ); };
+		'return' => { token( KW_Return ); };
+		'break' => { token( KW_Break ); };
+		'yield' => { token( KW_Yield ); };
+		'typeid' => { token( KW_TypeId ); };
+		'make_token' => { token( KW_MakeToken ); };
+		'make_tree' => { token( KW_MakeTree ); };
+		'reducefirst' => { token( KW_ReduceFirst ); };
+		'in' => { token( KW_In ); };
+		'nil' => { token( KW_Nil ); };
+		'true' => { token( KW_True ); };
+		'false' => { token( KW_False ); };
+		'parse' => { token( KW_Parse ); };
+		'parse_stop' => { token( KW_ParseStop ); };
+		'global' => { token( KW_Global ); };
+		'export' => { token( KW_Export ); };
+		'ptr' => { token( KW_Ptr ); };
+		'ref' => { token( KW_Ref ); };
+		'deref' => { token( KW_Deref ); };
+		'require' => { token( KW_Require ); };
+		'preeof' => { token( KW_Preeof ); };
+		'left' => { token( KW_Left ); };
+		'right' => { token( KW_Right ); };
+		'nonassoc' => { token( KW_Nonassoc ); };
+		'prec' => { token( KW_Prec ); };
+		'include' => { token( KW_Include ); };
+		'context' => { token( KW_Context ); };
+		'alias' => { token( KW_Alias ); };
+		'send' => { token( KW_Send ); };
+		'ni' => { token( KW_Ni ); };
+
+		# Identifiers.
+		ident => { token( TK_Word, ts, te ); } ;
+
+		number => { token( TK_Number, ts, te ); };
+
+		'/' => {
+			token( '/' );
+			if ( parser->enterRl )
+				fcall regular_type;
+		};
+
+		"~" [^\n]* NL => {
+			token( '"' );
+			token( TK_LitPat, ts+1, te );
+			token( '"' );
+		};
+
+		"'" ([^'\\\n] | '\\' (any | NL))* ( "'" | NL ) => {
+			token( TK_Literal, ts, te );
+		};
+
+		'"' => {
+			token( '"' );
+			litBuf.clear();
+			fcall literal_pattern;
+		};
+		'[' => {
+			token( '[' );
+			fcall main;
+		};
+
+		']' => {
+			token( ']' );
+			if ( top > 0 )
+				fret;
+		};
+
+		# Ignore.
+		pound_comment => { updateCol(); };
+
+		'=>' => { token( TK_DoubleArrow ); };
+		'==' => { token( TK_DoubleEql ); };
+		'!=' => { token( TK_NotEql ); };
+		'::' => { token( TK_DoubleColon ); };
+		'<=' => { token( TK_LessEql ); };
+		'>=' => { token( TK_GrtrEql ); };
+		'->' => { token( TK_RightArrow ); };
+		'&&' => { token( TK_AmpAmp ); };
+		'||' => { token( TK_BarBar ); };
+		'<<' => { token( TK_LtLt ); };
+
+		( '+' | '-' | '*' | '/' | '(' | ')' | '@' | '$' | '^' ) => { token( *ts ); };
+
+
+		# Whitespace other than newline.
+		[ \t\r]+ => { updateCol(); };
+		NL => { updateCol(); };
+
+		# Consume eof.
+		EOF;
+
+		any => { token( *ts ); } ;
+	*|;
+}%%
+
+%% write data;
+
+/*
+ * Read the whole input stream and run the lmscan machine over it.
+ *
+ * Input is read in chunks into a heap buffer that starts at 8 bytes. After
+ * each chunk, the text of a token still in progress (ts is set) is moved to
+ * the front of the buffer so the next read continues it. When that text
+ * fills the buffer, the buffer is doubled. Once a read returns nothing, a
+ * single zero character is fed to the machine as the end marker and the
+ * loop ends. A machine in the error state ends the process with exit(1).
+ */
+void ColmScanner::scan()
+{
+	int capacity = 8;
+	char *buffer = new char[capacity];
+	const char endMarker = 0;
+	int carried = 0;
+	bool moreInput = true;
+
+	/* State read and written by the generated code. */
+	int cs, act;
+	int top, stack[32];
+
+	sectionParseInit();
+	%% write init;
+
+	while ( moreInput ) {
+		char *p = buffer + carried;
+		int room = capacity - carried;
+
+		if ( room == 0 ) {
+			/* The token in progress occupies the entire buffer. Move to
+			 * one twice the size. */
+			capacity *= 2;
+			char *grown = new char[capacity];
+
+			/* Read position and free room in the new buffer. */
+			p = grown + carried;
+			room = capacity - carried;
+
+			/* Re-point the token markers at the same offsets. */
+			if ( ts != 0 )
+				ts = grown + ( ts - buffer );
+			te = grown + ( te - buffer );
+
+			/* Bring the carried text across and drop the old buffer. */
+			memcpy( grown, buffer, carried );
+			delete[] buffer;
+			buffer = grown;
+		}
+
+		input.read( p, room );
+		int len = input.gcount();
+
+		/* Nothing read means end of input: feed the end marker once. */
+		if ( len == 0 ) {
+			p[0] = endMarker;
+			len = 1;
+			moreInput = false;
+		}
+
+		char *pe = p + len;
+		char *eof = 0;
+		%% write exec;
+
+		/* Check if we failed. */
+		if ( cs == lmscan_error ) {
+			/* Machine failed before finding a token. I'm not yet sure if this
+			 * is reachable. */
+			scan_error() << "colm scanner error (metalanguage)" << endl;
+			exit(1);
+		}
+
+		if ( ts == 0 ) {
+			/* No token in progress, nothing to keep. */
+			carried = 0;
+		}
+		else {
+			/* Slide the unfinished token to the start of the buffer and
+			 * move the token markers back by the same distance. */
+			char *keepFrom = ts;
+			carried = pe - keepFrom;
+			memmove( buffer, keepFrom, carried );
+
+			unsigned int slide = keepFrom - buffer;
+			ts -= slide;
+			te -= slide;
+		}
+	}
+
+	delete[] buffer;
+}
+
+/*
+ * Send the end-of-input token straight to the parser. Its location carries
+ * the placeholder file name "<EOF>", the current line and column 1.
+ */
+void ColmScanner::eof()
+{
+	InputLoc endLoc;
+	endLoc.col = 1;
+	endLoc.line = line;
+	endLoc.fileName = "<EOF>";
+
+	parser->token( endLoc, ColmParser_tk_eof, 0, 0 );
+}