/*
 * Copyright 2006-2012 Adrian Thurston <thurston@colm.net>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

/*
 * Provenance: src/lmscan.rl as added by commit
 * f653735830d537715f2885bd832cf04851d35401 ("moved source files into commit
 * repository", Adrian Thurston, 2020-03-14).
 *
 * This file is Ragel input with a C++ host. It defines two machines:
 *   - lmscan: the character-level scanner driven by ColmScanner::scan().
 *   - section_parse: a machine over token ids that intercepts include
 *     statements and forwards every other token to the parser.
 */

/* NOTE(review): the system header names were missing from the extracted text.
 * They are restored here from the library names used below (streams, the
 * str and mem functions, exit and free) -- confirm against upstream. */
#include <iostream>
#include <fstream>
#include <string.h>
#include <stdlib.h>

#include "global.h"
#include "lmscan.h"
#include "lmparse.h"
#include "parsedata.h"
#include "avltree.h"
#include "vector.h"

//#define PRINT_TOKENS

using std::ifstream;
using std::istream;
using std::ostream;
using std::cout;
using std::cerr;
using std::endl;

%%{
	machine section_parse;
	alphtype int;
	write data;
}%%

/* Initialize the state of the section_parse machine. */
void ColmScanner::sectionParseInit()
{
	%% write init;
}

/* Count an error and start an error message. Writes the "file:line:column: "
 * prefix to cerr and returns cerr so the caller can append the message. */
ostream &ColmScanner::scan_error()
{
	/* Maintain the error count. */
	gblErrorCount += 1;
	cerr << fileName << ":" << line << ":" << column << ": ";
	return cerr;
}

/* Returns true if inclFileName matches (by strcmp) the file name of any item
 * currently on the include stack. */
bool ColmScanner::recursiveInclude( const char *inclFileName )
{
	for ( IncludeStack::Iter si = includeStack; si.lte(); si++ ) {
		if ( strcmp( si->fileName, inclFileName ) == 0 )
			return true;
	}
	return false;
}

/* Advance the column past the token just matched. If a newline was seen
 * inside the token (lastnl set by the inc_nl action) the distance is measured
 * from that newline, otherwise from the token start. */
void ColmScanner::updateCol()
{
	char *from = lastnl;
	if ( from == 0 )
		from = ts;
	//cerr << "adding " << te - from << " to column" << endl;
	column += te - from;
	lastnl = 0;
}

/* Send a token whose text is the single character c. */
void ColmScanner::token( int type, char c )
{
	token( type, &c, &c + 1 );
}

/* Send a token that carries no text. */
void ColmScanner::token( int type )
{
	token( type, 0, 0 );
}

/* A path is treated as absolute when it begins with a slash. */
bool isAbsolutePath( const char *path )
{
	return path[0] == '/';
}

/*
 * Try each candidate in the null-terminated pathChecks array, in order.
 *
 * On success returns a heap-allocated open stream (the caller deletes it) and
 * sets found to the index of the candidate that opened. If nothing opens,
 * returns 0 and sets found to -1.
 */
ifstream *ColmScanner::tryOpenInclude( char **pathChecks, long &found )
{
	char **check = pathChecks;
	ifstream *inFile = new ifstream;

	while ( *check != 0 ) {
		/* A failed open sets failbit, and before C++11 a later successful
		 * open does not reset it. Clear the state so the stream we hand
		 * back is readable regardless of earlier failed attempts. */
		inFile->clear();
		inFile->open( *check );
		if ( inFile->is_open() ) {
			found = check - pathChecks;
			return inFile;
		}
		check += 1;
	}

	found = -1;
	delete inFile;
	return 0;
}

/*
 * Build the null-terminated list of paths to try for an include of fileName
 * appearing in thisFileName.
 *
 * An absolute fileName yields just that name. Otherwise the first candidate
 * is fileName relative to the directory of thisFileName (or fileName itself
 * when thisFileName has no directory part), followed by fileName under each
 * entry of includePaths.
 *
 * The array and its strings are heap allocated. Note the allocators are
 * mixed: a candidate that is the bare file name comes from strdup, all others
 * from new[].
 */
char **ColmScanner::makeIncludePathChecks( const char *thisFileName, const char *fileName )
{
	char **checks = 0;
	long nextCheck = 0;
	char *data = strdup(fileName);
	long length = strlen(fileName);

	/* Tracks whether data was handed out in checks. If not, it is only a
	 * scratch copy and must be released before returning. */
	bool dataStored = false;

	/* Absolute path? */
	if ( isAbsolutePath( data ) ) {
		checks = new char*[2];
		checks[nextCheck++] = data;
		dataStored = true;
	}
	else {
		/* Search from the location of the current file. One slot for that
		 * candidate, one per include path, one for the terminator. */
		checks = new char *[2 + includePaths.length()];
		const char *lastSlash = strrchr( thisFileName, '/' );
		if ( lastSlash == 0 ) {
			checks[nextCheck++] = data;
			dataStored = true;
		}
		else {
			/* Keep the directory part including its trailing slash. */
			long givenPathLen = (lastSlash - thisFileName) + 1;
			long checklen = givenPathLen + length;
			char *check = new char[checklen+1];
			memcpy( check, thisFileName, givenPathLen );
			memcpy( check+givenPathLen, data, length );
			check[checklen] = 0;
			checks[nextCheck++] = check;
		}

		/* Search from the include paths given on the command line. */
		for ( ArgsVector::Iter incp = includePaths; incp.lte(); incp++ ) {
			long pathLen = strlen( *incp );
			long checkLen = pathLen + 1 + length;
			char *check = new char[checkLen+1];
			memcpy( check, *incp, pathLen );
			check[pathLen] = '/';
			memcpy( check+pathLen+1, data, length );
			check[checkLen] = 0;
			checks[nextCheck++] = check;
		}
	}

	/* The strdup copy was previously leaked whenever it was not stored. */
	if ( !dataStored )
		free( data );

	checks[nextCheck] = 0;
	return checks;
}


%%{
	machine section_parse;
	import "lmparse.h";

	action clear_words { word = lit = 0; word_len = lit_len = 0; }
	action store_lit { lit = tokdata; lit_len = toklen; }

	action mach_err { scan_error() << "bad machine statement" << endl; }
	action incl_err { scan_error() << "bad include statement" << endl; }
	action write_err { scan_error() << "bad write statement" << endl; }

	# Runs once the literal following the include keyword has arrived.
	# Resolves the file, scans it with a nested ColmScanner that feeds the
	# same parser, then returns to the current file.
	action handle_include
	{
		String src( lit, lit_len );
		String fileName;
		bool unused;

		/* Need a location. The local fileName declared above shadows the
		 * member and is still empty at this point, so the member has to be
		 * named explicitly to record the file being scanned. */
		InputLoc here;
		here.fileName = this->fileName;
		here.line = line;
		here.col = column;

		prepareLitString( fileName, unused, src, here );
		char **checks = makeIncludePathChecks( this->fileName, fileName );

		/* Open the input file for reading. */
		long found = 0;
		ifstream *inFile = tryOpenInclude( checks, found );
		if ( inFile == 0 ) {
			scan_error() << "include: could not open " << 
					fileName << " for reading" << endl;
		}
		else if ( recursiveInclude( checks[found] ) ) {
			/* The file is already on the include stack. Report it and skip
			 * the include. Scanning it anyway would include it again. */
			scan_error() << "include: this is a recursive include operation" << endl;
			delete inFile;
		}
		else {
			/* Record the file on the include stack for the duration of the
			 * nested scan so includes of it can be detected. */
			includeStack.append( IncludeStackItem( checks[found] ) );

			/* NOTE(review): the nested scanner is given the name as written
			 * in the include statement, not the resolved checks[found], and
			 * that name lives in a local String. Confirm this is intended. */
			ColmScanner *scanner = new ColmScanner( fileName, *inFile, parser, includeDepth+1 );
			scanner->scan();
			delete inFile;

			/* Remove the last element (len-1) */
			includeStack.remove( -1 );

			delete scanner;
		}

		/* NOTE(review): checks and the strings it holds are not released
		 * here. They come from mixed allocators, see makeIncludePathChecks. */
	}

	include_target = 
		TK_Literal >clear_words @store_lit;

	include_stmt =
		( KW_Include include_target ) @handle_include
		<>err incl_err <>eof incl_err;

	# Forwards one token to the parser together with its source location.
	action handle_token
	{
//		cout << Parser_lelNames[type] << " ";
//		if ( start != 0 ) {
//			cout.write( start, end-start );
//		}
//		cout << endl;

		InputLoc loc;

		#ifdef PRINT_TOKENS
		cerr << "scanner:" << line << ":" << column << 
				": sending token to the parser " << Parser_lelNames[*p];
		cerr << " " << toklen;
		if ( tokdata != 0 )
			cerr << " " << tokdata;
		cerr << endl;
		#endif

		loc.fileName = fileName;
		loc.line = line;
		loc.col = column;

		/* A token ending in a newline has already bumped the line count, so
		 * report the line it started on. The length check keeps the index
		 * in range should an empty token ever arrive with a buffer. */
		if ( tokdata != 0 && toklen > 0 && tokdata[toklen-1] == '\n' )
			loc.line -= 1;

		parser->token( loc, type, tokdata, toklen );
	}

	# Catch everything else.
	everything_else = ^( KW_Include ) @handle_token;

	main := ( 
		include_stmt |
		everything_else
	)*;
}%%

/*
 * Send one token through the section_parse machine.
 *
 * When start is non-null the text [start, end) is copied into a freshly
 * allocated NUL-terminated buffer which the machine actions see as tokdata
 * and toklen. The machine is then run over a single input symbol, the token
 * type, and finally the column is advanced.
 *
 * NOTE(review): the copy is not released here. Presumably the parser takes
 * ownership of it -- confirm.
 */
void ColmScanner::token( int type, char *start, char *end )
{
	char *tokdata = 0;
	int toklen = 0;

	/* The machine input is exactly one int: the token type. */
	int *p = &type;
	int *pe = &type + 1;
	int *eof = 0;

	if ( start != 0 ) {
		toklen = end-start;
		tokdata = new char[toklen+1];
		memcpy( tokdata, start, toklen );
		tokdata[toklen] = 0;
	}

	%%{
		machine section_parse;
		write exec;
	}%%

	updateCol();
}

/* Currently does nothing. */
void ColmScanner::endSection( )
{
	/* Execute the eof actions for the section parser. */
	/* Probably use: token( -1 ); */
}

%%{
	machine lmscan;

	# The call stack used by fcall is the fixed-size array declared in
	# scan(). Refuse to push when it is full instead of writing past its end.
	prepush {
		if ( top == (int)( sizeof(stack) / sizeof(stack[0]) ) ) {
			scan_error() << "nesting too deep" << endl;
			exit(1);
		}
	}

	# This is sent by the driver code.
	EOF = 0;

	action inc_nl {
		lastnl = p;
		column = 0;
		line++;
	}
	NL = '\n' @inc_nl;

	# Identifiers, numbers, comments, and other common things.
	ident = ( alpha | '_' ) ( alpha |digit |'_' )*;
	number = digit+;
	hex_number = '0x' [0-9a-fA-F]+;

	# These literal forms are common to C-like host code and ragel.
	s_literal = "'" ([^'\\] | NL | '\\' (any | NL))* "'";
	d_literal = '"' ([^"\\] | NL | '\\' (any | NL))* '"';

	whitespace = [ \t] | NL;
	pound_comment = '#' [^\n]* NL;

	or_literal := |*
		# Escape sequences in OR expressions.
		'\\0' => { token( TK_ReChar, '\0' ); };
		'\\a' => { token( TK_ReChar, '\a' ); };
		'\\b' => { token( TK_ReChar, '\b' ); };
		'\\t' => { token( TK_ReChar, '\t' ); };
		'\\n' => { token( TK_ReChar, '\n' ); };
		'\\v' => { token( TK_ReChar, '\v' ); };
		'\\f' => { token( TK_ReChar, '\f' ); };
		'\\r' => { token( TK_ReChar, '\r' ); };
		'\\\n' => { updateCol(); };
		'\\' any => { token( TK_ReChar, ts+1, te ); };

		# Range dash in an OR expression.
		'-' => { token( TK_Dash, 0, 0 ); };

		# Terminate an OR expression.
		']' => { token( TK_SqClose ); fret; };

		EOF => {
			scan_error() << "unterminated OR literal" << endl;
		};

		# Characters in an OR expression.
		[^\]] => { token( TK_ReChar, ts, te ); };

	*|;

	regular_type := |*
		# Identifiers.
		ident => { token( TK_Word, ts, te ); } ;

		# Numbers
		number => { token( TK_UInt, ts, te ); };
		hex_number => { token( TK_Hex, ts, te ); };

		# Literals, with optionals.
		( s_literal | d_literal ) [i]? 
			=> { token( TK_Literal, ts, te ); };

		'[' => { token( TK_SqOpen ); fcall or_literal; };
		'[^' => { token( TK_SqOpenNeg ); fcall or_literal; };

		'/' => { token( '/'); fret; };

		# Ignore.
		pound_comment => { updateCol(); };

		'..' => { token( TK_DotDot ); };
		'**' => { token( TK_StarStar ); };
		'--' => { token( TK_DashDash ); };

		':>' => { token( TK_ColonGt ); };
		':>>' => { token( TK_ColonGtGt ); };
		'<:' => { token( TK_LtColon ); };

		# Whitespace other than newline.
		[ \t\r]+ => { updateCol(); };

		# If we are in a single line machine then newline may end the spec.
		NL => { updateCol(); };

		# Consume eof.
		EOF;

		any => { token( *ts ); } ;
	*|;

	literal_pattern := |*
		'\\' '0' { litBuf.append( '\0' ); };
		'\\' 'a' { litBuf.append( '\a' ); };
		'\\' 'b' { litBuf.append( '\b' ); };
		'\\' 't' { litBuf.append( '\t' ); };
		'\\' 'n' { litBuf.append( '\n' ); };
		'\\' 'v' { litBuf.append( '\v' ); };
		'\\' 'f' { litBuf.append( '\f' ); };
		'\\' 'r' { litBuf.append( '\r' ); };

		'\\' any {
			litBuf.append( ts[1] );
		};
		'"' => {
			if ( litBuf.length > 0 ) {
				token( TK_LitPat, litBuf.data, litBuf.data+litBuf.length );
				litBuf.clear();
			}
			token( '"' );
			fret;
		};
		NL => {
			litBuf.append( '\n' );
			token( TK_LitPat, litBuf.data, litBuf.data+litBuf.length );
			litBuf.clear();
			token( '"' );
			fret;
		};
		'[' => {
			if ( litBuf.length > 0 ) {
				token( TK_LitPat, litBuf.data, litBuf.data+litBuf.length );
				litBuf.clear();
			}
			token( '[' );
			fcall main;
		};
		any => {
			litBuf.append( *ts );
		};
	*|;

	# Parser definitions.
	main := |*
		'lex' => { token( KW_Lex ); };
		'commit' => { token( KW_Commit ); };
		'token' => { token( KW_Token ); };
		'literal' => { token( KW_Literal ); };
		'rl' => { token( KW_Rl ); };
		'def' => { token( KW_Def ); };
		'ignore' => { token( KW_Ignore ); };
		'construct' => { token( KW_Construct ); };
		'cons' => { token( KW_Construct ); };
		'new' => { token( KW_New ); };
		'if' => { token( KW_If ); };
		'reject' => { token( KW_Reject ); };
		'while' => { token( KW_While ); };
		'else' => { token( KW_Else ); };
		'elsif' => { token( KW_Elsif ); };
		'match' => { token( KW_Match ); };
		'for' => { token( KW_For ); };
		'iter' => { token( KW_Iter ); };
		'prints' => { token( KW_PrintStream ); };
		'print' => { token( KW_Print ); };
		'print_xml_ac' => { token( KW_PrintXMLAC ); };
		'print_xml' => { token( KW_PrintXML ); };
		'namespace' => { token( KW_Namespace ); };
		'end' => { token( KW_End ); };
		'map' => { token( KW_Map ); };
		'list' => { token( KW_List ); };
		'vector' => { token( KW_Vector ); };
		'accum' => { token( KW_Parser ); };
		'parser' => { token( KW_Parser ); };
		'return' => { token( KW_Return ); };
		'break' => { token( KW_Break ); };
		'yield' => { token( KW_Yield ); };
		'typeid' => { token( KW_TypeId ); };
		'make_token' => { token( KW_MakeToken ); };
		'make_tree' => { token( KW_MakeTree ); };
		'reducefirst' => { token( KW_ReduceFirst ); };
		'in' => { token( KW_In ); };
		'nil' => { token( KW_Nil ); };
		'true' => { token( KW_True ); };
		'false' => { token( KW_False ); };
		'parse' => { token( KW_Parse ); };
		'parse_stop' => { token( KW_ParseStop ); };
		'global' => { token( KW_Global ); };
		'export' => { token( KW_Export ); };
		'ptr' => { token( KW_Ptr ); };
		'ref' => { token( KW_Ref ); };
		'deref' => { token( KW_Deref ); };
		'require' => { token( KW_Require ); };
		'preeof' => { token( KW_Preeof ); };
		'left' => { token( KW_Left ); };
		'right' => { token( KW_Right ); };
		'nonassoc' => { token( KW_Nonassoc ); };
		'prec' => { token( KW_Prec ); };
		'include' => { token( KW_Include ); };
		'context' => { token( KW_Context ); };
		'alias' => { token( KW_Alias ); };
		'send' => { token( KW_Send ); };
		'ni' => { token( KW_Ni ); };

		# Identifiers.
		ident => { token( TK_Word, ts, te ); } ;

		number => { token( TK_Number, ts, te ); };

		'/' => { 
			token( '/' ); 
			if ( parser->enterRl )
				fcall regular_type;
		};

		"~" [^\n]* NL => {
			token( '"' );
			token( TK_LitPat, ts+1, te );
			token( '"' );
		};

		"'" ([^'\\\n] | '\\' (any | NL))* ( "'" | NL ) => {
			token( TK_Literal, ts, te );
		};

		'"' => { 
			token( '"' );
			litBuf.clear(); 
			fcall literal_pattern;
		};
		'[' => { 
			token( '[' ); 
			fcall main;
		};

		']' => {
			token( ']' );
			if ( top > 0 )
				fret;
		};

		# Ignore.
		pound_comment => { updateCol(); };

		'=>' => { token( TK_DoubleArrow ); };
		'==' => { token( TK_DoubleEql ); };
		'!=' => { token( TK_NotEql ); };
		'::' => { token( TK_DoubleColon ); };
		'<=' => { token( TK_LessEql ); };
		'>=' => { token( TK_GrtrEql ); };
		'->' => { token( TK_RightArrow ); };
		'&&' => { token( TK_AmpAmp ); };
		'||' => { token( TK_BarBar ); };
		'<<' => { token( TK_LtLt ); };

		( '+' | '-' | '*' | '/' | '(' | ')' | '@' | '$' | '^' ) => { token( *ts ); };


		# Whitespace other than newline.
		[ \t\r]+ => { updateCol(); };
		NL => { updateCol(); };

		# Consume eof.
		EOF;

		any => { token( *ts ); } ;
	*|;
}%%

%% write data;

/*
 * Run the lmscan machine over the whole input stream.
 *
 * Input is read in chunks into a buffer that starts at 8 bytes and doubles
 * whenever a single token fills it. Between chunks the text of a partially
 * matched token (from ts onward) is moved to the front of the buffer. When
 * the stream is exhausted a single 0 character is fed to the machine as the
 * EOF symbol and the loop ends. A machine error is reported and the process
 * exits with status 1.
 */
void ColmScanner::scan()
{
	int bufsize = 8;
	char *buf = new char[bufsize];
	const char last_char = 0;
	int cs, act, have = 0;
	int top, stack[32];
	bool execute = true;

	sectionParseInit();
	%% write init;

	while ( execute ) {
		char *p = buf + have;
		int space = bufsize - have;

		if ( space == 0 ) {
			/* We filled up the buffer trying to scan a token. Grow it. */
			bufsize = bufsize * 2;
			char *newbuf = new char[bufsize];

			/* Recompute p and space. */
			p = newbuf + have;
			space = bufsize - have;

			/* Patch up pointers possibly in use. */
			if ( ts != 0 )
				ts = newbuf + ( ts - buf );
			te = newbuf + ( te - buf );

			/* Copy the new buffer in. */
			memcpy( newbuf, buf, have );
			delete[] buf;
			buf = newbuf;
		}

		input.read( p, space );
		int len = input.gcount();

		/* If we see eof then append the EOF char. */
		if ( len == 0 ) {
			p[0] = last_char, len = 1;
			execute = false;
		}

		char *pe = p + len;
		char *eof = 0;
		%% write exec;

		/* Check if we failed. */
		if ( cs == lmscan_error ) {
			/* Machine failed before finding a token. I'm not yet sure if this
			 * is reachable. */
			scan_error() << "colm scanner error (metalanguage)" << endl;
			exit(1);
		}

		/* Decide if we need to preserve anything. */
		char *preserve = ts;

		/* Now set up the prefix. */
		if ( preserve == 0 )
			have = 0;
		else {
			/* There is data that needs to be shifted over. */
			have = pe - preserve;
			memmove( buf, preserve, have );
			unsigned int shiftback = preserve - buf;
			if ( ts != 0 )
				ts -= shiftback;
			te -= shiftback;
		}
	}
	delete[] buf;
}

/* Send the end-of-input token to the parser. The location carries the
 * current line, column 1 and an empty file name. */
void ColmScanner::eof()
{
	InputLoc loc;
	loc.fileName = "<EOF>";
	loc.line = line;
	loc.col = 1;
	parser->token( loc, ColmParser_tk_eof, 0, 0 );
}