Diffstat (limited to 'src/lmscan.rl')
-rw-r--r--  src/lmscan.rl  636
1 files changed, 0 insertions, 636 deletions
diff --git a/src/lmscan.rl b/src/lmscan.rl
deleted file mode 100644
index d38df495..00000000
--- a/src/lmscan.rl
+++ /dev/null
@@ -1,636 +0,0 @@
-/*
- * Copyright 2006-2012 Adrian Thurston <thurston@complang.org>
- */
-
-/* This file is part of Colm.
- *
- * Colm is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * Colm is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with Colm; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#include <iostream>
-#include <fstream>
-#include <string.h>
-
-#include "global.h"
-#include "lmscan.h"
-#include "lmparse.h"
-#include "parsedata.h"
-#include "avltree.h"
-#include "vector.h"
-
-//#define PRINT_TOKENS
-
-using std::ifstream;
-using std::istream;
-using std::ostream;
-using std::cout;
-using std::cerr;
-using std::endl;
-
-%%{
- machine section_parse;
- alphtype int;
- write data;
-}%%
-
-void ColmScanner::sectionParseInit()
-{
- %% write init;
-}
-
-ostream &ColmScanner::scan_error()
-{
- /* Maintain the error count. */
- gblErrorCount += 1;
- cerr << fileName << ":" << line << ":" << column << ": ";
- return cerr;
-}
-
-bool ColmScanner::recursiveInclude( const char *inclFileName )
-{
- for ( IncludeStack::Iter si = includeStack; si.lte(); si++ ) {
- if ( strcmp( si->fileName, inclFileName ) == 0 )
- return true;
- }
- return false;
-}
-
-void ColmScanner::updateCol()
-{
- char *from = lastnl;
- if ( from == 0 )
- from = ts;
- //cerr << "adding " << te - from << " to column" << endl;
- column += te - from;
- lastnl = 0;
-}
-
-void ColmScanner::token( int type, char c )
-{
- token( type, &c, &c + 1 );
-}
-
-void ColmScanner::token( int type )
-{
- token( type, 0, 0 );
-}
-
-bool isAbsolutePath( const char *path )
-{
- return path[0] == '/';
-}
-
-ifstream *ColmScanner::tryOpenInclude( char **pathChecks, long &found )
-{
- char **check = pathChecks;
- ifstream *inFile = new ifstream;
-
- while ( *check != 0 ) {
- inFile->open( *check );
- if ( inFile->is_open() ) {
- found = check - pathChecks;
- return inFile;
- }
- check += 1;
- }
-
- found = -1;
- delete inFile;
- return 0;
-}
-
-char **ColmScanner::makeIncludePathChecks( const char *thisFileName, const char *fileName )
-{
- char **checks = 0;
- long nextCheck = 0;
- char *data = strdup(fileName);
- long length = strlen(fileName);
-
- /* Absolute path? */
- if ( isAbsolutePath( data ) ) {
- checks = new char*[2];
- checks[nextCheck++] = data;
- }
- else {
-		/* Search from the location of the current file. */
- checks = new char *[2 + includePaths.length()];
- const char *lastSlash = strrchr( thisFileName, '/' );
- if ( lastSlash == 0 )
- checks[nextCheck++] = data;
- else {
- long givenPathLen = (lastSlash - thisFileName) + 1;
- long checklen = givenPathLen + length;
- char *check = new char[checklen+1];
- memcpy( check, thisFileName, givenPathLen );
- memcpy( check+givenPathLen, data, length );
- check[checklen] = 0;
- checks[nextCheck++] = check;
- }
-
- /* Search from the include paths given on the command line. */
- for ( ArgsVector::Iter incp = includePaths; incp.lte(); incp++ ) {
- long pathLen = strlen( *incp );
- long checkLen = pathLen + 1 + length;
- char *check = new char[checkLen+1];
- memcpy( check, *incp, pathLen );
- check[pathLen] = '/';
- memcpy( check+pathLen+1, data, length );
- check[checkLen] = 0;
- checks[nextCheck++] = check;
- }
- }
-
- checks[nextCheck] = 0;
- return checks;
-}
-
-
-%%{
- machine section_parse;
- import "lmparse.h";
-
- action clear_words { word = lit = 0; word_len = lit_len = 0; }
- action store_lit { lit = tokdata; lit_len = toklen; }
-
- action mach_err { scan_error() << "bad machine statement" << endl; }
- action incl_err { scan_error() << "bad include statement" << endl; }
- action write_err { scan_error() << "bad write statement" << endl; }
-
- action handle_include
- {
- String src( lit, lit_len );
- String fileName;
- bool unused;
-
- /* Need a location. */
- InputLoc here;
- here.fileName = fileName;
- here.line = line;
- here.col = column;
-
- prepareLitString( fileName, unused, src, here );
- char **checks = makeIncludePathChecks( this->fileName, fileName );
-
- /* Open the input file for reading. */
- long found = 0;
- ifstream *inFile = tryOpenInclude( checks, found );
- if ( inFile == 0 ) {
- scan_error() << "include: could not open " <<
- fileName << " for reading" << endl;
- }
- else {
- /* Only proceed with the include if it was found. */
- if ( recursiveInclude( checks[found] ) )
- scan_error() << "include: this is a recursive include operation" << endl;
-
-		/* Add the included file to the stack so that nested includes
-		 * can detect a recursive include structure. */
- includeStack.append( IncludeStackItem( checks[found] ) );
-
- ColmScanner *scanner = new ColmScanner( fileName, *inFile, output, parser, includeDepth+1 );
- scanner->scan();
- delete inFile;
-
- /* Remove the last element (len-1) */
- includeStack.remove( -1 );
-
- delete scanner;
- }
- }
-
- include_target =
- TK_Literal >clear_words @store_lit;
-
- include_stmt =
- ( KW_Include include_target ) @handle_include
- <>err incl_err <>eof incl_err;
-
- action handle_token
- {
-// cout << Parser_lelNames[type] << " ";
-// if ( start != 0 ) {
-// cout.write( start, end-start );
-// }
-// cout << endl;
-
- InputLoc loc;
-
- #ifdef PRINT_TOKENS
- cerr << "scanner:" << line << ":" << column <<
- ": sending token to the parser " << Parser_lelNames[*p];
- cerr << " " << toklen;
- if ( tokdata != 0 )
- cerr << " " << tokdata;
- cerr << endl;
- #endif
-
- loc.fileName = fileName;
- loc.line = line;
- loc.col = column;
-
- if ( tokdata != 0 && tokdata[toklen-1] == '\n' )
- loc.line -= 1;
-
- parser->token( loc, type, tokdata, toklen );
- }
-
- # Catch everything else.
- everything_else = ^( KW_Include ) @handle_token;
-
- main := (
- include_stmt |
- everything_else
- )*;
-}%%
-
-void ColmScanner::token( int type, char *start, char *end )
-{
- char *tokdata = 0;
- int toklen = 0;
- int *p = &type;
- int *pe = &type + 1;
- int *eof = 0;
-
- if ( start != 0 ) {
- toklen = end-start;
- tokdata = new char[toklen+1];
- memcpy( tokdata, start, toklen );
- tokdata[toklen] = 0;
- }
-
- %%{
- machine section_parse;
- write exec;
- }%%
-
- updateCol();
-}
-
-void ColmScanner::endSection( )
-{
- /* Execute the eof actions for the section parser. */
- /* Probably use: token( -1 ); */
-}
-
-%%{
- machine lmscan;
-
- # This is sent by the driver code.
- EOF = 0;
-
- action inc_nl {
- lastnl = p;
- column = 0;
- line++;
- }
- NL = '\n' @inc_nl;
-
-	# Identifiers, numbers, comments, and other common things.
-	ident = ( alpha | '_' ) ( alpha | digit | '_' )*;
- number = digit+;
- hex_number = '0x' [0-9a-fA-F]+;
-
- # These literal forms are common to C-like host code and ragel.
- s_literal = "'" ([^'\\] | NL | '\\' (any | NL))* "'";
- d_literal = '"' ([^"\\] | NL | '\\' (any | NL))* '"';
-
- whitespace = [ \t] | NL;
- pound_comment = '#' [^\n]* NL;
-
- or_literal := |*
- # Escape sequences in OR expressions.
- '\\0' => { token( TK_ReChar, '\0' ); };
- '\\a' => { token( TK_ReChar, '\a' ); };
- '\\b' => { token( TK_ReChar, '\b' ); };
- '\\t' => { token( TK_ReChar, '\t' ); };
- '\\n' => { token( TK_ReChar, '\n' ); };
- '\\v' => { token( TK_ReChar, '\v' ); };
- '\\f' => { token( TK_ReChar, '\f' ); };
- '\\r' => { token( TK_ReChar, '\r' ); };
- '\\\n' => { updateCol(); };
- '\\' any => { token( TK_ReChar, ts+1, te ); };
-
- # Range dash in an OR expression.
- '-' => { token( TK_Dash, 0, 0 ); };
-
- # Terminate an OR expression.
- ']' => { token( TK_SqClose ); fret; };
-
- EOF => {
- scan_error() << "unterminated OR literal" << endl;
- };
-
- # Characters in an OR expression.
- [^\]] => { token( TK_ReChar, ts, te ); };
-
- *|;
-
- regular_type := |*
- # Identifiers.
- ident => { token( TK_Word, ts, te ); } ;
-
- # Numbers
- number => { token( TK_UInt, ts, te ); };
- hex_number => { token( TK_Hex, ts, te ); };
-
- # Literals, with optionals.
- ( s_literal | d_literal ) [i]?
- => { token( TK_Literal, ts, te ); };
-
- '[' => { token( TK_SqOpen ); fcall or_literal; };
- '[^' => { token( TK_SqOpenNeg ); fcall or_literal; };
-
-	'/' => { token( '/' ); fret; };
-
- # Ignore.
- pound_comment => { updateCol(); };
-
- '..' => { token( TK_DotDot ); };
- '**' => { token( TK_StarStar ); };
- '--' => { token( TK_DashDash ); };
-
- ':>' => { token( TK_ColonGt ); };
- ':>>' => { token( TK_ColonGtGt ); };
- '<:' => { token( TK_LtColon ); };
-
- # Whitespace other than newline.
- [ \t\r]+ => { updateCol(); };
-
- # If we are in a single line machine then newline may end the spec.
- NL => { updateCol(); };
-
- # Consume eof.
- EOF;
-
- any => { token( *ts ); } ;
- *|;
-
- literal_pattern := |*
- '\\' '0' { litBuf.append( '\0' ); };
- '\\' 'a' { litBuf.append( '\a' ); };
- '\\' 'b' { litBuf.append( '\b' ); };
- '\\' 't' { litBuf.append( '\t' ); };
- '\\' 'n' { litBuf.append( '\n' ); };
- '\\' 'v' { litBuf.append( '\v' ); };
- '\\' 'f' { litBuf.append( '\f' ); };
- '\\' 'r' { litBuf.append( '\r' ); };
-
- '\\' any {
- litBuf.append( ts[1] );
- };
- '"' => {
- if ( litBuf.length > 0 ) {
- token( TK_LitPat, litBuf.data, litBuf.data+litBuf.length );
- litBuf.clear();
- }
- token( '"' );
- fret;
- };
- NL => {
- litBuf.append( '\n' );
- token( TK_LitPat, litBuf.data, litBuf.data+litBuf.length );
- litBuf.clear();
- token( '"' );
- fret;
- };
- '[' => {
- if ( litBuf.length > 0 ) {
- token( TK_LitPat, litBuf.data, litBuf.data+litBuf.length );
- litBuf.clear();
- }
- token( '[' );
- fcall main;
- };
- any => {
- litBuf.append( *ts );
- };
- *|;
-
- # Parser definitions.
- main := |*
- 'lex' => { token( KW_Lex ); };
- 'commit' => { token( KW_Commit ); };
- 'token' => { token( KW_Token ); };
- 'literal' => { token( KW_Literal ); };
- 'rl' => { token( KW_Rl ); };
- 'def' => { token( KW_Def ); };
- 'ignore' => { token( KW_Ignore ); };
- 'construct' => { token( KW_Construct ); };
- 'cons' => { token( KW_Construct ); };
- 'new' => { token( KW_New ); };
- 'if' => { token( KW_If ); };
- 'reject' => { token( KW_Reject ); };
- 'while' => { token( KW_While ); };
- 'else' => { token( KW_Else ); };
- 'elsif' => { token( KW_Elsif ); };
- 'match' => { token( KW_Match ); };
- 'for' => { token( KW_For ); };
- 'iter' => { token( KW_Iter ); };
- 'prints' => { token( KW_PrintStream ); };
- 'print' => { token( KW_Print ); };
- 'print_xml_ac' => { token( KW_PrintXMLAC ); };
- 'print_xml' => { token( KW_PrintXML ); };
- 'namespace' => { token( KW_Namespace ); };
- 'lex' => { token( KW_Lex ); };
- 'map' => { token( KW_Map ); };
- 'list' => { token( KW_List ); };
- 'vector' => { token( KW_Vector ); };
- 'accum' => { token( KW_Parser ); };
- 'parser' => { token( KW_Parser ); };
- 'return' => { token( KW_Return ); };
- 'break' => { token( KW_Break ); };
- 'yield' => { token( KW_Yield ); };
- 'typeid' => { token( KW_TypeId ); };
- 'make_token' => { token( KW_MakeToken ); };
- 'make_tree' => { token( KW_MakeTree ); };
- 'reducefirst' => { token( KW_ReduceFirst ); };
- 'for' => { token( KW_For ); };
- 'in' => { token( KW_In ); };
- 'nil' => { token( KW_Nil ); };
- 'true' => { token( KW_True ); };
- 'false' => { token( KW_False ); };
- 'parse' => { token( KW_Parse ); };
- 'parse_stop' => { token( KW_ParseStop ); };
- 'global' => { token( KW_Global ); };
- 'export' => { token( KW_Export ); };
- 'ptr' => { token( KW_Ptr ); };
- 'ref' => { token( KW_Ref ); };
- 'deref' => { token( KW_Deref ); };
- 'require' => { token( KW_Require ); };
- 'preeof' => { token( KW_Preeof ); };
- 'left' => { token( KW_Left ); };
- 'right' => { token( KW_Right ); };
- 'nonassoc' => { token( KW_Nonassoc ); };
- 'prec' => { token( KW_Prec ); };
- 'include' => { token( KW_Include ); };
- 'context' => { token( KW_Context ); };
- 'alias' => { token( KW_Alias ); };
- 'send' => { token( KW_Send ); };
- 'ni' => { token( KW_Ni ); };
- 'ci' => { token( KW_Ci ); };
-
- # Identifiers.
- ident => { token( TK_Word, ts, te ); } ;
-
- number => { token( TK_Number, ts, te ); };
-
- '/' => {
- token( '/' );
- if ( parser->enterRl )
- fcall regular_type;
- };
-
- "~" [^\n]* NL => {
- token( '"' );
- token( TK_LitPat, ts+1, te );
- token( '"' );
- };
-
- "'" ([^'\\\n] | '\\' (any | NL))* ( "'" | NL ) => {
- token( TK_Literal, ts, te );
- };
-
- '"' => {
- token( '"' );
- litBuf.clear();
- fcall literal_pattern;
- };
- '[' => {
- token( '[' );
- fcall main;
- };
-
- ']' => {
- token( ']' );
- if ( top > 0 )
- fret;
- };
-
- # Ignore.
- pound_comment => { updateCol(); };
-
- '=>' => { token( TK_DoubleArrow ); };
- '==' => { token( TK_DoubleEql ); };
- '!=' => { token( TK_NotEql ); };
- '::' => { token( TK_DoubleColon ); };
- '<=' => { token( TK_LessEql ); };
- '>=' => { token( TK_GrtrEql ); };
- '->' => { token( TK_RightArrow ); };
- '&&' => { token( TK_AmpAmp ); };
- '||' => { token( TK_BarBar ); };
- '<<' => { token( TK_LtLt ); };
-
- ( '+' | '-' | '*' | '/' | '(' | ')' | '@' | '$' | '^' ) => { token( *ts ); };
-
-
- # Whitespace other than newline.
- [ \t\r]+ => { updateCol(); };
- NL => { updateCol(); };
-
- # Consume eof.
- EOF;
-
- any => { token( *ts ); } ;
- *|;
-}%%
-
-%% write data;
-
-void ColmScanner::scan()
-{
- int bufsize = 8;
- char *buf = new char[bufsize];
- const char last_char = 0;
- int cs, act, have = 0;
- int top, stack[32];
- bool execute = true;
-
- sectionParseInit();
- %% write init;
-
- while ( execute ) {
- char *p = buf + have;
- int space = bufsize - have;
-
- if ( space == 0 ) {
- /* We filled up the buffer trying to scan a token. Grow it. */
- bufsize = bufsize * 2;
- char *newbuf = new char[bufsize];
-
- /* Recompute p and space. */
- p = newbuf + have;
- space = bufsize - have;
-
- /* Patch up pointers possibly in use. */
- if ( ts != 0 )
- ts = newbuf + ( ts - buf );
- te = newbuf + ( te - buf );
-
- /* Copy the new buffer in. */
- memcpy( newbuf, buf, have );
- delete[] buf;
- buf = newbuf;
- }
-
- input.read( p, space );
- int len = input.gcount();
-
- /* If we see eof then append the EOF char. */
- if ( len == 0 ) {
- p[0] = last_char, len = 1;
- execute = false;
- }
-
- char *pe = p + len;
- char *eof = 0;
- %% write exec;
-
- /* Check if we failed. */
- if ( cs == lmscan_error ) {
- /* Machine failed before finding a token. I'm not yet sure if this
- * is reachable. */
- scan_error() << "colm scanner error (metalanguage)" << endl;
- exit(1);
- }
-
- /* Decide if we need to preserve anything. */
- char *preserve = ts;
-
- /* Now set up the prefix. */
- if ( preserve == 0 )
- have = 0;
- else {
- /* There is data that needs to be shifted over. */
- have = pe - preserve;
- memmove( buf, preserve, have );
- unsigned int shiftback = preserve - buf;
- if ( ts != 0 )
- ts -= shiftback;
- te -= shiftback;
-
- preserve = buf;
- }
- }
- delete[] buf;
-}
-
-void ColmScanner::eof()
-{
- InputLoc loc;
- loc.fileName = "<EOF>";
- loc.line = line;
- loc.col = 1;
- parser->token( loc, ColmParser_tk_eof, 0, 0 );
-}