diff options
Diffstat (limited to 'ragel/rlscan.rl')
-rw-r--r-- | ragel/rlscan.rl | 1193 |
1 files changed, 1193 insertions, 0 deletions
diff --git a/ragel/rlscan.rl b/ragel/rlscan.rl new file mode 100644 index 00000000..f745b9a0 --- /dev/null +++ b/ragel/rlscan.rl @@ -0,0 +1,1193 @@ +/* + * Copyright 2006-2007 Adrian Thurston <thurston@colm.net> + * Copyright 2011 Josef Goettgens + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <iostream> +#include <fstream> +#include <string.h> + +#include "ragel.h" +#include "rlscan.h" +#include "inputdata.h" + +//#define LOG_TOKENS + +using std::ifstream; +using std::istream; +using std::ostream; +using std::endl; + +enum InlineBlockType +{ + CurlyDelimited, + SemiTerminated +}; + +char *newTokdata( int toklen ) +{ + char *tokdata = new char[sizeof(TokHead) + toklen + 1]; + return tokdata + sizeof(TokHead); +} + +void deleteTokdata( char *tokdata ) +{ + if ( tokdata ) + delete[] ( tokdata - sizeof(TokHead) ); +} + +void linkTokdata( Parser6 *parser, char *tokdata ) +{ + TokHead *head = (TokHead*)( tokdata - sizeof(TokHead) ); + head->next = parser->tokHead; + parser->tokHead = head; +} + +void clearTokdata( Parser6 *parser ) +{ + while ( parser->tokHead != 0 ) { + TokHead *next = parser->tokHead->next; + delete[] (char*)parser->tokHead; + parser->tokHead = next; + } +} + +/* + * The Scanner for Importing + */ + +%%{ + machine inline_token_scan; + alphtype int; + access tok_; + + # Import scanner tokens. + import "rlparse.h"; + + main := |* + # Define of number. + IMP_Define IMP_Word IMP_UInt => { + int base = tok_ts - token_data; + int nameOff = 1; + int numOff = 2; + + directToParser( inclToParser, fileName, line, column, TK_Word, + token_strings[base+nameOff], token_lens[base+nameOff] ); + directToParser( inclToParser, fileName, line, column, '=', 0, 0 ); + directToParser( inclToParser, fileName, line, column, TK_UInt, + token_strings[base+numOff], token_lens[base+numOff] ); + directToParser( inclToParser, fileName, line, column, ';', 0, 0 ); + }; + + # Assignment of number. 
+ IMP_Word '=' IMP_UInt => { + int base = tok_ts - token_data; + int nameOff = 0; + int numOff = 2; + + directToParser( inclToParser, fileName, line, column, TK_Word, + token_strings[base+nameOff], token_lens[base+nameOff] ); + directToParser( inclToParser, fileName, line, column, '=', 0, 0 ); + directToParser( inclToParser, fileName, line, column, TK_UInt, + token_strings[base+numOff], token_lens[base+numOff] ); + directToParser( inclToParser, fileName, line, column, ';', 0, 0 ); + }; + + # Define of literal. + IMP_Define IMP_Word IMP_Literal => { + int base = tok_ts - token_data; + int nameOff = 1; + int litOff = 2; + + directToParser( inclToParser, fileName, line, column, TK_Word, + token_strings[base+nameOff], token_lens[base+nameOff] ); + directToParser( inclToParser, fileName, line, column, '=', 0, 0 ); + directToParser( inclToParser, fileName, line, column, TK_Literal, + token_strings[base+litOff], token_lens[base+litOff] ); + directToParser( inclToParser, fileName, line, column, ';', 0, 0 ); + }; + + # Assignment of literal. + IMP_Word '=' IMP_Literal => { + int base = tok_ts - token_data; + int nameOff = 0; + int litOff = 2; + + directToParser( inclToParser, fileName, line, column, TK_Word, + token_strings[base+nameOff], token_lens[base+nameOff] ); + directToParser( inclToParser, fileName, line, column, '=', 0, 0 ); + directToParser( inclToParser, fileName, line, column, TK_Literal, + token_strings[base+litOff], token_lens[base+litOff] ); + directToParser( inclToParser, fileName, line, column, ';', 0, 0 ); + }; + + # Catch everything else. 
+ any; + *|; +}%% + +%% write data; + +void Scanner::flushImport() +{ + int *p = token_data; + int *pe = token_data + cur_token; + int *eof = 0; + + %%{ + machine inline_token_scan; + write init; + write exec; + }%% + + if ( tok_ts == 0 ) + cur_token = 0; + else { + cur_token = pe - tok_ts; + int ts_offset = tok_ts - token_data; + memmove( token_data, token_data+ts_offset, cur_token*sizeof(token_data[0]) ); + memmove( token_strings, token_strings+ts_offset, cur_token*sizeof(token_strings[0]) ); + memmove( token_lens, token_lens+ts_offset, cur_token*sizeof(token_lens[0]) ); + } +} + +void Scanner::directToParser( Parser6 *toParser, const char *tokFileName, int tokLine, + int tokColumn, int type, char *tokdata, int toklen ) +{ + InputLoc loc; + + #ifdef LOG_TOKENS + cerr << "scanner:" << tokLine << ":" << tokColumn << + ": sending token to the parser " << Parser6_lelNames[type]; + cerr << " " << toklen; + if ( tokdata != 0 ) + cerr << " " << tokdata; + cerr << endl; + #endif + + loc.fileName = tokFileName; + loc.line = tokLine; + loc.col = tokColumn; + + toParser->token( loc, type, tokdata, toklen ); +} + +void Scanner::importToken( int token, char *start, char *end ) +{ + if ( cur_token == max_tokens ) + flushImport(); + + token_data[cur_token] = token; + if ( start == 0 ) { + token_strings[cur_token] = 0; + token_lens[cur_token] = 0; + } + else { + int toklen = end-start; + token_lens[cur_token] = toklen; + token_strings[cur_token] = new char[toklen+1]; + memcpy( token_strings[cur_token], start, toklen ); + token_strings[cur_token][toklen] = 0; + } + cur_token++; +} + +void Scanner::pass() +{ + if ( sectionPass ) + return; + + updateCol(); + + /* If no errors and we are at the bottom of the include stack (the + * source file listed on the command line) then write out the data. 
*/ + if ( includeDepth == 0 && id->machineSpec == 0 && id->machineName == 0 ) + id->curItem->data.write( ts, te-ts ); +} + +void Scanner::pass( int token, char *start, char *end ) +{ + if ( sectionPass ) + return; + + if ( importMachines ) + importToken( token, start, end ); + + pass(); +} + +/* + * The scanner for processing sections, includes, imports, etc. + */ + +%%{ + machine section_parse; + alphtype int; + write data; +}%% + +void Scanner::init( ) +{ + %% write init; +} + +bool Scanner::active() +{ + if ( ignoreSection ) + return false; + + if ( parser == 0 && ! parserExistsError ) { + id->error(scan_loc()) << "this specification has no name, nor does any previous" + " specification" << endl; + parserExistsError = true; + } + + if ( parser == 0 ) + return false; + + return true; +} + +InputLoc Scanner::scan_loc() +{ + return makeInputLoc( fileName, line, column ); +} + +void Scanner::updateCol() +{ + char *from = lastnl; + if ( from == 0 ) + from = ts; + column += te - from; + lastnl = 0; +} + +void Scanner::handleMachine() +{ + if ( sectionPass ) { + /* Assign a name to the machine. */ + char *machine = word; + + SectionDictEl *sdEl = id->sectionDict.find( machine ); + if ( sdEl == 0 ) { + sdEl = new SectionDictEl( machine ); + sdEl->value = new Section( machine ); + id->sectionDict.insert( sdEl ); + } + + section = sdEl->value; + } + else { + + /* Assign a name to the machine. */ + char *machine = word; + + if ( !importMachines && inclSectionTarg == 0 ) { + ignoreSection = false; + + ParserDictEl *pdEl = id->parserDict.find( machine ); + if ( pdEl == 0 ) { + pdEl = new ParserDictEl( machine ); + pdEl->value = new Parser6( id, fileName, machine, sectionLoc, + id->hostLang, id->minimizeLevel, id->minimizeOpt ); + pdEl->value->init(); + id->parserDict.insert( pdEl ); + id->parserList.append( pdEl->value ); + + /* Also into the parse data dict. This is the new style. 
*/ + ParseDataDictEl *pddEl = new ParseDataDictEl( machine ); + pddEl->value = pdEl->value->pd; + id->parseDataDict.insert( pddEl ); + id->parseDataList.append( pddEl->value ); + } + + parser = pdEl->value; + } + else if ( !importMachines && strcmp( inclSectionTarg, machine ) == 0 ) { + /* found include target */ + ignoreSection = false; + parser = inclToParser; + } + else { + /* ignoring section */ + ignoreSection = true; + parser = 0; + } + } +} + +void Scanner::handleInclude() +{ + if ( sectionPass ) + return; + + if ( active() ) { + char *inclSectionName = word; + const char **includeChecks = 0; + + /* Implement defaults for the input file and section name. */ + if ( inclSectionName == 0 ) + inclSectionName = parser->sectionName; + + if ( lit != 0 ) { + long length = 0; + bool caseInsensitive = false; + char *data = prepareLitString( id, InputLoc(), lit, lit_len, length, caseInsensitive ); + + includeChecks = parser->pd->id->makeIncludePathChecks( fileName, data ); + } + else { + char *test = new char[strlen(fileName)+1]; + strcpy( test, fileName ); + + includeChecks = new const char*[2]; + + includeChecks[0] = test; + includeChecks[1] = 0; + } + + long found = 0; + ifstream *inFile = parser->pd->id->tryOpenInclude( includeChecks, found ); + if ( inFile == 0 ) { + id->error(scan_loc()) << "include: failed to locate file" << endl; + const char **tried = includeChecks; + while ( *tried != 0 ) + id->error(scan_loc()) << "include: attempted: \"" << *tried++ << '\"' << endl; + } + else { + /* Don't include anything that's already been included. 
*/ + if ( !parser->pd->duplicateInclude( includeChecks[found], inclSectionName ) ) { + parser->pd->includeHistory.push_back( IncludeHistoryItem( + includeChecks[found], inclSectionName ) ); + + Scanner scanner( id, includeChecks[found], *inFile, parser, + inclSectionName, includeDepth+1, false ); + scanner.do_scan( ); + } + + delete inFile; + } + } +} + +void Scanner::handleImport() +{ + if ( sectionPass ) + return; + + if ( active() ) { + long length = 0; + bool caseInsensitive = false; + char *data = prepareLitString( id, InputLoc(), lit, lit_len, length, caseInsensitive ); + + const char **importChecks = parser->pd->id->makeIncludePathChecks( fileName, data ); + + /* Open the input file for reading. */ + long found = 0; + ifstream *inFile = parser->pd->id->tryOpenInclude( importChecks, found ); + if ( inFile == 0 ) { + id->error(scan_loc()) << "import: could not open import file " << + "for reading" << endl; + const char **tried = importChecks; + while ( *tried != 0 ) + id->error(scan_loc()) << "import: attempted: \"" << *tried++ << '\"' << endl; + } + + Scanner scanner( id, importChecks[found], *inFile, parser, + 0, includeDepth+1, true ); + scanner.do_scan( ); + scanner.importToken( 0, 0, 0 ); + scanner.flushImport(); + delete inFile; + } +} + +%%{ + machine section_parse; + + # Need the defines representing tokens. 
+ import "rlparse.h"; + + action clear_words { word = lit = 0; word_len = lit_len = 0; } + action store_word { word = tokdata; word_len = toklen; } + action store_lit { lit = tokdata; lit_len = toklen; } + + action mach_err { id->error(scan_loc()) << "bad machine statement" << endl; } + action incl_err { id->error(scan_loc()) << "bad include statement" << endl; } + action import_err { id->error(scan_loc()) << "bad import statement" << endl; } + action write_err { id->error(scan_loc()) << "bad write statement" << endl; } + + action handle_machine { handleMachine(); } + action handle_include { handleInclude(); } + action handle_import { handleImport(); } + + machine_stmt = + ( KW_Machine TK_Word @store_word ';' ) @handle_machine + <>err mach_err <>eof mach_err; + + include_names = ( + TK_Word @store_word ( TK_Literal @store_lit )? | + TK_Literal @store_lit + ) >clear_words; + + include_stmt = + ( KW_Include include_names ';' ) @handle_include + <>err incl_err <>eof incl_err; + + import_stmt = + ( KW_Import TK_Literal @store_lit ';' ) @handle_import + <>err import_err <>eof import_err; + + action write_command + { + if ( sectionPass ) { + InputItem *inputItem = new InputItem; + inputItem->type = InputItem::Write; + inputItem->loc.fileName = fileName; + inputItem->loc.line = line; + inputItem->loc.col = column; + inputItem->name = section->sectionName; + inputItem->section = section; + + /* Track the last reference. 
*/ + inputItem->section->lastReference = inputItem; + + id->inputItems.append( inputItem ); + } + else { + if ( includeDepth == 0 && active() && + id->machineSpec == 0 && id->machineName == 0 ) + { + id->curItem = id->curItem->next; + id->curItem->pd = parser->pd; + id->curItem->parser = parser; + id->checkLastRef( id->curItem ); + } + } + } + + action write_arg + { + if ( sectionPass ) { + } + else { + if ( active() && id->machineSpec == 0 && id->machineName == 0 ) + id->curItem->writeArgs.push_back( strdup(tokdata) ); + } + } + + action write_close + { + if ( sectionPass ) { + } + else { + /* if ( active() && id->machineSpec == 0 && id->machineName == 0 ) + * id->curItem->writeArgs.append( 0 ); */ + } + } + + write_stmt = + ( KW_Write @write_command + ( TK_Word @write_arg )+ ';' @write_close ) + <>err write_err <>eof write_err; + + action handle_token + { + if ( sectionPass ) { + deleteTokdata( tokdata ); + } + else { + /* Send the token off to the parser. */ + if ( active() ) { + if ( tokdata != 0 ) { + linkTokdata( parser, tokdata ); + } + + directToParser( parser, fileName, line, column, type, tokdata, toklen ); + } + else { + deleteTokdata( tokdata ); + } + } + } + + # Catch everything else. 
+ everything_else = + ^( KW_Machine | KW_Include | KW_Import | KW_Write ) @handle_token; + + main := ( + machine_stmt | + include_stmt | + import_stmt | + write_stmt | + everything_else + )*; +}%% + +void Scanner::token( int type, char c ) +{ + token( type, &c, &c + 1 ); +} + +void Scanner::token( int type ) +{ + token( type, 0, 0 ); +} + + +void Scanner::token( int type, char *start, char *end ) +{ + char *tokdata = 0; + int toklen = 0; + if ( start != 0 ) { + toklen = end-start; + tokdata = newTokdata( toklen + 1 ); + memcpy( tokdata, start, toklen ); + tokdata[toklen] = 0; + } + + processToken( type, tokdata, toklen ); +} + +void Scanner::processToken( int type, char *tokdata, int toklen ) +{ + int *p, *pe, *eof; + + if ( type < 0 ) + p = pe = eof = 0; + else { + p = &type; + pe = &type + 1; + eof = 0; + } + + %%{ + machine section_parse; + write exec; + }%% + + updateCol(); + + /* Record the last token for use in controlling the scan of subsequent + * tokens. */ + lastToken = type; +} + +void Scanner::startSection( ) +{ + parserExistsError = false; + + sectionLoc.fileName = fileName; + sectionLoc.line = line; + sectionLoc.col = column; +} + +void Scanner::endSection( ) +{ + /* Execute the eof actions for the section parser. */ + processToken( -1, 0, 0 ); + + if ( sectionPass ) { + InputItem *inputItem = new InputItem; + inputItem->type = InputItem::EndSection; + inputItem->loc.fileName = fileName; + inputItem->loc.line = line; + inputItem->loc.col = column; + id->inputItems.append( inputItem ); + if ( section != 0 ) { + inputItem->section = section; + section->lastReference = inputItem; + } + + if ( includeDepth == 0 ) { + if ( id->machineSpec == 0 && id->machineName == 0 ) { + /* The end section may include a newline on the end, so + * we use the last line, which will count the newline. 
+ */ + InputItem *inputItem = new InputItem; + inputItem->type = InputItem::HostData; + inputItem->loc.fileName = fileName; + inputItem->loc.line = line; + inputItem->loc.col = column; + id->inputItems.append( inputItem ); + } + } + } + else { + /* Close off the section with the parser. */ + if ( includeDepth == 0 && active() ) { + InputLoc loc; + loc.fileName = fileName; + loc.line = line; + loc.col = column; + + parser->token( loc, TK_EndSection, 0, 0 ); + + id->curItem = id->curItem->next; + + if ( parser != 0 ) { + id->curItem->pd = parser->pd; + id->curItem->parser = parser; + } + + id->checkLastRef( id->curItem ); + } + + if ( includeDepth == 0 ) { + if ( id->machineSpec == 0 && id->machineName == 0 ) { + id->curItem = id->curItem->next; + id->checkLastRef( id->curItem ); + } + } + } +} + +%%{ + machine rlscan; + + # This is sent by the driver code. + EOF = 0; + + action inc_nl { + lastnl = p; + column = 0; + line++; + } + NL = '\n' @inc_nl; + + # Identifiers, numbers, comments, and other common things. + ident = ( alpha | '_' ) ( alpha |digit |'_' )*; + ocaml_ident = ( alpha | '_' ) ( alpha |digit |'_' )* "'"?; + number = digit+; + hex_number = '0x' [0-9a-fA-F]+; + + c_comment = + '/*' ( any | NL )* :>> '*/'; + + cpp_comment = + '//' [^\n]* NL; + + c_cpp_comment = c_comment | cpp_comment; + + ruby_comment = '#' [^\n]* NL; + + # These literal forms are common to host code and ragel. + s_literal = "'" ([^'\\] | NL | '\\' (any | NL))* "'"; + d_literal = '"' ([^"\\] | NL | '\\' (any | NL))* '"'; + host_re_literal = '/' ([^/\\] | NL | '\\' (any | NL))* '/'; + + whitespace = [ \t] | NL; + pound_comment = '#' [^\n]* NL; + + # An inline block of code for languages other than Ruby. + inline_code := |* + # Inline expression keywords. 
+ "fpc" => { token( KW_PChar ); }; + "fc" => { token( KW_Char ); }; + "fcurs" => { token( KW_CurState ); }; + "ftargs" => { token( KW_TargState ); }; + "fentry" => { + whitespaceOn = false; + token( KW_Entry ); + }; + + # Inline statement keywords. + "fhold" => { + whitespaceOn = false; + token( KW_Hold ); + }; + "fexec" => { token( KW_Exec, 0, 0 ); }; + "fgoto" => { + whitespaceOn = false; + token( KW_Goto ); + }; + "fnext" => { + whitespaceOn = false; + token( KW_Next ); + }; + "fcall" => { + whitespaceOn = false; + token( KW_Call ); + }; + "fret" => { + whitespaceOn = false; + token( KW_Ret ); + }; + "fbreak" => { + whitespaceOn = false; + token( KW_Break ); + }; + "fncall" => { + whitespaceOn = false; + token( KW_Ncall ); + }; + "fnret" => { + whitespaceOn = false; + token( KW_Nret ); + }; + "fnbreak" => { + whitespaceOn = false; + token( KW_Nbreak ); + }; + + ident => { token( TK_Word, ts, te ); }; + + number => { token( TK_UInt, ts, te ); }; + hex_number => { token( TK_Hex, ts, te ); }; + + ( s_literal | d_literal ) + => { token( IL_Literal, ts, te ); }; + + whitespace+ => { + if ( whitespaceOn ) + token( IL_WhiteSpace, ts, te ); + }; + + c_cpp_comment => { token( IL_Comment, ts, te ); }; + + "::" => { token( TK_NameSep, ts, te ); }; + + # Some symbols need to go to the parser as with their cardinal value as + # the token type (as opposed to being sent as anonymous symbols) + # because they are part of the sequences which we interpret. The * ) ; + # symbols cause whitespace parsing to come back on. This gets turned + # off by some keywords. 
+ + ";" => { + whitespaceOn = true; + token( *ts, ts, te ); + if ( inlineBlockType == SemiTerminated ) + fret; + }; + + "$" [a-zA-Z_][a-zA-Z_0-9]* => { + if ( parser != 0 && parser->parseSubstitutions ) + token( TK_SubstRef, ts+1, te ); + else { + token( IL_Symbol, ts, ts+1 ); + fexec ts+1; + } + }; + + [*)] => { + whitespaceOn = true; + token( *ts, ts, te ); + }; + + [,(] => { token( *ts, ts, te ); }; + + '{' => { + token( IL_Symbol, ts, te ); + curly_count += 1; + }; + + '}' => { + if ( --curly_count == 0 && inlineBlockType == CurlyDelimited ) { + /* Inline code block ends. */ + token( '}' ); + fret; + } + else { + /* Either a semi terminated inline block or only the closing + * brace of some inner scope, not the block's closing brace. */ + token( IL_Symbol, ts, te ); + } + }; + + EOF => { + id->error(scan_loc()) << "unterminated code block" << endl; + }; + + # Send every other character as a symbol. + any => { token( IL_Symbol, ts, te ); }; + *|; + + or_literal := |* + # Escape sequences in OR expressions. + '\\0' => { token( RE_Char, '\0' ); }; + '\\a' => { token( RE_Char, '\a' ); }; + '\\b' => { token( RE_Char, '\b' ); }; + '\\t' => { token( RE_Char, '\t' ); }; + '\\n' => { token( RE_Char, '\n' ); }; + '\\v' => { token( RE_Char, '\v' ); }; + '\\f' => { token( RE_Char, '\f' ); }; + '\\r' => { token( RE_Char, '\r' ); }; + '\\\n' => { updateCol(); }; + '\\' any => { token( RE_Char, ts+1, te ); }; + + # Range dash in an OR expression. + '-' => { token( RE_Dash, 0, 0 ); }; + + # Terminate an OR expression. + ']' => { token( RE_SqClose ); fret; }; + + EOF => { + id->error(scan_loc()) << "unterminated OR literal" << endl; + }; + + # Characters in an OR expression. + [^\]] => { token( RE_Char, ts, te ); }; + + *|; + + ragel_re_literal := |* + # Escape sequences in regular expressions. 
+ '\\0' => { token( RE_Char, '\0' ); }; + '\\a' => { token( RE_Char, '\a' ); }; + '\\b' => { token( RE_Char, '\b' ); }; + '\\t' => { token( RE_Char, '\t' ); }; + '\\n' => { token( RE_Char, '\n' ); }; + '\\v' => { token( RE_Char, '\v' ); }; + '\\f' => { token( RE_Char, '\f' ); }; + '\\r' => { token( RE_Char, '\r' ); }; + '\\\n' => { updateCol(); }; + '\\' any => { token( RE_Char, ts+1, te ); }; + + # Terminate the regular expression. + '/' [i]? => { + token( RE_Slash, ts, te ); + fgoto parser_def; + }; + + # Special characters. + '.' => { token( RE_Dot ); }; + '*' => { token( RE_Star ); }; + + '[' => { token( RE_SqOpen ); fcall or_literal; }; + '[^' => { token( RE_SqOpenNeg ); fcall or_literal; }; + + EOF => { + id->error(scan_loc()) << "unterminated regular expression" << endl; + }; + + # Characters in the regular expression. + [^\/] => { token( RE_Char, ts, te ); }; + *|; + + # We need a separate token space here to avoid the ragel keywords. + write_statement := |* + ident => { token( TK_Word, ts, te ); } ; + [ \t\n]+ => { updateCol(); }; + ';' => { token( ';' ); fgoto parser_def; }; + + EOF => { + id->error(scan_loc()) << "unterminated write statement" << endl; + }; + *|; + + # Parser definitions. + parser_def := |* + #'length_cond' => { token( KW_Length ); }; + 'machine' => { token( KW_Machine ); }; + 'include' => { token( KW_Include ); }; + 'import' => { token( KW_Import ); }; + 'write' => { + token( KW_Write ); + fgoto write_statement; + }; + 'action' => { token( KW_Action ); }; + 'alphtype' => { token( KW_AlphType ); }; + 'prepush' => { token( KW_PrePush ); }; + 'postpop' => { token( KW_PostPop ); }; + + 'nfaprepush' => { token( KW_NfaPrePush ); }; + 'nfapostpop' => { token( KW_NfaPostPop ); }; + + # FIXME: Enable this post 5.17. 
+ # 'range' => { token( KW_Range ); }; + + 'getkey' => { + token( KW_GetKey ); + inlineBlockType = SemiTerminated; + fcall inline_code; + }; + 'access' => { + token( KW_Access ); + inlineBlockType = SemiTerminated; + fcall inline_code; + }; + 'variable' => { + token( KW_Variable ); + inlineBlockType = SemiTerminated; + fcall inline_code; + }; + 'when' => { token( KW_When ); }; + 'inwhen' => { token( KW_InWhen ); }; + 'outwhen' => { token( KW_OutWhen ); }; + 'eof' => { token( KW_Eof ); }; + 'err' => { token( KW_Err ); }; + 'lerr' => { token( KW_Lerr ); }; + 'to' => { token( KW_To ); }; + 'from' => { token( KW_From ); }; + 'export' => { token( KW_Export ); }; + + # Identifiers. + ident => { token( TK_Word, ts, te ); } ; + + # Numbers + number => { token( TK_UInt, ts, te ); }; + hex_number => { token( TK_Hex, ts, te ); }; + + # Literals, with optionals. + ( s_literal | d_literal ) [i]? + => { token( TK_Literal, ts, te ); }; + + '[' => { token( RE_SqOpen ); fcall or_literal; }; + '[^' => { token( RE_SqOpenNeg ); fcall or_literal; }; + + '/' => { token( RE_Slash ); fgoto ragel_re_literal; }; + + # Ignore. + pound_comment => { updateCol(); }; + + ':=' => { token( TK_ColonEquals ); }; + '|=' => { token( TK_BarEquals ); }; + + # To State Actions. + ">~" => { token( TK_StartToState ); }; + "$~" => { token( TK_AllToState ); }; + "%~" => { token( TK_FinalToState ); }; + "<~" => { token( TK_NotStartToState ); }; + "@~" => { token( TK_NotFinalToState ); }; + "<>~" => { token( TK_MiddleToState ); }; + + # From State actions + ">*" => { token( TK_StartFromState ); }; + "$*" => { token( TK_AllFromState ); }; + "%*" => { token( TK_FinalFromState ); }; + "<*" => { token( TK_NotStartFromState ); }; + "@*" => { token( TK_NotFinalFromState ); }; + "<>*" => { token( TK_MiddleFromState ); }; + + # EOF Actions. 
+ ">/" => { token( TK_StartEOF ); }; + "$/" => { token( TK_AllEOF ); }; + "%/" => { token( TK_FinalEOF ); }; + "</" => { token( TK_NotStartEOF ); }; + "@/" => { token( TK_NotFinalEOF ); }; + "<>/" => { token( TK_MiddleEOF ); }; + + # Global Error actions. + ">!" => { token( TK_StartGblError ); }; + "$!" => { token( TK_AllGblError ); }; + "%!" => { token( TK_FinalGblError ); }; + "<!" => { token( TK_NotStartGblError ); }; + "@!" => { token( TK_NotFinalGblError ); }; + "<>!" => { token( TK_MiddleGblError ); }; + + # Local error actions. + ">^" => { token( TK_StartLocalError ); }; + "$^" => { token( TK_AllLocalError ); }; + "%^" => { token( TK_FinalLocalError ); }; + "<^" => { token( TK_NotStartLocalError ); }; + "@^" => { token( TK_NotFinalLocalError ); }; + "<>^" => { token( TK_MiddleLocalError ); }; + + # Middle. + "<>" => { token( TK_Middle ); }; + + # Conditions. + '>?' => { token( TK_StartCond ); }; + '$?' => { token( TK_AllCond ); }; + '%?' => { token( TK_LeavingCond ); }; + + '..' => { token( TK_DotDot ); }; + '../i' => { token( TK_DotDotIndep ); }; + + '**' => { token( TK_StarStar ); }; + '--' => { token( TK_DashDash ); }; + '->' => { token( TK_Arrow ); }; + '=>' => { token( TK_DoubleArrow ); }; + + ":>" => { token( TK_ColonGt ); }; + ":>>" => { token( TK_ColonGtGt ); }; + "<:" => { token( TK_LtColon ); }; + + ":nfa(" => { token( TK_ColonNfaOpen ); }; + ":cond(" => { token( TK_ColonCondOpen ); }; + ":condstar(" => { token( TK_ColonCondStarOpen ); }; + ":condplus(" => { token( TK_ColonCondPlusOpen ); }; + ":nomax(" => { token( TK_ColonNoMaxOpen ); }; + "):" => { token( TK_CloseColon ); }; + + # Opening of longest match. + "|*" => { token( TK_BarStar ); }; + + # Separater for name references. + "::" => { token( TK_NameSep, ts, te ); }; + + '}%%' => { + updateCol(); + endSection(); + fret; + }; + + [ \t\r]+ => { updateCol(); }; + + # If we are in a single line machine then newline may end the spec. 
+ NL => { + updateCol(); + if ( singleLineSpec ) { + endSection(); + fret; + } + }; + + '{' => { + if ( lastToken == KW_Export || lastToken == KW_Entry ) + token( '{' ); + else { + token( '{' ); + curly_count = 1; + inlineBlockType = CurlyDelimited; + fcall inline_code; + } + }; + + EOF => { + id->error(scan_loc()) << "unterminated ragel section" << endl; + }; + + any => { token( *ts ); } ; + *|; + + # Outside code scanner. These tokens get passed through. + main := |* + 'define' => { pass( IMP_Define, 0, 0 ); }; + ident => { pass( IMP_Word, ts, te ); }; + number => { pass( IMP_UInt, ts, te ); }; + c_cpp_comment => { pass(); }; + ( s_literal | d_literal ) => { pass( IMP_Literal, ts, te ); }; + + '%%{' => { + updateCol(); + singleLineSpec = false; + startSection(); + fcall parser_def; + }; + '%%' => { + updateCol(); + singleLineSpec = true; + startSection(); + fcall parser_def; + }; + whitespace+ => { pass(); }; + EOF; + any => { pass( *ts, 0, 0 ); }; + *|; +}%% + +%% write data; + +void Scanner::do_scan() +{ + int bufsize = 8; + char *buf = new char[bufsize]; + int cs, act, have = 0; + int top; + + /* The stack is two deep, one level for going into ragel defs from the main + * machines which process outside code, and another for going into or literals + * from either a ragel spec, or a regular expression. */ + int stack[2]; + int curly_count = 0; + bool execute = true; + bool singleLineSpec = false; + InlineBlockType inlineBlockType = CurlyDelimited; + + line = 1; + column = 1; + lastnl = 0; + + /* Init the section parser and the character scanner. */ + init(); + %% write init; + + /* Set up the start state. FIXME: After 5.20 is released the nocs write + * init option should be used, the main machine eliminated and this statement moved + * above the write init. */ + cs = rlscan_en_main; + + while ( execute ) { + char *p = buf + have; + int space = bufsize - have; + + if ( space == 0 ) { + /* We filled up the buffer trying to scan a token. Grow it. 
*/ + bufsize = bufsize * 2; + char *newbuf = new char[bufsize]; + + /* Recompute p and space. */ + p = newbuf + have; + space = bufsize - have; + + /* Patch up pointers possibly in use. */ + if ( ts != 0 ) + ts = newbuf + ( ts - buf ); + te = newbuf + ( te - buf ); + + /* Copy the new buffer in. */ + memcpy( newbuf, buf, have ); + delete[] buf; + buf = newbuf; + } + + input.read( p, space ); + int len = input.gcount(); + char *pe = p + len; + + /* If we see eof then append the eof var. */ + char *eof = 0; + if ( len == 0 ) { + eof = pe; + execute = false; + } + + %% write exec; + + /* Check if we failed. */ + if ( cs == rlscan_error ) { + /* Machine failed before finding a token. I'm not yet sure if this + * is reachable. */ + id->error(scan_loc()) << "scanner error" << endl; + id->abortCompile( 1 ); + } + + /* Decide if we need to preserve anything. */ + char *preserve = ts; + + /* Now set up the prefix. */ + if ( preserve == 0 ) + have = 0; + else { + /* There is data that needs to be shifted over. */ + have = pe - preserve; + memmove( buf, preserve, have ); + unsigned int shiftback = preserve - buf; + if ( ts != 0 ) + ts -= shiftback; + te -= shiftback; + + preserve = buf; + } + } + + delete[] buf; +} |