From 5bb41f82c1c8a9edf8bf7b73de20425888de639c Mon Sep 17 00:00:00 2001 From: Adrian Thurston Date: Sun, 8 Mar 2009 02:49:32 +0000 Subject: More work on the pull scanner. Seems to work parsing input and constructors. --- colm/fsmcodegen.cpp | 1 - colm/fsmrun.cpp | 244 +++++++++++++++++++++++++++++----------------------- colm/parsedata.cpp | 8 +- 3 files changed, 141 insertions(+), 112 deletions(-) diff --git a/colm/fsmcodegen.cpp b/colm/fsmcodegen.cpp index 63fec1c3..9e1ff32d 100644 --- a/colm/fsmcodegen.cpp +++ b/colm/fsmcodegen.cpp @@ -1012,7 +1012,6 @@ void FsmCodeGen::writeExec() out << "void FsmRun::execute()\n" "{\n" - " matchedToken = 0;\n" "/*_resume:*/\n"; if ( redFsm->errState != 0 ) { diff --git a/colm/fsmrun.cpp b/colm/fsmrun.cpp index 1ee8b918..24e40ae2 100644 --- a/colm/fsmrun.cpp +++ b/colm/fsmrun.cpp @@ -832,6 +832,10 @@ long PdaRun::undoParse( Tree *tree, CodeVect *rev ) return 0; } +#define SCAN_ERROR -3 +#define SCAN_LANG_EL -2 +#define SCAN_EOF -1 + void parse( FsmRun *fsmRun, PdaRun *parser ) { parser->init(); @@ -839,83 +843,22 @@ void parse( FsmRun *fsmRun, PdaRun *parser ) while ( true ) { int tokenId = fsmRun->scan( parser ); - if ( tokenId == 0 ) - break; - - bool ctxDepParsing = fsmRun->prg->ctxDepParsing; - LangElInfo *lelInfo = parser->tables->rtd->lelInfo; - if ( ctxDepParsing && lelInfo[tokenId].frameId >= 0 ) - fsmRun->execGen( parser, tokenId ); - else if ( lelInfo[tokenId].ignore ) - fsmRun->sendIgnore( parser, tokenId ); - else - fsmRun->sendToken( parser, tokenId ); - } -} - -long FsmRun::scan( PdaRun *parser ) -{ - long space; - -// long prevState = cs; -// PdaRun *prevParser = parser; -// parser = destParser; - -// parser->init(); - - act = 0; - tokstart = 0; - tokend = 0; - region = parser->getNextRegion(); - cs = tables->entryByRegion[region]; - memset( mark, 0, sizeof(mark) ); - - /* Start with the EOF test. The pattern and replacement input sources can - * be EOF from the start. */ - - while ( true ) { - /* Check for eof. */ - if ( p == pe && inputStream->isEOF() ) { - if ( tokstart != 0 ) { - /* If a token has been started, but not finshed - * this is an error. */ - cs = tables->errorState; - } - else { - eofSent = true; - sendEOF( parser ); - if ( !eofSent ) - continue; + /* Check for EOF. */ + if ( tokenId == SCAN_EOF ) { + fsmRun->eofSent = true; + fsmRun->sendEOF( parser ); + if ( fsmRun->eofSent ) break; - } } - if ( p == pe ) { - /* We don't have any data. What is next in the input stream? */ - if ( inputStream->isLangEl() ) - sendNamedLangEl( parser ); - else { - space = runBuf->buf + FSM_BUFSIZE - pe; - - if ( space == 0 ) - cerr << "OUT OF BUFFER SPACE" << endp; - - int len = inputStream->getData( p, space ); - pe = p + len; - if ( inputStream->needFlush() ) - peof = pe; - } - } - - execute(); - - /* First check if scanning stopped because we have a token. */ - if ( matchedToken > 0 ) - return matchedToken; + /* Check for a named language element. */ + if ( tokenId == SCAN_LANG_EL ) + fsmRun->sendNamedLangEl( parser ); +/* THIS GOES SOMEWHERE ELSE NOW */ +#if 0 /* Fall through here either when the input buffer has been exhausted * or the scanner is in an error state. Otherwise we must continue. */ - if ( cs == tables->errorState && parser->stopParsing ) { #ifdef COLM_LOG_PARSE if ( colm_log_parse ) { @@ -924,22 +867,10 @@ long FsmRun::scan( PdaRun *parser ) #endif goto done; } +#endif - /* First thing check for error. */ - if ( cs == tables->errorState ) { - /* If a token was started, but not finished (tokstart != 0) then - * restore p to the beginning of that token. */ - if ( tokstart != 0 ) - p = tokstart; - - /* Check for a default token in the region. If one is there - * then send it and continue with the processing loop. */ - if ( parser->tables->rtd->regionInfo[region].defaultToken >= 0 ) { - tokstart = tokend = p; - sendToken( parser, parser->tables->rtd->regionInfo[region].defaultToken ); - continue; - } - + /* Check for error. */ + if ( tokenId == SCAN_ERROR ) { if ( parser->getNextRegion( 1 ) != 0 ) { #ifdef COLM_LOG_PARSE if ( colm_log_parse ) { @@ -953,12 +884,12 @@ long FsmRun::scan( PdaRun *parser ) parser->sendBackIgnore(); parser->nextRegionInd += 1; - region = parser->getNextRegion(); - cs = tables->entryByRegion[region]; + fsmRun->region = parser->getNextRegion(); + fsmRun->cs = fsmRun->tables->entryByRegion[fsmRun->region]; #ifdef COLM_LOG_PARSE if ( colm_log_parse ) { cerr << "new token region: " << - parser->tables->rtd->regionInfo[region].name << endl; + parser->tables->rtd->regionInfo[fsmRun->region].name << endl; } #endif continue; @@ -980,12 +911,12 @@ long FsmRun::scan( PdaRun *parser ) cerr << "PARSE ERROR" << endp; } else { - region = parser->getNextRegion(); - cs = tables->entryByRegion[region]; + fsmRun->region = parser->getNextRegion(); + fsmRun->cs = fsmRun->tables->entryByRegion[fsmRun->region]; #ifdef COLM_LOG_PARSE if ( colm_log_parse ) { cerr << "new token region: " << - parser->tables->rtd->regionInfo[region].name << endl; + parser->tables->rtd->regionInfo[fsmRun->region].name << endl; } #endif continue; @@ -993,36 +924,122 @@ long FsmRun::scan( PdaRun *parser ) } /* Machine failed before finding a token. */ - cerr << "error:" << inputStream->line << ": scanner error" << endp; + cerr << "error:" << fsmRun->inputStream->line << ": scanner error" << endp; + } + + bool ctxDepParsing = fsmRun->prg->ctxDepParsing; + LangElInfo *lelInfo = parser->tables->rtd->lelInfo; + if ( ctxDepParsing && lelInfo[tokenId].frameId >= 0 ) + fsmRun->execGen( parser, tokenId ); + else if ( lelInfo[tokenId].ignore ) + fsmRun->sendIgnore( parser, tokenId ); + else + fsmRun->sendToken( parser, tokenId ); + } +} + +long FsmRun::scan( PdaRun *parser ) +{ + long space; + + /* Init the scanner vars. */ + act = 0; + tokstart = 0; + tokend = 0; + matchedToken = 0; + + /* Set the state using the state of the parser. */ + region = parser->getNextRegion(); + cs = tables->entryByRegion[region]; + + /* Clear the mark array. */ + memset( mark, 0, sizeof(mark) ); + + /* Start with the EOF test. The pattern and replacement input sources can + * be EOF from the start. */ + + while ( true ) { + execute(); + + /* First check if scanning stopped because we have a token. */ + if ( matchedToken > 0 ) + return matchedToken; + + /* Check for error. */ + if ( cs == tables->errorState ) { + /* If a token was started, but not finished (tokstart != 0) then + * restore p to the beginning of that token. */ + if ( tokstart != 0 ) + p = tokstart; + + /* Check for a default token in the region. If one is there + * then send it and continue with the processing loop. */ + if ( parser->tables->rtd->regionInfo[region].defaultToken >= 0 ) { + tokstart = tokend = p; + return parser->tables->rtd->regionInfo[region].defaultToken; + } + + return SCAN_ERROR; } + /* Got here because the state machine didn't match a token or + * encounter an error. Must be because we got to the end of the buffer + * data. */ + assert( p == pe ); + + /* Check for a named language element. Note that we can do this only + * when p == pe otherwise we get ahead of what's already in the + * buffer. */ + if ( inputStream->isLangEl() ) + return SCAN_LANG_EL; + + /* Maybe need eof. */ + if ( inputStream->isEOF() ) { + if ( tokstart != 0 ) { + /* If a token has been started, but not finshed + * this is an error. */ + cs = tables->errorState; + return SCAN_ERROR; + } + else { + return SCAN_EOF; + } + } + + /* There may be space left in the current buffer. If not then we need + * to make some. */ space = runBuf->buf + FSM_BUFSIZE - pe; if ( space == 0 ) { /* Create a new run buf. */ - RunBuf *buf = new RunBuf; - buf->next = runBuf; - runBuf = buf; + RunBuf *newBuf = new RunBuf; /* If partway through a token then preserve the prefix. */ long have = 0; if ( tokstart == 0 ) { - /* No prefix, the previous buffer was filled. */ - runBuf->next->length = FSM_BUFSIZE; + /* No prefix. We filled the previous buffer. */ + runBuf->length = FSM_BUFSIZE; } else { + if ( tokstart == runBuf->buf ) { + /* A token is started and it is already at the beginning + * of the current buffer. This means buffer is full and it + * must be grown. Probably need to do this sooner. */ + cerr << "OUT OF BUFFER SPACE" << endp; + } + /* There is data that needs to be shifted over. */ have = pe - tokstart; - memcpy( runBuf->buf, tokstart, have ); + memcpy( newBuf->buf, tokstart, have ); /* Compute the length of the previous buffer. */ - runBuf->next->length = FSM_BUFSIZE - have; + runBuf->length = FSM_BUFSIZE - have; /* Compute tokstart and tokend. */ - long dist = tokstart - runBuf->buf; + long dist = tokstart - newBuf->buf; tokend -= dist; - tokstart = runBuf->buf; + tokstart = newBuf->buf; /* Shift any markers. */ for ( int i = 0; i < MARK_SLOTS; i++ ) { @@ -1030,13 +1047,26 @@ long FsmRun::scan( PdaRun *parser ) mark[i] -= dist; } } - p = pe = runBuf->buf + have; + + p = pe = newBuf->buf + have; peof = 0; + + newBuf->next = runBuf; + runBuf = newBuf; } + + /* We don't have any data. What is next in the input stream? */ + space = runBuf->buf + FSM_BUFSIZE - pe; + assert( space > 0 ); + + /* Get more data. */ + int len = inputStream->getData( p, space ); + pe = p + len; + if ( inputStream->needFlush() ) + peof = pe; + } -done: - //parser = prevParser; - //cs = prevState; - return 0; + /* Should not be reached. */ + return SCAN_ERROR; } diff --git a/colm/parsedata.cpp b/colm/parsedata.cpp index 42fb87e6..802ac83d 100644 --- a/colm/parsedata.cpp +++ b/colm/parsedata.cpp @@ -1576,14 +1576,14 @@ InputStreamRepl::InputStreamRepl( Replacement *replacement ) int InputStreamRepl::isLangEl() { - return replItem != 0 && - ( replItem->type == ReplItem::VarRefType || replItem->type == ReplItem::FactorType ); + return replItem != 0 && ( replItem->type == ReplItem::VarRefType || + replItem->type == ReplItem::FactorType ); } int InputStreamRepl::shouldFlush() { - return replItem == 0 || - ( replItem->type == ReplItem::VarRefType || replItem->type == ReplItem::FactorType ); + return replItem == 0 || ( replItem->type == ReplItem::VarRefType || + replItem->type == ReplItem::FactorType ); } KlangEl *InputStreamRepl::getLangEl( long &bindId, char *&data, long &length ) -- cgit v1.2.1