summary refs log tree commit diff
diff options
context:
space:
mode:
author Adrian Thurston <thurston@complang.org> 2009-03-08 02:49:32 +0000
committer Adrian Thurston <thurston@complang.org> 2009-03-08 02:49:32 +0000
commit 5bb41f82c1c8a9edf8bf7b73de20425888de639c (patch)
tree aac49c3a0857451cfb5460f4d6587b4f3bca0e9d
parent c2210d0045411c83ca57406daf2c858d4cdf8d2a (diff)
download colm-5bb41f82c1c8a9edf8bf7b73de20425888de639c.tar.gz
More work on the pull scanner. Seems to work parsing input and constructors.
-rw-r--r-- colm/fsmcodegen.cpp 1
-rw-r--r-- colm/fsmrun.cpp 244
-rw-r--r-- colm/parsedata.cpp 8
3 files changed, 141 insertions, 112 deletions
diff --git a/colm/fsmcodegen.cpp b/colm/fsmcodegen.cpp
index 63fec1c3..9e1ff32d 100644
--- a/colm/fsmcodegen.cpp
+++ b/colm/fsmcodegen.cpp
@@ -1012,7 +1012,6 @@ void FsmCodeGen::writeExec()
out <<
"void FsmRun::execute()\n"
"{\n"
- " matchedToken = 0;\n"
"/*_resume:*/\n";
if ( redFsm->errState != 0 ) {
diff --git a/colm/fsmrun.cpp b/colm/fsmrun.cpp
index 1ee8b918..24e40ae2 100644
--- a/colm/fsmrun.cpp
+++ b/colm/fsmrun.cpp
@@ -832,6 +832,10 @@ long PdaRun::undoParse( Tree *tree, CodeVect *rev )
return 0;
}
+#define SCAN_ERROR -3
+#define SCAN_LANG_EL -2
+#define SCAN_EOF -1
+
void parse( FsmRun *fsmRun, PdaRun *parser )
{
parser->init();
@@ -839,83 +843,22 @@ void parse( FsmRun *fsmRun, PdaRun *parser )
while ( true ) {
int tokenId = fsmRun->scan( parser );
- if ( tokenId == 0 )
- break;
-
- bool ctxDepParsing = fsmRun->prg->ctxDepParsing;
- LangElInfo *lelInfo = parser->tables->rtd->lelInfo;
- if ( ctxDepParsing && lelInfo[tokenId].frameId >= 0 )
- fsmRun->execGen( parser, tokenId );
- else if ( lelInfo[tokenId].ignore )
- fsmRun->sendIgnore( parser, tokenId );
- else
- fsmRun->sendToken( parser, tokenId );
- }
-}
-
-long FsmRun::scan( PdaRun *parser )
-{
- long space;
-
-// long prevState = cs;
-// PdaRun *prevParser = parser;
-// parser = destParser;
-
-// parser->init();
-
- act = 0;
- tokstart = 0;
- tokend = 0;
- region = parser->getNextRegion();
- cs = tables->entryByRegion[region];
- memset( mark, 0, sizeof(mark) );
-
- /* Start with the EOF test. The pattern and replacement input sources can
- * be EOF from the start. */
-
- while ( true ) {
- /* Check for eof. */
- if ( p == pe && inputStream->isEOF() ) {
- if ( tokstart != 0 ) {
- /* If a token has been started, but not finshed
- * this is an error. */
- cs = tables->errorState;
- }
- else {
- eofSent = true;
- sendEOF( parser );
- if ( !eofSent )
- continue;
+ /* Check for EOF. */
+ if ( tokenId == SCAN_EOF ) {
+ fsmRun->eofSent = true;
+ fsmRun->sendEOF( parser );
+ if ( fsmRun->eofSent )
break;
- }
}
- if ( p == pe ) {
- /* We don't have any data. What is next in the input stream? */
- if ( inputStream->isLangEl() )
- sendNamedLangEl( parser );
- else {
- space = runBuf->buf + FSM_BUFSIZE - pe;
-
- if ( space == 0 )
- cerr << "OUT OF BUFFER SPACE" << endp;
-
- int len = inputStream->getData( p, space );
- pe = p + len;
- if ( inputStream->needFlush() )
- peof = pe;
- }
- }
-
- execute();
-
- /* First check if scanning stopped because we have a token. */
- if ( matchedToken > 0 )
- return matchedToken;
+ /* Check for a named language element. */
+ if ( tokenId == SCAN_LANG_EL )
+ fsmRun->sendNamedLangEl( parser );
+/* THIS GOES SOMEWHERE ELSE NOW */
+#if 0
/* Fall through here either when the input buffer has been exhausted
* or the scanner is in an error state. Otherwise we must continue. */
-
if ( cs == tables->errorState && parser->stopParsing ) {
#ifdef COLM_LOG_PARSE
if ( colm_log_parse ) {
@@ -924,22 +867,10 @@ long FsmRun::scan( PdaRun *parser )
#endif
goto done;
}
+#endif
- /* First thing check for error. */
- if ( cs == tables->errorState ) {
- /* If a token was started, but not finished (tokstart != 0) then
- * restore p to the beginning of that token. */
- if ( tokstart != 0 )
- p = tokstart;
-
- /* Check for a default token in the region. If one is there
- * then send it and continue with the processing loop. */
- if ( parser->tables->rtd->regionInfo[region].defaultToken >= 0 ) {
- tokstart = tokend = p;
- sendToken( parser, parser->tables->rtd->regionInfo[region].defaultToken );
- continue;
- }
-
+ /* Check for error. */
+ if ( tokenId == SCAN_ERROR ) {
if ( parser->getNextRegion( 1 ) != 0 ) {
#ifdef COLM_LOG_PARSE
if ( colm_log_parse ) {
@@ -953,12 +884,12 @@ long FsmRun::scan( PdaRun *parser )
parser->sendBackIgnore();
parser->nextRegionInd += 1;
- region = parser->getNextRegion();
- cs = tables->entryByRegion[region];
+ fsmRun->region = parser->getNextRegion();
+ fsmRun->cs = fsmRun->tables->entryByRegion[fsmRun->region];
#ifdef COLM_LOG_PARSE
if ( colm_log_parse ) {
cerr << "new token region: " <<
- parser->tables->rtd->regionInfo[region].name << endl;
+ parser->tables->rtd->regionInfo[fsmRun->region].name << endl;
}
#endif
continue;
@@ -980,12 +911,12 @@ long FsmRun::scan( PdaRun *parser )
cerr << "PARSE ERROR" << endp;
}
else {
- region = parser->getNextRegion();
- cs = tables->entryByRegion[region];
+ fsmRun->region = parser->getNextRegion();
+ fsmRun->cs = fsmRun->tables->entryByRegion[fsmRun->region];
#ifdef COLM_LOG_PARSE
if ( colm_log_parse ) {
cerr << "new token region: " <<
- parser->tables->rtd->regionInfo[region].name << endl;
+ parser->tables->rtd->regionInfo[fsmRun->region].name << endl;
}
#endif
continue;
@@ -993,36 +924,122 @@ long FsmRun::scan( PdaRun *parser )
}
/* Machine failed before finding a token. */
- cerr << "error:" << inputStream->line << ": scanner error" << endp;
+ cerr << "error:" << fsmRun->inputStream->line << ": scanner error" << endp;
+ }
+
+ bool ctxDepParsing = fsmRun->prg->ctxDepParsing;
+ LangElInfo *lelInfo = parser->tables->rtd->lelInfo;
+ if ( ctxDepParsing && lelInfo[tokenId].frameId >= 0 )
+ fsmRun->execGen( parser, tokenId );
+ else if ( lelInfo[tokenId].ignore )
+ fsmRun->sendIgnore( parser, tokenId );
+ else
+ fsmRun->sendToken( parser, tokenId );
+ }
+}
+
+long FsmRun::scan( PdaRun *parser )
+{
+ long space;
+
+ /* Init the scanner vars. */
+ act = 0;
+ tokstart = 0;
+ tokend = 0;
+ matchedToken = 0;
+
+ /* Set the state using the state of the parser. */
+ region = parser->getNextRegion();
+ cs = tables->entryByRegion[region];
+
+ /* Clear the mark array. */
+ memset( mark, 0, sizeof(mark) );
+
+ /* Start with the EOF test. The pattern and replacement input sources can
+ * be EOF from the start. */
+
+ while ( true ) {
+ execute();
+
+ /* First check if scanning stopped because we have a token. */
+ if ( matchedToken > 0 )
+ return matchedToken;
+
+ /* Check for error. */
+ if ( cs == tables->errorState ) {
+ /* If a token was started, but not finished (tokstart != 0) then
+ * restore p to the beginning of that token. */
+ if ( tokstart != 0 )
+ p = tokstart;
+
+ /* Check for a default token in the region. If one is there
+ * then send it and continue with the processing loop. */
+ if ( parser->tables->rtd->regionInfo[region].defaultToken >= 0 ) {
+ tokstart = tokend = p;
+ return parser->tables->rtd->regionInfo[region].defaultToken;
+ }
+
+ return SCAN_ERROR;
}
+ /* Got here because the state machine didn't match a token or
+ * encounter an error. Must be because we got to the end of the buffer
+ * data. */
+ assert( p == pe );
+
+ /* Check for a named language element. Note that we can do this only
+ * when p == pe otherwise we get ahead of what's already in the
+ * buffer. */
+ if ( inputStream->isLangEl() )
+ return SCAN_LANG_EL;
+
+ /* Maybe need eof. */
+ if ( inputStream->isEOF() ) {
+ if ( tokstart != 0 ) {
+ /* If a token has been started, but not finished,
+ * this is an error. */
+ cs = tables->errorState;
+ return SCAN_ERROR;
+ }
+ else {
+ return SCAN_EOF;
+ }
+ }
+
+ /* There may be space left in the current buffer. If not then we need
+ * to make some. */
space = runBuf->buf + FSM_BUFSIZE - pe;
if ( space == 0 ) {
/* Create a new run buf. */
- RunBuf *buf = new RunBuf;
- buf->next = runBuf;
- runBuf = buf;
+ RunBuf *newBuf = new RunBuf;
/* If partway through a token then preserve the prefix. */
long have = 0;
if ( tokstart == 0 ) {
- /* No prefix, the previous buffer was filled. */
- runBuf->next->length = FSM_BUFSIZE;
+ /* No prefix. We filled the previous buffer. */
+ runBuf->length = FSM_BUFSIZE;
}
else {
+ if ( tokstart == runBuf->buf ) {
+ /* A token is started and it is already at the beginning
+ * of the current buffer. This means buffer is full and it
+ * must be grown. Probably need to do this sooner. */
+ cerr << "OUT OF BUFFER SPACE" << endp;
+ }
+
/* There is data that needs to be shifted over. */
have = pe - tokstart;
- memcpy( runBuf->buf, tokstart, have );
+ memcpy( newBuf->buf, tokstart, have );
/* Compute the length of the previous buffer. */
- runBuf->next->length = FSM_BUFSIZE - have;
+ runBuf->length = FSM_BUFSIZE - have;
/* Compute tokstart and tokend. */
- long dist = tokstart - runBuf->buf;
+ long dist = tokstart - newBuf->buf;
tokend -= dist;
- tokstart = runBuf->buf;
+ tokstart = newBuf->buf;
/* Shift any markers. */
for ( int i = 0; i < MARK_SLOTS; i++ ) {
@@ -1030,13 +1047,26 @@ long FsmRun::scan( PdaRun *parser )
mark[i] -= dist;
}
}
- p = pe = runBuf->buf + have;
+
+ p = pe = newBuf->buf + have;
peof = 0;
+
+ newBuf->next = runBuf;
+ runBuf = newBuf;
}
+
+ /* We don't have any data. What is next in the input stream? */
+ space = runBuf->buf + FSM_BUFSIZE - pe;
+ assert( space > 0 );
+
+ /* Get more data. */
+ int len = inputStream->getData( p, space );
+ pe = p + len;
+ if ( inputStream->needFlush() )
+ peof = pe;
+
}
-done:
- //parser = prevParser;
- //cs = prevState;
- return 0;
+ /* Should not be reached. */
+ return SCAN_ERROR;
}
diff --git a/colm/parsedata.cpp b/colm/parsedata.cpp
index 42fb87e6..802ac83d 100644
--- a/colm/parsedata.cpp
+++ b/colm/parsedata.cpp
@@ -1576,14 +1576,14 @@ InputStreamRepl::InputStreamRepl( Replacement *replacement )
int InputStreamRepl::isLangEl()
{
- return replItem != 0 &&
- ( replItem->type == ReplItem::VarRefType || replItem->type == ReplItem::FactorType );
+ return replItem != 0 && ( replItem->type == ReplItem::VarRefType ||
+ replItem->type == ReplItem::FactorType );
}
int InputStreamRepl::shouldFlush()
{
- return replItem == 0 ||
- ( replItem->type == ReplItem::VarRefType || replItem->type == ReplItem::FactorType );
+ return replItem == 0 || ( replItem->type == ReplItem::VarRefType ||
+ replItem->type == ReplItem::FactorType );
}
KlangEl *InputStreamRepl::getLangEl( long &bindId, char *&data, long &length )