From 5bb41f82c1c8a9edf8bf7b73de20425888de639c Mon Sep 17 00:00:00 2001
From: Adrian Thurston <thurston@complang.org>
Date: Sun, 8 Mar 2009 02:49:32 +0000
Subject: More work on the pull scanner. Seems to work parsing input and
 constructors.

---
 colm/fsmcodegen.cpp |   1 -
 colm/fsmrun.cpp     | 244 +++++++++++++++++++++++++++++-----------------------
 colm/parsedata.cpp  |   8 +-
 3 files changed, 141 insertions(+), 112 deletions(-)

diff --git a/colm/fsmcodegen.cpp b/colm/fsmcodegen.cpp
index 63fec1c3..9e1ff32d 100644
--- a/colm/fsmcodegen.cpp
+++ b/colm/fsmcodegen.cpp
@@ -1012,7 +1012,6 @@ void FsmCodeGen::writeExec()
 	out <<
 		"void FsmRun::execute()\n"
 		"{\n"
-		"	matchedToken = 0;\n"
 		"/*_resume:*/\n";
 
 	if ( redFsm->errState != 0 ) {
diff --git a/colm/fsmrun.cpp b/colm/fsmrun.cpp
index 1ee8b918..24e40ae2 100644
--- a/colm/fsmrun.cpp
+++ b/colm/fsmrun.cpp
@@ -832,6 +832,10 @@ long PdaRun::undoParse( Tree *tree, CodeVect *rev )
 	return 0;
 }
 
+#define SCAN_ERROR    -3
+#define SCAN_LANG_EL  -2
+#define SCAN_EOF      -1
+
 void parse( FsmRun *fsmRun, PdaRun *parser )
 {
 	parser->init();
@@ -839,83 +843,22 @@ void parse( FsmRun *fsmRun, PdaRun *parser )
 	while ( true ) {
 		int tokenId = fsmRun->scan( parser );
 
-		if ( tokenId == 0 )
-			break;
-
-		bool ctxDepParsing = fsmRun->prg->ctxDepParsing;
-		LangElInfo *lelInfo = parser->tables->rtd->lelInfo;
-		if ( ctxDepParsing && lelInfo[tokenId].frameId >= 0 )
-			fsmRun->execGen( parser, tokenId );
-		else if ( lelInfo[tokenId].ignore )
-			fsmRun->sendIgnore( parser, tokenId );
-		else
-			fsmRun->sendToken( parser, tokenId );
-	}
-}
-
-long FsmRun::scan( PdaRun *parser )
-{
-	long space;
-
-//	long prevState = cs;
-//	PdaRun *prevParser = parser;
-//	parser = destParser;
-
-//	parser->init();
-
-	act = 0;
-	tokstart = 0;
-	tokend = 0;
-	region = parser->getNextRegion();
-	cs = tables->entryByRegion[region];
-	memset( mark, 0, sizeof(mark) );
-
-	/* Start with the EOF test. The pattern and replacement input sources can
-	 * be EOF from the start. */
-
-	while ( true ) {
-		/* Check for eof. */
- 		if ( p == pe && inputStream->isEOF() ) {
-			if ( tokstart != 0 ) {
-				/* If a token has been started, but not finshed 
-				 * this is an error. */
-				cs = tables->errorState;
-			}
-			else {
-				eofSent = true;
-				sendEOF( parser );
-				if ( !eofSent )
-					continue;
+		/* Check for EOF. */
+		if ( tokenId == SCAN_EOF ) {
+			fsmRun->eofSent = true;
+			fsmRun->sendEOF( parser );
+			if ( fsmRun->eofSent )
 				break;
-			}
 		}
 
-		if ( p == pe ) {
-			/* We don't have any data. What is next in the input stream? */
-			if ( inputStream->isLangEl() )
-				sendNamedLangEl( parser );
-			else {
-				space = runBuf->buf + FSM_BUFSIZE - pe;
-			
-				if ( space == 0 )
-					cerr << "OUT OF BUFFER SPACE" << endp;
-			
-				int len = inputStream->getData( p, space );
-				pe = p + len;
-				if ( inputStream->needFlush() )
-					peof = pe;
-			}
-		}
-
-		execute();
-
-		/* First check if scanning stopped because we have a token. */
-		if ( matchedToken > 0 ) 
-			return matchedToken;
+		/* Check for a named language element. */
+		if ( tokenId == SCAN_LANG_EL )
+			fsmRun->sendNamedLangEl( parser );
 
+/* THIS GOES SOMEWHERE ELSE NOW */
+#if 0
 		/* Fall through here either when the input buffer has been exhausted
 		 * or the scanner is in an error state. Otherwise we must continue. */
-
 		if ( cs == tables->errorState && parser->stopParsing ) {
 			#ifdef COLM_LOG_PARSE
 			if ( colm_log_parse ) {
@@ -924,22 +867,10 @@ long FsmRun::scan( PdaRun *parser )
 			#endif
 			goto done;
 		}
+#endif
 
-		/* First thing check for error. */
-		if ( cs == tables->errorState ) {
-			/* If a token was started, but not finished (tokstart != 0) then
-			 * restore p to the beginning of that token. */
-			if ( tokstart != 0 )
-				p = tokstart;
-
-			/* Check for a default token in the region. If one is there
-			 * then send it and continue with the processing loop. */
-			if ( parser->tables->rtd->regionInfo[region].defaultToken >= 0 ) {
-				tokstart = tokend = p;
-				sendToken( parser, parser->tables->rtd->regionInfo[region].defaultToken );
-				continue;
-			}
-
+		/* Check for error. */
+		if ( tokenId == SCAN_ERROR ) {
 			if ( parser->getNextRegion( 1 ) != 0 ) {
 				#ifdef COLM_LOG_PARSE
 				if ( colm_log_parse ) {
@@ -953,12 +884,12 @@ long FsmRun::scan( PdaRun *parser )
 				parser->sendBackIgnore();
 
 				parser->nextRegionInd += 1;
-				region = parser->getNextRegion();
-				cs = tables->entryByRegion[region];
+				fsmRun->region = parser->getNextRegion();
+				fsmRun->cs = fsmRun->tables->entryByRegion[fsmRun->region];
 				#ifdef COLM_LOG_PARSE
 				if ( colm_log_parse ) {
 					cerr << "new token region: " << 
-							parser->tables->rtd->regionInfo[region].name << endl;
+							parser->tables->rtd->regionInfo[fsmRun->region].name << endl;
 				}
 				#endif
 				continue;
@@ -980,12 +911,12 @@ long FsmRun::scan( PdaRun *parser )
 					cerr << "PARSE ERROR" << endp;
 				}
 				else {
-					region = parser->getNextRegion();
-					cs = tables->entryByRegion[region];
+					fsmRun->region = parser->getNextRegion();
+					fsmRun->cs = fsmRun->tables->entryByRegion[fsmRun->region];
 					#ifdef COLM_LOG_PARSE
 					if ( colm_log_parse ) {
 						cerr << "new token region: " << 
-								parser->tables->rtd->regionInfo[region].name << endl;
+								parser->tables->rtd->regionInfo[fsmRun->region].name << endl;
 					}
 					#endif
 					continue;
@@ -993,36 +924,122 @@ long FsmRun::scan( PdaRun *parser )
 			}
 
 			/* Machine failed before finding a token. */
-			cerr << "error:" << inputStream->line << ": scanner error" << endp;
+			cerr << "error:" << fsmRun->inputStream->line << ": scanner error" << endp;
+		}
+
+		bool ctxDepParsing = fsmRun->prg->ctxDepParsing;
+		LangElInfo *lelInfo = parser->tables->rtd->lelInfo;
+		if ( ctxDepParsing && lelInfo[tokenId].frameId >= 0 )
+			fsmRun->execGen( parser, tokenId );
+		else if ( lelInfo[tokenId].ignore )
+			fsmRun->sendIgnore( parser, tokenId );
+		else
+			fsmRun->sendToken( parser, tokenId );
+	}
+}
+
+long FsmRun::scan( PdaRun *parser )
+{
+	long space;
+
+	/* Init the scanner vars. */
+	act = 0;
+	tokstart = 0;
+	tokend = 0;
+	matchedToken = 0;
+
+	/* Set the state using the state of the parser. */
+	region = parser->getNextRegion();
+	cs = tables->entryByRegion[region];
+
+	/* Clear the mark array. */
+	memset( mark, 0, sizeof(mark) );
+
+	/* Start with the EOF test. The pattern and replacement input sources can
+	 * be EOF from the start. */
+
+	while ( true ) {
+		execute();
+
+		/* First check if scanning stopped because we have a token. */
+		if ( matchedToken > 0 ) 
+			return matchedToken;
+
+		/* Check for error. */
+		if ( cs == tables->errorState ) {
+			/* If a token was started, but not finished (tokstart != 0) then
+			 * restore p to the beginning of that token. */
+			if ( tokstart != 0 )
+				p = tokstart;
+
+			/* Check for a default token in the region. If one is there
+			 * then send it and continue with the processing loop. */
+			if ( parser->tables->rtd->regionInfo[region].defaultToken >= 0 ) {
+				tokstart = tokend = p;
+				return parser->tables->rtd->regionInfo[region].defaultToken;
+			}
+
+			return SCAN_ERROR;
 		}
 
+		/* Got here because the state machine didn't match a token or
+		 * encounter an error. Must be because we got to the end of the buffer
+		 * data. */
+		assert( p == pe );
+
+		/* Check for a named language element. Note that we can do this only
+		 * when p == pe otherwise we get ahead of what's already in the
+		 * buffer. */
+		if ( inputStream->isLangEl() )
+			return SCAN_LANG_EL;
+
+		/* Maybe need eof. */
+ 		if ( inputStream->isEOF() ) {
+			if ( tokstart != 0 ) {
+				/* If a token has been started, but not finished
+				 * this is an error. */
+				cs = tables->errorState;
+				return SCAN_ERROR;
+			}
+			else {
+				return SCAN_EOF;
+			}
+		}
+
+		/* There may be space left in the current buffer. If not then we need
+		 * to make some. */
 		space = runBuf->buf + FSM_BUFSIZE - pe;
 		if ( space == 0 ) {
 			/* Create a new run buf. */
-			RunBuf *buf = new RunBuf;
-			buf->next = runBuf;
-			runBuf = buf;
+			RunBuf *newBuf = new RunBuf;
 
 			/* If partway through a token then preserve the prefix. */
 			long have = 0;
 
 			if ( tokstart == 0 ) {
-				/* No prefix, the previous buffer was filled. */
-				runBuf->next->length = FSM_BUFSIZE;
+				/* No prefix. We filled the previous buffer. */
+				runBuf->length = FSM_BUFSIZE;
 			}
 			else {
+				if ( tokstart == runBuf->buf ) {
+					/* A token is started and it is already at the beginning
+					 * of the current buffer. This means buffer is full and it
+					 * must be grown. Probably need to do this sooner. */
+					cerr << "OUT OF BUFFER SPACE" << endp;
+				}
+
 				/* There is data that needs to be shifted over. */
 				have = pe - tokstart;
-				memcpy( runBuf->buf, tokstart, have );
+				memcpy( newBuf->buf, tokstart, have );
 
 				/* Compute the length of the previous buffer. */
-				runBuf->next->length = FSM_BUFSIZE - have;
+				runBuf->length = FSM_BUFSIZE - have;
 
 				/* Compute tokstart and tokend. */
-				long dist = tokstart - runBuf->buf;
+				long dist = tokstart - newBuf->buf;
 
 				tokend -= dist;
-				tokstart = runBuf->buf;
+				tokstart = newBuf->buf;
 
 				/* Shift any markers. */
 				for ( int i = 0; i < MARK_SLOTS; i++ ) {
@@ -1030,13 +1047,26 @@ long FsmRun::scan( PdaRun *parser )
 						mark[i] -= dist;
 				}
 			}
-			p = pe = runBuf->buf + have;
+
+			p = pe = newBuf->buf + have;
 			peof = 0;
+
+			newBuf->next = runBuf;
+			runBuf = newBuf;
 		}
+
+		/* We don't have any data. What is next in the input stream? */
+		space = runBuf->buf + FSM_BUFSIZE - pe;
+		assert( space > 0 );
+			
+		/* Get more data. */
+		int len = inputStream->getData( p, space );
+		pe = p + len;
+		if ( inputStream->needFlush() )
+			peof = pe;
+
 	}
 
-done:
-	//parser = prevParser;
-	//cs = prevState;
-	return 0;
+	/* Should not be reached. */
+	return SCAN_ERROR;
 }
diff --git a/colm/parsedata.cpp b/colm/parsedata.cpp
index 42fb87e6..802ac83d 100644
--- a/colm/parsedata.cpp
+++ b/colm/parsedata.cpp
@@ -1576,14 +1576,14 @@ InputStreamRepl::InputStreamRepl( Replacement *replacement )
 
 int InputStreamRepl::isLangEl()
 { 
-	return replItem != 0 && 
-		( replItem->type == ReplItem::VarRefType || replItem->type == ReplItem::FactorType );
+	return replItem != 0 && ( replItem->type == ReplItem::VarRefType || 
+			replItem->type == ReplItem::FactorType );
 }
 
 int InputStreamRepl::shouldFlush()
 { 
-	return replItem == 0 || 
-		( replItem->type == ReplItem::VarRefType || replItem->type == ReplItem::FactorType );
+	return replItem == 0 || ( replItem->type == ReplItem::VarRefType ||
+			replItem->type == ReplItem::FactorType );
 }
 
 KlangEl *InputStreamRepl::getLangEl( long &bindId, char *&data, long &length )
-- 
cgit v1.2.1