diff options
author | Adrian Thurston <thurston@complang.org> | 2012-07-01 12:48:22 -0400 |
---|---|---|
committer | Adrian Thurston <thurston@complang.org> | 2012-07-01 12:48:22 -0400 |
commit | 247904a84430b8c9151fa6afb68f01b60afb92c9 (patch) | |
tree | 58d498f783a935b02255120c814c387745dc6e41 /colm/compiler.cc | |
parent | d8cdec468bb7efad768d25872147533312cffe91 (diff) | |
download | colm-247904a84430b8c9151fa6afb68f01b60afb92c9.tar.gz |
moved 'colm' dir to 'src'
Diffstat (limited to 'colm/compiler.cc')
-rw-r--r-- | colm/compiler.cc | 1496 |
1 files changed, 0 insertions, 1496 deletions
diff --git a/colm/compiler.cc b/colm/compiler.cc deleted file mode 100644 index c1e775f2..00000000 --- a/colm/compiler.cc +++ /dev/null @@ -1,1496 +0,0 @@ -/* - * Copyright 2006-2012 Adrian Thurston <thurston@complang.org> - */ - -/* This file is part of Colm. - * - * Colm is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * Colm is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Colm; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include <iostream> -#include <iomanip> -#include <errno.h> -#include <stdlib.h> -#include <limits.h> -#include <sstream> - -#include "global.h" -#include "lmparse.h" -#include "parsedata.h" -#include "parsetree.h" -#include "mergesort.h" -#include "redbuild.h" -#include "pdacodegen.h" -#include "fsmcodegen.h" -#include "fsmrun.h" -#include "pdarun.h" -#include "colm.h" -#include "pool.h" - -using namespace std; -using std::ostringstream; - -char machineMain[] = "main"; -exit_object endp; -void operator<<( ostream &out, exit_object & ) -{ - out << endl; - exit(1); -} - -/* Perform minimization after an operation according - * to the command line args. */ -void afterOpMinimize( FsmGraph *fsm, bool lastInSeq ) -{ - /* Switch on the prefered minimization algorithm. */ - if ( lastInSeq ) { - /* First clean up the graph. FsmGraph operations may leave these - * lying around. There should be no dead end states. The subtract - * intersection operators are the only places where they may be - * created and those operators clean them up. */ - fsm->removeUnreachableStates(); - fsm->minimizePartition2(); - } -} - -/* Count the transitions in the fsm by walking the state list. */ -int countTransitions( FsmGraph *fsm ) -{ - int numTrans = 0; - FsmState *state = fsm->stateList.head; - while ( state != 0 ) { - numTrans += state->outList.length(); - state = state->next; - } - return numTrans; -} - -Key makeFsmKeyHex( char *str, const InputLoc &loc, Compiler *pd ) -{ - /* Reset errno so we can check for overflow or underflow. In the event of - * an error, sets the return val to the upper or lower bound being tested - * against. */ - errno = 0; - unsigned int size = keyOps->alphType->size; - bool unusedBits = size < sizeof(unsigned long); - - unsigned long ul = strtoul( str, 0, 16 ); - - if ( errno == ERANGE || (unusedBits && ul >> (size * 8)) ) { - error(loc) << "literal " << str << " overflows the alphabet type" << endl; - ul = 1 << (size * 8); - } - - if ( unusedBits && keyOps->alphType->isSigned && ul >> (size * 8 - 1) ) - ul |= (0xffffffff >> (size*8 ) ) << (size*8); - - return Key( (long)ul ); -} - -Key makeFsmKeyDec( char *str, const InputLoc &loc, Compiler *pd ) -{ - /* Convert the number to a decimal. First reset errno so we can check - * for overflow or underflow. */ - errno = 0; - long long minVal = keyOps->alphType->minVal; - long long maxVal = keyOps->alphType->maxVal; - - long long ll = strtoll( str, 0, 10 ); - - /* Check for underflow. */ - if ( (errno == ERANGE && ll < 0) || ll < minVal) { - error(loc) << "literal " << str << " underflows the alphabet type" << endl; - ll = minVal; - } - /* Check for overflow. */ - else if ( (errno == ERANGE && ll > 0) || ll > maxVal ) { - error(loc) << "literal " << str << " overflows the alphabet type" << endl; - ll = maxVal; - } - - if ( keyOps->alphType->isSigned ) - return Key( (long)ll ); - else - return Key( (unsigned long)ll ); -} - -/* Make an fsm key in int format (what the fsm graph uses) from an alphabet - * number returned by the parser. Validates that the number doesn't overflow - * the alphabet type. */ -Key makeFsmKeyNum( char *str, const InputLoc &loc, Compiler *pd ) -{ - /* Switch on hex/decimal format. */ - if ( str[0] == '0' && str[1] == 'x' ) - return makeFsmKeyHex( str, loc, pd ); - else - return makeFsmKeyDec( str, loc, pd ); -} - -/* Make an fsm int format (what the fsm graph uses) from a single character. - * Performs proper conversion depending on signed/unsigned property of the - * alphabet. */ -Key makeFsmKeyChar( char c, Compiler *pd ) -{ - if ( keyOps->isSigned ) { - /* Copy from a char type. */ - return Key( c ); - } - else { - /* Copy from an unsigned byte type. */ - return Key( (unsigned char)c ); - } -} - -/* Make an fsm key array in int format (what the fsm graph uses) from a string - * of characters. Performs proper conversion depending on signed/unsigned - * property of the alphabet. */ -void makeFsmKeyArray( Key *result, char *data, int len, Compiler *pd ) -{ - if ( keyOps->isSigned ) { - /* Copy from a char star type. */ - char *src = data; - for ( int i = 0; i < len; i++ ) - result[i] = Key(src[i]); - } - else { - /* Copy from an unsigned byte ptr type. */ - unsigned char *src = (unsigned char*) data; - for ( int i = 0; i < len; i++ ) - result[i] = Key(src[i]); - } -} - -/* Like makeFsmKeyArray except the result has only unique keys. They ordering - * will be changed. */ -void makeFsmUniqueKeyArray( KeySet &result, char *data, int len, - bool caseInsensitive, Compiler *pd ) -{ - /* Use a transitions list for getting unique keys. */ - if ( keyOps->isSigned ) { - /* Copy from a char star type. */ - char *src = data; - for ( int si = 0; si < len; si++ ) { - Key key( src[si] ); - result.insert( key ); - if ( caseInsensitive ) { - if ( key.isLower() ) - result.insert( key.toUpper() ); - else if ( key.isUpper() ) - result.insert( key.toLower() ); - } - } - } - else { - /* Copy from an unsigned byte ptr type. */ - unsigned char *src = (unsigned char*) data; - for ( int si = 0; si < len; si++ ) { - Key key( src[si] ); - result.insert( key ); - if ( caseInsensitive ) { - if ( key.isLower() ) - result.insert( key.toUpper() ); - else if ( key.isUpper() ) - result.insert( key.toLower() ); - } - } - } -} - -FsmGraph *dotFsm( Compiler *pd ) -{ - FsmGraph *retFsm = new FsmGraph(); - retFsm->rangeFsm( keyOps->minKey, keyOps->maxKey ); - return retFsm; -} - -FsmGraph *dotStarFsm( Compiler *pd ) -{ - FsmGraph *retFsm = new FsmGraph(); - retFsm->rangeStarFsm( keyOps->minKey, keyOps->maxKey ); - return retFsm; -} - -/* Make a builtin type. Depends on the signed nature of the alphabet type. */ -FsmGraph *makeBuiltin( BuiltinMachine builtin, Compiler *pd ) -{ - /* FsmGraph created to return. */ - FsmGraph *retFsm = 0; - bool isSigned = keyOps->isSigned; - - switch ( builtin ) { - case BT_Any: { - /* All characters. */ - retFsm = dotFsm( pd ); - break; - } - case BT_Ascii: { - /* Ascii characters 0 to 127. */ - retFsm = new FsmGraph(); - retFsm->rangeFsm( 0, 127 ); - break; - } - case BT_Extend: { - /* Ascii extended characters. This is the full byte range. Dependent - * on signed, vs no signed. If the alphabet is one byte then just use - * dot fsm. */ - if ( isSigned ) { - retFsm = new FsmGraph(); - retFsm->rangeFsm( -128, 127 ); - } - else { - retFsm = new FsmGraph(); - retFsm->rangeFsm( 0, 255 ); - } - break; - } - case BT_Alpha: { - /* Alpha [A-Za-z]. */ - FsmGraph *upper = new FsmGraph(), *lower = new FsmGraph(); - upper->rangeFsm( 'A', 'Z' ); - lower->rangeFsm( 'a', 'z' ); - upper->unionOp( lower ); - upper->minimizePartition2(); - retFsm = upper; - break; - } - case BT_Digit: { - /* Digits [0-9]. */ - retFsm = new FsmGraph(); - retFsm->rangeFsm( '0', '9' ); - break; - } - case BT_Alnum: { - /* Alpha numerics [0-9A-Za-z]. */ - FsmGraph *digit = new FsmGraph(), *lower = new FsmGraph(); - FsmGraph *upper = new FsmGraph(); - digit->rangeFsm( '0', '9' ); - upper->rangeFsm( 'A', 'Z' ); - lower->rangeFsm( 'a', 'z' ); - digit->unionOp( upper ); - digit->unionOp( lower ); - digit->minimizePartition2(); - retFsm = digit; - break; - } - case BT_Lower: { - /* Lower case characters. */ - retFsm = new FsmGraph(); - retFsm->rangeFsm( 'a', 'z' ); - break; - } - case BT_Upper: { - /* Upper case characters. */ - retFsm = new FsmGraph(); - retFsm->rangeFsm( 'A', 'Z' ); - break; - } - case BT_Cntrl: { - /* Control characters. */ - FsmGraph *cntrl = new FsmGraph(); - FsmGraph *highChar = new FsmGraph(); - cntrl->rangeFsm( 0, 31 ); - highChar->concatFsm( 127 ); - cntrl->unionOp( highChar ); - cntrl->minimizePartition2(); - retFsm = cntrl; - break; - } - case BT_Graph: { - /* Graphical ascii characters [!-~]. */ - retFsm = new FsmGraph(); - retFsm->rangeFsm( '!', '~' ); - break; - } - case BT_Print: { - /* Printable characters. Same as graph except includes space. */ - retFsm = new FsmGraph(); - retFsm->rangeFsm( ' ', '~' ); - break; - } - case BT_Punct: { - /* Punctuation. */ - FsmGraph *range1 = new FsmGraph(); - FsmGraph *range2 = new FsmGraph(); - FsmGraph *range3 = new FsmGraph(); - FsmGraph *range4 = new FsmGraph(); - range1->rangeFsm( '!', '/' ); - range2->rangeFsm( ':', '@' ); - range3->rangeFsm( '[', '`' ); - range4->rangeFsm( '{', '~' ); - range1->unionOp( range2 ); - range1->unionOp( range3 ); - range1->unionOp( range4 ); - range1->minimizePartition2(); - retFsm = range1; - break; - } - case BT_Space: { - /* Whitespace: [\t\v\f\n\r ]. */ - FsmGraph *cntrl = new FsmGraph(); - FsmGraph *space = new FsmGraph(); - cntrl->rangeFsm( '\t', '\r' ); - space->concatFsm( ' ' ); - cntrl->unionOp( space ); - cntrl->minimizePartition2(); - retFsm = cntrl; - break; - } - case BT_Xdigit: { - /* Hex digits [0-9A-Fa-f]. */ - FsmGraph *digit = new FsmGraph(); - FsmGraph *upper = new FsmGraph(); - FsmGraph *lower = new FsmGraph(); - digit->rangeFsm( '0', '9' ); - upper->rangeFsm( 'A', 'F' ); - lower->rangeFsm( 'a', 'f' ); - digit->unionOp( upper ); - digit->unionOp( lower ); - digit->minimizePartition2(); - retFsm = digit; - break; - } - case BT_Lambda: { - retFsm = new FsmGraph(); - retFsm->lambdaFsm(); - break; - } - case BT_Empty: { - retFsm = new FsmGraph(); - retFsm->emptyFsm(); - break; - }} - - return retFsm; -} - -/* Check if this name inst or any name inst below is referenced. */ -bool NameInst::anyRefsRec() -{ - if ( numRefs > 0 ) - return true; - - /* Recurse on children until true. */ - for ( NameVect::Iter ch = childVect; ch.lte(); ch++ ) { - if ( (*ch)->anyRefsRec() ) - return true; - } - - return false; -} - -/* - * Compiler - */ - -/* Initialize the structure that will collect info during the parse of a - * machine. */ -Compiler::Compiler( const String &fileName, const String §ionName, - const InputLoc §ionLoc, ostream &out ) -: - nextPriorKey(0), - nextLocalErrKey(1), /* 0 is reserved for global error actions. */ - nextNameId(0), - alphTypeSet(false), - getKeyExpr(0), - accessExpr(0), - curStateExpr(0), - lowerNum(0), - upperNum(0), - fileName(fileName), - sectionName(sectionName), - sectionLoc(sectionLoc), - errorCount(0), - curActionOrd(0), - curPriorOrd(0), - nextEpsilonResolvedLink(0), - nextTokenId(1), - rootCodeBlock(0), - mainReturnUT(0), - parserName(sectionName), - out(out), - access(0), - tokenStruct(0), - rootLangEl(0), - eofLangEl(0), - errorLangEl(0), - defaultCharLangEl(0), - rootRegion(0), - defaultRegion(0), - firstNonTermId(0), - prodIdIndex(0), - nextPatReplId(0), - nextGenericId(1), - nextFuncId(0), - loopCleanup(0), - nextObjectId(1), /* 0 is reserved for no object. */ - nextFrameId(0), - nextParserId(0), - nextLabelId(0), - revertOn(true), - predValue(0), - nextMatchEndNum(0), - argvTypeRef(0), - context(0) -{ -} - -/* Clean up the data collected during a parse. */ -Compiler::~Compiler() -{ - /* Delete all the nodes in the action list. Will cause all the - * string data that represents the actions to be deallocated. */ - actionList.empty(); -} - -/* Make a name id in the current name instantiation scope if it is not - * already there. */ -NameInst *Compiler::addNameInst( const InputLoc &loc, char *data, bool isLabel ) -{ - /* Create the name instantitaion object and insert it. */ - NameInst *newNameInst = new NameInst( loc, curNameInst, data, nextNameId++, isLabel ); - curNameInst->childVect.append( newNameInst ); - if ( data != 0 ) - curNameInst->children.insertMulti( data, newNameInst ); - return newNameInst; -} - -void Compiler::initNameWalk( NameInst *rootName ) -{ - curNameInst = rootName; - curNameChild = 0; -} - -/* Goes into the next child scope. The number of the child is already set up. - * We need this for the syncronous name tree and parse tree walk to work - * properly. It is reset on entry into a scope and advanced on poping of a - * scope. A call to enterNameScope should be accompanied by a corresponding - * popNameScope. */ -NameFrame Compiler::enterNameScope( bool isLocal, int numScopes ) -{ - /* Save off the current data. */ - NameFrame retFrame; - retFrame.prevNameInst = curNameInst; - retFrame.prevNameChild = curNameChild; - retFrame.prevLocalScope = localNameScope; - - /* Enter into the new name scope. */ - for ( int i = 0; i < numScopes; i++ ) { - curNameInst = curNameInst->childVect[curNameChild]; - curNameChild = 0; - } - - if ( isLocal ) - localNameScope = curNameInst; - - return retFrame; -} - -/* Return from a child scope to a parent. The parent info must be specified as - * an argument and is obtained from the corresponding call to enterNameScope. - * */ -void Compiler::popNameScope( const NameFrame &frame ) -{ - /* Pop the name scope. */ - curNameInst = frame.prevNameInst; - curNameChild = frame.prevNameChild+1; - localNameScope = frame.prevLocalScope; -} - -void Compiler::resetNameScope( const NameFrame &frame ) -{ - /* Pop the name scope. */ - curNameInst = frame.prevNameInst; - curNameChild = frame.prevNameChild; - localNameScope = frame.prevLocalScope; -} - - -void Compiler::unsetObsoleteEntries( FsmGraph *graph ) -{ - /* Loop the reference names and increment the usage. Names that are no - * longer needed will be unset in graph. */ - for ( NameVect::Iter ref = curNameInst->referencedNames; ref.lte(); ref++ ) { - /* Get the name. */ - NameInst *name = *ref; - name->numUses += 1; - - /* If the name is no longer needed unset its corresponding entry. */ - if ( name->numUses == name->numRefs ) { - assert( graph->entryPoints.find( name->id ) != 0 ); - graph->unsetEntry( name->id ); - } - } -} - -NameSet Compiler::resolvePart( NameInst *refFrom, const char *data, bool recLabelsOnly ) -{ - /* Queue needed for breadth-first search, load it with the start node. */ - NameInstList nameQueue; - nameQueue.append( refFrom ); - - NameSet result; - while ( nameQueue.length() > 0 ) { - /* Pull the next from location off the queue. */ - NameInst *from = nameQueue.detachFirst(); - - /* Look for the name. */ - NameMapEl *low, *high; - if ( from->children.findMulti( data, low, high ) ) { - /* Record all instances of the name. */ - for ( ; low <= high; low++ ) - result.insert( low->value ); - } - - /* Name not there, do breadth-first operation of appending all - * childrent to the processing queue. */ - for ( NameVect::Iter name = from->childVect; name.lte(); name++ ) { - if ( !recLabelsOnly || (*name)->isLabel ) - nameQueue.append( *name ); - } - } - - /* Queue exhausted and name never found. */ - return result; -} - -void Compiler::resolveFrom( NameSet &result, NameInst *refFrom, - const NameRef &nameRef, int namePos ) -{ - /* Look for the name in the owning scope of the factor with aug. */ - NameSet partResult = resolvePart( refFrom, nameRef[namePos], false ); - - /* If there are more parts to the name then continue on. */ - if ( ++namePos < nameRef.length() ) { - /* There are more components to the name, search using all the part - * results as the base. */ - for ( NameSet::Iter name = partResult; name.lte(); name++ ) - resolveFrom( result, *name, nameRef, namePos ); - } - else { - /* This is the last component, append the part results to the final - * results. */ - result.insert( partResult ); - } -} - -ostream &operator<<( ostream &out, const Token &token ) -{ - out << token.data; - return out; -} - -/* Write out a name reference. */ -ostream &operator<<( ostream &out, const NameRef &nameRef ) -{ - int pos = 0; - if ( nameRef[pos] == 0 ) { - out << "::"; - pos += 1; - } - out << nameRef[pos++]; - for ( ; pos < nameRef.length(); pos++ ) - out << "::" << nameRef[pos]; - return out; -} - -ostream &operator<<( ostream &out, const NameInst &nameInst ) -{ - /* Count the number fully qualified name parts. */ - int numParents = 0; - NameInst *curParent = nameInst.parent; - while ( curParent != 0 ) { - numParents += 1; - curParent = curParent->parent; - } - - /* Make an array and fill it in. */ - curParent = nameInst.parent; - NameInst **parents = new NameInst*[numParents]; - for ( int p = numParents-1; p >= 0; p-- ) { - parents[p] = curParent; - curParent = curParent->parent; - } - - /* Write the parents out, skip the root. */ - for ( int p = 1; p < numParents; p++ ) - out << "::" << ( parents[p]->name != 0 ? parents[p]->name : "<ANON>" ); - - /* Write the name and cleanup. */ - out << "::" << ( nameInst.name != 0 ? nameInst.name : "<ANON>" ); - delete[] parents; - return out; -} - -struct CmpNameInstLoc -{ - static int compare( const NameInst *ni1, const NameInst *ni2 ) - { - if ( ni1->loc.line < ni2->loc.line ) - return -1; - else if ( ni1->loc.line > ni2->loc.line ) - return 1; - else if ( ni1->loc.col < ni2->loc.col ) - return -1; - else if ( ni1->loc.col > ni2->loc.col ) - return 1; - return 0; - } -}; - -void errorStateLabels( const NameSet &resolved ) -{ - MergeSort<NameInst*, CmpNameInstLoc> mergeSort; - mergeSort.sort( resolved.data, resolved.length() ); - for ( NameSet::Iter res = resolved; res.lte(); res++ ) - error((*res)->loc) << " -> " << **res << endl; -} - - -void Compiler::referenceRegions( NameInst *rootName ) -{ - for ( NameVect::Iter inst = rootName->childVect; inst.lte(); inst++ ) { - /* Inc the reference in the name. This will cause the entry point to - * survive to the end of the graph generating walk. */ - (*inst)->numRefs += 1; - } -} - -/* Walk a name tree starting at from and fill the name index. */ -void Compiler::fillNameIndex( NameInst **nameIndex, NameInst *from ) -{ - /* Fill the value for from in the name index. */ - nameIndex[from->id] = from; - - /* Recurse on the implicit final state and then all children. */ - if ( from->final != 0 ) - fillNameIndex( nameIndex, from->final ); - for ( NameVect::Iter name = from->childVect; name.lte(); name++ ) - fillNameIndex( nameIndex, *name ); -} - -NameInst **Compiler::makeNameIndex( NameInst *rootName ) -{ - /* The number of nodes in the tree can now be given by nextNameId. Put a - * null pointer on the end of the list to terminate it. */ - NameInst **nameIndex = new NameInst*[nextNameId+1]; - memset( nameIndex, 0, sizeof(NameInst*)*(nextNameId+1) ); - fillNameIndex( nameIndex, rootName ); - return nameIndex; -} - -void Compiler::createBuiltin( const char *name, BuiltinMachine builtin ) -{ - Expression *expression = new Expression( builtin ); - Join *join = new Join( expression ); - VarDef *varDef = new VarDef( name, join ); - GraphDictEl *graphDictEl = new GraphDictEl( name, varDef ); - rootNamespace->rlMap.insert( graphDictEl ); -} - -/* Initialize the graph dict with builtin types. */ -void Compiler::initGraphDict( ) -{ - createBuiltin( "any", BT_Any ); - createBuiltin( "ascii", BT_Ascii ); - createBuiltin( "extend", BT_Extend ); - createBuiltin( "alpha", BT_Alpha ); - createBuiltin( "digit", BT_Digit ); - createBuiltin( "alnum", BT_Alnum ); - createBuiltin( "lower", BT_Lower ); - createBuiltin( "upper", BT_Upper ); - createBuiltin( "cntrl", BT_Cntrl ); - createBuiltin( "graph", BT_Graph ); - createBuiltin( "print", BT_Print ); - createBuiltin( "punct", BT_Punct ); - createBuiltin( "space", BT_Space ); - createBuiltin( "xdigit", BT_Xdigit ); - createBuiltin( "null", BT_Lambda ); - createBuiltin( "zlen", BT_Lambda ); - createBuiltin( "empty", BT_Empty ); -} - -/* Initialize the key operators object that will be referenced by all fsms - * created. */ -void Compiler::initKeyOps( ) -{ - /* Signedness and bounds. */ - HostType *alphType = alphTypeSet ? userAlphType : hostLang->defaultAlphType; - thisKeyOps.setAlphType( alphType ); - - if ( lowerNum != 0 ) { - /* If ranges are given then interpret the alphabet type. */ - thisKeyOps.minKey = makeFsmKeyNum( lowerNum, rangeLowLoc, this ); - thisKeyOps.maxKey = makeFsmKeyNum( upperNum, rangeHighLoc, this ); - } - - thisCondData.nextCondKey = thisKeyOps.maxKey; - thisCondData.nextCondKey.increment(); -} - -void Compiler::printNameInst( NameInst *nameInst, int level ) -{ - for ( int i = 0; i < level; i++ ) - cerr << " "; - cerr << (nameInst->name != 0 ? nameInst->name : "<ANON>") << - " id: " << nameInst->id << - " refs: " << nameInst->numRefs << endl; - for ( NameVect::Iter name = nameInst->childVect; name.lte(); name++ ) - printNameInst( *name, level+1 ); -} - -/* Remove duplicates of unique actions from an action table. */ -void Compiler::removeDups( ActionTable &table ) -{ - /* Scan through the table looking for unique actions to - * remove duplicates of. */ - for ( int i = 0; i < table.length(); i++ ) { - /* Remove any duplicates ahead of i. */ - for ( int r = i+1; r < table.length(); ) { - if ( table[r].value == table[i].value ) - table.vremove(r); - else - r += 1; - } - } -} - -/* Remove duplicates from action lists. This operates only on transition and - * eof action lists and so should be called once all actions have been - * transfered to their final resting place. */ -void Compiler::removeActionDups( FsmGraph *graph ) -{ - /* Loop all states. */ - for ( StateList::Iter state = graph->stateList; state.lte(); state++ ) { - /* Loop all transitions. */ - for ( TransList::Iter trans = state->outList; trans.lte(); trans++ ) - removeDups( trans->actionTable ); - removeDups( state->toStateActionTable ); - removeDups( state->fromStateActionTable ); - removeDups( state->eofActionTable ); - } -} - -Action *Compiler::newAction( const String &name, InlineList *inlineList ) -{ - InputLoc loc; - loc.line = 1; - loc.col = 1; - loc.fileName = 0; - - Action *action = new Action( loc, name, inlineList ); - actionList.append( action ); - return action; -} - -void Compiler::initLongestMatchData() -{ - if ( regionList.length() > 0 ) { - /* The initActId action gives act a default value. */ - InlineList *il4 = new InlineList; - il4->append( new InlineItem( InputLoc(), InlineItem::LmInitAct ) ); - initActId = newAction( "initact", il4 ); - initActId->isLmAction = true; - - /* The setTokStart action sets tokstart. */ - InlineList *il5 = new InlineList; - il5->append( new InlineItem( InputLoc(), InlineItem::LmSetTokStart ) ); - setTokStart = newAction( "tokstart", il5 ); - setTokStart->isLmAction = true; - - /* The setTokEnd action sets tokend. */ - InlineList *il3 = new InlineList; - il3->append( new InlineItem( InputLoc(), InlineItem::LmSetTokEnd ) ); - setTokEnd = newAction( "tokend", il3 ); - setTokEnd->isLmAction = true; - - /* The action will also need an ordering: ahead of all user action - * embeddings. */ - initActIdOrd = curActionOrd++; - setTokStartOrd = curActionOrd++; - setTokEndOrd = curActionOrd++; - } -} - -void Compiler::finishGraphBuild( FsmGraph *graph ) -{ - /* Resolve any labels that point to multiple states. Any labels that are - * still around are referenced only by gotos and calls and they need to be - * made into deterministic entry points. */ - graph->deterministicEntry(); - - /* - * All state construction is now complete. - */ - - /* Transfer global error actions. */ - for ( StateList::Iter state = graph->stateList; state.lte(); state++ ) - graph->transferErrorActions( state, 0 ); - - removeActionDups( graph ); - - /* Remove unreachable states. There should be no dead end states. The - * subtract and intersection operators are the only places where they may - * be created and those operators clean them up. */ - graph->removeUnreachableStates(); - - /* No more fsm operations are to be done. Action ordering numbers are - * no longer of use and will just hinder minimization. Clear them. */ - graph->nullActionKeys(); - - /* Transition priorities are no longer of use. We can clear them - * because they will just hinder minimization as well. Clear them. */ - graph->clearAllPriorities(); - - /* Minimize here even if we minimized at every op. Now that function - * keys have been cleared we may get a more minimal fsm. */ - graph->minimizePartition2(); - graph->compressTransitions(); -} - -void Compiler::printNameTree( NameInst *rootName ) -{ - /* Print the name instance map. */ - cerr << "name tree:" << endl; - for ( NameVect::Iter name = rootName->childVect; name.lte(); name++ ) - printNameInst( *name, 0 ); -} - -void Compiler::printNameIndex( NameInst **nameIndex ) -{ - /* The name index is terminated with a null pointer. */ - cerr << "name index:" << endl; - for ( int ni = 0; nameIndex[ni]; ni++ ) { - cerr << ni << ": "; - char *name = nameIndex[ni]->name; - cerr << ( name != 0 ? name : "<ANON>" ) << endl; - } -} - - -/* Build the name tree and supporting data structures. */ -NameInst *Compiler::makeNameTree() -{ - /* Create the root name. */ - nextNameId = 0; - NameInst *rootName = new NameInst( InputLoc(), 0, 0, nextNameId++, false ); - - /* First make the name tree. */ - initNameWalk( rootName ); - for ( RegionGraphList::Iter glel = instanceList; glel.lte(); glel++ ) { - /* Recurse on the instance. */ - glel->value->makeNameTree( glel->loc, this ); - } - - return rootName; -} - -FsmGraph *Compiler::makeAllRegions() -{ - /* Build the name tree and supporting data structures. */ - NameInst *rootName = makeNameTree( ); - NameInst **nameIndex = makeNameIndex( rootName ); - - /* Resovle the implicit name references to the nfa instantiations. */ - referenceRegions( rootName ); - - int numGraphs = 0; - FsmGraph **graphs = new FsmGraph*[instanceList.length()]; - - /* Make all the instantiations, we know that main exists in this list. */ - initNameWalk( rootName ); - for ( RegionGraphList::Iter glel = instanceList; glel.lte(); glel++ ) { - /* Build the graph from a walk of the parse tree. */ - FsmGraph *newGraph = glel->value->walk( this ); - - /* Wrap up the construction. */ - finishGraphBuild( newGraph ); - - /* Save off the new graph. */ - graphs[numGraphs++] = newGraph; - } - - /* NOTE: If putting in minimization here we need to include eofTarget - * into the minimization algorithm. It is currently set by the longest - * match operator and not considered anywhere else. */ - - /* Add all the other graphs into the first. */ - FsmGraph *all = graphs[0]; - all->globOp( graphs+1, numGraphs-1 ); - delete[] graphs; - - /* Go through all the token regions and check for lmRequiresErrorState. */ - for ( RegionList::Iter reg = regionList; reg.lte(); reg++ ) { - if ( reg->lmSwitchHandlesError ) - all->lmRequiresErrorState = true; - } - - all->rootName = rootName; - all->nameIndex = nameIndex; - - return all; -} - -void Compiler::analyzeAction( Action *action, InlineList *inlineList ) -{ - /* FIXME: Actions used as conditions should be very constrained. */ - for ( InlineList::Iter item = *inlineList; item.lte(); item++ ) { - //if ( item->type == InlineItem::Call || item->type == InlineItem::CallExpr ) - // action->anyCall = true; - - /* Need to recurse into longest match items. */ - if ( item->type == InlineItem::LmSwitch ) { - TokenRegion *lm = item->tokenRegion; - for ( TokenDefListReg::Iter lmi = lm->tokenDefList; lmi.lte(); lmi++ ) { - if ( lmi->action != 0 ) - analyzeAction( action, lmi->action->inlineList ); - } - } - - if ( item->type == InlineItem::LmOnLast || - item->type == InlineItem::LmOnNext || - item->type == InlineItem::LmOnLagBehind ) - { - TokenDef *lmi = item->longestMatchPart; - if ( lmi->action != 0 ) - analyzeAction( action, lmi->action->inlineList ); - } - - if ( item->children != 0 ) - analyzeAction( action, item->children ); - } -} - -void Compiler::analyzeGraph( FsmGraph *graph ) -{ - for ( ActionList::Iter act = actionList; act.lte(); act++ ) - analyzeAction( act, act->inlineList ); - - for ( StateList::Iter st = graph->stateList; st.lte(); st++ ) { - /* The transition list. */ - for ( TransList::Iter trans = st->outList; trans.lte(); trans++ ) { - for ( ActionTable::Iter at = trans->actionTable; at.lte(); at++ ) - at->value->numTransRefs += 1; - } - - for ( ActionTable::Iter at = st->toStateActionTable; at.lte(); at++ ) - at->value->numToStateRefs += 1; - - for ( ActionTable::Iter at = st->fromStateActionTable; at.lte(); at++ ) - at->value->numFromStateRefs += 1; - - for ( ActionTable::Iter at = st->eofActionTable; at.lte(); at++ ) - at->value->numEofRefs += 1; - - for ( StateCondList::Iter sc = st->stateCondList; sc.lte(); sc++ ) { - for ( CondSet::Iter sci = sc->condSpace->condSet; sci.lte(); sci++ ) - (*sci)->numCondRefs += 1; - } - } -} - -FsmGraph *Compiler::makeScanner() -{ - /* Make the graph, do minimization. */ - FsmGraph *fsmGraph = makeAllRegions(); - - /* If any errors have occured in the input file then don't write anything. */ - if ( gblErrorCount > 0 ) - return 0; - - analyzeGraph( fsmGraph ); - - /* Decide if an error state is necessary. - * 1. There is an error transition - * 2. There is a gap in the transitions - * 3. The longest match operator requires it. */ - if ( fsmGraph->lmRequiresErrorState || fsmGraph->hasErrorTrans() ) - fsmGraph->errState = fsmGraph->addState(); - - /* State numbers need to be assigned such that all final states have a - * larger state id number than all non-final states. This enables the - * first_final mechanism to function correctly. We also want states to be - * ordered in a predictable fashion. So we first apply a depth-first - * search, then do a stable sort by final state status, then assign - * numbers. */ - - fsmGraph->depthFirstOrdering(); - fsmGraph->sortStatesByFinal(); - fsmGraph->setStateNumbers( 0 ); - - return fsmGraph; -} - -void Compiler::createDefaultScanner() -{ - InputLoc loc = { 0, 0, 0 }; - - const char *name = "___DEFAULT_SCANNER"; - - /* Create the default namespace. */ - defaultNamespace = new Namespace( InputLoc(), name, - namespaceList.length(), 0 ); - namespaceList.append( defaultNamespace ); - - /* Create a scanner which will be used when no other scanner can be - * figured out. It returns single characters. */ - defaultRegion = new TokenRegion( InputLoc(), name, - regionList.length(), 0 ); - regionList.append( defaultRegion ); - - /* Insert the machine definition into the graph dictionary. */ - RegionGraphDictEl *newEl = rootNamespace->graphDict.insert( name ); - assert( newEl != 0 ); - newEl->value = new RegionDef( name, defaultRegion ); - newEl->isInstance = true; - instanceList.append( newEl ); - - Join *join = new Join( new Expression( BT_Any ) ); - - TokenDef *tokenDef = new TokenDef( name, String(), false, false, - join, 0, loc, nextTokenId++, - rootNamespace, defaultRegion, 0, 0, 0 ); - - defaultRegion->tokenDefList.append( tokenDef ); - - /* Now create the one and only token -> "<chr>" / any / */ - name = "___DEFAULT_SCANNER_CHR"; - defaultCharLangEl = addLangEl( this, defaultNamespace, name, LangEl::Term ); - - tokenDef->tdLangEl = defaultCharLangEl; - defaultCharLangEl->tokenDef = tokenDef; -} - -LangEl *Compiler::makeRepeatProd( Namespace *nspace, const String &repeatName, - NamespaceQual *nspaceQual, const String &name ) -{ - LangEl *prodName = addLangEl( this, nspace, repeatName, LangEl::NonTerm ); - prodName->isRepeat = true; - - ProdElList *prodElList1 = new ProdElList; - - /* Build the first production of the repeat. */ - TypeRef *typeRef1 = new TypeRef( InputLoc(), nspaceQual, name ); - ProdEl *factor1 = new ProdEl( ProdEl::ReferenceType, InputLoc(), 0, false, typeRef1, 0 ); - - UniqueType *prodNameUT = findUniqueType( TYPE_TREE, prodName ); - TypeRef *typeRef2 = new TypeRef( InputLoc(), prodNameUT ); - ProdEl *factor2 = new ProdEl( ProdEl::ReferenceType, InputLoc(), 0, false, typeRef2, 0 ); - - prodElList1->append( factor1 ); - prodElList1->append( factor2 ); - - Definition *newDef1 = new Definition( InputLoc(), - prodName, prodElList1, false, 0, - prodList.length(), prodName->defList.length(), - Definition::Production ); - - prodName->defList.append( newDef1 ); - prodList.append( newDef1 ); - - /* Build the second production of the repeat. */ - ProdElList *prodElList2 = new ProdElList; - - Definition *newDef2 = new Definition( InputLoc(), - prodName, prodElList2, false, 0, - prodList.length(), prodName->defList.length(), - Definition::Production ); - - prodName->defList.append( newDef2 ); - prodList.append( newDef2 ); - - return prodName; -} - -LangEl *Compiler::makeListProd( Namespace *nspace, const String &listName, NamespaceQual *nspaceQual, const String &name ) -{ - LangEl *prodName = addLangEl( this, nspace, listName, LangEl::NonTerm ); - prodName->isList = true; - - /* Build the first production of the list. */ - TypeRef *typeRef1 = new TypeRef( InputLoc(), nspaceQual, name ); - ProdEl *factor1 = new ProdEl( ProdEl::ReferenceType, InputLoc(), 0, false, typeRef1, 0 ); - - UniqueType *prodNameUT = findUniqueType( TYPE_TREE, prodName ); - TypeRef *typeRef2 = new TypeRef( InputLoc(), prodNameUT ); - ProdEl *factor2 = new ProdEl( ProdEl::ReferenceType, InputLoc(), 0, false, typeRef2, 0 ); - - ProdElList *prodElList1 = new ProdElList; - prodElList1->append( factor1 ); - prodElList1->append( factor2 ); - - Definition *newDef1 = new Definition( InputLoc(), - prodName, prodElList1, false, 0, - prodList.length(), prodName->defList.length(), - Definition::Production ); - - prodName->defList.append( newDef1 ); - prodList.append( newDef1 ); - - /* Build the second production of the list. */ - TypeRef *typeRef3 = new TypeRef( InputLoc(), nspaceQual, name ); - ProdEl *factor3 = new ProdEl( ProdEl::ReferenceType, InputLoc(), 0, false, typeRef3, 0 ); - - ProdElList *prodElList2 = new ProdElList; - prodElList2->append( factor3 ); - - Definition *newDef2 = new Definition( InputLoc(), - prodName, prodElList2, false, 0, - prodList.length(), prodName->defList.length(), - Definition::Production ); - - prodName->defList.append( newDef2 ); - prodList.append( newDef2 ); - - return prodName; -} - -LangEl *Compiler::makeOptProd( Namespace *nspace, const String &optName, NamespaceQual *nspaceQual, const String &name ) -{ - LangEl *prodName = addLangEl( this, nspace, optName, LangEl::NonTerm ); - prodName->isOpt = true; - - ProdElList *prodElList1 = new ProdElList; - - /* Build the first production of the repeat. */ - TypeRef *typeRef1 = new TypeRef( InputLoc(), nspaceQual, name ); - ProdEl *factor1 = new ProdEl( ProdEl::ReferenceType, InputLoc(), 0, false, typeRef1, 0 ); - prodElList1->append( factor1 ); - - Definition *newDef1 = new Definition( InputLoc(), - prodName, prodElList1, false, 0, - prodList.length(), prodName->defList.length(), - Definition::Production ); - - prodName->defList.append( newDef1 ); - prodList.append( newDef1 ); - - /* Build the second production of the repeat. */ - ProdElList *prodElList2 = new ProdElList; - - Definition *newDef2 = new Definition( InputLoc(), - prodName, prodElList2, false, 0, - prodList.length(), prodName->defList.length(), - Definition::Production ); - - prodName->defList.append( newDef2 ); - prodList.append( newDef2 ); - - return prodName; -} - -Namespace *Namespace::findNamespace( const String &name ) -{ - for ( NamespaceVect::Iter c = childNamespaces; c.lte(); c++ ) { - if ( strcmp( name, (*c)->name ) == 0 ) - return *c; - } - return 0; -} - -/* Search from a previously resolved qualification. (name 1+ in a qual list). */ -Namespace *NamespaceQual::searchFrom( Namespace *from, StringVect::Iter &qualPart ) -{ - /* While there are still parts in the qualification. */ - while ( qualPart.lte() ) { - Namespace *child = from->findNamespace( *qualPart ); - if ( child == 0 ) - return 0; - - from = child; - qualPart.increment(); - } - - return from; -} - -Namespace *NamespaceQual::getQual( Compiler *pd ) -{ - /* Do the search only once. */ - if ( cachedNspaceQual != 0 ) - return cachedNspaceQual; - - if ( qualNames.length() == 0 ) { - /* No qualification, use the region the qualification was - * declared in. */ - cachedNspaceQual = declInNspace; - } - else if ( strcmp( qualNames[0], "root" ) == 0 ) { - /* First item is "root." Start the downward search from there. */ - StringVect::Iter qualPart = qualNames; - qualPart.increment(); - cachedNspaceQual = searchFrom( pd->rootNamespace, qualPart ); - return cachedNspaceQual; - } - else { - /* Have a qualification. Move upwards through the declared - * regions looking for the first part. */ - StringVect::Iter qualPart = qualNames; - Namespace *parentNamespace = declInNspace; - while ( parentNamespace != 0 ) { - /* Search for the first part underneath the current parent. */ - Namespace *child = parentNamespace->findNamespace( *qualPart ); - - if ( child != 0 ) { - /* Found the first part. Start going below the result. */ - qualPart.increment(); - cachedNspaceQual = searchFrom( child, qualPart ); - return cachedNspaceQual; - } - - /* Not found, move up to the parent. */ - parentNamespace = parentNamespace->parentNamespace; - } - - /* Failed to find the place to start from. */ - cachedNspaceQual = 0; - } - - return cachedNspaceQual; -} - -void Compiler::initEmptyScanners() -{ - for ( RegionList::Iter reg = regionList; reg.lte(); reg++ ) { - if ( reg->tokenDefList.length() == 0 ) { - reg->wasEmpty = true; - - static int def = 1; - InputLoc loc = { 0, 0, 0 }; - String name( reg->name.length() + 16, "__%s_DEF_PAT_%d", reg->name.data, def++ ); - - Join *join = new Join( new Expression( BT_Any ) ); - - TokenDef *tokenDef = new TokenDef( name, String(), false, false, join, - 0, loc, nextTokenId++, rootNamespace, reg, 0, 0, 0 ); - reg->tokenDefList.append( tokenDef ); - - /* These do not go in the namespace so so they cannot get declared - * in the declare pass. */ - LangEl *lel = addLangEl( this, rootNamespace, name, LangEl::Term ); - - tokenDef->tdLangEl = lel; - lel->tokenDef = tokenDef; - } - } -} - - -void Compiler::parsePatterns() -{ - Program *prg = colmNewProgram( runtimeData, 0, 0 ); - - /* Turn off context-dependent parsing. */ - prg->ctxDepParsing = 0; - - Tree **vm_stack = stackAlloc(); - Tree **root = &vm_stack[VM_STACK_SIZE]; - - for ( ReplList::Iter repl = replList; repl.lte(); repl++ ) { - if ( colm_log_compile ) { - cerr << "parsing replacement at " << - repl->loc.line << ' ' << repl->loc.col << endl; - } - - InputStream *in = new InputStream; - FsmRun *fsmRun = new FsmRun; - repl->pdaRun = new PdaRun; - - initInputStream( in ); - initPdaRun( repl->pdaRun, prg, pdaTables, fsmRun, repl->langEl->parserId, 0, false, 0 ); - initFsmRun( fsmRun, prg ); - - Stream *res = streamAllocate( prg ); - res->id = LEL_ID_STREAM; - res->in = newSourceStreamRepl( repl ); - appendStream( in, (Tree*)res ); - setEof( in ); - - newToken( prg, repl->pdaRun, fsmRun ); - long pcr = parseLoop( prg, root, repl->pdaRun, fsmRun, in, PcrStart ); - assert( pcr == PcrDone ); - if ( repl->pdaRun->parseError ) - cout << "parse error" << endp; - } - - for ( PatternList::Iter pat = patternList; pat.lte(); pat++ ) { - if ( colm_log_compile ) { - cerr << "parsing pattern at " << - pat->loc.line << ' ' << pat->loc.col << endl; - } - - InputStream *in = new InputStream; - FsmRun *fsmRun = new FsmRun; - pat->pdaRun = new PdaRun; - - initInputStream( in ); - initPdaRun( pat->pdaRun, prg, pdaTables, fsmRun, pat->langEl->parserId, 0, false, 0 ); - initFsmRun( fsmRun, prg ); - - Stream *res = streamAllocate( prg ); - res->id = LEL_ID_STREAM; - res->in = newSourceStreamPattern( pat ); - appendStream( in, (Tree*)res ); - setEof( in ); - - newToken( prg, pat->pdaRun, fsmRun ); - long pcr = parseLoop( prg, root, pat->pdaRun, fsmRun, in, PcrStart ); - assert( pcr == PcrDone ); - if ( pat->pdaRun->parseError ) - cout << "parse error" << endp; - } - - fillInPatterns( prg ); -} - -void Compiler::collectParserEls( BstSet<LangEl*> &parserEls ) -{ - for ( PatternList::Iter pat = patternList; pat.lte(); pat++ ) { - /* We assume the reduction action compilation phase was run before - * pattern parsing and it decorated the pattern with the target type. */ - assert( pat->langEl != 0 ); - if ( pat->langEl->type != LangEl::NonTerm ) - error(pat->loc) << "pattern type is not a non-terminal" << endp; - - if ( pat->langEl->parserId < 0 ) { - /* Make a parser for the language element. */ - parserEls.insert( pat->langEl ); - pat->langEl->parserId = nextParserId++; - } - } - - for ( ReplList::Iter repl = replList; repl.lte(); repl++ ) { - /* We assume the reduction action compilation phase was run before - * replacement parsing decorated the replacement with the target type. */ - assert( repl->langEl != 0 ); - - if ( repl->langEl->parserId < 0 ) { - /* Make a parser for the language element. */ - parserEls.insert( repl->langEl ); - repl->langEl->parserId = nextParserId++; - } - } - - /* Make parsers that we need. */ - for ( LelList::Iter lel = langEls; lel.lte(); lel++ ) { - if ( lel->parserId >= 0 ) - parserEls.insert( lel ); - } -} - - -void Compiler::generateOutput() -{ - FsmCodeGen *fsmGen = new FsmCodeGen("<INPUT>", sectionName, - *outStream, redFsm, fsmTables ); - - PdaCodeGen *pdaGen = new PdaCodeGen( outputFileName, "parser", this, *outStream ); - - fsmGen->writeIncludes(); - pdaGen->defineRuntime(); - fsmGen->writeCode(); - - /* Make parsers that we need. */ - pdaGen->writeParserData( 0, pdaTables ); - - /* Write the runtime data. */ - pdaGen->writeRuntimeData( runtimeData, pdaTables ); - - if ( !gblLibrary ) - fsmGen->writeMain(); - - outStream->flush(); -} - - -void Compiler::prepGrammar() -{ - /* This will create language elements. */ - wrapNonTerminals(); - - makeLangElIds(); - makeLangElNames(); - makeDefinitionNames(); - noUndefindLangEls(); - - /* Put the language elements in an index by language element id. */ - langElIndex = new LangEl*[nextSymbolId+1]; - memset( langElIndex, 0, sizeof(LangEl*)*(nextSymbolId+1) ); - for ( LelList::Iter lel = langEls; lel.lte(); lel++ ) - langElIndex[lel->id] = lel; - - makeProdFsms(); - - /* Allocate the Runtime data now. Every PdaTable that we make - * will reference it, but it will be filled in after all the tables are - * built. */ - runtimeData = new RuntimeData; -} - -void Compiler::compile() -{ - beginProcessing(); - initKeyOps(); - - - /* Type declaration. */ - typeDeclaration(); - - /* Type resolving. */ - typeResolve(); - - makeTerminalWrappers(); - makeEofElements(); - - /* - * Parsers - */ - - /* Init the longest match data */ - initLongestMatchData(); - FsmGraph *fsmGraph = makeScanner(); - - if ( colm_log_compile ) { - printNameTree( fsmGraph->rootName ); - printNameIndex( fsmGraph->nameIndex ); - } - - prepGrammar(); - - /* Compile bytecode. */ - compileByteCode(); - - /* Make the reduced fsm. */ - RedFsmBuild reduce( sectionName, this, fsmGraph ); - redFsm = reduce.reduceMachine(); - - BstSet<LangEl*> parserEls; - collectParserEls( parserEls ); - - makeParser( parserEls ); - - /* Make the scanner tables. */ - fsmTables = redFsm->makeFsmTables(); - - /* Now that all parsers are built, make the global runtimeData. */ - makeRuntimeData(); - - /* - * All compilation is now complete. - */ - - /* Parse patterns and replacements. */ - parsePatterns(); -} - |