summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--CHANGES42
-rw-r--r--doc/internal.html851
-rw-r--r--doc/ply.html785
-rw-r--r--example/BASIC/basiclog.py79
-rw-r--r--example/BASIC/basparse.py4
-rw-r--r--example/calc/calc.py6
-rw-r--r--example/calcdebug/calc.py113
-rw-r--r--example/closurecalc/calc.py6
-rw-r--r--example/optcalc/README2
-rw-r--r--ply/lex.py41
-rw-r--r--ply/yacc.py3585
-rw-r--r--test/testyacc.py141
-rw-r--r--test/yacc_badid.py2
13 files changed, 3594 insertions, 2063 deletions
diff --git a/CHANGES b/CHANGES
index b82acb0..784784c 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,5 +1,45 @@
-Version 2.6
+Version 3.0
-----------------------------
+01/13/09: beazley
+ Minor change to the procedure for signalling a syntax error in a
+ production rule. A normal SyntaxError exception should be raised
+ instead of yacc.SyntaxError.
+
+01/13/09: beazley
+ Added a new method p.set_lineno(n,lineno) that can be used to set the
+ line number of symbol n in grammar rules. This simplifies manual
+ tracking of line numbers.
+
+01/11/09: beazley
+ Vastly improved debugging support for yacc.parse(). Instead of passing
+ debug as an integer, you can supply a Logging object (see the logging
+ module). Messages will be generated at the ERROR, INFO, and DEBUG
+ logging levels, each level providing progressively more information.
+ The debugging trace also shows states, grammar rule, values passed
+ into grammar rules, and the result of each reduction.
+
+01/09/09: beazley
+ The yacc() command now does all error-reporting and diagnostics using
+ the interface of the logging module. Use the errorlog parameter to
+ specify a logging object for error messages. Use the debuglog parameter
+ to specify a logging object for the 'parser.out' output.
+
+01/09/09: beazley
+ *HUGE* refactoring of the ply.yacc() implementation. The high-level
+ user interface is backwards compatible, but the internals are completely
+ reorganized into classes. No more global variables. The internals
+ are also more extensible. For example, you can use the classes to
+ construct a LALR(1) parser in an entirely different manner than
+ what is currently the case. Documentation is forthcoming.
+
+01/07/09: beazley
+ Various cleanup and refactoring of yacc internals.
+
+01/06/09: beazley
+ Fixed a bug with precedence assignment. yacc was assigning the precedence
+ of each rule based on the left-most token, when in fact, it should have been
+ using the right-most token. Reported by Bruce Frederiksen.
+
11/27/08: beazley
Numerous changes to support Python 3.0 including removal of deprecated
statements (e.g., has_key) and the additional of compatibility code
diff --git a/doc/internal.html b/doc/internal.html
new file mode 100644
index 0000000..9192bcb
--- /dev/null
+++ b/doc/internal.html
@@ -0,0 +1,851 @@
+<html>
+<head>
+<title>PLY Internals</title>
+</head>
+<body bgcolor="#ffffff">
+
+<h1>PLY Internals</h1>
+
+<b>
+David M. Beazley <br>
+dave@dabeaz.com<br>
+</b>
+
+<p>
+<b>PLY Version: 3.0</b>
+<p>
+
+<!-- INDEX -->
+<!-- INDEX -->
+
+<H2>1. Introduction</H2>
+
+This document describes classes and functions that make up the internal
+operation of PLY. Using this programming interface, it is possible to
+manually build a parser using a different interface specification
+than what PLY normally uses. For example, you could build a grammar
+from information parsed in a completely different input format. Some of
+these objects may be useful for building more advanced parsing engines
+such as GLR.
+
+<p>
+It should be stressed that using PLY at this level is not for the
+faint of heart. Generally, it's assumed that you know a bit of
+the underlying compiler theory and how an LR parser is put together.
+
+<h2>2. Grammar Class</h2>
+
+The file <tt>ply.yacc</tt> defines a class <tt>Grammar</tt> that
+is used to hold and manipulate information about a grammar
+specification. It encapsulates the same basic information
+about a grammar that is put into a YACC file including
+the list of tokens, precedence rules, and grammar rules.
+Various operations are provided to perform different validations
+on the grammar. In addition, there are operations to compute
+the first and follow sets that are needed by the various table
+generation algorithms.
+
+<p>
+<tt><b>Grammar(terminals)</b></tt>
+
+<blockquote>
+Creates a new grammar object. <tt>terminals</tt> is a list of strings
+specifying the terminals for the grammar. An instance <tt>g</tt> of
+<tt>Grammar</tt> has the following methods:
+</blockquote>
+
+<p>
+<b><tt>g.set_precedence(term,assoc,level)</tt></b>
+<blockquote>
+Sets the precedence level and associativity for a given terminal <tt>term</tt>.
+<tt>assoc</tt> is one of <tt>'right'</tt>,
+<tt>'left'</tt>, or <tt>'nonassoc'</tt> and <tt>level</tt> is a positive integer. The higher
+the value of <tt>level</tt>, the higher the precedence. Here is an example of typical
+precedence settings:
+
+<pre>
+g.set_precedence('PLUS', 'left',1)
+g.set_precedence('MINUS', 'left',1)
+g.set_precedence('TIMES', 'left',2)
+g.set_precedence('DIVIDE','left',2)
+g.set_precedence('UMINUS','left',3)
+</pre>
+
+This method must be called prior to adding any productions to the
+grammar with <tt>g.add_production()</tt>. The precedence of individual grammar
+rules is determined by the precedence of the right-most terminal.
+
+</blockquote>
+<p>
+<b><tt>g.add_production(name,syms,func=None,file='',line=0)</tt></b>
+<blockquote>
+Adds a new grammar rule. <tt>name</tt> is the name of the rule,
+<tt>syms</tt> is a list of symbols making up the right hand
+side of the rule, <tt>func</tt> is the function to call when
+reducing the rule. <tt>file</tt> and <tt>line</tt> specify
+the filename and line number of the rule and are used for
+generating error messages.
+
+<p>
+The list of symbols in <tt>syms</tt> may include character
+literals and <tt>%prec</tt> specifiers. Here are some
+examples:
+
+<pre>
+g.add_production('expr',['expr','PLUS','term'],func,file,line)
+g.add_production('expr',['expr','"+"','term'],func,file,line)
+g.add_production('expr',['MINUS','expr','%prec','UMINUS'],func,file,line)
+</pre>
+
+<p>
+If any kind of error is detected, a <tt>GrammarError</tt> exception
+is raised with a message indicating the reason for the failure.
+</blockquote>
+
+<p>
+<b><tt>g.set_start(start=None)</tt></b>
+<blockquote>
+Sets the starting rule for the grammar. <tt>start</tt> is a string
+specifying the name of the start rule. If <tt>start</tt> is omitted,
+the first grammar rule added with <tt>add_production()</tt> is taken to be
+the starting rule. This method must always be called after all
+productions have been added.
+</blockquote>
+
+<p>
+<b><tt>g.find_unreachable()</tt></b>
+<blockquote>
+Diagnostic function. Returns a list of all unreachable non-terminals
+defined in the grammar. This is used to identify inactive parts of
+the grammar specification.
+</blockquote>
+
+<p>
+<b><tt>g.infinite_cycle()</tt></b>
+<blockquote>
+Diagnostic function. Returns a list of all non-terminals in the
+grammar that result in an infinite cycle. This condition occurs if
+there is no way for a grammar rule to expand to a string containing
+only terminal symbols.
+</blockquote>
+
+<p>
+<b><tt>g.undefined_symbols()</tt></b>
+<blockquote>
+Diagnostic function. Returns a list of tuples <tt>(name, prod)</tt>
+corresponding to undefined symbols in the grammar. <tt>name</tt> is the
+name of the undefined symbol and <tt>prod</tt> is an instance of
+<tt>Production</tt> which has information about the production rule
+where the undefined symbol was used.
+</blockquote>
+
+<p>
+<b><tt>g.unused_terminals()</tt></b>
+<blockquote>
+Diagnostic function. Returns a list of terminals that were defined,
+but never used in the grammar.
+</blockquote>
+
+<p>
+<b><tt>g.unused_rules()</tt></b>
+<blockquote>
+Diagnostic function. Returns a list of <tt>Production</tt> instances
+corresponding to production rules that were defined in the grammar,
+but never used anywhere. This is slightly different
+than <tt>find_unreachable()</tt>.
+</blockquote>
+
+<p>
+<b><tt>g.unused_precedence()</tt></b>
+<blockquote>
+Diagnostic function. Returns a list of tuples <tt>(term, assoc)</tt>
+corresponding to precedence rules that were set, but never used in the
+grammar. <tt>term</tt> is the terminal name and <tt>assoc</tt> is the
+precedence associativity (e.g., <tt>'left'</tt>, <tt>'right'</tt>,
+or <tt>'nonassoc'</tt>).
+</blockquote>
+
+<p>
+<b><tt>g.compute_first()</tt></b>
+<blockquote>
+Compute all of the first sets for all symbols in the grammar. Returns a dictionary
+mapping symbol names to a list of all first symbols.
+</blockquote>
+
+<p>
+<b><tt>g.compute_follow()</tt></b>
+<blockquote>
+Compute all of the follow sets for all non-terminals in the grammar.
+The follow set is the set of all possible symbols that might follow a
+given non-terminal. Returns a dictionary mapping non-terminal names
+to a list of symbols.
+</blockquote>
+
+<p>
+<b><tt>g.build_lritems()</tt></b>
+<blockquote>
+Calculates all of the LR items for all productions in the grammar. This
+step is required before using the grammar for any kind of table generation.
+See the section on LR items below.
+</blockquote>
+
+<p>
+The following attributes are set by the above methods and may be useful
+in code that works with the grammar. All of these attributes should be
+assumed to be read-only. Changing their values directly will likely
+break the grammar.
+
+<p>
+<b><tt>g.Productions</tt></b>
+<blockquote>
+A list of all productions added. The first entry is reserved for
+a production representing the starting rule. The objects in this list
+are instances of the <tt>Production</tt> class, described shortly.
+</blockquote>
+
+<p>
+<b><tt>g.Prodnames</tt></b>
+<blockquote>
+A dictionary mapping the names of nonterminals to a list of all
+productions of that nonterminal.
+</blockquote>
+
+<p>
+<b><tt>g.Terminals</tt></b>
+<blockquote>
+A dictionary mapping the names of terminals to a list of the
+production numbers where they are used.
+</blockquote>
+
+<p>
+<b><tt>g.Nonterminals</tt></b>
+<blockquote>
+A dictionary mapping the names of nonterminals to a list of the
+production numbers where they are used.
+</blockquote>
+
+<p>
+<b><tt>g.First</tt></b>
+<blockquote>
+A dictionary representing the first sets for all grammar symbols. This is
+computed and returned by the <tt>compute_first()</tt> method.
+</blockquote>
+
+<p>
+<b><tt>g.Follow</tt></b>
+<blockquote>
+A dictionary representing the follow sets for all grammar rules. This is
+computed and returned by the <tt>compute_follow()</tt> method.
+</blockquote>
+
+<p>
+<b><tt>g.Start</tt></b>
+<blockquote>
+Starting symbol for the grammar. Set by the <tt>set_start()</tt> method.
+</blockquote>
+
+For the purposes of debugging, a <tt>Grammar</tt> object supports the <tt>__len__()</tt> and
+<tt>__getitem__()</tt> special methods. Accessing <tt>g[n]</tt> returns the nth production
+from the grammar.
+
+
+<h2>3. Productions</h2>
+
+<tt>Grammar</tt> objects store grammar rules as instances of a <tt>Production</tt> class. This
+class has no public constructor--you should only create productions by calling <tt>Grammar.add_production()</tt>.
+The following attributes are available on a <tt>Production</tt> instance <tt>p</tt>.
+
+<p>
+<b><tt>p.name</tt></b>
+<blockquote>
+The name of the production. For a grammar rule such as <tt>A : B C D</tt>, this is <tt>'A'</tt>.
+</blockquote>
+
+<p>
+<b><tt>p.prod</tt></b>
+<blockquote>
+A tuple of symbols making up the right-hand side of the production. For a grammar rule such as <tt>A : B C D</tt>, this is <tt>('B','C','D')</tt>.
+</blockquote>
+
+<p>
+<b><tt>p.number</tt></b>
+<blockquote>
+Production number. An integer containing the index of the production in the grammar's <tt>Productions</tt> list.
+</blockquote>
+
+<p>
+<b><tt>p.func</tt></b>
+<blockquote>
+The name of the reduction function associated with the production.
+This is the function that will execute when reducing the entire
+grammar rule during parsing.
+</blockquote>
+
+<p>
+<b><tt>p.callable</tt></b>
+<blockquote>
+The callable object associated with the name in <tt>p.func</tt>. This is <tt>None</tt>
+unless the production has been bound using <tt>bind()</tt>.
+</blockquote>
+
+<p>
+<b><tt>p.file</tt></b>
+<blockquote>
+Filename associated with the production. Typically this is the file where the production was defined. Used for error messages.
+</blockquote>
+
+<p>
+<b><tt>p.lineno</tt></b>
+<blockquote>
+Line number associated with the production. Typically this is the line number in <tt>p.file</tt> where the production was defined. Used for error messages.
+</blockquote>
+
+<p>
+<b><tt>p.prec</tt></b>
+<blockquote>
+Precedence and associativity associated with the production. This is a tuple <tt>(assoc,level)</tt> where
+<tt>assoc</tt> is one of <tt>'left'</tt>,<tt>'right'</tt>, or <tt>'nonassoc'</tt> and <tt>level</tt> is
+an integer. This value is determined by the precedence of the right-most terminal symbol in the production
+or by use of the <tt>%prec</tt> specifier when adding the production.
+</blockquote>
+
+<p>
+<b><tt>p.usyms</tt></b>
+<blockquote>
+A list of all unique symbols found in the production.
+</blockquote>
+
+<p>
+<b><tt>p.lr_items</tt></b>
+<blockquote>
+A list of all LR items for this production. This attribute only has a meaningful value if the
+<tt>Grammar.build_lritems()</tt> method has been called. The items in this list are
+instances of <tt>LRItem</tt> described below.
+</blockquote>
+
+<p>
+<b><tt>p.lr_next</tt></b>
+<blockquote>
+The head of a linked-list representation of the LR items in <tt>p.lr_items</tt>.
+This attribute only has a meaningful value if the <tt>Grammar.build_lritems()</tt>
+method has been called. Each <tt>LRItem</tt> instance has a <tt>lr_next</tt> attribute
+to move to the next item. The list is terminated by <tt>None</tt>.
+</blockquote>
+
+<p>
+<b><tt>p.bind(dict)</tt></b>
+<blockquote>
+Binds the production function name in <tt>p.func</tt> to a callable object in
+<tt>dict</tt>. This operation is typically carried out in the last step
+prior to running the parsing engine and is needed since parsing tables are typically
+read from files which only include the function names, not the functions themselves.
+</blockquote>
+
+<P>
+<tt>Production</tt> objects support
+the <tt>__len__()</tt>, <tt>__getitem__()</tt>, and <tt>__str__()</tt>
+special methods.
+<tt>len(p)</tt> returns the number of symbols in <tt>p.prod</tt>
+and <tt>p[n]</tt> is the same as <tt>p.prod[n]</tt>.
+
+<h2>4. LRItems</h2>
+
+The construction of parsing tables in an LR-based parser generator is primarily
+done over a set of "LR Items". An LR item represents a stage of parsing one
+of the grammar rules. To compute the LR items, it is first necessary to
+call <tt>Grammar.build_lritems()</tt>. Once this step is complete, all of the productions
+in the grammar will have their LR items attached to them.
+
+<p>
+Here is an interactive example that shows what LR items look like if you
+interactively experiment. In this example, <tt>g</tt> is a <tt>Grammar</tt>
+object.
+
+<blockquote>
+<pre>
+>>> <b>g.build_lritems()</b>
+>>> <b>p = g[1]</b>
+>>> <b>p</b>
+Production(statement -> ID = expr)
+>>>
+</pre>
+</blockquote>
+
+In the above code, <tt>p</tt> represents the first grammar rule. In
+this case, a rule <tt>'statement -> ID = expr'</tt>.
+
+<p>
+Now, let's look at the LR items for <tt>p</tt>.
+
+<blockquote>
+<pre>
+>>> <b>p.lr_items</b>
+[LRItem(statement -> . ID = expr),
+ LRItem(statement -> ID . = expr),
+ LRItem(statement -> ID = . expr),
+ LRItem(statement -> ID = expr .)]
+>>>
+</pre>
+</blockquote>
+
+In each LR item, the dot (.) represents a specific stage of parsing. In each LR item, the dot
+is advanced by one symbol. It is only when the dot reaches the very end that a production
+is successfully parsed.
+
+<p>
+An instance <tt>lr</tt> of <tt>LRItem</tt> has the following
+attributes that hold information related to that specific stage of
+parsing.
+
+<p>
+<b><tt>lr.name</tt></b>
+<blockquote>
+The name of the grammar rule. For example, <tt>'statement'</tt> in the above example.
+</blockquote>
+
+<p>
+<b><tt>lr.prod</tt></b>
+<blockquote>
+A tuple of symbols representing the right-hand side of the production, including the
+special <tt>'.'</tt> character. For example, <tt>('ID','.','=','expr')</tt>.
+</blockquote>
+
+<p>
+<b><tt>lr.number</tt></b>
+<blockquote>
+An integer representing the production number in the grammar.
+</blockquote>
+
+<p>
+<b><tt>lr.usyms</tt></b>
+<blockquote>
+A set of unique symbols in the production. Inherited from the original <tt>Production</tt> instance.
+</blockquote>
+
+<p>
+<b><tt>lr.lr_index</tt></b>
+<blockquote>
+An integer representing the position of the dot (.). You should never use <tt>lr.prod.index()</tt>
+to search for it--the result will be wrong if the grammar happens to also use (.) as a character
+literal.
+</blockquote>
+
+<p>
+<b><tt>lr.lr_after</tt></b>
+<blockquote>
+A list of all productions that can legally appear immediately to the right of the
+dot (.). This list contains <tt>Production</tt> instances. This attribute
+represents all of the possible branches a parse can take from the current position.
+For example, suppose that <tt>lr</tt> represents a stage immediately before
+an expression like this:
+
+<pre>
+>>> <b>lr</b>
+LRItem(statement -> ID = . expr)
+>>>
+</pre>
+
+Then, the value of <tt>lr.lr_after</tt> might look like this, showing all productions that
+can legally appear next:
+
+<pre>
+>>> <b>lr.lr_after</b>
+[Production(expr -> expr PLUS expr),
+ Production(expr -> expr MINUS expr),
+ Production(expr -> expr TIMES expr),
+ Production(expr -> expr DIVIDE expr),
+ Production(expr -> MINUS expr),
+ Production(expr -> LPAREN expr RPAREN),
+ Production(expr -> NUMBER),
+ Production(expr -> ID)]
+>>>
+</pre>
+
+</blockquote>
+
+<p>
+<b><tt>lr.lr_before</tt></b>
+<blockquote>
+The grammar symbol that appears immediately before the dot (.) or <tt>None</tt> if
+at the beginning of the parse.
+</blockquote>
+
+<p>
+<b><tt>lr.lr_next</tt></b>
+<blockquote>
+A link to the next LR item, representing the next stage of the parse. <tt>None</tt> if <tt>lr</tt>
+is the last LR item.
+</blockquote>
+
+<tt>LRItem</tt> instances also support the <tt>__len__()</tt> and <tt>__getitem__()</tt> special methods.
+<tt>len(lr)</tt> returns the number of items in <tt>lr.prod</tt> including the dot (.). <tt>lr[n]</tt>
+returns <tt>lr.prod[n]</tt>.
+
+<p>
+It goes without saying that all of the attributes associated with LR
+items should be assumed to be read-only. Modifications will very
+likely create a small black-hole that will consume you and your code.
+
+<h2>5. LRTable</h2>
+
+The <tt>LRTable</tt> class is used to represent LR parsing table data. This
+minimally includes the production list, action table, and goto table.
+
+<p>
+<b><tt>LRTable()</tt></b>
+<blockquote>
+Create an empty LRTable object. This object contains only the information needed to
+run an LR parser.
+</blockquote>
+
+An instance <tt>lrtab</tt> of <tt>LRTable</tt> has the following methods:
+
+<p>
+<b><tt>lrtab.read_table(module)</tt></b>
+<blockquote>
+Populates the LR table with information from the module specified in <tt>module</tt>.
+<tt>module</tt> is either a module object already loaded with <tt>import</tt> or
+the name of a Python module. If it's a string containing a module name, it is
+loaded and parsing data is extracted. Returns the signature value that was used
+when initially writing the tables. Raises a <tt>VersionError</tt> exception if
+the module was created using an incompatible version of PLY.
+</blockquote>
+
+<p>
+<b><tt>lrtab.bind_callables(dict)</tt></b>
+<blockquote>
+This binds all of the function names used in productions to callable objects
+found in the dictionary <tt>dict</tt>. During table generation and when reading
+LR tables from files, PLY only uses the names of action functions such as <tt>'p_expr'</tt>,
+<tt>'p_statement'</tt>, etc. In order to actually run the parser, these names
+have to be bound to callable objects. This method is always called prior to
+running a parser.
+</blockquote>
+
+After <tt>lrtab</tt> has been populated, the following attributes are defined.
+
+<p>
+<b><tt>lrtab.lr_method</tt></b>
+<blockquote>
+The LR parsing method used (e.g., <tt>'LALR'</tt>)
+</blockquote>
+
+
+<p>
+<b><tt>lrtab.lr_productions</tt></b>
+<blockquote>
+The production list. If the parsing tables have been newly
+constructed, this will be a list of <tt>Production</tt> instances. If
+the parsing tables have been read from a file, it's a list
+of <tt>MiniProduction</tt> instances. This, together
+with <tt>lr_action</tt> and <tt>lr_goto</tt> contain all of the
+information needed by the LR parsing engine.
+</blockquote>
+
+<p>
+<b><tt>lrtab.lr_action</tt></b>
+<blockquote>
+The LR action dictionary that implements the underlying state machine.
+The keys of this dictionary are the LR states.
+</blockquote>
+
+<p>
+<b><tt>lrtab.lr_goto</tt></b>
+<blockquote>
+The LR goto table that contains information about grammar rule reductions.
+</blockquote>
+
+
+<h2>6. LRGeneratedTable</h2>
+
+The <tt>LRGeneratedTable</tt> class represents constructed LR parsing tables on a
+grammar. It is a subclass of <tt>LRTable</tt>.
+
+<p>
+<b><tt>LRGeneratedTable(grammar, method='LALR',log=None)</tt></b>
+<blockquote>
+Create the LR parsing tables on a grammar. <tt>grammar</tt> is an instance of <tt>Grammar</tt>,
+<tt>method</tt> is a string with the parsing method (<tt>'SLR'</tt> or <tt>'LALR'</tt>), and
+<tt>log</tt> is a logger object used to write debugging information. The debugging information
+written to <tt>log</tt> is the same as what appears in the <tt>parser.out</tt> file created
+by yacc. By supplying a custom logger with a different message format, it is possible to get
+more information (e.g., the line number in <tt>yacc.py</tt> used for issuing each line of
+output in the log). The result is an instance of <tt>LRGeneratedTable</tt>.
+</blockquote>
+
+<p>
+An instance <tt>lr</tt> of <tt>LRGeneratedTable</tt> has the following attributes.
+
+<p>
+<b><tt>lr.grammar</tt></b>
+<blockquote>
+A link to the Grammar object used to construct the parsing tables.
+</blockquote>
+
+<p>
+<b><tt>lr.lr_method</tt></b>
+<blockquote>
+The LR parsing method used (e.g., <tt>'LALR'</tt>)
+</blockquote>
+
+
+<p>
+<b><tt>lr.lr_productions</tt></b>
+<blockquote>
+A reference to <tt>grammar.Productions</tt>. This, together with <tt>lr_action</tt> and <tt>lr_goto</tt>
+contain all of the information needed by the LR parsing engine.
+</blockquote>
+
+<p>
+<b><tt>lr.lr_action</tt></b>
+<blockquote>
+The LR action dictionary that implements the underlying state machine. The keys of this dictionary are
+the LR states.
+</blockquote>
+
+<p>
+<b><tt>lr.lr_goto</tt></b>
+<blockquote>
+The LR goto table that contains information about grammar rule reductions.
+</blockquote>
+
+<p>
+<b><tt>lr.sr_conflicts</tt></b>
+<blockquote>
+A list of tuples <tt>(state,token,resolution)</tt> identifying all shift/reduce conflicts. <tt>state</tt> is the LR state
+number where the conflict occurred, <tt>token</tt> is the token causing the conflict, and <tt>resolution</tt> is
+a string describing the resolution taken. <tt>resolution</tt> is either <tt>'shift'</tt> or <tt>'reduce'</tt>.
+</blockquote>
+
+<p>
+<b><tt>lr.rr_conflicts</tt></b>
+<blockquote>
+A list of tuples <tt>(state,rule,rejected)</tt> identifying all reduce/reduce conflicts. <tt>state</tt> is the
+LR state number where the conflict occurred, <tt>rule</tt> is the production rule that was selected
+and <tt>rejected</tt> is the production rule that was rejected. Both <tt>rule</tt> and <tt>rejected</tt> are
+instances of <tt>Production</tt>. They can be inspected to provide the user with more information.
+</blockquote>
+
+<p>
+There are two public methods of <tt>LRGeneratedTable</tt>.
+
+<p>
+<b><tt>lr.write_table(modulename,outputdir="",signature="")</tt></b>
+<blockquote>
+Writes the LR parsing table information to a Python module. <tt>modulename</tt> is a string
+specifying the name of a module such as <tt>"parsetab"</tt>. <tt>outputdir</tt> is the name of a
+directory where the module should be created. <tt>signature</tt> is a string representing a
+grammar signature that's written into the output file. This can be used to detect when
+the data stored in a module file is out-of-sync with the grammar specification (and that
+the tables need to be regenerated). If <tt>modulename</tt> is a string <tt>"parsetab"</tt>,
+this function creates a file called <tt>parsetab.py</tt>. If the module name represents a
+package such as <tt>"foo.bar.parsetab"</tt>, then only the last component, <tt>"parsetab"</tt> is
+used.
+</blockquote>
+
+
+<h2>7. LRParser</h2>
+
+The <tt>LRParser</tt> class implements the low-level LR parsing engine.
+
+
+<p>
+<b><tt>LRParser(lrtab, error_func)</tt></b>
+<blockquote>
+Create an LRParser. <tt>lrtab</tt> is an instance of <tt>LRTable</tt>
+containing the LR production and state tables. <tt>error_func</tt> is the
+error function to invoke in the event of a parsing error.
+</blockquote>
+
+An instance <tt>p</tt> of <tt>LRParser</tt> has the following methods:
+
+<p>
+<b><tt>p.parse(input=None,lexer=None,debug=0,tracking=0,tokenfunc=None)</tt></b>
+<blockquote>
+Run the parser. <tt>input</tt> is a string, which if supplied is fed into the
+lexer using its <tt>input()</tt> method. <tt>lexer</tt> is an instance of the
+<tt>Lexer</tt> class to use for tokenizing. If not supplied, the last lexer
+created with the <tt>lex</tt> module is used. <tt>debug</tt> is a boolean flag
+that enables debugging. <tt>tracking</tt> is a boolean flag that tells the
+parser to perform additional line number tracking. <tt>tokenfunc</tt> is a callable
+function that returns the next token. If supplied, the parser will use it to get
+all tokens.
+</blockquote>
+
+<p>
+<b><tt>p.restart()</tt></b>
+<blockquote>
+Resets the parser state for a parse already in progress.
+</blockquote>
+
+<h2>8. ParserReflect</h2>
+
+<p>
+The <tt>ParserReflect</tt> class is used to collect parser specification data
+from a Python module or object. This class is what collects all of the
+<tt>p_rule()</tt> functions in a PLY file, performs basic error checking,
+and collects all of the needed information to build a grammar. Most of the
+high-level PLY interface as used by the <tt>yacc()</tt> function is actually
+implemented by this class.
+
+<p>
+<b><tt>ParserReflect(pdict, log=None)</tt></b>
+<blockquote>
+Creates a <tt>ParserReflect</tt> instance. <tt>pdict</tt> is a dictionary
+containing parser specification data. This dictionary typically corresponds
+to the module or class dictionary of code that implements a PLY parser.
+<tt>log</tt> is a logger instance that will be used to report error
+messages.
+</blockquote>
+
+An instance <tt>p</tt> of <tt>ParserReflect</tt> has the following methods:
+
+<p>
+<b><tt>p.get_all()</tt></b>
+<blockquote>
+Collect and store all required parsing information.
+</blockquote>
+
+<p>
+<b><tt>p.validate_all()</tt></b>
+<blockquote>
+Validate all of the collected parsing information. This is a separate step
+from <tt>p.get_all()</tt> as a performance optimization. In order to
+reduce parser start-up time, a parser can elect to only validate the
+parsing data when regenerating the parsing tables. The validation
+step tries to collect as much information as possible rather than
+raising an exception at the first sign of trouble. The attribute
+<tt>p.error</tt> is set if there are any validation errors. The
+value of this attribute is also returned.
+</blockquote>
+
+<p>
+<b><tt>p.signature()</tt></b>
+<blockquote>
+Compute a signature representing the contents of the collected parsing
+data. The signature value should change if anything in the parser
+specification has changed in a way that would justify parser table
+regeneration. This method can be called after <tt>p.get_all()</tt>,
+but before <tt>p.validate_all()</tt>.
+</blockquote>
+
+The following attributes are set in the process of collecting data:
+
+<p>
+<b><tt>p.start</tt></b>
+<blockquote>
+The grammar start symbol, if any. Taken from <tt>pdict['start']</tt>.
+</blockquote>
+
+<p>
+<b><tt>p.error_func</tt></b>
+<blockquote>
+The error handling function or <tt>None</tt>. Taken from <tt>pdict['p_error']</tt>.
+</blockquote>
+
+<p>
+<b><tt>p.tokens</tt></b>
+<blockquote>
+The token list. Taken from <tt>pdict['tokens']</tt>.
+</blockquote>
+
+<p>
+<b><tt>p.prec</tt></b>
+<blockquote>
+The precedence specifier. Taken from <tt>pdict['precedence']</tt>.
+</blockquote>
+
+<p>
+<b><tt>p.preclist</tt></b>
+<blockquote>
+A parsed version of the precedence specified. A list of tuples of the form
+<tt>(token,assoc,level)</tt> where <tt>token</tt> is the terminal symbol,
+<tt>assoc</tt> is the associativity (e.g., <tt>'left'</tt>) and <tt>level</tt>
+is a numeric precedence level.
+</blockquote>
+
+<p>
+<b><tt>p.grammar</tt></b>
+<blockquote>
+A list of tuples <tt>(name, rules)</tt> representing the grammar rules. <tt>name</tt> is the
+name of a Python function or method in <tt>pdict</tt> that starts with <tt>"p_"</tt>.
+<tt>rules</tt> is a list of tuples <tt>(filename,line,prodname,syms)</tt> representing
+the grammar rules found in the documentation string of that function. <tt>filename</tt> and <tt>line</tt> contain location
+information that can be used for debugging. <tt>prodname</tt> is the name of the
+production. <tt>syms</tt> is the right-hand side of the production. If you have a
+function like this
+
+<pre>
+def p_expr(p):
+ '''expr : expr PLUS expr
+ | expr MINUS expr
+ | expr TIMES expr
+ | expr DIVIDE expr'''
+</pre>
+
+then the corresponding entry in <tt>p.grammar</tt> might look like this:
+
+<pre>
+('p_expr', [ ('calc.py',10,'expr', ['expr','PLUS','expr']),
+ ('calc.py',11,'expr', ['expr','MINUS','expr']),
+ ('calc.py',12,'expr', ['expr','TIMES','expr']),
+ ('calc.py',13,'expr', ['expr','DIVIDE','expr'])
+ ])
+</pre>
+</blockquote>
+
+<p>
+<b><tt>p.pfuncs</tt></b>
+<blockquote>
+A sorted list of tuples <tt>(line, file, name, doc)</tt> representing all of
+the <tt>p_</tt> functions found. <tt>line</tt> and <tt>file</tt> give location
+information. <tt>name</tt> is the name of the function. <tt>doc</tt> is the
+documentation string. This list is sorted in ascending order by line number.
+</blockquote>
+
+<p>
+<b><tt>p.files</tt></b>
+<blockquote>
+A dictionary holding all of the source filenames that were encountered
+while collecting parser information. Only the keys of this dictionary have
+any meaning.
+</blockquote>
+
+<p>
+<b><tt>p.error</tt></b>
+<blockquote>
+An attribute that indicates whether or not any critical errors
+occurred in validation. If this is set, it means that some kind
+of problem was detected and that no further processing should be
+performed.
+</blockquote>
+
+
+<h2>9. High-level operation</h2>
+
+Using all of the above classes requires some attention to detail. The <tt>yacc()</tt>
+function carries out a very specific sequence of operations to create a grammar.
+This same sequence should be emulated if you build an alternative PLY interface.
+
+<ol>
+<li>A <tt>ParserReflect</tt> object is created and raw grammar specification data is
+collected.
+<li>A <tt>Grammar</tt> object is created and populated with information
+from the specification data.
+<li>A <tt>LRGeneratedTable</tt> object is created to run the LALR algorithm over
+the <tt>Grammar</tt> object.
+<li>Productions in the <tt>LRGeneratedTable</tt> are bound to callables using the <tt>bind_callables()</tt>
+method.
+<li>A <tt>LRParser</tt> object is created from the information in the
+<tt>LRGeneratedTable</tt> object.
+</ol>
+
+</body>
+</html>
+
+
+
+
+
+
+
diff --git a/doc/ply.html b/doc/ply.html
index 13a2631..f9fe036 100644
--- a/doc/ply.html
+++ b/doc/ply.html
@@ -12,7 +12,7 @@ dave@dabeaz.com<br>
</b>
<p>
-<b>PLY Version: 2.5</b>
+<b>PLY Version: 3.0</b>
<p>
<!-- INDEX -->
@@ -97,7 +97,10 @@ include lexical analysis, parsing, type checking, type inference,
nested scoping, and code generation for the SPARC processor.
Approximately 30 different compiler implementations were completed in
this course. Most of PLY's interface and operation has been influenced by common
-usability problems encountered by students.
+usability problems encountered by students. Since 2001, PLY has
+continued to be improved as feedback has been received from users.
+PLY-3.0 represents a major refactoring of the original implementation
+with an eye towards future enhancements.
<p>
Since PLY was primarily developed as an instructional tool, you will
@@ -245,11 +248,7 @@ t_RPAREN = r'\)'
# A regular expression rule with some action code
def t_NUMBER(t):
r'\d+'
- try:
- t.value = int(t.value)
- except ValueError:
- print "Line %d: Number %s is too large!" % (t.lineno,t.value)
- t.value = 0
+ t.value = int(t.value)
return t
# Define a rule so we can track line numbers
@@ -266,11 +265,14 @@ def t_error(t):
t.lexer.skip(1)
# Build the lexer
-lex.lex()
+lexer = lex.lex()
</pre>
</blockquote>
-To use the lexer, you first need to feed it some input text using its <tt>input()</tt> method. After that, repeated calls to <tt>token()</tt> produce tokens. The following code shows how this works:
+To use the lexer, you first need to feed it some input text using
+its <tt>input()</tt> method. After that, repeated calls
+to <tt>token()</tt> produce tokens. The following code shows how this
+works:
<blockquote>
<pre>
@@ -282,11 +284,11 @@ data = '''
'''
# Give the lexer some input
-lex.input(data)
+lexer.input(data)
# Tokenize
-while 1:
- tok = lex.token()
+while True:
+ tok = lexer.token()
if not tok: break # No more input
print tok
</pre>
@@ -310,7 +312,16 @@ LexToken(NUMBER,2,3,21)
</pre>
</blockquote>
-The tokens returned by <tt>lex.token()</tt> are instances
+Lexers also support the iteration protocol. So, you can write the above loop as follows:
+
+<blockquote>
+<pre>
+for tok in lexer:
+ print tok
+</pre>
+</blockquote>
+
+The tokens returned by <tt>lexer.token()</tt> are instances
of <tt>LexToken</tt>. This object has
attributes <tt>tok.type</tt>, <tt>tok.value</tt>,
<tt>tok.lineno</tt>, and <tt>tok.lexpos</tt>. The following code shows an example of
@@ -319,8 +330,8 @@ accessing these attributes:
<blockquote>
<pre>
# Tokenize
-while 1:
- tok = lex.token()
+while True:
+ tok = lexer.token()
if not tok: break # No more input
print tok.type, tok.value, tok.lineno, tok.lexpos
</pre>
@@ -429,7 +440,7 @@ reserved = {
...
}
-tokens = ['LPAREN','RPAREN',...,'ID'] + reserved.values()
+tokens = ['LPAREN','RPAREN',...,'ID'] + list(reserved.values())
def t_ID(t):
r'[a-zA-Z_][a-zA-Z_0-9]*'
@@ -530,11 +541,10 @@ column information as a separate step. For instance, just count backwards unti
# input is the input text string
# token is a token instance
def find_column(input,token):
- i = token.lexpos
- while i > 0:
- if input[i] == '\n': break
- i -= 1
- column = (token.lexpos - i)+1
+ last_cr = input.rfind('\n',0,token.lexpos)
+ if last_cr < 0:
+ last_cr = 0
+ column = (token.lexpos - last_cr) + 1
return column
</pre>
</blockquote>
@@ -607,36 +617,34 @@ In this case, we simply print the offending character and skip ahead one charact
<p>
To build the lexer, the function <tt>lex.lex()</tt> is used. This function
uses Python reflection (or introspection) to read the regular expression rules
-out of the calling context and build the lexer. Once the lexer has been built, two functions can
+out of the calling context and build the lexer. Once the lexer has been built, two methods can
be used to control the lexer.
<ul>
-<li><tt>lex.input(data)</tt>. Reset the lexer and store a new input string.
-<li><tt>lex.token()</tt>. Return the next token. Returns a special <tt>LexToken</tt> instance on success or
+<li><tt>lexer.input(data)</tt>. Reset the lexer and store a new input string.
+<li><tt>lexer.token()</tt>. Return the next token. Returns a special <tt>LexToken</tt> instance on success or
None if the end of the input text has been reached.
</ul>
-If desired, the lexer can also be used as an object. The <tt>lex()</tt> returns a <tt>Lexer</tt> object that
-can be used for this purpose. For example:
+The preferred way to use PLY is to invoke the above methods directly on the lexer object returned by the
+<tt>lex()</tt> function. The legacy interface to PLY involves module-level functions <tt>lex.input()</tt> and <tt>lex.token()</tt>.
+For example:
<blockquote>
<pre>
-lexer = lex.lex()
-lexer.input(sometext)
+lex.lex()
+lex.input(sometext)
while 1:
- tok = lexer.token()
+ tok = lex.token()
if not tok: break
print tok
</pre>
</blockquote>
<p>
-This latter technique should be used if you intend to use multiple lexers in your application. Simply define each
-lexer in its own module and use the object returned by <tt>lex()</tt> as appropriate.
-
-<p>
-Note: The global functions <tt>lex.input()</tt> and <tt>lex.token()</tt> are bound to the <tt>input()</tt>
-and <tt>token()</tt> methods of the last lexer created by the lex module.
+In this example, the module-level functions <tt>lex.input()</tt> and <tt>lex.token()</tt> are bound to the <tt>input()</tt>
+and <tt>token()</tt> methods of the last lexer created by the lex module. This interface may go away at some point so
+it's probably best not to use it.
<H3><a name="ply_nn14"></a>3.11 The @TOKEN decorator</H3>
@@ -785,11 +793,7 @@ t_RPAREN = r'\)'
# A regular expression rule with some action code
def t_NUMBER(t):
r'\d+'
- try:
- t.value = int(t.value)
- except ValueError:
- print "Line %d: Number %s is too large!" % (t.lineno,t.value)
- t.value = 0
+ t.value = int(t.value)
return t
# Define a rule so we can track line numbers
@@ -826,7 +830,7 @@ None
</pre>
</blockquote>
-The <tt>object</tt> option can be used to define lexers as a class instead of a module. For example:
+The <tt>module</tt> option can also be used to define lexers from instances of a class. For example:
<blockquote>
<pre>
@@ -856,11 +860,7 @@ class MyLexer:
# Note addition of self parameter since we're in a class
def t_NUMBER(self,t):
r'\d+'
- try:
- t.value = int(t.value)
- except ValueError:
- print "Line %d: Number %s is too large!" % (t.lineno,t.value)
- t.value = 0
+ t.value = int(t.value)
return t
# Define a rule so we can track line numbers
@@ -878,12 +878,12 @@ class MyLexer:
<b># Build the lexer
def build(self,**kwargs):
- self.lexer = lex.lex(object=self, **kwargs)</b>
+ self.lexer = lex.lex(module=self, **kwargs)</b>
# Test it output
def test(self,data):
self.lexer.input(data)
- while 1:
+ while True:
tok = lexer.token()
if not tok: break
print tok
@@ -895,18 +895,80 @@ m.test("3 + 4") # Test it
</pre>
</blockquote>
-When building a lexer from class, you should construct the lexer from
-an instance of the class, not the class object itself. Also, for
-reasons that are subtle, you should <em>NOT</em>
-invoke <tt>lex.lex()</tt> inside the <tt>__init__()</tt> method of
-your class. If you do, it may cause bizarre behavior if someone tries
-to duplicate a lexer object.
+
+When building a lexer from a class, <em>you should construct the lexer from
+an instance of the class</em>, not the class object itself. This is because
+PLY only works properly if the lexer actions are defined by bound-methods.
+
+<p>
+When using the <tt>module</tt> option to <tt>lex()</tt>, PLY collects symbols
+from the underlying object using the <tt>dir()</tt> function. There is no
+direct access to the <tt>__dict__</tt> attribute of the object supplied as a
+module value.
+
+<P>
+Finally, if you want to keep things nicely encapsulated, but don't want to use a
+full-fledged class definition, lexers can be defined using closures. For example:
+
+<blockquote>
+<pre>
+import ply.lex as lex
+
+# List of token names. This is always required
+tokens = (
+ 'NUMBER',
+ 'PLUS',
+ 'MINUS',
+ 'TIMES',
+ 'DIVIDE',
+ 'LPAREN',
+ 'RPAREN',
+)
+
+def MyLexer():
+ # Regular expression rules for simple tokens
+ t_PLUS = r'\+'
+ t_MINUS = r'-'
+ t_TIMES = r'\*'
+ t_DIVIDE = r'/'
+ t_LPAREN = r'\('
+ t_RPAREN = r'\)'
+
+ # A regular expression rule with some action code
+ def t_NUMBER(t):
+ r'\d+'
+ t.value = int(t.value)
+ return t
+
+ # Define a rule so we can track line numbers
+ def t_newline(t):
+ r'\n+'
+ t.lexer.lineno += len(t.value)
+
+ # A string containing ignored characters (spaces and tabs)
+ t_ignore = ' \t'
+
+ # Error handling rule
+ def t_error(t):
+ print "Illegal character '%s'" % t.value[0]
+ t.lexer.skip(1)
+
+ # Build the lexer from my environment and return it
+ return lex.lex()
+</pre>
+</blockquote>
+
<H3><a name="ply_nn18"></a>3.15 Maintaining state</H3>
-In your lexer, you may want to maintain a variety of state information. This might include mode settings, symbol tables, and other details. There are a few
-different ways to handle this situation. One way to do this is to keep a set of global variables in the module
-where you created the lexer. For example:
+In your lexer, you may want to maintain a variety of state
+information. This might include mode settings, symbol tables, and
+other details. As an example, suppose that you wanted to keep
+track of how many NUMBER tokens had been encountered.
+
+<p>
+One way to do this is to keep a set of global variables in the module
+where you created the lexer. For example:
<blockquote>
<pre>
@@ -915,28 +977,22 @@ def t_NUMBER(t):
r'\d+'
global num_count
num_count += 1
- try:
- t.value = int(t.value)
- except ValueError:
- print "Line %d: Number %s is too large!" % (t.lineno,t.value)
- t.value = 0
+ t.value = int(t.value)
return t
</pre>
</blockquote>
-Alternatively, you can store this information inside the Lexer object created by <tt>lex()</tt>. To this, you can use the <tt>lexer</tt> attribute
-of tokens passed to the various rules. For example:
+If you don't like the use of a global variable, another place to store
+information is inside the Lexer object created by <tt>lex()</tt>.
+To do this, you can use the <tt>lexer</tt> attribute of tokens passed to
+the various rules. For example:
<blockquote>
<pre>
def t_NUMBER(t):
r'\d+'
t.lexer.num_count += 1 # Note use of lexer attribute
- try:
- t.value = int(t.value)
- except ValueError:
- print "Line %d: Number %s is too large!" % (t.lineno,t.value)
- t.value = 0
+ t.value = int(t.value)
return t
lexer = lex.lex()
@@ -944,17 +1000,20 @@ lexer.num_count = 0 # Set the initial count
</pre>
</blockquote>
-This latter approach has the advantage of storing information inside
-the lexer object itself---something that may be useful if multiple instances
-of the same lexer have been created. However, it may also feel kind
-of "hacky" to the OO purists. Just to put their mind at some ease, all
+This latter approach has the advantage of being simple and working
+correctly in applications where multiple instantiations of a given
+lexer exist in the same application. However, this might also feel
+like a gross violation of encapsulation to OO purists.
+Just to put your mind at some ease, all
internal attributes of the lexer (with the exception of <tt>lineno</tt>) have names that are prefixed
by <tt>lex</tt> (e.g., <tt>lexdata</tt>,<tt>lexpos</tt>, etc.). Thus,
-it should be perfectly safe to store attributes in the lexer that
-don't have names starting with that prefix.
+it is perfectly safe to store attributes in the lexer that
+don't have names starting with that prefix or a name that conflicts with one of the
+predefined methods (e.g., <tt>input()</tt>, <tt>token()</tt>, etc.).
<p>
-A third approach is to define the lexer as a class as shown in the previous example:
+If you don't like assigning values on the lexer object, you can define your lexer as a class as
+shown in the previous section:
<blockquote>
<pre>
@@ -963,11 +1022,7 @@ class MyLexer:
def t_NUMBER(self,t):
r'\d+'
self.num_count += 1
- try:
- t.value = int(t.value)
- except ValueError:
- print "Line %d: Number %s is too large!" % (t.lineno,t.value)
- t.value = 0
+ t.value = int(t.value)
return t
def build(self, **kwargs):
@@ -975,10 +1030,6 @@ class MyLexer:
def __init__(self):
self.num_count = 0
-
-# Create a lexer
-m = MyLexer()
-lexer = lex.lex(object=m)
</pre>
</blockquote>
@@ -986,10 +1037,28 @@ The class approach may be the easiest to manage if your application is
going to be creating multiple instances of the same lexer and you need
to manage a lot of state.
+<p>
+State can also be managed through closures. For example, in Python 3:
+
+<blockquote>
+<pre>
+def MyLexer():
+ num_count = 0
+ ...
+ def t_NUMBER(t):
+ r'\d+'
+ nonlocal num_count
+ num_count += 1
+ t.value = int(t.value)
+ return t
+ ...
+</pre>
+</blockquote>
+
<H3><a name="ply_nn19"></a>3.16 Lexer cloning</H3>
<p>
-If necessary, a lexer object can be quickly duplicated by invoking its <tt>clone()</tt> method. For example:
+If necessary, a lexer object can be duplicated by invoking its <tt>clone()</tt> method. For example:
<blockquote>
<pre>
@@ -1009,9 +1078,15 @@ clone and use it to look ahead. Or, if you were implementing some kind of prepr
cloned lexers could be used to handle different input files.
<p>
-Special considerations need to be made when cloning lexers that also maintain their own
-internal state. Namely, you need to be aware that the newly created lexers will share all
-of this state with the original lexer. For example, if you defined a lexer as a class and did this:
+Creating a clone is different than calling <tt>lex.lex()</tt> in that
+PLY doesn't regenerate any of the internal tables or regular expressions.
+
+<p>
+Special considerations need to be made when cloning lexers that also
+maintain their own internal state using classes or closures. Namely,
+you need to be aware that the newly created lexers will share all of
+this state with the original lexer. For example, if you defined a
+lexer as a class and did this:
<blockquote>
<pre>
@@ -1024,8 +1099,9 @@ b = a.clone() # Clone the lexer
Then both <tt>a</tt> and <tt>b</tt> are going to be bound to the same
object <tt>m</tt> and any changes to <tt>m</tt> will be reflected in both lexers. It's
-important to emphasize that <tt>clone()</tt> is not meant to make a totally new copy of a
-lexer. If you want to do that, call <tt>lex()</tt> again to create a new lexer.
+important to emphasize that <tt>clone()</tt> is only meant to create a new lexer
+that reuses the regular expressions and environment of another lexer. If you
+need to make a totally new copy of a lexer, then call <tt>lex()</tt> again.
<H3><a name="ply_nn20"></a>3.17 Internal lexer state</H3>
@@ -1045,8 +1121,9 @@ matched at the new position.
<p>
<tt>lexer.lineno</tt>
<blockquote>
-The current value of the line number attribute stored in the lexer. This can be modified as needed to
-change the line number.
+The current value of the line number attribute stored in the lexer. PLY only specifies that the attribute
+exists---it never sets, updates, or performs any processing with it. If you want to track line numbers,
+you will need to add code yourself (see the section on line numbers and positional information).
</blockquote>
<p>
@@ -1066,7 +1143,6 @@ Note: This attribute is only updated when tokens are defined and processed by fu
<H3><a name="ply_nn21"></a>3.18 Conditional lexing and start conditions</H3>
-
In advanced parsing applications, it may be useful to have different
lexing states. For instance, you may want the occurrence of a certain
token or syntactic construct to trigger a different kind of lexing.
@@ -1329,9 +1405,10 @@ factor : NUMBER
</blockquote>
In the grammar, symbols such as <tt>NUMBER</tt>, <tt>+</tt>, <tt>-</tt>, <tt>*</tt>, and <tt>/</tt> are known
-as <em>terminals</em> and correspond to raw input tokens. Identifiers such as <tt>term</tt> and <tt>factor</tt> refer to more
-complex rules, typically comprised of a collection of tokens. These identifiers are known as <em>non-terminals</em>.
+as <em>terminals</em> and correspond to raw input tokens. Identifiers such as <tt>term</tt> and <tt>factor</tt> refer to
+grammar rules comprised of a collection of terminals and other rules. These identifiers are known as <em>non-terminals</em>.
<P>
+
The semantic behavior of a language is often specified using a
technique known as syntax directed translation. In syntax directed
translation, attributes are attached to each symbol in a given grammar
@@ -1357,9 +1434,12 @@ factor : NUMBER factor.val = int(NUMBER.lexval)
</pre>
</blockquote>
-A good way to think about syntax directed translation is to simply think of each symbol in the grammar as some
-kind of object. The semantics of the language are then expressed as a collection of methods/operations on these
-objects.
+A good way to think about syntax directed translation is to
+view each symbol in the grammar as a kind of object. Associated
+with each symbol is a value representing its "state" (for example, the
+<tt>val</tt> attribute above). Semantic
+actions are then expressed as a collection of functions or methods
+that operate on the symbols and associated values.
<p>
Yacc uses a parsing technique known as LR-parsing or shift-reduce parsing. LR parsing is a
@@ -1368,64 +1448,78 @@ Whenever a valid right-hand-side is found in the input, the appropriate action c
grammar symbols are replaced by the grammar symbol on the left-hand-side.
<p>
-LR parsing is commonly implemented by shifting grammar symbols onto a stack and looking at the stack and the next
-input token for patterns. The details of the algorithm can be found in a compiler text, but the
-following example illustrates the steps that are performed if you wanted to parse the expression
-<tt>3 + 5 * (10 - 20)</tt> using the grammar defined above:
+LR parsing is commonly implemented by shifting grammar symbols onto a
+stack and looking at the stack and the next input token for patterns that
+match one of the grammar rules.
+The details of the algorithm can be found in a compiler textbook, but the
+following example illustrates the steps that are performed if you
+wanted to parse the expression
+<tt>3 + 5 * (10 - 20)</tt> using the grammar defined above. In the example,
+the special symbol <tt>$</tt> represents the end of input.
+
<blockquote>
<pre>
Step Symbol Stack Input Tokens Action
---- --------------------- --------------------- -------------------------------
-1 $ 3 + 5 * ( 10 - 20 )$ Shift 3
-2 $ 3 + 5 * ( 10 - 20 )$ Reduce factor : NUMBER
-3 $ factor + 5 * ( 10 - 20 )$ Reduce term : factor
-4 $ term + 5 * ( 10 - 20 )$ Reduce expr : term
-5 $ expr + 5 * ( 10 - 20 )$ Shift +
-6 $ expr + 5 * ( 10 - 20 )$ Shift 5
-7 $ expr + 5 * ( 10 - 20 )$ Reduce factor : NUMBER
-8 $ expr + factor * ( 10 - 20 )$ Reduce term : factor
-9 $ expr + term * ( 10 - 20 )$ Shift *
-10 $ expr + term * ( 10 - 20 )$ Shift (
-11 $ expr + term * ( 10 - 20 )$ Shift 10
-12 $ expr + term * ( 10 - 20 )$ Reduce factor : NUMBER
-13 $ expr + term * ( factor - 20 )$ Reduce term : factor
-14 $ expr + term * ( term - 20 )$ Reduce expr : term
-15 $ expr + term * ( expr - 20 )$ Shift -
-16 $ expr + term * ( expr - 20 )$ Shift 20
-17 $ expr + term * ( expr - 20 )$ Reduce factor : NUMBER
-18 $ expr + term * ( expr - factor )$ Reduce term : factor
-19 $ expr + term * ( expr - term )$ Reduce expr : expr - term
-20 $ expr + term * ( expr )$ Shift )
-21 $ expr + term * ( expr ) $ Reduce factor : (expr)
-22 $ expr + term * factor $ Reduce term : term * factor
-23 $ expr + term $ Reduce expr : expr + term
-24 $ expr $ Reduce expr
-25 $ $ Success!
-</pre>
-</blockquote>
-
-When parsing the expression, an underlying state machine and the current input token determine what to do next.
-If the next token looks like part of a valid grammar rule (based on other items on the stack), it is generally shifted
-onto the stack. If the top of the stack contains a valid right-hand-side of a grammar rule, it is
-usually "reduced" and the symbols replaced with the symbol on the left-hand-side. When this reduction occurs, the
-appropriate action is triggered (if defined). If the input token can't be shifted and the top of stack doesn't match
-any grammar rules, a syntax error has occurred and the parser must take some kind of recovery step (or bail out).
-
-<p>
-It is important to note that the underlying implementation is built around a large finite-state machine that is encoded
-in a collection of tables. The construction of these tables is quite complicated and beyond the scope of this discussion.
-However, subtle details of this process explain why, in the example above, the parser chooses to shift a token
-onto the stack in step 9 rather than reducing the rule <tt>expr : expr + term</tt>.
-
-<H2><a name="ply_nn23"></a>5. Yacc reference</H2>
-
-
-This section describes how to use write parsers in PLY.
+1 3 + 5 * ( 10 - 20 )$ Shift 3
+2 3 + 5 * ( 10 - 20 )$ Reduce factor : NUMBER
+3 factor + 5 * ( 10 - 20 )$ Reduce term : factor
+4 term + 5 * ( 10 - 20 )$ Reduce expr : term
+5 expr + 5 * ( 10 - 20 )$ Shift +
+6 expr + 5 * ( 10 - 20 )$ Shift 5
+7 expr + 5 * ( 10 - 20 )$ Reduce factor : NUMBER
+8 expr + factor * ( 10 - 20 )$ Reduce term : factor
+9 expr + term * ( 10 - 20 )$ Shift *
+10 expr + term * ( 10 - 20 )$ Shift (
+11 expr + term * ( 10 - 20 )$ Shift 10
+12 expr + term * ( 10 - 20 )$ Reduce factor : NUMBER
+13 expr + term * ( factor - 20 )$ Reduce term : factor
+14 expr + term * ( term - 20 )$ Reduce expr : term
+15 expr + term * ( expr - 20 )$ Shift -
+16 expr + term * ( expr - 20 )$ Shift 20
+17 expr + term * ( expr - 20 )$ Reduce factor : NUMBER
+18 expr + term * ( expr - factor )$ Reduce term : factor
+19 expr + term * ( expr - term )$ Reduce expr : expr - term
+20 expr + term * ( expr )$ Shift )
+21 expr + term * ( expr ) $ Reduce factor : (expr)
+22 expr + term * factor $ Reduce term : term * factor
+23 expr + term $ Reduce expr : expr + term
+24 expr $ Reduce expr
+25 $ Success!
+</pre>
+</blockquote>
+
+When parsing the expression, an underlying state machine and the
+current input token determine what happens next. If the next token
+looks like part of a valid grammar rule (based on other items on the
+stack), it is generally shifted onto the stack. If the top of the
+stack contains a valid right-hand-side of a grammar rule, it is
+usually "reduced" and the symbols replaced with the symbol on the
+left-hand-side. When this reduction occurs, the appropriate action is
+triggered (if defined). If the input token can't be shifted and the
+top of stack doesn't match any grammar rules, a syntax error has
+occurred and the parser must take some kind of recovery step (or bail
+out). A parse is only successful if the parser reaches a state where
+the symbol stack is empty and there are no more input tokens.
+
+<p>
+It is important to note that the underlying implementation is built
+around a large finite-state machine that is encoded in a collection of
+tables. The construction of these tables is non-trivial and
+beyond the scope of this discussion. However, subtle details of this
+process explain why, in the example above, the parser chooses to shift
+a token onto the stack in step 9 rather than reducing the
+rule <tt>expr : expr + term</tt>.
+
+<H2><a name="ply_nn23"></a>5. Yacc</H2>
+
+The <tt>ply.yacc</tt> module implements the parsing component of PLY.
+The name "yacc" stands for "Yet Another Compiler Compiler" and is
+borrowed from the Unix tool of the same name.
<H3><a name="ply_nn24"></a>5.1 An example</H3>
-
Suppose you wanted to make a grammar for simple arithmetic expressions as previously described. Here is
how you would do it with <tt>yacc.py</tt>:
@@ -1475,26 +1569,26 @@ def p_error(p):
print "Syntax error in input!"
# Build the parser
-yacc.yacc()
-
-# Use this if you want to build the parser using SLR instead of LALR
-# yacc.yacc(method="SLR")
+parser = yacc.yacc()
-while 1:
+while True:
try:
s = raw_input('calc > ')
except EOFError:
break
if not s: continue
- result = yacc.parse(s)
+ result = parser.parse(s)
print result
</pre>
</blockquote>
-In this example, each grammar rule is defined by a Python function where the docstring to that function contains the
-appropriate context-free grammar specification. Each function accepts a single
-argument <tt>p</tt> that is a sequence containing the values of each grammar symbol in the corresponding rule. The values of
-<tt>p[i]</tt> are mapped to grammar symbols as shown here:
+In this example, each grammar rule is defined by a Python function
+where the docstring to that function contains the appropriate
+context-free grammar specification. The statements that make up the
+function body implement the semantic actions of the rule. Each function
+accepts a single argument <tt>p</tt> that is a sequence containing the
+values of each grammar symbol in the corresponding rule. The values
+of <tt>p[i]</tt> are mapped to grammar symbols as shown here:
<blockquote>
<pre>
@@ -1507,42 +1601,49 @@ def p_expression_plus(p):
</pre>
</blockquote>
-For tokens, the "value" of the corresponding <tt>p[i]</tt> is the
-<em>same</em> as the <tt>p.value</tt> attribute assigned
-in the lexer module. For non-terminals, the value is determined by
-whatever is placed in <tt>p[0]</tt> when rules are reduced. This
-value can be anything at all. However, it probably most common for
-the value to be a simple Python type, a tuple, or an instance. In this example, we
-are relying on the fact that the <tt>NUMBER</tt> token stores an integer value in its value
-field. All of the other rules simply perform various types of integer operations and store
-the result.
-
-<P>
-Note: The use of negative indices have a special meaning in yacc---specially <tt>p[-1]</tt> does
-not have the same value as <tt>p[3]</tt> in this example. Please see the section on "Embedded Actions" for further
-details.
-
<p>
-The first rule defined in the yacc specification determines the starting grammar
-symbol (in this case, a rule for <tt>expression</tt> appears first). Whenever
-the starting rule is reduced by the parser and no more input is available, parsing
-stops and the final value is returned (this value will be whatever the top-most rule
-placed in <tt>p[0]</tt>). Note: an alternative starting symbol can be specified using the <tt>start</tt> keyword argument to
+For tokens, the "value" of the corresponding <tt>p[i]</tt> is the
+<em>same</em> as the <tt>p.value</tt> attribute assigned in the lexer
+module. For non-terminals, the value is determined by whatever is
+placed in <tt>p[0]</tt> when rules are reduced. This value can be
+anything at all. However, it is probably most common for the value to be
+a simple Python type, a tuple, or an instance. In this example, we
+are relying on the fact that the <tt>NUMBER</tt> token stores an
+integer value in its value field. All of the other rules simply
+perform various types of integer operations and propagate the result.
+</p>
+
+<p>
+Note: The use of negative indices has a special meaning in
+yacc---specifically, <tt>p[-1]</tt> does not have the same value
+as <tt>p[3]</tt> in this example. Please see the section on "Embedded
+Actions" for further details.
+</p>
+
+<p>
+The first rule defined in the yacc specification determines the
+starting grammar symbol (in this case, a rule for <tt>expression</tt>
+appears first). Whenever the starting rule is reduced by the parser
+and no more input is available, parsing stops and the final value is
+returned (this value will be whatever the top-most rule placed
+in <tt>p[0]</tt>). Note: an alternative starting symbol can be
+specified using the <tt>start</tt> keyword argument to
<tt>yacc()</tt>.
-<p>The <tt>p_error(p)</tt> rule is defined to catch syntax errors. See the error handling section
-below for more detail.
+<p>The <tt>p_error(p)</tt> rule is defined to catch syntax errors.
+See the error handling section below for more detail.
<p>
-To build the parser, call the <tt>yacc.yacc()</tt> function. This function
-looks at the module and attempts to construct all of the LR parsing tables for the grammar
-you have specified. The first time <tt>yacc.yacc()</tt> is invoked, you will get a message
-such as this:
+To build the parser, call the <tt>yacc.yacc()</tt> function. This
+function looks at the module and attempts to construct all of the LR
+parsing tables for the grammar you have specified. The first
+time <tt>yacc.yacc()</tt> is invoked, you will get a message such as
+this:
<blockquote>
<pre>
$ python calcparse.py
-yacc: Generating LALR parsing table...
+Generating LALR tables
calc >
</pre>
</blockquote>
@@ -1554,7 +1655,8 @@ debugging file called <tt>parser.out</tt> is created. On subsequent
executions, <tt>yacc</tt> will reload the table from
<tt>parsetab.py</tt> unless it has detected a change in the underlying
grammar (in which case the tables and <tt>parsetab.py</tt> file are
-regenerated). Note: The names of parser output files can be changed if necessary. See the notes that follow later.
+regenerated). Note: The names of parser output files can be changed
+if necessary. See the <a href="reference.html">PLY Reference</a> for details.
<p>
If any errors are detected in your grammar specification, <tt>yacc.py</tt> will produce
@@ -1569,7 +1671,16 @@ diagnostic messages and possibly raise an exception. Some of the errors that ca
<li>Undefined rules and tokens
</ul>
-The next few sections now discuss a few finer points of grammar construction.
+The next few sections discuss grammar specification in more detail.
+
+<p>
+The final part of the example shows how to actually run the parser
+created by
+<tt>yacc()</tt>. To run the parser, you simply have to call
+the <tt>parse()</tt> method with a string of input text. This will run all
+of the grammar rules and return the result of the entire parse. The
+result returned is the value assigned to <tt>p[0]</tt> in the starting
+grammar rule.
<H3><a name="ply_nn25"></a>5.2 Combining Grammar Rule Functions</H3>
@@ -1640,8 +1751,15 @@ def p_expressions(p):
</pre>
</blockquote>
-<H3><a name="ply_nn26"></a>5.3 Character Literals</H3>
+If parsing performance is a concern, you should resist the urge to put
+too much conditional processing into a single grammar rule as shown in
+these examples. When you add checks to see which grammar rule is
+being handled, you are actually duplicating the work that the parser
+has already performed (i.e., the parser already knows exactly what rule it
+matched). You can eliminate this overhead by using a
+separate <tt>p_rule()</tt> function for each grammar rule.
+<H3><a name="ply_nn26"></a>5.3 Character Literals</H3>
If desired, a grammar may contain tokens defined as single character literals. For example:
@@ -1700,12 +1818,13 @@ def p_optitem(p):
</pre>
</blockquote>
-Note: You can write empty rules anywhere by simply specifying an empty right hand side. However, I personally find that
-writing an "empty" rule and using "empty" to denote an empty production is easier to read.
+Note: You can write empty rules anywhere by simply specifying an empty
+right hand side. However, I personally find that writing an "empty"
+rule and using "empty" to denote an empty production is easier to read
+and more clearly states your intentions.
<H3><a name="ply_nn28"></a>5.5 Changing the starting symbol</H3>
-
Normally, the first rule found in a yacc specification defines the starting grammar rule (top level rule). To change this, simply
supply a <tt>start</tt> specifier in your file. For example:
@@ -1723,8 +1842,10 @@ def p_foo(p):
</pre>
</blockquote>
-The use of a <tt>start</tt> specifier may be useful during debugging since you can use it to have yacc build a subset of
-a larger grammar. For this purpose, it is also possible to specify a starting symbol as an argument to <tt>yacc()</tt>. For example:
+The use of a <tt>start</tt> specifier may be useful during debugging
+since you can use it to have yacc build a subset of a larger grammar.
+For this purpose, it is also possible to specify a starting symbol as
+an argument to <tt>yacc()</tt>. For example:
<blockquote>
<pre>
@@ -1735,9 +1856,11 @@ yacc.yacc(start='foo')
<H3><a name="ply_nn27"></a>5.6 Dealing With Ambiguous Grammars</H3>
-The expression grammar given in the earlier example has been written in a special format to eliminate ambiguity.
-However, in many situations, it is extremely difficult or awkward to write grammars in this format. A
-much more natural way to express the grammar is in a more compact form like this:
+The expression grammar given in the earlier example has been written
+in a special format to eliminate ambiguity. However, in many
+situations, it is extremely difficult or awkward to write grammars in
+this format. A much more natural way to express the grammar is in a
+more compact form like this:
<blockquote>
<pre>
@@ -1750,15 +1873,18 @@ expression : expression PLUS expression
</pre>
</blockquote>
-Unfortunately, this grammar specification is ambiguous. For example, if you are parsing the string
-"3 * 4 + 5", there is no way to tell how the operators are supposed to be grouped.
-For example, does the expression mean "(3 * 4) + 5" or is it "3 * (4+5)"?
+Unfortunately, this grammar specification is ambiguous. For example,
+if you are parsing the string "3 * 4 + 5", there is no way to tell how
+the operators are supposed to be grouped. For example, does the
+expression mean "(3 * 4) + 5" or is it "3 * (4+5)"?
<p>
-When an ambiguous grammar is given to <tt>yacc.py</tt> it will print messages about "shift/reduce conflicts"
-or a "reduce/reduce conflicts". A shift/reduce conflict is caused when the parser generator can't decide
-whether or not to reduce a rule or shift a symbol on the parsing stack. For example, consider
-the string "3 * 4 + 5" and the internal parsing stack:
+When an ambiguous grammar is given to <tt>yacc.py</tt> it will print
+messages about "shift/reduce conflicts" or "reduce/reduce conflicts".
+A shift/reduce conflict is caused when the parser generator can't
+decide whether or not to reduce a rule or shift a symbol on the
+parsing stack. For example, consider the string "3 * 4 + 5" and the
+internal parsing stack:
<blockquote>
<pre>
@@ -1773,20 +1899,25 @@ Step Symbol Stack Input Tokens Action
</pre>
</blockquote>
-In this case, when the parser reaches step 6, it has two options. One is to reduce the
-rule <tt>expr : expr * expr</tt> on the stack. The other option is to shift the
-token <tt>+</tt> on the stack. Both options are perfectly legal from the rules
-of the context-free-grammar.
+In this case, when the parser reaches step 6, it has two options. One
+is to reduce the rule <tt>expr : expr * expr</tt> on the stack. The
+other option is to shift the token <tt>+</tt> on the stack. Both
+options are perfectly legal from the rules of the
+context-free-grammar.
<p>
-By default, all shift/reduce conflicts are resolved in favor of shifting. Therefore, in the above
-example, the parser will always shift the <tt>+</tt> instead of reducing. Although this
-strategy works in many cases (including the ambiguous if-then-else), it is not enough for arithmetic
-expressions. In fact, in the above example, the decision to shift <tt>+</tt> is completely wrong---we should have
-reduced <tt>expr * expr</tt> since multiplication has higher mathematical precedence than addition.
+By default, all shift/reduce conflicts are resolved in favor of
+shifting. Therefore, in the above example, the parser will always
+shift the <tt>+</tt> instead of reducing. Although this strategy
+works in many cases (for example, the case of
+"if-then" versus "if-then-else"), it is not enough for arithmetic expressions. In fact,
+in the above example, the decision to shift <tt>+</tt> is completely
+wrong---we should have reduced <tt>expr * expr</tt> since
+multiplication has higher mathematical precedence than addition.
-<p>To resolve ambiguity, especially in expression grammars, <tt>yacc.py</tt> allows individual
-tokens to be assigned a precedence level and associativity. This is done by adding a variable
+<p>To resolve ambiguity, especially in expression
+grammars, <tt>yacc.py</tt> allows individual tokens to be assigned a
+precedence level and associativity. This is done by adding a variable
<tt>precedence</tt> to the grammar file like this:
<blockquote>
@@ -1798,17 +1929,19 @@ precedence = (
</pre>
</blockquote>
-This declaration specifies that <tt>PLUS</tt>/<tt>MINUS</tt> have
-the same precedence level and are left-associative and that
-<tt>TIMES</tt>/<tt>DIVIDE</tt> have the same precedence and are left-associative.
-Within the <tt>precedence</tt> declaration, tokens are ordered from lowest to highest precedence. Thus,
-this declaration specifies that <tt>TIMES</tt>/<tt>DIVIDE</tt> have higher
-precedence than <tt>PLUS</tt>/<tt>MINUS</tt> (since they appear later in the
+This declaration specifies that <tt>PLUS</tt>/<tt>MINUS</tt> have the
+same precedence level and are left-associative and that
+<tt>TIMES</tt>/<tt>DIVIDE</tt> have the same precedence and are
+left-associative. Within the <tt>precedence</tt> declaration, tokens
+are ordered from lowest to highest precedence. Thus, this declaration
+specifies that <tt>TIMES</tt>/<tt>DIVIDE</tt> have higher precedence
+than <tt>PLUS</tt>/<tt>MINUS</tt> (since they appear later in the
precedence specification).
<p>
-The precedence specification works by associating a numerical precedence level value and associativity direction to
-the listed tokens. For example, in the above example you get:
+The precedence specification works by associating a numerical
+precedence level value and associativity direction to the listed
+tokens. For example, in the above example you get:
<blockquote>
<pre>
@@ -1819,9 +1952,10 @@ DIVIDE : level = 2, assoc = 'left'
</pre>
</blockquote>
-These values are then used to attach a numerical precedence value and associativity direction
-to each grammar rule. <em>This is always determined by looking at the precedence of the right-most terminal symbol.</em>
-For example:
+These values are then used to attach a numerical precedence value and
+associativity direction to each grammar rule. <em>This is always
+determined by looking at the precedence of the right-most terminal
+symbol.</em> For example:
<blockquote>
<pre>
@@ -1839,7 +1973,7 @@ looking at the precedence rules and associativity specifiers.
<p>
<ol>
-<li>If the current token has higher precedence, it is shifted.
+<li>If the current token has higher precedence than the rule on the stack, it is shifted.
<li>If the grammar rule on the stack has higher precedence, the rule is reduced.
<li>If the current token and the grammar rule have the same precedence, the
rule is reduced for left associativity, whereas the token is shifted for right associativity.
@@ -1847,21 +1981,28 @@ rule is reduced for left associativity, whereas the token is shifted for right a
favor of shifting (the default).
</ol>
-For example, if "expression PLUS expression" has been parsed and the next token
-is "TIMES", the action is going to be a shift because "TIMES" has a higher precedence level than "PLUS". On the other
-hand, if "expression TIMES expression" has been parsed and the next token is "PLUS", the action
-is going to be reduce because "PLUS" has a lower precedence than "TIMES."
+For example, if "expression PLUS expression" has been parsed and the
+next token is "TIMES", the action is going to be a shift because
+"TIMES" has a higher precedence level than "PLUS". On the other hand,
+if "expression TIMES expression" has been parsed and the next token is
+"PLUS", the action is going to be reduce because "PLUS" has a lower
+precedence than "TIMES."
<p>
-When shift/reduce conflicts are resolved using the first three techniques (with the help of
-precedence rules), <tt>yacc.py</tt> will report no errors or conflicts in the grammar.
+When shift/reduce conflicts are resolved using the first three
+techniques (with the help of precedence rules), <tt>yacc.py</tt> will
+report no errors or conflicts in the grammar (although it will print
+some information in the <tt>parser.out</tt> debugging file).
<p>
-One problem with the precedence specifier technique is that it is sometimes necessary to
-change the precedence of an operator in certain contents. For example, consider a unary-minus operator
-in "3 + 4 * -5". Normally, unary minus has a very high precedence--being evaluated before the multiply.
-However, in our precedence specifier, MINUS has a lower precedence than TIMES. To deal with this,
-precedence rules can be given for fictitious tokens like this:
+One problem with the precedence specifier technique is that it is
+sometimes necessary to change the precedence of an operator in certain
+contexts. For example, consider a unary-minus operator in "3 + 4 *
+-5". Mathematically, the unary minus is normally given a very high
+precedence--being evaluated before the multiply. However, in our
+precedence specifier, MINUS has a lower precedence than TIMES. To
+deal with this, precedence rules can be given for so-called "fictitious tokens"
+like this:
<blockquote>
<pre>
@@ -1950,9 +2091,25 @@ whether it's supposed to reduce the 5 as an expression and then reduce
the rule <tt>assignment : ID EQUALS expression</tt>.
<p>
-It should be noted that reduce/reduce conflicts are notoriously difficult to spot
-simply looking at the input grammer. To locate these, it is usually easier to look at the
-<tt>parser.out</tt> debugging file with an appropriately high level of caffeination.
+It should be noted that reduce/reduce conflicts are notoriously
+difficult to spot simply by looking at the input grammar. When a
+reduce/reduce conflict occurs, <tt>yacc()</tt> will try to help by
+printing a warning message such as this:
+
+<blockquote>
+<pre>
+WARNING: 1 reduce/reduce conflict
+WARNING: reduce/reduce conflict in state 15 resolved using rule (assignment -> ID EQUALS NUMBER)
+WARNING: rejected rule (expression -> NUMBER)
+</pre>
+</blockquote>
+
+This message identifies the two rules that are in conflict. However,
+it may not tell you how the parser arrived at such a state. To try
+and figure it out, you'll probably have to look at your grammar and
+the contents of the
+<tt>parser.out</tt> debugging file with an appropriately high level of
+caffeination.
<H3><a name="ply_nn28"></a>5.7 The parser.out file</H3>
@@ -2212,10 +2369,15 @@ state 13
</pre>
</blockquote>
-In the file, each state of the grammar is described. Within each state the "." indicates the current
-location of the parse within any applicable grammar rules. In addition, the actions for each valid
-input token are listed. When a shift/reduce or reduce/reduce conflict arises, rules <em>not</em> selected
-are prefixed with an !. For example:
+The different states that appear in this file are a representation of
+every possible sequence of valid input tokens allowed by the grammar.
+When receiving input tokens, the parser is building up a stack and
+looking for matching rules. Each state keeps track of the grammar
+rules that might be in the process of being matched at that point. Within each
+rule, the "." character indicates the current location of the parse
+within that rule. In addition, the actions for each valid input token
+are listed. When a shift/reduce or reduce/reduce conflict arises,
+rules <em>not</em> selected are prefixed with an !. For example:
<blockquote>
<pre>
@@ -2232,10 +2394,19 @@ bad. However, the only way to be sure that they are resolved correctly is to lo
<H3><a name="ply_nn29"></a>5.8 Syntax Error Handling</H3>
+If you are creating a parser for production use, the handling of
+syntax errors is important. As a general rule, you don't want a
+parser to simply throw up its hands and stop at the first sign of
+trouble. Instead, you want it to report the error, recover if possible, and
+continue parsing so that all of the errors in the input get reported
+to the user at once. This is the standard behavior found in compilers
+for languages such as C, C++, and Java.
-When a syntax error occurs during parsing, the error is immediately
+In PLY, when a syntax error occurs during parsing, the error is immediately
detected (i.e., the parser does not read any more tokens beyond the
-source of the error). Error recovery in LR parsers is a delicate
+source of the error). However, at this point, the parser enters a
+recovery mode that can be used to try and continue further parsing.
+As a general rule, error recovery in LR parsers is a delicate
topic that involves ancient rituals and black-magic. The recovery mechanism
provided by <tt>yacc.py</tt> is comparable to Unix yacc so you may want
consult a book like O'Reilly's "Lex and Yacc" for some of the finer details.
@@ -2407,7 +2578,7 @@ is done by raising the <tt>SyntaxError</tt> exception like this:
<pre>
def p_production(p):
'production : some production ...'
- raise yacc.SyntaxError
+ raise SyntaxError
</pre>
</blockquote>
@@ -2438,8 +2609,9 @@ to discard huge portions of the input text to find a valid restart point.
<H3><a name="ply_nn33"></a>5.9 Line Number and Position Tracking</H3>
-Position tracking is often a tricky problem when writing compilers. By default, PLY tracks the line number and position of
-all tokens. This information is available using the following functions:
+Position tracking is often a tricky problem when writing compilers.
+By default, PLY tracks the line number and position of all tokens.
+This information is available using the following functions:
<ul>
<li><tt>p.lineno(num)</tt>. Return the line number for symbol <em>num</em>
@@ -2457,9 +2629,11 @@ def p_expression(p):
</pre>
</blockquote>
-As an optional feature, <tt>yacc.py</tt> can automatically track line numbers and positions for all of the grammar symbols
-as well. However, this
-extra tracking requires extra processing and can significantly slow down parsing. Therefore, it must be enabled by passing the
+As an optional feature, <tt>yacc.py</tt> can automatically track line
+numbers and positions for all of the grammar symbols as well.
+However, this extra tracking requires extra processing and can
+significantly slow down parsing. Therefore, it must be enabled by
+passing the
<tt>tracking=True</tt> option to <tt>yacc.parse()</tt>. For example:
<blockquote>
@@ -2468,8 +2642,9 @@ yacc.parse(data,tracking=True)
</pre>
</blockquote>
-Once enabled, the <tt>lineno()</tt> and <tt>lexpos()</tt> methods work for all grammar symbols. In addition, two
-additional methods can be used:
+Once enabled, the <tt>lineno()</tt> and <tt>lexpos()</tt> methods work
+for all grammar symbols. In addition, two additional methods can be
+used:
<ul>
<li><tt>p.linespan(num)</tt>. Return a tuple (startline,endline) with the starting and ending line number for symbol <em>num</em>.
@@ -2511,29 +2686,58 @@ def p_bad_func(p):
</blockquote>
<p>
-Similarly, you may get better parsing performance if you only propagate line number
-information where it's needed. For example:
+Similarly, you may get better parsing performance if you only
+selectively propagate line number information where it's needed using
+the <tt>p.set_lineno()</tt> method. For example:
<blockquote>
<pre>
def p_fname(p):
'fname : ID'
- p[0] = (p[1],p.lineno(1))
+ p[0] = p[1]
+ p.set_lineno(0,p.lineno(1))
</pre>
</blockquote>
-Finally, it should be noted that PLY does not store position information after a rule has been
-processed. If it is important for you to retain this information in an abstract syntax tree, you
-must make your own copy.
+PLY doesn't retain line number information from rules that have already been
+parsed. If you are building an abstract syntax tree and need to have line numbers,
+you should make sure that the line numbers appear in the tree itself.
<H3><a name="ply_nn34"></a>5.10 AST Construction</H3>
+<tt>yacc.py</tt> provides no special functions for constructing an
+abstract syntax tree. However, such construction is easy enough to do
+on your own.
-<tt>yacc.py</tt> provides no special functions for constructing an abstract syntax tree. However, such
-construction is easy enough to do on your own. Simply create a data structure for abstract syntax tree nodes
-and assign nodes to <tt>p[0]</tt> in each rule.
+<p>A minimal way to construct a tree is to simply create and
+propagate a tuple or list in each grammar rule function. There
+are many possible ways to do this, but one example would be something
+like this:
-For example:
+<blockquote>
+<pre>
+def p_expression_binop(p):
+ '''expression : expression PLUS expression
+ | expression MINUS expression
+ | expression TIMES expression
+ | expression DIVIDE expression'''
+
+ p[0] = ('binary-expression',p[2],p[1],p[3])
+
+def p_expression_group(p):
+ 'expression : LPAREN expression RPAREN'
+ p[0] = ('group-expression',p[2])
+
+def p_expression_number(p):
+ 'expression : NUMBER'
+ p[0] = ('number-expression',p[1])
+</pre>
+</blockquote>
+
+<p>
+Another approach is to create a set of data structures for different
+kinds of abstract syntax tree nodes and assign nodes to <tt>p[0]</tt>
+in each rule. For example:
<blockquote>
<pre>
@@ -2569,8 +2773,12 @@ def p_expression_number(p):
</pre>
</blockquote>
-To simplify tree traversal, it may make sense to pick a very generic tree structure for your parse tree nodes.
-For example:
+The advantage to this approach is that it may make it easier to attach more complicated
+semantics, type checking, code generation, and other features to the node classes.
+
+<p>
+To simplify tree traversal, it may make sense to pick a very generic
+tree structure for your parse tree nodes. For example:
<blockquote>
<pre>
@@ -2613,7 +2821,7 @@ symbols <tt>A</tt>, <tt>B</tt>, <tt>C</tt>, and <tt>D</tt> have been
parsed. Sometimes, however, it is useful to execute small code
fragments during intermediate stages of parsing. For example, suppose
you wanted to perform some action immediately after <tt>A</tt> has
-been parsed. To do this, you can write a empty rule like this:
+been parsed. To do this, write an empty rule like this:
<blockquote>
<pre>
@@ -2676,8 +2884,11 @@ def p_seen_AB(p):
</pre>
</blockquote>
-an extra shift-reduce conflict will be introduced. This conflict is caused by the fact that the same symbol <tt>C</tt> appears next in
-both the <tt>abcd</tt> and <tt>abcx</tt> rules. The parser can either shift the symbol (<tt>abcd</tt> rule) or reduce the empty rule <tt>seen_AB</tt> (<tt>abcx</tt> rule).
+an extra shift-reduce conflict will be introduced. This conflict is
+caused by the fact that the same symbol <tt>C</tt> appears next in
+both the <tt>abcd</tt> and <tt>abcx</tt> rules. The parser can either
+shift the symbol (<tt>abcd</tt> rule) or reduce the empty
+rule <tt>seen_AB</tt> (<tt>abcx</tt> rule).
<p>
A common use of embedded rules is to control other aspects of parsing
@@ -2701,10 +2912,14 @@ def p_new_scope(p):
</pre>
</blockquote>
-In this case, the embedded action <tt>new_scope</tt> executes immediately after a <tt>LBRACE</tt> (<tt>{</tt>) symbol is parsed. This might
-adjust internal symbol tables and other aspects of the parser. Upon completion of the rule <tt>statements_block</tt>, code might undo the operations performed in the embedded action (e.g., <tt>pop_scope()</tt>).
+In this case, the embedded action <tt>new_scope</tt> executes
+immediately after a <tt>LBRACE</tt> (<tt>{</tt>) symbol is parsed.
+This might adjust internal symbol tables and other aspects of the
+parser. Upon completion of the rule <tt>statements_block</tt>, code
+might undo the operations performed in the embedded action
+(e.g., <tt>pop_scope()</tt>).
-<H3><a name="ply_nn36"></a>5.12 Yacc implementation notes</H3>
+<H3><a name="ply_nn36"></a>5.12 Miscellaneous Yacc Notes</h3>
<ul>
@@ -2817,17 +3032,17 @@ machine. Please be patient.
size of the grammar. The biggest bottlenecks will be the lexer and the complexity of the code in your grammar rules.
</ul>
-<H2><a name="ply_nn37"></a>6. Parser and Lexer State Management</H2>
+<H2><a name="ply_nn37"></a>6. Multiple Parsers and Lexers</H2>
In advanced parsing applications, you may want to have multiple
-parsers and lexers. Furthermore, the parser may want to control the
-behavior of the lexer in some way.
+parsers and lexers.
<p>
-To do this, it is important to note that both the lexer and parser are
-actually implemented as objects. These objects are returned by the
-<tt>lex()</tt> and <tt>yacc()</tt> functions respectively. For example:
+As a general rule, this isn't a problem. However, to make it work,
+you need to carefully make sure everything gets hooked up correctly.
+First, make sure you save the objects returned by <tt>lex()</tt> and
+<tt>yacc()</tt>. For example:
<blockquote>
<pre>
@@ -2836,7 +3051,8 @@ parser = yacc.yacc() # Return parser object
</pre>
</blockquote>
-To attach the lexer and parser together, make sure you use the <tt>lexer</tt> argumemnt to parse. For example:
+Next, when parsing, make sure you give the <tt>parse()</tt> function a reference to the lexer it
+should be using. For example:
<blockquote>
<pre>
@@ -2844,8 +3060,13 @@ parser.parse(text,lexer=lexer)
</pre>
</blockquote>
-Within lexer and parser rules, these objects are also available. In the lexer,
-the "lexer" attribute of a token refers to the lexer object in use. For example:
+If you forget to do this, the parser will use the last lexer
+created--which is not always what you want.
+
+<p>
+Within lexer and parser rule functions, these objects are also
+available. In the lexer, the "lexer" attribute of a token refers to
+the lexer object that triggered the rule. For example:
<blockquote>
<pre>
diff --git a/example/BASIC/basiclog.py b/example/BASIC/basiclog.py
new file mode 100644
index 0000000..ccfd7b9
--- /dev/null
+++ b/example/BASIC/basiclog.py
@@ -0,0 +1,79 @@
+# An implementation of Dartmouth BASIC (1964)
+#
+
+import sys
+sys.path.insert(0,"../..")
+
+if sys.version_info[0] >= 3:
+ raw_input = input
+
+import logging
+logging.basicConfig(
+ level = logging.INFO,
+ filename = "parselog.txt",
+ filemode = "w"
+)
+log = logging.getLogger()
+
+import basiclex
+import basparse
+import basinterp
+
+# If a filename has been specified, we try to run it.
+# If a runtime error occurs, we bail out and enter
+# interactive mode below
+if len(sys.argv) == 2:
+ data = open(sys.argv[1]).read()
+ prog = basparse.parse(data,debug=log)
+ if not prog: raise SystemExit
+ b = basinterp.BasicInterpreter(prog)
+ try:
+ b.run()
+ raise SystemExit
+ except RuntimeError:
+ pass
+
+else:
+ b = basinterp.BasicInterpreter({})
+
+# Interactive mode. This incrementally adds/deletes statements
+# from the program stored in the BasicInterpreter object. In
+# addition, special commands 'NEW','LIST',and 'RUN' are added.
+# Specifying a line number with no code deletes that line from
+# the program.
+
+while 1:
+ try:
+ line = raw_input("[BASIC] ")
+ except EOFError:
+ raise SystemExit
+ if not line: continue
+ line += "\n"
+ prog = basparse.parse(line,debug=log)
+ if not prog: continue
+
+ keys = list(prog)
+ if keys[0] > 0:
+ b.add_statements(prog)
+ else:
+ stat = prog[keys[0]]
+ if stat[0] == 'RUN':
+ try:
+ b.run()
+ except RuntimeError:
+ pass
+ elif stat[0] == 'LIST':
+ b.list()
+ elif stat[0] == 'BLANK':
+ b.del_line(stat[1])
+ elif stat[0] == 'NEW':
+ b.new()
+
+
+
+
+
+
+
+
+
diff --git a/example/BASIC/basparse.py b/example/BASIC/basparse.py
index d773715..ccdeb16 100644
--- a/example/BASIC/basparse.py
+++ b/example/BASIC/basparse.py
@@ -403,9 +403,9 @@ def p_error(p):
bparser = yacc.yacc()
-def parse(data):
+def parse(data,debug=0):
bparser.error = 0
- p = bparser.parse(data)
+ p = bparser.parse(data,debug=debug)
if bparser.error: return None
return p
diff --git a/example/calc/calc.py b/example/calc/calc.py
index 2e36c7d..b923780 100644
--- a/example/calc/calc.py
+++ b/example/calc/calc.py
@@ -23,11 +23,7 @@ t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*'
def t_NUMBER(t):
r'\d+'
- try:
- t.value = int(t.value)
- except ValueError:
- print("Integer value too large %s" % t.value)
- t.value = 0
+ t.value = int(t.value)
return t
t_ignore = " \t"
diff --git a/example/calcdebug/calc.py b/example/calcdebug/calc.py
new file mode 100644
index 0000000..6732f9f
--- /dev/null
+++ b/example/calcdebug/calc.py
@@ -0,0 +1,113 @@
+# -----------------------------------------------------------------------------
+# calc.py
+#
+# This example shows how to run the parser in a debugging mode
+# with output routed to a logging object.
+# -----------------------------------------------------------------------------
+
+import sys
+sys.path.insert(0,"../..")
+
+if sys.version_info[0] >= 3:
+ raw_input = input
+
+tokens = (
+ 'NAME','NUMBER',
+ )
+
+literals = ['=','+','-','*','/', '(',')']
+
+# Tokens
+
+t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*'
+
+def t_NUMBER(t):
+ r'\d+'
+ t.value = int(t.value)
+ return t
+
+t_ignore = " \t"
+
+def t_newline(t):
+ r'\n+'
+ t.lexer.lineno += t.value.count("\n")
+
+def t_error(t):
+ print("Illegal character '%s'" % t.value[0])
+ t.lexer.skip(1)
+
+# Build the lexer
+import ply.lex as lex
+lex.lex()
+
+# Parsing rules
+
+precedence = (
+ ('left','+','-'),
+ ('left','*','/'),
+ ('right','UMINUS'),
+ )
+
+# dictionary of names
+names = { }
+
+def p_statement_assign(p):
+ 'statement : NAME "=" expression'
+ names[p[1]] = p[3]
+
+def p_statement_expr(p):
+ 'statement : expression'
+ print(p[1])
+
+def p_expression_binop(p):
+ '''expression : expression '+' expression
+ | expression '-' expression
+ | expression '*' expression
+ | expression '/' expression'''
+ if p[2] == '+' : p[0] = p[1] + p[3]
+ elif p[2] == '-': p[0] = p[1] - p[3]
+ elif p[2] == '*': p[0] = p[1] * p[3]
+ elif p[2] == '/': p[0] = p[1] / p[3]
+
+def p_expression_uminus(p):
+ "expression : '-' expression %prec UMINUS"
+ p[0] = -p[2]
+
+def p_expression_group(p):
+ "expression : '(' expression ')'"
+ p[0] = p[2]
+
+def p_expression_number(p):
+ "expression : NUMBER"
+ p[0] = p[1]
+
+def p_expression_name(p):
+ "expression : NAME"
+ try:
+ p[0] = names[p[1]]
+ except LookupError:
+ print("Undefined name '%s'" % p[1])
+ p[0] = 0
+
+def p_error(p):
+ if p:
+ print("Syntax error at '%s'" % p.value)
+ else:
+ print("Syntax error at EOF")
+
+import ply.yacc as yacc
+yacc.yacc()
+
+import logging
+logging.basicConfig(
+ level=logging.INFO,
+ filename="parselog.txt"
+)
+
+while 1:
+ try:
+ s = raw_input('calc > ')
+ except EOFError:
+ break
+ if not s: continue
+ yacc.parse(s,debug=logging.getLogger())
diff --git a/example/closurecalc/calc.py b/example/closurecalc/calc.py
index a1d5845..6598f58 100644
--- a/example/closurecalc/calc.py
+++ b/example/closurecalc/calc.py
@@ -36,11 +36,7 @@ def make_calculator():
def t_NUMBER(t):
r'\d+'
- try:
- t.value = int(t.value)
- except ValueError:
- print("Integer value too large %s" % t.value)
- t.value = 0
+ t.value = int(t.value)
return t
def t_newline(t):
diff --git a/example/optcalc/README b/example/optcalc/README
index 6d196f0..53dd5fc 100644
--- a/example/optcalc/README
+++ b/example/optcalc/README
@@ -5,5 +5,5 @@ To run:
- Then run 'python -OO calc.py'
-If working corretly, the second version should run the
+If working correctly, the second version should run the
same way.
diff --git a/ply/lex.py b/ply/lex.py
index b9a478d..71b33b5 100644
--- a/ply/lex.py
+++ b/ply/lex.py
@@ -3,7 +3,7 @@
#
# Author: David M. Beazley (dave@dabeaz.com)
#
-# Copyright (C) 2001-2008, David M. Beazley
+# Copyright (C) 2001-2009, David M. Beazley
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
@@ -22,12 +22,12 @@
# See the file COPYING for a complete copy of the LGPL.
# -----------------------------------------------------------------------------
-__version__ = "2.6"
-__tabversion__ = "2.4" # Version of table file used
+__version__ = "3.0"
+__tabversion__ = "3.0" # Version of table file used
import re, sys, types, copy, os
-# This tuple lists known string types
+# This tuple contains known string types
try:
# Python 2.6
StringTypes = (types.StringType, types.UnicodeType)
@@ -35,7 +35,9 @@ except AttributeError:
# Python 3.0
StringTypes = (str, bytes)
-# Compatibility function for python 2.6/3.0
+# Extract the code attribute of a function. Different implementations
+# are for Python 2/3 compatibility.
+
if sys.version_info[0] < 3:
def func_code(f):
return f.func_code
@@ -54,27 +56,12 @@ class LexError(Exception):
self.args = (message,)
self.text = s
-# An object used to issue one-time warning messages for various features
-
-class LexWarning(object):
- def __init__(self):
- self.warned = 0
- def __call__(self,msg):
- if not self.warned:
- sys.stderr.write("ply.lex: Warning: " + msg+"\n")
- self.warned = 1
-
-_SkipWarning = LexWarning() # Warning for use of t.skip() on tokens
-
# Token class. This class is used to represent the tokens produced.
class LexToken(object):
def __str__(self):
return "LexToken(%s,%r,%d,%d)" % (self.type,self.value,self.lineno,self.lexpos)
def __repr__(self):
return str(self)
- def skip(self,n):
- self.lexer.skip(n)
- _SkipWarning("Calling t.skip() on a token is deprecated. Please use t.lexer.skip()")
# -----------------------------------------------------------------------------
# Lexer class
@@ -372,6 +359,19 @@ class Lexer:
raise RuntimeError("No input string given with input()")
return None
+ # Iterator interface
+ def __iter__(self):
+ return self
+
+ def next(self):
+ t = self.token()
+ if t is None:
+ raise StopIteration
+ return t
+
+ __next__ = next
+
+
# -----------------------------------------------------------------------------
# _validate_file()
#
@@ -891,7 +891,6 @@ def runmain(lexer=None,data=None):
if not tok: break
sys.stdout.write("(%s,%r,%d,%d)\n" % (tok.type, tok.value, tok.lineno,tok.lexpos))
-
# -----------------------------------------------------------------------------
# @TOKEN(regex)
#
diff --git a/ply/yacc.py b/ply/yacc.py
index 52ac7ef..f660f44 100644
--- a/ply/yacc.py
+++ b/ply/yacc.py
@@ -1,9 +1,9 @@
-#-----------------------------------------------------------------------------
+# -----------------------------------------------------------------------------
# ply: yacc.py
#
# Author(s): David M. Beazley (dave@dabeaz.com)
#
-# Copyright (C) 2001-2008, David M. Beazley
+# Copyright (C) 2001-2009, David M. Beazley
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
@@ -50,8 +50,8 @@
# own risk!
# ----------------------------------------------------------------------------
-__version__ = "2.6"
-__tabversion__ = "2.4" # Table version
+__version__ = "3.0"
+__tabversion__ = "3.0" # Table version
#-----------------------------------------------------------------------------
# === User configurable parameters ===
@@ -73,28 +73,6 @@ yaccdevel = 0 # Set to True if developing yacc. This turns off
import re, types, sys, os.path
-# Python 2.6/3.0 compatibility
-try:
- import cStringIO
-except ImportError:
- import io as cStringIO
-
-# Python 2.6/3.0 compatibility function. Create a new MD5 object for computing
-# the grammar signature.
-
-def md5_new():
- try:
- import hashlib
- return hashlib.md5()
- except ImportError:
- import md5
- return md5.new()
-
-# Python 2.6/3.0 compatibility function. Update the MD5 signature
-# using UTF-8 encoded data.
-def Signature_update(data):
- Signature.update(data.encode('utf-8'))
-
# Compatibility function for python 2.6/3.0
if sys.version_info[0] < 3:
def func_code(f):
@@ -114,17 +92,40 @@ def load_ply_lex():
if sys.version_info[0] < 3:
import lex
else:
- env = { }
- exec("from . import lex", env, env)
- lex = env['lex']
+ import ply.lex as lex
return lex
+# This object is a stand-in for a logging object created by the
+# logging module. PLY will use this by default to create things
+# such as the parser.out file. If a user wants more detailed
+# information, they can create their own logging object and pass
+# it into PLY.
+
+class PlyLogger(object):
+ def __init__(self,f):
+ self.f = f
+ def debug(self,msg,*args,**kwargs):
+ self.f.write((msg % args) + "\n")
+ info = debug
+
+ def warning(self,msg,*args,**kwargs):
+ self.f.write("WARNING: "+ (msg % args) + "\n")
+
+ def error(self,msg,*args,**kwargs):
+ self.f.write("ERROR: " + (msg % args) + "\n")
+
+ critical = debug
+
+# Null logger is used when no output is generated. Does nothing.
+class NullLogger(object):
+ def __getattribute__(self,name):
+ return self
+ def __call__(self,*args,**kwargs):
+ return self
+
# Exception raised for yacc-related errors
class YaccError(Exception): pass
-# Exception raised for errors raised in production rules
-class SyntaxError(Exception): pass
-
#-----------------------------------------------------------------------------
# === LR Parsing Engine ===
#
@@ -177,6 +178,9 @@ class YaccProduction:
def lineno(self,n):
return getattr(self.slice[n],"lineno",0)
+    def set_lineno(self,n,lineno):
+        """Set the line number of symbol n in the current rule to lineno."""
+        # Store the supplied line number (not the symbol index n)
+        self.slice[n].lineno = lineno
+
+
def linespan(self,n):
startline = getattr(self.slice[n],"lineno",0)
endline = getattr(self.slice[n],"endlineno",startline)
@@ -194,27 +198,18 @@ class YaccProduction:
raise SyntaxError
-# The LR Parsing engine. This is defined as a class so that multiple parsers
-# can exist in the same process. A user never instantiates this directly.
-# Instead, the global yacc() function should be used to create a suitable Parser
-# object.
-
-class Parser:
- def __init__(self,magic=None):
-
- # This is a hack to keep users from trying to instantiate a Parser
- # object directly.
-
- if magic != "xyzzy":
- raise YaccError("Can't directly instantiate Parser. Use yacc() instead.")
+# -----------------------------------------------------------------------------
+# == LRParser ==
+#
+# The LR Parsing engine.
+# -----------------------------------------------------------------------------
- # Reset internal state
- self.productions = None # List of productions
- self.errorfunc = None # Error handling function
- self.action = { } # LR Action table
- self.goto = { } # LR goto table
- self.require = { } # Attribute require table
- self.method = "Unknown LR" # Table construction method used
+class LRParser:
+    def __init__(self,lrtab,errorf):
+        """Create a parser from a table object and an error handler.
+
+        lrtab  - object supplying lr_productions, lr_action and lr_goto
+        errorf - error handling function stored as self.errorfunc
+        """
+        self.errorfunc = errorf
+        self.goto = lrtab.lr_goto
+        self.action = lrtab.lr_action
+        self.productions = lrtab.lr_productions
def errok(self):
self.errorok = 1
@@ -229,6 +224,8 @@ class Parser:
def parse(self,input=None,lexer=None,debug=0,tracking=0,tokenfunc=None):
if debug or yaccdevel:
+ if isinstance(debug,int):
+ debug = PlyLogger(sys.stderr)
return self.parsedebug(input,lexer,debug,tracking,tokenfunc)
elif tracking:
return self.parseopt(input,lexer,debug,tracking,tokenfunc)
@@ -250,7 +247,7 @@ class Parser:
#
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- def parsedebug(self,input=None,lexer=None,debug=0,tracking=0,tokenfunc=None):
+ def parsedebug(self,input=None,lexer=None,debug=None,tracking=0,tokenfunc=None):
lookahead = None # Current lookahead symbol
lookaheadstack = [ ] # Stack of lookahead symbols
actions = self.action # Local reference to action table (to avoid lookup on self.)
@@ -259,6 +256,10 @@ class Parser:
pslice = YaccProduction(None) # Production object passed to grammar rules
errorcount = 0 # Used during error recovery
+ # --! DEBUG
+ debug.info("PLY: PARSE DEBUG START")
+ # --! DEBUG
+
# If no lexer was given, we will try to use the lex module
if not lexer:
lex = load_ply_lex()
@@ -301,8 +302,8 @@ class Parser:
# the next token off of the lookaheadstack or from the lexer
# --! DEBUG
- if debug > 1:
- sys.stdout.write('state %s\n' % state)
+ debug.debug('')
+ debug.debug('State : %s', state)
# --! DEBUG
if not lookahead:
@@ -315,32 +316,22 @@ class Parser:
lookahead.type = "$end"
# --! DEBUG
- if debug:
- errorlead = ("%s . %s" % (" ".join([xx.type for xx in symstack][1:]), str(lookahead))).lstrip()
+ debug.debug('Stack : %s',
+ ("%s . %s" % (" ".join([xx.type for xx in symstack][1:]), str(lookahead))).lstrip())
# --! DEBUG
# Check the action table
ltype = lookahead.type
t = actions[state].get(ltype)
- # --! DEBUG
- if debug > 1:
- sys.stdout.write('action %s\n' % t)
- # --! DEBUG
-
if t is not None:
if t > 0:
# shift a symbol on the stack
- if ltype == "$end":
- # Error, end of input
- sys.stderr.write("yacc: Parse error. EOF\n")
- return
statestack.append(t)
state = t
# --! DEBUG
- if debug > 1:
- sys.stderr.write("%-60s shift state %s\n" % (errorlead, t))
+ debug.debug("Action : Shift and goto state %s", t)
# --! DEBUG
symstack.append(lookahead)
@@ -362,8 +353,11 @@ class Parser:
sym.value = None
# --! DEBUG
- if debug > 1:
- sys.stderr.write("%-60s reduce %d\n" % (errorlead, -t))
+ if plen:
+ debug.info("Action : Reduce rule [%s] with %s and goto state %d", p.str, [_v.value for _v in symstack[-plen:]],-t)
+ else:
+ debug.info("Action : Reduce rule [%s] with %s and goto state %d", p.str, [],-t)
+
# --! DEBUG
if plen:
@@ -392,7 +386,10 @@ class Parser:
# Call the grammar rule with our special slice object
del symstack[-plen:]
del statestack[-plen:]
- p.func(pslice)
+ p.callable(pslice)
+ # --! DEBUG
+ debug.info("Result : %r", pslice[0])
+ # --! DEBUG
symstack.append(sym)
state = goto[statestack[-1]][pname]
statestack.append(state)
@@ -428,7 +425,10 @@ class Parser:
try:
# Call the grammar rule with our special slice object
- p.func(pslice)
+ p.callable(pslice)
+ # --! DEBUG
+ debug.info("Result : %r", pslice[0])
+ # --! DEBUG
symstack.append(sym)
state = goto[statestack[-1]][pname]
statestack.append(state)
@@ -447,13 +447,18 @@ class Parser:
if t == 0:
n = symstack[-1]
- return getattr(n,"value",None)
+ result = getattr(n,"value",None)
+ # --! DEBUG
+ debug.info("Done : Returning %r", result)
+ debug.info("PLY: PARSE DEBUG END")
+ # --! DEBUG
+ return result
if t == None:
# --! DEBUG
- if debug:
- sys.stderr.write(errorlead + "\n")
+ debug.error('Error : %s',
+ ("%s . %s" % (" ".join([xx.type for xx in symstack][1:]), str(lookahead))).lstrip())
# --! DEBUG
# We have some kind of parsing error here. To handle
@@ -621,10 +626,6 @@ class Parser:
if t is not None:
if t > 0:
# shift a symbol on the stack
- if ltype == '$end':
- # Error, end of input
- sys.stderr.write("yacc: Parse error. EOF\n")
- return
statestack.append(t)
state = t
@@ -672,7 +673,7 @@ class Parser:
# Call the grammar rule with our special slice object
del symstack[-plen:]
del statestack[-plen:]
- p.func(pslice)
+ p.callable(pslice)
symstack.append(sym)
state = goto[statestack[-1]][pname]
statestack.append(state)
@@ -708,7 +709,7 @@ class Parser:
try:
# Call the grammar rule with our special slice object
- p.func(pslice)
+ p.callable(pslice)
symstack.append(sym)
state = goto[statestack[-1]][pname]
statestack.append(state)
@@ -895,10 +896,6 @@ class Parser:
if t is not None:
if t > 0:
# shift a symbol on the stack
- if ltype == '$end':
- # Error, end of input
- sys.stderr.write("yacc: Parse error. EOF\n")
- return
statestack.append(t)
state = t
@@ -935,7 +932,7 @@ class Parser:
# Call the grammar rule with our special slice object
del symstack[-plen:]
del statestack[-plen:]
- p.func(pslice)
+ p.callable(pslice)
symstack.append(sym)
state = goto[statestack[-1]][pname]
statestack.append(state)
@@ -965,7 +962,7 @@ class Parser:
try:
# Call the grammar rule with our special slice object
- p.func(pslice)
+ p.callable(pslice)
symstack.append(sym)
state = goto[statestack[-1]][pname]
statestack.append(state)
@@ -1078,167 +1075,169 @@ class Parser:
# Call an error function here
raise RuntimeError("yacc: internal parser error!!!\n")
-
-# -----------------------------------------------------------------------------
-# === Parser Construction ===
-#
-# The following functions and variables are used to implement the yacc() function
-# itself. This is pretty hairy stuff involving lots of error checking,
-# construction of LR items, kernels, and so forth. Although a lot of
-# this work is done using global variables, the resulting Parser object
-# is completely self contained--meaning that it is safe to repeatedly
-# call yacc() with different grammars in the same application.
-# -----------------------------------------------------------------------------
-
# -----------------------------------------------------------------------------
-# validate_file()
+# === Grammar Representation ===
#
-# This function checks to see if there are duplicated p_rulename() functions
-# in the parser module file. Without this function, it is really easy for
-# users to make mistakes by cutting and pasting code fragments (and it's a real
-# bugger to try and figure out why the resulting parser doesn't work). Therefore,
-# we just do a little regular expression pattern matching of def statements
-# to try and detect duplicates.
+# The following functions, classes, and variables are used to represent and
+# manipulate the rules that make up a grammar.
# -----------------------------------------------------------------------------
-def validate_file(filename):
- base,ext = os.path.splitext(filename)
- if ext != '.py': return 1 # No idea. Assume it's okay.
+import re
- try:
- f = open(filename)
- lines = f.readlines()
- f.close()
- except IOError:
- return 1 # Oh well
-
- # Match def p_funcname(
- fre = re.compile(r'\s*def\s+(p_[a-zA-Z_0-9]*)\(')
- counthash = { }
- linen = 1
- noerror = 1
- for l in lines:
- m = fre.match(l)
- if m:
- name = m.group(1)
- prev = counthash.get(name)
- if not prev:
- counthash[name] = linen
- else:
- sys.stderr.write("%s:%d: Function %s redefined. Previously defined on line %d\n" % (filename,linen,name,prev))
- noerror = 0
- linen += 1
- return noerror
-
-# This function looks for functions that might be grammar rules, but which don't have the proper p_suffix.
-def validate_dict(d):
- for n,v in d.items():
- if n[0:2] == 'p_' and type(v) in (types.FunctionType, types.MethodType): continue
- if n[0:2] == 't_': continue
-
- if n[0:2] == 'p_':
- sys.stderr.write("yacc: Warning. '%s' not defined as a function\n" % n)
- if 1 and isinstance(v,types.FunctionType) and func_code(v).co_argcount == 1:
- try:
- doc = v.__doc__.split(" ")
- if doc[1] == ':':
- sys.stderr.write("%s:%d: Warning. Possible grammar rule '%s' defined without p_ prefix.\n" % (func_code(v).co_filename, func_code(v).co_firstlineno,n))
- except Exception:
- pass
+# regex matching identifiers
+_is_identifier = re.compile(r'^[a-zA-Z0-9_-]+$')
# -----------------------------------------------------------------------------
-# === GRAMMAR FUNCTIONS ===
+# class Production:
+#
+# This class stores the raw information about a single production or grammar rule.
+# A grammar rule refers to a specification such as this:
+#
+# expr : expr PLUS term
+#
+# Here are the basic attributes defined on all productions
+#
+# name - Name of the production. For example 'expr'
+# prod - A list of symbols on the right side ['expr','PLUS','term']
+# prec - Production precedence level
+# number - Production number.
+# func - Name of the function that executes on reduce
+# file - File where production function is defined
+# line - Line number where production function is defined
#
-# The following global variables and functions are used to store, manipulate,
-# and verify the grammar rules specified by the user.
+# The following attributes are defined or optional.
+#
+# len - Length of the production (number of symbols on right hand side)
+# usyms - Set of unique symbols found in the production
# -----------------------------------------------------------------------------
-# Initialize all of the global variables used during grammar construction
-def initialize_vars():
- global Productions, Prodnames, Prodmap, Terminals
- global Nonterminals, First, Follow, Precedence, UsedPrecedence, LRitems
- global Errorfunc, Signature, Requires
-
- Productions = [None] # A list of all of the productions. The first
- # entry is always reserved for the purpose of
- # building an augmented grammar
-
- Prodnames = { } # A dictionary mapping the names of nonterminals to a list of all
- # productions of that nonterminal.
+class Production(object):
+ def __init__(self,number,name,prod,precedence=('right',0),func=None,file='',line=0):
+ self.name = name
+ self.prod = tuple(prod)
+ self.number = number
+ self.func = func
+ self.callable = None
+ self.file = file
+ self.line = line
+ self.prec = precedence
+
+ # Internal settings used during table construction
+
+ self.len = len(self.prod) # Length of the production
- Prodmap = { } # A dictionary that is only used to detect duplicate
- # productions.
+ # Create a list of unique production symbols used in the production
+ self.usyms = [ ]
+ for s in self.prod:
+ if s not in self.usyms:
+ self.usyms.append(s)
- Terminals = { } # A dictionary mapping the names of terminal symbols to a
- # list of the rules where they are used.
+ # List of all LR items for the production
+ self.lr_items = []
+ self.lr_next = None
- Nonterminals = { } # A dictionary mapping names of nonterminals to a list
- # of rule numbers where they are used.
+ # Create a string representation
+ if self.prod:
+ self.str = "%s -> %s" % (self.name," ".join(self.prod))
+ else:
+ self.str = "%s -> <empty>" % self.name
- First = { } # A dictionary of precomputed FIRST(x) symbols
+ def __str__(self):
+ return self.str
- Follow = { } # A dictionary of precomputed FOLLOW(x) symbols
+ def __repr__(self):
+ return "Production("+str(self)+")"
- Precedence = { } # Precedence rules for each terminal. Contains tuples of the
- # form ('right',level) or ('nonassoc', level) or ('left',level)
+ def __len__(self):
+ return len(self.prod)
- UsedPrecedence = { } # Precedence rules that were actually used by the grammer.
- # This is only used to provide error checking and to generate
- # a warning about unused precedence rules.
+    def __nonzero__(self):
+        """A Production is always true, even when its right-hand side is empty."""
+        return 1
+
+    # Python 3 looks up __bool__ rather than __nonzero__.  Without this alias
+    # truth testing would fall back to __len__ and an empty production
+    # (length 0) would evaluate as false.
+    __bool__ = __nonzero__
- LRitems = [ ] # A list of all LR items for the grammar. These are the
- # productions with the "dot" like E -> E . PLUS E
+ def __getitem__(self,index):
+ return self.prod[index]
+
+ # Return the nth lr_item from the production (or None if at the end)
+ def lr_item(self,n):
+ if n > len(self.prod): return None
+ p = LRItem(self,n)
- Errorfunc = None # User defined error handler
+ # Precompute the list of productions immediately following. Hack. Remove later
+ try:
+ p.lr_after = Prodnames[p.prod[n+1]]
+ except (IndexError,KeyError):
+ p.lr_after = []
+ try:
+ p.lr_before = p.prod[n-1]
+ except IndexError:
+ p.lr_before = None
- Signature = md5_new() # Digital signature of the grammar rules, precedence
- # and other information. Used to determined when a
- # parsing table needs to be regenerated.
+ return p
- Signature_update(__tabversion__)
+ # Bind the production function name to a callable
+ def bind(self,pdict):
+ if self.func:
+ self.callable = pdict[self.func]
+
+# This class serves as a minimal stand-in for Production objects when
+# reading table data from files. It only contains information
+# actually used by the LR parsing engine, plus some additional
+# debugging information.
+class MiniProduction(object):
+ def __init__(self,str,name,len,func,file,line):
+ self.name = name
+ self.len = len
+ self.func = func
+ self.callable = None
+ self.file = file
+ self.line = line
+ self.str = str
+ def __str__(self):
+ return self.str
+ def __repr__(self):
+ return "MiniProduction(%s)" % self.str
- Requires = { } # Requires list
+ # Bind the production function name to a callable
+ def bind(self,pdict):
+ if self.func:
+ self.callable = pdict[self.func]
- # File objects used when creating the parser.out debugging file
- global _vf, _vfc
- _vf = cStringIO.StringIO()
- _vfc = cStringIO.StringIO()
# -----------------------------------------------------------------------------
-# class Production:
+# class LRItem
#
-# This class stores the raw information about a single production or grammar rule.
-# It has a few required attributes:
+# This class represents a specific stage of parsing a production rule. For
+# example:
#
-# name - Name of the production (nonterminal)
-# prod - A list of symbols making up its production
-# number - Production number.
+# expr : expr . PLUS term
#
-# In addition, a few additional attributes are used to help with debugging or
-# optimization of table generation.
+# In the above, the "." represents the current location of the parse. Here
+# are the basic attributes:
#
-# file - File where production action is defined.
-# lineno - Line number where action is defined
-# func - Action function
-# prec - Precedence level
-# lr_next - Next LR item. Example, if we are ' E -> E . PLUS E'
-# then lr_next refers to 'E -> E PLUS . E'
-# lr_index - LR item index (location of the ".") in the prod list.
+# name - Name of the production. For example 'expr'
+# prod - A list of symbols on the right side ['expr','.', 'PLUS','term']
+# number - Production number.
+#
+# lr_next Next LR item. Example, if we are ' expr -> expr . PLUS term'
+# then lr_next refers to 'expr -> expr PLUS . term'
+# lr_index - LR item index (location of the ".") in the prod list.
# lookaheads - LALR lookahead symbols for this item
-# len - Length of the production (number of symbols on right hand side)
+# len - Length of the production (number of symbols on right hand side)
+# lr_after - List of all productions that immediately follow
+# lr_before - Grammar symbol immediately before
# -----------------------------------------------------------------------------
-class Production:
- def __init__(self,**kw):
- for k,v in kw.items():
- setattr(self,k,v)
- self.lr_index = -1
- self.lr0_added = 0 # Flag indicating whether or not added to LR0 closure
- self.lr1_added = 0 # Flag indicating whether or not added to LR1
- self.usyms = [ ]
+class LRItem(object):
+ def __init__(self,p,n):
+ self.name = p.name
+ self.prod = list(p.prod)
+ self.number = p.number
+ self.lr_index = n
self.lookaheads = { }
- self.lk_added = { }
- self.setnumbers = [ ]
+ self.prod.insert(n,".")
+ self.prod = tuple(self.prod)
+ self.len = len(self.prod)
+ self.usyms = p.usyms
def __str__(self):
if self.prod:
@@ -1248,933 +1247,580 @@ class Production:
return s
def __repr__(self):
- return str(self)
-
- # Compute lr_items from the production
- def lr_item(self,n):
- if n > len(self.prod): return None
- p = Production()
- p.name = self.name
- p.prod = list(self.prod)
- p.number = self.number
- p.lr_index = n
- p.lookaheads = { }
- p.setnumbers = self.setnumbers
- p.prod.insert(n,".")
- p.prod = tuple(p.prod)
- p.len = len(p.prod)
- p.usyms = self.usyms
-
- # Precompute list of productions immediately following
- try:
- p.lrafter = Prodnames[p.prod[n+1]]
- except (IndexError,KeyError):
- p.lrafter = []
- try:
- p.lrbefore = p.prod[n-1]
- except IndexError:
- p.lrbefore = None
+ return "LRItem("+str(self)+")"
- return p
+ def __len__(self):
+ return len(self.prod)
-class MiniProduction:
- pass
-
-# regex matching identifiers
-_is_identifier = re.compile(r'^[a-zA-Z0-9_-]+$')
+ def __getitem__(self,index):
+ return self.prod[index]
# -----------------------------------------------------------------------------
-# add_production()
+# rightmost_terminal()
#
-# Given an action function, this function assembles a production rule.
-# The production rule is assumed to be found in the function's docstring.
-# This rule has the general syntax:
+# Return the rightmost terminal from a list of symbols. Used in add_production()
+# -----------------------------------------------------------------------------
+def rightmost_terminal(symbols, terminals):
+    """Return the last entry of symbols that is found in terminals.
+
+    Returns None when symbols is empty or contains no terminal.
+    """
+    # Scan from the right so the first hit is the rightmost terminal
+    for sym in reversed(symbols):
+        if sym in terminals:
+            return sym
+    return None
+
+# -----------------------------------------------------------------------------
+# === GRAMMAR CLASS ===
#
-# name1 ::= production1
-# | production2
-# | production3
-# ...
-# | productionn
-# name2 ::= production1
-# | production2
-# ...
+# The following class represents the contents of the specified grammar along
+# with various computed properties such as first sets, follow sets, LR items, etc.
+# This data is used for critical parts of the table generation process later.
# -----------------------------------------------------------------------------
-def add_production(f,file,line,prodname,syms):
-
- if prodname in Terminals:
- sys.stderr.write("%s:%d: Illegal rule name '%s'. Already defined as a token.\n" % (file,line,prodname))
- return -1
- if prodname == 'error':
- sys.stderr.write("%s:%d: Illegal rule name '%s'. error is a reserved word.\n" % (file,line,prodname))
- return -1
-
- if not _is_identifier.match(prodname):
- sys.stderr.write("%s:%d: Illegal rule name '%s'\n" % (file,line,prodname))
- return -1
-
- for x in range(len(syms)):
- s = syms[x]
- if s[0] in "'\"":
- try:
- c = eval(s)
- if (len(c) > 1):
- sys.stderr.write("%s:%d: Literal token %s in rule '%s' may only be a single character\n" % (file,line,s, prodname))
- return -1
- if not c in Terminals:
- Terminals[c] = []
- syms[x] = c
- continue
- except SyntaxError:
- pass
- if not _is_identifier.match(s) and s != '%prec':
- sys.stderr.write("%s:%d: Illegal name '%s' in rule '%s'\n" % (file,line,s, prodname))
- return -1
-
- # See if the rule is already in the rulemap
- map = "%s -> %s" % (prodname,syms)
- if map in Prodmap:
- m = Prodmap[map]
- sys.stderr.write("%s:%d: Duplicate rule %s.\n" % (file,line, m))
- sys.stderr.write("%s:%d: Previous definition at %s:%d\n" % (file,line, m.file, m.line))
- return -1
-
- p = Production()
- p.name = prodname
- p.prod = syms
- p.file = file
- p.line = line
- p.func = f
- p.number = len(Productions)
-
-
- Productions.append(p)
- Prodmap[map] = p
- if not prodname in Nonterminals:
- Nonterminals[prodname] = [ ]
-
- # Add all terminals to Terminals
- i = 0
- while i < len(p.prod):
- t = p.prod[i]
- if t == '%prec':
- try:
- precname = p.prod[i+1]
- except IndexError:
- sys.stderr.write("%s:%d: Syntax error. Nothing follows %%prec.\n" % (p.file,p.line))
- return -1
-
- prec = Precedence.get(precname,None)
- if not prec:
- sys.stderr.write("%s:%d: Nothing known about the precedence of '%s'\n" % (p.file,p.line,precname))
- return -1
- else:
- p.prec = prec
- UsedPrecedence[precname] = 1
- del p.prod[i]
- del p.prod[i]
- continue
-
- if t in Terminals:
- Terminals[t].append(p.number)
- # Is a terminal. We'll assign a precedence to p based on this
- if not hasattr(p,"prec"):
- p.prec = Precedence.get(t,('right',0))
- else:
- if not t in Nonterminals:
- Nonterminals[t] = [ ]
- Nonterminals[t].append(p.number)
- i += 1
+class GrammarError(YaccError): pass
- if not hasattr(p,"prec"):
- p.prec = ('right',0)
+class Grammar(object):
+ def __init__(self,terminals):
+ self.Productions = [None] # A list of all of the productions. The first
+ # entry is always reserved for the purpose of
+ # building an augmented grammar
- # Set final length of productions
- p.len = len(p.prod)
- p.prod = tuple(p.prod)
+ self.Prodnames = { } # A dictionary mapping the names of nonterminals to a list of all
+ # productions of that nonterminal.
- # Calculate unique syms in the production
- p.usyms = [ ]
- for s in p.prod:
- if s not in p.usyms:
- p.usyms.append(s)
+ self.Prodmap = { } # A dictionary that is only used to detect duplicate
+ # productions.
- # Add to the global productions list
- try:
- Prodnames[p.name].append(p)
- except KeyError:
- Prodnames[p.name] = [ p ]
- return 0
+ self.Terminals = { } # A dictionary mapping the names of terminal symbols to a
+ # list of the rules where they are used.
-# Given a raw rule function, this function rips out its doc string
-# and adds rules to the grammar
+ for term in terminals:
+ self.Terminals[term] = []
-def add_function(f):
- line = func_code(f).co_firstlineno
- file = func_code(f).co_filename
- error = 0
+ self.Terminals['error'] = []
- if isinstance(f,types.MethodType):
- reqdargs = 2
- else:
- reqdargs = 1
-
- if func_code(f).co_argcount > reqdargs:
- sys.stderr.write("%s:%d: Rule '%s' has too many arguments.\n" % (file,line,f.__name__))
- return -1
-
- if func_code(f).co_argcount < reqdargs:
- sys.stderr.write("%s:%d: Rule '%s' requires an argument.\n" % (file,line,f.__name__))
- return -1
-
- if f.__doc__:
- # Split the doc string into lines
- pstrings = f.__doc__.splitlines()
- lastp = None
- dline = line
- for ps in pstrings:
- dline += 1
- p = ps.split()
- if not p: continue
- try:
- if p[0] == '|':
- # This is a continuation of a previous rule
- if not lastp:
- sys.stderr.write("%s:%d: Misplaced '|'.\n" % (file,dline))
- return -1
- prodname = lastp
- if len(p) > 1:
- syms = p[1:]
- else:
- syms = [ ]
- else:
- prodname = p[0]
- lastp = prodname
- assign = p[1]
- if len(p) > 2:
- syms = p[2:]
- else:
- syms = [ ]
- if assign != ':' and assign != '::=':
- sys.stderr.write("%s:%d: Syntax error. Expected ':'\n" % (file,dline))
- return -1
+ self.Nonterminals = { } # A dictionary mapping names of nonterminals to a list
+ # of rule numbers where they are used.
+ self.First = { } # A dictionary of precomputed FIRST(x) symbols
- e = add_production(f,file,dline,prodname,syms)
- error += e
+ self.Follow = { } # A dictionary of precomputed FOLLOW(x) symbols
+ self.Precedence = { } # Precedence rules for each terminal. Contains tuples of the
+ # form ('right',level) or ('nonassoc', level) or ('left',level)
- except Exception:
- sys.stderr.write("%s:%d: Syntax error in rule '%s'\n" % (file,dline,ps))
- error -= 1
- else:
- sys.stderr.write("%s:%d: No documentation string specified in function '%s'\n" % (file,line,f.__name__))
- return error
-
-
-# Cycle checking code (Michael Dyck)
-
-def compute_reachable():
- '''
- Find each symbol that can be reached from the start symbol.
- Print a warning for any nonterminals that can't be reached.
- (Unused terminals have already had their warning.)
- '''
- Reachable = { }
- for s in list(Terminals) + list(Nonterminals):
- Reachable[s] = 0
-
- mark_reachable_from( Productions[0].prod[0], Reachable )
-
- for s in list(Nonterminals):
- if not Reachable[s]:
- sys.stderr.write("yacc: Symbol '%s' is unreachable.\n" % s)
-
-def mark_reachable_from(s, Reachable):
- '''
- Mark all symbols that are reachable from symbol s.
- '''
- if Reachable[s]:
- # We've already reached symbol s.
- return
- Reachable[s] = 1
- for p in Prodnames.get(s,[]):
- for r in p.prod:
- mark_reachable_from(r, Reachable)
+ self.UsedPrecedence = { } # Precedence rules that were actually used by the grammer.
+ # This is only used to provide error checking and to generate
+ # a warning about unused precedence rules.
-# -----------------------------------------------------------------------------
-# compute_terminates()
-#
-# This function looks at the various parsing rules and tries to detect
-# infinite recursion cycles (grammar rules where there is no possible way
-# to derive a string of only terminals).
-# -----------------------------------------------------------------------------
-def compute_terminates():
- '''
- Raise an error for any symbols that don't terminate.
- '''
- Terminates = {}
-
- # Terminals:
- for t in Terminals:
- Terminates[t] = 1
-
- Terminates['$end'] = 1
-
- # Nonterminals:
-
- # Initialize to false:
- for n in Nonterminals:
- Terminates[n] = 0
-
- # Then propagate termination until no change:
- while 1:
- some_change = 0
- for (n,pl) in Prodnames.items():
- # Nonterminal n terminates iff any of its productions terminates.
- for p in pl:
- # Production p terminates iff all of its rhs symbols terminate.
- for s in p.prod:
- if not Terminates[s]:
- # The symbol s does not terminate,
- # so production p does not terminate.
- p_terminates = 0
- break
- else:
- # didn't break from the loop,
- # so every symbol s terminates
- # so production p terminates.
- p_terminates = 1
-
- if p_terminates:
- # symbol n terminates!
- if not Terminates[n]:
- Terminates[n] = 1
- some_change = 1
- # Don't need to consider any more productions for this n.
- break
-
- if not some_change:
- break
-
- some_error = 0
- for (s,terminates) in Terminates.items():
- if not terminates:
- if not s in Prodnames and not s in Terminals and s != 'error':
- # s is used-but-not-defined, and we've already warned of that,
- # so it would be overkill to say that it's also non-terminating.
- pass
- else:
- sys.stderr.write("yacc: Infinite recursion detected for symbol '%s'.\n" % s)
- some_error = 1
+ self.Start = None # Starting symbol for the grammar
- return some_error
-# -----------------------------------------------------------------------------
-# verify_productions()
-#
-# This function examines all of the supplied rules to see if they seem valid.
-# -----------------------------------------------------------------------------
-def verify_productions(cycle_check=1):
- error = 0
- for p in Productions:
- if not p: continue
+ def __len__(self):
+ return len(self.Productions)
- for s in p.prod:
- if not s in Prodnames and not s in Terminals and s != 'error':
- sys.stderr.write("%s:%d: Symbol '%s' used, but not defined as a token or a rule.\n" % (p.file,p.line,s))
- error = 1
- continue
+ def __getitem__(self,index):
+ return self.Productions[index]
- unused_tok = 0
- # Now verify all of the tokens
- if yaccdebug:
- _vf.write("Unused terminals:\n\n")
- for s,v in Terminals.items():
- if s != 'error' and not v:
- sys.stderr.write("yacc: Warning. Token '%s' defined, but not used.\n" % s)
- if yaccdebug: _vf.write(" %s\n"% s)
- unused_tok += 1
-
- # Print out all of the productions
- if yaccdebug:
- _vf.write("\nGrammar\n\n")
- for i in range(1,len(Productions)):
- _vf.write("Rule %-5d %s\n" % (i, Productions[i]))
-
- unused_prod = 0
- # Verify the use of all productions
- for s,v in Nonterminals.items():
- if not v:
- p = Prodnames[s][0]
- sys.stderr.write("%s:%d: Warning. Rule '%s' defined, but not used.\n" % (p.file,p.line, s))
- unused_prod += 1
-
-
- if unused_tok == 1:
- sys.stderr.write("yacc: Warning. There is 1 unused token.\n")
- if unused_tok > 1:
- sys.stderr.write("yacc: Warning. There are %d unused tokens.\n" % unused_tok)
-
- if unused_prod == 1:
- sys.stderr.write("yacc: Warning. There is 1 unused rule.\n")
- if unused_prod > 1:
- sys.stderr.write("yacc: Warning. There are %d unused rules.\n" % unused_prod)
-
- if yaccdebug:
- _vf.write("\nTerminals, with rules where they appear\n\n")
- ks = list(Terminals)
- ks.sort()
- for k in ks:
- _vf.write("%-20s : %s\n" % (k, " ".join([str(s) for s in Terminals[k]])))
- _vf.write("\nNonterminals, with rules where they appear\n\n")
- ks = list(Nonterminals)
- ks.sort()
- for k in ks:
- _vf.write("%-20s : %s\n" % (k, " ".join([str(s) for s in Nonterminals[k]])))
-
- if (cycle_check):
- compute_reachable()
- error += compute_terminates()
-# error += check_cycles()
- return error
+ # -----------------------------------------------------------------------------
+ # set_precedence()
+ #
+ # Sets the precedence for a given terminal. assoc is the associativity such as
+ # 'left','right', or 'nonassoc'. level is a numeric level.
+ #
+ # -----------------------------------------------------------------------------
+
+    def set_precedence(self,term,assoc,level):
+        """Record the precedence (assoc,level) for the terminal term.
+
+        Raises GrammarError if term already has a precedence or if assoc
+        is not one of 'left', 'right' or 'nonassoc'.
+        """
+        # Precedence has to be declared before any production is added
+        assert self.Productions == [None],"Must call set_precedence() before add_production()"
+        already_declared = term in self.Precedence
+        if already_declared:
+            raise GrammarError("Precedence already specified for terminal '%s'" % term)
+        valid_assoc = ('left','right','nonassoc')
+        if assoc not in valid_assoc:
+            raise GrammarError("Associativity must be one of 'left','right', or 'nonassoc'")
+        self.Precedence[term] = (assoc,level)
+
+ # -----------------------------------------------------------------------------
+ # add_production()
+ #
+ # Given an action function, this function assembles a production rule and
+ # computes its precedence level.
+ #
+ # The production rule is supplied as a list of symbols. For example,
+ # a rule such as 'expr : expr PLUS term' has a production name of 'expr' and
+ # symbols ['expr','PLUS','term'].
+ #
+ # Precedence is determined by the precedence of the right-most terminal
+ # or the precedence of a terminal specified by %prec.
+ #
+ # A variety of error checks are performed to make sure production symbols
+ # are valid and that %prec is used correctly.
+ # -----------------------------------------------------------------------------
+
+ def add_production(self,prodname,syms,func=None,file='',line=0):
+
+ if prodname in self.Terminals:
+ raise GrammarError("%s:%d: Illegal rule name '%s'. Already defined as a token" % (file,line,prodname))
+ if prodname == 'error':
+ raise GrammarError("%s:%d: Illegal rule name '%s'. error is a reserved word" % (file,line,prodname))
+ if not _is_identifier.match(prodname):
+ raise GrammarError("%s:%d: Illegal rule name '%s'" % (file,line,prodname))
+
+ # Look for literal tokens
+ for n,s in enumerate(syms):
+ if s[0] in "'\"":
+ try:
+ c = eval(s)
+ if (len(c) > 1):
+ raise GrammarError("%s:%d: Literal token %s in rule '%s' may only be a single character" % (file,line,s, prodname))
+ if not c in self.Terminals:
+ self.Terminals[c] = []
+ syms[n] = c
+ continue
+ except SyntaxError:
+ pass
+ if not _is_identifier.match(s) and s != '%prec':
+ raise GrammarError("%s:%d: Illegal name '%s' in rule '%s'" % (file,line,s, prodname))
+
+ # Determine the precedence level
+ if '%prec' in syms:
+ if syms[-1] == '%prec':
+ raise GrammarError("%s:%d: Syntax error. Nothing follows %%prec" % (file,line))
+ if syms[-2] != '%prec':
+ raise GrammarError("%s:%d: Syntax error. %%prec can only appear at the end of a grammar rule" % (file,line))
+ precname = syms[-1]
+ prodprec = self.Precedence.get(precname,None)
+ if not prodprec:
+ raise GrammarError("%s:%d: Nothing known about the precedence of '%s'" % (file,line,precname))
+ else:
+ self.UsedPrecedence[precname] = 1
+ del syms[-2:] # Drop %prec from the rule
+ else:
+ # If no %prec, precedence is determined by the rightmost terminal symbol
+ precname = rightmost_terminal(syms,self.Terminals)
+ prodprec = self.Precedence.get(precname,('right',0))
+
+ # See if the rule is already in the rulemap
+ map = "%s -> %s" % (prodname,syms)
+ if map in self.Prodmap:
+ m = self.Prodmap[map]
+ raise GrammarError("%s:%d: Duplicate rule %s. " % (file,line, m) +
+ "Previous definition at %s:%d" % (file,line, m.file, m.line))
+
+ # From this point on, everything is valid. Create a new Production instance
+ pnumber = len(self.Productions)
+ if not prodname in self.Nonterminals:
+ self.Nonterminals[prodname] = [ ]
+
+ # Add the production number to Terminals and Nonterminals
+ for t in syms:
+ if t in self.Terminals:
+ self.Terminals[t].append(pnumber)
+ else:
+ if not t in self.Nonterminals:
+ self.Nonterminals[t] = [ ]
+ self.Nonterminals[t].append(pnumber)
-# -----------------------------------------------------------------------------
-# build_lritems()
-#
-# This function walks the list of productions and builds a complete set of the
-# LR items. The LR items are stored in two ways: First, they are uniquely
-# numbered and placed in the list _lritems. Second, a linked list of LR items
-# is built for each production. For example:
-#
-# E -> E PLUS E
-#
-# Creates the list
-#
-# [E -> . E PLUS E, E -> E . PLUS E, E -> E PLUS . E, E -> E PLUS E . ]
-# -----------------------------------------------------------------------------
+ # Create a production and add it to the list of productions
+ p = Production(pnumber,prodname,syms,prodprec,func,file,line)
+ self.Productions.append(p)
+ self.Prodmap[map] = p
-def build_lritems():
- for p in Productions:
- lastlri = p
- lri = p.lr_item(0)
- i = 0
- while 1:
- lri = p.lr_item(i)
- lastlri.lr_next = lri
- if not lri: break
- lri.lr_num = len(LRitems)
- LRitems.append(lri)
- lastlri = lri
- i += 1
+ # Add the production to the list kept for this rule name (self.Prodnames)
+ try:
+ self.Prodnames[prodname].append(p)
+ except KeyError:
+ self.Prodnames[prodname] = [ p ]
+ return 0
- # In order for the rest of the parser generator to work, we need to
- # guarantee that no more lritems are generated. Therefore, we nuke
- # the p.lr_item method. (Only used in debugging)
- # Production.lr_item = None
+ # -----------------------------------------------------------------------------
+ # set_start()
+ #
+ # Sets the starting symbol and creates the augmented grammar. Production
+ # rule 0 is S' -> start where start is the start symbol.
+ # -----------------------------------------------------------------------------
+
+ def set_start(self,start=None):
+ if not start:
+ start = self.Productions[1].name
+ if start not in self.Nonterminals:
+ raise GrammarError("start symbol %s undefined" % start)
+ self.Productions[0] = Production(0,"S'",[start])
+ self.Nonterminals[start].append(0)
+ self.Start = start
+
+ # -----------------------------------------------------------------------------
+ # find_unreachable()
+ #
+ # Find all of the nonterminal symbols that can't be reached from the starting
+ # symbol. Returns a list of nonterminals that can't be reached.
+ # -----------------------------------------------------------------------------
-# -----------------------------------------------------------------------------
-# add_precedence()
-#
-# Given a list of precedence rules, add to the precedence table.
-# -----------------------------------------------------------------------------
+ def find_unreachable(self):
+
+ # Mark all symbols that are reachable from a symbol s
+ def mark_reachable_from(s):
+ if reachable[s]:
+ # We've already reached symbol s.
+ return
+ reachable[s] = 1
+ for p in self.Prodnames.get(s,[]):
+ for r in p.prod:
+ mark_reachable_from(r)
+
+ reachable = { }
+ for s in list(self.Terminals) + list(self.Nonterminals):
+ reachable[s] = 0
+
+ mark_reachable_from( self.Productions[0].prod[0] )
+
+ return [s for s in list(self.Nonterminals)
+ if not reachable[s]]
+
+ # -----------------------------------------------------------------------------
+ # infinite_cycles()
+ #
+ # This function looks at the various parsing rules and tries to detect
+ # infinite recursion cycles (grammar rules where there is no possible way
+ # to derive a string of only terminals).
+ # -----------------------------------------------------------------------------
-def add_precedence(plist):
- plevel = 0
- error = 0
- for p in plist:
- plevel += 1
- try:
- prec = p[0]
- terms = p[1:]
- if prec != 'left' and prec != 'right' and prec != 'nonassoc':
- sys.stderr.write("yacc: Invalid precedence '%s'\n" % prec)
- return -1
- for t in terms:
- if t in Precedence:
- sys.stderr.write("yacc: Precedence already specified for terminal '%s'\n" % t)
- error += 1
- continue
- Precedence[t] = (prec,plevel)
- except Exception:
- sys.stderr.write("yacc: Invalid precedence table.\n")
- error += 1
+ def infinite_cycles(self):
+ terminates = {}
- return error
+ # Terminals:
+ for t in self.Terminals:
+ terminates[t] = 1
-# -----------------------------------------------------------------------------
-# check_precedence()
-#
-# Checks the use of the Precedence tables. This makes sure all of the symbols
-# are terminals or were used with %prec
-# -----------------------------------------------------------------------------
+ terminates['$end'] = 1
-def check_precedence():
- error = 0
- for precname in Precedence:
- if not (precname in Terminals or precname in UsedPrecedence):
- sys.stderr.write("yacc: Precedence rule '%s' defined for unknown symbol '%s'\n" % (Precedence[precname][0],precname))
- error += 1
- return error
+ # Nonterminals:
-# -----------------------------------------------------------------------------
-# augment_grammar()
-#
-# Compute the augmented grammar. This is just a rule S' -> start where start
-# is the starting symbol.
-# -----------------------------------------------------------------------------
+ # Initialize to false:
+ for n in self.Nonterminals:
+ terminates[n] = 0
-def augment_grammar(start=None):
- if not start:
- start = Productions[1].name
- Productions[0] = Production(name="S'",prod=[start],number=0,len=1,prec=('right',0),func=None)
- Productions[0].usyms = [ start ]
- Nonterminals[start].append(0)
+ # Then propagate termination until no change:
+ while 1:
+ some_change = 0
+ for (n,pl) in self.Prodnames.items():
+ # Nonterminal n terminates iff any of its productions terminates.
+ for p in pl:
+ # Production p terminates iff all of its rhs symbols terminate.
+ for s in p.prod:
+ if not terminates[s]:
+ # The symbol s does not terminate,
+ # so production p does not terminate.
+ p_terminates = 0
+ break
+ else:
+ # didn't break from the loop,
+ # so every symbol s terminates
+ # so production p terminates.
+ p_terminates = 1
+
+ if p_terminates:
+ # symbol n terminates!
+ if not terminates[n]:
+ terminates[n] = 1
+ some_change = 1
+ # Don't need to consider any more productions for this n.
+ break
+ if not some_change:
+ break
-# -------------------------------------------------------------------------
-# first()
-#
-# Compute the value of FIRST1(beta) where beta is a tuple of symbols.
-#
-# During execution of compute_first1, the result may be incomplete.
-# Afterward (e.g., when called from compute_follow()), it will be complete.
-# -------------------------------------------------------------------------
-def first(beta):
-
- # We are computing First(x1,x2,x3,...,xn)
- result = [ ]
- for x in beta:
- x_produces_empty = 0
-
- # Add all the non-<empty> symbols of First[x] to the result.
- for f in First[x]:
- if f == '<empty>':
- x_produces_empty = 1
- else:
- if f not in result: result.append(f)
+ infinite = []
+ for (s,term) in terminates.items():
+ if not term:
+ if not s in self.Prodnames and not s in self.Terminals and s != 'error':
+ # s is used-but-not-defined, and we've already warned of that,
+ # so it would be overkill to say that it's also non-terminating.
+ pass
+ else:
+ infinite.append(s)
- if x_produces_empty:
- # We have to consider the next x in beta,
- # i.e. stay in the loop.
- pass
- else:
- # We don't have to consider any further symbols in beta.
- break
- else:
- # There was no 'break' from the loop,
- # so x_produces_empty was true for all x in beta,
- # so beta produces empty as well.
- result.append('<empty>')
-
- return result
-
-
-# FOLLOW(x)
-# Given a non-terminal. This function computes the set of all symbols
-# that might follow it. Dragon book, p. 189.
-
-def compute_follow(start=None):
- # Add '$end' to the follow list of the start symbol
- for k in Nonterminals:
- Follow[k] = [ ]
-
- if not start:
- start = Productions[1].name
-
- Follow[start] = [ '$end' ]
-
- while 1:
- didadd = 0
- for p in Productions[1:]:
- # Here is the production set
- for i in range(len(p.prod)):
- B = p.prod[i]
- if B in Nonterminals:
- # Okay. We got a non-terminal in a production
- fst = first(p.prod[i+1:])
- hasempty = 0
- for f in fst:
- if f != '<empty>' and f not in Follow[B]:
- Follow[B].append(f)
- didadd = 1
- if f == '<empty>':
- hasempty = 1
- if hasempty or i == (len(p.prod)-1):
- # Add elements of follow(a) to follow(b)
- for f in Follow[p.name]:
- if f not in Follow[B]:
- Follow[B].append(f)
- didadd = 1
- if not didadd: break
+ return infinite
- if 0 and yaccdebug:
- _vf.write('\nFollow:\n')
- for k in Nonterminals:
- _vf.write("%-20s : %s\n" % (k, " ".join([str(s) for s in Follow[k]])))
-# -------------------------------------------------------------------------
-# compute_first1()
-#
-# Compute the value of FIRST1(X) for all symbols
-# -------------------------------------------------------------------------
-def compute_first1():
-
- # Terminals:
- for t in Terminals:
- First[t] = [t]
-
- First['$end'] = ['$end']
- First['#'] = ['#'] # what's this for?
-
- # Nonterminals:
-
- # Initialize to the empty set:
- for n in Nonterminals:
- First[n] = []
-
- # Then propagate symbols until no change:
- while 1:
- some_change = 0
- for n in Nonterminals:
- for p in Prodnames[n]:
- for f in first(p.prod):
- if f not in First[n]:
- First[n].append( f )
- some_change = 1
- if not some_change:
- break
-
- if 0 and yaccdebug:
- _vf.write('\nFirst:\n')
- for k in Nonterminals:
- _vf.write("%-20s : %s\n" %
- (k, " ".join([str(s) for s in First[k]])))
+ # -----------------------------------------------------------------------------
+ # undefined_symbols()
+ #
+ # Find all symbols that were used in the grammar, but not defined as tokens or
+ # grammar rules. Returns a list of tuples (sym, prod) where sym is the symbol
+ # and prod is the production where the symbol was used.
+ # -----------------------------------------------------------------------------
+ def undefined_symbols(self):
+ result = []
+ for p in self.Productions:
+ if not p: continue
-# -----------------------------------------------------------------------------
-# === SLR Generation ===
-#
-# The following functions are used to construct SLR (Simple LR) parsing tables
-# as described on p.221-229 of the dragon book.
-# -----------------------------------------------------------------------------
+ for s in p.prod:
+ if not s in self.Prodnames and not s in self.Terminals and s != 'error':
+ result.append((s,p))
+ return result
-# Global variables for the LR parsing engine
-def lr_init_vars():
- global _lr_action, _lr_goto, _lr_method
- global _lr_goto_cache, _lr0_cidhash
-
- _lr_action = { } # Action table
- _lr_goto = { } # Goto table
- _lr_method = "Unknown" # LR method used
- _lr_goto_cache = { }
- _lr0_cidhash = { }
-
-
-# Compute the LR(0) closure operation on I, where I is a set of LR(0) items.
-# prodlist is a list of productions.
-
-_add_count = 0 # Counter used to detect cycles
-
-def lr0_closure(I):
- global _add_count
-
- _add_count += 1
- prodlist = Productions
-
- # Add everything in I to J
- J = I[:]
- didadd = 1
- while didadd:
- didadd = 0
- for j in J:
- for x in j.lrafter:
- if x.lr0_added == _add_count: continue
- # Add B --> .G to J
- J.append(x.lr_next)
- x.lr0_added = _add_count
- didadd = 1
-
- return J
-
-# Compute the LR(0) goto function goto(I,X) where I is a set
-# of LR(0) items and X is a grammar symbol. This function is written
-# in a way that guarantees uniqueness of the generated goto sets
-# (i.e. the same goto set will never be returned as two different Python
-# objects). With uniqueness, we can later do fast set comparisons using
-# id(obj) instead of element-wise comparison.
-
-def lr0_goto(I,x):
- # First we look for a previously cached entry
- g = _lr_goto_cache.get((id(I),x),None)
- if g: return g
-
- # Now we generate the goto set in a way that guarantees uniqueness
- # of the result
-
- s = _lr_goto_cache.get(x,None)
- if not s:
- s = { }
- _lr_goto_cache[x] = s
-
- gs = [ ]
- for p in I:
- n = p.lr_next
- if n and n.lrbefore == x:
- s1 = s.get(id(n),None)
- if not s1:
- s1 = { }
- s[id(n)] = s1
- gs.append(n)
- s = s1
- g = s.get('$end',None)
- if not g:
- if gs:
- g = lr0_closure(gs)
- s['$end'] = g
+ # -----------------------------------------------------------------------------
+ # unused_terminals()
+ #
+ # Find all terminals that were defined, but not used by the grammar. Returns
+ # a list of all symbols.
+ # -----------------------------------------------------------------------------
+ def unused_terminals(self):
+ unused_tok = []
+ for s,v in self.Terminals.items():
+ if s != 'error' and not v:
+ unused_tok.append(s)
+
+ return unused_tok
+
+ # ------------------------------------------------------------------------------
+ # unused_rules()
+ #
+ # Find all grammar rules that were defined, but not used (maybe not reachable)
+ # Returns a list of productions.
+ # ------------------------------------------------------------------------------
+
+ def unused_rules(self):
+ unused_prod = []
+ for s,v in self.Nonterminals.items():
+ if not v:
+ p = self.Prodnames[s][0]
+ unused_prod.append(p)
+ return unused_prod
+
+ # -----------------------------------------------------------------------------
+ # unused_precedence()
+ #
+ # Returns a list of tuples (term,precedence) corresponding to precedence
+ # rules that were never used by the grammar. term is the name of the terminal
+ # on which precedence was applied and precedence is a string such as 'left' or
+ # 'right' corresponding to the type of precedence.
+ # -----------------------------------------------------------------------------
+
+ def unused_precedence(self):
+ unused = []
+ for termname in self.Precedence:
+ if not (termname in self.Terminals or termname in self.UsedPrecedence):
+ unused.append((termname,self.Precedence[termname][0]))
+
+ return unused
+
+ # -------------------------------------------------------------------------
+ # _first()
+ #
+ # Compute the value of FIRST1(beta) where beta is a tuple of symbols.
+ #
+ # During execution of compute_first(), the result may be incomplete.
+ # Afterward (e.g., when called from compute_follow()), it will be complete.
+ # -------------------------------------------------------------------------
+ def _first(self,beta):
+
+ # We are computing First(x1,x2,x3,...,xn)
+ result = [ ]
+ for x in beta:
+ x_produces_empty = 0
+
+ # Add all the non-<empty> symbols of First[x] to the result.
+ for f in self.First[x]:
+ if f == '<empty>':
+ x_produces_empty = 1
+ else:
+ if f not in result: result.append(f)
+
+ if x_produces_empty:
+ # We have to consider the next x in beta,
+ # i.e. stay in the loop.
+ pass
+ else:
+ # We don't have to consider any further symbols in beta.
+ break
else:
- s['$end'] = gs
- _lr_goto_cache[(id(I),x)] = g
- return g
-
-_lr0_cidhash = { }
-
-# Compute the LR(0) sets of item function
-def lr0_items():
-
- C = [ lr0_closure([Productions[0].lr_next]) ]
- i = 0
- for I in C:
- _lr0_cidhash[id(I)] = i
- i += 1
-
- # Loop over the items in C and each grammar symbols
- i = 0
- while i < len(C):
- I = C[i]
- i += 1
-
- # Collect all of the symbols that could possibly be in the goto(I,X) sets
- asyms = { }
- for ii in I:
- for s in ii.usyms:
- asyms[s] = None
-
- for x in asyms:
- g = lr0_goto(I,x)
- if not g: continue
- if id(g) in _lr0_cidhash: continue
- _lr0_cidhash[id(g)] = len(C)
- C.append(g)
-
- return C
+ # There was no 'break' from the loop,
+ # so x_produces_empty was true for all x in beta,
+ # so beta produces empty as well.
+ result.append('<empty>')
-# -----------------------------------------------------------------------------
-# ==== LALR(1) Parsing ====
-#
-# LALR(1) parsing is almost exactly the same as SLR except that instead of
-# relying upon Follow() sets when performing reductions, a more selective
-# lookahead set that incorporates the state of the LR(0) machine is utilized.
-# Thus, we mainly just have to focus on calculating the lookahead sets.
-#
-# The method used here is due to DeRemer and Pennelo (1982).
-#
-# DeRemer, F. L., and T. J. Pennelo: "Efficient Computation of LALR(1)
-# Lookahead Sets", ACM Transactions on Programming Languages and Systems,
-# Vol. 4, No. 4, Oct. 1982, pp. 615-649
-#
-# Further details can also be found in:
-#
-# J. Tremblay and P. Sorenson, "The Theory and Practice of Compiler Writing",
-# McGraw-Hill Book Company, (1985).
-#
-# Note: This implementation is a complete replacement of the LALR(1)
-# implementation in PLY-1.x releases. That version was based on
-# a less efficient algorithm and it had bugs in its implementation.
-# -----------------------------------------------------------------------------
+ return result
-# -----------------------------------------------------------------------------
-# compute_nullable_nonterminals()
-#
-# Creates a dictionary containing all of the non-terminals that might produce
-# an empty production.
-# -----------------------------------------------------------------------------
+ # -------------------------------------------------------------------------
+ # compute_first()
+ #
+ # Compute the value of FIRST1(X) for all symbols
+ # -------------------------------------------------------------------------
+ def compute_first(self):
+ if self.First:
+ return self.First
-def compute_nullable_nonterminals():
- nullable = {}
- num_nullable = 0
- while 1:
- for p in Productions[1:]:
- if p.len == 0:
- nullable[p.name] = 1
- continue
- for t in p.prod:
- if not t in nullable: break
- else:
- nullable[p.name] = 1
- if len(nullable) == num_nullable: break
- num_nullable = len(nullable)
- return nullable
+ # Terminals:
+ for t in self.Terminals:
+ self.First[t] = [t]
-# -----------------------------------------------------------------------------
-# find_nonterminal_trans(C)
-#
-# Given a set of LR(0) items, this functions finds all of the non-terminal
-# transitions. These are transitions in which a dot appears immediately before
-# a non-terminal. Returns a list of tuples of the form (state,N) where state
-# is the state number and N is the nonterminal symbol.
-#
-# The input C is the set of LR(0) items.
-# -----------------------------------------------------------------------------
+ self.First['$end'] = ['$end']
-def find_nonterminal_transitions(C):
- trans = []
- for state in range(len(C)):
- for p in C[state]:
- if p.lr_index < p.len - 1:
- t = (state,p.prod[p.lr_index+1])
- if t[1] in Nonterminals:
- if t not in trans: trans.append(t)
- state = state + 1
- return trans
+ # Nonterminals:
-# -----------------------------------------------------------------------------
-# dr_relation()
-#
-# Computes the DR(p,A) relationships for non-terminal transitions. The input
-# is a tuple (state,N) where state is a number and N is a nonterminal symbol.
-#
-# Returns a list of terminals.
-# -----------------------------------------------------------------------------
+ # Initialize to the empty set:
+ for n in self.Nonterminals:
+ self.First[n] = []
+
+ # Then propagate symbols until no change:
+ while 1:
+ some_change = 0
+ for n in self.Nonterminals:
+ for p in self.Prodnames[n]:
+ for f in self._first(p.prod):
+ if f not in self.First[n]:
+ self.First[n].append( f )
+ some_change = 1
+ if not some_change:
+ break
+
+ return self.First
+
+ # ---------------------------------------------------------------------
+ # compute_follow()
+ #
+ # Computes all of the follow sets for every non-terminal symbol. The
+ # follow set is the set of all symbols that might follow a given
+ # non-terminal. See the Dragon book, 2nd Ed. p. 189.
+ # ---------------------------------------------------------------------
+ def compute_follow(self,start=None):
+ # If already computed, return the result
+ if self.Follow:
+ return self.Follow
+
+ # If first sets not computed yet, do that first.
+ if not self.First:
+ self.compute_first()
-def dr_relation(C,trans,nullable):
- dr_set = { }
- state,N = trans
- terms = []
+ # Add '$end' to the follow list of the start symbol
+ for k in self.Nonterminals:
+ self.Follow[k] = [ ]
- g = lr0_goto(C[state],N)
- for p in g:
- if p.lr_index < p.len - 1:
- a = p.prod[p.lr_index+1]
- if a in Terminals:
- if a not in terms: terms.append(a)
+ if not start:
+ start = self.Productions[1].name
- # This extra bit is to handle the start state
- if state == 0 and N == Productions[0].prod[0]:
- terms.append('$end')
+ self.Follow[start] = [ '$end' ]
- return terms
+ while 1:
+ didadd = 0
+ for p in self.Productions[1:]:
+ # Here is the production set
+ for i in range(len(p.prod)):
+ B = p.prod[i]
+ if B in self.Nonterminals:
+ # Okay. We got a non-terminal in a production
+ fst = self._first(p.prod[i+1:])
+ hasempty = 0
+ for f in fst:
+ if f != '<empty>' and f not in self.Follow[B]:
+ self.Follow[B].append(f)
+ didadd = 1
+ if f == '<empty>':
+ hasempty = 1
+ if hasempty or i == (len(p.prod)-1):
+ # Add elements of follow(a) to follow(b)
+ for f in self.Follow[p.name]:
+ if f not in self.Follow[B]:
+ self.Follow[B].append(f)
+ didadd = 1
+ if not didadd: break
+ return self.Follow
+
+
+ # -----------------------------------------------------------------------------
+ # build_lritems()
+ #
+ # This function walks the list of productions and builds a complete set of the
+ # LR items. The LR items are stored in two ways: First, they are collected,
+ # in order, in the list p.lr_items of each production. Second, a linked list
+ # of LR items is built for each production through lr_next. For example:
+ #
+ # E -> E PLUS E
+ #
+ # Creates the list
+ #
+ # [E -> . E PLUS E, E -> E . PLUS E, E -> E PLUS . E, E -> E PLUS E . ]
+ # -----------------------------------------------------------------------------
+
+ def build_lritems(self):
+ for p in self.Productions:
+ lastlri = p
+ i = 0
+ lr_items = []
+ while 1:
+ if i > len(p):
+ lri = None
+ else:
+ lri = LRItem(p,i)
+ # Precompute the list of productions immediately following
+ try:
+ lri.lr_after = self.Prodnames[lri.prod[i+1]]
+ except (IndexError,KeyError):
+ lri.lr_after = []
+ try:
+ lri.lr_before = lri.prod[i-1]
+ except IndexError:
+ lri.lr_before = None
+
+ lastlri.lr_next = lri
+ if not lri: break
+ lr_items.append(lri)
+ lastlri = lri
+ i += 1
+ p.lr_items = lr_items
# -----------------------------------------------------------------------------
-# reads_relation()
+# == Class LRTable ==
#
-# Computes the READS() relation (p,A) READS (t,C).
+# This class represents a basic table of LR parsing information.
+# Methods for generating the tables are not defined here. They are defined
+# in the derived class LRGeneratedTable.
# -----------------------------------------------------------------------------
-def reads_relation(C, trans, empty):
- # Look for empty transitions
- rel = []
- state, N = trans
+class VersionError(YaccError): pass
+
+class LRTable(object):
+ def __init__(self):
+ self.lr_action = None
+ self.lr_goto = None
+ self.lr_productions = None
+ self.lr_method = None
+
+ def read_table(self,module):
+ if isinstance(module,types.ModuleType):
+ parsetab = module
+ else:
+ if sys.version_info[0] < 3:
+ exec("import %s as parsetab" % module)
+ else:
+ env = { }
+ exec("import %s as parsetab" % module, env, env)
+ parsetab = env['parsetab']
+
+ if parsetab._tabversion != __tabversion__:
+ raise VersionError("yacc table file version is out of date")
+
+ self.lr_action = parsetab._lr_action
+ self.lr_goto = parsetab._lr_goto
- g = lr0_goto(C[state],N)
- j = _lr0_cidhash.get(id(g),-1)
- for p in g:
- if p.lr_index < p.len - 1:
- a = p.prod[p.lr_index + 1]
- if a in empty:
- rel.append((j,a))
+ self.lr_productions = []
+ for p in parsetab._lr_productions:
+ self.lr_productions.append(MiniProduction(*p))
- return rel
+ self.lr_method = parsetab._lr_method
+ return parsetab._lr_signature
+ # Bind all production function names to callable objects in pdict
+ def bind_callables(self,pdict):
+ for p in self.lr_productions:
+ p.bind(pdict)
+
# -----------------------------------------------------------------------------
-# compute_lookback_includes()
-#
-# Determines the lookback and includes relations
-#
-# LOOKBACK:
-#
-# This relation is determined by running the LR(0) state machine forward.
-# For example, starting with a production "N : . A B C", we run it forward
-# to obtain "N : A B C ." We then build a relationship between this final
-# state and the starting state. These relationships are stored in a dictionary
-# lookdict.
-#
-# INCLUDES:
-#
-# Computes the INCLUDE() relation (p,A) INCLUDES (p',B).
-#
-# This relation is used to determine non-terminal transitions that occur
-# inside of other non-terminal transition states. (p,A) INCLUDES (p', B)
-# if the following holds:
-#
-# B -> LAT, where T -> epsilon and p' -L-> p
-#
-# L is essentially a prefix (which may be empty), T is a suffix that must be
-# able to derive an empty string. State p' must lead to state p with the string L.
+# === LR Generator ===
#
+# The following classes and functions are used to generate LR parsing tables on
+# a grammar.
# -----------------------------------------------------------------------------
-def compute_lookback_includes(C,trans,nullable):
-
- lookdict = {} # Dictionary of lookback relations
- includedict = {} # Dictionary of include relations
-
- # Make a dictionary of non-terminal transitions
- dtrans = {}
- for t in trans:
- dtrans[t] = 1
-
- # Loop over all transitions and compute lookbacks and includes
- for state,N in trans:
- lookb = []
- includes = []
- for p in C[state]:
- if p.name != N: continue
-
- # Okay, we have a name match. We now follow the production all the way
- # through the state machine until we get the . on the right hand side
-
- lr_index = p.lr_index
- j = state
- while lr_index < p.len - 1:
- lr_index = lr_index + 1
- t = p.prod[lr_index]
-
- # Check to see if this symbol and state are a non-terminal transition
- if (j,t) in dtrans:
- # Yes. Okay, there is some chance that this is an includes relation
- # the only way to know for certain is whether the rest of the
- # production derives empty
-
- li = lr_index + 1
- while li < p.len:
- if p.prod[li] in Terminals: break # No forget it
- if not p.prod[li] in nullable: break
- li = li + 1
- else:
- # Appears to be a relation between (j,t) and (state,N)
- includes.append((j,t))
-
- g = lr0_goto(C[j],t) # Go to next set
- j = _lr0_cidhash.get(id(g),-1) # Go to next state
-
- # When we get here, j is the final state, now we have to locate the production
- for r in C[j]:
- if r.name != p.name: continue
- if r.len != p.len: continue
- i = 0
- # This look is comparing a production ". A B C" with "A B C ."
- while i < r.lr_index:
- if r.prod[i] != p.prod[i+1]: break
- i = i + 1
- else:
- lookb.append((j,r))
- for i in includes:
- if not i in includedict: includedict[i] = []
- includedict[i].append((state,N))
- lookdict[(state,N)] = lookb
-
- return lookdict,includedict
-
# -----------------------------------------------------------------------------
# digraph()
# traverse()
@@ -2224,349 +1870,659 @@ def traverse(x,N,stack,F,X,R,FP):
F[stack[-1]] = F[x]
element = stack.pop()
+class LALRError(YaccError): pass
+
# -----------------------------------------------------------------------------
-# compute_read_sets()
+# == LRGeneratedTable ==
#
-# Given a set of LR(0) items, this function computes the read sets.
-#
-# Inputs: C = Set of LR(0) items
-# ntrans = Set of nonterminal transitions
-# nullable = Set of empty transitions
-#
-# Returns a set containing the read sets
+# This class implements the LR table generation algorithm. There are no
+# public methods except for write_table()
# -----------------------------------------------------------------------------
-def compute_read_sets(C, ntrans, nullable):
- FP = lambda x: dr_relation(C,x,nullable)
- R = lambda x: reads_relation(C,x,nullable)
- F = digraph(ntrans,R,FP)
- return F
+class LRGeneratedTable(LRTable):
+ def __init__(self,grammar,method='LALR',log=None):
+ if method not in ['SLR','LALR']:
+ raise LALRError("Unsupported method %s" % method)
+
+ self.grammar = grammar
+ self.lr_method = method
+
+ # Set up the logger
+ if not log:
+ log = NullLogger()
+ self.log = log
+
+ # Internal attributes
+ self.lr_action = {} # Action table
+ self.lr_goto = {} # Goto table
+ self.lr_productions = grammar.Productions # Copy of grammar Production array
+ self.lr_goto_cache = {} # Cache of computed gotos
+ self.lr0_cidhash = {} # Cache of closures
+
+ self._add_count = 0 # Internal counter used to detect cycles
+
+ # Diagnostic information filled in by the table generator
+ self.sr_conflict = 0
+ self.rr_conflict = 0
+ self.conflicts = [] # List of conflicts
+
+ self.sr_conflicts = []
+ self.rr_conflicts = []
+
+ # Build the tables
+ self.grammar.build_lritems()
+ self.grammar.compute_first()
+ self.grammar.compute_follow()
+ self.lr_parse_table()
+
+ # Compute the LR(0) closure operation on I, where I is a set of LR(0) items.
+
+ def lr0_closure(self,I):
+ self._add_count += 1
+
+ # Add everything in I to J
+ J = I[:]
+ didadd = 1
+ while didadd:
+ didadd = 0
+ for j in J:
+ for x in j.lr_after:
+ if getattr(x,"lr0_added",0) == self._add_count: continue
+ # Add B --> .G to J
+ J.append(x.lr_next)
+ x.lr0_added = self._add_count
+ didadd = 1
+
+ return J
+
+ # Compute the LR(0) goto function goto(I,X) where I is a set
+ # of LR(0) items and X is a grammar symbol. This function is written
+ # in a way that guarantees uniqueness of the generated goto sets
+ # (i.e. the same goto set will never be returned as two different Python
+ # objects). With uniqueness, we can later do fast set comparisons using
+ # id(obj) instead of element-wise comparison.
+
+ def lr0_goto(self,I,x):
+ # First we look for a previously cached entry
+ g = self.lr_goto_cache.get((id(I),x),None)
+ if g: return g
+
+ # Now we generate the goto set in a way that guarantees uniqueness
+ # of the result
+
+ s = self.lr_goto_cache.get(x,None)
+ if not s:
+ s = { }
+ self.lr_goto_cache[x] = s
+
+ gs = [ ]
+ for p in I:
+ n = p.lr_next
+ if n and n.lr_before == x:
+ s1 = s.get(id(n),None)
+ if not s1:
+ s1 = { }
+ s[id(n)] = s1
+ gs.append(n)
+ s = s1
+ g = s.get('$end',None)
+ if not g:
+ if gs:
+ g = self.lr0_closure(gs)
+ s['$end'] = g
+ else:
+ s['$end'] = gs
+ self.lr_goto_cache[(id(I),x)] = g
+ return g
-# -----------------------------------------------------------------------------
-# compute_follow_sets()
-#
-# Given a set of LR(0) items, a set of non-terminal transitions, a readset,
-# and an include set, this function computes the follow sets
-#
-# Follow(p,A) = Read(p,A) U U {Follow(p',B) | (p,A) INCLUDES (p',B)}
-#
-# Inputs:
-# ntrans = Set of nonterminal transitions
-# readsets = Readset (previously computed)
-# inclsets = Include sets (previously computed)
-#
-# Returns a set containing the follow sets
-# -----------------------------------------------------------------------------
+ # Compute the LR(0) sets of item function
+ def lr0_items(self):
-def compute_follow_sets(ntrans,readsets,inclsets):
- FP = lambda x: readsets[x]
- R = lambda x: inclsets.get(x,[])
- F = digraph(ntrans,R,FP)
- return F
+ C = [ self.lr0_closure([self.grammar.Productions[0].lr_next]) ]
+ i = 0
+ for I in C:
+ self.lr0_cidhash[id(I)] = i
+ i += 1
-# -----------------------------------------------------------------------------
-# add_lookaheads()
-#
-# Attaches the lookahead symbols to grammar rules.
-#
-# Inputs: lookbacks - Set of lookback relations
-# followset - Computed follow set
-#
-# This function directly attaches the lookaheads to productions contained
-# in the lookbacks set
-# -----------------------------------------------------------------------------
+ # Loop over the items in C and each grammar symbol
+ i = 0
+ while i < len(C):
+ I = C[i]
+ i += 1
-def add_lookaheads(lookbacks,followset):
- for trans,lb in lookbacks.items():
- # Loop over productions in lookback
- for state,p in lb:
- if not state in p.lookaheads:
- p.lookaheads[state] = []
- f = followset.get(trans,[])
- for a in f:
- if a not in p.lookaheads[state]: p.lookaheads[state].append(a)
+ # Collect all of the symbols that could possibly be in the goto(I,X) sets
+ asyms = { }
+ for ii in I:
+ for s in ii.usyms:
+ asyms[s] = None
-# -----------------------------------------------------------------------------
-# add_lalr_lookaheads()
-#
-# This function does all of the work of adding lookahead information for use
-# with LALR parsing
-# -----------------------------------------------------------------------------
+ for x in asyms:
+ g = self.lr0_goto(I,x)
+ if not g: continue
+ if id(g) in self.lr0_cidhash: continue
+ self.lr0_cidhash[id(g)] = len(C)
+ C.append(g)
-def add_lalr_lookaheads(C):
- # Determine all of the nullable nonterminals
- nullable = compute_nullable_nonterminals()
+ return C
- # Find all non-terminal transitions
- trans = find_nonterminal_transitions(C)
+ # -----------------------------------------------------------------------------
+ # ==== LALR(1) Parsing ====
+ #
+ # LALR(1) parsing is almost exactly the same as SLR except that instead of
+ # relying upon Follow() sets when performing reductions, a more selective
+ # lookahead set that incorporates the state of the LR(0) machine is utilized.
+ # Thus, we mainly just have to focus on calculating the lookahead sets.
+ #
+ # The method used here is due to DeRemer and Pennello (1982).
+ #
+ # DeRemer, F. L., and T. J. Pennello: "Efficient Computation of LALR(1)
+ # Lookahead Sets", ACM Transactions on Programming Languages and Systems,
+ # Vol. 4, No. 4, Oct. 1982, pp. 615-649
+ #
+ # Further details can also be found in:
+ #
+ # J. Tremblay and P. Sorenson, "The Theory and Practice of Compiler Writing",
+ # McGraw-Hill Book Company, (1985).
+ #
+ # -----------------------------------------------------------------------------
- # Compute read sets
- readsets = compute_read_sets(C,trans,nullable)
+ # -----------------------------------------------------------------------------
+ # compute_nullable_nonterminals()
+ #
+ # Creates a dictionary containing all of the non-terminals that might produce
+ # an empty production.
+ # -----------------------------------------------------------------------------
- # Compute lookback/includes relations
- lookd, included = compute_lookback_includes(C,trans,nullable)
+ def compute_nullable_nonterminals(self):
+ nullable = {}
+ num_nullable = 0
+ while 1:
+ for p in self.grammar.Productions[1:]:
+ if p.len == 0:
+ nullable[p.name] = 1
+ continue
+ for t in p.prod:
+ if not t in nullable: break
+ else:
+ nullable[p.name] = 1
+ if len(nullable) == num_nullable: break
+ num_nullable = len(nullable)
+ return nullable
+
+ # -----------------------------------------------------------------------------
+ # find_nonterminal_transitions(C)
+ #
+ # Given a set of LR(0) items, this function finds all of the non-terminal
+ # transitions. These are transitions in which a dot appears immediately before
+ # a non-terminal. Returns a list of tuples of the form (state,N) where state
+ # is the state number and N is the nonterminal symbol.
+ #
+ # The input C is the set of LR(0) items.
+ # -----------------------------------------------------------------------------
+
+ def find_nonterminal_transitions(self,C):
+ trans = []
+ for state in range(len(C)):
+ for p in C[state]:
+ if p.lr_index < p.len - 1:
+ t = (state,p.prod[p.lr_index+1])
+ if t[1] in self.grammar.Nonterminals:
+ if t not in trans: trans.append(t)
+ state = state + 1
+ return trans
+
+ # -----------------------------------------------------------------------------
+ # dr_relation()
+ #
+ # Computes the DR(p,A) relationships for non-terminal transitions. The input
+ # is a tuple (state,N) where state is a number and N is a nonterminal symbol.
+ #
+ # Returns a list of terminals.
+ # -----------------------------------------------------------------------------
- # Compute LALR FOLLOW sets
- followsets = compute_follow_sets(trans,readsets,included)
+ def dr_relation(self,C,trans,nullable):
+ dr_set = { }
+ state,N = trans
+ terms = []
- # Add all of the lookaheads
- add_lookaheads(lookd,followsets)
+ g = self.lr0_goto(C[state],N)
+ for p in g:
+ if p.lr_index < p.len - 1:
+ a = p.prod[p.lr_index+1]
+ if a in self.grammar.Terminals:
+ if a not in terms: terms.append(a)
-# -----------------------------------------------------------------------------
-# lr_parse_table()
-#
-# This function constructs the parse tables for SLR or LALR
-# -----------------------------------------------------------------------------
-def lr_parse_table(method):
- global _lr_method
- goto = _lr_goto # Goto array
- action = _lr_action # Action array
- actionp = { } # Action production array (temporary)
+ # This extra bit is to handle the start state
+ if state == 0 and N == self.grammar.Productions[0].prod[0]:
+ terms.append('$end')
- _lr_method = method
+ return terms
- n_srconflict = 0
- n_rrconflict = 0
+ # -----------------------------------------------------------------------------
+ # reads_relation()
+ #
+ # Computes the READS() relation (p,A) READS (t,C).
+ # -----------------------------------------------------------------------------
+
+ def reads_relation(self,C, trans, empty):
+ # Look for empty transitions
+ rel = []
+ state, N = trans
+
+ g = self.lr0_goto(C[state],N)
+ j = self.lr0_cidhash.get(id(g),-1)
+ for p in g:
+ if p.lr_index < p.len - 1:
+ a = p.prod[p.lr_index + 1]
+ if a in empty:
+ rel.append((j,a))
+
+ return rel
+
+ # -----------------------------------------------------------------------------
+ # compute_lookback_includes()
+ #
+ # Determines the lookback and includes relations
+ #
+ # LOOKBACK:
+ #
+ # This relation is determined by running the LR(0) state machine forward.
+ # For example, starting with a production "N : . A B C", we run it forward
+ # to obtain "N : A B C ." We then build a relationship between this final
+ # state and the starting state. These relationships are stored in a dictionary
+ # lookdict.
+ #
+ # INCLUDES:
+ #
+ # Computes the INCLUDE() relation (p,A) INCLUDES (p',B).
+ #
+ # This relation is used to determine non-terminal transitions that occur
+ # inside of other non-terminal transition states. (p,A) INCLUDES (p', B)
+ # if the following holds:
+ #
+ # B -> LAT, where T -> epsilon and p' -L-> p
+ #
+ # L is essentially a prefix (which may be empty), T is a suffix that must be
+ # able to derive an empty string. State p' must lead to state p with the string L.
+ #
+ # -----------------------------------------------------------------------------
+
+ def compute_lookback_includes(self,C,trans,nullable):
+
+ lookdict = {} # Dictionary of lookback relations
+ includedict = {} # Dictionary of include relations
+
+ # Make a dictionary of non-terminal transitions
+ dtrans = {}
+ for t in trans:
+ dtrans[t] = 1
+
+ # Loop over all transitions and compute lookbacks and includes
+ for state,N in trans:
+ lookb = []
+ includes = []
+ for p in C[state]:
+ if p.name != N: continue
+
+ # Okay, we have a name match. We now follow the production all the way
+ # through the state machine until we get the . on the right hand side
+
+ lr_index = p.lr_index
+ j = state
+ while lr_index < p.len - 1:
+ lr_index = lr_index + 1
+ t = p.prod[lr_index]
+
+ # Check to see if this symbol and state are a non-terminal transition
+ if (j,t) in dtrans:
+ # Yes. Okay, there is some chance that this is an includes relation
+ # the only way to know for certain is whether the rest of the
+ # production derives empty
+
+ li = lr_index + 1
+ while li < p.len:
+ if p.prod[li] in self.grammar.Terminals: break # No forget it
+ if not p.prod[li] in nullable: break
+ li = li + 1
+ else:
+ # Appears to be a relation between (j,t) and (state,N)
+ includes.append((j,t))
+
+ g = self.lr0_goto(C[j],t) # Go to next set
+ j = self.lr0_cidhash.get(id(g),-1) # Go to next state
+
+ # When we get here, j is the final state, now we have to locate the production
+ for r in C[j]:
+ if r.name != p.name: continue
+ if r.len != p.len: continue
+ i = 0
+ # This loop is comparing a production ". A B C" with "A B C ."
+ while i < r.lr_index:
+ if r.prod[i] != p.prod[i+1]: break
+ i = i + 1
+ else:
+ lookb.append((j,r))
+ for i in includes:
+ if not i in includedict: includedict[i] = []
+ includedict[i].append((state,N))
+ lookdict[(state,N)] = lookb
+
+ return lookdict,includedict
+
+ # -----------------------------------------------------------------------------
+ # compute_read_sets()
+ #
+ # Given a set of LR(0) items, this function computes the read sets.
+ #
+ # Inputs: C = Set of LR(0) items
+ # ntrans = Set of nonterminal transitions
+ # nullable = Set of empty transitions
+ #
+ # Returns a set containing the read sets
+ # -----------------------------------------------------------------------------
+
+ def compute_read_sets(self,C, ntrans, nullable):
+ FP = lambda x: self.dr_relation(C,x,nullable)
+ R = lambda x: self.reads_relation(C,x,nullable)
+ F = digraph(ntrans,R,FP)
+ return F
+
+ # -----------------------------------------------------------------------------
+ # compute_follow_sets()
+ #
+ # Given a set of LR(0) items, a set of non-terminal transitions, a readset,
+ # and an include set, this function computes the follow sets
+ #
+ # Follow(p,A) = Read(p,A) U U {Follow(p',B) | (p,A) INCLUDES (p',B)}
+ #
+ # Inputs:
+ # ntrans = Set of nonterminal transitions
+ # readsets = Readset (previously computed)
+ # inclsets = Include sets (previously computed)
+ #
+ # Returns a set containing the follow sets
+ # -----------------------------------------------------------------------------
+
+ def compute_follow_sets(self,ntrans,readsets,inclsets):
+ FP = lambda x: readsets[x]
+ R = lambda x: inclsets.get(x,[])
+ F = digraph(ntrans,R,FP)
+ return F
- if yaccdebug:
- sys.stderr.write("yacc: Generating %s parsing table...\n" % method)
- _vf.write("\n\nParsing method: %s\n\n" % method)
+ # -----------------------------------------------------------------------------
+ # add_lookaheads()
+ #
+ # Attaches the lookahead symbols to grammar rules.
+ #
+ # Inputs: lookbacks - Set of lookback relations
+ # followset - Computed follow set
+ #
+ # This function directly attaches the lookaheads to productions contained
+ # in the lookbacks set
+ # -----------------------------------------------------------------------------
+
+ def add_lookaheads(self,lookbacks,followset):
+ for trans,lb in lookbacks.items():
+ # Loop over productions in lookback
+ for state,p in lb:
+ if not state in p.lookaheads:
+ p.lookaheads[state] = []
+ f = followset.get(trans,[])
+ for a in f:
+ if a not in p.lookaheads[state]: p.lookaheads[state].append(a)
+
+ # -----------------------------------------------------------------------------
+ # add_lalr_lookaheads()
+ #
+ # This function does all of the work of adding lookahead information for use
+ # with LALR parsing
+ # -----------------------------------------------------------------------------
- # Step 1: Construct C = { I0, I1, ... IN}, collection of LR(0) items
- # This determines the number of states
+ def add_lalr_lookaheads(self,C):
+ # Determine all of the nullable nonterminals
+ nullable = self.compute_nullable_nonterminals()
- C = lr0_items()
+ # Find all non-terminal transitions
+ trans = self.find_nonterminal_transitions(C)
- if method == 'LALR':
- add_lalr_lookaheads(C)
+ # Compute read sets
+ readsets = self.compute_read_sets(C,trans,nullable)
+ # Compute lookback/includes relations
+ lookd, included = self.compute_lookback_includes(C,trans,nullable)
- # Build the parser table, state by state
- st = 0
- for I in C:
- # Loop over each production in I
- actlist = [ ] # List of actions
- st_action = { }
- st_actionp = { }
- st_goto = { }
- if yaccdebug:
- _vf.write("\nstate %d\n\n" % st)
+ # Compute LALR FOLLOW sets
+ followsets = self.compute_follow_sets(trans,readsets,included)
+
+ # Add all of the lookaheads
+ self.add_lookaheads(lookd,followsets)
+
+ # -----------------------------------------------------------------------------
+ # lr_parse_table()
+ #
+ # This function constructs the parse tables for SLR or LALR
+ # -----------------------------------------------------------------------------
+ def lr_parse_table(self):
+ goto = self.lr_goto # Goto array
+ action = self.lr_action # Action array
+ log = self.log # Logger for output
+
+ actionp = { } # Action production array (temporary)
+
+ log.info("Parsing method: %s", self.lr_method)
+
+ # Step 1: Construct C = { I0, I1, ... IN}, collection of LR(0) items
+ # This determines the number of states
+
+ C = self.lr0_items()
+
+ if self.lr_method == 'LALR':
+ self.add_lalr_lookaheads(C)
+
+ # Build the parser table, state by state
+ st = 0
+ for I in C:
+ # Loop over each production in I
+ actlist = [ ] # List of actions
+ st_action = { }
+ st_actionp = { }
+ st_goto = { }
+ log.info("")
+ log.info("state %d", st)
+ log.info("")
for p in I:
- _vf.write(" (%d) %s\n" % (p.number, str(p)))
- _vf.write("\n")
+ log.info(" (%d) %s", p.number, str(p))
+ log.info("")
- for p in I:
- try:
- if p.len == p.lr_index + 1:
- if p.name == "S'":
- # Start symbol. Accept!
- st_action["$end"] = 0
- st_actionp["$end"] = p
- else:
- # We are at the end of a production. Reduce!
- if method == 'LALR':
- laheads = p.lookaheads[st]
+ for p in I:
+ if p.len == p.lr_index + 1:
+ if p.name == "S'":
+ # Start symbol. Accept!
+ st_action["$end"] = 0
+ st_actionp["$end"] = p
else:
- laheads = Follow[p.name]
- for a in laheads:
- actlist.append((a,p,"reduce using rule %d (%s)" % (p.number,p)))
- r = st_action.get(a,None)
- if r is not None:
- # Whoa. Have a shift/reduce or reduce/reduce conflict
- if r > 0:
- # Need to decide on shift or reduce here
- # By default we favor shifting. Need to add
- # some precedence rules here.
- sprec,slevel = Productions[st_actionp[a].number].prec
- rprec,rlevel = Precedence.get(a,('right',0))
- if (slevel < rlevel) or ((slevel == rlevel) and (rprec == 'left')):
- # We really need to reduce here.
- st_action[a] = -p.number
- st_actionp[a] = p
- if not slevel and not rlevel:
- _vfc.write("shift/reduce conflict in state %d resolved as reduce.\n" % st)
- _vf.write(" ! shift/reduce conflict for %s resolved as reduce.\n" % a)
- n_srconflict += 1
- elif (slevel == rlevel) and (rprec == 'nonassoc'):
- st_action[a] = None
- else:
- # Hmmm. Guess we'll keep the shift
- if not rlevel:
- _vfc.write("shift/reduce conflict in state %d resolved as shift.\n" % st)
- _vf.write(" ! shift/reduce conflict for %s resolved as shift.\n" % a)
- n_srconflict +=1
- elif r < 0:
- # Reduce/reduce conflict. In this case, we favor the rule
- # that was defined first in the grammar file
- oldp = Productions[-r]
- pp = Productions[p.number]
- if oldp.line > pp.line:
- st_action[a] = -p.number
- st_actionp[a] = p
- # sys.stderr.write("Reduce/reduce conflict in state %d\n" % st)
- n_rrconflict += 1
- _vfc.write("reduce/reduce conflict in state %d resolved using rule %d (%s).\n" % (st, st_actionp[a].number, st_actionp[a]))
- _vf.write(" ! reduce/reduce conflict for %s resolved using rule %d (%s).\n" % (a,st_actionp[a].number, st_actionp[a]))
- else:
- sys.stderr.write("Unknown conflict in state %d\n" % st)
+ # We are at the end of a production. Reduce!
+ if self.lr_method == 'LALR':
+ laheads = p.lookaheads[st]
else:
- st_action[a] = -p.number
- st_actionp[a] = p
- else:
- i = p.lr_index
- a = p.prod[i+1] # Get symbol right after the "."
- if a in Terminals:
- g = lr0_goto(I,a)
- j = _lr0_cidhash.get(id(g),-1)
- if j >= 0:
- # We are in a shift state
- actlist.append((a,p,"shift and go to state %d" % j))
- r = st_action.get(a,None)
- if r is not None:
- # Whoa have a shift/reduce or shift/shift conflict
- if r > 0:
- if r != j:
- sys.stderr.write("Shift/shift conflict in state %d\n" % st)
- elif r < 0:
- # Do a precedence check.
- # - if precedence of reduce rule is higher, we reduce.
- # - if precedence of reduce is same and left assoc, we reduce.
- # - otherwise we shift
- rprec,rlevel = Productions[st_actionp[a].number].prec
- sprec,slevel = Precedence.get(a,('right',0))
- if (slevel > rlevel) or ((slevel == rlevel) and (rprec == 'right')):
- # We decide to shift here... highest precedence to shift
- st_action[a] = j
- st_actionp[a] = p
- if not rlevel:
- n_srconflict += 1
- _vfc.write("shift/reduce conflict in state %d resolved as shift.\n" % st)
- _vf.write(" ! shift/reduce conflict for %s resolved as shift.\n" % a)
- elif (slevel == rlevel) and (rprec == 'nonassoc'):
- st_action[a] = None
+ laheads = self.grammar.Follow[p.name]
+ for a in laheads:
+ actlist.append((a,p,"reduce using rule %d (%s)" % (p.number,p)))
+ r = st_action.get(a,None)
+ if r is not None:
+ # Whoa. Have a shift/reduce or reduce/reduce conflict
+ if r > 0:
+ # Need to decide on shift or reduce here
+ # By default we favor shifting. Need to add
+ # some precedence rules here.
+ sprec,slevel = self.grammar.Productions[st_actionp[a].number].prec
+ rprec,rlevel = self.grammar.Precedence.get(a,('right',0))
+ if (slevel < rlevel) or ((slevel == rlevel) and (rprec == 'left')):
+ # We really need to reduce here.
+ st_action[a] = -p.number
+ st_actionp[a] = p
+ if not slevel and not rlevel:
+ log.info(" ! shift/reduce conflict for %s resolved as reduce",a)
+ self.sr_conflicts.append((st,a,'reduce'))
+ elif (slevel == rlevel) and (rprec == 'nonassoc'):
+ st_action[a] = None
+ else:
+ # Hmmm. Guess we'll keep the shift
+ if not rlevel:
+ log.info(" ! shift/reduce conflict for %s resolved as shift",a)
+ self.sr_conflicts.append((st,a,'shift'))
+ elif r < 0:
+ # Reduce/reduce conflict. In this case, we favor the rule
+ # that was defined first in the grammar file
+ oldp = self.grammar.Productions[-r]
+ pp = self.grammar.Productions[p.number]
+ if oldp.line > pp.line:
+ st_action[a] = -p.number
+ st_actionp[a] = p
+ chosenp,rejectp = pp,oldp
+ else:
+ chosenp,rejectp = oldp,pp
+ self.rr_conflicts.append((st,chosenp,rejectp))
+ log.info(" ! reduce/reduce conflict for %s resolved using rule %d (%s)", a,st_actionp[a].number, st_actionp[a])
else:
- # Hmmm. Guess we'll keep the reduce
- if not slevel and not rlevel:
- n_srconflict +=1
- _vfc.write("shift/reduce conflict in state %d resolved as reduce.\n" % st)
- _vf.write(" ! shift/reduce conflict for %s resolved as reduce.\n" % a)
-
+ raise LALRError("Unknown conflict in state %d" % st)
else:
- sys.stderr.write("Unknown conflict in state %d\n" % st)
- else:
- st_action[a] = j
- st_actionp[a] = p
+ st_action[a] = -p.number
+ st_actionp[a] = p
+ else:
+ i = p.lr_index
+ a = p.prod[i+1] # Get symbol right after the "."
+ if a in self.grammar.Terminals:
+ g = self.lr0_goto(I,a)
+ j = self.lr0_cidhash.get(id(g),-1)
+ if j >= 0:
+ # We are in a shift state
+ actlist.append((a,p,"shift and go to state %d" % j))
+ r = st_action.get(a,None)
+ if r is not None:
+ # Whoa have a shift/reduce or shift/shift conflict
+ if r > 0:
+ if r != j:
+ raise LALRError("Shift/shift conflict in state %d" % st)
+ elif r < 0:
+ # Do a precedence check.
+ # - if precedence of reduce rule is higher, we reduce.
+ # - if precedence of reduce is same and left assoc, we reduce.
+ # - otherwise we shift
+ rprec,rlevel = self.grammar.Productions[st_actionp[a].number].prec
+ sprec,slevel = self.grammar.Precedence.get(a,('right',0))
+ if (slevel > rlevel) or ((slevel == rlevel) and (rprec == 'right')):
+ # We decide to shift here... highest precedence to shift
+ st_action[a] = j
+ st_actionp[a] = p
+ if not rlevel:
+ log.info(" ! shift/reduce conflict for %s resolved as shift",a)
+ self.sr_conflicts.append((st,a,'shift'))
+ elif (slevel == rlevel) and (rprec == 'nonassoc'):
+ st_action[a] = None
+ else:
+ # Hmmm. Guess we'll keep the reduce
+ if not slevel and not rlevel:
+ log.info(" ! shift/reduce conflict for %s resolved as reduce",a)
+ self.sr_conflicts.append((st,a,'reduce'))
- except Exception:
- sys.stdout.write(str(sys.exc_info()) + "\n")
- raise YaccError("Hosed in lr_parse_table")
-
- # Print the actions associated with each terminal
- if yaccdebug:
- _actprint = { }
- for a,p,m in actlist:
- if a in st_action:
- if p is st_actionp[a]:
- _vf.write(" %-15s %s\n" % (a,m))
- _actprint[(a,m)] = 1
- _vf.write("\n")
- for a,p,m in actlist:
- if a in st_action:
- if p is not st_actionp[a]:
- if not (a,m) in _actprint:
- _vf.write(" ! %-15s [ %s ]\n" % (a,m))
+ else:
+ raise LALRError("Unknown conflict in state %d" % st)
+ else:
+ st_action[a] = j
+ st_actionp[a] = p
+
+ # Print the actions associated with each terminal
+ _actprint = { }
+ for a,p,m in actlist:
+ if a in st_action:
+ if p is st_actionp[a]:
+ log.info(" %-15s %s",a,m)
_actprint[(a,m)] = 1
+ log.info("")
+ # Print the actions that were not used. (debugging)
+ not_used = 0
+ for a,p,m in actlist:
+ if a in st_action:
+ if p is not st_actionp[a]:
+ if not (a,m) in _actprint:
+ log.debug(" ! %-15s [ %s ]",a,m)
+ not_used = 1
+ _actprint[(a,m)] = 1
+ if not_used:
+ log.debug("")
+
+ # Construct the goto table for this state
+
+ nkeys = { }
+ for ii in I:
+ for s in ii.usyms:
+ if s in self.grammar.Nonterminals:
+ nkeys[s] = None
+ for n in nkeys:
+ g = self.lr0_goto(I,n)
+ j = self.lr0_cidhash.get(id(g),-1)
+ if j >= 0:
+ st_goto[n] = j
+ log.info(" %-30s shift and go to state %d",n,j)
+
+ action[st] = st_action
+ actionp[st] = st_actionp
+ goto[st] = st_goto
+ st += 1
+
+
+ # -----------------------------------------------------------------------------
+ # write_table()
+ #
+ # This function writes the LR parsing tables to a file
+ # -----------------------------------------------------------------------------
- # Construct the goto table for this state
- if yaccdebug:
- _vf.write("\n")
- nkeys = { }
- for ii in I:
- for s in ii.usyms:
- if s in Nonterminals:
- nkeys[s] = None
- for n in nkeys:
- g = lr0_goto(I,n)
- j = _lr0_cidhash.get(id(g),-1)
- if j >= 0:
- st_goto[n] = j
- if yaccdebug:
- _vf.write(" %-30s shift and go to state %d\n" % (n,j))
-
- action[st] = st_action
- actionp[st] = st_actionp
- goto[st] = st_goto
-
- st += 1
-
- if yaccdebug:
- if n_srconflict == 1:
- sys.stderr.write("yacc: %d shift/reduce conflict\n" % n_srconflict)
- if n_srconflict > 1:
- sys.stderr.write("yacc: %d shift/reduce conflicts\n" % n_srconflict)
- if n_rrconflict == 1:
- sys.stderr.write("yacc: %d reduce/reduce conflict\n" % n_rrconflict)
- if n_rrconflict > 1:
- sys.stderr.write("yacc: %d reduce/reduce conflicts\n" % n_rrconflict)
-
-# -----------------------------------------------------------------------------
-# ==== LR Utility functions ====
-# -----------------------------------------------------------------------------
-
-# -----------------------------------------------------------------------------
-# _lr_write_tables()
-#
-# This function writes the LR parsing tables to a file
-# -----------------------------------------------------------------------------
-
-def lr_write_tables(modulename=tab_module,outputdir=''):
- if isinstance(modulename, types.ModuleType):
- sys.stderr.write("Warning module %s is inconsistent with the grammar (ignored)\n" % modulename)
- return
-
- basemodulename = modulename.split(".")[-1]
- filename = os.path.join(outputdir,basemodulename) + ".py"
- try:
- f = open(filename,"w")
+ def write_table(self,modulename,outputdir='',signature=""):
+ basemodulename = modulename.split(".")[-1]
+ filename = os.path.join(outputdir,basemodulename) + ".py"
+ try:
+ f = open(filename,"w")
- f.write("""
+ f.write("""
# %s
# This file is automatically generated. Do not edit.
+_tabversion = %r
-_lr_method = %s
-
-_lr_signature = %s
-""" % (filename, repr(_lr_method), repr(Signature.digest())))
-
- # Change smaller to 0 to go back to original tables
- smaller = 1
+_lr_method = %r
- # Factor out names to try and make smaller
- if smaller:
- items = { }
+_lr_signature = %r
+ """ % (filename, __tabversion__, self.lr_method, signature))
- for s,nd in _lr_action.items():
- for name,v in nd.items():
- i = items.get(name)
- if not i:
- i = ([],[])
- items[name] = i
- i[0].append(s)
- i[1].append(v)
+ # Change smaller to 0 to go back to original tables
+ smaller = 1
- f.write("\n_lr_action_items = {")
- for k,v in items.items():
- f.write("%r:([" % k)
- for i in v[0]:
- f.write("%r," % i)
- f.write("],[")
- for i in v[1]:
- f.write("%r," % i)
-
- f.write("]),")
- f.write("}\n")
-
- f.write("""
+ # Factor out names to try and make smaller
+ if smaller:
+ items = { }
+
+ for s,nd in self.lr_action.items():
+ for name,v in nd.items():
+ i = items.get(name)
+ if not i:
+ i = ([],[])
+ items[name] = i
+ i[0].append(s)
+ i[1].append(v)
+
+ f.write("\n_lr_action_items = {")
+ for k,v in items.items():
+ f.write("%r:([" % k)
+ for i in v[0]:
+ f.write("%r," % i)
+ f.write("],[")
+ for i in v[1]:
+ f.write("%r," % i)
+
+ f.write("]),")
+ f.write("}\n")
+
+ f.write("""
_lr_action = { }
for _k, _v in _lr_action_items.items():
for _x,_y in zip(_v[0],_v[1]):
@@ -2575,38 +2531,38 @@ for _k, _v in _lr_action_items.items():
del _lr_action_items
""")
- else:
- f.write("\n_lr_action = { ");
- for k,v in _lr_action.items():
- f.write("(%r,%r):%r," % (k[0],k[1],v))
- f.write("}\n");
-
- if smaller:
- # Factor out names to try and make smaller
- items = { }
-
- for s,nd in _lr_goto.items():
- for name,v in nd.items():
- i = items.get(name)
- if not i:
- i = ([],[])
- items[name] = i
- i[0].append(s)
- i[1].append(v)
-
- f.write("\n_lr_goto_items = {")
- for k,v in items.items():
- f.write("%r:([" % k)
- for i in v[0]:
- f.write("%r," % i)
- f.write("],[")
- for i in v[1]:
- f.write("%r," % i)
-
- f.write("]),")
- f.write("}\n")
-
- f.write("""
+ else:
+ f.write("\n_lr_action = { ");
+ for k,v in self.lr_action.items():
+ f.write("(%r,%r):%r," % (k[0],k[1],v))
+ f.write("}\n");
+
+ if smaller:
+ # Factor out names to try and make smaller
+ items = { }
+
+ for s,nd in self.lr_goto.items():
+ for name,v in nd.items():
+ i = items.get(name)
+ if not i:
+ i = ([],[])
+ items[name] = i
+ i[0].append(s)
+ i[1].append(v)
+
+ f.write("\n_lr_goto_items = {")
+ for k,v in items.items():
+ f.write("%r:([" % k)
+ for i in v[0]:
+ f.write("%r," % i)
+ f.write("],[")
+ for i in v[1]:
+ f.write("%r," % i)
+
+ f.write("]),")
+ f.write("}\n")
+
+ f.write("""
_lr_goto = { }
for _k, _v in _lr_goto_items.items():
for _x,_y in zip(_v[0],_v[1]):
@@ -2614,318 +2570,585 @@ for _k, _v in _lr_goto_items.items():
_lr_goto[_x][_k] = _y
del _lr_goto_items
""")
- else:
- f.write("\n_lr_goto = { ");
- for k,v in _lr_goto.items():
- f.write("(%r,%r):%r," % (k[0],k[1],v))
- f.write("}\n");
-
- # Write production table
- f.write("_lr_productions = [\n")
- for p in Productions:
- if p:
- if (p.func):
- f.write(" (%r,%d,%r,%r,%d),\n" % (p.name, p.len, p.func.__name__,p.file,p.line))
- else:
- f.write(" (%r,%d,None,None,None),\n" % (p.name, p.len))
- else:
- f.write(" None,\n")
- f.write("]\n")
-
- f.close()
-
- except IOError:
- e = sys.exc_info()[1]
- sys.stderr.write("Unable to create '%s'\n" % filename)
- sys.stderr.write(str(e)+"\n")
- return
-
-def lr_read_tables(module=tab_module,optimize=0):
- global _lr_action, _lr_goto, _lr_productions, _lr_method
- parsetab = None
- try:
- if isinstance(module,types.ModuleType):
- parsetab = module
- else:
- if sys.version_info[0] < 3:
- exec("import %s as parsetab" % module)
else:
- env = { }
- exec("import %s as parsetab" % module, env, env)
- parsetab = env['parsetab']
-
- if (optimize) or (Signature.digest() == parsetab._lr_signature):
- _lr_action = parsetab._lr_action
- _lr_goto = parsetab._lr_goto
- _lr_productions = parsetab._lr_productions
- _lr_method = parsetab._lr_method
- return 1
- else:
- return 0
+ f.write("\n_lr_goto = { ");
+ for k,v in self.lr_goto.items():
+ f.write("(%r,%r):%r," % (k[0],k[1],v))
+ f.write("}\n");
+
+ # Write production table
+ f.write("_lr_productions = [\n")
+ for p in self.lr_productions:
+ if p.func:
+ f.write(" (%r,%r,%d,%r,%r,%d),\n" % (p.str,p.name, p.len, p.func,p.file,p.line))
+ else:
+ f.write(" (%r,%r,%d,None,None,None),\n" % (str(p),p.name, p.len))
+ f.write("]\n")
+ f.close()
- except (ImportError,AttributeError):
- return 0
+ except IOError:
+ e = sys.exc_info()[1]
+ sys.stderr.write("Unable to create '%s'\n" % filename)
+ sys.stderr.write(str(e)+"\n")
+ return
# -----------------------------------------------------------------------------
-# yacc(module)
+# === INTROSPECTION ===
#
-# Build the parser module
+# The following functions and classes are used to implement the PLY
+# introspection features followed by the yacc() function itself.
# -----------------------------------------------------------------------------
-def yacc(method=default_lr, debug=yaccdebug, module=None, tabmodule=tab_module, start=None, check_recursion=1, optimize=0,write_tables=1,debugfile=debug_file,outputdir=''):
- global yaccdebug
- yaccdebug = debug
-
- initialize_vars()
- files = { }
- error = 0
-
-
- # Add parsing method to signature
- Signature_update(method)
-
- # If a "module" parameter was supplied, extract its dictionary.
- # Note: a module may in fact be an instance as well.
+# -----------------------------------------------------------------------------
+# get_caller_module_dict()
+#
+# This function returns a dictionary containing all of the symbols defined within
+# a caller further down the call stack. This is used to get the environment
+# associated with the yacc() call if none was provided.
+# -----------------------------------------------------------------------------
- if module:
- _items = [(k,getattr(module,k)) for k in dir(module)]
- ldict = { }
- for i in _items:
- ldict[i[0]] = i[1]
- else:
- # No module given. We might be able to get information from the caller.
- # Throw an exception and unwind the traceback to get the globals
+def get_caller_module_dict(levels):
+ try:
+ raise RuntimeError
+ except RuntimeError:
+ e,b,t = sys.exc_info()
+ f = t.tb_frame
+ while levels > 0:
+ f = f.f_back # Walk out to our calling function
+ levels -= 1
+ ldict = f.f_globals.copy()
+ if f.f_globals != f.f_locals:
+ ldict.update(f.f_locals)
+
+ return ldict
+# -----------------------------------------------------------------------------
+# parse_grammar()
+#
+# This takes a raw grammar rule string and parses it into production data
+# -----------------------------------------------------------------------------
+def parse_grammar(doc,file,line):
+ grammar = []
+ # Split the doc string into lines
+ pstrings = doc.splitlines()
+ lastp = None
+ dline = line
+ for ps in pstrings:
+ dline += 1
+ p = ps.split()
+ if not p: continue
try:
- raise RuntimeError
- except RuntimeError:
- e,b,t = sys.exc_info()
- f = t.tb_frame
- f = f.f_back # Walk out to our calling function
- if f.f_globals is f.f_locals: # Collect global and local variations from caller
- ldict = f.f_globals
+ if p[0] == '|':
+ # This is a continuation of a previous rule
+ if not lastp:
+ raise SyntaxError("%s:%d: Misplaced '|'" % (file,dline))
+ prodname = lastp
+ syms = p[1:]
else:
- ldict = f.f_globals.copy()
- ldict.update(f.f_locals)
+ prodname = p[0]
+ lastp = prodname
+ syms = p[2:]
+ assign = p[1]
+ if assign != ':' and assign != '::=':
+ raise SyntaxError("%s:%d: Syntax error. Expected ':'" % (file,dline))
+
+ grammar.append((file,dline,prodname,syms))
+ except SyntaxError:
+ raise
+ except Exception:
+ raise SyntaxError("%s:%d: Syntax error in rule '%s'" % (file,dline,ps.strip()))
- # Add starting symbol to signature
- if not start:
- start = ldict.get("start",None)
- if start:
- Signature_update(start)
+ return grammar
- # Look for error handler
- ef = ldict.get('p_error',None)
- if ef:
- if isinstance(ef,types.FunctionType):
- ismethod = 0
- elif isinstance(ef, types.MethodType):
- ismethod = 1
+# -----------------------------------------------------------------------------
+# ParserReflect()
+#
+# This class represents information extracted for building a parser including
+# start symbol, error function, tokens, precedence list, action functions,
+# etc.
+# -----------------------------------------------------------------------------
+class ParserReflect(object):
+ def __init__(self,pdict,log=None):
+ self.pdict = pdict
+ self.start = None
+ self.error_func = None
+ self.tokens = None
+ self.files = {}
+ self.grammar = []
+ self.error = 0
+
+ if log is None:
+ self.log = PlyLogger(sys.stderr)
else:
- raise YaccError("'p_error' defined, but is not a function or method.")
- eline = func_code(ef).co_firstlineno
- efile = func_code(ef).co_filename
- files[efile] = None
-
- if (func_code(ef).co_argcount != 1+ismethod):
- raise YaccError("%s:%d: p_error() requires 1 argument." % (efile,eline))
- global Errorfunc
- Errorfunc = ef
- else:
- sys.stderr.write("yacc: Warning. no p_error() function is defined.\n")
-
- # If running in optimized mode. We're going to read tables instead
-
- if (optimize and lr_read_tables(tabmodule,1)):
- # Read parse table
- del Productions[:]
- for p in _lr_productions:
- if not p:
- Productions.append(None)
- else:
- m = MiniProduction()
- m.name = p[0]
- m.len = p[1]
- m.file = p[3]
- m.line = p[4]
- if p[2]:
- m.func = ldict[p[2]]
- Productions.append(m)
-
- else:
- # Get the tokens map
- tokens = ldict.get("tokens",None)
- if not tokens:
- raise YaccError("module does not define a list 'tokens'")
- if not isinstance(tokens,(list, tuple)):
- raise YaccError("tokens must be a list or tuple.")
+ self.log = log
+
+ # Get all of the basic information
+ def get_all(self):
+ self.get_start()
+ self.get_error_func()
+ self.get_tokens()
+ self.get_precedence()
+ self.get_pfunctions()
+
+ # Validate all of the information
+ def validate_all(self):
+ self.validate_start()
+ self.validate_error_func()
+ self.validate_tokens()
+ self.validate_precedence()
+ self.validate_pfunctions()
+ self.validate_files()
+ return self.error
+
+ # Compute a signature over the grammar
+ def signature(self):
+ from binascii import crc32
+ sig = 0
+ try:
+ if self.start:
+ sig = crc32(self.start.encode('latin-1'),sig)
+ if self.prec:
+ sig = crc32("".join(["".join(p) for p in self.prec]).encode('latin-1'),sig)
+ if self.tokens:
+ sig = crc32(" ".join(self.tokens).encode('latin-1'),sig)
+ for f in self.pfuncs:
+ if f[3]:
+ sig = crc32(f[3].encode('latin-1'),sig)
+ except (TypeError,ValueError):
+ pass
+ return sig & 0xffffffff
- # Check to see if a requires dictionary is defined.
- requires = ldict.get("require",None)
- if requires:
- if not (isinstance(requires,dict)):
- raise YaccError("require must be a dictionary.")
+ # -----------------------------------------------------------------------------
+ # validate_file()
+ #
+ # This method checks to see if there are duplicated p_rulename() functions
+ # in the parser module file. Without this function, it is really easy for
+ # users to make mistakes by cutting and pasting code fragments (and it's a real
+ # bugger to try and figure out why the resulting parser doesn't work). Therefore,
+ # we just do a little regular expression pattern matching of def statements
+ # to try and detect duplicates.
+ # -----------------------------------------------------------------------------
+
+ def validate_files(self):
+ # Match def p_funcname(
+ fre = re.compile(r'\s*def\s+(p_[a-zA-Z_0-9]*)\(')
+
+ for filename in self.files.keys():
+ base,ext = os.path.splitext(filename)
+ if ext != '.py': return 1 # No idea. Assume it's okay.
- for r,v in requires.items():
- try:
- if not isinstance(v,list):
- raise TypeError
- v1 = [x.split(".") for x in v]
- Requires[r] = v1
- except Exception:
- sys.stderr.write("Invalid specification for rule '%s' in require. Expected a list of strings\n" % r)
+ try:
+ f = open(filename)
+ lines = f.readlines()
+ f.close()
+ except IOError:
+ continue
+ counthash = { }
+ for linen,l in enumerate(lines):
+ linen += 1
+ m = fre.match(l)
+ if m:
+ name = m.group(1)
+ prev = counthash.get(name)
+ if not prev:
+ counthash[name] = linen
+ else:
+ self.log.warning("%s:%d: Function %s redefined. Previously defined on line %d", filename,linen,name,prev)
- # Build the dictionary of terminals. We a record a 0 in the
- # dictionary to track whether or not a terminal is actually
- # used in the grammar
+ # Get the start symbol
+ def get_start(self):
+ self.start = self.pdict.get('start')
- if 'error' in tokens:
- sys.stderr.write("yacc: Illegal token 'error'. Is a reserved word.\n")
- raise YaccError("Illegal token name")
+ # Validate the start symbol
+ def validate_start(self):
+ if self.start is not None:
+ if not isinstance(self.start,str):
+ self.log.error("'start' must be a string")
- for n in tokens:
- if n in Terminals:
- sys.stderr.write("yacc: Warning. Token '%s' multiply defined.\n" % n)
- Terminals[n] = [ ]
+ # Look for error handler
+ def get_error_func(self):
+ self.error_func = self.pdict.get('p_error')
+
+ # Validate the error function
+ def validate_error_func(self):
+ if self.error_func:
+ if isinstance(self.error_func,types.FunctionType):
+ ismethod = 0
+ elif isinstance(self.error_func, types.MethodType):
+ ismethod = 1
+ else:
+ self.log.error("'p_error' defined, but is not a function or method")
+ self.error = 1
+ return
- Terminals['error'] = [ ]
+ eline = func_code(self.error_func).co_firstlineno
+ efile = func_code(self.error_func).co_filename
+ self.files[efile] = 1
- # Get the precedence map (if any)
- prec = ldict.get("precedence",None)
- if prec:
- if not isinstance(prec,(list,tuple)):
- raise YaccError("precedence must be a list or tuple.")
- add_precedence(prec)
- Signature_update(repr(prec))
+ if (func_code(self.error_func).co_argcount != 1+ismethod):
+ self.log.error("%s:%d: p_error() requires 1 argument",efile,eline)
+ self.error = 1
- for n in tokens:
- if not n in Precedence:
- Precedence[n] = ('right',0) # Default, right associative, 0 precedence
+ # Get the tokens map
+ def get_tokens(self):
+ tokens = self.pdict.get("tokens",None)
+ if not tokens:
+ self.log.error("No token list is defined")
+ self.error = 1
+ return
- # Get the list of built-in functions with p_ prefix
- symbols = [ldict[f] for f in ldict
- if (type(ldict[f]) in (types.FunctionType, types.MethodType) and ldict[f].__name__[:2] == 'p_'
- and ldict[f].__name__ != 'p_error')]
+ if not isinstance(tokens,(list, tuple)):
+ self.log.error("tokens must be a list or tuple")
+ self.error = 1
+ return
+
+ if not tokens:
+ self.log.error("tokens is empty")
+ self.error = 1
+ return
+
+ self.tokens = tokens
+
+ # Validate the tokens
+ def validate_tokens(self):
+ # Validate the tokens.
+ if 'error' in self.tokens:
+ self.log.error("Illegal token name 'error'. Is a reserved word")
+ self.error = 1
+ return
+
+ terminals = {}
+ for n in self.tokens:
+ if n in terminals:
+ self.log.warning("Token '%s' multiply defined", n)
+ terminals[n] = 1
+
+ # Get the precedence map (if any)
+ def get_precedence(self):
+ self.prec = self.pdict.get("precedence",None)
+
+ # Validate and parse the precedence map
+ def validate_precedence(self):
+ preclist = []
+ if self.prec:
+ if not isinstance(self.prec,(list,tuple)):
+ self.log.error("precedence must be a list or tuple")
+ self.error = 1
+ return
+ for level,p in enumerate(self.prec):
+ if not isinstance(p,(list,tuple)):
+ self.log.error("Bad precedence table")
+ self.error = 1
+ return
+ if len(p) < 2:
+ self.log.error("Malformed precedence entry %s. Must be (assoc, term, ..., term)",p)
+ self.error = 1
+ return
+ assoc = p[0]
+ if not isinstance(assoc,str):
+ self.log.error("precedence associativity must be a string")
+ self.error = 1
+ return
+ for term in p[1:]:
+ if not isinstance(term,str):
+ self.log.error("precedence items must be strings")
+ self.error = 1
+ return
+ preclist.append((term,assoc,level+1))
+ self.preclist = preclist
+
+ # Get all p_functions from the grammar
+ def get_pfunctions(self):
+ p_functions = []
+ for name, item in self.pdict.items():
+ if name[:2] != 'p_': continue
+ if name == 'p_error': continue
+ if isinstance(item,(types.FunctionType,types.MethodType)):
+ line = func_code(item).co_firstlineno
+ file = func_code(item).co_filename
+ p_functions.append((line,file,name,item.__doc__))
+
+ # Sort all of the actions by line number
+ p_functions.sort()
+ self.pfuncs = p_functions
+
+
+ # Validate all of the p_functions
+ def validate_pfunctions(self):
+ grammar = []
# Check for non-empty symbols
- if len(symbols) == 0:
- raise YaccError("no rules of the form p_rulename are defined.")
-
- # Sort the symbols by line number
- if sys.version_info[0] < 3:
- symbols.sort(lambda x,y: cmp(func_code(x).co_firstlineno,func_code(y).co_firstlineno))
- else:
- # Python 3
- symbols.sort(key=lambda x: func_code(x).co_firstlineno)
-
- # Add all of the symbols to the grammar
- for f in symbols:
- if (add_function(f)) < 0:
- error += 1
+ if len(self.pfuncs) == 0:
+ self.log.error("no rules of the form p_rulename are defined")
+ self.error = 1
+ return
+
+ for line, file, name, doc in self.pfuncs:
+ func = self.pdict[name]
+ if isinstance(func, types.MethodType):
+ reqargs = 2
else:
- files[func_code(f).co_filename] = None
-
- # Make a signature of the docstrings
- for f in symbols:
- if f.__doc__:
- Signature_update(f.__doc__)
-
- lr_init_vars()
-
- if error:
- raise YaccError("Unable to construct parser.")
-
- if not lr_read_tables(tabmodule):
-
- # Validate files
- for filename in files:
- if not validate_file(filename):
- error = 1
-
- # Validate dictionary
- validate_dict(ldict)
-
- if start and not start in Prodnames:
- raise YaccError("Bad starting symbol '%s'" % start)
+ reqargs = 1
+ if func_code(func).co_argcount > reqargs:
+ self.log.error("%s:%d: Rule '%s' has too many arguments",file,line,func.__name__)
+ self.error = 1
+ elif func_code(func).co_argcount < reqargs:
+ self.log.error("%s:%d: Rule '%s' requires an argument",file,line,func.__name__)
+ self.error = 1
+ elif not func.__doc__:
+ self.log.warning("%s:%d: No documentation string specified in function '%s' (ignored)",file,line,func.__name__)
+ else:
+ try:
+ parsed_g = parse_grammar(doc,file,line)
+ for g in parsed_g:
+ grammar.append((name, g))
+ except SyntaxError:
+ e = sys.exc_info()[1]
+ self.log.error(str(e))
+ self.error = 1
+
+ # Looks like a valid grammar rule
+ # Mark the file in which defined.
+ self.files[file] = 1
+
+ # Secondary validation step that looks for p_ definitions that are not functions
+ # or functions that look like they might be grammar rules.
+
+ for n,v in self.pdict.items():
+ if n[0:2] == 'p_' and isinstance(v, (types.FunctionType, types.MethodType)): continue
+ if n[0:2] == 't_': continue
+ if n[0:2] == 'p_' and n != 'p_error':
+ self.log.warning("'%s' not defined as a function", n)
+ if ((isinstance(v,types.FunctionType) and func_code(v).co_argcount == 1) or
+ (isinstance(v,types.MethodType) and func_code(v).co_argcount == 2)):
+ try:
+ doc = v.__doc__.split(" ")
+ if doc[1] == ':':
+ self.log.warning("%s:%d: Possible grammar rule '%s' defined without p_ prefix",
+ func_code(v).co_filename, func_code(v).co_firstlineno,n)
+ except Exception:
+ pass
- augment_grammar(start)
- error = verify_productions(cycle_check=check_recursion)
- otherfunc = [ldict[f] for f in ldict
- if (type(f) in (types.FunctionType,types.MethodType) and ldict[f].__name__[:2] != 'p_')]
+ self.grammar = grammar
- # Check precedence rules
- if check_precedence():
- error = 1
+# -----------------------------------------------------------------------------
+# yacc(module)
+#
+# Build a parser
+# -----------------------------------------------------------------------------
- if error:
- raise YaccError("Unable to construct parser.")
+def yacc(method='LALR', debug=yaccdebug, module=None, tabmodule=tab_module, start=None,
+ check_recursion=1, optimize=0, write_tables=1, debugfile=debug_file,outputdir='',
+ debuglog=None, errorlog = None):
- build_lritems()
- compute_first1()
- compute_follow(start)
+ global parse # Reference to the parsing method of the last built parser
- if method in ['SLR','LALR']:
- lr_parse_table(method)
- else:
- raise YaccError("Unknown parsing method '%s'" % method)
+ if errorlog is None:
+ errorlog = PlyLogger(sys.stderr)
- if write_tables:
- lr_write_tables(tabmodule,outputdir)
+ # Get the module dictionary used for the parser
+ if module:
+ _items = [(k,getattr(module,k)) for k in dir(module)]
+ pdict = dict(_items)
+ else:
+ pdict = get_caller_module_dict(2)
- if yaccdebug:
- try:
- f = open(os.path.join(outputdir,debugfile),"w")
- f.write(_vfc.getvalue())
- f.write("\n\n")
- f.write(_vf.getvalue())
- f.close()
- except IOError:
- e = sys.exc_info()[1]
- sys.stderr.write("yacc: can't create '%s' %s\n" % (debugfile,e))
+ # Collect parser information from the dictionary
+ pinfo = ParserReflect(pdict,log=errorlog)
+ pinfo.get_all()
- # Made it here. Create a parser object and set up its internal state.
- # Set global parse() method to bound method of parser object.
+ if pinfo.error:
+ raise YaccError("Unable to build parser")
- p = Parser("xyzzy")
- p.productions = Productions
- p.errorfunc = Errorfunc
- p.action = _lr_action
- p.goto = _lr_goto
- p.method = _lr_method
- p.require = Requires
+ # Check signature against table files (if any)
+ signature = pinfo.signature()
- global parse
- parse = p.parse
+ # Read the tables
+ try:
+ lr = LRTable()
+ read_signature = lr.read_table(tabmodule)
+ if optimize or (read_signature == signature):
+ try:
+ lr.bind_callables(pinfo.pdict)
+ parser = LRParser(lr,pinfo.error_func)
+ parse = parser.parse
+ return parser
+ except Exception:
+ e = sys.exc_info()[1]
+ errorlog.warning("There was a problem loading the table file: %s", repr(e))
+ except VersionError:
+ e = sys.exc_info()
+ errorlog.warning(str(e))
+ except Exception:
+ pass
+
+ if debuglog is None:
+ if debug:
+ debuglog = PlyLogger(open(debugfile,"w"))
+ else:
+ debuglog = NullLogger()
- global parser
- parser = p
+ debuglog.info("Created by PLY version %s (http://www.dabeaz.com/ply)", __version__)
- # Clean up all of the globals we created
- if (not optimize):
- yacc_cleanup()
- return p
-# yacc_cleanup function. Delete all of the global variables
-# used during table construction
+ errors = 0
-def yacc_cleanup():
- global _lr_action, _lr_goto, _lr_method, _lr_goto_cache
- del _lr_action, _lr_goto, _lr_method, _lr_goto_cache
+ # Validate the parser information
+ if pinfo.validate_all():
+ raise YaccError("Unable to build parser")
+
+ if not pinfo.error_func:
+ errorlog.warning("no p_error() function is defined")
- global Productions, Prodnames, Prodmap, Terminals
- global Nonterminals, First, Follow, Precedence, UsedPrecedence, LRitems
- global Errorfunc, Signature, Requires
+ # Create a grammar object
+ grammar = Grammar(pinfo.tokens)
- del Productions, Prodnames, Prodmap, Terminals
- del Nonterminals, First, Follow, Precedence, UsedPrecedence, LRitems
- del Errorfunc, Signature, Requires
+ # Set precedence level for terminals
+ for term, assoc, level in pinfo.preclist:
+ try:
+ grammar.set_precedence(term,assoc,level)
+ except GrammarError:
+ e = sys.exc_info()[1]
+ errorlog.warning("%s",str(e))
+
+ # Add productions to the grammar
+ for funcname, gram in pinfo.grammar:
+ file, line, prodname, syms = gram
+ try:
+ grammar.add_production(prodname,syms,funcname,file,line)
+ except GrammarError:
+ e = sys.exc_info()[1]
+ errorlog.error("%s",str(e))
+ errors = 1
- global _vf, _vfc
- del _vf, _vfc
+ # Set the grammar start symbols
+ try:
+ grammar.set_start(pinfo.start)
+ except GrammarError:
+ e = sys.exc_info()[1]
+ errorlog.error(str(e))
+ errors = 1
+
+ if errors:
+ raise YaccError("Unable to build parser")
+
+ # Verify the grammar structure
+ undefined_symbols = grammar.undefined_symbols()
+ for sym, prod in undefined_symbols:
+ errorlog.error("%s:%d: Symbol '%s' used, but not defined as a token or a rule",prod.file,prod.line,sym)
+ errors = 1
+
+ unused_terminals = grammar.unused_terminals()
+ if unused_terminals:
+ debuglog.info("")
+ debuglog.info("Unused terminals:")
+ debuglog.info("")
+ for term in unused_terminals:
+ errorlog.warning("Token '%s' defined, but not used", term)
+ debuglog.info(" %s", term)
+
+ # Print out all productions to the debug log
+ if debug:
+ debuglog.info("")
+ debuglog.info("Grammar")
+ debuglog.info("")
+ for n,p in enumerate(grammar.Productions):
+ debuglog.info("Rule %-5d %s", n+1, p)
+
+ # Find unused non-terminals
+ unused_rules = grammar.unused_rules()
+ for prod in unused_rules:
+ errorlog.warning("%s:%d: Rule '%s' defined, but not used", prod.file, prod.line, prod.name)
+
+ if len(unused_terminals) == 1:
+ errorlog.warning("There is 1 unused token")
+ if len(unused_terminals) > 1:
+ errorlog.warning("There are %d unused tokens", len(unused_terminals))
+
+ if len(unused_rules) == 1:
+ errorlog.warning("There is 1 unused rule")
+ if len(unused_rules) > 1:
+ errorlog.warning("There are %d unused rules", len(unused_rules))
+
+ if debug:
+ debuglog.info("")
+ debuglog.info("Terminals, with rules where they appear")
+ debuglog.info("")
+ terms = list(grammar.Terminals)
+ terms.sort()
+ for term in terms:
+ debuglog.info("%-20s : %s", term, " ".join([str(s) for s in grammar.Terminals[term]]))
+
+ debuglog.info("")
+ debuglog.info("Nonterminals, with rules where they appear")
+ debuglog.info("")
+ nonterms = list(grammar.Nonterminals)
+ nonterms.sort()
+ for nonterm in nonterms:
+ debuglog.info("%-20s : %s", nonterm, " ".join([str(s) for s in grammar.Nonterminals[nonterm]]))
+ debuglog.info("")
+
+ if check_recursion:
+ unreachable = grammar.find_unreachable()
+ for u in unreachable:
+ errorlog.warning("Symbol '%s' is unreachable",u)
+
+ infinite = grammar.infinite_cycles()
+ for inf in infinite:
+ errorlog.error("Infinite recursion detected for symbol '%s'", inf)
+ errors = 1
+
+ unused_prec = grammar.unused_precedence()
+ for term, assoc in unused_prec:
+ errorlog.error("Precedence rule '%s' defined for unknown symbol '%s'", assoc, term)
+ errors = 1
+ if errors:
+ raise YaccError("Unable to build parser")
+
+ # Run the LRGeneratedTable on the grammar
+ errorlog.debug("Generating %s tables", method)
+
+ lr = LRGeneratedTable(grammar,method,debuglog)
+
+ num_sr = len(lr.sr_conflicts)
+
+ # Report shift/reduce and reduce/reduce conflicts
+ if num_sr == 1:
+ errorlog.warning("1 shift/reduce conflict")
+ elif num_sr > 1:
+ errorlog.warning("%d shift/reduce conflicts", num_sr)
+
+ num_rr = len(lr.rr_conflicts)
+ if num_rr == 1:
+ errorlog.warning("1 reduce/reduce conflict")
+ elif num_rr > 1:
+ errorlog.warning("%d reduce/reduce conflicts", num_rr)
+
+ # Write out conflicts to the output file
+ if debug and (lr.sr_conflicts or lr.rr_conflicts):
+ debuglog.warning("")
+ debuglog.warning("Conflicts:")
+ debuglog.warning("")
+
+ for state, tok, resolution in lr.sr_conflicts:
+ debuglog.warning("shift/reduce conflict for %s in state %d resolved as %s", tok, state, resolution)
+
+ for state, rule, rejected in lr.rr_conflicts:
+ debuglog.warning("reduce/reduce conflict in state %d resolved using rule (%s)", state, rule)
+ debuglog.warning("rejected rule (%s)", rejected)
+ errorlog.warning("reduce/reduce conflict in state %d resolved using rule (%s)", state, rule)
+ errorlog.warning("rejected rule (%s)", rejected)
+
+ # Write the table file if requested
+ if write_tables:
+ lr.write_table(tabmodule,outputdir,signature)
+
+ # Build the parser
+ lr.bind_callables(pinfo.pdict)
+ parser = LRParser(lr,pinfo.error_func)
-# Stub that raises an error if parsing is attempted without first calling yacc()
-def parse(*args,**kwargs):
- raise YaccError("yacc: No parser built with yacc()")
+ parse = parser.parse
+ return parser
diff --git a/test/testyacc.py b/test/testyacc.py
index 64d41e3..e78b097 100644
--- a/test/testyacc.py
+++ b/test/testyacc.py
@@ -15,7 +15,14 @@ sys.tracebacklimit = 0
import ply.yacc
def check_expected(result,expected):
- resultlines = result.splitlines()
+ resultlines = []
+ for line in result.splitlines():
+ if line.startswith("WARNING: "):
+ line = line[9:]
+ elif line.startswith("ERROR: "):
+ line = line[7:]
+ resultlines.append(line)
+
expectedlines = expected.splitlines()
if len(resultlines) != len(expectedlines):
return False
@@ -47,8 +54,8 @@ class YaccErrorWarningTests(unittest.TestCase):
self.assertRaises(ply.yacc.YaccError,run_import,"yacc_badargs")
result = sys.stderr.getvalue()
self.assert_(check_expected(result,
- "yacc_badargs.py:23: Rule 'p_statement_assign' has too many arguments.\n"
- "yacc_badargs.py:27: Rule 'p_statement_expr' requires an argument.\n"
+ "yacc_badargs.py:23: Rule 'p_statement_assign' has too many arguments\n"
+ "yacc_badargs.py:27: Rule 'p_statement_expr' requires an argument\n"
))
def test_yacc_badid(self):
self.assertRaises(ply.yacc.YaccError,run_import,"yacc_badid")
@@ -62,24 +69,24 @@ class YaccErrorWarningTests(unittest.TestCase):
try:
run_import("yacc_badprec")
except ply.yacc.YaccError:
- e = sys.exc_info()[1]
- self.assert_(check_expected(str(e),
- "precedence must be a list or tuple."))
+ result = sys.stderr.getvalue()
+ self.assert_(check_expected(result,
+ "precedence must be a list or tuple\n"
+ ))
def test_yacc_badprec2(self):
- run_import("yacc_badprec2")
+ self.assertRaises(ply.yacc.YaccError,run_import,"yacc_badprec2")
result = sys.stderr.getvalue()
self.assert_(check_expected(result,
- "yacc: Invalid precedence table.\n"
- "yacc: Generating LALR parsing table...\n"
- "yacc: 8 shift/reduce conflicts\n"
+ "Bad precedence table\n"
))
def test_yacc_badprec3(self):
run_import("yacc_badprec3")
result = sys.stderr.getvalue()
self.assert_(check_expected(result,
- "yacc: Precedence already specified for terminal 'MINUS'\n"
- "yacc: Generating LALR parsing table...\n"
+ "Precedence already specified for terminal 'MINUS'\n"
+ "Generating LALR tables\n"
+
))
def test_yacc_badrule(self):
@@ -96,58 +103,60 @@ class YaccErrorWarningTests(unittest.TestCase):
try:
run_import("yacc_badtok")
except ply.yacc.YaccError:
- e = sys.exc_info()[1]
- self.assert_(check_expected(str(e),
- "tokens must be a list or tuple."))
+ result = sys.stderr.getvalue()
+ self.assert_(check_expected(result,
+ "tokens must be a list or tuple\n"))
def test_yacc_dup(self):
run_import("yacc_dup")
result = sys.stderr.getvalue()
self.assert_(check_expected(result,
"yacc_dup.py:27: Function p_statement redefined. Previously defined on line 23\n"
- "yacc: Warning. Token 'EQUALS' defined, but not used.\n"
- "yacc: Warning. There is 1 unused token.\n"
- "yacc: Generating LALR parsing table...\n"
+ "Token 'EQUALS' defined, but not used\n"
+ "There is 1 unused token\n"
+ "Generating LALR tables\n"
+
))
def test_yacc_error1(self):
try:
run_import("yacc_error1")
except ply.yacc.YaccError:
- e = sys.exc_info()[1]
- self.assert_(check_expected(str(e),
- "yacc_error1.py:61: p_error() requires 1 argument."))
+ result = sys.stderr.getvalue()
+ self.assert_(check_expected(result,
+ "yacc_error1.py:61: p_error() requires 1 argument\n"))
def test_yacc_error2(self):
try:
run_import("yacc_error2")
except ply.yacc.YaccError:
- e = sys.exc_info()[1]
- self.assert_(check_expected(str(e),
- "yacc_error2.py:61: p_error() requires 1 argument."))
+ result = sys.stderr.getvalue()
+ self.assert_(check_expected(result,
+ "yacc_error2.py:61: p_error() requires 1 argument\n"))
def test_yacc_error3(self):
try:
run_import("yacc_error3")
except ply.yacc.YaccError:
e = sys.exc_info()[1]
- self.assert_(check_expected(str(e),
- "'p_error' defined, but is not a function or method."))
+ result = sys.stderr.getvalue()
+ self.assert_(check_expected(result,
+ "'p_error' defined, but is not a function or method\n"))
def test_yacc_error4(self):
self.assertRaises(ply.yacc.YaccError,run_import,"yacc_error4")
result = sys.stderr.getvalue()
self.assert_(check_expected(result,
- "yacc_error4.py:62: Illegal rule name 'error'. Already defined as a token.\n"
+ "yacc_error4.py:62: Illegal rule name 'error'. Already defined as a token\n"
))
def test_yacc_inf(self):
self.assertRaises(ply.yacc.YaccError,run_import,"yacc_inf")
result = sys.stderr.getvalue()
self.assert_(check_expected(result,
- "yacc: Warning. Token 'NUMBER' defined, but not used.\n"
- "yacc: Warning. There is 1 unused token.\n"
- "yacc: Infinite recursion detected for symbol 'statement'.\n"
- "yacc: Infinite recursion detected for symbol 'expression'.\n"
+ "Token 'NUMBER' defined, but not used\n"
+ "There is 1 unused token\n"
+ "Infinite recursion detected for symbol 'statement'\n"
+ "Infinite recursion detected for symbol 'expression'\n"
))
def test_yacc_literal(self):
self.assertRaises(ply.yacc.YaccError,run_import,"yacc_literal")
@@ -159,14 +168,14 @@ class YaccErrorWarningTests(unittest.TestCase):
self.assertRaises(ply.yacc.YaccError,run_import,"yacc_misplaced")
result = sys.stderr.getvalue()
self.assert_(check_expected(result,
- "yacc_misplaced.py:32: Misplaced '|'.\n"
+ "yacc_misplaced.py:32: Misplaced '|'\n"
))
def test_yacc_missing1(self):
self.assertRaises(ply.yacc.YaccError,run_import,"yacc_missing1")
result = sys.stderr.getvalue()
self.assert_(check_expected(result,
- "yacc_missing1.py:24: Symbol 'location' used, but not defined as a token or a rule.\n"
+ "yacc_missing1.py:24: Symbol 'location' used, but not defined as a token or a rule\n"
))
def test_yacc_nested(self):
@@ -182,92 +191,96 @@ class YaccErrorWarningTests(unittest.TestCase):
run_import("yacc_nodoc")
result = sys.stderr.getvalue()
self.assert_(check_expected(result,
- "yacc_nodoc.py:27: No documentation string specified in function 'p_statement_expr'\n"
- "yacc: Generating LALR parsing table...\n"
+ "yacc_nodoc.py:27: No documentation string specified in function 'p_statement_expr' (ignored)\n"
+ "Generating LALR tables\n"
))
def test_yacc_noerror(self):
run_import("yacc_noerror")
result = sys.stderr.getvalue()
self.assert_(check_expected(result,
- "yacc: Warning. no p_error() function is defined.\n"
- "yacc: Generating LALR parsing table...\n"
+ "no p_error() function is defined\n"
+ "Generating LALR tables\n"
))
def test_yacc_nop(self):
run_import("yacc_nop")
result = sys.stderr.getvalue()
self.assert_(check_expected(result,
- "yacc_nop.py:27: Warning. Possible grammar rule 'statement_expr' defined without p_ prefix.\n"
- "yacc: Generating LALR parsing table...\n"
+ "yacc_nop.py:27: Possible grammar rule 'statement_expr' defined without p_ prefix\n"
+ "Generating LALR tables\n"
))
def test_yacc_notfunc(self):
run_import("yacc_notfunc")
result = sys.stderr.getvalue()
self.assert_(check_expected(result,
- "yacc: Warning. 'p_statement_assign' not defined as a function\n"
- "yacc: Warning. Token 'EQUALS' defined, but not used.\n"
- "yacc: Warning. There is 1 unused token.\n"
- "yacc: Generating LALR parsing table...\n"
+ "'p_statement_assign' not defined as a function\n"
+ "Token 'EQUALS' defined, but not used\n"
+ "There is 1 unused token\n"
+ "Generating LALR tables\n"
))
def test_yacc_notok(self):
try:
run_import("yacc_notok")
except ply.yacc.YaccError:
- e = sys.exc_info()[1]
- self.assert_(check_expected(str(e),
- "module does not define a list 'tokens'"))
+ result = sys.stderr.getvalue()
+ self.assert_(check_expected(result,
+ "No token list is defined\n"))
def test_yacc_rr(self):
run_import("yacc_rr")
result = sys.stderr.getvalue()
self.assert_(check_expected(result,
- "yacc: Generating LALR parsing table...\n"
- "yacc: 1 reduce/reduce conflict\n"
+ "Generating LALR tables\n"
+ "1 reduce/reduce conflict\n"
+ "reduce/reduce conflict in state 15 resolved using rule (statement -> NAME EQUALS NUMBER)\n"
+ "rejected rule (expression -> NUMBER)\n"
+
))
def test_yacc_simple(self):
run_import("yacc_simple")
result = sys.stderr.getvalue()
self.assert_(check_expected(result,
- "yacc: Generating LALR parsing table...\n"
+ "Generating LALR tables\n"
))
def test_yacc_sr(self):
run_import("yacc_sr")
result = sys.stderr.getvalue()
self.assert_(check_expected(result,
- "yacc: Generating LALR parsing table...\n"
- "yacc: 20 shift/reduce conflicts\n"
+ "Generating LALR tables\n"
+ "20 shift/reduce conflicts\n"
))
def test_yacc_term1(self):
self.assertRaises(ply.yacc.YaccError,run_import,"yacc_term1")
result = sys.stderr.getvalue()
self.assert_(check_expected(result,
- "yacc_term1.py:24: Illegal rule name 'NUMBER'. Already defined as a token.\n"
+ "yacc_term1.py:24: Illegal rule name 'NUMBER'. Already defined as a token\n"
))
def test_yacc_unused(self):
self.assertRaises(ply.yacc.YaccError,run_import,"yacc_unused")
result = sys.stderr.getvalue()
self.assert_(check_expected(result,
- "yacc_unused.py:62: Symbol 'COMMA' used, but not defined as a token or a rule.\n"
- "yacc: Symbol 'COMMA' is unreachable.\n"
- "yacc: Symbol 'exprlist' is unreachable.\n"
+ "yacc_unused.py:62: Symbol 'COMMA' used, but not defined as a token or a rule\n"
+ "Symbol 'COMMA' is unreachable\n"
+ "Symbol 'exprlist' is unreachable\n"
))
def test_yacc_unused_rule(self):
run_import("yacc_unused_rule")
result = sys.stderr.getvalue()
self.assert_(check_expected(result,
- "yacc_unused_rule.py:62: Warning. Rule 'integer' defined, but not used.\n"
- "yacc: Warning. There is 1 unused rule.\n"
- "yacc: Symbol 'integer' is unreachable.\n"
- "yacc: Generating LALR parsing table...\n"
+ "yacc_unused_rule.py:62: Rule 'integer' defined, but not used\n"
+ "There is 1 unused rule\n"
+ "Symbol 'integer' is unreachable\n"
+ "Generating LALR tables\n"
))
def test_yacc_uprec(self):
self.assertRaises(ply.yacc.YaccError,run_import,"yacc_uprec")
result = sys.stderr.getvalue()
+ print repr(result)
self.assert_(check_expected(result,
"yacc_uprec.py:37: Nothing known about the precedence of 'UMINUS'\n"
))
@@ -276,17 +289,17 @@ class YaccErrorWarningTests(unittest.TestCase):
self.assertRaises(ply.yacc.YaccError,run_import,"yacc_uprec2")
result = sys.stderr.getvalue()
self.assert_(check_expected(result,
- "yacc_uprec2.py:37: Syntax error. Nothing follows %prec.\n"
+ "yacc_uprec2.py:37: Syntax error. Nothing follows %prec\n"
))
def test_yacc_prec1(self):
self.assertRaises(ply.yacc.YaccError,run_import,"yacc_prec1")
result = sys.stderr.getvalue()
self.assert_(check_expected(result,
- "yacc: Precedence rule 'left' defined for unknown symbol '+'\n"
- "yacc: Precedence rule 'left' defined for unknown symbol '*'\n"
- "yacc: Precedence rule 'left' defined for unknown symbol '-'\n"
- "yacc: Precedence rule 'left' defined for unknown symbol '/'\n"
+ "Precedence rule 'left' defined for unknown symbol '+'\n"
+ "Precedence rule 'left' defined for unknown symbol '*'\n"
+ "Precedence rule 'left' defined for unknown symbol '-'\n"
+ "Precedence rule 'left' defined for unknown symbol '/'\n"
))
diff --git a/test/yacc_badid.py b/test/yacc_badid.py
index 1df351c..e4b9f5e 100644
--- a/test/yacc_badid.py
+++ b/test/yacc_badid.py
@@ -28,7 +28,7 @@ def p_statement_expr(t):
'statement : expression'
print(t[1])
-def p_statement_expr(t):
+def p_statement_expr2(t):
'statement : bad&rule'
pass