diff options
author | David Beazley <dave@dabeaz.com> | 2020-02-22 15:57:54 -0600 |
---|---|---|
committer | David Beazley <dave@dabeaz.com> | 2020-02-22 15:57:54 -0600 |
commit | 1fac9fed647909b92f3779dd34beb8564f6f247b (patch) | |
tree | c520f1c56791c541105c1023c8dd68d99853ad8b | |
parent | 1321375e013425958ea090b55aecae0a4b7face6 (diff) | |
download | ply-1fac9fed647909b92f3779dd34beb8564f6f247b.tar.gz |
Massive refactoring/cleanup
70 files changed, 4157 insertions, 8657 deletions
@@ -4,10 +4,42 @@ IMPORTANT NOTE (2018-12-22): PLY is no longer released in any package-installable format. If you want to use the latest version, you need to COPY the contents of the ply/ directory into your own project and use it. Although PLY is no longer distributed as a package, it is -maintained as a mature library. No new features are planned, but -issues and pull requests for bugs are still welcome. Any changes to the +maintained as a mature library. No new major features are planned, but +issues reported for bugs are still welcome. Any changes to the software will be noted here. +Version 4.0 (In progress) +------------------------- +Note: The 4.0 series of PLY represents a massive cleanup and modernization +effort. At a fundamental level, no new "features" are being added. +Instead, a lot of outdated, inconsistent, and problematic features are +being eliminated. Here is a short summary: + + - PLY no longer writes table files or cached data. If you want this, + it's your responsibility to serialize the parser tables. Use pickle. + + - Elimination of side-effects and global variables (generally). + + - Elimination of numerous optional features in an effort to + simplify the API. + + - More use of modern Python features including iterators/generators, + keyword-only arguments, f-strings, etc. + + - Dropped support for Python 2.x +------------------------ + +01/26/20 PLY no longer writes cached table files. Honestly, the use of + the cached files made more sense when I was developing PLY on + my 200MHz PC in 2001. It's not as much of an issue now. For small + to medium sized grammars, PLY should be almost instantaneous. + If you're working with a large grammar, you can arrange + to pickle the associated grammar instance yourself if need be. + The removal of table files eliminated a large number of optional + arguments to yacc() concerning the names and packages of these files. + +01/26/20 PLY no longer supports Python 2.x. 
+ 01/01/20 Added an install.py script to make it easy to install PLY into a virtual environment if you just want to play with it. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7a62354..3e1cad6 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -3,19 +3,5 @@ Contributing to PLY PLY is a mature project. However, if you feel that you have found a bug in PLY or its documentation, please submit an issue in the form -of a bug report. - -Important note: The GitHub repo for PLY always contains the most -up-to-date version of the software. If you want to use the current -version, you should COPY the contents of the `ply/` directory into -your own project and use it. PLY is free software for you to use, -but it is not maintained as a package that you install using pip -or similar tools. - - - - - - - - +of a bug report at https://github.com/dabeaz/ply. Pull requests +to the project are not accepted. @@ -93,7 +93,7 @@ virtual environment. PLY has no third-party dependencies. -The file doc/ply.html contains complete documentation on how to use +The docs/ directory contains complete documentation on how to use the system. 
The example directory contains several different examples including a @@ -176,7 +176,7 @@ def t_newline(t): t.lexer.lineno += t.value.count("\n") def t_error(t): - print("Illegal character '%s'" % t.value[0]) + print(f"Illegal character {t.value[0]!r}") t.lexer.skip(1) # Build the lexer @@ -228,18 +228,18 @@ def p_expression_name(p): try: p[0] = names[p[1]] except LookupError: - print("Undefined name '%s'" % p[1]) + print(f"Undefined name {p[1]!r}") p[0] = 0 def p_error(p): - print("Syntax error at '%s'" % p.value) + print(f"Syntax error at {p.value!r}") import ply.yacc as yacc yacc.yacc() while True: try: - s = raw_input('calc > ') # use input() on Python 3 + s = input('calc > ') except EOFError: break yacc.parse(s) diff --git a/doc/internal.html b/doc/internal.html deleted file mode 100644 index 57e87df..0000000 --- a/doc/internal.html +++ /dev/null @@ -1,874 +0,0 @@ -<html> -<head> -<title>PLY Internals</title> -</head> -<body bgcolor="#ffffff"> - -<h1>PLY Internals</h1> - -<b> -David M. Beazley <br> -dave@dabeaz.com<br> -</b> - -<p> -<b>PLY Version: 3.11</b> -<p> - -<!-- INDEX --> -<div class="sectiontoc"> -<ul> -<li><a href="#internal_nn1">Introduction</a> -<li><a href="#internal_nn2">Grammar Class</a> -<li><a href="#internal_nn3">Productions</a> -<li><a href="#internal_nn4">LRItems</a> -<li><a href="#internal_nn5">LRTable</a> -<li><a href="#internal_nn6">LRGeneratedTable</a> -<li><a href="#internal_nn7">LRParser</a> -<li><a href="#internal_nn8">ParserReflect</a> -<li><a href="#internal_nn9">High-level operation</a> -</ul> -</div> -<!-- INDEX --> - - -<H2><a name="internal_nn1"></a>1. Introduction</H2> - - -This document describes classes and functions that make up the internal -operation of PLY. Using this programming interface, it is possible to -manually build a parser using a different interface specification -than what PLY normally uses. For example, you could build a grammar -from information parsed in a completely different input format. 
Some of -these objects may be useful for building more advanced parsing engines -such as GLR. - -<p> -It should be stressed that using PLY at this level is not for the -faint of heart. Generally, it's assumed that you know a bit of -the underlying compiler theory and how an LR parser is put together. - -<H2><a name="internal_nn2"></a>2. Grammar Class</H2> - - -The file <tt>ply.yacc</tt> defines a class <tt>Grammar</tt> that -is used to hold and manipulate information about a grammar -specification. It encapsulates the same basic information -about a grammar that is put into a YACC file including -the list of tokens, precedence rules, and grammar rules. -Various operations are provided to perform different validations -on the grammar. In addition, there are operations to compute -the first and follow sets that are needed by the various table -generation algorithms. - -<p> -<tt><b>Grammar(terminals)</b></tt> - -<blockquote> -Creates a new grammar object. <tt>terminals</tt> is a list of strings -specifying the terminals for the grammar. An instance <tt>g</tt> of -<tt>Grammar</tt> has the following methods: -</blockquote> - -<p> -<b><tt>g.set_precedence(term,assoc,level)</tt></b> -<blockquote> -Sets the precedence level and associativity for a given terminal <tt>term</tt>. -<tt>assoc</tt> is one of <tt>'right'</tt>, -<tt>'left'</tt>, or <tt>'nonassoc'</tt> and <tt>level</tt> is a positive integer. The higher -the value of <tt>level</tt>, the higher the precedence. Here is an example of typical -precedence settings: - -<pre> -g.set_precedence('PLUS', 'left',1) -g.set_precedence('MINUS', 'left',1) -g.set_precedence('TIMES', 'left',2) -g.set_precedence('DIVIDE','left',2) -g.set_precedence('UMINUS','left',3) -</pre> - -This method must be called prior to adding any productions to the -grammar with <tt>g.add_production()</tt>. The precedence of individual grammar -rules is determined by the precedence of the right-most terminal. 
- -</blockquote> -<p> -<b><tt>g.add_production(name,syms,func=None,file='',line=0)</tt></b> -<blockquote> -Adds a new grammar rule. <tt>name</tt> is the name of the rule, -<tt>syms</tt> is a list of symbols making up the right hand -side of the rule, <tt>func</tt> is the function to call when -reducing the rule. <tt>file</tt> and <tt>line</tt> specify -the filename and line number of the rule and are used for -generating error messages. - -<p> -The list of symbols in <tt>syms</tt> may include character -literals and <tt>%prec</tt> specifiers. Here are some -examples: - -<pre> -g.add_production('expr',['expr','PLUS','term'],func,file,line) -g.add_production('expr',['expr','"+"','term'],func,file,line) -g.add_production('expr',['MINUS','expr','%prec','UMINUS'],func,file,line) -</pre> - -<p> -If any kind of error is detected, a <tt>GrammarError</tt> exception -is raised with a message indicating the reason for the failure. -</blockquote> - -<p> -<b><tt>g.set_start(start=None)</tt></b> -<blockquote> -Sets the starting rule for the grammar. <tt>start</tt> is a string -specifying the name of the start rule. If <tt>start</tt> is omitted, -the first grammar rule added with <tt>add_production()</tt> is taken to be -the starting rule. This method must always be called after all -productions have been added. -</blockquote> - -<p> -<b><tt>g.find_unreachable()</tt></b> -<blockquote> -Diagnostic function. Returns a list of all unreachable non-terminals -defined in the grammar. This is used to identify inactive parts of -the grammar specification. -</blockquote> - -<p> -<b><tt>g.infinite_cycle()</tt></b> -<blockquote> -Diagnostic function. Returns a list of all non-terminals in the -grammar that result in an infinite cycle. This condition occurs if -there is no way for a grammar rule to expand to a string containing -only terminal symbols. -</blockquote> - -<p> -<b><tt>g.undefined_symbols()</tt></b> -<blockquote> -Diagnostic function. 
Returns a list of tuples <tt>(name, prod)</tt> -corresponding to undefined symbols in the grammar. <tt>name</tt> is the -name of the undefined symbol and <tt>prod</tt> is an instance of -<tt>Production</tt> which has information about the production rule -where the undefined symbol was used. -</blockquote> - -<p> -<b><tt>g.unused_terminals()</tt></b> -<blockquote> -Diagnostic function. Returns a list of terminals that were defined, -but never used in the grammar. -</blockquote> - -<p> -<b><tt>g.unused_rules()</tt></b> -<blockquote> -Diagnostic function. Returns a list of <tt>Production</tt> instances -corresponding to production rules that were defined in the grammar, -but never used anywhere. This is slightly different -than <tt>find_unreachable()</tt>. -</blockquote> - -<p> -<b><tt>g.unused_precedence()</tt></b> -<blockquote> -Diagnostic function. Returns a list of tuples <tt>(term, assoc)</tt> -corresponding to precedence rules that were set, but never used in the -grammar. <tt>term</tt> is the terminal name and <tt>assoc</tt> is the -precedence associativity (e.g., <tt>'left'</tt>, <tt>'right'</tt>, -or <tt>'nonassoc'</tt>). -</blockquote> - -<p> -<b><tt>g.compute_first()</tt></b> -<blockquote> -Compute all of the first sets for all symbols in the grammar. Returns a dictionary -mapping symbol names to a list of all first symbols. -</blockquote> - -<p> -<b><tt>g.compute_follow()</tt></b> -<blockquote> -Compute all of the follow sets for all non-terminals in the grammar. -The follow set is the set of all possible symbols that might follow a -given non-terminal. Returns a dictionary mapping non-terminal names -to a list of symbols. -</blockquote> - -<p> -<b><tt>g.build_lritems()</tt></b> -<blockquote> -Calculates all of the LR items for all productions in the grammar. This -step is required before using the grammar for any kind of table generation. -See the section on LR items below. 
-</blockquote> - -<p> -The following attributes are set by the above methods and may be useful -in code that works with the grammar. All of these attributes should be -assumed to be read-only. Changing their values directly will likely -break the grammar. - -<p> -<b><tt>g.Productions</tt></b> -<blockquote> -A list of all productions added. The first entry is reserved for -a production representing the starting rule. The objects in this list -are instances of the <tt>Production</tt> class, described shortly. -</blockquote> - -<p> -<b><tt>g.Prodnames</tt></b> -<blockquote> -A dictionary mapping the names of nonterminals to a list of all -productions of that nonterminal. -</blockquote> - -<p> -<b><tt>g.Terminals</tt></b> -<blockquote> -A dictionary mapping the names of terminals to a list of the -production numbers where they are used. -</blockquote> - -<p> -<b><tt>g.Nonterminals</tt></b> -<blockquote> -A dictionary mapping the names of nonterminals to a list of the -production numbers where they are used. -</blockquote> - -<p> -<b><tt>g.First</tt></b> -<blockquote> -A dictionary representing the first sets for all grammar symbols. This is -computed and returned by the <tt>compute_first()</tt> method. -</blockquote> - -<p> -<b><tt>g.Follow</tt></b> -<blockquote> -A dictionary representing the follow sets for all grammar rules. This is -computed and returned by the <tt>compute_follow()</tt> method. -</blockquote> - -<p> -<b><tt>g.Start</tt></b> -<blockquote> -Starting symbol for the grammar. Set by the <tt>set_start()</tt> method. -</blockquote> - -For the purposes of debugging, a <tt>Grammar</tt> object supports the <tt>__len__()</tt> and -<tt>__getitem__()</tt> special methods. Accessing <tt>g[n]</tt> returns the nth production -from the grammar. - - -<H2><a name="internal_nn3"></a>3. Productions</H2> - - -<tt>Grammar</tt> objects store grammar rules as instances of a <tt>Production</tt> class. 
This -class has no public constructor--you should only create productions by calling <tt>Grammar.add_production()</tt>. -The following attributes are available on a <tt>Production</tt> instance <tt>p</tt>. - -<p> -<b><tt>p.name</tt></b> -<blockquote> -The name of the production. For a grammar rule such as <tt>A : B C D</tt>, this is <tt>'A'</tt>. -</blockquote> - -<p> -<b><tt>p.prod</tt></b> -<blockquote> -A tuple of symbols making up the right-hand side of the production. For a grammar rule such as <tt>A : B C D</tt>, this is <tt>('B','C','D')</tt>. -</blockquote> - -<p> -<b><tt>p.number</tt></b> -<blockquote> -Production number. An integer containing the index of the production in the grammar's <tt>Productions</tt> list. -</blockquote> - -<p> -<b><tt>p.func</tt></b> -<blockquote> -The name of the reduction function associated with the production. -This is the function that will execute when reducing the entire -grammar rule during parsing. -</blockquote> - -<p> -<b><tt>p.callable</tt></b> -<blockquote> -The callable object associated with the name in <tt>p.func</tt>. This is <tt>None</tt> -unless the production has been bound using <tt>bind()</tt>. -</blockquote> - -<p> -<b><tt>p.file</tt></b> -<blockquote> -Filename associated with the production. Typically this is the file where the production was defined. Used for error messages. -</blockquote> - -<p> -<b><tt>p.lineno</tt></b> -<blockquote> -Line number associated with the production. Typically this is the line number in <tt>p.file</tt> where the production was defined. Used for error messages. -</blockquote> - -<p> -<b><tt>p.prec</tt></b> -<blockquote> -Precedence and associativity associated with the production. This is a tuple <tt>(assoc,level)</tt> where -<tt>assoc</tt> is one of <tt>'left'</tt>,<tt>'right'</tt>, or <tt>'nonassoc'</tt> and <tt>level</tt> is -an integer. 
This value is determined by the precedence of the right-most terminal symbol in the production -or by use of the <tt>%prec</tt> specifier when adding the production. -</blockquote> - -<p> -<b><tt>p.usyms</tt></b> -<blockquote> -A list of all unique symbols found in the production. -</blockquote> - -<p> -<b><tt>p.lr_items</tt></b> -<blockquote> -A list of all LR items for this production. This attribute only has a meaningful value if the -<tt>Grammar.build_lritems()</tt> method has been called. The items in this list are -instances of <tt>LRItem</tt> described below. -</blockquote> - -<p> -<b><tt>p.lr_next</tt></b> -<blockquote> -The head of a linked-list representation of the LR items in <tt>p.lr_items</tt>. -This attribute only has a meaningful value if the <tt>Grammar.build_lritems()</tt> -method has been called. Each <tt>LRItem</tt> instance has a <tt>lr_next</tt> attribute -to move to the next item. The list is terminated by <tt>None</tt>. -</blockquote> - -<p> -<b><tt>p.bind(dict)</tt></b> -<blockquote> -Binds the production function name in <tt>p.func</tt> to a callable object in -<tt>dict</tt>. This operation is typically carried out in the last step -prior to running the parsing engine and is needed since parsing tables are typically -read from files which only include the function names, not the functions themselves. -</blockquote> - -<P> -<tt>Production</tt> objects support -the <tt>__len__()</tt>, <tt>__getitem__()</tt>, and <tt>__str__()</tt> -special methods. -<tt>len(p)</tt> returns the number of symbols in <tt>p.prod</tt> -and <tt>p[n]</tt> is the same as <tt>p.prod[n]</tt>. - -<H2><a name="internal_nn4"></a>4. LRItems</H2> - - -The construction of parsing tables in an LR-based parser generator is primarily -done over a set of "LR Items". An LR item represents a stage of parsing one -of the grammar rules. To compute the LR items, it is first necessary to -call <tt>Grammar.build_lritems()</tt>. 
Once this step is complete, all of the productions -in the grammar will have their LR items attached to them. - -<p> -Here is an interactive example that shows what LR items look like if you -interactively experiment. In this example, <tt>g</tt> is a <tt>Grammar</tt> -object. - -<blockquote> -<pre> ->>> <b>g.build_lritems()</b> ->>> <b>p = g[1]</b> ->>> <b>p</b> -Production(statement -> ID = expr) ->>> -</pre> -</blockquote> - -In the above code, <tt>p</tt> represents the first grammar rule. In -this case, a rule <tt>'statement -> ID = expr'</tt>. - -<p> -Now, let's look at the LR items for <tt>p</tt>. - -<blockquote> -<pre> ->>> <b>p.lr_items</b> -[LRItem(statement -> . ID = expr), - LRItem(statement -> ID . = expr), - LRItem(statement -> ID = . expr), - LRItem(statement -> ID = expr .)] ->>> -</pre> -</blockquote> - -In each LR item, the dot (.) represents a specific stage of parsing. In each LR item, the dot -is advanced by one symbol. It is only when the dot reaches the very end that a production -is successfully parsed. - -<p> -An instance <tt>lr</tt> of <tt>LRItem</tt> has the following -attributes that hold information related to that specific stage of -parsing. - -<p> -<b><tt>lr.name</tt></b> -<blockquote> -The name of the grammar rule. For example, <tt>'statement'</tt> in the above example. -</blockquote> - -<p> -<b><tt>lr.prod</tt></b> -<blockquote> -A tuple of symbols representing the right-hand side of the production, including the -special <tt>'.'</tt> character. For example, <tt>('ID','.','=','expr')</tt>. -</blockquote> - -<p> -<b><tt>lr.number</tt></b> -<blockquote> -An integer representing the production number in the grammar. -</blockquote> - -<p> -<b><tt>lr.usyms</tt></b> -<blockquote> -A set of unique symbols in the production. Inherited from the original <tt>Production</tt> instance. -</blockquote> - -<p> -<b><tt>lr.lr_index</tt></b> -<blockquote> -An integer representing the position of the dot (.). 
You should never use <tt>lr.prod.index()</tt> -to search for it--the result will be wrong if the grammar happens to also use (.) as a character -literal. -</blockquote> - -<p> -<b><tt>lr.lr_after</tt></b> -<blockquote> -A list of all productions that can legally appear immediately to the right of the -dot (.). This list contains <tt>Production</tt> instances. This attribute -represents all of the possible branches a parse can take from the current position. -For example, suppose that <tt>lr</tt> represents a stage immediately before -an expression like this: - -<pre> ->>> <b>lr</b> -LRItem(statement -> ID = . expr) ->>> -</pre> - -Then, the value of <tt>lr.lr_after</tt> might look like this, showing all productions that -can legally appear next: - -<pre> ->>> <b>lr.lr_after</b> -[Production(expr -> expr PLUS expr), - Production(expr -> expr MINUS expr), - Production(expr -> expr TIMES expr), - Production(expr -> expr DIVIDE expr), - Production(expr -> MINUS expr), - Production(expr -> LPAREN expr RPAREN), - Production(expr -> NUMBER), - Production(expr -> ID)] ->>> -</pre> - -</blockquote> - -<p> -<b><tt>lr.lr_before</tt></b> -<blockquote> -The grammar symbol that appears immediately before the dot (.) or <tt>None</tt> if -at the beginning of the parse. -</blockquote> - -<p> -<b><tt>lr.lr_next</tt></b> -<blockquote> -A link to the next LR item, representing the next stage of the parse. <tt>None</tt> if <tt>lr</tt> -is the last LR item. -</blockquote> - -<tt>LRItem</tt> instances also support the <tt>__len__()</tt> and <tt>__getitem__()</tt> special methods. -<tt>len(lr)</tt> returns the number of items in <tt>lr.prod</tt> including the dot (.). <tt>lr[n]</tt> -returns <tt>lr.prod[n]</tt>. - -<p> -It goes without saying that all of the attributes associated with LR -items should be assumed to be read-only. Modifications will very -likely create a small black-hole that will consume you and your code. - -<H2><a name="internal_nn5"></a>5. 
LRTable</H2> - - -The <tt>LRTable</tt> class is used to represent LR parsing table data. This -minimally includes the production list, action table, and goto table. - -<p> -<b><tt>LRTable()</tt></b> -<blockquote> -Create an empty LRTable object. This object contains only the information needed to -run an LR parser. -</blockquote> - -An instance <tt>lrtab</tt> of <tt>LRTable</tt> has the following methods: - -<p> -<b><tt>lrtab.read_table(module)</tt></b> -<blockquote> -Populates the LR table with information from the module specified in <tt>module</tt>. -<tt>module</tt> is either a module object already loaded with <tt>import</tt> or -the name of a Python module. If it's a string containing a module name, it is -loaded and parsing data is extracted. Returns the signature value that was used -when initially writing the tables. Raises a <tt>VersionError</tt> exception if -the module was created using an incompatible version of PLY. -</blockquote> - -<p> -<b><tt>lrtab.bind_callables(dict)</tt></b> -<blockquote> -This binds all of the function names used in productions to callable objects -found in the dictionary <tt>dict</tt>. During table generation and when reading -LR tables from files, PLY only uses the names of action functions such as <tt>'p_expr'</tt>, -<tt>'p_statement'</tt>, etc. In order to actually run the parser, these names -have to be bound to callable objects. This method is always called prior to -running a parser. -</blockquote> - -After <tt>lrtab</tt> has been populated, the following attributes are defined. - -<p> -<b><tt>lrtab.lr_method</tt></b> -<blockquote> -The LR parsing method used (e.g., <tt>'LALR'</tt>) -</blockquote> - - -<p> -<b><tt>lrtab.lr_productions</tt></b> -<blockquote> -The production list. If the parsing tables have been newly -constructed, this will be a list of <tt>Production</tt> instances. If -the parsing tables have been read from a file, it's a list -of <tt>MiniProduction</tt> instances. 
This, together -with <tt>lr_action</tt> and <tt>lr_goto</tt> contain all of the -information needed by the LR parsing engine. -</blockquote> - -<p> -<b><tt>lrtab.lr_action</tt></b> -<blockquote> -The LR action dictionary that implements the underlying state machine. -The keys of this dictionary are the LR states. -</blockquote> - -<p> -<b><tt>lrtab.lr_goto</tt></b> -<blockquote> -The LR goto table that contains information about grammar rule reductions. -</blockquote> - - -<H2><a name="internal_nn6"></a>6. LRGeneratedTable</H2> - - -The <tt>LRGeneratedTable</tt> class represents constructed LR parsing tables on a -grammar. It is a subclass of <tt>LRTable</tt>. - -<p> -<b><tt>LRGeneratedTable(grammar, method='LALR',log=None)</tt></b> -<blockquote> -Create the LR parsing tables on a grammar. <tt>grammar</tt> is an instance of <tt>Grammar</tt>, -<tt>method</tt> is a string with the parsing method (<tt>'SLR'</tt> or <tt>'LALR'</tt>), and -<tt>log</tt> is a logger object used to write debugging information. The debugging information -written to <tt>log</tt> is the same as what appears in the <tt>parser.out</tt> file created -by yacc. By supplying a custom logger with a different message format, it is possible to get -more information (e.g., the line number in <tt>yacc.py</tt> used for issuing each line of -output in the log). The result is an instance of <tt>LRGeneratedTable</tt>. -</blockquote> - -<p> -An instance <tt>lr</tt> of <tt>LRGeneratedTable</tt> has the following attributes. - -<p> -<b><tt>lr.grammar</tt></b> -<blockquote> -A link to the Grammar object used to construct the parsing tables. -</blockquote> - -<p> -<b><tt>lr.lr_method</tt></b> -<blockquote> -The LR parsing method used (e.g., <tt>'LALR'</tt>) -</blockquote> - - -<p> -<b><tt>lr.lr_productions</tt></b> -<blockquote> -A reference to <tt>grammar.Productions</tt>. This, together with <tt>lr_action</tt> and <tt>lr_goto</tt> -contain all of the information needed by the LR parsing engine. 
-</blockquote> - -<p> -<b><tt>lr.lr_action</tt></b> -<blockquote> -The LR action dictionary that implements the underlying state machine. The keys of this dictionary are -the LR states. -</blockquote> - -<p> -<b><tt>lr.lr_goto</tt></b> -<blockquote> -The LR goto table that contains information about grammar rule reductions. -</blockquote> - -<p> -<b><tt>lr.sr_conflicts</tt></b> -<blockquote> -A list of tuples <tt>(state,token,resolution)</tt> identifying all shift/reduce conflicts. <tt>state</tt> is the LR state -number where the conflict occurred, <tt>token</tt> is the token causing the conflict, and <tt>resolution</tt> is -a string describing the resolution taken. <tt>resolution</tt> is either <tt>'shift'</tt> or <tt>'reduce'</tt>. -</blockquote> - -<p> -<b><tt>lr.rr_conflicts</tt></b> -<blockquote> -A list of tuples <tt>(state,rule,rejected)</tt> identifying all reduce/reduce conflicts. <tt>state</tt> is the -LR state number where the conflict occurred, <tt>rule</tt> is the production rule that was selected -and <tt>rejected</tt> is the production rule that was rejected. Both <tt>rule</tt> and <tt>rejected</tt> are -instances of <tt>Production</tt>. They can be inspected to provide the user with more information. -</blockquote> - -<p> -There are two public methods of <tt>LRGeneratedTable</tt>. - -<p> -<b><tt>lr.write_table(modulename,outputdir="",signature="")</tt></b> -<blockquote> -Writes the LR parsing table information to a Python module. <tt>modulename</tt> is a string -specifying the name of a module such as <tt>"parsetab"</tt>. <tt>outputdir</tt> is the name of a -directory where the module should be created. <tt>signature</tt> is a string representing a -grammar signature that's written into the output file. This can be used to detect when -the data stored in a module file is out-of-sync with the grammar specification (and that -the tables need to be regenerated). 
If <tt>modulename</tt> is a string <tt>"parsetab"</tt>, -this function creates a file called <tt>parsetab.py</tt>. If the module name represents a -package such as <tt>"foo.bar.parsetab"</tt>, then only the last component, <tt>"parsetab"</tt> is -used. -</blockquote> - - -<H2><a name="internal_nn7"></a>7. LRParser</H2> - - -The <tt>LRParser</tt> class implements the low-level LR parsing engine. - - -<p> -<b><tt>LRParser(lrtab, error_func)</tt></b> -<blockquote> -Create an LRParser. <tt>lrtab</tt> is an instance of <tt>LRTable</tt> -containing the LR production and state tables. <tt>error_func</tt> is the -error function to invoke in the event of a parsing error. -</blockquote> - -An instance <tt>p</tt> of <tt>LRParser</tt> has the following methods: - -<p> -<b><tt>p.parse(input=None,lexer=None,debug=0,tracking=0,tokenfunc=None)</tt></b> -<blockquote> -Run the parser. <tt>input</tt> is a string, which if supplied is fed into the -lexer using its <tt>input()</tt> method. <tt>lexer</tt> is an instance of the -<tt>Lexer</tt> class to use for tokenizing. If not supplied, the last lexer -created with the <tt>lex</tt> module is used. <tt>debug</tt> is a boolean flag -that enables debugging. <tt>tracking</tt> is a boolean flag that tells the -parser to perform additional line number tracking. <tt>tokenfunc</tt> is a callable -function that returns the next token. If supplied, the parser will use it to get -all tokens. -</blockquote> - -<p> -<b><tt>p.restart()</tt></b> -<blockquote> -Resets the parser state for a parse already in progress. -</blockquote> - -<H2><a name="internal_nn8"></a>8. ParserReflect</H2> - - -<p> -The <tt>ParserReflect</tt> class is used to collect parser specification data -from a Python module or object. This class is what collects all of the -<tt>p_rule()</tt> functions in a PLY file, performs basic error checking, -and collects all of the needed information to build a grammar. 
Most of the -high-level PLY interface as used by the <tt>yacc()</tt> function is actually -implemented by this class. - -<p> -<b><tt>ParserReflect(pdict, log=None)</tt></b> -<blockquote> -Creates a <tt>ParserReflect</tt> instance. <tt>pdict</tt> is a dictionary -containing parser specification data. This dictionary typically corresponds -to the module or class dictionary of code that implements a PLY parser. -<tt>log</tt> is a logger instance that will be used to report error -messages. -</blockquote> - -An instance <tt>p</tt> of <tt>ParserReflect</tt> has the following methods: - -<p> -<b><tt>p.get_all()</tt></b> -<blockquote> -Collect and store all required parsing information. -</blockquote> - -<p> -<b><tt>p.validate_all()</tt></b> -<blockquote> -Validate all of the collected parsing information. This is a separate step -from <tt>p.get_all()</tt> as a performance optimization. In order to -reduce parser start-up time, a parser can elect to only validate the -parsing data when regenerating the parsing tables. The validation -step tries to collect as much information as possible rather than -raising an exception at the first sign of trouble. The attribute -<tt>p.error</tt> is set if there are any validation errors. The -value of this attribute is also returned. -</blockquote> - -<p> -<b><tt>p.signature()</tt></b> -<blockquote> -Compute a signature representing the contents of the collected parsing -data. The signature value should change if anything in the parser -specification has changed in a way that would justify parser table -regeneration. This method can be called after <tt>p.get_all()</tt>, -but before <tt>p.validate_all()</tt>. -</blockquote> - -The following attributes are set in the process of collecting data: - -<p> -<b><tt>p.start</tt></b> -<blockquote> -The grammar start symbol, if any. Taken from <tt>pdict['start']</tt>. -</blockquote> - -<p> -<b><tt>p.error_func</tt></b> -<blockquote> -The error handling function or <tt>None</tt>. 
Taken from <tt>pdict['p_error']</tt>. -</blockquote> - -<p> -<b><tt>p.tokens</tt></b> -<blockquote> -The token list. Taken from <tt>pdict['tokens']</tt>. -</blockquote> - -<p> -<b><tt>p.prec</tt></b> -<blockquote> -The precedence specifier. Taken from <tt>pdict['precedence']</tt>. -</blockquote> - -<p> -<b><tt>p.preclist</tt></b> -<blockquote> -A parsed version of the precedence specified. A list of tuples of the form -<tt>(token,assoc,level)</tt> where <tt>token</tt> is the terminal symbol, -<tt>assoc</tt> is the associativity (e.g., <tt>'left'</tt>) and <tt>level</tt> -is a numeric precedence level. -</blockquote> - -<p> -<b><tt>p.grammar</tt></b> -<blockquote> -A list of tuples <tt>(name, rules)</tt> representing the grammar rules. <tt>name</tt> is the -name of a Python function or method in <tt>pdict</tt> that starts with <tt>"p_"</tt>. -<tt>rules</tt> is a list of tuples <tt>(filename,line,prodname,syms)</tt> representing -the grammar rules found in the documentation string of that function. <tt>filename</tt> and <tt>line</tt> contain location -information that can be used for debugging. <tt>prodname</tt> is the name of the -production. <tt>syms</tt> is the right-hand side of the production. If you have a -function like this - -<pre> -def p_expr(p): - '''expr : expr PLUS expr - | expr MINUS expr - | expr TIMES expr - | expr DIVIDE expr''' -</pre> - -then the corresponding entry in <tt>p.grammar</tt> might look like this: - -<pre> -('p_expr', [ ('calc.py',10,'expr', ['expr','PLUS','expr']), - ('calc.py',11,'expr', ['expr','MINUS','expr']), - ('calc.py',12,'expr', ['expr','TIMES','expr']), - ('calc.py',13,'expr', ['expr','DIVIDE','expr']) - ]) -</pre> -</blockquote> - -<p> -<b><tt>p.pfuncs</tt></b> -<blockquote> -A sorted list of tuples <tt>(line, file, name, doc)</tt> representing all of -the <tt>p_</tt> functions found. <tt>line</tt> and <tt>file</tt> give location -information. <tt>name</tt> is the name of the function. 
<tt>doc</tt> is the -documentation string. This list is sorted in ascending order by line number. -</blockquote> - -<p> -<b><tt>p.files</tt></b> -<blockquote> -A dictionary holding all of the source filenames that were encountered -while collecting parser information. Only the keys of this dictionary have -any meaning. -</blockquote> - -<p> -<b><tt>p.error</tt></b> -<blockquote> -An attribute that indicates whether or not any critical errors -occurred in validation. If this is set, it means that some kind -of problem was detected and that no further processing should be -performed. -</blockquote> - - -<H2><a name="internal_nn9"></a>9. High-level operation</H2> - - -Using all of the above classes requires some attention to detail. The <tt>yacc()</tt> -function carries out a very specific sequence of operations to create a grammar. -This same sequence should be emulated if you build an alternative PLY interface. - -<ol> -<li>A <tt>ParserReflect</tt> object is created and raw grammar specification data is -collected. -<li>A <tt>Grammar</tt> object is created and populated with information -from the specification data. -<li>A <tt>LRGenerator</tt> object is created to run the LALR algorithm over -the <tt>Grammar</tt> object. -<li>Productions in the LRGenerator are bound to callables using the <tt>bind_callables()</tt> -method. -<li>A <tt>LRParser</tt> object is created from the information in the -<tt>LRGenerator</tt> object. -</ol> - -</body> -</html> - - - - - - - diff --git a/doc/makedoc.py b/doc/makedoc.py deleted file mode 100644 index e5cbdb0..0000000 --- a/doc/makedoc.py +++ /dev/null @@ -1,194 +0,0 @@ -#!/usr/local/bin/python - -############################################################################### -# Takes a chapter as input and adds internal links and numbering to all -# of the H1, H2, H3, H4 and H5 sections. -# -# Every heading HTML tag (H1, H2 etc) is given an autogenerated name to link -# to.
However, if the name is not an autogenerated name from a previous run, -# it will be kept. If it is autogenerated, it might change on subsequent runs -# of this program. Thus if you want to create links to one of the headings, -# then change the heading link name to something that does not look like an -# autogenerated link name. -############################################################################### - -import sys -import re -import string - -############################################################################### -# Functions -############################################################################### - -# Regexs for <a name="..."></a> -alink = re.compile(r"<a *name *= *\"(.*)\"></a>", re.IGNORECASE) -heading = re.compile(r"(_nn\d)", re.IGNORECASE) - -def getheadingname(m): - autogeneratedheading = True - if m.group(1) != None: - amatch = alink.match(m.group(1)) - if amatch: - # A non-autogenerated heading - keep it - headingname = amatch.group(1) - autogeneratedheading = heading.match(headingname) - if autogeneratedheading: - # The heading name was either non-existent or autogenerated, - # We can create a new heading / change the existing heading - headingname = "%s_nn%d" % (filenamebase, nameindex) - return headingname - -############################################################################### -# Main program -############################################################################### - -if len(sys.argv) != 2: - print "usage: makedoc.py filename" - sys.exit(1) - -filename = sys.argv[1] -filenamebase = string.split(filename,".")[0] - -section = 0 -subsection = 0 -subsubsection = 0 -subsubsubsection = 0 -nameindex = 0 - -name = "" - -# Regexs for <h1>,... 
<h5> sections - -h1 = re.compile(r".*?<H1>(<a.*a>)*[\d\.\s]*(.*?)</H1>", re.IGNORECASE) -h2 = re.compile(r".*?<H2>(<a.*a>)*[\d\.\s]*(.*?)</H2>", re.IGNORECASE) -h3 = re.compile(r".*?<H3>(<a.*a>)*[\d\.\s]*(.*?)</H3>", re.IGNORECASE) -h4 = re.compile(r".*?<H4>(<a.*a>)*[\d\.\s]*(.*?)</H4>", re.IGNORECASE) -h5 = re.compile(r".*?<H5>(<a.*a>)*[\d\.\s]*(.*?)</H5>", re.IGNORECASE) - -# Make backup -with open(filename) as src, open(filename+".bak","w") as dst: - dst.write(src.read()) - -lines = data.splitlines() -result = [ ] # This is the result of postprocessing the file -index = "<!-- INDEX -->\n<div class=\"sectiontoc\">\n" # index contains the index for adding at the top of the file. Also printed to stdout. - -skip = 0 -skipspace = 0 - -for s in lines: - if s == "<!-- INDEX -->": - if not skip: - result.append("@INDEX@") - skip = 1 - else: - skip = 0 - continue - if skip: - continue - - if not s and skipspace: - continue - - if skipspace: - result.append("") - result.append("") - skipspace = 0 - - m = h2.match(s) - if m: - prevheadingtext = m.group(2) - nameindex += 1 - section += 1 - headingname = getheadingname(m) - result.append("""<H2><a name="%s"></a>%d. 
%s</H2>""" % (headingname,section, prevheadingtext)) - - if subsubsubsection: - index += "</ul>\n" - if subsubsection: - index += "</ul>\n" - if subsection: - index += "</ul>\n" - if section == 1: - index += "<ul>\n" - - index += """<li><a href="#%s">%s</a>\n""" % (headingname,prevheadingtext) - subsection = 0 - subsubsection = 0 - subsubsubsection = 0 - skipspace = 1 - continue - m = h3.match(s) - if m: - prevheadingtext = m.group(2) - nameindex += 1 - subsection += 1 - headingname = getheadingname(m) - result.append("""<H3><a name="%s"></a>%d.%d %s</H3>""" % (headingname,section, subsection, prevheadingtext)) - - if subsubsubsection: - index += "</ul>\n" - if subsubsection: - index += "</ul>\n" - if subsection == 1: - index += "<ul>\n" - - index += """<li><a href="#%s">%s</a>\n""" % (headingname,prevheadingtext) - subsubsection = 0 - skipspace = 1 - continue - m = h4.match(s) - if m: - prevheadingtext = m.group(2) - nameindex += 1 - subsubsection += 1 - subsubsubsection = 0 - headingname = getheadingname(m) - result.append("""<H4><a name="%s"></a>%d.%d.%d %s</H4>""" % (headingname,section, subsection, subsubsection, prevheadingtext)) - - if subsubsubsection: - index += "</ul>\n" - if subsubsection == 1: - index += "<ul>\n" - - index += """<li><a href="#%s">%s</a>\n""" % (headingname,prevheadingtext) - skipspace = 1 - continue - m = h5.match(s) - if m: - prevheadingtext = m.group(2) - nameindex += 1 - subsubsubsection += 1 - headingname = getheadingname(m) - result.append("""<H5><a name="%s"></a>%d.%d.%d.%d %s</H5>""" % (headingname,section, subsection, subsubsection, subsubsubsection, prevheadingtext)) - - if subsubsubsection == 1: - index += "<ul>\n" - - index += """<li><a href="#%s">%s</a>\n""" % (headingname,prevheadingtext) - skipspace = 1 - continue - - result.append(s) - -if subsubsubsection: - index += "</ul>\n" - -if subsubsection: - index += "</ul>\n" - -if subsection: - index += "</ul>\n" - -if section: - index += "</ul>\n" - -index += "</div>\n<!-- 
INDEX -->\n" - -data = "\n".join(result) - -data = data.replace("@INDEX@",index) + "\n" - -# Write the file back out -with open(filename,"w") as f: - f.write(data) diff --git a/doc/ply.html b/doc/ply.html deleted file mode 100644 index 6b8aca9..0000000 --- a/doc/ply.html +++ /dev/null @@ -1,3496 +0,0 @@ -<html> -<head> -<title>PLY (Python Lex-Yacc)</title> -</head> -<body bgcolor="#ffffff"> - -<h1>PLY (Python Lex-Yacc)</h1> - -<b> -David M. Beazley <br> -dave@dabeaz.com<br> -</b> - -<p> -<b>PLY Version: 3.11</b> -<p> - -<!-- INDEX --> -<div class="sectiontoc"> -<ul> -<li><a href="#ply_nn0">Preface and Requirements</a> -<li><a href="#ply_nn1">Introduction</a> -<li><a href="#ply_nn2">PLY Overview</a> -<li><a href="#ply_nn3">Lex</a> -<ul> -<li><a href="#ply_nn4">Lex Example</a> -<li><a href="#ply_nn5">The tokens list</a> -<li><a href="#ply_nn6">Specification of tokens</a> -<li><a href="#ply_nn7">Token values</a> -<li><a href="#ply_nn8">Discarded tokens</a> -<li><a href="#ply_nn9">Line numbers and positional information</a> -<li><a href="#ply_nn10">Ignored characters</a> -<li><a href="#ply_nn11">Literal characters</a> -<li><a href="#ply_nn12">Error handling</a> -<li><a href="#ply_nn14">EOF Handling</a> -<li><a href="#ply_nn13">Building and using the lexer</a> -<li><a href="#ply_nn14b">The @TOKEN decorator</a> -<li><a href="#ply_nn15">Optimized mode</a> -<li><a href="#ply_nn16">Debugging</a> -<li><a href="#ply_nn17">Alternative specification of lexers</a> -<li><a href="#ply_nn18">Maintaining state</a> -<li><a href="#ply_nn19">Lexer cloning</a> -<li><a href="#ply_nn20">Internal lexer state</a> -<li><a href="#ply_nn21">Conditional lexing and start conditions</a> -<li><a href="#ply_nn21b">Miscellaneous Issues</a> -</ul> -<li><a href="#ply_nn22">Parsing basics</a> -<li><a href="#ply_nn23">Yacc</a> -<ul> -<li><a href="#ply_nn24">An example</a> -<li><a href="#ply_nn25">Combining Grammar Rule Functions</a> -<li><a href="#ply_nn26">Character Literals</a> -<li><a 
href="#ply_nn26b">Empty Productions</a> -<li><a href="#ply_nn28">Changing the starting symbol</a> -<li><a href="#ply_nn27">Dealing With Ambiguous Grammars</a> -<li><a href="#ply_nn28b">The parser.out file</a> -<li><a href="#ply_nn29">Syntax Error Handling</a> -<ul> -<li><a href="#ply_nn30">Recovery and resynchronization with error rules</a> -<li><a href="#ply_nn31">Panic mode recovery</a> -<li><a href="#ply_nn35">Signalling an error from a production</a> -<li><a href="#ply_nn38">When Do Syntax Errors Get Reported</a> -<li><a href="#ply_nn32">General comments on error handling</a> -</ul> -<li><a href="#ply_nn33">Line Number and Position Tracking</a> -<li><a href="#ply_nn34">AST Construction</a> -<li><a href="#ply_nn35b">Embedded Actions</a> -<li><a href="#ply_nn36">Miscellaneous Yacc Notes</a> -</ul> -<li><a href="#ply_nn37">Multiple Parsers and Lexers</a> -<li><a href="#ply_nn38b">Using Python's Optimized Mode</a> -<li><a href="#ply_nn44">Advanced Debugging</a> -<ul> -<li><a href="#ply_nn45">Debugging the lex() and yacc() commands</a> -<li><a href="#ply_nn46">Run-time Debugging</a> -</ul> -<li><a href="#ply_nn49">Packaging Advice</a> -<li><a href="#ply_nn39">Where to go from here?</a> -</ul> -</div> -<!-- INDEX --> - - - - - - -<H2><a name="ply_nn0"></a>1. Preface and Requirements</H2> - - -<p> -This document provides an overview of lexing and parsing with PLY. -Given the intrinsic complexity of parsing, I would strongly advise -that you read (or at least skim) this entire document before jumping -into a big development project with PLY. -</p> - -<p> -PLY-3.5 is compatible with both Python 2 and Python 3. If you are using -Python 2, you have to use Python 2.6 or newer. -</p> - -<H2><a name="ply_nn1"></a>2. Introduction</H2> - - -PLY is a pure-Python implementation of the popular compiler -construction tools lex and yacc. The main goal of PLY is to stay -fairly faithful to the way in which traditional lex/yacc tools work. 
-This includes supporting LALR(1) parsing as well as providing -extensive input validation, error reporting, and diagnostics. Thus, -if you've used yacc in another programming language, it should be -relatively straightforward to use PLY. - -<p> -Early versions of PLY were developed to support an Introduction to -Compilers Course I taught in 2001 at the University of Chicago. -Since PLY was primarily developed as an instructional tool, you will -find it to be fairly picky about token and grammar rule -specification. In part, this -added formality is meant to catch common programming mistakes made by -novice users. However, advanced users will also find such features to -be useful when building complicated grammars for real programming -languages. It should also be noted that PLY does not provide much in -the way of bells and whistles (e.g., automatic construction of -abstract syntax trees, tree traversal, etc.). Nor would I consider it -to be a parsing framework. Instead, you will find a bare-bones, yet -fully capable lex/yacc implementation written entirely in Python. - -<p> -The rest of this document assumes that you are somewhat familiar with -parsing theory, syntax directed translation, and the use of compiler -construction tools such as lex and yacc in other programming -languages. If you are unfamiliar with these topics, you will probably -want to consult an introductory text such as "Compilers: Principles, -Techniques, and Tools", by Aho, Sethi, and Ullman. O'Reilly's "Lex -and Yacc" by John Levine may also be handy. In fact, the O'Reilly book can be -used as a reference for PLY as the concepts are virtually identical. - -<H2><a name="ply_nn2"></a>3. PLY Overview</H2> - - -<p> -PLY consists of two separate modules; <tt>lex.py</tt> and -<tt>yacc.py</tt>, both of which are found in a Python package -called <tt>ply</tt>. The <tt>lex.py</tt> module is used to break input text into a -collection of tokens specified by a collection of regular expression -rules. 
<tt>yacc.py</tt> is used to recognize language syntax that has -been specified in the form of a context free grammar. -</p> - -<p> -The two tools are meant to work together. Specifically, -<tt>lex.py</tt> provides an external interface in the form of a -<tt>token()</tt> function that returns the next valid token on the -input stream. <tt>yacc.py</tt> calls this repeatedly to retrieve -tokens and invoke grammar rules. The output of <tt>yacc.py</tt> is -often an Abstract Syntax Tree (AST). However, this is entirely up to -the user. If desired, <tt>yacc.py</tt> can also be used to implement -simple one-pass compilers. - -<p> -Like its Unix counterpart, <tt>yacc.py</tt> provides most of the -features you expect including extensive error checking, grammar -validation, support for empty productions, error tokens, and ambiguity -resolution via precedence rules. In fact, almost everything that is possible in traditional yacc -should be supported in PLY. - -<p> -The primary difference between -<tt>yacc.py</tt> and Unix <tt>yacc</tt> is that <tt>yacc.py</tt> -doesn't involve a separate code-generation process. -Instead, PLY relies on reflection (introspection) -to build its lexers and parsers. Unlike traditional lex/yacc which -require a special input file that is converted into a separate source -file, the specifications given to PLY <em>are</em> valid Python -programs. This means that there are no extra source files nor is -there a special compiler construction step (e.g., running yacc to -generate Python code for the compiler). Since the generation of the -parsing tables is relatively expensive, PLY caches the results and -saves them to a file. If no changes are detected in the input source, -the tables are read from the cache. Otherwise, they are regenerated. - -<H2><a name="ply_nn3"></a>4. Lex</H2> - - -<tt>lex.py</tt> is used to tokenize an input string. 
For example, suppose -you're writing a programming language and a user supplied the following input string: - -<blockquote> -<pre> -x = 3 + 42 * (s - t) -</pre> -</blockquote> - -A tokenizer splits the string into individual tokens: - -<blockquote> -<pre> -'x','=', '3', '+', '42', '*', '(', 's', '-', 't', ')' -</pre> -</blockquote> - -Tokens are usually given names to indicate what they are. For example: - -<blockquote> -<pre> -'ID','EQUALS','NUMBER','PLUS','NUMBER','TIMES', -'LPAREN','ID','MINUS','ID','RPAREN' -</pre> -</blockquote> - -More specifically, the input is broken into pairs of token types and values. For example: - -<blockquote> -<pre> -('ID','x'), ('EQUALS','='), ('NUMBER','3'), -('PLUS','+'), ('NUMBER','42'), ('TIMES','*'), -('LPAREN','('), ('ID','s'), ('MINUS','-'), -('ID','t'), ('RPAREN',')') -</pre> -</blockquote> - -The identification of tokens is typically done by writing a series of regular expression -rules. The next section shows how this is done using <tt>lex.py</tt>. - -<H3><a name="ply_nn4"></a>4.1 Lex Example</H3> - - -The following example shows how <tt>lex.py</tt> is used to write a simple tokenizer. - -<blockquote> -<pre> -# ------------------------------------------------------------ -# calclex.py -# -# tokenizer for a simple expression evaluator for -# numbers and +,-,*,/ -# ------------------------------------------------------------ -import ply.lex as lex - -# List of token names.
This is always required -tokens = ( - 'NUMBER', - 'PLUS', - 'MINUS', - 'TIMES', - 'DIVIDE', - 'LPAREN', - 'RPAREN', -) - -# Regular expression rules for simple tokens -t_PLUS = r'\+' -t_MINUS = r'-' -t_TIMES = r'\*' -t_DIVIDE = r'/' -t_LPAREN = r'\(' -t_RPAREN = r'\)' - -# A regular expression rule with some action code -def t_NUMBER(t): - r'\d+' - t.value = int(t.value) - return t - -# Define a rule so we can track line numbers -def t_newline(t): - r'\n+' - t.lexer.lineno += len(t.value) - -# A string containing ignored characters (spaces and tabs) -t_ignore = ' \t' - -# Error handling rule -def t_error(t): - print("Illegal character '%s'" % t.value[0]) - t.lexer.skip(1) - -# Build the lexer -lexer = lex.lex() - -</pre> -</blockquote> -To use the lexer, you first need to feed it some input text using -its <tt>input()</tt> method. After that, repeated calls -to <tt>token()</tt> produce tokens. The following code shows how this -works: - -<blockquote> -<pre> - -# Test it out -data = ''' -3 + 4 * 10 - + -20 *2 -''' - -# Give the lexer some input -lexer.input(data) - -# Tokenize -while True: - tok = lexer.token() - if not tok: - break # No more input - print(tok) -</pre> -</blockquote> - -When executed, the example will produce the following output: - -<blockquote> -<pre> -$ python example.py -LexToken(NUMBER,3,2,1) -LexToken(PLUS,'+',2,3) -LexToken(NUMBER,4,2,5) -LexToken(TIMES,'*',2,7) -LexToken(NUMBER,10,2,10) -LexToken(PLUS,'+',3,14) -LexToken(MINUS,'-',3,16) -LexToken(NUMBER,20,3,18) -LexToken(TIMES,'*',3,20) -LexToken(NUMBER,2,3,21) -</pre> -</blockquote> - -Lexers also support the iteration protocol. So, you can write the above loop as follows: - -<blockquote> -<pre> -for tok in lexer: - print(tok) -</pre> -</blockquote> - -The tokens returned by <tt>lexer.token()</tt> are instances -of <tt>LexToken</tt>. This object has -attributes <tt>tok.type</tt>, <tt>tok.value</tt>, -<tt>tok.lineno</tt>, and <tt>tok.lexpos</tt>. 
The following code shows an example of -accessing these attributes: - -<blockquote> -<pre> -# Tokenize -while True: - tok = lexer.token() - if not tok: - break # No more input - print(tok.type, tok.value, tok.lineno, tok.lexpos) -</pre> -</blockquote> - -The <tt>tok.type</tt> and <tt>tok.value</tt> attributes contain the -type and value of the token itself. -<tt>tok.lineno</tt> and <tt>tok.lexpos</tt> contain information about -the location of the token. <tt>tok.lexpos</tt> is the index of the -token relative to the start of the input text. - -<H3><a name="ply_nn5"></a>4.2 The tokens list</H3> - - -<p> -All lexers must provide a list <tt>tokens</tt> that defines all of the possible token -names that can be produced by the lexer. This list is always required -and is used to perform a variety of validation checks. The tokens list is also used by the -<tt>yacc.py</tt> module to identify terminals. -</p> - -<p> -In the example, the following code specified the token names: - -<blockquote> -<pre> -tokens = ( - 'NUMBER', - 'PLUS', - 'MINUS', - 'TIMES', - 'DIVIDE', - 'LPAREN', - 'RPAREN', -) -</pre> -</blockquote> - -<H3><a name="ply_nn6"></a>4.3 Specification of tokens</H3> - - -Each token is specified by writing a regular expression rule compatible with Python's <tt>re</tt> module. Each of these rules -are defined by making declarations with a special prefix <tt>t_</tt> to indicate that it -defines a token. For simple tokens, the regular expression can -be specified as strings such as this (note: Python raw strings are used since they are the -most convenient way to write regular expression strings): - -<blockquote> -<pre> -t_PLUS = r'\+' -</pre> -</blockquote> - -In this case, the name following the <tt>t_</tt> must exactly match one of the -names supplied in <tt>tokens</tt>. If some kind of action needs to be performed, -a token rule can be specified as a function. For example, this rule matches numbers and -converts the string into a Python integer. 
- -<blockquote> -<pre> -def t_NUMBER(t): - r'\d+' - t.value = int(t.value) - return t -</pre> -</blockquote> - -When a function is used, the regular expression rule is specified in the function documentation string. -The function always takes a single argument which is an instance of -<tt>LexToken</tt>. This object has attributes of <tt>t.type</tt> which is the token type (as a string), -<tt>t.value</tt> which is the lexeme (the actual text matched), <tt>t.lineno</tt> which is the current line number, and <tt>t.lexpos</tt> which -is the position of the token relative to the beginning of the input text. -By default, <tt>t.type</tt> is set to the name following the <tt>t_</tt> prefix. The action -function can modify the contents of the <tt>LexToken</tt> object as appropriate. However, -when it is done, the resulting token should be returned. If no value is returned by the action -function, the token is simply discarded and the next token read. - -<p> -Internally, <tt>lex.py</tt> uses the <tt>re</tt> module to do its pattern matching. Patterns are compiled -using the <tt>re.VERBOSE</tt> flag which can be used to help readability. However, be aware that unescaped -whitespace is ignored and comments are allowed in this mode. If your pattern involves whitespace, make sure you -use <tt>\s</tt>. If you need to match the <tt>#</tt> character, use <tt>[#]</tt>. -</p> - -<p> -When building the master regular expression, -rules are added in the following order: -</p> - -<p> -<ol> -<li>All tokens defined by functions are added in the same order as they appear in the lexer file. -<li>Tokens defined by strings are added next by sorting them in order of decreasing regular expression length (longer expressions -are added first). -</ol> -<p> -Without this ordering, it can be difficult to correctly match certain types of tokens. For example, if you -wanted to have separate tokens for "=" and "==", you need to make sure that "==" is checked first. 
By sorting regular -expressions in order of decreasing length, this problem is solved for rules defined as strings. For functions, -the order can be explicitly controlled since rules appearing first are checked first. - -<p> -To handle reserved words, you should write a single rule to match an -identifier and do a special name lookup in a function like this: - -<blockquote> -<pre> -reserved = { - 'if' : 'IF', - 'then' : 'THEN', - 'else' : 'ELSE', - 'while' : 'WHILE', - ... -} - -tokens = ['LPAREN','RPAREN',...,'ID'] + list(reserved.values()) - -def t_ID(t): - r'[a-zA-Z_][a-zA-Z_0-9]*' - t.type = reserved.get(t.value,'ID') # Check for reserved words - return t -</pre> -</blockquote> - -This approach greatly reduces the number of regular expression rules and is likely to make things a little faster. - -<p> -<b>Note:</b> You should avoid writing individual rules for reserved words. For example, if you write rules like this, - -<blockquote> -<pre> -t_FOR = r'for' -t_PRINT = r'print' -</pre> -</blockquote> - -those rules will be triggered for identifiers that include those words as a prefix such as "forget" or "printed". This is probably not -what you want. - -<H3><a name="ply_nn7"></a>4.4 Token values</H3> - - -When tokens are returned by lex, they have a value that is stored in the <tt>value</tt> attribute. Normally, the value is the text -that was matched. However, the value can be assigned to any Python object. For instance, when lexing identifiers, you may -want to return both the identifier name and information from some sort of symbol table. To do this, you might write a rule like this: - -<blockquote> -<pre> -def t_ID(t): - ... - # Look up symbol table information and return a tuple - t.value = (t.value, symbol_lookup(t.value)) - ... - return t -</pre> -</blockquote> - -It is important to note that storing data in other attribute names is <em>not</em> recommended. The <tt>yacc.py</tt> module only exposes the -contents of the <tt>value</tt> attribute. 
Thus, accessing other attributes may be unnecessarily awkward. If you -need to store multiple values on a token, assign a tuple, dictionary, or instance to <tt>value</tt>. - -<H3><a name="ply_nn8"></a>4.5 Discarded tokens</H3> - - -To discard a token, such as a comment, simply define a token rule that returns no value. For example: - -<blockquote> -<pre> -def t_COMMENT(t): - r'\#.*' - pass - # No return value. Token discarded -</pre> -</blockquote> - -Alternatively, you can include the prefix "ignore_" in the token declaration to force a token to be ignored. For example: - -<blockquote> -<pre> -t_ignore_COMMENT = r'\#.*' -</pre> -</blockquote> - -Be advised that if you are ignoring many different kinds of text, you may still want to use functions since these provide more precise -control over the order in which regular expressions are matched (i.e., functions are matched in order of specification whereas strings are -sorted by regular expression length). - -<H3><a name="ply_nn9"></a>4.6 Line numbers and positional information</H3> - - -<p>By default, <tt>lex.py</tt> knows nothing about line numbers. This is because <tt>lex.py</tt> doesn't know anything -about what constitutes a "line" of input (e.g., the newline character or even if the input is textual data). -To update this information, you need to write a special rule. In the example, the <tt>t_newline()</tt> rule shows how to do this. - -<blockquote> -<pre> -# Define a rule so we can track line numbers -def t_newline(t): - r'\n+' - t.lexer.lineno += len(t.value) -</pre> -</blockquote> -Within the rule, the <tt>lineno</tt> attribute of the underlying lexer <tt>t.lexer</tt> is updated. -After the line number is updated, the token is simply discarded since nothing is returned. - -<p> -<tt>lex.py</tt> does not perform any kind of automatic column tracking. However, it does record positional -information related to each token in the <tt>lexpos</tt> attribute. 
Using this, it is usually possible to compute -column information as a separate step. For instance, just count backwards until you reach a newline. - -<blockquote> -<pre> -# Compute column. -# input is the input text string -# token is a token instance -def find_column(input, token): - line_start = input.rfind('\n', 0, token.lexpos) + 1 - return (token.lexpos - line_start) + 1 -</pre> -</blockquote> - -Since column information is often only useful in the context of error handling, calculating the column -position can be performed when needed as opposed to doing it for each token. - -<H3><a name="ply_nn10"></a>4.7 Ignored characters</H3> - - -<p> -The special <tt>t_ignore</tt> rule is reserved by <tt>lex.py</tt> for characters -that should be completely ignored in the input stream. -Usually this is used to skip over whitespace and other non-essential characters. -Although it is possible to define a regular expression rule for whitespace in a manner -similar to <tt>t_newline()</tt>, the use of <tt>t_ignore</tt> provides substantially better -lexing performance because it is handled as a special case and is checked in a much -more efficient manner than the normal regular expression rules. -</p> - -<p> -The characters given in <tt>t_ignore</tt> are not ignored when such characters are part of -other regular expression patterns. For example, if you had a rule to capture quoted text, -that pattern can include the ignored characters (which will be captured in the normal way). The -main purpose of <tt>t_ignore</tt> is to ignore whitespace and other padding between the -tokens that you actually want to parse. -</p> - -<H3><a name="ply_nn11"></a>4.8 Literal characters</H3> - - -<p> -Literal characters can be specified by defining a variable <tt>literals</tt> in your lexing module. 
For example: - -<blockquote> -<pre> -literals = [ '+','-','*','/' ] -</pre> -</blockquote> - -or alternatively - -<blockquote> -<pre> -literals = "+-*/" -</pre> -</blockquote> - -A literal character is simply a single character that is returned "as is" when encountered by the lexer. Literals are checked -after all of the defined regular expression rules. Thus, if a rule starts with one of the literal characters, it will always -take precedence. - -<p> -When a literal token is returned, both its <tt>type</tt> and <tt>value</tt> attributes are set to the character itself. For example, <tt>'+'</tt>. -</p> - -<p> -It's possible to write token functions that perform additional actions -when literals are matched. However, you'll need to set the token type -appropriately. For example: -</p> - -<blockquote> -<pre> -literals = [ '{', '}' ] - -def t_lbrace(t): - r'\{' - t.type = '{' # Set token type to the expected literal - return t - -def t_rbrace(t): - r'\}' - t.type = '}' # Set token type to the expected literal - return t -</pre> -</blockquote> - -<H3><a name="ply_nn12"></a>4.9 Error handling</H3> - - -<p> -The <tt>t_error()</tt> -function is used to handle lexing errors that occur when illegal -characters are detected. In this case, the <tt>t.value</tt> attribute contains the -rest of the input string that has not been tokenized. In the example, the error function -was defined as follows: - -<blockquote> -<pre> -# Error handling rule -def t_error(t): - print("Illegal character '%s'" % t.value[0]) - t.lexer.skip(1) -</pre> -</blockquote> - -In this case, we simply print the offending character and skip ahead one character by calling <tt>t.lexer.skip(1)</tt>. - -<H3><a name="ply_nn14"></a>4.10 EOF Handling</H3> - - -<p> -The <tt>t_eof()</tt> function is used to handle an end-of-file (EOF) condition in the input. As input, it -receives a token type <tt>'eof'</tt> with the <tt>lineno</tt> and <tt>lexpos</tt> attributes set appropriately. 
-The main use of this function is to provide more input to the lexer so that it can continue to parse. Here is an -example of how this works: -</p> - -<blockquote> -<pre> -# EOF handling rule -def t_eof(t): - # Get more input (Example) - more = raw_input('... ') - if more: - t.lexer.input(more) - return t.lexer.token() - return None -</pre> -</blockquote> - -<p> -The EOF function should return the next available token (by calling <tt>t.lexer.token()</tt>) or <tt>None</tt> to -indicate no more data. Be aware that setting more input with the <tt>t.lexer.input()</tt> method does -NOT reset the lexer state or the <tt>lineno</tt> attribute used for position tracking. The <tt>lexpos</tt> -attribute is reset so be aware of that if you're using it in error reporting. -</p> - -<H3><a name="ply_nn13"></a>4.11 Building and using the lexer</H3> - - -<p> -To build the lexer, the function <tt>lex.lex()</tt> is used. For example:</p> - -<blockquote> -<pre> -lexer = lex.lex() -</pre> -</blockquote> - -<p>This function -uses Python reflection (or introspection) to read the regular expression rules -out of the calling context and build the lexer. Once the lexer has been built, two methods can -be used to control the lexer. -</p> -<ul> -<li><tt>lexer.input(data)</tt>. Reset the lexer and store a new input string. -<li><tt>lexer.token()</tt>. Return the next token. Returns a special <tt>LexToken</tt> instance on success or -None if the end of the input text has been reached. -</ul> - -<H3><a name="ply_nn14b"></a>4.12 The @TOKEN decorator</H3> - - -In some applications, you may want to build tokens from a series of -more complex regular expression rules. For example: - -<blockquote> -<pre> -digit = r'([0-9])' -nondigit = r'([_A-Za-z])' -identifier = r'(' + nondigit + r'(' + digit + r'|' + nondigit + r')*)' - -def t_ID(t): - # want docstring to be identifier above. ????? - ...
-</pre> -</blockquote> - -In this case, we want the regular expression rule for <tt>ID</tt> to be one of the variables above. However, there is no -way to directly specify this using a normal documentation string. To solve this problem, you can use the <tt>@TOKEN</tt> -decorator. For example: - -<blockquote> -<pre> -from ply.lex import TOKEN - -@TOKEN(identifier) -def t_ID(t): - ... -</pre> -</blockquote> - -<p> -This will attach <tt>identifier</tt> to the docstring for <tt>t_ID()</tt> allowing <tt>lex.py</tt> to work normally. -</p> - -<H3><a name="ply_nn15"></a>4.13 Optimized mode</H3> - - -For improved performance, it may be desirable to use Python's -optimized mode (e.g., running Python with the <tt>-O</tt> -option). However, doing so causes Python to ignore documentation -strings. This presents special problems for <tt>lex.py</tt>. To -handle this case, you can create your lexer using -the <tt>optimize</tt> option as follows: - -<blockquote> -<pre> -lexer = lex.lex(optimize=1) -</pre> -</blockquote> - -Next, run Python in its normal operating mode. When you do -this, <tt>lex.py</tt> will write a file called <tt>lextab.py</tt> in -the same directory as the module containing the lexer specification. -This file contains all of the regular -expression rules and tables used during lexing. On subsequent -executions, -<tt>lextab.py</tt> will simply be imported to build the lexer. This -approach substantially improves the startup time of the lexer and it -works in Python's optimized mode. - -<p> -To change the name of the lexer-generated module, use the <tt>lextab</tt> keyword argument. For example: -</p> - -<blockquote> -<pre> -lexer = lex.lex(optimize=1,lextab="footab") -</pre> -</blockquote> - -When running in optimized mode, it is important to note that lex disables most error checking. Thus, this is really only recommended -if you're sure everything is working correctly and you're ready to start releasing production code. 
- -<H3><a name="ply_nn16"></a>4.14 Debugging</H3> - - -For the purpose of debugging, you can run <tt>lex()</tt> in a debugging mode as follows: - -<blockquote> -<pre> -lexer = lex.lex(debug=1) -</pre> -</blockquote> - -<p> -This will produce various sorts of debugging information including all of the added rules, -the master regular expressions used by the lexer, and tokens generated during lexing. -</p> - -<p> -In addition, <tt>lex.py</tt> comes with a simple main function which -will either tokenize input read from standard input or from a file specified -on the command line. To use it, simply put this in your lexer: -</p> - -<blockquote> -<pre> -if __name__ == '__main__': - lex.runmain() -</pre> -</blockquote> - -Please refer to the "Debugging" section near the end for some more advanced details -of debugging. - -<H3><a name="ply_nn17"></a>4.15 Alternative specification of lexers</H3> - - -As shown in the example, lexers are specified entirely within one Python module. If you want to -put token rules in a different module from the one in which you invoke <tt>lex()</tt>, use the -<tt>module</tt> keyword argument. - -<p> -For example, you might have a dedicated module that just contains -the token rules: - -<blockquote> -<pre> -# module: tokrules.py -# This module just contains the lexing rules - -# List of token names. 
This is always required -tokens = ( - 'NUMBER', - 'PLUS', - 'MINUS', - 'TIMES', - 'DIVIDE', - 'LPAREN', - 'RPAREN', -) - -# Regular expression rules for simple tokens -t_PLUS = r'\+' -t_MINUS = r'-' -t_TIMES = r'\*' -t_DIVIDE = r'/' -t_LPAREN = r'\(' -t_RPAREN = r'\)' - -# A regular expression rule with some action code -def t_NUMBER(t): - r'\d+' - t.value = int(t.value) - return t - -# Define a rule so we can track line numbers -def t_newline(t): - r'\n+' - t.lexer.lineno += len(t.value) - -# A string containing ignored characters (spaces and tabs) -t_ignore = ' \t' - -# Error handling rule -def t_error(t): - print("Illegal character '%s'" % t.value[0]) - t.lexer.skip(1) -</pre> -</blockquote> - -Now, if you wanted to build a tokenizer from these rules from within a different module, you would do the following (shown for Python interactive mode): - -<blockquote> -<pre> ->>> import tokrules ->>> <b>lexer = lex.lex(module=tokrules)</b> ->>> lexer.input("3 + 4") ->>> lexer.token() -LexToken(NUMBER,3,1,1,0) ->>> lexer.token() -LexToken(PLUS,'+',1,2) ->>> lexer.token() -LexToken(NUMBER,4,1,4) ->>> lexer.token() -None ->>> -</pre> -</blockquote> - -The <tt>module</tt> option can also be used to define lexers from instances of a class. For example: - -<blockquote> -<pre> -import ply.lex as lex - -class MyLexer(object): - # List of token names. 
This is always required - tokens = ( - 'NUMBER', - 'PLUS', - 'MINUS', - 'TIMES', - 'DIVIDE', - 'LPAREN', - 'RPAREN', - ) - - # Regular expression rules for simple tokens - t_PLUS = r'\+' - t_MINUS = r'-' - t_TIMES = r'\*' - t_DIVIDE = r'/' - t_LPAREN = r'\(' - t_RPAREN = r'\)' - - # A regular expression rule with some action code - # Note addition of self parameter since we're in a class - def t_NUMBER(self,t): - r'\d+' - t.value = int(t.value) - return t - - # Define a rule so we can track line numbers - def t_newline(self,t): - r'\n+' - t.lexer.lineno += len(t.value) - - # A string containing ignored characters (spaces and tabs) - t_ignore = ' \t' - - # Error handling rule - def t_error(self,t): - print("Illegal character '%s'" % t.value[0]) - t.lexer.skip(1) - - <b># Build the lexer - def build(self,**kwargs): - self.lexer = lex.lex(module=self, **kwargs)</b> - - # Test it output - def test(self,data): - self.lexer.input(data) - while True: - tok = self.lexer.token() - if not tok: - break - print(tok) - -# Build the lexer and try it out -m = MyLexer() -m.build() # Build the lexer -m.test("3 + 4") # Test it -</pre> -</blockquote> - - -When building a lexer from class, <em>you should construct the lexer from -an instance of the class</em>, not the class object itself. This is because -PLY only works properly if the lexer actions are defined by bound-methods. - -<p> -When using the <tt>module</tt> option to <tt>lex()</tt>, PLY collects symbols -from the underlying object using the <tt>dir()</tt> function. There is no -direct access to the <tt>__dict__</tt> attribute of the object supplied as a -module value. </p> - -<P> -Finally, if you want to keep things nicely encapsulated, but don't want to use a -full-fledged class definition, lexers can be defined using closures. For example: - -<blockquote> -<pre> -import ply.lex as lex - -# List of token names. 
This is always required -tokens = ( - 'NUMBER', - 'PLUS', - 'MINUS', - 'TIMES', - 'DIVIDE', - 'LPAREN', - 'RPAREN', -) - -def MyLexer(): - # Regular expression rules for simple tokens - t_PLUS = r'\+' - t_MINUS = r'-' - t_TIMES = r'\*' - t_DIVIDE = r'/' - t_LPAREN = r'\(' - t_RPAREN = r'\)' - - # A regular expression rule with some action code - def t_NUMBER(t): - r'\d+' - t.value = int(t.value) - return t - - # Define a rule so we can track line numbers - def t_newline(t): - r'\n+' - t.lexer.lineno += len(t.value) - - # A string containing ignored characters (spaces and tabs) - t_ignore = ' \t' - - # Error handling rule - def t_error(t): - print("Illegal character '%s'" % t.value[0]) - t.lexer.skip(1) - - # Build the lexer from my environment and return it - return lex.lex() -</pre> -</blockquote> - -<p> -<b>Important note:</b> If you are defining a lexer using a class or closure, be aware that PLY still requires you to only -define a single lexer per module (source file). There are extensive validation/error checking parts of the PLY that -may falsely report error messages if you don't follow this rule. -</p> - -<H3><a name="ply_nn18"></a>4.16 Maintaining state</H3> - - -In your lexer, you may want to maintain a variety of state -information. This might include mode settings, symbol tables, and -other details. As an example, suppose that you wanted to keep -track of how many NUMBER tokens had been encountered. - -<p> -One way to do this is to keep a set of global variables in the module -where you created the lexer. For example: - -<blockquote> -<pre> -num_count = 0 -def t_NUMBER(t): - r'\d+' - global num_count - num_count += 1 - t.value = int(t.value) - return t -</pre> -</blockquote> - -If you don't like the use of a global variable, another place to store -information is inside the Lexer object created by <tt>lex()</tt>. -To this, you can use the <tt>lexer</tt> attribute of tokens passed to -the various rules. 
For example: - -<blockquote> -<pre> -def t_NUMBER(t): - r'\d+' - t.lexer.num_count += 1 # Note use of lexer attribute - t.value = int(t.value) - return t - -lexer = lex.lex() -lexer.num_count = 0 # Set the initial count -</pre> -</blockquote> - -This latter approach has the advantage of being simple and working -correctly in applications where multiple instantiations of a given -lexer exist in the same application. However, this might also feel -like a gross violation of encapsulation to OO purists. -Just to put your mind at some ease, all -internal attributes of the lexer (with the exception of <tt>lineno</tt>) have names that are prefixed -by <tt>lex</tt> (e.g., <tt>lexdata</tt>,<tt>lexpos</tt>, etc.). Thus, -it is perfectly safe to store attributes in the lexer that -don't have names starting with that prefix or a name that conflicts with one of the -predefined methods (e.g., <tt>input()</tt>, <tt>token()</tt>, etc.). - -<p> -If you don't like assigning values on the lexer object, you can define your lexer as a class as -shown in the previous section: - -<blockquote> -<pre> -class MyLexer: - ... - def t_NUMBER(self,t): - r'\d+' - self.num_count += 1 - t.value = int(t.value) - return t - - def build(self, **kwargs): - self.lexer = lex.lex(object=self,**kwargs) - - def __init__(self): - self.num_count = 0 -</pre> -</blockquote> - -The class approach may be the easiest to manage if your application is -going to be creating multiple instances of the same lexer and you need -to manage a lot of state. - -<p> -State can also be managed through closures. For example, in Python 3: - -<blockquote> -<pre> -def MyLexer(): - num_count = 0 - ... - def t_NUMBER(t): - r'\d+' - nonlocal num_count - num_count += 1 - t.value = int(t.value) - return t - ... -</pre> -</blockquote> - -<H3><a name="ply_nn19"></a>4.17 Lexer cloning</H3> - - -<p> -If necessary, a lexer object can be duplicated by invoking its <tt>clone()</tt> method. 
For example: - -<blockquote> -<pre> -lexer = lex.lex() -... -newlexer = lexer.clone() -</pre> -</blockquote> - -When a lexer is cloned, the copy is exactly identical to the original lexer -including any input text and internal state. However, the clone allows a -different set of input text to be supplied which may be processed separately. -This may be useful in situations when you are writing a parser/compiler that -involves recursive or reentrant processing. For instance, if you -needed to scan ahead in the input for some reason, you could create a -clone and use it to look ahead. Or, if you were implementing some kind of preprocessor, -cloned lexers could be used to handle different input files. - -<p> -Creating a clone is different than calling <tt>lex.lex()</tt> in that -PLY doesn't regenerate any of the internal tables or regular expressions. - -<p> -Special considerations need to be made when cloning lexers that also -maintain their own internal state using classes or closures. Namely, -you need to be aware that the newly created lexers will share all of -this state with the original lexer. For example, if you defined a -lexer as a class and did this: - -<blockquote> -<pre> -m = MyLexer() -a = lex.lex(object=m) # Create a lexer - -b = a.clone() # Clone the lexer -</pre> -</blockquote> - -Then both <tt>a</tt> and <tt>b</tt> are going to be bound to the same -object <tt>m</tt> and any changes to <tt>m</tt> will be reflected in both lexers. It's -important to emphasize that <tt>clone()</tt> is only meant to create a new lexer -that reuses the regular expressions and environment of another lexer. If you -need to make a totally new copy of a lexer, then call <tt>lex()</tt> again. - -<H3><a name="ply_nn20"></a>4.18 Internal lexer state</H3> - - -A Lexer object <tt>lexer</tt> has a number of internal attributes that may be useful in certain -situations. 
- -<p> -<tt>lexer.lexpos</tt> -<blockquote> -This attribute is an integer that contains the current position within the input text. If you modify -the value, it will change the result of the next call to <tt>token()</tt>. Within token rule functions, this points -to the first character <em>after</em> the matched text. If the value is modified within a rule, the next returned token will be -matched at the new position. -</blockquote> - -<p> -<tt>lexer.lineno</tt> -<blockquote> -The current value of the line number attribute stored in the lexer. PLY only specifies that the attribute -exists---it never sets, updates, or performs any processing with it. If you want to track line numbers, -you will need to add code yourself (see the section on line numbers and positional information). -</blockquote> - -<p> -<tt>lexer.lexdata</tt> -<blockquote> -The current input text stored in the lexer. This is the string passed with the <tt>input()</tt> method. It -would probably be a bad idea to modify this unless you really know what you're doing. -</blockquote> - -<P> -<tt>lexer.lexmatch</tt> -<blockquote> -This is the raw <tt>Match</tt> object returned by the Python <tt>re.match()</tt> function (used internally by PLY) for the -current token. If you have written a regular expression that contains named groups, you can use this to retrieve those values. -Note: This attribute is only updated when tokens are defined and processed by functions. -</blockquote> - -<H3><a name="ply_nn21"></a>4.19 Conditional lexing and start conditions</H3> - - -In advanced parsing applications, it may be useful to have different -lexing states. For instance, you may want the occurrence of a certain -token or syntactic construct to trigger a different kind of lexing. -PLY supports a feature that allows the underlying lexer to be put into -a series of different states. Each state can have its own tokens, -lexing rules, and so forth. 
The implementation is based largely on -the "start condition" feature of GNU flex. Details of this can be found -at <a -href="http://flex.sourceforge.net/manual/Start-Conditions.html">http://flex.sourceforge.net/manual/Start-Conditions.html</a>. - -<p> -To define a new lexing state, it must first be declared. This is done by including a "states" declaration in your -lex file. For example: - -<blockquote> -<pre> -states = ( - ('foo','exclusive'), - ('bar','inclusive'), -) -</pre> -</blockquote> - -This declaration declares two states, <tt>'foo'</tt> -and <tt>'bar'</tt>. States may be of two types: <tt>'exclusive'</tt> -and <tt>'inclusive'</tt>. An exclusive state completely overrides the -default behavior of the lexer. That is, lex will only return tokens -and apply rules defined specifically for that state. An inclusive -state adds additional tokens and rules to the default set of rules. -Thus, lex will return the tokens defined by default in addition -to those defined for the inclusive state. - -<p> -Once a state has been declared, tokens and rules are declared by including the -state name in the token/rule declaration. For example: - -<blockquote> -<pre> -t_foo_NUMBER = r'\d+' # Token 'NUMBER' in state 'foo' -t_bar_ID = r'[a-zA-Z_][a-zA-Z0-9_]*' # Token 'ID' in state 'bar' - -def t_foo_newline(t): - r'\n' - t.lexer.lineno += 1 -</pre> -</blockquote> - -A token can be declared in multiple states by including multiple state names in the declaration. For example: - -<blockquote> -<pre> -t_foo_bar_NUMBER = r'\d+' # Defines token 'NUMBER' in both state 'foo' and 'bar' -</pre> -</blockquote> - -Alternatively, a token can be declared in all states by using 'ANY' in the name. - -<blockquote> -<pre> -t_ANY_NUMBER = r'\d+' # Defines a token 'NUMBER' in all states -</pre> -</blockquote> - -If no state name is supplied, as is normally the case, the token is associated with a special state <tt>'INITIAL'</tt>. 
For example, -these two declarations are identical: - -<blockquote> -<pre> -t_NUMBER = r'\d+' -t_INITIAL_NUMBER = r'\d+' -</pre> -</blockquote> - -<p> -States are also associated with the special <tt>t_ignore</tt>, <tt>t_error()</tt>, and <tt>t_eof()</tt> declarations. For example, if a state treats -these differently, you can declare:</p> - -<blockquote> -<pre> -t_foo_ignore = " \t\n" # Ignored characters for state 'foo' - -def t_bar_error(t): # Special error handler for state 'bar' - pass -</pre> -</blockquote> - -By default, lexing operates in the <tt>'INITIAL'</tt> state. This state includes all of the normally defined tokens. -For users who aren't using different states, this fact is completely transparent. If, during lexing or parsing, you want to change -the lexing state, use the <tt>begin()</tt> method. For example: - -<blockquote> -<pre> -def t_begin_foo(t): - r'start_foo' - t.lexer.begin('foo') # Starts 'foo' state -</pre> -</blockquote> - -To get out of a state, you use <tt>begin()</tt> to switch back to the initial state. For example: - -<blockquote> -<pre> -def t_foo_end(t): - r'end_foo' - t.lexer.begin('INITIAL') # Back to the initial state -</pre> -</blockquote> - -The management of states can also be done with a stack. For example: - -<blockquote> -<pre> -def t_begin_foo(t): - r'start_foo' - t.lexer.push_state('foo') # Starts 'foo' state - -def t_foo_end(t): - r'end_foo' - t.lexer.pop_state() # Back to the previous state -</pre> -</blockquote> - -<p> -The use of a stack would be useful in situations where there are many ways of entering a new lexing state and you merely want to go back -to the previous state afterwards. - -<P> -An example might help clarify. Suppose you were writing a parser and you wanted to grab sections of arbitrary C code enclosed by -curly braces. That is, whenever you encounter a starting brace '{', you want to read all of the enclosed code up to the ending brace '}' -and return it as a string. 
Doing this with a normal regular expression rule is nearly (if not actually) impossible. This is because braces can -be nested and can be included in comments and strings. Thus, simply matching up to the first matching '}' character isn't good enough. Here is how -you might use lexer states to do this: - -<blockquote> -<pre> -# Declare the state -states = ( - ('ccode','exclusive'), -) - -# Match the first {. Enter ccode state. -def t_ccode(t): - r'\{' - t.lexer.code_start = t.lexer.lexpos # Record the starting position - t.lexer.level = 1 # Initial brace level - t.lexer.begin('ccode') # Enter 'ccode' state - -# Rules for the ccode state -def t_ccode_lbrace(t): - r'\{' - t.lexer.level +=1 - -def t_ccode_rbrace(t): - r'\}' - t.lexer.level -=1 - - # If closing brace, return the code fragment - if t.lexer.level == 0: - t.value = t.lexer.lexdata[t.lexer.code_start:t.lexer.lexpos+1] - t.type = "CCODE" - t.lexer.lineno += t.value.count('\n') - t.lexer.begin('INITIAL') - return t - -# C or C++ comment (ignore) -def t_ccode_comment(t): - r'(/\*(.|\n)*?\*/)|(//.*)' - pass - -# C string -def t_ccode_string(t): - r'\"([^\\\n]|(\\.))*?\"' - -# C character literal -def t_ccode_char(t): - r'\'([^\\\n]|(\\.))*?\'' - -# Any sequence of non-whitespace characters (not braces, strings) -def t_ccode_nonspace(t): - r'[^\s\{\}\'\"]+' - -# Ignored characters (whitespace) -t_ccode_ignore = " \t\n" - -# For bad characters, we just skip over it -def t_ccode_error(t): - t.lexer.skip(1) -</pre> -</blockquote> - -In this example, the occurrence of the first '{' causes the lexer to record the starting position and enter a new state <tt>'ccode'</tt>. A collection of rules then match -various parts of the input that follow (comments, strings, etc.). All of these rules merely discard the token (by not returning a value). 
-However, if the closing right brace is encountered, the rule <tt>t_ccode_rbrace</tt> collects all of the code (using the earlier recorded starting -position), stores it, and returns a token 'CCODE' containing all of that text. When returning the token, the lexing state is restored back to its -initial state. - -<H3><a name="ply_nn21b"></a>4.20 Miscellaneous Issues</H3> - - -<P> -<li>The lexer requires input to be supplied as a single input string. Since most machines have more than enough memory, this -rarely presents a performance concern. However, it means that the lexer currently can't be used with streaming data -such as open files or sockets. This limitation is primarily a side-effect of using the <tt>re</tt> module. You might be -able to work around this by implementing an appropriate <tt>def t_eof()</tt> end-of-file handling rule. The main complication -here is that you'll probably need to ensure that data is fed to the lexer in a way so that the input isn't split in the middle -of a token.</p> - -<p> -<li>The lexer should work properly with Unicode strings, both in token and pattern matching rules and -in input text. - -<p> -<li>If you need to supply optional flags to the re.compile() function, use the reflags option to lex. For example: - -<blockquote> -<pre> -lex.lex(reflags=re.UNICODE | re.VERBOSE) -</pre> -</blockquote> - -Note: by default, <tt>reflags</tt> is set to <tt>re.VERBOSE</tt>. If you provide -your own flags, you may need to include this for PLY to preserve its normal behavior. - -<p> -<li>Since the lexer is written entirely in Python, its performance is -largely determined by that of the Python <tt>re</tt> module. Although -the lexer has been written to be as efficient as possible, it's not -blazingly fast when used on very large input files. If -performance is a concern, you might consider upgrading to the most -recent version of Python, creating a hand-written lexer, or offloading -the lexer into a C extension module. 
- -<p> -If you are going to create a hand-written lexer and you plan to use it with <tt>yacc.py</tt>, -it only needs to conform to the following requirements: - -<ul> -<li>It must provide a <tt>token()</tt> method that returns the next token or <tt>None</tt> if no more -tokens are available. -<li>The <tt>token()</tt> method must return an object <tt>tok</tt> that has <tt>type</tt> and <tt>value</tt> attributes. If -line number tracking is being used, then the token should also define a <tt>lineno</tt> attribute. -</ul> - -<H2><a name="ply_nn22"></a>5. Parsing basics</H2> - - -<tt>yacc.py</tt> is used to parse language syntax. Before showing an -example, there are a few important bits of background that must be -mentioned. First, <em>syntax</em> is usually specified in terms of a BNF grammar. -For example, if you wanted to parse -simple arithmetic expressions, you might first write an unambiguous -grammar specification like this: - -<blockquote> -<pre> -expression : expression + term - | expression - term - | term - -term : term * factor - | term / factor - | factor - -factor : NUMBER - | ( expression ) -</pre> -</blockquote> - -In the grammar, symbols such as <tt>NUMBER</tt>, <tt>+</tt>, <tt>-</tt>, <tt>*</tt>, and <tt>/</tt> are known -as <em>terminals</em> and correspond to raw input tokens. Identifiers such as <tt>term</tt> and <tt>factor</tt> refer to -grammar rules comprised of a collection of terminals and other rules. These identifiers are known as <em>non-terminals</em>. -<P> - -The semantic behavior of a language is often specified using a -technique known as syntax directed translation. In syntax directed -translation, attributes are attached to each symbol in a given grammar -rule along with an action. Whenever a particular grammar rule is -recognized, the action describes what to do. 
For example, given the -expression grammar above, you might write the specification for a -simple calculator like this: - -<blockquote> -<pre> -Grammar Action --------------------------------- -------------------------------------------- -expression0 : expression1 + term expression0.val = expression1.val + term.val - | expression1 - term expression0.val = expression1.val - term.val - | term expression0.val = term.val - -term0 : term1 * factor term0.val = term1.val * factor.val - | term1 / factor term0.val = term1.val / factor.val - | factor term0.val = factor.val - -factor : NUMBER factor.val = int(NUMBER.lexval) - | ( expression ) factor.val = expression.val -</pre> -</blockquote> - -A good way to think about syntax directed translation is to -view each symbol in the grammar as a kind of object. Associated -with each symbol is a value representing its "state" (for example, the -<tt>val</tt> attribute above). Semantic -actions are then expressed as a collection of functions or methods -that operate on the symbols and associated values. - -<p> -Yacc uses a parsing technique known as LR-parsing or shift-reduce parsing. LR parsing is a -bottom up technique that tries to recognize the right-hand-side of various grammar rules. -Whenever a valid right-hand-side is found in the input, the appropriate action code is triggered and the -grammar symbols are replaced by the grammar symbol on the left-hand-side. - -<p> -LR parsing is commonly implemented by shifting grammar symbols onto a -stack and looking at the stack and the next input token for patterns that -match one of the grammar rules. -The details of the algorithm can be found in a compiler textbook, but the -following example illustrates the steps that are performed if you -wanted to parse the expression -<tt>3 + 5 * (10 - 20)</tt> using the grammar defined above. In the example, -the special symbol <tt>$</tt> represents the end of input. 
- - -<blockquote> -<pre> -Step Symbol Stack Input Tokens Action ----- --------------------- --------------------- ------------------------------- -1 3 + 5 * ( 10 - 20 )$ Shift 3 -2 3 + 5 * ( 10 - 20 )$ Reduce factor : NUMBER -3 factor + 5 * ( 10 - 20 )$ Reduce term : factor -4 term + 5 * ( 10 - 20 )$ Reduce expr : term -5 expr + 5 * ( 10 - 20 )$ Shift + -6 expr + 5 * ( 10 - 20 )$ Shift 5 -7 expr + 5 * ( 10 - 20 )$ Reduce factor : NUMBER -8 expr + factor * ( 10 - 20 )$ Reduce term : factor -9 expr + term * ( 10 - 20 )$ Shift * -10 expr + term * ( 10 - 20 )$ Shift ( -11 expr + term * ( 10 - 20 )$ Shift 10 -12 expr + term * ( 10 - 20 )$ Reduce factor : NUMBER -13 expr + term * ( factor - 20 )$ Reduce term : factor -14 expr + term * ( term - 20 )$ Reduce expr : term -15 expr + term * ( expr - 20 )$ Shift - -16 expr + term * ( expr - 20 )$ Shift 20 -17 expr + term * ( expr - 20 )$ Reduce factor : NUMBER -18 expr + term * ( expr - factor )$ Reduce term : factor -19 expr + term * ( expr - term )$ Reduce expr : expr - term -20 expr + term * ( expr )$ Shift ) -21 expr + term * ( expr ) $ Reduce factor : (expr) -22 expr + term * factor $ Reduce term : term * factor -23 expr + term $ Reduce expr : expr + term -24 expr $ Reduce expr -25 $ Success! -</pre> -</blockquote> - -When parsing the expression, an underlying state machine and the -current input token determine what happens next. If the next token -looks like part of a valid grammar rule (based on other items on the -stack), it is generally shifted onto the stack. If the top of the -stack contains a valid right-hand-side of a grammar rule, it is -usually "reduced" and the symbols replaced with the symbol on the -left-hand-side. When this reduction occurs, the appropriate action is -triggered (if defined). If the input token can't be shifted and the -top of stack doesn't match any grammar rules, a syntax error has -occurred and the parser must take some kind of recovery step (or bail -out). 
A parse is only successful if the parser reaches a state where -the symbol stack is empty and there are no more input tokens. - -<p> -It is important to note that the underlying implementation is built -around a large finite-state machine that is encoded in a collection of -tables. The construction of these tables is non-trivial and -beyond the scope of this discussion. However, subtle details of this -process explain why, in the example above, the parser chooses to shift -a token onto the stack in step 9 rather than reducing the -rule <tt>expr : expr + term</tt>. - -<H2><a name="ply_nn23"></a>6. Yacc</H2> - - -The <tt>ply.yacc</tt> module implements the parsing component of PLY. -The name "yacc" stands for "Yet Another Compiler Compiler" and is -borrowed from the Unix tool of the same name. - -<H3><a name="ply_nn24"></a>6.1 An example</H3> - - -Suppose you wanted to make a grammar for simple arithmetic expressions as previously described. Here is -how you would do it with <tt>yacc.py</tt>: - -<blockquote> -<pre> -# Yacc example - -import ply.yacc as yacc - -# Get the token map from the lexer. This is required. 
-from calclex import tokens - -def p_expression_plus(p): - 'expression : expression PLUS term' - p[0] = p[1] + p[3] - -def p_expression_minus(p): - 'expression : expression MINUS term' - p[0] = p[1] - p[3] - -def p_expression_term(p): - 'expression : term' - p[0] = p[1] - -def p_term_times(p): - 'term : term TIMES factor' - p[0] = p[1] * p[3] - -def p_term_div(p): - 'term : term DIVIDE factor' - p[0] = p[1] / p[3] - -def p_term_factor(p): - 'term : factor' - p[0] = p[1] - -def p_factor_num(p): - 'factor : NUMBER' - p[0] = p[1] - -def p_factor_expr(p): - 'factor : LPAREN expression RPAREN' - p[0] = p[2] - -# Error rule for syntax errors -def p_error(p): - print("Syntax error in input!") - -# Build the parser -parser = yacc.yacc() - -while True: - try: - s = raw_input('calc > ') - except EOFError: - break - if not s: continue - result = parser.parse(s) - print(result) -</pre> -</blockquote> - -In this example, each grammar rule is defined by a Python function -where the docstring to that function contains the appropriate -context-free grammar specification. The statements that make up the -function body implement the semantic actions of the rule. Each function -accepts a single argument <tt>p</tt> that is a sequence containing the -values of each grammar symbol in the corresponding rule. The values -of <tt>p[i]</tt> are mapped to grammar symbols as shown here: - -<blockquote> -<pre> -def p_expression_plus(p): - 'expression : expression PLUS term' - # ^ ^ ^ ^ - # p[0] p[1] p[2] p[3] - - p[0] = p[1] + p[3] -</pre> -</blockquote> - -<p> -For tokens, the "value" of the corresponding <tt>p[i]</tt> is the -<em>same</em> as the <tt>p.value</tt> attribute assigned in the lexer -module. For non-terminals, the value is determined by whatever is -placed in <tt>p[0]</tt> when rules are reduced. This value can be -anything at all. However, it probably most common for the value to be -a simple Python type, a tuple, or an instance. 
In this example, we -are relying on the fact that the <tt>NUMBER</tt> token stores an -integer value in its value field. All of the other rules simply -perform various types of integer operations and propagate the result. -</p> - -<p> -Note: The use of negative indices has a special meaning in -yacc---specifically, <tt>p[-1]</tt> does not have the same value -as <tt>p[3]</tt> in this example. Please see the section on "Embedded -Actions" for further details. -</p> - -<p> -The first rule defined in the yacc specification determines the -starting grammar symbol (in this case, a rule for <tt>expression</tt> -appears first). Whenever the starting rule is reduced by the parser -and no more input is available, parsing stops and the final value is -returned (this value will be whatever the top-most rule placed -in <tt>p[0]</tt>). Note: an alternative starting symbol can be -specified using the <tt>start</tt> keyword argument to -<tt>yacc()</tt>. - -<p>The <tt>p_error(p)</tt> rule is defined to catch syntax errors. -See the error handling section below for more detail. - -<p> -To build the parser, call the <tt>yacc.yacc()</tt> function. This -function looks at the module and attempts to construct all of the LR -parsing tables for the grammar you have specified. The first -time <tt>yacc.yacc()</tt> is invoked, you will get a message such as -this: - -<blockquote> -<pre> -$ python calcparse.py -Generating LALR tables -calc > -</pre> -</blockquote> - -<p> -Since table construction is relatively expensive (especially for large -grammars), the resulting parsing table is written to -a file called <tt>parsetab.py</tt>. In addition, a -debugging file called <tt>parser.out</tt> is created. On subsequent -executions, <tt>yacc</tt> will reload the table from -<tt>parsetab.py</tt> unless it has detected a change in the underlying -grammar (in which case the tables and <tt>parsetab.py</tt> file are -regenerated). 
Both of these files are written to the same directory -as the module in which the parser is specified. -The name of the <tt>parsetab</tt> module can be changed using the -<tt>tabmodule</tt> keyword argument to <tt>yacc()</tt>. For example: -</p> - -<blockquote> -<pre> -parser = yacc.yacc(tabmodule='fooparsetab') -</pre> -</blockquote> - -<p> -If any errors are detected in your grammar specification, <tt>yacc.py</tt> will produce -diagnostic messages and possibly raise an exception. Some of the errors that can be detected include: - -<ul> -<li>Duplicated function names (if more than one rule function has the same name in the grammar file). -<li>Shift/reduce and reduce/reduce conflicts generated by ambiguous grammars. -<li>Badly specified grammar rules. -<li>Infinite recursion (rules that can never terminate). -<li>Unused rules and tokens -<li>Undefined rules and tokens -</ul> - -The next few sections discuss grammar specification in more detail. - -<p> -The final part of the example shows how to actually run the parser -created by -<tt>yacc()</tt>. To run the parser, you simply have to call -the <tt>parse()</tt> method with a string of input text. This will run all -of the grammar rules and return the result of the entire parse. The -result returned is the value assigned to <tt>p[0]</tt> in the starting -grammar rule. - -<H3><a name="ply_nn25"></a>6.2 Combining Grammar Rule Functions</H3> - - -When grammar rules are similar, they can be combined into a single function. 
-For example, consider the two rules in our earlier example: - -<blockquote> -<pre> -def p_expression_plus(p): - 'expression : expression PLUS term' - p[0] = p[1] + p[3] - -def p_expression_minus(p): - 'expression : expression MINUS term' - p[0] = p[1] - p[3] -</pre> -</blockquote> - -Instead of writing two functions, you might write a single function like this: - -<blockquote> -<pre> -def p_expression(p): - '''expression : expression PLUS term - | expression MINUS term''' - if p[2] == '+': - p[0] = p[1] + p[3] - elif p[2] == '-': - p[0] = p[1] - p[3] -</pre> -</blockquote> - -In general, the doc string for any given function can contain multiple grammar rules. So, it would -have also been legal (although possibly confusing) to write this: - -<blockquote> -<pre> -def p_binary_operators(p): - '''expression : expression PLUS term - | expression MINUS term - term : term TIMES factor - | term DIVIDE factor''' - if p[2] == '+': - p[0] = p[1] + p[3] - elif p[2] == '-': - p[0] = p[1] - p[3] - elif p[2] == '*': - p[0] = p[1] * p[3] - elif p[2] == '/': - p[0] = p[1] / p[3] -</pre> -</blockquote> - -When combining grammar rules into a single function, it is usually a good idea for all of the rules to have -a similar structure (e.g., the same number of terms). Otherwise, the corresponding action code may be more -complicated than necessary. However, it is possible to handle simple cases using len(). For example: - -<blockquote> -<pre> -def p_expressions(p): - '''expression : expression MINUS expression - | MINUS expression''' - if (len(p) == 4): - p[0] = p[1] - p[3] - elif (len(p) == 3): - p[0] = -p[2] -</pre> -</blockquote> - -If parsing performance is a concern, you should resist the urge to put -too much conditional processing into a single grammar rule as shown in -these examples.
When you add checks to see which grammar rule is -being handled, you are actually duplicating the work that the parser -has already performed (i.e., the parser already knows exactly what rule it -matched). You can eliminate this overhead by using a -separate <tt>p_rule()</tt> function for each grammar rule. - -<H3><a name="ply_nn26"></a>6.3 Character Literals</H3> - - -If desired, a grammar may contain tokens defined as single character literals. For example: - -<blockquote> -<pre> -def p_binary_operators(p): - '''expression : expression '+' term - | expression '-' term - term : term '*' factor - | term '/' factor''' - if p[2] == '+': - p[0] = p[1] + p[3] - elif p[2] == '-': - p[0] = p[1] - p[3] - elif p[2] == '*': - p[0] = p[1] * p[3] - elif p[2] == '/': - p[0] = p[1] / p[3] -</pre> -</blockquote> - -A character literal must be enclosed in quotes such as <tt>'+'</tt>. In addition, if literals are used, they must be declared in the -corresponding <tt>lex</tt> file through the use of a special <tt>literals</tt> declaration. - -<blockquote> -<pre> -# Literals. Should be placed in module given to lex() -literals = ['+','-','*','/' ] -</pre> -</blockquote> - -<b>Character literals are limited to a single character</b>. Thus, it is not legal to specify literals such as <tt>'<='</tt> or <tt>'=='</tt>. For this, use -the normal lexing rules (e.g., define a rule such as <tt>t_EQ = r'=='</tt>). - -<H3><a name="ply_nn26b"></a>6.4 Empty Productions</H3> - - -<tt>yacc.py</tt> can handle empty productions by defining a rule like this: - -<blockquote> -<pre> -def p_empty(p): - 'empty :' - pass -</pre> -</blockquote> - -Now to use the empty production, simply use 'empty' as a symbol. For example: - -<blockquote> -<pre> -def p_optitem(p): - '''optitem : item - | empty''' - ... -</pre> -</blockquote> - -Note: You can write empty rules anywhere by simply specifying an empty -right hand side.
However, I personally find that writing an "empty" -rule and using "empty" to denote an empty production is easier to read -and more clearly states your intentions. - -<H3><a name="ply_nn28"></a>6.5 Changing the starting symbol</H3> - - -Normally, the first rule found in a yacc specification defines the starting grammar rule (top level rule). To change this, simply -supply a <tt>start</tt> specifier in your file. For example: - -<blockquote> -<pre> -start = 'foo' - -def p_bar(p): - 'bar : A B' - -# This is the starting rule due to the start specifier above -def p_foo(p): - 'foo : bar X' -... -</pre> -</blockquote> - -The use of a <tt>start</tt> specifier may be useful during debugging -since you can use it to have yacc build a subset of a larger grammar. -For this purpose, it is also possible to specify a starting symbol as -an argument to <tt>yacc()</tt>. For example: - -<blockquote> -<pre> -parser = yacc.yacc(start='foo') -</pre> -</blockquote> - -<H3><a name="ply_nn27"></a>6.6 Dealing With Ambiguous Grammars</H3> - - -The expression grammar given in the earlier example has been written -in a special format to eliminate ambiguity. However, in many -situations, it is extremely difficult or awkward to write grammars in -this format. A much more natural way to express the grammar is in a -more compact form like this: - -<blockquote> -<pre> -expression : expression PLUS expression - | expression MINUS expression - | expression TIMES expression - | expression DIVIDE expression - | LPAREN expression RPAREN - | NUMBER -</pre> -</blockquote> - -Unfortunately, this grammar specification is ambiguous. For example, -if you are parsing the string "3 * 4 + 5", there is no way to tell how -the operators are supposed to be grouped. For example, does the -expression mean "(3 * 4) + 5" or is it "3 * (4+5)"? - -<p> -When an ambiguous grammar is given to <tt>yacc.py</tt> it will print -messages about "shift/reduce conflicts" or "reduce/reduce conflicts". 
-A shift/reduce conflict is caused when the parser generator can't -decide whether or not to reduce a rule or shift a symbol on the -parsing stack. For example, consider the string "3 * 4 + 5" and the -internal parsing stack: - -<blockquote> -<pre> -Step Symbol Stack Input Tokens Action ----- --------------------- --------------------- ------------------------------- -1 $ 3 * 4 + 5$ Shift 3 -2 $ 3 * 4 + 5$ Reduce : expression : NUMBER -3 $ expr * 4 + 5$ Shift * -4 $ expr * 4 + 5$ Shift 4 -5 $ expr * 4 + 5$ Reduce: expression : NUMBER -6 $ expr * expr + 5$ SHIFT/REDUCE CONFLICT ???? -</pre> -</blockquote> - -In this case, when the parser reaches step 6, it has two options. One -is to reduce the rule <tt>expr : expr * expr</tt> on the stack. The -other option is to shift the token <tt>+</tt> on the stack. Both -options are perfectly legal from the rules of the -context-free-grammar. - -<p> -By default, all shift/reduce conflicts are resolved in favor of -shifting. Therefore, in the above example, the parser will always -shift the <tt>+</tt> instead of reducing. Although this strategy -works in many cases (for example, the case of -"if-then" versus "if-then-else"), it is not enough for arithmetic expressions. In fact, -in the above example, the decision to shift <tt>+</tt> is completely -wrong---we should have reduced <tt>expr * expr</tt> since -multiplication has higher mathematical precedence than addition. - -<p>To resolve ambiguity, especially in expression -grammars, <tt>yacc.py</tt> allows individual tokens to be assigned a -precedence level and associativity. 
This is done by adding a variable -<tt>precedence</tt> to the grammar file like this: - -<blockquote> -<pre> -precedence = ( - ('left', 'PLUS', 'MINUS'), - ('left', 'TIMES', 'DIVIDE'), -) -</pre> -</blockquote> - -This declaration specifies that <tt>PLUS</tt>/<tt>MINUS</tt> have the -same precedence level and are left-associative and that -<tt>TIMES</tt>/<tt>DIVIDE</tt> have the same precedence and are -left-associative. Within the <tt>precedence</tt> declaration, tokens -are ordered from lowest to highest precedence. Thus, this declaration -specifies that <tt>TIMES</tt>/<tt>DIVIDE</tt> have higher precedence -than <tt>PLUS</tt>/<tt>MINUS</tt> (since they appear later in the -precedence specification). - -<p> -The precedence specification works by associating a numerical -precedence level value and associativity direction to the listed -tokens. For example, in the above example you get: - -<blockquote> -<pre> -PLUS : level = 1, assoc = 'left' -MINUS : level = 1, assoc = 'left' -TIMES : level = 2, assoc = 'left' -DIVIDE : level = 2, assoc = 'left' -</pre> -</blockquote> - -These values are then used to attach a numerical precedence value and -associativity direction to each grammar rule. <em>This is always -determined by looking at the precedence of the right-most terminal -symbol.</em> For example: - -<blockquote> -<pre> -expression : expression PLUS expression # level = 1, left - | expression MINUS expression # level = 1, left - | expression TIMES expression # level = 2, left - | expression DIVIDE expression # level = 2, left - | LPAREN expression RPAREN # level = None (not specified) - | NUMBER # level = None (not specified) -</pre> -</blockquote> - -When shift/reduce conflicts are encountered, the parser generator resolves the conflict by -looking at the precedence rules and associativity specifiers. - -<p> -<ol> -<li>If the current token has higher precedence than the rule on the stack, it is shifted. 
-<li>If the grammar rule on the stack has higher precedence, the rule is reduced. -<li>If the current token and the grammar rule have the same precedence, the -rule is reduced for left associativity, whereas the token is shifted for right associativity. -<li>If nothing is known about the precedence, shift/reduce conflicts are resolved in -favor of shifting (the default). -</ol> - -For example, if "expression PLUS expression" has been parsed and the -next token is "TIMES", the action is going to be a shift because -"TIMES" has a higher precedence level than "PLUS". On the other hand, -if "expression TIMES expression" has been parsed and the next token is -"PLUS", the action is going to be reduce because "PLUS" has a lower -precedence than "TIMES." - -<p> -When shift/reduce conflicts are resolved using the first three -techniques (with the help of precedence rules), <tt>yacc.py</tt> will -report no errors or conflicts in the grammar (although it will print -some information in the <tt>parser.out</tt> debugging file). - -<p> -One problem with the precedence specifier technique is that it is -sometimes necessary to change the precedence of an operator in certain -contexts. For example, consider a unary-minus operator in "3 + 4 * --5". Mathematically, the unary minus is normally given a very high -precedence--being evaluated before the multiply. However, in our -precedence specifier, MINUS has a lower precedence than TIMES. 
To -deal with this, precedence rules can be given for so-called "fictitious tokens" -like this: - -<blockquote> -<pre> -precedence = ( - ('left', 'PLUS', 'MINUS'), - ('left', 'TIMES', 'DIVIDE'), - ('right', 'UMINUS'), # Unary minus operator -) -</pre> -</blockquote> - -Now, in the grammar file, we can write our unary minus rule like this: - -<blockquote> -<pre> -def p_expr_uminus(p): - 'expression : MINUS expression %prec UMINUS' - p[0] = -p[2] -</pre> -</blockquote> - -In this case, <tt>%prec UMINUS</tt> overrides the default rule precedence--setting it to that -of UMINUS in the precedence specifier. - -<p> -At first, the use of UMINUS in this example may appear very confusing. -UMINUS is not an input token or a grammar rule. Instead, you should -think of it as the name of a special marker in the precedence table. When you use the <tt>%prec</tt> qualifier, you're simply -telling yacc that you want the precedence of the expression to be the same as for this special marker instead of the usual precedence. - -<p> -It is also possible to specify non-associativity in the <tt>precedence</tt> table. This would -be used when you <em>don't</em> want operations to chain together. For example, suppose -you wanted to support comparison operators like <tt><</tt> and <tt>></tt> but you didn't want to allow -combinations like <tt>a < b < c</tt>. To do this, simply specify a rule like this: - -<blockquote> -<pre> -precedence = ( - ('nonassoc', 'LESSTHAN', 'GREATERTHAN'), # Nonassociative operators - ('left', 'PLUS', 'MINUS'), - ('left', 'TIMES', 'DIVIDE'), - ('right', 'UMINUS'), # Unary minus operator -) -</pre> -</blockquote> - -<p> -If you do this, the occurrence of input text such as <tt> a < b < c</tt> will result in a syntax error. However, simple -expressions such as <tt>a < b</tt> will still be fine. - -<p> -Reduce/reduce conflicts are caused when there are multiple grammar -rules that can be applied to a given set of symbols. 
This kind of -conflict is almost always bad and is always resolved by picking the -rule that appears first in the grammar file. Reduce/reduce conflicts -are almost always caused when different sets of grammar rules somehow -generate the same set of symbols. For example: - -<blockquote> -<pre> -assignment : ID EQUALS NUMBER - | ID EQUALS expression - -expression : expression PLUS expression - | expression MINUS expression - | expression TIMES expression - | expression DIVIDE expression - | LPAREN expression RPAREN - | NUMBER -</pre> -</blockquote> - -In this case, a reduce/reduce conflict exists between these two rules: - -<blockquote> -<pre> -assignment : ID EQUALS NUMBER -expression : NUMBER -</pre> -</blockquote> - -For example, if you wrote "a = 5", the parser can't figure out if this -is supposed to be reduced as <tt>assignment : ID EQUALS NUMBER</tt> or -whether it's supposed to reduce the 5 as an expression and then reduce -the rule <tt>assignment : ID EQUALS expression</tt>. - -<p> -It should be noted that reduce/reduce conflicts are notoriously -difficult to spot simply looking at the input grammar. When a -reduce/reduce conflict occurs, <tt>yacc()</tt> will try to help by -printing a warning message such as this: - -<blockquote> -<pre> -WARNING: 1 reduce/reduce conflict -WARNING: reduce/reduce conflict in state 15 resolved using rule (assignment -> ID EQUALS NUMBER) -WARNING: rejected rule (expression -> NUMBER) -</pre> -</blockquote> - -This message identifies the two rules that are in conflict. However, -it may not tell you how the parser arrived at such a state. To try -and figure it out, you'll probably have to look at your grammar and -the contents of the -<tt>parser.out</tt> debugging file with an appropriately high level of -caffeination. - -<H3><a name="ply_nn28b"></a>6.7 The parser.out file</H3> - - -Tracking down shift/reduce and reduce/reduce conflicts is one of the finer pleasures of using an LR -parsing algorithm. 
To assist in debugging, <tt>yacc.py</tt> creates a debugging file called -'parser.out' when it generates the parsing table. The contents of this file look like the following: - -<blockquote> -<pre> -Unused terminals: - - -Grammar - -Rule 1 expression -> expression PLUS expression -Rule 2 expression -> expression MINUS expression -Rule 3 expression -> expression TIMES expression -Rule 4 expression -> expression DIVIDE expression -Rule 5 expression -> NUMBER -Rule 6 expression -> LPAREN expression RPAREN - -Terminals, with rules where they appear - -TIMES : 3 -error : -MINUS : 2 -RPAREN : 6 -LPAREN : 6 -DIVIDE : 4 -PLUS : 1 -NUMBER : 5 - -Nonterminals, with rules where they appear - -expression : 1 1 2 2 3 3 4 4 6 0 - - -Parsing method: LALR - - -state 0 - - S' -> . expression - expression -> . expression PLUS expression - expression -> . expression MINUS expression - expression -> . expression TIMES expression - expression -> . expression DIVIDE expression - expression -> . NUMBER - expression -> . LPAREN expression RPAREN - - NUMBER shift and go to state 3 - LPAREN shift and go to state 2 - - -state 1 - - S' -> expression . - expression -> expression . PLUS expression - expression -> expression . MINUS expression - expression -> expression . TIMES expression - expression -> expression . DIVIDE expression - - PLUS shift and go to state 6 - MINUS shift and go to state 5 - TIMES shift and go to state 4 - DIVIDE shift and go to state 7 - - -state 2 - - expression -> LPAREN . expression RPAREN - expression -> . expression PLUS expression - expression -> . expression MINUS expression - expression -> . expression TIMES expression - expression -> . expression DIVIDE expression - expression -> . NUMBER - expression -> . LPAREN expression RPAREN - - NUMBER shift and go to state 3 - LPAREN shift and go to state 2 - - -state 3 - - expression -> NUMBER . 
- - $ reduce using rule 5 - PLUS reduce using rule 5 - MINUS reduce using rule 5 - TIMES reduce using rule 5 - DIVIDE reduce using rule 5 - RPAREN reduce using rule 5 - - -state 4 - - expression -> expression TIMES . expression - expression -> . expression PLUS expression - expression -> . expression MINUS expression - expression -> . expression TIMES expression - expression -> . expression DIVIDE expression - expression -> . NUMBER - expression -> . LPAREN expression RPAREN - - NUMBER shift and go to state 3 - LPAREN shift and go to state 2 - - -state 5 - - expression -> expression MINUS . expression - expression -> . expression PLUS expression - expression -> . expression MINUS expression - expression -> . expression TIMES expression - expression -> . expression DIVIDE expression - expression -> . NUMBER - expression -> . LPAREN expression RPAREN - - NUMBER shift and go to state 3 - LPAREN shift and go to state 2 - - -state 6 - - expression -> expression PLUS . expression - expression -> . expression PLUS expression - expression -> . expression MINUS expression - expression -> . expression TIMES expression - expression -> . expression DIVIDE expression - expression -> . NUMBER - expression -> . LPAREN expression RPAREN - - NUMBER shift and go to state 3 - LPAREN shift and go to state 2 - - -state 7 - - expression -> expression DIVIDE . expression - expression -> . expression PLUS expression - expression -> . expression MINUS expression - expression -> . expression TIMES expression - expression -> . expression DIVIDE expression - expression -> . NUMBER - expression -> . LPAREN expression RPAREN - - NUMBER shift and go to state 3 - LPAREN shift and go to state 2 - - -state 8 - - expression -> LPAREN expression . RPAREN - expression -> expression . PLUS expression - expression -> expression . MINUS expression - expression -> expression . TIMES expression - expression -> expression . 
DIVIDE expression - - RPAREN shift and go to state 13 - PLUS shift and go to state 6 - MINUS shift and go to state 5 - TIMES shift and go to state 4 - DIVIDE shift and go to state 7 - - -state 9 - - expression -> expression TIMES expression . - expression -> expression . PLUS expression - expression -> expression . MINUS expression - expression -> expression . TIMES expression - expression -> expression . DIVIDE expression - - $ reduce using rule 3 - PLUS reduce using rule 3 - MINUS reduce using rule 3 - TIMES reduce using rule 3 - DIVIDE reduce using rule 3 - RPAREN reduce using rule 3 - - ! PLUS [ shift and go to state 6 ] - ! MINUS [ shift and go to state 5 ] - ! TIMES [ shift and go to state 4 ] - ! DIVIDE [ shift and go to state 7 ] - -state 10 - - expression -> expression MINUS expression . - expression -> expression . PLUS expression - expression -> expression . MINUS expression - expression -> expression . TIMES expression - expression -> expression . DIVIDE expression - - $ reduce using rule 2 - PLUS reduce using rule 2 - MINUS reduce using rule 2 - RPAREN reduce using rule 2 - TIMES shift and go to state 4 - DIVIDE shift and go to state 7 - - ! TIMES [ reduce using rule 2 ] - ! DIVIDE [ reduce using rule 2 ] - ! PLUS [ shift and go to state 6 ] - ! MINUS [ shift and go to state 5 ] - -state 11 - - expression -> expression PLUS expression . - expression -> expression . PLUS expression - expression -> expression . MINUS expression - expression -> expression . TIMES expression - expression -> expression . DIVIDE expression - - $ reduce using rule 1 - PLUS reduce using rule 1 - MINUS reduce using rule 1 - RPAREN reduce using rule 1 - TIMES shift and go to state 4 - DIVIDE shift and go to state 7 - - ! TIMES [ reduce using rule 1 ] - ! DIVIDE [ reduce using rule 1 ] - ! PLUS [ shift and go to state 6 ] - ! MINUS [ shift and go to state 5 ] - -state 12 - - expression -> expression DIVIDE expression . - expression -> expression . 
PLUS expression - expression -> expression . MINUS expression - expression -> expression . TIMES expression - expression -> expression . DIVIDE expression - - $ reduce using rule 4 - PLUS reduce using rule 4 - MINUS reduce using rule 4 - TIMES reduce using rule 4 - DIVIDE reduce using rule 4 - RPAREN reduce using rule 4 - - ! PLUS [ shift and go to state 6 ] - ! MINUS [ shift and go to state 5 ] - ! TIMES [ shift and go to state 4 ] - ! DIVIDE [ shift and go to state 7 ] - -state 13 - - expression -> LPAREN expression RPAREN . - - $ reduce using rule 6 - PLUS reduce using rule 6 - MINUS reduce using rule 6 - TIMES reduce using rule 6 - DIVIDE reduce using rule 6 - RPAREN reduce using rule 6 -</pre> -</blockquote> - -The different states that appear in this file are a representation of -every possible sequence of valid input tokens allowed by the grammar. -When receiving input tokens, the parser is building up a stack and -looking for matching rules. Each state keeps track of the grammar -rules that might be in the process of being matched at that point. Within each -rule, the "." character indicates the current location of the parse -within that rule. In addition, the actions for each valid input token -are listed. When a shift/reduce or reduce/reduce conflict arises, -rules <em>not</em> selected are prefixed with an !. For example: - -<blockquote> -<pre> - ! TIMES [ reduce using rule 2 ] - ! DIVIDE [ reduce using rule 2 ] - ! PLUS [ shift and go to state 6 ] - ! MINUS [ shift and go to state 5 ] -</pre> -</blockquote> - -By looking at these rules (and with a little practice), you can usually track down the source -of most parsing conflicts. It should also be stressed that not all shift-reduce conflicts are -bad. However, the only way to be sure that they are resolved correctly is to look at <tt>parser.out</tt>. 
- -<H3><a name="ply_nn29"></a>6.8 Syntax Error Handling</H3> - - -If you are creating a parser for production use, the handling of -syntax errors is important. As a general rule, you don't want a -parser to simply throw up its hands and stop at the first sign of -trouble. Instead, you want it to report the error, recover if possible, and -continue parsing so that all of the errors in the input get reported -to the user at once. This is the standard behavior found in compilers -for languages such as C, C++, and Java. - -In PLY, when a syntax error occurs during parsing, the error is immediately -detected (i.e., the parser does not read any more tokens beyond the -source of the error). However, at this point, the parser enters a -recovery mode that can be used to try and continue further parsing. -As a general rule, error recovery in LR parsers is a delicate -topic that involves ancient rituals and black-magic. The recovery mechanism -provided by <tt>yacc.py</tt> is comparable to Unix yacc so you may want to -consult a book like O'Reilly's "Lex and Yacc" for some of the finer details. - -<p> -When a syntax error occurs, <tt>yacc.py</tt> performs the following steps: - -<ol> -<li>On the first occurrence of an error, the user-defined <tt>p_error()</tt> function -is called with the offending token as an argument. However, if the syntax error is due to -reaching the end-of-file, <tt>p_error()</tt> is called with an - argument of <tt>None</tt>. -Afterwards, the parser enters -an "error-recovery" mode in which it will not make future calls to <tt>p_error()</tt> until it -has successfully shifted at least 3 tokens onto the parsing stack. - -<p> -<li>If no recovery action is taken in <tt>p_error()</tt>, the offending lookahead token is replaced -with a special <tt>error</tt> token. - -<p> -<li>If the offending lookahead token is already set to <tt>error</tt>, the top item of the parsing stack is -deleted.
- -<p> -<li>If the entire parsing stack is unwound, the parser enters a restart state and attempts to start -parsing from its initial state. - -<p> -<li>If a grammar rule accepts <tt>error</tt> as a token, it will be -shifted onto the parsing stack. - -<p> -<li>If the top item of the parsing stack is <tt>error</tt>, lookahead tokens will be discarded until the -parser can successfully shift a new symbol or reduce a rule involving <tt>error</tt>. -</ol> - -<H4><a name="ply_nn30"></a>6.8.1 Recovery and resynchronization with error rules</H4> - - -The most well-behaved approach for handling syntax errors is to write grammar rules that include the <tt>error</tt> -token. For example, suppose your language had a grammar rule for a print statement like this: - -<blockquote> -<pre> -def p_statement_print(p): - 'statement : PRINT expr SEMI' - ... -</pre> -</blockquote> - -To account for the possibility of a bad expression, you might write an additional grammar rule like this: - -<blockquote> -<pre> -def p_statement_print_error(p): - 'statement : PRINT error SEMI' - print("Syntax error in print statement. Bad expression") - -</pre> -</blockquote> - -In this case, the <tt>error</tt> token will match any sequence of -tokens that might appear up to the first semicolon that is -encountered. Once the semicolon is reached, the rule will be -invoked and the <tt>error</tt> token will go away. - -<p> -This type of recovery is sometimes known as parser resynchronization. -The <tt>error</tt> token acts as a wildcard for any bad input text and -the token immediately following <tt>error</tt> acts as a -synchronization token. - -<p> -It is important to note that the <tt>error</tt> token usually does not appear as the last token -on the right in an error rule. For example: - -<blockquote> -<pre> -def p_statement_print_error(p): - 'statement : PRINT error' - print("Syntax error in print statement. 
Bad expression") -</pre> -</blockquote> - -This is because the first bad token encountered will cause the rule to -be reduced--which may make it difficult to recover if more bad tokens -immediately follow. - -<H4><a name="ply_nn31"></a>6.8.2 Panic mode recovery</H4> - - -An alternative error recovery scheme is to enter a panic mode recovery in which tokens are -discarded to a point where the parser might be able to recover in some sensible manner. - -<p> -Panic mode recovery is implemented entirely in the <tt>p_error()</tt> function. For example, this -function starts discarding tokens until it reaches a closing '}'. Then, it restarts the -parser in its initial state. - -<blockquote> -<pre> -def p_error(p): - print("Whoa. You are seriously hosed.") - if not p: - print("End of File!") - return - - # Read ahead looking for a closing '}' - while True: - tok = parser.token() # Get the next token - if not tok or tok.type == 'RBRACE': - break - parser.restart() -</pre> -</blockquote> - -<p> -This function simply discards the bad token and tells the parser that the error was ok. - -<blockquote> -<pre> -def p_error(p): - if p: - print("Syntax error at token", p.type) - # Just discard the token and tell the parser it's okay. - parser.errok() - else: - print("Syntax error at EOF") -</pre> -</blockquote> - -<P> -More information on these methods is as follows: -</p> - -<p> -<ul> -<li><tt>parser.errok()</tt>. This resets the parser state so it doesn't think it's in error-recovery -mode. This will prevent an <tt>error</tt> token from being generated and will reset the internal -error counters so that the next syntax error will call <tt>p_error()</tt> again. - -<p> -<li><tt>parser.token()</tt>. This returns the next token on the input stream. - -<p> -<li><tt>parser.restart()</tt>. This discards the entire parsing stack and resets the parser -to its initial state. -</ul> - -<p> -To supply the next lookahead token to the parser, <tt>p_error()</tt> can return a token. 
This might be -useful if trying to synchronize on special characters. For example: - -<blockquote> -<pre> -def p_error(p): - # Read ahead looking for a terminating ";" - while True: - tok = parser.token() # Get the next token - if not tok or tok.type == 'SEMI': break - parser.errok() - - # Return SEMI to the parser as the next lookahead token - return tok -</pre> -</blockquote> - -<p> -Keep in mind that in the above error handling functions, -<tt>parser</tt> is an instance of the parser created by -<tt>yacc()</tt>. You'll need to save this instance someplace in your -code so that you can refer to it during error handling. -</p> - -<H4><a name="ply_nn35"></a>6.8.3 Signalling an error from a production</H4> - - -If necessary, a production rule can manually force the parser to enter error recovery. This -is done by raising the <tt>SyntaxError</tt> exception like this: - -<blockquote> -<pre> -def p_production(p): - 'production : some production ...' - raise SyntaxError -</pre> -</blockquote> - -The effect of raising <tt>SyntaxError</tt> is the same as if the last symbol shifted onto the -parsing stack was actually a syntax error. Thus, when you do this, the last symbol shifted is popped off -of the parsing stack and the current lookahead token is set to an <tt>error</tt> token. The parser -then enters error-recovery mode where it tries to reduce rules that can accept <tt>error</tt> tokens. -The steps that follow from this point are exactly the same as if a syntax error were detected and -<tt>p_error()</tt> were called. - -<P> -One important aspect of manually setting an error is that the <tt>p_error()</tt> function will <b>NOT</b> be -called in this case. If you need to issue an error message, make sure you do it in the production that -raises <tt>SyntaxError</tt>. - -<P> -Note: This feature of PLY is meant to mimic the behavior of the YYERROR macro in yacc.
- -<H4><a name="ply_nn38"></a>6.8.4 When Do Syntax Errors Get Reported</H4> - - -<p> -In most cases, yacc will handle errors as soon as a bad input token is -detected on the input. However, be aware that yacc may choose to -delay error handling until after it has reduced one or more grammar -rules first. This behavior might be unexpected, but it's related to -special states in the underlying parsing table known as "defaulted -states." A defaulted state is a parsing condition where the same -grammar rule will be reduced regardless of what <em>valid</em> token -comes next on the input. For such states, yacc chooses to go ahead -and reduce the grammar rule <em>without reading the next input -token</em>. If the next token is bad, yacc will eventually get around to reading it and -report a syntax error. It's just a little unusual in that you might -see some of your grammar rules firing immediately prior to the syntax -error. -</p> - -<p> -Usually, the delayed error reporting with defaulted states is harmless -(and there are other reasons for wanting PLY to behave in this way). -However, if you need to turn this behavior off for some reason, you -can clear the defaulted states table like this: -</p> - -<blockquote> -<pre> -parser = yacc.yacc() -parser.defaulted_states = {} -</pre> -</blockquote> - -<p> -Disabling defaulted states is not recommended if your grammar makes use -of embedded actions as described in Section 6.11.</p> - -<H4><a name="ply_nn32"></a>6.8.5 General comments on error handling</H4> - - -For normal types of languages, error recovery with error rules and resynchronization characters is probably the most reliable -technique. This is because you can instrument the grammar to catch errors at selected places where it is relatively easy -to recover and continue parsing. Panic mode recovery is really only useful in certain specialized applications where you might want -to discard huge portions of the input text to find a valid restart point.
- -<H3><a name="ply_nn33"></a>6.9 Line Number and Position Tracking</H3> - - -Position tracking is often a tricky problem when writing compilers. -By default, PLY tracks the line number and position of all tokens. -This information is available using the following functions: - -<ul> -<li><tt>p.lineno(num)</tt>. Return the line number for symbol <em>num</em> -<li><tt>p.lexpos(num)</tt>. Return the lexing position for symbol <em>num</em> -</ul> - -For example: - -<blockquote> -<pre> -def p_expression(p): - 'expression : expression PLUS expression' - line = p.lineno(2) # line number of the PLUS token - index = p.lexpos(2) # Position of the PLUS token -</pre> -</blockquote> - -As an optional feature, <tt>yacc.py</tt> can automatically track line -numbers and positions for all of the grammar symbols as well. -However, this extra tracking requires extra processing and can -significantly slow down parsing. Therefore, it must be enabled by -passing the -<tt>tracking=True</tt> option to <tt>yacc.parse()</tt>. For example: - -<blockquote> -<pre> -yacc.parse(data,tracking=True) -</pre> -</blockquote> - -Once enabled, the <tt>lineno()</tt> and <tt>lexpos()</tt> methods work -for all grammar symbols. In addition, two additional methods can be -used: - -<ul> -<li><tt>p.linespan(num)</tt>. Return a tuple (startline,endline) with the starting and ending line number for symbol <em>num</em>. -<li><tt>p.lexspan(num)</tt>. Return a tuple (start,end) with the starting and ending positions for symbol <em>num</em>. -</ul> - -For example: - -<blockquote> -<pre> -def p_expression(p): - 'expression : expression PLUS expression' - p.lineno(1) # Line number of the left expression - p.lineno(2) # line number of the PLUS operator - p.lineno(3) # line number of the right expression - ... 
- start,end = p.linespan(3) # Start,end lines of the right expression - starti,endi = p.lexspan(3) # Start,end positions of right expression - -</pre> -</blockquote> - -Note: The <tt>lexspan()</tt> function only returns the range of values up to the start of the last grammar symbol. - -<p> -Although it may be convenient for PLY to track position information on -all grammar symbols, this is often unnecessary. For example, if you -are merely using line number information in an error message, you can -often just key off of a specific token in the grammar rule. For -example: - -<blockquote> -<pre> -def p_bad_func(p): - 'funccall : fname LPAREN error RPAREN' - # Line number reported from LPAREN token - print("Bad function call at line", p.lineno(2)) -</pre> -</blockquote> - -<p> -Similarly, you may get better parsing performance if you only -selectively propagate line number information where it's needed using -the <tt>p.set_lineno()</tt> method. For example: - -<blockquote> -<pre> -def p_fname(p): - 'fname : ID' - p[0] = p[1] - p.set_lineno(0,p.lineno(1)) -</pre> -</blockquote> - -PLY doesn't retain line number information from rules that have already been -parsed. If you are building an abstract syntax tree and need to have line numbers, -you should make sure that the line numbers appear in the tree itself. - -<H3><a name="ply_nn34"></a>6.10 AST Construction</H3> - - -<tt>yacc.py</tt> provides no special functions for constructing an -abstract syntax tree. However, such construction is easy enough to do -on your own. - -<p>A minimal way to construct a tree is to simply create and -propagate a tuple or list in each grammar rule function. 
There -are many possible ways to do this, but one example would be something -like this: - -<blockquote> -<pre> -def p_expression_binop(p): - '''expression : expression PLUS expression - | expression MINUS expression - | expression TIMES expression - | expression DIVIDE expression''' - - p[0] = ('binary-expression',p[2],p[1],p[3]) - -def p_expression_group(p): - 'expression : LPAREN expression RPAREN' - p[0] = ('group-expression',p[2]) - -def p_expression_number(p): - 'expression : NUMBER' - p[0] = ('number-expression',p[1]) -</pre> -</blockquote> - -<p> -Another approach is to create a set of data structure for different -kinds of abstract syntax tree nodes and assign nodes to <tt>p[0]</tt> -in each rule. For example: - -<blockquote> -<pre> -class Expr: pass - -class BinOp(Expr): - def __init__(self,left,op,right): - self.type = "binop" - self.left = left - self.right = right - self.op = op - -class Number(Expr): - def __init__(self,value): - self.type = "number" - self.value = value - -def p_expression_binop(p): - '''expression : expression PLUS expression - | expression MINUS expression - | expression TIMES expression - | expression DIVIDE expression''' - - p[0] = BinOp(p[1],p[2],p[3]) - -def p_expression_group(p): - 'expression : LPAREN expression RPAREN' - p[0] = p[2] - -def p_expression_number(p): - 'expression : NUMBER' - p[0] = Number(p[1]) -</pre> -</blockquote> - -The advantage to this approach is that it may make it easier to attach more complicated -semantics, type checking, code generation, and other features to the node classes. - -<p> -To simplify tree traversal, it may make sense to pick a very generic -tree structure for your parse tree nodes. 
For example: - -<blockquote> -<pre> -class Node: - def __init__(self,type,children=None,leaf=None): - self.type = type - if children: - self.children = children - else: - self.children = [ ] - self.leaf = leaf - -def p_expression_binop(p): - '''expression : expression PLUS expression - | expression MINUS expression - | expression TIMES expression - | expression DIVIDE expression''' - - p[0] = Node("binop", [p[1],p[3]], p[2]) -</pre> -</blockquote> - -<H3><a name="ply_nn35b"></a>6.11 Embedded Actions</H3> - - -The parsing technique used by yacc only allows actions to be executed at the end of a rule. For example, -suppose you have a rule like this: - -<blockquote> -<pre> -def p_foo(p): - "foo : A B C D" - print("Parsed a foo", p[1],p[2],p[3],p[4]) -</pre> -</blockquote> - -<p> -In this case, the supplied action code only executes after all of the -symbols <tt>A</tt>, <tt>B</tt>, <tt>C</tt>, and <tt>D</tt> have been -parsed. Sometimes, however, it is useful to execute small code -fragments during intermediate stages of parsing. For example, suppose -you wanted to perform some action immediately after <tt>A</tt> has -been parsed. To do this, write an empty rule like this: - -<blockquote> -<pre> -def p_foo(p): - "foo : A seen_A B C D" - print("Parsed a foo", p[1],p[3],p[4],p[5]) - print("seen_A returned", p[2]) - -def p_seen_A(p): - "seen_A :" - print("Saw an A = ", p[-1]) # Access grammar symbol to left - p[0] = some_value # Assign value to seen_A - -</pre> -</blockquote> - -<p> -In this example, the empty <tt>seen_A</tt> rule executes immediately -after <tt>A</tt> is shifted onto the parsing stack. Within this -rule, <tt>p[-1]</tt> refers to the symbol on the stack that appears -immediately to the left of the <tt>seen_A</tt> symbol. In this case, -it would be the value of <tt>A</tt> in the <tt>foo</tt> rule -immediately above. 
Like other rules, a value can be returned from an -embedded action by simply assigning it to <tt>p[0]</tt>. - -<p> -The use of embedded actions can sometimes introduce extra shift/reduce conflicts. For example, -this grammar has no conflicts: - -<blockquote> -<pre> -def p_foo(p): - """foo : abcd - | abcx""" - -def p_abcd(p): - "abcd : A B C D" - -def p_abcx(p): - "abcx : A B C X" -</pre> -</blockquote> - -However, if you insert an embedded action into one of the rules like this, - -<blockquote> -<pre> -def p_foo(p): - """foo : abcd - | abcx""" - -def p_abcd(p): - "abcd : A B C D" - -def p_abcx(p): - "abcx : A B seen_AB C X" - -def p_seen_AB(p): - "seen_AB :" -</pre> -</blockquote> - -an extra shift-reduce conflict will be introduced. This conflict is -caused by the fact that the same symbol <tt>C</tt> appears next in -both the <tt>abcd</tt> and <tt>abcx</tt> rules. The parser can either -shift the symbol (<tt>abcd</tt> rule) or reduce the empty -rule <tt>seen_AB</tt> (<tt>abcx</tt> rule). - -<p> -A common use of embedded rules is to control other aspects of parsing -such as scoping of local variables. For example, if you were parsing C code, you might -write code like this: - -<blockquote> -<pre> -def p_statements_block(p): - "statements : LBRACE new_scope statements RBRACE" - # Action code - ... - pop_scope() # Return to previous scope - -def p_new_scope(p): - "new_scope :" - # Create a new scope for local variables - s = new_scope() - push_scope(s) - ... -</pre> -</blockquote> - -In this case, the embedded action <tt>new_scope</tt> executes -immediately after a <tt>LBRACE</tt> (<tt>{</tt>) symbol is parsed. -This might adjust internal symbol tables and other aspects of the -parser. Upon completion of the rule <tt>statements_block</tt>, code -might undo the operations performed in the embedded action -(e.g., <tt>pop_scope()</tt>). 
- -<H3><a name="ply_nn36"></a>6.12 Miscellaneous Yacc Notes</H3> - - -<ul> - -<li>By default, <tt>yacc.py</tt> relies on <tt>lex.py</tt> for tokenizing. However, an alternative tokenizer -can be supplied as follows: - -<blockquote> -<pre> -parser = yacc.parse(lexer=x) -</pre> -</blockquote> -in this case, <tt>x</tt> must be a Lexer object that minimally has a <tt>x.token()</tt> method for retrieving the next -token. If an input string is given to <tt>yacc.parse()</tt>, the lexer must also have an <tt>x.input()</tt> method. - -<p> -<li>By default, yacc generates tables in debugging mode (which produces the parser.out file and other output). -To disable this, use - -<blockquote> -<pre> -parser = yacc.yacc(debug=False) -</pre> -</blockquote> - -<p> -<li>To change the name of the <tt>parsetab.py</tt> file, use: - -<blockquote> -<pre> -parser = yacc.yacc(tabmodule="foo") -</pre> -</blockquote> - -<P> -Normally, the <tt>parsetab.py</tt> file is placed into the same directory as -the module where the parser is defined. If you want it to go somewhere else, you can -give an absolute package name for <tt>tabmodule</tt> instead. In that case, the -tables will be written there. -</p> - -<p> -<li>To change the directory in which the <tt>parsetab.py</tt> file (and other output files) are written, use: -<blockquote> -<pre> -parser = yacc.yacc(tabmodule="foo",outputdir="somedirectory") -</pre> -</blockquote> - -<p> -Note: Be aware that unless the directory specified is also on Python's path (<tt>sys.path</tt>), subsequent -imports of the table file will fail. As a general rule, it's better to specify a destination using the -<tt>tabmodule</tt> argument instead of directly specifying a directory using the <tt>outputdir</tt> argument. 
-</p> - -<p> -<li>To prevent yacc from generating any kind of parser table file, use: -<blockquote> -<pre> -parser = yacc.yacc(write_tables=False) -</pre> -</blockquote> - -Note: If you disable table generation, yacc() will regenerate the parsing tables -each time it runs (which may take awhile depending on how large your grammar is). - -<P> -<li>To print copious amounts of debugging during parsing, use: - -<blockquote> -<pre> -parser.parse(input_text, debug=True) -</pre> -</blockquote> - -<p> -<li>Since the generation of the LALR tables is relatively expensive, previously generated tables are -cached and reused if possible. The decision to regenerate the tables is determined by taking an MD5 -checksum of all grammar rules and precedence rules. Only in the event of a mismatch are the tables regenerated. - -<p> -It should be noted that table generation is reasonably efficient, even for grammars that involve around a 100 rules -and several hundred states. </li> - - -<p> -<li>Since LR parsing is driven by tables, the performance of the parser is largely independent of the -size of the grammar. The biggest bottlenecks will be the lexer and the complexity of the code in your grammar rules. -</li> -</p> - -<p> -<li><tt>yacc()</tt> also allows parsers to be defined as classes and as closures (see the section on alternative specification of -lexers). However, be aware that only one parser may be defined in a single module (source file). There are various -error checks and validation steps that may issue confusing error messages if you try to define multiple parsers -in the same source file. -</li> -</p> - -<p> -<li>Decorators of production rules have to update the wrapped function's line number. 
<tt>wrapper.co_firstlineno = func.__code__.co_firstlineno</tt>: - -<blockquote> -<pre> -from functools import wraps -from nodes import Collection - - -def strict(*types): - def decorate(func): - @wraps(func) - def wrapper(p): - func(p) - if not isinstance(p[0], types): - raise TypeError - - wrapper.co_firstlineno = func.__code__.co_firstlineno - return wrapper - - return decorate - -@strict(Collection) -def p_collection(p): - """ - collection : sequence - | map - """ - p[0] = p[1] -</pre> -</blockquote> - -</li> -</p> - - -</ul> -</p> - - -<H2><a name="ply_nn37"></a>7. Multiple Parsers and Lexers</H2> - - -In advanced parsing applications, you may want to have multiple -parsers and lexers. - -<p> -As a general rule, this isn't a problem. However, to make it work, -you need to carefully make sure everything gets hooked up correctly. -First, make sure you save the objects returned by <tt>lex()</tt> and -<tt>yacc()</tt>. For example: - -<blockquote> -<pre> -lexer = lex.lex() # Return lexer object -parser = yacc.yacc() # Return parser object -</pre> -</blockquote> - -Next, when parsing, make sure you give the <tt>parse()</tt> function a reference to the lexer it -should be using. For example: - -<blockquote> -<pre> -parser.parse(text,lexer=lexer) -</pre> -</blockquote> - -If you forget to do this, the parser will use the last lexer -created--which is not always what you want. - -<p> -Within lexer and parser rule functions, these objects are also -available. In the lexer, the "lexer" attribute of a token refers to -the lexer object that triggered the rule. For example: - -<blockquote> -<pre> -def t_NUMBER(t): - r'\d+' - ... - print(t.lexer) # Show lexer object -</pre> -</blockquote> - -In the parser, the "lexer" and "parser" attributes refer to the lexer -and parser objects respectively. - -<blockquote> -<pre> -def p_expr_plus(p): - 'expr : expr PLUS expr' - ... 
- print(p.parser) # Show parser object - print(p.lexer) # Show lexer object -</pre> -</blockquote> - -If necessary, arbitrary attributes can be attached to the lexer or parser object. -For example, if you wanted to have different parsing modes, you could attach a mode -attribute to the parser object and look at it later. - -<H2><a name="ply_nn38b"></a>8. Using Python's Optimized Mode</H2> - - -Because PLY uses information from doc-strings, parsing and lexing -information must be gathered while running the Python interpreter in -normal mode (i.e., not with the -O or -OO options). However, if you -specify optimized mode like this: - -<blockquote> -<pre> -lex.lex(optimize=1) -yacc.yacc(optimize=1) -</pre> -</blockquote> - -then PLY can later be used when Python runs in optimized mode. To make this work, -make sure you first run Python in normal mode. Once the lexing and parsing tables -have been generated the first time, run Python in optimized mode. PLY will use -the tables without the need for doc strings. - -<p> -Beware: running PLY in optimized mode disables a lot of error -checking. You should only do this when your project has stabilized -and you don't need to do any debugging. One of the purposes of -optimized mode is to substantially decrease the startup time of -your compiler (by assuming that everything is already properly -specified and works). - -<H2><a name="ply_nn44"></a>9. Advanced Debugging</H2> - - -<p> -Debugging a compiler is typically not an easy task. PLY provides some -advanced diagnostic capabilities through the use of Python's -<tt>logging</tt> module. The next two sections describe this: - -<H3><a name="ply_nn45"></a>9.1 Debugging the lex() and yacc() commands</H3> - - -<p> -Both the <tt>lex()</tt> and <tt>yacc()</tt> commands have a debugging -mode that can be enabled using the <tt>debug</tt> flag. 
For example: - -<blockquote> -<pre> -lex.lex(debug=True) -yacc.yacc(debug=True) -</pre> -</blockquote> - -Normally, the output produced by debugging is routed to either -standard error or, in the case of <tt>yacc()</tt>, to a file -<tt>parser.out</tt>. This output can be more carefully controlled -by supplying a logging object. Here is an example that adds -information about where different debugging messages are coming from: - -<blockquote> -<pre> -# Set up a logging object -import logging -logging.basicConfig( - level = logging.DEBUG, - filename = "parselog.txt", - filemode = "w", - format = "%(filename)10s:%(lineno)4d:%(message)s" -) -log = logging.getLogger() - -lex.lex(debug=True,debuglog=log) -yacc.yacc(debug=True,debuglog=log) -</pre> -</blockquote> - -If you supply a custom logger, the amount of debugging -information produced can be controlled by setting the logging level. -Typically, debugging messages are either issued at the <tt>DEBUG</tt>, -<tt>INFO</tt>, or <tt>WARNING</tt> levels. - -<p> -PLY's error messages and warnings are also produced using the logging -interface. This can be controlled by passing a logging object -using the <tt>errorlog</tt> parameter. - -<blockquote> -<pre> -lex.lex(errorlog=log) -yacc.yacc(errorlog=log) -</pre> -</blockquote> - -If you want to completely silence warnings, you can either pass in a -logging object with an appropriate filter level or use the <tt>NullLogger</tt> -object defined in either <tt>lex</tt> or <tt>yacc</tt>. For example: - -<blockquote> -<pre> -yacc.yacc(errorlog=yacc.NullLogger()) -</pre> -</blockquote> - -<H3><a name="ply_nn46"></a>9.2 Run-time Debugging</H3> - - -<p> -To enable run-time debugging of a parser, use the <tt>debug</tt> option to parse. This -option can either be an integer (which simply turns debugging on or off) or an instance -of a logger object. 
For example: - -<blockquote> -<pre> -log = logging.getLogger() -parser.parse(input,debug=log) -</pre> -</blockquote> - -If a logging object is passed, you can use its filtering level to control how much -output gets generated. The <tt>INFO</tt> level is used to produce information -about rule reductions. The <tt>DEBUG</tt> level will show information about the -parsing stack, token shifts, and other details. The <tt>ERROR</tt> level shows information -related to parsing errors. - -<p> -For very complicated problems, you should pass in a logging object that -redirects to a file where you can more easily inspect the output after -execution. - -<H2><a name="ply_nn49"></a>10. Packaging Advice</H2> - - -<p> -If you are distributing a package that makes use of PLY, you should -spend a few moments thinking about how you want to handle the files -that are automatically generated. For example, the <tt>parsetab.py</tt> -file generated by the <tt>yacc()</tt> function.</p> - -<p> -Starting in PLY-3.6, the table files are created in the same directory -as the file where a parser is defined. This means that the -<tt>parsetab.py</tt> file will live side-by-side with your parser -specification. In terms of packaging, this is probably the easiest and -most sane approach to manage. You don't need to give <tt>yacc()</tt> -any extra arguments and it should just "work."</p> - -<p> -One concern is the management of the <tt>parsetab.py</tt> file itself. -For example, should you have this file checked into version control (e.g., GitHub), -should it be included in a package distribution as a normal file, or should you -just let PLY generate it automatically for the user when they install your package? -</p> - -<p> -As of PLY-3.6, the <tt>parsetab.py</tt> file should be compatible across all versions -of Python including Python 2 and 3. Thus, a table file generated in Python 2 should -work fine if it's used on Python 3. 
Because of this, it should be relatively harmless -to distribute the <tt>parsetab.py</tt> file yourself if you need to. However, be aware -that older/newer versions of PLY may try to regenerate the file if there are future -enhancements or changes to its format. -</p> - -<p> -To make the generation of table files easier for the purposes of installation, you might -want to make your parser files executable using the <tt>-m</tt> option or similar. For -example: -</p> - -<blockquote> -<pre> -# calc.py -... -... -def make_parser(): - parser = yacc.yacc() - return parser - -if __name__ == '__main__': - make_parser() -</pre> -</blockquote> - -<p> -You can then use a command such as <tt>python -m calc</tt> to generate the tables. Alternatively, -a <tt>setup.py</tt> script can import the module and use <tt>make_parser()</tt> to create the -parsing tables. -</p> - -<p> -If you're willing to sacrifice a little startup time, you can also instruct PLY to never write the -tables using <tt>yacc.yacc(write_tables=False, debug=False)</tt>. In this mode, PLY will regenerate -the parsing tables from scratch each time. For a small grammar, you probably won't notice. For a -large grammar, you should probably reconsider--the parsing tables are meant to dramatically speed up this process. -</p> - -<p> -During operation, it is normal for PLY to produce diagnostic error -messages (usually printed to standard error). These are generated -entirely using the <tt>logging</tt> module. If you want to redirect -these messages or silence them, you can provide your own logging -object to <tt>yacc()</tt>. For example: -</p> - -<blockquote> -<pre> -import logging -log = logging.getLogger('ply') -... -parser = yacc.yacc(errorlog=log) -</pre> -</blockquote> - -<H2><a name="ply_nn39"></a>11. Where to go from here?</H2> - - -The <tt>examples</tt> directory of the PLY distribution contains several simple examples. 
Please consult a -compilers textbook for the theory and underlying implementation details or LR parsing. - -</body> -</html> - - - - - - - diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..91883c3 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,192 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = _build + +# User-friendly check for sphinx-build +ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) +$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) +endif + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
+ +.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext + +help: + @echo "Please use \`make <target>' where <target> is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " applehelp to make an Apple Help Book" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " texinfo to make Texinfo files" + @echo " info to make Texinfo files and run them through makeinfo" + @echo " gettext to make PO message catalogs" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " xml to make Docutils-native XML files" + @echo " pseudoxml to make pseudoxml-XML files for display purposes" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + @echo " coverage to run coverage check of the documentation (if enabled)" + +clean: + rm -rf $(BUILDDIR)/* + +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 
+ +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." + +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/sly.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/sly.qhc" + +applehelp: + $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp + @echo + @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." + @echo "N.B. You won't be able to view it unless you put it in" \ + "~/Library/Documentation/Help or install it in your application" \ + "bundle." + +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." + @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/sly" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/sly" + @echo "# devhelp" + +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 
+ @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +latexpdfja: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through platex and dvipdfmx..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." + +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." + make -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." + +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." 
+ +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." + +coverage: + $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage + @echo "Testing of coverage in the sources finished, look at the " \ + "results in $(BUILDDIR)/coverage/python.txt." + +xml: + $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml + @echo + @echo "Build finished. The XML files are in $(BUILDDIR)/xml." + +pseudoxml: + $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml + @echo + @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..abfb05c --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,284 @@ +# -*- coding: utf-8 -*- +# +# ply documentation build configuration file, created by +# sphinx-quickstart on Wed Sep 7 13:23:26 2016. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import sys +import os +import shlex + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +#sys.path.insert(0, os.path.abspath('.')) + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +#needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. 
+extensions = [] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = u'ply' +copyright = u'2001-2020, David Beazley' +author = u'David Beazley' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '4.0' +# The full version, including alpha/beta/rc tags. +release = '4.0' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = ['_build'] + +# The reST default role (used for this markup: `text`) to use for all +# documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. 
+#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +#keep_warnings = False + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = 'alabaster' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +#html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# "<project> v<release> documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Add any extra paths that contain custom files (such as robots.txt or +# .htaccess) here, relative to this directory. These files are copied +# directly to the root of the documentation. 
+#html_extra_path = [] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_domain_indices = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +#html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +#html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a <link> tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = None + +# Language to be used for generating the HTML full-text search index. +# Sphinx supports the following languages: +# 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' +# 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr' +#html_search_language = 'en' + +# A dictionary with options for the search language support, empty by default. +# Now only 'ja' uses this config value +#html_search_options = {'type': 'default'} + +# The name of a javascript file (relative to the configuration directory) that +# implements a search results scorer. If empty, the default will be used. 
+#html_search_scorer = 'scorer.js' + +# Output file base name for HTML help builder. +htmlhelp_basename = 'plydoc' + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { +# The paper size ('letterpaper' or 'a4paper'). +#'papersize': 'letterpaper', + +# The font size ('10pt', '11pt' or '12pt'). +#'pointsize': '10pt', + +# Additional stuff for the LaTeX preamble. +#'preamble': '', + +# Latex figure (float) alignment +#'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'ply.tex', u'Ply Documentation', + u'David Beazley', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# If true, show page references after internal links. +#latex_show_pagerefs = False + +# If true, show URL addresses after external links. +#latex_show_urls = False + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_domain_indices = True + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'ply', u'Ply Documentation', + [author], 1) +] + +# If true, show URL addresses after external links. +#man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. 
List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'ply', u'Ply Documentation', + author, 'ply', 'Python Lex-Yacc.', + 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. +#texinfo_appendices = [] + +# If false, no module index is generated. +#texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +#texinfo_show_urls = 'footnote' + +# If true, do not generate a @detailmenu in the "Top" node's menu. +#texinfo_no_detailmenu = False diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..da22efd --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,58 @@ +PLY (Python Lex-Yacc) +===================== + +Requirements +------------ + +PLY requires the use of Python 3.6 or greater. Older versions +of Python are not supported. + +Overview +-------- + +PLY is a 100% Python implementation of the lex and yacc tools +commonly used to write parsers and compilers. Parsing is +based on the same LALR(1) algorithm used by many yacc tools. +Here are a few notable features: + + - PLY provides *very* extensive error reporting and diagnostic + information to assist in parser construction. The original + implementation was developed for instructional purposes. As + a result, the system tries to identify the most common types + of errors made by novice users. + + - PLY provides full support for empty productions, error recovery, + precedence specifiers, and moderately ambiguous grammars. + + - PLY can be used to build parsers for "real" programming languages. + Although it is not ultra-fast due to its Python implementation, + PLY can be used to parse grammars consisting of several hundred + rules (as might be found for a language like C). + +More Documentation +================== + +Contents: + +.. 
toctree:: + :maxdepth: 3 + + ply + internals + +Resources +========= + +For a detailed overview of parsing theory, consult the excellent +book "Compilers : Principles, Techniques, and Tools" by Aho, Sethi, and +Ullman. The topics found in "Lex & Yacc" by Levine, Mason, and Brown +may also be useful. + +The GitHub page for PLY can be found at: + + https://github.com/dabeaz/ply + +Please direct bug reports and pull requests to the GitHub page. +To contact me directly, send email to dave@dabeaz.com or contact +me on Twitter (@dabeaz). + diff --git a/docs/internals.rst b/docs/internals.rst new file mode 100644 index 0000000..2e0de17 --- /dev/null +++ b/docs/internals.rst @@ -0,0 +1,530 @@ +PLY Internals +============= + +1. Introduction +--------------- + +This document describes classes and functions that make up the internal +operation of PLY. Using this programming interface, it is possible to +manually build a parser using a different interface specification +than what PLY normally uses. For example, you could build a grammar +from information parsed in a completely different input format. Some of +these objects may be useful for building more advanced parsing engines +such as GLR. + +It should be stressed that using PLY at this level is not for the +faint of heart. Generally, it's assumed that you know a bit of +the underlying compiler theory and how an LR parser is put together. + +2. Grammar Class +---------------- + +The file ``ply.yacc`` defines a class ``Grammar`` that +is used to hold and manipulate information about a grammar +specification. It encapsulates the same basic information +about a grammar that is put into a YACC file including +the list of tokens, precedence rules, and grammar rules. +Various operations are provided to perform different validations +on the grammar. In addition, there are operations to compute +the first and follow sets that are needed by the various table +generation algorithms. 
+ + +``Grammar(terminals)`` + Creates a new grammar object. ``terminals`` is a list of strings + specifying the terminals for the grammar. An instance ``g`` of + ``Grammar`` has the following methods: + + +``g.set_precedence(term,assoc,level)`` + Sets the precedence level and associativity for a given terminal ``term``. + ``assoc`` is one of ``'right'``, + ``'left'``, or ``'nonassoc'`` and ``level`` is a positive integer. The higher + the value of ``level``, the higher the precedence. Here is an example of typical + precedence settings:: + + g.set_precedence('PLUS', 'left',1) + g.set_precedence('MINUS', 'left',1) + g.set_precedence('TIMES', 'left',2) + g.set_precedence('DIVIDE','left',2) + g.set_precedence('UMINUS','left',3) + + This method must be called prior to adding any productions to the + grammar with ``g.add_production()``. The precedence of individual grammar + rules is determined by the precedence of the right-most terminal. + + +``g.add_production(name,syms,func=None,file='',line=0)`` + Adds a new grammar rule. ``name`` is the name of the rule, + ``syms`` is a list of symbols making up the right hand + side of the rule, ``func`` is the function to call when + reducing the rule. ``file`` and ``line`` specify + the filename and line number of the rule and are used for + generating error messages. + + The list of symbols in ``syms`` may include character + literals and ``%prec`` specifiers. Here are some + examples:: + + g.add_production('expr',['expr','PLUS','term'],func,file,line) + g.add_production('expr',['expr','"+"','term'],func,file,line) + g.add_production('expr',['MINUS','expr','%prec','UMINUS'],func,file,line) + + If any kind of error is detected, a ``GrammarError`` exception + is raised with a message indicating the reason for the failure. + + +``g.set_start(start=None)`` + Sets the starting rule for the grammar. ``start`` is a string + specifying the name of the start rule. 
If ``start`` is omitted, + the first grammar rule added with ``add_production()`` is taken to be + the starting rule. This method must always be called after all + productions have been added. + +``g.find_unreachable()`` + Diagnostic function. Returns a list of all unreachable non-terminals + defined in the grammar. This is used to identify inactive parts of + the grammar specification. + +``g.infinite_cycle()`` + Diagnostic function. Returns a list of all non-terminals in the + grammar that result in an infinite cycle. This condition occurs if + there is no way for a grammar rule to expand to a string containing + only terminal symbols. + +``g.undefined_symbols()`` + Diagnostic function. Returns a list of tuples ``(name, prod)`` + corresponding to undefined symbols in the grammar. ``name`` is the + name of the undefined symbol and ``prod`` is an instance of + ``Production`` which has information about the production rule + where the undefined symbol was used. + +``g.unused_terminals()`` + Diagnostic function. Returns a list of terminals that were defined, + but never used in the grammar. + +``g.unused_rules()`` + Diagnostic function. Returns a list of ``Production`` instances + corresponding to production rules that were defined in the grammar, + but never used anywhere. This is slightly different + than ``find_unreachable()``. + +``g.unused_precedence()`` + Diagnostic function. Returns a list of tuples ``(term, assoc)`` + corresponding to precedence rules that were set, but never used in the + grammar. ``term`` is the terminal name and ``assoc`` is the + precedence associativity (e.g., ``'left'``, ``'right'``, + or ``'nonassoc'``). + +``g.compute_first()`` + Compute all of the first sets for all symbols in the grammar. Returns a dictionary + mapping symbol names to a list of all first symbols. + +``g.compute_follow()`` + Compute all of the follow sets for all non-terminals in the grammar. 
+ The follow set is the set of all possible symbols that might follow a + given non-terminal. Returns a dictionary mapping non-terminal names + to a list of symbols. + +``g.build_lritems()`` + Calculates all of the LR items for all productions in the grammar. This + step is required before using the grammar for any kind of table generation. + See the section on LR items below. + +The following attributes are set by the above methods and may be useful +in code that works with the grammar. All of these attributes should be +assumed to be read-only. Changing their values directly will likely +break the grammar. + +``g.Productions`` + A list of all productions added. The first entry is reserved for + a production representing the starting rule. The objects in this list + are instances of the ``Production`` class, described shortly. + +``g.Prodnames`` + A dictionary mapping the names of nonterminals to a list of all + productions of that nonterminal. + +``g.Terminals`` + A dictionary mapping the names of terminals to a list of the + production numbers where they are used. + +``g.Nonterminals`` + A dictionary mapping the names of nonterminals to a list of the + production numbers where they are used. + +``g.First`` + A dictionary representing the first sets for all grammar symbols. This is + computed and returned by the ``compute_first()`` method. + +``g.Follow`` + A dictionary representing the follow sets for all grammar rules. This is + computed and returned by the ``compute_follow()`` method. + +``g.Start`` + Starting symbol for the grammar. Set by the ``set_start()`` method. + +For the purposes of debugging, a ``Grammar`` object supports the ``__len__()`` and +``__getitem__()`` special methods. Accessing ``g[n]`` returns the nth production +from the grammar. + +3. Productions +-------------- + +``Grammar`` objects store grammar rules as instances of a ``Production`` class. 
This +class has no public constructor--you should only create productions by calling ``Grammar.add_production()``. +The following attributes are available on a ``Production`` instance ``p``. + +``p.name`` + The name of the production. For a grammar rule such as ``A : B C D``, this is ``'A'``. + +``p.prod`` + A tuple of symbols making up the right-hand side of the production. For a grammar rule such as ``A : B C D``, this is ``('B','C','D')``. + +``p.number`` + Production number. An integer containing the index of the production in the grammar's ``Productions`` list. + +``p.func`` + The name of the reduction function associated with the production. + This is the function that will execute when reducing the entire + grammar rule during parsing. + +``p.callable`` + The callable object associated with the name in ``p.func``. This is ``None`` + unless the production has been bound using ``bind()``. + +``p.file`` + Filename associated with the production. Typically this is the file where the production was defined. Used for error messages. + +``p.lineno`` + Line number associated with the production. Typically this is the line number in ``p.file`` where the production was defined. Used for error messages. + +``p.prec`` + Precedence and associativity associated with the production. This is a tuple ``(assoc,level)`` where + ``assoc`` is one of ``'left'``,``'right'``, or ``'nonassoc'`` and ``level`` is + an integer. This value is determined by the precedence of the right-most terminal symbol in the production + or by use of the ``%prec`` specifier when adding the production. + +``p.usyms`` + A list of all unique symbols found in the production. + +``p.lr_items`` + A list of all LR items for this production. This attribute only has a meaningful value if the + ``Grammar.build_lritems()`` method has been called. The items in this list are + instances of ``LRItem`` described below. + +``p.lr_next`` + The head of a linked-list representation of the LR items in ``p.lr_items``. 
+ This attribute only has a meaningful value if the ``Grammar.build_lritems()`` + method has been called. Each ``LRItem`` instance has a ``lr_next`` attribute + to move to the next item. The list is terminated by ``None``. + +``p.bind(dict)`` + Binds the production function name in ``p.func`` to a callable object in + ``dict``. This operation is typically carried out in the last step + prior to running the parsing engine and is needed since parsing tables are typically + read from files which only include the function names, not the functions themselves. + +``Production`` objects support +the ``__len__()``, ``__getitem__()``, and ``__str__()`` +special methods. +``len(p)`` returns the number of symbols in ``p.prod`` +and ``p[n]`` is the same as ``p.prod[n]``. + +4. LRItems +---------- + +The construction of parsing tables in an LR-based parser generator is primarily +done over a set of "LR Items". An LR item represents a stage of parsing one +of the grammar rules. To compute the LR items, it is first necessary to +call ``Grammar.build_lritems()``. Once this step is done, all of the productions +in the grammar will have their LR items attached to them. + +Here is an interactive example that shows what LR items look like if you +interactively experiment. In this example, ``g`` is a ``Grammar`` +object:: + + >>> g.build_lritems() + >>> p = g[1] + >>> p + Production(statement -> ID = expr) + >>> + +In the above code, ``p`` represents the first grammar rule. In +this case, a rule ``'statement -> ID = expr'``. + +Now, let's look at the LR items for ``p``:: + + >>> p.lr_items + [LRItem(statement -> . ID = expr), + LRItem(statement -> ID . = expr), + LRItem(statement -> ID = . expr), + LRItem(statement -> ID = expr .)] + >>> + +In each LR item, the dot (.) represents a specific stage of parsing. In each LR item, the dot +is advanced by one symbol. It is only when the dot reaches the very end that a production +is successfully parsed. 
+ +An instance ``lr`` of ``LRItem`` has the following +attributes that hold information related to that specific stage of +parsing. + +``lr.name`` + The name of the grammar rule. For example, ``'statement'`` in the above example. + +``lr.prod`` + A tuple of symbols representing the right-hand side of the production, including the + special ``'.'`` character. For example, ``('ID','.','=','expr')``. + +``lr.number`` + An integer representing the production number in the grammar. + +``lr.usyms`` + A set of unique symbols in the production. Inherited from the original ``Production`` instance. + +``lr.lr_index`` + An integer representing the position of the dot (.). You should never use ``lr.prod.index()`` + to search for it--the result will be wrong if the grammar happens to also use (.) as a character + literal. + +``lr.lr_after`` + A list of all productions that can legally appear immediately to the right of the + dot (.). This list contains ``Production`` instances. This attribute + represents all of the possible branches a parse can take from the current position. + For example, suppose that ``lr`` represents a stage immediately before + an expression like this:: + + >>> lr + LRItem(statement -> ID = . expr) + >>> + + Then, the value of ``lr.lr_after`` might look like this, showing all productions that + can legally appear next:: + + >>> lr.lr_after + [Production(expr -> expr PLUS expr), + Production(expr -> expr MINUS expr), + Production(expr -> expr TIMES expr), + Production(expr -> expr DIVIDE expr), + Production(expr -> MINUS expr), + Production(expr -> LPAREN expr RPAREN), + Production(expr -> NUMBER), + Production(expr -> ID)] + >>> + +``lr.lr_before`` + The grammar symbol that appears immediately before the dot (.) or ``None`` if + at the beginning of the parse. + +``lr.lr_next`` + A link to the next LR item, representing the next stage of the parse. ``None`` if ``lr`` + is the last LR item. 
+ +``LRItem`` instances also support the ``__len__()`` and ``__getitem__()`` special methods. +``len(lr)`` returns the number of items in ``lr.prod`` including the dot (.). ``lr[n]`` +returns ``lr.prod[n]``. + +It goes without saying that all of the attributes associated with LR +items should be assumed to be read-only. Modifications will very +likely create a small black-hole that will consume you and your code. + +5. LRTable +---------- + +The ``LRTable`` class represents constructed LR parsing tables on a +grammar. + +``LRTable(grammar, log=None)`` + Create the LR parsing tables on a grammar. ``grammar`` is an instance of ``Grammar`` and + ``log`` is a logger object used to write debugging information. The debugging information + written to ``log`` is the same as what appears in the ``parser.out`` file created + by yacc. By supplying a custom logger with a different message format, it is possible to get + more information (e.g., the line number in ``yacc.py`` used for issuing each line of + output in the log). + +An instance ``lr`` of ``LRTable`` has the following attributes. + +``lr.grammar`` + A link to the Grammar object used to construct the parsing tables. + +``lr.lr_method`` + The LR parsing method used (e.g., ``'LALR'``) + +``lr.lr_productions`` + A reference to ``grammar.Productions``. This, together with ``lr_action`` and ``lr_goto`` + contain all of the information needed by the LR parsing engine. + +``lr.lr_action`` + The LR action dictionary that implements the underlying state machine. The keys of this dictionary are + the LR states. + +``lr.lr_goto`` + The LR goto table that contains information about grammar rule reductions. + +``lr.sr_conflicts`` + A list of tuples ``(state,token,resolution)`` identifying all shift/reduce conflicts. ``state`` is the LR state + number where the conflict occurred, ``token`` is the token causing the conflict, and ``resolution`` is + a string describing the resolution taken. 
``resolution`` is either ``'shift'`` or ``'reduce'``. + +``lr.rr_conflicts`` + A list of tuples ``(state,rule,rejected)`` identifying all reduce/reduce conflicts. ``state`` is the + LR state number where the conflict occurred, ``rule`` is the production rule that was selected + and ``rejected`` is the production rule that was rejected. Both ``rule`` and ``rejected`` are + instances of ``Production``. They can be inspected to provide the user with more information. + +``lrtab.bind_callables(dict)`` + This binds all of the function names used in productions to callable objects + found in the dictionary ``dict``. During table generation and when reading + LR tables from files, PLY only uses the names of action functions such as ``'p_expr'``, + ``'p_statement'``, etc. In order to actually run the parser, these names + have to be bound to callable objects. This method is always called prior to + running a parser. + +6. LRParser +----------- + +The ``LRParser`` class implements the low-level LR parsing engine. + +``LRParser(lrtab, error_func)`` + Create an LRParser. ``lrtab`` is an instance of ``LRTable`` + containing the LR production and state tables. ``error_func`` is the + error function to invoke in the event of a parsing error. + +An instance ``p`` of ``LRParser`` has the following methods: + +``p.parse(input=None,lexer=None,debug=0,tracking=0)`` + Run the parser. ``input`` is a string, which if supplied is fed into the + lexer using its ``input()`` method. ``lexer`` is an instance of the + ``Lexer`` class to use for tokenizing. If not supplied, the last lexer + created with the ``lex`` module is used. ``debug`` is a boolean flag + that enables debugging. ``tracking`` is a boolean flag that tells the + parser to perform additional line number tracking. + +``p.restart()`` + Resets the parser state for a parse already in progress. + +7. 
ParserReflect +---------------- + +The ``ParserReflect`` class is used to collect parser specification data +from a Python module or object. This class is what collects all of the +``p_rule()`` functions in a PLY file, performs basic error checking, +and collects all of the needed information to build a grammar. Most of the +high-level PLY interface as used by the ``yacc()`` function is actually +implemented by this class. + +``ParserReflect(pdict, log=None)`` + Creates a ``ParserReflect`` instance. ``pdict`` is a dictionary + containing parser specification data. This dictionary typically corresponds + to the module or class dictionary of code that implements a PLY parser. + ``log`` is a logger instance that will be used to report error + messages. + +An instance ``p`` of ``ParserReflect`` has the following methods: + +``p.get_all()`` + Collect and store all required parsing information. + +``p.validate_all()`` + Validate all of the collected parsing information. This is a separate step + from ``p.get_all()`` as a performance optimization. In order to + reduce parser start-up time, a parser can elect to only validate the + parsing data when regenerating the parsing tables. The validation + step tries to collect as much information as possible rather than + raising an exception at the first sign of trouble. The attribute + ``p.error`` is set if there are any validation errors. The + value of this attribute is also returned. + +``p.signature()`` + Compute a signature representing the contents of the collected parsing + data. The signature value should change if anything in the parser + specification has changed in a way that would justify parser table + regeneration. This method can be called after ``p.get_all()``, + but before ``p.validate_all()``. + +The following attributes are set in the process of collecting data: + +``p.start`` + The grammar start symbol, if any. Taken from ``pdict['start']``. + +``p.error_func`` + The error handling function or ``None``. 
Taken from ``pdict['p_error']``. + +``p.tokens`` + The token list. Taken from ``pdict['tokens']``. + +``p.prec`` + The precedence specifier. Taken from ``pdict['precedence']``. + +``p.preclist`` + A parsed version of the precedence specified. A list of tuples of the form + ``(token,assoc,level)`` where ``token`` is the terminal symbol, + ``assoc`` is the associativity (e.g., ``'left'``) and ``level`` + is a numeric precedence level. + +``p.grammar`` + A list of tuples ``(name, rules)`` representing the grammar rules. ``name`` is the + name of a Python function or method in ``pdict`` that starts with ``"p_"``. + ``rules`` is a list of tuples ``(filename,line,prodname,syms)`` representing + the grammar rules found in the documentation string of that function. ``filename`` and ``line`` contain location + information that can be used for debugging. ``prodname`` is the name of the + production. ``syms`` is the right-hand side of the production. If you have a + function like this:: + + def p_expr(p): + '''expr : expr PLUS expr + | expr MINUS expr + | expr TIMES expr + | expr DIVIDE expr''' + + then the corresponding entry in ``p.grammar`` might look like this:: + + ('p_expr', [ ('calc.py',10,'expr', ['expr','PLUS','expr']), + ('calc.py',11,'expr', ['expr','MINUS','expr']), + ('calc.py',12,'expr', ['expr','TIMES','expr']), + ('calc.py',13,'expr', ['expr','DIVIDE','expr']) + ]) + +``p.pfuncs`` + A sorted list of tuples ``(line, file, name, doc)`` representing all of + the ``p_`` functions found. ``line`` and ``file`` give location + information. ``name`` is the name of the function. ``doc`` is the + documentation string. This list is sorted in ascending order by line number. + +``p.files`` + A dictionary holding all of the source filenames that were encountered + while collecting parser information. Only the keys of this dictionary have + any meaning. + +``p.error`` + An attribute that indicates whether or not any critical errors + occurred in validation. 
If this is set, it means that some kind + of problem was detected and that no further processing should be + performed. + +8. High-level operation +----------------------- + +Using all of the above classes requires some attention to detail. The ``yacc()`` +function carries out a very specific sequence of operations to create a grammar. +This same sequence should be emulated if you build an alternative PLY interface. + + +1. A ``ParserReflect`` object is created and raw grammar specification data is +collected. + +2. A ``Grammar`` object is created and populated with information +from the specification data. + +3. A ``LRTable`` object is created to run the LALR algorithm over +the ``Grammar`` object. + +4. Productions in the LRTable are bound to callables using the ``bind_callables()`` +method. + +5. A ``LRParser`` object is created from the information in the +``LRTable`` object. + + + diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..474c9bd --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,263 @@ +@ECHO OFF
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+ set SPHINXBUILD=sphinx-build
+)
+set BUILDDIR=_build
+set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
+set I18NSPHINXOPTS=%SPHINXOPTS% .
+if NOT "%PAPER%" == "" (
+ set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
+ set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
+)
+
+if "%1" == "" goto help
+
+if "%1" == "help" (
+ :help
+ echo.Please use `make ^<target^>` where ^<target^> is one of
+ echo. html to make standalone HTML files
+ echo. dirhtml to make HTML files named index.html in directories
+ echo. singlehtml to make a single large HTML file
+ echo. pickle to make pickle files
+ echo. json to make JSON files
+ echo. htmlhelp to make HTML files and a HTML help project
+ echo. qthelp to make HTML files and a qthelp project
+ echo. devhelp to make HTML files and a Devhelp project
+ echo. epub to make an epub
+ echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter
+ echo. text to make text files
+ echo. man to make manual pages
+ echo. texinfo to make Texinfo files
+ echo. gettext to make PO message catalogs
+ echo. changes to make an overview over all changed/added/deprecated items
+ echo. xml to make Docutils-native XML files
+ echo. pseudoxml to make pseudoxml-XML files for display purposes
+ echo. linkcheck to check all external links for integrity
+ echo. doctest to run all doctests embedded in the documentation if enabled
+ echo. coverage to run coverage check of the documentation if enabled
+ goto end
+)
+
+if "%1" == "clean" (
+ for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
+ del /q /s %BUILDDIR%\*
+ goto end
+)
+
+
+REM Check if sphinx-build is available and fallback to Python version if any
+%SPHINXBUILD% 2> nul
+if errorlevel 9009 goto sphinx_python
+goto sphinx_ok
+
+:sphinx_python
+
+set SPHINXBUILD=python -m sphinx.__init__
+%SPHINXBUILD% 2> nul
+if errorlevel 9009 (
+ echo.
+ echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+ echo.installed, then set the SPHINXBUILD environment variable to point
+ echo.to the full path of the 'sphinx-build' executable. Alternatively you
+ echo.may add the Sphinx directory to PATH.
+ echo.
+ echo.If you don't have Sphinx installed, grab it from
+ echo.http://sphinx-doc.org/
+ exit /b 1
+)
+
+:sphinx_ok
+
+
+if "%1" == "html" (
+ %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The HTML pages are in %BUILDDIR%/html.
+ goto end
+)
+
+if "%1" == "dirhtml" (
+ %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
+ goto end
+)
+
+if "%1" == "singlehtml" (
+ %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
+ goto end
+)
+
+if "%1" == "pickle" (
+ %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; now you can process the pickle files.
+ goto end
+)
+
+if "%1" == "json" (
+ %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; now you can process the JSON files.
+ goto end
+)
+
+if "%1" == "htmlhelp" (
+ %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; now you can run HTML Help Workshop with the ^
+.hhp project file in %BUILDDIR%/htmlhelp.
+ goto end
+)
+
+if "%1" == "qthelp" (
+ %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; now you can run "qcollectiongenerator" with the ^
+.qhcp project file in %BUILDDIR%/qthelp, like this:
+ echo.^> qcollectiongenerator %BUILDDIR%\qthelp\ply.qhcp
+ echo.To view the help file:
+ echo.^> assistant -collectionFile %BUILDDIR%\qthelp\ply.qhc
+ goto end
+)
+
+REM devhelp target: run the Sphinx devhelp builder into BUILDDIR/devhelp.
+REM Exits with code 1 on a builder error, otherwise jumps to the end label.
+if "%1" == "devhelp" (
+ %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished.
+ goto end
+)
+
+REM epub target: run the Sphinx epub builder into BUILDDIR/epub.
+REM Exits with code 1 on a builder error, otherwise jumps to the end label.
+if "%1" == "epub" (
+ %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The epub file is in %BUILDDIR%/epub.
+ goto end
+)
+
+REM latex target: run the Sphinx latex builder into BUILDDIR/latex. This only
+REM writes the LaTeX sources; no PDF is produced by this target.
+if "%1" == "latex" (
+ %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
+ goto end
+)
+
+REM latexpdf target: generate the LaTeX sources, then run "make all-pdf"
+REM inside BUILDDIR/latex and return to the directory holding this script.
+if "%1" == "latexpdf" (
+ %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
+ REM Stop if Sphinx failed, like every other target; otherwise make would
+ REM run against missing or stale sources and success would still be printed.
+ if errorlevel 1 exit /b 1
+ cd %BUILDDIR%/latex
+ make all-pdf
+ cd %~dp0
+ echo.
+ echo.Build finished; the PDF files are in %BUILDDIR%/latex.
+ goto end
+)
+
+REM latexpdfja target: generate the LaTeX sources, then run "make all-pdf-ja"
+REM inside BUILDDIR/latex and return to the directory holding this script.
+if "%1" == "latexpdfja" (
+ %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
+ REM Stop if Sphinx failed, like every other target; otherwise make would
+ REM run against missing or stale sources and success would still be printed.
+ if errorlevel 1 exit /b 1
+ cd %BUILDDIR%/latex
+ make all-pdf-ja
+ cd %~dp0
+ echo.
+ echo.Build finished; the PDF files are in %BUILDDIR%/latex.
+ goto end
+)
+
+REM text target: run the Sphinx text builder into BUILDDIR/text.
+REM Exits with code 1 on a builder error, otherwise jumps to the end label.
+if "%1" == "text" (
+ %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The text files are in %BUILDDIR%/text.
+ goto end
+)
+
+REM man target: run the Sphinx man builder into BUILDDIR/man.
+REM Exits with code 1 on a builder error, otherwise jumps to the end label.
+if "%1" == "man" (
+ %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The manual pages are in %BUILDDIR%/man.
+ goto end
+)
+
+REM texinfo target: run the Sphinx texinfo builder into BUILDDIR/texinfo.
+REM Exits with code 1 on a builder error, otherwise jumps to the end label.
+if "%1" == "texinfo" (
+ %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
+ goto end
+)
+
+REM gettext target: run the Sphinx gettext builder into BUILDDIR/locale.
+REM Unlike the other targets it passes I18NSPHINXOPTS instead of
+REM ALLSPHINXOPTS, and its output directory is not named after the builder.
+if "%1" == "gettext" (
+ %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
+ goto end
+)
+
+REM changes target: run the Sphinx changes builder into BUILDDIR/changes.
+REM Exits with code 1 on a builder error, otherwise jumps to the end label.
+if "%1" == "changes" (
+ %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.The overview file is in %BUILDDIR%/changes.
+ goto end
+)
+
+REM linkcheck target: run the Sphinx linkcheck builder into
+REM BUILDDIR/linkcheck. The success message is one echo continued onto the
+REM next line by the trailing caret.
+if "%1" == "linkcheck" (
+ %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Link check complete; look for any errors in the above output ^
+or in %BUILDDIR%/linkcheck/output.txt.
+ goto end
+)
+
+REM doctest target: run the Sphinx doctest builder into BUILDDIR/doctest.
+REM The success message is one echo continued onto the next line by the
+REM trailing caret.
+if "%1" == "doctest" (
+ %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Testing of doctests in the sources finished, look at the ^
+results in %BUILDDIR%/doctest/output.txt.
+ goto end
+)
+
+REM coverage target: run the Sphinx coverage builder into BUILDDIR/coverage.
+REM The success message is one echo continued onto the next line by the
+REM trailing caret.
+if "%1" == "coverage" (
+ %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Testing of coverage in the sources finished, look at the ^
+results in %BUILDDIR%/coverage/python.txt.
+ goto end
+)
+
+REM xml target: run the Sphinx xml builder into BUILDDIR/xml.
+REM Exits with code 1 on a builder error, otherwise jumps to the end label.
+if "%1" == "xml" (
+ %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The XML files are in %BUILDDIR%/xml.
+ goto end
+)
+
+REM pseudoxml target: run the Sphinx pseudoxml builder into
+REM BUILDDIR/pseudoxml. Exits with code 1 on a builder error.
+if "%1" == "pseudoxml" (
+ %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml.
+ goto end
+)
+
+:end
diff --git a/docs/ply.rst b/docs/ply.rst new file mode 100644 index 0000000..2f6a89a --- /dev/null +++ b/docs/ply.rst @@ -0,0 +1,2656 @@ +PLY (Python Lex-Yacc) +===================== + +This document provides an overview of lexing and parsing with PLY. +Given the intrinsic complexity of parsing, I strongly advise +that you read (or at least skim) this entire document before jumping +into a big development project with PLY. + +PLY-4.0 requires Python 3.6 or newer. If you're using an older version +of Python, you're out of luck. Sorry. + +Introduction +------------ + +PLY is a pure-Python implementation of the compiler +construction tools lex and yacc. The main goal of PLY is to stay +fairly faithful to the way in which traditional lex/yacc tools work. +This includes supporting LALR(1) parsing as well as providing +extensive input validation, error reporting, and diagnostics. Thus, +if you've used yacc in another programming language, it should be +relatively straightforward to use PLY. + +Early versions of PLY were developed to support an Introduction to +Compilers Course I taught in 2001 at the University of Chicago. Since +PLY was primarily developed as an instructional tool, you will find it +to be fairly picky about token and grammar rule specification. In +part, this added formality is meant to catch common programming +mistakes made by novice users. However, advanced users will also find +such features to be useful when building complicated grammars for real +programming languages. It should also be noted that PLY does not +provide much in the way of bells and whistles (e.g., automatic +construction of abstract syntax trees, tree traversal, etc.). Nor +would I consider it to be a parsing framework. Instead, you will find +a bare-bones, yet fully capable lex/yacc implementation written +entirely in Python. 
+ +The rest of this document assumes that you are somewhat familiar with +parsing theory, syntax directed translation, and the use of compiler +construction tools such as lex and yacc in other programming +languages. If you are unfamiliar with these topics, you will probably +want to consult an introductory text such as "Compilers: Principles, +Techniques, and Tools", by Aho, Sethi, and Ullman. O'Reilly's "Lex +and Yacc" by John Levine may also be handy. In fact, the O'Reilly +book can be used as a reference for PLY as the concepts are virtually +identical. + +PLY Overview +------------ + +PLY consists of two separate modules; ``lex.py`` and ``yacc.py``, both +of which are found in a Python package called ``ply``. The ``lex.py`` +module is used to break input text into a collection of tokens +specified by a collection of regular expression rules. ``yacc.py`` is +used to recognize language syntax that has been specified in the form +of a context free grammar. + +The two tools are meant to work together. Specifically, ``lex.py`` +provides an interface to produce tokens. ``yacc.py`` uses this to +retrieve tokens and invoke grammar rules. The output of ``yacc.py`` +is often an Abstract Syntax Tree (AST). However, this is entirely up +to the user. If desired, ``yacc.py`` can also be used to implement +simple one-pass compilers. + +Like its Unix counterpart, ``yacc.py`` provides most of the features +you expect including extensive error checking, grammar validation, +support for empty productions, error tokens, and ambiguity resolution +via precedence rules. In fact, almost everything that is possible in +traditional yacc should be supported in PLY. + +The primary difference between ``yacc.py`` and Unix ``yacc`` is that +``yacc.py`` doesn't involve a separate code-generation process. +Instead, PLY relies on reflection (introspection) to build its lexers +and parsers.
Unlike traditional lex/yacc which require a special +input file that is converted into a separate source file, the +specifications given to PLY *are* valid Python programs. This +means that there are no extra source files nor is there a special +compiler construction step (e.g., running yacc to generate Python code +for the compiler). + +Lex +--- + +``lex.py`` is used to tokenize an input string. For example, suppose +you're writing a programming language and a user supplied the +following input string:: + + x = 3 + 42 * (s - t) + +A tokenizer splits the string into individual tokens:: + + 'x','=', '3', '+', '42', '*', '(', 's', '-', 't', ')' + +Tokens are usually given names to indicate what they are. For example:: + + 'ID','EQUALS','NUMBER','PLUS','NUMBER','TIMES', + 'LPAREN','ID','MINUS','ID','RPAREN' + +More specifically, the input is broken into pairs of token types and +values. For example:: + + ('ID','x'), ('EQUALS','='), ('NUMBER','3'), + ('PLUS','+'), ('NUMBER','42'), ('TIMES','*'), + ('LPAREN','('), ('ID','s'), ('MINUS','-'), + ('ID','t'), ('RPAREN',')') + +The specification of tokens is done by writing a series of +regular expression rules. The next section shows how this is done +using ``lex.py``. + +Lex Example +^^^^^^^^^^^ + +The following example shows how ``lex.py`` is used to write a simple tokenizer:: + + # ------------------------------------------------------------ + # calclex.py + # + # tokenizer for a simple expression evaluator for + # numbers and +,-,*,/ + # ------------------------------------------------------------ + import ply.lex as lex + + # List of token names.
This is always required + tokens = ( + 'NUMBER', + 'PLUS', + 'MINUS', + 'TIMES', + 'DIVIDE', + 'LPAREN', + 'RPAREN', + ) + + # Regular expression rules for simple tokens + t_PLUS = r'\+' + t_MINUS = r'-' + t_TIMES = r'\*' + t_DIVIDE = r'/' + t_LPAREN = r'\(' + t_RPAREN = r'\)' + + # A regular expression rule with some action code + def t_NUMBER(t): + r'\d+' + t.value = int(t.value) + return t + + # Define a rule so we can track line numbers + def t_newline(t): + r'\n+' + t.lexer.lineno += len(t.value) + + # A string containing ignored characters (spaces and tabs) + t_ignore = ' \t' + + # Error handling rule + def t_error(t): + print("Illegal character '%s'" % t.value[0]) + t.lexer.skip(1) + + # Build the lexer + lexer = lex.lex() + +To use the lexer, you first need to feed it some input text using +its ``input()`` method. After that, repeated calls +to ``token()`` produce tokens. The following code shows how this +works:: + + # Test it out + data = ''' + 3 + 4 * 10 + + -20 *2 + ''' + + # Give the lexer some input + lexer.input(data) + + # Tokenize + while True: + tok = lexer.token() + if not tok: + break # No more input + print(tok) + +When executed, the example will produce the following output:: + + $ python example.py + LexToken(NUMBER,3,2,1) + LexToken(PLUS,'+',2,3) + LexToken(NUMBER,4,2,5) + LexToken(TIMES,'*',2,7) + LexToken(NUMBER,10,2,10) + LexToken(PLUS,'+',3,14) + LexToken(MINUS,'-',3,16) + LexToken(NUMBER,20,3,18) + LexToken(TIMES,'*',3,20) + LexToken(NUMBER,2,3,21) + +Lexers also support the iteration protocol. So, you can write the +above loop as follows:: + + for tok in lexer: + print(tok) + +The tokens returned by ``lexer.token()`` are instances of +``LexToken``. This object has attributes ``type``, ``value``, +``lineno``, and ``lexpos``. 
The following code shows an +example of accessing these attributes:: + + # Tokenize + while True: + tok = lexer.token() + if not tok: + break # No more input + print(tok.type, tok.value, tok.lineno, tok.lexpos) + +The ``type`` and ``value`` attributes contain the type and +value of the token itself. ``lineno`` and ``lexpos`` contain +information about the location of the token. ``lexpos`` is the +index of the token relative to the start of the input text. + +The tokens list +^^^^^^^^^^^^^^^ + +All lexers must provide a list ``tokens`` that defines all of the +possible token names that can be produced by the lexer. This list is +always required and is used to perform a variety of validation checks. +The tokens list is also used by the ``yacc.py`` module to identify +terminals. + +In the example, the following code specified the token names:: + + tokens = ( + 'NUMBER', + 'PLUS', + 'MINUS', + 'TIMES', + 'DIVIDE', + 'LPAREN', + 'RPAREN', + ) + +Specification of tokens +^^^^^^^^^^^^^^^^^^^^^^^ + +Each token is specified by writing a regular expression rule +compatible with Python's ``re`` module. Each of these rules are +defined by making declarations with a special prefix ``t_`` to +indicate that it defines a token. For simple tokens, the regular +expression can be specified as strings such as this (note: Python raw +strings are used since they are the most convenient way to write +regular expression strings):: + + t_PLUS = r'\+' + +In this case, the name following the ``t_`` must exactly match one of +the names supplied in ``tokens``. If some kind of action needs to be +performed, a token rule can be specified as a function. For example, +this rule matches numbers and converts the string into a Python +integer:: + + def t_NUMBER(t): + r'\d+' + t.value = int(t.value) + return t + +When a function is used, the regular expression rule is specified in +the function documentation string. The function always takes a single +argument which is an instance of ``LexToken``. 
This object has +attributes of ``type`` which is the token type (as a string), +``value`` which is the lexeme (the actual text matched), +``lineno`` which is the current line number, and ``lexpos`` which +is the position of the token relative to the beginning of the input +text. By default, ``type`` is set to the name following the ``t_`` +prefix. The action function can modify the contents of the +``LexToken`` object as appropriate. However, when it is done, the +resulting token should be returned. If no value is returned by the +action function, the token is discarded and the next token +read. + +Internally, ``lex.py`` uses the ``re`` module to do its pattern +matching. Patterns are compiled using the ``re.VERBOSE`` flag which +can be used to help readability. However, be aware that unescaped +whitespace is ignored and comments are allowed in this mode. If your +pattern involves whitespace, make sure you use ``\s``. If you need to +match the ``#`` character, use ``[#]``. + +When building the master regular expression, rules are added in the +following order: + +1. All tokens defined by functions are added in the same order as they + appear in the lexer file. + +2. Tokens defined by strings are added next by sorting them in order + of decreasing regular expression length (longer expressions are added + first). + +Without this ordering, it can be difficult to correctly match certain +types of tokens. For example, if you wanted to have separate tokens +for "=" and "==", you need to make sure that "==" is checked first. +By sorting regular expressions in order of decreasing length, this +problem is solved for rules defined as strings. For functions, the +order can be explicitly controlled since rules appearing first are +checked first. + +To handle reserved words, you should write a single rule to match an +identifier and do a special name lookup in a function like this:: + + reserved = { + 'if' : 'IF', + 'then' : 'THEN', + 'else' : 'ELSE', + 'while' : 'WHILE', + ... 
+ } + + tokens = ['LPAREN','RPAREN',...,'ID'] + list(reserved.values()) + + def t_ID(t): + r'[a-zA-Z_][a-zA-Z_0-9]*' + t.type = reserved.get(t.value,'ID') # Check for reserved words + return t + +This approach greatly reduces the number of regular expression rules +and is likely to make things a little faster. + +Note: You should avoid writing individual rules for reserved words. +For example, if you write rules like this:: + + t_FOR = r'for' + t_PRINT = r'print' + +those rules will be triggered for identifiers that include those words +as a prefix such as "forget" or "printed". This is probably not what +you want. + +Token values +^^^^^^^^^^^^ + +When tokens are returned by lex, they have a value that is stored in +the ``value`` attribute. Normally, the value is the text that was +matched. However, the value can be assigned to any Python object. +For instance, when lexing identifiers, you may want to return both the +identifier name and information from some sort of symbol table. To do +this, you might write a rule like this:: + + def t_ID(t): + ... + # Look up symbol table information and return a tuple + t.value = (t.value, symbol_lookup(t.value)) + ... + return t + +It is important to note that storing data in other attribute names is +*not* recommended. The ``yacc.py`` module only exposes the +contents of the ``value`` attribute. Thus, accessing other attributes +may be unnecessarily awkward. If you need to store multiple values on +a token, assign a tuple, dictionary, or instance to ``value``. + +Discarded tokens +^^^^^^^^^^^^^^^^ + +To discard a token, such as a comment, define a token rule that +returns no value. For example:: + + def t_COMMENT(t): + r'\#.*' + pass + # No return value. Token discarded + +Alternatively, you can include the prefix ``ignore_`` in the token +declaration to force a token to be ignored. 
For example:: + + t_ignore_COMMENT = r'\#.*' + +Be advised that if you are ignoring many different kinds of text, you +may still want to use functions since these provide more precise +control over the order in which regular expressions are matched (i.e., +functions are matched in order of specification whereas strings are +sorted by regular expression length). + +Line numbers and positional information +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +By default, ``lex.py`` knows nothing about line numbers. This is +because ``lex.py`` doesn't know anything about what constitutes a +"line" of input (e.g., the newline character or even if the input is +textual data). To update this information, you need to write a +special rule. In the example, the ``t_newline()`` rule shows how to +do this:: + + # Define a rule so we can track line numbers + def t_newline(t): + r'\n+' + t.lexer.lineno += len(t.value) + +Within the rule, the ``lineno`` attribute of the underlying lexer +``t.lexer`` is updated. After the line number is updated, the token +is discarded since nothing is returned. + +``lex.py`` does not perform any kind of automatic column tracking. +However, it does record positional information related to each token +in the ``lexpos`` attribute. Using this, it is usually possible to +compute column information as a separate step. For instance, just +count backwards until you reach a newline:: + + # Compute column. + # input is the input text string + # token is a token instance + def find_column(input, token): + line_start = input.rfind('\n', 0, token.lexpos) + 1 + return (token.lexpos - line_start) + 1 + +Since column information is often only useful in the context of error +handling, calculating the column position can be performed when needed +as opposed to doing it for each token. Note: If you're parsing a language +where whitespace matters (e.g., Python), it's probably better to match +whitespace as a token instead of ignoring it.
+ +Ignored characters +^^^^^^^^^^^^^^^^^^ + +The special ``t_ignore`` rule is reserved by ``lex.py`` for characters +that should be completely ignored in the input stream. Usually this +is used to skip over whitespace and other non-essential characters. +Although it is possible to define a regular expression rule for +whitespace in a manner similar to ``t_newline()``, the use of +``t_ignore`` provides substantially better lexing performance because +it is handled as a special case and is checked in a much more +efficient manner than the normal regular expression rules. + +The characters given in ``t_ignore`` are not ignored when such +characters are part of other regular expression patterns. For +example, if you had a rule to capture quoted text, that pattern can +include the ignored characters (which will be captured in the normal +way). The main purpose of ``t_ignore`` is to ignore whitespace and +other padding between the tokens that you actually want to parse. + +Literal characters +^^^^^^^^^^^^^^^^^^ + +Literal characters can be specified by defining a variable +``literals`` in your lexing module. For example:: + + literals = [ '+','-','*','/' ] + +or alternatively:: + + literals = "+-*/" + +A literal character is a single character that is returned "as +is" when encountered by the lexer. Literals are checked after all of +the defined regular expression rules. Thus, if a rule starts with one +of the literal characters, it will always take precedence. + +When a literal token is returned, both its ``type`` and ``value`` +attributes are set to the character itself. For example, ``'+'``. + +It's possible to write token functions that perform additional actions +when literals are matched. However, you'll need to set the token type +appropriately. 
For example:: + + literals = [ '{', '}' ] + + def t_lbrace(t): + r'\{' + t.type = '{' # Set token type to the expected literal + return t + + def t_rbrace(t): + r'\}' + t.type = '}' # Set token type to the expected literal + return t + +Error handling +^^^^^^^^^^^^^^ + +The ``t_error()`` function is used to handle lexing errors that occur +when illegal characters are detected. In this case, the ``t.value`` +attribute contains the rest of the input string that has not been +tokenized. In the example, the error function was defined as +follows:: + + # Error handling rule + def t_error(t): + print("Illegal character '%s'" % t.value[0]) + t.lexer.skip(1) + +In this case, we print the offending character and skip ahead +one character by calling ``t.lexer.skip(1)``. + +EOF Handling +^^^^^^^^^^^^ + +The ``t_eof()`` function is used to handle an end-of-file (EOF) +condition in the input. As input, it receives a token type ``'eof'`` +with the ``lineno`` and ``lexpos`` attributes set appropriately. The +main use of this function is to provide more input to the lexer so that +it can continue to parse. Here is an example of how this works:: + + # EOF handling rule + def t_eof(t): + # Get more input (Example) + more = input('... ') + if more: + t.lexer.input(more) + return t.lexer.token() + return None + +The EOF function should return the next available token (by calling +``t.lexer.token()``) or ``None`` to indicate no more data. Be +aware that setting more input with the ``t.lexer.input()`` method +does NOT reset the lexer state or the ``lineno`` attribute used for +position tracking. The ``lexpos`` attribute is reset so be aware of +that if you're using it in error reporting. + +Building and using the lexer +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +To build the lexer, the function ``lex.lex()`` is used.
For example:: + + lexer = lex.lex() + +This function uses Python reflection (or introspection) to read the +regular expression rules out of the calling context and build the +lexer. Once the lexer has been built, two methods can be used to +control the lexer. + +``lexer.input(data)``. Reset the lexer and store a new input string. + +``lexer.token()``. Return the next token. Returns a special +``LexToken`` instance on success or None if the end of the input text +has been reached. + +The @TOKEN decorator +^^^^^^^^^^^^^^^^^^^^ + +In some applications, you may want to define tokens as a series of +more complex regular expression rules. For example:: + + digit = r'([0-9])' + nondigit = r'([_A-Za-z])' + identifier = r'(' + nondigit + r'(' + digit + r'|' + nondigit + r')*)' + + def t_ID(t): + # want docstring to be identifier above. ????? + ... + +In this case, we want the regular expression rule for ``ID`` to be one +of the variables above. However, there is no way to directly specify +this using a normal documentation string. To solve this problem, you +can use the ``@TOKEN`` decorator. For example:: + + from ply.lex import TOKEN + + @TOKEN(identifier) + def t_ID(t): + ... + +This will attach ``identifier`` to the docstring for ``t_ID()`` +allowing ``lex.py`` to work normally. Naturally, you could use ``@TOKEN`` +on all functions as an alternative to using doc-strings. + +Debugging +^^^^^^^^^ + +For the purpose of debugging, you can run ``lex()`` in a debugging +mode as follows:: + + lexer = lex.lex(debug=True) + +This will produce various sorts of debugging information including all +of the added rules, the master regular expressions used by the lexer, +and tokens generated during lexing. + +In addition, ``lex.py`` comes with a simple main function which will +either tokenize input read from standard input or from a file +specified on the command line.
To use it, put this in your +lexer:: + + if __name__ == '__main__': + lex.runmain() + +Please refer to the "Debugging" section near the end for some more +advanced details of debugging. + +Alternative specification of lexers +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +As shown in the example, lexers are specified all within one Python +module. If you want to put token rules in a different module from the +one in which you invoke ``lex()``, use the ``module`` keyword +argument. + +For example, you might have a dedicated module that just contains the +token rules:: + + # module: tokrules.py + # This module just contains the lexing rules + + # List of token names. This is always required + tokens = ( + 'NUMBER', + 'PLUS', + 'MINUS', + 'TIMES', + 'DIVIDE', + 'LPAREN', + 'RPAREN', + ) + + # Regular expression rules for simple tokens + t_PLUS = r'\+' + t_MINUS = r'-' + t_TIMES = r'\*' + t_DIVIDE = r'/' + t_LPAREN = r'\(' + t_RPAREN = r'\)' + + # A regular expression rule with some action code + def t_NUMBER(t): + r'\d+' + t.value = int(t.value) + return t + + # Define a rule so we can track line numbers + def t_newline(t): + r'\n+' + t.lexer.lineno += len(t.value) + + # A string containing ignored characters (spaces and tabs) + t_ignore = ' \t' + + # Error handling rule + def t_error(t): + print("Illegal character '%s'" % t.value[0]) + t.lexer.skip(1) + +Now, if you wanted to build a tokenizer from these rules from within a +different module, you would do the following (shown for Python +interactive mode):: + + >>> import tokrules + >>> lexer = lex.lex(module=tokrules) + >>> lexer.input("3 + 4") + >>> lexer.token() + LexToken(NUMBER,3,1,0) + >>> lexer.token() + LexToken(PLUS,'+',1,2) + >>> lexer.token() + LexToken(NUMBER,4,1,4) + >>> lexer.token() + None + >>> + +The ``module`` option can also be used to define lexers from instances +of a class. For example:: + + import ply.lex as lex + + class MyLexer(object): + # List of token names.
This is always required + tokens = ( + 'NUMBER', + 'PLUS', + 'MINUS', + 'TIMES', + 'DIVIDE', + 'LPAREN', + 'RPAREN', + ) + + # Regular expression rules for simple tokens + t_PLUS = r'\+' + t_MINUS = r'-' + t_TIMES = r'\*' + t_DIVIDE = r'/' + t_LPAREN = r'\(' + t_RPAREN = r'\)' + + # A regular expression rule with some action code + # Note addition of self parameter since we're in a class + def t_NUMBER(self,t): + r'\d+' + t.value = int(t.value) + return t + + # Define a rule so we can track line numbers + def t_newline(self,t): + r'\n+' + t.lexer.lineno += len(t.value) + + # A string containing ignored characters (spaces and tabs) + t_ignore = ' \t' + + # Error handling rule + def t_error(self,t): + print("Illegal character '%s'" % t.value[0]) + t.lexer.skip(1) + + # Build the lexer + def build(self,**kwargs): + self.lexer = lex.lex(module=self, **kwargs) + + # Test it output + def test(self,data): + self.lexer.input(data) + while True: + tok = self.lexer.token() + if not tok: + break + print(tok) + + # Build the lexer and try it out + m = MyLexer() + m.build() # Build the lexer + m.test("3 + 4") # Test it + + +When building a lexer from class, *you should construct the lexer +from an instance of the class*, not the class object itself. This +is because PLY only works properly if the lexer actions are defined by +bound-methods. + +When using the ``module`` option to ``lex()``, PLY collects symbols +from the underlying object using the ``dir()`` function. There is no +direct access to the ``__dict__`` attribute of the object supplied as +a module value. + +Finally, if you want to keep things nicely encapsulated, but don't +want to use a full-fledged class definition, lexers can be defined +using closures. For example:: + + import ply.lex as lex + + # List of token names. 
This is always required + tokens = ( + 'NUMBER', + 'PLUS', + 'MINUS', + 'TIMES', + 'DIVIDE', + 'LPAREN', + 'RPAREN', + ) + + def MyLexer(): + # Regular expression rules for simple tokens + t_PLUS = r'\+' + t_MINUS = r'-' + t_TIMES = r'\*' + t_DIVIDE = r'/' + t_LPAREN = r'\(' + t_RPAREN = r'\)' + + # A regular expression rule with some action code + def t_NUMBER(t): + r'\d+' + t.value = int(t.value) + return t + + # Define a rule so we can track line numbers + def t_newline(t): + r'\n+' + t.lexer.lineno += len(t.value) + + # A string containing ignored characters (spaces and tabs) + t_ignore = ' \t' + + # Error handling rule + def t_error(t): + print("Illegal character '%s'" % t.value[0]) + t.lexer.skip(1) + + # Build the lexer from my environment and return it + return lex.lex() + +Important note: If you are defining a lexer using a class or closure, +be aware that PLY still requires you to only define a single lexer per +module (source file). There are extensive validation/error checking +parts of PLY that may falsely report error messages if you don't +follow this rule. + +Maintaining state +^^^^^^^^^^^^^^^^^ + +In your lexer, you may want to maintain a variety of state +information. This might include mode settings, symbol tables, and +other details. As an example, suppose that you wanted to keep track +of how many NUMBER tokens had been encountered. + +One way to do this is to keep a set of global variables in the module +where you created the lexer. For example:: + + num_count = 0 + def t_NUMBER(t): + r'\d+' + global num_count + num_count += 1 + t.value = int(t.value) + return t + +If you don't like the use of a global variable, another place to store +information is inside the Lexer object created by ``lex()``. To do this, +you can use the ``lexer`` attribute of tokens passed to the various +rules.
For example:: + + def t_NUMBER(t): + r'\d+' + t.lexer.num_count += 1 # Note use of lexer attribute + t.value = int(t.value) + return t + + lexer = lex.lex() + lexer.num_count = 0 # Set the initial count + +This latter approach has the advantage of being simple and working +correctly in applications where multiple instantiations of a given +lexer exist in the same application. However, this might also feel +like a gross violation of encapsulation to OO purists. Just to put +your mind at some ease, all internal attributes of the lexer (with the +exception of ``lineno``) have names that are prefixed by ``lex`` +(e.g., ``lexdata``,``lexpos``, etc.). Thus, it is perfectly safe to +store attributes in the lexer that don't have names starting with that +prefix or a name that conflicts with one of the predefined methods +(e.g., ``input()``, ``token()``, etc.). + +If you don't like assigning values on the lexer object, you can define +your lexer as a class as shown in the previous section:: + + class MyLexer: + ... + def t_NUMBER(self,t): + r'\d+' + self.num_count += 1 + t.value = int(t.value) + return t + + def build(self, **kwargs): + self.lexer = lex.lex(object=self,**kwargs) + + def __init__(self): + self.num_count = 0 + +The class approach may be the easiest to manage if your application is +going to be creating multiple instances of the same lexer and you need +to manage a lot of state. + +State can also be managed through closures. For example:: + + def MyLexer(): + num_count = 0 + ... + def t_NUMBER(t): + r'\d+' + nonlocal num_count + num_count += 1 + t.value = int(t.value) + return t + ... + +Lexer cloning +^^^^^^^^^^^^^ + +If necessary, a lexer object can be duplicated by invoking its +``clone()`` method. For example:: + + lexer = lex.lex() + ... + newlexer = lexer.clone() + +When a lexer is cloned, the copy is exactly identical to the original +lexer including any input text and internal state. 
However, the clone +allows a different set of input text to be supplied which may be +processed separately. This may be useful in situations when you are +writing a parser/compiler that involves recursive or reentrant +processing. For instance, if you needed to scan ahead in the input +for some reason, you could create a clone and use it to look ahead. +Or, if you were implementing some kind of preprocessor, cloned lexers +could be used to handle different input files. + +Creating a clone is different than calling ``lex.lex()`` in that +PLY doesn't regenerate any of the internal tables or regular expressions. + +Special considerations need to be made when cloning lexers that also +maintain their own internal state using classes or closures. Namely, +you need to be aware that the newly created lexers will share all of +this state with the original lexer. For example, if you defined a +lexer as a class and did this:: + + m = MyLexer() + a = lex.lex(object=m) # Create a lexer + + b = a.clone() # Clone the lexer + +Then both ``a`` and ``b`` are going to be bound to the same object +``m`` and any changes to ``m`` will be reflected in both lexers. It's +important to emphasize that ``clone()`` is only meant to create a new +lexer that reuses the regular expressions and environment of another +lexer. If you need to make a totally new copy of a lexer, then call +``lex()`` again. + +Internal lexer state +^^^^^^^^^^^^^^^^^^^^ + +A Lexer object ``lexer`` has a number of internal attributes that may be useful in certain +situations. + +``lexer.lexpos`` + This attribute is an integer that contains the current position + within the input text. If you modify the value, it will change + the result of the next call to ``token()``. Within token rule + functions, this points to the first character *after* the + matched text. If the value is modified within a rule, the next + returned token will be matched at the new position. 
+ +``lexer.lineno`` + The current value of the line number attribute stored in the + lexer. PLY only specifies that the attribute exists---it never + sets, updates, or performs any processing with it. If you want to + track line numbers, you will need to add code yourself (see the + section on line numbers and positional information). + +``lexer.lexdata`` + The current input text stored in the lexer. This is the string + passed with the ``input()`` method. It would probably be a bad + idea to modify this unless you really know what you're doing. + +``lexer.lexmatch`` + This is the raw ``Match`` object returned by the Python + ``re.match()`` function (used internally by PLY) for the current + token. If you have written a regular expression that contains + named groups, you can use this to retrieve those values. Note: + This attribute is only updated when tokens are defined and + processed by functions. + +Conditional lexing and start conditions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In advanced parsing applications, it may be useful to have different +lexing states. For instance, you may want the occurrence of a certain +token or syntactic construct to trigger a different kind of lexing. +PLY supports a feature that allows the underlying lexer to be put into +a series of different states. Each state can have its own tokens, +lexing rules, and so forth. The implementation is based largely on +the "start condition" feature of GNU flex. Details of this can be +found at http://flex.sourceforge.net/manual/Start-Conditions.html + +To define a new lexing state, it must first be declared. This is done +by including a "states" declaration in your lex file. For example:: + + states = ( + ('foo','exclusive'), + ('bar','inclusive'), + ) + +This declaration declares two states, ``'foo'`` and ``'bar'``. States +may be of two types; ``'exclusive'`` and ``'inclusive'``. An +exclusive state completely overrides the default behavior of the +lexer. 
That is, lex will only return tokens and apply rules defined +specifically for that state. An inclusive state adds additional +tokens and rules to the default set of rules. Thus, lex will return +the tokens defined by default in addition to those defined for +the inclusive state. + +Once a state has been declared, tokens and rules are declared by +including the state name in the token/rule declaration. For example:: + + t_foo_NUMBER = r'\d+' # Token 'NUMBER' in state 'foo' + t_bar_ID = r'[a-zA-Z_][a-zA-Z0-9_]*' # Token 'ID' in state 'bar' + + def t_foo_newline(t): + r'\n' + t.lexer.lineno += 1 + +A token can be declared in multiple states by including multiple state +names in the declaration. For example:: + + t_foo_bar_NUMBER = r'\d+' # Defines token 'NUMBER' in both state 'foo' and 'bar' + +Alternatively, a token can be declared in all states using 'ANY' in +the name:: + + t_ANY_NUMBER = r'\d+' # Defines a token 'NUMBER' in all states + +If no state name is supplied, as is normally the case, the token is +associated with a special state ``'INITIAL'``. For example, these two +declarations are identical:: + + t_NUMBER = r'\d+' + t_INITIAL_NUMBER = r'\d+' + + +States are also associated with the special ``t_ignore``, +``t_error()``, and ``t_eof()`` declarations. For example, if a state +treats these differently, you can declare:: + + t_foo_ignore = " \t\n" # Ignored characters for state 'foo' + + def t_bar_error(t): # Special error handler for state 'bar' + pass + +By default, lexing operates in the ``'INITIAL'`` state. This state +includes all of the normally defined tokens. For users who aren't +using different states, this fact is completely transparent. If, +during lexing or parsing, you want to change the lexing state, use the +``begin()`` method. For example:: + + def t_begin_foo(t): + r'start_foo' + t.lexer.begin('foo') # Starts 'foo' state + +To get out of a state, you use ``begin()`` to switch back to the +initial state.
For example:: + + def t_foo_end(t): + r'end_foo' + t.lexer.begin('INITIAL') # Back to the initial state + +The management of states can also be done with a stack. For example:: + + def t_begin_foo(t): + r'start_foo' + t.lexer.push_state('foo') # Starts 'foo' state + + def t_foo_end(t): + r'end_foo' + t.lexer.pop_state() # Back to the previous state + + +The use of a stack would be useful in situations where there are many +ways of entering a new lexing state and you merely want to go back to +the previous state afterwards. + +An example might help clarify. Suppose you were writing a parser and +you wanted to grab sections of arbitrary C code enclosed by curly +braces. That is, whenever you encounter a starting brace ``'{'``, you +want to read all of the enclosed code up to the ending brace ``'}'`` and +return it as a string. Doing this with a normal regular expression +rule is nearly (if not actually) impossible. This is because braces +can be nested and can be included in comments and strings. Thus, +matching up to the first matching ``'}'`` character isn't good +enough. Here is how you might use lexer states to do this:: + + # Declare the state + states = ( + ('ccode','exclusive'), + ) + + # Match the first {. Enter ccode state. 
+ def t_ccode(t): + r'\{' + t.lexer.code_start = t.lexer.lexpos # Record the starting position + t.lexer.level = 1 # Initial brace level + t.lexer.begin('ccode') # Enter 'ccode' state + + # Rules for the ccode state + def t_ccode_lbrace(t): + r'\{' + t.lexer.level +=1 + + def t_ccode_rbrace(t): + r'\}' + t.lexer.level -=1 + + # If closing brace, return the code fragment + if t.lexer.level == 0: + t.value = t.lexer.lexdata[t.lexer.code_start:t.lexer.lexpos+1] + t.type = "CCODE" + t.lexer.lineno += t.value.count('\n') + t.lexer.begin('INITIAL') + return t + + # C or C++ comment (ignore) + def t_ccode_comment(t): + r'(/\*(.|\n)*?\*/)|(//.*)' + pass + + # C string + def t_ccode_string(t): + r'\"([^\\\n]|(\\.))*?\"' + + # C character literal + def t_ccode_char(t): + r'\'([^\\\n]|(\\.))*?\'' + + # Any sequence of non-whitespace characters (not braces, strings) + def t_ccode_nonspace(t): + r'[^\s\{\}\'\"]+' + + # Ignored characters (whitespace) + t_ccode_ignore = " \t\n" + + # For bad characters, we just skip over it + def t_ccode_error(t): + t.lexer.skip(1) + +In this example, the occurrence of the first '{' causes the lexer to +record the starting position and enter a new state ``'ccode'``. A +collection of rules then match various parts of the input that follow +(comments, strings, etc.). All of these rules merely discard the +token (by not returning a value). However, if the closing right brace +is encountered, the rule ``t_ccode_rbrace`` collects all of the code +(using the earlier recorded starting position), stores it, and returns +a token 'CCODE' containing all of that text. When returning the +token, the lexing state is restored back to its initial state. + +Miscellaneous Issues +^^^^^^^^^^^^^^^^^^^^ + +- The lexer requires input to be supplied as a single input string. + Since most machines have more than enough memory, this rarely presents + a performance concern. 
However, it means that the lexer currently + can't be used with streaming data such as open files or sockets. This + limitation is primarily a side-effect of using the ``re`` module. You + might be able to work around this by implementing an appropriate + ``t_eof()`` end-of-file handling rule. The main complication here is + that you'll probably need to ensure that data is fed to the lexer in + such a way that it isn't split in the middle of a token. + +- If you need to supply optional flags to the ``re.compile()`` function, + use the ``reflags`` option to lex. For example:: + + lex.lex(reflags=re.UNICODE | re.VERBOSE) + + Note: by default, ``reflags`` is set to ``re.VERBOSE``. If you provide + your own flags, you may need to include this for PLY to preserve its normal behavior. + +- If you are going to create a hand-written lexer and you plan to use it with ``yacc.py``, + it only needs to conform to the following requirements: + + 1. It must provide a ``token()`` method that returns the next token or + ``None`` if no more tokens are available. + + 2. The ``token()`` method must return an object ``tok`` that has + ``type`` and ``value`` attributes. If line number tracking is + being used, then the token should also define a ``lineno`` + attribute. + +Parsing basics +-------------- + +``yacc.py`` is used to parse language syntax. Before showing an +example, there are a few important bits of background that must be +mentioned. First, *syntax* is usually specified in terms of a +BNF grammar. For example, if you wanted to parse simple arithmetic +expressions, you might first write an unambiguous grammar +specification like this:: + + expression : expression + term + | expression - term + | term + + term : term * factor + | term / factor + | factor + + factor : NUMBER + | ( expression ) + +In the grammar, symbols such as ``NUMBER``, ``+``, ``-``, ``*``, and +``/`` are known as *terminals* and correspond to input +tokens.
Identifiers such as ``term`` and ``factor`` refer to grammar +rules comprised of a collection of terminals and other rules. These +identifiers are known as *non-terminals*. + +The semantic behavior of a language is often specified using a +technique known as syntax directed translation. In syntax directed +translation, attributes are attached to each symbol in a given grammar +rule along with an action. Whenever a particular grammar rule is +recognized, the action describes what to do. For example, given the +expression grammar above, you might write the specification for a +simple calculator like this:: + + Grammar Action + -------------------------------- -------------------------------------------- + expression0 : expression1 + term expression0.val = expression1.val + term.val + | expression1 - term expression0.val = expression1.val - term.val + | term expression0.val = term.val + + term0 : term1 * factor term0.val = term1.val * factor.val + | term1 / factor term0.val = term1.val / factor.val + | factor term0.val = factor.val + + factor : NUMBER factor.val = int(NUMBER.lexval) + | ( expression ) factor.val = expression.val + +A good way to think about syntax directed translation is to view each +symbol in the grammar as a kind of object. Associated with each symbol +is a value representing its "state" (for example, the ``val`` +attribute above). Semantic actions are then expressed as a collection +of functions or methods that operate on the symbols and associated +values. + +Yacc uses a parsing technique known as LR-parsing or shift-reduce +parsing. LR parsing is a bottom up technique that tries to recognize +the right-hand-side of various grammar rules. Whenever a valid +right-hand-side is found in the input, the appropriate action code is +triggered and the grammar symbols are replaced by the grammar symbol +on the left-hand-side. 
+ +LR parsing is commonly implemented by shifting grammar symbols onto a +stack and looking at the stack and the next input token for patterns +that match one of the grammar rules. The details of the algorithm can +be found in a compiler textbook, but the following example illustrates +the steps that are performed if you wanted to parse the expression ``3 ++ 5 * (10 - 20)`` using the grammar defined above. In the example, +the special symbol ``$`` represents the end of input:: + + Step Symbol Stack Input Tokens Action + ---- --------------------- --------------------- ------------------------------- + 1 3 + 5 * ( 10 - 20 )$ Shift 3 + 2 3 + 5 * ( 10 - 20 )$ Reduce factor : NUMBER + 3 factor + 5 * ( 10 - 20 )$ Reduce term : factor + 4 term + 5 * ( 10 - 20 )$ Reduce expr : term + 5 expr + 5 * ( 10 - 20 )$ Shift + + 6 expr + 5 * ( 10 - 20 )$ Shift 5 + 7 expr + 5 * ( 10 - 20 )$ Reduce factor : NUMBER + 8 expr + factor * ( 10 - 20 )$ Reduce term : factor + 9 expr + term * ( 10 - 20 )$ Shift * + 10 expr + term * ( 10 - 20 )$ Shift ( + 11 expr + term * ( 10 - 20 )$ Shift 10 + 12 expr + term * ( 10 - 20 )$ Reduce factor : NUMBER + 13 expr + term * ( factor - 20 )$ Reduce term : factor + 14 expr + term * ( term - 20 )$ Reduce expr : term + 15 expr + term * ( expr - 20 )$ Shift - + 16 expr + term * ( expr - 20 )$ Shift 20 + 17 expr + term * ( expr - 20 )$ Reduce factor : NUMBER + 18 expr + term * ( expr - factor )$ Reduce term : factor + 19 expr + term * ( expr - term )$ Reduce expr : expr - term + 20 expr + term * ( expr )$ Shift ) + 21 expr + term * ( expr ) $ Reduce factor : (expr) + 22 expr + term * factor $ Reduce term : term * factor + 23 expr + term $ Reduce expr : expr + term + 24 expr $ Reduce expr + 25 $ Success! + +When parsing the expression, an underlying state machine and the +current input token determine what happens next. If the next token +looks like part of a valid grammar rule (based on other items on the +stack), it is generally shifted onto the stack. 
If the top of the +stack contains a valid right-hand-side of a grammar rule, it is +usually "reduced" and the symbols replaced with the symbol on the +left-hand-side. When this reduction occurs, the appropriate action is +triggered (if defined). If the input token can't be shifted and the +top of stack doesn't match any grammar rules, a syntax error has +occurred and the parser must take some kind of recovery step (or bail +out). A parse is only successful if the parser reaches a state where +the symbol stack is empty and there are no more input tokens. + +It is important to note that the underlying implementation is built +around a large finite-state machine that is encoded in a collection of +tables. The construction of these tables is non-trivial and +beyond the scope of this discussion. However, subtle details of this +process explain why, in the example above, the parser chooses to shift +a token onto the stack in step 9 rather than reducing the +rule ``expr : expr + term``. + +Yacc +---- + +The ``ply.yacc`` module implements the parsing component of PLY. +The name "yacc" stands for "Yet Another Compiler Compiler" and is +borrowed from the Unix tool of the same name. + +An example +^^^^^^^^^^ + +Suppose you wanted to make a grammar for simple arithmetic expressions +as previously described. Here is how you would do it with +``yacc.py``:: + + # Yacc example + + import ply.yacc as yacc + + # Get the token map from the lexer. This is required. 
+ from calclex import tokens + + def p_expression_plus(p): + 'expression : expression PLUS term' + p[0] = p[1] + p[3] + + def p_expression_minus(p): + 'expression : expression MINUS term' + p[0] = p[1] - p[3] + + def p_expression_term(p): + 'expression : term' + p[0] = p[1] + + def p_term_times(p): + 'term : term TIMES factor' + p[0] = p[1] * p[3] + + def p_term_div(p): + 'term : term DIVIDE factor' + p[0] = p[1] / p[3] + + def p_term_factor(p): + 'term : factor' + p[0] = p[1] + + def p_factor_num(p): + 'factor : NUMBER' + p[0] = p[1] + + def p_factor_expr(p): + 'factor : LPAREN expression RPAREN' + p[0] = p[2] + + # Error rule for syntax errors + def p_error(p): + print("Syntax error in input!") + + # Build the parser + parser = yacc.yacc() + + while True: + try: + s = input('calc > ') + except EOFError: + break + if not s: continue + result = parser.parse(s) + print(result) + +In this example, each grammar rule is defined by a Python function +where the docstring to that function contains the appropriate +context-free grammar specification. The statements that make up the +function body implement the semantic actions of the rule. Each +function accepts a single argument ``p`` that is a sequence containing +the values of each grammar symbol in the corresponding rule. The +values of ``p[i]`` are mapped to grammar symbols as shown here:: + + def p_expression_plus(p): + 'expression : expression PLUS term' + # ^ ^ ^ ^ + # p[0] p[1] p[2] p[3] + + p[0] = p[1] + p[3] + + +For tokens, the "value" of the corresponding ``p[i]`` is the +*same* as the ``t.value`` attribute assigned in the lexer +module. For non-terminals, the value is determined by whatever is +placed in ``p[0]`` when rules are reduced. This value can be anything +at all. However, it is probably most common for the value to be a simple +Python type, a tuple, or an instance. In this example, we are relying +on the fact that the ``NUMBER`` token stores an integer value in its +value field.
All of the other rules perform various types of +integer operations and propagate the result. + +Note: The use of negative indices has a special meaning in +yacc---specifically, ``p[-1]`` does not have the same value as ``p[3]`` in +this example. Please see the section on "Embedded Actions" for +further details. + +The first rule defined in the yacc specification determines the +starting grammar symbol (in this case, a rule for ``expression`` +appears first). Whenever the starting rule is reduced by the parser +and no more input is available, parsing stops and the final value is +returned (this value will be whatever the top-most rule placed in +``p[0]``). Note: an alternative starting symbol can be specified using +the ``start`` keyword argument to ``yacc()``. + +The ``p_error(p)`` rule is defined to catch syntax errors. See the +error handling section below for more detail. + +To build the parser, call the ``yacc.yacc()`` function. This function +looks at the module and attempts to construct all of the LR parsing +tables for the grammar you have specified. + +If any errors are detected in your grammar specification, ``yacc.py`` +will produce diagnostic messages and possibly raise an exception. +Some of the errors that can be detected include: + +- Duplicated function names (if more than one rule function has the same name in the grammar file). +- Shift/reduce and reduce/reduce conflicts generated by ambiguous grammars. +- Badly specified grammar rules. +- Infinite recursion (rules that can never terminate). +- Unused rules and tokens. +- Undefined rules and tokens. + +The next few sections discuss grammar specification in more detail. + +The final part of the example shows how to actually run the parser +created by ``yacc()``. To run the parser, you have to call the +``parse()`` method with a string of input text. This will run all of the +grammar rules and return the result of the entire parse.
This returned +result is the value assigned to ``p[0]`` in the starting grammar rule. + +Combining Grammar Rule Functions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When grammar rules are similar, they can be combined into a single +function. For example, consider the two rules in our earlier +example:: + + def p_expression_plus(p): + 'expression : expression PLUS term' + p[0] = p[1] + p[3] + + def p_expression_minus(p): + 'expression : expression MINUS term' + p[0] = p[1] - p[3] + +Instead of writing two functions, you might write a single function +like this:: + + def p_expression(p): + '''expression : expression PLUS term + | expression MINUS term''' + if p[2] == '+': + p[0] = p[1] + p[3] + elif p[2] == '-': + p[0] = p[1] - p[3] + +In general, the doc string for any given function can contain multiple +grammar rules. So, it would have also been legal (although possibly +confusing) to write this:: + + def p_binary_operators(p): + '''expression : expression PLUS term + | expression MINUS term + term : term TIMES factor + | term DIVIDE factor''' + if p[2] == '+': + p[0] = p[1] + p[3] + elif p[2] == '-': + p[0] = p[1] - p[3] + elif p[2] == '*': + p[0] = p[1] * p[3] + elif p[2] == '/': + p[0] = p[1] / p[3] + +When combining grammar rules into a single function, it is usually a +good idea for all of the rules to have a similar structure (e.g., the +same number of terms). Otherwise, the corresponding action code may +be more complicated than necessary. However, it is possible to handle +simple cases using len(). For example:: + + def p_expressions(p): + '''expression : expression MINUS expression + | MINUS expression''' + if (len(p) == 4): + p[0] = p[1] - p[3] + elif (len(p) == 3): + p[0] = -p[2] + +If parsing performance is a concern, you should resist the urge to put +too much conditional processing into a single grammar rule as shown in +these examples.
When you add checks to see which grammar rule is +being handled, you are actually duplicating the work that the parser +has already performed (i.e., the parser already knows exactly what +rule it matched). You can eliminate this overhead by using a separate +``p_rule()`` function for each grammar rule. + +Character Literals +^^^^^^^^^^^^^^^^^^ + +If desired, a grammar may contain tokens defined as single character +literals. For example:: + + def p_binary_operators(p): + '''expression : expression '+' term + | expression '-' term + term : term '*' factor + | term '/' factor''' + if p[2] == '+': + p[0] = p[1] + p[3] + elif p[2] == '-': + p[0] = p[1] - p[3] + elif p[2] == '*': + p[0] = p[1] * p[3] + elif p[2] == '/': + p[0] = p[1] / p[3] + +A character literal must be enclosed in quotes such as ``'+'``. In +addition, if literals are used, they must be declared in the +corresponding ``lex`` file through the use of a special ``literals`` +declaration:: + + # Literals. Should be placed in module given to lex() + literals = ['+','-','*','/' ] + +Character literals are limited to a single character. Thus, it is not +legal to specify literals such as ``'<='`` or ``'=='``. For this, +use the normal lexing rules (e.g., define a rule such as ``t_EQ = +r'=='``). + +Empty Productions +^^^^^^^^^^^^^^^^^ + +``yacc.py`` can handle empty productions by defining a rule like this:: + + def p_empty(p): + 'empty :' + pass + +Now to use the empty production, use 'empty' as a symbol. For +example:: + + def p_optitem(p): + '''optitem : item + | empty''' + ... + +Note: You can write empty rules anywhere by specifying an empty +right hand side. However, I personally find that writing an "empty" +rule and using "empty" to denote an empty production is easier to read +and more clearly states your intentions. + +Changing the starting symbol +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Normally, the first rule found in a yacc specification defines the +starting grammar rule (top level rule).
To change this, supply +a ``start`` specifier in your file. For example:: + + start = 'foo' + + def p_bar(p): + 'bar : A B' + + # This is the starting rule due to the start specifier above + def p_foo(p): + 'foo : bar X' + ... + +The use of a ``start`` specifier may be useful during debugging +since you can use it to have yacc build a subset of a larger grammar. +For this purpose, it is also possible to specify a starting symbol as +an argument to ``yacc()``. For example:: + + parser = yacc.yacc(start='foo') + +Dealing With Ambiguous Grammars +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The expression grammar given in the earlier example has been written +in a special format to eliminate ambiguity. However, in many +situations, it is extremely difficult or awkward to write grammars in +this format. A much more natural way to express the grammar is in a +more compact form like this:: + + expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression + | LPAREN expression RPAREN + | NUMBER + +Unfortunately, this grammar specification is ambiguous. For example, +if you are parsing the string "3 * 4 + 5", there is no way to tell how +the operators are supposed to be grouped. For example, does the +expression mean "(3 * 4) + 5" or is it "3 * (4+5)"? + +When an ambiguous grammar is given to ``yacc.py`` it will print +messages about "shift/reduce conflicts" or "reduce/reduce conflicts". +A shift/reduce conflict is caused when the parser generator can't +decide whether or not to reduce a rule or shift a symbol on the +parsing stack. 
For example, consider the string "3 * 4 + 5" and the +internal parsing stack:: + + Step Symbol Stack Input Tokens Action + ---- --------------------- --------------------- ------------------------------- + 1 $ 3 * 4 + 5$ Shift 3 + 2 $ 3 * 4 + 5$ Reduce : expression : NUMBER + 3 $ expr * 4 + 5$ Shift * + 4 $ expr * 4 + 5$ Shift 4 + 5 $ expr * 4 + 5$ Reduce: expression : NUMBER + 6 $ expr * expr + 5$ SHIFT/REDUCE CONFLICT ???? + +In this case, when the parser reaches step 6, it has two options. One +is to reduce the rule ``expr : expr * expr`` on the stack. The other +option is to shift the token ``+`` on the stack. Both options are +perfectly legal from the rules of the context-free-grammar. + +By default, all shift/reduce conflicts are resolved in favor of +shifting. Therefore, in the above example, the parser will always +shift the ``+`` instead of reducing. Although this strategy works in +many cases (for example, the case of "if-then" versus "if-then-else"), +it is not enough for arithmetic expressions. In fact, in the above +example, the decision to shift ``+`` is completely wrong---we should +have reduced ``expr * expr`` since multiplication has higher +mathematical precedence than addition. + +To resolve ambiguity, especially in expression grammars, ``yacc.py`` +allows individual tokens to be assigned a precedence level and +associativity. This is done by adding a variable ``precedence`` to +the grammar file like this:: + + precedence = ( + ('left', 'PLUS', 'MINUS'), + ('left', 'TIMES', 'DIVIDE'), + ) + +This declaration specifies that ``PLUS``/``MINUS`` have the same +precedence level and are left-associative and that +``TIMES``/``DIVIDE`` have the same precedence and are +left-associative. Within the ``precedence`` declaration, tokens are +ordered from lowest to highest precedence. Thus, this declaration +specifies that ``TIMES``/``DIVIDE`` have higher precedence than +``PLUS``/``MINUS`` (since they appear later in the precedence +specification). 
+ +The precedence specification works by associating a numerical +precedence level value and associativity direction to the listed +tokens. For example, in the above example you get:: + + PLUS : level = 1, assoc = 'left' + MINUS : level = 1, assoc = 'left' + TIMES : level = 2, assoc = 'left' + DIVIDE : level = 2, assoc = 'left' + +These values are then used to attach a numerical precedence value and +associativity direction to each grammar rule. *This is always +determined by looking at the precedence of the right-most terminal +symbol.* For example:: + + expression : expression PLUS expression # level = 1, left + | expression MINUS expression # level = 1, left + | expression TIMES expression # level = 2, left + | expression DIVIDE expression # level = 2, left + | LPAREN expression RPAREN # level = None (not specified) + | NUMBER # level = None (not specified) + +When shift/reduce conflicts are encountered, the parser generator +resolves the conflict by looking at the precedence rules and +associativity specifiers. + +1. If the current token has higher precedence than the rule on the stack, it is shifted. + +2. If the grammar rule on the stack has higher precedence, the rule is reduced. + +3. If the current token and the grammar rule have the same precedence, the + rule is reduced for left associativity, whereas the token is shifted for right associativity. + +4. If nothing is known about the precedence, shift/reduce conflicts are resolved in + favor of shifting (the default). + +For example, if "expression PLUS expression" has been parsed and the +next token is "TIMES", the action is going to be a shift because +"TIMES" has a higher precedence level than "PLUS". On the other hand, +if "expression TIMES expression" has been parsed and the next token is +"PLUS", the action is going to be reduce because "PLUS" has a lower +precedence than "TIMES." 
+ +When shift/reduce conflicts are resolved using the first three +techniques (with the help of precedence rules), ``yacc.py`` will +report no errors or conflicts in the grammar (although it will print +some information in the ``parser.out`` debugging file). + +One problem with the precedence specifier technique is that it is +sometimes necessary to change the precedence of an operator in certain +contexts. For example, consider a unary-minus operator in "3 + 4 * +-5". Mathematically, the unary minus is normally given a very high +precedence--being evaluated before the multiply. However, in our +precedence specifier, MINUS has a lower precedence than TIMES. To +deal with this, precedence rules can be given for so-called +"fictitious tokens" like this:: + + precedence = ( + ('left', 'PLUS', 'MINUS'), + ('left', 'TIMES', 'DIVIDE'), + ('right', 'UMINUS'), # Unary minus operator + ) + +Now, in the grammar file, we can write our unary minus rule like +this:: + + def p_expr_uminus(p): + 'expression : MINUS expression %prec UMINUS' + p[0] = -p[2] + +In this case, ``%prec UMINUS`` overrides the default rule +precedence--setting it to that of UMINUS in the precedence specifier. + +At first, the use of UMINUS in this example may appear very confusing. +UMINUS is not an input token or a grammar rule. Instead, you should +think of it as the name of a special marker in the precedence table. +When you use the ``%prec`` qualifier, you're telling yacc that +you want the precedence of the expression to be the same as for this +special marker instead of the usual precedence. + +It is also possible to specify non-associativity in the ``precedence`` +table. This would be used when you *don't* want operations to +chain together. For example, suppose you wanted to support comparison +operators like ``<`` and ``>`` but you didn't want to allow +combinations like ``a < b < c``. 
To do this, specify a +rule like this:: + + precedence = ( + ('nonassoc', 'LESSTHAN', 'GREATERTHAN'), # Nonassociative operators + ('left', 'PLUS', 'MINUS'), + ('left', 'TIMES', 'DIVIDE'), + ('right', 'UMINUS'), # Unary minus operator + ) + +If you do this, the occurrence of input text such as ``a < b < c`` +will result in a syntax error. However, simple expressions such +as ``a < b`` will still be fine. + +Reduce/reduce conflicts are caused when there are multiple grammar +rules that can be applied to a given set of symbols. This kind of +conflict is almost always bad and is always resolved by picking the +rule that appears first in the grammar file. Reduce/reduce conflicts +are almost always caused when different sets of grammar rules somehow +generate the same set of symbols. For example:: + + assignment : ID EQUALS NUMBER + | ID EQUALS expression + + expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression + | LPAREN expression RPAREN + | NUMBER + +In this case, a reduce/reduce conflict exists between these two rules:: + + assignment : ID EQUALS NUMBER + expression : NUMBER + +For example, if you wrote "a = 5", the parser can't figure out if this +is supposed to be reduced as ``assignment : ID EQUALS NUMBER`` or +whether it's supposed to reduce the 5 as an expression and then reduce +the rule ``assignment : ID EQUALS expression``. + +It should be noted that reduce/reduce conflicts are notoriously +difficult to spot looking at the input grammar. When a +reduce/reduce conflict occurs, ``yacc()`` will try to help by printing +a warning message such as this:: + + WARNING: 1 reduce/reduce conflict + WARNING: reduce/reduce conflict in state 15 resolved using rule (assignment -> ID EQUALS NUMBER) + WARNING: rejected rule (expression -> NUMBER) + +This message identifies the two rules that are in conflict. However, +it may not tell you how the parser arrived at such a state. 
To try +and figure it out, you'll probably have to look at your grammar and +the contents of the ``parser.out`` debugging file with an +appropriately high level of caffeination. + +The parser.out file +^^^^^^^^^^^^^^^^^^^ + +Tracking down shift/reduce and reduce/reduce conflicts is one of the +finer pleasures of using an LR parsing algorithm. To assist in +debugging, ``yacc.py`` can create a debugging file called 'parser.out'. +To create this file, use ``yacc.yacc(debug=True)``. +The contents of this file look like the following:: + + Unused terminals: + + + Grammar + + Rule 1 expression -> expression PLUS expression + Rule 2 expression -> expression MINUS expression + Rule 3 expression -> expression TIMES expression + Rule 4 expression -> expression DIVIDE expression + Rule 5 expression -> NUMBER + Rule 6 expression -> LPAREN expression RPAREN + + Terminals, with rules where they appear + + TIMES : 3 + error : + MINUS : 2 + RPAREN : 6 + LPAREN : 6 + DIVIDE : 4 + PLUS : 1 + NUMBER : 5 + + Nonterminals, with rules where they appear + + expression : 1 1 2 2 3 3 4 4 6 0 + + + Parsing method: LALR + + + state 0 + + S' -> . expression + expression -> . expression PLUS expression + expression -> . expression MINUS expression + expression -> . expression TIMES expression + expression -> . expression DIVIDE expression + expression -> . NUMBER + expression -> . LPAREN expression RPAREN + + NUMBER shift and go to state 3 + LPAREN shift and go to state 2 + + + state 1 + + S' -> expression . + expression -> expression . PLUS expression + expression -> expression . MINUS expression + expression -> expression . TIMES expression + expression -> expression . DIVIDE expression + + PLUS shift and go to state 6 + MINUS shift and go to state 5 + TIMES shift and go to state 4 + DIVIDE shift and go to state 7 + + + state 2 + + expression -> LPAREN . expression RPAREN + expression -> . expression PLUS expression + expression -> . expression MINUS expression + expression -> . 
expression TIMES expression + expression -> . expression DIVIDE expression + expression -> . NUMBER + expression -> . LPAREN expression RPAREN + + NUMBER shift and go to state 3 + LPAREN shift and go to state 2 + + + state 3 + + expression -> NUMBER . + + $ reduce using rule 5 + PLUS reduce using rule 5 + MINUS reduce using rule 5 + TIMES reduce using rule 5 + DIVIDE reduce using rule 5 + RPAREN reduce using rule 5 + + + state 4 + + expression -> expression TIMES . expression + expression -> . expression PLUS expression + expression -> . expression MINUS expression + expression -> . expression TIMES expression + expression -> . expression DIVIDE expression + expression -> . NUMBER + expression -> . LPAREN expression RPAREN + + NUMBER shift and go to state 3 + LPAREN shift and go to state 2 + + + state 5 + + expression -> expression MINUS . expression + expression -> . expression PLUS expression + expression -> . expression MINUS expression + expression -> . expression TIMES expression + expression -> . expression DIVIDE expression + expression -> . NUMBER + expression -> . LPAREN expression RPAREN + + NUMBER shift and go to state 3 + LPAREN shift and go to state 2 + + + state 6 + + expression -> expression PLUS . expression + expression -> . expression PLUS expression + expression -> . expression MINUS expression + expression -> . expression TIMES expression + expression -> . expression DIVIDE expression + expression -> . NUMBER + expression -> . LPAREN expression RPAREN + + NUMBER shift and go to state 3 + LPAREN shift and go to state 2 + + + state 7 + + expression -> expression DIVIDE . expression + expression -> . expression PLUS expression + expression -> . expression MINUS expression + expression -> . expression TIMES expression + expression -> . expression DIVIDE expression + expression -> . NUMBER + expression -> . LPAREN expression RPAREN + + NUMBER shift and go to state 3 + LPAREN shift and go to state 2 + + + state 8 + + expression -> LPAREN expression . 
RPAREN + expression -> expression . PLUS expression + expression -> expression . MINUS expression + expression -> expression . TIMES expression + expression -> expression . DIVIDE expression + + RPAREN shift and go to state 13 + PLUS shift and go to state 6 + MINUS shift and go to state 5 + TIMES shift and go to state 4 + DIVIDE shift and go to state 7 + + + state 9 + + expression -> expression TIMES expression . + expression -> expression . PLUS expression + expression -> expression . MINUS expression + expression -> expression . TIMES expression + expression -> expression . DIVIDE expression + + $ reduce using rule 3 + PLUS reduce using rule 3 + MINUS reduce using rule 3 + TIMES reduce using rule 3 + DIVIDE reduce using rule 3 + RPAREN reduce using rule 3 + + ! PLUS [ shift and go to state 6 ] + ! MINUS [ shift and go to state 5 ] + ! TIMES [ shift and go to state 4 ] + ! DIVIDE [ shift and go to state 7 ] + + state 10 + + expression -> expression MINUS expression . + expression -> expression . PLUS expression + expression -> expression . MINUS expression + expression -> expression . TIMES expression + expression -> expression . DIVIDE expression + + $ reduce using rule 2 + PLUS reduce using rule 2 + MINUS reduce using rule 2 + RPAREN reduce using rule 2 + TIMES shift and go to state 4 + DIVIDE shift and go to state 7 + + ! TIMES [ reduce using rule 2 ] + ! DIVIDE [ reduce using rule 2 ] + ! PLUS [ shift and go to state 6 ] + ! MINUS [ shift and go to state 5 ] + + state 11 + + expression -> expression PLUS expression . + expression -> expression . PLUS expression + expression -> expression . MINUS expression + expression -> expression . TIMES expression + expression -> expression . DIVIDE expression + + $ reduce using rule 1 + PLUS reduce using rule 1 + MINUS reduce using rule 1 + RPAREN reduce using rule 1 + TIMES shift and go to state 4 + DIVIDE shift and go to state 7 + + ! TIMES [ reduce using rule 1 ] + ! DIVIDE [ reduce using rule 1 ] + ! 
PLUS [ shift and go to state 6 ] + ! MINUS [ shift and go to state 5 ] + + state 12 + + expression -> expression DIVIDE expression . + expression -> expression . PLUS expression + expression -> expression . MINUS expression + expression -> expression . TIMES expression + expression -> expression . DIVIDE expression + + $ reduce using rule 4 + PLUS reduce using rule 4 + MINUS reduce using rule 4 + TIMES reduce using rule 4 + DIVIDE reduce using rule 4 + RPAREN reduce using rule 4 + + ! PLUS [ shift and go to state 6 ] + ! MINUS [ shift and go to state 5 ] + ! TIMES [ shift and go to state 4 ] + ! DIVIDE [ shift and go to state 7 ] + + state 13 + + expression -> LPAREN expression RPAREN . + + $ reduce using rule 6 + PLUS reduce using rule 6 + MINUS reduce using rule 6 + TIMES reduce using rule 6 + DIVIDE reduce using rule 6 + RPAREN reduce using rule 6 + +The different states that appear in this file are a representation of +every possible sequence of valid input tokens allowed by the grammar. +When receiving input tokens, the parser is building up a stack and +looking for matching rules. Each state keeps track of the grammar +rules that might be in the process of being matched at that point. +Within each rule, the "." character indicates the current location of +the parse within that rule. In addition, the actions for each valid +input token are listed. When a shift/reduce or reduce/reduce conflict +arises, rules *not* selected are prefixed with an !. For +example:: + + ! TIMES [ reduce using rule 2 ] + ! DIVIDE [ reduce using rule 2 ] + ! PLUS [ shift and go to state 6 ] + ! MINUS [ shift and go to state 5 ] + +By looking at these rules (and with a little practice), you can +usually track down the source of most parsing conflicts. It should +also be stressed that not all shift-reduce conflicts are bad. +However, the only way to be sure that they are resolved correctly is +to look at ``parser.out``. 
+ +Syntax Error Handling +^^^^^^^^^^^^^^^^^^^^^ + +If you are creating a parser for production use, the handling of +syntax errors is important. As a general rule, you don't want a +parser to throw up its hands and stop at the first sign of +trouble. Instead, you want it to report the error, recover if +possible, and continue parsing so that all of the errors in the input +get reported to the user at once. This is the standard behavior found +in compilers for languages such as C, C++, and Java. + +In PLY, when a syntax error occurs during parsing, the error is +immediately detected (i.e., the parser does not read any more tokens +beyond the source of the error). However, at this point, the parser +enters a recovery mode that can be used to try and continue further +parsing. As a general rule, error recovery in LR parsers is a +delicate topic that involves ancient rituals and black-magic. The +recovery mechanism provided by ``yacc.py`` is comparable to Unix yacc +so you may want to consult a book like O'Reilly's "Lex and Yacc" for some +of the finer details. + +When a syntax error occurs, ``yacc.py`` performs the following steps: + +1. On the first occurrence of an error, the user-defined ``p_error()`` + function is called with the offending token as an + argument. However, if the syntax error is due to reaching the + end-of-file, ``p_error()`` is called with an argument of ``None``. + Afterwards, the parser enters an "error-recovery" mode in which it + will not make future calls to ``p_error()`` until it has + successfully shifted at least 3 tokens onto the parsing stack. + + +2. If no recovery action is taken in ``p_error()``, the offending + lookahead token is replaced with a special ``error`` token. + +3. If the offending lookahead token is already set to ``error``, the + top item of the parsing stack is deleted. + +4. If the entire parsing stack is unwound, the parser enters a restart + state and attempts to start parsing from its initial state. + +5. 
If a grammar rule accepts ``error`` as a token, it will be + shifted onto the parsing stack. + +6. If the top item of the parsing stack is ``error``, lookahead tokens + will be discarded until the parser can successfully shift a new + symbol or reduce a rule involving ``error``. + +Recovery and resynchronization with error rules +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The most well-behaved approach for handling syntax errors is to write +grammar rules that include the ``error`` token. For example, suppose +your language had a grammar rule for a print statement like this:: + + def p_statement_print(p): + 'statement : PRINT expr SEMI' + ... + +To account for the possibility of a bad expression, you might write an +additional grammar rule like this:: + + def p_statement_print_error(p): + 'statement : PRINT error SEMI' + print("Syntax error in print statement. Bad expression") + +In this case, the ``error`` token will match any sequence of +tokens that might appear up to the first semicolon that is +encountered. Once the semicolon is reached, the rule will be +invoked and the ``error`` token will go away. + +This type of recovery is sometimes known as parser resynchronization. +The ``error`` token acts as a wildcard for any bad input text and +the token immediately following ``error`` acts as a +synchronization token. + +It is important to note that the ``error`` token usually does not +appear as the last token on the right in an error rule. For example:: + + def p_statement_print_error(p): + 'statement : PRINT error' + print("Syntax error in print statement. Bad expression") + +This is because the first bad token encountered will cause the rule to +be reduced--which may make it difficult to recover if more bad tokens +immediately follow. 
+ +Panic mode recovery +~~~~~~~~~~~~~~~~~~~ + +An alternative error recovery scheme is to enter a panic mode recovery +in which tokens are discarded to a point where the parser might be +able to recover in some sensible manner. + +Panic mode recovery is implemented entirely in the ``p_error()`` +function. For example, this function starts discarding tokens until +it reaches a closing '}'. Then, it restarts the parser in its initial +state:: + + def p_error(p): + print("Whoa. You are seriously hosed.") + if not p: + print("End of File!") + return + + # Read ahead looking for a closing '}' + while True: + tok = parser.token() # Get the next token + if not tok or tok.type == 'RBRACE': + break + parser.restart() + +This function discards the bad token and tells the parser that +the error was ok:: + + def p_error(p): + if p: + print("Syntax error at token", p.type) + # Just discard the token and tell the parser it's okay. + parser.errok() + else: + print("Syntax error at EOF") + +More information on these methods is as follows: + +``parser.errok()`` + This resets the parser state so it doesn't think it's in error-recovery + mode. This will prevent an ``error`` token from being generated and will reset the internal + error counters so that the next syntax error will call ``p_error()`` again. + +``parser.token()`` + This returns the next token on the input stream. + +``parser.restart()``. + This discards the entire parsing stack and resets the parser + to its initial state. + +To supply the next lookahead token to the parser, ``p_error()`` can +return a token. This might be useful if trying to synchronize on +special characters. 
For example:: + + def p_error(p): + # Read ahead looking for a terminating ";" + while True: + tok = parser.token() # Get the next token + if not tok or tok.type == 'SEMI': break + parser.errok() + + # Return SEMI to the parser as the next lookahead token + return tok + +Keep in mind that in the above error handling functions, ``parser`` is +an instance of the parser created by ``yacc()``. You'll need to save +this instance someplace in your code so that you can refer to it +during error handling. + +Signalling an error from a production +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If necessary, a production rule can manually force the parser to enter +error recovery. This is done by raising the ``SyntaxError`` exception +like this:: + + def p_production(p): + 'production : some production ...' + raise SyntaxError + +The effect of raising ``SyntaxError`` is the same as if the last +symbol shifted onto the parsing stack was actually a syntax error. +Thus, when you do this, the last symbol shifted is popped off of the +parsing stack and the current lookahead token is set to an ``error`` +token. The parser then enters error-recovery mode where it tries to +reduce rules that can accept ``error`` tokens. The steps that follow +from this point are exactly the same as if a syntax error were +detected and ``p_error()`` were called. + +One important aspect of manually setting an error is that the +``p_error()`` function will NOT be called in this case. If you need +to issue an error message, make sure you do it in the production that +raises ``SyntaxError``. + +Note: This feature of PLY is meant to mimic the behavior of the +YYERROR macro in yacc. + +When Do Syntax Errors Get Reported? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In most cases, yacc will handle errors as soon as a bad input token is +detected on the input. However, be aware that yacc may choose to +delay error handling until after it has reduced one or more grammar +rules first. 
This behavior might be unexpected, but it's related to +special states in the underlying parsing table known as "defaulted +states." A defaulted state is a parsing condition where the same +grammar rule will be reduced regardless of what *valid* token +comes next on the input. For such states, yacc chooses to go ahead +and reduce the grammar rule *without reading the next input +token*. If the next token is bad, yacc will eventually get around +to reading it and report a syntax error. It's just a little unusual +in that you might see some of your grammar rules firing immediately +prior to the syntax error. + +Usually, the delayed error reporting with defaulted states is harmless +(and there are other reasons for wanting PLY to behave in this way). +However, if you need to turn this behavior off for some reason, you +can clear the defaulted states table like this:: + + parser = yacc.yacc() + parser.defaulted_states = {} + +Disabling defaulted states is not recommended if your grammar makes +use of embedded actions as described in the "Embedded Actions" section. + +General comments on error handling +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +For normal types of languages, error recovery with error rules and +resynchronization characters is probably the most reliable +technique. This is because you can instrument the grammar to catch +errors at selected places where it is relatively easy to recover and +continue parsing. Panic mode recovery is really only useful in +certain specialized applications where you might want to discard huge +portions of the input text to find a valid restart point. + +Line Number and Position Tracking +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Position tracking is often a tricky problem when writing compilers. +By default, PLY tracks the line number and position of all tokens. +This information is available using the following functions: + +``p.lineno(num)``. Return the line number for symbol *num* + +``p.lexpos(num)``. 
Return the lexing position for symbol *num* + +For example:: + + def p_expression(p): + 'expression : expression PLUS expression' + line = p.lineno(2) # line number of the PLUS token + index = p.lexpos(2) # Position of the PLUS token + +As an optional feature, ``yacc.py`` can automatically track line +numbers and positions for all of the grammar symbols as well. +However, this extra tracking requires extra processing and can +significantly slow down parsing. Therefore, it must be enabled by +passing the ``tracking=True`` option to ``yacc.parse()``. For +example:: + + yacc.parse(data,tracking=True) + +Once enabled, the ``lineno()`` and ``lexpos()`` methods work for all +grammar symbols. In addition, two additional methods can be used: + +``p.linespan(num)``. Return a tuple (startline,endline) with the starting and ending line number for symbol *num*. + +``p.lexspan(num)``. Return a tuple (start,end) with the starting and ending positions for symbol *num*. + +For example:: + + def p_expression(p): + 'expression : expression PLUS expression' + p.lineno(1) # Line number of the left expression + p.lineno(2) # line number of the PLUS operator + p.lineno(3) # line number of the right expression + ... + start,end = p.linespan(3) # Start,end lines of the right expression + starti,endi = p.lexspan(3) # Start,end positions of right expression + + +Note: The ``lexspan()`` function only returns the range of values up +to the start of the last grammar symbol. + +Although it may be convenient for PLY to track position information on +all grammar symbols, this is often unnecessary. For example, if you +are merely using line number information in an error message, you can +often just key off of a specific token in the grammar rule. 
For +example:: + + def p_bad_func(p): + 'funccall : fname LPAREN error RPAREN' + # Line number reported from LPAREN token + print("Bad function call at line", p.lineno(2)) + +Similarly, you may get better parsing performance if you only +selectively propagate line number information where it's needed using +the ``p.set_lineno()`` method. For example:: + + def p_fname(p): + 'fname : ID' + p[0] = p[1] + p.set_lineno(0,p.lineno(1)) + +PLY doesn't retain line number information from rules that have +already been parsed. If you are building an abstract syntax tree and +need to have line numbers, you should make sure that the line numbers +appear in the tree itself. + +AST Construction +^^^^^^^^^^^^^^^^ + +``yacc.py`` provides no special functions for constructing an abstract +syntax tree. However, such construction is easy enough to do on your +own. + +A minimal way to construct a tree is to create and propagate a +tuple or list in each grammar rule function. There are many possible +ways to do this, but one example would be something like this:: + + def p_expression_binop(p): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + + p[0] = ('binary-expression',p[2],p[1],p[3]) + + def p_expression_group(p): + 'expression : LPAREN expression RPAREN' + p[0] = ('group-expression',p[2]) + + def p_expression_number(p): + 'expression : NUMBER' + p[0] = ('number-expression',p[1]) + +Another approach is to create a set of data structures for different +kinds of abstract syntax tree nodes and assign nodes to ``p[0]`` in +each rule. 
For example:: + + class Expr: pass + + class BinOp(Expr): + def __init__(self,left,op,right): + self.left = left + self.right = right + self.op = op + + class Number(Expr): + def __init__(self,value): + self.value = value + + def p_expression_binop(p): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + + p[0] = BinOp(p[1],p[2],p[3]) + + def p_expression_group(p): + 'expression : LPAREN expression RPAREN' + p[0] = p[2] + + def p_expression_number(p): + 'expression : NUMBER' + p[0] = Number(p[1]) + +The advantage to this approach is that it may make it easier to attach +more complicated semantics, type checking, code generation, and other +features to the node classes. + +To simplify tree traversal, it may make sense to pick a very generic +tree structure for your parse tree nodes. For example:: + + class Node: + def __init__(self,type,children=None,leaf=None): + self.type = type + if children: + self.children = children + else: + self.children = [ ] + self.leaf = leaf + + def p_expression_binop(p): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + + p[0] = Node("binop", [p[1],p[3]], p[2]) + +Embedded Actions +^^^^^^^^^^^^^^^^ + +The parsing technique used by yacc only allows actions to be executed +at the end of a rule. For example, suppose you have a rule like +this:: + + def p_foo(p): + "foo : A B C D" + print("Parsed a foo", p[1],p[2],p[3],p[4]) + +In this case, the supplied action code only executes after all of the +symbols ``A``, ``B``, ``C``, and ``D`` have been parsed. Sometimes, +however, it is useful to execute small code fragments during +intermediate stages of parsing. For example, suppose you wanted to +perform some action immediately after ``A`` has been parsed. 
To do +this, write an empty rule like this:: + + def p_foo(p): + "foo : A seen_A B C D" + print("Parsed a foo", p[1],p[3],p[4],p[5]) + print("seen_A returned", p[2]) + + def p_seen_A(p): + "seen_A :" + print("Saw an A = ", p[-1]) # Access grammar symbol to left + p[0] = some_value # Assign value to seen_A + +In this example, the empty ``seen_A`` rule executes immediately after +``A`` is shifted onto the parsing stack. Within this rule, ``p[-1]`` +refers to the symbol on the stack that appears immediately to the left +of the ``seen_A`` symbol. In this case, it would be the value of +``A`` in the ``foo`` rule immediately above. Like other rules, a +value can be returned from an embedded action by assigning it +to ``p[0]``. + +The use of embedded actions can sometimes introduce extra shift/reduce +conflicts. For example, this grammar has no conflicts:: + + def p_foo(p): + """foo : abcd + | abcx""" + + def p_abcd(p): + "abcd : A B C D" + + def p_abcx(p): + "abcx : A B C X" + +However, if you insert an embedded action into one of the rules like +this:: + + def p_foo(p): + """foo : abcd + | abcx""" + + def p_abcd(p): + "abcd : A B C D" + + def p_abcx(p): + "abcx : A B seen_AB C X" + + def p_seen_AB(p): + "seen_AB :" + +an extra shift-reduce conflict will be introduced. This conflict is +caused by the fact that the same symbol ``C`` appears next in both the +``abcd`` and ``abcx`` rules. The parser can either shift the symbol +(``abcd`` rule) or reduce the empty rule ``seen_AB`` (``abcx`` rule). + +A common use of embedded rules is to control other aspects of parsing +such as scoping of local variables. For example, if you were parsing +C code, you might write code like this:: + + def p_statements_block(p): + "statements : LBRACE new_scope statements RBRACE" + # Action code + ... + pop_scope() # Return to previous scope + + def p_new_scope(p): + "new_scope :" + # Create a new scope for local variables + s = new_scope() + push_scope(s) + ... 
+ +In this case, the embedded action ``new_scope`` executes +immediately after a ``LBRACE`` (``{``) symbol is parsed. +This might adjust internal symbol tables and other aspects of the +parser. Upon completion of the rule ``statements_block``, code +might undo the operations performed in the embedded action +(e.g., ``pop_scope()``). + +Miscellaneous Yacc Notes +^^^^^^^^^^^^^^^^^^^^^^^^ + + +1. By default, ``yacc.py`` relies on ``lex.py`` for tokenizing. However, an alternative tokenizer + can be supplied as follows:: + + parser = yacc.parse(lexer=x) + + in this case, ``x`` must be a Lexer object that minimally has a ``x.token()`` method for retrieving the next + token. If an input string is given to ``yacc.parse()``, the lexer must also have an ``x.input()`` method. + +2. To print copious amounts of debugging during parsing, use:: + + parser.parse(input_text, debug=True) + +3. Since LR parsing is driven by tables, the performance of the parser is largely independent of the + size of the grammar. The biggest bottlenecks will be the lexer and the complexity of the code in your grammar rules. + +4. ``yacc()`` also allows parsers to be defined as classes and as closures (see the section on alternative specification of + lexers). However, be aware that only one parser may be defined in a single module (source file). There are various + error checks and validation steps that may issue confusing error messages if you try to define multiple parsers + in the same source file. + +Multiple Parsers and Lexers +--------------------------- + +In advanced parsing applications, you may want to have multiple +parsers and lexers. + +As a general rule, this isn't a problem. However, to make it work, +you need to carefully make sure everything gets hooked up correctly. +First, make sure you save the objects returned by ``lex()`` and +``yacc()``. 
For example:: + + lexer = lex.lex() # Return lexer object + parser = yacc.yacc() # Return parser object + +Next, when parsing, make sure you give the ``parse()`` function a +reference to the lexer it should be using. For example:: + + parser.parse(text,lexer=lexer) + +If you forget to do this, the parser will use the last lexer +created--which is not always what you want. + +Within lexer and parser rule functions, these objects are also +available. In the lexer, the "lexer" attribute of a token refers to +the lexer object that triggered the rule. For example:: + + def t_NUMBER(t): + r'\d+' + ... + print(t.lexer) # Show lexer object + +In the parser, the "lexer" and "parser" attributes refer to the lexer +and parser objects respectively:: + + def p_expr_plus(p): + 'expr : expr PLUS expr' + ... + print(p.parser) # Show parser object + print(p.lexer) # Show lexer object + +If necessary, arbitrary attributes can be attached to the lexer or +parser object. For example, if you wanted to have different parsing +modes, you could attach a mode attribute to the parser object and look +at it later. + +Advanced Debugging +------------------ + +Debugging a compiler is typically not an easy task. PLY provides some +diagnostic capabilities through the use of Python's +``logging`` module. The next two sections describe this: + +Debugging the lex() and yacc() commands +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Both the ``lex()`` and ``yacc()`` commands have a debugging mode that +can be enabled using the ``debug`` flag. For example:: + + lex.lex(debug=True) + yacc.yacc(debug=True) + +Normally, the output produced by debugging is routed to either +standard error or, in the case of ``yacc()``, to a file +``parser.out``. This output can be more carefully controlled by +supplying a logging object. 
Here is an example that adds information +about where different debugging messages are coming from:: + + # Set up a logging object + import logging + logging.basicConfig( + level = logging.DEBUG, + filename = "parselog.txt", + filemode = "w", + format = "%(filename)10s:%(lineno)4d:%(message)s" + ) + log = logging.getLogger() + + lex.lex(debug=True,debuglog=log) + yacc.yacc(debug=True,debuglog=log) + +If you supply a custom logger, the amount of debugging information +produced can be controlled by setting the logging level. Typically, +debugging messages are either issued at the ``DEBUG``, ``INFO``, or +``WARNING`` levels. + +PLY's error messages and warnings are also produced using the logging +interface. This can be controlled by passing a logging object using +the ``errorlog`` parameter:: + + lex.lex(errorlog=log) + yacc.yacc(errorlog=log) + +If you want to completely silence warnings, you can either pass in a +logging object with an appropriate filter level or use the +``NullLogger`` object defined in either ``lex`` or ``yacc``. For +example:: + + yacc.yacc(errorlog=yacc.NullLogger()) + +Run-time Debugging +^^^^^^^^^^^^^^^^^^ + +To enable run-time debugging of a parser, use the ``debug`` option to +parse. This option can either be an integer (which turns +debugging on or off) or an instance of a logger object. For example:: + + log = logging.getLogger() + parser.parse(input,debug=log) + +If a logging object is passed, you can use its filtering level to +control how much output gets generated. The ``INFO`` level is used to +produce information about rule reductions. The ``DEBUG`` level will +show information about the parsing stack, token shifts, and other +details. The ``ERROR`` level shows information related to parsing +errors. + +For very complicated problems, you should pass in a logging object that +redirects to a file where you can more easily inspect the output after +execution. + +Where to go from here? 
+---------------------- + +The ``example`` directory of the PLY distribution contains several +simple examples. Please consult a compilers textbook for the theory +and underlying implementation details of LR parsing. + + + + + + + + diff --git a/example/BASIC/basic.py b/example/BASIC/basic.py index 17687b1..8a8a500 100644 --- a/example/BASIC/basic.py +++ b/example/BASIC/basic.py @@ -4,9 +4,6 @@ import sys sys.path.insert(0, "../..") -if sys.version_info[0] >= 3: - raw_input = input - import basiclex import basparse import basinterp @@ -36,9 +33,9 @@ else: # Specifying a line number with no code deletes that line from # the program. -while 1: +while True: try: - line = raw_input("[BASIC] ") + line = input("[BASIC] ") except EOFError: raise SystemExit if not line: diff --git a/example/GardenSnake/GardenSnake.py b/example/GardenSnake/GardenSnake.py deleted file mode 100644 index 8b493b4..0000000 --- a/example/GardenSnake/GardenSnake.py +++ /dev/null @@ -1,777 +0,0 @@ -# GardenSnake - a parser generator demonstration program -# -# This implements a modified version of a subset of Python: -# - only 'def', 'return' and 'if' statements -# - 'if' only has 'then' clause (no elif nor else) -# - single-quoted strings only, content in raw format -# - numbers are decimal.Decimal instances (not integers or floats) -# - no print statment; use the built-in 'print' function -# - only < > == + - / * implemented (and unary + -) -# - assignment and tuple assignment work -# - no generators of any sort -# - no ... well, no quite a lot - -# Why? I'm thinking about a new indentation-based configuration -# language for a project and wanted to figure out how to do it. Once -# I got that working I needed a way to test it out. My original AST -# was dumb so I decided to target Python's AST and compile it into -# Python code. Plus, it's pretty cool that it only took a day or so -# from sitting down with Ply to having working code. 
- -# This uses David Beazley's Ply from http://www.dabeaz.com/ply/ - -# This work is hereby released into the Public Domain. To view a copy of -# the public domain dedication, visit -# http://creativecommons.org/licenses/publicdomain/ or send a letter to -# Creative Commons, 543 Howard Street, 5th Floor, San Francisco, -# California, 94105, USA. -# -# Portions of this work are derived from Python's Grammar definition -# and may be covered under the Python copyright and license -# -# Andrew Dalke / Dalke Scientific Software, LLC -# 30 August 2006 / Cape Town, South Africa - -# Changelog: -# 30 August - added link to CC license; removed the "swapcase" encoding - -# Modifications for inclusion in PLY distribution -import sys -sys.path.insert(0, "../..") -from ply import * - -##### Lexer ###### -#import lex -import decimal - -tokens = ( - 'DEF', - 'IF', - 'NAME', - 'NUMBER', # Python decimals - 'STRING', # single quoted strings only; syntax of raw strings - 'LPAR', - 'RPAR', - 'COLON', - 'EQ', - 'ASSIGN', - 'LT', - 'GT', - 'PLUS', - 'MINUS', - 'MULT', - 'DIV', - 'RETURN', - 'WS', - 'NEWLINE', - 'COMMA', - 'SEMICOLON', - 'INDENT', - 'DEDENT', - 'ENDMARKER', -) - -#t_NUMBER = r'\d+' -# taken from decmial.py but without the leading sign - - -def t_NUMBER(t): - r"""(\d+(\.\d*)?|\.\d+)([eE][-+]? \d+)?""" - t.value = decimal.Decimal(t.value) - return t - - -def t_STRING(t): - r"'([^\\']+|\\'|\\\\)*'" # I think this is right ... - t.value = t.value[1:-1].decode("string-escape") # .swapcase() # for fun - return t - -t_COLON = r':' -t_EQ = r'==' -t_ASSIGN = r'=' -t_LT = r'<' -t_GT = r'>' -t_PLUS = r'\+' -t_MINUS = r'-' -t_MULT = r'\*' -t_DIV = r'/' -t_COMMA = r',' -t_SEMICOLON = r';' - -# Ply nicely documented how to do this. 
- -RESERVED = { - "def": "DEF", - "if": "IF", - "return": "RETURN", -} - - -def t_NAME(t): - r'[a-zA-Z_][a-zA-Z0-9_]*' - t.type = RESERVED.get(t.value, "NAME") - return t - -# Putting this before t_WS let it consume lines with only comments in -# them so the latter code never sees the WS part. Not consuming the -# newline. Needed for "if 1: #comment" - - -def t_comment(t): - r"[ ]*\043[^\n]*" # \043 is '#' - pass - - -# Whitespace -def t_WS(t): - r' [ ]+ ' - if t.lexer.at_line_start and t.lexer.paren_count == 0: - return t - -# Don't generate newline tokens when inside of parenthesis, eg -# a = (1, -# 2, 3) - - -def t_newline(t): - r'\n+' - t.lexer.lineno += len(t.value) - t.type = "NEWLINE" - if t.lexer.paren_count == 0: - return t - - -def t_LPAR(t): - r'\(' - t.lexer.paren_count += 1 - return t - - -def t_RPAR(t): - r'\)' - # check for underflow? should be the job of the parser - t.lexer.paren_count -= 1 - return t - - -def t_error(t): - raise SyntaxError("Unknown symbol %r" % (t.value[0],)) - print "Skipping", repr(t.value[0]) - t.lexer.skip(1) - -# I implemented INDENT / DEDENT generation as a post-processing filter - -# The original lex token stream contains WS and NEWLINE characters. -# WS will only occur before any other tokens on a line. - -# I have three filters. One tags tokens by adding two attributes. -# "must_indent" is True if the token must be indented from the -# previous code. The other is "at_line_start" which is True for WS -# and the first non-WS/non-NEWLINE on a line. It flags the check so -# see if the new line has changed indication level. 
- -# Python's syntax has three INDENT states -# 0) no colon hence no need to indent -# 1) "if 1: go()" - simple statements have a COLON but no need for an indent -# 2) "if 1:\n go()" - complex statements have a COLON NEWLINE and must indent -NO_INDENT = 0 -MAY_INDENT = 1 -MUST_INDENT = 2 - -# only care about whitespace at the start of a line - - -def track_tokens_filter(lexer, tokens): - lexer.at_line_start = at_line_start = True - indent = NO_INDENT - saw_colon = False - for token in tokens: - token.at_line_start = at_line_start - - if token.type == "COLON": - at_line_start = False - indent = MAY_INDENT - token.must_indent = False - - elif token.type == "NEWLINE": - at_line_start = True - if indent == MAY_INDENT: - indent = MUST_INDENT - token.must_indent = False - - elif token.type == "WS": - assert token.at_line_start == True - at_line_start = True - token.must_indent = False - - else: - # A real token; only indent after COLON NEWLINE - if indent == MUST_INDENT: - token.must_indent = True - else: - token.must_indent = False - at_line_start = False - indent = NO_INDENT - - yield token - lexer.at_line_start = at_line_start - - -def _new_token(type, lineno): - tok = lex.LexToken() - tok.type = type - tok.value = None - tok.lineno = lineno - return tok - -# Synthesize a DEDENT tag - - -def DEDENT(lineno): - return _new_token("DEDENT", lineno) - -# Synthesize an INDENT tag - - -def INDENT(lineno): - return _new_token("INDENT", lineno) - - -# Track the indentation level and emit the right INDENT / DEDENT events. -def indentation_filter(tokens): - # A stack of indentation levels; will never pop item 0 - levels = [0] - token = None - depth = 0 - prev_was_ws = False - for token in tokens: - # if 1: - # print "Process", token, - # if token.at_line_start: - # print "at_line_start", - # if token.must_indent: - # print "must_indent", - # print - - # WS only occurs at the start of the line - # There may be WS followed by NEWLINE so - # only track the depth here. 
Don't indent/dedent - # until there's something real. - if token.type == "WS": - assert depth == 0 - depth = len(token.value) - prev_was_ws = True - # WS tokens are never passed to the parser - continue - - if token.type == "NEWLINE": - depth = 0 - if prev_was_ws or token.at_line_start: - # ignore blank lines - continue - # pass the other cases on through - yield token - continue - - # then it must be a real token (not WS, not NEWLINE) - # which can affect the indentation level - - prev_was_ws = False - if token.must_indent: - # The current depth must be larger than the previous level - if not (depth > levels[-1]): - raise IndentationError("expected an indented block") - - levels.append(depth) - yield INDENT(token.lineno) - - elif token.at_line_start: - # Must be on the same level or one of the previous levels - if depth == levels[-1]: - # At the same level - pass - elif depth > levels[-1]: - raise IndentationError( - "indentation increase but not in new block") - else: - # Back up; but only if it matches a previous level - try: - i = levels.index(depth) - except ValueError: - raise IndentationError("inconsistent indentation") - for _ in range(i + 1, len(levels)): - yield DEDENT(token.lineno) - levels.pop() - - yield token - - ### Finished processing ### - - # Must dedent any remaining levels - if len(levels) > 1: - assert token is not None - for _ in range(1, len(levels)): - yield DEDENT(token.lineno) - - -# The top-level filter adds an ENDMARKER, if requested. -# Python's grammar uses it. 
-def filter(lexer, add_endmarker=True): - token = None - tokens = iter(lexer.token, None) - tokens = track_tokens_filter(lexer, tokens) - for token in indentation_filter(tokens): - yield token - - if add_endmarker: - lineno = 1 - if token is not None: - lineno = token.lineno - yield _new_token("ENDMARKER", lineno) - -# Combine Ply and my filters into a new lexer - - -class IndentLexer(object): - - def __init__(self, debug=0, optimize=0, lextab='lextab', reflags=0): - self.lexer = lex.lex(debug=debug, optimize=optimize, - lextab=lextab, reflags=reflags) - self.token_stream = None - - def input(self, s, add_endmarker=True): - self.lexer.paren_count = 0 - self.lexer.input(s) - self.token_stream = filter(self.lexer, add_endmarker) - - def token(self): - try: - return self.token_stream.next() - except StopIteration: - return None - -########## Parser (tokens -> AST) ###### - -# also part of Ply -#import yacc - -# I use the Python AST -from compiler import ast - -# Helper function - - -def Assign(left, right): - names = [] - if isinstance(left, ast.Name): - # Single assignment on left - return ast.Assign([ast.AssName(left.name, 'OP_ASSIGN')], right) - elif isinstance(left, ast.Tuple): - # List of things - make sure they are Name nodes - names = [] - for child in left.getChildren(): - if not isinstance(child, ast.Name): - raise SyntaxError("that assignment not supported") - names.append(child.name) - ass_list = [ast.AssName(name, 'OP_ASSIGN') for name in names] - return ast.Assign([ast.AssTuple(ass_list)], right) - else: - raise SyntaxError("Can't do that yet") - - -# The grammar comments come from Python's Grammar/Grammar file - -# NB: compound_stmt in single_input is followed by extra NEWLINE! 
-# file_input: (NEWLINE | stmt)* ENDMARKER -def p_file_input_end(p): - """file_input_end : file_input ENDMARKER""" - p[0] = ast.Stmt(p[1]) - - -def p_file_input(p): - """file_input : file_input NEWLINE - | file_input stmt - | NEWLINE - | stmt""" - if isinstance(p[len(p) - 1], basestring): - if len(p) == 3: - p[0] = p[1] - else: - p[0] = [] # p == 2 --> only a blank line - else: - if len(p) == 3: - p[0] = p[1] + p[2] - else: - p[0] = p[1] - - -# funcdef: [decorators] 'def' NAME parameters ':' suite -# ignoring decorators -def p_funcdef(p): - "funcdef : DEF NAME parameters COLON suite" - p[0] = ast.Function(None, p[2], tuple(p[3]), (), 0, None, p[5]) - -# parameters: '(' [varargslist] ')' - - -def p_parameters(p): - """parameters : LPAR RPAR - | LPAR varargslist RPAR""" - if len(p) == 3: - p[0] = [] - else: - p[0] = p[2] - - -# varargslist: (fpdef ['=' test] ',')* ('*' NAME [',' '**' NAME] | '**' NAME) | -# highly simplified -def p_varargslist(p): - """varargslist : varargslist COMMA NAME - | NAME""" - if len(p) == 4: - p[0] = p[1] + p[3] - else: - p[0] = [p[1]] - -# stmt: simple_stmt | compound_stmt - - -def p_stmt_simple(p): - """stmt : simple_stmt""" - # simple_stmt is a list - p[0] = p[1] - - -def p_stmt_compound(p): - """stmt : compound_stmt""" - p[0] = [p[1]] - -# simple_stmt: small_stmt (';' small_stmt)* [';'] NEWLINE - - -def p_simple_stmt(p): - """simple_stmt : small_stmts NEWLINE - | small_stmts SEMICOLON NEWLINE""" - p[0] = p[1] - - -def p_small_stmts(p): - """small_stmts : small_stmts SEMICOLON small_stmt - | small_stmt""" - if len(p) == 4: - p[0] = p[1] + [p[3]] - else: - p[0] = [p[1]] - -# small_stmt: expr_stmt | print_stmt | del_stmt | pass_stmt | flow_stmt | -# import_stmt | global_stmt | exec_stmt | assert_stmt - - -def p_small_stmt(p): - """small_stmt : flow_stmt - | expr_stmt""" - p[0] = p[1] - -# expr_stmt: testlist (augassign (yield_expr|testlist) | -# ('=' (yield_expr|testlist))*) -# augassign: ('+=' | '-=' | '*=' | '/=' | '%=' | '&=' | '|=' | 
'^=' | -# '<<=' | '>>=' | '**=' | '//=') - - -def p_expr_stmt(p): - """expr_stmt : testlist ASSIGN testlist - | testlist """ - if len(p) == 2: - # a list of expressions - p[0] = ast.Discard(p[1]) - else: - p[0] = Assign(p[1], p[3]) - - -def p_flow_stmt(p): - "flow_stmt : return_stmt" - p[0] = p[1] - -# return_stmt: 'return' [testlist] - - -def p_return_stmt(p): - "return_stmt : RETURN testlist" - p[0] = ast.Return(p[2]) - - -def p_compound_stmt(p): - """compound_stmt : if_stmt - | funcdef""" - p[0] = p[1] - - -def p_if_stmt(p): - 'if_stmt : IF test COLON suite' - p[0] = ast.If([(p[2], p[4])], None) - - -def p_suite(p): - """suite : simple_stmt - | NEWLINE INDENT stmts DEDENT""" - if len(p) == 2: - p[0] = ast.Stmt(p[1]) - else: - p[0] = ast.Stmt(p[3]) - - -def p_stmts(p): - """stmts : stmts stmt - | stmt""" - if len(p) == 3: - p[0] = p[1] + p[2] - else: - p[0] = p[1] - -# No using Python's approach because Ply supports precedence - -# comparison: expr (comp_op expr)* -# arith_expr: term (('+'|'-') term)* -# term: factor (('*'|'/'|'%'|'//') factor)* -# factor: ('+'|'-'|'~') factor | power -# comp_op: '<'|'>'|'=='|'>='|'<='|'<>'|'!='|'in'|'not' 'in'|'is'|'is' 'not' - - -def make_lt_compare((left, right)): - return ast.Compare(left, [('<', right), ]) - - -def make_gt_compare((left, right)): - return ast.Compare(left, [('>', right), ]) - - -def make_eq_compare((left, right)): - return ast.Compare(left, [('==', right), ]) - - -binary_ops = { - "+": ast.Add, - "-": ast.Sub, - "*": ast.Mul, - "/": ast.Div, - "<": make_lt_compare, - ">": make_gt_compare, - "==": make_eq_compare, -} -unary_ops = { - "+": ast.UnaryAdd, - "-": ast.UnarySub, -} -precedence = ( - ("left", "EQ", "GT", "LT"), - ("left", "PLUS", "MINUS"), - ("left", "MULT", "DIV"), -) - - -def p_comparison(p): - """comparison : comparison PLUS comparison - | comparison MINUS comparison - | comparison MULT comparison - | comparison DIV comparison - | comparison LT comparison - | comparison EQ comparison - | 
comparison GT comparison - | PLUS comparison - | MINUS comparison - | power""" - if len(p) == 4: - p[0] = binary_ops[p[2]]((p[1], p[3])) - elif len(p) == 3: - p[0] = unary_ops[p[1]](p[2]) - else: - p[0] = p[1] - -# power: atom trailer* ['**' factor] -# trailers enables function calls. I only allow one level of calls -# so this is 'trailer' - - -def p_power(p): - """power : atom - | atom trailer""" - if len(p) == 2: - p[0] = p[1] - else: - if p[2][0] == "CALL": - p[0] = ast.CallFunc(p[1], p[2][1], None, None) - else: - raise AssertionError("not implemented") - - -def p_atom_name(p): - """atom : NAME""" - p[0] = ast.Name(p[1]) - - -def p_atom_number(p): - """atom : NUMBER - | STRING""" - p[0] = ast.Const(p[1]) - - -def p_atom_tuple(p): - """atom : LPAR testlist RPAR""" - p[0] = p[2] - -# trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME - - -def p_trailer(p): - "trailer : LPAR arglist RPAR" - p[0] = ("CALL", p[2]) - -# testlist: test (',' test)* [','] -# Contains shift/reduce error - - -def p_testlist(p): - """testlist : testlist_multi COMMA - | testlist_multi """ - if len(p) == 2: - p[0] = p[1] - else: - # May need to promote singleton to tuple - if isinstance(p[1], list): - p[0] = p[1] - else: - p[0] = [p[1]] - # Convert into a tuple? 
- if isinstance(p[0], list): - p[0] = ast.Tuple(p[0]) - - -def p_testlist_multi(p): - """testlist_multi : testlist_multi COMMA test - | test""" - if len(p) == 2: - # singleton - p[0] = p[1] - else: - if isinstance(p[1], list): - p[0] = p[1] + [p[3]] - else: - # singleton -> tuple - p[0] = [p[1], p[3]] - - -# test: or_test ['if' or_test 'else' test] | lambdef -# as I don't support 'and', 'or', and 'not' this works down to 'comparison' -def p_test(p): - "test : comparison" - p[0] = p[1] - - -# arglist: (argument ',')* (argument [',']| '*' test [',' '**' test] | '**' test) -# XXX INCOMPLETE: this doesn't allow the trailing comma -def p_arglist(p): - """arglist : arglist COMMA argument - | argument""" - if len(p) == 4: - p[0] = p[1] + [p[3]] - else: - p[0] = [p[1]] - -# argument: test [gen_for] | test '=' test # Really [keyword '='] test - - -def p_argument(p): - "argument : test" - p[0] = p[1] - - -def p_error(p): - # print "Error!", repr(p) - raise SyntaxError(p) - - -class GardenSnakeParser(object): - - def __init__(self, lexer=None): - if lexer is None: - lexer = IndentLexer() - self.lexer = lexer - self.parser = yacc.yacc(start="file_input_end") - - def parse(self, code): - self.lexer.input(code) - result = self.parser.parse(lexer=self.lexer) - return ast.Module(None, result) - - -###### Code generation ###### - -from compiler import misc, syntax, pycodegen - - -class GardenSnakeCompiler(object): - - def __init__(self): - self.parser = GardenSnakeParser() - - def compile(self, code, filename="<string>"): - tree = self.parser.parse(code) - # print tree - misc.set_filename(filename, tree) - syntax.check(tree) - gen = pycodegen.ModuleCodeGenerator(tree) - code = gen.getCode() - return code - -####### Test code ####### - -compile = GardenSnakeCompiler().compile - -code = r""" - -print('LET\'S TRY THIS \\OUT') - -#Comment here -def x(a): - print('called with',a) - if a == 1: - return 2 - if a*2 > 10: return 999 / 4 - # Another comment here - - return a+2*3 - -ints = 
(1, 2, - 3, 4, -5) -print('mutiline-expression', ints) - -t = 4+1/3*2+6*(9-5+1) -print('predence test; should be 34+2/3:', t, t==(34+2/3)) - -print('numbers', 1,2,3,4,5) -if 1: - 8 - a=9 - print(x(a)) - -print(x(1)) -print(x(2)) -print(x(8),'3') -print('this is decimal', 1/5) -print('BIG DECIMAL', 1.234567891234567e12345) - -""" - -# Set up the GardenSnake run-time environment - - -def print_(*args): - print "-->", " ".join(map(str, args)) - -globals()["print"] = print_ - -compiled_code = compile(code) - -exec compiled_code in globals() -print "Done" diff --git a/example/GardenSnake/README b/example/GardenSnake/README deleted file mode 100644 index 4d8be2d..0000000 --- a/example/GardenSnake/README +++ /dev/null @@ -1,5 +0,0 @@ -This example is Andrew Dalke's GardenSnake language. It shows how to process an -indentation-like language like Python. Further details can be found here: - -http://dalkescientific.com/writings/diary/archive/2006/08/30/gardensnake_language.html - diff --git a/example/README b/example/README index 63519b5..a7ec6e8 100644 --- a/example/README +++ b/example/README @@ -5,6 +5,5 @@ Simple examples: Complex examples ansic - ANSI C grammar from K&R BASIC - A small BASIC interpreter - GardenSnake - A simple python-like language yply - Converts Unix yacc files to PLY programs. 
diff --git a/example/calc/calc.py b/example/calc/calc.py index 824c3d7..406d83c 100644 --- a/example/calc/calc.py +++ b/example/calc/calc.py @@ -8,9 +8,6 @@ import sys sys.path.insert(0, "../..") -if sys.version_info[0] >= 3: - raw_input = input - tokens = ( 'NAME', 'NUMBER', ) @@ -29,19 +26,17 @@ def t_NUMBER(t): t_ignore = " \t" - def t_newline(t): r'\n+' t.lexer.lineno += t.value.count("\n") - def t_error(t): print("Illegal character '%s'" % t.value[0]) t.lexer.skip(1) # Build the lexer import ply.lex as lex -lex.lex() +lexer = lex.lex() # Parsing rules @@ -54,7 +49,6 @@ precedence = ( # dictionary of names names = {} - def p_statement_assign(p): 'statement : NAME "=" expression' names[p[1]] = p[3] @@ -111,11 +105,11 @@ def p_error(p): print("Syntax error at EOF") import ply.yacc as yacc -yacc.yacc() +parser = yacc.yacc() -while 1: +while True: try: - s = raw_input('calc > ') + s = input('calc > ') except EOFError: break if not s: diff --git a/example/calcdebug/calc.py b/example/calcdebug/calc.py index 06831e2..386000e 100644 --- a/example/calcdebug/calc.py +++ b/example/calcdebug/calc.py @@ -8,9 +8,6 @@ import sys sys.path.insert(0, "../..") -if sys.version_info[0] >= 3: - raw_input = input - tokens = ( 'NAME', 'NUMBER', ) @@ -119,9 +116,9 @@ logging.basicConfig( filename="parselog.txt" ) -while 1: +while True: try: - s = raw_input('calc > ') + s = input('calc > ') except EOFError: break if not s: diff --git a/example/calceof/calc.py b/example/calceof/calc.py index 22b39a4..7bb7e0f 100644 --- a/example/calceof/calc.py +++ b/example/calceof/calc.py @@ -8,9 +8,6 @@ import sys sys.path.insert(0, "../..") -if sys.version_info[0] >= 3: - raw_input = input - tokens = ( 'NAME', 'NUMBER', ) @@ -36,7 +33,7 @@ def t_newline(t): def t_eof(t): - more = raw_input('... ') + more = input('... 
') if more: t.lexer.input(more + '\n') return t.lexer.token() @@ -122,9 +119,9 @@ def p_error(p): import ply.yacc as yacc yacc.yacc() -while 1: +while True: try: - s = raw_input('calc > ') + s = input('calc > ') except EOFError: break if not s: diff --git a/example/classcalc/calc.py b/example/classcalc/calc.py index ada4afd..6f35195 100755 --- a/example/classcalc/calc.py +++ b/example/classcalc/calc.py @@ -12,9 +12,6 @@ import sys sys.path.insert(0, "../..") -if sys.version_info[0] >= 3: - raw_input = input - import ply.lex as lex import ply.yacc as yacc import os @@ -36,20 +33,18 @@ class Parser: except: modname = "parser" + "_" + self.__class__.__name__ self.debugfile = modname + ".dbg" - self.tabmodule = modname + "_" + "parsetab" - # print self.debugfile, self.tabmodule + # print self.debugfile # Build the lexer and parser lex.lex(module=self, debug=self.debug) yacc.yacc(module=self, debug=self.debug, - debugfile=self.debugfile, - tabmodule=self.tabmodule) + debugfile=self.debugfile) def run(self): - while 1: + while True: try: - s = raw_input('calc > ') + s = input('calc > ') except EOFError: break if not s: diff --git a/example/closurecalc/calc.py b/example/closurecalc/calc.py index 6031b05..59c9d6f 100644 --- a/example/closurecalc/calc.py +++ b/example/closurecalc/calc.py @@ -9,9 +9,6 @@ import sys sys.path.insert(0, "../..") -if sys.version_info[0] >= 3: - raw_input = input - # Make a calculator function @@ -124,7 +121,7 @@ calc = make_calculator() while True: try: - s = raw_input("calc > ") + s = input("calc > ") except EOFError: break r = calc(s) diff --git a/example/hedit/hedit.py b/example/hedit/hedit.py deleted file mode 100644 index 32da745..0000000 --- a/example/hedit/hedit.py +++ /dev/null @@ -1,48 +0,0 @@ -# ----------------------------------------------------------------------------- -# hedit.py -# -# Paring of Fortran H Edit descriptions (Contributed by Pearu Peterson) -# -# These tokens can't be easily tokenized because they are of the following 
-# form: -# -# nHc1...cn -# -# where n is a positive integer and c1 ... cn are characters. -# -# This example shows how to modify the state of the lexer to parse -# such tokens -# ----------------------------------------------------------------------------- - -import sys -sys.path.insert(0, "../..") - - -tokens = ( - 'H_EDIT_DESCRIPTOR', -) - -# Tokens -t_ignore = " \t\n" - - -def t_H_EDIT_DESCRIPTOR(t): - r"\d+H.*" # This grabs all of the remaining text - i = t.value.index('H') - n = eval(t.value[:i]) - - # Adjust the tokenizing position - t.lexer.lexpos -= len(t.value) - (i + 1 + n) - - t.value = t.value[i + 1:i + 1 + n] - return t - - -def t_error(t): - print("Illegal character '%s'" % t.value[0]) - t.lexer.skip(1) - -# Build the lexer -import ply.lex as lex -lex.lex() -lex.runmain() diff --git a/example/newclasscalc/calc.py b/example/newclasscalc/calc.py deleted file mode 100755 index 43c9506..0000000 --- a/example/newclasscalc/calc.py +++ /dev/null @@ -1,167 +0,0 @@ -#!/usr/bin/env python - -# ----------------------------------------------------------------------------- -# calc.py -# -# A simple calculator with variables. This is from O'Reilly's -# "Lex and Yacc", p. 63. -# -# Class-based example contributed to PLY by David McNab. -# -# Modified to use new-style classes. Test case. 
-# ----------------------------------------------------------------------------- - -import sys -sys.path.insert(0, "../..") - -if sys.version_info[0] >= 3: - raw_input = input - -import ply.lex as lex -import ply.yacc as yacc -import os - - -class Parser(object): - """ - Base class for a lexer/parser that has the rules defined as methods - """ - tokens = () - precedence = () - - def __init__(self, **kw): - self.debug = kw.get('debug', 0) - self.names = {} - try: - modname = os.path.split(os.path.splitext(__file__)[0])[ - 1] + "_" + self.__class__.__name__ - except: - modname = "parser" + "_" + self.__class__.__name__ - self.debugfile = modname + ".dbg" - self.tabmodule = modname + "_" + "parsetab" - # print self.debugfile, self.tabmodule - - # Build the lexer and parser - lex.lex(module=self, debug=self.debug) - yacc.yacc(module=self, - debug=self.debug, - debugfile=self.debugfile, - tabmodule=self.tabmodule) - - def run(self): - while 1: - try: - s = raw_input('calc > ') - except EOFError: - break - if not s: - continue - yacc.parse(s) - - -class Calc(Parser): - - tokens = ( - 'NAME', 'NUMBER', - 'PLUS', 'MINUS', 'EXP', 'TIMES', 'DIVIDE', 'EQUALS', - 'LPAREN', 'RPAREN', - ) - - # Tokens - - t_PLUS = r'\+' - t_MINUS = r'-' - t_EXP = r'\*\*' - t_TIMES = r'\*' - t_DIVIDE = r'/' - t_EQUALS = r'=' - t_LPAREN = r'\(' - t_RPAREN = r'\)' - t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' - - def t_NUMBER(self, t): - r'\d+' - try: - t.value = int(t.value) - except ValueError: - print("Integer value too large %s" % t.value) - t.value = 0 - # print "parsed number %s" % repr(t.value) - return t - - t_ignore = " \t" - - def t_newline(self, t): - r'\n+' - t.lexer.lineno += t.value.count("\n") - - def t_error(self, t): - print("Illegal character '%s'" % t.value[0]) - t.lexer.skip(1) - - # Parsing rules - - precedence = ( - ('left', 'PLUS', 'MINUS'), - ('left', 'TIMES', 'DIVIDE'), - ('left', 'EXP'), - ('right', 'UMINUS'), - ) - - def p_statement_assign(self, p): - 'statement : NAME EQUALS 
expression' - self.names[p[1]] = p[3] - - def p_statement_expr(self, p): - 'statement : expression' - print(p[1]) - - def p_expression_binop(self, p): - """ - expression : expression PLUS expression - | expression MINUS expression - | expression TIMES expression - | expression DIVIDE expression - | expression EXP expression - """ - # print [repr(p[i]) for i in range(0,4)] - if p[2] == '+': - p[0] = p[1] + p[3] - elif p[2] == '-': - p[0] = p[1] - p[3] - elif p[2] == '*': - p[0] = p[1] * p[3] - elif p[2] == '/': - p[0] = p[1] / p[3] - elif p[2] == '**': - p[0] = p[1] ** p[3] - - def p_expression_uminus(self, p): - 'expression : MINUS expression %prec UMINUS' - p[0] = -p[2] - - def p_expression_group(self, p): - 'expression : LPAREN expression RPAREN' - p[0] = p[2] - - def p_expression_number(self, p): - 'expression : NUMBER' - p[0] = p[1] - - def p_expression_name(self, p): - 'expression : NAME' - try: - p[0] = self.names[p[1]] - except LookupError: - print("Undefined name '%s'" % p[1]) - p[0] = 0 - - def p_error(self, p): - if p: - print("Syntax error at '%s'" % p.value) - else: - print("Syntax error at EOF") - -if __name__ == '__main__': - calc = Calc() - calc.run() diff --git a/example/optcalc/README b/example/optcalc/README deleted file mode 100644 index 53dd5fc..0000000 --- a/example/optcalc/README +++ /dev/null @@ -1,9 +0,0 @@ -An example showing how to use Python optimized mode. -To run: - - - First run 'python calc.py' - - - Then run 'python -OO calc.py' - -If working correctly, the second version should run the -same way. diff --git a/example/optcalc/calc.py b/example/optcalc/calc.py deleted file mode 100644 index 0c223e5..0000000 --- a/example/optcalc/calc.py +++ /dev/null @@ -1,134 +0,0 @@ -# ----------------------------------------------------------------------------- -# calc.py -# -# A simple calculator with variables. This is from O'Reilly's -# "Lex and Yacc", p. 63. 
-# ----------------------------------------------------------------------------- - -import sys -sys.path.insert(0, "../..") - -if sys.version_info[0] >= 3: - raw_input = input - -tokens = ( - 'NAME', 'NUMBER', - 'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'EQUALS', - 'LPAREN', 'RPAREN', -) - -# Tokens - -t_PLUS = r'\+' -t_MINUS = r'-' -t_TIMES = r'\*' -t_DIVIDE = r'/' -t_EQUALS = r'=' -t_LPAREN = r'\(' -t_RPAREN = r'\)' -t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' - - -def t_NUMBER(t): - r'\d+' - try: - t.value = int(t.value) - except ValueError: - print("Integer value too large %s" % t.value) - t.value = 0 - return t - -t_ignore = " \t" - - -def t_newline(t): - r'\n+' - t.lexer.lineno += t.value.count("\n") - - -def t_error(t): - print("Illegal character '%s'" % t.value[0]) - t.lexer.skip(1) - -# Build the lexer -import ply.lex as lex -lex.lex(optimize=1) - -# Parsing rules - -precedence = ( - ('left', 'PLUS', 'MINUS'), - ('left', 'TIMES', 'DIVIDE'), - ('right', 'UMINUS'), -) - -# dictionary of names -names = {} - - -def p_statement_assign(t): - 'statement : NAME EQUALS expression' - names[t[1]] = t[3] - - -def p_statement_expr(t): - 'statement : expression' - print(t[1]) - - -def p_expression_binop(t): - '''expression : expression PLUS expression - | expression MINUS expression - | expression TIMES expression - | expression DIVIDE expression''' - if t[2] == '+': - t[0] = t[1] + t[3] - elif t[2] == '-': - t[0] = t[1] - t[3] - elif t[2] == '*': - t[0] = t[1] * t[3] - elif t[2] == '/': - t[0] = t[1] / t[3] - elif t[2] == '<': - t[0] = t[1] < t[3] - - -def p_expression_uminus(t): - 'expression : MINUS expression %prec UMINUS' - t[0] = -t[2] - - -def p_expression_group(t): - 'expression : LPAREN expression RPAREN' - t[0] = t[2] - - -def p_expression_number(t): - 'expression : NUMBER' - t[0] = t[1] - - -def p_expression_name(t): - 'expression : NAME' - try: - t[0] = names[t[1]] - except LookupError: - print("Undefined name '%s'" % t[1]) - t[0] = 0 - - -def p_error(t): - if t: - 
print("Syntax error at '%s'" % t.value) - else: - print("Syntax error at EOF") - -import ply.yacc as yacc -yacc.yacc(optimize=1) - -while 1: - try: - s = raw_input('calc > ') - except EOFError: - break - yacc.parse(s) diff --git a/example/unicalc/calc.py b/example/unicalc/calc.py deleted file mode 100644 index 901c4b9..0000000 --- a/example/unicalc/calc.py +++ /dev/null @@ -1,133 +0,0 @@ -# ----------------------------------------------------------------------------- -# calc.py -# -# A simple calculator with variables. This is from O'Reilly's -# "Lex and Yacc", p. 63. -# -# This example uses unicode strings for tokens, docstrings, and input. -# ----------------------------------------------------------------------------- - -import sys -sys.path.insert(0, "../..") - -tokens = ( - 'NAME', 'NUMBER', - 'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'EQUALS', - 'LPAREN', 'RPAREN', -) - -# Tokens - -t_PLUS = ur'\+' -t_MINUS = ur'-' -t_TIMES = ur'\*' -t_DIVIDE = ur'/' -t_EQUALS = ur'=' -t_LPAREN = ur'\(' -t_RPAREN = ur'\)' -t_NAME = ur'[a-zA-Z_][a-zA-Z0-9_]*' - - -def t_NUMBER(t): - ur'\d+' - try: - t.value = int(t.value) - except ValueError: - print "Integer value too large", t.value - t.value = 0 - return t - -t_ignore = u" \t" - - -def t_newline(t): - ur'\n+' - t.lexer.lineno += t.value.count("\n") - - -def t_error(t): - print "Illegal character '%s'" % t.value[0] - t.lexer.skip(1) - -# Build the lexer -import ply.lex as lex -lex.lex() - -# Parsing rules - -precedence = ( - ('left', 'PLUS', 'MINUS'), - ('left', 'TIMES', 'DIVIDE'), - ('right', 'UMINUS'), -) - -# dictionary of names -names = {} - - -def p_statement_assign(p): - 'statement : NAME EQUALS expression' - names[p[1]] = p[3] - - -def p_statement_expr(p): - 'statement : expression' - print p[1] - - -def p_expression_binop(p): - '''expression : expression PLUS expression - | expression MINUS expression - | expression TIMES expression - | expression DIVIDE expression''' - if p[2] == u'+': - p[0] = p[1] + p[3] - elif p[2] == 
u'-': - p[0] = p[1] - p[3] - elif p[2] == u'*': - p[0] = p[1] * p[3] - elif p[2] == u'/': - p[0] = p[1] / p[3] - - -def p_expression_uminus(p): - 'expression : MINUS expression %prec UMINUS' - p[0] = -p[2] - - -def p_expression_group(p): - 'expression : LPAREN expression RPAREN' - p[0] = p[2] - - -def p_expression_number(p): - 'expression : NUMBER' - p[0] = p[1] - - -def p_expression_name(p): - 'expression : NAME' - try: - p[0] = names[p[1]] - except LookupError: - print "Undefined name '%s'" % p[1] - p[0] = 0 - - -def p_error(p): - if p: - print "Syntax error at '%s'" % p.value - else: - print "Syntax error at EOF" - -import ply.yacc as yacc -yacc.yacc() - -while 1: - try: - s = raw_input('calc > ') - except EOFError: - break - if not s: - continue - yacc.parse(unicode(s)) diff --git a/example/yply/yparse.py b/example/yply/yparse.py index 1f2e8d0..b2c8863 100644 --- a/example/yply/yparse.py +++ b/example/yply/yparse.py @@ -233,7 +233,7 @@ def p_empty(p): def p_error(p): pass -yacc.yacc(debug=0) +yacc.yacc(debug=False) def print_code(code, indent): diff --git a/ply/__init__.py b/ply/__init__.py index 6f768b7..8783862 100644 --- a/ply/__init__.py +++ b/ply/__init__.py @@ -1,5 +1,6 @@ # PLY package # Author: David Beazley (dave@dabeaz.com) +# https://dabeaz.com/ply/index.html __version__ = '4.0' __all__ = ['lex','yacc'] @@ -33,9 +33,6 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # ----------------------------------------------------------------------------- -__version__ = '4.0' -__tabversion__ = '3.10' - import re import sys import types @@ -56,15 +53,10 @@ class LexError(Exception): self.args = (message,) self.text = s - # Token class. This class is used to represent the tokens produced. 
class LexToken(object): - def __str__(self): - return 'LexToken(%s,%r,%d,%d)' % (self.type, self.value, self.lineno, self.lexpos) - def __repr__(self): - return str(self) - + return f'LexToken({self.type},{self.value!r},{self.lineno},{self.lexpos})' # This object is a stand-in for a logging object created by the # logging module. @@ -85,16 +77,6 @@ class PlyLogger(object): info = critical debug = critical - -# Null logger is used when no output is generated. Does nothing. -class NullLogger(object): - def __getattribute__(self, name): - return self - - def __call__(self, *args, **kwargs): - return self - - # ----------------------------------------------------------------------------- # === Lexing Engine === # @@ -136,7 +118,6 @@ class Lexer: self.lexliterals = '' # Literal characters that can be passed through self.lexmodule = None # Module self.lineno = 1 # Current line number - self.lexoptimize = False # Optimized mode def clone(self, object=None): c = copy.copy(self) @@ -166,90 +147,9 @@ class Lexer: return c # ------------------------------------------------------------ - # writetab() - Write lexer information to a table file - # ------------------------------------------------------------ - def writetab(self, lextab, outputdir=''): - if isinstance(lextab, types.ModuleType): - raise IOError("Won't overwrite existing lextab module") - basetabmodule = lextab.split('.')[-1] - filename = os.path.join(outputdir, basetabmodule) + '.py' - with open(filename, 'w') as tf: - tf.write('# %s.py. This file automatically created by PLY (version %s). 
Don\'t edit!\n' % (basetabmodule, __version__)) - tf.write('_tabversion = %s\n' % repr(__tabversion__)) - tf.write('_lextokens = set(%s)\n' % repr(tuple(sorted(self.lextokens)))) - tf.write('_lexreflags = %s\n' % repr(int(self.lexreflags))) - tf.write('_lexliterals = %s\n' % repr(self.lexliterals)) - tf.write('_lexstateinfo = %s\n' % repr(self.lexstateinfo)) - - # Rewrite the lexstatere table, replacing function objects with function names - tabre = {} - for statename, lre in self.lexstatere.items(): - titem = [] - for (pat, func), retext, renames in zip(lre, self.lexstateretext[statename], self.lexstaterenames[statename]): - titem.append((retext, _funcs_to_names(func, renames))) - tabre[statename] = titem - - tf.write('_lexstatere = %s\n' % repr(tabre)) - tf.write('_lexstateignore = %s\n' % repr(self.lexstateignore)) - - taberr = {} - for statename, ef in self.lexstateerrorf.items(): - taberr[statename] = ef.__name__ if ef else None - tf.write('_lexstateerrorf = %s\n' % repr(taberr)) - - tabeof = {} - for statename, ef in self.lexstateeoff.items(): - tabeof[statename] = ef.__name__ if ef else None - tf.write('_lexstateeoff = %s\n' % repr(tabeof)) - - # ------------------------------------------------------------ - # readtab() - Read lexer information from a tab file - # ------------------------------------------------------------ - def readtab(self, tabfile, fdict): - if isinstance(tabfile, types.ModuleType): - lextab = tabfile - else: - exec('import %s' % tabfile) - lextab = sys.modules[tabfile] - - if getattr(lextab, '_tabversion', '0.0') != __tabversion__: - raise ImportError('Inconsistent PLY version') - - self.lextokens = lextab._lextokens - self.lexreflags = lextab._lexreflags - self.lexliterals = lextab._lexliterals - self.lextokens_all = self.lextokens | set(self.lexliterals) - self.lexstateinfo = lextab._lexstateinfo - self.lexstateignore = lextab._lexstateignore - self.lexstatere = {} - self.lexstateretext = {} - for statename, lre in 
lextab._lexstatere.items(): - titem = [] - txtitem = [] - for pat, func_name in lre: - titem.append((re.compile(pat, lextab._lexreflags), _names_to_funcs(func_name, fdict))) - - self.lexstatere[statename] = titem - self.lexstateretext[statename] = txtitem - - self.lexstateerrorf = {} - for statename, ef in lextab._lexstateerrorf.items(): - self.lexstateerrorf[statename] = fdict[ef] - - self.lexstateeoff = {} - for statename, ef in lextab._lexstateeoff.items(): - self.lexstateeoff[statename] = fdict[ef] - - self.begin('INITIAL') - - # ------------------------------------------------------------ # input() - Push a new string into the lexer # ------------------------------------------------------------ def input(self, s): - # Pull off the first character to see if s looks like a string - c = s[:1] - if not isinstance(c, StringTypes): - raise ValueError('Expected a string') self.lexdata = s self.lexpos = 0 self.lexlen = len(s) @@ -259,7 +159,7 @@ class Lexer: # ------------------------------------------------------------ def begin(self, state): if state not in self.lexstatere: - raise ValueError('Undefined state') + raise ValueError(f'Undefined state {state!r}') self.lexre = self.lexstatere[state] self.lexretext = self.lexstateretext[state] self.lexignore = self.lexstateignore.get(state, '') @@ -293,7 +193,7 @@ class Lexer: self.lexpos += n # ------------------------------------------------------------ - # opttoken() - Return the next token from the Lexer + # token() - Return the next token from the Lexer # # Note: This function has been carefully implemented to be as fast # as possible. 
Don't make changes unless you really know what @@ -343,22 +243,15 @@ class Lexer: tok.lexer = self # Set additional attributes useful in token rules self.lexmatch = m self.lexpos = lexpos - newtok = func(tok) + del tok.lexer + del self.lexmatch # Every function must return a token, if nothing, we just move to next token if not newtok: lexpos = self.lexpos # This is here in case user has updated lexpos. lexignore = self.lexignore # This is here in case there was a state change break - - # Verify type of the token. If not in the token map, raise an error - if not self.lexoptimize: - if newtok.type not in self.lextokens_all: - raise LexError("%s:%d: Rule '%s' returned an unknown token type '%s'" % ( - func.__code__.co_filename, func.__code__.co_firstlineno, - func.__name__, newtok.type), lexdata[lexpos:]) - return newtok else: # No match, see if in literals @@ -383,14 +276,16 @@ class Lexer: newtok = self.lexerrorf(tok) if lexpos == self.lexpos: # Error method didn't change text position at all. This is an error. - raise LexError("Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:]) + raise LexError(f"Scanning error. 
Illegal character {lexdata[lexpos]!r}", + lexdata[lexpos:]) lexpos = self.lexpos if not newtok: continue return newtok self.lexpos = lexpos - raise LexError("Illegal character '%s' at index %d" % (lexdata[lexpos], lexpos), lexdata[lexpos:]) + raise LexError(f"Illegal character {lexdata[lexpos]!r} at index {lexpos}", + lexdata[lexpos:]) if self.lexeoff: tok = LexToken() @@ -412,14 +307,12 @@ class Lexer: def __iter__(self): return self - def next(self): + def __next__(self): t = self.token() if t is None: raise StopIteration return t - __next__ = next - # ----------------------------------------------------------------------------- # ==== Lex Builder === # @@ -445,40 +338,7 @@ def _get_regex(func): # ----------------------------------------------------------------------------- def get_caller_module_dict(levels): f = sys._getframe(levels) - ldict = f.f_globals.copy() - if f.f_globals != f.f_locals: - ldict.update(f.f_locals) - return ldict - -# ----------------------------------------------------------------------------- -# _funcs_to_names() -# -# Given a list of regular expression functions, this converts it to a list -# suitable for output to a table file -# ----------------------------------------------------------------------------- -def _funcs_to_names(funclist, namelist): - result = [] - for f, name in zip(funclist, namelist): - if f and f[0]: - result.append((name, f[1])) - else: - result.append(f) - return result - -# ----------------------------------------------------------------------------- -# _names_to_funcs() -# -# Given a list of regular expression function names, this converts it back to -# functions. 
-# ----------------------------------------------------------------------------- -def _names_to_funcs(namelist, fdict): - result = [] - for n in namelist: - if n and n[0]: - result.append((fdict[n[0]], n[1])) - else: - result.append(n) - return result + return { **f.f_globals, **f.f_locals } # ----------------------------------------------------------------------------- # _form_master_re() @@ -489,7 +349,7 @@ def _names_to_funcs(namelist, fdict): # ----------------------------------------------------------------------------- def _form_master_re(relist, reflags, ldict, toknames): if not relist: - return [] + return [], [], [] regex = '|'.join(relist) try: lexre = re.compile(regex, reflags) @@ -512,9 +372,7 @@ def _form_master_re(relist, reflags, ldict, toknames): return [(lexre, lexindexfunc)], [regex], [lexindexnames] except Exception: - m = int(len(relist)/2) - if m == 0: - m = 1 + m = (len(relist) // 2) + 1 llist, lre, lnames = _form_master_re(relist[:m], reflags, ldict, toknames) rlist, rre, rnames = _form_master_re(relist[m:], reflags, ldict, toknames) return (llist+rlist), (lre+rre), (lnames+rnames) @@ -601,10 +459,10 @@ class LexerReflect(object): terminals = {} for n in self.tokens: if not _is_identifier.match(n): - self.log.error("Bad token name '%s'", n) + self.log.error(f"Bad token name {n!r}") self.error = True if n in terminals: - self.log.warning("Token '%s' multiply defined", n) + self.log.warning(f"Token {n!r} multiply defined") terminals[n] = 1 # Get the literals specifier @@ -618,7 +476,7 @@ class LexerReflect(object): try: for c in self.literals: if not isinstance(c, StringTypes) or len(c) > 1: - self.log.error('Invalid literal %s. Must be a single character', repr(c)) + self.log.error(f'Invalid literal {c!r}. Must be a single character') self.error = True except TypeError: @@ -635,20 +493,20 @@ class LexerReflect(object): else: for s in self.states: if not isinstance(s, tuple) or len(s) != 2: - self.log.error("Invalid state specifier %s. 
Must be a tuple (statename,'exclusive|inclusive')", repr(s)) + self.log.error("Invalid state specifier %r. Must be a tuple (statename,'exclusive|inclusive')", s) self.error = True continue name, statetype = s if not isinstance(name, StringTypes): - self.log.error('State name %s must be a string', repr(name)) + self.log.error('State name %r must be a string', name) self.error = True continue if not (statetype == 'inclusive' or statetype == 'exclusive'): - self.log.error("State type for state %s must be 'inclusive' or 'exclusive'", name) + self.log.error("State type for state %r must be 'inclusive' or 'exclusive'", name) self.error = True continue if name in self.stateinfo: - self.log.error("State '%s' already defined", name) + self.log.error("State %r already defined", name) self.error = True continue self.stateinfo[name] = statetype @@ -691,7 +549,7 @@ class LexerReflect(object): elif tokname == 'ignore': line = t.__code__.co_firstlineno file = t.__code__.co_filename - self.log.error("%s:%d: Rule '%s' must be defined as a string", file, line, t.__name__) + self.log.error("%s:%d: Rule %r must be defined as a string", file, line, t.__name__) self.error = True else: for s in states: @@ -704,7 +562,7 @@ class LexerReflect(object): self.log.warning("%s contains a literal backslash '\\'", f) elif tokname == 'error': - self.log.error("Rule '%s' must be defined as a function", f) + self.log.error("Rule %r must be defined as a function", f) self.error = True else: for s in states: @@ -739,57 +597,57 @@ class LexerReflect(object): reqargs = 1 nargs = f.__code__.co_argcount if nargs > reqargs: - self.log.error("%s:%d: Rule '%s' has too many arguments", file, line, f.__name__) + self.log.error("%s:%d: Rule %r has too many arguments", file, line, f.__name__) self.error = True continue if nargs < reqargs: - self.log.error("%s:%d: Rule '%s' requires an argument", file, line, f.__name__) + self.log.error("%s:%d: Rule %r requires an argument", file, line, f.__name__) self.error = 
True continue if not _get_regex(f): - self.log.error("%s:%d: No regular expression defined for rule '%s'", file, line, f.__name__) + self.log.error("%s:%d: No regular expression defined for rule %r", file, line, f.__name__) self.error = True continue try: c = re.compile('(?P<%s>%s)' % (fname, _get_regex(f)), self.reflags) if c.match(''): - self.log.error("%s:%d: Regular expression for rule '%s' matches empty string", file, line, f.__name__) + self.log.error("%s:%d: Regular expression for rule %r matches empty string", file, line, f.__name__) self.error = True except re.error as e: self.log.error("%s:%d: Invalid regular expression for rule '%s'. %s", file, line, f.__name__, e) if '#' in _get_regex(f): - self.log.error("%s:%d. Make sure '#' in rule '%s' is escaped with '\\#'", file, line, f.__name__) + self.log.error("%s:%d. Make sure '#' in rule %r is escaped with '\\#'", file, line, f.__name__) self.error = True # Validate all rules defined by strings for name, r in self.strsym[state]: tokname = self.toknames[name] if tokname == 'error': - self.log.error("Rule '%s' must be defined as a function", name) + self.log.error("Rule %r must be defined as a function", name) self.error = True continue if tokname not in self.tokens and tokname.find('ignore_') < 0: - self.log.error("Rule '%s' defined for an unspecified token %s", name, tokname) + self.log.error("Rule %r defined for an unspecified token %s", name, tokname) self.error = True continue try: c = re.compile('(?P<%s>%s)' % (name, r), self.reflags) if (c.match('')): - self.log.error("Regular expression for rule '%s' matches empty string", name) + self.log.error("Regular expression for rule %r matches empty string", name) self.error = True except re.error as e: - self.log.error("Invalid regular expression for rule '%s'. %s", name, e) + self.log.error("Invalid regular expression for rule %r. 
%s", name, e) if '#' in r: - self.log.error("Make sure '#' in rule '%s' is escaped with '\\#'", name) + self.log.error("Make sure '#' in rule %r is escaped with '\\#'", name) self.error = True if not self.funcsym[state] and not self.strsym[state]: - self.log.error("No rules defined for state '%s'", state) + self.log.error("No rules defined for state %r", state) self.error = True # Validate the error function @@ -807,11 +665,11 @@ class LexerReflect(object): reqargs = 1 nargs = f.__code__.co_argcount if nargs > reqargs: - self.log.error("%s:%d: Rule '%s' has too many arguments", file, line, f.__name__) + self.log.error("%s:%d: Rule %r has too many arguments", file, line, f.__name__) self.error = True if nargs < reqargs: - self.log.error("%s:%d: Rule '%s' requires an argument", file, line, f.__name__) + self.log.error("%s:%d: Rule %r requires an argument", file, line, f.__name__) self.error = True for module in self.modules: @@ -856,18 +714,14 @@ class LexerReflect(object): # # Build all of the regular expression rules from definitions in the supplied module # ----------------------------------------------------------------------------- -def lex(module=None, object=None, debug=False, optimize=False, lextab='lextab', - reflags=int(re.VERBOSE), nowarn=False, outputdir=None, debuglog=None, errorlog=None): - - if lextab is None: - lextab = 'lextab' +def lex(*, module=None, object=None, debug=False, + reflags=int(re.VERBOSE), debuglog=None, errorlog=None): global lexer ldict = None stateinfo = {'INITIAL': 'inclusive'} lexobj = Lexer() - lexobj.lexoptimize = optimize global token, input if errorlog is None: @@ -891,30 +745,11 @@ def lex(module=None, object=None, debug=False, optimize=False, lextab='lextab', else: ldict = get_caller_module_dict(2) - # Determine if the module is package of a package or not. - # If so, fix the tabmodule setting so that tables load correctly - pkg = ldict.get('__package__') - if pkg and isinstance(lextab, str): - if '.' 
not in lextab: - lextab = pkg + '.' + lextab - # Collect parser information from the dictionary linfo = LexerReflect(ldict, log=errorlog, reflags=reflags) linfo.get_all() - if not optimize: - if linfo.validate_all(): - raise SyntaxError("Can't build lexer") - - if optimize and lextab: - try: - lexobj.readtab(lextab, ldict) - token = lexobj.token - input = lexobj.input - lexer = lexobj - return lexobj - - except ImportError: - pass + if linfo.validate_all(): + raise SyntaxError("Can't build lexer") # Dump some basic debugging information if debug: @@ -1001,9 +836,9 @@ def lex(module=None, object=None, debug=False, optimize=False, lextab='lextab', for s, stype in stateinfo.items(): if stype == 'exclusive': if s not in linfo.errorf: - errorlog.warning("No error rule is defined for exclusive state '%s'", s) + errorlog.warning("No error rule is defined for exclusive state %r", s) if s not in linfo.ignore and lexobj.lexignore: - errorlog.warning("No ignore rule is defined for exclusive state '%s'", s) + errorlog.warning("No ignore rule is defined for exclusive state %r", s) elif stype == 'inclusive': if s not in linfo.errorf: linfo.errorf[s] = linfo.errorf.get('INITIAL', None) @@ -1015,31 +850,6 @@ def lex(module=None, object=None, debug=False, optimize=False, lextab='lextab', input = lexobj.input lexer = lexobj - # If in optimize mode, we write the lextab - if lextab and optimize: - if outputdir is None: - # If no output directory is set, the location of the output files - # is determined according to the following rules: - # - If lextab specifies a package, files go into that package directory - # - Otherwise, files go in the same directory as the specifying module - if isinstance(lextab, types.ModuleType): - srcfile = lextab.__file__ - else: - if '.' 
not in lextab: - srcfile = ldict['__file__'] - else: - parts = lextab.split('.') - pkgname = '.'.join(parts[:-1]) - exec('import %s' % pkgname) - srcfile = getattr(sys.modules[pkgname], '__file__', '') - outputdir = os.path.dirname(srcfile) - try: - lexobj.writetab(lextab, outputdir) - if lextab in sys.modules: - del sys.modules[lextab] - except IOError as e: - errorlog.warning("Couldn't write lextab module %r. %s" % (lextab, e)) - return lexobj # ----------------------------------------------------------------------------- @@ -1072,7 +882,7 @@ def runmain(lexer=None, data=None): tok = _token() if not tok: break - sys.stdout.write('(%s,%r,%d,%d)\n' % (tok.type, tok.value, tok.lineno, tok.lexpos)) + sys.stdout.write(f'({tok.type},{tok.value!r},{tok.lineno},{tok.lexpos})\n') # ----------------------------------------------------------------------------- # @TOKEN(regex) @@ -1089,6 +899,3 @@ def TOKEN(r): f.regex = r return f return set_regex - -# Alternative spelling of the TOKEN decorator -Token = TOKEN diff --git a/ply/yacc.py b/ply/yacc.py index a5024eb..5a750d7 100644 --- a/ply/yacc.py +++ b/ply/yacc.py @@ -64,12 +64,7 @@ import re import types import sys -import os.path import inspect -import warnings - -__version__ = '4.0' -__tabversion__ = '3.10' #----------------------------------------------------------------------------- # === User configurable parameters === @@ -77,22 +72,13 @@ __tabversion__ = '3.10' # Change these to modify the default behavior of yacc (if you wish) #----------------------------------------------------------------------------- -yaccdebug = True # Debugging mode. If set, yacc generates a +yaccdebug = False # Debugging mode. 
If set, yacc generates a # a 'parser.out' file in the current directory debug_file = 'parser.out' # Default name of the debugging file -tab_module = 'parsetab' # Default name of the table module -default_lr = 'LALR' # Default LR table generation method - error_count = 3 # Number of symbols that must be shifted to leave recovery mode - -yaccdevel = False # Set to True if developing yacc. This turns off optimized - # implementations of certain functions. - resultlimit = 40 # Size limit of results when running in debug mode. -pickle_protocol = 0 # Protocol to use when writing pickle files - MAXINT = sys.maxsize # This object is a stand-in for a logging object created by the @@ -150,48 +136,6 @@ def format_stack_entry(r): else: return '<%s @ 0x%x>' % (type(r).__name__, id(r)) -# Panic mode error recovery support. This feature is being reworked--much of the -# code here is to offer a deprecation/backwards compatible transition - -_errok = None -_token = None -_restart = None -_warnmsg = '''PLY: Don't use global functions errok(), token(), and restart() in p_error(). -Instead, invoke the methods on the associated parser instance: - - def p_error(p): - ... - # Use parser.errok(), parser.token(), parser.restart() - ... 
- - parser = yacc.yacc() -''' - -def errok(): - warnings.warn(_warnmsg) - return _errok() - -def restart(): - warnings.warn(_warnmsg) - return _restart() - -def token(): - warnings.warn(_warnmsg) - return _token() - -# Utility function to call the p_error() function with some deprecation hacks -def call_errorfunc(errorfunc, token, parser): - global _errok, _token, _restart - _errok = parser.errok - _token = parser.token - _restart = parser.restart - r = errorfunc(token) - try: - del _errok, _token, _restart - except NameError: - pass - return r - #----------------------------------------------------------------------------- # === LR Parsing Engine === # @@ -318,33 +262,19 @@ class LRParser: def disable_defaulted_states(self): self.defaulted_states = {} - def parse(self, input=None, lexer=None, debug=False, tracking=False, tokenfunc=None): - if debug or yaccdevel: - if isinstance(debug, int): - debug = PlyLogger(sys.stderr) - return self.parsedebug(input, lexer, debug, tracking, tokenfunc) - elif tracking: - return self.parseopt(input, lexer, debug, tracking, tokenfunc) - else: - return self.parseopt_notrack(input, lexer, debug, tracking, tokenfunc) - - - # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - # parsedebug(). - # - # This is the debugging enabled version of parse(). All changes made to the - # parsing engine should be made here. Optimized versions of this function - # are automatically created by the ply/ygen.py script. This script cuts out - # sections enclosed in markers such as this: + # parse(). # - # #--! DEBUG - # statements - # #--! DEBUG - # - # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + # This is the core parsing engine. To operate, it requires a lexer object. + # Two options are provided. The debug flag turns on debugging so that you can + # see the various rule reductions and parsing steps. tracking turns on position + # tracking. 
In this mode, symbols will record the starting/ending line number and + # character index. + + def parse(self, input=None, lexer=None, debug=False, tracking=False): + # If debugging has been specified as a flag, turn it into a logging object + if isinstance(debug, int) and debug: + debug = PlyLogger(sys.stderr) - def parsedebug(self, input=None, lexer=None, debug=False, tracking=False, tokenfunc=None): - #--! parsedebug-start lookahead = None # Current lookahead symbol lookaheadstack = [] # Stack of lookahead symbols actions = self.action # Local reference to action table (to avoid lookup on self.) @@ -354,9 +284,8 @@ class LRParser: pslice = YaccProduction(None) # Production object passed to grammar rules errorcount = 0 # Used during error recovery - #--! DEBUG - debug.info('PLY: PARSE DEBUG START') - #--! DEBUG + if debug: + debug.info('PLY: PARSE DEBUG START') # If no lexer was given, we will try to use the lex module if not lexer: @@ -371,24 +300,14 @@ class LRParser: if input is not None: lexer.input(input) - if tokenfunc is None: - # Tokenize function - get_token = lexer.token - else: - get_token = tokenfunc - - # Set the parser() token method (sometimes used in error recovery) - self.token = get_token + # Set the token function + get_token = self.token = lexer.token # Set up the state and symbol stacks - - statestack = [] # Stack of parsing states - self.statestack = statestack - symstack = [] # Stack of grammar symbols - self.symstack = symstack - - pslice.stack = symstack # Put in the production - errtoken = None # Err token + statestack = self.statestack = [] # Stack of parsing states + symstack = self.symstack = [] # Stack of grammar symbols + pslice.stack = symstack # Put in the production + errtoken = None # Err token # The start state is assumed to be (0,$end) @@ -402,10 +321,8 @@ class LRParser: # is already set, we just use that. Otherwise, we'll pull # the next token off of the lookaheadstack or from the lexer - #--! 
DEBUG - debug.debug('') - debug.debug('State : %s', state) - #--! DEBUG + if debug: + debug.debug('State : %s', state) if state not in defaulted_states: if not lookahead: @@ -422,14 +339,12 @@ class LRParser: t = actions[state].get(ltype) else: t = defaulted_states[state] - #--! DEBUG - debug.debug('Defaulted state %s: Reduce using %d', state, -t) - #--! DEBUG + if debug: + debug.debug('Defaulted state %s: Reduce using %d', state, -t) - #--! DEBUG - debug.debug('Stack : %s', - ('%s . %s' % (' '.join([xx.type for xx in symstack][1:]), str(lookahead))).lstrip()) - #--! DEBUG + if debug: + debug.debug('Stack : %s', + ('%s . %s' % (' '.join([xx.type for xx in symstack][1:]), str(lookahead))).lstrip()) if t is not None: if t > 0: @@ -437,9 +352,8 @@ class LRParser: statestack.append(t) state = t - #--! DEBUG - debug.debug('Action : Shift and goto state %s', t) - #--! DEBUG + if debug: + debug.debug('Action : Shift and goto state %s', t) symstack.append(lookahead) lookahead = None @@ -460,22 +374,19 @@ class LRParser: sym.type = pname # Production name sym.value = None - #--! DEBUG - if plen: - debug.info('Action : Reduce rule [%s] with %s and goto state %d', p.str, - '['+','.join([format_stack_entry(_v.value) for _v in symstack[-plen:]])+']', - goto[statestack[-1-plen]][pname]) - else: - debug.info('Action : Reduce rule [%s] with %s and goto state %d', p.str, [], - goto[statestack[-1]][pname]) - - #--! DEBUG + if debug: + if plen: + debug.info('Action : Reduce rule [%s] with %s and goto state %d', p.str, + '['+','.join([format_stack_entry(_v.value) for _v in symstack[-plen:]])+']', + goto[statestack[-1-plen]][pname]) + else: + debug.info('Action : Reduce rule [%s] with %s and goto state %d', p.str, [], + goto[statestack[-1]][pname]) if plen: targ = symstack[-plen-1:] targ[0] = sym - #--! 
TRACKING if tracking: t1 = targ[1] sym.lineno = t1.lineno @@ -483,7 +394,6 @@ class LRParser: t1 = targ[-1] sym.endlineno = getattr(t1, 'endlineno', t1.lineno) sym.endlexpos = getattr(t1, 'endlexpos', t1.lexpos) - #--! TRACKING # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! # The code enclosed in this section is duplicated @@ -498,9 +408,8 @@ class LRParser: self.state = state p.callable(pslice) del statestack[-plen:] - #--! DEBUG - debug.info('Result : %s', format_result(pslice[0])) - #--! DEBUG + if debug: + debug.info('Result : %s', format_result(pslice[0])) symstack.append(sym) state = goto[statestack[-1]][pname] statestack.append(state) @@ -517,15 +426,12 @@ class LRParser: self.errorok = False continue - # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! else: - #--! TRACKING if tracking: sym.lineno = lexer.lineno sym.lexpos = lexer.lexpos - #--! TRACKING targ = [sym] @@ -540,9 +446,8 @@ class LRParser: # Call the grammar rule with our special slice object self.state = state p.callable(pslice) - #--! DEBUG - debug.info('Result : %s', format_result(pslice[0])) - #--! DEBUG + if debug: + debug.info('Result : %s', format_result(pslice[0])) symstack.append(sym) state = goto[statestack[-1]][pname] statestack.append(state) @@ -558,329 +463,22 @@ class LRParser: self.errorok = False continue - # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! if t == 0: n = symstack[-1] result = getattr(n, 'value', None) - #--! DEBUG - debug.info('Done : Returning %s', format_result(result)) - debug.info('PLY: PARSE DEBUG END') - #--! DEBUG - return result - - if t is None: - - #--! DEBUG - debug.error('Error : %s', - ('%s . %s' % (' '.join([xx.type for xx in symstack][1:]), str(lookahead))).lstrip()) - #--! DEBUG - - # We have some kind of parsing error here. To handle - # this, we are going to push the current token onto - # the tokenstack and replace it with an 'error' token. - # If there are any synchronization rules, they may - # catch it. 
- # - # In addition to pushing the error token, we call call - # the user defined p_error() function if this is the - # first syntax error. This function is only called if - # errorcount == 0. - if errorcount == 0 or self.errorok: - errorcount = error_count - self.errorok = False - errtoken = lookahead - if errtoken.type == '$end': - errtoken = None # End of file! - if self.errorfunc: - if errtoken and not hasattr(errtoken, 'lexer'): - errtoken.lexer = lexer - self.state = state - tok = call_errorfunc(self.errorfunc, errtoken, self) - if self.errorok: - # User must have done some kind of panic - # mode recovery on their own. The - # returned token is the next lookahead - lookahead = tok - errtoken = None - continue - else: - if errtoken: - if hasattr(errtoken, 'lineno'): - lineno = lookahead.lineno - else: - lineno = 0 - if lineno: - sys.stderr.write('yacc: Syntax error at line %d, token=%s\n' % (lineno, errtoken.type)) - else: - sys.stderr.write('yacc: Syntax error, token=%s' % errtoken.type) - else: - sys.stderr.write('yacc: Parse error in input. EOF\n') - return - - else: - errorcount = error_count - - # case 1: the statestack only has 1 entry on it. If we're in this state, the - # entire parse has been rolled back and we're completely hosed. The token is - # discarded and we just keep going. - - if len(statestack) <= 1 and lookahead.type != '$end': - lookahead = None - errtoken = None - state = 0 - # Nuke the pushback stack - del lookaheadstack[:] - continue - - # case 2: the statestack has a couple of entries on it, but we're - # at the end of the file. nuke the top entry and generate an error token - - # Start nuking entries on the stack - if lookahead.type == '$end': - # Whoa. We're really hosed here. Bail out - return - - if lookahead.type != 'error': - sym = symstack[-1] - if sym.type == 'error': - # Hmmm. Error is on top of stack, we'll just nuke input - # symbol and continue - #--! 
TRACKING - if tracking: - sym.endlineno = getattr(lookahead, 'lineno', sym.lineno) - sym.endlexpos = getattr(lookahead, 'lexpos', sym.lexpos) - #--! TRACKING - lookahead = None - continue - - # Create the error symbol for the first time and make it the new lookahead symbol - t = YaccSymbol() - t.type = 'error' - - if hasattr(lookahead, 'lineno'): - t.lineno = t.endlineno = lookahead.lineno - if hasattr(lookahead, 'lexpos'): - t.lexpos = t.endlexpos = lookahead.lexpos - t.value = lookahead - lookaheadstack.append(lookahead) - lookahead = t - else: - sym = symstack.pop() - #--! TRACKING - if tracking: - lookahead.lineno = sym.lineno - lookahead.lexpos = sym.lexpos - #--! TRACKING - statestack.pop() - state = statestack[-1] - - continue - - # Call an error function here - raise RuntimeError('yacc: internal parser error!!!\n') - - #--! parsedebug-end - - # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - # parseopt(). - # - # Optimized version of parse() method. DO NOT EDIT THIS CODE DIRECTLY! - # This code is automatically generated by the ply/ygen.py script. Make - # changes to the parsedebug() method instead. - # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - def parseopt(self, input=None, lexer=None, debug=False, tracking=False, tokenfunc=None): - #--! parseopt-start - lookahead = None # Current lookahead symbol - lookaheadstack = [] # Stack of lookahead symbols - actions = self.action # Local reference to action table (to avoid lookup on self.) - goto = self.goto # Local reference to goto table (to avoid lookup on self.) - prod = self.productions # Local reference to production list (to avoid lookup on self.) - defaulted_states = self.defaulted_states # Local reference to defaulted states - pslice = YaccProduction(None) # Production object passed to grammar rules - errorcount = 0 # Used during error recovery - - - # If no lexer was given, we will try to use the lex module - if not lexer: - from . 
import lex - lexer = lex.lexer - - # Set up the lexer and parser objects on pslice - pslice.lexer = lexer - pslice.parser = self - - # If input was supplied, pass to lexer - if input is not None: - lexer.input(input) - - if tokenfunc is None: - # Tokenize function - get_token = lexer.token - else: - get_token = tokenfunc - - # Set the parser() token method (sometimes used in error recovery) - self.token = get_token - - # Set up the state and symbol stacks - - statestack = [] # Stack of parsing states - self.statestack = statestack - symstack = [] # Stack of grammar symbols - self.symstack = symstack - - pslice.stack = symstack # Put in the production - errtoken = None # Err token - - # The start state is assumed to be (0,$end) - statestack.append(0) - sym = YaccSymbol() - sym.type = '$end' - symstack.append(sym) - state = 0 - while True: - # Get the next symbol on the input. If a lookahead symbol - # is already set, we just use that. Otherwise, we'll pull - # the next token off of the lookaheadstack or from the lexer - - - if state not in defaulted_states: - if not lookahead: - if not lookaheadstack: - lookahead = get_token() # Get the next token - else: - lookahead = lookaheadstack.pop() - if not lookahead: - lookahead = YaccSymbol() - lookahead.type = '$end' - - # Check the action table - ltype = lookahead.type - t = actions[state].get(ltype) - else: - t = defaulted_states[state] - - - if t is not None: - if t > 0: - # shift a symbol on the stack - statestack.append(t) - state = t - - - symstack.append(lookahead) - lookahead = None - - # Decrease error count on successful shift - if errorcount: - errorcount -= 1 - continue + if debug: + debug.info('Done : Returning %s', format_result(result)) + debug.info('PLY: PARSE DEBUG END') - if t < 0: - # reduce a symbol on the stack, emit a production - p = prod[-t] - pname = p.name - plen = p.len - - # Get production function - sym = YaccSymbol() - sym.type = pname # Production name - sym.value = None - - - if plen: - 
targ = symstack[-plen-1:] - targ[0] = sym - - #--! TRACKING - if tracking: - t1 = targ[1] - sym.lineno = t1.lineno - sym.lexpos = t1.lexpos - t1 = targ[-1] - sym.endlineno = getattr(t1, 'endlineno', t1.lineno) - sym.endlexpos = getattr(t1, 'endlexpos', t1.lexpos) - #--! TRACKING - - # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - # The code enclosed in this section is duplicated - # below as a performance optimization. Make sure - # changes get made in both locations. - - pslice.slice = targ - - try: - # Call the grammar rule with our special slice object - del symstack[-plen:] - self.state = state - p.callable(pslice) - del statestack[-plen:] - symstack.append(sym) - state = goto[statestack[-1]][pname] - statestack.append(state) - except SyntaxError: - # If an error was set. Enter error recovery state - lookaheadstack.append(lookahead) # Save the current lookahead token - symstack.extend(targ[1:-1]) # Put the production slice back on the stack - statestack.pop() # Pop back one state (before the reduce) - state = statestack[-1] - sym.type = 'error' - sym.value = 'error' - lookahead = sym - errorcount = error_count - self.errorok = False - - continue - # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - else: - - #--! TRACKING - if tracking: - sym.lineno = lexer.lineno - sym.lexpos = lexer.lexpos - #--! TRACKING - - targ = [sym] - - # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - # The code enclosed in this section is duplicated - # above as a performance optimization. Make sure - # changes get made in both locations. - - pslice.slice = targ - - try: - # Call the grammar rule with our special slice object - self.state = state - p.callable(pslice) - symstack.append(sym) - state = goto[statestack[-1]][pname] - statestack.append(state) - except SyntaxError: - # If an error was set. 
Enter error recovery state - lookaheadstack.append(lookahead) # Save the current lookahead token - statestack.pop() # Pop back one state (before the reduce) - state = statestack[-1] - sym.type = 'error' - sym.value = 'error' - lookahead = sym - errorcount = error_count - self.errorok = False - - continue - # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - if t == 0: - n = symstack[-1] - result = getattr(n, 'value', None) return result if t is None: + if debug: + debug.error('Error : %s', + ('%s . %s' % (' '.join([xx.type for xx in symstack][1:]), str(lookahead))).lstrip()) # We have some kind of parsing error here. To handle # this, we are going to push the current token onto @@ -902,7 +500,7 @@ class LRParser: if errtoken and not hasattr(errtoken, 'lexer'): errtoken.lexer = lexer self.state = state - tok = call_errorfunc(self.errorfunc, errtoken, self) + tok = self.errorfunc(errtoken) if self.errorok: # User must have done some kind of panic # mode recovery on their own. The @@ -952,11 +550,9 @@ class LRParser: if sym.type == 'error': # Hmmm. Error is on top of stack, we'll just nuke input # symbol and continue - #--! TRACKING if tracking: sym.endlineno = getattr(lookahead, 'lineno', sym.lineno) sym.endlexpos = getattr(lookahead, 'lexpos', sym.lexpos) - #--! TRACKING lookahead = None continue @@ -973,303 +569,17 @@ class LRParser: lookahead = t else: sym = symstack.pop() - #--! TRACKING if tracking: lookahead.lineno = sym.lineno lookahead.lexpos = sym.lexpos - #--! TRACKING statestack.pop() state = statestack[-1] continue - # Call an error function here + # If we'r here, something really bad happened raise RuntimeError('yacc: internal parser error!!!\n') - #--! parseopt-end - - # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - # parseopt_notrack(). - # - # Optimized version of parseopt() with line number tracking removed. - # DO NOT EDIT THIS CODE DIRECTLY. This code is automatically generated - # by the ply/ygen.py script. 
Make changes to the parsedebug() method instead. - # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - def parseopt_notrack(self, input=None, lexer=None, debug=False, tracking=False, tokenfunc=None): - #--! parseopt-notrack-start - lookahead = None # Current lookahead symbol - lookaheadstack = [] # Stack of lookahead symbols - actions = self.action # Local reference to action table (to avoid lookup on self.) - goto = self.goto # Local reference to goto table (to avoid lookup on self.) - prod = self.productions # Local reference to production list (to avoid lookup on self.) - defaulted_states = self.defaulted_states # Local reference to defaulted states - pslice = YaccProduction(None) # Production object passed to grammar rules - errorcount = 0 # Used during error recovery - - - # If no lexer was given, we will try to use the lex module - if not lexer: - from . import lex - lexer = lex.lexer - - # Set up the lexer and parser objects on pslice - pslice.lexer = lexer - pslice.parser = self - - # If input was supplied, pass to lexer - if input is not None: - lexer.input(input) - - if tokenfunc is None: - # Tokenize function - get_token = lexer.token - else: - get_token = tokenfunc - - # Set the parser() token method (sometimes used in error recovery) - self.token = get_token - - # Set up the state and symbol stacks - - statestack = [] # Stack of parsing states - self.statestack = statestack - symstack = [] # Stack of grammar symbols - self.symstack = symstack - - pslice.stack = symstack # Put in the production - errtoken = None # Err token - - # The start state is assumed to be (0,$end) - - statestack.append(0) - sym = YaccSymbol() - sym.type = '$end' - symstack.append(sym) - state = 0 - while True: - # Get the next symbol on the input. If a lookahead symbol - # is already set, we just use that. 
Otherwise, we'll pull - # the next token off of the lookaheadstack or from the lexer - - - if state not in defaulted_states: - if not lookahead: - if not lookaheadstack: - lookahead = get_token() # Get the next token - else: - lookahead = lookaheadstack.pop() - if not lookahead: - lookahead = YaccSymbol() - lookahead.type = '$end' - - # Check the action table - ltype = lookahead.type - t = actions[state].get(ltype) - else: - t = defaulted_states[state] - - - if t is not None: - if t > 0: - # shift a symbol on the stack - statestack.append(t) - state = t - - - symstack.append(lookahead) - lookahead = None - - # Decrease error count on successful shift - if errorcount: - errorcount -= 1 - continue - - if t < 0: - # reduce a symbol on the stack, emit a production - p = prod[-t] - pname = p.name - plen = p.len - - # Get production function - sym = YaccSymbol() - sym.type = pname # Production name - sym.value = None - - - if plen: - targ = symstack[-plen-1:] - targ[0] = sym - - - # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - # The code enclosed in this section is duplicated - # below as a performance optimization. Make sure - # changes get made in both locations. - - pslice.slice = targ - - try: - # Call the grammar rule with our special slice object - del symstack[-plen:] - self.state = state - p.callable(pslice) - del statestack[-plen:] - symstack.append(sym) - state = goto[statestack[-1]][pname] - statestack.append(state) - except SyntaxError: - # If an error was set. Enter error recovery state - lookaheadstack.append(lookahead) # Save the current lookahead token - symstack.extend(targ[1:-1]) # Put the production slice back on the stack - statestack.pop() # Pop back one state (before the reduce) - state = statestack[-1] - sym.type = 'error' - sym.value = 'error' - lookahead = sym - errorcount = error_count - self.errorok = False - - continue - # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
- - else: - - - targ = [sym] - - # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - # The code enclosed in this section is duplicated - # above as a performance optimization. Make sure - # changes get made in both locations. - - pslice.slice = targ - - try: - # Call the grammar rule with our special slice object - self.state = state - p.callable(pslice) - symstack.append(sym) - state = goto[statestack[-1]][pname] - statestack.append(state) - except SyntaxError: - # If an error was set. Enter error recovery state - lookaheadstack.append(lookahead) # Save the current lookahead token - statestack.pop() # Pop back one state (before the reduce) - state = statestack[-1] - sym.type = 'error' - sym.value = 'error' - lookahead = sym - errorcount = error_count - self.errorok = False - - continue - # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - if t == 0: - n = symstack[-1] - result = getattr(n, 'value', None) - return result - - if t is None: - - - # We have some kind of parsing error here. To handle - # this, we are going to push the current token onto - # the tokenstack and replace it with an 'error' token. - # If there are any synchronization rules, they may - # catch it. - # - # In addition to pushing the error token, we call call - # the user defined p_error() function if this is the - # first syntax error. This function is only called if - # errorcount == 0. - if errorcount == 0 or self.errorok: - errorcount = error_count - self.errorok = False - errtoken = lookahead - if errtoken.type == '$end': - errtoken = None # End of file! - if self.errorfunc: - if errtoken and not hasattr(errtoken, 'lexer'): - errtoken.lexer = lexer - self.state = state - tok = call_errorfunc(self.errorfunc, errtoken, self) - if self.errorok: - # User must have done some kind of panic - # mode recovery on their own. 
The - # returned token is the next lookahead - lookahead = tok - errtoken = None - continue - else: - if errtoken: - if hasattr(errtoken, 'lineno'): - lineno = lookahead.lineno - else: - lineno = 0 - if lineno: - sys.stderr.write('yacc: Syntax error at line %d, token=%s\n' % (lineno, errtoken.type)) - else: - sys.stderr.write('yacc: Syntax error, token=%s' % errtoken.type) - else: - sys.stderr.write('yacc: Parse error in input. EOF\n') - return - - else: - errorcount = error_count - - # case 1: the statestack only has 1 entry on it. If we're in this state, the - # entire parse has been rolled back and we're completely hosed. The token is - # discarded and we just keep going. - - if len(statestack) <= 1 and lookahead.type != '$end': - lookahead = None - errtoken = None - state = 0 - # Nuke the pushback stack - del lookaheadstack[:] - continue - - # case 2: the statestack has a couple of entries on it, but we're - # at the end of the file. nuke the top entry and generate an error token - - # Start nuking entries on the stack - if lookahead.type == '$end': - # Whoa. We're really hosed here. Bail out - return - - if lookahead.type != 'error': - sym = symstack[-1] - if sym.type == 'error': - # Hmmm. Error is on top of stack, we'll just nuke input - # symbol and continue - lookahead = None - continue - - # Create the error symbol for the first time and make it the new lookahead symbol - t = YaccSymbol() - t.type = 'error' - - if hasattr(lookahead, 'lineno'): - t.lineno = t.endlineno = lookahead.lineno - if hasattr(lookahead, 'lexpos'): - t.lexpos = t.endlexpos = lookahead.lexpos - t.value = lookahead - lookaheadstack.append(lookahead) - lookahead = t - else: - sym = symstack.pop() - statestack.pop() - state = statestack[-1] - - continue - - # Call an error function here - raise RuntimeError('yacc: internal parser error!!!\n') - - #--! 
parseopt-notrack-end - # ----------------------------------------------------------------------------- # === Grammar Representation === # @@ -1372,32 +682,6 @@ class Production(object): if self.func: self.callable = pdict[self.func] -# This class serves as a minimal standin for Production objects when -# reading table data from files. It only contains information -# actually used by the LR parsing engine, plus some additional -# debugging information. -class MiniProduction(object): - def __init__(self, str, name, len, func, file, line): - self.name = name - self.len = len - self.func = func - self.callable = None - self.file = file - self.line = line - self.str = str - - def __str__(self): - return self.str - - def __repr__(self): - return 'MiniProduction(%s)' % self.str - - # Bind the production function name to a callable - def bind(self, pdict): - if self.func: - self.callable = pdict[self.func] - - # ----------------------------------------------------------------------------- # class LRItem # @@ -1956,77 +1240,6 @@ class Grammar(object): p.lr_items = lr_items # ----------------------------------------------------------------------------- -# == Class LRTable == -# -# This basic class represents a basic table of LR parsing information. -# Methods for generating the tables are not defined here. They are defined -# in the derived class LRGeneratedTable. 
-# ----------------------------------------------------------------------------- - -class VersionError(YaccError): - pass - -class LRTable(object): - def __init__(self): - self.lr_action = None - self.lr_goto = None - self.lr_productions = None - self.lr_method = None - - def read_table(self, module): - if isinstance(module, types.ModuleType): - parsetab = module - else: - exec('import %s' % module) - parsetab = sys.modules[module] - - if parsetab._tabversion != __tabversion__: - raise VersionError('yacc table file version is out of date') - - self.lr_action = parsetab._lr_action - self.lr_goto = parsetab._lr_goto - - self.lr_productions = [] - for p in parsetab._lr_productions: - self.lr_productions.append(MiniProduction(*p)) - - self.lr_method = parsetab._lr_method - return parsetab._lr_signature - - def read_pickle(self, filename): - try: - import cPickle as pickle - except ImportError: - import pickle - - if not os.path.exists(filename): - raise ImportError - - in_f = open(filename, 'rb') - - tabversion = pickle.load(in_f) - if tabversion != __tabversion__: - raise VersionError('yacc table file version is out of date') - self.lr_method = pickle.load(in_f) - signature = pickle.load(in_f) - self.lr_action = pickle.load(in_f) - self.lr_goto = pickle.load(in_f) - productions = pickle.load(in_f) - - self.lr_productions = [] - for p in productions: - self.lr_productions.append(MiniProduction(*p)) - - in_f.close() - return signature - - # Bind all production function names to callable objects in pdict - def bind_callables(self, pdict): - for p in self.lr_productions: - p.bind(pdict) - - -# ----------------------------------------------------------------------------- # === LR Generator === # # The following classes and functions are used to generate LR parsing tables on @@ -2087,20 +1300,17 @@ def traverse(x, N, stack, F, X, R, FP): class LALRError(YaccError): pass + # ----------------------------------------------------------------------------- -# == LRGeneratedTable 
== +# == LRTable == # # This class implements the LR table generation algorithm. There are no -# public methods except for write() +# public methods. # ----------------------------------------------------------------------------- -class LRGeneratedTable(LRTable): - def __init__(self, grammar, method='LALR', log=None): - if method not in ['SLR', 'LALR']: - raise LALRError('Unsupported method %s' % method) - +class LRTable: + def __init__(self, grammar, log=None): self.grammar = grammar - self.lr_method = method # Set up the logger if not log: @@ -2130,6 +1340,11 @@ class LRGeneratedTable(LRTable): self.grammar.compute_follow() self.lr_parse_table() + # Bind all production function names to callable objects in pdict + def bind_callables(self, pdict): + for p in self.lr_productions: + p.bind(pdict) + # Compute the LR(0) closure operation on I, where I is a set of LR(0) items. def lr0_closure(self, I): @@ -2536,15 +1751,11 @@ class LRGeneratedTable(LRTable): actionp = {} # Action production array (temporary) - log.info('Parsing method: %s', self.lr_method) - # Step 1: Construct C = { I0, I1, ... IN}, collection of LR(0) items # This determines the number of states C = self.lr0_items() - - if self.lr_method == 'LALR': - self.add_lalr_lookaheads(C) + self.add_lalr_lookaheads(C) # Build the parser table, state by state st = 0 @@ -2569,10 +1780,7 @@ class LRGeneratedTable(LRTable): st_actionp['$end'] = p else: # We are at the end of a production. Reduce! 
- if self.lr_method == 'LALR': - laheads = p.lookaheads[st] - else: - laheads = self.grammar.Follow[p.name] + laheads = p.lookaheads[st] for a in laheads: actlist.append((a, p, 'reduce using rule %d (%s)' % (p.number, p))) r = st_action.get(a) @@ -2714,155 +1922,6 @@ class LRGeneratedTable(LRTable): goto[st] = st_goto st += 1 - # ----------------------------------------------------------------------------- - # write() - # - # This function writes the LR parsing tables to a file - # ----------------------------------------------------------------------------- - - def write_table(self, tabmodule, outputdir='', signature=''): - if isinstance(tabmodule, types.ModuleType): - raise IOError("Won't overwrite existing tabmodule") - - basemodulename = tabmodule.split('.')[-1] - filename = os.path.join(outputdir, basemodulename) + '.py' - try: - f = open(filename, 'w') - - f.write(''' -# %s -# This file is automatically generated. Do not edit. -# pylint: disable=W,C,R -_tabversion = %r - -_lr_method = %r - -_lr_signature = %r - ''' % (os.path.basename(filename), __tabversion__, self.lr_method, signature)) - - # Change smaller to 0 to go back to original tables - smaller = 1 - - # Factor out names to try and make smaller - if smaller: - items = {} - - for s, nd in self.lr_action.items(): - for name, v in nd.items(): - i = items.get(name) - if not i: - i = ([], []) - items[name] = i - i[0].append(s) - i[1].append(v) - - f.write('\n_lr_action_items = {') - for k, v in items.items(): - f.write('%r:([' % k) - for i in v[0]: - f.write('%r,' % i) - f.write('],[') - for i in v[1]: - f.write('%r,' % i) - - f.write(']),') - f.write('}\n') - - f.write(''' -_lr_action = {} -for _k, _v in _lr_action_items.items(): - for _x,_y in zip(_v[0],_v[1]): - if not _x in _lr_action: _lr_action[_x] = {} - _lr_action[_x][_k] = _y -del _lr_action_items -''') - - else: - f.write('\n_lr_action = { ') - for k, v in self.lr_action.items(): - f.write('(%r,%r):%r,' % (k[0], k[1], v)) - f.write('}\n') - - if 
smaller: - # Factor out names to try and make smaller - items = {} - - for s, nd in self.lr_goto.items(): - for name, v in nd.items(): - i = items.get(name) - if not i: - i = ([], []) - items[name] = i - i[0].append(s) - i[1].append(v) - - f.write('\n_lr_goto_items = {') - for k, v in items.items(): - f.write('%r:([' % k) - for i in v[0]: - f.write('%r,' % i) - f.write('],[') - for i in v[1]: - f.write('%r,' % i) - - f.write(']),') - f.write('}\n') - - f.write(''' -_lr_goto = {} -for _k, _v in _lr_goto_items.items(): - for _x, _y in zip(_v[0], _v[1]): - if not _x in _lr_goto: _lr_goto[_x] = {} - _lr_goto[_x][_k] = _y -del _lr_goto_items -''') - else: - f.write('\n_lr_goto = { ') - for k, v in self.lr_goto.items(): - f.write('(%r,%r):%r,' % (k[0], k[1], v)) - f.write('}\n') - - # Write production table - f.write('_lr_productions = [\n') - for p in self.lr_productions: - if p.func: - f.write(' (%r,%r,%d,%r,%r,%d),\n' % (p.str, p.name, p.len, - p.func, os.path.basename(p.file), p.line)) - else: - f.write(' (%r,%r,%d,None,None,None),\n' % (str(p), p.name, p.len)) - f.write(']\n') - f.close() - - except IOError as e: - raise - - - # ----------------------------------------------------------------------------- - # pickle_table() - # - # This function pickles the LR parsing tables to a supplied file object - # ----------------------------------------------------------------------------- - - def pickle_table(self, filename, signature=''): - try: - import cPickle as pickle - except ImportError: - import pickle - with open(filename, 'wb') as outf: - pickle.dump(__tabversion__, outf, pickle_protocol) - pickle.dump(self.lr_method, outf, pickle_protocol) - pickle.dump(signature, outf, pickle_protocol) - pickle.dump(self.lr_action, outf, pickle_protocol) - pickle.dump(self.lr_goto, outf, pickle_protocol) - - outp = [] - for p in self.lr_productions: - if p.func: - outp.append((p.str, p.name, p.len, p.func, os.path.basename(p.file), p.line)) - else: - outp.append((str(p), p.name, 
p.len, None, None, None)) - pickle.dump(outp, outf, pickle_protocol) - # ----------------------------------------------------------------------------- # === INTROSPECTION === # @@ -3209,20 +2268,13 @@ class ParserReflect(object): # Build a parser # ----------------------------------------------------------------------------- -def yacc(method='LALR', debug=yaccdebug, module=None, tabmodule=tab_module, start=None, - check_recursion=True, optimize=False, write_tables=True, debugfile=debug_file, - outputdir=None, debuglog=None, errorlog=None, picklefile=None): - - if tabmodule is None: - tabmodule = tab_module +def yacc(*, debug=yaccdebug, module=None, start=None, + check_recursion=True, optimize=False, debugfile=debug_file, + debuglog=None, errorlog=None): # Reference to the parsing method of the last built parser global parse - # If pickling is enabled, table files are not created - if picklefile: - write_tables = 0 - if errorlog is None: errorlog = PlyLogger(sys.stderr) @@ -3240,32 +2292,6 @@ def yacc(method='LALR', debug=yaccdebug, module=None, tabmodule=tab_module, star else: pdict = get_caller_module_dict(2) - if outputdir is None: - # If no output directory is set, the location of the output files - # is determined according to the following rules: - # - If tabmodule specifies a package, files go into that package directory - # - Otherwise, files go in the same directory as the specifying module - if isinstance(tabmodule, types.ModuleType): - srcfile = tabmodule.__file__ - else: - if '.' not in tabmodule: - srcfile = pdict['__file__'] - else: - parts = tabmodule.split('.') - pkgname = '.'.join(parts[:-1]) - exec('import %s' % pkgname) - srcfile = getattr(sys.modules[pkgname], '__file__', '') - outputdir = os.path.dirname(srcfile) - - # Determine if the module is package of a package or not. - # If so, fix the tabmodule setting so that tables load correctly - pkg = pdict.get('__package__') - if pkg and isinstance(tabmodule, str): - if '.' 
not in tabmodule: - tabmodule = pkg + '.' + tabmodule - - - # Set start symbol if it's specified directly using an argument if start is not None: pdict['start'] = start @@ -3277,40 +2303,17 @@ def yacc(method='LALR', debug=yaccdebug, module=None, tabmodule=tab_module, star if pinfo.error: raise YaccError('Unable to build parser') - # Check signature against table files (if any) - signature = pinfo.signature() - - # Read the tables - try: - lr = LRTable() - if picklefile: - read_signature = lr.read_pickle(picklefile) - else: - read_signature = lr.read_table(tabmodule) - if optimize or (read_signature == signature): - try: - lr.bind_callables(pinfo.pdict) - parser = LRParser(lr, pinfo.error_func) - parse = parser.parse - return parser - except Exception as e: - errorlog.warning('There was a problem loading the table file: %r', e) - except VersionError as e: - errorlog.warning(str(e)) - except ImportError: - pass - if debuglog is None: if debug: try: - debuglog = PlyLogger(open(os.path.join(outputdir, debugfile), 'w')) + debuglog = PlyLogger(open(debugfile, 'w')) except IOError as e: errorlog.warning("Couldn't open %r. 
%s" % (debugfile, e)) debuglog = NullLogger() else: debuglog = NullLogger() - debuglog.info('Created by PLY version %s (http://www.dabeaz.com/ply)', __version__) + debuglog.info('Created by PLY (http://www.dabeaz.com/ply)') errors = False @@ -3427,11 +2430,8 @@ def yacc(method='LALR', debug=yaccdebug, module=None, tabmodule=tab_module, star if errors: raise YaccError('Unable to build parser') - # Run the LRGeneratedTable on the grammar - if debug: - errorlog.debug('Generating %s tables', method) - - lr = LRGeneratedTable(grammar, method, debuglog) + # Run the LRTable on the grammar + lr = LRTable(grammar, debuglog) if debug: num_sr = len(lr.sr_conflicts) @@ -3474,22 +2474,6 @@ def yacc(method='LALR', debug=yaccdebug, module=None, tabmodule=tab_module, star errorlog.warning('Rule (%s) is never reduced', rejected) warned_never.append(rejected) - # Write the table file if requested - if write_tables: - try: - lr.write_table(tabmodule, outputdir, signature) - if tabmodule in sys.modules: - del sys.modules[tabmodule] - except IOError as e: - errorlog.warning("Couldn't create %r. %s" % (tabmodule, e)) - - # Write a pickled version of the tables - if picklefile: - try: - lr.pickle_table(picklefile, signature) - except IOError as e: - errorlog.warning("Couldn't create %r. %s" % (picklefile, e)) - # Build the parser lr.bind_callables(pinfo.pdict) parser = LRParser(lr, pinfo.error_func) diff --git a/ply/ygen.py b/ply/ygen.py deleted file mode 100644 index 03b9318..0000000 --- a/ply/ygen.py +++ /dev/null @@ -1,69 +0,0 @@ -# ply: ygen.py -# -# This is a support program that auto-generates different versions of the YACC parsing -# function with different features removed for the purposes of performance. -# -# Users should edit the method LRParser.parsedebug() in yacc.py. The source code -# for that method is then used to create the other methods. See the comments in -# yacc.py for further details. 
- -import os.path -import shutil - -def get_source_range(lines, tag): - srclines = enumerate(lines) - start_tag = '#--! %s-start' % tag - end_tag = '#--! %s-end' % tag - - for start_index, line in srclines: - if line.strip().startswith(start_tag): - break - - for end_index, line in srclines: - if line.strip().endswith(end_tag): - break - - return (start_index + 1, end_index) - -def filter_section(lines, tag): - filtered_lines = [] - include = True - tag_text = '#--! %s' % tag - for line in lines: - if line.strip().startswith(tag_text): - include = not include - elif include: - filtered_lines.append(line) - return filtered_lines - -def main(): - dirname = os.path.dirname(__file__) - shutil.copy2(os.path.join(dirname, 'yacc.py'), os.path.join(dirname, 'yacc.py.bak')) - with open(os.path.join(dirname, 'yacc.py'), 'r') as f: - lines = f.readlines() - - parse_start, parse_end = get_source_range(lines, 'parsedebug') - parseopt_start, parseopt_end = get_source_range(lines, 'parseopt') - parseopt_notrack_start, parseopt_notrack_end = get_source_range(lines, 'parseopt-notrack') - - # Get the original source - orig_lines = lines[parse_start:parse_end] - - # Filter the DEBUG sections out - parseopt_lines = filter_section(orig_lines, 'DEBUG') - - # Filter the TRACKING sections out - parseopt_notrack_lines = filter_section(parseopt_lines, 'TRACKING') - - # Replace the parser source sections with updated versions - lines[parseopt_notrack_start:parseopt_notrack_end] = parseopt_notrack_lines - lines[parseopt_start:parseopt_end] = parseopt_lines - - lines = [line.rstrip()+'\n' for line in lines] - with open(os.path.join(dirname, 'yacc.py'), 'w') as f: - f.writelines(lines) - - print('Updated yacc.py') - -if __name__ == '__main__': - main() @@ -34,17 +34,3 @@ literally no reason to ever upgrade it. Keep using the version of code that you copied. If you think you've found a bug, check back with the repository to see if it's been fixed. 
Or submit it as an issue so that it can be looked at. - - - - - - - - - - - - - - diff --git a/test/lex_many_tokens.py b/test/lex_many_tokens.py index 77ae12b..81ae57a 100644 --- a/test/lex_many_tokens.py +++ b/test/lex_many_tokens.py @@ -21,7 +21,7 @@ t_ignore = " \t" def t_error(t): pass -lex.lex(optimize=1,lextab="manytab") +lex.lex() lex.runmain(data="TOK34: TOK143: TOK269: TOK372: TOK452: TOK561: TOK999:") diff --git a/test/lex_opt_alias.py b/test/lex_opt_alias.py deleted file mode 100644 index 5d5ed4c..0000000 --- a/test/lex_opt_alias.py +++ /dev/null @@ -1,54 +0,0 @@ -# ----------------------------------------------------------------------------- -# lex_opt_alias.py -# -# Tests ability to match up functions with states, aliases, and -# lexing tables. -# ----------------------------------------------------------------------------- - -import sys -if ".." not in sys.path: sys.path.insert(0,"..") - -tokens = ( - 'NAME','NUMBER', - ) - -states = (('instdef','inclusive'),('spam','exclusive')) - -literals = ['=','+','-','*','/', '(',')'] - -# Tokens - -def t_instdef_spam_BITS(t): - r'[01-]+' - return t - -t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' - -def NUMBER(t): - r'\d+' - try: - t.value = int(t.value) - except ValueError: - print("Integer value too large %s" % t.value) - t.value = 0 - return t - -t_ANY_NUMBER = NUMBER - -t_ignore = " \t" -t_spam_ignore = t_ignore - -def t_newline(t): - r'\n+' - t.lexer.lineno += t.value.count("\n") - -def t_error(t): - print("Illegal character '%s'" % t.value[0]) - t.lexer.skip(1) - -t_spam_error = t_error - -# Build the lexer -import ply.lex as lex -lex.lex(optimize=1,lextab="aliastab") -lex.runmain(data="3+4") diff --git a/test/lex_optimize.py b/test/lex_optimize.py deleted file mode 100644 index 0e447e6..0000000 --- a/test/lex_optimize.py +++ /dev/null @@ -1,50 +0,0 @@ -# ----------------------------------------------------------------------------- -# lex_optimize.py -# 
----------------------------------------------------------------------------- -import sys - -if ".." not in sys.path: sys.path.insert(0,"..") -import ply.lex as lex - -tokens = ( - 'NAME','NUMBER', - 'PLUS','MINUS','TIMES','DIVIDE','EQUALS', - 'LPAREN','RPAREN', - ) - -# Tokens - -t_PLUS = r'\+' -t_MINUS = r'-' -t_TIMES = r'\*' -t_DIVIDE = r'/' -t_EQUALS = r'=' -t_LPAREN = r'\(' -t_RPAREN = r'\)' -t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' - -def t_NUMBER(t): - r'\d+' - try: - t.value = int(t.value) - except ValueError: - print("Integer value too large %s" % t.value) - t.value = 0 - return t - -t_ignore = " \t" - -def t_newline(t): - r'\n+' - t.lineno += t.value.count("\n") - -def t_error(t): - print("Illegal character '%s'" % t.value[0]) - t.lexer.skip(1) - -# Build the lexer -lex.lex(optimize=1) -lex.runmain(data="3+4") - - - diff --git a/test/lex_optimize2.py b/test/lex_optimize2.py deleted file mode 100644 index 64555f6..0000000 --- a/test/lex_optimize2.py +++ /dev/null @@ -1,50 +0,0 @@ -# ----------------------------------------------------------------------------- -# lex_optimize2.py -# ----------------------------------------------------------------------------- -import sys - -if ".." 
not in sys.path: sys.path.insert(0,"..") -import ply.lex as lex - -tokens = ( - 'NAME','NUMBER', - 'PLUS','MINUS','TIMES','DIVIDE','EQUALS', - 'LPAREN','RPAREN', - ) - -# Tokens - -t_PLUS = r'\+' -t_MINUS = r'-' -t_TIMES = r'\*' -t_DIVIDE = r'/' -t_EQUALS = r'=' -t_LPAREN = r'\(' -t_RPAREN = r'\)' -t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' - -def t_NUMBER(t): - r'\d+' - try: - t.value = int(t.value) - except ValueError: - print("Integer value too large %s" % t.value) - t.value = 0 - return t - -t_ignore = " \t" - -def t_newline(t): - r'\n+' - t.lineno += t.value.count("\n") - -def t_error(t): - print("Illegal character '%s'" % t.value[0]) - t.lexer.skip(1) - -# Build the lexer -lex.lex(optimize=1,lextab="opt2tab") -lex.runmain(data="3+4") - - - diff --git a/test/lex_optimize3.py b/test/lex_optimize3.py deleted file mode 100644 index b8df5aa..0000000 --- a/test/lex_optimize3.py +++ /dev/null @@ -1,52 +0,0 @@ -# ----------------------------------------------------------------------------- -# lex_optimize3.py -# -# Writes table in a subdirectory structure. -# ----------------------------------------------------------------------------- -import sys - -if ".." 
not in sys.path: sys.path.insert(0,"..") -import ply.lex as lex - -tokens = ( - 'NAME','NUMBER', - 'PLUS','MINUS','TIMES','DIVIDE','EQUALS', - 'LPAREN','RPAREN', - ) - -# Tokens - -t_PLUS = r'\+' -t_MINUS = r'-' -t_TIMES = r'\*' -t_DIVIDE = r'/' -t_EQUALS = r'=' -t_LPAREN = r'\(' -t_RPAREN = r'\)' -t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' - -def t_NUMBER(t): - r'\d+' - try: - t.value = int(t.value) - except ValueError: - print("Integer value too large %s" % t.value) - t.value = 0 - return t - -t_ignore = " \t" - -def t_newline(t): - r'\n+' - t.lineno += t.value.count("\n") - -def t_error(t): - print("Illegal character '%s'" % t.value[0]) - t.lexer.skip(1) - -# Build the lexer -lex.lex(optimize=1,lextab="lexdir.sub.calctab" ,outputdir="lexdir/sub") -lex.runmain(data="3+4") - - - diff --git a/test/lex_optimize4.py b/test/lex_optimize4.py deleted file mode 100644 index cc6e2a9..0000000 --- a/test/lex_optimize4.py +++ /dev/null @@ -1,26 +0,0 @@ -# ----------------------------------------------------------------------------- -# lex_optimize4.py -# ----------------------------------------------------------------------------- -import re -import sys - -if ".." not in sys.path: sys.path.insert(0,"..") -import ply.lex as lex - -tokens = [ - "PLUS", - "MINUS", - "NUMBER", - ] - -t_PLUS = r'\+?' -t_MINUS = r'-' -t_NUMBER = r'(\d+)' - -def t_error(t): - pass - - -# Build the lexer -lex.lex(optimize=True, lextab="opt4tab", reflags=re.UNICODE) -lex.runmain(data="3+4") diff --git a/test/lex_token5.py b/test/lex_token5.py deleted file mode 100644 index ef7a3c5..0000000 --- a/test/lex_token5.py +++ /dev/null @@ -1,31 +0,0 @@ -# lex_token5.py -# -# Return a bad token name - -import sys -if ".." 
not in sys.path: sys.path.insert(0,"..") - -import ply.lex as lex - -tokens = [ - "PLUS", - "MINUS", - "NUMBER", - ] - -t_PLUS = r'\+' -t_MINUS = r'-' - -def t_NUMBER(t): - r'\d+' - t.type = "NUM" - return t - -def t_error(t): - pass - -lex.lex() -lex.input("1234") -t = lex.token() - - diff --git a/test/pkg_test1/__init__.py b/test/pkg_test1/__init__.py deleted file mode 100644 index 0e19558..0000000 --- a/test/pkg_test1/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -# Tests proper handling of lextab and parsetab files in package structures - -# Here for testing purposes -import sys -if '..' not in sys.path: - sys.path.insert(0, '..') - -from .parsing.calcparse import parser - diff --git a/test/pkg_test1/parsing/__init__.py b/test/pkg_test1/parsing/__init__.py deleted file mode 100644 index e69de29..0000000 --- a/test/pkg_test1/parsing/__init__.py +++ /dev/null diff --git a/test/pkg_test1/parsing/calclex.py b/test/pkg_test1/parsing/calclex.py deleted file mode 100644 index b3c1a4d..0000000 --- a/test/pkg_test1/parsing/calclex.py +++ /dev/null @@ -1,47 +0,0 @@ -# ----------------------------------------------------------------------------- -# calclex.py -# ----------------------------------------------------------------------------- - -import ply.lex as lex - -tokens = ( - 'NAME','NUMBER', - 'PLUS','MINUS','TIMES','DIVIDE','EQUALS', - 'LPAREN','RPAREN', - ) - -# Tokens - -t_PLUS = r'\+' -t_MINUS = r'-' -t_TIMES = r'\*' -t_DIVIDE = r'/' -t_EQUALS = r'=' -t_LPAREN = r'\(' -t_RPAREN = r'\)' -t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' - -def t_NUMBER(t): - r'\d+' - try: - t.value = int(t.value) - except ValueError: - print("Integer value too large %s" % t.value) - t.value = 0 - return t - -t_ignore = " \t" - -def t_newline(t): - r'\n+' - t.lexer.lineno += t.value.count("\n") - -def t_error(t): - print("Illegal character '%s'" % t.value[0]) - t.lexer.skip(1) - -# Build the lexer -lexer = lex.lex(optimize=True) - - - diff --git a/test/pkg_test1/parsing/calcparse.py 
b/test/pkg_test1/parsing/calcparse.py deleted file mode 100644 index c058e9f..0000000 --- a/test/pkg_test1/parsing/calcparse.py +++ /dev/null @@ -1,66 +0,0 @@ -# ----------------------------------------------------------------------------- -# yacc_simple.py -# -# A simple, properly specifier grammar -# ----------------------------------------------------------------------------- - -from .calclex import tokens -from ply import yacc - -# Parsing rules -precedence = ( - ('left','PLUS','MINUS'), - ('left','TIMES','DIVIDE'), - ('right','UMINUS'), - ) - -# dictionary of names -names = { } - -def p_statement_assign(t): - 'statement : NAME EQUALS expression' - names[t[1]] = t[3] - -def p_statement_expr(t): - 'statement : expression' - t[0] = t[1] - -def p_expression_binop(t): - '''expression : expression PLUS expression - | expression MINUS expression - | expression TIMES expression - | expression DIVIDE expression''' - if t[2] == '+' : t[0] = t[1] + t[3] - elif t[2] == '-': t[0] = t[1] - t[3] - elif t[2] == '*': t[0] = t[1] * t[3] - elif t[2] == '/': t[0] = t[1] / t[3] - -def p_expression_uminus(t): - 'expression : MINUS expression %prec UMINUS' - t[0] = -t[2] - -def p_expression_group(t): - 'expression : LPAREN expression RPAREN' - t[0] = t[2] - -def p_expression_number(t): - 'expression : NUMBER' - t[0] = t[1] - -def p_expression_name(t): - 'expression : NAME' - try: - t[0] = names[t[1]] - except LookupError: - print("Undefined name '%s'" % t[1]) - t[0] = 0 - -def p_error(t): - print("Syntax error at '%s'" % t.value) - -parser = yacc.yacc() - - - - - diff --git a/test/pkg_test2/__init__.py b/test/pkg_test2/__init__.py deleted file mode 100644 index 0e19558..0000000 --- a/test/pkg_test2/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -# Tests proper handling of lextab and parsetab files in package structures - -# Here for testing purposes -import sys -if '..' 
not in sys.path: - sys.path.insert(0, '..') - -from .parsing.calcparse import parser - diff --git a/test/pkg_test2/parsing/__init__.py b/test/pkg_test2/parsing/__init__.py deleted file mode 100644 index e69de29..0000000 --- a/test/pkg_test2/parsing/__init__.py +++ /dev/null diff --git a/test/pkg_test2/parsing/calclex.py b/test/pkg_test2/parsing/calclex.py deleted file mode 100644 index 789e13f..0000000 --- a/test/pkg_test2/parsing/calclex.py +++ /dev/null @@ -1,47 +0,0 @@ -# ----------------------------------------------------------------------------- -# calclex.py -# ----------------------------------------------------------------------------- - -import ply.lex as lex - -tokens = ( - 'NAME','NUMBER', - 'PLUS','MINUS','TIMES','DIVIDE','EQUALS', - 'LPAREN','RPAREN', - ) - -# Tokens - -t_PLUS = r'\+' -t_MINUS = r'-' -t_TIMES = r'\*' -t_DIVIDE = r'/' -t_EQUALS = r'=' -t_LPAREN = r'\(' -t_RPAREN = r'\)' -t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' - -def t_NUMBER(t): - r'\d+' - try: - t.value = int(t.value) - except ValueError: - print("Integer value too large %s" % t.value) - t.value = 0 - return t - -t_ignore = " \t" - -def t_newline(t): - r'\n+' - t.lexer.lineno += t.value.count("\n") - -def t_error(t): - print("Illegal character '%s'" % t.value[0]) - t.lexer.skip(1) - -# Build the lexer -lexer = lex.lex(optimize=True, lextab='calclextab') - - - diff --git a/test/pkg_test2/parsing/calcparse.py b/test/pkg_test2/parsing/calcparse.py deleted file mode 100644 index f519338..0000000 --- a/test/pkg_test2/parsing/calcparse.py +++ /dev/null @@ -1,66 +0,0 @@ -# ----------------------------------------------------------------------------- -# yacc_simple.py -# -# A simple, properly specifier grammar -# ----------------------------------------------------------------------------- - -from .calclex import tokens -from ply import yacc - -# Parsing rules -precedence = ( - ('left','PLUS','MINUS'), - ('left','TIMES','DIVIDE'), - ('right','UMINUS'), - ) - -# dictionary of names -names = { } - 
-def p_statement_assign(t): - 'statement : NAME EQUALS expression' - names[t[1]] = t[3] - -def p_statement_expr(t): - 'statement : expression' - t[0] = t[1] - -def p_expression_binop(t): - '''expression : expression PLUS expression - | expression MINUS expression - | expression TIMES expression - | expression DIVIDE expression''' - if t[2] == '+' : t[0] = t[1] + t[3] - elif t[2] == '-': t[0] = t[1] - t[3] - elif t[2] == '*': t[0] = t[1] * t[3] - elif t[2] == '/': t[0] = t[1] / t[3] - -def p_expression_uminus(t): - 'expression : MINUS expression %prec UMINUS' - t[0] = -t[2] - -def p_expression_group(t): - 'expression : LPAREN expression RPAREN' - t[0] = t[2] - -def p_expression_number(t): - 'expression : NUMBER' - t[0] = t[1] - -def p_expression_name(t): - 'expression : NAME' - try: - t[0] = names[t[1]] - except LookupError: - print("Undefined name '%s'" % t[1]) - t[0] = 0 - -def p_error(t): - print("Syntax error at '%s'" % t.value) - -parser = yacc.yacc(tabmodule='calcparsetab') - - - - - diff --git a/test/pkg_test3/__init__.py b/test/pkg_test3/__init__.py deleted file mode 100644 index 0e19558..0000000 --- a/test/pkg_test3/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -# Tests proper handling of lextab and parsetab files in package structures - -# Here for testing purposes -import sys -if '..' 
not in sys.path: - sys.path.insert(0, '..') - -from .parsing.calcparse import parser - diff --git a/test/pkg_test3/generated/__init__.py b/test/pkg_test3/generated/__init__.py deleted file mode 100644 index e69de29..0000000 --- a/test/pkg_test3/generated/__init__.py +++ /dev/null diff --git a/test/pkg_test3/parsing/__init__.py b/test/pkg_test3/parsing/__init__.py deleted file mode 100644 index e69de29..0000000 --- a/test/pkg_test3/parsing/__init__.py +++ /dev/null diff --git a/test/pkg_test3/parsing/calclex.py b/test/pkg_test3/parsing/calclex.py deleted file mode 100644 index 6ca2c4f..0000000 --- a/test/pkg_test3/parsing/calclex.py +++ /dev/null @@ -1,47 +0,0 @@ -# ----------------------------------------------------------------------------- -# calclex.py -# ----------------------------------------------------------------------------- - -import ply.lex as lex - -tokens = ( - 'NAME','NUMBER', - 'PLUS','MINUS','TIMES','DIVIDE','EQUALS', - 'LPAREN','RPAREN', - ) - -# Tokens - -t_PLUS = r'\+' -t_MINUS = r'-' -t_TIMES = r'\*' -t_DIVIDE = r'/' -t_EQUALS = r'=' -t_LPAREN = r'\(' -t_RPAREN = r'\)' -t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' - -def t_NUMBER(t): - r'\d+' - try: - t.value = int(t.value) - except ValueError: - print("Integer value too large %s" % t.value) - t.value = 0 - return t - -t_ignore = " \t" - -def t_newline(t): - r'\n+' - t.lexer.lineno += t.value.count("\n") - -def t_error(t): - print("Illegal character '%s'" % t.value[0]) - t.lexer.skip(1) - -# Build the lexer -lexer = lex.lex(optimize=True, lextab='pkg_test3.generated.lextab') - - - diff --git a/test/pkg_test3/parsing/calcparse.py b/test/pkg_test3/parsing/calcparse.py deleted file mode 100644 index 2dcb52b..0000000 --- a/test/pkg_test3/parsing/calcparse.py +++ /dev/null @@ -1,66 +0,0 @@ -# ----------------------------------------------------------------------------- -# yacc_simple.py -# -# A simple, properly specifier grammar -# ----------------------------------------------------------------------------- 
- -from .calclex import tokens -from ply import yacc - -# Parsing rules -precedence = ( - ('left','PLUS','MINUS'), - ('left','TIMES','DIVIDE'), - ('right','UMINUS'), - ) - -# dictionary of names -names = { } - -def p_statement_assign(t): - 'statement : NAME EQUALS expression' - names[t[1]] = t[3] - -def p_statement_expr(t): - 'statement : expression' - t[0] = t[1] - -def p_expression_binop(t): - '''expression : expression PLUS expression - | expression MINUS expression - | expression TIMES expression - | expression DIVIDE expression''' - if t[2] == '+' : t[0] = t[1] + t[3] - elif t[2] == '-': t[0] = t[1] - t[3] - elif t[2] == '*': t[0] = t[1] * t[3] - elif t[2] == '/': t[0] = t[1] / t[3] - -def p_expression_uminus(t): - 'expression : MINUS expression %prec UMINUS' - t[0] = -t[2] - -def p_expression_group(t): - 'expression : LPAREN expression RPAREN' - t[0] = t[2] - -def p_expression_number(t): - 'expression : NUMBER' - t[0] = t[1] - -def p_expression_name(t): - 'expression : NAME' - try: - t[0] = names[t[1]] - except LookupError: - print("Undefined name '%s'" % t[1]) - t[0] = 0 - -def p_error(t): - print("Syntax error at '%s'" % t.value) - -parser = yacc.yacc(tabmodule='pkg_test3.generated.parsetab') - - - - - diff --git a/test/pkg_test4/__init__.py b/test/pkg_test4/__init__.py deleted file mode 100644 index ba9ddac..0000000 --- a/test/pkg_test4/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -# Tests proper handling of lextab and parsetab files in package structures -# Check of warning messages when files aren't writable - -# Here for testing purposes -import sys -if '..' 
not in sys.path: - sys.path.insert(0, '..') - -import ply.lex -import ply.yacc - -def patched_open(filename, mode): - if 'w' in mode: - raise IOError("Permission denied %r" % filename) - return open(filename, mode) - -ply.lex.open = patched_open -ply.yacc.open = patched_open -try: - from .parsing.calcparse import parser -finally: - del ply.lex.open - del ply.yacc.open - - diff --git a/test/pkg_test4/parsing/__init__.py b/test/pkg_test4/parsing/__init__.py deleted file mode 100644 index e69de29..0000000 --- a/test/pkg_test4/parsing/__init__.py +++ /dev/null diff --git a/test/pkg_test4/parsing/calclex.py b/test/pkg_test4/parsing/calclex.py deleted file mode 100644 index b3c1a4d..0000000 --- a/test/pkg_test4/parsing/calclex.py +++ /dev/null @@ -1,47 +0,0 @@ -# ----------------------------------------------------------------------------- -# calclex.py -# ----------------------------------------------------------------------------- - -import ply.lex as lex - -tokens = ( - 'NAME','NUMBER', - 'PLUS','MINUS','TIMES','DIVIDE','EQUALS', - 'LPAREN','RPAREN', - ) - -# Tokens - -t_PLUS = r'\+' -t_MINUS = r'-' -t_TIMES = r'\*' -t_DIVIDE = r'/' -t_EQUALS = r'=' -t_LPAREN = r'\(' -t_RPAREN = r'\)' -t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' - -def t_NUMBER(t): - r'\d+' - try: - t.value = int(t.value) - except ValueError: - print("Integer value too large %s" % t.value) - t.value = 0 - return t - -t_ignore = " \t" - -def t_newline(t): - r'\n+' - t.lexer.lineno += t.value.count("\n") - -def t_error(t): - print("Illegal character '%s'" % t.value[0]) - t.lexer.skip(1) - -# Build the lexer -lexer = lex.lex(optimize=True) - - - diff --git a/test/pkg_test4/parsing/calcparse.py b/test/pkg_test4/parsing/calcparse.py deleted file mode 100644 index c058e9f..0000000 --- a/test/pkg_test4/parsing/calcparse.py +++ /dev/null @@ -1,66 +0,0 @@ -# ----------------------------------------------------------------------------- -# yacc_simple.py -# -# A simple, properly specifier grammar -# 
----------------------------------------------------------------------------- - -from .calclex import tokens -from ply import yacc - -# Parsing rules -precedence = ( - ('left','PLUS','MINUS'), - ('left','TIMES','DIVIDE'), - ('right','UMINUS'), - ) - -# dictionary of names -names = { } - -def p_statement_assign(t): - 'statement : NAME EQUALS expression' - names[t[1]] = t[3] - -def p_statement_expr(t): - 'statement : expression' - t[0] = t[1] - -def p_expression_binop(t): - '''expression : expression PLUS expression - | expression MINUS expression - | expression TIMES expression - | expression DIVIDE expression''' - if t[2] == '+' : t[0] = t[1] + t[3] - elif t[2] == '-': t[0] = t[1] - t[3] - elif t[2] == '*': t[0] = t[1] * t[3] - elif t[2] == '/': t[0] = t[1] / t[3] - -def p_expression_uminus(t): - 'expression : MINUS expression %prec UMINUS' - t[0] = -t[2] - -def p_expression_group(t): - 'expression : LPAREN expression RPAREN' - t[0] = t[2] - -def p_expression_number(t): - 'expression : NUMBER' - t[0] = t[1] - -def p_expression_name(t): - 'expression : NAME' - try: - t[0] = names[t[1]] - except LookupError: - print("Undefined name '%s'" % t[1]) - t[0] = 0 - -def p_error(t): - print("Syntax error at '%s'" % t.value) - -parser = yacc.yacc() - - - - - diff --git a/test/pkg_test5/__init__.py b/test/pkg_test5/__init__.py deleted file mode 100644 index 0e19558..0000000 --- a/test/pkg_test5/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -# Tests proper handling of lextab and parsetab files in package structures - -# Here for testing purposes -import sys -if '..' 
not in sys.path: - sys.path.insert(0, '..') - -from .parsing.calcparse import parser - diff --git a/test/pkg_test5/parsing/__init__.py b/test/pkg_test5/parsing/__init__.py deleted file mode 100644 index e69de29..0000000 --- a/test/pkg_test5/parsing/__init__.py +++ /dev/null diff --git a/test/pkg_test5/parsing/calclex.py b/test/pkg_test5/parsing/calclex.py deleted file mode 100644 index e8759b6..0000000 --- a/test/pkg_test5/parsing/calclex.py +++ /dev/null @@ -1,48 +0,0 @@ -# ----------------------------------------------------------------------------- -# calclex.py -# ----------------------------------------------------------------------------- - -import ply.lex as lex - -tokens = ( - 'NAME','NUMBER', - 'PLUS','MINUS','TIMES','DIVIDE','EQUALS', - 'LPAREN','RPAREN', - ) - -# Tokens - -t_PLUS = r'\+' -t_MINUS = r'-' -t_TIMES = r'\*' -t_DIVIDE = r'/' -t_EQUALS = r'=' -t_LPAREN = r'\(' -t_RPAREN = r'\)' -t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' - -def t_NUMBER(t): - r'\d+' - try: - t.value = int(t.value) - except ValueError: - print("Integer value too large %s" % t.value) - t.value = 0 - return t - -t_ignore = " \t" - -def t_newline(t): - r'\n+' - t.lexer.lineno += t.value.count("\n") - -def t_error(t): - print("Illegal character '%s'" % t.value[0]) - t.lexer.skip(1) - -# Build the lexer -import os.path -lexer = lex.lex(optimize=True, outputdir=os.path.dirname(__file__)) - - - diff --git a/test/pkg_test5/parsing/calcparse.py b/test/pkg_test5/parsing/calcparse.py deleted file mode 100644 index 2a1ddfe..0000000 --- a/test/pkg_test5/parsing/calcparse.py +++ /dev/null @@ -1,67 +0,0 @@ -# ----------------------------------------------------------------------------- -# yacc_simple.py -# -# A simple, properly specifier grammar -# ----------------------------------------------------------------------------- - -from .calclex import tokens -from ply import yacc - -# Parsing rules -precedence = ( - ('left','PLUS','MINUS'), - ('left','TIMES','DIVIDE'), - ('right','UMINUS'), - ) - -# 
dictionary of names -names = { } - -def p_statement_assign(t): - 'statement : NAME EQUALS expression' - names[t[1]] = t[3] - -def p_statement_expr(t): - 'statement : expression' - t[0] = t[1] - -def p_expression_binop(t): - '''expression : expression PLUS expression - | expression MINUS expression - | expression TIMES expression - | expression DIVIDE expression''' - if t[2] == '+' : t[0] = t[1] + t[3] - elif t[2] == '-': t[0] = t[1] - t[3] - elif t[2] == '*': t[0] = t[1] * t[3] - elif t[2] == '/': t[0] = t[1] / t[3] - -def p_expression_uminus(t): - 'expression : MINUS expression %prec UMINUS' - t[0] = -t[2] - -def p_expression_group(t): - 'expression : LPAREN expression RPAREN' - t[0] = t[2] - -def p_expression_number(t): - 'expression : NUMBER' - t[0] = t[1] - -def p_expression_name(t): - 'expression : NAME' - try: - t[0] = names[t[1]] - except LookupError: - print("Undefined name '%s'" % t[1]) - t[0] = 0 - -def p_error(t): - print("Syntax error at '%s'" % t.value) - -import os.path -parser = yacc.yacc(outputdir=os.path.dirname(__file__)) - - - - - diff --git a/test/pkg_test6/__init__.py b/test/pkg_test6/__init__.py deleted file mode 100644 index 5dbe0cb..0000000 --- a/test/pkg_test6/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -# Tests proper sorting of modules in yacc.ParserReflect.get_pfunctions - -# Here for testing purposes -import sys -if '..' 
not in sys.path: - sys.path.insert(0, '..') - -from .parsing.calcparse import parser - diff --git a/test/pkg_test6/parsing/__init__.py b/test/pkg_test6/parsing/__init__.py deleted file mode 100644 index e69de29..0000000 --- a/test/pkg_test6/parsing/__init__.py +++ /dev/null diff --git a/test/pkg_test6/parsing/calclex.py b/test/pkg_test6/parsing/calclex.py deleted file mode 100644 index e8759b6..0000000 --- a/test/pkg_test6/parsing/calclex.py +++ /dev/null @@ -1,48 +0,0 @@ -# ----------------------------------------------------------------------------- -# calclex.py -# ----------------------------------------------------------------------------- - -import ply.lex as lex - -tokens = ( - 'NAME','NUMBER', - 'PLUS','MINUS','TIMES','DIVIDE','EQUALS', - 'LPAREN','RPAREN', - ) - -# Tokens - -t_PLUS = r'\+' -t_MINUS = r'-' -t_TIMES = r'\*' -t_DIVIDE = r'/' -t_EQUALS = r'=' -t_LPAREN = r'\(' -t_RPAREN = r'\)' -t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' - -def t_NUMBER(t): - r'\d+' - try: - t.value = int(t.value) - except ValueError: - print("Integer value too large %s" % t.value) - t.value = 0 - return t - -t_ignore = " \t" - -def t_newline(t): - r'\n+' - t.lexer.lineno += t.value.count("\n") - -def t_error(t): - print("Illegal character '%s'" % t.value[0]) - t.lexer.skip(1) - -# Build the lexer -import os.path -lexer = lex.lex(optimize=True, outputdir=os.path.dirname(__file__)) - - - diff --git a/test/pkg_test6/parsing/calcparse.py b/test/pkg_test6/parsing/calcparse.py deleted file mode 100644 index 6defaf9..0000000 --- a/test/pkg_test6/parsing/calcparse.py +++ /dev/null @@ -1,33 +0,0 @@ -# ----------------------------------------------------------------------------- -# yacc_simple.py -# -# A simple, properly specifier grammar -# ----------------------------------------------------------------------------- - -from .calclex import tokens -from ply import yacc - -# Parsing rules -precedence = ( - ('left','PLUS','MINUS'), - ('left','TIMES','DIVIDE'), - ('right','UMINUS'), - ) - -# 
dictionary of names -names = { } - -from .statement import * - -from .expression import * - -def p_error(t): - print("Syntax error at '%s'" % t.value) - -import os.path -parser = yacc.yacc(outputdir=os.path.dirname(__file__)) - - - - - diff --git a/test/pkg_test6/parsing/expression.py b/test/pkg_test6/parsing/expression.py deleted file mode 100644 index 028f662..0000000 --- a/test/pkg_test6/parsing/expression.py +++ /dev/null @@ -1,31 +0,0 @@ -# This file contains definitions of expression grammar - -def p_expression_binop(t): - '''expression : expression PLUS expression - | expression MINUS expression - | expression TIMES expression - | expression DIVIDE expression''' - if t[2] == '+' : t[0] = t[1] + t[3] - elif t[2] == '-': t[0] = t[1] - t[3] - elif t[2] == '*': t[0] = t[1] * t[3] - elif t[2] == '/': t[0] = t[1] / t[3] - -def p_expression_uminus(t): - 'expression : MINUS expression %prec UMINUS' - t[0] = -t[2] - -def p_expression_group(t): - 'expression : LPAREN expression RPAREN' - t[0] = t[2] - -def p_expression_number(t): - 'expression : NUMBER' - t[0] = t[1] - -def p_expression_name(t): - 'expression : NAME' - try: - t[0] = names[t[1]] - except LookupError: - print("Undefined name '%s'" % t[1]) - t[0] = 0 diff --git a/test/pkg_test6/parsing/statement.py b/test/pkg_test6/parsing/statement.py deleted file mode 100644 index ef7dc55..0000000 --- a/test/pkg_test6/parsing/statement.py +++ /dev/null @@ -1,9 +0,0 @@ -# This file contains definitions of statement grammar - -def p_statement_assign(t): - 'statement : NAME EQUALS expression' - names[t[1]] = t[3] - -def p_statement_expr(t): - 'statement : expression' - t[0] = t[1] diff --git a/test/testlex.py b/test/testlex.py index a94ed64..318b47a 100755 --- a/test/testlex.py +++ b/test/testlex.py @@ -239,7 +239,7 @@ class LexErrorWarningTests(unittest.TestCase): self.assertRaises(SyntaxError,run_import,"lex_state4") result = sys.stderr.getvalue() self.assert_(check_expected(result, - "State type for state comment must 
be 'inclusive' or 'exclusive'\n")) + "State type for state 'comment' must be 'inclusive' or 'exclusive'\n")) def test_lex_state5(self): @@ -294,13 +294,6 @@ class LexErrorWarningTests(unittest.TestCase): "Bad token name '-'\n")) - def test_lex_token5(self): - try: - run_import("lex_token5") - except ply.lex.LexError: - e = sys.exc_info()[1] - self.assert_(check_expected(str(e),"lex_token5.py:19: Rule 't_NUMBER' returned an unknown token type 'NUM'")) - def test_lex_token_dup(self): run_import("lex_token_dup") result = sys.stderr.getvalue() @@ -361,249 +354,7 @@ class LexBuildOptionTests(unittest.TestCase): "(PLUS,'+',1,1)\n" "(NUMBER,4,1,2)\n")) - def test_lex_optimize(self): - try: - os.remove("lextab.py") - except OSError: - pass - try: - os.remove("lextab.pyc") - except OSError: - pass - try: - os.remove("lextab.pyo") - except OSError: - pass - run_import("lex_optimize") - - result = sys.stdout.getvalue() - self.assert_(check_expected(result, - "(NUMBER,3,1,0)\n" - "(PLUS,'+',1,1)\n" - "(NUMBER,4,1,2)\n")) - self.assert_(os.path.exists("lextab.py")) - - p = subprocess.Popen([sys.executable,'-O','lex_optimize.py'], - stdout=subprocess.PIPE) - result = p.stdout.read() - - self.assert_(check_expected(result, - "(NUMBER,3,1,0)\n" - "(PLUS,'+',1,1)\n" - "(NUMBER,4,1,2)\n")) - if test_pyo: - self.assert_(pymodule_out_exists("lextab.pyo", 1)) - pymodule_out_remove("lextab.pyo", 1) - - p = subprocess.Popen([sys.executable,'-OO','lex_optimize.py'], - stdout=subprocess.PIPE) - result = p.stdout.read() - self.assert_(check_expected(result, - "(NUMBER,3,1,0)\n" - "(PLUS,'+',1,1)\n" - "(NUMBER,4,1,2)\n")) - - if test_pyo: - self.assert_(pymodule_out_exists("lextab.pyo", 2)) - try: - os.remove("lextab.py") - except OSError: - pass - try: - pymodule_out_remove("lextab.pyc") - except OSError: - pass - try: - pymodule_out_remove("lextab.pyo", 2) - except OSError: - pass - - def test_lex_optimize2(self): - try: - os.remove("opt2tab.py") - except OSError: - pass - try: - 
os.remove("opt2tab.pyc") - except OSError: - pass - try: - os.remove("opt2tab.pyo") - except OSError: - pass - run_import("lex_optimize2") - result = sys.stdout.getvalue() - self.assert_(check_expected(result, - "(NUMBER,3,1,0)\n" - "(PLUS,'+',1,1)\n" - "(NUMBER,4,1,2)\n")) - self.assert_(os.path.exists("opt2tab.py")) - - p = subprocess.Popen([sys.executable,'-O','lex_optimize2.py'], - stdout=subprocess.PIPE) - result = p.stdout.read() - self.assert_(check_expected(result, - "(NUMBER,3,1,0)\n" - "(PLUS,'+',1,1)\n" - "(NUMBER,4,1,2)\n")) - if test_pyo: - self.assert_(pymodule_out_exists("opt2tab.pyo", 1)) - pymodule_out_remove("opt2tab.pyo", 1) - p = subprocess.Popen([sys.executable,'-OO','lex_optimize2.py'], - stdout=subprocess.PIPE) - result = p.stdout.read() - self.assert_(check_expected(result, - "(NUMBER,3,1,0)\n" - "(PLUS,'+',1,1)\n" - "(NUMBER,4,1,2)\n")) - if test_pyo: - self.assert_(pymodule_out_exists("opt2tab.pyo", 2)) - try: - os.remove("opt2tab.py") - except OSError: - pass - try: - pymodule_out_remove("opt2tab.pyc") - except OSError: - pass - try: - pymodule_out_remove("opt2tab.pyo", 2) - except OSError: - pass - - def test_lex_optimize3(self): - try: - shutil.rmtree("lexdir") - except OSError: - pass - - os.mkdir("lexdir") - os.mkdir("lexdir/sub") - with open("lexdir/__init__.py","w") as f: - f.write("") - with open("lexdir/sub/__init__.py","w") as f: - f.write("") - run_import("lex_optimize3") - result = sys.stdout.getvalue() - self.assert_(check_expected(result, - "(NUMBER,3,1,0)\n" - "(PLUS,'+',1,1)\n" - "(NUMBER,4,1,2)\n")) - self.assert_(os.path.exists("lexdir/sub/calctab.py")) - - p = subprocess.Popen([sys.executable,'-O','lex_optimize3.py'], - stdout=subprocess.PIPE) - result = p.stdout.read() - self.assert_(check_expected(result, - "(NUMBER,3,1,0)\n" - "(PLUS,'+',1,1)\n" - "(NUMBER,4,1,2)\n")) - if test_pyo: - self.assert_(pymodule_out_exists("lexdir/sub/calctab.pyo", 1)) - pymodule_out_remove("lexdir/sub/calctab.pyo", 1) - - p = 
subprocess.Popen([sys.executable,'-OO','lex_optimize3.py'], - stdout=subprocess.PIPE) - result = p.stdout.read() - self.assert_(check_expected(result, - "(NUMBER,3,1,0)\n" - "(PLUS,'+',1,1)\n" - "(NUMBER,4,1,2)\n")) - if test_pyo: - self.assert_(pymodule_out_exists("lexdir/sub/calctab.pyo", 2)) - try: - shutil.rmtree("lexdir") - except OSError: - pass - - def test_lex_optimize4(self): - - # Regression test to make sure that reflags works correctly - # on Python 3. - - for extension in ['py', 'pyc']: - try: - os.remove("opt4tab.{0}".format(extension)) - except OSError: - pass - - run_import("lex_optimize4") - run_import("lex_optimize4") - - for extension in ['py', 'pyc']: - try: - os.remove("opt4tab.{0}".format(extension)) - except OSError: - pass - - def test_lex_opt_alias(self): - try: - os.remove("aliastab.py") - except OSError: - pass - try: - os.remove("aliastab.pyc") - except OSError: - pass - try: - os.remove("aliastab.pyo") - except OSError: - pass - run_import("lex_opt_alias") - result = sys.stdout.getvalue() - self.assert_(check_expected(result, - "(NUMBER,3,1,0)\n" - "(+,'+',1,1)\n" - "(NUMBER,4,1,2)\n")) - self.assert_(os.path.exists("aliastab.py")) - - p = subprocess.Popen([sys.executable,'-O','lex_opt_alias.py'], - stdout=subprocess.PIPE) - result = p.stdout.read() - self.assert_(check_expected(result, - "(NUMBER,3,1,0)\n" - "(+,'+',1,1)\n" - "(NUMBER,4,1,2)\n")) - if test_pyo: - self.assert_(pymodule_out_exists("aliastab.pyo", 1)) - pymodule_out_remove("aliastab.pyo", 1) - - p = subprocess.Popen([sys.executable,'-OO','lex_opt_alias.py'], - stdout=subprocess.PIPE) - result = p.stdout.read() - self.assert_(check_expected(result, - "(NUMBER,3,1,0)\n" - "(+,'+',1,1)\n" - "(NUMBER,4,1,2)\n")) - - if test_pyo: - self.assert_(pymodule_out_exists("aliastab.pyo", 2)) - try: - os.remove("aliastab.py") - except OSError: - pass - try: - pymodule_out_remove("aliastab.pyc") - except OSError: - pass - try: - pymodule_out_remove("aliastab.pyo", 2) - except OSError: - 
pass - def test_lex_many_tokens(self): - try: - os.remove("manytab.py") - except OSError: - pass - try: - os.remove("manytab.pyc") - except OSError: - pass - try: - os.remove("manytab.pyo") - except OSError: - pass run_import("lex_many_tokens") result = sys.stdout.getvalue() self.assert_(check_expected(result, @@ -615,37 +366,6 @@ class LexBuildOptionTests(unittest.TestCase): "(TOK561,'TOK561:',1,39)\n" "(TOK999,'TOK999:',1,47)\n" )) - - self.assert_(os.path.exists("manytab.py")) - - if implementation() == 'CPython': - p = subprocess.Popen([sys.executable,'-O','lex_many_tokens.py'], - stdout=subprocess.PIPE) - result = p.stdout.read() - self.assert_(check_expected(result, - "(TOK34,'TOK34:',1,0)\n" - "(TOK143,'TOK143:',1,7)\n" - "(TOK269,'TOK269:',1,15)\n" - "(TOK372,'TOK372:',1,23)\n" - "(TOK452,'TOK452:',1,31)\n" - "(TOK561,'TOK561:',1,39)\n" - "(TOK999,'TOK999:',1,47)\n" - )) - - self.assert_(pymodule_out_exists("manytab.pyo", 1)) - pymodule_out_remove("manytab.pyo", 1) - try: - os.remove("manytab.py") - except OSError: - pass - try: - os.remove("manytab.pyc") - except OSError: - pass - try: - os.remove("manytab.pyo") - except OSError: - pass # Tests related to run-time behavior of lexers class LexRunTests(unittest.TestCase): diff --git a/test/testyacc.py b/test/testyacc.py index 7e69f09..96d4b0d 100644 --- a/test/testyacc.py +++ b/test/testyacc.py @@ -401,52 +401,4 @@ class YaccErrorWarningTests(unittest.TestCase): "Precedence rule 'left' defined for unknown symbol '/'\n" )) - def test_pkg_test1(self): - from pkg_test1 import parser - self.assertTrue(os.path.exists('pkg_test1/parsing/parsetab.py')) - self.assertTrue(os.path.exists('pkg_test1/parsing/lextab.py')) - self.assertTrue(os.path.exists('pkg_test1/parsing/parser.out')) - r = parser.parse('3+4+5') - self.assertEqual(r, 12) - - def test_pkg_test2(self): - from pkg_test2 import parser - self.assertTrue(os.path.exists('pkg_test2/parsing/calcparsetab.py')) - 
self.assertTrue(os.path.exists('pkg_test2/parsing/calclextab.py')) - self.assertTrue(os.path.exists('pkg_test2/parsing/parser.out')) - r = parser.parse('3+4+5') - self.assertEqual(r, 12) - - def test_pkg_test3(self): - from pkg_test3 import parser - self.assertTrue(os.path.exists('pkg_test3/generated/parsetab.py')) - self.assertTrue(os.path.exists('pkg_test3/generated/lextab.py')) - self.assertTrue(os.path.exists('pkg_test3/generated/parser.out')) - r = parser.parse('3+4+5') - self.assertEqual(r, 12) - - def test_pkg_test4(self): - from pkg_test4 import parser - self.assertFalse(os.path.exists('pkg_test4/parsing/parsetab.py')) - self.assertFalse(os.path.exists('pkg_test4/parsing/lextab.py')) - self.assertFalse(os.path.exists('pkg_test4/parsing/parser.out')) - r = parser.parse('3+4+5') - self.assertEqual(r, 12) - - def test_pkg_test5(self): - from pkg_test5 import parser - self.assertTrue(os.path.exists('pkg_test5/parsing/parsetab.py')) - self.assertTrue(os.path.exists('pkg_test5/parsing/lextab.py')) - self.assertTrue(os.path.exists('pkg_test5/parsing/parser.out')) - r = parser.parse('3+4+5') - self.assertEqual(r, 12) - - def test_pkg_test6(self): - from pkg_test6 import parser - self.assertTrue(os.path.exists('pkg_test6/parsing/parsetab.py')) - self.assertTrue(os.path.exists('pkg_test6/parsing/lextab.py')) - self.assertTrue(os.path.exists('pkg_test6/parsing/parser.out')) - r = parser.parse('3+4+5') - self.assertEqual(r, 12) - unittest.main() diff --git a/test/yacc_error7.py b/test/yacc_error7.py index fb131be..abdc834 100644 --- a/test/yacc_error7.py +++ b/test/yacc_error7.py @@ -56,11 +56,11 @@ def p_error(p): print("Line %d: Syntax error at '%s'" % (p.lineno, p.value)) # Scan ahead looking for a name token while True: - tok = yacc.token() + tok = parser.token() if not tok or tok.type == 'RPAREN': break if tok: - yacc.restart() + parser.restart() return None parser = yacc.yacc() diff --git a/test/yacc_nested.py b/test/yacc_nested.py index a1b061e..a3543a9 100644 
--- a/test/yacc_nested.py +++ b/test/yacc_nested.py @@ -26,7 +26,7 @@ def p_nest(t): '''nest : B''' print(t[-1]) -the_parser = yacc.yacc(debug = False, write_tables = False) +the_parser = yacc.yacc(debug = False) the_parser.parse('ABC', the_lexer) the_parser.parse('ABC', the_lexer, tracking=True) |