-rw-r--r-- | CHANGES | 38
-rw-r--r-- | CONTRIBUTING.md | 7
-rw-r--r-- | MANIFEST.in | 4
-rw-r--r-- | Makefile | 30
-rw-r--r-- | README.md | 450
-rw-r--r-- | doc/internals.md | 598
-rw-r--r-- | doc/ply.md | 2587
-rw-r--r-- | docs/Makefile | 192
-rw-r--r-- | docs/conf.py | 284
-rw-r--r-- | docs/index.rst | 58
-rw-r--r-- | docs/internals.rst | 530
-rw-r--r-- | docs/make.bat | 263
-rw-r--r-- | docs/ply.rst | 2675
-rw-r--r-- | example/BASIC/basic.py | 3
-rw-r--r-- | example/BASIC/basiclog.py | 2
-rw-r--r-- | example/ansic/clex.py | 3
-rw-r--r-- | example/ansic/cparse.py | 1
-rw-r--r-- | example/calc/calc.py | 3
-rw-r--r-- | example/calcdebug/calc.py | 3
-rw-r--r-- | example/calceof/calc.py | 3
-rwxr-xr-x | example/classcalc/calc.py | 3
-rw-r--r-- | example/closurecalc/calc.py | 3
-rw-r--r-- | example/yply/ylex.py | 3
-rwxr-xr-x | example/yply/yply.py | 1
-rw-r--r-- | install.py | 28
-rw-r--r-- | ply/__init__.py | 6
-rw-r--r-- | pyproject.toml | 3
-rw-r--r-- | setup.cfg | 18
-rw-r--r-- | setup.md | 36
-rw-r--r-- | src/ply/__init__.py | 5
-rw-r--r-- | src/ply/lex.py (renamed from ply/lex.py) | 2
-rw-r--r-- | src/ply/yacc.py (renamed from ply/yacc.py) | 2
-rw-r--r-- | test/testcpp.py | 153
-rw-r--r-- | tests/README (renamed from test/README) | 0
-rw-r--r-- | tests/calclex.py (renamed from test/calclex.py) | 3
-rwxr-xr-x | tests/cleanup.sh (renamed from test/cleanup.sh) | 0
-rw-r--r-- | tests/lex_closure.py (renamed from test/lex_closure.py) | 3
-rw-r--r-- | tests/lex_doc1.py (renamed from test/lex_doc1.py) | 3
-rw-r--r-- | tests/lex_dup1.py (renamed from test/lex_dup1.py) | 3
-rw-r--r-- | tests/lex_dup2.py (renamed from test/lex_dup2.py) | 3
-rw-r--r-- | tests/lex_dup3.py (renamed from test/lex_dup3.py) | 3
-rw-r--r-- | tests/lex_empty.py (renamed from test/lex_empty.py) | 3
-rw-r--r-- | tests/lex_error1.py (renamed from test/lex_error1.py) | 3
-rw-r--r-- | tests/lex_error2.py (renamed from test/lex_error2.py) | 3
-rw-r--r-- | tests/lex_error3.py (renamed from test/lex_error3.py) | 3
-rw-r--r-- | tests/lex_error4.py (renamed from test/lex_error4.py) | 3
-rw-r--r-- | tests/lex_hedit.py (renamed from test/lex_hedit.py) | 2
-rw-r--r-- | tests/lex_ignore.py (renamed from test/lex_ignore.py) | 3
-rw-r--r-- | tests/lex_ignore2.py (renamed from test/lex_ignore2.py) | 3
-rw-r--r-- | tests/lex_literal1.py (renamed from test/lex_literal1.py) | 3
-rw-r--r-- | tests/lex_literal2.py (renamed from test/lex_literal2.py) | 3
-rw-r--r-- | tests/lex_literal3.py (renamed from test/lex_literal3.py) | 3
-rw-r--r-- | tests/lex_many_tokens.py (renamed from test/lex_many_tokens.py) | 2
-rw-r--r-- | tests/lex_module.py (renamed from test/lex_module.py) | 3
-rw-r--r-- | tests/lex_module_import.py (renamed from test/lex_module_import.py) | 0
-rw-r--r-- | tests/lex_object.py (renamed from test/lex_object.py) | 2
-rw-r--r-- | tests/lex_re1.py (renamed from test/lex_re1.py) | 3
-rw-r--r-- | tests/lex_re2.py (renamed from test/lex_re2.py) | 3
-rw-r--r-- | tests/lex_re3.py (renamed from test/lex_re3.py) | 3
-rw-r--r-- | tests/lex_rule1.py (renamed from test/lex_rule1.py) | 3
-rw-r--r-- | tests/lex_rule2.py (renamed from test/lex_rule2.py) | 3
-rw-r--r-- | tests/lex_rule3.py (renamed from test/lex_rule3.py) | 3
-rw-r--r-- | tests/lex_state1.py (renamed from test/lex_state1.py) | 3
-rw-r--r-- | tests/lex_state2.py (renamed from test/lex_state2.py) | 3
-rw-r--r-- | tests/lex_state3.py (renamed from test/lex_state3.py) | 3
-rw-r--r-- | tests/lex_state4.py (renamed from test/lex_state4.py) | 3
-rw-r--r-- | tests/lex_state5.py (renamed from test/lex_state5.py) | 3
-rw-r--r-- | tests/lex_state_noerror.py (renamed from test/lex_state_noerror.py) | 3
-rw-r--r-- | tests/lex_state_norule.py (renamed from test/lex_state_norule.py) | 3
-rw-r--r-- | tests/lex_state_try.py (renamed from test/lex_state_try.py) | 3
-rw-r--r-- | tests/lex_token1.py (renamed from test/lex_token1.py) | 3
-rw-r--r-- | tests/lex_token2.py (renamed from test/lex_token2.py) | 3
-rw-r--r-- | tests/lex_token3.py (renamed from test/lex_token3.py) | 3
-rw-r--r-- | tests/lex_token4.py (renamed from test/lex_token4.py) | 3
-rw-r--r-- | tests/lex_token_dup.py (renamed from test/lex_token_dup.py) | 3
-rw-r--r-- | tests/test_cpp_nonascii.c (renamed from test/test_cpp_nonascii.c) | 0
-rwxr-xr-x | tests/testlex.py (renamed from test/testlex.py) | 95
-rw-r--r-- | tests/testyacc.py (renamed from test/testyacc.py) | 113
-rw-r--r-- | tests/yacc_badargs.py (renamed from test/yacc_badargs.py) | 2
-rw-r--r-- | tests/yacc_badid.py (renamed from test/yacc_badid.py) | 3
-rw-r--r-- | tests/yacc_badprec.py (renamed from test/yacc_badprec.py) | 3
-rw-r--r-- | tests/yacc_badprec2.py (renamed from test/yacc_badprec2.py) | 3
-rw-r--r-- | tests/yacc_badprec3.py (renamed from test/yacc_badprec3.py) | 3
-rw-r--r-- | tests/yacc_badrule.py (renamed from test/yacc_badrule.py) | 3
-rw-r--r-- | tests/yacc_badtok.py (renamed from test/yacc_badtok.py) | 3
-rw-r--r-- | tests/yacc_dup.py (renamed from test/yacc_dup.py) | 3
-rw-r--r-- | tests/yacc_error1.py (renamed from test/yacc_error1.py) | 3
-rw-r--r-- | tests/yacc_error2.py (renamed from test/yacc_error2.py) | 3
-rw-r--r-- | tests/yacc_error3.py (renamed from test/yacc_error3.py) | 3
-rw-r--r-- | tests/yacc_error4.py (renamed from test/yacc_error4.py) | 3
-rw-r--r-- | tests/yacc_error5.py (renamed from test/yacc_error5.py) | 3
-rw-r--r-- | tests/yacc_error6.py (renamed from test/yacc_error6.py) | 3
-rw-r--r-- | tests/yacc_error7.py (renamed from test/yacc_error7.py) | 3
-rw-r--r-- | tests/yacc_inf.py (renamed from test/yacc_inf.py) | 3
-rw-r--r-- | tests/yacc_literal.py (renamed from test/yacc_literal.py) | 3
-rw-r--r-- | tests/yacc_misplaced.py (renamed from test/yacc_misplaced.py) | 3
-rw-r--r-- | tests/yacc_missing1.py (renamed from test/yacc_missing1.py) | 3
-rw-r--r-- | tests/yacc_nested.py (renamed from test/yacc_nested.py) | 3
-rw-r--r-- | tests/yacc_nodoc.py (renamed from test/yacc_nodoc.py) | 3
-rw-r--r-- | tests/yacc_noerror.py (renamed from test/yacc_noerror.py) | 3
-rw-r--r-- | tests/yacc_nop.py (renamed from test/yacc_nop.py) | 3
-rw-r--r-- | tests/yacc_notfunc.py (renamed from test/yacc_notfunc.py) | 3
-rw-r--r-- | tests/yacc_notok.py (renamed from test/yacc_notok.py) | 4
-rw-r--r-- | tests/yacc_prec1.py (renamed from test/yacc_prec1.py) | 3
-rw-r--r-- | tests/yacc_rr.py (renamed from test/yacc_rr.py) | 3
-rw-r--r-- | tests/yacc_rr_unused.py (renamed from test/yacc_rr_unused.py) | 3
-rw-r--r-- | tests/yacc_simple.py (renamed from test/yacc_simple.py) | 3
-rw-r--r-- | tests/yacc_sr.py (renamed from test/yacc_sr.py) | 3
-rw-r--r-- | tests/yacc_term1.py (renamed from test/yacc_term1.py) | 3
-rw-r--r-- | tests/yacc_unicode_literals.py (renamed from test/yacc_unicode_literals.py) | 3
-rw-r--r-- | tests/yacc_unused.py (renamed from test/yacc_unused.py) | 3
-rw-r--r-- | tests/yacc_unused_rule.py (renamed from test/yacc_unused_rule.py) | 3
-rw-r--r-- | tests/yacc_uprec.py (renamed from test/yacc_uprec.py) | 3
-rw-r--r-- | tests/yacc_uprec2.py (renamed from test/yacc_uprec2.py) | 3
114 files changed, 3584 insertions, 4843 deletions
diff --git a/CHANGES b/CHANGES
@@ -1,5 +1,3 @@
-Current Version
----------------
 IMPORTANT NOTE (2018-12-22): PLY is no longer released in any
 package-installable format. If you want to use the latest version,
 you need to COPY the contents of the ply/ directory into your own project
@@ -8,26 +6,32 @@
 maintained as a mature library. No new major features are planned,
 but issues reported for bugs are still welcome. Any changes to the
 software will be noted here.
 
-Version 4.0 (In progress)
--------------------------
-Note: The 4.0 series of PLY represents a massive cleanup and modernization
-effort. At a fundamental level, no new "features" are being added.
-Instead, a lot of outdated, inconsistent, and problematic features are
-being eliminated. Here is a short summary:
+Version 2022.10.27
+------------------
+10/27/22 Reorganization/modernization of the build process. PLY continues
+         to make no package-installable releases. However, you can now
+         use the Makefile to build artifacts installable via pip.
+         Use `make test` to test and `make build` to build.
+
+Version 2022_01_02
+------------------
+12/12/21 PLY is no longer being developed in public. Instead
+         periodic versions will be posted. In general, there has
+         been an effort at cleanup and modernization.
+         Here is a short summary of major changes:
 
-    - PLY no longer writes table files or cached data. If you want this,
-      it's your responsibility to serialize the parser tables. Use pickle.
+  - PLY no longer writes table files or cached data. If you want this,
+    it's your responsibility to serialize the parser tables. Use pickle.
 
-    - Elimination of side-effects and global variables (generally).
+  - Elimination of side-effects and global variables (generally).
 
-    - Elimination of numerous optional features in an effort to
-      simplify the API.
+  - Elimination of numerous optional features in an effort to
+    simplify the API.
 
-    - More use of modern Python features including iterators/generators,
-      keyword-only arguments, f-strings, etc.
+  - More use of modern Python features including iterators/generators,
+    keyword-only arguments, f-strings, etc.
 
-    - Dropped support for Python 2.x
--------------------------
+  - Dropped support for Python 2.x
 
 01/26/20 PLY no longer writes cached table files. Honestly, the use
          of the cached files made more sense when I was developing PLY on
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
deleted file mode 100644
index 3e1cad6..0000000
--- a/CONTRIBUTING.md
+++ /dev/null
@@ -1,7 +0,0 @@
-Contributing to PLY
-===================
-
-PLY is a mature project. However, if you feel that you have found a
-bug in PLY or its documentation, please submit an issue in the form
-of a bug report at https://github.com/dabeaz/ply. Pull requests
-to the project are not accepted.
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..cfcf771
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,4 @@
+include Makefile CONTRIBUTING.md
+recursive-include example *
+recursive-include tests *
+recursive-include docs *
diff --git a/Makefile b/Makefile
@@ -1,10 +1,26 @@
-PYTHON ?= python
+PYTHON=python3
+VENV=.venv
 
-install:
-        python install.py
+# Setup and install all of the required tools for building, testing,
+# and deploying
+setup::
+        rm -rf $(VENV)
+        $(PYTHON) -m venv $(VENV)
+        ./$(VENV)/bin/python -m pip install pytest
+        ./$(VENV)/bin/python -m pip install pytest-cov
+        ./$(VENV)/bin/python -m pip install build
+        ./$(VENV)/bin/python -m pip install twine
 
-test:
-        cd test && $(PYTHON) testlex.py
-        cd test && $(PYTHON) testyacc.py
+# Run unit tests
+test::
+        ./$(VENV)/bin/python -m pip install .
+        ./$(VENV)/bin/python tests/testlex.py
+        ./$(VENV)/bin/python tests/testyacc.py
 
-.PHONY: install test
+# Build an artifact suitable for installing with pip
+build::
+        ./$(VENV)/bin/python -m build
+
+# Install into the default Python
+install::
+        $(PYTHON) -m pip install .
diff --git a/README.md b/README.md
@@ -1,282 +1,270 @@
-# PLY (Python Lex-Yacc)
-
-Copyright (C) 2001-2020
-David M. Beazley (Dabeaz LLC)
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
-* Redistributions of source code must retain the above copyright notice,
-  this list of conditions and the following disclaimer.
-* Redistributions in binary form must reproduce the above copyright notice,
-  this list of conditions and the following disclaimer in the documentation
-  and/or other materials provided with the distribution.
-* Neither the name of the David Beazley or Dabeaz LLC may be used to
-  endorse or promote products derived from this software without
-  specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-Introduction
-============
-
-PLY is a 100% Python implementation of the common parsing tools lex
-and yacc. Here are a few highlights:
-
- - PLY is very closely modeled after traditional lex/yacc.
   If you know how to use these tools in C, you will find PLY
-   to be similar.
-
- - PLY provides *very* extensive error reporting and diagnostic
-   information to assist in parser construction. The original
-   implementation was developed for instructional purposes. As
-   a result, the system tries to identify the most common types
-   of errors made by novice users.
-
- - PLY provides full support for empty productions, error recovery,
-   precedence specifiers, and moderately ambiguous grammars.
-
- - Parsing is based on LR-parsing which is fast, memory efficient,
-   better suited to large grammars, and which has a number of nice
-   properties when dealing with syntax errors and other parsing problems.
-
-   Currently, PLY builds its parsing tables using the LALR(1)
-   algorithm used in yacc.
-
- - PLY uses Python introspection features to build lexers and parsers.
-   This greatly simplifies the task of parser construction since it reduces
-   the number of files and eliminates the need to run a separate lex/yacc
-   tool before running your program.
-
- - PLY can be used to build parsers for "real" programming languages.
-   Although it is not ultra-fast due to its Python implementation,
-   PLY can be used to parse grammars consisting of several hundred
-   rules (as might be found for a language like C). The lexer and LR
-   parser are also reasonably efficient when parsing typically
-   sized programs. People have used PLY to build parsers for
-   C, C++, ADA, and other real programming languages.
-
-How to Use
-==========
-
-PLY consists of two files : lex.py and yacc.py. These are contained
-within the `ply` directory which may also be used as a Python package.
-To use PLY, simply copy the `ply` directory to your project and import
-lex and yacc from the associated `ply` package. For example:
+# PLY - Python Lex-Yacc
 
-```python
-from .ply import lex
-from .ply import yacc
-```
-
-Alternatively, you can copy just the files lex.py and yacc.py
-individually and use them as modules however you see fit. For example:
-
-```python
-import lex
-import yacc
-```
-
-If you wish, you can use the install.py script to install PLY into
-virtual environment.
-
-PLY has no third-party dependencies.
-
-The docs/ directory contains complete documentation on how to use
-the system. Documentation available at https://ply.readthedocs.io
-
-The example directory contains several different examples including a
-PLY specification for ANSI C as given in K&R 2nd Ed.
-
-A simple example is found at the end of this document
-
-Requirements
-============
-PLY requires the use of Python 3.6 or greater. However, you should
-use the latest Python release if possible. It should work on just
-about any platform.
+Author: David Beazley (https://www.dabeaz.com)
 
-Note: PLY does not support execution under `python -OO`. It can be
-made to work in that mode, but you'll need to change the programming
-interface with a decorator. See the documentation for details.
+## Introduction
 
-Resources
-=========
+PLY is a zero-dependency Python implementation of the traditional
+parsing tools lex and yacc. It uses the same LALR(1) parsing algorithm
+as yacc and has most of its core features. It is compatible with all
+modern versions of Python.
 
-Official Documentation is available at:
+PLY was originally created in 2001 to support an Introduction to
+Compilers course at the University of Chicago. As such, it has almost
+no features other than the core LALR(1) parsing algorithm. This is by
+design--students should be made to suffer. Well, at least a little
+bit. However, from a more practical point of view, there is a lot of
+flexibility in terms of how you decide to use it. You can use PLY to
+build Abstract Syntax Trees (ASTs), simple one-pass compilers,
+protocol decoders, or even a more advanced parsing framework.
 
-* https://ply.readthedocs.io
+## Important Notice - October 27, 2022
 
-More information about PLY can be obtained on the PLY webpage at:
+The PLY project will make no further package-installable releases.
+If you want the latest version, you'll need to download it here
+or clone the repo.
-* http://www.dabeaz.com/ply
+## Requirements
 
-For a detailed overview of parsing theory, consult the excellent
-book "Compilers : Principles, Techniques, and Tools" by Aho, Sethi, and
-Ullman. The topics found in "Lex & Yacc" by Levine, Mason, and Brown
-may also be useful.
+The current release of PLY requires the use of Python 3.6 or
+greater. If you need to support an older version, download one of the
+historical releases at https://github.com/dabeaz/archive/tree/main/ply.
 
-The GitHub page for PLY can be found at:
+## How to Install and Use
 
-* https://github.com/dabeaz/ply
+Although PLY is open-source, it is not distributed or installed by a
+package manager. There are only two files: `lex.py` and `yacc.py`,
+both of which are contained in a `ply` package directory. To use PLY,
+copy the `ply` directory into your project and import `lex` and `yacc`
+from the associated `ply` subpackage. Alternatively, you can install
+these files into your working Python using `make install`.
 
-Acknowledgments
-===============
-A special thanks is in order for all of the students in CS326 who
-suffered through about 25 different versions of these tools :-).
+```python
+from .ply import lex
+from .ply import yacc
+```
 
-The CHANGES file acknowledges those who have contributed patches.
+PLY has no third-party dependencies and can be freely renamed or moved
+around within your project as you see fit. It rarely changes.
 
-Elias Ioup did the first implementation of LALR(1) parsing in PLY-1.x.
-Andrew Waters and Markus Schoepflin were instrumental in reporting bugs
-and testing a revised LALR(1) implementation for PLY-2.0.
-
-Example
-=======
+## Example
 
-Here is a simple example showing a PLY implementation of a calculator
-with variables.
+The primary use for PLY is writing parsers for programming languages. Here is an
+example that parses simple expressions into an abstract syntax tree (AST):
 
 ```python
 # -----------------------------------------------------------------------------
-# calc.py
+# example.py
+#
+# Example of using PLY to parse the following simple grammar.
+#
+#   expression : term PLUS term
+#              | term MINUS term
+#              | term
+#
+#   term       : factor TIMES factor
+#              | factor DIVIDE factor
+#              | factor
+#
+#   factor     : NUMBER
+#              | NAME
+#              | PLUS factor
+#              | MINUS factor
+#              | LPAREN expression RPAREN
 #
-# A simple calculator with variables.
 # -----------------------------------------------------------------------------
 
-tokens = (
-    'NAME','NUMBER',
-    'PLUS','MINUS','TIMES','DIVIDE','EQUALS',
-    'LPAREN','RPAREN',
-    )
+from ply.lex import lex
+from ply.yacc import yacc
 
-# Tokens
+# --- Tokenizer
 
-t_PLUS    = r'\+'
-t_MINUS   = r'-'
-t_TIMES   = r'\*'
-t_DIVIDE  = r'/'
-t_EQUALS  = r'='
-t_LPAREN  = r'\('
-t_RPAREN  = r'\)'
-t_NAME    = r'[a-zA-Z_][a-zA-Z0-9_]*'
+# All tokens must be named in advance.
+tokens = ( 'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'LPAREN', 'RPAREN',
+           'NAME', 'NUMBER' )
+
+# Ignored characters
+t_ignore = ' \t'
+
+# Token matching rules are written as regexes
+t_PLUS = r'\+'
+t_MINUS = r'-'
+t_TIMES = r'\*'
+t_DIVIDE = r'/'
+t_LPAREN = r'\('
+t_RPAREN = r'\)'
+t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*'
+
+# A function can be used if there is an associated action.
+# Write the matching regex in the docstring.
 def t_NUMBER(t):
     r'\d+'
     t.value = int(t.value)
     return t
 
-# Ignored characters
-t_ignore = " \t"
-
-def t_newline(t):
+# Ignored token with an action associated with it
+def t_ignore_newline(t):
     r'\n+'
-    t.lexer.lineno += t.value.count("\n")
+    t.lexer.lineno += t.value.count('\n')
 
+# Error handler for illegal characters
 def t_error(t):
-    print(f"Illegal character {t.value[0]!r}")
+    print(f'Illegal character {t.value[0]!r}')
     t.lexer.skip(1)
 
-# Build the lexer
-import ply.lex as lex
-lex.lex()
-
-# Precedence rules for the arithmetic operators
-precedence = (
-    ('left','PLUS','MINUS'),
-    ('left','TIMES','DIVIDE'),
-    ('right','UMINUS'),
-    )
-
-# dictionary of names (for storing variables)
-names = { }
-
-def p_statement_assign(p):
-    'statement : NAME EQUALS expression'
-    names[p[1]] = p[3]
-
-def p_statement_expr(p):
-    'statement : expression'
-    print(p[1])
-
-def p_expression_binop(p):
-    '''expression : expression PLUS expression
-                  | expression MINUS expression
-                  | expression TIMES expression
-                  | expression DIVIDE expression'''
-    if p[2] == '+'  : p[0] = p[1] + p[3]
-    elif p[2] == '-': p[0] = p[1] - p[3]
-    elif p[2] == '*': p[0] = p[1] * p[3]
-    elif p[2] == '/': p[0] = p[1] / p[3]
-
-def p_expression_uminus(p):
-    'expression : MINUS expression %prec UMINUS'
-    p[0] = -p[2]
-
-def p_expression_group(p):
-    'expression : LPAREN expression RPAREN'
-    p[0] = p[2]
-
-def p_expression_number(p):
-    'expression : NUMBER'
+# Build the lexer object
+lexer = lex()
+
+# --- Parser
+
+# Write functions for each grammar rule which is
+# specified in the docstring.
+def p_expression(p):
+    '''
+    expression : term PLUS term
+               | term MINUS term
+    '''
+    # p is a sequence that represents rule contents.
+    #
+    # expression : term PLUS term
+    #   p[0]     : p[1] p[2] p[3]
+    #
+    p[0] = ('binop', p[2], p[1], p[3])
+
+def p_expression_term(p):
+    '''
+    expression : term
+    '''
     p[0] = p[1]
 
-def p_expression_name(p):
-    'expression : NAME'
-    try:
-        p[0] = names[p[1]]
-    except LookupError:
-        print(f"Undefined name {p[1]!r}")
-        p[0] = 0
+def p_term(p):
+    '''
+    term : factor TIMES factor
+         | factor DIVIDE factor
+    '''
+    p[0] = ('binop', p[2], p[1], p[3])
+
+def p_term_factor(p):
+    '''
+    term : factor
+    '''
+    p[0] = p[1]
+
+def p_factor_number(p):
+    '''
+    factor : NUMBER
+    '''
+    p[0] = ('number', p[1])
+
+def p_factor_name(p):
+    '''
+    factor : NAME
+    '''
+    p[0] = ('name', p[1])
+
+def p_factor_unary(p):
+    '''
+    factor : PLUS factor
+           | MINUS factor
+    '''
+    p[0] = ('unary', p[1], p[2])
+
+def p_factor_grouped(p):
+    '''
+    factor : LPAREN expression RPAREN
+    '''
+    p[0] = ('grouped', p[2])
 
 def p_error(p):
-    print(f"Syntax error at {p.value!r}")
+    print(f'Syntax error at {p.value!r}')
 
-import ply.yacc as yacc
-yacc.yacc()
+# Build the parser
+parser = yacc()
 
-while True:
-    try:
-        s = input('calc > ')
-    except EOFError:
-        break
-    yacc.parse(s)
+# Parse an expression
+ast = parser.parse('2 * 3 + 4 * (5 - x)')
+print(ast)
 ```
 
-Bug Reports and Patches
-=======================
-My goal with PLY is to simply have a decent lex/yacc implementation
-for Python. As a general rule, I don't spend huge amounts of time
-working on it unless I receive very specific bug reports and/or
-patches to fix problems. At this time, PLY is mature software and new
-features are no longer being added. If you think you have found a
-bug, please visit the PLY Github page at https://github.com/dabeaz/ply
-to report an issue.
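+
+When run, this example parses the input expression and prints the
+resulting AST as nested tuples. The output should look roughly like
+this:
+
+```
+('binop', '+', ('binop', '*', ('number', 2), ('number', 3)),
+ ('binop', '*', ('number', 4), ('grouped', ('binop', '-', ('number', 5), ('name', 'x')))))
+```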
+
+## Documentation
+
+The [doc/](doc/) directory has more extensive documentation on PLY.
+
+## Examples
+
+The [example/](example/) directory has various examples of using PLY.
+
+## Test Suite
+
+PLY is not shipped with a test-suite. However, it is extensively
+tested before releases are made. The file
+[ply-tests.tar.gz](https://github.com/dabeaz/ply/raw/master/ply-tests.tar.gz)
+contains a current version of the tests should you want to run them on
+your own.
+
+## Bug Reports
+
+PLY is mature software and new features are rarely added. If you think
+you have found a bug, please report an issue.
+
+## Questions and Answers
 
-Take a Class!
-=============
+*Q: Why does PLY have such a weird API?*
 
-If you'd like to learn more about compiler principles and have a go at
-implementing a compiler, come take a course.
-https://www.dabeaz.com/compiler.html.
+Aside from simply being over 20 years old and predating many advanced
+Python features, PLY takes inspiration from two primary sources: the
+Unix `yacc` utility and John Aycock's Spark toolkit. `yacc` uses a
+convention of referencing grammar symbols by `$1`, `$2`, and so
+forth. PLY mirrors this using `p[1]`, `p[2]`, etc. Spark was a
+parsing toolkit that utilized introspection and Python docstrings for
+specifying a grammar. PLY borrowed that because it was clever.
+A modern API can be found in the related [SLY](https://github.com/dabeaz/sly)
+project.
+
+*Q: Why isn't PLY distributed as a package?*
+
+PLY is highly specialized software that is really only of interest to
+people creating parsers for programming languages. In such a project,
+I think you should take responsibility for library dependencies. PLY
+is very small and rarely changes--if it works for your project, there
+is no reason to ever update it. At the same time, as its author, I
+reserve the creative right to make occasional updates to the project,
+including those that might introduce breaking changes. The bottom
+line is that I'm not a link in your software supply chain. If you
+use PLY, you should copy it into your project.
+
+*Q: Do you accept pull requests and user contributions?*
+
+No. New features are not being added to PLY at this time. However, I
+am interested in bug reports should you find a problem with PLY. Feel
+free to use the issue-tracker for feature ideas--just because PLY
+is not in active development doesn't mean that good ideas will be
+ignored.
+
+*Q: Can I use PLY to make my own parsing tool?*
+
+Absolutely! It's free software that you are free to copy and modify in
+any way that makes sense. This includes remixing it into something
+that's more powerful. I'd just ask that you keep the original
+copyright notice in files based on the original PLY source code so
+that others know where it came from.
+
+## Acknowledgements
+
+A special thanks is in order for all of the students in CS326 who
+suffered through about 25 different versions of this.
+
+The CHANGES file acknowledges those who have contributed patches.
+
+Elias Ioup did the first implementation of LALR(1) parsing in PLY-1.x.
+Andrew Waters and Markus Schoepflin were instrumental in reporting bugs
+and testing a revised LALR(1) implementation for PLY-2.0.
+
+## Take a Class!
+
+If you'd like to learn more about compiler principles or have a go at
+implementing a compiler from scratch, come take a course.
+https://www.dabeaz.com/compiler.html.
diff --git a/doc/internals.md b/doc/internals.md
new file mode 100644
index 0000000..76864a1
--- /dev/null
+++ b/doc/internals.md
@@ -0,0 +1,598 @@
+# PLY Internals
+
+## 1. Introduction
+
+This document describes classes and functions that make up the internal
+operation of PLY. Using this programming interface, it is possible to
+manually build a parser using a different interface specification than
+what PLY normally uses. For example, you could build a grammar from
+information parsed in a completely different input format. Some of these
+objects may be useful for building more advanced parsing engines such as
+GLR.
+
+It should be stressed that using PLY at this level is not for the faint
+of heart. Generally, it\'s assumed that you know a bit of the underlying
+compiler theory and how an LR parser is put together.
+
+## 2. Grammar Class
+
+The file `ply.yacc` defines a class `Grammar` that is used to hold and
+manipulate information about a grammar specification. It encapsulates
+the same basic information about a grammar that is put into a YACC file
+including the list of tokens, precedence rules, and grammar rules.
+Various operations are provided to perform different validations on the
+grammar. In addition, there are operations to compute the first and
+follow sets that are needed by the various table generation algorithms.
+
+`Grammar(terminals)`
+
+:   Creates a new grammar object. `terminals` is a list of strings
+    specifying the terminals for the grammar. An instance `g` of
+    `Grammar` has the following methods:
+
+`g.set_precedence(term,assoc,level)`
+
+:   Sets the precedence level and associativity for a given terminal
+    `term`. `assoc` is one of `'right'`, `'left'`, or `'nonassoc'` and
+    `level` is a positive integer. The higher the value of `level`, the
+    higher the precedence. Here is an example of typical precedence
+    settings:
+
+        g.set_precedence('PLUS',  'left',1)
+        g.set_precedence('MINUS', 'left',1)
+        g.set_precedence('TIMES', 'left',2)
+        g.set_precedence('DIVIDE','left',2)
+        g.set_precedence('UMINUS','left',3)
+
+    This method must be called prior to adding any productions to the
+    grammar with `g.add_production()`. The precedence of individual
+    grammar rules is determined by the precedence of the right-most
+    terminal.
+
+`g.add_production(name,syms,func=None,file='',line=0)`
+
+:   Adds a new grammar rule. `name` is the name of the rule, `syms` is a
+    list of symbols making up the right hand side of the rule, `func` is
+    the function to call when reducing the rule. `file` and `line`
+    specify the filename and line number of the rule and are used for
+    generating error messages.
+
+    The list of symbols in `syms` may include character literals and
+    `%prec` specifiers. Here are some examples:
+
+        g.add_production('expr',['expr','PLUS','term'],func,file,line)
+        g.add_production('expr',['expr','"+"','term'],func,file,line)
+        g.add_production('expr',['MINUS','expr','%prec','UMINUS'],func,file,line)
+
+    If any kind of error is detected, a `GrammarError` exception is
+    raised with a message indicating the reason for the failure.
+
+`g.set_start(start=None)`
+
+:   Sets the starting rule for the grammar. `start` is a string
+    specifying the name of the start rule. If `start` is omitted, the
+    first grammar rule added with `add_production()` is taken to be the
+    starting rule. This method must always be called after all
+    productions have been added.
+
+`g.find_unreachable()`
+
+:   Diagnostic function. Returns a list of all unreachable non-terminals
+    defined in the grammar. This is used to identify inactive parts of
+    the grammar specification.
+
+`g.infinite_cycle()`
+
+:   Diagnostic function. Returns a list of all non-terminals in the
+    grammar that result in an infinite cycle. This condition occurs if
+    there is no way for a grammar rule to expand to a string containing
+    only terminal symbols.
+
+`g.undefined_symbols()`
+
+:   Diagnostic function. Returns a list of tuples `(name, prod)`
+    corresponding to undefined symbols in the grammar. `name` is the
+    name of the undefined symbol and `prod` is an instance of
+    `Production` which has information about the production rule where
+    the undefined symbol was used.
+
+`g.unused_terminals()`
+
+:   Diagnostic function. Returns a list of terminals that were defined,
+    but never used in the grammar.
+
+`g.unused_rules()`
+
+:   Diagnostic function. Returns a list of `Production` instances
+    corresponding to production rules that were defined in the grammar,
+    but never used anywhere. This is slightly different from
+    `find_unreachable()`.
+
+`g.unused_precedence()`
+
+:   Diagnostic function. Returns a list of tuples `(term, assoc)`
+    corresponding to precedence rules that were set, but never used in
+    the grammar. `term` is the terminal name and `assoc` is the
+    precedence associativity (e.g., `'left'`, `'right'`, or
+    `'nonassoc'`).
+
+`g.compute_first()`
+
+:   Compute all of the first sets for all symbols in the grammar.
+    Returns a dictionary mapping symbol names to a list of all first
+    symbols.
+
+`g.compute_follow()`
+
+:   Compute all of the follow sets for all non-terminals in the grammar.
+    The follow set is the set of all possible symbols that might follow
+    a given non-terminal. Returns a dictionary mapping non-terminal
+    names to a list of symbols.
+
+`g.build_lritems()`
+
+:   Calculates all of the LR items for all productions in the grammar.
+    This step is required before using the grammar for any kind of table
+    generation. See the section on LR items below.
+
+The following attributes are set by the above methods and may be useful
+in code that works with the grammar. All of these attributes should be
+assumed to be read-only. Changing their values directly will likely
+break the grammar.
+
+`g.Productions`
+
+:   A list of all productions added. The first entry is reserved for a
+    production representing the starting rule. The objects in this list
+    are instances of the `Production` class, described shortly.
+
+`g.Prodnames`
+
+:   A dictionary mapping the names of nonterminals to a list of all
+    productions of that nonterminal.
+
+`g.Terminals`
+
+:   A dictionary mapping the names of terminals to a list of the
+    production numbers where they are used.
+
+`g.Nonterminals`
+
+:   A dictionary mapping the names of nonterminals to a list of the
+    production numbers where they are used.
+
+`g.First`
+
+:   A dictionary representing the first sets for all grammar symbols.
+    This is computed and returned by the `compute_first()` method.
+
+`g.Follow`
+
+:   A dictionary representing the follow sets for all grammar rules.
+    This is computed and returned by the `compute_follow()` method.
+
+`g.Start`
+
+:   Starting symbol for the grammar. Set by the `set_start()` method.
+
+For the purposes of debugging, a `Grammar` object supports the
+`__len__()` and `__getitem__()` special methods. Accessing `g[n]`
+returns the nth production from the grammar.
+
+## 3. Productions
+
+`Grammar` objects store grammar rules as instances of a `Production`
+class. This class has no public constructor\--you should only create
+productions by calling `Grammar.add_production()`. The following
+attributes are available on a `Production` instance `p`.
+
+`p.name`
+
+:   The name of the production. For a grammar rule such as `A : B C D`,
+    this is `'A'`.
+
+`p.prod`
+
+:   A tuple of symbols making up the right-hand side of the production.
+    For a grammar rule such as `A : B C D`, this is `('B','C','D')`.
+
+`p.number`
+
+:   Production number. An integer containing the index of the production
+    in the grammar\'s `Productions` list.
+
+`p.func`
+
+:   The name of the reduction function associated with the production.
+    This is the function that will execute when reducing the entire
+    grammar rule during parsing.
+
+`p.callable`
+
+:   The callable object associated with the name in `p.func`. This is
+    `None` unless the production has been bound using `bind()`.
+
+`p.file`
+
+:   Filename associated with the production. Typically this is the file
+    where the production was defined. Used for error messages.
+
+`p.lineno`
+
+:   Line number associated with the production. Typically this is the
+    line number in `p.file` where the production was defined. Used for
+    error messages.
+
+`p.prec`
+
+:   Precedence and associativity associated with the production. This is
+    a tuple `(assoc,level)` where `assoc` is one of `'left'`,`'right'`,
+    or `'nonassoc'` and `level` is an integer. This value is determined
+    by the precedence of the right-most terminal symbol in the
+    production or by use of the `%prec` specifier when adding the
+    production.
+
+`p.usyms`
+
+:   A list of all unique symbols found in the production.
+
+`p.lr_items`
+
+:   A list of all LR items for this production. This attribute only has
+    a meaningful value if the `Grammar.build_lritems()` method has been
+    called. The items in this list are instances of `LRItem` described
+    below.
+
+`p.lr_next`
+
+:   The head of a linked-list representation of the LR items in
+    `p.lr_items`. This attribute only has a meaningful value if the
+    `Grammar.build_lritems()` method has been called. Each `LRItem`
+    instance has a `lr_next` attribute to move to the next item. The
+    list is terminated by `None`.
+
+`p.bind(dict)`
+
+:   Binds the production function name in `p.func` to a callable object
+    in `dict`. This operation is typically carried out in the last step
+    prior to running the parsing engine and is needed since parsing
+    tables are typically read from files which only include the function
+    names, not the functions themselves.
+
+`Production` objects support the `__len__()`, `__getitem__()`, and
+`__str__()` special methods. `len(p)` returns the number of symbols in
+`p.prod` and `p[n]` is the same as `p.prod[n]`.
+
+## 4. LRItems
+
+The construction of parsing tables in an LR-based parser generator is
+primarily done over a set of \"LR Items\". An LR item represents a stage
+of parsing one of the grammar rules. To compute the LR items, it is
+first necessary to call `Grammar.build_lritems()`. Once this step is
+complete, all of the productions in the grammar will have their LR items
+attached to them.
+
+Here is an interactive example that shows what LR items look like. In
+this example, `g` is a `Grammar` object:
+
+    >>> g.build_lritems()
+    >>> p = g[1]
+    >>> p
+    Production(statement -> ID = expr)
+    >>>
+
+In the above code, `p` represents the first grammar rule. In this case,
+a rule `'statement -> ID = expr'`.
+
+Now, let\'s look at the LR items for `p`:
+
+    >>> p.lr_items
+    [LRItem(statement -> . ID = expr),
+     LRItem(statement -> ID . = expr),
+     LRItem(statement -> ID = . expr),
+     LRItem(statement -> ID = expr .)]
+    >>>
+
+In each LR item, the dot (.) represents a specific stage of parsing.
+From one item to the next, the dot advances by one symbol. It is only
+when the dot reaches the very end that a production is successfully
+parsed.
+
+An instance `lr` of `LRItem` has the following attributes that hold
+information related to that specific stage of parsing.
+
+`lr.name`
+
+:   The name of the grammar rule. For example, `'statement'` in the
+    above example.
+
+`lr.prod`
+
+:   A tuple of symbols representing the right-hand side of the
+    production, including the special `'.'` character. For example,
+    `('ID','.','=','expr')`.
+
+`lr.number`
+
+:   An integer representing the production number in the grammar.
+
+`lr.usyms`
+
+:   A set of unique symbols in the production. Inherited from the
+    original `Production` instance.
+
+`lr.lr_index`
+
+:   An integer representing the position of the dot (.). You should
+    never use `lr.prod.index()` to search for it\--the result will be
+    wrong if the grammar happens to also use (.) as a character literal.
+
+`lr.lr_after`
+
+:   A list of all productions that can legally appear immediately to the
+    right of the dot (.). This list contains `Production` instances.
+    This attribute represents all of the possible branches a parse can
+    take from the current position. For example, suppose that `lr`
+    represents a stage immediately before an expression like this:
+
+        >>> lr
+        LRItem(statement -> ID = . expr)
+        >>>
+
+    Then, the value of `lr.lr_after` might look like this, showing all
+    productions that can legally appear next:
+
+        >>> lr.lr_after
+        [Production(expr -> expr PLUS expr),
+         Production(expr -> expr MINUS expr),
+         Production(expr -> expr TIMES expr),
+         Production(expr -> expr DIVIDE expr),
+         Production(expr -> MINUS expr),
+         Production(expr -> LPAREN expr RPAREN),
+         Production(expr -> NUMBER),
+         Production(expr -> ID)]
+        >>>
+
+`lr.lr_before`
+
+:   The grammar symbol that appears immediately before the dot (.) or
+    `None` if at the beginning of the parse.
+
+`lr.lr_next`
+
+:   A link to the next LR item, representing the next stage of the
+    parse. `None` if `lr` is the last LR item.
+
+`LRItem` instances also support the `__len__()` and `__getitem__()`
+special methods. `len(lr)` returns the number of items in `lr.prod`
+including the dot (.). `lr[n]` returns `lr.prod[n]`.
+
+It goes without saying that all of the attributes associated with LR
+items should be assumed to be read-only. Modifications will very likely
+create a small black-hole that will consume you and your code.
+
+## 5. LRTable
+
+The `LRTable` class represents constructed LR parsing tables on a
+grammar.
+
+`LRTable(grammar, log=None)`
+
+:   Create the LR parsing tables on a grammar. `grammar` is an instance
+    of `Grammar` and `log` is a logger object used to write debugging
+    information. The debugging information written to `log` is the same
+    as what appears in the `parser.out` file created by yacc. By
+    supplying a custom logger with a different message format, it is
+    possible to get more information (e.g., the line number in `yacc.py`
+    used for issuing each line of output in the log).
+
+An instance `lr` of `LRTable` has the following attributes.
+
+`lr.grammar`
+
+:   A link to the Grammar object used to construct the parsing tables.
+
+`lr.lr_method`
+
+:   The LR parsing method used (e.g., `'LALR'`)
+
+`lr.lr_productions`
+
+:   A reference to `grammar.Productions`. This, together with
+    `lr_action` and `lr_goto`, contains all of the information needed by
+    the LR parsing engine.
+
+`lr.lr_action`
+
+:   The LR action dictionary that implements the underlying state
+    machine. The keys of this dictionary are the LR states.
+
+`lr.lr_goto`
+
+:   The LR goto table that contains information about grammar rule
+    reductions.
+
+`lr.sr_conflicts`
+
+:   A list of tuples `(state,token,resolution)` identifying all
+    shift/reduce conflicts. `state` is the LR state number where the
+    conflict occurred, `token` is the token causing the conflict, and
+    `resolution` is a string describing the resolution taken.
+    `resolution` is either `'shift'` or `'reduce'`.
+
+`lr.rr_conflicts`
+
+:   A list of tuples `(state,rule,rejected)` identifying all
+    reduce/reduce conflicts. `state` is the LR state number where the
+    conflict occurred, `rule` is the production rule that was selected
+    and `rejected` is the production rule that was rejected. Both `rule`
+    and `rejected` are instances of `Production`. They can be inspected
+    to provide the user with more information.
+
+`lrtab.bind_callables(dict)`
+
+:   This binds all of the function names used in productions to callable
+    objects found in the dictionary `dict`. During table generation and
+    when reading LR tables from files, PLY only uses the names of action
+    functions such as `'p_expr'`, `'p_statement'`, etc. In order to
+    actually run the parser, these names have to be bound to callable
+    objects. This method is always called prior to running a parser.
+
+## 6. LRParser
+
+The `LRParser` class implements the low-level LR parsing engine.
+
+`LRParser(lrtab, error_func)`
+
+:   Create an LRParser. `lrtab` is an instance of `LRTable` containing
+    the LR production and state tables. `error_func` is the error
+    function to invoke in the event of a parsing error.
+
+An instance `p` of `LRParser` has the following methods:
+
+`p.parse(input=None,lexer=None,debug=0,tracking=0)`
+
+:   Run the parser. `input` is a string, which if supplied is fed into
+    the lexer using its `input()` method. `lexer` is an instance of the
+    `Lexer` class to use for tokenizing. If not supplied, the last lexer
+    created with the `lex` module is used. `debug` is a boolean flag
+    that enables debugging. `tracking` is a boolean flag that tells the
+    parser to perform additional line number tracking.
+
+`p.restart()`
+
+:   Resets the parser state for a parse already in progress.
+
+## 7. ParserReflect
+
+The `ParserReflect` class is used to collect parser specification data
+from a Python module or object. This class is what collects all of the
+`p_rule()` functions in a PLY file, performs basic error checking, and
+collects all of the needed information to build a grammar. Most of the
+high-level PLY interface as used by the `yacc()` function is actually
+implemented by this class.
+
+`ParserReflect(pdict, log=None)`
+
+:   Creates a `ParserReflect` instance. `pdict` is a dictionary
+    containing parser specification data. This dictionary typically
+    corresponds to the module or class dictionary of code that
+    implements a PLY parser. `log` is a logger instance that will be
+    used to report error messages.
+
+An instance `p` of `ParserReflect` has the following methods:
+
+`p.get_all()`
+
+:   Collect and store all required parsing information.
+
+`p.validate_all()`
+
+:   Validate all of the collected parsing information. This is a
+    separate step from `p.get_all()` as a performance optimization. To
+    reduce parser start-up time, a parser can elect to only validate
+    the parsing data when regenerating the parsing tables.
+    The validation step tries to collect as much information as possible
+    rather than raising an exception at the first sign of trouble. The
+    attribute `p.error` is set if there are any validation errors. The
+    value of this attribute is also returned.
+
+`p.signature()`
+
+:   Compute a signature representing the contents of the collected
+    parsing data. The signature value should change if anything in the
+    parser specification has changed in a way that would justify parser
+    table regeneration. This method can be called after `p.get_all()`,
+    but before `p.validate_all()`.
+
+The following attributes are set in the process of collecting data:
+
+`p.start`
+
+:   The grammar start symbol, if any. Taken from `pdict['start']`.
+
+`p.error_func`
+
+:   The error handling function or `None`. Taken from
+    `pdict['p_error']`.
+
+`p.tokens`
+
+:   The token list. Taken from `pdict['tokens']`.
+
+`p.prec`
+
+:   The precedence specifier. Taken from `pdict['precedence']`.
+
+`p.preclist`
+
+:   A parsed version of the precedence specifier. A list of tuples of
+    the form `(token,assoc,level)` where `token` is the terminal symbol,
+    `assoc` is the associativity (e.g., `'left'`) and `level` is a
+    numeric precedence level.
+
+`p.grammar`
+
+:   A list of tuples `(name, rules)` representing the grammar rules.
+    `name` is the name of a Python function or method in `pdict` that
+    starts with `"p_"`. `rules` is a list of tuples
+    `(filename,line,prodname,syms)` representing the grammar rules found
+    in the documentation string of that function. `filename` and `line`
+    contain location information that can be used for debugging.
+    `prodname` is the name of the production. `syms` is the right-hand
+    side of the production. If you have a function like this:
+
+        def p_expr(p):
+            '''expr : expr PLUS expr
+                    | expr MINUS expr
+                    | expr TIMES expr
+                    | expr DIVIDE expr'''
+
+    then the corresponding entry in `p.grammar` might look like this:
+
+        ('p_expr', [ ('calc.py',10,'expr', ['expr','PLUS','expr']),
+                     ('calc.py',11,'expr', ['expr','MINUS','expr']),
+                     ('calc.py',12,'expr', ['expr','TIMES','expr']),
+                     ('calc.py',13,'expr', ['expr','DIVIDE','expr'])
+                   ])
+
+`p.pfuncs`
+
+:   A sorted list of tuples `(line, file, name, doc)` representing all
+    of the `p_` functions found. `line` and `file` give location
+    information. `name` is the name of the function. `doc` is the
+    documentation string. This list is sorted in ascending order by line
+    number.
+
+`p.files`
+
+:   A dictionary holding all of the source filenames that were
+    encountered while collecting parser information. Only the keys of
+    this dictionary have any meaning.
+
+`p.error`
+
+:   An attribute that indicates whether or not any critical errors
+    occurred in validation. If this is set, it means that some kind of
+    problem was detected and that no further processing should be
+    performed.
+
+## 8. High-level operation
+
+Using all of the above classes requires some attention to detail. The
+`yacc()` function carries out a very specific sequence of operations to
+create a grammar. This same sequence should be emulated if you build an
+alternative PLY interface.
+
+1\. A `ParserReflect` object is created and raw grammar specification
+data is collected.
+
+2\. A `Grammar` object is created and populated with information from
+the specification data.
+
+3\. A `LRTable` object is created to run the LALR algorithm over the
+`Grammar` object.
+
+4\. Productions in the `LRTable` are bound to callables using the
+`bind_callables()` method.
+
+5\. A `LRParser` object is created from the information in the
+`LRTable` object.
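+
+The following sketch shows how these steps might fit together using
+only the interfaces described in this document. It is illustrative, not
+part of PLY itself; `pdict` is assumed to be the dictionary of a module
+containing a PLY-style parser specification:
+
+    from ply.yacc import ParserReflect, Grammar, LRTable, LRParser
+
+    def build_parser(pdict):
+        # 1. Collect and validate the raw specification data
+        pinfo = ParserReflect(pdict)
+        pinfo.get_all()
+        if pinfo.validate_all():
+            raise RuntimeError('invalid parser specification')
+
+        # 2. Create a Grammar object and populate it. Precedence must
+        #    be set before any productions are added.
+        grammar = Grammar(pinfo.tokens)
+        for term, assoc, level in pinfo.preclist:
+            grammar.set_precedence(term, assoc, level)
+        for funcname, rules in pinfo.grammar:
+            for filename, line, prodname, syms in rules:
+                grammar.add_production(prodname, syms, funcname, filename, line)
+        grammar.set_start(pinfo.start)
+        grammar.build_lritems()
+
+        # 3. Run the LALR algorithm over the grammar
+        lrtab = LRTable(grammar)
+
+        # 4. Bind production function names to callable objects
+        lrtab.bind_callables(pdict)
+
+        # 5. Create the parsing engine
+        return LRParser(lrtab, pinfo.error_func)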
diff --git a/doc/ply.md b/doc/ply.md
new file mode 100644
index 0000000..29a16de
--- /dev/null
+++ b/doc/ply.md
@@ -0,0 +1,2587 @@
+# PLY (Python Lex-Yacc)
+
+This document provides an overview of lexing and parsing with PLY. Given
+the intrinsic complexity of parsing, I strongly advise that you read (or
+at least skim) this entire document before jumping into a big
+development project with PLY.
+
+The current version requires Python 3.6 or newer. If you\'re using an
+older version of Python, use one of the historical releases.
+
+## Introduction
+
+PLY is a pure-Python implementation of the compiler construction tools
+lex and yacc. The main goal of PLY is to stay fairly faithful to the way
+in which traditional lex/yacc tools work. This includes supporting
+LALR(1) parsing as well as providing extensive input validation, error
+reporting, and diagnostics. Thus, if you\'ve used yacc in another
+programming language, it should be relatively straightforward to use
+PLY.
+
+Early versions of PLY were developed to support an Introduction to
+Compilers Course I taught in 2001 at the University of Chicago. Since
+PLY was primarily developed as an instructional tool, you will find it
+to be fairly picky about token and grammar rule specification. In part,
+this added formality is meant to catch common programming mistakes made
+by novice users. However, advanced users will also find such features to
+be useful when building complicated grammars for real programming
+languages. It should also be noted that PLY does not provide much in the
+way of bells and whistles (e.g., automatic construction of abstract
+syntax trees, tree traversal, etc.). Nor would I consider it to be a
+parsing framework. Instead, you will find a bare-bones, yet fully
+capable lex/yacc implementation written entirely in Python.
+
+The rest of this document assumes that you are somewhat familiar with
+parsing theory, syntax directed translation, and the use of compiler
+construction tools such as lex and yacc in other programming languages.
+If you are unfamiliar with these topics, you will probably want to
+consult an introductory text such as \"Compilers: Principles,
+Techniques, and Tools\", by Aho, Sethi, and Ullman. O\'Reilly\'s \"Lex
+and Yacc\" by John Levine may also be handy. In fact, the O\'Reilly book
+can be used as a reference for PLY as the concepts are virtually
+identical.
+
+## PLY Overview
+
+PLY consists of two separate modules: `lex.py` and `yacc.py`, both of
+which are found in a Python package called `ply`. The `lex.py` module is
+used to break input text into a collection of tokens specified by a
+collection of regular expression rules. `yacc.py` is used to recognize
+language syntax that has been specified in the form of a context free
+grammar.
+
+The two tools are meant to work together. Specifically, `lex.py`
+provides an interface to produce tokens. `yacc.py` uses this interface
+to retrieve tokens and invoke grammar rules. The output of `yacc.py` is
+often an Abstract Syntax Tree (AST). However, this is entirely up to the
+user. If desired, `yacc.py` can also be used to implement simple
+one-pass compilers.
+
+Like its Unix counterpart, `yacc.py` provides most of the features you
+expect including extensive error checking, grammar validation, support
+for empty productions, error tokens, and ambiguity resolution via
+precedence rules. In fact, almost everything that is possible in
+traditional yacc should be supported in PLY.
+
+The primary difference between `yacc.py` and Unix `yacc` is that
+`yacc.py` doesn\'t involve a separate code-generation process. Instead,
+PLY relies on reflection (introspection) to build its lexers and
+parsers. Unlike traditional lex/yacc which require a special input file
+that is converted into a separate source file, the specifications given
+to PLY *are* valid Python programs. This means that there are no extra
+source files nor is there a special compiler construction step (e.g.,
+running yacc to generate Python code for the compiler).
+
+## Lex
+
+`lex.py` is used to tokenize an input string. For example, suppose
+you\'re writing a programming language and a user supplied the following
+input string:
+
+    x = 3 + 42 * (s - t)
+
+A tokenizer splits the string into individual tokens:
+
+    'x','=', '3', '+', '42', '*', '(', 's', '-', 't', ')'
+
+Tokens are usually given names to indicate what they are. For example:
+
+    'ID','EQUALS','NUMBER','PLUS','NUMBER','TIMES',
+    'LPAREN','ID','MINUS','ID','RPAREN'
+
+More specifically, the input is broken into pairs of token types and
+values. For example:
+
+    ('ID','x'), ('EQUALS','='), ('NUMBER','3'),
+    ('PLUS','+'), ('NUMBER','42'), ('TIMES','*'),
+    ('LPAREN','('), ('ID','s'), ('MINUS','-'),
+    ('ID','t'), ('RPAREN',')')
+
+The specification of tokens is done by writing a series of regular
+expression rules. The next section shows how this is done using
+`lex.py`.
+
+### Lex Example
+
+The following example shows how `lex.py` is used to write a simple
+tokenizer:
+
+    # ------------------------------------------------------------
+    # calclex.py
+    #
+    # tokenizer for a simple expression evaluator for
+    # numbers and +,-,*,/
+    # ------------------------------------------------------------
+    import ply.lex as lex
+
+    # List of token names.   This is always required
+    tokens = (
+       'NUMBER',
+       'PLUS',
+       'MINUS',
+       'TIMES',
+       'DIVIDE',
+       'LPAREN',
+       'RPAREN',
+    )
+
+    # Regular expression rules for simple tokens
+    t_PLUS    = r'\+'
+    t_MINUS   = r'-'
+    t_TIMES   = r'\*'
+    t_DIVIDE  = r'/'
+    t_LPAREN  = r'\('
+    t_RPAREN  = r'\)'
+
+    # A regular expression rule with some action code
+    def t_NUMBER(t):
+        r'\d+'
+        t.value = int(t.value)
+        return t
+
+    # Define a rule so we can track line numbers
+    def t_newline(t):
+        r'\n+'
+        t.lexer.lineno += len(t.value)
+
+    # A string containing ignored characters (spaces and tabs)
+    t_ignore  = ' \t'
+
+    # Error handling rule
+    def t_error(t):
+        print("Illegal character '%s'" % t.value[0])
+        t.lexer.skip(1)
+
+    # Build the lexer
+    lexer = lex.lex()
+
+To use the lexer, you first need to feed it some input text using its
+`input()` method. After that, repeated calls to `token()` produce
+tokens. The following code shows how this works:
+
+    # Test it out
+    data = '''
+    3 + 4 * 10
+      + -20 *2
+    '''
+
+    # Give the lexer some input
+    lexer.input(data)
+
+    # Tokenize
+    while True:
+        tok = lexer.token()
+        if not tok:
+            break      # No more input
+        print(tok)
+
+When executed, the example will produce the following output:
+
+    $ python example.py
+    LexToken(NUMBER,3,2,1)
+    LexToken(PLUS,'+',2,3)
+    LexToken(NUMBER,4,2,5)
+    LexToken(TIMES,'*',2,7)
+    LexToken(NUMBER,10,2,10)
+    LexToken(PLUS,'+',3,14)
+    LexToken(MINUS,'-',3,16)
+    LexToken(NUMBER,20,3,18)
+    LexToken(TIMES,'*',3,20)
+    LexToken(NUMBER,2,3,21)
+
+Lexers also support the iteration protocol. So, you can write the above
+loop as follows:
+
+    for tok in lexer:
+        print(tok)
+
+The tokens returned by `lexer.token()` are instances of `LexToken`.
+This object has attributes `type`, `value`, `lineno`, and `lexpos`. The
+following code shows an example of accessing these attributes:
+
+    # Tokenize
+    while True:
+        tok = lexer.token()
+        if not tok:
+            break      # No more input
+        print(tok.type, tok.value, tok.lineno, tok.lexpos)
+
+The `type` and `value` attributes contain the type and value of the
+token itself. `lineno` and `lexpos` contain information about the
+location of the token. `lexpos` is the index of the token relative to
+the start of the input text.
+
+### The tokens list
+
+All lexers must provide a list `tokens` that defines all of the possible
+token names that can be produced by the lexer. This list is always
+required and is used to perform a variety of validation checks. The
+tokens list is also used by the `yacc.py` module to identify terminals.
+
+In the example, the following code specifies the token names:
+
+    tokens = (
+       'NUMBER',
+       'PLUS',
+       'MINUS',
+       'TIMES',
+       'DIVIDE',
+       'LPAREN',
+       'RPAREN',
+    )
+
+### Specification of tokens
+
+Each token is specified by writing a regular expression rule compatible
+with Python\'s `re` module. Each of these rules is defined by making
+declarations with a special prefix `t_` to indicate that it defines a
+token. For simple tokens, the regular expression can be specified as
+strings such as this (note: Python raw strings are used since they are
+the most convenient way to write regular expression strings):
+
+    t_PLUS = r'\+'
+
+In this case, the name following the `t_` must exactly match one of the
+names supplied in `tokens`. If some kind of action needs to be
+performed, a token rule can be specified as a function. For example,
+this rule matches numbers and converts the string into a Python integer:
+
+    def t_NUMBER(t):
+        r'\d+'
+        t.value = int(t.value)
+        return t
+
+When a function is used, the regular expression rule is specified in the
+function documentation string. The function always takes a single
+argument which is an instance of `LexToken`. This object has attributes
+of `type` which is the token type (as a string), `value` which is the
+lexeme (the actual text matched), `lineno` which is the current line
+number, and `lexpos` which is the position of the token relative to the
+beginning of the input text. By default, `type` is set to the name
+following the `t_` prefix. The action function can modify the contents
+of the `LexToken` object as appropriate. However, when it is done, the
+resulting token should be returned. If no value is returned by the
+action function, the token is discarded and the next token is read.
+
+Internally, `lex.py` uses the `re` module to do its pattern matching.
+Patterns are compiled using the `re.VERBOSE` flag which can be used to
+help readability. However, be aware that unescaped whitespace is ignored
+and comments are allowed in this mode. If your pattern involves
+whitespace, make sure you use `\s`. If you need to match the `#`
+character, use `[#]`.
+
+When building the master regular expression, rules are added in the
+following order:
+
+1.  All tokens defined by functions are added in the same order as they
+    appear in the lexer file.
+2.  Tokens defined by strings are added next by sorting them in order of
+    decreasing regular expression length (longer expressions are added
+    first).
+
+Without this ordering, it can be difficult to correctly match certain
+types of tokens. For example, if you wanted to have separate tokens for
+\"=\" and \"==\", you need to make sure that \"==\" is checked first.
By sorting regular expressions in order of decreasing length, this
problem is solved for rules defined as strings. For functions, the order
can be explicitly controlled since rules appearing first are checked
first.

To handle reserved words, you should write a single rule to match an
identifier and do a special name lookup in a function like this:

    reserved = {
       'if' : 'IF',
       'then' : 'THEN',
       'else' : 'ELSE',
       'while' : 'WHILE',
       ...
    }

    tokens = ['LPAREN','RPAREN',...,'ID'] + list(reserved.values())

    def t_ID(t):
        r'[a-zA-Z_][a-zA-Z_0-9]*'
        t.type = reserved.get(t.value,'ID')    # Check for reserved words
        return t

This approach greatly reduces the number of regular expression rules and
is likely to make things a little faster.

Note: You should avoid writing individual rules for reserved words. For
example, if you write rules like this:

    t_FOR   = r'for'
    t_PRINT = r'print'

those rules will be triggered for identifiers that include those words
as a prefix such as "forget" or "printed". This is probably not what
you want.

### Token values

When tokens are returned by lex, they have a value that is stored in the
`value` attribute. Normally, the value is the text that was matched.
However, the value can be assigned to any Python object. For instance,
when lexing identifiers, you may want to return both the identifier name
and information from some sort of symbol table. To do this, you might
write a rule like this:

    def t_ID(t):
        ...
        # Look up symbol table information and return a tuple
        t.value = (t.value, symbol_lookup(t.value))
        ...
        return t

It is important to note that storing data in other attribute names is
*not* recommended. The `yacc.py` module only exposes the contents of the
`value` attribute. Thus, accessing other attributes may be unnecessarily
awkward. If you need to store multiple values on a token, assign a
tuple, dictionary, or instance to `value`.

### Discarded tokens

To discard a token, such as a comment, define a token rule that returns
no value. For example:

    def t_COMMENT(t):
        r'\#.*'
        pass
        # No return value. Token discarded

Alternatively, you can include the prefix `ignore_` in the token
declaration to force a token to be ignored. For example:

    t_ignore_COMMENT = r'\#.*'

Be advised that if you are ignoring many different kinds of text, you
may still want to use functions since these provide more precise control
over the order in which regular expressions are matched (i.e., functions
are matched in order of specification whereas strings are sorted by
regular expression length).

### Line numbers and positional information

By default, `lex.py` knows nothing about line numbers. This is because
`lex.py` doesn't know anything about what constitutes a "line" of
input (e.g., the newline character or even if the input is textual
data). To update this information, you need to write a special rule. In
the example, the `t_newline()` rule shows how to do this:

    # Define a rule so we can track line numbers
    def t_newline(t):
        r'\n+'
        t.lexer.lineno += len(t.value)

Within the rule, the `lineno` attribute of the underlying lexer
`t.lexer` is updated. After the line number is updated, the token is
discarded since nothing is returned.

`lex.py` does not perform any kind of automatic column tracking.
However, it does record positional information related to each token in
the `lexpos` attribute.
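As a quick sketch (reusing the calclex `lexer` built earlier), the
`lexpos` value of each token is simply its offset into the input string:

    lexer.input("3 + 4")
    for tok in lexer:
        print(tok.type, tok.lexpos)   # NUMBER 0, PLUS 2, NUMBER 4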
Using this, it is usually possible to compute
column information as a separate step. For instance, just count
backwards until you reach a newline:

    # Compute column.
    #     input is the input text string
    #     token is a token instance
    def find_column(input, token):
        line_start = input.rfind('\n', 0, token.lexpos) + 1
        return (token.lexpos - line_start) + 1

Since column information is often only useful in the context of error
handling, calculating the column position can be performed when needed
as opposed to doing it for each token. Note: If you're parsing a
language where whitespace matters (e.g., Python), it's probably better
to match whitespace as a token instead of ignoring it.

### Ignored characters

The special `t_ignore` rule is reserved by `lex.py` for characters that
should be completely ignored in the input stream. Usually this is used
to skip over whitespace and other non-essential characters. Although it
is possible to define a regular expression rule for whitespace in a
manner similar to `t_newline()`, the use of `t_ignore` provides
substantially better lexing performance because it is handled as a
special case and is checked in a much more efficient manner than the
normal regular expression rules.

The characters given in `t_ignore` are not ignored when such characters
are part of other regular expression patterns. For example, if you had a
rule to capture quoted text, that pattern can include the ignored
characters (which will be captured in the normal way). The main purpose
of `t_ignore` is to ignore whitespace and other padding between the
tokens that you actually want to parse.

### Literal characters

Literal characters can be specified by defining a variable `literals` in
your lexing module. For example:

    literals = [ '+','-','*','/' ]

or alternatively:

    literals = "+-*/"

A literal character is a single character that is returned "as is"
when encountered by the lexer. Literals are checked after all of the
defined regular expression rules. Thus, if a rule starts with one of the
literal characters, it will always take precedence.

When a literal token is returned, both its `type` and `value` attributes
are set to the character itself. For example, `'+'`.

It's possible to write token functions that perform additional actions
when literals are matched. However, you'll need to set the token type
appropriately. For example:

    literals = [ '{', '}' ]

    def t_lbrace(t):
        r'\{'
        t.type = '{'      # Set token type to the expected literal
        return t

    def t_rbrace(t):
        r'\}'
        t.type = '}'      # Set token type to the expected literal
        return t

### Error handling

The `t_error()` function is used to handle lexing errors that occur when
illegal characters are detected. In this case, the `t.value` attribute
contains the rest of the input string that has not been tokenized. In
the example, the error function was defined as follows:

    # Error handling rule
    def t_error(t):
        print("Illegal character '%s'" % t.value[0])
        t.lexer.skip(1)

In this case, we print the offending character and skip ahead one
character by calling `t.lexer.skip(1)`.

### EOF Handling

The `t_eof()` function is used to handle an end-of-file (EOF) condition
in the input. As input, it receives a token type `'eof'` with the
`lineno` and `lexpos` attributes set appropriately. The main use of this
function is to provide more input to the lexer so that it can continue
to parse.
Here is an example of how this works:

    # EOF handling rule
    def t_eof(t):
        # Get more input (Example)
        more = input('... ')
        if more:
            t.lexer.input(more)
            return t.lexer.token()
        return None

The EOF function should return the next available token (by calling
`t.lexer.token()`) or `None` to indicate no more data. Be aware that
setting more input with the `t.lexer.input()` method does NOT reset
the lexer state or the `lineno` attribute used for position tracking.
The `lexpos` attribute is reset, so be aware of that if you're using it
in error reporting.

### Building and using the lexer

To build the lexer, the function `lex.lex()` is used. For example:

    lexer = lex.lex()

This function uses Python reflection (or introspection) to read the
regular expression rules out of the calling context and build the lexer.
Once the lexer has been built, two methods can be used to control the
lexer.

`lexer.input(data)`. Reset the lexer and store a new input string.

`lexer.token()`. Return the next token. Returns a special `LexToken`
instance on success or `None` if the end of the input text has been
reached.

### The @TOKEN decorator

In some applications, you may want to define tokens as a series of more
complex regular expression rules. For example:

    digit            = r'([0-9])'
    nondigit         = r'([_A-Za-z])'
    identifier       = r'(' + nondigit + r'(' + digit + r'|' + nondigit + r')*)'

    def t_ID(t):
        # want docstring to be identifier above. ?????
        ...

In this case, we want the regular expression rule for `ID` to be one of
the variables above. However, there is no way to directly specify this
using a normal documentation string. To solve this problem, you can use
the `@TOKEN` decorator. For example:

    from ply.lex import TOKEN

    @TOKEN(identifier)
    def t_ID(t):
        ...

This will attach `identifier` to the docstring for `t_ID()` allowing
`lex.py` to work normally. Naturally, you could use `@TOKEN` on all
functions as an alternative to using docstrings.

### Debugging

For the purpose of debugging, you can run `lex()` in a debugging mode as
follows:

    lexer = lex.lex(debug=True)

This will produce various sorts of debugging information including all
of the added rules, the master regular expressions used by the lexer,
and tokens generated during lexing.

In addition, `lex.py` comes with a simple main function which will
either tokenize input read from standard input or from a file specified
on the command line. To use it, put this in your lexer:

    if __name__ == '__main__':
         lex.runmain()

Please refer to the "Debugging" section near the end for some more
advanced details of debugging.

### Alternative specification of lexers

As shown in the example, lexers are specified all within one Python
module. If you want to put token rules in a different module from the
one in which you invoke `lex()`, use the `module` keyword argument.

For example, you might have a dedicated module that just contains the
token rules:

    # module: tokrules.py
    # This module just contains the lexing rules

    # List of token names.   This is always required
    tokens = (
       'NUMBER',
       'PLUS',
       'MINUS',
       'TIMES',
       'DIVIDE',
       'LPAREN',
       'RPAREN',
    )

    # Regular expression rules for simple tokens
    t_PLUS    = r'\+'
    t_MINUS   = r'-'
    t_TIMES   = r'\*'
    t_DIVIDE  = r'/'
    t_LPAREN  = r'\('
    t_RPAREN  = r'\)'

    # A regular expression rule with some action code
    def t_NUMBER(t):
        r'\d+'
        t.value = int(t.value)
        return t

    # Define a rule so we can track line numbers
    def t_newline(t):
        r'\n+'
        t.lexer.lineno += len(t.value)

    # A string containing ignored characters (spaces and tabs)
    t_ignore  = ' \t'

    # Error handling rule
    def t_error(t):
        print("Illegal character '%s'" % t.value[0])
        t.lexer.skip(1)

Now, if you wanted to build a tokenizer from these rules within a
different module, you would do the following (shown for Python
interactive mode):

    >>> import ply.lex as lex
    >>> import tokrules
    >>> lexer = lex.lex(module=tokrules)
    >>> lexer.input("3 + 4")
    >>> lexer.token()
    LexToken(NUMBER,3,1,0)
    >>> lexer.token()
    LexToken(PLUS,'+',1,2)
    >>> lexer.token()
    LexToken(NUMBER,4,1,4)
    >>> lexer.token()
    None
    >>>

The `module` option can also be used to define lexers from instances of
a class. For example:

    import ply.lex as lex

    class MyLexer(object):
        # List of token names.   This is always required
        tokens = (
           'NUMBER',
           'PLUS',
           'MINUS',
           'TIMES',
           'DIVIDE',
           'LPAREN',
           'RPAREN',
        )

        # Regular expression rules for simple tokens
        t_PLUS    = r'\+'
        t_MINUS   = r'-'
        t_TIMES   = r'\*'
        t_DIVIDE  = r'/'
        t_LPAREN  = r'\('
        t_RPAREN  = r'\)'

        # A regular expression rule with some action code
        # Note addition of self parameter since we're in a class
        def t_NUMBER(self,t):
            r'\d+'
            t.value = int(t.value)
            return t

        # Define a rule so we can track line numbers
        def t_newline(self,t):
            r'\n+'
            t.lexer.lineno += len(t.value)

        # A string containing ignored characters (spaces and tabs)
        t_ignore  = ' \t'

        # Error handling rule
        def t_error(self,t):
            print("Illegal character '%s'" % t.value[0])
            t.lexer.skip(1)

        # Build the lexer
        def build(self,**kwargs):
            self.lexer = lex.lex(module=self, **kwargs)

        # Test it out
        def test(self,data):
            self.lexer.input(data)
            while True:
                 tok = self.lexer.token()
                 if not tok:
                     break
                 print(tok)

    # Build the lexer and try it out
    m = MyLexer()
    m.build()           # Build the lexer
    m.test("3 + 4")     # Test it

When building a lexer from a class, *you should construct the lexer from
an instance of the class*, not the class object itself. This is because
PLY only works properly if the lexer actions are defined by bound
methods.

When using the `module` option to `lex()`, PLY collects symbols from the
underlying object using the `dir()` function. There is no direct access
to the `__dict__` attribute of the object supplied as a module value.

Finally, if you want to keep things nicely encapsulated, but don't want
to use a full-fledged class definition, lexers can be defined using
closures. For example:
    import ply.lex as lex

    # List of token names.   This is always required
    tokens = (
      'NUMBER',
      'PLUS',
      'MINUS',
      'TIMES',
      'DIVIDE',
      'LPAREN',
      'RPAREN',
    )

    def MyLexer():
        # Regular expression rules for simple tokens
        t_PLUS    = r'\+'
        t_MINUS   = r'-'
        t_TIMES   = r'\*'
        t_DIVIDE  = r'/'
        t_LPAREN  = r'\('
        t_RPAREN  = r'\)'

        # A regular expression rule with some action code
        def t_NUMBER(t):
            r'\d+'
            t.value = int(t.value)
            return t

        # Define a rule so we can track line numbers
        def t_newline(t):
            r'\n+'
            t.lexer.lineno += len(t.value)

        # A string containing ignored characters (spaces and tabs)
        t_ignore  = ' \t'

        # Error handling rule
        def t_error(t):
            print("Illegal character '%s'" % t.value[0])
            t.lexer.skip(1)

        # Build the lexer from my environment and return it
        return lex.lex()

Important note: If you are defining a lexer using a class or closure, be
aware that PLY still requires you to only define a single lexer per
module (source file). There are extensive validation/error checking
parts of PLY that may falsely report error messages if you don't
follow this rule.

### Maintaining state

In your lexer, you may want to maintain a variety of state information.
This might include mode settings, symbol tables, and other details. As
an example, suppose that you wanted to keep track of how many NUMBER
tokens had been encountered.

One way to do this is to keep a set of global variables in the module
where you created the lexer. For example:

    num_count = 0
    def t_NUMBER(t):
        r'\d+'
        global num_count
        num_count += 1
        t.value = int(t.value)
        return t

If you don't like the use of a global variable, another place to store
information is inside the Lexer object created by `lex()`. To do this,
you can use the `lexer` attribute of tokens passed to the various rules.
For example:

    def t_NUMBER(t):
        r'\d+'
        t.lexer.num_count += 1     # Note use of lexer attribute
        t.value = int(t.value)
        return t

    lexer = lex.lex()
    lexer.num_count = 0            # Set the initial count

This latter approach has the advantage of being simple and working
correctly in applications where multiple instantiations of a given lexer
exist in the same application. However, this might also feel like a
gross violation of encapsulation to OO purists. Just to put your mind at
some ease, all internal attributes of the lexer (with the exception of
`lineno`) have names that are prefixed by `lex` (e.g.,
`lexdata`,`lexpos`, etc.). Thus, it is perfectly safe to store
attributes in the lexer that don't have names starting with that prefix
or a name that conflicts with one of the predefined methods (e.g.,
`input()`, `token()`, etc.).

If you don't like assigning values on the lexer object, you can define
your lexer as a class as shown in the previous section:

    class MyLexer:
        ...
        def t_NUMBER(self,t):
            r'\d+'
            self.num_count += 1
            t.value = int(t.value)
            return t

        def build(self, **kwargs):
            self.lexer = lex.lex(object=self,**kwargs)

        def __init__(self):
            self.num_count = 0

The class approach may be the easiest to manage if your application is
going to be creating multiple instances of the same lexer and you need
to manage a lot of state.

State can also be managed through closures. For example:

    def MyLexer():
        num_count = 0
        ...
        def t_NUMBER(t):
            r'\d+'
            nonlocal num_count
            num_count += 1
            t.value = int(t.value)
            return t
        ...

### Lexer cloning

If necessary, a lexer object can be duplicated by invoking its `clone()`
method.
For example:

    lexer = lex.lex()
    ...
    newlexer = lexer.clone()

When a lexer is cloned, the copy is exactly identical to the original
lexer including any input text and internal state. However, the clone
allows a different set of input text to be supplied which may be
processed separately. This may be useful in situations when you are
writing a parser/compiler that involves recursive or reentrant
processing. For instance, if you needed to scan ahead in the input for
some reason, you could create a clone and use it to look ahead. Or, if
you were implementing some kind of preprocessor, cloned lexers could be
used to handle different input files.

Creating a clone is different from calling `lex.lex()` in that PLY
doesn't regenerate any of the internal tables or regular expressions.

Special considerations need to be made when cloning lexers that also
maintain their own internal state using classes or closures. Namely, you
need to be aware that the newly created lexers will share all of this
state with the original lexer. For example, if you defined a lexer as a
class and did this:

    m = MyLexer()
    a = lex.lex(object=m)      # Create a lexer

    b = a.clone()              # Clone the lexer

Then both `a` and `b` are going to be bound to the same object `m` and
any changes to `m` will be reflected in both lexers. It's important to
emphasize that `clone()` is only meant to create a new lexer that reuses
the regular expressions and environment of another lexer. If you need to
make a totally new copy of a lexer, then call `lex()` again.

### Internal lexer state

A Lexer object `lexer` has a number of internal attributes that may be
useful in certain situations.

`lexer.lexpos`

:   This attribute is an integer that contains the current position
    within the input text. If you modify the value, it will change the
    result of the next call to `token()`. Within token rule functions,
    this points to the first character *after* the matched text. If the
    value is modified within a rule, the next returned token will be
    matched at the new position.

`lexer.lineno`

:   The current value of the line number attribute stored in the lexer.
    PLY only specifies that the attribute exists; it never sets,
    updates, or performs any processing with it. If you want to track
    line numbers, you will need to add code yourself (see the section on
    line numbers and positional information).

`lexer.lexdata`

:   The current input text stored in the lexer. This is the string
    passed with the `input()` method. It would probably be a bad idea to
    modify this unless you really know what you're doing.

`lexer.lexmatch`

:   This is the raw `Match` object returned by the Python `re.match()`
    function (used internally by PLY) for the current token. If you have
    written a regular expression that contains named groups, you can use
    this to retrieve those values. Note: This attribute is only updated
    when tokens are defined and processed by functions.

### Conditional lexing and start conditions

In advanced parsing applications, it may be useful to have different
lexing states. For instance, you may want the occurrence of a certain
token or syntactic construct to trigger a different kind of lexing. PLY
supports a feature that allows the underlying lexer to be put into a
series of different states. Each state can have its own tokens, lexing
rules, and so forth. The implementation is based largely on the "start
condition" feature of GNU flex.
Details of this can be found at
<http://flex.sourceforge.net/manual/Start-Conditions.html>.

To define a new lexing state, it must first be declared. This is done by
including a "states" declaration in your lex file. For example:

    states = (
       ('foo','exclusive'),
       ('bar','inclusive'),
    )

This declaration declares two states, `'foo'` and `'bar'`. States may be
of two types: `'exclusive'` and `'inclusive'`. An exclusive state
completely overrides the default behavior of the lexer. That is, lex
will only return tokens and apply rules defined specifically for that
state. An inclusive state adds additional tokens and rules to the
default set of rules. Thus, lex will return both the tokens defined by
default in addition to those defined for the inclusive state.

Once a state has been declared, tokens and rules are declared by
including the state name in token/rule declaration. For example:

    t_foo_NUMBER = r'\d+'                      # Token 'NUMBER' in state 'foo'
    t_bar_ID     = r'[a-zA-Z_][a-zA-Z0-9_]*'   # Token 'ID' in state 'bar'

    def t_foo_newline(t):
        r'\n'
        t.lexer.lineno += 1

A token can be declared in multiple states by including multiple state
names in the declaration. For example:

    t_foo_bar_NUMBER = r'\d+'         # Defines token 'NUMBER' in both state 'foo' and 'bar'

Alternatively, a token can be declared in all states by using 'ANY' in
the name:

    t_ANY_NUMBER = r'\d+'             # Defines a token 'NUMBER' in all states

If no state name is supplied, as is normally the case, the token is
associated with a special state `'INITIAL'`. For example, these two
declarations are identical:

    t_NUMBER = r'\d+'
    t_INITIAL_NUMBER = r'\d+'

States are also associated with the special `t_ignore`, `t_error()`, and
`t_eof()` declarations. For example, if a state treats these
differently, you can declare:

    t_foo_ignore = " \t\n"       # Ignored characters for state 'foo'

    def t_bar_error(t):          # Special error handler for state 'bar'
        pass

By default, lexing operates in the `'INITIAL'` state. This state
includes all of the normally defined tokens. For users who aren't using
different states, this fact is completely transparent. If, during lexing
or parsing, you want to change the lexing state, use the `begin()`
method. For example:

    def t_begin_foo(t):
        r'start_foo'
        t.lexer.begin('foo')             # Starts 'foo' state

To get out of a state, you use `begin()` to switch back to the initial
state. For example:

    def t_foo_end(t):
        r'end_foo'
        t.lexer.begin('INITIAL')         # Back to the initial state

The management of states can also be done with a stack. For example:

    def t_begin_foo(t):
        r'start_foo'
        t.lexer.push_state('foo')        # Starts 'foo' state

    def t_foo_end(t):
        r'end_foo'
        t.lexer.pop_state()              # Back to the previous state

The use of a stack would be useful in situations where there are many
ways of entering a new lexing state and you merely want to go back to
the previous state afterwards.

An example might help clarify. Suppose you were writing a parser and you
wanted to grab sections of arbitrary C code enclosed by curly braces.
That is, whenever you encounter a starting brace `'{'`, you want to read
all of the enclosed code up to the ending brace `'}'` and return it as a
string. Doing this with a normal regular expression rule is nearly (if
not actually) impossible. This is because braces can be nested and can
be included in comments and strings. Thus, matching up to the first
matching `'}'` character isn't good enough.
Here is how you might use
lexer states to do this:

    # Declare the state
    states = (
      ('ccode','exclusive'),
    )

    # Match the first {. Enter ccode state.
    def t_ccode(t):
        r'\{'
        t.lexer.code_start = t.lexer.lexpos        # Record the starting position
        t.lexer.level = 1                          # Initial brace level
        t.lexer.begin('ccode')                     # Enter 'ccode' state

    # Rules for the ccode state
    def t_ccode_lbrace(t):
        r'\{'
        t.lexer.level +=1

    def t_ccode_rbrace(t):
        r'\}'
        t.lexer.level -=1

        # If closing brace, return the code fragment
        if t.lexer.level == 0:
             t.value = t.lexer.lexdata[t.lexer.code_start:t.lexer.lexpos+1]
             t.type = "CCODE"
             t.lexer.lineno += t.value.count('\n')
             t.lexer.begin('INITIAL')
             return t

    # C or C++ comment (ignore)
    def t_ccode_comment(t):
        r'(/\*(.|\n)*?\*/)|(//.*)'
        pass

    # C string
    def t_ccode_string(t):
        r'\"([^\\\n]|(\\.))*?\"'

    # C character literal
    def t_ccode_char(t):
        r'\'([^\\\n]|(\\.))*?\''

    # Any sequence of non-whitespace characters (not braces, strings)
    def t_ccode_nonspace(t):
        r'[^\s\{\}\'\"]+'

    # Ignored characters (whitespace)
    t_ccode_ignore = " \t\n"

    # For bad characters, we just skip over it
    def t_ccode_error(t):
        t.lexer.skip(1)

In this example, the occurrence of the first '{' causes the lexer to
record the starting position and enter a new state `'ccode'`. A
collection of rules then match various parts of the input that follow
(comments, strings, etc.). All of these rules merely discard the token
(by not returning a value). However, if the closing right brace is
encountered, the rule `t_ccode_rbrace` collects all of the code (using
the earlier recorded starting position), stores it, and returns a token
'CCODE' containing all of that text. When returning the token, the
lexing state is restored back to its initial state.

### Miscellaneous Issues

-   The lexer requires input to be supplied as a single input string.
    Since most machines have more than enough memory, this rarely
    presents a performance concern. However, it means that the lexer
    currently can't be used with streaming data such as open files or
    sockets. This limitation is primarily a side-effect of using the
    `re` module. You might be able to work around this by implementing
    an appropriate `def t_eof()` end-of-file handling rule. The main
    complication here is that you'll probably need to ensure that data
    is fed to the lexer in a way so that it doesn't split in the
    middle of a token.

-   If you need to supply optional flags to the `re.compile()` function,
    use the `reflags` option to lex. For example:

        lex.lex(reflags=re.UNICODE | re.VERBOSE)

    Note: by default, `reflags` is set to `re.VERBOSE`. If you provide
    your own flags, you may need to include this for PLY to preserve its
    normal behavior.

-   If you are going to create a hand-written lexer and you plan to use
    it with `yacc.py`, it only needs to conform to the following
    requirements:

    1.  It must provide a `token()` method that returns the next token
        or `None` if no more tokens are available.
    2.  The `token()` method must return an object `tok` that has `type`
        and `value` attributes. If line number tracking is being used,
        then the token should also define a `lineno` attribute.

## Parsing basics

`yacc.py` is used to parse language syntax. Before showing an example,
there are a few important bits of background that must be mentioned.
First, *syntax* is usually specified in terms of a BNF grammar.
For
example, if you wanted to parse simple arithmetic expressions, you might
first write an unambiguous grammar specification like this:

    expression : expression + term
               | expression - term
               | term

    term       : term * factor
               | term / factor
               | factor

    factor     : NUMBER
               | ( expression )

In the grammar, symbols such as `NUMBER`, `+`, `-`, `*`, and `/` are
known as *terminals* and correspond to input tokens. Identifiers such as
`term` and `factor` refer to grammar rules comprised of a collection of
terminals and other rules. These identifiers are known as
*non-terminals*.

The semantic behavior of a language is often specified using a technique
known as syntax directed translation. In syntax directed translation,
attributes are attached to each symbol in a given grammar rule along
with an action. Whenever a particular grammar rule is recognized, the
action describes what to do. For example, given the expression grammar
above, you might write the specification for a simple calculator like
this:

    Grammar                             Action
    ----------------------------------  --------------------------------------------
    expression0 : expression1 + term    expression0.val = expression1.val + term.val
                | expression1 - term    expression0.val = expression1.val - term.val
                | term                  expression0.val = term.val

    term0       : term1 * factor        term0.val = term1.val * factor.val
                | term1 / factor        term0.val = term1.val / factor.val
                | factor                term0.val = factor.val

    factor      : NUMBER                factor.val = int(NUMBER.lexval)
                | ( expression )        factor.val = expression.val

A good way to think about syntax directed translation is to view each
symbol in the grammar as a kind of object. Associated with each symbol
is a value representing its "state" (for example, the `val` attribute
above). Semantic actions are then expressed as a collection of functions
or methods that operate on the symbols and associated values.

Yacc uses a parsing technique known as LR-parsing or shift-reduce
parsing. LR parsing is a bottom up technique that tries to recognize the
right-hand-side of various grammar rules. Whenever a valid
right-hand-side is found in the input, the appropriate action code is
triggered and the grammar symbols are replaced by the grammar symbol on
the left-hand-side.

LR parsing is commonly implemented by shifting grammar symbols onto a
stack and looking at the stack and the next input token for patterns
that match one of the grammar rules. The details of the algorithm can be
found in a compiler textbook, but the following example illustrates the
steps that are performed if you wanted to parse the expression
`3 + 5 * (10 - 20)` using the grammar defined above.
In the example, the
special symbol `$` represents the end of input:

    Step Symbol Stack                Input Tokens          Action
    ---- --------------------------  --------------------  -------------------------------
    1                                3 + 5 * ( 10 - 20 )$  Shift 3
    2    3                           + 5 * ( 10 - 20 )$    Reduce factor : NUMBER
    3    factor                      + 5 * ( 10 - 20 )$    Reduce term : factor
    4    term                        + 5 * ( 10 - 20 )$    Reduce expr : term
    5    expr                        + 5 * ( 10 - 20 )$    Shift +
    6    expr +                      5 * ( 10 - 20 )$      Shift 5
    7    expr + 5                    * ( 10 - 20 )$        Reduce factor : NUMBER
    8    expr + factor               * ( 10 - 20 )$        Reduce term : factor
    9    expr + term                 * ( 10 - 20 )$        Shift *
    10   expr + term *               ( 10 - 20 )$          Shift (
    11   expr + term * (             10 - 20 )$            Shift 10
    12   expr + term * ( 10          - 20 )$               Reduce factor : NUMBER
    13   expr + term * ( factor      - 20 )$               Reduce term : factor
    14   expr + term * ( term        - 20 )$               Reduce expr : term
    15   expr + term * ( expr        - 20 )$               Shift -
    16   expr + term * ( expr -      20 )$                 Shift 20
    17   expr + term * ( expr - 20   )$                    Reduce factor : NUMBER
    18   expr + term * ( expr - factor )$                  Reduce term : factor
    19   expr + term * ( expr - term )$                    Reduce expr : expr - term
    20   expr + term * ( expr        )$                    Shift )
    21   expr + term * ( expr )      $                     Reduce factor : (expr)
    22   expr + term * factor        $                     Reduce term : term * factor
    23   expr + term                 $                     Reduce expr : expr + term
    24   expr                        $                     Reduce expr
    25                               $                     Success!

When parsing the expression, an underlying state machine and the current
input token determine what happens next. If the next token looks like
part of a valid grammar rule (based on other items on the stack), it is
generally shifted onto the stack. If the top of the stack contains a
valid right-hand-side of a grammar rule, it is usually "reduced" and
the symbols replaced with the symbol on the left-hand-side. When this
reduction occurs, the appropriate action is triggered (if defined). If
the input token can't be shifted and the top of stack doesn't match
any grammar rules, a syntax error has occurred and the parser must take
some kind of recovery step (or bail out). A parse is only successful if
the parser reaches a state where the symbol stack is empty and there are
no more input tokens.

It is important to note that the underlying implementation is built
around a large finite-state machine that is encoded in a collection of
tables. The construction of these tables is non-trivial and beyond the
scope of this discussion. However, subtle details of this process
explain why, in the example above, the parser chooses to shift a token
onto the stack in step 9 rather than reducing the rule
`expr : expr + term`.

## Yacc

The `ply.yacc` module implements the parsing component of PLY. The name
"yacc" stands for "Yet Another Compiler Compiler" and is borrowed
from the Unix tool of the same name.

### An example

Suppose you wanted to make a grammar for simple arithmetic expressions
as previously described. Here is how you would do it with `yacc.py`:

    # Yacc example

    import ply.yacc as yacc

    # Get the token map from the lexer.  This is required.

    from calclex import tokens

    def p_expression_plus(p):
        'expression : expression PLUS term'
        p[0] = p[1] + p[3]

    def p_expression_minus(p):
        'expression : expression MINUS term'
        p[0] = p[1] - p[3]

    def p_expression_term(p):
        'expression : term'
        p[0] = p[1]

    def p_term_times(p):
        'term : term TIMES factor'
        p[0] = p[1] * p[3]

    def p_term_div(p):
        'term : term DIVIDE factor'
        p[0] = p[1] / p[3]

    def p_term_factor(p):
        'term : factor'
        p[0] = p[1]

    def p_factor_num(p):
        'factor : NUMBER'
        p[0] = p[1]

    def p_factor_expr(p):
        'factor : LPAREN expression RPAREN'
        p[0] = p[2]

    # Error rule for syntax errors
    def p_error(p):
        print("Syntax error in input!")

    # Build the parser
    parser = yacc.yacc()

    while True:
       try:
           s = input('calc > ')
       except EOFError:
           break
       if not s: continue
       result = parser.parse(s)
       print(result)

In this example, each grammar rule is defined by a Python function where
the docstring to that function contains the appropriate context-free
grammar specification. The statements that make up the function body
implement the semantic actions of the rule. Each function accepts a
single argument `p` that is a sequence containing the values of each
grammar symbol in the corresponding rule. The values of `p[i]` are
mapped to grammar symbols as shown here:

    def p_expression_plus(p):
        'expression : expression PLUS term'
        #   ^            ^        ^    ^
        #  p[0]         p[1]     p[2] p[3]

        p[0] = p[1] + p[3]

For tokens, the "value" of the corresponding `p[i]` is the *same* as
the `t.value` attribute assigned in the lexer module. For non-terminals,
the value is determined by whatever is placed in `p[0]` when rules are
reduced. This value can be anything at all. However, it is probably most
common for the value to be a simple Python type, a tuple, or an
instance. In this example, we are relying on the fact that the `NUMBER`
token stores an integer value in its value field. All of the other rules
perform various types of integer operations and propagate the result.

Note: The use of negative indices has a special meaning in yacc;
specifically, `p[-1]` does not have the same value as `p[3]` in this
example. Please see the section on "Embedded Actions" for further
details.

The first rule defined in the yacc specification determines the starting
grammar symbol (in this case, a rule for `expression` appears first).
Whenever the starting rule is reduced by the parser and no more input is
available, parsing stops and the final value is returned (this value
will be whatever the top-most rule placed in `p[0]`). Note: an
alternative starting symbol can be specified using the `start` keyword
argument to `yacc()`.

The `p_error(p)` rule is defined to catch syntax errors. See the error
handling section below for more detail.

To build the parser, call the `yacc.yacc()` function. This function
looks at the module and attempts to construct all of the LR parsing
tables for the grammar you have specified.

If any errors are detected in your grammar specification, `yacc.py` will
produce diagnostic messages and possibly raise an exception. Some of the
errors that can be detected include:

-   Duplicated function names (if more than one rule functions have the
    same name in the grammar file).
-   Shift/reduce and reduce/reduce conflicts generated by ambiguous
    grammars.
-   Badly specified grammar rules.
-   Infinite recursion (rules that can never terminate).
-   Unused rules and tokens
-   Undefined rules and tokens

The next few sections discuss grammar specification in more detail.

The final part of the example shows how to actually run the parser
created by `yacc()`. To run the parser, you have to call `parse()`
with a string of input text. This will run all of the grammar rules and
return the result of the entire parse. The returned result is the value
assigned to `p[0]` in the starting grammar rule.

### Combining Grammar Rule Functions

When grammar rules are similar, they can be combined into a single
function. For example, consider the two rules in our earlier example:

    def p_expression_plus(p):
        'expression : expression PLUS term'
        p[0] = p[1] + p[3]

    def p_expression_minus(p):
        'expression : expression MINUS term'
        p[0] = p[1] - p[3]

Instead of writing two functions, you might write a single function like
this:

    def p_expression(p):
        '''expression : expression PLUS term
                      | expression MINUS term'''
        if p[2] == '+':
            p[0] = p[1] + p[3]
        elif p[2] == '-':
            p[0] = p[1] - p[3]

In general, the docstring for any given function can contain multiple
grammar rules. So, it would have also been legal (although possibly
confusing) to write this:

    def p_binary_operators(p):
        '''expression : expression PLUS term
                      | expression MINUS term
           term       : term TIMES factor
                      | term DIVIDE factor'''
        if p[2] == '+':
            p[0] = p[1] + p[3]
        elif p[2] == '-':
            p[0] = p[1] - p[3]
        elif p[2] == '*':
            p[0] = p[1] * p[3]
        elif p[2] == '/':
            p[0] = p[1] / p[3]

When combining grammar rules into a single function, it is usually a
good idea for all of the rules to have a similar structure (e.g., the
same number of terms). Otherwise, the corresponding action code may be
more complicated than necessary. However, it is possible to handle
simple cases using `len()`. For example:

    def p_expressions(p):
        '''expression : expression MINUS expression
                      | MINUS expression'''
        if (len(p) == 4):
            p[0] = p[1] - p[3]
        elif (len(p) == 3):
            p[0] = -p[2]

If parsing performance is a concern, you should resist the urge to put
too much conditional processing into a single grammar rule as shown in
these examples. When you add checks to see which grammar rule is being
handled, you are actually duplicating the work that the parser has
already performed (i.e., the parser already knows exactly what rule it
matched). You can eliminate this overhead by using a separate `p_rule()`
function for each grammar rule.

### Character Literals

If desired, a grammar may contain tokens defined as single character
literals. For example:

    def p_binary_operators(p):
        '''expression : expression '+' term
                      | expression '-' term
           term       : term '*' factor
                      | term '/' factor'''
        if p[2] == '+':
            p[0] = p[1] + p[3]
        elif p[2] == '-':
            p[0] = p[1] - p[3]
        elif p[2] == '*':
            p[0] = p[1] * p[3]
        elif p[2] == '/':
            p[0] = p[1] / p[3]

A character literal must be enclosed in quotes such as `'+'`. In
addition, if literals are used, they must be declared in the
corresponding `lex` file through the use of a special `literals`
declaration:

    # Literals.  Should be placed in module given to lex()
    literals = ['+','-','*','/' ]

Character literals are limited to a single character. Thus, it is not
legal to specify literals such as `'<='` or `'=='`. For this, use the
normal lexing rules (e.g., define a rule such as `t_EQ = r'=='`).
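To make the pairing concrete, here is a hedged sketch (the rule names
are illustrative, and the lexer fragment omits error handling and
`t_ignore` for brevity) showing how the `literals` declaration in the
lexer lines up with the quoted characters in the grammar:

    # In the lexer module (fragment; error handling omitted)
    literals = ['+', '-']
    tokens = ('NUMBER',)

    def t_NUMBER(t):
        r'\d+'
        t.value = int(t.value)
        return t

    # In the grammar, the same characters appear in quotes
    def p_expression_binop(p):
        '''expression : expression '+' expression
                      | expression '-' expression'''
        p[0] = p[1] + p[3] if p[2] == '+' else p[1] - p[3]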

### Empty Productions

`yacc.py` can handle empty productions by defining a rule like this:

    def p_empty(p):
        'empty :'
        pass

Now to use the empty production, use 'empty' as a symbol. For example:

    def p_optitem(p):
        '''optitem : item
                   | empty'''
        ...

Note: You can write empty rules anywhere by specifying an empty right
hand side. However, I personally find that writing an "empty" rule and
using "empty" to denote an empty production is easier to read and more
clearly states your intentions.

### Changing the starting symbol

Normally, the first rule found in a yacc specification defines the
starting grammar rule (top level rule). To change this, supply a `start`
specifier in your file. For example:

    start = 'foo'

    def p_bar(p):
        'bar : A B'

    # This is the starting rule due to the start specifier above
    def p_foo(p):
        'foo : bar X'
    ...

The use of a `start` specifier may be useful during debugging since you
can use it to have yacc build a subset of a larger grammar. For this
purpose, it is also possible to specify a starting symbol as an argument
to `yacc()`. For example:

    parser = yacc.yacc(start='foo')

### Dealing With Ambiguous Grammars

The expression grammar given in the earlier example has been written in
a special format to eliminate ambiguity. However, in many situations, it
is extremely difficult or awkward to write grammars in this format. A
much more natural way to express the grammar is in a more compact form
like this:

    expression : expression PLUS expression
               | expression MINUS expression
               | expression TIMES expression
               | expression DIVIDE expression
               | LPAREN expression RPAREN
               | NUMBER

Unfortunately, this grammar specification is ambiguous. For example, if
you are parsing the string "3 * 4 + 5", there is no way to tell how
the operators are supposed to be grouped. For example, does the
expression mean "(3 * 4) + 5" or is it "3 * (4 + 5)"?

When an ambiguous grammar is given to `yacc.py` it will print messages
about "shift/reduce conflicts" or "reduce/reduce conflicts". A
shift/reduce conflict is caused when the parser generator can't decide
whether to reduce a rule or shift a symbol onto the parsing stack. For
example, consider the string "3 * 4 + 5" and the internal parsing
stack:

    Step Symbol Stack           Input Tokens            Action
    ---- ---------------------  ---------------------   -------------------------------
    1    $                                3 * 4 + 5$    Shift 3
    2    $ 3                                * 4 + 5$    Reduce expression : NUMBER
    3    $ expr                             * 4 + 5$    Shift *
    4    $ expr *                             4 + 5$    Shift 4
    5    $ expr * 4                             + 5$    Reduce expression : NUMBER
    6    $ expr * expr                          + 5$    SHIFT/REDUCE CONFLICT ????

In this case, when the parser reaches step 6, it has two options. One is
to reduce the rule `expr : expr * expr` on the stack. The other option
is to shift the token `+` on the stack. Both options are perfectly legal
from the rules of the context-free-grammar.

By default, all shift/reduce conflicts are resolved in favor of
shifting. Therefore, in the above example, the parser will always shift
the `+` instead of reducing. Although this strategy works in many cases
(for example, the case of "if-then" versus "if-then-else"), it is
not enough for arithmetic expressions. In fact, in the above example,
the decision to shift `+` is completely wrong: we should have reduced
`expr * expr` since multiplication has higher mathematical precedence
than addition. The sketch below shows this grammar written out as PLY
rules.
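For reference, here is a sketch of how this ambiguous grammar might be
entered directly as PLY rules (it reuses the calclex tokens from
earlier; building it will produce the shift/reduce warnings discussed
above):

    import ply.yacc as yacc
    from calclex import tokens     # lexer from the earlier example

    def p_expression_binop(p):
        '''expression : expression PLUS expression
                      | expression MINUS expression
                      | expression TIMES expression
                      | expression DIVIDE expression'''
        # No precedence has been declared, so this grammar is ambiguous
        ops = {'+': lambda a, b: a + b, '-': lambda a, b: a - b,
               '*': lambda a, b: a * b, '/': lambda a, b: a / b}
        p[0] = ops[p[2]](p[1], p[3])

    def p_expression_group(p):
        'expression : LPAREN expression RPAREN'
        p[0] = p[2]

    def p_expression_number(p):
        'expression : NUMBER'
        p[0] = p[1]

    def p_error(p):
        print("Syntax error in input!")

    parser = yacc.yacc()    # Warns about the shift/reduce conflicts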

To resolve ambiguity, especially in expression grammars, `yacc.py`
allows individual tokens to be assigned a precedence level and
associativity. This is done by adding a variable `precedence` to the
grammar file like this:

    precedence = (
        ('left', 'PLUS', 'MINUS'),
        ('left', 'TIMES', 'DIVIDE'),
    )

This declaration specifies that `PLUS`/`MINUS` have the same precedence
level and are left-associative and that `TIMES`/`DIVIDE` have the same
precedence and are left-associative. Within the `precedence`
declaration, tokens are ordered from lowest to highest precedence. Thus,
this declaration specifies that `TIMES`/`DIVIDE` have higher precedence
than `PLUS`/`MINUS` (since they appear later in the precedence
specification).

The precedence specification works by associating a numerical precedence
level value and associativity direction to the listed tokens. For
example, in the above example you get:

    PLUS   : level = 1, assoc = 'left'
    MINUS  : level = 1, assoc = 'left'
    TIMES  : level = 2, assoc = 'left'
    DIVIDE : level = 2, assoc = 'left'

These values are then used to attach a numerical precedence value and
associativity direction to each grammar rule. *This is always determined
by looking at the precedence of the right-most terminal symbol.* For
example:

    expression : expression PLUS expression     # level = 1, left
               | expression MINUS expression    # level = 1, left
               | expression TIMES expression    # level = 2, left
               | expression DIVIDE expression   # level = 2, left
               | LPAREN expression RPAREN       # level = None (not specified)
               | NUMBER                         # level = None (not specified)

When shift/reduce conflicts are encountered, the parser generator
resolves the conflict by looking at the precedence rules and
associativity specifiers.

1.  If the current token has higher precedence than the rule on the
    stack, it is shifted.
2.  If the grammar rule on the stack has higher precedence, the rule is
    reduced.
3.  If the current token and the grammar rule have the same precedence,
    the rule is reduced for left associativity, whereas the token is
    shifted for right associativity.
4.  If nothing is known about the precedence, shift/reduce conflicts are
    resolved in favor of shifting (the default).

For example, if "expression PLUS expression" has been parsed and the
next token is "TIMES", the action is going to be a shift because
"TIMES" has a higher precedence level than "PLUS". On the other
hand, if "expression TIMES expression" has been parsed and the next
token is "PLUS", the action is going to be reduce because "PLUS" has
a lower precedence than "TIMES."

When shift/reduce conflicts are resolved using the first three
techniques (with the help of precedence rules), `yacc.py` will report no
errors or conflicts in the grammar (although it will print some
information in the `parser.out` debugging file).

One problem with the precedence specifier technique is that it is
sometimes necessary to change the precedence of an operator in certain
contexts. For example, consider a unary-minus operator in
"3 + 4 * -5". Mathematically, the unary minus is normally given a very
high precedence, being evaluated before the multiply. However, in our
precedence specifier, MINUS has a lower precedence than TIMES.
To deal
with this, precedence rules can be given for so-called "fictitious
tokens" like this:

    precedence = (
        ('left', 'PLUS', 'MINUS'),
        ('left', 'TIMES', 'DIVIDE'),
        ('right', 'UMINUS'),            # Unary minus operator
    )

Now, in the grammar file, we can write our unary minus rule like this:

    def p_expr_uminus(p):
        'expression : MINUS expression %prec UMINUS'
        p[0] = -p[2]

In this case, `%prec UMINUS` overrides the default rule precedence,
setting it to that of UMINUS in the precedence specifier.

At first, the use of UMINUS in this example may appear very confusing.
UMINUS is not an input token or a grammar rule. Instead, you should
think of it as the name of a special marker in the precedence table.
When you use the `%prec` qualifier, you're telling yacc that you want
the precedence of the expression to be the same as for this special
marker instead of the usual precedence.

It is also possible to specify non-associativity in the `precedence`
table. This would be used when you *don't* want operations to chain
together. For example, suppose you wanted to support comparison
operators like `<` and `>` but you didn't want to allow combinations
like `a < b < c`. To do this, specify a rule like this:

    precedence = (
        ('nonassoc', 'LESSTHAN', 'GREATERTHAN'),  # Nonassociative operators
        ('left', 'PLUS', 'MINUS'),
        ('left', 'TIMES', 'DIVIDE'),
        ('right', 'UMINUS'),            # Unary minus operator
    )

If you do this, the occurrence of input text such as `a < b < c` will
result in a syntax error. However, simple expressions such as `a < b`
will still be fine.

Reduce/reduce conflicts are caused when there are multiple grammar rules
that can be applied to a given set of symbols. This kind of conflict is
almost always bad and is always resolved by picking the rule that
appears first in the grammar file. Reduce/reduce conflicts are almost
always caused when different sets of grammar rules somehow generate the
same set of symbols. For example:

    assignment : ID EQUALS NUMBER
               | ID EQUALS expression

    expression : expression PLUS expression
               | expression MINUS expression
               | expression TIMES expression
               | expression DIVIDE expression
               | LPAREN expression RPAREN
               | NUMBER

In this case, a reduce/reduce conflict exists between these two rules:

    assignment : ID EQUALS NUMBER
    expression : NUMBER

For example, if you wrote "a = 5", the parser can't figure out if
this is supposed to be reduced as `assignment : ID EQUALS NUMBER` or
whether it's supposed to reduce the 5 as an expression and then reduce
the rule `assignment : ID EQUALS expression`.

It should be noted that reduce/reduce conflicts are notoriously
difficult to spot looking at the input grammar. When a reduce/reduce
conflict occurs, `yacc()` will try to help by printing a warning message
such as this:

    WARNING: 1 reduce/reduce conflict
    WARNING: reduce/reduce conflict in state 15 resolved using rule (assignment -> ID EQUALS NUMBER)
    WARNING: rejected rule (expression -> NUMBER)

This message identifies the two rules that are in conflict. However, it
may not tell you how the parser arrived at such a state. To try and
figure it out, you'll probably have to look at your grammar and the
contents of the `parser.out` debugging file with an appropriately high
level of caffeination.

### The parser.out file

Tracking down shift/reduce and reduce/reduce conflicts is one of the
finer pleasures of using an LR parsing algorithm.
To assist in +debugging, `yacc.py` can create a debugging file called \'parser.out\'. +To create this file, use `yacc.yacc(debug=True)`. The contents of this +file look like the following: + + Unused terminals: + + + Grammar + + Rule 1 expression -> expression PLUS expression + Rule 2 expression -> expression MINUS expression + Rule 3 expression -> expression TIMES expression + Rule 4 expression -> expression DIVIDE expression + Rule 5 expression -> NUMBER + Rule 6 expression -> LPAREN expression RPAREN + + Terminals, with rules where they appear + + TIMES : 3 + error : + MINUS : 2 + RPAREN : 6 + LPAREN : 6 + DIVIDE : 4 + PLUS : 1 + NUMBER : 5 + + Nonterminals, with rules where they appear + + expression : 1 1 2 2 3 3 4 4 6 0 + + + Parsing method: LALR + + + state 0 + + S' -> . expression + expression -> . expression PLUS expression + expression -> . expression MINUS expression + expression -> . expression TIMES expression + expression -> . expression DIVIDE expression + expression -> . NUMBER + expression -> . LPAREN expression RPAREN + + NUMBER shift and go to state 3 + LPAREN shift and go to state 2 + + + state 1 + + S' -> expression . + expression -> expression . PLUS expression + expression -> expression . MINUS expression + expression -> expression . TIMES expression + expression -> expression . DIVIDE expression + + PLUS shift and go to state 6 + MINUS shift and go to state 5 + TIMES shift and go to state 4 + DIVIDE shift and go to state 7 + + + state 2 + + expression -> LPAREN . expression RPAREN + expression -> . expression PLUS expression + expression -> . expression MINUS expression + expression -> . expression TIMES expression + expression -> . expression DIVIDE expression + expression -> . NUMBER + expression -> . LPAREN expression RPAREN + + NUMBER shift and go to state 3 + LPAREN shift and go to state 2 + + + state 3 + + expression -> NUMBER . + + $ reduce using rule 5 + PLUS reduce using rule 5 + MINUS reduce using rule 5 + TIMES reduce using rule 5 + DIVIDE reduce using rule 5 + RPAREN reduce using rule 5 + + + state 4 + + expression -> expression TIMES . expression + expression -> . expression PLUS expression + expression -> . expression MINUS expression + expression -> . expression TIMES expression + expression -> . expression DIVIDE expression + expression -> . NUMBER + expression -> . LPAREN expression RPAREN + + NUMBER shift and go to state 3 + LPAREN shift and go to state 2 + + + state 5 + + expression -> expression MINUS . expression + expression -> . expression PLUS expression + expression -> . expression MINUS expression + expression -> . expression TIMES expression + expression -> . expression DIVIDE expression + expression -> . NUMBER + expression -> . LPAREN expression RPAREN + + NUMBER shift and go to state 3 + LPAREN shift and go to state 2 + + + state 6 + + expression -> expression PLUS . expression + expression -> . expression PLUS expression + expression -> . expression MINUS expression + expression -> . expression TIMES expression + expression -> . expression DIVIDE expression + expression -> . NUMBER + expression -> . LPAREN expression RPAREN + + NUMBER shift and go to state 3 + LPAREN shift and go to state 2 + + + state 7 + + expression -> expression DIVIDE . expression + expression -> . expression PLUS expression + expression -> . expression MINUS expression + expression -> . expression TIMES expression + expression -> . expression DIVIDE expression + expression -> . NUMBER + expression -> . 
LPAREN expression RPAREN + + NUMBER shift and go to state 3 + LPAREN shift and go to state 2 + + + state 8 + + expression -> LPAREN expression . RPAREN + expression -> expression . PLUS expression + expression -> expression . MINUS expression + expression -> expression . TIMES expression + expression -> expression . DIVIDE expression + + RPAREN shift and go to state 13 + PLUS shift and go to state 6 + MINUS shift and go to state 5 + TIMES shift and go to state 4 + DIVIDE shift and go to state 7 + + + state 9 + + expression -> expression TIMES expression . + expression -> expression . PLUS expression + expression -> expression . MINUS expression + expression -> expression . TIMES expression + expression -> expression . DIVIDE expression + + $ reduce using rule 3 + PLUS reduce using rule 3 + MINUS reduce using rule 3 + TIMES reduce using rule 3 + DIVIDE reduce using rule 3 + RPAREN reduce using rule 3 + + ! PLUS [ shift and go to state 6 ] + ! MINUS [ shift and go to state 5 ] + ! TIMES [ shift and go to state 4 ] + ! DIVIDE [ shift and go to state 7 ] + + state 10 + + expression -> expression MINUS expression . + expression -> expression . PLUS expression + expression -> expression . MINUS expression + expression -> expression . TIMES expression + expression -> expression . DIVIDE expression + + $ reduce using rule 2 + PLUS reduce using rule 2 + MINUS reduce using rule 2 + RPAREN reduce using rule 2 + TIMES shift and go to state 4 + DIVIDE shift and go to state 7 + + ! TIMES [ reduce using rule 2 ] + ! DIVIDE [ reduce using rule 2 ] + ! PLUS [ shift and go to state 6 ] + ! MINUS [ shift and go to state 5 ] + + state 11 + + expression -> expression PLUS expression . + expression -> expression . PLUS expression + expression -> expression . MINUS expression + expression -> expression . TIMES expression + expression -> expression . DIVIDE expression + + $ reduce using rule 1 + PLUS reduce using rule 1 + MINUS reduce using rule 1 + RPAREN reduce using rule 1 + TIMES shift and go to state 4 + DIVIDE shift and go to state 7 + + ! TIMES [ reduce using rule 1 ] + ! DIVIDE [ reduce using rule 1 ] + ! PLUS [ shift and go to state 6 ] + ! MINUS [ shift and go to state 5 ] + + state 12 + + expression -> expression DIVIDE expression . + expression -> expression . PLUS expression + expression -> expression . MINUS expression + expression -> expression . TIMES expression + expression -> expression . DIVIDE expression + + $ reduce using rule 4 + PLUS reduce using rule 4 + MINUS reduce using rule 4 + TIMES reduce using rule 4 + DIVIDE reduce using rule 4 + RPAREN reduce using rule 4 + + ! PLUS [ shift and go to state 6 ] + ! MINUS [ shift and go to state 5 ] + ! TIMES [ shift and go to state 4 ] + ! DIVIDE [ shift and go to state 7 ] + + state 13 + + expression -> LPAREN expression RPAREN . + + $ reduce using rule 6 + PLUS reduce using rule 6 + MINUS reduce using rule 6 + TIMES reduce using rule 6 + DIVIDE reduce using rule 6 + RPAREN reduce using rule 6 + +The different states that appear in this file are a representation of +every possible sequence of valid input tokens allowed by the grammar. +When receiving input tokens, the parser is building up a stack and +looking for matching rules. Each state keeps track of the grammar rules +that might be in the process of being matched at that point. Within each +rule, the \".\" character indicates the current location of the parse +within that rule. In addition, the actions for each valid input token +are listed. 
When a shift/reduce or reduce/reduce conflict arises, rules *not*
selected are prefixed with an `!`. For example:

    ! TIMES           [ reduce using rule 2 ]
    ! DIVIDE          [ reduce using rule 2 ]
    ! PLUS            [ shift and go to state 6 ]
    ! MINUS           [ shift and go to state 5 ]

By looking at these rules (and with a little practice), you can usually
track down the source of most parsing conflicts. It should also be
stressed that not all shift-reduce conflicts are bad. However, the only
way to be sure that they are resolved correctly is to look at
`parser.out`.

### Syntax Error Handling

If you are creating a parser for production use, the handling of syntax
errors is important. As a general rule, you don't want a parser to
throw up its hands and stop at the first sign of trouble. Instead, you
want it to report the error, recover if possible, and continue parsing
so that all of the errors in the input get reported to the user at
once. This is the standard behavior found in compilers for languages
such as C, C++, and Java.

In PLY, when a syntax error occurs during parsing, the error is
immediately detected (i.e., the parser does not read any more tokens
beyond the source of the error). At this point, the parser enters a
recovery mode that can be used to try to continue parsing. As a general
rule, error recovery in LR parsers is a delicate topic that involves
ancient rituals and black magic. The recovery mechanism provided by
`yacc.py` is comparable to that of Unix yacc, so you may want to
consult a book like O'Reilly's "Lex and Yacc" for some of the finer
details.

When a syntax error occurs, `yacc.py` performs the following steps:

1.  On the first occurrence of an error, the user-defined `p_error()`
    function is called with the offending token as an argument.
    However, if the syntax error is due to reaching the end-of-file,
    `p_error()` is called with an argument of `None`. Afterwards, the
    parser enters an "error-recovery" mode in which it will not make
    future calls to `p_error()` until it has successfully shifted at
    least 3 tokens onto the parsing stack.
2.  If no recovery action is taken in `p_error()`, the offending
    lookahead token is replaced with a special `error` token.
3.  If the offending lookahead token is already set to `error`, the
    top item of the parsing stack is deleted.
4.  If the entire parsing stack is unwound, the parser enters a
    restart state and attempts to start parsing from its initial
    state.
5.  If a grammar rule accepts `error` as a token, it will be shifted
    onto the parsing stack.
6.  If the top item of the parsing stack is `error`, lookahead tokens
    will be discarded until the parser can successfully shift a new
    symbol or reduce a rule involving `error`.

#### Recovery and resynchronization with error rules

The most well-behaved approach for handling syntax errors is to write
grammar rules that include the `error` token. For example, suppose your
language had a grammar rule for a print statement like this:

    def p_statement_print(p):
        'statement : PRINT expr SEMI'
        ...

To account for the possibility of a bad expression, you might write an
additional grammar rule like this:

    def p_statement_print_error(p):
        'statement : PRINT error SEMI'
        print("Syntax error in print statement. Bad expression")

In this case, the `error` token will match any sequence of tokens that
might appear up to the first semicolon that is encountered. Once the
semicolon is reached, the rule will be invoked and the `error` token
will go away.

This type of recovery is sometimes known as parser resynchronization.
The `error` token acts as a wildcard for any bad input text and the
token immediately following `error` acts as a synchronization token.
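Putting these pieces together, here is a minimal, self-contained sketch
of resynchronization in action. It is not taken from the original text:
the token names, grammar, and input are invented for illustration, but
it should report every statement, good or bad, in a single pass:

    import ply.lex as lex
    import ply.yacc as yacc

    tokens = ('PRINT', 'NUMBER', 'SEMI')

    t_SEMI = r';'
    t_ignore = ' \t\n'

    def t_PRINT(t):
        r'print'
        return t

    def t_NUMBER(t):
        r'\d+'
        t.value = int(t.value)
        return t

    def t_error(t):
        t.lexer.skip(1)

    def p_statements(p):
        '''statements : statements statement
                      | statement'''

    def p_statement(p):
        'statement : PRINT NUMBER SEMI'
        print("ok:", p[2])

    def p_statement_error(p):
        'statement : PRINT error SEMI'
        # 'error' matched the bad tokens; SEMI resynchronized the parser
        print("bad print statement, skipping to ';'")

    def p_error(p):
        pass                # recovery is handled by the error rule above

    lexer = lex.lex()
    parser = yacc.yacc()
    parser.parse("print 1; print 2 2; print 3;", lexer=lexer)

If this behaves as expected, it prints `ok: 1`, the error message for
the malformed middle statement, and then `ok: 3`, showing that parsing
continued past the error.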
It is important to note that the `error` token usually does not appear
as the last token on the right in an error rule. For example:

    def p_statement_print_error(p):
        'statement : PRINT error'
        print("Syntax error in print statement. Bad expression")

This is because the first bad token encountered will cause the rule to
be reduced, which may make it difficult to recover if more bad tokens
immediately follow.

#### Panic mode recovery

An alternative error recovery scheme is to enter a panic mode recovery
in which tokens are discarded to a point where the parser might be able
to recover in some sensible manner.

Panic mode recovery is implemented entirely in the `p_error()`
function. For example, this function starts discarding tokens until it
reaches a closing '}'. Then, it restarts the parser in its initial
state:

    def p_error(p):
        print("Whoa. You are seriously hosed.")
        if not p:
            print("End of File!")
            return

        # Read ahead looking for a closing '}'
        while True:
            tok = parser.token()             # Get the next token
            if not tok or tok.type == 'RBRACE':
                break
        parser.restart()

This function discards the bad token and tells the parser that the
error was OK:

    def p_error(p):
        if p:
            print("Syntax error at token", p.type)
            # Just discard the token and tell the parser it's okay.
            parser.errok()
        else:
            print("Syntax error at EOF")

More information on these methods is as follows:

`parser.errok()`

:   This resets the parser state so it doesn't think it's in
    error-recovery mode. This will prevent an `error` token from being
    generated and will reset the internal error counters so that the
    next syntax error will call `p_error()` again.

`parser.token()`

:   This returns the next token on the input stream.

`parser.restart()`

:   This discards the entire parsing stack and resets the parser to
    its initial state.

To supply the next lookahead token to the parser, `p_error()` can
return a token. This might be useful if trying to synchronize on
special characters. For example:

    def p_error(p):
        # Read ahead looking for a terminating ";"
        while True:
            tok = parser.token()             # Get the next token
            if not tok or tok.type == 'SEMI':
                break
        parser.errok()

        # Return SEMI to the parser as the next lookahead token
        return tok

Keep in mind that in the above error-handling functions, `parser` is an
instance of the parser created by `yacc()`. You'll need to save this
instance somewhere in your code so that you can refer to it during
error handling.

#### Signalling an error from a production

If necessary, a production rule can manually force the parser to enter
error recovery. This is done by raising the `SyntaxError` exception
like this:

    def p_production(p):
        'production : some production ...'
        raise SyntaxError

The effect of raising `SyntaxError` is the same as if the last symbol
shifted onto the parsing stack was actually a syntax error. Thus, when
you do this, the last symbol shifted is popped off of the parsing stack
and the current lookahead token is set to an `error` token. The parser
then enters error-recovery mode where it tries to reduce rules that can
accept `error` tokens. The steps that follow from this point are
exactly the same as if a syntax error were detected and `p_error()`
were called.
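As a concrete, hypothetical illustration (the rule and token names are
assumptions, not taken from the original text), a production might
enforce a semantic restriction and force error recovery when the check
fails. Because `p_error()` is not invoked for a manually raised
`SyntaxError`, the rule reports the problem itself:

    def p_expression_divide(p):
        'expression : expression DIVIDE expression'
        # This sketch assumes expression values are numbers and that
        # the lexer tracks line numbers.
        if p[3] == 0:
            print("Division by zero at line", p.lineno(2))
            raise SyntaxError            # force error recovery
        p[0] = p[1] / p[3]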
One important aspect of manually setting an error is that the
`p_error()` function will NOT be called in this case. If you need to
issue an error message, make sure you do it in the production that
raises `SyntaxError`.

Note: This feature of PLY is meant to mimic the behavior of the YYERROR
macro in yacc.

#### When Do Syntax Errors Get Reported?

In most cases, yacc will handle errors as soon as a bad input token is
detected on the input. However, be aware that yacc may choose to delay
error handling until after it has reduced one or more grammar rules
first. This behavior might be unexpected, but it's related to special
states in the underlying parsing table known as "defaulted states." A
defaulted state is a parsing condition in which the same grammar rule
will be reduced regardless of what *valid* token comes next on the
input. For such states, yacc chooses to go ahead and reduce the grammar
rule *without reading the next input token*. If the next token is bad,
yacc will eventually get around to reading it and report a syntax
error. It's just a little unusual in that you might see some of your
grammar rules firing immediately prior to the syntax error.

Usually, the delayed error reporting with defaulted states is harmless
(and there are other reasons for wanting PLY to behave in this way).
However, if you need to turn this behavior off for some reason, you can
clear the defaulted-states table like this:

    parser = yacc.yacc()
    parser.defaulted_states = {}

Disabling defaulted states is not recommended if your grammar makes use
of embedded actions as described in the Embedded Actions section.

#### General comments on error handling

For normal types of languages, error recovery with error rules and
resynchronization characters is probably the most reliable technique.
This is because you can instrument the grammar to catch errors at
selected places where it is relatively easy to recover and continue
parsing. Panic mode recovery is really only useful in certain
specialized applications where you might want to discard huge portions
of the input text to find a valid restart point.

### Line Number and Position Tracking

Position tracking is often a tricky problem when writing compilers. By
default, PLY tracks the line number and position of all tokens. This
information is available using the following functions:

`p.lineno(num)`. Return the line number for symbol *num*.

`p.lexpos(num)`. Return the lexing position for symbol *num*.

For example:

    def p_expression(p):
        'expression : expression PLUS expression'
        line = p.lineno(2)        # line number of the PLUS token
        index = p.lexpos(2)       # Position of the PLUS token

As an optional feature, `yacc.py` can automatically track line numbers
and positions for all of the grammar symbols as well. However, this
extra tracking requires extra processing and can significantly slow
down parsing. Therefore, it must be enabled by passing the
`tracking=True` option to `yacc.parse()`. For example:

    yacc.parse(data, tracking=True)

Once enabled, the `lineno()` and `lexpos()` methods work for all
grammar symbols. Two additional methods can also be used:

`p.linespan(num)`. Return a tuple (startline, endline) with the
starting and ending line number for symbol *num*.

`p.lexspan(num)`. Return a tuple (start, end) with the starting and
ending positions for symbol *num*.
For example:

    def p_expression(p):
        'expression : expression PLUS expression'
        p.lineno(1)        # Line number of the left expression
        p.lineno(2)        # line number of the PLUS operator
        p.lineno(3)        # line number of the right expression
        ...
        start, end = p.linespan(3)    # Start,end lines of the right expression
        starti, endi = p.lexspan(3)   # Start,end positions of right expression

Note: The `lexspan()` function only returns the range of values up to
the start of the last grammar symbol.

Although it may be convenient for PLY to track position information on
all grammar symbols, this is often unnecessary. For example, if you are
merely using line number information in an error message, you can often
just key off of a specific token in the grammar rule. For example:

    def p_bad_func(p):
        'funccall : fname LPAREN error RPAREN'
        # Line number reported from LPAREN token
        print("Bad function call at line", p.lineno(2))

Similarly, you may get better parsing performance if you only
selectively propagate line number information where it's needed using
the `p.set_lineno()` method. For example:

    def p_fname(p):
        'fname : ID'
        p[0] = p[1]
        p.set_lineno(0, p.lineno(1))

PLY doesn't retain line number information from rules that have already
been parsed. If you are building an abstract syntax tree and need to
have line numbers, you should make sure that the line numbers appear in
the tree itself.

### AST Construction

`yacc.py` provides no special functions for constructing an abstract
syntax tree. However, such construction is easy enough to do on your
own.

A minimal way to construct a tree is to create and propagate a tuple or
list in each grammar rule function. There are many possible ways to do
this, but one example would be something like this:

    def p_expression_binop(p):
        '''expression : expression PLUS expression
                      | expression MINUS expression
                      | expression TIMES expression
                      | expression DIVIDE expression'''

        p[0] = ('binary-expression', p[2], p[1], p[3])

    def p_expression_group(p):
        'expression : LPAREN expression RPAREN'
        p[0] = ('group-expression', p[2])

    def p_expression_number(p):
        'expression : NUMBER'
        p[0] = ('number-expression', p[1])

Another approach is to create a set of data structures for the
different kinds of abstract syntax tree nodes and assign nodes to
`p[0]` in each rule. For example:

    class Expr: pass

    class BinOp(Expr):
        def __init__(self, left, op, right):
            self.left = left
            self.right = right
            self.op = op

    class Number(Expr):
        def __init__(self, value):
            self.value = value

    def p_expression_binop(p):
        '''expression : expression PLUS expression
                      | expression MINUS expression
                      | expression TIMES expression
                      | expression DIVIDE expression'''

        p[0] = BinOp(p[1], p[2], p[3])

    def p_expression_group(p):
        'expression : LPAREN expression RPAREN'
        p[0] = p[2]

    def p_expression_number(p):
        'expression : NUMBER'
        p[0] = Number(p[1])

The advantage to this approach is that it may make it easier to attach
more complicated semantics, type checking, code generation, and other
features to the node classes.

To simplify tree traversal, it may make sense to pick a very generic
tree structure for your parse tree nodes. For example:

    class Node:
        def __init__(self, type, children=None, leaf=None):
            self.type = type
            if children:
                self.children = children
            else:
                self.children = []
            self.leaf = leaf

    def p_expression_binop(p):
        '''expression : expression PLUS expression
                      | expression MINUS expression
                      | expression TIMES expression
                      | expression DIVIDE expression'''

        p[0] = Node("binop", [p[1], p[3]], p[2])
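One payoff of this uniform shape is that generic tree walks become
trivial. Here is a small sketch (not from the original text) of a
pre-order dump over the `Node` class above; it assumes only that
children are either `Node` instances or plain token values:

    def dump(node, indent=0):
        # Pre-order walk over the generic Node structure shown above
        pad = ' ' * indent
        if not isinstance(node, Node):
            print(pad + repr(node))     # a plain leaf value
            return
        print(pad + node.type, node.leaf if node.leaf is not None else '')
        for child in node.children:
            dump(child, indent + 4)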
### Embedded Actions

The parsing technique used by yacc only allows actions to be executed
at the end of a rule. For example, suppose you have a rule like this:

    def p_foo(p):
        "foo : A B C D"
        print("Parsed a foo", p[1], p[2], p[3], p[4])

In this case, the supplied action code only executes after all of the
symbols `A`, `B`, `C`, and `D` have been parsed. Sometimes, however, it
is useful to execute small code fragments during intermediate stages of
parsing. For example, suppose you wanted to perform some action
immediately after `A` has been parsed. To do this, write an empty rule
like this:

    def p_foo(p):
        "foo : A seen_A B C D"
        print("Parsed a foo", p[1], p[3], p[4], p[5])
        print("seen_A returned", p[2])

    def p_seen_A(p):
        "seen_A :"
        print("Saw an A = ", p[-1])     # Access grammar symbol to left
        p[0] = some_value               # Assign value to seen_A

In this example, the empty `seen_A` rule executes immediately after `A`
is shifted onto the parsing stack. Within this rule, `p[-1]` refers to
the symbol on the stack that appears immediately to the left of the
`seen_A` symbol. In this case, it would be the value of `A` in the
`foo` rule immediately above. Like other rules, a value can be returned
from an embedded action by assigning it to `p[0]`.

The use of embedded actions can sometimes introduce extra shift/reduce
conflicts. For example, this grammar has no conflicts:

    def p_foo(p):
        """foo : abcd
               | abcx"""

    def p_abcd(p):
        "abcd : A B C D"

    def p_abcx(p):
        "abcx : A B C X"

However, if you insert an embedded action into one of the rules like
this:

    def p_foo(p):
        """foo : abcd
               | abcx"""

    def p_abcd(p):
        "abcd : A B C D"

    def p_abcx(p):
        "abcx : A B seen_AB C X"

    def p_seen_AB(p):
        "seen_AB :"

an extra shift-reduce conflict will be introduced. This conflict is
caused by the fact that the same symbol `C` appears next in both the
`abcd` and `abcx` rules. The parser can either shift the symbol (`abcd`
rule) or reduce the empty rule `seen_AB` (`abcx` rule).

A common use of embedded rules is to control other aspects of parsing
such as scoping of local variables. For example, if you were parsing C
code, you might write code like this:

    def p_statements_block(p):
        "statements : LBRACE new_scope statements RBRACE"
        # Action code
        ...
        pop_scope()        # Return to previous scope

    def p_new_scope(p):
        "new_scope :"
        # Create a new scope for local variables
        s = new_scope()
        push_scope(s)
        ...

In this case, the embedded action `new_scope` executes immediately
after a `LBRACE` (`{`) symbol is parsed. This might adjust internal
symbol tables and other aspects of the parser. Upon completion of the
`statements` rule, code might undo the operations performed in the
embedded action (e.g., `pop_scope()`).

### Miscellaneous Yacc Notes

1.  By default, `yacc.py` relies on `lex.py` for tokenizing. However,
    an alternative tokenizer can be supplied as follows:

        result = yacc.parse(lexer=x)

    In this case, `x` must be a Lexer object that minimally has an
    `x.token()` method for retrieving the next token. If an input
    string is given to `yacc.parse()`, the lexer must also have an
    `x.input()` method. A sketch of such an object appears after this
    list.

2.  To print copious amounts of debugging during parsing, use:

        parser.parse(input_text, debug=True)

3.  Since LR parsing is driven by tables, the performance of the
    parser is largely independent of the size of the grammar. The
    biggest bottlenecks will be the lexer and the complexity of the
    code in your grammar rules.

4.  `yacc()` also allows parsers to be defined as classes and as
    closures (see the section on alternative specification of lexers).
    However, be aware that only one parser may be defined in a single
    module (source file). There are various error checks and
    validation steps that may issue confusing error messages if you
    try to define multiple parsers in the same source file.
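To make note 1 concrete, here is a hedged sketch (not from the original
documentation) of a minimal hand-rolled token source. The `Tok` and
`ListLexer` names are invented; the essential requirement is a
`token()` method returning objects with `type`, `value`, `lineno`, and
`lexpos` attributes, and `None` at end of input:

    class Tok:
        # A minimal stand-in for lex.LexToken
        def __init__(self, type, value, lineno=0, lexpos=0):
            self.type = type
            self.value = value
            self.lineno = lineno
            self.lexpos = lexpos

    class ListLexer:
        # Serves tokens from a pre-built list.  It has no input()
        # method, so parse() must be called without an input string.
        def __init__(self, toks):
            self._iter = iter(toks)

        def token(self):
            return next(self._iter, None)

    toks = [Tok('NUMBER', 1), Tok('PLUS', '+'), Tok('NUMBER', 2)]
    result = parser.parse(lexer=ListLexer(toks))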
## Multiple Parsers and Lexers

In advanced parsing applications, you may want to have multiple parsers
and lexers.

As a general rule, this isn't a problem. However, to make it work, you
need to carefully make sure everything gets hooked up correctly. First,
make sure you save the objects returned by `lex()` and `yacc()`. For
example:

    lexer = lex.lex()       # Return lexer object
    parser = yacc.yacc()    # Return parser object

Next, when parsing, make sure you give the `parse()` function a
reference to the lexer it should be using. For example:

    parser.parse(text, lexer=lexer)

If you forget to do this, the parser will use the last lexer created,
which is not always what you want.

Within lexer and parser rule functions, these objects are also
available. In the lexer, the `lexer` attribute of a token refers to the
lexer object that triggered the rule. For example:

    def t_NUMBER(t):
        r'\d+'
        ...
        print(t.lexer)          # Show lexer object

In the parser, the `lexer` and `parser` attributes refer to the lexer
and parser objects respectively:

    def p_expr_plus(p):
        'expr : expr PLUS expr'
        ...
        print(p.parser)         # Show parser object
        print(p.lexer)          # Show lexer object

If necessary, arbitrary attributes can be attached to the lexer or
parser object. For example, if you wanted to have different parsing
modes, you could attach a mode attribute to the parser object and look
at it later, as sketched below.
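For instance, here is a hedged sketch; the `mode` attribute and the
rule are hypothetical and not part of any PLY API:

    def p_statement_expr(p):
        'statement : expression'
        # Consult an application-defined flag attached to the parser
        if p.parser.mode == 'interactive':
            print(p[1])
        p[0] = p[1]

    parser = yacc.yacc()
    parser.mode = 'interactive'    # arbitrary attribute; PLY ignores it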
## Advanced Debugging

Debugging a compiler is typically not an easy task. PLY provides some
diagnostic capabilities through the use of Python's `logging` module.
The next two sections describe this:

### Debugging the lex() and yacc() commands

Both the `lex()` and `yacc()` commands have a debugging mode that can
be enabled using the `debug` flag. For example:

    lex.lex(debug=True)
    yacc.yacc(debug=True)

Normally, the output produced by debugging is routed to either standard
error or, in the case of `yacc()`, to a file `parser.out`. This output
can be more carefully controlled by supplying a logging object. Here is
an example that adds information about where different debugging
messages are coming from:

    # Set up a logging object
    import logging
    logging.basicConfig(
        level = logging.DEBUG,
        filename = "parselog.txt",
        filemode = "w",
        format = "%(filename)10s:%(lineno)4d:%(message)s"
    )
    log = logging.getLogger()

    lex.lex(debug=True, debuglog=log)
    yacc.yacc(debug=True, debuglog=log)

If you supply a custom logger, the amount of debugging information
produced can be controlled by setting the logging level. Typically,
debugging messages are either issued at the `DEBUG`, `INFO`, or
`WARNING` levels.

PLY's error messages and warnings are also produced using the logging
interface. This can be controlled by passing a logging object using the
`errorlog` parameter:

    lex.lex(errorlog=log)
    yacc.yacc(errorlog=log)

If you want to completely silence warnings, you can either pass in a
logging object with an appropriate filter level or use the `NullLogger`
object defined in either `lex` or `yacc`. For example:

    yacc.yacc(errorlog=yacc.NullLogger())

### Run-time Debugging

To enable run-time debugging of a parser, use the `debug` option to
`parse()`. This option can either be an integer (which turns debugging
on or off) or an instance of a logger object. For example:

    log = logging.getLogger()
    parser.parse(input, debug=log)

If a logging object is passed, you can use its filtering level to
control how much output gets generated. The `INFO` level is used to
produce information about rule reductions. The `DEBUG` level will show
information about the parsing stack, token shifts, and other details.
The `ERROR` level shows information related to parsing errors.

For very complicated problems, you should pass in a logging object that
redirects to a file where you can more easily inspect the output after
execution.

## Using Python -OO Mode

Because of PLY's reliance on docstrings, it is not compatible with the
`-OO` mode of the interpreter (which strips docstrings). If you want to
support this, you'll need to write a decorator or some other tool to
attach docstrings to functions. For example:

    def _(doc):
        def decorate(func):
            func.__doc__ = doc
            return func
        return decorate

    @_("assignment : expr PLUS expr")
    def p_assignment(p):
        ...

PLY does not provide such a decorator by default.

## Where to go from here?

The `examples` directory of the PLY distribution contains several
simple examples. Please consult a compilers textbook for the theory and
underlying implementation details of LR parsing. diff --git a/docs/Makefile b/docs/Makefile deleted file mode 100644 index 91883c3..0000000 --- a/docs/Makefile +++ /dev/null @@ -1,192 +0,0 @@ -# Makefile for Sphinx documentation -# - -# You can set these variables from the command line. -SPHINXOPTS = -SPHINXBUILD = sphinx-build -PAPER = -BUILDDIR = _build - -# User-friendly check for sphinx-build -ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) -$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) -endif - -# Internal variables. -PAPEROPT_a4 = -D latex_paper_size=a4 -PAPEROPT_letter = -D latex_paper_size=letter -ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . -# the i18n builder cannot share the environment and doctrees with the others -I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
- -.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext - -help: - @echo "Please use \`make <target>' where <target> is one of" - @echo " html to make standalone HTML files" - @echo " dirhtml to make HTML files named index.html in directories" - @echo " singlehtml to make a single large HTML file" - @echo " pickle to make pickle files" - @echo " json to make JSON files" - @echo " htmlhelp to make HTML files and a HTML help project" - @echo " qthelp to make HTML files and a qthelp project" - @echo " applehelp to make an Apple Help Book" - @echo " devhelp to make HTML files and a Devhelp project" - @echo " epub to make an epub" - @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" - @echo " latexpdf to make LaTeX files and run them through pdflatex" - @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" - @echo " text to make text files" - @echo " man to make manual pages" - @echo " texinfo to make Texinfo files" - @echo " info to make Texinfo files and run them through makeinfo" - @echo " gettext to make PO message catalogs" - @echo " changes to make an overview of all changed/added/deprecated items" - @echo " xml to make Docutils-native XML files" - @echo " pseudoxml to make pseudoxml-XML files for display purposes" - @echo " linkcheck to check all external links for integrity" - @echo " doctest to run all doctests embedded in the documentation (if enabled)" - @echo " coverage to run coverage check of the documentation (if enabled)" - -clean: - rm -rf $(BUILDDIR)/* - -html: - $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html - @echo - @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." - -dirhtml: - $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml - @echo - @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." - -singlehtml: - $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml - @echo - @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." - -pickle: - $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle - @echo - @echo "Build finished; now you can process the pickle files." - -json: - $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json - @echo - @echo "Build finished; now you can process the JSON files." - -htmlhelp: - $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp - @echo - @echo "Build finished; now you can run HTML Help Workshop with the" \ - ".hhp project file in $(BUILDDIR)/htmlhelp." - -qthelp: - $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp - @echo - @echo "Build finished; now you can run "qcollectiongenerator" with the" \ - ".qhcp project file in $(BUILDDIR)/qthelp, like this:" - @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/sly.qhcp" - @echo "To view the help file:" - @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/sly.qhc" - -applehelp: - $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp - @echo - @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." - @echo "N.B. You won't be able to view it unless you put it in" \ - "~/Library/Documentation/Help or install it in your application" \ - "bundle." - -devhelp: - $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp - @echo - @echo "Build finished." 
- @echo "To view the help file:" - @echo "# mkdir -p $$HOME/.local/share/devhelp/sly" - @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/sly" - @echo "# devhelp" - -epub: - $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub - @echo - @echo "Build finished. The epub file is in $(BUILDDIR)/epub." - -latex: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo - @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." - @echo "Run \`make' in that directory to run these through (pdf)latex" \ - "(use \`make latexpdf' here to do that automatically)." - -latexpdf: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo "Running LaTeX files through pdflatex..." - $(MAKE) -C $(BUILDDIR)/latex all-pdf - @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." - -latexpdfja: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo "Running LaTeX files through platex and dvipdfmx..." - $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja - @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." - -text: - $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text - @echo - @echo "Build finished. The text files are in $(BUILDDIR)/text." - -man: - $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man - @echo - @echo "Build finished. The manual pages are in $(BUILDDIR)/man." - -texinfo: - $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo - @echo - @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." - @echo "Run \`make' in that directory to run these through makeinfo" \ - "(use \`make info' here to do that automatically)." - -info: - $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo - @echo "Running Texinfo files through makeinfo..." - make -C $(BUILDDIR)/texinfo info - @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." - -gettext: - $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale - @echo - @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." - -changes: - $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes - @echo - @echo "The overview file is in $(BUILDDIR)/changes." - -linkcheck: - $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck - @echo - @echo "Link check complete; look for any errors in the above output " \ - "or in $(BUILDDIR)/linkcheck/output.txt." - -doctest: - $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest - @echo "Testing of doctests in the sources finished, look at the " \ - "results in $(BUILDDIR)/doctest/output.txt." - -coverage: - $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage - @echo "Testing of coverage in the sources finished, look at the " \ - "results in $(BUILDDIR)/coverage/python.txt." - -xml: - $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml - @echo - @echo "Build finished. The XML files are in $(BUILDDIR)/xml." - -pseudoxml: - $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml - @echo - @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." diff --git a/docs/conf.py b/docs/conf.py deleted file mode 100644 index abfb05c..0000000 --- a/docs/conf.py +++ /dev/null @@ -1,284 +0,0 @@ -# -*- coding: utf-8 -*- -# -# ply documentation build configuration file, created by -# sphinx-quickstart on Wed Sep 7 13:23:26 2016. -# -# This file is execfile()d with the current directory set to its -# containing dir. -# -# Note that not all possible configuration values are present in this -# autogenerated file. 
-# -# All configuration values have a default; values that are commented out -# serve to show the default. - -import sys -import os -import shlex - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -#sys.path.insert(0, os.path.abspath('.')) - -# -- General configuration ------------------------------------------------ - -# If your documentation needs a minimal Sphinx version, state it here. -#needs_sphinx = '1.0' - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = [] - -# Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] - -# The suffix(es) of source filenames. -# You can specify multiple suffix as a list of string: -# source_suffix = ['.rst', '.md'] -source_suffix = '.rst' - -# The encoding of source files. -#source_encoding = 'utf-8-sig' - -# The master toctree document. -master_doc = 'index' - -# General information about the project. -project = u'ply' -copyright = u'2001-2020, David Beazley' -author = u'David Beazley' - -# The version info for the project you're documenting, acts as replacement for -# |version| and |release|, also used in various other places throughout the -# built documents. -# -# The short X.Y version. -version = '4.0' -# The full version, including alpha/beta/rc tags. -release = '4.0' - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -# -# This is also used if you do content translation via gettext catalogs. -# Usually you set "language" from the command line for these cases. -language = None - -# There are two options for replacing |today|: either, you set today to some -# non-false value, then it is used: -#today = '' -# Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -exclude_patterns = ['_build'] - -# The reST default role (used for this markup: `text`) to use for all -# documents. -#default_role = None - -# If true, '()' will be appended to :func: etc. cross-reference text. -#add_function_parentheses = True - -# If true, the current module name will be prepended to all description -# unit titles (such as .. function::). -#add_module_names = True - -# If true, sectionauthor and moduleauthor directives will be shown in the -# output. They are ignored by default. -#show_authors = False - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' - -# A list of ignored prefixes for module index sorting. -#modindex_common_prefix = [] - -# If true, keep warnings as "system message" paragraphs in the built documents. -#keep_warnings = False - -# If true, `todo` and `todoList` produce output, else they produce nothing. -todo_include_todos = False - - -# -- Options for HTML output ---------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -html_theme = 'alabaster' - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. 
-#html_theme_options = {} - -# Add any paths that contain custom themes here, relative to this directory. -#html_theme_path = [] - -# The name for this set of Sphinx documents. If None, it defaults to -# "<project> v<release> documentation". -#html_title = None - -# A shorter title for the navigation bar. Default is the same as html_title. -#html_short_title = None - -# The name of an image file (relative to this directory) to place at the top -# of the sidebar. -#html_logo = None - -# The name of an image file (within the static path) to use as favicon of the -# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 -# pixels large. -#html_favicon = None - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] - -# Add any extra paths that contain custom files (such as robots.txt or -# .htaccess) here, relative to this directory. These files are copied -# directly to the root of the documentation. -#html_extra_path = [] - -# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, -# using the given strftime format. -#html_last_updated_fmt = '%b %d, %Y' - -# If true, SmartyPants will be used to convert quotes and dashes to -# typographically correct entities. -#html_use_smartypants = True - -# Custom sidebar templates, maps document names to template names. -#html_sidebars = {} - -# Additional templates that should be rendered to pages, maps page names to -# template names. -#html_additional_pages = {} - -# If false, no module index is generated. -#html_domain_indices = True - -# If false, no index is generated. -#html_use_index = True - -# If true, the index is split into individual pages for each letter. -#html_split_index = False - -# If true, links to the reST sources are added to the pages. -#html_show_sourcelink = True - -# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. -#html_show_sphinx = True - -# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. -#html_show_copyright = True - -# If true, an OpenSearch description file will be output, and all pages will -# contain a <link> tag referring to it. The value of this option must be the -# base URL from which the finished HTML is served. -#html_use_opensearch = '' - -# This is the file name suffix for HTML files (e.g. ".xhtml"). -#html_file_suffix = None - -# Language to be used for generating the HTML full-text search index. -# Sphinx supports the following languages: -# 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' -# 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr' -#html_search_language = 'en' - -# A dictionary with options for the search language support, empty by default. -# Now only 'ja' uses this config value -#html_search_options = {'type': 'default'} - -# The name of a javascript file (relative to the configuration directory) that -# implements a search results scorer. If empty, the default will be used. -#html_search_scorer = 'scorer.js' - -# Output file base name for HTML help builder. -htmlhelp_basename = 'plydoc' - -# -- Options for LaTeX output --------------------------------------------- - -latex_elements = { -# The paper size ('letterpaper' or 'a4paper'). -#'papersize': 'letterpaper', - -# The font size ('10pt', '11pt' or '12pt'). -#'pointsize': '10pt', - -# Additional stuff for the LaTeX preamble. 
-#'preamble': '', - -# Latex figure (float) alignment -#'figure_align': 'htbp', -} - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, -# author, documentclass [howto, manual, or own class]). -latex_documents = [ - (master_doc, 'ply.tex', u'Ply Documentation', - u'David Beazley', 'manual'), -] - -# The name of an image file (relative to this directory) to place at the top of -# the title page. -#latex_logo = None - -# For "manual" documents, if this is true, then toplevel headings are parts, -# not chapters. -#latex_use_parts = False - -# If true, show page references after internal links. -#latex_show_pagerefs = False - -# If true, show URL addresses after external links. -#latex_show_urls = False - -# Documents to append as an appendix to all manuals. -#latex_appendices = [] - -# If false, no module index is generated. -#latex_domain_indices = True - - -# -- Options for manual page output --------------------------------------- - -# One entry per manual page. List of tuples -# (source start file, name, description, authors, manual section). -man_pages = [ - (master_doc, 'ply', u'Ply Documentation', - [author], 1) -] - -# If true, show URL addresses after external links. -#man_show_urls = False - - -# -- Options for Texinfo output ------------------------------------------- - -# Grouping the document tree into Texinfo files. List of tuples -# (source start file, target name, title, author, -# dir menu entry, description, category) -texinfo_documents = [ - (master_doc, 'ply', u'Ply Documentation', - author, 'ply', 'Python Lex-Yacc.', - 'Miscellaneous'), -] - -# Documents to append as an appendix to all manuals. -#texinfo_appendices = [] - -# If false, no module index is generated. -#texinfo_domain_indices = True - -# How to display URL addresses: 'footnote', 'no', or 'inline'. -#texinfo_show_urls = 'footnote' - -# If true, do not generate a @detailmenu in the "Top" node's menu. -#texinfo_no_detailmenu = False diff --git a/docs/index.rst b/docs/index.rst deleted file mode 100644 index da22efd..0000000 --- a/docs/index.rst +++ /dev/null @@ -1,58 +0,0 @@ -PLY (Python Lex-Yacc) -===================== - -Requirements ------------- - -PLY requires the use of Python 3.6 or greater. Older versions -of Python are not supported. - -Overview --------- - -PLY is a 100% Python implementation of the lex and yacc tools -commonly used to write parsers and compilers. Parsing is -based on the same LALR(1) algorithm used by many yacc tools. -Here are a few notable features: - - - PLY provides *very* extensive error reporting and diagnostic - information to assist in parser construction. The original - implementation was developed for instructional purposes. As - a result, the system tries to identify the most common types - of errors made by novice users. - - - PLY provides full support for empty productions, error recovery, - precedence specifiers, and moderately ambiguous grammars. - - - PLY can be used to build parsers for "real" programming languages. - Although it is not ultra-fast due to its Python implementation, - PLY can be used to parse grammars consisting of several hundred - rules (as might be found for a language like C). - -More Documentation -================== - -Contents: - -.. toctree:: - :maxdepth: 3 - - ply - internals - -Resources -========= - -For a detailed overview of parsing theory, consult the excellent -book "Compilers : Principles, Techniques, and Tools" by Aho, Sethi, and -Ullman. 
The topics found in "Lex & Yacc" by Levine, Mason, and Brown -may also be useful. - -The GitHub page for PLY can be found at: - - https://github.com/dabeaz/ply - -Please direct bug reports and pull requests to the GitHub page. -To contact me directly, send email to dave@dabeaz.com or contact -me on Twitter (@dabeaz). - diff --git a/docs/internals.rst b/docs/internals.rst deleted file mode 100644 index 2e0de17..0000000 --- a/docs/internals.rst +++ /dev/null @@ -1,530 +0,0 @@ -PLY Internals -============= - -1. Introduction ---------------- - -This document describes classes and functions that make up the internal -operation of PLY. Using this programming interface, it is possible to -manually build a parser using a different interface specification -than what PLY normally uses. For example, you could build a grammar -from information parsed in a completely different input format. Some of -these objects may be useful for building more advanced parsing engines -such as GLR. - -It should be stressed that using PLY at this level is not for the -faint of heart. Generally, it's assumed that you know a bit of -the underlying compiler theory and how an LR parser is put together. - -2. Grammar Class ---------------- - -The file ``ply.yacc`` defines a class ``Grammar`` that -is used to hold and manipulate information about a grammar -specification. It encapsulates the same basic information -about a grammar that is put into a YACC file including -the list of tokens, precedence rules, and grammar rules. -Various operations are provided to perform different validations -on the grammar. In addition, there are operations to compute -the first and follow sets that are needed by the various table -generation algorithms. - - -``Grammar(terminals)`` - Creates a new grammar object. ``terminals`` is a list of strings - specifying the terminals for the grammar. An instance ``g`` of - ``Grammar`` has the following methods: - - -``g.set_precedence(term,assoc,level)`` - Sets the precedence level and associativity for a given terminal ``term``. - ``assoc`` is one of ``'right'``, - ``'left'``, or ``'nonassoc'`` and ``level`` is a positive integer. The higher - the value of ``level``, the higher the precedence. Here is an example of typical - precedence settings:: - - g.set_precedence('PLUS', 'left',1) - g.set_precedence('MINUS', 'left',1) - g.set_precedence('TIMES', 'left',2) - g.set_precedence('DIVIDE','left',2) - g.set_precedence('UMINUS','left',3) - - This method must be called prior to adding any productions to the - grammar with ``g.add_production()``. The precedence of individual grammar - rules is determined by the precedence of the right-most terminal. - - -``g.add_production(name,syms,func=None,file='',line=0)`` - Adds a new grammar rule. ``name`` is the name of the rule, - ``syms`` is a list of symbols making up the right hand - side of the rule, ``func`` is the function to call when - reducing the rule. ``file`` and ``line`` specify - the filename and line number of the rule and are used for - generating error messages. - - The list of symbols in ``syms`` may include character - literals and ``%prec`` specifiers. Here are some - examples:: - - g.add_production('expr',['expr','PLUS','term'],func,file,line) - g.add_production('expr',['expr','"+"','term'],func,file,line) - g.add_production('expr',['MINUS','expr','%prec','UMINUS'],func,file,line) - - If any kind of error is detected, a ``GrammarError`` exception - is raised with a message indicating the reason for the failure.
- - -``g.set_start(start=None)`` - Sets the starting rule for the grammar. ``start`` is a string - specifying the name of the start rule. If ``start`` is omitted, - the first grammar rule added with ``add_production()`` is taken to be - the starting rule. This method must always be called after all - productions have been added. - -``g.find_unreachable()`` - Diagnostic function. Returns a list of all unreachable non-terminals - defined in the grammar. This is used to identify inactive parts of - the grammar specification. - -``g.infinite_cycle()`` - Diagnostic function. Returns a list of all non-terminals in the - grammar that result in an infinite cycle. This condition occurs if - there is no way for a grammar rule to expand to a string containing - only terminal symbols. - -``g.undefined_symbols()`` - Diagnostic function. Returns a list of tuples ``(name, prod)`` - corresponding to undefined symbols in the grammar. ``name`` is the - name of the undefined symbol and ``prod`` is an instance of - ``Production`` which has information about the production rule - where the undefined symbol was used. - -``g.unused_terminals()`` - Diagnostic function. Returns a list of terminals that were defined, - but never used in the grammar. - -``g.unused_rules()`` - Diagnostic function. Returns a list of ``Production`` instances - corresponding to production rules that were defined in the grammar, - but never used anywhere. This is slightly different - from ``find_unreachable()``. - -``g.unused_precedence()`` - Diagnostic function. Returns a list of tuples ``(term, assoc)`` - corresponding to precedence rules that were set, but never used in the - grammar. ``term`` is the terminal name and ``assoc`` is the - precedence associativity (e.g., ``'left'``, ``'right'``, - or ``'nonassoc'``). - -``g.compute_first()`` - Compute all of the first sets for all symbols in the grammar. Returns a dictionary - mapping symbol names to a list of all first symbols. - -``g.compute_follow()`` - Compute all of the follow sets for all non-terminals in the grammar. - The follow set is the set of all possible symbols that might follow a - given non-terminal. Returns a dictionary mapping non-terminal names - to a list of symbols. - -``g.build_lritems()`` - Calculates all of the LR items for all productions in the grammar. This - step is required before using the grammar for any kind of table generation. - See the section on LR items below. - -The following attributes are set by the above methods and may be useful -in code that works with the grammar. All of these attributes should be -assumed to be read-only. Changing their values directly will likely -break the grammar. - -``g.Productions`` - A list of all productions added. The first entry is reserved for - a production representing the starting rule. The objects in this list - are instances of the ``Production`` class, described shortly. - -``g.Prodnames`` - A dictionary mapping the names of nonterminals to a list of all - productions of that nonterminal. - -``g.Terminals`` - A dictionary mapping the names of terminals to a list of the - production numbers where they are used. - -``g.Nonterminals`` - A dictionary mapping the names of nonterminals to a list of the - production numbers where they are used. - -``g.First`` - A dictionary representing the first sets for all grammar symbols. This is - computed and returned by the ``compute_first()`` method. - -``g.Follow`` - A dictionary representing the follow sets for all grammar rules.
This is - computed and returned by the ``compute_follow()`` method. - -``g.Start`` - Starting symbol for the grammar. Set by the ``set_start()`` method. - -For the purposes of debugging, a ``Grammar`` object supports the ``__len__()`` and -``__getitem__()`` special methods. Accessing ``g[n]`` returns the nth production -from the grammar. - -3. Productions --------------- - -``Grammar`` objects store grammar rules as instances of a ``Production`` class. This -class has no public constructor--you should only create productions by calling ``Grammar.add_production()``. -The following attributes are available on a ``Production`` instance ``p``. - -``p.name`` - The name of the production. For a grammar rule such as ``A : B C D``, this is ``'A'``. - -``p.prod`` - A tuple of symbols making up the right-hand side of the production. For a grammar rule such as ``A : B C D``, this is ``('B','C','D')``. - -``p.number`` - Production number. An integer containing the index of the production in the grammar's ``Productions`` list. - -``p.func`` - The name of the reduction function associated with the production. - This is the function that will execute when reducing the entire - grammar rule during parsing. - -``p.callable`` - The callable object associated with the name in ``p.func``. This is ``None`` - unless the production has been bound using ``bind()``. - -``p.file`` - Filename associated with the production. Typically this is the file where the production was defined. Used for error messages. - -``p.lineno`` - Line number associated with the production. Typically this is the line number in ``p.file`` where the production was defined. Used for error messages. - -``p.prec`` - Precedence and associativity associated with the production. This is a tuple ``(assoc,level)`` where - ``assoc`` is one of ``'left'``,``'right'``, or ``'nonassoc'`` and ``level`` is - an integer. This value is determined by the precedence of the right-most terminal symbol in the production - or by use of the ``%prec`` specifier when adding the production. - -``p.usyms`` - A list of all unique symbols found in the production. - -``p.lr_items`` - A list of all LR items for this production. This attribute only has a meaningful value if the - ``Grammar.build_lritems()`` method has been called. The items in this list are - instances of ``LRItem`` described below. - -``p.lr_next`` - The head of a linked-list representation of the LR items in ``p.lr_items``. - This attribute only has a meaningful value if the ``Grammar.build_lritems()`` - method has been called. Each ``LRItem`` instance has a ``lr_next`` attribute - to move to the next item. The list is terminated by ``None``. - -``p.bind(dict)`` - Binds the production function name in ``p.func`` to a callable object in - ``dict``. This operation is typically carried out in the last step - prior to running the parsing engine and is needed since parsing tables are typically - read from files which only include the function names, not the functions themselves. - -``Production`` objects support -the ``__len__()``, ``__getitem__()``, and ``__str__()`` -special methods. -``len(p)`` returns the number of symbols in ``p.prod`` -and ``p[n]`` is the same as ``p.prod[n]``. - -4. LRItems ----------- - -The construction of parsing tables in an LR-based parser generator is primarily -done over a set of "LR Items". An LR item represents a stage of parsing one -of the grammar rules. To compute the LR items, it is first necessary to -call ``Grammar.build_lritems()``. 
Once this step is complete, all of the productions -in the grammar will have their LR items attached to them. - -Here is an interactive example that shows what LR items look like if you -experiment. In this example, ``g`` is a ``Grammar`` -object:: - - >>> g.build_lritems() - >>> p = g[1] - >>> p - Production(statement -> ID = expr) - >>> - -In the above code, ``p`` represents the first grammar rule. In -this case, a rule ``'statement -> ID = expr'``. - -Now, let's look at the LR items for ``p``:: - - >>> p.lr_items - [LRItem(statement -> . ID = expr), - LRItem(statement -> ID . = expr), - LRItem(statement -> ID = . expr), - LRItem(statement -> ID = expr .)] - >>> - -In each LR item, the dot (.) represents a specific stage of parsing. In each LR item, the dot -is advanced by one symbol. It is only when the dot reaches the very end that a production -is successfully parsed. - -An instance ``lr`` of ``LRItem`` has the following -attributes that hold information related to that specific stage of -parsing. - -``lr.name`` - The name of the grammar rule. For example, ``'statement'`` in the above example. - -``lr.prod`` - A tuple of symbols representing the right-hand side of the production, including the - special ``'.'`` character. For example, ``('ID','.','=','expr')``. - -``lr.number`` - An integer representing the production number in the grammar. - -``lr.usyms`` - A set of unique symbols in the production. Inherited from the original ``Production`` instance. - -``lr.lr_index`` - An integer representing the position of the dot (.). You should never use ``lr.prod.index()`` - to search for it--the result will be wrong if the grammar happens to also use (.) as a character - literal. - -``lr.lr_after`` - A list of all productions that can legally appear immediately to the right of the - dot (.). This list contains ``Production`` instances. This attribute - represents all of the possible branches a parse can take from the current position. - For example, suppose that ``lr`` represents a stage immediately before - an expression like this:: - - >>> lr - LRItem(statement -> ID = . expr) - >>> - - Then, the value of ``lr.lr_after`` might look like this, showing all productions that - can legally appear next:: - - >>> lr.lr_after - [Production(expr -> expr PLUS expr), - Production(expr -> expr MINUS expr), - Production(expr -> expr TIMES expr), - Production(expr -> expr DIVIDE expr), - Production(expr -> MINUS expr), - Production(expr -> LPAREN expr RPAREN), - Production(expr -> NUMBER), - Production(expr -> ID)] - >>> - -``lr.lr_before`` - The grammar symbol that appears immediately before the dot (.) or ``None`` if - at the beginning of the parse. - -``lr.lr_next`` - A link to the next LR item, representing the next stage of the parse. ``None`` if ``lr`` - is the last LR item. - -``LRItem`` instances also support the ``__len__()`` and ``__getitem__()`` special methods. -``len(lr)`` returns the number of items in ``lr.prod`` including the dot (.). ``lr[n]`` -returns ``lr.prod[n]``. - -It goes without saying that all of the attributes associated with LR -items should be assumed to be read-only. Modifications will very -likely create a small black-hole that will consume you and your code. - -5. LRTable ----------- - -The ``LRTable`` class represents constructed LR parsing tables on a -grammar. - -``LRTable(grammar, log=None)`` - Create the LR parsing tables on a grammar. ``grammar`` is an instance of ``Grammar`` and - ``log`` is a logger object used to write debugging information.
The debugging information - written to ``log`` is the same as what appears in the ``parser.out`` file created - by yacc. By supplying a custom logger with a different message format, it is possible to get - more information (e.g., the line number in ``yacc.py`` used for issuing each line of - output in the log). - -An instance ``lr`` of ``LRTable`` has the following attributes. - -``lr.grammar`` - A link to the Grammar object used to construct the parsing tables. - -``lr.lr_method`` - The LR parsing method used (e.g., ``'LALR'``) - -``lr.lr_productions`` - A reference to ``grammar.Productions``. This, together with ``lr_action`` and ``lr_goto`` - contain all of the information needed by the LR parsing engine. - -``lr.lr_action`` - The LR action dictionary that implements the underlying state machine. The keys of this dictionary are - the LR states. - -``lr.lr_goto`` - The LR goto table that contains information about grammar rule reductions. - -``lr.sr_conflicts`` - A list of tuples ``(state,token,resolution)`` identifying all shift/reduce conflicts. ``state`` is the LR state - number where the conflict occurred, ``token`` is the token causing the conflict, and ``resolution`` is - a string describing the resolution taken. ``resolution`` is either ``'shift'`` or ``'reduce'``. - -``lr.rr_conflicts`` - A list of tuples ``(state,rule,rejected)`` identifying all reduce/reduce conflicts. ``state`` is the - LR state number where the conflict occurred, ``rule`` is the production rule that was selected - and ``rejected`` is the production rule that was rejected. Both ``rule`` and ``rejected`` are - instances of ``Production``. They can be inspected to provide the user with more information. - -``lrtab.bind_callables(dict)`` - This binds all of the function names used in productions to callable objects - found in the dictionary ``dict``. During table generation and when reading - LR tables from files, PLY only uses the names of action functions such as ``'p_expr'``, - ``'p_statement'``, etc. In order to actually run the parser, these names - have to be bound to callable objects. This method is always called prior to - running a parser. - -6. LRParser ------------ - -The ``LRParser`` class implements the low-level LR parsing engine. - -``LRParser(lrtab, error_func)`` - Create an LRParser. ``lrtab`` is an instance of ``LRTable`` - containing the LR production and state tables. ``error_func`` is the - error function to invoke in the event of a parsing error. - -An instance ``p`` of ``LRParser`` has the following methods: - -``p.parse(input=None,lexer=None,debug=0,tracking=0)`` - Run the parser. ``input`` is a string, which if supplied is fed into the - lexer using its ``input()`` method. ``lexer`` is an instance of the - ``Lexer`` class to use for tokenizing. If not supplied, the last lexer - created with the ``lex`` module is used. ``debug`` is a boolean flag - that enables debugging. ``tracking`` is a boolean flag that tells the - parser to perform additional line number tracking. - -``p.restart()`` - Resets the parser state for a parse already in progress. - -7. ParserReflect ----------------- - -The ``ParserReflect`` class is used to collect parser specification data -from a Python module or object. This class is what collects all of the -``p_rule()`` functions in a PLY file, performs basic error checking, -and collects all of the needed information to build a grammar. Most of the -high-level PLY interface as used by the ``yacc()`` function is actually -implemented by this class. 
- -``ParserReflect(pdict, log=None)`` - Creates a ``ParserReflect`` instance. ``pdict`` is a dictionary - containing parser specification data. This dictionary typically corresponds - to the module or class dictionary of code that implements a PLY parser. - ``log`` is a logger instance that will be used to report error - messages. - -An instance ``p`` of ``ParserReflect`` has the following methods: - -``p.get_all()`` - Collect and store all required parsing information. - -``p.validate_all()`` - Validate all of the collected parsing information. This is a separate step - from ``p.get_all()``, performed as a performance optimization. To - improve parser start-up time, a parser can elect to only validate the - parsing data when regenerating the parsing tables. The validation - step tries to collect as much information as possible rather than - raising an exception at the first sign of trouble. The attribute - ``p.error`` is set if there are any validation errors. The - value of this attribute is also returned. - -``p.signature()`` - Compute a signature representing the contents of the collected parsing - data. The signature value should change if anything in the parser - specification has changed in a way that would justify parser table - regeneration. This method can be called after ``p.get_all()``, - but before ``p.validate_all()``. - -The following attributes are set in the process of collecting data: - -``p.start`` - The grammar start symbol, if any. Taken from ``pdict['start']``. - -``p.error_func`` - The error handling function or ``None``. Taken from ``pdict['p_error']``. - -``p.tokens`` - The token list. Taken from ``pdict['tokens']``. - -``p.prec`` - The precedence specifier. Taken from ``pdict['precedence']``. - -``p.preclist`` - A parsed version of the precedence specifier. A list of tuples of the form - ``(token,assoc,level)`` where ``token`` is the terminal symbol, - ``assoc`` is the associativity (e.g., ``'left'``) and ``level`` - is a numeric precedence level. - -``p.grammar`` - A list of tuples ``(name, rules)`` representing the grammar rules. ``name`` is the - name of a Python function or method in ``pdict`` that starts with ``"p_"``. - ``rules`` is a list of tuples ``(filename,line,prodname,syms)`` representing - the grammar rules found in the documentation string of that function. ``filename`` and ``line`` contain location - information that can be used for debugging. ``prodname`` is the name of the - production. ``syms`` is the right-hand side of the production. If you have a - function like this:: - - def p_expr(p): - '''expr : expr PLUS expr - | expr MINUS expr - | expr TIMES expr - | expr DIVIDE expr''' - - then the corresponding entry in ``p.grammar`` might look like this:: - - ('p_expr', [ ('calc.py',10,'expr', ['expr','PLUS','expr']), - ('calc.py',11,'expr', ['expr','MINUS','expr']), - ('calc.py',12,'expr', ['expr','TIMES','expr']), - ('calc.py',13,'expr', ['expr','DIVIDE','expr']) - ]) - -``p.pfuncs`` - A sorted list of tuples ``(line, file, name, doc)`` representing all of - the ``p_`` functions found. ``line`` and ``file`` give location - information. ``name`` is the name of the function. ``doc`` is the - documentation string. This list is sorted in ascending order by line number. - -``p.files`` - A dictionary holding all of the source filenames that were encountered - while collecting parser information. Only the keys of this dictionary have - any meaning. - -``p.error`` - An attribute that indicates whether or not any critical errors - occurred in validation.
If this is set, it means that some kind - of problem was detected and that no further processing should be - performed. - -8. High-level operation ----------------------- - -Using all of the above classes requires some attention to detail. The ``yacc()`` -function carries out a very specific sequence of operations to create a grammar. -This same sequence should be emulated if you build an alternative PLY interface -(a sketch of these steps in code appears at the end of the ParserReflect section above). - - -1. A ``ParserReflect`` object is created and raw grammar specification data is -collected. - -2. A ``Grammar`` object is created and populated with information -from the specification data. - -3. A ``LRTable`` object is created to run the LALR algorithm over -the ``Grammar`` object. - -4. Productions in the ``LRTable`` object are bound to callables using the ``bind_callables()`` -method. - -5. A ``LRParser`` object is created from the information in the -``LRTable`` object. - - - diff --git a/docs/make.bat b/docs/make.bat deleted file mode 100644 index 474c9bd..0000000 --- a/docs/make.bat +++ /dev/null @@ -1,263 +0,0 @@ -@ECHO OFF
-
-REM Command file for Sphinx documentation
-
-if "%SPHINXBUILD%" == "" (
- set SPHINXBUILD=sphinx-build
-)
-set BUILDDIR=_build
-set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
-set I18NSPHINXOPTS=%SPHINXOPTS% .
-if NOT "%PAPER%" == "" (
- set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
- set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
-)
-
-if "%1" == "" goto help
-
-if "%1" == "help" (
- :help
- echo.Please use `make ^<target^>` where ^<target^> is one of
- echo. html to make standalone HTML files
- echo. dirhtml to make HTML files named index.html in directories
- echo. singlehtml to make a single large HTML file
- echo. pickle to make pickle files
- echo. json to make JSON files
- echo. htmlhelp to make HTML files and a HTML help project
- echo. qthelp to make HTML files and a qthelp project
- echo. devhelp to make HTML files and a Devhelp project
- echo. epub to make an epub
- echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter
- echo. text to make text files
- echo. man to make manual pages
- echo. texinfo to make Texinfo files
- echo. gettext to make PO message catalogs
- echo. changes to make an overview over all changed/added/deprecated items
- echo. xml to make Docutils-native XML files
- echo. pseudoxml to make pseudoxml-XML files for display purposes
- echo. linkcheck to check all external links for integrity
- echo. doctest to run all doctests embedded in the documentation if enabled
- echo. coverage to run coverage check of the documentation if enabled
- goto end
-)
-
-if "%1" == "clean" (
- for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
- del /q /s %BUILDDIR%\*
- goto end
-)
-
-
-REM Check if sphinx-build is available and fall back to the Python module if not
-%SPHINXBUILD% 2> nul
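-REM (errorlevel 9009 means the command was not found)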
-if errorlevel 9009 goto sphinx_python
-goto sphinx_ok
-
-:sphinx_python
-
-set SPHINXBUILD=python -m sphinx.__init__
-%SPHINXBUILD% 2> nul
-if errorlevel 9009 (
- echo.
- echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
- echo.installed, then set the SPHINXBUILD environment variable to point
- echo.to the full path of the 'sphinx-build' executable. Alternatively you
- echo.may add the Sphinx directory to PATH.
- echo.
- echo.If you don't have Sphinx installed, grab it from
- echo.http://sphinx-doc.org/
- exit /b 1
-)
-
-:sphinx_ok
-
-
-if "%1" == "html" (
- %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
- if errorlevel 1 exit /b 1
- echo.
- echo.Build finished. The HTML pages are in %BUILDDIR%/html.
- goto end
-)
-
-if "%1" == "dirhtml" (
- %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
- if errorlevel 1 exit /b 1
- echo.
- echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
- goto end
-)
-
-if "%1" == "singlehtml" (
- %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
- if errorlevel 1 exit /b 1
- echo.
- echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
- goto end
-)
-
-if "%1" == "pickle" (
- %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
- if errorlevel 1 exit /b 1
- echo.
- echo.Build finished; now you can process the pickle files.
- goto end
-)
-
-if "%1" == "json" (
- %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
- if errorlevel 1 exit /b 1
- echo.
- echo.Build finished; now you can process the JSON files.
- goto end
-)
-
-if "%1" == "htmlhelp" (
- %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
- if errorlevel 1 exit /b 1
- echo.
- echo.Build finished; now you can run HTML Help Workshop with the ^
-.hhp project file in %BUILDDIR%/htmlhelp.
- goto end
-)
-
-if "%1" == "qthelp" (
- %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
- if errorlevel 1 exit /b 1
- echo.
- echo.Build finished; now you can run "qcollectiongenerator" with the ^
-.qhcp project file in %BUILDDIR%/qthelp, like this:
- echo.^> qcollectiongenerator %BUILDDIR%\qthelp\sly.qhcp
- echo.To view the help file:
- echo.^> assistant -collectionFile %BUILDDIR%\qthelp\sly.qhc
- goto end
-)
-
-if "%1" == "devhelp" (
- %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
- if errorlevel 1 exit /b 1
- echo.
- echo.Build finished.
- goto end
-)
-
-if "%1" == "epub" (
- %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
- if errorlevel 1 exit /b 1
- echo.
- echo.Build finished. The epub file is in %BUILDDIR%/epub.
- goto end
-)
-
-if "%1" == "latex" (
- %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
- if errorlevel 1 exit /b 1
- echo.
- echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
- goto end
-)
-
-if "%1" == "latexpdf" (
- %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
- cd %BUILDDIR%/latex
- make all-pdf
- cd %~dp0
- echo.
- echo.Build finished; the PDF files are in %BUILDDIR%/latex.
- goto end
-)
-
-if "%1" == "latexpdfja" (
- %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
- cd %BUILDDIR%/latex
- make all-pdf-ja
- cd %~dp0
- echo.
- echo.Build finished; the PDF files are in %BUILDDIR%/latex.
- goto end
-)
-
-if "%1" == "text" (
- %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
- if errorlevel 1 exit /b 1
- echo.
- echo.Build finished. The text files are in %BUILDDIR%/text.
- goto end
-)
-
-if "%1" == "man" (
- %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
- if errorlevel 1 exit /b 1
- echo.
- echo.Build finished. The manual pages are in %BUILDDIR%/man.
- goto end
-)
-
-if "%1" == "texinfo" (
- %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
- if errorlevel 1 exit /b 1
- echo.
- echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
- goto end
-)
-
-if "%1" == "gettext" (
- %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
- if errorlevel 1 exit /b 1
- echo.
- echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
- goto end
-)
-
-if "%1" == "changes" (
- %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
- if errorlevel 1 exit /b 1
- echo.
- echo.The overview file is in %BUILDDIR%/changes.
- goto end
-)
-
-if "%1" == "linkcheck" (
- %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
- if errorlevel 1 exit /b 1
- echo.
- echo.Link check complete; look for any errors in the above output ^
-or in %BUILDDIR%/linkcheck/output.txt.
- goto end
-)
-
-if "%1" == "doctest" (
- %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
- if errorlevel 1 exit /b 1
- echo.
- echo.Testing of doctests in the sources finished, look at the ^
-results in %BUILDDIR%/doctest/output.txt.
- goto end
-)
-
-if "%1" == "coverage" (
- %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage
- if errorlevel 1 exit /b 1
- echo.
- echo.Testing of coverage in the sources finished, look at the ^
-results in %BUILDDIR%/coverage/python.txt.
- goto end
-)
-
-if "%1" == "xml" (
- %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml
- if errorlevel 1 exit /b 1
- echo.
- echo.Build finished. The XML files are in %BUILDDIR%/xml.
- goto end
-)
-
-if "%1" == "pseudoxml" (
- %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml
- if errorlevel 1 exit /b 1
- echo.
- echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml.
- goto end
-)
-
-:end
diff --git a/docs/ply.rst b/docs/ply.rst deleted file mode 100644 index 600ed6d..0000000 --- a/docs/ply.rst +++ /dev/null @@ -1,2675 +0,0 @@ -PLY (Python Lex-Yacc) -===================== - -This document provides an overview of lexing and parsing with PLY. -Given the intrinsic complexity of parsing, I strongly advise -that you read (or at least skim) this entire document before jumping -into a big development project with PLY. - -PLY-4.0 requires Python 3.6 or newer. If you're using an older version -of Python, you're out of luck. Sorry. - -Introduction ------------- - -PLY is a pure-Python implementation of the compiler -construction tools lex and yacc. The main goal of PLY is to stay -fairly faithful to the way in which traditional lex/yacc tools work. -This includes supporting LALR(1) parsing as well as providing -extensive input validation, error reporting, and diagnostics. Thus, -if you've used yacc in another programming language, it should be -relatively straightforward to use PLY. - -Early versions of PLY were developed to support an Introduction to -Compilers Course I taught in 2001 at the University of Chicago. Since -PLY was primarily developed as an instructional tool, you will find it -to be fairly picky about token and grammar rule specification. In -part, this added formality is meant to catch common programming -mistakes made by novice users. However, advanced users will also find -such features to be useful when building complicated grammars for real -programming languages. It should also be noted that PLY does not -provide much in the way of bells and whistles (e.g., automatic -construction of abstract syntax trees, tree traversal, etc.). Nor -would I consider it to be a parsing framework. Instead, you will find -a bare-bones, yet fully capable lex/yacc implementation written -entirely in Python. - -The rest of this document assumes that you are somewhat familiar with -parsing theory, syntax directed translation, and the use of compiler -construction tools such as lex and yacc in other programming -languages. If you are unfamiliar with these topics, you will probably -want to consult an introductory text such as "Compilers: Principles, -Techniques, and Tools", by Aho, Sethi, and Ullman. O'Reilly's "Lex -and Yacc" by John Levine may also be handy. In fact, the O'Reilly -book can be used as a reference for PLY as the concepts are virtually -identical. - -PLY Overview ------------- - -PLY consists of two separate modules: ``lex.py`` and ``yacc.py``, both -of which are found in a Python package called ``ply``. The ``lex.py`` -module is used to break input text into a collection of tokens -specified by a collection of regular expression rules. ``yacc.py`` is -used to recognize language syntax that has been specified in the form -of a context-free grammar. - -The two tools are meant to work together. Specifically, ``lex.py`` -provides an interface to produce tokens. ``yacc.py`` uses this interface to -retrieve tokens and invoke grammar rules. The output of ``yacc.py`` -is often an Abstract Syntax Tree (AST). However, this is entirely up -to the user. If desired, ``yacc.py`` can also be used to implement -simple one-pass compilers. - -Like its Unix counterpart, ``yacc.py`` provides most of the features -you expect including extensive error checking, grammar validation, -support for empty productions, error tokens, and ambiguity resolution -via precedence rules. In fact, almost everything that is possible in -traditional yacc should be supported in PLY.
- -The primary difference between ``yacc.py`` and Unix ``yacc`` is that -``yacc.py`` doesn't involve a separate code-generation process. -Instead, PLY relies on reflection (introspection) to build its lexers -and parsers. Unlike traditional lex/yacc which require a special -input file that is converted into a separate source file, the -specifications given to PLY *are* valid Python programs. This -means that there are no extra source files nor is there a special -compiler construction step (e.g., running yacc to generate Python code -for the compiler). - -Lex --- - -``lex.py`` is used to tokenize an input string. For example, suppose -you're writing a programming language and a user supplied the -following input string:: - - x = 3 + 42 * (s - t) - -A tokenizer splits the string into individual tokens:: - - 'x','=', '3', '+', '42', '*', '(', 's', '-', 't', ')' - -Tokens are usually given names to indicate what they are. For example:: - - 'ID','EQUALS','NUMBER','PLUS','NUMBER','TIMES', - 'LPAREN','ID','MINUS','ID','RPAREN' - -More specifically, the input is broken into pairs of token types and -values. For example:: - - ('ID','x'), ('EQUALS','='), ('NUMBER','3'), - ('PLUS','+'), ('NUMBER','42'), ('TIMES','*'), - ('LPAREN','('), ('ID','s'), ('MINUS','-'), - ('ID','t'), ('RPAREN',')') - -The specification of tokens is done by writing a series of -regular expression rules. The next section shows how this is done -using ``lex.py``. - -Lex Example -^^^^^^^^^^^ - -The following example shows how ``lex.py`` is used to write a simple tokenizer:: - - # ------------------------------------------------------------ - # calclex.py - # - # tokenizer for a simple expression evaluator for - # numbers and +,-,*,/ - # ------------------------------------------------------------ - import ply.lex as lex - - # List of token names. This is always required - tokens = ( - 'NUMBER', - 'PLUS', - 'MINUS', - 'TIMES', - 'DIVIDE', - 'LPAREN', - 'RPAREN', - ) - - # Regular expression rules for simple tokens - t_PLUS = r'\+' - t_MINUS = r'-' - t_TIMES = r'\*' - t_DIVIDE = r'/' - t_LPAREN = r'\(' - t_RPAREN = r'\)' - - # A regular expression rule with some action code - def t_NUMBER(t): - r'\d+' - t.value = int(t.value) - return t - - # Define a rule so we can track line numbers - def t_newline(t): - r'\n+' - t.lexer.lineno += len(t.value) - - # A string containing ignored characters (spaces and tabs) - t_ignore = ' \t' - - # Error handling rule - def t_error(t): - print("Illegal character '%s'" % t.value[0]) - t.lexer.skip(1) - - # Build the lexer - lexer = lex.lex() - -To use the lexer, you first need to feed it some input text using -its ``input()`` method. After that, repeated calls -to ``token()`` produce tokens. The following code shows how this -works:: - - # Test it out - data = ''' - 3 + 4 * 10 - + -20 *2 - ''' - - # Give the lexer some input - lexer.input(data) - - # Tokenize - while True: - tok = lexer.token() - if not tok: - break # No more input - print(tok) - -When executed, the example will produce the following output:: - - $ python example.py - LexToken(NUMBER,3,2,1) - LexToken(PLUS,'+',2,3) - LexToken(NUMBER,4,2,5) - LexToken(TIMES,'*',2,7) - LexToken(NUMBER,10,2,10) - LexToken(PLUS,'+',3,14) - LexToken(MINUS,'-',3,16) - LexToken(NUMBER,20,3,18) - LexToken(TIMES,'*',3,20) - LexToken(NUMBER,2,3,21) - -Lexers also support the iteration protocol. So, you can write the -above loop as follows:: - - for tok in lexer: - print(tok) - -The tokens returned by ``lexer.token()`` are instances of -``LexToken``.
This object has attributes ``type``, ``value``, -``lineno``, and ``lexpos``. The following code shows an -example of accessing these attributes:: - - # Tokenize - while True: - tok = lexer.token() - if not tok: - break # No more input - print(tok.type, tok.value, tok.lineno, tok.lexpos) - -The ``type`` and ``value`` attributes contain the type and -value of the token itself. ``lineno`` and ``lexpos`` contain -information about the location of the token. ``lexpos`` is the -index of the token relative to the start of the input text. - -The tokens list -^^^^^^^^^^^^^^^ - -All lexers must provide a list ``tokens`` that defines all of the -possible token names that can be produced by the lexer. This list is -always required and is used to perform a variety of validation checks. -The tokens list is also used by the ``yacc.py`` module to identify -terminals. - -In the example, the following code specifies the token names:: - - tokens = ( - 'NUMBER', - 'PLUS', - 'MINUS', - 'TIMES', - 'DIVIDE', - 'LPAREN', - 'RPAREN', - ) - -Specification of tokens -^^^^^^^^^^^^^^^^^^^^^^^ - -Each token is specified by writing a regular expression rule -compatible with Python's ``re`` module. Each of these rules is -defined by making declarations with a special prefix ``t_`` to -indicate that it defines a token. For simple tokens, the regular -expression can be specified as strings such as this (note: Python raw -strings are used since they are the most convenient way to write -regular expression strings):: - - t_PLUS = r'\+' - -In this case, the name following the ``t_`` must exactly match one of -the names supplied in ``tokens``. If some kind of action needs to be -performed, a token rule can be specified as a function. For example, -this rule matches numbers and converts the string into a Python -integer:: - - def t_NUMBER(t): - r'\d+' - t.value = int(t.value) - return t - -When a function is used, the regular expression rule is specified in -the function documentation string. The function always takes a single -argument which is an instance of ``LexToken``. This object has -attributes of ``type`` which is the token type (as a string), -``value`` which is the lexeme (the actual text matched), -``lineno`` which is the current line number, and ``lexpos`` which -is the position of the token relative to the beginning of the input -text. By default, ``type`` is set to the name following the ``t_`` -prefix. The action function can modify the contents of the -``LexToken`` object as appropriate. However, when it is done, the -resulting token should be returned. If no value is returned by the -action function, the token is discarded and the next token is -read. - -Internally, ``lex.py`` uses the ``re`` module to do its pattern -matching. Patterns are compiled using the ``re.VERBOSE`` flag which -can be used to help readability. However, be aware that unescaped -whitespace is ignored and comments are allowed in this mode. If your -pattern involves whitespace, make sure you use ``\s``. If you need to -match the ``#`` character, use ``[#]``. - -When building the master regular expression, rules are added in the -following order: - -1. All tokens defined by functions are added in the same order as they - appear in the lexer file. - -2. Tokens defined by strings are added next by sorting them in order - of decreasing regular expression length (longer expressions are added - first). - -Without this ordering, it can be difficult to correctly match certain -types of tokens.
For example, if you wanted to have separate tokens -for "=" and "==", you need to make sure that "==" is checked first. -By sorting regular expressions in order of decreasing length, this -problem is solved for rules defined as strings. For functions, the -order can be explicitly controlled since rules appearing first are -checked first. - -To handle reserved words, you should write a single rule to match an -identifier and do a special name lookup in a function like this:: - - reserved = { - 'if' : 'IF', - 'then' : 'THEN', - 'else' : 'ELSE', - 'while' : 'WHILE', - ... - } - - tokens = ['LPAREN','RPAREN',...,'ID'] + list(reserved.values()) - - def t_ID(t): - r'[a-zA-Z_][a-zA-Z_0-9]*' - t.type = reserved.get(t.value,'ID') # Check for reserved words - return t - -This approach greatly reduces the number of regular expression rules -and is likely to make things a little faster. - -Note: You should avoid writing individual rules for reserved words. -For example, if you write rules like this:: - - t_FOR = r'for' - t_PRINT = r'print' - -those rules will be triggered for identifiers that include those words -as a prefix such as "forget" or "printed". This is probably not what -you want. - -Token values -^^^^^^^^^^^^ - -When tokens are returned by lex, they have a value that is stored in -the ``value`` attribute. Normally, the value is the text that was -matched. However, the value can be assigned to any Python object. -For instance, when lexing identifiers, you may want to return both the -identifier name and information from some sort of symbol table. To do -this, you might write a rule like this:: - - def t_ID(t): - ... - # Look up symbol table information and return a tuple - t.value = (t.value, symbol_lookup(t.value)) - ... - return t - -It is important to note that storing data in other attribute names is -*not* recommended. The ``yacc.py`` module only exposes the -contents of the ``value`` attribute. Thus, accessing other attributes -may be unnecessarily awkward. If you need to store multiple values on -a token, assign a tuple, dictionary, or instance to ``value``. - -Discarded tokens -^^^^^^^^^^^^^^^^ - -To discard a token, such as a comment, define a token rule that -returns no value. For example:: - - def t_COMMENT(t): - r'\#.*' - pass - # No return value. Token discarded - -Alternatively, you can include the prefix ``ignore_`` in the token -declaration to force a token to be ignored. For example:: - - t_ignore_COMMENT = r'\#.*' - -Be advised that if you are ignoring many different kinds of text, you -may still want to use functions since these provide more precise -control over the order in which regular expressions are matched (i.e., -functions are matched in order of specification whereas strings are -sorted by regular expression length). - -Line numbers and positional information -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -By default, ``lex.py`` knows nothing about line numbers. This is -because ``lex.py`` doesn't know anything about what constitutes a -"line" of input (e.g., the newline character or even if the input is -textual data). To update this information, you need to write a -special rule. In the example, the ``t_newline()`` rule shows how to -do this:: - - # Define a rule so we can track line numbers - def t_newline(t): - r'\n+' - t.lexer.lineno += len(t.value) - -Within the rule, the ``lineno`` attribute of the underlying lexer -``t.lexer`` is updated. After the line number is updated, the token -is discarded since nothing is returned. 
- -``lex.py`` does not perform any kind of automatic column tracking. -However, it does record positional information related to each token -in the ``lexpos`` attribute. Using this, it is usually possible to -compute column information as a separate step. For instance, just -count backwards until you reach a newline:: - - # Compute column. - # input is the input text string - # token is a token instance - def find_column(input, token): - line_start = input.rfind('\n', 0, token.lexpos) + 1 - return (token.lexpos - line_start) + 1 - -Since column information is often only useful in the context of error -handling, calculating the column position can be performed when needed -as opposed to doing it for each token. Note: If you're parsing a language -where whitespace matters (e.g., Python), it's probably better to match -whitespace as a token instead of ignoring it. - -Ignored characters -^^^^^^^^^^^^^^^^^^ - -The special ``t_ignore`` rule is reserved by ``lex.py`` for characters -that should be completely ignored in the input stream. Usually this -is used to skip over whitespace and other non-essential characters. -Although it is possible to define a regular expression rule for -whitespace in a manner similar to ``t_newline()``, the use of -``t_ignore`` provides substantially better lexing performance because -it is handled as a special case and is checked in a much more -efficient manner than the normal regular expression rules. - -The characters given in ``t_ignore`` are not ignored when such -characters are part of other regular expression patterns. For -example, if you had a rule to capture quoted text, that pattern can -include the ignored characters (which will be captured in the normal -way). The main purpose of ``t_ignore`` is to ignore whitespace and -other padding between the tokens that you actually want to parse. - -Literal characters -^^^^^^^^^^^^^^^^^^ - -Literal characters can be specified by defining a variable -``literals`` in your lexing module. For example:: - - literals = [ '+','-','*','/' ] - -or alternatively:: - - literals = "+-*/" - -A literal character is a single character that is returned "as -is" when encountered by the lexer. Literals are checked after all of -the defined regular expression rules. Thus, if a rule starts with one -of the literal characters, it will always take precedence. - -When a literal token is returned, both its ``type`` and ``value`` -attributes are set to the character itself. For example, ``'+'``. - -It's possible to write token functions that perform additional actions -when literals are matched. However, you'll need to set the token type -appropriately. For example:: - - literals = [ '{', '}' ] - - def t_lbrace(t): - r'\{' - t.type = '{' # Set token type to the expected literal - return t - - def t_rbrace(t): - r'\}' - t.type = '}' # Set token type to the expected literal - return t - -Error handling -^^^^^^^^^^^^^^ - -The ``t_error()`` function is used to handle lexing errors that occur -when illegal characters are detected. In this case, the ``t.value`` -attribute contains the rest of the input string that has not been -tokenized. In the example, the error function was defined as -follows:: - - # Error handling rule - def t_error(t): - print("Illegal character '%s'" % t.value[0]) - t.lexer.skip(1) - -In this case, we print the offending character and skip ahead -one character by calling ``t.lexer.skip(1)``. - -EOF Handling -^^^^^^^^^^^^ - -The ``t_eof()`` function is used to handle an end-of-file (EOF) -condition in the input.
As input, it receives a token type ``'eof'`` -with the ``lineno`` and ``lexpos`` attributes set appropriately. The -main use of this function is to provide more input to the lexer so that -it can continue to parse. Here is an example of how this works:: - - # EOF handling rule - def t_eof(t): - # Get more input (Example) - more = input('... ') - if more: - t.lexer.input(more) - return t.lexer.token() - return None - -The EOF function should return the next available token (by calling -``t.lexer.token()``) or ``None`` to indicate no more data. Be -aware that setting more input with the ``t.lexer.input()`` method -does NOT reset the lexer state or the ``lineno`` attribute used for -position tracking. The ``lexpos`` attribute is reset so be aware of -that if you're using it in error reporting. - -Building and using the lexer -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -To build the lexer, the function ``lex.lex()`` is used. For example:: - - lexer = lex.lex() - -This function uses Python reflection (or introspection) to read the -regular expression rules out of the calling context and build the -lexer. Once the lexer has been built, two methods can be used to -control the lexer. - -``lexer.input(data)``. Reset the lexer and store a new input string. - -``lexer.token()``. Return the next token. Returns a special -``LexToken`` instance on success or None if the end of the input text -has been reached. - -The @TOKEN decorator -^^^^^^^^^^^^^^^^^^^^ - -In some applications, you may want to define tokens as a series of -more complex regular expression rules. For example:: - - digit = r'([0-9])' - nondigit = r'([_A-Za-z])' - identifier = r'(' + nondigit + r'(' + digit + r'|' + nondigit + r')*)' - - def t_ID(t): - # want docstring to be identifier above. ????? - ... - -In this case, we want the regular expression rule for ``ID`` to be one -of the variables above. However, there is no way to directly specify -this using a normal documentation string. To solve this problem, you -can use the ``@TOKEN`` decorator. For example:: - - from ply.lex import TOKEN - - @TOKEN(identifier) - def t_ID(t): - ... - -This will attach ``identifier`` to the docstring for ``t_ID()`` -allowing ``lex.py`` to work normally. Naturally, you could use ``@TOKEN`` -on all functions as an alternative to using docstrings. - -Debugging -^^^^^^^^^ - -For the purpose of debugging, you can run ``lex()`` in a debugging -mode as follows:: - - lexer = lex.lex(debug=True) - -This will produce various sorts of debugging information including all -of the added rules, the master regular expressions used by the lexer, -and tokens generated during lexing. - -In addition, ``lex.py`` comes with a simple main function which will -either tokenize input read from standard input or from a file -specified on the command line. To use it, put this in your -lexer:: - - if __name__ == '__main__': - lex.runmain() - -Please refer to the "Debugging" section near the end for some more -advanced details of debugging. - -Alternative specification of lexers -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -As shown in the example, lexers are specified all within one Python -module. If you want to put token rules in a different module from the -one in which you invoke ``lex()``, use the ``module`` keyword -argument. - -For example, you might have a dedicated module that just contains the -token rules:: - - # module: tokrules.py - # This module just contains the lexing rules - - # List of token names.
This is always required - tokens = ( - 'NUMBER', - 'PLUS', - 'MINUS', - 'TIMES', - 'DIVIDE', - 'LPAREN', - 'RPAREN', - ) - - # Regular expression rules for simple tokens - t_PLUS = r'\+' - t_MINUS = r'-' - t_TIMES = r'\*' - t_DIVIDE = r'/' - t_LPAREN = r'\(' - t_RPAREN = r'\)' - - # A regular expression rule with some action code - def t_NUMBER(t): - r'\d+' - t.value = int(t.value) - return t - - # Define a rule so we can track line numbers - def t_newline(t): - r'\n+' - t.lexer.lineno += len(t.value) - - # A string containing ignored characters (spaces and tabs) - t_ignore = ' \t' - - # Error handling rule - def t_error(t): - print("Illegal character '%s'" % t.value[0]) - t.lexer.skip(1) - -Now, if you wanted to build a tokenizer from these rules from within a -different module, you would do the following (shown for Python -interactive mode):: - - >>> import ply.lex as lex - >>> import tokrules - >>> lexer = lex.lex(module=tokrules) - >>> lexer.input("3 + 4") - >>> lexer.token() - LexToken(NUMBER,3,1,0) - >>> lexer.token() - LexToken(PLUS,'+',1,2) - >>> lexer.token() - LexToken(NUMBER,4,1,4) - >>> lexer.token() - None - >>> - -The ``module`` option can also be used to define lexers from instances -of a class. For example:: - - import ply.lex as lex - - class MyLexer(object): - # List of token names. This is always required - tokens = ( - 'NUMBER', - 'PLUS', - 'MINUS', - 'TIMES', - 'DIVIDE', - 'LPAREN', - 'RPAREN', - ) - - # Regular expression rules for simple tokens - t_PLUS = r'\+' - t_MINUS = r'-' - t_TIMES = r'\*' - t_DIVIDE = r'/' - t_LPAREN = r'\(' - t_RPAREN = r'\)' - - # A regular expression rule with some action code - # Note addition of self parameter since we're in a class - def t_NUMBER(self,t): - r'\d+' - t.value = int(t.value) - return t - - # Define a rule so we can track line numbers - def t_newline(self,t): - r'\n+' - t.lexer.lineno += len(t.value) - - # A string containing ignored characters (spaces and tabs) - t_ignore = ' \t' - - # Error handling rule - def t_error(self,t): - print("Illegal character '%s'" % t.value[0]) - t.lexer.skip(1) - - # Build the lexer - def build(self,**kwargs): - self.lexer = lex.lex(module=self, **kwargs) - - # Test it out - def test(self,data): - self.lexer.input(data) - while True: - tok = self.lexer.token() - if not tok: - break - print(tok) - - # Build the lexer and try it out - m = MyLexer() - m.build() # Build the lexer - m.test("3 + 4") # Test it - - -When building a lexer from a class, *you should construct the lexer -from an instance of the class*, not the class object itself. This -is because PLY only works properly if the lexer actions are defined by -bound methods. - -When using the ``module`` option to ``lex()``, PLY collects symbols -from the underlying object using the ``dir()`` function. There is no -direct access to the ``__dict__`` attribute of the object supplied as -a module value. - -Finally, if you want to keep things nicely encapsulated, but don't -want to use a full-fledged class definition, lexers can be defined -using closures. For example:: - - import ply.lex as lex - - # List of token names.
This is always required - tokens = ( - 'NUMBER', - 'PLUS', - 'MINUS', - 'TIMES', - 'DIVIDE', - 'LPAREN', - 'RPAREN', - ) - - def MyLexer(): - # Regular expression rules for simple tokens - t_PLUS = r'\+' - t_MINUS = r'-' - t_TIMES = r'\*' - t_DIVIDE = r'/' - t_LPAREN = r'\(' - t_RPAREN = r'\)' - - # A regular expression rule with some action code - def t_NUMBER(t): - r'\d+' - t.value = int(t.value) - return t - - # Define a rule so we can track line numbers - def t_newline(t): - r'\n+' - t.lexer.lineno += len(t.value) - - # A string containing ignored characters (spaces and tabs) - t_ignore = ' \t' - - # Error handling rule - def t_error(t): - print("Illegal character '%s'" % t.value[0]) - t.lexer.skip(1) - - # Build the lexer from my environment and return it - return lex.lex() - -Important note: If you are defining a lexer using a class or closure, -be aware that PLY still requires you to only define a single lexer per -module (source file). There are extensive validation/error checking -parts of PLY that may falsely report error messages if you don't -follow this rule. - -Maintaining state -^^^^^^^^^^^^^^^^^ - -In your lexer, you may want to maintain a variety of state -information. This might include mode settings, symbol tables, and -other details. As an example, suppose that you wanted to keep track -of how many NUMBER tokens had been encountered. - -One way to do this is to keep a set of global variables in the module -where you created the lexer. For example:: - - num_count = 0 - def t_NUMBER(t): - r'\d+' - global num_count - num_count += 1 - t.value = int(t.value) - return t - -If you don't like the use of a global variable, another place to store -information is inside the Lexer object created by ``lex()``. To do this, -you can use the ``lexer`` attribute of tokens passed to the various -rules. For example:: - - def t_NUMBER(t): - r'\d+' - t.lexer.num_count += 1 # Note use of lexer attribute - t.value = int(t.value) - return t - - lexer = lex.lex() - lexer.num_count = 0 # Set the initial count - -This latter approach has the advantage of being simple and of working -correctly when multiple instantiations of a given lexer exist in the -same application. However, this might also feel -like a gross violation of encapsulation to OO purists. Just to put -your mind at some ease, all internal attributes of the lexer (with the -exception of ``lineno``) have names that are prefixed by ``lex`` -(e.g., ``lexdata``, ``lexpos``, etc.). Thus, it is perfectly safe to -store attributes in the lexer that don't have names starting with that -prefix or a name that conflicts with one of the predefined methods -(e.g., ``input()``, ``token()``, etc.). - -If you don't like assigning values on the lexer object, you can define -your lexer as a class as shown in the previous section:: - - class MyLexer: - ... - def t_NUMBER(self,t): - r'\d+' - self.num_count += 1 - t.value = int(t.value) - return t - - def build(self, **kwargs): - self.lexer = lex.lex(object=self,**kwargs) - - def __init__(self): - self.num_count = 0 - -The class approach may be the easiest to manage if your application is -going to be creating multiple instances of the same lexer and you need -to manage a lot of state. - -State can also be managed through closures. For example:: - - def MyLexer(): - num_count = 0 - ... - def t_NUMBER(t): - r'\d+' - nonlocal num_count - num_count += 1 - t.value = int(t.value) - return t - ...
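-
-As a quick check of the lexer-attribute approach, here is a small
-self-contained sketch (the single-token rule set is purely
-illustrative)::
-
-    import ply.lex as lex
-
-    tokens = ('NUMBER',)
-    t_ignore = ' \t'
-
-    def t_NUMBER(t):
-        r'\d+'
-        t.lexer.num_count += 1       # state stored on the lexer object
-        t.value = int(t.value)
-        return t
-
-    def t_error(t):
-        t.lexer.skip(1)
-
-    lexer = lex.lex()
-    lexer.num_count = 0              # set the initial count
-    lexer.input('3 + 4 * 10')
-    for tok in lexer:
-        pass
-    print(lexer.num_count)           # prints 3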
- -Lexer cloning -^^^^^^^^^^^^^ - -If necessary, a lexer object can be duplicated by invoking its -``clone()`` method. For example:: - - lexer = lex.lex() - ... - newlexer = lexer.clone() - -When a lexer is cloned, the copy is exactly identical to the original -lexer including any input text and internal state. However, the clone -allows a different set of input text to be supplied which may be -processed separately. This may be useful in situations when you are -writing a parser/compiler that involves recursive or reentrant -processing. For instance, if you needed to scan ahead in the input -for some reason, you could create a clone and use it to look ahead. -Or, if you were implementing some kind of preprocessor, cloned lexers -could be used to handle different input files. - -Creating a clone is different than calling ``lex.lex()`` in that -PLY doesn't regenerate any of the internal tables or regular expressions. - -Special considerations need to be made when cloning lexers that also -maintain their own internal state using classes or closures. Namely, -you need to be aware that the newly created lexers will share all of -this state with the original lexer. For example, if you defined a -lexer as a class and did this:: - - m = MyLexer() - a = lex.lex(object=m) # Create a lexer - - b = a.clone() # Clone the lexer - -Then both ``a`` and ``b`` are going to be bound to the same object -``m`` and any changes to ``m`` will be reflected in both lexers. It's -important to emphasize that ``clone()`` is only meant to create a new -lexer that reuses the regular expressions and environment of another -lexer. If you need to make a totally new copy of a lexer, then call -``lex()`` again. - -Internal lexer state -^^^^^^^^^^^^^^^^^^^^ - -A Lexer object ``lexer`` has a number of internal attributes that may be useful in certain -situations. - -``lexer.lexpos`` - This attribute is an integer that contains the current position - within the input text. If you modify the value, it will change - the result of the next call to ``token()``. Within token rule - functions, this points to the first character *after* the - matched text. If the value is modified within a rule, the next - returned token will be matched at the new position. - -``lexer.lineno`` - The current value of the line number attribute stored in the - lexer. PLY only specifies that the attribute exists---it never - sets, updates, or performs any processing with it. If you want to - track line numbers, you will need to add code yourself (see the - section on line numbers and positional information). - -``lexer.lexdata`` - The current input text stored in the lexer. This is the string - passed with the ``input()`` method. It would probably be a bad - idea to modify this unless you really know what you're doing. - -``lexer.lexmatch`` - This is the raw ``Match`` object returned by the Python - ``re.match()`` function (used internally by PLY) for the current - token. If you have written a regular expression that contains - named groups, you can use this to retrieve those values. Note: - This attribute is only updated when tokens are defined and - processed by functions. - -Conditional lexing and start conditions -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -In advanced parsing applications, it may be useful to have different -lexing states. For instance, you may want the occurrence of a certain -token or syntactic construct to trigger a different kind of lexing. 
-PLY supports a feature that allows the underlying lexer to be put into -a series of different states. Each state can have its own tokens, -lexing rules, and so forth. The implementation is based largely on -the "start condition" feature of GNU flex. Details of this can be -found at http://flex.sourceforge.net/manual/Start-Conditions.html - -To define a new lexing state, it must first be declared. This is done -by including a "states" declaration in your lex file. For example:: - - states = ( - ('foo','exclusive'), - ('bar','inclusive'), - ) - -This declares two states, ``'foo'`` and ``'bar'``. States -may be of two types: ``'exclusive'`` and ``'inclusive'``. An -exclusive state completely overrides the default behavior of the -lexer. That is, lex will only return tokens and apply rules defined -specifically for that state. An inclusive state adds additional -tokens and rules to the default set of rules. Thus, lex will return -the tokens defined by default in addition to those defined for -the inclusive state. - -Once a state has been declared, tokens and rules are declared by -including the state name in the token/rule declaration. For example:: - - t_foo_NUMBER = r'\d+' # Token 'NUMBER' in state 'foo' - t_bar_ID = r'[a-zA-Z_][a-zA-Z0-9_]*' # Token 'ID' in state 'bar' - - def t_foo_newline(t): - r'\n' - t.lexer.lineno += 1 - -A token can be declared in multiple states by including multiple state -names in the declaration. For example:: - - t_foo_bar_NUMBER = r'\d+' # Defines token 'NUMBER' in both state 'foo' and 'bar' - -Alternatively, a token can be declared in all states by using 'ANY' in -the name:: - - t_ANY_NUMBER = r'\d+' # Defines a token 'NUMBER' in all states - -If no state name is supplied, as is normally the case, the token is -associated with a special state ``'INITIAL'``. For example, these two -declarations are identical:: - - t_NUMBER = r'\d+' - t_INITIAL_NUMBER = r'\d+' - - -States are also associated with the special ``t_ignore``, -``t_error()``, and ``t_eof()`` declarations. For example, if a state -treats these differently, you can declare:: - - t_foo_ignore = " \t\n" # Ignored characters for state 'foo' - - def t_bar_error(t): # Special error handler for state 'bar' - pass - -By default, lexing operates in the ``'INITIAL'`` state. This state -includes all of the normally defined tokens. For users who aren't -using different states, this fact is completely transparent. If, -during lexing or parsing, you want to change the lexing state, use the -``begin()`` method. For example:: - - def t_begin_foo(t): - r'start_foo' - t.lexer.begin('foo') # Starts 'foo' state - -To get out of a state, you use ``begin()`` to switch back to the -initial state. For example:: - - def t_foo_end(t): - r'end_foo' - t.lexer.begin('INITIAL') # Back to the initial state - -The management of states can also be done with a stack. For example:: - - def t_begin_foo(t): - r'start_foo' - t.lexer.push_state('foo') # Starts 'foo' state - - def t_foo_end(t): - r'end_foo' - t.lexer.pop_state() # Back to the previous state - - -The use of a stack would be useful in situations where there are many -ways of entering a new lexing state and you merely want to go back to -the previous state afterwards. - -An example might help clarify. Suppose you were writing a parser and -you wanted to grab sections of arbitrary C code enclosed by curly -braces.
That is, whenever you encounter a starting brace ``'{'``, you -want to read all of the enclosed code up to the ending brace ``'}'`` and -return it as a string. Doing this with a normal regular expression -rule is nearly (if not actually) impossible. This is because braces -can be nested and can be included in comments and strings. Thus, -matching up to the first matching ``'}'`` character isn't good -enough. Here is how you might use lexer states to do this:: - - # Declare the state - states = ( - ('ccode','exclusive'), - ) - - # Match the first {. Enter ccode state. - def t_ccode(t): - r'\{' - t.lexer.code_start = t.lexer.lexpos # Record the starting position - t.lexer.level = 1 # Initial brace level - t.lexer.begin('ccode') # Enter 'ccode' state - - # Rules for the ccode state - def t_ccode_lbrace(t): - r'\{' - t.lexer.level +=1 - - def t_ccode_rbrace(t): - r'\}' - t.lexer.level -=1 - - # If closing brace, return the code fragment - if t.lexer.level == 0: - t.value = t.lexer.lexdata[t.lexer.code_start:t.lexer.lexpos] - t.type = "CCODE" - t.lexer.lineno += t.value.count('\n') - t.lexer.begin('INITIAL') - return t - - # C or C++ comment (ignore) - def t_ccode_comment(t): - r'(/\*(.|\n)*?\*/)|(//.*)' - pass - - # C string - def t_ccode_string(t): - r'\"([^\\\n]|(\\.))*?\"' - - # C character literal - def t_ccode_char(t): - r'\'([^\\\n]|(\\.))*?\'' - - # Any sequence of non-whitespace characters (not braces, strings) - def t_ccode_nonspace(t): - r'[^\s\{\}\'\"]+' - - # Ignored characters (whitespace) - t_ccode_ignore = " \t\n" - - # For bad characters, we just skip over them - def t_ccode_error(t): - t.lexer.skip(1) - -In this example, the occurrence of the first '{' causes the lexer to -record the starting position and enter a new state ``'ccode'``. A -collection of rules then match various parts of the input that follow -(comments, strings, etc.). All of these rules merely discard the -token (by not returning a value). However, if the closing right brace -is encountered, the rule ``t_ccode_rbrace`` collects all of the code -(using the earlier recorded starting position), stores it, and returns -a token 'CCODE' containing all of that text. When returning the -token, the lexing state is restored back to its initial state. - -Miscellaneous Issues -^^^^^^^^^^^^^^^^^^^^ - -- The lexer requires input to be supplied as a single input string. - Since most machines have more than enough memory, this rarely presents - a performance concern. However, it means that the lexer currently - can't be used with streaming data such as open files or sockets. This - limitation is primarily a side-effect of using the ``re`` module. You - might be able to work around this by implementing an appropriate ``def - t_eof()`` end-of-file handling rule. The main complication here is - that you'll probably need to ensure that data is fed to the lexer in a - way so that it doesn't split in the middle of a token. - -- If you need to supply optional flags to the ``re.compile()`` function, - use the ``reflags`` option to lex. For example:: - - lex.lex(reflags=re.UNICODE | re.VERBOSE) - - Note: by default, ``reflags`` is set to ``re.VERBOSE``. If you provide - your own flags, you may need to include this for PLY to preserve its normal behavior. - -- If you are going to create a hand-written lexer and you plan to use it with ``yacc.py``, - it only needs to conform to the following requirements: - - 1. It must provide a ``token()`` method that returns the next token or - ``None`` if no more tokens are available. - - 2.
The ``token()`` method must return an object ``tok`` that has - ``type`` and ``value`` attributes. If line number tracking is - being used, then the token should also define a ``lineno`` - attribute. - -Parsing basics --------------- - -``yacc.py`` is used to parse language syntax. Before showing an -example, there are a few important bits of background that must be -mentioned. First, *syntax* is usually specified in terms of a -BNF grammar. For example, if you wanted to parse simple arithmetic -expressions, you might first write an unambiguous grammar -specification like this:: - - expression : expression + term - | expression - term - | term - - term : term * factor - | term / factor - | factor - - factor : NUMBER - | ( expression ) - -In the grammar, symbols such as ``NUMBER``, ``+``, ``-``, ``*``, and -``/`` are known as *terminals* and correspond to input -tokens. Identifiers such as ``term`` and ``factor`` refer to grammar -rules comprised of a collection of terminals and other rules. These -identifiers are known as *non-terminals*. - -The semantic behavior of a language is often specified using a -technique known as syntax directed translation. In syntax directed -translation, attributes are attached to each symbol in a given grammar -rule along with an action. Whenever a particular grammar rule is -recognized, the action describes what to do. For example, given the -expression grammar above, you might write the specification for a -simple calculator like this:: - - Grammar Action - -------------------------------- -------------------------------------------- - expression0 : expression1 + term expression0.val = expression1.val + term.val - | expression1 - term expression0.val = expression1.val - term.val - | term expression0.val = term.val - - term0 : term1 * factor term0.val = term1.val * factor.val - | term1 / factor term0.val = term1.val / factor.val - | factor term0.val = factor.val - - factor : NUMBER factor.val = int(NUMBER.lexval) - | ( expression ) factor.val = expression.val - -A good way to think about syntax directed translation is to view each -symbol in the grammar as a kind of object. Associated with each symbol -is a value representing its "state" (for example, the ``val`` -attribute above). Semantic actions are then expressed as a collection -of functions or methods that operate on the symbols and associated -values. - -Yacc uses a parsing technique known as LR-parsing or shift-reduce -parsing. LR parsing is a bottom up technique that tries to recognize -the right-hand-side of various grammar rules. Whenever a valid -right-hand-side is found in the input, the appropriate action code is -triggered and the grammar symbols are replaced by the grammar symbol -on the left-hand-side. - -LR parsing is commonly implemented by shifting grammar symbols onto a -stack and looking at the stack and the next input token for patterns -that match one of the grammar rules. The details of the algorithm can -be found in a compiler textbook, but the following example illustrates -the steps that are performed if you wanted to parse the expression ``3 -+ 5 * (10 - 20)`` using the grammar defined above. 
In the example, -the special symbol ``$`` represents the end of input:: - - Step Symbol Stack Input Tokens Action - ---- --------------------- --------------------- ------------------------------- - 1 3 + 5 * ( 10 - 20 )$ Shift 3 - 2 3 + 5 * ( 10 - 20 )$ Reduce factor : NUMBER - 3 factor + 5 * ( 10 - 20 )$ Reduce term : factor - 4 term + 5 * ( 10 - 20 )$ Reduce expr : term - 5 expr + 5 * ( 10 - 20 )$ Shift + - 6 expr + 5 * ( 10 - 20 )$ Shift 5 - 7 expr + 5 * ( 10 - 20 )$ Reduce factor : NUMBER - 8 expr + factor * ( 10 - 20 )$ Reduce term : factor - 9 expr + term * ( 10 - 20 )$ Shift * - 10 expr + term * ( 10 - 20 )$ Shift ( - 11 expr + term * ( 10 - 20 )$ Shift 10 - 12 expr + term * ( 10 - 20 )$ Reduce factor : NUMBER - 13 expr + term * ( factor - 20 )$ Reduce term : factor - 14 expr + term * ( term - 20 )$ Reduce expr : term - 15 expr + term * ( expr - 20 )$ Shift - - 16 expr + term * ( expr - 20 )$ Shift 20 - 17 expr + term * ( expr - 20 )$ Reduce factor : NUMBER - 18 expr + term * ( expr - factor )$ Reduce term : factor - 19 expr + term * ( expr - term )$ Reduce expr : expr - term - 20 expr + term * ( expr )$ Shift ) - 21 expr + term * ( expr ) $ Reduce factor : (expr) - 22 expr + term * factor $ Reduce term : term * factor - 23 expr + term $ Reduce expr : expr + term - 24 expr $ Reduce expr - 25 $ Success! - -When parsing the expression, an underlying state machine and the -current input token determine what happens next. If the next token -looks like part of a valid grammar rule (based on other items on the -stack), it is generally shifted onto the stack. If the top of the -stack contains a valid right-hand-side of a grammar rule, it is -usually "reduced" and the symbols replaced with the symbol on the -left-hand-side. When this reduction occurs, the appropriate action is -triggered (if defined). If the input token can't be shifted and the -top of stack doesn't match any grammar rules, a syntax error has -occurred and the parser must take some kind of recovery step (or bail -out). A parse is only successful if the parser reaches a state where -the symbol stack is empty and there are no more input tokens. - -It is important to note that the underlying implementation is built -around a large finite-state machine that is encoded in a collection of -tables. The construction of these tables is non-trivial and -beyond the scope of this discussion. However, subtle details of this -process explain why, in the example above, the parser chooses to shift -a token onto the stack in step 9 rather than reducing the -rule ``expr : expr + term``. - -Yacc ----- - -The ``ply.yacc`` module implements the parsing component of PLY. -The name "yacc" stands for "Yet Another Compiler Compiler" and is -borrowed from the Unix tool of the same name. - -An example -^^^^^^^^^^ - -Suppose you wanted to make a grammar for simple arithmetic expressions -as previously described. Here is how you would do it with -``yacc.py``:: - - # Yacc example - - import ply.yacc as yacc - - # Get the token map from the lexer. This is required. 
- from calclex import tokens - - def p_expression_plus(p): - 'expression : expression PLUS term' - p[0] = p[1] + p[3] - - def p_expression_minus(p): - 'expression : expression MINUS term' - p[0] = p[1] - p[3] - - def p_expression_term(p): - 'expression : term' - p[0] = p[1] - - def p_term_times(p): - 'term : term TIMES factor' - p[0] = p[1] * p[3] - - def p_term_div(p): - 'term : term DIVIDE factor' - p[0] = p[1] / p[3] - - def p_term_factor(p): - 'term : factor' - p[0] = p[1] - - def p_factor_num(p): - 'factor : NUMBER' - p[0] = p[1] - - def p_factor_expr(p): - 'factor : LPAREN expression RPAREN' - p[0] = p[2] - - # Error rule for syntax errors - def p_error(p): - print("Syntax error in input!") - - # Build the parser - parser = yacc.yacc() - - while True: - try: - s = input('calc > ') - except EOFError: - break - if not s: continue - result = parser.parse(s) - print(result) - -In this example, each grammar rule is defined by a Python function -where the docstring of that function contains the appropriate -context-free grammar specification. The statements that make up the -function body implement the semantic actions of the rule. Each -function accepts a single argument ``p`` that is a sequence containing -the values of each grammar symbol in the corresponding rule. The -values of ``p[i]`` are mapped to grammar symbols as shown here:: - - def p_expression_plus(p): - 'expression : expression PLUS term' - # ^ ^ ^ ^ - # p[0] p[1] p[2] p[3] - - p[0] = p[1] + p[3] - - -For tokens, the "value" of the corresponding ``p[i]`` is the -*same* as the ``p.value`` attribute assigned in the lexer -module. For non-terminals, the value is determined by whatever is -placed in ``p[0]`` when rules are reduced. This value can be anything -at all. However, it is probably most common for the value to be a simple -Python type, a tuple, or an instance. In this example, we are relying -on the fact that the ``NUMBER`` token stores an integer value in its -value field. All of the other rules perform various types of -integer operations and propagate the result. - -Note: The use of negative indices has a special meaning in -yacc---specifically, ``p[-1]`` does not have the same value as ``p[3]`` in -this example. Please see the section on "Embedded Actions" for -further details. - -The first rule defined in the yacc specification determines the -starting grammar symbol (in this case, a rule for ``expression`` -appears first). Whenever the starting rule is reduced by the parser -and no more input is available, parsing stops and the final value is -returned (this value will be whatever the top-most rule placed in -``p[0]``). Note: an alternative starting symbol can be specified using -the ``start`` keyword argument to ``yacc()``. - -The ``p_error(p)`` rule is defined to catch syntax errors. See the -error handling section below for more detail. - -To build the parser, call the ``yacc.yacc()`` function. This function -looks at the module and attempts to construct all of the LR parsing -tables for the grammar you have specified. - -If any errors are detected in your grammar specification, ``yacc.py`` -will produce diagnostic messages and possibly raise an exception. -Some of the errors that can be detected include: - -- Duplicated function names (if more than one rule function has the same name in the grammar file). -- Shift/reduce and reduce/reduce conflicts generated by ambiguous grammars. -- Badly specified grammar rules. -- Infinite recursion (rules that can never terminate).
Combining Grammar Rule Functions
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

When grammar rules are similar, they can be combined into a single
function.  For example, consider the two rules in our earlier
example::

    def p_expression_plus(p):
        'expression : expression PLUS term'
        p[0] = p[1] + p[3]

    def p_expression_minus(p):
        'expression : expression MINUS term'
        p[0] = p[1] - p[3]

Instead of writing two functions, you might write a single function
like this::

    def p_expression(p):
        '''expression : expression PLUS term
                      | expression MINUS term'''
        if p[2] == '+':
            p[0] = p[1] + p[3]
        elif p[2] == '-':
            p[0] = p[1] - p[3]

In general, the docstring for any given function can contain multiple
grammar rules.  So, it would have also been legal (although possibly
confusing) to write this::

    def p_binary_operators(p):
        '''expression : expression PLUS term
                      | expression MINUS term
           term       : term TIMES factor
                      | term DIVIDE factor'''
        if p[2] == '+':
            p[0] = p[1] + p[3]
        elif p[2] == '-':
            p[0] = p[1] - p[3]
        elif p[2] == '*':
            p[0] = p[1] * p[3]
        elif p[2] == '/':
            p[0] = p[1] / p[3]

When combining grammar rules into a single function, it is usually a
good idea for all of the rules to have a similar structure (e.g., the
same number of terms).  Otherwise, the corresponding action code may
be more complicated than necessary.  However, it is possible to handle
simple cases using ``len()``.  For example::

    def p_expressions(p):
        '''expression : expression MINUS expression
                      | MINUS expression'''
        if len(p) == 4:
            p[0] = p[1] - p[3]
        elif len(p) == 3:
            p[0] = -p[2]

If parsing performance is a concern, you should resist the urge to put
too much conditional processing into a single grammar rule as shown in
these examples.  When you add checks to see which grammar rule is
being handled, you are actually duplicating the work that the parser
has already performed (i.e., the parser already knows exactly what
rule it matched).  You can eliminate this overhead by using a separate
``p_rule()`` function for each grammar rule.

Character Literals
^^^^^^^^^^^^^^^^^^

If desired, a grammar may contain tokens defined as single character
literals.  For example::

    def p_binary_operators(p):
        '''expression : expression '+' term
                      | expression '-' term
           term       : term '*' factor
                      | term '/' factor'''
        if p[2] == '+':
            p[0] = p[1] + p[3]
        elif p[2] == '-':
            p[0] = p[1] - p[3]
        elif p[2] == '*':
            p[0] = p[1] * p[3]
        elif p[2] == '/':
            p[0] = p[1] / p[3]

A character literal must be enclosed in quotes such as ``'+'``.  In
addition, if literals are used, they must be declared in the
corresponding ``lex`` file through the use of a special ``literals``
declaration::

    # Literals.  Should be placed in module given to lex()
    literals = ['+', '-', '*', '/']

Character literals are limited to a single character.  Thus, it is not
legal to specify literals such as ``'<='`` or ``'=='``.  For this,
use the normal lexing rules (e.g., define a rule such as ``t_EQ =
r'=='``).

Empty Productions
^^^^^^^^^^^^^^^^^

``yacc.py`` can handle empty productions by defining a rule like this::

    def p_empty(p):
        'empty :'
        pass

Now to use the empty production, use ``empty`` as a symbol.  For
example::

    def p_optitem(p):
        '''optitem : item
                   | empty'''
        ...

Note: You can write empty rules anywhere by specifying an empty
right hand side.  However, I personally find that writing an "empty"
rule and using "empty" to denote an empty production is easier to read
and more clearly states your intentions.
Changing the starting symbol
^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Normally, the first rule found in a yacc specification defines the
starting grammar rule (top level rule).  To change this, supply
a ``start`` specifier in your file.  For example::

    start = 'foo'

    def p_bar(p):
        'bar : A B'

    # This is the starting rule due to the start specifier above
    def p_foo(p):
        'foo : bar X'
        ...

The use of a ``start`` specifier may be useful during debugging
since you can use it to have yacc build a subset of a larger grammar.
For this purpose, it is also possible to specify a starting symbol as
an argument to ``yacc()``.  For example::

    parser = yacc.yacc(start='foo')

Dealing With Ambiguous Grammars
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The expression grammar given in the earlier example has been written
in a special format to eliminate ambiguity.  However, in many
situations, it is extremely difficult or awkward to write grammars in
this format.  A much more natural way to express the grammar is in a
more compact form like this::

    expression : expression PLUS expression
               | expression MINUS expression
               | expression TIMES expression
               | expression DIVIDE expression
               | LPAREN expression RPAREN
               | NUMBER

Unfortunately, this grammar specification is ambiguous.  For example,
if you are parsing the string "3 * 4 + 5", there is no way to tell how
the operators are supposed to be grouped.  Does the expression mean
"(3 * 4) + 5" or is it "3 * (4 + 5)"?

When an ambiguous grammar is given to ``yacc.py``, it will print
messages about "shift/reduce conflicts" or "reduce/reduce conflicts".
A shift/reduce conflict is caused when the parser generator can't
decide whether to reduce a rule or shift a symbol on the
parsing stack.  For example, consider the string "3 * 4 + 5" and the
internal parsing stack::

    Step Symbol Stack            Input Tokens            Action
    ---- ----------------------- ----------------------- -------------------------------
    1    $                       3 * 4 + 5$              Shift 3
    2    $ 3                     * 4 + 5$                Reduce expression : NUMBER
    3    $ expr                  * 4 + 5$                Shift *
    4    $ expr *                4 + 5$                  Shift 4
    5    $ expr * 4              + 5$                    Reduce expression : NUMBER
    6    $ expr * expr           + 5$                    SHIFT/REDUCE CONFLICT ????

In this case, when the parser reaches step 6, it has two options.  One
is to reduce the rule ``expr : expr * expr`` on the stack.  The other
option is to shift the token ``+`` on the stack.  Both options are
perfectly legal from the rules of the context-free grammar.

By default, all shift/reduce conflicts are resolved in favor of
shifting.  Therefore, in the above example, the parser will always
shift the ``+`` instead of reducing.  Although this strategy works in
many cases (for example, the case of "if-then" versus "if-then-else"),
it is not enough for arithmetic expressions.  In fact, in the above
example, the decision to shift ``+`` is completely wrong---we should
have reduced ``expr * expr`` since multiplication has higher
mathematical precedence than addition.
To resolve ambiguity, especially in expression grammars, ``yacc.py``
allows individual tokens to be assigned a precedence level and
associativity.  This is done by adding a variable ``precedence`` to
the grammar file like this::

    precedence = (
        ('left', 'PLUS', 'MINUS'),
        ('left', 'TIMES', 'DIVIDE'),
    )

This declaration specifies that ``PLUS``/``MINUS`` have the same
precedence level and are left-associative, and that
``TIMES``/``DIVIDE`` have the same precedence and are
left-associative.  Within the ``precedence`` declaration, tokens are
ordered from lowest to highest precedence.  Thus, this declaration
specifies that ``TIMES``/``DIVIDE`` have higher precedence than
``PLUS``/``MINUS`` (since they appear later in the precedence
specification).

The precedence specification works by associating a numerical
precedence level value and associativity direction to the listed
tokens.  For example, with the declaration above you get::

    PLUS   : level = 1, assoc = 'left'
    MINUS  : level = 1, assoc = 'left'
    TIMES  : level = 2, assoc = 'left'
    DIVIDE : level = 2, assoc = 'left'

These values are then used to attach a numerical precedence value and
associativity direction to each grammar rule.  *This is always
determined by looking at the precedence of the right-most terminal
symbol.*  For example::

    expression : expression PLUS expression    # level = 1, left
               | expression MINUS expression   # level = 1, left
               | expression TIMES expression   # level = 2, left
               | expression DIVIDE expression  # level = 2, left
               | LPAREN expression RPAREN      # level = None (not specified)
               | NUMBER                        # level = None (not specified)

When shift/reduce conflicts are encountered, the parser generator
resolves the conflict by looking at the precedence rules and
associativity specifiers.

1. If the current token has higher precedence than the rule on the stack, it is shifted.

2. If the grammar rule on the stack has higher precedence, the rule is reduced.

3. If the current token and the grammar rule have the same precedence, the
   rule is reduced for left associativity, whereas the token is shifted for right associativity.

4. If nothing is known about the precedence, shift/reduce conflicts are resolved in
   favor of shifting (the default).

For example, if "expression PLUS expression" has been parsed and the
next token is "TIMES", the action is going to be a shift because
"TIMES" has a higher precedence level than "PLUS".  On the other hand,
if "expression TIMES expression" has been parsed and the next token is
"PLUS", the action is going to be reduce because "PLUS" has a lower
precedence than "TIMES".

When shift/reduce conflicts are resolved using the first three
techniques (with the help of precedence rules), ``yacc.py`` will
report no errors or conflicts in the grammar (although it will print
some information in the ``parser.out`` debugging file).
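To make the behavior concrete, here is a minimal sketch of the
ambiguous expression grammar made usable with nothing but a
``precedence`` table.  It reuses the ``calclex`` tokens from the
earlier example and assumes, as before, that importing ``calclex``
builds its lexer; everything else follows the API already shown::

    import ply.yacc as yacc
    from calclex import tokens

    precedence = (
        ('left', 'PLUS', 'MINUS'),
        ('left', 'TIMES', 'DIVIDE'),
    )

    def p_expression_binop(p):
        '''expression : expression PLUS expression
                      | expression MINUS expression
                      | expression TIMES expression
                      | expression DIVIDE expression'''
        if p[2] == '+':
            p[0] = p[1] + p[3]
        elif p[2] == '-':
            p[0] = p[1] - p[3]
        elif p[2] == '*':
            p[0] = p[1] * p[3]
        else:
            p[0] = p[1] / p[3]

    def p_expression_group(p):
        'expression : LPAREN expression RPAREN'
        p[0] = p[2]

    def p_expression_number(p):
        'expression : NUMBER'
        p[0] = p[1]

    def p_error(p):
        print("Syntax error in input!")

    parser = yacc.yacc()
    # TIMES binds tighter, so this prints 17.  Without the precedence
    # table, the default shift would instead group it as 3 * (4 + 5) = 27.
    print(parser.parse('3 * 4 + 5'))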
One problem with the precedence specifier technique is that it is
sometimes necessary to change the precedence of an operator in certain
contexts.  For example, consider a unary-minus operator in "3 + 4 *
-5".  Mathematically, the unary minus is normally given a very high
precedence--being evaluated before the multiply.  However, in our
precedence specifier, MINUS has a lower precedence than TIMES.  To
deal with this, precedence rules can be given for so-called
"fictitious tokens" like this::

    precedence = (
        ('left', 'PLUS', 'MINUS'),
        ('left', 'TIMES', 'DIVIDE'),
        ('right', 'UMINUS'),            # Unary minus operator
    )

Now, in the grammar file, we can write our unary minus rule like
this::

    def p_expr_uminus(p):
        'expression : MINUS expression %prec UMINUS'
        p[0] = -p[2]

In this case, ``%prec UMINUS`` overrides the default rule
precedence--setting it to that of UMINUS in the precedence specifier.

At first, the use of UMINUS in this example may appear very confusing.
UMINUS is not an input token or a grammar rule.  Instead, you should
think of it as the name of a special marker in the precedence table.
When you use the ``%prec`` qualifier, you're telling yacc that
you want the precedence of the expression to be the same as for this
special marker instead of the usual precedence.

It is also possible to specify non-associativity in the ``precedence``
table.  This would be used when you *don't* want operations to
chain together.  For example, suppose you wanted to support comparison
operators like ``<`` and ``>`` but you didn't want to allow
combinations like ``a < b < c``.  To do this, specify a
rule like this::

    precedence = (
        ('nonassoc', 'LESSTHAN', 'GREATERTHAN'),  # Nonassociative operators
        ('left', 'PLUS', 'MINUS'),
        ('left', 'TIMES', 'DIVIDE'),
        ('right', 'UMINUS'),                      # Unary minus operator
    )

If you do this, the occurrence of input text such as ``a < b < c``
will result in a syntax error.  However, simple expressions such
as ``a < b`` will still be fine.

Reduce/reduce conflicts are caused when there are multiple grammar
rules that can be applied to a given set of symbols.  This kind of
conflict is almost always bad and is always resolved by picking the
rule that appears first in the grammar file.  Reduce/reduce conflicts
are almost always caused when different sets of grammar rules somehow
generate the same set of symbols.  For example::

    assignment : ID EQUALS NUMBER
               | ID EQUALS expression

    expression : expression PLUS expression
               | expression MINUS expression
               | expression TIMES expression
               | expression DIVIDE expression
               | LPAREN expression RPAREN
               | NUMBER

In this case, a reduce/reduce conflict exists between these two rules::

    assignment : ID EQUALS NUMBER
    expression : NUMBER

For example, if you wrote "a = 5", the parser can't figure out if this
is supposed to be reduced as ``assignment : ID EQUALS NUMBER`` or
whether it's supposed to reduce the 5 as an expression and then reduce
the rule ``assignment : ID EQUALS expression``.

It should be noted that reduce/reduce conflicts are notoriously
difficult to spot looking at the input grammar.  When a
reduce/reduce conflict occurs, ``yacc()`` will try to help by printing
a warning message such as this::

    WARNING: 1 reduce/reduce conflict
    WARNING: reduce/reduce conflict in state 15 resolved using rule (assignment -> ID EQUALS NUMBER)
    WARNING: rejected rule (expression -> NUMBER)

This message identifies the two rules that are in conflict.  However,
it may not tell you how the parser arrived at such a state.  To try
and figure it out, you'll probably have to look at your grammar and
the contents of the ``parser.out`` debugging file with an
appropriately high level of caffeination.
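There is no single recipe for eliminating a reduce/reduce conflict, but
one workable sketch for the grammar above is to remove the overlap so
that only one rule can ever match a ``NUMBER`` on the right-hand side::

    assignment : ID EQUALS expression

    expression : expression PLUS expression
               | expression MINUS expression
               | expression TIMES expression
               | expression DIVIDE expression
               | LPAREN expression RPAREN
               | NUMBER

Here, "a = 5" is parsed by first reducing the 5 to an ``expression``;
the special case ``assignment : ID EQUALS NUMBER`` (and the conflict
it caused) is gone.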
The parser.out file
^^^^^^^^^^^^^^^^^^^

Tracking down shift/reduce and reduce/reduce conflicts is one of the
finer pleasures of using an LR parsing algorithm.  To assist in
debugging, ``yacc.py`` can create a debugging file called 'parser.out'.
To create this file, use ``yacc.yacc(debug=True)``.
The contents of this file look like the following::

    Unused terminals:


    Grammar

    Rule 1     expression -> expression PLUS expression
    Rule 2     expression -> expression MINUS expression
    Rule 3     expression -> expression TIMES expression
    Rule 4     expression -> expression DIVIDE expression
    Rule 5     expression -> NUMBER
    Rule 6     expression -> LPAREN expression RPAREN

    Terminals, with rules where they appear

    TIMES                : 3
    error                :
    MINUS                : 2
    RPAREN               : 6
    LPAREN               : 6
    DIVIDE               : 4
    PLUS                 : 1
    NUMBER               : 5

    Nonterminals, with rules where they appear

    expression           : 1 1 2 2 3 3 4 4 6 0


    Parsing method: LALR


    state 0

        S' -> . expression
        expression -> . expression PLUS expression
        expression -> . expression MINUS expression
        expression -> . expression TIMES expression
        expression -> . expression DIVIDE expression
        expression -> . NUMBER
        expression -> . LPAREN expression RPAREN

        NUMBER          shift and go to state 3
        LPAREN          shift and go to state 2


    state 1

        S' -> expression .
        expression -> expression . PLUS expression
        expression -> expression . MINUS expression
        expression -> expression . TIMES expression
        expression -> expression . DIVIDE expression

        PLUS            shift and go to state 6
        MINUS           shift and go to state 5
        TIMES           shift and go to state 4
        DIVIDE          shift and go to state 7


    state 2

        expression -> LPAREN . expression RPAREN
        expression -> . expression PLUS expression
        expression -> . expression MINUS expression
        expression -> . expression TIMES expression
        expression -> . expression DIVIDE expression
        expression -> . NUMBER
        expression -> . LPAREN expression RPAREN

        NUMBER          shift and go to state 3
        LPAREN          shift and go to state 2


    state 3

        expression -> NUMBER .

        $               reduce using rule 5
        PLUS            reduce using rule 5
        MINUS           reduce using rule 5
        TIMES           reduce using rule 5
        DIVIDE          reduce using rule 5
        RPAREN          reduce using rule 5


    state 4

        expression -> expression TIMES . expression
        expression -> . expression PLUS expression
        expression -> . expression MINUS expression
        expression -> . expression TIMES expression
        expression -> . expression DIVIDE expression
        expression -> . NUMBER
        expression -> . LPAREN expression RPAREN

        NUMBER          shift and go to state 3
        LPAREN          shift and go to state 2


    state 5

        expression -> expression MINUS . expression
        expression -> . expression PLUS expression
        expression -> . expression MINUS expression
        expression -> . expression TIMES expression
        expression -> . expression DIVIDE expression
        expression -> . NUMBER
        expression -> . LPAREN expression RPAREN

        NUMBER          shift and go to state 3
        LPAREN          shift and go to state 2


    state 6

        expression -> expression PLUS . expression
        expression -> . expression PLUS expression
        expression -> . expression MINUS expression
        expression -> . expression TIMES expression
        expression -> . expression DIVIDE expression
        expression -> . NUMBER
        expression -> . LPAREN expression RPAREN

        NUMBER          shift and go to state 3
        LPAREN          shift and go to state 2


    state 7

        expression -> expression DIVIDE . expression
        expression -> . expression PLUS expression
        expression -> . expression MINUS expression
        expression -> . expression TIMES expression
        expression -> . expression DIVIDE expression
        expression -> . NUMBER
        expression -> . LPAREN expression RPAREN
        NUMBER          shift and go to state 3
        LPAREN          shift and go to state 2


    state 8

        expression -> LPAREN expression . RPAREN
        expression -> expression . PLUS expression
        expression -> expression . MINUS expression
        expression -> expression . TIMES expression
        expression -> expression . DIVIDE expression

        RPAREN          shift and go to state 13
        PLUS            shift and go to state 6
        MINUS           shift and go to state 5
        TIMES           shift and go to state 4
        DIVIDE          shift and go to state 7


    state 9

        expression -> expression TIMES expression .
        expression -> expression . PLUS expression
        expression -> expression . MINUS expression
        expression -> expression . TIMES expression
        expression -> expression . DIVIDE expression

        $               reduce using rule 3
        PLUS            reduce using rule 3
        MINUS           reduce using rule 3
        TIMES           reduce using rule 3
        DIVIDE          reduce using rule 3
        RPAREN          reduce using rule 3

      ! PLUS            [ shift and go to state 6 ]
      ! MINUS           [ shift and go to state 5 ]
      ! TIMES           [ shift and go to state 4 ]
      ! DIVIDE          [ shift and go to state 7 ]

    state 10

        expression -> expression MINUS expression .
        expression -> expression . PLUS expression
        expression -> expression . MINUS expression
        expression -> expression . TIMES expression
        expression -> expression . DIVIDE expression

        $               reduce using rule 2
        PLUS            reduce using rule 2
        MINUS           reduce using rule 2
        RPAREN          reduce using rule 2
        TIMES           shift and go to state 4
        DIVIDE          shift and go to state 7

      ! TIMES           [ reduce using rule 2 ]
      ! DIVIDE          [ reduce using rule 2 ]
      ! PLUS            [ shift and go to state 6 ]
      ! MINUS           [ shift and go to state 5 ]

    state 11

        expression -> expression PLUS expression .
        expression -> expression . PLUS expression
        expression -> expression . MINUS expression
        expression -> expression . TIMES expression
        expression -> expression . DIVIDE expression

        $               reduce using rule 1
        PLUS            reduce using rule 1
        MINUS           reduce using rule 1
        RPAREN          reduce using rule 1
        TIMES           shift and go to state 4
        DIVIDE          shift and go to state 7

      ! TIMES           [ reduce using rule 1 ]
      ! DIVIDE          [ reduce using rule 1 ]
      ! PLUS            [ shift and go to state 6 ]
      ! MINUS           [ shift and go to state 5 ]

    state 12

        expression -> expression DIVIDE expression .
        expression -> expression . PLUS expression
        expression -> expression . MINUS expression
        expression -> expression . TIMES expression
        expression -> expression . DIVIDE expression

        $               reduce using rule 4
        PLUS            reduce using rule 4
        MINUS           reduce using rule 4
        TIMES           reduce using rule 4
        DIVIDE          reduce using rule 4
        RPAREN          reduce using rule 4

      ! PLUS            [ shift and go to state 6 ]
      ! MINUS           [ shift and go to state 5 ]
      ! TIMES           [ shift and go to state 4 ]
      ! DIVIDE          [ shift and go to state 7 ]

    state 13

        expression -> LPAREN expression RPAREN .

        $               reduce using rule 6
        PLUS            reduce using rule 6
        MINUS           reduce using rule 6
        TIMES           reduce using rule 6
        DIVIDE          reduce using rule 6
        RPAREN          reduce using rule 6

The different states that appear in this file are a representation of
every possible sequence of valid input tokens allowed by the grammar.
When receiving input tokens, the parser is building up a stack and
looking for matching rules.  Each state keeps track of the grammar
rules that might be in the process of being matched at that point.
Within each rule, the "." character indicates the current location of
the parse within that rule.  In addition, the actions for each valid
input token are listed.
When a shift/reduce or reduce/reduce conflict
arises, rules *not* selected are prefixed with an ``!``.  For
example::

      ! TIMES           [ reduce using rule 2 ]
      ! DIVIDE          [ reduce using rule 2 ]
      ! PLUS            [ shift and go to state 6 ]
      ! MINUS           [ shift and go to state 5 ]

By looking at these rules (and with a little practice), you can
usually track down the source of most parsing conflicts.  It should
also be stressed that not all shift/reduce conflicts are bad.
However, the only way to be sure that they are resolved correctly is
to look at ``parser.out``.

Syntax Error Handling
^^^^^^^^^^^^^^^^^^^^^

If you are creating a parser for production use, the handling of
syntax errors is important.  As a general rule, you don't want a
parser to throw up its hands and stop at the first sign of
trouble.  Instead, you want it to report the error, recover if
possible, and continue parsing so that all of the errors in the input
get reported to the user at once.  This is the standard behavior found
in compilers for languages such as C, C++, and Java.

In PLY, when a syntax error occurs during parsing, the error is
immediately detected (i.e., the parser does not read any more tokens
beyond the source of the error).  However, at this point, the parser
enters a recovery mode that can be used to try and continue further
parsing.  As a general rule, error recovery in LR parsers is a
delicate topic that involves ancient rituals and black-magic.  The
recovery mechanism provided by ``yacc.py`` is comparable to Unix yacc
so you may want to consult a book like O'Reilly's "Lex and Yacc" for some
of the finer details.

When a syntax error occurs, ``yacc.py`` performs the following steps:

1. On the first occurrence of an error, the user-defined ``p_error()``
   function is called with the offending token as an
   argument.  However, if the syntax error is due to reaching the
   end-of-file, ``p_error()`` is called with an argument of ``None``.
   Afterwards, the parser enters an "error-recovery" mode in which it
   will not make future calls to ``p_error()`` until it has
   successfully shifted at least 3 tokens onto the parsing stack.

2. If no recovery action is taken in ``p_error()``, the offending
   lookahead token is replaced with a special ``error`` token.

3. If the offending lookahead token is already set to ``error``, the
   top item of the parsing stack is deleted.

4. If the entire parsing stack is unwound, the parser enters a restart
   state and attempts to start parsing from its initial state.

5. If a grammar rule accepts ``error`` as a token, it will be
   shifted onto the parsing stack.

6. If the top item of the parsing stack is ``error``, lookahead tokens
   will be discarded until the parser can successfully shift a new
   symbol or reduce a rule involving ``error``.

Recovery and resynchronization with error rules
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The most well-behaved approach for handling syntax errors is to write
grammar rules that include the ``error`` token.  For example, suppose
your language had a grammar rule for a print statement like this::

    def p_statement_print(p):
        'statement : PRINT expr SEMI'
        ...

To account for the possibility of a bad expression, you might write an
additional grammar rule like this::

    def p_statement_print_error(p):
        'statement : PRINT error SEMI'
        print("Syntax error in print statement. Bad expression")
Bad expression") - -In this case, the ``error`` token will match any sequence of -tokens that might appear up to the first semicolon that is -encountered. Once the semicolon is reached, the rule will be -invoked and the ``error`` token will go away. - -This type of recovery is sometimes known as parser resynchronization. -The ``error`` token acts as a wildcard for any bad input text and -the token immediately following ``error`` acts as a -synchronization token. - -It is important to note that the ``error`` token usually does not -appear as the last token on the right in an error rule. For example:: - - def p_statement_print_error(p): - 'statement : PRINT error' - print("Syntax error in print statement. Bad expression") - -This is because the first bad token encountered will cause the rule to -be reduced--which may make it difficult to recover if more bad tokens -immediately follow. - -Panic mode recovery -~~~~~~~~~~~~~~~~~~~ - -An alternative error recovery scheme is to enter a panic mode recovery -in which tokens are discarded to a point where the parser might be -able to recover in some sensible manner. - -Panic mode recovery is implemented entirely in the ``p_error()`` -function. For example, this function starts discarding tokens until -it reaches a closing '}'. Then, it restarts the parser in its initial -state:: - - def p_error(p): - print("Whoa. You are seriously hosed.") - if not p: - print("End of File!") - return - - # Read ahead looking for a closing '}' - while True: - tok = parser.token() # Get the next token - if not tok or tok.type == 'RBRACE': - break - parser.restart() - -This function discards the bad token and tells the parser that -the error was ok:: - - def p_error(p): - if p: - print("Syntax error at token", p.type) - # Just discard the token and tell the parser it's okay. - parser.errok() - else: - print("Syntax error at EOF") - -More information on these methods is as follows: - -``parser.errok()`` - This resets the parser state so it doesn't think it's in error-recovery - mode. This will prevent an ``error`` token from being generated and will reset the internal - error counters so that the next syntax error will call ``p_error()`` again. - -``parser.token()`` - This returns the next token on the input stream. - -``parser.restart()``. - This discards the entire parsing stack and resets the parser - to its initial state. - -To supply the next lookahead token to the parser, ``p_error()`` can -return a token. This might be useful if trying to synchronize on -special characters. For example:: - - def p_error(p): - # Read ahead looking for a terminating ";" - while True: - tok = parser.token() # Get the next token - if not tok or tok.type == 'SEMI': break - parser.errok() - - # Return SEMI to the parser as the next lookahead token - return tok - -Keep in mind in that the above error handling functions, ``parser`` is -an instance of the parser created by ``yacc()``. You'll need to save -this instance someplace in your code so that you can refer to it -during error handling. - -Signalling an error from a production -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -If necessary, a production rule can manually force the parser to enter -error recovery. This is done by raising the ``SyntaxError`` exception -like this:: - - def p_production(p): - 'production : some production ...' - raise SyntaxError - -The effect of raising ``SyntaxError`` is the same as if the last -symbol shifted onto the parsing stack was actually a syntax error. 
Signalling an error from a production
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

If necessary, a production rule can manually force the parser to enter
error recovery.  This is done by raising the ``SyntaxError`` exception
like this::

    def p_production(p):
        'production : some production ...'
        raise SyntaxError

The effect of raising ``SyntaxError`` is the same as if the last
symbol shifted onto the parsing stack was actually a syntax error.
Thus, when you do this, the last symbol shifted is popped off of the
parsing stack and the current lookahead token is set to an ``error``
token.  The parser then enters error-recovery mode where it tries to
reduce rules that can accept ``error`` tokens.  The steps that follow
from this point are exactly the same as if a syntax error were
detected and ``p_error()`` were called.

One important aspect of manually setting an error is that the
``p_error()`` function will NOT be called in this case.  If you need
to issue an error message, make sure you do it in the production that
raises ``SyntaxError``.

Note: This feature of PLY is meant to mimic the behavior of the
YYERROR macro in yacc.

When Do Syntax Errors Get Reported?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

In most cases, yacc will handle errors as soon as a bad input token is
detected on the input.  However, be aware that yacc may choose to
delay error handling until after it has reduced one or more grammar
rules first.  This behavior might be unexpected, but it's related to
special states in the underlying parsing table known as "defaulted
states."  A defaulted state is a parsing condition where the same
grammar rule will be reduced regardless of what *valid* token
comes next on the input.  For such states, yacc chooses to go ahead
and reduce the grammar rule *without reading the next input
token*.  If the next token is bad, yacc will eventually get around
to reading it and report a syntax error.  It's just a little unusual
in that you might see some of your grammar rules firing immediately
prior to the syntax error.

Usually, the delayed error reporting with defaulted states is harmless
(and there are other reasons for wanting PLY to behave in this way).
However, if you need to turn this behavior off for some reason, you
can clear the defaulted states table like this::

    parser = yacc.yacc()
    parser.defaulted_states = {}

Disabling defaulted states is not recommended if your grammar makes
use of embedded actions as described in the Embedded Actions section
below.

General comments on error handling
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

For normal types of languages, error recovery with error rules and
resynchronization characters is probably the most reliable
technique.  This is because you can instrument the grammar to catch
errors at selected places where it is relatively easy to recover and
continue parsing.  Panic mode recovery is really only useful in
certain specialized applications where you might want to discard huge
portions of the input text to find a valid restart point.
Line Number and Position Tracking
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Position tracking is often a tricky problem when writing compilers.
By default, PLY tracks the line number and position of all tokens.
This information is available using the following functions:

``p.lineno(num)``.  Return the line number for symbol *num*

``p.lexpos(num)``.  Return the lexing position for symbol *num*

For example::

    def p_expression(p):
        'expression : expression PLUS expression'
        line  = p.lineno(2)   # line number of the PLUS token
        index = p.lexpos(2)   # Position of the PLUS token

As an optional feature, ``yacc.py`` can automatically track line
numbers and positions for all of the grammar symbols as well.
However, this extra tracking requires extra processing and can
significantly slow down parsing.  Therefore, it must be enabled by
passing the ``tracking=True`` option to ``yacc.parse()``.  For
example::

    yacc.parse(data,tracking=True)

Once enabled, the ``lineno()`` and ``lexpos()`` methods work for all
grammar symbols.  In addition, two additional methods can be used:

``p.linespan(num)``.  Return a tuple (startline,endline) with the starting and ending line number for symbol *num*.

``p.lexspan(num)``.  Return a tuple (start,end) with the starting and ending positions for symbol *num*.

For example::

    def p_expression(p):
        'expression : expression PLUS expression'
        p.lineno(1)   # Line number of the left expression
        p.lineno(2)   # line number of the PLUS operator
        p.lineno(3)   # line number of the right expression
        ...
        start, end   = p.linespan(3)   # Start,end lines of the right expression
        starti, endi = p.lexspan(3)    # Start,end positions of right expression

Note: The ``lexspan()`` function only returns the range of values up
to the start of the last grammar symbol.

Although it may be convenient for PLY to track position information on
all grammar symbols, this is often unnecessary.  For example, if you
are merely using line number information in an error message, you can
often just key off of a specific token in the grammar rule.  For
example::

    def p_bad_func(p):
        'funccall : fname LPAREN error RPAREN'
        # Line number reported from LPAREN token
        print("Bad function call at line", p.lineno(2))

Similarly, you may get better parsing performance if you only
selectively propagate line number information where it's needed using
the ``p.set_lineno()`` method.  For example::

    def p_fname(p):
        'fname : ID'
        p[0] = p[1]
        p.set_lineno(0, p.lineno(1))

PLY doesn't retain line number information from rules that have
already been parsed.  If you are building an abstract syntax tree and
need to have line numbers, you should make sure that the line numbers
appear in the tree itself.

AST Construction
^^^^^^^^^^^^^^^^

``yacc.py`` provides no special functions for constructing an abstract
syntax tree.  However, such construction is easy enough to do on your
own.

A minimal way to construct a tree is to create and propagate a
tuple or list in each grammar rule function.  There are many possible
ways to do this, but one example would be something like this::

    def p_expression_binop(p):
        '''expression : expression PLUS expression
                      | expression MINUS expression
                      | expression TIMES expression
                      | expression DIVIDE expression'''

        p[0] = ('binary-expression', p[2], p[1], p[3])

    def p_expression_group(p):
        'expression : LPAREN expression RPAREN'
        p[0] = ('group-expression', p[2])

    def p_expression_number(p):
        'expression : NUMBER'
        p[0] = ('number-expression', p[1])
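Since these tuples carry their own type tags, a small recursive walker
is enough to process the result.  For instance, a sketch of an
evaluator for the tuple representation just shown::

    def evaluate(node):
        tag = node[0]
        if tag == 'number-expression':
            return node[1]
        elif tag == 'group-expression':
            return evaluate(node[1])
        elif tag == 'binary-expression':
            # Tuple layout above is (tag, operator, left, right)
            op, left, right = node[1], node[2], node[3]
            if op == '+':
                return evaluate(left) + evaluate(right)
            elif op == '-':
                return evaluate(left) - evaluate(right)
            elif op == '*':
                return evaluate(left) * evaluate(right)
            else:
                return evaluate(left) / evaluate(right)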
Another approach is to create a set of data structures for different
kinds of abstract syntax tree nodes and assign nodes to ``p[0]`` in
each rule.  For example::

    class Expr: pass

    class BinOp(Expr):
        def __init__(self, left, op, right):
            self.left = left
            self.right = right
            self.op = op

    class Number(Expr):
        def __init__(self, value):
            self.value = value

    def p_expression_binop(p):
        '''expression : expression PLUS expression
                      | expression MINUS expression
                      | expression TIMES expression
                      | expression DIVIDE expression'''

        p[0] = BinOp(p[1], p[2], p[3])

    def p_expression_group(p):
        'expression : LPAREN expression RPAREN'
        p[0] = p[2]

    def p_expression_number(p):
        'expression : NUMBER'
        p[0] = Number(p[1])

The advantage to this approach is that it may make it easier to attach
more complicated semantics, type checking, code generation, and other
features to the node classes.

To simplify tree traversal, it may make sense to pick a very generic
tree structure for your parse tree nodes.  For example::

    class Node:
        def __init__(self, type, children=None, leaf=None):
            self.type = type
            if children:
                self.children = children
            else:
                self.children = []
            self.leaf = leaf

    def p_expression_binop(p):
        '''expression : expression PLUS expression
                      | expression MINUS expression
                      | expression TIMES expression
                      | expression DIVIDE expression'''

        p[0] = Node("binop", [p[1], p[3]], p[2])

Embedded Actions
^^^^^^^^^^^^^^^^

The parsing technique used by yacc only allows actions to be executed
at the end of a rule.  For example, suppose you have a rule like
this::

    def p_foo(p):
        "foo : A B C D"
        print("Parsed a foo", p[1], p[2], p[3], p[4])

In this case, the supplied action code only executes after all of the
symbols ``A``, ``B``, ``C``, and ``D`` have been parsed.  Sometimes,
however, it is useful to execute small code fragments during
intermediate stages of parsing.  For example, suppose you wanted to
perform some action immediately after ``A`` has been parsed.  To do
this, write an empty rule like this::

    def p_foo(p):
        "foo : A seen_A B C D"
        print("Parsed a foo", p[1], p[3], p[4], p[5])
        print("seen_A returned", p[2])

    def p_seen_A(p):
        "seen_A :"
        print("Saw an A = ", p[-1])   # Access grammar symbol to left
        p[0] = some_value             # Assign value to seen_A

In this example, the empty ``seen_A`` rule executes immediately after
``A`` is shifted onto the parsing stack.  Within this rule, ``p[-1]``
refers to the symbol on the stack that appears immediately to the left
of the ``seen_A`` symbol.  In this case, it would be the value of
``A`` in the ``foo`` rule immediately above.  Like other rules, a
value can be returned from an embedded action by assigning it
to ``p[0]``.

The use of embedded actions can sometimes introduce extra shift/reduce
conflicts.  For example, this grammar has no conflicts::

    def p_foo(p):
        """foo : abcd
               | abcx"""

    def p_abcd(p):
        "abcd : A B C D"

    def p_abcx(p):
        "abcx : A B C X"

However, if you insert an embedded action into one of the rules like
this::

    def p_foo(p):
        """foo : abcd
               | abcx"""

    def p_abcd(p):
        "abcd : A B C D"

    def p_abcx(p):
        "abcx : A B seen_AB C X"

    def p_seen_AB(p):
        "seen_AB :"

an extra shift-reduce conflict will be introduced.  This conflict is
caused by the fact that the same symbol ``C`` appears next in both the
``abcd`` and ``abcx`` rules.  The parser can either shift the symbol
(``abcd`` rule) or reduce the empty rule ``seen_AB`` (``abcx`` rule).

A common use of embedded rules is to control other aspects of parsing
such as scoping of local variables.  For example, if you were parsing
C code, you might write code like this::

    def p_statements_block(p):
        "statements : LBRACE new_scope statements RBRACE"
        # Action code
        ...
        pop_scope()        # Return to previous scope

    def p_new_scope(p):
        "new_scope :"
        # Create a new scope for local variables
        s = new_scope()
        push_scope(s)
        ...

In this case, the embedded action ``new_scope`` executes
immediately after a ``LBRACE`` (``{``) symbol is parsed.
This might adjust internal symbol tables and other aspects of the
parser.  Upon completion of the rule in ``p_statements_block``, code
might undo the operations performed in the embedded action
(e.g., ``pop_scope()``).

Miscellaneous Yacc Notes
^^^^^^^^^^^^^^^^^^^^^^^^

1. By default, ``yacc.py`` relies on ``lex.py`` for tokenizing.  However, an
   alternative tokenizer can be supplied as follows::

       result = parser.parse(text, lexer=x)

   In this case, ``x`` must be a Lexer object that minimally has an ``x.token()``
   method for retrieving the next token.  If an input string is given to
   ``parse()``, the lexer must also have an ``x.input()`` method.  A sketch of
   such an object appears right after this list.

2. To print copious amounts of debugging during parsing, use::

       parser.parse(input_text, debug=True)

3. Since LR parsing is driven by tables, the performance of the parser is largely independent of the
   size of the grammar.  The biggest bottlenecks will be the lexer and the complexity of the code in your grammar rules.

4. ``yacc()`` also allows parsers to be defined as classes and as closures (see the section on alternative specification of
   lexers).  However, be aware that only one parser may be defined in a single module (source file).  There are various
   error checks and validation steps that may issue confusing error messages if you try to define multiple parsers
   in the same source file.
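As promised in note 1, here is a sketch of a minimal hand-rolled
tokenizer.  The ``Token`` and ``WordLexer`` classes are hypothetical
(they are not part of PLY); the only real requirements, per the note
above, are the ``token()`` and ``input()`` methods and token objects
carrying ``type``, ``value``, ``lineno``, and ``lexpos`` attributes::

    class Token:
        pass

    class WordLexer:
        # Needed only if a string is handed directly to parse()
        def input(self, text):
            self.words = iter(text.split())
            self.pos = 0

        # Required: return the next token object, or None at end of input
        def token(self):
            word = next(self.words, None)
            if word is None:
                return None
            t = Token()
            t.type = 'WORD'        # assumes the grammar declares a WORD token
            t.value = word
            t.lineno = 1
            t.lexpos = self.pos
            self.pos += len(word) + 1
            return t

    result = parser.parse(data, lexer=WordLexer())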
Multiple Parsers and Lexers
---------------------------

In advanced parsing applications, you may want to have multiple
parsers and lexers.

As a general rule, this isn't a problem.  However, to make it work,
you need to carefully make sure everything gets hooked up correctly.
First, make sure you save the objects returned by ``lex()`` and
``yacc()``.  For example::

    lexer  = lex.lex()    # Return lexer object
    parser = yacc.yacc()  # Return parser object

Next, when parsing, make sure you give the ``parse()`` function a
reference to the lexer it should be using.  For example::

    parser.parse(text,lexer=lexer)

If you forget to do this, the parser will use the last lexer
created--which is not always what you want.

Within lexer and parser rule functions, these objects are also
available.  In the lexer, the "lexer" attribute of a token refers to
the lexer object that triggered the rule.  For example::

    def t_NUMBER(t):
        r'\d+'
        ...
        print(t.lexer)            # Show lexer object

In the parser, the "lexer" and "parser" attributes refer to the lexer
and parser objects respectively::

    def p_expr_plus(p):
        'expr : expr PLUS expr'
        ...
        print(p.parser)           # Show parser object
        print(p.lexer)            # Show lexer object

If necessary, arbitrary attributes can be attached to the lexer or
parser object.  For example, if you wanted to have different parsing
modes, you could attach a mode attribute to the parser object and look
at it later.
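For instance, a sketch of such a mode attribute (the attribute name is
illustrative, not a PLY feature)::

    def p_statement_expr(p):
        'statement : expression'
        if p.parser.mode == 'interactive':   # read the attribute at parse time
            print(p[1])

    parser = yacc.yacc()
    parser.mode = 'interactive'              # arbitrary application-defined attribute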
Advanced Debugging
------------------

Debugging a compiler is typically not an easy task.  PLY provides some
diagnostic capabilities through the use of Python's
``logging`` module.  The next two sections describe this:

Debugging the lex() and yacc() commands
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Both the ``lex()`` and ``yacc()`` commands have a debugging mode that
can be enabled using the ``debug`` flag.  For example::

    lex.lex(debug=True)
    yacc.yacc(debug=True)

Normally, the output produced by debugging is routed to either
standard error or, in the case of ``yacc()``, to a file
``parser.out``.  This output can be more carefully controlled by
supplying a logging object.  Here is an example that adds information
about where different debugging messages are coming from::

    # Set up a logging object
    import logging
    logging.basicConfig(
        level    = logging.DEBUG,
        filename = "parselog.txt",
        filemode = "w",
        format   = "%(filename)10s:%(lineno)4d:%(message)s"
    )
    log = logging.getLogger()

    lex.lex(debug=True,debuglog=log)
    yacc.yacc(debug=True,debuglog=log)

If you supply a custom logger, the amount of debugging information
produced can be controlled by setting the logging level.  Typically,
debugging messages are either issued at the ``DEBUG``, ``INFO``, or
``WARNING`` levels.

PLY's error messages and warnings are also produced using the logging
interface.  This can be controlled by passing a logging object using
the ``errorlog`` parameter::

    lex.lex(errorlog=log)
    yacc.yacc(errorlog=log)

If you want to completely silence warnings, you can either pass in a
logging object with an appropriate filter level or use the
``NullLogger`` object defined in either ``lex`` or ``yacc``.  For
example::

    yacc.yacc(errorlog=yacc.NullLogger())

Run-time Debugging
^^^^^^^^^^^^^^^^^^

To enable run-time debugging of a parser, use the ``debug`` option to
parse.  This option can either be an integer (which turns
debugging on or off) or an instance of a logger object.  For example::

    log = logging.getLogger()
    parser.parse(input,debug=log)

If a logging object is passed, you can use its filtering level to
control how much output gets generated.  The ``INFO`` level is used to
produce information about rule reductions.  The ``DEBUG`` level will
show information about the parsing stack, token shifts, and other
details.  The ``ERROR`` level shows information related to parsing
errors.

For very complicated problems, you should pass in a logging object that
redirects to a file where you can more easily inspect the output after
execution.

Using Python -OO Mode
---------------------

Because of PLY's reliance on docstrings, it is not compatible with
``-OO`` mode of the interpreter (which strips docstrings).  If you
want to support this, you'll need to write a decorator or some other
tool to attach docstrings to functions.  For example::

    def _(doc):
        def decorate(func):
            func.__doc__ = doc
            return func
        return decorate

    @_("assignment : expr PLUS expr")
    def p_assignment(p):
        ...

PLY does not provide such a decorator by default.

Where to go from here?
----------------------

The ``examples`` directory of the PLY distribution contains several
simple examples.  Please consult a compilers textbook for the theory
and underlying implementation details of LR parsing.
diff --git a/example/BASIC/basic.py b/example/BASIC/basic.py
index 8a8a500..a8a0ac9 100644
--- a/example/BASIC/basic.py
+++ b/example/BASIC/basic.py
@@ -1,9 +1,6 @@
 # An implementation of Dartmouth BASIC (1964)
 #
-import sys
-sys.path.insert(0, "../..")
-
 import basiclex
 import basparse
 import basinterp
diff --git a/example/BASIC/basiclog.py b/example/BASIC/basiclog.py
index 9258e29..2463bc4 100644
--- a/example/BASIC/basiclog.py
+++ b/example/BASIC/basiclog.py
@@ -2,8 +2,6 @@
 #
 import sys
-sys.path.insert(0, "../..")
-
 if sys.version_info[0] >= 3:
     raw_input = input
diff --git a/example/ansic/clex.py b/example/ansic/clex.py
index 4bde1d7..5d99416 100644
--- a/example/ansic/clex.py
+++ b/example/ansic/clex.py
@@ -4,9 +4,6 @@
 # A lexer for ANSI C.
 # ----------------------------------------------------------------------
-import sys
-sys.path.insert(0, "../..")
-
 import ply.lex as lex

 # Reserved words
diff --git a/example/ansic/cparse.py b/example/ansic/cparse.py
index 5fe9bce..21c3478 100644
--- a/example/ansic/cparse.py
+++ b/example/ansic/cparse.py
@@ -4,7 +4,6 @@
 # Simple parser for ANSI C.  Based on the grammar in K&R, 2nd Ed.
 # -----------------------------------------------------------------------------
-import sys
 import clex
 import ply.yacc as yacc
diff --git a/example/calc/calc.py b/example/calc/calc.py
index 406d83c..3eae2b2 100644
--- a/example/calc/calc.py
+++ b/example/calc/calc.py
@@ -5,9 +5,6 @@
 # "Lex and Yacc", p. 63.
 # -----------------------------------------------------------------------------
-import sys
-sys.path.insert(0, "../..")
-
 tokens = (
     'NAME', 'NUMBER',
 )
diff --git a/example/calcdebug/calc.py b/example/calcdebug/calc.py
index 386000e..35de8ab 100644
--- a/example/calcdebug/calc.py
+++ b/example/calcdebug/calc.py
@@ -5,9 +5,6 @@
 # with output routed to a logging object.
 # -----------------------------------------------------------------------------
-import sys
-sys.path.insert(0, "../..")
-
 tokens = (
     'NAME', 'NUMBER',
 )
diff --git a/example/calceof/calc.py b/example/calceof/calc.py
index 7bb7e0f..1ec8298 100644
--- a/example/calceof/calc.py
+++ b/example/calceof/calc.py
@@ -5,9 +5,6 @@
 # demonstrates the use of the t_eof() rule.
 # -----------------------------------------------------------------------------
-import sys
-sys.path.insert(0, "../..")
-
 tokens = (
     'NAME', 'NUMBER',
 )
diff --git a/example/classcalc/calc.py b/example/classcalc/calc.py
index 6f35195..93bfeaa 100755
--- a/example/classcalc/calc.py
+++ b/example/classcalc/calc.py
@@ -9,9 +9,6 @@
 # Class-based example contributed to PLY by David McNab
 # -----------------------------------------------------------------------------
-import sys
-sys.path.insert(0, "../..")
-
 import ply.lex as lex
 import ply.yacc as yacc
 import os
diff --git a/example/closurecalc/calc.py b/example/closurecalc/calc.py
index 59c9d6f..4fc0cd7 100644
--- a/example/closurecalc/calc.py
+++ b/example/closurecalc/calc.py
@@ -6,9 +6,6 @@
 # lexing rules, parsing rules, and internal state are held inside the function.
 # -----------------------------------------------------------------------------
-import sys
-sys.path.insert(0, "../..")
-
 # Make a calculator function
diff --git a/example/yply/ylex.py b/example/yply/ylex.py
index 16410e2..6fb3692 100644
--- a/example/yply/ylex.py
+++ b/example/yply/ylex.py
@@ -3,9 +3,6 @@
 # Author: David Beazley (dave@dabeaz.com)
 # Date  : October 2, 2006
-import sys
-sys.path.append("../..")
-
 from ply import *

 tokens = (
diff --git a/example/yply/yply.py b/example/yply/yply.py
index e24616c..b730fbb 100755
--- a/example/yply/yply.py
+++ b/example/yply/yply.py
@@ -21,7 +21,6 @@
 #
 import sys
-sys.path.insert(0, "../..")

 import ylex
 import yparse
diff --git a/install.py b/install.py
deleted file mode 100644
index fc27bc0..0000000
--- a/install.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import sys
-import shutil
-import ply, ply.lex, ply.yacc
-from pathlib import Path
-
-TARGET = "ply"
-
-def main(argv):
-    user = False
-    if len(argv) > 1:
-        if argv[1] != '--user':
-            raise SystemExit('usage: python install.py [--user]')
-        else:
-            user = True
-    site_packages = [p for p in sys.path if p.endswith('site-packages')]
-    path = Path(site_packages[0] if user else site_packages[-1]) / TARGET
-    if path.exists():
-        shutil.rmtree(path)
-    shutil.copytree(TARGET, path)
-    print(f'{TARGET} installed at {path}')
-
-if __name__ == '__main__':
-    main(sys.argv)
-
-
-
-
-
diff --git a/ply/__init__.py b/ply/__init__.py
deleted file mode 100644
index 8783862..0000000
--- a/ply/__init__.py
+++ /dev/null
@@ -1,6 +0,0 @@
-# PLY package
-# Author: David Beazley (dave@dabeaz.com)
-# https://dabeaz.com/ply/index.html
-
-__version__ = '4.0'
-__all__ = ['lex','yacc']
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..9787c3b
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,3 @@
+[build-system]
+requires = ["setuptools", "wheel"]
+build-backend = "setuptools.build_meta"
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..b4ba4ed
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,18 @@
+[metadata]
+name = ply
+version = 2022.10.27
+url = https://github.com/dabeaz/ply
+author = David Beazley
+author_email = "David Beazley" <dave@dabeaz.com>
+description = "PLY - Sly Lex Yacc"
+long_description = "PLY is an implementation of lex and yacc. No longer maintained on PyPI. Latest version on GitHub."
+license = MIT
+license_files = LICENSE
+classifiers =
+    License :: OSI Approved :: MIT License
+
+[options]
+package_dir =
+    =src
+
+packages = ply
diff --git a/setup.md b/setup.md
deleted file mode 100644
index 3b7508c..0000000
--- a/setup.md
+++ /dev/null
@@ -1,36 +0,0 @@
-# Maintained, No Package Releases
-
-PLY is maintained software, but no longer produces package releases.
-There is no `setup.py` file.  It is not something that you install
-with `pip` or a similar tool.  PLY is free software which means that
-you are free to COPY the necessary code from PLY into your project and
-use it in any manner that you wish.
-
-If you'd simply like to play around with PLY in a virtual environment
-or install it into your normal Python distribution, use the included
-install.py script:
-
-    $ python install.py
-
-Why this policy?  PLY is a highly specialized tool for expert-level
-programmers who are writing parsers and compilers.  If you are writing
-a compiler, there's a good chance that it's part of a substantially
-larger project.  Managing complexity and external dependencies (such
-as PLY) in such projects is an ongoing challenge.  However, the truth
-of the matter is that PLY just isn't that big.  All of the core
-functionality is contained in just two files.  PLY has no external
-dependencies of its own.  It changes very rarely.  Plus, there are
-various customizations that you might want to apply to how it works.
-So, all things equal, it's probably better for you to copy it.  This
-also protects you in the event that some other project decides to use
-PLY in a different way (or from a different version) than that used
-in your project.
-
-But what about getting all of the latest improvements and bug fixes?
-What improvements?  PLY is implementing a 1970s-era parsing algorithm.
-It's not cutting edge.  As for bug fixes, you'll know pretty rapidly
-if PLY works for your project or not.  If it's working, there's
-literally no reason to ever upgrade it.  Keep using the version of code
-that you copied.  If you think you've found a bug, check back with the
-repository to see if it's been fixed.  Or submit it as an issue so that
-it can be looked at.
diff --git a/src/ply/__init__.py b/src/ply/__init__.py
new file mode 100644
index 0000000..45f28c5
--- /dev/null
+++ b/src/ply/__init__.py
@@ -0,0 +1,5 @@
+# PLY package
+# Author: David Beazley (dave@dabeaz.com)
+# https://github.com/dabeaz/ply
+
+__version__ = '2022.10.27'
diff --git a/ply/lex.py b/src/ply/lex.py
index 3b670ef..de011fe 100644
--- a/ply/lex.py
+++ b/src/ply/lex.py
@@ -1,7 +1,7 @@
 # -----------------------------------------------------------------------------
 # ply: lex.py
 #
-# Copyright (C) 2001-2020
+# Copyright (C) 2001-2022
 # David M. Beazley (Dabeaz LLC)
 # All rights reserved.
 #
diff --git a/ply/yacc.py b/src/ply/yacc.py
index 5a750d7..6528796 100644
--- a/ply/yacc.py
+++ b/src/ply/yacc.py
@@ -1,7 +1,7 @@
 # -----------------------------------------------------------------------------
 # ply: yacc.py
 #
-# Copyright (C) 2001-2020
+# Copyright (C) 2001-2022
 # David M. Beazley (Dabeaz LLC)
 # All rights reserved.
 #
diff --git a/test/testcpp.py b/test/testcpp.py
deleted file mode 100644
index dbfb3e4..0000000
--- a/test/testcpp.py
+++ /dev/null
@@ -1,153 +0,0 @@
-from unittest import TestCase, main
-
-from multiprocessing import Process, Queue
-from six.moves.queue import Empty
-
-import sys
-import locale
-
-if ".." not in sys.path:
-    sys.path.insert(0, "..")
-
-from ply.lex import lex
-from ply.cpp import *
-
-
-def preprocessing(in_, out_queue):
-    out = None
-
-    try:
-        p = Preprocessor(lex())
-        p.parse(in_)
-        tokens = [t.value for t in p.parser]
-        out = "".join(tokens)
-    finally:
-        out_queue.put(out)
-
-class CPPTests(TestCase):
-    "Tests related to ANSI-C style lexical preprocessor."
-
-    def __test_preprocessing(self, in_, expected, time_limit = 1.0):
-        out_queue = Queue()
-
-        preprocessor = Process(
-            name = "PLY`s C preprocessor",
-            target = preprocessing,
-            args = (in_, out_queue)
-        )
-
-        preprocessor.start()
-
-        try:
-            out = out_queue.get(timeout = time_limit)
-        except Empty:
-            preprocessor.terminate()
-            raise RuntimeError("Time limit exceeded!")
-        else:
-            self.assertMultiLineEqual(out, expected)
-
-    def test_infinite_argument_expansion(self):
-        # CPP does not drags set of currently expanded macros through macro
-        # arguments expansion. If there is a match between an argument value
-        # and name of an already expanded macro then CPP falls into infinite
-        # recursion.
-        self.__test_preprocessing("""\
-#define a(x) x
-#define b a(b)
-b
-""" , """\
-
-
-b"""
-        )
-
-
-    def test_concatenation(self):
-        self.__test_preprocessing("""\
-#define a(x) x##_
-#define b(x) _##x
-#define c(x) _##x##_
-#define d(x,y) _##x##y##_
-
-a(i)
-b(j)
-c(k)
-d(q,s)"""
-            , """\
-
-
-
-
-
-i_
-_j
-_k_
-_qs_"""
-        )
-
-    def test_deadloop_macro(self):
-        # If there is a word which equals to name of a parametrized macro, then
-        # attempt to expand such word as a macro manages the parser to fall
-        # into an infinite loop.
-
-        self.__test_preprocessing("""\
-#define a(x) x
-
-a;"""
-            , """\
-
-
-a;"""
-        )
-
-    def test_index_error(self):
-        # If there are no tokens after a word ("a") which equals to name of
-        # a parameterized macro, then attempt to expand this word leads to
-        # IndexError.
-
-        self.__test_preprocessing("""\
-#define a(x) x
-
-a"""
-            , """\
-
-
-a"""
-        )
-
-    def test_evalexpr(self):
-        # #if 1 != 2 is not processed correctly; undefined values are converted
-        # to 0L instead of 0 (issue #195)
-        #
-        self.__test_preprocessing("""\
-#if (1!=0) && (!x || (!(1==2)))
-a;
-#else
-b;
-#endif
-"""
-            , """\
-
-a;
-
-"""
-        )
-
-    def test_include_nonascii(self):
-        # Issue #196: #included files are read using the current locale's
-        # getdefaultencoding. if a #included file contains non-ascii characters,
-        # while default encoding is e.g. US_ASCII, this causes an error
-        locale.setlocale(locale.LC_ALL, 'C')
-        self.__test_preprocessing("""\
-#include "test_cpp_nonascii.c"
-x;
-
-"""
-            , """\
-
-
-1;
-"""
-        )
-
-main()
diff --git a/test/README b/tests/README
index 03b167c..03b167c 100644
--- a/test/README
+++ b/tests/README
diff --git a/test/calclex.py b/tests/calclex.py
index 030a986..4862753 100644
--- a/test/calclex.py
+++ b/tests/calclex.py
@@ -1,9 +1,6 @@
 # -----------------------------------------------------------------------------
 # calclex.py
 # -----------------------------------------------------------------------------
-import sys
-
-if ".." not in sys.path: sys.path.insert(0,"..")
 import ply.lex as lex

 tokens = (
diff --git a/test/cleanup.sh b/tests/cleanup.sh
index 9374f2c..9374f2c 100755
--- a/test/cleanup.sh
+++ b/tests/cleanup.sh
diff --git a/test/lex_closure.py b/tests/lex_closure.py
index 30ee679..0b34cff 100644
--- a/test/lex_closure.py
+++ b/tests/lex_closure.py
@@ -1,9 +1,6 @@
 # -----------------------------------------------------------------------------
 # lex_closure.py
 # -----------------------------------------------------------------------------
-import sys
-
-if ".." not in sys.path: sys.path.insert(0,"..")
 import ply.lex as lex

 tokens = (
diff --git a/test/lex_doc1.py b/tests/lex_doc1.py
index 8a2bfcc..4b62609 100644
--- a/test/lex_doc1.py
+++ b/tests/lex_doc1.py
@@ -2,9 +2,6 @@
 #
 # Missing documentation string
-import sys
-if ".." not in sys.path: sys.path.insert(0,"..")
-
 import ply.lex as lex

 tokens = [
diff --git a/test/lex_dup1.py b/tests/lex_dup1.py
index fd04cdb..b9dc3c5 100644
--- a/test/lex_dup1.py
+++ b/tests/lex_dup1.py
@@ -2,9 +2,6 @@
 #
 # Duplicated rule specifiers
-import sys
-if ".." not in sys.path: sys.path.insert(0,"..")
-
 import ply.lex as lex

 tokens = [
diff --git a/test/lex_dup2.py b/tests/lex_dup2.py
index 870e5e7..47bf5b5 100644
--- a/test/lex_dup2.py
+++ b/tests/lex_dup2.py
@@ -2,9 +2,6 @@
 #
 # Duplicated rule specifiers
-import sys
-if ".." not in sys.path: sys.path.insert(0,"..")
not in sys.path: sys.path.insert(0,"..") - import ply.lex as lex tokens = [ diff --git a/test/lex_dup3.py b/tests/lex_dup3.py index 94b5592..5398f48 100644 --- a/test/lex_dup3.py +++ b/tests/lex_dup3.py @@ -2,9 +2,6 @@ # # Duplicated rule specifiers -import sys -if ".." not in sys.path: sys.path.insert(0,"..") - import ply.lex as lex tokens = [ diff --git a/test/lex_empty.py b/tests/lex_empty.py index e0368bf..d0baf31 100644 --- a/test/lex_empty.py +++ b/tests/lex_empty.py @@ -2,9 +2,6 @@ # # No rules defined -import sys -if ".." not in sys.path: sys.path.insert(0,"..") - import ply.lex as lex tokens = [ diff --git a/test/lex_error1.py b/tests/lex_error1.py index 4508a80..fec0282 100644 --- a/test/lex_error1.py +++ b/tests/lex_error1.py @@ -2,9 +2,6 @@ # # Missing t_error() rule -import sys -if ".." not in sys.path: sys.path.insert(0,"..") - import ply.lex as lex tokens = [ diff --git a/test/lex_error2.py b/tests/lex_error2.py index 8040d39..8914ab6 100644 --- a/test/lex_error2.py +++ b/tests/lex_error2.py @@ -2,9 +2,6 @@ # # t_error defined, but not function -import sys -if ".." not in sys.path: sys.path.insert(0,"..") - import ply.lex as lex tokens = [ diff --git a/test/lex_error3.py b/tests/lex_error3.py index 1feefb6..71ca903 100644 --- a/test/lex_error3.py +++ b/tests/lex_error3.py @@ -2,9 +2,6 @@ # # t_error defined as function, but with wrong # args -import sys -if ".." not in sys.path: sys.path.insert(0,"..") - import ply.lex as lex tokens = [ diff --git a/test/lex_error4.py b/tests/lex_error4.py index f4f48db..5387808 100644 --- a/test/lex_error4.py +++ b/tests/lex_error4.py @@ -2,9 +2,6 @@ # # t_error defined as function, but too many args -import sys -if ".." not in sys.path: sys.path.insert(0,"..") - import ply.lex as lex tokens = [ diff --git a/test/lex_hedit.py b/tests/lex_hedit.py index 34f15a1..9f724d0 100644 --- a/test/lex_hedit.py +++ b/tests/lex_hedit.py @@ -13,8 +13,6 @@ # This example shows how to modify the state of the lexer to parse # such tokens # ----------------------------------------------------------------------------- -import sys -if ".." not in sys.path: sys.path.insert(0,"..") import ply.lex as lex diff --git a/test/lex_ignore.py b/tests/lex_ignore.py index 6c43b4c..b31fb39 100644 --- a/test/lex_ignore.py +++ b/tests/lex_ignore.py @@ -2,9 +2,6 @@ # # Improperly specific ignore declaration -import sys -if ".." not in sys.path: sys.path.insert(0,"..") - import ply.lex as lex tokens = [ diff --git a/test/lex_ignore2.py b/tests/lex_ignore2.py index f60987a..de0958a 100644 --- a/test/lex_ignore2.py +++ b/tests/lex_ignore2.py @@ -2,9 +2,6 @@ # # ignore declaration as a raw string -import sys -if ".." not in sys.path: sys.path.insert(0,"..") - import ply.lex as lex tokens = [ diff --git a/test/lex_literal1.py b/tests/lex_literal1.py index db389c3..510b379 100644 --- a/test/lex_literal1.py +++ b/tests/lex_literal1.py @@ -2,9 +2,6 @@ # # Bad literal specification -import sys -if ".." not in sys.path: sys.path.insert(0,"..") - import ply.lex as lex tokens = [ diff --git a/test/lex_literal2.py b/tests/lex_literal2.py index b50b92c..a7a2c56 100644 --- a/test/lex_literal2.py +++ b/tests/lex_literal2.py @@ -2,9 +2,6 @@ # # Bad literal specification -import sys -if ".." 
not in sys.path: sys.path.insert(0,"..") - import ply.lex as lex tokens = [ diff --git a/test/lex_literal3.py b/tests/lex_literal3.py index 91ab980..9d697c9 100644 --- a/test/lex_literal3.py +++ b/tests/lex_literal3.py @@ -3,9 +3,6 @@ # An empty literal specification given as a list # Issue 8 : Literals empty list causes IndexError -import sys -if ".." not in sys.path: sys.path.insert(0,"..") - import ply.lex as lex tokens = [ diff --git a/test/lex_many_tokens.py b/tests/lex_many_tokens.py index 81ae57a..02e3a11 100644 --- a/test/lex_many_tokens.py +++ b/tests/lex_many_tokens.py @@ -4,8 +4,6 @@ # 100-group limit of the re module) import sys -if ".." not in sys.path: sys.path.insert(0,"..") - import ply.lex as lex tokens = ["TOK%d" % i for i in range(1000)] diff --git a/test/lex_module.py b/tests/lex_module.py index 8bdd3ed..0fa8544 100644 --- a/test/lex_module.py +++ b/tests/lex_module.py @@ -1,9 +1,6 @@ # lex_module.py # -import sys -if ".." not in sys.path: sys.path.insert(0,"..") - import ply.lex as lex import lex_module_import lex.lex(module=lex_module_import) diff --git a/test/lex_module_import.py b/tests/lex_module_import.py index df42082..df42082 100644 --- a/test/lex_module_import.py +++ b/tests/lex_module_import.py diff --git a/test/lex_object.py b/tests/lex_object.py index 7e9f389..2fa7537 100644 --- a/test/lex_object.py +++ b/tests/lex_object.py @@ -1,9 +1,7 @@ # ----------------------------------------------------------------------------- # lex_object.py # ----------------------------------------------------------------------------- -import sys -if ".." not in sys.path: sys.path.insert(0,"..") import ply.lex as lex class CalcLexer: diff --git a/test/lex_re1.py b/tests/lex_re1.py index 5be7aef..d6b7c38 100644 --- a/test/lex_re1.py +++ b/tests/lex_re1.py @@ -2,9 +2,6 @@ # # Bad regular expression in a string -import sys -if ".." not in sys.path: sys.path.insert(0,"..") - import ply.lex as lex tokens = [ diff --git a/test/lex_re2.py b/tests/lex_re2.py index 8dfb8e3..f5ce695 100644 --- a/test/lex_re2.py +++ b/tests/lex_re2.py @@ -2,9 +2,6 @@ # # Regular expression rule matches empty string -import sys -if ".." not in sys.path: sys.path.insert(0,"..") - import ply.lex as lex tokens = [ diff --git a/test/lex_re3.py b/tests/lex_re3.py index e179925..8f594f3 100644 --- a/test/lex_re3.py +++ b/tests/lex_re3.py @@ -2,9 +2,6 @@ # # Regular expression rule matches empty string -import sys -if ".." not in sys.path: sys.path.insert(0,"..") - import ply.lex as lex tokens = [ diff --git a/test/lex_rule1.py b/tests/lex_rule1.py index 0406c6f..ae279e9 100644 --- a/test/lex_rule1.py +++ b/tests/lex_rule1.py @@ -2,9 +2,6 @@ # # Rule function with incorrect number of arguments -import sys -if ".." not in sys.path: sys.path.insert(0,"..") - import ply.lex as lex tokens = [ diff --git a/test/lex_rule2.py b/tests/lex_rule2.py index 1c29d87..adefee2 100644 --- a/test/lex_rule2.py +++ b/tests/lex_rule2.py @@ -2,9 +2,6 @@ # # Rule function with incorrect number of arguments -import sys -if ".." not in sys.path: sys.path.insert(0,"..") - import ply.lex as lex tokens = [ diff --git a/test/lex_rule3.py b/tests/lex_rule3.py index 9ea94da..bd61040 100644 --- a/test/lex_rule3.py +++ b/tests/lex_rule3.py @@ -2,9 +2,6 @@ # # Rule function with incorrect number of arguments -import sys -if ".." 
not in sys.path: sys.path.insert(0,"..") - import ply.lex as lex tokens = [ diff --git a/test/lex_state1.py b/tests/lex_state1.py index 7528c91..6fa5e9f 100644 --- a/test/lex_state1.py +++ b/tests/lex_state1.py @@ -2,9 +2,6 @@ # # Bad state declaration -import sys -if ".." not in sys.path: sys.path.insert(0,"..") - import ply.lex as lex tokens = [ diff --git a/test/lex_state2.py b/tests/lex_state2.py index 3aef69e..9300ed7 100644 --- a/test/lex_state2.py +++ b/tests/lex_state2.py @@ -2,9 +2,6 @@ # # Bad state declaration -import sys -if ".." not in sys.path: sys.path.insert(0,"..") - import ply.lex as lex tokens = [ diff --git a/test/lex_state3.py b/tests/lex_state3.py index 616e484..2e5b7cf 100644 --- a/test/lex_state3.py +++ b/tests/lex_state3.py @@ -2,9 +2,6 @@ # # Bad state declaration -import sys -if ".." not in sys.path: sys.path.insert(0,"..") - import ply.lex as lex tokens = [ diff --git a/test/lex_state4.py b/tests/lex_state4.py index 1825016..fb147c8 100644 --- a/test/lex_state4.py +++ b/tests/lex_state4.py @@ -2,9 +2,6 @@ # # Bad state declaration -import sys -if ".." not in sys.path: sys.path.insert(0,"..") - import ply.lex as lex tokens = [ diff --git a/test/lex_state5.py b/tests/lex_state5.py index 4ce828e..fb2f3a7 100644 --- a/test/lex_state5.py +++ b/tests/lex_state5.py @@ -2,9 +2,6 @@ # # Bad state declaration -import sys -if ".." not in sys.path: sys.path.insert(0,"..") - import ply.lex as lex tokens = [ diff --git a/test/lex_state_noerror.py b/tests/lex_state_noerror.py index 90bbea8..54b892e 100644 --- a/test/lex_state_noerror.py +++ b/tests/lex_state_noerror.py @@ -2,9 +2,6 @@ # # Declaration of a state for which no rules are defined -import sys -if ".." not in sys.path: sys.path.insert(0,"..") - import ply.lex as lex tokens = [ diff --git a/test/lex_state_norule.py b/tests/lex_state_norule.py index 64ec6d3..4b979b2 100644 --- a/test/lex_state_norule.py +++ b/tests/lex_state_norule.py @@ -2,9 +2,6 @@ # # Declaration of a state for which no rules are defined -import sys -if ".." not in sys.path: sys.path.insert(0,"..") - import ply.lex as lex tokens = [ diff --git a/test/lex_state_try.py b/tests/lex_state_try.py index fd5ba22..c5f3448 100644 --- a/test/lex_state_try.py +++ b/tests/lex_state_try.py @@ -2,9 +2,6 @@ # # Declaration of a state for which no rules are defined -import sys -if ".." not in sys.path: sys.path.insert(0,"..") - import ply.lex as lex tokens = [ diff --git a/test/lex_token1.py b/tests/lex_token1.py index 6fca300..1001a99 100644 --- a/test/lex_token1.py +++ b/tests/lex_token1.py @@ -2,9 +2,6 @@ # # Tests for absence of tokens variable -import sys -if ".." not in sys.path: sys.path.insert(0,"..") - import ply.lex as lex t_PLUS = r'\+' diff --git a/test/lex_token2.py b/tests/lex_token2.py index 6e65ab0..00e758b 100644 --- a/test/lex_token2.py +++ b/tests/lex_token2.py @@ -2,9 +2,6 @@ # # Tests for tokens of wrong type -import sys -if ".." not in sys.path: sys.path.insert(0,"..") - import ply.lex as lex tokens = "PLUS MINUS NUMBER" diff --git a/test/lex_token3.py b/tests/lex_token3.py index 636452e..3b71b3a 100644 --- a/test/lex_token3.py +++ b/tests/lex_token3.py @@ -2,9 +2,6 @@ # # tokens is right type, but is missing a token for one rule -import sys -if ".." not in sys.path: sys.path.insert(0,"..") - import ply.lex as lex tokens = [ diff --git a/test/lex_token4.py b/tests/lex_token4.py index 52947e9..89bc2e2 100644 --- a/test/lex_token4.py +++ b/tests/lex_token4.py @@ -2,9 +2,6 @@ # # Bad token name -import sys -if ".." 
not in sys.path: sys.path.insert(0,"..") - import ply.lex as lex tokens = [ diff --git a/test/lex_token_dup.py b/tests/lex_token_dup.py index 384f4e9..631535f 100644 --- a/test/lex_token_dup.py +++ b/tests/lex_token_dup.py @@ -2,9 +2,6 @@ # # Duplicate token name in tokens -import sys -if ".." not in sys.path: sys.path.insert(0,"..") - import ply.lex as lex tokens = [ diff --git a/test/test_cpp_nonascii.c b/tests/test_cpp_nonascii.c index 3e97d81..3e97d81 100644 --- a/test/test_cpp_nonascii.c +++ b/tests/test_cpp_nonascii.c diff --git a/test/testlex.py b/tests/testlex.py index 318b47a..7f2adad 100755 --- a/test/testlex.py +++ b/tests/testlex.py @@ -11,7 +11,6 @@ import os import warnings import platform -sys.path.insert(0,"..") sys.tracebacklimit = 0 import ply.lex @@ -97,68 +96,68 @@ class LexErrorWarningTests(unittest.TestCase): def test_lex_doc1(self): self.assertRaises(SyntaxError,run_import,"lex_doc1") result = sys.stderr.getvalue() - self.assert_(check_expected(result, - "lex_doc1.py:18: No regular expression defined for rule 't_NUMBER'\n")) + self.assertTrue(check_expected(result, + "lex_doc1.py:15: No regular expression defined for rule 't_NUMBER'\n")) def test_lex_dup1(self): self.assertRaises(SyntaxError,run_import,"lex_dup1") result = sys.stderr.getvalue() - self.assert_(check_expected(result, - "lex_dup1.py:20: Rule t_NUMBER redefined. Previously defined on line 18\n" )) + self.assertTrue(check_expected(result, + "lex_dup1.py:17: Rule t_NUMBER redefined. Previously defined on line 15\n" )) def test_lex_dup2(self): self.assertRaises(SyntaxError,run_import,"lex_dup2") result = sys.stderr.getvalue() - self.assert_(check_expected(result, - "lex_dup2.py:22: Rule t_NUMBER redefined. Previously defined on line 18\n" )) + self.assertTrue(check_expected(result, + "lex_dup2.py:19: Rule t_NUMBER redefined. Previously defined on line 15\n" )) def test_lex_dup3(self): self.assertRaises(SyntaxError,run_import,"lex_dup3") result = sys.stderr.getvalue() - self.assert_(check_expected(result, - "lex_dup3.py:20: Rule t_NUMBER redefined. Previously defined on line 18\n" )) + self.assertTrue(check_expected(result, + "lex_dup3.py:17: Rule t_NUMBER redefined. 
Previously defined on line 15\n" )) def test_lex_empty(self): self.assertRaises(SyntaxError,run_import,"lex_empty") result = sys.stderr.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "No rules of the form t_rulename are defined\n" "No rules defined for state 'INITIAL'\n")) def test_lex_error1(self): run_import("lex_error1") result = sys.stderr.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "No t_error rule is defined\n")) def test_lex_error2(self): self.assertRaises(SyntaxError,run_import,"lex_error2") result = sys.stderr.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "Rule 't_error' must be defined as a function\n") ) def test_lex_error3(self): self.assertRaises(SyntaxError,run_import,"lex_error3") result = sys.stderr.getvalue() - self.assert_(check_expected(result, - "lex_error3.py:20: Rule 't_error' requires an argument\n")) + self.assertTrue(check_expected(result, + "lex_error3.py:17: Rule 't_error' requires an argument\n")) def test_lex_error4(self): self.assertRaises(SyntaxError,run_import,"lex_error4") result = sys.stderr.getvalue() - self.assert_(check_expected(result, - "lex_error4.py:20: Rule 't_error' has too many arguments\n")) + self.assertTrue(check_expected(result, + "lex_error4.py:17: Rule 't_error' has too many arguments\n")) def test_lex_ignore(self): self.assertRaises(SyntaxError,run_import,"lex_ignore") result = sys.stderr.getvalue() - self.assert_(check_expected(result, - "lex_ignore.py:20: Rule 't_ignore' must be defined as a string\n")) + self.assertTrue(check_expected(result, + "lex_ignore.py:17: Rule 't_ignore' must be defined as a string\n")) def test_lex_ignore2(self): run_import("lex_ignore2") result = sys.stderr.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "t_ignore contains a literal backslash '\\'\n")) @@ -169,20 +168,20 @@ class LexErrorWarningTests(unittest.TestCase): msg = "Invalid regular expression for rule 't_NUMBER'. unbalanced parenthesis\n" else: msg = "Invalid regular expression for rule 't_NUMBER'. missing ), unterminated subpattern at position 0" - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, msg, contains=True)) def test_lex_re2(self): self.assertRaises(SyntaxError,run_import,"lex_re2") result = sys.stderr.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "Regular expression for rule 't_PLUS' matches empty string\n")) def test_lex_re3(self): self.assertRaises(SyntaxError,run_import,"lex_re3") result = sys.stderr.getvalue() -# self.assert_(check_expected(result, +# self.assertTrue(check_expected(result, # "Invalid regular expression for rule 't_POUND'. unbalanced parenthesis\n" # "Make sure '#' in rule 't_POUND' is escaped with '\\#'\n")) @@ -192,78 +191,78 @@ class LexErrorWarningTests(unittest.TestCase): else: msg = ("Invalid regular expression for rule 't_POUND'. 
missing ), unterminated subpattern at position 0\n" "ERROR: Make sure '#' in rule 't_POUND' is escaped with '\#'") - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, msg, contains=True), result) def test_lex_rule1(self): self.assertRaises(SyntaxError,run_import,"lex_rule1") result = sys.stderr.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "t_NUMBER not defined as a function or string\n")) def test_lex_rule2(self): self.assertRaises(SyntaxError,run_import,"lex_rule2") result = sys.stderr.getvalue() - self.assert_(check_expected(result, - "lex_rule2.py:18: Rule 't_NUMBER' requires an argument\n")) + self.assertTrue(check_expected(result, + "lex_rule2.py:15: Rule 't_NUMBER' requires an argument\n")) def test_lex_rule3(self): self.assertRaises(SyntaxError,run_import,"lex_rule3") result = sys.stderr.getvalue() - self.assert_(check_expected(result, - "lex_rule3.py:18: Rule 't_NUMBER' has too many arguments\n")) + self.assertTrue(check_expected(result, + "lex_rule3.py:15: Rule 't_NUMBER' has too many arguments\n")) def test_lex_state1(self): self.assertRaises(SyntaxError,run_import,"lex_state1") result = sys.stderr.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "states must be defined as a tuple or list\n")) def test_lex_state2(self): self.assertRaises(SyntaxError,run_import,"lex_state2") result = sys.stderr.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "Invalid state specifier 'comment'. Must be a tuple (statename,'exclusive|inclusive')\n" "Invalid state specifier 'example'. Must be a tuple (statename,'exclusive|inclusive')\n")) def test_lex_state3(self): self.assertRaises(SyntaxError,run_import,"lex_state3") result = sys.stderr.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "State name 1 must be a string\n" "No rules defined for state 'example'\n")) def test_lex_state4(self): self.assertRaises(SyntaxError,run_import,"lex_state4") result = sys.stderr.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "State type for state 'comment' must be 'inclusive' or 'exclusive'\n")) def test_lex_state5(self): self.assertRaises(SyntaxError,run_import,"lex_state5") result = sys.stderr.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "State 'comment' already defined\n")) def test_lex_state_noerror(self): run_import("lex_state_noerror") result = sys.stderr.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "No error rule is defined for exclusive state 'comment'\n")) def test_lex_state_norule(self): self.assertRaises(SyntaxError,run_import,"lex_state_norule") result = sys.stderr.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "No rules defined for state 'example'\n")) def test_lex_token1(self): self.assertRaises(SyntaxError,run_import,"lex_token1") result = sys.stderr.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "No token list is defined\n" "Rule 't_NUMBER' defined for an unspecified token NUMBER\n" "Rule 't_PLUS' defined for an unspecified token PLUS\n" @@ -273,7 +272,7 @@ class LexErrorWarningTests(unittest.TestCase): def test_lex_token2(self): self.assertRaises(SyntaxError,run_import,"lex_token2") result = sys.stderr.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, 
"tokens must be a list or tuple\n" "Rule 't_NUMBER' defined for an unspecified token NUMBER\n" "Rule 't_PLUS' defined for an unspecified token PLUS\n" @@ -283,34 +282,34 @@ class LexErrorWarningTests(unittest.TestCase): def test_lex_token3(self): self.assertRaises(SyntaxError,run_import,"lex_token3") result = sys.stderr.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "Rule 't_MINUS' defined for an unspecified token MINUS\n")) def test_lex_token4(self): self.assertRaises(SyntaxError,run_import,"lex_token4") result = sys.stderr.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "Bad token name '-'\n")) def test_lex_token_dup(self): run_import("lex_token_dup") result = sys.stderr.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "Token 'MINUS' multiply defined\n")) def test_lex_literal1(self): self.assertRaises(SyntaxError,run_import,"lex_literal1") result = sys.stderr.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "Invalid literal '**'. Must be a single character\n")) def test_lex_literal2(self): self.assertRaises(SyntaxError,run_import,"lex_literal2") result = sys.stderr.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "Invalid literals specification. literals must be a sequence of characters\n")) import os @@ -333,7 +332,7 @@ class LexBuildOptionTests(unittest.TestCase): def test_lex_module(self): run_import("lex_module") result = sys.stdout.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "(NUMBER,3,1,0)\n" "(PLUS,'+',1,1)\n" "(NUMBER,4,1,2)\n")) @@ -341,7 +340,7 @@ class LexBuildOptionTests(unittest.TestCase): def test_lex_object(self): run_import("lex_object") result = sys.stdout.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "(NUMBER,3,1,0)\n" "(PLUS,'+',1,1)\n" "(NUMBER,4,1,2)\n")) @@ -349,7 +348,7 @@ class LexBuildOptionTests(unittest.TestCase): def test_lex_closure(self): run_import("lex_closure") result = sys.stdout.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "(NUMBER,3,1,0)\n" "(PLUS,'+',1,1)\n" "(NUMBER,4,1,2)\n")) @@ -357,7 +356,7 @@ class LexBuildOptionTests(unittest.TestCase): def test_lex_many_tokens(self): run_import("lex_many_tokens") result = sys.stdout.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "(TOK34,'TOK34:',1,0)\n" "(TOK143,'TOK143:',1,7)\n" "(TOK269,'TOK269:',1,15)\n" @@ -379,7 +378,7 @@ class LexRunTests(unittest.TestCase): def test_lex_hedit(self): run_import("lex_hedit") result = sys.stdout.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "(H_EDIT_DESCRIPTOR,'abc',1,0)\n" "(H_EDIT_DESCRIPTOR,'abcdefghij',1,6)\n" "(H_EDIT_DESCRIPTOR,'xy',1,20)\n")) @@ -387,7 +386,7 @@ class LexRunTests(unittest.TestCase): def test_lex_state_try(self): run_import("lex_state_try") result = sys.stdout.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "(NUMBER,'3',1,0)\n" "(PLUS,'+',1,2)\n" "(NUMBER,'4',1,4)\n" diff --git a/test/testyacc.py b/tests/testyacc.py index 96d4b0d..b488bf7 100644 --- a/test/testyacc.py +++ b/tests/testyacc.py @@ -12,7 +12,6 @@ import warnings import re import platform -sys.path.insert(0,"..") sys.tracebacklimit = 0 import ply.yacc @@ -97,16 +96,16 @@ class YaccErrorWarningTests(unittest.TestCase): def 
test_yacc_badargs(self): self.assertRaises(ply.yacc.YaccError,run_import,"yacc_badargs") result = sys.stderr.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "yacc_badargs.py:23: Rule 'p_statement_assign' has too many arguments\n" "yacc_badargs.py:27: Rule 'p_statement_expr' requires an argument\n" )) def test_yacc_badid(self): self.assertRaises(ply.yacc.YaccError,run_import,"yacc_badid") result = sys.stderr.getvalue() - self.assert_(check_expected(result, - "yacc_badid.py:32: Illegal name 'bad&rule' in rule 'statement'\n" - "yacc_badid.py:36: Illegal rule name 'bad&rule'\n" + self.assertTrue(check_expected(result, + "yacc_badid.py:29: Illegal name 'bad&rule' in rule 'statement'\n" + "yacc_badid.py:33: Illegal rule name 'bad&rule'\n" )) def test_yacc_badprec(self): @@ -114,20 +113,20 @@ class YaccErrorWarningTests(unittest.TestCase): run_import("yacc_badprec") except ply.yacc.YaccError: result = sys.stderr.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "precedence must be a list or tuple\n" )) def test_yacc_badprec2(self): self.assertRaises(ply.yacc.YaccError,run_import,"yacc_badprec2") result = sys.stderr.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "Bad precedence table\n" )) def test_yacc_badprec3(self): run_import("yacc_badprec3") result = sys.stderr.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "Precedence already specified for terminal 'MINUS'\n" "Generating LALR tables\n" @@ -136,11 +135,11 @@ class YaccErrorWarningTests(unittest.TestCase): def test_yacc_badrule(self): self.assertRaises(ply.yacc.YaccError,run_import,"yacc_badrule") result = sys.stderr.getvalue() - self.assert_(check_expected(result, - "yacc_badrule.py:24: Syntax error. Expected ':'\n" - "yacc_badrule.py:28: Syntax error in rule 'statement'\n" - "yacc_badrule.py:33: Syntax error. Expected ':'\n" - "yacc_badrule.py:42: Syntax error. Expected ':'\n" + self.assertTrue(check_expected(result, + "yacc_badrule.py:21: Syntax error. Expected ':'\n" + "yacc_badrule.py:25: Syntax error in rule 'statement'\n" + "yacc_badrule.py:30: Syntax error. Expected ':'\n" + "yacc_badrule.py:39: Syntax error. Expected ':'\n" )) def test_yacc_badtok(self): @@ -148,14 +147,14 @@ class YaccErrorWarningTests(unittest.TestCase): run_import("yacc_badtok") except ply.yacc.YaccError: result = sys.stderr.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "tokens must be a list or tuple\n")) def test_yacc_dup(self): run_import("yacc_dup") result = sys.stderr.getvalue() - self.assert_(check_expected(result, - "yacc_dup.py:27: Function p_statement redefined. Previously defined on line 23\n" + self.assertTrue(check_expected(result, + "yacc_dup.py:24: Function p_statement redefined. 
Previously defined on line 20\n" "Token 'EQUALS' defined, but not used\n" "There is 1 unused token\n" "Generating LALR tables\n" @@ -166,16 +165,16 @@ class YaccErrorWarningTests(unittest.TestCase): run_import("yacc_error1") except ply.yacc.YaccError: result = sys.stderr.getvalue() - self.assert_(check_expected(result, - "yacc_error1.py:61: p_error() requires 1 argument\n")) + self.assertTrue(check_expected(result, + "yacc_error1.py:58: p_error() requires 1 argument\n")) def test_yacc_error2(self): try: run_import("yacc_error2") except ply.yacc.YaccError: result = sys.stderr.getvalue() - self.assert_(check_expected(result, - "yacc_error2.py:61: p_error() requires 1 argument\n")) + self.assertTrue(check_expected(result, + "yacc_error2.py:58: p_error() requires 1 argument\n")) def test_yacc_error3(self): try: @@ -183,21 +182,21 @@ class YaccErrorWarningTests(unittest.TestCase): except ply.yacc.YaccError: e = sys.exc_info()[1] result = sys.stderr.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "'p_error' defined, but is not a function or method\n")) def test_yacc_error4(self): self.assertRaises(ply.yacc.YaccError,run_import,"yacc_error4") result = sys.stderr.getvalue() - self.assert_(check_expected(result, - "yacc_error4.py:62: Illegal rule name 'error'. Already defined as a token\n" + self.assertTrue(check_expected(result, + "yacc_error4.py:59: Illegal rule name 'error'. Already defined as a token\n" )) def test_yacc_error5(self): run_import("yacc_error5") result = sys.stdout.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "Group at 3:10 to 3:12\n" "Undefined name 'a'\n" "Syntax error at 'b'\n" @@ -209,7 +208,7 @@ class YaccErrorWarningTests(unittest.TestCase): def test_yacc_error6(self): run_import("yacc_error6") result = sys.stdout.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "a=7\n" "Line 3: Syntax error at '*'\n" "c=21\n" @@ -218,7 +217,7 @@ class YaccErrorWarningTests(unittest.TestCase): def test_yacc_error7(self): run_import("yacc_error7") result = sys.stdout.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "a=7\n" "Line 3: Syntax error at '*'\n" "c=21\n" @@ -227,7 +226,7 @@ class YaccErrorWarningTests(unittest.TestCase): def test_yacc_inf(self): self.assertRaises(ply.yacc.YaccError,run_import,"yacc_inf") result = sys.stderr.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "Token 'NUMBER' defined, but not used\n" "There is 1 unused token\n" "Infinite recursion detected for symbol 'statement'\n" @@ -236,27 +235,27 @@ class YaccErrorWarningTests(unittest.TestCase): def test_yacc_literal(self): self.assertRaises(ply.yacc.YaccError,run_import,"yacc_literal") result = sys.stderr.getvalue() - self.assert_(check_expected(result, - "yacc_literal.py:36: Literal token '**' in rule 'expression' may only be a single character\n" + self.assertTrue(check_expected(result, + "yacc_literal.py:33: Literal token '**' in rule 'expression' may only be a single character\n" )) def test_yacc_misplaced(self): self.assertRaises(ply.yacc.YaccError,run_import,"yacc_misplaced") result = sys.stderr.getvalue() - self.assert_(check_expected(result, - "yacc_misplaced.py:32: Misplaced '|'\n" + self.assertTrue(check_expected(result, + "yacc_misplaced.py:29: Misplaced '|'\n" )) def test_yacc_missing1(self): self.assertRaises(ply.yacc.YaccError,run_import,"yacc_missing1") result = sys.stderr.getvalue() - 
self.assert_(check_expected(result, - "yacc_missing1.py:24: Symbol 'location' used, but not defined as a token or a rule\n" + self.assertTrue(check_expected(result, + "yacc_missing1.py:21: Symbol 'location' used, but not defined as a token or a rule\n" )) def test_yacc_nested(self): run_import("yacc_nested") result = sys.stdout.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "A\n" "A\n" "A\n", @@ -265,15 +264,15 @@ class YaccErrorWarningTests(unittest.TestCase): def test_yacc_nodoc(self): run_import("yacc_nodoc") result = sys.stderr.getvalue() - self.assert_(check_expected(result, - "yacc_nodoc.py:27: No documentation string specified in function 'p_statement_expr' (ignored)\n" + self.assertTrue(check_expected(result, + "yacc_nodoc.py:24: No documentation string specified in function 'p_statement_expr' (ignored)\n" "Generating LALR tables\n" )) def test_yacc_noerror(self): run_import("yacc_noerror") result = sys.stderr.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "no p_error() function is defined\n" "Generating LALR tables\n" )) @@ -281,15 +280,15 @@ class YaccErrorWarningTests(unittest.TestCase): def test_yacc_nop(self): run_import("yacc_nop") result = sys.stderr.getvalue() - self.assert_(check_expected(result, - "yacc_nop.py:27: Possible grammar rule 'statement_expr' defined without p_ prefix\n" + self.assertTrue(check_expected(result, + "yacc_nop.py:24: Possible grammar rule 'statement_expr' defined without p_ prefix\n" "Generating LALR tables\n" )) def test_yacc_notfunc(self): run_import("yacc_notfunc") result = sys.stderr.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "'p_statement_assign' not defined as a function\n" "Token 'EQUALS' defined, but not used\n" "There is 1 unused token\n" @@ -300,13 +299,13 @@ class YaccErrorWarningTests(unittest.TestCase): run_import("yacc_notok") except ply.yacc.YaccError: result = sys.stderr.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "No token list is defined\n")) def test_yacc_rr(self): run_import("yacc_rr") result = sys.stderr.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "Generating LALR tables\n" "1 reduce/reduce conflict\n" "reduce/reduce conflict in state 15 resolved using rule (statement -> NAME EQUALS NUMBER)\n" @@ -317,7 +316,7 @@ class YaccErrorWarningTests(unittest.TestCase): def test_yacc_rr_unused(self): run_import("yacc_rr_unused") result = sys.stderr.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "no p_error() function is defined\n" "Generating LALR tables\n" "3 reduce/reduce conflicts\n" @@ -333,14 +332,14 @@ class YaccErrorWarningTests(unittest.TestCase): def test_yacc_simple(self): run_import("yacc_simple") result = sys.stderr.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "Generating LALR tables\n" )) def test_yacc_sr(self): run_import("yacc_sr") result = sys.stderr.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "Generating LALR tables\n" "20 shift/reduce conflicts\n" )) @@ -348,30 +347,30 @@ class YaccErrorWarningTests(unittest.TestCase): def test_yacc_term1(self): self.assertRaises(ply.yacc.YaccError,run_import,"yacc_term1") result = sys.stderr.getvalue() - self.assert_(check_expected(result, - "yacc_term1.py:24: Illegal rule name 'NUMBER'. 
Already defined as a token\n" + self.assertTrue(check_expected(result, + "yacc_term1.py:21: Illegal rule name 'NUMBER'. Already defined as a token\n" )) def test_yacc_unicode_literals(self): run_import("yacc_unicode_literals") result = sys.stderr.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "Generating LALR tables\n" )) def test_yacc_unused(self): self.assertRaises(ply.yacc.YaccError,run_import,"yacc_unused") result = sys.stderr.getvalue() - self.assert_(check_expected(result, - "yacc_unused.py:62: Symbol 'COMMA' used, but not defined as a token or a rule\n" + self.assertTrue(check_expected(result, + "yacc_unused.py:59: Symbol 'COMMA' used, but not defined as a token or a rule\n" "Symbol 'COMMA' is unreachable\n" "Symbol 'exprlist' is unreachable\n" )) def test_yacc_unused_rule(self): run_import("yacc_unused_rule") result = sys.stderr.getvalue() - self.assert_(check_expected(result, - "yacc_unused_rule.py:62: Rule 'integer' defined, but not used\n" + self.assertTrue(check_expected(result, + "yacc_unused_rule.py:59: Rule 'integer' defined, but not used\n" "There is 1 unused rule\n" "Symbol 'integer' is unreachable\n" "Generating LALR tables\n" @@ -380,21 +379,21 @@ class YaccErrorWarningTests(unittest.TestCase): def test_yacc_uprec(self): self.assertRaises(ply.yacc.YaccError,run_import,"yacc_uprec") result = sys.stderr.getvalue() - self.assert_(check_expected(result, - "yacc_uprec.py:37: Nothing known about the precedence of 'UMINUS'\n" + self.assertTrue(check_expected(result, + "yacc_uprec.py:34: Nothing known about the precedence of 'UMINUS'\n" )) def test_yacc_uprec2(self): self.assertRaises(ply.yacc.YaccError,run_import,"yacc_uprec2") result = sys.stderr.getvalue() - self.assert_(check_expected(result, - "yacc_uprec2.py:37: Syntax error. Nothing follows %prec\n" + self.assertTrue(check_expected(result, + "yacc_uprec2.py:34: Syntax error. Nothing follows %prec\n" )) def test_yacc_prec1(self): self.assertRaises(ply.yacc.YaccError,run_import,"yacc_prec1") result = sys.stderr.getvalue() - self.assert_(check_expected(result, + self.assertTrue(check_expected(result, "Precedence rule 'left' defined for unknown symbol '+'\n" "Precedence rule 'left' defined for unknown symbol '*'\n" "Precedence rule 'left' defined for unknown symbol '-'\n" diff --git a/test/yacc_badargs.py b/tests/yacc_badargs.py index 9a1d03f..cd4b1e7 100644 --- a/test/yacc_badargs.py +++ b/tests/yacc_badargs.py @@ -5,7 +5,7 @@ # ----------------------------------------------------------------------------- import sys sys.tracebacklimit = 0 -sys.path.insert(0,"..") + import ply.yacc as yacc from calclex import tokens diff --git a/test/yacc_badid.py b/tests/yacc_badid.py index e4b9f5e..f624aca 100644 --- a/test/yacc_badid.py +++ b/tests/yacc_badid.py @@ -3,9 +3,6 @@ # # Attempt to define a rule with a bad-identifier name # ----------------------------------------------------------------------------- -import sys - -if ".." not in sys.path: sys.path.insert(0,"..") import ply.yacc as yacc from calclex import tokens diff --git a/test/yacc_badprec.py b/tests/yacc_badprec.py index 3013bb6..95e9db2 100644 --- a/test/yacc_badprec.py +++ b/tests/yacc_badprec.py @@ -3,9 +3,6 @@ # # Bad precedence specifier # ----------------------------------------------------------------------------- -import sys - -if ".." 
not in sys.path: sys.path.insert(0,"..") import ply.yacc as yacc from calclex import tokens diff --git a/test/yacc_badprec2.py b/tests/yacc_badprec2.py index 83093b4..c33303d 100644 --- a/test/yacc_badprec2.py +++ b/tests/yacc_badprec2.py @@ -3,9 +3,6 @@ # # Bad precedence # ----------------------------------------------------------------------------- -import sys - -if ".." not in sys.path: sys.path.insert(0,"..") import ply.yacc as yacc from calclex import tokens diff --git a/test/yacc_badprec3.py b/tests/yacc_badprec3.py index d925ecd..e1ce3a6 100644 --- a/test/yacc_badprec3.py +++ b/tests/yacc_badprec3.py @@ -3,9 +3,6 @@ # # Bad precedence # ----------------------------------------------------------------------------- -import sys - -if ".." not in sys.path: sys.path.insert(0,"..") import ply.yacc as yacc from calclex import tokens diff --git a/test/yacc_badrule.py b/tests/yacc_badrule.py index 92af646..898a8c7 100644 --- a/test/yacc_badrule.py +++ b/tests/yacc_badrule.py @@ -3,9 +3,6 @@ # # Syntax problems in the rule strings # ----------------------------------------------------------------------------- -import sys - -if ".." not in sys.path: sys.path.insert(0,"..") import ply.yacc as yacc from calclex import tokens diff --git a/test/yacc_badtok.py b/tests/yacc_badtok.py index fc4afe1..567aadf 100644 --- a/test/yacc_badtok.py +++ b/tests/yacc_badtok.py @@ -3,9 +3,6 @@ # # A grammar, but tokens is a bad datatype # ----------------------------------------------------------------------------- - -import sys -if ".." not in sys.path: sys.path.insert(0,"..") import ply.yacc as yacc tokens = "Hello" diff --git a/test/yacc_dup.py b/tests/yacc_dup.py index 309ba32..94238a4 100644 --- a/test/yacc_dup.py +++ b/tests/yacc_dup.py @@ -3,9 +3,6 @@ # # Duplicated rule name # ----------------------------------------------------------------------------- -import sys - -if ".." not in sys.path: sys.path.insert(0,"..") import ply.yacc as yacc from calclex import tokens diff --git a/test/yacc_error1.py b/tests/yacc_error1.py index 10ac6a9..39b9124 100644 --- a/test/yacc_error1.py +++ b/tests/yacc_error1.py @@ -3,9 +3,6 @@ # # Bad p_error() function # ----------------------------------------------------------------------------- -import sys - -if ".." not in sys.path: sys.path.insert(0,"..") import ply.yacc as yacc from calclex import tokens diff --git a/test/yacc_error2.py b/tests/yacc_error2.py index 7591418..2463ee9 100644 --- a/test/yacc_error2.py +++ b/tests/yacc_error2.py @@ -3,9 +3,6 @@ # # Bad p_error() function # ----------------------------------------------------------------------------- -import sys - -if ".." not in sys.path: sys.path.insert(0,"..") import ply.yacc as yacc from calclex import tokens diff --git a/test/yacc_error3.py b/tests/yacc_error3.py index 4604a48..f181253 100644 --- a/test/yacc_error3.py +++ b/tests/yacc_error3.py @@ -3,9 +3,6 @@ # # Bad p_error() function # ----------------------------------------------------------------------------- -import sys - -if ".." not in sys.path: sys.path.insert(0,"..") import ply.yacc as yacc from calclex import tokens diff --git a/test/yacc_error4.py b/tests/yacc_error4.py index 9c550cd..f6bef7e 100644 --- a/test/yacc_error4.py +++ b/tests/yacc_error4.py @@ -3,9 +3,6 @@ # # Attempt to define a rule named 'error' # ----------------------------------------------------------------------------- -import sys - -if ".." 
not in sys.path: sys.path.insert(0,"..") import ply.yacc as yacc from calclex import tokens diff --git a/test/yacc_error5.py b/tests/yacc_error5.py index 9eb0f85..5d4b683 100644 --- a/test/yacc_error5.py +++ b/tests/yacc_error5.py @@ -3,9 +3,6 @@ # # Lineno and position tracking with error tokens # ----------------------------------------------------------------------------- -import sys - -if ".." not in sys.path: sys.path.insert(0,"..") import ply.yacc as yacc from calclex import tokens diff --git a/test/yacc_error6.py b/tests/yacc_error6.py index 8d0ec85..fde3c4d 100644 --- a/test/yacc_error6.py +++ b/tests/yacc_error6.py @@ -3,9 +3,6 @@ # # Panic mode recovery test # ----------------------------------------------------------------------------- -import sys - -if ".." not in sys.path: sys.path.insert(0,"..") import ply.yacc as yacc from calclex import tokens diff --git a/test/yacc_error7.py b/tests/yacc_error7.py index abdc834..17cc4f1 100644 --- a/test/yacc_error7.py +++ b/tests/yacc_error7.py @@ -3,9 +3,6 @@ # # Panic mode recovery test using deprecated functionality # ----------------------------------------------------------------------------- -import sys - -if ".." not in sys.path: sys.path.insert(0,"..") import ply.yacc as yacc from calclex import tokens diff --git a/test/yacc_inf.py b/tests/yacc_inf.py index efd3612..f8b4c30 100644 --- a/test/yacc_inf.py +++ b/tests/yacc_inf.py @@ -3,9 +3,6 @@ # # Infinite recursion # ----------------------------------------------------------------------------- -import sys - -if ".." not in sys.path: sys.path.insert(0,"..") import ply.yacc as yacc from calclex import tokens diff --git a/test/yacc_literal.py b/tests/yacc_literal.py index 0d62803..687d916 100644 --- a/test/yacc_literal.py +++ b/tests/yacc_literal.py @@ -3,9 +3,6 @@ # # Grammar with bad literal characters # ----------------------------------------------------------------------------- -import sys - -if ".." not in sys.path: sys.path.insert(0,"..") import ply.yacc as yacc from calclex import tokens diff --git a/test/yacc_misplaced.py b/tests/yacc_misplaced.py index 9159b01..7cc75be 100644 --- a/test/yacc_misplaced.py +++ b/tests/yacc_misplaced.py @@ -3,9 +3,6 @@ # # A misplaced | in grammar rules # ----------------------------------------------------------------------------- -import sys - -if ".." not in sys.path: sys.path.insert(0,"..") import ply.yacc as yacc from calclex import tokens diff --git a/test/yacc_missing1.py b/tests/yacc_missing1.py index d1b5105..81955be 100644 --- a/test/yacc_missing1.py +++ b/tests/yacc_missing1.py @@ -3,9 +3,6 @@ # # Grammar with a missing rule # ----------------------------------------------------------------------------- -import sys - -if ".." not in sys.path: sys.path.insert(0,"..") import ply.yacc as yacc from calclex import tokens diff --git a/test/yacc_nested.py b/tests/yacc_nested.py index a3543a9..93ff400 100644 --- a/test/yacc_nested.py +++ b/tests/yacc_nested.py @@ -1,6 +1,3 @@ -import sys - -if ".." not in sys.path: sys.path.insert(0,"..") from ply import lex, yacc diff --git a/test/yacc_nodoc.py b/tests/yacc_nodoc.py index 0f61920..a2f34dd 100644 --- a/test/yacc_nodoc.py +++ b/tests/yacc_nodoc.py @@ -3,9 +3,6 @@ # # Rule with a missing doc-string # ----------------------------------------------------------------------------- -import sys - -if ".." 
not in sys.path: sys.path.insert(0,"..")
 import ply.yacc as yacc
 
 from calclex import tokens
diff --git a/test/yacc_noerror.py b/tests/yacc_noerror.py
index b38c758..2262745 100644
--- a/test/yacc_noerror.py
+++ b/tests/yacc_noerror.py
@@ -3,9 +3,6 @@
 #
 # No p_error() rule defined.
 # -----------------------------------------------------------------------------
-import sys
-
-if ".." not in sys.path: sys.path.insert(0,"..")
 import ply.yacc as yacc
 
 from calclex import tokens
diff --git a/test/yacc_nop.py b/tests/yacc_nop.py
index 789a9cf..3ff17ac 100644
--- a/test/yacc_nop.py
+++ b/tests/yacc_nop.py
@@ -3,9 +3,6 @@
 #
 # Possible grammar rule defined without p_ prefix
 # -----------------------------------------------------------------------------
-import sys
-
-if ".." not in sys.path: sys.path.insert(0,"..")
 import ply.yacc as yacc
 
 from calclex import tokens
diff --git a/test/yacc_notfunc.py b/tests/yacc_notfunc.py
index 5093a74..c8bd123 100644
--- a/test/yacc_notfunc.py
+++ b/tests/yacc_notfunc.py
@@ -3,9 +3,6 @@
 #
 # p_rule not defined as a function
 # -----------------------------------------------------------------------------
-import sys
-
-if ".." not in sys.path: sys.path.insert(0,"..")
 import ply.yacc as yacc
 
 from calclex import tokens
diff --git a/test/yacc_notok.py b/tests/yacc_notok.py
index cff55a8..6e271a0 100644
--- a/test/yacc_notok.py
+++ b/tests/yacc_notok.py
@@ -3,10 +3,6 @@
 #
 # A grammar, but we forgot to import the tokens list
 # -----------------------------------------------------------------------------
-
-import sys
-
-if ".." not in sys.path: sys.path.insert(0,"..")
 import ply.yacc as yacc
 
 # Parsing rules
diff --git a/test/yacc_prec1.py b/tests/yacc_prec1.py
index 99fcd90..d4fad6e 100644
--- a/test/yacc_prec1.py
+++ b/tests/yacc_prec1.py
@@ -3,9 +3,6 @@
 #
 # Tests the case where a precedence specifier doesn't match up to terminals
 # -----------------------------------------------------------------------------
-import sys
-
-if ".." not in sys.path: sys.path.insert(0,"..")
 import ply.yacc as yacc
 
 from calclex import tokens
diff --git a/test/yacc_rr.py b/tests/yacc_rr.py
index e7336c2..de78a58 100644
--- a/test/yacc_rr.py
+++ b/tests/yacc_rr.py
@@ -3,9 +3,6 @@
 #
 # A grammar with a reduce/reduce conflict
 # -----------------------------------------------------------------------------
-import sys
-
-if ".." not in sys.path: sys.path.insert(0,"..")
 import ply.yacc as yacc
 
 from calclex import tokens
diff --git a/test/yacc_rr_unused.py b/tests/yacc_rr_unused.py
index 1ca5f7e..70a8c78 100644
--- a/test/yacc_rr_unused.py
+++ b/tests/yacc_rr_unused.py
@@ -4,9 +4,6 @@
 # A grammar with reduce/reduce conflicts and a rule that never
 # gets reduced.
 # -----------------------------------------------------------------------------
-import sys
-
-if ".." not in sys.path: sys.path.insert(0,"..")
 import ply.yacc as yacc
 
 tokens = ('A', 'B', 'C')
diff --git a/test/yacc_simple.py b/tests/yacc_simple.py
index bd989f4..5261838 100644
--- a/test/yacc_simple.py
+++ b/tests/yacc_simple.py
@@ -3,9 +3,6 @@
 #
 # A simple, properly specified grammar
 # -----------------------------------------------------------------------------
-import sys
-
-if ".." not in sys.path: sys.path.insert(0,"..")
 import ply.yacc as yacc
 
 from calclex import tokens
diff --git a/test/yacc_sr.py b/tests/yacc_sr.py
index 69a1e9c..0f15a50 100644
--- a/test/yacc_sr.py
+++ b/tests/yacc_sr.py
@@ -3,9 +3,6 @@
 #
 # A grammar with shift-reduce conflicts
 # -----------------------------------------------------------------------------
-import sys
-
-if ".." not in sys.path: sys.path.insert(0,"..")
 import ply.yacc as yacc
 
 from calclex import tokens
diff --git a/test/yacc_term1.py b/tests/yacc_term1.py
index eaa36e9..1c1d8c2 100644
--- a/test/yacc_term1.py
+++ b/tests/yacc_term1.py
@@ -3,9 +3,6 @@
 #
 # Terminal used on the left-hand-side
 # -----------------------------------------------------------------------------
-import sys
-
-if ".." not in sys.path: sys.path.insert(0,"..")
 import ply.yacc as yacc
 
 from calclex import tokens
diff --git a/test/yacc_unicode_literals.py b/tests/yacc_unicode_literals.py
index 5ae4f5b..04f6cfd 100644
--- a/test/yacc_unicode_literals.py
+++ b/tests/yacc_unicode_literals.py
@@ -5,9 +5,6 @@
 # -----------------------------------------------------------------------------
 from __future__ import unicode_literals
-import sys
-
-if ".." not in sys.path: sys.path.insert(0,"..")
 import ply.yacc as yacc
 
 from calclex import tokens
diff --git a/test/yacc_unused.py b/tests/yacc_unused.py
index 55b677b..7bc6749 100644
--- a/test/yacc_unused.py
+++ b/tests/yacc_unused.py
@@ -3,9 +3,6 @@
 #
 # A grammar with an unused rule
 # -----------------------------------------------------------------------------
-import sys
-
-if ".." not in sys.path: sys.path.insert(0,"..")
 import ply.yacc as yacc
 
 from calclex import tokens
diff --git a/test/yacc_unused_rule.py b/tests/yacc_unused_rule.py
index 4868ef8..2e08d6d 100644
--- a/test/yacc_unused_rule.py
+++ b/tests/yacc_unused_rule.py
@@ -3,9 +3,6 @@
 #
 # Grammar with an unused rule
 # -----------------------------------------------------------------------------
-import sys
-
-if ".." not in sys.path: sys.path.insert(0,"..")
 import ply.yacc as yacc
 
 from calclex import tokens
diff --git a/test/yacc_uprec.py b/tests/yacc_uprec.py
index 569adb8..f173509 100644
--- a/test/yacc_uprec.py
+++ b/tests/yacc_uprec.py
@@ -3,9 +3,6 @@
 #
 # A grammar with a bad %prec specifier
 # -----------------------------------------------------------------------------
-import sys
-
-if ".." not in sys.path: sys.path.insert(0,"..")
 import ply.yacc as yacc
 
 from calclex import tokens
diff --git a/test/yacc_uprec2.py b/tests/yacc_uprec2.py
index 73274bf..85c0f2f 100644
--- a/test/yacc_uprec2.py
+++ b/tests/yacc_uprec2.py
@@ -3,9 +3,6 @@
 #
 # A grammar with a bad %prec specifier
 # -----------------------------------------------------------------------------
-import sys
-
-if ".." not in sys.path: sys.path.insert(0,"..")
 import ply.yacc as yacc
 
 from calclex import tokens
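
[Editorial note, not part of the commit] Every test above previously prepended ".." to sys.path so that the in-tree ply/ package would shadow any installed copy. With the sources moved under src/ply/, the tests instead rely on ply being importable from the environment, presumably via an installed or editable-installed package (the build configuration lives in the pyproject.toml and setup.cfg files listed in the diffstat, not shown here). A minimal sketch, in the style of the tests/lex_*.py files, of what the relocated tests now assume:

    # Sketch only; assumes the package has been installed (e.g. `pip install -e .`)
    # so that `ply` resolves to src/ply rather than to a "../ply" path hack.
    import ply
    import ply.lex as lex

    tokens = ('NUMBER', 'PLUS')

    t_PLUS = r'\+'
    t_ignore = ' \t'

    def t_NUMBER(t):
        r'\d+'
        t.value = int(t.value)
        return t

    def t_error(t):
        t.lexer.skip(1)

    lexer = lex.lex()        # builds the lexer from this module's t_* rules
    lexer.input('3 + 4')
    print([(tok.type, tok.value) for tok in lexer])
    # -> [('NUMBER', 3), ('PLUS', '+'), ('NUMBER', 4)]
    print(ply.__version__)   # '2022.10.27' as of this commit

If the package is not installed, the import fails immediately, which is exactly the failure mode the removed sys.path lines used to paper over.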