From 5eafa07470a3e761944ff5af00dbcd794dfa09da Mon Sep 17 00:00:00 2001 From: Max Fischer Date: Fri, 30 Jul 2021 14:51:33 +0200 Subject: Add support for LR parsing * first draft of LR parsing * removed debug output * cache is owned and cleared by ParserElement * bounded recursion must be enabled explicitly * packrat rejects recursion * basic LR unit test * tests for associativity and nesting * added math example * fixed test typo * unittest for empty and non-peg clauses * LR-Forward can match Empty * fixed test typos * added base case to unittest * memo cache only provides copies * flattened Forward parse method * added high-level description of algorithm * expanded docstring * added tests for repetition rules * renamed bounded to left recursion * naive test for existing suite * explicitly testing tests for LR compatibility * LR memo no longer mixes action/no-action results * simplified replacement logic * adjusted example with ambiguous failure case * LR memo content is always returned as copy * draft for peeking recursion * memo update consistent for all actions * fixed a bug for non-string token identifiers * action wins against no-action * cleanup * properly setting names in tests * memoization can be turned off * testing memo switches * typos * flattened recursion memo * left recursion memo size may be limited * adjusted docs for recursion cache --- pyparsing/core.py | 146 +++++++++++++++++++++++++++++++- pyparsing/results.py | 2 +- pyparsing/testing.py | 3 + pyparsing/util.py | 46 ++++++++++ tests/test_unit.py | 235 ++++++++++++++++++++++++++++++++++++++++++++++----- 5 files changed, 409 insertions(+), 23 deletions(-) diff --git a/pyparsing/core.py b/pyparsing/core.py index c4adb5d..d5ee05f 100644 --- a/pyparsing/core.py +++ b/pyparsing/core.py @@ -1,6 +1,7 @@ # # core.py # +from typing import Optional from abc import ABC, abstractmethod from enum import Enum import string @@ -23,6 +24,8 @@ from .util import ( _escapeRegexRangeChars, _bslash, _flatten, + LRUMemo as _LRUMemo, + UnboundedMemo as _UnboundedMemo, ) from .exceptions import * from .actions import * @@ -703,6 +706,10 @@ class ParserElement(ABC): else: return True + # cache for left-recursion in Forward references + recursion_lock = RLock() + recursion_memos = {} # type: dict[tuple[int, Forward, bool], tuple[int, ParseResults | Exception]] + # argument cache for optimizing repeated calls when backtracking through recursive expressions packrat_cache = ( {} @@ -766,11 +773,73 @@ class ParserElement(ABC): ParserElement.packrat_cache_stats[:] = [0] * len( ParserElement.packrat_cache_stats ) + ParserElement.recursion_memos.clear() _packratEnabled = False + _left_recursion_enabled = False + + @staticmethod + def disable_memoization(): + """ + Disables active Packrat or Left Recursion parsing and their memoization + + This method also works if neither Packrat nor Left Recursion are enabled. + This makes it safe to call before activating Packrat nor Left Recursion + to clear any previous settings. + """ + ParserElement.resetCache() + ParserElement._left_recursion_enabled = False + ParserElement._packratEnabled = False + ParserElement._parse = ParserElement._parseNoCache + + @staticmethod + def enable_left_recursion(cache_size_limit: Optional[int] = None, *, force=False): + """ + Enables "bounded recursion" parsing, which allows for both direct and indirect + left-recursion. During parsing, left-recursive :class:`Forward` elements are + repeatedly matched with a fixed recursion depth that is gradually increased + until finding the longest match. + + Example:: + + import pyparsing as pp + pp.ParserElement.enable_left_recursion() + + E = pp.Forward("E") + num = pp.Word(pp.nums) + # match `num`, or `num '+' num`, or `num '+' num '+' num`, ... + E <<= E + '+' - num | num + + print(E.parseString("1+2+3")) + + Recursion search naturally memoizes matches of ``Forward`` elements and may + thus skip reevaluation of parse actions during backtracking. This may break + programs with parse actions which rely on strict ordering of side-effects. + + Parameters: + + - cache_size_limit - (default=``None``) - memoize at most this many + ``Forward`` elements during matching; if ``None`` (the default), + memoize all ``Forward`` elements. + + Bounded Recursion parsing works similar but not identical to Packrat parsing, + thus the two cannot be used together. Use ``force=True`` to disable any + previous, conflicting settings. + """ + if force: + ParserElement.disable_memoization() + elif ParserElement._packratEnabled: + raise RuntimeError("Packrat and Bounded Recursion are not compatible") + if cache_size_limit is None: + ParserElement.recursion_memos = _UnboundedMemo() + elif cache_size_limit > 0: + ParserElement.recursion_memos = _LRUMemo(capacity=cache_size_limit) + else: + raise NotImplementedError("Memo size of %s" % cache_size_limit) + ParserElement._left_recursion_enabled = True @staticmethod - def enablePackrat(cache_size_limit=128): + def enablePackrat(cache_size_limit=128, *, force=False): """Enables "packrat" parsing, which adds memoizing to the parsing logic. Repeated parse attempts at the same string location (which happens often in many complex grammars) can immediately return a cached value, @@ -795,7 +864,15 @@ class ParserElement(ABC): import pyparsing pyparsing.ParserElement.enablePackrat() + + Packrat parsing works similar but not identical to Bounded Recursion parsing, + thus the two cannot be used together. Use ``force=True`` to disable any + previous, conflicting settings. """ + if force: + ParserElement.disable_memoization() + elif ParserElement._left_recursion_enabled: + raise RuntimeError("Packrat and Bounded Recursion are not compatible") if not ParserElement._packratEnabled: ParserElement._packratEnabled = True if cache_size_limit is None: @@ -4390,7 +4467,72 @@ class Forward(ParseElementEnhance): "Forward expression was never assigned a value, will not parse any input", stacklevel=stacklevel, ) - return super().parseImpl(instring, loc, doActions) + if not ParserElement._left_recursion_enabled: + return super().parseImpl(instring, loc, doActions) + # ## Bounded Recursion algorithm ## + # Recursion only needs to be processed at ``Forward`` elements, since they are + # the only ones that can actually refer to themselves. The general idea is + # to handle recursion stepwise: We start at no recursion, then recurse once, + # recurse twice, ..., until more recursion offers no benefit (we hit the bound). + # + # The "trick" here is that each ``Forward`` gets evaluated in two contexts + # - to *match* a specific recursion level, and + # - to *search* the bounded recursion level + # and the two run concurrently. The *search* must *match* each recursion level + # to find the best possible match. This is handled by a memo table, which + # provides the previous match to the next level match attempt. + # + # See also "Left Recursion in Parsing Expression Grammars", Medeiros et al. + # + # There is a complication since we not only *parse* but also *transform* via + # actions: We do not want to run the actions too often while expanding. Thus, + # we expand using `doActions=False` and only run `doActions=True` if the next + # recursion level is acceptable. + with ParserElement.recursion_lock: + memo = ParserElement.recursion_memos + try: + # we are parsing at a specific recursion expansion – use it as-is + prev_loc, prev_result = memo[loc, self, doActions] + if isinstance(prev_result, Exception): + raise prev_result + return prev_loc, prev_result.copy() + except KeyError: + act_key = (loc, self, True) + peek_key = (loc, self, False) + # we are searching for the best recursion expansion – keep on improving + # both `doActions` cases must be tracked separately here! + prev_loc, prev_peek = memo[peek_key] = loc - 1, ParseException( + instring, loc, "Forward recursion without base case", self + ) + if doActions: + memo[act_key] = memo[peek_key] + while True: + try: + new_loc, new_peek = super().parseImpl(instring, loc, False) + except ParseException: + # we failed before getting any match – do not hide the error + if isinstance(prev_peek, Exception): + raise + new_loc, new_peek = prev_loc, prev_peek + # the match did not get better: we are done + if new_loc <= prev_loc: + if doActions: + # replace the match for doActions=False as well, + # in case the action did backtrack + prev_loc, prev_result = memo[peek_key] = memo[act_key] + del memo[peek_key], memo[act_key] + return prev_loc, prev_result.copy() + del memo[peek_key] + return prev_loc, prev_peek.copy() + # the match did get better: see if we can improve further + else: + if doActions: + try: + memo[act_key] = super().parseImpl(instring, loc, True) + except ParseException as e: + memo[peek_key] = memo[act_key] = (new_loc, e) + raise + prev_loc, prev_peek = memo[peek_key] = new_loc, new_peek def leaveWhitespace(self, recursive=True): self.skipWhitespace = False diff --git a/pyparsing/results.py b/pyparsing/results.py index fb03ecd..fa94a00 100644 --- a/pyparsing/results.py +++ b/pyparsing/results.py @@ -522,7 +522,7 @@ class ParseResults: Returns a new copy of a :class:`ParseResults` object. """ ret = ParseResults(self._toklist) - ret._tokdict = dict(**self._tokdict) + ret._tokdict = self._tokdict.copy() ret._parent = self._parent ret._all_names |= self._all_names ret._name = self._name diff --git a/pyparsing/testing.py b/pyparsing/testing.py index 393f37b..43f23ab 100644 --- a/pyparsing/testing.py +++ b/pyparsing/testing.py @@ -20,6 +20,7 @@ class pyparsing_test: """ Context manager to be used when writing unit tests that modify pyparsing config values: - packrat parsing + - bounded recursion parsing - default whitespace characters. - default keyword characters - literal string auto-conversion class @@ -61,6 +62,7 @@ class pyparsing_test: else: self._save_context["packrat_cache_size"] = None self._save_context["packrat_parse"] = ParserElement._parse + self._save_context["recursion_enabled"] = ParserElement._left_recursion_enabled self._save_context["__diag__"] = { name: getattr(__diag__, name) for name in __diag__._all_names @@ -97,6 +99,7 @@ class pyparsing_test: ParserElement.enablePackrat(self._save_context["packrat_cache_size"]) else: ParserElement._parse = self._save_context["packrat_parse"] + ParserElement._left_recursion_enabled = self._save_context["recursion_enabled"] __compat__.collect_all_And_tokens = self._save_context["__compat__"] diff --git a/pyparsing/util.py b/pyparsing/util.py index 3cb69d2..875799d 100644 --- a/pyparsing/util.py +++ b/pyparsing/util.py @@ -119,6 +119,52 @@ class _FifoCache: self.clear = types.MethodType(clear, self) +class LRUMemo: + """ + A memoizing mapping that retains `capacity` deleted items + + The memo tracks retained items by their access order; once `capacity` items + are retained, the least recently used item is discarded. + """ + def __init__(self, capacity): + self._capacity = capacity + self._active = {} + self._memory = collections.OrderedDict() + + def __getitem__(self, key): + try: + return self._active[key] + except KeyError: + self._memory.move_to_end(key) + return self._memory[key] + + def __setitem__(self, key, value): + self._memory.pop(key, None) + self._active[key] = value + + def __delitem__(self, key): + try: + value = self._active.pop(key) + except KeyError: + pass + else: + while len(self._memory) >= self._capacity: + self._memory.popitem(last=False) + self._memory[key] = value + + def clear(self): + self._active.clear() + self._memory.clear() + + +class UnboundedMemo(dict): + """ + A memoizing mapping that retains all deleted items + """ + def __delitem__(self, key): + pass + + def _escapeRegexRangeChars(s): # escape these chars: ^-[] for c in r"\^-[]": diff --git a/tests/test_unit.py b/tests/test_unit.py index 09ad6a4..aed2db3 100644 --- a/tests/test_unit.py +++ b/tests/test_unit.py @@ -287,7 +287,7 @@ class Test2_WithoutPackrat(ppt.TestParseResultsAsserts, TestCase): test("-9", -9) test("--9", 9) test("-E", -math.e) - test("9 + 3 + 6", 9 + 3 + 6) + test("9 + 3 + 5", 9 + 3 + 5) test("9 + 3 / 11", 9 + 3.0 / 11) test("(9 + 3)", (9 + 3)) test("(9+3) / 11", (9 + 3.0) / 11) @@ -1609,8 +1609,8 @@ class Test2_WithoutPackrat(ppt.TestParseResultsAsserts, TestCase): pp.QuotedString("", "\\") def testRepeater(self): - if ParserElement._packratEnabled: - print("skipping this test, not compatible with packratting") + if ParserElement._packratEnabled or ParserElement._left_recursion_enabled: + print("skipping this test, not compatible with memoization") return first = pp.Word("abcdef").setName("word1") @@ -1715,8 +1715,8 @@ class Test2_WithoutPackrat(ppt.TestParseResultsAsserts, TestCase): def testRepeater2(self): """test matchPreviousLiteral with empty repeater""" - if ParserElement._packratEnabled: - print("skipping this test, not compatible with packratting") + if ParserElement._packratEnabled or ParserElement._left_recursion_enabled: + print("skipping this test, not compatible with memoization") return first = pp.Optional(pp.Word("abcdef").setName("words1")) @@ -1735,8 +1735,8 @@ class Test2_WithoutPackrat(ppt.TestParseResultsAsserts, TestCase): def testRepeater3(self): """test matchPreviousLiteral with multiple repeater tokens""" - if ParserElement._packratEnabled: - print("skipping this test, not compatible with packratting") + if ParserElement._packratEnabled or ParserElement._left_recursion_enabled: + print("skipping this test, not compatible with memoization") return first = pp.Word("a") + pp.Word("d") @@ -1755,8 +1755,8 @@ class Test2_WithoutPackrat(ppt.TestParseResultsAsserts, TestCase): def testRepeater4(self): """test matchPreviousExpr with multiple repeater tokens""" - if ParserElement._packratEnabled: - print("skipping this test, not compatible with packratting") + if ParserElement._packratEnabled or ParserElement._left_recursion_enabled: + print("skipping this test, not compatible with memoization") return first = pp.Group(pp.Word(pp.alphas) + pp.Word(pp.alphas)) @@ -1782,8 +1782,8 @@ class Test2_WithoutPackrat(ppt.TestParseResultsAsserts, TestCase): def testRepeater5(self): """a simplified testRepeater4 to examine matchPreviousExpr with a single repeater token""" - if ParserElement._packratEnabled: - print("skipping this test, not compatible with packratting") + if ParserElement._packratEnabled or ParserElement._left_recursion_enabled: + print("skipping this test, not compatible with memoization") return first = pp.Word(pp.alphas) @@ -6712,6 +6712,7 @@ class Test2_WithoutPackrat(ppt.TestParseResultsAsserts, TestCase): self.fail("failed to raise exception when matching empty string") def testExplainException(self): + pp.ParserElement.disable_memoization() expr = pp.Word(pp.nums).setName("int") + pp.Word(pp.alphas).setName("word") try: expr.parseString("123 355") @@ -6732,16 +6733,22 @@ class Test2_WithoutPackrat(ppt.TestParseResultsAsserts, TestCase): return t[0] / t[1] expr.addParseAction(divide_args) - pp.ParserElement.enablePackrat() - print() + for memo_kind, enable_memo in [ + ('Packrat', pp.ParserElement.enablePackrat), + ('Left Recursion', pp.ParserElement.enable_left_recursion), + ]: + enable_memo(force=True) + print("Explain for", memo_kind) - try: - expr.parseString("123 0") - except pp.ParseException as pe: - print(pe.explain()) - except Exception as exc: - print(pp.ParseBaseException.explain_exception(exc)) - raise + try: + expr.parseString("123 0") + except pp.ParseException as pe: + print(pe.explain()) + except Exception as exc: + print(pp.ParseBaseException.explain_exception(exc)) + raise + # make sure we leave the state compatible with everything + pp.ParserElement.disable_memoization() def testCaselessKeywordVsKeywordCaseless(self): frule = pp.Keyword("t", caseless=True) + pp.Keyword("yes", caseless=True) @@ -8067,9 +8074,197 @@ class Test8_WithUnboundedPackrat(Test2_WithoutPackrat): ) +class Test9_WithLeftRecursionParsing(Test2_WithoutPackrat): + """ + rerun Test2 tests, now with unbounded left recursion cache + """ + + def setUp(self): + ParserElement.enable_left_recursion(force=True) + + def tearDown(self): + default_suite_context.restore() + + def test000_assert_packrat_status(self): + print("Left-Recursion enabled:", ParserElement._left_recursion_enabled) + self.assertTrue(ParserElement._left_recursion_enabled, "left recursion not enabled") + self.assertIsInstance(ParserElement.recursion_memos, pp.util.UnboundedMemo) + + +class Test10_WithLeftRecursionParsingBoundedMemo(Test2_WithoutPackrat): + """ + rerun Test2 tests, now with bounded left recursion cache + """ + + def setUp(self): + ParserElement.enable_left_recursion(cache_size_limit=4, force=True) + + def tearDown(self): + default_suite_context.restore() + + def test000_assert_packrat_status(self): + print("Left-Recursion enabled:", ParserElement._left_recursion_enabled) + self.assertTrue(ParserElement._left_recursion_enabled, "left recursion not enabled") + self.assertIsInstance(ParserElement.recursion_memos, pp.util.LRUMemo) + # check that the cache matches roughly what we expect + # – it may be larger due to action handling + self.assertLessEqual(ParserElement.recursion_memos._capacity, 4) + self.assertGreater(ParserElement.recursion_memos._capacity * 3, 4) + + +class TestLR1_Recursion(ppt.TestParseResultsAsserts, TestCase): + """ + Tests for recursive parsing + """ + suite_context = None + save_suite_context = None + + def setUp(self): + recursion_suite_context.restore() + + def tearDown(self): + default_suite_context.restore() + + def test_repeat_as_recurse(self): + """repetition rules formulated with recursion""" + one_or_more = pp.Forward().setName("one_or_more") + one_or_more <<= one_or_more + "a" | "a" + self.assertParseResultsEquals( + one_or_more.parseString("a"), + expected_list=["a"], + ) + self.assertParseResultsEquals( + one_or_more.parseString("aaa aa"), + expected_list=["a", "a", "a", "a", "a"], + ) + delimited_list = pp.Forward().setName("delimited_list") + delimited_list <<= delimited_list + pp.Suppress(',') + "b" | "b" + self.assertParseResultsEquals( + delimited_list.parseString("b"), + expected_list=["b"], + ) + self.assertParseResultsEquals( + delimited_list.parseString("b,b"), + expected_list=["b", "b"], + ) + self.assertParseResultsEquals( + delimited_list.parseString("b,b , b, b,b"), + expected_list=["b", "b", "b", "b", "b"], + ) + + def test_binary_recursive(self): + """parsing of single left-recursive binary operator""" + expr = pp.Forward().setName("expr") + num = pp.Word(pp.nums) + expr <<= expr + '+' - num | num + self.assertParseResultsEquals( + expr.parseString("1+2"), + expected_list=['1', '+', '2'] + ) + self.assertParseResultsEquals( + expr.parseString("1+2+3+4"), + expected_list=['1', '+', '2', '+', '3', '+', '4'] + ) + + def test_binary_associative(self): + """associative is preserved for single left-recursive binary operator""" + expr = pp.Forward().setName("expr") + num = pp.Word(pp.nums) + expr <<= pp.Group(expr) + '+' - num | num + self.assertParseResultsEquals( + expr.parseString("1+2"), + expected_list=[['1'], '+', '2'], + ) + self.assertParseResultsEquals( + expr.parseString("1+2+3+4"), + expected_list=[[[['1'], '+', '2'], '+', '3'], '+', '4'], + ) + + def test_add_sub(self): + """indirectly left-recursive/associative add/sub calculator""" + expr = pp.Forward().setName("expr") + num = pp.Word(pp.nums).setParseAction(lambda t: int(t[0])) + expr <<= ( + (expr + '+' - num).setParseAction(lambda t: t[0] + t[2]) + | (expr + '-' - num).setParseAction(lambda t: t[0] - t[2]) + | num + ) + self.assertEqual(expr.parseString("1+2")[0], 3) + self.assertEqual(expr.parseString("1+2+3")[0], 6) + self.assertEqual(expr.parseString("1+2-3")[0], 0) + self.assertEqual(expr.parseString("1-2+3")[0], 2) + self.assertEqual(expr.parseString("1-2-3")[0], -4) + + def test_math(self): + """precedence climbing parser for math""" + # named references + expr = pp.Forward().setName("expr") + add_sub = pp.Forward().setName("add_sub") + mul_div = pp.Forward().setName("mul_div") + power = pp.Forward().setName("power") + terminal = pp.Forward().setName("terminal") + # concrete rules + number = pp.Word(pp.nums).setParseAction(lambda t: int(t[0])) + signed = ('+' - expr) | ('-' - expr).setParseAction(lambda t: -t[1]) + group = pp.Suppress('(') - expr - pp.Suppress(')') + add_sub <<= ( + (add_sub + '+' - mul_div).setParseAction(lambda t: t[0] + t[2]) + | (add_sub + '-' - mul_div).setParseAction(lambda t: t[0] - t[2]) + | mul_div + ) + mul_div <<= ( + (mul_div + '*' - power).setParseAction(lambda t: t[0] * t[2]) + | (mul_div + '/' - power).setParseAction(lambda t: t[0] / t[2]) + | power + ) + power <<= ( + (terminal + '^' - power).setParseAction(lambda t: t[0] ** t[2]) + | terminal + ) + terminal <<= number | signed | group + expr <<= add_sub + # simple add_sub expressions + self.assertEqual(expr.parseString("1+2")[0], 3) + self.assertEqual(expr.parseString("1+2+3")[0], 6) + self.assertEqual(expr.parseString("1+2-3")[0], 0) + self.assertEqual(expr.parseString("1-2+3")[0], 2) + self.assertEqual(expr.parseString("1-2-3")[0], -4) + # precedence overwriting via parentheses + self.assertEqual(expr.parseString("1+(2+3)")[0], 6) + self.assertEqual(expr.parseString("1+(2-3)")[0], 0) + self.assertEqual(expr.parseString("1-(2+3)")[0], -4) + self.assertEqual(expr.parseString("1-(2-3)")[0], 2) + # complicated math expressions – same as Python expressions + self.assertEqual(expr.parseString("1----3")[0], 1----3) + self.assertEqual(expr.parseString("1+2*3")[0], 1+2*3) + self.assertEqual(expr.parseString("1*2+3")[0], 1*2+3) + self.assertEqual(expr.parseString("1*2^3")[0], 1*2**3) + self.assertEqual(expr.parseString("4^3^2^1")[0], 4**3**2**1) + + def test_terminate_empty(self): + """Recursion with ``Empty`` terminates""" + empty = pp.Forward().setName('e') + empty <<= empty + pp.Empty() | pp.Empty() + self.assertParseResultsEquals(empty.parseString(""), expected_list=[]) + + def test_non_peg(self): + """Recursion works for non-PEG operators""" + expr = pp.Forward().setName('expr') + expr <<= expr + "a" ^ expr + "ab" ^ expr + "abc" ^ "." + self.assertParseResultsEquals( + expr.parseString(".abcabaabc"), + expected_list=[".", "abc", "ab", "a", "abc"] + ) + + # force clear of packrat parsing flags before saving contexts pp.ParserElement._packratEnabled = False pp.ParserElement._parse = pp.ParserElement._parseNoCache Test2_WithoutPackrat.suite_context = ppt.reset_pyparsing_context().save() Test2_WithoutPackrat.save_suite_context = ppt.reset_pyparsing_context().save() + +default_suite_context = ppt.reset_pyparsing_context().save() +pp.ParserElement.enable_left_recursion() +recursion_suite_context = ppt.reset_pyparsing_context().save() +default_suite_context.restore() -- cgit v1.2.1