From 191c56031a16a6a7910613fa2be77a670d225996 Mon Sep 17 00:00:00 2001 From: ptmcg Date: Mon, 13 Jun 2016 20:24:24 +0000 Subject: Added ParserElement.split() generator method Also fixed minor blip in originalTextFor, to inherit ignored exprs from the embedded expression. git-svn-id: svn://svn.code.sf.net/p/pyparsing/code/trunk@369 9bf210a0-9d2d-494c-87cf-cfb32e7dff7b --- src/CHANGES | 5 +++ src/pyparsing.py | 18 ++++++++++- src/unitTests.py | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 118 insertions(+), 1 deletion(-) diff --git a/src/CHANGES b/src/CHANGES index 10c5593..799cdf8 100644 --- a/src/CHANGES +++ b/src/CHANGES @@ -4,6 +4,11 @@ Change Log Verison 2.1.5 - June, 2016 ------------------------------ +- Added ParserElement.split() generator method, similar to re.split(). + Includes optional arguments maxsplit (to limit the number of splits), + and includeSeparators (to include the separating matched text in the + returned output, default=False). + - Added a new parse action construction helper tokenMap, which will apply a function and optional arguments to each element in a ParseResults. So this parse action: diff --git a/src/pyparsing.py b/src/pyparsing.py index 5fb3fbc..ec678d0 100644 --- a/src/pyparsing.py +++ b/src/pyparsing.py @@ -58,7 +58,7 @@ The pyparsing module handles some of the problems that are typically vexing when """ __version__ = "2.1.5" -__versionTime__ = "12 Jun 2016 21:53 UTC" +__versionTime__ = "13 Jun 2016 19:59 UTC" __author__ = "Paul McGuire " import string @@ -1326,6 +1326,21 @@ class ParserElement(object): # catch and re-raise exception from here, clears out pyparsing internal stack trace raise exc + def split(self, instring, maxsplit=_MAX_INT, includeSeparators=False): + """Generator method to split a string using the given expression as a separator. + May be called with optional C{maxsplit} argument, to limit the number of splits; + and the optional C{includeSeparators} argument (default=C{False}), if the separating + matching text should be included in the split results. + """ + splits = 0 + last = 0 + for t,s,e in self.scanString(instring, maxMatches=maxsplit): + yield instring[last:s] + if includeSeparators: + yield t[0] + last = e + yield instring[last:] + def __add__(self, other ): """Implementation of + operator - returns C{L{And}}""" if isinstance( other, basestring ): @@ -3525,6 +3540,7 @@ def originalTextFor(expr, asString=True): def extractText(s,l,t): t[:] = [s[t.pop('_original_start'):t.pop('_original_end')]] matchExpr.setParseAction(extractText) + matchExpr.ignoreExprs = expr.ignoreExprs return matchExpr def ungroup(expr): diff --git a/src/unitTests.py b/src/unitTests.py index 9987a58..90cbc2a 100644 --- a/src/unitTests.py +++ b/src/unitTests.py @@ -2822,6 +2822,102 @@ class HTMLStripperTest(ParseTestCase): result = read_everything.parseString(sample) assert result[0].strip() == 'Here is some sample HTML text.' +class ExprSplitterTest(ParseTestCase): + def runTest(self): + + from pyparsing import Literal, quotedString, pythonStyleComment, Empty + + expr = Literal(';') + Empty() + expr.ignore(quotedString) + expr.ignore(pythonStyleComment) + + + sample = """ + def main(): + this_semi_does_nothing(); + neither_does_this_but_there_are_spaces_afterward(); + a = "a;b"; return a # this is a comment; it has a semicolon! + + def b(): + if False: + z=1000;b("; in quotes"); c=200;return z + return ';' + + class Foo(object): + def bar(self): + '''a docstring; with a semicolon''' + a = 10; b = 11; c = 12 + + # this comment; has several; semicolons + if self.spam: + x = 12; return x # so; does; this; one + x = 15;;; y += x; return y + + def baz(self): + return self.bar + """ + expected = [ + [' this_semi_does_nothing()', ''], + [' neither_does_this_but_there_are_spaces_afterward()', ''], + [' a = "a;b"', 'return a # this is a comment; it has a semicolon!'], + [' z=1000', 'b("; in quotes")', 'c=200', 'return z'], + [" return ';'"], + [" '''a docstring; with a semicolon'''"], + [' a = 10', 'b = 11', 'c = 12'], + [' # this comment; has several; semicolons'], + [' x = 12', 'return x # so; does; this; one'], + [' x = 15', '', '', 'y += x', 'return y'], + ] + + exp_iter = iter(expected) + for line in filter(lambda ll: ';' in ll, sample.splitlines()): + print_(str(list(expr.split(line)))+',') + assert list(expr.split(line)) == next(exp_iter), "invalid split on expression" + + print_() + + expected = [ + [' this_semi_does_nothing()', ';', ''], + [' neither_does_this_but_there_are_spaces_afterward()', ';', ''], + [' a = "a;b"', ';', 'return a # this is a comment; it has a semicolon!'], + [' z=1000', ';', 'b("; in quotes")', ';', 'c=200', ';', 'return z'], + [" return ';'"], + [" '''a docstring; with a semicolon'''"], + [' a = 10', ';', 'b = 11', ';', 'c = 12'], + [' # this comment; has several; semicolons'], + [' x = 12', ';', 'return x # so; does; this; one'], + [' x = 15', ';', '', ';', '', ';', 'y += x', ';', 'return y'], + ] + exp_iter = iter(expected) + for line in filter(lambda ll: ';' in ll, sample.splitlines()): + print_(str(list(expr.split(line, includeSeparators=True)))+',') + assert list(expr.split(line, includeSeparators=True)) == next(exp_iter), "invalid split on expression" + + print_() + + + expected = [ + [' this_semi_does_nothing()', ''], + [' neither_does_this_but_there_are_spaces_afterward()', ''], + [' a = "a;b"', 'return a # this is a comment; it has a semicolon!'], + [' z=1000', 'b("; in quotes"); c=200;return z'], + [' a = 10', 'b = 11; c = 12'], + [' x = 12', 'return x # so; does; this; one'], + [' x = 15', ';; y += x; return y'], + ] + exp_iter = iter(expected) + for line in sample.splitlines(): + pieces = list(expr.split(line, maxsplit=1)) + print_(str(pieces)+',') + if len(pieces) == 2: + exp = next(exp_iter) + assert pieces == exp, "invalid split on expression with maxSplits=1" + elif len(pieces) == 1: + assert len(expr.searchString(line)) == 0, "invalid split with maxSplits=1 when expr not present" + else: + print_("\n>>> " + line) + assert False, "invalid split on expression with maxSplits=1, corner case" + class MiscellaneousParserTests(ParseTestCase): def runTest(self): -- cgit v1.2.1