Added ParserElement.split() generator method

Also fixed minor blip in originalTextFor, to inherit ignored exprs from the embedded expression.

git-svn-id: svn://svn.code.sf.net/p/pyparsing/code/trunk@369 9bf210a0-9d2d-494c-87cf-cfb32e7dff7b
author: ptmcg <ptmcg@9bf210a0-9d2d-494c-87cf-cfb32e7dff7b> 2016-06-13 20:24:24 +0000
committer: ptmcg <ptmcg@9bf210a0-9d2d-494c-87cf-cfb32e7dff7b> 2016-06-13 20:24:24 +0000
commit: 191c56031a16a6a7910613fa2be77a670d225996 (patch)
tree: a3d34868cf4a29037f012362869144457dfdf611
parent: ec812fbba1eba0eca156b7d740a367867a9102f7 (diff)
download: pyparsing-191c56031a16a6a7910613fa2be77a670d225996.tar.gz
3 files changed, 118 insertions, 1 deletions
diff --git a/src/CHANGES b/src/CHANGES
index 10c5593..799cdf8 100644
--- a/src/CHANGES
+++ b/src/CHANGES
@@ -4,6 +4,11 @@ Change Log
 
 Verison 2.1.5 - June, 2016
 ------------------------------
+- Added ParserElement.split() generator method, similar to re.split(). 
+  Includes optional arguments maxsplit (to limit the number of splits),
+  and includeSeparators (to include the separating matched text in the 
+  returned output, default=False).
+
 - Added a new parse action construction helper tokenMap, which will
   apply a function and optional arguments to each element in a 
   ParseResults. So this parse action:
diff --git a/src/pyparsing.py b/src/pyparsing.py
index 5fb3fbc..ec678d0 100644
--- a/src/pyparsing.py
+++ b/src/pyparsing.py
@@ -58,7 +58,7 @@ The pyparsing module handles some of the problems that are typically vexing when
 """
 
 __version__ = "2.1.5"
-__versionTime__ = "12 Jun 2016 21:53 UTC"
+__versionTime__ = "13 Jun 2016 19:59 UTC"
 __author__ = "Paul McGuire <ptmcg@users.sourceforge.net>"
 
 import string
@@ -1326,6 +1326,21 @@ class ParserElement(object):
                 # catch and re-raise exception from here, clears out pyparsing internal stack trace
                 raise exc
 
+    def split(self, instring, maxsplit=_MAX_INT, includeSeparators=False):
+        """Generator method to split a string using the given expression as a separator.
+           May be called with optional C{maxsplit} argument, to limit the number of splits;
+           and the optional C{includeSeparators} argument (default=C{False}), if the separating
+           matching text should be included in the split results (as the first matched token).
+        """
+        last = 0  # offset just past the previous separator match
+        for t,s,e in self.scanString(instring, maxMatches=maxsplit):
+            yield instring[last:s]  # text between the previous separator and this one
+            if includeSeparators:
+                yield t[0]  # only the first token of the separator match is emitted
+            last = e
+        # whatever follows the last separator; the whole string when nothing matched
+        yield instring[last:]
+
     def __add__(self, other ):
         """Implementation of + operator - returns C{L{And}}"""
         if isinstance( other, basestring ):
@@ -3525,6 +3540,7 @@ def originalTextFor(expr, asString=True):
         def extractText(s,l,t):
             t[:] = [s[t.pop('_original_start'):t.pop('_original_end')]]
     matchExpr.setParseAction(extractText)
+    matchExpr.ignoreExprs = expr.ignoreExprs
     return matchExpr
 
 def ungroup(expr): 
diff --git a/src/unitTests.py b/src/unitTests.py
index 9987a58..90cbc2a 100644
--- a/src/unitTests.py
+++ b/src/unitTests.py
@@ -2822,6 +2822,102 @@ class HTMLStripperTest(ParseTestCase):
         result = read_everything.parseString(sample)
         assert result[0].strip() == 'Here is some sample HTML text.'
 
+class ExprSplitterTest(ParseTestCase):
+    def runTest(self):
+        # Exercises ParserElement.split() in three modes: default, includeSeparators=True, maxsplit=1.
+        from pyparsing import Literal, quotedString, pythonStyleComment, Empty
+        # separator: a ';' that is not inside a quoted string or a '#' comment (both are ignored)
+        expr = Literal(';') + Empty()
+        expr.ignore(quotedString)
+        expr.ignore(pythonStyleComment)
+        
+        # fixture: Python-like source text; the cases below split it one line at a time
+        sample = """
+        def main():
+            this_semi_does_nothing();
+            neither_does_this_but_there_are_spaces_afterward();   
+            a = "a;b"; return a # this is a comment; it has a semicolon!
+
+        def b():
+            if False:
+                z=1000;b("; in quotes");  c=200;return z
+            return ';'
+
+        class Foo(object):
+            def bar(self):
+                '''a docstring; with a semicolon'''
+                a = 10; b = 11; c = 12
+                
+                # this comment; has several; semicolons
+                if self.spam:
+                    x = 12; return x # so; does; this; one
+                    x = 15;;; y += x; return y
+
+            def baz(self):
+                return self.bar
+        """
+        expected = [
+            ['            this_semi_does_nothing()', ''],
+            ['            neither_does_this_but_there_are_spaces_afterward()', ''],
+            ['            a = "a;b"', 'return a # this is a comment; it has a semicolon!'],
+            ['                z=1000', 'b("; in quotes")', 'c=200', 'return z'],
+            ["            return ';'"],
+            ["                '''a docstring; with a semicolon'''"],
+            ['                a = 10', 'b = 11', 'c = 12'],
+            ['                # this comment; has several; semicolons'],
+            ['                    x = 12', 'return x # so; does; this; one'],
+            ['                    x = 15', '', '', 'y += x', 'return y'],
+            ]
+        # case 1: default arguments - separators are dropped; one expected entry per ';'-containing line
+        exp_iter = iter(expected)
+        for line in filter(lambda ll: ';' in ll, sample.splitlines()):
+            print_(str(list(expr.split(line)))+',')
+            assert list(expr.split(line)) == next(exp_iter), "invalid split on expression"
+
+        print_()
+        # case 2: includeSeparators=True - each matched ';' is emitted between the surrounding pieces
+        expected = [
+            ['            this_semi_does_nothing()', ';', ''],
+            ['            neither_does_this_but_there_are_spaces_afterward()', ';', ''],
+            ['            a = "a;b"', ';', 'return a # this is a comment; it has a semicolon!'],
+            ['                z=1000', ';', 'b("; in quotes")', ';', 'c=200', ';', 'return z'],
+            ["            return ';'"],
+            ["                '''a docstring; with a semicolon'''"],
+            ['                a = 10', ';', 'b = 11', ';', 'c = 12'],
+            ['                # this comment; has several; semicolons'],
+            ['                    x = 12', ';', 'return x # so; does; this; one'],
+            ['                    x = 15', ';', '', ';', '', ';', 'y += x', ';', 'return y'],
+            ]
+        exp_iter = iter(expected)
+        for line in filter(lambda ll: ';' in ll, sample.splitlines()):
+            print_(str(list(expr.split(line, includeSeparators=True)))+',')
+            assert list(expr.split(line, includeSeparators=True)) == next(exp_iter), "invalid split on expression"
+
+        print_()
+
+        # case 3: maxsplit=1 over every line - only lines that yield two pieces consume an expected entry
+        expected = [
+            ['            this_semi_does_nothing()', ''],
+            ['            neither_does_this_but_there_are_spaces_afterward()', ''],
+            ['            a = "a;b"', 'return a # this is a comment; it has a semicolon!'],
+            ['                z=1000', 'b("; in quotes");  c=200;return z'],
+            ['                a = 10', 'b = 11; c = 12'],
+            ['                    x = 12', 'return x # so; does; this; one'],
+            ['                    x = 15', ';; y += x; return y'],
+            ]
+        exp_iter = iter(expected)
+        for line in sample.splitlines():
+            pieces = list(expr.split(line, maxsplit=1))
+            print_(str(pieces)+',')
+            if len(pieces) == 2:
+                exp = next(exp_iter)
+                assert pieces == exp, "invalid split on expression with maxSplits=1"
+            elif len(pieces) == 1:
+                assert len(expr.searchString(line)) == 0, "invalid split with maxSplits=1 when expr not present"
+            else:
+                print_("\n>>> " + line)
+                assert False, "invalid split on expression with maxSplits=1, corner case"
+
 
 class MiscellaneousParserTests(ParseTestCase):
     def runTest(self):
author	ptmcg <ptmcg@9bf210a0-9d2d-494c-87cf-cfb32e7dff7b>	2016-06-13 20:24:24 +0000
committer	ptmcg <ptmcg@9bf210a0-9d2d-494c-87cf-cfb32e7dff7b>	2016-06-13 20:24:24 +0000
commit	191c56031a16a6a7910613fa2be77a670d225996 (patch)
tree	a3d34868cf4a29037f012362869144457dfdf611
parent	ec812fbba1eba0eca156b7d740a367867a9102f7 (diff)
download	pyparsing-191c56031a16a6a7910613fa2be77a670d225996.tar.gz