Added stopOn arg to ZeroOrMore and OneOrMore; refactored/cleaned up SkipTo to reduce exception-based flow of control (using new ParserElement.canParseNext); made toklist a default arg so ParseResults can now be constructed using an empty constructor; fleshed out some docstrings with parameter lists

git-svn-id: svn://svn.code.sf.net/p/pyparsing/code/trunk@322 9bf210a0-9d2d-494c-87cf-cfb32e7dff7b
author: ptmcg <ptmcg@9bf210a0-9d2d-494c-87cf-cfb32e7dff7b> 2016-02-07 20:15:34 +0000
committer: ptmcg <ptmcg@9bf210a0-9d2d-494c-87cf-cfb32e7dff7b> 2016-02-07 20:15:34 +0000
commit: c270b98ab3a0ab22ad5e2b2ccf42b83245c9ff4d (patch)
tree: 8466121b7a9332b2c76a158f0f7b08ad662cc5eb
parent: 710c36de0f37ab0acc73142687665dcbdc5d3860 (diff)
download: pyparsing-c270b98ab3a0ab22ad5e2b2ccf42b83245c9ff4d.tar.gz
2 files changed, 152 insertions, 97 deletions
diff --git a/src/CHANGES b/src/CHANGES
index 6f9ea39..b88afe0 100644
--- a/src/CHANGES
+++ b/src/CHANGES
@@ -20,6 +20,30 @@ Version 2.1.0 - February, 2016
   all protocols 0-4 are supported. Thanks for reporting this on StackOverflow,
   Arne Wolframm, and for providing a nice simple test case!
 
+- Added optional 'stopOn' argument to ZeroOrMore and OneOrMore, to
+  simplify breaking on stop tokens that would match the repetition 
+  expression. 
+  
+  It is a common problem to fail to look ahead when matching repetitive 
+  tokens if the sentinel at the end also matches the repetition
+  expression, as when parsing "BEGIN aaa bbb ccc END" with:
+  
+    "BEGIN" + OneOrMore(Word(alphas)) + "END"
+
+  Since "END" matches the repetition expression "Word(alphas)", it will
+  never get parsed as the terminating sentinel. Up until now, this has 
+  to be resolved by the user inserting their own negative lookahead:
+  
+    "BEGIN" + OneOrMore(~Literal("END") + Word(alphas)) + "END"
+  
+  Using stopOn, they can more easily write:
+  
+    "BEGIN" + OneOrMore(Word(alphas), stopOn="END") + "END"
+  
+  The stopOn argument can be a literal string or a pyparsing expression.
+  Inspired by a question by Lamakaha on StackOverflow (and many previous
+  questions with the same negative-lookahead resolution).
+
 - Added expression names for many internal and builtin expressions, to 
   reduce name and error message overhead during parsing.
 
diff --git a/src/pyparsing.py b/src/pyparsing.py
index 3139bd9..8fd5d74 100644
--- a/src/pyparsing.py
+++ b/src/pyparsing.py
@@ -58,7 +58,7 @@ The pyparsing module handles some of the problems that are typically vexing when
 """
 
 __version__ = "2.1.0"
-__versionTime__ = "6 Feb 2016 18:52"
+__versionTime__ = "7 Feb 2016 14:09"
 __author__ = "Paul McGuire <ptmcg@users.sourceforge.net>"
 
 import string
@@ -154,7 +154,7 @@ def _xml_escape(data):
 class _Constants(object):
     pass
 
-alphas = string.ascii_lowercase + string.ascii_uppercase
+alphas     = string.ascii_uppercase + string.ascii_lowercase
 nums       = "0123456789"
 hexnums    = nums + "ABCDEFabcdef"
 alphanums  = alphas + nums
@@ -267,7 +267,7 @@ class ParseResults(object):
        - by list index (C{results[0], results[1]}, etc.)
        - by attribute (C{results.<resultsName>})
        """
-    def __new__(cls, toklist, name=None, asList=True, modal=True ):
+    def __new__(cls, toklist=None, name=None, asList=True, modal=True ):
         if isinstance(toklist, cls):
             return toklist
         retobj = object.__new__(cls)
@@ -276,7 +276,7 @@ class ParseResults(object):
 
     # Performance tuning: we construct a *lot* of these, so keep this
     # constructor as small and fast as possible
-    def __init__( self, toklist, name=None, asList=True, modal=True, isinstance=isinstance ):
+    def __init__( self, toklist=None, name=None, asList=True, modal=True, isinstance=isinstance ):
         if self.__doinit:
             self.__doinit = False
             self.__name = None
@@ -284,6 +284,8 @@ class ParseResults(object):
             self.__accumNames = {}
             self.__asList = asList
             self.__modal = modal
+            if toklist is None:
+                toklist = []
             if isinstance(toklist, list):
                 self.__toklist = toklist[:]
             elif isinstance(toklist, _generatorType):
@@ -365,7 +367,7 @@ class ParseResults(object):
         return k in self.__tokdict
 
     def __len__( self ): return len( self.__toklist )
-    def __bool__(self): return len( self.__toklist ) > 0
+    def __bool__(self): return ( not not self.__toklist )
     __nonzero__ = __bool__
     def __iter__( self ): return iter( self.__toklist )
     def __reversed__( self ): return iter( self.__toklist[::-1] )
@@ -1058,6 +1060,14 @@ class ParserElement(object):
             return self._parse( instring, loc, doActions=False )[0]
         except ParseFatalException:
             raise ParseException( instring, loc, self.errmsg, self)
+    
+    def canParseNext(self, instring, loc):
+        try:
+            self.tryParse(instring, loc)
+        except (ParseException, IndexError):
+            return False
+        else:
+            return True
 
     # this method gets repeatedly called during backtracking with the same arguments -
     # we can cache these arguments and save ourselves the trouble of re-parsing the contained expression
@@ -1864,7 +1874,7 @@ class Regex(Token):
         super(Regex,self).__init__()
 
         if isinstance(pattern, basestring):
-            if len(pattern) == 0:
+            if not pattern:
                 warnings.warn("null string passed to Regex; use Empty() instead",
                         SyntaxWarning, stacklevel=2)
 
@@ -1935,7 +1945,7 @@ class QuotedString(Token):
 
         # remove white space from quote chars - wont work anyway
         quoteChar = quoteChar.strip()
-        if len(quoteChar) == 0:
+        if not quoteChar:
             warnings.warn("quoteChar cannot be the empty string",SyntaxWarning,stacklevel=2)
             raise SyntaxError()
 
@@ -1943,7 +1953,7 @@ class QuotedString(Token):
             endQuoteChar = quoteChar
         else:
             endQuoteChar = endQuoteChar.strip()
-            if len(endQuoteChar) == 0:
+            if not endQuoteChar:
                 warnings.warn("endQuoteChar cannot be the empty string",SyntaxWarning,stacklevel=2)
                 raise SyntaxError()
 
@@ -2612,16 +2622,14 @@ class Each(ParseExpression):
             tmpExprs = tmpReqd + tmpOpt + self.multioptionals + self.multirequired
             failed = []
             for e in tmpExprs:
-                try:
-                    tmpLoc = e.tryParse( instring, tmpLoc )
-                except ParseException:
-                    failed.append(e)
-                else:
+                if e.canParseNext(instring, tmpLoc):
                     matchOrder.append(self.opt1map.get(id(e),e))
                     if e in tmpReqd:
                         tmpReqd.remove(e)
                     elif e in tmpOpt:
                         tmpOpt.remove(e)
+                else:
+                    failed.append(e)
             if len(failed) == len(tmpExprs):
                 keepMatching = False
 
@@ -2637,7 +2645,7 @@ class Each(ParseExpression):
             loc,results = e._parse(instring,loc,doActions)
             resultlist.append(results)
 
-        finalResults = ParseResults([])
+        finalResults = ParseResults()
         for r in resultlist:
             dups = {}
             for k in r.keys():
@@ -2765,11 +2773,7 @@ class NotAny(ParseElementEnhance):
         self.errmsg = "Found unwanted token, "+_ustr(self.expr)
 
     def parseImpl( self, instring, loc, doActions=True ):
-        try:
-            self.expr.tryParse( instring, loc )
-        except (ParseException,IndexError):
-            pass
-        else:
+        if self.expr.canParseNext(instring, loc):
             raise ParseException(instring, loc, self.errmsg, self)
         return loc, []
 
@@ -2783,23 +2787,44 @@ class NotAny(ParseElementEnhance):
         return self.strRepr
 
 
-class ZeroOrMore(ParseElementEnhance):
-    """Optional repetition of zero or more of the given expression."""
-    def __init__( self, expr ):
-        super(ZeroOrMore,self).__init__(expr)
-        self.mayReturnEmpty = True
+class OneOrMore(ParseElementEnhance):
+    """Repetition of one or more of the given expression.
+    
+       Parameters:
+        - expr - expression that must match one or more times
+        - stopOn - (default=None) - expression for a terminating sentinel
+          (only required if the sentinel would ordinarily match the repetition 
+          expression)          
+    """
+    def __init__( self, expr, stopOn=None):
+        super(OneOrMore, self).__init__(expr)
+        ender = stopOn
+        if isinstance(ender, basestring):
+            ender = Literal(ender)
+        self.not_ender = ~ender if ender is not None else None
 
     def parseImpl( self, instring, loc, doActions=True ):
-        tokens = []
+        self_expr_parse = self.expr._parse
+        self_skip_ignorables = self._skipIgnorables
+        check_ender = self.not_ender is not None
+        if check_ender:
+            try_not_ender = self.not_ender.tryParse
+        
+        # must be at least one (but first see if we are the stopOn sentinel;
+        # if so, fail)
+        if check_ender:
+            try_not_ender(instring, loc)
+        loc, tokens = self_expr_parse( instring, loc, doActions, callPreParse=False )
         try:
-            loc, tokens = self.expr._parse( instring, loc, doActions, callPreParse=False )
-            hasIgnoreExprs = ( len(self.ignoreExprs) > 0 )
+            hasIgnoreExprs = (not not self.ignoreExprs)
             while 1:
+                if check_ender:
+                    try_not_ender(instring, loc)
                 if hasIgnoreExprs:
-                    preloc = self._skipIgnorables( instring, loc )
+                    preloc = self_skip_ignorables( instring, loc )
                 else:
                     preloc = loc
-                loc, tmptokens = self.expr._parse( instring, preloc, doActions )
+                loc, tmptokens = self_expr_parse( instring, preloc, doActions )
                 if tmptokens or tmptokens.haskeys():
                     tokens += tmptokens
         except (ParseException,IndexError):
@@ -2812,50 +2837,43 @@ class ZeroOrMore(ParseElementEnhance):
             return self.name
 
         if self.strRepr is None:
-            self.strRepr = "[" + _ustr(self.expr) + "]..."
+            self.strRepr = "{" + _ustr(self.expr) + "}..."
 
         return self.strRepr
 
     def setResultsName( self, name, listAllMatches=False ):
-        ret = super(ZeroOrMore,self).setResultsName(name,listAllMatches)
+        ret = super(OneOrMore,self).setResultsName(name,listAllMatches)
         ret.saveAsList = True
         return ret
 
-
-class OneOrMore(ParseElementEnhance):
-    """Repetition of one or more of the given expression."""
+class ZeroOrMore(OneOrMore):
+    """Optional repetition of zero or more of the given expression.
+    
+       Parameters:
+        - expr - expression that must match zero or more times
+        - stopOn - (default=None) - expression for a terminating sentinel
+          (only required if the sentinel would ordinarily match the repetition 
+          expression)          
+    """
+    def __init__( self, expr, stopOn=None):
+        super(ZeroOrMore,self).__init__(expr, stopOn=stopOn)
+        self.mayReturnEmpty = True
+        
     def parseImpl( self, instring, loc, doActions=True ):
-        # must be at least one
-        loc, tokens = self.expr._parse( instring, loc, doActions, callPreParse=False )
         try:
-            hasIgnoreExprs = ( len(self.ignoreExprs) > 0 )
-            while 1:
-                if hasIgnoreExprs:
-                    preloc = self._skipIgnorables( instring, loc )
-                else:
-                    preloc = loc
-                loc, tmptokens = self.expr._parse( instring, preloc, doActions )
-                if tmptokens or tmptokens.haskeys():
-                    tokens += tmptokens
+            return super(ZeroOrMore, self).parseImpl(instring, loc, doActions)
         except (ParseException,IndexError):
-            pass
-
-        return loc, tokens
+            return loc, []
 
     def __str__( self ):
         if hasattr(self,"name"):
             return self.name
 
         if self.strRepr is None:
-            self.strRepr = "{" + _ustr(self.expr) + "}..."
+            self.strRepr = "[" + _ustr(self.expr) + "]..."
 
         return self.strRepr
 
-    def setResultsName( self, name, listAllMatches=False ):
-        ret = super(OneOrMore,self).setResultsName(name,listAllMatches)
-        ret.saveAsList = True
-        return ret
-
 class _NullToken(object):
     def __bool__(self):
         return False
@@ -2866,8 +2884,11 @@ class _NullToken(object):
 _optionalNotMatched = _NullToken()
 class Optional(ParseElementEnhance):
     """Optional matching of the given expression.
-       A default return string can also be specified, if the optional expression
-       is not found.
+
+       Parameters:
+        - expr - expression that must match zero or more times
+        - default (optional) - value to be returned if the optional expression
+          is not found.
     """
     def __init__( self, expr, default=_optionalNotMatched ):
         super(Optional,self).__init__( expr, savelist=False )
@@ -2897,13 +2918,18 @@ class Optional(ParseElementEnhance):
 
         return self.strRepr
 
-
 class SkipTo(ParseElementEnhance):
     """Token for skipping over all undefined text until the matched expression is found.
-       If C{include} is set to true, the matched expression is also parsed (the skipped text
-       and matched expression are returned as a 2-element list).  The C{ignore}
-       argument is used to define grammars (typically quoted strings and comments) that
-       might contain false matches.
+
+       Parameters:
+        - expr - target expression marking the end of the data to be skipped
+        - include - (default=False) if True, the target expression is also parsed 
+          (the skipped text and target expression are returned as a 2-element list).
+        - ignore - (default=None) used to define grammars (typically quoted strings and 
+          comments) that might contain false matches to the target expression
+        - failOn - (default=None) define expressions that are not allowed to be 
+          included in the skipped test; if found before the target expression is found, 
+          the SkipTo is not a match
     """
     def __init__( self, other, include=False, ignore=None, failOn=None ):
         super( SkipTo, self ).__init__( other )
@@ -2919,46 +2945,51 @@ class SkipTo(ParseElementEnhance):
         self.errmsg = "No match found for "+_ustr(self.expr)
 
     def parseImpl( self, instring, loc, doActions=True ):
-        startLoc = loc
+        startloc = loc
         instrlen = len(instring)
         expr = self.expr
-        failParse = False
-        while loc <= instrlen:
-            try:
-                if self.failOn:
+        expr_parse = self.expr._parse
+        self_failOn_canParseNext = self.failOn.canParseNext if self.failOn is not None else None
+        self_ignoreExpr_tryParse = self.ignoreExpr.tryParse if self.ignoreExpr is not None else None
+        
+        tmploc = loc
+        while tmploc <= instrlen:
+            if self_failOn_canParseNext is not None:
+                # break if failOn expression matches
+                if self_failOn.canParseNext(instring, tmploc):
+                    break
+                    
+            if self_ignoreExpr_tryParse is not None:
+                # advance past ignore expressions
+                while 1:
                     try:
-                        self.failOn.tryParse(instring, loc)
+                        tmploc = self_ignoreExpr_tryParse(instring, tmploc)
                     except ParseBaseException:
-                        pass
-                    else:
-                        failParse = True
-                        raise ParseException(instring, loc, "Found expression " + str(self.failOn))
-                    failParse = False
-                if self.ignoreExpr is not None:
-                    while 1:
-                        try:
-                            loc = self.ignoreExpr.tryParse(instring,loc)
-                            # print("found ignoreExpr, advance to", loc)
-                        except ParseBaseException:
-                            break
-                expr._parse( instring, loc, doActions=False, callPreParse=False )
-                skipText = instring[startLoc:loc]
-                if self.includeMatch:
-                    loc,mat = expr._parse(instring,loc,doActions,callPreParse=False)
-                    if mat:
-                        skipRes = ParseResults( skipText )
-                        skipRes += mat
-                        return loc, [ skipRes ]
-                    else:
-                        return loc, [ skipText ]
-                else:
-                    return loc, [ skipText ]
-            except (ParseException,IndexError):
-                if failParse:
-                    raise
-                else:
-                    loc += 1
-        raise ParseException(instring, loc, self.errmsg, self)
+                        break
+            
+            try:
+                expr_parse(instring, tmploc, doActions=False, callPreParse=False)
+            except (ParseException, IndexError):
+                # no match, advance loc in string
+                tmploc += 1
+            else:
+                # matched skipto expr, done
+                break
+
+        else:
+            # ran off the end of the input string without matching skipto expr, fail
+            raise ParseException(instring, loc, self.errmsg, self)
+
+        # build up return values
+        loc = tmploc
+        skiptext = instring[startloc:loc]
+        skipresult = ParseResults(skiptext)
+        
+        if self.includeMatch:
+            loc, mat = expr_parse(instring,loc,doActions,callPreParse=False)
+            skipresult += mat
+
+        return loc, skipresult
 
 class Forward(ParseElementEnhance):
     """Forward declaration of an expression to be defined later -
author	ptmcg <ptmcg@9bf210a0-9d2d-494c-87cf-cfb32e7dff7b>	2016-02-07 20:15:34 +0000
committer	ptmcg <ptmcg@9bf210a0-9d2d-494c-87cf-cfb32e7dff7b>	2016-02-07 20:15:34 +0000
commit	c270b98ab3a0ab22ad5e2b2ccf42b83245c9ff4d (patch)
tree	8466121b7a9332b2c76a158f0f7b08ad662cc5eb
parent	710c36de0f37ab0acc73142687665dcbdc5d3860 (diff)
download	pyparsing-c270b98ab3a0ab22ad5e2b2ccf42b83245c9ff4d.tar.gz