diff options
author | ptmcg <ptmcg@austin.rr.com> | 2018-12-13 00:37:09 -0600 |
---|---|---|
committer | ptmcg <ptmcg@austin.rr.com> | 2018-12-13 00:37:09 -0600 |
commit | 569966d06a0c0ea25843d84c7559f9e797346acc (patch) | |
tree | 77ab3fbbc24905a85f67b2ee246a5b6c899eb493 /pyparsing.py | |
parent | 20d621d6d3bf95f311a8070d7e1b2ca21db81025 (diff) | |
download | pyparsing-git-569966d06a0c0ea25843d84c7559f9e797346acc.tar.gz |
Issue #51: Expanded the whitespace characters recognized by the White class to include all Unicode-defined space characters; added docstrings to the Unicode range classes
Diffstat (limited to 'pyparsing.py')
-rw-r--r-- | pyparsing.py | 69 |
1 files changed, 53 insertions, 16 deletions
diff --git a/pyparsing.py b/pyparsing.py index 711e0d0..eab4c9f 100644 --- a/pyparsing.py +++ b/pyparsing.py @@ -94,7 +94,7 @@ classes inherit from. Use the docstrings for examples of how to: """ __version__ = "2.3.1" -__versionTime__ = "22 Nov 2018 07:07 UTC" +__versionTime__ = "13 Dec 2018 06:25 UTC" __author__ = "Paul McGuire <ptmcg@users.sourceforge.net>" import string @@ -515,13 +515,13 @@ class ParseResults(object): if PY_3: keys = _iterkeys - """Returns an iterator of all named result keys (Python 3.x only).""" + """Returns an iterator of all named result keys.""" values = _itervalues - """Returns an iterator of all named result values (Python 3.x only).""" + """Returns an iterator of all named result values.""" items = _iteritems - """Returns an iterator of all named result key-value tuples (Python 3.x only).""" + """Returns an iterator of all named result key-value tuples.""" else: iterkeys = _iterkeys @@ -1213,7 +1213,7 @@ class ParserElement(object): self.resultsName = None self.saveAsList = savelist self.skipWhitespace = True - self.whiteChars = ParserElement.DEFAULT_WHITE_CHARS + self.whiteChars = set(ParserElement.DEFAULT_WHITE_CHARS) self.copyDefaultWhiteChars = True self.mayReturnEmpty = False # used when checking for left-recursion self.keepTabs = False @@ -1391,8 +1391,9 @@ class ParserElement(object): msg = kwargs.get("message", "failed user-defined condition") exc_type = ParseFatalException if kwargs.get("fatal", False) else ParseException for fn in fns: + fn = _trim_arity(fn) def pa(s,l,t): - if not bool(_trim_arity(fn)(s,l,t)): + if not bool(fn(s,l,t)): raise exc_type(s,l,msg) self.parseAction.append(pa) self.callDuringTry = self.callDuringTry or kwargs.get("callDuringTry", False) @@ -2248,7 +2249,7 @@ class ParserElement(object): The output shown is that produced by the default debug actions - custom debug actions can be specified using :class:`setDebugActions`. 
Prior to attempting to match the ``wd`` expression, the debugging message ``"Match <exprname> at loc <n>(<line>,<col>)"`` - is shown. Then if the parse succeeds, a ``"Matched"` message is shown, or an ``"Exception raised"`` + is shown. Then if the parse succeeds, a ``"Matched"`` message is shown, or an ``"Exception raised"`` message is shown. Also note the use of :class:`setName` to assign a human-readable name to the expression, which makes debugging and exception messages easier to understand - for instance, the default name created for the :class:`Word` expression without calling ``setName`` is ``"W:(ABCD...)"``. @@ -2968,16 +2969,16 @@ class Regex(Token): raise ParseException(instring, loc, self.errmsg, self) loc = result.end() - d = result.groupdict() if self.asMatch: ret = result elif self.asGroupList: ret = result.groups() else: ret = ParseResults(result.group()) + d = result.groupdict() if d: - for k in d: - ret[k] = d[k] + for k, v in d.items(): + ret[k] = v return loc,ret def __str__( self ): @@ -3258,11 +3259,29 @@ class White(Token): :class:`Word` class. 
""" whiteStrs = { - " " : "<SPC>", - "\t": "<TAB>", - "\n": "<LF>", - "\r": "<CR>", - "\f": "<FF>", + ' ' : '<SP>', + '\t': '<TAB>', + '\n': '<LF>', + '\r': '<CR>', + '\f': '<FF>', + 'u\00A0': '<NBSP>', + 'u\1680': '<OGHAM_SPACE_MARK>', + 'u\180E': '<MONGOLIAN_VOWEL_SEPARATOR>', + 'u\2000': '<EN_QUAD>', + 'u\2001': '<EM_QUAD>', + 'u\2002': '<EN_SPACE>', + 'u\2003': '<EM_SPACE>', + 'u\2004': '<THREE-PER-EM_SPACE>', + 'u\2005': '<FOUR-PER-EM_SPACE>', + 'u\2006': '<SIX-PER-EM_SPACE>', + 'u\2007': '<FIGURE_SPACE>', + 'u\2008': '<PUNCTUATION_SPACE>', + 'u\2009': '<THIN_SPACE>', + 'u\200A': '<HAIR_SPACE>', + 'u\200B': '<ZERO_WIDTH_SPACE>', + 'u\202F': '<NNBSP>', + 'u\205F': '<MMSP>', + 'u\3000': '<IDEOGRAPHIC_SPACE>', } def __init__(self, ws=" \t\r\n", min=1, max=0, exact=0): super(White,self).__init__() @@ -6112,6 +6131,8 @@ class pyparsing_common: class _lazyclassproperty(object): def __init__(self, fn): self.fn = fn + self.__doc__ = fn.__doc__ + self.__name__ = fn.__name__ def __get__(self, obj, cls): if cls is None: @@ -6178,15 +6199,19 @@ class pyparsing_unicode(unicode_set): _ranges = [(32, sys.maxunicode)] class Latin1(unicode_set): + "Unicode set for Latin-1 Unicode Character Range" _ranges = [(0x0020, 0x007e), (0x00a0, 0x00ff),] class LatinA(unicode_set): + "Unicode set for Latin-A Unicode Character Range" _ranges = [(0x0100, 0x017f),] class LatinB(unicode_set): + "Unicode set for Latin-B Unicode Character Range" _ranges = [(0x0180, 0x024f),] class Greek(unicode_set): + "Unicode set for Greek Unicode Character Ranges" _ranges = [ (0x0370, 0x03ff), (0x1f00, 0x1f15), (0x1f18, 0x1f1d), (0x1f20, 0x1f45), (0x1f48, 0x1f4d), (0x1f50, 0x1f57), (0x1f59,), (0x1f5b,), (0x1f5d,), (0x1f5f, 0x1f7d), (0x1f80, 0x1fb4), (0x1fb6, 0x1fc4), @@ -6194,39 +6219,51 @@ class pyparsing_unicode(unicode_set): ] class Cyrillic(unicode_set): + "Unicode set for Cyrillic Unicode Character Range" _ranges = [(0x0400, 0x04ff)] class Chinese(unicode_set): + "Unicode set for Chinese Unicode 
Character Range" _ranges = [(0x4e00, 0x9fff), (0x3000, 0x303f), ] class Japanese(unicode_set): - _ranges = [ ] # sum of Kanji, Hiragana, and Katakana ranges + "Unicode set for Japanese Unicode Character Range, combining Kanji, Hiragana, and Katakana ranges" + _ranges = [ ] class Kanji(unicode_set): + "Unicode set for Kanji Unicode Character Range" _ranges = [(0x4E00, 0x9Fbf), (0x3000, 0x303f), ] class Hiragana(unicode_set): + "Unicode set for Hiragana Unicode Character Range" _ranges = [(0x3040, 0x309f), ] class Katakana(unicode_set): + "Unicode set for Katakana Unicode Character Range" _ranges = [(0x30a0, 0x30ff), ] class Korean(unicode_set): + "Unicode set for Korean Unicode Character Range" _ranges = [(0xac00, 0xd7af), (0x1100, 0x11ff), (0x3130, 0x318f), (0xa960, 0xa97f), (0xd7b0, 0xd7ff), (0x3000, 0x303f), ] class CJK(Chinese, Japanese, Korean): + "Unicode set for combined Chinese, Japanese, and Korean (CJK) Unicode Character Range" pass class Thai(unicode_set): + "Unicode set for Thai Unicode Character Range" _ranges = [(0x0e01, 0x0e3a), (0x0e3f, 0x0e5b), ] class Arabic(unicode_set): + "Unicode set for Arabic Unicode Character Range" _ranges = [(0x0600, 0x061b), (0x061e, 0x06ff), (0x0700, 0x077f), ] class Hebrew(unicode_set): + "Unicode set for Hebrew Unicode Character Range" _ranges = [(0x0590, 0x05ff), ] class Devanagari(unicode_set): + "Unicode set for Devanagari Unicode Character Range" _ranges = [(0x0900, 0x097f), (0xa8e0, 0xa8ff)] pyparsing_unicode.Japanese._ranges = (pyparsing_unicode.Japanese.Kanji._ranges |