summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Paul McGuire <ptmcg@austin.rr.com> 2018-11-17 19:09:54 -0600
committer Paul McGuire <ptmcg@austin.rr.com> 2018-11-17 19:09:54 -0600
commit 830e5cfcea1dc4bc592f1e61d788b6eccc6052c6 (patch)
tree e3d0bc378de8a1f34f711446c426c71f91c9e563
parent 905b01bece3a17e952cb11eaa8c9d2d272b27569 (diff)
download pyparsing-git-830e5cfcea1dc4bc592f1e61d788b6eccc6052c6.tar.gz
Add support for combining unicode_sets using multiple inheritance
-rw-r--r-- CHANGES 8
-rw-r--r-- pyparsing.py 18
-rw-r--r-- unitTests.py 8
3 files changed, 24 insertions, 10 deletions
diff --git a/CHANGES b/CHANGES
index b9ae519..585166b 100644
--- a/CHANGES
+++ b/CHANGES
@@ -6,6 +6,14 @@ Version 2.3.1 -
---------------
- Added unicode sets to pyparsing_unicode for Latin-A and Latin-B ranges.
+- Added ability to define custom unicode sets as combinations of other sets
+ using multiple inheritance.
+
+ class Turkish_set(pp.pyparsing_unicode.Latin1, pp.pyparsing_unicode.LatinA):
+ pass
+
+ turkish_word = pp.Word(Turkish_set.alphas)
+
Version 2.3.0 - October, 2018
-----------------------------
diff --git a/pyparsing.py b/pyparsing.py
index 19b0141..d7be7b4 100644
--- a/pyparsing.py
+++ b/pyparsing.py
@@ -91,6 +91,7 @@ import pprint
import traceback
import types
from datetime import datetime
+from itertools import takewhile
try:
# Python 3
from itertools import filterfalse
@@ -5841,20 +5842,23 @@ class _lazyclassproperty(object):
return ret
-class unicode_set:
+class unicode_set(object):
_ranges = []
@_lazyclassproperty
def printables(cls):
- return ''.join(filterfalse(unicode.isspace, (unichr(c) for r in cls._ranges for c in range(r[0], r[-1] + 1))))
+ ranges = set(sum((cc._ranges for cc in takewhile(lambda x: x is not unicode_set, cls.__mro__)), []))
+ return ''.join(filterfalse(unicode.isspace, (unichr(c) for r in ranges for c in range(r[0], r[-1] + 1))))
@_lazyclassproperty
def alphas(cls):
- return ''.join(filter(unicode.isalpha, (unichr(c) for r in cls._ranges for c in range(r[0], r[-1] + 1))))
+ ranges = set(sum((cc._ranges for cc in takewhile(lambda x: x is not unicode_set, cls.__mro__)), []))
+ return ''.join(filter(unicode.isalpha, (unichr(c) for r in ranges for c in range(r[0], r[-1] + 1))))
@_lazyclassproperty
def nums(cls):
- return ''.join(filter(unicode.isdigit, (unichr(c) for r in cls._ranges for c in range(r[0], r[-1] + 1))))
+ ranges = set(sum((cc._ranges for cc in takewhile(lambda x: x is not unicode_set, cls.__mro__)), []))
+ return ''.join(filter(unicode.isdigit, (unichr(c) for r in ranges for c in range(r[0], r[-1] + 1))))
@_lazyclassproperty
def alphanums(cls):
@@ -5901,9 +5905,8 @@ class pyparsing_unicode(unicode_set):
class Korean(unicode_set):
_ranges = [(0xac00, 0xd7af), (0x1100, 0x11ff), (0x3130, 0x318f), (0xa960, 0xa97f), (0xd7b0, 0xd7ff), ]
- class CJK(unicode_set):
- _ranges = [ # sum of Chinese, Japanese, and Korean ranges
- ]
+ class CJK(Chinese, Japanese, Korean):
+ pass
class Thai(unicode_set):
_ranges = [(0x0e01, 0x0e3a), (0x0e3f, 0x0e5b), ]
@@ -5918,7 +5921,6 @@ class pyparsing_unicode(unicode_set):
_ranges = [(0x0900, 0x097f), (0xa8e0, 0xa8ff)]
pyparsing_unicode.Japanese._ranges = pyparsing_unicode.Japanese.Kanji._ranges + pyparsing_unicode.Japanese.Hiragana._ranges + pyparsing_unicode.Japanese.Katakana._ranges
-pyparsing_unicode.CJK._ranges = pyparsing_unicode.Chinese._ranges + pyparsing_unicode.Japanese._ranges + pyparsing_unicode.Korean._ranges
# define ranges in language character sets
if PY_3:
diff --git a/unitTests.py b/unitTests.py
index 5cb293d..2aff419 100644
--- a/unitTests.py
+++ b/unitTests.py
@@ -3650,8 +3650,9 @@ class UnicodeTests(ParseTestCase):
self.assertTrue(result.asList() == [u'Καλημέρα', ',', u'κόσμε', '!'],
"Failed to parse Greek 'Hello, World!' using pyparsing_unicode.Greek.alphas")
- class Turkish_set(pp.unicode_set):
- _ranges = pp.pyparsing_unicode.Latin1._ranges + pp.pyparsing_unicode.LatinA._ranges
+ # define a custom unicode range using multiple inheritance
+ class Turkish_set(pp.pyparsing_unicode.Latin1, pp.pyparsing_unicode.LatinA):
+ pass
key = pp.Word(Turkish_set.alphas)
value = pp.pyparsing_common.integer | pp.Word(Turkish_set.alphas, Turkish_set.alphanums)
@@ -3667,6 +3668,9 @@ class UnicodeTests(ParseTestCase):
print(result.asDict())
self.assertEqual(result.asDict(), {'şehir': 'İzmir', 'ülke': 'Türkiye', 'nüfus': 4279677},
"Failed to parse Turkish key-value pairs")
+ self.assertEqual(len(pp.pyparsing_unicode.CJK.printables), 53760,
+ "failed to construct ranges by merging Chinese, Japanese and Korean")
+ self.assertEqual(len(Turkish_set.printables), 317, "failed to construct ranges by merging Latin1 and LatinA")
class IndentedBlockTest(ParseTestCase):
# parse pseudo-yaml indented text