diff options
author | Jeffrey B. Arnold <jeffrey.arnold@gmail.com> | 2012-09-01 15:45:51 -0400 |
---|---|---|
committer | Jeffrey B. Arnold <jeffrey.arnold@gmail.com> | 2012-09-01 15:45:51 -0400 |
commit | 2645ed16565af188959633c17638acf91395a6a3 (patch) | |
tree | 9176fb77ac48cd95492e495ce59a2e8ae583e8b5 /pygments/util.py | |
parent | aa10b337c917219367b3b29f7dc4157e1f45b292 (diff) | |
parent | fe187b72911c8a1c653c2d81690a1ec47f13fa77 (diff) | |
download | pygments-2645ed16565af188959633c17638acf91395a6a3.tar.gz |
merged
Diffstat (limited to 'pygments/util.py')
-rw-r--r-- | pygments/util.py | 45 |
1 files changed, 45 insertions, 0 deletions
# Hunk from pygments/util.py (index f8c6c824..127f6e87, @@ -206,6 +206,51 @@):
# the code below is inserted right after the end of looks_like_xml() and right
# before the "Python 2/3 compatibility" section
# (``if sys.version_info < (3,0):``).

# Python narrow build compatibility

def _surrogatepair(c):
    """
    Returns the UTF-16 surrogate pair ``(high, low)``, as two integers, for
    the non-BMP code point `c`.
    """
    # 0xd7c0 == 0xd800 - (0x10000 >> 10): the "subtract 0x10000" step of the
    # UTF-16 encoding is folded into the high-surrogate base.  The low ten
    # bits are unaffected by that subtraction.
    return (0xd7c0 + (c >> 10), (0xdc00 + (c & 0x3ff)))


def unirange(a, b):
    """
    Returns a regular expression string to match the given non-BMP range.

    `a` and `b` are the integer code points of the first and the last
    character of the range (both inclusive).

    On wide builds the result is a plain character class.  On narrow builds
    it is a non-capturing group of alternatives, each matching one high
    surrogate (or a range of them) followed by a range of low surrogates.

    Raises ValueError if `b` is smaller than `a`, or if either end lies
    outside of U+10000..U+10FFFF.
    """
    if b < a:
        raise ValueError("Bad character range")
    if a < 0x10000 or b < 0x10000:
        raise ValueError("unirange is only defined for non-BMP ranges")
    if b > 0x10ffff:
        # unichr() already rejects this on wide builds; without the check the
        # narrow path would compute values that are not surrogates at all.
        raise ValueError("unirange is only defined up to U+10FFFF")

    if sys.maxunicode > 0xffff:
        # wide build
        return u'[%s-%s]' % (unichr(a), unichr(b))

    # narrow build stores surrogates, and the 're' module handles them
    # (incorrectly) as characters.  Since there is still ordering among
    # these characters, expand the range to one that it understands.  Some
    # background in http://bugs.python.org/issue3665 and
    # http://bugs.python.org/issue12749
    #
    # Additionally, the lower constants are using unichr rather than
    # literals because jython [which uses the wide path] can't load this
    # file if they are literals.
    ah, al = _surrogatepair(a)
    bh, bl = _surrogatepair(b)
    if ah == bh:
        # Both ends share the high surrogate: a single low-surrogate range.
        return u'(?:%s[%s-%s])' % (unichr(ah), unichr(al), unichr(bl))

    # First high surrogate: from `al` up to the last low surrogate.
    buf = [u'%s[%s-%s]' % (unichr(ah), unichr(al), unichr(0xdfff))]
    if bh - ah > 1:
        # High surrogates strictly between the two ends are fully covered,
        # so any low surrogate may follow them.
        buf.append(u'[%s-%s][%s-%s]' %
                   (unichr(ah + 1), unichr(bh - 1),
                    unichr(0xdc00), unichr(0xdfff)))
    # Last high surrogate: from the first low surrogate up to `bl`.
    buf.append(u'%s[%s-%s]' % (unichr(bh), unichr(0xdc00), unichr(bl)))

    return u'(?:' + u'|'.join(buf) + u')'