summaryrefslogtreecommitdiff
path: root/pygments/util.py
diff options
context:
space:
mode:
author: Jeffrey B. Arnold <jeffrey.arnold@gmail.com> 2012-09-01 15:45:51 -0400
committer: Jeffrey B. Arnold <jeffrey.arnold@gmail.com> 2012-09-01 15:45:51 -0400
commit: 2645ed16565af188959633c17638acf91395a6a3 (patch)
tree: 9176fb77ac48cd95492e495ce59a2e8ae583e8b5 /pygments/util.py
parent: aa10b337c917219367b3b29f7dc4157e1f45b292 (diff)
parent: fe187b72911c8a1c653c2d81690a1ec47f13fa77 (diff)
download: pygments-2645ed16565af188959633c17638acf91395a6a3.tar.gz
merged
Diffstat (limited to 'pygments/util.py')
-rw-r--r--pygments/util.py45
1 files changed, 45 insertions, 0 deletions
diff --git a/pygments/util.py b/pygments/util.py
index f8c6c824..127f6e87 100644
--- a/pygments/util.py
+++ b/pygments/util.py
@@ -206,6 +206,51 @@ def looks_like_xml(text):
_looks_like_xml_cache[key] = rv
return rv
+# Python narrow build compatibility
+
try:
    _unichr = unichr  # Python 2
except NameError:
    _unichr = chr  # Python 3: chr() already covers the full Unicode range


def _surrogatepair(c):
    """
    Return the UTF-16 surrogate pair ``(high, low)`` for the code point *c*.

    *c* is expected to be a non-BMP code point (0x10000 and above).  The
    constant 0xd7c0 is ``0xd800 - (0x10000 >> 10)``, i.e. the 0x10000 offset
    of the UTF-16 encoding is folded into the base of the high surrogate.
    """
    return (0xd7c0 + (c >> 10), 0xdc00 + (c & 0x3ff))


def unirange(a, b):
    """
    Returns a regular expression string to match the given non-BMP range.

    *a* and *b* are the integer code points of the first and last character
    of the range (both inclusive, both >= 0x10000).

    On a wide build the result is a plain character class.  On a narrow
    build the result is a non-capturing group of alternatives expressed in
    terms of UTF-16 surrogate pairs.

    Raises ``ValueError`` if ``b < a`` or if either bound lies in the BMP.
    """
    if b < a:
        raise ValueError("Bad character range")
    if a < 0x10000 or b < 0x10000:
        raise ValueError("unirange is only defined for non-BMP ranges")

    if sys.maxunicode > 0xffff:
        # wide build
        return u'[%s-%s]' % (_unichr(a), _unichr(b))

    # narrow build stores surrogates, and the 're' module handles them
    # (incorrectly) as characters.  Since there is still ordering among
    # these characters, expand the range to one that it understands.  Some
    # background in http://bugs.python.org/issue3665 and
    # http://bugs.python.org/issue12749
    #
    # Additionally, the lower constants are using unichr rather than
    # literals because jython [which uses the wide path] can't load this
    # file if they are literals.
    ah, al = _surrogatepair(a)
    bh, bl = _surrogatepair(b)
    if ah == bh:
        # Both ends share the high surrogate: one class over the low ones.
        return u'(?:%s[%s-%s])' % (_unichr(ah), _unichr(al), _unichr(bl))

    low_first = _unichr(0xdc00)
    low_last = _unichr(0xdfff)

    # First (possibly partial) block: from ``a`` to the end of its
    # high-surrogate block.
    buf = [u'%s[%s-%s]' % (_unichr(ah), _unichr(al), low_last)]
    # Complete blocks strictly between the two high surrogates.  ``a <= b``
    # implies ``ah <= bh``, so the gap is ``bh - ah``.
    if bh - ah > 1:
        buf.append(u'[%s-%s][%s-%s]' %
                   (_unichr(ah + 1), _unichr(bh - 1), low_first, low_last))
    # Last (possibly partial) block: from the start of ``b``'s
    # high-surrogate block up to ``b``.
    buf.append(u'%s[%s-%s]' % (_unichr(bh), low_first, _unichr(bl)))

    return u'(?:' + u'|'.join(buf) + u')'
+
# Python 2/3 compatibility
if sys.version_info < (3,0):