summaryrefslogtreecommitdiff
path: root/Cython/Compiler/StringEncoding.py
diff options
context:
space:
mode:
authorStefan Behnel <stefan_ml@behnel.de>2013-01-06 11:10:43 +0100
committerStefan Behnel <stefan_ml@behnel.de>2013-01-06 11:10:43 +0100
commit496d322487cc37aa9a80c6797b4b243b451c4473 (patch)
tree3524f05caab69ac64aed7e132caef459533833cc /Cython/Compiler/StringEncoding.py
parent02a5f1e06c525815342c1bf6b627fc9ea021d72a (diff)
downloadcython-496d322487cc37aa9a80c6797b4b243b451c4473.tar.gz
fix surrogates in Unicode literals in Python 3.3 (the UTF-8 codec rejects them explicitly)
Diffstat (limited to 'Cython/Compiler/StringEncoding.py')
-rw-r--r--Cython/Compiler/StringEncoding.py38
1 files changed, 26 insertions, 12 deletions
diff --git a/Cython/Compiler/StringEncoding.py b/Cython/Compiler/StringEncoding.py
index 1ca490efc..8fc37fc16 100644
--- a/Cython/Compiler/StringEncoding.py
+++ b/Cython/Compiler/StringEncoding.py
@@ -12,6 +12,8 @@ else:
_unicode, _str, _bytes = unicode, str, str
IS_PYTHON3 = False
+IS_PYTHON24 = sys.version_info[:2] < (2,5)
+
empty_bytes = _bytes()
empty_unicode = _unicode()
@@ -126,6 +128,13 @@ class EncodedString(_unicode):
assert self.encoding is None
return self.encode("UTF-8")
+ def escapeencode(self):
+ assert self.encoding is None
+ if IS_PYTHON24:
+ # work around bug in Py24 encoder
+ return self.replace(u'\\', u'\\\\').encode('unicode_escape')
+ return self.encode('unicode_escape')
+
def is_unicode(self):
return self.encoding is None
is_unicode = property(is_unicode)
@@ -147,6 +156,9 @@ class BytesLiteral(_bytes):
def utf8encode(self):
assert False, "this is not a unicode string: %r" % self
+ def escapeencode(self):
+ assert False, "this is not a unicode string: %r" % self
+
def __str__(self):
"""Fake-decode the byte string to unicode to support %
formatting of unicode strings.
@@ -165,6 +177,8 @@ char_from_escape_sequence = {
r'\v' : u'\v',
}.get
+_c_special = ('\\', '??', '"') + tuple(map(chr, range(32)))
+
def _to_escape_sequence(s):
if s in '\n\r\t':
return repr(s)[1:-1]
@@ -176,19 +190,22 @@ def _to_escape_sequence(s):
# within a character sequence, oct passes much better than hex
return ''.join(['\\%03o' % ord(c) for c in s])
-_c_special = ('\\', '??', '"') + tuple(map(chr, range(32)))
-_c_special_replacements = [(orig.encode('ASCII'),
- _to_escape_sequence(orig).encode('ASCII'))
- for orig in _c_special ]
-
-def _build_specials_test():
+def _build_specials_replacer():
subexps = []
+ replacements = {}
for special in _c_special:
regexp = ''.join(['[%s]' % c.replace('\\', '\\\\') for c in special])
subexps.append(regexp)
- return re.compile('|'.join(subexps).encode('ASCII')).search
+ replacements[special.encode('ASCII')] = _to_escape_sequence(special).encode('ASCII')
+
+ sub = re.compile(('(%s)' % '|'.join(subexps)).encode('ASCII')).sub
+ def replace_specials(m):
+ return replacements[m.group(1)]
+ def replace(s):
+ return sub(replace_specials, s)
+ return replace
-_has_specials = _build_specials_test()
+_replace_specials = _build_specials_replacer()
def escape_char(c):
if IS_PYTHON3:
@@ -210,10 +227,7 @@ def escape_byte_string(s):
encoded as ISO-8859-1, will result in the correct byte sequence
being written.
"""
- if _has_specials(s):
- for special, replacement in _c_special_replacements:
- if special in s:
- s = s.replace(special, replacement)
+ s = _replace_specials(s)
try:
return s.decode("ASCII") # trial decoding: plain ASCII => done
except UnicodeDecodeError: