diff options
author | Stefan Behnel <stefan_ml@behnel.de> | 2013-01-06 11:10:43 +0100 |
---|---|---|
committer | Stefan Behnel <stefan_ml@behnel.de> | 2013-01-06 11:10:43 +0100 |
commit | 496d322487cc37aa9a80c6797b4b243b451c4473 (patch) | |
tree | 3524f05caab69ac64aed7e132caef459533833cc /Cython/Compiler/StringEncoding.py | |
parent | 02a5f1e06c525815342c1bf6b627fc9ea021d72a (diff) | |
download | cython-496d322487cc37aa9a80c6797b4b243b451c4473.tar.gz |
fix surrogates in Unicode literals in Python 3.3 (the UTF-8 codec rejects them explicitly)
Diffstat (limited to 'Cython/Compiler/StringEncoding.py')
-rw-r--r-- | Cython/Compiler/StringEncoding.py | 38 |
1 files changed, 26 insertions, 12 deletions
diff --git a/Cython/Compiler/StringEncoding.py b/Cython/Compiler/StringEncoding.py index 1ca490efc..8fc37fc16 100644 --- a/Cython/Compiler/StringEncoding.py +++ b/Cython/Compiler/StringEncoding.py @@ -12,6 +12,8 @@ else: _unicode, _str, _bytes = unicode, str, str IS_PYTHON3 = False +IS_PYTHON24 = sys.version_info[:2] < (2,5) + empty_bytes = _bytes() empty_unicode = _unicode() @@ -126,6 +128,13 @@ class EncodedString(_unicode): assert self.encoding is None return self.encode("UTF-8") + def escapeencode(self): + assert self.encoding is None + if IS_PYTHON24: + # work around bug in Py24 encoder + return self.replace(u'\\', u'\\\\').encode('unicode_escape') + return self.encode('unicode_escape') + def is_unicode(self): return self.encoding is None is_unicode = property(is_unicode) @@ -147,6 +156,9 @@ class BytesLiteral(_bytes): def utf8encode(self): assert False, "this is not a unicode string: %r" % self + def escapeencode(self): + assert False, "this is not a unicode string: %r" % self + def __str__(self): """Fake-decode the byte string to unicode to support % formatting of unicode strings. 
@@ -165,6 +177,8 @@ char_from_escape_sequence = { r'\v' : u'\v', }.get +_c_special = ('\\', '??', '"') + tuple(map(chr, range(32))) + def _to_escape_sequence(s): if s in '\n\r\t': return repr(s)[1:-1] @@ -176,19 +190,22 @@ def _to_escape_sequence(s): # within a character sequence, oct passes much better than hex return ''.join(['\\%03o' % ord(c) for c in s]) -_c_special = ('\\', '??', '"') + tuple(map(chr, range(32))) -_c_special_replacements = [(orig.encode('ASCII'), - _to_escape_sequence(orig).encode('ASCII')) - for orig in _c_special ] - -def _build_specials_test(): +def _build_specials_replacer(): subexps = [] + replacements = {} for special in _c_special: regexp = ''.join(['[%s]' % c.replace('\\', '\\\\') for c in special]) subexps.append(regexp) - return re.compile('|'.join(subexps).encode('ASCII')).search + replacements[special.encode('ASCII')] = _to_escape_sequence(special).encode('ASCII') + + sub = re.compile(('(%s)' % '|'.join(subexps)).encode('ASCII')).sub + def replace_specials(m): + return replacements[m.group(1)] + def replace(s): + return sub(replace_specials, s) + return replace -_has_specials = _build_specials_test() +_replace_specials = _build_specials_replacer() def escape_char(c): if IS_PYTHON3: @@ -210,10 +227,7 @@ def escape_byte_string(s): encoded as ISO-8859-1, will result in the correct byte sequence being written. """ - if _has_specials(s): - for special, replacement in _c_special_replacements: - if special in s: - s = s.replace(special, replacement) + s = _replace_specials(s) try: return s.decode("ASCII") # trial decoding: plain ASCII => done except UnicodeDecodeError: |