diff options
author | Nikita Nemkin <nikita@nemkin.ru> | 2013-03-07 00:05:32 +0600 |
---|---|---|
committer | Nikita Nemkin <nikita@nemkin.ru> | 2013-03-07 00:05:32 +0600 |
commit | 2e3cf3da44a51be66956fdcd503b8e0b9caac491 (patch) | |
tree | f4109171169df133858afdff329a823081669c7a /Cython/Compiler/StringEncoding.py | |
parent | c0e70e4afe83940cb9fea42fec8b4cc2d7ea4e27 (diff) | |
download | cython-2e3cf3da44a51be66956fdcd503b8e0b9caac491.tar.gz |
Compatibility fix: no UTF-32 codec in Python 2.4/2.5.
Diffstat (limited to 'Cython/Compiler/StringEncoding.py')
-rw-r--r-- | Cython/Compiler/StringEncoding.py | 35 |
1 files changed, 21 insertions, 14 deletions
```diff
diff --git a/Cython/Compiler/StringEncoding.py b/Cython/Compiler/StringEncoding.py
index 35ad1048a..95d3fa43f 100644
--- a/Cython/Compiler/StringEncoding.py
+++ b/Cython/Compiler/StringEncoding.py
@@ -4,7 +4,6 @@
 
 import re
 import sys
-import array
 
 if sys.version_info[0] >= 3:
     _unicode, _str, _bytes = str, str, bytes
@@ -267,18 +266,26 @@ def split_string_literal(s, limit=2000):
 def encode_pyunicode_string(s):
     """Create Py_UNICODE[] representation of a given unicode string.
     """
-    utf32_array = array.array('i', s.encode('UTF-32'))
-    assert utf32_array.itemsize == 4
-    utf32_array.pop(0)  # Remove BOM
-    utf32_array.append(0)  # Add NULL terminator
-
-    for c in utf32_array:
-        if c > 65535:
-            utf16_array = array.array('H', s.encode('UTF-16'))
-            utf16_array.pop(0)  # Remove BOM
-            utf16_array.append(0)  # Add NULL terminator
-            break
+    s = map(ord, s) + [0]
+
+    if sys.maxunicode >= 0x10000:  # Wide build or Py3.3
+        utf16, utf32 = [], s
+        for code_point in s:
+            if code_point >= 0x10000:  # outside of BMP
+                high, low = divmod(code_point - 0x10000, 1024)
+                utf16.append(high + 0xD800)
+                utf16.append(low + 0xDC00)
+            else:
+                utf16.append(code_point)
     else:
-        utf16_array = []
+        utf16, utf32 = s, []
+        for code_unit in s:
+            if 0xDC00 <= code_unit <= 0xDFFF:  # low surrogate
+                high, low = utf32.pop(), code_unit
+                utf32.append(((high & 0x3FF) << 10) + (low & 0x3FF) + 0x10000)
+            else:
+                utf32.append(code_unit)
 
-    return ",".join(map(unicode, utf16_array)), ",".join(map(unicode, utf32_array))
+    if utf16 == utf32:
+        utf16 = []
+    return ",".join(map(unicode, utf16)), ",".join(map(unicode, utf32))
```