summaryrefslogtreecommitdiff
path: root/Cython/Compiler/StringEncoding.py
diff options
context:
space:
mode:
authorNikita Nemkin <nikita@nemkin.ru>2013-03-07 00:05:32 +0600
committerNikita Nemkin <nikita@nemkin.ru>2013-03-07 00:05:32 +0600
commit2e3cf3da44a51be66956fdcd503b8e0b9caac491 (patch)
treef4109171169df133858afdff329a823081669c7a /Cython/Compiler/StringEncoding.py
parentc0e70e4afe83940cb9fea42fec8b4cc2d7ea4e27 (diff)
downloadcython-2e3cf3da44a51be66956fdcd503b8e0b9caac491.tar.gz
Compatibility fix: no UTF-32 codec in Python 2.4/2.5.
Diffstat (limited to 'Cython/Compiler/StringEncoding.py')
-rw-r--r--Cython/Compiler/StringEncoding.py35
1 files changed, 21 insertions, 14 deletions
diff --git a/Cython/Compiler/StringEncoding.py b/Cython/Compiler/StringEncoding.py
index 35ad1048a..95d3fa43f 100644
--- a/Cython/Compiler/StringEncoding.py
+++ b/Cython/Compiler/StringEncoding.py
@@ -4,7 +4,6 @@
import re
import sys
-import array
if sys.version_info[0] >= 3:
_unicode, _str, _bytes = str, str, bytes
@@ -267,18 +266,26 @@ def split_string_literal(s, limit=2000):
def encode_pyunicode_string(s):
"""Create Py_UNICODE[] representation of a given unicode string.
"""
- utf32_array = array.array('i', s.encode('UTF-32'))
- assert utf32_array.itemsize == 4
- utf32_array.pop(0) # Remove BOM
- utf32_array.append(0) # Add NULL terminator
-
- for c in utf32_array:
- if c > 65535:
- utf16_array = array.array('H', s.encode('UTF-16'))
- utf16_array.pop(0) # Remove BOM
- utf16_array.append(0) # Add NULL terminator
- break
+ s = map(ord, s) + [0]
+
+ if sys.maxunicode >= 0x10000: # Wide build or Py3.3
+ utf16, utf32 = [], s
+ for code_point in s:
+ if code_point >= 0x10000: # outside of BMP
+ high, low = divmod(code_point - 0x10000, 1024)
+ utf16.append(high + 0xD800)
+ utf16.append(low + 0xDC00)
+ else:
+ utf16.append(code_point)
else:
- utf16_array = []
+ utf16, utf32 = s, []
+ for code_unit in s:
+ if 0xDC00 <= code_unit <= 0xDFFF: # low surrogate
+ high, low = utf32.pop(), code_unit
+ utf32.append(((high & 0x3FF) << 10) + (low & 0x3FF) + 0x10000)
+ else:
+ utf32.append(code_unit)
- return ",".join(map(unicode, utf16_array)), ",".join(map(unicode, utf32_array))
+ if utf16 == utf32:
+ utf16 = []
+ return ",".join(map(unicode, utf16)), ",".join(map(unicode, utf32))