summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- Cython/Compiler/ExprNodes.py 4
-rw-r--r-- Cython/Compiler/Tests/TestStringEncoding.py 10
2 files changed, 12 insertions, 2 deletions
diff --git a/Cython/Compiler/ExprNodes.py b/Cython/Compiler/ExprNodes.py
index 0010b01ee..9c34e3af0 100644
--- a/Cython/Compiler/ExprNodes.py
+++ b/Cython/Compiler/ExprNodes.py
@@ -1678,6 +1678,10 @@ class UnicodeNode(ConstNode):
def generate_evaluation_code(self, code):
if self.type.is_pyobject:
# FIXME: this should go away entirely!
+ # Since string_contains_lone_surrogates() returns False for surrogate pairs in Py2/UCS2,
+ # Py2 can generate different code from Py3 here. Let's hope we get away with claiming that
+ # the processing of surrogate pairs in code was always ambiguous and led to different results
+ # on 16/32bit Unicode platforms.
if StringEncoding.string_contains_lone_surrogates(self.value):
# lone (unpaired) surrogates are not really portable and cannot be
# decoded by the UTF-8 codec in Py3.3
diff --git a/Cython/Compiler/Tests/TestStringEncoding.py b/Cython/Compiler/Tests/TestStringEncoding.py
index 76de06905..91d099333 100644
--- a/Cython/Compiler/Tests/TestStringEncoding.py
+++ b/Cython/Compiler/Tests/TestStringEncoding.py
@@ -16,10 +16,16 @@ class StringEncodingTest(unittest.TestCase):
self.assertFalse(StringEncoding.string_contains_lone_surrogates(u"\N{SNOWMAN}"))
# This behaves differently in Py2 when freshly parsed and read from a .pyc file,
- # but it seems to be a parser bug in Py2, which doesn't hurt us in Cython.
+ # but it seems to be a marshalling bug in Py2, which doesn't hurt us in Cython.
if sys.version_info[0] != 2:
self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uD800\uDFFF"))
- self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uDFFF\uD800"[::-1]))
+
+ # In Py2 with 16bit Unicode, the following is indistinguishable from the 32bit character.
+ obfuscated_surrogate_pair = (u"\uDFFF" + u"\uD800")[::-1]
+ if sys.version_info[0] == 2 and sys.maxunicode == 65535:
+ self.assertFalse(StringEncoding.string_contains_lone_surrogates(obfuscated_surrogate_pair))
+ else:
+ self.assertTrue(StringEncoding.string_contains_lone_surrogates(obfuscated_surrogate_pair))
self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uD800"))
self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uDFFF"))