summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStefan Behnel <stefan_ml@behnel.de>2020-06-23 13:38:41 +0200
committerStefan Behnel <stefan_ml@behnel.de>2020-06-23 13:38:41 +0200
commit7d8f52c198c48faa88e05b8cc68121d72acd0ce0 (patch)
tree086215fb77e2237394dd25cab4dabef7a3afa4a3
parentcfc3a80aab4ea5c401ee53ff1062dbec9943ea5a (diff)
downloadcython-gh3678_undep_unicode.tar.gz
Extend tests and accept ambiguity of surrogate pairs in Unicode string literals when generated on 16bit Py2 systems.gh3678_undep_unicode
-rw-r--r--Cython/Compiler/ExprNodes.py4
-rw-r--r--Cython/Compiler/Tests/TestStringEncoding.py10
2 files changed, 12 insertions, 2 deletions
diff --git a/Cython/Compiler/ExprNodes.py b/Cython/Compiler/ExprNodes.py
index 0010b01ee..9c34e3af0 100644
--- a/Cython/Compiler/ExprNodes.py
+++ b/Cython/Compiler/ExprNodes.py
@@ -1678,6 +1678,10 @@ class UnicodeNode(ConstNode):
def generate_evaluation_code(self, code):
if self.type.is_pyobject:
# FIXME: this should go away entirely!
+ # Since string_contains_lone_surrogates() returns False for surrogate pairs in Py2/UCS2,
+ # Py2 can generate different code from Py3 here. Let's hope we get away with claiming that
+ # the processing of surrogate pairs in code was always ambiguous and lead to different results
+ # on P16/32bit Unicode platforms.
if StringEncoding.string_contains_lone_surrogates(self.value):
# lone (unpaired) surrogates are not really portable and cannot be
# decoded by the UTF-8 codec in Py3.3
diff --git a/Cython/Compiler/Tests/TestStringEncoding.py b/Cython/Compiler/Tests/TestStringEncoding.py
index 76de06905..91d099333 100644
--- a/Cython/Compiler/Tests/TestStringEncoding.py
+++ b/Cython/Compiler/Tests/TestStringEncoding.py
@@ -16,10 +16,16 @@ class StringEncodingTest(unittest.TestCase):
self.assertFalse(StringEncoding.string_contains_lone_surrogates(u"\N{SNOWMAN}"))
# This behaves differently in Py2 when freshly parsed and read from a .pyc file,
- # but it seems to be a parser bug in Py2, which doesn't hurt us in Cython.
+ # but it seems to be a marshalling bug in Py2, which doesn't hurt us in Cython.
if sys.version_info[0] != 2:
self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uD800\uDFFF"))
- self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uDFFF\uD800"[::-1]))
+
+ # In Py2 with 16bit Unicode, the following is indistinguishable from the 32bit character.
+ obfuscated_surrogate_pair = (u"\uDFFF" + "\uD800")[::-1]
+ if sys.version_info[0] == 2 and sys.maxunicode == 65565:
+ self.assertFalse(StringEncoding.string_contains_lone_surrogates(obfuscated_surrogate_pair))
+ else:
+ self.assertTrue(StringEncoding.string_contains_lone_surrogates(obfuscated_surrogate_pair))
self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uD800"))
self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uDFFF"))