Extend tests and accept ambiguity of surrogate pairs in Unicode string literals when generated on 16bit Py2 systems. (gh3678_undep_unicode)

author: Stefan Behnel <stefan_ml@behnel.de> 2020-06-23 13:38:41 +0200
committer: Stefan Behnel <stefan_ml@behnel.de> 2020-06-23 13:38:41 +0200
commit: 7d8f52c198c48faa88e05b8cc68121d72acd0ce0 (patch)
tree: 086215fb77e2237394dd25cab4dabef7a3afa4a3
parent: cfc3a80aab4ea5c401ee53ff1062dbec9943ea5a (diff)
download: cython-gh3678_undep_unicode.tar.gz
2 files changed, 12 insertions, 2 deletions
diff --git a/Cython/Compiler/ExprNodes.py b/Cython/Compiler/ExprNodes.py
index 0010b01ee..9c34e3af0 100644
--- a/Cython/Compiler/ExprNodes.py
+++ b/Cython/Compiler/ExprNodes.py
@@ -1678,6 +1678,10 @@ class UnicodeNode(ConstNode):
     def generate_evaluation_code(self, code):
         if self.type.is_pyobject:
             # FIXME: this should go away entirely!
+            # Since string_contains_lone_surrogates() returns False for surrogate pairs in Py2/UCS2,
+            # Py2 can generate different code from Py3 here.  Let's hope we get away with claiming that
+            # the processing of surrogate pairs in code was always ambiguous and led to different results
+            # on 16/32bit Unicode platforms.
             if StringEncoding.string_contains_lone_surrogates(self.value):
                 # lone (unpaired) surrogates are not really portable and cannot be
                 # decoded by the UTF-8 codec in Py3.3
diff --git a/Cython/Compiler/Tests/TestStringEncoding.py b/Cython/Compiler/Tests/TestStringEncoding.py
index 76de06905..91d099333 100644
--- a/Cython/Compiler/Tests/TestStringEncoding.py
+++ b/Cython/Compiler/Tests/TestStringEncoding.py
@@ -16,10 +16,16 @@ class StringEncodingTest(unittest.TestCase):
         self.assertFalse(StringEncoding.string_contains_lone_surrogates(u"\N{SNOWMAN}"))
 
         # This behaves differently in Py2 when freshly parsed and read from a .pyc file,
-        # but it seems to be a parser bug in Py2, which doesn't hurt us in Cython.
+        # but it seems to be a marshalling bug in Py2, which doesn't hurt us in Cython.
         if sys.version_info[0] != 2:
             self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uD800\uDFFF"))
-        self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uDFFF\uD800"[::-1]))
+
+        # In Py2 with 16bit Unicode, the following is indistinguishable from the 32bit character.
+        obfuscated_surrogate_pair = (u"\uDFFF" + "\uD800")[::-1]
+        if sys.version_info[0] == 2 and sys.maxunicode == 65565:
+            self.assertFalse(StringEncoding.string_contains_lone_surrogates(obfuscated_surrogate_pair))
+        else:
+            self.assertTrue(StringEncoding.string_contains_lone_surrogates(obfuscated_surrogate_pair))
 
         self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uD800"))
         self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uDFFF"))
author	Stefan Behnel <stefan_ml@behnel.de>	2020-06-23 13:38:41 +0200
committer	Stefan Behnel <stefan_ml@behnel.de>	2020-06-23 13:38:41 +0200
commit	7d8f52c198c48faa88e05b8cc68121d72acd0ce0 (patch)
tree	086215fb77e2237394dd25cab4dabef7a3afa4a3
parent	cfc3a80aab4ea5c401ee53ff1062dbec9943ea5a (diff)
download	cython-gh3678_undep_unicode.tar.gz