diff options
author | Stefan Behnel <stefan_ml@behnel.de> | 2020-06-23 13:38:41 +0200 |
---|---|---|
committer | Stefan Behnel <stefan_ml@behnel.de> | 2020-06-23 13:38:41 +0200 |
commit | 7d8f52c198c48faa88e05b8cc68121d72acd0ce0 (patch) | |
tree | 086215fb77e2237394dd25cab4dabef7a3afa4a3 | |
parent | cfc3a80aab4ea5c401ee53ff1062dbec9943ea5a (diff) | |
download | cython-gh3678_undep_unicode.tar.gz |
Extend tests and accept ambiguity of surrogate pairs in Unicode string literals when generated on 16bit Py2 systems. [gh3678_undep_unicode]
-rw-r--r-- | Cython/Compiler/ExprNodes.py | 4 | ||||
-rw-r--r-- | Cython/Compiler/Tests/TestStringEncoding.py | 10 |
2 files changed, 12 insertions, 2 deletions
diff --git a/Cython/Compiler/ExprNodes.py b/Cython/Compiler/ExprNodes.py index 0010b01ee..9c34e3af0 100644 --- a/Cython/Compiler/ExprNodes.py +++ b/Cython/Compiler/ExprNodes.py @@ -1678,6 +1678,10 @@ class UnicodeNode(ConstNode): def generate_evaluation_code(self, code): if self.type.is_pyobject: # FIXME: this should go away entirely! + # Since string_contains_lone_surrogates() returns False for surrogate pairs in Py2/UCS2, + # Py2 can generate different code from Py3 here. Let's hope we get away with claiming that + # the processing of surrogate pairs in code was always ambiguous and lead to different results + # on P16/32bit Unicode platforms. if StringEncoding.string_contains_lone_surrogates(self.value): # lone (unpaired) surrogates are not really portable and cannot be # decoded by the UTF-8 codec in Py3.3 diff --git a/Cython/Compiler/Tests/TestStringEncoding.py b/Cython/Compiler/Tests/TestStringEncoding.py index 76de06905..91d099333 100644 --- a/Cython/Compiler/Tests/TestStringEncoding.py +++ b/Cython/Compiler/Tests/TestStringEncoding.py @@ -16,10 +16,16 @@ class StringEncodingTest(unittest.TestCase): self.assertFalse(StringEncoding.string_contains_lone_surrogates(u"\N{SNOWMAN}")) # This behaves differently in Py2 when freshly parsed and read from a .pyc file, - # but it seems to be a parser bug in Py2, which doesn't hurt us in Cython. + # but it seems to be a marshalling bug in Py2, which doesn't hurt us in Cython. if sys.version_info[0] != 2: self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uD800\uDFFF")) - self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uDFFF\uD800"[::-1])) + + # In Py2 with 16bit Unicode, the following is indistinguishable from the 32bit character. 
+ obfuscated_surrogate_pair = (u"\uDFFF" + "\uD800")[::-1] + if sys.version_info[0] == 2 and sys.maxunicode == 65565: + self.assertFalse(StringEncoding.string_contains_lone_surrogates(obfuscated_surrogate_pair)) + else: + self.assertTrue(StringEncoding.string_contains_lone_surrogates(obfuscated_surrogate_pair)) self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uD800")) self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uDFFF")) |