diff options
author | scoder <stefan_ml@behnel.de> | 2020-06-30 13:52:18 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-06-30 13:52:18 +0200 |
commit | 9c26b302791515f608d4b200f692dcec9f066229 (patch) | |
tree | c48a43f095d16406a9cab2dcc8fc8c03e4a91624 | |
parent | 7e9a6b75c08ae0163c20a4ec41bd23fa9888b1b8 (diff) | |
download | cython-9c26b302791515f608d4b200f692dcec9f066229.tar.gz |
Really only use PyUnicode_FromUnicode() when needed (GH-3697)
* Really only use PyUnicode_FromUnicode() for strings that contain lone surrogates, not for normal non-BMP strings and not for surrogate pairs on 16bit Unicode platforms.
See https://github.com/cython/cython/issues/3678
* Extend buildenv test to debug a MacOS problem.
* Add a test for surrogate pairs in Unicode strings.
* Limit PyUnicode_FromUnicode() usage to strings containing lone surrogates.
* Accept ambiguity of surrogate pairs in Unicode string literals when generated on 16bit Py2 systems.
-rw-r--r-- | Cython/Compiler/ExprNodes.py | 9 | ||||
-rw-r--r-- | Cython/Compiler/StringEncoding.py | 28 | ||||
-rw-r--r-- | Cython/Compiler/Tests/TestStringEncoding.py | 44 | ||||
-rw-r--r-- | Cython/Utility/ModuleSetupCode.c | 4 | ||||
-rw-r--r-- | Cython/Utility/StringTools.c | 9 | ||||
-rw-r--r-- | tests/compile/buildenv.pyx | 5 | ||||
-rw-r--r-- | tests/run/unicodeliterals.pyx | 17 |
7 files changed, 112 insertions, 4 deletions
diff --git a/Cython/Compiler/ExprNodes.py b/Cython/Compiler/ExprNodes.py index f8579d48c..ae61d742d 100644 --- a/Cython/Compiler/ExprNodes.py +++ b/Cython/Compiler/ExprNodes.py @@ -1677,8 +1677,13 @@ class UnicodeNode(ConstNode): def generate_evaluation_code(self, code): if self.type.is_pyobject: - if self.contains_surrogates(): - # surrogates are not really portable and cannot be + # FIXME: this should go away entirely! + # Since string_contains_lone_surrogates() returns False for surrogate pairs in Py2/UCS2, + # Py2 can generate different code from Py3 here. Let's hope we get away with claiming that + # the processing of surrogate pairs in code was always ambiguous and lead to different results + # on P16/32bit Unicode platforms. + if StringEncoding.string_contains_lone_surrogates(self.value): + # lone (unpaired) surrogates are not really portable and cannot be # decoded by the UTF-8 codec in Py3.3 self.result_code = code.get_py_const(py_object_type, 'ustring') data_cname = code.get_pyunicode_ptr_const(self.value) diff --git a/Cython/Compiler/StringEncoding.py b/Cython/Compiler/StringEncoding.py index 5703c51ae..192fc3de3 100644 --- a/Cython/Compiler/StringEncoding.py +++ b/Cython/Compiler/StringEncoding.py @@ -172,6 +172,34 @@ def string_contains_surrogates(ustring): return False +def string_contains_lone_surrogates(ustring): + """ + Check if the unicode string contains lone surrogate code points + on a CPython platform with wide (UCS-4) or narrow (UTF-16) + Unicode, i.e. characters that would be spelled as two + separate code units on a narrow platform, but that do not form a pair. 
+ """ + last_was_start = False + unicode_uses_surrogate_encoding = sys.maxunicode == 65535 + for c in map(ord, ustring): + # surrogates tend to be rare + if c < 0xD800 or c > 0xDFFF: + if last_was_start: + return True + elif not unicode_uses_surrogate_encoding: + # on 32bit Unicode platforms, there is never a pair + return True + elif c <= 0xDBFF: + if last_was_start: + return True # lone start + last_was_start = True + else: + if not last_was_start: + return True # lone end + last_was_start = False + return last_was_start + + class BytesLiteral(_bytes): # bytes subclass that is compatible with EncodedString encoding = None diff --git a/Cython/Compiler/Tests/TestStringEncoding.py b/Cython/Compiler/Tests/TestStringEncoding.py new file mode 100644 index 000000000..91d099333 --- /dev/null +++ b/Cython/Compiler/Tests/TestStringEncoding.py @@ -0,0 +1,44 @@ +# -*- coding: utf-8 -*- + +import sys +import unittest + +import Cython.Compiler.StringEncoding as StringEncoding + + +class StringEncodingTest(unittest.TestCase): + """ + Test the StringEncoding module. + """ + def test_string_contains_lone_surrogates(self): + self.assertFalse(StringEncoding.string_contains_lone_surrogates(u"abc")) + self.assertFalse(StringEncoding.string_contains_lone_surrogates(u"\uABCD")) + self.assertFalse(StringEncoding.string_contains_lone_surrogates(u"\N{SNOWMAN}")) + + # This behaves differently in Py2 when freshly parsed and read from a .pyc file, + # but it seems to be a marshalling bug in Py2, which doesn't hurt us in Cython. + if sys.version_info[0] != 2: + self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uD800\uDFFF")) + + # In Py2 with 16bit Unicode, the following is indistinguishable from the 32bit character. 
+ obfuscated_surrogate_pair = (u"\uDFFF" + "\uD800")[::-1] + if sys.version_info[0] == 2 and sys.maxunicode == 65565: + self.assertFalse(StringEncoding.string_contains_lone_surrogates(obfuscated_surrogate_pair)) + else: + self.assertTrue(StringEncoding.string_contains_lone_surrogates(obfuscated_surrogate_pair)) + + self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uD800")) + self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uDFFF")) + self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uDFFF\uD800")) + self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uD800x\uDFFF")) + + def test_string_contains_surrogates(self): + self.assertFalse(StringEncoding.string_contains_surrogates(u"abc")) + self.assertFalse(StringEncoding.string_contains_surrogates(u"\uABCD")) + self.assertFalse(StringEncoding.string_contains_surrogates(u"\N{SNOWMAN}")) + + self.assertTrue(StringEncoding.string_contains_surrogates(u"\uD800")) + self.assertTrue(StringEncoding.string_contains_surrogates(u"\uDFFF")) + self.assertTrue(StringEncoding.string_contains_surrogates(u"\uD800\uDFFF")) + self.assertTrue(StringEncoding.string_contains_surrogates(u"\uDFFF\uD800")) + self.assertTrue(StringEncoding.string_contains_surrogates(u"\uD800x\uDFFF")) diff --git a/Cython/Utility/ModuleSetupCode.c b/Cython/Utility/ModuleSetupCode.c index 3a04d1f6e..e122603d8 100644 --- a/Cython/Utility/ModuleSetupCode.c +++ b/Cython/Utility/ModuleSetupCode.c @@ -741,7 +741,11 @@ static CYTHON_INLINE PyObject * __Pyx_PyDict_GetItemStrWithError(PyObject *dict, #define __Pyx_PyUnicode_DATA(u) PyUnicode_DATA(u) #define __Pyx_PyUnicode_READ(k, d, i) PyUnicode_READ(k, d, i) #define __Pyx_PyUnicode_WRITE(k, d, i, ch) PyUnicode_WRITE(k, d, i, ch) + #if defined(PyUnicode_IS_READY) && defined(PyUnicode_GET_SIZE) #define __Pyx_PyUnicode_IS_TRUE(u) (0 != (likely(PyUnicode_IS_READY(u)) ? 
PyUnicode_GET_LENGTH(u) : PyUnicode_GET_SIZE(u))) + #else + #define __Pyx_PyUnicode_IS_TRUE(u) (0 != PyUnicode_GET_LENGTH(u)) + #endif #else #define CYTHON_PEP393_ENABLED 0 #define PyUnicode_1BYTE_KIND 1 diff --git a/Cython/Utility/StringTools.c b/Cython/Utility/StringTools.c index 9de65dce3..222edd8fc 100644 --- a/Cython/Utility/StringTools.c +++ b/Cython/Utility/StringTools.c @@ -118,6 +118,7 @@ static CYTHON_INLINE int __Pyx_UnicodeContainsUCS4(PyObject* unicode, Py_UCS4 ch //////////////////// PyUCS4InUnicode //////////////////// +#if Py_UNICODE_SIZE == 2 static int __Pyx_PyUnicodeBufferContainsUCS4_SP(Py_UNICODE* buffer, Py_ssize_t length, Py_UCS4 character) { /* handle surrogate pairs for Py_UNICODE buffers in 16bit Unicode builds */ Py_UNICODE high_val, low_val; @@ -129,6 +130,7 @@ static int __Pyx_PyUnicodeBufferContainsUCS4_SP(Py_UNICODE* buffer, Py_ssize_t l } return 0; } +#endif static int __Pyx_PyUnicodeBufferContainsUCS4_BMP(Py_UNICODE* buffer, Py_ssize_t length, Py_UCS4 character) { Py_UNICODE uchar; @@ -153,12 +155,15 @@ static CYTHON_INLINE int __Pyx_UnicodeContainsUCS4(PyObject* unicode, Py_UCS4 ch return 0; } #endif - if (Py_UNICODE_SIZE == 2 && unlikely(character > 65535)) { +#if Py_UNICODE_SIZE == 2 + if (unlikely(character > 65535)) { return __Pyx_PyUnicodeBufferContainsUCS4_SP( PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode), character); - } else { + } else +#endif + { return __Pyx_PyUnicodeBufferContainsUCS4_BMP( PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode), diff --git a/tests/compile/buildenv.pyx b/tests/compile/buildenv.pyx index fc14d0e11..def37de59 100644 --- a/tests/compile/buildenv.pyx +++ b/tests/compile/buildenv.pyx @@ -34,6 +34,7 @@ cdef extern from *: # Cython config cdef int CYTHON_COMPILING_IN_CPYTHON + cdef int CYTHON_COMPILING_IN_LIMITED_API cdef int CYTHON_COMPILING_IN_PYPY cdef int CYTHON_COMPILING_IN_PYSTON cdef int CYTHON_USE_PYLONG_INTERNALS @@ -42,6 +43,7 @@ cdef extern from *: cdef int 
CYTHON_USE_UNICODE_WRITER cdef int CYTHON_AVOID_BORROWED_REFS cdef int CYTHON_ASSUME_SAFE_MACROS + cdef int CYTHON_USE_TYPE_SLOTS cdef int CYTHON_UNPACK_METHODS cdef int CYTHON_FAST_THREAD_STATE cdef int CYTHON_FAST_PYCALL @@ -76,6 +78,7 @@ Python {sys.version_info} PY_VERSION_HEX 0x{PY_VERSION_HEX:X} CYTHON_COMPILING_IN_CPYTHON {CYTHON_COMPILING_IN_CPYTHON} +CYTHON_COMPILING_IN_LIMITED_API {CYTHON_COMPILING_IN_LIMITED_API} CYTHON_COMPILING_IN_PYPY {CYTHON_COMPILING_IN_PYPY} CYTHON_COMPILING_IN_PYSTON {CYTHON_COMPILING_IN_PYSTON} @@ -85,6 +88,7 @@ CYTHON_USE_UNICODE_INTERNALS {CYTHON_USE_UNICODE_INTERNALS} CYTHON_USE_UNICODE_WRITER {CYTHON_USE_UNICODE_WRITER} CYTHON_AVOID_BORROWED_REFS {CYTHON_AVOID_BORROWED_REFS} CYTHON_ASSUME_SAFE_MACROS {CYTHON_ASSUME_SAFE_MACROS} +CYTHON_USE_TYPE_SLOTS {CYTHON_USE_TYPE_SLOTS} CYTHON_UNPACK_METHODS {CYTHON_UNPACK_METHODS} CYTHON_FAST_THREAD_STATE {CYTHON_FAST_THREAD_STATE} CYTHON_FAST_PYCALL {CYTHON_FAST_PYCALL} @@ -132,6 +136,7 @@ LINKCC (distutils) = {config_var('LINKCC')} LINKCC (env) = {get_env('LINKCC', '')} Encodings: +sys maxunicode = {sys.maxunicode} LANG (env) = {get_env('LANG', '')} PYTHONIOENCODING (env) = {get_env('PYTHONIOENCODING', '')} sys stdout encoding = {sys.stdout.encoding} diff --git a/tests/run/unicodeliterals.pyx b/tests/run/unicodeliterals.pyx index 9a9b20142..1947c6009 100644 --- a/tests/run/unicodeliterals.pyx +++ b/tests/run/unicodeliterals.pyx @@ -21,6 +21,13 @@ __doc__ = br""" u'\udc00' >>> h u'\ud800' + >>> q + u'\udc00\ud800' + + # The output of surrogate pairs differs between 16/32bit Unicode runtimes. 
+ #>>> p + #u'\ud800\udc00' + >>> add u'S\xf8k ik\xfc\xd6\xe4abc' >>> null @@ -44,6 +51,10 @@ __doc__ = br""" 1 >>> len(h) 1 + >>> len(q) + 2 + >>> len(q) + 2 >>> len(add) 12 >>> len(null) @@ -75,6 +86,10 @@ __doc__ = br""" True >>> h == u'\\ud800' # unescaped by Python (required by doctest) True + >>> p == u'\\ud800\\udc00' # unescaped by Python (required by doctest) + True + >>> q == u'\\udc00\\ud800' # unescaped by Python (required by doctest) + True >>> k == u'\\N{SNOWMAN}' == u'\\u2603' True >>> m == u'abc\\\\xf8\\\\t\\u00f8\\U000000f8' # unescaped by Python (required by doctest) @@ -111,6 +126,8 @@ g = u'\udc00' # lone trail surrogate h = u'\ud800' # lone lead surrogate k = u'\N{SNOWMAN}' m = ur'abc\xf8\t\u00f8\U000000f8' +p = u'\ud800\udc00' # surrogate pair +q = u'\udc00\ud800' # reversed surrogate pair add = u'Søk ik' + u'üÖä' + u'abc' null = u'\x00' |