Really only use PyUnicode_FromUnicode() when needed (GH-3697)

* Really only use PyUnicode_FromUnicode() for strings that contain lone surrogates, not for normal non-BMP strings and not for surrogate pairs on 16bit Unicode platforms. See https://github.com/cython/cython/issues/3678 * Extend buildenv test to debug a macOS problem. * Add a test for surrogate pairs in Unicode strings. * Limit PyUnicode_FromUnicode() usage to strings containing lone surrogates. * Accept ambiguity of surrogate pairs in Unicode string literals when generated on 16bit Py2 systems.
author: scoder <stefan_ml@behnel.de> 2020-06-30 13:52:18 +0200
committer: GitHub <noreply@github.com> 2020-06-30 13:52:18 +0200
commit: 9c26b302791515f608d4b200f692dcec9f066229 (patch)
tree: c48a43f095d16406a9cab2dcc8fc8c03e4a91624
parent: 7e9a6b75c08ae0163c20a4ec41bd23fa9888b1b8 (diff)
download: cython-9c26b302791515f608d4b200f692dcec9f066229.tar.gz
7 files changed, 112 insertions, 4 deletions
diff --git a/Cython/Compiler/ExprNodes.py b/Cython/Compiler/ExprNodes.py
index f8579d48c..ae61d742d 100644
--- a/Cython/Compiler/ExprNodes.py
+++ b/Cython/Compiler/ExprNodes.py
@@ -1677,8 +1677,13 @@ class UnicodeNode(ConstNode):
 
     def generate_evaluation_code(self, code):
         if self.type.is_pyobject:
-            if self.contains_surrogates():
-                # surrogates are not really portable and cannot be
+            # FIXME: this should go away entirely!
+            # Since string_contains_lone_surrogates() returns False for surrogate pairs in Py2/UCS2,
+            # Py2 can generate different code from Py3 here.  Let's hope we get away with claiming that
+            # the processing of surrogate pairs in code was always ambiguous and led to different results
+            # on 16/32bit Unicode platforms.
+            if StringEncoding.string_contains_lone_surrogates(self.value):
+                # lone (unpaired) surrogates are not really portable and cannot be
                 # decoded by the UTF-8 codec in Py3.3
                 self.result_code = code.get_py_const(py_object_type, 'ustring')
                 data_cname = code.get_pyunicode_ptr_const(self.value)
diff --git a/Cython/Compiler/StringEncoding.py b/Cython/Compiler/StringEncoding.py
index 5703c51ae..192fc3de3 100644
--- a/Cython/Compiler/StringEncoding.py
+++ b/Cython/Compiler/StringEncoding.py
@@ -172,6 +172,34 @@ def string_contains_surrogates(ustring):
     return False
 
 
+def string_contains_lone_surrogates(ustring):
+    """
+    Return True if the unicode string contains a lone (unpaired) surrogate code point.
+    On narrow (UTF-16, sys.maxunicode == 65535) builds, a lead surrogate directly
+    followed by a trail surrogate is accepted as a pair.  On wide (UCS-4) builds,
+    any code point in the range U+D800..U+DFFF is reported as lone.
+    """
+    last_was_start = False  # True while the previous code unit was a lead surrogate
+    unicode_uses_surrogate_encoding = sys.maxunicode == 65535  # narrow (16bit) build?
+    for c in map(ord, ustring):
+        # surrogates tend to be rare
+        if c < 0xD800 or c > 0xDFFF:
+            if last_was_start:
+                return True  # lead surrogate followed by a non-surrogate
+        elif not unicode_uses_surrogate_encoding:
+            # on 32bit Unicode platforms, there is never a pair
+            return True
+        elif c <= 0xDBFF:
+            if last_was_start:
+                return True  # lone start
+            last_was_start = True
+        else:
+            if not last_was_start:
+                return True  # lone end
+            last_was_start = False
+    return last_was_start  # a lead surrogate at the very end is lone as well
+
+
 class BytesLiteral(_bytes):
     # bytes subclass that is compatible with EncodedString
     encoding = None
diff --git a/Cython/Compiler/Tests/TestStringEncoding.py b/Cython/Compiler/Tests/TestStringEncoding.py
new file mode 100644
index 000000000..91d099333
--- /dev/null
+++ b/Cython/Compiler/Tests/TestStringEncoding.py
@@ -0,0 +1,44 @@
+# -*- coding: utf-8 -*-
+
+import sys
+import unittest
+
+import Cython.Compiler.StringEncoding as StringEncoding
+
+
+class StringEncodingTest(unittest.TestCase):
+    """
+    Test the surrogate detection helpers of the StringEncoding module.
+    """
+    def test_string_contains_lone_surrogates(self):
+        self.assertFalse(StringEncoding.string_contains_lone_surrogates(u"abc"))
+        self.assertFalse(StringEncoding.string_contains_lone_surrogates(u"\uABCD"))
+        self.assertFalse(StringEncoding.string_contains_lone_surrogates(u"\N{SNOWMAN}"))
+
+        # This behaves differently in Py2 when freshly parsed and read from a .pyc file,
+        # but it seems to be a marshalling bug in Py2, which doesn't hurt us in Cython.
+        if sys.version_info[0] != 2:
+            self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uD800\uDFFF"))
+
+        # In Py2 with 16bit Unicode, the following is indistinguishable from the 32bit character.
+        obfuscated_surrogate_pair = (u"\uDFFF" + u"\uD800")[::-1]  # both parts must be unicode literals
+        if sys.version_info[0] == 2 and sys.maxunicode == 65535:  # 65535 == narrow (16bit) build
+            self.assertFalse(StringEncoding.string_contains_lone_surrogates(obfuscated_surrogate_pair))
+        else:
+            self.assertTrue(StringEncoding.string_contains_lone_surrogates(obfuscated_surrogate_pair))
+
+        self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uD800"))
+        self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uDFFF"))
+        self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uDFFF\uD800"))
+        self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uD800x\uDFFF"))
+
+    def test_string_contains_surrogates(self):
+        self.assertFalse(StringEncoding.string_contains_surrogates(u"abc"))
+        self.assertFalse(StringEncoding.string_contains_surrogates(u"\uABCD"))
+        self.assertFalse(StringEncoding.string_contains_surrogates(u"\N{SNOWMAN}"))
+
+        self.assertTrue(StringEncoding.string_contains_surrogates(u"\uD800"))
+        self.assertTrue(StringEncoding.string_contains_surrogates(u"\uDFFF"))
+        self.assertTrue(StringEncoding.string_contains_surrogates(u"\uD800\uDFFF"))
+        self.assertTrue(StringEncoding.string_contains_surrogates(u"\uDFFF\uD800"))
+        self.assertTrue(StringEncoding.string_contains_surrogates(u"\uD800x\uDFFF"))
diff --git a/Cython/Utility/ModuleSetupCode.c b/Cython/Utility/ModuleSetupCode.c
index 3a04d1f6e..e122603d8 100644
--- a/Cython/Utility/ModuleSetupCode.c
+++ b/Cython/Utility/ModuleSetupCode.c
@@ -741,7 +741,11 @@ static CYTHON_INLINE PyObject * __Pyx_PyDict_GetItemStrWithError(PyObject *dict,
   #define __Pyx_PyUnicode_DATA(u)         PyUnicode_DATA(u)
   #define __Pyx_PyUnicode_READ(k, d, i)   PyUnicode_READ(k, d, i)
   #define __Pyx_PyUnicode_WRITE(k, d, i, ch)  PyUnicode_WRITE(k, d, i, ch)
+  #if defined(PyUnicode_IS_READY) && defined(PyUnicode_GET_SIZE)
   #define __Pyx_PyUnicode_IS_TRUE(u)      (0 != (likely(PyUnicode_IS_READY(u)) ? PyUnicode_GET_LENGTH(u) : PyUnicode_GET_SIZE(u)))
+  #else
+  #define __Pyx_PyUnicode_IS_TRUE(u)      (0 != PyUnicode_GET_LENGTH(u))
+  #endif
 #else
   #define CYTHON_PEP393_ENABLED 0
   #define PyUnicode_1BYTE_KIND  1
diff --git a/Cython/Utility/StringTools.c b/Cython/Utility/StringTools.c
index 9de65dce3..222edd8fc 100644
--- a/Cython/Utility/StringTools.c
+++ b/Cython/Utility/StringTools.c
@@ -118,6 +118,7 @@ static CYTHON_INLINE int __Pyx_UnicodeContainsUCS4(PyObject* unicode, Py_UCS4 ch
 
 //////////////////// PyUCS4InUnicode ////////////////////
 
+#if Py_UNICODE_SIZE == 2
 static int __Pyx_PyUnicodeBufferContainsUCS4_SP(Py_UNICODE* buffer, Py_ssize_t length, Py_UCS4 character) {
     /* handle surrogate pairs for Py_UNICODE buffers in 16bit Unicode builds */
     Py_UNICODE high_val, low_val;
@@ -129,6 +130,7 @@ static int __Pyx_PyUnicodeBufferContainsUCS4_SP(Py_UNICODE* buffer, Py_ssize_t l
     }
     return 0;
 }
+#endif
 
 static int __Pyx_PyUnicodeBufferContainsUCS4_BMP(Py_UNICODE* buffer, Py_ssize_t length, Py_UCS4 character) {
     Py_UNICODE uchar;
@@ -153,12 +155,15 @@ static CYTHON_INLINE int __Pyx_UnicodeContainsUCS4(PyObject* unicode, Py_UCS4 ch
         return 0;
     }
 #endif
-    if (Py_UNICODE_SIZE == 2 && unlikely(character > 65535)) {
+#if Py_UNICODE_SIZE == 2
+    if (unlikely(character > 65535)) {
         return __Pyx_PyUnicodeBufferContainsUCS4_SP(
             PyUnicode_AS_UNICODE(unicode),
             PyUnicode_GET_SIZE(unicode),
             character);
-    } else {
+    } else
+#endif
+    {
         return __Pyx_PyUnicodeBufferContainsUCS4_BMP(
             PyUnicode_AS_UNICODE(unicode),
             PyUnicode_GET_SIZE(unicode),
diff --git a/tests/compile/buildenv.pyx b/tests/compile/buildenv.pyx
index fc14d0e11..def37de59 100644
--- a/tests/compile/buildenv.pyx
+++ b/tests/compile/buildenv.pyx
@@ -34,6 +34,7 @@ cdef extern from *:
 
     # Cython config
     cdef int CYTHON_COMPILING_IN_CPYTHON
+    cdef int CYTHON_COMPILING_IN_LIMITED_API
     cdef int CYTHON_COMPILING_IN_PYPY
     cdef int CYTHON_COMPILING_IN_PYSTON
     cdef int CYTHON_USE_PYLONG_INTERNALS
@@ -42,6 +43,7 @@ cdef extern from *:
     cdef int CYTHON_USE_UNICODE_WRITER
     cdef int CYTHON_AVOID_BORROWED_REFS
     cdef int CYTHON_ASSUME_SAFE_MACROS
+    cdef int CYTHON_USE_TYPE_SLOTS
     cdef int CYTHON_UNPACK_METHODS
     cdef int CYTHON_FAST_THREAD_STATE
     cdef int CYTHON_FAST_PYCALL
@@ -76,6 +78,7 @@ Python  {sys.version_info}
 PY_VERSION_HEX  0x{PY_VERSION_HEX:X}
 
 CYTHON_COMPILING_IN_CPYTHON  {CYTHON_COMPILING_IN_CPYTHON}
+CYTHON_COMPILING_IN_LIMITED_API  {CYTHON_COMPILING_IN_LIMITED_API}
 CYTHON_COMPILING_IN_PYPY  {CYTHON_COMPILING_IN_PYPY}
 CYTHON_COMPILING_IN_PYSTON  {CYTHON_COMPILING_IN_PYSTON}
 
@@ -85,6 +88,7 @@ CYTHON_USE_UNICODE_INTERNALS  {CYTHON_USE_UNICODE_INTERNALS}
 CYTHON_USE_UNICODE_WRITER  {CYTHON_USE_UNICODE_WRITER}
 CYTHON_AVOID_BORROWED_REFS  {CYTHON_AVOID_BORROWED_REFS}
 CYTHON_ASSUME_SAFE_MACROS  {CYTHON_ASSUME_SAFE_MACROS}
+CYTHON_USE_TYPE_SLOTS  {CYTHON_USE_TYPE_SLOTS}
 CYTHON_UNPACK_METHODS  {CYTHON_UNPACK_METHODS}
 CYTHON_FAST_THREAD_STATE  {CYTHON_FAST_THREAD_STATE}
 CYTHON_FAST_PYCALL  {CYTHON_FAST_PYCALL}
@@ -132,6 +136,7 @@ LINKCC (distutils) = {config_var('LINKCC')}
 LINKCC (env) = {get_env('LINKCC', '')}
 
 Encodings:
+sys maxunicode = {sys.maxunicode}
 LANG (env) = {get_env('LANG', '')}
 PYTHONIOENCODING (env) = {get_env('PYTHONIOENCODING', '')}
 sys stdout encoding = {sys.stdout.encoding}
diff --git a/tests/run/unicodeliterals.pyx b/tests/run/unicodeliterals.pyx
index 9a9b20142..1947c6009 100644
--- a/tests/run/unicodeliterals.pyx
+++ b/tests/run/unicodeliterals.pyx
@@ -21,6 +21,13 @@ __doc__ = br"""
     u'\udc00'
     >>> h
     u'\ud800'
+    >>> q
+    u'\udc00\ud800'
+
+    # The output of surrogate pairs differs between 16/32bit Unicode runtimes.
+    #>>> p
+    #u'\ud800\udc00'
+
     >>> add
     u'S\xf8k ik\xfc\xd6\xe4abc'
     >>> null
@@ -44,6 +51,10 @@ __doc__ = br"""
     1
     >>> len(h)
     1
+    >>> len(q)
+    2
+    >>> len(q)
+    2
     >>> len(add)
     12
     >>> len(null)
@@ -75,6 +86,10 @@ __doc__ = br"""
     True
     >>> h == u'\\ud800' # unescaped by Python (required by doctest)
     True
+    >>> p == u'\\ud800\\udc00' # unescaped by Python (required by doctest)
+    True
+    >>> q == u'\\udc00\\ud800' # unescaped by Python (required by doctest)
+    True
     >>> k == u'\\N{SNOWMAN}' == u'\\u2603'
     True
     >>> m == u'abc\\\\xf8\\\\t\\u00f8\\U000000f8'  # unescaped by Python (required by doctest)
@@ -111,6 +126,8 @@ g = u'\udc00'   # lone trail surrogate
 h = u'\ud800'   # lone lead surrogate
 k = u'\N{SNOWMAN}'
 m = ur'abc\xf8\t\u00f8\U000000f8'
+p = u'\ud800\udc00'  # surrogate pair
+q = u'\udc00\ud800'  # reversed surrogate pair
 
 add = u'Søk ik' + u'üÖä' + u'abc'
 null = u'\x00'
author	scoder <stefan_ml@behnel.de>	2020-06-30 13:52:18 +0200
committer	GitHub <noreply@github.com>	2020-06-30 13:52:18 +0200
commit	9c26b302791515f608d4b200f692dcec9f066229 (patch)
tree	c48a43f095d16406a9cab2dcc8fc8c03e4a91624
parent	7e9a6b75c08ae0163c20a4ec41bd23fa9888b1b8 (diff)
download	cython-9c26b302791515f608d4b200f692dcec9f066229.tar.gz