summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorscoder <stefan_ml@behnel.de>2020-06-30 13:52:18 +0200
committerGitHub <noreply@github.com>2020-06-30 13:52:18 +0200
commit9c26b302791515f608d4b200f692dcec9f066229 (patch)
treec48a43f095d16406a9cab2dcc8fc8c03e4a91624
parent7e9a6b75c08ae0163c20a4ec41bd23fa9888b1b8 (diff)
downloadcython-9c26b302791515f608d4b200f692dcec9f066229.tar.gz
Really only use PyUnicode_FromUnicode() when needed (GH-3697)
* Really only use PyUnicode_FromUnicode() for strings that contain lone surrogates, not for normal non-BMP strings and not for surrogate pairs on 16bit Unicode platforms. See https://github.com/cython/cython/issues/3678
* Extend buildenv test to debug a MacOS problem.
* Add a test for surrogate pairs in Unicode strings.
* Limit PyUnicode_FromUnicode() usage to strings containing lone surrogates.
* Accept ambiguity of surrogate pairs in Unicode string literals when generated on 16bit Py2 systems.
-rw-r--r--Cython/Compiler/ExprNodes.py9
-rw-r--r--Cython/Compiler/StringEncoding.py28
-rw-r--r--Cython/Compiler/Tests/TestStringEncoding.py44
-rw-r--r--Cython/Utility/ModuleSetupCode.c4
-rw-r--r--Cython/Utility/StringTools.c9
-rw-r--r--tests/compile/buildenv.pyx5
-rw-r--r--tests/run/unicodeliterals.pyx17
7 files changed, 112 insertions, 4 deletions
diff --git a/Cython/Compiler/ExprNodes.py b/Cython/Compiler/ExprNodes.py
index f8579d48c..ae61d742d 100644
--- a/Cython/Compiler/ExprNodes.py
+++ b/Cython/Compiler/ExprNodes.py
@@ -1677,8 +1677,13 @@ class UnicodeNode(ConstNode):
def generate_evaluation_code(self, code):
if self.type.is_pyobject:
- if self.contains_surrogates():
- # surrogates are not really portable and cannot be
+ # FIXME: this should go away entirely!
+ # Since string_contains_lone_surrogates() returns False for surrogate pairs in Py2/UCS2,
+ # Py2 can generate different code from Py3 here. Let's hope we get away with claiming that
+ # the processing of surrogate pairs in code was always ambiguous and led to different results
+ # on 16/32bit Unicode platforms.
+ if StringEncoding.string_contains_lone_surrogates(self.value):
+ # lone (unpaired) surrogates are not really portable and cannot be
# decoded by the UTF-8 codec in Py3.3
self.result_code = code.get_py_const(py_object_type, 'ustring')
data_cname = code.get_pyunicode_ptr_const(self.value)
diff --git a/Cython/Compiler/StringEncoding.py b/Cython/Compiler/StringEncoding.py
index 5703c51ae..192fc3de3 100644
--- a/Cython/Compiler/StringEncoding.py
+++ b/Cython/Compiler/StringEncoding.py
@@ -172,6 +172,34 @@ def string_contains_surrogates(ustring):
return False
+def string_contains_lone_surrogates(ustring):
+ """
+ Check if the unicode string contains lone surrogate code points,
+ i.e. code points in the range U+D800..U+DFFF that are not part of
+ a well-formed pair (a lead surrogate directly followed by a trail
+ surrogate).
+
+ On narrow (UTF-16, sys.maxunicode == 65535) builds, well-formed pairs
+ are accepted and only unpaired surrogates are reported.
+ On wide (UCS-4) builds, every surrogate code point is reported as lone.
+
+ Returns True if a lone surrogate was found, False otherwise.
+ """
+ last_was_start = False
+ # only narrow builds can legitimately contain surrogate pairs
+ unicode_uses_surrogate_encoding = sys.maxunicode == 65535
+ for c in map(ord, ustring):
+ # surrogates tend to be rare
+ if c < 0xD800 or c > 0xDFFF:
+ # non-surrogate directly after a lead surrogate => the lead was lone
+ if last_was_start:
+ return True
+ elif not unicode_uses_surrogate_encoding:
+ # on 32bit Unicode platforms, there is never a pair
+ return True
+ elif c <= 0xDBFF:
+ # lead surrogate (0xD800..0xDBFF)
+ if last_was_start:
+ return True # lone start
+ last_was_start = True
+ else:
+ # trail surrogate (0xDC00..0xDFFF)
+ if not last_was_start:
+ return True # lone end
+ last_was_start = False
+ # a lead surrogate at the very end of the string has no partner
+ return last_was_start
+
+
class BytesLiteral(_bytes):
# bytes subclass that is compatible with EncodedString
encoding = None
diff --git a/Cython/Compiler/Tests/TestStringEncoding.py b/Cython/Compiler/Tests/TestStringEncoding.py
new file mode 100644
index 000000000..91d099333
--- /dev/null
+++ b/Cython/Compiler/Tests/TestStringEncoding.py
@@ -0,0 +1,44 @@
+# -*- coding: utf-8 -*-
+
+import sys
+import unittest
+
+import Cython.Compiler.StringEncoding as StringEncoding
+
+
+class StringEncodingTest(unittest.TestCase):
+    """
+    Test the StringEncoding module.
+    """
+    def test_string_contains_lone_surrogates(self):
+        # Strings without any surrogate code points are never flagged.
+        self.assertFalse(StringEncoding.string_contains_lone_surrogates(u"abc"))
+        self.assertFalse(StringEncoding.string_contains_lone_surrogates(u"\uABCD"))
+        self.assertFalse(StringEncoding.string_contains_lone_surrogates(u"\N{SNOWMAN}"))
+
+        # This behaves differently in Py2 when freshly parsed and read from a .pyc file,
+        # but it seems to be a marshalling bug in Py2, which doesn't hurt us in Cython.
+        if sys.version_info[0] != 2:
+            self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uD800\uDFFF"))
+
+        # In Py2 with 16bit Unicode, the following is indistinguishable from the 32bit character.
+        # Both halves must be unicode literals: a plain "\uD800" is a 6-character
+        # byte string in Py2 and would not produce a surrogate at all.
+        obfuscated_surrogate_pair = (u"\uDFFF" + u"\uD800")[::-1]
+        # sys.maxunicode is 65535 (0xFFFF) on narrow builds.
+        if sys.version_info[0] == 2 and sys.maxunicode == 65535:
+            self.assertFalse(StringEncoding.string_contains_lone_surrogates(obfuscated_surrogate_pair))
+        else:
+            self.assertTrue(StringEncoding.string_contains_lone_surrogates(obfuscated_surrogate_pair))
+
+        # Unpaired, reversed and separated surrogates are lone on every platform.
+        self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uD800"))
+        self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uDFFF"))
+        self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uDFFF\uD800"))
+        self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uD800x\uDFFF"))
+
+    def test_string_contains_surrogates(self):
+        # Strings without any surrogate code points are never flagged.
+        self.assertFalse(StringEncoding.string_contains_surrogates(u"abc"))
+        self.assertFalse(StringEncoding.string_contains_surrogates(u"\uABCD"))
+        self.assertFalse(StringEncoding.string_contains_surrogates(u"\N{SNOWMAN}"))
+
+        # Any surrogate code point counts here, paired or not.
+        self.assertTrue(StringEncoding.string_contains_surrogates(u"\uD800"))
+        self.assertTrue(StringEncoding.string_contains_surrogates(u"\uDFFF"))
+        self.assertTrue(StringEncoding.string_contains_surrogates(u"\uD800\uDFFF"))
+        self.assertTrue(StringEncoding.string_contains_surrogates(u"\uDFFF\uD800"))
+        self.assertTrue(StringEncoding.string_contains_surrogates(u"\uD800x\uDFFF"))
diff --git a/Cython/Utility/ModuleSetupCode.c b/Cython/Utility/ModuleSetupCode.c
index 3a04d1f6e..e122603d8 100644
--- a/Cython/Utility/ModuleSetupCode.c
+++ b/Cython/Utility/ModuleSetupCode.c
@@ -741,7 +741,11 @@ static CYTHON_INLINE PyObject * __Pyx_PyDict_GetItemStrWithError(PyObject *dict,
#define __Pyx_PyUnicode_DATA(u) PyUnicode_DATA(u)
#define __Pyx_PyUnicode_READ(k, d, i) PyUnicode_READ(k, d, i)
#define __Pyx_PyUnicode_WRITE(k, d, i, ch) PyUnicode_WRITE(k, d, i, ch)
+ #if defined(PyUnicode_IS_READY) && defined(PyUnicode_GET_SIZE)
#define __Pyx_PyUnicode_IS_TRUE(u) (0 != (likely(PyUnicode_IS_READY(u)) ? PyUnicode_GET_LENGTH(u) : PyUnicode_GET_SIZE(u)))
+ #else
+ #define __Pyx_PyUnicode_IS_TRUE(u) (0 != PyUnicode_GET_LENGTH(u))
+ #endif
#else
#define CYTHON_PEP393_ENABLED 0
#define PyUnicode_1BYTE_KIND 1
diff --git a/Cython/Utility/StringTools.c b/Cython/Utility/StringTools.c
index 9de65dce3..222edd8fc 100644
--- a/Cython/Utility/StringTools.c
+++ b/Cython/Utility/StringTools.c
@@ -118,6 +118,7 @@ static CYTHON_INLINE int __Pyx_UnicodeContainsUCS4(PyObject* unicode, Py_UCS4 ch
//////////////////// PyUCS4InUnicode ////////////////////
+#if Py_UNICODE_SIZE == 2
static int __Pyx_PyUnicodeBufferContainsUCS4_SP(Py_UNICODE* buffer, Py_ssize_t length, Py_UCS4 character) {
/* handle surrogate pairs for Py_UNICODE buffers in 16bit Unicode builds */
Py_UNICODE high_val, low_val;
@@ -129,6 +130,7 @@ static int __Pyx_PyUnicodeBufferContainsUCS4_SP(Py_UNICODE* buffer, Py_ssize_t l
}
return 0;
}
+#endif
static int __Pyx_PyUnicodeBufferContainsUCS4_BMP(Py_UNICODE* buffer, Py_ssize_t length, Py_UCS4 character) {
Py_UNICODE uchar;
@@ -153,12 +155,15 @@ static CYTHON_INLINE int __Pyx_UnicodeContainsUCS4(PyObject* unicode, Py_UCS4 ch
return 0;
}
#endif
- if (Py_UNICODE_SIZE == 2 && unlikely(character > 65535)) {
+#if Py_UNICODE_SIZE == 2
+ if (unlikely(character > 65535)) {
return __Pyx_PyUnicodeBufferContainsUCS4_SP(
PyUnicode_AS_UNICODE(unicode),
PyUnicode_GET_SIZE(unicode),
character);
- } else {
+ } else
+#endif
+ {
return __Pyx_PyUnicodeBufferContainsUCS4_BMP(
PyUnicode_AS_UNICODE(unicode),
PyUnicode_GET_SIZE(unicode),
diff --git a/tests/compile/buildenv.pyx b/tests/compile/buildenv.pyx
index fc14d0e11..def37de59 100644
--- a/tests/compile/buildenv.pyx
+++ b/tests/compile/buildenv.pyx
@@ -34,6 +34,7 @@ cdef extern from *:
# Cython config
cdef int CYTHON_COMPILING_IN_CPYTHON
+ cdef int CYTHON_COMPILING_IN_LIMITED_API
cdef int CYTHON_COMPILING_IN_PYPY
cdef int CYTHON_COMPILING_IN_PYSTON
cdef int CYTHON_USE_PYLONG_INTERNALS
@@ -42,6 +43,7 @@ cdef extern from *:
cdef int CYTHON_USE_UNICODE_WRITER
cdef int CYTHON_AVOID_BORROWED_REFS
cdef int CYTHON_ASSUME_SAFE_MACROS
+ cdef int CYTHON_USE_TYPE_SLOTS
cdef int CYTHON_UNPACK_METHODS
cdef int CYTHON_FAST_THREAD_STATE
cdef int CYTHON_FAST_PYCALL
@@ -76,6 +78,7 @@ Python {sys.version_info}
PY_VERSION_HEX 0x{PY_VERSION_HEX:X}
CYTHON_COMPILING_IN_CPYTHON {CYTHON_COMPILING_IN_CPYTHON}
+CYTHON_COMPILING_IN_LIMITED_API {CYTHON_COMPILING_IN_LIMITED_API}
CYTHON_COMPILING_IN_PYPY {CYTHON_COMPILING_IN_PYPY}
CYTHON_COMPILING_IN_PYSTON {CYTHON_COMPILING_IN_PYSTON}
@@ -85,6 +88,7 @@ CYTHON_USE_UNICODE_INTERNALS {CYTHON_USE_UNICODE_INTERNALS}
CYTHON_USE_UNICODE_WRITER {CYTHON_USE_UNICODE_WRITER}
CYTHON_AVOID_BORROWED_REFS {CYTHON_AVOID_BORROWED_REFS}
CYTHON_ASSUME_SAFE_MACROS {CYTHON_ASSUME_SAFE_MACROS}
+CYTHON_USE_TYPE_SLOTS {CYTHON_USE_TYPE_SLOTS}
CYTHON_UNPACK_METHODS {CYTHON_UNPACK_METHODS}
CYTHON_FAST_THREAD_STATE {CYTHON_FAST_THREAD_STATE}
CYTHON_FAST_PYCALL {CYTHON_FAST_PYCALL}
@@ -132,6 +136,7 @@ LINKCC (distutils) = {config_var('LINKCC')}
LINKCC (env) = {get_env('LINKCC', '')}
Encodings:
+sys maxunicode = {sys.maxunicode}
LANG (env) = {get_env('LANG', '')}
PYTHONIOENCODING (env) = {get_env('PYTHONIOENCODING', '')}
sys stdout encoding = {sys.stdout.encoding}
diff --git a/tests/run/unicodeliterals.pyx b/tests/run/unicodeliterals.pyx
index 9a9b20142..1947c6009 100644
--- a/tests/run/unicodeliterals.pyx
+++ b/tests/run/unicodeliterals.pyx
@@ -21,6 +21,13 @@ __doc__ = br"""
u'\udc00'
>>> h
u'\ud800'
+ >>> q
+ u'\udc00\ud800'
+
+ # The output of surrogate pairs differs between 16/32bit Unicode runtimes.
+ #>>> p
+ #u'\ud800\udc00'
+
>>> add
u'S\xf8k ik\xfc\xd6\xe4abc'
>>> null
@@ -44,6 +51,10 @@ __doc__ = br"""
1
>>> len(h)
1
+ >>> len(q)
+ 2
+ >>> len(q)
+ 2
>>> len(add)
12
>>> len(null)
@@ -75,6 +86,10 @@ __doc__ = br"""
True
>>> h == u'\\ud800' # unescaped by Python (required by doctest)
True
+ >>> p == u'\\ud800\\udc00' # unescaped by Python (required by doctest)
+ True
+ >>> q == u'\\udc00\\ud800' # unescaped by Python (required by doctest)
+ True
>>> k == u'\\N{SNOWMAN}' == u'\\u2603'
True
>>> m == u'abc\\\\xf8\\\\t\\u00f8\\U000000f8' # unescaped by Python (required by doctest)
@@ -111,6 +126,8 @@ g = u'\udc00' # lone trail surrogate
h = u'\ud800' # lone lead surrogate
k = u'\N{SNOWMAN}'
m = ur'abc\xf8\t\u00f8\U000000f8'
+p = u'\ud800\udc00' # surrogate pair
+q = u'\udc00\ud800' # reversed surrogate pair
add = u'Søk ik' + u'üÖä' + u'abc'
null = u'\x00'