From 13eb4498b70c947cd1bb972dcf4ebd75f609d3e4 Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Wed, 8 Jul 2020 09:27:00 +0200 Subject: Using Py_UNICODE to store lone surrogates makes Py3 join surrogate pairs on 16-bit Unicode platforms (Windows) when reading them back in, although we correctly processed them before. Instead, we now use the "unicode_escape" codec to store byte strings, because it can return surrogate characters (which the other codecs cannot). --- Cython/Compiler/ExprNodes.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Cython/Compiler/ExprNodes.py b/Cython/Compiler/ExprNodes.py index 5ebb273e0..831575b35 100644 --- a/Cython/Compiler/ExprNodes.py +++ b/Cython/Compiler/ExprNodes.py @@ -1632,13 +1632,14 @@ class UnicodeNode(ConstNode): # lone (unpaired) surrogates are not really portable and cannot be # decoded by the UTF-8 codec in Py3.3 self.result_code = code.get_py_const(py_object_type, 'ustring') - data_cname = code.get_pyunicode_ptr_const(self.value) + data_cname = code.get_string_const( + StringEncoding.BytesLiteral(self.value.encode('unicode_escape'))) const_code = code.get_cached_constants_writer(self.result_code) if const_code is None: return # already initialised const_code.mark_pos(self.pos) const_code.putln( - "%s = PyUnicode_FromUnicode(%s, (sizeof(%s) / sizeof(Py_UNICODE))-1); %s" % ( + "%s = PyUnicode_DecodeUnicodeEscape(%s, sizeof(%s) - 1, NULL); %s" % ( self.result_code, data_cname, data_cname, -- cgit v1.2.1