De-optimise the unicode methods ".upper()", ".lower()" and ".title()" on single-character values, since their result may consist of multiple characters.

author: Stefan Behnel <stefan_ml@behnel.de> 2019-01-03 09:49:35 +0100
committer: Stefan Behnel <stefan_ml@behnel.de> 2019-01-03 09:50:23 +0100
commit: 4b64bbe1c792c0b0f40d97bb98e4192e28923ca2 (patch)
tree: 514fea12a5cf1e93ad14f5586ec570957889096d
parent: 8103d0552c8129211241f047f3bfa10c43e3ab2a (diff)
download: cython-4b64bbe1c792c0b0f40d97bb98e4192e28923ca2.tar.gz
4 files changed, 32 insertions, 15 deletions
diff --git a/CHANGES.rst b/CHANGES.rst
index a7468576b..a848617ef 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -20,6 +20,11 @@ Features added
 Bugs fixed
 ----------
 
+* The unicode methods ``.upper()``, ``.lower()`` and ``.title()`` were
+  incorrectly optimised for single character input values and only returned
+  the first character if multiple characters should have been returned.
+  They now use the original Python methods again.
+
 * The ``Py_hash_t`` type failed to accept arbitrary "index" values.
   (Github issue #2752)
 
diff --git a/Cython/Compiler/Optimize.py b/Cython/Compiler/Optimize.py
index 4c306d566..a577facf1 100644
--- a/Cython/Compiler/Optimize.py
+++ b/Cython/Compiler/Optimize.py
@@ -3374,6 +3374,8 @@ class OptimizeBuiltinCalls(Visitor.NodeRefCleanupMixin,
             PyrexTypes.CFuncTypeArg("uchar", PyrexTypes.c_py_ucs4_type, None),
             ])
 
+    # DISABLED: The optimised call can only return a single character, but case conversion may yield several.
+    '''
     def _inject_unicode_character_conversion(self, node, function, args, is_unbound_method):
         if is_unbound_method or len(args) != 1:
             return node
@@ -3392,9 +3394,10 @@ class OptimizeBuiltinCalls(Visitor.NodeRefCleanupMixin,
             func_call = func_call.coerce_to_pyobject(self.current_env)
         return func_call
 
-    _handle_simple_method_unicode_lower = _inject_unicode_character_conversion
-    _handle_simple_method_unicode_upper = _inject_unicode_character_conversion
-    _handle_simple_method_unicode_title = _inject_unicode_character_conversion
+    #_handle_simple_method_unicode_lower = _inject_unicode_character_conversion
+    #_handle_simple_method_unicode_upper = _inject_unicode_character_conversion
+    #_handle_simple_method_unicode_title = _inject_unicode_character_conversion
+    '''
 
     PyUnicode_Splitlines_func_type = PyrexTypes.CFuncType(
         Builtin.list_type, [
diff --git a/tests/run/py_ucs4_type.pyx b/tests/run/py_ucs4_type.pyx
index 7193319c6..afd45fca3 100644
--- a/tests/run/py_ucs4_type.pyx
+++ b/tests/run/py_ucs4_type.pyx
@@ -132,15 +132,24 @@ def unicode_type_methods(Py_UCS4 uchar):
         uchar.isupper(),
         ]
 
-@cython.test_assert_path_exists('//PythonCapiCallNode')
-@cython.test_fail_if_path_exists('//SimpleCallNode')
+#@cython.test_assert_path_exists('//PythonCapiCallNode')
+#@cython.test_fail_if_path_exists('//SimpleCallNode')
 def unicode_methods(Py_UCS4 uchar):
     """
-    >>> unicode_methods(ord('A')) == ['a', 'A', 'A']
+    >>> unicode_methods(ord('A')) == ['a', 'A', 'A'] or unicode_methods(ord('A'))
+    True
+    >>> unicode_methods(ord('a')) == ['a', 'A', 'A'] or unicode_methods(ord('a'))
     True
-    >>> unicode_methods(ord('a')) == ['a', 'A', 'A']
+    >>> unicode_methods(0x1E9E) == [u'\\xdf', u'\\u1e9e', u'\\u1e9e'] or unicode_methods(0x1E9E)
+    True
+    >>> unicode_methods(0x0130) in (
+    ...     [u'i\\u0307', u'\\u0130', u'\\u0130'],  # Py3
+    ...     [u'i', u'\\u0130', u'\\u0130'],  # Py2
+    ... ) or unicode_methods(0x0130)
     True
     """
+    # \u1E9E == 'LATIN CAPITAL LETTER SHARP S'
+    # \u0130 == 'LATIN CAPITAL LETTER I WITH DOT ABOVE'
     return [
         # character conversion
         uchar.lower(),
@@ -149,11 +158,11 @@ def unicode_methods(Py_UCS4 uchar):
         ]
 
 
-@cython.test_assert_path_exists('//PythonCapiCallNode')
-@cython.test_fail_if_path_exists(
-    '//SimpleCallNode',
-    '//CoerceFromPyTypeNode',
-)
+#@cython.test_assert_path_exists('//PythonCapiCallNode')
+#@cython.test_fail_if_path_exists(
+#    '//SimpleCallNode',
+#    '//CoerceFromPyTypeNode',
+#)
 def unicode_method_return_type(Py_UCS4 uchar):
     """
     >>> unicode_method_return_type(ord('A'))
@@ -366,5 +375,5 @@ def uchar_lookup_in_dict(obj, Py_UCS4 uchar):
 
 
 _WARNINGS = """
-364:16: Item lookup of unicode character codes now always converts to a Unicode string. Use an explicit C integer cast to get back the previous integer lookup behaviour.
+373:16: Item lookup of unicode character codes now always converts to a Unicode string. Use an explicit C integer cast to get back the previous integer lookup behaviour.
 """
diff --git a/tests/run/py_unicode_type.pyx b/tests/run/py_unicode_type.pyx
index d8d172bc9..0d33be927 100644
--- a/tests/run/py_unicode_type.pyx
+++ b/tests/run/py_unicode_type.pyx
@@ -123,8 +123,8 @@ def unicode_type_methods(Py_UNICODE uchar):
         uchar.isupper(),
         ]
 
-@cython.test_assert_path_exists('//PythonCapiCallNode')
-@cython.test_fail_if_path_exists('//SimpleCallNode')
+#@cython.test_assert_path_exists('//PythonCapiCallNode')
+#@cython.test_fail_if_path_exists('//SimpleCallNode')
 def unicode_methods(Py_UNICODE uchar):
     """
     >>> unicode_methods(ord('A')) == ['a', 'A', 'A']
author	Stefan Behnel <stefan_ml@behnel.de>	2019-01-03 09:49:35 +0100
committer	Stefan Behnel <stefan_ml@behnel.de>	2019-01-03 09:50:23 +0100
commit	4b64bbe1c792c0b0f40d97bb98e4192e28923ca2 (patch)
tree	514fea12a5cf1e93ad14f5586ec570957889096d
parent	8103d0552c8129211241f047f3bfa10c43e3ab2a (diff)
download	cython-4b64bbe1c792c0b0f40d97bb98e4192e28923ca2.tar.gz