Resurrect unicode_errors of the Packer. (#379)

author: Inada Naoki <songofacandy@gmail.com> 2019-12-03 20:53:11 +0900
committer: GitHub <noreply@github.com> 2019-12-03 20:53:11 +0900
commit: 83ebb63c447a99c81d043eb6808bbfb50697a751 (patch)
tree: 1f31aa6d43adccf27d236f3b63adeb71aa933a26
parent: a0480c760256b4afc18beaebd5e3c79de1d4ce56 (diff)
download: msgpack-python-83ebb63c447a99c81d043eb6808bbfb50697a751.tar.gz
4 files changed, 50 insertions, 13 deletions
diff --git a/ChangeLog.rst b/ChangeLog.rst
index 1352af8..1d784af 100644
--- a/ChangeLog.rst
+++ b/ChangeLog.rst
@@ -5,7 +5,7 @@ Release Date: TBD
 
 * Remove Python 2 support from the ``msgpack/_cmsgpack``.
   ``msgpack/fallback`` still supports Python 2.
-* Remove encoding and unicode_errors options from the Packer.
+* Remove ``encoding`` option from the Packer.
 
 
 0.6.2
diff --git a/msgpack/_packer.pyx b/msgpack/_packer.pyx
index 2e698e1..8b1a392 100644
--- a/msgpack/_packer.pyx
+++ b/msgpack/_packer.pyx
@@ -89,9 +89,15 @@ cdef class Packer(object):
         Additionally tuples will not be serialized as lists.
         This is useful when trying to implement accurate serialization
         for python types.
+
+    :param str unicode_errors:
+        The error handler for encoding unicode. (default: 'strict')
+        DO NOT USE THIS!!  This option is kept for very specific usage.
     """
     cdef msgpack_packer pk
     cdef object _default
+    cdef object _berrors
+    cdef const char *unicode_errors
     cdef bint strict_types
     cdef bool use_float
     cdef bint autoreset
@@ -104,10 +110,8 @@ cdef class Packer(object):
         self.pk.buf_size = buf_size
         self.pk.length = 0
 
-    def __init__(self, default=None,
-                 bint use_single_float=False,
-                 bint autoreset=True,
-                 bint use_bin_type=False,
+    def __init__(self, *, default=None, unicode_errors=None,
+                 bint use_single_float=False, bint autoreset=True, bint use_bin_type=False,
                  bint strict_types=False):
         self.use_float = use_single_float
         self.strict_types = strict_types
@@ -118,6 +122,12 @@ cdef class Packer(object):
                 raise TypeError("default must be a callable.")
         self._default = default
 
+        self._berrors = unicode_errors
+        if unicode_errors is None:
+            self.unicode_errors = NULL
+        else:
+            self.unicode_errors = self._berrors
+
     def __dealloc__(self):
         PyMem_Free(self.pk.buf)
         self.pk.buf = NULL
@@ -183,9 +193,19 @@ cdef class Packer(object):
                 if ret == 0:
                     ret = msgpack_pack_raw_body(&self.pk, rawval, L)
             elif PyUnicode_CheckExact(o) if strict_types else PyUnicode_Check(o):
-                ret = msgpack_pack_unicode(&self.pk, o, ITEM_LIMIT);
-                if ret == -2:
-                    raise ValueError("unicode string is too large")
+                if self.unicode_errors == NULL:
+                    ret = msgpack_pack_unicode(&self.pk, o, ITEM_LIMIT);
+                    if ret == -2:
+                        raise ValueError("unicode string is too large")
+                else:
+                    o = PyUnicode_AsEncodedString(o, NULL, self.unicode_errors)
+                    L = Py_SIZE(o)
+                    if L > ITEM_LIMIT:
+                        raise ValueError("unicode string is too large")
+                    ret = msgpack_pack_raw(&self.pk, L)
+                    if ret == 0:
+                        rawval = o
+                        ret = msgpack_pack_raw_body(&self.pk, rawval, L)
             elif PyDict_CheckExact(o):
                 d = <dict>o
                 L = len(d)
diff --git a/msgpack/fallback.py b/msgpack/fallback.py
index 5dab906..0c0c101 100644
--- a/msgpack/fallback.py
+++ b/msgpack/fallback.py
@@ -667,7 +667,7 @@ class Unpacker(object):
             elif self._raw:
                 obj = bytes(obj)
             else:
-                obj = obj.decode('utf_8')
+                obj = obj.decode('utf_8', self._unicode_errors)
             return obj
         if typ == TYPE_EXT:
             return self._ext_hook(n, bytes(obj))
@@ -752,14 +752,19 @@ class Packer(object):
         Additionally tuples will not be serialized as lists.
         This is useful when trying to implement accurate serialization
         for python types.
+
+    :param str unicode_errors:
+        The error handler for encoding unicode. (default: 'strict')
+        DO NOT USE THIS!!  This option is kept for very specific usage.
     """
-    def __init__(self, default=None,
+    def __init__(self, default=None, unicode_errors=None,
                  use_single_float=False, autoreset=True, use_bin_type=False,
                  strict_types=False):
         self._strict_types = strict_types
         self._use_float = use_single_float
         self._autoreset = autoreset
         self._use_bin_type = use_bin_type
+        self._unicode_errors = unicode_errors or "strict"
         self._buffer = StringIO()
         if default is not None:
             if not callable(default):
@@ -816,7 +821,7 @@ class Packer(object):
                 self._pack_bin_header(n)
                 return self._buffer.write(obj)
             if check(obj, unicode):
-                obj = obj.encode("utf-8")
+                obj = obj.encode("utf-8", self._unicode_errors)
                 n = len(obj)
                 if n >= 2**32:
                     raise ValueError("String is too large")
diff --git a/test/test_pack.py b/test/test_pack.py
index 194b2c9..b6752e5 100644
--- a/test/test_pack.py
+++ b/test/test_pack.py
@@ -5,6 +5,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 from collections import OrderedDict
 from io import BytesIO
 import struct
+import sys
 
 import pytest
 from pytest import raises, xfail
@@ -54,13 +55,24 @@ def testPackByteArrays():
     for td in test_data:
         check(td)
 
+@pytest.mark.skipif(sys.version_info < (3,0), reason="Python 2 passes invalid surrogates")
+def testIgnoreUnicodeErrors():
+    re = unpackb(packb(b'abc\xeddef', use_bin_type=False),
+                 raw=False, unicode_errors='ignore')
+    assert re == "abcdef"
+
 def testStrictUnicodeUnpack():
-    packed = packb(b'abc\xeddef')
+    packed = packb(b'abc\xeddef', use_bin_type=False)
     with pytest.raises(UnicodeDecodeError):
         unpackb(packed, raw=False, use_list=1)
 
+@pytest.mark.skipif(sys.version_info < (3,0), reason="Python 2 passes invalid surrogates")
+def testIgnoreErrorsPack():
+    re = unpackb(packb(u"abc\uDC80\uDCFFdef", use_bin_type=True, unicode_errors='ignore'), raw=False, use_list=1)
+    assert re == "abcdef"
+
 def testDecodeBinary():
-    re = unpackb(packb(b"abc"), encoding=None, use_list=1)
+    re = unpackb(packb(b"abc"), use_list=1)
     assert re == b"abc"
 
 def testPackFloat():
author	Inada Naoki <songofacandy@gmail.com>	2019-12-03 20:53:11 +0900
committer	GitHub <noreply@github.com>	2019-12-03 20:53:11 +0900
commit	83ebb63c447a99c81d043eb6808bbfb50697a751 (patch)
tree	1f31aa6d43adccf27d236f3b63adeb71aa933a26
parent	a0480c760256b4afc18beaebd5e3c79de1d4ce56 (diff)
download	msgpack-python-83ebb63c447a99c81d043eb6808bbfb50697a751.tar.gz