diff options
author | Inada Naoki <songofacandy@gmail.com> | 2019-12-03 20:53:11 +0900 |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-12-03 20:53:11 +0900 |
commit | 83ebb63c447a99c81d043eb6808bbfb50697a751 (patch) | |
tree | 1f31aa6d43adccf27d236f3b63adeb71aa933a26 | |
parent | a0480c760256b4afc18beaebd5e3c79de1d4ce56 (diff) | |
download | msgpack-python-83ebb63c447a99c81d043eb6808bbfb50697a751.tar.gz |
Resurrect unicode_errors of the Packer. (#379)
-rw-r--r-- | ChangeLog.rst | 2 | ||||
-rw-r--r-- | msgpack/_packer.pyx | 34 | ||||
-rw-r--r-- | msgpack/fallback.py | 11 | ||||
-rw-r--r-- | test/test_pack.py | 16 |
4 files changed, 50 insertions, 13 deletions
diff --git a/ChangeLog.rst b/ChangeLog.rst index 1352af8..1d784af 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -5,7 +5,7 @@ Release Date: TBD * Remove Python 2 support from the ``msgpack/_cmsgpack``. ``msgpack/fallback`` still supports Python 2. -* Remove encoding and unicode_errors options from the Packer. +* Remove ``encoding`` option from the Packer. 0.6.2 diff --git a/msgpack/_packer.pyx b/msgpack/_packer.pyx index 2e698e1..8b1a392 100644 --- a/msgpack/_packer.pyx +++ b/msgpack/_packer.pyx @@ -89,9 +89,15 @@ cdef class Packer(object): Additionally tuples will not be serialized as lists. This is useful when trying to implement accurate serialization for python types. + + :param str unicode_errors: + The error handler for encoding unicode. (default: 'strict') + DO NOT USE THIS!! This option is kept for very specific usage. """ cdef msgpack_packer pk cdef object _default + cdef object _berrors + cdef const char *unicode_errors cdef bint strict_types cdef bool use_float cdef bint autoreset @@ -104,10 +110,8 @@ cdef class Packer(object): self.pk.buf_size = buf_size self.pk.length = 0 - def __init__(self, default=None, - bint use_single_float=False, - bint autoreset=True, - bint use_bin_type=False, + def __init__(self, *, default=None, unicode_errors=None, + bint use_single_float=False, bint autoreset=True, bint use_bin_type=False, bint strict_types=False): self.use_float = use_single_float self.strict_types = strict_types @@ -118,6 +122,12 @@ cdef class Packer(object): raise TypeError("default must be a callable.") self._default = default + self._berrors = unicode_errors + if unicode_errors is None: + self.unicode_errors = NULL + else: + self.unicode_errors = self._berrors + def __dealloc__(self): PyMem_Free(self.pk.buf) self.pk.buf = NULL @@ -183,9 +193,19 @@ cdef class Packer(object): if ret == 0: ret = msgpack_pack_raw_body(&self.pk, rawval, L) elif PyUnicode_CheckExact(o) if strict_types else PyUnicode_Check(o): - ret = msgpack_pack_unicode(&self.pk, o, 
ITEM_LIMIT); - if ret == -2: - raise ValueError("unicode string is too large") + if self.unicode_errors == NULL: + ret = msgpack_pack_unicode(&self.pk, o, ITEM_LIMIT); + if ret == -2: + raise ValueError("unicode string is too large") + else: + o = PyUnicode_AsEncodedString(o, NULL, self.unicode_errors) + L = Py_SIZE(o) + if L > ITEM_LIMIT: + raise ValueError("unicode string is too large") + ret = msgpack_pack_raw(&self.pk, L) + if ret == 0: + rawval = o + ret = msgpack_pack_raw_body(&self.pk, rawval, L) elif PyDict_CheckExact(o): d = <dict>o L = len(d) diff --git a/msgpack/fallback.py b/msgpack/fallback.py index 5dab906..0c0c101 100644 --- a/msgpack/fallback.py +++ b/msgpack/fallback.py @@ -667,7 +667,7 @@ class Unpacker(object): elif self._raw: obj = bytes(obj) else: - obj = obj.decode('utf_8') + obj = obj.decode('utf_8', self._unicode_errors) return obj if typ == TYPE_EXT: return self._ext_hook(n, bytes(obj)) @@ -752,14 +752,19 @@ class Packer(object): Additionally tuples will not be serialized as lists. This is useful when trying to implement accurate serialization for python types. + + :param str unicode_errors: + The error handler for encoding unicode. (default: 'strict') + DO NOT USE THIS!! This option is kept for very specific usage. 
""" - def __init__(self, default=None, + def __init__(self, default=None, unicode_errors=None, use_single_float=False, autoreset=True, use_bin_type=False, strict_types=False): self._strict_types = strict_types self._use_float = use_single_float self._autoreset = autoreset self._use_bin_type = use_bin_type + self._unicode_errors = unicode_errors or "strict" self._buffer = StringIO() if default is not None: if not callable(default): @@ -816,7 +821,7 @@ class Packer(object): self._pack_bin_header(n) return self._buffer.write(obj) if check(obj, unicode): - obj = obj.encode("utf-8") + obj = obj.encode("utf-8", self._unicode_errors) n = len(obj) if n >= 2**32: raise ValueError("String is too large") diff --git a/test/test_pack.py b/test/test_pack.py index 194b2c9..b6752e5 100644 --- a/test/test_pack.py +++ b/test/test_pack.py @@ -5,6 +5,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera from collections import OrderedDict from io import BytesIO import struct +import sys import pytest from pytest import raises, xfail @@ -54,13 +55,24 @@ def testPackByteArrays(): for td in test_data: check(td) +@pytest.mark.skipif(sys.version_info < (3,0), reason="Python 2 passes invalid surrogates") +def testIgnoreUnicodeErrors(): + re = unpackb(packb(b'abc\xeddef', use_bin_type=False), + raw=False, unicode_errors='ignore') + assert re == "abcdef" + def testStrictUnicodeUnpack(): - packed = packb(b'abc\xeddef') + packed = packb(b'abc\xeddef', use_bin_type=False) with pytest.raises(UnicodeDecodeError): unpackb(packed, raw=False, use_list=1) +@pytest.mark.skipif(sys.version_info < (3,0), reason="Python 2 passes invalid surrogates") +def testIgnoreErrorsPack(): + re = unpackb(packb(u"abc\uDC80\uDCFFdef", use_bin_type=True, unicode_errors='ignore'), raw=False, use_list=1) + assert re == "abcdef" + def testDecodeBinary(): - re = unpackb(packb(b"abc"), encoding=None, use_list=1) + re = unpackb(packb(b"abc"), use_list=1) assert re == b"abc" def testPackFloat(): 
|