From 12e30827bc6224a133a1ccfb977c0bccb0c84576 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Wed, 25 Apr 2018 00:39:48 +0300 Subject: Allow to disable serializing bytes by default in Python 3. If encoding is None, then bytes objects will be passed to the default() method instead of transforming into unicode. --- index.rst | 22 ++++++++++--- simplejson/_speedups.c | 29 ++++++++++++----- simplejson/encoder.py | 19 ++++++----- simplejson/tests/test_dump.py | 76 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 125 insertions(+), 21 deletions(-) diff --git a/index.rst b/index.rst index 8c9c7ab..94c792a 100644 --- a/index.rst +++ b/index.rst @@ -192,8 +192,16 @@ Basic Usage .. versionchanged:: 2.1.4 Use ``(',', ': ')`` as default if *indent* is not ``None``. - *encoding* is the character encoding for str instances, default is - ``'utf-8'``. + If *encoding* is not ``None``, then all input :class:`bytes` objects in + Python 3 and 8-bit strings in Python 2 will be transformed + into unicode using that encoding prior to JSON-encoding. The default is + ``'utf-8'``. If *encoding* is ``None``, then all :class:`bytes` objects + will be passed to the *default* function in Python 3. + + .. versionchanged:: 3.15.0 + ``encoding=None`` disables serializing :class:`bytes` by default in + Python 3. + *default(obj)* is a function that should return a serializable version of *obj* or raise :exc:`TypeError`. The default simply raises :exc:`TypeError`. @@ -656,9 +664,15 @@ Encoders and decoders that can't otherwise be serialized. It should return a JSON encodable version of the object or raise a :exc:`TypeError`. - If *encoding* is not ``None``, then all input strings will be transformed + If *encoding* is not ``None``, then all input :class:`bytes` objects in + Python 3 and 8-bit strings in Python 2 will be transformed into unicode using that encoding prior to JSON-encoding. The default is - ``'utf-8'``. + ``'utf-8'``. 
If *encoding* is ``None``, then all :class:`bytes` objects + will be passed to the :meth:`default` method in Python 3. + + .. versionchanged:: 3.15.0 + ``encoding=None`` disables serializing :class:`bytes` by default in + Python 3. If *namedtuple_as_object* is true (default: ``True``), objects with ``_asdict()`` methods will be encoded diff --git a/simplejson/_speedups.c b/simplejson/_speedups.c index df23d15..e791618 100644 --- a/simplejson/_speedups.c +++ b/simplejson/_speedups.c @@ -634,8 +634,8 @@ encoder_stringify_key(PyEncoderObject *s, PyObject *key) Py_INCREF(key); return key; } - else if (PyString_Check(key)) { #if PY_MAJOR_VERSION >= 3 + else if (PyString_Check(key) && s->encoding != NULL) { const char *encoding = JSON_ASCII_AS_STRING(s->encoding); if (encoding == NULL) return NULL; @@ -644,11 +644,13 @@ encoder_stringify_key(PyEncoderObject *s, PyObject *key) PyString_GET_SIZE(key), encoding, NULL); + } #else /* PY_MAJOR_VERSION >= 3 */ + else if (PyString_Check(key)) { Py_INCREF(key); return key; -#endif /* PY_MAJOR_VERSION < 3 */ } +#endif /* PY_MAJOR_VERSION < 3 */ else if (PyFloat_Check(key)) { return encoder_encode_float(s, key); } @@ -676,7 +678,7 @@ encoder_stringify_key(PyEncoderObject *s, PyObject *key) else if (s->use_decimal && PyObject_TypeCheck(key, (PyTypeObject *)s->Decimal)) { return PyObject_Str(key); } - else if (s->skipkeys) { + if (s->skipkeys) { Py_INCREF(Py_None); return Py_None; } @@ -2578,11 +2580,19 @@ encoder_new(PyTypeObject *type, PyObject *args, PyObject *kwds) s->defaultfn = defaultfn; Py_INCREF(encoder); s->encoder = encoder; - s->encoding = JSON_ParseEncoding(encoding); - if (s->encoding == NULL) - goto bail; - if (JSON_ASCII_AS_STRING(s->encoding) == NULL) - goto bail; +#if PY_MAJOR_VERSION >= 3 + if (encoding == Py_None) { + s->encoding = NULL; + } + else +#endif /* PY_MAJOR_VERSION >= 3 */ + { + s->encoding = JSON_ParseEncoding(encoding); + if (s->encoding == NULL) + goto bail; + if (JSON_ASCII_AS_STRING(s->encoding) == 
NULL) + goto bail; + } Py_INCREF(indent); s->indent = indent; Py_INCREF(key_separator); @@ -2854,7 +2864,8 @@ encoder_listencode_obj(PyEncoderObject *s, JSON_Accu *rval, PyObject *obj, Py_ss if (cstr != NULL) rv = _steal_accumulate(rval, cstr); } - else if (PyString_Check(obj) || PyUnicode_Check(obj)) + else if ((PyString_Check(obj) && s->encoding != NULL) || + PyUnicode_Check(obj)) { PyObject *encoded = encoder_encode_string(s, obj); if (encoded != NULL) diff --git a/simplejson/encoder.py b/simplejson/encoder.py index 5693eb6..d2b6bca 100644 --- a/simplejson/encoder.py +++ b/simplejson/encoder.py @@ -309,7 +309,7 @@ class JSONEncoder(object): _encoder = encode_basestring_ascii else: _encoder = encode_basestring - if self.encoding != 'utf-8': + if self.encoding != 'utf-8' and self.encoding is not None: def _encoder(o, _orig_encoder=_encoder, _encoding=self.encoding): if isinstance(o, binary_type): o = o.decode(_encoding) @@ -482,8 +482,9 @@ def _make_iterencode(markers, _default, _encoder, _indent, _floatstr, first = False else: buf = separator - if (isinstance(value, string_types) or - (_PY3 and isinstance(value, bytes))): + if isinstance(value, string_types): + yield buf + _encoder(value) + elif _PY3 and isinstance(value, bytes) and _encoding is not None: yield buf + _encoder(value) elif isinstance(value, RawJSON): yield buf + value.encoded_json @@ -533,7 +534,7 @@ def _make_iterencode(markers, _default, _encoder, _indent, _floatstr, def _stringify_key(key): if isinstance(key, string_types): # pragma: no cover pass - elif isinstance(key, binary_type): + elif _PY3 and isinstance(key, bytes) and _encoding is not None: key = key.decode(_encoding) elif isinstance(key, float): key = _floatstr(key) @@ -603,8 +604,9 @@ def _make_iterencode(markers, _default, _encoder, _indent, _floatstr, yield item_separator yield _encoder(key) yield _key_separator - if (isinstance(value, string_types) or - (_PY3 and isinstance(value, bytes))): + if isinstance(value, string_types): + 
yield _encoder(value) + elif _PY3 and isinstance(value, bytes) and _encoding is not None: yield _encoder(value) elif isinstance(value, RawJSON): yield value.encoded_json @@ -647,8 +649,9 @@ def _make_iterencode(markers, _default, _encoder, _indent, _floatstr, del markers[markerid] def _iterencode(o, _current_indent_level): - if (isinstance(o, string_types) or - (_PY3 and isinstance(o, bytes))): + if isinstance(o, string_types): + yield _encoder(o) + elif _PY3 and isinstance(o, bytes) and _encoding is not None: yield _encoder(o) elif isinstance(o, RawJSON): yield o.encoded_json diff --git a/simplejson/tests/test_dump.py b/simplejson/tests/test_dump.py index 2a30125..6b36c20 100644 --- a/simplejson/tests/test_dump.py +++ b/simplejson/tests/test_dump.py @@ -11,6 +11,9 @@ def as_text_type(s): return s.decode('ascii') return s +def decode_iso_8859_15(b): + return b.decode('iso-8859-15') + class TestDump(TestCase): def test_dump(self): sio = StringIO() @@ -140,3 +143,76 @@ class TestDump(TestCase): json.dumps(MisbehavingTextSubtype(text)), json.dumps(text) ) + + def test_bytes_toplevel(self): + self.assertEqual(json.dumps(b('\xe2\x82\xac')), r'"\u20ac"') + self.assertRaises(UnicodeDecodeError, json.dumps, b('\xa4')) + self.assertEqual(json.dumps(b('\xa4'), encoding='iso-8859-1'), + r'"\u00a4"') + self.assertEqual(json.dumps(b('\xa4'), encoding='iso-8859-15'), + r'"\u20ac"') + if PY3: + self.assertRaises(TypeError, json.dumps, b('\xe2\x82\xac'), + encoding=None) + self.assertRaises(TypeError, json.dumps, b('\xa4'), + encoding=None) + self.assertEqual(json.dumps(b('\xa4'), encoding=None, + default=decode_iso_8859_15), + r'"\u20ac"') + else: + self.assertEqual(json.dumps(b('\xe2\x82\xac'), encoding=None), + r'"\u20ac"') + self.assertRaises(UnicodeDecodeError, json.dumps, b('\xa4'), + encoding=None) + self.assertRaises(UnicodeDecodeError, json.dumps, b('\xa4'), + encoding=None, default=decode_iso_8859_15) + + def test_bytes_nested(self): + 
self.assertEqual(json.dumps([b('\xe2\x82\xac')]), r'["\u20ac"]') + self.assertRaises(UnicodeDecodeError, json.dumps, [b('\xa4')]) + self.assertEqual(json.dumps([b('\xa4')], encoding='iso-8859-1'), + r'["\u00a4"]') + self.assertEqual(json.dumps([b('\xa4')], encoding='iso-8859-15'), + r'["\u20ac"]') + if PY3: + self.assertRaises(TypeError, json.dumps, [b('\xe2\x82\xac')], + encoding=None) + self.assertRaises(TypeError, json.dumps, [b('\xa4')], + encoding=None) + self.assertEqual(json.dumps([b('\xa4')], encoding=None, + default=decode_iso_8859_15), + r'["\u20ac"]') + else: + self.assertEqual(json.dumps([b('\xe2\x82\xac')], encoding=None), + r'["\u20ac"]') + self.assertRaises(UnicodeDecodeError, json.dumps, [b('\xa4')], + encoding=None) + self.assertRaises(UnicodeDecodeError, json.dumps, [b('\xa4')], + encoding=None, default=decode_iso_8859_15) + + def test_bytes_key(self): + self.assertEqual(json.dumps({b('\xe2\x82\xac'): 42}), r'{"\u20ac": 42}') + self.assertRaises(UnicodeDecodeError, json.dumps, {b('\xa4'): 42}) + self.assertEqual(json.dumps({b('\xa4'): 42}, encoding='iso-8859-1'), + r'{"\u00a4": 42}') + self.assertEqual(json.dumps({b('\xa4'): 42}, encoding='iso-8859-15'), + r'{"\u20ac": 42}') + if PY3: + self.assertRaises(TypeError, json.dumps, {b('\xe2\x82\xac'): 42}, + encoding=None) + self.assertRaises(TypeError, json.dumps, {b('\xa4'): 42}, + encoding=None) + self.assertRaises(TypeError, json.dumps, {b('\xa4'): 42}, + encoding=None, default=decode_iso_8859_15) + self.assertEqual(json.dumps({b('\xa4'): 42}, encoding=None, + skipkeys=True), + r'{}') + else: + self.assertEqual(json.dumps({b('\xe2\x82\xac'): 42}, encoding=None), + r'{"\u20ac": 42}') + self.assertRaises(UnicodeDecodeError, json.dumps, {b('\xa4'): 42}, + encoding=None) + self.assertRaises(UnicodeDecodeError, json.dumps, {b('\xa4'): 42}, + encoding=None, default=decode_iso_8859_15) + self.assertRaises(UnicodeDecodeError, json.dumps, {b('\xa4'): 42}, + encoding=None, skipkeys=True) -- cgit v1.2.1