From 60ef3879d792ec92480cf9d6d610951657c2e8c7 Mon Sep 17 00:00:00 2001 From: INADA Naoki Date: Thu, 11 Jan 2018 19:41:05 +0900 Subject: packer: Use PyUnicode_AsUTF8AndSize() for utf-8 (#272) --- docker/runtests.sh | 2 +- msgpack/_packer.pyx | 36 +++++++++++++++++++++++------------- msgpack/pack.h | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+), 14 deletions(-) diff --git a/docker/runtests.sh b/docker/runtests.sh index 11ef9f4..113b630 100755 --- a/docker/runtests.sh +++ b/docker/runtests.sh @@ -9,6 +9,6 @@ for V in cp36-cp36m cp35-cp35m cp27-cp27m cp27-cp27mu; do pushd test # prevent importing msgpack package in current directory. $PYBIN/python -c 'import sys; print(hex(sys.maxsize))' $PYBIN/python -c 'from msgpack import _packer, _unpacker' - $PYBIN/py.test -v + $PYBIN/pytest -v . popd done diff --git a/msgpack/_packer.pyx b/msgpack/_packer.pyx index 39da91b..a4913ab 100644 --- a/msgpack/_packer.pyx +++ b/msgpack/_packer.pyx @@ -13,6 +13,7 @@ cdef extern from "Python.h": int PyMemoryView_Check(object obj) int PyByteArray_Check(object obj) int PyByteArray_CheckExact(object obj) + char* PyUnicode_AsUTF8AndSize(object obj, Py_ssize_t *l) except NULL cdef extern from "pack.h": @@ -37,6 +38,7 @@ cdef extern from "pack.h": int msgpack_pack_bin(msgpack_packer* pk, size_t l) int msgpack_pack_raw_body(msgpack_packer* pk, char* body, size_t l) int msgpack_pack_ext(msgpack_packer* pk, char typecode, size_t l) + int msgpack_pack_unicode(msgpack_packer* pk, object o, long long limit) cdef int DEFAULT_RECURSE_LIMIT=511 cdef long long ITEM_LIMIT = (2**32)-1 @@ -126,8 +128,12 @@ cdef class Packer(object): raise TypeError("default must be a callable.") self._default = default if encoding is None: - self.encoding = 'utf_8' - self.unicode_errors = NULL + if unicode_errors is None: + self.encoding = NULL + self.unicode_errors = NULL + else: + self.encoding = "utf_8" + self.unicode_errors = unicode_errors else: if isinstance(encoding, unicode): self._bencoding = encoding.encode('ascii') @@ -140,6 +146,8 @@ cdef class Packer(object): self._berrors = unicode_errors if self._berrors is not None: self.unicode_errors = PyBytes_AsString(self._berrors) + else: + self.unicode_errors = NULL def __dealloc__(self): PyMem_Free(self.pk.buf) @@ -206,17 +214,19 @@ cdef class Packer(object): if ret == 0: ret = msgpack_pack_raw_body(&self.pk, rawval, L) elif PyUnicode_CheckExact(o) if strict_types else PyUnicode_Check(o): - if not self.encoding: - raise TypeError("Can't encode unicode string: no encoding is specified") - #TODO: Use faster API for UTF-8 - o = PyUnicode_AsEncodedString(o, self.encoding, self.unicode_errors) - L = len(o) - if L > ITEM_LIMIT: - raise PackValueError("unicode string is too large") - rawval = o - ret = msgpack_pack_raw(&self.pk, L) - if ret == 0: - ret = msgpack_pack_raw_body(&self.pk, rawval, L) + if self.encoding == NULL: + ret = msgpack_pack_unicode(&self.pk, o, ITEM_LIMIT); + if ret == -2: + raise PackValueError("unicode string is too large") + else: + o = PyUnicode_AsEncodedString(o, self.encoding, self.unicode_errors) + L = len(o) + if L > ITEM_LIMIT: + raise PackValueError("unicode string is too large") + ret = msgpack_pack_raw(&self.pk, L) + if ret == 0: + rawval = o + ret = msgpack_pack_raw_body(&self.pk, rawval, L) elif PyDict_CheckExact(o): d = o L = len(d) diff --git a/msgpack/pack.h b/msgpack/pack.h index 3bc21ea..4f3ce1d 100644 --- a/msgpack/pack.h +++ b/msgpack/pack.h @@ -67,6 +67,53 @@ static inline int msgpack_pack_write(msgpack_packer* pk, const char *data, size_ #include "pack_template.h" +// return -2 when o is too long +static inline int +msgpack_pack_unicode(msgpack_packer *pk, PyObject *o, long long limit) +{ +#if PY_MAJOR_VERSION >= 3 + assert(PyUnicode_Check(o)); + + Py_ssize_t len; + const char* buf = PyUnicode_AsUTF8AndSize(o, &len); + if (buf == NULL) + return -1; + + if (len > limit) { + return -2; + } + + int ret = msgpack_pack_raw(pk, len); + if (ret) return ret; + + return msgpack_pack_raw_body(pk, buf, len); +#else + PyObject *bytes; + Py_ssize_t len; + int ret; + + // py2 + bytes = PyUnicode_AsUTF8String(o); + if (bytes == NULL) + return -1; + + len = PyString_GET_SIZE(bytes); + if (len > limit) { + Py_DECREF(bytes); + return -2; + } + + ret = msgpack_pack_raw(pk, len); + if (ret) { + Py_DECREF(bytes); + return -1; + } + ret = msgpack_pack_raw_body(pk, PyString_AS_STRING(bytes), len); + Py_DECREF(bytes); + return ret; +#endif +} + #ifdef __cplusplus } #endif -- cgit v1.2.1