summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author tailhook <pc@gafol.net> 2011-04-15 17:36:17 +0300
committer tailhook <pc@gafol.net> 2011-04-15 18:39:17 +0300
commit 752e3d1b783fc1c12a28e05a93aa73ac7c6b751c (patch)
tree 1b22ff0771d1164468253bed6d0fd668e7e49a79
parent af7113bb31877f337eef3d43048c0a9f1cb74258 (diff)
download msgpack-python-752e3d1b783fc1c12a28e05a93aa73ac7c6b751c.tar.gz
Implemented encoding for strings
* Packer uses `utf-8` encoding by default * Unpacker uses `None` by default, so no decoding is done * Both pack and unpack have `encoding` and `unicode_errors` arguments; if `encoding` is `None`, no encoding/decoding is done, otherwise it is the name of a Python codec. `unicode_errors` is passed as the `errors` parameter to the codec
-rw-r--r-- msgpack/_msgpack.pyx 72
-rw-r--r-- msgpack/unpack.h 8
-rw-r--r-- test/test_pack.py 57
-rw-r--r-- test3/test_obj.py 2
-rw-r--r-- test3/test_pack.py 55
5 files changed, 169 insertions, 25 deletions
diff --git a/msgpack/_msgpack.pyx b/msgpack/_msgpack.pyx
index cdcd0c8..443cbd7 100644
--- a/msgpack/_msgpack.pyx
+++ b/msgpack/_msgpack.pyx
@@ -36,7 +36,7 @@ cdef int DEFAULT_RECURSE_LIMIT=511
cdef class Packer(object):
"""MessagePack Packer
-
+
usage:
packer = Packer()
@@ -45,6 +45,10 @@ cdef class Packer(object):
"""
cdef msgpack_packer pk
cdef object _default
+ cdef object _bencoding
+ cdef object _berrors
+ cdef char *encoding
+ cdef char *unicode_errors
def __cinit__(self):
cdef int buf_size = 1024*1024
@@ -54,11 +58,25 @@ cdef class Packer(object):
self.pk.buf_size = buf_size
self.pk.length = 0
- def __init__(self, default=None):
+ def __init__(self, default=None, encoding='utf-8', unicode_errors='strict'):
if default is not None:
if not PyCallable_Check(default):
raise TypeError("default must be a callable.")
self._default = default
+ if encoding is None:
+ self.encoding = NULL
+ self.unicode_errors = NULL
+ else:
+ if isinstance(encoding, unicode):
+ self._bencoding = encoding.encode('ascii')
+ else:
+ self._bencoding = encoding
+ self.encoding = PyBytes_AsString(self._bencoding)
+ if isinstance(unicode_errors, unicode):
+ self._berrors = unicode_errors.encode('ascii')
+ else:
+ self._berrors = unicode_errors
+ self.unicode_errors = PyBytes_AsString(self._berrors)
def __dealloc__(self):
free(self.pk.buf);
@@ -68,7 +86,7 @@ cdef class Packer(object):
cdef unsigned long long ullval
cdef long longval
cdef double fval
- cdef char* rawval
+ cdef char* rawval
cdef int ret
cdef dict d
@@ -101,7 +119,9 @@ cdef class Packer(object):
if ret == 0:
ret = msgpack_pack_raw_body(&self.pk, rawval, len(o))
elif PyUnicode_Check(o):
- o = PyUnicode_AsUTF8String(o)
+ if not self.encoding:
+ raise TypeError("Can't encode utf-8 no encoding is specified")
+ o = PyUnicode_AsEncodedString(o, self.encoding, self.unicode_errors)
rawval = o
ret = msgpack_pack_raw(&self.pk, len(o))
if ret == 0:
@@ -138,14 +158,14 @@ cdef class Packer(object):
return buf
-def pack(object o, object stream, default=None):
+def pack(object o, object stream, default=None, encoding='utf-8', unicode_errors='strict'):
"""pack an object `o` and write it to stream)."""
- packer = Packer(default=default)
+ packer = Packer(default=default, encoding=encoding, unicode_errors=unicode_errors)
stream.write(packer.pack(o))
-def packb(object o, default=None):
+def packb(object o, default=None, encoding='utf-8', unicode_errors='strict'):
"""pack o and return packed bytes."""
- packer = Packer(default=default)
+ packer = Packer(default=default, encoding=encoding, unicode_errors=unicode_errors)
return packer.pack(o)
dumps = packs = packb
@@ -155,6 +175,8 @@ cdef extern from "unpack.h":
int use_list
PyObject* object_hook
PyObject* list_hook
+ char *encoding
+ char *unicode_errors
ctypedef struct template_context:
msgpack_user user
@@ -164,12 +186,12 @@ cdef extern from "unpack.h":
PyObject* key
int template_execute(template_context* ctx, const_char_ptr data,
- size_t len, size_t* off)
+ size_t len, size_t* off) except -1
void template_init(template_context* ctx)
object template_data(template_context* ctx)
-def unpackb(object packed, object object_hook=None, object list_hook=None, bint use_list=0):
+def unpackb(object packed, object object_hook=None, object list_hook=None, bint use_list=0, encoding=None, unicode_errors="strict"):
"""Unpack packed_bytes to object. Returns an unpacked object."""
cdef template_context ctx
cdef size_t off = 0
@@ -179,9 +201,25 @@ def unpackb(object packed, object object_hook=None, object list_hook=None, bint
cdef Py_ssize_t buf_len
PyObject_AsReadBuffer(packed, <const_void_ptr*>&buf, &buf_len)
+ if encoding is None:
+ enc = NULL
+ else:
+ if isinstance(encoding, unicode):
+ bencoding = encoding.encode('ascii')
+ else:
+ bencoding = encoding
+ if isinstance(unicode_errors, unicode):
+ berrors = unicode_errors.encode('ascii')
+ else:
+ berrors = unicode_errors
+ enc = PyBytes_AsString(bencoding)
+ err = PyBytes_AsString(berrors)
+
template_init(&ctx)
ctx.user.use_list = use_list
ctx.user.object_hook = ctx.user.list_hook = NULL
+ ctx.user.encoding = enc
+ ctx.user.unicode_errors = err
if object_hook is not None:
if not PyCallable_Check(object_hook):
raise TypeError("object_hook must be a callable.")
@@ -191,8 +229,10 @@ def unpackb(object packed, object object_hook=None, object list_hook=None, bint
raise TypeError("list_hook must be a callable.")
ctx.user.list_hook = <PyObject*>list_hook
_gc_disable()
- ret = template_execute(&ctx, buf, buf_len, &off)
- _gc_enable()
+ try:
+ ret = template_execute(&ctx, buf, buf_len, &off)
+ finally:
+ _gc_enable()
if ret == 1:
return template_data(&ctx)
else:
@@ -200,10 +240,10 @@ def unpackb(object packed, object object_hook=None, object list_hook=None, bint
loads = unpacks = unpackb
-def unpack(object stream, object object_hook=None, object list_hook=None, bint use_list=0):
+def unpack(object stream, object object_hook=None, object list_hook=None, bint use_list=0, encoding=None, unicode_errors="strict"):
"""unpack an object from stream."""
return unpackb(stream.read(), use_list=use_list,
- object_hook=object_hook, list_hook=list_hook)
+ object_hook=object_hook, list_hook=list_hook, encoding=encoding, unicode_errors=unicode_errors)
cdef class Unpacker(object):
"""Unpacker(read_size=1024*1024)
@@ -236,7 +276,7 @@ cdef class Unpacker(object):
self.buf = NULL;
def __init__(self, file_like=None, Py_ssize_t read_size=0, bint use_list=0,
- object object_hook=None, object list_hook=None):
+ object object_hook=None, object list_hook=None, encoding=None, unicode_errors=None):
if read_size == 0:
read_size = 1024*1024
self.use_list = use_list
@@ -292,7 +332,7 @@ cdef class Unpacker(object):
new_size = tail + _buf_len
if new_size < buf_size*2:
new_size = buf_size*2
- buf = <char*>realloc(buf, new_size)
+ buf = <char*>realloc(buf, new_size)
if buf == NULL:
# self.buf still holds old buffer and will be freed during
# obj destruction
diff --git a/msgpack/unpack.h b/msgpack/unpack.h
index 453ec2b..0586ca8 100644
--- a/msgpack/unpack.h
+++ b/msgpack/unpack.h
@@ -23,6 +23,8 @@ typedef struct unpack_user {
int use_list;
PyObject *object_hook;
PyObject *list_hook;
+ const char *encoding;
+ const char *unicode_errors;
} unpack_user;
@@ -197,7 +199,11 @@ static inline int template_callback_map_end(unpack_user* u, msgpack_unpack_objec
static inline int template_callback_raw(unpack_user* u, const char* b, const char* p, unsigned int l, msgpack_unpack_object* o)
{
PyObject *py;
- py = PyBytes_FromStringAndSize(p, l);
+ if(u->encoding) {
+ py = PyUnicode_Decode(p, l, u->encoding, u->unicode_errors);
+ } else {
+ py = PyBytes_FromStringAndSize(p, l);
+ }
if (!py)
return -1;
*o = py;
diff --git a/test/test_pack.py b/test/test_pack.py
index 5dec068..2aef588 100644
--- a/test/test_pack.py
+++ b/test/test_pack.py
@@ -15,14 +15,63 @@ def testPack():
0, 1, 127, 128, 255, 256, 65535, 65536,
-1, -32, -33, -128, -129, -32768, -32769,
1.0,
- "", "a", "a"*31, "a"*32,
+ b"", b"a", b"a"*31, b"a"*32,
None, True, False,
- (), ((),), ((), None,),
- {None: 0},
- (1<<23),
+ (), ((),), ((), None,),
+ {None: 0},
+ (1<<23),
]
for td in test_data:
check(td)
+def testPackUnicode():
+ test_data = [
+ u"", u"abcd", (u"defgh",), u"Русский текст",
+ ]
+ for td in test_data:
+ re = unpacks(packs(td, encoding='utf-8'), encoding='utf-8')
+ assert_equal(re, td)
+
+def testPackUTF32():
+ test_data = [
+ u"", u"abcd", (u"defgh",), u"Русский текст",
+ ]
+ for td in test_data:
+ print(packs(td, encoding='utf-32'))
+ re = unpacks(packs(td, encoding='utf-32'), encoding='utf-32')
+ assert_equal(re, td)
+
+def testPackBytes():
+ test_data = [
+ b"", b"abcd", (b"defgh",),
+ ]
+ for td in test_data:
+ check(td)
+
+def testIgnoreUnicodeErrors():
+ re = unpacks(packs(b'abc\xeddef'),
+ encoding='utf-8', unicode_errors='ignore')
+ assert_equal(re, "abcdef")
+
+@raises(UnicodeDecodeError)
+def testStrictUnicodeUnpack():
+ unpacks(packs(b'abc\xeddef'), encoding='utf-8')
+
+@raises(UnicodeEncodeError)
+def testStrictUnicodePack():
+ packs(u"abc\xeddef", encoding='ascii', unicode_errors='strict')
+
+def testIgnoreErrorsPack():
+ re = unpacks(packs(u"abcФФФdef", encoding='ascii', unicode_errors='ignore'), encoding='utf-8')
+ assert_equal(re, u"abcdef")
+
+@raises(TypeError)
+def testNoEncoding():
+ packs(u"abc", encoding=None)
+
+def testDecodeBinary():
+ re = unpacks(packs(u"abc"), encoding=None)
+ assert_equal(re, b"abc")
+
if __name__ == '__main__':
main()
diff --git a/test3/test_obj.py b/test3/test_obj.py
index 236988d..b54021f 100644
--- a/test3/test_obj.py
+++ b/test3/test_obj.py
@@ -26,7 +26,7 @@ def test_decode_hook():
unpacked = unpacks(packed, object_hook=_decode_complex)
eq_(unpacked[1], 1+2j)
-@raises(TypeError)
+@raises(ValueError)
def test_bad_hook():
packed = packs([3, 1+2j], default=lambda o: o)
unpacked = unpacks(packed)
diff --git a/test3/test_pack.py b/test3/test_pack.py
index c861704..e53f7e6 100644
--- a/test3/test_pack.py
+++ b/test3/test_pack.py
@@ -17,12 +17,61 @@ def testPack():
1.0,
b"", b"a", b"a"*31, b"a"*32,
None, True, False,
- (), ((),), ((), None,),
- {None: 0},
- (1<<23),
+ (), ((),), ((), None,),
+ {None: 0},
+ (1<<23),
]
for td in test_data:
check(td)
+def testPackUnicode():
+ test_data = [
+ "", "abcd", ("defgh",), "Русский текст",
+ ]
+ for td in test_data:
+ re = unpacks(packs(td, encoding='utf-8'), encoding='utf-8')
+ assert_equal(re, td)
+
+def testPackUTF32():
+ test_data = [
+ "", "abcd", ("defgh",), "Русский текст",
+ ]
+ for td in test_data:
+ print(packs(td, encoding='utf-32'))
+ re = unpacks(packs(td, encoding='utf-32'), encoding='utf-32')
+ assert_equal(re, td)
+
+def testPackBytes():
+ test_data = [
+ b"", b"abcd", (b"defgh",),
+ ]
+ for td in test_data:
+ check(td)
+
+def testIgnoreUnicodeErrors():
+ re = unpacks(packs(b'abc\xeddef'),
+ encoding='utf-8', unicode_errors='ignore')
+ assert_equal(re, "abcdef")
+
+@raises(UnicodeDecodeError)
+def testStrictUnicodeUnpack():
+ unpacks(packs(b'abc\xeddef'), encoding='utf-8')
+
+@raises(UnicodeEncodeError)
+def testStrictUnicodePack():
+ packs("abc\xeddef", encoding='ascii', unicode_errors='strict')
+
+def testIgnoreErrorsPack():
+ re = unpacks(packs("abcФФФdef", encoding='ascii', unicode_errors='ignore'), encoding='utf-8')
+ assert_equal(re, "abcdef")
+
+@raises(TypeError)
+def testNoEncoding():
+ packs("abc", encoding=None)
+
+def testDecodeBinary():
+ re = unpacks(packs("abc"), encoding=None)
+ assert_equal(re, b"abc")
+
if __name__ == '__main__':
main()