summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rwxr-xr-x scripts/bench.sh 1
-rw-r--r-- setup.py 2
-rw-r--r-- simplejson/__init__.py 2
-rw-r--r-- simplejson/_speedups.c 399
-rw-r--r-- simplejson/decoder.py 12
-rw-r--r-- simplejson/tests/test_unicode.py 10
6 files changed, 422 insertions, 4 deletions
diff --git a/scripts/bench.sh b/scripts/bench.sh
index e30e246..de41c3e 100755
--- a/scripts/bench.sh
+++ b/scripts/bench.sh
@@ -1,2 +1,3 @@
#!/bin/sh
/usr/bin/env python -mtimeit -s 'from simplejson.tests.test_pass1 import test_parse' 'test_parse()'
+/usr/bin/env python -c 'from simplejson.tests.test_pass1 import test_parse; import profile; profile.run("for i in xrange(100): test_parse()")'
diff --git a/setup.py b/setup.py
index f7b0859..7fbd436 100644
--- a/setup.py
+++ b/setup.py
@@ -18,7 +18,7 @@ from distutils.command.build_ext import build_ext
from distutils.errors import CCompilerError, DistutilsExecError, \
DistutilsPlatformError
-VERSION = '1.8'
+VERSION = '1.8.1'
DESCRIPTION = "Simple, fast, extensible JSON encoder/decoder for Python"
LONG_DESCRIPTION = """
simplejson is a simple, fast, complete, correct and extensible
diff --git a/simplejson/__init__.py b/simplejson/__init__.py
index e39a110..be9e87a 100644
--- a/simplejson/__init__.py
+++ b/simplejson/__init__.py
@@ -99,7 +99,7 @@ pretty-print::
Note that the JSON produced by this module's default settings
is a subset of YAML, so it may be used as a serializer for that as well.
"""
-__version__ = '1.8'
+__version__ = '1.8.1'
__all__ = [
'dump', 'dumps', 'load', 'loads',
'JSONDecoder', 'JSONEncoder',
diff --git a/simplejson/_speedups.c b/simplejson/_speedups.c
index 053369a..c7a7a7c 100644
--- a/simplejson/_speedups.c
+++ b/simplejson/_speedups.c
@@ -11,6 +11,8 @@ typedef int Py_ssize_t;
#define UNUSED
#endif
+#define DEFAULT_ENCODING "utf-8"
+
static Py_ssize_t
ascii_escape_char(Py_UNICODE c, char *output, Py_ssize_t chars);
static PyObject *
@@ -181,6 +183,402 @@ ascii_escape_str(PyObject *pystr) {
return rval;
}
+/*
+ * Set a ValueError whose value is the result of calling
+ * simplejson.decoder.errmsg(msg, s, end).
+ *
+ * msg: short description of the problem
+ * s:   the document being parsed (borrowed reference)
+ * end: character index the message refers to
+ *
+ * The errmsg callable is looked up once and cached in a static.  If the
+ * import, the attribute lookup or the call itself fails, the exception
+ * raised by that failure is left pending instead of a ValueError.
+ */
+void
+raise_errmsg(char *msg, PyObject *s, Py_ssize_t end) {
+    static PyObject *errmsg_fn = NULL;
+    PyObject *pymsg;
+    if (errmsg_fn == NULL) {
+        PyObject *decoder = PyImport_ImportModule("simplejson.decoder");
+        if (decoder == NULL) return;
+        errmsg_fn = PyObject_GetAttrString(decoder, "errmsg");
+        /* Release the module on both the success and the failure path */
+        Py_DECREF(decoder);
+        if (errmsg_fn == NULL) return;
+    }
+#if PY_VERSION_HEX < 0x02050000
+    pymsg = PyObject_CallFunction(errmsg_fn, "(zOi)", msg, s, end);
+#else
+    pymsg = PyObject_CallFunction(errmsg_fn, "(zOn)", msg, s, end);
+#endif
+    if (pymsg == NULL) {
+        /* Keep the exception raised by errmsg rather than masking it */
+        return;
+    }
+    /* PyErr_SetObject takes its own reference, so drop ours afterwards */
+    PyErr_SetObject(PyExc_ValueError, pymsg);
+    Py_DECREF(pymsg);
+/*
+
+Reference copy of the Python helpers (presumably as defined in
+simplejson.decoder):
+
+def linecol(doc, pos):
+    lineno = doc.count('\n', 0, pos) + 1
+    if lineno == 1:
+        colno = pos
+    else:
+        colno = pos - doc.rindex('\n', 0, pos)
+    return lineno, colno
+
+def errmsg(msg, doc, pos, end=None):
+    lineno, colno = linecol(doc, pos)
+    if end is None:
+        return '%s: line %d column %d (char %d)' % (msg, lineno, colno, pos)
+    endlineno, endcolno = linecol(doc, end)
+    return '%s: line %d column %d - line %d column %d (char %d - %d)' % (
+        msg, lineno, colno, endlineno, endcolno, pos, end)
+
+*/
+}
+
+/*
+ * Return u''.join(lst) as a new reference, or NULL with an exception set.
+ *
+ * The empty unicode separator and the "join" method name are created on
+ * first use and kept in statics for later calls.
+ */
+static PyObject *
+join_list_unicode(PyObject *lst) {
+    static PyObject *empty_unicode = NULL;
+    static PyObject *join_name = NULL;
+    if (!empty_unicode) {
+        Py_UNICODE nul = 0;
+        empty_unicode = PyUnicode_FromUnicode(&nul, 0);
+    }
+    if (!join_name) {
+        join_name = PyString_FromString("join");
+    }
+    if (empty_unicode != NULL && join_name != NULL) {
+        return PyObject_CallMethodObjArgs(empty_unicode, join_name, lst, NULL);
+    }
+    /* One of the cached objects could not be created */
+    return NULL;
+}
+
+/*
+ * Scan the body of a JSON string held in the byte string pystr.
+ *
+ * pystr:    str object containing the JSON document (borrowed reference)
+ * end:      index at which scanning starts; begin = end - 1 is reported as
+ *           the start of the string in error messages, so end is presumably
+ *           the index just past the opening quote
+ * encoding: codec name used to decode the runs of unescaped bytes
+ *
+ * Returns a new (unicode, end) tuple where end is the index just past the
+ * closing quote, or NULL with an exception set on failure.
+ */
+static PyObject *
+scanstring_str(PyObject *pystr, Py_ssize_t end, char *encoding) {
+    PyObject *rval;
+    Py_ssize_t len = PyString_GET_SIZE(pystr);
+    Py_ssize_t begin = end - 1;
+    Py_ssize_t next;
+    char *buf = PyString_AS_STRING(pystr);
+    PyObject *chunks = PyList_New(0);
+    if (chunks == NULL) {
+        goto bail;
+    }
+    while (1) {
+        /* Find the end of the string or the next escape */
+        Py_UNICODE c = 0;
+        PyObject *chunk = NULL;
+        for (next = end; next < len; next++) {
+            c = buf[next];
+            if (c == '"' || c == '\\') {
+                break;
+            }
+        }
+        if (!(c == '"' || c == '\\')) {
+            raise_errmsg("Unterminated string starting at", pystr, begin);
+            goto bail;
+        }
+        /* Pick up this chunk if it's not zero length */
+        if (next != end) {
+            PyObject *strchunk = PyBuffer_FromMemory(&buf[end], next - end);
+            if (strchunk == NULL) {
+                goto bail;
+            }
+            chunk = PyUnicode_FromEncodedObject(strchunk, encoding, NULL);
+            /* The buffer wrapper is only needed for the decode call */
+            Py_DECREF(strchunk);
+            if (chunk == NULL) {
+                goto bail;
+            }
+            if (PyList_Append(chunks, chunk)) {
+                Py_DECREF(chunk);
+                goto bail;
+            }
+            Py_DECREF(chunk);
+        }
+        next++;
+        if (c == '"') {
+            end = next;
+            break;
+        }
+        if (next == len) {
+            raise_errmsg("Unterminated string starting at", pystr, begin);
+            goto bail;
+        }
+        c = buf[next];
+        if (c != 'u') {
+            /* Non-unicode backslash escapes */
+            end = next + 1;
+            switch (c) {
+                case '"': break;
+                case '\\': break;
+                case '/': break;
+                case 'b': c = '\b'; break;
+                case 'f': c = '\f'; break;
+                case 'n': c = '\n'; break;
+                case 'r': c = '\r'; break;
+                case 't': c = '\t'; break;
+                default: c = 0;
+            }
+            if (c == 0) {
+                raise_errmsg("Invalid \\escape", pystr, end - 2);
+                goto bail;
+            }
+        } else {
+            c = 0;
+            next++;
+            end = next + 4;
+            if (end >= len) {
+                raise_errmsg("Invalid \\uXXXX escape", pystr, next - 1);
+                goto bail;
+            }
+            /* Decode 4 hex digits */
+            for (; next < end; next++) {
+                Py_ssize_t shl = (end - next - 1) << 2;
+                Py_UNICODE digit = buf[next];
+                switch (digit) {
+                    case '0': case '1': case '2': case '3': case '4':
+                    case '5': case '6': case '7': case '8': case '9':
+                        c |= (digit - '0') << shl; break;
+                    case 'a': case 'b': case 'c': case 'd': case 'e':
+                    case 'f':
+                        c |= (digit - 'a' + 10) << shl; break;
+                    case 'A': case 'B': case 'C': case 'D': case 'E':
+                    case 'F':
+                        c |= (digit - 'A' + 10) << shl; break;
+                    default:
+                        raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5);
+                        goto bail;
+                }
+            }
+#ifdef Py_UNICODE_WIDE
+            /* Surrogate pair */
+            if (c >= 0xd800 && c <= 0xdbff) {
+                Py_UNICODE c2 = 0;
+                /* Both checks must stop the scan: carrying on would read
+                   past the end of buf or decode garbage with an exception
+                   already set */
+                if (end + 6 >= len) {
+                    raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr,
+                        end - 5);
+                    goto bail;
+                }
+                if (buf[next++] != '\\' || buf[next++] != 'u') {
+                    raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr,
+                        end - 5);
+                    goto bail;
+                }
+                end += 6;
+                /* Decode 4 hex digits */
+                for (; next < end; next++) {
+                    Py_ssize_t shl = (end - next - 1) << 2;
+                    Py_UNICODE digit = buf[next];
+                    switch (digit) {
+                        case '0': case '1': case '2': case '3': case '4':
+                        case '5': case '6': case '7': case '8': case '9':
+                            c2 |= (digit - '0') << shl; break;
+                        case 'a': case 'b': case 'c': case 'd': case 'e':
+                        case 'f':
+                            c2 |= (digit - 'a' + 10) << shl; break;
+                        case 'A': case 'B': case 'C': case 'D': case 'E':
+                        case 'F':
+                            c2 |= (digit - 'A' + 10) << shl; break;
+                        default:
+                            raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5);
+                            goto bail;
+                    }
+                }
+                /* NOTE(review): c2 is not checked to lie in 0xdc00..0xdfff */
+                c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
+            }
+#endif
+        }
+        chunk = PyUnicode_FromUnicode(&c, 1);
+        if (chunk == NULL) {
+            goto bail;
+        }
+        if (PyList_Append(chunks, chunk)) {
+            Py_DECREF(chunk);
+            goto bail;
+        }
+        Py_DECREF(chunk);
+    }
+
+    rval = join_list_unicode(chunks);
+    if (rval == NULL) {
+        goto bail;
+    }
+    Py_DECREF(chunks);
+    chunks = NULL;
+    /* "N" hands our reference to rval over to the result tuple */
+#if PY_VERSION_HEX < 0x02050000
+    return Py_BuildValue("(Ni)", rval, end);
+#else
+    return Py_BuildValue("(Nn)", rval, end);
+#endif
+bail:
+    Py_XDECREF(chunks);
+    return NULL;
+}
+
+
+/*
+ * Scan the body of a JSON string held in the unicode object pystr.
+ *
+ * pystr: unicode object containing the JSON document (borrowed reference)
+ * end:   index at which scanning starts; begin = end - 1 is reported as the
+ *        start of the string in error messages, so end is presumably the
+ *        index just past the opening quote
+ *
+ * Returns a new (unicode, end) tuple where end is the index just past the
+ * closing quote, or NULL with an exception set on failure.
+ */
+static PyObject *
+scanstring_unicode(PyObject *pystr, Py_ssize_t end) {
+    PyObject *rval;
+    Py_ssize_t len = PyUnicode_GET_SIZE(pystr);
+    Py_ssize_t begin = end - 1;
+    Py_ssize_t next;
+    const Py_UNICODE *buf = PyUnicode_AS_UNICODE(pystr);
+    PyObject *chunks = PyList_New(0);
+    if (chunks == NULL) {
+        goto bail;
+    }
+    while (1) {
+        /* Find the end of the string or the next escape */
+        Py_UNICODE c = 0;
+        PyObject *chunk = NULL;
+        for (next = end; next < len; next++) {
+            c = buf[next];
+            if (c == '"' || c == '\\') {
+                break;
+            }
+        }
+        if (!(c == '"' || c == '\\')) {
+            raise_errmsg("Unterminated string starting at", pystr, begin);
+            goto bail;
+        }
+        /* Pick up this chunk if it's not zero length */
+        if (next != end) {
+            chunk = PyUnicode_FromUnicode(&buf[end], next - end);
+            if (chunk == NULL) {
+                goto bail;
+            }
+            if (PyList_Append(chunks, chunk)) {
+                Py_DECREF(chunk);
+                goto bail;
+            }
+            Py_DECREF(chunk);
+        }
+        next++;
+        if (c == '"') {
+            end = next;
+            break;
+        }
+        if (next == len) {
+            raise_errmsg("Unterminated string starting at", pystr, begin);
+            goto bail;
+        }
+        c = buf[next];
+        if (c != 'u') {
+            /* Non-unicode backslash escapes */
+            end = next + 1;
+            switch (c) {
+                case '"': break;
+                case '\\': break;
+                case '/': break;
+                case 'b': c = '\b'; break;
+                case 'f': c = '\f'; break;
+                case 'n': c = '\n'; break;
+                case 'r': c = '\r'; break;
+                case 't': c = '\t'; break;
+                default: c = 0;
+            }
+            if (c == 0) {
+                raise_errmsg("Invalid \\escape", pystr, end - 2);
+                goto bail;
+            }
+        } else {
+            c = 0;
+            next++;
+            end = next + 4;
+            if (end >= len) {
+                raise_errmsg("Invalid \\uXXXX escape", pystr, next - 1);
+                goto bail;
+            }
+            /* Decode 4 hex digits */
+            for (; next < end; next++) {
+                Py_ssize_t shl = (end - next - 1) << 2;
+                Py_UNICODE digit = buf[next];
+                switch (digit) {
+                    case '0': case '1': case '2': case '3': case '4':
+                    case '5': case '6': case '7': case '8': case '9':
+                        c |= (digit - '0') << shl; break;
+                    case 'a': case 'b': case 'c': case 'd': case 'e':
+                    case 'f':
+                        c |= (digit - 'a' + 10) << shl; break;
+                    case 'A': case 'B': case 'C': case 'D': case 'E':
+                    case 'F':
+                        c |= (digit - 'A' + 10) << shl; break;
+                    default:
+                        raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5);
+                        goto bail;
+                }
+            }
+#ifdef Py_UNICODE_WIDE
+            /* Surrogate pair */
+            if (c >= 0xd800 && c <= 0xdbff) {
+                Py_UNICODE c2 = 0;
+                /* Both checks must stop the scan: carrying on would read
+                   past the end of buf or decode garbage with an exception
+                   already set */
+                if (end + 6 >= len) {
+                    raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr,
+                        end - 5);
+                    goto bail;
+                }
+                if (buf[next++] != '\\' || buf[next++] != 'u') {
+                    raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr,
+                        end - 5);
+                    goto bail;
+                }
+                end += 6;
+                /* Decode 4 hex digits */
+                for (; next < end; next++) {
+                    Py_ssize_t shl = (end - next - 1) << 2;
+                    Py_UNICODE digit = buf[next];
+                    switch (digit) {
+                        case '0': case '1': case '2': case '3': case '4':
+                        case '5': case '6': case '7': case '8': case '9':
+                            c2 |= (digit - '0') << shl; break;
+                        case 'a': case 'b': case 'c': case 'd': case 'e':
+                        case 'f':
+                            c2 |= (digit - 'a' + 10) << shl; break;
+                        case 'A': case 'B': case 'C': case 'D': case 'E':
+                        case 'F':
+                            c2 |= (digit - 'A' + 10) << shl; break;
+                        default:
+                            raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5);
+                            goto bail;
+                    }
+                }
+                /* NOTE(review): c2 is not checked to lie in 0xdc00..0xdfff */
+                c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
+            }
+#endif
+        }
+        chunk = PyUnicode_FromUnicode(&c, 1);
+        if (chunk == NULL) {
+            goto bail;
+        }
+        if (PyList_Append(chunks, chunk)) {
+            Py_DECREF(chunk);
+            goto bail;
+        }
+        Py_DECREF(chunk);
+    }
+
+    rval = join_list_unicode(chunks);
+    if (rval == NULL) {
+        goto bail;
+    }
+    Py_DECREF(chunks);
+    chunks = NULL;
+    /* "N" hands our reference to rval over to the result tuple */
+#if PY_VERSION_HEX < 0x02050000
+    return Py_BuildValue("(Ni)", rval, end);
+#else
+    return Py_BuildValue("(Nn)", rval, end);
+#endif
+bail:
+    Py_XDECREF(chunks);
+    return NULL;
+}
+
+PyDoc_STRVAR(pydoc_scanstring,
+    "scanstring(basestring, end, encoding=None) -> (unicode, end)\n"
+    "\n"
+    "Scan the JSON string in basestring starting at index end and return\n"
+    "the decoded unicode value together with the index just past the\n"
+    "closing quote. encoding is only used when basestring is a str and\n"
+    "defaults to utf-8."
+);
+
+/*
+ * Python entry point for scanstring: parse the arguments, validate them and
+ * dispatch to scanstring_str or scanstring_unicode by the type of the first
+ * argument.
+ *
+ * Raises TypeError when the first argument is neither str nor unicode and
+ * ValueError when end lies outside the string.
+ */
+static PyObject *
+py_scanstring(PyObject* self UNUSED, PyObject *args) {
+    PyObject *pystr;
+    Py_ssize_t end;
+    Py_ssize_t len;
+    /* Optional, like the encoding=None default of the Python scanstring */
+    char *encoding = NULL;
+#if PY_VERSION_HEX < 0x02050000
+    if (!PyArg_ParseTuple(args, "Oi|z:scanstring", &pystr, &end, &encoding)) {
+#else
+    if (!PyArg_ParseTuple(args, "On|z:scanstring", &pystr, &end, &encoding)) {
+#endif
+        return NULL;
+    }
+    if (encoding == NULL) {
+        encoding = DEFAULT_ENCODING;
+    }
+    if (PyString_Check(pystr)) {
+        len = PyString_GET_SIZE(pystr);
+    } else if (PyUnicode_Check(pystr)) {
+        len = PyUnicode_GET_SIZE(pystr);
+    } else {
+        PyErr_SetString(PyExc_TypeError, "first argument must be a string");
+        return NULL;
+    }
+    /* The scanners index the raw buffer with end, so reject anything that
+       would make them read outside of it */
+    if (end < 0 || end > len) {
+        PyErr_SetString(PyExc_ValueError, "end is out of bounds");
+        return NULL;
+    }
+    if (PyString_Check(pystr)) {
+        return scanstring_str(pystr, end, encoding);
+    }
+    return scanstring_unicode(pystr, end);
+}
+
PyDoc_STRVAR(pydoc_encode_basestring_ascii,
"encode_basestring_ascii(basestring) -> str\n"
"\n"
@@ -208,6 +606,7 @@ py_encode_basestring_ascii(PyObject* self UNUSED, PyObject *pystr) {
}
static PyMethodDef speedups_methods[] = {
DEFN(encode_basestring_ascii, METH_O),
+ DEFN(scanstring, METH_VARARGS),
{}
};
#undef DEFN
diff --git a/simplejson/decoder.py b/simplejson/decoder.py
index 06f9de3..6a73e9a 100644
--- a/simplejson/decoder.py
+++ b/simplejson/decoder.py
@@ -5,6 +5,10 @@ import re
import sys
from simplejson.scanner import Scanner, pattern
+try:
+ from simplejson import _speedups
+except:
+ _speedups = None
FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL
@@ -110,7 +114,7 @@ def scanstring(s, end, encoding=None, _b=BACKSLASH, _m=STRINGCHUNK.match):
next_end = end + 5
msg = "Invalid \\uXXXX escape"
try:
- if len(esc) != 4 or not esc.isalnum():
+ if len(esc) != 4:
raise ValueError
uni = int(esc, 16)
if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535:
@@ -118,7 +122,7 @@ def scanstring(s, end, encoding=None, _b=BACKSLASH, _m=STRINGCHUNK.match):
if not s[end + 5:end + 7] == '\\u':
raise ValueError
esc2 = s[end + 7:end + 11]
- if len(esc2) != 4 or not esc2.isalnum():
+ if len(esc2) != 4:
raise ValueError
uni2 = int(esc2, 16)
uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
@@ -130,6 +134,10 @@ def scanstring(s, end, encoding=None, _b=BACKSLASH, _m=STRINGCHUNK.match):
_append(m)
return u''.join(chunks), end
+# Rebind scanstring to the C implementation when the _speedups extension
+# is available (it is None otherwise).
+if _speedups is not None:
+ scanstring = _speedups.scanstring
+
def JSONString(match, context):
encoding = getattr(context, 'encoding', None)
return scanstring(match.string, match.end(), encoding)
diff --git a/simplejson/tests/test_unicode.py b/simplejson/tests/test_unicode.py
index 0a92b28..f73b48a 100644
--- a/simplejson/tests/test_unicode.py
+++ b/simplejson/tests/test_unicode.py
@@ -24,3 +24,13 @@ def test_big_unicode_decode():
u = u'z\U0001d120x'
assert S.loads('"' + u + '"') == u
assert S.loads('"z\\ud834\\udd20x"') == u
+
+def test_unicode_decode():
+ for i in range(0, 0xd7ff):
+ u = unichr(i)
+ json = '"\\u%04x"' % (i,)
+ res = S.loads(json)
+ assert res == u, 'S.loads(%r) != %r got %r' % (json, u, res)
+
+if __name__ == '__main__':
+ test_unicode_decode()