From 2c626a37d1481035019162e951c748cb854696c7 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Thu, 15 May 2014 14:37:42 +0300 Subject: Issue #13916: Disallowed the surrogatepass error handler for non UTF-* encodings. --- Python/codecs.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'Python/codecs.c') diff --git a/Python/codecs.c b/Python/codecs.c index e06d6e0922..700313633e 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -901,6 +901,7 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) } } +#define ENC_UNKNOWN -1 #define ENC_UTF8 0 #define ENC_UTF16BE 1 #define ENC_UTF16LE 2 @@ -916,7 +917,11 @@ get_standard_encoding(const char *encoding, int *bytelength) encoding += 3; if (*encoding == '-' || *encoding == '_' ) encoding++; - if (encoding[0] == '1' && encoding[1] == '6') { + if (encoding[0] == '8' && encoding[1] == '\0') { + *bytelength = 3; + return ENC_UTF8; + } + else if (encoding[0] == '1' && encoding[1] == '6') { encoding += 2; *bytelength = 2; if (*encoding == '\0') { @@ -955,9 +960,7 @@ get_standard_encoding(const char *encoding, int *bytelength) } } } - /* utf-8 */ - *bytelength = 3; - return ENC_UTF8; + return ENC_UNKNOWN; } /* This handler is declared static until someone demonstrates @@ -994,6 +997,12 @@ PyCodec_SurrogatePassErrors(PyObject *exc) } code = get_standard_encoding(encoding, &bytelength); Py_DECREF(encode); + if (code == ENC_UNKNOWN) { + /* Not supported, fail with original exception */ + PyErr_SetObject(PyExceptionInstance_Class(exc), exc); + Py_DECREF(object); + return NULL; + } res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start)); if (!res) { @@ -1068,6 +1077,12 @@ PyCodec_SurrogatePassErrors(PyObject *exc) } code = get_standard_encoding(encoding, &bytelength); Py_DECREF(encode); + if (code == ENC_UNKNOWN) { + /* Not supported, fail with original exception */ + PyErr_SetObject(PyExceptionInstance_Class(exc), exc); + Py_DECREF(object); + return NULL; + } /* Try 
decoding a single surrogate character. If there are more, let the codec call us again. */ -- cgit v1.2.1 From 569a55317259d71ebde1c85ef48874eba3824a5e Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Fri, 16 May 2014 14:46:20 +0200 Subject: Issue #13916: Fix surrogatepass error handler on Windows --- Python/codecs.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'Python/codecs.c') diff --git a/Python/codecs.c b/Python/codecs.c index 700313633e..4c2ae381b3 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -960,6 +960,10 @@ get_standard_encoding(const char *encoding, int *bytelength) } } } + else if (strcmp(encoding, "CP_UTF8") == 0) { + *bytelength = 3; + return ENC_UTF8; + } return ENC_UNKNOWN; } -- cgit v1.2.1 From 11e25614459b076ba1c6b12d6a08cf491212d9db Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Tue, 25 Nov 2014 13:57:17 +0200 Subject: Issue #19676: Added the "namereplace" error handler. --- Python/codecs.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) (limited to 'Python/codecs.c') diff --git a/Python/codecs.c b/Python/codecs.c index 151fea7d49..b09ea3a28f 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -9,6 +9,7 @@ Copyright (c) Corporation for National Research Initiatives. 
------------------------------------------------------------------------ */ #include "Python.h" +#include "ucnhash.h" #include <ctype.h> const char *Py_hexdigits = "0123456789abcdef"; @@ -933,6 +934,97 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) } } +static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; +static int ucnhash_initialized = 0; + +PyObject *PyCodec_NameReplaceErrors(PyObject *exc) +{ + if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { + PyObject *restuple; + PyObject *object; + Py_ssize_t i; + Py_ssize_t start; + Py_ssize_t end; + PyObject *res; + unsigned char *outp; + int ressize; + Py_UCS4 c; + char buffer[256]; /* NAME_MAXLEN */ + if (PyUnicodeEncodeError_GetStart(exc, &start)) + return NULL; + if (PyUnicodeEncodeError_GetEnd(exc, &end)) + return NULL; + if (!(object = PyUnicodeEncodeError_GetObject(exc))) + return NULL; + if (!ucnhash_initialized) { + /* load the unicode data module */ + ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import( + PyUnicodeData_CAPSULE_NAME, 1); + ucnhash_initialized = 1; + } + for (i = start, ressize = 0; i < end; ++i) { + /* object is guaranteed to be "ready" */ + c = PyUnicode_READ_CHAR(object, i); + if (ucnhash_CAPI && + ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) { + ressize += 1+1+1+strlen(buffer)+1; + } + else if (c >= 0x10000) { + ressize += 1+1+8; + } + else if (c >= 0x100) { + ressize += 1+1+4; + } + else + ressize += 1+1+2; + } + res = PyUnicode_New(ressize, 127); + if (res==NULL) + return NULL; + for (i = start, outp = PyUnicode_1BYTE_DATA(res); + i < end; ++i) { + c = PyUnicode_READ_CHAR(object, i); + *outp++ = '\\'; + if (ucnhash_CAPI && + ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) { + *outp++ = 'N'; + *outp++ = '{'; + strcpy((char *)outp, buffer); + outp += strlen(buffer); + *outp++ = '}'; + continue; + } + if (c >= 0x00010000) { + *outp++ = 'U'; + *outp++ = Py_hexdigits[(c>>28)&0xf]; + *outp++ = Py_hexdigits[(c>>24)&0xf]; + *outp++ = Py_hexdigits[(c>>20)&0xf]; + 
*outp++ = Py_hexdigits[(c>>16)&0xf]; + *outp++ = Py_hexdigits[(c>>12)&0xf]; + *outp++ = Py_hexdigits[(c>>8)&0xf]; + } + else if (c >= 0x100) { + *outp++ = 'u'; + *outp++ = Py_hexdigits[(c>>12)&0xf]; + *outp++ = Py_hexdigits[(c>>8)&0xf]; + } + else + *outp++ = 'x'; + *outp++ = Py_hexdigits[(c>>4)&0xf]; + *outp++ = Py_hexdigits[c&0xf]; + } + + assert(_PyUnicode_CheckConsistency(res, 1)); + restuple = Py_BuildValue("(Nn)", res, end); + Py_DECREF(object); + return restuple; + } + else { + wrong_exception_type(exc); + return NULL; + } +} + #define ENC_UNKNOWN -1 #define ENC_UTF8 0 #define ENC_UTF16BE 1 @@ -1276,6 +1368,11 @@ static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc) return PyCodec_BackslashReplaceErrors(exc); } +static PyObject *namereplace_errors(PyObject *self, PyObject *exc) +{ + return PyCodec_NameReplaceErrors(exc); +} + static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc) { return PyCodec_SurrogatePassErrors(exc); @@ -1345,6 +1442,17 @@ static int _PyCodecRegistry_Init(void) "backslashed escape sequence.") } }, + { + "namereplace", + { + "namereplace_errors", + namereplace_errors, + METH_O, + PyDoc_STR("Implements the 'namereplace' error handling, " + "which replaces an unencodable character with a " + "\\N{...} escape sequence.") + } + }, { "surrogatepass", { -- cgit v1.2.1 From de9ae0e1a87913d86b04b966121b93eb863c691a Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Wed, 26 Nov 2014 12:11:40 +0200 Subject: Issue #19676: Fixed integer overflow issue in "namereplace" error handler. 
--- Python/codecs.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'Python/codecs.c') diff --git a/Python/codecs.c b/Python/codecs.c index b09ea3a28f..8ffa80b213 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -947,7 +947,8 @@ PyObject *PyCodec_NameReplaceErrors(PyObject *exc) Py_ssize_t end; PyObject *res; unsigned char *outp; - int ressize; + Py_ssize_t ressize; + int replsize; Py_UCS4 c; char buffer[256]; /* NAME_MAXLEN */ if (PyUnicodeEncodeError_GetStart(exc, &start)) @@ -967,17 +968,21 @@ PyObject *PyCodec_NameReplaceErrors(PyObject *exc) c = PyUnicode_READ_CHAR(object, i); if (ucnhash_CAPI && ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) { - ressize += 1+1+1+strlen(buffer)+1; + replsize = 1+1+1+strlen(buffer)+1; } else if (c >= 0x10000) { - ressize += 1+1+8; + replsize = 1+1+8; } else if (c >= 0x100) { - ressize += 1+1+4; + replsize = 1+1+4; } else - ressize += 1+1+2; + replsize = 1+1+2; + if (ressize > PY_SSIZE_T_MAX - replsize) + break; + ressize += replsize; } + end = i; res = PyUnicode_New(ressize, 127); if (res==NULL) return NULL; @@ -1014,6 +1019,7 @@ PyObject *PyCodec_NameReplaceErrors(PyObject *exc) *outp++ = Py_hexdigits[c&0xf]; } + assert(out == start + ressize); assert(_PyUnicode_CheckConsistency(res, 1)); restuple = Py_BuildValue("(Nn)", res, end); Py_DECREF(object); -- cgit v1.2.1 From 1b925e9225f50138d91d85d6a100ce7fe3d49526 Mon Sep 17 00:00:00 2001 From: Benjamin Peterson Date: Wed, 26 Nov 2014 14:20:51 -0600 Subject: fix variable name --- Python/codecs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Python/codecs.c') diff --git a/Python/codecs.c b/Python/codecs.c index 8ffa80b213..ff9d1dec46 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -1019,7 +1019,7 @@ PyObject *PyCodec_NameReplaceErrors(PyObject *exc) *outp++ = Py_hexdigits[c&0xf]; } - assert(out == start + ressize); + assert(outp == start + ressize); assert(_PyUnicode_CheckConsistency(res, 1)); restuple = 
Py_BuildValue("(Nn)", res, end); Py_DECREF(object); -- cgit v1.2.1 From fdcbaf4d0df22fcb9103ad8f2369cc9c87de2605 Mon Sep 17 00:00:00 2001 From: Benjamin Peterson Date: Wed, 26 Nov 2014 14:39:54 -0600 Subject: correct assertion --- Python/codecs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Python/codecs.c') diff --git a/Python/codecs.c b/Python/codecs.c index ff9d1dec46..a0a540304a 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -1019,7 +1019,7 @@ PyObject *PyCodec_NameReplaceErrors(PyObject *exc) *outp++ = Py_hexdigits[c&0xf]; } - assert(outp == start + ressize); + assert(outp == PyUnicode_1BYTE_DATA(res) + ressize); assert(_PyUnicode_CheckConsistency(res, 1)); restuple = Py_BuildValue("(Nn)", res, end); Py_DECREF(object); -- cgit v1.2.1 From f7b3d343578c306d612261a925e648edd60daee6 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sun, 25 Jan 2015 22:56:57 +0200 Subject: Issue #22286: The "backslashreplace" error handler now works with decoding and translating. 
--- Python/codecs.c | 144 +++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 91 insertions(+), 53 deletions(-) (limited to 'Python/codecs.c') diff --git a/Python/codecs.c b/Python/codecs.c index a0a540304a..a5588598b3 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -864,74 +864,112 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) { - if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { - PyObject *restuple; - PyObject *object; - Py_ssize_t i; - Py_ssize_t start; - Py_ssize_t end; - PyObject *res; - unsigned char *outp; - Py_ssize_t ressize; - Py_UCS4 c; - if (PyUnicodeEncodeError_GetStart(exc, &start)) + PyObject *object; + Py_ssize_t i; + Py_ssize_t start; + Py_ssize_t end; + PyObject *res; + unsigned char *outp; + int ressize; + Py_UCS4 c; + + if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) { + unsigned char *p; + if (PyUnicodeDecodeError_GetStart(exc, &start)) return NULL; - if (PyUnicodeEncodeError_GetEnd(exc, &end)) + if (PyUnicodeDecodeError_GetEnd(exc, &end)) return NULL; - if (!(object = PyUnicodeEncodeError_GetObject(exc))) + if (!(object = PyUnicodeDecodeError_GetObject(exc))) + return NULL; + if (!(p = (unsigned char*)PyBytes_AsString(object))) { + Py_DECREF(object); return NULL; - if (end - start > PY_SSIZE_T_MAX / (1+1+8)) - end = start + PY_SSIZE_T_MAX / (1+1+8); - for (i = start, ressize = 0; i < end; ++i) { - /* object is guaranteed to be "ready" */ - c = PyUnicode_READ_CHAR(object, i); - if (c >= 0x10000) { - ressize += 1+1+8; - } - else if (c >= 0x100) { - ressize += 1+1+4; - } - else - ressize += 1+1+2; } - res = PyUnicode_New(ressize, 127); + res = PyUnicode_New(4 * (end - start), 127); if (res == NULL) { Py_DECREF(object); return NULL; } - for (i = start, outp = PyUnicode_1BYTE_DATA(res); - i < end; ++i) { - c = PyUnicode_READ_CHAR(object, i); - *outp++ = '\\'; - if (c >= 0x00010000) { - *outp++ = 'U'; - *outp++ = 
Py_hexdigits[(c>>28)&0xf]; - *outp++ = Py_hexdigits[(c>>24)&0xf]; - *outp++ = Py_hexdigits[(c>>20)&0xf]; - *outp++ = Py_hexdigits[(c>>16)&0xf]; - *outp++ = Py_hexdigits[(c>>12)&0xf]; - *outp++ = Py_hexdigits[(c>>8)&0xf]; - } - else if (c >= 0x100) { - *outp++ = 'u'; - *outp++ = Py_hexdigits[(c>>12)&0xf]; - *outp++ = Py_hexdigits[(c>>8)&0xf]; - } - else - *outp++ = 'x'; - *outp++ = Py_hexdigits[(c>>4)&0xf]; - *outp++ = Py_hexdigits[c&0xf]; + outp = PyUnicode_1BYTE_DATA(res); + for (i = start; i < end; i++, outp += 4) { + unsigned char c = p[i]; + outp[0] = '\\'; + outp[1] = 'x'; + outp[2] = Py_hexdigits[(c>>4)&0xf]; + outp[3] = Py_hexdigits[c&0xf]; } assert(_PyUnicode_CheckConsistency(res, 1)); - restuple = Py_BuildValue("(Nn)", res, end); Py_DECREF(object); - return restuple; + return Py_BuildValue("(Nn)", res, end); + } + if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { + if (PyUnicodeEncodeError_GetStart(exc, &start)) + return NULL; + if (PyUnicodeEncodeError_GetEnd(exc, &end)) + return NULL; + if (!(object = PyUnicodeEncodeError_GetObject(exc))) + return NULL; + } + else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) { + if (PyUnicodeTranslateError_GetStart(exc, &start)) + return NULL; + if (PyUnicodeTranslateError_GetEnd(exc, &end)) + return NULL; + if (!(object = PyUnicodeTranslateError_GetObject(exc))) + return NULL; } else { wrong_exception_type(exc); return NULL; } + + if (end - start > PY_SSIZE_T_MAX / (1+1+8)) + end = start + PY_SSIZE_T_MAX / (1+1+8); + for (i = start, ressize = 0; i < end; ++i) { + /* object is guaranteed to be "ready" */ + c = PyUnicode_READ_CHAR(object, i); + if (c >= 0x10000) { + ressize += 1+1+8; + } + else if (c >= 0x100) { + ressize += 1+1+4; + } + else + ressize += 1+1+2; + } + res = PyUnicode_New(ressize, 127); + if (res == NULL) { + Py_DECREF(object); + return NULL; + } + outp = PyUnicode_1BYTE_DATA(res); + for (i = start; i < end; ++i) { + c = PyUnicode_READ_CHAR(object, i); + *outp++ = '\\'; + if (c >= 
0x00010000) { + *outp++ = 'U'; + *outp++ = Py_hexdigits[(c>>28)&0xf]; + *outp++ = Py_hexdigits[(c>>24)&0xf]; + *outp++ = Py_hexdigits[(c>>20)&0xf]; + *outp++ = Py_hexdigits[(c>>16)&0xf]; + *outp++ = Py_hexdigits[(c>>12)&0xf]; + *outp++ = Py_hexdigits[(c>>8)&0xf]; + } + else if (c >= 0x100) { + *outp++ = 'u'; + *outp++ = Py_hexdigits[(c>>12)&0xf]; + *outp++ = Py_hexdigits[(c>>8)&0xf]; + } + else + *outp++ = 'x'; + *outp++ = Py_hexdigits[(c>>4)&0xf]; + *outp++ = Py_hexdigits[c&0xf]; + } + + assert(_PyUnicode_CheckConsistency(res, 1)); + Py_DECREF(object); + return Py_BuildValue("(Nn)", res, end); } static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; @@ -1444,8 +1482,8 @@ static int _PyCodecRegistry_Init(void) backslashreplace_errors, METH_O, PyDoc_STR("Implements the 'backslashreplace' error handling, " - "which replaces an unencodable character with a " - "backslashed escape sequence.") + "which replaces malformed data with a backslashed " + "escape sequence.") } }, { -- cgit v1.2.1 From d53d28906ce286782125ad3ebfc3a26faeb7c08f Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 16 Feb 2015 20:52:17 +0200 Subject: Issue #23450: Fixed possible integer overflows. --- Python/codecs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Python/codecs.c') diff --git a/Python/codecs.c b/Python/codecs.c index a5588598b3..64fc3d6331 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -1006,7 +1006,7 @@ PyObject *PyCodec_NameReplaceErrors(PyObject *exc) c = PyUnicode_READ_CHAR(object, i); if (ucnhash_CAPI && ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) { - replsize = 1+1+1+strlen(buffer)+1; + replsize = 1+1+1+(int)strlen(buffer)+1; } else if (c >= 0x10000) { replsize = 1+1+8; -- cgit v1.2.1