diff options
author | Kristj?n Valur J?nsson <sweskman@gmail.com> | 2014-05-08 10:59:52 +0000 |
---|---|---|
committer | Kristj?n Valur J?nsson <sweskman@gmail.com> | 2014-05-08 10:59:52 +0000 |
commit | 498765f67801283837f883c7a0c7063cc31562ba (patch) | |
tree | 3b94ced69981f2796c5a422ac7f23c4d3a62dc79 /Python/codecs.c | |
parent | 3d53c3bc64eb55637eb8b4c44c5f26b5768054f6 (diff) | |
parent | 84bc39f849f49ebc706d2ae891214bba4e8cc340 (diff) | |
download | cpython-498765f67801283837f883c7a0c7063cc31562ba.tar.gz |
Merging from 3.3: The PyCOND_TIMEDWAIT must use microseconds for the timeout argument
in order to have the same resolution as pthreads condition variables.
At the same time, it must be large enough to accept 31 bits of
milliseconds, which is the maximum timeout value in the windows API.
A PY_LONG_LONG of microseconds fullfills both requirements.
This closes issue #20737
Diffstat (limited to 'Python/codecs.c')
-rw-r--r-- | Python/codecs.c | 201 |
1 files changed, 175 insertions, 26 deletions
diff --git a/Python/codecs.c b/Python/codecs.c index 0b736c1715..e06d6e0922 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -53,7 +53,7 @@ int PyCodec_Register(PyObject *search_function) static PyObject *normalizestring(const char *string) { - register size_t i; + size_t i; size_t len = strlen(string); char *p; PyObject *v; @@ -65,9 +65,9 @@ PyObject *normalizestring(const char *string) p = PyMem_Malloc(len + 1); if (p == NULL) - return NULL; + return PyErr_NoMemory(); for (i = 0; i < len; i++) { - register char ch = string[i]; + char ch = string[i]; if (ch == ' ') ch = '-'; else @@ -360,6 +360,22 @@ PyObject *PyCodec_StreamWriter(const char *encoding, return codec_getstreamcodec(encoding, stream, errors, 3); } +/* Helper that tries to ensure the reported exception chain indicates the + * codec that was invoked to trigger the failure without changing the type + * of the exception raised. + */ +static void +wrap_codec_error(const char *operation, + const char *encoding) +{ + /* TrySetFromCause will replace the active exception with a suitably + * updated clone if it can, otherwise it will leave the original + * exception alone. + */ + _PyErr_TrySetFromCause("%s with '%s' codec failed", + operation, encoding); +} + /* Encode an object (e.g. an Unicode object) using the given encoding and return the resulting encoded object (usually a Python string). @@ -379,8 +395,10 @@ _PyCodec_EncodeInternal(PyObject *object, goto onError; result = PyEval_CallObject(encoder, args); - if (result == NULL) + if (result == NULL) { + wrap_codec_error("encoding", encoding); goto onError; + } if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 2) { @@ -423,8 +441,10 @@ _PyCodec_DecodeInternal(PyObject *object, goto onError; result = PyEval_CallObject(decoder,args); - if (result == NULL) + if (result == NULL) { + wrap_codec_error("decoding", encoding); goto onError; + } if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 2) { PyErr_SetString(PyExc_TypeError, @@ -589,7 +609,7 @@ int PyCodec_RegisterError(const char *name, PyObject *error) return -1; } return PyDict_SetItemString(interp->codec_error_registry, - (char *)name, error); + name, error); } /* Lookup the error handling callback function registered under the @@ -605,7 +625,7 @@ PyObject *PyCodec_LookupError(const char *name) if (name==NULL) name = "strict"; - handler = PyDict_GetItemString(interp->codec_error_registry, (char *)name); + handler = PyDict_GetItemString(interp->codec_error_registry, name); if (!handler) PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name); else @@ -881,6 +901,65 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) } } +#define ENC_UTF8 0 +#define ENC_UTF16BE 1 +#define ENC_UTF16LE 2 +#define ENC_UTF32BE 3 +#define ENC_UTF32LE 4 + +static int +get_standard_encoding(const char *encoding, int *bytelength) +{ + if (Py_TOLOWER(encoding[0]) == 'u' && + Py_TOLOWER(encoding[1]) == 't' && + Py_TOLOWER(encoding[2]) == 'f') { + encoding += 3; + if (*encoding == '-' || *encoding == '_' ) + encoding++; + if (encoding[0] == '1' && encoding[1] == '6') { + encoding += 2; + *bytelength = 2; + if (*encoding == '\0') { +#ifdef WORDS_BIGENDIAN + return ENC_UTF16BE; +#else + return ENC_UTF16LE; +#endif + } + if (*encoding == '-' || *encoding == '_' ) + encoding++; + if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') { + if (Py_TOLOWER(encoding[0]) == 'b') + return ENC_UTF16BE; + if (Py_TOLOWER(encoding[0]) == 'l') + return ENC_UTF16LE; + } + } + else if (encoding[0] == '3' && encoding[1] == '2') { + encoding += 2; + *bytelength = 4; + if (*encoding == '\0') { +#ifdef WORDS_BIGENDIAN + return ENC_UTF32BE; +#else + return ENC_UTF32LE; +#endif + } + if (*encoding == '-' || *encoding == '_' ) + encoding++; + if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') { + if (Py_TOLOWER(encoding[0]) == 'b') + return ENC_UTF32BE; + if (Py_TOLOWER(encoding[0]) == 'l') + return ENC_UTF32LE; + } + } + } + /* utf-8 */ + *bytelength = 3; + return ENC_UTF8; +} + /* This handler is declared static until someone demonstrates a need to call it directly. */ static PyObject * @@ -888,37 +967,77 @@ PyCodec_SurrogatePassErrors(PyObject *exc) { PyObject *restuple; PyObject *object; + PyObject *encode; + char *encoding; + int code; + int bytelength; Py_ssize_t i; Py_ssize_t start; Py_ssize_t end; PyObject *res; if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { - char *outp; + unsigned char *outp; if (PyUnicodeEncodeError_GetStart(exc, &start)) return NULL; if (PyUnicodeEncodeError_GetEnd(exc, &end)) return NULL; if (!(object = PyUnicodeEncodeError_GetObject(exc))) return NULL; - res = PyBytes_FromStringAndSize(NULL, 3*(end-start)); + if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) { + Py_DECREF(object); + return NULL; + } + if (!(encoding = PyUnicode_AsUTF8(encode))) { + Py_DECREF(object); + Py_DECREF(encode); + return NULL; + } + code = get_standard_encoding(encoding, &bytelength); + Py_DECREF(encode); + + res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start)); if (!res) { Py_DECREF(object); return NULL; } - outp = PyBytes_AsString(res); + outp = (unsigned char*)PyBytes_AsString(res); for (i = start; i < end; i++) { /* object is guaranteed to be "ready" */ Py_UCS4 ch = PyUnicode_READ_CHAR(object, i); - if (ch < 0xd800 || ch > 0xdfff) { + if (!Py_UNICODE_IS_SURROGATE(ch)) { /* Not a surrogate, fail with original exception */ PyErr_SetObject(PyExceptionInstance_Class(exc), exc); Py_DECREF(res); Py_DECREF(object); return NULL; } - *outp++ = (char)(0xe0 | (ch >> 12)); - *outp++ = (char)(0x80 | ((ch >> 6) & 0x3f)); - *outp++ = (char)(0x80 | (ch & 0x3f)); + switch (code) { + case ENC_UTF8: + *outp++ = (unsigned char)(0xe0 | (ch >> 12)); + *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f)); + *outp++ = (unsigned char)(0x80 | (ch & 0x3f)); + break; + case ENC_UTF16LE: + *outp++ = (unsigned char) ch; + *outp++ = (unsigned char)(ch >> 8); + break; + case ENC_UTF16BE: + *outp++ = (unsigned char)(ch >> 8); + *outp++ = (unsigned char) ch; + break; + case ENC_UTF32LE: + *outp++ = (unsigned char) ch; + *outp++ = (unsigned char)(ch >> 8); + *outp++ = (unsigned char)(ch >> 16); + *outp++ = (unsigned char)(ch >> 24); + break; + case ENC_UTF32BE: + *outp++ = (unsigned char)(ch >> 24); + *outp++ = (unsigned char)(ch >> 16); + *outp++ = (unsigned char)(ch >> 8); + *outp++ = (unsigned char) ch; + break; + } } restuple = Py_BuildValue("(On)", res, end); Py_DECREF(res); @@ -930,34 +1049,64 @@ PyCodec_SurrogatePassErrors(PyObject *exc) Py_UCS4 ch = 0; if (PyUnicodeDecodeError_GetStart(exc, &start)) return NULL; + if (PyUnicodeDecodeError_GetEnd(exc, &end)) + return NULL; if (!(object = PyUnicodeDecodeError_GetObject(exc))) return NULL; if (!(p = (unsigned char*)PyBytes_AsString(object))) { Py_DECREF(object); return NULL; } + if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) { + Py_DECREF(object); + return NULL; + } + if (!(encoding = PyUnicode_AsUTF8(encode))) { + Py_DECREF(object); + Py_DECREF(encode); + return NULL; + } + code = get_standard_encoding(encoding, &bytelength); + Py_DECREF(encode); + /* Try decoding a single surrogate character. If there are more, let the codec call us again. */ p += start; - if (PyBytes_GET_SIZE(object) - start >= 3 && - (p[0] & 0xf0) == 0xe0 && - (p[1] & 0xc0) == 0x80 && - (p[2] & 0xc0) == 0x80) { - /* it's a three-byte code */ - ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f); - if (ch < 0xd800 || ch > 0xdfff) - /* it's not a surrogate - fail */ - ch = 0; + if (PyBytes_GET_SIZE(object) - start >= bytelength) { + switch (code) { + case ENC_UTF8: + if ((p[0] & 0xf0) == 0xe0 && + (p[1] & 0xc0) == 0x80 && + (p[2] & 0xc0) == 0x80) { + /* it's a three-byte code */ + ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f); + } + break; + case ENC_UTF16LE: + ch = p[1] << 8 | p[0]; + break; + case ENC_UTF16BE: + ch = p[0] << 8 | p[1]; + break; + case ENC_UTF32LE: + ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0]; + break; + case ENC_UTF32BE: + ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3]; + break; + } } + Py_DECREF(object); - if (ch == 0) { + if (!Py_UNICODE_IS_SURROGATE(ch)) { + /* it's not a surrogate - fail */ PyErr_SetObject(PyExceptionInstance_Class(exc), exc); return NULL; } res = PyUnicode_FromOrdinal(ch); if (res == NULL) return NULL; - return Py_BuildValue("(Nn)", res, start+3); + return Py_BuildValue("(Nn)", res, start + bytelength); } else { wrong_exception_type(exc); @@ -1174,7 +1323,7 @@ static int _PyCodecRegistry_Init(void) if (interp->codec_error_registry) { for (i = 0; i < Py_ARRAY_LENGTH(methods); ++i) { - PyObject *func = PyCFunction_New(&methods[i].def, NULL); + PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL); int res; if (!func) Py_FatalError("can't initialize codec error registry"); |