summary | refs | log | tree | commit | diff
path: root/Python/codecs.c
diff options
context:
space:
mode:
author: Serhiy Storchaka <storchaka@gmail.com> 2013-11-19 11:32:41 +0200
committer: Serhiy Storchaka <storchaka@gmail.com> 2013-11-19 11:32:41 +0200
commit: c1c39389d8a80767e05f84820e28caf528ba0f81 (patch)
tree: 6518ca8e2ef488b2f64999fbc013e4f95ab4fb05 /Python/codecs.c
parent: 6cb333aab2f849df8bfdeadf153b978d0efcc5cf (diff)
download: cpython-c1c39389d8a80767e05f84820e28caf528ba0f81.tar.gz
Issue #12892: The utf-16* and utf-32* codecs now reject (lone) surrogates.
The utf-16* and utf-32* encoders no longer allow surrogate code points (U+D800-U+DFFF) to be encoded. The utf-32* decoders no longer decode byte sequences that correspond to surrogate code points. The surrogatepass error handler now works with the utf-16* and utf-32* codecs. Based on patches by Victor Stinner and Kang-Hao (Kenny) Lu.
Diffstat (limited to 'Python/codecs.c')
-rw-r--r-- Python/codecs.c 163
1 files changed, 146 insertions, 17 deletions
diff --git a/Python/codecs.c b/Python/codecs.c
index fe0cab4f7f..8fe0af7bf0 100644
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -753,6 +753,65 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
}
}
+#define ENC_UTF8 0 /* UTF-8; also what get_standard_encoding() returns for any name it does not recognize */
+#define ENC_UTF16BE 1 /* UTF-16, big-endian byte order */
+#define ENC_UTF16LE 2 /* UTF-16, little-endian byte order */
+#define ENC_UTF32BE 3 /* UTF-32, big-endian byte order */
+#define ENC_UTF32LE 4 /* UTF-32, little-endian byte order */
+
+/* Classify the byte-order suffix that follows "utf16"/"utf32" in a codec
+   name.  An optional '-' or '_' separator in front of the suffix is skipped.
+   Returns 0 for an empty suffix (native byte order), 'b' for "be", 'l' for
+   "le" (both matched case-insensitively) and -1 for anything else. */
+static int
+get_byte_order_suffix(const char *suffix)
+{
+    if (*suffix == '\0')
+        return 0;
+    if (*suffix == '-' || *suffix == '_')
+        suffix++;
+    /* Test suffix[0] before suffix[1]: for a name such as "utf-16-" the
+       suffix is empty at this point and suffix[1] would lie beyond the
+       terminating NUL. */
+    if (suffix[0] != '\0' &&
+        Py_TOLOWER(suffix[1]) == 'e' && suffix[2] == '\0') {
+        int order = Py_TOLOWER(suffix[0]);
+        if (order == 'b' || order == 'l')
+            return order;
+    }
+    return -1;
+}
+
+/* Map a codec name onto one of the ENC_* codes.
+
+   Recognized spellings are "utf", an optional '-' or '_', "16" or "32",
+   and then either nothing (native byte order, selected at compile time via
+   WORDS_BIGENDIAN) or an optional '-' or '_' followed by "be" or "le".
+   Letters are matched case-insensitively.
+
+   encoding   -- NUL-terminated codec name; must not be NULL.
+   bytelength -- output: 2 for UTF-16, 4 for UTF-32, 3 for the UTF-8
+                 fallback.
+
+   Every name that does not match one of the UTF-16/UTF-32 spellings is
+   reported as ENC_UTF8 with *bytelength set to 3. */
+static int
+get_standard_encoding(const char *encoding, int *bytelength)
+{
+    if (Py_TOLOWER(encoding[0]) == 'u' &&
+        Py_TOLOWER(encoding[1]) == 't' &&
+        Py_TOLOWER(encoding[2]) == 'f') {
+        int width = 0;
+
+        encoding += 3;
+        if (*encoding == '-' || *encoding == '_')
+            encoding++;
+        if (encoding[0] == '1' && encoding[1] == '6')
+            width = 2;
+        else if (encoding[0] == '3' && encoding[1] == '2')
+            width = 4;
+
+        if (width != 0) {
+            int order = get_byte_order_suffix(encoding + 2);
+            if (order == 0) {
+                /* No explicit suffix: use the platform's byte order. */
+#ifdef WORDS_BIGENDIAN
+                order = 'b';
+#else
+                order = 'l';
+#endif
+            }
+            if (order == 'b') {
+                *bytelength = width;
+                return width == 2 ? ENC_UTF16BE : ENC_UTF32BE;
+            }
+            if (order == 'l') {
+                *bytelength = width;
+                return width == 2 ? ENC_UTF16LE : ENC_UTF32LE;
+            }
+            /* Unrecognized suffix: fall through to the UTF-8 default. */
+        }
+    }
+    /* utf-8 */
+    *bytelength = 3;
+    return ENC_UTF8;
+}
+
/* This handler is declared static until someone demonstrates
a need to call it directly. */
static PyObject *
@@ -760,24 +819,40 @@ PyCodec_SurrogatePassErrors(PyObject *exc)
{
PyObject *restuple;
PyObject *object;
+ PyObject *encode;
+ char *encoding;
+ int code;
+ int bytelength;
Py_ssize_t i;
Py_ssize_t start;
Py_ssize_t end;
PyObject *res;
if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
- char *outp;
+ unsigned char *outp;
if (PyUnicodeEncodeError_GetStart(exc, &start))
return NULL;
if (PyUnicodeEncodeError_GetEnd(exc, &end))
return NULL;
if (!(object = PyUnicodeEncodeError_GetObject(exc)))
return NULL;
- res = PyBytes_FromStringAndSize(NULL, 3*(end-start));
+ if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
+ Py_DECREF(object);
+ return NULL;
+ }
+ if (!(encoding = PyUnicode_AsUTF8(encode))) {
+ Py_DECREF(object);
+ Py_DECREF(encode);
+ return NULL;
+ }
+ code = get_standard_encoding(encoding, &bytelength);
+ Py_DECREF(encode);
+
+ res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
if (!res) {
Py_DECREF(object);
return NULL;
}
- outp = PyBytes_AsString(res);
+ outp = (unsigned char*)PyBytes_AsString(res);
for (i = start; i < end; i++) {
/* object is guaranteed to be "ready" */
Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
@@ -788,9 +863,33 @@ PyCodec_SurrogatePassErrors(PyObject *exc)
Py_DECREF(object);
return NULL;
}
- *outp++ = (char)(0xe0 | (ch >> 12));
- *outp++ = (char)(0x80 | ((ch >> 6) & 0x3f));
- *outp++ = (char)(0x80 | (ch & 0x3f));
+ switch (code) {
+ case ENC_UTF8:
+ *outp++ = (unsigned char)(0xe0 | (ch >> 12));
+ *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
+ *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
+ break;
+ case ENC_UTF16LE:
+ *outp++ = (unsigned char) ch;
+ *outp++ = (unsigned char)(ch >> 8);
+ break;
+ case ENC_UTF16BE:
+ *outp++ = (unsigned char)(ch >> 8);
+ *outp++ = (unsigned char) ch;
+ break;
+ case ENC_UTF32LE:
+ *outp++ = (unsigned char) ch;
+ *outp++ = (unsigned char)(ch >> 8);
+ *outp++ = (unsigned char)(ch >> 16);
+ *outp++ = (unsigned char)(ch >> 24);
+ break;
+ case ENC_UTF32BE:
+ *outp++ = (unsigned char)(ch >> 24);
+ *outp++ = (unsigned char)(ch >> 16);
+ *outp++ = (unsigned char)(ch >> 8);
+ *outp++ = (unsigned char) ch;
+ break;
+ }
}
restuple = Py_BuildValue("(On)", res, end);
Py_DECREF(res);
@@ -802,34 +901,64 @@ PyCodec_SurrogatePassErrors(PyObject *exc)
Py_UCS4 ch = 0;
if (PyUnicodeDecodeError_GetStart(exc, &start))
return NULL;
+ if (PyUnicodeDecodeError_GetEnd(exc, &end))
+ return NULL;
if (!(object = PyUnicodeDecodeError_GetObject(exc)))
return NULL;
if (!(p = (unsigned char*)PyBytes_AsString(object))) {
Py_DECREF(object);
return NULL;
}
+ if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
+ Py_DECREF(object);
+ return NULL;
+ }
+ if (!(encoding = PyUnicode_AsUTF8(encode))) {
+ Py_DECREF(object);
+ Py_DECREF(encode);
+ return NULL;
+ }
+ code = get_standard_encoding(encoding, &bytelength);
+ Py_DECREF(encode);
+
/* Try decoding a single surrogate character. If
there are more, let the codec call us again. */
p += start;
- if (PyBytes_GET_SIZE(object) - start >= 3 &&
- (p[0] & 0xf0) == 0xe0 &&
- (p[1] & 0xc0) == 0x80 &&
- (p[2] & 0xc0) == 0x80) {
- /* it's a three-byte code */
- ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
- if (!Py_UNICODE_IS_SURROGATE(ch))
- /* it's not a surrogate - fail */
- ch = 0;
+ if (PyBytes_GET_SIZE(object) - start >= bytelength) {
+ switch (code) {
+ case ENC_UTF8:
+ if ((p[0] & 0xf0) == 0xe0 &&
+ (p[1] & 0xc0) == 0x80 &&
+ (p[2] & 0xc0) == 0x80) {
+ /* it's a three-byte code */
+ ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
+ }
+ break;
+ case ENC_UTF16LE:
+ ch = p[1] << 8 | p[0];
+ break;
+ case ENC_UTF16BE:
+ ch = p[0] << 8 | p[1];
+ break;
+ case ENC_UTF32LE:
+ ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
+ break;
+ case ENC_UTF32BE:
+ ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
+ break;
+ }
}
+
Py_DECREF(object);
- if (ch == 0) {
+ if (!Py_UNICODE_IS_SURROGATE(ch)) {
+ /* it's not a surrogate - fail */
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
return NULL;
}
res = PyUnicode_FromOrdinal(ch);
if (res == NULL)
return NULL;
- return Py_BuildValue("(Nn)", res, start+3);
+ return Py_BuildValue("(Nn)", res, start + bytelength);
}
else {
wrong_exception_type(exc);