Issue #12892: The utf-16* and utf-32* codecs now reject (lone) surrogates.

The utf-16* and utf-32* encoders no longer allow surrogate code points (U+D800-U+DFFF) to be encoded. The utf-32* decoders no longer decode byte sequences that correspond to surrogate code points. The surrogatepass error handler now works with the utf-16* and utf-32* codecs. Based on patches by Victor Stinner and Kang-Hao (Kenny) Lu.
author: Serhiy Storchaka <storchaka@gmail.com> 2013-11-19 11:32:41 +0200
committer: Serhiy Storchaka <storchaka@gmail.com> 2013-11-19 11:32:41 +0200
commit: c1c39389d8a80767e05f84820e28caf528ba0f81 (patch)
tree: 6518ca8e2ef488b2f64999fbc013e4f95ab4fb05 /Python/codecs.c
parent: 6cb333aab2f849df8bfdeadf153b978d0efcc5cf (diff)
download: cpython-c1c39389d8a80767e05f84820e28caf528ba0f81.tar.gz
1 files changed, 146 insertions, 17 deletions
diff --git a/Python/codecs.c b/Python/codecs.c
index fe0cab4f7f..8fe0af7bf0 100644
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -753,6 +753,65 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
     }
 }
 
+#define ENC_UTF8        0
+#define ENC_UTF16BE     1
+#define ENC_UTF16LE     2
+#define ENC_UTF32BE     3
+#define ENC_UTF32LE     4
+
+static int
+get_standard_encoding(const char *encoding, int *bytelength)
+{
+    if (Py_TOLOWER(encoding[0]) == 'u' &&
+        Py_TOLOWER(encoding[1]) == 't' &&
+        Py_TOLOWER(encoding[2]) == 'f') {
+        encoding += 3;
+        if (*encoding == '-' || *encoding == '_' )
+            encoding++;
+        if (encoding[0] == '1' && encoding[1] == '6') {
+            encoding += 2;
+            *bytelength = 2;
+            if (*encoding == '\0') {
+#ifdef WORDS_BIGENDIAN
+                return ENC_UTF16BE;
+#else
+                return ENC_UTF16LE;
+#endif
+            }
+            if (*encoding == '-' || *encoding == '_' )
+                encoding++;
+            if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
+                if (Py_TOLOWER(encoding[0]) == 'b')
+                    return ENC_UTF16BE;
+                if (Py_TOLOWER(encoding[0]) == 'l')
+                    return ENC_UTF16LE;
+            }
+        }
+        else if (encoding[0] == '3' && encoding[1] == '2') {
+            encoding += 2;
+            *bytelength = 4;
+            if (*encoding == '\0') {
+#ifdef WORDS_BIGENDIAN
+                return ENC_UTF32BE;
+#else
+                return ENC_UTF32LE;
+#endif
+            }
+            if (*encoding == '-' || *encoding == '_' )
+                encoding++;
+            if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
+                if (Py_TOLOWER(encoding[0]) == 'b')
+                    return ENC_UTF32BE;
+                if (Py_TOLOWER(encoding[0]) == 'l')
+                    return ENC_UTF32LE;
+            }
+        }
+    }
+    /* utf-8 */
+    *bytelength = 3;
+    return ENC_UTF8;
+}
+
 /* This handler is declared static until someone demonstrates
    a need to call it directly. */
 static PyObject *
@@ -760,24 +819,40 @@ PyCodec_SurrogatePassErrors(PyObject *exc)
 {
     PyObject *restuple;
     PyObject *object;
+    PyObject *encode;
+    char *encoding;
+    int code;
+    int bytelength;
     Py_ssize_t i;
     Py_ssize_t start;
     Py_ssize_t end;
     PyObject *res;
     if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
-        char *outp;
+        unsigned char *outp;
         if (PyUnicodeEncodeError_GetStart(exc, &start))
             return NULL;
         if (PyUnicodeEncodeError_GetEnd(exc, &end))
             return NULL;
         if (!(object = PyUnicodeEncodeError_GetObject(exc)))
             return NULL;
-        res = PyBytes_FromStringAndSize(NULL, 3*(end-start));
+        if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
+            Py_DECREF(object);
+            return NULL;
+        }
+        if (!(encoding = PyUnicode_AsUTF8(encode))) {
+            Py_DECREF(object);
+            Py_DECREF(encode);
+            return NULL;
+        }
+        code = get_standard_encoding(encoding, &bytelength);
+        Py_DECREF(encode);
+
+        res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
         if (!res) {
             Py_DECREF(object);
             return NULL;
         }
-        outp = PyBytes_AsString(res);
+        outp = (unsigned char*)PyBytes_AsString(res);
         for (i = start; i < end; i++) {
             /* object is guaranteed to be "ready" */
             Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
@@ -788,9 +863,33 @@ PyCodec_SurrogatePassErrors(PyObject *exc)
                 Py_DECREF(object);
                 return NULL;
             }
-            *outp++ = (char)(0xe0 | (ch >> 12));
-            *outp++ = (char)(0x80 | ((ch >> 6) & 0x3f));
-            *outp++ = (char)(0x80 | (ch & 0x3f));
+            switch (code) {
+            case ENC_UTF8:
+                *outp++ = (unsigned char)(0xe0 | (ch >> 12));
+                *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
+                *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
+                break;
+            case ENC_UTF16LE:
+                *outp++ = (unsigned char) ch;
+                *outp++ = (unsigned char)(ch >> 8);
+                break;
+            case ENC_UTF16BE:
+                *outp++ = (unsigned char)(ch >> 8);
+                *outp++ = (unsigned char) ch;
+                break;
+            case ENC_UTF32LE:
+                *outp++ = (unsigned char) ch;
+                *outp++ = (unsigned char)(ch >> 8);
+                *outp++ = (unsigned char)(ch >> 16);
+                *outp++ = (unsigned char)(ch >> 24);
+                break;
+            case ENC_UTF32BE:
+                *outp++ = (unsigned char)(ch >> 24);
+                *outp++ = (unsigned char)(ch >> 16);
+                *outp++ = (unsigned char)(ch >> 8);
+                *outp++ = (unsigned char) ch;
+                break;
+            }
         }
         restuple = Py_BuildValue("(On)", res, end);
         Py_DECREF(res);
@@ -802,34 +901,64 @@ PyCodec_SurrogatePassErrors(PyObject *exc)
         Py_UCS4 ch = 0;
         if (PyUnicodeDecodeError_GetStart(exc, &start))
             return NULL;
+        if (PyUnicodeDecodeError_GetEnd(exc, &end))
+            return NULL;
         if (!(object = PyUnicodeDecodeError_GetObject(exc)))
             return NULL;
         if (!(p = (unsigned char*)PyBytes_AsString(object))) {
             Py_DECREF(object);
             return NULL;
         }
+        if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
+            Py_DECREF(object);
+            return NULL;
+        }
+        if (!(encoding = PyUnicode_AsUTF8(encode))) {
+            Py_DECREF(object);
+            Py_DECREF(encode);
+            return NULL;
+        }
+        code = get_standard_encoding(encoding, &bytelength);
+        Py_DECREF(encode);
+
         /* Try decoding a single surrogate character. If
            there are more, let the codec call us again. */
         p += start;
-        if (PyBytes_GET_SIZE(object) - start >= 3 &&
-            (p[0] & 0xf0) == 0xe0 &&
-            (p[1] & 0xc0) == 0x80 &&
-            (p[2] & 0xc0) == 0x80) {
-            /* it's a three-byte code */
-            ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
-            if (!Py_UNICODE_IS_SURROGATE(ch))
-                /* it's not a surrogate - fail */
-                ch = 0;
+        if (PyBytes_GET_SIZE(object) - start >= bytelength) {
+            switch (code) {
+            case ENC_UTF8:
+                if ((p[0] & 0xf0) == 0xe0 &&
+                    (p[1] & 0xc0) == 0x80 &&
+                    (p[2] & 0xc0) == 0x80) {
+                    /* it's a three-byte code */
+                    ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
+                }
+                break;
+            case ENC_UTF16LE:
+                ch = p[1] << 8 | p[0];
+                break;
+            case ENC_UTF16BE:
+                ch = p[0] << 8 | p[1];
+                break;
+            case ENC_UTF32LE:
+                ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
+                break;
+            case ENC_UTF32BE:
+                ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
+                break;
+            }
         }
+
         Py_DECREF(object);
-        if (ch == 0) {
+        if (!Py_UNICODE_IS_SURROGATE(ch)) {
+            /* it's not a surrogate - fail */
             PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
             return NULL;
         }
         res = PyUnicode_FromOrdinal(ch);
         if (res == NULL)
             return NULL;
-        return Py_BuildValue("(Nn)", res, start+3);
+        return Py_BuildValue("(Nn)", res, start + bytelength);
     }
     else {
         wrong_exception_type(exc);
author	Serhiy Storchaka <storchaka@gmail.com>	2013-11-19 11:32:41 +0200
committer	Serhiy Storchaka <storchaka@gmail.com>	2013-11-19 11:32:41 +0200
commit	c1c39389d8a80767e05f84820e28caf528ba0f81 (patch)
tree	6518ca8e2ef488b2f64999fbc013e4f95ab4fb05 /Python/codecs.c
parent	6cb333aab2f849df8bfdeadf153b978d0efcc5cf (diff)
download	cpython-c1c39389d8a80767e05f84820e28caf528ba0f81.tar.gz