From 26ad1df9768ed894740d1f88993e146ecd627ea6 Mon Sep 17 00:00:00 2001
From: Benjamin Peterson <benjamin@python.org>
Date: Wed, 28 Oct 2009 21:59:39 +0000
Subject: in wide builds, avoid storing high unicode characters from source
 code with surrogates

This is accomplished by decoding with utf-32 instead of utf-16 on all builds.
The patch is by Adam Olsen.
---
 Python/ast.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

(limited to 'Python/ast.c')

diff --git a/Python/ast.c b/Python/ast.c
index c3edea3534..c6a6417efe 100644
--- a/Python/ast.c
+++ b/Python/ast.c
@@ -3246,10 +3246,11 @@ decode_unicode(struct compiling *c, const char *s, size_t len, int rawmode, cons
         u = NULL;
     } else {
         /* check for integer overflow */
-        if (len > PY_SIZE_MAX / 4)
+        if (len > PY_SIZE_MAX / 6)
             return NULL;
-        /* "\XX" may become "\u005c\uHHLL" (12 bytes) */
-        u = PyBytes_FromStringAndSize((char *)NULL, len * 4);
+        /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
+           "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
+        u = PyBytes_FromStringAndSize((char *)NULL, len * 6);
         if (u == NULL)
             return NULL;
         p = buf = PyBytes_AsString(u);
@@ -3266,20 +3267,24 @@ decode_unicode(struct compiling *c, const char *s, size_t len, int rawmode, cons
                 PyObject *w;
                 char *r;
                 Py_ssize_t rn, i;
-                w = decode_utf8(c, &s, end, "utf-16-be");
+                w = decode_utf8(c, &s, end, "utf-32-be");
                 if (w == NULL) {
                     Py_DECREF(u);
                     return NULL;
                 }
                 r = PyBytes_AS_STRING(w);
                 rn = Py_SIZE(w);
-                assert(rn % 2 == 0);
-                for (i = 0; i < rn; i += 2) {
-                    sprintf(p, "\\u%02x%02x",
+                assert(rn % 4 == 0);
+                for (i = 0; i < rn; i += 4) {
+                    sprintf(p, "\\U%02x%02x%02x%02x",
                             r[i + 0] & 0xFF,
-                            r[i + 1] & 0xFF);
-                    p += 6;
+                            r[i + 1] & 0xFF,
+                            r[i + 2] & 0xFF,
+                            r[i + 3] & 0xFF);
+                    p += 10;
                 }
+                /* Should be impossible to overflow */
+                assert(p - buf <= Py_SIZE(u));
                 Py_DECREF(w);
             } else {
                 *p++ = *s++;
-- 
cgit v1.2.1