Issue #21187: merge with 3.4

author: Ned Deily <nad@acm.org> 2014-04-09 16:22:19 -0700
committer: Ned Deily <nad@acm.org> 2014-04-09 16:22:19 -0700
commit: a4a8b4599f3ea553e45af913536d0f6985081a8f (patch)
tree: ca2a1984a51a388d86db2a006b5cd5a24fc15d3c /Objects
parent: 7c5c123a453e906da293896ed413614236064cce (diff)
parent: 8bc92821f69ba96eb0e2acabea54eb3b5e1c86a5 (diff)
download: cpython-a4a8b4599f3ea553e45af913536d0f6985081a8f.tar.gz
1 files changed, 246 insertions, 232 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index ec22239d3b..af70ede749 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -6817,28 +6817,6 @@ code_page_name(UINT code_page, PyObject **obj)
     return PyBytes_AS_STRING(*obj);
 }
 
-static int
-is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
-{
-    const char *curr = s + offset;
-    const char *prev;
-
-    if (!IsDBCSLeadByteEx(code_page, *curr))
-        return 0;
-
-    prev = CharPrevExA(code_page, s, curr, 0);
-    if (prev == curr)
-        return 1;
-    /* FIXME: This code is limited to "true" double-byte encodings,
-       as it assumes an incomplete character consists of a single
-       byte. */
-    if (curr - prev == 2)
-        return 1;
-    if (!IsDBCSLeadByteEx(code_page, *prev))
-        return 1;
-    return 0;
-}
-
 static DWORD
 decode_code_page_flags(UINT code_page)
 {
@@ -6913,7 +6891,7 @@ static int
 decode_code_page_errors(UINT code_page,
                         PyObject **v,
                         const char *in, const int size,
-                        const char *errors)
+                        const char *errors, int final)
 {
     const char *startin = in;
     const char *endin = in + size;
@@ -6940,7 +6918,7 @@ decode_code_page_errors(UINT code_page,
     if (encoding == NULL)
         return -1;
 
-    if (errors == NULL || strcmp(errors, "strict") == 0) {
+    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
         /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
            UnicodeDecodeError. */
         make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
@@ -7003,6 +6981,10 @@ decode_code_page_errors(UINT code_page,
         if (outsize <= 0) {
             Py_ssize_t startinpos, endinpos, outpos;
 
+            /* last character in partial decode? */
+            if (in + insize >= endin && !final)
+                break;
+
             startinpos = in - startin;
             endinpos = startinpos + 1;
             outpos = out - PyUnicode_AS_UNICODE(*v);
@@ -7031,7 +7013,7 @@ decode_code_page_errors(UINT code_page,
     assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
     if (unicode_resize(v, outsize) < 0)
         goto error;
-    ret = size;
+    ret = in - startin;
 
 error:
     Py_XDECREF(encoding_obj);
@@ -7072,24 +7054,19 @@ decode_code_page_stateful(int code_page,
             done = 1;
         }
 
-        /* Skip trailing lead-byte unless 'final' is set */
-        if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
-            --chunk_size;
-
         if (chunk_size == 0 && done) {
             if (v != NULL)
                 break;
             _Py_RETURN_UNICODE_EMPTY();
         }
 
-
         converted = decode_code_page_strict(code_page, &v,
                                             s, chunk_size);
         if (converted == -2)
             converted = decode_code_page_errors(code_page, &v,
                                                 s, chunk_size,
-                                                errors);
-        assert(converted != 0);
+                                                errors, final);
+        assert(converted != 0 || done);
 
         if (converted < 0) {
             Py_XDECREF(v);
@@ -8496,10 +8473,10 @@ charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
     }
     else if (PyLong_Check(x)) {
         long value = PyLong_AS_LONG(x);
-        long max = PyUnicode_GetMax();
-        if (value < 0 || value > max) {
-            PyErr_Format(PyExc_TypeError,
-                         "character mapping must be in range(0x%x)", max+1);
+        if (value < 0 || value > MAX_UNICODE) {
+            PyErr_Format(PyExc_ValueError,
+                         "character mapping must be in range(0x%x)",
+                         MAX_UNICODE+1);
             Py_DECREF(x);
             return -1;
         }
@@ -8518,76 +8495,168 @@ charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
         return -1;
     }
 }
-/* ensure that *outobj is at least requiredsize characters long,
-   if not reallocate and adjust various state variables.
-   Return 0 on success, -1 on error */
+
+/* lookup the character, write the result into the writer.
+   Return 1 if the result was written into the writer, return 0 if the mapping
+   was undefined, raise an exception return -1 on error. */
 static int
-charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
-                               Py_ssize_t requiredsize)
-{
-    Py_ssize_t oldsize = *psize;
-    Py_UCS4 *new_outobj;
-    if (requiredsize > oldsize) {
-        /* exponentially overallocate to minimize reallocations */
-        if (requiredsize < 2 * oldsize)
-            requiredsize = 2 * oldsize;
-        new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
-        if (new_outobj == 0)
+charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
+                        _PyUnicodeWriter *writer)
+{
+    PyObject *item;
+
+    if (charmaptranslate_lookup(ch, mapping, &item))
+        return -1;
+
+    if (item == NULL) {
+        /* not found => default to 1:1 mapping */
+        if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
             return -1;
-        *outobj = new_outobj;
-        *psize = requiredsize;
+        }
+        return 1;
     }
-    return 0;
+
+    if (item == Py_None) {
+        Py_DECREF(item);
+        return 0;
+    }
+
+    if (PyLong_Check(item)) {
+        long ch = (Py_UCS4)PyLong_AS_LONG(item);
+        /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
+           used it */
+        if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
+            Py_DECREF(item);
+            return -1;
+        }
+        Py_DECREF(item);
+        return 1;
+    }
+
+    if (!PyUnicode_Check(item)) {
+        Py_DECREF(item);
+        return -1;
+    }
+
+    if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
+        Py_DECREF(item);
+        return -1;
+    }
+
+    Py_DECREF(item);
+    return 1;
 }
-/* lookup the character, put the result in the output string and adjust
-   various state variables. Return a new reference to the object that
-   was put in the output buffer in *result, or Py_None, if the mapping was
-   undefined (in which case no character was written).
-   The called must decref result.
-   Return 0 on success, -1 on error. */
+
 static int
-charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
-                        PyObject *mapping, Py_UCS4 **output,
-                        Py_ssize_t *osize, Py_ssize_t *opos,
-                        PyObject **res)
+unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
+                              Py_UCS1 *translate)
 {
-    Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
-    if (charmaptranslate_lookup(curinp, mapping, res))
+    PyObject *item = NULL;
+    int ret = 0;
+
+    if (charmaptranslate_lookup(ch, mapping, &item)) {
         return -1;
-    if (*res==NULL) {
+    }
+
+    if (item == Py_None) {
+        /* deletion */
+        translate[ch] = 0xfe;
+    }
+    else if (item == NULL) {
         /* not found => default to 1:1 mapping */
-        (*output)[(*opos)++] = curinp;
+        translate[ch] = ch;
+        return 1;
     }
-    else if (*res==Py_None)
-        ;
-    else if (PyLong_Check(*res)) {
-        /* no overflow check, because we know that the space is enough */
-        (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
+    else if (PyLong_Check(item)) {
+        long replace = PyLong_AS_LONG(item);
+        /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
+           used it */
+        if (127 < replace) {
+            /* invalid character or character outside ASCII:
+               skip the fast translate */
+            goto exit;
+        }
+        translate[ch] = (Py_UCS1)replace;
     }
-    else if (PyUnicode_Check(*res)) {
-        Py_ssize_t repsize;
-        if (PyUnicode_READY(*res) == -1)
+    else if (PyUnicode_Check(item)) {
+        Py_UCS4 replace;
+
+        if (PyUnicode_READY(item) == -1) {
+            Py_DECREF(item);
             return -1;
-        repsize = PyUnicode_GET_LENGTH(*res);
-        if (repsize==1) {
-            /* no overflow check, because we know that the space is enough */
-            (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
         }
-        else if (repsize!=0) {
-            /* more than one character */
-            Py_ssize_t requiredsize = *opos +
-                (PyUnicode_GET_LENGTH(input) - ipos) +
-                repsize - 1;
-            Py_ssize_t i;
-            if (charmaptranslate_makespace(output, osize, requiredsize))
+        if (PyUnicode_GET_LENGTH(item) != 1)
+            goto exit;
+
+        replace = PyUnicode_READ_CHAR(item, 0);
+        if (replace > 127)
+            goto exit;
+        translate[ch] = (Py_UCS1)replace;
+    }
+    else {
+        /* not None, NULL, long or unicode */
+        goto exit;
+    }
+    ret = 1;
+
+  exit:
+    Py_DECREF(item);
+    return ret;
+}
+
+/* Fast path for ascii => ascii translation. Return 1 if the whole string
+   was translated into writer, return 0 if the input string was partially
+   translated into writer, raise an exception and return -1 on error. */
+static int
+unicode_fast_translate(PyObject *input, PyObject *mapping,
+                       _PyUnicodeWriter *writer, int ignore)
+{
+    Py_UCS1 ascii_table[128], ch, ch2;
+    Py_ssize_t len;
+    Py_UCS1 *in, *end, *out;
+    int res = 0;
+
+    if (PyUnicode_READY(input) == -1)
+        return -1;
+    if (!PyUnicode_IS_ASCII(input))
+        return 0;
+    len = PyUnicode_GET_LENGTH(input);
+
+    memset(ascii_table, 0xff, 128);
+
+    in = PyUnicode_1BYTE_DATA(input);
+    end = in + len;
+
+    assert(PyUnicode_IS_ASCII(writer->buffer));
+    assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
+    out = PyUnicode_1BYTE_DATA(writer->buffer);
+
+    for (; in < end; in++) {
+        ch = *in;
+        ch2 = ascii_table[ch];
+        if (ch2 == 0xff) {
+            int translate = unicode_fast_translate_lookup(mapping, ch,
+                                                          ascii_table);
+            if (translate < 0)
                 return -1;
-            for(i = 0; i < repsize; i++)
-                (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
+            if (translate == 0)
+                goto exit;
+            ch2 = ascii_table[ch];
         }
+        if (ch2 == 0xfe) {
+            if (ignore)
+                continue;
+            goto exit;
+        }
+        assert(ch2 < 128);
+        *out = ch2;
+        out++;
     }
-    else
-        return -1;
-    return 0;
+    res = 1;
+
+exit:
+    writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
+    return res;
 }
 
 PyObject *
@@ -8596,22 +8665,17 @@ _PyUnicode_TranslateCharmap(PyObject *input,
                             const char *errors)
 {
     /* input object */
-    char *idata;
+    char *data;
     Py_ssize_t size, i;
     int kind;
     /* output buffer */
-    Py_UCS4 *output = NULL;
-    Py_ssize_t osize;
-    PyObject *res;
-    /* current output position */
-    Py_ssize_t opos;
+    _PyUnicodeWriter writer;
+    /* error handler */
     char *reason = "character maps to <undefined>";
     PyObject *errorHandler = NULL;
     PyObject *exc = NULL;
-    /* the following variable is used for caching string comparisons
-     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
-     * 3=ignore, 4=xmlcharrefreplace */
-    int known_errorHandler = -1;
+    int ignore;
+    int res;
 
     if (mapping == NULL) {
         PyErr_BadArgument();
@@ -8620,10 +8684,9 @@ _PyUnicode_TranslateCharmap(PyObject *input,
 
     if (PyUnicode_READY(input) == -1)
         return NULL;
-    idata = (char*)PyUnicode_DATA(input);
+    data = (char*)PyUnicode_DATA(input);
     kind = PyUnicode_KIND(input);
     size = PyUnicode_GET_LENGTH(input);
-    i = 0;
 
     if (size == 0) {
         Py_INCREF(input);
@@ -8632,121 +8695,81 @@ _PyUnicode_TranslateCharmap(PyObject *input,
 
     /* allocate enough for a simple 1:1 translation without
        replacements, if we need more, we'll resize */
-    osize = size;
-    output = PyMem_Malloc(osize * sizeof(Py_UCS4));
-    opos = 0;
-    if (output == NULL) {
-        PyErr_NoMemory();
+    _PyUnicodeWriter_Init(&writer);
+    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
         goto onError;
+
+    ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
+
+    res = unicode_fast_translate(input, mapping, &writer, ignore);
+    if (res < 0) {
+        _PyUnicodeWriter_Dealloc(&writer);
+        return NULL;
     }
+    if (res == 1)
+        return _PyUnicodeWriter_Finish(&writer);
 
+    i = writer.pos;
     while (i<size) {
         /* try to encode it */
-        PyObject *x = NULL;
-        if (charmaptranslate_output(input, i, mapping,
-                                    &output, &osize, &opos, &x)) {
-            Py_XDECREF(x);
+        int translate;
+        PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
+        Py_ssize_t newpos;
+        /* startpos for collecting untranslatable chars */
+        Py_ssize_t collstart;
+        Py_ssize_t collend;
+        Py_UCS4 ch;
+
+        ch = PyUnicode_READ(kind, data, i);
+        translate = charmaptranslate_output(ch, mapping, &writer);
+        if (translate < 0)
             goto onError;
-        }
-        Py_XDECREF(x);
-        if (x!=Py_None) /* it worked => adjust input pointer */
+
+        if (translate != 0) {
+            /* it worked => adjust input pointer */
             ++i;
-        else { /* untranslatable character */
-            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
-            Py_ssize_t repsize;
-            Py_ssize_t newpos;
-            Py_ssize_t uni2;
-            /* startpos for collecting untranslatable chars */
-            Py_ssize_t collstart = i;
-            Py_ssize_t collend = i+1;
-            Py_ssize_t coll;
-
-            /* find all untranslatable characters */
-            while (collend < size) {
-                if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
-                    goto onError;
-                Py_XDECREF(x);
-                if (x!=Py_None)
-                    break;
-                ++collend;
-            }
-            /* cache callback name lookup
-             * (if not done yet, i.e. it's the first error) */
-            if (known_errorHandler==-1) {
-                if ((errors==NULL) || (!strcmp(errors, "strict")))
-                    known_errorHandler = 1;
-                else if (!strcmp(errors, "replace"))
-                    known_errorHandler = 2;
-                else if (!strcmp(errors, "ignore"))
-                    known_errorHandler = 3;
-                else if (!strcmp(errors, "xmlcharrefreplace"))
-                    known_errorHandler = 4;
-                else
-                    known_errorHandler = 0;
-            }
-            switch (known_errorHandler) {
-            case 1: /* strict */
-                make_translate_exception(&exc,
-                                         input, collstart, collend, reason);
-                if (exc != NULL)
-                    PyCodec_StrictErrors(exc);
+            continue;
+        }
+
+        /* untranslatable character */
+        collstart = i;
+        collend = i+1;
+
+        /* find all untranslatable characters */
+        while (collend < size) {
+            PyObject *x;
+            ch = PyUnicode_READ(kind, data, collend);
+            if (charmaptranslate_lookup(ch, mapping, &x))
                 goto onError;
-            case 2: /* replace */
-                /* No need to check for space, this is a 1:1 replacement */
-                for (coll = collstart; coll<collend; coll++)
-                    output[opos++] = '?';
-                /* fall through */
-            case 3: /* ignore */
-                i = collend;
-                break;
-            case 4: /* xmlcharrefreplace */
-                /* generate replacement (temporarily (mis)uses i) */
-                for (i = collstart; i < collend; ++i) {
-                    char buffer[2+29+1+1];
-                    char *cp;
-                    sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
-                    if (charmaptranslate_makespace(&output, &osize,
-                                                   opos+strlen(buffer)+(size-collend)))
-                        goto onError;
-                    for (cp = buffer; *cp; ++cp)
-                        output[opos++] = *cp;
-                }
-                i = collend;
+            Py_XDECREF(x);
+            if (x != Py_None)
                 break;
-            default:
-                repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
-                                                                 reason, input, &exc,
-                                                                 collstart, collend, &newpos);
-                if (repunicode == NULL)
-                    goto onError;
-                if (PyUnicode_READY(repunicode) == -1) {
-                    Py_DECREF(repunicode);
-                    goto onError;
-                }
-                /* generate replacement  */
-                repsize = PyUnicode_GET_LENGTH(repunicode);
-                if (charmaptranslate_makespace(&output, &osize,
-                                               opos+repsize+(size-collend))) {
-                    Py_DECREF(repunicode);
-                    goto onError;
-                }
-                for (uni2 = 0; repsize-->0; ++uni2)
-                    output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
-                i = newpos;
+            ++collend;
+        }
+
+        if (ignore) {
+            i = collend;
+        }
+        else {
+            repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
+                                                             reason, input, &exc,
+                                                             collstart, collend, &newpos);
+            if (repunicode == NULL)
+                goto onError;
+            if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
                 Py_DECREF(repunicode);
+                goto onError;
             }
+            Py_DECREF(repunicode);
+            i = newpos;
         }
     }
-    res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
-    if (!res)
-        goto onError;
-    PyMem_Free(output);
     Py_XDECREF(exc);
     Py_XDECREF(errorHandler);
-    return res;
+    return _PyUnicodeWriter_Finish(&writer);
 
   onError:
-    PyMem_Free(output);
+    _PyUnicodeWriter_Dealloc(&writer);
     Py_XDECREF(exc);
     Py_XDECREF(errorHandler);
     return NULL;
@@ -14009,24 +14032,14 @@ mainformatlong(PyObject *v,
     if (!PyNumber_Check(v))
         goto wrongtype;
 
-    /* make sure number is a type of integer */
-    /* if not, issue deprecation warning for now */
+    /* make sure number is a type of integer for o, x, and X */
     if (!PyLong_Check(v)) {
         if (type == 'o' || type == 'x' || type == 'X') {
             iobj = PyNumber_Index(v);
             if (iobj == NULL) {
-                PyErr_Clear();
-                if (PyErr_WarnEx(PyExc_DeprecationWarning,
-                                 "automatic int conversions have been deprecated",
-                                 1)) {
-                    return -1;
-                }
-                iobj = PyNumber_Long(v);
-                if (iobj == NULL ) {
-                    if (PyErr_ExceptionMatches(PyExc_TypeError))
-                        goto wrongtype;
-                    return -1;
-                }
+                if (PyErr_ExceptionMatches(PyExc_TypeError))
+                    goto wrongtype;
+                return -1;
             }
         }
         else {
@@ -14087,10 +14100,23 @@ mainformatlong(PyObject *v,
     return 0;
 
 wrongtype:
-    PyErr_Format(PyExc_TypeError,
-            "%%%c format: a number is required, "
-            "not %.200s",
-            type, Py_TYPE(v)->tp_name);
+    switch(type)
+    {
+        case 'o':
+        case 'x':
+        case 'X':
+            PyErr_Format(PyExc_TypeError,
+                    "%%%c format: an integer is required, "
+                    "not %.200s",
+                    type, Py_TYPE(v)->tp_name);
+            break;
+        default:
+            PyErr_Format(PyExc_TypeError,
+                    "%%%c format: a number is required, "
+                    "not %.200s",
+                    type, Py_TYPE(v)->tp_name);
+            break;
+    }
     return -1;
 }
 
@@ -14108,22 +14134,10 @@ formatchar(PyObject *v)
         PyObject *iobj;
         long x;
         /* make sure number is a type of integer */
-        /* if not, issue deprecation warning for now */
         if (!PyLong_Check(v)) {
             iobj = PyNumber_Index(v);
             if (iobj == NULL) {
-                PyErr_Clear();
-                if (PyErr_WarnEx(PyExc_DeprecationWarning,
-                                 "automatic int conversions have been deprecated",
-                                 1)) {
-                    return -1;
-                }
-                iobj = PyNumber_Long(v);
-                if (iobj == NULL ) {
-                    if (PyErr_ExceptionMatches(PyExc_TypeError))
-                        goto onError;
-                    return -1;
-                }
+                goto onError;
             }
             v = iobj;
             Py_DECREF(iobj);
author	Ned Deily <nad@acm.org>	2014-04-09 16:22:19 -0700
committer	Ned Deily <nad@acm.org>	2014-04-09 16:22:19 -0700
commit	a4a8b4599f3ea553e45af913536d0f6985081a8f (patch)
tree	ca2a1984a51a388d86db2a006b5cd5a24fc15d3c /Objects
parent	7c5c123a453e906da293896ed413614236064cce (diff)
parent	8bc92821f69ba96eb0e2acabea54eb3b5e1c86a5 (diff)
download	cpython-a4a8b4599f3ea553e45af913536d0f6985081a8f.tar.gz