diff options
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r-- | Objects/unicodeobject.c | 2206 |
1 files changed, 1133 insertions, 1073 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 6c8fe2d865..0a3712e2f3 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -47,14 +47,6 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #include <windows.h> #endif -/* Endianness switches; defaults to little endian */ - -#ifdef WORDS_BIGENDIAN -# define BYTEORDER_IS_BIG_ENDIAN -#else -# define BYTEORDER_IS_LITTLE_ENDIAN -#endif - /* --- Globals ------------------------------------------------------------ The globals are initialized by the _PyUnicode_Init() API and should @@ -412,8 +404,6 @@ unicode_result_wchar(PyObject *unicode) #ifndef Py_DEBUG Py_ssize_t len; - assert(Py_REFCNT(unicode) == 1); - len = _PyUnicode_WSTR_LENGTH(unicode); if (len == 0) { Py_INCREF(unicode_empty); @@ -431,10 +421,12 @@ unicode_result_wchar(PyObject *unicode) } if (_PyUnicode_Ready(unicode) < 0) { - Py_XDECREF(unicode); + Py_DECREF(unicode); return NULL; } #else + assert(Py_REFCNT(unicode) == 1); + /* don't make the result ready in debug mode to ensure that the caller makes the string ready before using it */ assert(_PyUnicode_CheckConsistency(unicode, 1)); @@ -640,6 +632,25 @@ Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind, } } +#ifdef Py_DEBUG +/* Fill the data of an Unicode string with invalid characters to detect bugs + earlier. + + _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for + ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an + invalid character in Unicode 6.0. */ +static void +unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length) +{ + int kind = PyUnicode_KIND(unicode); + Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode); + Py_ssize_t length = _PyUnicode_LENGTH(unicode); + if (length <= old_length) + return; + memset(data + old_length * kind, 0xff, (length - old_length) * kind); +} +#endif + static PyObject* resize_compact(PyObject *unicode, Py_ssize_t length) { @@ -648,6 +659,10 @@ resize_compact(PyObject *unicode, Py_ssize_t length) Py_ssize_t new_size; int share_wstr; PyObject *new_unicode; +#ifdef Py_DEBUG + Py_ssize_t old_length = _PyUnicode_LENGTH(unicode); +#endif + assert(unicode_modifiable(unicode)); assert(PyUnicode_IS_READY(unicode)); assert(PyUnicode_IS_COMPACT(unicode)); @@ -683,6 +698,9 @@ resize_compact(PyObject *unicode, Py_ssize_t length) if (!PyUnicode_IS_ASCII(unicode)) _PyUnicode_WSTR_LENGTH(unicode) = length; } +#ifdef Py_DEBUG + unicode_fill_invalid(unicode, old_length); +#endif PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), length, 0); assert(_PyUnicode_CheckConsistency(unicode, 0)); @@ -701,6 +719,9 @@ resize_inplace(PyObject *unicode, Py_ssize_t length) Py_ssize_t char_size; int share_wstr, share_utf8; void *data; +#ifdef Py_DEBUG + Py_ssize_t old_length = _PyUnicode_LENGTH(unicode); +#endif data = _PyUnicode_DATA_ANY(unicode); char_size = PyUnicode_KIND(unicode); @@ -736,6 +757,9 @@ resize_inplace(PyObject *unicode, Py_ssize_t length) } _PyUnicode_LENGTH(unicode) = length; PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0); +#ifdef Py_DEBUG + unicode_fill_invalid(unicode, old_length); +#endif if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) { assert(_PyUnicode_CheckConsistency(unicode, 0)); return 0; @@ -788,8 +812,8 @@ resize_copy(PyObject *unicode, Py_ssize_t length) return NULL; copy_length = _PyUnicode_WSTR_LENGTH(unicode); copy_length = Py_MIN(copy_length, length); - Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode), - copy_length); + Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode), + copy_length * sizeof(wchar_t)); return w; } } @@ -1060,11 +1084,7 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) } } #ifdef Py_DEBUG - /* Fill the data with invalid characters to detect bugs earlier. - _PyUnicode_CheckConsistency(str, 1) detects invalid characters, - at least for ASCII and UCS-4 strings. U+00FF is invalid in ASCII - and U+FFFFFFFF is an invalid character in Unicode 6.0. */ - memset(data, 0xff, size * kind); + unicode_fill_invalid((PyObject*)unicode, 0); #endif assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0)); return obj; @@ -1672,6 +1692,14 @@ unicode_write_cstr(PyObject *unicode, Py_ssize_t index, switch (kind) { case PyUnicode_1BYTE_KIND: { assert(index + len <= PyUnicode_GET_LENGTH(unicode)); +#ifdef Py_DEBUG + if (PyUnicode_IS_ASCII(unicode)) { + Py_UCS4 maxchar = ucs1lib_find_max_char( + (const Py_UCS1*)str, + (const Py_UCS1*)str + len); + assert(maxchar < 128); + } +#endif memcpy((char *) data + index, str, len); break; } @@ -2265,16 +2293,9 @@ PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size) static void makefmt(char *fmt, int longflag, int longlongflag, int size_tflag, - int zeropad, int width, int precision, char c) + char c) { *fmt++ = '%'; - if (width) { - if (zeropad) - *fmt++ = '0'; - fmt += sprintf(fmt, "%d", width); - } - if (precision) - fmt += sprintf(fmt, ".%d", precision); if (longflag) *fmt++ = 'l'; else if (longlongflag) { @@ -2299,44 +2320,70 @@ makefmt(char *fmt, int longflag, int longlongflag, int size_tflag, *fmt = '\0'; } -/* helper for PyUnicode_FromFormatV() */ +/* maximum number of characters required for output of %lld or %p. + We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, + plus 1 for the sign. 53/22 is an upper bound for log10(256). */ +#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22) static const char* -parse_format_flags(const char *f, - int *p_width, int *p_precision, - int *p_longflag, int *p_longlongflag, int *p_size_tflag) +unicode_fromformat_arg(_PyUnicodeWriter *writer, + const char *f, va_list *vargs) { - int width, precision, longflag, longlongflag, size_tflag; + const char *p; + Py_ssize_t len; + int zeropad; + int width; + int precision; + int longflag; + int longlongflag; + int size_tflag; + int fill; + + p = f; + f++; + zeropad = 0; + if (*f == '0') { + zeropad = 1; + f++; + } /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */ - f++; width = 0; - while (Py_ISDIGIT((unsigned)*f)) - width = (width*10) + *f++ - '0'; + while (Py_ISDIGIT((unsigned)*f)) { + if (width > (INT_MAX - ((int)*f - '0')) / 10) { + PyErr_SetString(PyExc_ValueError, + "width too big"); + return NULL; + } + width = (width*10) + (*f - '0'); + f++; + } precision = 0; if (*f == '.') { f++; - while (Py_ISDIGIT((unsigned)*f)) - precision = (precision*10) + *f++ - '0'; + while (Py_ISDIGIT((unsigned)*f)) { + if (precision > (INT_MAX - ((int)*f - '0')) / 10) { + PyErr_SetString(PyExc_ValueError, + "precision too big"); + return NULL; + } + precision = (precision*10) + (*f - '0'); + f++; + } if (*f == '%') { /* "%.3%s" => f points to "3" */ f--; } } if (*f == '\0') { - /* bogus format "%.1" => go backward, f points to "1" */ + /* bogus format "%.123" => go backward, f points to "3" */ f--; } - if (p_width != NULL) - *p_width = width; - if (p_precision != NULL) - *p_precision = precision; /* Handle %ld, %lu, %lld and %llu. */ longflag = 0; longlongflag = 0; size_tflag = 0; - if (*f == 'l') { if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') { longflag = 1; @@ -2355,494 +2402,297 @@ parse_format_flags(const char *f, size_tflag = 1; ++f; } - if (p_longflag != NULL) - *p_longflag = longflag; - if (p_longlongflag != NULL) - *p_longlongflag = longlongflag; - if (p_size_tflag != NULL) - *p_size_tflag = size_tflag; - return f; -} -/* maximum number of characters required for output of %ld. 21 characters - allows for 64-bit integers (in decimal) and an optional sign. */ -#define MAX_LONG_CHARS 21 -/* maximum number of characters required for output of %lld. - We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, - plus 1 for the sign. 53/22 is an upper bound for log10(256). */ -#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22) + if (f[1] == '\0') + writer->overallocate = 0; -PyObject * -PyUnicode_FromFormatV(const char *format, va_list vargs) -{ - va_list count; - Py_ssize_t callcount = 0; - PyObject **callresults = NULL; - PyObject **callresult = NULL; - Py_ssize_t n = 0; - int width = 0; - int precision = 0; - int zeropad; - const char* f; - PyObject *string; - /* used by sprintf */ - char fmt[61]; /* should be enough for %0width.precisionlld */ - Py_UCS4 maxchar = 127; /* result is ASCII by default */ - Py_UCS4 argmaxchar; - Py_ssize_t numbersize = 0; - char *numberresults = NULL; - char *numberresult = NULL; - Py_ssize_t i; - int kind; - void *data; + switch (*f) { + case 'c': + { + int ordinal = va_arg(*vargs, int); + if (ordinal < 0 || ordinal > MAX_UNICODE) { + PyErr_SetString(PyExc_ValueError, + "character argument not in range(0x110000)"); + return NULL; + } + if (_PyUnicodeWriter_Prepare(writer, 1, ordinal) == -1) + return NULL; + PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ordinal); + writer->pos++; + break; + } - Py_VA_COPY(count, vargs); - /* step 1: count the number of %S/%R/%A/%s format specifications - * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/ - * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the - * result in an array) - * also estimate a upper bound for all the number formats in the string, - * numbers will be formatted in step 3 and be kept in a '\0'-separated - * buffer before putting everything together. */ - for (f = format; *f; f++) { - if (*f == '%') { - int longlongflag; - /* skip width or width.precision (eg. "1.2" of "%1.2f") */ - f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL); - if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V') - ++callcount; + case 'i': + case 'd': + case 'u': + case 'x': + { + /* used by sprintf */ + char fmt[10]; /* should be enough for "%0lld\0" */ + char buffer[MAX_LONG_LONG_CHARS]; + + if (*f == 'u') { + makefmt(fmt, longflag, longlongflag, size_tflag, *f); - else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') { + if (longflag) + len = sprintf(buffer, fmt, + va_arg(*vargs, unsigned long)); #ifdef HAVE_LONG_LONG - if (longlongflag) { - if (width < MAX_LONG_LONG_CHARS) - width = MAX_LONG_LONG_CHARS; - } - else + else if (longlongflag) + len = sprintf(buffer, fmt, + va_arg(*vargs, unsigned PY_LONG_LONG)); #endif - /* MAX_LONG_CHARS is enough to hold a 64-bit integer, - including sign. Decimal takes the most space. This - isn't enough for octal. If a width is specified we - need more (which we allocate later). */ - if (width < MAX_LONG_CHARS) - width = MAX_LONG_CHARS; - - /* account for the size + '\0' to separate numbers - inside of the numberresults buffer */ - numbersize += (width + 1); + else if (size_tflag) + len = sprintf(buffer, fmt, + va_arg(*vargs, size_t)); + else + len = sprintf(buffer, fmt, + va_arg(*vargs, unsigned int)); + } + else if (*f == 'x') { + makefmt(fmt, 0, 0, 0, 'x'); + len = sprintf(buffer, fmt, va_arg(*vargs, int)); + } + else { + makefmt(fmt, longflag, longlongflag, size_tflag, *f); + + if (longflag) + len = sprintf(buffer, fmt, + va_arg(*vargs, long)); +#ifdef HAVE_LONG_LONG + else if (longlongflag) + len = sprintf(buffer, fmt, + va_arg(*vargs, PY_LONG_LONG)); +#endif + else if (size_tflag) + len = sprintf(buffer, fmt, + va_arg(*vargs, Py_ssize_t)); + else + len = sprintf(buffer, fmt, + va_arg(*vargs, int)); + } + assert(len >= 0); + + if (precision < len) + precision = len; + if (width > precision) { + Py_UCS4 fillchar; + fill = width - precision; + fillchar = zeropad?'0':' '; + if (_PyUnicodeWriter_Prepare(writer, fill, fillchar) == -1) + return NULL; + if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1) + return NULL; + writer->pos += fill; + } + if (precision > len) { + fill = precision - len; + if (_PyUnicodeWriter_Prepare(writer, fill, '0') == -1) + return NULL; + if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1) + return NULL; + writer->pos += fill; + } + if (_PyUnicodeWriter_WriteCstr(writer, buffer, len) == -1) + return NULL; + break; + } + + case 'p': + { + char number[MAX_LONG_LONG_CHARS]; + + len = sprintf(number, "%p", va_arg(*vargs, void*)); + assert(len >= 0); + + /* %p is ill-defined: ensure leading 0x. */ + if (number[1] == 'X') + number[1] = 'x'; + else if (number[1] != 'x') { + memmove(number + 2, number, + strlen(number) + 1); + number[0] = '0'; + number[1] = 'x'; + len += 2; + } + + if (_PyUnicodeWriter_WriteCstr(writer, number, len) == -1) + return NULL; + break; + } + + case 's': + { + /* UTF-8 */ + const char *s = va_arg(*vargs, const char*); + PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL); + if (!str) + return NULL; + if (_PyUnicodeWriter_WriteStr(writer, str) == -1) { + Py_DECREF(str); + return NULL; + } + Py_DECREF(str); + break; + } + + case 'U': + { + PyObject *obj = va_arg(*vargs, PyObject *); + assert(obj && _PyUnicode_CHECK(obj)); + + if (_PyUnicodeWriter_WriteStr(writer, obj) == -1) + return NULL; + break; + } + + case 'V': + { + PyObject *obj = va_arg(*vargs, PyObject *); + const char *str = va_arg(*vargs, const char *); + PyObject *str_obj; + assert(obj || str); + if (obj) { + assert(_PyUnicode_CHECK(obj)); + if (_PyUnicodeWriter_WriteStr(writer, obj) == -1) + return NULL; + } + else { + str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL); + if (!str_obj) + return NULL; + if (_PyUnicodeWriter_WriteStr(writer, str_obj) == -1) { + Py_DECREF(str_obj); + return NULL; } + Py_DECREF(str_obj); } - else if ((unsigned char)*f > 127) { - PyErr_Format(PyExc_ValueError, - "PyUnicode_FromFormatV() expects an ASCII-encoded format " - "string, got a non-ASCII byte: 0x%02x", - (unsigned char)*f); + break; + } + + case 'S': + { + PyObject *obj = va_arg(*vargs, PyObject *); + PyObject *str; + assert(obj); + str = PyObject_Str(obj); + if (!str) + return NULL; + if (_PyUnicodeWriter_WriteStr(writer, str) == -1) { + Py_DECREF(str); return NULL; } + Py_DECREF(str); + break; } - /* step 2: allocate memory for the results of - * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */ - if (callcount) { - callresults = PyObject_Malloc(sizeof(PyObject *) * callcount); - if (!callresults) { - PyErr_NoMemory(); + + case 'R': + { + PyObject *obj = va_arg(*vargs, PyObject *); + PyObject *repr; + assert(obj); + repr = PyObject_Repr(obj); + if (!repr) + return NULL; + if (_PyUnicodeWriter_WriteStr(writer, repr) == -1) { + Py_DECREF(repr); return NULL; } - callresult = callresults; + Py_DECREF(repr); + break; } - /* step 2.5: allocate memory for the results of formating numbers */ - if (numbersize) { - numberresults = PyObject_Malloc(numbersize); - if (!numberresults) { - PyErr_NoMemory(); - goto fail; + + case 'A': + { + PyObject *obj = va_arg(*vargs, PyObject *); + PyObject *ascii; + assert(obj); + ascii = PyObject_ASCII(obj); + if (!ascii) + return NULL; + if (_PyUnicodeWriter_WriteStr(writer, ascii) == -1) { + Py_DECREF(ascii); + return NULL; } - numberresult = numberresults; + Py_DECREF(ascii); + break; } - /* step 3: format numbers and figure out how large a buffer we need */ - for (f = format; *f; f++) { - if (*f == '%') { - const char* p; - int longflag; - int longlongflag; - int size_tflag; - int numprinted; + case '%': + if (_PyUnicodeWriter_Prepare(writer, 1, '%') == 1) + return NULL; + PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '%'); + writer->pos++; + break; - p = f; - zeropad = (f[1] == '0'); - f = parse_format_flags(f, &width, &precision, - &longflag, &longlongflag, &size_tflag); - switch (*f) { - case 'c': - { - Py_UCS4 ordinal = va_arg(count, int); - maxchar = MAX_MAXCHAR(maxchar, ordinal); - n++; - break; - } - case '%': - n++; - break; - case 'i': - case 'd': - makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, - width, precision, *f); - if (longflag) - numprinted = sprintf(numberresult, fmt, - va_arg(count, long)); -#ifdef HAVE_LONG_LONG - else if (longlongflag) - numprinted = sprintf(numberresult, fmt, - va_arg(count, PY_LONG_LONG)); -#endif - else if (size_tflag) - numprinted = sprintf(numberresult, fmt, - va_arg(count, Py_ssize_t)); - else - numprinted = sprintf(numberresult, fmt, - va_arg(count, int)); - n += numprinted; - /* advance by +1 to skip over the '\0' */ - numberresult += (numprinted + 1); - assert(*(numberresult - 1) == '\0'); - assert(*(numberresult - 2) != '\0'); - assert(numprinted >= 0); - assert(numberresult <= numberresults + numbersize); - break; - case 'u': - makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, - width, precision, 'u'); - if (longflag) - numprinted = sprintf(numberresult, fmt, - va_arg(count, unsigned long)); -#ifdef HAVE_LONG_LONG - else if (longlongflag) - numprinted = sprintf(numberresult, fmt, - va_arg(count, unsigned PY_LONG_LONG)); -#endif - else if (size_tflag) - numprinted = sprintf(numberresult, fmt, - va_arg(count, size_t)); - else - numprinted = sprintf(numberresult, fmt, - va_arg(count, unsigned int)); - n += numprinted; - numberresult += (numprinted + 1); - assert(*(numberresult - 1) == '\0'); - assert(*(numberresult - 2) != '\0'); - assert(numprinted >= 0); - assert(numberresult <= numberresults + numbersize); - break; - case 'x': - makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x'); - numprinted = sprintf(numberresult, fmt, va_arg(count, int)); - n += numprinted; - numberresult += (numprinted + 1); - assert(*(numberresult - 1) == '\0'); - assert(*(numberresult - 2) != '\0'); - assert(numprinted >= 0); - assert(numberresult <= numberresults + numbersize); - break; - case 'p': - numprinted = sprintf(numberresult, "%p", va_arg(count, void*)); - /* %p is ill-defined: ensure leading 0x. */ - if (numberresult[1] == 'X') - numberresult[1] = 'x'; - else if (numberresult[1] != 'x') { - memmove(numberresult + 2, numberresult, - strlen(numberresult) + 1); - numberresult[0] = '0'; - numberresult[1] = 'x'; - numprinted += 2; - } - n += numprinted; - numberresult += (numprinted + 1); - assert(*(numberresult - 1) == '\0'); - assert(*(numberresult - 2) != '\0'); - assert(numprinted >= 0); - assert(numberresult <= numberresults + numbersize); - break; - case 's': - { - /* UTF-8 */ - const char *s = va_arg(count, const char*); - PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL); - if (!str) - goto fail; - /* since PyUnicode_DecodeUTF8 returns already flexible - unicode objects, there is no need to call ready on them */ - argmaxchar = PyUnicode_MAX_CHAR_VALUE(str); - maxchar = MAX_MAXCHAR(maxchar, argmaxchar); - n += PyUnicode_GET_LENGTH(str); - /* Remember the str and switch to the next slot */ - *callresult++ = str; - break; - } - case 'U': - { - PyObject *obj = va_arg(count, PyObject *); - assert(obj && _PyUnicode_CHECK(obj)); - if (PyUnicode_READY(obj) == -1) - goto fail; - argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj); - maxchar = MAX_MAXCHAR(maxchar, argmaxchar); - n += PyUnicode_GET_LENGTH(obj); - break; - } - case 'V': - { - PyObject *obj = va_arg(count, PyObject *); - const char *str = va_arg(count, const char *); - PyObject *str_obj; - assert(obj || str); - assert(!obj || _PyUnicode_CHECK(obj)); - if (obj) { - if (PyUnicode_READY(obj) == -1) - goto fail; - argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj); - maxchar = MAX_MAXCHAR(maxchar, argmaxchar); - n += PyUnicode_GET_LENGTH(obj); - *callresult++ = NULL; - } - else { - str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL); - if (!str_obj) - goto fail; - if (PyUnicode_READY(str_obj) == -1) { - Py_DECREF(str_obj); - goto fail; - } - argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj); - maxchar = MAX_MAXCHAR(maxchar, argmaxchar); - n += PyUnicode_GET_LENGTH(str_obj); - *callresult++ = str_obj; - } - break; - } - case 'S': - { - PyObject *obj = va_arg(count, PyObject *); - PyObject *str; - assert(obj); - str = PyObject_Str(obj); - if (!str) - goto fail; - if (PyUnicode_READY(str) == -1) { - Py_DECREF(str); - goto fail; - } - argmaxchar = PyUnicode_MAX_CHAR_VALUE(str); - maxchar = MAX_MAXCHAR(maxchar, argmaxchar); - n += PyUnicode_GET_LENGTH(str); - /* Remember the str and switch to the next slot */ - *callresult++ = str; - break; - } - case 'R': - { - PyObject *obj = va_arg(count, PyObject *); - PyObject *repr; - assert(obj); - repr = PyObject_Repr(obj); - if (!repr) - goto fail; - if (PyUnicode_READY(repr) == -1) { - Py_DECREF(repr); - goto fail; - } - argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr); - maxchar = MAX_MAXCHAR(maxchar, argmaxchar); - n += PyUnicode_GET_LENGTH(repr); - /* Remember the repr and switch to the next slot */ - *callresult++ = repr; - break; - } - case 'A': - { - PyObject *obj = va_arg(count, PyObject *); - PyObject *ascii; - assert(obj); - ascii = PyObject_ASCII(obj); - if (!ascii) - goto fail; - if (PyUnicode_READY(ascii) == -1) { - Py_DECREF(ascii); - goto fail; - } - argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii); - maxchar = MAX_MAXCHAR(maxchar, argmaxchar); - n += PyUnicode_GET_LENGTH(ascii); - /* Remember the repr and switch to the next slot */ - *callresult++ = ascii; - break; - } - default: - /* if we stumble upon an unknown - formatting code, copy the rest of - the format string to the output - string. (we cannot just skip the - code, since there's no way to know - what's in the argument list) */ - n += strlen(p); - goto expand; - } - } else - n++; - } - expand: - /* step 4: fill the buffer */ - /* Since we've analyzed how much space we need, - we don't have to resize the string. - There can be no errors beyond this point. */ - string = PyUnicode_New(n, maxchar); - if (!string) - goto fail; - kind = PyUnicode_KIND(string); - data = PyUnicode_DATA(string); - callresult = callresults; - numberresult = numberresults; + default: + /* if we stumble upon an unknown formatting code, copy the rest + of the format string to the output string. (we cannot just + skip the code, since there's no way to know what's in the + argument list) */ + len = strlen(p); + if (_PyUnicodeWriter_WriteCstr(writer, p, len) == -1) + return NULL; + f = p+len; + return f; + } - for (i = 0, f = format; *f; f++) { + f++; + return f; +} + +PyObject * +PyUnicode_FromFormatV(const char *format, va_list vargs) +{ + va_list vargs2; + const char *f; + _PyUnicodeWriter writer; + + _PyUnicodeWriter_Init(&writer, strlen(format) + 100); + + /* va_list may be an array (of 1 item) on some platforms (ex: AMD64). + Copy it to be able to pass a reference to a subfunction. */ + Py_VA_COPY(vargs2, vargs); + + for (f = format; *f; ) { if (*f == '%') { - const char* p; + f = unicode_fromformat_arg(&writer, f, &vargs2); + if (f == NULL) + goto fail; + } + else { + const char *p; + Py_ssize_t len; p = f; - f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL); - /* checking for == because the last argument could be a empty - string, which causes i to point to end, the assert at the end of - the loop */ - assert(i <= PyUnicode_GET_LENGTH(string)); - - switch (*f) { - case 'c': + do { - const int ordinal = va_arg(vargs, int); - PyUnicode_WRITE(kind, data, i++, ordinal); - break; - } - case 'i': - case 'd': - case 'u': - case 'x': - case 'p': - { - Py_ssize_t len; - /* unused, since we already have the result */ - if (*f == 'p') - (void) va_arg(vargs, void *); - else - (void) va_arg(vargs, int); - /* extract the result from numberresults and append. */ - len = strlen(numberresult); - unicode_write_cstr(string, i, numberresult, len); - /* skip over the separating '\0' */ - i += len; - numberresult += len; - assert(*numberresult == '\0'); - numberresult++; - assert(numberresult <= numberresults + numbersize); - break; - } - case 's': - { - /* unused, since we already have the result */ - Py_ssize_t size; - (void) va_arg(vargs, char *); - size = PyUnicode_GET_LENGTH(*callresult); - assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string)); - _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size); - i += size; - /* We're done with the unicode()/repr() => forget it */ - Py_DECREF(*callresult); - /* switch to next unicode()/repr() result */ - ++callresult; - break; - } - case 'U': - { - PyObject *obj = va_arg(vargs, PyObject *); - Py_ssize_t size; - assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string)); - size = PyUnicode_GET_LENGTH(obj); - _PyUnicode_FastCopyCharacters(string, i, obj, 0, size); - i += size; - break; - } - case 'V': - { - Py_ssize_t size; - PyObject *obj = va_arg(vargs, PyObject *); - va_arg(vargs, const char *); - if (obj) { - size = PyUnicode_GET_LENGTH(obj); - assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string)); - _PyUnicode_FastCopyCharacters(string, i, obj, 0, size); - i += size; - } else { - size = PyUnicode_GET_LENGTH(*callresult); - assert(PyUnicode_KIND(*callresult) <= - PyUnicode_KIND(string)); - _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size); - i += size; - Py_DECREF(*callresult); + if ((unsigned char)*p > 127) { + PyErr_Format(PyExc_ValueError, + "PyUnicode_FromFormatV() expects an ASCII-encoded format " + "string, got a non-ASCII byte: 0x%02x", + (unsigned char)*p); + return NULL; } - ++callresult; - break; - } - case 'S': - case 'R': - case 'A': - { - Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult); - /* unused, since we already have the result */ - (void) va_arg(vargs, PyObject *); - assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string)); - _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size); - i += size; - /* We're done with the unicode()/repr() => forget it */ - Py_DECREF(*callresult); - /* switch to next unicode()/repr() result */ - ++callresult; - break; - } - case '%': - PyUnicode_WRITE(kind, data, i++, '%'); - break; - default: - { - Py_ssize_t len = strlen(p); - unicode_write_cstr(string, i, p, len); - i += len; - assert(i == PyUnicode_GET_LENGTH(string)); - goto end; - } + p++; } - } - else { - assert(i < PyUnicode_GET_LENGTH(string)); - PyUnicode_WRITE(kind, data, i++, *f); + while (*p != '\0' && *p != '%'); + len = p - f; + + if (*p == '\0') + writer.overallocate = 0; + if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1) + goto fail; + unicode_write_cstr(writer.buffer, writer.pos, f, len); + writer.pos += len; + + f = p; } } - assert(i == PyUnicode_GET_LENGTH(string)); + return _PyUnicodeWriter_Finish(&writer); - end: - if (callresults) - PyObject_Free(callresults); - if (numberresults) - PyObject_Free(numberresults); - return unicode_result(string); fail: - if (callresults) { - PyObject **callresult2 = callresults; - while (callresult2 < callresult) { - Py_XDECREF(*callresult2); - ++callresult2; - } - PyObject_Free(callresults); - } - if (numberresults) - PyObject_Free(numberresults); + _PyUnicodeWriter_Dealloc(&writer); return NULL; } @@ -3723,18 +3573,20 @@ PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) int -_PyUnicode_HasNULChars(PyObject* s) +_PyUnicode_HasNULChars(PyObject* str) { - static PyObject *nul = NULL; + Py_ssize_t pos; - if (nul == NULL) - nul = PyUnicode_FromStringAndSize("\0", 1); - if (nul == NULL) + if (PyUnicode_READY(str) == -1) return -1; - return PyUnicode_Contains(s, nul); + pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str), + PyUnicode_GET_LENGTH(str), '\0', 1); + if (pos == -1) + return 0; + else + return 1; } - int PyUnicode_FSConverter(PyObject* arg, void* addr) { @@ -4955,7 +4807,7 @@ PyUnicode_DecodeUTF32Stateful(const char *s, int bo = 0; /* assume native ordering by default */ const char *errmsg = ""; /* Offsets from q for retrieving bytes in the right order. */ -#ifdef BYTEORDER_IS_LITTLE_ENDIAN +#if PY_LITTLE_ENDIAN int iorder[] = {0, 1, 2, 3}; #else int iorder[] = {3, 2, 1, 0}; @@ -4977,7 +4829,7 @@ PyUnicode_DecodeUTF32Stateful(const char *s, if (size >= 4) { const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | (q[iorder[1]] << 8) | q[iorder[0]]; -#ifdef BYTEORDER_IS_LITTLE_ENDIAN +#if PY_LITTLE_ENDIAN if (bom == 0x0000FEFF) { q += 4; bo = -1; @@ -5091,7 +4943,7 @@ _PyUnicode_EncodeUTF32(PyObject *str, unsigned char *p; Py_ssize_t nsize, i; /* Offsets from p for storing byte pairs in the right order. */ -#ifdef BYTEORDER_IS_LITTLE_ENDIAN +#if PY_LITTLE_ENDIAN int iorder[] = {0, 1, 2, 3}; #else int iorder[] = {3, 2, 1, 0}; @@ -5234,7 +5086,7 @@ PyUnicode_DecodeUTF16Stateful(const char *s, return unicode_empty; } -#ifdef BYTEORDER_IS_LITTLE_ENDIAN +#if PY_LITTLE_ENDIAN native_ordering = bo <= 0; #else native_ordering = bo >= 0; @@ -5351,7 +5203,7 @@ _PyUnicode_EncodeUTF16(PyObject *str, unsigned short *out; Py_ssize_t bytesize; Py_ssize_t pairs; -#ifdef WORDS_BIGENDIAN +#if PY_BIG_ENDIAN int native_ordering = byteorder >= 0; #else int native_ordering = byteorder <= 0; @@ -8214,19 +8066,6 @@ make_translate_exception(PyObject **exceptionObject, } } -/* raises a UnicodeTranslateError */ -static void -raise_translate_exception(PyObject **exceptionObject, - PyObject *unicode, - Py_ssize_t startpos, Py_ssize_t endpos, - const char *reason) -{ - make_translate_exception(exceptionObject, - unicode, startpos, endpos, reason); - if (*exceptionObject != NULL) - PyCodec_StrictErrors(*exceptionObject); -} - /* error handling callback helper: build arguments, call the callback and check the arguments, put the result into newpos and return the replacement string, which @@ -8502,8 +8341,10 @@ _PyUnicode_TranslateCharmap(PyObject *input, } switch (known_errorHandler) { case 1: /* strict */ - raise_translate_exception(&exc, input, collstart, - collend, reason); + make_translate_exception(&exc, + input, collstart, collend, reason); + if (exc != NULL) + PyCodec_StrictErrors(exc); goto onError; case 2: /* replace */ /* No need to check for space, this is a 1:1 replacement */ @@ -10404,7 +10245,12 @@ unicode_compare(PyObject *str1, PyObject *str2) { int kind1, kind2; void *data1, *data2; - Py_ssize_t len1, len2, i; + Py_ssize_t len1, len2; + Py_ssize_t i, len; + + /* a string is equal to itself */ + if (str1 == str2) + return 0; kind1 = PyUnicode_KIND(str1); kind2 = PyUnicode_KIND(str2); @@ -10412,19 +10258,61 @@ unicode_compare(PyObject *str1, PyObject *str2) data2 = PyUnicode_DATA(str2); len1 = PyUnicode_GET_LENGTH(str1); len2 = PyUnicode_GET_LENGTH(str2); + len = Py_MIN(len1, len2); - for (i = 0; i < len1 && i < len2; ++i) { - Py_UCS4 c1, c2; - c1 = PyUnicode_READ(kind1, data1, i); - c2 = PyUnicode_READ(kind2, data2, i); + if (kind1 == 1 && kind2 == 1) { + int cmp = memcmp(data1, data2, len); + /* normalize result of memcmp() into the range [-1; 1] */ + if (cmp < 0) + return -1; + if (cmp > 0) + return 1; + } + else { + for (i = 0; i < len; ++i) { + Py_UCS4 c1, c2; + c1 = PyUnicode_READ(kind1, data1, i); + c2 = PyUnicode_READ(kind2, data2, i); - if (c1 != c2) - return (c1 < c2) ? -1 : 1; + if (c1 != c2) + return (c1 < c2) ? -1 : 1; + } } - return (len1 < len2) ? -1 : (len1 != len2); + if (len1 == len2) + return 0; + if (len1 < len2) + return -1; + else + return 1; +} + +static int +unicode_compare_eq(PyObject *str1, PyObject *str2) +{ + int kind; + void *data1, *data2; + Py_ssize_t len; + int cmp; + + /* a string is equal to itself */ + if (str1 == str2) + return 1; + + len = PyUnicode_GET_LENGTH(str1); + if (PyUnicode_GET_LENGTH(str2) != len) + return 0; + kind = PyUnicode_KIND(str1); + if (PyUnicode_KIND(str2) != kind) + return 0; + data1 = PyUnicode_DATA(str1); + data2 = PyUnicode_DATA(str2); + + cmp = memcmp(data1, data2, len * kind); + return (cmp == 0); } + int PyUnicode_Compare(PyObject *left, PyObject *right) { @@ -10475,36 +10363,27 @@ PyObject * PyUnicode_RichCompare(PyObject *left, PyObject *right, int op) { int result; + PyObject *v; - if (PyUnicode_Check(left) && PyUnicode_Check(right)) { - PyObject *v; - if (PyUnicode_READY(left) == -1 || - PyUnicode_READY(right) == -1) - return NULL; - if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) || - PyUnicode_KIND(left) != PyUnicode_KIND(right)) { - if (op == Py_EQ) { - Py_INCREF(Py_False); - return Py_False; - } - if (op == Py_NE) { - Py_INCREF(Py_True); - return Py_True; - } - } - if (left == right) - result = 0; + if (!PyUnicode_Check(left) || !PyUnicode_Check(right)) + Py_RETURN_NOTIMPLEMENTED; + + if (PyUnicode_READY(left) == -1 || + PyUnicode_READY(right) == -1) + return NULL; + + if (op == Py_EQ || op == Py_NE) { + result = unicode_compare_eq(left, right); + if (op == Py_EQ) + v = TEST_COND(result); else - result = unicode_compare(left, right); + v = TEST_COND(!result); + } + else { + result = unicode_compare(left, right); /* Convert the return value to a Boolean */ switch (op) { - case Py_EQ: - v = TEST_COND(result == 0); - break; - case Py_NE: - v = TEST_COND(result != 0); - break; case Py_LE: v = TEST_COND(result <= 0); break; @@ -10521,11 +10400,9 @@ PyUnicode_RichCompare(PyObject *left, PyObject *right, int op) PyErr_BadArgument(); return NULL; } - Py_INCREF(v); - return v; } - - Py_RETURN_NOTIMPLEMENTED; + Py_INCREF(v); + return v; } int @@ -12908,6 +12785,19 @@ _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str) return 0; } +int +_PyUnicodeWriter_WriteCstr(_PyUnicodeWriter *writer, const char *str, Py_ssize_t len) +{ + Py_UCS4 maxchar; + + maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len); + if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1) + return -1; + unicode_write_cstr(writer->buffer, writer->pos, str, len); + writer->pos += len; + return 0; +} + PyObject * _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer) { @@ -13185,16 +13075,39 @@ static PyMappingMethods unicode_as_mapping = { /* Helpers for PyUnicode_Format() */ +struct unicode_formatter_t { + PyObject *args; + int args_owned; + Py_ssize_t arglen, argidx; + PyObject *dict; + + enum PyUnicode_Kind fmtkind; + Py_ssize_t fmtcnt, fmtpos; + void *fmtdata; + PyObject *fmtstr; + + _PyUnicodeWriter writer; +}; + +struct unicode_format_arg_t { + Py_UCS4 ch; + int flags; + Py_ssize_t width; + int prec; + int sign; +}; + static PyObject * -getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx) +unicode_format_getnextarg(struct unicode_formatter_t *ctx) { - Py_ssize_t argidx = *p_argidx; - if (argidx < arglen) { - (*p_argidx)++; - if (arglen < 0) - return args; + Py_ssize_t argidx = ctx->argidx; + + if (argidx < ctx->arglen) { + ctx->argidx++; + if (ctx->arglen < 0) + return ctx->args; else - return PyTuple_GetItem(args, argidx); + return PyTuple_GetItem(ctx->args, argidx); } PyErr_SetString(PyExc_TypeError, "not enough arguments for format string"); @@ -13203,23 +13116,34 @@ getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx) /* Returns a new reference to a PyUnicode object, or NULL on failure. */ +/* Format a float into the writer if the writer is not NULL, or into *p_output + otherwise. + + Return 0 on success, raise an exception and return -1 on error. */ static int -formatfloat(PyObject *v, int flags, int prec, int type, - PyObject **p_output, _PyUnicodeWriter *writer) +formatfloat(PyObject *v, struct unicode_format_arg_t *arg, + PyObject **p_output, + _PyUnicodeWriter *writer) { char *p; double x; Py_ssize_t len; + int prec; + int dtoa_flags; x = PyFloat_AsDouble(v); if (x == -1.0 && PyErr_Occurred()) return -1; + prec = arg->prec; if (prec < 0) prec = 6; - p = PyOS_double_to_string(x, type, prec, - (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL); + if (arg->flags & F_ALT) + dtoa_flags = Py_DTSF_ALT; + else + dtoa_flags = 0; + p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL); if (p == NULL) return -1; len = strlen(p); @@ -13256,7 +13180,7 @@ formatfloat(PyObject *v, int flags, int prec, int type, * produce a '-' sign, but can for Python's unbounded ints. */ static PyObject* -formatlong(PyObject *val, int flags, int prec, int type) +formatlong(PyObject *val, struct unicode_format_arg_t *arg) { PyObject *result = NULL; char *buf; @@ -13266,6 +13190,8 @@ formatlong(PyObject *val, int flags, int prec, int type) Py_ssize_t llen; int numdigits; /* len == numnondigits + numdigits */ int numnondigits = 0; + int prec = arg->prec; + int type = arg->ch; /* Avoid exceeding SSIZE_T_MAX */ if (prec > INT_MAX-3) { @@ -13277,7 +13203,10 @@ formatlong(PyObject *val, int flags, int prec, int type) assert(PyLong_Check(val)); switch (type) { + default: + assert(!"'type' not in [diuoxX]"); case 'd': + case 'i': case 'u': /* Special-case boolean: we want 0/1 */ if (PyBool_Check(val)) @@ -13294,8 +13223,6 @@ formatlong(PyObject *val, int flags, int prec, int type) numnondigits = 2; result = PyNumber_ToBase(val, 16); break; - default: - assert(!"'type' not in [duoxX]"); } if (!result) return NULL; @@ -13323,7 +13250,7 @@ formatlong(PyObject *val, int flags, int prec, int type) assert(numdigits > 0); /* Get rid of base marker unless F_ALT */ - if (((flags & F_ALT) == 0 && + if (((arg->flags & F_ALT) == 0 && (type == 'o' || type == 'x' || type == 'X'))) { assert(buf[sign] == '0'); assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' || @@ -13368,15 +13295,100 @@ formatlong(PyObject *val, int flags, int prec, int type) if (buf[i] >= 'a' && buf[i] <= 'x') buf[i] -= 'a'-'A'; } - if (!PyUnicode_Check(result) || len != PyUnicode_GET_LENGTH(result)) { + if (!PyUnicode_Check(result) + || buf != PyUnicode_DATA(result)) { PyObject *unicode; unicode = _PyUnicode_FromASCII(buf, len); Py_DECREF(result); result = unicode; } + else if (len != PyUnicode_GET_LENGTH(result)) { + if (PyUnicode_Resize(&result, len) < 0) + Py_CLEAR(result); + } return result; } +/* Format an integer. + * Return 1 if the number has been formatted into the writer, + * 0 if the number has been formatted into *p_output + * -1 and raise an exception on error */ +static int +mainformatlong(PyObject *v, + struct unicode_format_arg_t *arg, + PyObject **p_output, + _PyUnicodeWriter *writer) +{ + PyObject *iobj, *res; + char type = (char)arg->ch; + + if (!PyNumber_Check(v)) + goto wrongtype; + + if (!PyLong_Check(v)) { + iobj = PyNumber_Long(v); + if (iobj == NULL) { + if (PyErr_ExceptionMatches(PyExc_TypeError)) + goto wrongtype; + return -1; + } + assert(PyLong_Check(iobj)); + } + else { + iobj = v; + Py_INCREF(iobj); + } + + if (PyLong_CheckExact(v) + && arg->width == -1 && arg->prec == -1 + && !(arg->flags & (F_SIGN | F_BLANK)) + && type != 'X') + { + /* Fast path */ + int alternate = arg->flags & F_ALT; + int base; + + switch(type) + { + default: + assert(0 && "'type' not in [diuoxX]"); + case 'd': + case 'i': + case 'u': + base = 10; + break; + case 'o': + base = 8; + break; + case 'x': + case 'X': + base = 16; + break; + } + + if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) { + Py_DECREF(iobj); + return -1; + } + Py_DECREF(iobj); + return 1; + } + + res = formatlong(iobj, arg); + Py_DECREF(iobj); + if (res == NULL) + return -1; + *p_output = res; + return 0; + +wrongtype: + PyErr_Format(PyExc_TypeError, + "%%%c format: a number is required, " + "not %.200s", + type, Py_TYPE(v)->tp_name); + return -1; +} + static Py_UCS4 formatchar(PyObject *v) { @@ -13409,540 +13421,588 @@ formatchar(PyObject *v) return (Py_UCS4) -1; } -PyObject * -PyUnicode_Format(PyObject *format, PyObject *args) -{ - Py_ssize_t fmtcnt, fmtpos, arglen, argidx; - int args_owned = 0; - PyObject *dict = NULL; - PyObject *temp = NULL; - PyObject *second = NULL; - PyObject *uformat; - void *fmt; - enum PyUnicode_Kind kind, fmtkind; - _PyUnicodeWriter writer; - Py_ssize_t sublen; - Py_UCS4 maxchar; +/* Parse options of an argument: flags, width, precision. + Handle also "%(name)" syntax. - if (format == NULL || args == NULL) { - PyErr_BadInternalCall(); - return NULL; - } - uformat = PyUnicode_FromObject(format); - if (uformat == NULL) - return NULL; - if (PyUnicode_READY(uformat) == -1) { - Py_DECREF(uformat); - return NULL; - } + Return 0 if the argument has been formatted into arg->str. + Return 1 if the argument has been written into ctx->writer, + Raise an exception and return -1 on error. */ +static int +unicode_format_arg_parse(struct unicode_formatter_t *ctx, + struct unicode_format_arg_t *arg) +{ +#define FORMAT_READ(ctx) \ + PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos) - fmt = PyUnicode_DATA(uformat); - fmtkind = PyUnicode_KIND(uformat); - fmtcnt = PyUnicode_GET_LENGTH(uformat); - fmtpos = 0; + PyObject *v; - _PyUnicodeWriter_Init(&writer, fmtcnt + 100); + arg->ch = FORMAT_READ(ctx); + if (arg->ch == '(') { + /* Get argument value from a dictionary. Example: "%(name)s". */ + Py_ssize_t keystart; + Py_ssize_t keylen; + PyObject *key; + int pcount = 1; - if (PyTuple_Check(args)) { - arglen = PyTuple_Size(args); - argidx = 0; - } - else { - arglen = -1; - argidx = -2; + if (ctx->dict == NULL) { + PyErr_SetString(PyExc_TypeError, + "format requires a mapping"); + return -1; + } + ++ctx->fmtpos; + --ctx->fmtcnt; + keystart = ctx->fmtpos; + /* Skip over balanced parentheses */ + while (pcount > 0 && --ctx->fmtcnt >= 0) { + arg->ch = FORMAT_READ(ctx); + if (arg->ch == ')') + --pcount; + else if (arg->ch == '(') + ++pcount; + ctx->fmtpos++; + } + keylen = ctx->fmtpos - keystart - 1; + if (ctx->fmtcnt < 0 || pcount > 0) { + PyErr_SetString(PyExc_ValueError, + "incomplete format key"); + return -1; + } + key = PyUnicode_Substring(ctx->fmtstr, + keystart, keystart + keylen); + if (key == NULL) + return -1; + if (ctx->args_owned) { + Py_DECREF(ctx->args); + ctx->args_owned = 0; + } + ctx->args = PyObject_GetItem(ctx->dict, key); + Py_DECREF(key); + if (ctx->args == NULL) + return -1; + ctx->args_owned = 1; + ctx->arglen = -1; + ctx->argidx = -2; + } + + /* Parse flags. Example: "%+i" => flags=F_SIGN. */ + arg->flags = 0; + while (--ctx->fmtcnt >= 0) { + arg->ch = FORMAT_READ(ctx); + ctx->fmtpos++; + switch (arg->ch) { + case '-': arg->flags |= F_LJUST; continue; + case '+': arg->flags |= F_SIGN; continue; + case ' ': arg->flags |= F_BLANK; continue; + case '#': arg->flags |= F_ALT; continue; + case '0': arg->flags |= F_ZERO; continue; + } + break; } - if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args)) - dict = args; - - while (--fmtcnt >= 0) { - if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') { - Py_ssize_t nonfmtpos; - nonfmtpos = fmtpos++; - while (fmtcnt >= 0 && - PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') { - fmtpos++; - fmtcnt--; - } - if (fmtcnt < 0) - fmtpos--; - sublen = fmtpos - nonfmtpos; - maxchar = _PyUnicode_FindMaxChar(uformat, - nonfmtpos, nonfmtpos + sublen); - if (_PyUnicodeWriter_Prepare(&writer, sublen, maxchar) == -1) - goto onError; - _PyUnicode_FastCopyCharacters(writer.buffer, writer.pos, - uformat, nonfmtpos, sublen); - writer.pos += sublen; + /* Parse width. Example: "%10s" => width=10 */ + arg->width = -1; + if (arg->ch == '*') { + v = unicode_format_getnextarg(ctx); + if (v == NULL) + return -1; + if (!PyLong_Check(v)) { + PyErr_SetString(PyExc_TypeError, + "* wants int"); + return -1; } - else { - /* Got a format specifier */ - int flags = 0; - Py_ssize_t width = -1; - int prec = -1; - Py_UCS4 c = '\0'; - Py_UCS4 fill; - int sign; - Py_UCS4 signchar; - int isnumok; - PyObject *v = NULL; - void *pbuf = NULL; - Py_ssize_t pindex, len; - Py_UCS4 bufmaxchar; - Py_ssize_t buflen; - - fmtpos++; - c = PyUnicode_READ(fmtkind, fmt, fmtpos); - if (c == '(') { - Py_ssize_t keystart; - Py_ssize_t keylen; - PyObject *key; - int pcount = 1; - - if (dict == NULL) { - PyErr_SetString(PyExc_TypeError, - "format requires a mapping"); - goto onError; - } - ++fmtpos; - --fmtcnt; - keystart = fmtpos; - /* Skip over balanced parentheses */ - while (pcount > 0 && --fmtcnt >= 0) { - c = PyUnicode_READ(fmtkind, fmt, fmtpos); - if (c == ')') - --pcount; - else if (c == '(') - ++pcount; - fmtpos++; - } - keylen = fmtpos - keystart - 1; - if (fmtcnt < 0 || pcount > 0) { - PyErr_SetString(PyExc_ValueError, - "incomplete format key"); - goto onError; - } - key = PyUnicode_Substring(uformat, - keystart, keystart + keylen); - if (key == NULL) - goto onError; - if (args_owned) { - Py_DECREF(args); - args_owned = 0; - } - args = PyObject_GetItem(dict, key); - Py_DECREF(key); - if (args == NULL) { - goto onError; - } - args_owned = 1; - arglen = -1; - argidx = -2; - } - while (--fmtcnt >= 0) { - c = PyUnicode_READ(fmtkind, fmt, fmtpos++); - switch (c) { - case '-': flags |= F_LJUST; continue; - case '+': flags |= F_SIGN; continue; - case ' ': flags |= F_BLANK; continue; - case '#': flags |= F_ALT; continue; - case '0': flags |= F_ZERO; continue; - } + arg->width = PyLong_AsLong(v); + if (arg->width == -1 && PyErr_Occurred()) + return -1; + if (arg->width < 0) { + arg->flags |= F_LJUST; + arg->width = -arg->width; + } + if (--ctx->fmtcnt >= 0) { + arg->ch = FORMAT_READ(ctx); + ctx->fmtpos++; + } + } + else if (arg->ch >= '0' && arg->ch <= '9') { + arg->width = arg->ch - '0'; + while (--ctx->fmtcnt >= 0) { + arg->ch = FORMAT_READ(ctx); + ctx->fmtpos++; + if (arg->ch < '0' || arg->ch > '9') break; + /* Since arg->ch is unsigned, the RHS would end up as unsigned, + mixing signed and unsigned comparison. Since arg->ch is between + '0' and '9', casting to int is safe. */ + if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) { + PyErr_SetString(PyExc_ValueError, + "width too big"); + return -1; } - if (c == '*') { - v = getnextarg(args, arglen, &argidx); - if (v == NULL) - goto onError; - if (!PyLong_Check(v)) { - PyErr_SetString(PyExc_TypeError, - "* wants int"); - goto onError; - } - width = PyLong_AsLong(v); - if (width == -1 && PyErr_Occurred()) - goto onError; - if (width < 0) { - flags |= F_LJUST; - width = -width; - } - if (--fmtcnt >= 0) - c = PyUnicode_READ(fmtkind, fmt, fmtpos++); - } - else if (c >= '0' && c <= '9') { - width = c - '0'; - while (--fmtcnt >= 0) { - c = PyUnicode_READ(fmtkind, fmt, fmtpos++); - if (c < '0' || c > '9') - break; - /* Since c is unsigned, the RHS would end up as unsigned, - mixing signed and unsigned comparison. Since c is between - '0' and '9', casting to int is safe. */ - if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) { - PyErr_SetString(PyExc_ValueError, - "width too big"); - goto onError; - } - width = width*10 + (c - '0'); - } + arg->width = arg->width*10 + (arg->ch - '0'); + } + } + + /* Parse precision. Example: "%.3f" => prec=3 */ + arg->prec = -1; + if (arg->ch == '.') { + arg->prec = 0; + if (--ctx->fmtcnt >= 0) { + arg->ch = FORMAT_READ(ctx); + ctx->fmtpos++; + } + if (arg->ch == '*') { + v = unicode_format_getnextarg(ctx); + if (v == NULL) + return -1; + if (!PyLong_Check(v)) { + PyErr_SetString(PyExc_TypeError, + "* wants int"); + return -1; } - if (c == '.') { - prec = 0; - if (--fmtcnt >= 0) - c = PyUnicode_READ(fmtkind, fmt, fmtpos++); - if (c == '*') { - v = getnextarg(args, arglen, &argidx); - if (v == NULL) - goto onError; - if (!PyLong_Check(v)) { - PyErr_SetString(PyExc_TypeError, - "* wants int"); - goto onError; - } - prec = PyLong_AsLong(v); - if (prec == -1 && PyErr_Occurred()) - goto onError; - if (prec < 0) - prec = 0; - if (--fmtcnt >= 0) - c = PyUnicode_READ(fmtkind, fmt, fmtpos++); - } - else if (c >= '0' && c <= '9') { - prec = c - '0'; - while (--fmtcnt >= 0) { - c = PyUnicode_READ(fmtkind, fmt, fmtpos++); - if (c < '0' || c > '9') - break; - if (prec > (INT_MAX - ((int)c - '0')) / 10) { - PyErr_SetString(PyExc_ValueError, - "prec too big"); - goto onError; - } - prec = prec*10 + (c - '0'); - } - } - } /* prec */ - if (fmtcnt >= 0) { - if (c == 'h' || c == 'l' || c == 'L') { - if (--fmtcnt >= 0) - c = PyUnicode_READ(fmtkind, fmt, fmtpos++); + arg->prec = PyLong_AsLong(v); + if (arg->prec == -1 && PyErr_Occurred()) + return -1; + if (arg->prec < 0) + arg->prec = 0; + if (--ctx->fmtcnt >= 0) { + arg->ch = FORMAT_READ(ctx); + ctx->fmtpos++; + } + } + else if (arg->ch >= '0' && arg->ch <= '9') { + arg->prec = arg->ch - '0'; + while (--ctx->fmtcnt >= 0) { + arg->ch = FORMAT_READ(ctx); + ctx->fmtpos++; + if (arg->ch < '0' || arg->ch > '9') + break; + if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) { + PyErr_SetString(PyExc_ValueError, + "precision too big"); + return -1; } + arg->prec = arg->prec*10 + (arg->ch - '0'); } - if (fmtcnt < 0) { - PyErr_SetString(PyExc_ValueError, - "incomplete format"); - goto onError; - } - if (fmtcnt == 0) - writer.overallocate = 0; + } + } - if (c == '%') { - if (_PyUnicodeWriter_Prepare(&writer, 1, '%') == -1) - goto onError; - PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '%'); - writer.pos += 1; - continue; + /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */ + if (ctx->fmtcnt >= 0) { + if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') { + if (--ctx->fmtcnt >= 0) { + arg->ch = FORMAT_READ(ctx); + ctx->fmtpos++; } + } + } + if (ctx->fmtcnt < 0) { + PyErr_SetString(PyExc_ValueError, + "incomplete format"); + return -1; + } + return 0; - v = getnextarg(args, arglen, &argidx); - if (v == NULL) - goto onError; +#undef FORMAT_READ +} - sign = 0; - signchar = '\0'; - fill = ' '; - switch (c) { - - case 's': - case 'r': - case 'a': - if (PyLong_CheckExact(v) && width == -1 && prec == -1) { - /* Fast path */ - if (_PyLong_FormatWriter(&writer, v, 10, flags & F_ALT) == -1) - goto onError; - goto nextarg; - } +/* Format one argument. Supported conversion specifiers: - if (PyUnicode_CheckExact(v) && c == 's') { - temp = v; - Py_INCREF(temp); - } - else { - if (c == 's') - temp = PyObject_Str(v); - else if (c == 'r') - temp = PyObject_Repr(v); - else - temp = PyObject_ASCII(v); - } - break; + - "s", "r", "a": any type + - "i", "d", "u", "o", "x", "X": int + - "e", "E", "f", "F", "g", "G": float + - "c": int or str (1 character) - case 'i': - case 'd': - case 'u': - case 'o': - case 'x': - case 'X': - if (PyLong_CheckExact(v) - && width == -1 && prec == -1 - && !(flags & (F_SIGN | F_BLANK))) - { - /* Fast path */ - switch(c) - { - case 'd': - case 'i': - case 'u': - if (_PyLong_FormatWriter(&writer, v, 10, flags & F_ALT) == -1) - goto onError; - goto nextarg; - case 'x': - if (_PyLong_FormatWriter(&writer, v, 16, flags & F_ALT) == -1) - goto onError; - goto nextarg; - case 'o': - if (_PyLong_FormatWriter(&writer, v, 8, flags & F_ALT) == -1) - goto onError; - goto nextarg; - default: - break; - } - } + Return 0 if the argument has been formatted into *p_str, + 1 if the argument has been written into ctx->writer, + -1 on error. */ +static int +unicode_format_arg_format(struct unicode_formatter_t *ctx, + struct unicode_format_arg_t *arg, + PyObject **p_str) +{ + PyObject *v; + _PyUnicodeWriter *writer = &ctx->writer; - isnumok = 0; - if (PyNumber_Check(v)) { - PyObject *iobj=NULL; + if (ctx->fmtcnt == 0) + ctx->writer.overallocate = 0; - if (PyLong_Check(v)) { - iobj = v; - Py_INCREF(iobj); - } - else { - iobj = PyNumber_Long(v); - } - if (iobj!=NULL) { - if (PyLong_Check(iobj)) { - isnumok = 1; - sign = 1; - temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c)); - Py_DECREF(iobj); - } - else { - Py_DECREF(iobj); - } - } - } - if (!isnumok) { - PyErr_Format(PyExc_TypeError, - "%%%c format: a number is required, " - "not %.200s", (char)c, Py_TYPE(v)->tp_name); - goto onError; - } - if (flags & F_ZERO) - fill = '0'; - break; + if (arg->ch == '%') { + if (_PyUnicodeWriter_Prepare(writer, 1, '%') == -1) + return -1; + PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '%'); + writer->pos += 1; + return 1; + } - case 'e': - case 'E': - case 'f': - case 'F': - case 'g': - case 'G': - if (width == -1 && prec == -1 - && !(flags & (F_SIGN | F_BLANK))) - { - /* Fast path */ - if (formatfloat(v, flags, prec, c, NULL, &writer) == -1) - goto onError; - goto nextarg; - } + v = unicode_format_getnextarg(ctx); + if (v == NULL) + return -1; - sign = 1; - if (flags & F_ZERO) - fill = '0'; - if (formatfloat(v, flags, prec, c, &temp, NULL) == -1) - temp = NULL; - break; + arg->sign = 0; - case 'c': - { - Py_UCS4 ch = formatchar(v); - if (ch == (Py_UCS4) -1) - goto onError; - if (width == -1 && prec == -1) { - /* Fast path */ - if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1) - goto onError; - PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch); - writer.pos += 1; - goto nextarg; - } - temp = PyUnicode_FromOrdinal(ch); - break; - } + switch (arg->ch) { - default: - PyErr_Format(PyExc_ValueError, - "unsupported format character '%c' (0x%x) " - "at index %zd", - (31<=c && c<=126) ? (char)c : '?', - (int)c, - fmtpos - 1); - goto onError; - } - if (temp == NULL) - goto onError; - assert (PyUnicode_Check(temp)); + case 's': + case 'r': + case 'a': + if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) { + /* Fast path */ + if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1) + return -1; + return 1; + } - if (width == -1 && prec == -1 - && !(flags & (F_SIGN | F_BLANK))) - { - /* Fast path */ - if (_PyUnicodeWriter_WriteStr(&writer, temp) == -1) - goto onError; - goto nextarg; - } + if (PyUnicode_CheckExact(v) && arg->ch == 's') { + *p_str = v; + Py_INCREF(*p_str); + } + else { + if (arg->ch == 's') + *p_str = PyObject_Str(v); + else if (arg->ch == 'r') + *p_str = PyObject_Repr(v); + else + *p_str = PyObject_ASCII(v); + } + break; - if (PyUnicode_READY(temp) == -1) { - Py_CLEAR(temp); - goto onError; - } - kind = PyUnicode_KIND(temp); - pbuf = PyUnicode_DATA(temp); - len = PyUnicode_GET_LENGTH(temp); + case 'i': + case 'd': + case 'u': + case 'o': + case 'x': + case 'X': + { + int ret = mainformatlong(v, arg, p_str, writer); + if (ret != 0) + return ret; + arg->sign = 1; + break; + } - if (c == 's' || c == 'r' || c == 'a') { - if (prec >= 0 && len > prec) - len = prec; - } + case 'e': + case 'E': + case 'f': + case 'F': + case 'g': + case 'G': + if (arg->width == -1 && arg->prec == -1 + && !(arg->flags & (F_SIGN | F_BLANK))) + { + /* Fast path */ + if (formatfloat(v, arg, NULL, writer) == -1) + return -1; + return 1; + } - /* pbuf is initialized here. */ - pindex = 0; - if (sign) { - Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex); - if (ch == '-' || ch == '+') { - signchar = ch; - len--; - pindex++; - } - else if (flags & F_SIGN) - signchar = '+'; - else if (flags & F_BLANK) - signchar = ' '; - else - sign = 0; - } - if (width < len) - width = len; - - /* Compute the length and maximum character of the - written characters */ - bufmaxchar = 127; - if (!(flags & F_LJUST)) { - if (sign) { - if ((width-1) > len) - bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill); - } - else { - if (width > len) - bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill); - } - } - maxchar = _PyUnicode_FindMaxChar(temp, 0, pindex+len); - bufmaxchar = MAX_MAXCHAR(bufmaxchar, maxchar); + arg->sign = 1; + if (formatfloat(v, arg, p_str, NULL) == -1) + return -1; + break; - buflen = width; - if (sign && len == width) - buflen++; + case 'c': + { + Py_UCS4 ch = formatchar(v); + if (ch == (Py_UCS4) -1) + return -1; + if (arg->width == -1 && arg->prec == -1) { + /* Fast path */ + if (_PyUnicodeWriter_Prepare(writer, 1, ch) == -1) + return -1; + PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch); + writer->pos += 1; + return 1; + } + *p_str = PyUnicode_FromOrdinal(ch); + break; + } - if (_PyUnicodeWriter_Prepare(&writer, buflen, bufmaxchar) == -1) - goto onError; + default: + PyErr_Format(PyExc_ValueError, + "unsupported format character '%c' (0x%x) " + "at index %zd", + (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?', + (int)arg->ch, + ctx->fmtpos - 1); + return -1; + } + if (*p_str == NULL) + return -1; + assert (PyUnicode_Check(*p_str)); + return 0; +} - /* Write characters */ - if (sign) { - if (fill != ' ') { - PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar); - writer.pos += 1; - } - if (width > len) - width--; - } - if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { - assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); - assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c); - if (fill != ' ') { - PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0'); - PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c); - writer.pos += 2; - pindex += 2; - } - width -= 2; - if (width < 0) - width = 0; - len -= 2; - } - if (width > len && !(flags & F_LJUST)) { - sublen = width - len; - FILL(writer.kind, writer.data, fill, writer.pos, sublen); - writer.pos += sublen; - width = len; - } - if (fill == ' ') { - if (sign) { - PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar); - writer.pos += 1; - } - if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { - assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); - assert(PyUnicode_READ(kind, pbuf, pindex+1) == c); - PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0'); - PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c); - writer.pos += 2; - pindex += 2; - } - } +static int +unicode_format_arg_output(struct unicode_formatter_t *ctx, + struct unicode_format_arg_t *arg, + PyObject *str) +{ + Py_ssize_t len; + enum PyUnicode_Kind kind; + void *pbuf; + Py_ssize_t pindex; + Py_UCS4 signchar; + Py_ssize_t buflen; + Py_UCS4 maxchar, bufmaxchar; + Py_ssize_t sublen; + _PyUnicodeWriter *writer = &ctx->writer; + Py_UCS4 fill; - if (len) { - _PyUnicode_FastCopyCharacters(writer.buffer, writer.pos, - temp, pindex, len); - writer.pos += len; - } - if (width > len) { - sublen = width - len; - FILL(writer.kind, writer.data, ' ', writer.pos, sublen); - writer.pos += sublen; - } + fill = ' '; + if (arg->sign && arg->flags & F_ZERO) + fill = '0'; -nextarg: - if (dict && (argidx < arglen) && c != '%') { - PyErr_SetString(PyExc_TypeError, - "not all arguments converted during string formatting"); + if (PyUnicode_READY(str) == -1) + return -1; + + len = PyUnicode_GET_LENGTH(str); + if ((arg->width == -1 || arg->width <= len) + && (arg->prec == -1 || arg->prec >= len) + && !(arg->flags & (F_SIGN | F_BLANK))) + { + /* Fast path */ + if (_PyUnicodeWriter_WriteStr(writer, str) == -1) + return -1; + return 0; + } + + /* Truncate the string for "s", "r" and "a" formats + if the precision is set */ + if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') { + if (arg->prec >= 0 && len > arg->prec) + len = arg->prec; + } + + /* Adjust sign and width */ + kind = PyUnicode_KIND(str); + pbuf = PyUnicode_DATA(str); + pindex = 0; + signchar = '\0'; + if (arg->sign) { + Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex); + if (ch == '-' || ch == '+') { + signchar = ch; + len--; + pindex++; + } + else if (arg->flags & F_SIGN) + signchar = '+'; + else if (arg->flags & F_BLANK) + signchar = ' '; + else + arg->sign = 0; + } + if (arg->width < len) + arg->width = len; + + /* Prepare the writer */ + bufmaxchar = 127; + if (!(arg->flags & F_LJUST)) { + if (arg->sign) { + if ((arg->width-1) > len) + bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill); + } + else { + if (arg->width > len) + bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill); + } + } + maxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len); + bufmaxchar = MAX_MAXCHAR(bufmaxchar, maxchar); + buflen = arg->width; + if (arg->sign && len == arg->width) + buflen++; + if (_PyUnicodeWriter_Prepare(writer, buflen, bufmaxchar) == -1) + return -1; + + /* Write the sign if needed */ + if (arg->sign) { + if (fill != ' ') { + PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar); + writer->pos += 1; + } + if (arg->width > len) + arg->width--; + } + + /* Write the numeric prefix for "x", "X" and "o" formats + if the alternate form is used. + For example, write "0x" for the "%#x" format. */ + if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) { + assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); + assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch); + if (fill != ' ') { + PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0'); + PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch); + writer->pos += 2; + pindex += 2; + } + arg->width -= 2; + if (arg->width < 0) + arg->width = 0; + len -= 2; + } + + /* Pad left with the fill character if needed */ + if (arg->width > len && !(arg->flags & F_LJUST)) { + sublen = arg->width - len; + FILL(writer->kind, writer->data, fill, writer->pos, sublen); + writer->pos += sublen; + arg->width = len; + } + + /* If padding with spaces: write sign if needed and/or numeric prefix if + the alternate form is used */ + if (fill == ' ') { + if (arg->sign) { + PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar); + writer->pos += 1; + } + if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) { + assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); + assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch); + PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0'); + PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch); + writer->pos += 2; + pindex += 2; + } + } + + /* Write characters */ + if (len) { + _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, + str, pindex, len); + writer->pos += len; + } + + /* Pad right with the fill character if needed */ + if (arg->width > len) { + sublen = arg->width - len; + FILL(writer->kind, writer->data, ' ', writer->pos, sublen); + writer->pos += sublen; + } + return 0; +} + +/* Helper of PyUnicode_Format(): format one arg. + Return 0 on success, raise an exception and return -1 on error. */ +static int +unicode_format_arg(struct unicode_formatter_t *ctx) +{ + struct unicode_format_arg_t arg; + PyObject *str; + int ret; + + ret = unicode_format_arg_parse(ctx, &arg); + if (ret == -1) + return -1; + + ret = unicode_format_arg_format(ctx, &arg, &str); + if (ret == -1) + return -1; + + if (ret != 1) { + ret = unicode_format_arg_output(ctx, &arg, str); + Py_DECREF(str); + if (ret == -1) + return -1; + } + + if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') { + PyErr_SetString(PyExc_TypeError, + "not all arguments converted during string formatting"); + return -1; + } + return 0; +} + +PyObject * +PyUnicode_Format(PyObject *format, PyObject *args) +{ + struct unicode_formatter_t ctx; + + if (format == NULL || args == NULL) { + PyErr_BadInternalCall(); + return NULL; + } + + ctx.fmtstr = PyUnicode_FromObject(format); + if (ctx.fmtstr == NULL) + return NULL; + if (PyUnicode_READY(ctx.fmtstr) == -1) { + Py_DECREF(ctx.fmtstr); + return NULL; + } + ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr); + ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr); + ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr); + ctx.fmtpos = 0; + + _PyUnicodeWriter_Init(&ctx.writer, ctx.fmtcnt + 100); + + if (PyTuple_Check(args)) { + ctx.arglen = PyTuple_Size(args); + ctx.argidx = 0; + } + else { + ctx.arglen = -1; + ctx.argidx = -2; + } + ctx.args_owned = 0; + if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args)) + ctx.dict = args; + else + ctx.dict = NULL; + ctx.args = args; + + while (--ctx.fmtcnt >= 0) { + if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') { + Py_ssize_t nonfmtpos, sublen; + Py_UCS4 maxchar; + + nonfmtpos = ctx.fmtpos++; + while (ctx.fmtcnt >= 0 && + PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') { + ctx.fmtpos++; + ctx.fmtcnt--; + } + if (ctx.fmtcnt < 0) { + ctx.fmtpos--; + ctx.writer.overallocate = 0; + } + sublen = ctx.fmtpos - nonfmtpos; + maxchar = _PyUnicode_FindMaxChar(ctx.fmtstr, + nonfmtpos, nonfmtpos + sublen); + if (_PyUnicodeWriter_Prepare(&ctx.writer, sublen, maxchar) == -1) goto onError; - } - Py_CLEAR(temp); - } /* '%' */ - } /* until end */ - if (argidx < arglen && !dict) { + + _PyUnicode_FastCopyCharacters(ctx.writer.buffer, ctx.writer.pos, + ctx.fmtstr, nonfmtpos, sublen); + ctx.writer.pos += sublen; + } + else { + ctx.fmtpos++; + if (unicode_format_arg(&ctx) == -1) + goto onError; + } + } + + if (ctx.argidx < ctx.arglen && !ctx.dict) { PyErr_SetString(PyExc_TypeError, "not all arguments converted during string formatting"); goto onError; } - if (args_owned) { - Py_DECREF(args); + if (ctx.args_owned) { + Py_DECREF(ctx.args); } - Py_DECREF(uformat); - Py_XDECREF(temp); - Py_XDECREF(second); - return _PyUnicodeWriter_Finish(&writer); + Py_DECREF(ctx.fmtstr); + return _PyUnicodeWriter_Finish(&ctx.writer); onError: - Py_DECREF(uformat); - Py_XDECREF(temp); - Py_XDECREF(second); - _PyUnicodeWriter_Dealloc(&writer); - if (args_owned) { - Py_DECREF(args); + Py_DECREF(ctx.fmtstr); + _PyUnicodeWriter_Dealloc(&ctx.writer); + if (ctx.args_owned) { + Py_DECREF(ctx.args); } return NULL; } |