summaryrefslogtreecommitdiff
path: root/Objects
diff options
context:
space:
mode:
authorNed Deily <nad@acm.org>2014-04-09 16:22:19 -0700
committerNed Deily <nad@acm.org>2014-04-09 16:22:19 -0700
commita4a8b4599f3ea553e45af913536d0f6985081a8f (patch)
treeca2a1984a51a388d86db2a006b5cd5a24fc15d3c /Objects
parent7c5c123a453e906da293896ed413614236064cce (diff)
parent8bc92821f69ba96eb0e2acabea54eb3b5e1c86a5 (diff)
downloadcpython-a4a8b4599f3ea553e45af913536d0f6985081a8f.tar.gz
Issue #21187: merge with 3.4
Diffstat (limited to 'Objects')
-rw-r--r--Objects/unicodeobject.c478
1 files changed, 246 insertions, 232 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index ec22239d3b..af70ede749 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -6817,28 +6817,6 @@ code_page_name(UINT code_page, PyObject **obj)
return PyBytes_AS_STRING(*obj);
}
-static int
-is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
-{
- const char *curr = s + offset;
- const char *prev;
-
- if (!IsDBCSLeadByteEx(code_page, *curr))
- return 0;
-
- prev = CharPrevExA(code_page, s, curr, 0);
- if (prev == curr)
- return 1;
- /* FIXME: This code is limited to "true" double-byte encodings,
- as it assumes an incomplete character consists of a single
- byte. */
- if (curr - prev == 2)
- return 1;
- if (!IsDBCSLeadByteEx(code_page, *prev))
- return 1;
- return 0;
-}
-
static DWORD
decode_code_page_flags(UINT code_page)
{
@@ -6913,7 +6891,7 @@ static int
decode_code_page_errors(UINT code_page,
PyObject **v,
const char *in, const int size,
- const char *errors)
+ const char *errors, int final)
{
const char *startin = in;
const char *endin = in + size;
@@ -6940,7 +6918,7 @@ decode_code_page_errors(UINT code_page,
if (encoding == NULL)
return -1;
- if (errors == NULL || strcmp(errors, "strict") == 0) {
+ if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
/* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
UnicodeDecodeError. */
make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
@@ -7003,6 +6981,10 @@ decode_code_page_errors(UINT code_page,
if (outsize <= 0) {
Py_ssize_t startinpos, endinpos, outpos;
+ /* last character in partial decode? */
+ if (in + insize >= endin && !final)
+ break;
+
startinpos = in - startin;
endinpos = startinpos + 1;
outpos = out - PyUnicode_AS_UNICODE(*v);
@@ -7031,7 +7013,7 @@ decode_code_page_errors(UINT code_page,
assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
if (unicode_resize(v, outsize) < 0)
goto error;
- ret = size;
+ ret = in - startin;
error:
Py_XDECREF(encoding_obj);
@@ -7072,24 +7054,19 @@ decode_code_page_stateful(int code_page,
done = 1;
}
- /* Skip trailing lead-byte unless 'final' is set */
- if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
- --chunk_size;
-
if (chunk_size == 0 && done) {
if (v != NULL)
break;
_Py_RETURN_UNICODE_EMPTY();
}
-
converted = decode_code_page_strict(code_page, &v,
s, chunk_size);
if (converted == -2)
converted = decode_code_page_errors(code_page, &v,
s, chunk_size,
- errors);
- assert(converted != 0);
+ errors, final);
+ assert(converted != 0 || done);
if (converted < 0) {
Py_XDECREF(v);
@@ -8496,10 +8473,10 @@ charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
}
else if (PyLong_Check(x)) {
long value = PyLong_AS_LONG(x);
- long max = PyUnicode_GetMax();
- if (value < 0 || value > max) {
- PyErr_Format(PyExc_TypeError,
- "character mapping must be in range(0x%x)", max+1);
+ if (value < 0 || value > MAX_UNICODE) {
+ PyErr_Format(PyExc_ValueError,
+ "character mapping must be in range(0x%x)",
+ MAX_UNICODE+1);
Py_DECREF(x);
return -1;
}
@@ -8518,76 +8495,168 @@ charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
return -1;
}
}
-/* ensure that *outobj is at least requiredsize characters long,
- if not reallocate and adjust various state variables.
- Return 0 on success, -1 on error */
+
+/* lookup the character, write the result into the writer.
+ Return 1 if the result was written into the writer, return 0 if the mapping
+ was undefined, raise an exception return -1 on error. */
static int
-charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
- Py_ssize_t requiredsize)
-{
- Py_ssize_t oldsize = *psize;
- Py_UCS4 *new_outobj;
- if (requiredsize > oldsize) {
- /* exponentially overallocate to minimize reallocations */
- if (requiredsize < 2 * oldsize)
- requiredsize = 2 * oldsize;
- new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
- if (new_outobj == 0)
+charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
+ _PyUnicodeWriter *writer)
+{
+ PyObject *item;
+
+ if (charmaptranslate_lookup(ch, mapping, &item))
+ return -1;
+
+ if (item == NULL) {
+ /* not found => default to 1:1 mapping */
+ if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
return -1;
- *outobj = new_outobj;
- *psize = requiredsize;
+ }
+ return 1;
}
- return 0;
+
+ if (item == Py_None) {
+ Py_DECREF(item);
+ return 0;
+ }
+
+ if (PyLong_Check(item)) {
+ long ch = (Py_UCS4)PyLong_AS_LONG(item);
+ /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
+ used it */
+ if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
+ Py_DECREF(item);
+ return -1;
+ }
+ Py_DECREF(item);
+ return 1;
+ }
+
+ if (!PyUnicode_Check(item)) {
+ Py_DECREF(item);
+ return -1;
+ }
+
+ if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
+ Py_DECREF(item);
+ return -1;
+ }
+
+ Py_DECREF(item);
+ return 1;
}
-/* lookup the character, put the result in the output string and adjust
- various state variables. Return a new reference to the object that
- was put in the output buffer in *result, or Py_None, if the mapping was
- undefined (in which case no character was written).
- The called must decref result.
- Return 0 on success, -1 on error. */
+
static int
-charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
- PyObject *mapping, Py_UCS4 **output,
- Py_ssize_t *osize, Py_ssize_t *opos,
- PyObject **res)
+unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
+ Py_UCS1 *translate)
{
- Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
- if (charmaptranslate_lookup(curinp, mapping, res))
+ PyObject *item = NULL;
+ int ret = 0;
+
+ if (charmaptranslate_lookup(ch, mapping, &item)) {
return -1;
- if (*res==NULL) {
+ }
+
+ if (item == Py_None) {
+ /* deletion */
+ translate[ch] = 0xfe;
+ }
+ else if (item == NULL) {
/* not found => default to 1:1 mapping */
- (*output)[(*opos)++] = curinp;
+ translate[ch] = ch;
+ return 1;
}
- else if (*res==Py_None)
- ;
- else if (PyLong_Check(*res)) {
- /* no overflow check, because we know that the space is enough */
- (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
+ else if (PyLong_Check(item)) {
+ long replace = PyLong_AS_LONG(item);
+ /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
+ used it */
+ if (127 < replace) {
+ /* invalid character or character outside ASCII:
+ skip the fast translate */
+ goto exit;
+ }
+ translate[ch] = (Py_UCS1)replace;
}
- else if (PyUnicode_Check(*res)) {
- Py_ssize_t repsize;
- if (PyUnicode_READY(*res) == -1)
+ else if (PyUnicode_Check(item)) {
+ Py_UCS4 replace;
+
+ if (PyUnicode_READY(item) == -1) {
+ Py_DECREF(item);
return -1;
- repsize = PyUnicode_GET_LENGTH(*res);
- if (repsize==1) {
- /* no overflow check, because we know that the space is enough */
- (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
}
- else if (repsize!=0) {
- /* more than one character */
- Py_ssize_t requiredsize = *opos +
- (PyUnicode_GET_LENGTH(input) - ipos) +
- repsize - 1;
- Py_ssize_t i;
- if (charmaptranslate_makespace(output, osize, requiredsize))
+ if (PyUnicode_GET_LENGTH(item) != 1)
+ goto exit;
+
+ replace = PyUnicode_READ_CHAR(item, 0);
+ if (replace > 127)
+ goto exit;
+ translate[ch] = (Py_UCS1)replace;
+ }
+ else {
+ /* not None, NULL, long or unicode */
+ goto exit;
+ }
+ ret = 1;
+
+ exit:
+ Py_DECREF(item);
+ return ret;
+}
+
+/* Fast path for ascii => ascii translation. Return 1 if the whole string
+ was translated into writer, return 0 if the input string was partially
+ translated into writer, raise an exception and return -1 on error. */
+static int
+unicode_fast_translate(PyObject *input, PyObject *mapping,
+ _PyUnicodeWriter *writer, int ignore)
+{
+ Py_UCS1 ascii_table[128], ch, ch2;
+ Py_ssize_t len;
+ Py_UCS1 *in, *end, *out;
+ int res = 0;
+
+ if (PyUnicode_READY(input) == -1)
+ return -1;
+ if (!PyUnicode_IS_ASCII(input))
+ return 0;
+ len = PyUnicode_GET_LENGTH(input);
+
+ memset(ascii_table, 0xff, 128);
+
+ in = PyUnicode_1BYTE_DATA(input);
+ end = in + len;
+
+ assert(PyUnicode_IS_ASCII(writer->buffer));
+ assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
+ out = PyUnicode_1BYTE_DATA(writer->buffer);
+
+ for (; in < end; in++) {
+ ch = *in;
+ ch2 = ascii_table[ch];
+ if (ch2 == 0xff) {
+ int translate = unicode_fast_translate_lookup(mapping, ch,
+ ascii_table);
+ if (translate < 0)
return -1;
- for(i = 0; i < repsize; i++)
- (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
+ if (translate == 0)
+ goto exit;
+ ch2 = ascii_table[ch];
}
+ if (ch2 == 0xfe) {
+ if (ignore)
+ continue;
+ goto exit;
+ }
+ assert(ch2 < 128);
+ *out = ch2;
+ out++;
}
- else
- return -1;
- return 0;
+ res = 1;
+
+exit:
+ writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
+ return res;
}
PyObject *
@@ -8596,22 +8665,17 @@ _PyUnicode_TranslateCharmap(PyObject *input,
const char *errors)
{
/* input object */
- char *idata;
+ char *data;
Py_ssize_t size, i;
int kind;
/* output buffer */
- Py_UCS4 *output = NULL;
- Py_ssize_t osize;
- PyObject *res;
- /* current output position */
- Py_ssize_t opos;
+ _PyUnicodeWriter writer;
+ /* error handler */
char *reason = "character maps to <undefined>";
PyObject *errorHandler = NULL;
PyObject *exc = NULL;
- /* the following variable is used for caching string comparisons
- * -1=not initialized, 0=unknown, 1=strict, 2=replace,
- * 3=ignore, 4=xmlcharrefreplace */
- int known_errorHandler = -1;
+ int ignore;
+ int res;
if (mapping == NULL) {
PyErr_BadArgument();
@@ -8620,10 +8684,9 @@ _PyUnicode_TranslateCharmap(PyObject *input,
if (PyUnicode_READY(input) == -1)
return NULL;
- idata = (char*)PyUnicode_DATA(input);
+ data = (char*)PyUnicode_DATA(input);
kind = PyUnicode_KIND(input);
size = PyUnicode_GET_LENGTH(input);
- i = 0;
if (size == 0) {
Py_INCREF(input);
@@ -8632,121 +8695,81 @@ _PyUnicode_TranslateCharmap(PyObject *input,
/* allocate enough for a simple 1:1 translation without
replacements, if we need more, we'll resize */
- osize = size;
- output = PyMem_Malloc(osize * sizeof(Py_UCS4));
- opos = 0;
- if (output == NULL) {
- PyErr_NoMemory();
+ _PyUnicodeWriter_Init(&writer);
+ if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
goto onError;
+
+ ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
+
+ res = unicode_fast_translate(input, mapping, &writer, ignore);
+ if (res < 0) {
+ _PyUnicodeWriter_Dealloc(&writer);
+ return NULL;
}
+ if (res == 1)
+ return _PyUnicodeWriter_Finish(&writer);
+ i = writer.pos;
while (i<size) {
/* try to encode it */
- PyObject *x = NULL;
- if (charmaptranslate_output(input, i, mapping,
- &output, &osize, &opos, &x)) {
- Py_XDECREF(x);
+ int translate;
+ PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
+ Py_ssize_t newpos;
+ /* startpos for collecting untranslatable chars */
+ Py_ssize_t collstart;
+ Py_ssize_t collend;
+ Py_UCS4 ch;
+
+ ch = PyUnicode_READ(kind, data, i);
+ translate = charmaptranslate_output(ch, mapping, &writer);
+ if (translate < 0)
goto onError;
- }
- Py_XDECREF(x);
- if (x!=Py_None) /* it worked => adjust input pointer */
+
+ if (translate != 0) {
+ /* it worked => adjust input pointer */
++i;
- else { /* untranslatable character */
- PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
- Py_ssize_t repsize;
- Py_ssize_t newpos;
- Py_ssize_t uni2;
- /* startpos for collecting untranslatable chars */
- Py_ssize_t collstart = i;
- Py_ssize_t collend = i+1;
- Py_ssize_t coll;
-
- /* find all untranslatable characters */
- while (collend < size) {
- if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
- goto onError;
- Py_XDECREF(x);
- if (x!=Py_None)
- break;
- ++collend;
- }
- /* cache callback name lookup
- * (if not done yet, i.e. it's the first error) */
- if (known_errorHandler==-1) {
- if ((errors==NULL) || (!strcmp(errors, "strict")))
- known_errorHandler = 1;
- else if (!strcmp(errors, "replace"))
- known_errorHandler = 2;
- else if (!strcmp(errors, "ignore"))
- known_errorHandler = 3;
- else if (!strcmp(errors, "xmlcharrefreplace"))
- known_errorHandler = 4;
- else
- known_errorHandler = 0;
- }
- switch (known_errorHandler) {
- case 1: /* strict */
- make_translate_exception(&exc,
- input, collstart, collend, reason);
- if (exc != NULL)
- PyCodec_StrictErrors(exc);
+ continue;
+ }
+
+ /* untranslatable character */
+ collstart = i;
+ collend = i+1;
+
+ /* find all untranslatable characters */
+ while (collend < size) {
+ PyObject *x;
+ ch = PyUnicode_READ(kind, data, collend);
+ if (charmaptranslate_lookup(ch, mapping, &x))
goto onError;
- case 2: /* replace */
- /* No need to check for space, this is a 1:1 replacement */
- for (coll = collstart; coll<collend; coll++)
- output[opos++] = '?';
- /* fall through */
- case 3: /* ignore */
- i = collend;
- break;
- case 4: /* xmlcharrefreplace */
- /* generate replacement (temporarily (mis)uses i) */
- for (i = collstart; i < collend; ++i) {
- char buffer[2+29+1+1];
- char *cp;
- sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
- if (charmaptranslate_makespace(&output, &osize,
- opos+strlen(buffer)+(size-collend)))
- goto onError;
- for (cp = buffer; *cp; ++cp)
- output[opos++] = *cp;
- }
- i = collend;
+ Py_XDECREF(x);
+ if (x != Py_None)
break;
- default:
- repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
- reason, input, &exc,
- collstart, collend, &newpos);
- if (repunicode == NULL)
- goto onError;
- if (PyUnicode_READY(repunicode) == -1) {
- Py_DECREF(repunicode);
- goto onError;
- }
- /* generate replacement */
- repsize = PyUnicode_GET_LENGTH(repunicode);
- if (charmaptranslate_makespace(&output, &osize,
- opos+repsize+(size-collend))) {
- Py_DECREF(repunicode);
- goto onError;
- }
- for (uni2 = 0; repsize-->0; ++uni2)
- output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
- i = newpos;
+ ++collend;
+ }
+
+ if (ignore) {
+ i = collend;
+ }
+ else {
+ repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
+ reason, input, &exc,
+ collstart, collend, &newpos);
+ if (repunicode == NULL)
+ goto onError;
+ if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Py_DECREF(repunicode);
+ goto onError;
}
+ Py_DECREF(repunicode);
+ i = newpos;
}
}
- res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
- if (!res)
- goto onError;
- PyMem_Free(output);
Py_XDECREF(exc);
Py_XDECREF(errorHandler);
- return res;
+ return _PyUnicodeWriter_Finish(&writer);
onError:
- PyMem_Free(output);
+ _PyUnicodeWriter_Dealloc(&writer);
Py_XDECREF(exc);
Py_XDECREF(errorHandler);
return NULL;
@@ -14009,24 +14032,14 @@ mainformatlong(PyObject *v,
if (!PyNumber_Check(v))
goto wrongtype;
- /* make sure number is a type of integer */
- /* if not, issue deprecation warning for now */
+ /* make sure number is a type of integer for o, x, and X */
if (!PyLong_Check(v)) {
if (type == 'o' || type == 'x' || type == 'X') {
iobj = PyNumber_Index(v);
if (iobj == NULL) {
- PyErr_Clear();
- if (PyErr_WarnEx(PyExc_DeprecationWarning,
- "automatic int conversions have been deprecated",
- 1)) {
- return -1;
- }
- iobj = PyNumber_Long(v);
- if (iobj == NULL ) {
- if (PyErr_ExceptionMatches(PyExc_TypeError))
- goto wrongtype;
- return -1;
- }
+ if (PyErr_ExceptionMatches(PyExc_TypeError))
+ goto wrongtype;
+ return -1;
}
}
else {
@@ -14087,10 +14100,23 @@ mainformatlong(PyObject *v,
return 0;
wrongtype:
- PyErr_Format(PyExc_TypeError,
- "%%%c format: a number is required, "
- "not %.200s",
- type, Py_TYPE(v)->tp_name);
+ switch(type)
+ {
+ case 'o':
+ case 'x':
+ case 'X':
+ PyErr_Format(PyExc_TypeError,
+ "%%%c format: an integer is required, "
+ "not %.200s",
+ type, Py_TYPE(v)->tp_name);
+ break;
+ default:
+ PyErr_Format(PyExc_TypeError,
+ "%%%c format: a number is required, "
+ "not %.200s",
+ type, Py_TYPE(v)->tp_name);
+ break;
+ }
return -1;
}
@@ -14108,22 +14134,10 @@ formatchar(PyObject *v)
PyObject *iobj;
long x;
/* make sure number is a type of integer */
- /* if not, issue deprecation warning for now */
if (!PyLong_Check(v)) {
iobj = PyNumber_Index(v);
if (iobj == NULL) {
- PyErr_Clear();
- if (PyErr_WarnEx(PyExc_DeprecationWarning,
- "automatic int conversions have been deprecated",
- 1)) {
- return -1;
- }
- iobj = PyNumber_Long(v);
- if (iobj == NULL ) {
- if (PyErr_ExceptionMatches(PyExc_TypeError))
- goto onError;
- return -1;
- }
+ goto onError;
}
v = iobj;
Py_DECREF(iobj);