summaryrefslogtreecommitdiff
path: root/Objects
diff options
context:
space:
mode:
authorSerhiy Storchaka <storchaka@gmail.com>2015-10-02 13:14:03 +0300
committerSerhiy Storchaka <storchaka@gmail.com>2015-10-02 13:14:03 +0300
commitab1c52c60acbc3be631951d2d54067f059f027ec (patch)
tree9a070397c127130211bc12bc2b7ec1a35d62db47 /Objects
parente3a9a33dde86390899f22dc6b2e12b7e33b9750f (diff)
parent2a3cbec0216dfd9bba81d12940900a68452d3d56 (diff)
downloadcpython-ab1c52c60acbc3be631951d2d54067f059f027ec.tar.gz
Issue #24848: Fixed bugs in UTF-7 decoding of misformed data:
1. Non-ASCII bytes were accepted after shift sequence. 2. A low surrogate could be emitted in case of error in high surrogate. 3. In some circumstances the '\xfd' character was produced instead of the replacement character '\ufffd' (due to a bug in _PyUnicodeWriter).
Diffstat (limited to 'Objects')
-rw-r--r--Objects/abstract.c2
-rw-r--r--Objects/setobject.c537
-rw-r--r--Objects/stringlib/codecs.h147
-rw-r--r--Objects/stringlib/unicode_format.h2
-rw-r--r--Objects/structseq.c24
-rw-r--r--Objects/typeobject.c46
-rw-r--r--Objects/unicodeobject.c259
7 files changed, 614 insertions, 403 deletions
diff --git a/Objects/abstract.c b/Objects/abstract.c
index 31cc0a8ba5..039a4ca052 100644
--- a/Objects/abstract.c
+++ b/Objects/abstract.c
@@ -2131,7 +2131,7 @@ PyObject_Call(PyObject *func, PyObject *arg, PyObject *kw)
/* PyObject_Call() must not be called with an exception set,
because it may clear it (directly or indirectly) and so the
- caller looses its exception */
+ caller loses its exception */
assert(!PyErr_Occurred());
call = func->ob_type->tp_call;
diff --git a/Objects/setobject.c b/Objects/setobject.c
index 704d7e2b2a..03bb230961 100644
--- a/Objects/setobject.c
+++ b/Objects/setobject.c
@@ -29,7 +29,6 @@
#include "Python.h"
#include "structmember.h"
-#include "stringlib/eq.h"
/* Object used as dummy key to fill deleted entries */
static PyObject _dummy_struct;
@@ -51,19 +50,20 @@ static PyObject _dummy_struct;
static setentry *
set_lookkey(PySetObject *so, PyObject *key, Py_hash_t hash)
{
- setentry *table = so->table;
- setentry *freeslot = NULL;
+ setentry *table;
setentry *entry;
- size_t perturb = hash;
+ size_t perturb;
size_t mask = so->mask;
size_t i = (size_t)hash & mask; /* Unsigned for defined overflow behavior */
size_t j;
int cmp;
- entry = &table[i];
+ entry = &so->table[i];
if (entry->key == NULL)
return entry;
+ perturb = hash;
+
while (1) {
if (entry->hash == hash) {
PyObject *startkey = entry->key;
@@ -73,8 +73,9 @@ set_lookkey(PySetObject *so, PyObject *key, Py_hash_t hash)
return entry;
if (PyUnicode_CheckExact(startkey)
&& PyUnicode_CheckExact(key)
- && unicode_eq(startkey, key))
+ && _PyUnicode_EQ(startkey, key))
return entry;
+ table = so->table;
Py_INCREF(startkey);
cmp = PyObject_RichCompareBool(startkey, key, Py_EQ);
Py_DECREF(startkey);
@@ -86,14 +87,12 @@ set_lookkey(PySetObject *so, PyObject *key, Py_hash_t hash)
return entry;
mask = so->mask; /* help avoid a register spill */
}
- if (entry->hash == -1 && freeslot == NULL)
- freeslot = entry;
if (i + LINEAR_PROBES <= mask) {
for (j = 0 ; j < LINEAR_PROBES ; j++) {
entry++;
- if (entry->key == NULL)
- goto found_null;
+ if (entry->hash == 0 && entry->key == NULL)
+ return entry;
if (entry->hash == hash) {
PyObject *startkey = entry->key;
assert(startkey != dummy);
@@ -101,8 +100,9 @@ set_lookkey(PySetObject *so, PyObject *key, Py_hash_t hash)
return entry;
if (PyUnicode_CheckExact(startkey)
&& PyUnicode_CheckExact(key)
- && unicode_eq(startkey, key))
+ && _PyUnicode_EQ(startkey, key))
return entry;
+ table = so->table;
Py_INCREF(startkey);
cmp = PyObject_RichCompareBool(startkey, key, Py_EQ);
Py_DECREF(startkey);
@@ -114,7 +114,104 @@ set_lookkey(PySetObject *so, PyObject *key, Py_hash_t hash)
return entry;
mask = so->mask;
}
- if (entry->hash == -1 && freeslot == NULL)
+ }
+ }
+
+ perturb >>= PERTURB_SHIFT;
+ i = (i * 5 + 1 + perturb) & mask;
+
+ entry = &so->table[i];
+ if (entry->key == NULL)
+ return entry;
+ }
+}
+
+static int set_table_resize(PySetObject *, Py_ssize_t);
+
+static int
+set_add_entry(PySetObject *so, PyObject *key, Py_hash_t hash)
+{
+ setentry *table;
+ setentry *freeslot;
+ setentry *entry;
+ size_t perturb;
+ size_t mask;
+ size_t i; /* Unsigned for defined overflow behavior */
+ size_t j;
+ int cmp;
+
+ /* Pre-increment is necessary to prevent arbitrary code in the rich
+ comparison from deallocating the key just before the insertion. */
+ Py_INCREF(key);
+
+ restart:
+
+ mask = so->mask;
+ i = (size_t)hash & mask;
+
+ entry = &so->table[i];
+ if (entry->key == NULL)
+ goto found_unused;
+
+ freeslot = NULL;
+ perturb = hash;
+
+ while (1) {
+ if (entry->hash == hash) {
+ PyObject *startkey = entry->key;
+ /* startkey cannot be a dummy because the dummy hash field is -1 */
+ assert(startkey != dummy);
+ if (startkey == key)
+ goto found_active;
+ if (PyUnicode_CheckExact(startkey)
+ && PyUnicode_CheckExact(key)
+ && _PyUnicode_EQ(startkey, key))
+ goto found_active;
+ table = so->table;
+ Py_INCREF(startkey);
+ cmp = PyObject_RichCompareBool(startkey, key, Py_EQ);
+ Py_DECREF(startkey);
+ if (cmp > 0) /* likely */
+ goto found_active;
+ if (cmp < 0)
+ goto comparison_error;
+ /* Continuing the search from the current entry only makes
+ sense if the table and entry are unchanged; otherwise,
+ we have to restart from the beginning */
+ if (table != so->table || entry->key != startkey)
+ goto restart;
+ mask = so->mask; /* help avoid a register spill */
+ }
+ else if (entry->hash == -1 && freeslot == NULL)
+ freeslot = entry;
+
+ if (i + LINEAR_PROBES <= mask) {
+ for (j = 0 ; j < LINEAR_PROBES ; j++) {
+ entry++;
+ if (entry->hash == 0 && entry->key == NULL)
+ goto found_unused_or_dummy;
+ if (entry->hash == hash) {
+ PyObject *startkey = entry->key;
+ assert(startkey != dummy);
+ if (startkey == key)
+ goto found_active;
+ if (PyUnicode_CheckExact(startkey)
+ && PyUnicode_CheckExact(key)
+ && _PyUnicode_EQ(startkey, key))
+ goto found_active;
+ table = so->table;
+ Py_INCREF(startkey);
+ cmp = PyObject_RichCompareBool(startkey, key, Py_EQ);
+ Py_DECREF(startkey);
+ if (cmp > 0)
+ goto found_active;
+ if (cmp < 0)
+ goto comparison_error;
+ if (table != so->table || entry->key != startkey)
+ goto restart;
+ mask = so->mask;
+ }
+ else if (entry->hash == -1 && freeslot == NULL)
freeslot = entry;
}
}
@@ -122,12 +219,35 @@ set_lookkey(PySetObject *so, PyObject *key, Py_hash_t hash)
perturb >>= PERTURB_SHIFT;
i = (i * 5 + 1 + perturb) & mask;
- entry = &table[i];
+ entry = &so->table[i];
if (entry->key == NULL)
- goto found_null;
+ goto found_unused_or_dummy;
}
- found_null:
- return freeslot == NULL ? entry : freeslot;
+
+ found_unused_or_dummy:
+ if (freeslot == NULL)
+ goto found_unused;
+ so->used++;
+ freeslot->key = key;
+ freeslot->hash = hash;
+ return 0;
+
+ found_unused:
+ so->fill++;
+ so->used++;
+ entry->key = key;
+ entry->hash = hash;
+ if ((size_t)so->fill*3 < mask*2)
+ return 0;
+ return set_table_resize(so, so->used);
+
+ found_active:
+ Py_DECREF(key);
+ return 0;
+
+ comparison_error:
+ Py_DECREF(key);
+ return -1;
}
/*
@@ -172,38 +292,6 @@ set_insert_clean(PySetObject *so, PyObject *key, Py_hash_t hash)
/* ======== End logic for probing the hash table ========================== */
/* ======================================================================== */
-
-/*
-Internal routine to insert a new key into the table.
-Used by the public insert routine.
-Eats a reference to key.
-*/
-static int
-set_insert_key(PySetObject *so, PyObject *key, Py_hash_t hash)
-{
- setentry *entry;
-
- entry = set_lookkey(so, key, hash);
- if (entry == NULL)
- return -1;
- if (entry->key == NULL) {
- /* UNUSED */
- entry->key = key;
- entry->hash = hash;
- so->fill++;
- so->used++;
- } else if (entry->key == dummy) {
- /* DUMMY */
- entry->key = key;
- entry->hash = hash;
- so->used++;
- } else {
- /* ACTIVE */
- Py_DECREF(key);
- }
- return 0;
-}
-
/*
Restructure the table by allocating a new table and reinserting all
keys again. When entries have been deleted, the new table may
@@ -220,6 +308,7 @@ set_table_resize(PySetObject *so, Py_ssize_t minused)
setentry small_copy[PySet_MINSIZE];
assert(minused >= 0);
+ minused = (minused > 50000) ? minused * 2 : minused * 4;
/* Find the smallest table size > minused. */
/* XXX speed-up with intrinsics */
@@ -295,31 +384,42 @@ set_table_resize(PySetObject *so, Py_ssize_t minused)
return 0;
}
-/* CAUTION: set_add_key/entry() must guarantee it won't resize the table */
+static int
+set_contains_entry(PySetObject *so, PyObject *key, Py_hash_t hash)
+{
+ setentry *entry;
+
+ entry = set_lookkey(so, key, hash);
+ if (entry != NULL)
+ return entry->key != NULL;
+ return -1;
+}
+
+#define DISCARD_NOTFOUND 0
+#define DISCARD_FOUND 1
static int
-set_add_entry(PySetObject *so, setentry *entry)
+set_discard_entry(PySetObject *so, PyObject *key, Py_hash_t hash)
{
- Py_ssize_t n_used;
- PyObject *key = entry->key;
- Py_hash_t hash = entry->hash;
+ setentry *entry;
+ PyObject *old_key;
- assert(so->fill <= so->mask); /* at least one empty slot */
- n_used = so->used;
- Py_INCREF(key);
- if (set_insert_key(so, key, hash)) {
- Py_DECREF(key);
+ entry = set_lookkey(so, key, hash);
+ if (entry == NULL)
return -1;
- }
- if (!(so->used > n_used && so->fill*3 >= (so->mask+1)*2))
- return 0;
- return set_table_resize(so, so->used>50000 ? so->used*2 : so->used*4);
+ if (entry->key == NULL)
+ return DISCARD_NOTFOUND;
+ old_key = entry->key;
+ entry->key = dummy;
+ entry->hash = -1;
+ so->used--;
+ Py_DECREF(old_key);
+ return DISCARD_FOUND;
}
static int
set_add_key(PySetObject *so, PyObject *key)
{
- setentry entry;
Py_hash_t hash;
if (!PyUnicode_CheckExact(key) ||
@@ -328,50 +428,35 @@ set_add_key(PySetObject *so, PyObject *key)
if (hash == -1)
return -1;
}
- entry.key = key;
- entry.hash = hash;
- return set_add_entry(so, &entry);
+ return set_add_entry(so, key, hash);
}
-#define DISCARD_NOTFOUND 0
-#define DISCARD_FOUND 1
-
static int
-set_discard_entry(PySetObject *so, setentry *oldentry)
+set_contains_key(PySetObject *so, PyObject *key)
{
- setentry *entry;
- PyObject *old_key;
+ Py_hash_t hash;
- entry = set_lookkey(so, oldentry->key, oldentry->hash);
- if (entry == NULL)
- return -1;
- if (entry->key == NULL || entry->key == dummy)
- return DISCARD_NOTFOUND;
- old_key = entry->key;
- entry->key = dummy;
- entry->hash = -1;
- so->used--;
- Py_DECREF(old_key);
- return DISCARD_FOUND;
+ if (!PyUnicode_CheckExact(key) ||
+ (hash = ((PyASCIIObject *) key)->hash) == -1) {
+ hash = PyObject_Hash(key);
+ if (hash == -1)
+ return -1;
+ }
+ return set_contains_entry(so, key, hash);
}
static int
set_discard_key(PySetObject *so, PyObject *key)
{
- setentry entry;
Py_hash_t hash;
- assert (PyAnySet_Check(so));
-
if (!PyUnicode_CheckExact(key) ||
(hash = ((PyASCIIObject *) key)->hash) == -1) {
hash = PyObject_Hash(key);
if (hash == -1)
return -1;
}
- entry.key = key;
- entry.hash = hash;
- return set_discard_entry(so, &entry);
+ return set_discard_entry(so, key, hash);
}
static void
@@ -452,20 +537,22 @@ set_next(PySetObject *so, Py_ssize_t *pos_ptr, setentry **entry_ptr)
{
Py_ssize_t i;
Py_ssize_t mask;
- setentry *table;
+ setentry *entry;
assert (PyAnySet_Check(so));
i = *pos_ptr;
assert(i >= 0);
- table = so->table;
mask = so->mask;
- while (i <= mask && (table[i].key == NULL || table[i].key == dummy))
+ entry = &so->table[i];
+ while (i <= mask && (entry->key == NULL || entry->key == dummy)) {
i++;
+ entry++;
+ }
*pos_ptr = i+1;
if (i > mask)
return 0;
- assert(table[i].key != NULL);
- *entry_ptr = &table[i];
+ assert(entry != NULL);
+ *entry_ptr = entry;
return 1;
}
@@ -563,8 +650,8 @@ set_merge(PySetObject *so, PyObject *otherset)
* incrementally resizing as we insert new keys. Expect
* that there will be no (or few) overlapping keys.
*/
- if ((so->fill + other->used)*3 >= (so->mask+1)*2) {
- if (set_table_resize(so, (so->used + other->used)*2) != 0)
+ if ((so->fill + other->used)*3 >= so->mask*2) {
+ if (set_table_resize(so, so->used + other->used) != 0)
return -1;
}
so_entry = so->table;
@@ -604,46 +691,13 @@ set_merge(PySetObject *so, PyObject *otherset)
other_entry = &other->table[i];
key = other_entry->key;
if (key != NULL && key != dummy) {
- Py_INCREF(key);
- if (set_insert_key(so, key, other_entry->hash)) {
- Py_DECREF(key);
+ if (set_add_entry(so, key, other_entry->hash))
return -1;
- }
}
}
return 0;
}
-static int
-set_contains_entry(PySetObject *so, setentry *entry)
-{
- PyObject *key;
- setentry *lu_entry;
-
- lu_entry = set_lookkey(so, entry->key, entry->hash);
- if (lu_entry == NULL)
- return -1;
- key = lu_entry->key;
- return key != NULL && key != dummy;
-}
-
-static int
-set_contains_key(PySetObject *so, PyObject *key)
-{
- setentry entry;
- Py_hash_t hash;
-
- if (!PyUnicode_CheckExact(key) ||
- (hash = ((PyASCIIObject *) key)->hash) == -1) {
- hash = PyObject_Hash(key);
- if (hash == -1)
- return -1;
- }
- entry.key = key;
- entry.hash = hash;
- return set_contains_entry(so, &entry);
-}
-
static PyObject *
set_pop(PySetObject *so)
{
@@ -685,43 +739,64 @@ set_traverse(PySetObject *so, visitproc visit, void *arg)
return 0;
}
-static Py_hash_t
-frozenset_hash(PyObject *self)
+/* Work to increase the bit dispersion for closely spaced hash values.
+ This is important because some use cases have many combinations of a
+ small number of elements with nearby hashes so that many distinct
+ combinations collapse to only a handful of distinct hash values. */
+
+static Py_uhash_t
+_shuffle_bits(Py_uhash_t h)
{
- /* Most of the constants in this hash algorithm are randomly choosen
- large primes with "interesting bit patterns" and that passed
- tests for good collision statistics on a variety of problematic
- datasets such as:
+ return ((h ^ 89869747UL) ^ (h << 16)) * 3644798167UL;
+}
- ps = []
- for r in range(21):
- ps += itertools.combinations(range(20), r)
- num_distinct_hashes = len({hash(frozenset(s)) for s in ps})
+/* Most of the constants in this hash algorithm are randomly chosen
+ large primes with "interesting bit patterns" and that passed tests
+ for good collision statistics on a variety of problematic datasets
+ including powersets and graph structures (such as David Eppstein's
+ graph recipes in Lib/test/test_set.py) */
- */
+static Py_hash_t
+frozenset_hash(PyObject *self)
+{
PySetObject *so = (PySetObject *)self;
- Py_uhash_t h, hash = 1927868237UL;
+ Py_uhash_t hash = 0;
setentry *entry;
- Py_ssize_t pos = 0;
if (so->hash != -1)
return so->hash;
- hash *= (Py_uhash_t)PySet_GET_SIZE(self) + 1;
- while (set_next(so, &pos, &entry)) {
- /* Work to increase the bit dispersion for closely spaced hash
- values. This is important because some use cases have many
- combinations of a small number of elements with nearby
- hashes so that many distinct combinations collapse to only
- a handful of distinct hash values. */
- h = entry->hash;
- hash ^= ((h ^ 89869747UL) ^ (h << 16)) * 3644798167UL;
- }
- /* Make the final result spread-out in a different pattern
- than the algorithm for tuples or other python objects. */
+ /* Xor-in shuffled bits from every entry's hash field because xor is
+ commutative and a frozenset hash should be independent of order.
+
+ For speed, include null entries and dummy entries and then
+ subtract out their effect afterwards so that the final hash
+ depends only on active entries. This allows the code to be
+ vectorized by the compiler and it saves the unpredictable
+ branches that would arise when trying to exclude null and dummy
+ entries on every iteration. */
+
+ for (entry = so->table; entry <= &so->table[so->mask]; entry++)
+ hash ^= _shuffle_bits(entry->hash);
+
+ /* Remove the effect of an odd number of NULL entries */
+ if ((so->mask + 1 - so->fill) & 1)
+ hash ^= _shuffle_bits(0);
+
+ /* Remove the effect of an odd number of dummy entries */
+ if ((so->fill - so->used) & 1)
+ hash ^= _shuffle_bits(-1);
+
+ /* Factor in the number of active entries */
+ hash ^= ((Py_uhash_t)PySet_GET_SIZE(self) + 1) * 1927868237UL;
+
+ /* Disperse patterns arising in nested frozensets */
hash = hash * 69069U + 907133923UL;
+
+ /* -1 is reserved as an error code */
if (hash == (Py_uhash_t)-1)
hash = 590923713UL;
+
so->hash = hash;
return hash;
}
@@ -868,7 +943,7 @@ PyTypeObject PySetIter_Type = {
PyObject_GenericGetAttr, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
- Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
+ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC, /* tp_flags */
0, /* tp_doc */
(traverseproc)setiter_traverse, /* tp_traverse */
0, /* tp_clear */
@@ -913,18 +988,14 @@ set_update_internal(PySetObject *so, PyObject *other)
* incrementally resizing as we insert new keys. Expect
* that there will be no (or few) overlapping keys.
*/
- if (dictsize == -1)
+ if (dictsize < 0)
return -1;
- if ((so->fill + dictsize)*3 >= (so->mask+1)*2) {
- if (set_table_resize(so, (so->used + dictsize)*2) != 0)
+ if ((so->fill + dictsize)*3 >= so->mask*2) {
+ if (set_table_resize(so, so->used + dictsize) != 0)
return -1;
}
while (_PyDict_Next(other, &pos, &key, &value, &hash)) {
- setentry an_entry;
-
- an_entry.hash = hash;
- an_entry.key = key;
- if (set_add_entry(so, &an_entry))
+ if (set_add_entry(so, key, hash))
return -1;
}
return 0;
@@ -1204,6 +1275,8 @@ set_intersection(PySetObject *so, PyObject *other)
{
PySetObject *result;
PyObject *key, *it, *tmp;
+ Py_hash_t hash;
+ int rv;
if ((PyObject *)so == other)
return set_copy(so);
@@ -1223,13 +1296,15 @@ set_intersection(PySetObject *so, PyObject *other)
}
while (set_next((PySetObject *)other, &pos, &entry)) {
- int rv = set_contains_entry(so, entry);
- if (rv == -1) {
+ key = entry->key;
+ hash = entry->hash;
+ rv = set_contains_entry(so, key, hash);
+ if (rv < 0) {
Py_DECREF(result);
return NULL;
}
if (rv) {
- if (set_add_entry(result, entry)) {
+ if (set_add_entry(result, key, hash)) {
Py_DECREF(result);
return NULL;
}
@@ -1245,32 +1320,15 @@ set_intersection(PySetObject *so, PyObject *other)
}
while ((key = PyIter_Next(it)) != NULL) {
- int rv;
- setentry entry;
- Py_hash_t hash = PyObject_Hash(key);
-
- if (hash == -1) {
- Py_DECREF(it);
- Py_DECREF(result);
- Py_DECREF(key);
- return NULL;
- }
- entry.hash = hash;
- entry.key = key;
- rv = set_contains_entry(so, &entry);
- if (rv == -1) {
- Py_DECREF(it);
- Py_DECREF(result);
- Py_DECREF(key);
- return NULL;
- }
+ hash = PyObject_Hash(key);
+ if (hash == -1)
+ goto error;
+ rv = set_contains_entry(so, key, hash);
+ if (rv < 0)
+ goto error;
if (rv) {
- if (set_add_entry(result, &entry)) {
- Py_DECREF(it);
- Py_DECREF(result);
- Py_DECREF(key);
- return NULL;
- }
+ if (set_add_entry(result, key, hash))
+ goto error;
}
Py_DECREF(key);
}
@@ -1280,6 +1338,11 @@ set_intersection(PySetObject *so, PyObject *other)
return NULL;
}
return (PyObject *)result;
+ error:
+ Py_DECREF(it);
+ Py_DECREF(result);
+ Py_DECREF(key);
+ return NULL;
}
static PyObject *
@@ -1366,6 +1429,7 @@ static PyObject *
set_isdisjoint(PySetObject *so, PyObject *other)
{
PyObject *key, *it, *tmp;
+ int rv;
if ((PyObject *)so == other) {
if (PySet_GET_SIZE(so) == 0)
@@ -1384,8 +1448,8 @@ set_isdisjoint(PySetObject *so, PyObject *other)
other = tmp;
}
while (set_next((PySetObject *)other, &pos, &entry)) {
- int rv = set_contains_entry(so, entry);
- if (rv == -1)
+ rv = set_contains_entry(so, entry->key, entry->hash);
+ if (rv < 0)
return NULL;
if (rv)
Py_RETURN_FALSE;
@@ -1398,8 +1462,6 @@ set_isdisjoint(PySetObject *so, PyObject *other)
return NULL;
while ((key = PyIter_Next(it)) != NULL) {
- int rv;
- setentry entry;
Py_hash_t hash = PyObject_Hash(key);
if (hash == -1) {
@@ -1407,11 +1469,9 @@ set_isdisjoint(PySetObject *so, PyObject *other)
Py_DECREF(it);
return NULL;
}
- entry.hash = hash;
- entry.key = key;
- rv = set_contains_entry(so, &entry);
+ rv = set_contains_entry(so, key, hash);
Py_DECREF(key);
- if (rv == -1) {
+ if (rv < 0) {
Py_DECREF(it);
return NULL;
}
@@ -1440,7 +1500,7 @@ set_difference_update_internal(PySetObject *so, PyObject *other)
Py_ssize_t pos = 0;
while (set_next((PySetObject *)other, &pos, &entry))
- if (set_discard_entry(so, entry) == -1)
+ if (set_discard_entry(so, entry->key, entry->hash) < 0)
return -1;
} else {
PyObject *key, *it;
@@ -1449,7 +1509,7 @@ set_difference_update_internal(PySetObject *so, PyObject *other)
return -1;
while ((key = PyIter_Next(it)) != NULL) {
- if (set_discard_key(so, key) == -1) {
+ if (set_discard_key(so, key) < 0) {
Py_DECREF(it);
Py_DECREF(key);
return -1;
@@ -1460,10 +1520,10 @@ set_difference_update_internal(PySetObject *so, PyObject *other)
if (PyErr_Occurred())
return -1;
}
- /* If more than 1/5 are dummies, then resize them away. */
- if ((so->fill - so->used) * 5 < so->mask)
+ /* If more than 1/4th are dummies, then resize them away. */
+ if ((size_t)(so->fill - so->used) <= (size_t)so->mask / 4)
return 0;
- return set_table_resize(so, so->used>50000 ? so->used*2 : so->used*4);
+ return set_table_resize(so, so->used);
}
static PyObject *
@@ -1490,7 +1550,7 @@ set_copy_and_difference(PySetObject *so, PyObject *other)
result = set_copy(so);
if (result == NULL)
return NULL;
- if (set_difference_update_internal((PySetObject *) result, other) != -1)
+ if (set_difference_update_internal((PySetObject *) result, other) == 0)
return result;
Py_DECREF(result);
return NULL;
@@ -1500,8 +1560,11 @@ static PyObject *
set_difference(PySetObject *so, PyObject *other)
{
PyObject *result;
+ PyObject *key;
+ Py_hash_t hash;
setentry *entry;
Py_ssize_t pos = 0;
+ int rv;
if (!PyAnySet_Check(other) && !PyDict_CheckExact(other)) {
return set_copy_and_difference(so, other);
@@ -1519,17 +1582,15 @@ set_difference(PySetObject *so, PyObject *other)
if (PyDict_CheckExact(other)) {
while (set_next(so, &pos, &entry)) {
- setentry entrycopy;
- int rv;
- entrycopy.hash = entry->hash;
- entrycopy.key = entry->key;
- rv = _PyDict_Contains(other, entry->key, entry->hash);
+ key = entry->key;
+ hash = entry->hash;
+ rv = _PyDict_Contains(other, key, hash);
if (rv < 0) {
Py_DECREF(result);
return NULL;
}
if (!rv) {
- if (set_add_entry((PySetObject *)result, &entrycopy)) {
+ if (set_add_entry((PySetObject *)result, key, hash)) {
Py_DECREF(result);
return NULL;
}
@@ -1540,13 +1601,15 @@ set_difference(PySetObject *so, PyObject *other)
/* Iterate over so, checking for common elements in other. */
while (set_next(so, &pos, &entry)) {
- int rv = set_contains_entry((PySetObject *)other, entry);
- if (rv == -1) {
+ key = entry->key;
+ hash = entry->hash;
+ rv = set_contains_entry((PySetObject *)other, key, hash);
+ if (rv < 0) {
Py_DECREF(result);
return NULL;
}
if (!rv) {
- if (set_add_entry((PySetObject *)result, entry)) {
+ if (set_add_entry((PySetObject *)result, key, hash)) {
Py_DECREF(result);
return NULL;
}
@@ -1608,29 +1671,24 @@ set_symmetric_difference_update(PySetObject *so, PyObject *other)
PySetObject *otherset;
PyObject *key;
Py_ssize_t pos = 0;
+ Py_hash_t hash;
setentry *entry;
+ int rv;
if ((PyObject *)so == other)
return set_clear(so);
if (PyDict_CheckExact(other)) {
PyObject *value;
- int rv;
- Py_hash_t hash;
while (_PyDict_Next(other, &pos, &key, &value, &hash)) {
- setentry an_entry;
-
Py_INCREF(key);
- an_entry.hash = hash;
- an_entry.key = key;
-
- rv = set_discard_entry(so, &an_entry);
- if (rv == -1) {
+ rv = set_discard_entry(so, key, hash);
+ if (rv < 0) {
Py_DECREF(key);
return NULL;
}
if (rv == DISCARD_NOTFOUND) {
- if (set_add_entry(so, &an_entry)) {
+ if (set_add_entry(so, key, hash)) {
Py_DECREF(key);
return NULL;
}
@@ -1650,13 +1708,15 @@ set_symmetric_difference_update(PySetObject *so, PyObject *other)
}
while (set_next(otherset, &pos, &entry)) {
- int rv = set_discard_entry(so, entry);
- if (rv == -1) {
+ key = entry->key;
+ hash = entry->hash;
+ rv = set_discard_entry(so, key, hash);
+ if (rv < 0) {
Py_DECREF(otherset);
return NULL;
}
if (rv == DISCARD_NOTFOUND) {
- if (set_add_entry(so, entry)) {
+ if (set_add_entry(so, key, hash)) {
Py_DECREF(otherset);
return NULL;
}
@@ -1718,6 +1778,7 @@ set_issubset(PySetObject *so, PyObject *other)
{
setentry *entry;
Py_ssize_t pos = 0;
+ int rv;
if (!PyAnySet_Check(other)) {
PyObject *tmp, *result;
@@ -1732,8 +1793,8 @@ set_issubset(PySetObject *so, PyObject *other)
Py_RETURN_FALSE;
while (set_next(so, &pos, &entry)) {
- int rv = set_contains_entry((PySetObject *)other, entry);
- if (rv == -1)
+ rv = set_contains_entry((PySetObject *)other, entry->key, entry->hash);
+ if (rv < 0)
return NULL;
if (!rv)
Py_RETURN_FALSE;
@@ -1824,7 +1885,7 @@ set_contains(PySetObject *so, PyObject *key)
int rv;
rv = set_contains_key(so, key);
- if (rv == -1) {
+ if (rv < 0) {
if (!PySet_Check(key) || !PyErr_ExceptionMatches(PyExc_TypeError))
return -1;
PyErr_Clear();
@@ -1843,7 +1904,7 @@ set_direct_contains(PySetObject *so, PyObject *key)
long result;
result = set_contains(so, key);
- if (result == -1)
+ if (result < 0)
return NULL;
return PyBool_FromLong(result);
}
@@ -1857,7 +1918,7 @@ set_remove(PySetObject *so, PyObject *key)
int rv;
rv = set_discard_key(so, key);
- if (rv == -1) {
+ if (rv < 0) {
if (!PySet_Check(key) || !PyErr_ExceptionMatches(PyExc_TypeError))
return NULL;
PyErr_Clear();
@@ -1866,7 +1927,7 @@ set_remove(PySetObject *so, PyObject *key)
return NULL;
rv = set_discard_key(so, tmpkey);
Py_DECREF(tmpkey);
- if (rv == -1)
+ if (rv < 0)
return NULL;
}
@@ -1889,7 +1950,7 @@ set_discard(PySetObject *so, PyObject *key)
int rv;
rv = set_discard_key(so, key);
- if (rv == -1) {
+ if (rv < 0) {
if (!PySet_Check(key) || !PyErr_ExceptionMatches(PyExc_TypeError))
return NULL;
PyErr_Clear();
@@ -1898,7 +1959,7 @@ set_discard(PySetObject *so, PyObject *key)
return NULL;
rv = set_discard_key(so, tmpkey);
Py_DECREF(tmpkey);
- if (rv == -1)
+ if (rv < 0)
return NULL;
}
Py_RETURN_NONE;
@@ -2125,7 +2186,7 @@ static PyMethodDef frozenset_methods[] = {
copy_doc},
{"difference", (PyCFunction)set_difference_multi, METH_VARARGS,
difference_doc},
- {"intersection",(PyCFunction)set_intersection_multi, METH_VARARGS,
+ {"intersection", (PyCFunction)set_intersection_multi, METH_VARARGS,
intersection_doc},
{"isdisjoint", (PyCFunction)set_isdisjoint, METH_O,
isdisjoint_doc},
@@ -2196,7 +2257,7 @@ PyTypeObject PyFrozenSet_Type = {
(traverseproc)set_traverse, /* tp_traverse */
(inquiry)set_clear_internal, /* tp_clear */
(richcmpfunc)set_richcompare, /* tp_richcompare */
- offsetof(PySetObject, weakreflist), /* tp_weaklistoffset */
+ offsetof(PySetObject, weakreflist), /* tp_weaklistoffset */
(getiterfunc)set_iter, /* tp_iter */
0, /* tp_iternext */
frozenset_methods, /* tp_methods */
diff --git a/Objects/stringlib/codecs.h b/Objects/stringlib/codecs.h
index 0fc6b582d2..562191c18e 100644
--- a/Objects/stringlib/codecs.h
+++ b/Objects/stringlib/codecs.h
@@ -268,9 +268,10 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
Py_ssize_t nallocated; /* number of result bytes allocated */
Py_ssize_t nneeded; /* number of result bytes needed */
#if STRINGLIB_SIZEOF_CHAR > 1
- PyObject *errorHandler = NULL;
+ PyObject *error_handler_obj = NULL;
PyObject *exc = NULL;
PyObject *rep = NULL;
+ _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
#endif
#if STRINGLIB_SIZEOF_CHAR == 1
const Py_ssize_t max_char_size = 2;
@@ -326,72 +327,116 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
}
#if STRINGLIB_SIZEOF_CHAR > 1
else if (Py_UNICODE_IS_SURROGATE(ch)) {
- Py_ssize_t newpos;
- Py_ssize_t repsize, k, startpos;
+ Py_ssize_t startpos, endpos, newpos;
+ Py_ssize_t repsize, k;
+ if (error_handler == _Py_ERROR_UNKNOWN)
+ error_handler = get_error_handler(errors);
+
startpos = i-1;
- rep = unicode_encode_call_errorhandler(
- errors, &errorHandler, "utf-8", "surrogates not allowed",
- unicode, &exc, startpos, startpos+1, &newpos);
- if (!rep)
- goto error;
-
- if (PyBytes_Check(rep))
- repsize = PyBytes_GET_SIZE(rep);
- else
- repsize = PyUnicode_GET_LENGTH(rep);
+ endpos = startpos+1;
+
+ while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos]))
+ endpos++;
+
+ switch (error_handler)
+ {
+ case _Py_ERROR_REPLACE:
+ memset(p, '?', endpos - startpos);
+ p += (endpos - startpos);
+ /* fall through the ignore handler */
+ case _Py_ERROR_IGNORE:
+ i += (endpos - startpos - 1);
+ break;
- if (repsize > max_char_size) {
- Py_ssize_t offset;
- if (result == NULL)
- offset = p - stackbuf;
- else
- offset = p - PyBytes_AS_STRING(result);
+ case _Py_ERROR_SURROGATEPASS:
+ for (k=startpos; k<endpos; k++) {
+ ch = data[k];
+ *p++ = (char)(0xe0 | (ch >> 12));
+ *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
+ *p++ = (char)(0x80 | (ch & 0x3f));
+ }
+ i += (endpos - startpos - 1);
+ break;
- if (nallocated > PY_SSIZE_T_MAX - repsize + max_char_size) {
- /* integer overflow */
- PyErr_NoMemory();
- goto error;
+ case _Py_ERROR_SURROGATEESCAPE:
+ for (k=startpos; k<endpos; k++) {
+ ch = data[k];
+ if (!(0xDC80 <= ch && ch <= 0xDCFF))
+ break;
+ *p++ = (char)(ch & 0xff);
}
- nallocated += repsize - max_char_size;
- if (result != NULL) {
- if (_PyBytes_Resize(&result, nallocated) < 0)
- goto error;
- } else {
- result = PyBytes_FromStringAndSize(NULL, nallocated);
+ if (k >= endpos) {
+ i += (endpos - startpos - 1);
+ break;
+ }
+ startpos = k;
+ assert(startpos < endpos);
+ /* fall through the default handler */
+
+ default:
+ rep = unicode_encode_call_errorhandler(
+ errors, &error_handler_obj, "utf-8", "surrogates not allowed",
+ unicode, &exc, startpos, endpos, &newpos);
+ if (!rep)
+ goto error;
+
+ if (PyBytes_Check(rep))
+ repsize = PyBytes_GET_SIZE(rep);
+ else
+ repsize = PyUnicode_GET_LENGTH(rep);
+
+ if (repsize > max_char_size) {
+ Py_ssize_t offset;
+
if (result == NULL)
+ offset = p - stackbuf;
+ else
+ offset = p - PyBytes_AS_STRING(result);
+
+ if (nallocated > PY_SSIZE_T_MAX - repsize + max_char_size) {
+ /* integer overflow */
+ PyErr_NoMemory();
goto error;
- Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
+ }
+ nallocated += repsize - max_char_size;
+ if (result != NULL) {
+ if (_PyBytes_Resize(&result, nallocated) < 0)
+ goto error;
+ } else {
+ result = PyBytes_FromStringAndSize(NULL, nallocated);
+ if (result == NULL)
+ goto error;
+ Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
+ }
+ p = PyBytes_AS_STRING(result) + offset;
}
- p = PyBytes_AS_STRING(result) + offset;
- }
-
- if (PyBytes_Check(rep)) {
- char *prep = PyBytes_AS_STRING(rep);
- for(k = repsize; k > 0; k--)
- *p++ = *prep++;
- } else /* rep is unicode */ {
- enum PyUnicode_Kind repkind;
- void *repdata;
- if (PyUnicode_READY(rep) < 0)
- goto error;
- repkind = PyUnicode_KIND(rep);
- repdata = PyUnicode_DATA(rep);
+ if (PyBytes_Check(rep)) {
+ memcpy(p, PyBytes_AS_STRING(rep), repsize);
+ p += repsize;
+ }
+ else {
+ /* rep is unicode */
+ if (PyUnicode_READY(rep) < 0)
+ goto error;
- for(k=0; k<repsize; k++) {
- Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
- if (0x80 <= c) {
+ if (!PyUnicode_IS_ASCII(rep)) {
raise_encode_exception(&exc, "utf-8",
unicode,
i-1, i,
"surrogates not allowed");
goto error;
}
- *p++ = (char)c;
+
+ assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
+ memcpy(p, PyUnicode_DATA(rep), repsize);
+ p += repsize;
}
+ Py_CLEAR(rep);
+
+ i = newpos;
}
- Py_CLEAR(rep);
}
else
#if STRINGLIB_SIZEOF_CHAR > 2
@@ -430,7 +475,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
}
#if STRINGLIB_SIZEOF_CHAR > 1
- Py_XDECREF(errorHandler);
+ Py_XDECREF(error_handler_obj);
Py_XDECREF(exc);
#endif
return result;
@@ -438,7 +483,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
#if STRINGLIB_SIZEOF_CHAR > 1
error:
Py_XDECREF(rep);
- Py_XDECREF(errorHandler);
+ Py_XDECREF(error_handler_obj);
Py_XDECREF(exc);
Py_XDECREF(result);
return NULL;
diff --git a/Objects/stringlib/unicode_format.h b/Objects/stringlib/unicode_format.h
index aec221acff..d72e47d348 100644
--- a/Objects/stringlib/unicode_format.h
+++ b/Objects/stringlib/unicode_format.h
@@ -67,7 +67,7 @@ SubString_new_object(SubString *str)
return PyUnicode_Substring(str->str, str->start, str->end);
}
-/* return a new string. if str->str is NULL, return None */
+/* return a new string. if str->str is NULL, return a new empty string */
Py_LOCAL_INLINE(PyObject *)
SubString_new_object_or_empty(SubString *str)
{
diff --git a/Objects/structseq.c b/Objects/structseq.c
index 664344be6c..720973816e 100644
--- a/Objects/structseq.c
+++ b/Objects/structseq.c
@@ -16,14 +16,14 @@ _Py_IDENTIFIER(n_fields);
_Py_IDENTIFIER(n_unnamed_fields);
#define VISIBLE_SIZE(op) Py_SIZE(op)
-#define VISIBLE_SIZE_TP(tp) PyLong_AsLong( \
+#define VISIBLE_SIZE_TP(tp) PyLong_AsSsize_t( \
_PyDict_GetItemId((tp)->tp_dict, &PyId_n_sequence_fields))
-#define REAL_SIZE_TP(tp) PyLong_AsLong( \
+#define REAL_SIZE_TP(tp) PyLong_AsSsize_t( \
_PyDict_GetItemId((tp)->tp_dict, &PyId_n_fields))
#define REAL_SIZE(op) REAL_SIZE_TP(Py_TYPE(op))
-#define UNNAMED_FIELDS_TP(tp) PyLong_AsLong( \
+#define UNNAMED_FIELDS_TP(tp) PyLong_AsSsize_t( \
_PyDict_GetItemId((tp)->tp_dict, &PyId_n_unnamed_fields))
#define UNNAMED_FIELDS(op) UNNAMED_FIELDS_TP(Py_TYPE(op))
@@ -164,7 +164,8 @@ structseq_repr(PyStructSequence *obj)
#define TYPE_MAXSIZE 100
PyTypeObject *typ = Py_TYPE(obj);
- int i, removelast = 0;
+ Py_ssize_t i;
+ int removelast = 0;
Py_ssize_t len;
char buf[REPR_BUFFER_SIZE];
char *endofbuf, *pbuf = buf;
@@ -236,8 +237,7 @@ structseq_reduce(PyStructSequence* self)
PyObject* tup = NULL;
PyObject* dict = NULL;
PyObject* result;
- Py_ssize_t n_fields, n_visible_fields, n_unnamed_fields;
- int i;
+ Py_ssize_t n_fields, n_visible_fields, n_unnamed_fields, i;
n_fields = REAL_SIZE(self);
n_visible_fields = VISIBLE_SIZE(self);
@@ -325,7 +325,7 @@ PyStructSequence_InitType2(PyTypeObject *type, PyStructSequence_Desc *desc)
{
PyObject *dict;
PyMemberDef* members;
- int n_members, n_unnamed_members, i, k;
+ Py_ssize_t n_members, n_unnamed_members, i, k;
PyObject *v;
#ifdef Py_TRACE_REFS
@@ -373,9 +373,9 @@ PyStructSequence_InitType2(PyTypeObject *type, PyStructSequence_Desc *desc)
Py_INCREF(type);
dict = type->tp_dict;
-#define SET_DICT_FROM_INT(key, value) \
+#define SET_DICT_FROM_SIZE(key, value) \
do { \
- v = PyLong_FromLong((long) value); \
+ v = PyLong_FromSsize_t(value); \
if (v == NULL) \
return -1; \
if (PyDict_SetItemString(dict, key, v) < 0) { \
@@ -385,9 +385,9 @@ PyStructSequence_InitType2(PyTypeObject *type, PyStructSequence_Desc *desc)
Py_DECREF(v); \
} while (0)
- SET_DICT_FROM_INT(visible_length_key, desc->n_in_sequence);
- SET_DICT_FROM_INT(real_length_key, n_members);
- SET_DICT_FROM_INT(unnamed_fields_key, n_unnamed_members);
+ SET_DICT_FROM_SIZE(visible_length_key, desc->n_in_sequence);
+ SET_DICT_FROM_SIZE(real_length_key, n_members);
+ SET_DICT_FROM_SIZE(unnamed_fields_key, n_unnamed_members);
return 0;
}
diff --git a/Objects/typeobject.c b/Objects/typeobject.c
index bb83530d58..a311e99653 100644
--- a/Objects/typeobject.c
+++ b/Objects/typeobject.c
@@ -906,25 +906,33 @@ type_call(PyTypeObject *type, PyObject *args, PyObject *kwds)
#endif
obj = type->tp_new(type, args, kwds);
- if (obj != NULL) {
- /* Ugly exception: when the call was type(something),
- don't call tp_init on the result. */
- if (type == &PyType_Type &&
- PyTuple_Check(args) && PyTuple_GET_SIZE(args) == 1 &&
- (kwds == NULL ||
- (PyDict_Check(kwds) && PyDict_Size(kwds) == 0)))
- return obj;
- /* If the returned object is not an instance of type,
- it won't be initialized. */
- if (!PyType_IsSubtype(Py_TYPE(obj), type))
- return obj;
- type = Py_TYPE(obj);
- if (type->tp_init != NULL) {
- int res = type->tp_init(obj, args, kwds);
- if (res < 0) {
- Py_DECREF(obj);
- obj = NULL;
- }
+ obj = _Py_CheckFunctionResult((PyObject*)type, obj, NULL);
+ if (obj == NULL)
+ return NULL;
+
+ /* Ugly exception: when the call was type(something),
+ don't call tp_init on the result. */
+ if (type == &PyType_Type &&
+ PyTuple_Check(args) && PyTuple_GET_SIZE(args) == 1 &&
+ (kwds == NULL ||
+ (PyDict_Check(kwds) && PyDict_Size(kwds) == 0)))
+ return obj;
+
+ /* If the returned object is not an instance of type,
+ it won't be initialized. */
+ if (!PyType_IsSubtype(Py_TYPE(obj), type))
+ return obj;
+
+ type = Py_TYPE(obj);
+ if (type->tp_init != NULL) {
+ int res = type->tp_init(obj, args, kwds);
+ if (res < 0) {
+ assert(PyErr_Occurred());
+ Py_DECREF(obj);
+ obj = NULL;
+ }
+ else {
+ assert(!PyErr_Occurred());
}
}
return obj;
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 168f9f9923..4fd0430631 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -42,6 +42,7 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#include "Python.h"
#include "ucnhash.h"
#include "bytes_methods.h"
+#include "stringlib/eq.h"
#ifdef MS_WINDOWS
#include <windows.h>
@@ -292,6 +293,37 @@ static unsigned char ascii_linebreak[] = {
#include "clinic/unicodeobject.c.h"
+typedef enum {
+ _Py_ERROR_UNKNOWN=0,
+ _Py_ERROR_STRICT,
+ _Py_ERROR_SURROGATEESCAPE,
+ _Py_ERROR_SURROGATEPASS,
+ _Py_ERROR_REPLACE,
+ _Py_ERROR_IGNORE,
+ _Py_ERROR_XMLCHARREFREPLACE,
+ _Py_ERROR_OTHER
+} _Py_error_handler;
+
+static _Py_error_handler
+get_error_handler(const char *errors)
+{
+ if (errors == NULL)
+ return _Py_ERROR_STRICT;
+ if (strcmp(errors, "strict") == 0)
+ return _Py_ERROR_STRICT;
+ if (strcmp(errors, "surrogateescape") == 0)
+ return _Py_ERROR_SURROGATEESCAPE;
+ if (strcmp(errors, "surrogatepass") == 0)
+ return _Py_ERROR_SURROGATEPASS;
+ if (strcmp(errors, "ignore") == 0)
+ return _Py_ERROR_IGNORE;
+ if (strcmp(errors, "replace") == 0)
+ return _Py_ERROR_REPLACE;
+ if (strcmp(errors, "xmlcharrefreplace") == 0)
+ return _Py_ERROR_XMLCHARREFREPLACE;
+ return _Py_ERROR_OTHER;
+}
+
/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
This function is kept for backward compatibility with the old API. */
Py_UNICODE
@@ -3162,24 +3194,22 @@ wcstombs_errorpos(const wchar_t *wstr)
static int
locale_error_handler(const char *errors, int *surrogateescape)
{
- if (errors == NULL) {
- *surrogateescape = 0;
- return 0;
- }
-
- if (strcmp(errors, "strict") == 0) {
+ _Py_error_handler error_handler = get_error_handler(errors);
+ switch (error_handler)
+ {
+ case _Py_ERROR_STRICT:
*surrogateescape = 0;
return 0;
- }
- if (strcmp(errors, "surrogateescape") == 0) {
+ case _Py_ERROR_SURROGATEESCAPE:
*surrogateescape = 1;
return 0;
+ default:
+ PyErr_Format(PyExc_ValueError,
+ "only 'strict' and 'surrogateescape' error handlers "
+ "are supported, not '%s'",
+ errors);
+ return -1;
}
- PyErr_Format(PyExc_ValueError,
- "only 'strict' and 'surrogateescape' error handlers "
- "are supported, not '%s'",
- errors);
- return -1;
}
PyObject *
@@ -6390,7 +6420,7 @@ unicode_encode_call_errorhandler(const char *errors,
static PyObject *
unicode_encode_ucs1(PyObject *unicode,
const char *errors,
- unsigned int limit)
+ const Py_UCS4 limit)
{
/* input state */
Py_ssize_t pos=0, size;
@@ -6404,11 +6434,9 @@ unicode_encode_ucs1(PyObject *unicode,
Py_ssize_t ressize;
const char *encoding = (limit == 256) ? "latin-1" : "ascii";
const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
- PyObject *errorHandler = NULL;
+ PyObject *error_handler_obj = NULL;
PyObject *exc = NULL;
- /* the following variable is used for caching string comparisons
- * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
- int known_errorHandler = -1;
+ _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
if (PyUnicode_READY(unicode) == -1)
return NULL;
@@ -6426,12 +6454,12 @@ unicode_encode_ucs1(PyObject *unicode,
ressize = size;
while (pos < size) {
- Py_UCS4 c = PyUnicode_READ(kind, data, pos);
+ Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
/* can we encode this? */
- if (c<limit) {
+ if (ch < limit) {
/* no overflow check, because we know that the space is enough */
- *str++ = (char)c;
+ *str++ = (char)ch;
++pos;
}
else {
@@ -6442,38 +6470,35 @@ unicode_encode_ucs1(PyObject *unicode,
Py_ssize_t collstart = pos;
Py_ssize_t collend = pos;
/* find all unecodable characters */
+
while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
++collend;
+
/* cache callback name lookup (if not done yet, i.e. it's the first error) */
- if (known_errorHandler==-1) {
- if ((errors==NULL) || (!strcmp(errors, "strict")))
- known_errorHandler = 1;
- else if (!strcmp(errors, "replace"))
- known_errorHandler = 2;
- else if (!strcmp(errors, "ignore"))
- known_errorHandler = 3;
- else if (!strcmp(errors, "xmlcharrefreplace"))
- known_errorHandler = 4;
- else
- known_errorHandler = 0;
- }
- switch (known_errorHandler) {
- case 1: /* strict */
+ if (error_handler == _Py_ERROR_UNKNOWN)
+ error_handler = get_error_handler(errors);
+
+ switch (error_handler) {
+ case _Py_ERROR_STRICT:
raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
goto onError;
- case 2: /* replace */
- while (collstart++ < collend)
- *str++ = '?'; /* fall through */
- case 3: /* ignore */
+
+ case _Py_ERROR_REPLACE:
+ memset(str, '?', collend - collstart);
+ str += (collend - collstart);
+ /* fall through ignore error handler */
+ case _Py_ERROR_IGNORE:
pos = collend;
break;
- case 4: /* xmlcharrefreplace */
+
+ case _Py_ERROR_XMLCHARREFREPLACE:
respos = str - PyBytes_AS_STRING(res);
requiredsize = respos;
/* determine replacement size */
for (i = collstart; i < collend; ++i) {
- Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Py_ssize_t incr;
+
+ ch = PyUnicode_READ(kind, data, i);
if (ch < 10)
incr = 2+1+1;
else if (ch < 100)
@@ -6511,13 +6536,31 @@ unicode_encode_ucs1(PyObject *unicode,
}
pos = collend;
break;
+
+ case _Py_ERROR_SURROGATEESCAPE:
+ for (i = collstart; i < collend; ++i) {
+ ch = PyUnicode_READ(kind, data, i);
+ if (ch < 0xdc80 || 0xdcff < ch) {
+ /* Not a UTF-8b surrogate */
+ break;
+ }
+ *str++ = (char)(ch - 0xdc00);
+ ++pos;
+ }
+ if (i >= collend)
+ break;
+ collstart = pos;
+ assert(collstart != collend);
+ /* fallback to general error handling */
+
default:
- repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
+ repunicode = unicode_encode_call_errorhandler(errors, &error_handler_obj,
encoding, reason, unicode, &exc,
collstart, collend, &newpos);
if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
PyUnicode_READY(repunicode) == -1))
goto onError;
+
if (PyBytes_Check(repunicode)) {
/* Directly copy bytes result to output. */
repsize = PyBytes_Size(repunicode);
@@ -6541,6 +6584,7 @@ unicode_encode_ucs1(PyObject *unicode,
Py_DECREF(repunicode);
break;
}
+
/* need more space? (at least enough for what we
have+the replacement+the rest of the string, so
we won't have to check space for encodable characters) */
@@ -6563,17 +6607,18 @@ unicode_encode_ucs1(PyObject *unicode,
str = PyBytes_AS_STRING(res) + respos;
ressize = requiredsize;
}
+
/* check if there is anything unencodable in the replacement
and copy it to the output */
for (i = 0; repsize-->0; ++i, ++str) {
- c = PyUnicode_READ_CHAR(repunicode, i);
- if (c >= limit) {
+ ch = PyUnicode_READ_CHAR(repunicode, i);
+ if (ch >= limit) {
raise_encode_exception(&exc, encoding, unicode,
pos, pos+1, reason);
Py_DECREF(repunicode);
goto onError;
}
- *str = (char)c;
+ *str = (char)ch;
}
pos = newpos;
Py_DECREF(repunicode);
@@ -6588,7 +6633,7 @@ unicode_encode_ucs1(PyObject *unicode,
goto onError;
}
- Py_XDECREF(errorHandler);
+ Py_XDECREF(error_handler_obj);
Py_XDECREF(exc);
return res;
@@ -6598,7 +6643,7 @@ unicode_encode_ucs1(PyObject *unicode,
onError:
Py_XDECREF(res);
- Py_XDECREF(errorHandler);
+ Py_XDECREF(error_handler_obj);
Py_XDECREF(exc);
return NULL;
}
@@ -6658,8 +6703,9 @@ PyUnicode_DecodeASCII(const char *s,
Py_ssize_t endinpos;
Py_ssize_t outpos;
const char *e;
- PyObject *errorHandler = NULL;
+ PyObject *error_handler_obj = NULL;
PyObject *exc = NULL;
+ _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
if (size == 0)
_Py_RETURN_UNICODE_EMPTY();
@@ -6688,12 +6734,42 @@ PyUnicode_DecodeASCII(const char *s,
PyUnicode_WRITE(kind, data, writer.pos, c);
writer.pos++;
++s;
+ continue;
}
- else {
+
+ /* byte outsize range 0x00..0x7f: call the error handler */
+
+ if (error_handler == _Py_ERROR_UNKNOWN)
+ error_handler = get_error_handler(errors);
+
+ switch (error_handler)
+ {
+ case _Py_ERROR_REPLACE:
+ case _Py_ERROR_SURROGATEESCAPE:
+ /* Fast-path: the error handler only writes one character,
+ but we may switch to UCS2 at the first write */
+ if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
+ goto onError;
+ kind = writer.kind;
+ data = writer.data;
+
+ if (error_handler == _Py_ERROR_REPLACE)
+ PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
+ else
+ PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
+ writer.pos++;
+ ++s;
+ break;
+
+ case _Py_ERROR_IGNORE:
+ ++s;
+ break;
+
+ default:
startinpos = s-starts;
endinpos = startinpos + 1;
if (unicode_decode_call_errorhandler_writer(
- errors, &errorHandler,
+ errors, &error_handler_obj,
"ascii", "ordinal not in range(128)",
&starts, &e, &startinpos, &endinpos, &exc, &s,
&writer))
@@ -6702,13 +6778,13 @@ PyUnicode_DecodeASCII(const char *s,
data = writer.data;
}
}
- Py_XDECREF(errorHandler);
+ Py_XDECREF(error_handler_obj);
Py_XDECREF(exc);
return _PyUnicodeWriter_Finish(&writer);
onError:
_PyUnicodeWriter_Dealloc(&writer);
- Py_XDECREF(errorHandler);
+ Py_XDECREF(error_handler_obj);
Py_XDECREF(exc);
return NULL;
}
@@ -8074,7 +8150,7 @@ static int
charmap_encoding_error(
PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
PyObject **exceptionObject,
- int *known_errorHandler, PyObject **errorHandler, const char *errors,
+ _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
PyObject **res, Py_ssize_t *respos)
{
PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
@@ -8121,23 +8197,15 @@ charmap_encoding_error(
}
/* cache callback name lookup
* (if not done yet, i.e. it's the first error) */
- if (*known_errorHandler==-1) {
- if ((errors==NULL) || (!strcmp(errors, "strict")))
- *known_errorHandler = 1;
- else if (!strcmp(errors, "replace"))
- *known_errorHandler = 2;
- else if (!strcmp(errors, "ignore"))
- *known_errorHandler = 3;
- else if (!strcmp(errors, "xmlcharrefreplace"))
- *known_errorHandler = 4;
- else
- *known_errorHandler = 0;
- }
- switch (*known_errorHandler) {
- case 1: /* strict */
+ if (*error_handler == _Py_ERROR_UNKNOWN)
+ *error_handler = get_error_handler(errors);
+
+ switch (*error_handler) {
+ case _Py_ERROR_STRICT:
raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
return -1;
- case 2: /* replace */
+
+ case _Py_ERROR_REPLACE:
for (collpos = collstartpos; collpos<collendpos; ++collpos) {
x = charmapencode_output('?', mapping, res, respos);
if (x==enc_EXCEPTION) {
@@ -8149,10 +8217,11 @@ charmap_encoding_error(
}
}
/* fall through */
- case 3: /* ignore */
+ case _Py_ERROR_IGNORE:
*inpos = collendpos;
break;
- case 4: /* xmlcharrefreplace */
+
+ case _Py_ERROR_XMLCHARREFREPLACE:
/* generate replacement (temporarily (mis)uses p) */
for (collpos = collstartpos; collpos < collendpos; ++collpos) {
char buffer[2+29+1+1];
@@ -8170,8 +8239,9 @@ charmap_encoding_error(
}
*inpos = collendpos;
break;
+
default:
- repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
+ repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
encoding, reason, unicode, exceptionObject,
collstartpos, collendpos, &newpos);
if (repunicode == NULL)
@@ -8234,12 +8304,9 @@ _PyUnicode_EncodeCharmap(PyObject *unicode,
Py_ssize_t size;
/* current output position */
Py_ssize_t respos = 0;
- PyObject *errorHandler = NULL;
+ PyObject *error_handler_obj = NULL;
PyObject *exc = NULL;
- /* the following variable is used for caching string comparisons
- * -1=not initialized, 0=unknown, 1=strict, 2=replace,
- * 3=ignore, 4=xmlcharrefreplace */
- int known_errorHandler = -1;
+ _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
void *data;
int kind;
@@ -8270,7 +8337,7 @@ _PyUnicode_EncodeCharmap(PyObject *unicode,
if (x==enc_FAILED) { /* unencodable character */
if (charmap_encoding_error(unicode, &inpos, mapping,
&exc,
- &known_errorHandler, &errorHandler, errors,
+ &error_handler, &error_handler_obj, errors,
&res, &respos)) {
goto onError;
}
@@ -8286,13 +8353,13 @@ _PyUnicode_EncodeCharmap(PyObject *unicode,
goto onError;
Py_XDECREF(exc);
- Py_XDECREF(errorHandler);
+ Py_XDECREF(error_handler_obj);
return res;
onError:
Py_XDECREF(res);
Py_XDECREF(exc);
- Py_XDECREF(errorHandler);
+ Py_XDECREF(error_handler_obj);
return NULL;
}
@@ -8618,7 +8685,7 @@ exit:
return res;
}
-PyObject *
+static PyObject *
_PyUnicode_TranslateCharmap(PyObject *input,
PyObject *mapping,
const char *errors)
@@ -10889,6 +10956,12 @@ PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
}
int
+_PyUnicode_EQ(PyObject *aa, PyObject *bb)
+{
+ return unicode_eq(aa, bb);
+}
+
+int
PyUnicode_Contains(PyObject *container, PyObject *element)
{
PyObject *str, *sub;
@@ -13258,7 +13331,9 @@ _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
Py_ssize_t newlen;
PyObject *newbuffer;
- assert(length > 0);
+ /* ensure that the _PyUnicodeWriter_Prepare macro was used */
+ assert((maxchar > writer->maxchar && length >= 0)
+ || length > 0);
if (length > PY_SSIZE_T_MAX - writer->pos) {
PyErr_NoMemory();
@@ -13325,6 +13400,28 @@ _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
#undef OVERALLOCATE_FACTOR
}
+int
+_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
+ enum PyUnicode_Kind kind)
+{
+ Py_UCS4 maxchar;
+
+ /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
+ assert(writer->kind < kind);
+
+ switch (kind)
+ {
+ case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
+ case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
+ case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
+ default:
+ assert(0 && "invalid kind");
+ return -1;
+ }
+
+ return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
+}
+
Py_LOCAL_INLINE(int)
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
{