diff options
author | Lorry <lorry@roadtrain.codethink.co.uk> | 2012-08-22 15:47:16 +0100 |
---|---|---|
committer | Lorry <lorry@roadtrain.codethink.co.uk> | 2012-08-22 15:47:16 +0100 |
commit | 25335618bf8755ce6b116ee14f47f5a1f2c821e9 (patch) | |
tree | d889d7ab3f9f985d0c54c534cb8052bd2e6d7163 /bzrlib/_simple_set_pyx.pyx | |
download | bzr-tarball-25335618bf8755ce6b116ee14f47f5a1f2c821e9.tar.gz |
Tarball conversion
Diffstat (limited to 'bzrlib/_simple_set_pyx.pyx')
-rw-r--r-- | bzrlib/_simple_set_pyx.pyx | 606 |
1 files changed, 606 insertions, 0 deletions
diff --git a/bzrlib/_simple_set_pyx.pyx b/bzrlib/_simple_set_pyx.pyx new file mode 100644 index 0000000..a9cee98 --- /dev/null +++ b/bzrlib/_simple_set_pyx.pyx @@ -0,0 +1,606 @@ +# Copyright (C) 2009, 2010 Canonical Ltd +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +"""Definition of a class that is similar to Set with some small changes.""" + +cdef extern from "python-compat.h": + pass + +cdef extern from "Python.h": + ctypedef unsigned long size_t + ctypedef long (*hashfunc)(PyObject*) except -1 + ctypedef object (*richcmpfunc)(PyObject *, PyObject *, int) + ctypedef int (*visitproc)(PyObject *, void *) + ctypedef int (*traverseproc)(PyObject *, visitproc, void *) + int Py_EQ + void Py_INCREF(PyObject *) + void Py_DECREF(PyObject *) + ctypedef struct PyTypeObject: + hashfunc tp_hash + richcmpfunc tp_richcompare + traverseproc tp_traverse + + PyTypeObject *Py_TYPE(PyObject *) + # Note: *Don't* use hash(), Pyrex 0.9.8.5 thinks it returns an 'int', and + # thus silently truncates to 32-bits on 64-bit machines. + long PyObject_Hash(PyObject *) except -1 + + void *PyMem_Malloc(size_t nbytes) + void PyMem_Free(void *) + void memset(void *, int, size_t) + + +# Dummy is an object used to mark nodes that have been deleted. Since +# collisions require us to move a node to an alternative location, if we just +# set an entry to NULL on delete, we won't find any relocated nodes. +# We have to use _dummy_obj because we need to keep a refcount to it, but we +# also use _dummy as a pointer, because it avoids having to put <PyObject*> all +# over the code base. +cdef object _dummy_obj +cdef PyObject *_dummy +_dummy_obj = object() +_dummy = <PyObject *>_dummy_obj + + +cdef object _NotImplemented +_NotImplemented = NotImplemented + + +cdef int _is_equal(PyObject *this, long this_hash, PyObject *other) except -1: + cdef long other_hash + + if this == other: + return 1 + other_hash = PyObject_Hash(other) + if other_hash != this_hash: + return 0 + + # This implements a subset of the PyObject_RichCompareBool functionality. + # Namely it: + # 1) Doesn't try to do anything with old-style classes + # 2) Assumes that both objects have a tp_richcompare implementation, and + # that if that is not enough to compare equal, then they are not + # equal. (It doesn't try to cast them both to some intermediate form + # that would compare equal.) + res = Py_TYPE(this).tp_richcompare(this, other, Py_EQ) + if res is _NotImplemented: + res = Py_TYPE(other).tp_richcompare(other, this, Py_EQ) + if res is _NotImplemented: + return 0 + if res: + return 1 + return 0 + + +cdef public api class SimpleSet [object SimpleSetObject, type SimpleSet_Type]: + """This class can be used to track canonical forms for objects. + + It is similar in function to the interned dictionary that is used by + strings. However: + + 1) It assumes that hash(obj) is cheap, so does not need to inline a copy + of it + 2) It only stores one reference to the object, rather than 2 (key vs + key:value) + + As such, it uses 1/3rd the amount of memory to store a pointer to the + interned object. + """ + # Attributes are defined in the .pxd file + DEF DEFAULT_SIZE=1024 + + def __init__(self): + cdef Py_ssize_t size, n_bytes + + size = DEFAULT_SIZE + self._mask = size - 1 + self._used = 0 + self._fill = 0 + n_bytes = sizeof(PyObject*) * size; + self._table = <PyObject **>PyMem_Malloc(n_bytes) + if self._table == NULL: + raise MemoryError() + memset(self._table, 0, n_bytes) + + def __sizeof__(self): + # Note: Pyrex doesn't allow sizeof(class) so we re-implement it here. + # Bits are: + # 1: PyObject + # 2: vtable * + # 3: 3 Py_ssize_t + # 4: PyObject** + # Note that we might get alignment, etc, wrong, but at least this is + # better than no estimate at all + # return sizeof(SimpleSet) + (self._mask + 1) * (sizeof(PyObject*)) + return (sizeof(PyObject) + sizeof(void*) + + 3*sizeof(Py_ssize_t) + sizeof(PyObject**) + + (self._mask + 1) * sizeof(PyObject*)) + + def __dealloc__(self): + if self._table != NULL: + PyMem_Free(self._table) + self._table = NULL + + property used: + def __get__(self): + return self._used + + property fill: + def __get__(self): + return self._fill + + property mask: + def __get__(self): + return self._mask + + def _memory_size(self): + """Return the number of bytes of memory consumed by this class.""" + return sizeof(self) + (sizeof(PyObject*)*(self._mask + 1)) + + def __len__(self): + return self._used + + def _test_lookup(self, key): + cdef PyObject **slot + + slot = _lookup(self, key) + if slot[0] == NULL: + res = '<null>' + elif slot[0] == _dummy: + res = '<dummy>' + else: + res = <object>slot[0] + return <int>(slot - self._table), res + + def __contains__(self, key): + """Is key present in this SimpleSet.""" + cdef PyObject **slot + + slot = _lookup(self, key) + if slot[0] == NULL or slot[0] == _dummy: + return False + return True + + cdef PyObject *_get(self, object key) except? NULL: + """Return the object (or nothing) define at the given location.""" + cdef PyObject **slot + + slot = _lookup(self, key) + if slot[0] == NULL or slot[0] == _dummy: + return NULL + return slot[0] + + def __getitem__(self, key): + """Return a stored item that is equivalent to key.""" + cdef PyObject *py_val + + py_val = self._get(key) + if py_val == NULL: + raise KeyError("Key %s is not present" % key) + val = <object>(py_val) + return val + + cdef int _insert_clean(self, PyObject *key) except -1: + """Insert a key into self.table. + + This is only meant to be used during times like '_resize', + as it makes a lot of assuptions about keys not already being present, + and there being no dummy entries. + """ + cdef size_t i, n_lookup + cdef long the_hash + cdef PyObject **table, **slot + cdef Py_ssize_t mask + + mask = self._mask + table = self._table + + the_hash = PyObject_Hash(key) + i = the_hash + for n_lookup from 0 <= n_lookup <= <size_t>mask: # Don't loop forever + slot = &table[i & mask] + if slot[0] == NULL: + slot[0] = key + self._fill = self._fill + 1 + self._used = self._used + 1 + return 1 + i = i + 1 + n_lookup + raise RuntimeError('ran out of slots.') + + def _py_resize(self, min_used): + """Do not use this directly, it is only exposed for testing.""" + return self._resize(min_used) + + cdef Py_ssize_t _resize(self, Py_ssize_t min_used) except -1: + """Resize the internal table. + + The final table will be big enough to hold at least min_used entries. + We will copy the data from the existing table over, leaving out dummy + entries. + + :return: The new size of the internal table + """ + cdef Py_ssize_t new_size, n_bytes, remaining + cdef PyObject **new_table, **old_table, **slot + + new_size = DEFAULT_SIZE + while new_size <= min_used and new_size > 0: + new_size = new_size << 1 + # We rolled over our signed size field + if new_size <= 0: + raise MemoryError() + # Even if min_used == self._mask + 1, and we aren't changing the actual + # size, we will still run the algorithm so that dummy entries are + # removed + # TODO: Test this + # if new_size < self._used: + # raise RuntimeError('cannot shrink SimpleSet to something' + # ' smaller than the number of used slots.') + n_bytes = sizeof(PyObject*) * new_size; + new_table = <PyObject **>PyMem_Malloc(n_bytes) + if new_table == NULL: + raise MemoryError() + + old_table = self._table + self._table = new_table + memset(self._table, 0, n_bytes) + self._mask = new_size - 1 + self._used = 0 + remaining = self._fill + self._fill = 0 + + # Moving everything to the other table is refcount neutral, so we don't + # worry about it. + slot = old_table + while remaining > 0: + if slot[0] == NULL: # unused slot + pass + elif slot[0] == _dummy: # dummy slot + remaining = remaining - 1 + else: # active slot + remaining = remaining - 1 + self._insert_clean(slot[0]) + slot = slot + 1 + PyMem_Free(old_table) + return new_size + + def add(self, key): + """Similar to set.add(), start tracking this key. + + There is one small difference, which is that we return the object that + is stored at the given location. (which is closer to the + dict.setdefault() functionality.) + """ + return self._add(key) + + cdef object _add(self, key): + cdef PyObject **slot, *py_key + cdef int added + + py_key = <PyObject *>key + if (Py_TYPE(py_key).tp_richcompare == NULL + or Py_TYPE(py_key).tp_hash == NULL): + raise TypeError('Types added to SimpleSet must implement' + ' both tp_richcompare and tp_hash') + added = 0 + # We need at least one empty slot + assert self._used < self._mask + slot = _lookup(self, key) + if (slot[0] == NULL): + Py_INCREF(py_key) + self._fill = self._fill + 1 + self._used = self._used + 1 + slot[0] = py_key + added = 1 + elif (slot[0] == _dummy): + Py_INCREF(py_key) + self._used = self._used + 1 + slot[0] = py_key + added = 1 + # No else: clause. If _lookup returns a pointer to + # a live object, then we already have a value at this location. + retval = <object>(slot[0]) + # PySet and PyDict use a 2-3rds full algorithm, we'll follow suit + if added and (self._fill * 3) >= ((self._mask + 1) * 2): + # However, we always work for a load factor of 2:1 + self._resize(self._used * 2) + # Even if we resized and ended up moving retval into a different slot, + # it is still the value that is held at the slot equivalent to 'key', + # so we can still return it + return retval + + def discard(self, key): + """Remove key from the set, whether it exists or not. + + :return: False if the item did not exist, True if it did + """ + if self._discard(key): + return True + return False + + cdef int _discard(self, key) except -1: + cdef PyObject **slot, *py_key + + slot = _lookup(self, key) + if slot[0] == NULL or slot[0] == _dummy: + return 0 + self._used = self._used - 1 + Py_DECREF(slot[0]) + slot[0] = _dummy + # PySet uses the heuristic: If more than 1/5 are dummies, then resize + # them away + # if ((so->_fill - so->_used) * 5 < so->mask) + # However, we are planning on using this as an interning structure, in + # which we will be putting a lot of objects. And we expect that large + # groups of them are going to have the same lifetime. + # Dummy entries hurt a little bit because they cause the lookup to keep + # searching, but resizing is also rather expensive + # For now, we'll just use their algorithm, but we may want to revisit + # it + if ((self._fill - self._used) * 5 > self._mask): + self._resize(self._used * 2) + return 1 + + def __iter__(self): + return _SimpleSet_iterator(self) + + +cdef class _SimpleSet_iterator: + """Iterator over the SimpleSet structure.""" + + cdef Py_ssize_t pos + cdef SimpleSet set + cdef Py_ssize_t _used # track if things have been mutated while iterating + cdef Py_ssize_t len # number of entries left + + def __init__(self, obj): + self.set = obj + self.pos = 0 + self._used = self.set._used + self.len = self.set._used + + def __iter__(self): + return self + + def __next__(self): + cdef Py_ssize_t mask, i + cdef PyObject *key + + if self.set is None: + raise StopIteration + if self.set._used != self._used: + # Force this exception to continue to be raised + self._used = -1 + raise RuntimeError("Set size changed during iteration") + if not SimpleSet_Next(self.set, &self.pos, &key): + self.set = None + raise StopIteration + # we found something + the_key = <object>key # INCREF + self.len = self.len - 1 + return the_key + + def __length_hint__(self): + if self.set is not None and self._used == self.set._used: + return self.len + return 0 + + + +cdef api SimpleSet SimpleSet_New(): + """Create a new SimpleSet object.""" + return SimpleSet() + + +cdef SimpleSet _check_self(object self): + """Check that the parameter is not None. + + Pyrex/Cython will do type checking, but only to ensure that an object is + either the right type or None. You can say "object foo not None" for pure + python functions, but not for C functions. + So this is just a helper for all the apis that need to do the check. + """ + cdef SimpleSet true_self + if self is None: + raise TypeError('self must not be None') + true_self = self + return true_self + + +cdef PyObject **_lookup(SimpleSet self, object key) except NULL: + """Find the slot where 'key' would fit. + + This is the same as a dicts 'lookup' function. + + :param key: An object we are looking up + :param hash: The hash for key + :return: The location in self.table where key should be put. + location == NULL is an exception, but (*location) == NULL just + indicates the slot is empty and can be used. + """ + # This uses Quadratic Probing: + # http://en.wikipedia.org/wiki/Quadratic_probing + # with c1 = c2 = 1/2 + # This leads to probe locations at: + # h0 = hash(k1) + # h1 = h0 + 1 + # h2 = h0 + 3 = h1 + 1 + 1 + # h3 = h0 + 6 = h2 + 1 + 2 + # h4 = h0 + 10 = h2 + 1 + 3 + # Note that all of these are '& mask', but that is computed *after* the + # offset. + # This differs from the algorithm used by Set and Dict. Which, effectively, + # use double-hashing, and a step size that starts large, but dwindles to + # stepping one-by-one. + # This gives more 'locality' in that if you have a collision at offset X, + # the first fallback is X+1, which is fast to check. However, that means + # that an object w/ hash X+1 will also check there, and then X+2 next. + # However, for objects with differing hashes, their chains are different. + # The former checks X, X+1, X+3, ... the latter checks X+1, X+2, X+4, ... + # So different hashes diverge quickly. + # A bigger problem is that we *only* ever use the lowest bits of the hash + # So all integers (x + SIZE*N) will resolve into the same bucket, and all + # use the same collision resolution. We may want to try to find a way to + # incorporate the upper bits of the hash with quadratic probing. (For + # example, X, X+1, X+3+some_upper_bits, X+6+more_upper_bits, etc.) + cdef size_t i, n_lookup + cdef Py_ssize_t mask + cdef long key_hash + cdef PyObject **table, **slot, *cur, **free_slot, *py_key + + py_key = <PyObject *>key + # Note: avoid using hash(obj) because of a bug w/ pyrex 0.9.8.5 and 64-bit + # (it treats hash() as returning an 'int' rather than a 'long') + key_hash = PyObject_Hash(py_key) + i = <size_t>key_hash + mask = self._mask + table = self._table + free_slot = NULL + for n_lookup from 0 <= n_lookup <= <size_t>mask: # Don't loop forever + slot = &table[i & mask] + cur = slot[0] + if cur == NULL: + # Found a blank spot + if free_slot != NULL: + # Did we find an earlier _dummy entry? + return free_slot + else: + return slot + if cur == py_key: + # Found an exact pointer to the key + return slot + if cur == _dummy: + if free_slot == NULL: + free_slot = slot + elif _is_equal(py_key, key_hash, cur): + # Both py_key and cur belong in this slot, return it + return slot + i = i + 1 + n_lookup + raise AssertionError('should never get here') + + +cdef api PyObject **_SimpleSet_Lookup(object self, object key) except NULL: + """Find the slot where 'key' would fit. + + This is the same as a dicts 'lookup' function. This is a private + api because mutating what you get without maintaing the other invariants + is a 'bad thing'. + + :param key: An object we are looking up + :param hash: The hash for key + :return: The location in self._table where key should be put + should never be NULL, but may reference a NULL (PyObject*) + """ + return _lookup(_check_self(self), key) + + +cdef api object SimpleSet_Add(object self, object key): + """Add a key to the SimpleSet (set). + + :param self: The SimpleSet to add the key to. + :param key: The key to be added. If the key is already present, + self will not be modified + :return: The current key stored at the location defined by 'key'. + This may be the same object, or it may be an equivalent object. + (consider dict.setdefault(key, key)) + """ + return _check_self(self)._add(key) + + +cdef api int SimpleSet_Contains(object self, object key) except -1: + """Is key present in self?""" + return (key in _check_self(self)) + + +cdef api int SimpleSet_Discard(object self, object key) except -1: + """Remove the object referenced at location 'key'. + + :param self: The SimpleSet being modified + :param key: The key we are checking on + :return: 1 if there was an object present, 0 if there was not, and -1 on + error. + """ + return _check_self(self)._discard(key) + + +cdef api PyObject *SimpleSet_Get(SimpleSet self, object key) except? NULL: + """Get a pointer to the object present at location 'key'. + + This returns an object which is equal to key which was previously added to + self. This returns a borrowed reference, as it may also return NULL if no + value is present at that location. + + :param key: The value we are looking for + :return: The object present at that location + """ + return _check_self(self)._get(key) + + +cdef api Py_ssize_t SimpleSet_Size(object self) except -1: + """Get the number of active entries in 'self'""" + return _check_self(self)._used + + +cdef api int SimpleSet_Next(object self, Py_ssize_t *pos, + PyObject **key) except -1: + """Walk over items in a SimpleSet. + + :param pos: should be initialized to 0 by the caller, and will be updated + by this function + :param key: Will return a borrowed reference to key + :return: 0 if nothing left, 1 if we are returning a new value + """ + cdef Py_ssize_t i, mask + cdef SimpleSet true_self + cdef PyObject **table + true_self = _check_self(self) + i = pos[0] + if (i < 0): + return 0 + mask = true_self._mask + table= true_self._table + while (i <= mask and (table[i] == NULL or table[i] == _dummy)): + i = i + 1 + pos[0] = i + 1 + if (i > mask): + return 0 # All done + if (key != NULL): + key[0] = table[i] + return 1 + + +cdef int SimpleSet_traverse(SimpleSet self, visitproc visit, + void *arg) except -1: + """This is an implementation of 'tp_traverse' that hits the whole table. + + Cython/Pyrex don't seem to let you define a tp_traverse, and they only + define one for you if you have an 'object' attribute. Since they don't + support C arrays of objects, we access the PyObject * directly. + """ + cdef Py_ssize_t pos + cdef PyObject *next_key + cdef int ret + + pos = 0 + while SimpleSet_Next(self, &pos, &next_key): + ret = visit(next_key, arg) + if ret: + return ret + return 0 + +# It is a little bit ugly to do this, but it works, and means that Meliae can +# dump the total memory consumed by all child objects. +(<PyTypeObject *>SimpleSet).tp_traverse = <traverseproc>SimpleSet_traverse |