path: root/bzrlib/_simple_set_pyx.pyx
author     Lorry <lorry@roadtrain.codethink.co.uk>  2012-08-22 15:47:16 +0100
committer  Lorry <lorry@roadtrain.codethink.co.uk>  2012-08-22 15:47:16 +0100
commit     25335618bf8755ce6b116ee14f47f5a1f2c821e9 (patch)
tree       d889d7ab3f9f985d0c54c534cb8052bd2e6d7163 /bzrlib/_simple_set_pyx.pyx
download   bzr-tarball-25335618bf8755ce6b116ee14f47f5a1f2c821e9.tar.gz
Tarball conversion
Diffstat (limited to 'bzrlib/_simple_set_pyx.pyx')
-rw-r--r--  bzrlib/_simple_set_pyx.pyx  606
1 files changed, 606 insertions, 0 deletions
diff --git a/bzrlib/_simple_set_pyx.pyx b/bzrlib/_simple_set_pyx.pyx
new file mode 100644
index 0000000..a9cee98
--- /dev/null
+++ b/bzrlib/_simple_set_pyx.pyx
@@ -0,0 +1,606 @@
+# Copyright (C) 2009, 2010 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+"""Definition of a class that is similar to Set with some small changes."""
+
+cdef extern from "python-compat.h":
+ pass
+
+cdef extern from "Python.h":
+ ctypedef unsigned long size_t
+ ctypedef long (*hashfunc)(PyObject*) except -1
+ ctypedef object (*richcmpfunc)(PyObject *, PyObject *, int)
+ ctypedef int (*visitproc)(PyObject *, void *)
+ ctypedef int (*traverseproc)(PyObject *, visitproc, void *)
+ int Py_EQ
+ void Py_INCREF(PyObject *)
+ void Py_DECREF(PyObject *)
+ ctypedef struct PyTypeObject:
+ hashfunc tp_hash
+ richcmpfunc tp_richcompare
+ traverseproc tp_traverse
+
+ PyTypeObject *Py_TYPE(PyObject *)
+ # Note: *Don't* use hash(), Pyrex 0.9.8.5 thinks it returns an 'int', and
+ # thus silently truncates to 32-bits on 64-bit machines.
+ long PyObject_Hash(PyObject *) except -1
+
+ void *PyMem_Malloc(size_t nbytes)
+ void PyMem_Free(void *)
+ void memset(void *, int, size_t)
+
+
+# Dummy is an object used to mark nodes that have been deleted. Since
+# collisions require us to move a node to an alternative location, if we just
+# set an entry to NULL on delete, we won't find any relocated nodes.
+# We have to use _dummy_obj because we need to keep a refcount to it, but we
+# also use _dummy as a pointer, because it avoids having to put <PyObject*> all
+# over the code base.
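+# As an illustrative example: suppose keys A and B hash to the same slot, so B
+# ended up stored one probe further along the chain.  If removing A simply set
+# its slot back to NULL, a later lookup of B would stop at that empty slot and
+# conclude B was absent.  Storing _dummy instead keeps the probe chain intact.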
+cdef object _dummy_obj
+cdef PyObject *_dummy
+_dummy_obj = object()
+_dummy = <PyObject *>_dummy_obj
+
+
+cdef object _NotImplemented
+_NotImplemented = NotImplemented
+
+
+cdef int _is_equal(PyObject *this, long this_hash, PyObject *other) except -1:
+ cdef long other_hash
+
+ if this == other:
+ return 1
+ other_hash = PyObject_Hash(other)
+ if other_hash != this_hash:
+ return 0
+
+ # This implements a subset of the PyObject_RichCompareBool functionality.
+ # Namely it:
+ # 1) Doesn't try to do anything with old-style classes
+ # 2) Assumes that both objects have a tp_richcompare implementation, and
+ # that if that is not enough to compare equal, then they are not
+ # equal. (It doesn't try to cast them both to some intermediate form
+ # that would compare equal.)
+ res = Py_TYPE(this).tp_richcompare(this, other, Py_EQ)
+ if res is _NotImplemented:
+ res = Py_TYPE(other).tp_richcompare(other, this, Py_EQ)
+ if res is _NotImplemented:
+ return 0
+ if res:
+ return 1
+ return 0
+
+
+cdef public api class SimpleSet [object SimpleSetObject, type SimpleSet_Type]:
+ """This class can be used to track canonical forms for objects.
+
+ It is similar in function to the interned dictionary that is used by
+ strings. However:
+
+ 1) It assumes that hash(obj) is cheap, so does not need to inline a copy
+ of it
+ 2) It only stores one reference to the object, rather than 2 (key vs
+ key:value)
+
+ As such, it uses 1/3rd the amount of memory to store a pointer to the
+ interned object.
+ """
+ # Attributes are defined in the .pxd file
+ DEF DEFAULT_SIZE=1024
+
+ def __init__(self):
+ cdef Py_ssize_t size, n_bytes
+
+ size = DEFAULT_SIZE
+ self._mask = size - 1
+ self._used = 0
+ self._fill = 0
+        n_bytes = sizeof(PyObject*) * size
+ self._table = <PyObject **>PyMem_Malloc(n_bytes)
+ if self._table == NULL:
+ raise MemoryError()
+ memset(self._table, 0, n_bytes)
+
+ def __sizeof__(self):
+ # Note: Pyrex doesn't allow sizeof(class) so we re-implement it here.
+ # Bits are:
+ # 1: PyObject
+ # 2: vtable *
+ # 3: 3 Py_ssize_t
+ # 4: PyObject**
+ # Note that we might get alignment, etc, wrong, but at least this is
+ # better than no estimate at all
+ # return sizeof(SimpleSet) + (self._mask + 1) * (sizeof(PyObject*))
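+        # As a rough illustration (assuming a typical 64-bit build where
+        # sizeof(PyObject) == 16 and pointers and Py_ssize_t are 8 bytes):
+        # the fixed part is 16 + 8 + 3*8 + 8 == 56 bytes, and the default
+        # 1024-slot table adds another 1024 * 8 == 8192 bytes.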
+ return (sizeof(PyObject) + sizeof(void*)
+ + 3*sizeof(Py_ssize_t) + sizeof(PyObject**)
+ + (self._mask + 1) * sizeof(PyObject*))
+
+ def __dealloc__(self):
+ if self._table != NULL:
+ PyMem_Free(self._table)
+ self._table = NULL
+
+ property used:
+ def __get__(self):
+ return self._used
+
+ property fill:
+ def __get__(self):
+ return self._fill
+
+ property mask:
+ def __get__(self):
+ return self._mask
+
+ def _memory_size(self):
+ """Return the number of bytes of memory consumed by this class."""
+ return sizeof(self) + (sizeof(PyObject*)*(self._mask + 1))
+
+ def __len__(self):
+ return self._used
+
+ def _test_lookup(self, key):
+ cdef PyObject **slot
+
+ slot = _lookup(self, key)
+ if slot[0] == NULL:
+ res = '<null>'
+ elif slot[0] == _dummy:
+ res = '<dummy>'
+ else:
+ res = <object>slot[0]
+ return <int>(slot - self._table), res
+
+ def __contains__(self, key):
+        """Is key present in this SimpleSet?"""
+ cdef PyObject **slot
+
+ slot = _lookup(self, key)
+ if slot[0] == NULL or slot[0] == _dummy:
+ return False
+ return True
+
+ cdef PyObject *_get(self, object key) except? NULL:
+        """Return the object (or NULL) stored at the given location."""
+ cdef PyObject **slot
+
+ slot = _lookup(self, key)
+ if slot[0] == NULL or slot[0] == _dummy:
+ return NULL
+ return slot[0]
+
+ def __getitem__(self, key):
+ """Return a stored item that is equivalent to key."""
+ cdef PyObject *py_val
+
+ py_val = self._get(key)
+ if py_val == NULL:
+ raise KeyError("Key %s is not present" % key)
+ val = <object>(py_val)
+ return val
+
+ cdef int _insert_clean(self, PyObject *key) except -1:
+ """Insert a key into self.table.
+
+ This is only meant to be used during times like '_resize',
+        as it makes a lot of assumptions about keys not already being present,
+ and there being no dummy entries.
+ """
+ cdef size_t i, n_lookup
+ cdef long the_hash
+ cdef PyObject **table, **slot
+ cdef Py_ssize_t mask
+
+ mask = self._mask
+ table = self._table
+
+ the_hash = PyObject_Hash(key)
+ i = the_hash
+ for n_lookup from 0 <= n_lookup <= <size_t>mask: # Don't loop forever
+ slot = &table[i & mask]
+ if slot[0] == NULL:
+ slot[0] = key
+ self._fill = self._fill + 1
+ self._used = self._used + 1
+ return 1
+ i = i + 1 + n_lookup
+ raise RuntimeError('ran out of slots.')
+
+ def _py_resize(self, min_used):
+ """Do not use this directly, it is only exposed for testing."""
+ return self._resize(min_used)
+
+ cdef Py_ssize_t _resize(self, Py_ssize_t min_used) except -1:
+ """Resize the internal table.
+
+ The final table will be big enough to hold at least min_used entries.
+ We will copy the data from the existing table over, leaving out dummy
+ entries.
+
+ :return: The new size of the internal table
+ """
+ cdef Py_ssize_t new_size, n_bytes, remaining
+ cdef PyObject **new_table, **old_table, **slot
+
+ new_size = DEFAULT_SIZE
+ while new_size <= min_used and new_size > 0:
+ new_size = new_size << 1
+ # We rolled over our signed size field
+ if new_size <= 0:
+ raise MemoryError()
+ # Even if min_used == self._mask + 1, and we aren't changing the actual
+ # size, we will still run the algorithm so that dummy entries are
+ # removed
+ # TODO: Test this
+ # if new_size < self._used:
+ # raise RuntimeError('cannot shrink SimpleSet to something'
+ # ' smaller than the number of used slots.')
+        n_bytes = sizeof(PyObject*) * new_size
+ new_table = <PyObject **>PyMem_Malloc(n_bytes)
+ if new_table == NULL:
+ raise MemoryError()
+
+ old_table = self._table
+ self._table = new_table
+ memset(self._table, 0, n_bytes)
+ self._mask = new_size - 1
+ self._used = 0
+ remaining = self._fill
+ self._fill = 0
+
+ # Moving everything to the other table is refcount neutral, so we don't
+ # worry about it.
+ slot = old_table
+ while remaining > 0:
+ if slot[0] == NULL: # unused slot
+ pass
+ elif slot[0] == _dummy: # dummy slot
+ remaining = remaining - 1
+ else: # active slot
+ remaining = remaining - 1
+ self._insert_clean(slot[0])
+ slot = slot + 1
+ PyMem_Free(old_table)
+ return new_size
+
+ def add(self, key):
+ """Similar to set.add(), start tracking this key.
+
+        There is one small difference, which is that we return the object
+        that is stored at the given location (which is closer to the
+        dict.setdefault() functionality).
+ """
+ return self._add(key)
+
+ cdef object _add(self, key):
+ cdef PyObject **slot, *py_key
+ cdef int added
+
+ py_key = <PyObject *>key
+ if (Py_TYPE(py_key).tp_richcompare == NULL
+ or Py_TYPE(py_key).tp_hash == NULL):
+ raise TypeError('Types added to SimpleSet must implement'
+ ' both tp_richcompare and tp_hash')
+ added = 0
+ # We need at least one empty slot
+ assert self._used < self._mask
+ slot = _lookup(self, key)
+ if (slot[0] == NULL):
+ Py_INCREF(py_key)
+ self._fill = self._fill + 1
+ self._used = self._used + 1
+ slot[0] = py_key
+ added = 1
+ elif (slot[0] == _dummy):
+ Py_INCREF(py_key)
+ self._used = self._used + 1
+ slot[0] = py_key
+ added = 1
+ # No else: clause. If _lookup returns a pointer to
+ # a live object, then we already have a value at this location.
+ retval = <object>(slot[0])
+        # PySet and PyDict use a 2/3rds-full heuristic; we'll follow suit
+        if added and (self._fill * 3) >= ((self._mask + 1) * 2):
+            # However, we always resize to a table that can hold at least
+            # twice the number of used entries
+            self._resize(self._used * 2)
+ # Even if we resized and ended up moving retval into a different slot,
+ # it is still the value that is held at the slot equivalent to 'key',
+ # so we can still return it
+ return retval
+
+ def discard(self, key):
+ """Remove key from the set, whether it exists or not.
+
+ :return: False if the item did not exist, True if it did
+ """
+ if self._discard(key):
+ return True
+ return False
+
+ cdef int _discard(self, key) except -1:
+ cdef PyObject **slot, *py_key
+
+ slot = _lookup(self, key)
+ if slot[0] == NULL or slot[0] == _dummy:
+ return 0
+ self._used = self._used - 1
+ Py_DECREF(slot[0])
+ slot[0] = _dummy
+ # PySet uses the heuristic: If more than 1/5 are dummies, then resize
+ # them away
+ # if ((so->_fill - so->_used) * 5 < so->mask)
+ # However, we are planning on using this as an interning structure, in
+ # which we will be putting a lot of objects. And we expect that large
+ # groups of them are going to have the same lifetime.
+ # Dummy entries hurt a little bit because they cause the lookup to keep
+ # searching, but resizing is also rather expensive
+ # For now, we'll just use their algorithm, but we may want to revisit
+ # it
+ if ((self._fill - self._used) * 5 > self._mask):
+ self._resize(self._used * 2)
+ return 1
+
+ def __iter__(self):
+ return _SimpleSet_iterator(self)
+
+
+cdef class _SimpleSet_iterator:
+ """Iterator over the SimpleSet structure."""
+
+ cdef Py_ssize_t pos
+ cdef SimpleSet set
+ cdef Py_ssize_t _used # track if things have been mutated while iterating
+ cdef Py_ssize_t len # number of entries left
+
+ def __init__(self, obj):
+ self.set = obj
+ self.pos = 0
+ self._used = self.set._used
+ self.len = self.set._used
+
+ def __iter__(self):
+ return self
+
+ def __next__(self):
+ cdef Py_ssize_t mask, i
+ cdef PyObject *key
+
+ if self.set is None:
+ raise StopIteration
+ if self.set._used != self._used:
+ # Force this exception to continue to be raised
+ self._used = -1
+ raise RuntimeError("Set size changed during iteration")
+ if not SimpleSet_Next(self.set, &self.pos, &key):
+ self.set = None
+ raise StopIteration
+ # we found something
+ the_key = <object>key # INCREF
+ self.len = self.len - 1
+ return the_key
+
+ def __length_hint__(self):
+ if self.set is not None and self._used == self.set._used:
+ return self.len
+ return 0
+
+
+
+cdef api SimpleSet SimpleSet_New():
+ """Create a new SimpleSet object."""
+ return SimpleSet()
+
+
+cdef SimpleSet _check_self(object self):
+ """Check that the parameter is not None.
+
+ Pyrex/Cython will do type checking, but only to ensure that an object is
+ either the right type or None. You can say "object foo not None" for pure
+ python functions, but not for C functions.
+ So this is just a helper for all the apis that need to do the check.
+ """
+ cdef SimpleSet true_self
+ if self is None:
+ raise TypeError('self must not be None')
+ true_self = self
+ return true_self
+
+
+cdef PyObject **_lookup(SimpleSet self, object key) except NULL:
+ """Find the slot where 'key' would fit.
+
+    This is the same as a dict's 'lookup' function.
+
+ :param key: An object we are looking up
+ :return: The location in self.table where key should be put.
+ location == NULL is an exception, but (*location) == NULL just
+ indicates the slot is empty and can be used.
+ """
+ # This uses Quadratic Probing:
+ # http://en.wikipedia.org/wiki/Quadratic_probing
+ # with c1 = c2 = 1/2
+ # This leads to probe locations at:
+    #  h0 = hash(k1)
+    #  h1 = h0 + 1
+    #  h2 = h0 + 3 = h1 + 1 + 1
+    #  h3 = h0 + 6 = h2 + 1 + 2
+    #  h4 = h0 + 10 = h3 + 1 + 3
+ # Note that all of these are '& mask', but that is computed *after* the
+ # offset.
+    # This differs from the algorithm used by Set and Dict, which effectively
+    # use double-hashing with a step size that starts large but dwindles to
+    # stepping one-by-one.
+ # This gives more 'locality' in that if you have a collision at offset X,
+ # the first fallback is X+1, which is fast to check. However, that means
+ # that an object w/ hash X+1 will also check there, and then X+2 next.
+ # However, for objects with differing hashes, their chains are different.
+ # The former checks X, X+1, X+3, ... the latter checks X+1, X+2, X+4, ...
+ # So different hashes diverge quickly.
+    # A bigger problem is that we *only* ever use the lowest bits of the hash.
+    # So all integers (x + SIZE*N) will resolve into the same bucket, and all
+ # use the same collision resolution. We may want to try to find a way to
+ # incorporate the upper bits of the hash with quadratic probing. (For
+ # example, X, X+1, X+3+some_upper_bits, X+6+more_upper_bits, etc.)
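+    # Illustrative probe walk (assuming mask == 0x3FF and a key hash of
+    # 0x1234): the unmasked index starts at 0x1234, so the slots visited are
+    # 0x234, 0x235, 0x237, 0x23A, 0x23E, ... i.e. offsets 0, 1, 3, 6, 10
+    # from the initial slot.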
+ cdef size_t i, n_lookup
+ cdef Py_ssize_t mask
+ cdef long key_hash
+ cdef PyObject **table, **slot, *cur, **free_slot, *py_key
+
+ py_key = <PyObject *>key
+ # Note: avoid using hash(obj) because of a bug w/ pyrex 0.9.8.5 and 64-bit
+ # (it treats hash() as returning an 'int' rather than a 'long')
+ key_hash = PyObject_Hash(py_key)
+ i = <size_t>key_hash
+ mask = self._mask
+ table = self._table
+ free_slot = NULL
+ for n_lookup from 0 <= n_lookup <= <size_t>mask: # Don't loop forever
+ slot = &table[i & mask]
+ cur = slot[0]
+ if cur == NULL:
+ # Found a blank spot
+ if free_slot != NULL:
+ # Did we find an earlier _dummy entry?
+ return free_slot
+ else:
+ return slot
+ if cur == py_key:
+ # Found an exact pointer to the key
+ return slot
+ if cur == _dummy:
+ if free_slot == NULL:
+ free_slot = slot
+ elif _is_equal(py_key, key_hash, cur):
+ # Both py_key and cur belong in this slot, return it
+ return slot
+ i = i + 1 + n_lookup
+ raise AssertionError('should never get here')
+
+
+cdef api PyObject **_SimpleSet_Lookup(object self, object key) except NULL:
+ """Find the slot where 'key' would fit.
+
+    This is the same as a dict's 'lookup' function. This is a private
+    api because mutating what you get without maintaining the other invariants
+    is a 'bad thing'.
+
+ :param key: An object we are looking up
+ :return: The location in self._table where key should be put
+ should never be NULL, but may reference a NULL (PyObject*)
+ """
+ return _lookup(_check_self(self), key)
+
+
+cdef api object SimpleSet_Add(object self, object key):
+ """Add a key to the SimpleSet (set).
+
+ :param self: The SimpleSet to add the key to.
+ :param key: The key to be added. If the key is already present,
+ self will not be modified
+ :return: The current key stored at the location defined by 'key'.
+ This may be the same object, or it may be an equivalent object.
+ (consider dict.setdefault(key, key))
+ """
+ return _check_self(self)._add(key)
+
+
+cdef api int SimpleSet_Contains(object self, object key) except -1:
+ """Is key present in self?"""
+ return (key in _check_self(self))
+
+
+cdef api int SimpleSet_Discard(object self, object key) except -1:
+ """Remove the object referenced at location 'key'.
+
+ :param self: The SimpleSet being modified
+ :param key: The key we are checking on
+ :return: 1 if there was an object present, 0 if there was not, and -1 on
+ error.
+ """
+ return _check_self(self)._discard(key)
+
+
+cdef api PyObject *SimpleSet_Get(SimpleSet self, object key) except? NULL:
+ """Get a pointer to the object present at location 'key'.
+
+    This returns an object which is equal to key and which was previously
+    added to self. This returns a borrowed reference, as it may also return
+    NULL if no value is present at that location.
+
+ :param key: The value we are looking for
+ :return: The object present at that location
+ """
+ return _check_self(self)._get(key)
+
+
+cdef api Py_ssize_t SimpleSet_Size(object self) except -1:
+ """Get the number of active entries in 'self'"""
+ return _check_self(self)._used
+
+
+cdef api int SimpleSet_Next(object self, Py_ssize_t *pos,
+ PyObject **key) except -1:
+ """Walk over items in a SimpleSet.
+
+ :param pos: should be initialized to 0 by the caller, and will be updated
+ by this function
+    :param key: Will be set to a borrowed reference to the key
+ :return: 0 if nothing left, 1 if we are returning a new value
+ """
+ cdef Py_ssize_t i, mask
+ cdef SimpleSet true_self
+ cdef PyObject **table
+ true_self = _check_self(self)
+ i = pos[0]
+ if (i < 0):
+ return 0
+ mask = true_self._mask
+    table = true_self._table
+ while (i <= mask and (table[i] == NULL or table[i] == _dummy)):
+ i = i + 1
+ pos[0] = i + 1
+ if (i > mask):
+ return 0 # All done
+ if (key != NULL):
+ key[0] = table[i]
+ return 1
+
+
+cdef int SimpleSet_traverse(SimpleSet self, visitproc visit,
+ void *arg) except -1:
+ """This is an implementation of 'tp_traverse' that hits the whole table.
+
+ Cython/Pyrex don't seem to let you define a tp_traverse, and they only
+ define one for you if you have an 'object' attribute. Since they don't
+ support C arrays of objects, we access the PyObject * directly.
+ """
+ cdef Py_ssize_t pos
+ cdef PyObject *next_key
+ cdef int ret
+
+ pos = 0
+ while SimpleSet_Next(self, &pos, &next_key):
+ ret = visit(next_key, arg)
+ if ret:
+ return ret
+ return 0
+
+# It is a little bit ugly to do this, but it works, and means that Meliae can
+# dump the total memory consumed by all child objects.
+(<PyTypeObject *>SimpleSet).tp_traverse = <traverseproc>SimpleSet_traverse
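
For context, here is a minimal usage sketch (not part of the commit above) showing how the class is meant to be driven from Python. It assumes the compiled extension is importable as bzrlib._simple_set_pyx; the behaviour shown follows the docstrings in the diff, with add() acting much like dict.setdefault(key, key).

    from bzrlib._simple_set_pyx import SimpleSet

    interned = SimpleSet()

    key1 = ('file-id', 'revision-id')
    key2 = tuple(['file-id', 'revision-id'])  # equal to key1, but a distinct object

    canonical = interned.add(key1)        # first add stores key1 and returns it
    assert canonical is key1
    assert interned.add(key2) is key1     # an equal key maps back to the stored one
    assert key2 in interned
    assert interned[key2] is key1         # __getitem__ returns the stored object
    assert len(interned) == 1

    assert interned.discard(key1)         # True: the key was present and is removed
    assert key1 not in interned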