summaryrefslogtreecommitdiff
path: root/bzrlib/_simple_set_pyx.pyx
blob: a9cee98d97797ae83573308c4f68bef6ba068c2c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
# Copyright (C) 2009, 2010 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

"""Definition of a class that is similar to Set with some small changes."""

cdef extern from "python-compat.h":
    pass

cdef extern from "Python.h":
    ctypedef unsigned long size_t
    ctypedef long (*hashfunc)(PyObject*) except -1
    ctypedef object (*richcmpfunc)(PyObject *, PyObject *, int)
    ctypedef int (*visitproc)(PyObject *, void *)
    ctypedef int (*traverseproc)(PyObject *, visitproc, void *)
    int Py_EQ
    void Py_INCREF(PyObject *)
    void Py_DECREF(PyObject *)
    ctypedef struct PyTypeObject:
        hashfunc tp_hash
        richcmpfunc tp_richcompare
        traverseproc tp_traverse

    PyTypeObject *Py_TYPE(PyObject *)
    # Note: *Don't* use hash(), Pyrex 0.9.8.5 thinks it returns an 'int', and
    #       thus silently truncates to 32-bits on 64-bit machines.
    long PyObject_Hash(PyObject *) except -1
        
    void *PyMem_Malloc(size_t nbytes)
    void PyMem_Free(void *)
    void memset(void *, int, size_t)


# Dummy is an object used to mark nodes that have been deleted. Since
# collisions require us to move a node to an alternative location, if we just
# set an entry to NULL on delete, we won't find any relocated nodes.
# We have to use _dummy_obj because we need to keep a refcount to it, but we
# also use _dummy as a pointer, because it avoids having to put <PyObject*> all
# over the code base.
cdef object _dummy_obj
cdef PyObject *_dummy
_dummy_obj = object()
_dummy = <PyObject *>_dummy_obj


cdef object _NotImplemented
_NotImplemented = NotImplemented


cdef int _is_equal(PyObject *this, long this_hash, PyObject *other) except -1:
    cdef long other_hash

    if this == other:
        return 1
    other_hash = PyObject_Hash(other)
    if other_hash != this_hash:
        return 0

    # This implements a subset of the PyObject_RichCompareBool functionality.
    # Namely it:
    #   1) Doesn't try to do anything with old-style classes
    #   2) Assumes that both objects have a tp_richcompare implementation, and
    #      that if that is not enough to compare equal, then they are not
    #      equal. (It doesn't try to cast them both to some intermediate form
    #      that would compare equal.)
    res = Py_TYPE(this).tp_richcompare(this, other, Py_EQ)
    if res is _NotImplemented:
        res = Py_TYPE(other).tp_richcompare(other, this, Py_EQ)
        if res is _NotImplemented:
            return 0
    if res:
        return 1
    return 0


cdef public api class SimpleSet [object SimpleSetObject, type SimpleSet_Type]:
    """This class can be used to track canonical forms for objects.

    It is similar in function to the interned dictionary that is used by
    strings. However:

      1) It assumes that hash(obj) is cheap, so does not need to inline a copy
         of it
      2) It only stores one reference to the object, rather than 2 (key vs
         key:value)

    As such, it uses 1/3rd the amount of memory to store a pointer to the
    interned object.
    """
    # Attributes are defined in the .pxd file
    DEF DEFAULT_SIZE=1024

    def __init__(self):
        cdef Py_ssize_t size, n_bytes

        size = DEFAULT_SIZE
        self._mask = size - 1
        self._used = 0
        self._fill = 0
        n_bytes = sizeof(PyObject*) * size;
        self._table = <PyObject **>PyMem_Malloc(n_bytes)
        if self._table == NULL:
            raise MemoryError()
        memset(self._table, 0, n_bytes)

    def __sizeof__(self):
        # Note: Pyrex doesn't allow sizeof(class) so we re-implement it here.
        # Bits are:
        #   1: PyObject
        #   2: vtable *
        #   3: 3 Py_ssize_t
        #   4: PyObject**
        # Note that we might get alignment, etc, wrong, but at least this is
        # better than no estimate at all
        # return sizeof(SimpleSet) + (self._mask + 1) * (sizeof(PyObject*))
        return (sizeof(PyObject) + sizeof(void*)
                + 3*sizeof(Py_ssize_t) + sizeof(PyObject**)
                + (self._mask + 1) * sizeof(PyObject*))

    def __dealloc__(self):
        if self._table != NULL:
            PyMem_Free(self._table)
            self._table = NULL

    property used:
        def __get__(self):
            return self._used

    property fill:
        def __get__(self):
            return self._fill

    property mask:
        def __get__(self):
            return self._mask

    def _memory_size(self):
        """Return the number of bytes of memory consumed by this class."""
        return sizeof(self) + (sizeof(PyObject*)*(self._mask + 1))

    def __len__(self):
        return self._used

    def _test_lookup(self, key):
        cdef PyObject **slot

        slot = _lookup(self, key)
        if slot[0] == NULL:
            res = '<null>'
        elif slot[0] == _dummy:
            res = '<dummy>'
        else:
            res = <object>slot[0]
        return <int>(slot - self._table), res

    def __contains__(self, key):
        """Is key present in this SimpleSet."""
        cdef PyObject **slot

        slot = _lookup(self, key)
        if slot[0] == NULL or slot[0] == _dummy:
            return False
        return True

    cdef PyObject *_get(self, object key) except? NULL:
        """Return the object (or nothing) define at the given location."""
        cdef PyObject **slot

        slot = _lookup(self, key)
        if slot[0] == NULL or slot[0] == _dummy:
            return NULL
        return slot[0]

    def __getitem__(self, key):
        """Return a stored item that is equivalent to key."""
        cdef PyObject *py_val

        py_val = self._get(key)
        if py_val == NULL:
            raise KeyError("Key %s is not present" % key)
        val = <object>(py_val)
        return val

    cdef int _insert_clean(self, PyObject *key) except -1:
        """Insert a key into self.table.

        This is only meant to be used during times like '_resize',
        as it makes a lot of assuptions about keys not already being present,
        and there being no dummy entries.
        """
        cdef size_t i, n_lookup
        cdef long the_hash
        cdef PyObject **table, **slot
        cdef Py_ssize_t mask

        mask = self._mask
        table = self._table

        the_hash = PyObject_Hash(key)
        i = the_hash
        for n_lookup from 0 <= n_lookup <= <size_t>mask: # Don't loop forever
            slot = &table[i & mask]
            if slot[0] == NULL:
                slot[0] = key
                self._fill = self._fill + 1
                self._used = self._used + 1
                return 1
            i = i + 1 + n_lookup
        raise RuntimeError('ran out of slots.')

    def _py_resize(self, min_used):
        """Do not use this directly, it is only exposed for testing."""
        return self._resize(min_used)

    cdef Py_ssize_t _resize(self, Py_ssize_t min_used) except -1:
        """Resize the internal table.

        The final table will be big enough to hold at least min_used entries.
        We will copy the data from the existing table over, leaving out dummy
        entries.

        :return: The new size of the internal table
        """
        cdef Py_ssize_t new_size, n_bytes, remaining
        cdef PyObject **new_table, **old_table, **slot

        new_size = DEFAULT_SIZE
        while new_size <= min_used and new_size > 0:
            new_size = new_size << 1
        # We rolled over our signed size field
        if new_size <= 0:
            raise MemoryError()
        # Even if min_used == self._mask + 1, and we aren't changing the actual
        # size, we will still run the algorithm so that dummy entries are
        # removed
        # TODO: Test this
        # if new_size < self._used:
        #     raise RuntimeError('cannot shrink SimpleSet to something'
        #                        ' smaller than the number of used slots.')
        n_bytes = sizeof(PyObject*) * new_size;
        new_table = <PyObject **>PyMem_Malloc(n_bytes)
        if new_table == NULL:
            raise MemoryError()

        old_table = self._table
        self._table = new_table
        memset(self._table, 0, n_bytes)
        self._mask = new_size - 1
        self._used = 0
        remaining = self._fill
        self._fill = 0

        # Moving everything to the other table is refcount neutral, so we don't
        # worry about it.
        slot = old_table
        while remaining > 0:
            if slot[0] == NULL: # unused slot
                pass 
            elif slot[0] == _dummy: # dummy slot
                remaining = remaining - 1
            else: # active slot
                remaining = remaining - 1
                self._insert_clean(slot[0])
            slot = slot + 1
        PyMem_Free(old_table)
        return new_size

    def add(self, key):
        """Similar to set.add(), start tracking this key.
        
        There is one small difference, which is that we return the object that
        is stored at the given location. (which is closer to the
        dict.setdefault() functionality.)
        """
        return self._add(key)

    cdef object _add(self, key):
        cdef PyObject **slot, *py_key
        cdef int added

        py_key = <PyObject *>key
        if (Py_TYPE(py_key).tp_richcompare == NULL
            or Py_TYPE(py_key).tp_hash == NULL):
            raise TypeError('Types added to SimpleSet must implement'
                            ' both tp_richcompare and tp_hash')
        added = 0
        # We need at least one empty slot
        assert self._used < self._mask
        slot = _lookup(self, key)
        if (slot[0] == NULL):
            Py_INCREF(py_key)
            self._fill = self._fill + 1
            self._used = self._used + 1
            slot[0] = py_key
            added = 1
        elif (slot[0] == _dummy):
            Py_INCREF(py_key)
            self._used = self._used + 1
            slot[0] = py_key
            added = 1
        # No else: clause. If _lookup returns a pointer to
        # a live object, then we already have a value at this location.
        retval = <object>(slot[0])
        # PySet and PyDict use a 2-3rds full algorithm, we'll follow suit
        if added and (self._fill * 3) >= ((self._mask + 1) * 2):
            # However, we always work for a load factor of 2:1
            self._resize(self._used * 2)
        # Even if we resized and ended up moving retval into a different slot,
        # it is still the value that is held at the slot equivalent to 'key',
        # so we can still return it
        return retval

    def discard(self, key):
        """Remove key from the set, whether it exists or not.

        :return: False if the item did not exist, True if it did
        """
        if self._discard(key):
            return True
        return False

    cdef int _discard(self, key) except -1:
        cdef PyObject **slot, *py_key

        slot = _lookup(self, key)
        if slot[0] == NULL or slot[0] == _dummy:
            return 0
        self._used = self._used - 1
        Py_DECREF(slot[0])
        slot[0] = _dummy
        # PySet uses the heuristic: If more than 1/5 are dummies, then resize
        #                           them away
        #   if ((so->_fill - so->_used) * 5 < so->mask)
        # However, we are planning on using this as an interning structure, in
        # which we will be putting a lot of objects. And we expect that large
        # groups of them are going to have the same lifetime.
        # Dummy entries hurt a little bit because they cause the lookup to keep
        # searching, but resizing is also rather expensive
        # For now, we'll just use their algorithm, but we may want to revisit
        # it
        if ((self._fill - self._used) * 5 > self._mask):
            self._resize(self._used * 2)
        return 1

    def __iter__(self):
        return _SimpleSet_iterator(self)


cdef class _SimpleSet_iterator:
    """Iterator over the SimpleSet structure."""

    cdef Py_ssize_t pos
    cdef SimpleSet set
    cdef Py_ssize_t _used # track if things have been mutated while iterating
    cdef Py_ssize_t len # number of entries left

    def __init__(self, obj):
        self.set = obj
        self.pos = 0
        self._used = self.set._used
        self.len = self.set._used

    def __iter__(self):
        return self

    def __next__(self):
        cdef Py_ssize_t mask, i
        cdef PyObject *key

        if self.set is None:
            raise StopIteration
        if self.set._used != self._used:
            # Force this exception to continue to be raised
            self._used = -1
            raise RuntimeError("Set size changed during iteration")
        if not SimpleSet_Next(self.set, &self.pos, &key):
            self.set = None
            raise StopIteration
        # we found something
        the_key = <object>key # INCREF
        self.len = self.len - 1
        return the_key

    def __length_hint__(self):
        if self.set is not None and self._used == self.set._used:
            return self.len
        return 0
    


cdef api SimpleSet SimpleSet_New():
    """Create a new SimpleSet object."""
    return SimpleSet()


cdef SimpleSet _check_self(object self):
    """Check that the parameter is not None.

    Pyrex/Cython will do type checking, but only to ensure that an object is
    either the right type or None. You can say "object foo not None" for pure
    python functions, but not for C functions.
    So this is just a helper for all the apis that need to do the check.
    """
    cdef SimpleSet true_self
    if self is None:
        raise TypeError('self must not be None')
    true_self = self
    return true_self


cdef PyObject **_lookup(SimpleSet self, object key) except NULL:
    """Find the slot where 'key' would fit.

    This is the same as a dicts 'lookup' function.

    :param key: An object we are looking up
    :param hash: The hash for key
    :return: The location in self.table where key should be put.
        location == NULL is an exception, but (*location) == NULL just
        indicates the slot is empty and can be used.
    """
    # This uses Quadratic Probing:
    #  http://en.wikipedia.org/wiki/Quadratic_probing
    # with c1 = c2 = 1/2
    # This leads to probe locations at:
    #  h0 = hash(k1)
    #  h1 = h0 + 1
    #  h2 = h0 + 3 = h1 + 1 + 1
    #  h3 = h0 + 6 = h2 + 1 + 2
    #  h4 = h0 + 10 = h2 + 1 + 3
    # Note that all of these are '& mask', but that is computed *after* the
    # offset.
    # This differs from the algorithm used by Set and Dict. Which, effectively,
    # use double-hashing, and a step size that starts large, but dwindles to
    # stepping one-by-one.
    # This gives more 'locality' in that if you have a collision at offset X,
    # the first fallback is X+1, which is fast to check. However, that means
    # that an object w/ hash X+1 will also check there, and then X+2 next.
    # However, for objects with differing hashes, their chains are different.
    # The former checks X, X+1, X+3, ... the latter checks X+1, X+2, X+4, ...
    # So different hashes diverge quickly.
    # A bigger problem is that we *only* ever use the lowest bits of the hash
    # So all integers (x + SIZE*N) will resolve into the same bucket, and all
    # use the same collision resolution. We may want to try to find a way to
    # incorporate the upper bits of the hash with quadratic probing. (For
    # example, X, X+1, X+3+some_upper_bits, X+6+more_upper_bits, etc.)
    cdef size_t i, n_lookup
    cdef Py_ssize_t mask
    cdef long key_hash
    cdef PyObject **table, **slot, *cur, **free_slot, *py_key

    py_key = <PyObject *>key
    # Note: avoid using hash(obj) because of a bug w/ pyrex 0.9.8.5 and 64-bit
    #       (it treats hash() as returning an 'int' rather than a 'long')
    key_hash = PyObject_Hash(py_key)
    i = <size_t>key_hash
    mask = self._mask
    table = self._table
    free_slot = NULL
    for n_lookup from 0 <= n_lookup <= <size_t>mask: # Don't loop forever
        slot = &table[i & mask]
        cur = slot[0]
        if cur == NULL:
            # Found a blank spot
            if free_slot != NULL:
                # Did we find an earlier _dummy entry?
                return free_slot
            else:
                return slot
        if cur == py_key:
            # Found an exact pointer to the key
            return slot
        if cur == _dummy:
            if free_slot == NULL:
                free_slot = slot
        elif _is_equal(py_key, key_hash, cur):
            # Both py_key and cur belong in this slot, return it
            return slot
        i = i + 1 + n_lookup
    raise AssertionError('should never get here')


cdef api PyObject **_SimpleSet_Lookup(object self, object key) except NULL:
    """Find the slot where 'key' would fit.

    This is the same as a dicts 'lookup' function. This is a private
    api because mutating what you get without maintaing the other invariants
    is a 'bad thing'.

    :param key: An object we are looking up
    :param hash: The hash for key
    :return: The location in self._table where key should be put
        should never be NULL, but may reference a NULL (PyObject*)
    """
    return _lookup(_check_self(self), key)


cdef api object SimpleSet_Add(object self, object key):
    """Add a key to the SimpleSet (set).

    :param self: The SimpleSet to add the key to.
    :param key: The key to be added. If the key is already present,
        self will not be modified
    :return: The current key stored at the location defined by 'key'.
        This may be the same object, or it may be an equivalent object.
        (consider dict.setdefault(key, key))
    """
    return _check_self(self)._add(key)
    

cdef api int SimpleSet_Contains(object self, object key) except -1:
    """Is key present in self?"""
    return (key in _check_self(self))


cdef api int SimpleSet_Discard(object self, object key) except -1:
    """Remove the object referenced at location 'key'.

    :param self: The SimpleSet being modified
    :param key: The key we are checking on
    :return: 1 if there was an object present, 0 if there was not, and -1 on
        error.
    """
    return _check_self(self)._discard(key)


cdef api PyObject *SimpleSet_Get(SimpleSet self, object key) except? NULL:
    """Get a pointer to the object present at location 'key'.

    This returns an object which is equal to key which was previously added to
    self. This returns a borrowed reference, as it may also return NULL if no
    value is present at that location.

    :param key: The value we are looking for
    :return: The object present at that location
    """
    return _check_self(self)._get(key)


cdef api Py_ssize_t SimpleSet_Size(object self) except -1:
    """Get the number of active entries in 'self'"""
    return _check_self(self)._used


cdef api int SimpleSet_Next(object self, Py_ssize_t *pos,
                            PyObject **key) except -1:
    """Walk over items in a SimpleSet.

    :param pos: should be initialized to 0 by the caller, and will be updated
        by this function
    :param key: Will return a borrowed reference to key
    :return: 0 if nothing left, 1 if we are returning a new value
    """
    cdef Py_ssize_t i, mask
    cdef SimpleSet true_self
    cdef PyObject **table
    true_self = _check_self(self)
    i = pos[0]
    if (i < 0):
        return 0
    mask = true_self._mask
    table= true_self._table
    while (i <= mask and (table[i] == NULL or table[i] == _dummy)):
        i = i + 1
    pos[0] = i + 1
    if (i > mask):
        return 0 # All done
    if (key != NULL):
        key[0] = table[i]
    return 1


cdef int SimpleSet_traverse(SimpleSet self, visitproc visit,
                            void *arg) except -1:
    """This is an implementation of 'tp_traverse' that hits the whole table.

    Cython/Pyrex don't seem to let you define a tp_traverse, and they only
    define one for you if you have an 'object' attribute. Since they don't
    support C arrays of objects, we access the PyObject * directly.
    """
    cdef Py_ssize_t pos
    cdef PyObject *next_key
    cdef int ret

    pos = 0
    while SimpleSet_Next(self, &pos, &next_key):
        ret = visit(next_key, arg)
        if ret:
            return ret
    return 0

# It is a little bit ugly to do this, but it works, and means that Meliae can
# dump the total memory consumed by all child objects.
(<PyTypeObject *>SimpleSet).tp_traverse = <traverseproc>SimpleSet_traverse