Tarball conversion

author: Lorry <lorry@roadtrain.codethink.co.uk> 2012-08-22 15:47:16 +0100
committer: Lorry <lorry@roadtrain.codethink.co.uk> 2012-08-22 15:47:16 +0100
commit: 25335618bf8755ce6b116ee14f47f5a1f2c821e9 (patch)
tree: d889d7ab3f9f985d0c54c534cb8052bd2e6d7163 /bzrlib/_known_graph_pyx.pyx
download: bzr-tarball-25335618bf8755ce6b116ee14f47f5a1f2c821e9.tar.gz
1 files changed, 955 insertions, 0 deletions
diff --git a/bzrlib/_known_graph_pyx.pyx b/bzrlib/_known_graph_pyx.pyx
new file mode 100644
index 0000000..a9e6d13
--- /dev/null
+++ b/bzrlib/_known_graph_pyx.pyx
@@ -0,0 +1,955 @@
+# Copyright (C) 2009, 2010 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+"""Implementation of Graph algorithms when we have already loaded everything.
+"""
+
+cdef extern from "python-compat.h":
+    pass
+
+cdef extern from "Python.h":
+    ctypedef int Py_ssize_t
+    ctypedef struct PyObject:
+        pass
+
+    int PyString_CheckExact(object)
+
+    int PyObject_RichCompareBool(object, object, int)
+    int Py_LT
+
+    int PyTuple_CheckExact(object)
+    object PyTuple_New(Py_ssize_t n)
+    Py_ssize_t PyTuple_GET_SIZE(object t)
+    PyObject * PyTuple_GET_ITEM(object t, Py_ssize_t o)
+    void PyTuple_SET_ITEM(object t, Py_ssize_t o, object v)
+
+    int PyList_CheckExact(object)
+    Py_ssize_t PyList_GET_SIZE(object l)
+    PyObject * PyList_GET_ITEM(object l, Py_ssize_t o)
+    int PyList_SetItem(object l, Py_ssize_t o, object l) except -1
+    int PyList_Append(object l, object v) except -1
+
+    int PyDict_CheckExact(object d)
+    Py_ssize_t PyDict_Size(object d) except -1
+    PyObject * PyDict_GetItem(object d, object k)
+    int PyDict_SetItem(object d, object k, object v) except -1
+    int PyDict_DelItem(object d, object k) except -1
+    int PyDict_Next(object d, Py_ssize_t *pos, PyObject **k, PyObject **v)
+
+    void Py_INCREF(object)
+
+from collections import deque
+import gc
+
+from bzrlib import errors, revision
+
+cdef object NULL_REVISION
+NULL_REVISION = revision.NULL_REVISION
+
+
+cdef class _KnownGraphNode:
+    """Represents a single object in the known graph."""
+
+    cdef object key
+    cdef object parents
+    cdef object children
+    cdef public long gdfo
+    cdef int seen
+    cdef object extra
+
+    def __init__(self, key):
+        self.key = key
+        self.parents = None
+
+        self.children = []
+        # Greatest distance from origin
+        self.gdfo = -1
+        self.seen = 0
+        self.extra = None
+
+    property child_keys:
+        def __get__(self):
+            cdef _KnownGraphNode child
+
+            keys = []
+            for child in self.children:
+                PyList_Append(keys, child.key)
+            return keys
+
+    property parent_keys:
+        def __get__(self):
+            if self.parents is None:
+                return None
+            
+            cdef _KnownGraphNode parent
+
+            keys = []
+            for parent in self.parents:
+                PyList_Append(keys, parent.key)
+            return keys
+    
+    cdef clear_references(self):
+        self.parents = None
+        self.children = None
+
+    def __repr__(self):
+        cdef _KnownGraphNode node
+
+        parent_keys = []
+        if self.parents is not None:
+            for node in self.parents:
+                parent_keys.append(node.key)
+        child_keys = []
+        if self.children is not None:
+            for node in self.children:
+                child_keys.append(node.key)
+        return '%s(%s  gdfo:%s par:%s child:%s)' % (
+            self.__class__.__name__, self.key, self.gdfo,
+            parent_keys, child_keys)
+
+
+cdef _KnownGraphNode _get_list_node(lst, Py_ssize_t pos):
+    cdef PyObject *temp_node
+
+    temp_node = PyList_GET_ITEM(lst, pos)
+    return <_KnownGraphNode>temp_node
+
+
+cdef _KnownGraphNode _get_tuple_node(tpl, Py_ssize_t pos):
+    cdef PyObject *temp_node
+
+    temp_node = PyTuple_GET_ITEM(tpl, pos)
+    return <_KnownGraphNode>temp_node
+
+
+def get_key(node):
+    cdef _KnownGraphNode real_node
+    real_node = node
+    return real_node.key
+
+
+cdef object _sort_list_nodes(object lst_or_tpl, int reverse):
+    """Sort a list of _KnownGraphNode objects.
+
+    If lst_or_tpl is a list, it is allowed to mutate in place. It may also
+    just return the input list if everything is already sorted.
+    """
+    cdef _KnownGraphNode node1, node2
+    cdef int do_swap, is_tuple
+    cdef Py_ssize_t length
+
+    is_tuple = PyTuple_CheckExact(lst_or_tpl)
+    if not (is_tuple or PyList_CheckExact(lst_or_tpl)):
+        raise TypeError('lst_or_tpl must be a list or tuple.')
+    length = len(lst_or_tpl)
+    if length == 0 or length == 1:
+        return lst_or_tpl
+    if length == 2:
+        if is_tuple:
+            node1 = _get_tuple_node(lst_or_tpl, 0)
+            node2 = _get_tuple_node(lst_or_tpl, 1)
+        else:
+            node1 = _get_list_node(lst_or_tpl, 0)
+            node2 = _get_list_node(lst_or_tpl, 1)
+        if reverse:
+            do_swap = PyObject_RichCompareBool(node1.key, node2.key, Py_LT)
+        else:
+            do_swap = PyObject_RichCompareBool(node2.key, node1.key, Py_LT)
+        if not do_swap:
+            return lst_or_tpl
+        if is_tuple:
+            return (node2, node1)
+        else:
+            # Swap 'in-place', since lists are mutable
+            Py_INCREF(node1)
+            PyList_SetItem(lst_or_tpl, 1, node1)
+            Py_INCREF(node2)
+            PyList_SetItem(lst_or_tpl, 0, node2)
+            return lst_or_tpl
+    # For all other sizes, we just use 'sorted()'
+    if is_tuple:
+        # Note that sorted() is just list(iterable).sort()
+        lst_or_tpl = list(lst_or_tpl)
+    lst_or_tpl.sort(key=get_key, reverse=reverse)
+    return lst_or_tpl
+
+
+cdef class _MergeSorter
+
+cdef class KnownGraph:
+    """This is a class which assumes we already know the full graph."""
+
+    cdef public object _nodes
+    cdef public object _known_heads
+    cdef public int do_cache
+
+    def __init__(self, parent_map, do_cache=True):
+        """Create a new KnownGraph instance.
+
+        :param parent_map: A dictionary mapping key => parent_keys
+        """
+        # tests at pre-allocating the node dict actually slowed things down
+        self._nodes = {}
+        # Maps {sorted(revision_id, revision_id): heads}
+        self._known_heads = {}
+        self.do_cache = int(do_cache)
+        # TODO: consider disabling gc since we are allocating a lot of nodes
+        #       that won't be collectable anyway. real world testing has not
+        #       shown a specific impact, yet.
+        self._initialize_nodes(parent_map)
+        self._find_gdfo()
+
+    def __dealloc__(self):
+        cdef _KnownGraphNode child
+        cdef Py_ssize_t pos
+        cdef PyObject *temp_node
+
+        while PyDict_Next(self._nodes, &pos, NULL, &temp_node):
+            child = <_KnownGraphNode>temp_node
+            child.clear_references()
+
+    cdef _KnownGraphNode _get_or_create_node(self, key):
+        cdef PyObject *temp_node
+        cdef _KnownGraphNode node
+
+        temp_node = PyDict_GetItem(self._nodes, key)
+        if temp_node == NULL:
+            node = _KnownGraphNode(key)
+            PyDict_SetItem(self._nodes, key, node)
+        else:
+            node = <_KnownGraphNode>temp_node
+        return node
+
+    cdef _populate_parents(self, _KnownGraphNode node, parent_keys):
+        cdef Py_ssize_t num_parent_keys, pos
+        cdef _KnownGraphNode parent_node
+
+        num_parent_keys = len(parent_keys)
+        # We know how many parents, so we pre allocate the tuple
+        parent_nodes = PyTuple_New(num_parent_keys)
+        for pos from 0 <= pos < num_parent_keys:
+            # Note: it costs us 10ms out of 40ms to lookup all of these
+            #       parents, it doesn't seem to be an allocation overhead,
+            #       but rather a lookup overhead. There doesn't seem to be
+            #       a way around it, and that is one reason why
+            #       KnownGraphNode maintains a direct pointer to the parent
+            #       node.
+            # We use [] because parent_keys may be a tuple or list
+            parent_node = self._get_or_create_node(parent_keys[pos])
+            # PyTuple_SET_ITEM will steal a reference, so INCREF first
+            Py_INCREF(parent_node)
+            PyTuple_SET_ITEM(parent_nodes, pos, parent_node)
+            PyList_Append(parent_node.children, node)
+        node.parents = parent_nodes
+
+    def _initialize_nodes(self, parent_map):
+        """Populate self._nodes.
+
+        After this has finished:
+        - self._nodes will have an entry for every entry in parent_map.
+        - ghosts will have a parent_keys = None,
+        - all nodes found will also have child_keys populated with all known
+          child keys,
+        """
+        cdef PyObject *temp_key, *temp_parent_keys, *temp_node
+        cdef Py_ssize_t pos
+        cdef _KnownGraphNode node
+        cdef _KnownGraphNode parent_node
+
+        if not PyDict_CheckExact(parent_map):
+            raise TypeError('parent_map should be a dict of {key:parent_keys}')
+        # for key, parent_keys in parent_map.iteritems():
+        pos = 0
+        while PyDict_Next(parent_map, &pos, &temp_key, &temp_parent_keys):
+            key = <object>temp_key
+            parent_keys = <object>temp_parent_keys
+            node = self._get_or_create_node(key)
+            self._populate_parents(node, parent_keys)
+
+    def _find_tails(self):
+        cdef PyObject *temp_node
+        cdef _KnownGraphNode node
+        cdef Py_ssize_t pos
+
+        tails = []
+        pos = 0
+        while PyDict_Next(self._nodes, &pos, NULL, &temp_node):
+            node = <_KnownGraphNode>temp_node
+            if node.parents is None or PyTuple_GET_SIZE(node.parents) == 0:
+                node.gdfo = 1
+                PyList_Append(tails, node)
+        return tails
+
+    def _find_tips(self):
+        cdef PyObject *temp_node
+        cdef _KnownGraphNode node
+        cdef Py_ssize_t pos
+
+        tips = []
+        pos = 0
+        while PyDict_Next(self._nodes, &pos, NULL, &temp_node):
+            node = <_KnownGraphNode>temp_node
+            if PyList_GET_SIZE(node.children) == 0:
+                PyList_Append(tips, node)
+        return tips
+
+    def _find_gdfo(self):
+        cdef _KnownGraphNode node
+        cdef _KnownGraphNode child
+        cdef PyObject *temp
+        cdef Py_ssize_t pos
+        cdef int replace
+        cdef Py_ssize_t last_item
+        cdef long next_gdfo
+
+        pending = self._find_tails()
+
+        last_item = PyList_GET_SIZE(pending) - 1
+        while last_item >= 0:
+            # Avoid pop followed by push, instead, peek, and replace
+            # timing shows this is 930ms => 770ms for OOo
+            node = _get_list_node(pending, last_item)
+            last_item = last_item - 1
+            next_gdfo = node.gdfo + 1
+            for pos from 0 <= pos < PyList_GET_SIZE(node.children):
+                child = _get_list_node(node.children, pos)
+                if next_gdfo > child.gdfo:
+                    child.gdfo = next_gdfo
+                child.seen = child.seen + 1
+                if child.seen == PyTuple_GET_SIZE(child.parents):
+                    # This child is populated, queue it to be walked
+                    last_item = last_item + 1
+                    if last_item < PyList_GET_SIZE(pending):
+                        Py_INCREF(child) # SetItem steals a ref
+                        PyList_SetItem(pending, last_item, child)
+                    else:
+                        PyList_Append(pending, child)
+                    # We have queued this node, we don't need to track it
+                    # anymore
+                    child.seen = 0
+
+    def add_node(self, key, parent_keys):
+        """Add a new node to the graph.
+
+        If this fills in a ghost, then the gdfos of all children will be
+        updated accordingly.
+        
+        :param key: The node being added. If this is a duplicate, this is a
+            no-op.
+        :param parent_keys: The parents of the given node.
+        :return: None (should we return if this was a ghost, etc?)
+        """
+        cdef PyObject *maybe_node
+        cdef _KnownGraphNode node, parent_node, child_node
+        cdef long parent_gdfo, next_gdfo
+
+        maybe_node = PyDict_GetItem(self._nodes, key)
+        if maybe_node != NULL:
+            node = <_KnownGraphNode>maybe_node
+            if node.parents is None:
+                # We are filling in a ghost
+                self._populate_parents(node, parent_keys)
+                # We can't trust cached heads anymore
+                self._known_heads.clear()
+            else: # Ensure that the parent_key list matches
+                existing_parent_keys = []
+                for parent_node in node.parents:
+                    existing_parent_keys.append(parent_node.key)
+                # Make sure we use a list for the comparison, in case it was a
+                # tuple, etc
+                parent_keys = list(parent_keys)
+                if existing_parent_keys == parent_keys:
+                    # Exact match, nothing more to do
+                    return
+                else:
+                    raise ValueError('Parent key mismatch, existing node %s'
+                        ' has parents of %s not %s'
+                        % (key, existing_parent_keys, parent_keys))
+        else:
+            node = _KnownGraphNode(key)
+            PyDict_SetItem(self._nodes, key, node)
+            self._populate_parents(node, parent_keys)
+        parent_gdfo = 0
+        for parent_node in node.parents:
+            if parent_node.gdfo == -1:
+                # This is a newly introduced ghost, so it gets gdfo of 1
+                parent_node.gdfo = 1
+            if parent_gdfo < parent_node.gdfo:
+                parent_gdfo = parent_node.gdfo
+        node.gdfo = parent_gdfo + 1
+        # Now fill the gdfo to all children
+        # Note that this loop is slightly inefficient, in that we may visit the
+        # same child (and its decendents) more than once, however, it is
+        # 'efficient' in that we only walk to nodes that would be updated,
+        # rather than all nodes
+        # We use a deque rather than a simple list stack, to go for BFD rather
+        # than DFD. So that if a longer path is possible, we walk it before we
+        # get to the final child
+        pending = deque([node])
+        pending_popleft = pending.popleft
+        pending_append = pending.append
+        while pending:
+            node = pending_popleft()
+            next_gdfo = node.gdfo + 1
+            for child_node in node.children:
+                if child_node.gdfo < next_gdfo:
+                    # This child is being updated, we need to check its
+                    # children
+                    child_node.gdfo = next_gdfo
+                    pending_append(child_node)
+
+    def heads(self, keys):
+        """Return the heads from amongst keys.
+
+        This is done by searching the ancestries of each key.  Any key that is
+        reachable from another key is not returned; all the others are.
+
+        This operation scales with the relative depth between any two keys. It
+        uses gdfo to avoid walking all ancestry.
+
+        :param keys: An iterable of keys.
+        :return: A set of the heads. Note that as a set there is no ordering
+            information. Callers will need to filter their input to create
+            order if they need it.
+        """
+        cdef PyObject *maybe_node
+        cdef PyObject *maybe_heads
+        cdef PyObject *temp_node
+        cdef _KnownGraphNode node
+        cdef Py_ssize_t pos, last_item
+        cdef long min_gdfo
+
+        heads_key = frozenset(keys)
+        maybe_heads = PyDict_GetItem(self._known_heads, heads_key)
+        if maybe_heads != NULL:
+            return <object>maybe_heads
+        # Not cached, compute it ourselves
+        candidate_nodes = {}
+        for key in keys:
+            maybe_node = PyDict_GetItem(self._nodes, key)
+            if maybe_node == NULL:
+                raise KeyError('key %s not in nodes' % (key,))
+            PyDict_SetItem(candidate_nodes, key, <object>maybe_node)
+        maybe_node = PyDict_GetItem(candidate_nodes, NULL_REVISION)
+        if maybe_node != NULL:
+            # NULL_REVISION is only a head if it is the only entry
+            candidate_nodes.pop(NULL_REVISION)
+            if not candidate_nodes:
+                return frozenset([NULL_REVISION])
+            # The keys changed, so recalculate heads_key
+            heads_key = frozenset(candidate_nodes)
+        if PyDict_Size(candidate_nodes) < 2:
+            return heads_key
+
+        cleanup = []
+        pending = []
+        # we know a gdfo cannot be longer than a linear chain of all nodes
+        min_gdfo = PyDict_Size(self._nodes) + 1
+        # Build up nodes that need to be walked, note that starting nodes are
+        # not added to seen()
+        pos = 0
+        while PyDict_Next(candidate_nodes, &pos, NULL, &temp_node):
+            node = <_KnownGraphNode>temp_node
+            if node.parents is not None:
+                pending.extend(node.parents)
+            if node.gdfo < min_gdfo:
+                min_gdfo = node.gdfo
+
+        # Now do all the real work
+        last_item = PyList_GET_SIZE(pending) - 1
+        while last_item >= 0:
+            node = _get_list_node(pending, last_item)
+            last_item = last_item - 1
+            if node.seen:
+                # node already appears in some ancestry
+                continue
+            PyList_Append(cleanup, node)
+            node.seen = 1
+            if node.gdfo <= min_gdfo:
+                continue
+            if node.parents is not None and PyTuple_GET_SIZE(node.parents) > 0:
+                for pos from 0 <= pos < PyTuple_GET_SIZE(node.parents):
+                    parent_node = _get_tuple_node(node.parents, pos)
+                    last_item = last_item + 1
+                    if last_item < PyList_GET_SIZE(pending):
+                        Py_INCREF(parent_node) # SetItem steals a ref
+                        PyList_SetItem(pending, last_item, parent_node)
+                    else:
+                        PyList_Append(pending, parent_node)
+        heads = []
+        pos = 0
+        while PyDict_Next(candidate_nodes, &pos, NULL, &temp_node):
+            node = <_KnownGraphNode>temp_node
+            if not node.seen:
+                PyList_Append(heads, node.key)
+        heads = frozenset(heads)
+        for pos from 0 <= pos < PyList_GET_SIZE(cleanup):
+            node = _get_list_node(cleanup, pos)
+            node.seen = 0
+        if self.do_cache:
+            PyDict_SetItem(self._known_heads, heads_key, heads)
+        return heads
+
+    def topo_sort(self):
+        """Return the nodes in topological order.
+
+        All parents must occur before all children.
+        """
+        # This is, for the most part, the same iteration order that we used for
+        # _find_gdfo, consider finding a way to remove the duplication
+        # In general, we find the 'tails' (nodes with no parents), and then
+        # walk to the children. For children that have all of their parents
+        # yielded, we queue up the child to be yielded as well.
+        cdef _KnownGraphNode node
+        cdef _KnownGraphNode child
+        cdef PyObject *temp
+        cdef Py_ssize_t pos
+        cdef int replace
+        cdef Py_ssize_t last_item
+
+        pending = self._find_tails()
+        if PyList_GET_SIZE(pending) == 0 and len(self._nodes) > 0:
+            raise errors.GraphCycleError(self._nodes)
+
+        topo_order = []
+
+        last_item = PyList_GET_SIZE(pending) - 1
+        while last_item >= 0:
+            # Avoid pop followed by push, instead, peek, and replace
+            # timing shows this is 930ms => 770ms for OOo
+            node = _get_list_node(pending, last_item)
+            last_item = last_item - 1
+            if node.parents is not None:
+                # We don't include ghost parents
+                PyList_Append(topo_order, node.key)
+            for pos from 0 <= pos < PyList_GET_SIZE(node.children):
+                child = _get_list_node(node.children, pos)
+                if child.gdfo == -1:
+                    # We know we have a graph cycle because a node has a parent
+                    # which we couldn't find
+                    raise errors.GraphCycleError(self._nodes)
+                child.seen = child.seen + 1
+                if child.seen == PyTuple_GET_SIZE(child.parents):
+                    # All parents of this child have been yielded, queue this
+                    # one to be yielded as well
+                    last_item = last_item + 1
+                    if last_item < PyList_GET_SIZE(pending):
+                        Py_INCREF(child) # SetItem steals a ref
+                        PyList_SetItem(pending, last_item, child)
+                    else:
+                        PyList_Append(pending, child)
+                    # We have queued this node, we don't need to track it
+                    # anymore
+                    child.seen = 0
+        # We started from the parents, so we don't need to do anymore work
+        return topo_order
+
+    def gc_sort(self):
+        """Return a reverse topological ordering which is 'stable'.
+
+        There are a few constraints:
+          1) Reverse topological (all children before all parents)
+          2) Grouped by prefix
+          3) 'stable' sorting, so that we get the same result, independent of
+             machine, or extra data.
+        To do this, we use the same basic algorithm as topo_sort, but when we
+        aren't sure what node to access next, we sort them lexicographically.
+        """
+        cdef PyObject *temp
+        cdef Py_ssize_t pos, last_item
+        cdef _KnownGraphNode node, node2, parent_node
+
+        tips = self._find_tips()
+        # Split the tips based on prefix
+        prefix_tips = {}
+        for pos from 0 <= pos < PyList_GET_SIZE(tips):
+            node = _get_list_node(tips, pos)
+            if PyString_CheckExact(node.key) or len(node.key) == 1:
+                prefix = ''
+            else:
+                prefix = node.key[0]
+            temp = PyDict_GetItem(prefix_tips, prefix)
+            if temp == NULL:
+                prefix_tips[prefix] = [node]
+            else:
+                tip_nodes = <object>temp
+                PyList_Append(tip_nodes, node)
+
+        result = []
+        for prefix in sorted(prefix_tips):
+            temp = PyDict_GetItem(prefix_tips, prefix)
+            assert temp != NULL
+            tip_nodes = <object>temp
+            pending = _sort_list_nodes(tip_nodes, 1)
+            last_item = PyList_GET_SIZE(pending) - 1
+            while last_item >= 0:
+                node = _get_list_node(pending, last_item)
+                last_item = last_item - 1
+                if node.parents is None:
+                    # Ghost
+                    continue
+                PyList_Append(result, node.key)
+                # Sorting the parent keys isn't strictly necessary for stable
+                # sorting of a given graph. But it does help minimize the
+                # differences between graphs
+                # For bzr.dev ancestry:
+                #   4.73ms  no sort
+                #   7.73ms  RichCompareBool sort
+                parents = _sort_list_nodes(node.parents, 1)
+                for pos from 0 <= pos < len(parents):
+                    if PyTuple_CheckExact(parents):
+                        parent_node = _get_tuple_node(parents, pos)
+                    else:
+                        parent_node = _get_list_node(parents, pos)
+                    # TODO: GraphCycle detection
+                    parent_node.seen = parent_node.seen + 1
+                    if (parent_node.seen
+                        == PyList_GET_SIZE(parent_node.children)):
+                        # All children have been processed, queue up this
+                        # parent
+                        last_item = last_item + 1
+                        if last_item < PyList_GET_SIZE(pending):
+                            Py_INCREF(parent_node) # SetItem steals a ref
+                            PyList_SetItem(pending, last_item, parent_node)
+                        else:
+                            PyList_Append(pending, parent_node)
+                        parent_node.seen = 0
+        return result
+
+    def merge_sort(self, tip_key):
+        """Compute the merge sorted graph output."""
+        cdef _MergeSorter sorter
+
+        # TODO: consider disabling gc since we are allocating a lot of nodes
+        #       that won't be collectable anyway. real world testing has not
+        #       shown a specific impact, yet.
+        sorter = _MergeSorter(self, tip_key)
+        return sorter.topo_order()
+    
+    def get_parent_keys(self, key):
+        """Get the parents for a key
+        
+        Returns a list containg the parents keys. If the key is a ghost,
+        None is returned. A KeyError will be raised if the key is not in
+        the graph.
+        
+        :param keys: Key to check (eg revision_id)
+        :return: A list of parents
+        """
+        return self._nodes[key].parent_keys 
+
+    def get_child_keys(self, key):
+        """Get the children for a key
+        
+        Returns a list containg the children keys. A KeyError will be raised
+        if the key is not in the graph.
+        
+        :param keys: Key to check (eg revision_id)
+        :return: A list of children
+        """
+        return self._nodes[key].child_keys    
+
+
+cdef class _MergeSortNode:
+    """Tracks information about a node during the merge_sort operation."""
+
+    # Public api
+    cdef public object key
+    cdef public long merge_depth
+    cdef public object end_of_merge # True/False Is this the end of the current merge
+
+    # Private api, used while computing the information
+    cdef _KnownGraphNode left_parent
+    cdef _KnownGraphNode left_pending_parent
+    cdef object pending_parents # list of _KnownGraphNode for non-left parents
+    cdef long _revno_first
+    cdef long _revno_second
+    cdef long _revno_last
+    # TODO: turn these into flag/bit fields rather than individual members
+    cdef int is_first_child # Is this the first child?
+    cdef int seen_by_child # A child node has seen this parent
+    cdef int completed # Fully Processed
+
+    def __init__(self, key):
+        self.key = key
+        self.merge_depth = -1
+        self.left_parent = None
+        self.left_pending_parent = None
+        self.pending_parents = None
+        self._revno_first = -1
+        self._revno_second = -1
+        self._revno_last = -1
+        self.is_first_child = 0
+        self.seen_by_child = 0
+        self.completed = 0
+
+    def __repr__(self):
+        return '%s(%s depth:%s rev:%s,%s,%s first:%s seen:%s)' % (
+            self.__class__.__name__, self.key,
+            self.merge_depth,
+            self._revno_first, self._revno_second, self._revno_last,
+            self.is_first_child, self.seen_by_child)
+
+    cdef int has_pending_parents(self): # cannot_raise
+        if self.left_pending_parent is not None or self.pending_parents:
+            return 1
+        return 0
+
+    cdef object _revno(self):
+        if self._revno_first == -1:
+            if self._revno_second != -1:
+                raise RuntimeError('Something wrong with: %s' % (self,))
+            return (self._revno_last,)
+        else:
+            return (self._revno_first, self._revno_second, self._revno_last)
+
+    property revno:
+        def __get__(self):
+            return self._revno()
+
+
+cdef class _MergeSorter:
+    """This class does the work of computing the merge_sort ordering.
+
+    We have some small advantages, in that we get all the extra information
+    that KnownGraph knows, like knowing the child lists, etc.
+    """
+
+    # Current performance numbers for merge_sort(bzr_dev_parent_map):
+    #  302ms tsort.merge_sort()
+    #   91ms graph.KnownGraph().merge_sort()
+    #   40ms kg.merge_sort()
+
+    cdef KnownGraph graph
+    cdef object _depth_first_stack  # list
+    cdef Py_ssize_t _last_stack_item # offset to last item on stack
+    # cdef object _ms_nodes # dict of key => _MergeSortNode
+    cdef object _revno_to_branch_count # {revno => num child branches}
+    cdef object _scheduled_nodes # List of nodes ready to be yielded
+
+    def __init__(self, known_graph, tip_key):
+        cdef _KnownGraphNode node
+
+        self.graph = known_graph
+        # self._ms_nodes = {}
+        self._revno_to_branch_count = {}
+        self._depth_first_stack = []
+        self._last_stack_item = -1
+        self._scheduled_nodes = []
+        if (tip_key is not None and tip_key != NULL_REVISION
+            and tip_key != (NULL_REVISION,)):
+            node = self.graph._nodes[tip_key]
+            self._push_node(node, 0)
+
+    cdef _MergeSortNode _get_ms_node(self, _KnownGraphNode node):
+        cdef PyObject *temp_node
+        cdef _MergeSortNode ms_node
+
+        if node.extra is None:
+            ms_node = _MergeSortNode(node.key)
+            node.extra = ms_node
+        else:
+            ms_node = <_MergeSortNode>node.extra
+        return ms_node
+
+    cdef _push_node(self, _KnownGraphNode node, long merge_depth):
+        cdef _KnownGraphNode parent_node
+        cdef _MergeSortNode ms_node, ms_parent_node
+        cdef Py_ssize_t pos
+
+        ms_node = self._get_ms_node(node)
+        ms_node.merge_depth = merge_depth
+        if node.parents is None:
+            raise RuntimeError('ghost nodes should not be pushed'
+                               ' onto the stack: %s' % (node,))
+        if PyTuple_GET_SIZE(node.parents) > 0:
+            parent_node = _get_tuple_node(node.parents, 0)
+            ms_node.left_parent = parent_node
+            if parent_node.parents is None: # left-hand ghost
+                ms_node.left_pending_parent = None
+                ms_node.left_parent = None
+            else:
+                ms_node.left_pending_parent = parent_node
+        if PyTuple_GET_SIZE(node.parents) > 1:
+            ms_node.pending_parents = []
+            for pos from 1 <= pos < PyTuple_GET_SIZE(node.parents):
+                parent_node = _get_tuple_node(node.parents, pos)
+                if parent_node.parents is None: # ghost
+                    continue
+                PyList_Append(ms_node.pending_parents, parent_node)
+
+        ms_node.is_first_child = 1
+        if ms_node.left_parent is not None:
+            ms_parent_node = self._get_ms_node(ms_node.left_parent)
+            if ms_parent_node.seen_by_child:
+                ms_node.is_first_child = 0
+            ms_parent_node.seen_by_child = 1
+        self._last_stack_item = self._last_stack_item + 1
+        if self._last_stack_item < PyList_GET_SIZE(self._depth_first_stack):
+            Py_INCREF(node) # SetItem steals a ref
+            PyList_SetItem(self._depth_first_stack, self._last_stack_item,
+                           node)
+        else:
+            PyList_Append(self._depth_first_stack, node)
+
+    cdef _pop_node(self):
+        cdef PyObject *temp
+        cdef _MergeSortNode ms_node, ms_parent_node, ms_prev_node
+        cdef _KnownGraphNode node, parent_node, prev_node
+
+        node = _get_list_node(self._depth_first_stack, self._last_stack_item)
+        ms_node = <_MergeSortNode>node.extra
+        self._last_stack_item = self._last_stack_item - 1
+        if ms_node.left_parent is not None:
+            # Assign the revision number from the left-hand parent
+            ms_parent_node = <_MergeSortNode>ms_node.left_parent.extra
+            if ms_node.is_first_child:
+                # First child just increments the final digit
+                ms_node._revno_first = ms_parent_node._revno_first
+                ms_node._revno_second = ms_parent_node._revno_second
+                ms_node._revno_last = ms_parent_node._revno_last + 1
+            else:
+                # Not the first child, make a new branch
+                #  (mainline_revno, branch_count, 1)
+                if ms_parent_node._revno_first == -1:
+                    # Mainline ancestor, the increment is on the last digit
+                    base_revno = ms_parent_node._revno_last
+                else:
+                    base_revno = ms_parent_node._revno_first
+                temp = PyDict_GetItem(self._revno_to_branch_count,
+                                      base_revno)
+                if temp == NULL:
+                    branch_count = 1
+                else:
+                    branch_count = (<object>temp) + 1
+                PyDict_SetItem(self._revno_to_branch_count, base_revno,
+                               branch_count)
+                ms_node._revno_first = base_revno
+                ms_node._revno_second = branch_count
+                ms_node._revno_last = 1
+        else:
+            temp = PyDict_GetItem(self._revno_to_branch_count, 0)
+            if temp == NULL:
+                # The first root node doesn't have a 3-digit revno
+                root_count = 0
+                ms_node._revno_first = -1
+                ms_node._revno_second = -1
+                ms_node._revno_last = 1
+            else:
+                root_count = (<object>temp) + 1
+                ms_node._revno_first = 0
+                ms_node._revno_second = root_count
+                ms_node._revno_last = 1
+            PyDict_SetItem(self._revno_to_branch_count, 0, root_count)
+        ms_node.completed = 1
+        if PyList_GET_SIZE(self._scheduled_nodes) == 0:
+            # The first scheduled node is always the end of merge
+            ms_node.end_of_merge = True
+        else:
+            prev_node = _get_list_node(self._scheduled_nodes,
+                                    PyList_GET_SIZE(self._scheduled_nodes) - 1)
+            ms_prev_node = <_MergeSortNode>prev_node.extra
+            if ms_prev_node.merge_depth < ms_node.merge_depth:
+                # The previously pushed node is to our left, so this is the end
+                # of this right-hand chain
+                ms_node.end_of_merge = True
+            elif (ms_prev_node.merge_depth == ms_node.merge_depth
+                  and prev_node not in node.parents):
+                # The next node is not a direct parent of this node
+                ms_node.end_of_merge = True
+            else:
+                ms_node.end_of_merge = False
+        PyList_Append(self._scheduled_nodes, node)
+
+    cdef _schedule_stack(self):
+        cdef _KnownGraphNode last_node, next_node
+        cdef _MergeSortNode ms_node, ms_last_node, ms_next_node
+        cdef long next_merge_depth
+        ordered = []
+        while self._last_stack_item >= 0:
+            # Peek at the last item on the stack
+            last_node = _get_list_node(self._depth_first_stack,
+                                       self._last_stack_item)
+            if last_node.gdfo == -1:
+                # if _find_gdfo skipped a node, that means there is a graph
+                # cycle, error out now
+                raise errors.GraphCycleError(self.graph._nodes)
+            ms_last_node = <_MergeSortNode>last_node.extra
+            if not ms_last_node.has_pending_parents():
+                # Processed all parents, pop this node
+                self._pop_node()
+                continue
+            while ms_last_node.has_pending_parents():
+                if ms_last_node.left_pending_parent is not None:
+                    # recurse depth first into the primary parent
+                    next_node = ms_last_node.left_pending_parent
+                    ms_last_node.left_pending_parent = None
+                else:
+                    # place any merges in right-to-left order for scheduling
+                    # which gives us left-to-right order after we reverse
+                    # the scheduled queue.
+                    # Note: This has the effect of allocating common-new
+                    #       revisions to the right-most subtree rather than the
+                    #       left most, which will display nicely (you get
+                    #       smaller trees at the top of the combined merge).
+                    next_node = ms_last_node.pending_parents.pop()
+                ms_next_node = self._get_ms_node(next_node)
+                if ms_next_node.completed:
+                    # this parent was completed by a child on the
+                    # call stack. skip it.
+                    continue
+                # otherwise transfer it from the source graph into the
+                # top of the current depth first search stack.
+
+                if next_node is ms_last_node.left_parent:
+                    next_merge_depth = ms_last_node.merge_depth
+                else:
+                    next_merge_depth = ms_last_node.merge_depth + 1
+                self._push_node(next_node, next_merge_depth)
+                # and do not continue processing parents until this 'call'
+                # has recursed.
+                break
+
+    cdef topo_order(self):
+        cdef _MergeSortNode ms_node
+        cdef _KnownGraphNode node
+        cdef Py_ssize_t pos
+        cdef PyObject *temp_key, *temp_node
+
+        # Note: allocating a _MergeSortNode and deallocating it for all nodes
+        #       costs approx 8.52ms (21%) of the total runtime
+        #       We might consider moving the attributes into the base
+        #       KnownGraph object.
+        self._schedule_stack()
+
+        # We've set up the basic schedule, now we can continue processing the
+        # output.
+        # Note: This final loop costs us 40.0ms => 28.8ms (11ms, 25%) on
+        #       bzr.dev, to convert the internal Object representation into a
+        #       Tuple representation...
+        #       2ms is walking the data and computing revno tuples
+        #       7ms is computing the return tuple
+        #       4ms is PyList_Append()
+        ordered = []
+        # output the result in reverse order, and separate the generated info
+        for pos from PyList_GET_SIZE(self._scheduled_nodes) > pos >= 0:
+            node = _get_list_node(self._scheduled_nodes, pos)
+            ms_node = <_MergeSortNode>node.extra
+            PyList_Append(ordered, ms_node)
+            node.extra = None
+        # Clear out the scheduled nodes now that we're done
+        self._scheduled_nodes = []
+        return ordered
author	Lorry <lorry@roadtrain.codethink.co.uk>	2012-08-22 15:47:16 +0100
committer	Lorry <lorry@roadtrain.codethink.co.uk>	2012-08-22 15:47:16 +0100
commit	25335618bf8755ce6b116ee14f47f5a1f2c821e9 (patch)
tree	d889d7ab3f9f985d0c54c534cb8052bd2e6d7163 /bzrlib/_known_graph_pyx.pyx
download	bzr-tarball-25335618bf8755ce6b116ee14f47f5a1f2c821e9.tar.gz