generalise RevisionLoader to RevisionStore as a repo abstraction

author: Ian Clatworthy <ian.clatworthy@canonical.com> 2009-02-19 17:13:43 +1000
committer: Ian Clatworthy <ian.clatworthy@canonical.com> 2009-02-19 17:13:43 +1000
commit: 995d645a0dbc1d920fdc4b15c3c905ea69f3c84e (patch)
tree: 2e152be3d4d2aaa9f2e89b691a5356569ff75e79 /revision_store.py
parent: 69bd63cebbbf62db05b2b9874596d118c6db7b13 (diff)
download: bzr-fastimport-995d645a0dbc1d920fdc4b15c3c905ea69f3c84e.tar.gz
1 files changed, 360 insertions, 0 deletions
diff --git a/revision_store.py b/revision_store.py
new file mode 100644
index 0000000..fd2b427
--- /dev/null
+++ b/revision_store.py
@@ -0,0 +1,360 @@
+# Copyright (C) 2008 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+"""An abstraction of a repository providing just the bits importing needs."""
+
+
+from bzrlib import errors, knit, lru_cache, osutils
+from bzrlib import revision as _mod_revision
+
+
+class AbstractRevisionStore(object):
+
+    def __init__(self, repo):
+        """An object responsible for loading revisions into a repository.
+
+        NOTE: Repository locking is not managed by this class. Clients
+        should take a write lock, call load() multiple times, then release
+        the lock.
+
+        :param repo: the target repository
+        """
+        self.repo = repo
+        self.try_inv_deltas = getattr(self.repo._format, '_commit_inv_deltas',
+            False)
+
+    def expects_rich_root(self):
+        """Does this store expect inventories with rich roots?"""
+        return self.repo.supports_rich_root()
+
+    def get_inventory(self, revision_id):
+        """Get a stored inventory."""
+        return self.repo.get_inventory(revision_id)
+
+    def get_file_text(self, revision_id, file_id):
+        """Get the text stored for a file in a given revision."""
+        revtree = self.repo.revision_tree(revision_id)
+        return revtree.get_file_text(file_id)
+
+    def load(self, rev, inv, signature, text_provider,
+        inventories_provider=None):
+        """Load a revision.
+
+        :param rev: the Revision
+        :param inv: the inventory
+        :param signature: signing information
+        :param text_provider: a callable expecting a file_id parameter
+            that returns the text for that file-id
+        :param inventories_provider: a callable expecting a list of
+            revision-ids, that returns:
+              * the list of revision-ids present in the repository
+              * the list of inventories for the revision-id's,
+                including an empty inventory for the missing revisions
+            If None, a default implementation is provided.
+        """
+        # NOTE: This is bzrlib.repository._install_revision refactored
+        # to provide more flexibility in how previous revisions are cached,
+        # data is fed in, etc.
+        if inventories_provider is None:
+            inventories_provider = self._default_inventories_provider
+        present_parents, parent_invs = inventories_provider(rev.parent_ids)
+        self._load_texts(rev.revision_id, inv.iter_entries(), parent_invs,
+            text_provider)
+        try:
+            rev.inventory_sha1 = self._add_inventory(rev.revision_id,
+                inv, present_parents, parent_invs)
+        except errors.RevisionAlreadyPresent:
+            pass  # NOTE(review): rev.inventory_sha1 is not updated here
+        if signature is not None:
+            self.repo.add_signature_text(rev.revision_id, signature)
+        self._add_revision(rev, inv)
+
+    def _load_texts(self, revision_id, entries, parent_invs, text_provider):
+        """Load texts to a repository for inventory entries.
+        
+        This method is provided for subclasses to use or override.
+
+        :param revision_id: the revision identifier
+        :param entries: iterator over the inventory entries
+        :param parent_invs: the parent inventories
+        :param text_provider: a callable expecting a file_id parameter
+            that returns the text for that file-id
+        """
+        raise NotImplementedError(self._load_texts)
+
+    def _add_inventory(self, revision_id, inv, parents, parent_invs):
+        """Add the inventory inv to the repository as revision_id.
+        
+        :param parents: The revision ids of the parents that revision_id
+                        is known to have and are in the repository already.
+        :param parent_invs: the parent inventories
+
+        :returns: The validator(which is a sha1 digest, though what is sha'd is
+            repository format specific) of the serialized inventory.
+        """
+        if self.try_inv_deltas and len(parents):
+            # NOTE(review): parent_invs has one entry per parent, including
+            # empty inventories for ghosts, so parent_invs[0] may not be the
+            # inventory of parents[0]. Should we search for the matching one?
+            basis_inv = parent_invs[0]
+            delta = inv._make_delta(basis_inv)
+            return self.repo.add_inventory_by_delta(parents[0], delta,
+                revision_id, parents)
+        else:
+            return self.repo.add_inventory(revision_id, inv, parents)
+
+    def _add_revision(self, rev, inv):
+        """Add a revision and its inventory to a repository.
+
+        :param rev: the Revision
+        :param inv: the inventory
+        """
+        self.repo.add_revision(rev.revision_id, rev, inv)
+
+    def _default_inventories_provider(self, revision_ids):
+        """An inventories provider that queries the repository."""
+        present = []
+        inventories = []
+        for revision_id in revision_ids:
+            if self.repo.has_revision(revision_id):
+                present.append(revision_id)
+                rev_tree = self.repo.revision_tree(revision_id)
+            else:
+                rev_tree = self.repo.revision_tree(None)
+            inventories.append(rev_tree.inventory)
+        return present, inventories
+
+
+class RevisionStore1(AbstractRevisionStore):
+    """A RevisionStore that uses the old bzrlib Repository API.
+    
+    The old API was present until bzr.dev rev 3510.
+    """
+
+    def _load_texts(self, revision_id, entries, parent_invs, text_provider):
+        """See AbstractRevisionStore._load_texts()."""
+        # Backwards compatibility hack: skip the root id.
+        if not self.repo.supports_rich_root():
+            path, root = entries.next()
+            if root.revision != revision_id:
+                raise errors.IncompatibleRevision(repr(self.repo))
+        # Add the texts that are not already present
+        tx = self.repo.get_transaction()
+        for path, ie in entries:
+            # This test is *really* slow: over 50% of import time
+            #w = self.repo.weave_store.get_weave_or_empty(ie.file_id, tx)
+            #if ie.revision in w:
+            #    continue
+            # Instead, assume the version is not already there. In the
+            # general case a shared repository might already have the
+            # revision, but we arguably don't need that check when importing
+            # from a foreign system.
+            if ie.revision != revision_id:
+                continue
+            text_parents = []
+            for parent_inv in parent_invs:
+                if ie.file_id not in parent_inv:
+                    continue
+                parent_id = parent_inv[ie.file_id].revision
+                if parent_id in text_parents:
+                    continue
+                text_parents.append(parent_id)
+            lines = text_provider(ie.file_id)
+            vfile = self.repo.weave_store.get_weave_or_empty(ie.file_id, tx)
+            vfile.add_lines(revision_id, text_parents, lines)
+
+    def _get_lines(self, file_id, revision_id):
+        """Return the lines stored for file_id in revision_id."""
+        tx = self.repo.get_transaction()
+        w = self.repo.weave_store.get_weave(file_id, tx)
+        return w.get_lines(revision_id)
+
+    def _add_revision(self, rev, inv):
+        # There's no need to do everything repo.add_revision does and
+        # doing so (since bzr.dev 3392) can be pretty slow for long
+        # delta chains on inventories. Just do the essentials here ...
+        _mod_revision.check_not_reserved_id(rev.revision_id)
+        self.repo._revision_store.add_revision(rev, self.repo.get_transaction())
+
+
+class RevisionStore2(AbstractRevisionStore):
+    """A RevisionStore that uses the new bzrlib Repository API."""
+
+    def _load_texts(self, revision_id, entries, parent_invs, text_provider):
+        """See AbstractRevisionStore._load_texts()."""
+        # Backwards compatibility hack: skip the root id.
+        if not self.repo.supports_rich_root():
+            path, root = entries.next()
+            if root.revision != revision_id:
+                raise errors.IncompatibleRevision(repr(self.repo))
+        text_keys = {}
+        for path, ie in entries:
+            text_keys[(ie.file_id, ie.revision)] = ie
+        text_parent_map = self.repo.texts.get_parent_map(text_keys)
+        missing_texts = set(text_keys) - set(text_parent_map)
+        # Add the texts that are not already present
+        for text_key in missing_texts:
+            ie = text_keys[text_key]
+            text_parents = []
+            for parent_inv in parent_invs:
+                if ie.file_id in parent_inv:
+                    # Dedupe on the (file_id, revision) key the list stores.
+                    parent_key = (ie.file_id, parent_inv[ie.file_id].revision)
+                    if parent_key not in text_parents:
+                        text_parents.append(parent_key)
+            lines = text_provider(ie.file_id)
+            self.repo.texts.add_lines(text_key, text_parents, lines)
+
+    def _get_lines(self, file_id, revision_id):
+        """Return the lines of file_id at revision_id (raises if absent)."""
+        record = self.repo.texts.get_record_stream([(file_id, revision_id)],
+            'unordered', True).next()
+        if record.storage_kind == 'absent':
+            raise errors.RevisionNotPresent(record.key, self.repo)
+        return osutils.split_lines(record.get_bytes_as('fulltext'))
+
+    # This is breaking imports into brisbane-core currently
+    #def _add_revision(self, rev, inv):
+    #    # There's no need to do everything repo.add_revision does and
+    #    # doing so (since bzr.dev 3392) can be pretty slow for long
+    #    # delta chains on inventories. Just do the essentials here ...
+    #    _mod_revision.check_not_reserved_id(rev.revision_id)
+    #    self.repo._add_revision(rev)
+ 
+
+class ImportRevisionStore1(RevisionStore1):
+    """A RevisionStore (old Repository API) optimised for importing.
+
+    This implementation caches serialised inventory texts and provides
+    fine-grained control over when inventories are stored as fulltexts.
+    """
+
+    def __init__(self, repo, parent_texts_to_cache=1, fulltext_when=None,
+        random_ids=True):
+        """See AbstractRevisionStore.__init__.
+
+        :param repo: the target repository
+        :param parent_texts_to_cache: the number of parent texts to cache
+        :param fulltext_when: if not None, a function called with the revision
+          count; a true result means store the inventory as a fulltext.
+        :param random_ids: passed as random_id when adding inventory versions
+        """
+        RevisionStore1.__init__(self, repo)
+        self.inv_parent_texts = lru_cache.LRUCache(parent_texts_to_cache)
+        self.fulltext_when = fulltext_when
+        self.random_ids = random_ids
+        self.revision_count = 0
+
+    def _add_inventory(self, revision_id, inv, parents, parent_invs):
+        """See AbstractRevisionStore._add_inventory."""
+        # Code taken from bzrlib.repository.add_inventory
+        assert self.repo.is_in_write_group()
+        _mod_revision.check_not_reserved_id(revision_id)
+        assert inv.revision_id is None or inv.revision_id == revision_id, \
+            "Mismatch between inventory revision" \
+            " id and insertion revid (%r, %r)" % (inv.revision_id, revision_id)
+        assert inv.root is not None
+        inv_lines = self.repo._serialise_inventory_to_lines(inv)
+        inv_vf = self.repo.get_inventory_weave()
+        sha1, num_bytes, parent_text = self._inventory_add_lines(inv_vf,
+            revision_id, parents, inv_lines, self.inv_parent_texts)
+        self.inv_parent_texts[revision_id] = parent_text
+        return sha1
+
+    def _inventory_add_lines(self, inv_vf, version_id, parents, lines,
+            parent_texts):
+        """See Repository._inventory_add_lines()."""
+        # setup parameters used in original code but not this API
+        self.revision_count += 1
+        if self.fulltext_when is not None:
+            delta = not self.fulltext_when(self.revision_count)
+        else:
+            delta = inv_vf.delta
+        left_matching_blocks = None
+        random_id = self.random_ids
+        check_content = False
+
+        # bzrlib.knit.add_lines() but error checking optimised
+        inv_vf._check_add(version_id, lines, random_id, check_content)
+
+        ####################################################################
+        # bzrlib.knit._add() but skip checking if fulltext better than delta
+        ####################################################################
+
+        line_bytes = ''.join(lines)
+        digest = osutils.sha_string(line_bytes)
+        present_parents = []
+        for parent in parents:
+            if inv_vf.has_version(parent):
+                present_parents.append(parent)
+        if parent_texts is None:
+            parent_texts = {}
+
+        # can only compress against the left most present parent.
+        if (delta and
+            (len(present_parents) == 0 or
+             present_parents[0] != parents[0])):
+            delta = False
+
+        text_length = len(line_bytes)
+        options = []
+        if lines:
+            if lines[-1][-1] != '\n':
+                # copy the contents of lines.
+                lines = lines[:]
+                options.append('no-eol')
+                lines[-1] = lines[-1] + '\n'
+                line_bytes += '\n'
+
+        #if delta:
+        #    # To speed the extract of texts the delta chain is limited
+        #    # to a fixed number of deltas.  This should minimize both
+        #    # I/O and the time spent applying deltas.
+        #    delta = inv_vf._check_should_delta(present_parents)
+
+        assert isinstance(version_id, str)
+        content = inv_vf.factory.make(lines, version_id)
+        if delta or (inv_vf.factory.annotated and len(present_parents) > 0):
+            # Merge annotations from parent texts if needed.
+            delta_hunks = inv_vf._merge_annotations(content, present_parents,
+                parent_texts, delta, inv_vf.factory.annotated,
+                left_matching_blocks)
+
+        if delta:
+            options.append('line-delta')
+            store_lines = inv_vf.factory.lower_line_delta(delta_hunks)
+            size, bytes = inv_vf._data._record_to_data(version_id, digest,
+                store_lines)
+        else:
+            options.append('fulltext')
+            # isinstance is slower and we have no hierarchy.
+            if inv_vf.factory.__class__ == knit.KnitPlainFactory:
+                # Use the already joined bytes saving iteration time in
+                # _record_to_data.
+                size, bytes = inv_vf._data._record_to_data(version_id, digest,
+                    lines, [line_bytes])
+            else:
+                # get mixed annotation + content and feed it into the
+                # serialiser.
+                store_lines = inv_vf.factory.lower_fulltext(content)
+                size, bytes = inv_vf._data._record_to_data(version_id, digest,
+                    store_lines)
+
+        access_memo = inv_vf._data.add_raw_records([size], bytes)[0]
+        inv_vf._index.add_versions(
+            ((version_id, options, access_memo, parents),),
+            random_id=random_id)
+        return digest, text_length, content
author	Ian Clatworthy <ian.clatworthy@canonical.com>	2009-02-19 17:13:43 +1000
committer	Ian Clatworthy <ian.clatworthy@canonical.com>	2009-02-19 17:13:43 +1000
commit	995d645a0dbc1d920fdc4b15c3c905ea69f3c84e (patch)
tree	2e152be3d4d2aaa9f2e89b691a5356569ff75e79 /revision_store.py
parent	69bd63cebbbf62db05b2b9874596d118c6db7b13 (diff)
download	bzr-fastimport-995d645a0dbc1d920fdc4b15c3c905ea69f3c84e.tar.gz