summaryrefslogtreecommitdiff
path: root/bzrlib/check.py
diff options
context:
space:
mode:
Diffstat (limited to 'bzrlib/check.py')
-rw-r--r--bzrlib/check.py446
1 files changed, 446 insertions, 0 deletions
diff --git a/bzrlib/check.py b/bzrlib/check.py
new file mode 100644
index 0000000..48d9ace
--- /dev/null
+++ b/bzrlib/check.py
@@ -0,0 +1,446 @@
+# Copyright (C) 2005, 2006 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+# TODO: Check ancestries are correct for every revision: includes
+# every committed so far, and in a reasonable order.
+
+# TODO: Also check non-mainline revisions mentioned as parents.
+
+# TODO: Check for extra files in the control directory.
+
+# TODO: Check revision, inventory and entry objects have all
+# required fields.
+
+# TODO: Get every revision in the revision-store even if they're not
+# referenced by history and make sure they're all valid.
+
+# TODO: Perhaps have a way to record errors other than by raising exceptions;
+# would perhaps be enough to accumulate exception objects in a list without
+# raising them. If there's more than one exception it'd be good to see them
+# all.
+
+"""Checking of bzr objects.
+
+check_refs is a concept used for optimising check. Objects that depend on other
+objects (e.g. tree on repository) can list the objects they would be requesting
+so that when the dependent object is checked, matches can be pulled out and
+evaluated in-line rather than re-reading the same data many times.
+check_refs are tuples (kind, value). Currently defined kinds are:
+
+* 'trees', where value is a revid and the looked up objects are revision trees.
+* 'lefthand-distance', where value is a revid and the looked up objects are the
+ distance along the lefthand path to NULL for that revid.
+* 'revision-existence', where value is a revid, and the result is True or False
+ indicating that the revision was found/not found.
+"""
+
+from __future__ import absolute_import
+
+from bzrlib import (
+ errors,
+ ui,
+ )
+from bzrlib.branch import Branch
+from bzrlib.controldir import ControlDir
+from bzrlib.revision import NULL_REVISION
+from bzrlib.trace import note
+from bzrlib.workingtree import WorkingTree
+from bzrlib.i18n import gettext
+
+class Check(object):
+ """Check a repository"""
+
+ def __init__(self, repository, check_repo=True):
+ self.repository = repository
+
+ def report_results(self, verbose):
+ raise NotImplementedError(self.report_results)
+
+
+class VersionedFileCheck(Check):
+ """Check a versioned file repository"""
+
+ # The Check object interacts with InventoryEntry.check, etc.
+
+ def __init__(self, repository, check_repo=True):
+ self.repository = repository
+ self.checked_rev_cnt = 0
+ self.ghosts = set()
+ self.missing_parent_links = {}
+ self.missing_inventory_sha_cnt = 0
+ self.missing_revision_cnt = 0
+ self.checked_weaves = set()
+ self.unreferenced_versions = set()
+ self.inconsistent_parents = []
+ self.rich_roots = repository.supports_rich_root()
+ self.text_key_references = {}
+ self.check_repo = check_repo
+ self.other_results = []
+ # Plain text lines to include in the report
+ self._report_items = []
+ # Keys we are looking for; may be large and need spilling to disk.
+ # key->(type(revision/inventory/text/signature/map), sha1, first-referer)
+ self.pending_keys = {}
+ # Ancestors map for all of revisions being checked; while large helper
+ # functions we call would create it anyway, so better to have once and
+ # keep.
+ self.ancestors = {}
+
+ def check(self, callback_refs=None, check_repo=True):
+ if callback_refs is None:
+ callback_refs = {}
+ self.repository.lock_read()
+ self.progress = ui.ui_factory.nested_progress_bar()
+ try:
+ self.progress.update(gettext('check'), 0, 4)
+ if self.check_repo:
+ self.progress.update(gettext('checking revisions'), 0)
+ self.check_revisions()
+ self.progress.update(gettext('checking commit contents'), 1)
+ self.repository._check_inventories(self)
+ self.progress.update(gettext('checking file graphs'), 2)
+ # check_weaves is done after the revision scan so that
+ # revision index is known to be valid.
+ self.check_weaves()
+ self.progress.update(gettext('checking branches and trees'), 3)
+ if callback_refs:
+ repo = self.repository
+ # calculate all refs, and callback the objects requesting them.
+ refs = {}
+ wanting_items = set()
+ # Current crude version calculates everything and calls
+ # everything at once. Doing a queue and popping as things are
+ # satisfied would be cheaper on memory [but few people have
+ # huge numbers of working trees today. TODO: fix before
+ # landing].
+ distances = set()
+ existences = set()
+ for ref, wantlist in callback_refs.iteritems():
+ wanting_items.update(wantlist)
+ kind, value = ref
+ if kind == 'trees':
+ refs[ref] = repo.revision_tree(value)
+ elif kind == 'lefthand-distance':
+ distances.add(value)
+ elif kind == 'revision-existence':
+ existences.add(value)
+ else:
+ raise AssertionError(
+ 'unknown ref kind for ref %s' % ref)
+ node_distances = repo.get_graph().find_lefthand_distances(distances)
+ for key, distance in node_distances.iteritems():
+ refs[('lefthand-distance', key)] = distance
+ if key in existences and distance > 0:
+ refs[('revision-existence', key)] = True
+ existences.remove(key)
+ parent_map = repo.get_graph().get_parent_map(existences)
+ for key in parent_map:
+ refs[('revision-existence', key)] = True
+ existences.remove(key)
+ for key in existences:
+ refs[('revision-existence', key)] = False
+ for item in wanting_items:
+ if isinstance(item, WorkingTree):
+ item._check(refs)
+ if isinstance(item, Branch):
+ self.other_results.append(item.check(refs))
+ finally:
+ self.progress.finished()
+ self.repository.unlock()
+
+ def _check_revisions(self, revisions_iterator):
+ """Check revision objects by decorating a generator.
+
+ :param revisions_iterator: An iterator of(revid, Revision-or-None).
+ :return: A generator of the contents of revisions_iterator.
+ """
+ self.planned_revisions = set()
+ for revid, revision in revisions_iterator:
+ yield revid, revision
+ self._check_one_rev(revid, revision)
+ # Flatten the revisions we found to guarantee consistent later
+ # iteration.
+ self.planned_revisions = list(self.planned_revisions)
+ # TODO: extract digital signatures as items to callback on too.
+
+ def check_revisions(self):
+ """Scan revisions, checking data directly available as we go."""
+ revision_iterator = self.repository._iter_revisions(None)
+ revision_iterator = self._check_revisions(revision_iterator)
+ # We read the all revisions here:
+ # - doing this allows later code to depend on the revision index.
+ # - we can fill out existence flags at this point
+ # - we can read the revision inventory sha at this point
+ # - we can check properties and serialisers etc.
+ if not self.repository._format.revision_graph_can_have_wrong_parents:
+ # The check against the index isn't needed.
+ self.revs_with_bad_parents_in_index = None
+ for thing in revision_iterator:
+ pass
+ else:
+ bad_revisions = self.repository._find_inconsistent_revision_parents(
+ revision_iterator)
+ self.revs_with_bad_parents_in_index = list(bad_revisions)
+
+ def report_results(self, verbose):
+ if self.check_repo:
+ self._report_repo_results(verbose)
+ for result in self.other_results:
+ result.report_results(verbose)
+
+ def _report_repo_results(self, verbose):
+ note(gettext('checked repository {0} format {1}').format(
+ self.repository.user_url,
+ self.repository._format))
+ note(gettext('%6d revisions'), self.checked_rev_cnt)
+ note(gettext('%6d file-ids'), len(self.checked_weaves))
+ if verbose:
+ note(gettext('%6d unreferenced text versions'),
+ len(self.unreferenced_versions))
+ if verbose and len(self.unreferenced_versions):
+ for file_id, revision_id in self.unreferenced_versions:
+ note(gettext('unreferenced version: {{{0}}} in {1}').format(revision_id,
+ file_id))
+ if self.missing_inventory_sha_cnt:
+ note(gettext('%6d revisions are missing inventory_sha1'),
+ self.missing_inventory_sha_cnt)
+ if self.missing_revision_cnt:
+ note(gettext('%6d revisions are mentioned but not present'),
+ self.missing_revision_cnt)
+ if len(self.ghosts):
+ note(gettext('%6d ghost revisions'), len(self.ghosts))
+ if verbose:
+ for ghost in self.ghosts:
+ note(' %s', ghost)
+ if len(self.missing_parent_links):
+ note(gettext('%6d revisions missing parents in ancestry'),
+ len(self.missing_parent_links))
+ if verbose:
+ for link, linkers in self.missing_parent_links.items():
+ note(gettext(' %s should be in the ancestry for:'), link)
+ for linker in linkers:
+ note(' * %s', linker)
+ if len(self.inconsistent_parents):
+ note(gettext('%6d inconsistent parents'), len(self.inconsistent_parents))
+ if verbose:
+ for info in self.inconsistent_parents:
+ revision_id, file_id, found_parents, correct_parents = info
+ note(gettext(' * {0} version {1} has parents {2!r} '
+ 'but should have {3!r}').format(
+ file_id, revision_id, found_parents,
+ correct_parents))
+ if self.revs_with_bad_parents_in_index:
+ note(gettext(
+ '%6d revisions have incorrect parents in the revision index'),
+ len(self.revs_with_bad_parents_in_index))
+ if verbose:
+ for item in self.revs_with_bad_parents_in_index:
+ revision_id, index_parents, actual_parents = item
+ note(gettext(
+ ' {0} has wrong parents in index: '
+ '{1!r} should be {2!r}').format(
+ revision_id, index_parents, actual_parents))
+ for item in self._report_items:
+ note(item)
+
+ def _check_one_rev(self, rev_id, rev):
+ """Cross-check one revision.
+
+ :param rev_id: A revision id to check.
+ :param rev: A revision or None to indicate a missing revision.
+ """
+ if rev.revision_id != rev_id:
+ self._report_items.append(gettext(
+ 'Mismatched internal revid {{{0}}} and index revid {{{1}}}').format(
+ rev.revision_id, rev_id))
+ rev_id = rev.revision_id
+ # Check this revision tree etc, and count as seen when we encounter a
+ # reference to it.
+ self.planned_revisions.add(rev_id)
+ # It is not a ghost
+ self.ghosts.discard(rev_id)
+ # Count all parents as ghosts if we haven't seen them yet.
+ for parent in rev.parent_ids:
+ if not parent in self.planned_revisions:
+ self.ghosts.add(parent)
+
+ self.ancestors[rev_id] = tuple(rev.parent_ids) or (NULL_REVISION,)
+ self.add_pending_item(rev_id, ('inventories', rev_id), 'inventory',
+ rev.inventory_sha1)
+ self.checked_rev_cnt += 1
+
+ def add_pending_item(self, referer, key, kind, sha1):
+ """Add a reference to a sha1 to be cross checked against a key.
+
+ :param referer: The referer that expects key to have sha1.
+ :param key: A storage key e.g. ('texts', 'foo@bar-20040504-1234')
+ :param kind: revision/inventory/text/map/signature
+ :param sha1: A hex sha1 or None if no sha1 is known.
+ """
+ existing = self.pending_keys.get(key)
+ if existing:
+ if sha1 != existing[1]:
+ self._report_items.append(gettext('Multiple expected sha1s for {0}. {{{1}}}'
+ ' expects {{{2}}}, {{{3}}} expects {{{4}}}').format(
+ key, referer, sha1, existing[1], existing[0]))
+ else:
+ self.pending_keys[key] = (kind, sha1, referer)
+
+ def check_weaves(self):
+ """Check all the weaves we can get our hands on.
+ """
+ weave_ids = []
+ storebar = ui.ui_factory.nested_progress_bar()
+ try:
+ self._check_weaves(storebar)
+ finally:
+ storebar.finished()
+
+ def _check_weaves(self, storebar):
+ storebar.update('text-index', 0, 2)
+ if self.repository._format.fast_deltas:
+ # We haven't considered every fileid instance so far.
+ weave_checker = self.repository._get_versioned_file_checker(
+ ancestors=self.ancestors)
+ else:
+ weave_checker = self.repository._get_versioned_file_checker(
+ text_key_references=self.text_key_references,
+ ancestors=self.ancestors)
+ storebar.update('file-graph', 1)
+ result = weave_checker.check_file_version_parents(
+ self.repository.texts)
+ self.checked_weaves = weave_checker.file_ids
+ bad_parents, unused_versions = result
+ bad_parents = bad_parents.items()
+ for text_key, (stored_parents, correct_parents) in bad_parents:
+ # XXX not ready for id join/split operations.
+ weave_id = text_key[0]
+ revision_id = text_key[-1]
+ weave_parents = tuple([parent[-1] for parent in stored_parents])
+ correct_parents = tuple([parent[-1] for parent in correct_parents])
+ self.inconsistent_parents.append(
+ (revision_id, weave_id, weave_parents, correct_parents))
+ self.unreferenced_versions.update(unused_versions)
+
+ def _add_entry_to_text_key_references(self, inv, entry):
+ if not self.rich_roots and entry.name == '':
+ return
+ key = (entry.file_id, entry.revision)
+ self.text_key_references.setdefault(key, False)
+ if entry.revision == inv.revision_id:
+ self.text_key_references[key] = True
+
+
+def scan_branch(branch, needed_refs, to_unlock):
+ """Scan a branch for refs.
+
+ :param branch: The branch to schedule for checking.
+ :param needed_refs: Refs we are accumulating.
+ :param to_unlock: The unlock list accumulating.
+ """
+ note(gettext("Checking branch at '%s'.") % (branch.base,))
+ branch.lock_read()
+ to_unlock.append(branch)
+ branch_refs = branch._get_check_refs()
+ for ref in branch_refs:
+ reflist = needed_refs.setdefault(ref, [])
+ reflist.append(branch)
+
+
+def scan_tree(base_tree, tree, needed_refs, to_unlock):
+ """Scan a tree for refs.
+
+ :param base_tree: The original tree check opened, used to detect duplicate
+ tree checks.
+ :param tree: The tree to schedule for checking.
+ :param needed_refs: Refs we are accumulating.
+ :param to_unlock: The unlock list accumulating.
+ """
+ if base_tree is not None and tree.basedir == base_tree.basedir:
+ return
+ note(gettext("Checking working tree at '%s'.") % (tree.basedir,))
+ tree.lock_read()
+ to_unlock.append(tree)
+ tree_refs = tree._get_check_refs()
+ for ref in tree_refs:
+ reflist = needed_refs.setdefault(ref, [])
+ reflist.append(tree)
+
+
+def check_dwim(path, verbose, do_branch=False, do_repo=False, do_tree=False):
+ """Check multiple objects.
+
+ If errors occur they are accumulated and reported as far as possible, and
+ an exception raised at the end of the process.
+ """
+ try:
+ base_tree, branch, repo, relpath = \
+ ControlDir.open_containing_tree_branch_or_repository(path)
+ except errors.NotBranchError:
+ base_tree = branch = repo = None
+
+ to_unlock = []
+ needed_refs= {}
+ try:
+ if base_tree is not None:
+ # If the tree is a lightweight checkout we won't see it in
+ # repo.find_branches - add now.
+ if do_tree:
+ scan_tree(None, base_tree, needed_refs, to_unlock)
+ branch = base_tree.branch
+ if branch is not None:
+ # We have a branch
+ if repo is None:
+ # The branch is in a shared repository
+ repo = branch.repository
+ if repo is not None:
+ repo.lock_read()
+ to_unlock.append(repo)
+ branches = repo.find_branches(using=True)
+ saw_tree = False
+ if do_branch or do_tree:
+ for branch in branches:
+ if do_tree:
+ try:
+ tree = branch.bzrdir.open_workingtree()
+ saw_tree = True
+ except (errors.NotLocalUrl, errors.NoWorkingTree):
+ pass
+ else:
+ scan_tree(base_tree, tree, needed_refs, to_unlock)
+ if do_branch:
+ scan_branch(branch, needed_refs, to_unlock)
+ if do_branch and not branches:
+ note(gettext("No branch found at specified location."))
+ if do_tree and base_tree is None and not saw_tree:
+ note(gettext("No working tree found at specified location."))
+ if do_repo or do_branch or do_tree:
+ if do_repo:
+ note(gettext("Checking repository at '%s'.")
+ % (repo.user_url,))
+ result = repo.check(None, callback_refs=needed_refs,
+ check_repo=do_repo)
+ result.report_results(verbose)
+ else:
+ if do_tree:
+ note(gettext("No working tree found at specified location."))
+ if do_branch:
+ note(gettext("No branch found at specified location."))
+ if do_repo:
+ note(gettext("No repository found at specified location."))
+ finally:
+ for thing in to_unlock:
+ thing.unlock()