diff options
Diffstat (limited to 'bzrlib/check.py')
-rw-r--r-- | bzrlib/check.py | 446 |
1 files changed, 446 insertions, 0 deletions
diff --git a/bzrlib/check.py b/bzrlib/check.py new file mode 100644 index 0000000..48d9ace --- /dev/null +++ b/bzrlib/check.py @@ -0,0 +1,446 @@ +# Copyright (C) 2005, 2006 Canonical Ltd +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +# TODO: Check ancestries are correct for every revision: includes +# every committed so far, and in a reasonable order. + +# TODO: Also check non-mainline revisions mentioned as parents. + +# TODO: Check for extra files in the control directory. + +# TODO: Check revision, inventory and entry objects have all +# required fields. + +# TODO: Get every revision in the revision-store even if they're not +# referenced by history and make sure they're all valid. + +# TODO: Perhaps have a way to record errors other than by raising exceptions; +# would perhaps be enough to accumulate exception objects in a list without +# raising them. If there's more than one exception it'd be good to see them +# all. + +"""Checking of bzr objects. + +check_refs is a concept used for optimising check. Objects that depend on other +objects (e.g. tree on repository) can list the objects they would be requesting +so that when the dependent object is checked, matches can be pulled out and +evaluated in-line rather than re-reading the same data many times. +check_refs are tuples (kind, value). Currently defined kinds are: + +* 'trees', where value is a revid and the looked up objects are revision trees. +* 'lefthand-distance', where value is a revid and the looked up objects are the + distance along the lefthand path to NULL for that revid. +* 'revision-existence', where value is a revid, and the result is True or False + indicating that the revision was found/not found. +""" + +from __future__ import absolute_import + +from bzrlib import ( + errors, + ui, + ) +from bzrlib.branch import Branch +from bzrlib.controldir import ControlDir +from bzrlib.revision import NULL_REVISION +from bzrlib.trace import note +from bzrlib.workingtree import WorkingTree +from bzrlib.i18n import gettext + +class Check(object): + """Check a repository""" + + def __init__(self, repository, check_repo=True): + self.repository = repository + + def report_results(self, verbose): + raise NotImplementedError(self.report_results) + + +class VersionedFileCheck(Check): + """Check a versioned file repository""" + + # The Check object interacts with InventoryEntry.check, etc. + + def __init__(self, repository, check_repo=True): + self.repository = repository + self.checked_rev_cnt = 0 + self.ghosts = set() + self.missing_parent_links = {} + self.missing_inventory_sha_cnt = 0 + self.missing_revision_cnt = 0 + self.checked_weaves = set() + self.unreferenced_versions = set() + self.inconsistent_parents = [] + self.rich_roots = repository.supports_rich_root() + self.text_key_references = {} + self.check_repo = check_repo + self.other_results = [] + # Plain text lines to include in the report + self._report_items = [] + # Keys we are looking for; may be large and need spilling to disk. + # key->(type(revision/inventory/text/signature/map), sha1, first-referer) + self.pending_keys = {} + # Ancestors map for all of revisions being checked; while large helper + # functions we call would create it anyway, so better to have once and + # keep. + self.ancestors = {} + + def check(self, callback_refs=None, check_repo=True): + if callback_refs is None: + callback_refs = {} + self.repository.lock_read() + self.progress = ui.ui_factory.nested_progress_bar() + try: + self.progress.update(gettext('check'), 0, 4) + if self.check_repo: + self.progress.update(gettext('checking revisions'), 0) + self.check_revisions() + self.progress.update(gettext('checking commit contents'), 1) + self.repository._check_inventories(self) + self.progress.update(gettext('checking file graphs'), 2) + # check_weaves is done after the revision scan so that + # revision index is known to be valid. + self.check_weaves() + self.progress.update(gettext('checking branches and trees'), 3) + if callback_refs: + repo = self.repository + # calculate all refs, and callback the objects requesting them. + refs = {} + wanting_items = set() + # Current crude version calculates everything and calls + # everything at once. Doing a queue and popping as things are + # satisfied would be cheaper on memory [but few people have + # huge numbers of working trees today. TODO: fix before + # landing]. + distances = set() + existences = set() + for ref, wantlist in callback_refs.iteritems(): + wanting_items.update(wantlist) + kind, value = ref + if kind == 'trees': + refs[ref] = repo.revision_tree(value) + elif kind == 'lefthand-distance': + distances.add(value) + elif kind == 'revision-existence': + existences.add(value) + else: + raise AssertionError( + 'unknown ref kind for ref %s' % ref) + node_distances = repo.get_graph().find_lefthand_distances(distances) + for key, distance in node_distances.iteritems(): + refs[('lefthand-distance', key)] = distance + if key in existences and distance > 0: + refs[('revision-existence', key)] = True + existences.remove(key) + parent_map = repo.get_graph().get_parent_map(existences) + for key in parent_map: + refs[('revision-existence', key)] = True + existences.remove(key) + for key in existences: + refs[('revision-existence', key)] = False + for item in wanting_items: + if isinstance(item, WorkingTree): + item._check(refs) + if isinstance(item, Branch): + self.other_results.append(item.check(refs)) + finally: + self.progress.finished() + self.repository.unlock() + + def _check_revisions(self, revisions_iterator): + """Check revision objects by decorating a generator. + + :param revisions_iterator: An iterator of(revid, Revision-or-None). + :return: A generator of the contents of revisions_iterator. + """ + self.planned_revisions = set() + for revid, revision in revisions_iterator: + yield revid, revision + self._check_one_rev(revid, revision) + # Flatten the revisions we found to guarantee consistent later + # iteration. + self.planned_revisions = list(self.planned_revisions) + # TODO: extract digital signatures as items to callback on too. + + def check_revisions(self): + """Scan revisions, checking data directly available as we go.""" + revision_iterator = self.repository._iter_revisions(None) + revision_iterator = self._check_revisions(revision_iterator) + # We read the all revisions here: + # - doing this allows later code to depend on the revision index. + # - we can fill out existence flags at this point + # - we can read the revision inventory sha at this point + # - we can check properties and serialisers etc. + if not self.repository._format.revision_graph_can_have_wrong_parents: + # The check against the index isn't needed. + self.revs_with_bad_parents_in_index = None + for thing in revision_iterator: + pass + else: + bad_revisions = self.repository._find_inconsistent_revision_parents( + revision_iterator) + self.revs_with_bad_parents_in_index = list(bad_revisions) + + def report_results(self, verbose): + if self.check_repo: + self._report_repo_results(verbose) + for result in self.other_results: + result.report_results(verbose) + + def _report_repo_results(self, verbose): + note(gettext('checked repository {0} format {1}').format( + self.repository.user_url, + self.repository._format)) + note(gettext('%6d revisions'), self.checked_rev_cnt) + note(gettext('%6d file-ids'), len(self.checked_weaves)) + if verbose: + note(gettext('%6d unreferenced text versions'), + len(self.unreferenced_versions)) + if verbose and len(self.unreferenced_versions): + for file_id, revision_id in self.unreferenced_versions: + note(gettext('unreferenced version: {{{0}}} in {1}').format(revision_id, + file_id)) + if self.missing_inventory_sha_cnt: + note(gettext('%6d revisions are missing inventory_sha1'), + self.missing_inventory_sha_cnt) + if self.missing_revision_cnt: + note(gettext('%6d revisions are mentioned but not present'), + self.missing_revision_cnt) + if len(self.ghosts): + note(gettext('%6d ghost revisions'), len(self.ghosts)) + if verbose: + for ghost in self.ghosts: + note(' %s', ghost) + if len(self.missing_parent_links): + note(gettext('%6d revisions missing parents in ancestry'), + len(self.missing_parent_links)) + if verbose: + for link, linkers in self.missing_parent_links.items(): + note(gettext(' %s should be in the ancestry for:'), link) + for linker in linkers: + note(' * %s', linker) + if len(self.inconsistent_parents): + note(gettext('%6d inconsistent parents'), len(self.inconsistent_parents)) + if verbose: + for info in self.inconsistent_parents: + revision_id, file_id, found_parents, correct_parents = info + note(gettext(' * {0} version {1} has parents {2!r} ' + 'but should have {3!r}').format( + file_id, revision_id, found_parents, + correct_parents)) + if self.revs_with_bad_parents_in_index: + note(gettext( + '%6d revisions have incorrect parents in the revision index'), + len(self.revs_with_bad_parents_in_index)) + if verbose: + for item in self.revs_with_bad_parents_in_index: + revision_id, index_parents, actual_parents = item + note(gettext( + ' {0} has wrong parents in index: ' + '{1!r} should be {2!r}').format( + revision_id, index_parents, actual_parents)) + for item in self._report_items: + note(item) + + def _check_one_rev(self, rev_id, rev): + """Cross-check one revision. + + :param rev_id: A revision id to check. + :param rev: A revision or None to indicate a missing revision. + """ + if rev.revision_id != rev_id: + self._report_items.append(gettext( + 'Mismatched internal revid {{{0}}} and index revid {{{1}}}').format( + rev.revision_id, rev_id)) + rev_id = rev.revision_id + # Check this revision tree etc, and count as seen when we encounter a + # reference to it. + self.planned_revisions.add(rev_id) + # It is not a ghost + self.ghosts.discard(rev_id) + # Count all parents as ghosts if we haven't seen them yet. + for parent in rev.parent_ids: + if not parent in self.planned_revisions: + self.ghosts.add(parent) + + self.ancestors[rev_id] = tuple(rev.parent_ids) or (NULL_REVISION,) + self.add_pending_item(rev_id, ('inventories', rev_id), 'inventory', + rev.inventory_sha1) + self.checked_rev_cnt += 1 + + def add_pending_item(self, referer, key, kind, sha1): + """Add a reference to a sha1 to be cross checked against a key. + + :param referer: The referer that expects key to have sha1. + :param key: A storage key e.g. ('texts', 'foo@bar-20040504-1234') + :param kind: revision/inventory/text/map/signature + :param sha1: A hex sha1 or None if no sha1 is known. + """ + existing = self.pending_keys.get(key) + if existing: + if sha1 != existing[1]: + self._report_items.append(gettext('Multiple expected sha1s for {0}. {{{1}}}' + ' expects {{{2}}}, {{{3}}} expects {{{4}}}').format( + key, referer, sha1, existing[1], existing[0])) + else: + self.pending_keys[key] = (kind, sha1, referer) + + def check_weaves(self): + """Check all the weaves we can get our hands on. + """ + weave_ids = [] + storebar = ui.ui_factory.nested_progress_bar() + try: + self._check_weaves(storebar) + finally: + storebar.finished() + + def _check_weaves(self, storebar): + storebar.update('text-index', 0, 2) + if self.repository._format.fast_deltas: + # We haven't considered every fileid instance so far. + weave_checker = self.repository._get_versioned_file_checker( + ancestors=self.ancestors) + else: + weave_checker = self.repository._get_versioned_file_checker( + text_key_references=self.text_key_references, + ancestors=self.ancestors) + storebar.update('file-graph', 1) + result = weave_checker.check_file_version_parents( + self.repository.texts) + self.checked_weaves = weave_checker.file_ids + bad_parents, unused_versions = result + bad_parents = bad_parents.items() + for text_key, (stored_parents, correct_parents) in bad_parents: + # XXX not ready for id join/split operations. + weave_id = text_key[0] + revision_id = text_key[-1] + weave_parents = tuple([parent[-1] for parent in stored_parents]) + correct_parents = tuple([parent[-1] for parent in correct_parents]) + self.inconsistent_parents.append( + (revision_id, weave_id, weave_parents, correct_parents)) + self.unreferenced_versions.update(unused_versions) + + def _add_entry_to_text_key_references(self, inv, entry): + if not self.rich_roots and entry.name == '': + return + key = (entry.file_id, entry.revision) + self.text_key_references.setdefault(key, False) + if entry.revision == inv.revision_id: + self.text_key_references[key] = True + + +def scan_branch(branch, needed_refs, to_unlock): + """Scan a branch for refs. + + :param branch: The branch to schedule for checking. + :param needed_refs: Refs we are accumulating. + :param to_unlock: The unlock list accumulating. + """ + note(gettext("Checking branch at '%s'.") % (branch.base,)) + branch.lock_read() + to_unlock.append(branch) + branch_refs = branch._get_check_refs() + for ref in branch_refs: + reflist = needed_refs.setdefault(ref, []) + reflist.append(branch) + + +def scan_tree(base_tree, tree, needed_refs, to_unlock): + """Scan a tree for refs. + + :param base_tree: The original tree check opened, used to detect duplicate + tree checks. + :param tree: The tree to schedule for checking. + :param needed_refs: Refs we are accumulating. + :param to_unlock: The unlock list accumulating. + """ + if base_tree is not None and tree.basedir == base_tree.basedir: + return + note(gettext("Checking working tree at '%s'.") % (tree.basedir,)) + tree.lock_read() + to_unlock.append(tree) + tree_refs = tree._get_check_refs() + for ref in tree_refs: + reflist = needed_refs.setdefault(ref, []) + reflist.append(tree) + + +def check_dwim(path, verbose, do_branch=False, do_repo=False, do_tree=False): + """Check multiple objects. + + If errors occur they are accumulated and reported as far as possible, and + an exception raised at the end of the process. + """ + try: + base_tree, branch, repo, relpath = \ + ControlDir.open_containing_tree_branch_or_repository(path) + except errors.NotBranchError: + base_tree = branch = repo = None + + to_unlock = [] + needed_refs= {} + try: + if base_tree is not None: + # If the tree is a lightweight checkout we won't see it in + # repo.find_branches - add now. + if do_tree: + scan_tree(None, base_tree, needed_refs, to_unlock) + branch = base_tree.branch + if branch is not None: + # We have a branch + if repo is None: + # The branch is in a shared repository + repo = branch.repository + if repo is not None: + repo.lock_read() + to_unlock.append(repo) + branches = repo.find_branches(using=True) + saw_tree = False + if do_branch or do_tree: + for branch in branches: + if do_tree: + try: + tree = branch.bzrdir.open_workingtree() + saw_tree = True + except (errors.NotLocalUrl, errors.NoWorkingTree): + pass + else: + scan_tree(base_tree, tree, needed_refs, to_unlock) + if do_branch: + scan_branch(branch, needed_refs, to_unlock) + if do_branch and not branches: + note(gettext("No branch found at specified location.")) + if do_tree and base_tree is None and not saw_tree: + note(gettext("No working tree found at specified location.")) + if do_repo or do_branch or do_tree: + if do_repo: + note(gettext("Checking repository at '%s'.") + % (repo.user_url,)) + result = repo.check(None, callback_refs=needed_refs, + check_repo=do_repo) + result.report_results(verbose) + else: + if do_tree: + note(gettext("No working tree found at specified location.")) + if do_branch: + note(gettext("No branch found at specified location.")) + if do_repo: + note(gettext("No repository found at specified location.")) + finally: + for thing in to_unlock: + thing.unlock() |