From f231d5373c39b3c85fed6c83a2f72acdce58e43d Mon Sep 17 00:00:00 2001
From: Jelmer Vernooij
Date: Tue, 28 Feb 2012 15:00:56 +0100
Subject: Reimport some modules removed from python-fastimport 0.9.2.

---
Review notes (text between "---" and the diffstat is ignored by git am):

 * idmapfile.load_id_map: strip only the line terminator, skip blank
   lines, and return len(map) as the count the docstring promises.
 * reftracker.RefTracker: dump_stats() called a _show_stats_for() helper
   that did not exist; the helper is now defined.  track_heads() tolerates
   a commit whose merges attribute is empty or None.
 * info_processor.InfoProcessor: post_process() no longer deletes
   self.blobs['used'] while reporting; has_key()/type()== replaced with
   "dict.get"/isinstance; docstrings added.
 * tests: regression test for RefTracker.dump_stats().
 * Line breaks and indentation were reconstructed; "index" blob ids are
   those of the original patch.

 NEWS                            |   3 +
 cache_manager.py                |   6 +-
 cmds.py                         |   4 +-
 idmapfile.py                    |  70 +++++++++
 processors/generic_processor.py |   2 +-
 processors/info_processor.py    | 288 ++++++++++++++++++++++++++++++++++++++++
 reftracker.py                   |  91 ++++++++++++
 tests/test_head_tracking.py     | 269 +++++++++++++++++++++++++++++++++++++
 8 files changed, 727 insertions(+), 6 deletions(-)
 create mode 100644 idmapfile.py
 create mode 100644 processors/info_processor.py
 create mode 100644 reftracker.py
 create mode 100644 tests/test_head_tracking.py

diff --git a/NEWS b/NEWS
index 3387c53..582762b 100644
--- a/NEWS
+++ b/NEWS
@@ -12,6 +12,9 @@ Compatibility
 * Avoid using Tree.inventory directly, which is deprecated in bzr 2.6.
   (Jelmer Vernooij)
 
+* Reimport some modules removed from python-fastimport 0.9.2.
+  (Jelmer Vernooij, #693507)
+
 0.12 2012-02-09
 
 Bug fixes
diff --git a/cache_manager.py b/cache_manager.py
index 9a0b651..e0f7f90 100644
--- a/cache_manager.py
+++ b/cache_manager.py
@@ -25,12 +25,12 @@ from bzrlib import lru_cache, trace
 from bzrlib.plugins.fastimport import (
     branch_mapper,
     )
+from bzrlib.plugins.fastimport.reftracker import (
+    RefTracker,
+    )
 from fastimport.helpers import (
     single_plural,
     )
-from fastimport.reftracker import (
-    RefTracker,
-    )
 
 
 class _Cleanup(object):
diff --git a/cmds.py b/cmds.py
index d364b00..d90393a 100644
--- a/cmds.py
+++ b/cmds.py
@@ -307,7 +307,7 @@ class cmd_fast_import(Command):
     def _generate_info(self, source):
         from cStringIO import StringIO
         from fastimport import parser
-        from fastimport.processors import info_processor
+        from bzrlib.plugins.fastimport.processors import info_processor
         stream = _get_source_stream(source)
         output = StringIO()
         try:
@@ -477,7 +477,7 @@ class cmd_fast_import_info(Command):
     takes_options = ['verbose']
     def run(self, source, verbose=False):
         load_fastimport()
-        from fastimport.processors import info_processor
+        from bzrlib.plugins.fastimport.processors import info_processor
         return _run(source, info_processor.InfoProcessor, verbose=verbose)
 
 
diff --git a/idmapfile.py b/idmapfile.py
new file mode 100644
index 0000000..669dbce
--- /dev/null
+++ b/idmapfile.py
@@ -0,0 +1,70 @@
+# Copyright (C) 2008 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""Routines for saving and loading the id-map file."""
+
+import os
+
+
+def save_id_map(filename, revision_ids):
+    """Save the mapping of commit ids to revision ids to a file.
+
+    Throws the usual exceptions if the file cannot be opened,
+    written to or closed.
+
+    :param filename: name of the file to save the data to
+    :param revision_ids: a dictionary of commit ids to revision ids.
+    """
+    f = open(filename, 'wb')
+    try:
+        for commit_id, rev_id in revision_ids.iteritems():
+            f.write("%s %s\n" % (commit_id, rev_id))
+        f.flush()
+    finally:
+        f.close()
+
+
+def load_id_map(filename):
+    """Load the mapping of commit ids to revision ids from a file.
+
+    If the file does not exist, an empty result is returned.
+    If the file does exist but cannot be opened, read or closed,
+    the normal exceptions are thrown.
+
+    Each line is expected to hold a commit id and a revision id separated
+    by a single space, as written by save_id_map. Blank lines are skipped.
+
+    NOTE: It is assumed that commit-ids do not have embedded spaces.
+
+    :param filename: name of the file to load the data from
+    :result: map, count where:
+      map = a dictionary of commit ids to revision ids;
+      count = the number of keys in map
+    """
+    result = {}
+    if os.path.exists(filename):
+        f = open(filename)
+        try:
+            for line in f:
+                # Strip only the line terminator: line[:-1] would eat the
+                # last character of a final line that lacks a newline.
+                line = line.rstrip('\n')
+                if not line:
+                    continue
+                parts = line.split(' ', 1)
+                result[parts[0]] = parts[1]
+        finally:
+            f.close()
+    return result, len(result)
diff --git a/processors/generic_processor.py b/processors/generic_processor.py
index 4ab0ac3..80b0c5d 100644
--- a/processors/generic_processor.py
+++ b/processors/generic_processor.py
@@ -40,6 +40,7 @@ except ImportError:
 from bzrlib.plugins.fastimport import (
     branch_updater,
     cache_manager,
+    idmapfile,
     marks_file,
     revision_store,
     )
@@ -47,7 +48,6 @@ from fastimport import (
     commands,
     errors as plugin_errors,
     helpers,
-    idmapfile,
     processor,
     )
 
diff --git a/processors/info_processor.py b/processors/info_processor.py
new file mode 100644
index 0000000..bb162e7
--- /dev/null
+++ b/processors/info_processor.py
@@ -0,0 +1,288 @@
+# Copyright (C) 2008 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""Import processor that dumps stats about the input (and doesn't import)."""
+
+from bzrlib.plugins.fastimport import (
+    reftracker,
+    )
+from fastimport import (
+    commands,
+    processor,
+    )
+from fastimport.helpers import (
+    invert_dict,
+    invert_dictset,
+    )
+import stat
+
+
+class InfoProcessor(processor.ImportProcessor):
+    """An import processor that dumps statistics about the input.
+
+    No changes to the current repository are made.
+
+    As well as providing useful information about an import
+    stream before importing it, this processor is useful for
+    benchmarking the speed at which data can be extracted from
+    the source.
+    """
+
+    def __init__(self, params=None, verbose=0, outf=None):
+        processor.ImportProcessor.__init__(self, params, verbose,
+            outf=outf)
+
+    def pre_process(self):
+        """Initialise every counter and tracker used by the handlers."""
+        # Init statistics
+        self.cmd_counts = {}
+        for cmd in commands.COMMAND_NAMES:
+            self.cmd_counts[cmd] = 0
+        self.file_cmd_counts = {}
+        for fc in commands.FILE_COMMAND_NAMES:
+            self.file_cmd_counts[fc] = 0
+        self.parent_counts = {}
+        self.max_parent_count = 0
+        self.committers = set()
+        self.separate_authors_found = False
+        self.symlinks_found = False
+        self.executables_found = False
+        self.sha_blob_references = False
+        self.lightweight_tags = 0
+        # Blob usage tracking
+        self.blobs = {}
+        for usage in ['new', 'used', 'unknown', 'unmarked']:
+            self.blobs[usage] = set()
+        self.blob_ref_counts = {}
+        # Head tracking
+        self.reftracker = reftracker.RefTracker()
+        # Stuff to cache: a map from mark to # of times that mark is merged
+        self.merges = {}
+        # Stuff to cache: these are maps from mark to sets
+        self.rename_old_paths = {}
+        self.copy_source_paths = {}
+
+    def post_process(self):
+        """Write the collected statistics to self.outf."""
+        # Dump statistics
+        cmd_names = commands.COMMAND_NAMES
+        fc_names = commands.FILE_COMMAND_NAMES
+        self._dump_stats_group("Command counts",
+            [(c, self.cmd_counts[c]) for c in cmd_names], str)
+        self._dump_stats_group("File command counts",
+            [(c, self.file_cmd_counts[c]) for c in fc_names], str)
+
+        # Commit stats
+        if self.cmd_counts['commit']:
+            p_items = []
+            for i in xrange(0, self.max_parent_count + 1):
+                if i in self.parent_counts:
+                    count = self.parent_counts[i]
+                    p_items.append(("parents-%d" % i, count))
+            merges_count = len(self.merges)
+            p_items.append(('total revisions merged', merges_count))
+            flags = {
+                'separate authors found': self.separate_authors_found,
+                'executables': self.executables_found,
+                'symlinks': self.symlinks_found,
+                'blobs referenced by SHA': self.sha_blob_references,
+                }
+            self._dump_stats_group("Parent counts", p_items, str)
+            self._dump_stats_group("Commit analysis", flags.iteritems(), _found)
+            heads = invert_dictset(self.reftracker.heads)
+            self._dump_stats_group("Head analysis", heads.iteritems(), None,
+                                    _iterable_as_config_list)
+            # note("\t%d\t%s" % (len(self.committers), 'unique committers'))
+            self._dump_stats_group("Merges", self.merges.iteritems(), None)
+            # We only show the rename old path and copy source paths when -vv
+            # (verbose=2) is specified. The output here for mysql's data can't
+            # be parsed currently so this bit of code needs more work anyhow ..
+            if self.verbose >= 2:
+                self._dump_stats_group("Rename old paths",
+                    self.rename_old_paths.iteritems(), len,
+                    _iterable_as_config_list)
+                self._dump_stats_group("Copy source paths",
+                    self.copy_source_paths.iteritems(), len,
+                    _iterable_as_config_list)
+
+        # Blob stats
+        if self.cmd_counts['blob']:
+            blob_usage = self.blobs.items()
+            if self.verbose:
+                # In verbose mode, don't list every blob used. Filter the
+                # report instead of deleting the entry so that self.blobs
+                # keeps all four usage sets.
+                blob_usage = [(usage, marks) for usage, marks in blob_usage
+                              if usage != 'used']
+            self._dump_stats_group("Blob usage tracking",
+                blob_usage, len, _iterable_as_config_list)
+        if self.blob_ref_counts:
+            blobs_by_count = invert_dict(self.blob_ref_counts)
+            blob_items = blobs_by_count.items()
+            blob_items.sort()
+            self._dump_stats_group("Blob reference counts",
+                blob_items, len, _iterable_as_config_list)
+
+        # Other stats
+        if self.cmd_counts['reset']:
+            reset_stats = {
+                'lightweight tags': self.lightweight_tags,
+                }
+            self._dump_stats_group("Reset analysis", reset_stats.iteritems())
+
+    def _dump_stats_group(self, title, items, normal_formatter=None,
+        verbose_formatter=None):
+        """Dump a statistics group.
+
+        In verbose mode, do so as a config file so
+        that other processors can load the information if they want to.
+
+        :param title: the heading written before the group
+        :param items: an iterable of (name, value) pairs
+        :param normal_formatter: the callable to apply to the value
+          before displaying it in normal mode
+        :param verbose_formatter: the callable to apply to the value
+          before displaying it in verbose mode
+        """
+        if self.verbose:
+            self.outf.write("[%s]\n" % (title,))
+            for name, value in items:
+                if verbose_formatter is not None:
+                    value = verbose_formatter(value)
+                if isinstance(name, str):
+                    name = name.replace(' ', '-')
+                self.outf.write("%s = %s\n" % (name, value))
+            self.outf.write("\n")
+        else:
+            self.outf.write("%s:\n" % (title,))
+            for name, value in items:
+                if normal_formatter is not None:
+                    value = normal_formatter(value)
+                self.outf.write("\t%s\t%s\n" % (value, name))
+
+    def progress_handler(self, cmd):
+        """Process a ProgressCommand."""
+        self.cmd_counts[cmd.name] += 1
+
+    def blob_handler(self, cmd):
+        """Process a BlobCommand."""
+        self.cmd_counts[cmd.name] += 1
+        if cmd.mark is None:
+            self.blobs['unmarked'].add(cmd.id)
+        else:
+            self.blobs['new'].add(cmd.id)
+            # Marks can be re-used so remove it from used if already there.
+            # Note: we definitely do NOT want to remove it from multi if
+            # it's already in that set.
+            self.blobs['used'].discard(cmd.id)
+
+    def checkpoint_handler(self, cmd):
+        """Process a CheckpointCommand."""
+        self.cmd_counts[cmd.name] += 1
+
+    def commit_handler(self, cmd):
+        """Process a CommitCommand."""
+        self.cmd_counts[cmd.name] += 1
+        self.committers.add(cmd.committer)
+        if cmd.author is not None:
+            self.separate_authors_found = True
+        for fc in cmd.iter_files():
+            self.file_cmd_counts[fc.name] += 1
+            if isinstance(fc, commands.FileModifyCommand):
+                if fc.mode & 0111:
+                    self.executables_found = True
+                if stat.S_ISLNK(fc.mode):
+                    self.symlinks_found = True
+                if fc.dataref is not None:
+                    if fc.dataref.startswith(':'):
+                        self._track_blob(fc.dataref)
+                    else:
+                        self.sha_blob_references = True
+            elif isinstance(fc, commands.FileRenameCommand):
+                self.rename_old_paths.setdefault(cmd.id, set()).add(fc.old_path)
+            elif isinstance(fc, commands.FileCopyCommand):
+                self.copy_source_paths.setdefault(cmd.id, set()).add(fc.src_path)
+
+        # Track the heads
+        parents = self.reftracker.track_heads(cmd)
+
+        # Track the parent counts
+        parent_count = len(parents)
+        self.parent_counts[parent_count] = (
+            self.parent_counts.get(parent_count, 0) + 1)
+        if parent_count > self.max_parent_count:
+            self.max_parent_count = parent_count
+
+        # Remember the merges
+        if cmd.merges:
+            for merge in cmd.merges:
+                self.merges[merge] = self.merges.get(merge, 0) + 1
+
+    def reset_handler(self, cmd):
+        """Process a ResetCommand."""
+        self.cmd_counts[cmd.name] += 1
+        if cmd.ref.startswith('refs/tags/'):
+            self.lightweight_tags += 1
+        else:
+            if cmd.from_ is not None:
+                self.reftracker.track_heads_for_ref(
+                    cmd.ref, cmd.from_)
+
+    def tag_handler(self, cmd):
+        """Process a TagCommand."""
+        self.cmd_counts[cmd.name] += 1
+
+    def feature_handler(self, cmd):
+        """Process a FeatureCommand."""
+        self.cmd_counts[cmd.name] += 1
+        feature = cmd.feature_name
+        if feature not in commands.FEATURE_NAMES:
+            self.warning("feature %s is not supported - parsing may fail"
+                % (feature,))
+
+    def _track_blob(self, mark):
+        """Record one more reference to the blob identified by mark.
+
+        A blob moves from 'new' to 'used' on its first reference; from
+        the second reference on it is counted in blob_ref_counts instead.
+        Marks found in none of those places are collected in 'unknown'.
+        """
+        if mark in self.blob_ref_counts:
+            self.blob_ref_counts[mark] += 1
+        elif mark in self.blobs['used']:
+            self.blob_ref_counts[mark] = 2
+            self.blobs['used'].remove(mark)
+        elif mark in self.blobs['new']:
+            self.blobs['used'].add(mark)
+            self.blobs['new'].remove(mark)
+        else:
+            self.blobs['unknown'].add(mark)
+
+
+def _found(b):
+    """Format a found boolean as a string."""
+    return ['no', 'found'][b]
+
+
+def _iterable_as_config_list(s):
+    """Format an iterable as a sequence of comma-separated strings.
+
+    To match what ConfigObj expects, a single item list has a trailing comma.
+    """
+    items = sorted(s)
+    if len(items) == 1:
+        return "%s," % (items[0],)
+    else:
+        return ", ".join(items)
diff --git a/reftracker.py b/reftracker.py
new file mode 100644
index 0000000..44136c7
--- /dev/null
+++ b/reftracker.py
@@ -0,0 +1,91 @@
+# Copyright (C) 2009 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+
+"""Tracker of refs."""
+
+
+class RefTracker(object):
+    """Track the last commit seen on each ref and the current heads.
+
+    :ivar last_ref: the ref most recently passed to track_heads_for_ref
+    :ivar last_ids: a map of ref to the last commit id recorded for it
+    :ivar heads: a map of head commit id to the set of refs recorded on it
+    """
+
+    def __init__(self):
+        # Head tracking: last ref, last id per ref & map of commit ids to ref*s*
+        self.last_ref = None
+        self.last_ids = {}
+        self.heads = {}
+
+    def dump_stats(self, note):
+        """Report the size of the tracked maps.
+
+        :param note: a callable taking one string; called once per map
+        """
+        self._show_stats_for(self.last_ids, "last-ids", note=note)
+        self._show_stats_for(self.heads, "heads", note=note)
+
+    def _show_stats_for(self, mapping, label, note):
+        """Report the number of entries in mapping via note."""
+        count = len(mapping)
+        if count == 1:
+            unit = "item"
+        else:
+            unit = "items"
+        note("    %-12s: %d %s" % (label, count, unit))
+
+    def clear(self):
+        """Empty last_ids and heads; last_ref is left as it is."""
+        self.last_ids.clear()
+        self.heads.clear()
+
+    def track_heads(self, cmd):
+        """Track the repository heads given a CommitCommand.
+
+        :param cmd: the CommitCommand
+        :return: the list of parents in terms of commit-ids
+        """
+        # Get the true set of parents
+        if cmd.from_ is not None:
+            parents = [cmd.from_]
+        else:
+            last_id = self.last_ids.get(cmd.ref)
+            if last_id is not None:
+                parents = [last_id]
+            else:
+                parents = []
+        # Only extend when there is something to add (merges may be None)
+        if cmd.merges:
+            parents.extend(cmd.merges)
+
+        # Track the heads
+        self.track_heads_for_ref(cmd.ref, cmd.id, parents)
+        return parents
+
+    def track_heads_for_ref(self, cmd_ref, cmd_id, parents=None):
+        """Record cmd_id as a head of cmd_ref.
+
+        :param cmd_ref: the ref being updated
+        :param cmd_id: the commit id now recorded for the ref
+        :param parents: commit ids to drop from the heads, if any
+        """
+        if parents is not None:
+            for parent in parents:
+                self.heads.pop(parent, None)
+        self.heads.setdefault(cmd_id, set()).add(cmd_ref)
+        self.last_ids[cmd_ref] = cmd_id
+        self.last_ref = cmd_ref
diff --git a/tests/test_head_tracking.py b/tests/test_head_tracking.py
new file mode 100644
index 0000000..19f6c68
--- /dev/null
+++ b/tests/test_head_tracking.py
@@ -0,0 +1,269 @@
+# Copyright (C) 2009 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""Test tracking of heads"""
+
+from cStringIO import StringIO
+
+from fastimport import (
+    commands,
+    parser,
+    )
+
+import testtools
+
+from bzrlib.plugins.fastimport.reftracker import (
+    RefTracker,
+    )
+
+
+# A sample input stream that only adds files to a branch
+_SAMPLE_MAINLINE = \
+"""blob
+mark :1
+data 9
+Welcome!
+commit refs/heads/master
+mark :100
+committer a 1234798653 +0000
+data 4
+test
+M 644 :1 doc/README.txt
+blob
+mark :2
+data 17
+Life
+is
+good ...
+commit refs/heads/master
+mark :101
+committer a 1234798653 +0000
+data 8
+test
+ing
+from :100
+M 644 :2 NEWS
+blob
+mark :3
+data 19
+Welcome!
+my friend
+blob
+mark :4
+data 11
+== Docs ==
+commit refs/heads/master
+mark :102
+committer d 1234798653 +0000
+data 8
+test
+ing
+from :101
+M 644 :3 doc/README.txt
+M 644 :4 doc/index.txt
+"""
+
+# A sample input stream that adds files to two branches
+_SAMPLE_TWO_HEADS = \
+"""blob
+mark :1
+data 9
+Welcome!
+commit refs/heads/master
+mark :100
+committer a 1234798653 +0000
+data 4
+test
+M 644 :1 doc/README.txt
+blob
+mark :2
+data 17
+Life
+is
+good ...
+commit refs/heads/mybranch
+mark :101
+committer a 1234798653 +0000
+data 8
+test
+ing
+from :100
+M 644 :2 NEWS
+blob
+mark :3
+data 19
+Welcome!
+my friend
+blob
+mark :4
+data 11
+== Docs ==
+commit refs/heads/master
+mark :102
+committer d 1234798653 +0000
+data 8
+test
+ing
+from :100
+M 644 :3 doc/README.txt
+M 644 :4 doc/index.txt
+"""
+
+# A sample input stream that adds files to two branches
+_SAMPLE_TWO_BRANCHES_MERGED = \
+"""blob
+mark :1
+data 9
+Welcome!
+commit refs/heads/master
+mark :100
+committer a 1234798653 +0000
+data 4
+test
+M 644 :1 doc/README.txt
+blob
+mark :2
+data 17
+Life
+is
+good ...
+commit refs/heads/mybranch
+mark :101
+committer a 1234798653 +0000
+data 8
+test
+ing
+from :100
+M 644 :2 NEWS
+blob
+mark :3
+data 19
+Welcome!
+my friend
+blob
+mark :4
+data 11
+== Docs ==
+commit refs/heads/master
+mark :102
+committer d 1234798653 +0000
+data 8
+test
+ing
+from :100
+M 644 :3 doc/README.txt
+M 644 :4 doc/index.txt
+commit refs/heads/master
+mark :103
+committer d 1234798653 +0000
+data 8
+test
+ing
+from :102
+merge :101
+D doc/index.txt
+"""
+
+# A sample input stream that contains a reset
+_SAMPLE_RESET = \
+"""blob
+mark :1
+data 9
+Welcome!
+commit refs/heads/master
+mark :100
+committer a 1234798653 +0000
+data 4
+test
+M 644 :1 doc/README.txt
+reset refs/remotes/origin/master
+from :100
+"""
+
+# A sample input stream that contains a reset and more commits
+_SAMPLE_RESET_WITH_MORE_COMMITS = \
+"""blob
+mark :1
+data 9
+Welcome!
+commit refs/heads/master
+mark :100
+committer a 1234798653 +0000
+data 4
+test
+M 644 :1 doc/README.txt
+reset refs/remotes/origin/master
+from :100
+commit refs/remotes/origin/master
+mark :101
+committer d 1234798653 +0000
+data 8
+test
+ing
+from :100
+D doc/README.txt
+"""
+
+class TestHeadTracking(testtools.TestCase):
+
+    def assertHeads(self, input, expected):
+        s = StringIO(input)
+        p = parser.ImportParser(s)
+        reftracker = RefTracker()
+        for cmd in p.iter_commands():
+            if isinstance(cmd, commands.CommitCommand):
+                reftracker.track_heads(cmd)
+                # eat the file commands
+                list(cmd.iter_files())
+            elif isinstance(cmd, commands.ResetCommand):
+                if cmd.from_ is not None:
+                    reftracker.track_heads_for_ref(cmd.ref, cmd.from_)
+        self.assertEqual(reftracker.heads, expected)
+
+    def test_mainline(self):
+        self.assertHeads(_SAMPLE_MAINLINE, {
+            ':102': set(['refs/heads/master']),
+            })
+
+    def test_two_heads(self):
+        self.assertHeads(_SAMPLE_TWO_HEADS, {
+            ':101': set(['refs/heads/mybranch']),
+            ':102': set(['refs/heads/master']),
+            })
+
+    def test_two_branches_merged(self):
+        self.assertHeads(_SAMPLE_TWO_BRANCHES_MERGED, {
+            ':103': set(['refs/heads/master']),
+            })
+
+    def test_reset(self):
+        self.assertHeads(_SAMPLE_RESET, {
+            ':100': set(['refs/heads/master', 'refs/remotes/origin/master']),
+            })
+
+    def test_reset_with_more_commits(self):
+        self.assertHeads(_SAMPLE_RESET_WITH_MORE_COMMITS, {
+            ':101': set(['refs/remotes/origin/master']),
+            })
+
+
+class TestRefTrackerStats(testtools.TestCase):
+
+    def test_dump_stats(self):
+        reftracker = RefTracker()
+        reftracker.track_heads_for_ref('refs/heads/master', ':100')
+        notes = []
+        reftracker.dump_stats(notes.append)
+        self.assertEqual(2, len(notes))
-- 
cgit v1.2.1