Add fast-import-{filter,query,info} scripts.

author: Jelmer Vernooĳ <jelmer@jelmer.uk> 2018-03-30 18:24:19 +0100
committer: Jelmer Vernooĳ <jelmer@jelmer.uk> 2018-03-30 18:24:19 +0100
commit: fe5b3d2b2d1f4298342c1b1a1d5631dbbf9270ce (patch)
tree: dba1340fb42fcf39acf3d9e05685ae2d5dd15ee8 /fastimport
parent: b65623b0ddba5a4ec99d5576e4888fcc1200de97 (diff)
download: python-fastimport-git-fe5b3d2b2d1f4298342c1b1a1d5631dbbf9270ce.tar.gz
5 files changed, 501 insertions, 0 deletions
diff --git a/fastimport/helpers.py b/fastimport/helpers.py
index abb7014..67072be 100644
--- a/fastimport/helpers.py
+++ b/fastimport/helpers.py
@@ -194,3 +194,72 @@ class newobject(object):
         Hook for the future.utils.native() function
         """
         return object(self)
+
+
+def binary_stream(stream):
+    """Switch a stream to binary mode when running on Windows.
+
+    :return: the stream that was passed in
+    """
+    try:
+        import os
+        get_fd = getattr(stream, 'fileno', None) if os.name == 'nt' else None
+        if get_fd:
+            fd = get_fd()
+            # A descriptor of -1 means we're working as subprocess.
+            if fd >= 0:
+                import msvcrt
+                msvcrt.setmode(fd, os.O_BINARY)
+    except ImportError:
+        pass
+    return stream
+
+
+def invert_dictset(d):
+    """Map every member of d's value collections to the list of its keys."""
+    # Based on recipe from ASPN
+    inverted = {}
+    for key, members in d.items():
+        for member in members:
+            # Keys are appended in the order in which d yields them.
+            inverted.setdefault(member, []).append(key)
+    return inverted
+
+
+def invert_dict(d):
+    """Map each value of d to the list of keys that carry that value."""
+    # Based on recipe from ASPN
+    inverted = {}
+    for key, value in d.items():
+        # Values must be hashable: they become the keys of the result.
+        inverted.setdefault(value, []).append(key)
+    return inverted
+
+
+def defines_to_dict(defines):
+    """Turn 'name=value' definition strings into a dictionary.
+
+    A definition without '=' maps to 1; None is passed through as None.
+    """
+    if defines is None:
+        return None
+    parsed = {}
+    for define in defines:
+        name, sep, value = define.partition('=')
+        parsed[name.strip()] = value.strip() if sep else 1
+    return parsed
+
+
+def get_source_stream(source):
+    """Open source for binary reading; '-' or None selects standard input."""
+    if source is None or source == '-':
+        import sys
+        # Python 3 exposes the bytes layer of stdin as sys.stdin.buffer.
+        return binary_stream(getattr(sys.stdin, 'buffer', sys.stdin))
+    if source.endswith('.gz'):
+        import gzip
+        return gzip.open(source, "rb")
+    return open(source, "rb")
+
+
+
diff --git a/fastimport/processors/info_processor.py b/fastimport/processors/info_processor.py
new file mode 100644
index 0000000..28c7300
--- /dev/null
+++ b/fastimport/processors/info_processor.py
@@ -0,0 +1,286 @@
+# Copyright (C) 2008 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+"""Import processor that dumps stats about the input (and doesn't import)."""
+
+from __future__ import absolute_import
+
+from .. import (
+    reftracker,
+    )
+from ..helpers import (
+    invert_dict,
+    invert_dictset,
+    )
+from fastimport import (
+    commands,
+    processor,
+    )
+import stat
+
+
+class InfoProcessor(processor.ImportProcessor):
+    """An import processor that dumps statistics about the input.
+
+    No changes to the current repository are made.
+
+    As well as providing useful information about an import
+    stream before importing it, this processor is useful for
+    benchmarking the speed at which data can be extracted from
+    the source.
+    """
+
+    def __init__(self, params=None, verbose=0, outf=None):
+        processor.ImportProcessor.__init__(self, params, verbose,
+            outf=outf)
+
+    def pre_process(self):
+        """Initialise the counters and tracking tables."""
+        self.cmd_counts = dict.fromkeys(commands.COMMAND_NAMES, 0)
+        self.file_cmd_counts = dict.fromkeys(commands.FILE_COMMAND_NAMES, 0)
+        self.parent_counts = {}
+        self.max_parent_count = 0
+        self.committers = set()
+        self.separate_authors_found = False
+        self.symlinks_found = False
+        self.executables_found = False
+        self.sha_blob_references = False
+        self.lightweight_tags = 0
+        # Blob usage tracking
+        self.blobs = {}
+        for usage in ['new', 'used', 'unknown', 'unmarked']:
+            self.blobs[usage] = set()
+        self.blob_ref_counts = {}
+        # Head tracking
+        self.reftracker = reftracker.RefTracker()
+        # Stuff to cache: a map from mark to # of times that mark is merged
+        self.merges = {}
+        # Stuff to cache: these are maps from mark to sets
+        self.rename_old_paths = {}
+        self.copy_source_paths = {}
+
+    def post_process(self):
+        """Write the collected statistics to ``self.outf``."""
+        cmd_names = commands.COMMAND_NAMES
+        fc_names = commands.FILE_COMMAND_NAMES
+        self._dump_stats_group("Command counts",
+            [(c.decode('utf-8'), self.cmd_counts[c]) for c in cmd_names], str)
+        self._dump_stats_group("File command counts",
+            [(c.decode('utf-8'), self.file_cmd_counts[c]) for c in fc_names], str)
+
+        # Commit stats
+        if self.cmd_counts[b'commit']:
+            p_items = []
+            for i in range(self.max_parent_count + 1):
+                if i in self.parent_counts:
+                    count = self.parent_counts[i]
+                    p_items.append(("parents-%d" % i, count))
+            merges_count = len(self.merges)
+            p_items.append(('total revisions merged', merges_count))
+            flags = {
+                'separate authors found': self.separate_authors_found,
+                'executables': self.executables_found,
+                'symlinks': self.symlinks_found,
+                'blobs referenced by SHA': self.sha_blob_references,
+                }
+            self._dump_stats_group("Parent counts", p_items, str)
+            self._dump_stats_group("Commit analysis", sorted(flags.items()), _found)
+            heads = invert_dictset(self.reftracker.heads)
+            # Pass the ids as a list: the verbose formatter needs an iterable
+            # of strings and would split up an already-joined string.
+            self._dump_stats_group(
+                    "Head analysis",
+                    [(k.decode('utf-8'), [m.decode('utf-8') for m in v])
+                        for (k, v) in heads.items()], ', '.join,
+                    _iterable_as_config_list)
+            self._dump_stats_group("Merges", self.merges.items(), None)
+            # We only show the rename old path and copy source paths when -vv
+            # (verbose=2) is specified. The output here for mysql's data can't
+            # be parsed currently so this bit of code needs more work anyhow ..
+            if self.verbose >= 2:
+                self._dump_stats_group("Rename old paths",
+                    self.rename_old_paths.items(), len,
+                    _iterable_as_config_list)
+                self._dump_stats_group("Copy source paths",
+                    self.copy_source_paths.items(), len,
+                    _iterable_as_config_list)
+
+        # Blob stats
+        if self.cmd_counts[b'blob']:
+            # In verbose mode, don't list every blob used. Filter a copy so
+            # that the collected statistics themselves stay intact.
+            usage = [(k, v) for (k, v) in self.blobs.items()
+                     if not (self.verbose and k == 'used')]
+            self._dump_stats_group("Blob usage tracking",
+                usage, len, _iterable_as_config_list)
+        if self.blob_ref_counts:
+            blobs_by_count = invert_dict(self.blob_ref_counts)
+            blob_items = sorted(blobs_by_count.items())
+            self._dump_stats_group("Blob reference counts",
+                blob_items, len, _iterable_as_config_list)
+
+        # Other stats
+        if self.cmd_counts[b'reset']:
+            reset_stats = {
+                'lightweight tags': self.lightweight_tags,
+                }
+            self._dump_stats_group("Reset analysis", reset_stats.items())
+
+    def _dump_stats_group(self, title, items, normal_formatter=None,
+        verbose_formatter=None):
+        """Dump a statistics group.
+
+        In verbose mode, do so as a config file so
+        that other processors can load the information if they want to.
+        :param title: the heading of the group
+        :param items: an iterable of (name, value) pairs
+        :param normal_formatter: the callable to apply to the value
+          before displaying it in normal mode
+        :param verbose_formatter: the callable to apply to the value
+          before displaying it in verbose mode
+        """
+        if self.verbose:
+            self.outf.write("[%s]\n" % (title,))
+            for name, value in items:
+                if verbose_formatter is not None:
+                    value = verbose_formatter(value)
+                if isinstance(name, str):
+                    name = name.replace(' ', '-')
+                self.outf.write("%s = %s\n" % (name, value))
+            self.outf.write("\n")
+        else:
+            self.outf.write("%s:\n" % (title,))
+            for name, value in items:
+                if normal_formatter is not None:
+                    value = normal_formatter(value)
+                self.outf.write("\t%s\t%s\n" % (value, name))
+
+    def progress_handler(self, cmd):
+        """Process a ProgressCommand."""
+        self.cmd_counts[cmd.name] += 1
+
+    def blob_handler(self, cmd):
+        """Process a BlobCommand."""
+        self.cmd_counts[cmd.name] += 1
+        if cmd.mark is None:
+            self.blobs['unmarked'].add(cmd.id)
+        else:
+            self.blobs['new'].add(cmd.id)
+            # Marks can be re-used so remove it from used if already there.
+            # Note: we definitely do NOT want to remove it from multi if
+            # it's already in that set.
+            try:
+                self.blobs['used'].remove(cmd.id)
+            except KeyError:
+                pass
+
+    def checkpoint_handler(self, cmd):
+        """Process a CheckpointCommand."""
+        self.cmd_counts[cmd.name] += 1
+
+    def commit_handler(self, cmd):
+        """Process a CommitCommand."""
+        self.cmd_counts[cmd.name] += 1
+        self.committers.add(cmd.committer)
+        if cmd.author is not None:
+            self.separate_authors_found = True
+        for fc in cmd.iter_files():
+            self.file_cmd_counts[fc.name] += 1
+            if isinstance(fc, commands.FileModifyCommand):
+                if fc.mode & 0o111:
+                    self.executables_found = True
+                if stat.S_ISLNK(fc.mode):
+                    self.symlinks_found = True
+                if fc.dataref is not None:
+                    # Slice rather than index: indexing bytes yields an int
+                    # on Python 3, which never equals a string.
+                    if fc.dataref[:1] == b':':
+                        self._track_blob(fc.dataref)
+                    else:
+                        self.sha_blob_references = True
+            elif isinstance(fc, commands.FileRenameCommand):
+                self.rename_old_paths.setdefault(cmd.id, set()).add(fc.old_path)
+            elif isinstance(fc, commands.FileCopyCommand):
+                self.copy_source_paths.setdefault(cmd.id, set()).add(fc.src_path)
+
+        # Track the heads
+        parents = self.reftracker.track_heads(cmd)
+
+        # Track the parent counts
+        parent_count = len(parents)
+        try:
+            self.parent_counts[parent_count] += 1
+        except KeyError:
+            self.parent_counts[parent_count] = 1
+            if parent_count > self.max_parent_count:
+                self.max_parent_count = parent_count
+
+        # Remember the merges
+        if cmd.merges:
+            for merge in cmd.merges:
+                if merge in self.merges:
+                    self.merges[merge] += 1
+                else:
+                    self.merges[merge] = 1
+
+    def reset_handler(self, cmd):
+        """Process a ResetCommand."""
+        self.cmd_counts[cmd.name] += 1
+        if cmd.ref.startswith(b'refs/tags/'):
+            self.lightweight_tags += 1
+        else:
+            if cmd.from_ is not None:
+                self.reftracker.track_heads_for_ref(
+                    cmd.ref, cmd.from_)
+
+    def tag_handler(self, cmd):
+        """Process a TagCommand."""
+        self.cmd_counts[cmd.name] += 1
+
+    def feature_handler(self, cmd):
+        """Process a FeatureCommand."""
+        self.cmd_counts[cmd.name] += 1
+        feature = cmd.feature_name
+        if feature not in commands.FEATURE_NAMES:
+            self.warning("feature %s is not supported - parsing may fail"
+                % (feature,))
+
+    def _track_blob(self, mark):
+        """Record one more reference to the blob identified by ``mark``."""
+        if mark in self.blob_ref_counts:
+            self.blob_ref_counts[mark] += 1
+        elif mark in self.blobs['used']:
+            self.blob_ref_counts[mark] = 2
+            self.blobs['used'].remove(mark)
+        elif mark in self.blobs['new']:
+            self.blobs['used'].add(mark)
+            self.blobs['new'].remove(mark)
+        else:
+            self.blobs['unknown'].add(mark)
+
+def _found(b):
+    """Return 'found' when b is True and 'no' when it is False."""
+    return ('no', 'found')[b]
+
+def _iterable_as_config_list(s):
+    """Format an iterable as a sequence of comma-separated strings.
+
+    To match what ConfigObj expects, a single item list has a trailing comma.
+    Bytes items are decoded as UTF-8 first, because str.join() rejects
+    bytes on Python 3.
+    """
+    items = [i.decode('utf-8') if isinstance(i, bytes) else i
+             for i in sorted(s)]
+    return "%s," % (items[0],) if len(items) == 1 else ", ".join(items)
diff --git a/fastimport/reftracker.py b/fastimport/reftracker.py
new file mode 100644
index 0000000..16a5e45
--- /dev/null
+++ b/fastimport/reftracker.py
@@ -0,0 +1,68 @@
+# Copyright (C) 2009 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+"""Tracker of refs."""
+
+from __future__ import absolute_import
+
+
+class RefTracker(object):
+    """Track the last commit id of each ref and the current heads."""
+
+    def __init__(self):
+        # Head tracking: last ref, last id per ref & map of commit ids to ref*s*
+        self.last_ref = None
+        self.last_ids = {}
+        self.heads = {}
+
+    def dump_stats(self, note):
+        """Report the size of each tracking table through ``note``."""
+        self._show_stats_for(self.last_ids, "last-ids", note=note)
+        self._show_stats_for(self.heads, "heads", note=note)
+
+    def _show_stats_for(self, table, label, note):
+        """Pass a one-line entry count for ``table`` to ``note``."""
+        note("    %-12s: %d item(s)" % (label, len(table)))
+
+    def clear(self):
+        """Empty the last-id and head tables."""
+        self.last_ids.clear()
+        self.heads.clear()
+
+    def track_heads(self, cmd):
+        """Track the repository heads given a CommitCommand.
+
+        :param cmd: the CommitCommand
+        :return: the list of parents in terms of commit-ids
+        """
+        # An explicit 'from' wins; otherwise continue from the ref's last id.
+        first = cmd.from_
+        if first is None:
+            first = self.last_ids.get(cmd.ref)
+        parents = [] if first is None else [first]
+        parents.extend(cmd.merges or ())
+        self.track_heads_for_ref(cmd.ref, cmd.id, parents)
+        return parents
+
+    def track_heads_for_ref(self, cmd_ref, cmd_id, parents=None):
+        """Record cmd_id as the head of cmd_ref; parents stop being heads."""
+        for parent in parents or ():
+            self.heads.pop(parent, None)
+        self.heads.setdefault(cmd_id, set()).add(cmd_ref)
+        self.last_ids[cmd_ref] = cmd_id
+        self.last_ref = cmd_ref
+
+
diff --git a/fastimport/tests/__init__.py b/fastimport/tests/__init__.py
index ae5acb7..01a681b 100644
--- a/fastimport/tests/__init__.py
+++ b/fastimport/tests/__init__.py
@@ -26,6 +26,7 @@ def test_suite():
         'test_dates',
         'test_errors',
         'test_filter_processor',
+        'test_info_processor',
         'test_helpers',
         'test_parser',
         ]
diff --git a/fastimport/tests/test_info_processor.py b/fastimport/tests/test_info_processor.py
new file mode 100644
index 0000000..43dd50b
--- /dev/null
+++ b/fastimport/tests/test_info_processor.py
@@ -0,0 +1,77 @@
+# Copyright (C) 2018 Jelmer Vernooij
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+"""Test InfoProcessor"""
+from io import BytesIO
+
+try:
+    from StringIO import StringIO
+except ImportError:
+    from io import StringIO
+
+from unittest import TestCase
+
+from fastimport import (
+    parser,
+    )
+
+from fastimport.processors import (
+    info_processor,
+    )
+
+# Fixture: a stream holding one commit (mark :1) on refs/heads/master.
+simple_fast_import_stream = b"""commit refs/heads/master
+mark :1
+committer Jelmer Vernooij <jelmer@samba.org> 1299718135 +0100
+data 7
+initial
+
+"""
+
+class TestFastImportInfo(TestCase):
+    """Check the report that InfoProcessor writes."""
+
+    def test_simple(self):
+        outf = StringIO()
+        proc = info_processor.InfoProcessor(outf=outf)
+        source = parser.ImportParser(BytesIO(simple_fast_import_stream))
+        proc.process(source.iter_commands)
+        self.maxDiff = None
+        self.assertMultiLineEqual(outf.getvalue(), """Command counts:
+\t0\tblob
+\t0\tcheckpoint
+\t1\tcommit
+\t0\tfeature
+\t0\tprogress
+\t0\treset
+\t0\ttag
+File command counts:
+\t0\tfilemodify
+\t0\tfiledelete
+\t0\tfilecopy
+\t0\tfilerename
+\t0\tfiledeleteall
+Parent counts:
+\t1\tparents-0
+\t0\ttotal revisions merged
+Commit analysis:
+\tno\tblobs referenced by SHA
+\tno\texecutables
+\tno\tseparate authors found
+\tno\tsymlinks
+Head analysis:
+\t:1\trefs/heads/master
+Merges:
+""")
author	Jelmer Vernooĳ <jelmer@jelmer.uk>	2018-03-30 18:24:19 +0100
committer	Jelmer Vernooĳ <jelmer@jelmer.uk>	2018-03-30 18:24:19 +0100
commit	fe5b3d2b2d1f4298342c1b1a1d5631dbbf9270ce (patch)
tree	dba1340fb42fcf39acf3d9e05685ae2d5dd15ee8 /fastimport
parent	b65623b0ddba5a4ec99d5576e4888fcc1200de97 (diff)
download	python-fastimport-git-fe5b3d2b2d1f4298342c1b1a1d5631dbbf9270ce.tar.gz