# -*- coding: utf-8 -*- # Copyright (C) 2008 Canonical Ltd # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . # # Based on bzr-fast-export # Copyright (c) 2008 Adeodato Simó # # Permission is hereby granted, free of charge, to any person obtaining # a copy of this software and associated documentation files (the # "Software"), to deal in the Software without restriction, including # without limitation the rights to use, copy, modify, merge, publish, # distribute, sublicense, and/or sell copies of the Software, and to # permit persons to whom the Software is furnished to do so, subject to # the following conditions: # # The above copyright notice and this permission notice shall be included # in all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY # CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. # # vim: fileencoding=utf-8 """Core engine for the fast-export command.""" # TODO: if a new_git_branch below gets merged repeatedly, the tip of the branch # is not updated (because the parent of commit is already merged, so we don't # set new_git_branch to the previously used name) from email.Utils import parseaddr import sys, time, re import bzrlib.branch import bzrlib.revision from bzrlib import ( builtins, errors as bazErrors, osutils, progress, trace, ) from bzrlib.plugins.fastimport import ( helpers, marks_file, ) from fastimport import commands from bzrlib.plugins.fastimport.helpers import ( binary_stream, single_plural, ) def _get_output_stream(destination): if destination is None or destination == '-': return binary_stream(sys.stdout) elif destination.endswith('gz'): import gzip return gzip.open(destination, 'wb') else: return open(destination, 'wb') # from dulwich.repo: def check_ref_format(refname): """Check if a refname is correctly formatted. Implements all the same rules as git-check-ref-format[1]. [1] http://www.kernel.org/pub/software/scm/git/docs/git-check-ref-format.html :param refname: The refname to check :return: True if refname is valid, False otherwise """ # These could be combined into one big expression, but are listed separately # to parallel [1]. if '/.' in refname or refname.startswith('.'): return False if '/' not in refname: return False if '..' in refname: return False for c in refname: if ord(c) < 040 or c in '\177 ~^:?*[': return False if refname[-1] in '/.': return False if refname.endswith('.lock'): return False if '@{' in refname: return False if '\\' in refname: return False return True def sanitize_ref_name_for_git(refname): """Rewrite refname so that it will be accepted by git-fast-import. For the detailed rules see check_ref_format. By rewriting the refname we are breaking uniqueness guarantees provided by bzr so we have to manually verify that resulting ref names are unique. :param refname: refname to rewrite :return: new refname """ new_refname = re.sub( # '/.' in refname or startswith '.' r"/\.|^\." # '..' in refname r"|\.\." # ord(c) < 040 r"|[" + "".join([chr(x) for x in range(040)]) + r"]" # c in '\177 ~^:?*[' r"|[\177 ~^:?*[]" # last char in "/." r"|[/.]$" # endswith '.lock' r"|.lock$" # "@{" in refname r"|@{" # "\\" in refname r"|\\", "_", refname) return new_refname class BzrFastExporter(object): def __init__(self, source, outf, ref=None, checkpoint=-1, import_marks_file=None, export_marks_file=None, revision=None, verbose=False, plain_format=False, rewrite_tags=False, no_tags=False, baseline=False): """Export branch data in fast import format. :param plain_format: if True, 'classic' fast-import format is used without any extended features; if False, the generated data is richer and includes information like multiple authors, revision properties, etc. :param rewrite_tags: if True and if plain_format is set, tag names will be rewritten to be git-compatible. Otherwise tags which aren't valid for git will be skipped if plain_format is set. :param no_tags: if True tags won't be exported at all """ self.branch = source self.outf = outf self.ref = ref self.checkpoint = checkpoint self.import_marks_file = import_marks_file self.export_marks_file = export_marks_file self.revision = revision self.excluded_revisions = set() self.plain_format = plain_format self.rewrite_tags = rewrite_tags self.no_tags = no_tags self.baseline = baseline self._multi_author_api_available = hasattr(bzrlib.revision.Revision, 'get_apparent_authors') self.properties_to_exclude = ['authors', 'author'] # Progress reporting stuff self.verbose = verbose if verbose: self.progress_every = 100 else: self.progress_every = 1000 self._start_time = time.time() self._commit_total = 0 # Load the marks and initialise things accordingly self.revid_to_mark = {} self.branch_names = {} if self.import_marks_file: marks_info = marks_file.import_marks(self.import_marks_file) if marks_info is not None: self.revid_to_mark = dict((r, m) for m, r in marks_info.items()) # These are no longer included in the marks file #self.branch_names = marks_info[1] def interesting_history(self): if self.revision: rev1, rev2 = builtins._get_revision_range(self.revision, self.branch, "fast-export") start_rev_id = rev1.rev_id end_rev_id = rev2.rev_id else: start_rev_id = None end_rev_id = None self.note("Calculating the revisions to include ...") view_revisions = [rev_id for rev_id, _, _, _ in self.branch.iter_merge_sorted_revisions(end_rev_id, start_rev_id)] view_revisions.reverse() # If a starting point was given, we need to later check that we don't # start emitting revisions from before that point. Collect the # revisions to exclude now ... if start_rev_id is not None: self.note("Calculating the revisions to exclude ...") self.excluded_revisions = set([rev_id for rev_id, _, _, _ in self.branch.iter_merge_sorted_revisions(start_rev_id)]) if self.baseline: # needed so the first relative commit knows its parent self.excluded_revisions.remove(start_rev_id) view_revisions.insert(0, start_rev_id) return list(view_revisions) def run(self): # Export the data self.branch.repository.lock_read() try: interesting = self.interesting_history() self._commit_total = len(interesting) self.note("Starting export of %d revisions ..." % self._commit_total) if not self.plain_format: self.emit_features() if self.baseline: self.emit_baseline(interesting.pop(0), self.ref) for revid in interesting: self.emit_commit(revid, self.ref) if self.branch.supports_tags() and not self.no_tags: self.emit_tags() finally: self.branch.repository.unlock() # Save the marks if requested self._save_marks() self.dump_stats() def note(self, msg, *args): """Output a note but timestamp it.""" msg = "%s %s" % (self._time_of_day(), msg) trace.note(msg, *args) def warning(self, msg, *args): """Output a warning but timestamp it.""" msg = "%s WARNING: %s" % (self._time_of_day(), msg) trace.warning(msg, *args) def _time_of_day(self): """Time of day as a string.""" # Note: this is a separate method so tests can patch in a fixed value return time.strftime("%H:%M:%S") def report_progress(self, commit_count, details=''): if commit_count and commit_count % self.progress_every == 0: if self._commit_total: counts = "%d/%d" % (commit_count, self._commit_total) else: counts = "%d" % (commit_count,) minutes = (time.time() - self._start_time) / 60 rate = commit_count * 1.0 / minutes if rate > 10: rate_str = "at %.0f/minute " % rate else: rate_str = "at %.1f/minute " % rate self.note("%s commits exported %s%s" % (counts, rate_str, details)) def dump_stats(self): time_required = progress.str_tdelta(time.time() - self._start_time) rc = len(self.revid_to_mark) self.note("Exported %d %s in %s", rc, single_plural(rc, "revision", "revisions"), time_required) def print_cmd(self, cmd): self.outf.write("%r\n" % cmd) def _save_marks(self): if self.export_marks_file: revision_ids = dict((m, r) for r, m in self.revid_to_mark.items()) marks_file.export_marks(self.export_marks_file, revision_ids) def is_empty_dir(self, tree, path): path_id = tree.path2id(path) if path_id is None: self.warning("Skipping empty_dir detection - no file_id for %s" % (path,)) return False # Continue if path is not a directory if tree.kind(path_id) != 'directory': return False # Use treewalk to find the contents of our directory contents = list(tree.walkdirs(prefix=path))[0] if len(contents[1]) == 0: return True else: return False def emit_features(self): for feature in sorted(commands.FEATURE_NAMES): self.print_cmd(commands.FeatureCommand(feature)) def emit_baseline(self, revid, ref): # Emit a full source tree of the first commit's parent revobj = self.branch.repository.get_revision(revid) mark = 1 self.revid_to_mark[revid] = mark file_cmds = self._get_filecommands(bzrlib.revision.NULL_REVISION, revid) self.print_cmd(self._get_commit_command(ref, mark, revobj, file_cmds)) def emit_commit(self, revid, ref): if revid in self.revid_to_mark or revid in self.excluded_revisions: return # Get the Revision object try: revobj = self.branch.repository.get_revision(revid) except bazErrors.NoSuchRevision: # This is a ghost revision. Mark it as not found and next! self.revid_to_mark[revid] = -1 return # Get the primary parent # TODO: Consider the excluded revisions when deciding the parents. # Currently, a commit with parents that are excluded ought to be # triggering the ref calculation below (and it is not). # IGC 20090824 ncommits = len(self.revid_to_mark) nparents = len(revobj.parent_ids) if nparents == 0: if ncommits: # This is a parentless commit but it's not the first one # output. We need to create a new temporary branch for it # otherwise git-fast-import will assume the previous commit # was this one's parent ref = self._next_tmp_ref() parent = bzrlib.revision.NULL_REVISION else: parent = revobj.parent_ids[0] # Print the commit mark = ncommits + 1 self.revid_to_mark[revid] = mark file_cmds = self._get_filecommands(parent, revid) self.print_cmd(self._get_commit_command(ref, mark, revobj, file_cmds)) # Report progress and checkpoint if it's time for that self.report_progress(ncommits) if (self.checkpoint > 0 and ncommits and ncommits % self.checkpoint == 0): self.note("Exported %i commits - adding checkpoint to output" % ncommits) self._save_marks() self.print_cmd(commands.CheckpointCommand()) def _get_name_email(self, user): if user.find('<') == -1: # If the email isn't inside <>, we need to use it as the name # in order for things to round-trip correctly. # (note: parseaddr('a@b.com') => name:'', email: 'a@b.com') name = user email = '' else: name, email = parseaddr(user) return name.encode("utf-8"), email.encode("utf-8") def _get_commit_command(self, git_ref, mark, revobj, file_cmds): # Get the committer and author info committer = revobj.committer name, email = self._get_name_email(committer) committer_info = (name, email, revobj.timestamp, revobj.timezone) if self._multi_author_api_available: more_authors = revobj.get_apparent_authors() author = more_authors.pop(0) else: more_authors = [] author = revobj.get_apparent_author() if not self.plain_format and more_authors: name, email = self._get_name_email(author) author_info = (name, email, revobj.timestamp, revobj.timezone) more_author_info = [] for a in more_authors: name, email = self._get_name_email(a) more_author_info.append( (name, email, revobj.timestamp, revobj.timezone)) elif author != committer: name, email = self._get_name_email(author) author_info = (name, email, revobj.timestamp, revobj.timezone) more_author_info = None else: author_info = None more_author_info = None # Get the parents in terms of marks non_ghost_parents = [] for p in revobj.parent_ids: if p in self.excluded_revisions: continue try: parent_mark = self.revid_to_mark[p] non_ghost_parents.append(":%s" % parent_mark) except KeyError: # ghost - ignore continue if non_ghost_parents: from_ = non_ghost_parents[0] merges = non_ghost_parents[1:] else: from_ = None merges = None # Filter the revision properties. Some metadata (like the # author information) is already exposed in other ways so # don't repeat it here. if self.plain_format: properties = None else: properties = revobj.properties for prop in self.properties_to_exclude: try: del properties[prop] except KeyError: pass # Build and return the result return commands.CommitCommand(git_ref, mark, author_info, committer_info, revobj.message.encode("utf-8"), from_, merges, iter(file_cmds), more_authors=more_author_info, properties=properties) def _get_revision_trees(self, parent, revision_id): try: tree_old = self.branch.repository.revision_tree(parent) except bazErrors.UnexpectedInventoryFormat: self.warning("Parent is malformed - diffing against previous parent") # We can't find the old parent. Let's diff against his parent pp = self.branch.repository.get_revision(parent) tree_old = self.branch.repository.revision_tree(pp.parent_ids[0]) tree_new = None try: tree_new = self.branch.repository.revision_tree(revision_id) except bazErrors.UnexpectedInventoryFormat: # We can't really do anything anymore self.warning("Revision %s is malformed - skipping" % revision_id) return tree_old, tree_new def _get_filecommands(self, parent, revision_id): """Get the list of FileCommands for the changes between two revisions.""" tree_old, tree_new = self._get_revision_trees(parent, revision_id) if not(tree_old and tree_new): # Something is wrong with this revision - ignore the filecommands return [] changes = tree_new.changes_from(tree_old) # Make "modified" have 3-tuples, as added does my_modified = [ x[0:3] for x in changes.modified ] # The potential interaction between renames and deletes is messy. # Handle it here ... file_cmds, rd_modifies, renamed = self._process_renames_and_deletes( changes.renamed, changes.removed, revision_id, tree_old) # Map kind changes to a delete followed by an add for path, id_, kind1, kind2 in changes.kind_changed: path = self._adjust_path_for_renames(path, renamed, revision_id) # IGC: I don't understand why a delete is needed here. # In fact, it seems harmful? If you uncomment this line, # please file a bug explaining why you needed to. #file_cmds.append(commands.FileDeleteCommand(path)) my_modified.append((path, id_, kind2)) # Record modifications for path, id_, kind in changes.added + my_modified + rd_modifies: if kind == 'file': text = tree_new.get_file_text(id_) file_cmds.append(commands.FileModifyCommand(path.encode("utf-8"), helpers.kind_to_mode('file', tree_new.is_executable(id_)), None, text)) elif kind == 'symlink': file_cmds.append(commands.FileModifyCommand(path.encode("utf-8"), helpers.kind_to_mode('symlink', False), None, tree_new.get_symlink_target(id_))) elif kind == 'directory': if not self.plain_format: file_cmds.append(commands.FileModifyCommand(path.encode("utf-8"), helpers.kind_to_mode('directory', False), None, None)) else: self.warning("cannot export '%s' of kind %s yet - ignoring" % (path, kind)) return file_cmds def _process_renames_and_deletes(self, renames, deletes, revision_id, tree_old): file_cmds = [] modifies = [] renamed = [] # See https://bugs.edge.launchpad.net/bzr-fastimport/+bug/268933. # In a nutshell, there are several nasty cases: # # 1) bzr rm a; bzr mv b a; bzr commit # 2) bzr mv x/y z; bzr rm x; commmit # # The first must come out with the delete first like this: # # D a # R b a # # The second case must come out with the rename first like this: # # R x/y z # D x # # So outputting all deletes first or all renames first won't work. # Instead, we need to make multiple passes over the various lists to # get the ordering right. must_be_renamed = {} old_to_new = {} deleted_paths = set([p for p, _, _ in deletes]) for (oldpath, newpath, id_, kind, text_modified, meta_modified) in renames: emit = kind != 'directory' or not self.plain_format if newpath in deleted_paths: if emit: file_cmds.append(commands.FileDeleteCommand(newpath.encode("utf-8"))) deleted_paths.remove(newpath) if (self.is_empty_dir(tree_old, oldpath)): self.note("Skipping empty dir %s in rev %s" % (oldpath, revision_id)) continue #oldpath = self._adjust_path_for_renames(oldpath, renamed, # revision_id) renamed.append([oldpath, newpath]) old_to_new[oldpath] = newpath if emit: file_cmds.append( commands.FileRenameCommand(oldpath.encode("utf-8"), newpath.encode("utf-8"))) if text_modified or meta_modified: modifies.append((newpath, id_, kind)) # Renaming a directory implies all children must be renamed. # Note: changes_from() doesn't handle this if kind == 'directory' and tree_old.kind(id_) == 'directory': for p, e in tree_old.inventory.iter_entries_by_dir(from_dir=id_): if e.kind == 'directory' and self.plain_format: continue old_child_path = osutils.pathjoin(oldpath, p) new_child_path = osutils.pathjoin(newpath, p) must_be_renamed[old_child_path] = new_child_path # Add children not already renamed if must_be_renamed: renamed_already = set(old_to_new.keys()) still_to_be_renamed = set(must_be_renamed.keys()) - renamed_already for old_child_path in sorted(still_to_be_renamed): new_child_path = must_be_renamed[old_child_path] if self.verbose: self.note("implicitly renaming %s => %s" % (old_child_path, new_child_path)) file_cmds.append(commands.FileRenameCommand(old_child_path.encode("utf-8"), new_child_path.encode("utf-8"))) # Record remaining deletes for path, id_, kind in deletes: if path not in deleted_paths: continue if kind == 'directory' and self.plain_format: continue #path = self._adjust_path_for_renames(path, renamed, revision_id) file_cmds.append(commands.FileDeleteCommand(path.encode("utf-8"))) return file_cmds, modifies, renamed def _adjust_path_for_renames(self, path, renamed, revision_id): # If a previous rename is found, we should adjust the path for old, new in renamed: if path == old: self.note("Changing path %s given rename to %s in revision %s" % (path, new, revision_id)) path = new elif path.startswith(old + '/'): self.note( "Adjusting path %s given rename of %s to %s in revision %s" % (path, old, new, revision_id)) path = path.replace(old + "/", new + "/") return path def emit_tags(self): for tag, revid in self.branch.tags.get_tag_dict().items(): try: mark = self.revid_to_mark[revid] except KeyError: self.warning('not creating tag %r pointing to non-existent ' 'revision %s' % (tag, revid)) else: git_ref = 'refs/tags/%s' % tag.encode("utf-8") if self.plain_format and not check_ref_format(git_ref): if self.rewrite_tags: new_ref = sanitize_ref_name_for_git(git_ref) self.warning('tag %r is exported as %r to be valid in git.', git_ref, new_ref) git_ref = new_ref else: self.warning('not creating tag %r as its name would not be ' 'valid in git.', git_ref) continue self.print_cmd(commands.ResetCommand(git_ref, ":" + str(mark))) def _next_tmp_ref(self): """Return a unique branch name. The name will start with "tmp".""" prefix = 'tmp' if prefix not in self.branch_names: self.branch_names[prefix] = 0 else: self.branch_names[prefix] += 1 prefix = '%s.%d' % (prefix, self.branch_names[prefix]) return 'refs/heads/%s' % prefix