#!/usr/bin/env python """ darcs-fast-export - darcs backend for fast data importers Copyright (c) 2008, 2009 Miklos Vajna Copyright (c) 2008 Matthias Andree This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. """ import xml.dom.minidom import xml.parsers.expat import os import sys import gzip import time import calendar import shutil import subprocess import optparse import re import urllib import urllib2 import StringIO sys = reload(sys) sys.setdefaultencoding("utf-8") class Handler: def __init__(self): self.hashes = [] self.authormap = {} self.export_marks = [] self.import_marks = [] def get_patchname(self, patch): ret = [] s = "" if patch.attributes['inverted'].value == 'True': s = "UNDO: " cs = patch.getElementsByTagName("name")[0].childNodes if cs.length > 0: ret.append(s + cs[0].data) lines = patch.getElementsByTagName("comment") if lines: for i in lines[0].childNodes[0].data.split('\n'): if not i.startswith("Ignore-this: "): ret.append(i) return "\n".join(ret).encode('utf-8') def get_author(self, patch): """darcs allows any freeform string, but fast-import has a more strict format, so fix up broken author names here.""" author = patch.attributes['author'].value if author in self.authormap: author = self.authormap[author] if not len(author): author = "darcs-fast-export " # add missing name elif not ">" in author: author = "%s <%s>" % (author.split('@')[0], author) # avoid double quoting elif author[0] == '"' and author[-1] == '"': author = author[1:-1] # name after email elif author[-1] != '>': author = author[author.index('>')+2:] + ' ' + author[:author.index('>')+1] return author.encode('utf-8') def get_date(self, patch): try: date = time.strptime(patch, "%Y%m%d%H%M%S") except ValueError: date = time.strptime(patch[:19] + patch[-5:], '%a %b %d %H:%M:%S %Y') return calendar.timegm(date) def progress(self, s): print "progress [%s] %s" % (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), s) sys.stdout.flush() def log(self, s): self.logsock.write("[%s] %s" % (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), s)) self.logsock.flush() def parse_inventory(self, sock=None): prev = None nextprev = False buf = [] if not sock: sock = self.open(os.path.join(self.origin, "_darcs", "hashed_inventory")) for i in sock.readlines(): if i.startswith("hash"): buf.insert(0, i[6:-1]) if i.startswith("Starting with inventory:"): nextprev = True elif nextprev: prev = i[:-1] nextprev = False sock.close() for i in buf: self.hashes.insert(0, i) if prev: sock = self.gzip_open(os.path.join(self.origin, "_darcs", "inventories", prev)) self.parse_inventory(sock) # this is like gzip.open but supports urls as well def gzip_open(self, path): if os.path.exists(path): return gzip.open(path) buf = urllib.urlopen(path).read() sock = StringIO.StringIO(buf) return gzip.GzipFile(fileobj=sock) # this is like os.path.exists but supports urls as well def path_exists(self, path): if os.path.exists(path): return True else: try: urllib2.urlopen(urllib2.Request(path)) return True except urllib2.HTTPError, e: return False # this is like open, but supports urls as well def open(self, path): if os.path.exists(path): return open(path) else: return urllib.urlopen(path) def handle_opts(self): # Option Parser usage="%prog [options] darcsrepo" opp = optparse.OptionParser(usage=usage) opp.add_option("--import-marks", metavar="IFILE", help="read state for incremental imports from IFILE") opp.add_option("--export-marks", metavar="OFILE", help="write state for incremental imports from OFILE") opp.add_option("--encoding", help="encoding of log [default: %default], if unspecified and input isn't utf-8, guess") opp.add_option("--authors-file", metavar="F", help="read author transformations in old=new format from F") opp.add_option("--working", metavar="W", help="working directory which is removed at the end of non-incremental conversions") opp.add_option("--logfile", metavar="L", help="log file which contains the output of external programs invoked during the conversion") opp.add_option("--git-branch", metavar="B", help="git branch [default: refs/heads/master]") opp.add_option("--progress", metavar="P", help="insert progress statements after every n commit [default: 100]") (self.options, self.args) = opp.parse_args() if len(self.args) < 1: opp.error("darcsrepo required") # read author mapping file in gitauthors format, # i. e. in=out (one per # line) if self.options.authors_file: sock = open(self.options.authors_file) self.authormap = dict([i.strip().split('=',1) for i in sock]) sock.close() if "://" not in self.args[0]: self.origin = os.path.abspath(self.args[0]) else: self.origin = self.args[0].strip('/') if self.options.working: self.working = os.path.abspath(self.options.working) else: if "://" not in self.origin: self.working = "%s.darcs" % self.origin else: self.working = "%s.darcs" % os.path.split(self.origin)[-1] if self.options.logfile: logfile = os.path.abspath(self.options.logfile) else: if "://" not in self.origin: logfile = "%s.log" % self.origin else: logfile = "%s.log" % os.path.split(self.origin)[-1] self.logsock = open(logfile, "a") if self.options.git_branch: self.git_branch = self.options.git_branch else: self.git_branch = "refs/heads/master" if self.options.progress: self.prognum = int(self.options.progress) else: self.prognum = 100 def handle_import_marks(self): if self.options.import_marks: sock = open(self.options.import_marks) for i in sock.readlines(): line = i.strip() if not len(line): continue self.import_marks.append(line.split(' ')[1]) self.export_marks.append(line) sock.close() def get_patches(self): self.progress("getting list of patches") if not len(self.import_marks): sock = os.popen("darcs changes --xml --reverse --repo %s" % self.origin) else: sock = os.popen("darcs changes --xml --reverse --repo %s --from-match 'hash %s'" % (self.origin, self.import_marks[-1])) buf = sock.read() sock.close() # this is hackish. we need to escape some bad chars, otherwise the xml # will not be valid buf = buf.replace('\x1b', '^[') if self.options.encoding: xmldoc = xml.dom.minidom.parseString(unicode(buf, self.options.encoding).encode('utf-8')) else: try: xmldoc = xml.dom.minidom.parseString(buf) except xml.parsers.expat.ExpatError: try: import chardet except ImportError: sys.exit("Error, encoding is not utf-8. Please " + "either specify it with the --encoding " + "option or install chardet.") self.progress("encoding is not utf8, guessing charset") encoding = chardet.detect(buf)['encoding'] self.progress("detected encoding is %s" % encoding) xmldoc = xml.dom.minidom.parseString(unicode(buf, encoding).encode('utf-8')) sys.stdout.flush() return xmldoc.getElementsByTagName('patch') def setup_workdir(self): darcs2 = False self.oldfashionedpatch = True self.cwd = os.getcwd() if self.path_exists(os.path.join(self.origin, "_darcs", "format")): sock = self.open(os.path.join(self.origin, "_darcs", "format")) format = [x.strip() for x in sock] sock.close() darcs2 = 'darcs-2' in format self.oldfashionedpatch = not 'hashed' in format if not self.oldfashionedpatch: self.progress("parsing the inventory") if "://" not in self.origin: os.chdir(self.origin) self.parse_inventory() if not self.options.import_marks or not os.path.exists(self.working): # init the tmp darcs repo os.mkdir(self.working) os.chdir(self.working) if darcs2: os.system("darcs init --darcs-2") else: os.system("darcs init --old-fashioned-inventory") else: os.chdir(self.working) if self.options.import_marks: sock = os.popen("darcs pull -a --match 'hash %s' %s" % (self.import_marks[-1], self.origin)) self.log("Building/updating working directory:\n%s" % sock.read()) sock.close() def export_patches(self): patches = self.get_patches() # this is the number of the NEXT patch count = 1 if len(self.import_marks): patches = patches[1:] count = len(self.import_marks) + 1 if len(self.export_marks): # this is the mark number of the NEXT patch markcount = int(self.export_marks[-1].split(' ')[0][1:]) + 1 else: markcount = count # this may be huge and we need it many times patchnum = len(patches) if not len(self.import_marks): self.progress("starting export, repo has %d patches" % patchnum) else: self.progress("continuing export, %d patches to convert" % patchnum) paths = [] for i in patches: # apply the patch hash = i.attributes['hash'].value buf = ["\nNew patches:\n"] if self.oldfashionedpatch: sock = self.gzip_open(os.path.join(self.origin, "_darcs", "patches", hash)) else: sock = self.gzip_open(os.path.join(self.origin, "_darcs", "patches", self.hashes[count-1])) buf.append(sock.read()) sock.close() sock = os.popen("darcs changes --context") buf.append(sock.read()) sock.close() sock = subprocess.Popen(["darcs", "apply", "--allow-conflicts"], stdin=subprocess.PIPE, stdout=subprocess.PIPE) sock.stdin.write("".join(buf)) sock.stdin.close() self.log("Applying %s:\n%s" % (hash, sock.stdout.read())) sock.stdout.close() message = self.get_patchname(i) # export the commit print "commit %s" % self.git_branch print "mark :%s" % markcount if self.options.export_marks: self.export_marks.append(":%s %s" % (markcount, hash)) date = self.get_date(i.attributes['date'].value) print "committer %s %s +0000" % (self.get_author(i), date) print "data %d\n%s" % (len(message), message) if markcount > 1: print "from :%s" % (markcount-1) # export the files for j in paths: print "D %s" % j paths = [] for (root, dirs, files) in os.walk ("."): for f in files: j = os.path.normpath(os.path.join(root, f)) if j.startswith("_darcs") or "-darcs-backup" in j: continue paths.append(j) sock = open(j) buf = sock.read() sock.close() # darcs does not track the executable bit :/ print "M 644 inline %s" % j print "data %s\n%s" % (len(buf), buf) if message[:4] == "TAG ": tag = re.sub('[^\xe9-\xf8\w.\-]+', '_', message[4:].strip().split('\n')[0]).strip('_') print "tag %s" % tag print "from :%s" % markcount print "tagger %s %s +0000" % (self.get_author(i), date) print "data %d\n%s" % (len(message), message) if count % self.prognum == 0: self.progress("%d/%d patches" % (count, patchnum)) count += 1 markcount += 1 os.chdir(self.cwd) if not self.options.export_marks: shutil.rmtree(self.working) self.logsock.close() def handle_export_marks(self): if self.options.export_marks: self.progress("writing export marks") sock = open(self.options.export_marks, 'w') sock.write("\n".join(self.export_marks)) sock.write("\n") sock.close() self.progress("finished") def handle(self): self.handle_opts() self.handle_import_marks() self.setup_workdir() self.export_patches() self.handle_export_marks() if __name__ == "__main__": h = Handler() h.handle()