#!/usr/bin/python
# Copyright (C) 2013-2014  Codethink Limited
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.


import cliapp
import json
import logging
import os
import urllib2
import string
import sys
from datetime import datetime
import shutil
import traceback


__version__ = '0.0'

lorry_path = os.path.realpath(__file__)


def quote_url(url):
    ''' Convert URIs to strings that only contain digits, letters, % and _.

    Every character of `url` that is not a digit, a letter, '%' or '_'
    is replaced by '_'.

    NOTE: When changing the code of this function, make sure to also apply
    the same to the quote_url() function of morph. Otherwise the git bundles
    generated by lorry may no longer be found by morph.

    '''
    valid_chars = string.digits + string.letters + '%_'
    transl = lambda x: x if x in valid_chars else '_'
    return ''.join([transl(x) for x in url])


class Lorry(cliapp.Application):

    '''Mirror upstream repositories of various VCS types into git.

    Each command line argument is a JSON spec file mapping a project name
    to a spec dict. The spec's 'type' key selects the importer (bzr, cvs,
    git, hg, svn or tarball). After a successful import the git repository
    is optionally repacked, bundled, tarred and pushed to a mirror server.

    '''

    def add_settings(self):
        '''Register the command line / configuration settings.'''
        self.settings.string(['working-area', 'w'],
                             'use DIR as the working area (for holding '
                             'intermediate git repositories, etc)',
                             metavar='DIR',
                             default='workd')
        self.settings.string(['mirror-base-url-push'],
                             'base URL to use for pushing to the mirror '
                             'server (default: %default)',
                             metavar='URL',
                             default='ssh://gitano@roadtrain.codethink.co.uk'
                                     '/delta')
        self.settings.string(['mirror-base-url-fetch'],
                             'base URL to use for bundle names and for '
                             'pulling from the mirror server (default: '
                             '%default)',
                             metavar='URL',
                             default='git://git.baserock.org/delta')
        self.settings.boolean(['pull-only'],
                              'only pull from upstreams, do not push to '
                              'the mirror server')
        self.settings.boolean(['verbose', 'v'],
                              'report what is going on to stdout')
        self.settings.boolean(['repack'],
                              'repack git repositories when an import has '
                              'been updated (enabled by default)',
                              default=True)
        self.settings.string(['command-stdout'],
                             'write the stdout of commands to this file',
                             metavar='FILE', default=None)
        self.settings.string(['command-stderr'],
                             'write the stderr of commands to this file',
                             metavar='FILE', default=None)
        # The help texts below previously ran "repositories." and "first"
        # together; a separating space has been added.
        self.settings.choice(['bundle'], ['first', 'never', 'always'],
                             'create bundles of git repositories. '
                             'first will only bundle if there is not already '
                             'a bundle in BUNDLES (default: first)')
        self.settings.string(['bundle-dest'],
                             'put created bundles in BUNDLES',
                             metavar='BUNDLES')
        self.settings.choice(['tarball'], ['first', 'never', 'always'],
                             'create tarballs of git repositories. '
                             'first will only tar if there is not already '
                             'a tarball in TARBALLS (default: first)')
        self.settings.string(['tarball-dest'],
                             'put created tarballs in TARBALLS',
                             metavar='TARBALLS')
        self.settings.boolean(['keep-multiple-backups'],
                              'keep multiple (time-stamped) backups '
                              '(disabled by default)',
                              default=False)

    def process_args(self, args):
        '''Mirror every project listed in the given spec files.

        `args` is a list of paths to JSON spec files. A failure to mirror
        one project is reported on stderr and in the log, and processing
        continues with the next project. The process always terminates via
        sys.exit(): status 0 if every project succeeded, 1 otherwise.

        '''
        status = 0

        self.settings['working-area'] = os.path.abspath(
            self.settings['working-area'])

        if not os.path.exists(self.settings['working-area']):
            os.makedirs(self.settings['working-area'])

        for arg in args:
            self.progress('Processing spec file %s' % arg)
            with open(arg) as f:
                specs = json.load(f)
            for name in sorted(specs.keys()):
                self.progress('Getting: %s' % name)
                try:
                    self.gitify(name, specs[name])
                except Exception:
                    # Deliberately best-effort: record the failure and
                    # carry on with the remaining projects.
                    status += 1
                    sys.stderr.write(
                        'Error mirroring:\n%s' % traceback.format_exc())
                    logging.error(traceback.format_exc())

        if status > 0:
            logging.debug('Total Mirrors failed: %d' % status)
            # Collapse the failure count into a plain "failed" exit code.
            status = 1

        self.progress('Done')

        sys.exit(status)

    def bundle(self, name, gitdir):
        '''Create a git bundle of `gitdir` in the 'bundle-dest' directory.

        Nothing is done when the 'bundle' setting is 'never'. With 'first'
        the bundle is only built if the bundle file does not exist yet;
        with 'always' it is rebuilt every time. The file name is derived
        from the fetch URL of the project via quote_url().

        '''
        if self.settings['bundle'] == 'never':
            return

        bundlename = "%s/%s" % (self.settings['mirror-base-url-fetch'], name)
        path = os.path.join(self.settings['bundle-dest'],
                            quote_url(bundlename)) + '.bndl'

        if not os.path.exists(path) or self.settings['bundle'] == 'always':
            self.progress('.. building bundle %s' % bundlename)

            # create the bundle
            self.run_program(['git', 'bundle', 'create', path,
                              '--branches', '--tags'],
                             cwd=gitdir)

            # FIXME this is a hack to avoid unrecognized headers in bundles,
            # which happens with some repositories. See
            #
            #   http://marc.info/?l=git&m=132992959317420&w=2
            #
            # for more information. From the bundle's header section, the
            # expression below will remove all garbage lines that appear
            # between the first line (the bundle format meta comment) and
            # the list of refs.
            #
            # Raw string: the backslashes are meant for sed, not Python.
            expr = r'1,/^[0-9a-f]\{40\}/{ /^[0-9a-f]\{40\}/!{/^[^#]/d}}'
            self.run_program(['sed', '-i', '-e', expr, path], cwd=gitdir)

    def make_tarball(self, name, gitdir):
        '''Create a tarball of the git repository in 'tarball-dest'.

        Nothing is done when the 'tarball' setting is 'never'. With 'first'
        the tarball is only built if it does not exist yet; with 'always'
        it is rebuilt every time. If `gitdir` contains a '.git' directory,
        that directory is what gets archived.

        While tar runs, the repository's 'config' file is temporarily
        replaced by a minimal bare-repository config; the original is
        restored afterwards, even if tar fails.

        '''
        if self.settings['tarball'] == 'never':
            return

        tarballname = "%s/%s" % (self.settings['mirror-base-url-fetch'],
                                 name)
        path = os.path.join(self.settings['tarball-dest'],
                            quote_url(tarballname)) + '.tar'

        if os.path.exists(os.path.join(gitdir, '.git')):
            gitdir = os.path.join(gitdir, '.git')

        if not os.path.exists(path) or self.settings['tarball'] == 'always':
            self.progress('.. building tarball %s' % tarballname)
            args = ['tar', 'cf', path]

            config = os.path.join(gitdir, 'config')
            saved_config = os.path.join(gitdir, 'config.lorrytmp')
            if os.path.exists(config):
                os.rename(config, saved_config)
            try:
                with open(config, 'w') as fh:
                    fh.write('[core]\n'
                             '\trepositoryformatversion = 0\n'
                             '\tfilemode = true\n'
                             '\tbare = true\n')

                # Only archive the entries that actually exist.
                for entry in ['HEAD', 'objects', 'refs', 'info',
                              'packed-refs', 'config', 'branches',
                              'description']:
                    if os.path.exists(os.path.join(gitdir, entry)):
                        args.append(entry)
                self.run_program(args, cwd=gitdir)
            finally:
                # Put the real config back no matter how tar went, so a
                # failed run does not leave the repository misconfigured.
                if os.path.exists(saved_config):
                    os.rename(saved_config, config)

    def gitify(self, name, spec):
        '''Import or update one project and publish the result.

        `name` is the project name and `spec` its spec dict; spec['type']
        selects the importer and an optional spec['refspecs'] overrides
        the refspecs pushed to the mirror server.

        The existing git repository (if any) is backed up first. If the
        import, repack, bundle or tarball step fails, the post-failure
        state is saved, the backup is restored and the error is re-raised.
        On success the repository is pushed (unless 'pull-only' is set)
        and the backup is removed.

        Raises cliapp.AppException for an unknown VCS type.

        '''
        self.progress('Getting %s' % name)
        table = {
            'bzr': self.gitify_bzr,
            'cvs': self.gitify_cvs,
            'git': self.mirror_git,
            'hg': self.gitify_hg,
            'svn': self.gitify_svn,
            'tarball': self.gitify_tarball,
        }
        vcstype = spec['type']
        if vcstype not in table:
            raise cliapp.AppException('Unknown VCS type %s' % vcstype)

        dirname = self.dirname(name)
        if not os.path.exists(dirname):
            os.mkdir(dirname)
        gitdir = os.path.join(dirname, 'git')

        keep_multiple = self.settings['keep-multiple-backups']
        timestamp = datetime.now().strftime('%F-%T')

        pre_update_name = 'git-pre-update'
        pre_update_backup_suffix = (pre_update_name + '-' + timestamp
                                    if keep_multiple else pre_update_name)
        pre_update_backup_dir = os.path.join(dirname,
                                             pre_update_backup_suffix)

        post_fail_name = 'git-post-fail'
        post_fail_backup_suffix = (post_fail_name + '-' + timestamp
                                   if keep_multiple else post_fail_name)
        post_fail_backup_dir = os.path.join(dirname,
                                            post_fail_backup_suffix)

        if not keep_multiple:
            # remove previous backups if they exist
            if os.path.exists(pre_update_backup_dir):
                shutil.rmtree(pre_update_backup_dir)
            if os.path.exists(post_fail_backup_dir):
                shutil.rmtree(post_fail_backup_dir)

        # None when there is no git repository to back up yet.
        backupdir = self.backup_gitdir(name, gitdir, pre_update_backup_dir)

        try:
            # Importers set this to request 'git gc --aggressive'.
            self.needs_aggressive = False
            table[vcstype](name, dirname, gitdir, spec)
            if self.settings['repack']:
                self.progress('.. repacking %s git repository' % name)
                self.run_program(['git', 'config',
                                  'pack.windowMemory', '128M'],
                                 cwd=gitdir)
                args = ['git', 'gc']
                if self.needs_aggressive:
                    args += ['--aggressive']
                self.run_program(args, cwd=gitdir)
            self.bundle(name, gitdir)
            self.make_tarball(name, gitdir)
        except:
            # Bare except kept on purpose: whatever interrupted the
            # update, roll back to the known-good state, then re-raise.
            if backupdir is not None:
                faildir = self.backup_gitdir(name, gitdir,
                                             post_fail_backup_dir)
                self.restore_backup(name, backupdir, gitdir)
                self.output.write('Mirror of %s failed, state before mirror '
                                  'is saved at %s and state after mirror is '
                                  'saved at %s\n'
                                  % (name, backupdir, faildir))
                logging.debug('Mirror of %s failed, state before mirror '
                              'is saved at %s and state after mirror is '
                              'saved at %s\n', name, backupdir, faildir)
            raise

        if not self.settings['pull-only']:
            if 'refspecs' in spec:
                self.push_to_mirror_server(name, gitdir, spec['refspecs'])
            else:
                self.push_to_mirror_server(name, gitdir)

        if backupdir is not None:
            self.progress('.. removing %s git repository backup' % name)
            shutil.rmtree(backupdir)

    def restore_backup(self, name, backupdir, gitdir):
        '''Replace the git repository in `gitdir` by the copy in `backupdir`.

        If `gitdir` has a '.git' subdirectory that is what gets replaced,
        otherwise `gitdir` itself.

        '''
        self.progress('.. restoring %s good git repository' % name)
        dotgit = os.path.join(gitdir, '.git')
        if not os.path.exists(dotgit):
            dotgit = gitdir
        shutil.rmtree(dotgit)
        self.copy_gitdir(backupdir, dotgit)

    def backup_gitdir(self, name, gitdir, backupdir):
        '''Copy the git repository in `gitdir` to `backupdir`.

        Returns `backupdir`, or None if there is no repository to copy.

        '''
        dotgit = os.path.join(gitdir, '.git')
        if not os.path.exists(dotgit):
            dotgit = gitdir
        self.progress('.. backing up %s git repository to %s' %
                      (name, backupdir))
        return self.copy_gitdir(dotgit, backupdir)

    def copy_gitdir(self, source, dest):
        '''Copy git directory `source` to `dest`, hardlinking the objects.

        Everything except the top-level 'objects' directory is copied
        with shutil.copytree(); the files below 'objects' are hardlinked
        instead of copied. Returns `dest`, or None if `source` does not
        exist.

        '''
        if not os.path.exists(source):
            return None

        # copy everything except the objects dir
        def ignoreobjects(dirname, filenames):
            # Only skip 'objects' directly below the top-level directory.
            if dirname.endswith(source):
                return ['objects']
            return []
        shutil.copytree(source, dest, ignore=ignoreobjects)

        # hardlink the objects
        sourceobjects = os.path.join(source, 'objects')
        assert os.path.exists(sourceobjects), "No source objects"
        objectspath = os.path.join(dest, 'objects')
        os.mkdir(objectspath)
        for dirpath, dirnames, filenames in os.walk(sourceobjects):
            assert dirpath.startswith(sourceobjects)
            # strip sourceobjects and / from relpath
            relpath = dirpath[len(sourceobjects)+1:]
            for dirname in dirnames:
                os.mkdir(os.path.join(objectspath, relpath, dirname))
            for filename in filenames:
                assert os.path.exists(os.path.join(objectspath, relpath))
                os.link(os.path.join(dirpath, filename),
                        os.path.join(objectspath, relpath, filename))
        return dest

    def mirror_git(self, project_name, dirname, gitdir, spec):
        '''Mirror the branches and tags of the git repository spec['url'].

        Creates a bare repository in `gitdir` if needed, (re)configures
        the 'origin' remote and fetches it with pruning. If the fetch
        fails, stale remote refs are pruned and the fetch is retried once.

        '''
        # Turn off git's SSL/TLS certificate verification, until Baserock
        # has an CA management infrastructure.
        os.environ['GIT_SSL_NO_VERIFY'] = 'true'

        if not os.path.exists(gitdir):
            self.progress('.. initialising git dir')
            self.run_program(['git', 'init', '--bare', gitdir])

        self.progress('.. updating existing clone')
        # NOTE(review): the output may not be a string when stdout is
        # redirected via 'command-stdout'; treat that as "no settings".
        git_config = self.run_program(['git', 'config', '-l'],
                                      cwd=gitdir) or ''
        if 'remote.origin.fetch' in git_config:
            self.run_program(['git', 'config', '--unset-all',
                              'remote.origin.fetch'],
                             cwd=gitdir)
        self.run_program(['git', 'config', 'remote.origin.url',
                          spec['url']],
                         cwd=gitdir)
        self.run_program(['git', 'config', 'remote.origin.mirror', 'true'],
                         cwd=gitdir)
        self.run_program(['git', 'config', '--add', 'remote.origin.fetch',
                          '+refs/heads/*:refs/heads/*'],
                         cwd=gitdir)
        self.run_program(['git', 'config', '--add', 'remote.origin.fetch',
                          '+refs/tags/*:refs/tags/*'],
                         cwd=gitdir)
        try:
            self.run_program(['git', 'remote', 'update', 'origin',
                              '--prune'],
                             cwd=gitdir)
        except Exception:
            # First attempt failed: prune explicitly, then retry once.
            self.run_program(['git', 'remote', 'prune', 'origin'],
                             cwd=gitdir)
            self.run_program(['git', 'remote', 'update', 'origin',
                              '--prune'],
                             cwd=gitdir)

    def gitify_bzr(self, project_name, dirname, gitdir, spec):
        '''Import bzr branches into the git repository in `gitdir`.

        The branches are those in the optional spec['branches'] dict
        (branch name -> address) plus, if spec['url'] is given, a 'trunk'
        branch for that URL. Each branch is branched or pulled into a
        shared bzr repository, fast-exported to a temporary file and
        fast-imported into git.

        '''
        bzrdir = os.path.join(dirname, 'bzr')
        # check if repo exists
        if not os.path.exists(bzrdir):
            self.progress('.. creating bzr repository')
            self.run_program(['bzr', 'init-repo', '--no-trees', bzrdir])

        if not os.path.exists(gitdir):
            self.progress('.. creating git repo')
            os.mkdir(gitdir)
            self.run_program(['git', 'init', '--bare', gitdir])
            self.needs_aggressive = True

        # branches are the listed branches, plus the branch specified in url
        # (copied, so that the caller's spec is not modified)
        branches = dict(spec.get('branches', {}))
        if 'url' in spec:
            branches['trunk'] = spec['url']
        logging.debug('all branches: %s' % repr(branches))

        for branch, address in branches.items():
            branchdir = os.path.join(bzrdir, branch)
            if not os.path.exists(branchdir):
                self.progress('.. doing initial bzr branch')
                self.run_program(
                    ['bzr', 'branch', '--quiet', '-Ossl.cert_reqs=none',
                     address, branchdir])
            else:
                self.progress('.. updating bzr branch')
                self.run_program(
                    ['bzr', 'pull', '--quiet', '-Ossl.cert_reqs=none',
                     address],
                    cwd=branchdir)

        exports = {}
        bzrmarks = os.path.join(gitdir, 'marks.bzr')
        for branch in branches:
            branchdir = os.path.join(bzrdir, branch)
            self.progress('.. fast-exporting branch %s from bzr' % branch)
            exports[branch] = os.path.join(dirname, 'fast-export' + branch)
            cmdline = ['bzr', 'fast-export', '--git-branch=' + branch,
                       branchdir, exports[branch]]
            if os.path.exists(bzrmarks):
                cmdline.append('--marks=' + bzrmarks)
            else:
                cmdline.append('--export-marks=' + bzrmarks)
            self.run_program(cmdline)

        gitmarks = os.path.join(gitdir, 'marks.git')
        for branch in branches:
            self.progress('.. fast-importing branch %s into git' % branch)
            with open(exports[branch], 'rb') as exportfile:
                cmdline = ['git', 'fast-import',
                           '--export-marks=' + gitmarks]
                if os.path.exists(gitmarks):
                    cmdline.append('--import-marks=' + gitmarks)
                self.run_program(cmdline, stdin=exportfile, cwd=gitdir)

        for branch in branches:
            self.progress('.. removing temporary fast-export file ' +
                          exports[branch])
            os.remove(exports[branch])

    def gitify_svn(self, project_name, dirname, gitdir, spec):
        '''Import the svn repository spec['url'] into `gitdir` via git-svn.

        On the first run the repository is initialised using
        spec['layout'], which is either the string "standard" or a dict
        with the keys "trunk", "branches" and "tags". Every run ends with
        'git svn fetch'.

        '''
        if not os.path.exists(gitdir):
            self.progress('.. doing initial clone')
            self.needs_aggressive = True
            layout = spec["layout"]
            # if standard layout specified, fill in the defaults
            if layout == "standard":
                layout = {
                    "trunk": "trunk",
                    "tags": "tags/*",
                    "branches": "branches/*",
                }
            # init the repo then manually set the refspecs to fetch into local
            # git-svn can apparently provide better history tracking by
            # fetching the root of the repository
            # git-svn will convert branch, trunk and tag paths to allow this,
            # but it is simpler to disable it and do it manually
            self.run_program(['git', 'svn', 'init', spec['url'],
                              gitdir + "-tmp", '--svn-remote=svn',
                              '--no-minimize-url'])
            # Keep only the .git directory of the temporary checkout.
            os.rename(os.path.join(gitdir + "-tmp", '.git'), gitdir)
            os.rmdir(gitdir + "-tmp")
            self.run_program(['git', 'config', 'core.bare', 'true'],
                             cwd=gitdir)
            self.run_program(['git', 'config', 'svn-remote.svn.fetch',
                              layout["trunk"] + ':refs/heads/master'],
                             cwd=gitdir)
            self.run_program(['git', 'config', 'svn-remote.svn.branches',
                              layout["branches"] + ':refs/heads/*'],
                             cwd=gitdir)
            self.run_program(['git', 'config', 'svn-remote.svn.tags',
                              layout["tags"] + ':refs/tags/*'],
                             cwd=gitdir)
        else:
            self.progress('.. updating existing clone')

        # update the remote tracking branches
        self.run_program(['git', 'svn', 'fetch'], cwd=gitdir)

    def gitify_cvs(self, project_name, dirname, gitdir, spec):
        '''Import module spec['module'] of CVS root spec['url'] into git.

        Runs 'git cvsimport' with CVS_RSH set to 'lorry-ssh-wrapper'.

        '''
        self.needs_aggressive = True
        env = dict(os.environ)
        env['CVS_RSH'] = 'lorry-ssh-wrapper'
        self.run_program(
            ['git', 'cvsimport', '-a', '-d', spec['url'],
             '-C', gitdir, spec['module']],
            env=env)

    def gitify_hg(self, project_name, dirname, gitdir, spec):
        '''Import the Mercurial repository spec['url'] into `gitdir`.

        The hg repository is cloned (or pulled, if it already exists)
        into the 'hg' directory next to `gitdir` and then converted with
        hg-fast-export.

        '''
        hgdir = os.path.join(dirname, 'hg')
        if os.path.exists(hgdir):
            self.progress('.. updating hg branch')
            self.run_program(['hg', 'pull', '--quiet', '--insecure'],
                             cwd=hgdir)
        else:
            self.progress('.. doing initial hg branch')
            self.run_program(['hg', 'clone', '--quiet', '--insecure',
                              spec['url'], hgdir])

        if not os.path.exists(gitdir):
            self.needs_aggressive = True
            self.run_program(['git', 'init', '--bare', gitdir])

        self.progress('.. fast-exporting into git')
        # '../hg' is relative to gitdir, i.e. the hgdir used above.
        self.run_program(['hg-fast-export', '--quiet', '--force',
                          '-r', '../hg'],
                         cwd=gitdir)

    def gitify_tarball(self, project_name, dirname, gitdir, spec):
        '''Download the tarball spec['url'] and import it into `gitdir`.

        If a file with the tarball's base name already exists in
        `dirname`, nothing is done. Otherwise the tarball is downloaded
        and imported by running the '<lorry_path>.tar-importer' program.

        '''
        url = spec['url']
        url_path = urllib2.urlparse.urlparse(url)[2]
        basename = os.path.basename(url_path)
        tardest = os.path.join(dirname, basename)
        self.progress('.. checking if we need to fetch %s' % basename)
        if os.path.exists(tardest):
            self.progress('.. no need to run, nothing to do')
            return

        self.progress('.. attempting to fetch.')
        # Download to a temporary name and rename on success, so that an
        # interrupted download is not mistaken for a fetched tarball on
        # the next run.
        partial = tardest + '.part'
        urlfile = urllib2.urlopen(url)
        try:
            with open(partial, 'wb') as tarfile:
                shutil.copyfileobj(urlfile, tarfile)
        finally:
            urlfile.close()
        os.rename(partial, tardest)

        if not os.path.exists(gitdir):
            self.run_program(['git', 'init', '--bare', gitdir])
        cmdline = ["%s.tar-importer" % lorry_path, tardest]
        self.run_program(cmdline, cwd=gitdir)
        self.needs_aggressive = True

    def push_to_mirror_server(self, name, gitdir, pushrefspecs=None):
        '''Push the repository in `gitdir` to the mirror server.

        `pushrefspecs` is a list of refspecs to push; when omitted (or
        None) all branches and tags are pushed.

        '''
        if pushrefspecs is None:
            pushrefspecs = ['refs/heads/*:refs/heads/*',
                            'refs/tags/*:refs/tags/*']
        pushurl = "%s/%s.git" % (self.settings['mirror-base-url-push'], name)
        self.progress('.. pushing %s to mirror server %s' % (name, pushurl))
        self.run_program(['git', 'push', pushurl] + list(pushrefspecs),
                         cwd=gitdir)

    def run_program(self, argv, **kwargs):
        '''Run the command `argv` and return its standard output.

        Extra keyword arguments are passed on to runcmd_unchecked(). The
        'command-stdout' / 'command-stderr' settings, when set, redirect
        the command's output to those files (appending). Unless the caller
        supplies `stdin`, the command reads from /dev/null.

        Raises Exception if the command exits with a non-zero status.

        '''
        # Files opened here (as opposed to handed in by the caller) are
        # closed again once the command has finished.
        opened = []
        try:
            if self.settings['command-stdout']:
                kwargs['stdout'] = open(self.settings['command-stdout'], 'a')
                opened.append(kwargs['stdout'])
            if self.settings['command-stderr']:
                kwargs['stderr'] = open(self.settings['command-stderr'], 'a')
                opened.append(kwargs['stderr'])
            if 'stdin' not in kwargs:
                kwargs['stdin'] = open('/dev/null', 'r')
                opened.append(kwargs['stdin'])
            logging.debug('Running: argv=%s kwargs=%s' %
                          (repr(argv), repr(kwargs)))
            exit_code, out, err = self.runcmd_unchecked(argv, **kwargs)
        finally:
            for handle in opened:
                handle.close()

        logging.debug('Command: %s\nExit: %s\nStdout:\n%sStderr:\n%s' %
                      (argv, exit_code,
                       self.indent(out or ''), self.indent(err or '')))
        if exit_code != 0:
            raise Exception('%s failed (exit code %s):\n%s' %
                            (' '.join(argv), exit_code,
                             self.indent(err or '')))
        return out

    def indent(self, string):
        '''Return `string` with every line indented and newline-terminated.'''
        return ''.join(' %s\n' % line for line in string.splitlines())

    def dirname(self, project_name):
        '''Return the working-area directory used for `project_name`.'''
        assert '\0' not in project_name
        # We escape slashes as underscores.
        project_name = '_'.join(project_name.split('/'))
        return os.path.join(self.settings['working-area'], project_name)

    def progress(self, msg):
        '''Log `msg` and, in verbose mode, also write it to the output.'''
        logging.debug(msg)
        if self.settings['verbose']:
            self.output.write('%s\n' % msg)


if __name__ == '__main__':
    Lorry(version=__version__).run()