From 40996c304ba56aad88cac0b9de02a384adb35d3f Mon Sep 17 00:00:00 2001 From: Jed Brown Date: Sun, 25 Nov 2012 23:21:45 +0100 Subject: Extend worked example and make several refinements * Verbosity control * Automatically update working tree * Identify orphan files in filter-clean and pass through so they don't show up as spurious diffs. --- git-fat | 116 +++++++++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 82 insertions(+), 34 deletions(-) (limited to 'git-fat') diff --git a/git-fat b/git-fat index ab36470..84be082 100755 --- a/git-fat +++ b/git-fat @@ -1,6 +1,6 @@ #!/usr/bin/env python -from __future__ import print_function +from __future__ import print_function, with_statement import sys import hashlib @@ -8,6 +8,7 @@ import tempfile import os import subprocess import shlex +import shutil BLOCK_SIZE = 4096 @@ -15,10 +16,7 @@ def verbose_stderr(*args, **kwargs): return print(*args, file=sys.stderr, **kwargs) def verbose_ignore(*args, **kwargs): pass -verbose = verbose_ignore -def gitroot(): - return subprocess.check_output('git rev-parse --show-toplevel'.split()).strip() def mkdir_p(path): import errno try: @@ -45,7 +43,8 @@ def cat(instream, outstream): class GitFat(object): DecodeError = RuntimeError def __init__(self): - self.gitroot = gitroot() + self.verbose = verbose_stderr if os.environ.get('GIT_FAT_VERBOSE') else verbose_ignore + self.gitroot = subprocess.check_output('git rev-parse --show-toplevel'.split()).strip() self.objdir = os.path.join(self.gitroot, '.git', 'fat', 'objects') self.magiclen = len(self.encode(hashlib.sha1('dummy').hexdigest())) def setup(self): @@ -62,12 +61,16 @@ class GitFat(object): return remote except ConfigParser.NoSectionError: raise RuntimeError('No rsync.remote in %s' % cfgpath) + def revparse(self, revname): + return subprocess.check_output(['git', 'rev-parse', revname]).strip() def encode(self, digest): return '#$# git-fat %s\n' % digest - def decode(self, string, 
noraise=False): cookie = '#$# git-fat ' if string.startswith(cookie): return string[len(cookie):].split()[0] + elif noraise: + return None else: raise GitFat.DecodeError('Could not decode %s' % (string)) def decode_stream(self, stream): @@ -94,38 +97,54 @@ class GitFat(object): h = hashlib.new('sha1') fd, tmpname = tempfile.mkstemp(dir=self.objdir) try: + ishanging = False + cached = False # changes to True when file is cached, means we with os.fdopen(fd, 'w') as cache: - for block in readblocks(sys.stdin): + outstream = cache + blockiter = readblocks(sys.stdin) + # Check whether this file is hanging + block = next(blockiter) + if self.decode(block[0:self.magiclen], noraise=True): + ishanging = True + outstream = sys.stdout + h.update(block) + outstream.write(block) + for block in blockiter: h.update(block) - cache.write(block) + outstream.write(block) + outstream.flush() digest = h.hexdigest() objfile = os.path.join(self.objdir, digest) - os.rename(tmpname, objfile) - verbose('git-fat filter-clean: caching to %s' % objfile) - sys.stdout.write(self.encode(digest)) - sys.stdout.flush() - except: - raise - #os.remove(tmpname) + if not ishanging: + os.rename(tmpname, objfile) + cached = True + self.verbose('git-fat filter-clean: caching to %s' % objfile) + sys.stdout.write(self.encode(digest)) + finally: + if not cached: + os.remove(tmpname) + def cmd_smudge(self): self.setup() result = self.decode_stream(sys.stdin) if isinstance(result, str): # We got a digest objfile = os.path.join(self.objdir, result) - verbose('git-fat filter-smudge: restoring from %s' % objfile) + self.verbose('git-fat filter-smudge: restoring from %s' % objfile) try: cat(open(objfile), sys.stdout) except: sys.stdout.write(self.encode(result)) # could leave a better notice about how to recover this file else: # We have an iterable over the original input. 
- verbose('git-fat filter-smudge: not a managed file') + self.verbose('git-fat filter-smudge: not a managed file') cat(result, sys.stdout) def catalog_objects(self): return set(os.listdir(self.objdir)) - def referenced_objects(self, rev=None): + def referenced_objects(self, rev=None, all=False): referenced = set() - if rev is None: + if all: rev = '--all' + elif rev is None: + rev = self.revparse('HEAD') p1 = subprocess.Popen(['git','rev-list','--objects',rev], stdout=subprocess.PIPE) p2 = subprocess.Popen(['git','cat-file','--batch-check'], stdin=subprocess.PIPE, stdout=subprocess.PIPE) for line in p1.communicate()[0].splitlines(): @@ -136,17 +155,26 @@ class GitFat(object): fathash = self.decode(subprocess.check_output(['git', 'cat-file', '-p', objhash])) referenced.add(fathash) return referenced + def orphan_files(self): + 'generator for all orphan placeholders in the working tree' + for fname in subprocess.check_output(['git', 'ls-files']).splitlines(): + digest = self.decode_file(fname) + if digest: + yield (digest, fname) def cmd_status(self, args): self.setup() catalog = self.catalog_objects() - referenced = self.referenced_objects() + refargs = dict() + if '--all' in args: + refargs['all'] = True + referenced = self.referenced_objects(**refargs) garbage = catalog - referenced orphans = referenced - catalog if '--all' in args: for obj in referenced: print(obj) if orphans: - print('Orphan files:') + print('Orphan objects:') for orph in orphans: print(' ' + orph) if garbage: @@ -156,23 +184,44 @@ class GitFat(object): def cmd_push(self): 'Push anything that I have stored and referenced' self.setup() - files = self.referenced_objects() & self.catalog_objects() + # Pushing *all* objects because it's safer. Could implement partial push. 
+ files = self.referenced_objects(all=True) & self.catalog_objects() remote = self.get_rsync() - verbose('Pushing to %s' % (remote)) + self.verbose('Pushing to %s' % (remote)) cmd = ['rsync', '--progress', '--ignore-existing', '--from0', '--files-from=-', self.objdir + '/', remote] p = subprocess.Popen(cmd, stdin=subprocess.PIPE) p.communicate(input='\x00'.join(files)) - def checkout(self): - files = subprocess.check_output(['git', 'ls-files', '']) - def cmd_pull(self): + def checkout(self, show_orphans=False): + 'Update any stale files in the present working tree' + orphans = [] + for digest, fname in self.orphan_files(): + objpath = os.path.join(self.objdir, digest) + if os.access(objpath, os.R_OK): + print('Restoring %s -> %s' % (digest, fname)) + shutil.copy(objpath, fname) + elif show_orphans: + print('Data unavailable: %s %s' % (digest,fname)) + subprocess.call(['git', 'checkout', '.']) + def cmd_pull(self, args): 'Pull anything that I have referenced, but not stored' self.setup() - files = self.referenced_objects() - self.catalog_objects() + refargs = dict() + if '--all' in args: + refargs['all'] = True + for arg in args: + if arg.startswith('-') or len(arg) != 40: + continue + rev = self.revparse(arg) + if rev: + refargs['rev'] = rev + files = self.referenced_objects(**refargs) - self.catalog_objects() remote = self.get_rsync() cmd = ['rsync', '--progress', '--ignore-existing', '--from0', '--files-from=-', remote + '/', self.objdir] p = subprocess.Popen(cmd, stdin=subprocess.PIPE) p.communicate(input='\x00'.join(files)) self.checkout() + def cmd_checkout(self, args): + self.checkout(show_orphans=True) def cmd_gc(self): garbage = self.catalog_objects() - self.referenced_objects() print('Unreferenced objects to remove: %d' % len(garbage)) @@ -189,24 +238,23 @@ class GitFat(object): ]) if __name__ == '__main__': - if os.environ.get('GIT_FAT_VERBOSE'): - global verbose - verbose = verbose_stderr fat = GitFat() cmd = sys.argv[1] if len(sys.argv) > 1 else '' if 
cmd == 'filter-clean': fat.cmd_clean() elif cmd == 'filter-smudge': fat.cmd_smudge() + elif cmd == 'init': + fat.cmd_init() elif cmd == 'status': fat.cmd_status(sys.argv[2:]) elif cmd == 'push': fat.cmd_push() elif cmd == 'pull': - fat.cmd_pull() + fat.cmd_pull(sys.argv[2:]) elif cmd == 'gc': fat.cmd_gc() - elif cmd == 'init': - fat.cmd_init() + elif cmd == 'checkout': + fat.cmd_checkout(sys.argv[2:]) else: - print('Usage: git fat [status|push|pull|gc|init]', file=sys.stderr) + print('Usage: git fat [init|status|push|pull|gc|checkout]', file=sys.stderr) -- cgit v1.2.1