#!/usr/bin/env python
from __future__ import print_function, with_statement

import sys
import hashlib
import tempfile
import os
import subprocess
import shlex
import shutil
import itertools

BLOCK_SIZE = 4096


def verbose_stderr(*args, **kwargs):
    return print(*args, file=sys.stderr, **kwargs)


def verbose_ignore(*args, **kwargs):
    pass


def mkdir_p(path):
    import errno
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise


def readblocks(stream):
    bytes = 0
    while True:
        data = stream.read(BLOCK_SIZE)
        bytes += len(data)
        if not data:
            break
        yield data


def cat_iter(initer, outstream):
    for block in initer:
        outstream.write(block)


def cat(instream, outstream):
    return cat_iter(readblocks(instream), outstream)


def gitconfig_get(name, file=None):
    args = ['git', 'config', '--get']
    if file is not None:
        args += ['--file', file]
    args.append(name)
    p = subprocess.Popen(args, stdout=subprocess.PIPE)
    output = p.communicate()[0].strip()
    if p.returncode != 0:
        return None
    else:
        return output


def gitconfig_set(name, value, file=None):
    args = ['git', 'config']
    if file is not None:
        args += ['--file', file]
    args += [name, value]
    p = subprocess.check_call(args)


class GitFat(object):
    DecodeError = RuntimeError

    def __init__(self):
        self.verbose = verbose_stderr if os.environ.get('GIT_FAT_VERBOSE') else verbose_ignore
        self.gitroot = subprocess.check_output('git rev-parse --show-toplevel'.split()).strip()
        self.objdir = os.path.join(self.gitroot, '.git', 'fat', 'objects')
        if os.environ.get('GIT_FAT_VERSION') == '1':
            self.encode = self.encode_v1
        else:
            self.encode = self.encode_v2
        def magiclen(enc):
            return len(enc(hashlib.sha1('dummy').hexdigest(), 5))
        self.magiclen = magiclen(self.encode)  # Current version
        self.magiclens = [magiclen(enc) for enc in [self.encode_v1, self.encode_v2]]  # All prior versions

    def setup(self):
        mkdir_p(self.objdir)

    def get_rsync(self):
        cfgpath = os.path.join(self.gitroot, '.gitfat')
        remote = gitconfig_get('rsync.remote', file=cfgpath)
        if remote is None:
            raise RuntimeError('No rsync.remote in %s' % cfgpath)
        return remote

    def revparse(self, revname):
        return subprocess.check_output(['git', 'rev-parse', revname]).strip()

    def encode_v1(self, digest, bytes):
        'Produce legacy representation of file to be stored in repository.'
        return '#$# git-fat %s\n' % (digest,)

    def encode_v2(self, digest, bytes):
        'Produce representation of file to be stored in repository. 20 characters can hold 64-bit integers.'
        return '#$# git-fat %s %20d\n' % (digest, bytes)

    def decode(self, string, noraise=False):
        cookie = '#$# git-fat '
        if string.startswith(cookie):
            parts = string[len(cookie):].split()
            digest = parts[0]
            bytes = int(parts[1]) if len(parts) > 1 else None
            return digest, bytes
        elif noraise:
            return None, None
        else:
            raise GitFat.DecodeError('Could not decode %s' % (string))

    def decode_stream(self, stream):
        'Return digest if git-fat cache, otherwise return iterator over entire file contents'
        preamble = stream.read(self.magiclen)
        try:
            return self.decode(preamble)
        except GitFat.DecodeError:
            # Not sure if this is the right behavior
            return itertools.chain([preamble], readblocks(stream)), None

    def decode_file(self, fname):
        # Fast check
        stat = os.stat(fname)
        if stat.st_size != self.magiclen:
            return False, None
        # read file
        digest, bytes = self.decode_stream(open(fname))
        if isinstance(digest, str):
            return digest, bytes
        else:
            return None, bytes

    def decode_clean(self, body):
        '''
        Attempt to decode version in working tree. The tree version could be
        changed to have a more useful message than the machine-readable copy
        that goes into the repository. If the tree version decodes
        successfully, it indicates that the fat data is not currently
        available in this repository.
        '''
        digest, bytes = self.decode(body, noraise=True)
        return digest
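
    # A note on the placeholder format, derived from encode_v2 above (the
    # digest and size below are made-up, illustrative values): a "clean" file
    # stored in the repository is a single line of the form
    #
    #   #$# git-fat <40-hex-char sha1> <size, right-aligned in 20 chars>
    #
    # for example:
    #
    #   #$# git-fat deadbeefdeadbeefdeadbeefdeadbeefdeadbeef              1048576
    #
    # which works out to 74 bytes for the v2 format (53 for legacy v1); these
    # lengths are what self.magiclen / self.magiclens capture. decode() and
    # decode_clean() recognise the '#$# git-fat ' cookie; anything else is
    # treated as ordinary file content.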

    def cmd_clean(self):
        '''
        The clean filter runs when a file is added to the index. It gets the
        "smudged" (tree) version of the file on stdin and produces the
        "clean" (repository) version on stdout.
        '''
        self.setup()
        h = hashlib.new('sha1')
        bytes = 0
        fd, tmpname = tempfile.mkstemp(dir=self.objdir)
        try:
            ishanging = False
            cached = False  # changes to True when file is cached
            with os.fdopen(fd, 'w') as cache:
                outstream = cache
                blockiter = readblocks(sys.stdin)
                # Check whether this file is hanging
                block = next(blockiter)
                if self.decode_clean(block[0:self.magiclen]):
                    ishanging = True
                    outstream = sys.stdout
                h.update(block)
                bytes += len(block)
                outstream.write(block)
                for block in blockiter:
                    h.update(block)
                    bytes += len(block)
                    outstream.write(block)
                outstream.flush()
            digest = h.hexdigest()
            objfile = os.path.join(self.objdir, digest)
            if not ishanging:
                if os.path.exists(objfile):
                    self.verbose('git-fat filter-clean: cache already exists %s' % objfile)
                    os.remove(tmpname)
                else:
                    os.rename(tmpname, objfile)
                    self.verbose('git-fat filter-clean: caching to %s' % objfile)
                cached = True
                sys.stdout.write(self.encode(digest, bytes))
        finally:
            if not cached:
                os.remove(tmpname)

    def cmd_smudge(self):
        self.setup()
        result, bytes = self.decode_stream(sys.stdin)
        if isinstance(result, str):  # We got a digest
            objfile = os.path.join(self.objdir, result)
            try:
                cat(open(objfile), sys.stdout)
                self.verbose('git-fat filter-smudge: restoring from %s' % objfile)
            except IOError:  # file not found
                self.verbose('git-fat filter-smudge: fat object missing %s' % objfile)
                # could leave a better notice about how to recover this file
                sys.stdout.write(self.encode(result, bytes))
        else:  # We have an iterable over the original input.
            self.verbose('git-fat filter-smudge: not a managed file')
            cat_iter(result, sys.stdout)
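
    # catalog_objects() below lists what is stored locally under
    # .git/fat/objects, while referenced_objects() walks history with
    # `git rev-list --objects` and feeds each object name to
    # `git cat-file --batch-check`, whose output lines have the form
    # "<sha1> <type> <size>", for example (sha1 and size are illustrative):
    #
    #   0123456789abcdef0123456789abcdef01234567 blob 74
    #
    # Blobs whose size matches one of the known placeholder lengths
    # (self.magiclens) are then decoded to recover the fat object digest.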

    def catalog_objects(self):
        return set(os.listdir(self.objdir))

    def referenced_objects(self, rev=None, all=False):
        referenced = set()
        if all:
            rev = '--all'
        elif rev is None:
            rev = self.revparse('HEAD')
        p1 = subprocess.Popen(['git', 'rev-list', '--objects', rev], stdout=subprocess.PIPE)
        p2 = subprocess.Popen(['git', 'cat-file', '--batch-check'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        for line in p1.communicate()[0].splitlines():
            p2.stdin.write(line.split()[0] + '\n')
        for line in p2.communicate()[0].splitlines():
            objhash, objtype, size = line.split()
            if objtype == 'blob' and int(size) in self.magiclens:
                fathash = self.decode(subprocess.check_output(['git', 'cat-file', '-p', objhash]))[0]
                referenced.add(fathash)
        return referenced

    def orphan_files(self):
        'generator for all orphan placeholders in the working tree'
        for fname in subprocess.check_output(['git', 'ls-files']).splitlines():
            digest = self.decode_file(fname)[0]
            if digest:
                yield (digest, fname)

    def cmd_status(self, args):
        self.setup()
        catalog = self.catalog_objects()
        refargs = dict()
        if '--all' in args:
            refargs['all'] = True
        referenced = self.referenced_objects(**refargs)
        garbage = catalog - referenced
        orphans = referenced - catalog
        if '--all' in args:
            for obj in referenced:
                print(obj)
        if orphans:
            print('Orphan objects:')
            for orph in orphans:
                print('    ' + orph)
        if garbage:
            print('Garbage objects:')
            for g in garbage:
                print('    ' + g)

    def is_dirty(self):
        # `git diff-index --quiet HEAD` exits 0 when the working tree matches
        # HEAD, so a non-zero exit status means there are uncommitted changes.
        return subprocess.call(['git', 'diff-index', '--quiet', 'HEAD']) != 0

    def cmd_push(self, args):
        'Push anything that I have stored and referenced'
        self.setup()
        # Default to push only those objects referenced by current HEAD
        # (includes history). Finer-grained pushing would be useful.
        pushall = '--all' in args
        files = self.referenced_objects(all=pushall) & self.catalog_objects()
        remote = self.get_rsync()
        self.verbose('Pushing to %s' % (remote))
        cmd = ['rsync', '--progress', '--ignore-existing', '--from0', '--files-from=-',
               self.objdir + '/', remote]
        p = subprocess.Popen(cmd, stdin=subprocess.PIPE)
        p.communicate(input='\x00'.join(files))

    def checkout(self, show_orphans=False):
        'Update any stale files in the present working tree'
        for digest, fname in self.orphan_files():
            objpath = os.path.join(self.objdir, digest)
            if os.access(objpath, os.R_OK):
                print('Restoring %s -> %s' % (digest, fname))
                # The output of our smudge filter depends on the existence of
                # the file in .git/fat/objects, but git caches the file stat
                # from the previous time the file was smudged, therefore it
                # won't try to re-smudge. I don't know a git command that
                # specifically invalidates that cache, but touching the file
                # also does the trick.
                os.utime(fname, None)
                # This re-smudge is essentially a copy that restores permissions.
                subprocess.check_call(['git', 'checkout-index', '--index', '--force', fname])
            elif show_orphans:
                print('Data unavailable: %s %s' % (digest, fname))
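
    # cmd_push above and cmd_pull below hand the object transfer to rsync,
    # passing the digests to copy as a NUL-separated list on stdin
    # (--from0 --files-from=-). A pull is roughly equivalent to something like
    # the following shell pipeline (the remote path is only an example, not
    # something this script defines):
    #
    #   printf '%s\0' <digest>... | \
    #       rsync --progress --ignore-existing --from0 --files-from=- \
    #             example.com:/var/fat-store/ .git/fat/objects/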

    def cmd_pull(self, args):
        'Pull anything that I have referenced, but not stored'
        self.setup()
        refargs = dict()
        if '--all' in args:
            refargs['all'] = True
        for arg in args:
            if arg.startswith('-') or len(arg) != 40:
                continue
            rev = self.revparse(arg)
            if rev:
                refargs['rev'] = rev
        files = self.referenced_objects(**refargs) - self.catalog_objects()
        remote = self.get_rsync()
        cmd = ['rsync', '--progress', '--ignore-existing', '--from0', '--files-from=-',
               remote + '/', self.objdir]
        p = subprocess.Popen(cmd, stdin=subprocess.PIPE)
        p.communicate(input='\x00'.join(files))
        self.checkout()

    def cmd_checkout(self, args):
        self.checkout(show_orphans=True)

    def cmd_gc(self):
        garbage = self.catalog_objects() - self.referenced_objects()
        print('Unreferenced objects to remove: %d' % len(garbage))
        for obj in garbage:
            fname = os.path.join(self.objdir, obj)
            print('%10d %s' % (os.stat(fname).st_size, obj))
            os.remove(fname)

    def cmd_init(self):
        self.setup()
        if gitconfig_get('filter.fat.clean') or gitconfig_get('filter.fat.smudge'):
            print('Git fat already configured, check configuration in .git/config')
        else:
            gitconfig_set('filter.fat.clean', 'git-fat filter-clean')
            gitconfig_set('filter.fat.smudge', 'git-fat filter-smudge')
            print('Initialized git fat')


if __name__ == '__main__':
    fat = GitFat()
    cmd = sys.argv[1] if len(sys.argv) > 1 else ''
    if cmd == 'filter-clean':
        fat.cmd_clean()
    elif cmd == 'filter-smudge':
        fat.cmd_smudge()
    elif cmd == 'init':
        fat.cmd_init()
    elif cmd == 'status':
        fat.cmd_status(sys.argv[2:])
    elif cmd == 'push':
        fat.cmd_push(sys.argv[2:])
    elif cmd == 'pull':
        fat.cmd_pull(sys.argv[2:])
    elif cmd == 'gc':
        fat.cmd_gc()
    elif cmd == 'checkout':
        fat.cmd_checkout(sys.argv[2:])
    else:
        print('Usage: git fat [init|status|push|pull|gc|checkout]', file=sys.stderr)
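
# Typical setup, for reference (the attribute pattern and rsync remote below
# are illustrative examples, not values this script provides):
#
#   $ git fat init
#   $ echo '*.psd filter=fat -crlf' >> .gitattributes
#   $ git config --file .gitfat rsync.remote example.com:/var/fat-store
#
# With the script installed on PATH as `git-fat`, `git add` then runs the
# clean filter for matching files (storing the real contents under
# .git/fat/objects and committing only the placeholder line), and
# `git fat push` / `git fat pull` synchronise the object store with the
# configured rsync remote.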