#!/usr/bin/env python
"""git-fat: store large files outside of git, tracked by small placeholders.

Files managed by the ``fat`` filter are replaced in the repository by a short
placeholder line (``#$# git-fat <sha1> [<size>]``).  The real content lives in
``<gitdir>/fat/objects/<sha1>`` and is exchanged with a remote using rsync.

The code runs on Python 2.7 and Python 3: data read from or written to pipes
and files is handled as bytes and converted to native ``str`` only at the
boundary (see ``_to_text`` / ``_to_bytes``).
"""
from __future__ import print_function, with_statement

import sys
import hashlib
import tempfile
import os
import subprocess
import shlex
import shutil
import itertools
import threading
import time
import collections
import errno

BLOCK_SIZE = 4096


def _to_text(data):
    """Return *data* as native ``str``.

    On Python 3, ``bytes`` are decoded as UTF-8 with ``surrogateescape`` so
    arbitrary bytes round-trip through ``_to_bytes``.  On Python 2, ``bytes``
    is ``str`` and the value is returned unchanged.
    """
    if isinstance(data, bytes) and not isinstance(data, str):
        return data.decode('utf-8', 'surrogateescape')
    return data


def _to_bytes(data):
    """Return *data* as ``bytes`` (inverse of ``_to_text``)."""
    if isinstance(data, bytes):
        return data
    if sys.version_info[0] >= 3:
        return data.encode('utf-8', 'surrogateescape')
    return data.encode('utf-8')


def _binary_stream(stream):
    """Return the byte-oriented stream behind a standard stream.

    Python 3 text streams expose it as ``.buffer``; anything without that
    attribute (Python 2 ``sys.stdin``/``sys.stdout``) is returned as is.
    """
    return getattr(stream, 'buffer', stream)


def verbose_stderr(*args, **kwargs):
    """Print a diagnostic message to stderr."""
    return print(*args, file=sys.stderr, **kwargs)


def verbose_ignore(*args, **kwargs):
    """Discard a diagnostic message (used when verbosity is off)."""
    pass


def mkdir_p(path):
    """Create *path* and any missing parents; succeed if it already exists.

    Raises OSError for any failure other than "already exists as a directory".
    """
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5
        if exc.errno != errno.EEXIST or not os.path.isdir(path):
            raise


def readblocks(stream):
    """Yield successive chunks of at most BLOCK_SIZE from *stream* until EOF."""
    while True:
        data = stream.read(BLOCK_SIZE)
        if not data:
            break
        yield data


def cat_iter(initer, outstream):
    """Write every block produced by *initer* to *outstream*."""
    for block in initer:
        outstream.write(block)


def cat(instream, outstream):
    """Copy *instream* to *outstream* in BLOCK_SIZE chunks."""
    return cat_iter(readblocks(instream), outstream)


def difftreez_reader(input):
    """Incremental reader for git diff-tree -z output

    :oldmode newmode oldsha1 newsha1 modflag\\0filename\\0:oldmode newmode ...

    *input* may be a byte stream or a text stream.  Yields
    ``(newhash, modflag, path)`` tuples of native ``str``.  Records are only
    decoded once complete, so a record (or a multi-byte character) that
    straddles two reads is handled correctly.
    """
    fields = []      # NUL-terminated fields of the record being assembled
    pending = None   # trailing data not yet terminated by NUL
    nul = None       # separator of the same type as the stream's data
    while True:
        chunk = input.read(BLOCK_SIZE)
        if not chunk:
            break
        if nul is None:
            nul = b'\0' if isinstance(chunk, bytes) else u'\0'
            pending = chunk[:0]
        pieces = (pending + chunk).split(nul)
        pending = pieces.pop()  # incomplete tail, kept for the next read
        for piece in pieces:
            fields.append(piece)
            if len(fields) == 2:
                oldmode, newmode, oldhash, newhash, modflag = fields[0].split()
                yield (_to_text(newhash), _to_text(modflag), _to_text(fields[1]))
                fields = []


def gitconfig_get(name, file=None):
    """Return the git config value *name*, or None if ``git config`` fails.

    *file*, when given, is passed as ``--file`` to read that config file.
    """
    args = ['git', 'config', '--get']
    if file is not None:
        args += ['--file', file]
    args.append(name)
    p = subprocess.Popen(args, stdout=subprocess.PIPE)
    output = _to_text(p.communicate()[0]).strip()
    if p.returncode != 0:
        return None
    return output


def gitconfig_set(name, value, file=None):
    """Set git config *name* to *value*; raises CalledProcessError on failure."""
    args = ['git', 'config']
    if file is not None:
        args += ['--file', file]
    args += [name, value]
    subprocess.check_call(args)


class GitFat(object):
    """Implementation of the ``git fat`` sub-commands and filters."""

    DecodeError = RuntimeError

    def __init__(self):
        """Locate the repository (runs ``git rev-parse``) and pick the encoding.

        Setting GIT_FAT_VERBOSE enables diagnostics on stderr; setting
        GIT_FAT_VERSION=1 selects the legacy placeholder format.
        """
        self.verbose = verbose_stderr if os.environ.get('GIT_FAT_VERBOSE') else verbose_ignore
        self.gitroot = _to_text(subprocess.check_output(['git', 'rev-parse', '--show-toplevel'])).strip()
        self.gitdir = _to_text(subprocess.check_output(['git', 'rev-parse', '--git-dir'])).strip()
        self.objdir = os.path.join(self.gitdir, 'fat', 'objects')
        if os.environ.get('GIT_FAT_VERSION') == '1':
            self.encode = self.encode_v1
        else:
            self.encode = self.encode_v2

        def magiclen(enc):
            # Every placeholder of a given format has the same length.
            return len(enc(hashlib.sha1(b'dummy').hexdigest(), 5))

        self.magiclen = magiclen(self.encode)  # Current version
        self.magiclens = [magiclen(enc) for enc in [self.encode_v1, self.encode_v2]]  # All prior versions

    def setup(self):
        """Make sure the local object store directory exists."""
        mkdir_p(self.objdir)

    def get_rsync(self):
        """Return ``(remote, ssh_port, ssh_user)`` read from ``.gitfat``.

        Raises RuntimeError when ``rsync.remote`` is not configured.
        """
        cfgpath = os.path.join(self.gitroot, '.gitfat')
        remote = gitconfig_get('rsync.remote', file=cfgpath)
        ssh_port = gitconfig_get('rsync.sshport', file=cfgpath)
        ssh_user = gitconfig_get('rsync.sshuser', file=cfgpath)
        if remote is None:
            raise RuntimeError('No rsync.remote in %s' % cfgpath)
        return remote, ssh_port, ssh_user

    def get_rsync_command(self, push):
        """Build the rsync argv for a push (*push* true) or a pull.

        The file list is expected NUL-separated on rsync's stdin.  The
        source/destination order follows the direction: local store to remote
        for a push, remote to local store for a pull.
        """
        (remote, ssh_port, ssh_user) = self.get_rsync()
        if push:
            self.verbose('Pushing to %s' % (remote))
        else:
            self.verbose('Pulling from %s' % (remote))

        cmd = ['rsync', '--progress', '--ignore-existing', '--from0', '--files-from=-']
        rshopts = ''
        if ssh_user:
            rshopts += ' -l ' + ssh_user
        if ssh_port:
            rshopts += ' -p ' + ssh_port
        if rshopts:
            cmd.append('--rsh=ssh' + rshopts)
        if push:
            cmd += [self.objdir + '/', remote + '/']
        else:
            cmd += [remote + '/', self.objdir + '/']
        return cmd

    def revparse(self, revname):
        """Return the output of ``git rev-parse <revname>`` as text."""
        return _to_text(subprocess.check_output(['git', 'rev-parse', revname])).strip()

    def encode_v1(self, digest, bytes):
        'Produce legacy representation of file to be stored in repository.'
        return '#$# git-fat %s\n' % (digest,)

    def encode_v2(self, digest, bytes):
        'Produce representation of file to be stored in repository. 20 characters can hold 64-bit integers.'
        return '#$# git-fat %s %20d\n' % (digest, bytes)

    def decode(self, string, noraise=False):
        """Parse a placeholder into ``(digest, size)``.

        *string* may be bytes or text.  ``size`` is None for legacy
        placeholders that carry no size.  When *string* is not a well-formed
        placeholder, return ``(None, None)`` if *noraise* is true, otherwise
        raise GitFat.DecodeError.
        """
        cookie = '#$# git-fat '
        text = _to_text(string)
        if text.startswith(cookie):
            parts = text[len(cookie):].split()
            try:
                digest = parts[0]
                size = int(parts[1]) if len(parts) > 1 else None
            except (IndexError, ValueError):
                # Cookie present but no digest / non-numeric size: treat it
                # like any other undecodable content below.
                pass
            else:
                return digest, size
        if noraise:
            return None, None
        raise GitFat.DecodeError('Could not decode %s' % (text,))

    def decode_stream(self, stream):
        'Return digest if git-fat cache, otherwise return iterator over entire file contents'
        preamble = stream.read(self.magiclen)
        try:
            return self.decode(preamble)
        except GitFat.DecodeError:
            # Not sure if this is the right behavior
            return itertools.chain([preamble], readblocks(stream)), None

    def decode_file(self, fname):
        """Decode the working-tree file *fname* if it is a placeholder.

        Returns ``(digest, size)`` for a placeholder, ``(False, None)`` when
        the file is missing or its size rules it out, and ``(None, None)``
        when it has the right size but does not decode.
        """
        # Fast check
        try:
            stat = os.stat(fname)
        except OSError:
            # Tracked but absent from the working tree: not a placeholder.
            return False, None
        if stat.st_size != self.magiclen:
            return False, None
        # read file
        with open(fname, 'rb') as stream:
            digest, size = self.decode_stream(stream)
        if isinstance(digest, str):
            return digest, size
        return None, size

    def decode_clean(self, body):
        '''
        Attempt to decode version in working tree. The tree version could be changed to have a more
        useful message than the machine-readable copy that goes into the repository. If the tree
        version decodes successfully, it indicates that the fat data is not currently available in
        this repository.
        '''
        digest, size = self.decode(body, noraise=True)
        return digest

    def filter_clean(self, instream, outstreamclean):
        """Store the content of *instream* and write its placeholder.

        The content is hashed while it is spooled to a temporary file in the
        object store, which is then renamed to its digest.  If the input is
        itself a placeholder (fat data not available locally), it is copied
        to *outstreamclean* verbatim and nothing is cached.  Both streams are
        byte streams.
        """
        h = hashlib.new('sha1')
        size = 0
        fd, tmpname = tempfile.mkstemp(dir=self.objdir)
        try:
            ishanging = False
            cached = False  # changes to True when file is cached
            with os.fdopen(fd, 'wb') as cache:
                outstream = cache
                firstblock = True
                for block in readblocks(instream):
                    if firstblock:
                        if len(block) == self.magiclen and self.decode_clean(block[0:self.magiclen]):
                            ishanging = True  # Working tree version is verbatim from repository (not smudged)
                            outstream = outstreamclean
                        firstblock = False
                    h.update(block)
                    size += len(block)
                    outstream.write(block)
                outstream.flush()
            digest = h.hexdigest()
            objfile = os.path.join(self.objdir, digest)
            if not ishanging:
                if os.path.exists(objfile):
                    self.verbose('git-fat filter-clean: cache already exists %s' % objfile)
                    os.remove(tmpname)
                else:
                    os.rename(tmpname, objfile)
                    self.verbose('git-fat filter-clean: caching to %s' % objfile)
                cached = True
                outstreamclean.write(_to_bytes(self.encode(digest, size)))
        finally:
            # The temp file may already have been removed or renamed above.
            if not cached and os.path.exists(tmpname):
                os.remove(tmpname)

    def cmd_filter_clean(self):
        '''
        The clean filter runs when a file is added to the index. It gets the "smudged" (tree)
        version of the file on stdin and produces the "clean" (repository) version on stdout.
        '''
        self.setup()
        self.filter_clean(_binary_stream(sys.stdin), _binary_stream(sys.stdout))

    def cmd_filter_smudge(self):
        """Smudge filter: expand a placeholder on stdin to the file on stdout.

        Input that is not a placeholder is passed through unchanged.  When the
        referenced object is not in the local store, the placeholder itself is
        written back.
        """
        self.setup()
        stdin = _binary_stream(sys.stdin)
        stdout = _binary_stream(sys.stdout)
        result, size = self.decode_stream(stdin)
        if isinstance(result, str):  # We got a digest
            objfile = os.path.join(self.objdir, result)
            try:
                # Only a failure to open means "missing"; errors while
                # copying must not be mistaken for it.
                fatobj = open(objfile, 'rb')
            except IOError:  # file not found
                self.verbose('git-fat filter-smudge: fat object missing %s' % objfile)
                # A legacy placeholder carries no size and cannot be
                # re-encoded in the sized format; re-emit it as it was read.
                if size is None:
                    placeholder = self.encode_v1(result, size)
                else:
                    placeholder = self.encode(result, size)
                stdout.write(_to_bytes(placeholder))  # could leave a better notice about how to recover this file
            else:
                with fatobj:
                    cat(fatobj, stdout)
                self.verbose('git-fat filter-smudge: restoring from %s' % objfile)
        else:  # We have an iterable over the original input.
            self.verbose('git-fat filter-smudge: not a managed file')
            cat_iter(result, stdout)

    def catalog_objects(self):
        """Return the set of object names present in the local store."""
        return set(os.listdir(self.objdir))

    def referenced_objects(self, rev=None, all=False):
        """Return the set of fat digests referenced by history.

        Walks every object reachable from *rev* (default ``HEAD``), or from
        all refs when *all* is true, and decodes each blob whose size matches
        a placeholder length.
        """
        referenced = set()
        if all:
            rev = '--all'
        elif rev is None:
            rev = self.revparse('HEAD')
        p1 = subprocess.Popen(['git', 'rev-list', '--objects', rev], stdout=subprocess.PIPE)
        p2 = subprocess.Popen(['git', 'cat-file', '--batch-check'],
                              stdin=subprocess.PIPE, stdout=subprocess.PIPE)

        def cut_sha1hash(input, output):
            # rev-list prints "<hash> [<path>]"; batch-check wants the hash only.
            try:
                for line in input:
                    output.write(line.split()[0] + b'\n')
            finally:
                output.close()  # always signal EOF so the reader below terminates

        # Feed p2 from a thread so reading p2.stdout here cannot deadlock.
        cut_thread = threading.Thread(target=cut_sha1hash, args=(p1.stdout, p2.stdin))
        cut_thread.start()
        for line in p2.stdout:
            objhash, objtype, size = _to_text(line).split()
            if objtype == 'blob' and int(size) in self.magiclens:
                try:
                    fathash = self.decode(subprocess.check_output(['git', 'cat-file', '-p', objhash]))[0]
                    referenced.add(fathash)
                except GitFat.DecodeError:
                    # Right size, but an ordinary blob rather than a placeholder.
                    pass
        cut_thread.join()
        p1.wait()
        p2.wait()
        return referenced

    def orphan_files(self):
        'generator for all orphan placeholders in the working tree'
        for raw in subprocess.check_output(['git', 'ls-files']).splitlines():
            fname = _to_text(raw)
            digest = self.decode_file(fname)[0]
            if digest:
                yield (digest, fname)

    def cmd_status(self, args):
        """Print orphan (referenced, not stored) and garbage (stored, not referenced) objects.

        With ``--all`` in *args*, consider all refs and also list every
        referenced object.
        """
        self.setup()
        catalog = self.catalog_objects()
        refargs = dict()
        if '--all' in args:
            refargs['all'] = True
        referenced = self.referenced_objects(**refargs)
        garbage = catalog - referenced
        orphans = referenced - catalog
        if '--all' in args:
            for obj in referenced:
                print(obj)
        if orphans:
            print('Orphan objects:')
            for orph in orphans:
                print(' ' + orph)
        if garbage:
            print('Garbage objects:')
            for g in garbage:
                print(' ' + g)

    def is_dirty(self):
        """Return True when the index or working tree differs from HEAD.

        ``git diff-index --quiet`` exits 0 when there are no differences, so
        a non-zero status means dirty.
        """
        return subprocess.call(['git', 'diff-index', '--quiet', 'HEAD']) != 0

    def cmd_push(self, args):
        'Push anything that I have stored and referenced'
        self.setup()
        # Default to push only those objects referenced by current HEAD
        # (includes history). Finer-grained pushing would be useful.
        pushall = '--all' in args
        files = self.referenced_objects(all=pushall) & self.catalog_objects()
        cmd = self.get_rsync_command(push=True)
        p = subprocess.Popen(cmd, stdin=subprocess.PIPE)
        p.communicate(input=_to_bytes('\x00'.join(files)))

    def checkout(self, show_orphans=False):
        'Update any stale files in the present working tree'
        for digest, fname in self.orphan_files():
            objpath = os.path.join(self.objdir, digest)
            if os.access(objpath, os.R_OK):
                print('Restoring %s -> %s' % (digest, fname))
                # The output of our smudge filter depends on the existence of
                # the file in .git/fat/objects, but git caches the file stat
                # from the previous time the file was smudged, therefore it
                # won't try to re-smudge. I don't know a git command that
                # specifically invalidates that cache, but touching the file
                # also does the trick.
                os.utime(fname, None)
                # This re-smudge is essentially a copy that restores permissions.
                subprocess.check_call(['git', 'checkout-index', '--index', '--force', fname])
            elif show_orphans:
                print('Data unavailable: %s %s' % (digest, fname))

    def cmd_pull(self, args):
        'Pull anything that I have referenced, but not stored'
        self.setup()
        refargs = dict()
        if '--all' in args:
            refargs['all'] = True
        for arg in args:
            # Only 40-character non-option arguments are treated as revisions.
            if arg.startswith('-') or len(arg) != 40:
                continue
            rev = self.revparse(arg)
            if rev:
                refargs['rev'] = rev
        files = self.referenced_objects(**refargs) - self.catalog_objects()
        cmd = self.get_rsync_command(push=False)
        p = subprocess.Popen(cmd, stdin=subprocess.PIPE)
        p.communicate(input=_to_bytes('\x00'.join(files)))
        self.checkout()

    def cmd_checkout(self, args):
        """Restore available fat files and report those whose data is missing."""
        self.checkout(show_orphans=True)

    def cmd_gc(self):
        """Delete stored objects that are not referenced from HEAD's history."""
        self.setup()  # listing the store requires the directory to exist
        garbage = self.catalog_objects() - self.referenced_objects()
        print('Unreferenced objects to remove: %d' % len(garbage))
        for obj in garbage:
            fname = os.path.join(self.objdir, obj)
            print('%10d %s' % (os.stat(fname).st_size, obj))
            os.remove(fname)

    def cmd_init(self):
        """Register the clean/smudge filters in git config unless already set."""
        self.setup()
        if gitconfig_get('filter.fat.clean') or gitconfig_get('filter.fat.smudge'):
            print('Git fat already configured, check configuration in .git/config')
        else:
            gitconfig_set('filter.fat.clean', 'git-fat filter-clean')
            gitconfig_set('filter.fat.smudge', 'git-fat filter-smudge')
            print('Initialized git fat')

    def gen_large_blobs(self, revs, threshsize):
        """Yield ``(objhash, size)`` for every blob larger than *threshsize*.

        *revs* is a single ``git rev-list`` revision argument (e.g.
        ``'--all'``).  A summary is sent to the verbose log once the
        generator is exhausted.
        """
        time0 = time.time()

        def hash_only(input, output):
            """The output of git rev-list --objects shows extra info for blobs, subdirectory trees, and tags.
            This truncates to one hash per line.
            """
            try:
                for line in input:
                    output.write(line.split()[0] + b'\n')
            finally:
                output.close()

        revlist = subprocess.Popen(['git', 'rev-list', revs, '--objects'],
                                   stdout=subprocess.PIPE, bufsize=-1)
        objcheck = subprocess.Popen(['git', 'cat-file', '--batch-check'],
                                    stdin=subprocess.PIPE, stdout=subprocess.PIPE, bufsize=-1)
        hashonly = threading.Thread(target=hash_only, args=(revlist.stdout, objcheck.stdin))
        hashonly.start()
        numblobs = 0
        numlarge = 0
        # Build dict with the sizes of all large blobs
        for line in objcheck.stdout:
            objhash, blob, size = _to_text(line).split()
            if blob != 'blob':
                continue
            size = int(size)
            numblobs += 1
            if size > threshsize:
                numlarge += 1
                yield objhash, size
        revlist.wait()
        objcheck.wait()
        hashonly.join()
        time1 = time.time()
        self.verbose('%d of %d blobs are > %d bytes [elapsed %.3fs]'
                     % (numlarge, numblobs, threshsize, time1 - time0))

    def cmd_find(self, args):
        """Print a .gitattributes line for every path that ever held a large blob.

        ``args[0]`` is the size threshold in bytes.  Lines are ordered by the
        largest size seen for the path, biggest first; the trailing comment
        shows that size and the number of distinct large sizes.
        """
        maxsize = int(args[0])
        blobsizes = dict(self.gen_large_blobs('--all', maxsize))
        time0 = time.time()
        # Find all names assumed by large blobs (those in blobsizes)
        pathsizes = collections.defaultdict(set)
        revlist = subprocess.Popen(['git', 'rev-list', '--all'], stdout=subprocess.PIPE, bufsize=-1)
        difftree = subprocess.Popen(['git', 'diff-tree', '--root', '--no-renames', '--no-commit-id',
                                     '--diff-filter=AMCR', '-r', '--stdin', '-z'],
                                    stdin=revlist.stdout, stdout=subprocess.PIPE)
        for newblob, modflag, path in difftreez_reader(difftree.stdout):
            bsize = blobsizes.get(newblob)
            if bsize:  # We care about this blob
                pathsizes[path].add(bsize)
        time1 = time.time()
        self.verbose('Found %d paths in %.3f s' % (len(pathsizes), time1 - time0))
        maxlen = max(map(len, pathsizes)) if pathsizes else 0
        for path, sizes in sorted(pathsizes.items(), key=lambda item: max(item[1]), reverse=True):
            print('%-*s filter=fat -text # %10d %d' % (maxlen, path, max(sizes), len(sizes)))
        revlist.wait()
        difftree.wait()

    def _clean_blob(self, blobhash):
        """Run blob *blobhash* through filter_clean; return the new blob's hash."""
        catfile = subprocess.Popen(['git', 'cat-file', 'blob', blobhash], stdout=subprocess.PIPE)
        hashobject = subprocess.Popen(['git', 'hash-object', '-w', '--stdin'],
                                      stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        failure = []

        def dofilter():
            try:
                self.filter_clean(catfile.stdout, hashobject.stdin)
            except Exception as exc:  # re-raised in the calling thread below
                failure.append(exc)
            finally:
                # Always send EOF, otherwise the read below would block forever.
                hashobject.stdin.close()

        filterclean = threading.Thread(target=dofilter)
        filterclean.start()
        cleanedobj = _to_text(hashobject.stdout.read()).rstrip()
        catfile.wait()
        hashobject.wait()
        filterclean.join()
        if failure:
            raise failure[0]
        return cleanedobj

    def cmd_index_filter(self, args):
        """Rewrite index entries for the listed files to their placeholders.

        ``args[0]`` names a file with one path per line.  For each listed
        path in the index, the blob is replaced by its cleaned version; the
        mapping from original to cleaned blob is cached under
        ``<gitdir>/fat/index-filter``.  With ``--manage-gitattributes`` a
        ``filter=fat -text`` line per listed file is appended to
        ``.gitattributes`` in the index.
        """
        self.setup()  # filter_clean spools into the object store
        manage_gitattributes = '--manage-gitattributes' in args
        with open(args[0], 'rb') as listing:
            filelist = set(_to_text(f).strip() for f in listing)
        lsfiles = subprocess.Popen(['git', 'ls-files', '-s'], stdout=subprocess.PIPE)
        updateindex = subprocess.Popen(['git', 'update-index', '--index-info'], stdin=subprocess.PIPE)
        for raw in lsfiles.stdout:
            line = _to_text(raw)
            mode, sep, tail = line.partition(' ')
            blobhash, sep, tail = tail.partition(' ')
            stageno, sep, tail = tail.partition('\t')
            filename = tail.strip()
            if filename not in filelist:
                continue
            # This file will contain the hash of the cleaned object
            hashfile = os.path.join(self.gitdir, 'fat', 'index-filter', blobhash)
            try:
                with open(hashfile) as known:
                    cleanedobj = known.read().rstrip()
            except IOError:
                cleanedobj = self._clean_blob(blobhash)
                mkdir_p(os.path.dirname(hashfile))
                with open(hashfile, 'w') as known:
                    known.write(cleanedobj + '\n')
            updateindex.stdin.write(_to_bytes('%s %s %s\t%s\n' % (mode, cleanedobj, stageno, filename)))
        if manage_gitattributes:
            try:
                mode, blobsha1, stageno, filename = _to_text(
                    subprocess.check_output(['git', 'ls-files', '-s', '.gitattributes'])).split()
                gitattributes_lines = [
                    _to_text(l)
                    for l in subprocess.check_output(['git', 'cat-file', 'blob', blobsha1]).splitlines()]
            except ValueError:  # Nothing to unpack, thus no file
                mode, stageno = '100644', '0'
                gitattributes_lines = []
            # Sorted for reproducible output; blank list entries are skipped.
            gitattributes_extra = ['%s filter=fat -text' % line.split()[0]
                                   for line in sorted(filelist) if line.split()]
            hashobject = subprocess.Popen(['git', 'hash-object', '-w', '--stdin'],
                                          stdin=subprocess.PIPE, stdout=subprocess.PIPE)
            stdout, stderr = hashobject.communicate(
                _to_bytes('\n'.join(gitattributes_lines + gitattributes_extra) + '\n'))
            updateindex.stdin.write(
                _to_bytes('%s %s %s\t%s\n' % (mode, _to_text(stdout).strip(), stageno, '.gitattributes')))
        updateindex.stdin.close()
        lsfiles.wait()
        updateindex.wait()


def main(argv=None):
    """Dispatch ``git fat <command> [args...]``; *argv* defaults to sys.argv."""
    if argv is None:
        argv = sys.argv
    fat = GitFat()
    cmd = argv[1] if len(argv) > 1 else ''
    if cmd == 'filter-clean':
        fat.cmd_filter_clean()
    elif cmd == 'filter-smudge':
        fat.cmd_filter_smudge()
    elif cmd == 'init':
        fat.cmd_init()
    elif cmd == 'status':
        fat.cmd_status(argv[2:])
    elif cmd == 'push':
        fat.cmd_push(argv[2:])
    elif cmd == 'pull':
        fat.cmd_pull(argv[2:])
    elif cmd == 'gc':
        fat.cmd_gc()
    elif cmd == 'checkout':
        fat.cmd_checkout(argv[2:])
    elif cmd == 'find':
        fat.cmd_find(argv[2:])
    elif cmd == 'index-filter':
        fat.cmd_index_filter(argv[2:])
    else:
        print('Usage: git fat [init|status|push|pull|gc|checkout|find|index-filter]', file=sys.stderr)


if __name__ == '__main__':
    main()