#!/usr/bin/env python
# -*- mode:python -*-

from __future__ import print_function, with_statement

import sys
import hashlib
import tempfile
import os
import subprocess
import shlex
import shutil
import itertools
import threading
import time
import collections

if not type(sys.version_info) is tuple and sys.version_info.major > 2:
    sys.stderr.write('git-fat does not support Python-3 yet. Please use python2.\n')
    sys.exit(1)

try:
    from subprocess import check_output
    del check_output
except ImportError:
    def backport_check_output(*popenargs, **kwargs):
        r"""Run command with arguments and return its output as a byte string.

        Backported from Python 2.7 as it's implemented as pure python on stdlib.

        >>> check_output(['/usr/bin/python', '--version'])
        Python 2.6.2
        """
        process = subprocess.Popen(stdout=subprocess.PIPE, *popenargs, **kwargs)
        output, unused_err = process.communicate()
        retcode = process.poll()
        if retcode:
            cmd = kwargs.get("args")
            if cmd is None:
                cmd = popenargs[0]
            error = subprocess.CalledProcessError(retcode, cmd)
            error.output = output
            raise error
        return output
    subprocess.check_output = backport_check_output

BLOCK_SIZE = 4096

def verbose_stderr(*args, **kwargs):
    return print(*args, file=sys.stderr, **kwargs)

def verbose_ignore(*args, **kwargs):
    pass

def mkdir_p(path):
    import errno
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise

def umask():
    """Get umask without changing it."""
    old = os.umask(0)
    os.umask(old)
    return old

def readblocks(stream):
    bytes = 0
    while True:
        data = stream.read(BLOCK_SIZE)
        bytes += len(data)
        if not data:
            break
        yield data

def cat_iter(initer, outstream):
    for block in initer:
        outstream.write(block)

def cat(instream, outstream):
    return cat_iter(readblocks(instream), outstream)

def difftreez_reader(input):
    """Incremental reader for git diff-tree -z output

    :oldmode newmode oldsha1 newsha1 modflag\0filename\0:oldmode newmode ...
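
    Yields (newhash, modflag, path) tuples; for example, a modified file
    comes out as ('<40-hex new blob sha1>', 'M', 'path/to/file').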
""" buffer = [] partial = '' while True: newread = input.read(BLOCK_SIZE) if not newread: break partial += newread while True: head, sep, partial = partial.partition('\0') if not sep: partial = head break buffer.append(head) if len(buffer) == 2: oldmode, newmode, oldhash, newhash, modflag = buffer[0].split() path = buffer[1] yield (newhash, modflag, path) buffer = [] def gitconfig_get(name, file=None): args = ['git', 'config', '--get'] if file is not None: args += ['--file', file] args.append(name) p = subprocess.Popen(args, stdout=subprocess.PIPE) output = p.communicate()[0].strip() if p.returncode and file is None: return None elif p.returncode: return gitconfig_get(name) else: return output def gitconfig_set(name, value, file=None): args = ['git', 'config'] if file is not None: args += ['--file', file] args += [name, value] p = subprocess.check_call(args) class GitFat(object): DecodeError = RuntimeError def __init__(self): self.verbose = verbose_stderr if os.environ.get('GIT_FAT_VERBOSE') else verbose_ignore try: self.gitroot = subprocess.check_output('git rev-parse --show-toplevel'.split()).strip() except subprocess.CalledProcessError: sys.exit(1) self.gitdir = subprocess.check_output('git rev-parse --git-dir'.split()).strip() self.objdir = os.path.join(self.gitdir, 'fat', 'objects') if os.environ.get('GIT_FAT_VERSION') == '1': self.encode = self.encode_v1 else: self.encode = self.encode_v2 def magiclen(enc): return len(enc(hashlib.sha1('dummy').hexdigest(), 5)) self.magiclen = magiclen(self.encode) # Current version self.magiclens = [magiclen(enc) for enc in [self.encode_v1, self.encode_v2]] # All prior versions def setup(self): mkdir_p(self.objdir) def is_init_done(self): return gitconfig_get('filter.fat.clean') or gitconfig_get('filter.fat.smudge') def assert_init_done(self): if not self.is_init_done(): sys.stderr.write('fatal: git-fat is not yet configured in this repository.\n') sys.stderr.write('Run "git fat init" to configure.\n') sys.exit(1) def get_rsync(self): cfgpath = os.path.join(self.gitroot,'.gitfat') remote = gitconfig_get('rsync.remote', file=cfgpath) ssh_port = gitconfig_get('rsync.sshport', file=cfgpath) ssh_user = gitconfig_get('rsync.sshuser', file=cfgpath) options = gitconfig_get('rsync.options', file=cfgpath) if remote is None: raise RuntimeError('No rsync.remote in %s' % cfgpath) return remote, ssh_port, ssh_user, options def get_rsync_command(self,push): (remote, ssh_port, ssh_user, options) = self.get_rsync() if push: self.verbose('Pushing to %s' % (remote)) else: self.verbose('Pulling from %s' % (remote)) cmd = ['rsync', '--progress', '--ignore-existing', '--from0', '--files-from=-'] rshopts = '' if ssh_user: rshopts += ' -l ' + ssh_user if ssh_port: rshopts += ' -p ' + ssh_port if rshopts: cmd.append('--rsh=ssh' + rshopts) if options: cmd += options.split(' ') if push: cmd += [self.objdir + '/', remote + '/'] else: cmd += [remote + '/', self.objdir + '/'] return cmd def revparse(self, revname): return subprocess.check_output(['git', 'rev-parse', revname]).strip() def encode_v1(self, digest, bytes): 'Produce legacy representation of file to be stored in repository.' return '#$# git-fat %s\n' % (digest,) def encode_v2(self, digest, bytes): 'Produce representation of file to be stored in repository. 20 characters can hold 64-bit integers.' 
        return '#$# git-fat %s %20d\n' % (digest, bytes)
    def decode(self, string, noraise=False):
        cookie = '#$# git-fat '
        if string.startswith(cookie):
            parts = string[len(cookie):].split()
            digest = parts[0]
            bytes = int(parts[1]) if len(parts) > 1 else None
            return digest, bytes
        elif noraise:
            return None, None
        else:
            raise GitFat.DecodeError('Could not decode %s' % (string))
    def decode_stream(self, stream):
        'Return digest if git-fat cache, otherwise return iterator over entire file contents'
        preamble = stream.read(self.magiclen)
        try:
            return self.decode(preamble)
        except GitFat.DecodeError:  # Not sure if this is the right behavior
            return itertools.chain([preamble], readblocks(stream)), None
    def decode_file(self, fname):
        # Fast check
        try:
            stat = os.lstat(fname)
        except OSError:
            return False, None
        if stat.st_size != self.magiclen:
            return False, None
        # read file
        try:
            digest, bytes = self.decode_stream(open(fname))
        except IOError:
            return False, None
        if isinstance(digest, str):
            return digest, bytes
        else:
            return None, bytes
    def decode_clean(self, body):
        '''
        Attempt to decode version in working tree. The tree version could be
        changed to have a more useful message than the machine-readable copy
        that goes into the repository. If the tree version decodes
        successfully, it indicates that the fat data is not currently
        available in this repository.
        '''
        digest, bytes = self.decode(body, noraise=True)
        return digest
    def filter_clean(self, instream, outstreamclean):
        h = hashlib.new('sha1')
        bytes = 0
        fd, tmpname = tempfile.mkstemp(dir=self.objdir)
        try:
            ishanging = False
            cached = False  # changes to True when file is cached
            with os.fdopen(fd, 'w') as cache:
                outstream = cache
                firstblock = True
                for block in readblocks(instream):
                    if firstblock:
                        if len(block) == self.magiclen and self.decode_clean(block[0:self.magiclen]):
                            ishanging = True  # Working tree version is verbatim from repository (not smudged)
                            outstream = outstreamclean
                        firstblock = False
                    h.update(block)
                    bytes += len(block)
                    outstream.write(block)
                outstream.flush()
            digest = h.hexdigest()
            objfile = os.path.join(self.objdir, digest)
            if not ishanging:
                if os.path.exists(objfile):
                    self.verbose('git-fat filter-clean: cache already exists %s' % objfile)
                    os.remove(tmpname)
                else:
                    # Set permissions for the new file using the current umask
                    os.chmod(tmpname, int('444', 8) & ~umask())
                    os.rename(tmpname, objfile)
                    self.verbose('git-fat filter-clean: caching to %s' % objfile)
                cached = True
                outstreamclean.write(self.encode(digest, bytes))
        finally:
            if not cached:
                os.remove(tmpname)
    def cmd_filter_clean(self):
        '''
        The clean filter runs when a file is added to the index. It gets the
        "smudged" (tree) version of the file on stdin and produces the
        "clean" (repository) version on stdout.
        '''
        self.setup()
        self.filter_clean(sys.stdin, sys.stdout)
    def cmd_filter_smudge(self):
        self.setup()
        result, bytes = self.decode_stream(sys.stdin)
        if isinstance(result, str):  # We got a digest
            objfile = os.path.join(self.objdir, result)
            try:
                cat(open(objfile), sys.stdout)
                self.verbose('git-fat filter-smudge: restoring from %s' % objfile)
            except IOError:  # file not found
                self.verbose('git-fat filter-smudge: fat object missing %s' % objfile)
                sys.stdout.write(self.encode(result, bytes))  # could leave a better notice about how to recover this file
        else:  # We have an iterable over the original input.
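            # The stream did not begin with a valid git-fat stub, so this
            # blob is not managed by git-fat; pass the bytes through as-is.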
            self.verbose('git-fat filter-smudge: not a managed file')
            cat_iter(result, sys.stdout)
    def catalog_objects(self):
        return set(os.listdir(self.objdir))
    def referenced_objects(self, rev=None, all=False):
        referenced = set()
        if all:
            rev = '--all'
        elif rev is None:
            rev = self.revparse('HEAD')
        # Revision list gives us object names to inspect with cat-file...
        p1 = subprocess.Popen(['git', 'rev-list', '--objects', rev], stdout=subprocess.PIPE)
        def cut_sha1hash(input, output):
            for line in input:
                output.write(line.split()[0] + '\n')
            output.close()
        # ...`cat-file --batch-check` filters for git-fat object candidates in bulk...
        p2 = subprocess.Popen(['git', 'cat-file', '--batch-check'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        def filter_gitfat_candidates(input, output):
            for line in input:
                objhash, objtype, size = line.split()
                if objtype == 'blob' and int(size) in self.magiclens:
                    output.write(objhash + '\n')
            output.close()
        # ...`cat-file --batch` provides full contents of git-fat candidates in bulk
        p3 = subprocess.Popen(['git', 'cat-file', '--batch'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        # Stream data: p1 | cut_thread | p2 | filter_thread | p3
        cut_thread = threading.Thread(target=cut_sha1hash, args=(p1.stdout, p2.stdin))
        filter_thread = threading.Thread(target=filter_gitfat_candidates, args=(p2.stdout, p3.stdin))
        cut_thread.start()
        filter_thread.start()
        # Process metadata + content format provided by `cat-file --batch`
        while True:
            metadata_line = p3.stdout.readline()
            if not metadata_line:
                break  # EOF
            objhash, objtype, size_str = metadata_line.split()
            size, bytes_read = int(size_str), 0
            # We know from the filter that the item is a candidate git-fat
            # object and is small enough to read into memory and process
            content = ''
            while bytes_read < size:
                data = p3.stdout.read(size - bytes_read)
                if not data:
                    break  # EOF
                content += data
                bytes_read += len(data)
            try:
                fathash = self.decode(content)[0]
                referenced.add(fathash)
            except GitFat.DecodeError:
                pass
            # Consume LF record delimiter in `cat-file --batch` output
            bytes_read = 0
            while bytes_read < 1:
                data = p3.stdout.read(1)
                if not data:
                    break  # EOF
                bytes_read += len(data)
        # Ensure everything is cleaned up
        cut_thread.join()
        filter_thread.join()
        p1.wait()
        p2.wait()
        p3.wait()
        return referenced
    def orphan_files(self, patterns=[]):
        'generator for all orphan placeholders in the working tree'
        if not patterns or patterns == ['']:
            patterns = ['.']
        for fname in subprocess.check_output(['git', 'ls-files', '-z'] + patterns).split('\x00')[:-1]:
            digest = self.decode_file(fname)[0]
            if digest:
                yield (digest, fname)
    def cmd_status(self, args):
        self.setup()
        catalog = self.catalog_objects()
        refargs = dict()
        if '--all' in args:
            refargs['all'] = True
        referenced = self.referenced_objects(**refargs)
        garbage = catalog - referenced
        orphans = referenced - catalog
        if '--all' in args:
            for obj in referenced:
                print(obj)
        if orphans:
            print('Orphan objects:')
            for orph in orphans:
                print('    ' + orph)
        if garbage:
            print('Garbage objects:')
            for g in garbage:
                print('    ' + g)
    def is_dirty(self):
        # `git diff-index --quiet HEAD` exits 0 when the tree matches HEAD and
        # non-zero when it differs, so a dirty tree means a non-zero status.
        return subprocess.call(['git', 'diff-index', '--quiet', 'HEAD']) != 0
    def cmd_push(self, args):
        'Push anything that I have stored and referenced'
        self.setup()
        # Default to push only those objects referenced by current HEAD
        # (includes history). Finer-grained pushing would be useful.
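        # The object list is streamed to rsync via --files-from=- on stdin;
        # --from0 (set in get_rsync_command) makes that list NUL-delimited,
        # which is why the names are joined with '\x00' below.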
        pushall = '--all' in args
        files = self.referenced_objects(all=pushall) & self.catalog_objects()
        cmd = self.get_rsync_command(push=True)
        self.verbose('Executing: %s' % ' '.join(cmd))
        p = subprocess.Popen(cmd, stdin=subprocess.PIPE)
        p.communicate(input='\x00'.join(files))
        if p.returncode:
            sys.exit(p.returncode)
    def checkout(self, show_orphans=False):
        'Update any stale files in the present working tree'
        self.assert_init_done()
        for digest, fname in self.orphan_files():
            objpath = os.path.join(self.objdir, digest)
            if os.access(objpath, os.R_OK):
                print('Restoring %s -> %s' % (digest, fname))
                # The output of our smudge filter depends on the existence of
                # the file in .git/fat/objects, but git caches the file stat
                # from the previous time the file was smudged, therefore it
                # won't try to re-smudge. I don't know a git command that
                # specifically invalidates that cache, but changing the mtime
                # on the file will invalidate the cache.
                # Here we set the mtime to mtime + 1. This is an improvement
                # over touching the file, as it catches the edge case where a
                # git checkout happens within the same second as a git fat
                # checkout.
                stat = os.lstat(fname)
                os.utime(fname, (stat.st_atime, stat.st_mtime + 1))
                # This re-smudge is essentially a copy that restores
                # permissions.
                subprocess.check_call(
                    ['git', 'checkout-index', '--index', '--force', fname])
            elif show_orphans:
                print('Data unavailable: %s %s' % (digest, fname))
    def cmd_pull(self, args):
        'Pull anything that I have referenced, but not stored'
        self.setup()
        refargs = dict()
        if '--all' in args:
            refargs['all'] = True
        for arg in args:
            if arg.startswith('-') or len(arg) != 40:
                continue
            rev = self.revparse(arg)
            if rev:
                refargs['rev'] = rev
        files = self.filter_objects(refargs, self.parse_pull_patterns(args))
        cmd = self.get_rsync_command(push=False)
        self.verbose('Executing: %s' % ' '.join(cmd))
        p = subprocess.Popen(cmd, stdin=subprocess.PIPE)
        p.communicate(input='\x00'.join(files))
        if p.returncode:
            sys.exit(p.returncode)
        self.checkout()
    def parse_pull_patterns(self, args):
        if '--' not in args:
            return ['']
        else:
            idx = args.index('--')
            patterns = args[idx + 1:]  # we don't care about '--'
            return patterns
    def filter_objects(self, refargs, patterns):
        files = self.referenced_objects(**refargs) - self.catalog_objects()
        if refargs.get('all'):  # Currently ignores patterns; can we efficiently do both?
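            # With --all we want every missing referenced object, including
            # ones with no placeholder in the current working tree, so the
            # working-tree pattern match below cannot be applied here.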
            return files
        orphans_matched = list(self.orphan_files(patterns))
        orphans_objects = set(map(lambda x: x[0], orphans_matched))
        return files & orphans_objects
    def cmd_checkout(self, args):
        self.checkout(show_orphans=True)
    def cmd_gc(self):
        garbage = self.catalog_objects() - self.referenced_objects()
        print('Unreferenced objects to remove: %d' % len(garbage))
        for obj in garbage:
            fname = os.path.join(self.objdir, obj)
            print('%10d %s' % (os.stat(fname).st_size, obj))
            os.remove(fname)
    def cmd_verify(self):
        """Print details of git-fat objects with incorrect data hash"""
        corrupted_objects = []
        for obj in self.catalog_objects():
            fname = os.path.join(self.objdir, obj)
            h = hashlib.new('sha1')
            for block in readblocks(open(fname)):
                h.update(block)
            data_hash = h.hexdigest()
            if obj != data_hash:
                corrupted_objects.append((obj, data_hash))
        if corrupted_objects:
            print('Corrupted objects: %d' % len(corrupted_objects))
            for obj, data_hash in corrupted_objects:
                print('%s data hash is %s' % (obj, data_hash))
            sys.exit(1)
    def cmd_init(self):
        self.setup()
        if self.is_init_done():
            print('Git fat already configured, check configuration in .git/config')
        else:
            gitconfig_set('filter.fat.clean', 'git-fat filter-clean')
            gitconfig_set('filter.fat.smudge', 'git-fat filter-smudge')
            print('Initialized git fat')
    def gen_large_blobs(self, revs, threshsize):
        """Generate (hash, size) pairs for all blobs larger than threshsize"""
        time0 = time.time()
        def hash_only(input, output):
            """The output of git rev-list --objects shows extra info for blobs,
            subdirectory trees, and tags. This truncates to one hash per line.
            """
            for line in input:
                output.write(line[:40] + '\n')
            output.close()
        revlist = subprocess.Popen(['git', 'rev-list', revs, '--objects'], stdout=subprocess.PIPE, bufsize=-1)
        objcheck = subprocess.Popen(['git', 'cat-file', '--batch-check'], stdin=subprocess.PIPE, stdout=subprocess.PIPE, bufsize=-1)
        hashonly = threading.Thread(target=hash_only, args=(revlist.stdout, objcheck.stdin))
        hashonly.start()
        numblobs = 0
        numlarge = 0
        # Yield the hashes and sizes of all large blobs
        for line in objcheck.stdout:
            objhash, blob, size = line.split()
            if blob != 'blob':
                continue
            size = int(size)
            numblobs += 1
            if size > threshsize:
                numlarge += 1
                yield objhash, size
        revlist.wait()
        objcheck.wait()
        hashonly.join()
        time1 = time.time()
        self.verbose('%d of %d blobs are > %d bytes [elapsed %.3fs]' % (numlarge, numblobs, threshsize, time1 - time0))
    def cmd_find(self, args):
        maxsize = int(args[0])
        blobsizes = dict(self.gen_large_blobs('--all', maxsize))
        time0 = time.time()
        # Find all names assumed by large blobs (those in blobsizes)
        pathsizes = collections.defaultdict(set)
        revlist = subprocess.Popen(['git', 'rev-list', '--all'], stdout=subprocess.PIPE, bufsize=-1)
        difftree = subprocess.Popen(['git', 'diff-tree', '--root', '--no-renames', '--no-commit-id', '--diff-filter=AMCR', '-r', '--stdin', '-z'],
                                    stdin=revlist.stdout, stdout=subprocess.PIPE)
        for newblob, modflag, path in difftreez_reader(difftree.stdout):
            bsize = blobsizes.get(newblob)
            if bsize:  # We care about this blob
                pathsizes[path].add(bsize)
        time1 = time.time()
        self.verbose('Found %d paths in %.3f s' % (len(pathsizes), time1 - time0))
        maxlen = max(map(len, pathsizes)) if pathsizes else 0
        for path, sizes in sorted(pathsizes.items(), key=lambda ps: max(ps[1]), reverse=True):
            print('%-*s filter=fat -text # %10d %d' % (maxlen, path, max(sizes), len(sizes)))
        revlist.wait()
        difftree.wait()
    def cmd_index_filter(self, args):
        manage_gitattributes = '--manage-gitattributes' in args
        filelist = set(f.strip() for f in open(args[0]).readlines())
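        # Intended for use as an index filter when rewriting history, e.g.
        # `git filter-branch --index-filter 'git fat index-filter <listfile>'`:
        # each listed path is rewritten in the index to its git-fat stub
        # without touching the working tree.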
        lsfiles = subprocess.Popen(['git', 'ls-files', '-s'], stdout=subprocess.PIPE)
        updateindex = subprocess.Popen(['git', 'update-index', '--index-info'], stdin=subprocess.PIPE)
        for line in lsfiles.stdout:
            mode, sep, tail = line.partition(' ')
            blobhash, sep, tail = tail.partition(' ')
            stageno, sep, tail = tail.partition('\t')
            filename = tail.strip()
            if filename not in filelist:
                continue
            if mode == "120000":
                # skip symbolic links
                continue
            # This file will contain the hash of the cleaned object
            hashfile = os.path.join(self.gitdir, 'fat', 'index-filter', blobhash)
            try:
                cleanedobj = open(hashfile).read().rstrip()
            except IOError:
                catfile = subprocess.Popen(['git', 'cat-file', 'blob', blobhash], stdout=subprocess.PIPE)
                hashobject = subprocess.Popen(['git', 'hash-object', '-w', '--stdin'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
                def dofilter():
                    self.filter_clean(catfile.stdout, hashobject.stdin)
                    hashobject.stdin.close()
                filterclean = threading.Thread(target=dofilter)
                filterclean.start()
                cleanedobj = hashobject.stdout.read().rstrip()
                catfile.wait()
                hashobject.wait()
                filterclean.join()
                mkdir_p(os.path.dirname(hashfile))
                open(hashfile, 'w').write(cleanedobj + '\n')
            updateindex.stdin.write('%s %s %s\t%s\n' % (mode, cleanedobj, stageno, filename))
        if manage_gitattributes:
            try:
                mode, blobsha1, stageno, filename = subprocess.check_output(['git', 'ls-files', '-s', '.gitattributes']).split()
                gitattributes_lines = subprocess.check_output(['git', 'cat-file', 'blob', blobsha1]).splitlines()
            except ValueError:  # Nothing to unpack, thus no file
                mode, stageno = '100644', '0'
                gitattributes_lines = []
            gitattributes_extra = ['%s filter=fat -text' % line.split()[0] for line in filelist]
            hashobject = subprocess.Popen(['git', 'hash-object', '-w', '--stdin'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
            stdout, stderr = hashobject.communicate('\n'.join(gitattributes_lines + gitattributes_extra) + '\n')
            updateindex.stdin.write('%s %s %s\t%s\n' % (mode, stdout.strip(), stageno, '.gitattributes'))
        updateindex.stdin.close()
        lsfiles.wait()
        updateindex.wait()

if __name__ == '__main__':
    fat = GitFat()
    cmd = sys.argv[1] if len(sys.argv) > 1 else ''
    if cmd == 'filter-clean':
        fat.cmd_filter_clean()
    elif cmd == 'filter-smudge':
        fat.cmd_filter_smudge()
    elif cmd == 'init':
        fat.cmd_init()
    elif cmd == 'status':
        fat.cmd_status(sys.argv[2:])
    elif cmd == 'push':
        fat.cmd_push(sys.argv[2:])
    elif cmd == 'pull':
        fat.cmd_pull(sys.argv[2:])
    elif cmd == 'gc':
        fat.cmd_gc()
    elif cmd == 'verify':
        fat.cmd_verify()
    elif cmd == 'checkout':
        fat.cmd_checkout(sys.argv[2:])
    elif cmd == 'find':
        fat.cmd_find(sys.argv[2:])
    elif cmd == 'index-filter':
        fat.cmd_index_filter(sys.argv[2:])
    else:
        print('Usage: git fat [init|status|push|pull|gc|verify|checkout|find|index-filter]', file=sys.stderr)
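
# Example session (a sketch; assumes this script is installed on PATH as
# `git-fat`, as the filter commands configured by `git fat init` require,
# and that the repository's .gitfat file sets rsync.remote):
#
#   git fat init
#   echo '*.tar.gz filter=fat -text' >> .gitattributes
#   git add .gitattributes big.tar.gz
#   git commit -m 'Track big tarballs with git-fat'
#   git fat push    # rsync cached objects to the remote
#   git fat pull    # fetch missing objects and restore stubs in the worktree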