diff options
author | Jed Brown <jed@59A2.org> | 2013-01-21 00:26:48 -0600 |
---|---|---|
committer | Jed Brown <jed@59A2.org> | 2013-01-21 00:26:48 -0600 |
commit | 1f25e7f59d7fdf88ad2d1b759532b8ec8086de93 (patch) | |
tree | a4fba8f34c9e9ad274043265efb449a0364c0357 /git-fat | |
parent | 486dfb16c1b3e6859ee62b43d1a1e51d38c24b08 (diff) | |
download | git-fat-1f25e7f59d7fdf88ad2d1b759532b8ec8086de93.tar.gz |
find and filter-index: experimental features for retroactive cleanup
Diffstat (limited to 'git-fat')
-rwxr-xr-x | git-fat | 113 |
1 files changed, 112 insertions, 1 deletions
@@ -11,6 +11,8 @@ import shlex
 import shutil
 import itertools
 import threading
+import time
+import collections
 
 BLOCK_SIZE = 4096
 
@@ -41,6 +43,29 @@ def cat_iter(initer, outstream):
         outstream.write(block)
 def cat(instream, outstream):
     return cat_iter(readblocks(instream), outstream)
+def difftreez_reader(input):
+    """Incremental reader for git diff-tree -z output
+
+    :oldmode newmode oldsha1 newsha1 modflag\0filename\0:oldmode newmode ...
+    """
+    buffer = []
+    partial = ''
+    while True:
+        newread = input.read(BLOCK_SIZE)
+        if not newread:
+            break
+        partial += newread
+        while True:
+            head, sep, partial = partial.partition('\0')
+            if not sep:
+                partial = head
+                break
+            buffer.append(head)
+            if len(buffer) == 2:
+                oldmode, newmode, oldhash, newhash, modflag = buffer[0].split()
+                path = buffer[1]
+                yield (newhash, modflag, path)
+                buffer = []
 def gitconfig_get(name, file=None):
     args = ['git', 'config', '--get']
     if file is not None:
@@ -307,6 +332,88 @@ class GitFat(object):
         gitconfig_set('filter.fat.clean', 'git-fat filter-clean')
         gitconfig_set('filter.fat.smudge', 'git-fat filter-smudge')
         print('Initialized git fat')
+    def gen_large_blobs(self, revs, threshsize):
+        """Build dict of all blobs"""
+        time0 = time.time()
+        def keep_blobs(input, output):
+            """The output of git rev-list --objects shows the path only for blobs, so we can filter on that criteria"""
+            # Test with awk '{if (NF>1) print $1}' is 5-10% faster (and less code), but less portable
+            for line in input:
+                if len(line) != 40:
+                    output.write(line[:40] + '\n')
+            output.close()
+        revlist = subprocess.Popen(['git', 'rev-list', '--all', '--objects'], stdout=subprocess.PIPE, bufsize=-1)
+        pblobcheck = subprocess.Popen(['git', 'cat-file', '--batch-check'], stdin=subprocess.PIPE, stdout=subprocess.PIPE, bufsize=-1)
+        keepblobs = threading.Thread(target=keep_blobs, args=(revlist.stdout, pblobcheck.stdin))
+        keepblobs.start()
+        numblobs = 0; numlarge = 1
+        # Build dict with the sizes of all large blobs
+        for line in pblobcheck.stdout:
+            objhash, blob, size = line.split()
+            size = int(size)
+            numblobs += 1
+            if size > threshsize:
+                numlarge += 1
+                yield objhash, size
+        revlist.wait()
+        pblobcheck.wait()
+        keepblobs.join()
+        time1 = time.time()
+        self.verbose('%d of %d blobs are >= %d bytes [elapsed %.3fs]' % (numlarge, numblobs, threshsize, time1-time0))
+    def cmd_find(self, args):
+        maxsize = int(args[0])
+        blobsizes = dict(self.gen_large_blobs('--all', maxsize))
+        time0 = time.time()
+        # Find all names assumed by large blobs (those in blobsizes)
+        pathsizes = collections.defaultdict(lambda:set())
+        revlist = subprocess.Popen(['git', 'rev-list', '--all'], stdout=subprocess.PIPE, bufsize=-1)
+        difftree = subprocess.Popen(['git', 'diff-tree', '--root', '--no-renames', '--no-commit-id', '--diff-filter=AMCR', '-r', '--stdin', '-z'],
+                                    stdin=revlist.stdout, stdout=subprocess.PIPE)
+        for newblob, modflag, path in difftreez_reader(difftree.stdout):
+            bsize = blobsizes.get(newblob)
+            if bsize: # We care about this blob
+                pathsizes[path].add(bsize)
+        time1 = time.time()
+        self.verbose('Found %d paths in %.3f s' % (len(pathsizes), time1-time0))
+        maxlen = max(map(len,pathsizes)) if pathsizes else 0
+        for path, sizes in sorted(pathsizes.items(), cmp=lambda (p1,s1),(p2,s2): cmp(max(s1),max(s2)), reverse=True):
+            print('%-*s filter=fat -text # %10d %d' % (maxlen, path,max(sizes),len(sizes)))
+        revlist.wait()
+        difftree.wait()
+    def cmd_index_filter(self, args):
+        filelist = set(f.strip() for f in open(args[0]).readlines())
+        lsfiles = subprocess.Popen(['git', 'ls-files', '-s'], stdout=subprocess.PIPE)
+        updateindex = subprocess.Popen(['git', 'update-index', '--index-info'], stdin=subprocess.PIPE)
+        for line in lsfiles.stdout:
+            mode, sep, tail = line.partition(' ')
+            blobhash, sep, tail = tail.partition(' ')
+            stageno, sep, tail = tail.partition('\t')
+            filename = tail.strip()
+            if filename not in filelist:
+                continue
+            # This file will contain the hash of the cleaned object
+            hashfile = os.path.join(self.gitdir, 'fat', 'index-filter', blobhash)
+            try:
+                cleanedobj = open(hashfile).read().rstrip()
+            except IOError:
+                catfile = subprocess.Popen(['git', 'cat-file', 'blob', blobhash], stdout=subprocess.PIPE)
+                hashobject = subprocess.Popen(['git', 'hash-object', '-w', '--stdin'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+                def dofilter():
+                    self.filter_clean(catfile.stdout, hashobject.stdin)
+                    hashobject.stdin.close()
+                filterclean = threading.Thread(target=dofilter)
+                filterclean.start()
+                cleanedobj = hashobject.stdout.read().rstrip()
+                catfile.wait()
+                hashobject.wait()
+                filterclean.join()
+                mkdir_p(os.path.dirname(hashfile))
+                open(hashfile, 'w').write(cleanedobj + '\n')
+            updateindex.stdin.write('%s %s %s\t%s\n' % (mode, cleanedobj, stageno, filename))
+        updateindex.stdin.close()
+        lsfiles.wait()
+        updateindex.wait()
+
 
 if __name__ == '__main__':
     fat = GitFat()
@@ -327,5 +434,9 @@ if __name__ == '__main__':
         fat.cmd_gc()
     elif cmd == 'checkout':
         fat.cmd_checkout(sys.argv[2:])
+    elif cmd == 'find':
+        fat.cmd_find(sys.argv[2:])
+    elif cmd == 'index-filter':
+        fat.cmd_index_filter(sys.argv[2:])
     else:
-        print('Usage: git fat [init|status|push|pull|gc|checkout]', file=sys.stderr)
+        print('Usage: git fat [init|status|push|pull|gc|checkout|find|index-filter]', file=sys.stderr)