diff options
-rw-r--r-- | README.md | 42 | ||||
-rwxr-xr-x | git-fat | 113 |
2 files changed, 154 insertions, 1 deletions
@@ -158,6 +158,48 @@ Everything is in place treated specially. * Synchronize fat files with `git fat push` and `git fat pull`. +## Retroactive import using `git filter-branch` [Experimental] + +Sometimes large objects were added to a repository by accident or for +lack of a better place to put them. _If_ you are willing to rewrite +history, forcing everyone to reclone, you can retroactively manage those +files with `git fat`. Be sure that you understand the consequences of +`git filter-branch` before attempting this. This feature is experimental +and irreversible, so be doubly careful with backups. + +### Step 1: Locate the fat files + +Run `git fat find THRESH_BYTES > fat-files` and inspect `fat-files` in +an editor. Lines will be sorted by the maximum object size that has been +at each path, and look like + + something.big filter=fat -text # 8154677 1 + +where the first number after the `#` is the number of bytes and the +second number is the number of modifications that path has seen. You +will normally filter out some of these paths using grep and/or an +editor. When satisfied, remove the ends of the lines (including the `#`) +and append to `.gitattributes`. It's best to `git checkout .` and commit +at this time (likely enrolling some extant files into `git fat`). + +### Step 2: `filter-branch` + +Copy `.gitattributes` to `/tmp/fat-filter-files` and edit to remove +everything after the file name (e.g., `sed s/ \+filter=fat.*$//`). +Currently, this may only contain exact paths relative to the root of the +repository. Finally, run + + git filter-branch --index-filter \ + 'git fat index-filter /tmp/fat-filter-files' \ + --tag-name-filter cat -- --all + +When this finishes, inspect to see if everything is in order and follow +the +[Checklist for Shrinking a Repository](http://www.kernel.org/pub/software/scm/git/docs/git-filter-branch.html#_checklist_for_shrinking_a_repository) +in the `git filter-branch` man page, typically `git clone +file:///path/to/repo`. 
def difftreez_reader(input, blocksize=None):
    """Incremental reader for git diff-tree -z output

    The stream consists of repeated records of the form
        :oldmode newmode oldsha1 newsha1 modflag\0filename\0:oldmode newmode ...
    i.e. alternating NUL-terminated status and filename fields.

    Yields one (newhash, modflag, path) tuple per changed path.

    :param input: file-like object to read from (e.g. a subprocess pipe)
    :param blocksize: read chunk size; defaults to the module-level
        BLOCK_SIZE when not given (backward compatible)
    """
    if blocksize is None:
        blocksize = BLOCK_SIZE
    buffer = []
    partial = ''
    while True:
        newread = input.read(blocksize)
        if not newread:
            break
        partial += newread
        while True:
            head, sep, partial = partial.partition('\0')
            if not sep:
                # No complete NUL-terminated field in hand; stash the
                # fragment and wait for the next read.
                partial = head
                break
            buffer.append(head)
            if len(buffer) == 2:
                # buffer[0] is the colon-prefixed status field, buffer[1]
                # the filename that follows it.
                oldmode, newmode, oldhash, newhash, modflag = buffer[0].split()
                path = buffer[1]
                yield (newhash, modflag, path)
                buffer = []
threading.Thread(target=keep_blobs, args=(revlist.stdout, pblobcheck.stdin)) + keepblobs.start() + numblobs = 0; numlarge = 1 + # Build dict with the sizes of all large blobs + for line in pblobcheck.stdout: + objhash, blob, size = line.split() + size = int(size) + numblobs += 1 + if size > threshsize: + numlarge += 1 + yield objhash, size + revlist.wait() + pblobcheck.wait() + keepblobs.join() + time1 = time.time() + self.verbose('%d of %d blobs are >= %d bytes [elapsed %.3fs]' % (numlarge, numblobs, threshsize, time1-time0)) + def cmd_find(self, args): + maxsize = int(args[0]) + blobsizes = dict(self.gen_large_blobs('--all', maxsize)) + time0 = time.time() + # Find all names assumed by large blobs (those in blobsizes) + pathsizes = collections.defaultdict(lambda:set()) + revlist = subprocess.Popen(['git', 'rev-list', '--all'], stdout=subprocess.PIPE, bufsize=-1) + difftree = subprocess.Popen(['git', 'diff-tree', '--root', '--no-renames', '--no-commit-id', '--diff-filter=AMCR', '-r', '--stdin', '-z'], + stdin=revlist.stdout, stdout=subprocess.PIPE) + for newblob, modflag, path in difftreez_reader(difftree.stdout): + bsize = blobsizes.get(newblob) + if bsize: # We care about this blob + pathsizes[path].add(bsize) + time1 = time.time() + self.verbose('Found %d paths in %.3f s' % (len(pathsizes), time1-time0)) + maxlen = max(map(len,pathsizes)) if pathsizes else 0 + for path, sizes in sorted(pathsizes.items(), cmp=lambda (p1,s1),(p2,s2): cmp(max(s1),max(s2)), reverse=True): + print('%-*s filter=fat -text # %10d %d' % (maxlen, path,max(sizes),len(sizes))) + revlist.wait() + difftree.wait() + def cmd_index_filter(self, args): + filelist = set(f.strip() for f in open(args[0]).readlines()) + lsfiles = subprocess.Popen(['git', 'ls-files', '-s'], stdout=subprocess.PIPE) + updateindex = subprocess.Popen(['git', 'update-index', '--index-info'], stdin=subprocess.PIPE) + for line in lsfiles.stdout: + mode, sep, tail = line.partition(' ') + blobhash, sep, tail = 
tail.partition(' ') + stageno, sep, tail = tail.partition('\t') + filename = tail.strip() + if filename not in filelist: + continue + # This file will contain the hash of the cleaned object + hashfile = os.path.join(self.gitdir, 'fat', 'index-filter', blobhash) + try: + cleanedobj = open(hashfile).read().rstrip() + except IOError: + catfile = subprocess.Popen(['git', 'cat-file', 'blob', blobhash], stdout=subprocess.PIPE) + hashobject = subprocess.Popen(['git', 'hash-object', '-w', '--stdin'], stdin=subprocess.PIPE, stdout=subprocess.PIPE) + def dofilter(): + self.filter_clean(catfile.stdout, hashobject.stdin) + hashobject.stdin.close() + filterclean = threading.Thread(target=dofilter) + filterclean.start() + cleanedobj = hashobject.stdout.read().rstrip() + catfile.wait() + hashobject.wait() + filterclean.join() + mkdir_p(os.path.dirname(hashfile)) + open(hashfile, 'w').write(cleanedobj + '\n') + updateindex.stdin.write('%s %s %s\t%s\n' % (mode, cleanedobj, stageno, filename)) + updateindex.stdin.close() + lsfiles.wait() + updateindex.wait() + if __name__ == '__main__': fat = GitFat() @@ -327,5 +434,9 @@ if __name__ == '__main__': fat.cmd_gc() elif cmd == 'checkout': fat.cmd_checkout(sys.argv[2:]) + elif cmd == 'find': + fat.cmd_find(sys.argv[2:]) + elif cmd == 'index-filter': + fat.cmd_index_filter(sys.argv[2:]) else: - print('Usage: git fat [init|status|push|pull|gc|checkout]', file=sys.stderr) + print('Usage: git fat [init|status|push|pull|gc|checkout|find|index-filter]', file=sys.stderr) |