summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJed Brown <jed@59A2.org>2013-01-21 00:26:48 -0600
committerJed Brown <jed@59A2.org>2013-01-21 00:26:48 -0600
commit1f25e7f59d7fdf88ad2d1b759532b8ec8086de93 (patch)
treea4fba8f34c9e9ad274043265efb449a0364c0357
parent486dfb16c1b3e6859ee62b43d1a1e51d38c24b08 (diff)
downloadgit-fat-1f25e7f59d7fdf88ad2d1b759532b8ec8086de93.tar.gz
find and filter-index: experimental features for retroactive cleanup
-rw-r--r--README.md42
-rwxr-xr-xgit-fat113
2 files changed, 154 insertions, 1 deletions
diff --git a/README.md b/README.md
index e726ada..1abbf53 100644
--- a/README.md
+++ b/README.md
@@ -158,6 +158,48 @@ Everything is in place
treated specially.
* Synchronize fat files with `git fat push` and `git fat pull`.
+## Retroactive import using `git filter-branch` [Experimental]
+
+Sometimes large objects were added to a repository by accident or for
+lack of a better place to put them. _If_ you are willing to rewrite
+history, forcing everyone to reclone, you can retroactively manage those
+files with `git fat`. Be sure that you understand the consequences of
+`git filter-branch` before attempting this. This feature is experimental
+and irreversible, so be doubly careful with backups.
+
+### Step 1: Locate the fat files
+
+Run `git fat find THRESH_BYTES > fat-files` and inspect `fat-files` in
+an editor. Lines will be sorted by the maximum object size that has been
+at each path, and look like
+
+ something.big filter=fat -text # 8154677 1
+
+where the first number after the `#` is the number of bytes and the
+second number is the number of modifications that path has seen. You
+will normally filter out some of these paths using grep and/or an
+editor. When satisfied, remove the ends of the lines (including the `#`)
+and append to `.gitattributes`. It's best to `git checkout .` and commit
+at this time (likely enrolling some extant files into `git fat`).
+
+### Step 2: `filter-branch`
+
+Copy `.gitattributes` to `/tmp/fat-filter-files` and edit to remove
+everything after the file name (e.g., `sed 's/ \+filter=fat.*$//'`).
+Currently, this may only contain exact paths relative to the root of the
+repository. Finally, run
+
+ git filter-branch --index-filter \
+    'git fat index-filter /tmp/fat-filter-files' \
+ --tag-name-filter cat -- --all
+
+When this finishes, inspect to see if everything is in order and follow
+the
+[Checklist for Shrinking a Repository](http://www.kernel.org/pub/software/scm/git/docs/git-filter-branch.html#_checklist_for_shrinking_a_repository)
+in the `git filter-branch` man page, typically `git clone
+file:///path/to/repo`. Be sure to `git fat push` from the original
+repository.
+
## Implementation notes
The actual binary files are stored in `.git/fat/objects`, leaving `.git/objects` nice and small.
diff --git a/git-fat b/git-fat
index 66cc3a8..80c2364 100755
--- a/git-fat
+++ b/git-fat
@@ -11,6 +11,8 @@ import shlex
import shutil
import itertools
import threading
+import time
+import collections
BLOCK_SIZE = 4096
@@ -41,6 +43,29 @@ def cat_iter(initer, outstream):
outstream.write(block)
def cat(instream, outstream):
return cat_iter(readblocks(instream), outstream)
+def difftreez_reader(input):
+ """Incremental reader for git diff-tree -z output
+
+ :oldmode newmode oldsha1 newsha1 modflag\0filename\0:oldmode newmode ...
+ """
+ buffer = []
+ partial = ''
+ while True:
+ newread = input.read(BLOCK_SIZE)
+ if not newread:
+ break
+ partial += newread
+ while True:
+ head, sep, partial = partial.partition('\0')
+ if not sep:
+ partial = head
+ break
+ buffer.append(head)
+ if len(buffer) == 2:
+ oldmode, newmode, oldhash, newhash, modflag = buffer[0].split()
+ path = buffer[1]
+ yield (newhash, modflag, path)
+ buffer = []
def gitconfig_get(name, file=None):
args = ['git', 'config', '--get']
if file is not None:
@@ -307,6 +332,88 @@ class GitFat(object):
gitconfig_set('filter.fat.clean', 'git-fat filter-clean')
gitconfig_set('filter.fat.smudge', 'git-fat filter-smudge')
print('Initialized git fat')
+ def gen_large_blobs(self, revs, threshsize):
+ """Build dict of all blobs"""
+ time0 = time.time()
+ def keep_blobs(input, output):
+ """The output of git rev-list --objects shows the path only for blobs, so we can filter on that criteria"""
+ # Test with awk '{if (NF>1) print $1}' is 5-10% faster (and less code), but less portable
+ for line in input:
+ if len(line) != 40:
+ output.write(line[:40] + '\n')
+ output.close()
+ revlist = subprocess.Popen(['git', 'rev-list', '--all', '--objects'], stdout=subprocess.PIPE, bufsize=-1)
+ pblobcheck = subprocess.Popen(['git', 'cat-file', '--batch-check'], stdin=subprocess.PIPE, stdout=subprocess.PIPE, bufsize=-1)
+ keepblobs = threading.Thread(target=keep_blobs, args=(revlist.stdout, pblobcheck.stdin))
+ keepblobs.start()
+ numblobs = 0; numlarge = 1
+ # Build dict with the sizes of all large blobs
+ for line in pblobcheck.stdout:
+ objhash, blob, size = line.split()
+ size = int(size)
+ numblobs += 1
+ if size > threshsize:
+ numlarge += 1
+ yield objhash, size
+ revlist.wait()
+ pblobcheck.wait()
+ keepblobs.join()
+ time1 = time.time()
+ self.verbose('%d of %d blobs are >= %d bytes [elapsed %.3fs]' % (numlarge, numblobs, threshsize, time1-time0))
+ def cmd_find(self, args):
+ maxsize = int(args[0])
+ blobsizes = dict(self.gen_large_blobs('--all', maxsize))
+ time0 = time.time()
+ # Find all names assumed by large blobs (those in blobsizes)
+ pathsizes = collections.defaultdict(lambda:set())
+ revlist = subprocess.Popen(['git', 'rev-list', '--all'], stdout=subprocess.PIPE, bufsize=-1)
+ difftree = subprocess.Popen(['git', 'diff-tree', '--root', '--no-renames', '--no-commit-id', '--diff-filter=AMCR', '-r', '--stdin', '-z'],
+ stdin=revlist.stdout, stdout=subprocess.PIPE)
+ for newblob, modflag, path in difftreez_reader(difftree.stdout):
+ bsize = blobsizes.get(newblob)
+ if bsize: # We care about this blob
+ pathsizes[path].add(bsize)
+ time1 = time.time()
+ self.verbose('Found %d paths in %.3f s' % (len(pathsizes), time1-time0))
+ maxlen = max(map(len,pathsizes)) if pathsizes else 0
+ for path, sizes in sorted(pathsizes.items(), cmp=lambda (p1,s1),(p2,s2): cmp(max(s1),max(s2)), reverse=True):
+ print('%-*s filter=fat -text # %10d %d' % (maxlen, path,max(sizes),len(sizes)))
+ revlist.wait()
+ difftree.wait()
+ def cmd_index_filter(self, args):
+ filelist = set(f.strip() for f in open(args[0]).readlines())
+ lsfiles = subprocess.Popen(['git', 'ls-files', '-s'], stdout=subprocess.PIPE)
+ updateindex = subprocess.Popen(['git', 'update-index', '--index-info'], stdin=subprocess.PIPE)
+ for line in lsfiles.stdout:
+ mode, sep, tail = line.partition(' ')
+ blobhash, sep, tail = tail.partition(' ')
+ stageno, sep, tail = tail.partition('\t')
+ filename = tail.strip()
+ if filename not in filelist:
+ continue
+ # This file will contain the hash of the cleaned object
+ hashfile = os.path.join(self.gitdir, 'fat', 'index-filter', blobhash)
+ try:
+ cleanedobj = open(hashfile).read().rstrip()
+ except IOError:
+ catfile = subprocess.Popen(['git', 'cat-file', 'blob', blobhash], stdout=subprocess.PIPE)
+ hashobject = subprocess.Popen(['git', 'hash-object', '-w', '--stdin'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+ def dofilter():
+ self.filter_clean(catfile.stdout, hashobject.stdin)
+ hashobject.stdin.close()
+ filterclean = threading.Thread(target=dofilter)
+ filterclean.start()
+ cleanedobj = hashobject.stdout.read().rstrip()
+ catfile.wait()
+ hashobject.wait()
+ filterclean.join()
+ mkdir_p(os.path.dirname(hashfile))
+ open(hashfile, 'w').write(cleanedobj + '\n')
+ updateindex.stdin.write('%s %s %s\t%s\n' % (mode, cleanedobj, stageno, filename))
+ updateindex.stdin.close()
+ lsfiles.wait()
+ updateindex.wait()
+
if __name__ == '__main__':
fat = GitFat()
@@ -327,5 +434,9 @@ if __name__ == '__main__':
fat.cmd_gc()
elif cmd == 'checkout':
fat.cmd_checkout(sys.argv[2:])
+ elif cmd == 'find':
+ fat.cmd_find(sys.argv[2:])
+ elif cmd == 'index-filter':
+ fat.cmd_index_filter(sys.argv[2:])
else:
- print('Usage: git fat [init|status|push|pull|gc|checkout]', file=sys.stderr)
+ print('Usage: git fat [init|status|push|pull|gc|checkout|find|index-filter]', file=sys.stderr)