summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJed Brown <jed@59A2.org>2013-01-21 00:26:48 -0600
committerJed Brown <jed@59A2.org>2013-01-21 00:26:48 -0600
commit1f25e7f59d7fdf88ad2d1b759532b8ec8086de93 (patch)
treea4fba8f34c9e9ad274043265efb449a0364c0357
parent486dfb16c1b3e6859ee62b43d1a1e51d38c24b08 (diff)
downloadgit-fat-1f25e7f59d7fdf88ad2d1b759532b8ec8086de93.tar.gz
find and filter-index: experimental features for retroactive cleanup
-rw-r--r--README.md42
-rwxr-xr-xgit-fat113
2 files changed, 154 insertions, 1 deletions
diff --git a/README.md b/README.md
index e726ada..1abbf53 100644
--- a/README.md
+++ b/README.md
@@ -158,6 +158,48 @@ Everything is in place
treated specially.
* Synchronize fat files with `git fat push` and `git fat pull`.
+## Retroactive import using `git filter-branch` [Experimental]
+
+Sometimes large objects were added to a repository by accident or for
+lack of a better place to put them. _If_ you are willing to rewrite
+history, forcing everyone to reclone, you can retroactively manage those
+files with `git fat`. Be sure that you understand the consequences of
+`git filter-branch` before attempting this. This feature is experimental
+and irreversible, so be doubly careful with backups.
+
+### Step 1: Locate the fat files
+
+Run `git fat find THRESH_BYTES > fat-files` and inspect `fat-files` in
+an editor. Lines will be sorted by the maximum object size that has been
+at each path, and look like
+
+ something.big filter=fat -text # 8154677 1
+
+where the first number after the `#` is the number of bytes and the
+second number is the number of modifications that path has seen. You
+will normally filter out some of these paths using grep and/or an
+editor. When satisfied, remove the ends of the lines (including the `#`)
+and append to `.gitattributes`. It's best to `git checkout .` and commit
+at this time (likely enrolling some extant files into `git fat`).
+
+### Step 2: `filter-branch`
+
+Copy `.gitattributes` to `/tmp/fat-filter-files` and edit to remove
+everything after the file name (e.g., `sed 's/ \+filter=fat.*$//'`).
+Currently, this may only contain exact paths relative to the root of the
+repository. Finally, run
+
+ git filter-branch --index-filter \
+    'git fat index-filter /tmp/fat-filter-files' \
+ --tag-name-filter cat -- --all
+
+When this finishes, inspect to see if everything is in order and follow
+the
+[Checklist for Shrinking a Repository](http://www.kernel.org/pub/software/scm/git/docs/git-filter-branch.html#_checklist_for_shrinking_a_repository)
+in the `git filter-branch` man page, typically `git clone
+file:///path/to/repo`. Be sure to `git fat push` from the original
+repository.
+
## Implementation notes
The actual binary files are stored in `.git/fat/objects`, leaving `.git/objects` nice and small.
diff --git a/git-fat b/git-fat
index 66cc3a8..80c2364 100755
--- a/git-fat
+++ b/git-fat
@@ -11,6 +11,8 @@ import shlex
import shutil
import itertools
import threading
+import time
+import collections
BLOCK_SIZE = 4096
@@ -41,6 +43,29 @@ def cat_iter(initer, outstream):
outstream.write(block)
def cat(instream, outstream):
return cat_iter(readblocks(instream), outstream)
+def difftreez_reader(input):
+ """Incremental reader for git diff-tree -z output
+
+ :oldmode newmode oldsha1 newsha1 modflag\0filename\0:oldmode newmode ...
+ """
+ buffer = []
+ partial = ''
+ while True:
+ newread = input.read(BLOCK_SIZE)
+ if not newread:
+ break
+ partial += newread
+ while True:
+ head, sep, partial = partial.partition('\0')
+ if not sep:
+ partial = head
+ break
+ buffer.append(head)
+ if len(buffer) == 2:
+ oldmode, newmode, oldhash, newhash, modflag = buffer[0].split()
+ path = buffer[1]
+ yield (newhash, modflag, path)
+ buffer = []
def gitconfig_get(name, file=None):
args = ['git', 'config', '--get']
if file is not None:
@@ -307,6 +332,88 @@ class GitFat(object):
gitconfig_set('filter.fat.clean', 'git-fat filter-clean')
gitconfig_set('filter.fat.smudge', 'git-fat filter-smudge')
print('Initialized git fat')
+ def gen_large_blobs(self, revs, threshsize):
+ """Build dict of all blobs"""
+ time0 = time.time()
+ def keep_blobs(input, output):
+ """The output of git rev-list --objects shows the path only for blobs, so we can filter on that criteria"""
+ # Test with awk '{if (NF>1) print $1}' is 5-10% faster (and less code), but less portable
+ for line in input:
+ if len(line) != 40:
+ output.write(line[:40] + '\n')
+ output.close()
+ revlist = subprocess.Popen(['git', 'rev-list', '--all', '--objects'], stdout=subprocess.PIPE, bufsize=-1)
+ pblobcheck = subprocess.Popen(['git', 'cat-file', '--batch-check'], stdin=subprocess.PIPE, stdout=subprocess.PIPE, bufsize=-1)
+ keepblobs = threading.Thread(target=keep_blobs, args=(revlist.stdout, pblobcheck.stdin))
+ keepblobs.start()
+ numblobs = 0; numlarge = 1
+ # Build dict with the sizes of all large blobs
+ for line in pblobcheck.stdout:
+ objhash, blob, size = line.split()
+ size = int(size)
+ numblobs += 1
+ if size > threshsize:
+ numlarge += 1
+ yield objhash, size
+ revlist.wait()
+ pblobcheck.wait()
+ keepblobs.join()
+ time1 = time.time()
+ self.verbose('%d of %d blobs are >= %d bytes [elapsed %.3fs]' % (numlarge, numblobs, threshsize, time1-time0))
+ def cmd_find(self, args):
+ maxsize = int(args[0])
+ blobsizes = dict(self.gen_large_blobs('--all', maxsize))
+ time0 = time.time()
+ # Find all names assumed by large blobs (those in blobsizes)
+ pathsizes = collections.defaultdict(lambda:set())
+ revlist = subprocess.Popen(['git', 'rev-list', '--all'], stdout=subprocess.PIPE, bufsize=-1)
+ difftree = subprocess.Popen(['git', 'diff-tree', '--root', '--no-renames', '--no-commit-id', '--diff-filter=AMCR', '-r', '--stdin', '-z'],
+ stdin=revlist.stdout, stdout=subprocess.PIPE)
+ for newblob, modflag, path in difftreez_reader(difftree.stdout):
+ bsize = blobsizes.get(newblob)
+ if bsize: # We care about this blob
+ pathsizes[path].add(bsize)
+ time1 = time.time()
+ self.verbose('Found %d paths in %.3f s' % (len(pathsizes), time1-time0))
+ maxlen = max(map(len,pathsizes)) if pathsizes else 0
+ for path, sizes in sorted(pathsizes.items(), cmp=lambda (p1,s1),(p2,s2): cmp(max(s1),max(s2)), reverse=True):
+ print('%-*s filter=fat -text # %10d %d' % (maxlen, path,max(sizes),len(sizes)))
+ revlist.wait()
+ difftree.wait()
+ def cmd_index_filter(self, args):
+ filelist = set(f.strip() for f in open(args[0]).readlines())
+ lsfiles = subprocess.Popen(['git', 'ls-files', '-s'], stdout=subprocess.PIPE)
+ updateindex = subprocess.Popen(['git', 'update-index', '--index-info'], stdin=subprocess.PIPE)
+ for line in lsfiles.stdout:
+ mode, sep, tail = line.partition(' ')
+ blobhash, sep, tail = tail.partition(' ')
+ stageno, sep, tail = tail.partition('\t')
+ filename = tail.strip()
+ if filename not in filelist:
+ continue
+ # This file will contain the hash of the cleaned object
+ hashfile = os.path.join(self.gitdir, 'fat', 'index-filter', blobhash)
+ try:
+ cleanedobj = open(hashfile).read().rstrip()
+ except IOError:
+ catfile = subprocess.Popen(['git', 'cat-file', 'blob', blobhash], stdout=subprocess.PIPE)
+ hashobject = subprocess.Popen(['git', 'hash-object', '-w', '--stdin'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+ def dofilter():
+ self.filter_clean(catfile.stdout, hashobject.stdin)
+ hashobject.stdin.close()
+ filterclean = threading.Thread(target=dofilter)
+ filterclean.start()
+ cleanedobj = hashobject.stdout.read().rstrip()
+ catfile.wait()
+ hashobject.wait()
+ filterclean.join()
+ mkdir_p(os.path.dirname(hashfile))
+ open(hashfile, 'w').write(cleanedobj + '\n')
+ updateindex.stdin.write('%s %s %s\t%s\n' % (mode, cleanedobj, stageno, filename))
+ updateindex.stdin.close()
+ lsfiles.wait()
+ updateindex.wait()
+
if __name__ == '__main__':
fat = GitFat()
@@ -327,5 +434,9 @@ if __name__ == '__main__':
fat.cmd_gc()
elif cmd == 'checkout':
fat.cmd_checkout(sys.argv[2:])
+ elif cmd == 'find':
+ fat.cmd_find(sys.argv[2:])
+ elif cmd == 'index-filter':
+ fat.cmd_index_filter(sys.argv[2:])
else:
- print('Usage: git fat [init|status|push|pull|gc|checkout]', file=sys.stderr)
+ print('Usage: git fat [init|status|push|pull|gc|checkout|find|index-filter]', file=sys.stderr)