diff options
-rw-r--r-- | README.md | 94 | ||||
-rwxr-xr-x | git-fat | 212 | ||||
-rwxr-xr-x | test.sh | 23 |
3 files changed, 329 insertions, 0 deletions
diff --git a/README.md b/README.md new file mode 100644 index 0000000..f5a9a07 --- /dev/null +++ b/README.md @@ -0,0 +1,94 @@ +# Installation and configuration +Place `git-fat` in your `PATH`. + +To use `git-fat` edit `.gitattributes` to regard any desired extensions +as fat files, e.g. + + *.png filter=fat -crlf + *.jpg filter=fat -crlf + *.gz filter=fat -crlf + +then run `git fat init` to activate the extension. Now add and commit as +usual; all matched files will not go in `.git/objects`, but will appear +complete in the working tree. To set a remote store for the fat objects, +edit `.gitfat` + + [rsync] + remote = your.remote-host.org:/share/fat-store + +This file should typically be committed to the repository so that others +will automatically have their remote set. This remote address can use +any protocol supported by rsync. Most users will configure it to use +remote ssh in a directory with shared access. + +# A worked example + + $ export GIT_FAT_VERBOSE=1 # Show more verbose information about what is happening + $ git init repo + Initialized empty Git repository in /tmp/repo/.git/ + $ cd repo + $ git fat init + $ cat > .gitfat + [rsync] + remote = localhost:/tmp/fat-store + $ mkdir -p /tmp/fat-store # make sure the remote directory exists + $ echo '*.gz filter=fat -crlf' > .gitattributes + $ git add .gitfat .gitattributes + $ git commit -m'Initial repository' + [master (root-commit) eb7facb] Initial repository + 2 files changed, 3 insertions(+) + create mode 100644 .gitattributes + create mode 100644 .gitfat + $ curl https://nodeload.github.com/jedbrown/git-fat/tar.gz/master -o master.tar.gz + % Total % Received % Xferd Average Speed Time Time Time Current + Dload Upload Total Spent Left Speed + 100 6449 100 6449 0 0 7741 0 --:--:-- --:--:-- --:--:-- 9786 + $ git add master.tar.gz + git-fat filter-clean: caching to /tmp/repo/.git/fat/objects/b3489819f81603b4c04e8ed134b80bace0810324 + $ git commit -m'Added master.tar.gz' + [master b85a96f] Added 
master.tar.gz + git-fat filter-clean: caching to /tmp/repo/.git/fat/objects/b3489819f81603b4c04e8ed134b80bace0810324 + 1 file changed, 1 insertion(+) + create mode 100644 master.tar.gz + $ du -s .git/objects .git/fat + $ git fat push + Pushing to localhost:/tmp/fat-store + building file list ... + 1 file to consider + + sent 61 bytes received 12 bytes 48.67 bytes/sec + total size is 6449 speedup is 88.34 + +We could now push to a remote + + $ cd .. + $ git clone repo repo2 + Cloning into 'repo2'... + done. + $ cd repo2 + $ git fat init # don't forget + $ git fat pull + receiving file list ... + 1 file to consider + 1f218834a137f7b185b498924e7a030008aee2ae + 6449 100% 6.15MB/s 0:00:00 (xfer#1, to-check=0/1) + + sent 30 bytes received 6558 bytes 4392.00 bytes/sec + total size is 6449 speedup is 0.98 + $ cat master.tar.gz # we should checkout automatically + #$# git-fat 1f218834a137f7b185b498924e7a030008aee2ae + $ git checkout -f . + git-fat filter-clean: caching to /tmp/repo2/.git/fat/objects/b7939480ed4e54109f8f82d43e46a39e144ecad1 + git-fat filter-smudge: restoring from /tmp/repo2/.git/fat/objects/1f218834a137f7b185b498924e7a030008aee2ae + $ ls -l # recovered the full file + total 8 + -rw-r--r-- 1 jed users 6449 Nov 25 17:10 master.tar.gz + +# Important refinements +* Put some more useful message in smudged (working tree) version of missing files. +* Make +* Make commands safer in presence of a dirty tree. +* Private setting of a different remote. +* Gracefully handle unmanaged files when the filter is called (either + legacy files or files matching the pattern that should for some reason not + be treated as fat). 
#!/usr/bin/env python
"""git-fat: keep large files out of ``.git/objects``.

Files matched by ``filter=fat`` in ``.gitattributes`` are replaced in the
repository by a one-line placeholder (``#$# git-fat <sha1>``); the real
content lives in ``.git/fat/objects/<sha1>`` and is exchanged with a remote
store through rsync.

Commands: ``filter-clean``, ``filter-smudge``, ``status``, ``push``,
``pull``, ``gc`` and ``init``.  Set ``GIT_FAT_VERBOSE`` for diagnostics on
stderr.  The script runs on Python 2.7 and Python 3.
"""

from __future__ import print_function

import errno
import hashlib
import itertools
import os
import re
import shlex
import subprocess
import sys
import tempfile

BLOCK_SIZE = 4096

# A fat object is always named by a SHA-1 hex digest.  Anything else found in
# a placeholder is rejected so that repository content can never steer a path
# outside the object directory.
_DIGEST_RE = re.compile(r'[0-9a-f]{40}\Z')

# os.fsdecode only exists on Python 3; on Python 2 file names are already str.
_fsdecode = getattr(os, 'fsdecode', lambda name: name)


def verbose_stderr(*args, **kwargs):
    """Print a diagnostic message to stderr."""
    return print(*args, file=sys.stderr, **kwargs)


def verbose_ignore(*args, **kwargs):
    """Discard a diagnostic message (default when not verbose)."""
    pass


verbose = verbose_ignore


def _native(data):
    """Return ASCII *data* as the interpreter's native ``str`` type.

    Raises UnicodeError when *data* holds non-ASCII bytes.
    """
    if isinstance(data, str):
        return data
    return data.decode('ascii')


def _binary_stream(stream):
    """Return the byte-oriented stream behind *stream*.

    Python 3 wraps stdin/stdout in text layers; the filters must move file
    content through untouched, so use the underlying ``buffer`` when present.
    """
    return getattr(stream, 'buffer', stream)


def gitroot():
    """Return the top-level directory of the current git work tree."""
    output = subprocess.check_output(
        ['git', 'rev-parse', '--show-toplevel'], universal_newlines=True)
    return output.strip()


def mkdir_p(path):
    """Create *path* and any missing parents; an existing directory is fine."""
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise


def readblocks(stream):
    """Yield successive chunks of at most BLOCK_SIZE from *stream* until EOF."""
    while True:
        data = stream.read(BLOCK_SIZE)
        if not data:
            break
        yield data


def cat_iter(initer, outstream):
    """Write every block produced by *initer* to *outstream*."""
    for block in initer:
        outstream.write(block)


def cat(instream, outstream):
    """Copy *instream* to *outstream* block by block."""
    return cat_iter(readblocks(instream), outstream)


class GitFat(object):
    """Operations on the fat-object cache of the enclosing git repository."""

    DecodeError = RuntimeError

    def __init__(self):
        self.gitroot = gitroot()
        self.objdir = os.path.join(self.gitroot, '.git', 'fat', 'objects')
        # Every placeholder has the same length, which allows cheap
        # size-based pre-filtering of candidate blobs and files.
        self.magiclen = len(self.encode(hashlib.sha1(b'dummy').hexdigest()))

    def setup(self):
        """Make sure the local object directory exists."""
        mkdir_p(self.objdir)

    def get_rsync(self):
        """Return the rsync remote configured as ``rsync.remote`` in .gitfat.

        Raises RuntimeError when the file, section or option is missing or
        the value is empty.
        """
        try:
            import configparser
        except ImportError:  # Python 2
            import ConfigParser as configparser
        cfgpath = os.path.join(self.gitroot, '.gitfat')
        config = configparser.RawConfigParser()
        config.read(cfgpath)  # a missing file simply yields an empty config
        try:
            remote = config.get('rsync', 'remote').strip()
        except (configparser.NoSectionError, configparser.NoOptionError):
            raise RuntimeError('No rsync.remote in %s' % cfgpath)
        # Strip one pair of matching surrounding quotes, if present.
        if len(remote) >= 2 and remote[0] == remote[-1] and remote[0] in '"\'':
            remote = remote[1:-1]
        if not remote:
            raise RuntimeError('No rsync.remote in %s' % cfgpath)
        return remote

    def encode(self, digest):
        """Return the placeholder text that stands in for object *digest*."""
        return '#$# git-fat %s\n' % digest

    def decode(self, string):
        """Return the digest named by placeholder *string* (str or bytes).

        Raises GitFat.DecodeError when *string* is not a placeholder carrying
        a 40-character lowercase hex digest.
        """
        cookie = '#$# git-fat '
        try:
            text = _native(string)
        except UnicodeError:
            raise GitFat.DecodeError('Could not decode %r' % (string,))
        if text.startswith(cookie):
            fields = text[len(cookie):].split()
            if fields and _DIGEST_RE.match(fields[0]):
                return fields[0]
        raise GitFat.DecodeError('Could not decode %s' % (text,))

    def decode_stream(self, stream):
        """Return the digest if *stream* starts with a placeholder.

        Otherwise return an iterator over the entire stream contents,
        including the bytes that were already consumed while probing.
        """
        preamble = stream.read(self.magiclen)
        try:
            return self.decode(preamble)
        except GitFat.DecodeError:
            return itertools.chain([preamble], readblocks(stream))

    def decode_file(self, fname):
        """Return the digest if file *fname* is a placeholder.

        Returns False when the size rules it out without reading the file,
        and None when the size matches but the content is not a placeholder.
        """
        # Fast check
        if os.stat(fname).st_size != self.magiclen:
            return False
        with open(fname, 'rb') as stream:
            digest = self.decode_stream(stream)
        if isinstance(digest, str):
            return digest
        return None

    def cmd_clean(self):
        """Clean filter: cache stdin as a fat object, emit its placeholder."""
        # NOTE: input that already is a placeholder is hashed like any other
        # content; see "Important refinements" in the README.
        self.setup()
        instream = _binary_stream(sys.stdin)
        outstream = _binary_stream(sys.stdout)
        h = hashlib.new('sha1')
        fd, tmpname = tempfile.mkstemp(dir=self.objdir)
        try:
            with os.fdopen(fd, 'wb') as cache:
                for block in readblocks(instream):
                    h.update(block)
                    cache.write(block)
            digest = h.hexdigest()
            objfile = os.path.join(self.objdir, digest)
            os.rename(tmpname, objfile)
        except BaseException:
            # Do not leave a partially written temporary in the object store.
            if os.path.exists(tmpname):
                os.remove(tmpname)
            raise
        verbose('git-fat filter-clean: caching to %s' % objfile)
        outstream.write(self.encode(digest).encode('ascii'))
        outstream.flush()

    def cmd_smudge(self):
        """Smudge filter: replace a placeholder on stdin by the cached object.

        When the object is not cached locally the placeholder is passed
        through; input that is not a placeholder is copied unchanged.
        """
        self.setup()
        instream = _binary_stream(sys.stdin)
        outstream = _binary_stream(sys.stdout)
        result = self.decode_stream(instream)
        if isinstance(result, str):  # We got a digest
            objfile = os.path.join(self.objdir, result)
            verbose('git-fat filter-smudge: restoring from %s' % objfile)
            # Only a failure to open falls back to the placeholder; an error
            # in the middle of the copy must not append to partial output.
            try:
                fatfile = open(objfile, 'rb')
            except (IOError, OSError):
                verbose('git-fat filter-smudge: fat object missing %s' % objfile)
                outstream.write(self.encode(result).encode('ascii'))
            else:
                with fatfile:
                    cat(fatfile, outstream)
        else:  # We have an iterable over the original input.
            verbose('git-fat filter-smudge: not a managed file')
            cat_iter(result, outstream)
        outstream.flush()

    def catalog_objects(self):
        """Return the set of names present in the local object directory."""
        return set(os.listdir(self.objdir))

    def referenced_objects(self, rev=None):
        """Return the set of fat digests referenced by history.

        *rev* is passed to ``git rev-list --objects``; it defaults to
        ``--all``.
        """
        if rev is None:
            rev = '--all'
        revlist = subprocess.Popen(['git', 'rev-list', '--objects', rev],
                                   stdout=subprocess.PIPE)
        listing = revlist.communicate()[0]
        hashes = [fields[0]
                  for fields in (line.split() for line in listing.splitlines())
                  if fields]
        batch = subprocess.Popen(['git', 'cat-file', '--batch-check'],
                                 stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        # communicate() feeds stdin and drains stdout together; writing all
        # hashes first can block forever once the output pipe fills up.
        report = batch.communicate(
            input=b''.join(objhash + b'\n' for objhash in hashes))[0]
        referenced = set()
        for line in report.splitlines():
            fields = line.split()
            if len(fields) != 3:  # e.g. "<hash> missing"
                continue
            objhash, objtype, size = fields
            if objtype != b'blob' or int(size) != self.magiclen:
                continue
            content = subprocess.check_output(
                ['git', 'cat-file', '-p', _native(objhash)])
            try:
                referenced.add(self.decode(content))
            except GitFat.DecodeError:
                # An ordinary blob that merely has the placeholder's size.
                continue
        return referenced

    def cmd_status(self, args):
        """Print orphan (referenced, not cached) and garbage objects.

        With ``--all`` in *args* every referenced digest is listed first.
        """
        self.setup()
        catalog = self.catalog_objects()
        referenced = self.referenced_objects()
        garbage = catalog - referenced
        orphans = referenced - catalog
        if '--all' in args:
            for obj in sorted(referenced):
                print(obj)
        if orphans:
            print('Orphan files:')
            for orph in sorted(orphans):
                print('    ' + orph)
        if garbage:
            print('Garbage objects:')
            for g in sorted(garbage):
                print('    ' + g)

    def _rsync(self, source, dest, files):
        """Transfer the named *files* from *source* to *dest*; return rsync's exit status."""
        cmd = ['rsync', '--progress', '--ignore-existing', '--from0',
               '--files-from=-', source, dest]
        p = subprocess.Popen(cmd, stdin=subprocess.PIPE)
        p.communicate(input='\x00'.join(sorted(files)).encode('ascii'))
        return p.returncode

    def cmd_push(self):
        """Push anything that I have stored and referenced."""
        self.setup()
        files = self.referenced_objects() & self.catalog_objects()
        remote = self.get_rsync()
        verbose('Pushing to %s' % (remote))
        status = self._rsync(self.objdir + '/', remote, files)
        if status:
            print('git-fat push: rsync exited with status %d' % status,
                  file=sys.stderr)
            sys.exit(status)

    def checkout(self):
        """Re-smudge tracked files that are still placeholders.

        Only files whose fat object is now present in the local cache are
        touched.
        """
        listing = subprocess.check_output(['git', 'ls-files', '-z'],
                                          cwd=self.gitroot)
        for rawname in listing.split(b'\x00'):
            if not rawname:
                continue
            relname = _fsdecode(rawname)
            fname = os.path.join(self.gitroot, relname)
            try:
                digest = self.decode_file(fname)
            except (IOError, OSError):
                continue  # deleted from the work tree, or not a regular file
            if not digest:
                continue
            if not os.path.exists(os.path.join(self.objdir, digest)):
                continue
            verbose('git-fat checkout: restoring %s' % fname)
            # git trusts the cached stat data and would skip the file as
            # unchanged; bumping the timestamps forces a fresh smudge.
            os.utime(fname, None)
            subprocess.check_call(
                ['git', 'checkout-index', '--index', '--force', '--', relname],
                cwd=self.gitroot)

    def cmd_pull(self):
        """Pull anything that I have referenced, but not stored."""
        self.setup()
        files = self.referenced_objects() - self.catalog_objects()
        remote = self.get_rsync()
        status = self._rsync(remote + '/', self.objdir, files)
        # Restore whatever did arrive, even after a partial transfer.
        self.checkout()
        if status:
            print('git-fat pull: rsync exited with status %d' % status,
                  file=sys.stderr)
            sys.exit(status)

    def cmd_gc(self):
        """Delete cached objects that no revision references."""
        self.setup()
        garbage = self.catalog_objects() - self.referenced_objects()
        print('Unreferenced objects to remove: %d' % len(garbage))
        for obj in sorted(garbage):
            fname = os.path.join(self.objdir, obj)
            print('%10d %s' % (os.stat(fname).st_size, obj))
            os.remove(fname)

    def cmd_init(self):
        """Register the clean/smudge filters in the repository configuration."""
        self.setup()
        # `git config` replaces existing values, so running init repeatedly
        # does not pile up duplicate [filter "fat"] sections.
        subprocess.check_call(
            ['git', 'config', 'filter.fat.clean', 'git-fat filter-clean'])
        subprocess.check_call(
            ['git', 'config', 'filter.fat.smudge', 'git-fat filter-smudge'])


def main(argv=None):
    """Dispatch the sub-command in *argv* (default ``sys.argv``).

    Returns 0 on success and 1 when the command is missing or unknown.
    """
    global verbose
    if argv is None:
        argv = sys.argv
    if os.environ.get('GIT_FAT_VERBOSE'):
        verbose = verbose_stderr
    fat = GitFat()
    cmd = argv[1] if len(argv) > 1 else ''
    commands = {
        'filter-clean': fat.cmd_clean,
        'filter-smudge': fat.cmd_smudge,
        'status': lambda: fat.cmd_status(argv[2:]),
        'push': fat.cmd_push,
        'pull': fat.cmd_pull,
        'gc': fat.cmd_gc,
        'init': fat.cmd_init,
    }
    handler = commands.get(cmd)
    if handler is None:
        print('Usage: git fat [status|push|pull|gc|init]', file=sys.stderr)
        return 1
    handler()
    return 0


if __name__ == '__main__':
    sys.exit(main())

# ---------------------------------------------------------------------------
# The same change set also adds test.sh (mode 100755, 23 lines).  It shares
# its final source line with this script, so it is reproduced verbatim here:
#
#   #!/bin/bash
#
#   set -e
#
#   git init fat-test
#   cd fat-test
#   git fat init
#   cat - >> .gitfat <<EOF
#   [rsync]
#   remote = localhost:/tmp/fat-store
#   EOF
#   echo '*.fat filter=fat -crlf' > .gitattributes
#   git add .gitattributes .gitfat
#   git commit -m'Initial fat repository'
#
#   echo 'fat content a' > a.fat
#   git add a.fat
#   git commit -m'add a.fat'
#   echo 'fat content b' > b.fat
#   git add b.fat
#   git commit -m'add b.fat'
#   echo 'revise fat content a' > a.fat
#   git commit -am'revise a.fat'