summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--README.md94
-rwxr-xr-xgit-fat212
-rwxr-xr-xtest.sh23
3 files changed, 329 insertions, 0 deletions
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..f5a9a07
--- /dev/null
+++ b/README.md
@@ -0,0 +1,94 @@
+# Installation and configuration
+Place `git-fat` in your `PATH`.
+
+To use `git-fat` edit `.gitattributes` to regard any desired extensions
+as fat files, e.g.
+
+ *.png filter=fat -crlf
+ *.jpg filter=fat -crlf
+ *.gz filter=fat -crlf
+
+then run `git fat init` to activate the extension. Now add and commit as
+usual, all matched files will not go in `.git/objects`, but will appear
+complete in the working tree. To set a remote store for the fat objects,
+edit `.gitfat`
+
+ [rsync]
+ remote = your.remote-host.org:/share/fat-store
+
+This file should typically be committed to the repository so that others
+will automatically have their remote set. This remote address can use
+any protocol supported by rsync. Most users will configure it to use
+remote ssh in a directory with shared access.
+
+# A worked example
+
+ $ export GIT_FAT_VERBOSE=1 # Show more verbose information about what is happening
+ $ git init repo
+ Initialized empty Git repository in /tmp/repo/.git/
+ $ cd repo
+ $ git fat init
+ $ cat > .gitfat
+ [rsync]
+ remote = localhost:/tmp/fat-store
+ $ mkdir -p /tmp/fat-store # make sure the remote directory exists
+ $ echo '*.gz filter=fat -crlf' > .gitattributes
+ $ git add .gitfat .gitattributes
+ $ git commit -m'Initial repository'
+ [master (root-commit) eb7facb] Initial repository
+ 2 files changed, 3 insertions(+)
+ create mode 100644 .gitattributes
+ create mode 100644 .gitfat
+ $ curl https://nodeload.github.com/jedbrown/git-fat/tar.gz/master -o master.tar.gz
+ % Total % Received % Xferd Average Speed Time Time Time Current
+ Dload Upload Total Spent Left Speed
+ 100 6449 100 6449 0 0 7741 0 --:--:-- --:--:-- --:--:-- 9786
+ $ git add master.tar.gz
+ git-fat filter-clean: caching to /tmp/repo/.git/fat/objects/b3489819f81603b4c04e8ed134b80bace0810324
+ $ git commit -m'Added master.tar.gz'
+ [master b85a96f] Added master.tar.gz
+ git-fat filter-clean: caching to /tmp/repo/.git/fat/objects/b3489819f81603b4c04e8ed134b80bace0810324
+ 1 file changed, 1 insertion(+)
+ create mode 100644 master.tar.gz
+ $ du -s .git/objects .git/fat
+ $ git fat push
+ Pushing to localhost:/tmp/fat-store
+ building file list ...
+ 1 file to consider
+
+ sent 61 bytes received 12 bytes 48.67 bytes/sec
+ total size is 6449 speedup is 88.34
+
+The fat objects are now on the remote, so a fresh clone can pull them back
+
+ $ cd ..
+ $ git clone repo repo2
+ Cloning into 'repo2'...
+ done.
+ $ cd repo2
+ $ git fat init # don't forget
+ $ git fat pull
+ receiving file list ...
+ 1 file to consider
+ 1f218834a137f7b185b498924e7a030008aee2ae
+ 6449 100% 6.15MB/s 0:00:00 (xfer#1, to-check=0/1)
+
+ sent 30 bytes received 6558 bytes 4392.00 bytes/sec
+ total size is 6449 speedup is 0.98
+ $ cat master.tar.gz # we should checkout automatically
+ #$# git-fat 1f218834a137f7b185b498924e7a030008aee2ae
+ $ git checkout -f .
+ git-fat filter-clean: caching to /tmp/repo2/.git/fat/objects/b7939480ed4e54109f8f82d43e46a39e144ecad1
+ git-fat filter-smudge: restoring from /tmp/repo2/.git/fat/objects/1f218834a137f7b185b498924e7a030008aee2ae
+ $ ls -l # recovered the full file
+ total 8
+ -rw-r--r-- 1 jed users 6449 Nov 25 17:10 master.tar.gz
+
+# Important refinements
+* Put a more useful message in the smudged (working tree) version of missing files.
+* Make
+* Make commands safer in presence of a dirty tree.
+* Private setting of a different remote.
+* Gracefully handle unmanaged files when the filter is called (either
+ legacy files or files matching the pattern that should for some reason not
+ be treated as fat).
diff --git a/git-fat b/git-fat
new file mode 100755
index 0000000..ab36470
--- /dev/null
+++ b/git-fat
@@ -0,0 +1,212 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+
+import sys
+import hashlib
+import tempfile
+import os
+import subprocess
+import shlex
+
+BLOCK_SIZE = 4096
+
+def verbose_stderr(*args, **kwargs):
+ return print(*args, file=sys.stderr, **kwargs)
+def verbose_ignore(*args, **kwargs):
+ pass
+verbose = verbose_ignore
+
+def gitroot():
+ return subprocess.check_output('git rev-parse --show-toplevel'.split()).strip()
+def mkdir_p(path):
+ import errno
+ try:
+ os.makedirs(path)
+ except OSError as exc: # Python >2.5
+ if exc.errno == errno.EEXIST and os.path.isdir(path):
+ pass
+ else: raise
+
+def readblocks(stream):
+ bytes = 0
+ while True:
+ data = stream.read(BLOCK_SIZE)
+ bytes += len(data)
+ if not data:
+ break
+ yield data
+def cat_iter(initer, outstream):
+ for block in initer:
+ outstream.write(block)
+def cat(instream, outstream):
+ return cat_iter(readblocks(instream), outstream)
+
+class GitFat(object):
+ DecodeError = RuntimeError
+ def __init__(self):
+ self.gitroot = gitroot()
+ self.objdir = os.path.join(self.gitroot, '.git', 'fat', 'objects')
+ self.magiclen = len(self.encode(hashlib.sha1('dummy').hexdigest()))
+ def setup(self):
+ mkdir_p(self.objdir)
+ def get_rsync(self):
+ import ConfigParser
+ cfgpath = os.path.join(self.gitroot,'.gitfat')
+ try:
+ config = ConfigParser.RawConfigParser()
+ config.read(cfgpath)
+ remote = config.get('rsync', 'remote')
+ if remote[0] in ['"', "'"] and remote[-1] in ['"', "'"]:
+ remote = remote[1:-1]
+ return remote
+ except ConfigParser.NoSectionError:
+ raise RuntimeError('No rsync.remote in %s' % cfgpath)
+ def encode(self, digest):
+ return '#$# git-fat %s\n' % digest
+ def decode(self, string):
+ cookie = '#$# git-fat '
+ if string.startswith(cookie):
+ return string[len(cookie):].split()[0]
+ else:
+ raise GitFat.DecodeError('Could not decode %s' % (string))
+ def decode_stream(self, stream):
+ 'Return digest if git-fat cache, otherwise return iterator over entire file contents'
+ preamble = stream.read(self.magiclen)
+ try:
+ return self.decode(preamble)
+ except GitFat.DecodeError:
+ 'Not sure if this is the right behavior'
+ return itertools.chain([preamble], readblocks(stream))
+ def decode_file(self, fname):
+ # Fast check
+ stat = os.stat(fname)
+ if stat.st_size != self.magiclen:
+ return False
+ # read file
+ digest = self.decode_stream(open(fname))
+ if isinstance(digest, str):
+ return digest
+ else:
+ return None
+ def cmd_clean(self):
+ self.setup()
+ h = hashlib.new('sha1')
+ fd, tmpname = tempfile.mkstemp(dir=self.objdir)
+ try:
+ with os.fdopen(fd, 'w') as cache:
+ for block in readblocks(sys.stdin):
+ h.update(block)
+ cache.write(block)
+ digest = h.hexdigest()
+ objfile = os.path.join(self.objdir, digest)
+ os.rename(tmpname, objfile)
+ verbose('git-fat filter-clean: caching to %s' % objfile)
+ sys.stdout.write(self.encode(digest))
+ sys.stdout.flush()
+ except:
+ raise
+ #os.remove(tmpname)
+ def cmd_smudge(self):
+ self.setup()
+ result = self.decode_stream(sys.stdin)
+ if isinstance(result, str): # We got a digest
+ objfile = os.path.join(self.objdir, result)
+ verbose('git-fat filter-smudge: restoring from %s' % objfile)
+ try:
+ cat(open(objfile), sys.stdout)
+ except:
+ sys.stdout.write(self.encode(result)) # could leave a better notice about how to recover this file
+ else: # We have an iterable over the original input.
+ verbose('git-fat filter-smudge: not a managed file')
+ cat(result, sys.stdout)
+ def catalog_objects(self):
+ return set(os.listdir(self.objdir))
+ def referenced_objects(self, rev=None):
+ referenced = set()
+ if rev is None:
+ rev = '--all'
+ p1 = subprocess.Popen(['git','rev-list','--objects',rev], stdout=subprocess.PIPE)
+ p2 = subprocess.Popen(['git','cat-file','--batch-check'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+ for line in p1.communicate()[0].splitlines():
+ p2.stdin.write(line.split()[0] + '\n')
+ for line in p2.communicate()[0].splitlines():
+ objhash, objtype, size = line.split()
+ if objtype == 'blob' and int(size) == self.magiclen:
+ fathash = self.decode(subprocess.check_output(['git', 'cat-file', '-p', objhash]))
+ referenced.add(fathash)
+ return referenced
+ def cmd_status(self, args):
+ self.setup()
+ catalog = self.catalog_objects()
+ referenced = self.referenced_objects()
+ garbage = catalog - referenced
+ orphans = referenced - catalog
+ if '--all' in args:
+ for obj in referenced:
+ print(obj)
+ if orphans:
+ print('Orphan files:')
+ for orph in orphans:
+ print(' ' + orph)
+ if garbage:
+ print('Garbage objects:')
+ for g in garbage:
+ print(' ' + g)
+ def cmd_push(self):
+ 'Push anything that I have stored and referenced'
+ self.setup()
+ files = self.referenced_objects() & self.catalog_objects()
+ remote = self.get_rsync()
+ verbose('Pushing to %s' % (remote))
+ cmd = ['rsync', '--progress', '--ignore-existing', '--from0', '--files-from=-', self.objdir + '/', remote]
+ p = subprocess.Popen(cmd, stdin=subprocess.PIPE)
+ p.communicate(input='\x00'.join(files))
+ def checkout(self):
+ files = subprocess.check_output(['git', 'ls-files', ''])
+ def cmd_pull(self):
+ 'Pull anything that I have referenced, but not stored'
+ self.setup()
+ files = self.referenced_objects() - self.catalog_objects()
+ remote = self.get_rsync()
+ cmd = ['rsync', '--progress', '--ignore-existing', '--from0', '--files-from=-', remote + '/', self.objdir]
+ p = subprocess.Popen(cmd, stdin=subprocess.PIPE)
+ p.communicate(input='\x00'.join(files))
+ self.checkout()
+ def cmd_gc(self):
+ garbage = self.catalog_objects() - self.referenced_objects()
+ print('Unreferenced objects to remove: %d' % len(garbage))
+ for obj in garbage:
+ fname = os.path.join(self.objdir, obj)
+ print('%10d %s' % (os.stat(fname).st_size, obj))
+ os.remove(fname)
+ def cmd_init(self):
+ self.setup()
+ open(os.path.join(self.gitroot,'.git','config'), 'a').writelines([
+ '[filter "fat"]\n',
+ ' clean = git-fat filter-clean\n',
+ ' smudge = git-fat filter-smudge\n',
+ ])
+
+if __name__ == '__main__':
+ if os.environ.get('GIT_FAT_VERBOSE'):
+ global verbose
+ verbose = verbose_stderr
+ fat = GitFat()
+ cmd = sys.argv[1] if len(sys.argv) > 1 else ''
+ if cmd == 'filter-clean':
+ fat.cmd_clean()
+ elif cmd == 'filter-smudge':
+ fat.cmd_smudge()
+ elif cmd == 'status':
+ fat.cmd_status(sys.argv[2:])
+ elif cmd == 'push':
+ fat.cmd_push()
+ elif cmd == 'pull':
+ fat.cmd_pull()
+ elif cmd == 'gc':
+ fat.cmd_gc()
+ elif cmd == 'init':
+ fat.cmd_init()
+ else:
+ print('Usage: git fat [status|push|pull|gc|init]', file=sys.stderr)
diff --git a/test.sh b/test.sh
new file mode 100755
index 0000000..630a9ff
--- /dev/null
+++ b/test.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+set -e
+
+git init fat-test
+cd fat-test
+git fat init
+cat - >> .gitfat <<EOF
+[rsync]
+remote = localhost:/tmp/fat-store
+EOF
+echo '*.fat filter=fat -crlf' > .gitattributes
+git add .gitattributes .gitfat
+git commit -m'Initial fat repository'
+
+echo 'fat content a' > a.fat
+git add a.fat
+git commit -m'add a.fat'
+echo 'fat content b' > b.fat
+git add b.fat
+git commit -m'add b.fat'
+echo 'revise fat content a' > a.fat
+git commit -am'revise a.fat'