summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJed Brown <jed@59A2.org>2012-11-25 23:21:45 +0100
committerJed Brown <jed@59A2.org>2012-11-25 23:21:45 +0100
commit40996c304ba56aad88cac0b9de02a384adb35d3f (patch)
tree56845fffc0da164f99f45b05fe8cfac7231bbc52
parentd5f924d9f040a49c0d0114b8ef03c6ec3a96bd9c (diff)
downloadgit-fat-40996c304ba56aad88cac0b9de02a384adb35d3f.tar.gz
Extend worked example and make several refinements
* Verbosity control * Automatically update working tree * Identify orphan files in filter-clean and pass through so they don't show up as spurious diffs.
-rw-r--r--README.md106
-rwxr-xr-xgit-fat116
2 files changed, 169 insertions, 53 deletions
diff --git a/README.md b/README.md
index f5a9a07..8a4b806 100644
--- a/README.md
+++ b/README.md
@@ -1,17 +1,19 @@
# Installation and configuration
Place `git-fat` in your `PATH`.
-To use `git-fat` edit `.gitattributes` to regard any desired extensions
-as fat files, e.g.
+Edit `.gitattributes` to regard any desired extensions as fat files.
+ $ cat >> .gitattributes
*.png filter=fat -crlf
*.jpg filter=fat -crlf
*.gz filter=fat -crlf
+ ^D
-then run `git fat init` to active the extension. Now add and commit as
-usual, all matched files will not go in `.git/objects`, but will appear
-complete in the working tree. To set a remote store for the fat objects,
-edit `.gitfat`
+Run `git fat init` to activate the extension. Now add and commit as usual.
+Matched files will be transparently stored externally, but will appear
+complete in the working tree.
+
+Set a remote store for the fat objects by editing `.gitfat`.
[rsync]
remote = your.remote-host.org:/share/fat-store
@@ -23,13 +25,20 @@ remote ssh in a directory with shared access.
# A worked example
- $ export GIT_FAT_VERBOSE=1 # Show more verbose information about what is happening
+Before we start, let's turn on verbose reporting so we can see what's
+happening. Without this environment variable, all the output lines
+starting with `git-fat` will not be shown.
+
+ $ export GIT_FAT_VERBOSE=1
+
+First, we create a repository and configure it for use with `git-fat`.
+
$ git init repo
Initialized empty Git repository in /tmp/repo/.git/
$ cd repo
$ git fat init
$ cat > .gitfat
- [rsync]
+ [rsync]
remote = localhost:/tmp/fat-store
$ mkdir -p /tmp/fat-store # make sure the remote directory exists
$ echo '*.gz filter=fat -crlf' > .gitattributes
@@ -39,6 +48,9 @@ remote ssh in a directory with shared access.
2 files changed, 3 insertions(+)
create mode 100644 .gitattributes
create mode 100644 .gitfat
+
+Now we add a binary file whose name matches the pattern we set in `.gitattributes`.
+
$ curl https://nodeload.github.com/jedbrown/git-fat/tar.gz/master -o master.tar.gz
% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
@@ -50,16 +62,34 @@ remote ssh in a directory with shared access.
git-fat filter-clean: caching to /tmp/repo/.git/fat/objects/b3489819f81603b4c04e8ed134b80bace0810324
1 file changed, 1 insertion(+)
create mode 100644 master.tar.gz
- $ du -s .git/objects .git/fat
+
+The patch itself is very simple and does not include the binary.
+
+ $ git show --pretty=oneline HEAD
+ 918063043a6156172c2ad66478c6edd5c7df0217 Add master.tar.gz
+ diff --git a/master.tar.gz b/master.tar.gz
+ new file mode 100644
+ index 0000000..12f7d52
+ --- /dev/null
+ +++ b/master.tar.gz
+ @@ -0,0 +1 @@
+ +#$# git-fat 1f218834a137f7b185b498924e7a030008aee2ae
+
+## Pushing fat files
+Now let's push our fat files using the rsync configuration that we set up earlier.
+
$ git fat push
Pushing to localhost:/tmp/fat-store
- building file list ...
+ building file list ...
1 file to consider
-
+
sent 61 bytes received 12 bytes 48.67 bytes/sec
total size is 6449 speedup is 88.34
-We could now push to a remote
+We might normally set a remote now and push the git repository.
+
+## Cloning and pulling
+Now let's look at what happens when we clone.
$ cd ..
$ git clone repo repo2
@@ -67,28 +97,66 @@ We could now push to a remote
done.
$ cd repo2
$ git fat init # don't forget
+ $ ls -l # file is just a placeholder
+ total 4
+ -rw-r--r-- 1 jed users 53 Nov 25 22:42 master.tar.gz
+ $ cat master.tar.gz # holds the SHA1 of the file
+ #$# git-fat 1f218834a137f7b185b498924e7a030008aee2ae
+
+We can always get a summary of what fat objects are missing in our local cache.
+
+ Orphan objects:
+ 1f218834a137f7b185b498924e7a030008aee2ae
+
+Now get any objects referenced by our current `HEAD`. This command also
+accepts the `--all` option to pull full history, or a revspec to pull
+selected history.
+
$ git fat pull
- receiving file list ...
+ receiving file list ...
1 file to consider
1f218834a137f7b185b498924e7a030008aee2ae
6449 100% 6.15MB/s 0:00:00 (xfer#1, to-check=0/1)
sent 30 bytes received 6558 bytes 4392.00 bytes/sec
total size is 6449 speedup is 0.98
- $ cat master.tar.gz # we should checkout automatically
- #$# git-fat 1f218834a137f7b185b498924e7a030008aee2ae
- $ git checkout -f .
- git-fat filter-clean: caching to /tmp/repo2/.git/fat/objects/b7939480ed4e54109f8f82d43e46a39e144ecad1
+ Restoring 1f218834a137f7b185b498924e7a030008aee2ae -> master.tar.gz
git-fat filter-smudge: restoring from /tmp/repo2/.git/fat/objects/1f218834a137f7b185b498924e7a030008aee2ae
+
+Everything is in place
+
+ $ git status
+ git-fat filter-clean: caching to /tmp/repo2/.git/fat/objects/1f218834a137f7b185b498924e7a030008aee2ae
+ # On branch master
+ nothing to commit, working directory clean
$ ls -l # recovered the full file
total 8
-rw-r--r-- 1 jed users 6449 Nov 25 17:10 master.tar.gz
-# Important refinements
+## Summary
+* Set the "fat" file types in `.gitattributes`.
+* Use normal git commands to interact with the repository without
+ thinking about what files are fat and non-fat. The fat files will be
+ treated specially.
+* Synchronize fat files with `git fat push` and `git fat pull`.
+
+## Implementation notes
+The actual binary files are stored in `.git/fat/objects`, leaving `.git/objects` nice and small.
+
+ $ du -bs .git/objects
+ 2212 .git/objects/
+ $ ls -l .git/fat/objects # This is where the file actually goes, but that's not important
+ total 8
+ -rw------- 1 jed users 6449 Nov 25 17:01 1f218834a137f7b185b498924e7a030008aee2ae
+
+# Some refinements
+* Allow pulling and pushing only select files
+* Relate orphan objects to file system
* Put some more useful message in smudged (working tree) version of missing files.
-* Make
+* More friendly configuration for multiple fat remotes
* Make commands safer in presence of a dirty tree.
* Private setting of a different remote.
* Gracefully handle unmanaged files when the filter is called (either
legacy files or files matching the pattern that should for some reason not
be treated as fat).
+* Don't append the [filter "fat"] section to .git/config if it doesn't already exist.
diff --git a/git-fat b/git-fat
index ab36470..84be082 100755
--- a/git-fat
+++ b/git-fat
@@ -1,6 +1,6 @@
#!/usr/bin/env python
-from __future__ import print_function
+from __future__ import print_function, with_statement
import sys
import hashlib
@@ -8,6 +8,7 @@ import tempfile
import os
import subprocess
import shlex
+import shutil
BLOCK_SIZE = 4096
@@ -15,10 +16,7 @@ def verbose_stderr(*args, **kwargs):
return print(*args, file=sys.stderr, **kwargs)
def verbose_ignore(*args, **kwargs):
pass
-verbose = verbose_ignore
-def gitroot():
- return subprocess.check_output('git rev-parse --show-toplevel'.split()).strip()
def mkdir_p(path):
import errno
try:
@@ -45,7 +43,8 @@ def cat(instream, outstream):
class GitFat(object):
DecodeError = RuntimeError
def __init__(self):
- self.gitroot = gitroot()
+ self.verbose = verbose_stderr if os.environ.get('GIT_FAT_VERBOSE') else verbose_ignore
+ self.gitroot = subprocess.check_output('git rev-parse --show-toplevel'.split()).strip()
self.objdir = os.path.join(self.gitroot, '.git', 'fat', 'objects')
self.magiclen = len(self.encode(hashlib.sha1('dummy').hexdigest()))
def setup(self):
@@ -62,12 +61,16 @@ class GitFat(object):
return remote
except ConfigParser.NoSectionError:
raise RuntimeError('No rsync.remote in %s' % cfgpath)
+ def revparse(self, revname):
+ return subprocess.check_output(['git', 'rev-parse', revname]).strip()
def encode(self, digest):
return '#$# git-fat %s\n' % digest
- def decode(self, string):
+ def decode(self, string, noraise=False):
cookie = '#$# git-fat '
if string.startswith(cookie):
return string[len(cookie):].split()[0]
+ elif noraise:
+ return None
else:
raise GitFat.DecodeError('Could not decode %s' % (string))
def decode_stream(self, stream):
@@ -94,38 +97,54 @@ class GitFat(object):
h = hashlib.new('sha1')
fd, tmpname = tempfile.mkstemp(dir=self.objdir)
try:
+ ishanging = False
+ cached = False # changes to True when file is cached, meaning the temp file was renamed into place and must not be removed
with os.fdopen(fd, 'w') as cache:
- for block in readblocks(sys.stdin):
+ outstream = cache
+ blockiter = readblocks(sys.stdin)
+ # Check whether this file is hanging
+ block = next(blockiter)
+ if self.decode(block[0:self.magiclen], noraise=True):
+ ishanging = True
+ outstream = sys.stdout
+ h.update(block)
+ outstream.write(block)
+ for block in blockiter:
h.update(block)
- cache.write(block)
+ outstream.write(block)
+ outstream.flush()
digest = h.hexdigest()
objfile = os.path.join(self.objdir, digest)
- os.rename(tmpname, objfile)
- verbose('git-fat filter-clean: caching to %s' % objfile)
- sys.stdout.write(self.encode(digest))
- sys.stdout.flush()
- except:
- raise
- #os.remove(tmpname)
+ if not ishanging:
+ os.rename(tmpname, objfile)
+ cached = True
+ self.verbose('git-fat filter-clean: caching to %s' % objfile)
+ sys.stdout.write(self.encode(digest))
+ finally:
+ if not cached:
+ os.remove(tmpname)
+
def cmd_smudge(self):
self.setup()
result = self.decode_stream(sys.stdin)
if isinstance(result, str): # We got a digest
objfile = os.path.join(self.objdir, result)
- verbose('git-fat filter-smudge: restoring from %s' % objfile)
+ self.verbose('git-fat filter-smudge: restoring from %s' % objfile)
try:
cat(open(objfile), sys.stdout)
except:
sys.stdout.write(self.encode(result)) # could leave a better notice about how to recover this file
else: # We have an iterable over the original input.
- verbose('git-fat filter-smudge: not a managed file')
+ self.verbose('git-fat filter-smudge: not a managed file')
cat(result, sys.stdout)
def catalog_objects(self):
return set(os.listdir(self.objdir))
- def referenced_objects(self, rev=None):
+ def referenced_objects(self, rev=None, all=False):
referenced = set()
- if rev is None:
+ if all:
rev = '--all'
+ elif rev is None:
+ rev = self.revparse('HEAD')
p1 = subprocess.Popen(['git','rev-list','--objects',rev], stdout=subprocess.PIPE)
p2 = subprocess.Popen(['git','cat-file','--batch-check'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
for line in p1.communicate()[0].splitlines():
@@ -136,17 +155,26 @@ class GitFat(object):
fathash = self.decode(subprocess.check_output(['git', 'cat-file', '-p', objhash]))
referenced.add(fathash)
return referenced
+ def orphan_files(self):
+ 'generator for all orphan placeholders in the working tree'
+ for fname in subprocess.check_output(['git', 'ls-files']).splitlines():
+ digest = self.decode_file(fname)
+ if digest:
+ yield (digest, fname)
def cmd_status(self, args):
self.setup()
catalog = self.catalog_objects()
- referenced = self.referenced_objects()
+ refargs = dict()
+ if '--all' in args:
+ refargs['all'] = True
+ referenced = self.referenced_objects(**refargs)
garbage = catalog - referenced
orphans = referenced - catalog
if '--all' in args:
for obj in referenced:
print(obj)
if orphans:
- print('Orphan files:')
+ print('Orphan objects:')
for orph in orphans:
print(' ' + orph)
if garbage:
@@ -156,23 +184,44 @@ class GitFat(object):
def cmd_push(self):
'Push anything that I have stored and referenced'
self.setup()
- files = self.referenced_objects() & self.catalog_objects()
+ # Pushing *all* objects because it's safer. Could implement partial push.
+ files = self.referenced_objects(all=True) & self.catalog_objects()
remote = self.get_rsync()
- verbose('Pushing to %s' % (remote))
+ self.verbose('Pushing to %s' % (remote))
cmd = ['rsync', '--progress', '--ignore-existing', '--from0', '--files-from=-', self.objdir + '/', remote]
p = subprocess.Popen(cmd, stdin=subprocess.PIPE)
p.communicate(input='\x00'.join(files))
- def checkout(self):
- files = subprocess.check_output(['git', 'ls-files', ''])
- def cmd_pull(self):
+ def checkout(self, show_orphans=False):
+ 'Update any stale files in the present working tree'
+ orphans = []
+ for digest, fname in self.orphan_files():
+ objpath = os.path.join(self.objdir, digest)
+ if os.access(objpath, os.R_OK):
+ print('Restoring %s -> %s' % (digest, fname))
+ shutil.copy(objpath, fname)
+ elif show_orphans:
+ print('Data unavailable: %s %s' % (digest,fname))
+ subprocess.call(['git', 'checkout', '.'])
+ def cmd_pull(self, args):
'Pull anything that I have referenced, but not stored'
self.setup()
- files = self.referenced_objects() - self.catalog_objects()
+ refargs = dict()
+ if '--all' in args:
+ refargs['all'] = True
+ for arg in args:
+ if arg.startswith('-') or len(arg) != 40:
+ continue
+ rev = self.revparse(arg)
+ if rev:
+ refargs['rev'] = rev
+ files = self.referenced_objects(**refargs) - self.catalog_objects()
remote = self.get_rsync()
cmd = ['rsync', '--progress', '--ignore-existing', '--from0', '--files-from=-', remote + '/', self.objdir]
p = subprocess.Popen(cmd, stdin=subprocess.PIPE)
p.communicate(input='\x00'.join(files))
self.checkout()
+ def cmd_checkout(self, args):
+ self.checkout(show_orphans=True)
def cmd_gc(self):
garbage = self.catalog_objects() - self.referenced_objects()
print('Unreferenced objects to remove: %d' % len(garbage))
@@ -189,24 +238,23 @@ class GitFat(object):
])
if __name__ == '__main__':
- if os.environ.get('GIT_FAT_VERBOSE'):
- global verbose
- verbose = verbose_stderr
fat = GitFat()
cmd = sys.argv[1] if len(sys.argv) > 1 else ''
if cmd == 'filter-clean':
fat.cmd_clean()
elif cmd == 'filter-smudge':
fat.cmd_smudge()
+ elif cmd == 'init':
+ fat.cmd_init()
elif cmd == 'status':
fat.cmd_status(sys.argv[2:])
elif cmd == 'push':
fat.cmd_push()
elif cmd == 'pull':
- fat.cmd_pull()
+ fat.cmd_pull(sys.argv[2:])
elif cmd == 'gc':
fat.cmd_gc()
- elif cmd == 'init':
- fat.cmd_init()
+ elif cmd == 'checkout':
+ fat.cmd_checkout(sys.argv[2:])
else:
- print('Usage: git fat [status|push|pull|gc|init]', file=sys.stderr)
+ print('Usage: git fat [init|status|push|pull|gc|checkout]', file=sys.stderr)