diff options
author | Jed Brown <jed@59A2.org> | 2012-11-25 23:21:45 +0100 |
---|---|---|
committer | Jed Brown <jed@59A2.org> | 2012-11-25 23:21:45 +0100 |
commit | 40996c304ba56aad88cac0b9de02a384adb35d3f (patch) | |
tree | 56845fffc0da164f99f45b05fe8cfac7231bbc52 | |
parent | d5f924d9f040a49c0d0114b8ef03c6ec3a96bd9c (diff) | |
download | git-fat-40996c304ba56aad88cac0b9de02a384adb35d3f.tar.gz |
Extend worked example and make several refinements
* Verbosity control
* Automatically update working tree
* Identify orphan files in filter-clean and pass through so they don't
show up as spurious diffs.
-rw-r--r-- | README.md | 106 | ||||
-rwxr-xr-x | git-fat | 116 |
2 files changed, 169 insertions, 53 deletions
@@ -1,17 +1,19 @@ # Installation and configuration Place `git-fat` in your `PATH`. -To use `git-fat` edit `.gitattributes` to regard any desired extensions -as fat files, e.g. +Edit `.gitattributes` to regard any desired extensions as fat files. + $ cat >> .gitattributes *.png filter=fat -crlf *.jpg filter=fat -crlf *.gz filter=fat -crlf + ^D -then run `git fat init` to active the extension. Now add and commit as -usual, all matched files will not go in `.git/objects`, but will appear -complete in the working tree. To set a remote store for the fat objects, -edit `.gitfat` +Run `git fat init` to active the extension. Now add and commit as usual. +Matched files will be transparently stored externally, but will appear +complete in the working tree. + +Set a remote store for the fat objects by editing `.gitfat`. [rsync] remote = your.remote-host.org:/share/fat-store @@ -23,13 +25,20 @@ remote ssh in a directory with shared access. # A worked example - $ export GIT_FAT_VERBOSE=1 # Show more verbose information about what is happening +Before we start, let's turn on verbose reporting so we can see what's +happening. Without this environment variable, all the output lines +starting with `git-fat` will not be shown. + + $ export GIT_FAT_VERBOSE=1 + +First, we create a repository and configure it for use with `git-fat`. + $ git init repo Initialized empty Git repository in /tmp/repo/.git/ $ cd repo $ git fat init $ cat > .gitfat - [rsync] + [rsync] remote = localhost:/tmp/fat-store $ mkdir -p /tmp/fat-store # make sure the remote directory exists $ echo '*.gz filter=fat -crlf' > .gitattributes @@ -39,6 +48,9 @@ remote ssh in a directory with shared access. 2 files changed, 3 insertions(+) create mode 100644 .gitattributes create mode 100644 .gitfat + +Now we add a binary file whose name matches the pattern we set in `.gitattributes`. 
+ $ curl https://nodeload.github.com/jedbrown/git-fat/tar.gz/master -o master.tar.gz % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed @@ -50,16 +62,34 @@ remote ssh in a directory with shared access. git-fat filter-clean: caching to /tmp/repo/.git/fat/objects/b3489819f81603b4c04e8ed134b80bace0810324 1 file changed, 1 insertion(+) create mode 100644 master.tar.gz - $ du -s .git/objects .git/fat + +The patch itself is very simple and does not include the binary. + + $ git show --pretty=oneline HEAD + 918063043a6156172c2ad66478c6edd5c7df0217 Add master.tar.gz + diff --git a/master.tar.gz b/master.tar.gz + new file mode 100644 + index 0000000..12f7d52 + --- /dev/null + +++ b/master.tar.gz + @@ -0,0 +1 @@ + +#$# git-fat 1f218834a137f7b185b498924e7a030008aee2ae + +## Pushing fat files +Now let's push our fat files using the rsync configuration that we set up earlier. + $ git fat push Pushing to localhost:/tmp/fat-store - building file list ... + building file list ... 1 file to consider - + sent 61 bytes received 12 bytes 48.67 bytes/sec total size is 6449 speedup is 88.34 -We could now push to a remote +We we might normally set a remote now and push the git repository. + +## Cloning and pulling +Now let's look at what happens when we clone. $ cd .. $ git clone repo repo2 @@ -67,28 +97,66 @@ We could now push to a remote done. $ cd repo2 $ git fat init # don't forget + $ ls -l # file is just a placeholder + total 4 + -rw-r--r-- 1 jed users 53 Nov 25 22:42 master.tar.gz + $ cat master.tar.gz # holds the SHA1 of the file + #$# git-fat 1f218834a137f7b185b498924e7a030008aee2ae + +We can always get a summary of what fat objects are missing in our local cache. + + Orphan objects: + 1f218834a137f7b185b498924e7a030008aee2ae + +Now get any objects referenced by our current `HEAD`. This command also +accepts the `--all` option to pull full history, or a revspec to pull +selected history. + $ git fat pull - receiving file list ... 
+ receiving file list ... 1 file to consider 1f218834a137f7b185b498924e7a030008aee2ae 6449 100% 6.15MB/s 0:00:00 (xfer#1, to-check=0/1) sent 30 bytes received 6558 bytes 4392.00 bytes/sec total size is 6449 speedup is 0.98 - $ cat master.tar.gz # we should checkout automatically - #$# git-fat 1f218834a137f7b185b498924e7a030008aee2ae - $ git checkout -f . - git-fat filter-clean: caching to /tmp/repo2/.git/fat/objects/b7939480ed4e54109f8f82d43e46a39e144ecad1 + Restoring 1f218834a137f7b185b498924e7a030008aee2ae -> master.tar.gz git-fat filter-smudge: restoring from /tmp/repo2/.git/fat/objects/1f218834a137f7b185b498924e7a030008aee2ae + +Everything is in place + + $ git status + git-fat filter-clean: caching to /tmp/repo2/.git/fat/objects/1f218834a137f7b185b498924e7a030008aee2ae + # On branch master + nothing to commit, working directory clean $ ls -l # recovered the full file total 8 -rw-r--r-- 1 jed users 6449 Nov 25 17:10 master.tar.gz -# Important refinements +## Summary +* Set the "fat" file types in `.gitattributes`. +* Use normal git commands to interact with the repository without + thinking about what files are fat and non-fat. The fat files will be + treated specially. +* Synchronize fat files with `git fat push` and `git fat pull`. + +## Implementation notes +The actual binary files are stored in `.git/fat/objects`, leaving `.git/objects` nice and small. + + $ du -bs .git/objects + 2212 .git/objects/ + $ ls -l .git/fat/objects # This is where the file actually goes, but that's not important + total 8 + -rw------- 1 jed users 6449 Nov 25 17:01 1f218834a137f7b185b498924e7a030008aee2ae + +# Some refinements +* Allow pulling and pushing only select files +* Relate orphan objects to file system * Put some more useful message in smudged (working tree) version of missing files. -* Make +* More friendly configuration for multiple fat remotes * Make commands safer in presence of a dirty tree. * Private setting of a different remote. 
* Gracefully handle unmanaged files when the filter is called (either legacy files or files matching the pattern that should some reason not be treated as fat). +* Don't append the [filter "fat"] section to .git/config if it doesn't already exist. @@ -1,6 +1,6 @@ #!/usr/bin/env python -from __future__ import print_function +from __future__ import print_function, with_statement import sys import hashlib @@ -8,6 +8,7 @@ import tempfile import os import subprocess import shlex +import shutil BLOCK_SIZE = 4096 @@ -15,10 +16,7 @@ def verbose_stderr(*args, **kwargs): return print(*args, file=sys.stderr, **kwargs) def verbose_ignore(*args, **kwargs): pass -verbose = verbose_ignore -def gitroot(): - return subprocess.check_output('git rev-parse --show-toplevel'.split()).strip() def mkdir_p(path): import errno try: @@ -45,7 +43,8 @@ def cat(instream, outstream): class GitFat(object): DecodeError = RuntimeError def __init__(self): - self.gitroot = gitroot() + self.verbose = verbose_stderr if os.environ.get('GIT_FAT_VERBOSE') else verbose_ignore + self.gitroot = subprocess.check_output('git rev-parse --show-toplevel'.split()).strip() self.objdir = os.path.join(self.gitroot, '.git', 'fat', 'objects') self.magiclen = len(self.encode(hashlib.sha1('dummy').hexdigest())) def setup(self): @@ -62,12 +61,16 @@ class GitFat(object): return remote except ConfigParser.NoSectionError: raise RuntimeError('No rsync.remote in %s' % cfgpath) + def revparse(self, revname): + return subprocess.check_output(['git', 'rev-parse', revname]).strip() def encode(self, digest): return '#$# git-fat %s\n' % digest - def decode(self, string): + def decode(self, string, noraise=False): cookie = '#$# git-fat ' if string.startswith(cookie): return string[len(cookie):].split()[0] + elif noraise: + return None else: raise GitFat.DecodeError('Could not decode %s' % (string)) def decode_stream(self, stream): @@ -94,38 +97,54 @@ class GitFat(object): h = hashlib.new('sha1') fd, tmpname = 
tempfile.mkstemp(dir=self.objdir) try: + ishanging = False + cached = False # changes to True when file is cached, means we with os.fdopen(fd, 'w') as cache: - for block in readblocks(sys.stdin): + outstream = cache + blockiter = readblocks(sys.stdin) + # Check whether this file is hanging + block = next(blockiter) + if self.decode(block[0:self.magiclen], noraise=True): + ishanging = True + outstream = sys.stdout + h.update(block) + outstream.write(block) + for block in blockiter: h.update(block) - cache.write(block) + outstream.write(block) + outstream.flush() digest = h.hexdigest() objfile = os.path.join(self.objdir, digest) - os.rename(tmpname, objfile) - verbose('git-fat filter-clean: caching to %s' % objfile) - sys.stdout.write(self.encode(digest)) - sys.stdout.flush() - except: - raise - #os.remove(tmpname) + if not ishanging: + os.rename(tmpname, objfile) + cached = True + self.verbose('git-fat filter-clean: caching to %s' % objfile) + sys.stdout.write(self.encode(digest)) + finally: + if not cached: + os.remove(tmpname) + def cmd_smudge(self): self.setup() result = self.decode_stream(sys.stdin) if isinstance(result, str): # We got a digest objfile = os.path.join(self.objdir, result) - verbose('git-fat filter-smudge: restoring from %s' % objfile) + self.verbose('git-fat filter-smudge: restoring from %s' % objfile) try: cat(open(objfile), sys.stdout) except: sys.stdout.write(self.encode(result)) # could leave a better notice about how to recover this file else: # We have an iterable over the original input. 
- verbose('git-fat filter-smudge: not a managed file') + self.verbose('git-fat filter-smudge: not a managed file') cat(result, sys.stdout) def catalog_objects(self): return set(os.listdir(self.objdir)) - def referenced_objects(self, rev=None): + def referenced_objects(self, rev=None, all=False): referenced = set() - if rev is None: + if all: rev = '--all' + elif rev is None: + rev = self.revparse('HEAD') p1 = subprocess.Popen(['git','rev-list','--objects',rev], stdout=subprocess.PIPE) p2 = subprocess.Popen(['git','cat-file','--batch-check'], stdin=subprocess.PIPE, stdout=subprocess.PIPE) for line in p1.communicate()[0].splitlines(): @@ -136,17 +155,26 @@ class GitFat(object): fathash = self.decode(subprocess.check_output(['git', 'cat-file', '-p', objhash])) referenced.add(fathash) return referenced + def orphan_files(self): + 'generator for all orphan placeholders in the working tree' + for fname in subprocess.check_output(['git', 'ls-files']).splitlines(): + digest = self.decode_file(fname) + if digest: + yield (digest, fname) def cmd_status(self, args): self.setup() catalog = self.catalog_objects() - referenced = self.referenced_objects() + refargs = dict() + if '--all' in args: + refargs['all'] = True + referenced = self.referenced_objects(**refargs) garbage = catalog - referenced orphans = referenced - catalog if '--all' in args: for obj in referenced: print(obj) if orphans: - print('Orphan files:') + print('Orphan objects:') for orph in orphans: print(' ' + orph) if garbage: @@ -156,23 +184,44 @@ class GitFat(object): def cmd_push(self): 'Push anything that I have stored and referenced' self.setup() - files = self.referenced_objects() & self.catalog_objects() + # Pushing *all* objects because it's safer. Could implement partial push. 
+ files = self.referenced_objects(all=True) & self.catalog_objects() remote = self.get_rsync() - verbose('Pushing to %s' % (remote)) + self.verbose('Pushing to %s' % (remote)) cmd = ['rsync', '--progress', '--ignore-existing', '--from0', '--files-from=-', self.objdir + '/', remote] p = subprocess.Popen(cmd, stdin=subprocess.PIPE) p.communicate(input='\x00'.join(files)) - def checkout(self): - files = subprocess.check_output(['git', 'ls-files', '']) - def cmd_pull(self): + def checkout(self, show_orphans=False): + 'Update any stale files in the present working tree' + orphans = [] + for digest, fname in self.orphan_files(): + objpath = os.path.join(self.objdir, digest) + if os.access(objpath, os.R_OK): + print('Restoring %s -> %s' % (digest, fname)) + shutil.copy(objpath, fname) + elif show_orphans: + print('Data unavailable: %s %s' % (digest,fname)) + subprocess.call(['git', 'checkout', '.']) + def cmd_pull(self, args): 'Pull anything that I have referenced, but not stored' self.setup() - files = self.referenced_objects() - self.catalog_objects() + refargs = dict() + if '--all' in args: + refargs['all'] = True + for arg in args: + if arg.startswith('-') or len(arg) != 40: + continue + rev = self.revparse(arg) + if rev: + refargs['rev'] = rev + files = self.referenced_objects(**refargs) - self.catalog_objects() remote = self.get_rsync() cmd = ['rsync', '--progress', '--ignore-existing', '--from0', '--files-from=-', remote + '/', self.objdir] p = subprocess.Popen(cmd, stdin=subprocess.PIPE) p.communicate(input='\x00'.join(files)) self.checkout() + def cmd_checkout(self, args): + self.checkout(show_orphans=True) def cmd_gc(self): garbage = self.catalog_objects() - self.referenced_objects() print('Unreferenced objects to remove: %d' % len(garbage)) @@ -189,24 +238,23 @@ class GitFat(object): ]) if __name__ == '__main__': - if os.environ.get('GIT_FAT_VERBOSE'): - global verbose - verbose = verbose_stderr fat = GitFat() cmd = sys.argv[1] if len(sys.argv) > 1 else '' if 
cmd == 'filter-clean': fat.cmd_clean() elif cmd == 'filter-smudge': fat.cmd_smudge() + elif cmd == 'init': + fat.cmd_init() elif cmd == 'status': fat.cmd_status(sys.argv[2:]) elif cmd == 'push': fat.cmd_push() elif cmd == 'pull': - fat.cmd_pull() + fat.cmd_pull(sys.argv[2:]) elif cmd == 'gc': fat.cmd_gc() - elif cmd == 'init': - fat.cmd_init() + elif cmd == 'checkout': + fat.cmd_checkout(sys.argv[2:]) else: - print('Usage: git fat [status|push|pull|gc|init]', file=sys.stderr) + print('Usage: git fat [init|status|push|pull|gc|checkout]', file=sys.stderr) |