author     James Murty <james@murty.co>    2014-05-28 22:41:15 +0100
committer  James Murty <james@murty.co>    2014-05-28 22:41:15 +0100
commit     c061c140f9bb4959a80692b8bf044c2f75610781 (patch)
tree       327150dbe57c7dd849dbcfc883526c3f4ca38d8f
parent     a796facf6a73a726a1cd7af8d07e3872b587c401 (diff)
download   git-fat-c061c140f9bb4959a80692b8bf044c2f75610781.tar.gz
More sophisticated use of cat-file bulk revision processing in two stages
This change takes better advantage of the relative strengths of `cat-file --batch-check` and `cat-file --batch` by combining them:

* Uses `cat-file --batch-check` to filter the set of all git objects in bulk, leaving only the candidate git-fat object references: objects of type "blob" whose size matches one of the magic git-fat stub lengths.
* Uses `cat-file --batch` to read the full contents of all the candidate objects in bulk, for fast processing of their data to find the actual git-fat references.

A standalone sketch of this pipeline follows the diffstat below.
-rwxr-xr-x  git-fat  54
1 files changed, 36 insertions, 18 deletions
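
To make the two-stage flow concrete, here is a minimal standalone sketch of the same pipeline. It is an illustration only, not git-fat's actual code: MAGIC_LENS is a hypothetical stand-in for self.magiclens (74 is an assumed stub length), and it follows the Python 2 style of the codebase. Each `cat-file --batch` record is a header line "<sha1> <type> <size>", then <size> bytes of content, then an LF delimiter.

#!/usr/bin/env python
# Sketch of the rev-list -> batch-check -> batch pipeline.
# MAGIC_LENS below is a hypothetical stand-in for self.magiclens.
import subprocess
import threading

MAGIC_LENS = set([74])  # assumed git-fat stub length

p1 = subprocess.Popen(['git', 'rev-list', '--objects', '--all'],
                      stdout=subprocess.PIPE)
p2 = subprocess.Popen(['git', 'cat-file', '--batch-check'],
                      stdin=subprocess.PIPE, stdout=subprocess.PIPE)
p3 = subprocess.Popen(['git', 'cat-file', '--batch'],
                      stdin=subprocess.PIPE, stdout=subprocess.PIPE)

def feed_names():
    # Stage 1: cut rev-list output down to bare object names.
    for line in p1.stdout:
        p2.stdin.write(line.split()[0] + '\n')
    p2.stdin.close()

def feed_candidates():
    # Stage 2: forward only blobs whose size matches a magic length.
    for line in p2.stdout:
        objhash, objtype, size = line.split()
        if objtype == 'blob' and int(size) in MAGIC_LENS:
            p3.stdin.write(objhash + '\n')
    p3.stdin.close()

# Feeder stages run in background threads so all three pipes drain
# concurrently; a full OS pipe buffer would otherwise deadlock.
threads = [threading.Thread(target=f) for f in (feed_names, feed_candidates)]
for t in threads:
    t.start()

# Stage 3: parse each "<sha1> <type> <size>\n<contents>\n" record.
while True:
    header = p3.stdout.readline()
    if not header:
        break  # EOF
    objhash, objtype, size_str = header.split()
    size, content = int(size_str), ''
    while len(content) < size:
        content += p3.stdout.read(size - len(content))
    p3.stdout.read(1)  # consume the LF record delimiter
    print objhash, repr(content)

for t in threads:
    t.join()
for p in (p1, p2, p3):
    p.wait()

The background threads here mirror the patch's cut_thread and filter_thread; without them, git could block writing output on one pipe while the script blocks writing input on another.
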
diff --git a/git-fat b/git-fat
index 6c5ee4d..bb01580 100755
--- a/git-fat
+++ b/git-fat
@@ -287,36 +287,54 @@ class GitFat(object):
             rev = '--all'
         elif rev is None:
             rev = self.revparse('HEAD')
+        # Revision list gives us object names to inspect with cat-file...
         p1 = subprocess.Popen(['git','rev-list','--objects',rev], stdout=subprocess.PIPE)
-        p2 = subprocess.Popen(['git','cat-file','--batch'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
         def cut_sha1hash(input, output):
             for line in input:
                 output.write(line.split()[0] + '\n')
             output.close()
+        # ...`cat-file --batch-check` filters for git-fat object candidates in bulk...
+        p2 = subprocess.Popen(['git','cat-file','--batch-check'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+        def filter_gitfat_candidates(input, output):
+            for line in input:
+                objhash, objtype, size = line.split()
+                if objtype == 'blob' and int(size) in self.magiclens:
+                    output.write(objhash + '\n')
+            output.close()
+        # ...`cat-file --batch` provides full contents of git-fat candidates in bulk
+        p3 = subprocess.Popen(['git','cat-file','--batch'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+        # Stream data from p1 -> p2 -> p3 in the background
         cut_thread = threading.Thread(target=cut_sha1hash, args=(p1.stdout, p2.stdin))
+        filter_thread = threading.Thread(target=filter_gitfat_candidates, args=(p2.stdout, p3.stdin))
         cut_thread.start()
+        filter_thread.start()
+        # Process metadata + content format provided by `cat-file --batch`
         while True:
-            line = p2.stdout.readline()
-            if not line:
+            metadata_line = p3.stdout.readline()
+            if not metadata_line:
                 break # EOF
-            objhash, objtype, size_str = line.split()
-            size = int(size_str)
-            if objtype == 'blob' and size in self.magiclens:
-                content = p2.stdout.readline()
-                try:
-                    fathash = self.decode(content)[0]
-                    referenced.add(fathash)
-                except GitFat.DecodeError:
-                    pass
-                consumed = len(content)
-            else:
-                consumed = 0
-            # Consume content
-            while consumed <= size:
-                consumed += len(p2.stdout.read(max(1, size-consumed)))
+            objhash, objtype, size_str = metadata_line.split()
+            size, bytes_read = int(size_str), 0
+            # We know from filter that item is a candidate git-fat object and
+            # is small enough to read into memory and process
+            content = ''
+            while bytes_read < size:
+                content += p3.stdout.read(size - bytes_read)
+                bytes_read = len(content)
+            try:
+                fathash = self.decode(content)[0]
+                referenced.add(fathash)
+            except GitFat.DecodeError:
+                pass
+            # Consume LF record delimiter in `cat-file --batch` output
+            while not p3.stdout.read(1):
+                pass
+        # Ensure everything is cleaned up
         cut_thread.join()
+        filter_thread.join()
         p1.wait()
         p2.wait()
+        p3.wait()
         return referenced
 
     def orphan_files(self, patterns=[]):