author     James Murty <james@murty.co>  2014-05-25 22:41:05 +0100
committer  James Murty <james@murty.co>  2014-05-25 22:57:50 +0100
commit     a796facf6a73a726a1cd7af8d07e3872b587c401 (patch)
tree       25ebb7a4b1b33d5bdbe00837703e16c40428be06
parent     456d22cb3c7ba60e15d5d0f7de771c2acf573481 (diff)
download   git-fat-a796facf6a73a726a1cd7af8d07e3872b587c401.tar.gz
Improve performance when looking up referenced objects.
Avoid a cat-file subprocess call per fat-object blob by doing slightly uglier parsing of "cat-file --batch" output, which includes object content, instead of "cat-file --batch-check" output, which doesn't. In my ad-hoc testing on a reasonably sized repository (50k objects from rev-list --objects), this speeds up 'git fat status' by almost 40%.
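For context: "git cat-file --batch-check" prints only a "<sha1> <type> <size>" header line per requested object, while "git cat-file --batch" prints the same header followed by the object's raw content and a trailing newline, so a single long-lived process can serve every lookup. Below is a minimal standalone sketch of that framing, not the commit's code: Python 3, names of my own choosing, and lock-step request/reply instead of the writer thread the commit uses.

    import subprocess

    def iter_objects(rev='HEAD'):
        # Hypothetical helper, not part of git-fat: stream every object
        # reachable from `rev` through one 'git cat-file --batch' process,
        # sending a request and reading its reply one object at a time so
        # the pipes cannot deadlock.
        p1 = subprocess.Popen(['git', 'rev-list', '--objects', rev],
                              stdout=subprocess.PIPE)
        p2 = subprocess.Popen(['git', 'cat-file', '--batch'],
                              stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        for line in p1.stdout:
            p2.stdin.write(line.split()[0] + b'\n')   # send one sha1
            p2.stdin.flush()
            header = p2.stdout.readline()             # b'<sha1> <type> <size>\n'
            sha1, objtype, size = header.split()
            size = int(size)
            content = p2.stdout.read(size)            # exactly <size> bytes...
            p2.stdout.read(1)                         # ...plus one trailing '\n'
            yield sha1.decode(), objtype.decode(), content
        p2.stdin.close()
        p1.wait()
        p2.wait()

    # Example use: total bytes held in blobs reachable from HEAD.
    blob_bytes = sum(len(c) for _, t, c in iter_objects() if t == 'blob')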
-rwxr-xr-x  git-fat  21
1 file changed, 16 insertions, 5 deletions
diff --git a/git-fat b/git-fat
index 97315ea..6c5ee4d 100755
--- a/git-fat
+++ b/git-fat
@@ -288,21 +288,32 @@ class GitFat(object):
         elif rev is None:
             rev = self.revparse('HEAD')
         p1 = subprocess.Popen(['git','rev-list','--objects',rev], stdout=subprocess.PIPE)
-        p2 = subprocess.Popen(['git','cat-file','--batch-check'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+        p2 = subprocess.Popen(['git','cat-file','--batch'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
         def cut_sha1hash(input, output):
             for line in input:
                 output.write(line.split()[0] + '\n')
             output.close()
         cut_thread = threading.Thread(target=cut_sha1hash, args=(p1.stdout, p2.stdin))
         cut_thread.start()
-        for line in p2.stdout:
-            objhash, objtype, size = line.split()
-            if objtype == 'blob' and int(size) in self.magiclens:
+        while True:
+            line = p2.stdout.readline()
+            if not line:
+                break # EOF
+            objhash, objtype, size_str = line.split()
+            size = int(size_str)
+            if objtype == 'blob' and size in self.magiclens:
+                content = p2.stdout.readline()
                 try:
-                    fathash = self.decode(subprocess.check_output(['git', 'cat-file', '-p', objhash]))[0]
+                    fathash = self.decode(content)[0]
                     referenced.add(fathash)
                 except GitFat.DecodeError:
                     pass
+                consumed = len(content)
+            else:
+                consumed = 0
+            # Consume content
+            while consumed <= size:
+                consumed += len(p2.stdout.read(max(1, size-consumed)))
         cut_thread.join()
         p1.wait()
         p2.wait()
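Two details in the new loop are easy to miss. First, a single readline() is enough to capture a fat stub because the stubs git-fat stores are one short line each, and the candidate blobs have already been filtered by size against self.magiclens. Second, the drain loop uses "consumed <= size" rather than "<" because cat-file --batch appends a newline after each object's content, so one byte past size must be consumed before the next header line can be read.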