From a796facf6a73a726a1cd7af8d07e3872b587c401 Mon Sep 17 00:00:00 2001 From: James Murty Date: Sun, 25 May 2014 22:41:05 +0100 Subject: Improve performance when looking up referenced objects. Avoid a cat-file subprocess call per fat object blob by doing slightly uglier parsing of "cat-file --batch" that includes object content, instead of "cat-file --batch-check" that doesn't. In my ad-hoc testing on a reasonably sized repository (50k rev-list --objects) this speeds up 'git fat status' by almost 40%. --- git-fat | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'git-fat') diff --git a/git-fat b/git-fat index 97315ea..6c5ee4d 100755 --- a/git-fat +++ b/git-fat @@ -288,21 +288,32 @@ class GitFat(object): elif rev is None: rev = self.revparse('HEAD') p1 = subprocess.Popen(['git','rev-list','--objects',rev], stdout=subprocess.PIPE) - p2 = subprocess.Popen(['git','cat-file','--batch-check'], stdin=subprocess.PIPE, stdout=subprocess.PIPE) + p2 = subprocess.Popen(['git','cat-file','--batch'], stdin=subprocess.PIPE, stdout=subprocess.PIPE) def cut_sha1hash(input, output): for line in input: output.write(line.split()[0] + '\n') output.close() cut_thread = threading.Thread(target=cut_sha1hash, args=(p1.stdout, p2.stdin)) cut_thread.start() - for line in p2.stdout: - objhash, objtype, size = line.split() - if objtype == 'blob' and int(size) in self.magiclens: + while True: + line = p2.stdout.readline() + if not line: + break # EOF + objhash, objtype, size_str = line.split() + size = int(size_str) + if objtype == 'blob' and size in self.magiclens: + content = p2.stdout.readline() try: - fathash = self.decode(subprocess.check_output(['git', 'cat-file', '-p', objhash]))[0] + fathash = self.decode(content)[0] referenced.add(fathash) except GitFat.DecodeError: pass + consumed = len(content) + else: + consumed = 0 + # Consume content + while consumed <= size: + consumed += len(p2.stdout.read(max(1, size-consumed))) cut_thread.join() p1.wait() 
p2.wait() -- cgit v1.2.1 From c061c140f9bb4959a80692b8bf044c2f75610781 Mon Sep 17 00:00:00 2001 From: James Murty Date: Wed, 28 May 2014 22:41:15 +0100 Subject: More sophisticated use of cat-file bulk revision processing in two stages This change takes better advantage of the relative strengths of `cat-file --batch-check` and `cat-file --batch` and combines them. * Uses `cat-file --batch-check` to filter the set of all git objects in bulk, leaving only those that are candidate git-fat object references, based on the object being of type "blob" and of magic size(s) * Uses `cat-file --batch` to read the full contents of all the candidate objects in bulk, for fast processing of their data to find the actual git-fat references. --- git-fat | 54 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 18 deletions(-) (limited to 'git-fat') diff --git a/git-fat b/git-fat index 6c5ee4d..bb01580 100755 --- a/git-fat +++ b/git-fat @@ -287,36 +287,54 @@ class GitFat(object): rev = '--all' elif rev is None: rev = self.revparse('HEAD') + # Revision list gives us object names to inspect with cat-file... p1 = subprocess.Popen(['git','rev-list','--objects',rev], stdout=subprocess.PIPE) - p2 = subprocess.Popen(['git','cat-file','--batch'], stdin=subprocess.PIPE, stdout=subprocess.PIPE) def cut_sha1hash(input, output): for line in input: output.write(line.split()[0] + '\n') output.close() + # ...`cat-file --batch-check` filters for git-fat object candidates in bulk... 
+ p2 = subprocess.Popen(['git','cat-file','--batch-check'], stdin=subprocess.PIPE, stdout=subprocess.PIPE) + def filter_gitfat_candidates(input, output): + for line in input: + objhash, objtype, size = line.split() + if objtype == 'blob' and int(size) in self.magiclens: + output.write(objhash + '\n') + output.close() + # ...`cat-file --batch` provides full contents of git-fat candidates in bulk + p3 = subprocess.Popen(['git','cat-file','--batch'], stdin=subprocess.PIPE, stdout=subprocess.PIPE) + # Stream data from p1 -> p2 -> p3 in the background cut_thread = threading.Thread(target=cut_sha1hash, args=(p1.stdout, p2.stdin)) + filter_thread = threading.Thread(target=filter_gitfat_candidates, args=(p2.stdout, p3.stdin)) cut_thread.start() + filter_thread.start() + # Process metadata + content format provided by `cat-file --batch` while True: - line = p2.stdout.readline() - if not line: + metadata_line = p3.stdout.readline() + if not metadata_line: break # EOF - objhash, objtype, size_str = line.split() - size = int(size_str) - if objtype == 'blob' and size in self.magiclens: - content = p2.stdout.readline() - try: - fathash = self.decode(content)[0] - referenced.add(fathash) - except GitFat.DecodeError: - pass - consumed = len(content) - else: - consumed = 0 - # Consume content - while consumed <= size: - consumed += len(p2.stdout.read(max(1, size-consumed))) + objhash, objtype, size_str = metadata_line.split() + size, bytes_read = int(size_str), 0 + # We know from filter that item is a candidate git-fat object and + # is small enough to read into memory and process + content = '' + while bytes_read < size: + content += p3.stdout.read(size - bytes_read) + bytes_read = len(content) + try: + fathash = self.decode(content)[0] + referenced.add(fathash) + except GitFat.DecodeError: + pass + # Consume LF record delimiter in `cat-file --batch` output + while not p3.stdout.read(1): + pass + # Ensure everything is cleaned up cut_thread.join() + filter_thread.join() p1.wait() 
p2.wait() + p3.wait() return referenced def orphan_files(self, patterns=[]): -- cgit v1.2.1 From ad8d77fe8479ddbbd752c3162fec28081e49cfc4 Mon Sep 17 00:00:00 2001 From: Jed Brown Date: Mon, 2 Jun 2014 22:20:28 +0200 Subject: referenced_objects: make comment more precise --- git-fat | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'git-fat') diff --git a/git-fat b/git-fat index bb01580..f29b1f6 100755 --- a/git-fat +++ b/git-fat @@ -303,7 +303,7 @@ class GitFat(object): output.close() # ...`cat-file --batch` provides full contents of git-fat candidates in bulk p3 = subprocess.Popen(['git','cat-file','--batch'], stdin=subprocess.PIPE, stdout=subprocess.PIPE) - # Stream data from p1 -> p2 -> p3 in the background + # Stream data: p1 | cut_thread | p2 | filter_thread | p3 cut_thread = threading.Thread(target=cut_sha1hash, args=(p1.stdout, p2.stdin)) filter_thread = threading.Thread(target=filter_gitfat_candidates, args=(p2.stdout, p3.stdin)) cut_thread.start() -- cgit v1.2.1