From a796facf6a73a726a1cd7af8d07e3872b587c401 Mon Sep 17 00:00:00 2001
From: James Murty <james@murty.co>
Date: Sun, 25 May 2014 22:41:05 +0100
Subject: Improve performance when looking up referenced objects.

Avoid a cat-file subprocess call per fat object blob by doing slightly
uglier parsing of "cat-file --batch" that includes object content,
instead of "cat-file --batch-check" that doesn't.

In my ad-hoc testing on a reasonably sized repository
(50k objects from `rev-list --objects`) this speeds up 'git fat status' by almost 40%.
---
 git-fat | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

(limited to 'git-fat')

diff --git a/git-fat b/git-fat
index 97315ea..6c5ee4d 100755
--- a/git-fat
+++ b/git-fat
@@ -288,21 +288,32 @@ class GitFat(object):
         elif rev is None:
             rev = self.revparse('HEAD')
         p1 = subprocess.Popen(['git','rev-list','--objects',rev], stdout=subprocess.PIPE)
-        p2 = subprocess.Popen(['git','cat-file','--batch-check'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+        p2 = subprocess.Popen(['git','cat-file','--batch'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
         def cut_sha1hash(input, output):
             for line in input:
                 output.write(line.split()[0] + '\n')
             output.close()
         cut_thread = threading.Thread(target=cut_sha1hash, args=(p1.stdout, p2.stdin))
         cut_thread.start()
-        for line in p2.stdout:
-            objhash, objtype, size = line.split()
-            if objtype == 'blob' and int(size) in self.magiclens:
+        while True:
+            line = p2.stdout.readline()
+            if not line:
+                break  # EOF
+            objhash, objtype, size_str = line.split()
+            size = int(size_str)
+            if objtype == 'blob' and size in self.magiclens:
+                content = p2.stdout.readline()
                 try:
-                    fathash = self.decode(subprocess.check_output(['git', 'cat-file', '-p', objhash]))[0]
+                    fathash = self.decode(content)[0]
                     referenced.add(fathash)
                 except GitFat.DecodeError:
                     pass
+                consumed = len(content)
+            else:
+                consumed = 0
+            # Consume content
+            while consumed <= size:
+                consumed += len(p2.stdout.read(max(1, size-consumed)))
         cut_thread.join()
         p1.wait()
         p2.wait()
-- 
cgit v1.2.1


From c061c140f9bb4959a80692b8bf044c2f75610781 Mon Sep 17 00:00:00 2001
From: James Murty <james@murty.co>
Date: Wed, 28 May 2014 22:41:15 +0100
Subject: More sophisticated use of cat-file bulk revision processing in two
 stages

This change takes better advantage of the relative strengths of
`cat-file --batch-check` and `cat-file --batch` and combines them.

* Uses `cat-file --batch-check` to filter the set of all git objects in
  bulk, leaving only those that are candidate git-fat object references,
  i.e. objects of type "blob" whose size is one of the magic lengths.
* Uses `cat-file --batch` to read the full contents of all the candidate
  objects in bulk, for fast processing of their data to find the actual
  git-fat references.
---
 git-fat | 54 ++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 36 insertions(+), 18 deletions(-)

(limited to 'git-fat')

diff --git a/git-fat b/git-fat
index 6c5ee4d..bb01580 100755
--- a/git-fat
+++ b/git-fat
@@ -287,36 +287,54 @@ class GitFat(object):
             rev = '--all'
         elif rev is None:
             rev = self.revparse('HEAD')
+        # Revision list gives us object names to inspect with cat-file...
         p1 = subprocess.Popen(['git','rev-list','--objects',rev], stdout=subprocess.PIPE)
-        p2 = subprocess.Popen(['git','cat-file','--batch'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
         def cut_sha1hash(input, output):
             for line in input:
                 output.write(line.split()[0] + '\n')
             output.close()
+        # ...`cat-file --batch-check` filters for git-fat object candidates in bulk...
+        p2 = subprocess.Popen(['git','cat-file','--batch-check'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+        def filter_gitfat_candidates(input, output):
+            for line in input:
+                objhash, objtype, size = line.split()
+                if objtype == 'blob' and int(size) in self.magiclens:
+                    output.write(objhash + '\n')
+            output.close()
+        # ...`cat-file --batch` provides full contents of git-fat candidates in bulk
+        p3 = subprocess.Popen(['git','cat-file','--batch'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+        # Stream data from p1 -> p2 -> p3 in the background
         cut_thread = threading.Thread(target=cut_sha1hash, args=(p1.stdout, p2.stdin))
+        filter_thread = threading.Thread(target=filter_gitfat_candidates, args=(p2.stdout, p3.stdin))
         cut_thread.start()
+        filter_thread.start()
+        # Process metadata + content format provided by `cat-file --batch`
         while True:
-            line = p2.stdout.readline()
-            if not line:
+            metadata_line = p3.stdout.readline()
+            if not metadata_line:
                 break  # EOF
-            objhash, objtype, size_str = line.split()
-            size = int(size_str)
-            if objtype == 'blob' and size in self.magiclens:
-                content = p2.stdout.readline()
-                try:
-                    fathash = self.decode(content)[0]
-                    referenced.add(fathash)
-                except GitFat.DecodeError:
-                    pass
-                consumed = len(content)
-            else:
-                consumed = 0
-            # Consume content
-            while consumed <= size:
-                consumed += len(p2.stdout.read(max(1, size-consumed)))
+            objhash, objtype, size_str = metadata_line.split()
+            size, bytes_read = int(size_str), 0
+            # We know from filter that item is a candidate git-fat object and
+            # is small enough to read into memory and process
+            content = ''
+            while bytes_read < size:
+                content += p3.stdout.read(size - bytes_read)
+                bytes_read = len(content)
+            try:
+                fathash = self.decode(content)[0]
+                referenced.add(fathash)
+            except GitFat.DecodeError:
+                pass
+            # Consume LF record delimiter in `cat-file --batch` output
+            while not p3.stdout.read(1):
+                pass
+        # Ensure everything is cleaned up
         cut_thread.join()
+        filter_thread.join()
         p1.wait()
         p2.wait()
+        p3.wait()
         return referenced
 
     def orphan_files(self, patterns=[]):
-- 
cgit v1.2.1


From ad8d77fe8479ddbbd752c3162fec28081e49cfc4 Mon Sep 17 00:00:00 2001
From: Jed Brown <jed@59A2.org>
Date: Mon, 2 Jun 2014 22:20:28 +0200
Subject: referenced_objects: make comment more precise

---
 git-fat | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'git-fat')

diff --git a/git-fat b/git-fat
index bb01580..f29b1f6 100755
--- a/git-fat
+++ b/git-fat
@@ -303,7 +303,7 @@ class GitFat(object):
             output.close()
         # ...`cat-file --batch` provides full contents of git-fat candidates in bulk
         p3 = subprocess.Popen(['git','cat-file','--batch'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
-        # Stream data from p1 -> p2 -> p3 in the background
+        # Stream data: p1 | cut_thread | p2 | filter_thread | p3
         cut_thread = threading.Thread(target=cut_sha1hash, args=(p1.stdout, p2.stdin))
         filter_thread = threading.Thread(target=filter_gitfat_candidates, args=(p2.stdout, p3.stdin))
         cut_thread.start()
-- 
cgit v1.2.1