summary refs log tree commit diff
path: root/git-fat
diff options
context:
space:
mode:
authorJed Brown <jed@59A2.org>2013-01-21 15:40:29 -0600
committerJed Brown <jed@59A2.org>2013-01-21 15:40:29 -0600
commit952067b8abac81f44c8884d761e6548e7f5107d0 (patch)
tree5c482049d25f9b12bebb80c6f444a2ac02a20a21 /git-fat
parent45d76fc83c25145c4511977a153eebc5903a0622 (diff)
downloadgit-fat-952067b8abac81f44c8884d761e6548e7f5107d0.tar.gz
fat find: positively identify blobs, simplifying filter
git rev-list --objects has a second field for objects other than blobs (like trees), so they cannot be filtered out a priori.
Diffstat (limited to 'git-fat')
-rwxr-xr-xgit-fat24
1 file changed, 13 insertions(+), 11 deletions(-)
diff --git a/git-fat b/git-fat
index 80c2364..e177ce8 100755
--- a/git-fat
+++ b/git-fat
@@ -335,29 +335,31 @@ class GitFat(object):
def gen_large_blobs(self, revs, threshsize):
"""Build dict of all blobs"""
time0 = time.time()
- def keep_blobs(input, output):
- """The output of git rev-list --objects shows the path only for blobs, so we can filter on that criteria"""
- # Test with awk '{if (NF>1) print $1}' is 5-10% faster (and less code), but less portable
+ def hash_only(input, output):
+ """The output of git rev-list --objects shows extra info for blobs, subdirectory trees, and tags.
+ This truncates to one hash per line.
+ """
for line in input:
- if len(line) != 40:
- output.write(line[:40] + '\n')
+ output.write(line[:40] + '\n')
output.close()
revlist = subprocess.Popen(['git', 'rev-list', '--all', '--objects'], stdout=subprocess.PIPE, bufsize=-1)
- pblobcheck = subprocess.Popen(['git', 'cat-file', '--batch-check'], stdin=subprocess.PIPE, stdout=subprocess.PIPE, bufsize=-1)
- keepblobs = threading.Thread(target=keep_blobs, args=(revlist.stdout, pblobcheck.stdin))
- keepblobs.start()
+ objcheck = subprocess.Popen(['git', 'cat-file', '--batch-check'], stdin=subprocess.PIPE, stdout=subprocess.PIPE, bufsize=-1)
+ hashonly = threading.Thread(target=hash_only, args=(revlist.stdout, objcheck.stdin))
+ hashonly.start()
numblobs = 0; numlarge = 1
# Build dict with the sizes of all large blobs
- for line in pblobcheck.stdout:
+ for line in objcheck.stdout:
objhash, blob, size = line.split()
+ if blob != 'blob':
+ continue
size = int(size)
numblobs += 1
if size > threshsize:
numlarge += 1
yield objhash, size
revlist.wait()
- pblobcheck.wait()
- keepblobs.join()
+ objcheck.wait()
+ hashonly.join()
time1 = time.time()
self.verbose('%d of %d blobs are >= %d bytes [elapsed %.3fs]' % (numlarge, numblobs, threshsize, time1-time0))
def cmd_find(self, args):