summary refs log tree commit diff
path: root/git-fat
diff options
context:
space:
mode:
authorJed Brown <jed@59A2.org>2013-01-21 15:40:29 -0600
committerJed Brown <jed@59A2.org>2013-01-21 15:40:29 -0600
commit952067b8abac81f44c8884d761e6548e7f5107d0 (patch)
tree5c482049d25f9b12bebb80c6f444a2ac02a20a21 /git-fat
parent45d76fc83c25145c4511977a153eebc5903a0622 (diff)
downloadgit-fat-952067b8abac81f44c8884d761e6548e7f5107d0.tar.gz
fat find: positively identify blobs, simplifying filter
git rev-list --objects has a second field for objects other than blobs (like trees), so they cannot be filtered out a priori.
Diffstat (limited to 'git-fat')
-rwxr-xr-xgit-fat24
1 file changed, 13 insertions(+), 11 deletions(-)
diff --git a/git-fat b/git-fat
index 80c2364..e177ce8 100755
--- a/git-fat
+++ b/git-fat
@@ -335,29 +335,31 @@ class GitFat(object):
def gen_large_blobs(self, revs, threshsize):
"""Build dict of all blobs"""
time0 = time.time()
- def keep_blobs(input, output):
- """The output of git rev-list --objects shows the path only for blobs, so we can filter on that criteria"""
- # Test with awk '{if (NF>1) print $1}' is 5-10% faster (and less code), but less portable
+ def hash_only(input, output):
+ """The output of git rev-list --objects shows extra info for blobs, subdirectory trees, and tags.
+ This truncates to one hash per line.
+ """
for line in input:
- if len(line) != 40:
- output.write(line[:40] + '\n')
+ output.write(line[:40] + '\n')
output.close()
revlist = subprocess.Popen(['git', 'rev-list', '--all', '--objects'], stdout=subprocess.PIPE, bufsize=-1)
- pblobcheck = subprocess.Popen(['git', 'cat-file', '--batch-check'], stdin=subprocess.PIPE, stdout=subprocess.PIPE, bufsize=-1)
- keepblobs = threading.Thread(target=keep_blobs, args=(revlist.stdout, pblobcheck.stdin))
- keepblobs.start()
+ objcheck = subprocess.Popen(['git', 'cat-file', '--batch-check'], stdin=subprocess.PIPE, stdout=subprocess.PIPE, bufsize=-1)
+ hashonly = threading.Thread(target=hash_only, args=(revlist.stdout, objcheck.stdin))
+ hashonly.start()
numblobs = 0; numlarge = 1
# Build dict with the sizes of all large blobs
- for line in pblobcheck.stdout:
+ for line in objcheck.stdout:
objhash, blob, size = line.split()
+ if blob != 'blob':
+ continue
size = int(size)
numblobs += 1
if size > threshsize:
numlarge += 1
yield objhash, size
revlist.wait()
- pblobcheck.wait()
- keepblobs.join()
+ objcheck.wait()
+ hashonly.join()
time1 = time.time()
self.verbose('%d of %d blobs are >= %d bytes [elapsed %.3fs]' % (numlarge, numblobs, threshsize, time1-time0))
def cmd_find(self, args):