diff options
author | Jed Brown <jed@59A2.org> | 2013-01-21 15:40:29 -0600 |
---|---|---|
committer | Jed Brown <jed@59A2.org> | 2013-01-21 15:40:29 -0600 |
commit | 952067b8abac81f44c8884d761e6548e7f5107d0 (patch) | |
tree | 5c482049d25f9b12bebb80c6f444a2ac02a20a21 /git-fat | |
parent | 45d76fc83c25145c4511977a153eebc5903a0622 (diff) | |
download | git-fat-952067b8abac81f44c8884d761e6548e7f5107d0.tar.gz |
fat find: positively identify blobs, simplifying filter
git rev-list --objects has a second field for objects other than
blobs (like trees), so they cannot be filtered out a priori.
Diffstat (limited to 'git-fat')
-rwxr-xr-x | git-fat | 24 |
1 files changed, 13 insertions, 11 deletions
```diff
@@ -335,29 +335,31 @@ class GitFat(object):
     def gen_large_blobs(self, revs, threshsize):
         """Build dict of all blobs"""
         time0 = time.time()
-        def keep_blobs(input, output):
-            """The output of git rev-list --objects shows the path only for blobs, so we can filter on that criteria"""
-            # Test with awk '{if (NF>1) print $1}' is 5-10% faster (and less code), but less portable
+        def hash_only(input, output):
+            """The output of git rev-list --objects shows extra info for blobs, subdirectory trees, and tags.
+            This truncates to one hash per line.
+            """
             for line in input:
-                if len(line) != 40:
-                    output.write(line[:40] + '\n')
+                output.write(line[:40] + '\n')
             output.close()
         revlist = subprocess.Popen(['git', 'rev-list', '--all', '--objects'], stdout=subprocess.PIPE, bufsize=-1)
-        pblobcheck = subprocess.Popen(['git', 'cat-file', '--batch-check'], stdin=subprocess.PIPE, stdout=subprocess.PIPE, bufsize=-1)
-        keepblobs = threading.Thread(target=keep_blobs, args=(revlist.stdout, pblobcheck.stdin))
-        keepblobs.start()
+        objcheck = subprocess.Popen(['git', 'cat-file', '--batch-check'], stdin=subprocess.PIPE, stdout=subprocess.PIPE, bufsize=-1)
+        hashonly = threading.Thread(target=hash_only, args=(revlist.stdout, objcheck.stdin))
+        hashonly.start()
         numblobs = 0; numlarge = 1
         # Build dict with the sizes of all large blobs
-        for line in pblobcheck.stdout:
+        for line in objcheck.stdout:
             objhash, blob, size = line.split()
+            if blob != 'blob':
+                continue
             size = int(size)
             numblobs += 1
             if size > threshsize:
                 numlarge += 1
                 yield objhash, size
         revlist.wait()
-        pblobcheck.wait()
-        keepblobs.join()
+        objcheck.wait()
+        hashonly.join()
         time1 = time.time()
         self.verbose('%d of %d blobs are >= %d bytes [elapsed %.3fs]' % (numlarge, numblobs, threshsize, time1-time0))
     def cmd_find(self, args):
```