author     willmcgugan <willmcgugan@67cdc799-7952-0410-af00-57a81ceafa0f>  2009-11-22 23:46:06 +0000
committer  willmcgugan <willmcgugan@67cdc799-7952-0410-af00-57a81ceafa0f>  2009-11-22 23:46:06 +0000
commit     195cda7f4c6fac4668c23bbf29998d3b5ec9fca2 (patch)
tree       dbc254781b05ecf588e7491f8d1b543fb35c092e /fs/utils.py
parent     7425f0470a3d0f7304f1ac4d89afdcb92da1c262 (diff)
Experimental function to find duplicate files -- needs testing
git-svn-id: http://pyfilesystem.googlecode.com/svn/trunk@278 67cdc799-7952-0410-af00-57a81ceafa0f
Diffstat (limited to 'fs/utils.py')
-rw-r--r--  fs/utils.py | 95
1 file changed, 95 insertions, 0 deletions
diff --git a/fs/utils.py b/fs/utils.py
index 073c656..e53d1be 100644
--- a/fs/utils.py
+++ b/fs/utils.py
@@ -140,3 +140,98 @@ def countbytes(fs):
     total = sum(fs.getsize(f) for f in fs.walkfiles())
     return total
+# Work in progress, not tested
+def find_duplicates(fs, paths=None, quick=False, signature_size=16384):
+    """A generator that yields lists of paths of duplicate files in an FS object.
+    Files are considered identical if their contents are the same (dates and
+    other attributes are not taken into account).
+
+    fs -- A filesystem object
+    paths -- An iterable of paths in the FS object, or all files if omitted
+    quick -- If set to True, the quick method of finding duplicates will be used,
+    which trusts the file signatures and can potentially report false positives
+    (distinct files whose signatures happen to collide)
+    signature_size -- The chunk size in bytes used to generate file signatures;
+    lower values will decrease the likelihood of false positives when used with
+    quick=True
+
+    """
+
+ from collections import defaultdict
+    from zlib import crc32
+
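+    # If no paths are given, search the entire filesystem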
+ if paths is None:
+ paths = fs.walkfiles()
+
+ paths = list(paths)
+
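+    # First pass: bucket the paths by file size -- files of different
+    # sizes cannot be identical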
+ file_sizes = defaultdict(list)
+ for path in paths:
+ file_sizes[fs.getsize(path)].append(path)
+
+    # Only sizes shared by more than one file can contain duplicates
+    size_duplicates = [paths for paths in file_sizes.itervalues() if len(paths) > 1]
+
+ signatures = defaultdict(list)
+
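+    # Second pass: within each same-size group, build a signature for each
+    # file from the CRC32 of every signature_size-byte chunk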
+ for paths in size_duplicates:
+ for path in paths:
+ signature = []
+ fread = None
+ try:
+ fread = fs.open(path, 'rb')
+ while True:
+ data = fread.read(signature_size)
+ if not data:
+ break
+ signature.append(crc32(data))
+ finally:
+ if fread is not None:
+ fread.close()
+ signatures[tuple(signature)].append(path)
+
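+    # In quick mode, trust the signatures -- matching groups are yielded
+    # without a byte-for-byte check, so CRC32 collisions may produce
+    # false positives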
+ if quick:
+        for paths in signatures.itervalues():
+ if len(paths) > 1:
+ yield paths
+ return
+
+    def identical(p1, p2):
+        """Compare two files in the FS byte for byte."""
+ f1, f2 = None, None
+ try:
+ f1 = fs.open(p1, 'rb')
+ f2 = fs.open(p2, 'rb')
+            while True:
+                chunk1 = f1.read(16384)
+                chunk2 = f2.read(16384)
+                if chunk1 != chunk2:
+                    return False
+                if not chunk1:
+                    return True
+ finally:
+ if f1 is not None:
+ f1.close()
+ if f2 is not None:
+ f2.close()
+
+    # Full check: within each signature group, compare files byte for byte
+    # and yield each confirmed group of duplicates
+    for paths in signatures.itervalues():
+        while len(paths) > 1:
+            test_p = paths.pop()
+            dups = [test_p]
+            for path in paths:
+                if identical(test_p, path):
+                    dups.append(path)
+            if len(dups) > 1:
+                yield dups
+            paths = list(set(paths).difference(dups))
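
The function is not exercised anywhere in the patch; the following is a minimal usage sketch, not part of the commit, assuming the OSFS class from fs.osfs and a hypothetical '/path/to/photos' directory. find_duplicates is a generator, so the duplicate groups are consumed in a loop:

    from fs.osfs import OSFS
    from fs.utils import find_duplicates

    photo_fs = OSFS('/path/to/photos')  # hypothetical location

    # Exact mode: candidate groups are verified byte for byte
    for dups in find_duplicates(photo_fs):
        print 'Duplicates:', ', '.join(dups)

    # Quick mode trusts the CRC32 signatures, so it may report false
    # positives; smaller signature_size chunks make that less likely
    for dups in find_duplicates(photo_fs, quick=True, signature_size=4096):
        print 'Possible duplicates:', ', '.join(dups)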