def normalize_path(p):
    """Normalize *p* by delegating to the OS X path normalizer."""
    normalized = normalize_osx_path(p)
    return normalized
def md5hash_size(fp, chunk_size=1024*10):
    """Hash a file-like object incrementally and count its bytes.

    Reads *fp* to exhaustion in chunks of *chunk_size* bytes so the whole
    stream never has to live in memory at once.

    Returns a ``(digest, size)`` tuple: the raw MD5 digest of the stream's
    contents and the total number of bytes read.
    """
    hasher = hashlib.md5()
    total = 0
    chunk = fp.read(chunk_size)
    while chunk:
        hasher.update(chunk)
        total += len(chunk)
        chunk = fp.read(chunk_size)
    return hasher.digest(), total
def find_dupes(source, allowed_dupes, bail=True):
allowed_dupes = set(allowed_dupes)
md5s = OrderedDict()
for p, f in UnpackFinder(source):
- content = f.open().read()
- m = hashlib.md5(content).digest()
+ m, content_size = md5hash_size(f.open())
if m not in md5s:
if isinstance(f, DeflatedFile):
compressed = f.file.compressed_size
else:
- compressed = len(content)
- md5s[m] = (len(content), compressed, [])
+ compressed = content_size
+ md5s[m] = (content_size, compressed, [])
md5s[m][2].append(p)
total = 0
total_compressed = 0