From: Rob Lemley
Date: Thu, 21 Feb 2019 20:14:17 +0000 (-0500)
Subject: Bug 1526744 - find-dupes.py: Calculate md5 by chunk.
X-Git-Tag: archive/raspbian/1%60.8.0-1_deb10u1+rpi1^2^2~2
X-Git-Url: https://dgit.raspbian.org/?a=commitdiff_plain;h=5bc5cb3fec8c54eba07615e46d11f6050babd603;p=thunderbird.git

Bug 1526744 - find-dupes.py: Calculate md5 by chunk.

Read the file in chunks and use md5.update() rather than reading the
entire file into RAM and calculating the hash all at once. This
prevents out-of-memory errors on build systems with low RAM.

Gbp-Pq: Topic fixes

Gbp-Pq: Name Bug-1526744-find-dupes.py-Calculate-md5-by-chunk.patch
---

diff --git a/toolkit/mozapps/installer/find-dupes.py b/toolkit/mozapps/installer/find-dupes.py
index 3935b797da..0ff7efc844 100644
--- a/toolkit/mozapps/installer/find-dupes.py
+++ b/toolkit/mozapps/installer/find-dupes.py
@@ -39,19 +39,29 @@ def is_l10n_file(path):
 def normalize_path(p):
     return normalize_osx_path(p)
 
+def md5hash_size(fp, chunk_size=1024*10):
+    md5 = hashlib.md5()
+    size = 0
+    while True:
+        data = fp.read(chunk_size)
+        if not data:
+            break
+        md5.update(data)
+        size += len(data)
+
+    return md5.digest(), size
 
 def find_dupes(source, allowed_dupes, bail=True):
     allowed_dupes = set(allowed_dupes)
     md5s = OrderedDict()
     for p, f in UnpackFinder(source):
-        content = f.open().read()
-        m = hashlib.md5(content).digest()
+        m, content_size = md5hash_size(f.open())
         if m not in md5s:
             if isinstance(f, DeflatedFile):
                 compressed = f.file.compressed_size
             else:
-                compressed = len(content)
-            md5s[m] = (len(content), compressed, [])
+                compressed = content_size
+            md5s[m] = (content_size, compressed, [])
         md5s[m][2].append(p)
     total = 0
     total_compressed = 0
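
Note for readers unfamiliar with the pattern: below is a minimal, standalone
sketch of the chunked hashing approach this patch introduces. The
md5hash_size helper mirrors the one added in the diff above; the placeholder
file path and the __main__ driver are illustrative only and are not part of
the patch.

    import hashlib

    def md5hash_size(fp, chunk_size=1024 * 10):
        # Feed the hash in fixed-size chunks so peak memory stays bounded
        # by chunk_size rather than by the size of the file being hashed.
        md5 = hashlib.md5()
        size = 0
        while True:
            data = fp.read(chunk_size)
            if not data:
                break
            md5.update(data)
            size += len(data)
        return md5.digest(), size

    if __name__ == "__main__":
        # "somefile.bin" is a placeholder path for illustration only.
        with open("somefile.bin", "rb") as fp:
            digest, size = md5hash_size(fp)
            print(digest.hex(), size)

Because md5.update() folds each chunk into the running digest, the result is
byte-for-byte identical to hashing the whole file at once, while peak memory
use stays at roughly chunk_size instead of the full file size.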