Bug 1526744 - find-dupes.py: Calculate md5 by chunk.

author Rob Lemley <rob@thunderbird.net>

Thu, 21 Feb 2019 20:14:17 +0000 (15:14 -0500)

committer Carsten Schoenert <c.schoenert@t-online.de>

Wed, 27 Mar 2019 17:22:51 +0000 (17:22 +0000)
author Rob Lemley <rob@thunderbird.net>
Thu, 21 Feb 2019 20:14:17 +0000 (15:14 -0500)
committer Carsten Schoenert <c.schoenert@t-online.de>
Wed, 27 Mar 2019 17:22:51 +0000 (17:22 +0000)
diff --git a/toolkit/mozapps/installer/find-dupes.py b/toolkit/mozapps/installer/find-dupes.py

index 3935b797da29d850a684cbdea8dfbdd4e51f4a6e..0ff7efc84401eef7905c226724c4ddaa83a14aaf 100644 (file)
--- a/toolkit/mozapps/installer/find-dupes.py
+++ b/toolkit/mozapps/installer/find-dupes.py
@@ -39,19 +39,29 @@ def is_l10n_file(path):
  def normalize_path(p):
      return normalize_osx_path(p)
  
+def md5hash_size(fp, chunk_size=1024*10):  # Hash fp chunk by chunk; returns (raw md5 digest, total length read).
+    md5 = hashlib.md5()  # incremental hasher, fed one chunk at a time
+    size = 0  # running total of len() over all chunks read so far
+    while True:
+        data = fp.read(chunk_size)  # chunk_size (default 10240) is passed as the read size
+        if not data:  # an empty read ends the loop (treated as end of input)
+            break
+        md5.update(data)
+        size += len(data)
+
+    return md5.digest(), size  # digest() is the binary digest, not the hex string
  
  def find_dupes(source, allowed_dupes, bail=True):
      allowed_dupes = set(allowed_dupes)
      md5s = OrderedDict()
      for p, f in UnpackFinder(source):
-        content = f.open().read()
-        m = hashlib.md5(content).digest()
+        m, content_size = md5hash_size(f.open())
          if m not in md5s:
              if isinstance(f, DeflatedFile):
                  compressed = f.file.compressed_size
              else:
-                compressed = len(content)
-            md5s[m] = (len(content), compressed, [])
+                compressed = content_size
+            md5s[m] = (content_size, compressed, [])
          md5s[m][2].append(p)
      total = 0
      total_compressed = 0
author	Rob Lemley <rob@thunderbird.net>
	Thu, 21 Feb 2019 20:14:17 +0000 (15:14 -0500)
committer	Carsten Schoenert <c.schoenert@t-online.de>
	Wed, 27 Mar 2019 17:22:51 +0000 (17:22 +0000)