From 0654e8c5c673768f34c806f60889eea285e2f4b9 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Fri, 10 Mar 2023 11:34:29 +0000 Subject: [PATCH] hash calculation in chunks to avoid exhausting RAM --- src/auto_archiver/enrichers/hash_enricher.py | 29 +++++++++++++------- src/auto_archiver/storages/storage.py | 15 +++++----- src/auto_archiver/version.py | 2 +- 3 files changed, 28 insertions(+), 18 deletions(-) diff --git a/src/auto_archiver/enrichers/hash_enricher.py b/src/auto_archiver/enrichers/hash_enricher.py index 35d9ebb..29e6a53 100644 --- a/src/auto_archiver/enrichers/hash_enricher.py +++ b/src/auto_archiver/enrichers/hash_enricher.py @@ -16,11 +16,13 @@ class HashEnricher(Enricher): super().__init__(config) algo_choices = self.configs()["algorithm"]["choices"] assert self.algorithm in algo_choices, f"Invalid hash algorithm selected, must be one of {algo_choices} (you selected {self.algorithm})." + self.chunksize = int(self.chunksize) @staticmethod def configs() -> dict: return { - "algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]} + "algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]}, + "chunksize": {"default": 1.6e7, "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"}, } def enrich(self, to_enrich: Metadata) -> None: @@ -28,12 +30,19 @@ class HashEnricher(Enricher): logger.debug(f"calculating media hashes for {url=} (using {self.algorithm})") for i, m in enumerate(to_enrich.media): - with open(m.filename, "rb") as f: - bytes = f.read() # read entire file as bytes - hash = None - if self.algorithm == "SHA-256": - hash = hashlib.sha256(bytes) - elif self.algorithm == "SHA3-512": - hash = hashlib.sha3_512(bytes) - else: continue - to_enrich.media[i].set("hash", f"{self.algorithm}:{hash.hexdigest()}") + if len(hd := self.calculate_hash(m.filename)): + to_enrich.media[i].set("hash", f"{self.algorithm}:{hd}") + + def calculate_hash(self, filename): + hash = None + if self.algorithm == "SHA-256": + hash = hashlib.sha256() + elif self.algorithm == "SHA3-512": + hash = hashlib.sha3_512() + else: return "" + with open(filename, "rb") as f: + while True: + buf = f.read(self.chunksize) + if not buf: break + hash.update(buf) + return hash.hexdigest() diff --git a/src/auto_archiver/storages/storage.py b/src/auto_archiver/storages/storage.py index 1bc301b..53bd219 100644 --- a/src/auto_archiver/storages/storage.py +++ b/src/auto_archiver/storages/storage.py @@ -5,6 +5,7 @@ import hashlib from typing import IO, Any from ..core import Media, Metadata, Step +from ..enrichers import HashEnricher from loguru import logger import os, uuid from slugify import slugify @@ -64,18 +65,18 @@ class Storage(Step): filename, ext = os.path.splitext(media.filename) # path_generator logic - if self.path_generator == "flat": + if self.path_generator == "flat": path = "" - filename = slugify(filename) # in case it comes with os.sep + filename = slugify(filename) # in case it comes with os.sep elif self.path_generator == "url": path = slugify(item.get_url()) elif self.path_generator == "random": path = item.get("random_path", str(uuid.uuid4())[:16], True) # filename_generator logic if self.filename_generator == "random": filename = str(uuid.uuid4())[:16] - elif self.filename_generator == "static": - with open(media.filename, "rb") as f: - bytes = f.read() # read entire file as bytes - filename = hashlib.sha256(bytes).hexdigest()[:24] + elif self.filename_generator == "static": + he = HashEnricher({"algorithm": "SHA-256", "chunksize": 1.6e7}) + hd = he.calculate_hash(media.filename) + filename = hd[:24] - media.key = os.path.join(folder, path, f"{filename}{ext}") \ No newline at end of file + media.key = os.path.join(folder, path, f"{filename}{ext}") diff --git a/src/auto_archiver/version.py b/src/auto_archiver/version.py index 6439298..ed0d900 100644 --- a/src/auto_archiver/version.py +++ b/src/auto_archiver/version.py @@ -3,7 +3,7 @@ _MAJOR = "0" _MINOR = "4" # On main and in a nightly release the patch should be one ahead of the last # released build. -_PATCH = "3" +_PATCH = "4" # This is mainly for nightly builds which have the suffix ".dev$DATE". See # https://semver.org/#is-v123-a-semantic-version for the semantics. _SUFFIX = ""