Compare commits

..

1 Commit

Author SHA1 Message Date
msramalho
0654e8c5c6 hash calculation in chunks to avoid exhausting RAM 2023-03-10 11:34:29 +00:00
3 changed files with 28 additions and 18 deletions

View File

@@ -16,11 +16,13 @@ class HashEnricher(Enricher):
super().__init__(config) super().__init__(config)
algo_choices = self.configs()["algorithm"]["choices"] algo_choices = self.configs()["algorithm"]["choices"]
assert self.algorithm in algo_choices, f"Invalid hash algorithm selected, must be one of {algo_choices} (you selected {self.algorithm})." assert self.algorithm in algo_choices, f"Invalid hash algorithm selected, must be one of {algo_choices} (you selected {self.algorithm})."
self.chunksize = int(self.chunksize)
@staticmethod @staticmethod
def configs() -> dict: def configs() -> dict:
return { return {
"algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]} "algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]},
"chunksize": {"default": 1.6e7, "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"},
} }
def enrich(self, to_enrich: Metadata) -> None: def enrich(self, to_enrich: Metadata) -> None:
@@ -28,12 +30,19 @@ class HashEnricher(Enricher):
logger.debug(f"calculating media hashes for {url=} (using {self.algorithm})") logger.debug(f"calculating media hashes for {url=} (using {self.algorithm})")
for i, m in enumerate(to_enrich.media): for i, m in enumerate(to_enrich.media):
with open(m.filename, "rb") as f: if len(hd := self.calculate_hash(m.filename)):
bytes = f.read() # read entire file as bytes to_enrich.media[i].set("hash", f"{self.algorithm}:{hd}")
hash = None
if self.algorithm == "SHA-256": def calculate_hash(self, filename):
hash = hashlib.sha256(bytes) hash = None
elif self.algorithm == "SHA3-512": if self.algorithm == "SHA-256":
hash = hashlib.sha3_512(bytes) hash = hashlib.sha256()
else: continue elif self.algorithm == "SHA3-512":
to_enrich.media[i].set("hash", f"{self.algorithm}:{hash.hexdigest()}") hash = hashlib.sha3_512()
else: return ""
with open(filename, "rb") as f:
while True:
buf = f.read(self.chunksize)
if not buf: break
hash.update(buf)
return hash.hexdigest()

View File

@@ -5,6 +5,7 @@ import hashlib
from typing import IO, Any from typing import IO, Any
from ..core import Media, Metadata, Step from ..core import Media, Metadata, Step
from ..enrichers import HashEnricher
from loguru import logger from loguru import logger
import os, uuid import os, uuid
from slugify import slugify from slugify import slugify
@@ -66,7 +67,7 @@ class Storage(Step):
# path_generator logic # path_generator logic
if self.path_generator == "flat": if self.path_generator == "flat":
path = "" path = ""
filename = slugify(filename) # in case it comes with os.sep filename = slugify(filename) # in case it comes with os.sep
elif self.path_generator == "url": path = slugify(item.get_url()) elif self.path_generator == "url": path = slugify(item.get_url())
elif self.path_generator == "random": elif self.path_generator == "random":
path = item.get("random_path", str(uuid.uuid4())[:16], True) path = item.get("random_path", str(uuid.uuid4())[:16], True)
@@ -74,8 +75,8 @@ class Storage(Step):
# filename_generator logic # filename_generator logic
if self.filename_generator == "random": filename = str(uuid.uuid4())[:16] if self.filename_generator == "random": filename = str(uuid.uuid4())[:16]
elif self.filename_generator == "static": elif self.filename_generator == "static":
with open(media.filename, "rb") as f: he = HashEnricher({"algorithm": "SHA-256", "chunksize": 1.6e7})
bytes = f.read() # read entire file as bytes hd = he.calculate_hash(media.filename)
filename = hashlib.sha256(bytes).hexdigest()[:24] filename = hd[:24]
media.key = os.path.join(folder, path, f"{filename}{ext}") media.key = os.path.join(folder, path, f"{filename}{ext}")

View File

@@ -3,7 +3,7 @@ _MAJOR = "0"
_MINOR = "4" _MINOR = "4"
# On main and in a nightly release the patch should be one ahead of the last # On main and in a nightly release the patch should be one ahead of the last
# released build. # released build.
_PATCH = "3" _PATCH = "4"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See # This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics. # https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = "" _SUFFIX = ""