auto-archiver/src/auto_archiver/enrichers/hash_enricher.py

import hashlib
from loguru import logger

from . import Enricher
from ..core import Metadata, ArchivingContext


class HashEnricher(Enricher):
    """
    Calculates hashes for Media instances.

    Two settings drive the enricher, both described by :meth:`configs`:

    * ``algorithm`` -- name of the digest to compute; must be one of the
      declared ``choices``.
    * ``chunksize`` -- number of bytes read from the file per iteration;
      ``-1`` reads the whole file in a single call.

    Each setting is resolved in this order: a truthy attribute already present
    on the instance after ``super().__init__`` (presumably populated by the
    base class -- verify against ``Enricher``), then a truthy entry in the
    ``config`` dict, then the default from :meth:`configs`.
    """
    name = "hash_enricher"

    def __init__(self, config: dict) -> None:
        """
        Resolve and validate ``algorithm`` and ``chunksize``.

        Raises:
            AssertionError: if the algorithm is not one of the declared
                choices, or if the chunk size is smaller than -1.
        """
        # without this STEP.__init__ is not called
        super().__init__(config)
        defaults = self.configs()
        algos = defaults["algorithm"]
        algo_choices = algos["choices"]
        default_chunksize = defaults["chunksize"]["default"]

        if not getattr(self, 'algorithm', None):
            if config.get('algorithm'):
                self.algorithm = config["algorithm"]
            else:
                logger.warning(f"No hash algorithm selected, defaulting to {algos['default']}")
                self.algorithm = algos["default"]

        # Raised explicitly (rather than via `assert`) so the check still runs
        # under `python -O`; the exception type is unchanged for callers.
        if self.algorithm not in algo_choices:
            raise AssertionError(f"Invalid hash algorithm selected, must be one of {algo_choices} (you selected {self.algorithm}).")

        if not getattr(self, 'chunksize', None):
            self.chunksize = config.get('chunksize') or default_chunksize

        self.chunksize = int(self.chunksize)
        if self.chunksize == 0:
            # A value such as the string "0" is truthy, so it survives the
            # fallbacks above, but `read(0)` returns b"" immediately and every
            # file would get the digest of empty input. Treat it like the
            # integer 0, which already falls back to the default.
            logger.warning(f"chunksize of 0 would hash no data, defaulting to {default_chunksize}")
            self.chunksize = default_chunksize
        if self.chunksize < -1:
            raise AssertionError("read length must be non-negative or -1")

        ArchivingContext.set("hash_enricher.algorithm", self.algorithm, keep_on_reset=True)

    @staticmethod
    def configs() -> dict:
        """Return the supported settings with their defaults, help text and choices."""
        return {
            "algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]},
            "chunksize": {"default": int(1.6e7), "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"},
        }

    def enrich(self, to_enrich: Metadata) -> None:
        """
        Store a ``"<algorithm>:<hexdigest>"`` value under the ``"hash"`` key of
        every media item in ``to_enrich``.

        Media for which :meth:`calculate_hash` returns an empty string are
        left untouched.
        """
        url = to_enrich.get_url()
        logger.debug(f"calculating media hashes for {url=} (using {self.algorithm})")

        for media in to_enrich.media:
            if digest := self.calculate_hash(media.filename):
                media.set("hash", f"{self.algorithm}:{digest}")

    def calculate_hash(self, filename) -> str:
        """
        Return the hex digest of the file at ``filename`` using the configured
        algorithm, reading ``self.chunksize`` bytes at a time.

        Returns an empty string when ``self.algorithm`` is not a supported
        algorithm name. Errors raised by ``open``/``read`` propagate to the
        caller.
        """
        factories = {
            "SHA-256": hashlib.sha256,
            "SHA3-512": hashlib.sha3_512,
        }
        factory = factories.get(self.algorithm)
        if factory is None:
            return ""

        hasher = factory()
        with open(filename, "rb") as f:
            # An empty read signals end of file.
            while chunk := f.read(self.chunksize):
                hasher.update(chunk)
        return hasher.hexdigest()