diff --git a/src/auto_archiver/databases/gsheet_db.py b/src/auto_archiver/databases/gsheet_db.py index b60c53d..e76dc5a 100644 --- a/src/auto_archiver/databases/gsheet_db.py +++ b/src/auto_archiver/databases/gsheet_db.py @@ -64,6 +64,7 @@ class GsheetsDb(Database): batch_if_valid('title', item.get_title()) batch_if_valid('text', item.get("content", "")) batch_if_valid('timestamp', item.get_timestamp()) + batch_if_valid('hash', media.get("hash", "not-calculated")) if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"): batch_if_valid('screenshot', "\n".join(screenshot.urls)) diff --git a/src/auto_archiver/enrichers/hash_enricher.py b/src/auto_archiver/enrichers/hash_enricher.py index 29e6a53..7bf8f89 100644 --- a/src/auto_archiver/enrichers/hash_enricher.py +++ b/src/auto_archiver/enrichers/hash_enricher.py @@ -2,7 +2,7 @@ import hashlib from loguru import logger from . import Enricher -from ..core import Metadata +from ..core import Metadata, ArchivingContext class HashEnricher(Enricher): @@ -17,6 +17,7 @@ class HashEnricher(Enricher): algo_choices = self.configs()["algorithm"]["choices"] assert self.algorithm in algo_choices, f"Invalid hash algorithm selected, must be one of {algo_choices} (you selected {self.algorithm})." self.chunksize = int(self.chunksize) + ArchivingContext.set("hash_enricher.algorithm", self.algorithm, keep_on_reset=True) @staticmethod def configs() -> dict: diff --git a/src/auto_archiver/formatters/html_formatter.py b/src/auto_archiver/formatters/html_formatter.py index f30156a..2d859f3 100644 --- a/src/auto_archiver/formatters/html_formatter.py +++ b/src/auto_archiver/formatters/html_formatter.py @@ -8,6 +8,7 @@ from loguru import logger from ..version import __version__ from ..core import Metadata, Media, ArchivingContext from . import Formatter +from ..enrichers import HashEnricher @dataclass @@ -46,11 +47,16 @@ class HtmlFormatter(Formatter): html_path = os.path.join(ArchivingContext.get_tmp_dir(), f"formatted{str(uuid.uuid4())}.html") with open(html_path, mode="w", encoding="utf-8") as outf: outf.write(content) - return Media(filename=html_path) + final_media = Media(filename=html_path) + + he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": 1.6e7}}) + if len(hd := he.calculate_hash(final_media.filename)): + final_media.set("hash", f"{he.algorithm}:{hd}") + + return final_media # JINJA helper filters - class JinjaHelpers: @staticmethod def is_list(v) -> bool: diff --git a/src/auto_archiver/storages/storage.py b/src/auto_archiver/storages/storage.py index ce99042..6e19745 100644 --- a/src/auto_archiver/storages/storage.py +++ b/src/auto_archiver/storages/storage.py @@ -77,7 +77,7 @@ class Storage(Step): # filename_generator logic if self.filename_generator == "random": filename = str(uuid.uuid4())[:16] elif self.filename_generator == "static": - he = HashEnricher({"hash_enricher": {"algorithm": "SHA-256", "chunksize": 1.6e7}}) + he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": 1.6e7}}) hd = he.calculate_hash(media.filename) filename = hd[:24]