feat: re-enable HASH on gsheet

This commit is contained in:
msramalho
2023-05-09 11:17:44 +01:00
parent 8f3d4e05c3
commit 875e1de589
4 changed files with 12 additions and 4 deletions

View File

@@ -64,6 +64,7 @@ class GsheetsDb(Database):
batch_if_valid('title', item.get_title())
batch_if_valid('text', item.get("content", ""))
batch_if_valid('timestamp', item.get_timestamp())
batch_if_valid('hash', media.get("hash", "not-calculated"))
if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"):
batch_if_valid('screenshot', "\n".join(screenshot.urls))

View File

@@ -2,7 +2,7 @@ import hashlib
from loguru import logger
from . import Enricher
from ..core import Metadata
from ..core import Metadata, ArchivingContext
class HashEnricher(Enricher):
@@ -17,6 +17,7 @@ class HashEnricher(Enricher):
algo_choices = self.configs()["algorithm"]["choices"]
assert self.algorithm in algo_choices, f"Invalid hash algorithm selected, must be one of {algo_choices} (you selected {self.algorithm})."
self.chunksize = int(self.chunksize)
ArchivingContext.set("hash_enricher.algorithm", self.algorithm, keep_on_reset=True)
@staticmethod
def configs() -> dict:

View File

@@ -8,6 +8,7 @@ from loguru import logger
from ..version import __version__
from ..core import Metadata, Media, ArchivingContext
from . import Formatter
from ..enrichers import HashEnricher
@dataclass
@@ -46,11 +47,16 @@ class HtmlFormatter(Formatter):
html_path = os.path.join(ArchivingContext.get_tmp_dir(), f"formatted{str(uuid.uuid4())}.html")
with open(html_path, mode="w", encoding="utf-8") as outf:
outf.write(content)
return Media(filename=html_path)
final_media = Media(filename=html_path)
he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": 1.6e7}})
if len(hd := he.calculate_hash(final_media.filename)):
final_media.set("hash", f"{he.algorithm}:{hd}")
return final_media
# JINJA helper filters
class JinjaHelpers:
@staticmethod
def is_list(v) -> bool:

View File

@@ -77,7 +77,7 @@ class Storage(Step):
# filename_generator logic
if self.filename_generator == "random": filename = str(uuid.uuid4())[:16]
elif self.filename_generator == "static":
he = HashEnricher({"hash_enricher": {"algorithm": "SHA-256", "chunksize": 1.6e7}})
he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": 1.6e7}})
hd = he.calculate_hash(media.filename)
filename = hd[:24]