Bump version to v0.4.5 for release

Merge pull request #72 from milesmcc/patch-1
Fix hash enricher for flatfile output (closes #71)
2026-06-12 13:18:28 +03:00 · 2023-03-16 15:05:42 +00:00 · 2023-03-16 15:04:55 +00:00 · 2023-03-14 13:37:54 -07:00 · 2023-03-10 11:34:29 +00:00 · 2023-02-27 10:30:06 +01:00
4 changed files with 31 additions and 21 deletions
--- a/src/auto_archiver/enrichers/hash_enricher.py
+++ b/src/auto_archiver/enrichers/hash_enricher.py
@@ -16,11 +16,13 @@ class HashEnricher(Enricher):
        super().__init__(config)
        algo_choices = self.configs()["algorithm"]["choices"]
        assert self.algorithm in algo_choices, f"Invalid hash algorithm selected, must be one of {algo_choices} (you selected {self.algorithm})."
        self.chunksize = int(self.chunksize)
    @staticmethod
    def configs() -> dict:
        return {
-            "algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]}
+            "algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]},
            "chunksize": {"default": 1.6e7, "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"},
        }
    def enrich(self, to_enrich: Metadata) -> None:
@@ -28,12 +30,19 @@ class HashEnricher(Enricher):
        logger.debug(f"calculating media hashes for {url=} (using {self.algorithm})")
        for i, m in enumerate(to_enrich.media):
-            with open(m.filename, "rb") as f:
+            if len(hd := self.calculate_hash(m.filename)):
-                bytes = f.read()  # read entire file as bytes
+                to_enrich.media[i].set("hash", f"{self.algorithm}:{hd}")
-                hash = None
+
-                if self.algorithm == "SHA-256":
+    def calculate_hash(self, filename):
-                    hash = hashlib.sha256(bytes)
+        hash = None
-                elif self.algorithm == "SHA3-512":
+        if self.algorithm == "SHA-256":
-                    hash = hashlib.sha3_512(bytes)
+            hash = hashlib.sha256()
-                else: continue
+        elif self.algorithm == "SHA3-512":
-                to_enrich.media[i].set("hash", f"{self.algorithm}:{hash.hexdigest()}")
+            hash = hashlib.sha3_512()
        else: return ""
        with open(filename, "rb") as f:
            while True:
                buf = f.read(self.chunksize)
                if not buf: break
                hash.update(buf)
        return hash.hexdigest()
--- a/src/auto_archiver/enrichers/screenshot_enricher.py
+++ b/src/auto_archiver/enrichers/screenshot_enricher.py
@@ -14,7 +14,8 @@ class ScreenshotEnricher(Enricher):
        return {
            "width": {"default": 1280, "help": "width of the screenshots"},
            "height": {"default": 720, "help": "height of the screenshots"},
-            "timeout": {"default": 60, "help": "timeout for taking the screenshot"}
+            "timeout": {"default": 60, "help": "timeout for taking the screenshot"},
            "sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"}
        }
    def enrich(self, to_enrich: Metadata) -> None:
@@ -27,7 +28,7 @@ class ScreenshotEnricher(Enricher):
        with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url) as driver:
            try:
                driver.get(url)
-                time.sleep(2)
+                time.sleep(int(self.sleep_before_screenshot))
                screenshot_file = os.path.join(to_enrich.get_tmp_dir(), f"screenshot_{str(uuid.uuid4())[0:8]}.png")
                driver.save_screenshot(screenshot_file)
                to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")
@@ -35,4 +36,3 @@ class ScreenshotEnricher(Enricher):
                logger.info("TimeoutException loading page for screenshot")
            except Exception as e:
                logger.error(f"Got error while loading webdriver for screenshot enricher: {e}")
        # return None
--- a/src/auto_archiver/storages/storage.py
+++ b/src/auto_archiver/storages/storage.py
@@ -5,6 +5,7 @@ import hashlib
 from typing import IO, Any
 from ..core import Media, Metadata, Step
 from ..enrichers import HashEnricher
 from loguru import logger
 import os, uuid
 from slugify import slugify
@@ -64,18 +65,18 @@ class Storage(Step):
        filename, ext = os.path.splitext(media.filename)
        # path_generator logic
-        if self.path_generator == "flat": 
+        if self.path_generator == "flat":
            path = ""
-            filename = slugify(filename) # in case it comes with os.sep
+            filename = slugify(filename)  # in case it comes with os.sep
        elif self.path_generator == "url": path = slugify(item.get_url())
        elif self.path_generator == "random":
            path = item.get("random_path", str(uuid.uuid4())[:16], True)
        # filename_generator logic
        if self.filename_generator == "random": filename = str(uuid.uuid4())[:16]
-        elif self.filename_generator == "static": 
+        elif self.filename_generator == "static":
-            with open(media.filename, "rb") as f:
+            he = HashEnricher({"hash_enricher": {"algorithm": "SHA-256", "chunksize": 1.6e7}})
-                bytes = f.read()  # read entire file as bytes
+            hd = he.calculate_hash(media.filename)
-            filename = hashlib.sha256(bytes).hexdigest()[:24]
+            filename = hd[:24]
-        media.key = os.path.join(folder, path, f"{filename}{ext}")
+        media.key = os.path.join(folder, path, f"{filename}{ext}")
--- a/src/auto_archiver/version.py
+++ b/src/auto_archiver/version.py
@@ -3,7 +3,7 @@ _MAJOR = "0"
 _MINOR = "4"
 # On main and in a nightly release the patch should be one ahead of the last
 # released build.
-_PATCH = "2"
+_PATCH = "5"
 # This is mainly for nightly builds which have the suffix ".dev$DATE". See
 # https://semver.org/#is-v123-a-semantic-version for the semantics.
 _SUFFIX = ""
Author	SHA1	Message	Date
msramalho	39818e648a	Bump version to v0.4.5 for release	2023-03-16 15:05:42 +00:00
Miguel Sozinho Ramalho	2bbf534d67	Merge pull request #72 from milesmcc/patch-1 Fix hash enricher for flatfile output (closes #71)	2023-03-16 15:04:55 +00:00
R. Miles McCain	6be7536fad	Fix hash enricher for flatfile output (closes #71)	2023-03-14 13:37:54 -07:00
msramalho	0654e8c5c6	hash calculation in chunks to avoid exhausting RAM	2023-03-10 11:34:29 +00:00
msramalho	0e3c427371	Bump version to v0.4.3 for release	2023-02-27 10:30:06 +01:00