mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 13:18:28 +03:00
hash enricher and media refactor
This commit is contained in:
@@ -1,3 +1,4 @@
|
||||
from .enricher import Enricher
|
||||
from .screenshot_enricher import ScreenshotEnricher
|
||||
from .wayback_enricher import WaybackEnricher
|
||||
from .wayback_enricher import WaybackEnricher
|
||||
from .hash_enricher import HashEnricher
|
||||
41
src/enrichers/hash_enricher.py
Normal file
41
src/enrichers/hash_enricher.py
Normal file
@@ -0,0 +1,41 @@
|
||||
import hashlib
|
||||
from utils import Webdriver
|
||||
from . import Enricher
|
||||
from metadata import Metadata
|
||||
from loguru import logger
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
import time, requests
|
||||
|
||||
|
||||
class HashEnricher(Enricher):
    """
    Calculates hashes for Media instances

    For every media item of a Metadata object, the file at ``filename`` is
    hashed with the configured ``algorithm`` and the result is stored on that
    media item under the ``"hash"`` key as ``"<algorithm>:<hex digest>"``.
    """
    name = "hash_enricher"

    # Maps each supported "algorithm" config value to its hashlib constructor;
    # keys must stay in sync with the "choices" list returned by configs().
    _HASH_FACTORIES = {"SHA-256": hashlib.sha256, "SHA3-512": hashlib.sha3_512}
    # Files are hashed in pieces of this many bytes so that a large media file
    # never has to be held in memory all at once.
    _CHUNK_SIZE = 1024 * 1024

    def __init__(self, config: dict) -> None:
        """
        Initialise the enricher and validate the selected hash algorithm.

        Raises:
            AssertionError: if ``self.algorithm`` is not one of the choices
                declared in ``configs()``.
        """
        # without this STEP.__init__ is not called
        super().__init__(config)
        # NOTE(review): self.algorithm is presumably populated from `config`
        # by the parent initialiser -- confirm against the Step/Enricher base.
        algo_choices = self.configs()["algorithm"]["choices"]
        # Raised explicitly instead of using `assert` so the check still runs
        # under `python -O`; the exception type is kept as AssertionError so
        # existing callers observe the same error.
        if self.algorithm not in algo_choices:
            raise AssertionError(f"Invalid hash algorithm selected, must be one of {algo_choices} (you selected {self.algorithm}).")

    @staticmethod
    def configs() -> dict:
        """Return the configuration options understood by this enricher."""
        return {
            "algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]}
        }

    def enrich(self, to_enrich: Metadata) -> None:
        """
        Hash every media file of ``to_enrich`` and store the digest on it.

        Each entry of ``to_enrich.media`` gets its ``"hash"`` value set to
        ``"<algorithm>:<hex digest>"``. If ``self.algorithm`` is not a
        supported algorithm, nothing is hashed and no media item is modified.
        """
        url = to_enrich.get_url()
        logger.debug(f"calculating media hashes for {url=} (using {self.algorithm})")

        # Resolve the hash constructor once instead of once per media item.
        new_hasher = self._HASH_FACTORIES.get(self.algorithm)
        if new_hasher is None:
            return

        for i, m in enumerate(to_enrich.media):
            digest = new_hasher()
            with open(m.filename, "rb") as f:
                # Feed the file to the hash incrementally; the resulting digest
                # is identical to hashing the whole content in one call.
                while chunk := f.read(self._CHUNK_SIZE):
                    digest.update(chunk)
            to_enrich.media[i].set("hash", f"{self.algorithm}:{digest.hexdigest()}")
|
||||
@@ -27,7 +27,7 @@ class ScreenshotEnricher(Enricher):
|
||||
time.sleep(2)
|
||||
screenshot_file = os.path.join(to_enrich.get_tmp_dir(), f"screenshot_{str(uuid.uuid4())[0:8]}.png")
|
||||
driver.save_screenshot(screenshot_file)
|
||||
to_enrich.add_media(Media(filename=screenshot_file, id="screenshot"))
|
||||
to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")
|
||||
except TimeoutException:
|
||||
logger.info("TimeoutException loading page for screenshot")
|
||||
except Exception as e:
|
||||
|
||||
Reference in New Issue
Block a user