mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 13:18:28 +03:00
Compare commits
5 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
39818e648a | ||
|
|
2bbf534d67 | ||
|
|
6be7536fad | ||
|
|
0654e8c5c6 | ||
|
|
0e3c427371 |
@@ -16,11 +16,13 @@ class HashEnricher(Enricher):
|
|||||||
super().__init__(config)
|
super().__init__(config)
|
||||||
algo_choices = self.configs()["algorithm"]["choices"]
|
algo_choices = self.configs()["algorithm"]["choices"]
|
||||||
assert self.algorithm in algo_choices, f"Invalid hash algorithm selected, must be one of {algo_choices} (you selected {self.algorithm})."
|
assert self.algorithm in algo_choices, f"Invalid hash algorithm selected, must be one of {algo_choices} (you selected {self.algorithm})."
|
||||||
|
self.chunksize = int(self.chunksize)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def configs() -> dict:
|
def configs() -> dict:
|
||||||
return {
|
return {
|
||||||
"algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]}
|
"algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]},
|
||||||
|
"chunksize": {"default": 1.6e7, "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"},
|
||||||
}
|
}
|
||||||
|
|
||||||
def enrich(self, to_enrich: Metadata) -> None:
|
def enrich(self, to_enrich: Metadata) -> None:
|
||||||
@@ -28,12 +30,19 @@ class HashEnricher(Enricher):
|
|||||||
logger.debug(f"calculating media hashes for {url=} (using {self.algorithm})")
|
logger.debug(f"calculating media hashes for {url=} (using {self.algorithm})")
|
||||||
|
|
||||||
for i, m in enumerate(to_enrich.media):
|
for i, m in enumerate(to_enrich.media):
|
||||||
with open(m.filename, "rb") as f:
|
if len(hd := self.calculate_hash(m.filename)):
|
||||||
bytes = f.read() # read entire file as bytes
|
to_enrich.media[i].set("hash", f"{self.algorithm}:{hd}")
|
||||||
hash = None
|
|
||||||
if self.algorithm == "SHA-256":
|
def calculate_hash(self, filename):
|
||||||
hash = hashlib.sha256(bytes)
|
hash = None
|
||||||
elif self.algorithm == "SHA3-512":
|
if self.algorithm == "SHA-256":
|
||||||
hash = hashlib.sha3_512(bytes)
|
hash = hashlib.sha256()
|
||||||
else: continue
|
elif self.algorithm == "SHA3-512":
|
||||||
to_enrich.media[i].set("hash", f"{self.algorithm}:{hash.hexdigest()}")
|
hash = hashlib.sha3_512()
|
||||||
|
else: return ""
|
||||||
|
with open(filename, "rb") as f:
|
||||||
|
while True:
|
||||||
|
buf = f.read(self.chunksize)
|
||||||
|
if not buf: break
|
||||||
|
hash.update(buf)
|
||||||
|
return hash.hexdigest()
|
||||||
|
|||||||
@@ -14,7 +14,8 @@ class ScreenshotEnricher(Enricher):
|
|||||||
return {
|
return {
|
||||||
"width": {"default": 1280, "help": "width of the screenshots"},
|
"width": {"default": 1280, "help": "width of the screenshots"},
|
||||||
"height": {"default": 720, "help": "height of the screenshots"},
|
"height": {"default": 720, "help": "height of the screenshots"},
|
||||||
"timeout": {"default": 60, "help": "timeout for taking the screenshot"}
|
"timeout": {"default": 60, "help": "timeout for taking the screenshot"},
|
||||||
|
"sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"}
|
||||||
}
|
}
|
||||||
|
|
||||||
def enrich(self, to_enrich: Metadata) -> None:
|
def enrich(self, to_enrich: Metadata) -> None:
|
||||||
@@ -27,7 +28,7 @@ class ScreenshotEnricher(Enricher):
|
|||||||
with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url) as driver:
|
with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url) as driver:
|
||||||
try:
|
try:
|
||||||
driver.get(url)
|
driver.get(url)
|
||||||
time.sleep(2)
|
time.sleep(int(self.sleep_before_screenshot))
|
||||||
screenshot_file = os.path.join(to_enrich.get_tmp_dir(), f"screenshot_{str(uuid.uuid4())[0:8]}.png")
|
screenshot_file = os.path.join(to_enrich.get_tmp_dir(), f"screenshot_{str(uuid.uuid4())[0:8]}.png")
|
||||||
driver.save_screenshot(screenshot_file)
|
driver.save_screenshot(screenshot_file)
|
||||||
to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")
|
to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")
|
||||||
@@ -35,4 +36,3 @@ class ScreenshotEnricher(Enricher):
|
|||||||
logger.info("TimeoutException loading page for screenshot")
|
logger.info("TimeoutException loading page for screenshot")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Got error while loading webdriver for screenshot enricher: {e}")
|
logger.error(f"Got error while loading webdriver for screenshot enricher: {e}")
|
||||||
# return None
|
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ import hashlib
|
|||||||
from typing import IO, Any
|
from typing import IO, Any
|
||||||
|
|
||||||
from ..core import Media, Metadata, Step
|
from ..core import Media, Metadata, Step
|
||||||
|
from ..enrichers import HashEnricher
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
import os, uuid
|
import os, uuid
|
||||||
from slugify import slugify
|
from slugify import slugify
|
||||||
@@ -64,18 +65,18 @@ class Storage(Step):
|
|||||||
filename, ext = os.path.splitext(media.filename)
|
filename, ext = os.path.splitext(media.filename)
|
||||||
|
|
||||||
# path_generator logic
|
# path_generator logic
|
||||||
if self.path_generator == "flat":
|
if self.path_generator == "flat":
|
||||||
path = ""
|
path = ""
|
||||||
filename = slugify(filename) # in case it comes with os.sep
|
filename = slugify(filename) # in case it comes with os.sep
|
||||||
elif self.path_generator == "url": path = slugify(item.get_url())
|
elif self.path_generator == "url": path = slugify(item.get_url())
|
||||||
elif self.path_generator == "random":
|
elif self.path_generator == "random":
|
||||||
path = item.get("random_path", str(uuid.uuid4())[:16], True)
|
path = item.get("random_path", str(uuid.uuid4())[:16], True)
|
||||||
|
|
||||||
# filename_generator logic
|
# filename_generator logic
|
||||||
if self.filename_generator == "random": filename = str(uuid.uuid4())[:16]
|
if self.filename_generator == "random": filename = str(uuid.uuid4())[:16]
|
||||||
elif self.filename_generator == "static":
|
elif self.filename_generator == "static":
|
||||||
with open(media.filename, "rb") as f:
|
he = HashEnricher({"hash_enricher": {"algorithm": "SHA-256", "chunksize": 1.6e7}})
|
||||||
bytes = f.read() # read entire file as bytes
|
hd = he.calculate_hash(media.filename)
|
||||||
filename = hashlib.sha256(bytes).hexdigest()[:24]
|
filename = hd[:24]
|
||||||
|
|
||||||
media.key = os.path.join(folder, path, f"{filename}{ext}")
|
media.key = os.path.join(folder, path, f"{filename}{ext}")
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ _MAJOR = "0"
|
|||||||
_MINOR = "4"
|
_MINOR = "4"
|
||||||
# On main and in a nightly release the patch should be one ahead of the last
|
# On main and in a nightly release the patch should be one ahead of the last
|
||||||
# released build.
|
# released build.
|
||||||
_PATCH = "2"
|
_PATCH = "5"
|
||||||
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
||||||
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
||||||
_SUFFIX = ""
|
_SUFFIX = ""
|
||||||
|
|||||||
Reference in New Issue
Block a user