mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-08 03:18:28 +03:00
Merge branch 'load_modules' into add_module_tests
This commit is contained in:
@@ -95,5 +95,11 @@ class Extractor(BaseModule):
|
||||
logger.warning(f"Failed to fetch the Media URL: {e}")
|
||||
|
||||
@abstractmethod
def download(self, item: Metadata) -> Metadata | bool:
    """
    Downloads the media from the given URL and returns a Metadata object with the downloaded media.

    If the URL is not supported or the download fails, this method should return False.

    NOTE: the failure value is annotated as `bool` because the literal `False` is a
    value, not a type: `Metadata | False` raises TypeError when the annotation is
    evaluated eagerly and is rejected by static type checkers. Implementations are
    still expected to return either a Metadata object or False (never True).
    """
    pass
|
||||
@@ -69,6 +69,13 @@ class UniqueAppendAction(argparse.Action):
|
||||
getattr(namespace, self.dest).append(value)
|
||||
|
||||
class ArchivingOrchestrator:
|
||||
|
||||
feeders: List[Type[Feeder]]
|
||||
extractors: List[Type[Extractor]]
|
||||
enrichers: List[Type[Enricher]]
|
||||
databases: List[Type[Database]]
|
||||
storages: List[Type[Storage]]
|
||||
formatters: List[Type[Formatter]]
|
||||
|
||||
def setup_basic_parser(self):
|
||||
parser = argparse.ArgumentParser(
|
||||
@@ -296,11 +303,7 @@ class ArchivingOrchestrator:
|
||||
step_items.append(loaded_module)
|
||||
|
||||
check_steps_ok()
|
||||
self.config['steps'][f"{module_type}s"] = step_items
|
||||
|
||||
|
||||
assert len(step_items) > 0, f"No {module_type}s were loaded. Please check your configuration file and try again."
|
||||
self.config['steps'][f"{module_type}s"] = step_items
|
||||
setattr(self, f"{module_type}s", step_items)
|
||||
|
||||
def load_config(self, config_file: str) -> dict:
|
||||
if not os.path.exists(config_file) and config_file != DEFAULT_CONFIG_FILE:
|
||||
@@ -331,9 +334,9 @@ class ArchivingOrchestrator:
|
||||
|
||||
# log out the modules that were loaded
|
||||
for module_type in BaseModule.MODULE_TYPES:
|
||||
logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in self.config['steps'][f"{module_type}s"]))
|
||||
logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s")))
|
||||
|
||||
for item in self.feed():
|
||||
for _ in self.feed():
|
||||
pass
|
||||
|
||||
def cleanup(self)->None:
|
||||
@@ -484,40 +487,7 @@ class ArchivingOrchestrator:
|
||||
|
||||
|
||||
# Helper Properties
|
||||
|
||||
@property
|
||||
def feeders(self) -> List[Type[Feeder]]:
|
||||
return self._get_property('feeders')
|
||||
|
||||
@property
|
||||
def extractors(self) -> List[Type[Extractor]]:
|
||||
return self._get_property('extractors')
|
||||
|
||||
@property
|
||||
def enrichers(self) -> List[Type[Enricher]]:
|
||||
return self._get_property('enrichers')
|
||||
|
||||
@property
|
||||
def databases(self) -> List[Type[Database]]:
|
||||
return self._get_property('databases')
|
||||
|
||||
@property
|
||||
def storages(self) -> List[Type[Storage]]:
|
||||
return self._get_property('storages')
|
||||
|
||||
@property
|
||||
def formatters(self) -> List[Type[Formatter]]:
|
||||
return self._get_property('formatters')
|
||||
|
||||
@property
|
||||
def all_modules(self) -> List[Type[BaseModule]]:
|
||||
return self.feeders + self.extractors + self.enrichers + self.databases + self.storages + self.formatters
|
||||
|
||||
def _get_property(self, prop):
|
||||
try:
|
||||
f = self.config['steps'][prop]
|
||||
if not (isinstance(f[0], BaseModule) or isinstance(f[0], LazyBaseModule)):
|
||||
raise TypeError
|
||||
return f
|
||||
except:
|
||||
exit("Property called prior to full initialisation")
|
||||
return self.feeders + self.extractors + self.enrichers + self.databases + self.storages + self.formatters
|
||||
@@ -1,40 +0,0 @@
|
||||
from loguru import logger
|
||||
import time, os
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
|
||||
|
||||
from auto_archiver.core import Enricher
|
||||
from ..utils import Webdriver, url as UrlUtil, random_str
|
||||
from ..core import Media, Metadata
|
||||
|
||||
class ScreenshotEnricher(Enricher):
    """Enricher that saves a PNG screenshot of the item's URL and attaches it as media."""

    name = "screenshot_enricher"

    @staticmethod
    def configs() -> dict:
        """Return the configuration options of this enricher, each with a default and a help text."""

        def option(default, help_text):
            # Every entry has the same two-key shape.
            return {"default": default, "help": help_text}

        return {
            "width": option(1280, "width of the screenshots"),
            "height": option(720, "height of the screenshots"),
            "timeout": option(60, "timeout for taking the screenshot"),
            "sleep_before_screenshot": option(4, "seconds to wait for the pages to load before taking screenshot"),
            "http_proxy": option("", "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"),
        }

    def enrich(self, to_enrich: Metadata) -> None:
        """
        Take a screenshot of the URL of `to_enrich` and add it as media with id "screenshot".

        URLs reported as auth walls by `UrlUtil.is_auth_wall` are skipped. Errors raised
        while loading the page or saving the screenshot are logged, not re-raised.
        """
        url = to_enrich.get_url()
        if UrlUtil.is_auth_wall(url):
            logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}")
            return

        logger.debug(f"Enriching screenshot for {url=}")
        # NOTE(review): the 4th positional argument is a flag for facebook URLs;
        # its exact meaning is defined by Webdriver - confirm there.
        browser_session = Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url, http_proxy=self.http_proxy)
        with browser_session as browser:
            try:
                browser.get(url)
                # Fixed pause so the page can finish loading before the capture.
                time.sleep(int(self.sleep_before_screenshot))
                target_dir = self.tmp_dir
                shot_path = os.path.join(target_dir, f"screenshot_{random_str(8)}.png")
                browser.save_screenshot(shot_path)
                to_enrich.add_media(Media(filename=shot_path), id="screenshot")
            except TimeoutException:
                logger.info("TimeoutException loading page for screenshot")
            except Exception as err:
                logger.error(f"Got error while loading webdriver for screenshot enricher: {err}")
|
||||
@@ -89,7 +89,6 @@ def test_add_custom_modules_path_invalid(orchestrator, caplog, test_args):
|
||||
orchestrator.run(test_args + # we still need to load the real path to get the example_module
|
||||
["--module_paths", "tests/data/invalid_test_modules/"])
|
||||
|
||||
# assert False
|
||||
assert caplog.records[0].message == "Path 'tests/data/invalid_test_modules/' does not exist. Skipping..."
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user