diff --git a/src/auto_archiver/core/extractor.py b/src/auto_archiver/core/extractor.py index 98f1370..57320df 100644 --- a/src/auto_archiver/core/extractor.py +++ b/src/auto_archiver/core/extractor.py @@ -95,5 +95,11 @@ class Extractor(BaseModule): logger.warning(f"Failed to fetch the Media URL: {e}") @abstractmethod - def download(self, item: Metadata) -> Metadata: + def download(self, item: Metadata) -> Metadata | False: + """ + Downloads the media from the given URL and returns a Metadata object with the downloaded media. + + If the URL is not supported or the download fails, this method should return False. + + """ pass \ No newline at end of file diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 8a634de..5ac091c 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -69,6 +69,13 @@ class UniqueAppendAction(argparse.Action): getattr(namespace, self.dest).append(value) class ArchivingOrchestrator: + + feeders: List[Type[Feeder]] + extractors: List[Type[Extractor]] + enrichers: List[Type[Enricher]] + databases: List[Type[Database]] + storages: List[Type[Storage]] + formatters: List[Type[Formatter]] def setup_basic_parser(self): parser = argparse.ArgumentParser( @@ -296,11 +303,7 @@ class ArchivingOrchestrator: step_items.append(loaded_module) check_steps_ok() - self.config['steps'][f"{module_type}s"] = step_items - - - assert len(step_items) > 0, f"No {module_type}s were loaded. Please check your configuration file and try again." - self.config['steps'][f"{module_type}s"] = step_items + setattr(self, f"{module_type}s", step_items) def load_config(self, config_file: str) -> dict: if not os.path.exists(config_file) and config_file != DEFAULT_CONFIG_FILE: @@ -331,9 +334,9 @@ class ArchivingOrchestrator: # log out the modules that were loaded for module_type in BaseModule.MODULE_TYPES: - logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in self.config['steps'][f"{module_type}s"])) + logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s"))) - for item in self.feed(): + for _ in self.feed(): pass def cleanup(self)->None: @@ -484,40 +487,7 @@ class ArchivingOrchestrator: # Helper Properties - - @property - def feeders(self) -> List[Type[Feeder]]: - return self._get_property('feeders') - - @property - def extractors(self) -> List[Type[Extractor]]: - return self._get_property('extractors') - - @property - def enrichers(self) -> List[Type[Enricher]]: - return self._get_property('enrichers') - - @property - def databases(self) -> List[Type[Database]]: - return self._get_property('databases') - - @property - def storages(self) -> List[Type[Storage]]: - return self._get_property('storages') - - @property - def formatters(self) -> List[Type[Formatter]]: - return self._get_property('formatters') @property def all_modules(self) -> List[Type[BaseModule]]: - return self.feeders + self.extractors + self.enrichers + self.databases + self.storages + self.formatters - - def _get_property(self, prop): - try: - f = self.config['steps'][prop] - if not (isinstance(f[0], BaseModule) or isinstance(f[0], LazyBaseModule)): - raise TypeError - return f - except: - exit("Property called prior to full initialisation") \ No newline at end of file + return self.feeders + self.extractors + self.enrichers + self.databases + self.storages + self.formatters \ No newline at end of file diff --git a/src/auto_archiver/enrichers/screenshot_enricher.py b/src/auto_archiver/enrichers/screenshot_enricher.py deleted file mode 100644 index abb1e16..0000000 --- a/src/auto_archiver/enrichers/screenshot_enricher.py +++ /dev/null @@ -1,40 +0,0 @@ -from loguru import logger -import time, os -from selenium.common.exceptions import TimeoutException - - -from auto_archiver.core import Enricher -from ..utils import Webdriver, url as UrlUtil, random_str -from ..core import Media, Metadata - -class ScreenshotEnricher(Enricher): - name = "screenshot_enricher" - - @staticmethod - def configs() -> dict: - return { - "width": {"default": 1280, "help": "width of the screenshots"}, - "height": {"default": 720, "help": "height of the screenshots"}, - "timeout": {"default": 60, "help": "timeout for taking the screenshot"}, - "sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"}, - "http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"}, - } - - def enrich(self, to_enrich: Metadata) -> None: - url = to_enrich.get_url() - if UrlUtil.is_auth_wall(url): - logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}") - return - - logger.debug(f"Enriching screenshot for {url=}") - with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url, http_proxy=self.http_proxy) as driver: - try: - driver.get(url) - time.sleep(int(self.sleep_before_screenshot)) - screenshot_file = os.path.join(self.tmp_dir, f"screenshot_{random_str(8)}.png") - driver.save_screenshot(screenshot_file) - to_enrich.add_media(Media(filename=screenshot_file), id="screenshot") - except TimeoutException: - logger.info("TimeoutException loading page for screenshot") - except Exception as e: - logger.error(f"Got error while loading webdriver for screenshot enricher: {e}") diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py index 68417aa..5ba57d0 100644 --- a/tests/test_orchestrator.py +++ b/tests/test_orchestrator.py @@ -89,7 +89,6 @@ def test_add_custom_modules_path_invalid(orchestrator, caplog, test_args): orchestrator.run(test_args + # we still need to load the real path to get the example_module ["--module_paths", "tests/data/invalid_test_modules/"]) - # assert False assert caplog.records[0].message == "Path 'tests/data/invalid_test_modules/' does not exist. Skipping..."