diff --git a/plugin.zip b/plugin.zip deleted file mode 100644 index ecc06ab..0000000 Binary files a/plugin.zip and /dev/null differ diff --git a/poetry.lock b/poetry.lock index 52dc143..8adf6b9 100644 --- a/poetry.lock +++ b/poetry.lock @@ -4154,4 +4154,4 @@ test = ["pytest (>=8.1,<9.0)", "pytest-rerunfailures (>=14.0,<15.0)"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<3.13" -content-hash = "f142f1b7907266898024fbab926401f52bc4b281aef5f52e96382ce21afca1d1" +content-hash = "1ab1e4c9b8beb51116052c1e8d180616a0938757f173f05b7355e279902d3350" diff --git a/pyproject.toml b/pyproject.toml index 3fcbc9b..466d090 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,7 +27,6 @@ dependencies = [ "bs4 (>=0.0.0)", "loguru (>=0.0.0)", "ffmpeg-python (>=0.0.0)", - "selenium (>=0.0.0)", "telethon (>=0.0.0)", "google-api-python-client (>=0.0.0)", "google-auth-httplib2 (>=0.0.0)", diff --git a/src/auto_archiver/modules/screenshot_enricher/__init__.py b/src/auto_archiver/modules/screenshot_enricher/__init__.py deleted file mode 100644 index 393f726..0000000 --- a/src/auto_archiver/modules/screenshot_enricher/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .screenshot_enricher import ScreenshotEnricher diff --git a/src/auto_archiver/modules/screenshot_enricher/__manifest__.py b/src/auto_archiver/modules/screenshot_enricher/__manifest__.py deleted file mode 100644 index db04e6c..0000000 --- a/src/auto_archiver/modules/screenshot_enricher/__manifest__.py +++ /dev/null @@ -1,44 +0,0 @@ -{ - "name": "Screenshot Enricher", - "type": ["enricher"], - "requires_setup": True, - "dependencies": { - "python": ["loguru", "selenium"], - }, - "configs": { - "width": {"default": 1280, "type": "int", "help": "width of the screenshots"}, - "height": {"default": 1024, "type": "int", "help": "height of the screenshots"}, - "timeout": {"default": 60, "type": "int", "help": "timeout for taking the screenshot"}, - "sleep_before_screenshot": { - "default": 4, - "type": "int", - "help": "seconds to wait for the pages to load before taking screenshot", - }, - "http_proxy": { - "default": "", - "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port", - }, - "save_to_pdf": { - "default": False, - "type": "bool", - "help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter", - }, - "print_options": { - "default": {}, - "help": "options to pass to the pdf printer, in JSON format. See https://www.selenium.dev/documentation/webdriver/interactions/print_page/ for more information", - "type": "json_loader", - }, - }, - "description": """ - Captures screenshots and optionally saves web pages as PDFs using a WebDriver. - - ### Features - - Takes screenshots of web pages, with configurable width, height, and timeout settings. - - Optionally saves pages as PDFs, with additional configuration for PDF printing options. - - Bypasses URLs detected as authentication walls. - - Integrates seamlessly with the metadata enrichment pipeline, adding screenshots and PDFs as media. - - ### Notes - - Requires a WebDriver (e.g., ChromeDriver) installed and accessible via the system's PATH. - """, -} diff --git a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py deleted file mode 100644 index 4e01357..0000000 --- a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py +++ /dev/null @@ -1,61 +0,0 @@ -from loguru import logger -import time -import os -import base64 - -from selenium.common.exceptions import TimeoutException - - -from auto_archiver.core import Enricher -from auto_archiver.utils import Webdriver, url as UrlUtil, random_str -from auto_archiver.core import Media, Metadata - - -class ScreenshotEnricher(Enricher): - def __init__(self, webdriver_factory=None): - super().__init__() - self.webdriver_factory = webdriver_factory or Webdriver - - def enrich(self, to_enrich: Metadata) -> None: - url = to_enrich.get_url() - - logger.debug(f"Enriching screenshot for {url=}") - auth = self.auth_for_site(url) - - # screenshot enricher only supports cookie-type auth (selenium) - has_valid_auth = auth and (auth.get("cookies") or auth.get("cookies_jar") or auth.get("cookie")) - - if UrlUtil.is_auth_wall(url) and not has_valid_auth: - logger.warning(f"[SKIP] SCREENSHOT since url is behind AUTH WALL and no login details provided: {url=}") - if any(auth.get(key) for key in ["username", "password", "api_key", "api_secret"]): - logger.warning( - f"Screenshot enricher only supports cookie-type authentication, you have provided {auth.keys()} which are not supported.\ - Consider adding 'cookie', 'cookies_file' or 'cookies_from_browser' to your auth for this site." - ) - return - - with self.webdriver_factory( - self.width, - self.height, - self.timeout, - facebook_accept_cookies="facebook.com" in url, - http_proxy=self.http_proxy, - print_options=self.print_options, - auth=auth, - ) as driver: - try: - driver.get(url) - time.sleep(int(self.sleep_before_screenshot)) - screenshot_file = os.path.join(self.tmp_dir, f"screenshot_{random_str(8)}.png") - driver.save_screenshot(screenshot_file) - to_enrich.add_media(Media(filename=screenshot_file), id="screenshot") - if self.save_to_pdf: - pdf_file = os.path.join(self.tmp_dir, f"pdf_{random_str(8)}.pdf") - pdf = driver.print_page(driver.print_options) - with open(pdf_file, "wb") as f: - f.write(base64.b64decode(pdf)) - to_enrich.add_media(Media(filename=pdf_file), id="pdf") - except TimeoutException: - logger.info("TimeoutException loading page for screenshot") - except Exception as e: - logger.error(f"Got error while loading webdriver for screenshot enricher: {e}")