diff --git a/example.orchestration.yaml b/example.orchestration.yaml
index a36b125..23a950b 100644
--- a/example.orchestration.yaml
+++ b/example.orchestration.yaml
@@ -97,6 +97,23 @@ configurations:
   screenshot_enricher:
     width: 1280
     height: 2300
+    # to save as pdf, uncomment the following lines and adjust the print options
+    # save_to_pdf: true
+    # print_options:
+    #   for all options see https://www.selenium.dev/selenium/docs/api/py/webdriver/selenium.webdriver.common.print_page_options.html
+    #   NOTE: page sizes and margins are plain numbers in centimeters, page_ranges is a list (empty = all pages)
+    #   background: true
+    #   orientation: "portrait"
+    #   scale: 1
+    #   page_width: 21.59
+    #   page_height: 27.94
+    #   margin_top: 1
+    #   margin_bottom: 1
+    #   margin_left: 1
+    #   margin_right: 1
+    #   page_ranges: []
+    #   shrink_to_fit: true
+
   wayback_archiver_enricher:
     timeout: 10
     key: "wayback key"
diff --git a/src/auto_archiver/enrichers/screenshot_enricher.py b/src/auto_archiver/enrichers/screenshot_enricher.py
index 69f466b..b2ef096 100644
--- a/src/auto_archiver/enrichers/screenshot_enricher.py
+++ b/src/auto_archiver/enrichers/screenshot_enricher.py
@@ -1,3 +1,5 @@
 from loguru import logger
 import time, os
+import base64
+
 from selenium.common.exceptions import TimeoutException
@@ -18,22 +20,31 @@ class ScreenshotEnricher(Enricher):
             "timeout": {"default": 60, "help": "timeout for taking the screenshot"},
             "sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"},
             "http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"},
+            "save_to_pdf": {"default": False, "help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"},
+            "print_options": {"default": {}, "help": "options to pass to the pdf printer"},
         }
 
     def enrich(self, to_enrich: Metadata) -> None:
         url = to_enrich.get_url()
         if UrlUtil.is_auth_wall(url):
             logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}")
             return
 
         logger.debug(f"Enriching screenshot for {url=}")
-        with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url, http_proxy=self.http_proxy) as driver:
+        with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url, http_proxy=self.http_proxy, print_options=self.print_options) as driver:
             try:
                 driver.get(url)
                 time.sleep(int(self.sleep_before_screenshot))
                 screenshot_file = os.path.join(ArchivingContext.get_tmp_dir(), f"screenshot_{random_str(8)}.png")
                 driver.save_screenshot(screenshot_file)
                 to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")
+                if self.save_to_pdf:
+                    pdf_file = os.path.join(ArchivingContext.get_tmp_dir(), f"pdf_{random_str(8)}.pdf")
+                    # print_page hands back the rendered PDF as base64 text, so decode before writing bytes
+                    pdf = driver.print_page(driver.print_options)
+                    with open(pdf_file, "wb") as f:
+                        f.write(base64.b64decode(pdf))
+                    to_enrich.add_media(Media(filename=pdf_file), id="pdf")
             except TimeoutException:
                 logger.info("TimeoutException loading page for screenshot")
             except Exception as e:
diff --git a/src/auto_archiver/utils/webdriver.py b/src/auto_archiver/utils/webdriver.py
index dc21e17..7e95330 100644
--- a/src/auto_archiver/utils/webdriver.py
+++ b/src/auto_archiver/utils/webdriver.py
@@ -2,18 +2,28 @@ from __future__ import annotations
 from selenium import webdriver
 from selenium.common.exceptions import TimeoutException
 from selenium.webdriver.common.proxy import Proxy, ProxyType
+from selenium.webdriver.common.print_page_options import PrintOptions
+
 from loguru import logger
 from selenium.webdriver.common.by import By
 import time
 
 
 class Webdriver:
-    def __init__(self, width: int, height: int, timeout_seconds: int, facebook_accept_cookies: bool = False, http_proxy: str = "") -> webdriver:
+    def __init__(self, width: int, height: int, timeout_seconds: int, facebook_accept_cookies: bool = False, http_proxy: str = "", print_options: dict | None = None) -> None:
         self.width = width
         self.height = height
         self.timeout_seconds = timeout_seconds
         self.facebook_accept_cookies = facebook_accept_cookies
         self.http_proxy = http_proxy
+        # build the selenium PrintOptions from the plain config dict (None means "no overrides");
+        # unknown keys are reported and skipped instead of being set as meaningless attributes
+        self.print_options = PrintOptions()
+        for k, v in (print_options or {}).items():
+            if not hasattr(self.print_options, k):
+                logger.warning(f"ignoring unknown print option {k!r}")
+                continue
+            setattr(self.print_options, k, v)
 
     def __enter__(self) -> webdriver:
         options = webdriver.FirefoxOptions()
@@ -24,5 +34,6 @@ class Webdriver:
             self.driver = webdriver.Firefox(options=options)
             self.driver.set_window_size(self.width, self.height)
             self.driver.set_page_load_timeout(self.timeout_seconds)
+            self.driver.print_options = self.print_options
         except TimeoutException as e:
             logger.error(f"failed to get new webdriver, possibly due to insufficient system resources or timeout settings: {e}")