mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-08 03:18:28 +03:00
Add 'print_pdf' option to the screenshot enricher. Fixes #132
This commit is contained in:
@@ -97,6 +97,22 @@ configurations:
|
||||
screenshot_enricher:
|
||||
width: 1280
|
||||
height: 2300
|
||||
# to save as pdf, uncomment the following lines and adjust the print options
|
||||
# save_to_pdf: true
|
||||
# print_options:
|
||||
# for all options see https://www.selenium.dev/selenium/docs/api/py/webdriver/selenium.webdriver.common.print_page_options.html
|
||||
# background: true
|
||||
# orientation: "portrait"
|
||||
# scale: 1
|
||||
# page_width: 8.5in
|
||||
# page_height: 11in
|
||||
# margin_top: 0.4in
|
||||
# margin_bottom: 0.4in
|
||||
# margin_left: 0.4in
|
||||
# margin_right: 0.4in
|
||||
# page_ranges: ""
|
||||
# shrink_to_fit: true
|
||||
|
||||
wayback_archiver_enricher:
|
||||
timeout: 10
|
||||
key: "wayback key"
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
from loguru import logger
|
||||
import time, os
|
||||
import base64
|
||||
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
|
||||
|
||||
@@ -18,22 +20,31 @@ class ScreenshotEnricher(Enricher):
|
||||
"timeout": {"default": 60, "help": "timeout for taking the screenshot"},
|
||||
"sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"},
|
||||
"http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"},
|
||||
"save_to_pdf": {"default": False, "help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"},
|
||||
"print_options": {"default": {}, "help": "options to pass to the pdf printer"}
|
||||
}
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
url = to_enrich.get_url()
|
||||
|
||||
if UrlUtil.is_auth_wall(url):
|
||||
logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}")
|
||||
return
|
||||
|
||||
logger.debug(f"Enriching screenshot for {url=}")
|
||||
with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url, http_proxy=self.http_proxy) as driver:
|
||||
with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url, http_proxy=self.http_proxy, print_options=self.print_options) as driver:
|
||||
try:
|
||||
driver.get(url)
|
||||
time.sleep(int(self.sleep_before_screenshot))
|
||||
screenshot_file = os.path.join(ArchivingContext.get_tmp_dir(), f"screenshot_{random_str(8)}.png")
|
||||
driver.save_screenshot(screenshot_file)
|
||||
to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")
|
||||
if self.save_to_pdf:
|
||||
pdf_file = os.path.join(ArchivingContext.get_tmp_dir(), f"pdf_{random_str(8)}.pdf")
|
||||
pdf = driver.print_page(driver.print_options)
|
||||
with open(pdf_file, "wb") as f:
|
||||
f.write(base64.b64decode(pdf))
|
||||
to_enrich.add_media(Media(filename=pdf_file), id="pdf")
|
||||
except TimeoutException:
|
||||
logger.info("TimeoutException loading page for screenshot")
|
||||
except Exception as e:
|
||||
|
||||
@@ -2,18 +2,24 @@ from __future__ import annotations
|
||||
from selenium import webdriver
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
from selenium.webdriver.common.proxy import Proxy, ProxyType
|
||||
from selenium.webdriver.common.print_page_options import PrintOptions
|
||||
|
||||
from loguru import logger
|
||||
from selenium.webdriver.common.by import By
|
||||
import time
|
||||
|
||||
|
||||
class Webdriver:
|
||||
def __init__(self, width: int, height: int, timeout_seconds: int, facebook_accept_cookies: bool = False, http_proxy: str = "") -> webdriver:
|
||||
def __init__(self, width: int, height: int, timeout_seconds: int, facebook_accept_cookies: bool = False, http_proxy: str = "", print_options: dict = {}) -> webdriver:
|
||||
self.width = width
|
||||
self.height = height
|
||||
self.timeout_seconds = timeout_seconds
|
||||
self.facebook_accept_cookies = facebook_accept_cookies
|
||||
self.http_proxy = http_proxy
|
||||
# create and set print options
|
||||
self.print_options = PrintOptions()
|
||||
for k, v in print_options.items():
|
||||
setattr(self.print_options, k, v)
|
||||
|
||||
def __enter__(self) -> webdriver:
|
||||
options = webdriver.FirefoxOptions()
|
||||
@@ -24,6 +30,7 @@ class Webdriver:
|
||||
self.driver = webdriver.Firefox(options=options)
|
||||
self.driver.set_window_size(self.width, self.height)
|
||||
self.driver.set_page_load_timeout(self.timeout_seconds)
|
||||
self.driver.print_options = self.print_options
|
||||
except TimeoutException as e:
|
||||
logger.error(f"failed to get new webdriver, possibly due to insufficient system resources or timeout settings: {e}")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user