From dea0a49600a23dffac0181b5cd8b6d701cb80259 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Mon, 3 Mar 2025 15:41:44 +0000 Subject: [PATCH 1/6] Download correct gecko-driver for the platform + fix setting executable path when running in Docker Fixes #232 --- Dockerfile | 16 +++++++++++++--- src/auto_archiver/core/base_module.py | 4 ++-- src/auto_archiver/utils/webdriver.py | 21 ++++++++++++++------- 3 files changed, 29 insertions(+), 12 deletions(-) diff --git a/Dockerfile b/Dockerfile index cbcfdd4..713d5c2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,16 +7,26 @@ ENV RUNNING_IN_DOCKER=1 \ PYTHONFAULTHANDLER=1 \ PATH="/root/.local/bin:$PATH" + +ARG TARGETARCH + # Installing system dependencies RUN add-apt-repository ppa:mozillateam/ppa && \ apt-get update && \ apt-get install -y --no-install-recommends gcc ffmpeg fonts-noto exiftool && \ apt-get install -y --no-install-recommends firefox-esr && \ - ln -s /usr/bin/firefox-esr /usr/bin/firefox && \ - wget https://github.com/mozilla/geckodriver/releases/download/v0.35.0/geckodriver-v0.35.0-linux64.tar.gz && \ + ln -s /usr/bin/firefox-esr /usr/bin/firefox + +ARG GECKODRIVER_VERSION=0.35.0 + +RUN if [ $(uname -m) = "aarch64" ]; then \ + GECKODRIVER_ARCH=linux-aarch64; \ + else \ + GECKODRIVER_ARCH=linux64; \ + fi && \ + wget https://github.com/mozilla/geckodriver/releases/download/v${GECKODRIVER_VERSION}/geckodriver-v${GECKODRIVER_VERSION}-${GECKODRIVER_ARCH}.tar.gz && \ tar -xvzf geckodriver* -C /usr/local/bin && \ chmod +x /usr/local/bin/geckodriver && \ - rm geckodriver-v* && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* diff --git a/src/auto_archiver/core/base_module.py b/src/auto_archiver/core/base_module.py index 8d520d1..d6e4455 100644 --- a/src/auto_archiver/core/base_module.py +++ b/src/auto_archiver/core/base_module.py @@ -105,8 +105,8 @@ class BaseModule(ABC): for key in self.authentication.keys(): if key in site or site in key: logger.debug(f"Could not find exact authentication information for site '{site}'. \ - did find information for '{key}' which is close, is this what you meant? \ - If so, edit your authentication settings to make sure it exactly matches.") +did find information for '{key}' which is close, is this what you meant? \ +If so, edit your authentication settings to make sure it exactly matches.") def get_ytdlp_cookiejar(args): import yt_dlp diff --git a/src/auto_archiver/utils/webdriver.py b/src/auto_archiver/utils/webdriver.py index db26d04..50a7b94 100644 --- a/src/auto_archiver/utils/webdriver.py +++ b/src/auto_archiver/utils/webdriver.py @@ -1,18 +1,22 @@ """ This Webdriver class acts as a context manager for the selenium webdriver. """ from __future__ import annotations -from selenium import webdriver -from selenium.common.exceptions import TimeoutException -from selenium.webdriver.common.proxy import Proxy, ProxyType -from selenium.webdriver.common.print_page_options import PrintOptions -from loguru import logger -from selenium.webdriver.common.by import By +import os import time #import domain_for_url from urllib.parse import urlparse, urlunparse from http.cookiejar import MozillaCookieJar +from selenium import webdriver +from selenium.common.exceptions import TimeoutException +from selenium.webdriver.common.proxy import Proxy, ProxyType +from selenium.webdriver.common.print_page_options import PrintOptions +from selenium.webdriver.common.by import By + +from loguru import logger + + class CookieSettingDriver(webdriver.Firefox): facebook_accept_cookies: bool @@ -20,6 +24,10 @@ class CookieSettingDriver(webdriver.Firefox): cookiejar: MozillaCookieJar def __init__(self, cookies, cookiejar, facebook_accept_cookies, *args, **kwargs): + if os.environ.get('RUNNING_IN_DOCKER'): + # Selenium doesn't support linux-aarch64 driver, we need to set this manually + kwargs['service'] = webdriver.FirefoxService(executable_path='/usr/local/bin/geckodriver') + super(CookieSettingDriver, self).__init__(*args, **kwargs) self.cookies = cookies self.cookiejar = cookiejar @@ -90,7 +98,6 @@ class Webdriver: setattr(self.print_options, k, v) def __enter__(self) -> webdriver: - options = webdriver.FirefoxOptions() options.add_argument("--headless") options.add_argument(f'--proxy-server={self.http_proxy}') From 0dfab2d1bcb89de0d994d14cc03b91fe14930449 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Mon, 3 Mar 2025 15:55:04 +0000 Subject: [PATCH 2/6] Add some code to attempt to click the cookies banners on various websites --- src/auto_archiver/utils/webdriver.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/auto_archiver/utils/webdriver.py b/src/auto_archiver/utils/webdriver.py index 50a7b94..c6ad341 100644 --- a/src/auto_archiver/utils/webdriver.py +++ b/src/auto_archiver/utils/webdriver.py @@ -72,6 +72,8 @@ class CookieSettingDriver(webdriver.Firefox): time.sleep(2) except Exception as e: logger.warning(f'Failed on fb accept cookies.', e) + + # now get the actual URL super(CookieSettingDriver, self).get(url) if self.facebook_accept_cookies: @@ -79,7 +81,17 @@ class CookieSettingDriver(webdriver.Firefox): close_button = self.find_element(By.XPATH, "//div[@role='dialog']//div[@aria-label='Close']") if close_button: close_button.click() + else: + # for all other sites, try and use some common button text to reject/accept cookies + for text in ["Refuse non-essential cookies", "Decline optional cookies", "Reject additional cookies", "Accept all cookies"]: + try: + accept_button = self.find_element(By.XPATH, f"//*[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '{text.lower()}')]") + if accept_button: + accept_button.click() + break + except Exception as e: + pass class Webdriver: From a47e18ef9ac6eacfb4a749511d60e081dd7f3e10 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Mon, 3 Mar 2025 16:00:11 +0000 Subject: [PATCH 3/6] Bump gecko driver to 0.36.0 --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 713d5c2..67ed7e9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -17,7 +17,7 @@ RUN add-apt-repository ppa:mozillateam/ppa && \ apt-get install -y --no-install-recommends firefox-esr && \ ln -s /usr/bin/firefox-esr /usr/bin/firefox -ARG GECKODRIVER_VERSION=0.35.0 +ARG GECKODRIVER_VERSION=0.36.0 RUN if [ $(uname -m) = "aarch64" ]; then \ GECKODRIVER_ARCH=linux-aarch64; \ From e756f1504fd679fc467b4d43a339859fe0eaa803 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Fri, 7 Mar 2025 11:52:14 +0000 Subject: [PATCH 4/6] Remove geckodriver .tar file --- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index 67ed7e9..68aed42 100644 --- a/Dockerfile +++ b/Dockerfile @@ -27,6 +27,7 @@ RUN if [ $(uname -m) = "aarch64" ]; then \ wget https://github.com/mozilla/geckodriver/releases/download/v${GECKODRIVER_VERSION}/geckodriver-v${GECKODRIVER_VERSION}-${GECKODRIVER_ARCH}.tar.gz && \ tar -xvzf geckodriver* -C /usr/local/bin && \ chmod +x /usr/local/bin/geckodriver && \ + rm geckodriver-v* && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* From dba44b1ac1c810f366f3c47faee6206c9c77b31e Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Fri, 7 Mar 2025 12:07:54 +0000 Subject: [PATCH 5/6] Use WebDriverWait when waiting for elements in screenshot enricher --- src/auto_archiver/utils/webdriver.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/src/auto_archiver/utils/webdriver.py b/src/auto_archiver/utils/webdriver.py index c6ad341..1d01df2 100644 --- a/src/auto_archiver/utils/webdriver.py +++ b/src/auto_archiver/utils/webdriver.py @@ -9,8 +9,9 @@ from urllib.parse import urlparse, urlunparse from http.cookiejar import MozillaCookieJar from selenium import webdriver -from selenium.common.exceptions import TimeoutException -from selenium.webdriver.common.proxy import Proxy, ProxyType +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC +from selenium.common import exceptions as selenium_exceptions from selenium.webdriver.common.print_page_options import PrintOptions from selenium.webdriver.common.by import By @@ -78,19 +79,22 @@ class CookieSettingDriver(webdriver.Firefox): super(CookieSettingDriver, self).get(url) if self.facebook_accept_cookies: # try and click the 'close' button on the 'login' window to close it - close_button = self.find_element(By.XPATH, "//div[@role='dialog']//div[@aria-label='Close']") - if close_button: - close_button.click() + try: + xpath = "//div[@role='dialog']//div[@aria-label='Close']" + WebDriverWait(self, 5).until(EC.element_to_be_clickable((By.XPATH, xpath))).click() + except selenium_exceptions.NoSuchElementException: + logger.warning("Unable to find the 'close' button on the facebook login window") + pass + else: # for all other sites, try and use some common button text to reject/accept cookies for text in ["Refuse non-essential cookies", "Decline optional cookies", "Reject additional cookies", "Accept all cookies"]: try: - accept_button = self.find_element(By.XPATH, f"//*[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '{text.lower()}')]") - if accept_button: - accept_button.click() - break - except Exception as e: + xpath = f"//*[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '{text.lower()}')]" + WebDriverWait(self, 5).until(EC.element_to_be_clickable((By.XPATH, xpath))).click() + break + except selenium_exceptions.WebDriverException: pass @@ -124,7 +128,7 @@ class Webdriver: self.driver.set_window_size(self.width, self.height) self.driver.set_page_load_timeout(self.timeout_seconds) self.driver.print_options = self.print_options - except TimeoutException as e: + except selenium_exceptions.TimeoutException as e: logger.error(f"failed to get new webdriver, possibly due to insufficient system resources or timeout settings: {e}") return self.driver From e72b3e14ba5103973e0d5d3315807b2befc19ffb Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Fri, 7 Mar 2025 12:08:29 +0000 Subject: [PATCH 6/6] Change default height of screenshots to attempt to capture more information --- src/auto_archiver/modules/screenshot_enricher/__manifest__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/auto_archiver/modules/screenshot_enricher/__manifest__.py b/src/auto_archiver/modules/screenshot_enricher/__manifest__.py index 9829844..970b0df 100644 --- a/src/auto_archiver/modules/screenshot_enricher/__manifest__.py +++ b/src/auto_archiver/modules/screenshot_enricher/__manifest__.py @@ -7,7 +7,7 @@ }, "configs": { "width": {"default": 1280, "help": "width of the screenshots"}, - "height": {"default": 720, "help": "height of the screenshots"}, + "height": {"default": 1024, "help": "height of the screenshots"}, "timeout": {"default": 60, "help": "timeout for taking the screenshot"}, "sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"}, "http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"},