Merge pull request #233 from bellingcat/docker-webdriver-aarch64

Docker webdriver aarch64
This commit is contained in:
Patrick Robertson
2025-03-07 15:04:45 +00:00
committed by GitHub
4 changed files with 50 additions and 16 deletions

View File

@@ -7,13 +7,24 @@ ENV RUNNING_IN_DOCKER=1 \
PYTHONFAULTHANDLER=1 \ PYTHONFAULTHANDLER=1 \
PATH="/root/.local/bin:$PATH" PATH="/root/.local/bin:$PATH"
ARG TARGETARCH
# Installing system dependencies # Installing system dependencies
RUN add-apt-repository ppa:mozillateam/ppa && \ RUN add-apt-repository ppa:mozillateam/ppa && \
apt-get update && \ apt-get update && \
apt-get install -y --no-install-recommends gcc ffmpeg fonts-noto exiftool && \ apt-get install -y --no-install-recommends gcc ffmpeg fonts-noto exiftool && \
apt-get install -y --no-install-recommends firefox-esr && \ apt-get install -y --no-install-recommends firefox-esr && \
ln -s /usr/bin/firefox-esr /usr/bin/firefox && \ ln -s /usr/bin/firefox-esr /usr/bin/firefox
wget https://github.com/mozilla/geckodriver/releases/download/v0.35.0/geckodriver-v0.35.0-linux64.tar.gz && \
ARG GECKODRIVER_VERSION=0.36.0
RUN if [ $(uname -m) = "aarch64" ]; then \
GECKODRIVER_ARCH=linux-aarch64; \
else \
GECKODRIVER_ARCH=linux64; \
fi && \
wget https://github.com/mozilla/geckodriver/releases/download/v${GECKODRIVER_VERSION}/geckodriver-v${GECKODRIVER_VERSION}-${GECKODRIVER_ARCH}.tar.gz && \
tar -xvzf geckodriver* -C /usr/local/bin && \ tar -xvzf geckodriver* -C /usr/local/bin && \
chmod +x /usr/local/bin/geckodriver && \ chmod +x /usr/local/bin/geckodriver && \
rm geckodriver-v* && \ rm geckodriver-v* && \

View File

@@ -105,8 +105,8 @@ class BaseModule(ABC):
for key in self.authentication.keys(): for key in self.authentication.keys():
if key in site or site in key: if key in site or site in key:
logger.debug(f"Could not find exact authentication information for site '{site}'. \ logger.debug(f"Could not find exact authentication information for site '{site}'. \
did find information for '{key}' which is close, is this what you meant? \ did find information for '{key}' which is close, is this what you meant? \
If so, edit your authentication settings to make sure it exactly matches.") If so, edit your authentication settings to make sure it exactly matches.")
def get_ytdlp_cookiejar(args): def get_ytdlp_cookiejar(args):
import yt_dlp import yt_dlp

View File

@@ -7,7 +7,7 @@
}, },
"configs": { "configs": {
"width": {"default": 1280, "help": "width of the screenshots"}, "width": {"default": 1280, "help": "width of the screenshots"},
"height": {"default": 720, "help": "height of the screenshots"}, "height": {"default": 1024, "help": "height of the screenshots"},
"timeout": {"default": 60, "help": "timeout for taking the screenshot"}, "timeout": {"default": 60, "help": "timeout for taking the screenshot"},
"sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"}, "sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"},
"http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"}, "http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"},

View File

@@ -1,18 +1,23 @@
""" This Webdriver class acts as a context manager for the selenium webdriver. """ """ This Webdriver class acts as a context manager for the selenium webdriver. """
from __future__ import annotations from __future__ import annotations
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.proxy import Proxy, ProxyType
from selenium.webdriver.common.print_page_options import PrintOptions
from loguru import logger import os
from selenium.webdriver.common.by import By
import time import time
#import domain_for_url #import domain_for_url
from urllib.parse import urlparse, urlunparse from urllib.parse import urlparse, urlunparse
from http.cookiejar import MozillaCookieJar from http.cookiejar import MozillaCookieJar
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common import exceptions as selenium_exceptions
from selenium.webdriver.common.print_page_options import PrintOptions
from selenium.webdriver.common.by import By
from loguru import logger
class CookieSettingDriver(webdriver.Firefox): class CookieSettingDriver(webdriver.Firefox):
facebook_accept_cookies: bool facebook_accept_cookies: bool
@@ -20,6 +25,10 @@ class CookieSettingDriver(webdriver.Firefox):
cookiejar: MozillaCookieJar cookiejar: MozillaCookieJar
def __init__(self, cookies, cookiejar, facebook_accept_cookies, *args, **kwargs): def __init__(self, cookies, cookiejar, facebook_accept_cookies, *args, **kwargs):
if os.environ.get('RUNNING_IN_DOCKER'):
# Selenium doesn't support linux-aarch64 driver, we need to set this manually
kwargs['service'] = webdriver.FirefoxService(executable_path='/usr/local/bin/geckodriver')
super(CookieSettingDriver, self).__init__(*args, **kwargs) super(CookieSettingDriver, self).__init__(*args, **kwargs)
self.cookies = cookies self.cookies = cookies
self.cookiejar = cookiejar self.cookiejar = cookiejar
@@ -64,14 +73,29 @@ class CookieSettingDriver(webdriver.Firefox):
time.sleep(2) time.sleep(2)
except Exception as e: except Exception as e:
logger.warning(f'Failed on fb accept cookies.', e) logger.warning(f'Failed on fb accept cookies.', e)
# now get the actual URL # now get the actual URL
super(CookieSettingDriver, self).get(url) super(CookieSettingDriver, self).get(url)
if self.facebook_accept_cookies: if self.facebook_accept_cookies:
# try and click the 'close' button on the 'login' window to close it # try and click the 'close' button on the 'login' window to close it
close_button = self.find_element(By.XPATH, "//div[@role='dialog']//div[@aria-label='Close']") try:
if close_button: xpath = "//div[@role='dialog']//div[@aria-label='Close']"
close_button.click() WebDriverWait(self, 5).until(EC.element_to_be_clickable((By.XPATH, xpath))).click()
except selenium_exceptions.NoSuchElementException:
logger.warning("Unable to find the 'close' button on the facebook login window")
pass
else:
# for all other sites, try and use some common button text to reject/accept cookies
for text in ["Refuse non-essential cookies", "Decline optional cookies", "Reject additional cookies", "Accept all cookies"]:
try:
xpath = f"//*[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '{text.lower()}')]"
WebDriverWait(self, 5).until(EC.element_to_be_clickable((By.XPATH, xpath))).click()
break
except selenium_exceptions.WebDriverException:
pass
class Webdriver: class Webdriver:
@@ -90,7 +114,6 @@ class Webdriver:
setattr(self.print_options, k, v) setattr(self.print_options, k, v)
def __enter__(self) -> webdriver: def __enter__(self) -> webdriver:
options = webdriver.FirefoxOptions() options = webdriver.FirefoxOptions()
options.add_argument("--headless") options.add_argument("--headless")
options.add_argument(f'--proxy-server={self.http_proxy}') options.add_argument(f'--proxy-server={self.http_proxy}')
@@ -105,7 +128,7 @@ class Webdriver:
self.driver.set_window_size(self.width, self.height) self.driver.set_window_size(self.width, self.height)
self.driver.set_page_load_timeout(self.timeout_seconds) self.driver.set_page_load_timeout(self.timeout_seconds)
self.driver.print_options = self.print_options self.driver.print_options = self.print_options
except TimeoutException as e: except selenium_exceptions.TimeoutException as e:
logger.error(f"failed to get new webdriver, possibly due to insufficient system resources or timeout settings: {e}") logger.error(f"failed to get new webdriver, possibly due to insufficient system resources or timeout settings: {e}")
return self.driver return self.driver