Files
auto-archiver/src/auto_archiver/utils/webdriver.py

118 lines
5.3 KiB
Python

""" This Webdriver class acts as a context manager for the selenium webdriver. """
from __future__ import annotations
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.proxy import Proxy, ProxyType
from selenium.webdriver.common.print_page_options import PrintOptions
from loguru import logger
from selenium.webdriver.common.by import By
import time
#import domain_for_url
from urllib.parse import urlparse, urlunparse
from http.cookiejar import MozillaCookieJar
class CookieSettingDriver(webdriver.Firefox):
facebook_accept_cookies: bool
cookies: str
cookiejar: MozillaCookieJar
def __init__(self, cookies, cookiejar, facebook_accept_cookies, *args, **kwargs):
super(CookieSettingDriver, self).__init__(*args, **kwargs)
self.cookies = cookies
self.cookiejar = cookiejar
self.facebook_accept_cookies = facebook_accept_cookies
def get(self, url: str):
if self.cookies or self.cookiejar:
# set up the driver to make it not 'cookie averse' (needs a context/URL)
# get the 'robots.txt' file which should be quick and easy
robots_url = urlunparse(urlparse(url)._replace(path='/robots.txt', query='', fragment=''))
super(CookieSettingDriver, self).get(robots_url)
if self.cookies:
# an explicit cookie is set for this site, use that first
for cookie in self.cookies.split(";"):
for name, value in cookie.split("="):
self.driver.add_cookie({'name': name, 'value': value})
elif self.cookiejar:
domain = urlparse(url).netloc.lstrip("www.")
for cookie in self.cookiejar:
if domain in cookie.domain:
try:
self.add_cookie({
'name': cookie.name,
'value': cookie.value,
'path': cookie.path,
'domain': cookie.domain,
'secure': bool(cookie.secure),
'expiry': cookie.expires
})
except Exception as e:
logger.warning(f"Failed to add cookie to webdriver: {e}")
if self.facebook_accept_cookies:
try:
logger.debug(f'Trying fb click accept cookie popup.')
super(CookieSettingDriver, self).get("http://www.facebook.com")
essential_only = self.find_element(By.XPATH, "//span[contains(text(), 'Decline optional cookies')]")
essential_only.click()
logger.debug(f'fb click worked')
# linux server needs a sleep otherwise facebook cookie won't have worked and we'll get a popup on next page
time.sleep(2)
except Exception as e:
logger.warning(f'Failed on fb accept cookies.', e)
# now get the actual URL
super(CookieSettingDriver, self).get(url)
if self.facebook_accept_cookies:
# try and click the 'close' button on the 'login' window to close it
close_button = self.find_element(By.XPATH, "//div[@role='dialog']//div[@aria-label='Close']")
if close_button:
close_button.click()
class Webdriver:
def __init__(self, width: int, height: int, timeout_seconds: int,
facebook_accept_cookies: bool = False, http_proxy: str = "",
print_options: dict = {}, auth: dict = {}) -> webdriver:
self.width = width
self.height = height
self.timeout_seconds = timeout_seconds
self.auth = auth
self.facebook_accept_cookies = facebook_accept_cookies
self.http_proxy = http_proxy
# create and set print options
self.print_options = PrintOptions()
for k, v in print_options.items():
setattr(self.print_options, k, v)
def __enter__(self) -> webdriver:
options = webdriver.FirefoxOptions()
options.add_argument("--headless")
options.add_argument(f'--proxy-server={self.http_proxy}')
options.set_preference('network.protocol-handler.external.tg', False)
# if facebook cookie popup is present, force the browser to English since then it's easier to click the 'Decline optional cookies' option
if self.facebook_accept_cookies:
options.add_argument('--lang=en')
try:
self.driver = CookieSettingDriver(cookies=self.auth.get('cookies'), cookiejar=self.auth.get('cookies_jar'),
facebook_accept_cookies=self.facebook_accept_cookies, options=options)
self.driver.set_window_size(self.width, self.height)
self.driver.set_page_load_timeout(self.timeout_seconds)
self.driver.print_options = self.print_options
except TimeoutException as e:
logger.error(f"failed to get new webdriver, possibly due to insufficient system resources or timeout settings: {e}")
return self.driver
def __exit__(self, exc_type, exc_val, exc_tb):
self.driver.close()
self.driver.quit()
del self.driver
return True