Merge branch 'main' into merge_modules

This commit is contained in:
Patrick Robertson
2025-03-07 16:19:51 +00:00
47 changed files with 55825 additions and 102 deletions

View File

@@ -105,8 +105,8 @@ class BaseModule(ABC):
for key in self.authentication.keys():
if key in site or site in key:
logger.debug(f"Could not find exact authentication information for site '{site}'. \
did find information for '{key}' which is close, is this what you meant? \
If so, edit your authentication settings to make sure it exactly matches.")
did find information for '{key}' which is close, is this what you meant? \
If so, edit your authentication settings to make sure it exactly matches.")
def get_ytdlp_cookiejar(args):
import yt_dlp

View File

@@ -80,7 +80,10 @@ class ModuleFactory:
available = self.available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings)
if not available:
raise IndexError(f"Module '{module_name}' not found. Are you sure it's installed/exists?")
message = f"Module '{module_name}' not found. Are you sure it's installed/exists?"
if 'archiver' in module_name:
message += f" Did you mean {module_name.replace('archiver', 'extractor')}?"
raise IndexError(message)
return available[0]
def available_modules(self, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]:

View File

@@ -15,6 +15,7 @@ from copy import copy
from rich_argparse import RichHelpFormatter
from loguru import logger
import requests
from .metadata import Metadata, Media
from auto_archiver.version import __version__
@@ -72,10 +73,20 @@ class ArchivingOrchestrator:
self.basic_parser = parser
return parser
def check_steps(self, config):
    """Verify that every module type has at least one step configured.

    For each entry in MODULE_TYPES the plural key (e.g. 'feeders') is looked
    up under config['steps']. When it is missing or empty, the configuration
    is checked for step names used before version 0.13.0 so that a targeted
    migration hint can be given instead of the generic error.

    Args:
        config: the loaded configuration; must contain a 'steps' mapping.

    Raises:
        SetupError: if a module type has no modules configured. The message
            points at the legacy singular key ('feeder' / 'formatter') or the
            legacy 'archivers' key when one of those is present.
    """
    steps = config['steps']
    for module_type in MODULE_TYPES:
        if steps.get(f"{module_type}s", []):
            continue

        # The membership test keeps the legacy-key lookup attached to BOTH
        # module types. Written as `a == 'feeder' or a == 'formatter' and ...`
        # the `and` binds tighter than `or`, so a missing feeder always
        # produced the migration message even when no legacy key existed.
        if module_type in ('feeder', 'formatter') and steps.get(module_type):
            raise SetupError(
                f"It appears you have '{module_type}' set under 'steps' in your configuration file, but as of version 0.13.0 of Auto Archiver, you must use '{module_type}s'. Change this in your configuration file and try again. "
                f"Here's how that would look: \n\nsteps:\n {module_type}s:\n - [your_{module_type}_name_here]\n {'extractors:...' if module_type == 'feeder' else '...'}\n"
            )

        if module_type == 'extractor' and steps.get('archivers'):
            raise SetupError(
                "As of version 0.13.0 of Auto Archiver, the 'archivers' step name has been changed to 'extractors'. Change this in your configuration file and try again. "
                "Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_here]\n enrichers:...\n"
            )

        raise SetupError(f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)")
def setup_complete_parser(self, basic_config: dict, yaml_config: dict, unused_args: list[str]) -> None:
# modules parser to get the overridden 'steps' values
modules_parser = argparse.ArgumentParser(
add_help=False,
@@ -100,6 +111,7 @@ class ArchivingOrchestrator:
# but should we add them? Or should we just add them to the 'complete' parser?
if is_valid_config(yaml_config):
self.check_steps(yaml_config)
# only load the modules enabled in config
# TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty?
enabled_modules = []
@@ -115,10 +127,6 @@ class ArchivingOrchestrator:
simple_modules = [module for module in self.module_factory.available_modules() if not module.requires_setup]
self.add_individual_module_args(simple_modules, parser)
# for simple mode, we use the cli_feeder and any modules that don't require setup
if not yaml_config['steps']['feeders']:
yaml_config['steps']['feeders'] = ['cli_feeder']
# add them to the config
for module in simple_modules:
for module_type in module.type:
@@ -171,9 +179,6 @@ class ArchivingOrchestrator:
if not parser:
parser = self.parser
# allow passing URLs directly on the command line
parser.add_argument('urls', nargs='*', default=[], help='URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml')
parser.add_argument('--authentication', dest='authentication', help='A dictionary of sites and their authentication methods \
(token, username etc.) that extractors can use to log into \
a website. If passing this on the command line, use a JSON string. \
@@ -193,7 +198,11 @@ class ArchivingOrchestrator:
modules = self.module_factory.available_modules()
for module in modules:
if module.name == 'cli_feeder':
# special case. For the CLI feeder, allow passing URLs directly on the command line without setting --cli_feeder.urls=
parser.add_argument('urls', nargs='*', default=[], help='URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml')
continue
if not module.configs:
# this module has no configs, don't show anything in the help
# (TODO: do we want to show something about this module though, like a description?)
@@ -277,36 +286,16 @@ class ArchivingOrchestrator:
raise SetupError(f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}")
for module in modules_to_load:
if module == 'cli_feeder':
# cli_feeder is a pseudo module, it just takes the command line args for [URLS]
urls = self.config['urls']
if not urls:
raise SetupError("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information.")
def feed(self) -> Generator[Metadata]:
for url in urls:
logger.debug(f"Processing URL: '{url}'")
yield Metadata().set_url(url)
pseudo_module = type('CLIFeeder', (Feeder,), {
'name': 'cli_feeder',
'display_name': 'CLI Feeder',
'__iter__': feed
})()
pseudo_module.__iter__ = feed
step_items.append(pseudo_module)
continue
if module in invalid_modules:
continue
loaded_module = None
try:
loaded_module: BaseModule = self.module_factory.get_module(module, self.config)
except (KeyboardInterrupt, Exception) as e:
logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
if module_type == 'extractor' and loaded_module.name == module:
if loaded_module and module_type == 'extractor':
loaded_module.cleanup()
raise e
@@ -348,7 +337,23 @@ class ArchivingOrchestrator:
yaml_config = self.load_config(basic_config.config_file)
return self.setup_complete_parser(basic_config, yaml_config, unused_args)
def check_for_updates(self):
    """Warn the user when a newer auto-archiver release is published on PyPI.

    The check is purely advisory. Any failure to reach PyPI or to read its
    response is logged at debug level and otherwise ignored, so the archiver
    still starts when offline or behind a restrictive proxy.
    """
    try:
        # A timeout is required: without one, requests waits indefinitely on
        # an unresponsive connection.
        response = requests.get("https://pypi.org/pypi/auto-archiver/json", timeout=5)
        response.raise_for_status()
        latest_version = response.json()['info']['version']
    except (requests.RequestException, ValueError, KeyError, TypeError) as e:
        logger.debug(f"Unable to check for auto-archiver updates: {e}")
        return

    # check version compared to current version
    if latest_version == __version__:
        return

    # The suggested command depends on how the tool was installed.
    if os.environ.get('RUNNING_IN_DOCKER'):
        update_cmd = "`docker pull bellingcat/auto-archiver:latest`"
    else:
        update_cmd = "`pip install --upgrade auto-archiver`"

    logger.warning("")
    logger.warning("********* IMPORTANT: UPDATE AVAILABLE ********")
    logger.warning(f"A new version of auto-archiver is available (v{latest_version}, you have {__version__})")
    logger.warning(f"Make sure to update to the latest version using: {update_cmd}")
    logger.warning("")
def setup(self, args: list):
"""
Function to configure all setup of the orchestrator: setup configs and load modules.
@@ -356,6 +361,8 @@ class ArchivingOrchestrator:
This method should only ever be called once
"""
self.check_for_updates()
if self.setup_finished:
logger.warning("The `setup_config()` function should only ever be run once. \
If you need to re-run the setup, please re-instantiate a new instance of the orchestrator. \

View File

@@ -0,0 +1,23 @@
# Manifest for the Command Line Feeder module.
{
    'name': 'Command Line Feeder',
    'type': ['feeder'],
    'entry_point': 'cli_feeder::CLIFeeder',
    'requires_setup': False,
    'configs': {
        'urls': {
            'default': None,
            'help': 'URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml',
        },
    },
    # NOTE: 'description' must appear only once. A dict literal keeps the last
    # value for a repeated key, so an earlier one-line summary
    # ('Feeds URLs to orchestrator from the command line') was dead data.
    'description': """
The Command Line Feeder is the default enabled feeder for the Auto Archiver. It allows you to pass URLs directly to the orchestrator from the command line
without the need to specify any additional configuration or command line arguments:
`auto-archiver --feeders cli_feeder -- "https://example.com/1/,https://example.com/2/"`
You can pass multiple URLs by separating them with a space. The URLs will be processed in the order they are provided.
`auto-archiver --feeders cli_feeder -- https://example.com/1/ https://example.com/2/`
""",
}

View File

@@ -0,0 +1,21 @@
from loguru import logger
from auto_archiver.core.feeder import Feeder
from auto_archiver.core.metadata import Metadata
class CLIFeeder(Feeder):
    """Feeder that turns the URLs found in the 'urls' config value into Metadata items."""

    def setup(self) -> None:
        """Store the configured URLs on the instance and fail early when there are none.

        Raises:
            ValueError: if the 'urls' config value is empty or unset.
        """
        configured_urls = self.config['urls']
        self.urls = configured_urls
        if configured_urls:
            return
        raise ValueError("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information.")

    def __iter__(self) -> Metadata:
        """Yield one Metadata item per configured URL, in the order given.

        Each item has its URL set and its "folder" context set to "cli".
        """
        url_list = self.config['urls']
        for address in url_list:
            logger.debug(f"Processing {address}")
            item = Metadata().set_url(address)
            item.set_context("folder", "cli")
            yield item

        # Only reached once the consumer has exhausted the generator.
        logger.success(f"Processed {len(url_list)} URL(s)")

View File

@@ -10,7 +10,7 @@ class ConsoleDb(Database):
"""
def started(self, item: Metadata) -> None:
logger.warning(f"STARTED {item}")
logger.info(f"STARTED {item}")
def failed(self, item: Metadata, reason:str) -> None:
logger.error(f"FAILED {item}: {reason}")

View File

@@ -6,7 +6,7 @@
},
'entry_point': 'csv_db::CSVDb',
"configs": {
"csv_file": {"default": "db.csv", "help": "CSV file name"}
"csv_file": {"default": "db.csv", "help": "CSV file name to save metadata to"},
},
"description": """
Handles exporting archival results to a CSV file.

View File

@@ -28,6 +28,13 @@ the broader archiving framework.
metadata objects. Some dropins are included in this generic_archiver by default, but
custom dropins can be created to handle additional websites and passed to the archiver
via the command line using the `--dropins` option (TODO!).
### Auto-Updates
The Generic Extractor will also automatically check for updates to `yt-dlp` (every 5 days by default).
This can be configured using the `ytdlp_update_interval` setting (or disabled by setting it to -1).
If you are having issues with the extractor, you can review the version of `yt-dlp` being used with `yt-dlp --version`.
""",
"configs": {
"subtitles": {"default": True, "help": "download subtitles if available", "type": "bool"},
@@ -64,5 +71,10 @@ via the command line using the `--dropins` option (TODO!).
"default": "inf",
"help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit.",
},
"ytdlp_update_interval": {
"default": 5,
"help": "How often to check for yt-dlp updates (days). If positive, will check and update yt-dlp every [num] days. Set it to -1 to disable, or 0 to always update on every run.",
"type": "int",
},
},
}

View File

@@ -1,7 +1,11 @@
import datetime, os, yt_dlp, pysubs2
import datetime, os
import importlib
import subprocess
from typing import Generator, Type
import yt_dlp
from yt_dlp.extractor.common import InfoExtractor
import pysubs2
from loguru import logger
@@ -11,6 +15,44 @@ from auto_archiver.core import Metadata, Media
class GenericExtractor(Extractor):
_dropins = {}
def setup(self):
    """Run the periodic yt-dlp self-update check.

    The time of the next check is kept as an ISO-8601 timestamp in a
    '.ytdlp-update' marker file, stored in the 'secrets' folder when that
    folder exists and in the current working directory otherwise. When the
    marker is missing, unreadable or in the past, yt-dlp is updated and the
    marker is rewritten `ytdlp_update_interval` days into the future.

    A negative `ytdlp_update_interval` disables the check entirely.
    """
    if self.ytdlp_update_interval < 0:
        return

    use_secrets = os.path.exists('secrets')
    path = os.path.join('secrets' if use_secrets else '', '.ytdlp-update')

    next_update_check = None
    if os.path.exists(path):
        with open(path, "r") as f:
            # strip(): a trailing newline (e.g. from a hand-edited file) is
            # rejected by fromisoformat
            stamp = f.read().strip()
        try:
            next_update_check = datetime.datetime.fromisoformat(stamp)
        except ValueError:
            # An empty or corrupt marker must not abort extractor setup;
            # treat it as "no check recorded" so it gets rewritten below.
            logger.warning(f"Ignoring invalid yt-dlp update marker in '{path}', a new one will be written")

    if not next_update_check or next_update_check < datetime.datetime.now():
        self.update_ytdlp()

        next_update_check = datetime.datetime.now() + datetime.timedelta(days=self.ytdlp_update_interval)
        with open(path, "w") as f:
            f.write(next_update_check.isoformat())
def update_ytdlp(self):
    """Upgrade yt-dlp with pip and reload the module when a new version was installed.

    This is best-effort: any failure (pip missing, no network, missing
    package metadata, ...) is logged as an error and never raised, so a
    failed update cannot stop the extractor from being set up.
    """
    # Local imports keep this block self-contained.
    import sys
    from importlib.metadata import version as get_version

    logger.info("Checking and updating yt-dlp...")
    logger.info(f"Tip: change the 'ytdlp_update_interval' setting to control how often yt-dlp is updated. Set to -1 to disable or 0 to enable on every run. Current setting: {self.ytdlp_update_interval}")

    try:
        old_version = get_version("yt-dlp")
        # Run pip through the current interpreter so the upgrade lands in the
        # environment this process is actually running from (virtualenv,
        # poetry environment, ...). A bare `pip` from PATH may belong to a
        # different Python installation.
        result = subprocess.run(
            [sys.executable, "-m", "pip", "install", "--upgrade", "yt-dlp"],
            check=True,
            capture_output=True,
        )

        if "Successfully installed yt-dlp" in result.stdout.decode():
            new_version = get_version("yt-dlp")
            logger.info(f"yt-dlp successfully updated (from {old_version} to {new_version})")
            importlib.reload(yt_dlp)
        else:
            logger.info("yt-dlp already up to date")
    except Exception as e:
        logger.error(f"Error updating yt-dlp: {e}")
def suitable_extractors(self, url: str) -> Generator[str, None, None]:
"""
Returns a list of valid extractors for the given URL"""

View File

@@ -12,7 +12,9 @@
"default": None,
"help": "the id of the sheet to archive (alternative to 'sheet' config)",
},
"header": {"default": 1, "help": "index of the header row (starts at 1)", "type": "int"},
"header": {"default": 1,
"type": "int",
"help": "index of the header row (starts at 1)", "type": "int"},
"service_account": {
"default": "secrets/service_account.json",
"help": "service account JSON file path. Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html",

View File

@@ -7,7 +7,9 @@
"bin": [""]
},
"configs": {
"detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"}
"detect_thumbnails": {"default": True,
"help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'",
"type": "bool"},
},
"description": """ """,
}

View File

@@ -10,25 +10,30 @@
"requires_setup": True,
"configs": {
"username": {"required": True,
"help": "a valid Instagram username"},
"help": "A valid Instagram username."},
"password": {
"required": True,
"help": "the corresponding Instagram account password",
"help": "The corresponding Instagram account password.",
},
"download_folder": {
"default": "instaloader",
"help": "name of a folder to temporarily download content to",
"help": "Name of a folder to temporarily download content to.",
},
"session_file": {
"default": "secrets/instaloader.session",
"help": "path to the instagram session which saves session credentials",
"help": "Path to the instagram session file which saves session credentials. If one doesn't exist this gives the path to store a new one.",
},
# TODO: fine-grain
# "download_stories": {"default": True, "help": "if the link is to a user profile: whether to get stories information"},
},
"description": """
Uses the [Instaloader library](https://instaloader.github.io/as-module.html) to download content from Instagram. This class handles both individual posts
and user profiles, downloading as much information as possible, including images, videos, text, stories,
Uses the [Instaloader library](https://instaloader.github.io/as-module.html) to download content from Instagram.
> ⚠️ **Warning**
> This module is not actively maintained due to known issues with blocking.
> Prioritise usage of the [Instagram Tbot Extractor](./instagram_tbot_extractor.md) and [Instagram API Extractor](./instagram_api_extractor.md)
This class handles both individual posts and user profiles, downloading as much information as possible, including images, videos, text, stories,
highlights, and tagged posts.
Authentication is required via username/password or a session file.

View File

@@ -3,7 +3,7 @@
highlights, and tagged posts. Authentication is required via username/password or a session file.
"""
import re, os, shutil, traceback
import re, os, shutil
import instaloader
from loguru import logger
@@ -15,10 +15,9 @@ class InstagramExtractor(Extractor):
"""
Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ...)
"""
# NB: post regex should be tested before profile
valid_url = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/")
# https://regex101.com/r/MGPquX/1
post_pattern = re.compile(r"{valid_url}(?:p|reel)\/(\w+)".format(valid_url=valid_url))
# https://regex101.com/r/6Wbsxa/1
@@ -28,19 +27,22 @@ class InstagramExtractor(Extractor):
def setup(self) -> None:
self.insta = instaloader.Instaloader(
download_geotags=True, download_comments=True, compress_json=False, dirname_pattern=self.download_folder, filename_pattern="{date_utc}_UTC_{target}__{typename}"
download_geotags=True,
download_comments=True,
compress_json=False,
dirname_pattern=self.download_folder,
filename_pattern="{date_utc}_UTC_{target}__{typename}"
)
try:
self.insta.load_session_from_file(self.username, self.session_file)
except Exception as e:
logger.error(f"Unable to login from session file: {e}\n{traceback.format_exc()}")
try:
self.insta.login(self.username, config.instagram_self.password)
# TODO: wait for this issue to be fixed https://github.com/instaloader/instaloader/issues/1758
logger.debug(f"Session file failed", exc_info=True)
logger.info("No valid session file found - Attempting login with use and password.")
self.insta.login(self.username, self.password)
self.insta.save_session_to_file(self.session_file)
except Exception as e2:
logger.error(f"Unable to finish login (retrying from file): {e2}\n{traceback.format_exc()}")
except Exception as e:
logger.error(f"Failed to setup Instagram Extractor with Instagrapi. {e}")
def download(self, item: Metadata) -> Metadata:

View File

@@ -104,7 +104,7 @@ class InstagramTbotExtractor(Extractor):
message = ""
time.sleep(3)
# media is added before text by the bot so it can be used as a stop-logic mechanism
while attempts < (self.timeout - 3) and (not message or not len(seen_media)):
while attempts < max(self.timeout - 3, 3) and (not message or not len(seen_media)):
attempts += 1
time.sleep(1)
for post in self.client.iter_messages(chat, min_id=since_id):

View File

@@ -17,7 +17,9 @@
"choices": ["random", "static"],
},
"save_to": {"default": "./local_archive", "help": "folder where to save archived content"},
"save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)"},
"save_absolute": {"default": False,
"type": "bool",
"help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)"},
},
"description": """
LocalStorage: A storage module for saving archived content locally on the filesystem.

View File

@@ -6,13 +6,25 @@
"python": ["loguru", "selenium"],
},
"configs": {
"width": {"default": 1280, "help": "width of the screenshots"},
"height": {"default": 720, "help": "height of the screenshots"},
"timeout": {"default": 60, "help": "timeout for taking the screenshot"},
"sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"},
"width": {"default": 1280,
"type": "int",
"help": "width of the screenshots"},
"height": {"default": 1024,
"type": "int",
"help": "height of the screenshots"},
"timeout": {"default": 60,
"type": "int",
"help": "timeout for taking the screenshot"},
"sleep_before_screenshot": {"default": 4,
"type": "int",
"help": "seconds to wait for the pages to load before taking screenshot"},
"http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"},
"save_to_pdf": {"default": False, "help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"},
"print_options": {"default": {}, "help": "options to pass to the pdf printer"}
"save_to_pdf": {"default": False,
"type": "bool",
"help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"},
"print_options": {"default": {},
"help": "options to pass to the pdf printer, in JSON format. See https://www.selenium.dev/documentation/webdriver/interactions/print_page/ for more information",
"type": "json_loader"},
},
"description": """
Captures screenshots and optionally saves web pages as PDFs using a WebDriver.

View File

@@ -7,7 +7,9 @@
},
'entry_point': 'ssl_enricher::SSLEnricher',
"configs": {
"skip_when_nothing_archived": {"default": True, "help": "if true, will skip enriching when no media is archived"},
"skip_when_nothing_archived": {"default": True,
"type": 'bool',
"help": "if true, will skip enriching when no media is archived"},
},
"description": """
Retrieves SSL certificate information for a domain and stores it as a file.

View File

@@ -14,7 +14,9 @@
"api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
"bot_token": {"default": None, "help": "optional, but allows access to more content such as large videos, talk to @botfather"},
"session_file": {"default": "secrets/anon", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."},
"join_channels": {"default": True, "help": "disables the initial setup with channel_invites config, useful if you have a lot and get stuck"},
"join_channels": {"default": True,
"type": "bool",
"help": "disables the initial setup with channel_invites config, useful if you have a lot and get stuck"},
"channel_invites": {
"default": {},
"help": "(JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup",

View File

@@ -17,11 +17,19 @@
"configs": {
"profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."},
"docker_commands": {"default": None, "help":"if a custom docker invocation is needed"},
"timeout": {"default": 120, "help": "timeout for WACZ generation in seconds"},
"extract_media": {"default": False, "help": "If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."},
"extract_screenshot": {"default": True, "help": "If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."},
"timeout": {"default": 120,
"type": "int",
"help": "timeout for WACZ generation in seconds", "type": "int"},
"extract_media": {"default": False,
"type": 'bool',
"help": "If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."
},
"extract_screenshot": {"default": True,
"type": 'bool',
"help": "If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."
},
"socks_proxy_host": {"default": None, "help": "SOCKS proxy host for browsertrix-crawler, use in combination with socks_proxy_port. eg: user:password@host"},
"socks_proxy_port": {"default": None, "help": "SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234"},
"socks_proxy_port": {"default": None, "type":"int", "help": "SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234"},
"proxy_server": {"default": None, "help": "SOCKS server proxy URL, in development"},
},
"description": """

View File

@@ -9,6 +9,7 @@
"configs": {
"timeout": {
"default": 15,
"type": "int",
"help": "seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually.",
},
"if_not_archived_within": {

View File

@@ -10,8 +10,12 @@
"help": "WhisperApi api endpoint, eg: https://whisperbox-api.com/api/v1, a deployment of https://github.com/bellingcat/whisperbox-transcribe."},
"api_key": {"required": True,
"help": "WhisperApi api key for authentication"},
"include_srt": {"default": False, "help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."},
"timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."},
"include_srt": {"default": False,
"type": "bool",
"help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."},
"timeout": {"default": 90,
"type": "int",
"help": "How many seconds to wait at most for a successful job completion."},
"action": {"default": "translate",
"help": "which Whisper operation to execute",
"choices": ["transcribe", "translate", "language_detection"]},

View File

@@ -1,18 +1,23 @@
""" This Webdriver class acts as a context manager for the selenium webdriver. """
from __future__ import annotations
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.proxy import Proxy, ProxyType
from selenium.webdriver.common.print_page_options import PrintOptions
from loguru import logger
from selenium.webdriver.common.by import By
import os
import time
#import domain_for_url
from urllib.parse import urlparse, urlunparse
from http.cookiejar import MozillaCookieJar
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common import exceptions as selenium_exceptions
from selenium.webdriver.common.print_page_options import PrintOptions
from selenium.webdriver.common.by import By
from loguru import logger
class CookieSettingDriver(webdriver.Firefox):
facebook_accept_cookies: bool
@@ -20,6 +25,10 @@ class CookieSettingDriver(webdriver.Firefox):
cookiejar: MozillaCookieJar
def __init__(self, cookies, cookiejar, facebook_accept_cookies, *args, **kwargs):
if os.environ.get('RUNNING_IN_DOCKER'):
# Selenium doesn't support linux-aarch64 driver, we need to set this manually
kwargs['service'] = webdriver.FirefoxService(executable_path='/usr/local/bin/geckodriver')
super(CookieSettingDriver, self).__init__(*args, **kwargs)
self.cookies = cookies
self.cookiejar = cookiejar
@@ -64,14 +73,29 @@ class CookieSettingDriver(webdriver.Firefox):
time.sleep(2)
except Exception as e:
logger.warning(f'Failed on fb accept cookies.', e)
# now get the actual URL
super(CookieSettingDriver, self).get(url)
if self.facebook_accept_cookies:
# try and click the 'close' button on the 'login' window to close it
close_button = self.find_element(By.XPATH, "//div[@role='dialog']//div[@aria-label='Close']")
if close_button:
close_button.click()
try:
xpath = "//div[@role='dialog']//div[@aria-label='Close']"
WebDriverWait(self, 5).until(EC.element_to_be_clickable((By.XPATH, xpath))).click()
except selenium_exceptions.NoSuchElementException:
logger.warning("Unable to find the 'close' button on the facebook login window")
pass
else:
# for all other sites, try and use some common button text to reject/accept cookies
for text in ["Refuse non-essential cookies", "Decline optional cookies", "Reject additional cookies", "Accept all cookies"]:
try:
xpath = f"//*[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '{text.lower()}')]"
WebDriverWait(self, 5).until(EC.element_to_be_clickable((By.XPATH, xpath))).click()
break
except selenium_exceptions.WebDriverException:
pass
class Webdriver:
@@ -90,7 +114,6 @@ class Webdriver:
setattr(self.print_options, k, v)
def __enter__(self) -> webdriver:
options = webdriver.FirefoxOptions()
options.add_argument("--headless")
options.add_argument(f'--proxy-server={self.http_proxy}')
@@ -105,7 +128,7 @@ class Webdriver:
self.driver.set_window_size(self.width, self.height)
self.driver.set_page_load_timeout(self.timeout_seconds)
self.driver.print_options = self.print_options
except TimeoutException as e:
except selenium_exceptions.TimeoutException as e:
logger.error(f"failed to get new webdriver, possibly due to insufficient system resources or timeout settings: {e}")
return self.driver