adds Dropin flexible integration for antibot

This commit is contained in:
msramalho
2025-06-07 19:09:37 +01:00
parent d202d79e0f
commit 07ff5baf07
6 changed files with 165 additions and 28 deletions

View File

@@ -2,9 +2,7 @@
"name": "Antibot Extractor/Enricher",
"type": ["extractor", "enricher"],
"requires_setup": False,
"dependencies": {
"python": ["loguru", "seleniumbase"],
},
"dependencies": {"python": ["loguru", "seleniumbase", "yt_dlp"], "bin": ["ffmpeg"]},
"configs": {
"save_to_pdf": {
"default": False,
@@ -23,6 +21,10 @@
"default": ".svg,.ico,.gif",
"help": "CSV of media (image/video) file extensions to exclude from download",
},
"user_data_dir": {
"default": "secrets/antibot_user_data",
"help": "Path to the user data directory for the webdriver. This is used to persist browser state, such as cookies and local storage. When using docker it's best to let docker create the folder otherwise there may be permission issues. The Extractor will try to work without it if that error occurs but login sessions will be lost.",
},
"proxy": {
"default": None,
"help": "proxy to use for the webdriver, Format: 'SERVER:PORT' or 'USER:PASS@SERVER:PORT'",

View File

@@ -5,11 +5,16 @@ import os
import sys
import traceback
from urllib.parse import urljoin
import glob
import stat
import importlib.util
from loguru import logger
import selenium
from seleniumbase import SB
from auto_archiver.core import Extractor, Enricher, Metadata, Media
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
from auto_archiver.utils.misc import random_str
@@ -34,6 +39,38 @@ class AntibotExtractorEnricher(Extractor, Enricher):
else:
self.max_download_videos = int(self.max_download_videos)
os.makedirs(self.user_data_dir, exist_ok=True)
self._warn_about_docker_and_user_data_dir()
self.dropins = self.load_dropins()
def load_dropins(self) -> list:
    """
    Discover and import the drop-in classes located in the `dropins` folder next to this file.

    Every `*.py` file whose name does not start with an underscore is executed as a module.
    From each module, the classes that are defined in that module (not merely imported into it)
    and that subclass `Dropin` are collected.

    :return: a list of drop-in classes (not instances), in a deterministic order.
    """
    dropins = []
    # TODO: add user-configurable drop-ins via config like generic_extractor
    dropins_dir = os.path.join(os.path.dirname(__file__), "dropins")
    # glob returns paths in arbitrary (filesystem-dependent) order; the first suitable drop-in wins
    # when matching a URL, so sort the files to keep that priority stable between runs and machines.
    for file_path in sorted(glob.glob(os.path.join(dropins_dir, "*.py"))):
        stem = os.path.splitext(os.path.basename(file_path))[0]
        if stem.startswith("_"):
            continue  # skip __init__.py or private modules
        module_name = f"auto_archiver.modules.antibot_extractor_enricher.dropins.{stem}"
        spec = importlib.util.spec_from_file_location(module_name, file_path)
        if spec is None or spec.loader is None:
            # spec_from_file_location may return None; avoid an opaque AttributeError below
            logger.warning(f"ANTIBOT could not build an import spec for drop-in file {file_path}, skipping it")
            continue
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        for attr in dir(module):
            obj = getattr(module, attr)
            if getattr(obj, "__module__", None) != module.__name__:
                continue  # Skip imported modules/classes/functions
            if isinstance(obj, type) and issubclass(obj, Dropin):
                dropins.append(obj)
    logger.debug(f"ANTIBOT loaded drop-in classes: {', '.join([d.__name__ for d in dropins])}")
    return dropins
def sanitize_url(self, url: str) -> str:
    """
    Clean up `url` using the first loaded drop-in that declares itself suitable for it.

    :param url: the URL to sanitize.
    :return: the drop-in's sanitized URL, or `url` unchanged when no drop-in is suitable.
    """
    # drop-ins are checked in list order and the search stops at the first match
    matching = next((candidate for candidate in self.dropins if candidate.suitable(url)), None)
    if matching is None:
        return url
    return matching.sanitize_url(url)
def download(self, item: Metadata) -> Metadata:
result = Metadata()
result.merge(item)
@@ -41,12 +78,26 @@ class AntibotExtractorEnricher(Extractor, Enricher):
result.status = "antibot"
return result
def enrich(self, to_enrich: Metadata) -> bool:
def _warn_about_docker_and_user_data_dir(self):
    """
    Log a warning when running in Docker with a user data directory that is not owned by root.

    Does nothing unless the `RUNNING_IN_DOCKER` environment variable is set (non-empty) and
    `self.user_data_dir` is set. The directory is inspected with `os.stat`, and the warning is
    emitted when either its owner uid or its group gid is not 0.
    """
    if not (os.environ.get("RUNNING_IN_DOCKER") and self.user_data_dir):
        return

    dir_info = os.stat(self.user_data_dir)
    # NOTE: the local must stay named `owner` because the message below prints it with `{owner=}`
    owner, group = dir_info.st_uid, dir_info.st_gid
    if owner == 0 and group == 0:
        return

    perms = stat.filemode(dir_info.st_mode)
    logger.warning(
        f"""ANTIBOT: Running in Docker with user_data_dir {self.user_data_dir} with permissions {perms} and non-root {owner=}. This may cause issues with Chrome, if you get 'session not created' errors make sure to remove the folder and let docker create it."""
    )
def enrich(self, to_enrich: Metadata, custom_data_dir: bool = True) -> bool:
using_user_data_dir = self.user_data_dir if custom_data_dir else None
url = to_enrich.get_url()
# TODO: implement cookies auth = self.auth_for_site(url) and combine with if UrlUtil.is_auth_wall(url) like in ScreenshotEnricher
url_sample = url[:75]
try:
with SB(uc=True, agent=self.agent, headed=None, proxy=self.proxy) as sb:
with SB(uc=True, agent=self.agent, headed=None, user_data_dir=using_user_data_dir, proxy=self.proxy) as sb:
logger.info(f"ANTIBOT selenium browser is up with agent {self.agent}, opening {url_sample}...")
sb.uc_open_with_reconnect(url, 4)
@@ -55,11 +106,17 @@ class AntibotExtractorEnricher(Extractor, Enricher):
# TODO: implement other Captcha handling
sb.uc_gui_handle_captcha() # handles Cloudflare Turnstile captcha if detected
# time.sleep(1) # wait for the page to load
suitable_dropin = self._get_suitable_dropin(url, sb)
if suitable_dropin:
suitable_dropin.open_page(url)
if self._hit_auth_wall(sb):
logger.warning(f"ANTIBOT SKIP since auth wall or CAPTCHA was detected for {url_sample}")
return False
logger.debug(f"ANTIBOT no auth wall detected for {url_sample}...")
sb.wait_for_ready_state_complete()
sb.sleep(1) # margin for the page to load completely
to_enrich.set_title(sb.get_title())
self._enrich_html_source_code(sb, to_enrich)
@@ -67,18 +124,42 @@ class AntibotExtractorEnricher(Extractor, Enricher):
if self.save_to_pdf:
self._enrich_full_page_pdf(sb, to_enrich)
self._enrich_download_media(sb, to_enrich, css_selector="img", max_media=self.max_download_images)
self._enrich_download_media(
sb, to_enrich, css_selector="video, source", max_media=self.max_download_videos
)
downloaded_images, downloaded_videos = 0, 0
if suitable_dropin:
downloaded_images, downloaded_videos = suitable_dropin.add_extra_media(to_enrich)
self._enrich_download_media(
sb, to_enrich, css_selector="img", max_media=self.max_download_images - downloaded_images
)
self._enrich_download_media(
sb, to_enrich, css_selector="video, source", max_media=self.max_download_videos - downloaded_videos
)
logger.success(f"ANTIBOT completed for {url_sample}")
return to_enrich
except selenium.common.exceptions.SessionNotCreatedException as e:
if custom_data_dir: # the retry logic only works once
logger.error(
f"ANTIBOT session not created error: {e}. Please remove the user_data_dir {self.user_data_dir} and try again, will retry without user data dir though."
)
return self.enrich(to_enrich, custom_data_dir=False)
raise e # re-raise
except Exception as e:
logger.error(f"ANTIBOT runtime error: {e}: {traceback.format_exc()}")
return False
def _get_suitable_dropin(self, url: str, sb: SB):
    """
    Return an instance of the first loaded drop-in that is suitable for `url`.

    `self.dropins` holds drop-in classes; the first one whose `suitable(url)` is truthy is
    instantiated with the SeleniumBase session and this extractor.

    :param url: the URL about to be processed.
    :param sb: the SeleniumBase session handed to the drop-in.
    :return: a drop-in instance, or None when no drop-in is suitable for the URL.
    """
    for dropin in self.dropins:
        if dropin.suitable(url):
            # `dropin` is a class here, so use its own __name__; `__class__.__name__` would log "type"
            logger.debug(f"ANTIBOT using drop-in {dropin.__name__} for {url}")
            return dropin(sb, self)
    # no drop-in matched: the caller checks for None before using the result
    return None
def _hit_auth_wall(self, sb: SB) -> bool:
"""
Tries to detect if the currently loaded page is an auth/login wall.
@@ -202,16 +283,20 @@ class AntibotExtractorEnricher(Extractor, Enricher):
)
url = to_enrich.get_url()
all_urls = set()
media_elements = sb.find_elements(css_selector)
for media in media_elements:
# media_elements = sb.find_elements(css_selector)
sources = sb.execute_script(f"""
return Array.from(document.querySelectorAll("{css_selector}"))
.map(el => el.src || el.href)
.filter(Boolean);
""")
for src in sources:
if len(all_urls) >= max_media:
logger.debug(f"Reached max download limit of {max_media} images/videos.")
break
if src := media.get_attribute("src"):
mimerype = mimetypes.guess_type(src)[0]
if mimerype in self.exclude_media_mimetypes:
continue
full_src = urljoin(url, src)
if full_src not in all_urls and (filename := self.download_from_url(full_src)):
all_urls.add(full_src)
to_enrich.add_media(Media(filename=filename, properties={"url": full_src}))
mimerype = mimetypes.guess_type(src)[0]
if mimerype in self.exclude_media_mimetypes:
continue
full_src = urljoin(url, src)
if full_src not in all_urls and (filename := self.download_from_url(full_src)):
all_urls.add(full_src)
to_enrich.add_media(Media(filename=filename, properties={"url": full_src}))

View File

@@ -0,0 +1,55 @@
from seleniumbase import SB
from auto_archiver.core.extractor import Extractor
from auto_archiver.core.metadata import Metadata
class Dropin:
    """
    Base class for site-specific drop-ins of the antibot extractor enricher module.

    Subclasses are expected to override `suitable`, `open_page` and `add_extra_media`;
    `sanitize_url` has a pass-through default.
    """

    def __init__(self, sb: SB, extractor: Extractor):
        """
        Store the browser session and the owning extractor for later use by the drop-in.

        :param sb: the SeleniumBase instance this drop-in will drive.
        :param extractor: the Extractor instance this drop-in works on behalf of.
        """
        self.sb, self.extractor = sb, extractor

    @staticmethod
    def suitable(url: str) -> bool:
        """
        Tell whether this drop-in knows how to handle the given URL.

        :param url: the URL to check.
        :return: True when the URL should be processed by this drop-in, False otherwise.
        """
        raise NotImplementedError("This method should be implemented in the subclass")

    @staticmethod
    def sanitize_url(url: str) -> str:
        """
        Hook to strip unnecessary URL parameters or to unfurl redirect links.

        The base implementation returns the URL unchanged.
        """
        return url

    def open_page(self, url: str) -> bool:
        """
        Ensure the page is opened, even when that requires authentication, captcha solving, etc.

        :param url: the URL to open.
        :return: True on success, False otherwise.
        """
        raise NotImplementedError("This method should be implemented in the subclass")

    def add_extra_media(self, to_enrich: Metadata) -> tuple[int, int]:
        """
        Extract image and/or video data from the post currently open in SeleniumBase and add it
        to the `to_enrich` Metadata object.

        :param to_enrich: the Metadata object that receives the extracted media.
        :return: a tuple (number of images added, number of videos added).
        """
        raise NotImplementedError("This method should be implemented in the subclass")

View File

@@ -4,10 +4,7 @@
"author": "Bellingcat",
"type": ["extractor"],
"requires_setup": False,
"dependencies": {
"python": ["yt_dlp", "requests", "loguru", "slugify"],
"bin": ["ffmpeg"]
},
"dependencies": {"python": ["yt_dlp", "requests", "loguru", "slugify"], "bin": ["ffmpeg"]},
"description": """
This is the generic extractor used by auto-archiver, which uses `yt-dlp` under the hood.

View File

@@ -382,8 +382,6 @@ class GenericExtractor(Extractor):
entries = [data]
result = Metadata()
for entry in entries:
try:
filename = ydl_entry_to_filename(ydl, entry)

View File

@@ -128,7 +128,7 @@ def ydl_entry_to_filename(ydl, entry: dict) -> str:
filename = ydl.prepare_filename(entry)
if os.path.exists(filename):
return filename
base_filename, _ = os.path.splitext(filename) # '/get/path/to/file' ignore '.ext'
directory = os.path.dirname(base_filename) # '/get/path/to'
basename = os.path.basename(base_filename) # 'file'
@@ -139,4 +139,4 @@ def ydl_entry_to_filename(ydl, entry: dict) -> str:
and "video/" in (mimetypes.guess_type(f)[0] or "")
):
return os.path.join(directory, f)
return False
return False