From aaa9ead39d267ae07ce4bb9e1c3f4e1479e72805 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 11 Jun 2025 17:58:53 +0100 Subject: [PATCH] adds documentation for dropins --- docs/scripts/scripts.py | 22 ++++++++++++++++++- .../__manifest__.py | 9 ++++++-- .../antibot_extractor_enricher/dropin.py | 14 ++++++++++++ .../dropins/linkedin.py | 15 +++++++++++++ .../dropins/reddit.py | 14 ++++++++++++ .../antibot_extractor_enricher/dropins/vk.py | 16 +++++++++++++- 6 files changed, 86 insertions(+), 4 deletions(-) diff --git a/docs/scripts/scripts.py b/docs/scripts/scripts.py index bfddd29..f9cb13d 100644 --- a/docs/scripts/scripts.py +++ b/docs/scripts/scripts.py @@ -47,7 +47,6 @@ def generate_module_docs(): for module in sorted(ModuleFactory().available_modules(), key=lambda x: (x.requires_setup, x.name)): # generate the markdown file from the __manifest__.py file. - manifest = module.manifest for type in manifest["type"]: modules_by_type.setdefault(type, []).append(module) @@ -64,6 +63,27 @@ def generate_module_docs(): """ steps_str = "\n".join(f" {t}s:\n - {module.name}" for t in manifest["type"]) + if manifest.get("autodoc_dropins"): + loaded_module = module.load({}) + dropins = loaded_module.load_dropins() + dropin_str = "\n##### Available Dropins\n" + for dropin in dropins: + if not (ddoc := dropin.documentation()): + continue + dropin_str += f"\n###### {ddoc.get('name', dropin.__name__)}\n\n" + dropin_str += f"{ddoc.get('description')}\n\n" + if ddoc.get("site"): + dropin_str += f"**Site**: {ddoc['site']}\n\n" + if dauth := ddoc.get("authentication"): + dropin_str += "**YAML configuration**:\n" + dropin_auth_yaml = "authentication:\n...\n" + for site, creds in dauth.items(): + dropin_auth_yaml += f" {site}:\n" + for k, v in creds.items(): + dropin_auth_yaml += f' {k}: "{v}"\n' + dropin_str += f"```{{code}} yaml\n{dropin_auth_yaml}...\n```\n" + readme_str += dropin_str + if not manifest["configs"]: config_string = f"# No configuration options for {module.name}.*\n" else: diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/__manifest__.py b/src/auto_archiver/modules/antibot_extractor_enricher/__manifest__.py index e2bcad9..f08547b 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/__manifest__.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/__manifest__.py @@ -31,11 +31,12 @@ "help": "proxy to use for the webdriver, Format: 'SERVER:PORT' or 'USER:PASS@SERVER:PORT'", }, }, + "autodoc_dropins": True, "description": """ Uses a browser controlled by SeleniumBase to capture HTML, media, and screenshots/PDFs of a web page, by bypassing anti-bot measures like Cloudflare's Turnstile or Google Recaptcha. - Still in trial development, please report any issues or suggestions via GitHub Issues. - + > ⚠️ Still in trial development, please report any issues or suggestions via [GitHub Issues](https://github.com/bellingcat/auto-archiver/issues). + ### Features - Extracts the HTML source code of the page. - Takes full-page screenshots of web pages. @@ -44,5 +45,9 @@ ### Notes - Using a proxy affects Cloudflare Turnstile captcha handling, so it is recommended to use a proxy only if necessary. + + ### Dropins + This module uses sub-modules called Dropins for specific sites that allow it to handle anti-bot measures and custom Login flows. You don't need to include the dropins in your configuration, but you do need to add authentication credentials if you want to overcome login walls on those sites, see detailed instructions for each Dropin below. + """, } diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py index 2e8c4f6..d4b255d 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py @@ -1,4 +1,5 @@ import os +from typing import Mapping from loguru import logger from seleniumbase import SB import yt_dlp @@ -13,6 +14,19 @@ class Dropin: This class is designed to be a base class for drop-ins that can handle specific websites. """ + @staticmethod + def documentation() -> Mapping[str, str]: + """ + Each Dropin should auto-document itself with this method. + Return dictionary can include: + - 'name': A string representing the name of the dropin. + - 'description': A string describing the functionality of the dropin. + - 'site': A string representing the site this dropin is for. + - 'authentication': A dictionary with authentication example for the site. + + """ + return {} + def __init__(self, sb: SB, extractor: Extractor): """ Initialize the Dropin with the given SeleniumBase instance. diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/linkedin.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/linkedin.py index 3917af9..336b630 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/linkedin.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/linkedin.py @@ -1,3 +1,4 @@ +from typing import Mapping from loguru import logger from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin @@ -7,6 +8,20 @@ class LinkedinDropin(Dropin): A class to handle LinkedIn drop-in functionality for the antibot extractor enricher module. """ + @staticmethod + def documentation() -> Mapping[str, str]: + return { + "name": "Linkedin Dropin", + "description": "Handles LinkedIn pages/posts and requires authentication to access most content but will still be useful without it. The first time you login to a new IP, LinkedIn may require an email verification code, you can do a manual login first and then it won't ask for it again.", + "site": "linkedin.com", + "authentication": { + "linkedin.com": { + "username": "email address or phone number", + "password": "password", + } + }, + } + notifications_css_selector = 'a[href*="linkedin.com/notifications"]' @staticmethod diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py index 78bc510..3f699b6 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py @@ -1,4 +1,5 @@ from contextlib import suppress +from typing import Mapping from auto_archiver.core.metadata import Metadata from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin @@ -10,6 +11,19 @@ class RedditDropin(Dropin): A class to handle Reddit drop-in functionality for the antibot extractor enricher module. """ + def documentation() -> Mapping[str, str]: + return { + "name": "Reddit Dropin", + "description": "Handles Reddit posts and works without authentication until Reddit flags your IP, so authentication is advised.", + "site": "reddit.com", + "authentication": { + "reddit.com": { + "username": "email address or username", + "password": "password", + } + }, + } + @staticmethod def suitable(url: str) -> bool: return "reddit.com" in url diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py index 6888727..3f92eda 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py @@ -1,4 +1,5 @@ import re +from typing import Mapping from auto_archiver.core.metadata import Metadata from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin @@ -16,6 +17,19 @@ class VkDropin(Dropin): CLIP_PATTERN = re.compile(r"(clip.{0,1}\d+_\d+)") PHOTO_PATTERN = re.compile(r"(photo.{0,1}\d+_\d+)") + def documentation() -> Mapping[str, str]: + return { + "name": "VKontakte Dropin", + "description": "Handles VKontakte posts and works without authentication for some content.", + "site": "vk.com", + "authentication": { + "vk.com": { + "username": "phone number with country code", + "password": "password", + } + }, + } + @staticmethod def suitable(url: str) -> bool: return "vk.com" in url @@ -39,7 +53,7 @@ class VkDropin(Dropin): @logger.catch def _login(self) -> bool: - # TODO: test method + # TODO: test method, because current tests work without a login self.sb.open("https://vk.com") self.sb.wait_for_ready_state_complete() if "/feed" in self.sb.get_current_url():