adds documentation for dropins

2026-06-07 19:08:30 +03:00 · 2025-06-11 17:58:53 +01:00
parent f5be7a50c1
commit aaa9ead39d
6 changed files with 86 additions and 4 deletions
--- a/docs/scripts/scripts.py
+++ b/docs/scripts/scripts.py
@@ -47,7 +47,6 @@ def generate_module_docs():

    for module in sorted(ModuleFactory().available_modules(), key=lambda x: (x.requires_setup, x.name)):
        # generate the markdown file from the __manifest__.py file.
-
        manifest = module.manifest
        for type in manifest["type"]:
            modules_by_type.setdefault(type, []).append(module)
@@ -64,6 +63,32 @@ def generate_module_docs():
 """
        steps_str = "\n".join(f"  {t}s:\n  - {module.name}" for t in manifest["type"])

+        # Modules that set "autodoc_dropins" in their manifest get an extra README
+        # section assembled from what each dropin returns from documentation().
+        if manifest.get("autodoc_dropins"):
+            loaded_module = module.load({})
+            dropins = loaded_module.load_dropins()
+            dropin_str = "\n##### Available Dropins\n"
+            for dropin in dropins:
+                # A falsy (e.g. empty) documentation() result means the dropin is skipped.
+                if not (ddoc := dropin.documentation()):
+                    continue
+                dropin_str += f"\n###### {ddoc.get('name', dropin.__name__)}\n\n"
+                # "or ''" keeps a missing description from being rendered as the text "None".
+                dropin_str += f"{ddoc.get('description') or ''}\n\n"
+                if ddoc.get("site"):
+                    dropin_str += f"**Site**: {ddoc['site']}\n\n"
+                if dauth := ddoc.get("authentication"):
+                    dropin_str += "**YAML configuration**:\n"
+                    dropin_auth_yaml = "authentication:\n...\n"
+                    for site, creds in dauth.items():
+                        dropin_auth_yaml += f"  {site}:\n"
+                        for k, v in creds.items():
+                            dropin_auth_yaml += f'    {k}: "{v}"\n'
+                    # Doubled braces emit a literal "{code}" inside the f-string.
+                    dropin_str += f"```{{code}} yaml\n{dropin_auth_yaml}...\n```\n"
+            readme_str += dropin_str
+
        if not manifest["configs"]:
            config_string = f"# No configuration options for {module.name}.*\n"
        else:
--- a/src/auto_archiver/modules/antibot_extractor_enricher/manifest.py
+++ b/src/auto_archiver/modules/antibot_extractor_enricher/manifest.py
@@ -31,10 +31,11 @@
            "help": "proxy to use for the webdriver, Format: 'SERVER:PORT' or 'USER:PASS@SERVER:PORT'",
        },
    },
+    "autodoc_dropins": True,
    "description": """
    Uses a browser controlled by SeleniumBase to capture HTML, media, and screenshots/PDFs of a web page, by bypassing anti-bot measures like Cloudflare's Turnstile or Google Recaptcha.
 	
-	Still in trial development, please report any issues or suggestions via GitHub Issues.
+	> ⚠️ Still in trial development, please report any issues or suggestions via [GitHub Issues](https://github.com/bellingcat/auto-archiver/issues).
 	
    ### Features
 	- Extracts the HTML source code of the page.
@@ -44,5 +45,9 @@

    ### Notes
 	- Using a proxy affects Cloudflare Turnstile captcha handling, so it is recommended to use a proxy only if necessary.
+
+	### Dropins
+	This module uses sub-modules called Dropins for specific sites that allow it to handle anti-bot measures and custom login flows. You don't need to include the dropins in your configuration, but you do need to add authentication credentials if you want to overcome login walls on those sites; see the detailed instructions for each Dropin below.
+
    """,
 }
--- a/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py
+++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py
@@ -1,4 +1,5 @@
 import os
+from typing import Mapping
 from loguru import logger
 from seleniumbase import SB
 import yt_dlp
@@ -13,6 +14,19 @@ class Dropin:
    This class is designed to be a base class for drop-ins that can handle specific websites.
    """

+    @staticmethod
+    def documentation() -> Mapping[str, str]:
+        """
+        Describe this Dropin for the generated docs; the base returns an empty mapping.
+        Subclasses may return a dictionary with any of these optional keys:
+        - 'name': A string representing the name of the dropin.
+        - 'description': A string describing the functionality of the dropin.
+        - 'site': A string representing the site this dropin is for.
+        - 'authentication': Example credentials as {site: {field: placeholder}}.
+        NOTE(review): 'authentication' is a nested mapping, so Mapping[str, str] is too narrow.
+        """
+        return {}
+
    def __init__(self, sb: SB, extractor: Extractor):
        """
        Initialize the Dropin with the given SeleniumBase instance.
--- a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/linkedin.py
+++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/linkedin.py
@@ -1,3 +1,4 @@
+from typing import Mapping
 from loguru import logger
 from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin

@@ -7,6 +8,20 @@ class LinkedinDropin(Dropin):
    A class to handle LinkedIn drop-in functionality for the antibot extractor enricher module.
    """

+    @staticmethod
+    def documentation() -> Mapping[str, str]:
+        """Return the docs entry for this dropin, including example login fields."""
+        credentials = {
+            "username": "email address or phone number",
+            "password": "password",
+        }
+        return {
+            "name": "Linkedin Dropin",
+            "description": "Handles LinkedIn pages/posts and requires authentication to access most content but will still be useful without it. The first time you login to a new IP, LinkedIn may require an email verification code, you can do a manual login first and then it won't ask for it again.",
+            "site": "linkedin.com",
+            "authentication": {"linkedin.com": credentials},
+        }
+
    notifications_css_selector = 'a[href*="linkedin.com/notifications"]'

    @staticmethod
--- a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py
+++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py
@@ -1,4 +1,5 @@
 from contextlib import suppress
+from typing import Mapping
 from auto_archiver.core.metadata import Metadata
 from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin

@@ -10,6 +11,21 @@ class RedditDropin(Dropin):
    A class to handle Reddit drop-in functionality for the antibot extractor enricher module.
    """

+    @staticmethod
+    def documentation() -> Mapping[str, str]:
+        """Return the docs entry for this dropin: name, description, site and example credentials."""
+        return {
+            "name": "Reddit Dropin",
+            "description": "Handles Reddit posts and works without authentication until Reddit flags your IP, so authentication is advised.",
+            "site": "reddit.com",
+            "authentication": {
+                "reddit.com": {
+                    "username": "email address or username",
+                    "password": "password",
+                }
+            },
+        }
+
    @staticmethod
    def suitable(url: str) -> bool:
        return "reddit.com" in url
--- a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py
+++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py
@@ -1,4 +1,5 @@
 import re
+from typing import Mapping

 from auto_archiver.core.metadata import Metadata
 from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
@@ -16,6 +17,21 @@ class VkDropin(Dropin):
    CLIP_PATTERN = re.compile(r"(clip.{0,1}\d+_\d+)")
    PHOTO_PATTERN = re.compile(r"(photo.{0,1}\d+_\d+)")

+    @staticmethod
+    def documentation() -> Mapping[str, str]:
+        """Return the docs entry for this dropin: name, description, site and example credentials."""
+        return {
+            "name": "VKontakte Dropin",
+            "description": "Handles VKontakte posts and works without authentication for some content.",
+            "site": "vk.com",
+            "authentication": {
+                "vk.com": {
+                    "username": "phone number with country code",
+                    "password": "password",
+                }
+            },
+        }
+
    @staticmethod
    def suitable(url: str) -> bool:
        return "vk.com" in url
@@ -39,7 +55,7 @@ class VkDropin(Dropin):

    @logger.catch
    def _login(self) -> bool:
-        # TODO: test method
+        # TODO: test method, because current tests work without a login
        self.sb.open("https://vk.com")
        self.sb.wait_for_ready_state_complete()
        if "/feed" in self.sb.get_current_url():