From aaa9ead39d267ae07ce4bb9e1c3f4e1479e72805 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Wed, 11 Jun 2025 17:58:53 +0100
Subject: [PATCH] adds documentation for dropins

---
 docs/scripts/scripts.py                       | 22 ++++++++++++++++++-
 .../__manifest__.py                           |  9 ++++++--
 .../antibot_extractor_enricher/dropin.py      | 14 ++++++++++++
 .../dropins/linkedin.py                       | 15 +++++++++++++
 .../dropins/reddit.py                         | 14 ++++++++++++
 .../antibot_extractor_enricher/dropins/vk.py  | 16 +++++++++++++-
 6 files changed, 86 insertions(+), 4 deletions(-)

diff --git a/docs/scripts/scripts.py b/docs/scripts/scripts.py
index bfddd29..f9cb13d 100644
--- a/docs/scripts/scripts.py
+++ b/docs/scripts/scripts.py
@@ -47,7 +47,6 @@ def generate_module_docs():
 
     for module in sorted(ModuleFactory().available_modules(), key=lambda x: (x.requires_setup, x.name)):
         # generate the markdown file from the __manifest__.py file.
-
         manifest = module.manifest
         for type in manifest["type"]:
             modules_by_type.setdefault(type, []).append(module)
@@ -64,6 +63,27 @@ def generate_module_docs():
 """
         steps_str = "\n".join(f"  {t}s:\n  - {module.name}" for t in manifest["type"])
 
+        if manifest.get("autodoc_dropins"):
+            loaded_module = module.load({})
+            dropins = loaded_module.load_dropins()
+            dropin_str = "\n##### Available Dropins\n"
+            for dropin in dropins:
+                if not (ddoc := dropin.documentation()):
+                    continue
+                dropin_str += f"\n###### {ddoc.get('name', dropin.__name__)}\n\n"
+                dropin_str += f"{ddoc.get('description')}\n\n"
+                if ddoc.get("site"):
+                    dropin_str += f"**Site**: {ddoc['site']}\n\n"
+                if dauth := ddoc.get("authentication"):
+                    dropin_str += "**YAML configuration**:\n"
+                    dropin_auth_yaml = "authentication:\n...\n"
+                    for site, creds in dauth.items():
+                        dropin_auth_yaml += f"  {site}:\n"
+                        for k, v in creds.items():
+                            dropin_auth_yaml += f'    {k}: "{v}"\n'
+                    dropin_str += f"```{{code}} yaml\n{dropin_auth_yaml}...\n```\n"
+            readme_str += dropin_str
+
         if not manifest["configs"]:
             config_string = f"# No configuration options for {module.name}.*\n"
         else:
diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/__manifest__.py b/src/auto_archiver/modules/antibot_extractor_enricher/__manifest__.py
index e2bcad9..f08547b 100644
--- a/src/auto_archiver/modules/antibot_extractor_enricher/__manifest__.py
+++ b/src/auto_archiver/modules/antibot_extractor_enricher/__manifest__.py
@@ -31,11 +31,12 @@
             "help": "proxy to use for the webdriver, Format: 'SERVER:PORT' or 'USER:PASS@SERVER:PORT'",
         },
     },
+    "autodoc_dropins": True,
     "description": """
     Uses a browser controlled by SeleniumBase to capture HTML, media, and screenshots/PDFs of a web page, by bypassing anti-bot measures like Cloudflare's Turnstile or Google Recaptcha.
 	
-	Still in trial development, please report any issues or suggestions via GitHub Issues.
-
+	> ⚠️ Still in trial development, please report any issues or suggestions via [GitHub Issues](https://github.com/bellingcat/auto-archiver/issues).
+	
     ### Features
 	- Extracts the HTML source code of the page.
     - Takes full-page screenshots of web pages.
@@ -44,5 +45,9 @@
 
     ### Notes
 	- Using a proxy affects Cloudflare Turnstile captcha handling, so it is recommended to use a proxy only if necessary.
+
+	### Dropins
+	This module uses sub-modules called Dropins for specific sites that allow it to handle anti-bot measures and custom Login flows. You don't need to include the dropins in your configuration, but you do need to add authentication credentials if you want to overcome login walls on those sites, see detailed instructions for each Dropin below.
+
     """,
 }
diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py
index 2e8c4f6..d4b255d 100644
--- a/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py
+++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py
@@ -1,4 +1,5 @@
 import os
+from typing import Mapping
 from loguru import logger
 from seleniumbase import SB
 import yt_dlp
@@ -13,6 +14,19 @@ class Dropin:
     This class is designed to be a base class for drop-ins that can handle specific websites.
     """
 
+    @staticmethod
+    def documentation() -> Mapping[str, str]:
+        """
+        Return the self-documentation of this Dropin; the base class returns an empty mapping.
+        Each Dropin should override this method, and the returned dictionary can include:
+        - 'name': A string representing the name of the dropin.
+        - 'description': A string describing the functionality of the dropin.
+        - 'site': A string representing the site this dropin is for.
+        - 'authentication': A dictionary mapping a site to example credentials for it.
+
+        """
+        return {}
+
     def __init__(self, sb: SB, extractor: Extractor):
         """
         Initialize the Dropin with the given SeleniumBase instance.
diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/linkedin.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/linkedin.py
index 3917af9..336b630 100644
--- a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/linkedin.py
+++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/linkedin.py
@@ -1,3 +1,4 @@
+from typing import Mapping
 from loguru import logger
 from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
 
@@ -7,6 +8,20 @@ class LinkedinDropin(Dropin):
     A class to handle LinkedIn drop-in functionality for the antibot extractor enricher module.
     """
 
+    @staticmethod
+    def documentation() -> Mapping[str, str]:  # rendered under "Available Dropins" by docs/scripts/scripts.py
+        return {
+            "name": "Linkedin Dropin",
+            "description": "Handles LinkedIn pages/posts and requires authentication to access most content but will still be useful without it. The first time you login to a new IP, LinkedIn may require an email verification code, you can do a manual login first and then it won't ask for it again.",
+            "site": "linkedin.com",
+            "authentication": {
+                "linkedin.com": {
+                    "username": "email address or phone number",
+                    "password": "password",
+                }
+            },
+        }
+
     notifications_css_selector = 'a[href*="linkedin.com/notifications"]'
 
     @staticmethod
diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py
index 78bc510..3f699b6 100644
--- a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py
+++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/reddit.py
@@ -1,4 +1,5 @@
 from contextlib import suppress
+from typing import Mapping
 from auto_archiver.core.metadata import Metadata
 from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
 
@@ -10,6 +11,21 @@ class RedditDropin(Dropin):
     A class to handle Reddit drop-in functionality for the antibot extractor enricher module.
     """
 
+    @staticmethod
+    def documentation() -> Mapping[str, str]:
+        """Describe this dropin for the generated docs; see Dropin.documentation for the supported keys."""
+        return {
+            "name": "Reddit Dropin",
+            "description": "Handles Reddit posts and works without authentication until Reddit flags your IP, so authentication is advised.",
+            "site": "reddit.com",
+            "authentication": {
+                "reddit.com": {
+                    "username": "email address or username",
+                    "password": "password",
+                }
+            },
+        }
+
     @staticmethod
     def suitable(url: str) -> bool:
         return "reddit.com" in url
diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py
index 6888727..3f92eda 100644
--- a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py
+++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py
@@ -1,4 +1,5 @@
 import re
+from typing import Mapping
 
 from auto_archiver.core.metadata import Metadata
 from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
@@ -16,6 +17,21 @@ class VkDropin(Dropin):
     CLIP_PATTERN = re.compile(r"(clip.{0,1}\d+_\d+)")
     PHOTO_PATTERN = re.compile(r"(photo.{0,1}\d+_\d+)")
 
+    @staticmethod
+    def documentation() -> Mapping[str, str]:
+        """Describe this dropin for the generated docs; see Dropin.documentation for the supported keys."""
+        return {
+            "name": "VKontakte Dropin",
+            "description": "Handles VKontakte posts and works without authentication for some content.",
+            "site": "vk.com",
+            "authentication": {
+                "vk.com": {
+                    "username": "phone number with country code",
+                    "password": "password",
+                }
+            },
+        }
+
     @staticmethod
     def suitable(url: str) -> bool:
         return "vk.com" in url
@@ -39,7 +55,7 @@ class VkDropin(Dropin):
 
     @logger.catch
     def _login(self) -> bool:
-        # TODO: test method
+        # TODO: test method, because current tests work without a login
         self.sb.open("https://vk.com")
         self.sb.wait_for_ready_state_complete()
         if "/feed" in self.sb.get_current_url():