adds documentation for dropins

This commit is contained in:
msramalho
2025-06-11 17:58:53 +01:00
parent f5be7a50c1
commit aaa9ead39d
6 changed files with 86 additions and 4 deletions

View File

@@ -47,7 +47,6 @@ def generate_module_docs():
for module in sorted(ModuleFactory().available_modules(), key=lambda x: (x.requires_setup, x.name)):
# generate the markdown file from the __manifest__.py file.
manifest = module.manifest
for type in manifest["type"]:
modules_by_type.setdefault(type, []).append(module)
@@ -64,6 +63,27 @@ def generate_module_docs():
"""
steps_str = "\n".join(f" {t}s:\n - {module.name}" for t in manifest["type"])
if manifest.get("autodoc_dropins"):
loaded_module = module.load({})
dropins = loaded_module.load_dropins()
dropin_str = "\n##### Available Dropins\n"
for dropin in dropins:
if not (ddoc := dropin.documentation()):
continue
dropin_str += f"\n###### {ddoc.get('name', dropin.__name__)}\n\n"
dropin_str += f"{ddoc.get('description')}\n\n"
if ddoc.get("site"):
dropin_str += f"**Site**: {ddoc['site']}\n\n"
if dauth := ddoc.get("authentication"):
dropin_str += "**YAML configuration**:\n"
dropin_auth_yaml = "authentication:\n...\n"
for site, creds in dauth.items():
dropin_auth_yaml += f" {site}:\n"
for k, v in creds.items():
dropin_auth_yaml += f' {k}: "{v}"\n'
dropin_str += f"```{{code}} yaml\n{dropin_auth_yaml}...\n```\n"
readme_str += dropin_str
if not manifest["configs"]:
config_string = f"# No configuration options for {module.name}.*\n"
else:

View File

@@ -31,10 +31,11 @@
"help": "proxy to use for the webdriver, Format: 'SERVER:PORT' or 'USER:PASS@SERVER:PORT'",
},
},
"autodoc_dropins": True,
"description": """
Uses a browser controlled by SeleniumBase to capture HTML, media, and screenshots/PDFs of a web page, by bypassing anti-bot measures like Cloudflare's Turnstile or Google Recaptcha.
Still in trial development, please report any issues or suggestions via GitHub Issues.
> ⚠️ Still in trial development, please report any issues or suggestions via [GitHub Issues](https://github.com/bellingcat/auto-archiver/issues).
### Features
- Extracts the HTML source code of the page.
@@ -44,5 +45,9 @@
### Notes
- Using a proxy affects Cloudflare Turnstile captcha handling, so it is recommended to use a proxy only if necessary.
### Dropins
This module uses sub-modules called Dropins for specific sites that allow it to handle anti-bot measures and custom Login flows. You don't need to include the dropins in your configuration, but you do need to add authentication credentials if you want to overcome login walls on those sites, see detailed instructions for each Dropin below.
""",
}

View File

@@ -1,4 +1,5 @@
import os
from typing import Mapping
from loguru import logger
from seleniumbase import SB
import yt_dlp
@@ -13,6 +14,19 @@ class Dropin:
This class is designed to be a base class for drop-ins that can handle specific websites.
"""
@staticmethod
def documentation() -> Mapping[str, object]:
    """
    Describe this Dropin for the auto-generated module documentation.

    Each Dropin should auto-document itself by overriding this method.
    The returned dictionary can include:
    - 'name': A string representing the name of the dropin.
    - 'description': A string describing the functionality of the dropin.
    - 'site': A string representing the site this dropin is for.
    - 'authentication': A dictionary with authentication example for the site,
      keyed by site and mapping each credential field to a placeholder value.

    Values are plain strings except 'authentication', which is a nested
    dictionary; that is why the value type is annotated as ``object``.

    Returns:
        An empty mapping by default, meaning there is nothing to document.
    """
    return {}
def __init__(self, sb: SB, extractor: Extractor):
"""
Initialize the Dropin with the given SeleniumBase instance.

View File

@@ -1,3 +1,4 @@
from typing import Mapping
from loguru import logger
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
@@ -7,6 +8,20 @@ class LinkedinDropin(Dropin):
A class to handle LinkedIn drop-in functionality for the antibot extractor enricher module.
"""
@staticmethod
def documentation() -> Mapping[str, str]:
    """
    Describe the LinkedIn dropin: its name, what it handles, the site it
    targets and an example of the credentials block for that site.
    """
    site = "linkedin.com"
    # Placeholder values only: they illustrate what each credential field expects.
    example_credentials = {
        "username": "email address or phone number",
        "password": "password",
    }
    description = "Handles LinkedIn pages/posts and requires authentication to access most content but will still be useful without it. The first time you login to a new IP, LinkedIn may require an email verification code, you can do a manual login first and then it won't ask for it again."
    return {
        "name": "Linkedin Dropin",
        "description": description,
        "site": site,
        "authentication": {site: example_credentials},
    }
notifications_css_selector = 'a[href*="linkedin.com/notifications"]'
@staticmethod

View File

@@ -1,4 +1,5 @@
from contextlib import suppress
from typing import Mapping
from auto_archiver.core.metadata import Metadata
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
@@ -10,6 +11,19 @@ class RedditDropin(Dropin):
A class to handle Reddit drop-in functionality for the antibot extractor enricher module.
"""
@staticmethod
def documentation() -> Mapping[str, str]:
    """
    Describe the Reddit dropin: its name, what it handles, the site it
    targets and an example of the credentials block for that site.

    Declared as a static method, like the base ``Dropin.documentation``:
    the function takes no arguments, so without the decorator it could
    only be called on the class and would raise ``TypeError`` when called
    on an instance.
    """
    return {
        "name": "Reddit Dropin",
        "description": "Handles Reddit posts and works without authentication until Reddit flags your IP, so authentication is advised.",
        "site": "reddit.com",
        "authentication": {
            # Placeholder values only: they illustrate what each field expects.
            "reddit.com": {
                "username": "email address or username",
                "password": "password",
            }
        },
    }
@staticmethod
def suitable(url: str) -> bool:
return "reddit.com" in url

View File

@@ -1,4 +1,5 @@
import re
from typing import Mapping
from auto_archiver.core.metadata import Metadata
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
@@ -16,6 +17,19 @@ class VkDropin(Dropin):
CLIP_PATTERN = re.compile(r"(clip.{0,1}\d+_\d+)")
PHOTO_PATTERN = re.compile(r"(photo.{0,1}\d+_\d+)")
@staticmethod
def documentation() -> Mapping[str, str]:
    """
    Describe the VKontakte dropin: its name, what it handles, the site it
    targets and an example of the credentials block for that site.

    Declared as a static method, like the base ``Dropin.documentation``:
    the function takes no arguments, so without the decorator it could
    only be called on the class and would raise ``TypeError`` when called
    on an instance.
    """
    return {
        "name": "VKontakte Dropin",
        "description": "Handles VKontakte posts and works without authentication for some content.",
        "site": "vk.com",
        "authentication": {
            # Placeholder values only: they illustrate what each field expects.
            "vk.com": {
                "username": "phone number with country code",
                "password": "password",
            }
        },
    }
@staticmethod
def suitable(url: str) -> bool:
return "vk.com" in url
@@ -39,7 +53,7 @@ class VkDropin(Dropin):
@logger.catch
def _login(self) -> bool:
# TODO: test method
# TODO: test method, because current tests work without a login
self.sb.open("https://vk.com")
self.sb.wait_for_ready_state_complete()
if "/feed" in self.sb.get_current_url():