Merge branch 'main' into merge_modules

This commit is contained in:
Patrick Robertson
2025-03-07 16:19:51 +00:00
47 changed files with 55825 additions and 102 deletions

View File

@@ -0,0 +1,23 @@
{
'name': 'Command Line Feeder',
'type': ['feeder'],
'entry_point': 'cli_feeder::CLIFeeder',
'requires_setup': False,
'description': 'Feeds URLs to orchestrator from the command line',
'configs': {
'urls': {
'default': None,
'help': 'URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml',
},
},
'description': """
The Command Line Feeder is the default enabled feeder for the Auto Archiver. It allows you to pass URLs directly to the orchestrator from the command line
without the need to specify any additional configuration or command line arguments:
`auto-archiver --feeder cli_feeder -- "https://example.com/1/,https://example.com/2/"`
You can pass multiple URLs by separating them with a space. The URLs will be processed in the order they are provided.
`auto-archiver --feeder cli_feeder -- https://example.com/1/ https://example.com/2/`
""",
}

View File

@@ -0,0 +1,21 @@
from loguru import logger
from auto_archiver.core.feeder import Feeder
from auto_archiver.core.metadata import Metadata
class CLIFeeder(Feeder):
def setup(self) -> None:
self.urls = self.config['urls']
if not self.urls:
raise ValueError("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information.")
def __iter__(self) -> Metadata:
urls = self.config['urls']
for url in urls:
logger.debug(f"Processing {url}")
m = Metadata().set_url(url)
m.set_context("folder", "cli")
yield m
logger.success(f"Processed {len(urls)} URL(s)")

View File

@@ -10,7 +10,7 @@ class ConsoleDb(Database):
"""
def started(self, item: Metadata) -> None:
logger.warning(f"STARTED {item}")
logger.info(f"STARTED {item}")
def failed(self, item: Metadata, reason:str) -> None:
logger.error(f"FAILED {item}: {reason}")

View File

@@ -6,7 +6,7 @@
},
'entry_point': 'csv_db::CSVDb',
"configs": {
"csv_file": {"default": "db.csv", "help": "CSV file name"}
"csv_file": {"default": "db.csv", "help": "CSV file name to save metadata to"},
},
"description": """
Handles exporting archival results to a CSV file.

View File

@@ -28,6 +28,13 @@ the broader archiving framework.
metadata objects. Some dropins are included in this generic_archiver by default, but
custom dropins can be created to handle additional websites and passed to the archiver
via the command line using the `--dropins` option (TODO!).
### Auto-Updates
The Generic Extractor will also automatically check for updates to `yt-dlp` (every 5 days by default).
This can be configured using the `ytdlp_update_interval` setting (or disabled by setting it to -1).
If you are having issues with the extractor, you can review the version of `yt-dlp` being used with `yt-dlp --version`.
""",
"configs": {
"subtitles": {"default": True, "help": "download subtitles if available", "type": "bool"},
@@ -64,5 +71,10 @@ via the command line using the `--dropins` option (TODO!).
"default": "inf",
"help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit.",
},
"ytdlp_update_interval": {
"default": 5,
"help": "How often to check for yt-dlp updates (days). If positive, will check and update yt-dlp every [num] days. Set it to -1 to disable, or 0 to always update on every run.",
"type": "int",
},
},
}

View File

@@ -1,7 +1,11 @@
import datetime, os, yt_dlp, pysubs2
import datetime, os
import importlib
import subprocess
from typing import Generator, Type
import yt_dlp
from yt_dlp.extractor.common import InfoExtractor
import pysubs2
from loguru import logger
@@ -11,6 +15,44 @@ from auto_archiver.core import Metadata, Media
class GenericExtractor(Extractor):
_dropins = {}
def setup(self):
# check for file .ytdlp-update in the secrets folder
if self.ytdlp_update_interval < 0:
return
use_secrets = os.path.exists('secrets')
path = os.path.join('secrets' if use_secrets else '', '.ytdlp-update')
next_update_check = None
if os.path.exists(path):
with open(path, "r") as f:
next_update_check = datetime.datetime.fromisoformat(f.read())
if not next_update_check or next_update_check < datetime.datetime.now():
self.update_ytdlp()
next_update_check = datetime.datetime.now() + datetime.timedelta(days=self.ytdlp_update_interval)
with open(path, "w") as f:
f.write(next_update_check.isoformat())
def update_ytdlp(self):
logger.info("Checking and updating yt-dlp...")
logger.info(f"Tip: change the 'ytdlp_update_interval' setting to control how often yt-dlp is updated. Set to -1 to disable or 0 to enable on every run. Current setting: {self.ytdlp_update_interval}")
from importlib.metadata import version as get_version
old_version = get_version("yt-dlp")
try:
# try and update with pip (this works inside poetry environment and in a normal virtualenv)
result = subprocess.run(["pip", "install", "--upgrade", "yt-dlp"], check=True, capture_output=True)
if "Successfully installed yt-dlp" in result.stdout.decode():
new_version = importlib.metadata.version("yt-dlp")
logger.info(f"yt-dlp successfully (from {old_version} to {new_version})")
importlib.reload(yt_dlp)
else:
logger.info("yt-dlp already up to date")
except Exception as e:
logger.error(f"Error updating yt-dlp: {e}")
def suitable_extractors(self, url: str) -> Generator[str, None, None]:
"""
Returns a list of valid extractors for the given URL"""

View File

@@ -12,7 +12,9 @@
"default": None,
"help": "the id of the sheet to archive (alternative to 'sheet' config)",
},
"header": {"default": 1, "help": "index of the header row (starts at 1)", "type": "int"},
"header": {"default": 1,
"type": "int",
"help": "index of the header row (starts at 1)", "type": "int"},
"service_account": {
"default": "secrets/service_account.json",
"help": "service account JSON file path. Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html",

View File

@@ -7,7 +7,9 @@
"bin": [""]
},
"configs": {
"detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"}
"detect_thumbnails": {"default": True,
"help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'",
"type": "bool"},
},
"description": """ """,
}

View File

@@ -10,25 +10,30 @@
"requires_setup": True,
"configs": {
"username": {"required": True,
"help": "a valid Instagram username"},
"help": "A valid Instagram username."},
"password": {
"required": True,
"help": "the corresponding Instagram account password",
"help": "The corresponding Instagram account password.",
},
"download_folder": {
"default": "instaloader",
"help": "name of a folder to temporarily download content to",
"help": "Name of a folder to temporarily download content to.",
},
"session_file": {
"default": "secrets/instaloader.session",
"help": "path to the instagram session which saves session credentials",
"help": "Path to the instagram session file which saves session credentials. If one doesn't exist this gives the path to store a new one.",
},
# TODO: fine-grain
# "download_stories": {"default": True, "help": "if the link is to a user profile: whether to get stories information"},
},
"description": """
Uses the [Instaloader library](https://instaloader.github.io/as-module.html) to download content from Instagram. This class handles both individual posts
and user profiles, downloading as much information as possible, including images, videos, text, stories,
Uses the [Instaloader library](https://instaloader.github.io/as-module.html) to download content from Instagram.
> ⚠️ **Warning**
> This module is not actively maintained due to known issues with blocking.
> Prioritise usage of the [Instagram Tbot Extractor](./instagram_tbot_extractor.md) and [Instagram API Extractor](./instagram_api_extractor.md)
This class handles both individual posts and user profiles, downloading as much information as possible, including images, videos, text, stories,
highlights, and tagged posts.
Authentication is required via username/password or a session file.

View File

@@ -3,7 +3,7 @@
highlights, and tagged posts. Authentication is required via username/password or a session file.
"""
import re, os, shutil, traceback
import re, os, shutil
import instaloader
from loguru import logger
@@ -15,10 +15,9 @@ class InstagramExtractor(Extractor):
"""
Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ...)
"""
# NB: post regex should be tested before profile
valid_url = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/")
# https://regex101.com/r/MGPquX/1
post_pattern = re.compile(r"{valid_url}(?:p|reel)\/(\w+)".format(valid_url=valid_url))
# https://regex101.com/r/6Wbsxa/1
@@ -28,19 +27,22 @@ class InstagramExtractor(Extractor):
def setup(self) -> None:
self.insta = instaloader.Instaloader(
download_geotags=True, download_comments=True, compress_json=False, dirname_pattern=self.download_folder, filename_pattern="{date_utc}_UTC_{target}__{typename}"
download_geotags=True,
download_comments=True,
compress_json=False,
dirname_pattern=self.download_folder,
filename_pattern="{date_utc}_UTC_{target}__{typename}"
)
try:
self.insta.load_session_from_file(self.username, self.session_file)
except Exception as e:
logger.error(f"Unable to login from session file: {e}\n{traceback.format_exc()}")
try:
self.insta.login(self.username, config.instagram_self.password)
# TODO: wait for this issue to be fixed https://github.com/instaloader/instaloader/issues/1758
logger.debug(f"Session file failed", exc_info=True)
logger.info("No valid session file found - Attempting login with use and password.")
self.insta.login(self.username, self.password)
self.insta.save_session_to_file(self.session_file)
except Exception as e2:
logger.error(f"Unable to finish login (retrying from file): {e2}\n{traceback.format_exc()}")
except Exception as e:
logger.error(f"Failed to setup Instagram Extractor with Instagrapi. {e}")
def download(self, item: Metadata) -> Metadata:

View File

@@ -104,7 +104,7 @@ class InstagramTbotExtractor(Extractor):
message = ""
time.sleep(3)
# media is added before text by the bot so it can be used as a stop-logic mechanism
while attempts < (self.timeout - 3) and (not message or not len(seen_media)):
while attempts < max(self.timeout - 3, 3) and (not message or not len(seen_media)):
attempts += 1
time.sleep(1)
for post in self.client.iter_messages(chat, min_id=since_id):

View File

@@ -17,7 +17,9 @@
"choices": ["random", "static"],
},
"save_to": {"default": "./local_archive", "help": "folder where to save archived content"},
"save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)"},
"save_absolute": {"default": False,
"type": "bool",
"help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)"},
},
"description": """
LocalStorage: A storage module for saving archived content locally on the filesystem.

View File

@@ -6,13 +6,25 @@
"python": ["loguru", "selenium"],
},
"configs": {
"width": {"default": 1280, "help": "width of the screenshots"},
"height": {"default": 720, "help": "height of the screenshots"},
"timeout": {"default": 60, "help": "timeout for taking the screenshot"},
"sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"},
"width": {"default": 1280,
"type": "int",
"help": "width of the screenshots"},
"height": {"default": 1024,
"type": "int",
"help": "height of the screenshots"},
"timeout": {"default": 60,
"type": "int",
"help": "timeout for taking the screenshot"},
"sleep_before_screenshot": {"default": 4,
"type": "int",
"help": "seconds to wait for the pages to load before taking screenshot"},
"http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"},
"save_to_pdf": {"default": False, "help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"},
"print_options": {"default": {}, "help": "options to pass to the pdf printer"}
"save_to_pdf": {"default": False,
"type": "bool",
"help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"},
"print_options": {"default": {},
"help": "options to pass to the pdf printer, in JSON format. See https://www.selenium.dev/documentation/webdriver/interactions/print_page/ for more information",
"type": "json_loader"},
},
"description": """
Captures screenshots and optionally saves web pages as PDFs using a WebDriver.

View File

@@ -7,7 +7,9 @@
},
'entry_point': 'ssl_enricher::SSLEnricher',
"configs": {
"skip_when_nothing_archived": {"default": True, "help": "if true, will skip enriching when no media is archived"},
"skip_when_nothing_archived": {"default": True,
"type": 'bool',
"help": "if true, will skip enriching when no media is archived"},
},
"description": """
Retrieves SSL certificate information for a domain and stores it as a file.

View File

@@ -14,7 +14,9 @@
"api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
"bot_token": {"default": None, "help": "optional, but allows access to more content such as large videos, talk to @botfather"},
"session_file": {"default": "secrets/anon", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."},
"join_channels": {"default": True, "help": "disables the initial setup with channel_invites config, useful if you have a lot and get stuck"},
"join_channels": {"default": True,
"type": "bool",
"help": "disables the initial setup with channel_invites config, useful if you have a lot and get stuck"},
"channel_invites": {
"default": {},
"help": "(JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup",

View File

@@ -17,11 +17,19 @@
"configs": {
"profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."},
"docker_commands": {"default": None, "help":"if a custom docker invocation is needed"},
"timeout": {"default": 120, "help": "timeout for WACZ generation in seconds"},
"extract_media": {"default": False, "help": "If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."},
"extract_screenshot": {"default": True, "help": "If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."},
"timeout": {"default": 120,
"type": "int",
"help": "timeout for WACZ generation in seconds", "type": "int"},
"extract_media": {"default": False,
"type": 'bool',
"help": "If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."
},
"extract_screenshot": {"default": True,
"type": 'bool',
"help": "If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."
},
"socks_proxy_host": {"default": None, "help": "SOCKS proxy host for browsertrix-crawler, use in combination with socks_proxy_port. eg: user:password@host"},
"socks_proxy_port": {"default": None, "help": "SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234"},
"socks_proxy_port": {"default": None, "type":"int", "help": "SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234"},
"proxy_server": {"default": None, "help": "SOCKS server proxy URL, in development"},
},
"description": """

View File

@@ -9,6 +9,7 @@
"configs": {
"timeout": {
"default": 15,
"type": "int",
"help": "seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually.",
},
"if_not_archived_within": {

View File

@@ -10,8 +10,12 @@
"help": "WhisperApi api endpoint, eg: https://whisperbox-api.com/api/v1, a deployment of https://github.com/bellingcat/whisperbox-transcribe."},
"api_key": {"required": True,
"help": "WhisperApi api key for authentication"},
"include_srt": {"default": False, "help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."},
"timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."},
"include_srt": {"default": False,
"type": "bool",
"help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."},
"timeout": {"default": 90,
"type": "int",
"help": "How many seconds to wait at most for a successful job completion."},
"action": {"default": "translate",
"help": "which Whisper operation to execute",
"choices": ["transcribe", "translate", "language_detection"]},