mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 05:08:28 +03:00
Merge pull request #217 from bellingcat/settings_page
Settings page user interface
This commit is contained in:
23
src/auto_archiver/modules/cli_feeder/__manifest__.py
Normal file
23
src/auto_archiver/modules/cli_feeder/__manifest__.py
Normal file
@@ -0,0 +1,23 @@
|
||||
{
|
||||
'name': 'Command Line Feeder',
|
||||
'type': ['feeder'],
|
||||
'entry_point': 'cli_feeder::CLIFeeder',
|
||||
'requires_setup': False,
|
||||
'description': 'Feeds URLs to orchestrator from the command line',
|
||||
'configs': {
|
||||
'urls': {
|
||||
'default': None,
|
||||
'help': 'URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml',
|
||||
},
|
||||
},
|
||||
'description': """
|
||||
The Command Line Feeder is the default enabled feeder for the Auto Archiver. It allows you to pass URLs directly to the orchestrator from the command line
|
||||
without the need to specify any additional configuration or command line arguments:
|
||||
|
||||
`auto-archiver --feeder cli_feeder -- "https://example.com/1/,https://example.com/2/"`
|
||||
|
||||
You can pass multiple URLs by separating them with a space. The URLs will be processed in the order they are provided.
|
||||
|
||||
`auto-archiver --feeder cli_feeder -- https://example.com/1/ https://example.com/2/`
|
||||
""",
|
||||
}
|
||||
21
src/auto_archiver/modules/cli_feeder/cli_feeder.py
Normal file
21
src/auto_archiver/modules/cli_feeder/cli_feeder.py
Normal file
@@ -0,0 +1,21 @@
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.core.feeder import Feeder
|
||||
from auto_archiver.core.metadata import Metadata
|
||||
|
||||
class CLIFeeder(Feeder):
|
||||
|
||||
def setup(self) -> None:
|
||||
self.urls = self.config['urls']
|
||||
if not self.urls:
|
||||
raise ValueError("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information.")
|
||||
|
||||
def __iter__(self) -> Metadata:
|
||||
urls = self.config['urls']
|
||||
for url in urls:
|
||||
logger.debug(f"Processing {url}")
|
||||
m = Metadata().set_url(url)
|
||||
m.set_context("folder", "cli")
|
||||
yield m
|
||||
|
||||
logger.success(f"Processed {len(urls)} URL(s)")
|
||||
@@ -6,7 +6,7 @@
|
||||
},
|
||||
'entry_point': 'csv_db::CSVDb',
|
||||
"configs": {
|
||||
"csv_file": {"default": "db.csv", "help": "CSV file name"}
|
||||
"csv_file": {"default": "db.csv", "help": "CSV file name to save metadata to"},
|
||||
},
|
||||
"description": """
|
||||
Handles exporting archival results to a CSV file.
|
||||
|
||||
@@ -12,7 +12,9 @@
|
||||
"default": None,
|
||||
"help": "the id of the sheet to archive (alternative to 'sheet' config)",
|
||||
},
|
||||
"header": {"default": 1, "help": "index of the header row (starts at 1)", "type": "int"},
|
||||
"header": {"default": 1,
|
||||
"type": "int",
|
||||
"help": "index of the header row (starts at 1)", "type": "int"},
|
||||
"service_account": {
|
||||
"default": "secrets/service_account.json",
|
||||
"help": "service account JSON file path. Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html",
|
||||
|
||||
@@ -7,7 +7,9 @@
|
||||
"bin": [""]
|
||||
},
|
||||
"configs": {
|
||||
"detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"}
|
||||
"detect_thumbnails": {"default": True,
|
||||
"help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'",
|
||||
"type": "bool"},
|
||||
},
|
||||
"description": """ """,
|
||||
}
|
||||
|
||||
@@ -17,7 +17,9 @@
|
||||
"choices": ["random", "static"],
|
||||
},
|
||||
"save_to": {"default": "./local_archive", "help": "folder where to save archived content"},
|
||||
"save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)"},
|
||||
"save_absolute": {"default": False,
|
||||
"type": "bool",
|
||||
"help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)"},
|
||||
},
|
||||
"description": """
|
||||
LocalStorage: A storage module for saving archived content locally on the filesystem.
|
||||
|
||||
@@ -6,13 +6,25 @@
|
||||
"python": ["loguru", "selenium"],
|
||||
},
|
||||
"configs": {
|
||||
"width": {"default": 1280, "help": "width of the screenshots"},
|
||||
"height": {"default": 1024, "help": "height of the screenshots"},
|
||||
"timeout": {"default": 60, "help": "timeout for taking the screenshot"},
|
||||
"sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"},
|
||||
"width": {"default": 1280,
|
||||
"type": "int",
|
||||
"help": "width of the screenshots"},
|
||||
"height": {"default": 1024,
|
||||
"type": "int",
|
||||
"help": "height of the screenshots"},
|
||||
"timeout": {"default": 60,
|
||||
"type": "int",
|
||||
"help": "timeout for taking the screenshot"},
|
||||
"sleep_before_screenshot": {"default": 4,
|
||||
"type": "int",
|
||||
"help": "seconds to wait for the pages to load before taking screenshot"},
|
||||
"http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"},
|
||||
"save_to_pdf": {"default": False, "help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"},
|
||||
"print_options": {"default": {}, "help": "options to pass to the pdf printer"}
|
||||
"save_to_pdf": {"default": False,
|
||||
"type": "bool",
|
||||
"help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"},
|
||||
"print_options": {"default": {},
|
||||
"help": "options to pass to the pdf printer, in JSON format. See https://www.selenium.dev/documentation/webdriver/interactions/print_page/ for more information",
|
||||
"type": "json_loader"},
|
||||
},
|
||||
"description": """
|
||||
Captures screenshots and optionally saves web pages as PDFs using a WebDriver.
|
||||
|
||||
@@ -7,7 +7,9 @@
|
||||
},
|
||||
'entry_point': 'ssl_enricher::SSLEnricher',
|
||||
"configs": {
|
||||
"skip_when_nothing_archived": {"default": True, "help": "if true, will skip enriching when no media is archived"},
|
||||
"skip_when_nothing_archived": {"default": True,
|
||||
"type": 'bool',
|
||||
"help": "if true, will skip enriching when no media is archived"},
|
||||
},
|
||||
"description": """
|
||||
Retrieves SSL certificate information for a domain and stores it as a file.
|
||||
|
||||
@@ -14,7 +14,9 @@
|
||||
"api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
|
||||
"bot_token": {"default": None, "help": "optional, but allows access to more content such as large videos, talk to @botfather"},
|
||||
"session_file": {"default": "secrets/anon", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."},
|
||||
"join_channels": {"default": True, "help": "disables the initial setup with channel_invites config, useful if you have a lot and get stuck"},
|
||||
"join_channels": {"default": True,
|
||||
"type": "bool",
|
||||
"help": "disables the initial setup with channel_invites config, useful if you have a lot and get stuck"},
|
||||
"channel_invites": {
|
||||
"default": {},
|
||||
"help": "(JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup",
|
||||
|
||||
@@ -17,11 +17,19 @@
|
||||
"configs": {
|
||||
"profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."},
|
||||
"docker_commands": {"default": None, "help":"if a custom docker invocation is needed"},
|
||||
"timeout": {"default": 120, "help": "timeout for WACZ generation in seconds"},
|
||||
"extract_media": {"default": False, "help": "If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."},
|
||||
"extract_screenshot": {"default": True, "help": "If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."},
|
||||
"timeout": {"default": 120,
|
||||
"type": "int",
|
||||
"help": "timeout for WACZ generation in seconds", "type": "int"},
|
||||
"extract_media": {"default": False,
|
||||
"type": 'bool',
|
||||
"help": "If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."
|
||||
},
|
||||
"extract_screenshot": {"default": True,
|
||||
"type": 'bool',
|
||||
"help": "If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."
|
||||
},
|
||||
"socks_proxy_host": {"default": None, "help": "SOCKS proxy host for browsertrix-crawler, use in combination with socks_proxy_port. eg: user:password@host"},
|
||||
"socks_proxy_port": {"default": None, "help": "SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234"},
|
||||
"socks_proxy_port": {"default": None, "type":"int", "help": "SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234"},
|
||||
"proxy_server": {"default": None, "help": "SOCKS server proxy URL, in development"},
|
||||
},
|
||||
"description": """
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
"configs": {
|
||||
"timeout": {
|
||||
"default": 15,
|
||||
"type": "int",
|
||||
"help": "seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually.",
|
||||
},
|
||||
"if_not_archived_within": {
|
||||
|
||||
@@ -10,8 +10,12 @@
|
||||
"help": "WhisperApi api endpoint, eg: https://whisperbox-api.com/api/v1, a deployment of https://github.com/bellingcat/whisperbox-transcribe."},
|
||||
"api_key": {"required": True,
|
||||
"help": "WhisperApi api key for authentication"},
|
||||
"include_srt": {"default": False, "help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."},
|
||||
"timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."},
|
||||
"include_srt": {"default": False,
|
||||
"type": "bool",
|
||||
"help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."},
|
||||
"timeout": {"default": 90,
|
||||
"type": "int",
|
||||
"help": "How many seconds to wait at most for a successful job completion."},
|
||||
"action": {"default": "translate",
|
||||
"help": "which Whisper operation to execute",
|
||||
"choices": ["transcribe", "translate", "language_detection"]},
|
||||
|
||||
Reference in New Issue
Block a user