mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 05:08:28 +03:00
Merge pull request #217 from bellingcat/settings_page
Settings page user interface
This commit is contained in:
@@ -80,7 +80,10 @@ class ModuleFactory:
|
||||
|
||||
available = self.available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings)
|
||||
if not available:
|
||||
raise IndexError(f"Module '{module_name}' not found. Are you sure it's installed/exists?")
|
||||
message = f"Module '{module_name}' not found. Are you sure it's installed/exists?"
|
||||
if 'archiver' in module_name:
|
||||
message += f" Did you mean {module_name.replace('archiver', 'extractor')}?"
|
||||
raise IndexError(message)
|
||||
return available[0]
|
||||
|
||||
def available_modules(self, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]:
|
||||
|
||||
@@ -73,10 +73,20 @@ class ArchivingOrchestrator:
|
||||
|
||||
self.basic_parser = parser
|
||||
return parser
|
||||
|
||||
def check_steps(self, config):
|
||||
for module_type in MODULE_TYPES:
|
||||
if not config['steps'].get(f"{module_type}s", []):
|
||||
if module_type == 'feeder' or module_type == 'formatter' and config['steps'].get(f"{module_type}"):
|
||||
raise SetupError(f"It appears you have '{module_type}' set under 'steps' in your configuration file, but as of version 0.13.0 of Auto Archiver, you must use '{module_type}s'. Change this in your configuration file and try again. \
|
||||
Here's how that would look: \n\nsteps:\n {module_type}s:\n - [your_{module_type}_name_here]\n {'extractors:...' if module_type == 'feeder' else '...'}\n")
|
||||
if module_type == 'extractor' and config['steps'].get('archivers'):
|
||||
raise SetupError(f"As of version 0.13.0 of Auto Archiver, the 'archivers' step name has been changed to 'extractors'. Change this in your configuration file and try again. \
|
||||
Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_here]\n enrichers:...\n")
|
||||
raise SetupError(f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)")
|
||||
|
||||
def setup_complete_parser(self, basic_config: dict, yaml_config: dict, unused_args: list[str]) -> None:
|
||||
|
||||
|
||||
# modules parser to get the overridden 'steps' values
|
||||
modules_parser = argparse.ArgumentParser(
|
||||
add_help=False,
|
||||
@@ -101,6 +111,7 @@ class ArchivingOrchestrator:
|
||||
# but should we add them? Or should we just add them to the 'complete' parser?
|
||||
|
||||
if is_valid_config(yaml_config):
|
||||
self.check_steps(yaml_config)
|
||||
# only load the modules enabled in config
|
||||
# TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty?
|
||||
enabled_modules = []
|
||||
@@ -116,10 +127,6 @@ class ArchivingOrchestrator:
|
||||
simple_modules = [module for module in self.module_factory.available_modules() if not module.requires_setup]
|
||||
self.add_individual_module_args(simple_modules, parser)
|
||||
|
||||
# for simple mode, we use the cli_feeder and any modules that don't require setup
|
||||
if not yaml_config['steps']['feeders']:
|
||||
yaml_config['steps']['feeders'] = ['cli_feeder']
|
||||
|
||||
# add them to the config
|
||||
for module in simple_modules:
|
||||
for module_type in module.type:
|
||||
@@ -172,9 +179,6 @@ class ArchivingOrchestrator:
|
||||
if not parser:
|
||||
parser = self.parser
|
||||
|
||||
# allow passing URLs directly on the command line
|
||||
parser.add_argument('urls', nargs='*', default=[], help='URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml')
|
||||
|
||||
parser.add_argument('--authentication', dest='authentication', help='A dictionary of sites and their authentication methods \
|
||||
(token, username etc.) that extractors can use to log into \
|
||||
a website. If passing this on the command line, use a JSON string. \
|
||||
@@ -194,7 +198,11 @@ class ArchivingOrchestrator:
|
||||
modules = self.module_factory.available_modules()
|
||||
|
||||
for module in modules:
|
||||
|
||||
if module.name == 'cli_feeder':
|
||||
# special case. For the CLI feeder, allow passing URLs directly on the command line without setting --cli_feeder.urls=
|
||||
parser.add_argument('urls', nargs='*', default=[], help='URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml')
|
||||
continue
|
||||
|
||||
if not module.configs:
|
||||
# this module has no configs, don't show anything in the help
|
||||
# (TODO: do we want to show something about this module though, like a description?)
|
||||
@@ -278,27 +286,6 @@ class ArchivingOrchestrator:
|
||||
raise SetupError(f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}")
|
||||
|
||||
for module in modules_to_load:
|
||||
if module == 'cli_feeder':
|
||||
# cli_feeder is a pseudo module, it just takes the command line args for [URLS]
|
||||
urls = self.config['urls']
|
||||
if not urls:
|
||||
raise SetupError("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information.")
|
||||
|
||||
def feed(self) -> Generator[Metadata]:
|
||||
for url in urls:
|
||||
logger.debug(f"Processing URL: '{url}'")
|
||||
yield Metadata().set_url(url)
|
||||
|
||||
pseudo_module = type('CLIFeeder', (Feeder,), {
|
||||
'name': 'cli_feeder',
|
||||
'display_name': 'CLI Feeder',
|
||||
'__iter__': feed
|
||||
|
||||
})()
|
||||
|
||||
pseudo_module.__iter__ = feed
|
||||
step_items.append(pseudo_module)
|
||||
continue
|
||||
|
||||
if module in invalid_modules:
|
||||
continue
|
||||
|
||||
23
src/auto_archiver/modules/cli_feeder/__manifest__.py
Normal file
23
src/auto_archiver/modules/cli_feeder/__manifest__.py
Normal file
@@ -0,0 +1,23 @@
|
||||
{
|
||||
'name': 'Command Line Feeder',
|
||||
'type': ['feeder'],
|
||||
'entry_point': 'cli_feeder::CLIFeeder',
|
||||
'requires_setup': False,
|
||||
'description': 'Feeds URLs to orchestrator from the command line',
|
||||
'configs': {
|
||||
'urls': {
|
||||
'default': None,
|
||||
'help': 'URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml',
|
||||
},
|
||||
},
|
||||
'description': """
|
||||
The Command Line Feeder is the default enabled feeder for the Auto Archiver. It allows you to pass URLs directly to the orchestrator from the command line
|
||||
without the need to specify any additional configuration or command line arguments:
|
||||
|
||||
`auto-archiver --feeder cli_feeder -- "https://example.com/1/,https://example.com/2/"`
|
||||
|
||||
You can pass multiple URLs by separating them with a space. The URLs will be processed in the order they are provided.
|
||||
|
||||
`auto-archiver --feeder cli_feeder -- https://example.com/1/ https://example.com/2/`
|
||||
""",
|
||||
}
|
||||
21
src/auto_archiver/modules/cli_feeder/cli_feeder.py
Normal file
21
src/auto_archiver/modules/cli_feeder/cli_feeder.py
Normal file
@@ -0,0 +1,21 @@
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.core.feeder import Feeder
|
||||
from auto_archiver.core.metadata import Metadata
|
||||
|
||||
class CLIFeeder(Feeder):
|
||||
|
||||
def setup(self) -> None:
|
||||
self.urls = self.config['urls']
|
||||
if not self.urls:
|
||||
raise ValueError("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information.")
|
||||
|
||||
def __iter__(self) -> Metadata:
|
||||
urls = self.config['urls']
|
||||
for url in urls:
|
||||
logger.debug(f"Processing {url}")
|
||||
m = Metadata().set_url(url)
|
||||
m.set_context("folder", "cli")
|
||||
yield m
|
||||
|
||||
logger.success(f"Processed {len(urls)} URL(s)")
|
||||
@@ -6,7 +6,7 @@
|
||||
},
|
||||
'entry_point': 'csv_db::CSVDb',
|
||||
"configs": {
|
||||
"csv_file": {"default": "db.csv", "help": "CSV file name"}
|
||||
"csv_file": {"default": "db.csv", "help": "CSV file name to save metadata to"},
|
||||
},
|
||||
"description": """
|
||||
Handles exporting archival results to a CSV file.
|
||||
|
||||
@@ -12,7 +12,9 @@
|
||||
"default": None,
|
||||
"help": "the id of the sheet to archive (alternative to 'sheet' config)",
|
||||
},
|
||||
"header": {"default": 1, "help": "index of the header row (starts at 1)", "type": "int"},
|
||||
"header": {"default": 1,
|
||||
"type": "int",
|
||||
"help": "index of the header row (starts at 1)", "type": "int"},
|
||||
"service_account": {
|
||||
"default": "secrets/service_account.json",
|
||||
"help": "service account JSON file path. Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html",
|
||||
|
||||
@@ -7,7 +7,9 @@
|
||||
"bin": [""]
|
||||
},
|
||||
"configs": {
|
||||
"detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"}
|
||||
"detect_thumbnails": {"default": True,
|
||||
"help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'",
|
||||
"type": "bool"},
|
||||
},
|
||||
"description": """ """,
|
||||
}
|
||||
|
||||
@@ -17,7 +17,9 @@
|
||||
"choices": ["random", "static"],
|
||||
},
|
||||
"save_to": {"default": "./local_archive", "help": "folder where to save archived content"},
|
||||
"save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)"},
|
||||
"save_absolute": {"default": False,
|
||||
"type": "bool",
|
||||
"help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)"},
|
||||
},
|
||||
"description": """
|
||||
LocalStorage: A storage module for saving archived content locally on the filesystem.
|
||||
|
||||
@@ -6,13 +6,25 @@
|
||||
"python": ["loguru", "selenium"],
|
||||
},
|
||||
"configs": {
|
||||
"width": {"default": 1280, "help": "width of the screenshots"},
|
||||
"height": {"default": 1024, "help": "height of the screenshots"},
|
||||
"timeout": {"default": 60, "help": "timeout for taking the screenshot"},
|
||||
"sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"},
|
||||
"width": {"default": 1280,
|
||||
"type": "int",
|
||||
"help": "width of the screenshots"},
|
||||
"height": {"default": 1024,
|
||||
"type": "int",
|
||||
"help": "height of the screenshots"},
|
||||
"timeout": {"default": 60,
|
||||
"type": "int",
|
||||
"help": "timeout for taking the screenshot"},
|
||||
"sleep_before_screenshot": {"default": 4,
|
||||
"type": "int",
|
||||
"help": "seconds to wait for the pages to load before taking screenshot"},
|
||||
"http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"},
|
||||
"save_to_pdf": {"default": False, "help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"},
|
||||
"print_options": {"default": {}, "help": "options to pass to the pdf printer"}
|
||||
"save_to_pdf": {"default": False,
|
||||
"type": "bool",
|
||||
"help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"},
|
||||
"print_options": {"default": {},
|
||||
"help": "options to pass to the pdf printer, in JSON format. See https://www.selenium.dev/documentation/webdriver/interactions/print_page/ for more information",
|
||||
"type": "json_loader"},
|
||||
},
|
||||
"description": """
|
||||
Captures screenshots and optionally saves web pages as PDFs using a WebDriver.
|
||||
|
||||
@@ -7,7 +7,9 @@
|
||||
},
|
||||
'entry_point': 'ssl_enricher::SSLEnricher',
|
||||
"configs": {
|
||||
"skip_when_nothing_archived": {"default": True, "help": "if true, will skip enriching when no media is archived"},
|
||||
"skip_when_nothing_archived": {"default": True,
|
||||
"type": 'bool',
|
||||
"help": "if true, will skip enriching when no media is archived"},
|
||||
},
|
||||
"description": """
|
||||
Retrieves SSL certificate information for a domain and stores it as a file.
|
||||
|
||||
@@ -14,7 +14,9 @@
|
||||
"api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
|
||||
"bot_token": {"default": None, "help": "optional, but allows access to more content such as large videos, talk to @botfather"},
|
||||
"session_file": {"default": "secrets/anon", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."},
|
||||
"join_channels": {"default": True, "help": "disables the initial setup with channel_invites config, useful if you have a lot and get stuck"},
|
||||
"join_channels": {"default": True,
|
||||
"type": "bool",
|
||||
"help": "disables the initial setup with channel_invites config, useful if you have a lot and get stuck"},
|
||||
"channel_invites": {
|
||||
"default": {},
|
||||
"help": "(JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup",
|
||||
|
||||
@@ -17,11 +17,19 @@
|
||||
"configs": {
|
||||
"profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."},
|
||||
"docker_commands": {"default": None, "help":"if a custom docker invocation is needed"},
|
||||
"timeout": {"default": 120, "help": "timeout for WACZ generation in seconds"},
|
||||
"extract_media": {"default": False, "help": "If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."},
|
||||
"extract_screenshot": {"default": True, "help": "If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."},
|
||||
"timeout": {"default": 120,
|
||||
"type": "int",
|
||||
"help": "timeout for WACZ generation in seconds", "type": "int"},
|
||||
"extract_media": {"default": False,
|
||||
"type": 'bool',
|
||||
"help": "If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."
|
||||
},
|
||||
"extract_screenshot": {"default": True,
|
||||
"type": 'bool',
|
||||
"help": "If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."
|
||||
},
|
||||
"socks_proxy_host": {"default": None, "help": "SOCKS proxy host for browsertrix-crawler, use in combination with socks_proxy_port. eg: user:password@host"},
|
||||
"socks_proxy_port": {"default": None, "help": "SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234"},
|
||||
"socks_proxy_port": {"default": None, "type":"int", "help": "SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234"},
|
||||
"proxy_server": {"default": None, "help": "SOCKS server proxy URL, in development"},
|
||||
},
|
||||
"description": """
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
"configs": {
|
||||
"timeout": {
|
||||
"default": 15,
|
||||
"type": "int",
|
||||
"help": "seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually.",
|
||||
},
|
||||
"if_not_archived_within": {
|
||||
|
||||
@@ -10,8 +10,12 @@
|
||||
"help": "WhisperApi api endpoint, eg: https://whisperbox-api.com/api/v1, a deployment of https://github.com/bellingcat/whisperbox-transcribe."},
|
||||
"api_key": {"required": True,
|
||||
"help": "WhisperApi api key for authentication"},
|
||||
"include_srt": {"default": False, "help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."},
|
||||
"timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."},
|
||||
"include_srt": {"default": False,
|
||||
"type": "bool",
|
||||
"help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."},
|
||||
"timeout": {"default": 90,
|
||||
"type": "int",
|
||||
"help": "How many seconds to wait at most for a successful job completion."},
|
||||
"action": {"default": "translate",
|
||||
"help": "which Whisper operation to execute",
|
||||
"choices": ["transcribe", "translate", "language_detection"]},
|
||||
|
||||
Reference in New Issue
Block a user