mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 13:18:28 +03:00
Merge branch 'main' into small_issues
# Conflicts: # src/auto_archiver/core/base_module.py # src/auto_archiver/utils/misc.py
This commit is contained in:
@@ -84,11 +84,13 @@ class BaseModule(ABC):
|
||||
* api_key: str - the API key to use for login\n
|
||||
* api_secret: str - the API secret to use for login\n
|
||||
* cookie: str - a cookie string to use for login (specific to this site)\n
|
||||
* cookies_file: str - the path to a cookies file to use for login (specific to this site)\n
|
||||
* cookies_from_browser: str - the name of the browser to extract cookies from (specitic for this site)\n
|
||||
"""
|
||||
# TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com)
|
||||
# for now the user must enter them both, like "x.com,twitter.com" in their config. Maybe we just hard-code?
|
||||
|
||||
site = UrlUtil.domain_for_url(site)
|
||||
site = UrlUtil.domain_for_url(site).lstrip("www.")
|
||||
# add the 'www' version of the site to the list of sites to check
|
||||
authdict = {}
|
||||
|
||||
@@ -115,16 +117,29 @@ class BaseModule(ABC):
|
||||
ytdlp_opts = getattr(parse_options(args), 'ydl_opts')
|
||||
return yt_dlp.YoutubeDL(ytdlp_opts).cookiejar
|
||||
|
||||
# get the cookies jar, prefer the browser cookies than the file
|
||||
if 'cookies_from_browser' in self.authentication:
|
||||
get_cookiejar_options = None
|
||||
|
||||
# order of priority:
|
||||
# 1. cookies_from_browser setting in site config
|
||||
# 2. cookies_file setting in site config
|
||||
# 3. cookies_from_browser setting in global config
|
||||
# 4. cookies_file setting in global config
|
||||
|
||||
if 'cookies_from_browser' in authdict:
|
||||
get_cookiejar_options = ['--cookies-from-browser', authdict['cookies_from_browser']]
|
||||
elif 'cookies_file' in authdict:
|
||||
get_cookiejar_options = ['--cookies', authdict['cookies_file']]
|
||||
elif 'cookies_from_browser' in self.authentication:
|
||||
authdict['cookies_from_browser'] = self.authentication['cookies_from_browser']
|
||||
if extract_cookies:
|
||||
authdict['cookies_jar'] = get_ytdlp_cookiejar(['--cookies-from-browser', self.authentication['cookies_from_browser']])
|
||||
get_cookiejar_options = ['--cookies-from-browser', self.authentication['cookies_from_browser']]
|
||||
elif 'cookies_file' in self.authentication:
|
||||
authdict['cookies_file'] = self.authentication['cookies_file']
|
||||
if extract_cookies:
|
||||
authdict['cookies_jar'] = get_ytdlp_cookiejar(['--cookies', self.authentication['cookies_file']])
|
||||
get_cookiejar_options = ['--cookies', self.authentication['cookies_file']]
|
||||
|
||||
|
||||
if get_cookiejar_options:
|
||||
authdict['cookies_jar'] = get_ytdlp_cookiejar(get_cookiejar_options)
|
||||
|
||||
return authdict
|
||||
|
||||
def repr(self):
|
||||
|
||||
@@ -7,6 +7,7 @@ flexible setup in various environments.
|
||||
|
||||
import argparse
|
||||
from ruamel.yaml import YAML, CommentedMap, add_representer
|
||||
import json
|
||||
|
||||
from loguru import logger
|
||||
|
||||
@@ -17,10 +18,12 @@ from typing import Any, List, Type, Tuple
|
||||
|
||||
_yaml: YAML = YAML()
|
||||
|
||||
DEFAULT_CONFIG_FILE = "secrets/orchestration.yaml"
|
||||
|
||||
EMPTY_CONFIG = _yaml.load("""
|
||||
# Auto Archiver Configuration
|
||||
# Steps are the modules that will be run in the order they are defined
|
||||
|
||||
# Steps are the modules that will be run in the order they are defined
|
||||
steps:""" + "".join([f"\n {module}s: []" for module in MODULE_TYPES]) + \
|
||||
"""
|
||||
|
||||
@@ -52,6 +55,57 @@ logging:
|
||||
""")
|
||||
# note: 'logging' is explicitly added above in order to better format the config file
|
||||
|
||||
|
||||
# Arg Parse Actions/Classes
|
||||
class AuthenticationJsonParseAction(argparse.Action):
|
||||
def __call__(self, parser, namespace, values, option_string=None):
|
||||
|
||||
try:
|
||||
auth_dict = json.loads(values)
|
||||
setattr(namespace, self.dest, auth_dict)
|
||||
except json.JSONDecodeError as e:
|
||||
raise argparse.ArgumentTypeError(f"Invalid JSON input for argument '{self.dest}': {e}")
|
||||
|
||||
def load_from_file(path):
|
||||
try:
|
||||
with open(path, 'r') as f:
|
||||
try:
|
||||
auth_dict = json.load(f)
|
||||
except json.JSONDecodeError:
|
||||
f.seek(0)
|
||||
# maybe it's yaml, try that
|
||||
auth_dict = _yaml.load(f)
|
||||
if auth_dict.get('authentication'):
|
||||
auth_dict = auth_dict['authentication']
|
||||
auth_dict['load_from_file'] = path
|
||||
return auth_dict
|
||||
except:
|
||||
return None
|
||||
|
||||
if isinstance(auth_dict, dict) and auth_dict.get('from_file'):
|
||||
auth_dict = load_from_file(auth_dict['from_file'])
|
||||
elif isinstance(auth_dict, str):
|
||||
# if it's a string
|
||||
auth_dict = load_from_file(auth_dict)
|
||||
|
||||
if not isinstance(auth_dict, dict):
|
||||
raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
|
||||
global_options = ['cookies_from_browser', 'cookies_file', 'load_from_file']
|
||||
for key, auth in auth_dict.items():
|
||||
if key in global_options:
|
||||
continue
|
||||
if not isinstance(key, str) or not isinstance(auth, dict):
|
||||
raise argparse.ArgumentTypeError(f"Authentication must be a dictionary of site names and their authentication methods. Valid global configs are {global_options}")
|
||||
|
||||
setattr(namespace, self.dest, auth_dict)
|
||||
|
||||
|
||||
class UniqueAppendAction(argparse.Action):
|
||||
def __call__(self, parser, namespace, values, option_string=None):
|
||||
for value in values:
|
||||
if value not in getattr(namespace, self.dest):
|
||||
getattr(namespace, self.dest).append(value)
|
||||
|
||||
class DefaultValidatingParser(argparse.ArgumentParser):
|
||||
|
||||
def error(self, message):
|
||||
@@ -82,6 +136,7 @@ class DefaultValidatingParser(argparse.ArgumentParser):
|
||||
|
||||
return super().parse_known_args(args, namespace)
|
||||
|
||||
# Config Utils
|
||||
|
||||
def to_dot_notation(yaml_conf: CommentedMap | dict) -> dict:
|
||||
dotdict = {}
|
||||
@@ -153,8 +208,8 @@ def read_yaml(yaml_filename: str) -> CommentedMap:
|
||||
pass
|
||||
|
||||
if not config:
|
||||
config = EMPTY_CONFIG
|
||||
|
||||
config = deepcopy(EMPTY_CONFIG)
|
||||
|
||||
return config
|
||||
|
||||
# TODO: make this tidier/find a way to notify of which keys should not be stored
|
||||
@@ -170,4 +225,7 @@ def store_yaml(config: CommentedMap, yaml_filename: str) -> None:
|
||||
|
||||
config_to_save.pop('urls', None)
|
||||
with open(yaml_filename, "w", encoding="utf-8") as outf:
|
||||
_yaml.dump(config_to_save, outf)
|
||||
_yaml.dump(config_to_save, outf)
|
||||
|
||||
def is_valid_config(config: CommentedMap) -> bool:
|
||||
return config and config != EMPTY_CONFIG
|
||||
@@ -13,7 +13,7 @@ from abc import abstractmethod
|
||||
from auto_archiver.core import Metadata, BaseModule
|
||||
|
||||
class Enricher(BaseModule):
|
||||
"""Base classes and utilities for enrichers in the Auto-Archiver system.
|
||||
"""Base classes and utilities for enrichers in the Auto Archiver system.
|
||||
|
||||
Enricher modules must implement the `enrich` method to define their behavior.
|
||||
"""
|
||||
|
||||
@@ -134,7 +134,6 @@ class LazyBaseModule:
|
||||
|
||||
"""
|
||||
name: str
|
||||
type: list
|
||||
description: str
|
||||
path: str
|
||||
module_factory: ModuleFactory
|
||||
@@ -148,6 +147,10 @@ class LazyBaseModule:
|
||||
self.path = path
|
||||
self.module_factory = factory
|
||||
|
||||
@property
|
||||
def type(self):
|
||||
return self.manifest['type']
|
||||
|
||||
@property
|
||||
def entry_point(self):
|
||||
if not self._entry_point and not self.manifest['entry_point']:
|
||||
@@ -183,10 +186,9 @@ class LazyBaseModule:
|
||||
try:
|
||||
manifest.update(ast.literal_eval(f.read()))
|
||||
except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError) as e:
|
||||
logger.error(f"Error loading manifest from file {self.path}/{MANIFEST_FILE}: {e}")
|
||||
raise ValueError(f"Error loading manifest from file {self.path}/{MANIFEST_FILE}: {e}")
|
||||
|
||||
self._manifest = manifest
|
||||
self.type = manifest['type']
|
||||
self._entry_point = manifest['entry_point']
|
||||
self.description = manifest['description']
|
||||
self.version = manifest['version']
|
||||
@@ -254,7 +256,7 @@ class LazyBaseModule:
|
||||
instance.module_factory = self.module_factory
|
||||
|
||||
# merge the default config with the user config
|
||||
default_config = dict((k, v['default']) for k, v in self.configs.items() if v.get('default'))
|
||||
default_config = dict((k, v['default']) for k, v in self.configs.items() if 'default' in v)
|
||||
|
||||
config[self.name] = default_config | config.get(self.name, {})
|
||||
instance.config_setup(config)
|
||||
|
||||
@@ -6,95 +6,31 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Generator, Union, List, Type, TYPE_CHECKING
|
||||
from urllib.parse import urlparse
|
||||
from ipaddress import ip_address
|
||||
from copy import copy
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
from tempfile import TemporaryDirectory
|
||||
import traceback
|
||||
from copy import copy
|
||||
|
||||
from rich_argparse import RichHelpFormatter
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from .metadata import Metadata, Media
|
||||
from auto_archiver.version import __version__
|
||||
from .config import _yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser
|
||||
from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, is_valid_config, \
|
||||
DefaultValidatingParser, UniqueAppendAction, AuthenticationJsonParseAction, DEFAULT_CONFIG_FILE
|
||||
from .module import ModuleFactory, LazyBaseModule
|
||||
from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher
|
||||
from .consts import MODULE_TYPES
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.url import check_url_or_raise
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .base_module import BaseModule
|
||||
from .module import LazyBaseModule
|
||||
|
||||
DEFAULT_CONFIG_FILE = "orchestration.yaml"
|
||||
|
||||
|
||||
class JsonParseAction(argparse.Action):
|
||||
def __call__(self, parser, namespace, values, option_string=None):
|
||||
try:
|
||||
setattr(namespace, self.dest, json.loads(values))
|
||||
except json.JSONDecodeError as e:
|
||||
raise argparse.ArgumentTypeError(f"Invalid JSON input for argument '{self.dest}': {e}")
|
||||
|
||||
|
||||
class AuthenticationJsonParseAction(JsonParseAction):
|
||||
def __call__(self, parser, namespace, values, option_string=None):
|
||||
super().__call__(parser, namespace, values, option_string)
|
||||
auth_dict = getattr(namespace, self.dest)
|
||||
|
||||
def load_from_file(path):
|
||||
try:
|
||||
with open(path, 'r') as f:
|
||||
try:
|
||||
auth_dict = json.load(f)
|
||||
except json.JSONDecodeError:
|
||||
f.seek(0)
|
||||
# maybe it's yaml, try that
|
||||
auth_dict = _yaml.load(f)
|
||||
if auth_dict.get('authentication'):
|
||||
auth_dict = auth_dict['authentication']
|
||||
auth_dict['load_from_file'] = path
|
||||
return auth_dict
|
||||
except:
|
||||
return None
|
||||
|
||||
if isinstance(auth_dict, dict) and auth_dict.get('from_file'):
|
||||
auth_dict = load_from_file(auth_dict['from_file'])
|
||||
elif isinstance(auth_dict, str):
|
||||
# if it's a string
|
||||
auth_dict = load_from_file(auth_dict)
|
||||
|
||||
if not isinstance(auth_dict, dict):
|
||||
raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
|
||||
global_options = ['cookies_from_browser', 'cookies_file', 'load_from_file']
|
||||
for key, auth in auth_dict.items():
|
||||
if key in global_options:
|
||||
continue
|
||||
if not isinstance(key, str) or not isinstance(auth, dict):
|
||||
raise argparse.ArgumentTypeError(f"Authentication must be a dictionary of site names and their authentication methods. Valid global configs are {global_options}")
|
||||
|
||||
# extract out concatenated sites
|
||||
for key, val in copy(auth_dict).items():
|
||||
if "," in key:
|
||||
for site in key.split(","):
|
||||
auth_dict[site] = val
|
||||
del auth_dict[key]
|
||||
|
||||
setattr(namespace, self.dest, auth_dict)
|
||||
|
||||
|
||||
class UniqueAppendAction(argparse.Action):
|
||||
def __call__(self, parser, namespace, values, option_string=None):
|
||||
for value in values:
|
||||
if value not in getattr(namespace, self.dest):
|
||||
getattr(namespace, self.dest).append(value)
|
||||
|
||||
|
||||
class SetupError(ValueError):
|
||||
pass
|
||||
class ArchivingOrchestrator:
|
||||
|
||||
# instance variables
|
||||
@@ -163,7 +99,7 @@ class ArchivingOrchestrator:
|
||||
# TODO: BUG** - basic_config won't have steps in it, since these args aren't added to 'basic_parser'
|
||||
# but should we add them? Or should we just add them to the 'complete' parser?
|
||||
|
||||
if yaml_config != EMPTY_CONFIG:
|
||||
if is_valid_config(yaml_config):
|
||||
# only load the modules enabled in config
|
||||
# TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty?
|
||||
enabled_modules = []
|
||||
@@ -189,7 +125,13 @@ class ArchivingOrchestrator:
|
||||
yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name)
|
||||
else:
|
||||
# load all modules, they're not using the 'simple' mode
|
||||
self.add_individual_module_args(self.module_factory.available_modules(), parser)
|
||||
all_modules = self.module_factory.available_modules()
|
||||
# add all the modules to the steps
|
||||
for module in all_modules:
|
||||
for module_type in module.type:
|
||||
yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name)
|
||||
|
||||
self.add_individual_module_args(all_modules, parser)
|
||||
|
||||
parser.set_defaults(**to_dot_notation(yaml_config))
|
||||
|
||||
@@ -198,6 +140,9 @@ class ArchivingOrchestrator:
|
||||
# merge the new config with the old one
|
||||
config = merge_dicts(vars(parsed), yaml_config)
|
||||
|
||||
# set up the authentication dict as needed
|
||||
config = self.setup_authentication(config)
|
||||
|
||||
# clean out args from the base_parser that we don't want in the config
|
||||
for key in vars(basic_config):
|
||||
config.pop(key, None)
|
||||
@@ -286,14 +231,20 @@ class ArchivingOrchestrator:
|
||||
self.basic_parser.exit()
|
||||
|
||||
def setup_logging(self, config):
|
||||
|
||||
logging_config = config['logging']
|
||||
|
||||
if logging_config.get('enabled', True) is False:
|
||||
# disabled logging settings, they're set on a higher level
|
||||
logger.disable('auto_archiver')
|
||||
return
|
||||
|
||||
# setup loguru logging
|
||||
try:
|
||||
logger.remove(0) # remove the default logger
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
logging_config = config['logging']
|
||||
|
||||
# add other logging info
|
||||
if self.logger_id is None: # note - need direct comparison to None since need to consider falsy value 0
|
||||
self.logger_id = logger.add(sys.stderr, level=logging_config['level'])
|
||||
@@ -312,27 +263,25 @@ class ArchivingOrchestrator:
|
||||
|
||||
step_items = []
|
||||
modules_to_load = modules_by_type[f"{module_type}s"]
|
||||
assert modules_to_load, f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)"
|
||||
if not modules_to_load:
|
||||
raise SetupError(f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)")
|
||||
|
||||
def check_steps_ok():
|
||||
if not len(step_items):
|
||||
logger.error(f"NO {module_type.upper()}S LOADED. Please check your configuration and try again.")
|
||||
if len(modules_to_load):
|
||||
logger.error(f"Tried to load the following modules, but none were available: {modules_to_load}")
|
||||
exit()
|
||||
logger.error(f"Unable to load any {module_type}s. Tried the following, but none were available: {modules_to_load}")
|
||||
raise SetupError(f"NO {module_type.upper()}S LOADED. Please check your configuration and try again.")
|
||||
|
||||
|
||||
if (module_type == 'feeder' or module_type == 'formatter') and len(step_items) > 1:
|
||||
logger.error(f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}")
|
||||
exit()
|
||||
raise SetupError(f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}")
|
||||
|
||||
for module in modules_to_load:
|
||||
if module == 'cli_feeder':
|
||||
# pseudo module, don't load it
|
||||
# cli_feeder is a pseudo module, it just takes the command line args for [URLS]
|
||||
urls = self.config['urls']
|
||||
if not urls:
|
||||
logger.error("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information.")
|
||||
exit()
|
||||
# cli_feeder is a pseudo module, it just takes the command line args
|
||||
raise SetupError("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information.")
|
||||
|
||||
def feed(self) -> Generator[Metadata]:
|
||||
for url in urls:
|
||||
@@ -352,13 +301,14 @@ class ArchivingOrchestrator:
|
||||
|
||||
if module in invalid_modules:
|
||||
continue
|
||||
|
||||
try:
|
||||
loaded_module: BaseModule = self.module_factory.get_module(module, self.config)
|
||||
except (KeyboardInterrupt, Exception) as e:
|
||||
logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
|
||||
if module_type == 'extractor' and loaded_module.name == module:
|
||||
loaded_module.cleanup()
|
||||
exit()
|
||||
raise e
|
||||
|
||||
if not loaded_module:
|
||||
invalid_modules.append(module)
|
||||
@@ -372,7 +322,7 @@ class ArchivingOrchestrator:
|
||||
def load_config(self, config_file: str) -> dict:
|
||||
if not os.path.exists(config_file) and config_file != DEFAULT_CONFIG_FILE:
|
||||
logger.error(f"The configuration file {config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings.")
|
||||
exit()
|
||||
raise FileNotFoundError(f"Configuration file {config_file} not found")
|
||||
|
||||
return read_yaml(config_file)
|
||||
|
||||
@@ -437,8 +387,12 @@ class ArchivingOrchestrator:
|
||||
If you wish to make code invocations yourself, you should use the 'setup' and 'feed' methods separately.
|
||||
To test configurations, without loading any modules you can also first call 'setup_configs'
|
||||
"""
|
||||
self.setup(args)
|
||||
return self.feed()
|
||||
try:
|
||||
self.setup(args)
|
||||
return self.feed()
|
||||
except Exception as e:
|
||||
logger.error(e)
|
||||
exit(1)
|
||||
|
||||
def cleanup(self) -> None:
|
||||
logger.info("Cleaning up")
|
||||
@@ -503,8 +457,8 @@ class ArchivingOrchestrator:
|
||||
|
||||
original_url = result.get_url().strip()
|
||||
try:
|
||||
self.assert_valid_url(original_url)
|
||||
except AssertionError as e:
|
||||
check_url_or_raise(original_url)
|
||||
except ValueError as e:
|
||||
logger.error(f"Error archiving URL {original_url}: {e}")
|
||||
raise e
|
||||
|
||||
@@ -564,26 +518,27 @@ class ArchivingOrchestrator:
|
||||
logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def assert_valid_url(self, url: str) -> bool:
|
||||
def setup_authentication(self, config: dict) -> dict:
|
||||
"""
|
||||
Blocks localhost, private, reserved, and link-local IPs and all non-http/https schemes.
|
||||
Setup authentication for all modules that require it
|
||||
|
||||
Split up strings into multiple sites if they are comma separated
|
||||
"""
|
||||
assert url.startswith("http://") or url.startswith("https://"), f"Invalid URL scheme"
|
||||
|
||||
parsed = urlparse(url)
|
||||
assert parsed.scheme in ["http", "https"], f"Invalid URL scheme"
|
||||
assert parsed.hostname, f"Invalid URL hostname"
|
||||
assert parsed.hostname != "localhost", f"Invalid URL"
|
||||
authentication = config.get('authentication', {})
|
||||
|
||||
try: # special rules for IP addresses
|
||||
ip = ip_address(parsed.hostname)
|
||||
except ValueError: pass
|
||||
else:
|
||||
assert ip.is_global, f"Invalid IP used"
|
||||
assert not ip.is_reserved, f"Invalid IP used"
|
||||
assert not ip.is_link_local, f"Invalid IP used"
|
||||
assert not ip.is_private, f"Invalid IP used"
|
||||
# extract out concatenated sites
|
||||
for key, val in copy(authentication).items():
|
||||
if "," in key:
|
||||
for site in key.split(","):
|
||||
site = site.strip()
|
||||
authentication[site] = val
|
||||
del authentication[key]
|
||||
|
||||
config['authentication'] = authentication
|
||||
return config
|
||||
|
||||
# Helper Properties
|
||||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
# used as validators for config values. Should raise an exception if the value is invalid.
|
||||
from pathlib import Path
|
||||
import argparse
|
||||
import json
|
||||
|
||||
def example_validator(value):
|
||||
if "example" not in value:
|
||||
@@ -16,4 +17,7 @@ def positive_number(value):
|
||||
def valid_file(value):
|
||||
if not Path(value).is_file():
|
||||
raise argparse.ArgumentTypeError(f"File '{value}' does not exist.")
|
||||
return value
|
||||
return value
|
||||
|
||||
def json_loader(cli_val):
|
||||
return json.loads(cli_val)
|
||||
Reference in New Issue
Block a user