mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-11 12:48:28 +03:00
Merge branch 'main' into timestamping_rewrite
This commit is contained in:
@@ -3,7 +3,7 @@ from auto_archiver.core.orchestrator import ArchivingOrchestrator
|
||||
import sys
|
||||
|
||||
def main():
|
||||
ArchivingOrchestrator().run(sys.argv[1:])
|
||||
for _ in ArchivingOrchestrator()._command_line_run(sys.argv[1:]): pass
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
"""
|
||||
from .metadata import Metadata
|
||||
from .media import Media
|
||||
from .module import BaseModule
|
||||
from .base_module import BaseModule
|
||||
|
||||
# cannot import ArchivingOrchestrator/Config to avoid circular dep
|
||||
# from .orchestrator import ArchivingOrchestrator
|
||||
|
||||
@@ -1,13 +1,18 @@
|
||||
|
||||
from urllib.parse import urlparse
|
||||
from typing import Mapping, Any
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Mapping, Any, Type, TYPE_CHECKING
|
||||
from abc import ABC
|
||||
from copy import deepcopy, copy
|
||||
from tempfile import TemporaryDirectory
|
||||
from auto_archiver.utils import url as UrlUtil
|
||||
from auto_archiver.core.consts import MODULE_TYPES as CONF_MODULE_TYPES
|
||||
|
||||
from loguru import logger
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .module import ModuleFactory
|
||||
|
||||
class BaseModule(ABC):
|
||||
|
||||
"""
|
||||
@@ -17,41 +22,24 @@ class BaseModule(ABC):
|
||||
however modules can have a .setup() method to run any setup code
|
||||
(e.g. logging in to a site, spinning up a browser etc.)
|
||||
|
||||
See BaseModule.MODULE_TYPES for the types of modules you can create, noting that
|
||||
See consts.MODULE_TYPES for the types of modules you can create, noting that
|
||||
a subclass can be of multiple types. For example, a module that extracts data from
|
||||
a website and stores it in a database would be both an 'extractor' and a 'database' module.
|
||||
|
||||
Each module is a python package, and should have a __manifest__.py file in the
|
||||
same directory as the module file. The __manifest__.py specifies the module information
|
||||
like name, author, version, dependencies etc. See BaseModule._DEFAULT_MANIFEST for the
|
||||
like name, author, version, dependencies etc. See DEFAULT_MANIFEST for the
|
||||
default manifest structure.
|
||||
|
||||
"""
|
||||
|
||||
MODULE_TYPES = [
|
||||
'feeder',
|
||||
'extractor',
|
||||
'enricher',
|
||||
'database',
|
||||
'storage',
|
||||
'formatter'
|
||||
]
|
||||
|
||||
_DEFAULT_MANIFEST = {
|
||||
'name': '', # the display name of the module
|
||||
'author': 'Bellingcat', # creator of the module, leave this as Bellingcat or set your own name!
|
||||
'type': [], # the type of the module, can be one or more of BaseModule.MODULE_TYPES
|
||||
'requires_setup': True, # whether or not this module requires additional setup such as setting API Keys or installing additional software
|
||||
'description': '', # a description of the module
|
||||
'dependencies': {}, # external dependencies, e.g. python packages or binaries, in dictionary format
|
||||
'entry_point': '', # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName
|
||||
'version': '1.0', # the version of the module
|
||||
'configs': {} # any configuration options this module has, these will be exposed to the user in the config file or via the command line
|
||||
}
|
||||
MODULE_TYPES = CONF_MODULE_TYPES
|
||||
|
||||
# NOTE: these are declared here as class variables, but they are overridden by the instance variables in the __init__ method
|
||||
config: Mapping[str, Any]
|
||||
authentication: Mapping[str, Mapping[str, str]]
|
||||
name: str
|
||||
module_factory: ModuleFactory
|
||||
|
||||
# this is set by the orchestrator prior to archiving
|
||||
tmp_dir: TemporaryDirectory = None
|
||||
@@ -63,12 +51,6 @@ class BaseModule(ABC):
|
||||
def config_setup(self, config: dict):
|
||||
|
||||
authentication = config.get('authentication', {})
|
||||
# extract out concatenated sites
|
||||
for key, val in copy(authentication).items():
|
||||
if "," in key:
|
||||
for site in key.split(","):
|
||||
authentication[site] = val
|
||||
del authentication[key]
|
||||
|
||||
# this is important. Each instance is given its own deepcopied config, so modules cannot
|
||||
# change values to affect other modules
|
||||
@@ -89,16 +71,21 @@ class BaseModule(ABC):
|
||||
Returns the authentication information for a given site. This is used to authenticate
|
||||
with a site before extracting data. The site should be the domain of the site, e.g. 'twitter.com'
|
||||
|
||||
extract_cookies: bool - whether or not to extract cookies from the given browser and return the
|
||||
cookie jar (disabling can speed up) processing if you don't actually need the cookies jar
|
||||
:param site: the domain of the site to get authentication information for
|
||||
:param extract_cookies: whether or not to extract cookies from the given browser/file and return the cookie jar (disabling can speed up processing if you don't actually need the cookies jar).
|
||||
|
||||
Currently, the dict can have keys of the following types:
|
||||
- username: str - the username to use for login
|
||||
- password: str - the password to use for login
|
||||
- api_key: str - the API key to use for login
|
||||
- api_secret: str - the API secret to use for login
|
||||
- cookie: str - a cookie string to use for login (specific to this site)
|
||||
- cookies_jar: YoutubeDLCookieJar | http.cookiejar.MozillaCookieJar - a cookie jar compatible with requests (e.g. `requests.get(cookies=cookie_jar)`)
|
||||
:returns: authdict dict of login information for the given site
|
||||
|
||||
**Global options:**\n
|
||||
* cookies_from_browser: str - the name of the browser to extract cookies from (e.g. 'chrome', 'firefox'); uses ytdlp under the hood to extract them\n
|
||||
* cookies_file: str - the path to a cookies file to use for login\n
|
||||
|
||||
**Currently, the sites dict can have keys of the following types:**\n
|
||||
* username: str - the username to use for login\n
|
||||
* password: str - the password to use for login\n
|
||||
* api_key: str - the API key to use for login\n
|
||||
* api_secret: str - the API secret to use for login\n
|
||||
* cookie: str - a cookie string to use for login (specific to this site)\n
|
||||
"""
|
||||
# TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com)
|
||||
# for now the user must enter them both, like "x.com,twitter.com" in their config. Maybe we just hard-code?
|
||||
|
||||
@@ -11,7 +11,7 @@ from ruamel.yaml import YAML, CommentedMap, add_representer
|
||||
from loguru import logger
|
||||
|
||||
from copy import deepcopy
|
||||
from .module import BaseModule
|
||||
from auto_archiver.core.consts import MODULE_TYPES
|
||||
|
||||
from typing import Any, List, Type, Tuple
|
||||
|
||||
@@ -21,7 +21,7 @@ EMPTY_CONFIG = _yaml.load("""
|
||||
# Auto Archiver Configuration
|
||||
# Steps are the modules that will be run in the order they are defined
|
||||
|
||||
steps:""" + "".join([f"\n {module}s: []" for module in BaseModule.MODULE_TYPES]) + \
|
||||
steps:""" + "".join([f"\n {module}s: []" for module in MODULE_TYPES]) + \
|
||||
"""
|
||||
|
||||
# Global configuration
|
||||
@@ -48,6 +48,7 @@ authentication: {}
|
||||
|
||||
logging:
|
||||
level: INFO
|
||||
|
||||
""")
|
||||
# note: 'logging' is explicitly added above in order to better format the config file
|
||||
|
||||
@@ -128,6 +129,11 @@ def merge_dicts(dotdict: dict, yaml_dict: CommentedMap) -> CommentedMap:
|
||||
yaml_subdict[key] = value
|
||||
continue
|
||||
|
||||
if key == 'steps':
|
||||
for module_type, modules in value.items():
|
||||
# overwrite the 'steps' from the config file with the ones from the CLI
|
||||
yaml_subdict[key][module_type] = modules
|
||||
|
||||
if is_dict_type(value):
|
||||
update_dict(value, yaml_subdict[key])
|
||||
elif is_list_type(value):
|
||||
@@ -136,7 +142,6 @@ def merge_dicts(dotdict: dict, yaml_dict: CommentedMap) -> CommentedMap:
|
||||
yaml_subdict[key] = value
|
||||
|
||||
update_dict(from_dot_notation(dotdict), yaml_dict)
|
||||
|
||||
return yaml_dict
|
||||
|
||||
def read_yaml(yaml_filename: str) -> CommentedMap:
|
||||
@@ -158,6 +163,11 @@ def read_yaml(yaml_filename: str) -> CommentedMap:
|
||||
def store_yaml(config: CommentedMap, yaml_filename: str) -> None:
|
||||
config_to_save = deepcopy(config)
|
||||
|
||||
auth_dict = config_to_save.get("authentication", {})
|
||||
if auth_dict and auth_dict.get('load_from_file'):
|
||||
# remove all other values from the config, don't want to store it in the config file
|
||||
auth_dict = {"load_from_file": auth_dict["load_from_file"]}
|
||||
|
||||
config_to_save.pop('urls', None)
|
||||
with open(yaml_filename, "w", encoding="utf-8") as outf:
|
||||
_yaml.dump(config_to_save, outf)
|
||||
23
src/auto_archiver/core/consts.py
Normal file
23
src/auto_archiver/core/consts.py
Normal file
@@ -0,0 +1,23 @@
|
||||
|
||||
MODULE_TYPES = [
|
||||
'feeder',
|
||||
'extractor',
|
||||
'enricher',
|
||||
'database',
|
||||
'storage',
|
||||
'formatter'
|
||||
]
|
||||
|
||||
MANIFEST_FILE = "__manifest__.py"
|
||||
|
||||
DEFAULT_MANIFEST = {
|
||||
'name': '', # the display name of the module
|
||||
'author': 'Bellingcat', # creator of the module, leave this as Bellingcat or set your own name!
|
||||
'type': [], # the type of the module, can be one or more of MODULE_TYPES
|
||||
'requires_setup': True, # whether or not this module requires additional setup such as setting API Keys or installing additional software
|
||||
'description': '', # a description of the module
|
||||
'dependencies': {}, # external dependencies, e.g. python packages or binaries, in dictionary format
|
||||
'entry_point': '', # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName
|
||||
'version': '1.0', # the version of the module
|
||||
'configs': {} # any configuration options this module has, these will be exposed to the user in the config file or via the command line
|
||||
}
|
||||
@@ -1,3 +1,8 @@
|
||||
"""
|
||||
Database module for the auto-archiver that defines the interface for implementing database modules
|
||||
in the media archiving framework.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
from abc import abstractmethod
|
||||
from typing import Union
|
||||
@@ -5,6 +10,11 @@ from typing import Union
|
||||
from auto_archiver.core import Metadata, BaseModule
|
||||
|
||||
class Database(BaseModule):
|
||||
"""
|
||||
Base class for implementing database modules in the media archiving framework.
|
||||
|
||||
Subclasses must implement the `fetch` and `done` methods to define platform-specific behavior.
|
||||
"""
|
||||
|
||||
def started(self, item: Metadata) -> None:
|
||||
"""signals the DB that the given item archival has started"""
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
"""
|
||||
Enrichers are modular components that enhance archived content by adding
|
||||
Base module for Enrichers – modular components that enhance archived content by adding
|
||||
context, metadata, or additional processing.
|
||||
|
||||
These add additional information to the context, such as screenshots, hashes, and metadata.
|
||||
@@ -13,7 +13,16 @@ from abc import abstractmethod
|
||||
from auto_archiver.core import Metadata, BaseModule
|
||||
|
||||
class Enricher(BaseModule):
|
||||
"""Base classes and utilities for enrichers in the Auto-Archiver system."""
|
||||
"""Base classes and utilities for enrichers in the Auto-Archiver system.
|
||||
|
||||
Enricher modules must implement the `enrich` method to define their behavior.
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def enrich(self, to_enrich: Metadata) -> None: pass
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
"""
|
||||
Enriches a Metadata object with additional information or context.
|
||||
|
||||
Takes the metadata object to enrich as an argument and modifies it in place, returning None.
|
||||
"""
|
||||
pass
|
||||
|
||||
@@ -29,14 +29,24 @@ class Extractor(BaseModule):
|
||||
valid_url: re.Pattern = None
|
||||
|
||||
def cleanup(self) -> None:
|
||||
# called when extractors are done, or upon errors, cleanup any resources
|
||||
"""
|
||||
Called when extractors are done, or upon errors, cleanup any resources
|
||||
"""
|
||||
pass
|
||||
|
||||
def sanitize_url(self, url: str) -> str:
|
||||
# used to clean unnecessary URL parameters OR unfurl redirect links
|
||||
"""
|
||||
Used to clean unnecessary URL parameters OR unfurl redirect links
|
||||
"""
|
||||
return url
|
||||
|
||||
def match_link(self, url: str) -> re.Match:
|
||||
"""
|
||||
Returns a match object if the given URL matches the valid_url pattern or False/None if not.
|
||||
|
||||
Normally used in the `suitable` method to check if the URL is supported by this extractor.
|
||||
|
||||
"""
|
||||
return self.valid_url.match(url)
|
||||
|
||||
def suitable(self, url: str) -> bool:
|
||||
@@ -80,8 +90,8 @@ class Extractor(BaseModule):
|
||||
d.raise_for_status()
|
||||
|
||||
# get mimetype from the response headers
|
||||
if not Path(to_filename).suffix:
|
||||
content_type = d.headers.get('Content-Type')
|
||||
if not mimetypes.guess_type(to_filename)[0]:
|
||||
content_type = d.headers.get('Content-Type') or self._guess_file_type(url)
|
||||
extension = mimetypes.guess_extension(content_type)
|
||||
if extension:
|
||||
to_filename += extension
|
||||
|
||||
@@ -1,3 +1,7 @@
|
||||
"""
|
||||
The feeder base module defines the interface for implementing feeders in the media archiving framework.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
from abc import abstractmethod
|
||||
from auto_archiver.core import Metadata
|
||||
@@ -5,5 +9,17 @@ from auto_archiver.core import BaseModule
|
||||
|
||||
class Feeder(BaseModule):
|
||||
|
||||
"""
|
||||
Base class for implementing feeders in the media archiving framework.
|
||||
|
||||
Subclasses must implement the `__iter__` method to define platform-specific behavior.
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def __iter__(self) -> Metadata: return None
|
||||
def __iter__(self) -> Metadata:
|
||||
"""
|
||||
Returns an iterator (use `yield`) over the items to be archived.
|
||||
|
||||
These should be instances of Metadata, typically created with Metadata().set_url(url).
|
||||
"""
|
||||
return None
|
||||
@@ -1,9 +1,24 @@
|
||||
"""
|
||||
Base module for formatters – modular components that format metadata into media objects for storage.
|
||||
|
||||
The most commonly used formatter is the HTML formatter, which takes metadata and formats it into an HTML file for storage.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
from abc import abstractmethod
|
||||
from auto_archiver.core import Metadata, Media, BaseModule
|
||||
|
||||
|
||||
class Formatter(BaseModule):
|
||||
"""
|
||||
Base class for implementing formatters in the media archiving framework.
|
||||
|
||||
Subclasses must implement the `format` method to define their behavior.
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def format(self, item: Metadata) -> Media: return None
|
||||
def format(self, item: Metadata) -> Media:
|
||||
"""
|
||||
Formats a Metadata object into a user-viewable format (e.g. HTML) and stores it if needed.
|
||||
"""
|
||||
return None
|
||||
@@ -6,7 +6,7 @@ by handling user configuration, validating the steps properties, and implementin
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import List
|
||||
from typing import List, TYPE_CHECKING
|
||||
import shutil
|
||||
import ast
|
||||
import copy
|
||||
@@ -16,99 +16,113 @@ import os
|
||||
from os.path import join
|
||||
from loguru import logger
|
||||
import auto_archiver
|
||||
from .base_module import BaseModule
|
||||
from auto_archiver.core.consts import DEFAULT_MANIFEST, MANIFEST_FILE
|
||||
|
||||
_LAZY_LOADED_MODULES = {}
|
||||
|
||||
MANIFEST_FILE = "__manifest__.py"
|
||||
if TYPE_CHECKING:
|
||||
from .base_module import BaseModule
|
||||
|
||||
|
||||
def setup_paths(paths: list[str]) -> None:
|
||||
"""
|
||||
Sets up the paths for the modules to be loaded from
|
||||
|
||||
This is necessary for the modules to be imported correctly
|
||||
|
||||
"""
|
||||
for path in paths:
|
||||
# check path exists, if it doesn't, log a warning
|
||||
if not os.path.exists(path):
|
||||
logger.warning(f"Path '{path}' does not exist. Skipping...")
|
||||
continue
|
||||
HAS_SETUP_PATHS = False
|
||||
|
||||
# see odoo/module/module.py -> initialize_sys_path
|
||||
if path not in auto_archiver.modules.__path__:
|
||||
auto_archiver.modules.__path__.append(path)
|
||||
class ModuleFactory:
|
||||
|
||||
# sort by path length in descending order (reverse=True), so that the longest path is first in the list
|
||||
auto_archiver.modules.__path__ = sorted(auto_archiver.modules.__path__, key=len, reverse=True)
|
||||
def __init__(self):
|
||||
self._lazy_modules = {}
|
||||
|
||||
def get_module(module_name: str, config: dict) -> BaseModule:
|
||||
"""
|
||||
Gets and sets up a module using the provided config
|
||||
|
||||
This will actually load and instantiate the module, and load all its dependencies (i.e. not lazy)
|
||||
|
||||
"""
|
||||
return get_module_lazy(module_name).load(config)
|
||||
def setup_paths(self, paths: list[str]) -> None:
|
||||
"""
|
||||
Sets up the paths for the modules to be loaded from
|
||||
|
||||
This is necessary for the modules to be imported correctly
|
||||
|
||||
"""
|
||||
global HAS_SETUP_PATHS
|
||||
|
||||
def get_module_lazy(module_name: str, suppress_warnings: bool = False) -> LazyBaseModule:
|
||||
"""
|
||||
Lazily loads a module, returning a LazyBaseModule
|
||||
|
||||
This has all the information about the module, but does not load the module itself or its dependencies
|
||||
|
||||
To load an actual module, call .load(config) on a lazy module
|
||||
|
||||
"""
|
||||
if module_name in _LAZY_LOADED_MODULES:
|
||||
return _LAZY_LOADED_MODULES[module_name]
|
||||
|
||||
available = available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings)
|
||||
if not available:
|
||||
raise IndexError(f"Module '{module_name}' not found. Are you sure it's installed/exists?")
|
||||
return available[0]
|
||||
|
||||
def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]:
|
||||
|
||||
# search through all valid 'modules' paths. Default is 'modules' in the current directory
|
||||
|
||||
# see odoo/modules/module.py -> get_modules
|
||||
def is_really_module(module_path):
|
||||
if os.path.isfile(join(module_path, MANIFEST_FILE)):
|
||||
return True
|
||||
|
||||
all_modules = []
|
||||
|
||||
for module_folder in auto_archiver.modules.__path__:
|
||||
# walk through each module in module_folder and check if it has a valid manifest
|
||||
try:
|
||||
possible_modules = os.listdir(module_folder)
|
||||
except FileNotFoundError:
|
||||
logger.warning(f"Module folder {module_folder} does not exist")
|
||||
continue
|
||||
|
||||
for possible_module in possible_modules:
|
||||
if limit_to_modules and possible_module not in limit_to_modules:
|
||||
for path in paths:
|
||||
# check path exists, if it doesn't, log a warning
|
||||
if not os.path.exists(path):
|
||||
logger.warning(f"Path '{path}' does not exist. Skipping...")
|
||||
continue
|
||||
|
||||
possible_module_path = join(module_folder, possible_module)
|
||||
if not is_really_module(possible_module_path):
|
||||
# see odoo/module/module.py -> initialize_sys_path
|
||||
if path not in auto_archiver.modules.__path__:
|
||||
if HAS_SETUP_PATHS == True:
|
||||
logger.warning(f"You are attempting to re-initialise the module paths with: '{path}' for a 2nd time. \
|
||||
This could lead to unexpected behaviour. It is recommended to only use a single modules path. \
|
||||
If you wish to load modules from different paths then load a 2nd python interpreter (e.g. using multiprocessing).")
|
||||
auto_archiver.modules.__path__.append(path)
|
||||
|
||||
# sort by path length in descending order (reverse=True), so that the longest path is first in the list
|
||||
auto_archiver.modules.__path__ = sorted(auto_archiver.modules.__path__, key=len, reverse=True)
|
||||
|
||||
HAS_SETUP_PATHS = True
|
||||
|
||||
def get_module(self, module_name: str, config: dict) -> BaseModule:
|
||||
"""
|
||||
Gets and sets up a module using the provided config
|
||||
|
||||
This will actually load and instantiate the module, and load all its dependencies (i.e. not lazy)
|
||||
|
||||
"""
|
||||
return self.get_module_lazy(module_name).load(config)
|
||||
|
||||
def get_module_lazy(self, module_name: str, suppress_warnings: bool = False) -> LazyBaseModule:
|
||||
"""
|
||||
Lazily loads a module, returning a LazyBaseModule
|
||||
|
||||
This has all the information about the module, but does not load the module itself or its dependencies
|
||||
|
||||
To load an actual module, call .load(config) on a lazy module
|
||||
|
||||
"""
|
||||
if module_name in self._lazy_modules:
|
||||
return self._lazy_modules[module_name]
|
||||
|
||||
available = self.available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings)
|
||||
if not available:
|
||||
raise IndexError(f"Module '{module_name}' not found. Are you sure it's installed/exists?")
|
||||
return available[0]
|
||||
|
||||
def available_modules(self, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]:
|
||||
|
||||
# search through all valid 'modules' paths. Default is 'modules' in the current directory
|
||||
|
||||
# see odoo/modules/module.py -> get_modules
|
||||
def is_really_module(module_path):
|
||||
if os.path.isfile(join(module_path, MANIFEST_FILE)):
|
||||
return True
|
||||
|
||||
all_modules = []
|
||||
|
||||
for module_folder in auto_archiver.modules.__path__:
|
||||
# walk through each module in module_folder and check if it has a valid manifest
|
||||
try:
|
||||
possible_modules = os.listdir(module_folder)
|
||||
except FileNotFoundError:
|
||||
logger.warning(f"Module folder {module_folder} does not exist")
|
||||
continue
|
||||
if _LAZY_LOADED_MODULES.get(possible_module):
|
||||
continue
|
||||
lazy_module = LazyBaseModule(possible_module, possible_module_path)
|
||||
|
||||
_LAZY_LOADED_MODULES[possible_module] = lazy_module
|
||||
for possible_module in possible_modules:
|
||||
if limit_to_modules and possible_module not in limit_to_modules:
|
||||
continue
|
||||
|
||||
all_modules.append(lazy_module)
|
||||
|
||||
if not suppress_warnings:
|
||||
for module in limit_to_modules:
|
||||
if not any(module == m.name for m in all_modules):
|
||||
logger.warning(f"Module '{module}' not found. Are you sure it's installed?")
|
||||
possible_module_path = join(module_folder, possible_module)
|
||||
if not is_really_module(possible_module_path):
|
||||
continue
|
||||
if self._lazy_modules.get(possible_module):
|
||||
continue
|
||||
lazy_module = LazyBaseModule(possible_module, possible_module_path, factory=self)
|
||||
|
||||
return all_modules
|
||||
self._lazy_modules[possible_module] = lazy_module
|
||||
|
||||
all_modules.append(lazy_module)
|
||||
|
||||
if not suppress_warnings:
|
||||
for module in limit_to_modules:
|
||||
if not any(module == m.name for m in all_modules):
|
||||
logger.warning(f"Module '{module}' not found. Are you sure it's installed?")
|
||||
|
||||
return all_modules
|
||||
|
||||
@dataclass
|
||||
class LazyBaseModule:
|
||||
@@ -123,14 +137,16 @@ class LazyBaseModule:
|
||||
type: list
|
||||
description: str
|
||||
path: str
|
||||
module_factory: ModuleFactory
|
||||
|
||||
_manifest: dict = None
|
||||
_instance: BaseModule = None
|
||||
_entry_point: str = None
|
||||
|
||||
def __init__(self, module_name, path):
|
||||
def __init__(self, module_name, path, factory: ModuleFactory):
|
||||
self.name = module_name
|
||||
self.path = path
|
||||
self.module_factory = factory
|
||||
|
||||
@property
|
||||
def entry_point(self):
|
||||
@@ -161,7 +177,7 @@ class LazyBaseModule:
|
||||
return self._manifest
|
||||
# print(f"Loading manifest for module {module_path}")
|
||||
# load the manifest file
|
||||
manifest = copy.deepcopy(BaseModule._DEFAULT_MANIFEST)
|
||||
manifest = copy.deepcopy(DEFAULT_MANIFEST)
|
||||
|
||||
with open(join(self.path, MANIFEST_FILE)) as f:
|
||||
try:
|
||||
@@ -189,13 +205,14 @@ class LazyBaseModule:
|
||||
# clear out any empty strings that a user may have erroneously added
|
||||
continue
|
||||
if not check(dep):
|
||||
logger.error(f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. Have you installed the required dependencies for the '{self.name}' module? See the README for more information.")
|
||||
logger.error(f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. \
|
||||
Have you installed the required dependencies for the '{self.name}' module? See the README for more information.")
|
||||
exit(1)
|
||||
|
||||
def check_python_dep(dep):
|
||||
# first check if it's a module:
|
||||
try:
|
||||
m = get_module_lazy(dep, suppress_warnings=True)
|
||||
m = self.module_factory.get_module_lazy(dep, suppress_warnings=True)
|
||||
try:
|
||||
# we must now load this module and set it up with the config
|
||||
m.load(config)
|
||||
@@ -230,19 +247,21 @@ class LazyBaseModule:
|
||||
__import__(f'{qualname}.{file_name}', fromlist=[self.entry_point])
|
||||
# finally, get the class instance
|
||||
instance: BaseModule = getattr(sys.modules[sub_qualname], class_name)()
|
||||
if not getattr(instance, 'name', None):
|
||||
instance.name = self.name
|
||||
|
||||
if not getattr(instance, 'display_name', None):
|
||||
instance.display_name = self.display_name
|
||||
|
||||
self._instance = instance
|
||||
|
||||
# set the name, display name and module factory
|
||||
instance.name = self.name
|
||||
instance.display_name = self.display_name
|
||||
instance.module_factory = self.module_factory
|
||||
|
||||
# merge the default config with the user config
|
||||
default_config = dict((k, v['default']) for k, v in self.configs.items() if v.get('default'))
|
||||
|
||||
config[self.name] = default_config | config.get(self.name, {})
|
||||
instance.config_setup(config)
|
||||
instance.setup()
|
||||
|
||||
# save the instance for future easy loading
|
||||
self._instance = instance
|
||||
return instance
|
||||
|
||||
def __repr__(self):
|
||||
|
||||
@@ -5,9 +5,10 @@
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Generator, Union, List, Type
|
||||
from typing import Generator, Union, List, Type, TYPE_CHECKING
|
||||
from urllib.parse import urlparse
|
||||
from ipaddress import ip_address
|
||||
from copy import copy
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
@@ -21,15 +22,18 @@ from rich_argparse import RichHelpFormatter
|
||||
from .metadata import Metadata, Media
|
||||
from auto_archiver.version import __version__
|
||||
from .config import _yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser
|
||||
from .module import available_modules, LazyBaseModule, get_module, setup_paths
|
||||
from .module import ModuleFactory, LazyBaseModule
|
||||
from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher
|
||||
from .module import BaseModule
|
||||
|
||||
from .consts import MODULE_TYPES
|
||||
from loguru import logger
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .base_module import BaseModule
|
||||
from .module import LazyBaseModule
|
||||
|
||||
DEFAULT_CONFIG_FILE = "orchestration.yaml"
|
||||
|
||||
|
||||
class JsonParseAction(argparse.Action):
|
||||
def __call__(self, parser, namespace, values, option_string=None):
|
||||
try:
|
||||
@@ -42,51 +46,85 @@ class AuthenticationJsonParseAction(JsonParseAction):
|
||||
def __call__(self, parser, namespace, values, option_string=None):
|
||||
super().__call__(parser, namespace, values, option_string)
|
||||
auth_dict = getattr(namespace, self.dest)
|
||||
if isinstance(auth_dict, str):
|
||||
# if it's a string
|
||||
|
||||
def load_from_file(path):
|
||||
try:
|
||||
with open(auth_dict, 'r') as f:
|
||||
with open(path, 'r') as f:
|
||||
try:
|
||||
auth_dict = json.load(f)
|
||||
except json.JSONDecodeError:
|
||||
f.seek(0)
|
||||
# maybe it's yaml, try that
|
||||
auth_dict = _yaml.load(f)
|
||||
if auth_dict.get('authentication'):
|
||||
auth_dict = auth_dict['authentication']
|
||||
auth_dict['load_from_file'] = path
|
||||
return auth_dict
|
||||
except:
|
||||
pass
|
||||
return None
|
||||
|
||||
if isinstance(auth_dict, dict) and auth_dict.get('from_file'):
|
||||
auth_dict = load_from_file(auth_dict['from_file'])
|
||||
elif isinstance(auth_dict, str):
|
||||
# if it's a string
|
||||
auth_dict = load_from_file(auth_dict)
|
||||
|
||||
if not isinstance(auth_dict, dict):
|
||||
raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
|
||||
for site, auth in auth_dict.items():
|
||||
if not isinstance(site, str) or not isinstance(auth, dict):
|
||||
raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
|
||||
global_options = ['cookies_from_browser', 'cookies_file', 'load_from_file']
|
||||
for key, auth in auth_dict.items():
|
||||
if key in global_options:
|
||||
continue
|
||||
if not isinstance(key, str) or not isinstance(auth, dict):
|
||||
raise argparse.ArgumentTypeError(f"Authentication must be a dictionary of site names and their authentication methods. Valid global configs are {global_options}")
|
||||
|
||||
# extract out concatenated sites
|
||||
for key, val in copy(auth_dict).items():
|
||||
if "," in key:
|
||||
for site in key.split(","):
|
||||
auth_dict[site] = val
|
||||
del auth_dict[key]
|
||||
|
||||
setattr(namespace, self.dest, auth_dict)
|
||||
|
||||
|
||||
class UniqueAppendAction(argparse.Action):
|
||||
def __call__(self, parser, namespace, values, option_string=None):
|
||||
if not hasattr(namespace, self.dest):
|
||||
setattr(namespace, self.dest, [])
|
||||
for value in values:
|
||||
if value not in getattr(namespace, self.dest):
|
||||
getattr(namespace, self.dest).append(value)
|
||||
|
||||
|
||||
class ArchivingOrchestrator:
|
||||
|
||||
# instance variables
|
||||
module_factory: ModuleFactory
|
||||
setup_finished: bool
|
||||
logger_id: int
|
||||
|
||||
# instance variables, used for convenience to access modules by step
|
||||
feeders: List[Type[Feeder]]
|
||||
extractors: List[Type[Extractor]]
|
||||
enrichers: List[Type[Enricher]]
|
||||
databases: List[Type[Database]]
|
||||
storages: List[Type[Storage]]
|
||||
formatters: List[Type[Formatter]]
|
||||
|
||||
|
||||
def __init__(self):
|
||||
self.module_factory = ModuleFactory()
|
||||
self.setup_finished = False
|
||||
self.logger_id = None
|
||||
|
||||
def setup_basic_parser(self):
|
||||
parser = argparse.ArgumentParser(
|
||||
prog="auto-archiver",
|
||||
add_help=False,
|
||||
description="""
|
||||
prog="auto-archiver",
|
||||
add_help=False,
|
||||
description="""
|
||||
Auto Archiver is a CLI tool to archive media/metadata from online URLs;
|
||||
it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!
|
||||
""",
|
||||
epilog="Check the code at https://github.com/bellingcat/auto-archiver",
|
||||
formatter_class=RichHelpFormatter,
|
||||
epilog="Check the code at https://github.com/bellingcat/auto-archiver",
|
||||
formatter_class=RichHelpFormatter,
|
||||
)
|
||||
parser.add_argument('--help', '-h', action='store_true', dest='help', help='show a full help message and exit')
|
||||
parser.add_argument('--version', action='version', version=__version__)
|
||||
@@ -100,101 +138,115 @@ class ArchivingOrchestrator:
|
||||
return parser
|
||||
|
||||
def setup_complete_parser(self, basic_config: dict, yaml_config: dict, unused_args: list[str]) -> None:
|
||||
|
||||
|
||||
# modules parser to get the overridden 'steps' values
|
||||
modules_parser = argparse.ArgumentParser(
|
||||
add_help=False,
|
||||
)
|
||||
self.add_modules_args(modules_parser)
|
||||
cli_modules, unused_args = modules_parser.parse_known_args(unused_args)
|
||||
for module_type in MODULE_TYPES:
|
||||
yaml_config['steps'][f"{module_type}s"] = getattr(cli_modules, f"{module_type}s", []) or yaml_config['steps'].get(f"{module_type}s", [])
|
||||
|
||||
parser = DefaultValidatingParser(
|
||||
add_help=False,
|
||||
)
|
||||
self.add_additional_args(parser)
|
||||
|
||||
# merge command line module args (--feeders, --enrichers etc.) and add them to the config
|
||||
|
||||
# check what mode we're in
|
||||
# if we have a config file, use that to decide which modules to load
|
||||
# if simple, we'll load just the modules that has requires_setup = False
|
||||
# if full, we'll load all modules
|
||||
# TODO: BUG** - basic_config won't have steps in it, since these args aren't added to 'basic_parser'
|
||||
# but should we add them? Or should we just add them to the 'complete' parser?
|
||||
|
||||
if yaml_config != EMPTY_CONFIG:
|
||||
# only load the modules enabled in config
|
||||
# TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty?
|
||||
enabled_modules = []
|
||||
# first loads the modules from the config file, then from the command line
|
||||
for config in [yaml_config['steps'], basic_config.__dict__]:
|
||||
for module_type in BaseModule.MODULE_TYPES:
|
||||
enabled_modules.extend(config.get(f"{module_type}s", []))
|
||||
for module_type in MODULE_TYPES:
|
||||
enabled_modules.extend(yaml_config['steps'].get(f"{module_type}s", []))
|
||||
|
||||
# clear out duplicates, but keep the order
|
||||
enabled_modules = list(dict.fromkeys(enabled_modules))
|
||||
avail_modules = available_modules(with_manifest=True, limit_to_modules=enabled_modules, suppress_warnings=True)
|
||||
self.add_module_args(avail_modules, parser)
|
||||
avail_modules = self.module_factory.available_modules(limit_to_modules=enabled_modules, suppress_warnings=True)
|
||||
self.add_individual_module_args(avail_modules, parser)
|
||||
elif basic_config.mode == 'simple':
|
||||
simple_modules = [module for module in available_modules(with_manifest=True) if not module.requires_setup]
|
||||
self.add_module_args(simple_modules, parser)
|
||||
simple_modules = [module for module in self.module_factory.available_modules() if not module.requires_setup]
|
||||
self.add_individual_module_args(simple_modules, parser)
|
||||
|
||||
# for simple mode, we use the cli_feeder and any modules that don't require setup
|
||||
yaml_config['steps']['feeders'] = ['cli_feeder']
|
||||
|
||||
if not yaml_config['steps']['feeders']:
|
||||
yaml_config['steps']['feeders'] = ['cli_feeder']
|
||||
|
||||
# add them to the config
|
||||
for module in simple_modules:
|
||||
for module_type in module.type:
|
||||
yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name)
|
||||
else:
|
||||
# load all modules, they're not using the 'simple' mode
|
||||
self.add_module_args(available_modules(with_manifest=True), parser)
|
||||
|
||||
self.add_individual_module_args(self.module_factory.available_modules(), parser)
|
||||
|
||||
parser.set_defaults(**to_dot_notation(yaml_config))
|
||||
|
||||
# reload the parser with the new arguments, now that we have them
|
||||
parsed, unknown = parser.parse_known_args(unused_args)
|
||||
|
||||
# merge the new config with the old one
|
||||
self.config = merge_dicts(vars(parsed), yaml_config)
|
||||
config = merge_dicts(vars(parsed), yaml_config)
|
||||
|
||||
# clean out args from the base_parser that we don't want in the config
|
||||
for key in vars(basic_config):
|
||||
self.config.pop(key, None)
|
||||
config.pop(key, None)
|
||||
|
||||
# setup the logging
|
||||
self.setup_logging()
|
||||
self.setup_logging(config)
|
||||
|
||||
if unknown:
|
||||
logger.warning(f"Ignoring unknown/unused arguments: {unknown}\nPerhaps you don't have this module enabled?")
|
||||
|
||||
if (self.config != yaml_config and basic_config.store) or not os.path.isfile(basic_config.config_file):
|
||||
|
||||
if (config != yaml_config and basic_config.store) or not os.path.isfile(basic_config.config_file):
|
||||
logger.info(f"Storing configuration file to {basic_config.config_file}")
|
||||
store_yaml(self.config, basic_config.config_file)
|
||||
|
||||
return self.config
|
||||
store_yaml(config, basic_config.config_file)
|
||||
|
||||
return config
|
||||
|
||||
def add_modules_args(self, parser: argparse.ArgumentParser = None):
    """Register one ``--<module_type>s`` option per entry of ``MODULE_TYPES``.

    Each option accepts one or more module names and collects them without
    duplicates through ``UniqueAppendAction``.

    :param parser: the parser to extend; when falsy, ``self.parser`` is used.
    """
    # NOTE(review): `self.parser` is not assigned anywhere in the visible
    # code - confirm it exists before relying on the fallback
    target = parser if parser else self.parser

    # Module loading from the command line
    for module_type in MODULE_TYPES:
        target.add_argument(
            f'--{module_type}s',
            dest=f'{module_type}s',
            nargs='+',
            help=f'the {module_type}s to use',
            default=[],
            action=UniqueAppendAction,
        )
|
||||
|
||||
def add_additional_args(self, parser: argparse.ArgumentParser = None):
|
||||
if not parser:
|
||||
parser = self.parser
|
||||
|
||||
|
||||
# allow passing URLs directly on the command line
|
||||
parser.add_argument('urls', nargs='*', default=[], help='URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml')
|
||||
|
||||
parser.add_argument('--feeders', dest='steps.feeders', nargs='+', default=['cli_feeder'], help='the feeders to use', action=UniqueAppendAction)
|
||||
parser.add_argument('--enrichers', dest='steps.enrichers', nargs='+', help='the enrichers to use', action=UniqueAppendAction)
|
||||
parser.add_argument('--extractors', dest='steps.extractors', nargs='+', help='the extractors to use', action=UniqueAppendAction)
|
||||
parser.add_argument('--databases', dest='steps.databases', nargs='+', help='the databases to use', action=UniqueAppendAction)
|
||||
parser.add_argument('--storages', dest='steps.storages', nargs='+', help='the storages to use', action=UniqueAppendAction)
|
||||
parser.add_argument('--formatters', dest='steps.formatters', nargs='+', help='the formatter to use', action=UniqueAppendAction)
|
||||
|
||||
parser.add_argument('--authentication', dest='authentication', help='A dictionary of sites and their authentication methods \
|
||||
(token, username etc.) that extractors can use to log into \
|
||||
a website. If passing this on the command line, use a JSON string. \
|
||||
You may also pass a path to a valid JSON/YAML file which will be parsed.',\
|
||||
You may also pass a path to a valid JSON/YAML file which will be parsed.',
|
||||
default={},
|
||||
nargs="?",
|
||||
action=AuthenticationJsonParseAction)
|
||||
|
||||
# logging arguments
|
||||
parser.add_argument('--logging.level', action='store', dest='logging.level', choices=['INFO', 'DEBUG', 'ERROR', 'WARNING'], help='the logging level to use', default='INFO', type=str.upper)
|
||||
parser.add_argument('--logging.file', action='store', dest='logging.file', help='the logging file to write to', default=None)
|
||||
parser.add_argument('--logging.rotation', action='store', dest='logging.rotation', help='the logging rotation to use', default=None)
|
||||
|
||||
|
||||
def add_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None:
|
||||
def add_individual_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None:
|
||||
|
||||
if not modules:
|
||||
modules = available_modules(with_manifest=True)
|
||||
|
||||
module: LazyBaseModule
|
||||
modules = self.module_factory.available_modules()
|
||||
|
||||
for module in modules:
|
||||
|
||||
if not module.configs:
|
||||
@@ -224,21 +276,29 @@ class ArchivingOrchestrator:
|
||||
arg.should_store = should_store
|
||||
|
||||
def show_help(self, basic_config: dict):
|
||||
# for the help message, we want to load *all* possible modules and show the help
|
||||
# add configs as arg parser arguments
|
||||
|
||||
# for the help message, we want to load manifests from *all* possible modules and show their help/settings
|
||||
# add configs as arg parser arguments
|
||||
|
||||
self.add_modules_args(self.basic_parser)
|
||||
self.add_additional_args(self.basic_parser)
|
||||
self.add_module_args(parser=self.basic_parser)
|
||||
self.add_individual_module_args(parser=self.basic_parser)
|
||||
self.basic_parser.print_help()
|
||||
self.basic_parser.exit()
|
||||
|
||||
def setup_logging(self):
|
||||
|
||||
def setup_logging(self, config):
|
||||
# setup loguru logging
|
||||
logger.remove(0) # remove the default logger
|
||||
logging_config = self.config['logging']
|
||||
logger.add(sys.stderr, level=logging_config['level'])
|
||||
if log_file := logging_config['file']:
|
||||
logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation'])
|
||||
try:
|
||||
logger.remove(0) # remove the default logger
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
logging_config = config['logging']
|
||||
|
||||
# add other logging info
|
||||
if self.logger_id is None: # note - need direct comparison to None since need to consider falsy value 0
|
||||
self.logger_id = logger.add(sys.stderr, level=logging_config['level'])
|
||||
if log_file := logging_config['file']:
|
||||
logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation'])
|
||||
|
||||
def install_modules(self, modules_by_type):
|
||||
"""
|
||||
@@ -246,9 +306,9 @@ class ArchivingOrchestrator:
|
||||
orchestrator's attributes (self.feeders, self.extractors etc.). If no modules of a certain type
|
||||
are loaded, the program will exit with an error message.
|
||||
"""
|
||||
|
||||
|
||||
invalid_modules = []
|
||||
for module_type in BaseModule.MODULE_TYPES:
|
||||
for module_type in MODULE_TYPES:
|
||||
|
||||
step_items = []
|
||||
modules_to_load = modules_by_type[f"{module_type}s"]
|
||||
@@ -273,6 +333,7 @@ class ArchivingOrchestrator:
|
||||
logger.error("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information.")
|
||||
exit()
|
||||
# cli_feeder is a pseudo module, it just takes the command line args
|
||||
|
||||
def feed(self) -> Generator[Metadata]:
|
||||
for url in urls:
|
||||
logger.debug(f"Processing URL: '{url}'")
|
||||
@@ -284,7 +345,6 @@ class ArchivingOrchestrator:
|
||||
'__iter__': feed
|
||||
|
||||
})()
|
||||
|
||||
|
||||
pseudo_module.__iter__ = feed
|
||||
step_items.append(pseudo_module)
|
||||
@@ -293,7 +353,7 @@ class ArchivingOrchestrator:
|
||||
if module in invalid_modules:
|
||||
continue
|
||||
try:
|
||||
loaded_module: BaseModule = get_module(module, self.config)
|
||||
loaded_module: BaseModule = self.module_factory.get_module(module, self.config)
|
||||
except (KeyboardInterrupt, Exception) as e:
|
||||
logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
|
||||
if module_type == 'extractor' and loaded_module.name == module:
|
||||
@@ -308,48 +368,85 @@ class ArchivingOrchestrator:
|
||||
|
||||
check_steps_ok()
|
||||
setattr(self, f"{module_type}s", step_items)
|
||||
|
||||
|
||||
def load_config(self, config_file: str) -> dict:
    """Read the YAML configuration found at ``config_file``.

    A missing file is tolerated only when the path is ``DEFAULT_CONFIG_FILE``;
    the path is then still handed to ``read_yaml``. Any other missing path is
    reported as an error and aborts the program.

    :param config_file: path of the configuration file to load.
    :return: whatever ``read_yaml`` returns for that path.
    :raises SystemExit: with status 1 if a non-default ``config_file`` is missing.
    """
    if not os.path.exists(config_file) and config_file != DEFAULT_CONFIG_FILE:
        logger.error(f"The configuration file {config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings.")
        # The bare `exit()` helper is meant for the interactive shell (it is
        # only present when `site` is loaded) and ends the process with
        # status 0; raise SystemExit directly so the failure is visible to
        # the calling shell/script.
        raise SystemExit(1)

    return read_yaml(config_file)
|
||||
|
||||
def setup_config(self, args: list) -> dict:
|
||||
"""
|
||||
Sets up the configuration file, merging the default config with the user's config
|
||||
|
||||
This function should only ever be run once.
|
||||
"""
|
||||
|
||||
def run(self, args: list) -> None:
|
||||
|
||||
self.setup_basic_parser()
|
||||
|
||||
# parse the known arguments for now (basically, we want the config file)
|
||||
basic_config, unused_args = self.basic_parser.parse_known_args(args)
|
||||
|
||||
# setup any custom module paths, so they'll show in the help and for arg parsing
|
||||
setup_paths(basic_config.module_paths)
|
||||
self.module_factory.setup_paths(basic_config.module_paths)
|
||||
|
||||
# if help flag was called, then show the help
|
||||
if basic_config.help:
|
||||
self.show_help(basic_config)
|
||||
|
||||
# merge command line --feeder etc. args with what's in the yaml config
|
||||
yaml_config = self.load_config(basic_config.config_file)
|
||||
self.setup_complete_parser(basic_config, yaml_config, unused_args)
|
||||
|
||||
return self.setup_complete_parser(basic_config, yaml_config, unused_args)
|
||||
|
||||
def setup(self, args: list):
|
||||
"""
|
||||
Function to configure all setup of the orchestrator: setup configs and load modules.
|
||||
|
||||
This method should only ever be called once
|
||||
"""
|
||||
|
||||
if self.setup_finished:
|
||||
logger.warning("The `setup_config()` function should only ever be run once. \
|
||||
If you need to re-run the setup, please re-instantiate a new instance of the orchestrator. \
|
||||
For code implementatations, you should call .setup_config() once then you may call .feed() \
|
||||
multiple times to archive multiple URLs.")
|
||||
return
|
||||
|
||||
self.setup_basic_parser()
|
||||
self.config = self.setup_config(args)
|
||||
|
||||
logger.info(f"======== Welcome to the AUTO ARCHIVER ({__version__}) ==========")
|
||||
self.install_modules(self.config['steps'])
|
||||
|
||||
# log out the modules that were loaded
|
||||
for module_type in BaseModule.MODULE_TYPES:
|
||||
for module_type in MODULE_TYPES:
|
||||
logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s")))
|
||||
|
||||
self.setup_finished = True
|
||||
|
||||
for _ in self.feed():
|
||||
pass
|
||||
def _command_line_run(self, args: list) -> Generator[Metadata]:
|
||||
"""
|
||||
This is the main entry point for the orchestrator, when run from the command line.
|
||||
|
||||
def cleanup(self)->None:
|
||||
:param args: list of arguments to pass to the orchestrator - these are the command line args
|
||||
|
||||
You should not call this method from code implementations.
|
||||
|
||||
This method sets up the configuration, loads the modules, and runs the feed.
|
||||
If you wish to make code invocations yourself, you should use the 'setup' and 'feed' methods separately.
|
||||
To test configurations, without loading any modules you can also first call 'setup_configs'
|
||||
"""
|
||||
self.setup(args)
|
||||
return self.feed()
|
||||
|
||||
def cleanup(self) -> None:
    """Log that cleanup has started, then call ``cleanup()`` on every extractor."""
    logger.info("Cleaning up")
    for extractor in self.extractors:
        extractor.cleanup()
|
||||
|
||||
def feed(self) -> Generator[Metadata]:
|
||||
|
||||
|
||||
url_count = 0
|
||||
for feeder in self.feeders:
|
||||
for item in feeder:
|
||||
@@ -393,7 +490,6 @@ class ArchivingOrchestrator:
|
||||
m.tmp_dir = None
|
||||
tmp_dir.cleanup()
|
||||
|
||||
|
||||
def archive(self, result: Metadata) -> Union[Metadata, None]:
|
||||
"""
|
||||
Runs the archiving process for a single URL
|
||||
@@ -440,13 +536,13 @@ class ArchivingOrchestrator:
|
||||
try:
|
||||
result.merge(a.download(result))
|
||||
if result.is_success(): break
|
||||
except Exception as e:
|
||||
except Exception as e:
|
||||
logger.error(f"ERROR archiver {a.name}: {e}: {traceback.format_exc()}")
|
||||
|
||||
# 4 - call enrichers to work with archived content
|
||||
for e in self.enrichers:
|
||||
try: e.enrich(result)
|
||||
except Exception as exc:
|
||||
except Exception as exc:
|
||||
logger.error(f"ERROR enricher {e.name}: {exc}: {traceback.format_exc()}")
|
||||
|
||||
# 5 - store all downloaded/generated media
|
||||
@@ -474,13 +570,13 @@ class ArchivingOrchestrator:
|
||||
Blocks localhost, private, reserved, and link-local IPs and all non-http/https schemes.
|
||||
"""
|
||||
assert url.startswith("http://") or url.startswith("https://"), f"Invalid URL scheme"
|
||||
|
||||
|
||||
parsed = urlparse(url)
|
||||
assert parsed.scheme in ["http", "https"], f"Invalid URL scheme"
|
||||
assert parsed.hostname, f"Invalid URL hostname"
|
||||
assert parsed.hostname != "localhost", f"Invalid URL"
|
||||
|
||||
try: # special rules for IP addresses
|
||||
try: # special rules for IP addresses
|
||||
ip = ip_address(parsed.hostname)
|
||||
except ValueError: pass
|
||||
else:
|
||||
@@ -489,9 +585,8 @@ class ArchivingOrchestrator:
|
||||
assert not ip.is_link_local, f"Invalid IP used"
|
||||
assert not ip.is_private, f"Invalid IP used"
|
||||
|
||||
|
||||
# Helper Properties
|
||||
|
||||
|
||||
@property
|
||||
def all_modules(self) -> List[Type[BaseModule]]:
|
||||
return self.feeders + self.extractors + self.enrichers + self.databases + self.storages + self.formatters
|
||||
return self.feeders + self.extractors + self.enrichers + self.databases + self.storages + self.formatters
|
||||
|
||||
@@ -1,3 +1,7 @@
|
||||
"""
|
||||
Base module for Storage modules – modular components that store media objects in various locations.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
from abc import abstractmethod
|
||||
from typing import IO
|
||||
@@ -10,8 +14,14 @@ from auto_archiver.utils.misc import random_str
|
||||
|
||||
from auto_archiver.core import Media, BaseModule, Metadata
|
||||
from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher
|
||||
from auto_archiver.core.module import get_module
|
||||
|
||||
class Storage(BaseModule):
|
||||
|
||||
"""
|
||||
Base class for implementing storage modules in the media archiving framework.
|
||||
|
||||
Subclasses must implement the `get_cdn_url` and `uploadf` methods to define their behavior.
|
||||
"""
|
||||
|
||||
def store(self, media: Media, url: str, metadata: Metadata=None) -> None:
|
||||
if media.is_stored(in_storage=self):
|
||||
@@ -22,10 +32,18 @@ class Storage(BaseModule):
|
||||
media.add_url(self.get_cdn_url(media))
|
||||
|
||||
@abstractmethod
|
||||
def get_cdn_url(self, media: Media) -> str: pass
|
||||
def get_cdn_url(self, media: Media) -> str:
|
||||
"""
|
||||
Returns the URL of the media object stored in the CDN.
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass
|
||||
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
|
||||
"""
|
||||
Uploads (or saves) a file to the storage service/location.
|
||||
"""
|
||||
pass
|
||||
|
||||
def upload(self, media: Media, **kwargs) -> bool:
|
||||
logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key}')
|
||||
@@ -56,7 +74,7 @@ class Storage(BaseModule):
|
||||
filename = random_str(24)
|
||||
elif filename_generator == "static":
|
||||
# load the hash_enricher module
|
||||
he = get_module(HashEnricher, self.config)
|
||||
he = self.module_factory.get_module(HashEnricher, self.config)
|
||||
hd = he.calculate_hash(media.filename)
|
||||
filename = hd[:24]
|
||||
else:
|
||||
|
||||
@@ -1 +1 @@
|
||||
from atlos_db import AtlosDb
|
||||
from .atlos_db import AtlosDb
|
||||
@@ -11,6 +11,8 @@
|
||||
"api_token": {
|
||||
"default": None,
|
||||
"help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
|
||||
"required": True,
|
||||
"type": "str",
|
||||
},
|
||||
"atlos_url": {
|
||||
"default": "https://platform.atlos.org",
|
||||
|
||||
@@ -1,13 +0,0 @@
|
||||
def get_atlos_config_options():
|
||||
return {
|
||||
"api_token": {
|
||||
"default": None,
|
||||
"help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
|
||||
"type": str
|
||||
},
|
||||
"atlos_url": {
|
||||
"default": "https://platform.atlos.org",
|
||||
"help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
|
||||
"type": str
|
||||
},
|
||||
}
|
||||
1
src/auto_archiver/modules/atlos_storage/__init__.py
Normal file
1
src/auto_archiver/modules/atlos_storage/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
from .atlos_storage import AtlosStorage
|
||||
32
src/auto_archiver/modules/atlos_storage/__manifest__.py
Normal file
32
src/auto_archiver/modules/atlos_storage/__manifest__.py
Normal file
@@ -0,0 +1,32 @@
|
||||
{
|
||||
"name": "Atlos Storage",
|
||||
"type": ["storage"],
|
||||
"requires_setup": True,
|
||||
"dependencies": {
|
||||
"python": ["loguru", "boto3"],
|
||||
"bin": []
|
||||
},
|
||||
"description": """
|
||||
Stores media files in [Atlos](https://www.atlos.org/).
|
||||
|
||||
### Features
|
||||
- Saves media files to Atlos, organizing them into folders based on the provided path structure.
|
||||
|
||||
### Notes
|
||||
- Requires setup with Atlos credentials.
|
||||
- Files are uploaded to the specified `root_folder_id` and organized by the `media.key` structure.
|
||||
""",
|
||||
"configs": {
|
||||
"api_token": {
|
||||
"default": None,
|
||||
"help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
|
||||
"required": True,
|
||||
"type": "str"
|
||||
},
|
||||
"atlos_url": {
|
||||
"default": "https://platform.atlos.org",
|
||||
"help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
|
||||
"type": "str"
|
||||
},
|
||||
}
|
||||
}
|
||||
@@ -32,7 +32,6 @@
|
||||
|
||||
GDriveStorage: A storage module for saving archived content to Google Drive.
|
||||
|
||||
Author: Dave Mateer, (And maintained by: )
|
||||
Source Documentation: https://davemateer.com/2022/04/28/google-drive-with-python
|
||||
|
||||
### Features
|
||||
|
||||
@@ -39,11 +39,11 @@ class Bluesky(GenericDropin):
|
||||
for image_media in image_medias:
|
||||
url = media_url.format(image_media['image']['ref']['$link'], post['author']['did'])
|
||||
image_media = archiver.download_from_url(url)
|
||||
media.append(image_media)
|
||||
media.append(Media(image_media))
|
||||
for video_media in video_medias:
|
||||
url = media_url.format(video_media['ref']['$link'], post['author']['did'])
|
||||
video_media = archiver.download_from_url(url)
|
||||
media.append(video_media)
|
||||
media.append(Media(video_media))
|
||||
return media
|
||||
|
||||
|
||||
|
||||
@@ -8,7 +8,8 @@ class Facebook(GenericDropin):
|
||||
url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id)
|
||||
webpage = ie_instance._download_webpage(url, ie_instance._match_valid_url(url).group('id'))
|
||||
|
||||
post_data = ie_instance._extract_from_url.extract_metadata(webpage)
|
||||
# TODO: fix once https://github.com/yt-dlp/yt-dlp/pull/12275 is merged
|
||||
post_data = ie_instance._extract_metadata(webpage)
|
||||
return post_data
|
||||
|
||||
def create_metadata(self, post: dict, ie_instance, archiver, url):
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import datetime, os, yt_dlp, pysubs2
|
||||
import importlib
|
||||
from typing import Type
|
||||
from typing import Generator, Type
|
||||
from yt_dlp.extractor.common import InfoExtractor
|
||||
|
||||
from loguru import logger
|
||||
@@ -11,7 +11,7 @@ from auto_archiver.core import Metadata, Media
|
||||
class GenericExtractor(Extractor):
|
||||
_dropins = {}
|
||||
|
||||
def suitable_extractors(self, url: str) -> list[str]:
|
||||
def suitable_extractors(self, url: str) -> Generator[str, None, None]:
|
||||
"""
|
||||
Returns a list of valid extractors for the given URL"""
|
||||
for info_extractor in yt_dlp.YoutubeDL()._ies.values():
|
||||
@@ -116,7 +116,7 @@ class GenericExtractor(Extractor):
|
||||
|
||||
def get_metadata_for_post(self, info_extractor: Type[InfoExtractor], url: str, ydl: yt_dlp.YoutubeDL) -> Metadata:
|
||||
"""
|
||||
Calls into the ytdlp InfoExtract subclass to use the prive _extract_post method to get the post metadata.
|
||||
Calls into the ytdlp InfoExtract subclass to use the private _extract_post method to get the post metadata.
|
||||
"""
|
||||
|
||||
ie_instance = info_extractor(downloader=ydl)
|
||||
@@ -266,6 +266,11 @@ class GenericExtractor(Extractor):
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
url = item.get_url()
|
||||
|
||||
#TODO: this is a temporary hack until this issue is closed: https://github.com/yt-dlp/yt-dlp/issues/11025
|
||||
if url.startswith("https://ya.ru"):
|
||||
url = url.replace("https://ya.ru", "https://yandex.ru")
|
||||
item.set("replaced_url", url)
|
||||
|
||||
|
||||
ydl_options = {'outtmpl': os.path.join(self.tmp_dir, f'%(id)s.%(ext)s'),
|
||||
'quiet': False, 'noplaylist': not self.allow_playlist ,
|
||||
@@ -275,7 +280,7 @@ class GenericExtractor(Extractor):
|
||||
|
||||
# set up auth
|
||||
auth = self.auth_for_site(url, extract_cookies=False)
|
||||
# order of importance: username/pasword -> api_key -> cookie -> cookie_from_browser -> cookies_file
|
||||
# order of importance: username/pasword -> api_key -> cookie -> cookies_from_browser -> cookies_file
|
||||
if auth:
|
||||
if 'username' in auth and 'password' in auth:
|
||||
logger.debug(f'Using provided auth username and password for {url}')
|
||||
@@ -284,7 +289,7 @@ class GenericExtractor(Extractor):
|
||||
elif 'cookie' in auth:
|
||||
logger.debug(f'Using provided auth cookie for {url}')
|
||||
yt_dlp.utils.std_headers['cookie'] = auth['cookie']
|
||||
elif 'cookie_from_browser' in auth:
|
||||
elif 'cookies_from_browser' in auth:
|
||||
logger.debug(f'Using extracted cookies from browser {self.cookies_from_browser} for {url}')
|
||||
ydl_options['cookiesfrombrowser'] = auth['cookies_from_browser']
|
||||
elif 'cookies_file' in auth:
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
"sheet": {"default": None, "help": "name of the sheet to archive"},
|
||||
"sheet_id": {
|
||||
"default": None,
|
||||
"help": "(alternative to sheet name) the id of the sheet to archive",
|
||||
"help": "the id of the sheet to archive (alternative to 'sheet' config)",
|
||||
},
|
||||
"header": {"default": 1, "help": "index of the header row (starts at 1)", "type": "int"},
|
||||
"service_account": {
|
||||
|
||||
@@ -9,9 +9,7 @@ import base64
|
||||
from auto_archiver.version import __version__
|
||||
from auto_archiver.core import Metadata, Media
|
||||
from auto_archiver.core import Formatter
|
||||
from auto_archiver.modules.hash_enricher import HashEnricher
|
||||
from auto_archiver.utils.misc import random_str
|
||||
from auto_archiver.core.module import get_module
|
||||
|
||||
class HtmlFormatter(Formatter):
|
||||
environment: Environment = None
|
||||
@@ -51,7 +49,7 @@ class HtmlFormatter(Formatter):
|
||||
final_media = Media(filename=html_path, _mimetype="text/html")
|
||||
|
||||
# get the already instantiated hash_enricher module
|
||||
he = get_module('hash_enricher', self.config)
|
||||
he = self.module_factory.get_module('hash_enricher', self.config)
|
||||
if len(hd := he.calculate_hash(final_media.filename)):
|
||||
final_media.set("hash", f"{he.algorithm}:{hd}")
|
||||
|
||||
|
||||
@@ -200,7 +200,7 @@
|
||||
el.innerHTML = decodeCertificate(certificate);
|
||||
|
||||
let cyberChefUrl =
|
||||
`https://gchq.github.io/CyberChef/#recipe=Parse_X.509_certificate('PEM')&input=${btoa(certificate)}`;
|
||||
`https://gchq.github.io/CyberChef/#recipe=Parse_X.509_certificate('PEM')&input=${btoa(certificate).replace(/=+$/, '')}`;
|
||||
// create a new anchor with this url and append after the code
|
||||
let a = document.createElement("a");
|
||||
a.href = cyberChefUrl;
|
||||
|
||||
@@ -77,13 +77,14 @@ class InstagramTbotExtractor(Extractor):
|
||||
chat, since_id = self._send_url_to_bot(url)
|
||||
message = self._process_messages(chat, since_id, tmp_dir, result)
|
||||
|
||||
# This may be outdated and replaced by the below message, but keeping until confirmed
|
||||
if "You must enter a URL to a post" in message:
|
||||
logger.debug(f"invalid link {url=} for {self.name}: {message}")
|
||||
return False
|
||||
# # TODO: It currently returns this as a success - is that intentional?
|
||||
# if "Media not found or unavailable" in message:
|
||||
# logger.debug(f"invalid link {url=} for {self.name}: {message}")
|
||||
# return False
|
||||
|
||||
if "Media not found or unavailable" in message:
|
||||
logger.debug(f"No media found for link {url=} for {self.name}: {message}")
|
||||
return False
|
||||
|
||||
if message:
|
||||
result.set_content(message).set_title(message[:128])
|
||||
|
||||
@@ -4,7 +4,6 @@
|
||||
"requires_setup": True,
|
||||
"dependencies": {
|
||||
"python": ["loguru", "selenium"],
|
||||
"bin": ["chromedriver"]
|
||||
},
|
||||
"configs": {
|
||||
"width": {"default": 1280, "help": "width of the screenshots"},
|
||||
|
||||
@@ -11,6 +11,10 @@ from auto_archiver.core import Media, Metadata
|
||||
|
||||
class ScreenshotEnricher(Enricher):
|
||||
|
||||
def __init__(self, webdriver_factory=None):
    """Initialise the enricher and choose the callable used to build webdrivers.

    :param webdriver_factory: optional replacement for ``Webdriver``; any
        falsy value selects ``Webdriver`` itself.
    """
    super().__init__()
    if webdriver_factory:
        self.webdriver_factory = webdriver_factory
    else:
        self.webdriver_factory = Webdriver
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
url = to_enrich.get_url()
|
||||
|
||||
@@ -20,7 +24,8 @@ class ScreenshotEnricher(Enricher):
|
||||
|
||||
logger.debug(f"Enriching screenshot for {url=}")
|
||||
auth = self.auth_for_site(url)
|
||||
with Webdriver(self.width, self.height, self.timeout, facebook_accept_cookies='facebook.com' in url,
|
||||
with self.webdriver_factory(
|
||||
self.width, self.height, self.timeout, facebook_accept_cookies='facebook.com' in url,
|
||||
http_proxy=self.http_proxy, print_options=self.print_options, auth=auth) as driver:
|
||||
try:
|
||||
driver.get(url)
|
||||
@@ -38,3 +43,4 @@ class ScreenshotEnricher(Enricher):
|
||||
logger.info("TimeoutException loading page for screenshot")
|
||||
except Exception as e:
|
||||
logger.error(f"Got error while loading webdriver for screenshot enricher: {e}")
|
||||
|
||||
|
||||
@@ -20,5 +20,6 @@
|
||||
- Processes HTML content of messages to retrieve embedded media.
|
||||
- Sets structured metadata, including timestamps, content, and media details.
|
||||
- Does not require user authentication for Telegram.
|
||||
|
||||
""",
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
{
|
||||
"name": "telethon_extractor",
|
||||
"name": "Telethon Extractor",
|
||||
"type": ["extractor"],
|
||||
"requires_setup": True,
|
||||
"dependencies": {
|
||||
@@ -40,5 +40,9 @@ To use the `TelethonExtractor`, you must configure the following:
|
||||
- **Bot Token**: Optional, allows access to additional content (e.g., large videos) but limits private channel archiving.
|
||||
- **Channel Invites**: Optional, specify a JSON string of invite links to join channels during setup.
|
||||
|
||||
### First Time Login
|
||||
The first time you run the extractor, you will be prompted to authenticate with the associated phone number; alternatively, you can put your `anon.session` file in the root.
|
||||
|
||||
|
||||
"""
|
||||
}
|
||||
@@ -7,8 +7,12 @@
|
||||
"bin": ["ffmpeg"]
|
||||
},
|
||||
"configs": {
|
||||
"thumbnails_per_minute": {"default": 60, "help": "how many thumbnails to generate per minute of video, can be limited by max_thumbnails"},
|
||||
"max_thumbnails": {"default": 16, "help": "limit the number of thumbnails to generate per video, 0 means no limit"},
|
||||
"thumbnails_per_minute": {"default": 60,
|
||||
"type": "int",
|
||||
"help": "how many thumbnails to generate per minute of video, can be limited by max_thumbnails"},
|
||||
"max_thumbnails": {"default": 16,
|
||||
"type": "int",
|
||||
"help": "limit the number of thumbnails to generate per video, 0 means no limit"},
|
||||
},
|
||||
"description": """
|
||||
Generates thumbnails for video files to provide visual previews.
|
||||
|
||||
@@ -42,7 +42,7 @@ class ThumbnailEnricher(Enricher):
|
||||
logger.error(f"error getting duration of video {m.filename}: {e}")
|
||||
return
|
||||
|
||||
num_thumbs = int(min(max(1, duration * self.thumbnails_per_minute), self.max_thumbnails))
|
||||
num_thumbs = int(min(max(1, (duration / 60) * self.thumbnails_per_minute), self.max_thumbnails))
|
||||
timestamps = [duration / (num_thumbs + 1) * i for i in range(1, num_thumbs + 1)]
|
||||
|
||||
thumbnails_media = []
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "WACZ Enricher",
|
||||
"type": ["enricher", "archiver"],
|
||||
"type": ["enricher", "extractor"],
|
||||
"entry_point": "wacz_enricher::WaczExtractorEnricher",
|
||||
"requires_setup": True,
|
||||
"dependencies": {
|
||||
|
||||
@@ -221,4 +221,4 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
||||
to_enrich.add_media(m, warc_fn)
|
||||
counter += 1
|
||||
seen_urls.add(record_url)
|
||||
logger.info(f"WACZ extract_media/extract_screenshot finished, found {counter} relevant media file(s)")
|
||||
logger.info(f"WACZ extract_media/extract_screenshot finished, found {counter} relevant media file(s)")
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "Wayback Machine Enricher",
|
||||
"type": ["enricher", "archiver"],
|
||||
"type": ["enricher", "extractor"],
|
||||
"entry_point": "wayback_extractor_enricher::WaybackExtractorEnricher",
|
||||
"requires_setup": True,
|
||||
"dependencies": {
|
||||
|
||||
@@ -4,7 +4,6 @@ from loguru import logger
|
||||
|
||||
from auto_archiver.core import Enricher
|
||||
from auto_archiver.core import Metadata, Media
|
||||
from auto_archiver.core.module import get_module
|
||||
|
||||
class WhisperEnricher(Enricher):
|
||||
"""
|
||||
@@ -15,7 +14,7 @@ class WhisperEnricher(Enricher):
|
||||
|
||||
def setup(self) -> None:
|
||||
self.stores = self.config['steps']['storages']
|
||||
self.s3 = get_module("s3_storage", self.config)
|
||||
self.s3 = self.module_factory.get_module("s3_storage", self.config)
|
||||
if not "s3_storage" in self.stores:
|
||||
logger.error("WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called.")
|
||||
return
|
||||
@@ -29,8 +28,7 @@ class WhisperEnricher(Enricher):
|
||||
job_results = {}
|
||||
for i, m in enumerate(to_enrich.media):
|
||||
if m.is_video() or m.is_audio():
|
||||
# TODO: this used to pass all storage items to store now
|
||||
# Now only passing S3, the rest will get added later in the usual order (?)
|
||||
# Only storing S3, the rest will get added later in the usual order (?)
|
||||
m.store(url=url, metadata=to_enrich, storages=[self.s3])
|
||||
try:
|
||||
job_id = self.submit_job(m)
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
# we need to explicitly expose the available imports here
|
||||
from .misc import *
|
||||
from .webdriver import Webdriver
|
||||
from .atlos import get_atlos_config_options
|
||||
|
||||
# handy utils from ytdlp
|
||||
from yt_dlp.utils import (clean_html, traverse_obj, strip_or_none, url_or_none)
|
||||
@@ -1,13 +0,0 @@
|
||||
def get_atlos_config_options():
|
||||
return {
|
||||
"api_token": {
|
||||
"default": None,
|
||||
"help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
|
||||
"cli_set": lambda cli_val, _: cli_val
|
||||
},
|
||||
"atlos_url": {
|
||||
"default": "https://platform.atlos.org",
|
||||
"help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
|
||||
"cli_set": lambda cli_val, _: cli_val
|
||||
},
|
||||
}
|
||||
@@ -46,7 +46,7 @@ def dump_payload(p):
|
||||
|
||||
|
||||
def update_nested_dict(dictionary, update_dict):
|
||||
# takes 2 dicts and overwrites the first with the second only on the changed balues
|
||||
# takes 2 dicts and overwrites the first with the second only on the changed values
|
||||
for key, value in update_dict.items():
|
||||
if key in dictionary and isinstance(value, dict) and isinstance(dictionary[key], dict):
|
||||
update_nested_dict(dictionary[key], value)
|
||||
|
||||
Reference in New Issue
Block a user