Merge branch 'main' into timestamping_rewrite

This commit is contained in:
Patrick Robertson
2025-02-24 12:03:14 +00:00
125 changed files with 4277 additions and 2162 deletions

View File

@@ -3,7 +3,7 @@ from auto_archiver.core.orchestrator import ArchivingOrchestrator
import sys
def main():
ArchivingOrchestrator().run(sys.argv[1:])
for _ in ArchivingOrchestrator()._command_line_run(sys.argv[1:]): pass
if __name__ == "__main__":
main()

View File

@@ -3,7 +3,7 @@
"""
from .metadata import Metadata
from .media import Media
from .module import BaseModule
from .base_module import BaseModule
# cannot import ArchivingOrchestrator/Config to avoid circular dep
# from .orchestrator import ArchivingOrchestrator

View File

@@ -1,13 +1,18 @@
from urllib.parse import urlparse
from typing import Mapping, Any
from __future__ import annotations
from typing import Mapping, Any, Type, TYPE_CHECKING
from abc import ABC
from copy import deepcopy, copy
from tempfile import TemporaryDirectory
from auto_archiver.utils import url as UrlUtil
from auto_archiver.core.consts import MODULE_TYPES as CONF_MODULE_TYPES
from loguru import logger
if TYPE_CHECKING:
from .module import ModuleFactory
class BaseModule(ABC):
"""
@@ -17,41 +22,24 @@ class BaseModule(ABC):
however modules can have a .setup() method to run any setup code
(e.g. logging in to a site, spinning up a browser etc.)
See BaseModule.MODULE_TYPES for the types of modules you can create, noting that
See consts.MODULE_TYPES for the types of modules you can create, noting that
a subclass can be of multiple types. For example, a module that extracts data from
a website and stores it in a database would be both an 'extractor' and a 'database' module.
Each module is a python package, and should have a __manifest__.py file in the
same directory as the module file. The __manifest__.py specifies the module information
like name, author, version, dependencies etc. See BaseModule._DEFAULT_MANIFEST for the
like name, author, version, dependencies etc. See DEFAULT_MANIFEST for the
default manifest structure.
"""
MODULE_TYPES = [
'feeder',
'extractor',
'enricher',
'database',
'storage',
'formatter'
]
_DEFAULT_MANIFEST = {
'name': '', # the display name of the module
'author': 'Bellingcat', # creator of the module, leave this as Bellingcat or set your own name!
'type': [], # the type of the module, can be one or more of BaseModule.MODULE_TYPES
'requires_setup': True, # whether or not this module requires additional setup such as setting API Keys or installing additional softare
'description': '', # a description of the module
'dependencies': {}, # external dependencies, e.g. python packages or binaries, in dictionary format
'entry_point': '', # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName
'version': '1.0', # the version of the module
'configs': {} # any configuration options this module has, these will be exposed to the user in the config file or via the command line
}
MODULE_TYPES = CONF_MODULE_TYPES
# NOTE: these here are declard as class variables, but they are overridden by the instance variables in the __init__ method
config: Mapping[str, Any]
authentication: Mapping[str, Mapping[str, str]]
name: str
module_factory: ModuleFactory
# this is set by the orchestrator prior to archiving
tmp_dir: TemporaryDirectory = None
@@ -63,12 +51,6 @@ class BaseModule(ABC):
def config_setup(self, config: dict):
authentication = config.get('authentication', {})
# extract out concatenated sites
for key, val in copy(authentication).items():
if "," in key:
for site in key.split(","):
authentication[site] = val
del authentication[key]
# this is important. Each instance is given its own deepcopied config, so modules cannot
# change values to affect other modules
@@ -89,16 +71,21 @@ class BaseModule(ABC):
Returns the authentication information for a given site. This is used to authenticate
with a site before extracting data. The site should be the domain of the site, e.g. 'twitter.com'
extract_cookies: bool - whether or not to extract cookies from the given browser and return the
cookie jar (disabling can speed up) processing if you don't actually need the cookies jar
:param site: the domain of the site to get authentication information for
:param extract_cookies: whether or not to extract cookies from the given browser/file and return the cookie jar (disabling can speed up processing if you don't actually need the cookies jar).
Currently, the dict can have keys of the following types:
- username: str - the username to use for login
- password: str - the password to use for login
- api_key: str - the API key to use for login
- api_secret: str - the API secret to use for login
- cookie: str - a cookie string to use for login (specific to this site)
- cookies_jar: YoutubeDLCookieJar | http.cookiejar.MozillaCookieJar - a cookie jar compatible with requests (e.g. `requests.get(cookies=cookie_jar)`)
:returns: authdict dict of login information for the given site
**Global options:**\n
* cookies_from_browser: str - the name of the browser to extract cookies from (e.g. 'chrome', 'firefox' - uses ytdlp under the hood to extract\n
* cookies_file: str - the path to a cookies file to use for login\n
**Currently, the sites dict can have keys of the following types:**\n
* username: str - the username to use for login\n
* password: str - the password to use for login\n
* api_key: str - the API key to use for login\n
* api_secret: str - the API secret to use for login\n
* cookie: str - a cookie string to use for login (specific to this site)\n
"""
# TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com)
# for now the user must enter them both, like "x.com,twitter.com" in their config. Maybe we just hard-code?

View File

@@ -11,7 +11,7 @@ from ruamel.yaml import YAML, CommentedMap, add_representer
from loguru import logger
from copy import deepcopy
from .module import BaseModule
from auto_archiver.core.consts import MODULE_TYPES
from typing import Any, List, Type, Tuple
@@ -21,7 +21,7 @@ EMPTY_CONFIG = _yaml.load("""
# Auto Archiver Configuration
# Steps are the modules that will be run in the order they are defined
steps:""" + "".join([f"\n {module}s: []" for module in BaseModule.MODULE_TYPES]) + \
steps:""" + "".join([f"\n {module}s: []" for module in MODULE_TYPES]) + \
"""
# Global configuration
@@ -48,6 +48,7 @@ authentication: {}
logging:
level: INFO
""")
# note: 'logging' is explicitly added above in order to better format the config file
@@ -128,6 +129,11 @@ def merge_dicts(dotdict: dict, yaml_dict: CommentedMap) -> CommentedMap:
yaml_subdict[key] = value
continue
if key == 'steps':
for module_type, modules in value.items():
# overwrite the 'steps' from the config file with the ones from the CLI
yaml_subdict[key][module_type] = modules
if is_dict_type(value):
update_dict(value, yaml_subdict[key])
elif is_list_type(value):
@@ -136,7 +142,6 @@ def merge_dicts(dotdict: dict, yaml_dict: CommentedMap) -> CommentedMap:
yaml_subdict[key] = value
update_dict(from_dot_notation(dotdict), yaml_dict)
return yaml_dict
def read_yaml(yaml_filename: str) -> CommentedMap:
@@ -158,6 +163,11 @@ def read_yaml(yaml_filename: str) -> CommentedMap:
def store_yaml(config: CommentedMap, yaml_filename: str) -> None:
config_to_save = deepcopy(config)
auth_dict = config_to_save.get("authentication", {})
if auth_dict and auth_dict.get('load_from_file'):
# remove all other values from the config, don't want to store it in the config file
auth_dict = {"load_from_file": auth_dict["load_from_file"]}
config_to_save.pop('urls', None)
with open(yaml_filename, "w", encoding="utf-8") as outf:
_yaml.dump(config_to_save, outf)

View File

@@ -0,0 +1,23 @@
MODULE_TYPES = [
'feeder',
'extractor',
'enricher',
'database',
'storage',
'formatter'
]
MANIFEST_FILE = "__manifest__.py"
DEFAULT_MANIFEST = {
'name': '', # the display name of the module
'author': 'Bellingcat', # creator of the module, leave this as Bellingcat or set your own name!
'type': [], # the type of the module, can be one or more of MODULE_TYPES
'requires_setup': True, # whether or not this module requires additional setup such as setting API Keys or installing additional softare
'description': '', # a description of the module
'dependencies': {}, # external dependencies, e.g. python packages or binaries, in dictionary format
'entry_point': '', # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName
'version': '1.0', # the version of the module
'configs': {} # any configuration options this module has, these will be exposed to the user in the config file or via the command line
}

View File

@@ -1,3 +1,8 @@
"""
Database module for the auto-archiver that defines the interface for implementing database modules
in the media archiving framework.
"""
from __future__ import annotations
from abc import abstractmethod
from typing import Union
@@ -5,6 +10,11 @@ from typing import Union
from auto_archiver.core import Metadata, BaseModule
class Database(BaseModule):
"""
Base class for implementing database modules in the media archiving framework.
Subclasses must implement the `fetch` and `done` methods to define platform-specific behavior.
"""
def started(self, item: Metadata) -> None:
"""signals the DB that the given item archival has started"""

View File

@@ -1,5 +1,5 @@
"""
Enrichers are modular components that enhance archived content by adding
Base module for Enrichers modular components that enhance archived content by adding
context, metadata, or additional processing.
These add additional information to the context, such as screenshots, hashes, and metadata.
@@ -13,7 +13,16 @@ from abc import abstractmethod
from auto_archiver.core import Metadata, BaseModule
class Enricher(BaseModule):
"""Base classes and utilities for enrichers in the Auto-Archiver system."""
"""Base classes and utilities for enrichers in the Auto-Archiver system.
Enricher modules must implement the `enrich` method to define their behavior.
"""
@abstractmethod
def enrich(self, to_enrich: Metadata) -> None: pass
def enrich(self, to_enrich: Metadata) -> None:
"""
Enriches a Metadata object with additional information or context.
Takes the metadata object to enrich as an argument and modifies it in place, returning None.
"""
pass

View File

@@ -29,14 +29,24 @@ class Extractor(BaseModule):
valid_url: re.Pattern = None
def cleanup(self) -> None:
# called when extractors are done, or upon errors, cleanup any resources
"""
Called when extractors are done, or upon errors, cleanup any resources
"""
pass
def sanitize_url(self, url: str) -> str:
# used to clean unnecessary URL parameters OR unfurl redirect links
"""
Used to clean unnecessary URL parameters OR unfurl redirect links
"""
return url
def match_link(self, url: str) -> re.Match:
"""
Returns a match object if the given URL matches the valid_url pattern or False/None if not.
Normally used in the `suitable` method to check if the URL is supported by this extractor.
"""
return self.valid_url.match(url)
def suitable(self, url: str) -> bool:
@@ -80,8 +90,8 @@ class Extractor(BaseModule):
d.raise_for_status()
# get mimetype from the response headers
if not Path(to_filename).suffix:
content_type = d.headers.get('Content-Type')
if not mimetypes.guess_type(to_filename)[0]:
content_type = d.headers.get('Content-Type') or self._guess_file_type(url)
extension = mimetypes.guess_extension(content_type)
if extension:
to_filename += extension

View File

@@ -1,3 +1,7 @@
"""
The feeder base module defines the interface for implementing feeders in the media archiving framework.
"""
from __future__ import annotations
from abc import abstractmethod
from auto_archiver.core import Metadata
@@ -5,5 +9,17 @@ from auto_archiver.core import BaseModule
class Feeder(BaseModule):
"""
Base class for implementing feeders in the media archiving framework.
Subclasses must implement the `__iter__` method to define platform-specific behavior.
"""
@abstractmethod
def __iter__(self) -> Metadata: return None
def __iter__(self) -> Metadata:
"""
Returns an iterator (use `yield`) over the items to be archived.
These should be instances of Metadata, typically created with Metadata().set_url(url).
"""
return None

View File

@@ -1,9 +1,24 @@
"""
Base module for formatters modular components that format metadata into media objects for storage.
The most commonly used formatter is the HTML formatter, which takes metadata and formats it into an HTML file for storage.
"""
from __future__ import annotations
from abc import abstractmethod
from auto_archiver.core import Metadata, Media, BaseModule
class Formatter(BaseModule):
"""
Base class for implementing formatters in the media archiving framework.
Subclasses must implement the `format` method to define their behavior.
"""
@abstractmethod
def format(self, item: Metadata) -> Media: return None
def format(self, item: Metadata) -> Media:
"""
Formats a Metadata object into a user-viewable format (e.g. HTML) and stores it if needed.
"""
return None

View File

@@ -6,7 +6,7 @@ by handling user configuration, validating the steps properties, and implementin
from __future__ import annotations
from dataclasses import dataclass
from typing import List
from typing import List, TYPE_CHECKING
import shutil
import ast
import copy
@@ -16,99 +16,113 @@ import os
from os.path import join
from loguru import logger
import auto_archiver
from .base_module import BaseModule
from auto_archiver.core.consts import DEFAULT_MANIFEST, MANIFEST_FILE
_LAZY_LOADED_MODULES = {}
MANIFEST_FILE = "__manifest__.py"
if TYPE_CHECKING:
from .base_module import BaseModule
def setup_paths(paths: list[str]) -> None:
"""
Sets up the paths for the modules to be loaded from
This is necessary for the modules to be imported correctly
"""
for path in paths:
# check path exists, if it doesn't, log a warning
if not os.path.exists(path):
logger.warning(f"Path '{path}' does not exist. Skipping...")
continue
HAS_SETUP_PATHS = False
# see odoo/module/module.py -> initialize_sys_path
if path not in auto_archiver.modules.__path__:
auto_archiver.modules.__path__.append(path)
class ModuleFactory:
# sort based on the length of the path, so that the longest path is last in the list
auto_archiver.modules.__path__ = sorted(auto_archiver.modules.__path__, key=len, reverse=True)
def __init__(self):
self._lazy_modules = {}
def get_module(module_name: str, config: dict) -> BaseModule:
"""
Gets and sets up a module using the provided config
This will actually load and instantiate the module, and load all its dependencies (i.e. not lazy)
"""
return get_module_lazy(module_name).load(config)
def setup_paths(self, paths: list[str]) -> None:
"""
Sets up the paths for the modules to be loaded from
This is necessary for the modules to be imported correctly
"""
global HAS_SETUP_PATHS
def get_module_lazy(module_name: str, suppress_warnings: bool = False) -> LazyBaseModule:
"""
Lazily loads a module, returning a LazyBaseModule
This has all the information about the module, but does not load the module itself or its dependencies
To load an actual module, call .setup() on a lazy module
"""
if module_name in _LAZY_LOADED_MODULES:
return _LAZY_LOADED_MODULES[module_name]
available = available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings)
if not available:
raise IndexError(f"Module '{module_name}' not found. Are you sure it's installed/exists?")
return available[0]
def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]:
# search through all valid 'modules' paths. Default is 'modules' in the current directory
# see odoo/modules/module.py -> get_modules
def is_really_module(module_path):
if os.path.isfile(join(module_path, MANIFEST_FILE)):
return True
all_modules = []
for module_folder in auto_archiver.modules.__path__:
# walk through each module in module_folder and check if it has a valid manifest
try:
possible_modules = os.listdir(module_folder)
except FileNotFoundError:
logger.warning(f"Module folder {module_folder} does not exist")
continue
for possible_module in possible_modules:
if limit_to_modules and possible_module not in limit_to_modules:
for path in paths:
# check path exists, if it doesn't, log a warning
if not os.path.exists(path):
logger.warning(f"Path '{path}' does not exist. Skipping...")
continue
possible_module_path = join(module_folder, possible_module)
if not is_really_module(possible_module_path):
# see odoo/module/module.py -> initialize_sys_path
if path not in auto_archiver.modules.__path__:
if HAS_SETUP_PATHS == True:
logger.warning(f"You are attempting to re-initialise the module paths with: '{path}' for a 2nd time. \
This could lead to unexpected behaviour. It is recommended to only use a single modules path. \
If you wish to load modules from different paths then load a 2nd python interpreter (e.g. using multiprocessing).")
auto_archiver.modules.__path__.append(path)
# sort based on the length of the path, so that the longest path is last in the list
auto_archiver.modules.__path__ = sorted(auto_archiver.modules.__path__, key=len, reverse=True)
HAS_SETUP_PATHS = True
def get_module(self, module_name: str, config: dict) -> BaseModule:
"""
Gets and sets up a module using the provided config
This will actually load and instantiate the module, and load all its dependencies (i.e. not lazy)
"""
return self.get_module_lazy(module_name).load(config)
def get_module_lazy(self, module_name: str, suppress_warnings: bool = False) -> LazyBaseModule:
"""
Lazily loads a module, returning a LazyBaseModule
This has all the information about the module, but does not load the module itself or its dependencies
To load an actual module, call .setup() on a lazy module
"""
if module_name in self._lazy_modules:
return self._lazy_modules[module_name]
available = self.available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings)
if not available:
raise IndexError(f"Module '{module_name}' not found. Are you sure it's installed/exists?")
return available[0]
def available_modules(self, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]:
# search through all valid 'modules' paths. Default is 'modules' in the current directory
# see odoo/modules/module.py -> get_modules
def is_really_module(module_path):
if os.path.isfile(join(module_path, MANIFEST_FILE)):
return True
all_modules = []
for module_folder in auto_archiver.modules.__path__:
# walk through each module in module_folder and check if it has a valid manifest
try:
possible_modules = os.listdir(module_folder)
except FileNotFoundError:
logger.warning(f"Module folder {module_folder} does not exist")
continue
if _LAZY_LOADED_MODULES.get(possible_module):
continue
lazy_module = LazyBaseModule(possible_module, possible_module_path)
_LAZY_LOADED_MODULES[possible_module] = lazy_module
for possible_module in possible_modules:
if limit_to_modules and possible_module not in limit_to_modules:
continue
all_modules.append(lazy_module)
if not suppress_warnings:
for module in limit_to_modules:
if not any(module == m.name for m in all_modules):
logger.warning(f"Module '{module}' not found. Are you sure it's installed?")
possible_module_path = join(module_folder, possible_module)
if not is_really_module(possible_module_path):
continue
if self._lazy_modules.get(possible_module):
continue
lazy_module = LazyBaseModule(possible_module, possible_module_path, factory=self)
return all_modules
self._lazy_modules[possible_module] = lazy_module
all_modules.append(lazy_module)
if not suppress_warnings:
for module in limit_to_modules:
if not any(module == m.name for m in all_modules):
logger.warning(f"Module '{module}' not found. Are you sure it's installed?")
return all_modules
@dataclass
class LazyBaseModule:
@@ -123,14 +137,16 @@ class LazyBaseModule:
type: list
description: str
path: str
module_factory: ModuleFactory
_manifest: dict = None
_instance: BaseModule = None
_entry_point: str = None
def __init__(self, module_name, path):
def __init__(self, module_name, path, factory: ModuleFactory):
self.name = module_name
self.path = path
self.module_factory = factory
@property
def entry_point(self):
@@ -161,7 +177,7 @@ class LazyBaseModule:
return self._manifest
# print(f"Loading manifest for module {module_path}")
# load the manifest file
manifest = copy.deepcopy(BaseModule._DEFAULT_MANIFEST)
manifest = copy.deepcopy(DEFAULT_MANIFEST)
with open(join(self.path, MANIFEST_FILE)) as f:
try:
@@ -189,13 +205,14 @@ class LazyBaseModule:
# clear out any empty strings that a user may have erroneously added
continue
if not check(dep):
logger.error(f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. Have you installed the required dependencies for the '{self.name}' module? See the README for more information.")
logger.error(f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. \
Have you installed the required dependencies for the '{self.name}' module? See the README for more information.")
exit(1)
def check_python_dep(dep):
# first check if it's a module:
try:
m = get_module_lazy(dep, suppress_warnings=True)
m = self.module_factory.get_module_lazy(dep, suppress_warnings=True)
try:
# we must now load this module and set it up with the config
m.load(config)
@@ -230,19 +247,21 @@ class LazyBaseModule:
__import__(f'{qualname}.{file_name}', fromlist=[self.entry_point])
# finally, get the class instance
instance: BaseModule = getattr(sys.modules[sub_qualname], class_name)()
if not getattr(instance, 'name', None):
instance.name = self.name
if not getattr(instance, 'display_name', None):
instance.display_name = self.display_name
self._instance = instance
# set the name, display name and module factory
instance.name = self.name
instance.display_name = self.display_name
instance.module_factory = self.module_factory
# merge the default config with the user config
default_config = dict((k, v['default']) for k, v in self.configs.items() if v.get('default'))
config[self.name] = default_config | config.get(self.name, {})
instance.config_setup(config)
instance.setup()
# save the instance for future easy loading
self._instance = instance
return instance
def __repr__(self):

View File

@@ -5,9 +5,10 @@
"""
from __future__ import annotations
from typing import Generator, Union, List, Type
from typing import Generator, Union, List, Type, TYPE_CHECKING
from urllib.parse import urlparse
from ipaddress import ip_address
from copy import copy
import argparse
import os
import sys
@@ -21,15 +22,18 @@ from rich_argparse import RichHelpFormatter
from .metadata import Metadata, Media
from auto_archiver.version import __version__
from .config import _yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser
from .module import available_modules, LazyBaseModule, get_module, setup_paths
from .module import ModuleFactory, LazyBaseModule
from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher
from .module import BaseModule
from .consts import MODULE_TYPES
from loguru import logger
if TYPE_CHECKING:
from .base_module import BaseModule
from .module import LazyBaseModule
DEFAULT_CONFIG_FILE = "orchestration.yaml"
class JsonParseAction(argparse.Action):
def __call__(self, parser, namespace, values, option_string=None):
try:
@@ -42,51 +46,85 @@ class AuthenticationJsonParseAction(JsonParseAction):
def __call__(self, parser, namespace, values, option_string=None):
super().__call__(parser, namespace, values, option_string)
auth_dict = getattr(namespace, self.dest)
if isinstance(auth_dict, str):
# if it's a string
def load_from_file(path):
try:
with open(auth_dict, 'r') as f:
with open(path, 'r') as f:
try:
auth_dict = json.load(f)
except json.JSONDecodeError:
f.seek(0)
# maybe it's yaml, try that
auth_dict = _yaml.load(f)
if auth_dict.get('authentication'):
auth_dict = auth_dict['authentication']
auth_dict['load_from_file'] = path
return auth_dict
except:
pass
return None
if isinstance(auth_dict, dict) and auth_dict.get('from_file'):
auth_dict = load_from_file(auth_dict['from_file'])
elif isinstance(auth_dict, str):
# if it's a string
auth_dict = load_from_file(auth_dict)
if not isinstance(auth_dict, dict):
raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
for site, auth in auth_dict.items():
if not isinstance(site, str) or not isinstance(auth, dict):
raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
global_options = ['cookies_from_browser', 'cookies_file', 'load_from_file']
for key, auth in auth_dict.items():
if key in global_options:
continue
if not isinstance(key, str) or not isinstance(auth, dict):
raise argparse.ArgumentTypeError(f"Authentication must be a dictionary of site names and their authentication methods. Valid global configs are {global_options}")
# extract out concatenated sites
for key, val in copy(auth_dict).items():
if "," in key:
for site in key.split(","):
auth_dict[site] = val
del auth_dict[key]
setattr(namespace, self.dest, auth_dict)
class UniqueAppendAction(argparse.Action):
def __call__(self, parser, namespace, values, option_string=None):
if not hasattr(namespace, self.dest):
setattr(namespace, self.dest, [])
for value in values:
if value not in getattr(namespace, self.dest):
getattr(namespace, self.dest).append(value)
class ArchivingOrchestrator:
# instance variables
module_factory: ModuleFactory
setup_finished: bool
logger_id: int
# instance variables, used for convenience to access modules by step
feeders: List[Type[Feeder]]
extractors: List[Type[Extractor]]
enrichers: List[Type[Enricher]]
databases: List[Type[Database]]
storages: List[Type[Storage]]
formatters: List[Type[Formatter]]
def __init__(self):
self.module_factory = ModuleFactory()
self.setup_finished = False
self.logger_id = None
def setup_basic_parser(self):
parser = argparse.ArgumentParser(
prog="auto-archiver",
add_help=False,
description="""
prog="auto-archiver",
add_help=False,
description="""
Auto Archiver is a CLI tool to archive media/metadata from online URLs;
it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!
""",
epilog="Check the code at https://github.com/bellingcat/auto-archiver",
formatter_class=RichHelpFormatter,
epilog="Check the code at https://github.com/bellingcat/auto-archiver",
formatter_class=RichHelpFormatter,
)
parser.add_argument('--help', '-h', action='store_true', dest='help', help='show a full help message and exit')
parser.add_argument('--version', action='version', version=__version__)
@@ -100,101 +138,115 @@ class ArchivingOrchestrator:
return parser
def setup_complete_parser(self, basic_config: dict, yaml_config: dict, unused_args: list[str]) -> None:
# modules parser to get the overridden 'steps' values
modules_parser = argparse.ArgumentParser(
add_help=False,
)
self.add_modules_args(modules_parser)
cli_modules, unused_args = modules_parser.parse_known_args(unused_args)
for module_type in MODULE_TYPES:
yaml_config['steps'][f"{module_type}s"] = getattr(cli_modules, f"{module_type}s", []) or yaml_config['steps'].get(f"{module_type}s", [])
parser = DefaultValidatingParser(
add_help=False,
)
self.add_additional_args(parser)
# merge command line module args (--feeders, --enrichers etc.) and add them to the config
# check what mode we're in
# if we have a config file, use that to decide which modules to load
# if simple, we'll load just the modules that has requires_setup = False
# if full, we'll load all modules
# TODO: BUG** - basic_config won't have steps in it, since these args aren't added to 'basic_parser'
# but should we add them? Or should we just add them to the 'complete' parser?
if yaml_config != EMPTY_CONFIG:
# only load the modules enabled in config
# TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty?
enabled_modules = []
# first loads the modules from the config file, then from the command line
for config in [yaml_config['steps'], basic_config.__dict__]:
for module_type in BaseModule.MODULE_TYPES:
enabled_modules.extend(config.get(f"{module_type}s", []))
for module_type in MODULE_TYPES:
enabled_modules.extend(yaml_config['steps'].get(f"{module_type}s", []))
# clear out duplicates, but keep the order
enabled_modules = list(dict.fromkeys(enabled_modules))
avail_modules = available_modules(with_manifest=True, limit_to_modules=enabled_modules, suppress_warnings=True)
self.add_module_args(avail_modules, parser)
avail_modules = self.module_factory.available_modules(limit_to_modules=enabled_modules, suppress_warnings=True)
self.add_individual_module_args(avail_modules, parser)
elif basic_config.mode == 'simple':
simple_modules = [module for module in available_modules(with_manifest=True) if not module.requires_setup]
self.add_module_args(simple_modules, parser)
simple_modules = [module for module in self.module_factory.available_modules() if not module.requires_setup]
self.add_individual_module_args(simple_modules, parser)
# for simple mode, we use the cli_feeder and any modules that don't require setup
yaml_config['steps']['feeders'] = ['cli_feeder']
if not yaml_config['steps']['feeders']:
yaml_config['steps']['feeders'] = ['cli_feeder']
# add them to the config
for module in simple_modules:
for module_type in module.type:
yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name)
else:
# load all modules, they're not using the 'simple' mode
self.add_module_args(available_modules(with_manifest=True), parser)
self.add_individual_module_args(self.module_factory.available_modules(), parser)
parser.set_defaults(**to_dot_notation(yaml_config))
# reload the parser with the new arguments, now that we have them
parsed, unknown = parser.parse_known_args(unused_args)
# merge the new config with the old one
self.config = merge_dicts(vars(parsed), yaml_config)
config = merge_dicts(vars(parsed), yaml_config)
# clean out args from the base_parser that we don't want in the config
for key in vars(basic_config):
self.config.pop(key, None)
config.pop(key, None)
# setup the logging
self.setup_logging()
self.setup_logging(config)
if unknown:
logger.warning(f"Ignoring unknown/unused arguments: {unknown}\nPerhaps you don't have this module enabled?")
if (self.config != yaml_config and basic_config.store) or not os.path.isfile(basic_config.config_file):
if (config != yaml_config and basic_config.store) or not os.path.isfile(basic_config.config_file):
logger.info(f"Storing configuration file to {basic_config.config_file}")
store_yaml(self.config, basic_config.config_file)
return self.config
store_yaml(config, basic_config.config_file)
return config
def add_modules_args(self, parser: argparse.ArgumentParser = None):
if not parser:
parser = self.parser
# Module loading from the command line
for module_type in MODULE_TYPES:
parser.add_argument(f'--{module_type}s', dest=f'{module_type}s', nargs='+', help=f'the {module_type}s to use', default=[], action=UniqueAppendAction)
def add_additional_args(self, parser: argparse.ArgumentParser = None):
if not parser:
parser = self.parser
# allow passing URLs directly on the command line
parser.add_argument('urls', nargs='*', default=[], help='URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml')
parser.add_argument('--feeders', dest='steps.feeders', nargs='+', default=['cli_feeder'], help='the feeders to use', action=UniqueAppendAction)
parser.add_argument('--enrichers', dest='steps.enrichers', nargs='+', help='the enrichers to use', action=UniqueAppendAction)
parser.add_argument('--extractors', dest='steps.extractors', nargs='+', help='the extractors to use', action=UniqueAppendAction)
parser.add_argument('--databases', dest='steps.databases', nargs='+', help='the databases to use', action=UniqueAppendAction)
parser.add_argument('--storages', dest='steps.storages', nargs='+', help='the storages to use', action=UniqueAppendAction)
parser.add_argument('--formatters', dest='steps.formatters', nargs='+', help='the formatter to use', action=UniqueAppendAction)
parser.add_argument('--authentication', dest='authentication', help='A dictionary of sites and their authentication methods \
(token, username etc.) that extractors can use to log into \
a website. If passing this on the command line, use a JSON string. \
You may also pass a path to a valid JSON/YAML file which will be parsed.',\
You may also pass a path to a valid JSON/YAML file which will be parsed.',
default={},
nargs="?",
action=AuthenticationJsonParseAction)
# logging arguments
parser.add_argument('--logging.level', action='store', dest='logging.level', choices=['INFO', 'DEBUG', 'ERROR', 'WARNING'], help='the logging level to use', default='INFO', type=str.upper)
parser.add_argument('--logging.file', action='store', dest='logging.file', help='the logging file to write to', default=None)
parser.add_argument('--logging.rotation', action='store', dest='logging.rotation', help='the logging rotation to use', default=None)
def add_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None:
def add_individual_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None:
if not modules:
modules = available_modules(with_manifest=True)
module: LazyBaseModule
modules = self.module_factory.available_modules()
for module in modules:
if not module.configs:
@@ -224,21 +276,29 @@ class ArchivingOrchestrator:
arg.should_store = should_store
def show_help(self, basic_config: dict):
# for the help message, we want to load *all* possible modules and show the help
# add configs as arg parser arguments
# for the help message, we want to load manifests from *all* possible modules and show their help/settings
# add configs as arg parser arguments
self.add_modules_args(self.basic_parser)
self.add_additional_args(self.basic_parser)
self.add_module_args(parser=self.basic_parser)
self.add_individual_module_args(parser=self.basic_parser)
self.basic_parser.print_help()
self.basic_parser.exit()
def setup_logging(self):
def setup_logging(self, config):
# setup loguru logging
logger.remove(0) # remove the default logger
logging_config = self.config['logging']
logger.add(sys.stderr, level=logging_config['level'])
if log_file := logging_config['file']:
logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation'])
try:
logger.remove(0) # remove the default logger
except ValueError:
pass
logging_config = config['logging']
# add other logging info
if self.logger_id is None: # note - need direct comparison to None since need to consider falsy value 0
self.logger_id = logger.add(sys.stderr, level=logging_config['level'])
if log_file := logging_config['file']:
logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation'])
def install_modules(self, modules_by_type):
"""
@@ -246,9 +306,9 @@ class ArchivingOrchestrator:
orchestrator's attributes (self.feeders, self.extractors etc.). If no modules of a certain type
are loaded, the program will exit with an error message.
"""
invalid_modules = []
for module_type in BaseModule.MODULE_TYPES:
for module_type in MODULE_TYPES:
step_items = []
modules_to_load = modules_by_type[f"{module_type}s"]
@@ -273,6 +333,7 @@ class ArchivingOrchestrator:
logger.error("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information.")
exit()
# cli_feeder is a pseudo module, it just takes the command line args
def feed(self) -> Generator[Metadata]:
for url in urls:
logger.debug(f"Processing URL: '{url}'")
@@ -284,7 +345,6 @@ class ArchivingOrchestrator:
'__iter__': feed
})()
pseudo_module.__iter__ = feed
step_items.append(pseudo_module)
@@ -293,7 +353,7 @@ class ArchivingOrchestrator:
if module in invalid_modules:
continue
try:
loaded_module: BaseModule = get_module(module, self.config)
loaded_module: BaseModule = self.module_factory.get_module(module, self.config)
except (KeyboardInterrupt, Exception) as e:
logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
if module_type == 'extractor' and loaded_module.name == module:
@@ -308,48 +368,85 @@ class ArchivingOrchestrator:
check_steps_ok()
setattr(self, f"{module_type}s", step_items)
def load_config(self, config_file: str) -> dict:
if not os.path.exists(config_file) and config_file != DEFAULT_CONFIG_FILE:
logger.error(f"The configuration file {config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings.")
exit()
return read_yaml(config_file)
def setup_config(self, args: list) -> dict:
"""
Sets up the configuration file, merging the default config with the user's config
This function should only ever be run once.
"""
def run(self, args: list) -> None:
self.setup_basic_parser()
# parse the known arguments for now (basically, we want the config file)
basic_config, unused_args = self.basic_parser.parse_known_args(args)
# setup any custom module paths, so they'll show in the help and for arg parsing
setup_paths(basic_config.module_paths)
self.module_factory.setup_paths(basic_config.module_paths)
# if help flag was called, then show the help
if basic_config.help:
self.show_help(basic_config)
# merge command line --feeder etc. args with what's in the yaml config
yaml_config = self.load_config(basic_config.config_file)
self.setup_complete_parser(basic_config, yaml_config, unused_args)
return self.setup_complete_parser(basic_config, yaml_config, unused_args)
def setup(self, args: list):
"""
Function to configure all setup of the orchestrator: setup configs and load modules.
This method should only ever be called once
"""
if self.setup_finished:
logger.warning("The `setup_config()` function should only ever be run once. \
If you need to re-run the setup, please re-instantiate a new instance of the orchestrator. \
For code implementatations, you should call .setup_config() once then you may call .feed() \
multiple times to archive multiple URLs.")
return
self.setup_basic_parser()
self.config = self.setup_config(args)
logger.info(f"======== Welcome to the AUTO ARCHIVER ({__version__}) ==========")
self.install_modules(self.config['steps'])
# log out the modules that were loaded
for module_type in BaseModule.MODULE_TYPES:
for module_type in MODULE_TYPES:
logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s")))
self.setup_finished = True
for _ in self.feed():
pass
def _command_line_run(self, args: list) -> Generator[Metadata]:
"""
This is the main entry point for the orchestrator, when run from the command line.
def cleanup(self)->None:
:param args: list of arguments to pass to the orchestrator - these are the command line args
You should not call this method from code implementations.
This method sets up the configuration, loads the modules, and runs the feed.
If you wish to make code invocations yourself, you should use the 'setup' and 'feed' methods separately.
To test configurations, without loading any modules you can also first call 'setup_configs'
"""
self.setup(args)
return self.feed()
def cleanup(self) -> None:
logger.info("Cleaning up")
for e in self.extractors:
e.cleanup()
def feed(self) -> Generator[Metadata]:
url_count = 0
for feeder in self.feeders:
for item in feeder:
@@ -393,7 +490,6 @@ class ArchivingOrchestrator:
m.tmp_dir = None
tmp_dir.cleanup()
def archive(self, result: Metadata) -> Union[Metadata, None]:
"""
Runs the archiving process for a single URL
@@ -440,13 +536,13 @@ class ArchivingOrchestrator:
try:
result.merge(a.download(result))
if result.is_success(): break
except Exception as e:
except Exception as e:
logger.error(f"ERROR archiver {a.name}: {e}: {traceback.format_exc()}")
# 4 - call enrichers to work with archived content
for e in self.enrichers:
try: e.enrich(result)
except Exception as exc:
except Exception as exc:
logger.error(f"ERROR enricher {e.name}: {exc}: {traceback.format_exc()}")
# 5 - store all downloaded/generated media
@@ -474,13 +570,13 @@ class ArchivingOrchestrator:
Blocks localhost, private, reserved, and link-local IPs and all non-http/https schemes.
"""
assert url.startswith("http://") or url.startswith("https://"), f"Invalid URL scheme"
parsed = urlparse(url)
assert parsed.scheme in ["http", "https"], f"Invalid URL scheme"
assert parsed.hostname, f"Invalid URL hostname"
assert parsed.hostname != "localhost", f"Invalid URL"
try: # special rules for IP addresses
try: # special rules for IP addresses
ip = ip_address(parsed.hostname)
except ValueError: pass
else:
@@ -489,9 +585,8 @@ class ArchivingOrchestrator:
assert not ip.is_link_local, f"Invalid IP used"
assert not ip.is_private, f"Invalid IP used"
# Helper Properties
@property
def all_modules(self) -> List[Type[BaseModule]]:
return self.feeders + self.extractors + self.enrichers + self.databases + self.storages + self.formatters
return self.feeders + self.extractors + self.enrichers + self.databases + self.storages + self.formatters

View File

@@ -1,3 +1,7 @@
"""
Base module for Storage modules modular components that store media objects in various locations.
"""
from __future__ import annotations
from abc import abstractmethod
from typing import IO
@@ -10,8 +14,14 @@ from auto_archiver.utils.misc import random_str
from auto_archiver.core import Media, BaseModule, Metadata
from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher
from auto_archiver.core.module import get_module
class Storage(BaseModule):
"""
Base class for implementing storage modules in the media archiving framework.
Subclasses must implement the `get_cdn_url` and `uploadf` methods to define their behavior.
"""
def store(self, media: Media, url: str, metadata: Metadata=None) -> None:
if media.is_stored(in_storage=self):
@@ -22,10 +32,18 @@ class Storage(BaseModule):
media.add_url(self.get_cdn_url(media))
@abstractmethod
def get_cdn_url(self, media: Media) -> str: pass
def get_cdn_url(self, media: Media) -> str:
"""
Returns the URL of the media object stored in the CDN.
"""
pass
@abstractmethod
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
"""
Uploads (or saves) a file to the storage service/location.
"""
pass
def upload(self, media: Media, **kwargs) -> bool:
logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key}')
@@ -56,7 +74,7 @@ class Storage(BaseModule):
filename = random_str(24)
elif filename_generator == "static":
# load the hash_enricher module
he = get_module(HashEnricher, self.config)
he = self.module_factory.get_module(HashEnricher, self.config)
hd = he.calculate_hash(media.filename)
filename = hd[:24]
else:

View File

@@ -1 +1 @@
from atlos_db import AtlosDb
from .atlos_db import AtlosDb

View File

@@ -11,6 +11,8 @@
"api_token": {
"default": None,
"help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
"required": True,
"type": "str",
},
"atlos_url": {
"default": "https://platform.atlos.org",

View File

@@ -1,13 +0,0 @@
def get_atlos_config_options():
return {
"api_token": {
"default": None,
"help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
"type": str
},
"atlos_url": {
"default": "https://platform.atlos.org",
"help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
"type": str
},
}

View File

@@ -0,0 +1 @@
from .atlos_storage import AtlosStorage

View File

@@ -0,0 +1,32 @@
{
"name": "Atlos Storage",
"type": ["storage"],
"requires_setup": True,
"dependencies": {
"python": ["loguru", "boto3"],
"bin": []
},
"description": """
Stores media files in a [Atlos](https://www.atlos.org/).
### Features
- Saves media files to Atlos, organizing them into folders based on the provided path structure.
### Notes
- Requires setup with Atlos credentials.
- Files are uploaded to the specified `root_folder_id` and organized by the `media.key` structure.
""",
"configs": {
"api_token": {
"default": None,
"help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
"required": True,
"type": "str"
},
"atlos_url": {
"default": "https://platform.atlos.org",
"help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
"type": "str"
},
}
}

View File

@@ -32,7 +32,6 @@
GDriveStorage: A storage module for saving archived content to Google Drive.
Author: Dave Mateer, (And maintained by: )
Source Documentation: https://davemateer.com/2022/04/28/google-drive-with-python
### Features

View File

@@ -39,11 +39,11 @@ class Bluesky(GenericDropin):
for image_media in image_medias:
url = media_url.format(image_media['image']['ref']['$link'], post['author']['did'])
image_media = archiver.download_from_url(url)
media.append(image_media)
media.append(Media(image_media))
for video_media in video_medias:
url = media_url.format(video_media['ref']['$link'], post['author']['did'])
video_media = archiver.download_from_url(url)
media.append(video_media)
media.append(Media(video_media))
return media

View File

@@ -8,7 +8,8 @@ class Facebook(GenericDropin):
url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id)
webpage = ie_instance._download_webpage(url, ie_instance._match_valid_url(url).group('id'))
post_data = ie_instance._extract_from_url.extract_metadata(webpage)
# TODO: fix once https://github.com/yt-dlp/yt-dlp/pull/12275 is merged
post_data = ie_instance._extract_metadata(webpage)
return post_data
def create_metadata(self, post: dict, ie_instance, archiver, url):

View File

@@ -1,6 +1,6 @@
import datetime, os, yt_dlp, pysubs2
import importlib
from typing import Type
from typing import Generator, Type
from yt_dlp.extractor.common import InfoExtractor
from loguru import logger
@@ -11,7 +11,7 @@ from auto_archiver.core import Metadata, Media
class GenericExtractor(Extractor):
_dropins = {}
def suitable_extractors(self, url: str) -> list[str]:
def suitable_extractors(self, url: str) -> Generator[str, None, None]:
"""
Returns a list of valid extractors for the given URL"""
for info_extractor in yt_dlp.YoutubeDL()._ies.values():
@@ -116,7 +116,7 @@ class GenericExtractor(Extractor):
def get_metadata_for_post(self, info_extractor: Type[InfoExtractor], url: str, ydl: yt_dlp.YoutubeDL) -> Metadata:
"""
Calls into the ytdlp InfoExtract subclass to use the prive _extract_post method to get the post metadata.
Calls into the ytdlp InfoExtract subclass to use the private _extract_post method to get the post metadata.
"""
ie_instance = info_extractor(downloader=ydl)
@@ -266,6 +266,11 @@ class GenericExtractor(Extractor):
def download(self, item: Metadata) -> Metadata:
url = item.get_url()
#TODO: this is a temporary hack until this issue is closed: https://github.com/yt-dlp/yt-dlp/issues/11025
if url.startswith("https://ya.ru"):
url = url.replace("https://ya.ru", "https://yandex.ru")
item.set("replaced_url", url)
ydl_options = {'outtmpl': os.path.join(self.tmp_dir, f'%(id)s.%(ext)s'),
'quiet': False, 'noplaylist': not self.allow_playlist ,
@@ -275,7 +280,7 @@ class GenericExtractor(Extractor):
# set up auth
auth = self.auth_for_site(url, extract_cookies=False)
# order of importance: username/pasword -> api_key -> cookie -> cookie_from_browser -> cookies_file
# order of importance: username/pasword -> api_key -> cookie -> cookies_from_browser -> cookies_file
if auth:
if 'username' in auth and 'password' in auth:
logger.debug(f'Using provided auth username and password for {url}')
@@ -284,7 +289,7 @@ class GenericExtractor(Extractor):
elif 'cookie' in auth:
logger.debug(f'Using provided auth cookie for {url}')
yt_dlp.utils.std_headers['cookie'] = auth['cookie']
elif 'cookie_from_browser' in auth:
elif 'cookies_from_browser' in auth:
logger.debug(f'Using extracted cookies from browser {self.cookies_from_browser} for {url}')
ydl_options['cookiesfrombrowser'] = auth['cookies_from_browser']
elif 'cookies_file' in auth:

View File

@@ -10,7 +10,7 @@
"sheet": {"default": None, "help": "name of the sheet to archive"},
"sheet_id": {
"default": None,
"help": "(alternative to sheet name) the id of the sheet to archive",
"help": "the id of the sheet to archive (alternative to 'sheet' config)",
},
"header": {"default": 1, "help": "index of the header row (starts at 1)", "type": "int"},
"service_account": {

View File

@@ -9,9 +9,7 @@ import base64
from auto_archiver.version import __version__
from auto_archiver.core import Metadata, Media
from auto_archiver.core import Formatter
from auto_archiver.modules.hash_enricher import HashEnricher
from auto_archiver.utils.misc import random_str
from auto_archiver.core.module import get_module
class HtmlFormatter(Formatter):
environment: Environment = None
@@ -51,7 +49,7 @@ class HtmlFormatter(Formatter):
final_media = Media(filename=html_path, _mimetype="text/html")
# get the already instantiated hash_enricher module
he = get_module('hash_enricher', self.config)
he = self.module_factory.get_module('hash_enricher', self.config)
if len(hd := he.calculate_hash(final_media.filename)):
final_media.set("hash", f"{he.algorithm}:{hd}")

View File

@@ -200,7 +200,7 @@
el.innerHTML = decodeCertificate(certificate);
let cyberChefUrl =
`https://gchq.github.io/CyberChef/#recipe=Parse_X.509_certificate('PEM')&input=${btoa(certificate)}`;
`https://gchq.github.io/CyberChef/#recipe=Parse_X.509_certificate('PEM')&input=${btoa(certificate).replace(/=+$/, '')}`;
// create a new anchor with this url and append after the code
let a = document.createElement("a");
a.href = cyberChefUrl;

View File

@@ -77,13 +77,14 @@ class InstagramTbotExtractor(Extractor):
chat, since_id = self._send_url_to_bot(url)
message = self._process_messages(chat, since_id, tmp_dir, result)
# This may be outdated and replaced by the below message, but keeping until confirmed
if "You must enter a URL to a post" in message:
logger.debug(f"invalid link {url=} for {self.name}: {message}")
return False
# # TODO: It currently returns this as a success - is that intentional?
# if "Media not found or unavailable" in message:
# logger.debug(f"invalid link {url=} for {self.name}: {message}")
# return False
if "Media not found or unavailable" in message:
logger.debug(f"No media found for link {url=} for {self.name}: {message}")
return False
if message:
result.set_content(message).set_title(message[:128])

View File

@@ -4,7 +4,6 @@
"requires_setup": True,
"dependencies": {
"python": ["loguru", "selenium"],
"bin": ["chromedriver"]
},
"configs": {
"width": {"default": 1280, "help": "width of the screenshots"},

View File

@@ -11,6 +11,10 @@ from auto_archiver.core import Media, Metadata
class ScreenshotEnricher(Enricher):
def __init__(self, webdriver_factory=None):
super().__init__()
self.webdriver_factory = webdriver_factory or Webdriver
def enrich(self, to_enrich: Metadata) -> None:
url = to_enrich.get_url()
@@ -20,7 +24,8 @@ class ScreenshotEnricher(Enricher):
logger.debug(f"Enriching screenshot for {url=}")
auth = self.auth_for_site(url)
with Webdriver(self.width, self.height, self.timeout, facebook_accept_cookies='facebook.com' in url,
with self.webdriver_factory(
self.width, self.height, self.timeout, facebook_accept_cookies='facebook.com' in url,
http_proxy=self.http_proxy, print_options=self.print_options, auth=auth) as driver:
try:
driver.get(url)
@@ -38,3 +43,4 @@ class ScreenshotEnricher(Enricher):
logger.info("TimeoutException loading page for screenshot")
except Exception as e:
logger.error(f"Got error while loading webdriver for screenshot enricher: {e}")

View File

@@ -20,5 +20,6 @@
- Processes HTML content of messages to retrieve embedded media.
- Sets structured metadata, including timestamps, content, and media details.
- Does not require user authentication for Telegram.
""",
}

View File

@@ -1,5 +1,5 @@
{
"name": "telethon_extractor",
"name": "Telethon Extractor",
"type": ["extractor"],
"requires_setup": True,
"dependencies": {
@@ -40,5 +40,9 @@ To use the `TelethonExtractor`, you must configure the following:
- **Bot Token**: Optional, allows access to additional content (e.g., large videos) but limits private channel archiving.
- **Channel Invites**: Optional, specify a JSON string of invite links to join channels during setup.
### First Time Login
The first time you run, you will be prompted to do a authentication with the phone number associated, alternatively you can put your `anon.session` in the root.
"""
}

View File

@@ -7,8 +7,12 @@
"bin": ["ffmpeg"]
},
"configs": {
"thumbnails_per_minute": {"default": 60, "help": "how many thumbnails to generate per minute of video, can be limited by max_thumbnails"},
"max_thumbnails": {"default": 16, "help": "limit the number of thumbnails to generate per video, 0 means no limit"},
"thumbnails_per_minute": {"default": 60,
"type": "int",
"help": "how many thumbnails to generate per minute of video, can be limited by max_thumbnails"},
"max_thumbnails": {"default": 16,
"type": "int",
"help": "limit the number of thumbnails to generate per video, 0 means no limit"},
},
"description": """
Generates thumbnails for video files to provide visual previews.

View File

@@ -42,7 +42,7 @@ class ThumbnailEnricher(Enricher):
logger.error(f"error getting duration of video {m.filename}: {e}")
return
num_thumbs = int(min(max(1, duration * self.thumbnails_per_minute), self.max_thumbnails))
num_thumbs = int(min(max(1, (duration / 60) * self.thumbnails_per_minute), self.max_thumbnails))
timestamps = [duration / (num_thumbs + 1) * i for i in range(1, num_thumbs + 1)]
thumbnails_media = []

View File

@@ -1,6 +1,6 @@
{
"name": "WACZ Enricher",
"type": ["enricher", "archiver"],
"type": ["enricher", "extractor"],
"entry_point": "wacz_enricher::WaczExtractorEnricher",
"requires_setup": True,
"dependencies": {

View File

@@ -221,4 +221,4 @@ class WaczExtractorEnricher(Enricher, Extractor):
to_enrich.add_media(m, warc_fn)
counter += 1
seen_urls.add(record_url)
logger.info(f"WACZ extract_media/extract_screenshot finished, found {counter} relevant media file(s)")
logger.info(f"WACZ extract_media/extract_screenshot finished, found {counter} relevant media file(s)")

View File

@@ -1,6 +1,6 @@
{
"name": "Wayback Machine Enricher",
"type": ["enricher", "archiver"],
"type": ["enricher", "extractor"],
"entry_point": "wayback_extractor_enricher::WaybackExtractorEnricher",
"requires_setup": True,
"dependencies": {

View File

@@ -4,7 +4,6 @@ from loguru import logger
from auto_archiver.core import Enricher
from auto_archiver.core import Metadata, Media
from auto_archiver.core.module import get_module
class WhisperEnricher(Enricher):
"""
@@ -15,7 +14,7 @@ class WhisperEnricher(Enricher):
def setup(self) -> None:
self.stores = self.config['steps']['storages']
self.s3 = get_module("s3_storage", self.config)
self.s3 = self.module_factory.get_module("s3_storage", self.config)
if not "s3_storage" in self.stores:
logger.error("WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called.")
return
@@ -29,8 +28,7 @@ class WhisperEnricher(Enricher):
job_results = {}
for i, m in enumerate(to_enrich.media):
if m.is_video() or m.is_audio():
# TODO: this used to pass all storage items to store now
# Now only passing S3, the rest will get added later in the usual order (?)
# Only storing S3, the rest will get added later in the usual order (?)
m.store(url=url, metadata=to_enrich, storages=[self.s3])
try:
job_id = self.submit_job(m)

View File

@@ -2,7 +2,6 @@
# we need to explicitly expose the available imports here
from .misc import *
from .webdriver import Webdriver
from .atlos import get_atlos_config_options
# handy utils from ytdlp
from yt_dlp.utils import (clean_html, traverse_obj, strip_or_none, url_or_none)

View File

@@ -1,13 +0,0 @@
def get_atlos_config_options():
return {
"api_token": {
"default": None,
"help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
"cli_set": lambda cli_val, _: cli_val
},
"atlos_url": {
"default": "https://platform.atlos.org",
"help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
"cli_set": lambda cli_val, _: cli_val
},
}

View File

@@ -46,7 +46,7 @@ def dump_payload(p):
def update_nested_dict(dictionary, update_dict):
# takes 2 dicts and overwrites the first with the second only on the changed balues
# takes 2 dicts and overwrites the first with the second only on the changed values
for key, value in update_dict.items():
if key in dictionary and isinstance(value, dict) and isinstance(dictionary[key], dict):
update_nested_dict(dictionary[key], value)