Merge main

This commit is contained in:
Patrick Robertson
2025-02-20 10:29:57 +00:00
50 changed files with 2381 additions and 669 deletions

View File

@@ -3,7 +3,7 @@
"""
from .metadata import Metadata
from .media import Media
from .module import BaseModule
from .base_module import BaseModule
# cannot import ArchivingOrchestrator/Config to avoid circular dep
# from .orchestrator import ArchivingOrchestrator

View File

@@ -1,13 +1,18 @@
from urllib.parse import urlparse
from typing import Mapping, Any
from __future__ import annotations
from typing import Mapping, Any, Type, TYPE_CHECKING
from abc import ABC
from copy import deepcopy, copy
from tempfile import TemporaryDirectory
from auto_archiver.utils import url as UrlUtil
from auto_archiver.core.consts import MODULE_TYPES as CONF_MODULE_TYPES
from loguru import logger
if TYPE_CHECKING:
from .module import ModuleFactory
class BaseModule(ABC):
"""
@@ -17,41 +22,24 @@ class BaseModule(ABC):
however modules can have a .setup() method to run any setup code
(e.g. logging in to a site, spinning up a browser etc.)
See BaseModule.MODULE_TYPES for the types of modules you can create, noting that
See consts.MODULE_TYPES for the types of modules you can create, noting that
a subclass can be of multiple types. For example, a module that extracts data from
a website and stores it in a database would be both an 'extractor' and a 'database' module.
Each module is a python package, and should have a __manifest__.py file in the
same directory as the module file. The __manifest__.py specifies the module information
like name, author, version, dependencies etc. See BaseModule._DEFAULT_MANIFEST for the
like name, author, version, dependencies etc. See DEFAULT_MANIFEST for the
default manifest structure.
"""
MODULE_TYPES = [
'feeder',
'extractor',
'enricher',
'database',
'storage',
'formatter'
]
_DEFAULT_MANIFEST = {
'name': '', # the display name of the module
'author': 'Bellingcat', # creator of the module, leave this as Bellingcat or set your own name!
'type': [], # the type of the module, can be one or more of BaseModule.MODULE_TYPES
'requires_setup': True, # whether or not this module requires additional setup such as setting API Keys or installing additional softare
'description': '', # a description of the module
'dependencies': {}, # external dependencies, e.g. python packages or binaries, in dictionary format
'entry_point': '', # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName
'version': '1.0', # the version of the module
'configs': {} # any configuration options this module has, these will be exposed to the user in the config file or via the command line
}
MODULE_TYPES = CONF_MODULE_TYPES
# NOTE: these here are declard as class variables, but they are overridden by the instance variables in the __init__ method
config: Mapping[str, Any]
authentication: Mapping[str, Mapping[str, str]]
name: str
module_factory: ModuleFactory
# this is set by the orchestrator prior to archiving
tmp_dir: TemporaryDirectory = None

View File

@@ -10,8 +10,8 @@ from ruamel.yaml import YAML, CommentedMap, add_representer
from loguru import logger
from copy import deepcopy, copy
from .module import BaseModule
from copy import deepcopy
from auto_archiver.core.consts import MODULE_TYPES
from typing import Any, List, Type, Tuple
@@ -21,7 +21,7 @@ EMPTY_CONFIG = _yaml.load("""
# Auto Archiver Configuration
# Steps are the modules that will be run in the order they are defined
steps:""" + "".join([f"\n {module}s: []" for module in BaseModule.MODULE_TYPES]) + \
steps:""" + "".join([f"\n {module}s: []" for module in MODULE_TYPES]) + \
"""
# Global configuration
@@ -170,4 +170,4 @@ def store_yaml(config: CommentedMap, yaml_filename: str) -> None:
config_to_save.pop('urls', None)
with open(yaml_filename, "w", encoding="utf-8") as outf:
_yaml.dump(config_to_save, outf)
_yaml.dump(config_to_save, outf)

View File

@@ -0,0 +1,23 @@
MODULE_TYPES = [
'feeder',
'extractor',
'enricher',
'database',
'storage',
'formatter'
]
MANIFEST_FILE = "__manifest__.py"
DEFAULT_MANIFEST = {
'name': '', # the display name of the module
'author': 'Bellingcat', # creator of the module, leave this as Bellingcat or set your own name!
'type': [], # the type of the module, can be one or more of MODULE_TYPES
'requires_setup': True, # whether or not this module requires additional setup such as setting API Keys or installing additional softare
'description': '', # a description of the module
'dependencies': {}, # external dependencies, e.g. python packages or binaries, in dictionary format
'entry_point': '', # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName
'version': '1.0', # the version of the module
'configs': {} # any configuration options this module has, these will be exposed to the user in the config file or via the command line
}

View File

@@ -6,7 +6,7 @@ by handling user configuration, validating the steps properties, and implementin
from __future__ import annotations
from dataclasses import dataclass
from typing import List
from typing import List, TYPE_CHECKING
import shutil
import ast
import copy
@@ -16,99 +16,113 @@ import os
from os.path import join
from loguru import logger
import auto_archiver
from .base_module import BaseModule
from auto_archiver.core.consts import DEFAULT_MANIFEST, MANIFEST_FILE
_LAZY_LOADED_MODULES = {}
MANIFEST_FILE = "__manifest__.py"
if TYPE_CHECKING:
from .base_module import BaseModule
def setup_paths(paths: list[str]) -> None:
"""
Sets up the paths for the modules to be loaded from
This is necessary for the modules to be imported correctly
"""
for path in paths:
# check path exists, if it doesn't, log a warning
if not os.path.exists(path):
logger.warning(f"Path '{path}' does not exist. Skipping...")
continue
HAS_SETUP_PATHS = False
# see odoo/module/module.py -> initialize_sys_path
if path not in auto_archiver.modules.__path__:
auto_archiver.modules.__path__.append(path)
class ModuleFactory:
# sort based on the length of the path, so that the longest path is last in the list
auto_archiver.modules.__path__ = sorted(auto_archiver.modules.__path__, key=len, reverse=True)
def __init__(self):
self._lazy_modules = {}
def get_module(module_name: str, config: dict) -> BaseModule:
"""
Gets and sets up a module using the provided config
This will actually load and instantiate the module, and load all its dependencies (i.e. not lazy)
"""
return get_module_lazy(module_name).load(config)
def setup_paths(self, paths: list[str]) -> None:
"""
Sets up the paths for the modules to be loaded from
This is necessary for the modules to be imported correctly
"""
global HAS_SETUP_PATHS
def get_module_lazy(module_name: str, suppress_warnings: bool = False) -> LazyBaseModule:
"""
Lazily loads a module, returning a LazyBaseModule
This has all the information about the module, but does not load the module itself or its dependencies
To load an actual module, call .setup() on a lazy module
"""
if module_name in _LAZY_LOADED_MODULES:
return _LAZY_LOADED_MODULES[module_name]
available = available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings)
if not available:
raise IndexError(f"Module '{module_name}' not found. Are you sure it's installed/exists?")
return available[0]
def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]:
# search through all valid 'modules' paths. Default is 'modules' in the current directory
# see odoo/modules/module.py -> get_modules
def is_really_module(module_path):
if os.path.isfile(join(module_path, MANIFEST_FILE)):
return True
all_modules = []
for module_folder in auto_archiver.modules.__path__:
# walk through each module in module_folder and check if it has a valid manifest
try:
possible_modules = os.listdir(module_folder)
except FileNotFoundError:
logger.warning(f"Module folder {module_folder} does not exist")
continue
for possible_module in possible_modules:
if limit_to_modules and possible_module not in limit_to_modules:
for path in paths:
# check path exists, if it doesn't, log a warning
if not os.path.exists(path):
logger.warning(f"Path '{path}' does not exist. Skipping...")
continue
possible_module_path = join(module_folder, possible_module)
if not is_really_module(possible_module_path):
# see odoo/module/module.py -> initialize_sys_path
if path not in auto_archiver.modules.__path__:
if HAS_SETUP_PATHS == True:
logger.warning(f"You are attempting to re-initialise the module paths with: '{path}' for a 2nd time. \
This could lead to unexpected behaviour. It is recommended to only use a single modules path. \
If you wish to load modules from different paths then load a 2nd python interpreter (e.g. using multiprocessing).")
auto_archiver.modules.__path__.append(path)
# sort based on the length of the path, so that the longest path is last in the list
auto_archiver.modules.__path__ = sorted(auto_archiver.modules.__path__, key=len, reverse=True)
HAS_SETUP_PATHS = True
def get_module(self, module_name: str, config: dict) -> BaseModule:
"""
Gets and sets up a module using the provided config
This will actually load and instantiate the module, and load all its dependencies (i.e. not lazy)
"""
return self.get_module_lazy(module_name).load(config)
def get_module_lazy(self, module_name: str, suppress_warnings: bool = False) -> LazyBaseModule:
"""
Lazily loads a module, returning a LazyBaseModule
This has all the information about the module, but does not load the module itself or its dependencies
To load an actual module, call .setup() on a lazy module
"""
if module_name in self._lazy_modules:
return self._lazy_modules[module_name]
available = self.available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings)
if not available:
raise IndexError(f"Module '{module_name}' not found. Are you sure it's installed/exists?")
return available[0]
def available_modules(self, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]:
# search through all valid 'modules' paths. Default is 'modules' in the current directory
# see odoo/modules/module.py -> get_modules
def is_really_module(module_path):
if os.path.isfile(join(module_path, MANIFEST_FILE)):
return True
all_modules = []
for module_folder in auto_archiver.modules.__path__:
# walk through each module in module_folder and check if it has a valid manifest
try:
possible_modules = os.listdir(module_folder)
except FileNotFoundError:
logger.warning(f"Module folder {module_folder} does not exist")
continue
if _LAZY_LOADED_MODULES.get(possible_module):
continue
lazy_module = LazyBaseModule(possible_module, possible_module_path)
_LAZY_LOADED_MODULES[possible_module] = lazy_module
for possible_module in possible_modules:
if limit_to_modules and possible_module not in limit_to_modules:
continue
all_modules.append(lazy_module)
if not suppress_warnings:
for module in limit_to_modules:
if not any(module == m.name for m in all_modules):
logger.warning(f"Module '{module}' not found. Are you sure it's installed?")
possible_module_path = join(module_folder, possible_module)
if not is_really_module(possible_module_path):
continue
if self._lazy_modules.get(possible_module):
continue
lazy_module = LazyBaseModule(possible_module, possible_module_path, factory=self)
return all_modules
self._lazy_modules[possible_module] = lazy_module
all_modules.append(lazy_module)
if not suppress_warnings:
for module in limit_to_modules:
if not any(module == m.name for m in all_modules):
logger.warning(f"Module '{module}' not found. Are you sure it's installed?")
return all_modules
@dataclass
class LazyBaseModule:
@@ -123,14 +137,16 @@ class LazyBaseModule:
type: list
description: str
path: str
module_factory: ModuleFactory
_manifest: dict = None
_instance: BaseModule = None
_entry_point: str = None
def __init__(self, module_name, path):
def __init__(self, module_name, path, factory: ModuleFactory):
self.name = module_name
self.path = path
self.module_factory = factory
@property
def entry_point(self):
@@ -161,7 +177,7 @@ class LazyBaseModule:
return self._manifest
# print(f"Loading manifest for module {module_path}")
# load the manifest file
manifest = copy.deepcopy(BaseModule._DEFAULT_MANIFEST)
manifest = copy.deepcopy(DEFAULT_MANIFEST)
with open(join(self.path, MANIFEST_FILE)) as f:
try:
@@ -189,13 +205,14 @@ class LazyBaseModule:
# clear out any empty strings that a user may have erroneously added
continue
if not check(dep):
logger.error(f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. Have you installed the required dependencies for the '{self.name}' module? See the README for more information.")
logger.error(f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. \
Have you installed the required dependencies for the '{self.name}' module? See the README for more information.")
exit(1)
def check_python_dep(dep):
# first check if it's a module:
try:
m = get_module_lazy(dep, suppress_warnings=True)
m = self.module_factory.get_module_lazy(dep, suppress_warnings=True)
try:
# we must now load this module and set it up with the config
m.load(config)
@@ -230,19 +247,21 @@ class LazyBaseModule:
__import__(f'{qualname}.{file_name}', fromlist=[self.entry_point])
# finally, get the class instance
instance: BaseModule = getattr(sys.modules[sub_qualname], class_name)()
if not getattr(instance, 'name', None):
instance.name = self.name
if not getattr(instance, 'display_name', None):
instance.display_name = self.display_name
self._instance = instance
# set the name, display name and module factory
instance.name = self.name
instance.display_name = self.display_name
instance.module_factory = self.module_factory
# merge the default config with the user config
default_config = dict((k, v['default']) for k, v in self.configs.items() if v.get('default'))
config[self.name] = default_config | config.get(self.name, {})
instance.config_setup(config)
instance.setup()
# save the instance for future easy loading
self._instance = instance
return instance
def __repr__(self):

View File

@@ -5,7 +5,7 @@
"""
from __future__ import annotations
from typing import Generator, Union, List, Type
from typing import Generator, Union, List, Type, TYPE_CHECKING
from urllib.parse import urlparse
from ipaddress import ip_address
import argparse
@@ -21,12 +21,14 @@ from rich_argparse import RichHelpFormatter
from .metadata import Metadata, Media
from auto_archiver.version import __version__
from .config import _yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser
from .module import available_modules, LazyBaseModule, get_module, setup_paths
from .module import ModuleFactory, LazyBaseModule
from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher
from .module import BaseModule
from .consts import MODULE_TYPES
from loguru import logger
if TYPE_CHECKING:
from .base_module import BaseModule
from .module import LazyBaseModule
DEFAULT_CONFIG_FILE = "orchestration.yaml"
@@ -87,6 +89,12 @@ class UniqueAppendAction(argparse.Action):
class ArchivingOrchestrator:
# instance variables
module_factory: ModuleFactory
setup_finished: bool
logger_id: int
# instance variables, used for convenience to access modules by step
feeders: List[Type[Feeder]]
extractors: List[Type[Extractor]]
enrichers: List[Type[Enricher]]
@@ -94,6 +102,11 @@ class ArchivingOrchestrator:
storages: List[Type[Storage]]
formatters: List[Type[Formatter]]
def __init__(self):
self.module_factory = ModuleFactory()
self.setup_finished = False
self.logger_id = None
def setup_basic_parser(self):
parser = argparse.ArgumentParser(
prog="auto-archiver",
@@ -125,7 +138,7 @@ class ArchivingOrchestrator:
)
self.add_modules_args(modules_parser)
cli_modules, unused_args = modules_parser.parse_known_args(unused_args)
for module_type in BaseModule.MODULE_TYPES:
for module_type in MODULE_TYPES:
yaml_config['steps'][f"{module_type}s"] = getattr(cli_modules, f"{module_type}s", []) or yaml_config['steps'].get(f"{module_type}s", [])
parser = DefaultValidatingParser(
@@ -147,15 +160,15 @@ class ArchivingOrchestrator:
# TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty?
enabled_modules = []
# first loads the modules from the config file, then from the command line
for module_type in BaseModule.MODULE_TYPES:
for module_type in MODULE_TYPES:
enabled_modules.extend(yaml_config['steps'].get(f"{module_type}s", []))
# clear out duplicates, but keep the order
enabled_modules = list(dict.fromkeys(enabled_modules))
avail_modules = available_modules(with_manifest=True, limit_to_modules=enabled_modules, suppress_warnings=True)
avail_modules = self.module_factory.available_modules(limit_to_modules=enabled_modules, suppress_warnings=True)
self.add_individual_module_args(avail_modules, parser)
elif basic_config.mode == 'simple':
simple_modules = [module for module in available_modules(with_manifest=True) if not module.requires_setup]
simple_modules = [module for module in self.module_factory.available_modules() if not module.requires_setup]
self.add_individual_module_args(simple_modules, parser)
# for simple mode, we use the cli_feeder and any modules that don't require setup
@@ -168,7 +181,7 @@ class ArchivingOrchestrator:
yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name)
else:
# load all modules, they're not using the 'simple' mode
self.add_individual_module_args(available_modules(with_manifest=True), parser)
self.add_individual_module_args(self.module_factory.available_modules(), parser)
parser.set_defaults(**to_dot_notation(yaml_config))
@@ -198,7 +211,7 @@ class ArchivingOrchestrator:
parser = self.parser
# Module loading from the command line
for module_type in BaseModule.MODULE_TYPES:
for module_type in MODULE_TYPES:
parser.add_argument(f'--{module_type}s', dest=f'{module_type}s', nargs='+', help=f'the {module_type}s to use', default=[], action=UniqueAppendAction)
def add_additional_args(self, parser: argparse.ArgumentParser = None):
@@ -224,7 +237,7 @@ class ArchivingOrchestrator:
def add_individual_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None:
if not modules:
modules = available_modules(with_manifest=True)
modules = self.module_factory.available_modules()
for module in modules:
@@ -266,11 +279,18 @@ class ArchivingOrchestrator:
def setup_logging(self, config):
# setup loguru logging
logger.remove(0) # remove the default logger
try:
logger.remove(0) # remove the default logger
except ValueError:
pass
logging_config = config['logging']
logger.add(sys.stderr, level=logging_config['level'])
if log_file := logging_config['file']:
logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation'])
# add other logging info
if self.logger_id is None: # note - need direct comparison to None since need to consider falsy value 0
self.logger_id = logger.add(sys.stderr, level=logging_config['level'])
if log_file := logging_config['file']:
logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation'])
def install_modules(self, modules_by_type):
"""
@@ -280,7 +300,7 @@ class ArchivingOrchestrator:
"""
invalid_modules = []
for module_type in BaseModule.MODULE_TYPES:
for module_type in MODULE_TYPES:
step_items = []
modules_to_load = modules_by_type[f"{module_type}s"]
@@ -325,7 +345,7 @@ class ArchivingOrchestrator:
if module in invalid_modules:
continue
try:
loaded_module: BaseModule = get_module(module, self.config)
loaded_module: BaseModule = self.module_factory.get_module(module, self.config)
except (KeyboardInterrupt, Exception) as e:
logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
if module_type == 'extractor' and loaded_module.name == module:
@@ -351,14 +371,17 @@ class ArchivingOrchestrator:
def setup_config(self, args: list) -> dict:
"""
Sets up the configuration file, merging the default config with the user's config
This function should only ever be run once.
"""
self.setup_basic_parser()
# parse the known arguments for now (basically, we want the config file)
basic_config, unused_args = self.basic_parser.parse_known_args(args)
# setup any custom module paths, so they'll show in the help and for arg parsing
setup_paths(basic_config.module_paths)
self.module_factory.setup_paths(basic_config.module_paths)
# if help flag was called, then show the help
if basic_config.help:
@@ -370,16 +393,29 @@ class ArchivingOrchestrator:
def setup(self, args: list):
"""
Main entry point for the orchestrator, sets up the basic parser, loads the config file, and sets up the complete parser
Function to configure all setup of the orchestrator: setup configs and load modules.
This method should only ever be called once
"""
if self.setup_finished:
logger.warning("The `setup_config()` function should only ever be run once. \
If you need to re-run the setup, please re-instantiate a new instance of the orchestrator. \
For code implementatations, you should call .setup_config() once then you may call .feed() \
multiple times to archive multiple URLs.")
return
self.setup_basic_parser()
self.config = self.setup_config(args)
logger.info(f"======== Welcome to the AUTO ARCHIVER ({__version__}) ==========")
self.install_modules(self.config['steps'])
# log out the modules that were loaded
for module_type in BaseModule.MODULE_TYPES:
for module_type in MODULE_TYPES:
logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s")))
self.setup_finished = True
def _command_line_run(self, args: list) -> Generator[Metadata]:
"""

View File

@@ -14,7 +14,7 @@ from auto_archiver.utils.misc import random_str
from auto_archiver.core import Media, BaseModule, Metadata
from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher
from auto_archiver.core.module import get_module
class Storage(BaseModule):
"""
@@ -74,7 +74,7 @@ class Storage(BaseModule):
filename = random_str(24)
elif filename_generator == "static":
# load the hash_enricher module
he = get_module(HashEnricher, self.config)
he = self.module_factory.get_module(HashEnricher, self.config)
hd = he.calculate_hash(media.filename)
filename = hd[:24]
else:

View File

@@ -1 +1 @@
from atlos_db import AtlosDb
from .atlos_db import AtlosDb

View File

@@ -0,0 +1 @@
from .atlos_storage import AtlosStorage

View File

@@ -281,7 +281,7 @@ class GenericExtractor(Extractor):
# set up auth
auth = self.auth_for_site(url, extract_cookies=False)
# order of importance: username/pasword -> api_key -> cookie -> cookie_from_browser -> cookies_file
# order of importance: username/pasword -> api_key -> cookie -> cookies_from_browser -> cookies_file
if auth:
if 'username' in auth and 'password' in auth:
logger.debug(f'Using provided auth username and password for {url}')
@@ -290,7 +290,7 @@ class GenericExtractor(Extractor):
elif 'cookie' in auth:
logger.debug(f'Using provided auth cookie for {url}')
yt_dlp.utils.std_headers['cookie'] = auth['cookie']
elif 'cookie_from_browser' in auth:
elif 'cookies_from_browser' in auth:
logger.debug(f'Using extracted cookies from browser {auth["cookies_from_browser"]} for {url}')
ydl_options['cookiesfrombrowser'] = auth['cookies_from_browser']
elif 'cookies_file' in auth:

View File

@@ -10,7 +10,6 @@ from auto_archiver.version import __version__
from auto_archiver.core import Metadata, Media
from auto_archiver.core import Formatter
from auto_archiver.utils.misc import random_str
from auto_archiver.core.module import get_module
class HtmlFormatter(Formatter):
environment: Environment = None
@@ -50,7 +49,7 @@ class HtmlFormatter(Formatter):
final_media = Media(filename=html_path, _mimetype="text/html")
# get the already instantiated hash_enricher module
he = get_module('hash_enricher', self.config)
he = self.module_factory.get_module('hash_enricher', self.config)
if len(hd := he.calculate_hash(final_media.filename)):
final_media.set("hash", f"{he.algorithm}:{hd}")

View File

@@ -77,13 +77,14 @@ class InstagramTbotExtractor(Extractor):
chat, since_id = self._send_url_to_bot(url)
message = self._process_messages(chat, since_id, tmp_dir, result)
# This may be outdated and replaced by the below message, but keeping until confirmed
if "You must enter a URL to a post" in message:
logger.debug(f"invalid link {url=} for {self.name}: {message}")
return False
# # TODO: It currently returns this as a success - is that intentional?
# if "Media not found or unavailable" in message:
# logger.debug(f"invalid link {url=} for {self.name}: {message}")
# return False
if "Media not found or unavailable" in message:
logger.debug(f"No media found for link {url=} for {self.name}: {message}")
return False
if message:
result.set_content(message).set_title(message[:128])

View File

@@ -11,6 +11,10 @@ from auto_archiver.core import Media, Metadata
class ScreenshotEnricher(Enricher):
def __init__(self, webdriver_factory=None):
super().__init__()
self.webdriver_factory = webdriver_factory or Webdriver
def enrich(self, to_enrich: Metadata) -> None:
url = to_enrich.get_url()
@@ -20,7 +24,8 @@ class ScreenshotEnricher(Enricher):
logger.debug(f"Enriching screenshot for {url=}")
auth = self.auth_for_site(url)
with Webdriver(self.width, self.height, self.timeout, facebook_accept_cookies='facebook.com' in url,
with self.webdriver_factory(
self.width, self.height, self.timeout, facebook_accept_cookies='facebook.com' in url,
http_proxy=self.http_proxy, print_options=self.print_options, auth=auth) as driver:
try:
driver.get(url)
@@ -38,3 +43,4 @@ class ScreenshotEnricher(Enricher):
logger.info("TimeoutException loading page for screenshot")
except Exception as e:
logger.error(f"Got error while loading webdriver for screenshot enricher: {e}")

View File

@@ -7,8 +7,12 @@
"bin": ["ffmpeg"]
},
"configs": {
"thumbnails_per_minute": {"default": 60, "help": "how many thumbnails to generate per minute of video, can be limited by max_thumbnails"},
"max_thumbnails": {"default": 16, "help": "limit the number of thumbnails to generate per video, 0 means no limit"},
"thumbnails_per_minute": {"default": 60,
"type": "int",
"help": "how many thumbnails to generate per minute of video, can be limited by max_thumbnails"},
"max_thumbnails": {"default": 16,
"type": "int",
"help": "limit the number of thumbnails to generate per video, 0 means no limit"},
},
"description": """
Generates thumbnails for video files to provide visual previews.

View File

@@ -42,7 +42,7 @@ class ThumbnailEnricher(Enricher):
logger.error(f"error getting duration of video {m.filename}: {e}")
return
num_thumbs = int(min(max(1, duration * self.thumbnails_per_minute), self.max_thumbnails))
num_thumbs = int(min(max(1, (duration / 60) * self.thumbnails_per_minute), self.max_thumbnails))
timestamps = [duration / (num_thumbs + 1) * i for i in range(1, num_thumbs + 1)]
thumbnails_media = []

View File

@@ -4,7 +4,6 @@ from loguru import logger
from auto_archiver.core import Enricher
from auto_archiver.core import Metadata, Media
from auto_archiver.core.module import get_module
class WhisperEnricher(Enricher):
"""
@@ -15,7 +14,7 @@ class WhisperEnricher(Enricher):
def setup(self) -> None:
self.stores = self.config['steps']['storages']
self.s3 = get_module("s3_storage", self.config)
self.s3 = self.module_factory.get_module("s3_storage", self.config)
if not "s3_storage" in self.stores:
logger.error("WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called.")
return
@@ -29,8 +28,7 @@ class WhisperEnricher(Enricher):
job_results = {}
for i, m in enumerate(to_enrich.media):
if m.is_video() or m.is_audio():
# TODO: this used to pass all storage items to store now
# Now only passing S3, the rest will get added later in the usual order (?)
# Only storing S3, the rest will get added later in the usual order (?)
m.store(url=url, metadata=to_enrich, storages=[self.s3])
try:
job_id = self.submit_job(m)

View File

@@ -46,7 +46,7 @@ def dump_payload(p):
def update_nested_dict(dictionary, update_dict):
# takes 2 dicts and overwrites the first with the second only on the changed balues
# takes 2 dicts and overwrites the first with the second only on the changed values
for key, value in update_dict.items():
if key in dictionary and isinstance(value, dict) and isinstance(dictionary[key], dict):
update_nested_dict(dictionary[key], value)