mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 21:28:29 +03:00
Merge branch 'main' into opentimestamps
This commit is contained in:
@@ -1,6 +1,5 @@
|
||||
""" Core modules to handle things such as orchestration, metadata and configs..
|
||||
"""Core modules to handle things such as orchestration, metadata and configs.."""
|
||||
|
||||
"""
|
||||
from .metadata import Metadata
|
||||
from .media import Media
|
||||
from .base_module import BaseModule
|
||||
@@ -14,4 +13,4 @@ from .enricher import Enricher
|
||||
from .feeder import Feeder
|
||||
from .storage import Storage
|
||||
from .extractor import Extractor
|
||||
from .formatter import Formatter
|
||||
from .formatter import Formatter
|
||||
|
||||
@@ -1,9 +1,8 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Mapping, Any, Type, TYPE_CHECKING
|
||||
from typing import Mapping, Any, TYPE_CHECKING
|
||||
from abc import ABC
|
||||
from copy import deepcopy, copy
|
||||
from copy import deepcopy
|
||||
from tempfile import TemporaryDirectory
|
||||
from auto_archiver.utils import url as UrlUtil
|
||||
from auto_archiver.core.consts import MODULE_TYPES as CONF_MODULE_TYPES
|
||||
@@ -13,8 +12,8 @@ from loguru import logger
|
||||
if TYPE_CHECKING:
|
||||
from .module import ModuleFactory
|
||||
|
||||
class BaseModule(ABC):
|
||||
|
||||
class BaseModule(ABC):
|
||||
"""
|
||||
Base module class. All modules should inherit from this class.
|
||||
|
||||
@@ -46,14 +45,13 @@ class BaseModule(ABC):
|
||||
|
||||
@property
|
||||
def storages(self) -> list:
|
||||
return self.config.get('storages', [])
|
||||
return self.config.get("storages", [])
|
||||
|
||||
def config_setup(self, config: dict):
|
||||
|
||||
# this is important. Each instance is given its own deepcopied config, so modules cannot
|
||||
# change values to affect other modules
|
||||
config = deepcopy(config)
|
||||
authentication = deepcopy(config.pop('authentication', {}))
|
||||
authentication = deepcopy(config.pop("authentication", {}))
|
||||
|
||||
self.authentication = authentication
|
||||
self.config = config
|
||||
@@ -61,14 +59,15 @@ class BaseModule(ABC):
|
||||
setattr(self, key, val)
|
||||
|
||||
def setup(self):
|
||||
# For any additional setup required by modules, e.g. autehntication
|
||||
# For any additional setup required by modules outside of the configs in the manifesst,
|
||||
# e.g. authentication
|
||||
pass
|
||||
|
||||
def auth_for_site(self, site: str, extract_cookies=True) -> Mapping[str, Any]:
|
||||
"""
|
||||
Returns the authentication information for a given site. This is used to authenticate
|
||||
with a site before extracting data. The site should be the domain of the site, e.g. 'twitter.com'
|
||||
|
||||
|
||||
:param site: the domain of the site to get authentication information for
|
||||
:param extract_cookies: whether or not to extract cookies from the given browser/file and return the cookie jar (disabling can speed up processing if you don't actually need the cookies jar).
|
||||
|
||||
@@ -90,11 +89,10 @@ class BaseModule(ABC):
|
||||
# TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com)
|
||||
# for now the user must enter them both, like "x.com,twitter.com" in their config. Maybe we just hard-code?
|
||||
|
||||
site = UrlUtil.domain_for_url(site).lstrip("www.")
|
||||
site = UrlUtil.domain_for_url(site).removeprefix("www.")
|
||||
# add the 'www' version of the site to the list of sites to check
|
||||
authdict = {}
|
||||
|
||||
|
||||
for to_try in [site, f"www.{site}"]:
|
||||
if to_try in self.authentication:
|
||||
authdict.update(self.authentication[to_try])
|
||||
@@ -104,17 +102,20 @@ class BaseModule(ABC):
|
||||
if not authdict:
|
||||
for key in self.authentication.keys():
|
||||
if key in site or site in key:
|
||||
logger.debug(f"Could not find exact authentication information for site '{site}'. \
|
||||
logger.debug(
|
||||
f"Could not find exact authentication information for site '{site}'. \
|
||||
did find information for '{key}' which is close, is this what you meant? \
|
||||
If so, edit your authentication settings to make sure it exactly matches.")
|
||||
If so, edit your authentication settings to make sure it exactly matches."
|
||||
)
|
||||
|
||||
def get_ytdlp_cookiejar(args):
|
||||
import yt_dlp
|
||||
from yt_dlp import parse_options
|
||||
|
||||
logger.debug(f"Extracting cookies from settings: {args[1]}")
|
||||
# parse_options returns a named tuple as follows, we only need the ydl_options part
|
||||
# collections.namedtuple('ParsedOptions', ('parser', 'options', 'urls', 'ydl_opts'))
|
||||
ytdlp_opts = getattr(parse_options(args), 'ydl_opts')
|
||||
ytdlp_opts = getattr(parse_options(args), "ydl_opts")
|
||||
return yt_dlp.YoutubeDL(ytdlp_opts).cookiejar
|
||||
|
||||
get_cookiejar_options = None
|
||||
@@ -125,22 +126,21 @@ If so, edit your authentication settings to make sure it exactly matches.")
|
||||
# 3. cookies_from_browser setting in global config
|
||||
# 4. cookies_file setting in global config
|
||||
|
||||
if 'cookies_from_browser' in authdict:
|
||||
get_cookiejar_options = ['--cookies-from-browser', authdict['cookies_from_browser']]
|
||||
elif 'cookies_file' in authdict:
|
||||
get_cookiejar_options = ['--cookies', authdict['cookies_file']]
|
||||
elif 'cookies_from_browser' in self.authentication:
|
||||
authdict['cookies_from_browser'] = self.authentication['cookies_from_browser']
|
||||
get_cookiejar_options = ['--cookies-from-browser', self.authentication['cookies_from_browser']]
|
||||
elif 'cookies_file' in self.authentication:
|
||||
authdict['cookies_file'] = self.authentication['cookies_file']
|
||||
get_cookiejar_options = ['--cookies', self.authentication['cookies_file']]
|
||||
if "cookies_from_browser" in authdict:
|
||||
get_cookiejar_options = ["--cookies-from-browser", authdict["cookies_from_browser"]]
|
||||
elif "cookies_file" in authdict:
|
||||
get_cookiejar_options = ["--cookies", authdict["cookies_file"]]
|
||||
elif "cookies_from_browser" in self.authentication:
|
||||
authdict["cookies_from_browser"] = self.authentication["cookies_from_browser"]
|
||||
get_cookiejar_options = ["--cookies-from-browser", self.authentication["cookies_from_browser"]]
|
||||
elif "cookies_file" in self.authentication:
|
||||
authdict["cookies_file"] = self.authentication["cookies_file"]
|
||||
get_cookiejar_options = ["--cookies", self.authentication["cookies_file"]]
|
||||
|
||||
|
||||
if get_cookiejar_options:
|
||||
authdict['cookies_jar'] = get_ytdlp_cookiejar(get_cookiejar_options)
|
||||
authdict["cookies_jar"] = get_ytdlp_cookiejar(get_cookiejar_options)
|
||||
|
||||
return authdict
|
||||
|
||||
|
||||
def repr(self):
|
||||
return f"Module<'{self.display_name}' (config: {self.config[self.name]})>"
|
||||
return f"Module<'{self.display_name}' (config: {self.config[self.name]})>"
|
||||
|
||||
@@ -6,7 +6,7 @@ flexible setup in various environments.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from ruamel.yaml import YAML, CommentedMap, add_representer
|
||||
from ruamel.yaml import YAML, CommentedMap
|
||||
import json
|
||||
|
||||
from loguru import logger
|
||||
@@ -19,12 +19,14 @@ _yaml: YAML = YAML()
|
||||
|
||||
DEFAULT_CONFIG_FILE = "secrets/orchestration.yaml"
|
||||
|
||||
EMPTY_CONFIG = _yaml.load("""
|
||||
EMPTY_CONFIG = _yaml.load(
|
||||
"""
|
||||
# Auto Archiver Configuration
|
||||
|
||||
# Steps are the modules that will be run in the order they are defined
|
||||
steps:""" + "".join([f"\n {module}s: []" for module in MODULE_TYPES]) + \
|
||||
"""
|
||||
steps:"""
|
||||
+ "".join([f"\n {module}s: []" for module in MODULE_TYPES])
|
||||
+ """
|
||||
|
||||
# Global configuration
|
||||
|
||||
@@ -51,50 +53,54 @@ authentication: {}
|
||||
logging:
|
||||
level: INFO
|
||||
|
||||
""")
|
||||
"""
|
||||
)
|
||||
# note: 'logging' is explicitly added above in order to better format the config file
|
||||
|
||||
|
||||
# Arg Parse Actions/Classes
|
||||
class AuthenticationJsonParseAction(argparse.Action):
|
||||
def __call__(self, parser, namespace, values, option_string=None):
|
||||
|
||||
try:
|
||||
auth_dict = json.loads(values)
|
||||
setattr(namespace, self.dest, auth_dict)
|
||||
except json.JSONDecodeError as e:
|
||||
raise argparse.ArgumentTypeError(f"Invalid JSON input for argument '{self.dest}': {e}")
|
||||
raise argparse.ArgumentTypeError(f"Invalid JSON input for argument '{self.dest}': {e}") from e
|
||||
|
||||
def load_from_file(path):
|
||||
try:
|
||||
with open(path, 'r') as f:
|
||||
with open(path, "r") as f:
|
||||
try:
|
||||
auth_dict = json.load(f)
|
||||
except json.JSONDecodeError:
|
||||
f.seek(0)
|
||||
# maybe it's yaml, try that
|
||||
auth_dict = _yaml.load(f)
|
||||
if auth_dict.get('authentication'):
|
||||
auth_dict = auth_dict['authentication']
|
||||
auth_dict['load_from_file'] = path
|
||||
if auth_dict.get("authentication"):
|
||||
auth_dict = auth_dict["authentication"]
|
||||
auth_dict["load_from_file"] = path
|
||||
return auth_dict
|
||||
except:
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
if isinstance(auth_dict, dict) and auth_dict.get('from_file'):
|
||||
auth_dict = load_from_file(auth_dict['from_file'])
|
||||
if isinstance(auth_dict, dict) and auth_dict.get("from_file"):
|
||||
auth_dict = load_from_file(auth_dict["from_file"])
|
||||
elif isinstance(auth_dict, str):
|
||||
# if it's a string
|
||||
auth_dict = load_from_file(auth_dict)
|
||||
|
||||
|
||||
if not isinstance(auth_dict, dict):
|
||||
raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
|
||||
global_options = ['cookies_from_browser', 'cookies_file', 'load_from_file']
|
||||
raise argparse.ArgumentTypeError(
|
||||
"Authentication must be a dictionary of site names and their authentication methods"
|
||||
)
|
||||
global_options = ["cookies_from_browser", "cookies_file", "load_from_file"]
|
||||
for key, auth in auth_dict.items():
|
||||
if key in global_options:
|
||||
continue
|
||||
if not isinstance(key, str) or not isinstance(auth, dict):
|
||||
raise argparse.ArgumentTypeError(f"Authentication must be a dictionary of site names and their authentication methods. Valid global configs are {global_options}")
|
||||
raise argparse.ArgumentTypeError(
|
||||
f"Authentication must be a dictionary of site names and their authentication methods. Valid global configs are {global_options}"
|
||||
)
|
||||
|
||||
setattr(namespace, self.dest, auth_dict)
|
||||
|
||||
@@ -105,8 +111,8 @@ class UniqueAppendAction(argparse.Action):
|
||||
if value not in getattr(namespace, self.dest):
|
||||
getattr(namespace, self.dest).append(value)
|
||||
|
||||
class DefaultValidatingParser(argparse.ArgumentParser):
|
||||
|
||||
class DefaultValidatingParser(argparse.ArgumentParser):
|
||||
def error(self, message):
|
||||
"""
|
||||
Override of error to format a nicer looking error message using logger
|
||||
@@ -135,8 +141,10 @@ class DefaultValidatingParser(argparse.ArgumentParser):
|
||||
|
||||
return super().parse_known_args(args, namespace)
|
||||
|
||||
|
||||
# Config Utils
|
||||
|
||||
|
||||
def to_dot_notation(yaml_conf: CommentedMap | dict) -> dict:
|
||||
dotdict = {}
|
||||
|
||||
@@ -150,6 +158,7 @@ def to_dot_notation(yaml_conf: CommentedMap | dict) -> dict:
|
||||
process_subdict(yaml_conf)
|
||||
return dotdict
|
||||
|
||||
|
||||
def from_dot_notation(dotdict: dict) -> dict:
|
||||
normal_dict = {}
|
||||
|
||||
@@ -170,9 +179,11 @@ def from_dot_notation(dotdict: dict) -> dict:
|
||||
def is_list_type(value):
|
||||
return isinstance(value, list) or isinstance(value, tuple) or isinstance(value, set)
|
||||
|
||||
|
||||
def is_dict_type(value):
|
||||
return isinstance(value, dict) or isinstance(value, CommentedMap)
|
||||
|
||||
|
||||
def merge_dicts(dotdict: dict, yaml_dict: CommentedMap) -> CommentedMap:
|
||||
yaml_dict: CommentedMap = deepcopy(yaml_dict)
|
||||
|
||||
@@ -183,7 +194,7 @@ def merge_dicts(dotdict: dict, yaml_dict: CommentedMap) -> CommentedMap:
|
||||
yaml_subdict[key] = value
|
||||
continue
|
||||
|
||||
if key == 'steps':
|
||||
if key == "steps":
|
||||
for module_type, modules in value.items():
|
||||
# overwrite the 'steps' from the config file with the ones from the CLI
|
||||
yaml_subdict[key][module_type] = modules
|
||||
@@ -198,6 +209,7 @@ def merge_dicts(dotdict: dict, yaml_dict: CommentedMap) -> CommentedMap:
|
||||
update_dict(from_dot_notation(dotdict), yaml_dict)
|
||||
return yaml_dict
|
||||
|
||||
|
||||
def read_yaml(yaml_filename: str) -> CommentedMap:
|
||||
config = None
|
||||
try:
|
||||
@@ -211,6 +223,7 @@ def read_yaml(yaml_filename: str) -> CommentedMap:
|
||||
|
||||
return config
|
||||
|
||||
|
||||
# TODO: make this tidier/find a way to notify of which keys should not be stored
|
||||
|
||||
|
||||
@@ -218,13 +231,14 @@ def store_yaml(config: CommentedMap, yaml_filename: str) -> None:
|
||||
config_to_save = deepcopy(config)
|
||||
|
||||
auth_dict = config_to_save.get("authentication", {})
|
||||
if auth_dict and auth_dict.get('load_from_file'):
|
||||
if auth_dict and auth_dict.get("load_from_file"):
|
||||
# remove all other values from the config, don't want to store it in the config file
|
||||
auth_dict = {"load_from_file": auth_dict["load_from_file"]}
|
||||
|
||||
config_to_save.pop('urls', None)
|
||||
config_to_save.pop("urls", None)
|
||||
with open(yaml_filename, "w", encoding="utf-8") as outf:
|
||||
_yaml.dump(config_to_save, outf)
|
||||
|
||||
|
||||
def is_valid_config(config: CommentedMap) -> bool:
|
||||
return config and config != EMPTY_CONFIG
|
||||
return config and config != EMPTY_CONFIG
|
||||
|
||||
@@ -1,25 +1,19 @@
|
||||
class SetupError(ValueError):
|
||||
pass
|
||||
|
||||
MODULE_TYPES = [
|
||||
'feeder',
|
||||
'extractor',
|
||||
'enricher',
|
||||
'database',
|
||||
'storage',
|
||||
'formatter'
|
||||
]
|
||||
|
||||
MODULE_TYPES = ["feeder", "extractor", "enricher", "database", "storage", "formatter"]
|
||||
|
||||
MANIFEST_FILE = "__manifest__.py"
|
||||
|
||||
DEFAULT_MANIFEST = {
|
||||
'name': '', # the display name of the module
|
||||
'author': 'Bellingcat', # creator of the module, leave this as Bellingcat or set your own name!
|
||||
'type': [], # the type of the module, can be one or more of MODULE_TYPES
|
||||
'requires_setup': True, # whether or not this module requires additional setup such as setting API Keys or installing additional software
|
||||
'description': '', # a description of the module
|
||||
'dependencies': {}, # external dependencies, e.g. python packages or binaries, in dictionary format
|
||||
'entry_point': '', # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName
|
||||
'version': '1.0', # the version of the module
|
||||
'configs': {} # any configuration options this module has, these will be exposed to the user in the config file or via the command line
|
||||
}
|
||||
"name": "", # the display name of the module
|
||||
"author": "Bellingcat", # creator of the module, leave this as Bellingcat or set your own name!
|
||||
"type": [], # the type of the module, can be one or more of MODULE_TYPES
|
||||
"requires_setup": True, # whether or not this module requires additional setup such as setting API Keys or installing additional software
|
||||
"description": "", # a description of the module
|
||||
"dependencies": {}, # external dependencies, e.g. python packages or binaries, in dictionary format
|
||||
"entry_point": "", # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName
|
||||
"version": "1.0", # the version of the module
|
||||
"configs": {}, # any configuration options this module has, these will be exposed to the user in the config file or via the command line
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
"""
|
||||
Database module for the auto-archiver that defines the interface for implementing database modules
|
||||
in the media archiving framework.
|
||||
in the media archiving framework.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
@@ -9,6 +9,7 @@ from typing import Union
|
||||
|
||||
from auto_archiver.core import Metadata, BaseModule
|
||||
|
||||
|
||||
class Database(BaseModule):
|
||||
"""
|
||||
Base class for implementing database modules in the media archiving framework.
|
||||
@@ -20,7 +21,7 @@ class Database(BaseModule):
|
||||
"""signals the DB that the given item archival has started"""
|
||||
pass
|
||||
|
||||
def failed(self, item: Metadata, reason:str) -> None:
|
||||
def failed(self, item: Metadata, reason: str) -> None:
|
||||
"""update DB accordingly for failure"""
|
||||
pass
|
||||
|
||||
@@ -34,6 +35,6 @@ class Database(BaseModule):
|
||||
return False
|
||||
|
||||
@abstractmethod
|
||||
def done(self, item: Metadata, cached: bool=False) -> None:
|
||||
def done(self, item: Metadata, cached: bool = False) -> None:
|
||||
"""archival result ready - should be saved to DB"""
|
||||
pass
|
||||
|
||||
@@ -8,13 +8,15 @@ the archiving step and before storage or formatting.
|
||||
|
||||
Enrichers are optional but highly useful for making the archived data more powerful.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
from abc import abstractmethod
|
||||
from auto_archiver.core import Metadata, BaseModule
|
||||
|
||||
|
||||
class Enricher(BaseModule):
|
||||
"""Base classes and utilities for enrichers in the Auto Archiver system.
|
||||
|
||||
|
||||
Enricher modules must implement the `enrich` method to define their behavior.
|
||||
"""
|
||||
|
||||
|
||||
@@ -1,17 +1,15 @@
|
||||
""" The `extractor` module defines the base functionality for implementing extractors in the media archiving framework.
|
||||
This class provides common utility methods and a standard interface for extractors.
|
||||
"""The `extractor` module defines the base functionality for implementing extractors in the media archiving framework.
|
||||
This class provides common utility methods and a standard interface for extractors.
|
||||
|
||||
Factory method to initialize an extractor instance based on its name.
|
||||
Factory method to initialize an extractor instance based on its name.
|
||||
|
||||
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
from pathlib import Path
|
||||
from abc import abstractmethod
|
||||
from dataclasses import dataclass
|
||||
import mimetypes
|
||||
import os
|
||||
import mimetypes
|
||||
import requests
|
||||
from loguru import logger
|
||||
from retrying import retry
|
||||
@@ -39,7 +37,7 @@ class Extractor(BaseModule):
|
||||
Used to clean unnecessary URL parameters OR unfurl redirect links
|
||||
"""
|
||||
return url
|
||||
|
||||
|
||||
def match_link(self, url: str) -> re.Match:
|
||||
"""
|
||||
Returns a match object if the given URL matches the valid_url pattern or False/None if not.
|
||||
@@ -58,7 +56,7 @@ class Extractor(BaseModule):
|
||||
"""
|
||||
if self.valid_url:
|
||||
return self.match_link(url) is not None
|
||||
|
||||
|
||||
return True
|
||||
|
||||
def _guess_file_type(self, path: str) -> str:
|
||||
@@ -74,17 +72,17 @@ class Extractor(BaseModule):
|
||||
@retry(wait_random_min=500, wait_random_max=3500, stop_max_attempt_number=5)
|
||||
def download_from_url(self, url: str, to_filename: str = None, verbose=True) -> str:
|
||||
"""
|
||||
downloads a URL to provided filename, or inferred from URL, returns local filename
|
||||
downloads a URL to provided filename, or inferred from URL, returns local filename
|
||||
"""
|
||||
if not to_filename:
|
||||
to_filename = url.split('/')[-1].split('?')[0]
|
||||
to_filename = url.split("/")[-1].split("?")[0]
|
||||
if len(to_filename) > 64:
|
||||
to_filename = to_filename[-64:]
|
||||
to_filename = os.path.join(self.tmp_dir, to_filename)
|
||||
if verbose:
|
||||
logger.debug(f"downloading {url[0:50]=} {to_filename=}")
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
|
||||
}
|
||||
try:
|
||||
d = requests.get(url, stream=True, headers=headers, timeout=30)
|
||||
@@ -92,12 +90,12 @@ class Extractor(BaseModule):
|
||||
|
||||
# get mimetype from the response headers
|
||||
if not mimetypes.guess_type(to_filename)[0]:
|
||||
content_type = d.headers.get('Content-Type') or self._guess_file_type(url)
|
||||
content_type = d.headers.get("Content-Type") or self._guess_file_type(url)
|
||||
extension = mimetypes.guess_extension(content_type)
|
||||
if extension:
|
||||
to_filename += extension
|
||||
|
||||
with open(to_filename, 'wb') as f:
|
||||
with open(to_filename, "wb") as f:
|
||||
for chunk in d.iter_content(chunk_size=8192):
|
||||
f.write(chunk)
|
||||
return to_filename
|
||||
@@ -109,8 +107,8 @@ class Extractor(BaseModule):
|
||||
def download(self, item: Metadata) -> Metadata | False:
|
||||
"""
|
||||
Downloads the media from the given URL and returns a Metadata object with the downloaded media.
|
||||
|
||||
|
||||
If the URL is not supported or the download fails, this method should return False.
|
||||
|
||||
"""
|
||||
pass
|
||||
pass
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
"""
|
||||
The feeder base module defines the interface for implementing feeders in the media archiving framework.
|
||||
The feeder base module defines the interface for implementing feeders in the media archiving framework.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
@@ -7,8 +7,8 @@ from abc import abstractmethod
|
||||
from auto_archiver.core import Metadata
|
||||
from auto_archiver.core import BaseModule
|
||||
|
||||
class Feeder(BaseModule):
|
||||
|
||||
class Feeder(BaseModule):
|
||||
"""
|
||||
Base class for implementing feeders in the media archiving framework.
|
||||
|
||||
@@ -19,7 +19,7 @@ class Feeder(BaseModule):
|
||||
def __iter__(self) -> Metadata:
|
||||
"""
|
||||
Returns an iterator (use `yield`) over the items to be archived.
|
||||
|
||||
|
||||
These should be instances of Metadata, typically created with Metadata().set_url(url).
|
||||
"""
|
||||
return None
|
||||
return None
|
||||
|
||||
@@ -12,7 +12,7 @@ from auto_archiver.core import Metadata, Media, BaseModule
|
||||
class Formatter(BaseModule):
|
||||
"""
|
||||
Base class for implementing formatters in the media archiving framework.
|
||||
|
||||
|
||||
Subclasses must implement the `format` method to define their behavior.
|
||||
"""
|
||||
|
||||
@@ -21,4 +21,4 @@ class Formatter(BaseModule):
|
||||
"""
|
||||
Formats a Metadata object into a user-viewable format (e.g. HTML) and stores it if needed.
|
||||
"""
|
||||
return None
|
||||
return None
|
||||
|
||||
@@ -26,6 +26,7 @@ class Media:
|
||||
- properties: Additional metadata or transformations for the media.
|
||||
- _mimetype: The media's mimetype (e.g., image/jpeg, video/mp4).
|
||||
"""
|
||||
|
||||
filename: str
|
||||
_key: str = None
|
||||
urls: List[str] = field(default_factory=list)
|
||||
@@ -51,14 +52,15 @@ class Media:
|
||||
This function returns a generator for all the inner media.
|
||||
|
||||
"""
|
||||
if include_self: yield self
|
||||
if include_self:
|
||||
yield self
|
||||
for prop in self.properties.values():
|
||||
if isinstance(prop, Media):
|
||||
if isinstance(prop, Media):
|
||||
for inner_media in prop.all_inner_media(include_self=True):
|
||||
yield inner_media
|
||||
if isinstance(prop, list):
|
||||
for prop_media in prop:
|
||||
if isinstance(prop_media, Media):
|
||||
if isinstance(prop_media, Media):
|
||||
for inner_media in prop_media.all_inner_media(include_self=True):
|
||||
yield inner_media
|
||||
|
||||
@@ -113,15 +115,17 @@ class Media:
|
||||
# checks for video streams with ffmpeg, or min file size for a video
|
||||
# self.is_video() should be used together with this method
|
||||
try:
|
||||
streams = ffmpeg.probe(self.filename, select_streams='v')['streams']
|
||||
streams = ffmpeg.probe(self.filename, select_streams="v")["streams"]
|
||||
logger.warning(f"STREAMS FOR {self.filename} {streams}")
|
||||
return any(s.get("duration_ts", 0) > 0 for s in streams)
|
||||
except Error: return False # ffmpeg errors when reading bad files
|
||||
except Error:
|
||||
return False # ffmpeg errors when reading bad files
|
||||
except Exception as e:
|
||||
logger.error(e)
|
||||
logger.error(traceback.format_exc())
|
||||
try:
|
||||
fsize = os.path.getsize(self.filename)
|
||||
return fsize > 20_000
|
||||
except: pass
|
||||
except Exception as e:
|
||||
pass
|
||||
return True
|
||||
|
||||
@@ -13,7 +13,7 @@ from __future__ import annotations
|
||||
import hashlib
|
||||
from typing import Any, List, Union, Dict
|
||||
from dataclasses import dataclass, field
|
||||
from dataclasses_json import dataclass_json, config
|
||||
from dataclasses_json import dataclass_json
|
||||
import datetime
|
||||
from urllib.parse import urlparse
|
||||
from dateutil.parser import parse as parse_dt
|
||||
@@ -21,6 +21,7 @@ from loguru import logger
|
||||
|
||||
from .media import Media
|
||||
|
||||
|
||||
@dataclass_json # annotation order matters
|
||||
@dataclass
|
||||
class Metadata:
|
||||
@@ -40,19 +41,23 @@ class Metadata:
|
||||
- If `True`, this instance's values are overwritten by `right`.
|
||||
- If `False`, the inverse applies.
|
||||
"""
|
||||
if not right: return self
|
||||
if not right:
|
||||
return self
|
||||
if overwrite_left:
|
||||
if right.status and len(right.status):
|
||||
self.status = right.status
|
||||
self._context.update(right._context)
|
||||
for k, v in right.metadata.items():
|
||||
assert k not in self.metadata or type(v) == type(self.get(k))
|
||||
if type(v) not in [dict, list, set] or k not in self.metadata:
|
||||
assert k not in self.metadata or type(v) is type(self.get(k))
|
||||
if not isinstance(v, (dict, list, set)) or k not in self.metadata:
|
||||
self.set(k, v)
|
||||
else: # key conflict
|
||||
if type(v) in [dict, set]: self.set(k, self.get(k) | v)
|
||||
elif type(v) == list: self.set(k, self.get(k) + v)
|
||||
if isinstance(v, (dict, set)):
|
||||
self.set(k, self.get(k) | v)
|
||||
elif type(v) is list:
|
||||
self.set(k, self.get(k) + v)
|
||||
self.media.extend(right.media)
|
||||
|
||||
else: # invert and do same logic
|
||||
return right.merge(self)
|
||||
return self
|
||||
@@ -69,7 +74,7 @@ class Metadata:
|
||||
|
||||
def append(self, key: str, val: Any) -> Metadata:
|
||||
if key not in self.metadata:
|
||||
self.metadata[key] = []
|
||||
self.metadata[key] = []
|
||||
self.metadata[key] = val
|
||||
return self
|
||||
|
||||
@@ -80,24 +85,26 @@ class Metadata:
|
||||
return self.metadata.get(key, default)
|
||||
|
||||
def success(self, context: str = None) -> Metadata:
|
||||
if context: self.status = f"{context}: success"
|
||||
else: self.status = "success"
|
||||
if context:
|
||||
self.status = f"{context}: success"
|
||||
else:
|
||||
self.status = "success"
|
||||
return self
|
||||
|
||||
def is_success(self) -> bool:
|
||||
return "success" in self.status
|
||||
|
||||
def is_empty(self) -> bool:
|
||||
meaningfull_ids = set(self.metadata.keys()) - set(["_processed_at", "url", "total_bytes", "total_size", "archive_duration_seconds"])
|
||||
meaningfull_ids = set(self.metadata.keys()) - set(
|
||||
["_processed_at", "url", "total_bytes", "total_size", "archive_duration_seconds"]
|
||||
)
|
||||
return not self.is_success() and len(self.media) == 0 and len(meaningfull_ids) == 0
|
||||
|
||||
@property # getter .netloc
|
||||
def netloc(self) -> str:
|
||||
return urlparse(self.get_url()).netloc
|
||||
|
||||
|
||||
# custom getter/setters
|
||||
|
||||
# custom getter/setters
|
||||
|
||||
def set_url(self, url: str) -> Metadata:
|
||||
assert type(url) is str and len(url) > 0, "invalid URL"
|
||||
@@ -120,36 +127,43 @@ class Metadata:
|
||||
return self.get("title")
|
||||
|
||||
def set_timestamp(self, timestamp: datetime.datetime) -> Metadata:
|
||||
if type(timestamp) == str:
|
||||
if isinstance(timestamp, str):
|
||||
timestamp = parse_dt(timestamp)
|
||||
assert type(timestamp) == datetime.datetime, "set_timestamp expects a datetime instance"
|
||||
assert isinstance(timestamp, datetime.datetime), "set_timestamp expects a datetime instance"
|
||||
return self.set("timestamp", timestamp)
|
||||
|
||||
def get_timestamp(self, utc=True, iso=True) -> datetime.datetime:
|
||||
def get_timestamp(self, utc=True, iso=True) -> datetime.datetime | str | None:
|
||||
ts = self.get("timestamp")
|
||||
if not ts: return
|
||||
if not ts:
|
||||
return None
|
||||
try:
|
||||
if type(ts) == str: ts = datetime.datetime.fromisoformat(ts)
|
||||
if type(ts) == float: ts = datetime.datetime.fromtimestamp(ts)
|
||||
if utc: ts = ts.replace(tzinfo=datetime.timezone.utc)
|
||||
if iso: return ts.isoformat()
|
||||
return ts
|
||||
if isinstance(ts, str):
|
||||
ts = datetime.datetime.fromisoformat(ts)
|
||||
elif isinstance(ts, float):
|
||||
ts = datetime.datetime.fromtimestamp(ts)
|
||||
if utc:
|
||||
ts = ts.replace(tzinfo=datetime.timezone.utc)
|
||||
return ts.isoformat() if iso else ts
|
||||
except Exception as e:
|
||||
logger.error(f"Unable to parse timestamp {ts}: {e}")
|
||||
return
|
||||
return None
|
||||
|
||||
def add_media(self, media: Media, id: str = None) -> Metadata:
|
||||
# adds a new media, optionally including an id
|
||||
if media is None: return
|
||||
if media is None:
|
||||
return
|
||||
if id is not None:
|
||||
assert not len([1 for m in self.media if m.get("id") == id]), f"cannot add 2 pieces of media with the same id {id}"
|
||||
assert not len([1 for m in self.media if m.get("id") == id]), (
|
||||
f"cannot add 2 pieces of media with the same id {id}"
|
||||
)
|
||||
media.set("id", id)
|
||||
self.media.append(media)
|
||||
return media
|
||||
|
||||
def get_media_by_id(self, id: str, default=None) -> Media:
|
||||
for m in self.media:
|
||||
if m.get("id") == id: return m
|
||||
if m.get("id") == id:
|
||||
return m
|
||||
return default
|
||||
|
||||
def remove_duplicate_media_by_hash(self) -> None:
|
||||
@@ -159,7 +173,8 @@ class Metadata:
|
||||
with open(filename, "rb") as f:
|
||||
while True:
|
||||
buf = f.read(chunksize)
|
||||
if not buf: break
|
||||
if not buf:
|
||||
break
|
||||
hash_algo.update(buf)
|
||||
return hash_algo.hexdigest()
|
||||
|
||||
@@ -167,15 +182,18 @@ class Metadata:
|
||||
new_media = []
|
||||
for m in self.media:
|
||||
h = m.get("hash")
|
||||
if not h: h = calculate_hash_in_chunks(hashlib.sha256(), int(1.6e7), m.filename)
|
||||
if len(h) and h in media_hashes: continue
|
||||
if not h:
|
||||
h = calculate_hash_in_chunks(hashlib.sha256(), int(1.6e7), m.filename)
|
||||
if len(h) and h in media_hashes:
|
||||
continue
|
||||
media_hashes.add(h)
|
||||
new_media.append(m)
|
||||
self.media = new_media
|
||||
|
||||
def get_first_image(self, default=None) -> Media:
|
||||
for m in self.media:
|
||||
if "image" in m.mimetype: return m
|
||||
if "image" in m.mimetype:
|
||||
return m
|
||||
return default
|
||||
|
||||
def set_final_media(self, final: Media) -> Metadata:
|
||||
@@ -193,22 +211,25 @@ class Metadata:
|
||||
def __str__(self) -> str:
|
||||
return self.__repr__()
|
||||
|
||||
|
||||
@staticmethod
|
||||
def choose_most_complete(results: List[Metadata]) -> Metadata:
|
||||
# returns the most complete result from a list of results
|
||||
# prioritizes results with more media, then more metadata
|
||||
if len(results) == 0: return None
|
||||
if len(results) == 1: return results[0]
|
||||
if len(results) == 0:
|
||||
return None
|
||||
if len(results) == 1:
|
||||
return results[0]
|
||||
most_complete = results[0]
|
||||
for r in results[1:]:
|
||||
if len(r.media) > len(most_complete.media): most_complete = r
|
||||
elif len(r.media) == len(most_complete.media) and len(r.metadata) > len(most_complete.metadata): most_complete = r
|
||||
if len(r.media) > len(most_complete.media):
|
||||
most_complete = r
|
||||
elif len(r.media) == len(most_complete.media) and len(r.metadata) > len(most_complete.metadata):
|
||||
most_complete = r
|
||||
return most_complete
|
||||
|
||||
def set_context(self, key: str, val: Any) -> Metadata:
|
||||
self._context[key] = val
|
||||
return self
|
||||
|
||||
|
||||
def get_context(self, key: str, default: Any = None) -> Any:
|
||||
return self._context.get(key, default)
|
||||
return self._context.get(key, default)
|
||||
|
||||
@@ -3,6 +3,7 @@ Defines the Step abstract base class, which acts as a blueprint for steps in the
|
||||
by handling user configuration, validating the steps properties, and implementing dynamic instantiation.
|
||||
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
@@ -24,17 +25,17 @@ if TYPE_CHECKING:
|
||||
|
||||
HAS_SETUP_PATHS = False
|
||||
|
||||
class ModuleFactory:
|
||||
|
||||
class ModuleFactory:
|
||||
def __init__(self):
|
||||
self._lazy_modules = {}
|
||||
|
||||
def setup_paths(self, paths: list[str]) -> None:
|
||||
"""
|
||||
Sets up the paths for the modules to be loaded from
|
||||
|
||||
|
||||
This is necessary for the modules to be imported correctly
|
||||
|
||||
|
||||
"""
|
||||
global HAS_SETUP_PATHS
|
||||
|
||||
@@ -46,11 +47,13 @@ class ModuleFactory:
|
||||
|
||||
# see odoo/module/module.py -> initialize_sys_path
|
||||
if path not in auto_archiver.modules.__path__:
|
||||
if HAS_SETUP_PATHS == True:
|
||||
logger.warning(f"You are attempting to re-initialise the module paths with: '{path}' for a 2nd time. \
|
||||
if HAS_SETUP_PATHS:
|
||||
logger.warning(
|
||||
f"You are attempting to re-initialise the module paths with: '{path}' for a 2nd time. \
|
||||
This could lead to unexpected behaviour. It is recommended to only use a single modules path. \
|
||||
If you wish to load modules from different paths then load a 2nd python interpreter (e.g. using multiprocessing).")
|
||||
auto_archiver.modules.__path__.append(path)
|
||||
If you wish to load modules from different paths then load a 2nd python interpreter (e.g. using multiprocessing)."
|
||||
)
|
||||
auto_archiver.modules.__path__.append(path)
|
||||
|
||||
# sort based on the length of the path, so that the longest path is last in the list
|
||||
auto_archiver.modules.__path__ = sorted(auto_archiver.modules.__path__, key=len, reverse=True)
|
||||
@@ -60,20 +63,20 @@ class ModuleFactory:
|
||||
def get_module(self, module_name: str, config: dict) -> Type[BaseModule]:
|
||||
"""
|
||||
Gets and sets up a module using the provided config
|
||||
|
||||
|
||||
This will actually load and instantiate the module, and load all its dependencies (i.e. not lazy)
|
||||
|
||||
|
||||
"""
|
||||
return self.get_module_lazy(module_name).load(config)
|
||||
|
||||
def get_module_lazy(self, module_name: str, suppress_warnings: bool = False) -> LazyBaseModule:
|
||||
"""
|
||||
Lazily loads a module, returning a LazyBaseModule
|
||||
|
||||
|
||||
This has all the information about the module, but does not load the module itself or its dependencies
|
||||
|
||||
|
||||
To load an actual module, call .setup() on a lazy module
|
||||
|
||||
|
||||
"""
|
||||
if module_name in self._lazy_modules:
|
||||
return self._lazy_modules[module_name]
|
||||
@@ -81,13 +84,14 @@ class ModuleFactory:
|
||||
available = self.available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings)
|
||||
if not available:
|
||||
message = f"Module '{module_name}' not found. Are you sure it's installed/exists?"
|
||||
if 'archiver' in module_name:
|
||||
if "archiver" in module_name:
|
||||
message += f" Did you mean {module_name.replace('archiver', 'extractor')}?"
|
||||
raise IndexError(message)
|
||||
return available[0]
|
||||
|
||||
def available_modules(self, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]:
|
||||
|
||||
def available_modules(
|
||||
self, limit_to_modules: List[str] = [], suppress_warnings: bool = False
|
||||
) -> List[LazyBaseModule]:
|
||||
# search through all valid 'modules' paths. Default is 'modules' in the current directory
|
||||
|
||||
# see odoo/modules/module.py -> get_modules
|
||||
@@ -119,7 +123,7 @@ class ModuleFactory:
|
||||
self._lazy_modules[possible_module] = lazy_module
|
||||
|
||||
all_modules.append(lazy_module)
|
||||
|
||||
|
||||
if not suppress_warnings:
|
||||
for module in limit_to_modules:
|
||||
if not any(module == m.name for m in all_modules):
|
||||
@@ -127,15 +131,16 @@ class ModuleFactory:
|
||||
|
||||
return all_modules
|
||||
|
||||
|
||||
@dataclass
|
||||
class LazyBaseModule:
|
||||
|
||||
"""
|
||||
A lazy module class, which only loads the manifest and does not load the module itself.
|
||||
|
||||
This is useful for getting information about a module without actually loading it.
|
||||
|
||||
"""
|
||||
|
||||
name: str
|
||||
description: str
|
||||
path: str
|
||||
@@ -152,30 +157,30 @@ class LazyBaseModule:
|
||||
|
||||
@property
|
||||
def type(self):
|
||||
return self.manifest['type']
|
||||
return self.manifest["type"]
|
||||
|
||||
@property
|
||||
def entry_point(self):
|
||||
if not self._entry_point and not self.manifest['entry_point']:
|
||||
if not self._entry_point and not self.manifest["entry_point"]:
|
||||
# try to create the entry point from the module name
|
||||
self._entry_point = f"{self.name}::{self.name.replace('_', ' ').title().replace(' ', '')}"
|
||||
return self._entry_point
|
||||
|
||||
@property
|
||||
def dependencies(self) -> dict:
|
||||
return self.manifest['dependencies']
|
||||
|
||||
return self.manifest["dependencies"]
|
||||
|
||||
@property
|
||||
def configs(self) -> dict:
|
||||
return self.manifest['configs']
|
||||
|
||||
return self.manifest["configs"]
|
||||
|
||||
@property
|
||||
def requires_setup(self) -> bool:
|
||||
return self.manifest['requires_setup']
|
||||
|
||||
return self.manifest["requires_setup"]
|
||||
|
||||
@property
|
||||
def display_name(self) -> str:
|
||||
return self.manifest['name']
|
||||
return self.manifest["name"]
|
||||
|
||||
@property
|
||||
def manifest(self) -> dict:
|
||||
@@ -189,17 +194,16 @@ class LazyBaseModule:
|
||||
try:
|
||||
manifest.update(ast.literal_eval(f.read()))
|
||||
except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError) as e:
|
||||
raise ValueError(f"Error loading manifest from file {self.path}/{MANIFEST_FILE}: {e}")
|
||||
|
||||
raise ValueError(f"Error loading manifest from file {self.path}/{MANIFEST_FILE}: {e}") from e
|
||||
|
||||
self._manifest = manifest
|
||||
self._entry_point = manifest['entry_point']
|
||||
self.description = manifest['description']
|
||||
self.version = manifest['version']
|
||||
self._entry_point = manifest["entry_point"]
|
||||
self.description = manifest["description"]
|
||||
self.version = manifest["version"]
|
||||
|
||||
return manifest
|
||||
|
||||
def load(self, config) -> BaseModule:
|
||||
|
||||
if self._instance:
|
||||
return self._instance
|
||||
|
||||
@@ -210,8 +214,10 @@ class LazyBaseModule:
|
||||
# clear out any empty strings that a user may have erroneously added
|
||||
continue
|
||||
if not check(dep):
|
||||
logger.error(f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. \
|
||||
Have you installed the required dependencies for the '{self.name}' module? See the README for more information.")
|
||||
logger.error(
|
||||
f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. \
|
||||
Have you installed the required dependencies for the '{self.name}' module? See the README for more information."
|
||||
)
|
||||
exit(1)
|
||||
|
||||
def check_python_dep(dep):
|
||||
@@ -219,10 +225,10 @@ class LazyBaseModule:
|
||||
try:
|
||||
m = self.module_factory.get_module_lazy(dep, suppress_warnings=True)
|
||||
try:
|
||||
# we must now load this module and set it up with the config
|
||||
# we must now load this module and set it up with the config
|
||||
m.load(config)
|
||||
return True
|
||||
except:
|
||||
except Exception:
|
||||
logger.error(f"Unable to setup module '{dep}' for use in module '{self.name}'")
|
||||
return False
|
||||
except IndexError:
|
||||
@@ -231,13 +237,12 @@ class LazyBaseModule:
|
||||
|
||||
return find_spec(dep)
|
||||
|
||||
check_deps(self.dependencies.get('python', []), check_python_dep)
|
||||
check_deps(self.dependencies.get('bin', []), lambda dep: shutil.which(dep))
|
||||
|
||||
check_deps(self.dependencies.get("python", []), check_python_dep)
|
||||
check_deps(self.dependencies.get("bin", []), lambda dep: shutil.which(dep))
|
||||
|
||||
logger.debug(f"Loading module '{self.display_name}'...")
|
||||
|
||||
for qualname in [self.name, f'auto_archiver.modules.{self.name}']:
|
||||
for qualname in [self.name, f"auto_archiver.modules.{self.name}"]:
|
||||
try:
|
||||
# first import the whole module, to make sure it's working properly
|
||||
__import__(qualname)
|
||||
@@ -246,10 +251,10 @@ class LazyBaseModule:
|
||||
pass
|
||||
|
||||
# then import the file for the entry point
|
||||
file_name, class_name = self.entry_point.split('::')
|
||||
sub_qualname = f'{qualname}.{file_name}'
|
||||
file_name, class_name = self.entry_point.split("::")
|
||||
sub_qualname = f"{qualname}.{file_name}"
|
||||
|
||||
__import__(f'{qualname}.{file_name}', fromlist=[self.entry_point])
|
||||
__import__(f"{qualname}.{file_name}", fromlist=[self.entry_point])
|
||||
# finally, get the class instance
|
||||
instance: BaseModule = getattr(sys.modules[sub_qualname], class_name)()
|
||||
|
||||
@@ -257,11 +262,11 @@ class LazyBaseModule:
|
||||
instance.name = self.name
|
||||
instance.display_name = self.display_name
|
||||
instance.module_factory = self.module_factory
|
||||
|
||||
# merge the default config with the user config
|
||||
default_config = dict((k, v['default']) for k, v in self.configs.items() if 'default' in v)
|
||||
|
||||
config[self.name] = default_config | config.get(self.name, {})
|
||||
# merge the default config with the user config
|
||||
default_config = dict((k, v["default"]) for k, v in self.configs.items() if "default" in v)
|
||||
|
||||
config[self.name] = default_config | config.get(self.name, {})
|
||||
instance.config_setup(config)
|
||||
instance.setup()
|
||||
|
||||
@@ -270,4 +275,4 @@ class LazyBaseModule:
|
||||
return instance
|
||||
|
||||
def __repr__(self):
|
||||
return f"Module<'{self.display_name}' ({self.name})>"
|
||||
return f"Module<'{self.display_name}' ({self.name})>"
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
""" Orchestrates all archiving steps, including feeding items,
|
||||
archiving them with specific archivers, enrichment, storage,
|
||||
formatting, database operations and clean up.
|
||||
"""Orchestrates all archiving steps, including feeding items,
|
||||
archiving them with specific archivers, enrichment, storage,
|
||||
formatting, database operations and clean up.
|
||||
|
||||
"""
|
||||
|
||||
@@ -19,8 +19,17 @@ import requests
|
||||
|
||||
from .metadata import Metadata, Media
|
||||
from auto_archiver.version import __version__
|
||||
from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, is_valid_config, \
|
||||
DefaultValidatingParser, UniqueAppendAction, AuthenticationJsonParseAction, DEFAULT_CONFIG_FILE
|
||||
from .config import (
|
||||
read_yaml,
|
||||
store_yaml,
|
||||
to_dot_notation,
|
||||
merge_dicts,
|
||||
is_valid_config,
|
||||
DefaultValidatingParser,
|
||||
UniqueAppendAction,
|
||||
AuthenticationJsonParseAction,
|
||||
DEFAULT_CONFIG_FILE,
|
||||
)
|
||||
from .module import ModuleFactory, LazyBaseModule
|
||||
from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher
|
||||
from .consts import MODULE_TYPES, SetupError
|
||||
@@ -30,8 +39,8 @@ if TYPE_CHECKING:
|
||||
from .base_module import BaseModule
|
||||
from .module import LazyBaseModule
|
||||
|
||||
class ArchivingOrchestrator:
|
||||
|
||||
class ArchivingOrchestrator:
|
||||
# instance variables
|
||||
module_factory: ModuleFactory
|
||||
setup_finished: bool
|
||||
@@ -61,30 +70,63 @@ class ArchivingOrchestrator:
|
||||
epilog="Check the code at https://github.com/bellingcat/auto-archiver",
|
||||
formatter_class=RichHelpFormatter,
|
||||
)
|
||||
parser.add_argument('--help', '-h', action='store_true', dest='help', help='show a full help message and exit')
|
||||
parser.add_argument('--version', action='version', version=__version__)
|
||||
parser.add_argument('--config', action='store', dest="config_file", help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default=DEFAULT_CONFIG_FILE)
|
||||
parser.add_argument('--mode', action='store', dest='mode', type=str, choices=['simple', 'full'], help='the mode to run the archiver in', default='simple')
|
||||
parser.add_argument("--help", "-h", action="store_true", dest="help", help="show a full help message and exit")
|
||||
parser.add_argument("--version", action="version", version=__version__)
|
||||
parser.add_argument(
|
||||
"--config",
|
||||
action="store",
|
||||
dest="config_file",
|
||||
help="the filename of the YAML configuration file (defaults to 'config.yaml')",
|
||||
default=DEFAULT_CONFIG_FILE,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--mode",
|
||||
action="store",
|
||||
dest="mode",
|
||||
type=str,
|
||||
choices=["simple", "full"],
|
||||
help="the mode to run the archiver in",
|
||||
default="simple",
|
||||
)
|
||||
# override the default 'help' so we can inject all the configs and show those
|
||||
parser.add_argument('-s', '--store', dest='store', default=False, help='Store the created config in the config file', action=argparse.BooleanOptionalAction)
|
||||
parser.add_argument('--module_paths', dest='module_paths', nargs='+', default=[], help='additional paths to search for modules', action=UniqueAppendAction)
|
||||
parser.add_argument(
|
||||
"-s",
|
||||
"--store",
|
||||
dest="store",
|
||||
default=False,
|
||||
help="Store the created config in the config file",
|
||||
action=argparse.BooleanOptionalAction,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--module_paths",
|
||||
dest="module_paths",
|
||||
nargs="+",
|
||||
default=[],
|
||||
help="additional paths to search for modules",
|
||||
action=UniqueAppendAction,
|
||||
)
|
||||
|
||||
self.basic_parser = parser
|
||||
return parser
|
||||
|
||||
|
||||
def check_steps(self, config):
|
||||
for module_type in MODULE_TYPES:
|
||||
if not config['steps'].get(f"{module_type}s", []):
|
||||
if module_type == 'feeder' or module_type == 'formatter' and config['steps'].get(f"{module_type}"):
|
||||
raise SetupError(f"It appears you have '{module_type}' set under 'steps' in your configuration file, but as of version 0.13.0 of Auto Archiver, you must use '{module_type}s'. Change this in your configuration file and try again. \
|
||||
Here's how that would look: \n\nsteps:\n {module_type}s:\n - [your_{module_type}_name_here]\n {'extractors:...' if module_type == 'feeder' else '...'}\n")
|
||||
if module_type == 'extractor' and config['steps'].get('archivers'):
|
||||
raise SetupError(f"As of version 0.13.0 of Auto Archiver, the 'archivers' step name has been changed to 'extractors'. Change this in your configuration file and try again. \
|
||||
Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_here]\n enrichers:...\n")
|
||||
raise SetupError(f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)")
|
||||
if not config["steps"].get(f"{module_type}s", []):
|
||||
if module_type == "feeder" or module_type == "formatter" and config["steps"].get(f"{module_type}"):
|
||||
raise SetupError(
|
||||
f"It appears you have '{module_type}' set under 'steps' in your configuration file, but as of version 0.13.0 of Auto Archiver, you must use '{module_type}s'. Change this in your configuration file and try again. \
|
||||
Here's how that would look: \n\nsteps:\n {module_type}s:\n - [your_{module_type}_name_here]\n {'extractors:...' if module_type == 'feeder' else '...'}\n"
|
||||
)
|
||||
if module_type == "extractor" and config["steps"].get("archivers"):
|
||||
raise SetupError(
|
||||
"As of version 0.13.0 of Auto Archiver, the 'archivers' step name has been changed to 'extractors'. Change this in your configuration file and try again. \
|
||||
Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_here]\n enrichers:...\n"
|
||||
)
|
||||
raise SetupError(
|
||||
f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)"
|
||||
)
|
||||
|
||||
def setup_complete_parser(self, basic_config: dict, yaml_config: dict, unused_args: list[str]) -> None:
|
||||
|
||||
# modules parser to get the overridden 'steps' values
|
||||
modules_parser = argparse.ArgumentParser(
|
||||
add_help=False,
|
||||
@@ -92,7 +134,9 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
self.add_modules_args(modules_parser)
|
||||
cli_modules, unused_args = modules_parser.parse_known_args(unused_args)
|
||||
for module_type in MODULE_TYPES:
|
||||
yaml_config['steps'][f"{module_type}s"] = getattr(cli_modules, f"{module_type}s", []) or yaml_config['steps'].get(f"{module_type}s", [])
|
||||
yaml_config["steps"][f"{module_type}s"] = getattr(cli_modules, f"{module_type}s", []) or yaml_config[
|
||||
"steps"
|
||||
].get(f"{module_type}s", [])
|
||||
|
||||
parser = DefaultValidatingParser(
|
||||
add_help=False,
|
||||
@@ -115,30 +159,32 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
enabled_modules = []
|
||||
# first loads the modules from the config file, then from the command line
|
||||
for module_type in MODULE_TYPES:
|
||||
enabled_modules.extend(yaml_config['steps'].get(f"{module_type}s", []))
|
||||
enabled_modules.extend(yaml_config["steps"].get(f"{module_type}s", []))
|
||||
|
||||
# clear out duplicates, but keep the order
|
||||
enabled_modules = list(dict.fromkeys(enabled_modules))
|
||||
avail_modules = self.module_factory.available_modules(limit_to_modules=enabled_modules, suppress_warnings=True)
|
||||
avail_modules = self.module_factory.available_modules(
|
||||
limit_to_modules=enabled_modules, suppress_warnings=True
|
||||
)
|
||||
self.add_individual_module_args(avail_modules, parser)
|
||||
elif basic_config.mode == 'simple':
|
||||
elif basic_config.mode == "simple":
|
||||
simple_modules = [module for module in self.module_factory.available_modules() if not module.requires_setup]
|
||||
self.add_individual_module_args(simple_modules, parser)
|
||||
|
||||
# add them to the config
|
||||
for module in simple_modules:
|
||||
for module_type in module.type:
|
||||
yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name)
|
||||
yaml_config["steps"].setdefault(f"{module_type}s", []).append(module.name)
|
||||
else:
|
||||
# load all modules, they're not using the 'simple' mode
|
||||
all_modules = self.module_factory.available_modules()
|
||||
# add all the modules to the steps
|
||||
for module in all_modules:
|
||||
for module_type in module.type:
|
||||
yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name)
|
||||
yaml_config["steps"].setdefault(f"{module_type}s", []).append(module.name)
|
||||
|
||||
self.add_individual_module_args(all_modules, parser)
|
||||
|
||||
|
||||
parser.set_defaults(**to_dot_notation(yaml_config))
|
||||
|
||||
# reload the parser with the new arguments, now that we have them
|
||||
@@ -164,43 +210,76 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
store_yaml(config, basic_config.config_file)
|
||||
|
||||
return config
|
||||
|
||||
|
||||
def add_modules_args(self, parser: argparse.ArgumentParser = None):
|
||||
if not parser:
|
||||
parser = self.parser
|
||||
|
||||
# Module loading from the command line
|
||||
for module_type in MODULE_TYPES:
|
||||
parser.add_argument(f'--{module_type}s', dest=f'{module_type}s', nargs='+', help=f'the {module_type}s to use', default=[], action=UniqueAppendAction)
|
||||
parser.add_argument(
|
||||
f"--{module_type}s",
|
||||
dest=f"{module_type}s",
|
||||
nargs="+",
|
||||
help=f"the {module_type}s to use",
|
||||
default=[],
|
||||
action=UniqueAppendAction,
|
||||
)
|
||||
|
||||
def add_additional_args(self, parser: argparse.ArgumentParser = None):
|
||||
if not parser:
|
||||
parser = self.parser
|
||||
|
||||
parser.add_argument('--authentication', dest='authentication', help='A dictionary of sites and their authentication methods \
|
||||
parser.add_argument(
|
||||
"--authentication",
|
||||
dest="authentication",
|
||||
help="A dictionary of sites and their authentication methods \
|
||||
(token, username etc.) that extractors can use to log into \
|
||||
a website. If passing this on the command line, use a JSON string. \
|
||||
You may also pass a path to a valid JSON/YAML file which will be parsed.',
|
||||
default={},
|
||||
nargs="?",
|
||||
action=AuthenticationJsonParseAction)
|
||||
You may also pass a path to a valid JSON/YAML file which will be parsed.",
|
||||
default={},
|
||||
nargs="?",
|
||||
action=AuthenticationJsonParseAction,
|
||||
)
|
||||
|
||||
# logging arguments
|
||||
parser.add_argument('--logging.level', action='store', dest='logging.level', choices=['INFO', 'DEBUG', 'ERROR', 'WARNING'], help='the logging level to use', default='INFO', type=str.upper)
|
||||
parser.add_argument('--logging.file', action='store', dest='logging.file', help='the logging file to write to', default=None)
|
||||
parser.add_argument('--logging.rotation', action='store', dest='logging.rotation', help='the logging rotation to use', default=None)
|
||||
|
||||
def add_individual_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None:
|
||||
parser.add_argument(
|
||||
"--logging.level",
|
||||
action="store",
|
||||
dest="logging.level",
|
||||
choices=["INFO", "DEBUG", "ERROR", "WARNING"],
|
||||
help="the logging level to use",
|
||||
default="INFO",
|
||||
type=str.upper,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--logging.file", action="store", dest="logging.file", help="the logging file to write to", default=None
|
||||
)
|
||||
parser.add_argument(
|
||||
"--logging.rotation",
|
||||
action="store",
|
||||
dest="logging.rotation",
|
||||
help="the logging rotation to use",
|
||||
default=None,
|
||||
)
|
||||
|
||||
def add_individual_module_args(
|
||||
self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None
|
||||
) -> None:
|
||||
if not modules:
|
||||
modules = self.module_factory.available_modules()
|
||||
|
||||
|
||||
for module in modules:
|
||||
if module.name == 'cli_feeder':
|
||||
if module.name == "cli_feeder":
|
||||
# special case. For the CLI feeder, allow passing URLs directly on the command line without setting --cli_feeder.urls=
|
||||
parser.add_argument('urls', nargs='*', default=[], help='URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml')
|
||||
parser.add_argument(
|
||||
"urls",
|
||||
nargs="*",
|
||||
default=[],
|
||||
help="URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
|
||||
)
|
||||
continue
|
||||
|
||||
|
||||
if not module.configs:
|
||||
# this module has no configs, don't show anything in the help
|
||||
# (TODO: do we want to show something about this module though, like a description?)
|
||||
@@ -209,21 +288,21 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
group = parser.add_argument_group(module.display_name or module.name, f"{module.description[:100]}...")
|
||||
|
||||
for name, kwargs in module.configs.items():
|
||||
if not kwargs.get('metavar', None):
|
||||
if not kwargs.get("metavar", None):
|
||||
# make a nicer metavar, metavar is what's used in the help, e.g. --cli_feeder.urls [METAVAR]
|
||||
kwargs['metavar'] = name.upper()
|
||||
kwargs["metavar"] = name.upper()
|
||||
|
||||
if kwargs.get('required', False):
|
||||
if kwargs.get("required", False):
|
||||
# required args shouldn't have a 'default' value, remove it
|
||||
kwargs.pop('default', None)
|
||||
kwargs.pop("default", None)
|
||||
|
||||
kwargs.pop('cli_set', None)
|
||||
should_store = kwargs.pop('should_store', False)
|
||||
kwargs['dest'] = f"{module.name}.{kwargs.pop('dest', name)}"
|
||||
kwargs.pop("cli_set", None)
|
||||
should_store = kwargs.pop("should_store", False)
|
||||
kwargs["dest"] = f"{module.name}.{kwargs.pop('dest', name)}"
|
||||
try:
|
||||
kwargs['type'] = getattr(validators, kwargs.get('type', '__invalid__'))
|
||||
kwargs["type"] = getattr(validators, kwargs.get("type", "__invalid__"))
|
||||
except AttributeError:
|
||||
kwargs['type'] = __builtins__.get(kwargs.get('type'), str)
|
||||
kwargs["type"] = __builtins__.get(kwargs.get("type"), str)
|
||||
arg = group.add_argument(f"--{module.name}.{name}", **kwargs)
|
||||
arg.should_store = should_store
|
||||
|
||||
@@ -238,12 +317,11 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
self.basic_parser.exit()
|
||||
|
||||
def setup_logging(self, config):
|
||||
logging_config = config["logging"]
|
||||
|
||||
logging_config = config['logging']
|
||||
|
||||
if logging_config.get('enabled', True) is False:
|
||||
if logging_config.get("enabled", True) is False:
|
||||
# disabled logging settings, they're set on a higher level
|
||||
logger.disable('auto_archiver')
|
||||
logger.disable("auto_archiver")
|
||||
return
|
||||
|
||||
# setup loguru logging
|
||||
@@ -253,38 +331,45 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
pass
|
||||
|
||||
# add other logging info
|
||||
if self.logger_id is None: # note - need direct comparison to None since need to consider falsy value 0
|
||||
self.logger_id = logger.add(sys.stderr, level=logging_config['level'])
|
||||
if log_file := logging_config['file']:
|
||||
logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation'])
|
||||
if self.logger_id is None: # note - need direct comparison to None since need to consider falsy value 0
|
||||
self.logger_id = logger.add(sys.stderr, level=logging_config["level"])
|
||||
if log_file := logging_config["file"]:
|
||||
logger.add(log_file) if not logging_config["rotation"] else logger.add(
|
||||
log_file, rotation=logging_config["rotation"]
|
||||
)
|
||||
|
||||
def install_modules(self, modules_by_type):
|
||||
"""
|
||||
Traverses all modules in 'steps' and loads them into the orchestrator, storing them in the
|
||||
Traverses all modules in 'steps' and loads them into the orchestrator, storing them in the
|
||||
orchestrator's attributes (self.feeders, self.extractors etc.). If no modules of a certain type
|
||||
are loaded, the program will exit with an error message.
|
||||
"""
|
||||
|
||||
invalid_modules = []
|
||||
for module_type in MODULE_TYPES:
|
||||
|
||||
step_items = []
|
||||
modules_to_load = modules_by_type[f"{module_type}s"]
|
||||
if not modules_to_load:
|
||||
raise SetupError(f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)")
|
||||
raise SetupError(
|
||||
f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)"
|
||||
)
|
||||
|
||||
def check_steps_ok():
|
||||
if not len(step_items):
|
||||
if len(modules_to_load):
|
||||
logger.error(f"Unable to load any {module_type}s. Tried the following, but none were available: {modules_to_load}")
|
||||
raise SetupError(f"NO {module_type.upper()}S LOADED. Please check your configuration and try again.")
|
||||
|
||||
logger.error(
|
||||
f"Unable to load any {module_type}s. Tried the following, but none were available: {modules_to_load}"
|
||||
)
|
||||
raise SetupError(
|
||||
f"NO {module_type.upper()}S LOADED. Please check your configuration and try again."
|
||||
)
|
||||
|
||||
if (module_type == 'feeder' or module_type == 'formatter') and len(step_items) > 1:
|
||||
raise SetupError(f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}")
|
||||
if (module_type == "feeder" or module_type == "formatter") and len(step_items) > 1:
|
||||
raise SetupError(
|
||||
f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}"
|
||||
)
|
||||
|
||||
for module in modules_to_load:
|
||||
|
||||
if module in invalid_modules:
|
||||
continue
|
||||
|
||||
@@ -293,7 +378,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
loaded_module: BaseModule = self.module_factory.get_module(module, self.config)
|
||||
except (KeyboardInterrupt, Exception) as e:
|
||||
logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
|
||||
if loaded_module and module_type == 'extractor':
|
||||
if loaded_module and module_type == "extractor":
|
||||
loaded_module.cleanup()
|
||||
raise e
|
||||
|
||||
@@ -308,11 +393,13 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
|
||||
def load_config(self, config_file: str) -> dict:
|
||||
if not os.path.exists(config_file) and config_file != DEFAULT_CONFIG_FILE:
|
||||
logger.error(f"The configuration file {config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings.")
|
||||
logger.error(
|
||||
f"The configuration file {config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings."
|
||||
)
|
||||
raise FileNotFoundError(f"Configuration file {config_file} not found")
|
||||
|
||||
return read_yaml(config_file)
|
||||
|
||||
|
||||
def setup_config(self, args: list) -> dict:
|
||||
"""
|
||||
Sets up the configuration file, merging the default config with the user's config
|
||||
@@ -335,13 +422,13 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
yaml_config = self.load_config(basic_config.config_file)
|
||||
|
||||
return self.setup_complete_parser(basic_config, yaml_config, unused_args)
|
||||
|
||||
|
||||
def check_for_updates(self):
|
||||
response = requests.get("https://pypi.org/pypi/auto-archiver/json").json()
|
||||
latest_version = response['info']['version']
|
||||
latest_version = response["info"]["version"]
|
||||
# check version compared to current version
|
||||
if latest_version != __version__:
|
||||
if os.environ.get('RUNNING_IN_DOCKER'):
|
||||
if os.environ.get("RUNNING_IN_DOCKER"):
|
||||
update_cmd = "`docker pull bellingcat/auto-archiver:latest`"
|
||||
else:
|
||||
update_cmd = "`pip install --upgrade auto-archiver`"
|
||||
@@ -351,33 +438,36 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
logger.warning(f"Make sure to update to the latest version using: {update_cmd}")
|
||||
logger.warning("")
|
||||
|
||||
|
||||
def setup(self, args: list):
|
||||
"""
|
||||
Function to configure all setup of the orchestrator: setup configs and load modules.
|
||||
|
||||
|
||||
This method should only ever be called once
|
||||
"""
|
||||
|
||||
self.check_for_updates()
|
||||
|
||||
if self.setup_finished:
|
||||
logger.warning("The `setup_config()` function should only ever be run once. \
|
||||
logger.warning(
|
||||
"The `setup_config()` function should only ever be run once. \
|
||||
If you need to re-run the setup, please re-instantiate a new instance of the orchestrator. \
|
||||
For code implementatations, you should call .setup_config() once then you may call .feed() \
|
||||
multiple times to archive multiple URLs.")
|
||||
multiple times to archive multiple URLs."
|
||||
)
|
||||
return
|
||||
|
||||
self.setup_basic_parser()
|
||||
self.config = self.setup_config(args)
|
||||
|
||||
logger.info(f"======== Welcome to the AUTO ARCHIVER ({__version__}) ==========")
|
||||
self.install_modules(self.config['steps'])
|
||||
self.install_modules(self.config["steps"])
|
||||
|
||||
# log out the modules that were loaded
|
||||
for module_type in MODULE_TYPES:
|
||||
logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s")))
|
||||
|
||||
logger.info(
|
||||
f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s"))
|
||||
)
|
||||
|
||||
self.setup_finished = True
|
||||
|
||||
def _command_line_run(self, args: list) -> Generator[Metadata]:
|
||||
@@ -385,9 +475,9 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
This is the main entry point for the orchestrator, when run from the command line.
|
||||
|
||||
:param args: list of arguments to pass to the orchestrator - these are the command line args
|
||||
|
||||
|
||||
You should not call this method from code implementations.
|
||||
|
||||
|
||||
This method sets up the configuration, loads the modules, and runs the feed.
|
||||
If you wish to make code invocations yourself, you should use the 'setup' and 'feed' methods separately.
|
||||
To test configurations, without loading any modules you can also first call 'setup_configs'
|
||||
@@ -405,7 +495,6 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
e.cleanup()
|
||||
|
||||
def feed(self) -> Generator[Metadata]:
|
||||
|
||||
url_count = 0
|
||||
for feeder in self.feeders:
|
||||
for item in feeder:
|
||||
@@ -436,9 +525,9 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
self.cleanup()
|
||||
exit()
|
||||
except Exception as e:
|
||||
logger.error(f'Got unexpected error on item {item}: {e}\n{traceback.format_exc()}')
|
||||
logger.error(f"Got unexpected error on item {item}: {e}\n{traceback.format_exc()}")
|
||||
for d in self.databases:
|
||||
if type(e) == AssertionError:
|
||||
if isinstance(e, AssertionError):
|
||||
d.failed(item, str(e))
|
||||
else:
|
||||
d.failed(item, reason="unexpected error")
|
||||
@@ -451,13 +540,13 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
|
||||
def archive(self, result: Metadata) -> Union[Metadata, None]:
|
||||
"""
|
||||
Runs the archiving process for a single URL
|
||||
1. Each archiver can sanitize its own URLs
|
||||
2. Check for cached results in Databases, and signal start to the databases
|
||||
3. Call Archivers until one succeeds
|
||||
4. Call Enrichers
|
||||
5. Store all downloaded/generated media
|
||||
6. Call selected Formatter and store formatted if needed
|
||||
Runs the archiving process for a single URL
|
||||
1. Each archiver can sanitize its own URLs
|
||||
2. Check for cached results in Databases, and signal start to the databases
|
||||
3. Call Archivers until one succeeds
|
||||
4. Call Enrichers
|
||||
5. Store all downloaded/generated media
|
||||
6. Call selected Formatter and store formatted if needed
|
||||
"""
|
||||
|
||||
original_url = result.get_url().strip()
|
||||
@@ -473,7 +562,8 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
url = a.sanitize_url(url)
|
||||
|
||||
result.set_url(url)
|
||||
if original_url != url: result.set("original_url", original_url)
|
||||
if original_url != url:
|
||||
result.set("original_url", original_url)
|
||||
|
||||
# 2 - notify start to DBs, propagate already archived if feature enabled in DBs
|
||||
cached_result = None
|
||||
@@ -484,7 +574,8 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
if cached_result:
|
||||
logger.debug("Found previously archived entry")
|
||||
for d in self.databases:
|
||||
try: d.done(cached_result, cached=True)
|
||||
try:
|
||||
d.done(cached_result, cached=True)
|
||||
except Exception as e:
|
||||
logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
|
||||
return cached_result
|
||||
@@ -494,13 +585,15 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
logger.info(f"Trying extractor {a.name} for {url}")
|
||||
try:
|
||||
result.merge(a.download(result))
|
||||
if result.is_success(): break
|
||||
if result.is_success():
|
||||
break
|
||||
except Exception as e:
|
||||
logger.error(f"ERROR archiver {a.name}: {e}: {traceback.format_exc()}")
|
||||
|
||||
# 4 - call enrichers to work with archived content
|
||||
for e in self.enrichers:
|
||||
try: e.enrich(result)
|
||||
try:
|
||||
e.enrich(result)
|
||||
except Exception as exc:
|
||||
logger.error(f"ERROR enricher {e.name}: {exc}: {traceback.format_exc()}")
|
||||
|
||||
@@ -518,12 +611,12 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
|
||||
# signal completion to databases and archivers
|
||||
for d in self.databases:
|
||||
try: d.done(result)
|
||||
try:
|
||||
d.done(result)
|
||||
except Exception as e:
|
||||
logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def setup_authentication(self, config: dict) -> dict:
|
||||
"""
|
||||
@@ -532,7 +625,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
Split up strings into multiple sites if they are comma separated
|
||||
"""
|
||||
|
||||
authentication = config.get('authentication', {})
|
||||
authentication = config.get("authentication", {})
|
||||
|
||||
# extract out concatenated sites
|
||||
for key, val in copy(authentication).items():
|
||||
@@ -541,8 +634,8 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
site = site.strip()
|
||||
authentication[site] = val
|
||||
del authentication[key]
|
||||
|
||||
config['authentication'] = authentication
|
||||
|
||||
config["authentication"] = authentication
|
||||
return config
|
||||
|
||||
# Helper Properties
|
||||
|
||||
@@ -23,7 +23,6 @@ from __future__ import annotations
|
||||
from abc import abstractmethod
|
||||
from typing import IO
|
||||
import os
|
||||
import platform
|
||||
|
||||
from loguru import logger
|
||||
from slugify import slugify
|
||||
@@ -33,16 +32,16 @@ from auto_archiver.utils.misc import random_str
|
||||
from auto_archiver.core import Media, BaseModule, Metadata
|
||||
from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher
|
||||
|
||||
|
||||
class Storage(BaseModule):
|
||||
|
||||
"""
|
||||
Base class for implementing storage modules in the media archiving framework.
|
||||
|
||||
Subclasses must implement the `get_cdn_url` and `uploadf` methods to define their behavior.
|
||||
"""
|
||||
|
||||
def store(self, media: Media, url: str, metadata: Metadata=None) -> None:
|
||||
if media.is_stored(in_storage=self):
|
||||
def store(self, media: Media, url: str, metadata: Metadata = None) -> None:
|
||||
if media.is_stored(in_storage=self):
|
||||
logger.debug(f"{media.key} already stored, skipping")
|
||||
return
|
||||
|
||||
@@ -74,18 +73,18 @@ class Storage(BaseModule):
|
||||
This method should not be called directly, but instead be called through the 'store' method,
|
||||
which sets up the media for storage.
|
||||
"""
|
||||
logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key}')
|
||||
with open(media.filename, 'rb') as f:
|
||||
logger.debug(f"[{self.__class__.__name__}] storing file {media.filename} with key {media.key}")
|
||||
with open(media.filename, "rb") as f:
|
||||
return self.uploadf(f, media, **kwargs)
|
||||
|
||||
def set_key(self, media: Media, url: str, metadata: Metadata) -> None:
|
||||
"""takes the media and optionally item info and generates a key"""
|
||||
|
||||
|
||||
if media.key is not None and len(media.key) > 0:
|
||||
# media key is already set
|
||||
return
|
||||
|
||||
folder = metadata.get_context('folder', '')
|
||||
folder = metadata.get_context("folder", "")
|
||||
filename, ext = os.path.splitext(media.filename)
|
||||
|
||||
# Handle path_generator logic
|
||||
@@ -105,12 +104,11 @@ class Storage(BaseModule):
|
||||
filename = random_str(24)
|
||||
elif filename_generator == "static":
|
||||
# load the hash_enricher module
|
||||
he = self.module_factory.get_module("hash_enricher", self.config)
|
||||
he: HashEnricher = self.module_factory.get_module("hash_enricher", self.config)
|
||||
hd = he.calculate_hash(media.filename)
|
||||
filename = hd[:24]
|
||||
else:
|
||||
raise ValueError(f"Invalid filename_generator: {filename_generator}")
|
||||
|
||||
key = os.path.join(folder, path, f"{filename}{ext}")
|
||||
|
||||
media._key = key
|
||||
key = os.path.join(folder, path, f"{filename}{ext}")
|
||||
media._key = key
|
||||
|
||||
@@ -3,11 +3,13 @@ from pathlib import Path
|
||||
import argparse
|
||||
import json
|
||||
|
||||
|
||||
def example_validator(value):
|
||||
if "example" not in value:
|
||||
raise argparse.ArgumentTypeError(f"{value} is not a valid value for this argument")
|
||||
return value
|
||||
|
||||
|
||||
def positive_number(value):
|
||||
if value < 0:
|
||||
raise argparse.ArgumentTypeError(f"{value} is not a positive number")
|
||||
@@ -19,5 +21,6 @@ def valid_file(value):
|
||||
raise argparse.ArgumentTypeError(f"File '{value}' does not exist.")
|
||||
return value
|
||||
|
||||
|
||||
def json_loader(cli_val):
|
||||
return json.loads(cli_val)
|
||||
return json.loads(cli_val)
|
||||
|
||||
Reference in New Issue
Block a user