Files
auto-archiver/src/auto_archiver/core/base_module.py
Patrick Robertson d6b4b7a932 Further cleanup
* Removes (partly) the ArchivingOrchestrator
* Removes the cli_feeder module, and makes it the 'default', allowing you to pass URLs directly on the command line, without having to use the cumbersome --cli_feeder.urls. Just do auto-archiver https://my.url.com
* More unit tests
* Improved error handling
2025-01-30 16:44:40 +01:00

100 lines
4.2 KiB
Python

from urllib.parse import urlparse
from typing import Mapping, Any
from abc import ABC
from copy import deepcopy, copy
from tempfile import TemporaryDirectory
from loguru import logger
class BaseModule(ABC):
    """
    Base module class. All modules should inherit from this class.

    The exact methods a class implements will depend on the type of module it is,
    however all modules have a .setup(config: dict) method to run any setup code
    (e.g. logging in to a site, spinning up a browser etc.)

    See BaseModule.MODULE_TYPES for the types of modules you can create, noting that
    a subclass can be of multiple types. For example, a module that extracts data from
    a website and stores it in a database would be both an 'extractor' and a 'database' module.

    Each module is a python package, and should have a __manifest__.py file in the
    same directory as the module file. The __manifest__.py specifies the module information
    like name, author, version, dependencies etc. See BaseModule._DEFAULT_MANIFEST for the
    default manifest structure.
    """

    MODULE_TYPES = [
        'feeder',
        'extractor',
        'enricher',
        'database',
        'storage',
        'formatter'
    ]

    _DEFAULT_MANIFEST = {
        'name': '',  # the display name of the module
        'author': 'Bellingcat',  # creator of the module, leave this as Bellingcat or set your own name!
        'type': [],  # the type of the module, can be one or more of BaseModule.MODULE_TYPES
        'requires_setup': True,  # whether or not this module requires additional setup such as setting API Keys or installing additional software
        'description': '',  # a description of the module
        'dependencies': {},  # external dependencies, e.g. python packages or binaries, in dictionary format
        'entry_point': '',  # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName
        'version': '1.0',  # the version of the module
        'configs': {}  # any configuration options this module has, these will be exposed to the user in the config file or via the command line
    }

    # private copy of the configuration (without the 'authentication' section), set by setup()
    config: Mapping[str, Any]
    # per-site authentication settings, keyed by site name, set by setup()
    authentication: Mapping[str, Mapping[str, str]]
    # key under which this module's own options live in the config; read by setup() and repr()
    name: str

    # this is set by the orchestrator prior to archiving
    tmp_dir: TemporaryDirectory = None

    def setup(self, config: dict):
        """
        Store a private copy of the configuration on this instance.

        The 'authentication' section is removed from the copy and kept separately in
        ``self.authentication``; keys that list several comma-separated sites
        (e.g. ``"x.com,twitter.com"``) are expanded into one entry per site. Every
        option found under ``config[self.name]`` is additionally set as an attribute
        on the instance.

        :param config: the full configuration dict. It is not modified.
        """
        # this is important. Each instance is given its own deepcopied config, so modules cannot
        # change values to affect other modules. The copy is taken *before* the authentication
        # keys are expanded so that the dict passed in by the caller is left untouched.
        config = deepcopy(config)
        authentication = config.pop('authentication', None) or {}

        # extract out concatenated sites. Iterate over a snapshot since the dict is edited in the loop.
        for key, val in list(authentication.items()):
            if "," not in key:
                continue
            del authentication[key]
            for site in key.split(","):
                # tolerate "a.com, b.com" and stray commas such as "a.com,"
                site = site.strip()
                if site:
                    authentication[site] = val

        self.authentication = authentication
        self.config = config
        for key, val in config.get(self.name, {}).items():
            setattr(self, key, val)

    def repr(self):
        """
        Return a human readable description of the module and its own config section.

        NOTE(review): this is a plain method, not ``__repr__``, so the builtin ``repr()``
        does not use it. The output contains this module's config values verbatim.
        """
        # display_name is not declared on this class, so fall back to the module name
        display_name = getattr(self, 'display_name', self.name)
        # a module without its own config section is valid (setup() treats it as empty)
        return f"Module<'{display_name}' (config: {self.config.get(self.name, {})})>"

    def auth_for_site(self, site: str) -> dict:
        """
        Return the authentication settings configured for the given site.

        :param site: a URL (``https://example.com/page``) or a bare domain (``example.com``).
        :return: the matching entry of ``self.authentication``, or an empty dict if there
            is no exact match for the domain (with or without a leading ``www.``).
        """
        # TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com)
        # for now, just hard code those.

        # SECURITY: parse the domain using urllib
        domain = urlparse(site).netloc
        if not domain and "://" not in site:
            # urlparse only recognises a netloc after '//', so a bare 'example.com/path'
            # would otherwise be treated as a path and yield an empty domain
            domain = urlparse(f"//{site}").netloc
        if not domain:
            # nothing to look up; an empty string would also 'fuzzy match' every key below
            return {}

        # check the domain as given, then its variants with and without the 'www.' prefix
        bare_domain = domain[4:] if domain.startswith("www.") else domain
        for to_try in dict.fromkeys((domain, bare_domain, f"www.{bare_domain}")):
            if to_try in self.authentication:
                return self.authentication[to_try]

        # do a fuzzy string match just to print a warning - don't use it since it's insecure
        for key in self.authentication:
            if key and (key in domain or domain in key):
                logger.warning(
                    f"Could not find exact authentication information for site '{domain}'. "
                    f"did find information for '{key}' which is close, is this what you meant? "
                    "If so, edit your authentication settings to make sure it exactly matches."
                )

        return {}