Files
auto-archiver/src/auto_archiver/core/config.py

132 lines
4.4 KiB
Python

"""
The Config class initializes and parses configurations for all other steps.
It supports CLI argument parsing, loading from YAML file, and overrides to allow
flexible setup in various environments.
"""
import argparse
from ruamel.yaml import YAML, CommentedMap, add_representer
from copy import deepcopy
from .module import MODULE_TYPES
from typing import Any, List, Type
# configurable_parents = [
# Feeder,
# Enricher,
# Extractor,
# Database,
# Storage,
# Formatter
# # Util
# ]
# feeder: Feeder
# formatter: Formatter
# extractors: List[Extractor] = field(default_factory=[])
# enrichers: List[Enricher] = field(default_factory=[])
# storages: List[Storage] = field(default_factory=[])
# databases: List[Database] = field(default_factory=[])
# def __init__(self) -> None:
# self.defaults = {}
# self.cli_ops = {}
# self.config = {}
# def parse(self, use_cli=True, yaml_config_filename: str = None, overwrite_configs: str = {}):
# """
# if yaml_config_filename is provided, the --config argument is ignored,
# useful for library usage when the config values are preloaded
# overwrite_configs is a dict that overwrites the yaml file contents
# """
# # 1. parse CLI values
# if use_cli:
# parser = argparse.ArgumentParser(
# # prog = "auto-archiver",
# description="Auto Archiver is a CLI tool to archive media/metadata from online URLs; it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!",
# epilog="Check the code at https://github.com/bellingcat/auto-archiver"
# )
# parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='orchestration.yaml')
# parser.add_argument('--version', action='version', version=__version__)
# Skeleton configuration: a single "steps" mapping that holds one empty list
# per module type, keyed by the pluralised type name (f"{module_type}s").
EMPTY_CONFIG = CommentedMap(
    steps={f"{module_type}s": [] for module_type in MODULE_TYPES},
)
def to_dot_notation(yaml_conf: CommentedMap | dict) -> dict:
    """Flatten a nested mapping into a single-level dict with dotted keys.

    Nested mappings are walked recursively and every leaf value is stored
    under the path of keys that leads to it, joined with ".". For example
    ``{"a": {"b": 1}, "c": 2}`` becomes ``{"a.b": 1, "c": 2}``.

    An empty nested mapping has no leaves and therefore contributes no entry
    to the result.

    :param yaml_conf: the (possibly nested) configuration mapping.
    :return: a new flat dict mapping dotted key paths to leaf values.
    """
    flat = {}

    def flatten(mapping, prefix=""):
        for key, value in mapping.items():
            path = f"{prefix}{key}"
            if is_dict_type(value):
                # descend, extending the prefix with this key and a dot
                flatten(value, f"{path}.")
            else:
                flat[path] = value

    flatten(yaml_conf)
    return flat
def from_dot_notation(dotdict: dict) -> dict:
    """Expand a flat dict with dotted keys into a nested dict.

    Each key is split on "." and every part except the last becomes a nested
    dict level; the last part receives the value. For example
    ``{"a.b": 1, "c": 2}`` becomes ``{"a": {"b": 1}, "c": 2}``.

    :param dotdict: flat mapping whose keys may contain "." separators.
    :return: a new nested dict.
    """
    nested = {}
    for dotted_key, value in dotdict.items():
        # keys without a dot are stored as-is, without being split
        path = dotted_key.split(".") if "." in dotted_key else [dotted_key]
        *parents, leaf = path

        # walk down, creating intermediate levels that do not exist yet
        node = nested
        for part in parents:
            node = node.setdefault(part, {})
        node[leaf] = value
    return nested
def is_list_type(value):
    """Return True when ``value`` is a list, tuple or set (or a subclass of one)."""
    return isinstance(value, (list, tuple, set))
def is_dict_type(value):
    """Return True when ``value`` is a dict or a CommentedMap instance."""
    # plain dicts (and their subclasses) are checked first
    if isinstance(value, dict):
        return True
    return isinstance(value, CommentedMap)
def merge_dicts(dotdict: dict, yaml_dict: CommentedMap) -> CommentedMap:
    """Return a copy of ``yaml_dict`` with the values of ``dotdict`` merged in.

    ``dotdict`` is first expanded with ``from_dot_notation`` and then merged
    recursively into a deep copy of ``yaml_dict`` (the input is not modified):

    * a key that is missing, or whose current value is falsy, takes the new
      value as-is;
    * a mapping merged onto a mapping is merged key by key;
    * a list/tuple/set merged onto a list extends that list with the items it
      does not already contain;
    * anything else (including a container overriding a value of a different
      kind) replaces the current value.

    :param dotdict: flat overrides whose keys use dot notation.
    :param yaml_dict: the base configuration.
    :return: the merged configuration as a new object.
    """
    merged: CommentedMap = deepcopy(yaml_dict)

    # a plain 'update' would replace lists, but lists must be extended instead
    def update_dict(subdict, yaml_subdict):
        for key, value in subdict.items():
            existing = yaml_subdict.get(key)
            if not existing:
                yaml_subdict[key] = value
                continue

            if is_dict_type(value) and is_dict_type(existing):
                update_dict(value, existing)
            elif is_list_type(value) and isinstance(existing, list):
                # the generator is consumed lazily while the list grows, so
                # duplicates inside ``value`` are skipped as well
                existing.extend(item for item in value if item not in existing)
            else:
                # scalar override, or the kinds differ (e.g. a mapping or a
                # list overriding a scalar): replace rather than fail on a
                # missing .get()/.extend()
                yaml_subdict[key] = value

    update_dict(from_dot_notation(dotdict), merged)
    return merged
# Module-level ruamel.yaml handle shared by read_yaml() and store_yaml().
yaml = YAML()
def read_yaml(yaml_filename: str) -> CommentedMap:
    """Load the YAML configuration stored in ``yaml_filename``.

    A missing file is not treated as an error. When the file does not exist,
    or it loads to an empty/falsy document, a fresh copy of ``EMPTY_CONFIG``
    is returned instead.

    :param yaml_filename: path of the YAML file to read (UTF-8 encoded).
    :return: the loaded configuration, or a copy of the empty skeleton.
    """
    config = None
    try:
        with open(yaml_filename, "r", encoding="utf-8") as inf:
            config = yaml.load(inf)
    except FileNotFoundError:
        # deliberate best effort: fall through to the empty configuration
        pass

    if not config:
        # hand out a copy so callers that mutate the result never alter the
        # shared module-level default
        config = deepcopy(EMPTY_CONFIG)
    return config
def store_yaml(config: CommentedMap, yaml_filename: str) -> None:
    """Write ``config`` to ``yaml_filename`` as UTF-8 encoded YAML.

    The file is opened in write mode, so any existing content is replaced.

    :param config: the configuration to serialize.
    :param yaml_filename: path of the file to write.
    """
    with open(yaml_filename, mode="w", encoding="utf-8") as yaml_file:
        yaml.dump(config, yaml_file)