further cleanup

This commit is contained in:
msramalho
2023-01-21 19:57:54 +00:00
parent 9bd8ea0994
commit 746f6a333e
9 changed files with 8 additions and 46 deletions

View File

@@ -6,7 +6,7 @@ from typing import List
from collections import defaultdict
from ..archivers import Archiver
from ..feeders import Feeder
from ..feeders import Feeder, CLIFeeder
from ..databases import Database
from ..formatters import Formatter
from ..storages import Storage
@@ -16,8 +16,6 @@ from ..enrichers import Enricher
@dataclass
class Config:
# TODO: should Config inherit from Step so it can have its own configurations?
# these are only detected if they are put to the respective __init__.py
configurable_parents = [
Feeder,
Enricher,
@@ -27,18 +25,17 @@ class Config:
Formatter
# Util
]
feeder: Step # TODO:= BaseFeeder
feeder: Feeder
formatter: Formatter
archivers: List[Archiver] = field(default_factory=[]) # TODO: fix type
archivers: List[Archiver] = field(default_factory=[])
enrichers: List[Enricher] = field(default_factory=[])
storages: List[Step] = field(default_factory=[]) # TODO: fix type
storages: List[Storage] = field(default_factory=[])
databases: List[Database] = field(default_factory=[])
def __init__(self) -> None:
self.defaults = {}
self.cli_ops = {}
self.config = {}
# TODO: make this work for nested props like gsheet_feeder.columns.url = "URL"
def parse(self, use_cli=True, yaml_config_filename: str = None):
"""
@@ -49,7 +46,7 @@ class Config:
if use_cli:
parser = argparse.ArgumentParser(
# prog = "auto-archiver",
description="Auto Archiver is a ...!", # TODO: update
description="Auto Archiver is a CLI tool to archive media/metadata from online URLs; it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!",
epilog="Check the code at https://github.com/bellingcat/auto-archiver"
)
@@ -63,7 +60,7 @@ class Config:
assert "." not in child.name, f"class prop name cannot contain dots('.'): {child.name}"
assert "." not in config, f"config property cannot contain dots('.'): {config}"
config_path = f"{child.name}.{config}"
if use_cli:
try:
parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=f"{details['help']} (defaults to {details['default']})", choices=details.get("choices", None))

View File

@@ -22,9 +22,6 @@ class Metadata:
final_media: Media = None # can be overwritten by formatters
rearchivable: bool = False
# def __init__(self, url, metadata = {}) -> None:
# self.set_url(url)
# self.metadata = metadata
def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
"""
@@ -134,16 +131,11 @@ class Metadata:
return self
def get_single_media(self) -> Media:
# TODO: could be refactored to use a custom media.id
# TODO: could be refactored to use a custom media.id or metadata
if self.final_media:
return self.final_media
return self.media[0]
# def as_json(self) -> str:
# # converts all metadata and data into JSON
# return json.dumps(self.metadata)
# #TODO: datetime is not serializable
def get_clean_metadata(self) -> Metadata:
return dict(
{k: v for k, v in self.metadata.items() if k not in self.tmp_keys},

View File

@@ -132,7 +132,6 @@ class ArchivingOrchestrator:
# a.download(result) # TODO: refactor so there's no merge here
logger.info(f"Trying archiver {a.name}")
result.merge(a.download(result))
# TODO: fix logic to halt when done
if result.is_success(): break
# what if an archiver returns multiple entries and one is to be part of HTMLgenerator?