mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 21:28:29 +03:00
further cleanup
This commit is contained in:
@@ -6,7 +6,7 @@ from typing import List
|
||||
from collections import defaultdict
|
||||
|
||||
from ..archivers import Archiver
|
||||
from ..feeders import Feeder
|
||||
from ..feeders import Feeder, CLIFeeder
|
||||
from ..databases import Database
|
||||
from ..formatters import Formatter
|
||||
from ..storages import Storage
|
||||
@@ -16,8 +16,6 @@ from ..enrichers import Enricher
|
||||
|
||||
@dataclass
|
||||
class Config:
|
||||
# TODO: should Config inherit from Step so it can have its own configurations?
|
||||
# these are only detected if they are put in the respective __init__.py
|
||||
configurable_parents = [
|
||||
Feeder,
|
||||
Enricher,
|
||||
@@ -27,18 +25,17 @@ class Config:
|
||||
Formatter
|
||||
# Util
|
||||
]
|
||||
feeder: Step # TODO:= BaseFeeder
|
||||
feeder: Feeder
|
||||
formatter: Formatter
|
||||
archivers: List[Archiver] = field(default_factory=[]) # TODO: fix type
|
||||
archivers: List[Archiver] = field(default_factory=[])
|
||||
enrichers: List[Enricher] = field(default_factory=[])
|
||||
storages: List[Step] = field(default_factory=[]) # TODO: fix type
|
||||
storages: List[Storage] = field(default_factory=[])
|
||||
databases: List[Database] = field(default_factory=[])
|
||||
|
||||
def __init__(self) -> None:
|
||||
self.defaults = {}
|
||||
self.cli_ops = {}
|
||||
self.config = {}
|
||||
# TODO: make this work for nested props like gsheet_feeder.columns.url = "URL"
|
||||
|
||||
def parse(self, use_cli=True, yaml_config_filename: str = None):
|
||||
"""
|
||||
@@ -49,7 +46,7 @@ class Config:
|
||||
if use_cli:
|
||||
parser = argparse.ArgumentParser(
|
||||
# prog = "auto-archiver",
|
||||
description="Auto Archiver is a ...!", # TODO: update
|
||||
description="Auto Archiver is a CLI tool to archive media/metadata from online URLs; it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!",
|
||||
epilog="Check the code at https://github.com/bellingcat/auto-archiver"
|
||||
)
|
||||
|
||||
@@ -63,7 +60,7 @@ class Config:
|
||||
assert "." not in child.name, f"class prop name cannot contain dots('.'): {child.name}"
|
||||
assert "." not in config, f"config property cannot contain dots('.'): {config}"
|
||||
config_path = f"{child.name}.{config}"
|
||||
|
||||
|
||||
if use_cli:
|
||||
try:
|
||||
parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=f"{details['help']} (defaults to {details['default']})", choices=details.get("choices", None))
|
||||
|
||||
@@ -22,9 +22,6 @@ class Metadata:
|
||||
final_media: Media = None # can be overwritten by formatters
|
||||
rearchivable: bool = False
|
||||
|
||||
# def __init__(self, url, metadata = {}) -> None:
|
||||
# self.set_url(url)
|
||||
# self.metadata = metadata
|
||||
|
||||
def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
|
||||
"""
|
||||
@@ -134,16 +131,11 @@ class Metadata:
|
||||
return self
|
||||
|
||||
def get_single_media(self) -> Media:
|
||||
# TODO: could be refactored to use a custom media.id
|
||||
# TODO: could be refactored to use a custom media.id or metadata
|
||||
if self.final_media:
|
||||
return self.final_media
|
||||
return self.media[0]
|
||||
|
||||
# def as_json(self) -> str:
|
||||
# # converts all metadata and data into JSON
|
||||
# return json.dumps(self.metadata)
|
||||
# #TODO: datetime is not serializable
|
||||
|
||||
def get_clean_metadata(self) -> Metadata:
|
||||
return dict(
|
||||
{k: v for k, v in self.metadata.items() if k not in self.tmp_keys},
|
||||
|
||||
@@ -132,7 +132,6 @@ class ArchivingOrchestrator:
|
||||
# a.download(result) # TODO: refactor so there's no merge here
|
||||
logger.info(f"Trying archiver {a.name}")
|
||||
result.merge(a.download(result))
|
||||
# TODO: fix logic to halt when done
|
||||
if result.is_success(): break
|
||||
|
||||
# what if an archiver returns multiple entries and one is to be part of HTMLgenerator?
|
||||
|
||||
Reference in New Issue
Block a user