mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 13:18:28 +03:00
fixing imports
This commit is contained in:
@@ -11,8 +11,8 @@ from ..feeders import Feeder
|
||||
from ..databases import Database
|
||||
from ..formatters import Formatter
|
||||
from ..storages import Storage
|
||||
from . import Step
|
||||
from ..enrichers import Enricher
|
||||
from . import Step
|
||||
|
||||
|
||||
@dataclass
|
||||
|
||||
@@ -6,13 +6,11 @@ from dataclasses import dataclass, field
|
||||
from dataclasses_json import dataclass_json
|
||||
import datetime
|
||||
from urllib.parse import urlparse
|
||||
from loguru import logger
|
||||
from dateutil.parser import parse as parse_dt
|
||||
from .media import Media
|
||||
|
||||
|
||||
# annotation order matters
|
||||
|
||||
|
||||
@dataclass_json
|
||||
@dataclass
|
||||
class Metadata:
|
||||
@@ -72,6 +70,7 @@ class Metadata:
|
||||
|
||||
# custom getter/setters
|
||||
|
||||
|
||||
def set_url(self, url: str) -> Metadata:
|
||||
assert type(url) is str and len(url) > 0, "invalid URL"
|
||||
return self.set("url", url)
|
||||
|
||||
@@ -15,49 +15,11 @@ import tempfile, traceback
|
||||
from loguru import logger
|
||||
|
||||
|
||||
"""
|
||||
how not to couple the different pieces of logic
|
||||
due to the use of constants for the metadata keys?
|
||||
perhaps having methods on the Metadata level that can be used to fetch a limited number of
|
||||
keys, never using strings but rather methods?
|
||||
eg: m = Metadata()
|
||||
m.get("screenshot") vs m.get_all()
|
||||
m.get_url()
|
||||
m.get_hash()
|
||||
m.get_main_file().get_title()
|
||||
m.get_screenshot() # this method should only exist because of the Screenshot Enricher
|
||||
# maybe there is a way for Archivers and Enrichers and Storages to add their own methods
|
||||
# which still raises the question of how the database, e.g., knows they exist?
|
||||
# maybe there's a function to fetch them all, and each Database can register whatever they get
|
||||
# for eg the GoogleSheets will only register based on the available column names, it knows what it wants
|
||||
# and if it's there: great, otherwise business as usual.
|
||||
# and a MongoDatabase could register all data, for example.
|
||||
#
|
||||
How are Orchestrators created? from a configuration file?
|
||||
orchestrator = ArchivingOrchestrator(config)
|
||||
# Config contains 1 URL, or URLs, from the command line
|
||||
# OR a feeder which is described in the config file
|
||||
# config.get_feeder() # if called as docker run --url "http...." then it uses the default filter
|
||||
# if config.yaml says config
|
||||
orchestrator.start()
|
||||
|
||||
|
||||
Example applications:
|
||||
1. auto-archiver for GSheets
|
||||
2. archiver for URL: feeder is CLIFeeder(config.cli.urls="") # --urls="u1,u2"
|
||||
3. archiver backend for a UI that implements a REST API, the API calls CLI
|
||||
|
||||
Cisticola considerations:
|
||||
1. By isolating the archiving logic into "Archiving only pieces of logic" these could simply call cisticola.tiktok_scraper(user, pass)
|
||||
2. So the auto-archiver becomes like a puzzle and fixes to Cisticola scrapers can immediately benefit it, and contributions are focused on a single source or scraping
|
||||
"""
|
||||
|
||||
|
||||
class ArchivingOrchestrator:
|
||||
def __init__(self, config) -> None:
|
||||
self.feeder: Feeder = config.feeder
|
||||
self.formatter: Formatter = config.formatter
|
||||
self.enrichers = config.enrichers
|
||||
self.enrichers: List[Enricher] = config.enrichers
|
||||
self.archivers: List[Archiver] = config.archivers
|
||||
self.databases: List[Database] = config.databases
|
||||
self.storages: List[Storage] = config.storages
|
||||
@@ -124,7 +86,7 @@ class ArchivingOrchestrator:
|
||||
# 3 - call archivers until one succeeds
|
||||
for a in self.archivers:
|
||||
logger.info(f"Trying archiver {a.name}")
|
||||
try:
|
||||
try:
|
||||
# Q: should this be refactored so it's just a.download(result)?
|
||||
result.merge(a.download(result))
|
||||
if result.is_success(): break
|
||||
|
||||
@@ -2,7 +2,6 @@ from __future__ import annotations
|
||||
from dataclasses import dataclass, field
|
||||
from inspect import ClassFoundException
|
||||
from typing import Type
|
||||
from ..core import Metadata
|
||||
from abc import ABC
|
||||
# from collections.abc import Iterable
|
||||
|
||||
|
||||
Reference in New Issue
Block a user