mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-11 20:58:29 +03:00
WIP refactor logic
This commit is contained in:
80
src/configs/v2config.py
Normal file
80
src/configs/v2config.py
Normal file
@@ -0,0 +1,80 @@
|
||||
|
||||
|
||||
import argparse, yaml
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List
|
||||
from step import Step
|
||||
from utils import Util
|
||||
from enrichers import Enricher
|
||||
from collections import defaultdict
|
||||
|
||||
|
||||
@dataclass
class ConfigV2:
    """Assemble the run configuration from CLI flags, a YAML file and step defaults.

    Every direct subclass of the classes listed in ``configurable_parents``
    contributes the options it declares through ``configs()``; each option is
    exposed as a ``--<step name>.<option>`` command-line flag.
    """
    # TODO: should Config inherit from Step so it can have its own configurations?
    configurable_parents = [
        Enricher,
        Util
    ]
    feeder: Step  # TODO:= BaseFeeder
    # default_factory must be a zero-argument callable (``list``), not a list instance
    archivers: List[Step] = field(default_factory=list)  # TODO: fix type
    enrichers: List[Enricher] = field(default_factory=list)
    formatters: List[Step] = field(default_factory=list)  # TODO: fix type
    storages: List[Step] = field(default_factory=list)  # TODO: fix type
    databases: List[Step] = field(default_factory=list)  # TODO: fix type

    def __init__(self) -> None:
        # "<step name>.<option>" -> default value; filled in by parse()
        self.defaults = {}
        # step name -> {option: resolved value}; filled in by parse()
        self.config = {}

    def parse(self):
        """Resolve every option and instantiate the configured enrichers.

        Values are chosen with priority: CLI >> config.yaml >> default.
        The result is stored in ``self.config`` as ``{step name: {option: value}}``
        and the enrichers named under ``steps.enrichers`` in the YAML file are
        stored in ``self.enrichers``.

        Raises:
            AssertionError: if a step or option name contains a dot, or if the
                YAML ``steps`` section lacks ``archivers`` or ``storages``.
        """
        # 1. parse CLI values
        parser = argparse.ArgumentParser(
            # prog = "auto-archiver",
            description="Auto Archiver is a ...!",
            epilog="Check the code at https://github.com/bellingcat/auto-archiver"
        )

        parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='config.yaml')

        for configurable in self.configurable_parents:
            child: Step
            for child in configurable.__subclasses__():
                # a step that declares no options may hand back None instead of {}
                for config, details in (child.configs() or {}).items():
                    # the dot is the separator between step name and option name
                    assert "." not in child.name, f"class prop name cannot contain dots('.'): {child.name}"
                    assert "." not in config, f"config property cannot contain dots('.'): {config}"
                    config_path = f"{child.name}.{config}"
                    # NOTE(review): no ``type=`` is given, so CLI values arrive as str
                    # even when the declared default is not a str - confirm this is intended
                    parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=details['help'])
                    self.defaults[config_path] = details["default"]

        args = parser.parse_args()

        # 2. read YAML config file
        with open(args.config, "r", encoding="utf-8") as inf:
            # safe_load returns None for an empty document
            self.yaml_config = yaml.safe_load(inf) or {}

        # 3. CONFIGS: decide value with priority: CLI >> config.yaml >> default
        # ``or {}`` also covers keys that are present in the YAML but left empty
        configurations = self.yaml_config.get("configurations") or {}
        self.config = defaultdict(dict)
        for config_path, default in self.defaults.items():
            child, config = config_path.split(".", 1)
            val = getattr(args, config_path)
            if val is None:
                val = (configurations.get(child) or {}).get(config, default)
            self.config[child][config] = val
        self.config = dict(self.config)

        # 4. STEPS: read steps and validate they exist
        steps = self.yaml_config.get("steps") or {}
        assert "archivers" in steps, "your configuration steps are missing the archivers property"
        assert "storages" in steps, "your configuration steps are missing the storages property"

        print(self.config)

        # self.feeder = Feeder.init
        # build every listed enricher; an absent or empty list yields no enrichers
        self.enrichers = [Enricher.init(e, self.config) for e in (steps.get("enrichers") or [])]

        print(self.enrichers)

    def validate(self):
        """Placeholder: no validation is performed yet."""
        pass
|
||||
2
src/enrichers/__init__.py
Normal file
2
src/enrichers/__init__.py
Normal file
@@ -0,0 +1,2 @@
|
||||
from .enricher import Enricher
|
||||
from .enricher_screenshot import ScreenshotEnricher
|
||||
20
src/enrichers/enricher.py
Normal file
20
src/enrichers/enricher.py
Normal file
@@ -0,0 +1,20 @@
|
||||
from __future__ import annotations
|
||||
from dataclasses import dataclass
|
||||
from abc import abstractmethod, ABC
|
||||
from metadata import Metadata
|
||||
from step import Step
|
||||
|
||||
@dataclass
class Enricher(Step, ABC):
    """Abstract base class for steps that enrich a ``Metadata`` item.

    Concrete enrichers override ``name`` and implement ``enrich``.
    """
    name = "enricher"

    def __init__(self, config: dict) -> None:
        # Step.__init__ requires the configuration mapping; forward it
        super().__init__(config)

    # only for typing...
    @staticmethod
    def init(name: str, config: dict) -> Enricher:
        """Return the Enricher subclass instance registered under ``name``.

        Delegates to ``Step.init`` with ``Enricher`` as the parent whose
        subclasses are searched.
        """
        return Step.init(name, config, Enricher)

    @abstractmethod
    def enrich(self, item: Metadata) -> Metadata:
        """Enrich ``item``; must be implemented by concrete enrichers."""
        pass
|
||||
53
src/enrichers/enricher_screenshot.py
Normal file
53
src/enrichers/enricher_screenshot.py
Normal file
@@ -0,0 +1,53 @@
|
||||
from . import Enricher
|
||||
from metadata import Metadata
|
||||
from loguru import logger
|
||||
|
||||
|
||||
class ScreenshotEnricher(Enricher):
    """Enricher registered under the name "screenshot" (work in progress)."""
    name = "screenshot"

    @staticmethod
    def configs() -> dict:
        """Declare the configurable options with their default and help text."""
        options = {}
        options["width"] = {"default": 1280, "help": "width of the screenshots"}
        options["height"] = {"default": 720, "help": "height of the screenshots"}
        return options

    def enrich(self, item: Metadata) -> Metadata:
        """Look up the item's URL and emit a debug print.

        The screenshot logic itself is not implemented yet, so nothing is
        returned; the previous implementation is kept below for reference.
        """
        # the lookup is kept because get_url() validates the URL even though
        # the value is not used yet
        target_url = self.get_url(item)
        print("enrich")
        # driver = config.webdriver
        # with driver as Webdriver(): # TODO: make a util
        #     #TODO: take screenshot
        #     pass

        # logger.debug(f"getting screenshot for {url=}")
        # key = self._get_key_from_url(url, ".png", append_datetime=True)
        # filename = os.path.join(Storage.TMP_FOLDER, key)

        # # Accept cookies popup dismiss for ytdlp video
        # if 'facebook.com' in url:
        #     try:
        #         logger.debug(f'Trying fb click accept cookie popup for {url}')
        #         self.driver.get("http://www.facebook.com")
        #         foo = self.driver.find_element(By.XPATH, "//button[@data-cookiebanner='accept_only_essential_button']")
        #         foo.click()
        #         logger.debug(f'fb click worked')
        #         # linux server needs a sleep otherwise facebook cookie won't have worked and we'll get a popup on next page
        #         time.sleep(2)
        #     except:
        #         logger.warning(f'Failed on fb accept cookies for url {url}')

        # try:
        #     self.driver.get(url)
        #     time.sleep(6)
        # except TimeoutException:
        #     logger.info("TimeoutException loading page for screenshot")

        # self.driver.save_screenshot(filename)
        # self.storage.upload(filename, key, extra_args={'ACL': 'public-read', 'ContentType': 'image/png'})

        # cdn_url = self.storage.get_cdn_url(key)
        # self.add_to_media(cdn_url, key)

        # return cdn_url
|
||||
30
src/metadata.py
Normal file
30
src/metadata.py
Normal file
@@ -0,0 +1,30 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Union, Dict
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
class Metadata:
    """Container for the information gathered about an item.

    Does not handle files, only primitives; the only pieces of logic that
    handle files are the archiver, enricher, and storage.
    """
    status: str
    # title: str
    # url: str
    # hash: str
    metadata: Dict[str, Metadata]

    @staticmethod
    def merge(left: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
        """Not implemented yet: should return a merged version of both Metadata.

        Intended to work for archived() and enriched(). When both sides hold
        the same key only one value can remain; ``overwrite_left`` decides.
        """
        return None

    def get(self, key: str) -> Union[Metadata, str]:
        """Not implemented yet: should go through metadata and return the entry for ``key``."""
        return None

    def as_json(self) -> str:
        """Not implemented yet: should convert all metadata and data into JSON."""
        return None
|
||||
@@ -1,5 +1,5 @@
|
||||
from typing import Union, Dict
|
||||
from __future__ import annotations
|
||||
from typing import Union, Dict
|
||||
from dataclasses import dataclass
|
||||
|
||||
"""
|
||||
@@ -39,31 +39,31 @@ Cisticola considerations:
|
||||
2. So the auto-archiver becomes like a puzzle and fixes to Cisticola scrapers can immediately benefit it, and contributions are focused on a single source or scraping
|
||||
"""
|
||||
|
||||
@dataclass
|
||||
class Metadata:
|
||||
# does not handle files, only primitives
|
||||
# the only piece of logic to handle files is the archiver, enricher, and storage
|
||||
status: str
|
||||
# title: str
|
||||
# url: str
|
||||
# hash: str
|
||||
main_file: Metadata
|
||||
metadata: Dict[str, Metadata]
|
||||
# @dataclass
|
||||
# class Metadata:
|
||||
# # does not handle files, only primitives
|
||||
# # the only piece of logic to handle files is the archiver, enricher, and storage
|
||||
# status: str
|
||||
# # title: str
|
||||
# # url: str
|
||||
# # hash: str
|
||||
# main_file: Metadata
|
||||
# metadata: Dict[str, Metadata]
|
||||
|
||||
@staticmethod
|
||||
def merge(left, right : Metadata, overwrite_left=True) -> Metadata:
|
||||
# should return a merged version of the Metadata
|
||||
# will work for archived() and enriched()
|
||||
# what if 2 metadatas contain the same keys? only one can remain! : overwrite_left
|
||||
pass
|
||||
# @staticmethod
|
||||
# def merge(left, right : Metadata, overwrite_left=True) -> Metadata:
|
||||
# # should return a merged version of the Metadata
|
||||
# # will work for archived() and enriched()
|
||||
# # what if 2 metadatas contain the same keys? only one can remain! : overwrite_left
|
||||
# pass
|
||||
|
||||
def get(self, key) -> Union[Metadata, str]:
|
||||
# goes through metadata and returns the Metadata available
|
||||
pass
|
||||
# def get(self, key) -> Union[Metadata, str]:
|
||||
# # goes through metadata and returns the Metadata available
|
||||
# pass
|
||||
|
||||
def as_json(self) -> str:
|
||||
# converts all metadata and data into JSON
|
||||
pass
|
||||
# def as_json(self) -> str:
|
||||
# # converts all metadata and data into JSON
|
||||
# pass
|
||||
|
||||
|
||||
"""
|
||||
@@ -116,27 +116,27 @@ class ArchivingOrchestrator:
|
||||
# where does that update/processing happen? in config.py
|
||||
# reflection for Archiver to know which child classes it has? use Archiver.__subclasses__
|
||||
self.archivers = [
|
||||
Archiver.init(a, config.get(a))
|
||||
Archiver.init(a, config)
|
||||
for a in config.archivers
|
||||
]
|
||||
|
||||
self.enrichments = [
|
||||
Enrichment.init(e, config.get(e))
|
||||
for e in config.enrichments
|
||||
self.enrichers = [
|
||||
Enricher.init(e, config)
|
||||
for e in config.enrichers
|
||||
]
|
||||
|
||||
self.formatters = [
|
||||
Formatter.init(f, config.get(f))
|
||||
Formatter.init(f, config)
|
||||
for f in config.formatters
|
||||
]
|
||||
|
||||
self.storages = [
|
||||
Storage.init(s, config.get(s))
|
||||
Storage.init(s, config)
|
||||
for s in config.storages
|
||||
]
|
||||
|
||||
self.databases = [
|
||||
Database.init(f, config.get(f))
|
||||
Database.init(f, config)
|
||||
for f in config.formatters
|
||||
]
|
||||
|
||||
@@ -192,11 +192,11 @@ class ArchivingOrchestrator:
|
||||
|
||||
# what if an archiver returns multiple entries and one is to be part of HTMLgenerator?
|
||||
# should it call the HTMLgenerator as if it's not an enrichment?
|
||||
# eg: if it is enable: generates an HTML with all the returned media, should it include enrichments? yes
|
||||
# eg: if it is enable: generates an HTML with all the returned media, should it include enrichers? yes
|
||||
# then how to execute it last? should there also be post-processors? are there other examples?
|
||||
# maybe as a PDF? or a Markdown file
|
||||
# side captures: screenshot, wacz, webarchive, thumbnails, HTMLgenerator
|
||||
for e in enrichments:
|
||||
for e in enrichers:
|
||||
result.update(e.enrich(result))
|
||||
|
||||
# formatters, enrichers, and storages will sometimes look for specific properties: eg <li>Screenshot: <img src="{res.get("screenshot")}"> </li>
|
||||
|
||||
30
src/step.py
Normal file
30
src/step.py
Normal file
@@ -0,0 +1,30 @@
|
||||
from __future__ import annotations
|
||||
from dataclasses import dataclass
|
||||
from typing import Type
|
||||
from metadata import Metadata
|
||||
from abc import ABC
|
||||
|
||||
|
||||
@dataclass
class Step(ABC):
    """Common base class for the configurable steps.

    Subclasses set the class attribute ``name``; it is the key used both to
    look a step up in ``init`` and to select its section of the configuration.
    """
    name: str = None

    def __init__(self, config: dict) -> None:
        """Keep only this step's section of ``config``.

        Args:
            config: mapping of step name -> {option: value}.
        """
        # read from the ``config`` argument (``self.config`` does not exist yet);
        # a step without an entry gets an empty section
        self.config = config.get(self.name, {})

    @staticmethod
    def configs() -> dict:
        """Return the options this step accepts; none by default.

        Subclasses override this to return
        ``{option: {"default": ..., "help": ...}}``.
        """
        return {}

    @staticmethod
    def init(name: str, config: dict, child: Type[Step]) -> Step:
        """Instantiate the subclass of ``child`` whose ``name`` equals ``name``.

        Only direct subclasses are searched: subclasses of ``child``'s
        subclasses cannot be found.

        Args:
            name: value of the ``name`` attribute of the wanted subclass.
            config: mapping passed to the subclass constructor.
            child: parent class whose direct subclasses are searched.

        Raises:
            ValueError: if no direct subclass of ``child`` has that name.
        """
        for sub in child.__subclasses__():
            if sub.name == name:
                # call the class so a new instance is created and returned
                return sub(config)
        raise ValueError(f"Unable to initialize class with {name=}")

    def get_url(self, item: Metadata) -> str:
        """Return the non-empty string stored under "url" in ``item``.

        Raises:
            AssertionError: if the value is not a non-empty str.
        """
        url = item.get("url")
        assert isinstance(url, str) and len(url) > 0
        return url
|
||||
@@ -1,3 +1,4 @@
|
||||
# we need to explicitly expose the available imports here
|
||||
from .gworksheet import *
|
||||
from .misc import *
|
||||
from .misc import *
|
||||
from .util import Util
|
||||
20
src/utils/util.py
Normal file
20
src/utils/util.py
Normal file
@@ -0,0 +1,20 @@
|
||||
from __future__ import annotations
|
||||
from dataclasses import dataclass
|
||||
from abc import abstractmethod, ABC
|
||||
from metadata import Metadata
|
||||
from step import Step
|
||||
|
||||
@dataclass
class Util(Step, ABC):
    """Abstract base class for utility steps.

    Concrete utilities override ``name``.
    """
    name = "util"

    def __init__(self, config: dict) -> None:
        # Step.__init__ requires the configuration mapping; forward it
        super().__init__(config)

    # only for typing...
    @staticmethod
    def init(name: str, config: dict) -> Util:
        """Return the Util subclass instance registered under ``name``.

        Delegates to ``Step.init`` with ``Util`` as the parent whose
        subclasses are searched.
        """
        # zero-argument super() is unavailable here (there is no self/cls),
        # so the base class is named explicitly
        return Step.init(name, config, Util)

    # NOTE(review): this abstract method has the same signature as
    # Enricher.enrich - confirm utilities are really meant to implement it
    @abstractmethod
    def enrich(self, item: Metadata) -> Metadata:
        """Must be implemented by concrete utilities."""
        pass
|
||||
Reference in New Issue
Block a user