WIP refactor logic

This commit is contained in:
msramalho
2022-11-15 15:00:52 +00:00
parent 6a0ce5ced1
commit 65dd155c90
12 changed files with 320 additions and 39 deletions

80
src/configs/v2config.py Normal file
View File

@@ -0,0 +1,80 @@
import argparse, yaml
from dataclasses import dataclass, field
from typing import List
from step import Step
from utils import Util
from enrichers import Enricher
from collections import defaultdict
@dataclass
class ConfigV2:
    """Resolve the archiver configuration from CLI flags, a YAML file and step defaults.

    Every direct subclass of the classes listed in ``configurable_parents``
    may declare options through ``configs()``; each option is exposed as a
    ``--<step name>.<option>`` command line flag. ``parse()`` resolves every
    option with the priority CLI >> YAML file >> declared default.
    """
    # TODO: should Config inherit from Step so it can have its own configurations?
    # Unannotated, so @dataclass treats it as a plain class attribute, not a field.
    configurable_parents = [
        Enricher,
        Util,
    ]
    feeder: Step  # TODO:= BaseFeeder
    # default_factory must be a callable (``list``), not a list instance.
    archivers: List[Step] = field(default_factory=list)  # TODO: fix type
    enrichers: List[Enricher] = field(default_factory=list)
    formatters: List[Step] = field(default_factory=list)  # TODO: fix type
    storages: List[Step] = field(default_factory=list)  # TODO: fix type
    databases: List[Step] = field(default_factory=list)  # TODO: fix type

    def __init__(self) -> None:
        """Create an empty configuration; call ``parse()`` to populate it."""
        # @dataclass does not generate __init__ when the class defines one, so
        # the declared fields must be initialised here to exist on the instance.
        self.feeder = None
        self.archivers = []
        self.enrichers = []
        self.formatters = []
        self.storages = []
        self.databases = []
        # "<step name>.<option>" -> declared default value
        self.defaults = {}
        # step name -> {option: resolved value}
        self.config = {}
        # raw content of the YAML configuration file
        self.yaml_config = {}

    def parse(self):
        """Parse CLI arguments and the YAML file, then initialise the steps.

        Populates ``self.defaults``, ``self.yaml_config``, ``self.config`` and
        ``self.enrichers``.

        NOTE: values given on the command line are kept as the strings
        argparse produces; they are not converted to the type of the default.

        Raises:
            AssertionError: if a step or option name contains a dot, or if the
                YAML ``steps`` section lacks ``archivers`` or ``storages``.
            OSError: if the YAML configuration file cannot be opened.
        """
        # 1. parse CLI values
        parser = argparse.ArgumentParser(
            # prog = "auto-archiver",
            description="Auto Archiver is a ...!",
            epilog="Check the code at https://github.com/bellingcat/auto-archiver"
        )
        parser.add_argument('--config', action='store', dest='config', help="the filename of the YAML configuration file (defaults to 'config.yaml')", default='config.yaml')

        for configurable in self.configurable_parents:
            child: Step
            for child in configurable.__subclasses__():
                # configs() may yield nothing for steps without options
                for config, details in (child.configs() or {}).items():
                    # the dot is the separator between step name and option name
                    assert "." not in child.name, f"class prop name cannot contain dots('.'): {child.name}"
                    assert "." not in config, f"config property cannot contain dots('.'): {config}"
                    config_path = f"{child.name}.{config}"
                    parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=details['help'])
                    self.defaults[config_path] = details["default"]

        args = parser.parse_args()

        # 2. read YAML config file
        with open(args.config, "r", encoding="utf-8") as inf:
            # safe_load returns None for an empty document
            self.yaml_config = yaml.safe_load(inf) or {}

        # 3. CONFIGS: decide value with priority: CLI >> config.yaml >> default
        yaml_configurations = self.yaml_config.get("configurations") or {}
        self.config = defaultdict(dict)
        for config_path, default in self.defaults.items():
            child, config = config_path.split(".")
            # dest contains a dot, so the value can only be read with getattr
            val = getattr(args, config_path)
            if val is None:
                val = (yaml_configurations.get(child) or {}).get(config, default)
            self.config[child][config] = val
        self.config = dict(self.config)

        # 4. STEPS: read steps and validate they exist
        steps = self.yaml_config.get("steps") or {}
        assert "archivers" in steps, "your configuration steps are missing the archivers property"
        assert "storages" in steps, "your configuration steps are missing the storages property"

        print(self.config)

        # self.feeder = Feeder.init
        # initialise every listed enricher; an absent or empty list yields []
        self.enrichers = [
            Enricher.init(enricher_name, self.config)
            for enricher_name in (steps.get("enrichers") or [])
        ]

        print(self.enrichers)

    def validate(self):
        """Placeholder for configuration validation; currently does nothing."""

View File

@@ -0,0 +1,2 @@
from .enricher import Enricher
from .enricher_screenshot import ScreenshotEnricher

20
src/enrichers/enricher.py Normal file
View File

@@ -0,0 +1,20 @@
from __future__ import annotations
from dataclasses import dataclass
from abc import abstractmethod, ABC
from metadata import Metadata
from step import Step
@dataclass
class Enricher(Step, ABC):
    """Abstract base class for enricher steps; subclasses implement ``enrich``."""
    name = "enricher"

    def __init__(self, config: dict) -> None:
        """Initialise the step with the configuration dict.

        Args:
            config: configuration dict, forwarded unchanged to ``Step.__init__``.
        """
        # Step.__init__ takes the configuration as a required argument.
        Step.__init__(self, config)

    # only for typing...
    @staticmethod
    def init(name: str, config: dict) -> Enricher:
        """Instantiate the Enricher subclass whose ``name`` equals *name*.

        Delegates to ``Step.init`` with ``Enricher`` as the parent class.
        Declared static because it takes no instance or class argument.
        """
        return Step.init(name, config, Enricher)

    @abstractmethod
    def enrich(self, item: Metadata) -> Metadata:
        """Enrich *item*; must be implemented by subclasses."""

View File

@@ -0,0 +1,53 @@
from . import Enricher
from metadata import Metadata
from loguru import logger
class ScreenshotEnricher(Enricher):
    """Enricher registered under the name ``screenshot`` (work in progress)."""
    name = "screenshot"

    @staticmethod
    def configs() -> dict:
        """Return the configurable options of this step with default and help text."""
        return {
            "width": {"default": 1280, "help": "width of the screenshots"},
            "height": {"default": 720, "help": "height of the screenshots"},
        }

    def enrich(self, item: Metadata) -> Metadata:
        """Return *item*; the screenshot logic itself is not ported yet.

        The URL is still read through ``get_url`` so an item without a valid
        URL is rejected. The item is returned unchanged so the declared
        ``Metadata`` return type holds instead of an implicit ``None``.
        """
        url = self.get_url(item)
        logger.debug(f"screenshot enricher called for {url=}")

        # TODO: port the previous implementation, kept below for reference.
        # driver = config.webdriver
        # with driver as Webdriver(): # TODO: make a util
        #     #TODO: take screenshot
        #     pass

        # logger.debug(f"getting screenshot for {url=}")
        # key = self._get_key_from_url(url, ".png", append_datetime=True)
        # filename = os.path.join(Storage.TMP_FOLDER, key)

        # # Accept cookies popup dismiss for ytdlp video
        # if 'facebook.com' in url:
        #     try:
        #         logger.debug(f'Trying fb click accept cookie popup for {url}')
        #         self.driver.get("http://www.facebook.com")
        #         foo = self.driver.find_element(By.XPATH, "//button[@data-cookiebanner='accept_only_essential_button']")
        #         foo.click()
        #         logger.debug(f'fb click worked')
        #         # linux server needs a sleep otherwise facebook cookie won't have worked and we'll get a popup on next page
        #         time.sleep(2)
        #     except:
        #         logger.warning(f'Failed on fb accept cookies for url {url}')

        # try:
        #     self.driver.get(url)
        #     time.sleep(6)
        # except TimeoutException:
        #     logger.info("TimeoutException loading page for screenshot")

        # self.driver.save_screenshot(filename)
        # self.storage.upload(filename, key, extra_args={'ACL': 'public-read', 'ContentType': 'image/png'})

        # cdn_url = self.storage.get_cdn_url(key)
        # self.add_to_media(cdn_url, key)

        # return cdn_url
        return item

30
src/metadata.py Normal file
View File

@@ -0,0 +1,30 @@
from __future__ import annotations
from typing import Union, Dict
from dataclasses import dataclass
@dataclass
class Metadata:
    """Container for a status string and a mapping of nested Metadata values.

    Does not handle files, only primitives; the only pieces of logic that
    handle files are the archiver, enricher, and storage.
    """
    status: str
    # title: str
    # url: str
    # hash: str
    metadata: Dict[str, Metadata]

    @staticmethod
    def merge(left: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
        """Stub: should return a merged version of *left* and *right*.

        Intended to work for archived() and enriched(). When both sides
        contain the same key only one value can remain, which is what
        *overwrite_left* is meant to decide. Not implemented: returns None.
        """

    def get(self, key: str) -> Union[Metadata, str]:
        """Stub: should go through metadata and return the entry for *key*.

        Not implemented: returns None.
        """

    def as_json(self) -> str:
        """Stub: should convert all metadata and data into JSON.

        Not implemented: returns None.
        """

View File

@@ -1,5 +1,5 @@
from typing import Union, Dict
from __future__ import annotations
from typing import Union, Dict
from dataclasses import dataclass
"""
@@ -39,31 +39,31 @@ Cisticola considerations:
2. So the auto-archiver becomes like a puzzle and fixes to Cisticola scrapers can immediately benefit it, and contributions are focused on a single source or scraping
"""
@dataclass
class Metadata:
# does not handle files, only primitives
# the only piece of logic to handle files is the archiver, enricher, and storage
status: str
# title: str
# url: str
# hash: str
main_file: Metadata
metadata: Dict[str, Metadata]
# @dataclass
# class Metadata:
# # does not handle files, only primitives
# # the only piece of logic to handle files is the archiver, enricher, and storage
# status: str
# # title: str
# # url: str
# # hash: str
# main_file: Metadata
# metadata: Dict[str, Metadata]
@staticmethod
def merge(left, right : Metadata, overwrite_left=True) -> Metadata:
# should return a merged version of the Metadata
# will work for archived() and enriched()
# what if 2 metadatas contain the same keys? only one can remain! : overwrite_left
pass
# @staticmethod
# def merge(left, right : Metadata, overwrite_left=True) -> Metadata:
# # should return a merged version of the Metadata
# # will work for archived() and enriched()
# # what if 2 metadatas contain the same keys? only one can remain! : overwrite_left
# pass
def get(self, key) -> Union[Metadata, str]:
# goes through metadata and returns the Metadata available
pass
# def get(self, key) -> Union[Metadata, str]:
# # goes through metadata and returns the Metadata available
# pass
def as_json(self) -> str:
# converts all metadata and data into JSON
pass
# def as_json(self) -> str:
# # converts all metadata and data into JSON
# pass
"""
@@ -116,27 +116,27 @@ class ArchivingOrchestrator:
# where does that update/processing happen? in config.py
# reflection for Archiver to know which child classes it has? use Archiver.__subclasses__
self.archivers = [
Archiver.init(a, config.get(a))
Archiver.init(a, config)
for a in config.archivers
]
self.enrichments = [
Enrichment.init(e, config.get(e))
for e in config.enrichments
self.enrichers = [
Enricher.init(e, config)
for e in config.enrichers
]
self.formatters = [
Formatter.init(f, config.get(f))
Formatter.init(f, config)
for f in config.formatters
]
self.storages = [
Storage.init(s, config.get(s))
Storage.init(s, config)
for s in config.storages
]
self.databases = [
Database.init(f, config.get(f))
Database.init(f, config)
for f in config.formatters
]
@@ -192,11 +192,11 @@ class ArchivingOrchestrator:
# what if an archiver returns multiple entries and one is to be part of HTMLgenerator?
# should it call the HTMLgenerator as if it's not an enrichment?
# eg: if it is enable: generates an HTML with all the returned media, should it include enrichments? yes
# eg: if it is enabled: generates an HTML with all the returned media, should it include enrichers? yes
# then how to execute it last? should there also be post-processors? are there other examples?
# maybe as a PDF? or a Markdown file
# side captures: screenshot, wacz, webarchive, thumbnails, HTMLgenerator
for e in enrichments:
for e in enrichers:
result.update(e.enrich(result))
# formatters, enrichers, and storages will sometimes look for specific properties: eg <li>Screenshot: <img src="{res.get("screenshot")}"> </li>

30
src/step.py Normal file
View File

@@ -0,0 +1,30 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import Type
from metadata import Metadata
from abc import ABC
@dataclass
class Step(ABC):
    """Base class for the configurable steps of the archiving pipeline.

    Each concrete step sets a class-level ``name``; that name is used both to
    look the step up in ``init`` and to select its section of the
    configuration dict.
    """
    name: str = None

    def __init__(self, config: dict) -> None:
        """Store this step's own section of the configuration.

        Args:
            config: mapping of step name to that step's option dict.
        """
        # A step that declares no options may have no section at all, so a
        # missing key falls back to an empty dict.
        self.config = config.get(self.name, {})

    @staticmethod
    def configs() -> dict:
        """Return the options this step accepts; none by default.

        Subclasses override this to return
        ``{option: {"default": ..., "help": ...}}``.
        """
        return {}

    @staticmethod
    def init(name: str, config: dict, child: Type[Step]) -> Step:
        """Instantiate the direct subclass of *child* whose ``name`` is *name*.

        Only ``child.__subclasses__()`` is searched, so subclasses of
        subclasses are not found.

        Args:
            name: value of the ``name`` attribute of the wanted subclass.
            config: configuration dict passed to the subclass constructor.
            child: parent class whose direct subclasses are searched.

        Raises:
            ValueError: if no direct subclass of *child* has that name.
        """
        for sub in child.__subclasses__():
            if sub.name == name:
                # call the class itself so an instance is created and returned
                return sub(config)
        raise ValueError(f"Unable to initialize class with {name=}")

    def get_url(self, item: Metadata) -> str:
        """Return the non-empty ``url`` string stored in *item*.

        Raises:
            AssertionError: if ``item.get("url")`` is not a non-empty string.
        """
        url = item.get("url")
        assert type(url) is str and len(url) > 0
        return url

View File

@@ -1,3 +1,4 @@
# we need to explicitly expose the available imports here
from .gworksheet import *
from .misc import *
from .misc import *
from .util import Util

20
src/utils/util.py Normal file
View File

@@ -0,0 +1,20 @@
from __future__ import annotations
from dataclasses import dataclass
from abc import abstractmethod, ABC
from metadata import Metadata
from step import Step
@dataclass
class Util(Step, ABC):
    """Abstract base class for utility steps."""
    name = "util"

    def __init__(self, config: dict) -> None:
        """Initialise the step with the configuration dict.

        Args:
            config: configuration dict, forwarded unchanged to ``Step.__init__``.
        """
        # Step.__init__ takes the configuration as a required argument.
        Step.__init__(self, config)

    # only for typing...
    @staticmethod
    def init(name: str, config: dict) -> Util:
        """Instantiate the Util subclass whose ``name`` equals *name*.

        Calls ``Step.init`` explicitly: zero-argument ``super()`` cannot be
        used here because the function receives no instance or class.
        """
        return Step.init(name, config, Util)

    @abstractmethod
    def enrich(self, item: Metadata) -> Metadata:
        """Abstract hook that subclasses must implement."""
        # NOTE(review): same signature as Enricher.enrich; confirm a Util is
        # really meant to expose an enrich() method.

9
src/v2.py Normal file
View File

@@ -0,0 +1,9 @@
from configs.v2config import ConfigV2
from orchestrator import ArchivingOrchestrator
# Entry point of the v2 flow: build the configuration and resolve it from the
# command line arguments and the YAML configuration file.
config = ConfigV2()
config.parse()
# The orchestrator is not wired in yet.
# orchestrator = ArchivingOrchestrator(config)