WIP refactor logic

2026-06-11 20:58:29 +03:00 · 2022-11-15 15:00:52 +00:00
parent 6a0ce5ced1
commit 65dd155c90
12 changed files with 320 additions and 39 deletions
--- a/src/configs/v2config.py
+++ b/src/configs/v2config.py
@@ -0,0 +1,116 @@
+
+
+import argparse, yaml
+from dataclasses import dataclass, field
+from typing import List
+from step import Step
+from utils import Util
+from enrichers import Enricher
+from collections import defaultdict
+
+
+@dataclass
+class ConfigV2:
+    """Resolve the run configuration from CLI flags, a YAML file and defaults.
+
+    Every direct subclass of the classes in `configurable_parents` contributes
+    the options returned by its `configs()`; each one becomes a
+    `--<name>.<option>` CLI flag. `parse()` picks each value with the priority
+    CLI >> YAML `configurations` section >> declared default.
+    """
+    # TODO: should Config inherit from Step so it can have its own configurations?
+    # no annotation, so @dataclass treats this as a plain class attribute, not a field
+    configurable_parents = [
+        Enricher,
+        Util
+    ]
+    feeder : Step #TODO:= BaseFeeder
+    # default_factory has to be a callable (list), not a list instance ([])
+    archivers: List[Step] = field(default_factory=list) #TODO: fix type
+    enrichers: List[Enricher] = field(default_factory=list)
+    formatters: List[Step] = field(default_factory=list) #TODO: fix type
+    storages: List[Step] = field(default_factory=list) #TODO: fix type
+    databases: List[Step] = field(default_factory=list) #TODO: fix type
+
+    def __init__(self) -> None:
+        """Create an empty configuration; call `parse()` to fill it in."""
+        # @dataclass does not replace a hand-written __init__, so the declared
+        # fields are initialised here; otherwise reading them raises AttributeError
+        self.feeder = None
+        self.archivers = []
+        self.enrichers = []
+        self.formatters = []
+        self.storages = []
+        self.databases = []
+        self.defaults = {}
+        self.config = {}
+        self.yaml_config = {}
+
+    def parse(self):
+        """Fill `self.config` and `self.enrichers` from the CLI and the YAML file.
+
+        Parses the process arguments with argparse, loads the YAML file named
+        by `--config`, and requires the `steps` section of that file to list
+        both `archivers` and `storages`.
+
+        Raises:
+            AssertionError: if a step or option name contains a dot, or a
+                required `steps` entry is missing.
+            OSError: if the configuration file cannot be opened.
+        """
+        # 1. parse CLI values
+        parser = argparse.ArgumentParser(
+            # prog = "auto-archiver",
+            description="Auto Archiver is a ...!",
+            epilog="Check the code at https://github.com/bellingcat/auto-archiver"
+        )
+
+        parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='config.yaml')
+
+        for configurable in self.configurable_parents:
+            child: Step
+            for child in configurable.__subclasses__():
+                # `or {}`: tolerate a configs() that returns nothing
+                for config, details in (child.configs() or {}).items():
+                    assert "." not in child.name, f"class prop name cannot contain dots('.'): {child.name}"
+                    assert "." not in config, f"config property cannot contain dots('.'): {config}"
+                    config_path = f"{child.name}.{config}"
+                    # NOTE: argparse stores CLI values as str, whatever the default's type
+                    parser.add_argument(f'--{config_path}', action='store', dest=config_path, help=details['help'])
+                    self.defaults[config_path] = details["default"]
+
+        args = parser.parse_args()
+
+        # 2. read YAML config file
+        with open(args.config, "r", encoding="utf-8") as inf:
+            # safe_load returns None for an empty document; fall back to {}
+            self.yaml_config = yaml.safe_load(inf) or {}
+
+        # 3. CONFIGS: decide value with priority: CLI >> config.yaml >> default
+        yaml_values = self.yaml_config.get("configurations") or {}
+        resolved = defaultdict(dict)
+        for config_path, default in self.defaults.items():
+            # both parts are dot-free (asserted above), so this yields exactly two items
+            child, config = config_path.split(".")
+            val = getattr(args, config_path)
+            if val is None:
+                val = (yaml_values.get(child) or {}).get(config, default)
+            resolved[child][config] = val
+        self.config = dict(resolved)
+
+        # 4. STEPS: read steps and validate they exist
+        steps = self.yaml_config.get("steps") or {}
+        assert "archivers" in steps, "your configuration steps are missing the archivers property"
+        assert "storages" in steps, "your configuration steps are missing the storages property"
+
+        print(self.config)
+
+        # self.feeder = Feeder.init
+        # build every listed enricher; a missing or empty list yields []
+        self.enrichers = [Enricher.init(e, self.config) for e in steps.get("enrichers") or []]
+
+        print(self.enrichers)
+
+    def validate(self):
+        """Placeholder for configuration validation; does nothing yet."""
+        pass
--- a/src/enrichers/init.py
+++ b/src/enrichers/init.py
@@ -0,0 +1,2 @@
+from .enricher import Enricher
+from .enricher_screenshot import ScreenshotEnricher 
--- a/src/enrichers/enricher.py
+++ b/src/enrichers/enricher.py
@@ -0,0 +1,32 @@
+from __future__ import annotations
+from dataclasses import dataclass
+from abc import abstractmethod, ABC
+from metadata import Metadata
+from step import Step
+
+@dataclass
+class Enricher(Step, ABC):
+    """Abstract base for steps that enrich a `Metadata` item.
+
+    Concrete enrichers subclass this directly, set their own `name` and
+    implement `enrich()`.
+    """
+    name = "enricher"
+
+    def __init__(self, config: dict) -> None:
+        """Forward `config` to `Step.__init__`, where it is a required argument."""
+        super().__init__(config)
+
+    # only for typing...
+    @staticmethod
+    def init(name: str, config: dict) -> Enricher:
+        """Build the direct `Enricher` subclass whose `name` equals `name`.
+
+        Delegates to `Step.init`, passing `Enricher` as the parent to search.
+        Static because it takes no `self` and is called on the class itself.
+        """
+        return Step.init(name, config, Enricher)
+
+    @abstractmethod
+    def enrich(self, item: Metadata) -> Metadata:
+        """Enrich `item` and return the resulting Metadata; must be overridden."""
--- a/src/enrichers/enricher_screenshot.py
+++ b/src/enrichers/enricher_screenshot.py
@@ -0,0 +1,64 @@
+from . import Enricher
+from metadata import Metadata
+from loguru import logger
+
+
+class ScreenshotEnricher(Enricher):
+    """Enricher registered as "screenshot"; the capture code is still commented out."""
+    name = "screenshot"
+
+    @staticmethod
+    def configs() -> dict:
+        """Describe the accepted options as {option: {"default": ..., "help": ...}}."""
+        options = (
+            ("width", 1280, "width of the screenshots"),
+            ("height", 720, "height of the screenshots"),
+        )
+        return {
+            option: {"default": default, "help": help_text}
+            for option, default, help_text in options
+        }
+
+    def enrich(self, item: Metadata) -> Metadata:
+        """Look up the item's url and print a marker; nothing is captured yet.
+
+        NOTE(review): returns None implicitly although annotated `-> Metadata`.
+        """
+        # the url is only consumed by the disabled code below
+        url = self.get_url(item)
+        print("enrich")
+        # driver = config.webdriver
+        # with driver as Webdriver(): # TODO: make a util
+        #     #TODO: take screenshot
+        #     pass
+
+        # logger.debug(f"getting screenshot for {url=}")
+        # key = self._get_key_from_url(url, ".png", append_datetime=True)
+        # filename = os.path.join(Storage.TMP_FOLDER, key)
+
+        # # Accept cookies popup dismiss for ytdlp video
+        # if 'facebook.com' in url:
+        #     try:
+        #         logger.debug(f'Trying fb click accept cookie popup for {url}')
+        #         self.driver.get("http://www.facebook.com")
+        #         foo = self.driver.find_element(By.XPATH, "//button[@data-cookiebanner='accept_only_essential_button']")
+        #         foo.click()
+        #         logger.debug(f'fb click worked')
+        #         # linux server needs a sleep otherwise facebook cookie won't have worked and we'll get a popup on next page
+        #         time.sleep(2)
+        #     except:
+        #         logger.warning(f'Failed on fb accept cookies for url {url}')
+
+        # try:
+        #     self.driver.get(url)
+        #     time.sleep(6)
+        # except TimeoutException:
+        #     logger.info("TimeoutException loading page for screenshot")
+
+        # self.driver.save_screenshot(filename)
+        # self.storage.upload(filename, key, extra_args={'ACL': 'public-read', 'ContentType': 'image/png'})
+
+        # cdn_url = self.storage.get_cdn_url(key)
+        # self.add_to_media(cdn_url, key)
+
+        # return cdn_url
--- a/src/metadata.py
+++ b/src/metadata.py
@@ -0,0 +1,42 @@
+
+from __future__ import annotations
+from typing import Union, Dict
+from dataclasses import dataclass
+
+
+@dataclass
+class Metadata:
+    """Hold a `status` string plus a mapping of named nested `Metadata` entries.
+
+    Does not handle files, only primitives: the only pieces of logic that
+    handle files are the archiver, enricher, and storage.
+    """
+    status: str
+    # title: str
+    # url: str
+    # hash: str
+    metadata: Dict[str, Metadata]
+
+    @staticmethod
+    def merge(left: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
+        """Stub, returns None for now.
+
+        Should return a merged version of the Metadata and work for archived()
+        and enriched(). What if 2 metadatas contain the same keys? Only one
+        can remain: that is what `overwrite_left` is for.
+        """
+        return None
+
+    def get(self, key: str) -> Union[Metadata, str]:
+        """Stub, returns None for now.
+
+        Should go through `metadata` and return the Metadata available.
+        """
+        return None
+
+    def as_json(self) -> str:
+        """Stub, returns None for now.
+
+        Should convert all metadata and data into JSON.
+        """
+        return None
--- a/src/orchestrator.py
+++ b/src/orchestrator.py
@@ -1,5 +1,5 @@
-from typing import Union, Dict
 from __future__ import annotations
+from typing import Union, Dict
 from dataclasses import dataclass

 """
@@ -39,31 +39,31 @@ Cisticola considerations:
 2. So the auto-archiver becomes like a puzzle and fixes to Cisticola scrapers can immediately benefit it, and contributions are focused on a single source or scraping
 """

-@dataclass
-class Metadata:
-    # does not handle files, only primitives
-    # the only piece of logic to handle files is the archiver, enricher, and storage
-    status: str
-    # title: str
-    # url: str
-    # hash: str
-    main_file: Metadata
-    metadata: Dict[str, Metadata]
+# @dataclass
+# class Metadata:
+#     # does not handle files, only primitives
+#     # the only piece of logic to handle files is the archiver, enricher, and storage
+#     status: str
+#     # title: str
+#     # url: str
+#     # hash: str
+#     main_file: Metadata
+#     metadata: Dict[str, Metadata]

-    @staticmethod
-    def merge(left, right : Metadata, overwrite_left=True) -> Metadata:
-        # should return a merged version of the Metadata
-        # will work for archived() and enriched()
-        # what if 2 metadatas contain the same keys? only one can remain! : overwrite_left
-        pass
+#     @staticmethod
+#     def merge(left, right : Metadata, overwrite_left=True) -> Metadata:
+#         # should return a merged version of the Metadata
+#         # will work for archived() and enriched()
+#         # what if 2 metadatas contain the same keys? only one can remain! : overwrite_left
+#         pass

-    def get(self, key) -> Union[Metadata, str]:
-        # goes through metadata and returns the Metadata available
-        pass
+#     def get(self, key) -> Union[Metadata, str]:
+#         # goes through metadata and returns the Metadata available
+#         pass

-    def as_json(self) -> str:
-        # converts all metadata and data into JSON
-        pass
+#     def as_json(self) -> str:
+#         # converts all metadata and data into JSON
+#         pass


 """
@@ -116,27 +116,27 @@ class ArchivingOrchestrator:
        # where does that update/processing happen? in config.py
        # reflection for Archiver to know wihch child classes it has? use Archiver.__subclasses__
        self.archivers = [
-            Archiver.init(a, config.get(a))
+            Archiver.init(a, config)
            for a in config.archivers
        ]

-        self.enrichments = [
-            Enrichment.init(e, config.get(e))
-            for e in config.enrichments
+        self.enrichers = [
+            Enricher.init(e, config)
+            for e in config.enrichers
        ]

        self.formatters = [
-            Formatter.init(f, config.get(f))
+            Formatter.init(f, config)
            for f in config.formatters
        ]

        self.storages = [
-            Storage.init(s, config.get(s))
+            Storage.init(s, config)
            for s in config.storages
        ]

        self.databases = [
-            Database.init(f, config.get(f))
+            Database.init(f, config)
            for f in config.formatters
        ]

@@ -192,11 +192,11 @@ class ArchivingOrchestrator:

        # what if an archiver returns multiple entries and one is to be part of HTMLgenerator?
        # should it call the HTMLgenerator as if it's not an enrichment?
-        # eg: if it is enable: generates an HTML with all the returned media, should it include enrichments? yes
+        # eg: if it is enabled: generates an HTML with all the returned media, should it include enrichers? yes
        # then how to execute it last? should there also be post-processors? are there other examples?
        # maybe as a PDF? or a Markdown file
        # side captures: screenshot, wacz, webarchive, thumbnails, HTMLgenerator
-        for e in enrichments:
+        for e in enrichers:
            result.update(e.enrich(result))

        # formatters, enrichers, and storages will sometimes look for specific properties: eg <li>Screenshot: <img src="{res.get("screenshot")}"> </li>
--- a/src/step.py
+++ b/src/step.py
@@ -0,0 +1,58 @@
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Type
+from metadata import Metadata
+from abc import ABC
+
+
+@dataclass
+class Step(ABC):
+    """Base class of every named, configurable step.
+
+    Subclasses override the class attribute `name`. `Step.init` uses it to
+    choose the subclass to instantiate, and `__init__` uses it to select the
+    step's own section of the configuration.
+    """
+    name : str = None
+
+    def __init__(self, config: dict) -> None:
+        """Keep only this step's section of `config`.
+
+        Args:
+            config: mapping of step name -> that step's options. A step with
+                no section gets an empty dict.
+        """
+        # read from the argument: `self.config` does not exist before this line
+        self.config = config.get(self.name, {})
+
+    @staticmethod
+    def configs() -> dict:
+        """Return the options this step accepts; the base class declares none."""
+        return {}
+
+    @staticmethod
+    def init(name: str, config: dict, child: Type[Step]) -> Step:
+        """Instantiate the subclass of `child` whose `name` equals `name`.
+
+        Only direct subclasses are searched (`child.__subclasses__()`), not
+        subclasses of subclasses.
+
+        Raises:
+            ValueError: if no direct subclass of `child` is named `name`.
+        """
+        for sub in child.__subclasses__():
+            if sub.name == name:
+                # call the class: `sub.__init__(config)` would bind `config` as self
+                return sub(config)
+        # a plain str cannot be raised; wrap the message in an exception
+        raise ValueError(f"Unable to initialize class with {name=}")
+
+    def get_url(self, item: Metadata) -> str:
+        """Return the non-empty string stored under "url" in `item`.
+
+        Raises:
+            AssertionError: if the value is not a str or is empty.
+        """
+        url = item.get("url")
+        assert isinstance(url, str) and len(url) > 0
+        return url
--- a/src/utils/init.py
+++ b/src/utils/init.py
@@ -1,3 +1,4 @@
 # we need to explicitly expose the available imports here
 from .gworksheet import *
-from .misc import *
+from .misc import *
+from .util import Util
--- a/src/utils/util.py
+++ b/src/utils/util.py
@@ -0,0 +1,30 @@
+from __future__ import annotations
+from dataclasses import dataclass
+from abc import abstractmethod, ABC
+from metadata import Metadata
+from step import Step
+
+@dataclass
+class Util(Step, ABC):
+    """Abstract base for configurable utility steps, selected by their `name`."""
+    name = "util"
+
+    def __init__(self, config: dict) -> None:
+        """Forward `config` to `Step.__init__`, where it is a required argument."""
+        super().__init__(config)
+
+    # only for typing...
+    @staticmethod
+    def init(name: str, config: dict) -> Util:
+        """Build the direct `Util` subclass whose `name` equals `name`.
+
+        Calls `Step.init` by name: zero-argument `super()` only works inside
+        a method that receives an instance or class, which this one does not.
+        """
+        return Step.init(name, config, Util)
+
+    # NOTE(review): same signature as Enricher.enrich -- confirm every Util is
+    # really meant to implement it
+    @abstractmethod
+    def enrich(self, item: Metadata) -> Metadata:
+        """Process `item` and return the resulting Metadata; must be overridden."""
--- a/src/v2.py
+++ b/src/v2.py
@@ -0,0 +1,9 @@
+
+
+from configs.v2config import ConfigV2
+from orchestrator import ArchivingOrchestrator
+
+config = ConfigV2()  # entry script: build the v2 configuration object
+config.parse()  # runs at import time (no __main__ guard): reads CLI flags and the YAML config file
+
+# orchestrator = ArchivingOrchestrator(config)