WIP feeder

2026-06-11 20:58:29 +03:00 · 2022-12-10 12:03:46 +00:00
parent 9dc709d3b9
commit 955891a411
9 changed files with 88 additions and 45 deletions
--- a/orchestrate.yaml
+++ b/orchestrate.yaml
@@ -3,11 +3,11 @@ steps:
  # a feeder could be in an "infinite loop", for example a gsheets_infinite feeder which holds -> this could be an easy logic addition: change the "for each" into "while not feeder.done()" if it becomes necessary
  feeder: gsheets_feeder # default -> only expects URL from CLI
  archivers: # order matters
-    - tiktok
    - telethon
-    - twitter
-    - instagram
-    - webarchive # this way it runs as a failsafe only
+    # - tiktok
+    # - twitter
+    # - instagram
+    # - webarchive # this way it runs as a failsafe only
  enrichers:
    - screenshot
    # - wacz
--- a/src/configs/v2config.py
+++ b/src/configs/v2config.py
@@ -4,7 +4,7 @@ import argparse, yaml
 from dataclasses import dataclass, field
 from typing import List
 from feeders.feeder import Feeder
-from step import Step
+from steps.step import Step
 from utils import Util
 from enrichers import Enricher
 from collections import defaultdict
--- a/src/databases/database.py
+++ b/src/databases/database.py
@@ -0,0 +1,21 @@
+from __future__ import annotations
+from dataclasses import dataclass
+from abc import abstractmethod, ABC
+from metadata import Metadata
+from steps.step import Step
+
+@dataclass
+class Database(Step, ABC):
+    name = "database"
+
+    def __init__(self, config: dict) -> None:
+        # without this Step.__init__ is not called
+        super().__init__(config)
+        
+
+    # only for typing: delegates to Step.init so the return value is annotated as Database
+    def init(name: str, config: dict) -> Database:
+        return Step.init(name, config, Database)
+
+    @abstractmethod
+    def enrich(self, item: Metadata) -> Metadata: pass
--- a/src/enrichers/enricher.py
+++ b/src/enrichers/enricher.py
@@ -2,7 +2,7 @@ from __future__ import annotations
 from dataclasses import dataclass
 from abc import abstractmethod, ABC
 from metadata import Metadata
-from step import Step
+from steps.step import Step

@dataclass
 class Enricher(Step, ABC):
--- a/src/feeders/feeder.py
+++ b/src/feeders/feeder.py
@@ -2,7 +2,7 @@ from __future__ import annotations
 from dataclasses import dataclass
 from abc import abstractmethod
 # from metadata import Metadata
-from step import Step
+from steps.step import Step


@dataclass
--- a/src/feeders/feeder_gsheet.py
+++ b/src/feeders/feeder_gsheet.py
@@ -5,10 +5,11 @@ from loguru import logger

 # from . import Enricher
 from feeders.feeder import Feeder
+from steps.gsheet import Gsheets
 from utils import GWorksheet


-class GsheetsFeeder(Feeder):
+class GsheetsFeeder(Gsheets, Feeder):
    name = "gsheets_feeder"

    def __init__(self, config: dict) -> None:
@@ -19,41 +20,21 @@ class GsheetsFeeder(Feeder):

    @staticmethod
    def configs() -> dict:
-        return {
-            "sheet": {"default": None, "help": "name of the sheet to archive"},
-            "header": {"default": 1, "help": "index of the header row (starts at 1)"},
-            "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path"},
-            "allow_worksheets": {
-                "default": set(),
-                "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
-                "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
-            },
-            "block_worksheets": {
-                "default": set(),
-                "help": "(CSV) explicitly block some worksheets from being processed, defaults to empty",
-                "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
-            },
-            "columns": {
-                "default": {
-                    'url': 'link',
-                    'status': 'archive status',
-                    'folder': 'destination folder',
-                    'archive': 'archive location',
-                    'date': 'archive date',
-                    'thumbnail': 'thumbnail',
-                    'thumbnail_index': 'thumbnail index',
-                    'timestamp': 'upload timestamp',
-                    'title': 'upload title',
-                    'duration': 'duration',
-                    'screenshot': 'screenshot',
-                    'hash': 'hash',
-                    'wacz': 'wacz',
-                    'replaywebpage': 'replaywebpage',
+        return dict(
+            Gsheets.configs(),
+            ** {
+                "allow_worksheets": {
+                    "default": set(),
+                    "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
+                    "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
                },
-                "help": "names of columns in the google sheet",
-                "cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val))
-            },
-        }
+                "block_worksheets": {
+                    "default": set(),
+                    "help": "(CSV) explicitly block some worksheets from being processed, defaults to empty",
+                    "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
+                }
+            })
+
    def __iter__(self) -> str:
        sh = self.gsheets_client.open(self.sheet)
        for ii, wks in enumerate(sh.worksheets()):
@@ -71,7 +52,7 @@ class GsheetsFeeder(Feeder):
            for row in range(1 + self.header, gw.count_rows() + 1):
                url = gw.get_cell(row, 'url').strip()
                if not len(url): continue
-                #TODO: gsheet_db should check later if this is supposed to be archived
+                # TODO: gsheet_db should check later if this is supposed to be archived
                # static_status = gw.get_cell(row, 'status')
                # status = gw.get_cell(row, 'status', fresh=static_status in ['', None] and url != '')
                # All checks done - archival process starts here
@@ -83,7 +64,6 @@ class GsheetsFeeder(Feeder):
        for u in ["url1", "url2"]:
            yield u

-
    def should_process_sheet(self, sheet_name: str) -> bool:
        if len(self.allow_worksheets) and sheet_name not in self.allow_worksheets:
            # ALLOW rules exist AND sheet name not explicitly allowed
--- a/src/steps/gsheet.py
+++ b/src/steps/gsheet.py
@@ -0,0 +1,42 @@
+import json, gspread
+
+from loguru import logger
+from steps.step import Step
+
+
+class Gsheets(Step):
+    name = "gsheets"
+
+    def __init__(self, config: dict) -> None:
+        # without this Step.__init__ is not called
+        super().__init__(config)
+        self.gsheets_client = gspread.service_account(filename=self.service_account)
+        assert type(self.header) == int, f"header ({self.header}) value must be an integer not {type(self.header)}"
+
+    @staticmethod
+    def configs() -> dict:
+        return {
+            "sheet": {"default": None, "help": "name of the sheet to archive"},
+            "header": {"default": 1, "help": "index of the header row (starts at 1)"},
+            "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path"},
+            "columns": {
+                "default": {
+                    'url': 'link',
+                    'status': 'archive status',
+                    'folder': 'destination folder',
+                    'archive': 'archive location',
+                    'date': 'archive date',
+                    'thumbnail': 'thumbnail',
+                    'thumbnail_index': 'thumbnail index',
+                    'timestamp': 'upload timestamp',
+                    'title': 'upload title',
+                    'duration': 'duration',
+                    'screenshot': 'screenshot',
+                    'hash': 'hash',
+                    'wacz': 'wacz',
+                    'replaywebpage': 'replaywebpage',
+                },
+                "help": "names of columns in the google sheet",
+                "cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val))
+            },
+        }
--- a/src/steps/step.py
+++ b/src/steps/step.py
--- a/src/utils/util.py
+++ b/src/utils/util.py
@@ -2,7 +2,7 @@ from __future__ import annotations
 from dataclasses import dataclass
 from abc import abstractmethod, ABC
 from metadata import Metadata
-from step import Step
+from steps.step import Step

@dataclass
 class Util(Step, ABC):