From 955891a411cb2bd96a477f3751472776995b101a Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Sat, 10 Dec 2022 12:03:46 +0000 Subject: [PATCH] WIP feeder --- orchestrate.yaml | 8 +++--- src/configs/v2config.py | 2 +- src/databases/database.py | 21 ++++++++++++++ src/enrichers/enricher.py | 2 +- src/feeders/feeder.py | 2 +- src/feeders/feeder_gsheet.py | 54 ++++++++++++------------------------ src/steps/gsheet.py | 42 ++++++++++++++++++++++++++++ src/{ => steps}/step.py | 0 src/utils/util.py | 2 +- 9 files changed, 88 insertions(+), 45 deletions(-) create mode 100644 src/databases/database.py create mode 100644 src/steps/gsheet.py rename src/{ => steps}/step.py (100%) diff --git a/orchestrate.yaml b/orchestrate.yaml index 3a2bc27..9626e83 100644 --- a/orchestrate.yaml +++ b/orchestrate.yaml @@ -3,11 +3,11 @@ steps: # a feeder could be in an "infinite loop" for example: gsheets_infinite feeder which holds-> this could be an easy logic addiction by modifying for each to while not feeder.done() if it becomes necessary feeder: gsheets_feeder # default -> only expects URL from CLI archivers: # order matters - - tiktok - telethon - - twitter - - instagram - - webarchive # this way it runs as a failsafe only + # - tiktok + # - twitter + # - instagram + # - webarchive # this way it runs as a failsafe only enrichers: - screenshot # - wacz diff --git a/src/configs/v2config.py b/src/configs/v2config.py index 9eb35df..50c8b0f 100644 --- a/src/configs/v2config.py +++ b/src/configs/v2config.py @@ -4,7 +4,7 @@ import argparse, yaml from dataclasses import dataclass, field from typing import List from feeders.feeder import Feeder -from step import Step +from steps.step import Step from utils import Util from enrichers import Enricher from collections import defaultdict diff --git a/src/databases/database.py b/src/databases/database.py new file mode 100644 index 0000000..15f8d0d --- /dev/null +++ b/src/databases/database.py @@ -0,0 +1,21 @@ +from __future__ import annotations +from dataclasses import dataclass +from abc import abstractmethod, ABC +from metadata import Metadata +from steps.step import Step + +@dataclass +class Database(Step, ABC): + name = "database" + + def __init__(self, config: dict) -> None: + # without this STEP.__init__ is not called + super().__init__(config) + + + # only for typing... + def init(name: str, config: dict) -> Database: + return Step.init(name, config, Database) + + @abstractmethod + def enrich(self, item: Metadata) -> Metadata: pass diff --git a/src/enrichers/enricher.py b/src/enrichers/enricher.py index baa22e3..faf43d8 100644 --- a/src/enrichers/enricher.py +++ b/src/enrichers/enricher.py @@ -2,7 +2,7 @@ from __future__ import annotations from dataclasses import dataclass from abc import abstractmethod, ABC from metadata import Metadata -from step import Step +from steps.step import Step @dataclass class Enricher(Step, ABC): diff --git a/src/feeders/feeder.py b/src/feeders/feeder.py index 6b7ba10..d930ba0 100644 --- a/src/feeders/feeder.py +++ b/src/feeders/feeder.py @@ -2,7 +2,7 @@ from __future__ import annotations from dataclasses import dataclass from abc import abstractmethod # from metadata import Metadata -from step import Step +from steps.step import Step @dataclass diff --git a/src/feeders/feeder_gsheet.py b/src/feeders/feeder_gsheet.py index 7ebc640..a99376f 100644 --- a/src/feeders/feeder_gsheet.py +++ b/src/feeders/feeder_gsheet.py @@ -5,10 +5,11 @@ from loguru import logger # from . import Enricher from feeders.feeder import Feeder +from steps.gsheet import Gsheets from utils import GWorksheet -class GsheetsFeeder(Feeder): +class GsheetsFeeder(Gsheets, Feeder): name = "gsheets_feeder" def __init__(self, config: dict) -> None: @@ -19,41 +20,21 @@ class GsheetsFeeder(Feeder): @staticmethod def configs() -> dict: - return { - "sheet": {"default": None, "help": "name of the sheet to archive"}, - "header": {"default": 1, "help": "index of the header row (starts at 1)"}, - "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path"}, - "allow_worksheets": { - "default": set(), - "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed", - "cli_set": lambda cli_val, cur_val: set(cli_val.split(",")) - }, - "block_worksheets": { - "default": set(), - "help": "(CSV) explicitly block some worksheets from being processed, defaults to empty", - "cli_set": lambda cli_val, cur_val: set(cli_val.split(",")) - }, - "columns": { - "default": { - 'url': 'link', - 'status': 'archive status', - 'folder': 'destination folder', - 'archive': 'archive location', - 'date': 'archive date', - 'thumbnail': 'thumbnail', - 'thumbnail_index': 'thumbnail index', - 'timestamp': 'upload timestamp', - 'title': 'upload title', - 'duration': 'duration', - 'screenshot': 'screenshot', - 'hash': 'hash', - 'wacz': 'wacz', - 'replaywebpage': 'replaywebpage', + return dict( + Gsheets.configs(), + ** { + "allow_worksheets": { + "default": set(), + "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed", + "cli_set": lambda cli_val, cur_val: set(cli_val.split(",")) }, - "help": "names of columns in the google sheet", - "cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val)) - }, - } + "block_worksheets": { + "default": set(), + "help": "(CSV) explicitly block some worksheets from being processed, defaults to empty", + "cli_set": lambda cli_val, cur_val: set(cli_val.split(",")) + } + }) + def __iter__(self) -> str: sh = self.gsheets_client.open(self.sheet) for ii, wks in enumerate(sh.worksheets()): @@ -71,7 +52,7 @@ class GsheetsFeeder(Feeder): for row in range(1 + self.header, gw.count_rows() + 1): url = gw.get_cell(row, 'url').strip() if not len(url): continue - #TODO: gsheet_db should check later if this is supposed to be archived + # TODO: gsheet_db should check later if this is supposed to be archived # static_status = gw.get_cell(row, 'status') # status = gw.get_cell(row, 'status', fresh=static_status in ['', None] and url != '') # All checks done - archival process starts here @@ -83,7 +64,6 @@ class GsheetsFeeder(Feeder): for u in ["url1", "url2"]: yield u - def should_process_sheet(self, sheet_name: str) -> bool: if len(self.allow_worksheets) and sheet_name not in self.allow_worksheets: # ALLOW rules exist AND sheet name not explicitly allowed diff --git a/src/steps/gsheet.py b/src/steps/gsheet.py new file mode 100644 index 0000000..9654da4 --- /dev/null +++ b/src/steps/gsheet.py @@ -0,0 +1,42 @@ +import json, gspread + +from loguru import logger +from steps.step import Step + + +class Gsheets(Step): + name = "gsheets" + + def __init__(self, config: dict) -> None: + # without this STEP.__init__ is not called + super().__init__(config) + self.gsheets_client = gspread.service_account(filename=self.service_account) + assert type(self.header) == int, f"header ({self.header}) value must be an integer not {type(self.header)}" + + @staticmethod + def configs() -> dict: + return { + "sheet": {"default": None, "help": "name of the sheet to archive"}, + "header": {"default": 1, "help": "index of the header row (starts at 1)"}, + "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path"}, + "columns": { + "default": { + 'url': 'link', + 'status': 'archive status', + 'folder': 'destination folder', + 'archive': 'archive location', + 'date': 'archive date', + 'thumbnail': 'thumbnail', + 'thumbnail_index': 'thumbnail index', + 'timestamp': 'upload timestamp', + 'title': 'upload title', + 'duration': 'duration', + 'screenshot': 'screenshot', + 'hash': 'hash', + 'wacz': 'wacz', + 'replaywebpage': 'replaywebpage', + }, + "help": "names of columns in the google sheet", + "cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val)) + }, + } \ No newline at end of file diff --git a/src/step.py b/src/steps/step.py similarity index 100% rename from src/step.py rename to src/steps/step.py diff --git a/src/utils/util.py b/src/utils/util.py index 51bb2e3..714d499 100644 --- a/src/utils/util.py +++ b/src/utils/util.py @@ -2,7 +2,7 @@ from __future__ import annotations from dataclasses import dataclass from abc import abstractmethod, ABC from metadata import Metadata -from step import Step +from steps.step import Step @dataclass class Util(Step, ABC):