WIP feeder

This commit is contained in:
msramalho
2022-12-10 12:03:46 +00:00
parent 9dc709d3b9
commit 955891a411
9 changed files with 88 additions and 45 deletions

View File

@@ -3,11 +3,11 @@ steps:
# a feeder could run in an "infinite loop", for example a gsheets_infinite feeder which holds -> this could be an easy logic addition, changing the for-each into `while not feeder.done()` if it becomes necessary
feeder: gsheets_feeder # default -> only expects URL from CLI
archivers: # order matters
- tiktok
- telethon
- twitter
- instagram
- webarchive # this way it runs as a failsafe only
# - tiktok
# - twitter
# - instagram
# - webarchive # this way it runs as a failsafe only
enrichers:
- screenshot
# - wacz

View File

@@ -4,7 +4,7 @@ import argparse, yaml
from dataclasses import dataclass, field
from typing import List
from feeders.feeder import Feeder
from step import Step
from steps.step import Step
from utils import Util
from enrichers import Enricher
from collections import defaultdict

21
src/databases/database.py Normal file
View File

@@ -0,0 +1,21 @@
from __future__ import annotations
from dataclasses import dataclass
from abc import abstractmethod, ABC
from metadata import Metadata
from steps.step import Step
@dataclass
class Database(Step, ABC):
    """Abstract base class for database steps.

    Concrete subclasses must implement :meth:`enrich`.
    """

    name = "database"

    def __init__(self, config: dict) -> None:
        """Forward ``config`` to ``Step.__init__``.

        Args:
            config: configuration dictionary handed to the parent ``Step``.
        """
        # without this explicit call Step.__init__ is not called
        super().__init__(config)

    # Declared static because it takes no ``self``: without the decorator the
    # call only works on the class (``Database.init(...)``) and breaks on an
    # instance, where the instance would be bound to ``name``.
    @staticmethod
    def init(name: str, config: dict) -> Database:
        """Delegate to ``Step.init`` with ``Database`` as the third argument.

        Exists only so the return value is typed as ``Database``.

        Args:
            name: passed through to ``Step.init`` (presumably the ``name`` of
                the step to create -- confirm in steps/step.py).
            config: passed through to ``Step.init``.

        Returns:
            Whatever ``Step.init(name, config, Database)`` returns.
        """
        return Step.init(name, config, Database)

    @abstractmethod
    def enrich(self, item: Metadata) -> Metadata:
        """Abstract hook that subclasses must override.

        Args:
            item: the ``Metadata`` object to work on.

        Returns:
            A ``Metadata`` object, per the annotation.
        """
        pass

View File

@@ -2,7 +2,7 @@ from __future__ import annotations
from dataclasses import dataclass
from abc import abstractmethod, ABC
from metadata import Metadata
from step import Step
from steps.step import Step
@dataclass
class Enricher(Step, ABC):

View File

@@ -2,7 +2,7 @@ from __future__ import annotations
from dataclasses import dataclass
from abc import abstractmethod
# from metadata import Metadata
from step import Step
from steps.step import Step
@dataclass

View File

@@ -5,10 +5,11 @@ from loguru import logger
# from . import Enricher
from feeders.feeder import Feeder
from steps.gsheet import Gsheets
from utils import GWorksheet
class GsheetsFeeder(Feeder):
class GsheetsFeeder(Gsheets, Feeder):
name = "gsheets_feeder"
def __init__(self, config: dict) -> None:
@@ -19,41 +20,21 @@ class GsheetsFeeder(Feeder):
@staticmethod
def configs() -> dict:
return {
"sheet": {"default": None, "help": "name of the sheet to archive"},
"header": {"default": 1, "help": "index of the header row (starts at 1)"},
"service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path"},
"allow_worksheets": {
"default": set(),
"help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
"cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
},
"block_worksheets": {
"default": set(),
"help": "(CSV) explicitly block some worksheets from being processed, defaults to empty",
"cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
},
"columns": {
"default": {
'url': 'link',
'status': 'archive status',
'folder': 'destination folder',
'archive': 'archive location',
'date': 'archive date',
'thumbnail': 'thumbnail',
'thumbnail_index': 'thumbnail index',
'timestamp': 'upload timestamp',
'title': 'upload title',
'duration': 'duration',
'screenshot': 'screenshot',
'hash': 'hash',
'wacz': 'wacz',
'replaywebpage': 'replaywebpage',
return dict(
Gsheets.configs(),
** {
"allow_worksheets": {
"default": set(),
"help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
"cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
},
"help": "names of columns in the google sheet",
"cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val))
},
}
"block_worksheets": {
"default": set(),
"help": "(CSV) explicitly block some worksheets from being processed, defaults to empty",
"cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
}
})
def __iter__(self) -> str:
sh = self.gsheets_client.open(self.sheet)
for ii, wks in enumerate(sh.worksheets()):
@@ -71,7 +52,7 @@ class GsheetsFeeder(Feeder):
for row in range(1 + self.header, gw.count_rows() + 1):
url = gw.get_cell(row, 'url').strip()
if not len(url): continue
#TODO: gsheet_db should check later if this is supposed to be archived
# TODO: gsheet_db should check later if this is supposed to be archived
# static_status = gw.get_cell(row, 'status')
# status = gw.get_cell(row, 'status', fresh=static_status in ['', None] and url != '')
# All checks done - archival process starts here
@@ -83,7 +64,6 @@ class GsheetsFeeder(Feeder):
for u in ["url1", "url2"]:
yield u
def should_process_sheet(self, sheet_name: str) -> bool:
if len(self.allow_worksheets) and sheet_name not in self.allow_worksheets:
# ALLOW rules exist AND sheet name not explicitly allowed

42
src/steps/gsheet.py Normal file
View File

@@ -0,0 +1,42 @@
import json, gspread
from loguru import logger
from steps.step import Step
class Gsheets(Step):
    """Step that holds a gspread client and the shared Google Sheets options."""

    name = "gsheets"

    def __init__(self, config: dict) -> None:
        """Initialise the parent ``Step`` and open the gspread client.

        Reads ``self.service_account`` and ``self.header`` right after the
        parent constructor runs (presumably populated by ``Step.__init__``
        from ``config`` -- confirm in steps/step.py).
        """
        # Step.__init__ is not invoked unless it is called explicitly here
        super().__init__(config)
        self.gsheets_client = gspread.service_account(filename=self.service_account)
        assert type(self.header) == int, f"header ({self.header}) value must be an integer not {type(self.header)}"

    @staticmethod
    def configs() -> dict:
        """Return the option table: one entry per option with default and help.

        The ``columns`` entry also carries a ``cli_set`` callable that merges
        a JSON object given on the command line into the current mapping.
        """
        # logical column key -> header text used in the google sheet
        column_names = {
            'url': 'link',
            'status': 'archive status',
            'folder': 'destination folder',
            'archive': 'archive location',
            'date': 'archive date',
            'thumbnail': 'thumbnail',
            'thumbnail_index': 'thumbnail index',
            'timestamp': 'upload timestamp',
            'title': 'upload title',
            'duration': 'duration',
            'screenshot': 'screenshot',
            'hash': 'hash',
            'wacz': 'wacz',
            'replaywebpage': 'replaywebpage',
        }

        options = {}
        options["sheet"] = {"default": None, "help": "name of the sheet to archive"}
        options["header"] = {"default": 1, "help": "index of the header row (starts at 1)"}
        options["service_account"] = {
            "default": "secrets/service_account.json",
            "help": "service account JSON file path",
        }
        options["columns"] = {
            "default": column_names,
            "help": "names of columns in the google sheet",
            # values from the CLI JSON override the current ones key by key
            "cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val)),
        }
        return options

View File

@@ -2,7 +2,7 @@ from __future__ import annotations
from dataclasses import dataclass
from abc import abstractmethod, ABC
from metadata import Metadata
from step import Step
from steps.step import Step
@dataclass
class Util(Step, ABC):