mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-11 20:58:29 +03:00
WIP feeder
This commit is contained in:
@@ -3,11 +3,11 @@ steps:
|
||||
# a feeder could be in an "infinite loop", for example a gsheets_infinite feeder which holds -> this could be an easy logic addition by changing the "for each" to "while not feeder.done()" if it becomes necessary
|
||||
feeder: gsheets_feeder # default -> only expects URL from CLI
|
||||
archivers: # order matters
|
||||
- tiktok
|
||||
- telethon
|
||||
- twitter
|
||||
- instagram
|
||||
- webarchive # this way it runs as a failsafe only
|
||||
# - tiktok
|
||||
# - twitter
|
||||
# - instagram
|
||||
# - webarchive # this way it runs as a failsafe only
|
||||
enrichers:
|
||||
- screenshot
|
||||
# - wacz
|
||||
|
||||
@@ -4,7 +4,7 @@ import argparse, yaml
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List
|
||||
from feeders.feeder import Feeder
|
||||
from step import Step
|
||||
from steps.step import Step
|
||||
from utils import Util
|
||||
from enrichers import Enricher
|
||||
from collections import defaultdict
|
||||
|
||||
21
src/databases/database.py
Normal file
21
src/databases/database.py
Normal file
@@ -0,0 +1,21 @@
|
||||
from __future__ import annotations
|
||||
from dataclasses import dataclass
|
||||
from abc import abstractmethod, ABC
|
||||
from metadata import Metadata
|
||||
from steps.step import Step
|
||||
|
||||
@dataclass
class Database(Step, ABC):
    """Abstract base class for database steps.

    Concrete databases subclass this, and must implement `enrich`.
    """
    name = "database"

    def __init__(self, config: dict) -> None:
        # An explicit __init__ is needed here: without it Step.__init__ is
        # not called (presumably because @dataclass would otherwise generate
        # its own __init__ -- confirm against steps.step.Step).
        super().__init__(config)

    @staticmethod
    def init(name: str, config: dict) -> Database:
        """Typed wrapper around `Step.init` for Database steps.

        Declared as a staticmethod because it takes no `self`/`cls`; this
        keeps `Database.init(name, config)` working and also makes the call
        valid on an instance.

        Args:
            name: forwarded unchanged to `Step.init`.
            config: forwarded unchanged to `Step.init`.

        Returns:
            Whatever `Step.init(name, config, Database)` returns; annotated
            as `Database` only for typing purposes.
        """
        return Step.init(name, config, Database)

    @abstractmethod
    def enrich(self, item: Metadata) -> Metadata:
        """Process `item` and return a `Metadata`; must be implemented by subclasses."""
|
||||
@@ -2,7 +2,7 @@ from __future__ import annotations
|
||||
from dataclasses import dataclass
|
||||
from abc import abstractmethod, ABC
|
||||
from metadata import Metadata
|
||||
from step import Step
|
||||
from steps.step import Step
|
||||
|
||||
@dataclass
|
||||
class Enricher(Step, ABC):
|
||||
|
||||
@@ -2,7 +2,7 @@ from __future__ import annotations
|
||||
from dataclasses import dataclass
|
||||
from abc import abstractmethod
|
||||
# from metadata import Metadata
|
||||
from step import Step
|
||||
from steps.step import Step
|
||||
|
||||
|
||||
@dataclass
|
||||
|
||||
@@ -5,10 +5,11 @@ from loguru import logger
|
||||
|
||||
# from . import Enricher
|
||||
from feeders.feeder import Feeder
|
||||
from steps.gsheet import Gsheets
|
||||
from utils import GWorksheet
|
||||
|
||||
|
||||
class GsheetsFeeder(Feeder):
|
||||
class GsheetsFeeder(Gsheets, Feeder):
|
||||
name = "gsheets_feeder"
|
||||
|
||||
def __init__(self, config: dict) -> None:
|
||||
@@ -19,41 +20,21 @@ class GsheetsFeeder(Feeder):
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"sheet": {"default": None, "help": "name of the sheet to archive"},
|
||||
"header": {"default": 1, "help": "index of the header row (starts at 1)"},
|
||||
"service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path"},
|
||||
"allow_worksheets": {
|
||||
"default": set(),
|
||||
"help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
|
||||
"cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
|
||||
},
|
||||
"block_worksheets": {
|
||||
"default": set(),
|
||||
"help": "(CSV) explicitly block some worksheets from being processed, defaults to empty",
|
||||
"cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
|
||||
},
|
||||
"columns": {
|
||||
"default": {
|
||||
'url': 'link',
|
||||
'status': 'archive status',
|
||||
'folder': 'destination folder',
|
||||
'archive': 'archive location',
|
||||
'date': 'archive date',
|
||||
'thumbnail': 'thumbnail',
|
||||
'thumbnail_index': 'thumbnail index',
|
||||
'timestamp': 'upload timestamp',
|
||||
'title': 'upload title',
|
||||
'duration': 'duration',
|
||||
'screenshot': 'screenshot',
|
||||
'hash': 'hash',
|
||||
'wacz': 'wacz',
|
||||
'replaywebpage': 'replaywebpage',
|
||||
return dict(
|
||||
Gsheets.configs(),
|
||||
** {
|
||||
"allow_worksheets": {
|
||||
"default": set(),
|
||||
"help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
|
||||
"cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
|
||||
},
|
||||
"help": "names of columns in the google sheet",
|
||||
"cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val))
|
||||
},
|
||||
}
|
||||
"block_worksheets": {
|
||||
"default": set(),
|
||||
"help": "(CSV) explicitly block some worksheets from being processed, defaults to empty",
|
||||
"cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
|
||||
}
|
||||
})
|
||||
|
||||
def __iter__(self) -> str:
|
||||
sh = self.gsheets_client.open(self.sheet)
|
||||
for ii, wks in enumerate(sh.worksheets()):
|
||||
@@ -71,7 +52,7 @@ class GsheetsFeeder(Feeder):
|
||||
for row in range(1 + self.header, gw.count_rows() + 1):
|
||||
url = gw.get_cell(row, 'url').strip()
|
||||
if not len(url): continue
|
||||
#TODO: gsheet_db should check later if this is supposed to be archived
|
||||
# TODO: gsheet_db should check later if this is supposed to be archived
|
||||
# static_status = gw.get_cell(row, 'status')
|
||||
# status = gw.get_cell(row, 'status', fresh=static_status in ['', None] and url != '')
|
||||
# All checks done - archival process starts here
|
||||
@@ -83,7 +64,6 @@ class GsheetsFeeder(Feeder):
|
||||
for u in ["url1", "url2"]:
|
||||
yield u
|
||||
|
||||
|
||||
def should_process_sheet(self, sheet_name: str) -> bool:
|
||||
if len(self.allow_worksheets) and sheet_name not in self.allow_worksheets:
|
||||
# ALLOW rules exist AND sheet name not explicitly allowed
|
||||
|
||||
42
src/steps/gsheet.py
Normal file
42
src/steps/gsheet.py
Normal file
@@ -0,0 +1,42 @@
|
||||
import json, gspread
|
||||
|
||||
from loguru import logger
|
||||
from steps.step import Step
|
||||
|
||||
|
||||
class Gsheets(Step):
    """Step holding the Google Sheets configuration and a gspread client.

    After construction the instance exposes `gsheets_client`, created from
    the configured service account JSON file.
    """
    name = "gsheets"

    def __init__(self, config: dict) -> None:
        """Initialise the step from `config` and create the gspread client.

        Raises:
            AssertionError: if the configured `header` is not exactly an int.
        """
        # Step.__init__ must be invoked explicitly, otherwise it is not called
        super().__init__(config)
        # Validate the cheap, local setting before touching credentials so a
        # bad config fails fast. An explicit raise (instead of `assert`) keeps
        # the check alive under `python -O`; AssertionError and the message
        # are kept so callers observe the same failure as before.
        if type(self.header) is not int:
            raise AssertionError(f"header ({self.header}) value must be an integer not {type(self.header)}")
        self.gsheets_client = gspread.service_account(filename=self.service_account)

    @staticmethod
    def configs() -> dict:
        """Return the configuration options of this step.

        Each key maps to a dict with a "default" value and a "help" text;
        "columns" additionally carries a "cli_set" callable that merges a
        JSON object given on the CLI into the current column mapping.
        A fresh dict is built on every call.
        """
        return {
            "sheet": {"default": None, "help": "name of the sheet to archive"},
            "header": {"default": 1, "help": "index of the header row (starts at 1)"},
            "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path"},
            "columns": {
                "default": {
                    'url': 'link',
                    'status': 'archive status',
                    'folder': 'destination folder',
                    'archive': 'archive location',
                    'date': 'archive date',
                    'thumbnail': 'thumbnail',
                    'thumbnail_index': 'thumbnail index',
                    'timestamp': 'upload timestamp',
                    'title': 'upload title',
                    'duration': 'duration',
                    'screenshot': 'screenshot',
                    'hash': 'hash',
                    'wacz': 'wacz',
                    'replaywebpage': 'replaywebpage',
                },
                "help": "names of columns in the google sheet",
                # CLI value is a JSON object whose keys override the current mapping
                "cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val))
            },
        }
|
||||
@@ -2,7 +2,7 @@ from __future__ import annotations
|
||||
from dataclasses import dataclass
|
||||
from abc import abstractmethod, ABC
|
||||
from metadata import Metadata
|
||||
from step import Step
|
||||
from steps.step import Step
|
||||
|
||||
@dataclass
|
||||
class Util(Step, ABC):
|
||||
|
||||
Reference in New Issue
Block a user