From 57b3bec9351237f24116e91b6b5665a7db300033 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Mon, 27 Jan 2025 20:13:12 +0000 Subject: [PATCH] Google sheets feeder and database implemented. --- .../modules/gsheet_db/__manifest__.py | 1 + .../modules/gsheet_feeder/__manifest__.py | 56 ++++++++++--------- .../modules/gsheet_feeder/gsheet_feeder.py | 43 +++----------- 3 files changed, 39 insertions(+), 61 deletions(-) diff --git a/src/auto_archiver/modules/gsheet_db/__manifest__.py b/src/auto_archiver/modules/gsheet_db/__manifest__.py index 8c54fe5..f2f1c35 100644 --- a/src/auto_archiver/modules/gsheet_db/__manifest__.py +++ b/src/auto_archiver/modules/gsheet_db/__manifest__.py @@ -1,6 +1,7 @@ { "name": "Google Sheets Database", "type": ["database"], + "entry_point": "gsheet_db::GsheetsDb", "requires_setup": True, "external_dependencies": { "python": ["loguru", "gspread", "python-slugify"], diff --git a/src/auto_archiver/modules/gsheet_feeder/__manifest__.py b/src/auto_archiver/modules/gsheet_feeder/__manifest__.py index e1a89a2..3d9cb08 100644 --- a/src/auto_archiver/modules/gsheet_feeder/__manifest__.py +++ b/src/auto_archiver/modules/gsheet_feeder/__manifest__.py @@ -7,30 +7,36 @@ "python": ["loguru", "gspread", "python-slugify"], }, "configs": { - "sheet": {"default": None, "help": "name of the sheet to archive"}, - "sheet_id": {"default": None, "help": "(alternative to sheet name) the id of the sheet to archive"}, - "header": {"default": 1, "help": "index of the header row (starts at 1)"}, - "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path"}, - "columns": { - "default": { - 'url': 'link', - 'status': 'archive status', - 'folder': 'destination folder', - 'archive': 'archive location', - 'date': 'archive date', - 'thumbnail': 'thumbnail', - 'timestamp': 'upload timestamp', - 'title': 'upload title', - 'text': 'text content', - 'screenshot': 'screenshot', - 'hash': 'hash', - 'pdq_hash': 'perceptual hashes', - 'wacz': 'wacz', - 'replaywebpage': 'replaywebpage', - }, - "help": "names of columns in the google sheet (stringified JSON object)", - "type": "auto_archiver.utils.json_loader", + "sheet": {"default": None, "help": "name of the sheet to archive"}, + "sheet_id": { + "default": None, + "help": "(alternative to sheet name) the id of the sheet to archive", + }, + "header": {"default": 1, "help": "index of the header row (starts at 1)", "type": "int"}, + "service_account": { + "default": "secrets/service_account.json", + "help": "service account JSON file path", + }, + "columns": { + "default": { + "url": "link", + "status": "archive status", + "folder": "destination folder", + "archive": "archive location", + "date": "archive date", + "thumbnail": "thumbnail", + "timestamp": "upload timestamp", + "title": "upload title", + "text": "text content", + "screenshot": "screenshot", + "hash": "hash", + "pdq_hash": "perceptual hashes", + "wacz": "wacz", + "replaywebpage": "replaywebpage", }, + "help": "names of columns in the google sheet (stringified JSON object)", + "type": "auto_archiver.utils.json_loader", + }, "allow_worksheets": { "default": set(), "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed", @@ -43,7 +49,7 @@ "default": True, "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'", "type": "bool", - } + }, }, "description": """ GsheetsFeeder @@ -61,5 +67,5 @@ ### Notes - Requires a Google Service Account JSON file for authentication. Suggested location is `secrets/gsheets_service_account.json`. - Create the sheet using the template provided in the docs. - """ + """, } diff --git a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py index 66dd014..235dd63 100644 --- a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py +++ b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py @@ -21,41 +21,13 @@ from . import GWorksheet class GsheetsFeeder(Feeder): - def __init__(self) -> None: - """ - Initializes the GsheetsFeeder with preloaded configurations. - """ - super().__init__() - # Initialize the gspread client with the provided service account file - # self.gsheets_client = gspread.service_account(filename=self.config["service_account"]) - # - # # Set up feeder-specific configurations from the config - # self.sheet_name = config.get("sheet") - # self.sheet_id = config.get("sheet_id") - # self.header = config.get("header", 1) - # self.columns = config.get("columns", {}) - # assert self.sheet_name or self.sheet_id, ( - # "You need to define either a 'sheet' name or a 'sheet_id' in your manifest." - # ) - - - # # Configuration attributes - # self.sheet = config.get("sheet") - # self.sheet_id = config.get("sheet_id") - # self.header = config.get("header", 1) - # self.columns = config.get("columns", {}) - # self.allow_worksheets = config.get("allow_worksheets", set()) - # self.block_worksheets = config.get("block_worksheets", set()) - # self.use_sheet_names_in_stored_paths = config.get("use_sheet_names_in_stored_paths", True) - - # Ensure the header is an integer - # try: - # self.header = int(self.header) - # except ValueError: - # pass - # assert isinstance(self.header, int), f"Header must be an integer, got {type(self.header)}" - # assert self.sheet or self.sheet_id, "Either 'sheet' or 'sheet_id' must be defined." - # + def setup(self, config: dict): + super().setup(config) + self.gsheets_client = gspread.service_account(filename=self.service_account) + # TODO mv to validators + assert self.sheet or self.sheet_id, ( + "You need to define either a 'sheet' name or a 'sheet_id' in your manifest." + ) def open_sheet(self): if self.sheet: @@ -63,7 +35,6 @@ class GsheetsFeeder(Feeder): else: # self.sheet_id return self.gsheets_client.open_by_key(self.sheet_id) - def __iter__(self) -> Metadata: sh = self.open_sheet() for ii, wks in enumerate(sh.worksheets()):