Update manifests and modules

2026-06-12 13:18:28 +03:00 · 2025-01-24 12:58:16 +00:00
parent ba4b330881
commit aa7ca93a43
95 changed files with 172 additions and 115 deletions
--- a/src/auto_archiver/modules/gsheet_db/__init__.py
+++ b/src/auto_archiver/modules/gsheet_db/__init__.py
@@ -0,0 +1 @@
+from .gsheet_db import GsheetsDb
--- a/src/auto_archiver/modules/gsheet_db/__manifest__.py
+++ b/src/auto_archiver/modules/gsheet_db/__manifest__.py
@@ -0,0 +1,38 @@
+{
+    "name": "Google Sheets Database",
+    "type": ["database"],
+    "requires_setup": True,
+    "external_dependencies": {
+        "python": ["loguru", "gspread", "python-slugify"],
+    },
+    "configs": {
+        "allow_worksheets": {
+            "default": set(),
+            "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
+            "type": lambda val: set(val.split(",")),
+        },
+        "block_worksheets": {
+            "default": set(),
+            "help": "(CSV) explicitly block some worksheets from being processed",
+            "type": lambda val: set(val.split(",")),
+        },
+        "use_sheet_names_in_stored_paths": {
+            "default": True,
+            "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
+        }
+    },
+    "description": """
+    GsheetsDatabase:
+    Handles integration with Google Sheets for tracking archival tasks.
+
+### Features
+- Updates a Google Sheet with the status of the archived URLs, including in progress, success or failure, and method used.
+- Saves metadata such as title, text, timestamp, hashes, screenshots, and media URLs to designated columns.
+- Formats media-specific metadata, such as thumbnails and PDQ hashes for the sheet.
+- Skips redundant updates for empty or invalid data fields.
+
+### Notes
+- Currently works only with metadata provided by GsheetFeeder. 
+- Requires configuration of a linked Google Sheet and appropriate API credentials.
+    """
+}
--- a/src/auto_archiver/modules/gsheet_db/gsheet_db.py
+++ b/src/auto_archiver/modules/gsheet_db/gsheet_db.py
@@ -0,0 +1,108 @@
+from typing import Union, Tuple
+
+import datetime
+from urllib.parse import quote
+
+from loguru import logger
+
+from auto_archiver.base_processors import Database
+from auto_archiver.core import Metadata, Media, ArchivingContext
+from auto_archiver.utils import GWorksheet
+
+
+class GsheetsDb(Database):
+    """
+        NB: only works if GsheetFeeder is used. 
+        could be updated in the future to support non-GsheetFeeder metadata 
+    """
+    name = "gsheet_db"
+
+    def __init__(self, config: dict) -> None:
+        # without this STEP.__init__ is not called
+        super().__init__(config)
+
+    def started(self, item: Metadata) -> None:
+        logger.warning(f"STARTED {item}")
+        gw, row = self._retrieve_gsheet(item)
+        gw.set_cell(row, 'status', 'Archive in progress')
+
+    def failed(self, item: Metadata, reason:str) -> None:
+        logger.error(f"FAILED {item}")
+        self._safe_status_update(item, f'Archive failed {reason}')
+
+    def aborted(self, item: Metadata) -> None:
+        logger.warning(f"ABORTED {item}")
+        self._safe_status_update(item, '')
+
+    def fetch(self, item: Metadata) -> Union[Metadata, bool]:
+        """check if the given item has been archived already"""
+        return False
+
+    def done(self, item: Metadata, cached: bool=False) -> None:
+        """archival result ready - should be saved to DB"""
+        logger.success(f"DONE {item.get_url()}")
+        gw, row = self._retrieve_gsheet(item)
+        # self._safe_status_update(item, 'done')
+
+        cell_updates = []
+        row_values = gw.get_row(row)
+
+        def batch_if_valid(col, val, final_value=None):
+            final_value = final_value or val
+            try:
+                if val and gw.col_exists(col) and gw.get_cell(row_values, col) == '':
+                    cell_updates.append((row, col, final_value))
+            except Exception as e:
+                logger.error(f"Unable to batch {col}={final_value} due to {e}")
+        status_message = item.status
+        if cached:
+            status_message = f"[cached] {status_message}"
+        cell_updates.append((row, 'status', status_message))
+
+        media: Media = item.get_final_media()
+        if hasattr(media, "urls"):
+            batch_if_valid('archive', "\n".join(media.urls))
+        batch_if_valid('date', True, datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=datetime.timezone.utc).isoformat())
+        batch_if_valid('title', item.get_title())
+        batch_if_valid('text', item.get("content", ""))
+        batch_if_valid('timestamp', item.get_timestamp())
+        if media: batch_if_valid('hash', media.get("hash", "not-calculated"))
+
+        # merge all pdq hashes into a single string, if present
+        pdq_hashes = []
+        all_media = item.get_all_media()
+        for m in all_media:
+            if pdq := m.get("pdq_hash"):
+                pdq_hashes.append(pdq)
+        if len(pdq_hashes):
+            batch_if_valid('pdq_hash', ",".join(pdq_hashes))
+
+        if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"):
+            batch_if_valid('screenshot', "\n".join(screenshot.urls))
+
+        if (thumbnail := item.get_first_image("thumbnail")):
+            if hasattr(thumbnail, "urls"):
+                batch_if_valid('thumbnail', f'=IMAGE("{thumbnail.urls[0]}")')
+
+        if (browsertrix := item.get_media_by_id("browsertrix")):
+            batch_if_valid('wacz', "\n".join(browsertrix.urls))
+            batch_if_valid('replaywebpage', "\n".join([f'https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}' for wacz in browsertrix.urls]))
+
+        gw.batch_set_cell(cell_updates)
+
+    def _safe_status_update(self, item: Metadata, new_status: str) -> None:
+        try:
+            gw, row = self._retrieve_gsheet(item)
+            gw.set_cell(row, 'status', new_status)
+        except Exception as e:
+            logger.debug(f"Unable to update sheet: {e}")
+
+    def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
+        # TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from ArchivingContext and, if missing, manage its own singleton - not needed for now
+        if gsheet := ArchivingContext.get("gsheet"):
+            gw: GWorksheet = gsheet.get("worksheet")
+            row: int = gsheet.get("row")
+        elif self.sheet_id:
+            print(self.sheet_id)
+
+        return gw, row