Ruff format with defaults.

2026-06-12 21:28:29 +03:00 · 2025-03-10 18:44:54 +00:00
parent cbb0414e5f
commit 85abe1837a
155 changed files with 2539 additions and 1908 deletions
--- a/src/auto_archiver/modules/gsheet_feeder_db/__init__.py
+++ b/src/auto_archiver/modules/gsheet_feeder_db/__init__.py
@@ -1,2 +1,2 @@
 from .gworksheet import GWorksheet
-from .gsheet_feeder_db import GsheetsFeederDB
+from .gsheet_feeder_db import GsheetsFeederDB
--- a/src/auto_archiver/modules/gsheet_feeder_db/__manifest__.py
+++ b/src/auto_archiver/modules/gsheet_feeder_db/__manifest__.py
@@ -12,9 +12,7 @@
            "default": None,
            "help": "the id of the sheet to archive (alternative to 'sheet' config)",
        },
-        "header": {"default": 1,
-                   "type": "int",
-                   "help": "index of the header row (starts at 1)", "type": "int"},
+        "header": {"default": 1, "type": "int", "help": "index of the header row (starts at 1)", "type": "int"},
        "service_account": {
            "default": "secrets/service_account.json",
            "help": "service account JSON file path. Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html",
@@ -65,7 +63,7 @@
            "default": True,
            "type": "bool",
            "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
-        }
+        },
    },
    "description": """
    GsheetsFeederDatabase
--- a/src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py
+++ b/src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py
@@ -8,6 +8,7 @@ The filtered rows are processed into `Metadata` objects.
 - validates the sheet's structure and filters rows based on input configurations.
 - Ensures only rows with valid URLs and unprocessed statuses are included.
 """
+
 import os
 from typing import Tuple, Union
 from urllib.parse import quote
@@ -23,7 +24,6 @@ from auto_archiver.utils.misc import calculate_file_hash, get_current_timestamp


 class GsheetsFeederDB(Feeder, Database):
-
    def setup(self) -> None:
        self.gsheets_client = gspread.service_account(filename=self.service_account)
        # TODO mv to validators
@@ -42,24 +42,28 @@ class GsheetsFeederDB(Feeder, Database):
            if not self.should_process_sheet(worksheet.title):
                logger.debug(f"SKIPPED worksheet '{worksheet.title}' due to allow/block rules")
                continue
-            logger.info(f'Opening worksheet {ii=}: {worksheet.title=} header={self.header}')
+            logger.info(f"Opening worksheet {ii=}: {worksheet.title=} header={self.header}")
            gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns)
            if len(missing_cols := self.missing_required_columns(gw)):
-                logger.warning(f"SKIPPED worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}")
+                logger.warning(
+                    f"SKIPPED worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}"
+                )
                continue

            # process and yield metadata here:
            yield from self._process_rows(gw)
-            logger.success(f'Finished worksheet {worksheet.title}')
+            logger.success(f"Finished worksheet {worksheet.title}")

    def _process_rows(self, gw: GWorksheet):
        for row in range(1 + self.header, gw.count_rows() + 1):
-            url = gw.get_cell(row, 'url').strip()
-            if not len(url): continue
-            original_status = gw.get_cell(row, 'status')
-            status = gw.get_cell(row, 'status', fresh=original_status in ['', None])
+            url = gw.get_cell(row, "url").strip()
+            if not len(url):
+                continue
+            original_status = gw.get_cell(row, "status")
+            status = gw.get_cell(row, "status", fresh=original_status in ["", None])
            # TODO: custom status parser(?) aka should_retry_from_status
-            if status not in ['', None]: continue
+            if status not in ["", None]:
+                continue

            # All checks done - archival process starts here
            m = Metadata().set_url(url)
@@ -70,10 +74,10 @@ class GsheetsFeederDB(Feeder, Database):
        # TODO: Check folder value not being recognised
        m.set_context("gsheet", {"row": row, "worksheet": gw})

-        if gw.get_cell_or_default(row, 'folder', "") is None:
-            folder = ''
+        if gw.get_cell_or_default(row, "folder", "") is None:
+            folder = ""
        else:
-            folder = slugify(gw.get_cell_or_default(row, 'folder', "").strip())
+            folder = slugify(gw.get_cell_or_default(row, "folder", "").strip())
        if len(folder):
            if self.use_sheet_names_in_stored_paths:
                m.set_context("folder", os.path.join(folder, slugify(self.sheet), slugify(gw.wks.title)))
@@ -91,12 +95,11 @@ class GsheetsFeederDB(Feeder, Database):

    def missing_required_columns(self, gw: GWorksheet) -> list:
        missing = []
-        for required_col in ['url', 'status']:
+        for required_col in ["url", "status"]:
            if not gw.col_exists(required_col):
                missing.append(required_col)
        return missing

-
    def started(self, item: Metadata) -> None:
        logger.warning(f"STARTED {item}")
        gw, row = self._retrieve_gsheet(item)
@@ -155,9 +158,7 @@ class GsheetsFeederDB(Feeder, Database):
        if len(pdq_hashes):
            batch_if_valid("pdq_hash", ",".join(pdq_hashes))

-        if (screenshot := item.get_media_by_id("screenshot")) and hasattr(
-            screenshot, "urls"
-        ):
+        if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"):
            batch_if_valid("screenshot", "\n".join(screenshot.urls))

        if thumbnail := item.get_first_image("thumbnail"):
@@ -186,11 +187,12 @@ class GsheetsFeederDB(Feeder, Database):
            logger.debug(f"Unable to update sheet: {e}")

    def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
-
        if gsheet := item.get_context("gsheet"):
            gw: GWorksheet = gsheet.get("worksheet")
            row: int = gsheet.get("row")
        elif self.sheet_id:
-            logger.error(f"Unable to retrieve Gsheet for {item.get_url()}, GsheetDB must be used alongside GsheetFeeder.")
+            logger.error(
+                f"Unable to retrieve Gsheet for {item.get_url()}, GsheetDB must be used alongside GsheetFeeder."
+            )

        return gw, row
--- a/src/auto_archiver/modules/gsheet_feeder_db/gworksheet.py
+++ b/src/auto_archiver/modules/gsheet_feeder_db/gworksheet.py
@@ -5,24 +5,25 @@ class GWorksheet:
    """
    This class makes read/write operations to the a worksheet easier.
    It can read the headers from a custom row number, but the row references
-    should always include the offset of the header. 
-    eg: if header=4, row 5 will be the first with data. 
+    should always include the offset of the header.
+    eg: if header=4, row 5 will be the first with data.
    """
+
    COLUMN_NAMES = {
-        'url': 'link',
-        'status': 'archive status',
-        'folder': 'destination folder',
-        'archive': 'archive location',
-        'date': 'archive date',
-        'thumbnail': 'thumbnail',
-        'timestamp': 'upload timestamp',
-        'title': 'upload title',
-        'text': 'text content',
-        'screenshot': 'screenshot',
-        'hash': 'hash',
-        'pdq_hash': 'perceptual hashes',
-        'wacz': 'wacz',
-        'replaywebpage': 'replaywebpage',
+        "url": "link",
+        "status": "archive status",
+        "folder": "destination folder",
+        "archive": "archive location",
+        "date": "archive date",
+        "thumbnail": "thumbnail",
+        "timestamp": "upload timestamp",
+        "title": "upload title",
+        "text": "text content",
+        "screenshot": "screenshot",
+        "hash": "hash",
+        "pdq_hash": "perceptual hashes",
+        "wacz": "wacz",
+        "replaywebpage": "replaywebpage",
    }

    def __init__(self, worksheet, columns=COLUMN_NAMES, header_row=1):
@@ -36,7 +37,7 @@ class GWorksheet:

    def _check_col_exists(self, col: str):
        if col not in self.columns:
-            raise Exception(f'Column {col} is not in the configured column names: {self.columns.keys()}')
+            raise Exception(f"Column {col} is not in the configured column names: {self.columns.keys()}")

    def _col_index(self, col: str):
        self._check_col_exists(col)
@@ -58,7 +59,7 @@ class GWorksheet:

    def get_cell(self, row, col: str, fresh=False):
        """
-        returns the cell value from (row, col), 
+        returns the cell value from (row, col),
        where row can be an index (1-based) OR list of values
        as received from self.get_row(row)
        if fresh=True, the sheet is queried again for this cell
@@ -71,7 +72,7 @@ class GWorksheet:
            row = self.get_row(row)

        if col_index >= len(row):
-            return ''
+            return ""
        return row[col_index]

    def get_cell_or_default(self, row, col: str, default: str = None, fresh=False, when_empty_use_default=True):
@@ -96,13 +97,9 @@ class GWorksheet:
        receives a list of [(row:int, col:str, val)] and batch updates it, the parameters are the same as in the self.set_cell() method
        """
        cell_updates = [
-            {
-                'range': self.to_a1(row, col),
-                'values': [[str(val)[0:49999]]]
-            }
-            for row, col, val in cell_updates
+            {"range": self.to_a1(row, col), "values": [[str(val)[0:49999]]]} for row, col, val in cell_updates
        ]
-        self.wks.batch_update(cell_updates, value_input_option='USER_ENTERED')
+        self.wks.batch_update(cell_updates, value_input_option="USER_ENTERED")

    def to_a1(self, row: int, col: str):
        # row is 1-based