Ruff format with defaults.

This commit is contained in:
erinhmclark
2025-03-10 18:44:54 +00:00
parent cbb0414e5f
commit 85abe1837a
155 changed files with 2539 additions and 1908 deletions

View File

@@ -1,2 +1,2 @@
from .gworksheet import GWorksheet
from .gsheet_feeder_db import GsheetsFeederDB
from .gsheet_feeder_db import GsheetsFeederDB

View File

@@ -12,9 +12,7 @@
"default": None,
"help": "the id of the sheet to archive (alternative to 'sheet' config)",
},
"header": {"default": 1,
"type": "int",
"help": "index of the header row (starts at 1)", "type": "int"},
"header": {"default": 1, "type": "int", "help": "index of the header row (starts at 1)", "type": "int"},
"service_account": {
"default": "secrets/service_account.json",
"help": "service account JSON file path. Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html",
@@ -65,7 +63,7 @@
"default": True,
"type": "bool",
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
}
},
},
"description": """
GsheetsFeederDatabase

View File

@@ -8,6 +8,7 @@ The filtered rows are processed into `Metadata` objects.
- validates the sheet's structure and filters rows based on input configurations.
- Ensures only rows with valid URLs and unprocessed statuses are included.
"""
import os
from typing import Tuple, Union
from urllib.parse import quote
@@ -23,7 +24,6 @@ from auto_archiver.utils.misc import calculate_file_hash, get_current_timestamp
class GsheetsFeederDB(Feeder, Database):
def setup(self) -> None:
self.gsheets_client = gspread.service_account(filename=self.service_account)
# TODO mv to validators
@@ -42,24 +42,28 @@ class GsheetsFeederDB(Feeder, Database):
if not self.should_process_sheet(worksheet.title):
logger.debug(f"SKIPPED worksheet '{worksheet.title}' due to allow/block rules")
continue
logger.info(f'Opening worksheet {ii=}: {worksheet.title=} header={self.header}')
logger.info(f"Opening worksheet {ii=}: {worksheet.title=} header={self.header}")
gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns)
if len(missing_cols := self.missing_required_columns(gw)):
logger.warning(f"SKIPPED worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}")
logger.warning(
f"SKIPPED worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}"
)
continue
# process and yield metadata here:
yield from self._process_rows(gw)
logger.success(f'Finished worksheet {worksheet.title}')
logger.success(f"Finished worksheet {worksheet.title}")
def _process_rows(self, gw: GWorksheet):
for row in range(1 + self.header, gw.count_rows() + 1):
url = gw.get_cell(row, 'url').strip()
if not len(url): continue
original_status = gw.get_cell(row, 'status')
status = gw.get_cell(row, 'status', fresh=original_status in ['', None])
url = gw.get_cell(row, "url").strip()
if not len(url):
continue
original_status = gw.get_cell(row, "status")
status = gw.get_cell(row, "status", fresh=original_status in ["", None])
# TODO: custom status parser(?) aka should_retry_from_status
if status not in ['', None]: continue
if status not in ["", None]:
continue
# All checks done - archival process starts here
m = Metadata().set_url(url)
@@ -70,10 +74,10 @@ class GsheetsFeederDB(Feeder, Database):
# TODO: Check folder value not being recognised
m.set_context("gsheet", {"row": row, "worksheet": gw})
if gw.get_cell_or_default(row, 'folder', "") is None:
folder = ''
if gw.get_cell_or_default(row, "folder", "") is None:
folder = ""
else:
folder = slugify(gw.get_cell_or_default(row, 'folder', "").strip())
folder = slugify(gw.get_cell_or_default(row, "folder", "").strip())
if len(folder):
if self.use_sheet_names_in_stored_paths:
m.set_context("folder", os.path.join(folder, slugify(self.sheet), slugify(gw.wks.title)))
@@ -91,12 +95,11 @@ class GsheetsFeederDB(Feeder, Database):
def missing_required_columns(self, gw: GWorksheet) -> list:
missing = []
for required_col in ['url', 'status']:
for required_col in ["url", "status"]:
if not gw.col_exists(required_col):
missing.append(required_col)
return missing
def started(self, item: Metadata) -> None:
logger.warning(f"STARTED {item}")
gw, row = self._retrieve_gsheet(item)
@@ -155,9 +158,7 @@ class GsheetsFeederDB(Feeder, Database):
if len(pdq_hashes):
batch_if_valid("pdq_hash", ",".join(pdq_hashes))
if (screenshot := item.get_media_by_id("screenshot")) and hasattr(
screenshot, "urls"
):
if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"):
batch_if_valid("screenshot", "\n".join(screenshot.urls))
if thumbnail := item.get_first_image("thumbnail"):
@@ -186,11 +187,12 @@ class GsheetsFeederDB(Feeder, Database):
logger.debug(f"Unable to update sheet: {e}")
def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
if gsheet := item.get_context("gsheet"):
gw: GWorksheet = gsheet.get("worksheet")
row: int = gsheet.get("row")
elif self.sheet_id:
logger.error(f"Unable to retrieve Gsheet for {item.get_url()}, GsheetDB must be used alongside GsheetFeeder.")
logger.error(
f"Unable to retrieve Gsheet for {item.get_url()}, GsheetDB must be used alongside GsheetFeeder."
)
return gw, row

View File

@@ -5,24 +5,25 @@ class GWorksheet:
"""
This class makes read/write operations to the a worksheet easier.
It can read the headers from a custom row number, but the row references
should always include the offset of the header.
eg: if header=4, row 5 will be the first with data.
should always include the offset of the header.
eg: if header=4, row 5 will be the first with data.
"""
COLUMN_NAMES = {
'url': 'link',
'status': 'archive status',
'folder': 'destination folder',
'archive': 'archive location',
'date': 'archive date',
'thumbnail': 'thumbnail',
'timestamp': 'upload timestamp',
'title': 'upload title',
'text': 'text content',
'screenshot': 'screenshot',
'hash': 'hash',
'pdq_hash': 'perceptual hashes',
'wacz': 'wacz',
'replaywebpage': 'replaywebpage',
"url": "link",
"status": "archive status",
"folder": "destination folder",
"archive": "archive location",
"date": "archive date",
"thumbnail": "thumbnail",
"timestamp": "upload timestamp",
"title": "upload title",
"text": "text content",
"screenshot": "screenshot",
"hash": "hash",
"pdq_hash": "perceptual hashes",
"wacz": "wacz",
"replaywebpage": "replaywebpage",
}
def __init__(self, worksheet, columns=COLUMN_NAMES, header_row=1):
@@ -36,7 +37,7 @@ class GWorksheet:
def _check_col_exists(self, col: str):
if col not in self.columns:
raise Exception(f'Column {col} is not in the configured column names: {self.columns.keys()}')
raise Exception(f"Column {col} is not in the configured column names: {self.columns.keys()}")
def _col_index(self, col: str):
self._check_col_exists(col)
@@ -58,7 +59,7 @@ class GWorksheet:
def get_cell(self, row, col: str, fresh=False):
"""
returns the cell value from (row, col),
returns the cell value from (row, col),
where row can be an index (1-based) OR list of values
as received from self.get_row(row)
if fresh=True, the sheet is queried again for this cell
@@ -71,7 +72,7 @@ class GWorksheet:
row = self.get_row(row)
if col_index >= len(row):
return ''
return ""
return row[col_index]
def get_cell_or_default(self, row, col: str, default: str = None, fresh=False, when_empty_use_default=True):
@@ -96,13 +97,9 @@ class GWorksheet:
receives a list of [(row:int, col:str, val)] and batch updates it, the parameters are the same as in the self.set_cell() method
"""
cell_updates = [
{
'range': self.to_a1(row, col),
'values': [[str(val)[0:49999]]]
}
for row, col, val in cell_updates
{"range": self.to_a1(row, col), "values": [[str(val)[0:49999]]]} for row, col, val in cell_updates
]
self.wks.batch_update(cell_updates, value_input_option='USER_ENTERED')
self.wks.batch_update(cell_updates, value_input_option="USER_ENTERED")
def to_a1(self, row: int, col: str):
# row is 1-based