mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 21:28:29 +03:00
Ruff format with defaults.
This commit is contained in:
@@ -1,2 +1,2 @@
|
||||
from .gworksheet import GWorksheet
|
||||
from .gsheet_feeder_db import GsheetsFeederDB
|
||||
from .gsheet_feeder_db import GsheetsFeederDB
|
||||
|
||||
@@ -12,9 +12,7 @@
|
||||
"default": None,
|
||||
"help": "the id of the sheet to archive (alternative to 'sheet' config)",
|
||||
},
|
||||
"header": {"default": 1,
|
||||
"type": "int",
|
||||
"help": "index of the header row (starts at 1)", "type": "int"},
|
||||
"header": {"default": 1, "type": "int", "help": "index of the header row (starts at 1)", "type": "int"},
|
||||
"service_account": {
|
||||
"default": "secrets/service_account.json",
|
||||
"help": "service account JSON file path. Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html",
|
||||
@@ -65,7 +63,7 @@
|
||||
"default": True,
|
||||
"type": "bool",
|
||||
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
|
||||
}
|
||||
},
|
||||
},
|
||||
"description": """
|
||||
GsheetsFeederDatabase
|
||||
|
||||
@@ -8,6 +8,7 @@ The filtered rows are processed into `Metadata` objects.
|
||||
- validates the sheet's structure and filters rows based on input configurations.
|
||||
- Ensures only rows with valid URLs and unprocessed statuses are included.
|
||||
"""
|
||||
|
||||
import os
|
||||
from typing import Tuple, Union
|
||||
from urllib.parse import quote
|
||||
@@ -23,7 +24,6 @@ from auto_archiver.utils.misc import calculate_file_hash, get_current_timestamp
|
||||
|
||||
|
||||
class GsheetsFeederDB(Feeder, Database):
|
||||
|
||||
def setup(self) -> None:
|
||||
self.gsheets_client = gspread.service_account(filename=self.service_account)
|
||||
# TODO mv to validators
|
||||
@@ -42,24 +42,28 @@ class GsheetsFeederDB(Feeder, Database):
|
||||
if not self.should_process_sheet(worksheet.title):
|
||||
logger.debug(f"SKIPPED worksheet '{worksheet.title}' due to allow/block rules")
|
||||
continue
|
||||
logger.info(f'Opening worksheet {ii=}: {worksheet.title=} header={self.header}')
|
||||
logger.info(f"Opening worksheet {ii=}: {worksheet.title=} header={self.header}")
|
||||
gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns)
|
||||
if len(missing_cols := self.missing_required_columns(gw)):
|
||||
logger.warning(f"SKIPPED worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}")
|
||||
logger.warning(
|
||||
f"SKIPPED worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}"
|
||||
)
|
||||
continue
|
||||
|
||||
# process and yield metadata here:
|
||||
yield from self._process_rows(gw)
|
||||
logger.success(f'Finished worksheet {worksheet.title}')
|
||||
logger.success(f"Finished worksheet {worksheet.title}")
|
||||
|
||||
def _process_rows(self, gw: GWorksheet):
|
||||
for row in range(1 + self.header, gw.count_rows() + 1):
|
||||
url = gw.get_cell(row, 'url').strip()
|
||||
if not len(url): continue
|
||||
original_status = gw.get_cell(row, 'status')
|
||||
status = gw.get_cell(row, 'status', fresh=original_status in ['', None])
|
||||
url = gw.get_cell(row, "url").strip()
|
||||
if not len(url):
|
||||
continue
|
||||
original_status = gw.get_cell(row, "status")
|
||||
status = gw.get_cell(row, "status", fresh=original_status in ["", None])
|
||||
# TODO: custom status parser(?) aka should_retry_from_status
|
||||
if status not in ['', None]: continue
|
||||
if status not in ["", None]:
|
||||
continue
|
||||
|
||||
# All checks done - archival process starts here
|
||||
m = Metadata().set_url(url)
|
||||
@@ -70,10 +74,10 @@ class GsheetsFeederDB(Feeder, Database):
|
||||
# TODO: Check folder value not being recognised
|
||||
m.set_context("gsheet", {"row": row, "worksheet": gw})
|
||||
|
||||
if gw.get_cell_or_default(row, 'folder', "") is None:
|
||||
folder = ''
|
||||
if gw.get_cell_or_default(row, "folder", "") is None:
|
||||
folder = ""
|
||||
else:
|
||||
folder = slugify(gw.get_cell_or_default(row, 'folder', "").strip())
|
||||
folder = slugify(gw.get_cell_or_default(row, "folder", "").strip())
|
||||
if len(folder):
|
||||
if self.use_sheet_names_in_stored_paths:
|
||||
m.set_context("folder", os.path.join(folder, slugify(self.sheet), slugify(gw.wks.title)))
|
||||
@@ -91,12 +95,11 @@ class GsheetsFeederDB(Feeder, Database):
|
||||
|
||||
def missing_required_columns(self, gw: GWorksheet) -> list:
|
||||
missing = []
|
||||
for required_col in ['url', 'status']:
|
||||
for required_col in ["url", "status"]:
|
||||
if not gw.col_exists(required_col):
|
||||
missing.append(required_col)
|
||||
return missing
|
||||
|
||||
|
||||
def started(self, item: Metadata) -> None:
|
||||
logger.warning(f"STARTED {item}")
|
||||
gw, row = self._retrieve_gsheet(item)
|
||||
@@ -155,9 +158,7 @@ class GsheetsFeederDB(Feeder, Database):
|
||||
if len(pdq_hashes):
|
||||
batch_if_valid("pdq_hash", ",".join(pdq_hashes))
|
||||
|
||||
if (screenshot := item.get_media_by_id("screenshot")) and hasattr(
|
||||
screenshot, "urls"
|
||||
):
|
||||
if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"):
|
||||
batch_if_valid("screenshot", "\n".join(screenshot.urls))
|
||||
|
||||
if thumbnail := item.get_first_image("thumbnail"):
|
||||
@@ -186,11 +187,12 @@ class GsheetsFeederDB(Feeder, Database):
|
||||
logger.debug(f"Unable to update sheet: {e}")
|
||||
|
||||
def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
|
||||
|
||||
if gsheet := item.get_context("gsheet"):
|
||||
gw: GWorksheet = gsheet.get("worksheet")
|
||||
row: int = gsheet.get("row")
|
||||
elif self.sheet_id:
|
||||
logger.error(f"Unable to retrieve Gsheet for {item.get_url()}, GsheetDB must be used alongside GsheetFeeder.")
|
||||
logger.error(
|
||||
f"Unable to retrieve Gsheet for {item.get_url()}, GsheetDB must be used alongside GsheetFeeder."
|
||||
)
|
||||
|
||||
return gw, row
|
||||
|
||||
@@ -5,24 +5,25 @@ class GWorksheet:
|
||||
"""
|
||||
This class makes read/write operations to the a worksheet easier.
|
||||
It can read the headers from a custom row number, but the row references
|
||||
should always include the offset of the header.
|
||||
eg: if header=4, row 5 will be the first with data.
|
||||
should always include the offset of the header.
|
||||
eg: if header=4, row 5 will be the first with data.
|
||||
"""
|
||||
|
||||
COLUMN_NAMES = {
|
||||
'url': 'link',
|
||||
'status': 'archive status',
|
||||
'folder': 'destination folder',
|
||||
'archive': 'archive location',
|
||||
'date': 'archive date',
|
||||
'thumbnail': 'thumbnail',
|
||||
'timestamp': 'upload timestamp',
|
||||
'title': 'upload title',
|
||||
'text': 'text content',
|
||||
'screenshot': 'screenshot',
|
||||
'hash': 'hash',
|
||||
'pdq_hash': 'perceptual hashes',
|
||||
'wacz': 'wacz',
|
||||
'replaywebpage': 'replaywebpage',
|
||||
"url": "link",
|
||||
"status": "archive status",
|
||||
"folder": "destination folder",
|
||||
"archive": "archive location",
|
||||
"date": "archive date",
|
||||
"thumbnail": "thumbnail",
|
||||
"timestamp": "upload timestamp",
|
||||
"title": "upload title",
|
||||
"text": "text content",
|
||||
"screenshot": "screenshot",
|
||||
"hash": "hash",
|
||||
"pdq_hash": "perceptual hashes",
|
||||
"wacz": "wacz",
|
||||
"replaywebpage": "replaywebpage",
|
||||
}
|
||||
|
||||
def __init__(self, worksheet, columns=COLUMN_NAMES, header_row=1):
|
||||
@@ -36,7 +37,7 @@ class GWorksheet:
|
||||
|
||||
def _check_col_exists(self, col: str):
|
||||
if col not in self.columns:
|
||||
raise Exception(f'Column {col} is not in the configured column names: {self.columns.keys()}')
|
||||
raise Exception(f"Column {col} is not in the configured column names: {self.columns.keys()}")
|
||||
|
||||
def _col_index(self, col: str):
|
||||
self._check_col_exists(col)
|
||||
@@ -58,7 +59,7 @@ class GWorksheet:
|
||||
|
||||
def get_cell(self, row, col: str, fresh=False):
|
||||
"""
|
||||
returns the cell value from (row, col),
|
||||
returns the cell value from (row, col),
|
||||
where row can be an index (1-based) OR list of values
|
||||
as received from self.get_row(row)
|
||||
if fresh=True, the sheet is queried again for this cell
|
||||
@@ -71,7 +72,7 @@ class GWorksheet:
|
||||
row = self.get_row(row)
|
||||
|
||||
if col_index >= len(row):
|
||||
return ''
|
||||
return ""
|
||||
return row[col_index]
|
||||
|
||||
def get_cell_or_default(self, row, col: str, default: str = None, fresh=False, when_empty_use_default=True):
|
||||
@@ -96,13 +97,9 @@ class GWorksheet:
|
||||
receives a list of [(row:int, col:str, val)] and batch updates it, the parameters are the same as in the self.set_cell() method
|
||||
"""
|
||||
cell_updates = [
|
||||
{
|
||||
'range': self.to_a1(row, col),
|
||||
'values': [[str(val)[0:49999]]]
|
||||
}
|
||||
for row, col, val in cell_updates
|
||||
{"range": self.to_a1(row, col), "values": [[str(val)[0:49999]]]} for row, col, val in cell_updates
|
||||
]
|
||||
self.wks.batch_update(cell_updates, value_input_option='USER_ENTERED')
|
||||
self.wks.batch_update(cell_updates, value_input_option="USER_ENTERED")
|
||||
|
||||
def to_a1(self, row: int, col: str):
|
||||
# row is 1-based
|
||||
|
||||
Reference in New Issue
Block a user