mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 13:18:28 +03:00
Merge GSheet Feeder and Database.
This commit is contained in:
@@ -1 +0,0 @@
|
||||
from .gsheet_db import GsheetsDb
|
||||
@@ -1,38 +0,0 @@
|
||||
{
|
||||
"name": "Google Sheets Database",
|
||||
"type": ["database"],
|
||||
"entry_point": "gsheet_db::GsheetsDb",
|
||||
"requires_setup": True,
|
||||
"dependencies": {
|
||||
"python": ["loguru", "gspread", "slugify"],
|
||||
},
|
||||
"configs": {
|
||||
"allow_worksheets": {
|
||||
"default": set(),
|
||||
"help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
|
||||
},
|
||||
"block_worksheets": {
|
||||
"default": set(),
|
||||
"help": "(CSV) explicitly block some worksheets from being processed",
|
||||
},
|
||||
"use_sheet_names_in_stored_paths": {
|
||||
"default": True,
|
||||
"type": "bool",
|
||||
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
|
||||
}
|
||||
},
|
||||
"description": """
|
||||
GsheetsDatabase:
|
||||
Handles integration with Google Sheets for tracking archival tasks.
|
||||
|
||||
### Features
|
||||
- Updates a Google Sheet with the status of the archived URLs, including in progress, success or failure, and method used.
|
||||
- Saves metadata such as title, text, timestamp, hashes, screenshots, and media URLs to designated columns.
|
||||
- Formats media-specific metadata, such as thumbnails and PDQ hashes for the sheet.
|
||||
- Skips redundant updates for empty or invalid data fields.
|
||||
|
||||
### Notes
|
||||
- Currently works only with metadata provided by GsheetFeeder.
|
||||
- Requires configuration of a linked Google Sheet and appropriate API credentials.
|
||||
"""
|
||||
}
|
||||
@@ -1,114 +0,0 @@
|
||||
from typing import Union, Tuple
|
||||
from urllib.parse import quote
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.core import Database
|
||||
from auto_archiver.core import Metadata, Media
|
||||
from auto_archiver.modules.gsheet_feeder import GWorksheet
|
||||
from auto_archiver.utils.misc import get_current_timestamp
|
||||
|
||||
|
||||
class GsheetsDb(Database):
|
||||
"""
|
||||
NB: only works if GsheetFeeder is used.
|
||||
could be updated in the future to support non-GsheetFeeder metadata
|
||||
"""
|
||||
|
||||
def started(self, item: Metadata) -> None:
|
||||
logger.warning(f"STARTED {item}")
|
||||
gw, row = self._retrieve_gsheet(item)
|
||||
gw.set_cell(row, "status", "Archive in progress")
|
||||
|
||||
def failed(self, item: Metadata, reason: str) -> None:
|
||||
logger.error(f"FAILED {item}")
|
||||
self._safe_status_update(item, f"Archive failed {reason}")
|
||||
|
||||
def aborted(self, item: Metadata) -> None:
|
||||
logger.warning(f"ABORTED {item}")
|
||||
self._safe_status_update(item, "")
|
||||
|
||||
def fetch(self, item: Metadata) -> Union[Metadata, bool]:
|
||||
"""check if the given item has been archived already"""
|
||||
return False
|
||||
|
||||
def done(self, item: Metadata, cached: bool = False) -> None:
|
||||
"""archival result ready - should be saved to DB"""
|
||||
logger.success(f"DONE {item.get_url()}")
|
||||
gw, row = self._retrieve_gsheet(item)
|
||||
# self._safe_status_update(item, 'done')
|
||||
|
||||
cell_updates = []
|
||||
row_values = gw.get_row(row)
|
||||
|
||||
def batch_if_valid(col, val, final_value=None):
|
||||
final_value = final_value or val
|
||||
try:
|
||||
if val and gw.col_exists(col) and gw.get_cell(row_values, col) == "":
|
||||
cell_updates.append((row, col, final_value))
|
||||
except Exception as e:
|
||||
logger.error(f"Unable to batch {col}={final_value} due to {e}")
|
||||
|
||||
status_message = item.status
|
||||
if cached:
|
||||
status_message = f"[cached] {status_message}"
|
||||
cell_updates.append((row, "status", status_message))
|
||||
|
||||
media: Media = item.get_final_media()
|
||||
if hasattr(media, "urls"):
|
||||
batch_if_valid("archive", "\n".join(media.urls))
|
||||
batch_if_valid("date", True, get_current_timestamp())
|
||||
batch_if_valid("title", item.get_title())
|
||||
batch_if_valid("text", item.get("content", ""))
|
||||
batch_if_valid("timestamp", item.get_timestamp())
|
||||
if media:
|
||||
batch_if_valid("hash", media.get("hash", "not-calculated"))
|
||||
|
||||
# merge all pdq hashes into a single string, if present
|
||||
pdq_hashes = []
|
||||
all_media = item.get_all_media()
|
||||
for m in all_media:
|
||||
if pdq := m.get("pdq_hash"):
|
||||
pdq_hashes.append(pdq)
|
||||
if len(pdq_hashes):
|
||||
batch_if_valid("pdq_hash", ",".join(pdq_hashes))
|
||||
|
||||
if (screenshot := item.get_media_by_id("screenshot")) and hasattr(
|
||||
screenshot, "urls"
|
||||
):
|
||||
batch_if_valid("screenshot", "\n".join(screenshot.urls))
|
||||
|
||||
if thumbnail := item.get_first_image("thumbnail"):
|
||||
if hasattr(thumbnail, "urls"):
|
||||
batch_if_valid("thumbnail", f'=IMAGE("{thumbnail.urls[0]}")')
|
||||
|
||||
if browsertrix := item.get_media_by_id("browsertrix"):
|
||||
batch_if_valid("wacz", "\n".join(browsertrix.urls))
|
||||
batch_if_valid(
|
||||
"replaywebpage",
|
||||
"\n".join(
|
||||
[
|
||||
f"https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}"
|
||||
for wacz in browsertrix.urls
|
||||
]
|
||||
),
|
||||
)
|
||||
|
||||
gw.batch_set_cell(cell_updates)
|
||||
|
||||
def _safe_status_update(self, item: Metadata, new_status: str) -> None:
|
||||
try:
|
||||
gw, row = self._retrieve_gsheet(item)
|
||||
gw.set_cell(row, "status", new_status)
|
||||
except Exception as e:
|
||||
logger.debug(f"Unable to update sheet: {e}")
|
||||
|
||||
def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
|
||||
|
||||
if gsheet := item.get_context("gsheet"):
|
||||
gw: GWorksheet = gsheet.get("worksheet")
|
||||
row: int = gsheet.get("row")
|
||||
elif self.sheet_id:
|
||||
logger.error(f"Unable to retrieve Gsheet for {item.get_url()}, GsheetDB must be used alongside GsheetFeeder.")
|
||||
|
||||
return gw, row
|
||||
@@ -1,2 +0,0 @@
|
||||
from .gworksheet import GWorksheet
|
||||
from .gsheet_feeder import GsheetsFeeder
|
||||
@@ -1,95 +0,0 @@
|
||||
"""
|
||||
GsheetsFeeder: A Google Sheets-based feeder for the Auto Archiver.
|
||||
|
||||
This reads data from Google Sheets and filters rows based on user-defined rules.
|
||||
The filtered rows are processed into `Metadata` objects.
|
||||
|
||||
### Key properties
|
||||
- validates the sheet's structure and filters rows based on input configurations.
|
||||
- Ensures only rows with valid URLs and unprocessed statuses are included.
|
||||
"""
|
||||
import os
|
||||
import gspread
|
||||
|
||||
from loguru import logger
|
||||
from slugify import slugify
|
||||
|
||||
from auto_archiver.core import Feeder
|
||||
from auto_archiver.core import Metadata
|
||||
from . import GWorksheet
|
||||
|
||||
|
||||
class GsheetsFeeder(Feeder):
|
||||
|
||||
def setup(self) -> None:
|
||||
self.gsheets_client = gspread.service_account(filename=self.service_account)
|
||||
# TODO mv to validators
|
||||
if not self.sheet and not self.sheet_id:
|
||||
raise ValueError("You need to define either a 'sheet' name or a 'sheet_id' in your manifest.")
|
||||
|
||||
def open_sheet(self):
|
||||
if self.sheet:
|
||||
return self.gsheets_client.open(self.sheet)
|
||||
else: # self.sheet_id
|
||||
return self.gsheets_client.open_by_key(self.sheet_id)
|
||||
|
||||
def __iter__(self) -> Metadata:
|
||||
sh = self.open_sheet()
|
||||
for ii, worksheet in enumerate(sh.worksheets()):
|
||||
if not self.should_process_sheet(worksheet.title):
|
||||
logger.debug(f"SKIPPED worksheet '{worksheet.title}' due to allow/block rules")
|
||||
continue
|
||||
logger.info(f'Opening worksheet {ii=}: {worksheet.title=} header={self.header}')
|
||||
gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns)
|
||||
if len(missing_cols := self.missing_required_columns(gw)):
|
||||
logger.warning(f"SKIPPED worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}")
|
||||
continue
|
||||
|
||||
# process and yield metadata here:
|
||||
yield from self._process_rows(gw)
|
||||
logger.success(f'Finished worksheet {worksheet.title}')
|
||||
|
||||
def _process_rows(self, gw: GWorksheet):
|
||||
for row in range(1 + self.header, gw.count_rows() + 1):
|
||||
url = gw.get_cell(row, 'url').strip()
|
||||
if not len(url): continue
|
||||
original_status = gw.get_cell(row, 'status')
|
||||
status = gw.get_cell(row, 'status', fresh=original_status in ['', None])
|
||||
# TODO: custom status parser(?) aka should_retry_from_status
|
||||
if status not in ['', None]: continue
|
||||
|
||||
# All checks done - archival process starts here
|
||||
m = Metadata().set_url(url)
|
||||
self._set_context(m, gw, row)
|
||||
yield m
|
||||
|
||||
def _set_context(self, m: Metadata, gw: GWorksheet, row: int) -> Metadata:
|
||||
|
||||
m.set_context("gsheet", {"row": row, "worksheet": gw})
|
||||
|
||||
if gw.get_cell_or_default(row, 'folder', "") is None:
|
||||
folder = ''
|
||||
else:
|
||||
folder = slugify(gw.get_cell_or_default(row, 'folder', "").strip())
|
||||
if len(folder):
|
||||
if self.use_sheet_names_in_stored_paths:
|
||||
m.set_context("folder", os.path.join(folder, slugify(self.sheet), slugify(gw.wks.title)))
|
||||
else:
|
||||
m.set_context("folder", folder)
|
||||
|
||||
|
||||
def should_process_sheet(self, sheet_name: str) -> bool:
|
||||
if len(self.allow_worksheets) and sheet_name not in self.allow_worksheets:
|
||||
# ALLOW rules exist AND sheet name not explicitly allowed
|
||||
return False
|
||||
if len(self.block_worksheets) and sheet_name in self.block_worksheets:
|
||||
# BLOCK rules exist AND sheet name is blocked
|
||||
return False
|
||||
return True
|
||||
|
||||
def missing_required_columns(self, gw: GWorksheet) -> list:
|
||||
missing = []
|
||||
for required_col in ['url', 'status']:
|
||||
if not gw.col_exists(required_col):
|
||||
missing.append(required_col)
|
||||
return missing
|
||||
2
src/auto_archiver/modules/gsheet_feeder_db/__init__.py
Normal file
2
src/auto_archiver/modules/gsheet_feeder_db/__init__.py
Normal file
@@ -0,0 +1,2 @@
|
||||
from .gworksheet import GWorksheet
|
||||
from .gsheet_feeder_db import GsheetsFeederDB
|
||||
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"name": "Google Sheets Feeder",
|
||||
"type": ["feeder"],
|
||||
"entry_point": "gsheet_feeder::GsheetsFeeder",
|
||||
"name": "Google Sheets Feeder Database",
|
||||
"type": ["feeder", "database"],
|
||||
"entry_point": "gsheet_feeder_db::GsheetsFeederDB",
|
||||
"requires_setup": True,
|
||||
"dependencies": {
|
||||
"python": ["loguru", "gspread", "slugify"],
|
||||
@@ -51,10 +51,23 @@
|
||||
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
|
||||
"type": "bool",
|
||||
},
|
||||
"allow_worksheets": {
|
||||
"default": set(),
|
||||
"help": "(CSV) only worksheets whose name is included in allow are included (overrides block_worksheets), leave empty so all are allowed",
|
||||
},
|
||||
"block_worksheets": {
|
||||
"default": set(),
|
||||
"help": "(CSV) explicitly block some worksheets from being processed",
|
||||
},
|
||||
"use_sheet_names_in_stored_paths": {
|
||||
"default": True,
|
||||
"type": "bool",
|
||||
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
|
||||
}
|
||||
},
|
||||
"description": """
|
||||
GsheetsFeeder
|
||||
A Google Sheets-based feeder for the Auto Archiver.
|
||||
GsheetsFeederDatabase
|
||||
A Google Sheets-based feeder and optional database for the Auto Archiver.
|
||||
|
||||
This reads data from Google Sheets and filters rows based on user-defined rules.
|
||||
The filtered rows are processed into `Metadata` objects.
|
||||
@@ -64,11 +77,16 @@
|
||||
- Processes only worksheets allowed by the `allow_worksheets` and `block_worksheets` configurations.
|
||||
- Ensures only rows with valid URLs and unprocessed statuses are included for archival.
|
||||
- Supports organizing stored files into folder paths based on sheet and worksheet names.
|
||||
- If the database is enabled, this updates the Google Sheet with the status of the archived URLs, including in progress, success or failure, and method used.
|
||||
- Saves metadata such as title, text, timestamp, hashes, screenshots, and media URLs to designated columns.
|
||||
- Formats media-specific metadata, such as thumbnails and PDQ hashes for the sheet.
|
||||
- Skips redundant updates for empty or invalid data fields.
|
||||
|
||||
### Setup
|
||||
- Requires a Google Service Account JSON file for authentication, which should be stored in `secrets/gsheets_service_account.json`.
|
||||
To set up a service account, follow the instructions [here](https://gspread.readthedocs.io/en/latest/oauth2.html).
|
||||
- Define the `sheet` or `sheet_id` configuration to specify the sheet to archive.
|
||||
- Customize the column names in your Google sheet using the `columns` configuration.
|
||||
- The Google Sheet can be used solely as a feeder or as a feeder and database, but note you can't currently feed into the database from an alternate feeder.
|
||||
""",
|
||||
}
|
||||
196
src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py
Normal file
196
src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py
Normal file
@@ -0,0 +1,196 @@
|
||||
"""
|
||||
GsheetsFeederDB: A Google Sheets-based feeder and database for the Auto Archiver.
|
||||
|
||||
This reads data from Google Sheets and filters rows based on user-defined rules.
|
||||
The filtered rows are processed into `Metadata` objects.
|
||||
|
||||
### Key properties
|
||||
- validates the sheet's structure and filters rows based on input configurations.
|
||||
- Ensures only rows with valid URLs and unprocessed statuses are included.
|
||||
"""
|
||||
import os
|
||||
from typing import Tuple, Union
|
||||
from urllib.parse import quote
|
||||
|
||||
import gspread
|
||||
from loguru import logger
|
||||
from slugify import slugify
|
||||
|
||||
from auto_archiver.core import Feeder, Database, Media
|
||||
from auto_archiver.core import Metadata
|
||||
from auto_archiver.modules.gsheet_feeder_db import GWorksheet
|
||||
from auto_archiver.utils.misc import calculate_file_hash, get_current_timestamp
|
||||
|
||||
|
||||
class GsheetsFeederDB(Feeder, Database):
|
||||
|
||||
def setup(self) -> None:
|
||||
self.gsheets_client = gspread.service_account(filename=self.service_account)
|
||||
# TODO mv to validators
|
||||
if not self.sheet and not self.sheet_id:
|
||||
raise ValueError("You need to define either a 'sheet' name or a 'sheet_id' in your manifest.")
|
||||
|
||||
def open_sheet(self):
    """Open the configured spreadsheet.

    The spreadsheet is opened by name when ``sheet`` is set, otherwise by
    key using ``sheet_id``.
    """
    if not self.sheet:
        # no sheet name configured: fall back to the spreadsheet key
        return self.gsheets_client.open_by_key(self.sheet_id)
    return self.gsheets_client.open(self.sheet)
|
||||
|
||||
def __iter__(self) -> Metadata:
    """Iterate over every archivable row of the configured spreadsheet.

    Worksheets rejected by the allow/block rules, or lacking a required
    column, are skipped with a log message. For the remaining worksheets
    the items produced by ``_process_rows`` are yielded in order.
    """
    for ii, worksheet in enumerate(self.open_sheet().worksheets()):
        title = worksheet.title
        if not self.should_process_sheet(title):
            logger.debug(f"SKIPPED worksheet '{title}' due to allow/block rules")
            continue

        logger.info(f'Opening worksheet {ii=}: {worksheet.title=} header={self.header}')
        gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns)

        missing_cols = self.missing_required_columns(gw)
        if missing_cols:
            logger.warning(f"SKIPPED worksheet '{title}' due to missing required column(s) for {missing_cols}")
            continue

        # process and yield metadata here:
        yield from self._process_rows(gw)
        logger.success(f'Finished worksheet {title}')
|
||||
|
||||
def _process_rows(self, gw: GWorksheet):
|
||||
for row in range(1 + self.header, gw.count_rows() + 1):
|
||||
url = gw.get_cell(row, 'url').strip()
|
||||
if not len(url): continue
|
||||
original_status = gw.get_cell(row, 'status')
|
||||
status = gw.get_cell(row, 'status', fresh=original_status in ['', None])
|
||||
# TODO: custom status parser(?) aka should_retry_from_status
|
||||
if status not in ['', None]: continue
|
||||
|
||||
# All checks done - archival process starts here
|
||||
m = Metadata().set_url(url)
|
||||
self._set_context(m, gw, row)
|
||||
yield m
|
||||
|
||||
def _set_context(self, m: Metadata, gw: GWorksheet, row: int) -> Metadata:
|
||||
# TODO: Check folder value not being recognised
|
||||
m.set_context("gsheet", {"row": row, "worksheet": gw})
|
||||
|
||||
if gw.get_cell_or_default(row, 'folder', "") is None:
|
||||
folder = ''
|
||||
else:
|
||||
folder = slugify(gw.get_cell_or_default(row, 'folder', "").strip())
|
||||
if len(folder):
|
||||
if self.use_sheet_names_in_stored_paths:
|
||||
m.set_context("folder", os.path.join(folder, slugify(self.sheet), slugify(gw.wks.title)))
|
||||
else:
|
||||
m.set_context("folder", folder)
|
||||
|
||||
def should_process_sheet(self, sheet_name: str) -> bool:
    """Decide whether the worksheet called ``sheet_name`` should be processed.

    A worksheet is rejected when allow rules exist and do not include it,
    or when block rules exist and include it. Otherwise it is accepted.
    """
    allowed = self.allow_worksheets
    blocked = self.block_worksheets

    # ALLOW rules exist AND sheet name not explicitly allowed
    rejected_by_allow = bool(len(allowed)) and sheet_name not in allowed
    if rejected_by_allow:
        return False

    # BLOCK rules exist AND sheet name is blocked
    rejected_by_block = bool(len(blocked)) and sheet_name in blocked
    return not rejected_by_block
|
||||
|
||||
def missing_required_columns(self, gw: GWorksheet) -> list:
    """List the required columns ('url', 'status') that ``gw`` does not have.

    The result keeps the order 'url', 'status' and is empty when both
    columns exist.
    """
    required = ('url', 'status')
    return [column for column in required if not gw.col_exists(column)]
|
||||
|
||||
|
||||
def started(self, item: Metadata) -> None:
    """Set the status cell of the item's row to "Archive in progress".

    Unlike ``failed`` and ``aborted``, this does not go through
    ``_safe_status_update``, so errors while locating or writing the row
    propagate to the caller.
    """
    logger.warning(f"STARTED {item}")
    worksheet, row_number = self._retrieve_gsheet(item)
    worksheet.set_cell(row_number, "status", "Archive in progress")
|
||||
|
||||
def failed(self, item: Metadata, reason: str) -> None:
    """Log the failure and write "Archive failed <reason>" to the status cell.

    The sheet write is best-effort: it goes through ``_safe_status_update``.
    """
    logger.error(f"FAILED {item}")
    failure_status = f"Archive failed {reason}"
    self._safe_status_update(item, failure_status)
|
||||
|
||||
def aborted(self, item: Metadata) -> None:
    """Log the abort and reset the row's status cell to an empty string.

    The sheet write is best-effort: it goes through ``_safe_status_update``.
    """
    logger.warning(f"ABORTED {item}")
    cleared_status = ""
    self._safe_status_update(item, cleared_status)
|
||||
|
||||
def fetch(self, item: Metadata) -> Union[Metadata, bool]:
    """Check if the given item has been archived already.

    This implementation performs no lookup and always answers ``False``.
    """
    already_archived = False
    return already_archived
|
||||
|
||||
def done(self, item: Metadata, cached: bool = False) -> None:
    """Archival result ready - write it to the item's spreadsheet row.

    The status cell is always updated (prefixed with "[cached] " when
    ``cached`` is true). Every other column is filled only if it exists in
    the worksheet, its cell is currently empty, and there is a value to
    write. All writes are sent together via ``gw.batch_set_cell``.

    Args:
        item: the archived item; must carry the "gsheet" context.
        cached: whether the result came from a previous archival.
    """
    logger.success(f"DONE {item.get_url()}")
    gw, row = self._retrieve_gsheet(item)

    cell_updates = []
    row_values = gw.get_row(row)

    def batch_if_valid(col, val, final_value=None):
        # Queue `final_value` (defaulting to `val`) for `col` when `val` is
        # truthy, the column exists and the cell is still empty.
        final_value = final_value or val
        try:
            if val and gw.col_exists(col) and gw.get_cell(row_values, col) == "":
                cell_updates.append((row, col, final_value))
        except Exception as e:
            logger.error(f"Unable to batch {col}={final_value} due to {e}")

    def urls_of(media_obj):
        # URLs of a media object, or an empty list when it is None or has
        # no (or empty) `urls`; keeps every branch below equally guarded.
        return getattr(media_obj, "urls", None) or []

    status_message = item.status
    if cached:
        status_message = f"[cached] {status_message}"
    cell_updates.append((row, "status", status_message))

    media: Media = item.get_final_media()
    batch_if_valid("archive", "\n".join(urls_of(media)))
    # val=True forces the value check to pass, so the timestamp is written
    # whenever the 'date' column exists and is empty.
    batch_if_valid("date", True, get_current_timestamp())
    batch_if_valid("title", item.get_title())
    batch_if_valid("text", item.get("content", ""))
    batch_if_valid("timestamp", item.get_timestamp())
    if media:
        batch_if_valid("hash", media.get("hash", "not-calculated"))

    # merge all pdq hashes into a single string, if present
    pdq_hashes = [pdq for m in item.get_all_media() if (pdq := m.get("pdq_hash"))]
    if pdq_hashes:
        batch_if_valid("pdq_hash", ",".join(pdq_hashes))

    batch_if_valid("screenshot", "\n".join(urls_of(item.get_media_by_id("screenshot"))))

    # Index the first URL only when there is one: an empty `urls` list
    # previously raised IndexError here.
    thumbnail_urls = urls_of(item.get_first_image("thumbnail"))
    if thumbnail_urls:
        batch_if_valid("thumbnail", f'=IMAGE("{thumbnail_urls[0]}")')

    wacz_urls = urls_of(item.get_media_by_id("browsertrix"))
    if wacz_urls:
        batch_if_valid("wacz", "\n".join(wacz_urls))
        # The page URL is the same for every WACZ, so quote it once.
        quoted_page_url = quote(item.get_url())
        batch_if_valid(
            "replaywebpage",
            "\n".join(
                f"https://replayweb.page/?source={quote(wacz)}#view=pages&url={quoted_page_url}"
                for wacz in wacz_urls
            ),
        )

    gw.batch_set_cell(cell_updates)
|
||||
|
||||
def _safe_status_update(self, item: Metadata, new_status: str) -> None:
|
||||
try:
|
||||
gw, row = self._retrieve_gsheet(item)
|
||||
gw.set_cell(row, "status", new_status)
|
||||
except Exception as e:
|
||||
logger.debug(f"Unable to update sheet: {e}")
|
||||
|
||||
def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
|
||||
|
||||
if gsheet := item.get_context("gsheet"):
|
||||
gw: GWorksheet = gsheet.get("worksheet")
|
||||
row: int = gsheet.get("row")
|
||||
elif self.sheet_id:
|
||||
logger.error(f"Unable to retrieve Gsheet for {item.get_url()}, GsheetDB must be used alongside GsheetFeeder.")
|
||||
|
||||
return gw, row
|
||||
Reference in New Issue
Block a user