From 75380b07161f3fe758ab919d8c9214565758bb8f Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Tue, 25 Feb 2025 21:32:32 +0000 Subject: [PATCH 01/15] Merge GSheet Feeder and Database. --- .../modules/gsheet_db/__init__.py | 1 - .../modules/gsheet_db/__manifest__.py | 38 ---- .../modules/gsheet_db/gsheet_db.py | 114 ---------- .../modules/gsheet_feeder/__init__.py | 2 - .../modules/gsheet_feeder/gsheet_feeder.py | 95 --------- .../modules/gsheet_feeder_db/__init__.py | 2 + .../__manifest__.py | 28 ++- .../gsheet_feeder_db/gsheet_feeder_db.py | 196 ++++++++++++++++++ .../gworksheet.py | 0 tests/databases/test_gsheet_db.py | 26 ++- tests/feeders/test_gsheet_feeder.py | 28 +-- 11 files changed, 251 insertions(+), 279 deletions(-) delete mode 100644 src/auto_archiver/modules/gsheet_db/__init__.py delete mode 100644 src/auto_archiver/modules/gsheet_db/__manifest__.py delete mode 100644 src/auto_archiver/modules/gsheet_db/gsheet_db.py delete mode 100644 src/auto_archiver/modules/gsheet_feeder/__init__.py delete mode 100644 src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py create mode 100644 src/auto_archiver/modules/gsheet_feeder_db/__init__.py rename src/auto_archiver/modules/{gsheet_feeder => gsheet_feeder_db}/__manifest__.py (69%) create mode 100644 src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py rename src/auto_archiver/modules/{gsheet_feeder => gsheet_feeder_db}/gworksheet.py (100%) diff --git a/src/auto_archiver/modules/gsheet_db/__init__.py b/src/auto_archiver/modules/gsheet_db/__init__.py deleted file mode 100644 index 01fdee6..0000000 --- a/src/auto_archiver/modules/gsheet_db/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .gsheet_db import GsheetsDb \ No newline at end of file diff --git a/src/auto_archiver/modules/gsheet_db/__manifest__.py b/src/auto_archiver/modules/gsheet_db/__manifest__.py deleted file mode 100644 index cf95245..0000000 --- a/src/auto_archiver/modules/gsheet_db/__manifest__.py +++ /dev/null @@ -1,38 +0,0 @@ -{ - "name": 
"Google Sheets Database", - "type": ["database"], - "entry_point": "gsheet_db::GsheetsDb", - "requires_setup": True, - "dependencies": { - "python": ["loguru", "gspread", "slugify"], - }, - "configs": { - "allow_worksheets": { - "default": set(), - "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed", - }, - "block_worksheets": { - "default": set(), - "help": "(CSV) explicitly block some worksheets from being processed", - }, - "use_sheet_names_in_stored_paths": { - "default": True, - "type": "bool", - "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'", - } - }, - "description": """ - GsheetsDatabase: - Handles integration with Google Sheets for tracking archival tasks. - -### Features -- Updates a Google Sheet with the status of the archived URLs, including in progress, success or failure, and method used. -- Saves metadata such as title, text, timestamp, hashes, screenshots, and media URLs to designated columns. -- Formats media-specific metadata, such as thumbnails and PDQ hashes for the sheet. -- Skips redundant updates for empty or invalid data fields. - -### Notes -- Currently works only with metadata provided by GsheetFeeder. -- Requires configuration of a linked Google Sheet and appropriate API credentials. - """ -} diff --git a/src/auto_archiver/modules/gsheet_db/gsheet_db.py b/src/auto_archiver/modules/gsheet_db/gsheet_db.py deleted file mode 100644 index c19f2ae..0000000 --- a/src/auto_archiver/modules/gsheet_db/gsheet_db.py +++ /dev/null @@ -1,114 +0,0 @@ -from typing import Union, Tuple -from urllib.parse import quote - -from loguru import logger - -from auto_archiver.core import Database -from auto_archiver.core import Metadata, Media -from auto_archiver.modules.gsheet_feeder import GWorksheet -from auto_archiver.utils.misc import get_current_timestamp - - -class GsheetsDb(Database): - """ - NB: only works if GsheetFeeder is used. 
- could be updated in the future to support non-GsheetFeeder metadata - """ - - def started(self, item: Metadata) -> None: - logger.warning(f"STARTED {item}") - gw, row = self._retrieve_gsheet(item) - gw.set_cell(row, "status", "Archive in progress") - - def failed(self, item: Metadata, reason: str) -> None: - logger.error(f"FAILED {item}") - self._safe_status_update(item, f"Archive failed {reason}") - - def aborted(self, item: Metadata) -> None: - logger.warning(f"ABORTED {item}") - self._safe_status_update(item, "") - - def fetch(self, item: Metadata) -> Union[Metadata, bool]: - """check if the given item has been archived already""" - return False - - def done(self, item: Metadata, cached: bool = False) -> None: - """archival result ready - should be saved to DB""" - logger.success(f"DONE {item.get_url()}") - gw, row = self._retrieve_gsheet(item) - # self._safe_status_update(item, 'done') - - cell_updates = [] - row_values = gw.get_row(row) - - def batch_if_valid(col, val, final_value=None): - final_value = final_value or val - try: - if val and gw.col_exists(col) and gw.get_cell(row_values, col) == "": - cell_updates.append((row, col, final_value)) - except Exception as e: - logger.error(f"Unable to batch {col}={final_value} due to {e}") - - status_message = item.status - if cached: - status_message = f"[cached] {status_message}" - cell_updates.append((row, "status", status_message)) - - media: Media = item.get_final_media() - if hasattr(media, "urls"): - batch_if_valid("archive", "\n".join(media.urls)) - batch_if_valid("date", True, get_current_timestamp()) - batch_if_valid("title", item.get_title()) - batch_if_valid("text", item.get("content", "")) - batch_if_valid("timestamp", item.get_timestamp()) - if media: - batch_if_valid("hash", media.get("hash", "not-calculated")) - - # merge all pdq hashes into a single string, if present - pdq_hashes = [] - all_media = item.get_all_media() - for m in all_media: - if pdq := m.get("pdq_hash"): - pdq_hashes.append(pdq) 
- if len(pdq_hashes): - batch_if_valid("pdq_hash", ",".join(pdq_hashes)) - - if (screenshot := item.get_media_by_id("screenshot")) and hasattr( - screenshot, "urls" - ): - batch_if_valid("screenshot", "\n".join(screenshot.urls)) - - if thumbnail := item.get_first_image("thumbnail"): - if hasattr(thumbnail, "urls"): - batch_if_valid("thumbnail", f'=IMAGE("{thumbnail.urls[0]}")') - - if browsertrix := item.get_media_by_id("browsertrix"): - batch_if_valid("wacz", "\n".join(browsertrix.urls)) - batch_if_valid( - "replaywebpage", - "\n".join( - [ - f"https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}" - for wacz in browsertrix.urls - ] - ), - ) - - gw.batch_set_cell(cell_updates) - - def _safe_status_update(self, item: Metadata, new_status: str) -> None: - try: - gw, row = self._retrieve_gsheet(item) - gw.set_cell(row, "status", new_status) - except Exception as e: - logger.debug(f"Unable to update sheet: {e}") - - def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]: - - if gsheet := item.get_context("gsheet"): - gw: GWorksheet = gsheet.get("worksheet") - row: int = gsheet.get("row") - elif self.sheet_id: - logger.error(f"Unable to retrieve Gsheet for {item.get_url()}, GsheetDB must be used alongside GsheetFeeder.") - - return gw, row diff --git a/src/auto_archiver/modules/gsheet_feeder/__init__.py b/src/auto_archiver/modules/gsheet_feeder/__init__.py deleted file mode 100644 index bb4230a..0000000 --- a/src/auto_archiver/modules/gsheet_feeder/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .gworksheet import GWorksheet -from .gsheet_feeder import GsheetsFeeder \ No newline at end of file diff --git a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py deleted file mode 100644 index ea724e7..0000000 --- a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py +++ /dev/null @@ -1,95 +0,0 @@ -""" -GsheetsFeeder: A Google Sheets-based feeder for the Auto 
Archiver. - -This reads data from Google Sheets and filters rows based on user-defined rules. -The filtered rows are processed into `Metadata` objects. - -### Key properties -- validates the sheet's structure and filters rows based on input configurations. -- Ensures only rows with valid URLs and unprocessed statuses are included. -""" -import os -import gspread - -from loguru import logger -from slugify import slugify - -from auto_archiver.core import Feeder -from auto_archiver.core import Metadata -from . import GWorksheet - - -class GsheetsFeeder(Feeder): - - def setup(self) -> None: - self.gsheets_client = gspread.service_account(filename=self.service_account) - # TODO mv to validators - if not self.sheet and not self.sheet_id: - raise ValueError("You need to define either a 'sheet' name or a 'sheet_id' in your manifest.") - - def open_sheet(self): - if self.sheet: - return self.gsheets_client.open(self.sheet) - else: # self.sheet_id - return self.gsheets_client.open_by_key(self.sheet_id) - - def __iter__(self) -> Metadata: - sh = self.open_sheet() - for ii, worksheet in enumerate(sh.worksheets()): - if not self.should_process_sheet(worksheet.title): - logger.debug(f"SKIPPED worksheet '{worksheet.title}' due to allow/block rules") - continue - logger.info(f'Opening worksheet {ii=}: {worksheet.title=} header={self.header}') - gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns) - if len(missing_cols := self.missing_required_columns(gw)): - logger.warning(f"SKIPPED worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}") - continue - - # process and yield metadata here: - yield from self._process_rows(gw) - logger.success(f'Finished worksheet {worksheet.title}') - - def _process_rows(self, gw: GWorksheet): - for row in range(1 + self.header, gw.count_rows() + 1): - url = gw.get_cell(row, 'url').strip() - if not len(url): continue - original_status = gw.get_cell(row, 'status') - status = gw.get_cell(row, 'status', 
fresh=original_status in ['', None]) - # TODO: custom status parser(?) aka should_retry_from_status - if status not in ['', None]: continue - - # All checks done - archival process starts here - m = Metadata().set_url(url) - self._set_context(m, gw, row) - yield m - - def _set_context(self, m: Metadata, gw: GWorksheet, row: int) -> Metadata: - # TODO: Check folder value not being recognised - m.set_context("gsheet", {"row": row, "worksheet": gw}) - - if gw.get_cell_or_default(row, 'folder', "") is None: - folder = '' - else: - folder = slugify(gw.get_cell_or_default(row, 'folder', "").strip()) - if len(folder): - if self.use_sheet_names_in_stored_paths: - m.set_context("folder", os.path.join(folder, slugify(self.sheet), slugify(gw.wks.title))) - else: - m.set_context("folder", folder) - - - def should_process_sheet(self, sheet_name: str) -> bool: - if len(self.allow_worksheets) and sheet_name not in self.allow_worksheets: - # ALLOW rules exist AND sheet name not explicitly allowed - return False - if len(self.block_worksheets) and sheet_name in self.block_worksheets: - # BLOCK rules exist AND sheet name is blocked - return False - return True - - def missing_required_columns(self, gw: GWorksheet) -> list: - missing = [] - for required_col in ['url', 'status']: - if not gw.col_exists(required_col): - missing.append(required_col) - return missing diff --git a/src/auto_archiver/modules/gsheet_feeder_db/__init__.py b/src/auto_archiver/modules/gsheet_feeder_db/__init__.py new file mode 100644 index 0000000..2e9ac02 --- /dev/null +++ b/src/auto_archiver/modules/gsheet_feeder_db/__init__.py @@ -0,0 +1,2 @@ +from .gworksheet import GWorksheet +from .gsheet_feeder_db import GsheetsFeederDB \ No newline at end of file diff --git a/src/auto_archiver/modules/gsheet_feeder/__manifest__.py b/src/auto_archiver/modules/gsheet_feeder_db/__manifest__.py similarity index 69% rename from src/auto_archiver/modules/gsheet_feeder/__manifest__.py rename to 
src/auto_archiver/modules/gsheet_feeder_db/__manifest__.py index 130b9f6..bb2f447 100644 --- a/src/auto_archiver/modules/gsheet_feeder/__manifest__.py +++ b/src/auto_archiver/modules/gsheet_feeder_db/__manifest__.py @@ -1,7 +1,7 @@ { - "name": "Google Sheets Feeder", - "type": ["feeder"], - "entry_point": "gsheet_feeder::GsheetsFeeder", + "name": "Google Sheets Feeder Database", + "type": ["feeder", "database"], + "entry_point": "gsheet_feeder_db::GsheetsFeederDB", "requires_setup": True, "dependencies": { "python": ["loguru", "gspread", "slugify"], @@ -51,10 +51,23 @@ "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'", "type": "bool", }, + "allow_worksheets": { + "default": set(), + "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed", + }, + "block_worksheets": { + "default": set(), + "help": "(CSV) explicitly block some worksheets from being processed", + }, + "use_sheet_names_in_stored_paths": { + "default": True, + "type": "bool", + "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'", + } }, "description": """ - GsheetsFeeder - A Google Sheets-based feeder for the Auto Archiver. + GsheetsFeederDatabase + A Google Sheets-based feeder and optional database for the Auto Archiver. This reads data from Google Sheets and filters rows based on user-defined rules. The filtered rows are processed into `Metadata` objects. @@ -64,11 +77,16 @@ - Processes only worksheets allowed by the `allow_worksheets` and `block_worksheets` configurations. - Ensures only rows with valid URLs and unprocessed statuses are included for archival. - Supports organizing stored files into folder paths based on sheet and worksheet names. + - If the database is enabled, this updates the Google Sheet with the status of the archived URLs, including in progress, success or failure, and method used. 
+ - Saves metadata such as title, text, timestamp, hashes, screenshots, and media URLs to designated columns. + - Formats media-specific metadata, such as thumbnails and PDQ hashes for the sheet. + - Skips redundant updates for empty or invalid data fields. ### Setup - Requires a Google Service Account JSON file for authentication, which should be stored in `secrets/gsheets_service_account.json`. To set up a service account, follow the instructions [here](https://gspread.readthedocs.io/en/latest/oauth2.html). - Define the `sheet` or `sheet_id` configuration to specify the sheet to archive. - Customize the column names in your Google sheet using the `columns` configuration. + - The Google Sheet can be used soley as a feeder or as a feeder and database, but note you can't currently feed into the database from an alternate feeder. """, } diff --git a/src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py b/src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py new file mode 100644 index 0000000..406eeb4 --- /dev/null +++ b/src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py @@ -0,0 +1,196 @@ +""" +GsheetsFeeder: A Google Sheets-based feeder for the Auto Archiver. + +This reads data from Google Sheets and filters rows based on user-defined rules. +The filtered rows are processed into `Metadata` objects. + +### Key properties +- validates the sheet's structure and filters rows based on input configurations. +- Ensures only rows with valid URLs and unprocessed statuses are included. 
+""" +import os +from typing import Tuple, Union +from urllib.parse import quote + +import gspread +from loguru import logger +from slugify import slugify + +from auto_archiver.core import Feeder, Database, Media +from auto_archiver.core import Metadata +from auto_archiver.modules.gsheet_feeder_db import GWorksheet +from auto_archiver.utils.misc import calculate_file_hash, get_current_timestamp + + +class GsheetsFeederDB(Feeder, Database): + + def setup(self) -> None: + self.gsheets_client = gspread.service_account(filename=self.service_account) + # TODO mv to validators + if not self.sheet and not self.sheet_id: + raise ValueError("You need to define either a 'sheet' name or a 'sheet_id' in your manifest.") + + def open_sheet(self): + if self.sheet: + return self.gsheets_client.open(self.sheet) + else: # self.sheet_id + return self.gsheets_client.open_by_key(self.sheet_id) + + def __iter__(self) -> Metadata: + sh = self.open_sheet() + for ii, worksheet in enumerate(sh.worksheets()): + if not self.should_process_sheet(worksheet.title): + logger.debug(f"SKIPPED worksheet '{worksheet.title}' due to allow/block rules") + continue + logger.info(f'Opening worksheet {ii=}: {worksheet.title=} header={self.header}') + gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns) + if len(missing_cols := self.missing_required_columns(gw)): + logger.warning(f"SKIPPED worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}") + continue + + # process and yield metadata here: + yield from self._process_rows(gw) + logger.success(f'Finished worksheet {worksheet.title}') + + def _process_rows(self, gw: GWorksheet): + for row in range(1 + self.header, gw.count_rows() + 1): + url = gw.get_cell(row, 'url').strip() + if not len(url): continue + original_status = gw.get_cell(row, 'status') + status = gw.get_cell(row, 'status', fresh=original_status in ['', None]) + # TODO: custom status parser(?) 
aka should_retry_from_status + if status not in ['', None]: continue + + # All checks done - archival process starts here + m = Metadata().set_url(url) + self._set_context(m, gw, row) + yield m + + def _set_context(self, m: Metadata, gw: GWorksheet, row: int) -> Metadata: + # TODO: Check folder value not being recognised + m.set_context("gsheet", {"row": row, "worksheet": gw}) + + if gw.get_cell_or_default(row, 'folder', "") is None: + folder = '' + else: + folder = slugify(gw.get_cell_or_default(row, 'folder', "").strip()) + if len(folder): + if self.use_sheet_names_in_stored_paths: + m.set_context("folder", os.path.join(folder, slugify(self.sheet), slugify(gw.wks.title))) + else: + m.set_context("folder", folder) + + def should_process_sheet(self, sheet_name: str) -> bool: + if len(self.allow_worksheets) and sheet_name not in self.allow_worksheets: + # ALLOW rules exist AND sheet name not explicitly allowed + return False + if len(self.block_worksheets) and sheet_name in self.block_worksheets: + # BLOCK rules exist AND sheet name is blocked + return False + return True + + def missing_required_columns(self, gw: GWorksheet) -> list: + missing = [] + for required_col in ['url', 'status']: + if not gw.col_exists(required_col): + missing.append(required_col) + return missing + + + def started(self, item: Metadata) -> None: + logger.warning(f"STARTED {item}") + gw, row = self._retrieve_gsheet(item) + gw.set_cell(row, "status", "Archive in progress") + + def failed(self, item: Metadata, reason: str) -> None: + logger.error(f"FAILED {item}") + self._safe_status_update(item, f"Archive failed {reason}") + + def aborted(self, item: Metadata) -> None: + logger.warning(f"ABORTED {item}") + self._safe_status_update(item, "") + + def fetch(self, item: Metadata) -> Union[Metadata, bool]: + """check if the given item has been archived already""" + return False + + def done(self, item: Metadata, cached: bool = False) -> None: + """archival result ready - should be saved to DB""" 
+ logger.success(f"DONE {item.get_url()}") + gw, row = self._retrieve_gsheet(item) + # self._safe_status_update(item, 'done') + + cell_updates = [] + row_values = gw.get_row(row) + + def batch_if_valid(col, val, final_value=None): + final_value = final_value or val + try: + if val and gw.col_exists(col) and gw.get_cell(row_values, col) == "": + cell_updates.append((row, col, final_value)) + except Exception as e: + logger.error(f"Unable to batch {col}={final_value} due to {e}") + + status_message = item.status + if cached: + status_message = f"[cached] {status_message}" + cell_updates.append((row, "status", status_message)) + + media: Media = item.get_final_media() + if hasattr(media, "urls"): + batch_if_valid("archive", "\n".join(media.urls)) + batch_if_valid("date", True, get_current_timestamp()) + batch_if_valid("title", item.get_title()) + batch_if_valid("text", item.get("content", "")) + batch_if_valid("timestamp", item.get_timestamp()) + if media: + batch_if_valid("hash", media.get("hash", "not-calculated")) + + # merge all pdq hashes into a single string, if present + pdq_hashes = [] + all_media = item.get_all_media() + for m in all_media: + if pdq := m.get("pdq_hash"): + pdq_hashes.append(pdq) + if len(pdq_hashes): + batch_if_valid("pdq_hash", ",".join(pdq_hashes)) + + if (screenshot := item.get_media_by_id("screenshot")) and hasattr( + screenshot, "urls" + ): + batch_if_valid("screenshot", "\n".join(screenshot.urls)) + + if thumbnail := item.get_first_image("thumbnail"): + if hasattr(thumbnail, "urls"): + batch_if_valid("thumbnail", f'=IMAGE("{thumbnail.urls[0]}")') + + if browsertrix := item.get_media_by_id("browsertrix"): + batch_if_valid("wacz", "\n".join(browsertrix.urls)) + batch_if_valid( + "replaywebpage", + "\n".join( + [ + f"https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}" + for wacz in browsertrix.urls + ] + ), + ) + + gw.batch_set_cell(cell_updates) + + def _safe_status_update(self, item: Metadata, new_status: 
str) -> None: + try: + gw, row = self._retrieve_gsheet(item) + gw.set_cell(row, "status", new_status) + except Exception as e: + logger.debug(f"Unable to update sheet: {e}") + + def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]: + + if gsheet := item.get_context("gsheet"): + gw: GWorksheet = gsheet.get("worksheet") + row: int = gsheet.get("row") + elif self.sheet_id: + logger.error(f"Unable to retrieve Gsheet for {item.get_url()}, GsheetDB must be used alongside GsheetFeeder.") + + return gw, row diff --git a/src/auto_archiver/modules/gsheet_feeder/gworksheet.py b/src/auto_archiver/modules/gsheet_feeder_db/gworksheet.py similarity index 100% rename from src/auto_archiver/modules/gsheet_feeder/gworksheet.py rename to src/auto_archiver/modules/gsheet_feeder_db/gworksheet.py diff --git a/tests/databases/test_gsheet_db.py b/tests/databases/test_gsheet_db.py index 42a21b2..2f1202d 100644 --- a/tests/databases/test_gsheet_db.py +++ b/tests/databases/test_gsheet_db.py @@ -2,8 +2,7 @@ from datetime import datetime, timezone import pytest from auto_archiver.core import Metadata, Media -from auto_archiver.modules.gsheet_db import GsheetsDb -from auto_archiver.modules.gsheet_feeder import GWorksheet +from auto_archiver.modules.gsheet_feeder_db import GsheetsFeederDB, GWorksheet @pytest.fixture @@ -54,11 +53,18 @@ def mock_media(mocker): @pytest.fixture def gsheets_db(mock_gworksheet, setup_module, mocker): - db = setup_module("gsheet_db", { - "allow_worksheets": "set()", - "block_worksheets": "set()", - "use_sheet_names_in_stored_paths": "True", - }) + mocker.patch("gspread.service_account") + config: dict = { + "sheet": "testsheet", + "sheet_id": None, + "header": 1, + "service_account": "test/service_account.json", + "columns": {'url': 'link', 'status': 'archive status', 'folder': 'destination folder', 'archive': 'archive location', 'date': 'archive date', 'thumbnail': 'thumbnail', 'timestamp': 'upload timestamp', 'title': 'upload title', 'text': 'text 
content', 'screenshot': 'screenshot', 'hash': 'hash', 'pdq_hash': 'perceptual hashes', 'wacz': 'wacz', 'replaywebpage': 'replaywebpage'}, + "allow_worksheets": set(), + "block_worksheets": set(), + "use_sheet_names_in_stored_paths": True, + } + db = setup_module("gsheet_feeder_db", config) db._retrieve_gsheet = mocker.MagicMock(return_value=(mock_gworksheet, 1)) return db @@ -108,13 +114,13 @@ def test_aborted(gsheets_db, mock_metadata, mock_gworksheet): def test_done(gsheets_db, metadata, mock_gworksheet, expected_calls, mocker): - mocker.patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00') + mocker.patch("auto_archiver.modules.gsheet_feeder_db.gsheet_feeder_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00') gsheets_db.done(metadata) mock_gworksheet.batch_set_cell.assert_called_once_with(expected_calls) def test_done_cached(gsheets_db, metadata, mock_gworksheet, mocker): - mocker.patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00') + mocker.patch("auto_archiver.modules.gsheet_feeder_db.gsheet_feeder_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00') gsheets_db.done(metadata, cached=True) # Verify the status message includes "[cached]" @@ -125,7 +131,7 @@ def test_done_cached(gsheets_db, metadata, mock_gworksheet, mocker): def test_done_missing_media(gsheets_db, metadata, mock_gworksheet, mocker): # clear media from metadata metadata.media = [] - mocker.patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00') + mocker.patch("auto_archiver.modules.gsheet_feeder_db.gsheet_feeder_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00') gsheets_db.done(metadata) # Verify nothing media-related gets updated call_args = mock_gworksheet.batch_set_cell.call_args[0][0] diff --git a/tests/feeders/test_gsheet_feeder.py 
b/tests/feeders/test_gsheet_feeder.py index ef150d1..9ca81b0 100644 --- a/tests/feeders/test_gsheet_feeder.py +++ b/tests/feeders/test_gsheet_feeder.py @@ -2,7 +2,7 @@ from typing import Type import gspread import pytest -from auto_archiver.modules.gsheet_feeder import GsheetsFeeder +from auto_archiver.modules.gsheet_feeder_db import GsheetsFeederDB from auto_archiver.core import Metadata, Feeder @@ -11,13 +11,13 @@ def test_setup_without_sheet_and_sheet_id(setup_module, mocker): mocker.patch("gspread.service_account") with pytest.raises(ValueError): setup_module( - "gsheet_feeder", + "gsheet_feeder_db", {"service_account": "dummy.json", "sheet": None, "sheet_id": None}, ) @pytest.fixture -def gsheet_feeder(setup_module, mocker) -> GsheetsFeeder: +def gsheet_feeder(setup_module, mocker) -> GsheetsFeederDB: config: dict = { "service_account": "dummy.json", "sheet": "test-auto-archiver", @@ -45,7 +45,7 @@ def gsheet_feeder(setup_module, mocker) -> GsheetsFeeder: } mocker.patch("gspread.service_account") feeder = setup_module( - "gsheet_feeder", + "gsheet_feeder_db", config ) feeder.gsheets_client = mocker.MagicMock() @@ -90,7 +90,7 @@ class MockWorksheet: return matching.get(col_name, default) -def test__process_rows(gsheet_feeder: GsheetsFeeder): +def test__process_rows(gsheet_feeder: GsheetsFeederDB): testworksheet = MockWorksheet() metadata_items = list(gsheet_feeder._process_rows(testworksheet)) assert len(metadata_items) == 3 @@ -98,7 +98,7 @@ def test__process_rows(gsheet_feeder: GsheetsFeeder): assert metadata_items[0].get("url") == "http://example.com" -def test__set_metadata(gsheet_feeder: GsheetsFeeder): +def test__set_metadata(gsheet_feeder: GsheetsFeederDB): worksheet = MockWorksheet() metadata = Metadata() gsheet_feeder._set_context(metadata, worksheet, 1) @@ -106,12 +106,12 @@ def test__set_metadata(gsheet_feeder: GsheetsFeeder): @pytest.mark.skip(reason="Not recognising folder column") -def test__set_metadata_with_folder_pickled(gsheet_feeder: 
GsheetsFeeder, worksheet): +def test__set_metadata_with_folder_pickled(gsheet_feeder: GsheetsFeederDB, worksheet): gsheet_feeder._set_context(worksheet, 7) assert Metadata.get_context("gsheet") == {"row": 1, "worksheet": worksheet} -def test__set_metadata_with_folder(gsheet_feeder: GsheetsFeeder): +def test__set_metadata_with_folder(gsheet_feeder: GsheetsFeederDB): testworksheet = MockWorksheet() metadata = Metadata() testworksheet.wks.title = "TestSheet" @@ -140,7 +140,7 @@ def test_open_sheet_with_name_or_id( # Setup module with parameterized values feeder = setup_module( - "gsheet_feeder", + "gsheet_feeder_db", {"service_account": "dummy.json", "sheet": sheet, "sheet_id": sheet_id}, ) sheet_result = feeder.open_sheet() @@ -159,7 +159,7 @@ def test_open_sheet_with_sheet_id(setup_module, mocker): mock_service_account.return_value = mock_client mock_client.open_by_key.return_value = "MockSheet" feeder = setup_module( - "gsheet_feeder", + "gsheet_feeder_db", {"service_account": "dummy.json", "sheet": None, "sheet_id": "ABC123"}, ) sheet = feeder.open_sheet() @@ -170,7 +170,7 @@ def test_open_sheet_with_sheet_id(setup_module, mocker): def test_should_process_sheet(setup_module, mocker): mocker.patch("gspread.service_account") gdb = setup_module( - "gsheet_feeder", + "gsheet_feeder_db", { "service_account": "dummy.json", "sheet": "TestSheet", @@ -187,10 +187,10 @@ def test_should_process_sheet(setup_module, mocker): @pytest.mark.skip(reason="Requires a real connection") class TestGSheetsFeederReal: - """Testing GSheetsFeeder class""" + """Testing GsheetsFeeder class""" - module_name: str = "gsheet_feeder" - feeder: GsheetsFeeder + module_name: str = "gsheet_feeder_db" + feeder: GsheetsFeederDB # You must follow the setup process explain in the docs for this to work config: dict = { "service_account": "secrets/service_account.json", From 696aafb52d49ff283c0ff09e5ea940ebc2232a06 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Tue, 25 Feb 2025 21:38:41 +0000 Subject: 
[PATCH 02/15] Update gsheet_feeder references in tests. --- tests/feeders/test_gworksheet.py | 2 +- tests/test_orchestrator.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/feeders/test_gworksheet.py b/tests/feeders/test_gworksheet.py index 2b05504..b6a0b5c 100644 --- a/tests/feeders/test_gworksheet.py +++ b/tests/feeders/test_gworksheet.py @@ -1,7 +1,7 @@ # Note this isn't a feeder, but contained as utility of the gsheet feeder module import pytest -from auto_archiver.modules.gsheet_feeder import GWorksheet +from auto_archiver.modules.gsheet_feeder_db import GWorksheet class TestGWorksheet: diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py index 752adb8..72f4949 100644 --- a/tests/test_orchestrator.py +++ b/tests/test_orchestrator.py @@ -78,7 +78,7 @@ def test_help(orchestrator, basic_parser, capsys): assert "--logging.level" in logs # individual module configs - assert "--gsheet_feeder.sheet_id" in logs + assert "--gsheet_feeder_db.sheet_id" in logs def test_add_custom_modules_path(orchestrator, test_args): @@ -154,22 +154,22 @@ def test_load_modules_from_commandline(orchestrator, test_args): assert orchestrator.formatters[0].name == "example_module" def test_load_settings_for_module_from_commandline(orchestrator, test_args): - args = test_args + ["--feeders", "gsheet_feeder", "--gsheet_feeder.sheet_id", "123", "--gsheet_feeder.service_account", "tests/data/test_service_account.json"] + args = test_args + ["--feeders", "gsheet_feeder_db", "--gsheet_feeder_db.sheet_id", "123", "--gsheet_feeder_db.service_account", "tests/data/test_service_account.json"] orchestrator.setup(args) assert len(orchestrator.feeders) == 1 - assert orchestrator.feeders[0].name == "gsheet_feeder" - assert orchestrator.config['gsheet_feeder']['sheet_id'] == "123" + assert orchestrator.feeders[0].name == "gsheet_feeder_db" + assert orchestrator.config['gsheet_feeder_db']['sheet_id'] == "123" def test_multiple_orchestrator(test_args): 
- o1_args = test_args + ["--feeders", "gsheet_feeder", "--gsheet_feeder.service_account", "tests/data/test_service_account.json"] + o1_args = test_args + ["--feeders", "gsheet_feeder_db", "--gsheet_feeder_db.service_account", "tests/data/test_service_account.json"] o1 = ArchivingOrchestrator() with pytest.raises(ValueError) as exit_error: - # this should fail because the gsheet_feeder requires a sheet_id / sheet + # this should fail because the gsheet_feeder_db requires a sheet_id / sheet o1.setup(o1_args) From 077b56c150a0663cc2fa4c521c9681984016b7ca Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Tue, 25 Feb 2025 21:32:32 +0000 Subject: [PATCH 03/15] Merge GSheet Feeder and Database. --- .../modules/gsheet_db/__init__.py | 1 - .../modules/gsheet_db/__manifest__.py | 38 ---- .../modules/gsheet_db/gsheet_db.py | 114 ---------- .../modules/gsheet_feeder/__init__.py | 2 - .../modules/gsheet_feeder/gsheet_feeder.py | 95 --------- .../modules/gsheet_feeder_db/__init__.py | 2 + .../__manifest__.py | 28 ++- .../gsheet_feeder_db/gsheet_feeder_db.py | 196 ++++++++++++++++++ .../gworksheet.py | 0 tests/databases/test_gsheet_db.py | 41 ++-- tests/feeders/test_gsheet_feeder.py | 28 +-- 11 files changed, 259 insertions(+), 286 deletions(-) delete mode 100644 src/auto_archiver/modules/gsheet_db/__init__.py delete mode 100644 src/auto_archiver/modules/gsheet_db/__manifest__.py delete mode 100644 src/auto_archiver/modules/gsheet_db/gsheet_db.py delete mode 100644 src/auto_archiver/modules/gsheet_feeder/__init__.py delete mode 100644 src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py create mode 100644 src/auto_archiver/modules/gsheet_feeder_db/__init__.py rename src/auto_archiver/modules/{gsheet_feeder => gsheet_feeder_db}/__manifest__.py (69%) create mode 100644 src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py rename src/auto_archiver/modules/{gsheet_feeder => gsheet_feeder_db}/gworksheet.py (100%) diff --git a/src/auto_archiver/modules/gsheet_db/__init__.py 
b/src/auto_archiver/modules/gsheet_db/__init__.py deleted file mode 100644 index 01fdee6..0000000 --- a/src/auto_archiver/modules/gsheet_db/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .gsheet_db import GsheetsDb \ No newline at end of file diff --git a/src/auto_archiver/modules/gsheet_db/__manifest__.py b/src/auto_archiver/modules/gsheet_db/__manifest__.py deleted file mode 100644 index cf95245..0000000 --- a/src/auto_archiver/modules/gsheet_db/__manifest__.py +++ /dev/null @@ -1,38 +0,0 @@ -{ - "name": "Google Sheets Database", - "type": ["database"], - "entry_point": "gsheet_db::GsheetsDb", - "requires_setup": True, - "dependencies": { - "python": ["loguru", "gspread", "slugify"], - }, - "configs": { - "allow_worksheets": { - "default": set(), - "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed", - }, - "block_worksheets": { - "default": set(), - "help": "(CSV) explicitly block some worksheets from being processed", - }, - "use_sheet_names_in_stored_paths": { - "default": True, - "type": "bool", - "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'", - } - }, - "description": """ - GsheetsDatabase: - Handles integration with Google Sheets for tracking archival tasks. - -### Features -- Updates a Google Sheet with the status of the archived URLs, including in progress, success or failure, and method used. -- Saves metadata such as title, text, timestamp, hashes, screenshots, and media URLs to designated columns. -- Formats media-specific metadata, such as thumbnails and PDQ hashes for the sheet. -- Skips redundant updates for empty or invalid data fields. - -### Notes -- Currently works only with metadata provided by GsheetFeeder. -- Requires configuration of a linked Google Sheet and appropriate API credentials. 
- """ -} diff --git a/src/auto_archiver/modules/gsheet_db/gsheet_db.py b/src/auto_archiver/modules/gsheet_db/gsheet_db.py deleted file mode 100644 index c19f2ae..0000000 --- a/src/auto_archiver/modules/gsheet_db/gsheet_db.py +++ /dev/null @@ -1,114 +0,0 @@ -from typing import Union, Tuple -from urllib.parse import quote - -from loguru import logger - -from auto_archiver.core import Database -from auto_archiver.core import Metadata, Media -from auto_archiver.modules.gsheet_feeder import GWorksheet -from auto_archiver.utils.misc import get_current_timestamp - - -class GsheetsDb(Database): - """ - NB: only works if GsheetFeeder is used. - could be updated in the future to support non-GsheetFeeder metadata - """ - - def started(self, item: Metadata) -> None: - logger.warning(f"STARTED {item}") - gw, row = self._retrieve_gsheet(item) - gw.set_cell(row, "status", "Archive in progress") - - def failed(self, item: Metadata, reason: str) -> None: - logger.error(f"FAILED {item}") - self._safe_status_update(item, f"Archive failed {reason}") - - def aborted(self, item: Metadata) -> None: - logger.warning(f"ABORTED {item}") - self._safe_status_update(item, "") - - def fetch(self, item: Metadata) -> Union[Metadata, bool]: - """check if the given item has been archived already""" - return False - - def done(self, item: Metadata, cached: bool = False) -> None: - """archival result ready - should be saved to DB""" - logger.success(f"DONE {item.get_url()}") - gw, row = self._retrieve_gsheet(item) - # self._safe_status_update(item, 'done') - - cell_updates = [] - row_values = gw.get_row(row) - - def batch_if_valid(col, val, final_value=None): - final_value = final_value or val - try: - if val and gw.col_exists(col) and gw.get_cell(row_values, col) == "": - cell_updates.append((row, col, final_value)) - except Exception as e: - logger.error(f"Unable to batch {col}={final_value} due to {e}") - - status_message = item.status - if cached: - status_message = f"[cached] {status_message}" - 
cell_updates.append((row, "status", status_message)) - - media: Media = item.get_final_media() - if hasattr(media, "urls"): - batch_if_valid("archive", "\n".join(media.urls)) - batch_if_valid("date", True, get_current_timestamp()) - batch_if_valid("title", item.get_title()) - batch_if_valid("text", item.get("content", "")) - batch_if_valid("timestamp", item.get_timestamp()) - if media: - batch_if_valid("hash", media.get("hash", "not-calculated")) - - # merge all pdq hashes into a single string, if present - pdq_hashes = [] - all_media = item.get_all_media() - for m in all_media: - if pdq := m.get("pdq_hash"): - pdq_hashes.append(pdq) - if len(pdq_hashes): - batch_if_valid("pdq_hash", ",".join(pdq_hashes)) - - if (screenshot := item.get_media_by_id("screenshot")) and hasattr( - screenshot, "urls" - ): - batch_if_valid("screenshot", "\n".join(screenshot.urls)) - - if thumbnail := item.get_first_image("thumbnail"): - if hasattr(thumbnail, "urls"): - batch_if_valid("thumbnail", f'=IMAGE("{thumbnail.urls[0]}")') - - if browsertrix := item.get_media_by_id("browsertrix"): - batch_if_valid("wacz", "\n".join(browsertrix.urls)) - batch_if_valid( - "replaywebpage", - "\n".join( - [ - f"https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}" - for wacz in browsertrix.urls - ] - ), - ) - - gw.batch_set_cell(cell_updates) - - def _safe_status_update(self, item: Metadata, new_status: str) -> None: - try: - gw, row = self._retrieve_gsheet(item) - gw.set_cell(row, "status", new_status) - except Exception as e: - logger.debug(f"Unable to update sheet: {e}") - - def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]: - - if gsheet := item.get_context("gsheet"): - gw: GWorksheet = gsheet.get("worksheet") - row: int = gsheet.get("row") - elif self.sheet_id: - logger.error(f"Unable to retrieve Gsheet for {item.get_url()}, GsheetDB must be used alongside GsheetFeeder.") - - return gw, row diff --git 
a/src/auto_archiver/modules/gsheet_feeder/__init__.py b/src/auto_archiver/modules/gsheet_feeder/__init__.py deleted file mode 100644 index bb4230a..0000000 --- a/src/auto_archiver/modules/gsheet_feeder/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .gworksheet import GWorksheet -from .gsheet_feeder import GsheetsFeeder \ No newline at end of file diff --git a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py deleted file mode 100644 index 2026804..0000000 --- a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py +++ /dev/null @@ -1,95 +0,0 @@ -""" -GsheetsFeeder: A Google Sheets-based feeder for the Auto Archiver. - -This reads data from Google Sheets and filters rows based on user-defined rules. -The filtered rows are processed into `Metadata` objects. - -### Key properties -- validates the sheet's structure and filters rows based on input configurations. -- Ensures only rows with valid URLs and unprocessed statuses are included. -""" -import os -import gspread - -from loguru import logger -from slugify import slugify - -from auto_archiver.core import Feeder -from auto_archiver.core import Metadata -from . 
import GWorksheet - - -class GsheetsFeeder(Feeder): - - def setup(self) -> None: - self.gsheets_client = gspread.service_account(filename=self.service_account) - # TODO mv to validators - if not self.sheet and not self.sheet_id: - raise ValueError("You need to define either a 'sheet' name or a 'sheet_id' in your manifest.") - - def open_sheet(self): - if self.sheet: - return self.gsheets_client.open(self.sheet) - else: # self.sheet_id - return self.gsheets_client.open_by_key(self.sheet_id) - - def __iter__(self) -> Metadata: - sh = self.open_sheet() - for ii, worksheet in enumerate(sh.worksheets()): - if not self.should_process_sheet(worksheet.title): - logger.debug(f"SKIPPED worksheet '{worksheet.title}' due to allow/block rules") - continue - logger.info(f'Opening worksheet {ii=}: {worksheet.title=} header={self.header}') - gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns) - if len(missing_cols := self.missing_required_columns(gw)): - logger.warning(f"SKIPPED worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}") - continue - - # process and yield metadata here: - yield from self._process_rows(gw) - logger.success(f'Finished worksheet {worksheet.title}') - - def _process_rows(self, gw: GWorksheet): - for row in range(1 + self.header, gw.count_rows() + 1): - url = gw.get_cell(row, 'url').strip() - if not len(url): continue - original_status = gw.get_cell(row, 'status') - status = gw.get_cell(row, 'status', fresh=original_status in ['', None]) - # TODO: custom status parser(?) 
aka should_retry_from_status - if status not in ['', None]: continue - - # All checks done - archival process starts here - m = Metadata().set_url(url) - self._set_context(m, gw, row) - yield m - - def _set_context(self, m: Metadata, gw: GWorksheet, row: int) -> Metadata: - - m.set_context("gsheet", {"row": row, "worksheet": gw}) - - if gw.get_cell_or_default(row, 'folder', "") is None: - folder = '' - else: - folder = slugify(gw.get_cell_or_default(row, 'folder', "").strip()) - if len(folder): - if self.use_sheet_names_in_stored_paths: - m.set_context("folder", os.path.join(folder, slugify(self.sheet), slugify(gw.wks.title))) - else: - m.set_context("folder", folder) - - - def should_process_sheet(self, sheet_name: str) -> bool: - if len(self.allow_worksheets) and sheet_name not in self.allow_worksheets: - # ALLOW rules exist AND sheet name not explicitly allowed - return False - if len(self.block_worksheets) and sheet_name in self.block_worksheets: - # BLOCK rules exist AND sheet name is blocked - return False - return True - - def missing_required_columns(self, gw: GWorksheet) -> list: - missing = [] - for required_col in ['url', 'status']: - if not gw.col_exists(required_col): - missing.append(required_col) - return missing diff --git a/src/auto_archiver/modules/gsheet_feeder_db/__init__.py b/src/auto_archiver/modules/gsheet_feeder_db/__init__.py new file mode 100644 index 0000000..2e9ac02 --- /dev/null +++ b/src/auto_archiver/modules/gsheet_feeder_db/__init__.py @@ -0,0 +1,2 @@ +from .gworksheet import GWorksheet +from .gsheet_feeder_db import GsheetsFeederDB \ No newline at end of file diff --git a/src/auto_archiver/modules/gsheet_feeder/__manifest__.py b/src/auto_archiver/modules/gsheet_feeder_db/__manifest__.py similarity index 69% rename from src/auto_archiver/modules/gsheet_feeder/__manifest__.py rename to src/auto_archiver/modules/gsheet_feeder_db/__manifest__.py index 130b9f6..bb2f447 100644 --- a/src/auto_archiver/modules/gsheet_feeder/__manifest__.py 
+++ b/src/auto_archiver/modules/gsheet_feeder_db/__manifest__.py @@ -1,7 +1,7 @@ { - "name": "Google Sheets Feeder", - "type": ["feeder"], - "entry_point": "gsheet_feeder::GsheetsFeeder", + "name": "Google Sheets Feeder Database", + "type": ["feeder", "database"], + "entry_point": "gsheet_feeder_db::GsheetsFeederDB", "requires_setup": True, "dependencies": { "python": ["loguru", "gspread", "slugify"], @@ -51,10 +51,23 @@ "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'", "type": "bool", }, + "allow_worksheets": { + "default": set(), + "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed", + }, + "block_worksheets": { + "default": set(), + "help": "(CSV) explicitly block some worksheets from being processed", + }, + "use_sheet_names_in_stored_paths": { + "default": True, + "type": "bool", + "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'", + } }, "description": """ - GsheetsFeeder - A Google Sheets-based feeder for the Auto Archiver. + GsheetsFeederDatabase + A Google Sheets-based feeder and optional database for the Auto Archiver. This reads data from Google Sheets and filters rows based on user-defined rules. The filtered rows are processed into `Metadata` objects. @@ -64,11 +77,16 @@ - Processes only worksheets allowed by the `allow_worksheets` and `block_worksheets` configurations. - Ensures only rows with valid URLs and unprocessed statuses are included for archival. - Supports organizing stored files into folder paths based on sheet and worksheet names. + - If the database is enabled, this updates the Google Sheet with the status of the archived URLs, including in progress, success or failure, and method used. + - Saves metadata such as title, text, timestamp, hashes, screenshots, and media URLs to designated columns. + - Formats media-specific metadata, such as thumbnails and PDQ hashes for the sheet. 
+ - Skips redundant updates for empty or invalid data fields. ### Setup - Requires a Google Service Account JSON file for authentication, which should be stored in `secrets/gsheets_service_account.json`. To set up a service account, follow the instructions [here](https://gspread.readthedocs.io/en/latest/oauth2.html). - Define the `sheet` or `sheet_id` configuration to specify the sheet to archive. - Customize the column names in your Google sheet using the `columns` configuration. + - The Google Sheet can be used soley as a feeder or as a feeder and database, but note you can't currently feed into the database from an alternate feeder. """, } diff --git a/src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py b/src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py new file mode 100644 index 0000000..406eeb4 --- /dev/null +++ b/src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py @@ -0,0 +1,196 @@ +""" +GsheetsFeeder: A Google Sheets-based feeder for the Auto Archiver. + +This reads data from Google Sheets and filters rows based on user-defined rules. +The filtered rows are processed into `Metadata` objects. + +### Key properties +- validates the sheet's structure and filters rows based on input configurations. +- Ensures only rows with valid URLs and unprocessed statuses are included. 
+""" +import os +from typing import Tuple, Union +from urllib.parse import quote + +import gspread +from loguru import logger +from slugify import slugify + +from auto_archiver.core import Feeder, Database, Media +from auto_archiver.core import Metadata +from auto_archiver.modules.gsheet_feeder_db import GWorksheet +from auto_archiver.utils.misc import calculate_file_hash, get_current_timestamp + + +class GsheetsFeederDB(Feeder, Database): + + def setup(self) -> None: + self.gsheets_client = gspread.service_account(filename=self.service_account) + # TODO mv to validators + if not self.sheet and not self.sheet_id: + raise ValueError("You need to define either a 'sheet' name or a 'sheet_id' in your manifest.") + + def open_sheet(self): + if self.sheet: + return self.gsheets_client.open(self.sheet) + else: # self.sheet_id + return self.gsheets_client.open_by_key(self.sheet_id) + + def __iter__(self) -> Metadata: + sh = self.open_sheet() + for ii, worksheet in enumerate(sh.worksheets()): + if not self.should_process_sheet(worksheet.title): + logger.debug(f"SKIPPED worksheet '{worksheet.title}' due to allow/block rules") + continue + logger.info(f'Opening worksheet {ii=}: {worksheet.title=} header={self.header}') + gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns) + if len(missing_cols := self.missing_required_columns(gw)): + logger.warning(f"SKIPPED worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}") + continue + + # process and yield metadata here: + yield from self._process_rows(gw) + logger.success(f'Finished worksheet {worksheet.title}') + + def _process_rows(self, gw: GWorksheet): + for row in range(1 + self.header, gw.count_rows() + 1): + url = gw.get_cell(row, 'url').strip() + if not len(url): continue + original_status = gw.get_cell(row, 'status') + status = gw.get_cell(row, 'status', fresh=original_status in ['', None]) + # TODO: custom status parser(?) 
aka should_retry_from_status + if status not in ['', None]: continue + + # All checks done - archival process starts here + m = Metadata().set_url(url) + self._set_context(m, gw, row) + yield m + + def _set_context(self, m: Metadata, gw: GWorksheet, row: int) -> Metadata: + # TODO: Check folder value not being recognised + m.set_context("gsheet", {"row": row, "worksheet": gw}) + + if gw.get_cell_or_default(row, 'folder', "") is None: + folder = '' + else: + folder = slugify(gw.get_cell_or_default(row, 'folder', "").strip()) + if len(folder): + if self.use_sheet_names_in_stored_paths: + m.set_context("folder", os.path.join(folder, slugify(self.sheet), slugify(gw.wks.title))) + else: + m.set_context("folder", folder) + + def should_process_sheet(self, sheet_name: str) -> bool: + if len(self.allow_worksheets) and sheet_name not in self.allow_worksheets: + # ALLOW rules exist AND sheet name not explicitly allowed + return False + if len(self.block_worksheets) and sheet_name in self.block_worksheets: + # BLOCK rules exist AND sheet name is blocked + return False + return True + + def missing_required_columns(self, gw: GWorksheet) -> list: + missing = [] + for required_col in ['url', 'status']: + if not gw.col_exists(required_col): + missing.append(required_col) + return missing + + + def started(self, item: Metadata) -> None: + logger.warning(f"STARTED {item}") + gw, row = self._retrieve_gsheet(item) + gw.set_cell(row, "status", "Archive in progress") + + def failed(self, item: Metadata, reason: str) -> None: + logger.error(f"FAILED {item}") + self._safe_status_update(item, f"Archive failed {reason}") + + def aborted(self, item: Metadata) -> None: + logger.warning(f"ABORTED {item}") + self._safe_status_update(item, "") + + def fetch(self, item: Metadata) -> Union[Metadata, bool]: + """check if the given item has been archived already""" + return False + + def done(self, item: Metadata, cached: bool = False) -> None: + """archival result ready - should be saved to DB""" 
+ logger.success(f"DONE {item.get_url()}") + gw, row = self._retrieve_gsheet(item) + # self._safe_status_update(item, 'done') + + cell_updates = [] + row_values = gw.get_row(row) + + def batch_if_valid(col, val, final_value=None): + final_value = final_value or val + try: + if val and gw.col_exists(col) and gw.get_cell(row_values, col) == "": + cell_updates.append((row, col, final_value)) + except Exception as e: + logger.error(f"Unable to batch {col}={final_value} due to {e}") + + status_message = item.status + if cached: + status_message = f"[cached] {status_message}" + cell_updates.append((row, "status", status_message)) + + media: Media = item.get_final_media() + if hasattr(media, "urls"): + batch_if_valid("archive", "\n".join(media.urls)) + batch_if_valid("date", True, get_current_timestamp()) + batch_if_valid("title", item.get_title()) + batch_if_valid("text", item.get("content", "")) + batch_if_valid("timestamp", item.get_timestamp()) + if media: + batch_if_valid("hash", media.get("hash", "not-calculated")) + + # merge all pdq hashes into a single string, if present + pdq_hashes = [] + all_media = item.get_all_media() + for m in all_media: + if pdq := m.get("pdq_hash"): + pdq_hashes.append(pdq) + if len(pdq_hashes): + batch_if_valid("pdq_hash", ",".join(pdq_hashes)) + + if (screenshot := item.get_media_by_id("screenshot")) and hasattr( + screenshot, "urls" + ): + batch_if_valid("screenshot", "\n".join(screenshot.urls)) + + if thumbnail := item.get_first_image("thumbnail"): + if hasattr(thumbnail, "urls"): + batch_if_valid("thumbnail", f'=IMAGE("{thumbnail.urls[0]}")') + + if browsertrix := item.get_media_by_id("browsertrix"): + batch_if_valid("wacz", "\n".join(browsertrix.urls)) + batch_if_valid( + "replaywebpage", + "\n".join( + [ + f"https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}" + for wacz in browsertrix.urls + ] + ), + ) + + gw.batch_set_cell(cell_updates) + + def _safe_status_update(self, item: Metadata, new_status: 
str) -> None: + try: + gw, row = self._retrieve_gsheet(item) + gw.set_cell(row, "status", new_status) + except Exception as e: + logger.debug(f"Unable to update sheet: {e}") + + def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]: + + if gsheet := item.get_context("gsheet"): + gw: GWorksheet = gsheet.get("worksheet") + row: int = gsheet.get("row") + elif self.sheet_id: + logger.error(f"Unable to retrieve Gsheet for {item.get_url()}, GsheetDB must be used alongside GsheetFeeder.") + + return gw, row diff --git a/src/auto_archiver/modules/gsheet_feeder/gworksheet.py b/src/auto_archiver/modules/gsheet_feeder_db/gworksheet.py similarity index 100% rename from src/auto_archiver/modules/gsheet_feeder/gworksheet.py rename to src/auto_archiver/modules/gsheet_feeder_db/gworksheet.py diff --git a/tests/databases/test_gsheet_db.py b/tests/databases/test_gsheet_db.py index 8b49e5a..2f1202d 100644 --- a/tests/databases/test_gsheet_db.py +++ b/tests/databases/test_gsheet_db.py @@ -2,8 +2,7 @@ from datetime import datetime, timezone import pytest from auto_archiver.core import Metadata, Media -from auto_archiver.modules.gsheet_db import GsheetsDb -from auto_archiver.modules.gsheet_feeder import GWorksheet +from auto_archiver.modules.gsheet_feeder_db import GsheetsFeederDB, GWorksheet @pytest.fixture @@ -32,8 +31,9 @@ def mock_metadata(mocker): @pytest.fixture def metadata(): metadata = Metadata() - metadata.add_media(Media(filename="screenshot.png", urls=["http://example.com/screenshot.png"]).set("id", "screenshot")) - metadata.add_media(Media(filename="browsertrix", urls=["http://example.com/browsertrix.wacz"]).set("id", "browsertrix")) + metadata.add_media(Media(filename="screenshot", urls=["http://example.com/screenshot.png"])) + metadata.add_media(Media(filename="browsertrix", urls=["http://example.com/browsertrix.wacz"])) + metadata.add_media(Media(filename="thumbnail", urls=["http://example.com/thumbnail.png"])) metadata.set_url("http://example.com") 
metadata.set_title("Example Title") metadata.set_content("Example Content") @@ -52,12 +52,19 @@ def mock_media(mocker): return mock_media @pytest.fixture -def gsheets_db(mock_gworksheet, setup_module, mocker) -> GsheetsDb: - db = setup_module("gsheet_db", { - "allow_worksheets": "set()", - "block_worksheets": "set()", - "use_sheet_names_in_stored_paths": "True", - }) +def gsheets_db(mock_gworksheet, setup_module, mocker): + mocker.patch("gspread.service_account") + config: dict = { + "sheet": "testsheet", + "sheet_id": None, + "header": 1, + "service_account": "test/service_account.json", + "columns": {'url': 'link', 'status': 'archive status', 'folder': 'destination folder', 'archive': 'archive location', 'date': 'archive date', 'thumbnail': 'thumbnail', 'timestamp': 'upload timestamp', 'title': 'upload title', 'text': 'text content', 'screenshot': 'screenshot', 'hash': 'hash', 'pdq_hash': 'perceptual hashes', 'wacz': 'wacz', 'replaywebpage': 'replaywebpage'}, + "allow_worksheets": set(), + "block_worksheets": set(), + "use_sheet_names_in_stored_paths": True, + } + db = setup_module("gsheet_feeder_db", config) db._retrieve_gsheet = mocker.MagicMock(return_value=(mock_gworksheet, 1)) return db @@ -79,10 +86,10 @@ def expected_calls(mock_media, fixed_timestamp): (1, 'text', 'Example Content'), (1, 'timestamp', '2025-01-01T00:00:00+00:00'), (1, 'hash', 'not-calculated'), - (1, 'screenshot', 'http://example.com/screenshot.png'), - (1, 'thumbnail', '=IMAGE("http://example.com/screenshot.png")'), - (1, 'wacz', 'http://example.com/browsertrix.wacz'), - (1, 'replaywebpage', 'https://replayweb.page/?source=http%3A//example.com/browsertrix.wacz#view=pages&url=http%3A//example.com') + # (1, 'screenshot', 'http://example.com/screenshot.png'), + # (1, 'thumbnail', '=IMAGE("http://example.com/thumbnail.png")'), + # (1, 'wacz', 'http://example.com/browsertrix.wacz'), + # (1, 'replaywebpage', 
'https://replayweb.page/?source=http%3A%2F%2Fexample.com%2Fbrowsertrix.wacz#view=pages&url=') ] def test_retrieve_gsheet(gsheets_db, metadata, mock_gworksheet): @@ -107,13 +114,13 @@ def test_aborted(gsheets_db, mock_metadata, mock_gworksheet): def test_done(gsheets_db, metadata, mock_gworksheet, expected_calls, mocker): - mocker.patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00') + mocker.patch("auto_archiver.modules.gsheet_feeder_db.gsheet_feeder_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00') gsheets_db.done(metadata) mock_gworksheet.batch_set_cell.assert_called_once_with(expected_calls) def test_done_cached(gsheets_db, metadata, mock_gworksheet, mocker): - mocker.patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00') + mocker.patch("auto_archiver.modules.gsheet_feeder_db.gsheet_feeder_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00') gsheets_db.done(metadata, cached=True) # Verify the status message includes "[cached]" @@ -124,7 +131,7 @@ def test_done_cached(gsheets_db, metadata, mock_gworksheet, mocker): def test_done_missing_media(gsheets_db, metadata, mock_gworksheet, mocker): # clear media from metadata metadata.media = [] - mocker.patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00') + mocker.patch("auto_archiver.modules.gsheet_feeder_db.gsheet_feeder_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00') gsheets_db.done(metadata) # Verify nothing media-related gets updated call_args = mock_gworksheet.batch_set_cell.call_args[0][0] diff --git a/tests/feeders/test_gsheet_feeder.py b/tests/feeders/test_gsheet_feeder.py index ef150d1..9ca81b0 100644 --- a/tests/feeders/test_gsheet_feeder.py +++ b/tests/feeders/test_gsheet_feeder.py @@ -2,7 +2,7 @@ from typing import Type import gspread import pytest -from 
auto_archiver.modules.gsheet_feeder import GsheetsFeeder +from auto_archiver.modules.gsheet_feeder_db import GsheetsFeederDB from auto_archiver.core import Metadata, Feeder @@ -11,13 +11,13 @@ def test_setup_without_sheet_and_sheet_id(setup_module, mocker): mocker.patch("gspread.service_account") with pytest.raises(ValueError): setup_module( - "gsheet_feeder", + "gsheet_feeder_db", {"service_account": "dummy.json", "sheet": None, "sheet_id": None}, ) @pytest.fixture -def gsheet_feeder(setup_module, mocker) -> GsheetsFeeder: +def gsheet_feeder(setup_module, mocker) -> GsheetsFeederDB: config: dict = { "service_account": "dummy.json", "sheet": "test-auto-archiver", @@ -45,7 +45,7 @@ def gsheet_feeder(setup_module, mocker) -> GsheetsFeeder: } mocker.patch("gspread.service_account") feeder = setup_module( - "gsheet_feeder", + "gsheet_feeder_db", config ) feeder.gsheets_client = mocker.MagicMock() @@ -90,7 +90,7 @@ class MockWorksheet: return matching.get(col_name, default) -def test__process_rows(gsheet_feeder: GsheetsFeeder): +def test__process_rows(gsheet_feeder: GsheetsFeederDB): testworksheet = MockWorksheet() metadata_items = list(gsheet_feeder._process_rows(testworksheet)) assert len(metadata_items) == 3 @@ -98,7 +98,7 @@ def test__process_rows(gsheet_feeder: GsheetsFeeder): assert metadata_items[0].get("url") == "http://example.com" -def test__set_metadata(gsheet_feeder: GsheetsFeeder): +def test__set_metadata(gsheet_feeder: GsheetsFeederDB): worksheet = MockWorksheet() metadata = Metadata() gsheet_feeder._set_context(metadata, worksheet, 1) @@ -106,12 +106,12 @@ def test__set_metadata(gsheet_feeder: GsheetsFeeder): @pytest.mark.skip(reason="Not recognising folder column") -def test__set_metadata_with_folder_pickled(gsheet_feeder: GsheetsFeeder, worksheet): +def test__set_metadata_with_folder_pickled(gsheet_feeder: GsheetsFeederDB, worksheet): gsheet_feeder._set_context(worksheet, 7) assert Metadata.get_context("gsheet") == {"row": 1, "worksheet": worksheet} 
-def test__set_metadata_with_folder(gsheet_feeder: GsheetsFeeder): +def test__set_metadata_with_folder(gsheet_feeder: GsheetsFeederDB): testworksheet = MockWorksheet() metadata = Metadata() testworksheet.wks.title = "TestSheet" @@ -140,7 +140,7 @@ def test_open_sheet_with_name_or_id( # Setup module with parameterized values feeder = setup_module( - "gsheet_feeder", + "gsheet_feeder_db", {"service_account": "dummy.json", "sheet": sheet, "sheet_id": sheet_id}, ) sheet_result = feeder.open_sheet() @@ -159,7 +159,7 @@ def test_open_sheet_with_sheet_id(setup_module, mocker): mock_service_account.return_value = mock_client mock_client.open_by_key.return_value = "MockSheet" feeder = setup_module( - "gsheet_feeder", + "gsheet_feeder_db", {"service_account": "dummy.json", "sheet": None, "sheet_id": "ABC123"}, ) sheet = feeder.open_sheet() @@ -170,7 +170,7 @@ def test_open_sheet_with_sheet_id(setup_module, mocker): def test_should_process_sheet(setup_module, mocker): mocker.patch("gspread.service_account") gdb = setup_module( - "gsheet_feeder", + "gsheet_feeder_db", { "service_account": "dummy.json", "sheet": "TestSheet", @@ -187,10 +187,10 @@ def test_should_process_sheet(setup_module, mocker): @pytest.mark.skip(reason="Requires a real connection") class TestGSheetsFeederReal: - """Testing GSheetsFeeder class""" + """Testing GsheetsFeeder class""" - module_name: str = "gsheet_feeder" - feeder: GsheetsFeeder + module_name: str = "gsheet_feeder_db" + feeder: GsheetsFeederDB # You must follow the setup process explain in the docs for this to work config: dict = { "service_account": "secrets/service_account.json", From d775e4612e7eef9746a64d5e7fec9fcb03ad1d3b Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Tue, 25 Feb 2025 21:38:41 +0000 Subject: [PATCH 04/15] Update gsheet_feeder references in tests. 
--- tests/feeders/test_gworksheet.py | 2 +- tests/test_orchestrator.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/feeders/test_gworksheet.py b/tests/feeders/test_gworksheet.py index 2b05504..b6a0b5c 100644 --- a/tests/feeders/test_gworksheet.py +++ b/tests/feeders/test_gworksheet.py @@ -1,7 +1,7 @@ # Note this isn't a feeder, but contained as utility of the gsheet feeder module import pytest -from auto_archiver.modules.gsheet_feeder import GWorksheet +from auto_archiver.modules.gsheet_feeder_db import GWorksheet class TestGWorksheet: diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py index 752adb8..72f4949 100644 --- a/tests/test_orchestrator.py +++ b/tests/test_orchestrator.py @@ -78,7 +78,7 @@ def test_help(orchestrator, basic_parser, capsys): assert "--logging.level" in logs # individual module configs - assert "--gsheet_feeder.sheet_id" in logs + assert "--gsheet_feeder_db.sheet_id" in logs def test_add_custom_modules_path(orchestrator, test_args): @@ -154,22 +154,22 @@ def test_load_modules_from_commandline(orchestrator, test_args): assert orchestrator.formatters[0].name == "example_module" def test_load_settings_for_module_from_commandline(orchestrator, test_args): - args = test_args + ["--feeders", "gsheet_feeder", "--gsheet_feeder.sheet_id", "123", "--gsheet_feeder.service_account", "tests/data/test_service_account.json"] + args = test_args + ["--feeders", "gsheet_feeder_db", "--gsheet_feeder_db.sheet_id", "123", "--gsheet_feeder_db.service_account", "tests/data/test_service_account.json"] orchestrator.setup(args) assert len(orchestrator.feeders) == 1 - assert orchestrator.feeders[0].name == "gsheet_feeder" - assert orchestrator.config['gsheet_feeder']['sheet_id'] == "123" + assert orchestrator.feeders[0].name == "gsheet_feeder_db" + assert orchestrator.config['gsheet_feeder_db']['sheet_id'] == "123" def test_multiple_orchestrator(test_args): - o1_args = test_args + ["--feeders", "gsheet_feeder", 
"--gsheet_feeder.service_account", "tests/data/test_service_account.json"] + o1_args = test_args + ["--feeders", "gsheet_feeder_db", "--gsheet_feeder_db.service_account", "tests/data/test_service_account.json"] o1 = ArchivingOrchestrator() with pytest.raises(ValueError) as exit_error: - # this should fail because the gsheet_feeder requires a sheet_id / sheet + # this should fail because the gsheet_feeder_db requires a sheet_id / sheet o1.setup(o1_args) From d1c8d4ba0e65f00df2f0eb2740333d6d9a72eb15 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Thu, 27 Feb 2025 11:18:10 +0000 Subject: [PATCH 05/15] Initial merge of Atlos Feeder and DB --- .../modules/atlos_feeder_db/__init__.py | 1 + .../modules/atlos_feeder_db/__manifest__.py | 42 ++++++++ .../atlos_feeder_db/atlos_feeder_database.py | 100 ++++++++++++++++++ .../modules/atlos_storage/atlos_storage.py | 4 +- 4 files changed, 146 insertions(+), 1 deletion(-) create mode 100644 src/auto_archiver/modules/atlos_feeder_db/__init__.py create mode 100644 src/auto_archiver/modules/atlos_feeder_db/__manifest__.py create mode 100644 src/auto_archiver/modules/atlos_feeder_db/atlos_feeder_database.py diff --git a/src/auto_archiver/modules/atlos_feeder_db/__init__.py b/src/auto_archiver/modules/atlos_feeder_db/__init__.py new file mode 100644 index 0000000..67b243a --- /dev/null +++ b/src/auto_archiver/modules/atlos_feeder_db/__init__.py @@ -0,0 +1 @@ +from .atlos_feeder import AtlosFeeder \ No newline at end of file diff --git a/src/auto_archiver/modules/atlos_feeder_db/__manifest__.py b/src/auto_archiver/modules/atlos_feeder_db/__manifest__.py new file mode 100644 index 0000000..54222f6 --- /dev/null +++ b/src/auto_archiver/modules/atlos_feeder_db/__manifest__.py @@ -0,0 +1,42 @@ +{ + "name": "Atlos Feeder Database", + "type": ["feeder", "database"], +"entry_point": "atlos_feeder_db::AtlosFeederDb", + "requires_setup": True, + "dependencies": { + "python": ["loguru", "requests"], + }, + "configs": { + "api_token": { + 
"type": "str", + "required": True, + "help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/", + }, + "atlos_url": { + "default": "https://platform.atlos.org", + "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.", + "type": "str" + }, + }, + "description": """ + AtlosFeederDb: A feeder module that integrates with the Atlos API to fetch source material URLs for archival, + along with a database option to output archival results. + + Feeder: A feeder module that integrates with the Atlos API to fetch source material URLs for archival. + + ### Features + - Connects to the Atlos API to retrieve a list of source material URLs. + - Filters source materials based on visibility, processing status, and metadata. + - Converts filtered source materials into `Metadata` objects with the relevant `atlos_id` and URL. + - Iterates through paginated results using a cursor for efficient API interaction. + - Outputs archival results to the Atlos API for storage and tracking. + - Updates failure status with error details when archiving fails. + - Processes and formats metadata, including ISO formatting for datetime fields. + - Skips processing for items without an Atlos ID. + + ### Notes + - Requires an Atlos API endpoint and a valid API token for authentication. + - Ensures only unprocessed, visible, and ready-to-archive URLs are returned. + - Handles pagination transparently when retrieving data from the Atlos API. 
+ """ +} diff --git a/src/auto_archiver/modules/atlos_feeder_db/atlos_feeder_database.py b/src/auto_archiver/modules/atlos_feeder_db/atlos_feeder_database.py new file mode 100644 index 0000000..4bd3368 --- /dev/null +++ b/src/auto_archiver/modules/atlos_feeder_db/atlos_feeder_database.py @@ -0,0 +1,100 @@ +import requests +from typing import Union + +from loguru import logger + +from auto_archiver.core import Database + +from auto_archiver.core import Feeder +from auto_archiver.core import Metadata + + +class AtlosFeederDb(Feeder, Database): + + def __iter__(self) -> Metadata: + # Get all the urls from the Atlos API + count = 0 + cursor = None + while True: + response = requests.get( + f"{self.atlos_url}/api/v2/source_material", + headers={"Authorization": f"Bearer {self.api_token}"}, + params={"cursor": cursor}, + ) + data = response.json() + response.raise_for_status() + cursor = data["next"] + + for item in data["results"]: + if ( + item["source_url"] not in [None, ""] + and ( + item["metadata"] + .get("auto_archiver", {}) + .get("processed", False) + != True + ) + and item["visibility"] == "visible" + and item["status"] not in ["processing", "pending"] + ): + yield Metadata().set_url(item["source_url"]).set( + "atlos_id", item["id"] + ) + count += 1 + + if len(data["results"]) == 0 or cursor is None: + break + + + def failed(self, item: Metadata, reason: str) -> None: + """Update DB accordingly for failure""" + # If the item has no Atlos ID, there's nothing for us to do + if not item.metadata.get("atlos_id"): + logger.info(f"Item {item.get_url()} has no Atlos ID, skipping") + return + + requests.post( + f"{self.atlos_url}/api/v2/source_material/metadata/{item.metadata['atlos_id']}/auto_archiver", + headers={"Authorization": f"Bearer {self.api_token}"}, + json={"metadata": {"processed": True, "status": "error", "error": reason}}, + ).raise_for_status() + logger.info( + f"Stored failure for {item.get_url()} (ID {item.metadata['atlos_id']}) on Atlos: {reason}" + ) 
+ + def fetch(self, item: Metadata) -> Union[Metadata, bool]: + """check and fetch if the given item has been archived already, each + database should handle its own caching, and configuration mechanisms""" + return False + + def _process_metadata(self, item: Metadata) -> dict: + """Process metadata for storage on Atlos. Will convert any datetime + objects to ISO format.""" + + return { + k: v.isoformat() if hasattr(v, "isoformat") else v + for k, v in item.metadata.items() + } + + def done(self, item: Metadata, cached: bool = False) -> None: + """archival result ready - should be saved to DB""" + + if not item.metadata.get("atlos_id"): + logger.info(f"Item {item.get_url()} has no Atlos ID, skipping") + return + + requests.post( + f"{self.atlos_url}/api/v2/source_material/metadata/{item.metadata['atlos_id']}/auto_archiver", + headers={"Authorization": f"Bearer {self.api_token}"}, + json={ + "metadata": dict( + processed=True, + status="success", + results=self._process_metadata(item), + ) + }, + ).raise_for_status() + + logger.info( + f"Stored success for {item.get_url()} (ID {item.metadata['atlos_id']}) on Atlos" + ) diff --git a/src/auto_archiver/modules/atlos_storage/atlos_storage.py b/src/auto_archiver/modules/atlos_storage/atlos_storage.py index f8eef68..86af9c6 100644 --- a/src/auto_archiver/modules/atlos_storage/atlos_storage.py +++ b/src/auto_archiver/modules/atlos_storage/atlos_storage.py @@ -7,6 +7,7 @@ from loguru import logger from auto_archiver.core import Media, Metadata from auto_archiver.core import Storage +from auto_archiver.utils import calculate_file_hash class AtlosStorage(Storage): @@ -37,7 +38,8 @@ class AtlosStorage(Storage): return False media_hash = self._hash(media) - + # media_hash = calculate_file_hash(media.filename, hash_algo=hashlib.sha256, chunksize=4096) + # Check whether the media has already been uploaded source_material = requests.get( f"{self.atlos_url}/api/v2/source_material/{atlos_id}", From 
6cb7afefdc38db53296fc9ccd9b20d9fbd3c5704 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Wed, 5 Mar 2025 10:24:54 +0000 Subject: [PATCH 06/15] Initial Atlos merge --- poetry.lock | 50 +++++++------- .../__manifest__.py | 11 ++-- .../atlos_feeder_db_storage.py} | 66 ++++++++++++++++++- 3 files changed, 93 insertions(+), 34 deletions(-) rename src/auto_archiver/modules/{atlos_feeder_db => atlos_feeder_db_storage}/__manifest__.py (79%) rename src/auto_archiver/modules/{atlos_feeder_db/atlos_feeder_database.py => atlos_feeder_db_storage/atlos_feeder_db_storage.py} (58%) diff --git a/poetry.lock b/poetry.lock index 2855bb5..16ec2f8 100644 --- a/poetry.lock +++ b/poetry.lock @@ -103,14 +103,14 @@ tests-mypy = ["mypy (>=1.11.1)", "pytest-mypy-plugins"] [[package]] name = "authlib" -version = "1.5.0" +version = "1.5.1" description = "The ultimate Python library in building OAuth and OpenID Connect servers and clients." optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "Authlib-1.5.0-py2.py3-none-any.whl", hash = "sha256:b3cc5ccfc19cf87678046b6e7cb19d402d8a631a33c40e36385232203227953a"}, - {file = "authlib-1.5.0.tar.gz", hash = "sha256:8fd8bd8f806485a532ac39a17b579982cf54688f956174f995cc938a91725423"}, + {file = "authlib-1.5.1-py2.py3-none-any.whl", hash = "sha256:8408861cbd9b4ea2ff759b00b6f02fd7d81ac5a56d0b2b22c08606c6049aae11"}, + {file = "authlib-1.5.1.tar.gz", hash = "sha256:5cbc85ecb0667312c1cdc2f9095680bb735883b123fb509fde1e65b1c5df972e"}, ] [package.dependencies] @@ -172,18 +172,18 @@ lxml = ["lxml"] [[package]] name = "boto3" -version = "1.37.0" +version = "1.37.5" description = "The AWS SDK for Python" optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "boto3-1.37.0-py3-none-any.whl", hash = "sha256:03bd8c93b226f07d944fd6b022e11a307bff94ab6a21d51675d7e3ea81ee8424"}, - {file = "boto3-1.37.0.tar.gz", hash = "sha256:01015b38017876d79efd7273f35d9a4adfba505237159621365bed21b9b65eca"}, + {file = 
"boto3-1.37.5-py3-none-any.whl", hash = "sha256:12166353519aca0cc8d9dcfbbb0d38f8915955a5912b8cb241b2b2314f0dbc14"}, + {file = "boto3-1.37.5.tar.gz", hash = "sha256:ae6e7048beeaa4478368e554a4b290e3928beb0ae8d8767d108d72381a81af30"}, ] [package.dependencies] -botocore = ">=1.37.0,<1.38.0" +botocore = ">=1.37.5,<1.38.0" jmespath = ">=0.7.1,<2.0.0" s3transfer = ">=0.11.0,<0.12.0" @@ -192,14 +192,14 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "botocore" -version = "1.37.0" +version = "1.37.5" description = "Low-level, data-driven core of boto 3." optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "botocore-1.37.0-py3-none-any.whl", hash = "sha256:d01661f38c0edac87424344cdf4169f3ab9bc1bf1b677c8b230d025eb66c54a3"}, - {file = "botocore-1.37.0.tar.gz", hash = "sha256:b129d091a8360b4152ab65327186bf4e250de827c4a9b7ddf40a72b1acf1f3c1"}, + {file = "botocore-1.37.5-py3-none-any.whl", hash = "sha256:e5cfbb8026d5b4fadd9b3a18b61d238a41a8b8f620ab75873dc1467d456150d6"}, + {file = "botocore-1.37.5.tar.gz", hash = "sha256:f8f526d33ae74d242c577e0440b57b9ec7d53edd41db211155ec8087fe7a5a21"}, ] [package.dependencies] @@ -781,14 +781,14 @@ grpcio-gcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"] [[package]] name = "google-api-python-client" -version = "2.161.0" +version = "2.162.0" description = "Google API Client Library for Python" optional = false python-versions = ">=3.7" groups = ["main"] files = [ - {file = "google_api_python_client-2.161.0-py2.py3-none-any.whl", hash = "sha256:9476a5a4f200bae368140453df40f9cda36be53fa7d0e9a9aac4cdb859a26448"}, - {file = "google_api_python_client-2.161.0.tar.gz", hash = "sha256:324c0cce73e9ea0a0d2afd5937e01b7c2d6a4d7e2579cdb6c384f9699d6c9f37"}, + {file = "google_api_python_client-2.162.0-py2.py3-none-any.whl", hash = "sha256:49365fa4f7795fe81a747f5544d6528ea94314fa59664e0ea1005f603facf1ec"}, + {file = "google_api_python_client-2.162.0.tar.gz", hash = 
"sha256:5f8bc934a5b6eea73a7d12d999e6585c1823179f48340234acb385e2502e735a"}, ] [package.dependencies] @@ -860,14 +860,14 @@ tool = ["click (>=6.0.0)"] [[package]] name = "googleapis-common-protos" -version = "1.68.0" +version = "1.69.0" description = "Common protobufs used in Google APIs" optional = false python-versions = ">=3.7" groups = ["main"] files = [ - {file = "googleapis_common_protos-1.68.0-py2.py3-none-any.whl", hash = "sha256:aaf179b2f81df26dfadac95def3b16a95064c76a5f45f07e4c68a21bb371c4ac"}, - {file = "googleapis_common_protos-1.68.0.tar.gz", hash = "sha256:95d38161f4f9af0d9423eed8fb7b64ffd2568c3464eb542ff02c5bfa1953ab3c"}, + {file = "googleapis_common_protos-1.69.0-py2.py3-none-any.whl", hash = "sha256:17835fdc4fa8da1d61cfe2d4d5d57becf7c61d4112f8d81c67eaa9d7ce43042d"}, + {file = "googleapis_common_protos-1.69.0.tar.gz", hash = "sha256:5a46d58af72846f59009b9c4710425b9af2139555c71837081706b213b298187"}, ] [package.dependencies] @@ -878,14 +878,14 @@ grpc = ["grpcio (>=1.44.0,<2.0.0.dev0)"] [[package]] name = "gspread" -version = "6.1.4" +version = "6.2.0" description = "Google Spreadsheets Python API" optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "gspread-6.1.4-py3-none-any.whl", hash = "sha256:c34781c426031a243ad154952b16f21ac56a5af90687885fbee3d1fba5280dcd"}, - {file = "gspread-6.1.4.tar.gz", hash = "sha256:b8eec27de7cadb338bb1b9f14a9be168372dee8965c0da32121816b5050ac1de"}, + {file = "gspread-6.2.0-py3-none-any.whl", hash = "sha256:7fa1a11e1ecacc6c5946fa016be05941baca8540404314f59aec963dd8ae5db3"}, + {file = "gspread-6.2.0.tar.gz", hash = "sha256:bc3d02d1c39e0b40bfc8035b4fec407aa71a17f343fc81cc7e3f75bfa6555de6"}, ] [package.dependencies] @@ -1777,14 +1777,14 @@ files = [ [[package]] name = "pytest" -version = "8.3.4" +version = "8.3.5" description = "pytest: simple powerful testing with Python" optional = false python-versions = ">=3.8" groups = ["dev"] files = [ - {file = "pytest-8.3.4-py3-none-any.whl", hash = 
"sha256:50e16d954148559c9a74109af1eaf0c945ba2d8f30f0a3d3335edde19788b6f6"}, - {file = "pytest-8.3.4.tar.gz", hash = "sha256:965370d062bce11e73868e0335abac31b4d3de0e82f4007408d242b4f8610761"}, + {file = "pytest-8.3.5-py3-none-any.whl", hash = "sha256:c69214aa47deac29fad6c2a4f590b9c4a9fdb16a403176fe154b79c0b4d4d820"}, + {file = "pytest-8.3.5.tar.gz", hash = "sha256:f4efe70cc14e511565ac476b57c279e12a855b11f48f212af1080ef2263d3845"}, ] [package.dependencies] @@ -2248,14 +2248,14 @@ files = [ [[package]] name = "s3transfer" -version = "0.11.2" +version = "0.11.3" description = "An Amazon S3 Transfer Manager" optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "s3transfer-0.11.2-py3-none-any.whl", hash = "sha256:be6ecb39fadd986ef1701097771f87e4d2f821f27f6071c872143884d2950fbc"}, - {file = "s3transfer-0.11.2.tar.gz", hash = "sha256:3b39185cb72f5acc77db1a58b6e25b977f28d20496b6e58d6813d75f464d632f"}, + {file = "s3transfer-0.11.3-py3-none-any.whl", hash = "sha256:ca855bdeb885174b5ffa95b9913622459d4ad8e331fc98eb01e6d5eb6a30655d"}, + {file = "s3transfer-0.11.3.tar.gz", hash = "sha256:edae4977e3a122445660c7c114bba949f9d191bae3b34a096f18a1c8c354527a"}, ] [package.dependencies] diff --git a/src/auto_archiver/modules/atlos_feeder_db/__manifest__.py b/src/auto_archiver/modules/atlos_feeder_db_storage/__manifest__.py similarity index 79% rename from src/auto_archiver/modules/atlos_feeder_db/__manifest__.py rename to src/auto_archiver/modules/atlos_feeder_db_storage/__manifest__.py index 54222f6..3920246 100644 --- a/src/auto_archiver/modules/atlos_feeder_db/__manifest__.py +++ b/src/auto_archiver/modules/atlos_feeder_db_storage/__manifest__.py @@ -1,7 +1,7 @@ { - "name": "Atlos Feeder Database", - "type": ["feeder", "database"], -"entry_point": "atlos_feeder_db::AtlosFeederDb", + "name": "Atlos Feeder Database Storage", + "type": ["feeder", "database", "storage"], +"entry_point": "atlos_feeder_db_storage::AtlosFeederDbStorage", "requires_setup": True, 
"dependencies": { "python": ["loguru", "requests"], @@ -19,11 +19,9 @@ }, }, "description": """ - AtlosFeederDb: A feeder module that integrates with the Atlos API to fetch source material URLs for archival, + AtlosFeederDbStorage: A module that integrates with the Atlos API to fetch source material URLs for archival, uplaod extracted media, along with a database option to output archival results. - Feeder: A feeder module that integrates with the Atlos API to fetch source material URLs for archival. - ### Features - Connects to the Atlos API to retrieve a list of source material URLs. - Filters source materials based on visibility, processing status, and metadata. @@ -33,6 +31,7 @@ - Updates failure status with error details when archiving fails. - Processes and formats metadata, including ISO formatting for datetime fields. - Skips processing for items without an Atlos ID. + - Saves media files to Atlos, organizing them into folders based on the provided path structure. ### Notes - Requires an Atlos API endpoint and a valid API token for authentication. 
diff --git a/src/auto_archiver/modules/atlos_feeder_db/atlos_feeder_database.py b/src/auto_archiver/modules/atlos_feeder_db_storage/atlos_feeder_db_storage.py similarity index 58% rename from src/auto_archiver/modules/atlos_feeder_db/atlos_feeder_database.py rename to src/auto_archiver/modules/atlos_feeder_db_storage/atlos_feeder_db_storage.py index 4bd3368..7bcd74e 100644 --- a/src/auto_archiver/modules/atlos_feeder_db/atlos_feeder_database.py +++ b/src/auto_archiver/modules/atlos_feeder_db_storage/atlos_feeder_db_storage.py @@ -1,15 +1,19 @@ -import requests +import hashlib +import os +from typing import IO, Optional from typing import Union +import requests from loguru import logger from auto_archiver.core import Database - from auto_archiver.core import Feeder +from auto_archiver.core import Media from auto_archiver.core import Metadata +from auto_archiver.core import Storage -class AtlosFeederDb(Feeder, Database): +class AtlosFeederDbStorage(Feeder, Database, Storage): def __iter__(self) -> Metadata: # Get all the urls from the Atlos API @@ -98,3 +102,59 @@ class AtlosFeederDb(Feeder, Database): logger.info( f"Stored success for {item.get_url()} (ID {item.metadata['atlos_id']}) on Atlos" ) + + def get_cdn_url(self, _media: Media) -> str: + # It's not always possible to provide an exact URL, because it's + # possible that the media once uploaded could have been copied to + # another project. + return self.atlos_url + + def _hash(self, media: Media) -> str: + # Hash the media file using sha-256. We don't use the existing auto archiver + # hash because there's no guarantee that the configuerer is using sha-256, which + # is how Atlos hashes files. 
+ + sha256 = hashlib.sha256() + with open(media.filename, "rb") as f: + while True: + buf = f.read(4096) + if not buf: break + sha256.update(buf) + return sha256.hexdigest() + + def upload(self, media: Media, metadata: Optional[Metadata] = None, **_kwargs) -> bool: + atlos_id = metadata.get("atlos_id") + if atlos_id is None: + logger.error(f"No Atlos ID found in metadata; can't store {media.filename} on Atlos") + return False + + media_hash = self._hash(media) + # media_hash = calculate_file_hash(media.filename, hash_algo=hashlib.sha256, chunksize=4096) + + # Check whether the media has already been uploaded + source_material = requests.get( + f"{self.atlos_url}/api/v2/source_material/{atlos_id}", + headers={"Authorization": f"Bearer {self.api_token}"}, + ).json()["result"] + existing_media = [x["file_hash_sha256"] for x in source_material.get("artifacts", [])] + if media_hash in existing_media: + logger.info(f"{media.filename} with SHA256 {media_hash} already uploaded to Atlos") + return True + + # Upload the media to the Atlos API + requests.post( + f"{self.atlos_url}/api/v2/source_material/upload/{atlos_id}", + headers={"Authorization": f"Bearer {self.api_token}"}, + params={ + "title": media.properties + }, + files={"file": (os.path.basename(media.filename), open(media.filename, "rb"))}, + ).raise_for_status() + + logger.info(f"Uploaded {media.filename} to Atlos with ID {atlos_id} and title {media.key}") + + return True + + # must be implemented even if unused + def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: + pass From 0f911543cd6da3cc1d2e3a883230c7d9171fd620 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Wed, 5 Mar 2025 13:49:11 +0000 Subject: [PATCH 07/15] Atlos refactor --- .../atlos_feeder_db_storage/__init__.py | 1 + .../atlos_feeder_db_storage.py | 192 +++++++++--------- 2 files changed, 92 insertions(+), 101 deletions(-) create mode 100644 src/auto_archiver/modules/atlos_feeder_db_storage/__init__.py diff --git 
a/src/auto_archiver/modules/atlos_feeder_db_storage/__init__.py b/src/auto_archiver/modules/atlos_feeder_db_storage/__init__.py new file mode 100644 index 0000000..8d62823 --- /dev/null +++ b/src/auto_archiver/modules/atlos_feeder_db_storage/__init__.py @@ -0,0 +1 @@ +from .atlos_feeder_db_storage import AtlosFeederDbStorage \ No newline at end of file diff --git a/src/auto_archiver/modules/atlos_feeder_db_storage/atlos_feeder_db_storage.py b/src/auto_archiver/modules/atlos_feeder_db_storage/atlos_feeder_db_storage.py index 7bcd74e..698cd41 100644 --- a/src/auto_archiver/modules/atlos_feeder_db_storage/atlos_feeder_db_storage.py +++ b/src/auto_archiver/modules/atlos_feeder_db_storage/atlos_feeder_db_storage.py @@ -1,70 +1,80 @@ import hashlib import os -from typing import IO, Optional -from typing import Union +from typing import IO, Iterator, Optional, Union import requests from loguru import logger -from auto_archiver.core import Database -from auto_archiver.core import Feeder -from auto_archiver.core import Media -from auto_archiver.core import Metadata -from auto_archiver.core import Storage +from auto_archiver.core import Database, Feeder, Media, Metadata, Storage +from auto_archiver.utils import calculate_file_hash class AtlosFeederDbStorage(Feeder, Database, Storage): - def __iter__(self) -> Metadata: - # Get all the urls from the Atlos API - count = 0 + @property + def session(self) -> requests.Session: + """create and return a persistent session.""" + if not hasattr(self, "_session"): + self._session = requests.Session() + return self._session + + def _get(self, endpoint: str, params: Optional[dict] = None) -> dict: + """Wrapper for GET requests to the Atlos API.""" + url = f"{self.atlos_url}{endpoint}" + response = self.session.get( + url, headers={"Authorization": f"Bearer {self.api_token}"}, params=params + ) + response.raise_for_status() + return response.json() + + def _post( + self, + endpoint: str, + json: Optional[dict] = None, + params: 
Optional[dict] = None, + files: Optional[dict] = None, + ) -> dict: + """Wrapper for POST requests to the Atlos API.""" + url = f"{self.atlos_url}{endpoint}" + response = self.session.post( + url, + headers={"Authorization": f"Bearer {self.api_token}"}, + json=json, + params=params, + files=files, + ) + response.raise_for_status() + return response.json() + + def __iter__(self) -> Iterator[Metadata]: + """Iterate over unprocessed, visible source materials from Atlos.""" cursor = None while True: - response = requests.get( - f"{self.atlos_url}/api/v2/source_material", - headers={"Authorization": f"Bearer {self.api_token}"}, - params={"cursor": cursor}, - ) - data = response.json() - response.raise_for_status() - cursor = data["next"] - - for item in data["results"]: + data = self._get("/api/v2/source_material", params={"cursor": cursor}) + cursor = data.get("next") + results = data.get("results", []) + for item in results: if ( - item["source_url"] not in [None, ""] - and ( - item["metadata"] - .get("auto_archiver", {}) - .get("processed", False) - != True - ) - and item["visibility"] == "visible" - and item["status"] not in ["processing", "pending"] + item.get("source_url") not in [None, ""] + and not item.get("metadata", {}).get("auto_archiver", {}).get("processed", False) + and item.get("visibility") == "visible" + and item.get("status") not in ["processing", "pending"] ): - yield Metadata().set_url(item["source_url"]).set( - "atlos_id", item["id"] - ) - count += 1 - - if len(data["results"]) == 0 or cursor is None: + yield Metadata().set_url(item["source_url"]).set("atlos_id", item["id"]) + if not results or cursor is None: break - def failed(self, item: Metadata, reason: str) -> None: - """Update DB accordingly for failure""" - # If the item has no Atlos ID, there's nothing for us to do - if not item.metadata.get("atlos_id"): + """Mark an item as failed in Atlos, if the ID exists.""" + atlos_id = item.metadata.get("atlos_id") + if not atlos_id: 
logger.info(f"Item {item.get_url()} has no Atlos ID, skipping") return - - requests.post( - f"{self.atlos_url}/api/v2/source_material/metadata/{item.metadata['atlos_id']}/auto_archiver", - headers={"Authorization": f"Bearer {self.api_token}"}, + self._post( + f"/api/v2/source_material/metadata/{atlos_id}/auto_archiver", json={"metadata": {"processed": True, "status": "error", "error": reason}}, - ).raise_for_status() - logger.info( - f"Stored failure for {item.get_url()} (ID {item.metadata['atlos_id']}) on Atlos: {reason}" ) + logger.info(f"Stored failure for {item.get_url()} (ID {atlos_id}) on Atlos: {reason}") def fetch(self, item: Metadata) -> Union[Metadata, bool]: """check and fetch if the given item has been archived already, each @@ -74,87 +84,67 @@ class AtlosFeederDbStorage(Feeder, Database, Storage): def _process_metadata(self, item: Metadata) -> dict: """Process metadata for storage on Atlos. Will convert any datetime objects to ISO format.""" - return { k: v.isoformat() if hasattr(v, "isoformat") else v for k, v in item.metadata.items() } def done(self, item: Metadata, cached: bool = False) -> None: - """archival result ready - should be saved to DB""" - - if not item.metadata.get("atlos_id"): + """Mark an item as successfully archived in Atlos.""" + atlos_id = item.metadata.get("atlos_id") + if not atlos_id: logger.info(f"Item {item.get_url()} has no Atlos ID, skipping") return - - requests.post( - f"{self.atlos_url}/api/v2/source_material/metadata/{item.metadata['atlos_id']}/auto_archiver", - headers={"Authorization": f"Bearer {self.api_token}"}, + self._post( + f"/api/v2/source_material/metadata/{atlos_id}/auto_archiver", json={ - "metadata": dict( - processed=True, - status="success", - results=self._process_metadata(item), - ) + "metadata": { + "processed": True, + "status": "success", + "results": self._process_metadata(item), + } }, - ).raise_for_status() - - logger.info( - f"Stored success for {item.get_url()} (ID {item.metadata['atlos_id']}) on 
Atlos" ) + logger.info(f"Stored success for {item.get_url()} (ID {atlos_id}) on Atlos") def get_cdn_url(self, _media: Media) -> str: - # It's not always possible to provide an exact URL, because it's - # possible that the media once uploaded could have been copied to - # another project. + """Return the base Atlos URL as the CDN URL.""" return self.atlos_url - def _hash(self, media: Media) -> str: - # Hash the media file using sha-256. We don't use the existing auto archiver - # hash because there's no guarantee that the configuerer is using sha-256, which - # is how Atlos hashes files. - - sha256 = hashlib.sha256() - with open(media.filename, "rb") as f: - while True: - buf = f.read(4096) - if not buf: break - sha256.update(buf) - return sha256.hexdigest() - def upload(self, media: Media, metadata: Optional[Metadata] = None, **_kwargs) -> bool: - atlos_id = metadata.get("atlos_id") - if atlos_id is None: - logger.error(f"No Atlos ID found in metadata; can't store {media.filename} on Atlos") + """Upload a media file to Atlos if it has not been uploaded already.""" + if metadata is None: + logger.error(f"No metadata provided for {media.filename}") return False - media_hash = self._hash(media) - # media_hash = calculate_file_hash(media.filename, hash_algo=hashlib.sha256, chunksize=4096) + atlos_id = metadata.get("atlos_id") + if not atlos_id: + logger.error(f"No Atlos ID found in metadata; can't store {media.filename} in Atlos.") + return False + + media_hash = calculate_file_hash(media.filename, hash_algo=hashlib.sha256, chunksize=4096) # Check whether the media has already been uploaded - source_material = requests.get( - f"{self.atlos_url}/api/v2/source_material/{atlos_id}", - headers={"Authorization": f"Bearer {self.api_token}"}, - ).json()["result"] - existing_media = [x["file_hash_sha256"] for x in source_material.get("artifacts", [])] + source_material = self._get(f"/api/v2/source_material/{atlos_id}")["result"] + existing_media = [ + 
artifact.get("file_hash_sha256") + for artifact in source_material.get("artifacts", []) + ] if media_hash in existing_media: logger.info(f"{media.filename} with SHA256 {media_hash} already uploaded to Atlos") return True # Upload the media to the Atlos API - requests.post( - f"{self.atlos_url}/api/v2/source_material/upload/{atlos_id}", - headers={"Authorization": f"Bearer {self.api_token}"}, - params={ - "title": media.properties - }, - files={"file": (os.path.basename(media.filename), open(media.filename, "rb"))}, - ).raise_for_status() - + with open(media.filename, "rb") as file_obj: + self._post( + f"/api/v2/source_material/upload/{atlos_id}", + params={"title": media.properties}, + files={"file": (os.path.basename(media.filename), file_obj)}, + ) logger.info(f"Uploaded {media.filename} to Atlos with ID {atlos_id} and title {media.key}") - return True - # must be implemented even if unused def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: - pass + """Upload a file-like object; not implemented.""" + raise NotImplementedError("uploadf method is not implemented yet.") + From b9c2f98f46c3432cf2dec11936eb797ae7dd5cfe Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Wed, 5 Mar 2025 21:24:38 +0000 Subject: [PATCH 08/15] Update Atlos tests --- .../modules/atlos_db/__init__.py | 1 - .../modules/atlos_db/__manifest__.py | 38 --------- .../modules/atlos_db/atlos_db.py | 66 --------------- .../modules/atlos_feeder/__init__.py | 1 - .../modules/atlos_feeder/__manifest__.py | 34 -------- .../modules/atlos_feeder/atlos_feeder.py | 42 ---------- .../modules/atlos_feeder_db/__init__.py | 1 - .../atlos_feeder_db_storage.py | 7 +- .../modules/atlos_storage/__init__.py | 1 - .../modules/atlos_storage/__manifest__.py | 32 ------- .../modules/atlos_storage/atlos_storage.py | 68 --------------- tests/databases/test_atlos_db.py | 47 +++++------ tests/feeders/test_atlos_feeder.py | 27 +++--- tests/storages/test_atlos_storage.py | 84 ++++++------------- 14 files 
changed, 66 insertions(+), 383 deletions(-) delete mode 100644 src/auto_archiver/modules/atlos_db/__init__.py delete mode 100644 src/auto_archiver/modules/atlos_db/__manifest__.py delete mode 100644 src/auto_archiver/modules/atlos_db/atlos_db.py delete mode 100644 src/auto_archiver/modules/atlos_feeder/__init__.py delete mode 100644 src/auto_archiver/modules/atlos_feeder/__manifest__.py delete mode 100644 src/auto_archiver/modules/atlos_feeder/atlos_feeder.py delete mode 100644 src/auto_archiver/modules/atlos_feeder_db/__init__.py delete mode 100644 src/auto_archiver/modules/atlos_storage/__init__.py delete mode 100644 src/auto_archiver/modules/atlos_storage/__manifest__.py delete mode 100644 src/auto_archiver/modules/atlos_storage/atlos_storage.py diff --git a/src/auto_archiver/modules/atlos_db/__init__.py b/src/auto_archiver/modules/atlos_db/__init__.py deleted file mode 100644 index e14d202..0000000 --- a/src/auto_archiver/modules/atlos_db/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .atlos_db import AtlosDb \ No newline at end of file diff --git a/src/auto_archiver/modules/atlos_db/__manifest__.py b/src/auto_archiver/modules/atlos_db/__manifest__.py deleted file mode 100644 index d23ff23..0000000 --- a/src/auto_archiver/modules/atlos_db/__manifest__.py +++ /dev/null @@ -1,38 +0,0 @@ -{ - "name": "Atlos Database", - "type": ["database"], - "entry_point": "atlos_db::AtlosDb", - "requires_setup": True, - "dependencies": - {"python": ["loguru", - ""], - "bin": [""]}, - "configs": { - "api_token": { - "default": None, - "help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/", - "required": True, - "type": "str", - }, - "atlos_url": { - "default": "https://platform.atlos.org", - "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.", - "type": "str" - }, - }, - "description": """ -Handles integration with the Atlos platform for managing archival results. 
- -### Features -- Outputs archival results to the Atlos API for storage and tracking. -- Updates failure status with error details when archiving fails. -- Processes and formats metadata, including ISO formatting for datetime fields. -- Skips processing for items without an Atlos ID. - -### Setup -Required configs: -- atlos_url: Base URL for the Atlos API. -- api_token: Authentication token for API access. -""" -, -} diff --git a/src/auto_archiver/modules/atlos_db/atlos_db.py b/src/auto_archiver/modules/atlos_db/atlos_db.py deleted file mode 100644 index baa9fef..0000000 --- a/src/auto_archiver/modules/atlos_db/atlos_db.py +++ /dev/null @@ -1,66 +0,0 @@ -from typing import Union - -import requests -from loguru import logger - -from auto_archiver.core import Database -from auto_archiver.core import Metadata - - -class AtlosDb(Database): - """ - Outputs results to Atlos - """ - - def failed(self, item: Metadata, reason: str) -> None: - """Update DB accordingly for failure""" - # If the item has no Atlos ID, there's nothing for us to do - if not item.metadata.get("atlos_id"): - logger.info(f"Item {item.get_url()} has no Atlos ID, skipping") - return - - requests.post( - f"{self.atlos_url}/api/v2/source_material/metadata/{item.metadata['atlos_id']}/auto_archiver", - headers={"Authorization": f"Bearer {self.api_token}"}, - json={"metadata": {"processed": True, "status": "error", "error": reason}}, - ).raise_for_status() - logger.info( - f"Stored failure for {item.get_url()} (ID {item.metadata['atlos_id']}) on Atlos: {reason}" - ) - - def fetch(self, item: Metadata) -> Union[Metadata, bool]: - """check and fetch if the given item has been archived already, each - database should handle its own caching, and configuration mechanisms""" - return False - - def _process_metadata(self, item: Metadata) -> dict: - """Process metadata for storage on Atlos. 
Will convert any datetime - objects to ISO format.""" - - return { - k: v.isoformat() if hasattr(v, "isoformat") else v - for k, v in item.metadata.items() - } - - def done(self, item: Metadata, cached: bool = False) -> None: - """archival result ready - should be saved to DB""" - - if not item.metadata.get("atlos_id"): - logger.info(f"Item {item.get_url()} has no Atlos ID, skipping") - return - - requests.post( - f"{self.atlos_url}/api/v2/source_material/metadata/{item.metadata['atlos_id']}/auto_archiver", - headers={"Authorization": f"Bearer {self.api_token}"}, - json={ - "metadata": dict( - processed=True, - status="success", - results=self._process_metadata(item), - ) - }, - ).raise_for_status() - - logger.info( - f"Stored success for {item.get_url()} (ID {item.metadata['atlos_id']}) on Atlos" - ) diff --git a/src/auto_archiver/modules/atlos_feeder/__init__.py b/src/auto_archiver/modules/atlos_feeder/__init__.py deleted file mode 100644 index 67b243a..0000000 --- a/src/auto_archiver/modules/atlos_feeder/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .atlos_feeder import AtlosFeeder \ No newline at end of file diff --git a/src/auto_archiver/modules/atlos_feeder/__manifest__.py b/src/auto_archiver/modules/atlos_feeder/__manifest__.py deleted file mode 100644 index d59f420..0000000 --- a/src/auto_archiver/modules/atlos_feeder/__manifest__.py +++ /dev/null @@ -1,34 +0,0 @@ -{ - "name": "Atlos Feeder", - "type": ["feeder"], - "requires_setup": True, - "dependencies": { - "python": ["loguru", "requests"], - }, - "configs": { - "api_token": { - "type": "str", - "required": True, - "help": "An Atlos API token. 
For more information, see https://docs.atlos.org/technical/api/", - }, - "atlos_url": { - "default": "https://platform.atlos.org", - "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.", - "type": "str" - }, - }, - "description": """ - AtlosFeeder: A feeder module that integrates with the Atlos API to fetch source material URLs for archival. - - ### Features - - Connects to the Atlos API to retrieve a list of source material URLs. - - Filters source materials based on visibility, processing status, and metadata. - - Converts filtered source materials into `Metadata` objects with the relevant `atlos_id` and URL. - - Iterates through paginated results using a cursor for efficient API interaction. - - ### Notes - - Requires an Atlos API endpoint and a valid API token for authentication. - - Ensures only unprocessed, visible, and ready-to-archive URLs are returned. - - Handles pagination transparently when retrieving data from the Atlos API. 
- """ -} diff --git a/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py b/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py deleted file mode 100644 index 8c8f9cb..0000000 --- a/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py +++ /dev/null @@ -1,42 +0,0 @@ -import requests -from loguru import logger - -from auto_archiver.core import Feeder -from auto_archiver.core import Metadata - - -class AtlosFeeder(Feeder): - - def __iter__(self) -> Metadata: - # Get all the urls from the Atlos API - count = 0 - cursor = None - while True: - response = requests.get( - f"{self.atlos_url}/api/v2/source_material", - headers={"Authorization": f"Bearer {self.api_token}"}, - params={"cursor": cursor}, - ) - data = response.json() - response.raise_for_status() - cursor = data["next"] - - for item in data["results"]: - if ( - item["source_url"] not in [None, ""] - and ( - item["metadata"] - .get("auto_archiver", {}) - .get("processed", False) - != True - ) - and item["visibility"] == "visible" - and item["status"] not in ["processing", "pending"] - ): - yield Metadata().set_url(item["source_url"]).set( - "atlos_id", item["id"] - ) - count += 1 - - if len(data["results"]) == 0 or cursor is None: - break diff --git a/src/auto_archiver/modules/atlos_feeder_db/__init__.py b/src/auto_archiver/modules/atlos_feeder_db/__init__.py deleted file mode 100644 index 67b243a..0000000 --- a/src/auto_archiver/modules/atlos_feeder_db/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .atlos_feeder import AtlosFeeder \ No newline at end of file diff --git a/src/auto_archiver/modules/atlos_feeder_db_storage/atlos_feeder_db_storage.py b/src/auto_archiver/modules/atlos_feeder_db_storage/atlos_feeder_db_storage.py index 698cd41..0d00eff 100644 --- a/src/auto_archiver/modules/atlos_feeder_db_storage/atlos_feeder_db_storage.py +++ b/src/auto_archiver/modules/atlos_feeder_db_storage/atlos_feeder_db_storage.py @@ -11,12 +11,9 @@ from auto_archiver.utils import calculate_file_hash class 
AtlosFeederDbStorage(Feeder, Database, Storage): - @property - def session(self) -> requests.Session: + def setup(self) -> requests.Session: """create and return a persistent session.""" - if not hasattr(self, "_session"): - self._session = requests.Session() - return self._session + self.session = requests.Session() def _get(self, endpoint: str, params: Optional[dict] = None) -> dict: """Wrapper for GET requests to the Atlos API.""" diff --git a/src/auto_archiver/modules/atlos_storage/__init__.py b/src/auto_archiver/modules/atlos_storage/__init__.py deleted file mode 100644 index 9e815c7..0000000 --- a/src/auto_archiver/modules/atlos_storage/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .atlos_storage import AtlosStorage \ No newline at end of file diff --git a/src/auto_archiver/modules/atlos_storage/__manifest__.py b/src/auto_archiver/modules/atlos_storage/__manifest__.py deleted file mode 100644 index 55b5120..0000000 --- a/src/auto_archiver/modules/atlos_storage/__manifest__.py +++ /dev/null @@ -1,32 +0,0 @@ -{ - "name": "Atlos Storage", - "type": ["storage"], - "requires_setup": True, - "dependencies": { - "python": ["loguru", "boto3"], - "bin": [] - }, - "description": """ - Stores media files in a [Atlos](https://www.atlos.org/). - - ### Features - - Saves media files to Atlos, organizing them into folders based on the provided path structure. - - ### Notes - - Requires setup with Atlos credentials. - - Files are uploaded to the specified `root_folder_id` and organized by the `media.key` structure. - """, - "configs": { - "api_token": { - "default": None, - "help": "An Atlos API token. 
For more information, see https://docs.atlos.org/technical/api/", - "required": True, - "type": "str" - }, - "atlos_url": { - "default": "https://platform.atlos.org", - "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.", - "type": "str" - }, - } -} \ No newline at end of file diff --git a/src/auto_archiver/modules/atlos_storage/atlos_storage.py b/src/auto_archiver/modules/atlos_storage/atlos_storage.py deleted file mode 100644 index 86af9c6..0000000 --- a/src/auto_archiver/modules/atlos_storage/atlos_storage.py +++ /dev/null @@ -1,68 +0,0 @@ -import hashlib -import os -from typing import IO, Optional - -import requests -from loguru import logger - -from auto_archiver.core import Media, Metadata -from auto_archiver.core import Storage -from auto_archiver.utils import calculate_file_hash - - -class AtlosStorage(Storage): - - def get_cdn_url(self, _media: Media) -> str: - # It's not always possible to provide an exact URL, because it's - # possible that the media once uploaded could have been copied to - # another project. - return self.atlos_url - - def _hash(self, media: Media) -> str: - # Hash the media file using sha-256. We don't use the existing auto archiver - # hash because there's no guarantee that the configuerer is using sha-256, which - # is how Atlos hashes files. 
- - sha256 = hashlib.sha256() - with open(media.filename, "rb") as f: - while True: - buf = f.read(4096) - if not buf: break - sha256.update(buf) - return sha256.hexdigest() - - def upload(self, media: Media, metadata: Optional[Metadata]=None, **_kwargs) -> bool: - atlos_id = metadata.get("atlos_id") - if atlos_id is None: - logger.error(f"No Atlos ID found in metadata; can't store {media.filename} on Atlos") - return False - - media_hash = self._hash(media) - # media_hash = calculate_file_hash(media.filename, hash_algo=hashlib.sha256, chunksize=4096) - - # Check whether the media has already been uploaded - source_material = requests.get( - f"{self.atlos_url}/api/v2/source_material/{atlos_id}", - headers={"Authorization": f"Bearer {self.api_token}"}, - ).json()["result"] - existing_media = [x["file_hash_sha256"] for x in source_material.get("artifacts", [])] - if media_hash in existing_media: - logger.info(f"{media.filename} with SHA256 {media_hash} already uploaded to Atlos") - return True - - # Upload the media to the Atlos API - requests.post( - f"{self.atlos_url}/api/v2/source_material/upload/{atlos_id}", - headers={"Authorization": f"Bearer {self.api_token}"}, - params={ - "title": media.properties - }, - files={"file": (os.path.basename(media.filename), open(media.filename, "rb"))}, - ).raise_for_status() - - logger.info(f"Uploaded {media.filename} to Atlos with ID {atlos_id} and title {media.key}") - - return True - - # must be implemented even if unused - def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass diff --git a/tests/databases/test_atlos_db.py b/tests/databases/test_atlos_db.py index 82c07ef..a73f1df 100644 --- a/tests/databases/test_atlos_db.py +++ b/tests/databases/test_atlos_db.py @@ -2,7 +2,7 @@ import pytest from datetime import datetime from auto_archiver.core import Metadata -from auto_archiver.modules.atlos_db import AtlosDb +from auto_archiver.modules.atlos_feeder_db_storage import AtlosFeederDbStorage as AtlosDb 
class FakeAPIResponse: @@ -12,19 +12,28 @@ class FakeAPIResponse: self._data = data self.raise_error = raise_error + def json(self) -> dict: + return self._data + def raise_for_status(self) -> None: if self.raise_error: raise Exception("HTTP error") @pytest.fixture -def atlos_db(setup_module) -> AtlosDb: +def atlos_db(setup_module, mocker) -> AtlosDb: """Fixture for AtlosDb.""" configs: dict = { "api_token": "abc123", "atlos_url": "https://platform.atlos.org", } - return setup_module("atlos_db", configs) + mocker.patch("requests.Session") + atlos_feeder = setup_module("atlos_feeder_db_storage", configs) + fake_session = mocker.MagicMock() + # Configure the default response to have no results so that __iter__ terminates + fake_session.get.return_value = FakeAPIResponse({"next": None, "results": []}) + atlos_feeder.session = fake_session + return atlos_feeder def test_failed_no_atlos_id(atlos_db, metadata, mocker): @@ -38,25 +47,20 @@ def test_failed_with_atlos_id(atlos_db, metadata, mocker): """Test failed() posts failure when atlos_id is present.""" metadata.set("atlos_id", 42) fake_resp = FakeAPIResponse({}, raise_error=False) - post_mock = mocker.patch("requests.post", return_value=fake_resp) + post_mock = mocker.patch.object(atlos_db, "_post", return_value=fake_resp) atlos_db.failed(metadata, "failure reason") - expected_url = ( - f"{atlos_db.atlos_url}/api/v2/source_material/metadata/42/auto_archiver" - ) - expected_headers = {"Authorization": f"Bearer {atlos_db.api_token}"} + expected_endpoint = f"/api/v2/source_material/metadata/42/auto_archiver" expected_json = { "metadata": {"processed": True, "status": "error", "error": "failure reason"} } - post_mock.assert_called_once_with( - expected_url, headers=expected_headers, json=expected_json - ) + post_mock.assert_called_once_with(expected_endpoint, json=expected_json) def test_failed_http_error(atlos_db, metadata, mocker): """Test failed() raises exception on HTTP error.""" metadata.set("atlos_id", 42) - 
fake_resp = FakeAPIResponse({}, raise_error=True) - mocker.patch("requests.post", return_value=fake_resp) + # Patch _post to raise an exception instead of returning a fake response. + mocker.patch.object(atlos_db, "_post", side_effect=Exception("HTTP error")) with pytest.raises(Exception, match="HTTP error"): atlos_db.failed(metadata, "failure reason") @@ -81,12 +85,9 @@ def test_done_with_atlos_id(atlos_db, metadata, mocker): now = datetime.now() metadata.set("timestamp", now) fake_resp = FakeAPIResponse({}, raise_error=False) - post_mock = mocker.patch("requests.post", return_value=fake_resp) + post_mock = mocker.patch.object(atlos_db, "_post", return_value=fake_resp) atlos_db.done(metadata) - expected_url = ( - f"{atlos_db.atlos_url}/api/v2/source_material/metadata/99/auto_archiver" - ) - expected_headers = {"Authorization": f"Bearer {atlos_db.api_token}"} + expected_endpoint = f"/api/v2/source_material/metadata/99/auto_archiver" expected_results = metadata.metadata.copy() expected_results["timestamp"] = now.isoformat() expected_json = { @@ -96,15 +97,13 @@ def test_done_with_atlos_id(atlos_db, metadata, mocker): "results": expected_results, } } - post_mock.assert_called_once_with( - expected_url, headers=expected_headers, json=expected_json - ) + post_mock.assert_called_once_with(expected_endpoint, json=expected_json) def test_done_http_error(atlos_db, metadata, mocker): - """Test done() raises exception on HTTP error.""" + """Test done() raises an exception on HTTP error.""" metadata.set("atlos_id", 123) - fake_resp = FakeAPIResponse({}, raise_error=True) - mocker.patch("requests.post", return_value=fake_resp) + # Patch _post to raise an exception. 
+ mocker.patch.object(atlos_db, "_post", side_effect=Exception("HTTP error")) with pytest.raises(Exception, match="HTTP error"): atlos_db.done(metadata) diff --git a/tests/feeders/test_atlos_feeder.py b/tests/feeders/test_atlos_feeder.py index f26bdc9..1ef9fab 100644 --- a/tests/feeders/test_atlos_feeder.py +++ b/tests/feeders/test_atlos_feeder.py @@ -1,5 +1,5 @@ import pytest -from auto_archiver.modules.atlos_feeder import AtlosFeeder +from auto_archiver.modules.atlos_feeder_db_storage import AtlosFeederDbStorage as AtlosFeeder class FakeAPIResponse: @@ -18,23 +18,26 @@ class FakeAPIResponse: @pytest.fixture -def atlos_feeder(setup_module) -> AtlosFeeder: +def atlos_feeder(setup_module, mocker) -> AtlosFeeder: """Fixture for AtlosFeeder.""" configs: dict = { "api_token": "abc123", "atlos_url": "https://platform.atlos.org", } - return setup_module("atlos_feeder", configs) + mocker.patch("requests.Session") + atlos_feeder = setup_module("atlos_feeder_db_storage", configs) + fake_session = mocker.MagicMock() + # Configure the default response to have no results so that __iter__ terminates + fake_session.get.return_value = FakeAPIResponse({"next": None, "results": []}) + atlos_feeder.session = fake_session + return atlos_feeder @pytest.fixture -def mock_atlos_api(mocker): - """Fixture to mock requests to Atlos API.""" +def mock_atlos_api(atlos_feeder): + """Fixture to update the atlos_feeder.session.get side_effect.""" def _mock_responses(responses): - mocker.patch( - "requests.get", - side_effect=[FakeAPIResponse(data) for data in responses], - ) + atlos_feeder.session.get.side_effect = [FakeAPIResponse(data) for data in responses] return _mock_responses @@ -100,9 +103,7 @@ def test_atlos_feeder_no_results(atlos_feeder, mock_atlos_api): def test_atlos_feeder_http_error(atlos_feeder, mocker): """Test raises an exception on HTTP error.""" - mocker.patch( - "requests.get", - return_value=FakeAPIResponse({"next": None, "results": []}, raise_error=True), - ) + 
fake_response = FakeAPIResponse({"next": None, "results": []}, raise_error=True) + atlos_feeder.session.get.side_effect = [fake_response] with pytest.raises(Exception, match="HTTP error"): list(atlos_feeder) diff --git a/tests/storages/test_atlos_storage.py b/tests/storages/test_atlos_storage.py index 7528456..bcd8f18 100644 --- a/tests/storages/test_atlos_storage.py +++ b/tests/storages/test_atlos_storage.py @@ -2,7 +2,7 @@ import os import hashlib import pytest from auto_archiver.core import Media, Metadata -from auto_archiver.modules.atlos_storage import AtlosStorage +from auto_archiver.modules.atlos_feeder_db_storage import AtlosFeederDbStorage as AtlosStorage class FakeAPIResponse: @@ -21,13 +21,19 @@ class FakeAPIResponse: @pytest.fixture -def atlos_storage(setup_module) -> AtlosStorage: +def atlos_storage(setup_module, mocker) -> AtlosStorage: """Fixture for AtlosStorage.""" configs: dict = { "api_token": "abc123", "atlos_url": "https://platform.atlos.org", } - return setup_module("atlos_storage", configs) + mocker.patch("requests.Session") + atlos_feeder = setup_module("atlos_feeder_db_storage", configs) + mock_session = mocker.MagicMock() + # Configure the default response to have no results so that __iter__ terminates + mock_session.get.return_value = FakeAPIResponse({"next": None, "results": []}) + atlos_feeder.session = mock_session + return atlos_feeder @pytest.fixture @@ -49,17 +55,6 @@ def test_get_cdn_url(atlos_storage: AtlosStorage) -> None: assert url == atlos_storage.atlos_url -def test_hash(tmp_path, atlos_storage: AtlosStorage) -> None: - """Test _hash() computes the correct SHA-256 hash of a file.""" - content = b"hello world" - file_path = tmp_path / "test.txt" - file_path.write_bytes(content) - media = Media(filename="dummy.mp4") - media.filename = str(file_path) - expected_hash = hashlib.sha256(content).hexdigest() - assert atlos_storage._hash(media) == expected_hash - - def test_upload_no_atlos_id(tmp_path, atlos_storage: AtlosStorage, 
media: Media, mocker) -> None: """Test upload() returns False when metadata lacks atlos_id.""" metadata = Metadata() # atlos_id not set @@ -69,74 +64,49 @@ def test_upload_no_atlos_id(tmp_path, atlos_storage: AtlosStorage, media: Media, post_mock.assert_not_called() -def test_upload_already_uploaded(atlos_storage: AtlosStorage, - metadata: Metadata, - media: Media, - tmp_path, - mocker) -> None: +def test_upload_already_uploaded(atlos_storage: AtlosStorage, metadata: Metadata, media: Media, mocker) -> None: """Test upload() returns True if media hash already exists.""" content = b"media content" metadata.set("atlos_id", 101) media_hash = hashlib.sha256(content).hexdigest() - fake_get = FakeAPIResponse({ - "result": {"artifacts": [{"file_hash_sha256": media_hash}]} - }) - get_mock = mocker.patch("requests.get", return_value=fake_get) - post_mock = mocker.patch("requests.post") + fake_get_response = {"result": {"artifacts": [{"file_hash_sha256": media_hash}]}} + get_mock = mocker.patch.object(atlos_storage, "_get", return_value=fake_get_response) + post_mock = mocker.patch.object(atlos_storage, "_post") result = atlos_storage.upload(media, metadata) assert result is True get_mock.assert_called_once() post_mock.assert_not_called() -def test_upload_not_uploaded(tmp_path, atlos_storage: AtlosStorage, - metadata: Metadata, - media: Media, - mocker) -> None: +def test_upload_not_uploaded(tmp_path, atlos_storage: AtlosStorage, metadata: Metadata, media: Media, mocker) -> None: """Test upload() uploads media when not already present.""" metadata.set("atlos_id", 202) - fake_get = FakeAPIResponse({ - "result": {"artifacts": [{"file_hash_sha256": "different_hash"}]} - }) - get_mock = mocker.patch("requests.get", return_value=fake_get) - fake_post = FakeAPIResponse({}, raise_error=False) - post_mock = mocker.patch("requests.post", return_value=fake_post) + fake_get_response = {"result": {"artifacts": [{"file_hash_sha256": "different_hash"}]}} + get_mock = 
mocker.patch.object(atlos_storage, "_get", return_value=fake_get_response) + fake_post_response = {"result": "uploaded"} + post_mock = mocker.patch.object(atlos_storage, "_post", return_value=fake_post_response) result = atlos_storage.upload(media, metadata) assert result is True + get_mock.assert_called_once() post_mock.assert_called_once() - expected_url = f"{atlos_storage.atlos_url}/api/v2/source_material/upload/202" + expected_endpoint = f"/api/v2/source_material/upload/202" + call_args = post_mock.call_args[0] + assert call_args[0] == expected_endpoint + call_kwargs = post_mock.call_args[1] expected_headers = {"Authorization": f"Bearer {atlos_storage.api_token}"} expected_params = {"title": media.properties} - call_kwargs = post_mock.call_args.kwargs - assert call_kwargs["headers"] == expected_headers assert call_kwargs["params"] == expected_params - # Verify the URL passed to requests.post. - posted_url = call_kwargs.get("url") or post_mock.call_args.args[0] - assert posted_url == expected_url - # Verify files parameter contains the correct filename. 
file_tuple = call_kwargs["files"]["file"] assert file_tuple[0] == os.path.basename(media.filename) -def test_upload_post_http_error(tmp_path, - atlos_storage: AtlosStorage, - metadata: Metadata, - media: Media, - mocker) -> None: +def test_upload_post_http_error(tmp_path, atlos_storage: AtlosStorage, metadata: Metadata, media: Media, mocker) -> None: """Test upload() propagates HTTP error during POST.""" metadata.set("atlos_id", 303) - fake_get = FakeAPIResponse({ - "result": {"artifacts": []} - }) - mocker.patch("requests.get", return_value=fake_get) - fake_post = FakeAPIResponse({}, raise_error=True) - mocker.patch("requests.post", return_value=fake_post) + fake_get_response = {"result": {"artifacts": []}} + mocker.patch.object(atlos_storage, "_get", return_value=fake_get_response) + mocker.patch.object(atlos_storage, "_post", side_effect=Exception("HTTP error")) with pytest.raises(Exception, match="HTTP error"): atlos_storage.upload(media, metadata) - -def test_uploadf_not_implemented(atlos_storage: AtlosStorage) -> None: - """Test uploadf() returns None (not implemented).""" - result = atlos_storage.uploadf(None, "dummy") - assert result is None From 32329c6b2c6c70233ea0a9627ff8fc7f8dd1237f Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Fri, 7 Mar 2025 00:11:43 +0000 Subject: [PATCH 09/15] Update Google Sheet how to docs. --- docs/source/how_to/gsheets_setup.md | 79 +++++++++++++++++++++-------- 1 file changed, 58 insertions(+), 21 deletions(-) diff --git a/docs/source/how_to/gsheets_setup.md b/docs/source/how_to/gsheets_setup.md index 20cedd5..ade8024 100644 --- a/docs/source/how_to/gsheets_setup.md +++ b/docs/source/how_to/gsheets_setup.md @@ -8,18 +8,25 @@ This guide explains how to set up Google Sheets to process URLs automatically an ### 1. Setting up your Google Sheet -Any Google sheet must have at least *one* column, with the name 'link' (you can change this name afterwards). This is the column with the URLs that you want the Auto Archiver to archive. 
Your sheet can have many other columns that the Auto Archiver can use, and you can also include any other columns for your own personal use. +Any Google sheet must have at least *one* column, with the name 'link' (you can change this name afterwards). This is the column with the URLs that you want the Auto Archiver to archive. +Your sheet can have many other columns that the Auto Archiver can use, and you can also include any additional columns for your own personal use. The order of the columns does not matter, the naming just needs to be correctly assigned to its corresponding value in the configuration file. -We recommend copying [this template Google Sheet](https://docs.google.com/spreadsheets/d/1NJZo_XZUBKTI1Ghlgi4nTPVvCfb0HXAs6j5tNGas72k/edit?usp=sharing) as a starting point for your project. +We recommend copying [this template Google Sheet](https://docs.google.com/spreadsheets/d/1NJZo_XZUBKTI1Ghlgi4nTPVvCfb0HXAs6j5tNGas72k/edit?usp=sharing) as a starting point for your project, as this matches the default column names. Here's an overview of all the columns, and what a complete sheet would look like. -Inputs: +**Inputs:** -* **Link** *(required)*: the URL of the post to archive +These are processed by the Gsheet Feeder and passed to the Auto Archiver. + +* **Link** *(required)*: the URL of the post that is to be archived * **Destination folder**: custom folder for archived file (regardless of storage) -Outputs: +**Outputs:** + +These are updated by the Gsheet DB module during the archiving process. +Note the required columns are only required if you are using the Gsheet DB module as well as the feeder. 
+ * **Archive status** *(required)*: Status of archive operation * **Archive location**: URL of archived post * **Archive date**: Date archived @@ -33,9 +40,11 @@ Outputs: * **WACZ**: Link to a WACZ web archive of post * **ReplayWebpage**: Link to a ReplayWebpage viewer of the WACZ archive -For example, this is a spreadsheet configured with all of the columns for the auto archiver and a few URLs to archive. (Note that the column names are not case sensitive.) +For example, this is a spreadsheet configured with all of the columns for the auto archiver and a few URLs to archive. +In this example the Gsheet Feeder and Gsheet DB are being used, and the archive is in progress. +(Note that the column names are not case sensitive.) -![A screenshot of a Google Spreadsheet with column headers defined as above, and several Youtube and Twitter URLs in the "Link" column](../demo-before.png) +![A screenshot of a Google Spreadsheet with column headers defined as above, and several Youtube and Twitter URLs in the "Link" column](../../demo-before.png) We'll change the name of the 'Destination Folder' column in step 3. @@ -51,43 +60,47 @@ Once you've downloaded the file, save it to `secrets/service_account.json` Now that you've set up your Google sheet, and you've set up the service account so Auto Archiver can access the sheet, the final step is to set your configuration. -First, make sure you have `gsheet_feeder` set in the `steps.feeders` section of your config. If you wish to store the results of the archiving process back in your Google sheet, make sure to also set the `ghseet_db` settig in the `steps.databases` section. Here's how this might look: +First, make sure you have `gsheet_feeder_db` set in the `steps.feeders` section of your config. If you wish to store the results of the archiving process back in your Google sheet, make sure to also set the `gsheet_feeder_db` setting in the `steps.databases` section.
Here's how this might look: ```{code} yaml steps: feeders: - - gsheet_feeder + - gsheet_feeder_db ... databases: - - gsheet_db # optional, if you also want to store the results in the Google sheet + - gsheet_feeder_db # optional, if you also want to store the results in the Google sheet and track the status of active archivals. ... ``` -Next, set up the `gsheet_feeder` configuration settings in the 'Configurations' part of the config `orchestration.yaml` file. Open up he file, and set the `gsheet_feeder.sheet` setting or the `gsheet_feeder.sheet_id` setting. The `sheet` should be the name of your sheet, as it shows in the top left of the sheet. For example, the sheet [here](https://docs.google.com/spreadsheets/d/1NJZo_XZUBKTI1Ghlgi4nTPVvCfb0HXAs6j5tNGas72k/edit?gid=0#gid=0) is called 'Public Auto Archiver template'. +Next, set up the `gsheet_feeder_db` configuration settings in the 'Configurations' part of the config `orchestration.yaml` file. Open up the file, and set the `gsheet_feeder_db.sheet` setting or the `gsheet_feeder_db.sheet_id` setting. The `sheet` should be the name of your sheet, as it shows in the top left of the sheet. +For example, the sheet [here](https://docs.google.com/spreadsheets/d/1NJZo_XZUBKTI1Ghlgi4nTPVvCfb0HXAs6j5tNGas72k/edit?gid=0#gid=0) is called 'Public Auto Archiver template'. Here's how this might look: ```{code} yaml ... -gsheet_feeder: +gsheet_feeder_db: sheet: 'My Awesome Sheet' ... ``` You can also pass these settings directly on the command line without having to edit the file, here'a an example of how to do that (using docker): -`docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver:dockerize --gsheet_feeder.sheet "Auto archive test 2023-2"`. +`docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver:dockerize --gsheet_feeder_db.sheet "My Awesome Sheet 2"`.
Here, the sheet name has been overridden/specified in the command line invocation. ### 3a. (Optional) Changing the column names -In step 1, we said we would change the name of the 'Destination Folder'. Perhaps you don't like this name, or already have a sheet with a different name. In our example here, we want to name this column 'Save Folder'. To do this, we need to edit the `ghseet_feeder.column` setting in the configuration file. For more information on this setting, see the [Gsheet Feeder docs](../modules/autogen/feeder/gsheet_feeder.md#configuration-options). We will first copy the default settings from the Gsheet Feeder docs for the 'column' settings, and then edit the 'Destination Folder' section to rename it 'Save Folder'. Our final configuration section looks like: +In step 1, we said we would change the name of the 'Destination Folder'. Perhaps you don't like this name, or already have a sheet with a different name. In our example here, we want to name this column 'Save Folder'. To do this, we need to edit the `gsheet_feeder_db.columns` setting in the configuration file. +For more information on this setting, see the [Gsheet Feeder Database docs](../modules/autogen/feeder/gsheet_feeder_db.md#configuration-options). We will first copy the default settings from the Gsheet Feeder docs for the 'column' settings, and then edit the 'Destination Folder' section to rename it 'Save Folder'. Our final configuration section looks like: ```{code} yaml ... -gsheet_feeder: +gsheet_feeder_db: sheet: 'My Awesome Sheet' + header: 1 + service_account: secrets/service_account.json columns: url: link status: archive status @@ -103,20 +116,44 @@ gsheet_feeder: pdq_hash: perceptual hashes wacz: wacz replaywebpage: replaywebpage + ``` +## 4. Running the Auto Archiver +### Feeding the URLs to the Auto Archiver -## Viewing the Results after archiving +The URLs to be archived should be added to the Google Sheet, and optionally a folder value.
Leave all the other configured columns empty (but you may add additional columns for your own use, as long as they don't conflict with the column names mapped in the configuration file). +The Auto Archiver will archive any URLs which have an empty 'status' column. -With the `ghseet_db` installed, once you start running the Auto Archiver, it will updates the "Archive status" column. +### Viewing the Results after archiving -![A screenshot of a Google Spreadsheet with column headers defined as above, and several Youtube and Twitter URLs in the "Link" column. The auto archiver has added "archive in progress" to one of the status columns.](../demo-progress.png) +With the `gsheet_feeder_db` installed, once you start running the Auto Archiver, it will update the "Archive status" column. +The status will be set to "Archive in progress" once the archival starts. If the archival is stopped during a run, either manually or because an error is raised, the status value should be cleared. + +![A screenshot of a Google Spreadsheet with column headers defined as above, and several Youtube and Twitter URLs in the "Link" column. The auto archiver has added "archive in progress" to one of the status columns.](../../demo-progress.png) The links are downloaded and archived, and the spreadsheet is updated to the following: -![A screenshot of a Google Spreadsheet with videos archived and metadata added per the description of the columns above.](../demo-after.png) +![A screenshot of a Google Spreadsheet with videos archived and metadata added per the description of the columns above.](../../demo-after.png) -Note that the first row is skipped, as it is assumed to be a header row (`--gsheet_feeder.header=1` and you can change it if you use more rows above). Rows with an empty URL column, or a non-empty archive column are also skipped. All sheets in the document will be checked.
+Note that the first row is skipped, as it is assumed to be a header row (`--gsheet_feeder_db.header=1` and you can change it if you use more rows above). Rows with an empty URL column, or a non-empty archive column are also skipped. All sheets in the document will be checked. The "archive location" link contains the path of the archived file, in local storage, S3, or in Google Drive. -![The archive result for a link in the demo sheet.](../demo-archive.png) +![The archive result for a link in the demo sheet.](../../demo-archive.png) + +### Troubleshooting + +**Hanging Archival in progress status** + +Occasionally system crashes or other unexpected events can cause the Auto Archiver to exit without cleaning up the status value. +If you are sure that all archival processes have stopped but you still see "Archive in progress" in the status column, you can manually clear the status column to allow the Auto Archiver to retry that archival on the next run. + +**Nothing archived status** + +Sometimes this means the tool is genuinely unable to extract the content at this point in time, but sometimes it can be resolved with different configurations. +Try: + - Turning on additional 'extractor' types in the configuration file (this can appear as 'no archiver' in the status column). + - Changing credentials or refreshing session files for extractors which require them + - Check if the extractors can accept any additional configurations such as adding a cookie file. + + From 40e5fe7a7ee6779753731294656db02d3d94e170 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Fri, 7 Mar 2025 13:46:09 +0000 Subject: [PATCH 10/15] Update __manifest__.py for merged Atlos module. 
--- .../atlos_feeder_db_storage/__manifest__.py | 29 +++++++++++-------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/src/auto_archiver/modules/atlos_feeder_db_storage/__manifest__.py b/src/auto_archiver/modules/atlos_feeder_db_storage/__manifest__.py index 3920246..2ea8f8f 100644 --- a/src/auto_archiver/modules/atlos_feeder_db_storage/__manifest__.py +++ b/src/auto_archiver/modules/atlos_feeder_db_storage/__manifest__.py @@ -19,23 +19,28 @@ }, }, "description": """ - AtlosFeederDbStorage: A module that integrates with the Atlos API to fetch source material URLs for archival, uplaod extracted media, - along with a database option to output archival results. + A module that integrates with the Atlos API to fetch source material URLs for archival, upload extracted media, and output archival results. + + [Atlos](https://www.atlos.org/) is a visual investigation and archiving platform designed for investigative research, journalism, and open-source intelligence (OSINT). + It helps users organize, analyze, and store media from various sources, making it easier to track and investigate digital evidence. + + To get started create a new project and obtain an API token from the settings page. You can group events into Atlos's 'incidents'. + Here you can add 'source material' by URL, and the Atlos feeder will fetch these URLs for archival. + + You can use Atlos only as a 'feeder', however you can also implement the 'database' and 'storage' features to store the media files in Atlos which is recommended. + The Auto Archiver will retain the Atlos ID for each item, ensuring that the media and database outputs are uploaded back into the relevant media item. + + ### Features - Connects to the Atlos API to retrieve a list of source material URLs. - - Filters source materials based on visibility, processing status, and metadata. - - Converts filtered source materials into `Metadata` objects with the relevant `atlos_id` and URL.
- - Iterates through paginated results using a cursor for efficient API interaction. - - Outputs archival results to the Atlos API for storage and tracking. - - Updates failure status with error details when archiving fails. - - Processes and formats metadata, including ISO formatting for datetime fields. - - Skips processing for items without an Atlos ID. - - Saves media files to Atlos, organizing them into folders based on the provided path structure. + - Iterates through the URLs from all source material items which are unprocessed, visible, and ready to archive. + - If the storage option is selected, it will store the media files alongside the original source material item in Atlos. + - If the database option is selected, it will output the results to the media item, as well as updating failure status with error details when archiving fails. + - Skips storage/database upload for items without an Atlos ID - this means you must use the Atlos feeder so that it has the Atlos ID to store the results with. ### Notes - - Requires an Atlos API endpoint and a valid API token for authentication. + - Requires an Atlos account with a project and a valid API token for authentication. - Ensures only unprocessed, visible, and ready-to-archive URLs are returned. - - Handles pagination transparently when retrieving data from the Atlos API. + - Fetches any media items within an Atlos project, regardless of separation into incidents.
""" } From 65109e377f3b437e10d59f25fcf1b4e7a3a3536a Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Fri, 7 Mar 2025 15:39:15 +0000 Subject: [PATCH 11/15] Remove raising exception in atlos_feeder_db_storage.py --- .../modules/atlos_feeder_db_storage/atlos_feeder_db_storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/auto_archiver/modules/atlos_feeder_db_storage/atlos_feeder_db_storage.py b/src/auto_archiver/modules/atlos_feeder_db_storage/atlos_feeder_db_storage.py index 0d00eff..ef82238 100644 --- a/src/auto_archiver/modules/atlos_feeder_db_storage/atlos_feeder_db_storage.py +++ b/src/auto_archiver/modules/atlos_feeder_db_storage/atlos_feeder_db_storage.py @@ -143,5 +143,5 @@ class AtlosFeederDbStorage(Feeder, Database, Storage): def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: """Upload a file-like object; not implemented.""" - raise NotImplementedError("uploadf method is not implemented yet.") + pass From 8fcec692b773854d0211cbd6724c948d7ea745f7 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Fri, 7 Mar 2025 15:42:20 +0000 Subject: [PATCH 12/15] Add comments to highlight different steps of atlos_feeder_db_storage.py --- .../atlos_feeder_db_storage/atlos_feeder_db_storage.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/auto_archiver/modules/atlos_feeder_db_storage/atlos_feeder_db_storage.py b/src/auto_archiver/modules/atlos_feeder_db_storage/atlos_feeder_db_storage.py index ef82238..87b4f82 100644 --- a/src/auto_archiver/modules/atlos_feeder_db_storage/atlos_feeder_db_storage.py +++ b/src/auto_archiver/modules/atlos_feeder_db_storage/atlos_feeder_db_storage.py @@ -43,6 +43,8 @@ class AtlosFeederDbStorage(Feeder, Database, Storage): response.raise_for_status() return response.json() + # ! 
Atlos Module - Feeder Methods + def __iter__(self) -> Iterator[Metadata]: """Iterate over unprocessed, visible source materials from Atlos.""" cursor = None @@ -61,6 +63,8 @@ class AtlosFeederDbStorage(Feeder, Database, Storage): if not results or cursor is None: break + # ! Atlos Module - Database Methods + def failed(self, item: Metadata, reason: str) -> None: """Mark an item as failed in Atlos, if the ID exists.""" atlos_id = item.metadata.get("atlos_id") @@ -104,6 +108,8 @@ class AtlosFeederDbStorage(Feeder, Database, Storage): ) logger.info(f"Stored success for {item.get_url()} (ID {atlos_id}) on Atlos") + # ! Atlos Module - Storage Methods + def get_cdn_url(self, _media: Media) -> str: """Return the base Atlos URL as the CDN URL.""" return self.atlos_url From 09e09e9ab9cdcb8d96fcd2fcf7f810138dec6027 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Fri, 7 Mar 2025 16:28:17 +0000 Subject: [PATCH 13/15] Document module renames in 'upgrading from 0.12' how to --- docs/source/how_to/new_config_format.md | 39 ++++++++++++++++--------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/docs/source/how_to/new_config_format.md b/docs/source/how_to/new_config_format.md index ba05d7d..99c2a06 100644 --- a/docs/source/how_to/new_config_format.md +++ b/docs/source/how_to/new_config_format.md @@ -22,7 +22,7 @@ your configuration file or on the command line (using --feeders) ```{code} yaml steps: - feeder: gsheet_feeder + feeder: cli_feeder ... ``` @@ -75,28 +75,39 @@ The names of the actual modules have also changed, so for any extractor modules - `wayback_archiver_enricher` → `wayback_extractor_enricher` - `vk_archiver` → `vk_extractor` -Additionally, the `youtube_archiver` has been renamed to `generic_extractor` as it is considered the default/fallback extractor. Read more about the [generic extractor](../modules/autogen/extractor/generic_extractor.md). 
+ +#### c) Module Renaming + + +The `youtube_archiver` has been renamed to `generic_extractor` as it is considered the default/fallback extractor. Read more about the [generic extractor](../modules/autogen/extractor/generic_extractor.md). + +The `atlos` modules have been merged into one, as have the `gsheets` feeder and database. + +- `atlos_feeder` → `atlos_feeder_db_storage` +- `atlos_storage` → `atlos_feeder_db_storage` +- `atlos_db` → `atlos_feeder_db_storage` +- `gsheet_feeder` → `gsheet_feeder_db` +- `gsheet_db` → `gsheet_feeder_db` + Example: ```{code} yaml steps: + feeders: + - gsheet_feeder_db # formerly gsheet_feeder ... - archivers: - - telethon_archiver - - youtube_archiver - - vk_archiver - -# renaming 'archiver' to 'extractor', and renaming the youtube_archiver the above config will become: -steps: + extractors: # formerly 'archivers' + - telethon_extractor # formerly telethon_archiver + - generic_extractor # formerly youtube_archiver + - vk_extractor # formerly vk_archiver + databases: + - gsheet_feeder_db # formerly gsheet_db ... - extractors: - - telethon_extractor - - vk_extractor - - generic_extractor ``` -#### c) Redundant / Obsolete Modules + +#### d) Redundant / Obsolete Modules With v0.13 of Auto Archiver, the following modules have been removed and their features have been built in to the generic_extractor. 
You should remove them from the 'steps' section of your configuration file: From a8fcd0b9a07269ff87da6c449f891dd842cc5f3b Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Fri, 7 Mar 2025 16:37:58 +0000 Subject: [PATCH 14/15] Further info in how to for the new config format --- docs/source/how_to/authentication_how_to.md | 2 +- docs/source/how_to/new_config_format.md | 18 +++++++++++++----- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/docs/source/how_to/authentication_how_to.md b/docs/source/how_to/authentication_how_to.md index 8994271..0e842fb 100644 --- a/docs/source/how_to/authentication_how_to.md +++ b/docs/source/how_to/authentication_how_to.md @@ -46,7 +46,7 @@ First, we need to install an extension in our browser to export the cookies for **2. Export the cookies** -```{note} See the note [here](../installation/authentication.md#recommendations-for-authentication) on why you shouldn't use your own personal account for achiving. +```{note} See the note [here](../installation/authentication.md#recommendations-for-authentication) on why you shouldn't use your own personal account for archiving. ``` Once the extension is installed in your preferred browser, login to Twitter in this browser, and then activate the extension and export the cookies. You can choose to export all your cookies for your browser, or just cookies for this specific site. In the image below, we're only exporting cookies for Twitter/x.com: diff --git a/docs/source/how_to/new_config_format.md b/docs/source/how_to/new_config_format.md index 99c2a06..5cef3c8 100644 --- a/docs/source/how_to/new_config_format.md +++ b/docs/source/how_to/new_config_format.md @@ -26,11 +26,9 @@ steps: ... ``` -## Updating your configuration file +The next two sections outline the two methods you have for updating your file. -To update your configuration file, you can either: - -### 1. Manually edit the configuration file and change the values. +## 1. 
Manually edit the configuration file and change the values. This is recommended if you want to keep all your old settings. Follow the steps below to change the relevant settings: @@ -106,6 +104,16 @@ steps: ``` +```{note} + +Don't forget to also rename the configuration settings. For example: + +```{code} yaml +gsheet_feeder_db: # formerly gsheet_feeder + service_account: secrets/service_account.json + sheet: My Google Sheet +... +``` #### d) Redundant / Obsolete Modules @@ -115,7 +123,7 @@ With v0.13 of Auto Archiver, the following modules have been removed and their f * `tiktok_archiver` - use the `generic_extractor` to extract TikTok videos. -### 2. Auto-generate a new config, then copy over your settings. +## 2. Auto-generate a new config, then copy over your settings. Using this method, you can have Auto Archiver auto-generate a configuration file for you, then you can copy over the desired settings from your old config file. This is probably the easiest method and quickest to setup, but it may require some trial and error as you copy over your settings. 
From e519ba2433538ab724cadf963aca863a606eefd4 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Fri, 7 Mar 2025 16:40:34 +0000 Subject: [PATCH 15/15] Add 'reject all' cookie button --- src/auto_archiver/utils/webdriver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/auto_archiver/utils/webdriver.py b/src/auto_archiver/utils/webdriver.py index 1d01df2..cb4e2a9 100644 --- a/src/auto_archiver/utils/webdriver.py +++ b/src/auto_archiver/utils/webdriver.py @@ -89,7 +89,7 @@ class CookieSettingDriver(webdriver.Firefox): else: # for all other sites, try and use some common button text to reject/accept cookies - for text in ["Refuse non-essential cookies", "Decline optional cookies", "Reject additional cookies", "Accept all cookies"]: + for text in ["Refuse non-essential cookies", "Decline optional cookies", "Reject additional cookies", "Reject all", "Accept all cookies"]: try: xpath = f"//*[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '{text.lower()}')]" WebDriverWait(self, 5).until(EC.element_to_be_clickable((By.XPATH, xpath))).click()