diff --git a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py index cc9cf3d..910f48b 100644 --- a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py +++ b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py @@ -70,12 +70,15 @@ class GDriveStorage(Storage): filename = path_parts[-1] logger.info(f"looking for folders for {path_parts[0:-1]} before getting url for {filename=}") for folder in path_parts[0:-1]: - folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=True) + folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=False) parent_id = folder_id - # get id of file inside folder (or sub folder) # TODO: supressing the error as being checked before first upload file_id = self._get_id_from_parent_and_name(folder_id, filename, raise_on_missing=False) + if not file_id: + # + logger.info(f"file {filename} not found in folder {folder_id}") + return None return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing" def upload(self, media: Media, **kwargs) -> bool: diff --git a/src/auto_archiver/modules/gsheet_db/gsheet_db.py b/src/auto_archiver/modules/gsheet_db/gsheet_db.py index 3bb27b7..682eb94 100644 --- a/src/auto_archiver/modules/gsheet_db/gsheet_db.py +++ b/src/auto_archiver/modules/gsheet_db/gsheet_db.py @@ -1,6 +1,4 @@ from typing import Union, Tuple - -import datetime from urllib.parse import quote from loguru import logger @@ -8,33 +6,33 @@ from loguru import logger from auto_archiver.core import Database from auto_archiver.core import Metadata, Media from auto_archiver.modules.gsheet_feeder import GWorksheet +from auto_archiver.utils.misc import get_current_timestamp class GsheetsDb(Database): """ - NB: only works if GsheetFeeder is used. - could be updated in the future to support non-GsheetFeeder metadata + NB: only works if GsheetFeeder is used. + could be updated in the future to support non-GsheetFeeder metadata """ - def started(self, item: Metadata) -> None: logger.warning(f"STARTED {item}") gw, row = self._retrieve_gsheet(item) - gw.set_cell(row, 'status', 'Archive in progress') + gw.set_cell(row, "status", "Archive in progress") - def failed(self, item: Metadata, reason:str) -> None: + def failed(self, item: Metadata, reason: str) -> None: logger.error(f"FAILED {item}") - self._safe_status_update(item, f'Archive failed {reason}') + self._safe_status_update(item, f"Archive failed {reason}") def aborted(self, item: Metadata) -> None: logger.warning(f"ABORTED {item}") - self._safe_status_update(item, '') + self._safe_status_update(item, "") def fetch(self, item: Metadata) -> Union[Metadata, bool]: """check if the given item has been archived already""" return False - def done(self, item: Metadata, cached: bool=False) -> None: + def done(self, item: Metadata, cached: bool = False) -> None: """archival result ready - should be saved to DB""" logger.success(f"DONE {item.get_url()}") gw, row = self._retrieve_gsheet(item) @@ -46,23 +44,25 @@ class GsheetsDb(Database): def batch_if_valid(col, val, final_value=None): final_value = final_value or val try: - if val and gw.col_exists(col) and gw.get_cell(row_values, col) == '': + if val and gw.col_exists(col) and gw.get_cell(row_values, col) == "": cell_updates.append((row, col, final_value)) except Exception as e: logger.error(f"Unable to batch {col}={final_value} due to {e}") + status_message = item.status if cached: status_message = f"[cached] {status_message}" - cell_updates.append((row, 'status', status_message)) + cell_updates.append((row, "status", status_message)) media: Media = item.get_final_media() if hasattr(media, "urls"): - batch_if_valid('archive', "\n".join(media.urls)) - batch_if_valid('date', True, self._get_current_datetime_iso()) - batch_if_valid('title', item.get_title()) - batch_if_valid('text', item.get("content", "")) - batch_if_valid('timestamp', item.get_timestamp()) - if media: batch_if_valid('hash', media.get("hash", "not-calculated")) + batch_if_valid("archive", "\n".join(media.urls)) + batch_if_valid("date", True, get_current_timestamp()) + batch_if_valid("title", item.get_title()) + batch_if_valid("text", item.get("content", "")) + batch_if_valid("timestamp", item.get_timestamp()) + if media: + batch_if_valid("hash", media.get("hash", "not-calculated")) # merge all pdq hashes into a single string, if present pdq_hashes = [] @@ -71,31 +71,35 @@ class GsheetsDb(Database): if pdq := m.get("pdq_hash"): pdq_hashes.append(pdq) if len(pdq_hashes): - batch_if_valid('pdq_hash', ",".join(pdq_hashes)) + batch_if_valid("pdq_hash", ",".join(pdq_hashes)) - if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"): - batch_if_valid('screenshot', "\n".join(screenshot.urls)) + if (screenshot := item.get_media_by_id("screenshot")) and hasattr( + screenshot, "urls" + ): + batch_if_valid("screenshot", "\n".join(screenshot.urls)) - if (thumbnail := item.get_first_image("thumbnail")): + if thumbnail := item.get_first_image("thumbnail"): if hasattr(thumbnail, "urls"): - batch_if_valid('thumbnail', f'=IMAGE("{thumbnail.urls[0]}")') + batch_if_valid("thumbnail", f'=IMAGE("{thumbnail.urls[0]}")') - if (browsertrix := item.get_media_by_id("browsertrix")): - batch_if_valid('wacz', "\n".join(browsertrix.urls)) - batch_if_valid('replaywebpage', "\n".join([f'https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}' for wacz in browsertrix.urls])) + if browsertrix := item.get_media_by_id("browsertrix"): + batch_if_valid("wacz", "\n".join(browsertrix.urls)) + batch_if_valid( + "replaywebpage", + "\n".join( + [ + f"https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}" + for wacz in browsertrix.urls + ] + ), + ) gw.batch_set_cell(cell_updates) - @staticmethod - def _get_current_datetime_iso() -> str: - """Helper method to generate the current datetime in ISO format.""" - return datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=datetime.timezone.utc).isoformat() - - def _safe_status_update(self, item: Metadata, new_status: str) -> None: try: gw, row = self._retrieve_gsheet(item) - gw.set_cell(row, 'status', new_status) + gw.set_cell(row, "status", new_status) except Exception as e: logger.debug(f"Unable to update sheet: {e}") diff --git a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py index 0147ff2..947db9e 100644 --- a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py +++ b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py @@ -18,12 +18,14 @@ class TelethonExtractor(Extractor): invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)") - def setup(self) -> None: + def setup(self, config: dict) -> None: + """ 1. makes a copy of session_file that is removed in cleanup 2. trigger login process for telegram or proceed if already saved in a session file 3. joins channel_invites where needed """ + super().setup(config) logger.info(f"SETUP {self.name} checking login...") # make a copy of the session that is used exclusively with this archiver instance diff --git a/src/auto_archiver/modules/whisper_enricher/__manifest__.py b/src/auto_archiver/modules/whisper_enricher/__manifest__.py index 1539df6..98e743e 100644 --- a/src/auto_archiver/modules/whisper_enricher/__manifest__.py +++ b/src/auto_archiver/modules/whisper_enricher/__manifest__.py @@ -1,4 +1,4 @@ -a={ +{ "name": "Whisper Enricher", "type": ["enricher"], "requires_setup": True, diff --git a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py index 004d91c..a51ffc1 100644 --- a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py +++ b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py @@ -4,7 +4,6 @@ from loguru import logger from auto_archiver.core import Enricher from auto_archiver.core import Metadata, Media -from auto_archiver.modules.s3_storage import S3Storage from auto_archiver.core.module import get_module class WhisperEnricher(Enricher): @@ -14,13 +13,17 @@ class WhisperEnricher(Enricher): Only works if an S3 compatible storage is used """ - def enrich(self, to_enrich: Metadata) -> None: - storages = self.config['steps']['storages'] - if not "s3_storage" in storages: + def setup(self, config: dict) -> None: + super().setup(config) + self.stores = self.config['steps']['storages'] + self.s3 = get_module("s3_storage", self.config) + if not "s3_storage" in self.stores: logger.error("WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called.") return - self.s3 = get_module("s3_storage", self.config) + + def enrich(self, to_enrich: Metadata) -> None: + url = to_enrich.get_url() logger.debug(f"WHISPER[{self.action}]: iterating media items for {url=}.") diff --git a/src/auto_archiver/utils/misc.py b/src/auto_archiver/utils/misc.py index 300a710..e4c214c 100644 --- a/src/auto_archiver/utils/misc.py +++ b/src/auto_archiver/utils/misc.py @@ -1,9 +1,7 @@ - - import os import json import uuid -from datetime import datetime +from datetime import datetime, timezone import requests from loguru import logger @@ -58,5 +56,37 @@ def random_str(length: int = 32) -> str: assert length <= 32, "length must be less than 32 as UUID4 is used" return str(uuid.uuid4()).replace("-", "")[:length] + def json_loader(cli_val): return json.loads(cli_val) + +def get_current_datetime_iso() -> str: + return datetime.now(timezone.utc).replace(tzinfo=timezone.utc).isoformat() + + +def get_datetime_from_str(dt_str: str, fmt: str | None = None) -> datetime | None: + # parse a datetime string with option of passing a specific format + try: + return datetime.strptime(dt_str, fmt) if fmt else datetime.fromisoformat(dt_str) + except ValueError as e: + logger.error(f"Unable to parse datestring {dt_str}: {e}") + return None + + +def get_timestamp(ts, utc=True, iso=True) -> str | datetime | None: + # Consistent parsing of timestamps + # If utc=True, the timezone is set to UTC, + # if iso=True, the output is an iso string + if not ts: return + try: + if isinstance(ts, str): ts = datetime.fromisoformat(ts) + if isinstance(ts, (int, float)): ts = datetime.fromtimestamp(ts) + if utc: ts = ts.replace(tzinfo=timezone.utc) + if iso: return ts.isoformat() + return ts + except Exception as e: + logger.error(f"Unable to parse timestamp {ts}: {e}") + return None + +def get_current_timestamp() -> str: + return get_timestamp(datetime.now()) \ No newline at end of file diff --git a/tests/databases/test_gsheet_db.py b/tests/databases/test_gsheet_db.py index bdc2811..0a655a8 100644 --- a/tests/databases/test_gsheet_db.py +++ b/tests/databases/test_gsheet_db.py @@ -103,19 +103,20 @@ def test_failed(gsheets_db, mock_metadata, mock_gworksheet): gsheets_db.failed(mock_metadata, reason) mock_gworksheet.set_cell.assert_called_once_with(1, 'status', f'Archive failed {reason}') + def test_aborted(gsheets_db, mock_metadata, mock_gworksheet): gsheets_db.aborted(mock_metadata) mock_gworksheet.set_cell.assert_called_once_with(1, 'status', '') def test_done(gsheets_db, metadata, mock_gworksheet, expected_calls): - with patch.object(gsheets_db, '_get_current_datetime_iso', return_value='2025-02-01T00:00:00+00:00'): + with patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00'): gsheets_db.done(metadata) mock_gworksheet.batch_set_cell.assert_called_once_with(expected_calls) def test_done_cached(gsheets_db, metadata, mock_gworksheet): - with patch.object(gsheets_db, '_get_current_datetime_iso', return_value='2025-02-01T00:00:00+00:00'): + with patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00'): gsheets_db.done(metadata, cached=True) # Verify the status message includes "[cached]" @@ -126,7 +127,8 @@ def test_done_cached(gsheets_db, metadata, mock_gworksheet): def test_done_missing_media(gsheets_db, metadata, mock_gworksheet): # clear media from metadata metadata.media = [] - with patch.object(gsheets_db, '_get_current_datetime_iso', return_value='2025-02-01T00:00:00+00:00'): + with patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", + return_value='2025-02-01T00:00:00+00:00'): gsheets_db.done(metadata) # Verify nothing media-related gets updated call_args = mock_gworksheet.batch_set_cell.call_args[0][0] diff --git a/tests/extractors/test_instagram_api_extractor.py b/tests/extractors/test_instagram_api_extractor.py index d3f7bd6..c119e3f 100644 --- a/tests/extractors/test_instagram_api_extractor.py +++ b/tests/extractors/test_instagram_api_extractor.py @@ -185,5 +185,4 @@ class TestInstagramAPIExtractor(TestExtractorBase): result = self.extractor.download_profile(metadata, "test_user") assert result.is_success() - assert "Error downloading stories for test_user" in result.metadata["errors"] - # assert "Error downloading posts for test_user" in result.metadata["errors"] \ No newline at end of file + assert "Error downloading stories for test_user" in result.metadata["errors"] \ No newline at end of file diff --git a/tests/extractors/test_instagram_tbot_extractor.py b/tests/extractors/test_instagram_tbot_extractor.py index b82641d..d7a1e53 100644 --- a/tests/extractors/test_instagram_tbot_extractor.py +++ b/tests/extractors/test_instagram_tbot_extractor.py @@ -1,5 +1,4 @@ import os -import pickle from typing import Type from unittest.mock import patch, MagicMock diff --git a/tests/feeders/test_gsheet_feeder.py b/tests/feeders/test_gsheet_feeder.py index 103610e..ecf57f1 100644 --- a/tests/feeders/test_gsheet_feeder.py +++ b/tests/feeders/test_gsheet_feeder.py @@ -7,10 +7,8 @@ from auto_archiver.modules.gsheet_feeder import GsheetsFeeder from auto_archiver.core import Metadata, Feeder -def test_initialise_without_sheet_and_sheet_id(setup_module): - """Ensure initialise() raises AssertionError if neither sheet nor sheet_id is set. - (shouldn't really be asserting in there) - """ +def test_setup_without_sheet_and_sheet_id(setup_module): + # Ensure setup() raises AssertionError if neither sheet nor sheet_id is set. with patch("gspread.service_account"): with pytest.raises(AssertionError): setup_module( @@ -145,7 +143,6 @@ def test_open_sheet_with_name_or_id( "gsheet_feeder", {"service_account": "dummy.json", "sheet": sheet, "sheet_id": sheet_id}, ) - feeder.initialise() sheet_result = feeder.open_sheet() # Validate the correct method was called getattr(mock_client, expected_method).assert_called_once_with( @@ -165,7 +162,6 @@ def test_open_sheet_with_sheet_id(setup_module): "gsheet_feeder", {"service_account": "dummy.json", "sheet": None, "sheet_id": "ABC123"}, ) - feeder.initialise() sheet = feeder.open_sheet() mock_client.open_by_key.assert_called_once_with("ABC123") assert sheet == "MockSheet" @@ -263,7 +259,6 @@ class TestGSheetsFeederReal: ["https://example.com", "done"], ] worksheet.append_rows(test_rows) - self.feeder.initialise() metadata_list = list(self.feeder) # Validate that only the first row is processed diff --git a/tests/storages/test_gdrive_storage.py b/tests/storages/test_gdrive_storage.py index b7417ad..4259cb2 100644 --- a/tests/storages/test_gdrive_storage.py +++ b/tests/storages/test_gdrive_storage.py @@ -21,16 +21,6 @@ class TestGDriveStorage(TestStorageBase): 'service_account': 'fake_service_account.json' } - @pytest.mark.skip(reason="Requires real credentials") - @pytest.mark.download - def test_initialize_with_real_credentials(self): - """ - Test that the Google Drive service can be initialized with real credentials. - """ - self.storage.service_account = 'secrets/service_account.json' # Path to real credentials - self.storage.initialise() - assert self.storage.service is not None - def test_initialize_fails_with_non_existent_creds(self): """ @@ -38,6 +28,35 @@ class TestGDriveStorage(TestStorageBase): """ # Act and Assert with pytest.raises(FileNotFoundError) as exc_info: - self.storage.initialise() + self.storage.setup(self.config) assert "No such file or directory" in str(exc_info.value) + def test_path_parts(self): + media = Media(filename="test.jpg") + media.key = "folder1/folder2/test.jpg" + +# @pytest.mark.skip(reason="Requires real credentials") +@pytest.mark.download +class TestGDriveStorageConnected(TestStorageBase): + """ + 'Real' tests for GDriveStorage. + """ + + module_name: str = "gdrive_storage" + storage: Type[GDriveStorage] + config: dict = {'path_generator': 'url', + 'filename_generator': 'static', + # TODO: replace with real root folder id + 'root_folder_id': "1TVY_oJt95_dmRSEdP9m5zFy7l50TeCSk", + 'oauth_token': None, + 'service_account': 'secrets/service_account.json' + } + + + def test_initialize_with_real_credentials(self): + """ + Test that the Google Drive service can be initialized with real credentials. + """ + assert self.storage.service is not None + + diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 7270c80..b07e107 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -159,3 +159,7 @@ def test_get_context(): assert m.get_context("somekey") == "somevalue" assert m.get_context("anotherkey") == "anothervalue" assert len(m._context) == 2 + + +def test_choose_most_complete(): + pass \ No newline at end of file