Small fixes.

Add timestamp helper method.
This commit is contained in:
erinhmclark
2025-02-10 15:57:42 +00:00
parent 950624dd4b
commit f311621e58
12 changed files with 129 additions and 69 deletions

View File

@@ -70,12 +70,15 @@ class GDriveStorage(Storage):
filename = path_parts[-1]
logger.info(f"looking for folders for {path_parts[0:-1]} before getting url for {filename=}")
for folder in path_parts[0:-1]:
folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=True)
folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=False)
parent_id = folder_id
# get id of file inside folder (or sub folder)
# TODO: suppressing the error as this is checked before the first upload
file_id = self._get_id_from_parent_and_name(folder_id, filename, raise_on_missing=False)
if not file_id:
#
logger.info(f"file {filename} not found in folder {folder_id}")
return None
return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing"
def upload(self, media: Media, **kwargs) -> bool:

View File

@@ -1,6 +1,4 @@
from typing import Union, Tuple
import datetime
from urllib.parse import quote
from loguru import logger
@@ -8,33 +6,33 @@ from loguru import logger
from auto_archiver.core import Database
from auto_archiver.core import Metadata, Media
from auto_archiver.modules.gsheet_feeder import GWorksheet
from auto_archiver.utils.misc import get_current_timestamp
class GsheetsDb(Database):
"""
NB: only works if GsheetFeeder is used.
could be updated in the future to support non-GsheetFeeder metadata
NB: only works if GsheetFeeder is used.
could be updated in the future to support non-GsheetFeeder metadata
"""
def started(self, item: Metadata) -> None:
logger.warning(f"STARTED {item}")
gw, row = self._retrieve_gsheet(item)
gw.set_cell(row, 'status', 'Archive in progress')
gw.set_cell(row, "status", "Archive in progress")
def failed(self, item: Metadata, reason:str) -> None:
def failed(self, item: Metadata, reason: str) -> None:
logger.error(f"FAILED {item}")
self._safe_status_update(item, f'Archive failed {reason}')
self._safe_status_update(item, f"Archive failed {reason}")
def aborted(self, item: Metadata) -> None:
logger.warning(f"ABORTED {item}")
self._safe_status_update(item, '')
self._safe_status_update(item, "")
def fetch(self, item: Metadata) -> Union[Metadata, bool]:
    """Report whether ``item`` has already been archived.

    This implementation performs no lookup and always answers ``False``.
    """
    return False
def done(self, item: Metadata, cached: bool=False) -> None:
def done(self, item: Metadata, cached: bool = False) -> None:
"""archival result ready - should be saved to DB"""
logger.success(f"DONE {item.get_url()}")
gw, row = self._retrieve_gsheet(item)
@@ -46,23 +44,25 @@ class GsheetsDb(Database):
def batch_if_valid(col, val, final_value=None):
final_value = final_value or val
try:
if val and gw.col_exists(col) and gw.get_cell(row_values, col) == '':
if val and gw.col_exists(col) and gw.get_cell(row_values, col) == "":
cell_updates.append((row, col, final_value))
except Exception as e:
logger.error(f"Unable to batch {col}={final_value} due to {e}")
status_message = item.status
if cached:
status_message = f"[cached] {status_message}"
cell_updates.append((row, 'status', status_message))
cell_updates.append((row, "status", status_message))
media: Media = item.get_final_media()
if hasattr(media, "urls"):
batch_if_valid('archive', "\n".join(media.urls))
batch_if_valid('date', True, self._get_current_datetime_iso())
batch_if_valid('title', item.get_title())
batch_if_valid('text', item.get("content", ""))
batch_if_valid('timestamp', item.get_timestamp())
if media: batch_if_valid('hash', media.get("hash", "not-calculated"))
batch_if_valid("archive", "\n".join(media.urls))
batch_if_valid("date", True, get_current_timestamp())
batch_if_valid("title", item.get_title())
batch_if_valid("text", item.get("content", ""))
batch_if_valid("timestamp", item.get_timestamp())
if media:
batch_if_valid("hash", media.get("hash", "not-calculated"))
# merge all pdq hashes into a single string, if present
pdq_hashes = []
@@ -71,31 +71,35 @@ class GsheetsDb(Database):
if pdq := m.get("pdq_hash"):
pdq_hashes.append(pdq)
if len(pdq_hashes):
batch_if_valid('pdq_hash', ",".join(pdq_hashes))
batch_if_valid("pdq_hash", ",".join(pdq_hashes))
if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"):
batch_if_valid('screenshot', "\n".join(screenshot.urls))
if (screenshot := item.get_media_by_id("screenshot")) and hasattr(
screenshot, "urls"
):
batch_if_valid("screenshot", "\n".join(screenshot.urls))
if (thumbnail := item.get_first_image("thumbnail")):
if thumbnail := item.get_first_image("thumbnail"):
if hasattr(thumbnail, "urls"):
batch_if_valid('thumbnail', f'=IMAGE("{thumbnail.urls[0]}")')
batch_if_valid("thumbnail", f'=IMAGE("{thumbnail.urls[0]}")')
if (browsertrix := item.get_media_by_id("browsertrix")):
batch_if_valid('wacz', "\n".join(browsertrix.urls))
batch_if_valid('replaywebpage', "\n".join([f'https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}' for wacz in browsertrix.urls]))
if browsertrix := item.get_media_by_id("browsertrix"):
batch_if_valid("wacz", "\n".join(browsertrix.urls))
batch_if_valid(
"replaywebpage",
"\n".join(
[
f"https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}"
for wacz in browsertrix.urls
]
),
)
gw.batch_set_cell(cell_updates)
@staticmethod
def _get_current_datetime_iso() -> str:
"""Helper method to generate the current datetime in ISO format."""
return datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=datetime.timezone.utc).isoformat()
def _safe_status_update(self, item: Metadata, new_status: str) -> None:
try:
gw, row = self._retrieve_gsheet(item)
gw.set_cell(row, 'status', new_status)
gw.set_cell(row, "status", new_status)
except Exception as e:
logger.debug(f"Unable to update sheet: {e}")

View File

@@ -18,12 +18,14 @@ class TelethonExtractor(Extractor):
invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)")
def setup(self) -> None:
def setup(self, config: dict) -> None:
"""
1. makes a copy of session_file that is removed in cleanup
2. trigger login process for telegram or proceed if already saved in a session file
3. joins channel_invites where needed
"""
super().setup(config)
logger.info(f"SETUP {self.name} checking login...")
# make a copy of the session that is used exclusively with this archiver instance

View File

@@ -1,4 +1,4 @@
a={
{
"name": "Whisper Enricher",
"type": ["enricher"],
"requires_setup": True,

View File

@@ -4,7 +4,6 @@ from loguru import logger
from auto_archiver.core import Enricher
from auto_archiver.core import Metadata, Media
from auto_archiver.modules.s3_storage import S3Storage
from auto_archiver.core.module import get_module
class WhisperEnricher(Enricher):
@@ -14,13 +13,17 @@ class WhisperEnricher(Enricher):
Only works if an S3 compatible storage is used
"""
def enrich(self, to_enrich: Metadata) -> None:
storages = self.config['steps']['storages']
if not "s3_storage" in storages:
def setup(self, config: dict) -> None:
super().setup(config)
self.stores = self.config['steps']['storages']
self.s3 = get_module("s3_storage", self.config)
if not "s3_storage" in self.stores:
logger.error("WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called.")
return
self.s3 = get_module("s3_storage", self.config)
def enrich(self, to_enrich: Metadata) -> None:
url = to_enrich.get_url()
logger.debug(f"WHISPER[{self.action}]: iterating media items for {url=}.")

View File

@@ -1,9 +1,7 @@
import os
import json
import uuid
from datetime import datetime
from datetime import datetime, timezone
import requests
from loguru import logger
@@ -58,5 +56,37 @@ def random_str(length: int = 32) -> str:
assert length <= 32, "length must be less than 32 as UUID4 is used"
return str(uuid.uuid4()).replace("-", "")[:length]
def json_loader(cli_val):
    """Deserialize ``cli_val`` with ``json.loads`` and return the result.

    Any exception raised by ``json.loads`` propagates to the caller.
    """
    parsed = json.loads(cli_val)
    return parsed
def get_current_datetime_iso() -> str:
    """Return the current time, expressed in UTC, as an ISO 8601 string."""
    # ``now`` with an explicit tz already yields a UTC-aware value.
    now_utc = datetime.now(tz=timezone.utc)
    return now_utc.isoformat()
def get_datetime_from_str(dt_str: str, fmt: str | None = None) -> datetime | None:
# parse a datetime string with option of passing a specific format
try:
return datetime.strptime(dt_str, fmt) if fmt else datetime.fromisoformat(dt_str)
except ValueError as e:
logger.error(f"Unable to parse datestring {dt_str}: {e}")
return None
def get_timestamp(ts, utc=True, iso=True) -> str | datetime | None:
# Consistent parsing of timestamps
# If utc=True, the timezone is set to UTC,
# if iso=True, the output is an iso string
if not ts: return
try:
if isinstance(ts, str): ts = datetime.fromisoformat(ts)
if isinstance(ts, (int, float)): ts = datetime.fromtimestamp(ts)
if utc: ts = ts.replace(tzinfo=timezone.utc)
if iso: return ts.isoformat()
return ts
except Exception as e:
logger.error(f"Unable to parse timestamp {ts}: {e}")
return None
def get_current_timestamp() -> str:
    """Return the current time in UTC as an ISO 8601 string.

    The time is taken as a timezone-aware UTC value. A naive local
    ``datetime.now()`` stamped with a UTC tzinfo would be off by the host's
    UTC offset on machines that do not run in UTC.
    """
    return datetime.now(timezone.utc).isoformat()

View File

@@ -103,19 +103,20 @@ def test_failed(gsheets_db, mock_metadata, mock_gworksheet):
gsheets_db.failed(mock_metadata, reason)
mock_gworksheet.set_cell.assert_called_once_with(1, 'status', f'Archive failed {reason}')
def test_aborted(gsheets_db, mock_metadata, mock_gworksheet):
gsheets_db.aborted(mock_metadata)
mock_gworksheet.set_cell.assert_called_once_with(1, 'status', '')
def test_done(gsheets_db, metadata, mock_gworksheet, expected_calls):
with patch.object(gsheets_db, '_get_current_datetime_iso', return_value='2025-02-01T00:00:00+00:00'):
with patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00'):
gsheets_db.done(metadata)
mock_gworksheet.batch_set_cell.assert_called_once_with(expected_calls)
def test_done_cached(gsheets_db, metadata, mock_gworksheet):
with patch.object(gsheets_db, '_get_current_datetime_iso', return_value='2025-02-01T00:00:00+00:00'):
with patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00'):
gsheets_db.done(metadata, cached=True)
# Verify the status message includes "[cached]"
@@ -126,7 +127,8 @@ def test_done_cached(gsheets_db, metadata, mock_gworksheet):
def test_done_missing_media(gsheets_db, metadata, mock_gworksheet):
# clear media from metadata
metadata.media = []
with patch.object(gsheets_db, '_get_current_datetime_iso', return_value='2025-02-01T00:00:00+00:00'):
with patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp",
return_value='2025-02-01T00:00:00+00:00'):
gsheets_db.done(metadata)
# Verify nothing media-related gets updated
call_args = mock_gworksheet.batch_set_cell.call_args[0][0]

View File

@@ -185,5 +185,4 @@ class TestInstagramAPIExtractor(TestExtractorBase):
result = self.extractor.download_profile(metadata, "test_user")
assert result.is_success()
assert "Error downloading stories for test_user" in result.metadata["errors"]
# assert "Error downloading posts for test_user" in result.metadata["errors"]
assert "Error downloading stories for test_user" in result.metadata["errors"]

View File

@@ -1,5 +1,4 @@
import os
import pickle
from typing import Type
from unittest.mock import patch, MagicMock

View File

@@ -7,10 +7,8 @@ from auto_archiver.modules.gsheet_feeder import GsheetsFeeder
from auto_archiver.core import Metadata, Feeder
def test_initialise_without_sheet_and_sheet_id(setup_module):
"""Ensure initialise() raises AssertionError if neither sheet nor sheet_id is set.
(shouldn't really be asserting in there)
"""
def test_setup_without_sheet_and_sheet_id(setup_module):
# Ensure setup() raises AssertionError if neither sheet nor sheet_id is set.
with patch("gspread.service_account"):
with pytest.raises(AssertionError):
setup_module(
@@ -145,7 +143,6 @@ def test_open_sheet_with_name_or_id(
"gsheet_feeder",
{"service_account": "dummy.json", "sheet": sheet, "sheet_id": sheet_id},
)
feeder.initialise()
sheet_result = feeder.open_sheet()
# Validate the correct method was called
getattr(mock_client, expected_method).assert_called_once_with(
@@ -165,7 +162,6 @@ def test_open_sheet_with_sheet_id(setup_module):
"gsheet_feeder",
{"service_account": "dummy.json", "sheet": None, "sheet_id": "ABC123"},
)
feeder.initialise()
sheet = feeder.open_sheet()
mock_client.open_by_key.assert_called_once_with("ABC123")
assert sheet == "MockSheet"
@@ -263,7 +259,6 @@ class TestGSheetsFeederReal:
["https://example.com", "done"],
]
worksheet.append_rows(test_rows)
self.feeder.initialise()
metadata_list = list(self.feeder)
# Validate that only the first row is processed

View File

@@ -21,16 +21,6 @@ class TestGDriveStorage(TestStorageBase):
'service_account': 'fake_service_account.json'
}
@pytest.mark.skip(reason="Requires real credentials")
@pytest.mark.download
def test_initialize_with_real_credentials(self):
"""
Test that the Google Drive service can be initialized with real credentials.
"""
self.storage.service_account = 'secrets/service_account.json' # Path to real credentials
self.storage.initialise()
assert self.storage.service is not None
def test_initialize_fails_with_non_existent_creds(self):
"""
@@ -38,6 +28,35 @@ class TestGDriveStorage(TestStorageBase):
"""
# Act and Assert
with pytest.raises(FileNotFoundError) as exc_info:
self.storage.initialise()
self.storage.setup(self.config)
assert "No such file or directory" in str(exc_info.value)
def test_path_parts(self):
media = Media(filename="test.jpg")
media.key = "folder1/folder2/test.jpg"
# @pytest.mark.skip(reason="Requires real credentials")
@pytest.mark.download
class TestGDriveStorageConnected(TestStorageBase):
    """Tests for GDriveStorage that use a real service-account file."""

    module_name: str = "gdrive_storage"
    storage: Type[GDriveStorage]
    config: dict = {
        'path_generator': 'url',
        'filename_generator': 'static',
        # TODO: replace with real root folder id
        'root_folder_id': "1TVY_oJt95_dmRSEdP9m5zFy7l50TeCSk",
        'oauth_token': None,
        'service_account': 'secrets/service_account.json',
    }

    def test_initialize_with_real_credentials(self):
        """The storage under test should expose a non-None ``service``."""
        assert self.storage.service is not None

View File

@@ -159,3 +159,7 @@ def test_get_context():
assert m.get_context("somekey") == "somevalue"
assert m.get_context("anotherkey") == "anothervalue"
assert len(m._context) == 2
def test_choose_most_complete():
pass