mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-13 05:38:29 +03:00
Small fixes.
Add timestamp helper method.
This commit is contained in:
@@ -70,12 +70,15 @@ class GDriveStorage(Storage):
|
||||
filename = path_parts[-1]
|
||||
logger.info(f"looking for folders for {path_parts[0:-1]} before getting url for {filename=}")
|
||||
for folder in path_parts[0:-1]:
|
||||
folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=True)
|
||||
folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=False)
|
||||
parent_id = folder_id
|
||||
|
||||
# get id of file inside folder (or sub folder)
|
||||
# TODO: suppressing the error, as this is checked before the first upload
|
||||
file_id = self._get_id_from_parent_and_name(folder_id, filename, raise_on_missing=False)
|
||||
if not file_id:
|
||||
#
|
||||
logger.info(f"file {filename} not found in folder {folder_id}")
|
||||
return None
|
||||
return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing"
|
||||
|
||||
def upload(self, media: Media, **kwargs) -> bool:
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
from typing import Union, Tuple
|
||||
|
||||
import datetime
|
||||
from urllib.parse import quote
|
||||
|
||||
from loguru import logger
|
||||
@@ -8,33 +6,33 @@ from loguru import logger
|
||||
from auto_archiver.core import Database
|
||||
from auto_archiver.core import Metadata, Media
|
||||
from auto_archiver.modules.gsheet_feeder import GWorksheet
|
||||
from auto_archiver.utils.misc import get_current_timestamp
|
||||
|
||||
|
||||
class GsheetsDb(Database):
|
||||
"""
|
||||
NB: only works if GsheetFeeder is used.
|
||||
could be updated in the future to support non-GsheetFeeder metadata
|
||||
NB: only works if GsheetFeeder is used.
|
||||
could be updated in the future to support non-GsheetFeeder metadata
|
||||
"""
|
||||
|
||||
|
||||
def started(self, item: Metadata) -> None:
|
||||
logger.warning(f"STARTED {item}")
|
||||
gw, row = self._retrieve_gsheet(item)
|
||||
gw.set_cell(row, 'status', 'Archive in progress')
|
||||
gw.set_cell(row, "status", "Archive in progress")
|
||||
|
||||
def failed(self, item: Metadata, reason:str) -> None:
|
||||
def failed(self, item: Metadata, reason: str) -> None:
|
||||
logger.error(f"FAILED {item}")
|
||||
self._safe_status_update(item, f'Archive failed {reason}')
|
||||
self._safe_status_update(item, f"Archive failed {reason}")
|
||||
|
||||
def aborted(self, item: Metadata) -> None:
|
||||
logger.warning(f"ABORTED {item}")
|
||||
self._safe_status_update(item, '')
|
||||
self._safe_status_update(item, "")
|
||||
|
||||
def fetch(self, item: Metadata) -> Union[Metadata, bool]:
    """Check whether the given item has been archived already.

    This implementation performs no lookup: ``item`` is not inspected and
    the method always returns False.
    """
    return False
|
||||
|
||||
def done(self, item: Metadata, cached: bool=False) -> None:
|
||||
def done(self, item: Metadata, cached: bool = False) -> None:
|
||||
"""archival result ready - should be saved to DB"""
|
||||
logger.success(f"DONE {item.get_url()}")
|
||||
gw, row = self._retrieve_gsheet(item)
|
||||
@@ -46,23 +44,25 @@ class GsheetsDb(Database):
|
||||
def batch_if_valid(col, val, final_value=None):
|
||||
final_value = final_value or val
|
||||
try:
|
||||
if val and gw.col_exists(col) and gw.get_cell(row_values, col) == '':
|
||||
if val and gw.col_exists(col) and gw.get_cell(row_values, col) == "":
|
||||
cell_updates.append((row, col, final_value))
|
||||
except Exception as e:
|
||||
logger.error(f"Unable to batch {col}={final_value} due to {e}")
|
||||
|
||||
status_message = item.status
|
||||
if cached:
|
||||
status_message = f"[cached] {status_message}"
|
||||
cell_updates.append((row, 'status', status_message))
|
||||
cell_updates.append((row, "status", status_message))
|
||||
|
||||
media: Media = item.get_final_media()
|
||||
if hasattr(media, "urls"):
|
||||
batch_if_valid('archive', "\n".join(media.urls))
|
||||
batch_if_valid('date', True, self._get_current_datetime_iso())
|
||||
batch_if_valid('title', item.get_title())
|
||||
batch_if_valid('text', item.get("content", ""))
|
||||
batch_if_valid('timestamp', item.get_timestamp())
|
||||
if media: batch_if_valid('hash', media.get("hash", "not-calculated"))
|
||||
batch_if_valid("archive", "\n".join(media.urls))
|
||||
batch_if_valid("date", True, get_current_timestamp())
|
||||
batch_if_valid("title", item.get_title())
|
||||
batch_if_valid("text", item.get("content", ""))
|
||||
batch_if_valid("timestamp", item.get_timestamp())
|
||||
if media:
|
||||
batch_if_valid("hash", media.get("hash", "not-calculated"))
|
||||
|
||||
# merge all pdq hashes into a single string, if present
|
||||
pdq_hashes = []
|
||||
@@ -71,31 +71,35 @@ class GsheetsDb(Database):
|
||||
if pdq := m.get("pdq_hash"):
|
||||
pdq_hashes.append(pdq)
|
||||
if len(pdq_hashes):
|
||||
batch_if_valid('pdq_hash', ",".join(pdq_hashes))
|
||||
batch_if_valid("pdq_hash", ",".join(pdq_hashes))
|
||||
|
||||
if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"):
|
||||
batch_if_valid('screenshot', "\n".join(screenshot.urls))
|
||||
if (screenshot := item.get_media_by_id("screenshot")) and hasattr(
|
||||
screenshot, "urls"
|
||||
):
|
||||
batch_if_valid("screenshot", "\n".join(screenshot.urls))
|
||||
|
||||
if (thumbnail := item.get_first_image("thumbnail")):
|
||||
if thumbnail := item.get_first_image("thumbnail"):
|
||||
if hasattr(thumbnail, "urls"):
|
||||
batch_if_valid('thumbnail', f'=IMAGE("{thumbnail.urls[0]}")')
|
||||
batch_if_valid("thumbnail", f'=IMAGE("{thumbnail.urls[0]}")')
|
||||
|
||||
if (browsertrix := item.get_media_by_id("browsertrix")):
|
||||
batch_if_valid('wacz', "\n".join(browsertrix.urls))
|
||||
batch_if_valid('replaywebpage', "\n".join([f'https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}' for wacz in browsertrix.urls]))
|
||||
if browsertrix := item.get_media_by_id("browsertrix"):
|
||||
batch_if_valid("wacz", "\n".join(browsertrix.urls))
|
||||
batch_if_valid(
|
||||
"replaywebpage",
|
||||
"\n".join(
|
||||
[
|
||||
f"https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}"
|
||||
for wacz in browsertrix.urls
|
||||
]
|
||||
),
|
||||
)
|
||||
|
||||
gw.batch_set_cell(cell_updates)
|
||||
|
||||
@staticmethod
|
||||
def _get_current_datetime_iso() -> str:
|
||||
"""Helper method to generate the current datetime in ISO format."""
|
||||
return datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=datetime.timezone.utc).isoformat()
|
||||
|
||||
|
||||
def _safe_status_update(self, item: Metadata, new_status: str) -> None:
|
||||
try:
|
||||
gw, row = self._retrieve_gsheet(item)
|
||||
gw.set_cell(row, 'status', new_status)
|
||||
gw.set_cell(row, "status", new_status)
|
||||
except Exception as e:
|
||||
logger.debug(f"Unable to update sheet: {e}")
|
||||
|
||||
|
||||
@@ -18,12 +18,14 @@ class TelethonExtractor(Extractor):
|
||||
invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)")
|
||||
|
||||
|
||||
def setup(self) -> None:
|
||||
def setup(self, config: dict) -> None:
|
||||
|
||||
"""
|
||||
1. makes a copy of session_file that is removed in cleanup
|
||||
2. trigger login process for telegram or proceed if already saved in a session file
|
||||
3. joins channel_invites where needed
|
||||
"""
|
||||
super().setup(config)
|
||||
logger.info(f"SETUP {self.name} checking login...")
|
||||
|
||||
# make a copy of the session that is used exclusively with this archiver instance
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
a={
|
||||
{
|
||||
"name": "Whisper Enricher",
|
||||
"type": ["enricher"],
|
||||
"requires_setup": True,
|
||||
|
||||
@@ -4,7 +4,6 @@ from loguru import logger
|
||||
|
||||
from auto_archiver.core import Enricher
|
||||
from auto_archiver.core import Metadata, Media
|
||||
from auto_archiver.modules.s3_storage import S3Storage
|
||||
from auto_archiver.core.module import get_module
|
||||
|
||||
class WhisperEnricher(Enricher):
|
||||
@@ -14,13 +13,17 @@ class WhisperEnricher(Enricher):
|
||||
Only works if an S3 compatible storage is used
|
||||
"""
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
storages = self.config['steps']['storages']
|
||||
if not "s3_storage" in storages:
|
||||
def setup(self, config: dict) -> None:
|
||||
super().setup(config)
|
||||
self.stores = self.config['steps']['storages']
|
||||
self.s3 = get_module("s3_storage", self.config)
|
||||
if not "s3_storage" in self.stores:
|
||||
logger.error("WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called.")
|
||||
return
|
||||
|
||||
self.s3 = get_module("s3_storage", self.config)
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
|
||||
url = to_enrich.get_url()
|
||||
logger.debug(f"WHISPER[{self.action}]: iterating media items for {url=}.")
|
||||
|
||||
|
||||
@@ -1,9 +1,7 @@
|
||||
|
||||
|
||||
import os
|
||||
import json
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
from datetime import datetime, timezone
|
||||
import requests
|
||||
from loguru import logger
|
||||
|
||||
@@ -58,5 +56,37 @@ def random_str(length: int = 32) -> str:
|
||||
assert length <= 32, "length must be less than 32 as UUID4 is used"
|
||||
return str(uuid.uuid4()).replace("-", "")[:length]
|
||||
|
||||
|
||||
def json_loader(cli_val):
    """Decode the JSON text in ``cli_val`` and return the resulting object.

    Any exception raised by ``json.loads`` for malformed input propagates
    to the caller.
    """
    decoded = json.loads(cli_val)
    return decoded
|
||||
|
||||
def get_current_datetime_iso() -> str:
    """Return the current UTC time as an ISO 8601 string with a ``+00:00`` offset."""
    # datetime.now(timezone.utc) is already timezone-aware in UTC, so the
    # former .replace(tzinfo=timezone.utc) was a no-op and has been dropped.
    return datetime.now(timezone.utc).isoformat()
|
||||
|
||||
|
||||
def get_datetime_from_str(dt_str: str, fmt: str | None = None) -> datetime | None:
|
||||
# parse a datetime string with option of passing a specific format
|
||||
try:
|
||||
return datetime.strptime(dt_str, fmt) if fmt else datetime.fromisoformat(dt_str)
|
||||
except ValueError as e:
|
||||
logger.error(f"Unable to parse datestring {dt_str}: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def get_timestamp(ts, utc=True, iso=True) -> str | datetime | None:
|
||||
# Consistent parsing of timestamps
|
||||
# If utc=True, the timezone is set to UTC,
|
||||
# if iso=True, the output is an iso string
|
||||
if not ts: return
|
||||
try:
|
||||
if isinstance(ts, str): ts = datetime.fromisoformat(ts)
|
||||
if isinstance(ts, (int, float)): ts = datetime.fromtimestamp(ts)
|
||||
if utc: ts = ts.replace(tzinfo=timezone.utc)
|
||||
if iso: return ts.isoformat()
|
||||
return ts
|
||||
except Exception as e:
|
||||
logger.error(f"Unable to parse timestamp {ts}: {e}")
|
||||
return None
|
||||
|
||||
def get_current_timestamp() -> str:
    """Return the current time as a UTC ISO 8601 string via ``get_timestamp``."""
    # Pass a timezone-aware UTC datetime: a naive datetime.now() holds local
    # wall-clock time, which get_timestamp would label as UTC without
    # converting it, yielding the wrong instant on non-UTC hosts.
    return get_timestamp(datetime.now(timezone.utc))
|
||||
@@ -103,19 +103,20 @@ def test_failed(gsheets_db, mock_metadata, mock_gworksheet):
|
||||
gsheets_db.failed(mock_metadata, reason)
|
||||
mock_gworksheet.set_cell.assert_called_once_with(1, 'status', f'Archive failed {reason}')
|
||||
|
||||
|
||||
def test_aborted(gsheets_db, mock_metadata, mock_gworksheet):
    """aborted() must issue exactly one set_cell call that blanks 'status' on row 1."""
    expected_call_args = (1, 'status', '')

    gsheets_db.aborted(mock_metadata)

    mock_gworksheet.set_cell.assert_called_once_with(*expected_call_args)
|
||||
|
||||
|
||||
def test_done(gsheets_db, metadata, mock_gworksheet, expected_calls):
|
||||
with patch.object(gsheets_db, '_get_current_datetime_iso', return_value='2025-02-01T00:00:00+00:00'):
|
||||
with patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00'):
|
||||
gsheets_db.done(metadata)
|
||||
mock_gworksheet.batch_set_cell.assert_called_once_with(expected_calls)
|
||||
|
||||
|
||||
def test_done_cached(gsheets_db, metadata, mock_gworksheet):
|
||||
with patch.object(gsheets_db, '_get_current_datetime_iso', return_value='2025-02-01T00:00:00+00:00'):
|
||||
with patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00'):
|
||||
gsheets_db.done(metadata, cached=True)
|
||||
|
||||
# Verify the status message includes "[cached]"
|
||||
@@ -126,7 +127,8 @@ def test_done_cached(gsheets_db, metadata, mock_gworksheet):
|
||||
def test_done_missing_media(gsheets_db, metadata, mock_gworksheet):
|
||||
# clear media from metadata
|
||||
metadata.media = []
|
||||
with patch.object(gsheets_db, '_get_current_datetime_iso', return_value='2025-02-01T00:00:00+00:00'):
|
||||
with patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp",
|
||||
return_value='2025-02-01T00:00:00+00:00'):
|
||||
gsheets_db.done(metadata)
|
||||
# Verify nothing media-related gets updated
|
||||
call_args = mock_gworksheet.batch_set_cell.call_args[0][0]
|
||||
|
||||
@@ -185,5 +185,4 @@ class TestInstagramAPIExtractor(TestExtractorBase):
|
||||
result = self.extractor.download_profile(metadata, "test_user")
|
||||
|
||||
assert result.is_success()
|
||||
assert "Error downloading stories for test_user" in result.metadata["errors"]
|
||||
# assert "Error downloading posts for test_user" in result.metadata["errors"]
|
||||
assert "Error downloading stories for test_user" in result.metadata["errors"]
|
||||
@@ -1,5 +1,4 @@
|
||||
import os
|
||||
import pickle
|
||||
from typing import Type
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
|
||||
@@ -7,10 +7,8 @@ from auto_archiver.modules.gsheet_feeder import GsheetsFeeder
|
||||
from auto_archiver.core import Metadata, Feeder
|
||||
|
||||
|
||||
def test_initialise_without_sheet_and_sheet_id(setup_module):
|
||||
"""Ensure initialise() raises AssertionError if neither sheet nor sheet_id is set.
|
||||
(shouldn't really be asserting in there)
|
||||
"""
|
||||
def test_setup_without_sheet_and_sheet_id(setup_module):
|
||||
# Ensure setup() raises AssertionError if neither sheet nor sheet_id is set.
|
||||
with patch("gspread.service_account"):
|
||||
with pytest.raises(AssertionError):
|
||||
setup_module(
|
||||
@@ -145,7 +143,6 @@ def test_open_sheet_with_name_or_id(
|
||||
"gsheet_feeder",
|
||||
{"service_account": "dummy.json", "sheet": sheet, "sheet_id": sheet_id},
|
||||
)
|
||||
feeder.initialise()
|
||||
sheet_result = feeder.open_sheet()
|
||||
# Validate the correct method was called
|
||||
getattr(mock_client, expected_method).assert_called_once_with(
|
||||
@@ -165,7 +162,6 @@ def test_open_sheet_with_sheet_id(setup_module):
|
||||
"gsheet_feeder",
|
||||
{"service_account": "dummy.json", "sheet": None, "sheet_id": "ABC123"},
|
||||
)
|
||||
feeder.initialise()
|
||||
sheet = feeder.open_sheet()
|
||||
mock_client.open_by_key.assert_called_once_with("ABC123")
|
||||
assert sheet == "MockSheet"
|
||||
@@ -263,7 +259,6 @@ class TestGSheetsFeederReal:
|
||||
["https://example.com", "done"],
|
||||
]
|
||||
worksheet.append_rows(test_rows)
|
||||
self.feeder.initialise()
|
||||
metadata_list = list(self.feeder)
|
||||
|
||||
# Validate that only the first row is processed
|
||||
|
||||
@@ -21,16 +21,6 @@ class TestGDriveStorage(TestStorageBase):
|
||||
'service_account': 'fake_service_account.json'
|
||||
}
|
||||
|
||||
@pytest.mark.skip(reason="Requires real credentials")
|
||||
@pytest.mark.download
|
||||
def test_initialize_with_real_credentials(self):
|
||||
"""
|
||||
Test that the Google Drive service can be initialized with real credentials.
|
||||
"""
|
||||
self.storage.service_account = 'secrets/service_account.json' # Path to real credentials
|
||||
self.storage.initialise()
|
||||
assert self.storage.service is not None
|
||||
|
||||
|
||||
def test_initialize_fails_with_non_existent_creds(self):
|
||||
"""
|
||||
@@ -38,6 +28,35 @@ class TestGDriveStorage(TestStorageBase):
|
||||
"""
|
||||
# Act and Assert
|
||||
with pytest.raises(FileNotFoundError) as exc_info:
|
||||
self.storage.initialise()
|
||||
self.storage.setup(self.config)
|
||||
assert "No such file or directory" in str(exc_info.value)
|
||||
|
||||
def test_path_parts(self):
|
||||
media = Media(filename="test.jpg")
|
||||
media.key = "folder1/folder2/test.jpg"
|
||||
|
||||
# @pytest.mark.skip(reason="Requires real credentials")
|
||||
@pytest.mark.download
class TestGDriveStorageConnected(TestStorageBase):
    """
    'Real' tests for GDriveStorage.

    The class carries the ``download`` pytest marker and its config points at
    ``secrets/service_account.json``.
    NOTE(review): presumably these tests need working Google Drive
    credentials at that path - confirm before enabling them in CI.
    """

    module_name: str = "gdrive_storage"
    storage: Type[GDriveStorage]
    config: dict = {
        "path_generator": "url",
        "filename_generator": "static",
        # TODO: replace with real root folder id
        "root_folder_id": "1TVY_oJt95_dmRSEdP9m5zFy7l50TeCSk",
        "oauth_token": None,
        "service_account": "secrets/service_account.json",
    }

    def test_initialize_with_real_credentials(self):
        """
        Test that the Google Drive service can be initialized with real credentials.
        """
        service = self.storage.service
        assert service is not None
|
||||
|
||||
|
||||
|
||||
@@ -159,3 +159,7 @@ def test_get_context():
|
||||
assert m.get_context("somekey") == "somevalue"
|
||||
assert m.get_context("anotherkey") == "anothervalue"
|
||||
assert len(m._context) == 2
|
||||
|
||||
|
||||
def test_choose_most_complete():
    """Placeholder test: no assertions are implemented yet."""
    # TODO: add real coverage; until then this test passes trivially.
    return None
|
||||
Reference in New Issue
Block a user