Merge main into timestamping_enricher

This commit is contained in:
Patrick Robertson
2025-03-24 15:09:29 +04:00
219 changed files with 11049 additions and 2933 deletions

View File

@@ -1,6 +1,7 @@
"""
pytest conftest file, for shared fixtures and configuration
"""
import os
import pickle
from datetime import datetime, timezone
@@ -16,32 +17,36 @@ from auto_archiver.core.module import ModuleFactory
# that you only want to run if everything else succeeds (e.g. API calls). The order here is important
# what comes first will be run first (at the end of all other tests not mentioned)
# format is the name of the module (python file) without the .py extension
TESTS_TO_RUN_LAST = ['test_twitter_api_archiver']
TESTS_TO_RUN_LAST = ["test_twitter_api_archiver"]
@pytest.fixture
def setup_module(request):
def _setup_module(module_name, config={}):
def _setup_module(module_name, config=None):
if config is None:
config = {}
module_factory = ModuleFactory()
if isinstance(module_name, type):
# get the module name:
# if the class does not have a .name, use the name of the parent folder
module_name = module_name.__module__.rsplit(".",2)[-2]
module_name = module_name.__module__.rsplit(".", 2)[-2]
m = module_factory.get_module(module_name, {module_name: config})
# add the tmp_dir to the module
tmp_dir = TemporaryDirectory()
m.tmp_dir = tmp_dir.name
def cleanup():
tmp_dir.cleanup()
request.addfinalizer(cleanup)
return m
return _setup_module
@pytest.fixture
def check_hash():
def _check_hash(filename: str, hash: str):
@@ -51,6 +56,7 @@ def check_hash():
return _check_hash
@pytest.fixture
def make_item():
def _make_item(url: str, **kwargs) -> Metadata:
@@ -62,7 +68,6 @@ def make_item():
return _make_item
def pytest_collection_modifyitems(items):
module_mapping = {item: item.module.__name__.split(".")[-1] for item in items}
@@ -78,13 +83,13 @@ def pytest_collection_modifyitems(items):
items[:] = sorted_items
# Incremental testing - fail tests in a class if any previous test fails
# taken from https://docs.pytest.org/en/latest/example/simple.html#incremental-testing-test-steps
# store history of failures per test class name and per index in parametrize (if parametrize used)
_test_failed_incremental: Dict[str, Dict[Tuple[int, ...], str]] = {}
def pytest_runtest_makereport(item, call):
if "incremental" in item.keywords:
# incremental marker is used
@@ -93,17 +98,11 @@ def pytest_runtest_makereport(item, call):
# retrieve the class name of the test
cls_name = str(item.cls)
# retrieve the index of the test (if parametrize is used in combination with incremental)
parametrize_index = (
tuple(item.callspec.indices.values())
if hasattr(item, "callspec")
else ()
)
parametrize_index = tuple(item.callspec.indices.values()) if hasattr(item, "callspec") else ()
# retrieve the name of the test function
test_name = item.originalname or item.name
# store in _test_failed_incremental the original name of the failed test
_test_failed_incremental.setdefault(cls_name, {}).setdefault(
parametrize_index, test_name
)
_test_failed_incremental.setdefault(cls_name, {}).setdefault(parametrize_index, test_name)
def pytest_runtest_setup(item):
@@ -119,16 +118,17 @@ def pytest_runtest_setup(item):
pytest.xfail(f"previous test failed ({test_name})")
@pytest.fixture()
@pytest.fixture
def unpickle():
    """
    Return a helper that loads a pickled object from the test data directory.

    The helper joins its ``path`` argument onto ``tests/data`` and unpickles
    the file found there.
    """

    def _unpickle(path):
        # NOTE: pickle.load must only ever see trusted files; here that means
        # fixtures committed under tests/data.
        full_path = os.path.join("tests/data", path)
        with open(full_path, "rb") as handle:
            loaded = pickle.load(handle)
        return loaded

    return _unpickle
@@ -151,9 +151,9 @@ def sample_datetime():
return datetime(2023, 1, 1, 12, 0, tzinfo=timezone.utc)
@pytest.fixture(autouse=True)
@pytest.fixture
def mock_sleep(mocker):
"""Globally mock time.sleep to avoid delays."""
"""Mock time.sleep to avoid delays."""
return mocker.patch("time.sleep")
@@ -162,4 +162,4 @@ def metadata():
metadata = Metadata()
metadata.set("_processed_at", "2021-01-01T00:00:00")
metadata.set_url("https://example.com")
return metadata
return metadata

View File

@@ -1,5 +1,6 @@
# this is a dummy class used to test importing a dropin in the
# generic extractor by filename/path
class Dropin:
pass
pass

View File

@@ -0,0 +1,11 @@
{
# Display Name of your module
"name": "Example Extractor",
# Optional version number, for your own versioning purposes
"version": 2.0,
# The type of the module, must be one (or more) of the built in module types
"type": ["extractor"],
# a boolean indicating whether or not a module requires additional user setup before it can be used
# for example: adding API keys, installing additional software etc.
"requires_setup": False,
}

View File

@@ -0,0 +1,6 @@
from auto_archiver.core import Extractor
class ExampleExtractor(Extractor):
    """Minimal Extractor subclass whose download step only prints a marker."""

    def download(self, item):
        """Print ``download``; ``item`` is not inspected and nothing is returned."""
        print("download")

View File

@@ -1 +1 @@
from .example_module import ExampleModule
from .example_module import ExampleModule

View File

@@ -16,14 +16,14 @@
"dependencies": {
"python": ["loguru"],
"bin": ["bash"],
},
# configurations that this module takes. These are argparse-compliant dictionaries, that are
},
# configurations that this module takes. These are argparse-compliant dictionaries, that are
# used to create command line arguments when the programme is run.
# The full name of the config option will become: `module_name.config_name`
"configs": {
"csv_file": {"default": "db.csv", "help": "CSV file name"},
"required_field": {"required": True, "help": "required field in the CSV file"},
},
"csv_file": {"default": "db.csv", "help": "CSV file name"},
"required_field": {"required": True, "help": "required field in the CSV file"},
},
# A description of the module, used for documentation
"description": "This is an example module",
}
}

View File

@@ -1,5 +1,6 @@
from auto_archiver.core import Extractor, Enricher, Feeder, Database, Storage, Formatter, Metadata
class ExampleModule(Extractor, Enricher, Feeder, Database, Storage, Formatter):
def download(self, item):
print("download")
@@ -7,7 +8,6 @@ class ExampleModule(Extractor, Enricher, Feeder, Database, Storage, Formatter):
def __iter__(self):
yield Metadata().set_url("https://example.com")
def done(self, result):
print("done")
@@ -16,13 +16,12 @@ class ExampleModule(Extractor, Enricher, Feeder, Database, Storage, Formatter):
def get_cdn_url(self, media):
return "nice_url"
def save(self, item):
print("save")
def uploadf(self, file, key, **kwargs):
print("uploadf")
def format(self, item):
print("format")

View File

@@ -1,6 +1,5 @@
import pytest
from auto_archiver.core import Metadata
from auto_archiver.modules.api_db import AAApiDb
@@ -41,9 +40,16 @@ def test_fetch(api_db, metadata, mocker):
mock_datetime = mocker.patch("auto_archiver.core.metadata.datetime.datetime")
mock_datetime.now.return_value = "2021-01-01T00:00:00"
mock_get.return_value.status_code = 200
mock_get.return_value.json.return_value = [{"result": {}}, {"result":
{'media': [], 'metadata': {'_processed_at': '2021-01-01T00:00:00', 'url': 'https://example.com'},
'status': 'no archiver'}}]
mock_get.return_value.json.return_value = [
{"result": {}},
{
"result": {
"media": [],
"metadata": {"_processed_at": "2021-01-01T00:00:00", "url": "https://example.com"},
"status": "no archiver",
}
},
]
assert api_db.fetch(metadata) == metadata
@@ -52,8 +58,15 @@ def test_done_success(api_db, metadata, mocker):
mock_post.return_value.status_code = 201
api_db.done(metadata)
mock_post.assert_called_once()
mock_post.assert_called_once_with("https://api.example.com/interop/submit-archive",
json={'author_id': 'Someone', 'url': 'https://example.com',
'public': False, 'group_id': '123', 'tags': ['[', ']'], 'result': '{"status": "no archiver", "metadata": {"_processed_at": "2021-01-01T00:00:00", "url": "https://example.com"}, "media": []}'},
headers={'Authorization': 'Bearer test-token'})
mock_post.assert_called_once_with(
"https://api.example.com/interop/submit-archive",
json={
"author_id": "Someone",
"url": "https://example.com",
"public": False,
"group_id": "123",
"tags": ["[", "]"],
"result": '{"status": "no archiver", "metadata": {"_processed_at": "2021-01-01T00:00:00", "url": "https://example.com"}, "media": []}',
},
headers={"Authorization": "Bearer test-token"},
)

View File

@@ -2,7 +2,7 @@ import pytest
from datetime import datetime
from auto_archiver.core import Metadata
from auto_archiver.modules.atlos_db import AtlosDb
from auto_archiver.modules.atlos_feeder_db_storage import AtlosFeederDbStorage as AtlosDb
class FakeAPIResponse:
@@ -12,19 +12,28 @@ class FakeAPIResponse:
self._data = data
self.raise_error = raise_error
def json(self) -> dict:
return self._data
def raise_for_status(self) -> None:
if self.raise_error:
raise Exception("HTTP error")
@pytest.fixture
def atlos_db(setup_module) -> AtlosDb:
def atlos_db(setup_module, mocker) -> AtlosDb:
"""Fixture for AtlosDb."""
configs: dict = {
"api_token": "abc123",
"atlos_url": "https://platform.atlos.org",
}
return setup_module("atlos_db", configs)
mocker.patch("requests.Session")
atlos_feeder = setup_module("atlos_feeder_db_storage", configs)
fake_session = mocker.MagicMock()
# Configure the default response to have no results so that __iter__ terminates
fake_session.get.return_value = FakeAPIResponse({"next": None, "results": []})
atlos_feeder.session = fake_session
return atlos_feeder
def test_failed_no_atlos_id(atlos_db, metadata, mocker):
@@ -38,25 +47,18 @@ def test_failed_with_atlos_id(atlos_db, metadata, mocker):
"""Test failed() posts failure when atlos_id is present."""
metadata.set("atlos_id", 42)
fake_resp = FakeAPIResponse({}, raise_error=False)
post_mock = mocker.patch("requests.post", return_value=fake_resp)
post_mock = mocker.patch.object(atlos_db, "_post", return_value=fake_resp)
atlos_db.failed(metadata, "failure reason")
expected_url = (
f"{atlos_db.atlos_url}/api/v2/source_material/metadata/42/auto_archiver"
)
expected_headers = {"Authorization": f"Bearer {atlos_db.api_token}"}
expected_json = {
"metadata": {"processed": True, "status": "error", "error": "failure reason"}
}
post_mock.assert_called_once_with(
expected_url, headers=expected_headers, json=expected_json
)
expected_endpoint = "/api/v2/source_material/metadata/42/auto_archiver"
expected_json = {"metadata": {"processed": True, "status": "error", "error": "failure reason"}}
post_mock.assert_called_once_with(expected_endpoint, json=expected_json)
def test_failed_http_error(atlos_db, metadata, mocker):
"""Test failed() raises exception on HTTP error."""
metadata.set("atlos_id", 42)
fake_resp = FakeAPIResponse({}, raise_error=True)
mocker.patch("requests.post", return_value=fake_resp)
# Patch _post to raise an exception instead of returning a fake response.
mocker.patch.object(atlos_db, "_post", side_effect=Exception("HTTP error"))
with pytest.raises(Exception, match="HTTP error"):
atlos_db.failed(metadata, "failure reason")
@@ -81,12 +83,9 @@ def test_done_with_atlos_id(atlos_db, metadata, mocker):
now = datetime.now()
metadata.set("timestamp", now)
fake_resp = FakeAPIResponse({}, raise_error=False)
post_mock = mocker.patch("requests.post", return_value=fake_resp)
post_mock = mocker.patch.object(atlos_db, "_post", return_value=fake_resp)
atlos_db.done(metadata)
expected_url = (
f"{atlos_db.atlos_url}/api/v2/source_material/metadata/99/auto_archiver"
)
expected_headers = {"Authorization": f"Bearer {atlos_db.api_token}"}
expected_endpoint = "/api/v2/source_material/metadata/99/auto_archiver"
expected_results = metadata.metadata.copy()
expected_results["timestamp"] = now.isoformat()
expected_json = {
@@ -96,15 +95,13 @@ def test_done_with_atlos_id(atlos_db, metadata, mocker):
"results": expected_results,
}
}
post_mock.assert_called_once_with(
expected_url, headers=expected_headers, json=expected_json
)
post_mock.assert_called_once_with(expected_endpoint, json=expected_json)
def test_done_http_error(atlos_db, metadata, mocker):
"""Test done() raises exception on HTTP error."""
"""Test done() raises an exception on HTTP error."""
metadata.set("atlos_id", 123)
fake_resp = FakeAPIResponse({}, raise_error=True)
mocker.patch("requests.post", return_value=fake_resp)
# Patch _post to raise an exception.
mocker.patch.object(atlos_db, "_post", side_effect=Exception("HTTP error"))
with pytest.raises(Exception, match="HTTP error"):
atlos_db.done(metadata)

View File

@@ -1,4 +1,3 @@
from auto_archiver.modules.csv_db import CSVDb
from auto_archiver.core import Metadata
@@ -9,12 +8,21 @@ def test_store_item(tmp_path, setup_module):
temp_db = tmp_path / "temp_db.csv"
db = setup_module(CSVDb, {"csv_file": temp_db.as_posix()})
item = Metadata().set_url("http://example.com").set_title("Example").set_content("Example content").success("my-archiver")
item = (
Metadata()
.set_url("http://example.com")
.set_title("Example")
.set_content("Example content")
.success("my-archiver")
)
db.done(item)
with open(temp_db, "r", encoding="utf-8") as f:
assert f.read().strip() == f"status,metadata,media\nmy-archiver: success,\"{{'_processed_at': {repr(item.get('_processed_at'))}, 'url': 'http://example.com', 'title': 'Example', 'content': 'Example content'}}\",[]"
assert (
f.read().strip()
== f"status,metadata,media\nmy-archiver: success,\"{{'_processed_at': {repr(item.get('_processed_at'))}, 'url': 'http://example.com', 'title': 'Example', 'content': 'Example content'}}\",[]"
)
# TODO: csv db doesn't have a fetch method - need to add it (?)
# assert db.fetch(item) == item
# assert db.fetch(item) == item

View File

@@ -2,8 +2,7 @@ from datetime import datetime, timezone
import pytest
from auto_archiver.core import Metadata, Media
from auto_archiver.modules.gsheet_db import GsheetsDb
from auto_archiver.modules.gsheet_feeder import GWorksheet
from auto_archiver.modules.gsheet_feeder_db import GsheetsFeederDB, GWorksheet
@pytest.fixture
@@ -29,6 +28,7 @@ def mock_metadata(mocker):
metadata.get_first_image.return_value = None
return metadata
@pytest.fixture
def metadata():
metadata = Metadata()
@@ -52,13 +52,36 @@ def mock_media(mocker):
mock_media.get.return_value = "not-calculated"
return mock_media
@pytest.fixture
def gsheets_db(mock_gworksheet, setup_module, mocker):
db = setup_module("gsheet_db", {
"allow_worksheets": "set()",
"block_worksheets": "set()",
"use_sheet_names_in_stored_paths": "True",
})
def gsheets_db(mock_gworksheet, setup_module, mocker) -> GsheetsFeederDB:
mocker.patch("gspread.service_account")
config: dict = {
"sheet": "testsheet",
"sheet_id": None,
"header": 1,
"service_account": "test/service_account.json",
"columns": {
"url": "link",
"status": "archive status",
"folder": "destination folder",
"archive": "archive location",
"date": "archive date",
"thumbnail": "thumbnail",
"timestamp": "upload timestamp",
"title": "upload title",
"text": "text content",
"screenshot": "screenshot",
"hash": "hash",
"pdq_hash": "perceptual hashes",
"wacz": "wacz",
"replaywebpage": "replaywebpage",
},
"allow_worksheets": set(),
"block_worksheets": set(),
"use_sheet_names_in_stored_paths": True,
}
db = setup_module("gsheet_feeder_db", config)
db._retrieve_gsheet = mocker.MagicMock(return_value=(mock_gworksheet, 1))
return db
@@ -72,20 +95,21 @@ def fixed_timestamp():
@pytest.fixture
def expected_calls(mock_media, fixed_timestamp):
"""Fixture for the expected cell updates."""
return [
(1, 'status', 'my-archiver: success'),
(1, 'archive', 'http://example.com/screenshot.png'),
(1, 'date', '2025-02-01T00:00:00+00:00'),
(1, 'title', 'Example Title'),
(1, 'text', 'Example Content'),
(1, 'timestamp', '2025-01-01T00:00:00+00:00'),
(1, 'hash', 'not-calculated'),
return [
(1, "status", "my-archiver: success"),
(1, "archive", "http://example.com/screenshot.png"),
(1, "date", "2025-02-01T00:00:00+00:00"),
(1, "title", "Example Title"),
(1, "text", "Example Content"),
(1, "timestamp", "2025-01-01T00:00:00+00:00"),
(1, "hash", "not-calculated"),
# (1, 'screenshot', 'http://example.com/screenshot.png'),
# (1, 'thumbnail', '=IMAGE("http://example.com/thumbnail.png")'),
# (1, 'wacz', 'http://example.com/browsertrix.wacz'),
# (1, 'replaywebpage', 'https://replayweb.page/?source=http%3A%2F%2Fexample.com%2Fbrowsertrix.wacz#view=pages&url=')
]
def test_retrieve_gsheet(gsheets_db, metadata, mock_gworksheet):
gw, row = gsheets_db._retrieve_gsheet(metadata)
assert gw == mock_gworksheet
@@ -94,27 +118,34 @@ def test_retrieve_gsheet(gsheets_db, metadata, mock_gworksheet):
def test_started(gsheets_db, mock_metadata, mock_gworksheet):
gsheets_db.started(mock_metadata)
mock_gworksheet.set_cell.assert_called_once_with(1, 'status', 'Archive in progress')
mock_gworksheet.set_cell.assert_called_once_with(1, "status", "Archive in progress")
def test_failed(gsheets_db, mock_metadata, mock_gworksheet):
reason = "Test failure"
gsheets_db.failed(mock_metadata, reason)
mock_gworksheet.set_cell.assert_called_once_with(1, 'status', f'Archive failed {reason}')
mock_gworksheet.set_cell.assert_called_once_with(1, "status", f"Archive failed {reason}")
def test_aborted(gsheets_db, mock_metadata, mock_gworksheet):
gsheets_db.aborted(mock_metadata)
mock_gworksheet.set_cell.assert_called_once_with(1, 'status', '')
mock_gworksheet.set_cell.assert_called_once_with(1, "status", "")
def test_done(gsheets_db, metadata, mock_gworksheet, expected_calls, mocker):
mocker.patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00')
mocker.patch(
"auto_archiver.modules.gsheet_feeder_db.gsheet_feeder_db.get_current_timestamp",
return_value="2025-02-01T00:00:00+00:00",
)
gsheets_db.done(metadata)
mock_gworksheet.batch_set_cell.assert_called_once_with(expected_calls)
def test_done_cached(gsheets_db, metadata, mock_gworksheet, mocker):
mocker.patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00')
mocker.patch(
"auto_archiver.modules.gsheet_feeder_db.gsheet_feeder_db.get_current_timestamp",
return_value="2025-02-01T00:00:00+00:00",
)
gsheets_db.done(metadata, cached=True)
# Verify the status message includes "[cached]"
@@ -125,15 +156,17 @@ def test_done_cached(gsheets_db, metadata, mock_gworksheet, mocker):
def test_done_missing_media(gsheets_db, metadata, mock_gworksheet, mocker):
# clear media from metadata
metadata.media = []
mocker.patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00')
mocker.patch(
"auto_archiver.modules.gsheet_feeder_db.gsheet_feeder_db.get_current_timestamp",
return_value="2025-02-01T00:00:00+00:00",
)
gsheets_db.done(metadata)
# Verify nothing media-related gets updated
call_args = mock_gworksheet.batch_set_cell.call_args[0][0]
media_fields = {'archive', 'screenshot', 'thumbnail', 'wacz', 'replaywebpage'}
media_fields = {"archive", "screenshot", "thumbnail", "wacz", "replaywebpage"}
assert all(call[1] not in media_fields for call in call_args)
def test_safe_status_update(gsheets_db, metadata, mock_gworksheet):
gsheets_db._safe_status_update(metadata, "Test status")
mock_gworksheet.set_cell.assert_called_once_with(1, 'status', 'Test status')
mock_gworksheet.set_cell.assert_called_once_with(1, "status", "Test status")

View File

@@ -4,34 +4,50 @@ from auto_archiver.modules.hash_enricher import HashEnricher
from auto_archiver.core import Metadata, Media
from auto_archiver.core.module import ModuleFactory
@pytest.mark.parametrize("algorithm, filename, expected_hash", [
("SHA-256", "tests/data/testfile_1.txt", "1b4f0e9851971998e732078544c96b36c3d01cedf7caa332359d6f1d83567014"),
("SHA-256", "tests/data/testfile_2.txt", "60303ae22b998861bce3b28f33eec1be758a213c86c93c076dbe9f558c11c752"),
("SHA3-512", "tests/data/testfile_1.txt", "d2d8cc4f369b340130bd2b29b8b54e918b7c260c3279176da9ccaa37c96eb71735fc97568e892dc6220bf4ae0d748edb46bd75622751556393be3f482e6f794e"),
("SHA3-512", "tests/data/testfile_2.txt", "e35970edaa1e0d8af7d948491b2da0450a49fd9cc1e83c5db4c6f175f9550cf341f642f6be8cfb0bfa476e4258e5088c5ad549087bf02811132ac2fa22b734c6")
])
@pytest.mark.parametrize(
"algorithm, filename, expected_hash",
[
("SHA-256", "tests/data/testfile_1.txt", "1b4f0e9851971998e732078544c96b36c3d01cedf7caa332359d6f1d83567014"),
("SHA-256", "tests/data/testfile_2.txt", "60303ae22b998861bce3b28f33eec1be758a213c86c93c076dbe9f558c11c752"),
(
"SHA3-512",
"tests/data/testfile_1.txt",
"d2d8cc4f369b340130bd2b29b8b54e918b7c260c3279176da9ccaa37c96eb71735fc97568e892dc6220bf4ae0d748edb46bd75622751556393be3f482e6f794e",
),
(
"SHA3-512",
"tests/data/testfile_2.txt",
"e35970edaa1e0d8af7d948491b2da0450a49fd9cc1e83c5db4c6f175f9550cf341f642f6be8cfb0bfa476e4258e5088c5ad549087bf02811132ac2fa22b734c6",
),
],
)
def test_calculate_hash(algorithm, filename, expected_hash, setup_module):
    """Check calculate_hash() against a known digest for the given algorithm and file.

    The enricher is configured with a chunksize of 100 bytes.
    """
    enricher_config = {"algorithm": algorithm, "chunksize": 100}
    hash_enricher = setup_module(HashEnricher, enricher_config)
    actual_hash = hash_enricher.calculate_hash(filename)
    assert actual_hash == expected_hash
def test_default_config_values(setup_module):
    """A HashEnricher set up without explicit config exposes the default settings."""
    hash_enricher = setup_module(HashEnricher)
    observed = (hash_enricher.algorithm, hash_enricher.chunksize)
    assert observed[0] == "SHA-256"
    assert observed[1] == 16000000
def test_config():
# test default config
c = ModuleFactory().get_module_lazy('hash_enricher').configs
c = ModuleFactory().get_module_lazy("hash_enricher").configs
assert c["algorithm"]["default"] == "SHA-256"
assert c["chunksize"]["default"] == 16000000
assert c["algorithm"]["choices"] == ["SHA-256", "SHA3-512"]
assert c["algorithm"]["help"] == "hash algorithm to use"
assert c["chunksize"]["help"] == "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"
assert (
c["chunksize"]["help"]
== "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"
)
def test_hash_media(setup_module):
he = setup_module(HashEnricher, {"algorithm": "SHA-256", "chunksize": 1})
# generate metadata with two test files
@@ -46,4 +62,4 @@ def test_hash_media(setup_module):
he.enrich(m)
assert m.media[0].get("hash") == "SHA-256:1b4f0e9851971998e732078544c96b36c3d01cedf7caa332359d6f1d83567014"
assert m.media[1].get("hash") == "SHA-256:60303ae22b998861bce3b28f33eec1be758a213c86c93c076dbe9f558c11c752"
assert m.media[1].get("hash") == "SHA-256:60303ae22b998861bce3b28f33eec1be758a213c86c93c076dbe9f558c11c752"

View File

@@ -1,4 +1,3 @@
import datetime
from datetime import datetime, timedelta, timezone
import pytest
@@ -16,6 +15,7 @@ def mock_metadata(mocker):
mock.get_all_media.return_value = []
return mock
@pytest.fixture
def mock_media(mocker):
"""Creates a mock Media object."""
@@ -59,6 +59,7 @@ def test_enrich_file_sizes(meta_enricher, metadata, tmp_path):
assert metadata.get("total_bytes") == 3000
assert metadata.get("total_size") == "2.9 KB"
@pytest.mark.parametrize(
"size, expected",
[
@@ -74,6 +75,7 @@ def test_human_readable_bytes(size, expected):
enricher = MetaEnricher()
assert enricher.human_readable_bytes(size) == expected
def test_enrich_file_sizes_no_media(meta_enricher, metadata):
"""Test that enrich_file_sizes() handles empty media list gracefully."""
meta_enricher.enrich_file_sizes(metadata)
@@ -91,4 +93,4 @@ def test_enrich_archive_duration(meta_enricher, metadata, mocker):
mock_datetime.now.return_value = mock_now
meta_enricher.enrich_archive_duration(metadata)
assert metadata.get("archive_duration_seconds") == 630
assert metadata.get("archive_duration_seconds") == 630

View File

@@ -1,4 +1,3 @@
import pytest
from auto_archiver.core import Media
@@ -33,9 +32,7 @@ def test_get_metadata(enricher, output, expected, mocker):
result = enricher.get_metadata("test.jpg")
assert result == expected
mock_run.assert_called_once_with(
["exiftool", "test.jpg"], capture_output=True, text=True
)
mock_run.assert_called_once_with(["exiftool", "test.jpg"], capture_output=True, text=True)
def test_get_metadata_exiftool_not_found(enricher, mocker):
@@ -85,4 +82,3 @@ def test_metadata_pickle(enricher, unpickle, mocker):
actual_media = metadata.media
assert len(expected_media) == len(actual_media)
assert actual_media[0].properties.get("metadata") == expected_media[0].properties.get("metadata")

View File

@@ -0,0 +1,276 @@
import pytest
import hashlib
from opentimestamps.core.timestamp import Timestamp, DetachedTimestampFile
from opentimestamps.calendar import RemoteCalendar
from opentimestamps.core.notary import PendingAttestation, BitcoinBlockHeaderAttestation
from auto_archiver.core import Metadata, Media
# TODO: Remove once timestamping overhaul is merged
@pytest.fixture
def sample_media(tmp_path) -> Media:
    """Provide a Media object whose backing file lives under pytest's tmp_path.

    ``source.txt`` is written with placeholder text and the Media is keyed as
    ``subdir/test.txt``.
    """
    source_path = tmp_path / "source.txt"
    source_path.write_text("test content")
    media = Media(_key="subdir/test.txt", filename=str(source_path))
    return media
@pytest.fixture
def sample_file_path(tmp_path):
    """Write a small text file under tmp_path and return its path as a string."""
    target = tmp_path / "test.txt"
    target.write_text("This is a test file content for OpenTimestamps")
    return str(target)
@pytest.fixture
def detached_timestamp_file():
    """Build a detached timestamp file holding one pending and one Bitcoin attestation."""
    from opentimestamps.core.op import OpSHA256

    digest = hashlib.sha256(b"Test content").digest()
    stamp = Timestamp(digest)

    # One attestation of each kind: a pending calendar entry and a Bitcoin
    # block header (783000 is just a sample block height).
    stamp.attestations.add(PendingAttestation("https://example.calendar.com"))
    stamp.attestations.add(BitcoinBlockHeaderAttestation(783000))

    return DetachedTimestampFile(OpSHA256(), stamp)
@pytest.fixture
def verified_timestamp_file():
    """Build a detached timestamp file whose only attestation is a Bitcoin block header."""
    from opentimestamps.core.op import OpSHA256

    digest = hashlib.sha256(b"Verified content").digest()
    stamp = Timestamp(digest)

    # Only a Bitcoin attestation, no pending ones (783000 is a sample block height).
    stamp.attestations.add(BitcoinBlockHeaderAttestation(783000))

    return DetachedTimestampFile(OpSHA256(), stamp)
@pytest.fixture
def pending_timestamp_file():
    """Build a detached timestamp file that carries two pending attestations and nothing else."""
    from opentimestamps.core.op import OpSHA256

    digest = hashlib.sha256(b"Pending content").digest()
    stamp = Timestamp(digest)

    # Two pending calendar attestations, deliberately no Bitcoin attestation.
    calendar_uris = ("https://example1.calendar.com", "https://example2.calendar.com")
    for uri in calendar_uris:
        stamp.attestations.add(PendingAttestation(uri))

    return DetachedTimestampFile(OpSHA256(), stamp)
@pytest.mark.download
def test_download_tsr(setup_module, mocker):
    """Submit a digest through a RemoteCalendar whose submit() is patched.

    The patched submit() hands back a prepared Timestamp, so no request is
    sent to the calendar URL.
    """
    expected = Timestamp(hashlib.sha256(b"test").digest())
    submit_mock = mocker.patch.object(RemoteCalendar, "submit", return_value=expected)

    remote = RemoteCalendar("https://alice.btc.calendar.opentimestamps.org")
    digest = hashlib.sha256(b"Test file content").digest()
    returned = remote.submit(digest)

    assert submit_mock.called
    assert isinstance(returned, Timestamp)
    assert returned == expected
def test_verify_timestamp(setup_module, detached_timestamp_file):
    """verify_timestamp() reports both attestations of a mixed pending/Bitcoin timestamp."""
    enricher = setup_module("opentimestamps_enricher")

    info = enricher.verify_timestamp(detached_timestamp_file)
    attestations = info["attestations"]

    # Summary fields
    assert info["attestation_count"] == 2
    assert info["verified"] is True
    assert len(attestations) == 2

    # Both kinds of attestation must be represented
    statuses = [entry["status"] for entry in attestations]
    assert "pending" in statuses
    assert "confirmed" in statuses

    # The confirmed entry carries the block height set by the fixture
    confirmed = next(entry for entry in attestations if entry["status"] == "confirmed")
    assert confirmed["block_height"] == 783000
def test_verify_pending_only(setup_module, pending_timestamp_file):
    """A timestamp with only pending attestations is reported as not verified."""
    enricher = setup_module("opentimestamps_enricher")
    info = enricher.verify_timestamp(pending_timestamp_file)
    attestations = info["attestations"]

    assert info["attestation_count"] == 2
    assert info["verified"] is False

    # Every attestation is pending ...
    for entry in attestations:
        assert entry["status"] == "pending"

    # ... and both calendar URIs from the fixture are reported
    reported_uris = [entry["uri"] for entry in attestations]
    for expected_uri in ("https://example1.calendar.com", "https://example2.calendar.com"):
        assert expected_uri in reported_uris
def test_verify_bitcoin_completed(setup_module, verified_timestamp_file):
    """A timestamp with a single Bitcoin attestation is reported as verified.

    Checks the summary fields, that no attestation is still pending, and that
    the one attestation is confirmed at the block height set by the fixture.
    """
    ots = setup_module("opentimestamps_enricher")

    verification_info = ots.verify_timestamp(verified_timestamp_file)
    attestations = verification_info["attestations"]

    assert verification_info["attestation_count"] == 1
    assert verification_info["verified"] is True

    # Original check: only looks at the top-level keys of the result dict.
    assert "pending" not in verification_info
    # Attestation-level check: none of the reported attestations may be pending.
    assert all(a["status"] != "pending" for a in attestations)

    # Check that the attestation is a Bitcoin attestation
    attestation = attestations[0]
    assert attestation["status"] == "confirmed"
    assert attestation["block_height"] == 783000
def test_full_enriching(setup_module, sample_file_path, sample_media, mocker):
    """Run enrich() end to end with calendar submission stubbed out.

    RemoteCalendar.submit is patched to return a fresh Timestamp with a
    Bitcoin attestation for each digest, so no network request is made.
    """

    def _stamp_with_bitcoin(digest):
        """Return a new Timestamp for ``digest`` carrying a Bitcoin attestation."""
        stamp = Timestamp(digest)
        stamp.attestations.add(BitcoinBlockHeaderAttestation(783000))
        return stamp

    mocker.patch.object(RemoteCalendar, "submit", side_effect=_stamp_with_bitcoin)
    enricher = setup_module("opentimestamps_enricher")

    # Metadata with a single media item pointing at the sample file
    item = Metadata().set_url("https://example.com")
    sample_media.filename = sample_file_path
    item.add_media(sample_media)

    enricher.enrich(item)

    # Item-level results
    assert item.get("opentimestamped") is True
    assert item.get("opentimestamps_count") == 1

    # Still exactly one top-level media item (the original), now flagged
    assert len(item.media) == 1
    original = item.media[0]
    assert original.get("opentimestamps") is True

    # The timestamp file is attached to the original media, not added alongside it
    stamp_files = original.get("opentimestamp_files")
    assert len(stamp_files) == 1
    stamp_media = stamp_files[0]
    assert stamp_media.get("opentimestamps_version") is not None

    # Verification results recorded on the timestamp media
    assert stamp_media.get("verified") is True
    assert stamp_media.get("attestation_count") == 1
def test_full_enriching_one_calendar_error(
    setup_module, sample_file_path, sample_media, mocker, pending_timestamp_file
):
    """Enrichment still succeeds when one of two calendar servers fails.

    The patched RemoteCalendar.submit returns a Bitcoin-attested Timestamp on
    its first call and raises on its second.
    """
    good_stamp = Timestamp(bytes.fromhex("583988e03646c26fa290c5c2408540a2f4e2aa9be087aa4546aefb531385b935"))
    good_stamp.attestations.add(BitcoinBlockHeaderAttestation(783000))

    # First submission succeeds, second one fails
    submit_outcomes = [good_stamp, Exception("Calendar server error")]
    mocker.patch.object(RemoteCalendar, "submit", side_effect=submit_outcomes)

    enricher_config = {
        "calendar_urls": [
            "https://alice.btc.calendar.opentimestamps.org",
            "https://bob.btc.calendar.opentimestamps.org",
        ]
    }
    enricher = setup_module("opentimestamps_enricher", enricher_config)

    # Metadata with a single media item pointing at the sample file
    item = Metadata().set_url("https://example.com")
    sample_media.filename = sample_file_path
    item.add_media(sample_media)

    # Must complete despite the failing calendar
    enricher.enrich(item)

    assert item.get("opentimestamped") is True
    assert item.get("opentimestamps_count") == 1  # only alice worked, not bob
def test_full_enriching_calendar_error(setup_module, sample_file_path, sample_media, mocker):
    """Enrichment completes, but records no timestamps, when every submission raises."""
    # Every call to RemoteCalendar.submit raises.
    mocker.patch.object(RemoteCalendar, "submit", side_effect=Exception("Calendar server error"))

    ots = setup_module("opentimestamps_enricher")

    # Build an item holding the sample file.
    sample_media.filename = sample_file_path
    metadata = Metadata().set_url("https://example.com")
    metadata.add_media(sample_media)

    # Must not propagate the calendar errors.
    ots.enrich(metadata)

    assert metadata.get("opentimestamped") is False
    assert metadata.get("opentimestamps_count") is None
def test_no_files_to_stamp(setup_module):
    """Enriching an item without media leaves it unstamped and still empty."""
    ots = setup_module("opentimestamps_enricher")

    # An item with a URL but no media attached.
    empty_item = Metadata().set_url("https://example.com")
    ots.enrich(empty_item)

    # Nothing was stamped and nothing was added.
    assert empty_item.get("opentimestamped") is None
    assert len(empty_item.media) == 0

View File

@@ -14,23 +14,21 @@ def enricher(setup_module):
def metadata_with_images():
m = Metadata()
m.set_url("https://example.com")
m.add_media(Media(filename="image1.jpg", key="image1"))
m.add_media(Media(filename="image2.jpg", key="image2"))
m.add_media(Media(filename="image1.jpg", _key="image1"))
m.add_media(Media(filename="image2.jpg", _key="image2"))
return m
def test_successful_enrich(metadata_with_images, mocker):
    """Every media item gets a ``pdq_hash`` once the enricher has run.

    ``pdqhash.compute``, ``PIL.Image.open`` and ``Media.is_image`` are all
    patched, so no real image file is read.
    """
    # Patches are applied directly: the mocker fixture undoes them at teardown,
    # so wrapping them in a ``with`` block is unnecessary.
    mocker.patch("pdqhash.compute", return_value=([1, 0, 1, 0] * 64, 100))
    mocker.patch("PIL.Image.open")
    mocker.patch.object(Media, "is_image", return_value=True)

    # Enrich exactly once.
    pdq_enricher = PdqHashEnricher()
    pdq_enricher.enrich(metadata_with_images)

    # Ensure the hash is set for image media
    for media in metadata_with_images.media:
        assert media.get("pdq_hash") is not None
def test_enrich_skip_non_image(metadata_with_images, mocker):
@@ -59,7 +57,7 @@ def test_enrich_handles_corrupted_image(metadata_with_images, mocker):
("screenshot", False),
("warc-file-123", False),
("regular-image", True),
]
],
)
def test_enrich_excludes_by_filetype(media_id, should_have_hash, mocker):
metadata = Metadata()
@@ -75,4 +73,3 @@ def test_enrich_excludes_by_filetype(media_id, should_have_hash, mocker):
media_item = metadata.media[0]
assert (media_item.get("pdq_hash") is not None) == should_have_hash

View File

@@ -15,13 +15,15 @@ def mock_selenium_env(mocker):
mock_which = mocker.patch("shutil.which")
mock_driver_class = mocker.patch("auto_archiver.utils.webdriver.CookieSettingDriver")
mock_binary_paths = mocker.patch("selenium.webdriver.common.selenium_manager.SeleniumManager.binary_paths")
mock_is_file = mocker.patch("pathlib.Path.is_file", return_value=True)
mocker.patch("pathlib.Path.is_file", return_value=True)
mock_popen = mocker.patch("subprocess.Popen")
mock_is_connectable = mocker.patch("selenium.webdriver.common.service.Service.is_connectable", return_value=True)
mocker.patch("selenium.webdriver.common.service.Service.is_connectable", return_value=True)
mock_firefox_options = mocker.patch("selenium.webdriver.FirefoxOptions")
# Define side effect for `shutil.which`
def mock_which_side_effect(dep):
return "/mock/geckodriver" if dep == "geckodriver" else None
mock_which.side_effect = mock_which_side_effect
# Mock binary paths
@@ -83,8 +85,8 @@ def test_enrich_adds_screenshot(
mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env
screenshot_enricher.enrich(metadata_with_video)
mock_driver_class.assert_called_once_with(
cookies=None,
cookiejar=None,
cookie=None,
cookie_jar=None,
facebook_accept_cookies=False,
options=mock_options_instance,
)
@@ -104,13 +106,7 @@ def test_enrich_adds_screenshot(
],
)
def test_enrich_auth_wall(
screenshot_enricher,
metadata_with_video,
mock_selenium_env,
common_patches,
url,
is_auth,
mocker
screenshot_enricher, metadata_with_video, mock_selenium_env, common_patches, url, is_auth, mocker
):
# Testing with and without is_auth_wall
mock_driver, mock_driver_class, _ = mock_selenium_env
@@ -128,9 +124,39 @@ def test_enrich_auth_wall(
assert metadata_with_video.media[1].properties.get("id") == "screenshot"
def test_handle_timeout_exception(
screenshot_enricher, metadata_with_video, mock_selenium_env, mocker
):
def test_skip_authwall_no_cookies(screenshot_enricher, caplog):
    """The enricher logs a skip warning for an instagram.com URL when no credentials are set.

    NOTE(review): ``is_auth_wall`` is not mocked here, so this presumably relies
    on the real helper treating instagram.com as an auth wall -- confirm.
    """
    item = Metadata().set_url("https://instagram.com")
    with caplog.at_level("WARNING"):
        screenshot_enricher.enrich(item)
    assert "[SKIP] SCREENSHOT since url" in caplog.text
@pytest.mark.parametrize(
    "auth",
    [{"cookie": "cookie"}, {"cookies_jar": "cookie"}],
)
def test_dont_skip_authwall_with_cookies(screenshot_enricher, caplog, mocker, mock_selenium_env, auth):
    """With cookie-style credentials configured, no auth-wall skip warning is logged."""
    # Make every URL look like it sits behind an auth wall.
    mocker.patch("auto_archiver.utils.url.is_auth_wall", return_value=True)
    # Replace the authentication mapping with the parametrized credentials.
    screenshot_enricher.authentication = {"example.com": auth}

    target = Metadata().set_url("https://example.com")
    with caplog.at_level("WARNING"):
        screenshot_enricher.enrich(target)

    assert "[SKIP] SCREENSHOT since url" not in caplog.text
def test_show_warning_wrong_auth_type(screenshot_enricher, caplog, mocker, mock_selenium_env):
    """Username/password credentials trigger the cookie-only authentication warning.

    ``mock_selenium_env`` is requested only for the patches it installs; its
    return value is not needed, so it is not unpacked.
    """
    # Make every URL look like it sits behind an auth wall.
    mocker.patch("auto_archiver.utils.url.is_auth_wall", return_value=True)
    # Credentials that are not cookie-based.
    screenshot_enricher.authentication = {"example.com": {"username": "user", "password": "pass"}}

    with caplog.at_level("WARNING"):
        screenshot_enricher.enrich(Metadata().set_url("https://example.com"))

    assert "Screenshot enricher only supports cookie-type authentication" in caplog.text
def test_handle_timeout_exception(screenshot_enricher, metadata_with_video, mock_selenium_env, mocker):
mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env
mock_driver.get.side_effect = TimeoutException
@@ -140,9 +166,7 @@ def test_handle_timeout_exception(
assert len(metadata_with_video.media) == 1
def test_handle_general_exception(
screenshot_enricher, metadata_with_video, mock_selenium_env, mocker
):
def test_handle_general_exception(screenshot_enricher, metadata_with_video, mock_selenium_env, mocker):
"""Test proper handling of unexpected general exceptions"""
mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env
# Simulate a generic exception when save_screenshot is called
@@ -152,9 +176,7 @@ def test_handle_general_exception(
mock_log = mocker.patch("loguru.logger.error")
screenshot_enricher.enrich(metadata_with_video)
# Verify that the exception was logged with the log
mock_log.assert_called_once_with(
"Got error while loading webdriver for screenshot enricher: Unexpected Error"
)
mock_log.assert_called_once_with("Got error while loading webdriver for screenshot enricher: Unexpected Error")
# And no new media was added due to the error
assert len(metadata_with_video.media) == 1
@@ -167,13 +189,12 @@ def test_pdf_creation(mocker, screenshot_enricher, metadata_with_video, mock_sel
# Mock the print_page method to return base64-encoded content
mock_driver.print_page.return_value = base64.b64encode(b"fake_pdf_content").decode("utf-8")
# Patch functions with mocker
mock_os_path_join = mocker.patch("os.path.join", side_effect=lambda *args: f"{args[-1]}")
mock_random_str = mocker.patch(
mocker.patch("os.path.join", side_effect=lambda *args: f"{args[-1]}")
mocker.patch(
"auto_archiver.modules.screenshot_enricher.screenshot_enricher.random_str",
return_value="fixed123",
)
mock_open = mocker.patch("builtins.open", new_callable=mocker.mock_open)
mock_log_error = mocker.patch("loguru.logger.error")
screenshot_enricher.enrich(metadata_with_video)
# Verify screenshot and PDF creation

View File

@@ -51,4 +51,3 @@ def test_ssl_error_handling(enricher, metadata, mocker):
mocker.patch("ssl.get_server_certificate", side_effect=ssl.SSLError("SSL error"))
with pytest.raises(ssl.SSLError, match="SSL error"):
enricher.enrich(metadata)

View File

@@ -25,7 +25,7 @@ def mock_ffmpeg_environment(mocker):
# Mocking all the ffmpeg calls in one place
mock_ffmpeg_input = mocker.patch("ffmpeg.input")
mock_makedirs = mocker.patch("os.makedirs")
mocker.patch.object(Media, "is_video", return_value=True),
(mocker.patch.object(Media, "is_video", return_value=True),)
mock_probe = mocker.patch(
"ffmpeg.probe",
return_value={
@@ -35,9 +35,7 @@ def mock_ffmpeg_environment(mocker):
},
)
mock_output = mocker.MagicMock()
mock_ffmpeg_input.return_value.filter.return_value.output.return_value = (
mock_output
)
mock_ffmpeg_input.return_value.filter.return_value.output.return_value = mock_output
return {
"mock_ffmpeg_input": mock_ffmpeg_input,
@@ -47,14 +45,21 @@ def mock_ffmpeg_environment(mocker):
}
@pytest.mark.parametrize("thumbnails_per_minute, max_thumbnails, expected_count", [
(10, 5, 5), # Capped at max_thumbnails
(1, 10, 2), # Less than max_thumbnails
(60, 7, 7), # Matches exactly
])
@pytest.mark.parametrize(
"thumbnails_per_minute, max_thumbnails, expected_count",
[
(10, 5, 5), # Capped at max_thumbnails
(1, 10, 2), # Less than max_thumbnails
(60, 7, 7), # Matches exactly
],
)
def test_enrich_thumbnail_limits(
thumbnail_enricher, metadata_with_video, mock_ffmpeg_environment,
thumbnails_per_minute, max_thumbnails, expected_count
thumbnail_enricher,
metadata_with_video,
mock_ffmpeg_environment,
thumbnails_per_minute,
max_thumbnails,
expected_count,
):
thumbnail_enricher.thumbnails_per_minute = thumbnails_per_minute
thumbnail_enricher.max_thumbnails = max_thumbnails
@@ -65,8 +70,8 @@ def test_enrich_thumbnail_limits(
thumbnails = metadata_with_video.media[0].get("thumbnails")
assert len(thumbnails) == expected_count
def test_enrich_handles_probe_failure(thumbnail_enricher, metadata_with_video, mocker):
def test_enrich_handles_probe_failure(thumbnail_enricher, metadata_with_video, mocker):
mocker.patch("ffmpeg.probe", side_effect=Exception("Probe error"))
mocker.patch("os.makedirs")
mock_logger = mocker.patch("loguru.logger.error")
@@ -74,36 +79,43 @@ def test_enrich_handles_probe_failure(thumbnail_enricher, metadata_with_video, m
thumbnail_enricher.enrich(metadata_with_video)
# Ensure error was logged
mock_logger.assert_called_with(
f"error getting duration of video video.mp4: Probe error"
)
mock_logger.assert_called_with("error getting duration of video video.mp4: Probe error")
# Ensure no thumbnails were created
thumbnails = metadata_with_video.media[0].get("thumbnails")
assert thumbnails is None
def test_enrich_skips_non_video_files(thumbnail_enricher, metadata_with_video, mocker):
    """``ffmpeg.input`` is never called when the media reports it is not a video."""
    # Force every media item to report that it is not a video.
    mocker.patch.object(Media, "is_video", return_value=False)
    mock_ffmpeg = mocker.patch("ffmpeg.input")

    # Patch and enrich exactly once.
    thumbnail_enricher.enrich(metadata_with_video)

    mock_ffmpeg.assert_not_called()
@pytest.mark.parametrize("thumbnails_per_minute,max_thumbnails,expected_count", [
(60, 5, 5), # caught by max
(60, 20, 10), # caught by t/min
(0, 20, 1), # test min caught (1)
(11, 20, 1), # test min caught (1)
(12, 20, 2), # test caught by t/min
])
@pytest.mark.parametrize(
"thumbnails_per_minute,max_thumbnails,expected_count",
[
(60, 5, 5), # caught by max
(60, 20, 10), # caught by t/min
(0, 20, 1), # test min caught (1)
(11, 20, 1), # test min caught (1)
(12, 20, 2), # test caught by t/min
],
)
def test_enrich_handles_short_video(
thumbnail_enricher, metadata_with_video, mock_ffmpeg_environment, thumbnails_per_minute, max_thumbnails, expected_count, mocker
thumbnail_enricher,
metadata_with_video,
mock_ffmpeg_environment,
thumbnails_per_minute,
max_thumbnails,
expected_count,
mocker,
):
# override mock duration
fake_duration = 10
mocker.patch(
"ffmpeg.probe",
return_value={ "streams": [{"codec_type": "video", "duration": str(fake_duration)}]},
return_value={"streams": [{"codec_type": "video", "duration": str(fake_duration)}]},
)
thumbnail_enricher.thumbnails_per_minute = thumbnails_per_minute
thumbnail_enricher.max_thumbnails = max_thumbnails
@@ -114,9 +126,7 @@ def test_enrich_handles_short_video(
assert len(thumbnails) == expected_count
def test_uses_existing_duration(
thumbnail_enricher, metadata_with_video, mock_ffmpeg_environment
):
def test_uses_existing_duration(thumbnail_enricher, metadata_with_video, mock_ffmpeg_environment):
metadata_with_video.media[0].set("duration", 60)
thumbnail_enricher.enrich(metadata_with_video)
mock_ffmpeg_environment["mock_probe"].assert_not_called()
@@ -125,7 +135,7 @@ def test_uses_existing_duration(
def test_enrich_metadata_structure(thumbnail_enricher, metadata_with_video, mock_ffmpeg_environment, mocker):
fake_duration = 120
mocker.patch("ffmpeg.probe", return_value={'streams': [{'codec_type': 'video', 'duration': str(fake_duration)}]})
mocker.patch("ffmpeg.probe", return_value={"streams": [{"codec_type": "video", "duration": str(fake_duration)}]})
thumbnail_enricher.thumbnails_per_minute = 2
thumbnail_enricher.max_thumbnails = 4

View File

@@ -4,6 +4,7 @@ from zipfile import ZipFile
import pytest
from auto_archiver.core import Metadata, Media
from auto_archiver.core.consts import SetupError
@pytest.fixture
@@ -22,6 +23,15 @@ def wacz_enricher(setup_module, mock_binary_dependencies):
return wacz
def test_raises_error_without_docker_installed(setup_module, mocker, caplog):
    """Module setup raises ``SetupError`` and logs the missing dependency when ``shutil.which`` finds nothing."""
    # pretend that docker isn't installed: every shutil.which lookup comes back empty
    mocker.patch("shutil.which", return_value=None)

    with pytest.raises(SetupError):
        setup_module("wacz_extractor_enricher", {})

    # Checked after the ``with`` block so the assertion is actually reached.
    assert "requires external dependency 'docker' which is not available/setup" in caplog.text
def test_setup_without_docker(wacz_enricher, mocker):
mocker.patch.dict(os.environ, {"RUNNING_IN_DOCKER": "1"}, clear=True)
wacz_enricher.setup()

View File

@@ -5,37 +5,52 @@ from auto_archiver.modules.wayback_extractor_enricher import WaybackExtractorEnr
from auto_archiver.core import Metadata
@pytest.fixture(autouse=True)
def mock_sleep(mocker):
    """Replace ``time.sleep`` with a mock for every test so no real waiting happens."""
    sleep_mock = mocker.patch("time.sleep")
    return sleep_mock
@pytest.fixture
def mock_is_auth_wall(mocker):
    """Provide a factory that patches ``is_auth_wall`` to return a chosen value."""

    def _mock_is_auth_wall(return_value: bool):
        # Patch the helper at its definition site and return the mock to the caller.
        target = "auto_archiver.utils.url.is_auth_wall"
        return mocker.patch(target, return_value=return_value)

    return _mock_is_auth_wall
@pytest.fixture
def mock_post_success(mocker):
    """Provide a factory that patches ``requests.post`` with a canned response.

    The factory takes the JSON payload the response should return (defaulting
    to ``{"job_id": "job123"}``) and the HTTP status code (default 200), and
    returns the mock installed over ``requests.post``.
    """

    def _mock_post(json_data: dict = None, status_code: int = 200):
        # Fall back to the default payload only when nothing was passed: an
        # explicit empty dict must be returned as-is, not replaced by the default.
        if json_data is None:
            json_data = {"job_id": "job123"}
        resp = mocker.Mock(status_code=status_code)
        resp.json.return_value = json_data
        return mocker.patch("requests.post", return_value=resp)

    return _mock_post
@pytest.fixture
def mock_get_success(mocker):
    """Provide a factory that patches ``requests.get`` with a canned response.

    The factory takes the JSON payload the response should return (defaulting
    to a completed archive status) and the HTTP status code (default 200), and
    returns the mock installed over ``requests.get``.
    """

    def _mock_get(json_data: dict = None, status_code: int = 200):
        # Fall back to the default payload only when nothing was passed, so an
        # explicit empty dict is returned unchanged (same rule as the POST factory).
        if json_data is None:
            json_data = {
                "status": "success",
                "timestamp": "20250101010101",
                "original_url": "https://example.com",
            }
        resp = mocker.Mock(status_code=status_code)
        resp.json.return_value = json_data
        return mocker.patch("requests.get", return_value=resp)

    return _mock_get
@pytest.fixture
def wayback_extractor_enricher(setup_module) -> WaybackExtractorEnricher:
configs: dict = {
@@ -49,12 +64,7 @@ def wayback_extractor_enricher(setup_module) -> WaybackExtractorEnricher:
return setup_module("wayback_extractor_enricher", configs)
def test_download_success(
wayback_extractor_enricher,
mock_is_auth_wall,
mock_post_success,
mock_get_success
):
def test_download_success(wayback_extractor_enricher, mock_is_auth_wall, mock_post_success, mock_get_success):
mock_is_auth_wall(False)
mock_post_success()
mock_get_success()
@@ -63,34 +73,28 @@ def test_download_success(
result = wayback_extractor_enricher.download(metadata)
assert result.get("wayback") == "https://web.archive.org/web/20250101010101/https://example.com"
def test_enrich_auth_wall(wayback_extractor_enricher, metadata, mock_is_auth_wall):
    """``enrich`` returns ``None`` when the URL is reported as being behind an auth wall."""
    mock_is_auth_wall(True)
    assert wayback_extractor_enricher.enrich(metadata) is None
def test_enrich_already_enriched(wayback_extractor_enricher, metadata):
    """``enrich`` returns ``True`` when the item already has a ``wayback`` value."""
    metadata.set("wayback", "existing")
    outcome = wayback_extractor_enricher.enrich(metadata)
    assert outcome is True
def test_enrich_post_failure(
wayback_extractor_enricher,
metadata,
mock_is_auth_wall,
mock_post_success
):
def test_enrich_post_failure(wayback_extractor_enricher, metadata, mock_is_auth_wall, mock_post_success):
    """A 500 response to the submission POST makes ``enrich`` fail and records the status."""
    mock_is_auth_wall(False)
    mock_post_success(json_data={"error": "server error"}, status_code=500)

    assert wayback_extractor_enricher.enrich(metadata) is False
    # The failure message is stored under the "wayback" key.
    recorded = metadata.get("wayback")
    assert "Internet archive failed with status of 500" in recorded
def test_enrich_post_json_decode_error(
wayback_extractor_enricher,
metadata,
mock_is_auth_wall,
mocker
):
def test_enrich_post_json_decode_error(wayback_extractor_enricher, metadata, mock_is_auth_wall, mocker):
mock_is_auth_wall(False)
resp = mocker.Mock(status_code=200)
resp.json.side_effect = json.decoder.JSONDecodeError("msg", "doc", 0)
@@ -98,22 +102,15 @@ def test_enrich_post_json_decode_error(
mocker.patch("requests.post", return_value=resp)
assert wayback_extractor_enricher.enrich(metadata) is False
def test_enrich_no_job_id(
wayback_extractor_enricher,
metadata,
mock_is_auth_wall,
mock_post_success
):
def test_enrich_no_job_id(wayback_extractor_enricher, metadata, mock_is_auth_wall, mock_post_success):
    """``enrich`` returns ``False`` when the POST response is an empty JSON object."""
    mock_is_auth_wall(False)
    # An empty payload, i.e. a response without a "job_id" entry.
    mock_post_success(json_data={})

    outcome = wayback_extractor_enricher.enrich(metadata)
    assert outcome is False
def test_enrich_get_success(
wayback_extractor_enricher,
metadata,
mock_is_auth_wall,
mock_post_success,
mock_get_success
wayback_extractor_enricher, metadata, mock_is_auth_wall, mock_post_success, mock_get_success
):
mock_is_auth_wall(False)
mock_post_success()
@@ -122,24 +119,18 @@ def test_enrich_get_success(
assert metadata.get("wayback") == "https://web.archive.org/web/20250101010101/https://example.com"
assert metadata.get("check wayback") == "https://web.archive.org/web/*/https://example.com"
def test_enrich_get_failure(
    wayback_extractor_enricher, metadata, mock_is_auth_wall, mock_post_success, mock_get_success
):
    """``enrich`` returns ``False`` when the status GET answers 400 with a failed status."""
    mock_is_auth_wall(False)
    # The submission POST succeeds with the default payload...
    mock_post_success()
    # ...but the follow-up status GET reports a failure.
    mock_get_success(json_data={"status": "failed"}, status_code=400)

    assert wayback_extractor_enricher.enrich(metadata) is False
def test_enrich_get_request_exception(
wayback_extractor_enricher,
metadata,
mock_is_auth_wall,
mock_post_success,
mocker
wayback_extractor_enricher, metadata, mock_is_auth_wall, mock_post_success, mocker
):
mock_is_auth_wall(False)
mock_post_success()
@@ -149,12 +140,9 @@ def test_enrich_get_request_exception(
assert wayback_extractor_enricher.enrich(metadata) is True
assert metadata.get("wayback").get("job_id") == "job123"
def test_enrich_get_json_decode_error(
wayback_extractor_enricher,
metadata,
mock_is_auth_wall,
mock_post_success,
mocker
wayback_extractor_enricher, metadata, mock_is_auth_wall, mock_post_success, mocker
):
mock_is_auth_wall(False)
mock_post_success()

View File

@@ -7,6 +7,12 @@ from auto_archiver.modules.whisper_enricher import WhisperEnricher
TEST_S3_URL = "http://cdn.example.com/test.mp4"
@pytest.fixture(autouse=True)
def mock_sleep(mocker):
    """Replace ``time.sleep`` with a mock for every test so no real waiting happens."""
    patched_sleep = mocker.patch("time.sleep")
    return patched_sleep
@pytest.fixture
def enricher(mocker):
"""Fixture with mocked S3 and API dependencies"""
@@ -16,7 +22,7 @@ def enricher(mocker):
"include_srt": False,
"timeout": 5,
"action": "translate",
"steps": {"storages": ["s3_storage"]}
"steps": {"storages": ["s3_storage"]},
}
mock_s3 = mocker.MagicMock(spec=S3Storage)
mock_s3.get_cdn_url.return_value = TEST_S3_URL
@@ -25,7 +31,7 @@ def enricher(mocker):
instance.display_name = "Whisper Enricher"
instance.config_setup({instance.name: config})
# bypassing the setup method and mocking S3 setup
instance.stores = config['steps']['storages']
instance.stores = config["steps"]["storages"]
instance.s3 = mock_s3
yield instance, mock_s3
@@ -63,19 +69,14 @@ def test_successful_job_submission(enricher, metadata, mock_requests, mocker):
# Mock the complete API interaction chain
mock_status_response = mocker.MagicMock()
mock_status_response.status_code = 200
mock_status_response.json.return_value = {
"status": "success",
"meta": {}
}
mock_status_response.json.return_value = {"status": "success", "meta": {}}
mock_artifacts_response = mocker.MagicMock()
mock_artifacts_response.status_code = 200
mock_artifacts_response.json.return_value = [{
"data": [{"start": 0, "end": 5, "text": "test transcript"}]
}]
mock_artifacts_response.json.return_value = [{"data": [{"start": 0, "end": 5, "text": "test transcript"}]}]
# Set up mock response sequence
mock_requests.get.side_effect = [
mock_status_response, # First call: status check
mock_artifacts_response # Second call: artifacts check
mock_artifacts_response, # Second call: artifacts check
]
# Run enrichment (without opening file)
@@ -84,15 +85,17 @@ def test_successful_job_submission(enricher, metadata, mock_requests, mocker):
mock_requests.post.assert_called_once_with(
"http://testapi/jobs",
json={"url": "http://cdn.example.com/test.mp4", "type": "translate"},
headers={"Authorization": "Bearer whisper-key"}
headers={"Authorization": "Bearer whisper-key"},
)
# Verify job status checks
assert mock_requests.get.call_count == 2
assert "artifact_0_text" in metadata.media[0].get("whisper_model")
assert metadata.media[0].get("whisper_model") == {'artifact_0_text': 'test transcript',
'job_artifacts_check': 'http://testapi/jobs/job123/artifacts',
'job_id': 'job123',
'job_status_check': 'http://testapi/jobs/job123'}
assert metadata.media[0].get("whisper_model") == {
"artifact_0_text": "test transcript",
"job_artifacts_check": "http://testapi/jobs/job123/artifacts",
"job_id": "job123",
"job_status_check": "http://testapi/jobs/job123",
}
def test_submit_job(enricher, mocker):

View File

@@ -7,7 +7,6 @@ from auto_archiver.core.extractor import Extractor
class TestExtractorBase(object):
extractor_module: str = None
config: dict = None
@@ -17,7 +16,7 @@ class TestExtractorBase(object):
assert self.config is not None, "self.config must be a dict set on the subclass"
self.extractor: Type[Extractor] = setup_module(self.extractor_module, self.config)
def assertValidResponseMetadata(self, test_response: Metadata, title: str, timestamp: str, status: str = ""):
assert test_response is not False

View File

@@ -9,26 +9,28 @@ import pytest
from auto_archiver.modules.generic_extractor.generic_extractor import GenericExtractor
from .test_extractor_base import TestExtractorBase
CI=os.getenv("GITHUB_ACTIONS", '') == 'true'
CI = os.getenv("GITHUB_ACTIONS", "") == "true"
class TestGenericExtractor(TestExtractorBase):
"""Tests Generic Extractor
"""
extractor_module = 'generic_extractor'
"""Tests Generic Extractor"""
extractor_module = "generic_extractor"
extractor: GenericExtractor
config = {
'subtitles': False,
'comments': False,
'livestreams': False,
'live_from_start': False,
'end_means_success': True,
'allow_playlist': False,
'max_downloads': "inf",
'proxy': None,
'cookies_from_browser': False,
'cookie_file': None,
}
"subtitles": False,
"comments": False,
"livestreams": False,
"live_from_start": False,
"end_means_success": True,
"allow_playlist": False,
"max_downloads": "inf",
"proxy": None,
"cookies_from_browser": False,
"cookie_file": None,
}
def test_load_dropin(self):
# test loading dropins that are in the generic_archiver package
package = "auto_archiver.modules.generic_extractor"
@@ -38,21 +40,42 @@ class TestGenericExtractor(TestExtractorBase):
path = os.path.join(dirname(dirname(__file__)), "data/")
assert self.extractor.dropin_for_name("dropin", additional_paths=[path])
@pytest.mark.parametrize(
    "url, suitable_extractors",
    [
        ("https://www.youtube.com/watch?v=5qap5aO4i9A", ["youtube"]),
        ("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", ["tiktok"]),
        ("https://www.instagram.com/p/CU1J9JYJ9Zz/", ["instagram"]),
        ("https://www.facebook.com/nytimes/videos/10160796550110716", ["facebook"]),
        ("https://www.facebook.com/BylineFest/photos/t.100057299682816/927879487315946/", ["facebook"]),
    ],
)
def test_suitable_extractors(self, url, suitable_extractors):
    """The extractors offered for a URL are the site-specific ones followed by "generic"."""
    # the generic is valid for all, so it is always expected at the end
    expected_keys = [*suitable_extractors, "generic"]
    found = list(self.extractor.suitable_extractors(url))

    assert len(found) == len(expected_keys)
    assert [extractor.ie_key().lower() for extractor in found] == expected_keys
@pytest.mark.parametrize("url, is_suitable", [
("https://www.youtube.com/watch?v=5qap5aO4i9A", True),
("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", True),
("https://www.instagram.com/p/CU1J9JYJ9Zz/", True),
("https://www.facebook.com/nytimes/videos/10160796550110716", True),
("https://www.twitch.tv/videos/1167226570", True),
("https://bellingcat.com/news/2021/10/08/ukrainian-soldiers-are-being-killed-by-landmines-in-the-donbas/", True),
("https://google.com", True)])
def test_suitable_urls(self, make_item, url, is_suitable):
@pytest.mark.parametrize(
"url, is_suitable",
[
("https://www.youtube.com/watch?v=5qap5aO4i9A", True),
("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", True),
("https://www.instagram.com/p/CU1J9JYJ9Zz/", True),
("https://www.facebook.com/nytimes/videos/10160796550110716", True),
("https://www.twitch.tv/videos/1167226570", True),
(
"https://bellingcat.com/news/2021/10/08/ukrainian-soldiers-are-being-killed-by-landmines-in-the-donbas/",
True,
),
("https://google.com", True),
],
)
def test_suitable_urls(self, url, is_suitable):
"""
Note: expected behaviour is to return True for all URLs, as YoutubeDLArchiver should be able to handle all URLs
This behaviour may be changed in the future (e.g. if we want the youtubedl archiver to just handle URLs it has extractors for,
and then if and only if all archivers fails, does it fall back to the generic archiver)
Note: expected behaviour is to return True for all URLs, as YoutubeDLArchiver should be able to handle all URLs
This behaviour may be changed in the future (e.g. if we want the youtubedl archiver to just handle URLs it has extractors for,
and then if and only if all archivers fails, does it fall back to the generic archiver)
"""
assert self.extractor.suitable(url) == is_suitable
@@ -63,11 +86,14 @@ class TestGenericExtractor(TestExtractorBase):
assert result.get_url() == "https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970"
@pytest.mark.download
@pytest.mark.parametrize("url", [
"https://bsky.app/profile/colborne.bsky.social/post/3lcxcpgt6j42l",
"twitter.com/bellingcat/status/123",
"https://www.youtube.com/watch?v=1"
])
@pytest.mark.parametrize(
"url",
[
"https://bsky.app/profile/colborne.bsky.social/post/3lcxcpgt6j42l",
"twitter.com/bellingcat/status/123",
"https://www.youtube.com/watch?v=1",
],
)
def test_download_nonexistent_media(self, make_item, url):
"""
Test to make sure that the extractor doesn't break on non-existent posts/media
@@ -78,7 +104,10 @@ class TestGenericExtractor(TestExtractorBase):
result = self.extractor.download(item)
assert not result
@pytest.mark.skipif(CI, reason="Currently no way to authenticate when on CI. Youtube (yt-dlp) doesn't support logging in with username/password.")
@pytest.mark.skipif(
CI,
reason="Currently no way to authenticate when on CI. Youtube (yt-dlp) doesn't support logging in with username/password.",
)
@pytest.mark.download
def test_youtube_download(self, make_item):
# url https://www.youtube.com/watch?v=5qap5aO4i9A
@@ -87,7 +116,10 @@ class TestGenericExtractor(TestExtractorBase):
result = self.extractor.download(item)
assert result.get_url() == "https://www.youtube.com/watch?v=J---aiyznGQ"
assert result.get_title() == "Keyboard Cat! - THE ORIGINAL!"
assert result.get('description') == "Buy NEW Keyboard Cat Merch! https://keyboardcat.creator-spring.com\n\nxo Keyboard Cat memes make your day better!\nhttp://www.keyboardcatstore.com/\nhttps://www.facebook.com/thekeyboardcat\nhttp://www.charlieschmidt.com/"
assert (
result.get("description")
== "Buy NEW Keyboard Cat Merch! https://keyboardcat.creator-spring.com\n\nxo Keyboard Cat memes make your day better!\nhttp://www.keyboardcatstore.com/\nhttps://www.facebook.com/thekeyboardcat\nhttp://www.charlieschmidt.com/"
)
assert len(result.media) == 2
assert Path(result.media[0].filename).name == "J---aiyznGQ.webm"
assert Path(result.media[1].filename).name == "hqdefault.jpg"
@@ -103,7 +135,7 @@ class TestGenericExtractor(TestExtractorBase):
item = make_item("https://bsky.app/profile/bellingcat.com/post/3lfn3hbcxgc2q")
result = self.extractor.download(item)
assert result is not False
@pytest.mark.download
def test_bluesky_download_no_media(self, make_item):
item = make_item("https://bsky.app/profile/bellingcat.com/post/3lfphwmcs4c2z")
@@ -115,7 +147,7 @@ class TestGenericExtractor(TestExtractorBase):
item = make_item("https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i")
result = self.extractor.download(item)
assert result is not False
@pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
@pytest.mark.download
def test_truthsocial_download_video(self, make_item):
@@ -130,14 +162,14 @@ class TestGenericExtractor(TestExtractorBase):
item = make_item("https://truthsocial.com/@bbcnewa/posts/109598702184774628")
result = self.extractor.download(item)
assert result is not False
@pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
@pytest.mark.download
def test_truthsocial_download_poll(self, make_item):
item = make_item("https://truthsocial.com/@CNN_US/posts/113724326568555098")
result = self.extractor.download(item)
assert result is not False
@pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
@pytest.mark.download
def test_truthsocial_download_single_image(self, make_item):
@@ -159,7 +191,7 @@ class TestGenericExtractor(TestExtractorBase):
url = "https://x.com/Bellingcat/status/17197025860711058"
response = self.extractor.download(make_item(url))
assert not response
@pytest.mark.download
def test_twitter_download_malformed_tweetid(self, make_item):
# this tweet does not exist
@@ -169,17 +201,17 @@ class TestGenericExtractor(TestExtractorBase):
@pytest.mark.download
def test_twitter_download_tweet_no_media(self, make_item):
item = make_item("https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w")
post = self.extractor.download(item)
self.assertValidResponseMetadata(
post,
"Onion rings are just vegetable donuts.",
"Cookie Monster - Onion rings are just vegetable donuts.",
datetime.datetime(2023, 1, 24, 16, 25, 51, tzinfo=datetime.timezone.utc),
"yt-dlp_Twitter: success"
"yt-dlp_Twitter: success",
)
assert post.get("content") == "Onion rings are just vegetable donuts."
@pytest.mark.download
def test_twitter_download_video(self, make_item):
url = "https://x.com/bellingcat/status/1871552600346415571"
@@ -187,26 +219,75 @@ class TestGenericExtractor(TestExtractorBase):
self.assertValidResponseMetadata(
post,
"Bellingcat - This month's Bellingchat Premium is with @KolinaKoltai. She reveals how she investigated a platform allowing users to create AI-generated child sexual abuse material and explains why it's crucial to investigate the people behind these services",
datetime.datetime(2024, 12, 24, 13, 44, 46, tzinfo=datetime.timezone.utc)
datetime.datetime(2024, 12, 24, 13, 44, 46, tzinfo=datetime.timezone.utc),
)
@pytest.mark.xfail(reason="Currently failing, sensitive content requires logged in users/cookies - not yet implemented")
@pytest.mark.xfail(
reason="Currently failing, sensitive content requires logged in users/cookies - not yet implemented"
)
@pytest.mark.download
@pytest.mark.parametrize("url, title, timestamp, image_hash", [
("https://x.com/SozinhoRamalho/status/1876710769913450647", "ignore tweet, testing sensitivity warning nudity", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
("https://x.com/SozinhoRamalho/status/1876710875475681357", "ignore tweet, testing sensitivity warning violence", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
("https://x.com/SozinhoRamalho/status/1876711053813227618", "ignore tweet, testing sensitivity warning sensitive", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
("https://x.com/SozinhoRamalho/status/1876711141314801937", "ignore tweet, testing sensitivity warning nudity, violence, sensitivity", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
])
@pytest.mark.parametrize(
"url, title, timestamp, image_hash",
[
(
"https://x.com/SozinhoRamalho/status/1876710769913450647",
"ignore tweet, testing sensitivity warning nudity",
datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
"image_hash",
),
(
"https://x.com/SozinhoRamalho/status/1876710875475681357",
"ignore tweet, testing sensitivity warning violence",
datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
"image_hash",
),
(
"https://x.com/SozinhoRamalho/status/1876711053813227618",
"ignore tweet, testing sensitivity warning sensitive",
datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
"image_hash",
),
(
"https://x.com/SozinhoRamalho/status/1876711141314801937",
"ignore tweet, testing sensitivity warning nudity, violence, sensitivity",
datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
"image_hash",
),
],
)
def test_twitter_download_sensitive_media(self, url, title, timestamp, image_hash, make_item):
"""Download tweets with sensitive media"""
post = self.extractor.download(make_item(url))
self.assertValidResponseMetadata(
post,
title,
timestamp
)
self.assertValidResponseMetadata(post, title, timestamp)
assert len(post.media) == 1
assert post.media[0].hash == image_hash
assert post.media[0].hash == image_hash
@pytest.mark.download
def test_download_facebook_video(self, make_item):
    """Download a Facebook video post; expect an mp4 plus a jpeg and a matching title."""
    item = make_item("https://www.facebook.com/bellingcat/videos/588371253839133")
    result = self.extractor.download(item)

    assert len(result.media) == 2

    # first media entry: the video file, named after the video id
    video = result.media[0]
    assert video.filename.endswith("588371253839133.mp4")
    assert video.mimetype == "video/mp4"

    # second media entry: a jpeg image
    image = result.media[1]
    assert image.filename.endswith(".jpg")
    assert image.mimetype == "image/jpeg"

    title = result.get_title()
    assert "Bellingchat Premium is with Kolina Koltai" in title
@pytest.mark.download
def test_download_facebook_image(self, make_item):
    """Download a Facebook photo post; expect a single png and the exact post title."""
    url = "https://www.facebook.com/BylineFest/photos/t.100057299682816/927879487315946/"
    result = self.extractor.download(make_item(url))

    assert len(result.media) == 1
    photo = result.media[0]
    assert photo.filename.endswith(".png")
    assert result.get_title() == "Byline Festival - BylineFest Partner"
@pytest.mark.download
def test_download_facebook_text_only(self, make_item):
    """Download a Facebook post and check its text content and title."""
    item = make_item(
        "https://www.facebook.com/bellingcat/posts/pfbid02rzpwZxAZ8bLkAX8NvHv4DWAidFaqAUfJMbo9vWkpwxL7uMUWzWMiizXLWRSjwihVl"
    )
    result = self.extractor.download(item)

    content = result.get("content")
    assert "Bellingcat researcher Kolina Koltai delves deeper into Clothoff" in content
    assert result.get_title() == "Bellingcat"

View File

@@ -15,10 +15,11 @@ def mock_user_response():
"username": "test_user",
"full_name": "Test User",
"profile_pic_url_hd": "http://example.com/profile.jpg",
"profile_pic_url": "http://example.com/profile_lowres.jpg"
"profile_pic_url": "http://example.com/profile_lowres.jpg",
}
}
@pytest.fixture
def mock_post_response():
return {
@@ -27,16 +28,14 @@ def mock_post_response():
"caption_text": "Test Caption",
"taken_at": datetime.now().timestamp(),
"video_url": "http://example.com/video.mp4",
"thumbnail_url": "http://example.com/thumbnail.jpg"
"thumbnail_url": "http://example.com/thumbnail.jpg",
}
@pytest.fixture
def mock_story_response():
return [{
"id": "story_123",
"taken_at": datetime.now().timestamp(),
"video_url": "http://example.com/story.mp4"
}]
return [{"id": "story_123", "taken_at": datetime.now().timestamp(), "video_url": "http://example.com/story.mp4"}]
@pytest.fixture
def mock_highlight_response():
@@ -46,11 +45,13 @@ def mock_highlight_response():
"highlight:123": {
"id": "123",
"title": "Test Highlight",
"items": [{
"id": "item_123",
"taken_at": datetime.now().timestamp(),
"video_url": "http://example.com/highlight.mp4"
}]
"items": [
{
"id": "item_123",
"taken_at": datetime.now().timestamp(),
"video_url": "http://example.com/highlight.mp4",
}
],
}
}
}
@@ -81,24 +82,30 @@ class TestInstagramAPIExtractor(TestExtractorBase):
m.set("netloc", "instagram.com")
return m
@pytest.mark.parametrize("url,expected", [
("https://instagram.com/user", [("", "user", "")]),
("https://instagr.am/p/post_id", []),
("https://youtube.com", []),
("https://www.instagram.com/reel/reel_id", [("reel", "reel_id", "")]),
("https://instagram.com/stories/highlights/123", [("stories/highlights", "123", "")]),
("https://instagram.com/stories/user/123", [("stories", "user", "123")]),
])
@pytest.mark.parametrize(
"url,expected",
[
("https://instagram.com/user", [("", "user", "")]),
("https://instagr.am/p/post_id", []),
("https://youtube.com", []),
("https://www.instagram.com/reel/reel_id", [("reel", "reel_id", "")]),
("https://instagram.com/stories/highlights/123", [("stories/highlights", "123", "")]),
("https://instagram.com/stories/user/123", [("stories", "user", "123")]),
],
)
def test_url_parsing(self, url, expected):
assert self.extractor.valid_url.findall(url) == expected
def test_initialize(self):
    """The extractor's ``api_endpoint`` must not end with a slash."""
    endpoint = self.extractor.api_endpoint
    last_char = endpoint[-1]
    assert last_char != "/"
@pytest.mark.parametrize("input_dict,expected", [
({"x": 0, "valid": "data"}, {"valid": "data"}),
({"nested": {"y": None, "valid": [{}]}}, {"nested": {"valid": [{}]}}),
])
@pytest.mark.parametrize(
"input_dict,expected",
[
({"x": 0, "valid": "data"}, {"valid": "data"}),
({"nested": {"y": None, "valid": [{}]}}, {"nested": {"valid": [{}]}}),
],
)
def test_cleanup_dict(self, input_dict, expected):
assert self.extractor.cleanup_dict(input_dict) == expected
@@ -114,8 +121,8 @@ class TestInstagramAPIExtractor(TestExtractorBase):
def test_download_profile_basic(self, metadata, mock_user_response, mocker):
"""Test basic profile download without full_profile"""
mock_call = mocker.patch.object(self.extractor, 'call_api')
mock_download = mocker.patch.object(self.extractor, 'download_from_url')
mock_call = mocker.patch.object(self.extractor, "call_api")
mock_download = mocker.patch.object(self.extractor, "download_from_url")
# Mock API responses
mock_call.return_value = mock_user_response
mock_download.return_value = "profile.jpg"
@@ -132,17 +139,14 @@ class TestInstagramAPIExtractor(TestExtractorBase):
def test_download_profile_full(self, metadata, mock_user_response, mock_story_response, mocker):
"""Test full profile download with stories/posts"""
mock_call = mocker.patch.object(self.extractor, 'call_api')
mock_posts = mocker.patch.object(self.extractor, 'download_all_posts')
mock_highlights = mocker.patch.object(self.extractor, 'download_all_highlights')
mock_tagged = mocker.patch.object(self.extractor, 'download_all_tagged')
mock_stories = mocker.patch.object(self.extractor, '_download_stories_reusable')
mock_call = mocker.patch.object(self.extractor, "call_api")
mock_posts = mocker.patch.object(self.extractor, "download_all_posts")
mock_highlights = mocker.patch.object(self.extractor, "download_all_highlights")
mock_tagged = mocker.patch.object(self.extractor, "download_all_tagged")
mock_stories = mocker.patch.object(self.extractor, "_download_stories_reusable")
self.extractor.full_profile = True
mock_call.side_effect = [
mock_user_response,
mock_story_response
]
mock_call.side_effect = [mock_user_response, mock_story_response]
mock_highlights.return_value = None
mock_stories.return_value = mock_story_response
mock_posts.return_value = None
@@ -155,7 +159,7 @@ class TestInstagramAPIExtractor(TestExtractorBase):
def test_download_profile_not_found(self, metadata, mocker):
"""Test profile not found error"""
mock_call = mocker.patch.object(self.extractor, 'call_api')
mock_call = mocker.patch.object(self.extractor, "call_api")
mock_call.return_value = {"user": None}
with pytest.raises(AssertionError) as exc_info:
self.extractor.download_profile(metadata, "invalid_user")
@@ -163,18 +167,14 @@ class TestInstagramAPIExtractor(TestExtractorBase):
def test_download_profile_error_handling(self, metadata, mock_user_response, mocker):
"""Test error handling in full profile mode"""
mock_call = mocker.patch.object(self.extractor, 'call_api')
mock_highlights = mocker.patch.object(self.extractor, 'download_all_highlights')
mock_tagged = mocker.patch.object(self.extractor, 'download_all_tagged')
stories_tagged = mocker.patch.object(self.extractor, '_download_stories_reusable')
mock_posts = mocker.patch.object(self.extractor, 'download_all_posts')
mock_call = mocker.patch.object(self.extractor, "call_api")
mock_highlights = mocker.patch.object(self.extractor, "download_all_highlights")
mock_tagged = mocker.patch.object(self.extractor, "download_all_tagged")
stories_tagged = mocker.patch.object(self.extractor, "_download_stories_reusable")
mock_posts = mocker.patch.object(self.extractor, "download_all_posts")
self.extractor.full_profile = True
mock_call.side_effect = [
mock_user_response,
Exception("Stories API failed"),
Exception("Posts API failed")
]
mock_call.side_effect = [mock_user_response, Exception("Stories API failed"), Exception("Posts API failed")]
mock_highlights.return_value = None
mock_tagged.return_value = None
stories_tagged.return_value = None
@@ -182,4 +182,4 @@ class TestInstagramAPIExtractor(TestExtractorBase):
result = self.extractor.download_profile(metadata, "test_user")
assert result.is_success()
assert "Error downloading stories for test_user" in result.metadata["errors"]
assert "Error downloading stories for test_user" in result.metadata["errors"]

View File

@@ -1,21 +1,41 @@
import pytest
from auto_archiver.modules.instagram_extractor import InstagramExtractor
from .test_extractor_base import TestExtractorBase
class TestInstagramExtractor(TestExtractorBase):
extractor_module: str = 'instagram_extractor'
config: dict = {}
@pytest.fixture
def instagram_extractor(setup_module, mocker):
extractor_module: str = "instagram_extractor"
config: dict = {
"username": "user_name",
"password": "password123",
"download_folder": "instaloader",
"session_file": "secrets/instaloader.session",
}
fake_loader = mocker.MagicMock()
fake_loader.load_session_from_file.return_value = None
fake_loader.login.return_value = None
fake_loader.save_session_to_file.return_value = None
mocker.patch(
"instaloader.Instaloader",
return_value=fake_loader,
)
return setup_module(extractor_module, config)
@pytest.mark.parametrize("url", [
@pytest.mark.parametrize(
"url",
[
"https://www.instagram.com/p/",
"https://www.instagram.com/p/1234567890/",
"https://www.instagram.com/reel/1234567890/",
"https://www.instagram.com/username/",
"https://www.instagram.com/username/stories/",
"https://www.instagram.com/username/highlights/",
])
def test_regex_matches(self, url):
# post
assert InstagramExtractor.valid_url.match(url)
],
)
def test_regex_matches(url: str, instagram_extractor: InstagramExtractor) -> None:
"""
Ensure that the valid_url regex matches all provided Instagram URLs.
"""
assert instagram_extractor.valid_url.match(url)

View File

@@ -7,10 +7,16 @@ from auto_archiver.modules.instagram_tbot_extractor import InstagramTbotExtracto
from tests.extractors.test_extractor_base import TestExtractorBase
@pytest.fixture(autouse=True)
def mock_sleep(mocker):
    """Patch ``time.sleep`` (autouse) so tests do not actually wait; returns the mock."""
    patched_sleep = mocker.patch("time.sleep")
    return patched_sleep
@pytest.fixture
def patch_extractor_methods(request, setup_module, mocker):
mocker.patch.object(InstagramTbotExtractor, '_prepare_session_file', return_value=None)
mocker.patch.object(InstagramTbotExtractor, '_initialize_telegram_client', return_value=None)
mocker.patch.object(InstagramTbotExtractor, "_prepare_session_file", return_value=None)
mocker.patch.object(InstagramTbotExtractor, "_initialize_telegram_client", return_value=None)
yield
@@ -35,12 +41,7 @@ def mock_telegram_client(mocker):
@pytest.fixture
def extractor(setup_module, patch_extractor_methods, mocker):
extractor_module = "instagram_tbot_extractor"
config = {
"api_id": 12345,
"api_hash": "test_api_hash",
"session_file": "test_session",
"timeout": 4
}
config = {"api_id": 12345, "api_hash": "test_api_hash", "session_file": "test_session", "timeout": 4}
extractor = setup_module(extractor_module, config)
extractor.client = mocker.MagicMock()
extractor.session_file = "test_session"
@@ -79,21 +80,30 @@ class TestInstagramTbotExtractorReal(TestExtractorBase):
"session_file": "secrets/anon-insta",
}
@pytest.mark.parametrize("url, expected_status, message, len_media", [
("https://www.instagram.com/p/C4QgLbrIKXG", "insta-via-bot: success",
"Are you new to Bellingcat? - The way we share our investigations is different. 💭\nWe want you to read our story but also learn ou",
6),
("https://www.instagram.com/reel/DEVLK8qoIbg/", "insta-via-bot: success",
"Our volunteer community is at the centre of many incredible Bellingcat investigations and tools. Stephanie Ladel is one such vol",
3),
# instagram tbot not working (potentially intermittently?) for stories - replace with a live story to retest
# ("https://www.instagram.com/stories/bellingcatofficial/3556336382743057476/", False, "Media not found or unavailable"),
# Seems to be working intermittently for highlights
# ("https://www.instagram.com/stories/highlights/17868810693068139/", "insta-via-bot: success", None, 50),
# Marking invalid url as success
("https://www.instagram.com/p/INVALID", "insta-via-bot: success", "Media not found or unavailable", 0),
("https://www.youtube.com/watch?v=ymCMy8OffHM", False, None, 0),
])
@pytest.mark.parametrize(
"url, expected_status, message, len_media",
[
(
"https://www.instagram.com/p/C4QgLbrIKXG",
"insta-via-bot: success",
"Are you new to Bellingcat? - The way we share our investigations is different. 💭\nWe want you to read our story but also learn ou",
6,
),
(
"https://www.instagram.com/reel/DEVLK8qoIbg/",
"insta-via-bot: success",
"Our volunteer community is at the centre of many incredible Bellingcat investigations and tools. Stephanie Ladel is one such vol",
3,
),
# instagram tbot not working (potentially intermittently?) for stories - replace with a live story to retest
# ("https://www.instagram.com/stories/bellingcatofficial/3556336382743057476/", False, "Media not found or unavailable"),
# Seems to be working intermittently for highlights
# ("https://www.instagram.com/stories/highlights/17868810693068139/", "insta-via-bot: success", None, 50),
# Marking invalid url as success
("https://www.instagram.com/p/INVALID", "insta-via-bot: success", "Media not found or unavailable", 0),
("https://www.youtube.com/watch?v=ymCMy8OffHM", False, None, 0),
],
)
def test_download(self, url, expected_status, message, len_media, metadata_sample):
"""Test the `download()` method with various Instagram URLs."""
metadata_sample.set_url(url)

View File

@@ -0,0 +1,177 @@
from datetime import datetime, timezone
import time
import pytest
import yt_dlp
from auto_archiver.modules.generic_extractor.generic_extractor import GenericExtractor
from auto_archiver.modules.generic_extractor.tiktok import Tiktok, TikTokIE
from .test_extractor_base import TestExtractorBase
@pytest.fixture(autouse=True)
def skip_ytdlp_own_methods(mocker):
    """Autouse fixture: skip the yt-dlp download and restrict suitable extractors to TikTok."""
    # make the dropin report that the ytdlp download should be skipped in these tests
    mocker.patch(
        "auto_archiver.modules.generic_extractor.tiktok.Tiktok.skip_ytdlp_download",
        return_value=True,
    )

    # only offer yt-dlp's "TikTok" info extractor as a suitable extractor
    known_extractors = yt_dlp.YoutubeDL()._ies.values()
    tiktok_only = [ie for ie in known_extractors if ie.IE_NAME == "TikTok"]
    mocker.patch(
        "auto_archiver.modules.generic_extractor.generic_extractor.GenericExtractor.suitable_extractors",
        return_value=tiktok_only,
    )
@pytest.fixture
def mock_get(mocker):
    """Patch ``requests.get`` as referenced by the tiktok module and return the mock."""
    target = "auto_archiver.modules.generic_extractor.tiktok.requests.get"
    return mocker.patch(target)
@pytest.fixture
def tiktok_dropin() -> Tiktok:
    """Provide a freshly constructed ``Tiktok`` instance."""
    dropin = Tiktok()
    return dropin
class TestTiktokTikwmExtractor(TestExtractorBase):
    """
    Tests for TikTok handling in the generic extractor, driven by tikwm.com responses
    (mocked via ``mock_get``, except for the tests marked ``download``).
    """

    extractor_module = "generic_extractor"
    extractor: GenericExtractor

    config = {}

    VALID_EXAMPLE_URL = "https://www.tiktok.com/@example/video/1234"

    @pytest.mark.parametrize(
        "url, is_suitable",
        [
            ("https://bellingcat.com", False),
            ("https://youtube.com", False),
            ("https://tiktok.co/", False),
            ("https://tiktok.com/", False),
            ("https://www.tiktok.com/", False),
            ("https://api.cool.tiktok.com/", False),
            (VALID_EXAMPLE_URL, True),
            ("https://www.tiktok.com/@bbcnews/video/7478038212070411542", True),
            ("https://www.tiktok.com/@ggs68taiwan.official/video/7441821351142362375", True),
            ("https://www.tiktok.com/t/ZP8YQ8e5j/", True),
            ("https://vt.tiktok.com/ZSMTJeqRP/", True),
        ],
    )
    def test_is_suitable(self, url, is_suitable, tiktok_dropin):
        """``Tiktok.suitable`` must agree with the expected flag for each URL."""
        verdict = tiktok_dropin.suitable(url, TikTokIE())
        assert verdict == is_suitable

    def test_invalid_json_responses(self, mock_get, make_item, caplog):
        """``download`` returns False and logs a parse failure when ``.json()`` raises."""
        parse_failure = (
            "failed to parse JSON response from tikwm.com for url='https://www.tiktok.com/@example/video/1234'"
        )
        response = mock_get.return_value
        response.status_code = 200

        # 1) the JSON decoder raises ValueError
        response.json.side_effect = ValueError
        with caplog.at_level("DEBUG"):
            outcome = self.extractor.download(make_item(self.VALID_EXAMPLE_URL))
        assert outcome is False
        mock_get.assert_called_once()
        response.json.assert_called_once()
        # first message is just the 'Skipping using ytdlp to download files for TikTok' message
        assert parse_failure in caplog.text

        # 2) the JSON decoder raises a generic Exception
        response.json.side_effect = Exception
        with caplog.at_level("ERROR"):
            outcome = self.extractor.download(make_item(self.VALID_EXAMPLE_URL))
        assert outcome is False
        mock_get.assert_called()
        assert mock_get.call_count == 2
        assert response.json.call_count == 2
        # NOTE(review): caplog.text still holds the record from the first download, so this
        # check may already be satisfied before the second call - confirm that is intended.
        assert parse_failure in caplog.text

    @pytest.mark.parametrize(
        "response",
        [
            {"msg": "failure"},
            {"msg": "success"},
        ],
    )
    def test_unsuccessful_responses(self, mock_get, make_item, response, caplog):
        """Responses carrying no ``data`` entry make ``download`` return False and log it."""
        api_reply = mock_get.return_value
        api_reply.status_code = 200
        api_reply.json.return_value = response

        with caplog.at_level("DEBUG"):
            outcome = self.extractor.download(make_item(self.VALID_EXAMPLE_URL))

        assert outcome is False
        mock_get.assert_called_once()
        api_reply.json.assert_called_once()
        assert "failed to get a valid response from tikwm.com" in caplog.text

    @pytest.mark.parametrize(
        "response,has_vid",
        [
            ({"data": {"id": 123}}, False),
            ({"data": {"wmplay": "url"}}, True),
            ({"data": {"play": "url"}}, True),
        ],
    )
    def test_correct_extraction(self, mock_get, make_item, response, has_vid, mocker):
        """A video is only extracted when the data carries a ``wmplay`` or ``play`` entry."""
        payload = {"msg": "success"}
        payload.update(response)
        api_reply = mock_get.return_value
        api_reply.status_code = 200
        api_reply.json.return_value = payload

        result = self.extractor.download(make_item(self.VALID_EXAMPLE_URL))

        if has_vid:
            assert result.is_success()
            assert len(result.media) == 1
        else:
            assert result is False

        # one request for the API call, plus one more when there is a video
        mock_get.assert_called()
        expected_requests = 1 + int(has_vid)
        assert mock_get.call_count == expected_requests
        api_reply.json.assert_called_once()

    def test_correct_data_extracted(self, mock_get, make_item):
        """Fields of a successful API response end up on the resulting metadata."""
        api_reply = mock_get.return_value
        api_reply.status_code = 200
        api_reply.json.return_value = {
            "msg": "success",
            "data": {
                "wmplay": "url",
                "origin_cover": "cover.jpg",
                "title": "Title",
                "id": 123,
                "duration": 60,
                "create_time": 1736301699,
                "author": "Author",
                "other": "data",
            },
        }

        result = self.extractor.download(make_item(self.VALID_EXAMPLE_URL))

        assert result.is_success()
        assert len(result.media) == 2
        assert result.get_title() == "Title"
        assert result.get("author") == "Author"
        assert result.get("api_data") == {"other": "data", "id": 123}
        assert result.media[1].get("duration") == 60
        expected_timestamp = datetime.fromtimestamp(1736301699, tz=timezone.utc)
        assert result.get("timestamp") == expected_timestamp

    @pytest.mark.download
    def test_download_video(self, make_item):
        """Live download of a BBC News TikTok video (marked ``download``)."""
        item = make_item("https://www.tiktok.com/@bbcnews/video/7478038212070411542")
        result = self.extractor.download(item)

        assert result.is_success()
        assert len(result.media) == 2
        expected_title = "The A23a iceberg is one of the world's oldest and it's so big you can see it from space. #Iceberg #A23a #Antarctica #Ice #ClimateChange #DavidAttenborough #Ocean #Sea #SouthGeorgia #BBCNews "
        assert result.get_title() == expected_title
        author = result.get("author")
        assert author.get("unique_id") == "bbcnews"
        api_data = result.get("api_data")
        assert api_data.get("id") == "7478038212070411542"
        assert result.media[1].get("duration") == 59
        expected_timestamp = datetime.fromtimestamp(1741122000, tz=timezone.utc)
        assert result.get("timestamp") == expected_timestamp

    @pytest.mark.download
    def test_download_sensitive_video(self, make_item):
        """Live download of a second TikTok video (marked ``download``)."""
        item_url = "https://www.tiktok.com/@ggs68taiwan.official/video/7441821351142362375"

        # Required for rate limiting
        time.sleep(1.1)

        result = self.extractor.download(make_item(item_url))

        assert result.is_success()
        assert len(result.media) == 2
        assert result.get_title() == "Căng nhất lúc này #ggs68 #ggs68taiwan #taiwan #dailoan #tiktoknews"
        author = result.get("author")
        assert author.get("id") == "7197400619475649562"
        api_data = result.get("api_data")
        assert api_data.get("id") == "7441821351142362375"
        assert result.media[1].get("duration") == 34
        expected_timestamp = datetime.fromtimestamp(1732684060, tz=timezone.utc)
        assert result.get("timestamp") == expected_timestamp

View File

@@ -1,6 +1,5 @@
import os
import datetime
import hashlib
import pytest
from pytwitter.models.media import MediaVariant
@@ -10,8 +9,7 @@ from auto_archiver.modules.twitter_api_extractor import TwitterApiExtractor
@pytest.mark.incremental
class TestTwitterApiExtractor(TestExtractorBase):
extractor_module = 'twitter_api_extractor'
extractor_module: TwitterApiExtractor = "twitter_api_extractor"
config = {
"bearer_tokens": [],
@@ -22,41 +20,79 @@ class TestTwitterApiExtractor(TestExtractorBase):
"access_secret": os.environ.get("TWITTER_ACCESS_SECRET"),
}
@pytest.mark.parametrize("url, expected", [
("https://x.com/bellingcat/status/1874097816571961839", "https://x.com/bellingcat/status/1874097816571961839"), # x.com urls unchanged
("https://twitter.com/bellingcat/status/1874097816571961839", "https://twitter.com/bellingcat/status/1874097816571961839"), # twitter urls unchanged
("https://twitter.com/bellingcat/status/1874097816571961839?s=20&t=3d0g4ZQis7dCbSDg-mE7-w", "https://twitter.com/bellingcat/status/1874097816571961839?s=20&t=3d0g4ZQis7dCbSDg-mE7-w"), # don't strip params from twitter urls (changed Jan 2025)
("https://www.bellingcat.com/category/resources/", "https://www.bellingcat.com/category/resources/"), # non-twitter/x urls unchanged
("https://www.bellingcat.com/category/resources/?s=20&t=3d0g4ZQis7dCbSDg-mE7-w", "https://www.bellingcat.com/category/resources/?s=20&t=3d0g4ZQis7dCbSDg-mE7-w"), # shouldn't strip params from non-twitter/x URLs
])
@pytest.mark.parametrize(
"url, expected",
[
(
"https://x.com/bellingcat/status/1874097816571961839",
"https://x.com/bellingcat/status/1874097816571961839",
), # x.com urls unchanged
(
"https://twitter.com/bellingcat/status/1874097816571961839",
"https://twitter.com/bellingcat/status/1874097816571961839",
), # twitter urls unchanged
(
"https://twitter.com/bellingcat/status/1874097816571961839?s=20&t=3d0g4ZQis7dCbSDg-mE7-w",
"https://twitter.com/bellingcat/status/1874097816571961839?s=20&t=3d0g4ZQis7dCbSDg-mE7-w",
), # don't strip params from twitter urls (changed Jan 2025)
(
"https://www.bellingcat.com/category/resources/",
"https://www.bellingcat.com/category/resources/",
), # non-twitter/x urls unchanged
(
"https://www.bellingcat.com/category/resources/?s=20&t=3d0g4ZQis7dCbSDg-mE7-w",
"https://www.bellingcat.com/category/resources/?s=20&t=3d0g4ZQis7dCbSDg-mE7-w",
), # shouldn't strip params from non-twitter/x URLs
],
)
def test_sanitize_url(self, url, expected):
assert expected == self.extractor.sanitize_url(url)
@pytest.mark.download
def test_sanitize_url_download(self):
assert "https://www.bellingcat.com/category/resources/" == self.extractor.sanitize_url("https://t.co/yl3oOJatFp")
assert "https://www.bellingcat.com/category/resources/" == self.extractor.sanitize_url(
"https://t.co/yl3oOJatFp"
)
@pytest.mark.parametrize("url, exptected_username, exptected_tweetid", [
("https://twitter.com/bellingcat/status/1874097816571961839", "bellingcat", "1874097816571961839"),
("https://x.com/bellingcat/status/1874097816571961839", "bellingcat", "1874097816571961839"),
("https://www.bellingcat.com/category/resources/", False, False)
])
@pytest.mark.parametrize(
"url, exptected_username, exptected_tweetid",
[
("https://twitter.com/bellingcat/status/1874097816571961839", "bellingcat", "1874097816571961839"),
("https://x.com/bellingcat/status/1874097816571961839", "bellingcat", "1874097816571961839"),
("https://www.bellingcat.com/category/resources/", False, False),
],
)
def test_get_username_tweet_id_from_url(self, url, exptected_username, exptected_tweetid):
username, tweet_id = self.extractor.get_username_tweet_id(url)
assert exptected_username == username
assert exptected_tweetid == tweet_id
def test_choose_variants(self):
# taken from the response for url https://x.com/bellingcat/status/1871552600346415571
variant_list = [MediaVariant(content_type='application/x-mpegURL', url='https://video.twimg.com/ext_tw_video/1871551993677852672/pu/pl/ovWo7ux-bKROwYIC.m3u8?tag=12&v=e1b'),
MediaVariant(bit_rate=256000, content_type='video/mp4', url='https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/480x270/OqZIrKV0LFswMvxS.mp4?tag=12'),
MediaVariant(bit_rate=832000, content_type='video/mp4', url='https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/640x360/uiDZDSmZ8MZn9hsi.mp4?tag=12'),
MediaVariant(bit_rate=2176000, content_type='video/mp4', url='https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/1280x720/6Y340Esh568WZnRZ.mp4?tag=12')
]
variant_list = [
MediaVariant(
content_type="application/x-mpegURL",
url="https://video.twimg.com/ext_tw_video/1871551993677852672/pu/pl/ovWo7ux-bKROwYIC.m3u8?tag=12&v=e1b",
),
MediaVariant(
bit_rate=256000,
content_type="video/mp4",
url="https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/480x270/OqZIrKV0LFswMvxS.mp4?tag=12",
),
MediaVariant(
bit_rate=832000,
content_type="video/mp4",
url="https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/640x360/uiDZDSmZ8MZn9hsi.mp4?tag=12",
),
MediaVariant(
bit_rate=2176000,
content_type="video/mp4",
url="https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/1280x720/6Y340Esh568WZnRZ.mp4?tag=12",
),
]
chosen_variant = self.extractor.choose_variant(variant_list)
assert chosen_variant == variant_list[3]
@pytest.mark.skipif(not os.environ.get("TWITTER_BEARER_TOKEN"), reason="No Twitter bearer token provided")
@pytest.mark.download
def test_download_nonexistent_tweet(self, make_item):
@@ -76,7 +112,6 @@ class TestTwitterApiExtractor(TestExtractorBase):
@pytest.mark.skipif(not os.environ.get("TWITTER_BEARER_TOKEN"), reason="No Twitter bearer token provided")
@pytest.mark.download
def test_download_tweet_no_media(self, make_item):
item = make_item("https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w")
post = self.extractor.download(item)
@@ -84,7 +119,7 @@ class TestTwitterApiExtractor(TestExtractorBase):
post,
"Onion rings are just vegetable donuts.",
datetime.datetime(2023, 1, 24, 16, 25, 51, tzinfo=datetime.timezone.utc),
"twitter-api: success"
"twitter-api: success",
)
@pytest.mark.skipif(not os.environ.get("TWITTER_BEARER_TOKEN"), reason="No Twitter bearer token provided")
@@ -95,27 +130,41 @@ class TestTwitterApiExtractor(TestExtractorBase):
self.assertValidResponseMetadata(
post,
"This month's Bellingchat Premium is with @KolinaKoltai. She reveals how she investigated a platform allowing users to create AI-generated child sexual abuse material and explains why it's crucial to investigate the people behind these services https://t.co/SfBUq0hSD0 https://t.co/rIHx0WlKp8",
datetime.datetime(2024, 12, 24, 13, 44, 46, tzinfo=datetime.timezone.utc)
datetime.datetime(2024, 12, 24, 13, 44, 46, tzinfo=datetime.timezone.utc),
)
@pytest.mark.skipif(not os.environ.get("TWITTER_BEARER_TOKEN"), reason="No Twitter bearer token provided")
@pytest.mark.parametrize("url, title, timestamp", [
("https://x.com/SozinhoRamalho/status/1876710769913450647", "ignore tweet, testing sensitivity warning nudity https://t.co/t3u0hQsSB1", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc)),
("https://x.com/SozinhoRamalho/status/1876710875475681357", "ignore tweet, testing sensitivity warning violence https://t.co/syYDSkpjZD", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc)),
("https://x.com/SozinhoRamalho/status/1876711053813227618", "ignore tweet, testing sensitivity warning sensitive https://t.co/XE7cRdjzYq", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc)),
("https://x.com/SozinhoRamalho/status/1876711141314801937", "ignore tweet, testing sensitivity warning nudity, violence, sensitivity https://t.co/YxCFbbhYE3", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc)),
])
@pytest.mark.parametrize(
"url, title, timestamp",
[
(
"https://x.com/SozinhoRamalho/status/1876710769913450647",
"ignore tweet, testing sensitivity warning nudity https://t.co/t3u0hQsSB1",
datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
),
(
"https://x.com/SozinhoRamalho/status/1876710875475681357",
"ignore tweet, testing sensitivity warning violence https://t.co/syYDSkpjZD",
datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
),
(
"https://x.com/SozinhoRamalho/status/1876711053813227618",
"ignore tweet, testing sensitivity warning sensitive https://t.co/XE7cRdjzYq",
datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
),
(
"https://x.com/SozinhoRamalho/status/1876711141314801937",
"ignore tweet, testing sensitivity warning nudity, violence, sensitivity https://t.co/YxCFbbhYE3",
datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
),
],
)
@pytest.mark.download
def test_download_sensitive_media(self, url, title, timestamp, check_hash, make_item):
"""Download tweets with sensitive media"""
post = self.extractor.download(make_item(url))
self.assertValidResponseMetadata(
post,
title,
timestamp
)
self.assertValidResponseMetadata(post, title, timestamp)
assert len(post.media) == 1
# check the hash of the media (quick; the expected digest is 64 hex digits, so presumably SHA-256 rather than SHA1), to make sure it's valid
check_hash(post.media[0].filename, "3eea9c03b2dcedd1eb9a169d8bfd1cf877996fab4961de019a96eb9d32d2d733")
check_hash(post.media[0].filename, "3eea9c03b2dcedd1eb9a169d8bfd1cf877996fab4961de019a96eb9d32d2d733")

View File

@@ -0,0 +1,77 @@
import pytest
from auto_archiver.core import Metadata
from auto_archiver.modules.vk_extractor import VkExtractor
@pytest.fixture
def mock_vk_scraper(mocker):
    """Patch ``VkScraper`` as referenced by the vk_extractor module and return the mock."""
    scraper_path = "auto_archiver.modules.vk_extractor.vk_extractor.VkScraper"
    return mocker.patch(scraper_path)
@pytest.fixture
def vk_extractor(setup_module, mock_vk_scraper) -> VkExtractor:
    """Build the ``vk_extractor`` module with test credentials and a mocked scraper as ``vks``."""
    extractor = setup_module(
        "vk_extractor",
        {
            "username": "name",
            "password": "password123",
            "session_file": "secrets/vk_config.v2.json",
        },
    )
    # replace the scraper instance with the one produced by the patched VkScraper class
    extractor.vks = mock_vk_scraper.return_value
    return extractor
def test_netloc(vk_extractor, metadata):
    """``download`` returns False for the default ``metadata`` fixture URL."""
    # metadata url set as: "https://example.com/"
    outcome = vk_extractor.download(metadata)
    assert outcome is False
def test_vk_url_but_scrape_returns_empty(vk_extractor, metadata):
    """A vk.com URL whose scrape yields nothing makes ``download`` return False."""
    vk_url = "https://vk.com/valid-wall"
    metadata.set_url(vk_url)
    vk_extractor.vks.scrape.return_value = []

    outcome = vk_extractor.download(metadata)

    assert outcome is False
    assert metadata.netloc == "vk.com"
    # the scraper was asked exactly once, with the item's URL
    vk_extractor.vks.scrape.assert_called_once_with(metadata.get_url())
def test_successful_scrape_and_download(vk_extractor, metadata, mocker):
    """A VK URL with scraped posts and downloaded files produces a successful, populated result."""
    scraped_posts = [
        {"text": "Post Title", "datetime": "2023-01-01T00:00:00", "id": 1},
        {"text": "Another Post", "datetime": "2023-01-02T00:00:00", "id": 2},
    ]
    downloaded_files = ["image1.jpg", "image2.png"]
    vk_extractor.vks.scrape.return_value = scraped_posts
    vk_extractor.vks.download_media.return_value = downloaded_files
    metadata.set_url("https://vk.com/valid-wall")

    result = vk_extractor.download(metadata)

    # Metadata: status, plus title/timestamp taken from the first post.
    assert result.is_success()
    assert result.status == "vk: success"
    assert result.get_title() == "Post Title"
    assert result.get_timestamp() == "2023-01-01T00:00:00+00:00"
    assert "Another Post" in result.metadata["content"]

    # Media: one entry per downloaded file, in order.
    assert len(result.media) == 2
    first_media, second_media = result.media[0], result.media[1]
    assert first_media.filename == "image1.jpg"
    assert second_media.filename == "image2.png"
    vk_extractor.vks.download_media.assert_called_once_with(scraped_posts, vk_extractor.tmp_dir)
def test_adds_first_title_and_timestamp(vk_extractor):
    """The first scraped post supplies the result's title and timestamp."""
    # set_url() returns the Metadata instance, so a single chained call is
    # enough; the URL does not need to be set a second time.
    metadata = Metadata().set_url("https://vk.com/no-metadata")
    mock_scrapes = [
        {"text": "value", "datetime": "2023-01-01T00:00:00"},
        {"text": "value2", "datetime": "2023-01-02T00:00:00"},
    ]
    vk_extractor.vks.scrape.return_value = mock_scrapes
    vk_extractor.vks.download_media.return_value = []

    result = vk_extractor.download(metadata)

    assert result.get_title() == "value"
    # formatted timestamp
    assert result.get_timestamp() == "2023-01-01T00:00:00+00:00"
    assert result.is_success()

View File

@@ -1,5 +1,5 @@
import pytest
from auto_archiver.modules.atlos_feeder import AtlosFeeder
from auto_archiver.modules.atlos_feeder_db_storage import AtlosFeederDbStorage as AtlosFeeder
class FakeAPIResponse:
@@ -18,44 +18,63 @@ class FakeAPIResponse:
@pytest.fixture
def atlos_feeder(setup_module) -> AtlosFeeder:
def atlos_feeder(setup_module, mocker) -> AtlosFeeder:
"""Fixture for AtlosFeeder."""
configs: dict = {
"api_token": "abc123",
"atlos_url": "https://platform.atlos.org",
}
return setup_module("atlos_feeder", configs)
mocker.patch("requests.Session")
atlos_feeder = setup_module("atlos_feeder_db_storage", configs)
fake_session = mocker.MagicMock()
# Configure the default response to have no results so that __iter__ terminates
fake_session.get.return_value = FakeAPIResponse({"next": None, "results": []})
atlos_feeder.session = fake_session
return atlos_feeder
@pytest.fixture
def mock_atlos_api(mocker):
"""Fixture to mock requests to Atlos API."""
def mock_atlos_api(atlos_feeder):
"""Fixture to update the atlos_feeder.session.get side_effect."""
def _mock_responses(responses):
mocker.patch(
"requests.get",
side_effect=[FakeAPIResponse(data) for data in responses],
)
atlos_feeder.session.get.side_effect = [FakeAPIResponse(data) for data in responses]
return _mock_responses
def test_atlos_feeder_iter_yields_valid_metadata(atlos_feeder, mock_atlos_api):
"""Test valid items are yielded and invalid ones ignored."""
mock_atlos_api([
{
"next": None,
"results": [
{"source_url": "http://example.com", "id": 1,
"metadata": {"auto_archiver": {"processed": False}},
"visibility": "visible", "status": "complete"},
{"source_url": "", "id": 2,
"metadata": {"auto_archiver": {"processed": False}},
"visibility": "visible", "status": "complete"},
{"source_url": "http://example.org", "id": 3,
"metadata": {"auto_archiver": {"processed": True}},
"visibility": "visible", "status": "complete"},
],
}
])
mock_atlos_api(
[
{
"next": None,
"results": [
{
"source_url": "http://example.com",
"id": 1,
"metadata": {"auto_archiver": {"processed": False}},
"visibility": "visible",
"status": "complete",
},
{
"source_url": "",
"id": 2,
"metadata": {"auto_archiver": {"processed": False}},
"visibility": "visible",
"status": "complete",
},
{
"source_url": "http://example.org",
"id": 3,
"metadata": {"auto_archiver": {"processed": True}},
"visibility": "visible",
"status": "complete",
},
],
}
]
)
items = list(atlos_feeder)
assert len(items) == 1
@@ -65,24 +84,34 @@ def test_atlos_feeder_iter_yields_valid_metadata(atlos_feeder, mock_atlos_api):
def test_atlos_feeder_multiple_pages(atlos_feeder, mock_atlos_api):
"""Test iteration over multiple pages with valid items."""
mock_atlos_api([
{
"next": "cursor2",
"results": [
{"source_url": "http://example1.com", "id": 10,
"metadata": {"auto_archiver": {"processed": False}},
"visibility": "visible", "status": "complete"},
],
},
{
"next": None,
"results": [
{"source_url": "http://example2.com", "id": 20,
"metadata": {"auto_archiver": {"processed": False}},
"visibility": "visible", "status": "complete"},
],
},
])
mock_atlos_api(
[
{
"next": "cursor2",
"results": [
{
"source_url": "http://example1.com",
"id": 10,
"metadata": {"auto_archiver": {"processed": False}},
"visibility": "visible",
"status": "complete",
},
],
},
{
"next": None,
"results": [
{
"source_url": "http://example2.com",
"id": 20,
"metadata": {"auto_archiver": {"processed": False}},
"visibility": "visible",
"status": "complete",
},
],
},
]
)
items = list(atlos_feeder)
assert len(items) == 2
@@ -100,9 +129,7 @@ def test_atlos_feeder_no_results(atlos_feeder, mock_atlos_api):
def test_atlos_feeder_http_error(atlos_feeder, mocker):
"""Test raises an exception on HTTP error."""
mocker.patch(
"requests.get",
return_value=FakeAPIResponse({"next": None, "results": []}, raise_error=True),
)
fake_response = FakeAPIResponse({"next": None, "results": []}, raise_error=True)
atlos_feeder.session.get.side_effect = [fake_response]
with pytest.raises(Exception, match="HTTP error"):
list(atlos_feeder)

View File

@@ -1,13 +1,16 @@
import pytest
@pytest.fixture
def headerless_csv_file():
    """Path to the sample CSV that has no header row."""
    csv_path = "tests/data/csv_no_headers.csv"
    return csv_path
@pytest.fixture
def header_csv_file():
    """Path to the sample CSV that has a header row."""
    csv_path = "tests/data/csv_with_headers.csv"
    return csv_path
@pytest.fixture
def header_csv_file_non_default_column():
    """Path to the sample CSV with headers and a non-default column."""
    csv_path = "tests/data/csv_with_headers_non_default_column.csv"
    return csv_path
@@ -23,6 +26,7 @@ def test_csv_feeder_no_headers(headerless_csv_file, setup_module):
assert urls[0].get_url() == "https://example.com/1/"
assert urls[1].get_url() == "https://example.com/2/"
def test_csv_feeder_with_headers(header_csv_file, setup_module):
from auto_archiver.modules.csv_feeder.csv_feeder import CSVFeeder
@@ -33,10 +37,10 @@ def test_csv_feeder_with_headers(header_csv_file, setup_module):
assert urls[0].get_url() == "https://example.com/1/"
assert urls[1].get_url() == "https://example.com/2/"
def test_csv_feeder_wrong_column(header_csv_file, setup_module, caplog):
from auto_archiver.modules.csv_feeder.csv_feeder import CSVFeeder
with caplog.at_level("WARNING"):
feeder = setup_module(CSVFeeder, {"files": [header_csv_file], "column": 1})
urls = list(feeder)
@@ -54,4 +58,4 @@ def test_csv_feeder_column_by_name(header_csv_file, setup_module):
urls = list(feeder)
assert len(urls) == 2
assert urls[0].get_url() == "https://example.com/1/"
assert urls[1].get_url() == "https://example.com/2/"
assert urls[1].get_url() == "https://example.com/2/"

View File

@@ -2,7 +2,7 @@ from typing import Type
import gspread
import pytest
from auto_archiver.modules.gsheet_feeder import GsheetsFeeder
from auto_archiver.modules.gsheet_feeder_db import GsheetsFeederDB
from auto_archiver.core import Metadata, Feeder
@@ -11,43 +11,40 @@ def test_setup_without_sheet_and_sheet_id(setup_module, mocker):
mocker.patch("gspread.service_account")
with pytest.raises(ValueError):
setup_module(
"gsheet_feeder",
"gsheet_feeder_db",
{"service_account": "dummy.json", "sheet": None, "sheet_id": None},
)
@pytest.fixture
def gsheet_feeder(setup_module, mocker) -> GsheetsFeeder:
def gsheet_feeder(setup_module, mocker) -> GsheetsFeederDB:
config: dict = {
"service_account": "dummy.json",
"sheet": "test-auto-archiver",
"sheet_id": None,
"header": 1,
"columns": {
"url": "link",
"status": "archive status",
"folder": "destination folder",
"archive": "archive location",
"date": "archive date",
"thumbnail": "thumbnail",
"timestamp": "upload timestamp",
"title": "upload title",
"text": "text content",
"screenshot": "screenshot",
"hash": "hash",
"pdq_hash": "perceptual hashes",
"wacz": "wacz",
"replaywebpage": "replaywebpage",
},
"allow_worksheets": set(),
"block_worksheets": set(),
"use_sheet_names_in_stored_paths": True,
}
"service_account": "dummy.json",
"sheet": "test-auto-archiver",
"sheet_id": None,
"header": 1,
"columns": {
"url": "link",
"status": "archive status",
"folder": "destination folder",
"archive": "archive location",
"date": "archive date",
"thumbnail": "thumbnail",
"timestamp": "upload timestamp",
"title": "upload title",
"text": "text content",
"screenshot": "screenshot",
"hash": "hash",
"pdq_hash": "perceptual hashes",
"wacz": "wacz",
"replaywebpage": "replaywebpage",
},
"allow_worksheets": set(),
"block_worksheets": set(),
"use_sheet_names_in_stored_paths": True,
}
mocker.patch("gspread.service_account")
feeder = setup_module(
"gsheet_feeder",
config
)
feeder = setup_module("gsheet_feeder_db", config)
feeder.gsheets_client = mocker.MagicMock()
return feeder
@@ -90,7 +87,7 @@ class MockWorksheet:
return matching.get(col_name, default)
def test__process_rows(gsheet_feeder: GsheetsFeeder):
def test__process_rows(gsheet_feeder: GsheetsFeederDB):
testworksheet = MockWorksheet()
metadata_items = list(gsheet_feeder._process_rows(testworksheet))
assert len(metadata_items) == 3
@@ -98,7 +95,7 @@ def test__process_rows(gsheet_feeder: GsheetsFeeder):
assert metadata_items[0].get("url") == "http://example.com"
def test__set_metadata(gsheet_feeder: GsheetsFeeder):
def test__set_metadata(gsheet_feeder: GsheetsFeederDB):
worksheet = MockWorksheet()
metadata = Metadata()
gsheet_feeder._set_context(metadata, worksheet, 1)
@@ -106,12 +103,12 @@ def test__set_metadata(gsheet_feeder: GsheetsFeeder):
@pytest.mark.skip(reason="Not recognising folder column")
def test__set_metadata_with_folder_pickled(gsheet_feeder: GsheetsFeeder, worksheet):
def test__set_metadata_with_folder_pickled(gsheet_feeder: GsheetsFeederDB, worksheet):
gsheet_feeder._set_context(worksheet, 7)
assert Metadata.get_context("gsheet") == {"row": 1, "worksheet": worksheet}
def test__set_metadata_with_folder(gsheet_feeder: GsheetsFeeder):
def test__set_metadata_with_folder(gsheet_feeder: GsheetsFeederDB):
testworksheet = MockWorksheet()
metadata = Metadata()
testworksheet.wks.title = "TestSheet"
@@ -128,9 +125,7 @@ def test__set_metadata_with_folder(gsheet_feeder: GsheetsFeeder):
(None, "ABC123", "open_by_key", "ABC123", "opening by sheet ID"),
],
)
def test_open_sheet_with_name_or_id(
setup_module, sheet, sheet_id, expected_method, expected_arg, description, mocker
):
def test_open_sheet_with_name_or_id(setup_module, sheet, sheet_id, expected_method, expected_arg, description, mocker):
"""Ensure open_sheet() correctly opens by name or ID based on configuration."""
mock_service_account = mocker.patch("gspread.service_account")
mock_client = mocker.MagicMock()
@@ -140,14 +135,12 @@ def test_open_sheet_with_name_or_id(
# Setup module with parameterized values
feeder = setup_module(
"gsheet_feeder",
"gsheet_feeder_db",
{"service_account": "dummy.json", "sheet": sheet, "sheet_id": sheet_id},
)
sheet_result = feeder.open_sheet()
# Validate the correct method was called
getattr(mock_client, expected_method).assert_called_once_with(
expected_arg
), f"Failed: {description}"
getattr(mock_client, expected_method).assert_called_once_with(expected_arg), f"Failed: {description}"
assert sheet_result == "MockSheet", f"Failed: {description}"
@@ -159,7 +152,7 @@ def test_open_sheet_with_sheet_id(setup_module, mocker):
mock_service_account.return_value = mock_client
mock_client.open_by_key.return_value = "MockSheet"
feeder = setup_module(
"gsheet_feeder",
"gsheet_feeder_db",
{"service_account": "dummy.json", "sheet": None, "sheet_id": "ABC123"},
)
sheet = feeder.open_sheet()
@@ -170,7 +163,7 @@ def test_open_sheet_with_sheet_id(setup_module, mocker):
def test_should_process_sheet(setup_module, mocker):
mocker.patch("gspread.service_account")
gdb = setup_module(
"gsheet_feeder",
"gsheet_feeder_db",
{
"service_account": "dummy.json",
"sheet": "TestSheet",
@@ -179,18 +172,18 @@ def test_should_process_sheet(setup_module, mocker):
"block_worksheets": {"Sheet3"},
},
)
assert gdb.should_process_sheet("TestSheet") == True
assert gdb.should_process_sheet("Sheet3") == False
assert gdb.should_process_sheet("TestSheet") is True
assert gdb.should_process_sheet("Sheet3") is False
# False if allow_worksheets is set
assert gdb.should_process_sheet("AnotherSheet") == False
assert gdb.should_process_sheet("AnotherSheet") is False
@pytest.mark.skip(reason="Requires a real connection")
class TestGSheetsFeederReal:
"""Testing GSheetsFeeder class"""
"""Testing GsheetsFeeder class"""
module_name: str = "gsheet_feeder"
feeder: GsheetsFeeder
module_name: str = "gsheet_feeder_db"
feeder: GsheetsFeederDB
# You must follow the setup process explained in the docs for this to work
config: dict = {
"service_account": "secrets/service_account.json",
@@ -220,9 +213,7 @@ class TestGSheetsFeederReal:
@pytest.fixture(autouse=True)
def setup_feeder(self, setup_module):
assert (
self.module_name is not None
), "self.module_name must be set on the subclass"
assert self.module_name is not None, "self.module_name must be set on the subclass"
assert self.config is not None, "self.config must be a dict set on the subclass"
self.feeder: Type[Feeder] = setup_module(self.module_name, self.config)
@@ -241,9 +232,7 @@ class TestGSheetsFeederReal:
"""Ensure open_sheet() connects to a real Google Sheets instance."""
sheet = self.feeder.open_sheet()
assert sheet is not None, "open_sheet() should return a valid sheet instance"
assert hasattr(
sheet, "worksheets"
), "Returned object should have worksheets method"
assert hasattr(sheet, "worksheets"), "Returned object should have worksheets method"
def test_iter_yields_metadata_real_data(self):
"""Ensure __iter__() yields Metadata objects for real test sheet data."""

View File

@@ -1,7 +1,7 @@
# Note this isn't a feeder, but contained as utility of the gsheet feeder module
import pytest
from auto_archiver.modules.gsheet_feeder import GWorksheet
from auto_archiver.modules.gsheet_feeder_db import GWorksheet
class TestGWorksheet:
@@ -81,40 +81,27 @@ class TestGWorksheet:
(False, ""),
],
)
def test_get_cell_or_default_handles_empty_values(
self, mock_worksheet, when_empty, expected
):
def test_get_cell_or_default_handles_empty_values(self, mock_worksheet, when_empty, expected):
mock_worksheet.get_values.return_value[1][0] = "" # Empty URL cell
g = GWorksheet(mock_worksheet)
assert (
g.get_cell_or_default(
2, "url", default="default", when_empty_use_default=when_empty
)
== expected
)
assert g.get_cell_or_default(2, "url", default="default", when_empty_use_default=when_empty) == expected
def test_get_cell_or_default_handles_missing_columns(self, gworksheet):
assert (
gworksheet.get_cell_or_default(1, "invalid_col", default="safe") == "safe"
)
assert gworksheet.get_cell_or_default(1, "invalid_col", default="safe") == "safe"
# Test write operations
def test_set_cell_updates_correct_position(self, mock_worksheet, gworksheet):
gworksheet.set_cell(2, "url", "new_url")
mock_worksheet.update_cell.assert_called_once_with(2, 1, "new_url")
def test_batch_set_cell_formats_requests_correctly(
self, mock_worksheet, gworksheet
):
def test_batch_set_cell_formats_requests_correctly(self, mock_worksheet, gworksheet):
updates = [(2, "url", "new_url"), (3, "status", "processed")]
gworksheet.batch_set_cell(updates)
expected_batch = [
{"range": "A2", "values": [["new_url"]]},
{"range": "B3", "values": [["processed"]]},
]
mock_worksheet.batch_update.assert_called_once_with(
expected_batch, value_input_option="USER_ENTERED"
)
mock_worksheet.batch_update.assert_called_once_with(expected_batch, value_input_option="USER_ENTERED")
def test_batch_set_cell_truncates_long_values(self, mock_worksheet, gworksheet):
long_value = "x" * 50000

View File

@@ -5,13 +5,13 @@ from auto_archiver.core import Metadata, Media
def test_format(setup_module):
formatter = setup_module(HtmlFormatter)
metadata = Metadata().set("content", "Hello, world!").set_url('https://example.com')
metadata = Metadata().set("content", "Hello, world!").set_url("https://example.com")
final_media = formatter.format(metadata)
assert isinstance(final_media, Media)
assert ".html" in final_media.filename
with open (final_media.filename, "r", encoding="utf-8") as f:
with open(final_media.filename, "r", encoding="utf-8") as f:
content = f.read()
assert "Hello, world!" in content
assert final_media.mimetype == "text/html"
assert "SHA-256:" in final_media.get('hash')
assert "SHA-256:" in final_media.get("hash")

View File

@@ -8,6 +8,7 @@ class TestS3Storage:
"""
Test suite for S3Storage.
"""
module_name: str = "s3_storage"
storage: Type[S3Storage]
config: dict = {
@@ -32,28 +33,28 @@ class TestS3Storage:
"""Test that S3 client is initialized with correct parameters"""
assert self.storage.s3 is not None
assert self.storage.s3.meta.region_name == 'test-region'
assert self.storage.s3.meta.region_name == "test-region"
def test_get_cdn_url_generation(self):
"""Test CDN URL formatting """
"""Test CDN URL formatting"""
media = Media("test.txt")
media.key = "path/to/file.txt"
media._key = "path/to/file.txt"
url = self.storage.get_cdn_url(media)
assert url == "https://cdn.example.com/path/to/file.txt"
media.key = "another/path.jpg"
media._key = "another/path.jpg"
assert self.storage.get_cdn_url(media) == "https://cdn.example.com/another/path.jpg"
def test_uploadf_sets_acl_public(self, mocker):
media = Media("test.txt")
mock_file = mocker.MagicMock()
mock_s3_upload = mocker.patch.object(self.storage.s3, 'upload_fileobj')
mocker.patch.object(self.storage, 'is_upload_needed', return_value=True)
mock_s3_upload = mocker.patch.object(self.storage.s3, "upload_fileobj")
mocker.patch.object(self.storage, "is_upload_needed", return_value=True)
self.storage.uploadf(mock_file, media)
mock_s3_upload.assert_called_once_with(
mock_file,
Bucket='test-bucket',
Bucket="test-bucket",
Key=media.key,
ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/plain'}
ExtraArgs={"ACL": "public-read", "ContentType": "text/plain"},
)
def test_upload_decision_logic(self, mocker):
@@ -61,45 +62,48 @@ class TestS3Storage:
media = Media("test.txt")
assert self.storage.is_upload_needed(media) is True
self.storage.random_no_duplicate = True
mock_calc_hash = mocker.patch('auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash', return_value='beepboop123beepboop123beepboop123')
mock_file_in_folder = mocker.patch.object(self.storage, 'file_in_folder', return_value='existing_key.txt')
mocker.patch(
"auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash",
return_value="beepboop123beepboop123beepboop123",
)
mock_file_in_folder = mocker.patch.object(self.storage, "file_in_folder", return_value="existing_key.txt")
assert self.storage.is_upload_needed(media) is False
assert media.key == 'existing_key.txt'
mock_file_in_folder.assert_called_with('no-dups/beepboop123beepboop123be')
assert media.key == "existing_key.txt"
mock_file_in_folder.assert_called_with("no-dups/beepboop123beepboop123be")
def test_skips_upload_when_duplicate_exists(self, mocker):
"""Test that upload skips when file_in_folder finds existing object"""
self.storage.random_no_duplicate = True
mock_file_in_folder = mocker.patch.object(S3Storage, 'file_in_folder', return_value="existing_folder/existing_file.txt")
mocker.patch.object(S3Storage, "file_in_folder", return_value="existing_folder/existing_file.txt")
media = Media("test.txt")
media.key = "original_path.txt"
mock_calculate_hash = mocker.patch('auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash', return_value="beepboop123beepboop123beepboop123")
media._key = "original_path.txt"
mocker.patch(
"auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash",
return_value="beepboop123beepboop123beepboop123",
)
assert self.storage.is_upload_needed(media) is False
assert media.key == "existing_folder/existing_file.txt"
assert media.get("previously archived") is True
mock_upload = mocker.patch.object(self.storage.s3, 'upload_fileobj')
mock_upload = mocker.patch.object(self.storage.s3, "upload_fileobj")
result = self.storage.uploadf(None, media)
mock_upload.assert_not_called()
assert result is True
def test_uploads_with_correct_parameters(self, mocker):
media = Media("test.txt")
media.key = "original_key.txt"
mocker.patch.object(S3Storage, 'is_upload_needed', return_value=True)
media.mimetype = 'image/png'
media._key = "original_key.txt"
mocker.patch.object(S3Storage, "is_upload_needed", return_value=True)
media.mimetype = "image/png"
mock_file = mocker.MagicMock()
mock_upload = mocker.patch.object(self.storage.s3, 'upload_fileobj')
mock_upload = mocker.patch.object(self.storage.s3, "upload_fileobj")
self.storage.uploadf(mock_file, media)
mock_upload.assert_called_once_with(
mock_file,
Bucket='test-bucket',
Key='original_key.txt',
ExtraArgs={
'ACL': 'public-read',
'ContentType': 'image/png'
}
Bucket="test-bucket",
Key="original_key.txt",
ExtraArgs={"ACL": "public-read", "ContentType": "image/png"},
)
def test_file_in_folder_exists(self, mocker):
mock_list_objects = mocker.patch.object(self.storage.s3, 'list_objects', return_value={'Contents': [{'Key': 'path/to/file.txt'}]})
assert self.storage.file_in_folder('path/to/') == 'path/to/file.txt'
mocker.patch.object(self.storage.s3, "list_objects", return_value={"Contents": [{"Key": "path/to/file.txt"}]})
assert self.storage.file_in_folder("path/to/") == "path/to/file.txt"

View File

@@ -2,7 +2,7 @@ import os
import hashlib
import pytest
from auto_archiver.core import Media, Metadata
from auto_archiver.modules.atlos_storage import AtlosStorage
from auto_archiver.modules.atlos_feeder_db_storage import AtlosFeederDbStorage as AtlosStorage
class FakeAPIResponse:
@@ -21,13 +21,19 @@ class FakeAPIResponse:
@pytest.fixture
def atlos_storage(setup_module) -> AtlosStorage:
def atlos_storage(setup_module, mocker) -> AtlosStorage:
"""Fixture for AtlosStorage."""
configs: dict = {
"api_token": "abc123",
"atlos_url": "https://platform.atlos.org",
}
return setup_module("atlos_storage", configs)
mocker.patch("requests.Session")
atlos_feeder = setup_module("atlos_feeder_db_storage", configs)
mock_session = mocker.MagicMock()
# Configure the default response to have no results so that __iter__ terminates
mock_session.get.return_value = FakeAPIResponse({"next": None, "results": []})
atlos_feeder.session = mock_session
return atlos_feeder
@pytest.fixture
@@ -38,7 +44,7 @@ def media(tmp_path) -> Media:
file_path.write_bytes(content)
media = Media(filename=str(file_path))
media.properties = {"something": "Title"}
media.key = "key"
media._key = "key"
return media
@@ -49,17 +55,6 @@ def test_get_cdn_url(atlos_storage: AtlosStorage) -> None:
assert url == atlos_storage.atlos_url
def test_hash(tmp_path, atlos_storage: AtlosStorage) -> None:
"""Test _hash() computes the correct SHA-256 hash of a file."""
content = b"hello world"
file_path = tmp_path / "test.txt"
file_path.write_bytes(content)
media = Media(filename="dummy.mp4")
media.filename = str(file_path)
expected_hash = hashlib.sha256(content).hexdigest()
assert atlos_storage._hash(media) == expected_hash
def test_upload_no_atlos_id(tmp_path, atlos_storage: AtlosStorage, media: Media, mocker) -> None:
"""Test upload() returns False when metadata lacks atlos_id."""
metadata = Metadata() # atlos_id not set
@@ -69,74 +64,49 @@ def test_upload_no_atlos_id(tmp_path, atlos_storage: AtlosStorage, media: Media,
post_mock.assert_not_called()
def test_upload_already_uploaded(atlos_storage: AtlosStorage,
metadata: Metadata,
media: Media,
tmp_path,
mocker) -> None:
def test_upload_already_uploaded(atlos_storage: AtlosStorage, metadata: Metadata, media: Media, mocker) -> None:
"""Test upload() returns True if media hash already exists."""
content = b"media content"
metadata.set("atlos_id", 101)
media_hash = hashlib.sha256(content).hexdigest()
fake_get = FakeAPIResponse({
"result": {"artifacts": [{"file_hash_sha256": media_hash}]}
})
get_mock = mocker.patch("requests.get", return_value=fake_get)
post_mock = mocker.patch("requests.post")
fake_get_response = {"result": {"artifacts": [{"file_hash_sha256": media_hash}]}}
get_mock = mocker.patch.object(atlos_storage, "_get", return_value=fake_get_response)
post_mock = mocker.patch.object(atlos_storage, "_post")
result = atlos_storage.upload(media, metadata)
assert result is True
get_mock.assert_called_once()
post_mock.assert_not_called()
def test_upload_not_uploaded(tmp_path, atlos_storage: AtlosStorage,
metadata: Metadata,
media: Media,
mocker) -> None:
def test_upload_not_uploaded(tmp_path, atlos_storage: AtlosStorage, metadata: Metadata, media: Media, mocker) -> None:
"""Test upload() uploads media when not already present."""
metadata.set("atlos_id", 202)
fake_get = FakeAPIResponse({
"result": {"artifacts": [{"file_hash_sha256": "different_hash"}]}
})
get_mock = mocker.patch("requests.get", return_value=fake_get)
fake_post = FakeAPIResponse({}, raise_error=False)
post_mock = mocker.patch("requests.post", return_value=fake_post)
fake_get_response = {"result": {"artifacts": [{"file_hash_sha256": "different_hash"}]}}
get_mock = mocker.patch.object(atlos_storage, "_get", return_value=fake_get_response)
fake_post_response = {"result": "uploaded"}
post_mock = mocker.patch.object(atlos_storage, "_post", return_value=fake_post_response)
result = atlos_storage.upload(media, metadata)
assert result is True
get_mock.assert_called_once()
post_mock.assert_called_once()
expected_url = f"{atlos_storage.atlos_url}/api/v2/source_material/upload/202"
expected_headers = {"Authorization": f"Bearer {atlos_storage.api_token}"}
expected_endpoint = "/api/v2/source_material/upload/202"
call_args = post_mock.call_args[0]
assert call_args[0] == expected_endpoint
call_kwargs = post_mock.call_args[1]
expected_params = {"title": media.properties}
call_kwargs = post_mock.call_args.kwargs
assert call_kwargs["headers"] == expected_headers
assert call_kwargs["params"] == expected_params
# Verify the URL passed to requests.post.
posted_url = call_kwargs.get("url") or post_mock.call_args.args[0]
assert posted_url == expected_url
# Verify files parameter contains the correct filename.
file_tuple = call_kwargs["files"]["file"]
assert file_tuple[0] == os.path.basename(media.filename)
def test_upload_post_http_error(tmp_path,
atlos_storage: AtlosStorage,
metadata: Metadata,
media: Media,
mocker) -> None:
def test_upload_post_http_error(
tmp_path, atlos_storage: AtlosStorage, metadata: Metadata, media: Media, mocker
) -> None:
"""Test upload() propagates HTTP error during POST."""
metadata.set("atlos_id", 303)
fake_get = FakeAPIResponse({
"result": {"artifacts": []}
})
mocker.patch("requests.get", return_value=fake_get)
fake_post = FakeAPIResponse({}, raise_error=True)
mocker.patch("requests.post", return_value=fake_post)
fake_get_response = {"result": {"artifacts": []}}
mocker.patch.object(atlos_storage, "_get", return_value=fake_get_response)
mocker.patch.object(atlos_storage, "_post", side_effect=Exception("HTTP error"))
with pytest.raises(Exception, match="HTTP error"):
atlos_storage.upload(media, metadata)
def test_uploadf_not_implemented(atlos_storage: AtlosStorage) -> None:
"""Test uploadf() returns None (not implemented)."""
result = atlos_storage.uploadf(None, "dummy")
assert result is None

View File

@@ -1,37 +1,42 @@
from typing import Type
import pytest
from oauth2client import service_account
from auto_archiver.core import Media
from auto_archiver.modules.gdrive_storage import GDriveStorage
from auto_archiver.core.metadata import Metadata
from tests.storages.test_storage_base import TestStorageBase
@pytest.fixture(autouse=True)
def mock_sleep(mocker):
    """Autouse fixture: patch time.sleep so tests do not wait, and return the mock."""
    patched_sleep = mocker.patch("time.sleep")
    return patched_sleep
@pytest.fixture
def gdrive_storage(setup_module, mocker):
def gdrive_storage(setup_module, mocker) -> GDriveStorage:
module_name: str = "gdrive_storage"
storage: GDriveStorage
config: dict = {'path_generator': 'url',
'filename_generator': 'static',
'root_folder_id': "fake_root_folder_id",
'oauth_token': None,
'service_account': 'fake_service_account.json'
}
mocker.patch('google.oauth2.service_account.Credentials.from_service_account_file')
config: dict = {
"path_generator": "url",
"filename_generator": "static",
"root_folder_id": "fake_root_folder_id",
"oauth_token": None,
"service_account": "fake_service_account.json",
}
mocker.patch("google.oauth2.service_account.Credentials.from_service_account_file")
return setup_module(module_name, config)
def test_initialize_fails_with_non_existent_creds(setup_module):
"""Test that the Google Drive service raises a FileNotFoundError when the service account file does not exist.
(and isn't mocked)
(and isn't mocked)
"""
config: dict = {'path_generator': 'url',
'filename_generator': 'static',
'root_folder_id': "fake_root_folder_id",
'oauth_token': None,
'service_account': 'fake_service_account.json'
}
config: dict = {
"path_generator": "url",
"filename_generator": "static",
"root_folder_id": "fake_root_folder_id",
"oauth_token": None,
"service_account": "fake_service_account.json",
}
with pytest.raises(FileNotFoundError) as exc_info:
setup_module("gdrive_storage", config)
assert "No such file or directory" in str(exc_info.value)
@@ -48,10 +53,10 @@ def test_get_id_from_parent_and_name(gdrive_storage, mocker):
result = gdrive_storage._get_id_from_parent_and_name("parent", "mock", retries=1, use_mime_type=False)
assert result == "123"
def test_path_parts():
media = Media(filename="test.jpg")
media.key = "folder1/folder2/test.jpg"
media._key = "folder1/folder2/test.jpg"
@pytest.mark.skip(reason="Requires real credentials")
@@ -63,19 +68,17 @@ class TestGDriveStorageConnected(TestStorageBase):
module_name: str = "gdrive_storage"
storage: Type[GDriveStorage]
config: dict = {'path_generator': 'url',
'filename_generator': 'static',
# TODO: replace with real root folder id
'root_folder_id': "1TVY_oJt95_dmRSEdP9m5zFy7l50TeCSk",
'oauth_token': None,
'service_account': 'secrets/service_account.json'
}
config: dict = {
"path_generator": "url",
"filename_generator": "static",
# TODO: replace with real root folder id
"root_folder_id": "1TVY_oJt95_dmRSEdP9m5zFy7l50TeCSk",
"oauth_token": None,
"service_account": "secrets/service_account.json",
}
def test_initialize_with_real_credentials(self):
"""
Test that the Google Drive service can be initialized with real credentials.
"""
assert self.storage.service is not None

View File

@@ -1,43 +1,63 @@
import os
from pathlib import Path
import pytest
from auto_archiver.core import Media
from auto_archiver.core import Media, Metadata
from auto_archiver.modules.local_storage import LocalStorage
from auto_archiver.core.consts import SetupError
@pytest.fixture
def local_storage(setup_module) -> LocalStorage:
def local_storage(setup_module, tmp_path) -> LocalStorage:
save_to = tmp_path / "local_archive"
save_to.mkdir()
configs: dict = {
"path_generator": "flat",
"filename_generator": "static",
"save_to": "./local_archive",
"save_to": str(save_to),
"save_absolute": False,
}
return setup_module("local_storage", configs)
@pytest.fixture
def sample_media(tmp_path) -> Media:
    """Return a Media whose filename points at a small text file written under tmp_path."""
    source_path = tmp_path / "source.txt"
    source_path.write_text("test content")
    media = Media(filename=str(source_path))
    return media
def test_too_long_save_path(setup_module):
    """Setting up local_storage with a 400-character save_to raises SetupError."""
    overlong_path = "long" * 100  # 4 characters x 100
    bad_config = {"save_to": overlong_path}
    with pytest.raises(SetupError):
        setup_module("local_storage", bad_config)
def test_get_cdn_url_relative(local_storage):
media = Media(key="test.txt", filename="dummy.txt")
local_storage.filename_generator = "random"
media = Media(filename="dummy.txt")
local_storage.set_key(media, "https://example.com", Metadata())
expected = os.path.join(local_storage.save_to, media.key)
assert local_storage.get_cdn_url(media) == expected
def test_get_cdn_url_absolute(local_storage):
    """With save_absolute on, the CDN URL is the absolute form of save_to/key."""
    local_storage.filename_generator = "random"
    media = Media(filename="dummy.txt")
    local_storage.save_absolute = True
    # The key is generated by the storage rather than passed to the constructor.
    local_storage.set_key(media, "https://example.com", Metadata())
    expected = os.path.abspath(os.path.join(local_storage.save_to, media.key))
    assert local_storage.get_cdn_url(media) == expected
def test_upload_file_contents_and_metadata(local_storage, sample_media):
    """Storing then uploading a media leaves an identical copy under save_to."""
    local_storage.store(sample_media, "https://example.com", Metadata())
    # The key is only available once store() has run.
    stored_copy = os.path.join(local_storage.save_to, sample_media.key)
    upload_result = local_storage.upload(sample_media)
    assert upload_result is True
    source_text = Path(sample_media.filename).read_text()
    assert source_text == Path(stored_copy).read_text()
def test_upload_nonexistent_source(local_storage):
    """Uploading a media whose source file is missing raises FileNotFoundError."""
    # The key is supplied through the ``_key`` constructor argument.
    media = Media(_key="missing.txt", filename="nonexistent.txt")
    with pytest.raises(FileNotFoundError):
        local_storage.upload(media)

View File

@@ -2,21 +2,109 @@ from typing import Type
import pytest
from auto_archiver.core.metadata import Metadata
from auto_archiver.core.metadata import Metadata, Media
from auto_archiver.core.storage import Storage
from auto_archiver.core.module import ModuleFactory
class TestStorageBase(object):
    """Base class for storage tests; subclasses set ``module_name`` and ``config``."""

    # Name of the storage module to load; must be overridden by the subclass.
    module_name: str = None
    # Configuration passed to the module; must be overridden by the subclass.
    config: dict = None

    @pytest.fixture(autouse=True)
    def setup_storage(self, setup_module):
        """Load the configured storage module into ``self.storage`` before each test."""
        assert self.module_name is not None, "self.module_name must be set on the subclass"
        assert self.config is not None, "self.config must be a dict set on the subclass"
        # setup_module returns a module instance, so annotate with the class itself.
        self.storage: Storage = setup_module(self.module_name, self.config)
class TestBaseStorage(Storage):
    """Minimal Storage subclass with stubbed CDN and upload behaviour."""

    name = "test_storage"

    def get_cdn_url(self, media):
        """Return a fixed placeholder URL for any media."""
        return "cdn_url"

    def uploadf(self, file, key, **kwargs):
        """Report success without uploading anything."""
        return True
@pytest.fixture
def dummy_file(tmp_path):
    """Create dummy.txt under tmp_path and return its path as a string."""
    target = tmp_path / "dummy.txt"
    target.write_text("test content")
    return str(target)
@pytest.fixture
def storage_base():
    """Return a factory that builds a configured TestBaseStorage."""

    def _storage_base(config):
        instance = TestBaseStorage()
        instance.config_setup({TestBaseStorage.name: config})
        instance.module_factory = ModuleFactory()
        return instance

    return _storage_base
@pytest.mark.parametrize(
    "path_generator, filename_generator, url, expected_key",
    [
        ("flat", "static", "https://example.com/file/", "folder/6ae8a75555209fd6c44157c0.txt"),
        ("flat", "random", "https://example.com/file/", "folder/pretend-random.txt"),
        ("url", "static", "https://example.com/file/", "folder/https-example-com-file/6ae8a75555209fd6c44157c0.txt"),
        ("url", "random", "https://example.com/file/", "folder/https-example-com-file/pretend-random.txt"),
        ("random", "static", "https://example.com/file/", "folder/pretend-random/6ae8a75555209fd6c44157c0.txt"),
        ("random", "random", "https://example.com/file/", "folder/pretend-random/pretend-random.txt"),
    ],
)
def test_storage_name_generation(
    storage_base, path_generator, filename_generator, url, expected_key, mocker, tmp_path, dummy_file
):
    """Check the key produced for each path/filename generator combination."""
    # Pin the random component so the "random" generators give a predictable key.
    mock_random = mocker.patch("auto_archiver.core.storage.random_str")
    mock_random.return_value = "pretend-random"

    config: dict = {
        "path_generator": path_generator,
        "filename_generator": filename_generator,
    }
    storage: Storage = storage_base(config)
    assert storage.path_generator == path_generator
    assert storage.filename_generator == filename_generator

    metadata = Metadata()
    metadata.set_context("folder", "folder")
    media = Media(filename=dummy_file)
    storage.set_key(media, url, metadata)
    assert media.key == expected_key
def test_really_long_name(storage_base, dummy_file):
    """A very long URL produces a key whose path part is cut short."""
    storage: Storage = storage_base({"path_generator": "url", "filename_generator": "static"})
    long_url = f"https://example.com/{'file' * 100}"
    media = Media(filename=dummy_file)
    storage.set_key(media, long_url, Metadata())
    # Only 13 of the 100 "file" repetitions are expected to survive.
    expected_key = f"https-example-com-{'file' * 13}/6ae8a75555209fd6c44157c0.txt"
    assert media.key == expected_key
def test_storage_loads_hash_enricher(storage_base, dummy_file):
    """Ensure 'hash_enricher' is properly loaded without an explicit import."""
    storage = storage_base({"path_generator": "url", "filename_generator": "static"})
    media = Media(filename=dummy_file)
    item_metadata = Metadata()
    target_url = "https://example.com/file/"

    # Any exception from set_key is reported as a test failure with its message.
    try:
        storage.set_key(media, target_url, item_metadata)
    except Exception as e:
        pytest.fail(f"Storage failed to dynamically load hash_enricher: {e}")

    assert media.key is not None, "Expected media.key to be set, but it was None"

View File

@@ -3,39 +3,46 @@ from auto_archiver.core import config
from ruamel.yaml.scanner import ScannerError
from ruamel.yaml.comments import CommentedMap
def test_return_default_config_for_nonexistent_file():
    """Reading a path that does not exist yields the empty default config."""
    loaded = config.read_yaml("nonexistent_file.yaml")
    assert loaded == config.EMPTY_CONFIG
def test_return_default_config_for_empty_file(tmp_path):
    """Reading a zero-length YAML file yields the empty default config."""
    blank_yaml = tmp_path / "empty_file.yaml"
    blank_yaml.write_text("")
    loaded = config.read_yaml(blank_yaml)
    assert loaded == config.EMPTY_CONFIG
def test_raise_error_on_invalid_yaml(tmp_path):
    """Reading YAML with an unterminated quoted value raises ScannerError."""
    invalid_yaml = tmp_path / "invalid_yaml.yaml"
    invalid_yaml.write_text('key: "value_without_end_quote')
    # make sure it raises ScannerError
    with pytest.raises(ScannerError):
        config.read_yaml(invalid_yaml)
def test_write_yaml(tmp_path):
    """Storing the empty config writes a file containing a ``steps:`` section."""
    target = tmp_path / "write_yaml.yaml"
    config.store_yaml(config.EMPTY_CONFIG, target.as_posix())
    written = target.read_text()
    assert "steps:\n" in written
def test_round_trip_comments(tmp_path):
yaml_file = tmp_path / "round_trip_comments.yaml"
with open(yaml_file, "w") as f:
f.write("generic_extractor:\n facebook_cookie: abc # end of line comment\n subtitles: true\n # comments: false\n # livestreams: false\n list_type:\n - value1\n - value2")
f.write(
"generic_extractor:\n facebook_cookie: abc # end of line comment\n subtitles: true\n # comments: false\n # livestreams: false\n list_type:\n - value1\n - value2"
)
loaded = config.read_yaml(yaml_file)
# check the comments are preserved
assert loaded['generic_extractor']['facebook_cookie'] == "abc"
assert loaded['generic_extractor'].ca.items['facebook_cookie'][2].value == "# end of line comment\n"
assert loaded["generic_extractor"]["facebook_cookie"] == "abc"
assert loaded["generic_extractor"].ca.items["facebook_cookie"][2].value == "# end of line comment\n"
# add some more items to my_settings
loaded['generic_extractor']['list_type'].append("bellingcat")
loaded["generic_extractor"]["list_type"].append("bellingcat")
config.store_yaml(loaded, yaml_file.as_posix())
assert "# comments: false" in yaml_file.read_text()
@@ -43,14 +50,17 @@ def test_round_trip_comments(tmp_path):
assert "abc # end of line comment" in yaml_file.read_text()
assert "- value2\n - bellingcat" in yaml_file.read_text()
def test_merge_dicts():
yaml_dict = config.EMPTY_CONFIG
yaml_dict['settings'] = CommentedMap(**{
yaml_dict["settings"] = CommentedMap(
**{
"key1": ["a"],
"key2": "old_value",
"key3": ["a", "b", "c"],
"key5": "value5",
})
}
)
dotdict = {
"settings.key1": ["b", "c"],
@@ -67,15 +77,16 @@ def test_merge_dicts():
def test_check_types():
    """is_list_type / is_dict_type return real booleans for each container kind."""
    # Identity checks make sure the helpers return True/False, not merely truthy values.
    assert config.is_list_type([]) is True
    assert config.is_list_type(()) is True
    assert config.is_list_type(set()) is True
    assert config.is_list_type({}) is False
    assert config.is_list_type("") is False
    assert config.is_dict_type({}) is True
    assert config.is_dict_type(CommentedMap()) is True
    assert config.is_dict_type([]) is False
    assert config.is_dict_type("") is False
def test_from_dot_notation():
dotdict = {
@@ -88,16 +99,17 @@ def test_from_dot_notation():
assert normal_dict["settings"]["key2"] == "new_value"
assert normal_dict["settings"]["key3"]["key4"] == "value"
def test_to_dot_notation():
    """Nested settings are flattened into dotted keys by to_dot_notation."""
    import copy

    # Work on a copy so the shared EMPTY_CONFIG object is not modified for other tests.
    yaml_dict = copy.deepcopy(config.EMPTY_CONFIG)
    yaml_dict["settings"] = {
        "key1": ["a", "b", "c"],
        "key2": "new_value",
        "key3": {
            "key4": "value",
        },
    }

    dotdict = config.to_dot_notation(yaml_dict)
    assert dotdict["settings.key1"] == ["a", "b", "c"]
    assert dotdict["settings.key2"] == "new_value"
    assert dotdict["settings.key3.key4"] == "value"

View File

@@ -10,21 +10,23 @@ def orchestration_file_path(tmp_path):
folder.mkdir(exist_ok=True)
return (folder / "example_orch.yaml").as_posix()
@pytest.fixture
def orchestration_file(orchestration_file_path):
    """Return a factory that writes ``content`` to the orchestration file path."""

    def _orchestration_file(content=""):
        with open(orchestration_file_path, "w") as f:
            f.write(content)
        # Hand back the path so the caller can pass it on as a config file.
        return orchestration_file_path

    return _orchestration_file
@pytest.fixture
def autoarchiver(tmp_path, monkeypatch, request):
def _autoarchiver(args=[]):
def cleanup():
from loguru import logger
if not logger._core.handlers.get(0):
logger._core.handlers_count = 0
logger.add(sys.stderr)
@@ -44,9 +46,9 @@ def autoarchiver(tmp_path, monkeypatch, request):
def test_run_auto_archiver_no_args(caplog, autoarchiver):
    """Running without arguments exits and logs a hint about supplying URLs."""
    expected_hint = "provide at least one URL via the command line, or set up an alternative feeder"
    with pytest.raises(SystemExit):
        autoarchiver()
    assert expected_hint in caplog.text
def test_run_auto_archiver_invalid_file(caplog, autoarchiver):
# exec 'auto-archiver' on the command line
with pytest.raises(SystemExit):
@@ -54,6 +56,7 @@ def test_run_auto_archiver_invalid_file(caplog, autoarchiver):
assert "Make sure the file exists and try again, or run without th" in caplog.text
def test_run_auto_archiver_empty_file(caplog, autoarchiver, orchestration_file):
# create a valid (empty) orchestration file
path = orchestration_file(content="")
@@ -64,6 +67,7 @@ def test_run_auto_archiver_empty_file(caplog, autoarchiver, orchestration_file):
# should treat an empty file as if there is no file at all
assert " No URLs provided. Please provide at least one URL via the com" in caplog.text
def test_call_autoarchiver_main(caplog, monkeypatch, tmp_path):
from auto_archiver.__main__ import main
@@ -75,4 +79,4 @@ def test_call_autoarchiver_main(caplog, monkeypatch, tmp_path):
with pytest.raises(SystemExit):
main()
assert "No URLs provided. Please provide at least one" in caplog.text
assert "No URLs provided. Please provide at least one" in caplog.text

View File

@@ -62,18 +62,8 @@ def test_simple_merge(basic_metadata):
def test_left_merge():
left = (
Metadata()
.set("tags", ["a"])
.set("stats", {"views": 10})
.set("status", "success")
)
right = (
Metadata()
.set("tags", ["b"])
.set("stats", {"likes": 5})
.set("status", "no archiver")
)
left = Metadata().set("tags", ["a"]).set("stats", {"views": 10}).set("status", "success")
right = Metadata().set("tags", ["b"]).set("stats", {"likes": 5}).set("status", "no archiver")
left.merge(right, overwrite_left=True)
assert left.get("status") == "no archiver"
@@ -120,6 +110,7 @@ def test_is_empty():
def test_store():
    """Placeholder test with no assertions."""
# Test Media operations
@@ -176,6 +167,7 @@ def test_choose_most_complete():
res = Metadata.choose_most_complete([m_more, m_less])
assert res.metadata.get("title") == "Title 1"
def test_choose_most_complete_from_pickles(unpickle):
# test most complete from pickles before and after an enricher has run
# Only compares length of media, not the actual media

View File

@@ -1,40 +1,41 @@
import sys
import pytest
from auto_archiver.core.module import ModuleFactory, LazyBaseModule
from auto_archiver.core.base_module import BaseModule
from auto_archiver.core.consts import SetupError
@pytest.fixture
def example_module():
    """Return the lazily-loaded ``example_module`` from the test modules folder."""
    import auto_archiver

    module_factory = ModuleFactory()
    # Make the test modules folder discoverable alongside the built-in modules.
    auto_archiver.modules.__path__.append("tests/data/test_modules/")
    return module_factory.get_module_lazy("example_module")
def test_get_module_lazy(example_module):
    """The lazy module exposes its name, display name and a manifest."""
    lazy = example_module
    assert lazy.name == "example_module"
    assert lazy.display_name == "Example Module"
    assert lazy.manifest is not None
def test_python_dependency_check(example_module):
    """Loading a module with an unavailable Python dependency raises SetupError."""
    # Monkey patch the manifest to include a nonexistent Python dependency.
    example_module.manifest["dependencies"]["python"] = ["does_not_exist"]

    with pytest.raises(SetupError):
        example_module.load({})
def test_binary_dependency_check(example_module):
    # Monkey patch the manifest so it lists a binary dependency that does not exist.
    missing_binaries = ["does_not_exist"]
    example_module.manifest["dependencies"]["binary"] = missing_binaries
def test_module_dependency_check_loads_module(example_module):
# example_module requires cli_feeder, which is not installed
# monkey patch the manifest to include a nonexistent dependency
@@ -49,19 +50,20 @@ def test_module_dependency_check_loads_module(example_module):
assert module_factory._lazy_modules["hash_enricher"] is not None
assert module_factory._lazy_modules["hash_enricher"]._instance is not None
def test_load_module(example_module):
    """Loading with an empty config yields a BaseModule carrying default settings."""
    # setup the module, and check that config is set to the default values
    loaded_module = example_module.load({})
    assert loaded_module is not None
    assert isinstance(loaded_module, BaseModule)
    assert loaded_module.name == "example_module"
    assert loaded_module.display_name == "Example Module"
    assert loaded_module.config["example_module"] == {"csv_file": "db.csv"}

    # check that the value is set on the module itself
    assert loaded_module.csv_file == "db.csv"
@pytest.mark.parametrize("module_name", ["local_storage", "generic_extractor", "html_formatter", "csv_db"])
def test_load_modules(module_name):
# test that specific modules can be loaded
@@ -78,6 +80,20 @@ def test_load_modules(module_name):
# check that default settings are applied
default_config = module.configs
assert loaded_module.name in loaded_module.config.keys()
defaults = {k for k in default_config}
assert defaults in [loaded_module.config[module_name].keys()]
@pytest.mark.parametrize("module_name", ["local_storage", "generic_extractor", "html_formatter", "csv_db"])
def test_config_defaults(module_name):
    """Each module loaded with an empty config ends up with its declared defaults.

    Note: some modules can alter values in the setup() method; this test
    covers modules that don't.
    """
    lazy_module = ModuleFactory().get_module_lazy(module_name)
    loaded_module = lazy_module.load({})

    # Collect the "default" entry of every declared config option.
    expected_defaults = {}
    for option_name, option_spec in lazy_module.configs.items():
        expected_defaults[option_name] = option_spec.get("default")

    assert expected_defaults == loaded_module.config[module_name]
@pytest.mark.parametrize("module_name", ["local_storage", "generic_extractor", "html_formatter", "csv_db"])
@@ -96,5 +112,3 @@ def test_lazy_base_module(module_name):
assert len(lazy_module.configs) > 0
assert len(lazy_module.description) > 0
assert len(lazy_module.version) > 0

View File

@@ -1,59 +1,73 @@
import pytest
import sys
from argparse import ArgumentParser, ArgumentTypeError
from auto_archiver.core.orchestrator import ArchivingOrchestrator
from auto_archiver.version import __version__
from auto_archiver.core.config import read_yaml, store_yaml
from auto_archiver.core import Metadata
from auto_archiver.core.consts import SetupError
TEST_ORCHESTRATION = "tests/data/test_orchestration.yaml"
TEST_MODULES = "tests/data/test_modules/"
@pytest.fixture
def test_args():
    """Return the baseline command-line arguments shared by the orchestrator tests."""
    return [
        "--config",
        TEST_ORCHESTRATION,
        "--module_paths",
        TEST_MODULES,
        "--example_module.required_field",
        "some_value",
    ]  # just set this for normal testing, we will remove it later
@pytest.fixture
def orchestrator():
    """Provide a fresh ArchivingOrchestrator for each test."""
    instance = ArchivingOrchestrator()
    return instance
@pytest.fixture
def basic_parser(orchestrator) -> ArgumentParser:
    """Provide the basic argument parser built by the orchestrator."""
    parser = orchestrator.setup_basic_parser()
    return parser
def test_setup_orchestrator(orchestrator):
assert orchestrator is not None
def test_parse_config():
    """Placeholder test with no assertions."""
def test_parse_basic(basic_parser):
    """``--config`` is stored on the parsed namespace as ``config_file``."""
    parsed = basic_parser.parse_args(["--config", TEST_ORCHESTRATION])
    assert parsed.config_file == TEST_ORCHESTRATION
@pytest.mark.parametrize("mode", ["simple", "full"])
def test_mode(basic_parser, mode):
    """Both accepted ``--mode`` values are parsed through unchanged."""
    parsed = basic_parser.parse_args(["--mode", mode])
    assert parsed.mode == mode
def test_mode_invalid(basic_parser, capsys):
    """An unknown ``--mode`` exits with code 2 and reports an invalid choice."""
    with pytest.raises(SystemExit) as exit_error:
        basic_parser.parse_args(["--mode", "invalid"])
    exit_code = exit_error.value.code
    assert exit_code == 2
    captured = capsys.readouterr()
    assert "invalid choice" in captured.err
def test_version(basic_parser, capsys):
    """``--version`` prints the package version and exits with code 0."""
    with pytest.raises(SystemExit) as exit_error:
        basic_parser.parse_args(["--version"])
    exit_code = exit_error.value.code
    assert exit_code == 0
    captured = capsys.readouterr()
    assert captured.out == f"{__version__}\n"
def test_help(orchestrator, basic_parser, capsys):
def test_help(orchestrator, basic_parser, capsys):
args = basic_parser.parse_args(["--help"])
assert args.help == True
assert args.help is True
# test the show_help() on orchestrator
with pytest.raises(SystemExit) as exit_error:
@@ -78,19 +92,22 @@ def test_help(orchestrator, basic_parser, capsys):
assert "--logging.level" in logs
# individual module configs
assert "--gsheet_feeder.sheet_id" in logs
assert "--gsheet_feeder_db.sheet_id" in logs
def test_add_custom_modules_path(orchestrator, test_args):
    """setup_config adds the ``--module_paths`` folder to the modules search path."""
    orchestrator.setup_config(test_args)

    import auto_archiver

    search_path = auto_archiver.modules.__path__
    assert "tests/data/test_modules/" in search_path
def test_add_custom_modules_path_invalid(orchestrator, caplog, test_args):
    """A module path that does not exist is skipped with a logged message."""
    orchestrator.setup_config(
        test_args  # we still need to load the real path to get the example_module
        + ["--module_paths", "tests/data/invalid_test_modules/"]
    )

    assert caplog.records[0].message == "Path 'tests/data/invalid_test_modules/' does not exist. Skipping..."
@@ -99,16 +116,16 @@ def test_check_required_values(orchestrator, caplog, test_args):
# drop the example_module.required_field from the test_args
test_args = test_args[:-2]
with pytest.raises(SystemExit) as exit_error:
config = orchestrator.setup_config(test_args)
with pytest.raises(SystemExit):
orchestrator.setup_config(test_args)
assert caplog.records[1].message == "the following arguments are required: --example_module.required_field"
def test_get_required_values_from_config(orchestrator, test_args, tmp_path):
def test_get_required_values_from_config(orchestrator, test_args, tmp_path):
# load the default example yaml, add a required field, then run the orchestrator
test_yaml = read_yaml(TEST_ORCHESTRATION)
test_yaml['example_module'] = {'required_field': 'some_value'}
test_yaml["example_module"] = {"required_field": "some_value"}
# write it to a temp file
tmp_file = (tmp_path / "temp_config.yaml").as_posix()
store_yaml(test_yaml, tmp_file)
@@ -117,27 +134,42 @@ def test_get_required_values_from_config(orchestrator, test_args, tmp_path):
config = orchestrator.setup_config(["--config", tmp_file, "--module_paths", TEST_MODULES])
assert config is not None
def test_load_authentication_string(orchestrator, test_args):
    """A JSON string given to ``--authentication`` is parsed into a dict."""
    config = orchestrator.setup_config(
        test_args + ["--authentication", '{"facebook.com": {"username": "my_username", "password": "my_password"}}']
    )
    assert config["authentication"] == {"facebook.com": {"username": "my_username", "password": "my_password"}}
def test_load_authentication_string_concat_site(orchestrator, test_args):
    """A comma-separated site key is expanded into one entry per site."""
    config = orchestrator.setup_config(test_args + ["--authentication", '{"x.com,twitter.com": {"api_key": "my_key"}}'])
    assert config["authentication"] == {"x.com": {"api_key": "my_key"}, "twitter.com": {"api_key": "my_key"}}
def test_load_invalid_authentication_string(orchestrator, test_args):
    """Malformed JSON passed to ``--authentication`` raises ArgumentTypeError."""
    with pytest.raises(ArgumentTypeError):
        orchestrator.setup_config(test_args + ["--authentication", "{''invalid_json"])
def test_load_authentication_invalid_dict(orchestrator, test_args):
    """Valid JSON that is not an object (here a list) raises ArgumentTypeError."""
    list_payload_args = test_args + ["--authentication", "[true, false]"]
    with pytest.raises(ArgumentTypeError):
        orchestrator.setup_config(list_payload_args)
def test_load_modules_from_commandline(orchestrator, test_args):
args = test_args + ["--feeders", "example_module", "--extractors", "example_module", "--databases", "example_module", "--enrichers", "example_module", "--formatters", "example_module"]
args = test_args + [
"--feeders",
"example_module",
"--extractors",
"example_module",
"--databases",
"example_module",
"--enrichers",
"example_module",
"--formatters",
"example_module",
]
orchestrator.setup(args)
@@ -153,27 +185,37 @@ def test_load_modules_from_commandline(orchestrator, test_args):
assert orchestrator.enrichers[0].name == "example_module"
assert orchestrator.formatters[0].name == "example_module"
def test_load_settings_for_module_from_commandline(orchestrator, test_args):
    """Module-scoped options given on the command line end up in the config."""
    args = test_args + [
        "--feeders",
        "gsheet_feeder_db",
        "--gsheet_feeder_db.sheet_id",
        "123",
        "--gsheet_feeder_db.service_account",
        "tests/data/test_service_account.json",
    ]

    orchestrator.setup(args)

    assert len(orchestrator.feeders) == 1
    assert orchestrator.feeders[0].name == "gsheet_feeder_db"
    assert orchestrator.config["gsheet_feeder_db"]["sheet_id"] == "123"
def test_multiple_orchestrator(test_args):
o1_args = test_args + ["--feeders", "gsheet_feeder", "--gsheet_feeder.service_account", "tests/data/test_service_account.json"]
o1_args = test_args + [
"--feeders",
"gsheet_feeder_db",
"--gsheet_feeder_db.service_account",
"tests/data/test_service_account.json",
]
o1 = ArchivingOrchestrator()
with pytest.raises(ValueError) as exit_error:
# this should fail because the gsheet_feeder requires a sheet_id / sheet
with pytest.raises(ValueError):
# this should fail because the gsheet_feeder_db requires a sheet_id / sheet
o1.setup(o1_args)
o2_args = test_args + ["--feeders", "example_module"]
o2 = ArchivingOrchestrator()
o2.setup(o2_args)
@@ -182,4 +224,16 @@ def test_multiple_orchestrator(test_args):
output: Metadata = list(o2.feed())
assert len(output) == 1
assert output[0].get_url() == "https://example.com"
assert output[0].get_url() == "https://example.com"
def test_wrong_step_type(test_args, caplog):
    """Using an extractor module as a feeder raises SetupError naming the module."""
    # example_extractor is not a valid feeder!
    args = test_args + ["--feeders", "example_extractor"]
    orchestrator = ArchivingOrchestrator()
    with pytest.raises(SetupError) as err:
        orchestrator.setup(args)
    message = str(err.value)
    assert "Module 'example_extractor' is not a feeder" in message

View File

@@ -14,7 +14,7 @@ from auto_archiver.utils.misc import (
update_nested_dict,
calculate_file_hash,
random_str,
get_timestamp
get_timestamp,
)
@@ -38,40 +38,46 @@ class TestDirectoryUtils:
mkdir_if_not_exists(existing_dir)
assert existing_dir.exists()
class TestURLExpansion:
    """Tests for expand_url with ``requests.get`` patched out."""

    @pytest.mark.parametrize(
        "input_url,expected",
        [("https://example.com", "https://example.com"), ("https://t.co/test", "https://expanded.url")],
    )
    def test_expand_url(self, input_url, expected, mocker):
        """Only the t.co URL is expected to be replaced by the response URL."""
        mock_response = mocker.Mock()
        mock_response.url = "https://expanded.url"
        mocker.patch("requests.get", return_value=mock_response)

        result = expand_url(input_url)
        assert result == expected

    def test_expand_url_handles_errors(self, caplog, mocker):
        """A failing request returns the original URL and logs the failure."""
        mocker.patch("requests.get", side_effect=Exception("Connection error"))
        url = "https://t.co/error"
        result = expand_url(url)
        assert result == url
        assert f"Failed to expand url {url}" in caplog.text
class TestAttributeHandling:
    """Tests for getattr_or."""

    class Sample:
        """Object with one set attribute and one attribute that is None."""

        exists = "value"
        none = None

    @pytest.mark.parametrize(
        "obj,attr,default,expected",
        [
            (Sample(), "exists", "default", "value"),
            (Sample(), "none", "default", "default"),
            (Sample(), "missing", "default", "default"),
            (None, "anything", "fallback", "fallback"),
        ],
    )
    def test_getattr_or(self, obj, attr, default, expected):
        # Test gets attribute or returns a default value
        assert getattr_or(obj, attr, default) == expected
class TestDateTimeHandling:
def test_datetime_encoder(self, sample_datetime):
result = json.dumps({"dt": sample_datetime}, cls=DateTimeEncoder)
@@ -83,11 +89,14 @@ class TestDateTimeHandling:
result = dump_payload(payload)
assert str(sample_datetime) in result
@pytest.mark.parametrize("dt_str,fmt,expected", [
("2023-01-01 12:00:00+00:00", None, datetime(2023, 1, 1, 12, 0, tzinfo=timezone.utc)),
("20230101 120000", "%Y%m%d %H%M%S", datetime(2023, 1, 1, 12, 0)),
("invalid", None, None),
])
@pytest.mark.parametrize(
"dt_str,fmt,expected",
[
("2023-01-01 12:00:00+00:00", None, datetime(2023, 1, 1, 12, 0, tzinfo=timezone.utc)),
("20230101 120000", "%Y%m%d %H%M%S", datetime(2023, 1, 1, 12, 0)),
("invalid", None, None),
],
)
def test_datetime_from_string(self, dt_str, fmt, expected):
result = get_datetime_from_str(dt_str, fmt)
if expected is None:
@@ -95,16 +104,21 @@ class TestDateTimeHandling:
else:
assert result == expected.replace(tzinfo=result.tzinfo)
class TestDictUtils:
    """Tests for update_nested_dict."""

    @pytest.mark.parametrize(
        "original,update,expected",
        [
            ({"a": 1}, {"b": 2}, {"a": 1, "b": 2}),
            ({"nested": {"a": 1}}, {"nested": {"b": 2}}, {"nested": {"a": 1, "b": 2}}),
            ({"a": {"b": {"c": 1}}}, {"a": {"b": {"c": 2}}}, {"a": {"b": {"c": 2}}}),
        ],
    )
    def test_update_nested_dict(self, original, update, expected):
        """The update is applied in place, merging nested dictionaries."""
        update_nested_dict(original, update)
        assert original == expected
class TestHashingUtils:
def test_file_hashing(self, sample_file):
expected = hashlib.sha256(b"test content").hexdigest()
@@ -118,6 +132,7 @@ class TestHashingUtils:
expected = hashlib.sha256(content).hexdigest()
assert calculate_file_hash(str(file_path)) == expected
class TestMiscUtils:
def test_random_str_length(self):
for length in [8, 16, 32]:
@@ -131,14 +146,17 @@ class TestMiscUtils:
def test_random_str_uniqueness(self):
    """Two consecutive random_str() calls give different strings."""
    first = random_str()
    second = random_str()
    assert first != second
@pytest.mark.parametrize(
    "ts_input,utc,iso,expected_type",
    [
        (datetime.now(), True, True, str),
        ("2023-01-01T12:00:00+00:00", False, False, datetime),
        (1672574400, True, True, str),
    ],
)
def test_timestamp_parsing(self, ts_input, utc, iso, expected_type):
    """get_timestamp returns the expected type for each input and flag combination."""
    result = get_timestamp(ts_input, utc=utc, iso=iso)
    assert isinstance(result, expected_type)
def test_invalid_timestamp_returns_none(self):
    """An unparseable date string gives None."""
    assert get_timestamp("invalid-date") is None

113
tests/utils/test_urls.py Normal file
View File

@@ -0,0 +1,113 @@
import pytest
from auto_archiver.utils.url import (
is_auth_wall,
check_url_or_raise,
domain_for_url,
is_relevant_url,
remove_get_parameters,
twitter_best_quality_url,
)
@pytest.mark.parametrize(
    "url, is_auth",
    [
        ("https://example.com", False),
        ("https://t.me/c/abc/123", True),
        ("https://t.me/not-private/", False),
        ("https://instagram.com", True),
        ("https://www.instagram.com", True),
        ("https://www.instagram.com/p/INVALID", True),
        ("https://www.instagram.com/p/C4QgLbrIKXG/", True),
    ],
)
def test_is_auth_wall(url, is_auth):
    """is_auth_wall matches the expected flag for each sample URL."""
    detected = is_auth_wall(url)
    assert detected == is_auth
@pytest.mark.parametrize(
    "url, raises",
    [
        ("http://example.com", False),
        ("https://example.com", False),
        ("ftp://example.com", True),
        ("http://localhost", True),
        ("http://", True),
    ],
)
def test_check_url_or_raise(url, raises):
    """check_url_or_raise is truthy for accepted URLs and raises ValueError otherwise."""
    if not raises:
        assert check_url_or_raise(url)
        return

    with pytest.raises(ValueError):
        check_url_or_raise(url)
@pytest.mark.parametrize(
    "url, domain",
    [
        ("https://example.com", "example.com"),
        ("https://www.example.com", "www.example.com"),
        ("https://www.example.com/path", "www.example.com"),
        ("https://", ""),
        ("http://localhost", "localhost"),
    ],
)
def test_domain_for_url(url, domain):
    """domain_for_url returns the host part, or an empty string when there is none."""
    extracted = domain_for_url(url)
    assert extracted == domain
@pytest.mark.parametrize(
    "url, without_get",
    [
        ("https://example.com", "https://example.com"),
        ("https://example.com?utm_source=example", "https://example.com"),
        ("https://example.com?utm_source=example&other=1", "https://example.com"),
        ("https://example.com/something", "https://example.com/something"),
        ("https://example.com/something?utm_source=example", "https://example.com/something"),
    ],
)
def test_remove_get_parameters(url, without_get):
    """remove_get_parameters drops the query string and keeps the path."""
    stripped = remove_get_parameters(url)
    assert stripped == without_get
@pytest.mark.parametrize(
    "url, relevant",
    [
        ("https://example.com", True),
        ("https://example.com/favicon.ico", False),
        ("https://twimg.com/profile_images", False),
        ("https://twimg.com/something/default_profile_images", False),
        ("https://scontent.cdninstagram.com/username/150x150.jpg", False),
        ("https://static.cdninstagram.com/rsrc.php/", False),
        ("https://telegram.org/img/emoji/", False),
        ("https://www.youtube.com/s/gaming/emoji/", False),
        ("https://yt3.ggpht.com/default-user=", False),
        ("https://www.youtube.com/s/search/audio/", False),
        ("https://ok.ru/res/i/", False),
        ("https://vk.com/emoji/", False),
        ("https://vk.com/images/", False),
        ("https://vk.com/images/reaction/", False),
        ("https://wikipedia.org/static", False),
        ("https://example.com/file.svg", False),
        ("https://example.com/file.ico", False),
        ("https://example.com/file.mp4", True),
        ("https://example.com/150x150.jpg", True),
        ("https://example.com/rsrc.php/", True),
        ("https://example.com/img/emoji/", True),
    ],
)
def test_is_relevant_url(url, relevant):
    """is_relevant_url matches the expected flag for each sample URL."""
    verdict = is_relevant_url(url)
    assert verdict == relevant
@pytest.mark.parametrize(
    "url, best_quality",
    [
        ("https://twitter.com/some_image.jpg?name=small", "https://twitter.com/some_image.jpg?name=orig"),
        ("https://twitter.com/some_image.jpg", "https://twitter.com/some_image.jpg"),
        ("https://twitter.com/some_image.jpg?name=orig", "https://twitter.com/some_image.jpg?name=orig"),
    ],
)
def test_twitter_best_quality_url(url, best_quality):
    """A ``name=`` size parameter is rewritten to ``orig``; other URLs are unchanged."""
    upgraded = twitter_best_quality_url(url)
    assert upgraded == best_quality