mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-08 03:18:28 +03:00
Merge main into timestamping_enricher
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
"""
|
||||
pytest conftest file, for shared fixtures and configuration
|
||||
"""
|
||||
|
||||
import os
|
||||
import pickle
|
||||
from datetime import datetime, timezone
|
||||
@@ -16,32 +17,36 @@ from auto_archiver.core.module import ModuleFactory
|
||||
# that you only want to run if everything else succeeds (e.g. API calls). The order here is important
|
||||
# what comes first will be run first (at the end of all other tests not mentioned)
|
||||
# format is the name of the module (python file) without the .py extension
|
||||
TESTS_TO_RUN_LAST = ['test_twitter_api_archiver']
|
||||
TESTS_TO_RUN_LAST = ["test_twitter_api_archiver"]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def setup_module(request):
|
||||
def _setup_module(module_name, config={}):
|
||||
|
||||
def _setup_module(module_name, config=None):
|
||||
if config is None:
|
||||
config = {}
|
||||
module_factory = ModuleFactory()
|
||||
|
||||
if isinstance(module_name, type):
|
||||
# get the module name:
|
||||
# if the class does not have a .name, use the name of the parent folder
|
||||
module_name = module_name.__module__.rsplit(".",2)[-2]
|
||||
module_name = module_name.__module__.rsplit(".", 2)[-2]
|
||||
|
||||
m = module_factory.get_module(module_name, {module_name: config})
|
||||
# add the tmp_dir to the module
|
||||
tmp_dir = TemporaryDirectory()
|
||||
m.tmp_dir = tmp_dir.name
|
||||
|
||||
|
||||
def cleanup():
|
||||
tmp_dir.cleanup()
|
||||
|
||||
request.addfinalizer(cleanup)
|
||||
|
||||
return m
|
||||
|
||||
return _setup_module
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def check_hash():
|
||||
def _check_hash(filename: str, hash: str):
|
||||
@@ -51,6 +56,7 @@ def check_hash():
|
||||
|
||||
return _check_hash
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def make_item():
|
||||
def _make_item(url: str, **kwargs) -> Metadata:
|
||||
@@ -62,7 +68,6 @@ def make_item():
|
||||
return _make_item
|
||||
|
||||
|
||||
|
||||
def pytest_collection_modifyitems(items):
|
||||
module_mapping = {item: item.module.__name__.split(".")[-1] for item in items}
|
||||
|
||||
@@ -78,13 +83,13 @@ def pytest_collection_modifyitems(items):
|
||||
items[:] = sorted_items
|
||||
|
||||
|
||||
|
||||
# Incremental testing - fail tests in a class if any previous test fails
|
||||
# taken from https://docs.pytest.org/en/latest/example/simple.html#incremental-testing-test-steps
|
||||
|
||||
# store history of failures per test class name and per index in parametrize (if parametrize used)
|
||||
_test_failed_incremental: Dict[str, Dict[Tuple[int, ...], str]] = {}
|
||||
|
||||
|
||||
def pytest_runtest_makereport(item, call):
|
||||
if "incremental" in item.keywords:
|
||||
# incremental marker is used
|
||||
@@ -93,17 +98,11 @@ def pytest_runtest_makereport(item, call):
|
||||
# retrieve the class name of the test
|
||||
cls_name = str(item.cls)
|
||||
# retrieve the index of the test (if parametrize is used in combination with incremental)
|
||||
parametrize_index = (
|
||||
tuple(item.callspec.indices.values())
|
||||
if hasattr(item, "callspec")
|
||||
else ()
|
||||
)
|
||||
parametrize_index = tuple(item.callspec.indices.values()) if hasattr(item, "callspec") else ()
|
||||
# retrieve the name of the test function
|
||||
test_name = item.originalname or item.name
|
||||
# store in _test_failed_incremental the original name of the failed test
|
||||
_test_failed_incremental.setdefault(cls_name, {}).setdefault(
|
||||
parametrize_index, test_name
|
||||
)
|
||||
_test_failed_incremental.setdefault(cls_name, {}).setdefault(parametrize_index, test_name)
|
||||
|
||||
|
||||
def pytest_runtest_setup(item):
|
||||
@@ -119,16 +118,17 @@ def pytest_runtest_setup(item):
|
||||
pytest.xfail(f"previous test failed ({test_name})")
|
||||
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
@pytest.fixture
|
||||
def unpickle():
|
||||
"""
|
||||
Returns a helper function that unpickles a file
|
||||
** gets the file from the test_files directory: tests/data/ **
|
||||
"""
|
||||
|
||||
def _unpickle(path):
|
||||
with open(os.path.join("tests/data", path), "rb") as f:
|
||||
return pickle.load(f)
|
||||
|
||||
return _unpickle
|
||||
|
||||
|
||||
@@ -151,9 +151,9 @@ def sample_datetime():
|
||||
return datetime(2023, 1, 1, 12, 0, tzinfo=timezone.utc)
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
@pytest.fixture
|
||||
def mock_sleep(mocker):
|
||||
"""Globally mock time.sleep to avoid delays."""
|
||||
"""Mock time.sleep to avoid delays."""
|
||||
return mocker.patch("time.sleep")
|
||||
|
||||
|
||||
@@ -162,4 +162,4 @@ def metadata():
|
||||
metadata = Metadata()
|
||||
metadata.set("_processed_at", "2021-01-01T00:00:00")
|
||||
metadata.set_url("https://example.com")
|
||||
return metadata
|
||||
return metadata
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# this is a dummy class used to test importing a dropin in the
|
||||
# generic extractor by filename/path
|
||||
|
||||
|
||||
class Dropin:
|
||||
pass
|
||||
pass
|
||||
|
||||
11
tests/data/test_modules/example_extractor/__manifest__.py
Normal file
11
tests/data/test_modules/example_extractor/__manifest__.py
Normal file
@@ -0,0 +1,11 @@
|
||||
{
    # Human-readable display name of the module.
    "name": "Example Extractor",
    # Optional version number; only meaningful for the module author's own versioning.
    "version": 2.0,
    # Module type(s): one or more of the built-in module types.
    "type": ["extractor"],
    # Whether the module needs additional user setup before it can be used
    # (for example: adding API keys or installing additional software).
    "requires_setup": False,
}
|
||||
@@ -0,0 +1,6 @@
|
||||
from auto_archiver.core import Extractor
|
||||
|
||||
|
||||
class ExampleExtractor(Extractor):
    """Bare-bones ``Extractor`` subclass that lives under the test-data modules."""

    def download(self, item):
        """Print ``download``; ``item`` is ignored and nothing is returned."""
        print("download")
|
||||
@@ -1 +1 @@
|
||||
from .example_module import ExampleModule
|
||||
from .example_module import ExampleModule
|
||||
|
||||
@@ -16,14 +16,14 @@
|
||||
"dependencies": {
|
||||
"python": ["loguru"],
|
||||
"bin": ["bash"],
|
||||
},
|
||||
# configurations that this module takes. These are argparse-compliant dictionaries, that are
|
||||
},
|
||||
# configurations that this module takes. These are argparse-compliant dictionaries, that are
|
||||
# used to create command line arguments when the programme is run.
|
||||
# The full name of the config option will become: `module_name.config_name`
|
||||
"configs": {
|
||||
"csv_file": {"default": "db.csv", "help": "CSV file name"},
|
||||
"required_field": {"required": True, "help": "required field in the CSV file"},
|
||||
},
|
||||
"csv_file": {"default": "db.csv", "help": "CSV file name"},
|
||||
"required_field": {"required": True, "help": "required field in the CSV file"},
|
||||
},
|
||||
# A description of the module, used for documentation
|
||||
"description": "This is an example module",
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
from auto_archiver.core import Extractor, Enricher, Feeder, Database, Storage, Formatter, Metadata
|
||||
|
||||
|
||||
class ExampleModule(Extractor, Enricher, Feeder, Database, Storage, Formatter):
|
||||
def download(self, item):
|
||||
print("download")
|
||||
@@ -7,7 +8,6 @@ class ExampleModule(Extractor, Enricher, Feeder, Database, Storage, Formatter):
|
||||
def __iter__(self):
|
||||
yield Metadata().set_url("https://example.com")
|
||||
|
||||
|
||||
def done(self, result):
|
||||
print("done")
|
||||
|
||||
@@ -16,13 +16,12 @@ class ExampleModule(Extractor, Enricher, Feeder, Database, Storage, Formatter):
|
||||
|
||||
def get_cdn_url(self, media):
|
||||
return "nice_url"
|
||||
|
||||
|
||||
def save(self, item):
|
||||
print("save")
|
||||
|
||||
|
||||
def uploadf(self, file, key, **kwargs):
|
||||
print("uploadf")
|
||||
|
||||
|
||||
def format(self, item):
|
||||
print("format")
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
import pytest
|
||||
|
||||
from auto_archiver.core import Metadata
|
||||
from auto_archiver.modules.api_db import AAApiDb
|
||||
|
||||
|
||||
@@ -41,9 +40,16 @@ def test_fetch(api_db, metadata, mocker):
|
||||
mock_datetime = mocker.patch("auto_archiver.core.metadata.datetime.datetime")
|
||||
mock_datetime.now.return_value = "2021-01-01T00:00:00"
|
||||
mock_get.return_value.status_code = 200
|
||||
mock_get.return_value.json.return_value = [{"result": {}}, {"result":
|
||||
{'media': [], 'metadata': {'_processed_at': '2021-01-01T00:00:00', 'url': 'https://example.com'},
|
||||
'status': 'no archiver'}}]
|
||||
mock_get.return_value.json.return_value = [
|
||||
{"result": {}},
|
||||
{
|
||||
"result": {
|
||||
"media": [],
|
||||
"metadata": {"_processed_at": "2021-01-01T00:00:00", "url": "https://example.com"},
|
||||
"status": "no archiver",
|
||||
}
|
||||
},
|
||||
]
|
||||
assert api_db.fetch(metadata) == metadata
|
||||
|
||||
|
||||
@@ -52,8 +58,15 @@ def test_done_success(api_db, metadata, mocker):
|
||||
mock_post.return_value.status_code = 201
|
||||
api_db.done(metadata)
|
||||
mock_post.assert_called_once()
|
||||
mock_post.assert_called_once_with("https://api.example.com/interop/submit-archive",
|
||||
json={'author_id': 'Someone', 'url': 'https://example.com',
|
||||
'public': False, 'group_id': '123', 'tags': ['[', ']'], 'result': '{"status": "no archiver", "metadata": {"_processed_at": "2021-01-01T00:00:00", "url": "https://example.com"}, "media": []}'},
|
||||
headers={'Authorization': 'Bearer test-token'})
|
||||
|
||||
mock_post.assert_called_once_with(
|
||||
"https://api.example.com/interop/submit-archive",
|
||||
json={
|
||||
"author_id": "Someone",
|
||||
"url": "https://example.com",
|
||||
"public": False,
|
||||
"group_id": "123",
|
||||
"tags": ["[", "]"],
|
||||
"result": '{"status": "no archiver", "metadata": {"_processed_at": "2021-01-01T00:00:00", "url": "https://example.com"}, "media": []}',
|
||||
},
|
||||
headers={"Authorization": "Bearer test-token"},
|
||||
)
|
||||
|
||||
@@ -2,7 +2,7 @@ import pytest
|
||||
from datetime import datetime
|
||||
|
||||
from auto_archiver.core import Metadata
|
||||
from auto_archiver.modules.atlos_db import AtlosDb
|
||||
from auto_archiver.modules.atlos_feeder_db_storage import AtlosFeederDbStorage as AtlosDb
|
||||
|
||||
|
||||
class FakeAPIResponse:
|
||||
@@ -12,19 +12,28 @@ class FakeAPIResponse:
|
||||
self._data = data
|
||||
self.raise_error = raise_error
|
||||
|
||||
def json(self) -> dict:
|
||||
return self._data
|
||||
|
||||
def raise_for_status(self) -> None:
|
||||
if self.raise_error:
|
||||
raise Exception("HTTP error")
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def atlos_db(setup_module) -> AtlosDb:
|
||||
def atlos_db(setup_module, mocker) -> AtlosDb:
|
||||
"""Fixture for AtlosDb."""
|
||||
configs: dict = {
|
||||
"api_token": "abc123",
|
||||
"atlos_url": "https://platform.atlos.org",
|
||||
}
|
||||
return setup_module("atlos_db", configs)
|
||||
mocker.patch("requests.Session")
|
||||
atlos_feeder = setup_module("atlos_feeder_db_storage", configs)
|
||||
fake_session = mocker.MagicMock()
|
||||
# Configure the default response to have no results so that __iter__ terminates
|
||||
fake_session.get.return_value = FakeAPIResponse({"next": None, "results": []})
|
||||
atlos_feeder.session = fake_session
|
||||
return atlos_feeder
|
||||
|
||||
|
||||
def test_failed_no_atlos_id(atlos_db, metadata, mocker):
|
||||
@@ -38,25 +47,18 @@ def test_failed_with_atlos_id(atlos_db, metadata, mocker):
|
||||
"""Test failed() posts failure when atlos_id is present."""
|
||||
metadata.set("atlos_id", 42)
|
||||
fake_resp = FakeAPIResponse({}, raise_error=False)
|
||||
post_mock = mocker.patch("requests.post", return_value=fake_resp)
|
||||
post_mock = mocker.patch.object(atlos_db, "_post", return_value=fake_resp)
|
||||
atlos_db.failed(metadata, "failure reason")
|
||||
expected_url = (
|
||||
f"{atlos_db.atlos_url}/api/v2/source_material/metadata/42/auto_archiver"
|
||||
)
|
||||
expected_headers = {"Authorization": f"Bearer {atlos_db.api_token}"}
|
||||
expected_json = {
|
||||
"metadata": {"processed": True, "status": "error", "error": "failure reason"}
|
||||
}
|
||||
post_mock.assert_called_once_with(
|
||||
expected_url, headers=expected_headers, json=expected_json
|
||||
)
|
||||
expected_endpoint = "/api/v2/source_material/metadata/42/auto_archiver"
|
||||
expected_json = {"metadata": {"processed": True, "status": "error", "error": "failure reason"}}
|
||||
post_mock.assert_called_once_with(expected_endpoint, json=expected_json)
|
||||
|
||||
|
||||
def test_failed_http_error(atlos_db, metadata, mocker):
|
||||
"""Test failed() raises exception on HTTP error."""
|
||||
metadata.set("atlos_id", 42)
|
||||
fake_resp = FakeAPIResponse({}, raise_error=True)
|
||||
mocker.patch("requests.post", return_value=fake_resp)
|
||||
# Patch _post to raise an exception instead of returning a fake response.
|
||||
mocker.patch.object(atlos_db, "_post", side_effect=Exception("HTTP error"))
|
||||
with pytest.raises(Exception, match="HTTP error"):
|
||||
atlos_db.failed(metadata, "failure reason")
|
||||
|
||||
@@ -81,12 +83,9 @@ def test_done_with_atlos_id(atlos_db, metadata, mocker):
|
||||
now = datetime.now()
|
||||
metadata.set("timestamp", now)
|
||||
fake_resp = FakeAPIResponse({}, raise_error=False)
|
||||
post_mock = mocker.patch("requests.post", return_value=fake_resp)
|
||||
post_mock = mocker.patch.object(atlos_db, "_post", return_value=fake_resp)
|
||||
atlos_db.done(metadata)
|
||||
expected_url = (
|
||||
f"{atlos_db.atlos_url}/api/v2/source_material/metadata/99/auto_archiver"
|
||||
)
|
||||
expected_headers = {"Authorization": f"Bearer {atlos_db.api_token}"}
|
||||
expected_endpoint = "/api/v2/source_material/metadata/99/auto_archiver"
|
||||
expected_results = metadata.metadata.copy()
|
||||
expected_results["timestamp"] = now.isoformat()
|
||||
expected_json = {
|
||||
@@ -96,15 +95,13 @@ def test_done_with_atlos_id(atlos_db, metadata, mocker):
|
||||
"results": expected_results,
|
||||
}
|
||||
}
|
||||
post_mock.assert_called_once_with(
|
||||
expected_url, headers=expected_headers, json=expected_json
|
||||
)
|
||||
post_mock.assert_called_once_with(expected_endpoint, json=expected_json)
|
||||
|
||||
|
||||
def test_done_http_error(atlos_db, metadata, mocker):
|
||||
"""Test done() raises exception on HTTP error."""
|
||||
"""Test done() raises an exception on HTTP error."""
|
||||
metadata.set("atlos_id", 123)
|
||||
fake_resp = FakeAPIResponse({}, raise_error=True)
|
||||
mocker.patch("requests.post", return_value=fake_resp)
|
||||
# Patch _post to raise an exception.
|
||||
mocker.patch.object(atlos_db, "_post", side_effect=Exception("HTTP error"))
|
||||
with pytest.raises(Exception, match="HTTP error"):
|
||||
atlos_db.done(metadata)
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
|
||||
from auto_archiver.modules.csv_db import CSVDb
|
||||
from auto_archiver.core import Metadata
|
||||
|
||||
@@ -9,12 +8,21 @@ def test_store_item(tmp_path, setup_module):
|
||||
temp_db = tmp_path / "temp_db.csv"
|
||||
db = setup_module(CSVDb, {"csv_file": temp_db.as_posix()})
|
||||
|
||||
item = Metadata().set_url("http://example.com").set_title("Example").set_content("Example content").success("my-archiver")
|
||||
item = (
|
||||
Metadata()
|
||||
.set_url("http://example.com")
|
||||
.set_title("Example")
|
||||
.set_content("Example content")
|
||||
.success("my-archiver")
|
||||
)
|
||||
|
||||
db.done(item)
|
||||
|
||||
with open(temp_db, "r", encoding="utf-8") as f:
|
||||
assert f.read().strip() == f"status,metadata,media\nmy-archiver: success,\"{{'_processed_at': {repr(item.get('_processed_at'))}, 'url': 'http://example.com', 'title': 'Example', 'content': 'Example content'}}\",[]"
|
||||
assert (
|
||||
f.read().strip()
|
||||
== f"status,metadata,media\nmy-archiver: success,\"{{'_processed_at': {repr(item.get('_processed_at'))}, 'url': 'http://example.com', 'title': 'Example', 'content': 'Example content'}}\",[]"
|
||||
)
|
||||
|
||||
# TODO: csv db doesn't have a fetch method - need to add it (?)
|
||||
# assert db.fetch(item) == item
|
||||
# assert db.fetch(item) == item
|
||||
|
||||
@@ -2,8 +2,7 @@ from datetime import datetime, timezone
|
||||
import pytest
|
||||
|
||||
from auto_archiver.core import Metadata, Media
|
||||
from auto_archiver.modules.gsheet_db import GsheetsDb
|
||||
from auto_archiver.modules.gsheet_feeder import GWorksheet
|
||||
from auto_archiver.modules.gsheet_feeder_db import GsheetsFeederDB, GWorksheet
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@@ -29,6 +28,7 @@ def mock_metadata(mocker):
|
||||
metadata.get_first_image.return_value = None
|
||||
return metadata
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def metadata():
|
||||
metadata = Metadata()
|
||||
@@ -52,13 +52,36 @@ def mock_media(mocker):
|
||||
mock_media.get.return_value = "not-calculated"
|
||||
return mock_media
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def gsheets_db(mock_gworksheet, setup_module, mocker):
|
||||
db = setup_module("gsheet_db", {
|
||||
"allow_worksheets": "set()",
|
||||
"block_worksheets": "set()",
|
||||
"use_sheet_names_in_stored_paths": "True",
|
||||
})
|
||||
def gsheets_db(mock_gworksheet, setup_module, mocker) -> GsheetsFeederDB:
|
||||
mocker.patch("gspread.service_account")
|
||||
config: dict = {
|
||||
"sheet": "testsheet",
|
||||
"sheet_id": None,
|
||||
"header": 1,
|
||||
"service_account": "test/service_account.json",
|
||||
"columns": {
|
||||
"url": "link",
|
||||
"status": "archive status",
|
||||
"folder": "destination folder",
|
||||
"archive": "archive location",
|
||||
"date": "archive date",
|
||||
"thumbnail": "thumbnail",
|
||||
"timestamp": "upload timestamp",
|
||||
"title": "upload title",
|
||||
"text": "text content",
|
||||
"screenshot": "screenshot",
|
||||
"hash": "hash",
|
||||
"pdq_hash": "perceptual hashes",
|
||||
"wacz": "wacz",
|
||||
"replaywebpage": "replaywebpage",
|
||||
},
|
||||
"allow_worksheets": set(),
|
||||
"block_worksheets": set(),
|
||||
"use_sheet_names_in_stored_paths": True,
|
||||
}
|
||||
db = setup_module("gsheet_feeder_db", config)
|
||||
db._retrieve_gsheet = mocker.MagicMock(return_value=(mock_gworksheet, 1))
|
||||
return db
|
||||
|
||||
@@ -72,20 +95,21 @@ def fixed_timestamp():
|
||||
@pytest.fixture
|
||||
def expected_calls(mock_media, fixed_timestamp):
|
||||
"""Fixture for the expected cell updates."""
|
||||
return [
|
||||
(1, 'status', 'my-archiver: success'),
|
||||
(1, 'archive', 'http://example.com/screenshot.png'),
|
||||
(1, 'date', '2025-02-01T00:00:00+00:00'),
|
||||
(1, 'title', 'Example Title'),
|
||||
(1, 'text', 'Example Content'),
|
||||
(1, 'timestamp', '2025-01-01T00:00:00+00:00'),
|
||||
(1, 'hash', 'not-calculated'),
|
||||
return [
|
||||
(1, "status", "my-archiver: success"),
|
||||
(1, "archive", "http://example.com/screenshot.png"),
|
||||
(1, "date", "2025-02-01T00:00:00+00:00"),
|
||||
(1, "title", "Example Title"),
|
||||
(1, "text", "Example Content"),
|
||||
(1, "timestamp", "2025-01-01T00:00:00+00:00"),
|
||||
(1, "hash", "not-calculated"),
|
||||
# (1, 'screenshot', 'http://example.com/screenshot.png'),
|
||||
# (1, 'thumbnail', '=IMAGE("http://example.com/thumbnail.png")'),
|
||||
# (1, 'wacz', 'http://example.com/browsertrix.wacz'),
|
||||
# (1, 'replaywebpage', 'https://replayweb.page/?source=http%3A%2F%2Fexample.com%2Fbrowsertrix.wacz#view=pages&url=')
|
||||
]
|
||||
|
||||
|
||||
def test_retrieve_gsheet(gsheets_db, metadata, mock_gworksheet):
|
||||
gw, row = gsheets_db._retrieve_gsheet(metadata)
|
||||
assert gw == mock_gworksheet
|
||||
@@ -94,27 +118,34 @@ def test_retrieve_gsheet(gsheets_db, metadata, mock_gworksheet):
|
||||
|
||||
def test_started(gsheets_db, mock_metadata, mock_gworksheet):
|
||||
gsheets_db.started(mock_metadata)
|
||||
mock_gworksheet.set_cell.assert_called_once_with(1, 'status', 'Archive in progress')
|
||||
mock_gworksheet.set_cell.assert_called_once_with(1, "status", "Archive in progress")
|
||||
|
||||
|
||||
def test_failed(gsheets_db, mock_metadata, mock_gworksheet):
|
||||
reason = "Test failure"
|
||||
gsheets_db.failed(mock_metadata, reason)
|
||||
mock_gworksheet.set_cell.assert_called_once_with(1, 'status', f'Archive failed {reason}')
|
||||
mock_gworksheet.set_cell.assert_called_once_with(1, "status", f"Archive failed {reason}")
|
||||
|
||||
|
||||
def test_aborted(gsheets_db, mock_metadata, mock_gworksheet):
|
||||
gsheets_db.aborted(mock_metadata)
|
||||
mock_gworksheet.set_cell.assert_called_once_with(1, 'status', '')
|
||||
mock_gworksheet.set_cell.assert_called_once_with(1, "status", "")
|
||||
|
||||
|
||||
def test_done(gsheets_db, metadata, mock_gworksheet, expected_calls, mocker):
|
||||
mocker.patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00')
|
||||
mocker.patch(
|
||||
"auto_archiver.modules.gsheet_feeder_db.gsheet_feeder_db.get_current_timestamp",
|
||||
return_value="2025-02-01T00:00:00+00:00",
|
||||
)
|
||||
gsheets_db.done(metadata)
|
||||
mock_gworksheet.batch_set_cell.assert_called_once_with(expected_calls)
|
||||
|
||||
|
||||
def test_done_cached(gsheets_db, metadata, mock_gworksheet, mocker):
|
||||
mocker.patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00')
|
||||
mocker.patch(
|
||||
"auto_archiver.modules.gsheet_feeder_db.gsheet_feeder_db.get_current_timestamp",
|
||||
return_value="2025-02-01T00:00:00+00:00",
|
||||
)
|
||||
gsheets_db.done(metadata, cached=True)
|
||||
|
||||
# Verify the status message includes "[cached]"
|
||||
@@ -125,15 +156,17 @@ def test_done_cached(gsheets_db, metadata, mock_gworksheet, mocker):
|
||||
def test_done_missing_media(gsheets_db, metadata, mock_gworksheet, mocker):
|
||||
# clear media from metadata
|
||||
metadata.media = []
|
||||
mocker.patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00')
|
||||
mocker.patch(
|
||||
"auto_archiver.modules.gsheet_feeder_db.gsheet_feeder_db.get_current_timestamp",
|
||||
return_value="2025-02-01T00:00:00+00:00",
|
||||
)
|
||||
gsheets_db.done(metadata)
|
||||
# Verify nothing media-related gets updated
|
||||
call_args = mock_gworksheet.batch_set_cell.call_args[0][0]
|
||||
media_fields = {'archive', 'screenshot', 'thumbnail', 'wacz', 'replaywebpage'}
|
||||
media_fields = {"archive", "screenshot", "thumbnail", "wacz", "replaywebpage"}
|
||||
assert all(call[1] not in media_fields for call in call_args)
|
||||
|
||||
|
||||
def test_safe_status_update(gsheets_db, metadata, mock_gworksheet):
|
||||
gsheets_db._safe_status_update(metadata, "Test status")
|
||||
mock_gworksheet.set_cell.assert_called_once_with(1, 'status', 'Test status')
|
||||
|
||||
|
||||
mock_gworksheet.set_cell.assert_called_once_with(1, "status", "Test status")
|
||||
|
||||
@@ -4,34 +4,50 @@ from auto_archiver.modules.hash_enricher import HashEnricher
|
||||
from auto_archiver.core import Metadata, Media
|
||||
from auto_archiver.core.module import ModuleFactory
|
||||
|
||||
@pytest.mark.parametrize("algorithm, filename, expected_hash", [
|
||||
("SHA-256", "tests/data/testfile_1.txt", "1b4f0e9851971998e732078544c96b36c3d01cedf7caa332359d6f1d83567014"),
|
||||
("SHA-256", "tests/data/testfile_2.txt", "60303ae22b998861bce3b28f33eec1be758a213c86c93c076dbe9f558c11c752"),
|
||||
("SHA3-512", "tests/data/testfile_1.txt", "d2d8cc4f369b340130bd2b29b8b54e918b7c260c3279176da9ccaa37c96eb71735fc97568e892dc6220bf4ae0d748edb46bd75622751556393be3f482e6f794e"),
|
||||
("SHA3-512", "tests/data/testfile_2.txt", "e35970edaa1e0d8af7d948491b2da0450a49fd9cc1e83c5db4c6f175f9550cf341f642f6be8cfb0bfa476e4258e5088c5ad549087bf02811132ac2fa22b734c6")
|
||||
])
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"algorithm, filename, expected_hash",
|
||||
[
|
||||
("SHA-256", "tests/data/testfile_1.txt", "1b4f0e9851971998e732078544c96b36c3d01cedf7caa332359d6f1d83567014"),
|
||||
("SHA-256", "tests/data/testfile_2.txt", "60303ae22b998861bce3b28f33eec1be758a213c86c93c076dbe9f558c11c752"),
|
||||
(
|
||||
"SHA3-512",
|
||||
"tests/data/testfile_1.txt",
|
||||
"d2d8cc4f369b340130bd2b29b8b54e918b7c260c3279176da9ccaa37c96eb71735fc97568e892dc6220bf4ae0d748edb46bd75622751556393be3f482e6f794e",
|
||||
),
|
||||
(
|
||||
"SHA3-512",
|
||||
"tests/data/testfile_2.txt",
|
||||
"e35970edaa1e0d8af7d948491b2da0450a49fd9cc1e83c5db4c6f175f9550cf341f642f6be8cfb0bfa476e4258e5088c5ad549087bf02811132ac2fa22b734c6",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_calculate_hash(algorithm, filename, expected_hash, setup_module):
|
||||
# test SHA-256
|
||||
he = setup_module(HashEnricher, {"algorithm": algorithm, "chunksize": 100})
|
||||
assert he.calculate_hash(filename) == expected_hash
|
||||
|
||||
|
||||
def test_default_config_values(setup_module):
    """A ``HashEnricher`` created without config overrides uses SHA-256 and a 16,000,000-byte chunk size."""
    enricher = setup_module(HashEnricher)

    assert enricher.algorithm == "SHA-256"
    assert enricher.chunksize == 16_000_000
|
||||
|
||||
|
||||
def test_config():
|
||||
# test default config
|
||||
c = ModuleFactory().get_module_lazy('hash_enricher').configs
|
||||
c = ModuleFactory().get_module_lazy("hash_enricher").configs
|
||||
assert c["algorithm"]["default"] == "SHA-256"
|
||||
assert c["chunksize"]["default"] == 16000000
|
||||
assert c["algorithm"]["choices"] == ["SHA-256", "SHA3-512"]
|
||||
assert c["algorithm"]["help"] == "hash algorithm to use"
|
||||
assert c["chunksize"]["help"] == "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"
|
||||
assert (
|
||||
c["chunksize"]["help"]
|
||||
== "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"
|
||||
)
|
||||
|
||||
|
||||
def test_hash_media(setup_module):
|
||||
|
||||
he = setup_module(HashEnricher, {"algorithm": "SHA-256", "chunksize": 1})
|
||||
|
||||
# generate metadata with two test files
|
||||
@@ -46,4 +62,4 @@ def test_hash_media(setup_module):
|
||||
he.enrich(m)
|
||||
|
||||
assert m.media[0].get("hash") == "SHA-256:1b4f0e9851971998e732078544c96b36c3d01cedf7caa332359d6f1d83567014"
|
||||
assert m.media[1].get("hash") == "SHA-256:60303ae22b998861bce3b28f33eec1be758a213c86c93c076dbe9f558c11c752"
|
||||
assert m.media[1].get("hash") == "SHA-256:60303ae22b998861bce3b28f33eec1be758a213c86c93c076dbe9f558c11c752"
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
import datetime
|
||||
from datetime import datetime, timedelta, timezone
|
||||
|
||||
import pytest
|
||||
@@ -16,6 +15,7 @@ def mock_metadata(mocker):
|
||||
mock.get_all_media.return_value = []
|
||||
return mock
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_media(mocker):
|
||||
"""Creates a mock Media object."""
|
||||
@@ -59,6 +59,7 @@ def test_enrich_file_sizes(meta_enricher, metadata, tmp_path):
|
||||
assert metadata.get("total_bytes") == 3000
|
||||
assert metadata.get("total_size") == "2.9 KB"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"size, expected",
|
||||
[
|
||||
@@ -74,6 +75,7 @@ def test_human_readable_bytes(size, expected):
|
||||
enricher = MetaEnricher()
|
||||
assert enricher.human_readable_bytes(size) == expected
|
||||
|
||||
|
||||
def test_enrich_file_sizes_no_media(meta_enricher, metadata):
|
||||
"""Test that enrich_file_sizes() handles empty media list gracefully."""
|
||||
meta_enricher.enrich_file_sizes(metadata)
|
||||
@@ -91,4 +93,4 @@ def test_enrich_archive_duration(meta_enricher, metadata, mocker):
|
||||
mock_datetime.now.return_value = mock_now
|
||||
meta_enricher.enrich_archive_duration(metadata)
|
||||
|
||||
assert metadata.get("archive_duration_seconds") == 630
|
||||
assert metadata.get("archive_duration_seconds") == 630
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
|
||||
import pytest
|
||||
|
||||
from auto_archiver.core import Media
|
||||
@@ -33,9 +32,7 @@ def test_get_metadata(enricher, output, expected, mocker):
|
||||
|
||||
result = enricher.get_metadata("test.jpg")
|
||||
assert result == expected
|
||||
mock_run.assert_called_once_with(
|
||||
["exiftool", "test.jpg"], capture_output=True, text=True
|
||||
)
|
||||
mock_run.assert_called_once_with(["exiftool", "test.jpg"], capture_output=True, text=True)
|
||||
|
||||
|
||||
def test_get_metadata_exiftool_not_found(enricher, mocker):
|
||||
@@ -85,4 +82,3 @@ def test_metadata_pickle(enricher, unpickle, mocker):
|
||||
actual_media = metadata.media
|
||||
assert len(expected_media) == len(actual_media)
|
||||
assert actual_media[0].properties.get("metadata") == expected_media[0].properties.get("metadata")
|
||||
|
||||
|
||||
276
tests/enrichers/test_opentimestamps_enricher.py
Normal file
276
tests/enrichers/test_opentimestamps_enricher.py
Normal file
@@ -0,0 +1,276 @@
|
||||
import pytest
|
||||
import hashlib
|
||||
|
||||
from opentimestamps.core.timestamp import Timestamp, DetachedTimestampFile
|
||||
from opentimestamps.calendar import RemoteCalendar
|
||||
from opentimestamps.core.notary import PendingAttestation, BitcoinBlockHeaderAttestation
|
||||
|
||||
from auto_archiver.core import Metadata, Media
|
||||
|
||||
|
||||
# TODO: Remove once timestamping overhaul is merged
|
||||
@pytest.fixture
def sample_media(tmp_path) -> Media:
    """Return a ``Media`` keyed ``subdir/test.txt`` whose file is a small text file in ``tmp_path``."""
    source_path = tmp_path / "source.txt"
    source_path.write_text("test content")
    return Media(_key="subdir/test.txt", filename=str(source_path))
|
||||
|
||||
|
||||
@pytest.fixture
def sample_file_path(tmp_path):
    """Write a short text file into ``tmp_path`` and return its path as a string."""
    target = tmp_path / "test.txt"
    target.write_text("This is a test file content for OpenTimestamps")
    return str(target)
|
||||
|
||||
|
||||
@pytest.fixture
def detached_timestamp_file():
    """Build a detached timestamp carrying one pending and one Bitcoin attestation.

    The timestamp is created over the SHA-256 digest of ``b"Test content"``;
    the Bitcoin attestation uses block height 783000.
    """
    from opentimestamps.core.op import OpSHA256

    digest = hashlib.sha256(b"Test content").digest()
    hash_op = OpSHA256()
    stamp = Timestamp(digest)

    # One attestation of each kind: still pending at a calendar, and anchored in a block.
    stamp.attestations.add(PendingAttestation("https://example.calendar.com"))
    stamp.attestations.add(BitcoinBlockHeaderAttestation(783000))

    return DetachedTimestampFile(hash_op, stamp)
|
||||
|
||||
|
||||
@pytest.fixture
def verified_timestamp_file():
    """Build a detached timestamp whose only attestation is a Bitcoin block-header one.

    The timestamp is created over the SHA-256 digest of ``b"Verified content"``;
    the attestation uses block height 783000.
    """
    from opentimestamps.core.op import OpSHA256

    digest = hashlib.sha256(b"Verified content").digest()
    hash_op = OpSHA256()
    stamp = Timestamp(digest)

    # No pending attestation here: the Bitcoin attestation is the only one.
    stamp.attestations.add(BitcoinBlockHeaderAttestation(783000))

    return DetachedTimestampFile(hash_op, stamp)
|
||||
|
||||
|
||||
@pytest.fixture
def pending_timestamp_file():
    """Build a detached timestamp that carries two pending attestations and nothing else.

    The timestamp is created over the SHA-256 digest of ``b"Pending content"``.
    """
    from opentimestamps.core.op import OpSHA256

    digest = hashlib.sha256(b"Pending content").digest()
    hash_op = OpSHA256()
    stamp = Timestamp(digest)

    # Two distinct calendar URLs, both still pending.
    first_pending = PendingAttestation("https://example1.calendar.com")
    second_pending = PendingAttestation("https://example2.calendar.com")
    stamp.attestations.add(first_pending)
    stamp.attestations.add(second_pending)

    return DetachedTimestampFile(hash_op, stamp)
|
||||
|
||||
|
||||
@pytest.mark.download
def test_download_tsr(setup_module, mocker):
    """Submit a digest to a calendar whose ``submit`` method is patched.

    ``RemoteCalendar.submit`` is replaced by a mock, so no network request is
    made; the test checks that the call goes through the mock and hands back
    the prepared ``Timestamp``.
    """
    # Patch submit on the class so the calendar instance created below uses the mock.
    submit_mock = mocker.patch.object(RemoteCalendar, "submit")
    expected_timestamp = Timestamp(hashlib.sha256(b"test").digest())
    submit_mock.return_value = expected_timestamp

    remote = RemoteCalendar("https://alice.btc.calendar.opentimestamps.org")
    digest = hashlib.sha256(b"Test file content").digest()
    returned = remote.submit(digest)

    assert submit_mock.called
    assert isinstance(returned, Timestamp)
    assert returned == expected_timestamp
|
||||
|
||||
|
||||
def test_verify_timestamp(setup_module, detached_timestamp_file):
    """verify_timestamp reports both attestations of a mixed pending/Bitcoin timestamp."""
    enricher = setup_module("opentimestamps_enricher")

    info = enricher.verify_timestamp(detached_timestamp_file)

    # Summary fields
    assert info["attestation_count"] == 2
    assert info["verified"] is True

    attestations = info["attestations"]
    assert len(attestations) == 2

    # Both statuses must be represented
    statuses = [entry["status"] for entry in attestations]
    assert "pending" in statuses
    assert "confirmed" in statuses

    # The confirmed entry exposes the block height set by the fixture
    confirmed = next(entry for entry in attestations if entry["status"] == "confirmed")
    assert confirmed["block_height"] == 783000
|
||||
|
||||
|
||||
def test_verify_pending_only(setup_module, pending_timestamp_file):
    """A timestamp with only pending attestations is reported as not verified."""
    enricher = setup_module("opentimestamps_enricher")

    info = enricher.verify_timestamp(pending_timestamp_file)

    assert info["attestation_count"] == 2
    assert info["verified"] is False

    # No attestation may have a status other than "pending"
    assert not any(entry["status"] != "pending" for entry in info["attestations"])

    # Both calendar URIs from the fixture must be reported
    reported_uris = [entry["uri"] for entry in info["attestations"]]
    for expected_uri in ("https://example1.calendar.com", "https://example2.calendar.com"):
        assert expected_uri in reported_uris
|
||||
|
||||
|
||||
def test_verify_bitcoin_completed(setup_module, verified_timestamp_file):
    """Test verification of timestamps with completed Bitcoin attestations"""

    ots = setup_module("opentimestamps_enricher")

    verification_info = ots.verify_timestamp(verified_timestamp_file)

    assert verification_info["attestation_count"] == 1
    assert verification_info["verified"] is True
    # Original check: the result dict has no top-level "pending" key.
    assert "pending" not in verification_info
    # Key membership alone says nothing about the attestations themselves, so
    # also require that no individual attestation is reported as pending.
    assert all(a["status"] != "pending" for a in verification_info["attestations"])

    # Check that the attestation is a Bitcoin attestation
    attestation = verification_info["attestations"][0]
    assert attestation["status"] == "confirmed"
    assert attestation["block_height"] == 783000
|
||||
|
||||
|
||||
def test_full_enriching(setup_module, sample_file_path, sample_media, mocker):
    """Run enrich end to end with calendar submission replaced by a mock."""

    def fake_submit(digest):
        # Each call gets its own timestamp, already carrying a Bitcoin attestation.
        stamp = Timestamp(digest)
        stamp.attestations.add(BitcoinBlockHeaderAttestation(783000))
        return stamp

    # Patch submit on the class so the real implementation is never invoked.
    mocker.patch.object(RemoteCalendar, "submit", side_effect=fake_submit)

    enricher = setup_module("opentimestamps_enricher")

    # Metadata holding the sample file as its single media item
    item = Metadata().set_url("https://example.com")
    sample_media.filename = sample_file_path
    item.add_media(sample_media)

    enricher.enrich(item)

    # Top-level results
    assert item.get("opentimestamped") is True
    assert item.get("opentimestamps_count") == 1

    # Still exactly one parent media item: the original
    assert len(item.media) == 1
    parent = item.media[0]

    # The original media was flagged
    assert parent.get("opentimestamps") is True

    # The timestamp file hangs off the original media
    stamp_files = parent.get("opentimestamp_files")
    assert len(stamp_files) == 1
    stamp_media = stamp_files[0]

    assert stamp_media.get("opentimestamps_version") is not None

    # Verification results stored on the timestamp media
    assert stamp_media.get("verified") is True
    assert stamp_media.get("attestation_count") == 1
|
||||
|
||||
|
||||
def test_full_enriching_one_calendar_error(
    setup_module, sample_file_path, sample_media, mocker, pending_timestamp_file
):
    """Enrichment still succeeds when one of two calendar submissions raises."""
    # Timestamp returned by the submission that succeeds, with a Bitcoin attestation attached
    good_stamp = Timestamp(bytes.fromhex("583988e03646c26fa290c5c2408540a2f4e2aa9be087aa4546aefb531385b935"))
    good_stamp.attestations.add(BitcoinBlockHeaderAttestation(783000))

    # First submit call returns the timestamp, the second one raises.
    mocker.patch.object(
        RemoteCalendar,
        "submit",
        side_effect=[good_stamp, Exception("Calendar server error")],
    )

    calendar_urls = [
        "https://alice.btc.calendar.opentimestamps.org",
        "https://bob.btc.calendar.opentimestamps.org",
    ]
    enricher = setup_module("opentimestamps_enricher", {"calendar_urls": calendar_urls})

    # Metadata holding the sample file
    item = Metadata().set_url("https://example.com")
    sample_media.filename = sample_file_path
    item.add_media(sample_media)

    # Must not propagate the calendar error
    enricher.enrich(item)

    assert item.get("opentimestamped") is True
    # Only the submission that returned a timestamp is counted
    assert item.get("opentimestamps_count") == 1
|
||||
|
||||
|
||||
def test_full_enriching_calendar_error(setup_module, sample_file_path, sample_media, mocker):
    """Enrichment completes, but records no timestamp, when every submission raises."""
    # Every call to submit raises
    mocker.patch.object(RemoteCalendar, "submit", side_effect=Exception("Calendar server error"))

    enricher = setup_module("opentimestamps_enricher")

    # Metadata holding the sample file
    item = Metadata().set_url("https://example.com")
    sample_media.filename = sample_file_path
    item.add_media(sample_media)

    # Must not propagate the calendar error
    enricher.enrich(item)

    assert item.get("opentimestamped") is False
    assert item.get("opentimestamps_count") is None
|
||||
|
||||
|
||||
def test_no_files_to_stamp(setup_module):
    """Enriching metadata without media leaves it unstamped."""
    enricher = setup_module("opentimestamps_enricher")

    # Metadata with a URL but no media attached
    empty_item = Metadata().set_url("https://example.com")

    enricher.enrich(empty_item)

    # Nothing was timestamped and no media appeared
    assert empty_item.get("opentimestamped") is None
    assert len(empty_item.media) == 0
|
||||
@@ -14,23 +14,21 @@ def enricher(setup_module):
|
||||
def metadata_with_images():
|
||||
m = Metadata()
|
||||
m.set_url("https://example.com")
|
||||
m.add_media(Media(filename="image1.jpg", key="image1"))
|
||||
m.add_media(Media(filename="image2.jpg", key="image2"))
|
||||
m.add_media(Media(filename="image1.jpg", _key="image1"))
|
||||
m.add_media(Media(filename="image2.jpg", _key="image2"))
|
||||
return m
|
||||
|
||||
|
||||
def test_successful_enrich(metadata_with_images, mocker):
|
||||
with (
|
||||
mocker.patch("pdqhash.compute", return_value=([1, 0, 1, 0] * 64, 100)),
|
||||
mocker.patch("PIL.Image.open"),
|
||||
mocker.patch.object(Media, "is_image", return_value=True) as mock_is_image,
|
||||
):
|
||||
enricher = PdqHashEnricher()
|
||||
enricher.enrich(metadata_with_images)
|
||||
mocker.patch("pdqhash.compute", return_value=([1, 0, 1, 0] * 64, 100))
|
||||
mocker.patch("PIL.Image.open")
|
||||
mocker.patch.object(Media, "is_image", return_value=True)
|
||||
enricher = PdqHashEnricher()
|
||||
enricher.enrich(metadata_with_images)
|
||||
|
||||
# Ensure the hash is set for image media
|
||||
for media in metadata_with_images.media:
|
||||
assert media.get("pdq_hash") is not None
|
||||
# Ensure the hash is set for image media
|
||||
for media in metadata_with_images.media:
|
||||
assert media.get("pdq_hash") is not None
|
||||
|
||||
|
||||
def test_enrich_skip_non_image(metadata_with_images, mocker):
|
||||
@@ -59,7 +57,7 @@ def test_enrich_handles_corrupted_image(metadata_with_images, mocker):
|
||||
("screenshot", False),
|
||||
("warc-file-123", False),
|
||||
("regular-image", True),
|
||||
]
|
||||
],
|
||||
)
|
||||
def test_enrich_excludes_by_filetype(media_id, should_have_hash, mocker):
|
||||
metadata = Metadata()
|
||||
@@ -75,4 +73,3 @@ def test_enrich_excludes_by_filetype(media_id, should_have_hash, mocker):
|
||||
|
||||
media_item = metadata.media[0]
|
||||
assert (media_item.get("pdq_hash") is not None) == should_have_hash
|
||||
|
||||
|
||||
@@ -15,13 +15,15 @@ def mock_selenium_env(mocker):
|
||||
mock_which = mocker.patch("shutil.which")
|
||||
mock_driver_class = mocker.patch("auto_archiver.utils.webdriver.CookieSettingDriver")
|
||||
mock_binary_paths = mocker.patch("selenium.webdriver.common.selenium_manager.SeleniumManager.binary_paths")
|
||||
mock_is_file = mocker.patch("pathlib.Path.is_file", return_value=True)
|
||||
mocker.patch("pathlib.Path.is_file", return_value=True)
|
||||
mock_popen = mocker.patch("subprocess.Popen")
|
||||
mock_is_connectable = mocker.patch("selenium.webdriver.common.service.Service.is_connectable", return_value=True)
|
||||
mocker.patch("selenium.webdriver.common.service.Service.is_connectable", return_value=True)
|
||||
mock_firefox_options = mocker.patch("selenium.webdriver.FirefoxOptions")
|
||||
|
||||
# Define side effect for `shutil.which`
|
||||
def mock_which_side_effect(dep):
|
||||
return "/mock/geckodriver" if dep == "geckodriver" else None
|
||||
|
||||
mock_which.side_effect = mock_which_side_effect
|
||||
|
||||
# Mock binary paths
|
||||
@@ -83,8 +85,8 @@ def test_enrich_adds_screenshot(
|
||||
mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env
|
||||
screenshot_enricher.enrich(metadata_with_video)
|
||||
mock_driver_class.assert_called_once_with(
|
||||
cookies=None,
|
||||
cookiejar=None,
|
||||
cookie=None,
|
||||
cookie_jar=None,
|
||||
facebook_accept_cookies=False,
|
||||
options=mock_options_instance,
|
||||
)
|
||||
@@ -104,13 +106,7 @@ def test_enrich_adds_screenshot(
|
||||
],
|
||||
)
|
||||
def test_enrich_auth_wall(
|
||||
screenshot_enricher,
|
||||
metadata_with_video,
|
||||
mock_selenium_env,
|
||||
common_patches,
|
||||
url,
|
||||
is_auth,
|
||||
mocker
|
||||
screenshot_enricher, metadata_with_video, mock_selenium_env, common_patches, url, is_auth, mocker
|
||||
):
|
||||
# Testing with and without is_auth_wall
|
||||
mock_driver, mock_driver_class, _ = mock_selenium_env
|
||||
@@ -128,9 +124,39 @@ def test_enrich_auth_wall(
|
||||
assert metadata_with_video.media[1].properties.get("id") == "screenshot"
|
||||
|
||||
|
||||
def test_handle_timeout_exception(
|
||||
screenshot_enricher, metadata_with_video, mock_selenium_env, mocker
|
||||
):
|
||||
def test_skip_authwall_no_cookies(screenshot_enricher, caplog):
    """Enriching an instagram.com URL logs the screenshot skip message."""
    with caplog.at_level("WARNING"):
        target = Metadata().set_url("https://instagram.com")
        screenshot_enricher.enrich(target)
        assert "[SKIP] SCREENSHOT since url" in caplog.text
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "auth",
    [
        {"cookie": "cookie"},
        {"cookies_jar": "cookie"},
    ],
)
def test_dont_skip_authwall_with_cookies(screenshot_enricher, caplog, mocker, mock_selenium_env, auth):
    """With the parametrized auth entry set for the host, the skip message is not logged."""
    # Force every URL to be treated as an auth wall
    mocker.patch("auto_archiver.utils.url.is_auth_wall", return_value=True)

    # Override the enricher's authentication mapping for the target host
    screenshot_enricher.authentication = {"example.com": auth}

    with caplog.at_level("WARNING"):
        target = Metadata().set_url("https://example.com")
        screenshot_enricher.enrich(target)
        assert "[SKIP] SCREENSHOT since url" not in caplog.text
|
||||
|
||||
|
||||
def test_show_warning_wrong_auth_type(screenshot_enricher, caplog, mocker, mock_selenium_env):
    """A username/password auth entry triggers the cookie-only warning.

    The ``mock_selenium_env`` fixture is requested only for its patching side
    effects; none of the objects it provides are inspected here.
    """
    # Force the URL to be treated as an auth wall
    mocker.patch("auto_archiver.utils.url.is_auth_wall", return_value=True)
    # Configure non-cookie credentials for the target host
    screenshot_enricher.authentication = {"example.com": {"username": "user", "password": "pass"}}
    with caplog.at_level("WARNING"):
        screenshot_enricher.enrich(Metadata().set_url("https://example.com"))
        assert "Screenshot enricher only supports cookie-type authentication" in caplog.text
|
||||
|
||||
|
||||
def test_handle_timeout_exception(screenshot_enricher, metadata_with_video, mock_selenium_env, mocker):
|
||||
mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env
|
||||
|
||||
mock_driver.get.side_effect = TimeoutException
|
||||
@@ -140,9 +166,7 @@ def test_handle_timeout_exception(
|
||||
assert len(metadata_with_video.media) == 1
|
||||
|
||||
|
||||
def test_handle_general_exception(
|
||||
screenshot_enricher, metadata_with_video, mock_selenium_env, mocker
|
||||
):
|
||||
def test_handle_general_exception(screenshot_enricher, metadata_with_video, mock_selenium_env, mocker):
|
||||
"""Test proper handling of unexpected general exceptions"""
|
||||
mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env
|
||||
# Simulate a generic exception when save_screenshot is called
|
||||
@@ -152,9 +176,7 @@ def test_handle_general_exception(
|
||||
mock_log = mocker.patch("loguru.logger.error")
|
||||
screenshot_enricher.enrich(metadata_with_video)
|
||||
# Verify that the exception was logged with the log
|
||||
mock_log.assert_called_once_with(
|
||||
"Got error while loading webdriver for screenshot enricher: Unexpected Error"
|
||||
)
|
||||
mock_log.assert_called_once_with("Got error while loading webdriver for screenshot enricher: Unexpected Error")
|
||||
# And no new media was added due to the error
|
||||
assert len(metadata_with_video.media) == 1
|
||||
|
||||
@@ -167,13 +189,12 @@ def test_pdf_creation(mocker, screenshot_enricher, metadata_with_video, mock_sel
|
||||
# Mock the print_page method to return base64-encoded content
|
||||
mock_driver.print_page.return_value = base64.b64encode(b"fake_pdf_content").decode("utf-8")
|
||||
# Patch functions with mocker
|
||||
mock_os_path_join = mocker.patch("os.path.join", side_effect=lambda *args: f"{args[-1]}")
|
||||
mock_random_str = mocker.patch(
|
||||
mocker.patch("os.path.join", side_effect=lambda *args: f"{args[-1]}")
|
||||
mocker.patch(
|
||||
"auto_archiver.modules.screenshot_enricher.screenshot_enricher.random_str",
|
||||
return_value="fixed123",
|
||||
)
|
||||
mock_open = mocker.patch("builtins.open", new_callable=mocker.mock_open)
|
||||
mock_log_error = mocker.patch("loguru.logger.error")
|
||||
|
||||
screenshot_enricher.enrich(metadata_with_video)
|
||||
# Verify screenshot and PDF creation
|
||||
|
||||
@@ -51,4 +51,3 @@ def test_ssl_error_handling(enricher, metadata, mocker):
|
||||
mocker.patch("ssl.get_server_certificate", side_effect=ssl.SSLError("SSL error"))
|
||||
with pytest.raises(ssl.SSLError, match="SSL error"):
|
||||
enricher.enrich(metadata)
|
||||
|
||||
|
||||
@@ -25,7 +25,7 @@ def mock_ffmpeg_environment(mocker):
|
||||
# Mocking all the ffmpeg calls in one place
|
||||
mock_ffmpeg_input = mocker.patch("ffmpeg.input")
|
||||
mock_makedirs = mocker.patch("os.makedirs")
|
||||
mocker.patch.object(Media, "is_video", return_value=True),
|
||||
(mocker.patch.object(Media, "is_video", return_value=True),)
|
||||
mock_probe = mocker.patch(
|
||||
"ffmpeg.probe",
|
||||
return_value={
|
||||
@@ -35,9 +35,7 @@ def mock_ffmpeg_environment(mocker):
|
||||
},
|
||||
)
|
||||
mock_output = mocker.MagicMock()
|
||||
mock_ffmpeg_input.return_value.filter.return_value.output.return_value = (
|
||||
mock_output
|
||||
)
|
||||
mock_ffmpeg_input.return_value.filter.return_value.output.return_value = mock_output
|
||||
|
||||
return {
|
||||
"mock_ffmpeg_input": mock_ffmpeg_input,
|
||||
@@ -47,14 +45,21 @@ def mock_ffmpeg_environment(mocker):
|
||||
}
|
||||
|
||||
|
||||
@pytest.mark.parametrize("thumbnails_per_minute, max_thumbnails, expected_count", [
|
||||
(10, 5, 5), # Capped at max_thumbnails
|
||||
(1, 10, 2), # Less than max_thumbnails
|
||||
(60, 7, 7), # Matches exactly
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"thumbnails_per_minute, max_thumbnails, expected_count",
|
||||
[
|
||||
(10, 5, 5), # Capped at max_thumbnails
|
||||
(1, 10, 2), # Less than max_thumbnails
|
||||
(60, 7, 7), # Matches exactly
|
||||
],
|
||||
)
|
||||
def test_enrich_thumbnail_limits(
|
||||
thumbnail_enricher, metadata_with_video, mock_ffmpeg_environment,
|
||||
thumbnails_per_minute, max_thumbnails, expected_count
|
||||
thumbnail_enricher,
|
||||
metadata_with_video,
|
||||
mock_ffmpeg_environment,
|
||||
thumbnails_per_minute,
|
||||
max_thumbnails,
|
||||
expected_count,
|
||||
):
|
||||
thumbnail_enricher.thumbnails_per_minute = thumbnails_per_minute
|
||||
thumbnail_enricher.max_thumbnails = max_thumbnails
|
||||
@@ -65,8 +70,8 @@ def test_enrich_thumbnail_limits(
|
||||
thumbnails = metadata_with_video.media[0].get("thumbnails")
|
||||
assert len(thumbnails) == expected_count
|
||||
|
||||
def test_enrich_handles_probe_failure(thumbnail_enricher, metadata_with_video, mocker):
|
||||
|
||||
def test_enrich_handles_probe_failure(thumbnail_enricher, metadata_with_video, mocker):
|
||||
mocker.patch("ffmpeg.probe", side_effect=Exception("Probe error"))
|
||||
mocker.patch("os.makedirs")
|
||||
mock_logger = mocker.patch("loguru.logger.error")
|
||||
@@ -74,36 +79,43 @@ def test_enrich_handles_probe_failure(thumbnail_enricher, metadata_with_video, m
|
||||
|
||||
thumbnail_enricher.enrich(metadata_with_video)
|
||||
# Ensure error was logged
|
||||
mock_logger.assert_called_with(
|
||||
f"error getting duration of video video.mp4: Probe error"
|
||||
)
|
||||
mock_logger.assert_called_with("error getting duration of video video.mp4: Probe error")
|
||||
# Ensure no thumbnails were created
|
||||
thumbnails = metadata_with_video.media[0].get("thumbnails")
|
||||
assert thumbnails is None
|
||||
|
||||
|
||||
def test_enrich_skips_non_video_files(thumbnail_enricher, metadata_with_video, mocker):
|
||||
mocker.patch.object(Media, "is_video", return_value=False)
|
||||
mock_ffmpeg = mocker.patch("ffmpeg.input")
|
||||
thumbnail_enricher.enrich(metadata_with_video)
|
||||
mock_ffmpeg.assert_not_called()
|
||||
mocker.patch.object(Media, "is_video", return_value=False)
|
||||
mock_ffmpeg = mocker.patch("ffmpeg.input")
|
||||
thumbnail_enricher.enrich(metadata_with_video)
|
||||
mock_ffmpeg.assert_not_called()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("thumbnails_per_minute,max_thumbnails,expected_count", [
|
||||
(60, 5, 5), # caught by max
|
||||
(60, 20, 10), # caught by t/min
|
||||
(0, 20, 1), # test min caught (1)
|
||||
(11, 20, 1), # test min caught (1)
|
||||
(12, 20, 2), # test caught by t/min
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"thumbnails_per_minute,max_thumbnails,expected_count",
|
||||
[
|
||||
(60, 5, 5), # caught by max
|
||||
(60, 20, 10), # caught by t/min
|
||||
(0, 20, 1), # test min caught (1)
|
||||
(11, 20, 1), # test min caught (1)
|
||||
(12, 20, 2), # test caught by t/min
|
||||
],
|
||||
)
|
||||
def test_enrich_handles_short_video(
|
||||
thumbnail_enricher, metadata_with_video, mock_ffmpeg_environment, thumbnails_per_minute, max_thumbnails, expected_count, mocker
|
||||
thumbnail_enricher,
|
||||
metadata_with_video,
|
||||
mock_ffmpeg_environment,
|
||||
thumbnails_per_minute,
|
||||
max_thumbnails,
|
||||
expected_count,
|
||||
mocker,
|
||||
):
|
||||
# override mock duration
|
||||
fake_duration = 10
|
||||
mocker.patch(
|
||||
"ffmpeg.probe",
|
||||
return_value={ "streams": [{"codec_type": "video", "duration": str(fake_duration)}]},
|
||||
return_value={"streams": [{"codec_type": "video", "duration": str(fake_duration)}]},
|
||||
)
|
||||
thumbnail_enricher.thumbnails_per_minute = thumbnails_per_minute
|
||||
thumbnail_enricher.max_thumbnails = max_thumbnails
|
||||
@@ -114,9 +126,7 @@ def test_enrich_handles_short_video(
|
||||
assert len(thumbnails) == expected_count
|
||||
|
||||
|
||||
def test_uses_existing_duration(
|
||||
thumbnail_enricher, metadata_with_video, mock_ffmpeg_environment
|
||||
):
|
||||
def test_uses_existing_duration(thumbnail_enricher, metadata_with_video, mock_ffmpeg_environment):
|
||||
metadata_with_video.media[0].set("duration", 60)
|
||||
thumbnail_enricher.enrich(metadata_with_video)
|
||||
mock_ffmpeg_environment["mock_probe"].assert_not_called()
|
||||
@@ -125,7 +135,7 @@ def test_uses_existing_duration(
|
||||
|
||||
def test_enrich_metadata_structure(thumbnail_enricher, metadata_with_video, mock_ffmpeg_environment, mocker):
|
||||
fake_duration = 120
|
||||
mocker.patch("ffmpeg.probe", return_value={'streams': [{'codec_type': 'video', 'duration': str(fake_duration)}]})
|
||||
mocker.patch("ffmpeg.probe", return_value={"streams": [{"codec_type": "video", "duration": str(fake_duration)}]})
|
||||
thumbnail_enricher.thumbnails_per_minute = 2
|
||||
thumbnail_enricher.max_thumbnails = 4
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@ from zipfile import ZipFile
|
||||
import pytest
|
||||
|
||||
from auto_archiver.core import Metadata, Media
|
||||
from auto_archiver.core.consts import SetupError
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@@ -22,6 +23,15 @@ def wacz_enricher(setup_module, mock_binary_dependencies):
|
||||
return wacz
|
||||
|
||||
|
||||
def test_raises_error_without_docker_installed(setup_module, mocker, caplog):
    """Setting up the WACZ module raises SetupError when shutil.which finds nothing."""
    # Make every shutil.which lookup report "not found", i.e. docker is missing
    mocker.patch("shutil.which", return_value=None)

    with pytest.raises(SetupError):
        setup_module("wacz_extractor_enricher", {})

    # The missing dependency must also be named in the log output
    expected_message = "requires external dependency 'docker' which is not available/setup"
    assert expected_message in caplog.text
|
||||
|
||||
|
||||
def test_setup_without_docker(wacz_enricher, mocker):
|
||||
mocker.patch.dict(os.environ, {"RUNNING_IN_DOCKER": "1"}, clear=True)
|
||||
wacz_enricher.setup()
|
||||
|
||||
@@ -5,37 +5,52 @@ from auto_archiver.modules.wayback_extractor_enricher import WaybackExtractorEnr
|
||||
from auto_archiver.core import Metadata
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def mock_sleep(mocker):
    """Autouse fixture: patch time.sleep with a mock so tests do not wait."""
    sleep_patch = mocker.patch("time.sleep")
    return sleep_patch
|
||||
|
||||
|
||||
@pytest.fixture
def mock_is_auth_wall(mocker):
    """Provide a factory that patches is_auth_wall to return a chosen value."""

    def _patch_auth_wall(return_value: bool):
        # Hand the created mock back to the caller
        patched = mocker.patch("auto_archiver.utils.url.is_auth_wall", return_value=return_value)
        return patched

    return _patch_auth_wall
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_post_success(mocker):
|
||||
"""Fixture to mock POST requests with a successful response."""
|
||||
|
||||
def _mock_post(json_data: dict = None, status_code: int = 200):
|
||||
json_data = json_data or {"job_id": "job123"}
|
||||
json_data = {"job_id": "job123"} if json_data is None else json_data
|
||||
resp = mocker.Mock(status_code=status_code)
|
||||
resp.json.return_value = json_data
|
||||
return mocker.patch("requests.post", return_value=resp)
|
||||
|
||||
return _mock_post
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_get_success(mocker):
|
||||
"""Fixture to mock GET requests returning a completed archive status."""
|
||||
|
||||
def _mock_get(json_data: dict = None, status_code: int = 200):
|
||||
json_data = json_data or {
|
||||
"status": "success",
|
||||
"timestamp": "20250101010101",
|
||||
"original_url": "https://example.com"
|
||||
"original_url": "https://example.com",
|
||||
}
|
||||
resp = mocker.Mock(status_code=status_code)
|
||||
resp.json.return_value = json_data
|
||||
return mocker.patch("requests.get", return_value=resp)
|
||||
|
||||
return _mock_get
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def wayback_extractor_enricher(setup_module) -> WaybackExtractorEnricher:
|
||||
configs: dict = {
|
||||
@@ -49,12 +64,7 @@ def wayback_extractor_enricher(setup_module) -> WaybackExtractorEnricher:
|
||||
return setup_module("wayback_extractor_enricher", configs)
|
||||
|
||||
|
||||
def test_download_success(
|
||||
wayback_extractor_enricher,
|
||||
mock_is_auth_wall,
|
||||
mock_post_success,
|
||||
mock_get_success
|
||||
):
|
||||
def test_download_success(wayback_extractor_enricher, mock_is_auth_wall, mock_post_success, mock_get_success):
|
||||
mock_is_auth_wall(False)
|
||||
mock_post_success()
|
||||
mock_get_success()
|
||||
@@ -63,34 +73,28 @@ def test_download_success(
|
||||
result = wayback_extractor_enricher.download(metadata)
|
||||
assert result.get("wayback") == "https://web.archive.org/web/20250101010101/https://example.com"
|
||||
|
||||
|
||||
def test_enrich_auth_wall(wayback_extractor_enricher, metadata, mock_is_auth_wall):
    """enrich returns None when the URL is reported as an auth wall."""
    mock_is_auth_wall(True)
    assert wayback_extractor_enricher.enrich(metadata) is None
|
||||
|
||||
|
||||
def test_enrich_already_enriched(wayback_extractor_enricher, metadata):
    """enrich returns True when the metadata already has a "wayback" value."""
    metadata.set("wayback", "existing")
    assert wayback_extractor_enricher.enrich(metadata) is True
|
||||
|
||||
def test_enrich_post_failure(
|
||||
wayback_extractor_enricher,
|
||||
metadata,
|
||||
mock_is_auth_wall,
|
||||
mock_post_success
|
||||
):
|
||||
|
||||
def test_enrich_post_failure(wayback_extractor_enricher, metadata, mock_is_auth_wall, mock_post_success):
|
||||
mock_is_auth_wall(False)
|
||||
mock_post_success(json_data={"error": "server error"}, status_code=500)
|
||||
result = wayback_extractor_enricher.enrich(metadata)
|
||||
assert result is False
|
||||
assert "Internet archive failed with status of 500" in metadata.get("wayback")
|
||||
|
||||
def test_enrich_post_json_decode_error(
|
||||
wayback_extractor_enricher,
|
||||
metadata,
|
||||
mock_is_auth_wall,
|
||||
mocker
|
||||
):
|
||||
|
||||
def test_enrich_post_json_decode_error(wayback_extractor_enricher, metadata, mock_is_auth_wall, mocker):
|
||||
mock_is_auth_wall(False)
|
||||
resp = mocker.Mock(status_code=200)
|
||||
resp.json.side_effect = json.decoder.JSONDecodeError("msg", "doc", 0)
|
||||
@@ -98,22 +102,15 @@ def test_enrich_post_json_decode_error(
|
||||
mocker.patch("requests.post", return_value=resp)
|
||||
assert wayback_extractor_enricher.enrich(metadata) is False
|
||||
|
||||
def test_enrich_no_job_id(
|
||||
wayback_extractor_enricher,
|
||||
metadata,
|
||||
mock_is_auth_wall,
|
||||
mock_post_success
|
||||
):
|
||||
|
||||
def test_enrich_no_job_id(wayback_extractor_enricher, metadata, mock_is_auth_wall, mock_post_success):
|
||||
mock_is_auth_wall(False)
|
||||
mock_post_success(json_data={})
|
||||
assert wayback_extractor_enricher.enrich(metadata) is False
|
||||
|
||||
|
||||
def test_enrich_get_success(
|
||||
wayback_extractor_enricher,
|
||||
metadata,
|
||||
mock_is_auth_wall,
|
||||
mock_post_success,
|
||||
mock_get_success
|
||||
wayback_extractor_enricher, metadata, mock_is_auth_wall, mock_post_success, mock_get_success
|
||||
):
|
||||
mock_is_auth_wall(False)
|
||||
mock_post_success()
|
||||
@@ -122,24 +119,18 @@ def test_enrich_get_success(
|
||||
assert metadata.get("wayback") == "https://web.archive.org/web/20250101010101/https://example.com"
|
||||
assert metadata.get("check wayback") == "https://web.archive.org/web/*/https://example.com"
|
||||
|
||||
|
||||
def test_enrich_get_failure(
|
||||
wayback_extractor_enricher,
|
||||
metadata,
|
||||
mock_is_auth_wall,
|
||||
mock_post_success,
|
||||
mock_get_success
|
||||
wayback_extractor_enricher, metadata, mock_is_auth_wall, mock_post_success, mock_get_success
|
||||
):
|
||||
mock_is_auth_wall(False)
|
||||
mock_post_success()
|
||||
mock_get_success(json_data={"status": "failed"}, status_code=400)
|
||||
assert wayback_extractor_enricher.enrich(metadata) is False
|
||||
|
||||
|
||||
def test_enrich_get_request_exception(
|
||||
wayback_extractor_enricher,
|
||||
metadata,
|
||||
mock_is_auth_wall,
|
||||
mock_post_success,
|
||||
mocker
|
||||
wayback_extractor_enricher, metadata, mock_is_auth_wall, mock_post_success, mocker
|
||||
):
|
||||
mock_is_auth_wall(False)
|
||||
mock_post_success()
|
||||
@@ -149,12 +140,9 @@ def test_enrich_get_request_exception(
|
||||
assert wayback_extractor_enricher.enrich(metadata) is True
|
||||
assert metadata.get("wayback").get("job_id") == "job123"
|
||||
|
||||
|
||||
def test_enrich_get_json_decode_error(
|
||||
wayback_extractor_enricher,
|
||||
metadata,
|
||||
mock_is_auth_wall,
|
||||
mock_post_success,
|
||||
mocker
|
||||
wayback_extractor_enricher, metadata, mock_is_auth_wall, mock_post_success, mocker
|
||||
):
|
||||
mock_is_auth_wall(False)
|
||||
mock_post_success()
|
||||
|
||||
@@ -7,6 +7,12 @@ from auto_archiver.modules.whisper_enricher import WhisperEnricher
|
||||
TEST_S3_URL = "http://cdn.example.com/test.mp4"
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def mock_sleep(mocker):
    """Autouse fixture: patch time.sleep with a mock so tests do not wait."""
    sleep_patch = mocker.patch("time.sleep")
    return sleep_patch
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def enricher(mocker):
|
||||
"""Fixture with mocked S3 and API dependencies"""
|
||||
@@ -16,7 +22,7 @@ def enricher(mocker):
|
||||
"include_srt": False,
|
||||
"timeout": 5,
|
||||
"action": "translate",
|
||||
"steps": {"storages": ["s3_storage"]}
|
||||
"steps": {"storages": ["s3_storage"]},
|
||||
}
|
||||
mock_s3 = mocker.MagicMock(spec=S3Storage)
|
||||
mock_s3.get_cdn_url.return_value = TEST_S3_URL
|
||||
@@ -25,7 +31,7 @@ def enricher(mocker):
|
||||
instance.display_name = "Whisper Enricher"
|
||||
instance.config_setup({instance.name: config})
|
||||
# bypassing the setup method and mocking S3 setup
|
||||
instance.stores = config['steps']['storages']
|
||||
instance.stores = config["steps"]["storages"]
|
||||
instance.s3 = mock_s3
|
||||
yield instance, mock_s3
|
||||
|
||||
@@ -63,19 +69,14 @@ def test_successful_job_submission(enricher, metadata, mock_requests, mocker):
|
||||
# Mock the complete API interaction chain
|
||||
mock_status_response = mocker.MagicMock()
|
||||
mock_status_response.status_code = 200
|
||||
mock_status_response.json.return_value = {
|
||||
"status": "success",
|
||||
"meta": {}
|
||||
}
|
||||
mock_status_response.json.return_value = {"status": "success", "meta": {}}
|
||||
mock_artifacts_response = mocker.MagicMock()
|
||||
mock_artifacts_response.status_code = 200
|
||||
mock_artifacts_response.json.return_value = [{
|
||||
"data": [{"start": 0, "end": 5, "text": "test transcript"}]
|
||||
}]
|
||||
mock_artifacts_response.json.return_value = [{"data": [{"start": 0, "end": 5, "text": "test transcript"}]}]
|
||||
# Set up mock response sequence
|
||||
mock_requests.get.side_effect = [
|
||||
mock_status_response, # First call: status check
|
||||
mock_artifacts_response # Second call: artifacts check
|
||||
mock_artifacts_response, # Second call: artifacts check
|
||||
]
|
||||
|
||||
# Run enrichment (without opening file)
|
||||
@@ -84,15 +85,17 @@ def test_successful_job_submission(enricher, metadata, mock_requests, mocker):
|
||||
mock_requests.post.assert_called_once_with(
|
||||
"http://testapi/jobs",
|
||||
json={"url": "http://cdn.example.com/test.mp4", "type": "translate"},
|
||||
headers={"Authorization": "Bearer whisper-key"}
|
||||
headers={"Authorization": "Bearer whisper-key"},
|
||||
)
|
||||
# Verify job status checks
|
||||
assert mock_requests.get.call_count == 2
|
||||
assert "artifact_0_text" in metadata.media[0].get("whisper_model")
|
||||
assert metadata.media[0].get("whisper_model") == {'artifact_0_text': 'test transcript',
|
||||
'job_artifacts_check': 'http://testapi/jobs/job123/artifacts',
|
||||
'job_id': 'job123',
|
||||
'job_status_check': 'http://testapi/jobs/job123'}
|
||||
assert metadata.media[0].get("whisper_model") == {
|
||||
"artifact_0_text": "test transcript",
|
||||
"job_artifacts_check": "http://testapi/jobs/job123/artifacts",
|
||||
"job_id": "job123",
|
||||
"job_status_check": "http://testapi/jobs/job123",
|
||||
}
|
||||
|
||||
|
||||
def test_submit_job(enricher, mocker):
|
||||
|
||||
@@ -7,7 +7,6 @@ from auto_archiver.core.extractor import Extractor
|
||||
|
||||
|
||||
class TestExtractorBase(object):
|
||||
|
||||
extractor_module: str = None
|
||||
config: dict = None
|
||||
|
||||
@@ -17,7 +16,7 @@ class TestExtractorBase(object):
|
||||
assert self.config is not None, "self.config must be a dict set on the subclass"
|
||||
|
||||
self.extractor: Type[Extractor] = setup_module(self.extractor_module, self.config)
|
||||
|
||||
|
||||
def assertValidResponseMetadata(self, test_response: Metadata, title: str, timestamp: str, status: str = ""):
|
||||
assert test_response is not False
|
||||
|
||||
|
||||
@@ -9,26 +9,28 @@ import pytest
|
||||
from auto_archiver.modules.generic_extractor.generic_extractor import GenericExtractor
|
||||
from .test_extractor_base import TestExtractorBase
|
||||
|
||||
CI=os.getenv("GITHUB_ACTIONS", '') == 'true'
|
||||
CI = os.getenv("GITHUB_ACTIONS", "") == "true"
|
||||
|
||||
|
||||
class TestGenericExtractor(TestExtractorBase):
|
||||
"""Tests Generic Extractor
|
||||
"""
|
||||
extractor_module = 'generic_extractor'
|
||||
"""Tests Generic Extractor"""
|
||||
|
||||
extractor_module = "generic_extractor"
|
||||
extractor: GenericExtractor
|
||||
|
||||
config = {
|
||||
'subtitles': False,
|
||||
'comments': False,
|
||||
'livestreams': False,
|
||||
'live_from_start': False,
|
||||
'end_means_success': True,
|
||||
'allow_playlist': False,
|
||||
'max_downloads': "inf",
|
||||
'proxy': None,
|
||||
'cookies_from_browser': False,
|
||||
'cookie_file': None,
|
||||
}
|
||||
|
||||
"subtitles": False,
|
||||
"comments": False,
|
||||
"livestreams": False,
|
||||
"live_from_start": False,
|
||||
"end_means_success": True,
|
||||
"allow_playlist": False,
|
||||
"max_downloads": "inf",
|
||||
"proxy": None,
|
||||
"cookies_from_browser": False,
|
||||
"cookie_file": None,
|
||||
}
|
||||
|
||||
def test_load_dropin(self):
|
||||
# test loading dropins that are in the generic_archiver package
|
||||
package = "auto_archiver.modules.generic_extractor"
|
||||
@@ -38,21 +40,42 @@ class TestGenericExtractor(TestExtractorBase):
|
||||
path = os.path.join(dirname(dirname(__file__)), "data/")
|
||||
assert self.extractor.dropin_for_name("dropin", additional_paths=[path])
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"url, suitable_extractors",
|
||||
[
|
||||
("https://www.youtube.com/watch?v=5qap5aO4i9A", ["youtube"]),
|
||||
("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", ["tiktok"]),
|
||||
("https://www.instagram.com/p/CU1J9JYJ9Zz/", ["instagram"]),
|
||||
("https://www.facebook.com/nytimes/videos/10160796550110716", ["facebook"]),
|
||||
("https://www.facebook.com/BylineFest/photos/t.100057299682816/927879487315946/", ["facebook"]),
|
||||
],
|
||||
)
|
||||
def test_suitable_extractors(self, url, suitable_extractors):
|
||||
suitable_extractors = suitable_extractors + ["generic"] # the generic is valid for all
|
||||
extractors = list(self.extractor.suitable_extractors(url))
|
||||
assert len(extractors) == len(suitable_extractors)
|
||||
assert [e.ie_key().lower() for e in extractors] == suitable_extractors
|
||||
|
||||
|
||||
@pytest.mark.parametrize("url, is_suitable", [
|
||||
("https://www.youtube.com/watch?v=5qap5aO4i9A", True),
|
||||
("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", True),
|
||||
("https://www.instagram.com/p/CU1J9JYJ9Zz/", True),
|
||||
("https://www.facebook.com/nytimes/videos/10160796550110716", True),
|
||||
("https://www.twitch.tv/videos/1167226570", True),
|
||||
("https://bellingcat.com/news/2021/10/08/ukrainian-soldiers-are-being-killed-by-landmines-in-the-donbas/", True),
|
||||
("https://google.com", True)])
|
||||
def test_suitable_urls(self, make_item, url, is_suitable):
|
||||
@pytest.mark.parametrize(
|
||||
"url, is_suitable",
|
||||
[
|
||||
("https://www.youtube.com/watch?v=5qap5aO4i9A", True),
|
||||
("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", True),
|
||||
("https://www.instagram.com/p/CU1J9JYJ9Zz/", True),
|
||||
("https://www.facebook.com/nytimes/videos/10160796550110716", True),
|
||||
("https://www.twitch.tv/videos/1167226570", True),
|
||||
(
|
||||
"https://bellingcat.com/news/2021/10/08/ukrainian-soldiers-are-being-killed-by-landmines-in-the-donbas/",
|
||||
True,
|
||||
),
|
||||
("https://google.com", True),
|
||||
],
|
||||
)
|
||||
def test_suitable_urls(self, url, is_suitable):
|
||||
"""
|
||||
Note: expected behaviour is to return True for all URLs, as YoutubeDLArchiver should be able to handle all URLs
|
||||
This behaviour may be changed in the future (e.g. if we want the youtubedl archiver to just handle URLs it has extractors for,
|
||||
and then if and only if all archivers fails, does it fall back to the generic archiver)
|
||||
Note: expected behaviour is to return True for all URLs, as YoutubeDLArchiver should be able to handle all URLs
|
||||
This behaviour may be changed in the future (e.g. if we want the youtubedl archiver to just handle URLs it has extractors for,
|
||||
and then if and only if all archivers fails, does it fall back to the generic archiver)
|
||||
"""
|
||||
assert self.extractor.suitable(url) == is_suitable
|
||||
|
||||
@@ -63,11 +86,14 @@ class TestGenericExtractor(TestExtractorBase):
|
||||
assert result.get_url() == "https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970"
|
||||
|
||||
@pytest.mark.download
|
||||
@pytest.mark.parametrize("url", [
|
||||
"https://bsky.app/profile/colborne.bsky.social/post/3lcxcpgt6j42l",
|
||||
"twitter.com/bellingcat/status/123",
|
||||
"https://www.youtube.com/watch?v=1"
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"url",
|
||||
[
|
||||
"https://bsky.app/profile/colborne.bsky.social/post/3lcxcpgt6j42l",
|
||||
"twitter.com/bellingcat/status/123",
|
||||
"https://www.youtube.com/watch?v=1",
|
||||
],
|
||||
)
|
||||
def test_download_nonexistent_media(self, make_item, url):
|
||||
"""
|
||||
Test to make sure that the extractor doesn't break on non-existend posts/media
|
||||
@@ -78,7 +104,10 @@ class TestGenericExtractor(TestExtractorBase):
|
||||
result = self.extractor.download(item)
|
||||
assert not result
|
||||
|
||||
@pytest.mark.skipif(CI, reason="Currently no way to authenticate when on CI. Youtube (yt-dlp) doesn't support logging in with username/password.")
|
||||
@pytest.mark.skipif(
|
||||
CI,
|
||||
reason="Currently no way to authenticate when on CI. Youtube (yt-dlp) doesn't support logging in with username/password.",
|
||||
)
|
||||
@pytest.mark.download
|
||||
def test_youtube_download(self, make_item):
|
||||
# url https://www.youtube.com/watch?v=5qap5aO4i9A
|
||||
@@ -87,7 +116,10 @@ class TestGenericExtractor(TestExtractorBase):
|
||||
result = self.extractor.download(item)
|
||||
assert result.get_url() == "https://www.youtube.com/watch?v=J---aiyznGQ"
|
||||
assert result.get_title() == "Keyboard Cat! - THE ORIGINAL!"
|
||||
assert result.get('description') == "Buy NEW Keyboard Cat Merch! https://keyboardcat.creator-spring.com\n\nxo Keyboard Cat memes make your day better!\nhttp://www.keyboardcatstore.com/\nhttps://www.facebook.com/thekeyboardcat\nhttp://www.charlieschmidt.com/"
|
||||
assert (
|
||||
result.get("description")
|
||||
== "Buy NEW Keyboard Cat Merch! https://keyboardcat.creator-spring.com\n\nxo Keyboard Cat memes make your day better!\nhttp://www.keyboardcatstore.com/\nhttps://www.facebook.com/thekeyboardcat\nhttp://www.charlieschmidt.com/"
|
||||
)
|
||||
assert len(result.media) == 2
|
||||
assert Path(result.media[0].filename).name == "J---aiyznGQ.webm"
|
||||
assert Path(result.media[1].filename).name == "hqdefault.jpg"
|
||||
@@ -103,7 +135,7 @@ class TestGenericExtractor(TestExtractorBase):
|
||||
item = make_item("https://bsky.app/profile/bellingcat.com/post/3lfn3hbcxgc2q")
|
||||
result = self.extractor.download(item)
|
||||
assert result is not False
|
||||
|
||||
|
||||
@pytest.mark.download
|
||||
def test_bluesky_download_no_media(self, make_item):
|
||||
item = make_item("https://bsky.app/profile/bellingcat.com/post/3lfphwmcs4c2z")
|
||||
@@ -115,7 +147,7 @@ class TestGenericExtractor(TestExtractorBase):
|
||||
item = make_item("https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i")
|
||||
result = self.extractor.download(item)
|
||||
assert result is not False
|
||||
|
||||
|
||||
@pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
|
||||
@pytest.mark.download
|
||||
def test_truthsocial_download_video(self, make_item):
|
||||
@@ -130,14 +162,14 @@ class TestGenericExtractor(TestExtractorBase):
|
||||
item = make_item("https://truthsocial.com/@bbcnewa/posts/109598702184774628")
|
||||
result = self.extractor.download(item)
|
||||
assert result is not False
|
||||
|
||||
|
||||
@pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
|
||||
@pytest.mark.download
|
||||
def test_truthsocial_download_poll(self, make_item):
|
||||
item = make_item("https://truthsocial.com/@CNN_US/posts/113724326568555098")
|
||||
result = self.extractor.download(item)
|
||||
assert result is not False
|
||||
|
||||
|
||||
@pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
|
||||
@pytest.mark.download
|
||||
def test_truthsocial_download_single_image(self, make_item):
|
||||
@@ -159,7 +191,7 @@ class TestGenericExtractor(TestExtractorBase):
|
||||
url = "https://x.com/Bellingcat/status/17197025860711058"
|
||||
response = self.extractor.download(make_item(url))
|
||||
assert not response
|
||||
|
||||
|
||||
@pytest.mark.download
|
||||
def test_twitter_download_malformed_tweetid(self, make_item):
|
||||
# this tweet does not exist
|
||||
@@ -169,17 +201,17 @@ class TestGenericExtractor(TestExtractorBase):
|
||||
|
||||
@pytest.mark.download
|
||||
def test_twitter_download_tweet_no_media(self, make_item):
|
||||
|
||||
item = make_item("https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w")
|
||||
post = self.extractor.download(item)
|
||||
|
||||
self.assertValidResponseMetadata(
|
||||
post,
|
||||
"Onion rings are just vegetable donuts.",
|
||||
"Cookie Monster - Onion rings are just vegetable donuts.",
|
||||
datetime.datetime(2023, 1, 24, 16, 25, 51, tzinfo=datetime.timezone.utc),
|
||||
"yt-dlp_Twitter: success"
|
||||
"yt-dlp_Twitter: success",
|
||||
)
|
||||
|
||||
assert post.get("content") == "Onion rings are just vegetable donuts."
|
||||
|
||||
@pytest.mark.download
|
||||
def test_twitter_download_video(self, make_item):
|
||||
url = "https://x.com/bellingcat/status/1871552600346415571"
|
||||
@@ -187,26 +219,75 @@ class TestGenericExtractor(TestExtractorBase):
|
||||
self.assertValidResponseMetadata(
|
||||
post,
|
||||
"Bellingcat - This month's Bellingchat Premium is with @KolinaKoltai. She reveals how she investigated a platform allowing users to create AI-generated child sexual abuse material and explains why it's crucial to investigate the people behind these services",
|
||||
datetime.datetime(2024, 12, 24, 13, 44, 46, tzinfo=datetime.timezone.utc)
|
||||
datetime.datetime(2024, 12, 24, 13, 44, 46, tzinfo=datetime.timezone.utc),
|
||||
)
|
||||
|
||||
@pytest.mark.xfail(reason="Currently failing, sensitive content requires logged in users/cookies - not yet implemented")
|
||||
@pytest.mark.xfail(
|
||||
reason="Currently failing, sensitive content requires logged in users/cookies - not yet implemented"
|
||||
)
|
||||
@pytest.mark.download
|
||||
@pytest.mark.parametrize("url, title, timestamp, image_hash", [
|
||||
("https://x.com/SozinhoRamalho/status/1876710769913450647", "ignore tweet, testing sensitivity warning nudity", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
|
||||
("https://x.com/SozinhoRamalho/status/1876710875475681357", "ignore tweet, testing sensitivity warning violence", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
|
||||
("https://x.com/SozinhoRamalho/status/1876711053813227618", "ignore tweet, testing sensitivity warning sensitive", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
|
||||
("https://x.com/SozinhoRamalho/status/1876711141314801937", "ignore tweet, testing sensitivity warning nudity, violence, sensitivity", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"url, title, timestamp, image_hash",
|
||||
[
|
||||
(
|
||||
"https://x.com/SozinhoRamalho/status/1876710769913450647",
|
||||
"ignore tweet, testing sensitivity warning nudity",
|
||||
datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
|
||||
"image_hash",
|
||||
),
|
||||
(
|
||||
"https://x.com/SozinhoRamalho/status/1876710875475681357",
|
||||
"ignore tweet, testing sensitivity warning violence",
|
||||
datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
|
||||
"image_hash",
|
||||
),
|
||||
(
|
||||
"https://x.com/SozinhoRamalho/status/1876711053813227618",
|
||||
"ignore tweet, testing sensitivity warning sensitive",
|
||||
datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
|
||||
"image_hash",
|
||||
),
|
||||
(
|
||||
"https://x.com/SozinhoRamalho/status/1876711141314801937",
|
||||
"ignore tweet, testing sensitivity warning nudity, violence, sensitivity",
|
||||
datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
|
||||
"image_hash",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_twitter_download_sensitive_media(self, url, title, timestamp, image_hash, make_item):
|
||||
|
||||
"""Download tweets with sensitive media"""
|
||||
|
||||
post = self.extractor.download(make_item(url))
|
||||
self.assertValidResponseMetadata(
|
||||
post,
|
||||
title,
|
||||
timestamp
|
||||
)
|
||||
self.assertValidResponseMetadata(post, title, timestamp)
|
||||
assert len(post.media) == 1
|
||||
assert post.media[0].hash == image_hash
|
||||
assert post.media[0].hash == image_hash
|
||||
|
||||
@pytest.mark.download
|
||||
def test_download_facebook_video(self, make_item):
|
||||
post = self.extractor.download(make_item("https://www.facebook.com/bellingcat/videos/588371253839133"))
|
||||
assert len(post.media) == 2
|
||||
assert post.media[0].filename.endswith("588371253839133.mp4")
|
||||
assert post.media[0].mimetype == "video/mp4"
|
||||
|
||||
assert post.media[1].filename.endswith(".jpg")
|
||||
assert post.media[1].mimetype == "image/jpeg"
|
||||
|
||||
assert "Bellingchat Premium is with Kolina Koltai" in post.get_title()
|
||||
|
||||
@pytest.mark.download
|
||||
def test_download_facebook_image(self, make_item):
|
||||
post = self.extractor.download(
|
||||
make_item("https://www.facebook.com/BylineFest/photos/t.100057299682816/927879487315946/")
|
||||
)
|
||||
|
||||
assert len(post.media) == 1
|
||||
assert post.media[0].filename.endswith(".png")
|
||||
assert "Byline Festival - BylineFest Partner" == post.get_title()
|
||||
|
||||
@pytest.mark.download
|
||||
def test_download_facebook_text_only(self, make_item):
|
||||
url = "https://www.facebook.com/bellingcat/posts/pfbid02rzpwZxAZ8bLkAX8NvHv4DWAidFaqAUfJMbo9vWkpwxL7uMUWzWMiizXLWRSjwihVl"
|
||||
post = self.extractor.download(make_item(url))
|
||||
assert "Bellingcat researcher Kolina Koltai delves deeper into Clothoff" in post.get("content")
|
||||
assert post.get_title() == "Bellingcat"
|
||||
|
||||
@@ -15,10 +15,11 @@ def mock_user_response():
|
||||
"username": "test_user",
|
||||
"full_name": "Test User",
|
||||
"profile_pic_url_hd": "http://example.com/profile.jpg",
|
||||
"profile_pic_url": "http://example.com/profile_lowres.jpg"
|
||||
"profile_pic_url": "http://example.com/profile_lowres.jpg",
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_post_response():
|
||||
return {
|
||||
@@ -27,16 +28,14 @@ def mock_post_response():
|
||||
"caption_text": "Test Caption",
|
||||
"taken_at": datetime.now().timestamp(),
|
||||
"video_url": "http://example.com/video.mp4",
|
||||
"thumbnail_url": "http://example.com/thumbnail.jpg"
|
||||
"thumbnail_url": "http://example.com/thumbnail.jpg",
|
||||
}
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_story_response():
|
||||
return [{
|
||||
"id": "story_123",
|
||||
"taken_at": datetime.now().timestamp(),
|
||||
"video_url": "http://example.com/story.mp4"
|
||||
}]
|
||||
return [{"id": "story_123", "taken_at": datetime.now().timestamp(), "video_url": "http://example.com/story.mp4"}]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_highlight_response():
|
||||
@@ -46,11 +45,13 @@ def mock_highlight_response():
|
||||
"highlight:123": {
|
||||
"id": "123",
|
||||
"title": "Test Highlight",
|
||||
"items": [{
|
||||
"id": "item_123",
|
||||
"taken_at": datetime.now().timestamp(),
|
||||
"video_url": "http://example.com/highlight.mp4"
|
||||
}]
|
||||
"items": [
|
||||
{
|
||||
"id": "item_123",
|
||||
"taken_at": datetime.now().timestamp(),
|
||||
"video_url": "http://example.com/highlight.mp4",
|
||||
}
|
||||
],
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -81,24 +82,30 @@ class TestInstagramAPIExtractor(TestExtractorBase):
|
||||
m.set("netloc", "instagram.com")
|
||||
return m
|
||||
|
||||
@pytest.mark.parametrize("url,expected", [
|
||||
("https://instagram.com/user", [("", "user", "")]),
|
||||
("https://instagr.am/p/post_id", []),
|
||||
("https://youtube.com", []),
|
||||
("https://www.instagram.com/reel/reel_id", [("reel", "reel_id", "")]),
|
||||
("https://instagram.com/stories/highlights/123", [("stories/highlights", "123", "")]),
|
||||
("https://instagram.com/stories/user/123", [("stories", "user", "123")]),
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"url,expected",
|
||||
[
|
||||
("https://instagram.com/user", [("", "user", "")]),
|
||||
("https://instagr.am/p/post_id", []),
|
||||
("https://youtube.com", []),
|
||||
("https://www.instagram.com/reel/reel_id", [("reel", "reel_id", "")]),
|
||||
("https://instagram.com/stories/highlights/123", [("stories/highlights", "123", "")]),
|
||||
("https://instagram.com/stories/user/123", [("stories", "user", "123")]),
|
||||
],
|
||||
)
|
||||
def test_url_parsing(self, url, expected):
|
||||
assert self.extractor.valid_url.findall(url) == expected
|
||||
|
||||
def test_initialize(self):
|
||||
assert self.extractor.api_endpoint[-1] != "/"
|
||||
|
||||
@pytest.mark.parametrize("input_dict,expected", [
|
||||
({"x": 0, "valid": "data"}, {"valid": "data"}),
|
||||
({"nested": {"y": None, "valid": [{}]}}, {"nested": {"valid": [{}]}}),
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"input_dict,expected",
|
||||
[
|
||||
({"x": 0, "valid": "data"}, {"valid": "data"}),
|
||||
({"nested": {"y": None, "valid": [{}]}}, {"nested": {"valid": [{}]}}),
|
||||
],
|
||||
)
|
||||
def test_cleanup_dict(self, input_dict, expected):
|
||||
assert self.extractor.cleanup_dict(input_dict) == expected
|
||||
|
||||
@@ -114,8 +121,8 @@ class TestInstagramAPIExtractor(TestExtractorBase):
|
||||
|
||||
def test_download_profile_basic(self, metadata, mock_user_response, mocker):
|
||||
"""Test basic profile download without full_profile"""
|
||||
mock_call = mocker.patch.object(self.extractor, 'call_api')
|
||||
mock_download = mocker.patch.object(self.extractor, 'download_from_url')
|
||||
mock_call = mocker.patch.object(self.extractor, "call_api")
|
||||
mock_download = mocker.patch.object(self.extractor, "download_from_url")
|
||||
# Mock API responses
|
||||
mock_call.return_value = mock_user_response
|
||||
mock_download.return_value = "profile.jpg"
|
||||
@@ -132,17 +139,14 @@ class TestInstagramAPIExtractor(TestExtractorBase):
|
||||
|
||||
def test_download_profile_full(self, metadata, mock_user_response, mock_story_response, mocker):
|
||||
"""Test full profile download with stories/posts"""
|
||||
mock_call = mocker.patch.object(self.extractor, 'call_api')
|
||||
mock_posts = mocker.patch.object(self.extractor, 'download_all_posts')
|
||||
mock_highlights = mocker.patch.object(self.extractor, 'download_all_highlights')
|
||||
mock_tagged = mocker.patch.object(self.extractor, 'download_all_tagged')
|
||||
mock_stories = mocker.patch.object(self.extractor, '_download_stories_reusable')
|
||||
mock_call = mocker.patch.object(self.extractor, "call_api")
|
||||
mock_posts = mocker.patch.object(self.extractor, "download_all_posts")
|
||||
mock_highlights = mocker.patch.object(self.extractor, "download_all_highlights")
|
||||
mock_tagged = mocker.patch.object(self.extractor, "download_all_tagged")
|
||||
mock_stories = mocker.patch.object(self.extractor, "_download_stories_reusable")
|
||||
|
||||
self.extractor.full_profile = True
|
||||
mock_call.side_effect = [
|
||||
mock_user_response,
|
||||
mock_story_response
|
||||
]
|
||||
mock_call.side_effect = [mock_user_response, mock_story_response]
|
||||
mock_highlights.return_value = None
|
||||
mock_stories.return_value = mock_story_response
|
||||
mock_posts.return_value = None
|
||||
@@ -155,7 +159,7 @@ class TestInstagramAPIExtractor(TestExtractorBase):
|
||||
|
||||
def test_download_profile_not_found(self, metadata, mocker):
|
||||
"""Test profile not found error"""
|
||||
mock_call = mocker.patch.object(self.extractor, 'call_api')
|
||||
mock_call = mocker.patch.object(self.extractor, "call_api")
|
||||
mock_call.return_value = {"user": None}
|
||||
with pytest.raises(AssertionError) as exc_info:
|
||||
self.extractor.download_profile(metadata, "invalid_user")
|
||||
@@ -163,18 +167,14 @@ class TestInstagramAPIExtractor(TestExtractorBase):
|
||||
|
||||
def test_download_profile_error_handling(self, metadata, mock_user_response, mocker):
|
||||
"""Test error handling in full profile mode"""
|
||||
mock_call = mocker.patch.object(self.extractor, 'call_api')
|
||||
mock_highlights = mocker.patch.object(self.extractor, 'download_all_highlights')
|
||||
mock_tagged = mocker.patch.object(self.extractor, 'download_all_tagged')
|
||||
stories_tagged = mocker.patch.object(self.extractor, '_download_stories_reusable')
|
||||
mock_posts = mocker.patch.object(self.extractor, 'download_all_posts')
|
||||
mock_call = mocker.patch.object(self.extractor, "call_api")
|
||||
mock_highlights = mocker.patch.object(self.extractor, "download_all_highlights")
|
||||
mock_tagged = mocker.patch.object(self.extractor, "download_all_tagged")
|
||||
stories_tagged = mocker.patch.object(self.extractor, "_download_stories_reusable")
|
||||
mock_posts = mocker.patch.object(self.extractor, "download_all_posts")
|
||||
|
||||
self.extractor.full_profile = True
|
||||
mock_call.side_effect = [
|
||||
mock_user_response,
|
||||
Exception("Stories API failed"),
|
||||
Exception("Posts API failed")
|
||||
]
|
||||
mock_call.side_effect = [mock_user_response, Exception("Stories API failed"), Exception("Posts API failed")]
|
||||
mock_highlights.return_value = None
|
||||
mock_tagged.return_value = None
|
||||
stories_tagged.return_value = None
|
||||
@@ -182,4 +182,4 @@ class TestInstagramAPIExtractor(TestExtractorBase):
|
||||
result = self.extractor.download_profile(metadata, "test_user")
|
||||
|
||||
assert result.is_success()
|
||||
assert "Error downloading stories for test_user" in result.metadata["errors"]
|
||||
assert "Error downloading stories for test_user" in result.metadata["errors"]
|
||||
|
||||
@@ -1,21 +1,41 @@
|
||||
import pytest
|
||||
|
||||
from auto_archiver.modules.instagram_extractor import InstagramExtractor
|
||||
from .test_extractor_base import TestExtractorBase
|
||||
|
||||
class TestInstagramExtractor(TestExtractorBase):
|
||||
|
||||
extractor_module: str = 'instagram_extractor'
|
||||
config: dict = {}
|
||||
@pytest.fixture
|
||||
def instagram_extractor(setup_module, mocker):
|
||||
extractor_module: str = "instagram_extractor"
|
||||
config: dict = {
|
||||
"username": "user_name",
|
||||
"password": "password123",
|
||||
"download_folder": "instaloader",
|
||||
"session_file": "secrets/instaloader.session",
|
||||
}
|
||||
fake_loader = mocker.MagicMock()
|
||||
fake_loader.load_session_from_file.return_value = None
|
||||
fake_loader.login.return_value = None
|
||||
fake_loader.save_session_to_file.return_value = None
|
||||
mocker.patch(
|
||||
"instaloader.Instaloader",
|
||||
return_value=fake_loader,
|
||||
)
|
||||
return setup_module(extractor_module, config)
|
||||
|
||||
@pytest.mark.parametrize("url", [
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"url",
|
||||
[
|
||||
"https://www.instagram.com/p/",
|
||||
"https://www.instagram.com/p/1234567890/",
|
||||
"https://www.instagram.com/reel/1234567890/",
|
||||
"https://www.instagram.com/username/",
|
||||
"https://www.instagram.com/username/stories/",
|
||||
"https://www.instagram.com/username/highlights/",
|
||||
])
|
||||
def test_regex_matches(self, url):
|
||||
# post
|
||||
assert InstagramExtractor.valid_url.match(url)
|
||||
],
|
||||
)
|
||||
def test_regex_matches(url: str, instagram_extractor: InstagramExtractor) -> None:
|
||||
"""
|
||||
Ensure that the valid_url regex matches all provided Instagram URLs.
|
||||
"""
|
||||
assert instagram_extractor.valid_url.match(url)
|
||||
|
||||
@@ -7,10 +7,16 @@ from auto_archiver.modules.instagram_tbot_extractor import InstagramTbotExtracto
|
||||
from tests.extractors.test_extractor_base import TestExtractorBase
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def mock_sleep(mocker):
|
||||
"""Mock time.sleep to avoid delays."""
|
||||
return mocker.patch("time.sleep")
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def patch_extractor_methods(request, setup_module, mocker):
|
||||
mocker.patch.object(InstagramTbotExtractor, '_prepare_session_file', return_value=None)
|
||||
mocker.patch.object(InstagramTbotExtractor, '_initialize_telegram_client', return_value=None)
|
||||
mocker.patch.object(InstagramTbotExtractor, "_prepare_session_file", return_value=None)
|
||||
mocker.patch.object(InstagramTbotExtractor, "_initialize_telegram_client", return_value=None)
|
||||
yield
|
||||
|
||||
|
||||
@@ -35,12 +41,7 @@ def mock_telegram_client(mocker):
|
||||
@pytest.fixture
|
||||
def extractor(setup_module, patch_extractor_methods, mocker):
|
||||
extractor_module = "instagram_tbot_extractor"
|
||||
config = {
|
||||
"api_id": 12345,
|
||||
"api_hash": "test_api_hash",
|
||||
"session_file": "test_session",
|
||||
"timeout": 4
|
||||
}
|
||||
config = {"api_id": 12345, "api_hash": "test_api_hash", "session_file": "test_session", "timeout": 4}
|
||||
extractor = setup_module(extractor_module, config)
|
||||
extractor.client = mocker.MagicMock()
|
||||
extractor.session_file = "test_session"
|
||||
@@ -79,21 +80,30 @@ class TestInstagramTbotExtractorReal(TestExtractorBase):
|
||||
"session_file": "secrets/anon-insta",
|
||||
}
|
||||
|
||||
@pytest.mark.parametrize("url, expected_status, message, len_media", [
|
||||
("https://www.instagram.com/p/C4QgLbrIKXG", "insta-via-bot: success",
|
||||
"Are you new to Bellingcat? - The way we share our investigations is different. 💭\nWe want you to read our story but also learn ou",
|
||||
6),
|
||||
("https://www.instagram.com/reel/DEVLK8qoIbg/", "insta-via-bot: success",
|
||||
"Our volunteer community is at the centre of many incredible Bellingcat investigations and tools. Stephanie Ladel is one such vol",
|
||||
3),
|
||||
# instagram tbot not working (potentially intermittently?) for stories - replace with a live story to retest
|
||||
# ("https://www.instagram.com/stories/bellingcatofficial/3556336382743057476/", False, "Media not found or unavailable"),
|
||||
# Seems to be working intermittently for highlights
|
||||
# ("https://www.instagram.com/stories/highlights/17868810693068139/", "insta-via-bot: success", None, 50),
|
||||
# Marking invalid url as success
|
||||
("https://www.instagram.com/p/INVALID", "insta-via-bot: success", "Media not found or unavailable", 0),
|
||||
("https://www.youtube.com/watch?v=ymCMy8OffHM", False, None, 0),
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"url, expected_status, message, len_media",
|
||||
[
|
||||
(
|
||||
"https://www.instagram.com/p/C4QgLbrIKXG",
|
||||
"insta-via-bot: success",
|
||||
"Are you new to Bellingcat? - The way we share our investigations is different. 💭\nWe want you to read our story but also learn ou",
|
||||
6,
|
||||
),
|
||||
(
|
||||
"https://www.instagram.com/reel/DEVLK8qoIbg/",
|
||||
"insta-via-bot: success",
|
||||
"Our volunteer community is at the centre of many incredible Bellingcat investigations and tools. Stephanie Ladel is one such vol",
|
||||
3,
|
||||
),
|
||||
# instagram tbot not working (potentially intermittently?) for stories - replace with a live story to retest
|
||||
# ("https://www.instagram.com/stories/bellingcatofficial/3556336382743057476/", False, "Media not found or unavailable"),
|
||||
# Seems to be working intermittently for highlights
|
||||
# ("https://www.instagram.com/stories/highlights/17868810693068139/", "insta-via-bot: success", None, 50),
|
||||
# Marking invalid url as success
|
||||
("https://www.instagram.com/p/INVALID", "insta-via-bot: success", "Media not found or unavailable", 0),
|
||||
("https://www.youtube.com/watch?v=ymCMy8OffHM", False, None, 0),
|
||||
],
|
||||
)
|
||||
def test_download(self, url, expected_status, message, len_media, metadata_sample):
|
||||
"""Test the `download()` method with various Instagram URLs."""
|
||||
metadata_sample.set_url(url)
|
||||
|
||||
177
tests/extractors/test_tiktok_tikwm_extractor.py
Normal file
177
tests/extractors/test_tiktok_tikwm_extractor.py
Normal file
@@ -0,0 +1,177 @@
|
||||
from datetime import datetime, timezone
|
||||
import time
|
||||
import pytest
|
||||
import yt_dlp
|
||||
|
||||
from auto_archiver.modules.generic_extractor.generic_extractor import GenericExtractor
|
||||
from auto_archiver.modules.generic_extractor.tiktok import Tiktok, TikTokIE
|
||||
|
||||
from .test_extractor_base import TestExtractorBase
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def skip_ytdlp_own_methods(mocker):
|
||||
# mock this method, so that we skip the ytdlp download in these tests
|
||||
mocker.patch("auto_archiver.modules.generic_extractor.tiktok.Tiktok.skip_ytdlp_download", return_value=True)
|
||||
mocker.patch(
|
||||
"auto_archiver.modules.generic_extractor.generic_extractor.GenericExtractor.suitable_extractors",
|
||||
return_value=[e for e in yt_dlp.YoutubeDL()._ies.values() if e.IE_NAME == "TikTok"],
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_get(mocker):
|
||||
return mocker.patch("auto_archiver.modules.generic_extractor.tiktok.requests.get")
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def tiktok_dropin() -> Tiktok:
|
||||
return Tiktok()
|
||||
|
||||
|
||||
class TestTiktokTikwmExtractor(TestExtractorBase):
    """
    Tests for the TikTok dropin of the generic extractor, whose log messages
    refer to responses from tikwm.com.

    Most tests request the ``mock_get`` fixture so ``requests.get`` is a mock;
    the tests marked ``download`` do not request it.
    """

    extractor_module = "generic_extractor"
    extractor: GenericExtractor

    config = {}

    VALID_EXAMPLE_URL = "https://www.tiktok.com/@example/video/1234"

    @pytest.mark.parametrize(
        "url, is_suitable",
        [
            ("https://bellingcat.com", False),
            ("https://youtube.com", False),
            ("https://tiktok.co/", False),
            ("https://tiktok.com/", False),
            ("https://www.tiktok.com/", False),
            ("https://api.cool.tiktok.com/", False),
            (VALID_EXAMPLE_URL, True),
            ("https://www.tiktok.com/@bbcnews/video/7478038212070411542", True),
            ("https://www.tiktok.com/@ggs68taiwan.official/video/7441821351142362375", True),
            ("https://www.tiktok.com/t/ZP8YQ8e5j/", True),
            ("https://vt.tiktok.com/ZSMTJeqRP/", True),
        ],
    )
    def test_is_suitable(self, url, is_suitable, tiktok_dropin):
        """``Tiktok.suitable`` must return the expected verdict for each URL."""
        verdict = tiktok_dropin.suitable(url, TikTokIE())
        assert verdict == is_suitable

    def test_invalid_json_responses(self, mock_get, make_item, caplog):
        """``download()`` returns False when ``.json()`` on the response raises."""
        fake_response = mock_get.return_value
        fake_response.status_code = 200
        expected_log = (
            "failed to parse JSON response from tikwm.com for url='https://www.tiktok.com/@example/video/1234'"
        )

        # Round one: the JSON decoder raises ValueError.
        fake_response.json.side_effect = ValueError
        with caplog.at_level("DEBUG"):
            outcome = self.extractor.download(make_item(self.VALID_EXAMPLE_URL))
            assert outcome is False
            mock_get.assert_called_once()
            fake_response.json.assert_called_once()
            # The first captured message is only the 'Skipping using ytdlp to download
            # files for TikTok' one, so match against the whole captured text.
            assert expected_log in caplog.text

        # Round two: the JSON decoder raises a generic Exception.
        fake_response.json.side_effect = Exception
        with caplog.at_level("ERROR"):
            outcome = self.extractor.download(make_item(self.VALID_EXAMPLE_URL))
            assert outcome is False
            mock_get.assert_called()
            assert mock_get.call_count == 2
            assert fake_response.json.call_count == 2
            # NOTE(review): caplog still holds the record captured in round one, so this
            # check would also pass if round two logged nothing. Consider caplog.clear()
            # before round two if the two rounds are meant to be independent - confirm
            # what is logged for a non-ValueError first.
            assert expected_log in caplog.text

    @pytest.mark.parametrize(
        "response",
        [
            {"msg": "failure"},
            {"msg": "success"},
        ],
    )
    def test_unsuccessful_responses(self, mock_get, make_item, response, caplog):
        """Payloads without usable data make ``download()`` return False and log the failure."""
        fake_response = mock_get.return_value
        fake_response.status_code = 200
        fake_response.json.return_value = response

        with caplog.at_level("DEBUG"):
            outcome = self.extractor.download(make_item(self.VALID_EXAMPLE_URL))
            assert outcome is False
            mock_get.assert_called_once()
            fake_response.json.assert_called_once()
            assert "failed to get a valid response from tikwm.com" in caplog.text

    @pytest.mark.parametrize(
        "response,has_vid",
        [
            ({"data": {"id": 123}}, False),
            ({"data": {"wmplay": "url"}}, True),
            ({"data": {"play": "url"}}, True),
        ],
    )
    def test_correct_extraction(self, mock_get, make_item, response, has_vid, mocker):
        """A video is extracted only when the payload carries a ``wmplay`` or ``play`` entry."""
        payload = {"msg": "success"}
        payload.update(response)
        mock_get.return_value.status_code = 200
        mock_get.return_value.json.return_value = payload

        result = self.extractor.download(make_item(self.VALID_EXAMPLE_URL))

        if has_vid:
            assert result.is_success()
            assert len(result.media) == 1
        else:
            assert result is False

        mock_get.assert_called()
        # One extra ``requests.get`` call is expected when there is a video.
        expected_get_calls = 1 + int(has_vid)
        assert mock_get.call_count == expected_get_calls
        mock_get.return_value.json.assert_called_once()

    def test_correct_data_extracted(self, mock_get, make_item):
        """Fields of a successful payload end up on the resulting metadata and media."""
        api_payload = {
            "msg": "success",
            "data": {
                "wmplay": "url",
                "origin_cover": "cover.jpg",
                "title": "Title",
                "id": 123,
                "duration": 60,
                "create_time": 1736301699,
                "author": "Author",
                "other": "data",
            },
        }
        mock_get.return_value.status_code = 200
        mock_get.return_value.json.return_value = api_payload

        result = self.extractor.download(make_item(self.VALID_EXAMPLE_URL))

        assert result.is_success()
        assert len(result.media) == 2
        assert result.get_title() == "Title"
        assert result.get("author") == "Author"
        assert result.get("api_data") == {"other": "data", "id": 123}
        assert result.media[1].get("duration") == 60
        expected_timestamp = datetime.fromtimestamp(1736301699, tz=timezone.utc)
        assert result.get("timestamp") == expected_timestamp

    @pytest.mark.download
    def test_download_video(self, make_item):
        """Download a BBC News video (no ``mock_get`` here) and check its metadata."""
        url = "https://www.tiktok.com/@bbcnews/video/7478038212070411542"
        expected_title = "The A23a iceberg is one of the world's oldest and it's so big you can see it from space. #Iceberg #A23a #Antarctica #Ice #ClimateChange #DavidAttenborough #Ocean #Sea #SouthGeorgia #BBCNews "

        result = self.extractor.download(make_item(url))

        assert result.is_success()
        assert len(result.media) == 2
        assert result.get_title() == expected_title
        assert result.get("author").get("unique_id") == "bbcnews"
        assert result.get("api_data").get("id") == "7478038212070411542"
        assert result.media[1].get("duration") == 59
        assert result.get("timestamp") == datetime.fromtimestamp(1741122000, tz=timezone.utc)

    @pytest.mark.download
    def test_download_sensitive_video(self, make_item):
        """Download the ggs68taiwan.official video (no ``mock_get`` here) and check its metadata."""
        url = "https://www.tiktok.com/@ggs68taiwan.official/video/7441821351142362375"
        # Required for rate limiting
        time.sleep(1.1)

        result = self.extractor.download(make_item(url))

        assert result.is_success()
        assert len(result.media) == 2
        assert result.get_title() == "Căng nhất lúc này #ggs68 #ggs68taiwan #taiwan #dailoan #tiktoknews"
        assert result.get("author").get("id") == "7197400619475649562"
        assert result.get("api_data").get("id") == "7441821351142362375"
        assert result.media[1].get("duration") == 34
        assert result.get("timestamp") == datetime.fromtimestamp(1732684060, tz=timezone.utc)
|
||||
@@ -1,6 +1,5 @@
|
||||
import os
|
||||
import datetime
|
||||
import hashlib
|
||||
import pytest
|
||||
|
||||
from pytwitter.models.media import MediaVariant
|
||||
@@ -10,8 +9,7 @@ from auto_archiver.modules.twitter_api_extractor import TwitterApiExtractor
|
||||
|
||||
@pytest.mark.incremental
|
||||
class TestTwitterApiExtractor(TestExtractorBase):
|
||||
|
||||
extractor_module = 'twitter_api_extractor'
|
||||
extractor_module: TwitterApiExtractor = "twitter_api_extractor"
|
||||
|
||||
config = {
|
||||
"bearer_tokens": [],
|
||||
@@ -22,41 +20,79 @@ class TestTwitterApiExtractor(TestExtractorBase):
|
||||
"access_secret": os.environ.get("TWITTER_ACCESS_SECRET"),
|
||||
}
|
||||
|
||||
@pytest.mark.parametrize("url, expected", [
|
||||
("https://x.com/bellingcat/status/1874097816571961839", "https://x.com/bellingcat/status/1874097816571961839"), # x.com urls unchanged
|
||||
("https://twitter.com/bellingcat/status/1874097816571961839", "https://twitter.com/bellingcat/status/1874097816571961839"), # twitter urls unchanged
|
||||
("https://twitter.com/bellingcat/status/1874097816571961839?s=20&t=3d0g4ZQis7dCbSDg-mE7-w", "https://twitter.com/bellingcat/status/1874097816571961839?s=20&t=3d0g4ZQis7dCbSDg-mE7-w"), # don't strip params from twitter urls (changed Jan 2025)
|
||||
("https://www.bellingcat.com/category/resources/", "https://www.bellingcat.com/category/resources/"), # non-twitter/x urls unchanged
|
||||
("https://www.bellingcat.com/category/resources/?s=20&t=3d0g4ZQis7dCbSDg-mE7-w", "https://www.bellingcat.com/category/resources/?s=20&t=3d0g4ZQis7dCbSDg-mE7-w"), # shouldn't strip params from non-twitter/x URLs
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"url, expected",
|
||||
[
|
||||
(
|
||||
"https://x.com/bellingcat/status/1874097816571961839",
|
||||
"https://x.com/bellingcat/status/1874097816571961839",
|
||||
), # x.com urls unchanged
|
||||
(
|
||||
"https://twitter.com/bellingcat/status/1874097816571961839",
|
||||
"https://twitter.com/bellingcat/status/1874097816571961839",
|
||||
), # twitter urls unchanged
|
||||
(
|
||||
"https://twitter.com/bellingcat/status/1874097816571961839?s=20&t=3d0g4ZQis7dCbSDg-mE7-w",
|
||||
"https://twitter.com/bellingcat/status/1874097816571961839?s=20&t=3d0g4ZQis7dCbSDg-mE7-w",
|
||||
), # don't strip params from twitter urls (changed Jan 2025)
|
||||
(
|
||||
"https://www.bellingcat.com/category/resources/",
|
||||
"https://www.bellingcat.com/category/resources/",
|
||||
), # non-twitter/x urls unchanged
|
||||
(
|
||||
"https://www.bellingcat.com/category/resources/?s=20&t=3d0g4ZQis7dCbSDg-mE7-w",
|
||||
"https://www.bellingcat.com/category/resources/?s=20&t=3d0g4ZQis7dCbSDg-mE7-w",
|
||||
), # shouldn't strip params from non-twitter/x URLs
|
||||
],
|
||||
)
|
||||
def test_sanitize_url(self, url, expected):
|
||||
assert expected == self.extractor.sanitize_url(url)
|
||||
|
||||
@pytest.mark.download
|
||||
def test_sanitize_url_download(self):
|
||||
assert "https://www.bellingcat.com/category/resources/" == self.extractor.sanitize_url("https://t.co/yl3oOJatFp")
|
||||
assert "https://www.bellingcat.com/category/resources/" == self.extractor.sanitize_url(
|
||||
"https://t.co/yl3oOJatFp"
|
||||
)
|
||||
|
||||
@pytest.mark.parametrize("url, exptected_username, exptected_tweetid", [
|
||||
("https://twitter.com/bellingcat/status/1874097816571961839", "bellingcat", "1874097816571961839"),
|
||||
("https://x.com/bellingcat/status/1874097816571961839", "bellingcat", "1874097816571961839"),
|
||||
("https://www.bellingcat.com/category/resources/", False, False)
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"url, exptected_username, exptected_tweetid",
|
||||
[
|
||||
("https://twitter.com/bellingcat/status/1874097816571961839", "bellingcat", "1874097816571961839"),
|
||||
("https://x.com/bellingcat/status/1874097816571961839", "bellingcat", "1874097816571961839"),
|
||||
("https://www.bellingcat.com/category/resources/", False, False),
|
||||
],
|
||||
)
|
||||
def test_get_username_tweet_id_from_url(self, url, exptected_username, exptected_tweetid):
|
||||
|
||||
username, tweet_id = self.extractor.get_username_tweet_id(url)
|
||||
assert exptected_username == username
|
||||
assert exptected_tweetid == tweet_id
|
||||
|
||||
def test_choose_variants(self):
|
||||
# taken from the response for url https://x.com/bellingcat/status/1871552600346415571
|
||||
variant_list = [MediaVariant(content_type='application/x-mpegURL', url='https://video.twimg.com/ext_tw_video/1871551993677852672/pu/pl/ovWo7ux-bKROwYIC.m3u8?tag=12&v=e1b'),
|
||||
MediaVariant(bit_rate=256000, content_type='video/mp4', url='https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/480x270/OqZIrKV0LFswMvxS.mp4?tag=12'),
|
||||
MediaVariant(bit_rate=832000, content_type='video/mp4', url='https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/640x360/uiDZDSmZ8MZn9hsi.mp4?tag=12'),
|
||||
MediaVariant(bit_rate=2176000, content_type='video/mp4', url='https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/1280x720/6Y340Esh568WZnRZ.mp4?tag=12')
|
||||
]
|
||||
variant_list = [
|
||||
MediaVariant(
|
||||
content_type="application/x-mpegURL",
|
||||
url="https://video.twimg.com/ext_tw_video/1871551993677852672/pu/pl/ovWo7ux-bKROwYIC.m3u8?tag=12&v=e1b",
|
||||
),
|
||||
MediaVariant(
|
||||
bit_rate=256000,
|
||||
content_type="video/mp4",
|
||||
url="https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/480x270/OqZIrKV0LFswMvxS.mp4?tag=12",
|
||||
),
|
||||
MediaVariant(
|
||||
bit_rate=832000,
|
||||
content_type="video/mp4",
|
||||
url="https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/640x360/uiDZDSmZ8MZn9hsi.mp4?tag=12",
|
||||
),
|
||||
MediaVariant(
|
||||
bit_rate=2176000,
|
||||
content_type="video/mp4",
|
||||
url="https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/1280x720/6Y340Esh568WZnRZ.mp4?tag=12",
|
||||
),
|
||||
]
|
||||
chosen_variant = self.extractor.choose_variant(variant_list)
|
||||
assert chosen_variant == variant_list[3]
|
||||
|
||||
|
||||
@pytest.mark.skipif(not os.environ.get("TWITTER_BEARER_TOKEN"), reason="No Twitter bearer token provided")
|
||||
@pytest.mark.download
|
||||
def test_download_nonexistent_tweet(self, make_item):
|
||||
@@ -76,7 +112,6 @@ class TestTwitterApiExtractor(TestExtractorBase):
|
||||
@pytest.mark.skipif(not os.environ.get("TWITTER_BEARER_TOKEN"), reason="No Twitter bearer token provided")
|
||||
@pytest.mark.download
|
||||
def test_download_tweet_no_media(self, make_item):
|
||||
|
||||
item = make_item("https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w")
|
||||
post = self.extractor.download(item)
|
||||
|
||||
@@ -84,7 +119,7 @@ class TestTwitterApiExtractor(TestExtractorBase):
|
||||
post,
|
||||
"Onion rings are just vegetable donuts.",
|
||||
datetime.datetime(2023, 1, 24, 16, 25, 51, tzinfo=datetime.timezone.utc),
|
||||
"twitter-api: success"
|
||||
"twitter-api: success",
|
||||
)
|
||||
|
||||
@pytest.mark.skipif(not os.environ.get("TWITTER_BEARER_TOKEN"), reason="No Twitter bearer token provided")
|
||||
@@ -95,27 +130,41 @@ class TestTwitterApiExtractor(TestExtractorBase):
|
||||
self.assertValidResponseMetadata(
|
||||
post,
|
||||
"This month's Bellingchat Premium is with @KolinaKoltai. She reveals how she investigated a platform allowing users to create AI-generated child sexual abuse material and explains why it's crucial to investigate the people behind these services https://t.co/SfBUq0hSD0 https://t.co/rIHx0WlKp8",
|
||||
datetime.datetime(2024, 12, 24, 13, 44, 46, tzinfo=datetime.timezone.utc)
|
||||
datetime.datetime(2024, 12, 24, 13, 44, 46, tzinfo=datetime.timezone.utc),
|
||||
)
|
||||
|
||||
@pytest.mark.skipif(not os.environ.get("TWITTER_BEARER_TOKEN"), reason="No Twitter bearer token provided")
|
||||
@pytest.mark.parametrize("url, title, timestamp", [
|
||||
("https://x.com/SozinhoRamalho/status/1876710769913450647", "ignore tweet, testing sensitivity warning nudity https://t.co/t3u0hQsSB1", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc)),
|
||||
("https://x.com/SozinhoRamalho/status/1876710875475681357", "ignore tweet, testing sensitivity warning violence https://t.co/syYDSkpjZD", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc)),
|
||||
("https://x.com/SozinhoRamalho/status/1876711053813227618", "ignore tweet, testing sensitivity warning sensitive https://t.co/XE7cRdjzYq", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc)),
|
||||
("https://x.com/SozinhoRamalho/status/1876711141314801937", "ignore tweet, testing sensitivity warning nudity, violence, sensitivity https://t.co/YxCFbbhYE3", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc)),
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"url, title, timestamp",
|
||||
[
|
||||
(
|
||||
"https://x.com/SozinhoRamalho/status/1876710769913450647",
|
||||
"ignore tweet, testing sensitivity warning nudity https://t.co/t3u0hQsSB1",
|
||||
datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
|
||||
),
|
||||
(
|
||||
"https://x.com/SozinhoRamalho/status/1876710875475681357",
|
||||
"ignore tweet, testing sensitivity warning violence https://t.co/syYDSkpjZD",
|
||||
datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
|
||||
),
|
||||
(
|
||||
"https://x.com/SozinhoRamalho/status/1876711053813227618",
|
||||
"ignore tweet, testing sensitivity warning sensitive https://t.co/XE7cRdjzYq",
|
||||
datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
|
||||
),
|
||||
(
|
||||
"https://x.com/SozinhoRamalho/status/1876711141314801937",
|
||||
"ignore tweet, testing sensitivity warning nudity, violence, sensitivity https://t.co/YxCFbbhYE3",
|
||||
datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
|
||||
),
|
||||
],
|
||||
)
|
||||
@pytest.mark.download
|
||||
def test_download_sensitive_media(self, url, title, timestamp, check_hash, make_item):
|
||||
|
||||
"""Download tweets with sensitive media"""
|
||||
|
||||
post = self.extractor.download(make_item(url))
|
||||
self.assertValidResponseMetadata(
|
||||
post,
|
||||
title,
|
||||
timestamp
|
||||
)
|
||||
self.assertValidResponseMetadata(post, title, timestamp)
|
||||
assert len(post.media) == 1
|
||||
# check the hash (quick) of the media, to make sure it's valid (the 64-hex-digit digest below is SHA-256-sized, not SHA1)
|
||||
check_hash(post.media[0].filename, "3eea9c03b2dcedd1eb9a169d8bfd1cf877996fab4961de019a96eb9d32d2d733")
|
||||
check_hash(post.media[0].filename, "3eea9c03b2dcedd1eb9a169d8bfd1cf877996fab4961de019a96eb9d32d2d733")
|
||||
|
||||
77
tests/extractors/test_vk_extractor.py
Normal file
77
tests/extractors/test_vk_extractor.py
Normal file
@@ -0,0 +1,77 @@
|
||||
import pytest
|
||||
|
||||
from auto_archiver.core import Metadata
|
||||
from auto_archiver.modules.vk_extractor import VkExtractor
|
||||
|
||||
|
||||
@pytest.fixture
def mock_vk_scraper(mocker):
    """Patch ``VkScraper`` where the vk_extractor module looks it up and return the mock."""
    patched_scraper = mocker.patch("auto_archiver.modules.vk_extractor.vk_extractor.VkScraper")
    return patched_scraper
|
||||
|
||||
|
||||
@pytest.fixture
def vk_extractor(setup_module, mock_vk_scraper) -> VkExtractor:
    """Set up the ``vk_extractor`` module with dummy credentials and a mocked scraper.

    After setup, the module's ``vks`` attribute is replaced with the
    ``return_value`` of the patched ``VkScraper`` so tests can configure
    ``scrape`` / ``download_media`` on it directly.
    """
    extractor = setup_module(
        "vk_extractor",
        {
            "username": "name",
            "password": "password123",
            "session_file": "secrets/vk_config.v2.json",
        },
    )
    extractor.vks = mock_vk_scraper.return_value
    return extractor
|
||||
|
||||
|
||||
def test_netloc(vk_extractor, metadata):
    """A URL that is not on vk.com must make ``download()`` return False."""
    # The ``metadata`` fixture has its url set as: "https://example.com/"
    outcome = vk_extractor.download(metadata)
    assert outcome is False
|
||||
|
||||
|
||||
def test_vk_url_but_scrape_returns_empty(vk_extractor, metadata):
    """A vk.com URL whose scrape yields nothing must make ``download()`` return False."""
    scraper = vk_extractor.vks
    scraper.scrape.return_value = []
    metadata.set_url("https://vk.com/valid-wall")

    outcome = vk_extractor.download(metadata)

    assert outcome is False
    assert metadata.netloc == "vk.com"
    # The scraper must have been asked exactly once, with the item's URL.
    scraper.scrape.assert_called_once_with(metadata.get_url())
|
||||
|
||||
|
||||
def test_successful_scrape_and_download(vk_extractor, metadata, mocker):
    """Two scraped posts and two downloaded files produce a successful result with two media."""
    scraped_posts = [
        {"text": "Post Title", "datetime": "2023-01-01T00:00:00", "id": 1},
        {"text": "Another Post", "datetime": "2023-01-02T00:00:00", "id": 2},
    ]
    downloaded_files = ["image1.jpg", "image2.png"]

    scraper = vk_extractor.vks
    scraper.scrape.return_value = scraped_posts
    scraper.download_media.return_value = downloaded_files
    metadata.set_url("https://vk.com/valid-wall")

    result = vk_extractor.download(metadata)

    # Test metadata
    assert result.is_success()
    assert result.status == "vk: success"
    assert result.get_title() == "Post Title"
    assert result.get_timestamp() == "2023-01-01T00:00:00+00:00"
    assert "Another Post" in result.metadata["content"]

    # Test Media objects
    assert len(result.media) == 2
    for media, expected_filename in zip(result.media, downloaded_files):
        assert media.filename == expected_filename
    scraper.download_media.assert_called_once_with(scraped_posts, vk_extractor.tmp_dir)
|
||||
|
||||
|
||||
def test_adds_first_title_and_timestamp(vk_extractor):
    """The title and timestamp of the result come from the first scraped post.

    The item is built from scratch (not from a fixture) with a vk.com URL, the
    scraper returns two posts and no media files, and the result must carry
    the first post's text as title and its datetime as a formatted timestamp.
    """
    # ``set_url`` returns the Metadata object, so a single chained call is
    # enough; the original repeated the same ``set_url`` call a second time.
    metadata = Metadata().set_url("https://vk.com/no-metadata")
    mock_scrapes = [
        {"text": "value", "datetime": "2023-01-01T00:00:00"},
        {"text": "value2", "datetime": "2023-01-02T00:00:00"},
    ]
    vk_extractor.vks.scrape.return_value = mock_scrapes
    vk_extractor.vks.download_media.return_value = []

    result = vk_extractor.download(metadata)

    assert result.get_title() == "value"
    # formatted timestamp
    assert result.get_timestamp() == "2023-01-01T00:00:00+00:00"
    assert result.is_success()
|
||||
@@ -1,5 +1,5 @@
|
||||
import pytest
|
||||
from auto_archiver.modules.atlos_feeder import AtlosFeeder
|
||||
from auto_archiver.modules.atlos_feeder_db_storage import AtlosFeederDbStorage as AtlosFeeder
|
||||
|
||||
|
||||
class FakeAPIResponse:
|
||||
@@ -18,44 +18,63 @@ class FakeAPIResponse:
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def atlos_feeder(setup_module) -> AtlosFeeder:
|
||||
def atlos_feeder(setup_module, mocker) -> AtlosFeeder:
|
||||
"""Fixture for AtlosFeeder."""
|
||||
configs: dict = {
|
||||
"api_token": "abc123",
|
||||
"atlos_url": "https://platform.atlos.org",
|
||||
}
|
||||
return setup_module("atlos_feeder", configs)
|
||||
mocker.patch("requests.Session")
|
||||
atlos_feeder = setup_module("atlos_feeder_db_storage", configs)
|
||||
fake_session = mocker.MagicMock()
|
||||
# Configure the default response to have no results so that __iter__ terminates
|
||||
fake_session.get.return_value = FakeAPIResponse({"next": None, "results": []})
|
||||
atlos_feeder.session = fake_session
|
||||
return atlos_feeder
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_atlos_api(mocker):
|
||||
"""Fixture to mock requests to Atlos API."""
|
||||
def mock_atlos_api(atlos_feeder):
|
||||
"""Fixture to update the atlos_feeder.session.get side_effect."""
|
||||
|
||||
def _mock_responses(responses):
|
||||
mocker.patch(
|
||||
"requests.get",
|
||||
side_effect=[FakeAPIResponse(data) for data in responses],
|
||||
)
|
||||
atlos_feeder.session.get.side_effect = [FakeAPIResponse(data) for data in responses]
|
||||
|
||||
return _mock_responses
|
||||
|
||||
|
||||
def test_atlos_feeder_iter_yields_valid_metadata(atlos_feeder, mock_atlos_api):
|
||||
"""Test valid items are yielded and invalid ones ignored."""
|
||||
mock_atlos_api([
|
||||
{
|
||||
"next": None,
|
||||
"results": [
|
||||
{"source_url": "http://example.com", "id": 1,
|
||||
"metadata": {"auto_archiver": {"processed": False}},
|
||||
"visibility": "visible", "status": "complete"},
|
||||
{"source_url": "", "id": 2,
|
||||
"metadata": {"auto_archiver": {"processed": False}},
|
||||
"visibility": "visible", "status": "complete"},
|
||||
{"source_url": "http://example.org", "id": 3,
|
||||
"metadata": {"auto_archiver": {"processed": True}},
|
||||
"visibility": "visible", "status": "complete"},
|
||||
],
|
||||
}
|
||||
])
|
||||
mock_atlos_api(
|
||||
[
|
||||
{
|
||||
"next": None,
|
||||
"results": [
|
||||
{
|
||||
"source_url": "http://example.com",
|
||||
"id": 1,
|
||||
"metadata": {"auto_archiver": {"processed": False}},
|
||||
"visibility": "visible",
|
||||
"status": "complete",
|
||||
},
|
||||
{
|
||||
"source_url": "",
|
||||
"id": 2,
|
||||
"metadata": {"auto_archiver": {"processed": False}},
|
||||
"visibility": "visible",
|
||||
"status": "complete",
|
||||
},
|
||||
{
|
||||
"source_url": "http://example.org",
|
||||
"id": 3,
|
||||
"metadata": {"auto_archiver": {"processed": True}},
|
||||
"visibility": "visible",
|
||||
"status": "complete",
|
||||
},
|
||||
],
|
||||
}
|
||||
]
|
||||
)
|
||||
|
||||
items = list(atlos_feeder)
|
||||
assert len(items) == 1
|
||||
@@ -65,24 +84,34 @@ def test_atlos_feeder_iter_yields_valid_metadata(atlos_feeder, mock_atlos_api):
|
||||
|
||||
def test_atlos_feeder_multiple_pages(atlos_feeder, mock_atlos_api):
|
||||
"""Test iteration over multiple pages with valid items."""
|
||||
mock_atlos_api([
|
||||
{
|
||||
"next": "cursor2",
|
||||
"results": [
|
||||
{"source_url": "http://example1.com", "id": 10,
|
||||
"metadata": {"auto_archiver": {"processed": False}},
|
||||
"visibility": "visible", "status": "complete"},
|
||||
],
|
||||
},
|
||||
{
|
||||
"next": None,
|
||||
"results": [
|
||||
{"source_url": "http://example2.com", "id": 20,
|
||||
"metadata": {"auto_archiver": {"processed": False}},
|
||||
"visibility": "visible", "status": "complete"},
|
||||
],
|
||||
},
|
||||
])
|
||||
mock_atlos_api(
|
||||
[
|
||||
{
|
||||
"next": "cursor2",
|
||||
"results": [
|
||||
{
|
||||
"source_url": "http://example1.com",
|
||||
"id": 10,
|
||||
"metadata": {"auto_archiver": {"processed": False}},
|
||||
"visibility": "visible",
|
||||
"status": "complete",
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
"next": None,
|
||||
"results": [
|
||||
{
|
||||
"source_url": "http://example2.com",
|
||||
"id": 20,
|
||||
"metadata": {"auto_archiver": {"processed": False}},
|
||||
"visibility": "visible",
|
||||
"status": "complete",
|
||||
},
|
||||
],
|
||||
},
|
||||
]
|
||||
)
|
||||
|
||||
items = list(atlos_feeder)
|
||||
assert len(items) == 2
|
||||
@@ -100,9 +129,7 @@ def test_atlos_feeder_no_results(atlos_feeder, mock_atlos_api):
|
||||
|
||||
def test_atlos_feeder_http_error(atlos_feeder, mocker):
|
||||
"""Test raises an exception on HTTP error."""
|
||||
mocker.patch(
|
||||
"requests.get",
|
||||
return_value=FakeAPIResponse({"next": None, "results": []}, raise_error=True),
|
||||
)
|
||||
fake_response = FakeAPIResponse({"next": None, "results": []}, raise_error=True)
|
||||
atlos_feeder.session.get.side_effect = [fake_response]
|
||||
with pytest.raises(Exception, match="HTTP error"):
|
||||
list(atlos_feeder)
|
||||
|
||||
@@ -1,13 +1,16 @@
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def headerless_csv_file():
|
||||
return "tests/data/csv_no_headers.csv"
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def header_csv_file():
|
||||
return "tests/data/csv_with_headers.csv"
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def header_csv_file_non_default_column():
|
||||
return "tests/data/csv_with_headers_non_default_column.csv"
|
||||
@@ -23,6 +26,7 @@ def test_csv_feeder_no_headers(headerless_csv_file, setup_module):
|
||||
assert urls[0].get_url() == "https://example.com/1/"
|
||||
assert urls[1].get_url() == "https://example.com/2/"
|
||||
|
||||
|
||||
def test_csv_feeder_with_headers(header_csv_file, setup_module):
|
||||
from auto_archiver.modules.csv_feeder.csv_feeder import CSVFeeder
|
||||
|
||||
@@ -33,10 +37,10 @@ def test_csv_feeder_with_headers(header_csv_file, setup_module):
|
||||
assert urls[0].get_url() == "https://example.com/1/"
|
||||
assert urls[1].get_url() == "https://example.com/2/"
|
||||
|
||||
|
||||
def test_csv_feeder_wrong_column(header_csv_file, setup_module, caplog):
|
||||
from auto_archiver.modules.csv_feeder.csv_feeder import CSVFeeder
|
||||
|
||||
|
||||
with caplog.at_level("WARNING"):
|
||||
feeder = setup_module(CSVFeeder, {"files": [header_csv_file], "column": 1})
|
||||
urls = list(feeder)
|
||||
@@ -54,4 +58,4 @@ def test_csv_feeder_column_by_name(header_csv_file, setup_module):
|
||||
urls = list(feeder)
|
||||
assert len(urls) == 2
|
||||
assert urls[0].get_url() == "https://example.com/1/"
|
||||
assert urls[1].get_url() == "https://example.com/2/"
|
||||
assert urls[1].get_url() == "https://example.com/2/"
|
||||
|
||||
@@ -2,7 +2,7 @@ from typing import Type
|
||||
|
||||
import gspread
|
||||
import pytest
|
||||
from auto_archiver.modules.gsheet_feeder import GsheetsFeeder
|
||||
from auto_archiver.modules.gsheet_feeder_db import GsheetsFeederDB
|
||||
from auto_archiver.core import Metadata, Feeder
|
||||
|
||||
|
||||
@@ -11,43 +11,40 @@ def test_setup_without_sheet_and_sheet_id(setup_module, mocker):
|
||||
mocker.patch("gspread.service_account")
|
||||
with pytest.raises(ValueError):
|
||||
setup_module(
|
||||
"gsheet_feeder",
|
||||
"gsheet_feeder_db",
|
||||
{"service_account": "dummy.json", "sheet": None, "sheet_id": None},
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def gsheet_feeder(setup_module, mocker) -> GsheetsFeeder:
|
||||
def gsheet_feeder(setup_module, mocker) -> GsheetsFeederDB:
|
||||
config: dict = {
|
||||
"service_account": "dummy.json",
|
||||
"sheet": "test-auto-archiver",
|
||||
"sheet_id": None,
|
||||
"header": 1,
|
||||
"columns": {
|
||||
"url": "link",
|
||||
"status": "archive status",
|
||||
"folder": "destination folder",
|
||||
"archive": "archive location",
|
||||
"date": "archive date",
|
||||
"thumbnail": "thumbnail",
|
||||
"timestamp": "upload timestamp",
|
||||
"title": "upload title",
|
||||
"text": "text content",
|
||||
"screenshot": "screenshot",
|
||||
"hash": "hash",
|
||||
"pdq_hash": "perceptual hashes",
|
||||
"wacz": "wacz",
|
||||
"replaywebpage": "replaywebpage",
|
||||
},
|
||||
"allow_worksheets": set(),
|
||||
"block_worksheets": set(),
|
||||
"use_sheet_names_in_stored_paths": True,
|
||||
}
|
||||
"service_account": "dummy.json",
|
||||
"sheet": "test-auto-archiver",
|
||||
"sheet_id": None,
|
||||
"header": 1,
|
||||
"columns": {
|
||||
"url": "link",
|
||||
"status": "archive status",
|
||||
"folder": "destination folder",
|
||||
"archive": "archive location",
|
||||
"date": "archive date",
|
||||
"thumbnail": "thumbnail",
|
||||
"timestamp": "upload timestamp",
|
||||
"title": "upload title",
|
||||
"text": "text content",
|
||||
"screenshot": "screenshot",
|
||||
"hash": "hash",
|
||||
"pdq_hash": "perceptual hashes",
|
||||
"wacz": "wacz",
|
||||
"replaywebpage": "replaywebpage",
|
||||
},
|
||||
"allow_worksheets": set(),
|
||||
"block_worksheets": set(),
|
||||
"use_sheet_names_in_stored_paths": True,
|
||||
}
|
||||
mocker.patch("gspread.service_account")
|
||||
feeder = setup_module(
|
||||
"gsheet_feeder",
|
||||
config
|
||||
)
|
||||
feeder = setup_module("gsheet_feeder_db", config)
|
||||
feeder.gsheets_client = mocker.MagicMock()
|
||||
return feeder
|
||||
|
||||
@@ -90,7 +87,7 @@ class MockWorksheet:
|
||||
return matching.get(col_name, default)
|
||||
|
||||
|
||||
def test__process_rows(gsheet_feeder: GsheetsFeeder):
|
||||
def test__process_rows(gsheet_feeder: GsheetsFeederDB):
|
||||
testworksheet = MockWorksheet()
|
||||
metadata_items = list(gsheet_feeder._process_rows(testworksheet))
|
||||
assert len(metadata_items) == 3
|
||||
@@ -98,7 +95,7 @@ def test__process_rows(gsheet_feeder: GsheetsFeeder):
|
||||
assert metadata_items[0].get("url") == "http://example.com"
|
||||
|
||||
|
||||
def test__set_metadata(gsheet_feeder: GsheetsFeeder):
|
||||
def test__set_metadata(gsheet_feeder: GsheetsFeederDB):
|
||||
worksheet = MockWorksheet()
|
||||
metadata = Metadata()
|
||||
gsheet_feeder._set_context(metadata, worksheet, 1)
|
||||
@@ -106,12 +103,12 @@ def test__set_metadata(gsheet_feeder: GsheetsFeeder):
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="Not recognising folder column")
|
||||
def test__set_metadata_with_folder_pickled(gsheet_feeder: GsheetsFeeder, worksheet):
|
||||
def test__set_metadata_with_folder_pickled(gsheet_feeder: GsheetsFeederDB, worksheet):
|
||||
gsheet_feeder._set_context(worksheet, 7)
|
||||
assert Metadata.get_context("gsheet") == {"row": 1, "worksheet": worksheet}
|
||||
|
||||
|
||||
def test__set_metadata_with_folder(gsheet_feeder: GsheetsFeeder):
|
||||
def test__set_metadata_with_folder(gsheet_feeder: GsheetsFeederDB):
|
||||
testworksheet = MockWorksheet()
|
||||
metadata = Metadata()
|
||||
testworksheet.wks.title = "TestSheet"
|
||||
@@ -128,9 +125,7 @@ def test__set_metadata_with_folder(gsheet_feeder: GsheetsFeeder):
|
||||
(None, "ABC123", "open_by_key", "ABC123", "opening by sheet ID"),
|
||||
],
|
||||
)
|
||||
def test_open_sheet_with_name_or_id(
|
||||
setup_module, sheet, sheet_id, expected_method, expected_arg, description, mocker
|
||||
):
|
||||
def test_open_sheet_with_name_or_id(setup_module, sheet, sheet_id, expected_method, expected_arg, description, mocker):
|
||||
"""Ensure open_sheet() correctly opens by name or ID based on configuration."""
|
||||
mock_service_account = mocker.patch("gspread.service_account")
|
||||
mock_client = mocker.MagicMock()
|
||||
@@ -140,14 +135,12 @@ def test_open_sheet_with_name_or_id(
|
||||
|
||||
# Setup module with parameterized values
|
||||
feeder = setup_module(
|
||||
"gsheet_feeder",
|
||||
"gsheet_feeder_db",
|
||||
{"service_account": "dummy.json", "sheet": sheet, "sheet_id": sheet_id},
|
||||
)
|
||||
sheet_result = feeder.open_sheet()
|
||||
# Validate the correct method was called
|
||||
getattr(mock_client, expected_method).assert_called_once_with(
|
||||
expected_arg
|
||||
), f"Failed: {description}"
|
||||
getattr(mock_client, expected_method).assert_called_once_with(expected_arg), f"Failed: {description}"
|
||||
assert sheet_result == "MockSheet", f"Failed: {description}"
|
||||
|
||||
|
||||
@@ -159,7 +152,7 @@ def test_open_sheet_with_sheet_id(setup_module, mocker):
|
||||
mock_service_account.return_value = mock_client
|
||||
mock_client.open_by_key.return_value = "MockSheet"
|
||||
feeder = setup_module(
|
||||
"gsheet_feeder",
|
||||
"gsheet_feeder_db",
|
||||
{"service_account": "dummy.json", "sheet": None, "sheet_id": "ABC123"},
|
||||
)
|
||||
sheet = feeder.open_sheet()
|
||||
@@ -170,7 +163,7 @@ def test_open_sheet_with_sheet_id(setup_module, mocker):
|
||||
def test_should_process_sheet(setup_module, mocker):
|
||||
mocker.patch("gspread.service_account")
|
||||
gdb = setup_module(
|
||||
"gsheet_feeder",
|
||||
"gsheet_feeder_db",
|
||||
{
|
||||
"service_account": "dummy.json",
|
||||
"sheet": "TestSheet",
|
||||
@@ -179,18 +172,18 @@ def test_should_process_sheet(setup_module, mocker):
|
||||
"block_worksheets": {"Sheet3"},
|
||||
},
|
||||
)
|
||||
assert gdb.should_process_sheet("TestSheet") == True
|
||||
assert gdb.should_process_sheet("Sheet3") == False
|
||||
assert gdb.should_process_sheet("TestSheet") is True
|
||||
assert gdb.should_process_sheet("Sheet3") is False
|
||||
# False if allow_worksheets is set
|
||||
assert gdb.should_process_sheet("AnotherSheet") == False
|
||||
assert gdb.should_process_sheet("AnotherSheet") is False
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="Requires a real connection")
|
||||
class TestGSheetsFeederReal:
|
||||
"""Testing GSheetsFeeder class"""
|
||||
"""Testing GsheetsFeeder class"""
|
||||
|
||||
module_name: str = "gsheet_feeder"
|
||||
feeder: GsheetsFeeder
|
||||
module_name: str = "gsheet_feeder_db"
|
||||
feeder: GsheetsFeederDB
|
||||
# You must follow the setup process explain in the docs for this to work
|
||||
config: dict = {
|
||||
"service_account": "secrets/service_account.json",
|
||||
@@ -220,9 +213,7 @@ class TestGSheetsFeederReal:
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def setup_feeder(self, setup_module):
|
||||
assert (
|
||||
self.module_name is not None
|
||||
), "self.module_name must be set on the subclass"
|
||||
assert self.module_name is not None, "self.module_name must be set on the subclass"
|
||||
assert self.config is not None, "self.config must be a dict set on the subclass"
|
||||
self.feeder: Type[Feeder] = setup_module(self.module_name, self.config)
|
||||
|
||||
@@ -241,9 +232,7 @@ class TestGSheetsFeederReal:
|
||||
"""Ensure open_sheet() connects to a real Google Sheets instance."""
|
||||
sheet = self.feeder.open_sheet()
|
||||
assert sheet is not None, "open_sheet() should return a valid sheet instance"
|
||||
assert hasattr(
|
||||
sheet, "worksheets"
|
||||
), "Returned object should have worksheets method"
|
||||
assert hasattr(sheet, "worksheets"), "Returned object should have worksheets method"
|
||||
|
||||
def test_iter_yields_metadata_real_data(self):
|
||||
"""Ensure __iter__() yields Metadata objects for real test sheet data."""
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
# Note this isn't a feeder, but contained as utility of the gsheet feeder module
|
||||
import pytest
|
||||
|
||||
from auto_archiver.modules.gsheet_feeder import GWorksheet
|
||||
from auto_archiver.modules.gsheet_feeder_db import GWorksheet
|
||||
|
||||
|
||||
class TestGWorksheet:
|
||||
@@ -81,40 +81,27 @@ class TestGWorksheet:
|
||||
(False, ""),
|
||||
],
|
||||
)
|
||||
def test_get_cell_or_default_handles_empty_values(
|
||||
self, mock_worksheet, when_empty, expected
|
||||
):
|
||||
def test_get_cell_or_default_handles_empty_values(self, mock_worksheet, when_empty, expected):
|
||||
mock_worksheet.get_values.return_value[1][0] = "" # Empty URL cell
|
||||
g = GWorksheet(mock_worksheet)
|
||||
assert (
|
||||
g.get_cell_or_default(
|
||||
2, "url", default="default", when_empty_use_default=when_empty
|
||||
)
|
||||
== expected
|
||||
)
|
||||
assert g.get_cell_or_default(2, "url", default="default", when_empty_use_default=when_empty) == expected
|
||||
|
||||
def test_get_cell_or_default_handles_missing_columns(self, gworksheet):
|
||||
assert (
|
||||
gworksheet.get_cell_or_default(1, "invalid_col", default="safe") == "safe"
|
||||
)
|
||||
assert gworksheet.get_cell_or_default(1, "invalid_col", default="safe") == "safe"
|
||||
|
||||
# Test write operations
|
||||
def test_set_cell_updates_correct_position(self, mock_worksheet, gworksheet):
|
||||
gworksheet.set_cell(2, "url", "new_url")
|
||||
mock_worksheet.update_cell.assert_called_once_with(2, 1, "new_url")
|
||||
|
||||
def test_batch_set_cell_formats_requests_correctly(
|
||||
self, mock_worksheet, gworksheet
|
||||
):
|
||||
def test_batch_set_cell_formats_requests_correctly(self, mock_worksheet, gworksheet):
|
||||
updates = [(2, "url", "new_url"), (3, "status", "processed")]
|
||||
gworksheet.batch_set_cell(updates)
|
||||
expected_batch = [
|
||||
{"range": "A2", "values": [["new_url"]]},
|
||||
{"range": "B3", "values": [["processed"]]},
|
||||
]
|
||||
mock_worksheet.batch_update.assert_called_once_with(
|
||||
expected_batch, value_input_option="USER_ENTERED"
|
||||
)
|
||||
mock_worksheet.batch_update.assert_called_once_with(expected_batch, value_input_option="USER_ENTERED")
|
||||
|
||||
def test_batch_set_cell_truncates_long_values(self, mock_worksheet, gworksheet):
|
||||
long_value = "x" * 50000
|
||||
|
||||
@@ -5,13 +5,13 @@ from auto_archiver.core import Metadata, Media
|
||||
def test_format(setup_module):
|
||||
formatter = setup_module(HtmlFormatter)
|
||||
|
||||
metadata = Metadata().set("content", "Hello, world!").set_url('https://example.com')
|
||||
metadata = Metadata().set("content", "Hello, world!").set_url("https://example.com")
|
||||
|
||||
final_media = formatter.format(metadata)
|
||||
assert isinstance(final_media, Media)
|
||||
assert ".html" in final_media.filename
|
||||
with open (final_media.filename, "r", encoding="utf-8") as f:
|
||||
with open(final_media.filename, "r", encoding="utf-8") as f:
|
||||
content = f.read()
|
||||
assert "Hello, world!" in content
|
||||
assert final_media.mimetype == "text/html"
|
||||
assert "SHA-256:" in final_media.get('hash')
|
||||
assert "SHA-256:" in final_media.get("hash")
|
||||
|
||||
@@ -8,6 +8,7 @@ class TestS3Storage:
|
||||
"""
|
||||
Test suite for S3Storage.
|
||||
"""
|
||||
|
||||
module_name: str = "s3_storage"
|
||||
storage: Type[S3Storage]
|
||||
config: dict = {
|
||||
@@ -32,28 +33,28 @@ class TestS3Storage:
|
||||
"""Test that S3 client is initialized with correct parameters"""
|
||||
|
||||
assert self.storage.s3 is not None
|
||||
assert self.storage.s3.meta.region_name == 'test-region'
|
||||
assert self.storage.s3.meta.region_name == "test-region"
|
||||
|
||||
def test_get_cdn_url_generation(self):
|
||||
"""Test CDN URL formatting """
|
||||
"""Test CDN URL formatting"""
|
||||
media = Media("test.txt")
|
||||
media.key = "path/to/file.txt"
|
||||
media._key = "path/to/file.txt"
|
||||
url = self.storage.get_cdn_url(media)
|
||||
assert url == "https://cdn.example.com/path/to/file.txt"
|
||||
media.key = "another/path.jpg"
|
||||
media._key = "another/path.jpg"
|
||||
assert self.storage.get_cdn_url(media) == "https://cdn.example.com/another/path.jpg"
|
||||
|
||||
def test_uploadf_sets_acl_public(self, mocker):
|
||||
media = Media("test.txt")
|
||||
mock_file = mocker.MagicMock()
|
||||
mock_s3_upload = mocker.patch.object(self.storage.s3, 'upload_fileobj')
|
||||
mocker.patch.object(self.storage, 'is_upload_needed', return_value=True)
|
||||
mock_s3_upload = mocker.patch.object(self.storage.s3, "upload_fileobj")
|
||||
mocker.patch.object(self.storage, "is_upload_needed", return_value=True)
|
||||
self.storage.uploadf(mock_file, media)
|
||||
mock_s3_upload.assert_called_once_with(
|
||||
mock_file,
|
||||
Bucket='test-bucket',
|
||||
Bucket="test-bucket",
|
||||
Key=media.key,
|
||||
ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/plain'}
|
||||
ExtraArgs={"ACL": "public-read", "ContentType": "text/plain"},
|
||||
)
|
||||
|
||||
def test_upload_decision_logic(self, mocker):
|
||||
@@ -61,45 +62,48 @@ class TestS3Storage:
|
||||
media = Media("test.txt")
|
||||
assert self.storage.is_upload_needed(media) is True
|
||||
self.storage.random_no_duplicate = True
|
||||
mock_calc_hash = mocker.patch('auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash', return_value='beepboop123beepboop123beepboop123')
|
||||
mock_file_in_folder = mocker.patch.object(self.storage, 'file_in_folder', return_value='existing_key.txt')
|
||||
mocker.patch(
|
||||
"auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash",
|
||||
return_value="beepboop123beepboop123beepboop123",
|
||||
)
|
||||
mock_file_in_folder = mocker.patch.object(self.storage, "file_in_folder", return_value="existing_key.txt")
|
||||
assert self.storage.is_upload_needed(media) is False
|
||||
assert media.key == 'existing_key.txt'
|
||||
mock_file_in_folder.assert_called_with('no-dups/beepboop123beepboop123be')
|
||||
assert media.key == "existing_key.txt"
|
||||
mock_file_in_folder.assert_called_with("no-dups/beepboop123beepboop123be")
|
||||
|
||||
def test_skips_upload_when_duplicate_exists(self, mocker):
|
||||
"""Test that upload skips when file_in_folder finds existing object"""
|
||||
self.storage.random_no_duplicate = True
|
||||
mock_file_in_folder = mocker.patch.object(S3Storage, 'file_in_folder', return_value="existing_folder/existing_file.txt")
|
||||
mocker.patch.object(S3Storage, "file_in_folder", return_value="existing_folder/existing_file.txt")
|
||||
media = Media("test.txt")
|
||||
media.key = "original_path.txt"
|
||||
mock_calculate_hash = mocker.patch('auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash', return_value="beepboop123beepboop123beepboop123")
|
||||
media._key = "original_path.txt"
|
||||
mocker.patch(
|
||||
"auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash",
|
||||
return_value="beepboop123beepboop123beepboop123",
|
||||
)
|
||||
assert self.storage.is_upload_needed(media) is False
|
||||
assert media.key == "existing_folder/existing_file.txt"
|
||||
assert media.get("previously archived") is True
|
||||
mock_upload = mocker.patch.object(self.storage.s3, 'upload_fileobj')
|
||||
mock_upload = mocker.patch.object(self.storage.s3, "upload_fileobj")
|
||||
result = self.storage.uploadf(None, media)
|
||||
mock_upload.assert_not_called()
|
||||
assert result is True
|
||||
|
||||
def test_uploads_with_correct_parameters(self, mocker):
|
||||
media = Media("test.txt")
|
||||
media.key = "original_key.txt"
|
||||
mocker.patch.object(S3Storage, 'is_upload_needed', return_value=True)
|
||||
media.mimetype = 'image/png'
|
||||
media._key = "original_key.txt"
|
||||
mocker.patch.object(S3Storage, "is_upload_needed", return_value=True)
|
||||
media.mimetype = "image/png"
|
||||
mock_file = mocker.MagicMock()
|
||||
mock_upload = mocker.patch.object(self.storage.s3, 'upload_fileobj')
|
||||
mock_upload = mocker.patch.object(self.storage.s3, "upload_fileobj")
|
||||
self.storage.uploadf(mock_file, media)
|
||||
mock_upload.assert_called_once_with(
|
||||
mock_file,
|
||||
Bucket='test-bucket',
|
||||
Key='original_key.txt',
|
||||
ExtraArgs={
|
||||
'ACL': 'public-read',
|
||||
'ContentType': 'image/png'
|
||||
}
|
||||
Bucket="test-bucket",
|
||||
Key="original_key.txt",
|
||||
ExtraArgs={"ACL": "public-read", "ContentType": "image/png"},
|
||||
)
|
||||
|
||||
def test_file_in_folder_exists(self, mocker):
|
||||
mock_list_objects = mocker.patch.object(self.storage.s3, 'list_objects', return_value={'Contents': [{'Key': 'path/to/file.txt'}]})
|
||||
assert self.storage.file_in_folder('path/to/') == 'path/to/file.txt'
|
||||
mocker.patch.object(self.storage.s3, "list_objects", return_value={"Contents": [{"Key": "path/to/file.txt"}]})
|
||||
assert self.storage.file_in_folder("path/to/") == "path/to/file.txt"
|
||||
|
||||
@@ -2,7 +2,7 @@ import os
|
||||
import hashlib
|
||||
import pytest
|
||||
from auto_archiver.core import Media, Metadata
|
||||
from auto_archiver.modules.atlos_storage import AtlosStorage
|
||||
from auto_archiver.modules.atlos_feeder_db_storage import AtlosFeederDbStorage as AtlosStorage
|
||||
|
||||
|
||||
class FakeAPIResponse:
|
||||
@@ -21,13 +21,19 @@ class FakeAPIResponse:
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def atlos_storage(setup_module) -> AtlosStorage:
|
||||
def atlos_storage(setup_module, mocker) -> AtlosStorage:
|
||||
"""Fixture for AtlosStorage."""
|
||||
configs: dict = {
|
||||
"api_token": "abc123",
|
||||
"atlos_url": "https://platform.atlos.org",
|
||||
}
|
||||
return setup_module("atlos_storage", configs)
|
||||
mocker.patch("requests.Session")
|
||||
atlos_feeder = setup_module("atlos_feeder_db_storage", configs)
|
||||
mock_session = mocker.MagicMock()
|
||||
# Configure the default response to have no results so that __iter__ terminates
|
||||
mock_session.get.return_value = FakeAPIResponse({"next": None, "results": []})
|
||||
atlos_feeder.session = mock_session
|
||||
return atlos_feeder
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@@ -38,7 +44,7 @@ def media(tmp_path) -> Media:
|
||||
file_path.write_bytes(content)
|
||||
media = Media(filename=str(file_path))
|
||||
media.properties = {"something": "Title"}
|
||||
media.key = "key"
|
||||
media._key = "key"
|
||||
return media
|
||||
|
||||
|
||||
@@ -49,17 +55,6 @@ def test_get_cdn_url(atlos_storage: AtlosStorage) -> None:
|
||||
assert url == atlos_storage.atlos_url
|
||||
|
||||
|
||||
def test_hash(tmp_path, atlos_storage: AtlosStorage) -> None:
|
||||
"""Test _hash() computes the correct SHA-256 hash of a file."""
|
||||
content = b"hello world"
|
||||
file_path = tmp_path / "test.txt"
|
||||
file_path.write_bytes(content)
|
||||
media = Media(filename="dummy.mp4")
|
||||
media.filename = str(file_path)
|
||||
expected_hash = hashlib.sha256(content).hexdigest()
|
||||
assert atlos_storage._hash(media) == expected_hash
|
||||
|
||||
|
||||
def test_upload_no_atlos_id(tmp_path, atlos_storage: AtlosStorage, media: Media, mocker) -> None:
|
||||
"""Test upload() returns False when metadata lacks atlos_id."""
|
||||
metadata = Metadata() # atlos_id not set
|
||||
@@ -69,74 +64,49 @@ def test_upload_no_atlos_id(tmp_path, atlos_storage: AtlosStorage, media: Media,
|
||||
post_mock.assert_not_called()
|
||||
|
||||
|
||||
def test_upload_already_uploaded(atlos_storage: AtlosStorage,
|
||||
metadata: Metadata,
|
||||
media: Media,
|
||||
tmp_path,
|
||||
mocker) -> None:
|
||||
def test_upload_already_uploaded(atlos_storage: AtlosStorage, metadata: Metadata, media: Media, mocker) -> None:
|
||||
"""Test upload() returns True if media hash already exists."""
|
||||
content = b"media content"
|
||||
metadata.set("atlos_id", 101)
|
||||
media_hash = hashlib.sha256(content).hexdigest()
|
||||
fake_get = FakeAPIResponse({
|
||||
"result": {"artifacts": [{"file_hash_sha256": media_hash}]}
|
||||
})
|
||||
get_mock = mocker.patch("requests.get", return_value=fake_get)
|
||||
post_mock = mocker.patch("requests.post")
|
||||
fake_get_response = {"result": {"artifacts": [{"file_hash_sha256": media_hash}]}}
|
||||
get_mock = mocker.patch.object(atlos_storage, "_get", return_value=fake_get_response)
|
||||
post_mock = mocker.patch.object(atlos_storage, "_post")
|
||||
result = atlos_storage.upload(media, metadata)
|
||||
assert result is True
|
||||
get_mock.assert_called_once()
|
||||
post_mock.assert_not_called()
|
||||
|
||||
|
||||
def test_upload_not_uploaded(tmp_path, atlos_storage: AtlosStorage,
|
||||
metadata: Metadata,
|
||||
media: Media,
|
||||
mocker) -> None:
|
||||
def test_upload_not_uploaded(tmp_path, atlos_storage: AtlosStorage, metadata: Metadata, media: Media, mocker) -> None:
|
||||
"""Test upload() uploads media when not already present."""
|
||||
metadata.set("atlos_id", 202)
|
||||
fake_get = FakeAPIResponse({
|
||||
"result": {"artifacts": [{"file_hash_sha256": "different_hash"}]}
|
||||
})
|
||||
get_mock = mocker.patch("requests.get", return_value=fake_get)
|
||||
fake_post = FakeAPIResponse({}, raise_error=False)
|
||||
post_mock = mocker.patch("requests.post", return_value=fake_post)
|
||||
fake_get_response = {"result": {"artifacts": [{"file_hash_sha256": "different_hash"}]}}
|
||||
get_mock = mocker.patch.object(atlos_storage, "_get", return_value=fake_get_response)
|
||||
fake_post_response = {"result": "uploaded"}
|
||||
post_mock = mocker.patch.object(atlos_storage, "_post", return_value=fake_post_response)
|
||||
result = atlos_storage.upload(media, metadata)
|
||||
assert result is True
|
||||
|
||||
get_mock.assert_called_once()
|
||||
post_mock.assert_called_once()
|
||||
expected_url = f"{atlos_storage.atlos_url}/api/v2/source_material/upload/202"
|
||||
expected_headers = {"Authorization": f"Bearer {atlos_storage.api_token}"}
|
||||
expected_endpoint = "/api/v2/source_material/upload/202"
|
||||
call_args = post_mock.call_args[0]
|
||||
assert call_args[0] == expected_endpoint
|
||||
call_kwargs = post_mock.call_args[1]
|
||||
expected_params = {"title": media.properties}
|
||||
call_kwargs = post_mock.call_args.kwargs
|
||||
assert call_kwargs["headers"] == expected_headers
|
||||
assert call_kwargs["params"] == expected_params
|
||||
# Verify the URL passed to requests.post.
|
||||
posted_url = call_kwargs.get("url") or post_mock.call_args.args[0]
|
||||
assert posted_url == expected_url
|
||||
# Verify files parameter contains the correct filename.
|
||||
file_tuple = call_kwargs["files"]["file"]
|
||||
assert file_tuple[0] == os.path.basename(media.filename)
|
||||
|
||||
|
||||
def test_upload_post_http_error(tmp_path,
|
||||
atlos_storage: AtlosStorage,
|
||||
metadata: Metadata,
|
||||
media: Media,
|
||||
mocker) -> None:
|
||||
def test_upload_post_http_error(
|
||||
tmp_path, atlos_storage: AtlosStorage, metadata: Metadata, media: Media, mocker
|
||||
) -> None:
|
||||
"""Test upload() propagates HTTP error during POST."""
|
||||
metadata.set("atlos_id", 303)
|
||||
fake_get = FakeAPIResponse({
|
||||
"result": {"artifacts": []}
|
||||
})
|
||||
mocker.patch("requests.get", return_value=fake_get)
|
||||
fake_post = FakeAPIResponse({}, raise_error=True)
|
||||
mocker.patch("requests.post", return_value=fake_post)
|
||||
fake_get_response = {"result": {"artifacts": []}}
|
||||
mocker.patch.object(atlos_storage, "_get", return_value=fake_get_response)
|
||||
mocker.patch.object(atlos_storage, "_post", side_effect=Exception("HTTP error"))
|
||||
with pytest.raises(Exception, match="HTTP error"):
|
||||
atlos_storage.upload(media, metadata)
|
||||
|
||||
|
||||
def test_uploadf_not_implemented(atlos_storage: AtlosStorage) -> None:
|
||||
"""Test uploadf() returns None (not implemented)."""
|
||||
result = atlos_storage.uploadf(None, "dummy")
|
||||
assert result is None
|
||||
|
||||
@@ -1,37 +1,42 @@
|
||||
from typing import Type
|
||||
import pytest
|
||||
from oauth2client import service_account
|
||||
|
||||
from auto_archiver.core import Media
|
||||
from auto_archiver.modules.gdrive_storage import GDriveStorage
|
||||
from auto_archiver.core.metadata import Metadata
|
||||
from tests.storages.test_storage_base import TestStorageBase
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def mock_sleep(mocker):
|
||||
"""Mock time.sleep to avoid delays."""
|
||||
return mocker.patch("time.sleep")
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def gdrive_storage(setup_module, mocker):
|
||||
def gdrive_storage(setup_module, mocker) -> GDriveStorage:
|
||||
module_name: str = "gdrive_storage"
|
||||
storage: GDriveStorage
|
||||
config: dict = {'path_generator': 'url',
|
||||
'filename_generator': 'static',
|
||||
'root_folder_id': "fake_root_folder_id",
|
||||
'oauth_token': None,
|
||||
'service_account': 'fake_service_account.json'
|
||||
}
|
||||
mocker.patch('google.oauth2.service_account.Credentials.from_service_account_file')
|
||||
config: dict = {
|
||||
"path_generator": "url",
|
||||
"filename_generator": "static",
|
||||
"root_folder_id": "fake_root_folder_id",
|
||||
"oauth_token": None,
|
||||
"service_account": "fake_service_account.json",
|
||||
}
|
||||
mocker.patch("google.oauth2.service_account.Credentials.from_service_account_file")
|
||||
return setup_module(module_name, config)
|
||||
|
||||
|
||||
def test_initialize_fails_with_non_existent_creds(setup_module):
|
||||
"""Test that the Google Drive service raises a FileNotFoundError when the service account file does not exist.
|
||||
(and isn't mocked)
|
||||
(and isn't mocked)
|
||||
"""
|
||||
config: dict = {'path_generator': 'url',
|
||||
'filename_generator': 'static',
|
||||
'root_folder_id': "fake_root_folder_id",
|
||||
'oauth_token': None,
|
||||
'service_account': 'fake_service_account.json'
|
||||
}
|
||||
config: dict = {
|
||||
"path_generator": "url",
|
||||
"filename_generator": "static",
|
||||
"root_folder_id": "fake_root_folder_id",
|
||||
"oauth_token": None,
|
||||
"service_account": "fake_service_account.json",
|
||||
}
|
||||
with pytest.raises(FileNotFoundError) as exc_info:
|
||||
setup_module("gdrive_storage", config)
|
||||
assert "No such file or directory" in str(exc_info.value)
|
||||
@@ -48,10 +53,10 @@ def test_get_id_from_parent_and_name(gdrive_storage, mocker):
|
||||
result = gdrive_storage._get_id_from_parent_and_name("parent", "mock", retries=1, use_mime_type=False)
|
||||
assert result == "123"
|
||||
|
||||
|
||||
def test_path_parts():
|
||||
media = Media(filename="test.jpg")
|
||||
media.key = "folder1/folder2/test.jpg"
|
||||
|
||||
media._key = "folder1/folder2/test.jpg"
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="Requires real credentials")
|
||||
@@ -63,19 +68,17 @@ class TestGDriveStorageConnected(TestStorageBase):
|
||||
|
||||
module_name: str = "gdrive_storage"
|
||||
storage: Type[GDriveStorage]
|
||||
config: dict = {'path_generator': 'url',
|
||||
'filename_generator': 'static',
|
||||
# TODO: replace with real root folder id
|
||||
'root_folder_id': "1TVY_oJt95_dmRSEdP9m5zFy7l50TeCSk",
|
||||
'oauth_token': None,
|
||||
'service_account': 'secrets/service_account.json'
|
||||
}
|
||||
|
||||
config: dict = {
|
||||
"path_generator": "url",
|
||||
"filename_generator": "static",
|
||||
# TODO: replace with real root folder id
|
||||
"root_folder_id": "1TVY_oJt95_dmRSEdP9m5zFy7l50TeCSk",
|
||||
"oauth_token": None,
|
||||
"service_account": "secrets/service_account.json",
|
||||
}
|
||||
|
||||
def test_initialize_with_real_credentials(self):
|
||||
"""
|
||||
Test that the Google Drive service can be initialized with real credentials.
|
||||
"""
|
||||
assert self.storage.service is not None
|
||||
|
||||
|
||||
|
||||
@@ -1,43 +1,63 @@
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from auto_archiver.core import Media
|
||||
from auto_archiver.core import Media, Metadata
|
||||
from auto_archiver.modules.local_storage import LocalStorage
|
||||
from auto_archiver.core.consts import SetupError
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def local_storage(setup_module) -> LocalStorage:
|
||||
def local_storage(setup_module, tmp_path) -> LocalStorage:
|
||||
save_to = tmp_path / "local_archive"
|
||||
save_to.mkdir()
|
||||
configs: dict = {
|
||||
"path_generator": "flat",
|
||||
"filename_generator": "static",
|
||||
"save_to": "./local_archive",
|
||||
"save_to": str(save_to),
|
||||
"save_absolute": False,
|
||||
}
|
||||
return setup_module("local_storage", configs)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sample_media(tmp_path) -> Media:
|
||||
"""Fixture creating a Media object with temporary source file"""
|
||||
src_file = tmp_path / "source.txt"
|
||||
src_file.write_text("test content")
|
||||
return Media(filename=str(src_file))
|
||||
|
||||
|
||||
def test_too_long_save_path(setup_module):
|
||||
with pytest.raises(SetupError):
|
||||
setup_module("local_storage", {"save_to": "long" * 100})
|
||||
|
||||
def test_get_cdn_url_relative(local_storage):
|
||||
media = Media(key="test.txt", filename="dummy.txt")
|
||||
local_storage.filename_generator = "random"
|
||||
media = Media(filename="dummy.txt")
|
||||
local_storage.set_key(media, "https://example.com", Metadata())
|
||||
expected = os.path.join(local_storage.save_to, media.key)
|
||||
assert local_storage.get_cdn_url(media) == expected
|
||||
|
||||
|
||||
def test_get_cdn_url_absolute(local_storage):
|
||||
media = Media(key="test.txt", filename="dummy.txt")
|
||||
local_storage.filename_generator = "random"
|
||||
|
||||
media = Media(filename="dummy.txt")
|
||||
local_storage.save_absolute = True
|
||||
local_storage.set_key(media, "https://example.com", Metadata())
|
||||
expected = os.path.abspath(os.path.join(local_storage.save_to, media.key))
|
||||
assert local_storage.get_cdn_url(media) == expected
|
||||
|
||||
|
||||
def test_upload_file_contents_and_metadata(local_storage, sample_media):
|
||||
local_storage.store(sample_media, "https://example.com", Metadata())
|
||||
dest = os.path.join(local_storage.save_to, sample_media.key)
|
||||
assert local_storage.upload(sample_media) is True
|
||||
assert Path(sample_media.filename).read_text() == Path(dest).read_text()
|
||||
|
||||
|
||||
def test_upload_nonexistent_source(local_storage):
|
||||
media = Media(key="missing.txt", filename="nonexistent.txt")
|
||||
media = Media(_key="missing.txt", filename="nonexistent.txt")
|
||||
with pytest.raises(FileNotFoundError):
|
||||
local_storage.upload(media)
|
||||
|
||||
|
||||
|
||||
@@ -2,21 +2,109 @@ from typing import Type
|
||||
|
||||
import pytest
|
||||
|
||||
from auto_archiver.core.metadata import Metadata
|
||||
from auto_archiver.core.metadata import Metadata, Media
|
||||
from auto_archiver.core.storage import Storage
|
||||
from auto_archiver.core.module import ModuleFactory
|
||||
|
||||
|
||||
class TestStorageBase(object):
|
||||
|
||||
module_name: str = None
|
||||
config: dict = None
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def setup_storage(self, setup_module):
|
||||
assert (
|
||||
self.module_name is not None
|
||||
), "self.module_name must be set on the subclass"
|
||||
assert self.module_name is not None, "self.module_name must be set on the subclass"
|
||||
assert self.config is not None, "self.config must be a dict set on the subclass"
|
||||
self.storage: Type[Storage] = setup_module(
|
||||
self.module_name, self.config
|
||||
)
|
||||
self.storage: Type[Storage] = setup_module(self.module_name, self.config)
|
||||
|
||||
|
||||
class TestBaseStorage(Storage):
|
||||
name = "test_storage"
|
||||
|
||||
def get_cdn_url(self, media):
|
||||
return "cdn_url"
|
||||
|
||||
def uploadf(self, file, key, **kwargs):
|
||||
return True
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def dummy_file(tmp_path):
|
||||
# create dummy.txt file
|
||||
dummy_file = tmp_path / "dummy.txt"
|
||||
dummy_file.write_text("test content")
|
||||
return str(dummy_file)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def storage_base():
|
||||
def _storage_base(config):
|
||||
storage_base = TestBaseStorage()
|
||||
storage_base.config_setup({TestBaseStorage.name: config})
|
||||
storage_base.module_factory = ModuleFactory()
|
||||
return storage_base
|
||||
|
||||
return _storage_base
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"path_generator, filename_generator, url, expected_key",
|
||||
[
|
||||
("flat", "static", "https://example.com/file/", "folder/6ae8a75555209fd6c44157c0.txt"),
|
||||
("flat", "random", "https://example.com/file/", "folder/pretend-random.txt"),
|
||||
("url", "static", "https://example.com/file/", "folder/https-example-com-file/6ae8a75555209fd6c44157c0.txt"),
|
||||
("url", "random", "https://example.com/file/", "folder/https-example-com-file/pretend-random.txt"),
|
||||
("random", "static", "https://example.com/file/", "folder/pretend-random/6ae8a75555209fd6c44157c0.txt"),
|
||||
("random", "random", "https://example.com/file/", "folder/pretend-random/pretend-random.txt"),
|
||||
],
|
||||
)
|
||||
def test_storage_name_generation(
|
||||
storage_base, path_generator, filename_generator, url, expected_key, mocker, tmp_path, dummy_file
|
||||
):
|
||||
mock_random = mocker.patch("auto_archiver.core.storage.random_str")
|
||||
mock_random.return_value = "pretend-random"
|
||||
|
||||
config: dict = {
|
||||
"path_generator": path_generator,
|
||||
"filename_generator": filename_generator,
|
||||
}
|
||||
storage: Storage = storage_base(config)
|
||||
assert storage.path_generator == path_generator
|
||||
assert storage.filename_generator == filename_generator
|
||||
|
||||
metadata = Metadata()
|
||||
metadata.set_context("folder", "folder")
|
||||
media = Media(filename=dummy_file)
|
||||
storage.set_key(media, url, metadata)
|
||||
print(media.key)
|
||||
assert media.key == expected_key
|
||||
|
||||
|
||||
def test_really_long_name(storage_base, dummy_file):
|
||||
config: dict = {
|
||||
"path_generator": "url",
|
||||
"filename_generator": "static",
|
||||
}
|
||||
storage: Storage = storage_base(config)
|
||||
|
||||
url = f"https://example.com/{'file' * 100}"
|
||||
media = Media(filename=dummy_file)
|
||||
storage.set_key(media, url, Metadata())
|
||||
assert media.key == f"https-example-com-{'file' * 13}/6ae8a75555209fd6c44157c0.txt"
|
||||
|
||||
|
||||
def test_storage_loads_hash_enricher(storage_base, dummy_file):
|
||||
"""Ensure 'hash_enricher' is properly loaded without an explicit import."""
|
||||
config = {"path_generator": "url", "filename_generator": "static"}
|
||||
storage = storage_base(config)
|
||||
|
||||
url = "https://example.com/file/"
|
||||
media = Media(filename=dummy_file)
|
||||
metadata = Metadata()
|
||||
|
||||
try:
|
||||
storage.set_key(media, url, metadata)
|
||||
except Exception as e:
|
||||
pytest.fail(f"Storage failed to dynamically load hash_enricher: {e}")
|
||||
|
||||
assert media.key is not None, "Expected media.key to be set, but it was None"
|
||||
|
||||
@@ -3,39 +3,46 @@ from auto_archiver.core import config
|
||||
from ruamel.yaml.scanner import ScannerError
|
||||
from ruamel.yaml.comments import CommentedMap
|
||||
|
||||
|
||||
def test_return_default_config_for_nonexistent_file():
|
||||
assert config.read_yaml("nonexistent_file.yaml") == config.EMPTY_CONFIG
|
||||
|
||||
|
||||
def test_return_default_config_for_empty_file(tmp_path):
|
||||
empty_file = tmp_path / "empty_file.yaml"
|
||||
empty_file.write_text("")
|
||||
assert config.read_yaml(empty_file) == config.EMPTY_CONFIG
|
||||
|
||||
|
||||
def test_raise_error_on_invalid_yaml(tmp_path):
|
||||
invalid_yaml = tmp_path / "invalid_yaml.yaml"
|
||||
invalid_yaml.write_text("key: \"value_without_end_quote")
|
||||
invalid_yaml.write_text('key: "value_without_end_quote')
|
||||
# make sure it raises ScannerError
|
||||
with pytest.raises(ScannerError):
|
||||
config.read_yaml(invalid_yaml)
|
||||
|
||||
|
||||
def test_write_yaml(tmp_path):
|
||||
yaml_file = tmp_path / "write_yaml.yaml"
|
||||
config.store_yaml(config.EMPTY_CONFIG, yaml_file.as_posix())
|
||||
assert "steps:\n" in yaml_file.read_text()
|
||||
|
||||
|
||||
def test_round_trip_comments(tmp_path):
|
||||
yaml_file = tmp_path / "round_trip_comments.yaml"
|
||||
|
||||
with open(yaml_file, "w") as f:
|
||||
f.write("generic_extractor:\n facebook_cookie: abc # end of line comment\n subtitles: true\n # comments: false\n # livestreams: false\n list_type:\n - value1\n - value2")
|
||||
f.write(
|
||||
"generic_extractor:\n facebook_cookie: abc # end of line comment\n subtitles: true\n # comments: false\n # livestreams: false\n list_type:\n - value1\n - value2"
|
||||
)
|
||||
|
||||
loaded = config.read_yaml(yaml_file)
|
||||
# check the comments are preserved
|
||||
assert loaded['generic_extractor']['facebook_cookie'] == "abc"
|
||||
assert loaded['generic_extractor'].ca.items['facebook_cookie'][2].value == "# end of line comment\n"
|
||||
assert loaded["generic_extractor"]["facebook_cookie"] == "abc"
|
||||
assert loaded["generic_extractor"].ca.items["facebook_cookie"][2].value == "# end of line comment\n"
|
||||
|
||||
# add some more items to my_settings
|
||||
loaded['generic_extractor']['list_type'].append("bellingcat")
|
||||
loaded["generic_extractor"]["list_type"].append("bellingcat")
|
||||
config.store_yaml(loaded, yaml_file.as_posix())
|
||||
|
||||
assert "# comments: false" in yaml_file.read_text()
|
||||
@@ -43,14 +50,17 @@ def test_round_trip_comments(tmp_path):
|
||||
assert "abc # end of line comment" in yaml_file.read_text()
|
||||
assert "- value2\n - bellingcat" in yaml_file.read_text()
|
||||
|
||||
|
||||
def test_merge_dicts():
|
||||
yaml_dict = config.EMPTY_CONFIG
|
||||
yaml_dict['settings'] = CommentedMap(**{
|
||||
yaml_dict["settings"] = CommentedMap(
|
||||
**{
|
||||
"key1": ["a"],
|
||||
"key2": "old_value",
|
||||
"key3": ["a", "b", "c"],
|
||||
"key5": "value5",
|
||||
})
|
||||
}
|
||||
)
|
||||
|
||||
dotdict = {
|
||||
"settings.key1": ["b", "c"],
|
||||
@@ -67,15 +77,16 @@ def test_merge_dicts():
|
||||
|
||||
|
||||
def test_check_types():
|
||||
assert config.is_list_type([]) == True
|
||||
assert config.is_list_type(()) == True
|
||||
assert config.is_list_type(set()) == True
|
||||
assert config.is_list_type({}) == False
|
||||
assert config.is_list_type("") == False
|
||||
assert config.is_dict_type({}) == True
|
||||
assert config.is_dict_type(CommentedMap()) == True
|
||||
assert config.is_dict_type([]) == False
|
||||
assert config.is_dict_type("") == False
|
||||
assert config.is_list_type([]) is True
|
||||
assert config.is_list_type(()) is True
|
||||
assert config.is_list_type(set()) is True
|
||||
assert config.is_list_type({}) is False
|
||||
assert config.is_list_type("") is False
|
||||
assert config.is_dict_type({}) is True
|
||||
assert config.is_dict_type(CommentedMap()) is True
|
||||
assert config.is_dict_type([]) is False
|
||||
assert config.is_dict_type("") is False
|
||||
|
||||
|
||||
def test_from_dot_notation():
|
||||
dotdict = {
|
||||
@@ -88,16 +99,17 @@ def test_from_dot_notation():
|
||||
assert normal_dict["settings"]["key2"] == "new_value"
|
||||
assert normal_dict["settings"]["key3"]["key4"] == "value"
|
||||
|
||||
|
||||
def test_to_dot_notation():
|
||||
yaml_dict = config.EMPTY_CONFIG
|
||||
yaml_dict['settings'] = {
|
||||
yaml_dict["settings"] = {
|
||||
"key1": ["a", "b", "c"],
|
||||
"key2": "new_value",
|
||||
"key3": {
|
||||
"key4": "value",
|
||||
}
|
||||
},
|
||||
}
|
||||
dotdict = config.to_dot_notation(yaml_dict)
|
||||
assert dotdict["settings.key1"] == ["a", "b", "c"]
|
||||
assert dotdict["settings.key2"] == "new_value"
|
||||
assert dotdict["settings.key3.key4"] == "value"
|
||||
assert dotdict["settings.key3.key4"] == "value"
|
||||
|
||||
@@ -10,21 +10,23 @@ def orchestration_file_path(tmp_path):
|
||||
folder.mkdir(exist_ok=True)
|
||||
return (folder / "example_orch.yaml").as_posix()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def orchestration_file(orchestration_file_path):
|
||||
def _orchestration_file(content=''):
|
||||
def _orchestration_file(content=""):
|
||||
with open(orchestration_file_path, "w") as f:
|
||||
f.write(content)
|
||||
return orchestration_file_path
|
||||
|
||||
|
||||
return _orchestration_file
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def autoarchiver(tmp_path, monkeypatch, request):
|
||||
def _autoarchiver(args=[]):
|
||||
|
||||
def cleanup():
|
||||
from loguru import logger
|
||||
|
||||
if not logger._core.handlers.get(0):
|
||||
logger._core.handlers_count = 0
|
||||
logger.add(sys.stderr)
|
||||
@@ -44,9 +46,9 @@ def autoarchiver(tmp_path, monkeypatch, request):
|
||||
def test_run_auto_archiver_no_args(caplog, autoarchiver):
|
||||
with pytest.raises(SystemExit):
|
||||
autoarchiver()
|
||||
|
||||
assert "provide at least one URL via the command line, or set up an alternative feeder" in caplog.text
|
||||
|
||||
|
||||
def test_run_auto_archiver_invalid_file(caplog, autoarchiver):
|
||||
# exec 'auto-archiver' on the command lin
|
||||
with pytest.raises(SystemExit):
|
||||
@@ -54,6 +56,7 @@ def test_run_auto_archiver_invalid_file(caplog, autoarchiver):
|
||||
|
||||
assert "Make sure the file exists and try again, or run without th" in caplog.text
|
||||
|
||||
|
||||
def test_run_auto_archiver_empty_file(caplog, autoarchiver, orchestration_file):
|
||||
# create a valid (empty) orchestration file
|
||||
path = orchestration_file(content="")
|
||||
@@ -64,6 +67,7 @@ def test_run_auto_archiver_empty_file(caplog, autoarchiver, orchestration_file):
|
||||
# should treat an empty file as if there is no file at all
|
||||
assert " No URLs provided. Please provide at least one URL via the com" in caplog.text
|
||||
|
||||
|
||||
def test_call_autoarchiver_main(caplog, monkeypatch, tmp_path):
|
||||
from auto_archiver.__main__ import main
|
||||
|
||||
@@ -75,4 +79,4 @@ def test_call_autoarchiver_main(caplog, monkeypatch, tmp_path):
|
||||
with pytest.raises(SystemExit):
|
||||
main()
|
||||
|
||||
assert "No URLs provided. Please provide at least one" in caplog.text
|
||||
assert "No URLs provided. Please provide at least one" in caplog.text
|
||||
|
||||
@@ -62,18 +62,8 @@ def test_simple_merge(basic_metadata):
|
||||
|
||||
|
||||
def test_left_merge():
|
||||
left = (
|
||||
Metadata()
|
||||
.set("tags", ["a"])
|
||||
.set("stats", {"views": 10})
|
||||
.set("status", "success")
|
||||
)
|
||||
right = (
|
||||
Metadata()
|
||||
.set("tags", ["b"])
|
||||
.set("stats", {"likes": 5})
|
||||
.set("status", "no archiver")
|
||||
)
|
||||
left = Metadata().set("tags", ["a"]).set("stats", {"views": 10}).set("status", "success")
|
||||
right = Metadata().set("tags", ["b"]).set("stats", {"likes": 5}).set("status", "no archiver")
|
||||
|
||||
left.merge(right, overwrite_left=True)
|
||||
assert left.get("status") == "no archiver"
|
||||
@@ -120,6 +110,7 @@ def test_is_empty():
|
||||
def test_store():
|
||||
pass
|
||||
|
||||
|
||||
# Test Media operations
|
||||
|
||||
|
||||
@@ -176,6 +167,7 @@ def test_choose_most_complete():
|
||||
res = Metadata.choose_most_complete([m_more, m_less])
|
||||
assert res.metadata.get("title") == "Title 1"
|
||||
|
||||
|
||||
def test_choose_most_complete_from_pickles(unpickle):
|
||||
# test most complete from pickles before and after an enricher has run
|
||||
# Only compares length of media, not the actual media
|
||||
|
||||
@@ -1,40 +1,41 @@
|
||||
import sys
|
||||
import pytest
|
||||
from auto_archiver.core.module import ModuleFactory, LazyBaseModule
|
||||
from auto_archiver.core.base_module import BaseModule
|
||||
from auto_archiver.core.consts import SetupError
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def example_module():
|
||||
import auto_archiver
|
||||
|
||||
module_factory = ModuleFactory()
|
||||
|
||||
previous_path = auto_archiver.modules.__path__
|
||||
# previous_path = auto_archiver.modules.__path__
|
||||
auto_archiver.modules.__path__.append("tests/data/test_modules/")
|
||||
|
||||
return module_factory.get_module_lazy("example_module")
|
||||
|
||||
|
||||
def test_get_module_lazy(example_module):
|
||||
assert example_module.name == "example_module"
|
||||
assert example_module.display_name == "Example Module"
|
||||
|
||||
assert example_module.manifest is not None
|
||||
|
||||
|
||||
def test_python_dependency_check(example_module):
|
||||
# example_module requires loguru, which is not installed
|
||||
# monkey patch the manifest to include a nonexistnet dependency
|
||||
example_module.manifest["dependencies"]["python"] = ["does_not_exist"]
|
||||
|
||||
with pytest.raises(SystemExit) as load_error:
|
||||
with pytest.raises(SetupError):
|
||||
example_module.load({})
|
||||
|
||||
assert load_error.value.code == 1
|
||||
|
||||
def test_binary_dependency_check(example_module):
|
||||
# example_module requires ffmpeg, which is not installed
|
||||
# monkey patch the manifest to include a nonexistnet dependency
|
||||
example_module.manifest["dependencies"]["binary"] = ["does_not_exist"]
|
||||
|
||||
|
||||
def test_module_dependency_check_loads_module(example_module):
|
||||
# example_module requires cli_feeder, which is not installed
|
||||
# monkey patch the manifest to include a nonexistnet dependency
|
||||
@@ -49,19 +50,20 @@ def test_module_dependency_check_loads_module(example_module):
|
||||
assert module_factory._lazy_modules["hash_enricher"] is not None
|
||||
assert module_factory._lazy_modules["hash_enricher"]._instance is not None
|
||||
|
||||
def test_load_module(example_module):
|
||||
|
||||
def test_load_module(example_module):
|
||||
# setup the module, and check that config is set to the default values
|
||||
loaded_module = example_module.load({})
|
||||
assert loaded_module is not None
|
||||
assert isinstance(loaded_module, BaseModule)
|
||||
assert loaded_module.name == "example_module"
|
||||
assert loaded_module.display_name == "Example Module"
|
||||
assert loaded_module.config["example_module"] == {"csv_file" : "db.csv"}
|
||||
assert loaded_module.config["example_module"] == {"csv_file": "db.csv"}
|
||||
|
||||
# check that the vlaue is set on the module itself
|
||||
assert loaded_module.csv_file == "db.csv"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("module_name", ["local_storage", "generic_extractor", "html_formatter", "csv_db"])
|
||||
def test_load_modules(module_name):
|
||||
# test that specific modules can be loaded
|
||||
@@ -78,6 +80,20 @@ def test_load_modules(module_name):
|
||||
# check that default settings are applied
|
||||
default_config = module.configs
|
||||
assert loaded_module.name in loaded_module.config.keys()
|
||||
defaults = {k for k in default_config}
|
||||
assert defaults in [loaded_module.config[module_name].keys()]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("module_name", ["local_storage", "generic_extractor", "html_formatter", "csv_db"])
|
||||
def test_config_defaults(module_name):
|
||||
# test the values of the default config values are set
|
||||
# Note: some modules can alter values in the setup() method, this test checks cases that don't
|
||||
module = ModuleFactory().get_module_lazy(module_name)
|
||||
loaded_module = module.load({})
|
||||
# check that default config values are set
|
||||
default_config = module.configs
|
||||
defaults = {k: v.get("default") for k, v in default_config.items()}
|
||||
assert defaults == loaded_module.config[module_name]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("module_name", ["local_storage", "generic_extractor", "html_formatter", "csv_db"])
|
||||
@@ -96,5 +112,3 @@ def test_lazy_base_module(module_name):
|
||||
assert len(lazy_module.configs) > 0
|
||||
assert len(lazy_module.description) > 0
|
||||
assert len(lazy_module.version) > 0
|
||||
|
||||
|
||||
|
||||
@@ -1,59 +1,73 @@
|
||||
import pytest
|
||||
import sys
|
||||
from argparse import ArgumentParser, ArgumentTypeError
|
||||
from auto_archiver.core.orchestrator import ArchivingOrchestrator
|
||||
from auto_archiver.version import __version__
|
||||
from auto_archiver.core.config import read_yaml, store_yaml
|
||||
from auto_archiver.core import Metadata
|
||||
from auto_archiver.core.consts import SetupError
|
||||
|
||||
TEST_ORCHESTRATION = "tests/data/test_orchestration.yaml"
|
||||
TEST_MODULES = "tests/data/test_modules/"
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def test_args():
|
||||
return ["--config", TEST_ORCHESTRATION,
|
||||
"--module_paths", TEST_MODULES,
|
||||
"--example_module.required_field", "some_value"] # just set this for normal testing, we will remove it later
|
||||
return [
|
||||
"--config",
|
||||
TEST_ORCHESTRATION,
|
||||
"--module_paths",
|
||||
TEST_MODULES,
|
||||
"--example_module.required_field",
|
||||
"some_value",
|
||||
] # just set this for normal testing, we will remove it later
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def orchestrator():
|
||||
return ArchivingOrchestrator()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def basic_parser(orchestrator) -> ArgumentParser:
|
||||
return orchestrator.setup_basic_parser()
|
||||
|
||||
|
||||
def test_setup_orchestrator(orchestrator):
|
||||
assert orchestrator is not None
|
||||
|
||||
|
||||
def test_parse_config():
|
||||
pass
|
||||
|
||||
|
||||
def test_parse_basic(basic_parser):
|
||||
args = basic_parser.parse_args(["--config", TEST_ORCHESTRATION])
|
||||
assert args.config_file == TEST_ORCHESTRATION
|
||||
|
||||
|
||||
@pytest.mark.parametrize("mode", ["simple", "full"])
|
||||
def test_mode(basic_parser, mode):
|
||||
args = basic_parser.parse_args(["--mode", mode])
|
||||
assert args.mode == mode
|
||||
|
||||
|
||||
def test_mode_invalid(basic_parser, capsys):
|
||||
with pytest.raises(SystemExit) as exit_error:
|
||||
basic_parser.parse_args(["--mode", "invalid"])
|
||||
assert exit_error.value.code == 2
|
||||
assert "invalid choice" in capsys.readouterr().err
|
||||
|
||||
|
||||
def test_version(basic_parser, capsys):
|
||||
with pytest.raises(SystemExit) as exit_error:
|
||||
basic_parser.parse_args(["--version"])
|
||||
assert exit_error.value.code == 0
|
||||
assert capsys.readouterr().out == f"{__version__}\n"
|
||||
|
||||
def test_help(orchestrator, basic_parser, capsys):
|
||||
|
||||
def test_help(orchestrator, basic_parser, capsys):
|
||||
args = basic_parser.parse_args(["--help"])
|
||||
assert args.help == True
|
||||
assert args.help is True
|
||||
|
||||
# test the show_help() on orchestrator
|
||||
with pytest.raises(SystemExit) as exit_error:
|
||||
@@ -78,19 +92,22 @@ def test_help(orchestrator, basic_parser, capsys):
|
||||
assert "--logging.level" in logs
|
||||
|
||||
# individual module configs
|
||||
assert "--gsheet_feeder.sheet_id" in logs
|
||||
assert "--gsheet_feeder_db.sheet_id" in logs
|
||||
|
||||
|
||||
def test_add_custom_modules_path(orchestrator, test_args):
|
||||
orchestrator.setup_config(test_args)
|
||||
|
||||
|
||||
import auto_archiver
|
||||
|
||||
assert "tests/data/test_modules/" in auto_archiver.modules.__path__
|
||||
|
||||
def test_add_custom_modules_path_invalid(orchestrator, caplog, test_args):
|
||||
|
||||
orchestrator.setup_config(test_args + # we still need to load the real path to get the example_module
|
||||
["--module_paths", "tests/data/invalid_test_modules/"])
|
||||
def test_add_custom_modules_path_invalid(orchestrator, caplog, test_args):
|
||||
orchestrator.setup_config(
|
||||
test_args # we still need to load the real path to get the example_module
|
||||
+ ["--module_paths", "tests/data/invalid_test_modules/"]
|
||||
)
|
||||
|
||||
assert caplog.records[0].message == "Path 'tests/data/invalid_test_modules/' does not exist. Skipping..."
|
||||
|
||||
@@ -99,16 +116,16 @@ def test_check_required_values(orchestrator, caplog, test_args):
|
||||
# drop the example_module.required_field from the test_args
|
||||
test_args = test_args[:-2]
|
||||
|
||||
with pytest.raises(SystemExit) as exit_error:
|
||||
config = orchestrator.setup_config(test_args)
|
||||
with pytest.raises(SystemExit):
|
||||
orchestrator.setup_config(test_args)
|
||||
|
||||
assert caplog.records[1].message == "the following arguments are required: --example_module.required_field"
|
||||
|
||||
def test_get_required_values_from_config(orchestrator, test_args, tmp_path):
|
||||
|
||||
def test_get_required_values_from_config(orchestrator, test_args, tmp_path):
|
||||
# load the default example yaml, add a required field, then run the orchestrator
|
||||
test_yaml = read_yaml(TEST_ORCHESTRATION)
|
||||
test_yaml['example_module'] = {'required_field': 'some_value'}
|
||||
test_yaml["example_module"] = {"required_field": "some_value"}
|
||||
# write it to a temp file
|
||||
tmp_file = (tmp_path / "temp_config.yaml").as_posix()
|
||||
store_yaml(test_yaml, tmp_file)
|
||||
@@ -117,27 +134,42 @@ def test_get_required_values_from_config(orchestrator, test_args, tmp_path):
|
||||
config = orchestrator.setup_config(["--config", tmp_file, "--module_paths", TEST_MODULES])
|
||||
assert config is not None
|
||||
|
||||
def test_load_authentication_string(orchestrator, test_args):
|
||||
|
||||
config = orchestrator.setup_config(test_args + ["--authentication", '{"facebook.com": {"username": "my_username", "password": "my_password"}}'])
|
||||
assert config['authentication'] == {"facebook.com": {"username": "my_username", "password": "my_password"}}
|
||||
def test_load_authentication_string(orchestrator, test_args):
|
||||
config = orchestrator.setup_config(
|
||||
test_args + ["--authentication", '{"facebook.com": {"username": "my_username", "password": "my_password"}}']
|
||||
)
|
||||
assert config["authentication"] == {"facebook.com": {"username": "my_username", "password": "my_password"}}
|
||||
|
||||
|
||||
def test_load_authentication_string_concat_site(orchestrator, test_args):
|
||||
|
||||
config = orchestrator.setup_config(test_args + ["--authentication", '{"x.com,twitter.com": {"api_key": "my_key"}}'])
|
||||
assert config['authentication'] == {"x.com": {"api_key": "my_key"},
|
||||
"twitter.com": {"api_key": "my_key"}}
|
||||
assert config["authentication"] == {"x.com": {"api_key": "my_key"}, "twitter.com": {"api_key": "my_key"}}
|
||||
|
||||
|
||||
def test_load_invalid_authentication_string(orchestrator, test_args):
|
||||
with pytest.raises(ArgumentTypeError):
|
||||
orchestrator.setup_config(test_args + ["--authentication", "{\''invalid_json"])
|
||||
orchestrator.setup_config(test_args + ["--authentication", "{''invalid_json"])
|
||||
|
||||
|
||||
def test_load_authentication_invalid_dict(orchestrator, test_args):
|
||||
with pytest.raises(ArgumentTypeError):
|
||||
orchestrator.setup_config(test_args + ["--authentication", "[true, false]"])
|
||||
|
||||
|
||||
def test_load_modules_from_commandline(orchestrator, test_args):
|
||||
args = test_args + ["--feeders", "example_module", "--extractors", "example_module", "--databases", "example_module", "--enrichers", "example_module", "--formatters", "example_module"]
|
||||
args = test_args + [
|
||||
"--feeders",
|
||||
"example_module",
|
||||
"--extractors",
|
||||
"example_module",
|
||||
"--databases",
|
||||
"example_module",
|
||||
"--enrichers",
|
||||
"example_module",
|
||||
"--formatters",
|
||||
"example_module",
|
||||
]
|
||||
|
||||
orchestrator.setup(args)
|
||||
|
||||
@@ -153,27 +185,37 @@ def test_load_modules_from_commandline(orchestrator, test_args):
|
||||
assert orchestrator.enrichers[0].name == "example_module"
|
||||
assert orchestrator.formatters[0].name == "example_module"
|
||||
|
||||
|
||||
def test_load_settings_for_module_from_commandline(orchestrator, test_args):
|
||||
args = test_args + ["--feeders", "gsheet_feeder", "--gsheet_feeder.sheet_id", "123", "--gsheet_feeder.service_account", "tests/data/test_service_account.json"]
|
||||
args = test_args + [
|
||||
"--feeders",
|
||||
"gsheet_feeder_db",
|
||||
"--gsheet_feeder_db.sheet_id",
|
||||
"123",
|
||||
"--gsheet_feeder_db.service_account",
|
||||
"tests/data/test_service_account.json",
|
||||
]
|
||||
|
||||
orchestrator.setup(args)
|
||||
|
||||
assert len(orchestrator.feeders) == 1
|
||||
assert orchestrator.feeders[0].name == "gsheet_feeder"
|
||||
assert orchestrator.config['gsheet_feeder']['sheet_id'] == "123"
|
||||
assert orchestrator.feeders[0].name == "gsheet_feeder_db"
|
||||
assert orchestrator.config["gsheet_feeder_db"]["sheet_id"] == "123"
|
||||
|
||||
|
||||
def test_multiple_orchestrator(test_args):
|
||||
|
||||
o1_args = test_args + ["--feeders", "gsheet_feeder", "--gsheet_feeder.service_account", "tests/data/test_service_account.json"]
|
||||
o1_args = test_args + [
|
||||
"--feeders",
|
||||
"gsheet_feeder_db",
|
||||
"--gsheet_feeder_db.service_account",
|
||||
"tests/data/test_service_account.json",
|
||||
]
|
||||
o1 = ArchivingOrchestrator()
|
||||
|
||||
with pytest.raises(ValueError) as exit_error:
|
||||
# this should fail because the gsheet_feeder requires a sheet_id / sheet
|
||||
with pytest.raises(ValueError):
|
||||
# this should fail because the gsheet_feeder_db requires a sheet_id / sheet
|
||||
o1.setup(o1_args)
|
||||
|
||||
|
||||
|
||||
o2_args = test_args + ["--feeders", "example_module"]
|
||||
o2 = ArchivingOrchestrator()
|
||||
o2.setup(o2_args)
|
||||
@@ -182,4 +224,16 @@ def test_multiple_orchestrator(test_args):
|
||||
|
||||
output: Metadata = list(o2.feed())
|
||||
assert len(output) == 1
|
||||
assert output[0].get_url() == "https://example.com"
|
||||
assert output[0].get_url() == "https://example.com"
|
||||
|
||||
|
||||
def test_wrong_step_type(test_args, caplog):
|
||||
args = test_args + [
|
||||
"--feeders",
|
||||
"example_extractor", # example_extractor is not a valid feeder!
|
||||
]
|
||||
|
||||
orchestrator = ArchivingOrchestrator()
|
||||
with pytest.raises(SetupError) as err:
|
||||
orchestrator.setup(args)
|
||||
assert "Module 'example_extractor' is not a feeder" in str(err.value)
|
||||
|
||||
@@ -14,7 +14,7 @@ from auto_archiver.utils.misc import (
|
||||
update_nested_dict,
|
||||
calculate_file_hash,
|
||||
random_str,
|
||||
get_timestamp
|
||||
get_timestamp,
|
||||
)
|
||||
|
||||
|
||||
@@ -38,40 +38,46 @@ class TestDirectoryUtils:
|
||||
mkdir_if_not_exists(existing_dir)
|
||||
assert existing_dir.exists()
|
||||
|
||||
|
||||
class TestURLExpansion:
|
||||
@pytest.mark.parametrize("input_url,expected", [
|
||||
("https://example.com", "https://example.com"),
|
||||
("https://t.co/test", "https://expanded.url")
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"input_url,expected",
|
||||
[("https://example.com", "https://example.com"), ("https://t.co/test", "https://expanded.url")],
|
||||
)
|
||||
def test_expand_url(self, input_url, expected, mocker):
|
||||
mock_response = mocker.Mock()
|
||||
mock_response.url = "https://expanded.url"
|
||||
mocker.patch('requests.get', return_value=mock_response)
|
||||
mocker.patch("requests.get", return_value=mock_response)
|
||||
result = expand_url(input_url)
|
||||
assert result == expected
|
||||
|
||||
def test_expand_url_handles_errors(self, caplog, mocker):
|
||||
mocker.patch('requests.get', side_effect=Exception("Connection error"))
|
||||
mocker.patch("requests.get", side_effect=Exception("Connection error"))
|
||||
url = "https://t.co/error"
|
||||
result = expand_url(url)
|
||||
assert result == url
|
||||
assert f"Failed to expand url {url}" in caplog.text
|
||||
|
||||
|
||||
class TestAttributeHandling:
|
||||
class Sample:
|
||||
exists = "value"
|
||||
none = None
|
||||
|
||||
@pytest.mark.parametrize("obj,attr,default,expected", [
|
||||
(Sample(), "exists", "default", "value"),
|
||||
(Sample(), "none", "default", "default"),
|
||||
(Sample(), "missing", "default", "default"),
|
||||
(None, "anything", "fallback", "fallback"),
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"obj,attr,default,expected",
|
||||
[
|
||||
(Sample(), "exists", "default", "value"),
|
||||
(Sample(), "none", "default", "default"),
|
||||
(Sample(), "missing", "default", "default"),
|
||||
(None, "anything", "fallback", "fallback"),
|
||||
],
|
||||
)
|
||||
def test_getattr_or(self, obj, attr, default, expected):
|
||||
# Test gets attribute or returns a default value
|
||||
assert getattr_or(obj, attr, default) == expected
|
||||
|
||||
|
||||
class TestDateTimeHandling:
|
||||
def test_datetime_encoder(self, sample_datetime):
|
||||
result = json.dumps({"dt": sample_datetime}, cls=DateTimeEncoder)
|
||||
@@ -83,11 +89,14 @@ class TestDateTimeHandling:
|
||||
result = dump_payload(payload)
|
||||
assert str(sample_datetime) in result
|
||||
|
||||
@pytest.mark.parametrize("dt_str,fmt,expected", [
|
||||
("2023-01-01 12:00:00+00:00", None, datetime(2023, 1, 1, 12, 0, tzinfo=timezone.utc)),
|
||||
("20230101 120000", "%Y%m%d %H%M%S", datetime(2023, 1, 1, 12, 0)),
|
||||
("invalid", None, None),
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"dt_str,fmt,expected",
|
||||
[
|
||||
("2023-01-01 12:00:00+00:00", None, datetime(2023, 1, 1, 12, 0, tzinfo=timezone.utc)),
|
||||
("20230101 120000", "%Y%m%d %H%M%S", datetime(2023, 1, 1, 12, 0)),
|
||||
("invalid", None, None),
|
||||
],
|
||||
)
|
||||
def test_datetime_from_string(self, dt_str, fmt, expected):
|
||||
result = get_datetime_from_str(dt_str, fmt)
|
||||
if expected is None:
|
||||
@@ -95,16 +104,21 @@ class TestDateTimeHandling:
|
||||
else:
|
||||
assert result == expected.replace(tzinfo=result.tzinfo)
|
||||
|
||||
|
||||
class TestDictUtils:
|
||||
@pytest.mark.parametrize("original,update,expected", [
|
||||
({"a": 1}, {"b": 2}, {"a": 1, "b": 2}),
|
||||
({"nested": {"a": 1}}, {"nested": {"b": 2}}, {"nested": {"a": 1, "b": 2}}),
|
||||
({"a": {"b": {"c": 1}}}, {"a": {"b": {"c": 2}}}, {"a": {"b": {"c": 2}}}),
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"original,update,expected",
|
||||
[
|
||||
({"a": 1}, {"b": 2}, {"a": 1, "b": 2}),
|
||||
({"nested": {"a": 1}}, {"nested": {"b": 2}}, {"nested": {"a": 1, "b": 2}}),
|
||||
({"a": {"b": {"c": 1}}}, {"a": {"b": {"c": 2}}}, {"a": {"b": {"c": 2}}}),
|
||||
],
|
||||
)
|
||||
def test_update_nested_dict(self, original, update, expected):
|
||||
update_nested_dict(original, update)
|
||||
assert original == expected
|
||||
|
||||
|
||||
class TestHashingUtils:
|
||||
def test_file_hashing(self, sample_file):
|
||||
expected = hashlib.sha256(b"test content").hexdigest()
|
||||
@@ -118,6 +132,7 @@ class TestHashingUtils:
|
||||
expected = hashlib.sha256(content).hexdigest()
|
||||
assert calculate_file_hash(str(file_path)) == expected
|
||||
|
||||
|
||||
class TestMiscUtils:
|
||||
def test_random_str_length(self):
|
||||
for length in [8, 16, 32]:
|
||||
@@ -131,14 +146,17 @@ class TestMiscUtils:
|
||||
def test_random_str_uniqueness(self):
|
||||
assert random_str() != random_str()
|
||||
|
||||
@pytest.mark.parametrize("ts_input,utc,iso,expected_type", [
|
||||
(datetime.now(), True, True, str),
|
||||
("2023-01-01T12:00:00+00:00", False, False, datetime),
|
||||
(1672574400, True, True, str),
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"ts_input,utc,iso,expected_type",
|
||||
[
|
||||
(datetime.now(), True, True, str),
|
||||
("2023-01-01T12:00:00+00:00", False, False, datetime),
|
||||
(1672574400, True, True, str),
|
||||
],
|
||||
)
|
||||
def test_timestamp_parsing(self, ts_input, utc, iso, expected_type):
|
||||
result = get_timestamp(ts_input, utc=utc, iso=iso)
|
||||
assert isinstance(result, expected_type)
|
||||
|
||||
def test_invalid_timestamp_returns_none(self):
|
||||
assert get_timestamp("invalid-date") is None
|
||||
assert get_timestamp("invalid-date") is None
|
||||
|
||||
113
tests/utils/test_urls.py
Normal file
113
tests/utils/test_urls.py
Normal file
@@ -0,0 +1,113 @@
|
||||
import pytest
|
||||
from auto_archiver.utils.url import (
|
||||
is_auth_wall,
|
||||
check_url_or_raise,
|
||||
domain_for_url,
|
||||
is_relevant_url,
|
||||
remove_get_parameters,
|
||||
twitter_best_quality_url,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"url, is_auth",
|
||||
[
|
||||
("https://example.com", False),
|
||||
("https://t.me/c/abc/123", True),
|
||||
("https://t.me/not-private/", False),
|
||||
("https://instagram.com", True),
|
||||
("https://www.instagram.com", True),
|
||||
("https://www.instagram.com/p/INVALID", True),
|
||||
("https://www.instagram.com/p/C4QgLbrIKXG/", True),
|
||||
],
|
||||
)
|
||||
def test_is_auth_wall(url, is_auth):
|
||||
assert is_auth_wall(url) == is_auth
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"url, raises",
|
||||
[
|
||||
("http://example.com", False),
|
||||
("https://example.com", False),
|
||||
("ftp://example.com", True),
|
||||
("http://localhost", True),
|
||||
("http://", True),
|
||||
],
|
||||
)
|
||||
def test_check_url_or_raise(url, raises):
|
||||
if raises:
|
||||
with pytest.raises(ValueError):
|
||||
check_url_or_raise(url)
|
||||
else:
|
||||
assert check_url_or_raise(url)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"url, domain",
|
||||
[
|
||||
("https://example.com", "example.com"),
|
||||
("https://www.example.com", "www.example.com"),
|
||||
("https://www.example.com/path", "www.example.com"),
|
||||
("https://", ""),
|
||||
("http://localhost", "localhost"),
|
||||
],
|
||||
)
|
||||
def test_domain_for_url(url, domain):
|
||||
assert domain_for_url(url) == domain
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"url, without_get",
|
||||
[
|
||||
("https://example.com", "https://example.com"),
|
||||
("https://example.com?utm_source=example", "https://example.com"),
|
||||
("https://example.com?utm_source=example&other=1", "https://example.com"),
|
||||
("https://example.com/something", "https://example.com/something"),
|
||||
("https://example.com/something?utm_source=example", "https://example.com/something"),
|
||||
],
|
||||
)
|
||||
def test_remove_get_parameters(url, without_get):
|
||||
assert remove_get_parameters(url) == without_get
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"url, relevant",
|
||||
[
|
||||
("https://example.com", True),
|
||||
("https://example.com/favicon.ico", False),
|
||||
("https://twimg.com/profile_images", False),
|
||||
("https://twimg.com/something/default_profile_images", False),
|
||||
("https://scontent.cdninstagram.com/username/150x150.jpg", False),
|
||||
("https://static.cdninstagram.com/rsrc.php/", False),
|
||||
("https://telegram.org/img/emoji/", False),
|
||||
("https://www.youtube.com/s/gaming/emoji/", False),
|
||||
("https://yt3.ggpht.com/default-user=", False),
|
||||
("https://www.youtube.com/s/search/audio/", False),
|
||||
("https://ok.ru/res/i/", False),
|
||||
("https://vk.com/emoji/", False),
|
||||
("https://vk.com/images/", False),
|
||||
("https://vk.com/images/reaction/", False),
|
||||
("https://wikipedia.org/static", False),
|
||||
("https://example.com/file.svg", False),
|
||||
("https://example.com/file.ico", False),
|
||||
("https://example.com/file.mp4", True),
|
||||
("https://example.com/150x150.jpg", True),
|
||||
("https://example.com/rsrc.php/", True),
|
||||
("https://example.com/img/emoji/", True),
|
||||
],
|
||||
)
|
||||
def test_is_relevant_url(url, relevant):
|
||||
assert is_relevant_url(url) == relevant
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"url, best_quality",
|
||||
[
|
||||
("https://twitter.com/some_image.jpg?name=small", "https://twitter.com/some_image.jpg?name=orig"),
|
||||
("https://twitter.com/some_image.jpg", "https://twitter.com/some_image.jpg"),
|
||||
("https://twitter.com/some_image.jpg?name=orig", "https://twitter.com/some_image.jpg?name=orig"),
|
||||
],
|
||||
)
|
||||
def test_twitter_best_quality_url(url, best_quality):
|
||||
assert twitter_best_quality_url(url) == best_quality
|
||||
Reference in New Issue
Block a user