Merge main into timestamping_enricher

2026-06-08 03:18:28 +03:00 · 2025-03-24 15:09:29 +04:00
parent 89ee6f19b6 7b454baa02
commit dfde6f1995
219 changed files with 11049 additions and 2933 deletions
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,6 +1,7 @@
 """
 pytest conftest file, for shared fixtures and configuration
 """
+
 import os
 import pickle
 from datetime import datetime, timezone
@@ -16,32 +17,36 @@ from auto_archiver.core.module import ModuleFactory
 # that you only want to run if everything else succeeds (e.g. API calls). The order here is important
 # what comes first will be run first (at the end of all other tests not mentioned)
 # format is the name of the module (python file) without the .py extension
-TESTS_TO_RUN_LAST = ['test_twitter_api_archiver']
+TESTS_TO_RUN_LAST = ["test_twitter_api_archiver"]
+

@pytest.fixture
 def setup_module(request):
-    def _setup_module(module_name, config={}):
-
+    def _setup_module(module_name, config=None):
+        if config is None:
+            config = {}
        module_factory = ModuleFactory()

        if isinstance(module_name, type):
            # get the module name:
            # if the class does not have a .name, use the name of the parent folder
-            module_name = module_name.__module__.rsplit(".",2)[-2]
+            module_name = module_name.__module__.rsplit(".", 2)[-2]

        m = module_factory.get_module(module_name, {module_name: config})
        # add the tmp_dir to the module
        tmp_dir = TemporaryDirectory()
        m.tmp_dir = tmp_dir.name
-        
+
        def cleanup():
            tmp_dir.cleanup()
+
        request.addfinalizer(cleanup)

        return m

    return _setup_module

+
@pytest.fixture
 def check_hash():
    def _check_hash(filename: str, hash: str):
@@ -51,6 +56,7 @@ def check_hash():

    return _check_hash

+
@pytest.fixture
 def make_item():
    def _make_item(url: str, **kwargs) -> Metadata:
@@ -62,7 +68,6 @@ def make_item():
    return _make_item


-
 def pytest_collection_modifyitems(items):
    module_mapping = {item: item.module.__name__.split(".")[-1] for item in items}

@@ -78,13 +83,13 @@ def pytest_collection_modifyitems(items):
    items[:] = sorted_items


-
 # Incremental testing - fail tests in a class if any previous test fails
 # taken from https://docs.pytest.org/en/latest/example/simple.html#incremental-testing-test-steps

 # store history of failures per test class name and per index in parametrize (if parametrize used)
 _test_failed_incremental: Dict[str, Dict[Tuple[int, ...], str]] = {}

+
 def pytest_runtest_makereport(item, call):
    if "incremental" in item.keywords:
        # incremental marker is used
@@ -93,17 +98,11 @@ def pytest_runtest_makereport(item, call):
            # retrieve the class name of the test
            cls_name = str(item.cls)
            # retrieve the index of the test (if parametrize is used in combination with incremental)
-            parametrize_index = (
-                tuple(item.callspec.indices.values())
-                if hasattr(item, "callspec")
-                else ()
-            )
+            parametrize_index = tuple(item.callspec.indices.values()) if hasattr(item, "callspec") else ()
            # retrieve the name of the test function
            test_name = item.originalname or item.name
            # store in _test_failed_incremental the original name of the failed test
-            _test_failed_incremental.setdefault(cls_name, {}).setdefault(
-                parametrize_index, test_name
-            )
+            _test_failed_incremental.setdefault(cls_name, {}).setdefault(parametrize_index, test_name)


 def pytest_runtest_setup(item):
@@ -119,16 +118,17 @@ def pytest_runtest_setup(item):
                pytest.xfail(f"previous test failed ({test_name})")


-
-@pytest.fixture()
+@pytest.fixture
 def unpickle():
    """
    Returns a helper function that unpickles a file
    ** gets the file from the test_files directory: tests/data/ **
    """
+
    def _unpickle(path):
        with open(os.path.join("tests/data", path), "rb") as f:
            return pickle.load(f)
+
    return _unpickle


@@ -151,9 +151,9 @@ def sample_datetime():
    return datetime(2023, 1, 1, 12, 0, tzinfo=timezone.utc)


-@pytest.fixture(autouse=True)
+@pytest.fixture
 def mock_sleep(mocker):
-    """Globally mock time.sleep to avoid delays."""
+    """Mock time.sleep to avoid delays."""
    return mocker.patch("time.sleep")


@@ -162,4 +162,4 @@ def metadata():
    metadata = Metadata()
    metadata.set("_processed_at", "2021-01-01T00:00:00")
    metadata.set_url("https://example.com")
-    return metadata
+    return metadata
--- a/tests/data/dropin.py
+++ b/tests/data/dropin.py
@@ -1,5 +1,6 @@
 # this is a dummy class used to test importing a dropin in the
 #  generic extractor by filename/path

+
 class Dropin:
-    pass
+    pass
--- a/tests/data/test_modules/example_extractor/__manifest__.py
+++ b/tests/data/test_modules/example_extractor/__manifest__.py
@@ -0,0 +1,11 @@
+{
+    # Display Name of your module
+    "name": "Example Extractor",
+    # Optional version number, for your own versioning purposes
+    "version": 2.0,
+    # The type of the module, must be one (or more) of the built in module types
+    "type": ["extractor"],
+    # a boolean indicating whether or not a module requires additional user setup before it can be used
+    # for example: adding API keys, installing additional software etc.
+    "requires_setup": False,
+}
--- a/tests/data/test_modules/example_extractor/example_extractor.py
+++ b/tests/data/test_modules/example_extractor/example_extractor.py
@@ -0,0 +1,6 @@
+from auto_archiver.core import Extractor
+
+
+class ExampleExtractor(Extractor):
+    def download(self, item):
+        print("download")
--- a/tests/data/test_modules/example_module/__init__.py
+++ b/tests/data/test_modules/example_module/__init__.py
@@ -1 +1 @@
-from .example_module import ExampleModule
+from .example_module import ExampleModule
--- a/tests/data/test_modules/example_module/__manifest__.py
+++ b/tests/data/test_modules/example_module/__manifest__.py
@@ -16,14 +16,14 @@
    "dependencies": {
        "python": ["loguru"],
        "bin": ["bash"],
-        },
-    # configurations that this module takes. These are argparse-compliant dicationaries, that are 
+    },
+    # configurations that this module takes. These are argparse-compliant dictionaries, that are
    # used to create command line arguments when the programme is run.
    # The full name of the config option will become: `module_name.config_name`
    "configs": {
-            "csv_file": {"default": "db.csv", "help": "CSV file name"},
-            "required_field": {"required": True, "help": "required field in the CSV file"},
-        },
+        "csv_file": {"default": "db.csv", "help": "CSV file name"},
+        "required_field": {"required": True, "help": "required field in the CSV file"},
+    },
    # A description of the module, used for documentation
    "description": "This is an example module",
-}
+}
--- a/tests/data/test_modules/example_module/example_module.py
+++ b/tests/data/test_modules/example_module/example_module.py
@@ -1,5 +1,6 @@
 from auto_archiver.core import Extractor, Enricher, Feeder, Database, Storage, Formatter, Metadata

+
 class ExampleModule(Extractor, Enricher, Feeder, Database, Storage, Formatter):
    def download(self, item):
        print("download")
@@ -7,7 +8,6 @@ class ExampleModule(Extractor, Enricher, Feeder, Database, Storage, Formatter):
    def __iter__(self):
        yield Metadata().set_url("https://example.com")

-    
    def done(self, result):
        print("done")

@@ -16,13 +16,12 @@ class ExampleModule(Extractor, Enricher, Feeder, Database, Storage, Formatter):

    def get_cdn_url(self, media):
        return "nice_url"
-    
+
    def save(self, item):
        print("save")
-    
+
    def uploadf(self, file, key, **kwargs):
        print("uploadf")

-    
    def format(self, item):
        print("format")
--- a/tests/databases/test_api_db.py
+++ b/tests/databases/test_api_db.py
@@ -1,6 +1,5 @@
 import pytest

-from auto_archiver.core import Metadata
 from auto_archiver.modules.api_db import AAApiDb


@@ -41,9 +40,16 @@ def test_fetch(api_db, metadata, mocker):
    mock_datetime = mocker.patch("auto_archiver.core.metadata.datetime.datetime")
    mock_datetime.now.return_value = "2021-01-01T00:00:00"
    mock_get.return_value.status_code = 200
-    mock_get.return_value.json.return_value = [{"result": {}}, {"result":
-        {'media': [], 'metadata': {'_processed_at': '2021-01-01T00:00:00', 'url': 'https://example.com'},
-         'status': 'no archiver'}}]
+    mock_get.return_value.json.return_value = [
+        {"result": {}},
+        {
+            "result": {
+                "media": [],
+                "metadata": {"_processed_at": "2021-01-01T00:00:00", "url": "https://example.com"},
+                "status": "no archiver",
+            }
+        },
+    ]
    assert api_db.fetch(metadata) == metadata


@@ -52,8 +58,15 @@ def test_done_success(api_db, metadata, mocker):
    mock_post.return_value.status_code = 201
    api_db.done(metadata)
    mock_post.assert_called_once()
-    mock_post.assert_called_once_with("https://api.example.com/interop/submit-archive",
-                                      json={'author_id': 'Someone', 'url': 'https://example.com',
-                                            'public': False, 'group_id': '123', 'tags': ['[', ']'], 'result': '{"status": "no archiver", "metadata": {"_processed_at": "2021-01-01T00:00:00", "url": "https://example.com"}, "media": []}'},
-                                      headers={'Authorization': 'Bearer test-token'})
-
+    mock_post.assert_called_once_with(
+        "https://api.example.com/interop/submit-archive",
+        json={
+            "author_id": "Someone",
+            "url": "https://example.com",
+            "public": False,
+            "group_id": "123",
+            "tags": ["[", "]"],
+            "result": '{"status": "no archiver", "metadata": {"_processed_at": "2021-01-01T00:00:00", "url": "https://example.com"}, "media": []}',
+        },
+        headers={"Authorization": "Bearer test-token"},
+    )
--- a/tests/databases/test_atlos_db.py
+++ b/tests/databases/test_atlos_db.py
@@ -2,7 +2,7 @@ import pytest
 from datetime import datetime

 from auto_archiver.core import Metadata
-from auto_archiver.modules.atlos_db import AtlosDb
+from auto_archiver.modules.atlos_feeder_db_storage import AtlosFeederDbStorage as AtlosDb


 class FakeAPIResponse:
@@ -12,19 +12,28 @@ class FakeAPIResponse:
        self._data = data
        self.raise_error = raise_error

+    def json(self) -> dict:
+        return self._data
+
    def raise_for_status(self) -> None:
        if self.raise_error:
            raise Exception("HTTP error")


@pytest.fixture
-def atlos_db(setup_module) -> AtlosDb:
+def atlos_db(setup_module, mocker) -> AtlosDb:
    """Fixture for AtlosDb."""
    configs: dict = {
        "api_token": "abc123",
        "atlos_url": "https://platform.atlos.org",
    }
-    return setup_module("atlos_db", configs)
+    mocker.patch("requests.Session")
+    atlos_feeder = setup_module("atlos_feeder_db_storage", configs)
+    fake_session = mocker.MagicMock()
+    # Configure the default response to have no results so that __iter__ terminates
+    fake_session.get.return_value = FakeAPIResponse({"next": None, "results": []})
+    atlos_feeder.session = fake_session
+    return atlos_feeder


 def test_failed_no_atlos_id(atlos_db, metadata, mocker):
@@ -38,25 +47,18 @@ def test_failed_with_atlos_id(atlos_db, metadata, mocker):
    """Test failed() posts failure when atlos_id is present."""
    metadata.set("atlos_id", 42)
    fake_resp = FakeAPIResponse({}, raise_error=False)
-    post_mock = mocker.patch("requests.post", return_value=fake_resp)
+    post_mock = mocker.patch.object(atlos_db, "_post", return_value=fake_resp)
    atlos_db.failed(metadata, "failure reason")
-    expected_url = (
-        f"{atlos_db.atlos_url}/api/v2/source_material/metadata/42/auto_archiver"
-    )
-    expected_headers = {"Authorization": f"Bearer {atlos_db.api_token}"}
-    expected_json = {
-        "metadata": {"processed": True, "status": "error", "error": "failure reason"}
-    }
-    post_mock.assert_called_once_with(
-        expected_url, headers=expected_headers, json=expected_json
-    )
+    expected_endpoint = "/api/v2/source_material/metadata/42/auto_archiver"
+    expected_json = {"metadata": {"processed": True, "status": "error", "error": "failure reason"}}
+    post_mock.assert_called_once_with(expected_endpoint, json=expected_json)


 def test_failed_http_error(atlos_db, metadata, mocker):
    """Test failed() raises exception on HTTP error."""
    metadata.set("atlos_id", 42)
-    fake_resp = FakeAPIResponse({}, raise_error=True)
-    mocker.patch("requests.post", return_value=fake_resp)
+    # Patch _post to raise an exception instead of returning a fake response.
+    mocker.patch.object(atlos_db, "_post", side_effect=Exception("HTTP error"))
    with pytest.raises(Exception, match="HTTP error"):
        atlos_db.failed(metadata, "failure reason")

@@ -81,12 +83,9 @@ def test_done_with_atlos_id(atlos_db, metadata, mocker):
    now = datetime.now()
    metadata.set("timestamp", now)
    fake_resp = FakeAPIResponse({}, raise_error=False)
-    post_mock = mocker.patch("requests.post", return_value=fake_resp)
+    post_mock = mocker.patch.object(atlos_db, "_post", return_value=fake_resp)
    atlos_db.done(metadata)
-    expected_url = (
-        f"{atlos_db.atlos_url}/api/v2/source_material/metadata/99/auto_archiver"
-    )
-    expected_headers = {"Authorization": f"Bearer {atlos_db.api_token}"}
+    expected_endpoint = "/api/v2/source_material/metadata/99/auto_archiver"
    expected_results = metadata.metadata.copy()
    expected_results["timestamp"] = now.isoformat()
    expected_json = {
@@ -96,15 +95,13 @@ def test_done_with_atlos_id(atlos_db, metadata, mocker):
            "results": expected_results,
        }
    }
-    post_mock.assert_called_once_with(
-        expected_url, headers=expected_headers, json=expected_json
-    )
+    post_mock.assert_called_once_with(expected_endpoint, json=expected_json)


 def test_done_http_error(atlos_db, metadata, mocker):
-    """Test done() raises exception on HTTP error."""
+    """Test done() raises an exception on HTTP error."""
    metadata.set("atlos_id", 123)
-    fake_resp = FakeAPIResponse({}, raise_error=True)
-    mocker.patch("requests.post", return_value=fake_resp)
+    # Patch _post to raise an exception.
+    mocker.patch.object(atlos_db, "_post", side_effect=Exception("HTTP error"))
    with pytest.raises(Exception, match="HTTP error"):
        atlos_db.done(metadata)
--- a/tests/databases/test_csv_db.py
+++ b/tests/databases/test_csv_db.py
@@ -1,4 +1,3 @@
-
 from auto_archiver.modules.csv_db import CSVDb
 from auto_archiver.core import Metadata

@@ -9,12 +8,21 @@ def test_store_item(tmp_path, setup_module):
    temp_db = tmp_path / "temp_db.csv"
    db = setup_module(CSVDb, {"csv_file": temp_db.as_posix()})

-    item = Metadata().set_url("http://example.com").set_title("Example").set_content("Example content").success("my-archiver")
+    item = (
+        Metadata()
+        .set_url("http://example.com")
+        .set_title("Example")
+        .set_content("Example content")
+        .success("my-archiver")
+    )

    db.done(item)

    with open(temp_db, "r", encoding="utf-8") as f:
-        assert f.read().strip() == f"status,metadata,media\nmy-archiver: success,\"{{'_processed_at': {repr(item.get('_processed_at'))}, 'url': 'http://example.com', 'title': 'Example', 'content': 'Example content'}}\",[]"
+        assert (
+            f.read().strip()
+            == f"status,metadata,media\nmy-archiver: success,\"{{'_processed_at': {repr(item.get('_processed_at'))}, 'url': 'http://example.com', 'title': 'Example', 'content': 'Example content'}}\",[]"
+        )

    # TODO: csv db doesn't have a fetch method - need to add it (?)
-    # assert db.fetch(item) == item
+    # assert db.fetch(item) == item
--- a/tests/databases/test_gsheet_db.py
+++ b/tests/databases/test_gsheet_db.py
@@ -2,8 +2,7 @@ from datetime import datetime, timezone
 import pytest

 from auto_archiver.core import Metadata, Media
-from auto_archiver.modules.gsheet_db import GsheetsDb
-from auto_archiver.modules.gsheet_feeder import GWorksheet
+from auto_archiver.modules.gsheet_feeder_db import GsheetsFeederDB, GWorksheet


@pytest.fixture
@@ -29,6 +28,7 @@ def mock_metadata(mocker):
    metadata.get_first_image.return_value = None
    return metadata

+
@pytest.fixture
 def metadata():
    metadata = Metadata()
@@ -52,13 +52,36 @@ def mock_media(mocker):
    mock_media.get.return_value = "not-calculated"
    return mock_media

+
@pytest.fixture
-def gsheets_db(mock_gworksheet, setup_module, mocker):
-    db = setup_module("gsheet_db", {
-        "allow_worksheets": "set()",
-        "block_worksheets": "set()",
-        "use_sheet_names_in_stored_paths": "True",
-    })
+def gsheets_db(mock_gworksheet, setup_module, mocker) -> GsheetsFeederDB:
+    mocker.patch("gspread.service_account")
+    config: dict = {
+        "sheet": "testsheet",
+        "sheet_id": None,
+        "header": 1,
+        "service_account": "test/service_account.json",
+        "columns": {
+            "url": "link",
+            "status": "archive status",
+            "folder": "destination folder",
+            "archive": "archive location",
+            "date": "archive date",
+            "thumbnail": "thumbnail",
+            "timestamp": "upload timestamp",
+            "title": "upload title",
+            "text": "text content",
+            "screenshot": "screenshot",
+            "hash": "hash",
+            "pdq_hash": "perceptual hashes",
+            "wacz": "wacz",
+            "replaywebpage": "replaywebpage",
+        },
+        "allow_worksheets": set(),
+        "block_worksheets": set(),
+        "use_sheet_names_in_stored_paths": True,
+    }
+    db = setup_module("gsheet_feeder_db", config)
    db._retrieve_gsheet = mocker.MagicMock(return_value=(mock_gworksheet, 1))
    return db

@@ -72,20 +95,21 @@ def fixed_timestamp():
@pytest.fixture
 def expected_calls(mock_media, fixed_timestamp):
    """Fixture for the expected cell updates."""
-    return  [
-        (1, 'status', 'my-archiver: success'),
-        (1, 'archive', 'http://example.com/screenshot.png'),
-        (1, 'date', '2025-02-01T00:00:00+00:00'),
-        (1, 'title', 'Example Title'),
-        (1, 'text', 'Example Content'),
-        (1, 'timestamp', '2025-01-01T00:00:00+00:00'),
-        (1, 'hash', 'not-calculated'),
+    return [
+        (1, "status", "my-archiver: success"),
+        (1, "archive", "http://example.com/screenshot.png"),
+        (1, "date", "2025-02-01T00:00:00+00:00"),
+        (1, "title", "Example Title"),
+        (1, "text", "Example Content"),
+        (1, "timestamp", "2025-01-01T00:00:00+00:00"),
+        (1, "hash", "not-calculated"),
        # (1, 'screenshot', 'http://example.com/screenshot.png'),
        # (1, 'thumbnail', '=IMAGE("http://example.com/thumbnail.png")'),
        # (1, 'wacz', 'http://example.com/browsertrix.wacz'),
        # (1, 'replaywebpage', 'https://replayweb.page/?source=http%3A%2F%2Fexample.com%2Fbrowsertrix.wacz#view=pages&url=')
    ]

+
 def test_retrieve_gsheet(gsheets_db, metadata, mock_gworksheet):
    gw, row = gsheets_db._retrieve_gsheet(metadata)
    assert gw == mock_gworksheet
@@ -94,27 +118,34 @@ def test_retrieve_gsheet(gsheets_db, metadata, mock_gworksheet):

 def test_started(gsheets_db, mock_metadata, mock_gworksheet):
    gsheets_db.started(mock_metadata)
-    mock_gworksheet.set_cell.assert_called_once_with(1, 'status', 'Archive in progress')
+    mock_gworksheet.set_cell.assert_called_once_with(1, "status", "Archive in progress")
+

 def test_failed(gsheets_db, mock_metadata, mock_gworksheet):
    reason = "Test failure"
    gsheets_db.failed(mock_metadata, reason)
-    mock_gworksheet.set_cell.assert_called_once_with(1, 'status', f'Archive failed {reason}')
+    mock_gworksheet.set_cell.assert_called_once_with(1, "status", f"Archive failed {reason}")


 def test_aborted(gsheets_db, mock_metadata, mock_gworksheet):
    gsheets_db.aborted(mock_metadata)
-    mock_gworksheet.set_cell.assert_called_once_with(1, 'status', '')
+    mock_gworksheet.set_cell.assert_called_once_with(1, "status", "")


 def test_done(gsheets_db, metadata, mock_gworksheet, expected_calls, mocker):
-    mocker.patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00')
+    mocker.patch(
+        "auto_archiver.modules.gsheet_feeder_db.gsheet_feeder_db.get_current_timestamp",
+        return_value="2025-02-01T00:00:00+00:00",
+    )
    gsheets_db.done(metadata)
    mock_gworksheet.batch_set_cell.assert_called_once_with(expected_calls)


 def test_done_cached(gsheets_db, metadata, mock_gworksheet, mocker):
-    mocker.patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00')
+    mocker.patch(
+        "auto_archiver.modules.gsheet_feeder_db.gsheet_feeder_db.get_current_timestamp",
+        return_value="2025-02-01T00:00:00+00:00",
+    )
    gsheets_db.done(metadata, cached=True)

    # Verify the status message includes "[cached]"
@@ -125,15 +156,17 @@ def test_done_cached(gsheets_db, metadata, mock_gworksheet, mocker):
 def test_done_missing_media(gsheets_db, metadata, mock_gworksheet, mocker):
    # clear media from metadata
    metadata.media = []
-    mocker.patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00')
+    mocker.patch(
+        "auto_archiver.modules.gsheet_feeder_db.gsheet_feeder_db.get_current_timestamp",
+        return_value="2025-02-01T00:00:00+00:00",
+    )
    gsheets_db.done(metadata)
    # Verify nothing media-related gets updated
    call_args = mock_gworksheet.batch_set_cell.call_args[0][0]
-    media_fields = {'archive', 'screenshot', 'thumbnail', 'wacz', 'replaywebpage'}
+    media_fields = {"archive", "screenshot", "thumbnail", "wacz", "replaywebpage"}
    assert all(call[1] not in media_fields for call in call_args)

+
 def test_safe_status_update(gsheets_db, metadata, mock_gworksheet):
    gsheets_db._safe_status_update(metadata, "Test status")
-    mock_gworksheet.set_cell.assert_called_once_with(1, 'status', 'Test status')
-
-
+    mock_gworksheet.set_cell.assert_called_once_with(1, "status", "Test status")
--- a/tests/enrichers/test_hash_enricher.py
+++ b/tests/enrichers/test_hash_enricher.py
@@ -4,34 +4,50 @@ from auto_archiver.modules.hash_enricher import HashEnricher
 from auto_archiver.core import Metadata, Media
 from auto_archiver.core.module import ModuleFactory

-@pytest.mark.parametrize("algorithm, filename, expected_hash", [
-    ("SHA-256", "tests/data/testfile_1.txt", "1b4f0e9851971998e732078544c96b36c3d01cedf7caa332359d6f1d83567014"),
-    ("SHA-256", "tests/data/testfile_2.txt", "60303ae22b998861bce3b28f33eec1be758a213c86c93c076dbe9f558c11c752"),
-    ("SHA3-512", "tests/data/testfile_1.txt", "d2d8cc4f369b340130bd2b29b8b54e918b7c260c3279176da9ccaa37c96eb71735fc97568e892dc6220bf4ae0d748edb46bd75622751556393be3f482e6f794e"),
-    ("SHA3-512", "tests/data/testfile_2.txt", "e35970edaa1e0d8af7d948491b2da0450a49fd9cc1e83c5db4c6f175f9550cf341f642f6be8cfb0bfa476e4258e5088c5ad549087bf02811132ac2fa22b734c6")
-])
+
+@pytest.mark.parametrize(
+    "algorithm, filename, expected_hash",
+    [
+        ("SHA-256", "tests/data/testfile_1.txt", "1b4f0e9851971998e732078544c96b36c3d01cedf7caa332359d6f1d83567014"),
+        ("SHA-256", "tests/data/testfile_2.txt", "60303ae22b998861bce3b28f33eec1be758a213c86c93c076dbe9f558c11c752"),
+        (
+            "SHA3-512",
+            "tests/data/testfile_1.txt",
+            "d2d8cc4f369b340130bd2b29b8b54e918b7c260c3279176da9ccaa37c96eb71735fc97568e892dc6220bf4ae0d748edb46bd75622751556393be3f482e6f794e",
+        ),
+        (
+            "SHA3-512",
+            "tests/data/testfile_2.txt",
+            "e35970edaa1e0d8af7d948491b2da0450a49fd9cc1e83c5db4c6f175f9550cf341f642f6be8cfb0bfa476e4258e5088c5ad549087bf02811132ac2fa22b734c6",
+        ),
+    ],
+)
 def test_calculate_hash(algorithm, filename, expected_hash, setup_module):
    # test SHA-256
    he = setup_module(HashEnricher, {"algorithm": algorithm, "chunksize": 100})
    assert he.calculate_hash(filename) == expected_hash

+
 def test_default_config_values(setup_module):
    he = setup_module(HashEnricher)
    assert he.algorithm == "SHA-256"
    assert he.chunksize == 16000000

+
 def test_config():
    # test default config
-    c = ModuleFactory().get_module_lazy('hash_enricher').configs
+    c = ModuleFactory().get_module_lazy("hash_enricher").configs
    assert c["algorithm"]["default"] == "SHA-256"
    assert c["chunksize"]["default"] == 16000000
    assert c["algorithm"]["choices"] == ["SHA-256", "SHA3-512"]
    assert c["algorithm"]["help"] == "hash algorithm to use"
-    assert c["chunksize"]["help"] == "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"
+    assert (
+        c["chunksize"]["help"]
+        == "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"
+    )


 def test_hash_media(setup_module):
-
    he = setup_module(HashEnricher, {"algorithm": "SHA-256", "chunksize": 1})

    # generate metadata with two test files
@@ -46,4 +62,4 @@ def test_hash_media(setup_module):
    he.enrich(m)

    assert m.media[0].get("hash") == "SHA-256:1b4f0e9851971998e732078544c96b36c3d01cedf7caa332359d6f1d83567014"
-    assert m.media[1].get("hash") == "SHA-256:60303ae22b998861bce3b28f33eec1be758a213c86c93c076dbe9f558c11c752"
+    assert m.media[1].get("hash") == "SHA-256:60303ae22b998861bce3b28f33eec1be758a213c86c93c076dbe9f558c11c752"
--- a/tests/enrichers/test_meta_enricher.py
+++ b/tests/enrichers/test_meta_enricher.py
@@ -1,4 +1,3 @@
-import datetime
 from datetime import datetime, timedelta, timezone

 import pytest
@@ -16,6 +15,7 @@ def mock_metadata(mocker):
    mock.get_all_media.return_value = []
    return mock

+
@pytest.fixture
 def mock_media(mocker):
    """Creates a mock Media object."""
@@ -59,6 +59,7 @@ def test_enrich_file_sizes(meta_enricher, metadata, tmp_path):
    assert metadata.get("total_bytes") == 3000
    assert metadata.get("total_size") == "2.9 KB"

+
@pytest.mark.parametrize(
    "size, expected",
    [
@@ -74,6 +75,7 @@ def test_human_readable_bytes(size, expected):
    enricher = MetaEnricher()
    assert enricher.human_readable_bytes(size) == expected

+
 def test_enrich_file_sizes_no_media(meta_enricher, metadata):
    """Test that enrich_file_sizes() handles empty media list gracefully."""
    meta_enricher.enrich_file_sizes(metadata)
@@ -91,4 +93,4 @@ def test_enrich_archive_duration(meta_enricher, metadata, mocker):
    mock_datetime.now.return_value = mock_now
    meta_enricher.enrich_archive_duration(metadata)

-    assert metadata.get("archive_duration_seconds") == 630
+    assert metadata.get("archive_duration_seconds") == 630
--- a/tests/enrichers/test_metadata_enricher.py
+++ b/tests/enrichers/test_metadata_enricher.py
@@ -1,4 +1,3 @@
-
 import pytest

 from auto_archiver.core import Media
@@ -33,9 +32,7 @@ def test_get_metadata(enricher, output, expected, mocker):

    result = enricher.get_metadata("test.jpg")
    assert result == expected
-    mock_run.assert_called_once_with(
-        ["exiftool", "test.jpg"], capture_output=True, text=True
-    )
+    mock_run.assert_called_once_with(["exiftool", "test.jpg"], capture_output=True, text=True)


 def test_get_metadata_exiftool_not_found(enricher, mocker):
@@ -85,4 +82,3 @@ def test_metadata_pickle(enricher, unpickle, mocker):
    actual_media = metadata.media
    assert len(expected_media) == len(actual_media)
    assert actual_media[0].properties.get("metadata") == expected_media[0].properties.get("metadata")
-
--- a/tests/enrichers/test_opentimestamps_enricher.py
+++ b/tests/enrichers/test_opentimestamps_enricher.py
@@ -0,0 +1,276 @@
+import pytest
+import hashlib
+
+from opentimestamps.core.timestamp import Timestamp, DetachedTimestampFile
+from opentimestamps.calendar import RemoteCalendar
+from opentimestamps.core.notary import PendingAttestation, BitcoinBlockHeaderAttestation
+
+from auto_archiver.core import Metadata, Media
+
+
+# TODO: Remove once timestamping overhaul is merged
+@pytest.fixture
+def sample_media(tmp_path) -> Media:
+    """Return a Media keyed "subdir/test.txt" whose filename is a real file under tmp_path"""
+    backing_file = tmp_path / "source.txt"
+    backing_file.write_text("test content")
+    return Media(filename=str(backing_file), _key="subdir/test.txt")
+
+
+@pytest.fixture
+def sample_file_path(tmp_path):
+    text_file = tmp_path / "test.txt"  # created inside pytest's tmp_path directory
+    text_file.write_text("This is a test file content for OpenTimestamps")
+    return str(text_file)  # handed back as a plain string path, not a Path
+
+
+@pytest.fixture
+def detached_timestamp_file():
+    """Build a DetachedTimestampFile holding one pending and one Bitcoin attestation"""
+    from opentimestamps.core.op import OpSHA256
+
+    digest = hashlib.sha256(b"Test content").digest()
+    stamp = Timestamp(digest)
+
+    # First attestation: pending, pointing at a calendar URI
+    stamp.attestations.add(PendingAttestation("https://example.calendar.com"))
+
+    # Second attestation: a Bitcoin block header
+    block_height = 783000  # Some block height
+    stamp.attestations.add(BitcoinBlockHeaderAttestation(block_height))
+
+    # OpSHA256 is the file-hash operation paired with the timestamp
+    sha256_op = OpSHA256()
+    return DetachedTimestampFile(sha256_op, stamp)
+
+
+@pytest.fixture
+def verified_timestamp_file():
+    """Build a DetachedTimestampFile whose sole attestation is a Bitcoin block header"""
+    from opentimestamps.core.op import OpSHA256
+
+    digest = hashlib.sha256(b"Verified content").digest()
+    stamp = Timestamp(digest)
+
+    # No pending attestation here - only the Bitcoin one
+    block_height = 783000  # Some block height
+    stamp.attestations.add(BitcoinBlockHeaderAttestation(block_height))
+
+    sha256_op = OpSHA256()
+    return DetachedTimestampFile(sha256_op, stamp)
+
+
+@pytest.fixture
+def pending_timestamp_file():
+    """Build a DetachedTimestampFile carrying two pending attestations and nothing else"""
+    from opentimestamps.core.op import OpSHA256
+
+    digest = hashlib.sha256(b"Pending content").digest()
+    stamp = Timestamp(digest)
+
+    # One pending attestation per calendar URI, added in the listed order
+    calendar_uris = ("https://example1.calendar.com", "https://example2.calendar.com")
+    for uri in calendar_uris:
+        stamp.attestations.add(PendingAttestation(uri))
+
+    # Pair the timestamp with the SHA-256 file-hash operation
+    sha256_op = OpSHA256()
+    return DetachedTimestampFile(sha256_op, stamp)
+
+
+@pytest.mark.download
+def test_download_tsr(setup_module, mocker):
+    """Test submitting a hash to a calendar server (RemoteCalendar.submit is mocked, no network)"""
+    # Mock the RemoteCalendar submit method so it hands back a known timestamp
+    mock_submit = mocker.patch.object(RemoteCalendar, "submit")
+    test_timestamp = Timestamp(hashlib.sha256(b"test").digest())
+    mock_submit.return_value = test_timestamp
+
+    # Submit a digest through a calendar instance
+    calendar = RemoteCalendar("https://alice.btc.calendar.opentimestamps.org")
+    file_hash = hashlib.sha256(b"Test file content").digest()
+    result = calendar.submit(file_hash)
+
+    # submit is patched on the class with a plain (non-autospec) mock, so the
+    # recorded call holds only the digest, not the calendar instance
+    mock_submit.assert_called_once_with(file_hash)
+    assert isinstance(result, Timestamp)
+    assert result == test_timestamp
+
+
+def test_verify_timestamp(setup_module, detached_timestamp_file):
+    """verify_timestamp reports both attestations and flags the timestamp as verified"""
+    enricher = setup_module("opentimestamps_enricher")
+
+    # Run the verification under test
+    info = enricher.verify_timestamp(detached_timestamp_file)
+
+    # Overall summary
+    assert info["attestation_count"] == 2
+    assert info["verified"] is True
+    assert len(info["attestations"]) == 2
+
+    # One attestation of each status is expected
+    statuses = [entry["status"] for entry in info["attestations"]]
+    assert "pending" in statuses
+    assert "confirmed" in statuses
+
+    # The confirmed (Bitcoin) attestation exposes its block height
+    confirmed = next(entry for entry in info["attestations"] if entry["status"] == "confirmed")
+    assert confirmed["block_height"] == 783000
+
+
+def test_verify_pending_only(setup_module, pending_timestamp_file):
+    """A timestamp with nothing but pending attestations must not count as verified"""
+    enricher = setup_module("opentimestamps_enricher")
+
+    info = enricher.verify_timestamp(pending_timestamp_file)
+
+    assert info["attestation_count"] == 2
+    assert info["verified"] is False
+
+    # Every reported attestation carries the "pending" status
+    assert all(entry["status"] == "pending" for entry in info["attestations"])
+
+    # Both calendar URIs appear among the reported attestations
+    reported_uris = [entry["uri"] for entry in info["attestations"]]
+    assert "https://example1.calendar.com" in reported_uris
+    assert "https://example2.calendar.com" in reported_uris
+
+
+def test_verify_bitcoin_completed(setup_module, verified_timestamp_file):
+    """Test verification of timestamps with completed Bitcoin attestations"""
+    ots = setup_module("opentimestamps_enricher")
+    verification_info = ots.verify_timestamp(verified_timestamp_file)
+
+    assert verification_info["attestation_count"] == 1
+    assert verification_info["verified"] is True
+    # Membership on the result only inspects its top-level keys, so also check each attestation's status
+    assert "pending" not in verification_info
+    assert all(a["status"] != "pending" for a in verification_info["attestations"])
+
+    # Check that the attestation is a Bitcoin attestation
+    attestation = verification_info["attestations"][0]
+    assert attestation["status"] == "confirmed"
+    assert attestation["block_height"] == 783000
+
+
+def test_full_enriching(setup_module, sample_file_path, sample_media, mocker):
+    """End-to-end enrich(): one media file in, one verified timestamp file attached to it"""
+
+    # Stand-in for RemoteCalendar.submit: builds a fresh timestamp for each digest it is given
+    def fake_submit(digest):
+        stamped = Timestamp(digest)
+        # 783000 is an arbitrary block height for the Bitcoin attestation
+        attestation = BitcoinBlockHeaderAttestation(783000)
+        stamped.attestations.add(attestation)
+        return stamped
+
+    # Patch the calendar submission so no network request is made
+    mocker.patch.object(RemoteCalendar, "submit", side_effect=fake_submit)
+
+    enricher = setup_module("opentimestamps_enricher")
+
+    # Metadata holding a single media entry that points at the sample file
+    sample_media.filename = sample_file_path
+    item = Metadata().set_url("https://example.com")
+    item.add_media(sample_media)
+
+    # Run enrichment
+    enricher.enrich(item)
+
+    # Item-level flags
+    assert item.get("opentimestamped") is True
+    assert item.get("opentimestamps_count") == 1
+
+    # Still exactly one top-level media entry: the original
+    assert len(item.media) == 1
+
+    # The original media is marked as timestamped
+    assert item.media[0].get("opentimestamps") is True
+
+    # Exactly one timestamp file hangs off the original media
+    assert len(item.media[0].get("opentimestamp_files")) == 1
+
+    # Pull that child out for closer inspection
+    ots_media = item.media[0].get("opentimestamp_files")[0]
+
+    # A version value must have been recorded on it
+    assert ots_media.get("opentimestamps_version") is not None
+
+    # Verification details recorded on the timestamp media
+    assert ots_media.get("verified") is True
+    assert ots_media.get("attestation_count") == 1
+
+
+def test_full_enriching_one_calendar_error(setup_module, sample_file_path, sample_media, mocker):
+    """Test enrichment when one calendar server returns an error"""
+    # Mock the calendar submission: the first call succeeds, the second raises
+    mock_calendar = mocker.patch.object(RemoteCalendar, "submit")
+
+    # Hard-coded 32-byte digest; presumably the SHA-256 of the sample file content - TODO confirm
+    test_timestamp = Timestamp(bytes.fromhex("583988e03646c26fa290c5c2408540a2f4e2aa9be087aa4546aefb531385b935"))
+    # Add a bitcoin attestation to the test timestamp
+    bitcoin = BitcoinBlockHeaderAttestation(783000)
+    test_timestamp.attestations.add(bitcoin)
+
+    # side_effect as a list: one entry is consumed per submit() call, exception entries are raised
+    mock_calendar.side_effect = [test_timestamp, Exception("Calendar server error")]
+
+    ots = setup_module(
+        "opentimestamps_enricher",
+        {
+            "calendar_urls": [
+                "https://alice.btc.calendar.opentimestamps.org",
+                "https://bob.btc.calendar.opentimestamps.org",
+            ]
+        },
+    )
+
+    # Create test metadata with sample file
+    metadata = Metadata().set_url("https://example.com")
+    sample_media.filename = sample_file_path
+    metadata.add_media(sample_media)
+
+    # Run enrichment (should complete despite calendar errors)
+    ots.enrich(metadata)
+
+    # Verify results
+    assert metadata.get("opentimestamped") is True
+    assert metadata.get("opentimestamps_count") == 1  # only alice worked, not bob
+
+
+def test_full_enriching_calendar_error(setup_module, sample_file_path, sample_media, mocker):
+    """enrich() must survive every calendar submission failing, recording no timestamps"""
+    # Every submit() call raises
+    failure = Exception("Calendar server error")
+    mocker.patch.object(RemoteCalendar, "submit", side_effect=failure)
+
+    enricher = setup_module("opentimestamps_enricher")
+
+    # Single-media metadata backed by the sample file
+    sample_media.filename = sample_file_path
+    item = Metadata().set_url("https://example.com")
+    item.add_media(sample_media)
+
+    # Must return normally even though no calendar accepted the digest
+    enricher.enrich(item)
+
+    # Nothing was timestamped: the flag is False and no count is stored
+    assert item.get("opentimestamped") is False
+    assert item.get("opentimestamps_count") is None
+
+
+def test_no_files_to_stamp(setup_module):
+    """enrich() on metadata without media sets no flag and adds no media"""
+    enricher = setup_module("opentimestamps_enricher")
+
+    # Metadata with a URL but no media attached
+    bare_item = Metadata().set_url("https://example.com")
+
+    # Run enrichment
+    enricher.enrich(bare_item)
+
+    # No flag was written and no media appeared
+    assert bare_item.get("opentimestamped") is None
+    assert len(bare_item.media) == 0
--- a/tests/enrichers/test_pdq_hash_enricher.py
+++ b/tests/enrichers/test_pdq_hash_enricher.py
@@ -14,23 +14,21 @@ def enricher(setup_module):
 def metadata_with_images():
    m = Metadata()
    m.set_url("https://example.com")
-    m.add_media(Media(filename="image1.jpg", key="image1"))
-    m.add_media(Media(filename="image2.jpg", key="image2"))
+    m.add_media(Media(filename="image1.jpg", _key="image1"))
+    m.add_media(Media(filename="image2.jpg", _key="image2"))
    return m


 def test_successful_enrich(metadata_with_images, mocker):
-    with (
-        mocker.patch("pdqhash.compute", return_value=([1, 0, 1, 0] * 64, 100)),
-        mocker.patch("PIL.Image.open"),
-        mocker.patch.object(Media, "is_image", return_value=True) as mock_is_image,
-    ):
-        enricher = PdqHashEnricher()
-        enricher.enrich(metadata_with_images)
+    mocker.patch("pdqhash.compute", return_value=([1, 0, 1, 0] * 64, 100))
+    mocker.patch("PIL.Image.open")
+    mocker.patch.object(Media, "is_image", return_value=True)
+    enricher = PdqHashEnricher()
+    enricher.enrich(metadata_with_images)

-        # Ensure the hash is set for image media
-        for media in metadata_with_images.media:
-            assert media.get("pdq_hash") is not None
+    # Ensure the hash is set for image media
+    for media in metadata_with_images.media:
+        assert media.get("pdq_hash") is not None


 def test_enrich_skip_non_image(metadata_with_images, mocker):
@@ -59,7 +57,7 @@ def test_enrich_handles_corrupted_image(metadata_with_images, mocker):
        ("screenshot", False),
        ("warc-file-123", False),
        ("regular-image", True),
-    ]
+    ],
 )
 def test_enrich_excludes_by_filetype(media_id, should_have_hash, mocker):
    metadata = Metadata()
@@ -75,4 +73,3 @@ def test_enrich_excludes_by_filetype(media_id, should_have_hash, mocker):

    media_item = metadata.media[0]
    assert (media_item.get("pdq_hash") is not None) == should_have_hash
-
--- a/tests/enrichers/test_screenshot_enricher.py
+++ b/tests/enrichers/test_screenshot_enricher.py
@@ -15,13 +15,15 @@ def mock_selenium_env(mocker):
    mock_which = mocker.patch("shutil.which")
    mock_driver_class = mocker.patch("auto_archiver.utils.webdriver.CookieSettingDriver")
    mock_binary_paths = mocker.patch("selenium.webdriver.common.selenium_manager.SeleniumManager.binary_paths")
-    mock_is_file = mocker.patch("pathlib.Path.is_file", return_value=True)
+    mocker.patch("pathlib.Path.is_file", return_value=True)
    mock_popen = mocker.patch("subprocess.Popen")
-    mock_is_connectable = mocker.patch("selenium.webdriver.common.service.Service.is_connectable", return_value=True)
+    mocker.patch("selenium.webdriver.common.service.Service.is_connectable", return_value=True)
    mock_firefox_options = mocker.patch("selenium.webdriver.FirefoxOptions")
+
    # Define side effect for `shutil.which`
    def mock_which_side_effect(dep):
        return "/mock/geckodriver" if dep == "geckodriver" else None
+
    mock_which.side_effect = mock_which_side_effect

    # Mock binary paths
@@ -83,8 +85,8 @@ def test_enrich_adds_screenshot(
    mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env
    screenshot_enricher.enrich(metadata_with_video)
    mock_driver_class.assert_called_once_with(
-        cookies=None,
-        cookiejar=None,
+        cookie=None,
+        cookie_jar=None,
        facebook_accept_cookies=False,
        options=mock_options_instance,
    )
@@ -104,13 +106,7 @@ def test_enrich_adds_screenshot(
    ],
 )
 def test_enrich_auth_wall(
-    screenshot_enricher,
-    metadata_with_video,
-    mock_selenium_env,
-    common_patches,
-    url,
-    is_auth,
-    mocker
+    screenshot_enricher, metadata_with_video, mock_selenium_env, common_patches, url, is_auth, mocker
 ):
    # Testing with and without is_auth_wall
    mock_driver, mock_driver_class, _ = mock_selenium_env
@@ -128,9 +124,39 @@ def test_enrich_auth_wall(
        assert metadata_with_video.media[1].properties.get("id") == "screenshot"


-def test_handle_timeout_exception(
-    screenshot_enricher, metadata_with_video, mock_selenium_env, mocker
-):
+def test_skip_authwall_no_cookies(screenshot_enricher, caplog):
+    with caplog.at_level("WARNING"):  # capture WARNING and above for the assertion below
+        screenshot_enricher.enrich(Metadata().set_url("https://instagram.com"))  # presumably no auth set by the fixture
+    assert "[SKIP] SCREENSHOT since url" in caplog.text
+
+
+@pytest.mark.parametrize(
+    "auth",
+    [
+        {"cookie": "cookie"},
+        {"cookies_jar": "cookie"},
+    ],
+)
+def test_dont_skip_authwall_with_cookies(screenshot_enricher, caplog, mocker, mock_selenium_env, auth):
+    """With either cookie-style auth entry set for the site, an auth-walled URL is not skipped"""
+    mocker.patch("auto_archiver.utils.url.is_auth_wall", return_value=True)
+    # Swap in an authentication mapping holding the parametrized entry for example.com
+    screenshot_enricher.authentication = {"example.com": auth}
+    with caplog.at_level("WARNING"):
+        screenshot_enricher.enrich(Metadata().set_url("https://example.com"))
+    assert "[SKIP] SCREENSHOT since url" not in caplog.text
+
+
+def test_show_warning_wrong_auth_type(screenshot_enricher, caplog, mocker, mock_selenium_env):
+    """Username/password auth on an auth-walled URL must log the cookie-only warning"""
+    mocker.patch("auto_archiver.utils.url.is_auth_wall", return_value=True)
+    screenshot_enricher.authentication = {"example.com": {"username": "user", "password": "pass"}}
+    with caplog.at_level("WARNING"):
+        screenshot_enricher.enrich(Metadata().set_url("https://example.com"))
+    assert "Screenshot enricher only supports cookie-type authentication" in caplog.text
+
+
+def test_handle_timeout_exception(screenshot_enricher, metadata_with_video, mock_selenium_env, mocker):
    mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env

    mock_driver.get.side_effect = TimeoutException
@@ -140,9 +166,7 @@ def test_handle_timeout_exception(
    assert len(metadata_with_video.media) == 1


-def test_handle_general_exception(
-    screenshot_enricher, metadata_with_video, mock_selenium_env, mocker
-):
+def test_handle_general_exception(screenshot_enricher, metadata_with_video, mock_selenium_env, mocker):
    """Test proper handling of unexpected general exceptions"""
    mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env
    # Simulate a generic exception when save_screenshot is called
@@ -152,9 +176,7 @@ def test_handle_general_exception(
    mock_log = mocker.patch("loguru.logger.error")
    screenshot_enricher.enrich(metadata_with_video)
    # Verify that the exception was logged with the log
-    mock_log.assert_called_once_with(
-        "Got error while loading webdriver for screenshot enricher: Unexpected Error"
-    )
+    mock_log.assert_called_once_with("Got error while loading webdriver for screenshot enricher: Unexpected Error")
    # And no new media was added due to the error
    assert len(metadata_with_video.media) == 1

@@ -167,13 +189,12 @@ def test_pdf_creation(mocker, screenshot_enricher, metadata_with_video, mock_sel
    # Mock the print_page method to return base64-encoded content
    mock_driver.print_page.return_value = base64.b64encode(b"fake_pdf_content").decode("utf-8")
    # Patch functions with mocker
-    mock_os_path_join = mocker.patch("os.path.join", side_effect=lambda *args: f"{args[-1]}")
-    mock_random_str = mocker.patch(
+    mocker.patch("os.path.join", side_effect=lambda *args: f"{args[-1]}")
+    mocker.patch(
        "auto_archiver.modules.screenshot_enricher.screenshot_enricher.random_str",
        return_value="fixed123",
    )
    mock_open = mocker.patch("builtins.open", new_callable=mocker.mock_open)
-    mock_log_error = mocker.patch("loguru.logger.error")

    screenshot_enricher.enrich(metadata_with_video)
    # Verify screenshot and PDF creation
--- a/tests/enrichers/test_ssl_enricher.py
+++ b/tests/enrichers/test_ssl_enricher.py
@@ -51,4 +51,3 @@ def test_ssl_error_handling(enricher, metadata, mocker):
    mocker.patch("ssl.get_server_certificate", side_effect=ssl.SSLError("SSL error"))
    with pytest.raises(ssl.SSLError, match="SSL error"):
        enricher.enrich(metadata)
-
--- a/tests/enrichers/test_thumbnail_enricher.py
+++ b/tests/enrichers/test_thumbnail_enricher.py
@@ -25,7 +25,7 @@ def mock_ffmpeg_environment(mocker):
    # Mocking all the ffmpeg calls in one place
    mock_ffmpeg_input = mocker.patch("ffmpeg.input")
    mock_makedirs = mocker.patch("os.makedirs")
-    mocker.patch.object(Media, "is_video", return_value=True),
+    mocker.patch.object(Media, "is_video", return_value=True)  # patched on the class: is_video() returns True
    mock_probe = mocker.patch(
        "ffmpeg.probe",
        return_value={
@@ -35,9 +35,7 @@ def mock_ffmpeg_environment(mocker):
        },
    )
    mock_output = mocker.MagicMock()
-    mock_ffmpeg_input.return_value.filter.return_value.output.return_value = (
-        mock_output
-    )
+    mock_ffmpeg_input.return_value.filter.return_value.output.return_value = mock_output

    return {
        "mock_ffmpeg_input": mock_ffmpeg_input,
@@ -47,14 +45,21 @@ def mock_ffmpeg_environment(mocker):
    }


-@pytest.mark.parametrize("thumbnails_per_minute, max_thumbnails, expected_count", [
-    (10, 5, 5),  # Capped at max_thumbnails
-    (1, 10, 2),  # Less than max_thumbnails
-    (60, 7, 7),  # Matches exactly
-])
+@pytest.mark.parametrize(
+    "thumbnails_per_minute, max_thumbnails, expected_count",
+    [
+        (10, 5, 5),  # Capped at max_thumbnails
+        (1, 10, 2),  # Less than max_thumbnails
+        (60, 7, 7),  # Matches exactly
+    ],
+)
 def test_enrich_thumbnail_limits(
-    thumbnail_enricher, metadata_with_video, mock_ffmpeg_environment,
-    thumbnails_per_minute, max_thumbnails, expected_count
+    thumbnail_enricher,
+    metadata_with_video,
+    mock_ffmpeg_environment,
+    thumbnails_per_minute,
+    max_thumbnails,
+    expected_count,
 ):
    thumbnail_enricher.thumbnails_per_minute = thumbnails_per_minute
    thumbnail_enricher.max_thumbnails = max_thumbnails
@@ -65,8 +70,8 @@ def test_enrich_thumbnail_limits(
    thumbnails = metadata_with_video.media[0].get("thumbnails")
    assert len(thumbnails) == expected_count

-def test_enrich_handles_probe_failure(thumbnail_enricher, metadata_with_video, mocker):

+def test_enrich_handles_probe_failure(thumbnail_enricher, metadata_with_video, mocker):
    mocker.patch("ffmpeg.probe", side_effect=Exception("Probe error"))
    mocker.patch("os.makedirs")
    mock_logger = mocker.patch("loguru.logger.error")
@@ -74,36 +79,43 @@ def test_enrich_handles_probe_failure(thumbnail_enricher, metadata_with_video, m

    thumbnail_enricher.enrich(metadata_with_video)
    # Ensure error was logged
-    mock_logger.assert_called_with(
-        f"error getting duration of video video.mp4: Probe error"
-    )
+    mock_logger.assert_called_with("error getting duration of video video.mp4: Probe error")
    # Ensure no thumbnails were created
    thumbnails = metadata_with_video.media[0].get("thumbnails")
    assert thumbnails is None


 def test_enrich_skips_non_video_files(thumbnail_enricher, metadata_with_video, mocker):
-        mocker.patch.object(Media, "is_video", return_value=False)
-        mock_ffmpeg = mocker.patch("ffmpeg.input")
-        thumbnail_enricher.enrich(metadata_with_video)
-        mock_ffmpeg.assert_not_called()
+    mocker.patch.object(Media, "is_video", return_value=False)
+    mock_ffmpeg = mocker.patch("ffmpeg.input")
+    thumbnail_enricher.enrich(metadata_with_video)
+    mock_ffmpeg.assert_not_called()


-@pytest.mark.parametrize("thumbnails_per_minute,max_thumbnails,expected_count", [
-    (60, 5, 5), # caught by max
-    (60, 20, 10), # caught by t/min
-    (0, 20, 1), # test min caught (1)
-    (11, 20, 1), # test min caught (1)
-    (12, 20, 2), # test caught by t/min
-])
+@pytest.mark.parametrize(
+    "thumbnails_per_minute,max_thumbnails,expected_count",
+    [
+        (60, 5, 5),  # caught by max
+        (60, 20, 10),  # caught by t/min
+        (0, 20, 1),  # test min caught (1)
+        (11, 20, 1),  # test min caught (1)
+        (12, 20, 2),  # test caught by t/min
+    ],
+)
 def test_enrich_handles_short_video(
-    thumbnail_enricher, metadata_with_video, mock_ffmpeg_environment, thumbnails_per_minute, max_thumbnails, expected_count, mocker
+    thumbnail_enricher,
+    metadata_with_video,
+    mock_ffmpeg_environment,
+    thumbnails_per_minute,
+    max_thumbnails,
+    expected_count,
+    mocker,
 ):
    # override mock duration
    fake_duration = 10
    mocker.patch(
        "ffmpeg.probe",
-        return_value={ "streams": [{"codec_type": "video", "duration": str(fake_duration)}]},
+        return_value={"streams": [{"codec_type": "video", "duration": str(fake_duration)}]},
    )
    thumbnail_enricher.thumbnails_per_minute = thumbnails_per_minute
    thumbnail_enricher.max_thumbnails = max_thumbnails
@@ -114,9 +126,7 @@ def test_enrich_handles_short_video(
    assert len(thumbnails) == expected_count


-def test_uses_existing_duration(
-    thumbnail_enricher, metadata_with_video, mock_ffmpeg_environment
-):
+def test_uses_existing_duration(thumbnail_enricher, metadata_with_video, mock_ffmpeg_environment):
    metadata_with_video.media[0].set("duration", 60)
    thumbnail_enricher.enrich(metadata_with_video)
    mock_ffmpeg_environment["mock_probe"].assert_not_called()
@@ -125,7 +135,7 @@ def test_uses_existing_duration(

 def test_enrich_metadata_structure(thumbnail_enricher, metadata_with_video, mock_ffmpeg_environment, mocker):
    fake_duration = 120
-    mocker.patch("ffmpeg.probe", return_value={'streams': [{'codec_type': 'video', 'duration': str(fake_duration)}]})
+    mocker.patch("ffmpeg.probe", return_value={"streams": [{"codec_type": "video", "duration": str(fake_duration)}]})
    thumbnail_enricher.thumbnails_per_minute = 2
    thumbnail_enricher.max_thumbnails = 4

--- a/tests/enrichers/test_wacz_enricher.py
+++ b/tests/enrichers/test_wacz_enricher.py
@@ -4,6 +4,7 @@ from zipfile import ZipFile
 import pytest

 from auto_archiver.core import Metadata, Media
+from auto_archiver.core.consts import SetupError


@pytest.fixture
@@ -22,6 +23,15 @@ def wacz_enricher(setup_module, mock_binary_dependencies):
    return wacz


+def test_raises_error_without_docker_installed(setup_module, mocker, caplog):
+    """Setting up the WACZ module raises SetupError when shutil.which finds no executable"""
+    mocker.patch("shutil.which", return_value=None)  # pretend that docker isn't installed
+    with pytest.raises(SetupError):
+        setup_module("wacz_extractor_enricher", {})
+
+    assert "requires external dependency 'docker' which is not available/setup" in caplog.text
+
+
 def test_setup_without_docker(wacz_enricher, mocker):
    mocker.patch.dict(os.environ, {"RUNNING_IN_DOCKER": "1"}, clear=True)
    wacz_enricher.setup()
--- a/tests/enrichers/test_wayback_enricher.py
+++ b/tests/enrichers/test_wayback_enricher.py
@@ -5,37 +5,52 @@ from auto_archiver.modules.wayback_extractor_enricher import WaybackExtractorEnr
 from auto_archiver.core import Metadata


+@pytest.fixture(autouse=True)
+def mock_sleep(mocker):
+    """Autouse: patch time.sleep with a mock for every test in this module and return that mock."""
+    return mocker.patch("time.sleep")
+
+
@pytest.fixture
 def mock_is_auth_wall(mocker):
    """Fixture to mock is_auth_wall behavior."""
+
    def _mock_is_auth_wall(return_value: bool):
        return mocker.patch("auto_archiver.utils.url.is_auth_wall", return_value=return_value)
+
    return _mock_is_auth_wall

+
@pytest.fixture
 def mock_post_success(mocker):
    """Fixture to mock POST requests with a successful response."""
+
    def _mock_post(json_data: dict = None, status_code: int = 200):
-        json_data = json_data or {"job_id": "job123"}
+        json_data = {"job_id": "job123"} if json_data is None else json_data
        resp = mocker.Mock(status_code=status_code)
        resp.json.return_value = json_data
        return mocker.patch("requests.post", return_value=resp)
+
    return _mock_post

+
@pytest.fixture
 def mock_get_success(mocker):
    """Fixture to mock GET requests returning a completed archive status."""
+
    def _mock_get(json_data: dict = None, status_code: int = 200):
        json_data = json_data or {
            "status": "success",
            "timestamp": "20250101010101",
-            "original_url": "https://example.com"
+            "original_url": "https://example.com",
        }
        resp = mocker.Mock(status_code=status_code)
        resp.json.return_value = json_data
        return mocker.patch("requests.get", return_value=resp)
+
    return _mock_get

+
@pytest.fixture
 def wayback_extractor_enricher(setup_module) -> WaybackExtractorEnricher:
    configs: dict = {
@@ -49,12 +64,7 @@ def wayback_extractor_enricher(setup_module) -> WaybackExtractorEnricher:
    return setup_module("wayback_extractor_enricher", configs)


-def test_download_success(
-    wayback_extractor_enricher,
-    mock_is_auth_wall,
-    mock_post_success,
-    mock_get_success
-):
+def test_download_success(wayback_extractor_enricher, mock_is_auth_wall, mock_post_success, mock_get_success):
    mock_is_auth_wall(False)
    mock_post_success()
    mock_get_success()
@@ -63,34 +73,28 @@ def test_download_success(
    result = wayback_extractor_enricher.download(metadata)
    assert result.get("wayback") == "https://web.archive.org/web/20250101010101/https://example.com"

+
 def test_enrich_auth_wall(wayback_extractor_enricher, metadata, mock_is_auth_wall):
    mock_is_auth_wall(True)
    result = wayback_extractor_enricher.enrich(metadata)
    assert result is None

+
 def test_enrich_already_enriched(wayback_extractor_enricher, metadata):
    metadata.set("wayback", "existing")
    result = wayback_extractor_enricher.enrich(metadata)
    assert result is True

-def test_enrich_post_failure(
-    wayback_extractor_enricher,
-        metadata,
-    mock_is_auth_wall,
-    mock_post_success
-):
+
+def test_enrich_post_failure(wayback_extractor_enricher, metadata, mock_is_auth_wall, mock_post_success):
    mock_is_auth_wall(False)
    mock_post_success(json_data={"error": "server error"}, status_code=500)
    result = wayback_extractor_enricher.enrich(metadata)
    assert result is False
    assert "Internet archive failed with status of 500" in metadata.get("wayback")

-def test_enrich_post_json_decode_error(
-    wayback_extractor_enricher,
-        metadata,
-    mock_is_auth_wall,
-    mocker
-):
+
+def test_enrich_post_json_decode_error(wayback_extractor_enricher, metadata, mock_is_auth_wall, mocker):
    mock_is_auth_wall(False)
    resp = mocker.Mock(status_code=200)
    resp.json.side_effect = json.decoder.JSONDecodeError("msg", "doc", 0)
@@ -98,22 +102,15 @@ def test_enrich_post_json_decode_error(
    mocker.patch("requests.post", return_value=resp)
    assert wayback_extractor_enricher.enrich(metadata) is False

-def test_enrich_no_job_id(
-    wayback_extractor_enricher,
-        metadata,
-    mock_is_auth_wall,
-    mock_post_success
-):
+
+def test_enrich_no_job_id(wayback_extractor_enricher, metadata, mock_is_auth_wall, mock_post_success):
    mock_is_auth_wall(False)
    mock_post_success(json_data={})
    assert wayback_extractor_enricher.enrich(metadata) is False

+
 def test_enrich_get_success(
-    wayback_extractor_enricher,
-        metadata,
-    mock_is_auth_wall,
-    mock_post_success,
-    mock_get_success
+    wayback_extractor_enricher, metadata, mock_is_auth_wall, mock_post_success, mock_get_success
 ):
    mock_is_auth_wall(False)
    mock_post_success()
@@ -122,24 +119,18 @@ def test_enrich_get_success(
    assert metadata.get("wayback") == "https://web.archive.org/web/20250101010101/https://example.com"
    assert metadata.get("check wayback") == "https://web.archive.org/web/*/https://example.com"

+
 def test_enrich_get_failure(
-    wayback_extractor_enricher,
-        metadata,
-    mock_is_auth_wall,
-    mock_post_success,
-    mock_get_success
+    wayback_extractor_enricher, metadata, mock_is_auth_wall, mock_post_success, mock_get_success
 ):
    mock_is_auth_wall(False)
    mock_post_success()
    mock_get_success(json_data={"status": "failed"}, status_code=400)
    assert wayback_extractor_enricher.enrich(metadata) is False

+
 def test_enrich_get_request_exception(
-    wayback_extractor_enricher,
-        metadata,
-    mock_is_auth_wall,
-    mock_post_success,
-    mocker
+    wayback_extractor_enricher, metadata, mock_is_auth_wall, mock_post_success, mocker
 ):
    mock_is_auth_wall(False)
    mock_post_success()
@@ -149,12 +140,9 @@ def test_enrich_get_request_exception(
    assert wayback_extractor_enricher.enrich(metadata) is True
    assert metadata.get("wayback").get("job_id") == "job123"

+
 def test_enrich_get_json_decode_error(
-    wayback_extractor_enricher,
-        metadata,
-    mock_is_auth_wall,
-    mock_post_success,
-    mocker
+    wayback_extractor_enricher, metadata, mock_is_auth_wall, mock_post_success, mocker
 ):
    mock_is_auth_wall(False)
    mock_post_success()
--- a/tests/enrichers/test_whisper_enricher.py
+++ b/tests/enrichers/test_whisper_enricher.py
@@ -7,6 +7,12 @@ from auto_archiver.modules.whisper_enricher import WhisperEnricher
 TEST_S3_URL = "http://cdn.example.com/test.mp4"


+@pytest.fixture(autouse=True)
+def mock_sleep(mocker):
+    """Autouse: patch time.sleep with a mock for every test in this module and return that mock."""
+    return mocker.patch("time.sleep")
+
+
@pytest.fixture
 def enricher(mocker):
    """Fixture with mocked S3 and API dependencies"""
@@ -16,7 +22,7 @@ def enricher(mocker):
        "include_srt": False,
        "timeout": 5,
        "action": "translate",
-        "steps": {"storages": ["s3_storage"]}
+        "steps": {"storages": ["s3_storage"]},
    }
    mock_s3 = mocker.MagicMock(spec=S3Storage)
    mock_s3.get_cdn_url.return_value = TEST_S3_URL
@@ -25,7 +31,7 @@ def enricher(mocker):
    instance.display_name = "Whisper Enricher"
    instance.config_setup({instance.name: config})
    # bypassing the setup method and mocking S3 setup
-    instance.stores = config['steps']['storages']
+    instance.stores = config["steps"]["storages"]
    instance.s3 = mock_s3
    yield instance, mock_s3

@@ -63,19 +69,14 @@ def test_successful_job_submission(enricher, metadata, mock_requests, mocker):
    # Mock the complete API interaction chain
    mock_status_response = mocker.MagicMock()
    mock_status_response.status_code = 200
-    mock_status_response.json.return_value = {
-        "status": "success",
-        "meta": {}
-    }
+    mock_status_response.json.return_value = {"status": "success", "meta": {}}
    mock_artifacts_response = mocker.MagicMock()
    mock_artifacts_response.status_code = 200
-    mock_artifacts_response.json.return_value = [{
-        "data": [{"start": 0, "end": 5, "text": "test transcript"}]
-    }]
+    mock_artifacts_response.json.return_value = [{"data": [{"start": 0, "end": 5, "text": "test transcript"}]}]
    # Set up mock response sequence
    mock_requests.get.side_effect = [
        mock_status_response,  # First call: status check
-        mock_artifacts_response  # Second call: artifacts check
+        mock_artifacts_response,  # Second call: artifacts check
    ]

    # Run enrichment (without opening file)
@@ -84,15 +85,17 @@ def test_successful_job_submission(enricher, metadata, mock_requests, mocker):
    mock_requests.post.assert_called_once_with(
        "http://testapi/jobs",
        json={"url": "http://cdn.example.com/test.mp4", "type": "translate"},
-        headers={"Authorization": "Bearer whisper-key"}
+        headers={"Authorization": "Bearer whisper-key"},
    )
    # Verify job status checks
    assert mock_requests.get.call_count == 2
    assert "artifact_0_text" in metadata.media[0].get("whisper_model")
-    assert metadata.media[0].get("whisper_model") == {'artifact_0_text': 'test transcript',
-                                                      'job_artifacts_check': 'http://testapi/jobs/job123/artifacts',
-                                                      'job_id': 'job123',
-                                                      'job_status_check': 'http://testapi/jobs/job123'}
+    assert metadata.media[0].get("whisper_model") == {
+        "artifact_0_text": "test transcript",
+        "job_artifacts_check": "http://testapi/jobs/job123/artifacts",
+        "job_id": "job123",
+        "job_status_check": "http://testapi/jobs/job123",
+    }


 def test_submit_job(enricher, mocker):
--- a/tests/extractors/test_extractor_base.py
+++ b/tests/extractors/test_extractor_base.py
@@ -7,7 +7,6 @@ from auto_archiver.core.extractor import Extractor


 class TestExtractorBase(object):
-
    extractor_module: str = None
    config: dict = None

@@ -17,7 +16,7 @@ class TestExtractorBase(object):
        assert self.config is not None, "self.config must be a dict set on the subclass"

        self.extractor: Type[Extractor] = setup_module(self.extractor_module, self.config)
-    
+
    def assertValidResponseMetadata(self, test_response: Metadata, title: str, timestamp: str, status: str = ""):
        assert test_response is not False

--- a/tests/extractors/test_generic_extractor.py
+++ b/tests/extractors/test_generic_extractor.py
@@ -9,26 +9,28 @@ import pytest
 from auto_archiver.modules.generic_extractor.generic_extractor import GenericExtractor
 from .test_extractor_base import TestExtractorBase

-CI=os.getenv("GITHUB_ACTIONS", '') == 'true'
+CI = os.getenv("GITHUB_ACTIONS", "") == "true"
+
+
 class TestGenericExtractor(TestExtractorBase):
-    """Tests Generic Extractor
-    """
-    extractor_module = 'generic_extractor'
+    """Tests Generic Extractor"""
+
+    extractor_module = "generic_extractor"
    extractor: GenericExtractor

    config = {
-        'subtitles': False,
-        'comments': False,
-        'livestreams': False,
-        'live_from_start': False,
-        'end_means_success': True,
-        'allow_playlist': False,
-        'max_downloads': "inf",
-        'proxy': None,
-        'cookies_from_browser': False,
-        'cookie_file': None,
-        }
-    
+        "subtitles": False,
+        "comments": False,
+        "livestreams": False,
+        "live_from_start": False,
+        "end_means_success": True,
+        "allow_playlist": False,
+        "max_downloads": "inf",
+        "proxy": None,
+        "cookies_from_browser": False,
+        "cookie_file": None,
+    }
+
    def test_load_dropin(self):
        # test loading dropins that are in the generic_archiver package
        package = "auto_archiver.modules.generic_extractor"
@@ -38,21 +40,42 @@ class TestGenericExtractor(TestExtractorBase):
        path = os.path.join(dirname(dirname(__file__)), "data/")
        assert self.extractor.dropin_for_name("dropin", additional_paths=[path])

+    @pytest.mark.parametrize(
+        "url, suitable_extractors",
+        [
+            ("https://www.youtube.com/watch?v=5qap5aO4i9A", ["youtube"]),
+            ("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", ["tiktok"]),
+            ("https://www.instagram.com/p/CU1J9JYJ9Zz/", ["instagram"]),
+            ("https://www.facebook.com/nytimes/videos/10160796550110716", ["facebook"]),
+            ("https://www.facebook.com/BylineFest/photos/t.100057299682816/927879487315946/", ["facebook"]),
+        ],
+    )
+    def test_suitable_extractors(self, url, suitable_extractors):
+        suitable_extractors = suitable_extractors + ["generic"]  # the generic extractor is suitable for all URLs
+        extractors = list(self.extractor.suitable_extractors(url))
+        assert len(extractors) == len(suitable_extractors)
+        assert [e.ie_key().lower() for e in extractors] == suitable_extractors

-
-    @pytest.mark.parametrize("url, is_suitable", [
-        ("https://www.youtube.com/watch?v=5qap5aO4i9A", True),
-        ("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", True),
-        ("https://www.instagram.com/p/CU1J9JYJ9Zz/", True),
-        ("https://www.facebook.com/nytimes/videos/10160796550110716", True),
-        ("https://www.twitch.tv/videos/1167226570", True),
-        ("https://bellingcat.com/news/2021/10/08/ukrainian-soldiers-are-being-killed-by-landmines-in-the-donbas/", True),
-        ("https://google.com", True)])
-    def test_suitable_urls(self, make_item, url, is_suitable):
+    @pytest.mark.parametrize(
+        "url, is_suitable",
+        [
+            ("https://www.youtube.com/watch?v=5qap5aO4i9A", True),
+            ("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", True),
+            ("https://www.instagram.com/p/CU1J9JYJ9Zz/", True),
+            ("https://www.facebook.com/nytimes/videos/10160796550110716", True),
+            ("https://www.twitch.tv/videos/1167226570", True),
+            (
+                "https://bellingcat.com/news/2021/10/08/ukrainian-soldiers-are-being-killed-by-landmines-in-the-donbas/",
+                True,
+            ),
+            ("https://google.com", True),
+        ],
+    )
+    def test_suitable_urls(self, url, is_suitable):
        """
-            Note: expected behaviour is to return True for all URLs, as YoutubeDLArchiver should be able to handle all URLs
-            This behaviour may be changed in the future (e.g. if we want the youtubedl archiver to just handle URLs it has extractors for,
-            and then if and only if all archivers fails, does it fall back to the generic archiver)
+        Note: expected behaviour is to return True for all URLs, as YoutubeDLArchiver should be able to handle all URLs
+        This behaviour may be changed in the future (e.g. if we want the youtubedl archiver to just handle URLs it has extractors for,
+        and then if and only if all archivers fail, does it fall back to the generic archiver)
        """
        assert self.extractor.suitable(url) == is_suitable

@@ -63,11 +86,14 @@ class TestGenericExtractor(TestExtractorBase):
        assert result.get_url() == "https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970"

    @pytest.mark.download
-    @pytest.mark.parametrize("url", [
-        "https://bsky.app/profile/colborne.bsky.social/post/3lcxcpgt6j42l",
-        "twitter.com/bellingcat/status/123",
-        "https://www.youtube.com/watch?v=1"
-    ])
+    @pytest.mark.parametrize(
+        "url",
+        [
+            "https://bsky.app/profile/colborne.bsky.social/post/3lcxcpgt6j42l",
+            "twitter.com/bellingcat/status/123",
+            "https://www.youtube.com/watch?v=1",
+        ],
+    )
    def test_download_nonexistent_media(self, make_item, url):
        """
        Test to make sure that the extractor doesn't break on non-existend posts/media
@@ -78,7 +104,10 @@ class TestGenericExtractor(TestExtractorBase):
        result = self.extractor.download(item)
        assert not result

-    @pytest.mark.skipif(CI, reason="Currently no way to authenticate when on CI. Youtube (yt-dlp) doesn't support logging in with username/password.")
+    @pytest.mark.skipif(
+        CI,
+        reason="Currently no way to authenticate when on CI. Youtube (yt-dlp) doesn't support logging in with username/password.",
+    )
    @pytest.mark.download
    def test_youtube_download(self, make_item):
        # url https://www.youtube.com/watch?v=5qap5aO4i9A
@@ -87,7 +116,10 @@ class TestGenericExtractor(TestExtractorBase):
        result = self.extractor.download(item)
        assert result.get_url() == "https://www.youtube.com/watch?v=J---aiyznGQ"
        assert result.get_title() == "Keyboard Cat! - THE ORIGINAL!"
-        assert result.get('description') == "Buy NEW Keyboard Cat Merch! https://keyboardcat.creator-spring.com\n\nxo Keyboard Cat memes make your day better!\nhttp://www.keyboardcatstore.com/\nhttps://www.facebook.com/thekeyboardcat\nhttp://www.charlieschmidt.com/"
+        assert (
+            result.get("description")
+            == "Buy NEW Keyboard Cat Merch! https://keyboardcat.creator-spring.com\n\nxo Keyboard Cat memes make your day better!\nhttp://www.keyboardcatstore.com/\nhttps://www.facebook.com/thekeyboardcat\nhttp://www.charlieschmidt.com/"
+        )
        assert len(result.media) == 2
        assert Path(result.media[0].filename).name == "J---aiyznGQ.webm"
        assert Path(result.media[1].filename).name == "hqdefault.jpg"
@@ -103,7 +135,7 @@ class TestGenericExtractor(TestExtractorBase):
        item = make_item("https://bsky.app/profile/bellingcat.com/post/3lfn3hbcxgc2q")
        result = self.extractor.download(item)
        assert result is not False
-    
+
    @pytest.mark.download
    def test_bluesky_download_no_media(self, make_item):
        item = make_item("https://bsky.app/profile/bellingcat.com/post/3lfphwmcs4c2z")
@@ -115,7 +147,7 @@ class TestGenericExtractor(TestExtractorBase):
        item = make_item("https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i")
        result = self.extractor.download(item)
        assert result is not False
-    
+
    @pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
    @pytest.mark.download
    def test_truthsocial_download_video(self, make_item):
@@ -130,14 +162,14 @@ class TestGenericExtractor(TestExtractorBase):
        item = make_item("https://truthsocial.com/@bbcnewa/posts/109598702184774628")
        result = self.extractor.download(item)
        assert result is not False
-    
+
    @pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
    @pytest.mark.download
    def test_truthsocial_download_poll(self, make_item):
        item = make_item("https://truthsocial.com/@CNN_US/posts/113724326568555098")
        result = self.extractor.download(item)
        assert result is not False
-    
+
    @pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
    @pytest.mark.download
    def test_truthsocial_download_single_image(self, make_item):
@@ -159,7 +191,7 @@ class TestGenericExtractor(TestExtractorBase):
        url = "https://x.com/Bellingcat/status/17197025860711058"
        response = self.extractor.download(make_item(url))
        assert not response
-    
+
    @pytest.mark.download
    def test_twitter_download_malformed_tweetid(self, make_item):
        # this tweet does not exist
@@ -169,17 +201,17 @@ class TestGenericExtractor(TestExtractorBase):

    @pytest.mark.download
    def test_twitter_download_tweet_no_media(self, make_item):
-        
        item = make_item("https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w")
        post = self.extractor.download(item)

        self.assertValidResponseMetadata(
            post,
-            "Onion rings are just vegetable donuts.",
+            "Cookie Monster - Onion rings are just vegetable donuts.",
            datetime.datetime(2023, 1, 24, 16, 25, 51, tzinfo=datetime.timezone.utc),
-            "yt-dlp_Twitter: success"
+            "yt-dlp_Twitter: success",
        )
-    
+        assert post.get("content") == "Onion rings are just vegetable donuts."
+
    @pytest.mark.download
    def test_twitter_download_video(self, make_item):
        url = "https://x.com/bellingcat/status/1871552600346415571"
@@ -187,26 +219,75 @@ class TestGenericExtractor(TestExtractorBase):
        self.assertValidResponseMetadata(
            post,
            "Bellingcat - This month's Bellingchat Premium is with @KolinaKoltai. She reveals how she investigated a platform allowing users to create AI-generated child sexual abuse material and explains why it's crucial to investigate the people behind these services",
-            datetime.datetime(2024, 12, 24, 13, 44, 46, tzinfo=datetime.timezone.utc)
+            datetime.datetime(2024, 12, 24, 13, 44, 46, tzinfo=datetime.timezone.utc),
        )

-    @pytest.mark.xfail(reason="Currently failing, sensitive content requires logged in users/cookies - not yet implemented")
+    @pytest.mark.xfail(
+        reason="Currently failing, sensitive content requires logged in users/cookies - not yet implemented"
+    )
    @pytest.mark.download
-    @pytest.mark.parametrize("url, title, timestamp, image_hash", [
-            ("https://x.com/SozinhoRamalho/status/1876710769913450647", "ignore tweet, testing sensitivity warning nudity", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
-            ("https://x.com/SozinhoRamalho/status/1876710875475681357", "ignore tweet, testing sensitivity warning violence", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
-            ("https://x.com/SozinhoRamalho/status/1876711053813227618", "ignore tweet, testing sensitivity warning sensitive", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
-            ("https://x.com/SozinhoRamalho/status/1876711141314801937", "ignore tweet, testing sensitivity warning nudity, violence, sensitivity", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
-        ])
+    @pytest.mark.parametrize(
+        "url, title, timestamp, image_hash",
+        [
+            (
+                "https://x.com/SozinhoRamalho/status/1876710769913450647",
+                "ignore tweet, testing sensitivity warning nudity",
+                datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
+                "image_hash",
+            ),
+            (
+                "https://x.com/SozinhoRamalho/status/1876710875475681357",
+                "ignore tweet, testing sensitivity warning violence",
+                datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
+                "image_hash",
+            ),
+            (
+                "https://x.com/SozinhoRamalho/status/1876711053813227618",
+                "ignore tweet, testing sensitivity warning sensitive",
+                datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
+                "image_hash",
+            ),
+            (
+                "https://x.com/SozinhoRamalho/status/1876711141314801937",
+                "ignore tweet, testing sensitivity warning nudity, violence, sensitivity",
+                datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
+                "image_hash",
+            ),
+        ],
+    )
    def test_twitter_download_sensitive_media(self, url, title, timestamp, image_hash, make_item):
-
        """Download tweets with sensitive media"""

        post = self.extractor.download(make_item(url))
-        self.assertValidResponseMetadata(
-            post,
-            title,
-            timestamp
-        )
+        self.assertValidResponseMetadata(post, title, timestamp)
        assert len(post.media) == 1
-        assert post.media[0].hash == image_hash
+        assert post.media[0].hash == image_hash
+
+    @pytest.mark.download
+    def test_download_facebook_video(self, make_item):
+        post = self.extractor.download(make_item("https://www.facebook.com/bellingcat/videos/588371253839133"))
+        assert len(post.media) == 2
+        assert post.media[0].filename.endswith("588371253839133.mp4")
+        assert post.media[0].mimetype == "video/mp4"
+
+        assert post.media[1].filename.endswith(".jpg")
+        assert post.media[1].mimetype == "image/jpeg"
+
+        assert "Bellingchat Premium is with Kolina Koltai" in post.get_title()
+
+    @pytest.mark.download
+    def test_download_facebook_image(self, make_item):
+        post = self.extractor.download(
+            make_item("https://www.facebook.com/BylineFest/photos/t.100057299682816/927879487315946/")
+        )
+
+        assert len(post.media) == 1
+        assert post.media[0].filename.endswith(".png")
+        assert "Byline Festival - BylineFest Partner" == post.get_title()
+
+    @pytest.mark.download
+    def test_download_facebook_text_only(self, make_item):
+        url = "https://www.facebook.com/bellingcat/posts/pfbid02rzpwZxAZ8bLkAX8NvHv4DWAidFaqAUfJMbo9vWkpwxL7uMUWzWMiizXLWRSjwihVl"
+        post = self.extractor.download(make_item(url))
+        assert "Bellingcat researcher Kolina Koltai delves deeper into Clothoff" in post.get("content")
+        assert post.get_title() == "Bellingcat"
--- a/tests/extractors/test_instagram_api_extractor.py
+++ b/tests/extractors/test_instagram_api_extractor.py
@@ -15,10 +15,11 @@ def mock_user_response():
            "username": "test_user",
            "full_name": "Test User",
            "profile_pic_url_hd": "http://example.com/profile.jpg",
-            "profile_pic_url": "http://example.com/profile_lowres.jpg"
+            "profile_pic_url": "http://example.com/profile_lowres.jpg",
        }
    }

+
@pytest.fixture
 def mock_post_response():
    return {
@@ -27,16 +28,14 @@ def mock_post_response():
        "caption_text": "Test Caption",
        "taken_at": datetime.now().timestamp(),
        "video_url": "http://example.com/video.mp4",
-        "thumbnail_url": "http://example.com/thumbnail.jpg"
+        "thumbnail_url": "http://example.com/thumbnail.jpg",
    }

+
@pytest.fixture
 def mock_story_response():
-    return [{
-        "id": "story_123",
-        "taken_at": datetime.now().timestamp(),
-        "video_url": "http://example.com/story.mp4"
-    }]
+    return [{"id": "story_123", "taken_at": datetime.now().timestamp(), "video_url": "http://example.com/story.mp4"}]
+

@pytest.fixture
 def mock_highlight_response():
@@ -46,11 +45,13 @@ def mock_highlight_response():
                "highlight:123": {
                    "id": "123",
                    "title": "Test Highlight",
-                    "items": [{
-                        "id": "item_123",
-                        "taken_at": datetime.now().timestamp(),
-                        "video_url": "http://example.com/highlight.mp4"
-                    }]
+                    "items": [
+                        {
+                            "id": "item_123",
+                            "taken_at": datetime.now().timestamp(),
+                            "video_url": "http://example.com/highlight.mp4",
+                        }
+                    ],
                }
            }
        }
@@ -81,24 +82,30 @@ class TestInstagramAPIExtractor(TestExtractorBase):
        m.set("netloc", "instagram.com")
        return m

-    @pytest.mark.parametrize("url,expected", [
-        ("https://instagram.com/user", [("", "user", "")]),
-        ("https://instagr.am/p/post_id", []),
-        ("https://youtube.com", []),
-        ("https://www.instagram.com/reel/reel_id", [("reel", "reel_id", "")]),
-        ("https://instagram.com/stories/highlights/123", [("stories/highlights", "123", "")]),
-        ("https://instagram.com/stories/user/123", [("stories", "user", "123")]),
-    ])
+    @pytest.mark.parametrize(
+        "url,expected",
+        [
+            ("https://instagram.com/user", [("", "user", "")]),
+            ("https://instagr.am/p/post_id", []),
+            ("https://youtube.com", []),
+            ("https://www.instagram.com/reel/reel_id", [("reel", "reel_id", "")]),
+            ("https://instagram.com/stories/highlights/123", [("stories/highlights", "123", "")]),
+            ("https://instagram.com/stories/user/123", [("stories", "user", "123")]),
+        ],
+    )
    def test_url_parsing(self, url, expected):
        assert self.extractor.valid_url.findall(url) == expected

    def test_initialize(self):
        assert self.extractor.api_endpoint[-1] != "/"

-    @pytest.mark.parametrize("input_dict,expected", [
-        ({"x": 0, "valid": "data"}, {"valid": "data"}),
-        ({"nested": {"y": None, "valid": [{}]}}, {"nested": {"valid": [{}]}}),
-    ])
+    @pytest.mark.parametrize(
+        "input_dict,expected",
+        [
+            ({"x": 0, "valid": "data"}, {"valid": "data"}),
+            ({"nested": {"y": None, "valid": [{}]}}, {"nested": {"valid": [{}]}}),
+        ],
+    )
    def test_cleanup_dict(self, input_dict, expected):
        assert self.extractor.cleanup_dict(input_dict) == expected

@@ -114,8 +121,8 @@ class TestInstagramAPIExtractor(TestExtractorBase):

    def test_download_profile_basic(self, metadata, mock_user_response, mocker):
        """Test basic profile download without full_profile"""
-        mock_call = mocker.patch.object(self.extractor, 'call_api')
-        mock_download = mocker.patch.object(self.extractor, 'download_from_url')
+        mock_call = mocker.patch.object(self.extractor, "call_api")
+        mock_download = mocker.patch.object(self.extractor, "download_from_url")
        # Mock API responses
        mock_call.return_value = mock_user_response
        mock_download.return_value = "profile.jpg"
@@ -132,17 +139,14 @@ class TestInstagramAPIExtractor(TestExtractorBase):

    def test_download_profile_full(self, metadata, mock_user_response, mock_story_response, mocker):
        """Test full profile download with stories/posts"""
-        mock_call = mocker.patch.object(self.extractor, 'call_api')
-        mock_posts = mocker.patch.object(self.extractor, 'download_all_posts')
-        mock_highlights = mocker.patch.object(self.extractor, 'download_all_highlights')
-        mock_tagged = mocker.patch.object(self.extractor, 'download_all_tagged')
-        mock_stories = mocker.patch.object(self.extractor, '_download_stories_reusable')
+        mock_call = mocker.patch.object(self.extractor, "call_api")
+        mock_posts = mocker.patch.object(self.extractor, "download_all_posts")
+        mock_highlights = mocker.patch.object(self.extractor, "download_all_highlights")
+        mock_tagged = mocker.patch.object(self.extractor, "download_all_tagged")
+        mock_stories = mocker.patch.object(self.extractor, "_download_stories_reusable")

        self.extractor.full_profile = True
-        mock_call.side_effect = [
-            mock_user_response,
-            mock_story_response
-        ]
+        mock_call.side_effect = [mock_user_response, mock_story_response]
        mock_highlights.return_value = None
        mock_stories.return_value = mock_story_response
        mock_posts.return_value = None
@@ -155,7 +159,7 @@ class TestInstagramAPIExtractor(TestExtractorBase):

    def test_download_profile_not_found(self, metadata, mocker):
        """Test profile not found error"""
-        mock_call = mocker.patch.object(self.extractor, 'call_api')
+        mock_call = mocker.patch.object(self.extractor, "call_api")
        mock_call.return_value = {"user": None}
        with pytest.raises(AssertionError) as exc_info:
            self.extractor.download_profile(metadata, "invalid_user")
@@ -163,18 +167,14 @@ class TestInstagramAPIExtractor(TestExtractorBase):

    def test_download_profile_error_handling(self, metadata, mock_user_response, mocker):
        """Test error handling in full profile mode"""
-        mock_call = mocker.patch.object(self.extractor, 'call_api')
-        mock_highlights = mocker.patch.object(self.extractor, 'download_all_highlights')
-        mock_tagged = mocker.patch.object(self.extractor, 'download_all_tagged')
-        stories_tagged = mocker.patch.object(self.extractor, '_download_stories_reusable')
-        mock_posts = mocker.patch.object(self.extractor, 'download_all_posts')
+        mock_call = mocker.patch.object(self.extractor, "call_api")
+        mock_highlights = mocker.patch.object(self.extractor, "download_all_highlights")
+        mock_tagged = mocker.patch.object(self.extractor, "download_all_tagged")
+        stories_tagged = mocker.patch.object(self.extractor, "_download_stories_reusable")
+        mock_posts = mocker.patch.object(self.extractor, "download_all_posts")

        self.extractor.full_profile = True
-        mock_call.side_effect = [
-            mock_user_response,
-            Exception("Stories API failed"),
-            Exception("Posts API failed")
-        ]
+        mock_call.side_effect = [mock_user_response, Exception("Stories API failed"), Exception("Posts API failed")]
        mock_highlights.return_value = None
        mock_tagged.return_value = None
        stories_tagged.return_value = None
@@ -182,4 +182,4 @@ class TestInstagramAPIExtractor(TestExtractorBase):
        result = self.extractor.download_profile(metadata, "test_user")

        assert result.is_success()
-        assert "Error downloading stories for test_user" in result.metadata["errors"]
+        assert "Error downloading stories for test_user" in result.metadata["errors"]
--- a/tests/extractors/test_instagram_extractor.py
+++ b/tests/extractors/test_instagram_extractor.py
@@ -1,21 +1,41 @@
 import pytest

 from auto_archiver.modules.instagram_extractor import InstagramExtractor
-from .test_extractor_base import TestExtractorBase

-class TestInstagramExtractor(TestExtractorBase):

-    extractor_module: str = 'instagram_extractor'
-    config: dict = {}
+@pytest.fixture
+def instagram_extractor(setup_module, mocker):
+    extractor_module: str = "instagram_extractor"
+    config: dict = {
+        "username": "user_name",
+        "password": "password123",
+        "download_folder": "instaloader",
+        "session_file": "secrets/instaloader.session",
+    }
+    fake_loader = mocker.MagicMock()
+    fake_loader.load_session_from_file.return_value = None
+    fake_loader.login.return_value = None
+    fake_loader.save_session_to_file.return_value = None
+    mocker.patch(
+        "instaloader.Instaloader",
+        return_value=fake_loader,
+    )
+    return setup_module(extractor_module, config)

-    @pytest.mark.parametrize("url", [
+
+@pytest.mark.parametrize(
+    "url",
+    [
        "https://www.instagram.com/p/",
        "https://www.instagram.com/p/1234567890/",
        "https://www.instagram.com/reel/1234567890/",
        "https://www.instagram.com/username/",
        "https://www.instagram.com/username/stories/",
        "https://www.instagram.com/username/highlights/",
-    ])
-    def test_regex_matches(self, url):
-        # post
-        assert InstagramExtractor.valid_url.match(url)
+    ],
+)
+def test_regex_matches(url: str, instagram_extractor: InstagramExtractor) -> None:
+    """
+    Ensure that the valid_url regex matches all provided Instagram URLs.
+    """
+    assert instagram_extractor.valid_url.match(url)
--- a/tests/extractors/test_instagram_tbot_extractor.py
+++ b/tests/extractors/test_instagram_tbot_extractor.py
@@ -7,10 +7,16 @@ from auto_archiver.modules.instagram_tbot_extractor import InstagramTbotExtracto
 from tests.extractors.test_extractor_base import TestExtractorBase


+@pytest.fixture(autouse=True)
+def mock_sleep(mocker):
+    """Mock time.sleep to avoid delays."""
+    return mocker.patch("time.sleep")
+
+
@pytest.fixture
 def patch_extractor_methods(request, setup_module, mocker):
-    mocker.patch.object(InstagramTbotExtractor, '_prepare_session_file', return_value=None)
-    mocker.patch.object(InstagramTbotExtractor, '_initialize_telegram_client', return_value=None)
+    mocker.patch.object(InstagramTbotExtractor, "_prepare_session_file", return_value=None)
+    mocker.patch.object(InstagramTbotExtractor, "_initialize_telegram_client", return_value=None)
    yield


@@ -35,12 +41,7 @@ def mock_telegram_client(mocker):
@pytest.fixture
 def extractor(setup_module, patch_extractor_methods, mocker):
    extractor_module = "instagram_tbot_extractor"
-    config = {
-        "api_id": 12345,
-        "api_hash": "test_api_hash",
-        "session_file": "test_session",
-        "timeout": 4
-    }
+    config = {"api_id": 12345, "api_hash": "test_api_hash", "session_file": "test_session", "timeout": 4}
    extractor = setup_module(extractor_module, config)
    extractor.client = mocker.MagicMock()
    extractor.session_file = "test_session"
@@ -79,21 +80,30 @@ class TestInstagramTbotExtractorReal(TestExtractorBase):
        "session_file": "secrets/anon-insta",
    }

-    @pytest.mark.parametrize("url, expected_status, message, len_media", [
-        ("https://www.instagram.com/p/C4QgLbrIKXG", "insta-via-bot: success",
-         "Are you new to Bellingcat? - The way we share our investigations is different. 💭\nWe want you to read our story but also learn ou",
-         6),
-        ("https://www.instagram.com/reel/DEVLK8qoIbg/", "insta-via-bot: success",
-         "Our volunteer community is at the centre of many incredible Bellingcat investigations and tools. Stephanie Ladel is one such vol",
-         3),
-        # instagram tbot not working (potentially intermittently?) for stories - replace with a live story to retest
-        # ("https://www.instagram.com/stories/bellingcatofficial/3556336382743057476/", False, "Media not found or unavailable"),
-        # Seems to be working intermittently for highlights
-        # ("https://www.instagram.com/stories/highlights/17868810693068139/", "insta-via-bot: success", None, 50),
-        # Marking invalid url as success
-        ("https://www.instagram.com/p/INVALID", "insta-via-bot: success", "Media not found or unavailable", 0),
-        ("https://www.youtube.com/watch?v=ymCMy8OffHM", False, None, 0),
-    ])
+    @pytest.mark.parametrize(
+        "url, expected_status, message, len_media",
+        [
+            (
+                "https://www.instagram.com/p/C4QgLbrIKXG",
+                "insta-via-bot: success",
+                "Are you new to Bellingcat? - The way we share our investigations is different. 💭\nWe want you to read our story but also learn ou",
+                6,
+            ),
+            (
+                "https://www.instagram.com/reel/DEVLK8qoIbg/",
+                "insta-via-bot: success",
+                "Our volunteer community is at the centre of many incredible Bellingcat investigations and tools. Stephanie Ladel is one such vol",
+                3,
+            ),
+            # instagram tbot not working (potentially intermittently?) for stories - replace with a live story to retest
+            # ("https://www.instagram.com/stories/bellingcatofficial/3556336382743057476/", False, "Media not found or unavailable"),
+            # Seems to be working intermittently for highlights
+            # ("https://www.instagram.com/stories/highlights/17868810693068139/", "insta-via-bot: success", None, 50),
+            # Marking invalid url as success
+            ("https://www.instagram.com/p/INVALID", "insta-via-bot: success", "Media not found or unavailable", 0),
+            ("https://www.youtube.com/watch?v=ymCMy8OffHM", False, None, 0),
+        ],
+    )
    def test_download(self, url, expected_status, message, len_media, metadata_sample):
        """Test the `download()` method with various Instagram URLs."""
        metadata_sample.set_url(url)
--- a/tests/extractors/test_tiktok_tikwm_extractor.py
+++ b/tests/extractors/test_tiktok_tikwm_extractor.py
@@ -0,0 +1,177 @@
+from datetime import datetime, timezone
+import time
+import pytest
+import yt_dlp
+
+from auto_archiver.modules.generic_extractor.generic_extractor import GenericExtractor
+from auto_archiver.modules.generic_extractor.tiktok import Tiktok, TikTokIE
+
+from .test_extractor_base import TestExtractorBase
+
+
+@pytest.fixture(autouse=True)
+def skip_ytdlp_own_methods(mocker):
+    """Patch the generic extractor so these tests skip the yt-dlp download step."""
+    # make the TikTok drop-in report that the yt-dlp download should be skipped
+    mocker.patch(
+        "auto_archiver.modules.generic_extractor.tiktok.Tiktok.skip_ytdlp_download",
+        return_value=True,
+    )
+    # only offer the yt-dlp extractor whose IE_NAME is "TikTok"
+    tiktok_only = [ie for ie in yt_dlp.YoutubeDL()._ies.values() if ie.IE_NAME == "TikTok"]
+    mocker.patch(
+        "auto_archiver.modules.generic_extractor.generic_extractor.GenericExtractor.suitable_extractors",
+        return_value=tiktok_only,
+    )
+
+
+@pytest.fixture
+def mock_get(mocker):
+    """Replace `requests.get` as looked up by the tiktok module and return the mock."""
+    patched_get = mocker.patch("auto_archiver.modules.generic_extractor.tiktok.requests.get")
+    return patched_get
+
+
+@pytest.fixture
+def tiktok_dropin() -> Tiktok:
+    """Provide a fresh `Tiktok` drop-in instance."""
+    dropin = Tiktok()
+    return dropin
+
+
+class TestTiktokTikwmExtractor(TestExtractorBase):
+    """
+    Tests for the TikTok drop-in of the generic extractor (tikwm.com replies are mocked unless a test is marked `download`).
+    """
+
+    extractor_module = "generic_extractor"
+    extractor: GenericExtractor
+
+    config = {}
+
+    VALID_EXAMPLE_URL = "https://www.tiktok.com/@example/video/1234"
+
+    @pytest.mark.parametrize(
+        "url, is_suitable",
+        [
+            ("https://bellingcat.com", False),
+            ("https://youtube.com", False),
+            ("https://tiktok.co/", False),
+            ("https://tiktok.com/", False),
+            ("https://www.tiktok.com/", False),
+            ("https://api.cool.tiktok.com/", False),
+            (VALID_EXAMPLE_URL, True),
+            ("https://www.tiktok.com/@bbcnews/video/7478038212070411542", True),
+            ("https://www.tiktok.com/@ggs68taiwan.official/video/7441821351142362375", True),
+            ("https://www.tiktok.com/t/ZP8YQ8e5j/", True),
+            ("https://vt.tiktok.com/ZSMTJeqRP/", True),
+        ],
+    )
+    def test_is_suitable(self, url, is_suitable, tiktok_dropin):
+        """Check `suitable()` against URLs that should and should not be accepted."""
+        verdict = tiktok_dropin.suitable(url, TikTokIE())
+        assert verdict == is_suitable
+
+    def test_invalid_json_responses(self, mock_get, make_item, caplog):
+        """`download()` returns False and a parse-failure message is logged when `.json()` raises."""
+        mock_get.return_value.status_code = 200
+        mock_get.return_value.json.side_effect = ValueError
+        with caplog.at_level("DEBUG"):
+            assert self.extractor.download(make_item(self.VALID_EXAMPLE_URL)) is False
+            mock_get.assert_called_once()
+            mock_get.return_value.json.assert_called_once()
+            # first message is just the 'Skipping using ytdlp to download files for TikTok' message
+            assert (
+                "failed to parse JSON response from tikwm.com for url='https://www.tiktok.com/@example/video/1234'"
+                in caplog.text
+            )
+
+        # second phase: `.json()` raises a generic Exception instead of ValueError
+        # NOTE(review): caplog is not cleared between the two phases, so the message check below
+        # can be satisfied by the record captured in the first phase - confirm this is intended
+        mock_get.return_value.json.side_effect = Exception
+        with caplog.at_level("ERROR"):
+            assert self.extractor.download(make_item(self.VALID_EXAMPLE_URL)) is False
+            mock_get.assert_called()
+            # the call counts are cumulative across both phases
+            assert mock_get.call_count == 2
+            assert mock_get.return_value.json.call_count == 2
+            assert (
+                "failed to parse JSON response from tikwm.com for url='https://www.tiktok.com/@example/video/1234'"
+                in caplog.text
+            )
+
+    @pytest.mark.parametrize(
+        "response",
+        [
+            {"msg": "failure"},
+            {"msg": "success"},
+        ],
+    )
+    def test_unsuccessful_responses(self, mock_get, make_item, response, caplog):
+        """Replies without usable data make `download()` return False and log an invalid response."""
+        fake_reply = mock_get.return_value
+        fake_reply.status_code = 200
+        fake_reply.json.return_value = response
+        with caplog.at_level("DEBUG"):
+            outcome = self.extractor.download(make_item(self.VALID_EXAMPLE_URL))
+            assert outcome is False
+            mock_get.assert_called_once()
+            fake_reply.json.assert_called_once()
+            assert "failed to get a valid response from tikwm.com" in caplog.text
+
+    @pytest.mark.parametrize(
+        "response,has_vid",
+        [
+            ({"data": {"id": 123}}, False),
+            ({"data": {"wmplay": "url"}}, True),
+            ({"data": {"play": "url"}}, True),
+        ],
+    )
+    def test_correct_extraction(self, mock_get, make_item, response, has_vid):
+        """A successful reply yields exactly one media item only when it carries a video URL.
+
+        `response` is merged into a `{"msg": "success"}` reply; `has_vid` says whether
+        that reply contains a `wmplay` or `play` entry.
+        """
+        mock_get.return_value.status_code = 200
+        mock_get.return_value.json.return_value = {"msg": "success", **response}
+        result = self.extractor.download(make_item(self.VALID_EXAMPLE_URL))
+        if has_vid:
+            assert result.is_success()
+            assert len(result.media) == 1
+        else:
+            assert result is False
+        # one request for the API call, plus one more when a video is present
+        # (presumably the video download itself)
+        assert mock_get.call_count == 1 + int(has_vid)
+        mock_get.return_value.json.assert_called_once()
+
+    def test_correct_data_extracted(self, mock_get, make_item):
+        """Fields of a successful tikwm.com reply are mapped onto the returned metadata."""
+        mock_get.return_value.status_code = 200
+        mock_get.return_value.json.return_value = {
+            "msg": "success",
+            "data": {
+                "wmplay": "url",
+                "origin_cover": "cover.jpg",
+                "title": "Title",
+                "id": 123,
+                "duration": 60,
+                "create_time": 1736301699,
+                "author": "Author",
+                "other": "data",
+            },
+        }
+
+        result = self.extractor.download(make_item(self.VALID_EXAMPLE_URL))
+        assert result.is_success()
+        # two media entries are expected - presumably the cover and the video (TODO confirm order)
+        assert len(result.media) == 2
+        assert result.get_title() == "Title"
+        assert result.get("author") == "Author"
+        # of the reply's data keys, only "other" and "id" are expected to remain under api_data
+        assert result.get("api_data") == {"other": "data", "id": 123}
+        assert result.media[1].get("duration") == 60
+        # create_time is expected back as a timezone-aware UTC datetime
+        assert result.get("timestamp") == datetime.fromtimestamp(1736301699, tz=timezone.utc)
+
+    @pytest.mark.download
+    def test_download_video(self, make_item):
+        """Download test (`requests.get` is not mocked here): extract a BBC News post and check its metadata."""
+        url = "https://www.tiktok.com/@bbcnews/video/7478038212070411542"
+
+        result = self.extractor.download(make_item(url))
+        assert result.is_success()
+        assert len(result.media) == 2
+        assert (
+            result.get_title()
+            == "The A23a iceberg is one of the world's oldest and it's so big you can see it from space. #Iceberg  #A23a  #Antarctica  #Ice  #ClimateChange  #DavidAttenborough  #Ocean  #Sea  #SouthGeorgia  #BBCNews "
+        )
+        # here "author" is a mapping with a "unique_id" entry, not a plain string
+        assert result.get("author").get("unique_id") == "bbcnews"
+        assert result.get("api_data").get("id") == "7478038212070411542"
+        assert result.media[1].get("duration") == 59
+        assert result.get("timestamp") == datetime.fromtimestamp(1741122000, tz=timezone.utc)
+
+    @pytest.mark.download
+    def test_download_sensitive_video(self, make_item):
+        """Download test (`requests.get` is not mocked here): extract a second post and check its metadata."""
+        url = "https://www.tiktok.com/@ggs68taiwan.official/video/7441821351142362375"
+        # Required for rate limiting
+        time.sleep(1.1)
+        result = self.extractor.download(make_item(url))
+        assert result.is_success()
+        assert len(result.media) == 2
+        assert result.get_title() == "Căng nhất lúc này #ggs68 #ggs68taiwan #taiwan #dailoan #tiktoknews"
+        assert result.get("author").get("id") == "7197400619475649562"
+        assert result.get("api_data").get("id") == "7441821351142362375"
+        assert result.media[1].get("duration") == 34
+        assert result.get("timestamp") == datetime.fromtimestamp(1732684060, tz=timezone.utc)
--- a/tests/extractors/test_twitter_api_extractor.py
+++ b/tests/extractors/test_twitter_api_extractor.py
@@ -1,6 +1,5 @@
 import os
 import datetime
-import hashlib
 import pytest

 from pytwitter.models.media import MediaVariant
@@ -10,8 +9,7 @@ from auto_archiver.modules.twitter_api_extractor import TwitterApiExtractor

@pytest.mark.incremental
 class TestTwitterApiExtractor(TestExtractorBase):
-
-    extractor_module = 'twitter_api_extractor'
+    extractor_module: TwitterApiExtractor = "twitter_api_extractor"

    config = {
        "bearer_tokens": [],
@@ -22,41 +20,79 @@ class TestTwitterApiExtractor(TestExtractorBase):
        "access_secret": os.environ.get("TWITTER_ACCESS_SECRET"),
    }

-    @pytest.mark.parametrize("url, expected", [
-        ("https://x.com/bellingcat/status/1874097816571961839", "https://x.com/bellingcat/status/1874097816571961839"), # x.com urls unchanged
-        ("https://twitter.com/bellingcat/status/1874097816571961839", "https://twitter.com/bellingcat/status/1874097816571961839"), # twitter urls unchanged
-        ("https://twitter.com/bellingcat/status/1874097816571961839?s=20&t=3d0g4ZQis7dCbSDg-mE7-w", "https://twitter.com/bellingcat/status/1874097816571961839?s=20&t=3d0g4ZQis7dCbSDg-mE7-w"), # don't strip params from twitter urls (changed Jan 2025)
-        ("https://www.bellingcat.com/category/resources/", "https://www.bellingcat.com/category/resources/"), # non-twitter/x urls unchanged
-        ("https://www.bellingcat.com/category/resources/?s=20&t=3d0g4ZQis7dCbSDg-mE7-w", "https://www.bellingcat.com/category/resources/?s=20&t=3d0g4ZQis7dCbSDg-mE7-w"), # shouldn't strip params from non-twitter/x URLs
-    ])
+    @pytest.mark.parametrize(
+        "url, expected",
+        [
+            (
+                "https://x.com/bellingcat/status/1874097816571961839",
+                "https://x.com/bellingcat/status/1874097816571961839",
+            ),  # x.com urls unchanged
+            (
+                "https://twitter.com/bellingcat/status/1874097816571961839",
+                "https://twitter.com/bellingcat/status/1874097816571961839",
+            ),  # twitter urls unchanged
+            (
+                "https://twitter.com/bellingcat/status/1874097816571961839?s=20&t=3d0g4ZQis7dCbSDg-mE7-w",
+                "https://twitter.com/bellingcat/status/1874097816571961839?s=20&t=3d0g4ZQis7dCbSDg-mE7-w",
+            ),  # don't strip params from twitter urls (changed Jan 2025)
+            (
+                "https://www.bellingcat.com/category/resources/",
+                "https://www.bellingcat.com/category/resources/",
+            ),  # non-twitter/x urls unchanged
+            (
+                "https://www.bellingcat.com/category/resources/?s=20&t=3d0g4ZQis7dCbSDg-mE7-w",
+                "https://www.bellingcat.com/category/resources/?s=20&t=3d0g4ZQis7dCbSDg-mE7-w",
+            ),  # shouldn't strip params from non-twitter/x URLs
+        ],
+    )
    def test_sanitize_url(self, url, expected):
        assert expected == self.extractor.sanitize_url(url)

    @pytest.mark.download
    def test_sanitize_url_download(self):
-        assert "https://www.bellingcat.com/category/resources/" == self.extractor.sanitize_url("https://t.co/yl3oOJatFp")
+        assert "https://www.bellingcat.com/category/resources/" == self.extractor.sanitize_url(
+            "https://t.co/yl3oOJatFp"
+        )

-    @pytest.mark.parametrize("url, exptected_username, exptected_tweetid", [
-        ("https://twitter.com/bellingcat/status/1874097816571961839", "bellingcat", "1874097816571961839"),
-        ("https://x.com/bellingcat/status/1874097816571961839", "bellingcat", "1874097816571961839"),
-        ("https://www.bellingcat.com/category/resources/", False, False)
-        ])
+    @pytest.mark.parametrize(
+        "url, exptected_username, exptected_tweetid",
+        [
+            ("https://twitter.com/bellingcat/status/1874097816571961839", "bellingcat", "1874097816571961839"),
+            ("https://x.com/bellingcat/status/1874097816571961839", "bellingcat", "1874097816571961839"),
+            ("https://www.bellingcat.com/category/resources/", False, False),
+        ],
+    )
    def test_get_username_tweet_id_from_url(self, url, exptected_username, exptected_tweetid):
-    
        username, tweet_id = self.extractor.get_username_tweet_id(url)
        assert exptected_username == username
        assert exptected_tweetid == tweet_id

    def test_choose_variants(self):
        # taken from the response for url https://x.com/bellingcat/status/1871552600346415571
-        variant_list = [MediaVariant(content_type='application/x-mpegURL', url='https://video.twimg.com/ext_tw_video/1871551993677852672/pu/pl/ovWo7ux-bKROwYIC.m3u8?tag=12&v=e1b'),
-                        MediaVariant(bit_rate=256000, content_type='video/mp4', url='https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/480x270/OqZIrKV0LFswMvxS.mp4?tag=12'),
-                        MediaVariant(bit_rate=832000, content_type='video/mp4', url='https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/640x360/uiDZDSmZ8MZn9hsi.mp4?tag=12'),
-                        MediaVariant(bit_rate=2176000, content_type='video/mp4', url='https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/1280x720/6Y340Esh568WZnRZ.mp4?tag=12')
-                        ]
+        variant_list = [
+            MediaVariant(
+                content_type="application/x-mpegURL",
+                url="https://video.twimg.com/ext_tw_video/1871551993677852672/pu/pl/ovWo7ux-bKROwYIC.m3u8?tag=12&v=e1b",
+            ),
+            MediaVariant(
+                bit_rate=256000,
+                content_type="video/mp4",
+                url="https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/480x270/OqZIrKV0LFswMvxS.mp4?tag=12",
+            ),
+            MediaVariant(
+                bit_rate=832000,
+                content_type="video/mp4",
+                url="https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/640x360/uiDZDSmZ8MZn9hsi.mp4?tag=12",
+            ),
+            MediaVariant(
+                bit_rate=2176000,
+                content_type="video/mp4",
+                url="https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/1280x720/6Y340Esh568WZnRZ.mp4?tag=12",
+            ),
+        ]
        chosen_variant = self.extractor.choose_variant(variant_list)
        assert chosen_variant == variant_list[3]
-    
+
    @pytest.mark.skipif(not os.environ.get("TWITTER_BEARER_TOKEN"), reason="No Twitter bearer token provided")
    @pytest.mark.download
    def test_download_nonexistent_tweet(self, make_item):
@@ -76,7 +112,6 @@ class TestTwitterApiExtractor(TestExtractorBase):
    @pytest.mark.skipif(not os.environ.get("TWITTER_BEARER_TOKEN"), reason="No Twitter bearer token provided")
    @pytest.mark.download
    def test_download_tweet_no_media(self, make_item):
-        
        item = make_item("https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w")
        post = self.extractor.download(item)

@@ -84,7 +119,7 @@ class TestTwitterApiExtractor(TestExtractorBase):
            post,
            "Onion rings are just vegetable donuts.",
            datetime.datetime(2023, 1, 24, 16, 25, 51, tzinfo=datetime.timezone.utc),
-            "twitter-api: success"
+            "twitter-api: success",
        )

    @pytest.mark.skipif(not os.environ.get("TWITTER_BEARER_TOKEN"), reason="No Twitter bearer token provided")
@@ -95,27 +130,41 @@ class TestTwitterApiExtractor(TestExtractorBase):
        self.assertValidResponseMetadata(
            post,
            "This month's Bellingchat Premium is with @KolinaKoltai. She reveals how she investigated a platform allowing users to create AI-generated child sexual abuse material and explains why it's crucial to investigate the people behind these services https://t.co/SfBUq0hSD0 https://t.co/rIHx0WlKp8",
-            datetime.datetime(2024, 12, 24, 13, 44, 46, tzinfo=datetime.timezone.utc)
+            datetime.datetime(2024, 12, 24, 13, 44, 46, tzinfo=datetime.timezone.utc),
        )

    @pytest.mark.skipif(not os.environ.get("TWITTER_BEARER_TOKEN"), reason="No Twitter bearer token provided")
-    @pytest.mark.parametrize("url, title, timestamp", [
-            ("https://x.com/SozinhoRamalho/status/1876710769913450647", "ignore tweet, testing sensitivity warning nudity https://t.co/t3u0hQsSB1", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc)),
-            ("https://x.com/SozinhoRamalho/status/1876710875475681357", "ignore tweet, testing sensitivity warning violence https://t.co/syYDSkpjZD", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc)),
-            ("https://x.com/SozinhoRamalho/status/1876711053813227618", "ignore tweet, testing sensitivity warning sensitive https://t.co/XE7cRdjzYq", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc)),
-            ("https://x.com/SozinhoRamalho/status/1876711141314801937", "ignore tweet, testing sensitivity warning nudity, violence, sensitivity https://t.co/YxCFbbhYE3", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc)),
-        ])
+    @pytest.mark.parametrize(
+        "url, title, timestamp",
+        [
+            (
+                "https://x.com/SozinhoRamalho/status/1876710769913450647",
+                "ignore tweet, testing sensitivity warning nudity https://t.co/t3u0hQsSB1",
+                datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
+            ),
+            (
+                "https://x.com/SozinhoRamalho/status/1876710875475681357",
+                "ignore tweet, testing sensitivity warning violence https://t.co/syYDSkpjZD",
+                datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
+            ),
+            (
+                "https://x.com/SozinhoRamalho/status/1876711053813227618",
+                "ignore tweet, testing sensitivity warning sensitive https://t.co/XE7cRdjzYq",
+                datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
+            ),
+            (
+                "https://x.com/SozinhoRamalho/status/1876711141314801937",
+                "ignore tweet, testing sensitivity warning nudity, violence, sensitivity https://t.co/YxCFbbhYE3",
+                datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
+            ),
+        ],
+    )
    @pytest.mark.download
    def test_download_sensitive_media(self, url, title, timestamp, check_hash, make_item):
-
        """Download tweets with sensitive media"""

        post = self.extractor.download(make_item(url))
-        self.assertValidResponseMetadata(
-            post,
-            title,
-            timestamp
-        )
+        self.assertValidResponseMetadata(post, title, timestamp)
        assert len(post.media) == 1
        # check the SHA1 hash (quick) of the media, to make sure it's valid
-        check_hash(post.media[0].filename, "3eea9c03b2dcedd1eb9a169d8bfd1cf877996fab4961de019a96eb9d32d2d733")
+        check_hash(post.media[0].filename, "3eea9c03b2dcedd1eb9a169d8bfd1cf877996fab4961de019a96eb9d32d2d733")
--- a/tests/extractors/test_vk_extractor.py
+++ b/tests/extractors/test_vk_extractor.py
@@ -0,0 +1,77 @@
+import pytest
+
+from auto_archiver.core import Metadata
+from auto_archiver.modules.vk_extractor import VkExtractor
+
+
+@pytest.fixture
+def mock_vk_scraper(mocker):
+    """Patch `VkScraper` where the vk_extractor module looks it up and return the mock."""
+    scraper_cls = mocker.patch("auto_archiver.modules.vk_extractor.vk_extractor.VkScraper")
+    return scraper_cls
+
+
+@pytest.fixture
+def vk_extractor(setup_module, mock_vk_scraper) -> VkExtractor:
+    """Build a `VkExtractor` via `setup_module` and point its `vks` at the mocked scraper."""
+    extractor = setup_module(
+        "vk_extractor",
+        {
+            "username": "name",
+            "password": "password123",
+            "session_file": "secrets/vk_config.v2.json",
+        },
+    )
+    # use the instance that calling the patched VkScraper class returns
+    extractor.vks = mock_vk_scraper.return_value
+    return extractor
+
+
+def test_netloc(vk_extractor, metadata):
+    """`download()` returns False for the default (non-vk) metadata URL."""
+    # metadata url set as: "https://example.com/"
+    outcome = vk_extractor.download(metadata)
+    assert outcome is False
+
+
+def test_vk_url_but_scrape_returns_empty(vk_extractor, metadata):
+    """With a vk.com URL but no scraped posts, `download()` returns False."""
+    vk_url = "https://vk.com/valid-wall"
+    metadata.set_url(vk_url)
+    vk_extractor.vks.scrape.return_value = []
+    outcome = vk_extractor.download(metadata)
+    assert outcome is False
+    assert metadata.netloc == "vk.com"
+    # the scraper must have been asked exactly once, with the metadata's url
+    vk_extractor.vks.scrape.assert_called_once_with(metadata.get_url())
+
+
+def test_successful_scrape_and_download(vk_extractor, metadata):
+    """Scraped posts and downloaded files end up on the metadata returned by `download()`."""
+    mock_scrapes = [
+        {"text": "Post Title", "datetime": "2023-01-01T00:00:00", "id": 1},
+        {"text": "Another Post", "datetime": "2023-01-02T00:00:00", "id": 2},
+    ]
+    mock_filenames = ["image1.jpg", "image2.png"]
+    vk_extractor.vks.scrape.return_value = mock_scrapes
+    vk_extractor.vks.download_media.return_value = mock_filenames
+    metadata.set_url("https://vk.com/valid-wall")
+    result = vk_extractor.download(metadata)
+    # Test metadata: title and timestamp match the first scraped post
+    assert result.is_success()
+    assert result.status == "vk: success"
+    assert result.get_title() == "Post Title"
+    assert result.get_timestamp() == "2023-01-01T00:00:00+00:00"
+    assert "Another Post" in result.metadata["content"]
+    # Test Media objects: one per downloaded filename, in order
+    assert len(result.media) == 2
+    assert result.media[0].filename == "image1.jpg"
+    assert result.media[1].filename == "image2.png"
+    vk_extractor.vks.download_media.assert_called_once_with(mock_scrapes, vk_extractor.tmp_dir)
+
+
+def test_adds_first_title_and_timestamp(vk_extractor):
+    """Title and timestamp on the result match the first scraped post."""
+    metadata = Metadata().set_url("https://vk.com/no-metadata")
+    mock_scrapes = [
+        {"text": "value", "datetime": "2023-01-01T00:00:00"},
+        {"text": "value2", "datetime": "2023-01-02T00:00:00"},
+    ]
+    vk_extractor.vks.scrape.return_value = mock_scrapes
+    vk_extractor.vks.download_media.return_value = []
+    result = vk_extractor.download(metadata)
+
+    assert result.get_title() == "value"
+    # formatted timestamp
+    assert result.get_timestamp() == "2023-01-01T00:00:00+00:00"
+    assert result.is_success()
--- a/tests/feeders/test_atlos_feeder.py
+++ b/tests/feeders/test_atlos_feeder.py
@@ -1,5 +1,5 @@
 import pytest
-from auto_archiver.modules.atlos_feeder import AtlosFeeder
+from auto_archiver.modules.atlos_feeder_db_storage import AtlosFeederDbStorage as AtlosFeeder


 class FakeAPIResponse:
@@ -18,44 +18,63 @@ class FakeAPIResponse:


@pytest.fixture
-def atlos_feeder(setup_module) -> AtlosFeeder:
+def atlos_feeder(setup_module, mocker) -> AtlosFeeder:
    """Fixture for AtlosFeeder."""
    configs: dict = {
        "api_token": "abc123",
        "atlos_url": "https://platform.atlos.org",
    }
-    return setup_module("atlos_feeder", configs)
+    mocker.patch("requests.Session")
+    atlos_feeder = setup_module("atlos_feeder_db_storage", configs)
+    fake_session = mocker.MagicMock()
+    # Configure the default response to have no results so that __iter__ terminates
+    fake_session.get.return_value = FakeAPIResponse({"next": None, "results": []})
+    atlos_feeder.session = fake_session
+    return atlos_feeder


@pytest.fixture
-def mock_atlos_api(mocker):
-    """Fixture to mock requests to Atlos API."""
+def mock_atlos_api(atlos_feeder):
+    """Fixture to update the atlos_feeder.session.get side_effect."""
+
    def _mock_responses(responses):
-        mocker.patch(
-            "requests.get",
-            side_effect=[FakeAPIResponse(data) for data in responses],
-        )
+        atlos_feeder.session.get.side_effect = [FakeAPIResponse(data) for data in responses]
+
    return _mock_responses


 def test_atlos_feeder_iter_yields_valid_metadata(atlos_feeder, mock_atlos_api):
    """Test valid items are yielded and invalid ones ignored."""
-    mock_atlos_api([
-        {
-            "next": None,
-            "results": [
-                {"source_url": "http://example.com", "id": 1,
-                 "metadata": {"auto_archiver": {"processed": False}},
-                 "visibility": "visible", "status": "complete"},
-                {"source_url": "", "id": 2,
-                 "metadata": {"auto_archiver": {"processed": False}},
-                 "visibility": "visible", "status": "complete"},
-                {"source_url": "http://example.org", "id": 3,
-                 "metadata": {"auto_archiver": {"processed": True}},
-                 "visibility": "visible", "status": "complete"},
-            ],
-        }
-    ])
+    mock_atlos_api(
+        [
+            {
+                "next": None,
+                "results": [
+                    {
+                        "source_url": "http://example.com",
+                        "id": 1,
+                        "metadata": {"auto_archiver": {"processed": False}},
+                        "visibility": "visible",
+                        "status": "complete",
+                    },
+                    {
+                        "source_url": "",
+                        "id": 2,
+                        "metadata": {"auto_archiver": {"processed": False}},
+                        "visibility": "visible",
+                        "status": "complete",
+                    },
+                    {
+                        "source_url": "http://example.org",
+                        "id": 3,
+                        "metadata": {"auto_archiver": {"processed": True}},
+                        "visibility": "visible",
+                        "status": "complete",
+                    },
+                ],
+            }
+        ]
+    )

    items = list(atlos_feeder)
    assert len(items) == 1
@@ -65,24 +84,34 @@ def test_atlos_feeder_iter_yields_valid_metadata(atlos_feeder, mock_atlos_api):

 def test_atlos_feeder_multiple_pages(atlos_feeder, mock_atlos_api):
    """Test iteration over multiple pages with valid items."""
-    mock_atlos_api([
-        {
-            "next": "cursor2",
-            "results": [
-                {"source_url": "http://example1.com", "id": 10,
-                 "metadata": {"auto_archiver": {"processed": False}},
-                 "visibility": "visible", "status": "complete"},
-            ],
-        },
-        {
-            "next": None,
-            "results": [
-                {"source_url": "http://example2.com", "id": 20,
-                 "metadata": {"auto_archiver": {"processed": False}},
-                 "visibility": "visible", "status": "complete"},
-            ],
-        },
-    ])
+    mock_atlos_api(
+        [
+            {
+                "next": "cursor2",
+                "results": [
+                    {
+                        "source_url": "http://example1.com",
+                        "id": 10,
+                        "metadata": {"auto_archiver": {"processed": False}},
+                        "visibility": "visible",
+                        "status": "complete",
+                    },
+                ],
+            },
+            {
+                "next": None,
+                "results": [
+                    {
+                        "source_url": "http://example2.com",
+                        "id": 20,
+                        "metadata": {"auto_archiver": {"processed": False}},
+                        "visibility": "visible",
+                        "status": "complete",
+                    },
+                ],
+            },
+        ]
+    )

    items = list(atlos_feeder)
    assert len(items) == 2
@@ -100,9 +129,7 @@ def test_atlos_feeder_no_results(atlos_feeder, mock_atlos_api):

 def test_atlos_feeder_http_error(atlos_feeder, mocker):
    """Test raises an exception on HTTP error."""
-    mocker.patch(
-        "requests.get",
-        return_value=FakeAPIResponse({"next": None, "results": []}, raise_error=True),
-    )
+    fake_response = FakeAPIResponse({"next": None, "results": []}, raise_error=True)
+    atlos_feeder.session.get.side_effect = [fake_response]
    with pytest.raises(Exception, match="HTTP error"):
        list(atlos_feeder)
--- a/tests/feeders/test_csv_feeder.py
+++ b/tests/feeders/test_csv_feeder.py
@@ -1,13 +1,16 @@
 import pytest

+
@pytest.fixture
 def headerless_csv_file():
    return "tests/data/csv_no_headers.csv"

+
@pytest.fixture
 def header_csv_file():
    return "tests/data/csv_with_headers.csv"

+
@pytest.fixture
 def header_csv_file_non_default_column():
    return "tests/data/csv_with_headers_non_default_column.csv"
@@ -23,6 +26,7 @@ def test_csv_feeder_no_headers(headerless_csv_file, setup_module):
    assert urls[0].get_url() == "https://example.com/1/"
    assert urls[1].get_url() == "https://example.com/2/"

+
 def test_csv_feeder_with_headers(header_csv_file, setup_module):
    from auto_archiver.modules.csv_feeder.csv_feeder import CSVFeeder

@@ -33,10 +37,10 @@ def test_csv_feeder_with_headers(header_csv_file, setup_module):
    assert urls[0].get_url() == "https://example.com/1/"
    assert urls[1].get_url() == "https://example.com/2/"

+
 def test_csv_feeder_wrong_column(header_csv_file, setup_module, caplog):
    from auto_archiver.modules.csv_feeder.csv_feeder import CSVFeeder

-
    with caplog.at_level("WARNING"):
        feeder = setup_module(CSVFeeder, {"files": [header_csv_file], "column": 1})
        urls = list(feeder)
@@ -54,4 +58,4 @@ def test_csv_feeder_column_by_name(header_csv_file, setup_module):
    urls = list(feeder)
    assert len(urls) == 2
    assert urls[0].get_url() == "https://example.com/1/"
-    assert urls[1].get_url() == "https://example.com/2/"
+    assert urls[1].get_url() == "https://example.com/2/"
--- a/tests/feeders/test_gsheet_feeder.py
+++ b/tests/feeders/test_gsheet_feeder.py
@@ -2,7 +2,7 @@ from typing import Type

 import gspread
 import pytest
-from auto_archiver.modules.gsheet_feeder import GsheetsFeeder
+from auto_archiver.modules.gsheet_feeder_db import GsheetsFeederDB
 from auto_archiver.core import Metadata, Feeder


@@ -11,43 +11,40 @@ def test_setup_without_sheet_and_sheet_id(setup_module, mocker):
    mocker.patch("gspread.service_account")
    with pytest.raises(ValueError):
        setup_module(
-            "gsheet_feeder",
+            "gsheet_feeder_db",
            {"service_account": "dummy.json", "sheet": None, "sheet_id": None},
        )


@pytest.fixture
-def gsheet_feeder(setup_module, mocker) -> GsheetsFeeder:
+def gsheet_feeder(setup_module, mocker) -> GsheetsFeederDB:
    config: dict = {
-                "service_account": "dummy.json",
-                "sheet": "test-auto-archiver",
-                "sheet_id": None,
-                "header": 1,
-                "columns": {
-                    "url": "link",
-                    "status": "archive status",
-                    "folder": "destination folder",
-                    "archive": "archive location",
-                    "date": "archive date",
-                    "thumbnail": "thumbnail",
-                    "timestamp": "upload timestamp",
-                    "title": "upload title",
-                    "text": "text content",
-                    "screenshot": "screenshot",
-                    "hash": "hash",
-                    "pdq_hash": "perceptual hashes",
-                    "wacz": "wacz",
-                    "replaywebpage": "replaywebpage",
-                },
-                "allow_worksheets": set(),
-                "block_worksheets": set(),
-                "use_sheet_names_in_stored_paths": True,
-            }
+        "service_account": "dummy.json",
+        "sheet": "test-auto-archiver",
+        "sheet_id": None,
+        "header": 1,
+        "columns": {
+            "url": "link",
+            "status": "archive status",
+            "folder": "destination folder",
+            "archive": "archive location",
+            "date": "archive date",
+            "thumbnail": "thumbnail",
+            "timestamp": "upload timestamp",
+            "title": "upload title",
+            "text": "text content",
+            "screenshot": "screenshot",
+            "hash": "hash",
+            "pdq_hash": "perceptual hashes",
+            "wacz": "wacz",
+            "replaywebpage": "replaywebpage",
+        },
+        "allow_worksheets": set(),
+        "block_worksheets": set(),
+        "use_sheet_names_in_stored_paths": True,
+    }
    mocker.patch("gspread.service_account")
-    feeder = setup_module(
-        "gsheet_feeder",
-        config
-    )
+    feeder = setup_module("gsheet_feeder_db", config)
    feeder.gsheets_client = mocker.MagicMock()
    return feeder

@@ -90,7 +87,7 @@ class MockWorksheet:
        return matching.get(col_name, default)


-def test__process_rows(gsheet_feeder: GsheetsFeeder):
+def test__process_rows(gsheet_feeder: GsheetsFeederDB):
    testworksheet = MockWorksheet()
    metadata_items = list(gsheet_feeder._process_rows(testworksheet))
    assert len(metadata_items) == 3
@@ -98,7 +95,7 @@ def test__process_rows(gsheet_feeder: GsheetsFeeder):
    assert metadata_items[0].get("url") == "http://example.com"


-def test__set_metadata(gsheet_feeder: GsheetsFeeder):
+def test__set_metadata(gsheet_feeder: GsheetsFeederDB):
    worksheet = MockWorksheet()
    metadata = Metadata()
    gsheet_feeder._set_context(metadata, worksheet, 1)
@@ -106,12 +103,12 @@ def test__set_metadata(gsheet_feeder: GsheetsFeeder):


@pytest.mark.skip(reason="Not recognising folder column")
-def test__set_metadata_with_folder_pickled(gsheet_feeder: GsheetsFeeder, worksheet):
+def test__set_metadata_with_folder_pickled(gsheet_feeder: GsheetsFeederDB, worksheet):
    gsheet_feeder._set_context(worksheet, 7)
    assert Metadata.get_context("gsheet") == {"row": 1, "worksheet": worksheet}


-def test__set_metadata_with_folder(gsheet_feeder: GsheetsFeeder):
+def test__set_metadata_with_folder(gsheet_feeder: GsheetsFeederDB):
    testworksheet = MockWorksheet()
    metadata = Metadata()
    testworksheet.wks.title = "TestSheet"
@@ -128,9 +125,7 @@ def test__set_metadata_with_folder(gsheet_feeder: GsheetsFeeder):
        (None, "ABC123", "open_by_key", "ABC123", "opening by sheet ID"),
    ],
 )
-def test_open_sheet_with_name_or_id(
-    setup_module, sheet, sheet_id, expected_method, expected_arg, description, mocker
-):
+def test_open_sheet_with_name_or_id(setup_module, sheet, sheet_id, expected_method, expected_arg, description, mocker):
    """Ensure open_sheet() correctly opens by name or ID based on configuration."""
    mock_service_account = mocker.patch("gspread.service_account")
    mock_client = mocker.MagicMock()
@@ -140,14 +135,12 @@ def test_open_sheet_with_name_or_id(

    # Setup module with parameterized values
    feeder = setup_module(
-        "gsheet_feeder",
+        "gsheet_feeder_db",
        {"service_account": "dummy.json", "sheet": sheet, "sheet_id": sheet_id},
    )
    sheet_result = feeder.open_sheet()
    # Validate the correct method was called
-    getattr(mock_client, expected_method).assert_called_once_with(
-        expected_arg
-    ), f"Failed: {description}"
+    getattr(mock_client, expected_method).assert_called_once_with(expected_arg), f"Failed: {description}"
    assert sheet_result == "MockSheet", f"Failed: {description}"


@@ -159,7 +152,7 @@ def test_open_sheet_with_sheet_id(setup_module, mocker):
    mock_service_account.return_value = mock_client
    mock_client.open_by_key.return_value = "MockSheet"
    feeder = setup_module(
-        "gsheet_feeder",
+        "gsheet_feeder_db",
        {"service_account": "dummy.json", "sheet": None, "sheet_id": "ABC123"},
    )
    sheet = feeder.open_sheet()
@@ -170,7 +163,7 @@ def test_open_sheet_with_sheet_id(setup_module, mocker):
 def test_should_process_sheet(setup_module, mocker):
    mocker.patch("gspread.service_account")
    gdb = setup_module(
-        "gsheet_feeder",
+        "gsheet_feeder_db",
        {
            "service_account": "dummy.json",
            "sheet": "TestSheet",
@@ -179,18 +172,18 @@ def test_should_process_sheet(setup_module, mocker):
            "block_worksheets": {"Sheet3"},
        },
    )
-    assert gdb.should_process_sheet("TestSheet") == True
-    assert gdb.should_process_sheet("Sheet3") == False
+    assert gdb.should_process_sheet("TestSheet") is True
+    assert gdb.should_process_sheet("Sheet3") is False
    # False if allow_worksheets is set
-    assert gdb.should_process_sheet("AnotherSheet") == False
+    assert gdb.should_process_sheet("AnotherSheet") is False


@pytest.mark.skip(reason="Requires a real connection")
 class TestGSheetsFeederReal:
-    """Testing GSheetsFeeder class"""
+    """Testing GsheetsFeederDB class"""

-    module_name: str = "gsheet_feeder"
-    feeder: GsheetsFeeder
+    module_name: str = "gsheet_feeder_db"
+    feeder: GsheetsFeederDB
    # You must follow the setup process explain in the docs for this to work
    config: dict = {
        "service_account": "secrets/service_account.json",
@@ -220,9 +213,7 @@ class TestGSheetsFeederReal:

    @pytest.fixture(autouse=True)
    def setup_feeder(self, setup_module):
-        assert (
-            self.module_name is not None
-        ), "self.module_name must be set on the subclass"
+        assert self.module_name is not None, "self.module_name must be set on the subclass"
        assert self.config is not None, "self.config must be a dict set on the subclass"
        self.feeder: Type[Feeder] = setup_module(self.module_name, self.config)

@@ -241,9 +232,7 @@ class TestGSheetsFeederReal:
        """Ensure open_sheet() connects to a real Google Sheets instance."""
        sheet = self.feeder.open_sheet()
        assert sheet is not None, "open_sheet() should return a valid sheet instance"
-        assert hasattr(
-            sheet, "worksheets"
-        ), "Returned object should have worksheets method"
+        assert hasattr(sheet, "worksheets"), "Returned object should have worksheets method"

    def test_iter_yields_metadata_real_data(self):
        """Ensure __iter__() yields Metadata objects for real test sheet data."""
--- a/tests/feeders/test_gworksheet.py
+++ b/tests/feeders/test_gworksheet.py
@@ -1,7 +1,7 @@
 # Note this isn't a feeder, but contained as utility of the gsheet feeder module
 import pytest

-from auto_archiver.modules.gsheet_feeder import GWorksheet
+from auto_archiver.modules.gsheet_feeder_db import GWorksheet


 class TestGWorksheet:
@@ -81,40 +81,27 @@ class TestGWorksheet:
            (False, ""),
        ],
    )
-    def test_get_cell_or_default_handles_empty_values(
-        self, mock_worksheet, when_empty, expected
-    ):
+    def test_get_cell_or_default_handles_empty_values(self, mock_worksheet, when_empty, expected):
        mock_worksheet.get_values.return_value[1][0] = ""  # Empty URL cell
        g = GWorksheet(mock_worksheet)
-        assert (
-            g.get_cell_or_default(
-                2, "url", default="default", when_empty_use_default=when_empty
-            )
-            == expected
-        )
+        assert g.get_cell_or_default(2, "url", default="default", when_empty_use_default=when_empty) == expected

    def test_get_cell_or_default_handles_missing_columns(self, gworksheet):
-        assert (
-            gworksheet.get_cell_or_default(1, "invalid_col", default="safe") == "safe"
-        )
+        assert gworksheet.get_cell_or_default(1, "invalid_col", default="safe") == "safe"

    # Test write operations
    def test_set_cell_updates_correct_position(self, mock_worksheet, gworksheet):
        gworksheet.set_cell(2, "url", "new_url")
        mock_worksheet.update_cell.assert_called_once_with(2, 1, "new_url")

-    def test_batch_set_cell_formats_requests_correctly(
-        self, mock_worksheet, gworksheet
-    ):
+    def test_batch_set_cell_formats_requests_correctly(self, mock_worksheet, gworksheet):
        updates = [(2, "url", "new_url"), (3, "status", "processed")]
        gworksheet.batch_set_cell(updates)
        expected_batch = [
            {"range": "A2", "values": [["new_url"]]},
            {"range": "B3", "values": [["processed"]]},
        ]
-        mock_worksheet.batch_update.assert_called_once_with(
-            expected_batch, value_input_option="USER_ENTERED"
-        )
+        mock_worksheet.batch_update.assert_called_once_with(expected_batch, value_input_option="USER_ENTERED")

    def test_batch_set_cell_truncates_long_values(self, mock_worksheet, gworksheet):
        long_value = "x" * 50000
--- a/tests/formatters/test_html_formatter.py
+++ b/tests/formatters/test_html_formatter.py
@@ -5,13 +5,13 @@ from auto_archiver.core import Metadata, Media
 def test_format(setup_module):
    formatter = setup_module(HtmlFormatter)

-    metadata = Metadata().set("content", "Hello, world!").set_url('https://example.com')
+    metadata = Metadata().set("content", "Hello, world!").set_url("https://example.com")

    final_media = formatter.format(metadata)
    assert isinstance(final_media, Media)
    assert ".html" in final_media.filename
-    with open (final_media.filename, "r", encoding="utf-8") as f:
+    with open(final_media.filename, "r", encoding="utf-8") as f:
        content = f.read()
        assert "Hello, world!" in content
    assert final_media.mimetype == "text/html"
-    assert "SHA-256:" in final_media.get('hash')
+    assert "SHA-256:" in final_media.get("hash")
--- a/tests/storages/test_S3_storage.py
+++ b/tests/storages/test_S3_storage.py
@@ -8,6 +8,7 @@ class TestS3Storage:
    """
    Test suite for S3Storage.
    """
+
    module_name: str = "s3_storage"
    storage: Type[S3Storage]
    config: dict = {
@@ -32,28 +33,28 @@ class TestS3Storage:
        """Test that S3 client is initialized with correct parameters"""

        assert self.storage.s3 is not None
-        assert self.storage.s3.meta.region_name == 'test-region'
+        assert self.storage.s3.meta.region_name == "test-region"

    def test_get_cdn_url_generation(self):
-        """Test CDN URL formatting """
+        """Test CDN URL formatting"""
        media = Media("test.txt")
-        media.key = "path/to/file.txt"
+        media._key = "path/to/file.txt"
        url = self.storage.get_cdn_url(media)
        assert url == "https://cdn.example.com/path/to/file.txt"
-        media.key = "another/path.jpg"
+        media._key = "another/path.jpg"
        assert self.storage.get_cdn_url(media) == "https://cdn.example.com/another/path.jpg"

    def test_uploadf_sets_acl_public(self, mocker):
        media = Media("test.txt")
        mock_file = mocker.MagicMock()
-        mock_s3_upload = mocker.patch.object(self.storage.s3, 'upload_fileobj')
-        mocker.patch.object(self.storage, 'is_upload_needed', return_value=True)
+        mock_s3_upload = mocker.patch.object(self.storage.s3, "upload_fileobj")
+        mocker.patch.object(self.storage, "is_upload_needed", return_value=True)
        self.storage.uploadf(mock_file, media)
        mock_s3_upload.assert_called_once_with(
            mock_file,
-            Bucket='test-bucket',
+            Bucket="test-bucket",
            Key=media.key,
-            ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/plain'}
+            ExtraArgs={"ACL": "public-read", "ContentType": "text/plain"},
        )

    def test_upload_decision_logic(self, mocker):
@@ -61,45 +62,48 @@ class TestS3Storage:
        media = Media("test.txt")
        assert self.storage.is_upload_needed(media) is True
        self.storage.random_no_duplicate = True
-        mock_calc_hash = mocker.patch('auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash', return_value='beepboop123beepboop123beepboop123')
-        mock_file_in_folder = mocker.patch.object(self.storage, 'file_in_folder', return_value='existing_key.txt')
+        mocker.patch(
+            "auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash",
+            return_value="beepboop123beepboop123beepboop123",
+        )
+        mock_file_in_folder = mocker.patch.object(self.storage, "file_in_folder", return_value="existing_key.txt")
        assert self.storage.is_upload_needed(media) is False
-        assert media.key == 'existing_key.txt'
-        mock_file_in_folder.assert_called_with('no-dups/beepboop123beepboop123be')
+        assert media.key == "existing_key.txt"
+        mock_file_in_folder.assert_called_with("no-dups/beepboop123beepboop123be")

    def test_skips_upload_when_duplicate_exists(self, mocker):
        """Test that upload skips when file_in_folder finds existing object"""
        self.storage.random_no_duplicate = True
-        mock_file_in_folder = mocker.patch.object(S3Storage, 'file_in_folder', return_value="existing_folder/existing_file.txt")
+        mocker.patch.object(S3Storage, "file_in_folder", return_value="existing_folder/existing_file.txt")
        media = Media("test.txt")
-        media.key = "original_path.txt"
-        mock_calculate_hash = mocker.patch('auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash', return_value="beepboop123beepboop123beepboop123")
+        media._key = "original_path.txt"
+        mocker.patch(
+            "auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash",
+            return_value="beepboop123beepboop123beepboop123",
+        )
        assert self.storage.is_upload_needed(media) is False
        assert media.key == "existing_folder/existing_file.txt"
        assert media.get("previously archived") is True
-        mock_upload = mocker.patch.object(self.storage.s3, 'upload_fileobj')
+        mock_upload = mocker.patch.object(self.storage.s3, "upload_fileobj")
        result = self.storage.uploadf(None, media)
        mock_upload.assert_not_called()
        assert result is True

    def test_uploads_with_correct_parameters(self, mocker):
        media = Media("test.txt")
-        media.key = "original_key.txt"
-        mocker.patch.object(S3Storage, 'is_upload_needed', return_value=True)
-        media.mimetype = 'image/png'
+        media._key = "original_key.txt"
+        mocker.patch.object(S3Storage, "is_upload_needed", return_value=True)
+        media.mimetype = "image/png"
        mock_file = mocker.MagicMock()
-        mock_upload = mocker.patch.object(self.storage.s3, 'upload_fileobj')
+        mock_upload = mocker.patch.object(self.storage.s3, "upload_fileobj")
        self.storage.uploadf(mock_file, media)
        mock_upload.assert_called_once_with(
            mock_file,
-            Bucket='test-bucket',
-            Key='original_key.txt',
-            ExtraArgs={
-                'ACL': 'public-read',
-                'ContentType': 'image/png'
-            }
+            Bucket="test-bucket",
+            Key="original_key.txt",
+            ExtraArgs={"ACL": "public-read", "ContentType": "image/png"},
        )

    def test_file_in_folder_exists(self, mocker):
-        mock_list_objects = mocker.patch.object(self.storage.s3, 'list_objects', return_value={'Contents': [{'Key': 'path/to/file.txt'}]})
-        assert self.storage.file_in_folder('path/to/') == 'path/to/file.txt'
+        mocker.patch.object(self.storage.s3, "list_objects", return_value={"Contents": [{"Key": "path/to/file.txt"}]})
+        assert self.storage.file_in_folder("path/to/") == "path/to/file.txt"
--- a/tests/storages/test_atlos_storage.py
+++ b/tests/storages/test_atlos_storage.py
@@ -2,7 +2,7 @@ import os
 import hashlib
 import pytest
 from auto_archiver.core import Media, Metadata
-from auto_archiver.modules.atlos_storage import AtlosStorage
+from auto_archiver.modules.atlos_feeder_db_storage import AtlosFeederDbStorage as AtlosStorage


 class FakeAPIResponse:
@@ -21,13 +21,19 @@ class FakeAPIResponse:


@pytest.fixture
-def atlos_storage(setup_module) -> AtlosStorage:
+def atlos_storage(setup_module, mocker) -> AtlosStorage:
    """Fixture for AtlosStorage."""
    configs: dict = {
        "api_token": "abc123",
        "atlos_url": "https://platform.atlos.org",
    }
-    return setup_module("atlos_storage", configs)
+    mocker.patch("requests.Session")
+    atlos_feeder = setup_module("atlos_feeder_db_storage", configs)
+    mock_session = mocker.MagicMock()
+    # Configure the default response to have no results so that __iter__ terminates
+    mock_session.get.return_value = FakeAPIResponse({"next": None, "results": []})
+    atlos_feeder.session = mock_session
+    return atlos_feeder


@pytest.fixture
@@ -38,7 +44,7 @@ def media(tmp_path) -> Media:
    file_path.write_bytes(content)
    media = Media(filename=str(file_path))
    media.properties = {"something": "Title"}
-    media.key = "key"
+    media._key = "key"
    return media


@@ -49,17 +55,6 @@ def test_get_cdn_url(atlos_storage: AtlosStorage) -> None:
    assert url == atlos_storage.atlos_url


-def test_hash(tmp_path, atlos_storage: AtlosStorage) -> None:
-    """Test _hash() computes the correct SHA-256 hash of a file."""
-    content = b"hello world"
-    file_path = tmp_path / "test.txt"
-    file_path.write_bytes(content)
-    media = Media(filename="dummy.mp4")
-    media.filename = str(file_path)
-    expected_hash = hashlib.sha256(content).hexdigest()
-    assert atlos_storage._hash(media) == expected_hash
-
-
 def test_upload_no_atlos_id(tmp_path, atlos_storage: AtlosStorage, media: Media, mocker) -> None:
    """Test upload() returns False when metadata lacks atlos_id."""
    metadata = Metadata()  # atlos_id not set
@@ -69,74 +64,49 @@ def test_upload_no_atlos_id(tmp_path, atlos_storage: AtlosStorage, media: Media,
    post_mock.assert_not_called()


-def test_upload_already_uploaded(atlos_storage: AtlosStorage,
-                                 metadata: Metadata,
-                                 media: Media,
-                                 tmp_path,
-                                 mocker) -> None:
+def test_upload_already_uploaded(atlos_storage: AtlosStorage, metadata: Metadata, media: Media, mocker) -> None:
    """Test upload() returns True if media hash already exists."""
    content = b"media content"
    metadata.set("atlos_id", 101)
    media_hash = hashlib.sha256(content).hexdigest()
-    fake_get = FakeAPIResponse({
-        "result": {"artifacts": [{"file_hash_sha256": media_hash}]}
-    })
-    get_mock = mocker.patch("requests.get", return_value=fake_get)
-    post_mock = mocker.patch("requests.post")
+    fake_get_response = {"result": {"artifacts": [{"file_hash_sha256": media_hash}]}}
+    get_mock = mocker.patch.object(atlos_storage, "_get", return_value=fake_get_response)
+    post_mock = mocker.patch.object(atlos_storage, "_post")
    result = atlos_storage.upload(media, metadata)
    assert result is True
    get_mock.assert_called_once()
    post_mock.assert_not_called()


-def test_upload_not_uploaded(tmp_path, atlos_storage: AtlosStorage,
-                             metadata: Metadata,
-                             media: Media,
-                             mocker) -> None:
+def test_upload_not_uploaded(tmp_path, atlos_storage: AtlosStorage, metadata: Metadata, media: Media, mocker) -> None:
    """Test upload() uploads media when not already present."""
    metadata.set("atlos_id", 202)
-    fake_get = FakeAPIResponse({
-        "result": {"artifacts": [{"file_hash_sha256": "different_hash"}]}
-    })
-    get_mock = mocker.patch("requests.get", return_value=fake_get)
-    fake_post = FakeAPIResponse({}, raise_error=False)
-    post_mock = mocker.patch("requests.post", return_value=fake_post)
+    fake_get_response = {"result": {"artifacts": [{"file_hash_sha256": "different_hash"}]}}
+    get_mock = mocker.patch.object(atlos_storage, "_get", return_value=fake_get_response)
+    fake_post_response = {"result": "uploaded"}
+    post_mock = mocker.patch.object(atlos_storage, "_post", return_value=fake_post_response)
    result = atlos_storage.upload(media, metadata)
    assert result is True
+
    get_mock.assert_called_once()
    post_mock.assert_called_once()
-    expected_url = f"{atlos_storage.atlos_url}/api/v2/source_material/upload/202"
-    expected_headers = {"Authorization": f"Bearer {atlos_storage.api_token}"}
+    expected_endpoint = "/api/v2/source_material/upload/202"
+    call_args = post_mock.call_args[0]
+    assert call_args[0] == expected_endpoint
+    call_kwargs = post_mock.call_args[1]
    expected_params = {"title": media.properties}
-    call_kwargs = post_mock.call_args.kwargs
-    assert call_kwargs["headers"] == expected_headers
    assert call_kwargs["params"] == expected_params
-    # Verify the URL passed to requests.post.
-    posted_url = call_kwargs.get("url") or post_mock.call_args.args[0]
-    assert posted_url == expected_url
-    # Verify files parameter contains the correct filename.
    file_tuple = call_kwargs["files"]["file"]
    assert file_tuple[0] == os.path.basename(media.filename)


-def test_upload_post_http_error(tmp_path,
-                                atlos_storage: AtlosStorage,
-                                metadata: Metadata,
-                                media: Media,
-                                mocker) -> None:
+def test_upload_post_http_error(
+    tmp_path, atlos_storage: AtlosStorage, metadata: Metadata, media: Media, mocker
+) -> None:
    """Test upload() propagates HTTP error during POST."""
    metadata.set("atlos_id", 303)
-    fake_get = FakeAPIResponse({
-        "result": {"artifacts": []}
-    })
-    mocker.patch("requests.get", return_value=fake_get)
-    fake_post = FakeAPIResponse({}, raise_error=True)
-    mocker.patch("requests.post", return_value=fake_post)
+    fake_get_response = {"result": {"artifacts": []}}
+    mocker.patch.object(atlos_storage, "_get", return_value=fake_get_response)
+    mocker.patch.object(atlos_storage, "_post", side_effect=Exception("HTTP error"))
    with pytest.raises(Exception, match="HTTP error"):
        atlos_storage.upload(media, metadata)
-
-
-def test_uploadf_not_implemented(atlos_storage: AtlosStorage) -> None:
-    """Test uploadf() returns None (not implemented)."""
-    result = atlos_storage.uploadf(None, "dummy")
-    assert result is None
--- a/tests/storages/test_gdrive_storage.py
+++ b/tests/storages/test_gdrive_storage.py
@@ -1,37 +1,42 @@
 from typing import Type
 import pytest
-from oauth2client import service_account

 from auto_archiver.core import Media
 from auto_archiver.modules.gdrive_storage import GDriveStorage
-from auto_archiver.core.metadata import Metadata
 from tests.storages.test_storage_base import TestStorageBase


+@pytest.fixture(autouse=True)
+def mock_sleep(mocker):
+    """Mock time.sleep to avoid delays."""
+    return mocker.patch("time.sleep")
+
+
@pytest.fixture
-def gdrive_storage(setup_module, mocker):
+def gdrive_storage(setup_module, mocker) -> GDriveStorage:
    module_name: str = "gdrive_storage"
-    storage: GDriveStorage
-    config: dict = {'path_generator': 'url',
-            'filename_generator': 'static',
-            'root_folder_id': "fake_root_folder_id",
-            'oauth_token': None,
-            'service_account': 'fake_service_account.json'
-                    }
-    mocker.patch('google.oauth2.service_account.Credentials.from_service_account_file')
+    config: dict = {
+        "path_generator": "url",
+        "filename_generator": "static",
+        "root_folder_id": "fake_root_folder_id",
+        "oauth_token": None,
+        "service_account": "fake_service_account.json",
+    }
+    mocker.patch("google.oauth2.service_account.Credentials.from_service_account_file")
    return setup_module(module_name, config)


 def test_initialize_fails_with_non_existent_creds(setup_module):
    """Test that the Google Drive service raises a FileNotFoundError when the service account file does not exist.
-        (and isn't mocked)
+    (and isn't mocked)
    """
-    config: dict = {'path_generator': 'url',
-                    'filename_generator': 'static',
-                    'root_folder_id': "fake_root_folder_id",
-                    'oauth_token': None,
-                    'service_account': 'fake_service_account.json'
-                    }
+    config: dict = {
+        "path_generator": "url",
+        "filename_generator": "static",
+        "root_folder_id": "fake_root_folder_id",
+        "oauth_token": None,
+        "service_account": "fake_service_account.json",
+    }
    with pytest.raises(FileNotFoundError) as exc_info:
        setup_module("gdrive_storage", config)
    assert "No such file or directory" in str(exc_info.value)
@@ -48,10 +53,10 @@ def test_get_id_from_parent_and_name(gdrive_storage, mocker):
    result = gdrive_storage._get_id_from_parent_and_name("parent", "mock", retries=1, use_mime_type=False)
    assert result == "123"

+
 def test_path_parts():
    media = Media(filename="test.jpg")
-    media.key = "folder1/folder2/test.jpg"
-
+    media._key = "folder1/folder2/test.jpg"


@pytest.mark.skip(reason="Requires real credentials")
@@ -63,19 +68,17 @@ class TestGDriveStorageConnected(TestStorageBase):

    module_name: str = "gdrive_storage"
    storage: Type[GDriveStorage]
-    config: dict = {'path_generator': 'url',
-            'filename_generator': 'static',
-            # TODO: replace with real root folder id
-            'root_folder_id': "1TVY_oJt95_dmRSEdP9m5zFy7l50TeCSk",
-            'oauth_token': None,
-            'service_account': 'secrets/service_account.json'
-                    }
-
+    config: dict = {
+        "path_generator": "url",
+        "filename_generator": "static",
+        # TODO: replace with real root folder id
+        "root_folder_id": "1TVY_oJt95_dmRSEdP9m5zFy7l50TeCSk",
+        "oauth_token": None,
+        "service_account": "secrets/service_account.json",
+    }

    def test_initialize_with_real_credentials(self):
        """
        Test that the Google Drive service can be initialized with real credentials.
        """
        assert self.storage.service is not None
-
-
--- a/tests/storages/test_local_storage.py
+++ b/tests/storages/test_local_storage.py
@@ -1,43 +1,63 @@
-
 import os
 from pathlib import Path

 import pytest

-from auto_archiver.core import Media
+from auto_archiver.core import Media, Metadata
 from auto_archiver.modules.local_storage import LocalStorage
+from auto_archiver.core.consts import SetupError


@pytest.fixture
-def local_storage(setup_module) -> LocalStorage:
+def local_storage(setup_module, tmp_path) -> LocalStorage:
+    save_to = tmp_path / "local_archive"
+    save_to.mkdir()
    configs: dict = {
        "path_generator": "flat",
        "filename_generator": "static",
-        "save_to": "./local_archive",
+        "save_to": str(save_to),
        "save_absolute": False,
    }
    return setup_module("local_storage", configs)

+
+@pytest.fixture
+def sample_media(tmp_path) -> Media:
+    """Fixture creating a Media object with a temporary source file"""
+    src_file = tmp_path / "source.txt"
+    src_file.write_text("test content")
+    return Media(filename=str(src_file))
+
+
+def test_too_long_save_path(setup_module):
+    with pytest.raises(SetupError):
+        setup_module("local_storage", {"save_to": "long" * 100})
+
 def test_get_cdn_url_relative(local_storage):
-    media = Media(key="test.txt", filename="dummy.txt")
+    local_storage.filename_generator = "random"
+    media = Media(filename="dummy.txt")
+    local_storage.set_key(media, "https://example.com", Metadata())
    expected = os.path.join(local_storage.save_to, media.key)
    assert local_storage.get_cdn_url(media) == expected

+
 def test_get_cdn_url_absolute(local_storage):
-    media = Media(key="test.txt", filename="dummy.txt")
+    local_storage.filename_generator = "random"
+
+    media = Media(filename="dummy.txt")
    local_storage.save_absolute = True
+    local_storage.set_key(media, "https://example.com", Metadata())
    expected = os.path.abspath(os.path.join(local_storage.save_to, media.key))
    assert local_storage.get_cdn_url(media) == expected

+
 def test_upload_file_contents_and_metadata(local_storage, sample_media):
+    local_storage.store(sample_media, "https://example.com", Metadata())
    dest = os.path.join(local_storage.save_to, sample_media.key)
-    assert local_storage.upload(sample_media) is True
    assert Path(sample_media.filename).read_text() == Path(dest).read_text()


 def test_upload_nonexistent_source(local_storage):
-    media = Media(key="missing.txt", filename="nonexistent.txt")
+    media = Media(_key="missing.txt", filename="nonexistent.txt")
    with pytest.raises(FileNotFoundError):
        local_storage.upload(media)
-
-
--- a/tests/storages/test_storage_base.py
+++ b/tests/storages/test_storage_base.py
@@ -2,21 +2,109 @@ from typing import Type

 import pytest

-from auto_archiver.core.metadata import Metadata
+from auto_archiver.core.metadata import Metadata, Media
 from auto_archiver.core.storage import Storage
+from auto_archiver.core.module import ModuleFactory


 class TestStorageBase(object):
-
    module_name: str = None
    config: dict = None

    @pytest.fixture(autouse=True)
    def setup_storage(self, setup_module):
-        assert (
-            self.module_name is not None
-        ), "self.module_name must be set on the subclass"
+        assert self.module_name is not None, "self.module_name must be set on the subclass"
        assert self.config is not None, "self.config must be a dict set on the subclass"
-        self.storage: Type[Storage] = setup_module(
-            self.module_name, self.config
-        )
+        self.storage: Type[Storage] = setup_module(self.module_name, self.config)
+
+
+class TestBaseStorage(Storage):
+    name = "test_storage"
+
+    def get_cdn_url(self, media):
+        return "cdn_url"
+
+    def uploadf(self, file, key, **kwargs):
+        return True
+
+
+@pytest.fixture
+def dummy_file(tmp_path):
+    # create dummy.txt file
+    dummy_file = tmp_path / "dummy.txt"
+    dummy_file.write_text("test content")
+    return str(dummy_file)
+
+
+@pytest.fixture
+def storage_base():
+    def _storage_base(config):
+        storage_base = TestBaseStorage()
+        storage_base.config_setup({TestBaseStorage.name: config})
+        storage_base.module_factory = ModuleFactory()
+        return storage_base
+
+    return _storage_base
+
+
+@pytest.mark.parametrize(
+    "path_generator, filename_generator, url, expected_key",
+    [
+        ("flat", "static", "https://example.com/file/", "folder/6ae8a75555209fd6c44157c0.txt"),
+        ("flat", "random", "https://example.com/file/", "folder/pretend-random.txt"),
+        ("url", "static", "https://example.com/file/", "folder/https-example-com-file/6ae8a75555209fd6c44157c0.txt"),
+        ("url", "random", "https://example.com/file/", "folder/https-example-com-file/pretend-random.txt"),
+        ("random", "static", "https://example.com/file/", "folder/pretend-random/6ae8a75555209fd6c44157c0.txt"),
+        ("random", "random", "https://example.com/file/", "folder/pretend-random/pretend-random.txt"),
+    ],
+)
+def test_storage_name_generation(
+    storage_base, path_generator, filename_generator, url, expected_key, mocker, tmp_path, dummy_file
+):
+    mock_random = mocker.patch("auto_archiver.core.storage.random_str")
+    mock_random.return_value = "pretend-random"
+
+    config: dict = {
+        "path_generator": path_generator,
+        "filename_generator": filename_generator,
+    }
+    storage: Storage = storage_base(config)
+    assert storage.path_generator == path_generator
+    assert storage.filename_generator == filename_generator
+
+    metadata = Metadata()
+    metadata.set_context("folder", "folder")
+    media = Media(filename=dummy_file)
+    storage.set_key(media, url, metadata)
+    print(media.key)
+    assert media.key == expected_key
+
+
+def test_really_long_name(storage_base, dummy_file):
+    config: dict = {
+        "path_generator": "url",
+        "filename_generator": "static",
+    }
+    storage: Storage = storage_base(config)
+
+    url = f"https://example.com/{'file' * 100}"
+    media = Media(filename=dummy_file)
+    storage.set_key(media, url, Metadata())
+    assert media.key == f"https-example-com-{'file' * 13}/6ae8a75555209fd6c44157c0.txt"
+
+
+def test_storage_loads_hash_enricher(storage_base, dummy_file):
+    """Ensure 'hash_enricher' is properly loaded without an explicit import."""
+    config = {"path_generator": "url", "filename_generator": "static"}
+    storage = storage_base(config)
+
+    url = "https://example.com/file/"
+    media = Media(filename=dummy_file)
+    metadata = Metadata()
+
+    try:
+        storage.set_key(media, url, metadata)
+    except Exception as e:
+        pytest.fail(f"Storage failed to dynamically load hash_enricher: {e}")
+
+    assert media.key is not None, "Expected media.key to be set, but it was None"
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -3,39 +3,46 @@ from auto_archiver.core import config
 from ruamel.yaml.scanner import ScannerError
 from ruamel.yaml.comments import CommentedMap

+
 def test_return_default_config_for_nonexistent_file():
    assert config.read_yaml("nonexistent_file.yaml") == config.EMPTY_CONFIG

+
 def test_return_default_config_for_empty_file(tmp_path):
    empty_file = tmp_path / "empty_file.yaml"
    empty_file.write_text("")
    assert config.read_yaml(empty_file) == config.EMPTY_CONFIG

+
 def test_raise_error_on_invalid_yaml(tmp_path):
    invalid_yaml = tmp_path / "invalid_yaml.yaml"
-    invalid_yaml.write_text("key: \"value_without_end_quote")
+    invalid_yaml.write_text('key: "value_without_end_quote')
    # make sure it raises ScannerError
    with pytest.raises(ScannerError):
        config.read_yaml(invalid_yaml)

+
 def test_write_yaml(tmp_path):
    yaml_file = tmp_path / "write_yaml.yaml"
    config.store_yaml(config.EMPTY_CONFIG, yaml_file.as_posix())
    assert "steps:\n" in yaml_file.read_text()

+
 def test_round_trip_comments(tmp_path):
    yaml_file = tmp_path / "round_trip_comments.yaml"

    with open(yaml_file, "w") as f:
-        f.write("generic_extractor:\n  facebook_cookie: abc # end of line comment\n  subtitles: true\n  # comments: false\n  # livestreams: false\n  list_type:\n    - value1\n    - value2")
+        f.write(
+            "generic_extractor:\n  facebook_cookie: abc # end of line comment\n  subtitles: true\n  # comments: false\n  # livestreams: false\n  list_type:\n    - value1\n    - value2"
+        )

    loaded = config.read_yaml(yaml_file)
    # check the comments are preserved
-    assert loaded['generic_extractor']['facebook_cookie'] == "abc"
-    assert loaded['generic_extractor'].ca.items['facebook_cookie'][2].value == "# end of line comment\n"
+    assert loaded["generic_extractor"]["facebook_cookie"] == "abc"
+    assert loaded["generic_extractor"].ca.items["facebook_cookie"][2].value == "# end of line comment\n"

    # add some more items to my_settings
-    loaded['generic_extractor']['list_type'].append("bellingcat")
+    loaded["generic_extractor"]["list_type"].append("bellingcat")
    config.store_yaml(loaded, yaml_file.as_posix())

    assert "# comments: false" in yaml_file.read_text()
@@ -43,14 +50,17 @@ def test_round_trip_comments(tmp_path):
    assert "abc # end of line comment" in yaml_file.read_text()
    assert "- value2\n  - bellingcat" in yaml_file.read_text()

+
 def test_merge_dicts():
    yaml_dict = config.EMPTY_CONFIG
-    yaml_dict['settings'] = CommentedMap(**{
+    yaml_dict["settings"] = CommentedMap(
+        **{
            "key1": ["a"],
            "key2": "old_value",
            "key3": ["a", "b", "c"],
            "key5": "value5",
-        })
+        }
+    )

    dotdict = {
        "settings.key1": ["b", "c"],
@@ -67,15 +77,16 @@ def test_merge_dicts():


 def test_check_types():
-    assert config.is_list_type([]) == True
-    assert config.is_list_type(()) == True
-    assert config.is_list_type(set()) == True
-    assert config.is_list_type({}) == False
-    assert config.is_list_type("") == False
-    assert config.is_dict_type({}) == True
-    assert config.is_dict_type(CommentedMap()) == True
-    assert config.is_dict_type([]) == False
-    assert config.is_dict_type("") == False
+    assert config.is_list_type([]) is True
+    assert config.is_list_type(()) is True
+    assert config.is_list_type(set()) is True
+    assert config.is_list_type({}) is False
+    assert config.is_list_type("") is False
+    assert config.is_dict_type({}) is True
+    assert config.is_dict_type(CommentedMap()) is True
+    assert config.is_dict_type([]) is False
+    assert config.is_dict_type("") is False
+

 def test_from_dot_notation():
    dotdict = {
@@ -88,16 +99,17 @@ def test_from_dot_notation():
    assert normal_dict["settings"]["key2"] == "new_value"
    assert normal_dict["settings"]["key3"]["key4"] == "value"

+
 def test_to_dot_notation():
    yaml_dict = config.EMPTY_CONFIG
-    yaml_dict['settings'] = {
+    yaml_dict["settings"] = {
        "key1": ["a", "b", "c"],
        "key2": "new_value",
        "key3": {
            "key4": "value",
-        }
+        },
    }
    dotdict = config.to_dot_notation(yaml_dict)
    assert dotdict["settings.key1"] == ["a", "b", "c"]
    assert dotdict["settings.key2"] == "new_value"
-    assert dotdict["settings.key3.key4"] == "value"
+    assert dotdict["settings.key3.key4"] == "value"
--- a/tests/test_implementation.py
+++ b/tests/test_implementation.py
@@ -10,21 +10,23 @@ def orchestration_file_path(tmp_path):
    folder.mkdir(exist_ok=True)
    return (folder / "example_orch.yaml").as_posix()

+
@pytest.fixture
 def orchestration_file(orchestration_file_path):
-    def _orchestration_file(content=''):
+    def _orchestration_file(content=""):
        with open(orchestration_file_path, "w") as f:
            f.write(content)
        return orchestration_file_path
-    
+
    return _orchestration_file

+
@pytest.fixture
 def autoarchiver(tmp_path, monkeypatch, request):
    def _autoarchiver(args=[]):
-
        def cleanup():
            from loguru import logger
+
            if not logger._core.handlers.get(0):
                logger._core.handlers_count = 0
                logger.add(sys.stderr)
@@ -44,9 +46,9 @@ def autoarchiver(tmp_path, monkeypatch, request):
 def test_run_auto_archiver_no_args(caplog, autoarchiver):
    with pytest.raises(SystemExit):
        autoarchiver()
-
    assert "provide at least one URL via the command line, or set up an alternative feeder" in caplog.text

+
 def test_run_auto_archiver_invalid_file(caplog, autoarchiver):
    # exec 'auto-archiver' on the command lin
    with pytest.raises(SystemExit):
@@ -54,6 +56,7 @@ def test_run_auto_archiver_invalid_file(caplog, autoarchiver):

    assert "Make sure the file exists and try again, or run without th" in caplog.text

+
 def test_run_auto_archiver_empty_file(caplog, autoarchiver, orchestration_file):
    # create a valid (empty) orchestration file
    path = orchestration_file(content="")
@@ -64,6 +67,7 @@ def test_run_auto_archiver_empty_file(caplog, autoarchiver, orchestration_file):
    # should treat an empty file as if there is no file at all
    assert " No URLs provided. Please provide at least one URL via the com" in caplog.text

+
 def test_call_autoarchiver_main(caplog, monkeypatch, tmp_path):
    from auto_archiver.__main__ import main

@@ -75,4 +79,4 @@ def test_call_autoarchiver_main(caplog, monkeypatch, tmp_path):
        with pytest.raises(SystemExit):
            main()

-    assert "No URLs provided. Please provide at least one" in caplog.text
+    assert "No URLs provided. Please provide at least one" in caplog.text
--- a/tests/test_metadata.py
+++ b/tests/test_metadata.py
@@ -62,18 +62,8 @@ def test_simple_merge(basic_metadata):


 def test_left_merge():
-    left = (
-        Metadata()
-        .set("tags", ["a"])
-        .set("stats", {"views": 10})
-        .set("status", "success")
-    )
-    right = (
-        Metadata()
-        .set("tags", ["b"])
-        .set("stats", {"likes": 5})
-        .set("status", "no archiver")
-    )
+    left = Metadata().set("tags", ["a"]).set("stats", {"views": 10}).set("status", "success")
+    right = Metadata().set("tags", ["b"]).set("stats", {"likes": 5}).set("status", "no archiver")

    left.merge(right, overwrite_left=True)
    assert left.get("status") == "no archiver"
@@ -120,6 +110,7 @@ def test_is_empty():
 def test_store():
    pass

+
 # Test Media operations


@@ -176,6 +167,7 @@ def test_choose_most_complete():
    res = Metadata.choose_most_complete([m_more, m_less])
    assert res.metadata.get("title") == "Title 1"

+
 def test_choose_most_complete_from_pickles(unpickle):
    # test most complete from pickles before and after an enricher has run
    # Only compares length of media, not the actual media
--- a/tests/test_modules.py
+++ b/tests/test_modules.py
@@ -1,40 +1,41 @@
-import sys
 import pytest
 from auto_archiver.core.module import ModuleFactory, LazyBaseModule
 from auto_archiver.core.base_module import BaseModule
+from auto_archiver.core.consts import SetupError
+

@pytest.fixture
 def example_module():
    import auto_archiver

    module_factory = ModuleFactory()
-
-    previous_path = auto_archiver.modules.__path__
+    # previous_path = auto_archiver.modules.__path__
    auto_archiver.modules.__path__.append("tests/data/test_modules/")
-
    return module_factory.get_module_lazy("example_module")

+
 def test_get_module_lazy(example_module):
    assert example_module.name == "example_module"
    assert example_module.display_name == "Example Module"

    assert example_module.manifest is not None

+
 def test_python_dependency_check(example_module):
    # example_module requires loguru, which is not installed
    # monkey patch the manifest to include a nonexistnet dependency
    example_module.manifest["dependencies"]["python"] = ["does_not_exist"]

-    with pytest.raises(SystemExit) as load_error:
+    with pytest.raises(SetupError):
        example_module.load({})

-    assert load_error.value.code == 1

 def test_binary_dependency_check(example_module):
    # example_module requires ffmpeg, which is not installed
    # monkey patch the manifest to include a nonexistnet dependency
    example_module.manifest["dependencies"]["binary"] = ["does_not_exist"]

+
 def test_module_dependency_check_loads_module(example_module):
    # example_module requires cli_feeder, which is not installed
    # monkey patch the manifest to include a nonexistnet dependency
@@ -49,19 +50,20 @@ def test_module_dependency_check_loads_module(example_module):
    assert module_factory._lazy_modules["hash_enricher"] is not None
    assert module_factory._lazy_modules["hash_enricher"]._instance is not None

-def test_load_module(example_module):

+def test_load_module(example_module):
    # setup the module, and check that config is set to the default values
    loaded_module = example_module.load({})
    assert loaded_module is not None
    assert isinstance(loaded_module, BaseModule)
    assert loaded_module.name == "example_module"
    assert loaded_module.display_name == "Example Module"
-    assert loaded_module.config["example_module"] ==  {"csv_file" : "db.csv"}
+    assert loaded_module.config["example_module"] == {"csv_file": "db.csv"}

    # check that the vlaue is set on the module itself
    assert loaded_module.csv_file == "db.csv"

+
@pytest.mark.parametrize("module_name", ["local_storage", "generic_extractor", "html_formatter", "csv_db"])
 def test_load_modules(module_name):
    # test that specific modules can be loaded
@@ -78,6 +80,20 @@ def test_load_modules(module_name):
    # check that default settings are applied
    default_config = module.configs
    assert loaded_module.name in loaded_module.config.keys()
+    defaults = {k for k in default_config}
+    assert defaults in [loaded_module.config[module_name].keys()]
+
+
+@pytest.mark.parametrize("module_name", ["local_storage", "generic_extractor", "html_formatter", "csv_db"])
+def test_config_defaults(module_name):
+    """Each config option's declared default must equal the value in the loaded config."""
+    # Note: some modules can alter values in the setup() method, this test checks cases that don't
+    lazy_module = ModuleFactory().get_module_lazy(module_name)
+    instance = lazy_module.load({})
+    # expected values come straight from the "default" entry of each config option
+    expected = {name: spec.get("default") for name, spec in lazy_module.configs.items()}
+    actual = instance.config[module_name]
+    assert expected == actual


@pytest.mark.parametrize("module_name", ["local_storage", "generic_extractor", "html_formatter", "csv_db"])
@@ -96,5 +112,3 @@ def test_lazy_base_module(module_name):
    assert len(lazy_module.configs) > 0
    assert len(lazy_module.description) > 0
    assert len(lazy_module.version) > 0
-
-
--- a/tests/test_orchestrator.py
+++ b/tests/test_orchestrator.py
@@ -1,59 +1,73 @@
 import pytest
-import sys
 from argparse import ArgumentParser, ArgumentTypeError
 from auto_archiver.core.orchestrator import ArchivingOrchestrator
 from auto_archiver.version import __version__
 from auto_archiver.core.config import read_yaml, store_yaml
 from auto_archiver.core import Metadata
+from auto_archiver.core.consts import SetupError

 TEST_ORCHESTRATION = "tests/data/test_orchestration.yaml"
 TEST_MODULES = "tests/data/test_modules/"

+
@pytest.fixture
 def test_args():
-    return ["--config", TEST_ORCHESTRATION,
-            "--module_paths", TEST_MODULES,
-            "--example_module.required_field", "some_value"] # just set this for normal testing, we will remove it later
+    return [
+        "--config",
+        TEST_ORCHESTRATION,
+        "--module_paths",
+        TEST_MODULES,
+        "--example_module.required_field",
+        "some_value",
+    ]  # just set this for normal testing, we will remove it later
+

@pytest.fixture
 def orchestrator():
    return ArchivingOrchestrator()

+
@pytest.fixture
 def basic_parser(orchestrator) -> ArgumentParser:
    return orchestrator.setup_basic_parser()

+
 def test_setup_orchestrator(orchestrator):
    assert orchestrator is not None

+
 def test_parse_config():
    pass

+
 def test_parse_basic(basic_parser):
    args = basic_parser.parse_args(["--config", TEST_ORCHESTRATION])
    assert args.config_file == TEST_ORCHESTRATION

+
@pytest.mark.parametrize("mode", ["simple", "full"])
 def test_mode(basic_parser, mode):
    args = basic_parser.parse_args(["--mode", mode])
    assert args.mode == mode

+
 def test_mode_invalid(basic_parser, capsys):
    with pytest.raises(SystemExit) as exit_error:
        basic_parser.parse_args(["--mode", "invalid"])
    assert exit_error.value.code == 2
    assert "invalid choice" in capsys.readouterr().err

+
 def test_version(basic_parser, capsys):
    with pytest.raises(SystemExit) as exit_error:
        basic_parser.parse_args(["--version"])
    assert exit_error.value.code == 0
    assert capsys.readouterr().out == f"{__version__}\n"

-def test_help(orchestrator, basic_parser, capsys):

+def test_help(orchestrator, basic_parser, capsys):
    args = basic_parser.parse_args(["--help"])
-    assert args.help == True
+    assert args.help is True

    # test the show_help() on orchestrator
    with pytest.raises(SystemExit) as exit_error:
@@ -78,19 +92,22 @@ def test_help(orchestrator, basic_parser, capsys):
    assert "--logging.level" in logs

    # individual module configs
-    assert "--gsheet_feeder.sheet_id" in logs
+    assert "--gsheet_feeder_db.sheet_id" in logs


 def test_add_custom_modules_path(orchestrator, test_args):
    orchestrator.setup_config(test_args)
-    
+
    import auto_archiver
+
    assert "tests/data/test_modules/" in auto_archiver.modules.__path__

-def test_add_custom_modules_path_invalid(orchestrator, caplog, test_args):

-    orchestrator.setup_config(test_args +  # we still need to load the real path to get the example_module 
-                          ["--module_paths", "tests/data/invalid_test_modules/"])
+def test_add_custom_modules_path_invalid(orchestrator, caplog, test_args):
+    orchestrator.setup_config(
+        test_args  # we still need to load the real path to get the example_module
+        + ["--module_paths", "tests/data/invalid_test_modules/"]
+    )

    assert caplog.records[0].message == "Path 'tests/data/invalid_test_modules/' does not exist. Skipping..."

@@ -99,16 +116,16 @@ def test_check_required_values(orchestrator, caplog, test_args):
    # drop the example_module.required_field from the test_args
    test_args = test_args[:-2]

-    with pytest.raises(SystemExit) as exit_error:
-        config = orchestrator.setup_config(test_args)
+    with pytest.raises(SystemExit):
+        orchestrator.setup_config(test_args)

    assert caplog.records[1].message == "the following arguments are required: --example_module.required_field"

-def test_get_required_values_from_config(orchestrator, test_args, tmp_path):

+def test_get_required_values_from_config(orchestrator, test_args, tmp_path):
    # load the default example yaml, add a required field, then run the orchestrator
    test_yaml = read_yaml(TEST_ORCHESTRATION)
-    test_yaml['example_module'] = {'required_field': 'some_value'}
+    test_yaml["example_module"] = {"required_field": "some_value"}
    # write it to a temp file
    tmp_file = (tmp_path / "temp_config.yaml").as_posix()
    store_yaml(test_yaml, tmp_file)
@@ -117,27 +134,42 @@ def test_get_required_values_from_config(orchestrator, test_args, tmp_path):
    config = orchestrator.setup_config(["--config", tmp_file, "--module_paths", TEST_MODULES])
    assert config is not None

-def test_load_authentication_string(orchestrator, test_args):

-    config = orchestrator.setup_config(test_args + ["--authentication", '{"facebook.com": {"username": "my_username", "password": "my_password"}}'])
-    assert config['authentication'] == {"facebook.com": {"username": "my_username", "password": "my_password"}}
+def test_load_authentication_string(orchestrator, test_args):
+    config = orchestrator.setup_config(
+        test_args + ["--authentication", '{"facebook.com": {"username": "my_username", "password": "my_password"}}']
+    )
+    assert config["authentication"] == {"facebook.com": {"username": "my_username", "password": "my_password"}}
+

 def test_load_authentication_string_concat_site(orchestrator, test_args):
-    
    config = orchestrator.setup_config(test_args + ["--authentication", '{"x.com,twitter.com": {"api_key": "my_key"}}'])
-    assert config['authentication'] == {"x.com": {"api_key": "my_key"},
-                                                     "twitter.com": {"api_key": "my_key"}}
+    assert config["authentication"] == {"x.com": {"api_key": "my_key"}, "twitter.com": {"api_key": "my_key"}}
+

 def test_load_invalid_authentication_string(orchestrator, test_args):
    with pytest.raises(ArgumentTypeError):
-        orchestrator.setup_config(test_args + ["--authentication", "{\''invalid_json"])
+        orchestrator.setup_config(test_args + ["--authentication", "{''invalid_json"])
+

 def test_load_authentication_invalid_dict(orchestrator, test_args):
    with pytest.raises(ArgumentTypeError):
        orchestrator.setup_config(test_args + ["--authentication", "[true, false]"])

+
 def test_load_modules_from_commandline(orchestrator, test_args):
-    args = test_args + ["--feeders", "example_module", "--extractors", "example_module", "--databases", "example_module", "--enrichers", "example_module", "--formatters", "example_module"]
+    args = test_args + [
+        "--feeders",
+        "example_module",
+        "--extractors",
+        "example_module",
+        "--databases",
+        "example_module",
+        "--enrichers",
+        "example_module",
+        "--formatters",
+        "example_module",
+    ]

    orchestrator.setup(args)

@@ -153,27 +185,37 @@ def test_load_modules_from_commandline(orchestrator, test_args):
    assert orchestrator.enrichers[0].name == "example_module"
    assert orchestrator.formatters[0].name == "example_module"

+
 def test_load_settings_for_module_from_commandline(orchestrator, test_args):
-    args = test_args + ["--feeders", "gsheet_feeder", "--gsheet_feeder.sheet_id", "123", "--gsheet_feeder.service_account", "tests/data/test_service_account.json"]
+    args = test_args + [
+        "--feeders",
+        "gsheet_feeder_db",
+        "--gsheet_feeder_db.sheet_id",
+        "123",
+        "--gsheet_feeder_db.service_account",
+        "tests/data/test_service_account.json",
+    ]

    orchestrator.setup(args)

    assert len(orchestrator.feeders) == 1
-    assert orchestrator.feeders[0].name == "gsheet_feeder"
-    assert orchestrator.config['gsheet_feeder']['sheet_id'] == "123"
+    assert orchestrator.feeders[0].name == "gsheet_feeder_db"
+    assert orchestrator.config["gsheet_feeder_db"]["sheet_id"] == "123"


 def test_multiple_orchestrator(test_args):
-
-    o1_args = test_args + ["--feeders", "gsheet_feeder", "--gsheet_feeder.service_account", "tests/data/test_service_account.json"]
+    o1_args = test_args + [
+        "--feeders",
+        "gsheet_feeder_db",
+        "--gsheet_feeder_db.service_account",
+        "tests/data/test_service_account.json",
+    ]
    o1 = ArchivingOrchestrator()

-    with pytest.raises(ValueError) as exit_error:
-        # this should fail because the gsheet_feeder requires a sheet_id / sheet
+    with pytest.raises(ValueError):
+        # this should fail because the gsheet_feeder_db requires a sheet_id / sheet
        o1.setup(o1_args)

-
-
    o2_args = test_args + ["--feeders", "example_module"]
    o2 = ArchivingOrchestrator()
    o2.setup(o2_args)
@@ -182,4 +224,16 @@ def test_multiple_orchestrator(test_args):

    output: Metadata = list(o2.feed())
    assert len(output) == 1
-    assert output[0].get_url() == "https://example.com"
+    assert output[0].get_url() == "https://example.com"
+
+
+def test_wrong_step_type(test_args, caplog):
+    """Setup must raise SetupError when a module is used as a step type it is not."""
+    # example_extractor is not a valid feeder!
+    args = test_args + ["--feeders", "example_extractor"]
+
+    orchestrator = ArchivingOrchestrator()
+    with pytest.raises(SetupError) as err:
+        orchestrator.setup(args)
+    # Checked after the block: statements following the raising call inside it never run.
+    assert "Module 'example_extractor' is not a feeder" in str(err.value)
--- a/tests/utils/test_misc.py
+++ b/tests/utils/test_misc.py
@@ -14,7 +14,7 @@ from auto_archiver.utils.misc import (
    update_nested_dict,
    calculate_file_hash,
    random_str,
-    get_timestamp
+    get_timestamp,
 )


@@ -38,40 +38,46 @@ class TestDirectoryUtils:
        mkdir_if_not_exists(existing_dir)
        assert existing_dir.exists()

+
 class TestURLExpansion:
-    @pytest.mark.parametrize("input_url,expected", [
-        ("https://example.com", "https://example.com"),
-        ("https://t.co/test", "https://expanded.url")
-    ])
+    @pytest.mark.parametrize(
+        "input_url,expected",
+        [("https://example.com", "https://example.com"), ("https://t.co/test", "https://expanded.url")],
+    )
    def test_expand_url(self, input_url, expected, mocker):
        mock_response = mocker.Mock()
        mock_response.url = "https://expanded.url"
-        mocker.patch('requests.get', return_value=mock_response)
+        mocker.patch("requests.get", return_value=mock_response)
        result = expand_url(input_url)
        assert result == expected

    def test_expand_url_handles_errors(self, caplog, mocker):
-        mocker.patch('requests.get', side_effect=Exception("Connection error"))
+        mocker.patch("requests.get", side_effect=Exception("Connection error"))
        url = "https://t.co/error"
        result = expand_url(url)
        assert result == url
        assert f"Failed to expand url {url}" in caplog.text

+
 class TestAttributeHandling:
    class Sample:
        exists = "value"
        none = None

-    @pytest.mark.parametrize("obj,attr,default,expected", [
-        (Sample(), "exists", "default", "value"),
-        (Sample(), "none", "default", "default"),
-        (Sample(), "missing", "default", "default"),
-        (None, "anything", "fallback", "fallback"),
-    ])
+    @pytest.mark.parametrize(
+        "obj,attr,default,expected",
+        [
+            (Sample(), "exists", "default", "value"),
+            (Sample(), "none", "default", "default"),
+            (Sample(), "missing", "default", "default"),
+            (None, "anything", "fallback", "fallback"),
+        ],
+    )
    def test_getattr_or(self, obj, attr, default, expected):
        # Test gets attribute or returns a default value
        assert getattr_or(obj, attr, default) == expected

+
 class TestDateTimeHandling:
    def test_datetime_encoder(self, sample_datetime):
        result = json.dumps({"dt": sample_datetime}, cls=DateTimeEncoder)
@@ -83,11 +89,14 @@ class TestDateTimeHandling:
        result = dump_payload(payload)
        assert str(sample_datetime) in result

-    @pytest.mark.parametrize("dt_str,fmt,expected", [
-        ("2023-01-01 12:00:00+00:00", None, datetime(2023, 1, 1, 12, 0, tzinfo=timezone.utc)),
-        ("20230101 120000", "%Y%m%d %H%M%S", datetime(2023, 1, 1, 12, 0)),
-        ("invalid", None, None),
-    ])
+    @pytest.mark.parametrize(
+        "dt_str,fmt,expected",
+        [
+            ("2023-01-01 12:00:00+00:00", None, datetime(2023, 1, 1, 12, 0, tzinfo=timezone.utc)),
+            ("20230101 120000", "%Y%m%d %H%M%S", datetime(2023, 1, 1, 12, 0)),
+            ("invalid", None, None),
+        ],
+    )
    def test_datetime_from_string(self, dt_str, fmt, expected):
        result = get_datetime_from_str(dt_str, fmt)
        if expected is None:
@@ -95,16 +104,21 @@ class TestDateTimeHandling:
        else:
            assert result == expected.replace(tzinfo=result.tzinfo)

+
 class TestDictUtils:
-    @pytest.mark.parametrize("original,update,expected", [
-        ({"a": 1}, {"b": 2}, {"a": 1, "b": 2}),
-        ({"nested": {"a": 1}}, {"nested": {"b": 2}}, {"nested": {"a": 1, "b": 2}}),
-        ({"a": {"b": {"c": 1}}}, {"a": {"b": {"c": 2}}}, {"a": {"b": {"c": 2}}}),
-    ])
+    @pytest.mark.parametrize(
+        "original,update,expected",
+        [
+            ({"a": 1}, {"b": 2}, {"a": 1, "b": 2}),
+            ({"nested": {"a": 1}}, {"nested": {"b": 2}}, {"nested": {"a": 1, "b": 2}}),
+            ({"a": {"b": {"c": 1}}}, {"a": {"b": {"c": 2}}}, {"a": {"b": {"c": 2}}}),
+        ],
+    )
    def test_update_nested_dict(self, original, update, expected):
        update_nested_dict(original, update)
        assert original == expected

+
 class TestHashingUtils:
    def test_file_hashing(self, sample_file):
        expected = hashlib.sha256(b"test content").hexdigest()
@@ -118,6 +132,7 @@ class TestHashingUtils:
        expected = hashlib.sha256(content).hexdigest()
        assert calculate_file_hash(str(file_path)) == expected

+
 class TestMiscUtils:
    def test_random_str_length(self):
        for length in [8, 16, 32]:
@@ -131,14 +146,17 @@ class TestMiscUtils:
    def test_random_str_uniqueness(self):
        assert random_str() != random_str()

-    @pytest.mark.parametrize("ts_input,utc,iso,expected_type", [
-        (datetime.now(), True, True, str),
-        ("2023-01-01T12:00:00+00:00", False, False, datetime),
-        (1672574400, True, True, str),
-    ])
+    @pytest.mark.parametrize(
+        "ts_input,utc,iso,expected_type",
+        [
+            (datetime.now(), True, True, str),
+            ("2023-01-01T12:00:00+00:00", False, False, datetime),
+            (1672574400, True, True, str),
+        ],
+    )
    def test_timestamp_parsing(self, ts_input, utc, iso, expected_type):
        result = get_timestamp(ts_input, utc=utc, iso=iso)
        assert isinstance(result, expected_type)

    def test_invalid_timestamp_returns_none(self):
-        assert get_timestamp("invalid-date") is None
+        assert get_timestamp("invalid-date") is None
--- a/tests/utils/test_urls.py
+++ b/tests/utils/test_urls.py
@@ -0,0 +1,113 @@
+import pytest
+from auto_archiver.utils.url import (
+    is_auth_wall,
+    check_url_or_raise,
+    domain_for_url,
+    is_relevant_url,
+    remove_get_parameters,
+    twitter_best_quality_url,
+)
+
+
+@pytest.mark.parametrize(
+    "url, is_auth",
+    [
+        ("https://example.com", False),
+        ("https://t.me/c/abc/123", True),
+        ("https://t.me/not-private/", False),
+        ("https://instagram.com", True),
+        ("https://www.instagram.com", True),
+        ("https://www.instagram.com/p/INVALID", True),
+        ("https://www.instagram.com/p/C4QgLbrIKXG/", True),
+    ],
+)
+def test_is_auth_wall(url, is_auth):
+    assert is_auth_wall(url) == is_auth
+
+
+@pytest.mark.parametrize(
+    "url, raises",
+    [
+        ("http://example.com", False),
+        ("https://example.com", False),
+        ("ftp://example.com", True),
+        ("http://localhost", True),
+        ("http://", True),
+    ],
+)
+def test_check_url_or_raise(url, raises):
+    if not raises:  # accepted URLs: the check must return a truthy value
+        assert check_url_or_raise(url)
+        return
+    with pytest.raises(ValueError):  # rejected URLs: the check must raise
+        check_url_or_raise(url)
+
+
+@pytest.mark.parametrize(
+    "url, domain",
+    [
+        ("https://example.com", "example.com"),
+        ("https://www.example.com", "www.example.com"),
+        ("https://www.example.com/path", "www.example.com"),
+        ("https://", ""),
+        ("http://localhost", "localhost"),
+    ],
+)
+def test_domain_for_url(url, domain):
+    assert domain_for_url(url) == domain
+
+
+@pytest.mark.parametrize(
+    "url, without_get",
+    [
+        ("https://example.com", "https://example.com"),
+        ("https://example.com?utm_source=example", "https://example.com"),
+        ("https://example.com?utm_source=example&other=1", "https://example.com"),
+        ("https://example.com/something", "https://example.com/something"),
+        ("https://example.com/something?utm_source=example", "https://example.com/something"),
+    ],
+)
+def test_remove_get_parameters(url, without_get):
+    assert remove_get_parameters(url) == without_get
+
+
+@pytest.mark.parametrize(
+    "url, relevant",
+    [
+        ("https://example.com", True),
+        ("https://example.com/favicon.ico", False),
+        ("https://twimg.com/profile_images", False),
+        ("https://twimg.com/something/default_profile_images", False),
+        ("https://scontent.cdninstagram.com/username/150x150.jpg", False),
+        ("https://static.cdninstagram.com/rsrc.php/", False),
+        ("https://telegram.org/img/emoji/", False),
+        ("https://www.youtube.com/s/gaming/emoji/", False),
+        ("https://yt3.ggpht.com/default-user=", False),
+        ("https://www.youtube.com/s/search/audio/", False),
+        ("https://ok.ru/res/i/", False),
+        ("https://vk.com/emoji/", False),
+        ("https://vk.com/images/", False),
+        ("https://vk.com/images/reaction/", False),
+        ("https://wikipedia.org/static", False),
+        ("https://example.com/file.svg", False),
+        ("https://example.com/file.ico", False),
+        ("https://example.com/file.mp4", True),
+        ("https://example.com/150x150.jpg", True),
+        ("https://example.com/rsrc.php/", True),
+        ("https://example.com/img/emoji/", True),
+    ],
+)
+def test_is_relevant_url(url, relevant):
+    assert is_relevant_url(url) == relevant
+
+
+@pytest.mark.parametrize(
+    "url, best_quality",
+    [
+        ("https://twitter.com/some_image.jpg?name=small", "https://twitter.com/some_image.jpg?name=orig"),
+        ("https://twitter.com/some_image.jpg", "https://twitter.com/some_image.jpg"),
+        ("https://twitter.com/some_image.jpg?name=orig", "https://twitter.com/some_image.jpg?name=orig"),
+    ],
+)
+def test_twitter_best_quality_url(url, best_quality):
+    assert twitter_best_quality_url(url) == best_quality