diff --git a/docs/source/flow_overview.md b/docs/source/flow_overview.md index 5ffa3a8..ff1409a 100644 --- a/docs/source/flow_overview.md +++ b/docs/source/flow_overview.md @@ -8,7 +8,7 @@ The archiver archives web pages using the following workflow 4. **Formatter** creates a report from all the archived content (HTML, PDF, ...) 5. **Database** knows what's been archived and also stores the archive result (spreadsheet, CSV, or just the console) -Each step in the workflow is handled by 'modules' that interact with the data in different ways. For example, the Twitter Extractor Module would extract information from the Twitter website. The Screenshot Enricher Module will take screenshots of the given page. See the [core modules page](core_modules.md) for an overview of all the modules that are available. +Each step in the workflow is handled by 'modules' that interact with the data in different ways. For example, the Twitter Extractor Module would extract information from the Twitter website. The AntiBot Module will download HTML and take screenshots of the given page. See the [core modules page](core_modules.md) for an overview of all the modules that are available. Auto-archiver must have at least one module defined for each step of the workflow. This is done by setting the [configuration](installation/configurations.md) for your auto-archiver instance. diff --git a/src/auto_archiver/modules/pdq_hash_enricher/__manifest__.py b/src/auto_archiver/modules/pdq_hash_enricher/__manifest__.py index 9c7a5c8..d02dfb7 100644 --- a/src/auto_archiver/modules/pdq_hash_enricher/__manifest__.py +++ b/src/auto_archiver/modules/pdq_hash_enricher/__manifest__.py @@ -15,7 +15,7 @@ - Skips non-image media or files unsuitable for hashing (e.g., corrupted or unsupported formats). ### Notes - - Best used after enrichers like `thumbnail_enricher` or `screenshot_enricher` to ensure images are available. + - Best used after enrichers like `thumbnail_enricher` or `antibot_extractor_enricher` (takes screenshots) to ensure images are available. - Uses the `pdqhash` library to compute 256-bit perceptual hashes, which are stored as hexadecimal strings. """, } diff --git a/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py b/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py index c7d4a47..19b9c59 100644 --- a/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py +++ b/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py @@ -6,7 +6,7 @@ objects and calculates perceptual hashes using the PDQ hashing algorithm. These hashes are designed specifically for images and can be used for detecting duplicate or near-duplicate visual content. -This enricher is typically used after thumbnail or screenshot enrichers +This enricher is typically used after thumbnail or screenshot (antibot) enrichers to ensure images are available for hashing. """ diff --git a/tests/enrichers/test_screenshot_enricher.py b/tests/enrichers/test_screenshot_enricher.py deleted file mode 100644 index ec56345..0000000 --- a/tests/enrichers/test_screenshot_enricher.py +++ /dev/null @@ -1,216 +0,0 @@ -import base64 - -import pytest -from selenium.common.exceptions import TimeoutException - -from auto_archiver.core import Metadata, Media -from auto_archiver.modules.screenshot_enricher import ScreenshotEnricher - - -@pytest.fixture -def mock_selenium_env(mocker): - """Patches Selenium calls and driver checks in one place.""" - - # Patch external dependencies - mock_which = mocker.patch("shutil.which") - mock_driver_class = mocker.patch("auto_archiver.utils.webdriver.CookieSettingDriver") - mock_binary_paths = mocker.patch("selenium.webdriver.common.selenium_manager.SeleniumManager.binary_paths") - mocker.patch("pathlib.Path.is_file", return_value=True) - mock_popen = mocker.patch("subprocess.Popen") - mocker.patch("selenium.webdriver.common.service.Service.is_connectable", return_value=True) - mock_firefox_options = mocker.patch("selenium.webdriver.FirefoxOptions") - - # Define side effect for `shutil.which` - def mock_which_side_effect(dep): - return "/mock/geckodriver" if dep == "geckodriver" else None - - mock_which.side_effect = mock_which_side_effect - - # Mock binary paths - mock_binary_paths.return_value = { - "driver_path": "/mock/driver", - "browser_path": "/mock/browser", - } - # Mock `subprocess.Popen` - mock_proc = mocker.MagicMock() - mock_proc.poll.return_value = None - mock_popen.return_value = mock_proc - # Mock `CookieSettingDriver` - mock_driver = mocker.MagicMock() - mock_driver_class.return_value = mock_driver - # Mock `FirefoxOptions` - mock_options_instance = mocker.MagicMock() - mock_firefox_options.return_value = mock_options_instance - yield mock_driver, mock_driver_class, mock_options_instance - - -@pytest.fixture -def common_patches(tmp_path, mocker): - """Patches common utilities used across multiple tests.""" - mocker.patch("auto_archiver.utils.url.is_auth_wall", return_value=False) - mocker.patch("os.path.join", return_value=str(tmp_path / "test.png")) - mocker.patch("time.sleep") - yield - - -@pytest.fixture -def screenshot_enricher(setup_module, mock_binary_dependencies) -> ScreenshotEnricher: - configs: dict = { - "width": 1280, - "height": 720, - "timeout": 60, - "sleep_before_screenshot": 4, - "http_proxy": "", - "save_to_pdf": "False", - "print_options": {}, - } - return setup_module("screenshot_enricher", configs) - - -@pytest.fixture -def metadata_with_video(): - m = Metadata() - m.set_url("https://example.com") - m.add_media(Media(filename="video.mp4").set("id", "video1")) - return m - - -def test_enrich_adds_screenshot( - screenshot_enricher, - metadata_with_video, - mock_selenium_env, - common_patches, - tmp_path, -): - mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env - screenshot_enricher.enrich(metadata_with_video) - mock_driver_class.assert_called_once_with( - cookie=None, - cookie_jar=None, - facebook_accept_cookies=False, - options=mock_options_instance, - ) - # Verify the actual calls on the returned mock_driver - mock_driver.get.assert_called_once_with("https://example.com") - mock_driver.save_screenshot.assert_called_once_with(str(tmp_path / "test.png")) - # Check that the media was added (2 = original video + screenshot) - assert len(metadata_with_video.media) == 2 - assert metadata_with_video.media[1].properties.get("id") == "screenshot" - - -@pytest.mark.parametrize( - "url,is_auth", - [ - ("https://example.com", False), - ("https://private.com", True), - ], -) -def test_enrich_auth_wall( - screenshot_enricher, metadata_with_video, mock_selenium_env, common_patches, url, is_auth, mocker -): - # Testing with and without is_auth_wall - mock_driver, mock_driver_class, _ = mock_selenium_env - mocker.patch("auto_archiver.utils.url.is_auth_wall", return_value=is_auth) - metadata_with_video.set_url(url) - screenshot_enricher.enrich(metadata_with_video) - - if is_auth: - mock_driver.get.assert_not_called() - assert len(metadata_with_video.media) == 1 - assert metadata_with_video.media[0].properties.get("id") == "video1" - else: - mock_driver.get.assert_called_once_with(url) - assert len(metadata_with_video.media) == 2 - assert metadata_with_video.media[1].properties.get("id") == "screenshot" - - -def test_skip_authwall_no_cookies(screenshot_enricher, caplog): - with caplog.at_level("WARNING"): - screenshot_enricher.enrich(Metadata().set_url("https://instagram.com")) - assert "[SKIP] SCREENSHOT since url" in caplog.text - - -@pytest.mark.parametrize( - "auth", - [ - {"cookie": "cookie"}, - {"cookies_jar": "cookie"}, - ], -) -def test_dont_skip_authwall_with_cookies(screenshot_enricher, caplog, mocker, mock_selenium_env, auth): - mocker.patch("auto_archiver.utils.url.is_auth_wall", return_value=True) - - # patch the authentication dict: - screenshot_enricher.authentication = {"example.com": auth} - with caplog.at_level("WARNING"): - screenshot_enricher.enrich(Metadata().set_url("https://example.com")) - assert "[SKIP] SCREENSHOT since url" not in caplog.text - - -def test_show_warning_wrong_auth_type(screenshot_enricher, caplog, mocker, mock_selenium_env): - mock_driver, mock_driver_class, _ = mock_selenium_env - mocker.patch("auto_archiver.utils.url.is_auth_wall", return_value=True) - screenshot_enricher.authentication = {"example.com": {"username": "user", "password": "pass"}} - with caplog.at_level("WARNING"): - screenshot_enricher.enrich(Metadata().set_url("https://example.com")) - assert "Screenshot enricher only supports cookie-type authentication" in caplog.text - - -def test_handle_timeout_exception(screenshot_enricher, metadata_with_video, mock_selenium_env, mocker): - mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env - - mock_driver.get.side_effect = TimeoutException - mock_log = mocker.patch("loguru.logger.info") - screenshot_enricher.enrich(metadata_with_video) - mock_log.assert_called_once_with("TimeoutException loading page for screenshot") - assert len(metadata_with_video.media) == 1 - - -def test_handle_general_exception(screenshot_enricher, metadata_with_video, mock_selenium_env, mocker): - """Test proper handling of unexpected general exceptions""" - mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env - # Simulate a generic exception when save_screenshot is called - mock_driver.get.return_value = None - mock_driver.save_screenshot.side_effect = Exception("Unexpected Error") - - mock_log = mocker.patch("loguru.logger.error") - screenshot_enricher.enrich(metadata_with_video) - # Verify that the exception was logged with the log - mock_log.assert_called_once_with("Got error while loading webdriver for screenshot enricher: Unexpected Error") - # And no new media was added due to the error - assert len(metadata_with_video.media) == 1 - - -def test_pdf_creation(mocker, screenshot_enricher, metadata_with_video, mock_selenium_env): - """Test PDF creation when save_to_pdf is enabled""" - mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env - # Override the save_to_pdf option - screenshot_enricher.save_to_pdf = True - # Mock the print_page method to return base64-encoded content - mock_driver.print_page.return_value = base64.b64encode(b"fake_pdf_content").decode("utf-8") - # Patch functions with mocker - mocker.patch("os.path.join", side_effect=lambda *args: f"{args[-1]}") - mocker.patch( - "auto_archiver.modules.screenshot_enricher.screenshot_enricher.random_str", - return_value="fixed123", - ) - mock_open = mocker.patch("builtins.open", new_callable=mocker.mock_open) - - screenshot_enricher.enrich(metadata_with_video) - # Verify screenshot and PDF creation - mock_driver.save_screenshot.assert_called_once() - mock_driver.print_page.assert_called_once_with(mock_driver.print_options) - # Check that PDF file was opened and written - mock_open.assert_any_call("pdf_fixed123.pdf", "wb") - - # Ensure both screenshot and PDF were added as media - assert len(metadata_with_video.media) == 3 - assert metadata_with_video.media[1].properties.get("id") == "screenshot" - assert metadata_with_video.media[2].properties.get("id") == "pdf" - - -@pytest.fixture(autouse=True) -def cleanup_files(tmp_path): - yield - for file in tmp_path.iterdir(): - file.unlink()