Screenshot enricher: no longer skip auth-walled URLs when cookie-type auth is
configured for the site; rename CookieSettingDriver's cookies/cookiejar
arguments and attributes to cookie/cookie_jar.

Review notes (free text ahead of the first file header):
  * Line breaks and indentation were reconstructed from a whitespace-collapsed
    copy of this patch; check the context lines against the target tree
    before applying.
  * CookieSettingDriver.get still read the renamed attribute self.cookies,
    unpacked cookie.split("=") incorrectly, and called self.driver, which the
    class never assigns in the visible code. All three are fixed below.
  * The "index" blob hashes were not recomputed for the amended "+" lines.

diff --git a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py
index 491bd51..4e01357 100644
--- a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py
+++ b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py
@@ -19,12 +19,24 @@ class ScreenshotEnricher(Enricher):
     def enrich(self, to_enrich: Metadata) -> None:
         url = to_enrich.get_url()
 
-        if UrlUtil.is_auth_wall(url):
-            logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}")
-            return
-
         logger.debug(f"Enriching screenshot for {url=}")
         auth = self.auth_for_site(url)
+
+        # screenshot enricher only supports cookie-type auth (selenium)
+        has_valid_auth = auth and (auth.get("cookies") or auth.get("cookies_jar") or auth.get("cookie"))
+
+        if UrlUtil.is_auth_wall(url) and not has_valid_auth:
+            logger.warning(f"[SKIP] SCREENSHOT since url is behind AUTH WALL and no login details provided: {url=}")
+            # guard on `auth` first: the check above already treats it as possibly empty
+            if auth and any(auth.get(key) for key in ["username", "password", "api_key", "api_secret"]):
+                logger.warning(
+                    # adjacent literals rather than a backslash continuation, which would
+                    # embed the next source line's indentation in the logged message
+                    f"Screenshot enricher only supports cookie-type authentication, you have provided {auth.keys()} which are not supported. "
+                    "Consider adding 'cookie', 'cookies_file' or 'cookies_from_browser' to your auth for this site."
+                )
+            return
+
         with self.webdriver_factory(
             self.width,
             self.height,
diff --git a/src/auto_archiver/utils/webdriver.py b/src/auto_archiver/utils/webdriver.py
index 43e7817..0b5fa57 100644
--- a/src/auto_archiver/utils/webdriver.py
+++ b/src/auto_archiver/utils/webdriver.py
@@ -22,35 +22,37 @@ from loguru import logger
 
 class CookieSettingDriver(webdriver.Firefox):
     facebook_accept_cookies: bool
-    cookies: str
-    cookiejar: MozillaCookieJar
+    cookie: str
+    cookie_jar: MozillaCookieJar
 
-    def __init__(self, cookies, cookiejar, facebook_accept_cookies, *args, **kwargs):
+    def __init__(self, cookie, cookie_jar, facebook_accept_cookies, *args, **kwargs):
         if os.environ.get("RUNNING_IN_DOCKER"):
             # Selenium doesn't support linux-aarch64 driver, we need to set this manually
             kwargs["service"] = webdriver.FirefoxService(executable_path="/usr/local/bin/geckodriver")
 
         super(CookieSettingDriver, self).__init__(*args, **kwargs)
-        self.cookies = cookies
-        self.cookiejar = cookiejar
+        self.cookie = cookie
+        self.cookie_jar = cookie_jar
         self.facebook_accept_cookies = facebook_accept_cookies
 
     def get(self, url: str):
-        if self.cookies or self.cookiejar:
+        if self.cookie_jar or self.cookie:
             # set up the driver to make it not 'cookie averse' (needs a context/URL)
             # get the 'robots.txt' file which should be quick and easy
             robots_url = urlunparse(urlparse(url)._replace(path="/robots.txt", query="", fragment=""))
             super(CookieSettingDriver, self).get(robots_url)
 
-            if self.cookies:
+            if self.cookie:
                 # an explicit cookie is set for this site, use that first
-                for cookie in self.cookies.split(";"):
-                    for name, value in cookie.split("="):
-                        self.driver.add_cookie({"name": name, "value": value})
-            elif self.cookiejar:
+                for pair in self.cookie.split(";"):
+                    # split on the first '=' only: cookie values may themselves contain '='
+                    name, _, value = pair.strip().partition("=")
+                    if name:  # ignore empty segments, e.g. after a trailing ';'
+                        self.add_cookie({"name": name, "value": value})
+            elif self.cookie_jar:
                 domain = urlparse(url).netloc
                 regex = re.compile(f"(www)?.?{domain}$")
-                for cookie in self.cookiejar:
+                for cookie in self.cookie_jar:
                     if regex.match(cookie.domain):
                         try:
                             self.add_cookie(
@@ -145,8 +147,8 @@ class Webdriver:
 
         try:
             self.driver = CookieSettingDriver(
-                cookies=self.auth.get("cookies"),
-                cookiejar=self.auth.get("cookies_jar"),
+                cookie=self.auth.get("cookie"),
+                cookie_jar=self.auth.get("cookies_jar"),
                 facebook_accept_cookies=self.facebook_accept_cookies,
                 options=options,
             )
diff --git a/tests/enrichers/test_screenshot_enricher.py b/tests/enrichers/test_screenshot_enricher.py
index b86bb17..ec56345 100644
--- a/tests/enrichers/test_screenshot_enricher.py
+++ b/tests/enrichers/test_screenshot_enricher.py
@@ -85,8 +85,8 @@ def test_enrich_adds_screenshot(
     mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env
     screenshot_enricher.enrich(metadata_with_video)
     mock_driver_class.assert_called_once_with(
-        cookies=None,
-        cookiejar=None,
+        cookie=None,
+        cookie_jar=None,
         facebook_accept_cookies=False,
         options=mock_options_instance,
     )
@@ -124,6 +124,38 @@ def test_enrich_auth_wall(
     assert metadata_with_video.media[1].properties.get("id") == "screenshot"
 
 
+def test_skip_authwall_no_cookies(screenshot_enricher, caplog):
+    with caplog.at_level("WARNING"):
+        screenshot_enricher.enrich(Metadata().set_url("https://instagram.com"))
+        assert "[SKIP] SCREENSHOT since url" in caplog.text
+
+
+@pytest.mark.parametrize(
+    "auth",
+    [
+        {"cookie": "cookie"},
+        {"cookies_jar": "cookie"},
+    ],
+)
+def test_dont_skip_authwall_with_cookies(screenshot_enricher, caplog, mocker, mock_selenium_env, auth):
+    mocker.patch("auto_archiver.utils.url.is_auth_wall", return_value=True)
+
+    # patch the authentication dict:
+    screenshot_enricher.authentication = {"example.com": auth}
+    with caplog.at_level("WARNING"):
+        screenshot_enricher.enrich(Metadata().set_url("https://example.com"))
+        assert "[SKIP] SCREENSHOT since url" not in caplog.text
+
+
+def test_show_warning_wrong_auth_type(screenshot_enricher, caplog, mocker, mock_selenium_env):
+    mock_driver, mock_driver_class, _ = mock_selenium_env
+    mocker.patch("auto_archiver.utils.url.is_auth_wall", return_value=True)
+    screenshot_enricher.authentication = {"example.com": {"username": "user", "password": "pass"}}
+    with caplog.at_level("WARNING"):
+        screenshot_enricher.enrich(Metadata().set_url("https://example.com"))
+        assert "Screenshot enricher only supports cookie-type authentication" in caplog.text
+
+
 def test_handle_timeout_exception(screenshot_enricher, metadata_with_video, mock_selenium_env, mocker):
     mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env
 