Provide better logs for screenshot enricher when auth is/isn't supported (cookies only)

This commit is contained in:
Patrick Robertson
2025-03-21 12:05:47 +04:00
parent 5b131996c6
commit 14c56f4916
3 changed files with 58 additions and 17 deletions

View File

@@ -19,12 +19,21 @@ class ScreenshotEnricher(Enricher):
def enrich(self, to_enrich: Metadata) -> None:
url = to_enrich.get_url()
if UrlUtil.is_auth_wall(url):
logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}")
return
logger.debug(f"Enriching screenshot for {url=}")
auth = self.auth_for_site(url)
# screenshot enricher only supports cookie-type auth (selenium)
has_valid_auth = auth and (auth.get("cookies") or auth.get("cookies_jar") or auth.get("cookie"))
if UrlUtil.is_auth_wall(url) and not has_valid_auth:
logger.warning(f"[SKIP] SCREENSHOT since url is behind AUTH WALL and no login details provided: {url=}")
if any(auth.get(key) for key in ["username", "password", "api_key", "api_secret"]):
logger.warning(
f"Screenshot enricher only supports cookie-type authentication, you have provided {auth.keys()} which are not supported.\
Consider adding 'cookie', 'cookies_file' or 'cookies_from_browser' to your auth for this site."
)
return
with self.webdriver_factory(
self.width,
self.height,

View File

@@ -22,35 +22,35 @@ from loguru import logger
class CookieSettingDriver(webdriver.Firefox):
facebook_accept_cookies: bool
cookies: str
cookiejar: MozillaCookieJar
cookie: str
cookie_jar: MozillaCookieJar
def __init__(self, cookies, cookiejar, facebook_accept_cookies, *args, **kwargs):
def __init__(self, cookie, cookie_jar, facebook_accept_cookies, *args, **kwargs):
if os.environ.get("RUNNING_IN_DOCKER"):
# Selenium doesn't support linux-aarch64 driver, we need to set this manually
kwargs["service"] = webdriver.FirefoxService(executable_path="/usr/local/bin/geckodriver")
super(CookieSettingDriver, self).__init__(*args, **kwargs)
self.cookies = cookies
self.cookiejar = cookiejar
self.cookie = cookie
self.cookie_jar = cookie_jar
self.facebook_accept_cookies = facebook_accept_cookies
def get(self, url: str):
if self.cookies or self.cookiejar:
if self.cookie_jar or self.cookie:
# set up the driver to make it not 'cookie averse' (needs a context/URL)
# get the 'robots.txt' file which should be quick and easy
robots_url = urlunparse(urlparse(url)._replace(path="/robots.txt", query="", fragment=""))
super(CookieSettingDriver, self).get(robots_url)
if self.cookies:
if self.cookie:
# an explicit cookie is set for this site, use that first
for cookie in self.cookies.split(";"):
for name, value in cookie.split("="):
self.driver.add_cookie({"name": name, "value": value})
elif self.cookiejar:
elif self.cookie_jar:
domain = urlparse(url).netloc
regex = re.compile(f"(www)?.?{domain}$")
for cookie in self.cookiejar:
for cookie in self.cookie_jar:
if regex.match(cookie.domain):
try:
self.add_cookie(
@@ -145,8 +145,8 @@ class Webdriver:
try:
self.driver = CookieSettingDriver(
cookies=self.auth.get("cookies"),
cookiejar=self.auth.get("cookies_jar"),
cookie=self.auth.get("cookie"),
cookie_jar=self.auth.get("cookies_jar"),
facebook_accept_cookies=self.facebook_accept_cookies,
options=options,
)

View File

@@ -85,8 +85,8 @@ def test_enrich_adds_screenshot(
mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env
screenshot_enricher.enrich(metadata_with_video)
mock_driver_class.assert_called_once_with(
cookies=None,
cookiejar=None,
cookie=None,
cookie_jar=None,
facebook_accept_cookies=False,
options=mock_options_instance,
)
@@ -124,6 +124,38 @@ def test_enrich_auth_wall(
assert metadata_with_video.media[1].properties.get("id") == "screenshot"
def test_skip_authwall_no_cookies(screenshot_enricher, caplog):
with caplog.at_level("WARNING"):
screenshot_enricher.enrich(Metadata().set_url("https://instagram.com"))
assert "[SKIP] SCREENSHOT since url" in caplog.text
@pytest.mark.parametrize(
"auth",
[
{"cookie": "cookie"},
{"cookies_jar": "cookie"},
],
)
def test_dont_skip_authwall_with_cookies(screenshot_enricher, caplog, mocker, mock_selenium_env, auth):
mocker.patch("auto_archiver.utils.url.is_auth_wall", return_value=True)
# patch the authentication dict:
screenshot_enricher.authentication = {"example.com": auth}
with caplog.at_level("WARNING"):
screenshot_enricher.enrich(Metadata().set_url("https://example.com"))
assert "[SKIP] SCREENSHOT since url" not in caplog.text
def test_show_warning_wrong_auth_type(screenshot_enricher, caplog, mocker, mock_selenium_env):
mock_driver, mock_driver_class, _ = mock_selenium_env
mocker.patch("auto_archiver.utils.url.is_auth_wall", return_value=True)
screenshot_enricher.authentication = {"example.com": {"username": "user", "password": "pass"}}
with caplog.at_level("WARNING"):
screenshot_enricher.enrich(Metadata().set_url("https://example.com"))
assert "Screenshot enricher only supports cookie-type authentication" in caplog.text
def test_handle_timeout_exception(screenshot_enricher, metadata_with_video, mock_selenium_env, mocker):
mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env