Improves the antibot flow and makes auth_wall detection optional

This commit is contained in:
msramalho
2025-06-10 16:29:07 +01:00
parent 809b8c7749
commit 6bbc7fb47a
3 changed files with 64 additions and 31 deletions

View File

@@ -129,15 +129,33 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
),
(
"https://seleniumbase.io/apps/turnstile",
'id="captcha-success"',
'<img id="captcha-success" src="https://seleniumbase.io/cdn/img/green_check.png" style="" width="180">',
),
(
"https://seleniumbase.io/apps/form_turnstile",
'<img id="captcha-success" src="https://seleniumbase.io/cdn/img/green_check.png" width="120" style="">',
),
(
"https://gitlab.com/users/sign_in",
"Password",
),
],
)
def test_download_with_cloudflare_turnstile(self, setup_module, make_item, url, in_html):
def test_overcome_cloudflare_turnstile(self, setup_module, make_item, url, in_html):
"""
Test downloading a page with Cloudflare Turnstile captcha.
"""
self.extractor = setup_module(
self.extractor_module,
{
"save_to_pdf": True,
"detect_auth_wall": False,
"max_download_images": 5,
"max_download_videos": "inf",
},
)
item = make_item(url)
self.extractor.enrich(item)