improves antibot flow and makes auth_wall detection optional

This commit is contained in:
msramalho
2025-06-10 16:29:07 +01:00
parent 809b8c7749
commit 6bbc7fb47a
3 changed files with 64 additions and 31 deletions

View File

@@ -25,6 +25,11 @@
"default": "secrets/antibot_user_data", "default": "secrets/antibot_user_data",
"help": "Path to the user data directory for the webdriver. This is used to persist browser state, such as cookies and local storage. When using docker it's best to let docker create the folder otherwise there may be permission issues. The Extractor will try to work without it if that error occurs but login sessions will not be used or preserved on those runs.", "help": "Path to the user data directory for the webdriver. This is used to persist browser state, such as cookies and local storage. When using docker it's best to let docker create the folder otherwise there may be permission issues. The Extractor will try to work without it if that error occurs but login sessions will not be used or preserved on those runs.",
}, },
"detect_auth_wall": {
"default": True,
"type": "bool",
"help": "detect if the page is behind an authentication wall (e.g. login required) and skip it. disable if you want to archive pages where logins are required.",
},
"proxy": { "proxy": {
"default": None, "default": None,
"help": "proxy to use for the webdriver, Format: 'SERVER:PORT' or 'USER:PASS@SERVER:PORT'", "help": "proxy to use for the webdriver, Format: 'SERVER:PORT' or 'USER:PASS@SERVER:PORT'",

View File

@@ -15,7 +15,9 @@ from seleniumbase import SB
from auto_archiver.core import Extractor, Enricher, Metadata, Media from auto_archiver.core import Extractor, Enricher, Metadata, Media
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
from auto_archiver.modules.antibot_extractor_enricher.dropins.default import DefaultDropin
from auto_archiver.utils.misc import random_str from auto_archiver.utils.misc import random_str
from auto_archiver.utils.url import is_relevant_url
class AntibotExtractorEnricher(Extractor, Enricher): class AntibotExtractorEnricher(Extractor, Enricher):
@@ -102,39 +104,41 @@ class AntibotExtractorEnricher(Extractor, Enricher):
sb.uc_open_with_reconnect(url, 4) sb.uc_open_with_reconnect(url, 4)
logger.debug(f"ANTIBOT handling CAPTCHAs for {url_sample}...") logger.debug(f"ANTIBOT handling CAPTCHAs for {url_sample}...")
sb.uc_gui_handle_cf()
sb.uc_gui_click_rc() # NB: using handle instead of click breaks some sites like reddit, for now we separate here but can have dropins deciding this in the future
# TODO: implement other Captcha handling dropin = self._get_suitable_dropin(url, sb)
sb.uc_gui_handle_captcha() # handles Cloudflare Turnstile captcha if detected dropin.open_page(url)
suitable_dropin = self._get_suitable_dropin(url, sb) if self.detect_auth_wall and self._hit_auth_wall(sb):
if suitable_dropin:
suitable_dropin.open_page(url)
if self._hit_auth_wall(sb):
logger.warning(f"ANTIBOT SKIP since auth wall or CAPTCHA was detected for {url_sample}") logger.warning(f"ANTIBOT SKIP since auth wall or CAPTCHA was detected for {url_sample}")
return False return False
logger.debug(f"ANTIBOT no auth wall detected for {url_sample}...")
sb.wait_for_ready_state_complete() sb.wait_for_ready_state_complete()
sb.sleep(1) # margin for the page to load completely sb.sleep(1) # margin for the page to load completely
to_enrich.set_title(sb.get_title()) to_enrich.set_title(sb.get_title())
self._enrich_html_source_code(sb, to_enrich) self._enrich_html_source_code(sb, to_enrich)
self._enrich_full_page_screenshot(sb, to_enrich) self._enrich_full_page_screenshot(sb, to_enrich)
if self.save_to_pdf: if self.save_to_pdf:
self._enrich_full_page_pdf(sb, to_enrich) self._enrich_full_page_pdf(sb, to_enrich)
downloaded_images, downloaded_videos = 0, 0 downloaded_images, downloaded_videos = dropin.add_extra_media(to_enrich)
if suitable_dropin:
downloaded_images, downloaded_videos = suitable_dropin.add_extra_media(to_enrich)
self._enrich_download_media( self._enrich_download_media(
sb, to_enrich, css_selector="img", max_media=self.max_download_images - downloaded_images sb,
to_enrich,
css_selector=dropin.images_selectors(),
max_media=self.max_download_images - downloaded_images,
) )
self._enrich_download_media( self._enrich_download_media(
sb, to_enrich, css_selector="video, source", max_media=self.max_download_videos - downloaded_videos sb,
to_enrich,
css_selector=dropin.video_selectors(),
max_media=self.max_download_videos - downloaded_videos,
) )
logger.success(f"ANTIBOT completed for {url_sample}") logger.info(f"ANTIBOT completed for {url_sample}")
return to_enrich return to_enrich
except selenium.common.exceptions.SessionNotCreatedException as e: except selenium.common.exceptions.SessionNotCreatedException as e:
@@ -155,10 +159,10 @@ class AntibotExtractorEnricher(Extractor, Enricher):
""" """
for dropin in self.dropins: for dropin in self.dropins:
if dropin.suitable(url): if dropin.suitable(url):
logger.debug(f"ANTIBOT using drop-in {dropin.__class__.__name__} for {url}") logger.debug(f"ANTIBOT using drop-in {dropin.__name__} for {url}")
return dropin(sb, self) return dropin(sb, self)
# logger.warning(f"ANTIBOT no suitable drop-in found for {url}")
return None return DefaultDropin(sb, self)
def _hit_auth_wall(self, sb: SB) -> bool: def _hit_auth_wall(self, sb: SB) -> bool:
""" """
@@ -168,8 +172,8 @@ class AntibotExtractorEnricher(Extractor, Enricher):
# TODO: improve this detection logic, currently it is very basic and may not cover all cases # TODO: improve this detection logic, currently it is very basic and may not cover all cases
# Common URL patterns # Common URL patterns
url = sb.get_current_url().lower() current_url = sb.get_current_url().lower()
if any(kw in url for kw in ["login", "signin", "signup", "register", "captcha"]): if any(kw in current_url for kw in ["login", "signin", "signup", "register", "captcha"]):
return True return True
# Common visible text markers # Common visible text markers
@@ -245,8 +249,12 @@ class AntibotExtractorEnricher(Extractor, Enricher):
Enriches the full page screenshot of the Metadata object. Enriches the full page screenshot of the Metadata object.
This method is called by the enrich method. This method is called by the enrich method.
""" """
x = sb.execute_script("return document.documentElement.scrollWidth") start_size = sb.get_window_size()
y = min(sb.execute_script("return document.documentElement.scrollHeight"), 25_000) w, h = start_size["width"], start_size["height"]
x = max(sb.execute_script("return document.documentElement.scrollWidth"), w)
y = min(max(sb.execute_script("return document.documentElement.scrollHeight"), h), 25_000)
logger.debug(f"Setting window size to {x}x{y} for full page screenshot.")
sb.set_window_size(x, y) sb.set_window_size(x, y)
screen_filename = os.path.join(self.tmp_dir, f"screenshot{random_str(6)}.png") screen_filename = os.path.join(self.tmp_dir, f"screenshot{random_str(6)}.png")
@@ -278,12 +286,9 @@ class AntibotExtractorEnricher(Extractor, Enricher):
""" """
if max_media == 0: if max_media == 0:
return return
logger.debug(
f"Downloading media from {to_enrich.get_url()} with selector '{css_selector}' up to {max_media} items."
)
url = to_enrich.get_url() url = to_enrich.get_url()
all_urls = set() all_urls = set()
# media_elements = sb.find_elements(css_selector)
sources = sb.execute_script(f""" sources = sb.execute_script(f"""
return Array.from(document.querySelectorAll("{css_selector}")) return Array.from(document.querySelectorAll("{css_selector}"))
.map(el => el.src || el.href) .map(el => el.src || el.href)
@@ -293,10 +298,15 @@ class AntibotExtractorEnricher(Extractor, Enricher):
if len(all_urls) >= max_media: if len(all_urls) >= max_media:
logger.debug(f"Reached max download limit of {max_media} images/videos.") logger.debug(f"Reached max download limit of {max_media} images/videos.")
break break
mimerype = mimetypes.guess_type(src)[0] if not is_relevant_url(src):
if mimerype in self.exclude_media_mimetypes: continue
mimetype = mimetypes.guess_type(src)[0]
if mimetype in self.exclude_media_mimetypes:
continue continue
full_src = urljoin(url, src) full_src = urljoin(url, src)
if full_src not in all_urls and (filename := self.download_from_url(full_src)): if full_src not in all_urls:
filename, full_src = self.download_from_url(full_src, try_best_quality=True)
if not filename:
continue
all_urls.add(full_src) all_urls.add(full_src)
to_enrich.add_media(Media(filename=filename, properties={"url": full_src})) to_enrich.add_media(Media(filename=filename, properties={"url": full_src}))

View File

@@ -129,15 +129,33 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
), ),
( (
"https://seleniumbase.io/apps/turnstile", "https://seleniumbase.io/apps/turnstile",
'id="captcha-success"', '<img id="captcha-success" src="https://seleniumbase.io/cdn/img/green_check.png" style="" width="180">',
),
(
"https://seleniumbase.io/apps/form_turnstile",
'<img id="captcha-success" src="https://seleniumbase.io/cdn/img/green_check.png" width="120" style="">',
),
(
"https://gitlab.com/users/sign_in",
"Password",
), ),
], ],
) )
def test_download_with_cloudflare_turnstile(self, setup_module, make_item, url, in_html): def test_overcome_cloudflare_turnstile(self, setup_module, make_item, url, in_html):
""" """
Test downloading a page with Cloudflare Turnstile captcha. Test downloading a page with Cloudflare Turnstile captcha.
""" """
self.extractor = setup_module(
self.extractor_module,
{
"save_to_pdf": True,
"detect_auth_wall": False,
"max_download_images": 5,
"max_download_videos": "inf",
},
)
item = make_item(url) item = make_item(url)
self.extractor.enrich(item) self.extractor.enrich(item)