diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/__manifest__.py b/src/auto_archiver/modules/antibot_extractor_enricher/__manifest__.py
index d93c072..214653c 100644
--- a/src/auto_archiver/modules/antibot_extractor_enricher/__manifest__.py
+++ b/src/auto_archiver/modules/antibot_extractor_enricher/__manifest__.py
@@ -25,6 +25,11 @@
"default": "secrets/antibot_user_data",
"help": "Path to the user data directory for the webdriver. This is used to persist browser state, such as cookies and local storage. When using docker it's best to let docker create the folder otherwise there may be permission issues. The Extractor will try to work without it if that error occurs but login sessions will not be used or preserved on those runs.",
},
+ "detect_auth_wall": {
+ "default": True,
+ "type": "bool",
+ "help": "detect if the page is behind an authentication wall (e.g. login required) and skip it. disable if you want to archive pages where logins are required.",
+ },
"proxy": {
"default": None,
"help": "proxy to use for the webdriver, Format: 'SERVER:PORT' or 'USER:PASS@SERVER:PORT'",
diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py
index e82a2f8..08d0c03 100644
--- a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py
+++ b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py
@@ -15,7 +15,9 @@ from seleniumbase import SB
from auto_archiver.core import Extractor, Enricher, Metadata, Media
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
+from auto_archiver.modules.antibot_extractor_enricher.dropins.default import DefaultDropin
from auto_archiver.utils.misc import random_str
+from auto_archiver.utils.url import is_relevant_url
class AntibotExtractorEnricher(Extractor, Enricher):
@@ -102,39 +104,41 @@ class AntibotExtractorEnricher(Extractor, Enricher):
sb.uc_open_with_reconnect(url, 4)
logger.debug(f"ANTIBOT handling CAPTCHAs for {url_sample}...")
+ sb.uc_gui_handle_cf()
+ sb.uc_gui_click_rc() # NB: using handle instead of click breaks some sites like reddit. For now the two calls are kept separate here; in the future dropins could decide which one to use.
- # TODO: implement other Captcha handling
- sb.uc_gui_handle_captcha() # handles Cloudflare Turnstile captcha if detected
+ dropin = self._get_suitable_dropin(url, sb)
+ dropin.open_page(url)
- suitable_dropin = self._get_suitable_dropin(url, sb)
-
- if suitable_dropin:
- suitable_dropin.open_page(url)
-
- if self._hit_auth_wall(sb):
+ if self.detect_auth_wall and self._hit_auth_wall(sb):
logger.warning(f"ANTIBOT SKIP since auth wall or CAPTCHA was detected for {url_sample}")
return False
- logger.debug(f"ANTIBOT no auth wall detected for {url_sample}...")
+
sb.wait_for_ready_state_complete()
sb.sleep(1) # margin for the page to load completely
to_enrich.set_title(sb.get_title())
self._enrich_html_source_code(sb, to_enrich)
+
self._enrich_full_page_screenshot(sb, to_enrich)
if self.save_to_pdf:
self._enrich_full_page_pdf(sb, to_enrich)
- downloaded_images, downloaded_videos = 0, 0
- if suitable_dropin:
- downloaded_images, downloaded_videos = suitable_dropin.add_extra_media(to_enrich)
+ downloaded_images, downloaded_videos = dropin.add_extra_media(to_enrich)
self._enrich_download_media(
- sb, to_enrich, css_selector="img", max_media=self.max_download_images - downloaded_images
+ sb,
+ to_enrich,
+ css_selector=dropin.images_selectors(),
+ max_media=self.max_download_images - downloaded_images,
)
self._enrich_download_media(
- sb, to_enrich, css_selector="video, source", max_media=self.max_download_videos - downloaded_videos
+ sb,
+ to_enrich,
+ css_selector=dropin.video_selectors(),
+ max_media=self.max_download_videos - downloaded_videos,
)
- logger.success(f"ANTIBOT completed for {url_sample}")
+ logger.info(f"ANTIBOT completed for {url_sample}")
return to_enrich
except selenium.common.exceptions.SessionNotCreatedException as e:
@@ -155,10 +159,10 @@ class AntibotExtractorEnricher(Extractor, Enricher):
"""
for dropin in self.dropins:
if dropin.suitable(url):
- logger.debug(f"ANTIBOT using drop-in {dropin.__class__.__name__} for {url}")
+ logger.debug(f"ANTIBOT using drop-in {dropin.__name__} for {url}")
return dropin(sb, self)
- # logger.warning(f"ANTIBOT no suitable drop-in found for {url}")
- return None
+
+ return DefaultDropin(sb, self)
def _hit_auth_wall(self, sb: SB) -> bool:
"""
@@ -168,8 +172,8 @@ class AntibotExtractorEnricher(Extractor, Enricher):
# TODO: improve this detection logic, currently it is very basic and may not cover all cases
# Common URL patterns
- url = sb.get_current_url().lower()
- if any(kw in url for kw in ["login", "signin", "signup", "register", "captcha"]):
+ current_url = sb.get_current_url().lower()
+ if any(kw in current_url for kw in ["login", "signin", "signup", "register", "captcha"]):
return True
# Common visible text markers
@@ -245,8 +249,12 @@ class AntibotExtractorEnricher(Extractor, Enricher):
Enriches the full page screenshot of the Metadata object.
This method is called by the enrich method.
"""
- x = sb.execute_script("return document.documentElement.scrollWidth")
- y = min(sb.execute_script("return document.documentElement.scrollHeight"), 25_000)
+ start_size = sb.get_window_size()
+ w, h = start_size["width"], start_size["height"]
+
+ x = max(sb.execute_script("return document.documentElement.scrollWidth"), w)
+ y = min(max(sb.execute_script("return document.documentElement.scrollHeight"), h), 25_000)
+ logger.debug(f"Setting window size to {x}x{y} for full page screenshot.")
sb.set_window_size(x, y)
screen_filename = os.path.join(self.tmp_dir, f"screenshot{random_str(6)}.png")
@@ -278,12 +286,9 @@ class AntibotExtractorEnricher(Extractor, Enricher):
"""
if max_media == 0:
return
- logger.debug(
- f"Downloading media from {to_enrich.get_url()} with selector '{css_selector}' up to {max_media} items."
- )
url = to_enrich.get_url()
all_urls = set()
- # media_elements = sb.find_elements(css_selector)
+
sources = sb.execute_script(f"""
return Array.from(document.querySelectorAll("{css_selector}"))
.map(el => el.src || el.href)
@@ -293,10 +298,15 @@ class AntibotExtractorEnricher(Extractor, Enricher):
if len(all_urls) >= max_media:
logger.debug(f"Reached max download limit of {max_media} images/videos.")
break
- mimerype = mimetypes.guess_type(src)[0]
- if mimerype in self.exclude_media_mimetypes:
+ if not is_relevant_url(src):
+ continue
+ mimetype = mimetypes.guess_type(src)[0]
+ if mimetype in self.exclude_media_mimetypes:
continue
full_src = urljoin(url, src)
- if full_src not in all_urls and (filename := self.download_from_url(full_src)):
+ if full_src not in all_urls:
+ filename, full_src = self.download_from_url(full_src, try_best_quality=True)
+ if not filename:
+ continue
all_urls.add(full_src)
to_enrich.add_media(Media(filename=filename, properties={"url": full_src}))
diff --git a/tests/extractors/test_antibot_extractor_enricher.py b/tests/extractors/test_antibot_extractor_enricher.py
index 1da025d..600baf2 100644
--- a/tests/extractors/test_antibot_extractor_enricher.py
+++ b/tests/extractors/test_antibot_extractor_enricher.py
@@ -129,15 +129,33 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
),
(
"https://seleniumbase.io/apps/turnstile",
- 'id="captcha-success"',
+ '
',
+ ),
+ (
+ "https://seleniumbase.io/apps/form_turnstile",
+ '
',
+ ),
+ (
+ "https://gitlab.com/users/sign_in",
+ "Password",
),
],
)
- def test_download_with_cloudflare_turnstile(self, setup_module, make_item, url, in_html):
+ def test_overcome_cloudflare_turnstile(self, setup_module, make_item, url, in_html):
"""
Test downloading a page with Cloudflare Turnstile captcha.
"""
+ self.extractor = setup_module(
+ self.extractor_module,
+ {
+ "save_to_pdf": True,
+ "detect_auth_wall": False,
+ "max_download_images": 5,
+ "max_download_videos": "inf",
+ },
+ )
+
item = make_item(url)
self.extractor.enrich(item)