mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 05:08:28 +03:00
new auth wall check logic and escaped CSS selector in selenium
This commit is contained in:
@@ -99,7 +99,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
|||||||
dropin = self._get_suitable_dropin(url, sb)
|
dropin = self._get_suitable_dropin(url, sb)
|
||||||
dropin.open_page(url)
|
dropin.open_page(url)
|
||||||
|
|
||||||
if self.detect_auth_wall and self._hit_auth_wall(sb):
|
if self.detect_auth_wall and (dropin.hit_auth_wall() and self._hit_auth_wall(sb)):
|
||||||
logger.warning("Skipping since auth wall or CAPTCHA was detected")
|
logger.warning("Skipping since auth wall or CAPTCHA was detected")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
@@ -277,8 +277,14 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
|||||||
return
|
return
|
||||||
url = to_enrich.get_url()
|
url = to_enrich.get_url()
|
||||||
all_urls = set()
|
all_urls = set()
|
||||||
|
logger.debug(f"Extracting media for {js_css_selector=}")
|
||||||
|
|
||||||
|
try:
|
||||||
|
sources = sb.execute_script(js_css_selector)
|
||||||
|
except selenium.common.exceptions.JavascriptException as e:
|
||||||
|
logger.error(f"Error executing JavaScript selector {js_css_selector}: {e}")
|
||||||
|
return
|
||||||
|
|
||||||
sources = sb.execute_script(js_css_selector)
|
|
||||||
# js_for_css_selectors
|
# js_for_css_selectors
|
||||||
for src in sources:
|
for src in sources:
|
||||||
if len(all_urls) >= max_media:
|
if len(all_urls) >= max_media:
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
import json
|
||||||
import os
|
import os
|
||||||
import traceback
|
import traceback
|
||||||
from typing import Mapping
|
from typing import Mapping
|
||||||
@@ -74,8 +75,11 @@ class Dropin:
|
|||||||
|
|
||||||
You can overwrite this instead of `images_selector` for more control over scraped images.
|
You can overwrite this instead of `images_selector` for more control over scraped images.
|
||||||
"""
|
"""
|
||||||
|
if not self.images_selectors():
|
||||||
|
return "return [];"
|
||||||
|
safe_selector = json.dumps(self.images_selectors())
|
||||||
return f"""
|
return f"""
|
||||||
return Array.from(document.querySelectorAll("{self.images_selectors()}")).map(el => el.src || el.href).filter(Boolean);
|
return Array.from(document.querySelectorAll({safe_selector})).map(el => el.src || el.href).filter(Boolean);
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def js_for_video_css_selectors(self) -> str:
|
def js_for_video_css_selectors(self) -> str:
|
||||||
@@ -84,8 +88,11 @@ class Dropin:
|
|||||||
|
|
||||||
You can overwrite this instead of `video_selector` for more control over scraped videos.
|
You can overwrite this instead of `video_selector` for more control over scraped videos.
|
||||||
"""
|
"""
|
||||||
|
if not self.video_selectors():
|
||||||
|
return "return [];"
|
||||||
|
safe_selector = json.dumps(self.video_selectors())
|
||||||
return f"""
|
return f"""
|
||||||
return Array.from(document.querySelectorAll("{self.video_selectors()}")).map(el => el.src || el.href).filter(Boolean);
|
return Array.from(document.querySelectorAll({safe_selector})).map(el => el.src || el.href).filter(Boolean);
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def open_page(self, url) -> bool:
|
def open_page(self, url) -> bool:
|
||||||
@@ -103,6 +110,12 @@ class Dropin:
|
|||||||
"""
|
"""
|
||||||
return 0, 0
|
return 0, 0
|
||||||
|
|
||||||
|
def hit_auth_wall(self) -> bool:
|
||||||
|
"""
|
||||||
|
Custom check to see if the current page is behind an authentication wall. If True is returned, the default global auth wall detector is then run to make the final decision; if False is returned, no auth wall is detected and the page is considered open.
|
||||||
|
"""
|
||||||
|
return True
|
||||||
|
|
||||||
def _get_username_password(self, site) -> tuple[str, str]:
|
def _get_username_password(self, site) -> tuple[str, str]:
|
||||||
"""
|
"""
|
||||||
Get the username and password for the site from the extractor's auth data.
|
Get the username and password for the site from the extractor's auth data.
|
||||||
|
|||||||
Reference in New Issue
Block a user