diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py index 1982389..04e4702 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py @@ -116,13 +116,13 @@ class AntibotExtractorEnricher(Extractor, Enricher): self._enrich_download_media( sb, to_enrich, - css_selector=dropin.images_selectors(), + js_css_selector=dropin.js_for_image_css_selectors(), max_media=self.max_download_images - downloaded_images, ) self._enrich_download_media( sb, to_enrich, - css_selector=dropin.video_selectors(), + js_css_selector=dropin.js_for_video_css_selectors(), max_media=self.max_download_videos - downloaded_videos, ) logger.info(f"ANTIBOT completed for {url_sample}") @@ -266,7 +266,7 @@ class AntibotExtractorEnricher(Extractor, Enricher): to_enrich.add_media(Media(filename=pdf_filename), id="pdf") @logger.catch - def _enrich_download_media(self, sb: SB, to_enrich: Metadata, css_selector: str, max_media: int): + def _enrich_download_media(self, sb: SB, to_enrich: Metadata, js_css_selector: str, max_media: int): """ Downloads media from the page and adds them to the Metadata object. This method is called by the enrich method. 
@@ -276,11 +276,8 @@ class AntibotExtractorEnricher(Extractor, Enricher): url = to_enrich.get_url() all_urls = set() - sources = sb.execute_script(f""" - return Array.from(document.querySelectorAll("{css_selector}")) - .map(el => el.src || el.href) - .filter(Boolean); - """) + sources = sb.execute_script(js_css_selector) + # the script is built by the dropin's js_for_image_css_selectors() / js_for_video_css_selectors() for src in sources: if len(all_urls) >= max_media: logger.debug(f"Reached max download limit of {max_media} images/videos.") diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py index 15c2e28..2e8c4f6 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py @@ -53,6 +53,26 @@ class Dropin: """ return "video, source" + def js_for_image_css_selectors(self) -> str: + """ + A configurable JS script that embeds the CSS selector from `images_selectors()` and returns an array of image URLs (the `src` or `href` of each matched element). + + You can override this instead of `images_selectors` for more control over scraped images. + """ + return f""" + return Array.from(document.querySelectorAll("{self.images_selectors()}")).map(el => el.src || el.href).filter(Boolean); + """ + + def js_for_video_css_selectors(self) -> str: + """ + A configurable JS script that embeds the CSS selector from `video_selectors()` and returns an array of video URLs (the `src` or `href` of each matched element). + + You can override this instead of `video_selectors` for more control over scraped videos. + """ + return f""" + return Array.from(document.querySelectorAll("{self.video_selectors()}")).map(el => el.src || el.href).filter(Boolean); + """ + def open_page(self, url) -> bool: """ Make sure the page is opened, even if it requires authentication, captcha solving, etc. @@ -66,7 +86,7 @@ class Dropin: Extract image and/or video data from the currently open post with SeleniumBase. 
Media is added to the `to_enrich` Metadata object. :return: A tuple (number of Images added, number of Videos added). """ - raise NotImplementedError("This method should be implemented in the subclass") + return 0, 0 def _get_username_password(self, site) -> tuple[str, str]: """ diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/default.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/default.py index c5c865a..72ec3f0 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/default.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/default.py @@ -1,4 +1,3 @@ -from auto_archiver.core.metadata import Metadata from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin @@ -13,6 +12,3 @@ class DefaultDropin(Dropin): def open_page(self, url) -> bool: return True - - def add_extra_media(self, to_enrich: Metadata) -> tuple[int, int]: - return 0, 0