diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py
index 805edfd..15c2e28 100644
--- a/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py
+++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py
@@ -1,7 +1,10 @@
+import os
+from loguru import logger
 from seleniumbase import SB
+import yt_dlp
 
-from auto_archiver.core.extractor import Extractor
-from auto_archiver.core.metadata import Metadata
+from auto_archiver.core import Extractor, Media, Metadata
+from auto_archiver.utils.misc import ydl_entry_to_filename
 
 
 class Dropin:
@@ -36,6 +39,20 @@ class Dropin:
         """
         return url
 
+    @staticmethod
+    def images_selectors() -> str:
+        """
+        CSS selector to find images in the HTML page.
+        """
+        return "img"
+
+    @staticmethod
+    def video_selectors() -> str:
+        """
+        CSS selector to find videos in the HTML page.
+        """
+        return "video, source"
+
     def open_page(self, url) -> bool:
         """
         Make sure the page is opened, even if it requires authentication, captcha solving, etc.
@@ -50,3 +67,67 @@ class Dropin:
         :return: A tuple (number of Images added, number of Videos added).
         """
         raise NotImplementedError("This method should be implemented in the subclass")
+
+    def _get_username_password(self, site) -> tuple[str, str]:
+        """
+        Get the username and password for the site from the extractor's auth data.
+
+        :param site: Site identifier handed to the extractor's auth_for_site lookup.
+        :return: A tuple (username, password).
+        :raises ValueError: If the username or the password is missing or empty.
+        """
+        auth = self.extractor.auth_for_site(site)
+        username = auth.get("username", "")
+        password = auth.get("password", "")
+        if not username or not password:
+            raise ValueError(f"{site} authentication requires a username and password.")
+        return username, password
+
+    def _download_videos_with_ytdlp(self, video_urls: list[str], to_enrich: Metadata) -> int:
+        """
+        Download videos using yt-dlp and add each downloaded file to to_enrich as Media.
+
+        :param video_urls: List of video URLs to download.
+        :param to_enrich: Metadata object that receives one Media per downloaded video.
+        :return: The number of videos downloaded.
+        """
+        # Drop empty entries and repeated URLs (first occurrence wins) so duplicates
+        # are neither downloaded twice nor counted against max_download_videos.
+        video_urls = list(dict.fromkeys(url for url in (video_urls or []) if url))
+        if type(self.extractor.max_download_videos) is int:
+            video_urls = video_urls[: self.extractor.max_download_videos]
+
+        if not video_urls:
+            return 0
+
+        ydl_options = [
+            "-o",
+            os.path.join(self.extractor.tmp_dir, "%(id)s.%(ext)s"),
+            "--quiet",
+            "--no-playlist",
+            "--no-write-subs",
+            "--no-write-auto-subs",
+            "--postprocessor-args",
+            "ffmpeg:-bitexact",
+            "--max-filesize",
+            "1000M",  # Limit to 1GB per video
+        ]
+        *_, validated_options = yt_dlp.parse_options(ydl_options)
+        downloaded = 0
+        with yt_dlp.YoutubeDL(validated_options) as ydl:
+            for url in video_urls:
+                try:
+                    logger.debug(f"Downloading video from URL: {url}")
+                    info = ydl.extract_info(url, download=True)
+                    filename = ydl_entry_to_filename(ydl, info)
+                    if not filename:  # Failed to download video.
+                        continue
+                    media = Media(filename)
+                    for x in ["duration", "original_url", "fulltitle", "description", "upload_date"]:
+                        if x in info:
+                            media.set(x, info[x])
+                    to_enrich.add_media(media)
+                    downloaded += 1
+                except Exception as e:
+                    logger.error(f"Error downloading {url}: {e}")
+        return downloaded
diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py
index 6f54187..76e176e 100644
--- a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py
+++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py
@@ -1,12 +1,8 @@
-import os
 import re
 
-from auto_archiver.core.media import Media
 from auto_archiver.core.metadata import Metadata
 from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
-from auto_archiver.utils.misc import ydl_entry_to_filename
 
-import yt_dlp
 from loguru import logger
 
 
@@ -37,8 +33,8 @@ class VkDropin(Dropin):
 
     def open_page(self, url) -> bool:
         if self.sb.is_text_visible("Sign in to VK"):
-            self._login()
-            self.sb.open(url)
+            if self._login():
+                self.sb.open(url)
         return True
 
     def _login(self) -> bool:
@@ -50,13 +46,9 @@ class VkDropin(Dropin):
             return True
 
         # need to login
-        logger.debug("Logging in to VK...")
-        auth = self.extractor.auth_for_site("vk.com")
-        username = auth.get("username", "")
-        password = auth.get("password", "")
-        if not username or not password:
-            raise ValueError("VK authentication requires a username and password.")
-        logger.debug("Using username: {}", username)
+        username, password = self._get_username_password("vk.com")
+        logger.debug("Logging in to VK with username: {}", username)
+
         self.sb.click('[data-testid="enter-another-way"]', timeout=10)
         self.sb.clear('input[name="login"][type="tel"]', by="css selector", timeout=10)
         self.sb.type('input[name="login"][type="tel"]', username, by="css selector", timeout=10)
@@ -80,47 +72,6 @@ class VkDropin(Dropin):
 
     @logger.catch
     def add_extra_media(self, to_enrich: Metadata) -> tuple[int, int]:
-        """
-        Extract video data from the currently open post with SeleniumBase.
-
-        :return: A tuple (number of Images added, number of Videos added).
-        """
         video_urls = [v.get_attribute("href") for v in self.sb.find_elements('a[href*="/video-"]')]
 
-        if type(self.extractor.max_download_videos) is int:
-            video_urls = video_urls[: self.extractor.max_download_videos]
-        if not video_urls:
-            return 0, 0
-
-        logger.debug(f"Found {len(video_urls)} video URLs in the post, using ytdlp for download.")
-        ydl_options = [
-            "-o",
-            os.path.join(self.extractor.tmp_dir, "%(id)s.%(ext)s"),
-            "--quiet",
-            "--no-playlist",
-            "--no-write-subs",
-            "--no-write-auto-subs",
-            "--postprocessor-args",
-            "ffmpeg:-bitexact",
-            "--max-filesize",
-            "1000M",  # Limit to 1GB per video
-        ]
-        *_, validated_options = yt_dlp.parse_options(ydl_options)
-        downloaded = 0
-        with yt_dlp.YoutubeDL(validated_options) as ydl:
-            for url in video_urls:
-                try:
-                    logger.debug(f"Downloading video from URL: {url}")
-                    info = ydl.extract_info(url, download=True)
-                    filename = ydl_entry_to_filename(ydl, info)
-                    if not filename:  # Failed to download video.
-                        continue
-                    media = Media(filename)
-                    for x in ["duration", "original_url", "fulltitle", "description", "upload_date"]:
-                        if x in info:
-                            media.set(x, info[x])
-                    to_enrich.add_media(media)
-                    downloaded += 1
-                except Exception as e:
-                    logger.error(f"Error downloading {url}: {e}")
-        return 0, downloaded
+        return 0, self._download_videos_with_ytdlp(video_urls, to_enrich)