dropin with new ytdlp feature and helper method

2026-06-13 05:38:29 +03:00 · 2025-06-10 16:11:55 +01:00
parent 287e823f43
commit 6bd493a791
2 changed files with 81 additions and 57 deletions
--- a/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py
+++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropin.py
@@ -1,7 +1,10 @@
+import os
+from loguru import logger
 from seleniumbase import SB
+import yt_dlp

-from auto_archiver.core.extractor import Extractor
-from auto_archiver.core.metadata import Metadata
+from auto_archiver.core import Extractor, Media, Metadata
+from auto_archiver.utils.misc import ydl_entry_to_filename


 class Dropin:
@@ -36,6 +39,20 @@ class Dropin:
        """
        return url

+    @staticmethod
+    def images_selectors() -> str:
+        """
+        CSS selector to find images in the HTML page.
+
+        :return: The selector string; this implementation returns "img" (every <img> element).
+        """
+        return "img"
+
+    @staticmethod
+    def video_selectors() -> str:
+        """
+        CSS selector to find videos in the HTML page.
+
+        :return: The selector string; this implementation returns "video, source",
+            i.e. a selector list matching both <video> and <source> elements.
+        """
+        return "video, source"
+
    def open_page(self, url) -> bool:
        """
        Make sure the page is opened, even if it requires authentication, captcha solving, etc.
@@ -50,3 +67,59 @@ class Dropin:
        :return: A tuple (number of Images added, number of Videos added).
        """
        raise NotImplementedError("This method should be implemented in the subclass")
+
+    def _get_username_password(self, site) -> tuple[str, str]:
+        """
+        Look up the credentials configured for `site` through the extractor.
+
+        :param site: Site identifier handed to the extractor's `auth_for_site`.
+        :return: A tuple (username, password).
+        :raises ValueError: If the username or the password is missing or empty.
+        """
+        credentials = self.extractor.auth_for_site(site)
+        # Missing keys fall back to "" so they are rejected by the same check as empty values.
+        username, password = (credentials.get(field, "") for field in ("username", "password"))
+        if not (username and password):
+            raise ValueError(f"{site} authentication requires a username and password.")
+        return username, password
+
+    def _download_videos_with_ytdlp(self, video_urls: list[str], to_enrich: Metadata) -> int:
+        """
+        Download videos using yt-dlp and attach each downloaded file to `to_enrich`.
+
+        Empty/None entries and repeated URLs are dropped (first occurrence kept) before the
+        extractor's `max_download_videos` cap is applied, so the cap is only spent on
+        distinct, usable URLs. A failure on one URL is logged and does not stop the others.
+
+        :param video_urls: List of video URLs to download.
+        :param to_enrich: Metadata object that receives one Media per downloaded video.
+        :return: The number of videos downloaded.
+        """
+        # URL lists are typically built from element attributes, so they may contain
+        # None/"" or the same link several times; dict.fromkeys keeps the first-seen order.
+        video_urls = list(dict.fromkeys(url for url in video_urls if url))
+
+        # Only a plain int is used as a cap; any other configured value leaves the list uncapped.
+        if type(self.extractor.max_download_videos) is int:
+            video_urls = video_urls[: self.extractor.max_download_videos]
+
+        if not video_urls:
+            return 0
+
+        ydl_options = [
+            "-o",
+            os.path.join(self.extractor.tmp_dir, "%(id)s.%(ext)s"),
+            "--quiet",
+            "--no-playlist",
+            "--no-write-subs",
+            "--no-write-auto-subs",
+            "--postprocessor-args",
+            "ffmpeg:-bitexact",
+            "--max-filesize",
+            "1000M",  # Limit to 1GB per video
+        ]
+        # parse_options turns the CLI-style flags into the options dict YoutubeDL expects.
+        *_, validated_options = yt_dlp.parse_options(ydl_options)
+        downloaded = 0
+        with yt_dlp.YoutubeDL(validated_options) as ydl:
+            for url in video_urls:
+                try:
+                    logger.debug(f"Downloading video from URL: {url}")
+                    info = ydl.extract_info(url, download=True)
+                    filename = ydl_entry_to_filename(ydl, info)
+                    if not filename:  # Failed to download video.
+                        continue
+                    media = Media(filename)
+                    # Copy over whichever of these yt-dlp fields are present in the result.
+                    for key in ["duration", "original_url", "fulltitle", "description", "upload_date"]:
+                        if key in info:
+                            media.set(key, info[key])
+                    to_enrich.add_media(media)
+                    downloaded += 1
+                except Exception as e:
+                    # Best effort: one failing URL must not abort the remaining downloads.
+                    logger.error(f"Error downloading {url}: {e}")
+        return downloaded
--- a/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py
+++ b/src/auto_archiver/modules/antibot_extractor_enricher/dropins/vk.py
@@ -1,12 +1,8 @@
-import os
 import re

-from auto_archiver.core.media import Media
 from auto_archiver.core.metadata import Metadata
 from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
-from auto_archiver.utils.misc import ydl_entry_to_filename

-import yt_dlp
 from loguru import logger


@@ -37,8 +33,8 @@ class VkDropin(Dropin):

    def open_page(self, url) -> bool:
+        # The URL is re-opened only when _login() returns a truthy value.
+        # NOTE(review): True is returned whether or not the login step succeeded — confirm this is intended.
        if self.sb.is_text_visible("Sign in to VK"):
-            self._login()
-            self.sb.open(url)
+            if self._login():
+                self.sb.open(url)
        return True

    def _login(self) -> bool:
@@ -50,13 +46,9 @@ class VkDropin(Dropin):
            return True

        # need to login
-        logger.debug("Logging in to VK...")
-        auth = self.extractor.auth_for_site("vk.com")
-        username = auth.get("username", "")
-        password = auth.get("password", "")
-        if not username or not password:
-            raise ValueError("VK authentication requires a username and password.")
-        logger.debug("Using username: {}", username)
+        username, password = self._get_username_password("vk.com")
+        logger.debug("Logging in to VK with username: {}", username)
+
        self.sb.click('[data-testid="enter-another-way"]', timeout=10)
        self.sb.clear('input[name="login"][type="tel"]', by="css selector", timeout=10)
        self.sb.type('input[name="login"][type="tel"]', username, by="css selector", timeout=10)
@@ -80,47 +72,6 @@ class VkDropin(Dropin):

    @logger.catch
    def add_extra_media(self, to_enrich: Metadata) -> tuple[int, int]:
-        """
-        Extract video data from the currently open post with SeleniumBase.
-
-        :return: A tuple (number of Images added, number of Videos added).
-        """
+        """
+        Collect the hrefs of video links (anchors whose href contains "/video-") found by
+        SeleniumBase and download them with yt-dlp.
+
+        :param to_enrich: Metadata object passed on to the download helper.
+        :return: A tuple (number of Images added, number of Videos added); the image count is always 0.
+        """
        video_urls = [v.get_attribute("href") for v in self.sb.find_elements('a[href*="/video-"]')]
-        if type(self.extractor.max_download_videos) is int:
-            video_urls = video_urls[: self.extractor.max_download_videos]

-        if not video_urls:
-            return 0, 0
-
-        logger.debug(f"Found {len(video_urls)} video URLs in the post, using ytdlp for download.")
-        ydl_options = [
-            "-o",
-            os.path.join(self.extractor.tmp_dir, "%(id)s.%(ext)s"),
-            "--quiet",
-            "--no-playlist",
-            "--no-write-subs",
-            "--no-write-auto-subs",
-            "--postprocessor-args",
-            "ffmpeg:-bitexact",
-            "--max-filesize",
-            "1000M",  # Limit to 1GB per video
-        ]
-        *_, validated_options = yt_dlp.parse_options(ydl_options)
-        downloaded = 0
-        with yt_dlp.YoutubeDL(validated_options) as ydl:
-            for url in video_urls:
-                try:
-                    logger.debug(f"Downloading video from URL: {url}")
-                    info = ydl.extract_info(url, download=True)
-                    filename = ydl_entry_to_filename(ydl, info)
-                    if not filename:  # Failed to download video.
-                        continue
-                    media = Media(filename)
-                    for x in ["duration", "original_url", "fulltitle", "description", "upload_date"]:
-                        if x in info:
-                            media.set(x, info[x])
-                    to_enrich.add_media(media)
-                    downloaded += 1
-                except Exception as e:
-                    logger.error(f"Error downloading {url}: {e}")
-        return 0, downloaded
+        return 0, self._download_videos_with_ytdlp(video_urls, to_enrich)