dropin with new ytdlp feature and helper method

This commit is contained in:
msramalho
2025-06-10 16:11:55 +01:00
parent 287e823f43
commit 6bd493a791
2 changed files with 81 additions and 57 deletions

View File

@@ -1,7 +1,10 @@
import os
from loguru import logger
from seleniumbase import SB
import yt_dlp
from auto_archiver.core.extractor import Extractor
from auto_archiver.core.metadata import Metadata
from auto_archiver.core import Extractor, Media, Metadata
from auto_archiver.utils.misc import ydl_entry_to_filename
class Dropin:
@@ -36,6 +39,20 @@ class Dropin:
"""
return url
@staticmethod
def images_selectors() -> str:
"""
CSS selector to find images in the HTML page
"""
return "img"
@staticmethod
def video_selectors() -> str:
"""
CSS selector to find videos in the HTML page.
"""
return "video, source"
def open_page(self, url) -> bool:
"""
Make sure the page is opened, even if it requires authentication, captcha solving, etc.
@@ -50,3 +67,59 @@ class Dropin:
:return: A tuple (number of Images added, number of Videos added).
"""
raise NotImplementedError("This method should be implemented in the subclass")
def _get_username_password(self, site) -> tuple[str, str]:
    """
    Look up the login credentials configured for a site.

    Reads the "username" and "password" entries from the mapping returned by the
    extractor's ``auth_for_site(site)``; a missing entry is treated as empty.

    :param site: Site identifier passed to ``auth_for_site`` and quoted in the error message.
    :return: A tuple (username, password).
    :raises ValueError: If the username or the password is missing or empty.
    """
    credentials = self.extractor.auth_for_site(site)
    username, password = (credentials.get(key, "") for key in ("username", "password"))
    if username and password:
        return username, password
    raise ValueError(f"{site} authentication requires a username and password.")
def _download_videos_with_ytdlp(self, video_urls: list[str], to_enrich: Metadata) -> int:
    """
    Download videos using yt-dlp and attach each downloaded file to the metadata.

    Empty entries and repeated URLs are dropped (first occurrence wins) before the
    extractor's ``max_download_videos`` cap is applied, so the cap counts distinct,
    usable URLs and the same video is never fetched twice. The cap is applied only
    when it is an int; any other value leaves the list uncapped.
    A failure on one URL is logged and does not stop the remaining downloads.

    :param video_urls: List of video URLs to download.
    :param to_enrich: Metadata object that receives one Media per downloaded video.
    :return: The number of videos downloaded.
    """
    # dict.fromkeys de-duplicates while preserving the original order.
    video_urls = list(dict.fromkeys(url for url in (video_urls or []) if url))

    limit = self.extractor.max_download_videos
    # bool is a subclass of int; exclude it so only real integer caps are applied.
    if isinstance(limit, int) and not isinstance(limit, bool):
        video_urls = video_urls[:limit]
    if not video_urls:
        return 0

    ydl_options = [
        "-o",
        os.path.join(self.extractor.tmp_dir, "%(id)s.%(ext)s"),
        "--quiet",
        "--no-playlist",
        "--no-write-subs",
        "--no-write-auto-subs",
        "--postprocessor-args",
        "ffmpeg:-bitexact",
        "--max-filesize",
        "1000M",  # Limit to 1GB per video
    ]
    # parse_options returns several values; only the last (the options dict) is needed.
    *_, validated_options = yt_dlp.parse_options(ydl_options)

    downloaded = 0
    with yt_dlp.YoutubeDL(validated_options) as ydl:
        for url in video_urls:
            try:
                logger.debug(f"Downloading video from URL: {url}")
                info = ydl.extract_info(url, download=True)
                # No info means nothing was extracted, so there is no file to look up.
                filename = ydl_entry_to_filename(ydl, info) if info else None
                if not filename:  # Failed to download video.
                    continue
                media = Media(filename)
                # Copy over only the yt-dlp fields that are actually present.
                for key in ("duration", "original_url", "fulltitle", "description", "upload_date"):
                    if key in info:
                        media.set(key, info[key])
                to_enrich.add_media(media)
                downloaded += 1
            except Exception as e:
                # Best effort: one broken URL must not abort the remaining downloads.
                logger.error(f"Error downloading {url}: {e}")
    return downloaded

View File

@@ -1,12 +1,8 @@
import os
import re
from auto_archiver.core.media import Media
from auto_archiver.core.metadata import Metadata
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
from auto_archiver.utils.misc import ydl_entry_to_filename
import yt_dlp
from loguru import logger
@@ -37,8 +33,8 @@ class VkDropin(Dropin):
def open_page(self, url) -> bool:
    """
    Make sure the VK page is open, logging in first when the sign-in prompt is shown.

    When the text "Sign in to VK" is visible, ``_login()`` is called exactly once and
    the URL is re-opened only if that call returns a truthy value.

    :param url: The URL to re-open after a successful login.
    :return: Always True.
    """
    if self.sb.is_text_visible("Sign in to VK") and self._login():
        self.sb.open(url)
    return True
def _login(self) -> bool:
@@ -50,13 +46,9 @@ class VkDropin(Dropin):
return True
# need to login
logger.debug("Logging in to VK...")
auth = self.extractor.auth_for_site("vk.com")
username = auth.get("username", "")
password = auth.get("password", "")
if not username or not password:
raise ValueError("VK authentication requires a username and password.")
logger.debug("Using username: {}", username)
username, password = self._get_username_password("vk.com")
logger.debug("Logging in to VK with username: {}", username)
self.sb.click('[data-testid="enter-another-way"]', timeout=10)
self.sb.clear('input[name="login"][type="tel"]', by="css selector", timeout=10)
self.sb.type('input[name="login"][type="tel"]', username, by="css selector", timeout=10)
@@ -80,47 +72,6 @@ class VkDropin(Dropin):
@logger.catch
def add_extra_media(self, to_enrich: Metadata) -> tuple[int, int]:
    """
    Extract video data from the currently open post with SeleniumBase.

    Collects the ``href`` of every ``a[href*="/video-"]`` link on the page and hands
    the URLs to ``_download_videos_with_ytdlp``, which applies the download cap,
    runs yt-dlp and attaches the downloaded files to ``to_enrich``.

    :param to_enrich: Metadata object that receives the downloaded videos.
    :return: A tuple (number of Images added, number of Videos added); the image count is always 0.
    """
    video_urls = [v.get_attribute("href") for v in self.sb.find_elements('a[href*="/video-"]')]
    if video_urls:
        logger.debug(f"Found {len(video_urls)} video URLs in the post, using ytdlp for download.")
    # The shared helper owns capping, downloading and per-URL error handling.
    return 0, self._download_videos_with_ytdlp(video_urls, to_enrich)