Merge remote-tracking branch 'origin/main' into feat/seleniumbase

This commit is contained in:
msramalho
2025-06-03 11:05:16 +01:00
15 changed files with 1188 additions and 1333 deletions

View File

@@ -1,3 +1,4 @@
import mimetypes
import shutil
import sys
import datetime
@@ -11,6 +12,7 @@ from urllib.request import urlretrieve
import yt_dlp
from yt_dlp.extractor.common import InfoExtractor
from yt_dlp.utils import MaxDownloadsReached
import pysubs2
from loguru import logger
@@ -156,7 +158,7 @@ class GenericExtractor(Extractor):
logger.error("generate_once.js not found after transpilation.")
return
self.extractor_args.setdefault("youtube", {})["getpot_bgutil_script"] = script_path
self.extractor_args.setdefault("youtubepot-bgutilscript", {})["script_path"] = script_path
logger.info(f"PO Token script configured at: {script_path}")
except Exception as e:
@@ -301,9 +303,9 @@ class GenericExtractor(Extractor):
result.set_url(url)
if "description" in video_data and not result.get("content"):
result.set_content(video_data["description"])
result.set_content(video_data.pop("description"))
# extract comments if enabled
if self.comments:
if self.comments and video_data.get("comments", []) is not None:
result.set(
"comments",
[
@@ -362,7 +364,12 @@ class GenericExtractor(Extractor):
# this time download
ydl.params["getcomments"] = self.comments
# TODO: for playlist or long lists of videos, how to download one at a time so they can be stored before the next one is downloaded?
data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=True)
try:
data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=True)
except MaxDownloadsReached: # proceed as normal once MaxDownloadsReached is raised
pass
logger.success(data)
if "entries" in data:
entries = data.get("entries", [])
if not len(entries):
@@ -370,14 +377,33 @@ class GenericExtractor(Extractor):
return False
else:
entries = [data]
result = Metadata()
def _helper_get_filename(entry: dict) -> str:
    """Locate the file on disk that was downloaded for ``entry``.

    The expected path comes from ``ydl.prepare_filename(entry)`` (``ydl`` is
    taken from the enclosing scope); its extension is ignored and the
    containing directory is scanned for a match.

    A directory item matches when either:
      * its name starts with the expected base name, or
      * ``entry["url"]`` is set, the item's name without extension occurs
        inside that URL, and the item's guessed MIME type contains "video/".

    Returns the joined path of the first matching item, or ``False`` when
    nothing matches (note: the ``-> str`` annotation does not cover that case).
    """
    source_url = entry.get("url")

    expected_path = ydl.prepare_filename(entry)
    stem_path = os.path.splitext(expected_path)[0]  # '/get/path/to/file', extension dropped
    directory, basename = os.path.split(stem_path)  # ('/get/path/to', 'file')

    for candidate in os.listdir(directory):
        # Branch 1: name prefix match, accepted regardless of file type.
        if candidate.startswith(basename):
            return os.path.join(directory, candidate)
        # Branch 2: URL match, only accepted for video files.
        # NOTE(review): in the original single expression `and` binds tighter than
        # `or`, so the "video/" check guards only this branch - confirm that the
        # prefix branch is meant to accept non-video files too.
        if (
            source_url
            and os.path.splitext(candidate)[0] in source_url
            and "video/" in (mimetypes.guess_type(candidate)[0] or "")
        ):
            return os.path.join(directory, candidate)
    return False
for entry in entries:
try:
filename = ydl.prepare_filename(entry)
if not os.path.exists(filename):
filename = filename.split(".")[0] + ".mkv"
filename = _helper_get_filename(entry)
if not filename or not os.path.exists(filename):
# file was not downloaded or could not be retrieved, example: sensitive videos on YT without using cookies.
continue
logger.debug(f"Using filename {filename} for entry {entry.get('id', 'unknown')}")
new_media = Media(filename)
for x in ["duration", "original_url", "fulltitle", "description", "upload_date"]:
@@ -396,6 +422,9 @@ class GenericExtractor(Extractor):
result.add_media(new_media)
except Exception as e:
logger.error(f"Error processing entry {entry}: {e}")
if not len(result.media):
logger.warning(f"No media found for entry {entry}, skipping.")
return False
return self.add_metadata(data, info_extractor, url, result)
@@ -454,6 +483,13 @@ class GenericExtractor(Extractor):
dropin_submodule = self.dropin_for_name(info_extractor.ie_key())
def _helper_for_successful_extract_info(data, info_extractor, url, ydl):
    """Turn already-extracted yt-dlp info into the final result.

    Returns ``False`` (after logging a warning) when ``data`` is flagged as
    live and ``self.livestreams`` is falsy; otherwise returns whatever
    ``self.get_metadata_for_video`` produces for the same arguments.
    """
    is_live = data.get("is_live", False)
    if not is_live or self.livestreams:
        # it's a valid video that yt-dlp can download out of the box
        return self.get_metadata_for_video(data, info_extractor, url, ydl)

    logger.warning("Livestream detected, skipping due to 'livestreams' configuration setting")
    return False
try:
if dropin_submodule and dropin_submodule.skip_ytdlp_download(url, info_extractor):
logger.debug(f"Skipping using ytdlp to download files for {info_extractor.ie_key()}")
@@ -461,11 +497,12 @@ class GenericExtractor(Extractor):
# don't download since it can be a live stream
data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=False)
if data.get("is_live", False) and not self.livestreams:
logger.warning("Livestream detected, skipping due to 'livestreams' configuration setting")
return False
# it's a valid video that yt-dlp can download out of the box
result = self.get_metadata_for_video(data, info_extractor, url, ydl)
result = _helper_for_successful_extract_info(data, info_extractor, url, ydl)
except MaxDownloadsReached:
# yt-dlp raises an error when the max downloads limit is reached, and it shouldn't for our purposes, so we consider that a success
result = _helper_for_successful_extract_info(data, info_extractor, url, ydl)
except Exception as e:
if info_extractor.IE_NAME == "generic":
@@ -519,6 +556,8 @@ class GenericExtractor(Extractor):
"--write-subs" if self.subtitles else "--no-write-subs",
"--write-auto-subs" if self.subtitles else "--no-write-auto-subs",
"--live-from-start" if self.live_from_start else "--no-live-from-start",
"--postprocessor-args",
"ffmpeg:-bitexact", # ensure bitexact output to avoid mismatching hashes for same video
]
# proxy handling

View File

@@ -98,7 +98,7 @@ class GsheetsFeederDB(Feeder, Database):
return missing
def started(self, item: Metadata) -> None:
logger.warning(f"STARTED {item}")
logger.info(f"STARTED {item}")
gw, row = self._retrieve_gsheet(item)
gw.set_cell(row, "status", "Archive in progress")

View File

@@ -1 +0,0 @@
from .vk_extractor import VkExtractor

View File

@@ -1,37 +0,0 @@
{
    # Manifest for the VK extractor module: display name, module type,
    # dependencies, configuration options and user-facing description.
    "name": "VKontakte Extractor",
    "type": ["extractor"],
    "requires_setup": True,
    "depends": ["core", "utils"],
    "dependencies": {
        "python": ["loguru", "vk_url_scraper"],
    },
    "configs": {
        "username": {"required": True, "help": "valid VKontakte username"},
        "password": {"required": True, "help": "valid VKontakte password"},
        "session_file": {
            "default": "secrets/vk_config.v2.json",
            # Previously a copy-paste of the password help text; wording follows
            # the "Session File" entry in the description below.
            "help": "path to a session configuration file (.json) for persistent VK login",
        },
    },
    "description": """
The `VkExtractor` fetches posts, text, and images from VK (VKontakte) social media pages.
This archiver is specialized for `/wall` posts and uses the `VkScraper` library to extract
and download content. Note that VK videos are handled separately by the `YTDownloader`.
### Features
- Extracts text, timestamps, and metadata from VK `/wall` posts.
- Downloads associated images and attaches them to the resulting `Metadata` object.
- Processes multiple segments of VK URLs that contain mixed content (e.g., wall, photo).
- Outputs structured metadata and media using `Metadata` and `Media` objects.
### Setup
To use the `VkExtractor`, you must provide valid VKontakte login credentials and session information:
- **Username**: A valid VKontakte account username.
- **Password**: The corresponding password for the VKontakte account.
- **Session File**: Optional. Path to a session configuration file (`.json`) for persistent VK login.
Credentials can be set in the configuration file or directly via environment variables. Ensure you
have access to the VKontakte API by creating an account at [VKontakte](https://vk.com/).
""",
}

View File

@@ -1,43 +0,0 @@
from loguru import logger
from vk_url_scraper import VkScraper
from auto_archiver.utils.misc import dump_payload
from auto_archiver.core import Extractor
from auto_archiver.core import Metadata, Media
class VkExtractor(Extractor):
    """Extractor for the text and images of VK posts.

    VK videos are handled by YTDownloader, this archiver gets posts text and images.
    Currently only works for /wall posts.
    """

    def setup(self) -> None:
        """Build the `VkScraper` client from the configured username, password and session file."""
        self.vks = VkScraper(self.username, self.password, session_file=self.session_file)

    def download(self, item: Metadata) -> Metadata:
        """Scrape the VK URL held by ``item`` and collect its text and media.

        Returns ``False`` when the item's netloc does not contain "vk.com" or
        when the scraper yields nothing; otherwise returns a new `Metadata`
        marked as successful with the "vk" tag.
        """
        url = item.get_url()

        if "vk.com" not in item.netloc:
            return False

        # some urls can contain multiple wall/photo/... parts and all will be fetched
        scraped = self.vks.scrape(url)
        if len(scraped) == 0:
            return False
        logger.debug(f"VK: got {len(scraped)} scraped instances")

        result = Metadata()
        # title and timestamp are each filled from the first scrape that provides
        # a value the result reports as set; later scrapes do not overwrite them
        for post in scraped:
            if not result.get_title():
                result.set_title(post["text"])
            if not result.get_timestamp():
                result.set_timestamp(post["datetime"])

        # the full scrape payload is kept as the content of the result
        result.set_content(dump_payload(scraped))

        # media is downloaded into the extractor's tmp_dir and attached file by file
        for media_path in self.vks.download_media(scraped, self.tmp_dir):
            result.add_media(Media(media_path))

        return result.success("vk")

View File

@@ -194,7 +194,8 @@ class WaczExtractorEnricher(Enricher, Extractor):
shutil.copyfileobj(infile, outfile)
# get media out of .warc
counter = 0
counter_warc_files = 0
counter_screenshots = 0
seen_urls = set()
with open(warc_filename, "rb") as warc_stream:
@@ -203,12 +204,12 @@ class WaczExtractorEnricher(Enricher, Extractor):
if (
record.rec_type == "resource" and record.content_type == "image/png" and self.extract_screenshot
): # screenshots
fn = os.path.join(tmp_dir, f"warc-file-{counter}.png")
fn = os.path.join(tmp_dir, f"warc-file-{counter_screenshots}.png")
with open(fn, "wb") as outf:
outf.write(record.raw_stream.read())
m = Media(filename=fn)
to_enrich.add_media(m, "browsertrix-screenshot")
counter += 1
to_enrich.add_media(m, f"browsertrix-screenshot-{counter_screenshots}")
counter_screenshots += 1
if not self.extract_media:
continue
@@ -231,7 +232,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
# create local file and add media
ext = mimetypes.guess_extension(content_type)
warc_fn = f"warc-file-{counter}{ext}"
warc_fn = f"warc-file-{counter_screenshots}{ext}"
fn = os.path.join(tmp_dir, warc_fn)
record_url_best_qual = UrlUtil.twitter_best_quality_url(record_url)
@@ -256,6 +257,8 @@ class WaczExtractorEnricher(Enricher, Extractor):
continue
to_enrich.add_media(m, warc_fn)
counter += 1
counter_warc_files += 1
seen_urls.add(record_url)
logger.info(f"WACZ extract_media/extract_screenshot finished, found {counter} relevant media file(s)")
logger.info(
f"WACZ extract_media/extract_screenshot finished, found {counter_warc_files + counter_screenshots} relevant media file(s)"
)