From 9bbc13e9be7c0a40d886e87443416b532cee616d Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 18 Jan 2023 23:15:25 +0000 Subject: [PATCH] vk and yt-dlp --- src/archivers/__init__.py | 5 +- src/archivers/vk_archiverv2.py | 2 +- src/archivers/youtubedl_archiverv2.py | 70 +++++++++++++++++++++++++++ src/enrichers/thumbnail_enricher.py | 1 - 4 files changed, 74 insertions(+), 4 deletions(-) create mode 100644 src/archivers/youtubedl_archiverv2.py diff --git a/src/archivers/__init__.py b/src/archivers/__init__.py index d2a2c49..51d3546 100644 --- a/src/archivers/__init__.py +++ b/src/archivers/__init__.py @@ -5,7 +5,7 @@ from .archiver import Archiverv2 # from .telethon_archiver import TelethonArchiver # from .tiktok_archiver import TiktokArchiver from .wayback_archiver import WaybackArchiver -from .youtubedl_archiver import YoutubeDLArchiver +# from .youtubedl_archiver import YoutubeDLArchiver # from .twitter_archiver import TwitterArchiver # from .vk_archiver import VkArchiver # from .twitter_api_archiver import TwitterApiArchiver @@ -17,4 +17,5 @@ from .twitter_api_archiverv2 import TwitterApiArchiver from .instagram_archiverv2 import InstagramArchiver from .tiktok_archiverv2 import TiktokArchiver from .telegram_archiverv2 import TelegramArchiver -from .vk_archiverv2 import VkArchiver \ No newline at end of file +from .vk_archiverv2 import VkArchiver +from .youtubedl_archiverv2 import YoutubeDLArchiver \ No newline at end of file diff --git a/src/archivers/vk_archiverv2.py b/src/archivers/vk_archiverv2.py index ec06808..32b6cec 100644 --- a/src/archivers/vk_archiverv2.py +++ b/src/archivers/vk_archiverv2.py @@ -1,9 +1,9 @@ from loguru import logger from vk_url_scraper import VkScraper +from utils.misc import dump_payload from metadata import Metadata from media import Media -from utils.misc import dump_payload from .archiver import Archiverv2 diff --git a/src/archivers/youtubedl_archiverv2.py 
b/src/archivers/youtubedl_archiverv2.py new file mode 100644 index 0000000..6d26de6 --- /dev/null +++ b/src/archivers/youtubedl_archiverv2.py @@ -0,0 +1,70 @@ +import datetime +import os + +import yt_dlp +from loguru import logger + +from metadata import Metadata +from media import Media +from .archiver import Archiverv2 + + +class YoutubeDLArchiver(Archiverv2): + name = "youtubedl_enricher" + + def __init__(self, config: dict) -> None: + super().__init__(config) + + @staticmethod + def configs() -> dict: + return { + "facebook_cookie": {"default": None, "help": "optional facebook cookie to have more access to content, from browser, looks like 'cookie: datr= xxxx'"}, + } + + def download(self, item: Metadata) -> Metadata: + url = item.get_url() + + if item.netloc in ['facebook.com', 'www.facebook.com'] and self.facebook_cookie: + logger.debug('Using Facebook cookie') + yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie + + ydl = yt_dlp.YoutubeDL({'outtmpl': os.path.join(item.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False}) + + try: + # don't download since it can be a live stream + info = ydl.extract_info(url, download=False) + if info.get('is_live', False): + logger.warning("Live streaming media, not archiving now") + return False + except yt_dlp.utils.DownloadError as e: + logger.debug(f'No video - Youtube normal control flow: {e}') + return False + except Exception as e: + logger.debug(f'ytdlp exception which is normal for example a facebook page with images only will cause a IndexError: list index out of range. 
Exception here is: \n {e}') + return False + + # this time download + info = ydl.extract_info(url, download=True) + if "entries" in info: + entries = info.get("entries", []) + if not len(entries): + logger.warning('YoutubeDLArchiver could not find any video') + return False + else: entries = [info] + + result = Metadata() + result.set_title(info.get("title")) + for entry in entries: + filename = ydl.prepare_filename(entry) + if not os.path.exists(filename): + filename = filename.split('.')[0] + '.mkv' + result.add_media(Media(filename).set("duration", info.get("duration"))) + + if (timestamp := info.get("timestamp")): + timestamp = datetime.datetime.utcfromtimestamp(timestamp).replace(tzinfo=datetime.timezone.utc).isoformat() + result.set_timestamp(timestamp) + if (upload_date := info.get("upload_date")): + upload_date = datetime.datetime.strptime(upload_date, '%Y%m%d').replace(tzinfo=datetime.timezone.utc) + result.set("upload_date", upload_date) + + return result.success("yt-dlp") diff --git a/src/enrichers/thumbnail_enricher.py b/src/enrichers/thumbnail_enricher.py index 94c5ee7..32e09be 100644 --- a/src/enrichers/thumbnail_enricher.py +++ b/src/enrichers/thumbnail_enricher.py @@ -25,7 +25,6 @@ class ThumbnailEnricher(Enricher): folder = os.path.join(to_enrich.get_tmp_dir(), str(uuid.uuid4())) os.makedirs(folder, exist_ok=True) for i, m in enumerate(to_enrich.media[::]): - logger.info(m) if m.is_video(): logger.debug(f"generating thumbnails for {m.filename}") fps, duration = 0.5, m.get("duration")