mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 13:18:28 +03:00
Ruff format with defaults.
This commit is contained in:
@@ -1 +1 @@
|
||||
from .generic_extractor import GenericExtractor
|
||||
from .generic_extractor import GenericExtractor
|
||||
|
||||
@@ -4,15 +4,16 @@ from auto_archiver.core.extractor import Extractor
|
||||
from auto_archiver.core.metadata import Metadata, Media
|
||||
from .dropin import GenericDropin, InfoExtractor
|
||||
|
||||
class Bluesky(GenericDropin):
|
||||
|
||||
class Bluesky(GenericDropin):
|
||||
def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
|
||||
result = Metadata()
|
||||
result.set_url(url)
|
||||
result.set_title(post["record"]["text"])
|
||||
result.set_timestamp(post["record"]["createdAt"])
|
||||
for k, v in self._get_post_data(post).items():
|
||||
if v: result.set(k, v)
|
||||
if v:
|
||||
result.set(k, v)
|
||||
|
||||
# download if embeds present (1 video XOR >=1 images)
|
||||
for media in self._download_bsky_embeds(post, archiver):
|
||||
@@ -23,12 +24,12 @@ class Bluesky(GenericDropin):
|
||||
|
||||
def extract_post(self, url: str, ie_instance: InfoExtractor) -> dict:
|
||||
# TODO: If/when this PR (https://github.com/yt-dlp/yt-dlp/pull/12098) is merged on ytdlp, remove the comments and delete the code below
|
||||
handle, video_id = ie_instance._match_valid_url(url).group('handle', 'id')
|
||||
handle, video_id = ie_instance._match_valid_url(url).group("handle", "id")
|
||||
return ie_instance._extract_post(handle=handle, post_id=video_id)
|
||||
|
||||
def _download_bsky_embeds(self, post: dict, archiver: Extractor) -> list[Media]:
|
||||
"""
|
||||
Iterates over image(s) or video in a Bluesky post and downloads them
|
||||
Iterates over image(s) or video in a Bluesky post and downloads them
|
||||
"""
|
||||
media = []
|
||||
embed = post.get("record", {}).get("embed", {})
|
||||
@@ -37,16 +38,15 @@ class Bluesky(GenericDropin):
|
||||
|
||||
media_url = "https://bsky.social/xrpc/com.atproto.sync.getBlob?cid={}&did={}"
|
||||
for image_media in image_medias:
|
||||
url = media_url.format(image_media['image']['ref']['$link'], post['author']['did'])
|
||||
url = media_url.format(image_media["image"]["ref"]["$link"], post["author"]["did"])
|
||||
image_media = archiver.download_from_url(url)
|
||||
media.append(Media(image_media))
|
||||
for video_media in video_medias:
|
||||
url = media_url.format(video_media['ref']['$link'], post['author']['did'])
|
||||
url = media_url.format(video_media["ref"]["$link"], post["author"]["did"])
|
||||
video_media = archiver.download_from_url(url)
|
||||
media.append(Media(video_media))
|
||||
return media
|
||||
|
||||
|
||||
def _get_post_data(self, post: dict) -> dict:
|
||||
"""
|
||||
Extracts relevant information returned by the .getPostThread api call (excluding text/created_at): author, mentions, tags, links.
|
||||
@@ -74,4 +74,4 @@ class Bluesky(GenericDropin):
|
||||
res["tags"] = tags
|
||||
if links:
|
||||
res["links"] = links
|
||||
return res
|
||||
return res
|
||||
|
||||
@@ -2,11 +2,12 @@ from yt_dlp.extractor.common import InfoExtractor
|
||||
from auto_archiver.core.metadata import Metadata
|
||||
from auto_archiver.core.extractor import Extractor
|
||||
|
||||
|
||||
class GenericDropin:
|
||||
"""Base class for dropins for the generic extractor.
|
||||
|
||||
|
||||
In many instances, an extractor will exist in ytdlp, but it will only process videos.
|
||||
Dropins can be created and used to make use of the already-written private code of a
|
||||
Dropins can be created and used to make use of the already-written private code of a
|
||||
specific extractor from ytdlp.
|
||||
|
||||
The dropin should be able to handle the following methods:
|
||||
@@ -28,21 +29,19 @@ class GenericDropin:
|
||||
This method should return the post data from the url.
|
||||
"""
|
||||
raise NotImplementedError("This method should be implemented in the subclass")
|
||||
|
||||
|
||||
def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
|
||||
"""
|
||||
This method should create a Metadata object from the post data.
|
||||
"""
|
||||
raise NotImplementedError("This method should be implemented in the subclass")
|
||||
|
||||
|
||||
def skip_ytdlp_download(self, url: str, ie_instance: InfoExtractor):
|
||||
"""
|
||||
This method should return True if you want to skip the ytdlp download method.
|
||||
"""
|
||||
return False
|
||||
|
||||
|
||||
def keys_to_clean(self, video_data: dict, info_extractor: InfoExtractor):
|
||||
"""
|
||||
This method should return a list of strings (keys) to clean from the video_data dict.
|
||||
@@ -50,9 +49,9 @@ class GenericDropin:
|
||||
E.g. ["uploader", "uploader_id", "tiktok_specific_field"]
|
||||
"""
|
||||
return []
|
||||
|
||||
|
||||
def download_additional_media(self, video_data: dict, info_extractor: InfoExtractor, metadata: Metadata):
|
||||
"""
|
||||
This method should download any additional media from the post.
|
||||
"""
|
||||
return metadata
|
||||
return metadata
|
||||
|
||||
@@ -3,16 +3,15 @@ from .dropin import GenericDropin
|
||||
|
||||
class Facebook(GenericDropin):
|
||||
def extract_post(self, url: str, ie_instance):
|
||||
video_id = ie_instance._match_valid_url(url).group('id')
|
||||
ie_instance._download_webpage(
|
||||
url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id)
|
||||
webpage = ie_instance._download_webpage(url, ie_instance._match_valid_url(url).group('id'))
|
||||
video_id = ie_instance._match_valid_url(url).group("id")
|
||||
ie_instance._download_webpage(url.replace("://m.facebook.com/", "://www.facebook.com/"), video_id)
|
||||
webpage = ie_instance._download_webpage(url, ie_instance._match_valid_url(url).group("id"))
|
||||
|
||||
# TODO: fix once https://github.com/yt-dlp/yt-dlp/pull/12275 is merged
|
||||
post_data = ie_instance._extract_metadata(webpage)
|
||||
return post_data
|
||||
|
||||
|
||||
def create_metadata(self, post: dict, ie_instance, archiver, url):
|
||||
metadata = archiver.create_metadata(url)
|
||||
metadata.set_title(post.get('title')).set_content(post.get('description')).set_post_data(post)
|
||||
return metadata
|
||||
metadata.set_title(post.get("title")).set_content(post.get("description")).set_post_data(post)
|
||||
return metadata
|
||||
|
||||
@@ -12,6 +12,7 @@ from loguru import logger
|
||||
from auto_archiver.core.extractor import Extractor
|
||||
from auto_archiver.core import Metadata, Media
|
||||
|
||||
|
||||
class GenericExtractor(Extractor):
|
||||
_dropins = {}
|
||||
|
||||
@@ -19,14 +20,14 @@ class GenericExtractor(Extractor):
|
||||
# check for file .ytdlp-update in the secrets folder
|
||||
if self.ytdlp_update_interval < 0:
|
||||
return
|
||||
|
||||
use_secrets = os.path.exists('secrets')
|
||||
path = os.path.join('secrets' if use_secrets else '', '.ytdlp-update')
|
||||
|
||||
use_secrets = os.path.exists("secrets")
|
||||
path = os.path.join("secrets" if use_secrets else "", ".ytdlp-update")
|
||||
next_update_check = None
|
||||
if os.path.exists(path):
|
||||
with open(path, "r") as f:
|
||||
next_update_check = datetime.datetime.fromisoformat(f.read())
|
||||
|
||||
|
||||
if not next_update_check or next_update_check < datetime.datetime.now():
|
||||
self.update_ytdlp()
|
||||
|
||||
@@ -36,8 +37,11 @@ class GenericExtractor(Extractor):
|
||||
|
||||
def update_ytdlp(self):
|
||||
logger.info("Checking and updating yt-dlp...")
|
||||
logger.info(f"Tip: change the 'ytdlp_update_interval' setting to control how often yt-dlp is updated. Set to -1 to disable or 0 to enable on every run. Current setting: {self.ytdlp_update_interval}")
|
||||
logger.info(
|
||||
f"Tip: change the 'ytdlp_update_interval' setting to control how often yt-dlp is updated. Set to -1 to disable or 0 to enable on every run. Current setting: {self.ytdlp_update_interval}"
|
||||
)
|
||||
from importlib.metadata import version as get_version
|
||||
|
||||
old_version = get_version("yt-dlp")
|
||||
try:
|
||||
# try and update with pip (this works inside poetry environment and in a normal virtualenv)
|
||||
@@ -59,15 +63,17 @@ class GenericExtractor(Extractor):
|
||||
for info_extractor in yt_dlp.YoutubeDL()._ies.values():
|
||||
if info_extractor.suitable(url) and info_extractor.working():
|
||||
yield info_extractor
|
||||
|
||||
|
||||
def suitable(self, url: str) -> bool:
|
||||
"""
|
||||
Checks for valid URLs out of all ytdlp extractors.
|
||||
Returns False for the GenericIE, which as labelled by yt-dlp: 'Generic downloader that works on some sites'
|
||||
"""
|
||||
return any(self.suitable_extractors(url))
|
||||
|
||||
def download_additional_media(self, video_data: dict, info_extractor: InfoExtractor, metadata: Metadata) -> Metadata:
|
||||
|
||||
def download_additional_media(
|
||||
self, video_data: dict, info_extractor: InfoExtractor, metadata: Metadata
|
||||
) -> Metadata:
|
||||
"""
|
||||
Downloads additional media like images, comments, subtitles, etc.
|
||||
|
||||
@@ -76,7 +82,7 @@ class GenericExtractor(Extractor):
|
||||
|
||||
# Just get the main thumbnail. More thumbnails are available in
|
||||
# video_data['thumbnails'] should they be required
|
||||
thumbnail_url = video_data.get('thumbnail')
|
||||
thumbnail_url = video_data.get("thumbnail")
|
||||
if thumbnail_url:
|
||||
try:
|
||||
cover_image_path = self.download_from_url(thumbnail_url)
|
||||
@@ -99,15 +105,65 @@ class GenericExtractor(Extractor):
|
||||
Clean up the ytdlp generic video data to make it more readable and remove unnecessary keys that ytdlp adds
|
||||
"""
|
||||
|
||||
base_keys = ['formats', 'thumbnail', 'display_id', 'epoch', 'requested_downloads',
|
||||
'duration_string', 'thumbnails', 'http_headers', 'webpage_url_basename', 'webpage_url_domain',
|
||||
'extractor', 'extractor_key', 'playlist', 'playlist_index', 'duration_string', 'protocol', 'requested_subtitles',
|
||||
'format_id', 'acodec', 'vcodec', 'ext', 'epoch', '_has_drm', 'filesize', 'audio_ext', 'video_ext', 'vbr', 'abr',
|
||||
'resolution', 'dynamic_range', 'aspect_ratio', 'cookies', 'format', 'quality', 'preference', 'artists',
|
||||
'channel_id', 'subtitles', 'tbr', 'url', 'original_url', 'automatic_captions', 'playable_in_embed', 'live_status',
|
||||
'_format_sort_fields', 'chapters', 'requested_formats', 'format_note',
|
||||
'audio_channels', 'asr', 'fps', 'was_live', 'is_live', 'heatmap', 'age_limit', 'stretched_ratio']
|
||||
|
||||
base_keys = [
|
||||
"formats",
|
||||
"thumbnail",
|
||||
"display_id",
|
||||
"epoch",
|
||||
"requested_downloads",
|
||||
"duration_string",
|
||||
"thumbnails",
|
||||
"http_headers",
|
||||
"webpage_url_basename",
|
||||
"webpage_url_domain",
|
||||
"extractor",
|
||||
"extractor_key",
|
||||
"playlist",
|
||||
"playlist_index",
|
||||
"duration_string",
|
||||
"protocol",
|
||||
"requested_subtitles",
|
||||
"format_id",
|
||||
"acodec",
|
||||
"vcodec",
|
||||
"ext",
|
||||
"epoch",
|
||||
"_has_drm",
|
||||
"filesize",
|
||||
"audio_ext",
|
||||
"video_ext",
|
||||
"vbr",
|
||||
"abr",
|
||||
"resolution",
|
||||
"dynamic_range",
|
||||
"aspect_ratio",
|
||||
"cookies",
|
||||
"format",
|
||||
"quality",
|
||||
"preference",
|
||||
"artists",
|
||||
"channel_id",
|
||||
"subtitles",
|
||||
"tbr",
|
||||
"url",
|
||||
"original_url",
|
||||
"automatic_captions",
|
||||
"playable_in_embed",
|
||||
"live_status",
|
||||
"_format_sort_fields",
|
||||
"chapters",
|
||||
"requested_formats",
|
||||
"format_note",
|
||||
"audio_channels",
|
||||
"asr",
|
||||
"fps",
|
||||
"was_live",
|
||||
"is_live",
|
||||
"heatmap",
|
||||
"age_limit",
|
||||
"stretched_ratio",
|
||||
]
|
||||
|
||||
dropin = self.dropin_for_name(info_extractor.ie_key())
|
||||
if dropin:
|
||||
try:
|
||||
@@ -116,8 +172,8 @@ class GenericExtractor(Extractor):
|
||||
pass
|
||||
|
||||
return base_keys
|
||||
|
||||
def add_metadata(self, video_data: dict, info_extractor: InfoExtractor, url:str, result: Metadata) -> Metadata:
|
||||
|
||||
def add_metadata(self, video_data: dict, info_extractor: InfoExtractor, url: str, result: Metadata) -> Metadata:
|
||||
"""
|
||||
Creates a Metadata object from the given video_data
|
||||
"""
|
||||
@@ -126,29 +182,36 @@ class GenericExtractor(Extractor):
|
||||
result = self.download_additional_media(video_data, info_extractor, result)
|
||||
|
||||
# keep both 'title' and 'fulltitle', but prefer 'title', falling back to 'fulltitle' if it doesn't exist
|
||||
result.set_title(video_data.pop('title', video_data.pop('fulltitle', "")))
|
||||
result.set_title(video_data.pop("title", video_data.pop("fulltitle", "")))
|
||||
result.set_url(url)
|
||||
if "description" in video_data: result.set_content(video_data["description"])
|
||||
if "description" in video_data:
|
||||
result.set_content(video_data["description"])
|
||||
# extract comments if enabled
|
||||
if self.comments:
|
||||
result.set("comments", [{
|
||||
"text": c["text"],
|
||||
"author": c["author"],
|
||||
"timestamp": datetime.datetime.fromtimestamp(c.get("timestamp"), tz = datetime.timezone.utc)
|
||||
} for c in video_data.get("comments", [])])
|
||||
result.set(
|
||||
"comments",
|
||||
[
|
||||
{
|
||||
"text": c["text"],
|
||||
"author": c["author"],
|
||||
"timestamp": datetime.datetime.fromtimestamp(c.get("timestamp"), tz=datetime.timezone.utc),
|
||||
}
|
||||
for c in video_data.get("comments", [])
|
||||
],
|
||||
)
|
||||
|
||||
# then add the common metadata
|
||||
if timestamp := video_data.pop("timestamp", None):
|
||||
timestamp = datetime.datetime.fromtimestamp(timestamp, tz = datetime.timezone.utc).isoformat()
|
||||
timestamp = datetime.datetime.fromtimestamp(timestamp, tz=datetime.timezone.utc).isoformat()
|
||||
result.set_timestamp(timestamp)
|
||||
if upload_date := video_data.pop("upload_date", None):
|
||||
upload_date = datetime.datetime.strptime(upload_date, '%Y%m%d').replace(tzinfo=datetime.timezone.utc)
|
||||
upload_date = datetime.datetime.strptime(upload_date, "%Y%m%d").replace(tzinfo=datetime.timezone.utc)
|
||||
result.set("upload_date", upload_date)
|
||||
|
||||
|
||||
# then clean away any keys we don't want
|
||||
for clean_key in self.keys_to_clean(info_extractor, video_data):
|
||||
video_data.pop(clean_key, None)
|
||||
|
||||
|
||||
# then add the rest of the video data
|
||||
for k, v in video_data.items():
|
||||
if v:
|
||||
@@ -169,22 +232,24 @@ class GenericExtractor(Extractor):
|
||||
logger.debug(f"""Could not find valid dropin for {info_extractor.IE_NAME}.
|
||||
Why not try creating your own, and make sure it has a valid function called 'create_metadata'. Learn more: https://auto-archiver.readthedocs.io/en/latest/user_guidelines.html#""")
|
||||
return False
|
||||
|
||||
|
||||
post_data = dropin.extract_post(url, ie_instance)
|
||||
return dropin.create_metadata(post_data, ie_instance, self, url)
|
||||
|
||||
def get_metadata_for_video(self, data: dict, info_extractor: Type[InfoExtractor], url: str, ydl: yt_dlp.YoutubeDL) -> Metadata:
|
||||
|
||||
def get_metadata_for_video(
|
||||
self, data: dict, info_extractor: Type[InfoExtractor], url: str, ydl: yt_dlp.YoutubeDL
|
||||
) -> Metadata:
|
||||
# this time download
|
||||
ydl.params['getcomments'] = self.comments
|
||||
#TODO: for playlist or long lists of videos, how to download one at a time so they can be stored before the next one is downloaded?
|
||||
ydl.params["getcomments"] = self.comments
|
||||
# TODO: for playlist or long lists of videos, how to download one at a time so they can be stored before the next one is downloaded?
|
||||
data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=True)
|
||||
if "entries" in data:
|
||||
entries = data.get("entries", [])
|
||||
if not len(entries):
|
||||
logger.warning('YoutubeDLArchiver could not find any video')
|
||||
logger.warning("YoutubeDLArchiver could not find any video")
|
||||
return False
|
||||
else: entries = [data]
|
||||
else:
|
||||
entries = [data]
|
||||
|
||||
result = Metadata()
|
||||
|
||||
@@ -192,17 +257,18 @@ class GenericExtractor(Extractor):
|
||||
try:
|
||||
filename = ydl.prepare_filename(entry)
|
||||
if not os.path.exists(filename):
|
||||
filename = filename.split('.')[0] + '.mkv'
|
||||
filename = filename.split(".")[0] + ".mkv"
|
||||
|
||||
new_media = Media(filename)
|
||||
for x in ["duration", "original_url", "fulltitle", "description", "upload_date"]:
|
||||
if x in entry: new_media.set(x, entry[x])
|
||||
if x in entry:
|
||||
new_media.set(x, entry[x])
|
||||
|
||||
# read text from subtitles if enabled
|
||||
if self.subtitles:
|
||||
for lang, val in (data.get('requested_subtitles') or {}).items():
|
||||
try:
|
||||
subs = pysubs2.load(val.get('filepath'), encoding="utf-8")
|
||||
for lang, val in (data.get("requested_subtitles") or {}).items():
|
||||
try:
|
||||
subs = pysubs2.load(val.get("filepath"), encoding="utf-8")
|
||||
text = " ".join([line.text for line in subs])
|
||||
new_media.set(f"subtitles_{lang}", text)
|
||||
except Exception as e:
|
||||
@@ -212,8 +278,8 @@ class GenericExtractor(Extractor):
|
||||
logger.error(f"Error processing entry {entry}: {e}")
|
||||
|
||||
return self.add_metadata(data, info_extractor, url, result)
|
||||
|
||||
def dropin_for_name(self, dropin_name: str, additional_paths = [], package=__package__) -> Type[InfoExtractor]:
|
||||
|
||||
def dropin_for_name(self, dropin_name: str, additional_paths=[], package=__package__) -> Type[InfoExtractor]:
|
||||
dropin_name = dropin_name.lower()
|
||||
|
||||
if dropin_name == "generic":
|
||||
@@ -221,6 +287,7 @@ class GenericExtractor(Extractor):
|
||||
return None
|
||||
|
||||
dropin_class_name = dropin_name.title()
|
||||
|
||||
def _load_dropin(dropin):
|
||||
dropin_class = getattr(dropin, dropin_class_name)()
|
||||
return self._dropins.setdefault(dropin_name, dropin_class)
|
||||
@@ -244,7 +311,7 @@ class GenericExtractor(Extractor):
|
||||
return _load_dropin(dropin)
|
||||
except (FileNotFoundError, ModuleNotFoundError):
|
||||
pass
|
||||
|
||||
|
||||
# fallback to loading the dropins within auto-archiver
|
||||
try:
|
||||
return _load_dropin(importlib.import_module(f".{dropin_name}", package=package))
|
||||
@@ -256,12 +323,12 @@ class GenericExtractor(Extractor):
|
||||
def download_for_extractor(self, info_extractor: InfoExtractor, url: str, ydl: yt_dlp.YoutubeDL) -> Metadata:
|
||||
"""
|
||||
Tries to download the given url using the specified extractor
|
||||
|
||||
|
||||
It first tries to use ytdlp directly to download the video. If the post is not a video, it will then try to
|
||||
use the extractor's _extract_post method to get the post metadata if possible.
|
||||
"""
|
||||
# when getting info without download, we also don't need the comments
|
||||
ydl.params['getcomments'] = False
|
||||
ydl.params["getcomments"] = False
|
||||
result = False
|
||||
|
||||
dropin_submodule = self.dropin_for_name(info_extractor.ie_key())
|
||||
@@ -272,7 +339,7 @@ class GenericExtractor(Extractor):
|
||||
|
||||
# don't download since it can be a live stream
|
||||
data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=False)
|
||||
if data.get('is_live', False) and not self.livestreams:
|
||||
if data.get("is_live", False) and not self.livestreams:
|
||||
logger.warning("Livestream detected, skipping due to 'livestreams' configuration setting")
|
||||
return False
|
||||
# it's a valid video, that the youtubdedl can download out of the box
|
||||
@@ -283,16 +350,21 @@ class GenericExtractor(Extractor):
|
||||
# don't clutter the logs with issues about the 'generic' extractor not having a dropin
|
||||
return False
|
||||
|
||||
logger.debug(f'Issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use extractor to get post data instead')
|
||||
logger.debug(
|
||||
f'Issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use extractor to get post data instead'
|
||||
)
|
||||
try:
|
||||
result = self.get_metadata_for_post(info_extractor, url, ydl)
|
||||
except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e:
|
||||
logger.error(f'Error downloading metadata for post: {post_e}')
|
||||
logger.error(f"Error downloading metadata for post: {post_e}")
|
||||
return False
|
||||
except Exception as generic_e:
|
||||
logger.debug(f'Attempt to extract using ytdlp extractor "{info_extractor.IE_NAME}" failed: \n {repr(generic_e)}', exc_info=True)
|
||||
logger.debug(
|
||||
f'Attempt to extract using ytdlp extractor "{info_extractor.IE_NAME}" failed: \n {repr(generic_e)}',
|
||||
exc_info=True,
|
||||
)
|
||||
return False
|
||||
|
||||
|
||||
if result:
|
||||
extractor_name = "yt-dlp"
|
||||
if info_extractor:
|
||||
@@ -308,43 +380,49 @@ class GenericExtractor(Extractor):
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
url = item.get_url()
|
||||
|
||||
#TODO: this is a temporary hack until this issue is closed: https://github.com/yt-dlp/yt-dlp/issues/11025
|
||||
# TODO: this is a temporary hack until this issue is closed: https://github.com/yt-dlp/yt-dlp/issues/11025
|
||||
if url.startswith("https://ya.ru"):
|
||||
url = url.replace("https://ya.ru", "https://yandex.ru")
|
||||
item.set("replaced_url", url)
|
||||
|
||||
ydl_options = {
|
||||
"outtmpl": os.path.join(self.tmp_dir, f"%(id)s.%(ext)s"),
|
||||
"quiet": False,
|
||||
"noplaylist": not self.allow_playlist,
|
||||
"writesubtitles": self.subtitles,
|
||||
"writeautomaticsub": self.subtitles,
|
||||
"live_from_start": self.live_from_start,
|
||||
"proxy": self.proxy,
|
||||
"max_downloads": self.max_downloads,
|
||||
"playlistend": self.max_downloads,
|
||||
}
|
||||
|
||||
ydl_options = {'outtmpl': os.path.join(self.tmp_dir, f'%(id)s.%(ext)s'),
|
||||
'quiet': False, 'noplaylist': not self.allow_playlist ,
|
||||
'writesubtitles': self.subtitles,'writeautomaticsub': self.subtitles,
|
||||
"live_from_start": self.live_from_start, "proxy": self.proxy,
|
||||
"max_downloads": self.max_downloads, "playlistend": self.max_downloads}
|
||||
|
||||
# set up auth
|
||||
auth = self.auth_for_site(url, extract_cookies=False)
|
||||
|
||||
# order of importance: username/pasword -> api_key -> cookie -> cookies_from_browser -> cookies_file
|
||||
if auth:
|
||||
if 'username' in auth and 'password' in auth:
|
||||
logger.debug(f'Using provided auth username and password for {url}')
|
||||
ydl_options['username'] = auth['username']
|
||||
ydl_options['password'] = auth['password']
|
||||
elif 'cookie' in auth:
|
||||
logger.debug(f'Using provided auth cookie for {url}')
|
||||
yt_dlp.utils.std_headers['cookie'] = auth['cookie']
|
||||
elif 'cookies_from_browser' in auth:
|
||||
logger.debug(f'Using extracted cookies from browser {auth["cookies_from_browser"]} for {url}')
|
||||
ydl_options['cookiesfrombrowser'] = auth['cookies_from_browser']
|
||||
elif 'cookies_file' in auth:
|
||||
logger.debug(f'Using cookies from file {auth["cookies_file"]} for {url}')
|
||||
ydl_options['cookiefile'] = auth['cookies_file']
|
||||
if "username" in auth and "password" in auth:
|
||||
logger.debug(f"Using provided auth username and password for {url}")
|
||||
ydl_options["username"] = auth["username"]
|
||||
ydl_options["password"] = auth["password"]
|
||||
elif "cookie" in auth:
|
||||
logger.debug(f"Using provided auth cookie for {url}")
|
||||
yt_dlp.utils.std_headers["cookie"] = auth["cookie"]
|
||||
elif "cookies_from_browser" in auth:
|
||||
logger.debug(f"Using extracted cookies from browser {auth['cookies_from_browser']} for {url}")
|
||||
ydl_options["cookiesfrombrowser"] = auth["cookies_from_browser"]
|
||||
elif "cookies_file" in auth:
|
||||
logger.debug(f"Using cookies from file {auth['cookies_file']} for {url}")
|
||||
ydl_options["cookiefile"] = auth["cookies_file"]
|
||||
|
||||
ydl = yt_dlp.YoutubeDL(ydl_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"
|
||||
ydl = yt_dlp.YoutubeDL(
|
||||
ydl_options
|
||||
) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"
|
||||
|
||||
for info_extractor in self.suitable_extractors(url):
|
||||
result = self.download_for_extractor(info_extractor, url, ydl)
|
||||
if result:
|
||||
return result
|
||||
|
||||
|
||||
return False
|
||||
|
||||
@@ -9,11 +9,11 @@ from dateutil.parser import parse as parse_dt
|
||||
|
||||
from .dropin import GenericDropin
|
||||
|
||||
class Truth(GenericDropin):
|
||||
|
||||
class Truth(GenericDropin):
|
||||
def extract_post(self, url, ie_instance: InfoExtractor) -> dict:
|
||||
video_id = ie_instance._match_id(url)
|
||||
truthsocial_url = f'https://truthsocial.com/api/v1/statuses/{video_id}'
|
||||
truthsocial_url = f"https://truthsocial.com/api/v1/statuses/{video_id}"
|
||||
return ie_instance._download_json(truthsocial_url, video_id)
|
||||
|
||||
def skip_ytdlp_download(self, url, ie_instance: Type[InfoExtractor]) -> bool:
|
||||
@@ -22,31 +22,42 @@ class Truth(GenericDropin):
|
||||
def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
|
||||
"""
|
||||
Creates metadata from a truth social post
|
||||
|
||||
|
||||
Only used for posts that contain no media. ytdlp.TruthIE extractor can handle posts with media
|
||||
|
||||
|
||||
Format is:
|
||||
|
||||
|
||||
{'id': '109598702184774628', 'created_at': '2022-12-29T19:51:18.161Z', 'in_reply_to_id': None, 'quote_id': None, 'in_reply_to_account_id': None, 'sensitive': False, 'spoiler_text': '', 'visibility': 'public', 'language': 'en', 'uri': 'https://truthsocial.com/@bbcnewa/109598702184774628', 'url': 'https://truthsocial.com/@bbcnewa/109598702184774628', 'content': '<p>Pele, regarded by many as football\'s greatest ever player, has died in Brazil at the age of 82. <a href="https://www.bbc.com/sport/football/42751517" rel="nofollow noopener noreferrer" target="_blank"><span class="invisible">https://www.</span><span class="ellipsis">bbc.com/sport/football/4275151</span><span class="invisible">7</span></a></p>', 'account': {'id': '107905163010312793', 'username': 'bbcnewa', 'acct': 'bbcnewa', 'display_name': 'BBC News', 'locked': False, 'bot': False, 'discoverable': True, 'group': False, 'created_at': '2022-03-05T17:42:01.159Z', 'note': '<p>News, features and analysis by the BBC</p>', 'url': 'https://truthsocial.com/@bbcnewa', 'avatar': 'https://static-assets-1.truthsocial.com/tmtg:prime-ts-assets/accounts/avatars/107/905/163/010/312/793/original/e7c07550dc22c23a.jpeg', 'avatar_static': 'https://static-assets-1.truthsocial.com/tmtg:prime-ts-assets/accounts/avatars/107/905/163/010/312/793/original/e7c07550dc22c23a.jpeg', 'header': 'https://static-assets-1.truthsocial.com/tmtg:prime-ts-assets/accounts/headers/107/905/163/010/312/793/original/a00eeec2b57206c7.jpeg', 'header_static': 'https://static-assets-1.truthsocial.com/tmtg:prime-ts-assets/accounts/headers/107/905/163/010/312/793/original/a00eeec2b57206c7.jpeg', 'followers_count': 1131, 'following_count': 3, 'statuses_count': 9, 'last_status_at': '2024-11-12', 'verified': False, 'location': '', 'website': 'https://www.bbc.com/news', 'unauth_visibility': True, 'chats_onboarded': True, 'feeds_onboarded': True, 'accepting_messages': False, 'show_nonmember_group_statuses': None, 'emojis': [], 'fields': [], 'tv_onboarded': True, 'tv_account': False}, 'media_attachments': [], 'mentions': [], 'tags': [], 'card': None, 'group': None, 'quote': None, 'in_reply_to': None, 'reblog': None, 'sponsored': False, 'replies_count': 1, 'reblogs_count': 0, 'favourites_count': 2, 'favourited': False, 'reblogged': False, 'muted': False, 'pinned': False, 'bookmarked': False, 'poll': None, 'emojis': []}
|
||||
"""
|
||||
|
||||
result = Metadata()
|
||||
result.set_url(url)
|
||||
timestamp = post['created_at'] # format is 2022-12-29T19:51:18.161Z
|
||||
timestamp = post["created_at"] # format is 2022-12-29T19:51:18.161Z
|
||||
result.set_timestamp(parse_dt(timestamp))
|
||||
result.set('description', post['content'])
|
||||
result.set('author', post['account']['username'])
|
||||
result.set("description", post["content"])
|
||||
result.set("author", post["account"]["username"])
|
||||
|
||||
for key in ['replies_count', 'reblogs_count', 'favourites_count', ('account', 'followers_count'), ('account', 'following_count'), ('account', 'statuses_count'), ('account', 'display_name'), 'language', 'in_reply_to_account', 'replies_count']:
|
||||
for key in [
|
||||
"replies_count",
|
||||
"reblogs_count",
|
||||
"favourites_count",
|
||||
("account", "followers_count"),
|
||||
("account", "following_count"),
|
||||
("account", "statuses_count"),
|
||||
("account", "display_name"),
|
||||
"language",
|
||||
"in_reply_to_account",
|
||||
"replies_count",
|
||||
]:
|
||||
if isinstance(key, tuple):
|
||||
store_key = " ".join(key)
|
||||
else:
|
||||
store_key = key
|
||||
result.set(store_key, traverse_obj(post, key))
|
||||
|
||||
# add the media
|
||||
for media in post.get('media_attachments', []):
|
||||
filename = archiver.download_from_url(media['url'])
|
||||
result.add_media(Media(filename), id=media.get('id'))
|
||||
|
||||
return result
|
||||
# add the media
|
||||
for media in post.get("media_attachments", []):
|
||||
filename = archiver.download_from_url(media["url"])
|
||||
result.add_media(Media(filename), id=media.get("id"))
|
||||
|
||||
return result
|
||||
|
||||
@@ -10,9 +10,8 @@ from auto_archiver.core.extractor import Extractor
|
||||
|
||||
from .dropin import GenericDropin, InfoExtractor
|
||||
|
||||
|
||||
class Twitter(GenericDropin):
|
||||
|
||||
|
||||
def choose_variant(self, variants):
|
||||
# choosing the highest quality possible
|
||||
variant, width, height = None, 0, 0
|
||||
@@ -27,9 +26,9 @@ class Twitter(GenericDropin):
|
||||
else:
|
||||
variant = var if not variant else variant
|
||||
return variant
|
||||
|
||||
|
||||
def extract_post(self, url: str, ie_instance: InfoExtractor):
|
||||
twid = ie_instance._match_valid_url(url).group('id')
|
||||
twid = ie_instance._match_valid_url(url).group("id")
|
||||
return ie_instance._extract_status(twid=twid)
|
||||
|
||||
def create_metadata(self, tweet: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
|
||||
@@ -41,30 +40,29 @@ class Twitter(GenericDropin):
|
||||
except (ValueError, KeyError) as ex:
|
||||
logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
|
||||
return False
|
||||
|
||||
result\
|
||||
.set_title(tweet.get('full_text', ''))\
|
||||
.set_content(json.dumps(tweet, ensure_ascii=False))\
|
||||
.set_timestamp(timestamp)
|
||||
|
||||
result.set_title(tweet.get("full_text", "")).set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(
|
||||
timestamp
|
||||
)
|
||||
if not tweet.get("entities", {}).get("media"):
|
||||
logger.debug('No media found, archiving tweet text only')
|
||||
logger.debug("No media found, archiving tweet text only")
|
||||
result.status = "twitter-ytdl"
|
||||
return result
|
||||
for i, tw_media in enumerate(tweet["entities"]["media"]):
|
||||
media = Media(filename="")
|
||||
mimetype = ""
|
||||
if tw_media["type"] == "photo":
|
||||
media.set("src", UrlUtil.twitter_best_quality_url(tw_media['media_url_https']))
|
||||
media.set("src", UrlUtil.twitter_best_quality_url(tw_media["media_url_https"]))
|
||||
mimetype = "image/jpeg"
|
||||
elif tw_media["type"] == "video":
|
||||
variant = self.choose_variant(tw_media['video_info']['variants'])
|
||||
media.set("src", variant['url'])
|
||||
mimetype = variant['content_type']
|
||||
variant = self.choose_variant(tw_media["video_info"]["variants"])
|
||||
media.set("src", variant["url"])
|
||||
mimetype = variant["content_type"]
|
||||
elif tw_media["type"] == "animated_gif":
|
||||
variant = tw_media['video_info']['variants'][0]
|
||||
media.set("src", variant['url'])
|
||||
mimetype = variant['content_type']
|
||||
variant = tw_media["video_info"]["variants"][0]
|
||||
media.set("src", variant["url"])
|
||||
mimetype = variant["content_type"]
|
||||
ext = mimetypes.guess_extension(mimetype)
|
||||
media.filename = archiver.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}')
|
||||
media.filename = archiver.download_from_url(media.get("src"), f"{slugify(url)}_{i}{ext}")
|
||||
result.add_media(media)
|
||||
return result
|
||||
return result
|
||||
|
||||
Reference in New Issue
Block a user