Merge branch 'main' into feat/yt-dlp-pots

# Conflicts:
#	src/auto_archiver/modules/generic_extractor/__manifest__.py
#	tests/test_modules.py
This commit is contained in:
erinhmclark
2025-03-25 15:16:31 +00:00
28 changed files with 586 additions and 121 deletions

View File

@@ -74,10 +74,6 @@ If you are having issues with the extractor, you can review the version of `yt-d
"default": "inf",
"help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit.",
},
"pot_provider": {
"default": "bgutils",
"help": "The Proof of origin provider method.",
},
"extractor_args": {
"default": {},
"help": "Additional arguments to pass to the yt-dlp extractor. See https://github.com/yt-dlp/yt-dlp/blob/master/README.md#extractor-arguments.",

View File

@@ -1,6 +1,5 @@
import re
import mimetypes
import json
from loguru import logger
from slugify import slugify
@@ -32,6 +31,9 @@ class Twitter(GenericDropin):
twid = ie_instance._match_valid_url(url).group("id")
return ie_instance._extract_status(twid=twid)
def keys_to_clean(self, video_data, info_extractor):
return ["user", "created_at", "entities", "favorited", "translator_type"]
def create_metadata(self, tweet: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
result = Metadata()
try:
@@ -42,9 +44,11 @@ class Twitter(GenericDropin):
logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
return False
result.set_title(tweet.get("full_text", "")).set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(
timestamp
)
full_text = tweet.pop("full_text", "")
author = tweet["user"].get("name", "")
result.set("author", author).set_url(url)
result.set_title(f"{author} - {full_text}").set_content(full_text).set_timestamp(timestamp)
if not tweet.get("entities", {}).get("media"):
logger.debug("No media found, archiving tweet text only")
result.status = "twitter-ytdl"