Merge branch 'main' into feat/yt-dlp-pots

# Conflicts:
#   src/auto_archiver/modules/generic_extractor/__manifest__.py
#   tests/test_modules.py
2026-06-12 13:18:28 +03:00 · 2025-03-25 15:16:31 +00:00
parent 93921e71d4 5c6005d843
commit b4c33318c4
28 changed files with 586 additions and 121 deletions
--- a/src/auto_archiver/modules/generic_extractor/__manifest__.py
+++ b/src/auto_archiver/modules/generic_extractor/__manifest__.py
@@ -74,10 +74,6 @@ If you are having issues with the extractor, you can review the version of `yt-d
            "default": "inf",
            "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit.",
        },
-        "pot_provider": {
-            "default": "bgutils",
-            "help": "The Proof of origin provider method.",
-        },
        "extractor_args": {
            "default": {},
            "help": "Additional arguments to pass to the yt-dlp extractor. See https://github.com/yt-dlp/yt-dlp/blob/master/README.md#extractor-arguments.",
--- a/src/auto_archiver/modules/generic_extractor/twitter.py
+++ b/src/auto_archiver/modules/generic_extractor/twitter.py
@@ -1,6 +1,5 @@
 import re
 import mimetypes
-import json

 from loguru import logger
 from slugify import slugify
@@ -32,6 +31,9 @@ class Twitter(GenericDropin):
        twid = ie_instance._match_valid_url(url).group("id")
        return ie_instance._extract_status(twid=twid)

+    def keys_to_clean(self, video_data, info_extractor):
+        return ["user", "created_at", "entities", "favorited", "translator_type"]
+
    def create_metadata(self, tweet: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
        result = Metadata()
        try:
@@ -42,9 +44,11 @@ class Twitter(GenericDropin):
            logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
            return False

-        result.set_title(tweet.get("full_text", "")).set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(
-            timestamp
-        )
+        full_text = tweet.pop("full_text", "")
+        author = tweet["user"].get("name", "")
+        result.set("author", author).set_url(url)
+
+        result.set_title(f"{author} - {full_text}").set_content(full_text).set_timestamp(timestamp)
        if not tweet.get("entities", {}).get("media"):
            logger.debug("No media found, archiving tweet text only")
            result.status = "twitter-ytdl"