More manifests, base modules and rename from archiver to extractor.

2026-06-12 05:08:28 +03:00 · 2025-01-23 16:40:48 +00:00
parent 9db26cdfc2
commit 1274a1b231
93 changed files with 378 additions and 238 deletions
--- a/src/auto_archiver/modules/twitter_api_extractor/init.py
+++ b/src/auto_archiver/modules/twitter_api_extractor/init.py
--- a/src/auto_archiver/modules/twitter_api_extractor/manifest.py
+++ b/src/auto_archiver/modules/twitter_api_extractor/manifest.py
@@ -0,0 +1,44 @@
+# Module manifest for the Twitter API extractor: a single dict literal giving
+# the module's display name, type, dependencies, config options and description.
+{
+    "name": "Twitter API Extractor",
+    "type": ["extractor"],
+    "requires_setup": True,
+    # Packages named here mirror the third-party imports of the extractor module.
+    "external_dependencies": {
+        "python": ["requests",
+                   "loguru",
+                   "pytwitter",
+                   "slugify",],
+        # NOTE(review): a single empty string is listed; presumably this means
+        # "no external binaries are needed" - confirm against the manifest loader.
+        "bin": [""]
+    },
+    # Each entry maps a config option name to its default value and help text.
+    "configs": {
+            "bearer_token": {"default": None, "help": "[deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret"},
+            # cli_set splits the command-line value on commas and de-duplicates it
+            # through a set, so the order of the resulting tokens is not preserved;
+            # the current value (cur_val) is ignored.
+            "bearer_tokens": {"default": [], "help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line",
+                              "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))},
+            "consumer_key": {"default": None, "help": "twitter API consumer_key"},
+            "consumer_secret": {"default": None, "help": "twitter API consumer_secret"},
+            "access_token": {"default": None, "help": "twitter API access_token"},
+            "access_secret": {"default": None, "help": "twitter API access_secret"},
+        },
+    # Markdown description of the module (features and setup instructions).
+    "description": """
+        The `TwitterApiExtractor` fetches tweets and associated media using the Twitter API. 
+        It supports multiple API configurations for extended rate limits and reliable access. 
+        Features include URL expansion, media downloads (e.g., images, videos), and structured output 
+        via `Metadata` and `Media` objects. Requires Twitter API credentials such as bearer tokens 
+        or consumer key/secret and access token/secret.
+        
+        ### Features
+        - Fetches tweets and their metadata, including text, creation timestamp, and author information.
+        - Downloads media attachments (e.g., images, videos) in high quality.
+        - Supports multiple API configurations for improved rate limiting.
+        - Expands shortened URLs (e.g., `t.co` links).
+        - Outputs structured metadata and media using `Metadata` and `Media` objects.
+        
+        ### Setup
+        To use the `TwitterApiExtractor`, you must provide valid Twitter API credentials via configuration:
+        - **Bearer Token(s)**: A single token or a list for rate-limited API access.
+        - **Consumer Key and Secret**: Required for user-authenticated API access.
+        - **Access Token and Secret**: Complements the consumer key for enhanced API capabilities.
+        
+        Credentials can be obtained by creating a Twitter developer account at [Twitter Developer Platform](https://developer.twitter.com/en).
+        """
+,
+}
--- a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_archiver.py
+++ b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_archiver.py
@@ -0,0 +1,132 @@
+import json
+import re
+import mimetypes
+import requests
+from datetime import datetime
+
+from loguru import logger
+from pytwitter import Api
+from slugify import slugify
+
+from auto_archiver.base_modules import Extractor
+from auto_archiver.core import Metadata,Media
+
+class TwitterApiExtractor(Extractor):
+    name = "twitter_api_extractor"
+    link_pattern = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
+
+    def __init__(self, config: dict) -> None:
+        super().__init__(config)
+
+        self.api_index = 0
+        self.apis = []
+        if len(self.bearer_tokens):
+            self.apis.extend([Api(bearer_token=bearer_token) for bearer_token in self.bearer_tokens])
+        if self.bearer_token:
+            self.assert_valid_string("bearer_token")
+            self.apis.append(Api(bearer_token=self.bearer_token))
+        if self.consumer_key and self.consumer_secret and self.access_token and self.access_secret:
+            self.assert_valid_string("consumer_key")
+            self.assert_valid_string("consumer_secret")
+            self.assert_valid_string("access_token")
+            self.assert_valid_string("access_secret")
+            self.apis.append(Api(consumer_key=self.consumer_key, consumer_secret=self.consumer_secret,
+                             access_token=self.access_token, access_secret=self.access_secret))
+        assert self.api_client is not None, "Missing Twitter API configurations, please provide either AND/OR (consumer_key, consumer_secret, access_token, access_secret) to use this archiver, you can provide both for better rate-limit results."
+
+    @property  # getter .mimetype
+    def api_client(self) -> str:
+        return self.apis[self.api_index]
+    
+    def sanitize_url(self, url: str) -> str:
+        # expand URL if t.co and clean tracker GET params
+        if 'https://t.co/' in url:
+            try:
+                r = requests.get(url, timeout=30)
+                logger.debug(f'Expanded url {url} to {r.url}')
+                url = r.url
+            except:
+                logger.error(f'Failed to expand url {url}')
+        return url
+
+
+    def download(self, item: Metadata) -> Metadata:
+        # call download retry until success or no more apis
+        while self.api_index < len(self.apis):
+            if res := self.download_retry(item): return res
+            self.api_index += 1
+        self.api_index = 0
+        return False
+
+    def get_username_tweet_id(self, url):
+        # detect URLs that we definitely cannot handle
+        matches = self.link_pattern.findall(url)
+        if not len(matches): return False, False
+
+        username, tweet_id = matches[0]  # only one URL supported
+        logger.debug(f"Found {username=} and {tweet_id=} in {url=}")
+
+        return username, tweet_id
+
+    def download_retry(self, item: Metadata) -> Metadata:
+        url = item.get_url()
+        # detect URLs that we definitely cannot handle
+        username, tweet_id = self.get_username_tweet_id(url)
+        if not username: return False
+
+        try:
+            tweet = self.api_client.get_tweet(tweet_id, expansions=["attachments.media_keys"], media_fields=["type", "duration_ms", "url", "variants"], tweet_fields=["attachments", "author_id", "created_at", "entities", "id", "text", "possibly_sensitive"])
+            logger.debug(tweet)
+        except Exception as e:
+            logger.error(f"Could not get tweet: {e}")
+            return False
+
+        result = Metadata()
+        result.set_title(tweet.data.text)
+        result.set_timestamp(datetime.strptime(tweet.data.created_at, "%Y-%m-%dT%H:%M:%S.%fZ"))
+
+        urls = []
+        if tweet.includes:
+            for i, m in enumerate(tweet.includes.media):
+                media = Media(filename="")
+                if m.url and len(m.url):
+                    media.set("src", m.url)
+                    media.set("duration", (m.duration_ms or 1) // 1000)
+                    mimetype = "image/jpeg"
+                elif hasattr(m, "variants"):
+                    variant = self.choose_variant(m.variants)
+                    if not variant: continue
+                    media.set("src", variant.url)
+                    mimetype = variant.content_type
+                else:
+                    continue
+                logger.info(f"Found media {media}")
+                ext = mimetypes.guess_extension(mimetype)
+                media.filename = self.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}')
+                result.add_media(media)
+
+        result.set_content(json.dumps({
+            "id": tweet.data.id,
+            "text": tweet.data.text,
+            "created_at": tweet.data.created_at,
+            "author_id": tweet.data.author_id,
+            "geo": tweet.data.geo,
+            "lang": tweet.data.lang,
+            "media": urls
+        }, ensure_ascii=False, indent=4))
+        return result.success("twitter-api")
+
+    def choose_variant(self, variants):
+
+        """
+        Chooses the highest quality variable possible out of a list of variants
+        """
+        variant, bit_rate = None, -1
+        for var in variants:
+            if var.content_type == "video/mp4":
+                if var.bit_rate > bit_rate:
+                    bit_rate = var.bit_rate
+                    variant = var
+            else:
+                variant = var if not variant else variant
+        return variant