More manifests, base modules and rename from archiver to extractor.

This commit is contained in:
erinhmclark
2025-01-23 16:40:48 +00:00
parent 9db26cdfc2
commit 1274a1b231
93 changed files with 378 additions and 238 deletions

View File

@@ -0,0 +1,44 @@
{
"name": "Twitter API Extractor",
"type": ["extractor"],
"requires_setup": True,
"external_dependencies": {
"python": ["requests",
"loguru",
"pytwitter",
"slugify",],
"bin": [""]
},
"configs": {
"bearer_token": {"default": None, "help": "[deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret"},
"bearer_tokens": {"default": [], "help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line",
"cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))},
"consumer_key": {"default": None, "help": "twitter API consumer_key"},
"consumer_secret": {"default": None, "help": "twitter API consumer_secret"},
"access_token": {"default": None, "help": "twitter API access_token"},
"access_secret": {"default": None, "help": "twitter API access_secret"},
},
"description": """
The `TwitterApiExtractor` fetches tweets and associated media using the Twitter API.
It supports multiple API configurations for extended rate limits and reliable access.
Features include URL expansion, media downloads (e.g., images, videos), and structured output
via `Metadata` and `Media` objects. Requires Twitter API credentials such as bearer tokens
or consumer key/secret and access token/secret.
### Features
- Fetches tweets and their metadata, including text, creation timestamp, and author information.
- Downloads media attachments (e.g., images, videos) in high quality.
- Supports multiple API configurations for improved rate limiting.
- Expands shortened URLs (e.g., `t.co` links).
- Outputs structured metadata and media using `Metadata` and `Media` objects.
### Setup
To use the `TwitterApiExtractor`, you must provide valid Twitter API credentials via configuration:
- **Bearer Token(s)**: A single token or a list for rate-limited API access.
- **Consumer Key and Secret**: Required for user-authenticated API access.
- **Access Token and Secret**: Complements the consumer key for enhanced API capabilities.
Credentials can be obtained by creating a Twitter developer account at [Twitter Developer Platform](https://developer.twitter.com/en).
"""
,
}

View File

@@ -0,0 +1,132 @@
import json
import re
import mimetypes
import requests
from datetime import datetime
from loguru import logger
from pytwitter import Api
from slugify import slugify
from auto_archiver.base_modules import Extractor
from auto_archiver.core import Metadata,Media
class TwitterApiExtractor(Extractor):
name = "twitter_api_extractor"
link_pattern = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
def __init__(self, config: dict) -> None:
super().__init__(config)
self.api_index = 0
self.apis = []
if len(self.bearer_tokens):
self.apis.extend([Api(bearer_token=bearer_token) for bearer_token in self.bearer_tokens])
if self.bearer_token:
self.assert_valid_string("bearer_token")
self.apis.append(Api(bearer_token=self.bearer_token))
if self.consumer_key and self.consumer_secret and self.access_token and self.access_secret:
self.assert_valid_string("consumer_key")
self.assert_valid_string("consumer_secret")
self.assert_valid_string("access_token")
self.assert_valid_string("access_secret")
self.apis.append(Api(consumer_key=self.consumer_key, consumer_secret=self.consumer_secret,
access_token=self.access_token, access_secret=self.access_secret))
assert self.api_client is not None, "Missing Twitter API configurations, please provide either AND/OR (consumer_key, consumer_secret, access_token, access_secret) to use this archiver, you can provide both for better rate-limit results."
@property # getter .mimetype
def api_client(self) -> str:
return self.apis[self.api_index]
def sanitize_url(self, url: str) -> str:
# expand URL if t.co and clean tracker GET params
if 'https://t.co/' in url:
try:
r = requests.get(url, timeout=30)
logger.debug(f'Expanded url {url} to {r.url}')
url = r.url
except:
logger.error(f'Failed to expand url {url}')
return url
def download(self, item: Metadata) -> Metadata:
# call download retry until success or no more apis
while self.api_index < len(self.apis):
if res := self.download_retry(item): return res
self.api_index += 1
self.api_index = 0
return False
def get_username_tweet_id(self, url):
# detect URLs that we definitely cannot handle
matches = self.link_pattern.findall(url)
if not len(matches): return False, False
username, tweet_id = matches[0] # only one URL supported
logger.debug(f"Found {username=} and {tweet_id=} in {url=}")
return username, tweet_id
def download_retry(self, item: Metadata) -> Metadata:
url = item.get_url()
# detect URLs that we definitely cannot handle
username, tweet_id = self.get_username_tweet_id(url)
if not username: return False
try:
tweet = self.api_client.get_tweet(tweet_id, expansions=["attachments.media_keys"], media_fields=["type", "duration_ms", "url", "variants"], tweet_fields=["attachments", "author_id", "created_at", "entities", "id", "text", "possibly_sensitive"])
logger.debug(tweet)
except Exception as e:
logger.error(f"Could not get tweet: {e}")
return False
result = Metadata()
result.set_title(tweet.data.text)
result.set_timestamp(datetime.strptime(tweet.data.created_at, "%Y-%m-%dT%H:%M:%S.%fZ"))
urls = []
if tweet.includes:
for i, m in enumerate(tweet.includes.media):
media = Media(filename="")
if m.url and len(m.url):
media.set("src", m.url)
media.set("duration", (m.duration_ms or 1) // 1000)
mimetype = "image/jpeg"
elif hasattr(m, "variants"):
variant = self.choose_variant(m.variants)
if not variant: continue
media.set("src", variant.url)
mimetype = variant.content_type
else:
continue
logger.info(f"Found media {media}")
ext = mimetypes.guess_extension(mimetype)
media.filename = self.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}')
result.add_media(media)
result.set_content(json.dumps({
"id": tweet.data.id,
"text": tweet.data.text,
"created_at": tweet.data.created_at,
"author_id": tweet.data.author_id,
"geo": tweet.data.geo,
"lang": tweet.data.lang,
"media": urls
}, ensure_ascii=False, indent=4))
return result.success("twitter-api")
def choose_variant(self, variants):
"""
Chooses the highest quality variable possible out of a list of variants
"""
variant, bit_rate = None, -1
for var in variants:
if var.content_type == "video/mp4":
if var.bit_rate > bit_rate:
bit_rate = var.bit_rate
variant = var
else:
variant = var if not variant else variant
return variant