mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 05:08:28 +03:00
More manifests, base modules and rename from archiver to extractor.
This commit is contained in:
@@ -0,0 +1,44 @@
|
||||
# Module manifest for the Twitter API extractor: display name, module type,
# declared dependencies, user-facing configuration options and description.
{
    "name": "Twitter API Extractor",
    "type": ["extractor"],
    "requires_setup": True,
    "external_dependencies": {
        "python": [
            "requests",
            "loguru",
            "pytwitter",
            "slugify",
        ],
        # NOTE(review): this declares a single empty-named binary; presumably
        # it means "no binaries required" -- confirm how the loader treats "".
        "bin": [""],
    },
    "configs": {
        "bearer_token": {"default": None, "help": "[deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret"},
        "bearer_tokens": {
            "default": [],
            "help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line",
            # Parse the CSV given on the command line into a list of tokens.
            # dict.fromkeys de-duplicates while keeping the order in which the
            # tokens were given (a set would shuffle them); surrounding
            # whitespace is stripped and empty entries (e.g. from "" or a
            # trailing comma) are dropped so no blank token is ever produced.
            "cli_set": lambda cli_val, cur_val: list(
                dict.fromkeys(tok for tok in map(str.strip, cli_val.split(",")) if tok)
            ),
        },
        "consumer_key": {"default": None, "help": "twitter API consumer_key"},
        "consumer_secret": {"default": None, "help": "twitter API consumer_secret"},
        "access_token": {"default": None, "help": "twitter API access_token"},
        "access_secret": {"default": None, "help": "twitter API access_secret"},
    },
    "description": """
The `TwitterApiExtractor` fetches tweets and associated media using the Twitter API.
It supports multiple API configurations for extended rate limits and reliable access.
Features include URL expansion, media downloads (e.g., images, videos), and structured output
via `Metadata` and `Media` objects. Requires Twitter API credentials such as bearer tokens
or consumer key/secret and access token/secret.

### Features
- Fetches tweets and their metadata, including text, creation timestamp, and author information.
- Downloads media attachments (e.g., images, videos) in high quality.
- Supports multiple API configurations for improved rate limiting.
- Expands shortened URLs (e.g., `t.co` links).
- Outputs structured metadata and media using `Metadata` and `Media` objects.

### Setup
To use the `TwitterApiExtractor`, you must provide valid Twitter API credentials via configuration:
- **Bearer Token(s)**: A single token or a list for rate-limited API access.
- **Consumer Key and Secret**: Required for user-authenticated API access.
- **Access Token and Secret**: Complements the consumer key for enhanced API capabilities.

Credentials can be obtained by creating a Twitter developer account at [Twitter Developer Platform](https://developer.twitter.com/en).
""",
}
|
||||
@@ -0,0 +1,132 @@
|
||||
import json
|
||||
import re
|
||||
import mimetypes
|
||||
import requests
|
||||
from datetime import datetime
|
||||
|
||||
from loguru import logger
|
||||
from pytwitter import Api
|
||||
from slugify import slugify
|
||||
|
||||
from auto_archiver.base_modules import Extractor
|
||||
from auto_archiver.core import Metadata,Media
|
||||
|
||||
class TwitterApiExtractor(Extractor):
    """Extractor that fetches a tweet and its media through the Twitter API.

    One ``pytwitter.Api`` client is built per configured credential set
    (each bearer token, plus the optional consumer/access key set).
    ``download`` tries the clients in order until one of them succeeds.
    """

    name = "twitter_api_extractor"
    # Captures (username, tweet_id). The dot is escaped and the host must not
    # be preceded by a word character or hyphen, so that e.g.
    # "netflix.com/u/status/1" is not mistaken for "x.com/u/status/1".
    link_pattern = re.compile(r"(?<![\w-])(?:twitter|x)\.com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")

    def __init__(self, config: dict) -> None:
        """Build the list of API clients from the configured credentials.

        Args:
            config: configuration passed through to the base class.

        Raises:
            AssertionError: if no usable credential set is configured.
        """
        super().__init__(config)

        self.api_index = 0
        self.apis = []
        if self.bearer_tokens:
            self.apis.extend([Api(bearer_token=bearer_token) for bearer_token in self.bearer_tokens])
        if self.bearer_token:
            self.assert_valid_string("bearer_token")
            self.apis.append(Api(bearer_token=self.bearer_token))

        oauth_values = (self.consumer_key, self.consumer_secret, self.access_token, self.access_secret)
        if all(oauth_values):
            self.assert_valid_string("consumer_key")
            self.assert_valid_string("consumer_secret")
            self.assert_valid_string("access_token")
            self.assert_valid_string("access_secret")
            self.apis.append(Api(consumer_key=self.consumer_key, consumer_secret=self.consumer_secret,
                                 access_token=self.access_token, access_secret=self.access_secret))
        elif any(oauth_values):
            # A partial key set was previously ignored without any notice.
            logger.warning("Incomplete Twitter API key set: consumer_key, consumer_secret, access_token and access_secret must all be provided, ignoring them.")

        # Checked on the list itself: indexing an empty list through
        # `api_client` would raise IndexError before any message is shown,
        # and an explicit raise is not stripped when running with -O.
        if not self.apis:
            raise AssertionError("Missing Twitter API configurations, please provide either bearer_tokens AND/OR (consumer_key, consumer_secret, access_token, access_secret) to use this archiver, you can provide both for better rate-limit results.")

    @property
    def api_client(self) -> Api:
        """Return the API client currently selected by ``api_index``."""
        return self.apis[self.api_index]

    def sanitize_url(self, url: str) -> str:
        """Expand a ``t.co`` short link to the URL it redirects to.

        Best-effort: if the request fails the error is logged and the URL is
        returned unchanged. URLs that do not contain ``https://t.co/`` are
        returned as-is without any network access.
        """
        if 'https://t.co/' in url:
            try:
                r = requests.get(url, timeout=30)
                logger.debug(f'Expanded url {url} to {r.url}')
                url = r.url
            except Exception as e:
                logger.error(f'Failed to expand url {url}: {e}')
        return url

    def download(self, item: Metadata) -> Metadata:
        """Try every configured API client in turn until one succeeds.

        Returns:
            The result of ``download_retry`` for the first client that
            succeeds; ``api_index`` is left on that client. If every client
            fails, ``api_index`` is reset to 0 and ``False`` is returned.
        """
        while self.api_index < len(self.apis):
            if res := self.download_retry(item):
                return res
            self.api_index += 1
        self.api_index = 0
        return False

    def get_username_tweet_id(self, url):
        """Extract ``(username, tweet_id)`` from a tweet URL.

        Returns ``(False, False)`` when the URL does not match
        ``link_pattern``. Only the first match in the URL is used.
        """
        matches = self.link_pattern.findall(url)
        if not matches:
            return False, False

        username, tweet_id = matches[0]  # only one URL supported
        logger.debug(f"Found {username=} and {tweet_id=} in {url=}")

        return username, tweet_id

    def download_retry(self, item: Metadata) -> Metadata:
        """Fetch the tweet for ``item`` with the currently selected client.

        Returns:
            A ``Metadata`` marked successful with ``"twitter-api"``, holding
            the tweet text as title, its timestamp, its media and a JSON
            summary as content; or ``False`` if the URL is not a tweet URL
            or the API call fails.
        """
        url = item.get_url()
        # detect URLs that we definitely cannot handle
        username, tweet_id = self.get_username_tweet_id(url)
        if not username:
            return False

        try:
            tweet = self.api_client.get_tweet(tweet_id, expansions=["attachments.media_keys"], media_fields=["type", "duration_ms", "url", "variants"], tweet_fields=["attachments", "author_id", "created_at", "entities", "id", "text", "possibly_sensitive"])
            logger.debug(tweet)
        except Exception as e:
            logger.error(f"Could not get tweet: {e}")
            return False

        # Defensive: a response without tweet data cannot be archived; report
        # failure so the next client can be tried instead of crashing below.
        if getattr(tweet, "data", None) is None:
            logger.error(f"Could not get tweet: no data returned for {tweet_id=}")
            return False

        result = Metadata()
        result.set_title(tweet.data.text)
        result.set_timestamp(datetime.strptime(tweet.data.created_at, "%Y-%m-%dT%H:%M:%S.%fZ"))

        urls = []
        # `includes` (or its `media`) may be missing for tweets without media.
        included_media = (tweet.includes.media if tweet.includes else None) or []
        for i, m in enumerate(included_media):
            media = Media(filename="")
            if m.url:
                media.set("src", m.url)
                media.set("duration", (m.duration_ms or 1) // 1000)
                mimetype = "image/jpeg"
            elif getattr(m, "variants", None):
                variant = self.choose_variant(m.variants)
                if not variant:
                    continue
                media.set("src", variant.url)
                mimetype = variant.content_type
            else:
                continue
            logger.info(f"Found media {media}")
            # guess_extension returns None for unknown types; fall back to no
            # extension rather than a filename ending in the text "None".
            ext = mimetypes.guess_extension(mimetype) or ""
            media.filename = self.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}')
            result.add_media(media)
            # Record the source so the "media" entry of the JSON content
            # below actually lists what was found (it used to stay empty).
            urls.append(media.get("src"))

        result.set_content(json.dumps({
            "id": tweet.data.id,
            "text": tweet.data.text,
            "created_at": tweet.data.created_at,
            "author_id": tweet.data.author_id,
            "geo": tweet.data.geo,
            "lang": tweet.data.lang,
            "media": urls
        }, ensure_ascii=False, indent=4))
        return result.success("twitter-api")

    def choose_variant(self, variants):
        """Choose the highest quality variant possible out of a list of variants.

        ``video/mp4`` variants are preferred and, among them, the one with
        the highest ``bit_rate`` wins (a missing bit rate counts as 0). If
        there is no mp4 variant, the first variant of any other type is
        returned. Returns ``None`` when ``variants`` is empty or ``None``.
        """
        variant, bit_rate = None, -1
        for var in variants or []:
            if var.content_type == "video/mp4":
                var_bit_rate = var.bit_rate or 0
                if var_bit_rate > bit_rate:
                    bit_rate = var_bit_rate
                    variant = var
            elif not variant:
                variant = var
        return variant
|
||||
Reference in New Issue
Block a user