Files
auto-archiver/archivers/twitter_api_archiver.py
Ed Summers c34fb9cf10 Add browsertrix profile config option
This commit adds a browsertrix profile option to the configuration. In
order to not require the passing of the browsertrix config to every
Archiver, the Archiver constructors (including the base) were modified to
accept a Storage and Config instance. Some of the constructors then pick
out the pieces they need from the Config, in addition to calling the
parent constructor. In order to avoid a circular import that this
created, the Config object now defines the default hash function to use,
rather than having it be a static property of the Archiver class.
2022-10-11 16:21:42 -04:00

75 lines
3.1 KiB
Python

import json
from datetime import datetime
from loguru import logger
from pytwitter import Api
from storages.base_storage import Storage
from configs import Config
from .base_archiver import ArchiveResult
from .twitter_archiver import TwitterArchiver
class TwitterApiArchiver(TwitterArchiver):
    """Archive a tweet and its media using the ``pytwitter`` ``Api`` client.

    Credentials are read from ``config.twitter_config``. When none are
    configured the archiver is still constructed, but ``download`` logs a
    warning and returns ``False``.
    """

    name = "twitter_api"

    def __init__(self, storage: Storage, config: Config):
        """Build the API client from whichever credentials are configured.

        A bearer token takes precedence; otherwise all four of consumer
        key/secret and access token/secret must be set. If neither form is
        available, ``self.api`` is deliberately left unset and ``download``
        detects that.
        """
        super().__init__(storage, config)
        c = config.twitter_config

        if c.bearer_token:
            self.api = Api(bearer_token=c.bearer_token)
        elif c.consumer_key and c.consumer_secret and c.access_token and c.access_secret:
            self.api = Api(
                consumer_key=c.consumer_key,
                consumer_secret=c.consumer_secret,
                access_token=c.access_token,
                access_secret=c.access_secret,
            )

    def _collect_media_urls(self, tweet):
        """Return one URL per media item attached to ``tweet``.

        An entry is ``None`` when no URL could be determined for that item,
        which the caller treats as a signal to use the fallback downloader.
        """
        includes = tweet.includes
        # ``includes`` may be missing entirely, or present without any media.
        media_items = (getattr(includes, "media", None) if includes else None) or []

        urls = []
        for m in media_items:
            if m.url:
                urls.append(m.url)
            elif getattr(m, "variants", None):
                # Check the value, not just attribute presence: an attribute
                # that exists but is None/empty must not reach choose_variant.
                urls.append(self.choose_variant(m.variants))
            else:
                urls.append(None)  # will trigger the fallback in download()
        return urls

    def download(self, url, check_if_exists=False):
        """Archive the tweet at ``url``.

        Args:
            url: URL of the tweet to archive.
            check_if_exists: when true and storage already holds the HTML key
                for ``url``, return an 'already archived' result instead of
                generating the media page again.

        Returns:
            ``False`` when the API client is not configured or no username
            could be extracted from ``url``; the result of
            ``download_alternative`` when a media item has no usable URL;
            otherwise an ``ArchiveResult``.

        Note:
            Errors raised by the API call or by timestamp parsing are not
            caught here and propagate to the caller.
        """
        if not hasattr(self, "api"):
            logger.warning('Missing Twitter API config')
            return False

        username, tweet_id = self.get_username_tweet_id(url)
        if not username:
            return False

        tweet = self.api.get_tweet(
            tweet_id,
            expansions=["attachments.media_keys"],
            media_fields=["type", "duration_ms", "url", "variants"],
            tweet_fields=["attachments", "author_id", "created_at", "entities", "id", "text", "possibly_sensitive"],
        )
        timestamp = datetime.strptime(tweet.data.created_at, "%Y-%m-%dT%H:%M:%S.%fZ")

        # check if exists
        key = self.get_html_key(url)
        if check_if_exists and self.storage.exists(key):
            # only s3 storage supports storage.exists as not implemented on gd
            cdn_url = self.storage.get_cdn_url(key)
            screenshot = self.get_screenshot(url)
            return ArchiveResult(status='already archived', cdn_url=cdn_url, title=tweet.data.text, timestamp=timestamp, screenshot=screenshot)

        urls = self._collect_media_urls(tweet)
        if any(u is None for u in urls):
            logger.debug(f"Should not have gotten None url for {tweet.includes.media=} so going to download_alternative in twitter_archiver")
            return self.download_alternative(url, tweet_id)
        logger.debug(f"found {urls=}")

        # Metadata stored alongside the media on the generated page.
        output = json.dumps({
            "id": tweet.data.id,
            "text": tweet.data.text,
            "created_at": tweet.data.created_at,
            "author_id": tweet.data.author_id,
            "geo": tweet.data.geo,
            "lang": tweet.data.lang,
            "media": urls
        }, ensure_ascii=False, indent=4)

        screenshot = self.get_screenshot(url)
        page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, output)
        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet.data.text)