From 6a62c5798cd3bf350048144c2e9d183f8bf1f20d Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Fri, 25 Feb 2022 13:55:43 +0100 Subject: [PATCH] Add Twitter non-video archiver --- .gitignore | 2 + archivers/__init__.py | 3 +- archivers/twitter_archiver.py | 101 ++++++++++++++++++++++++++++++++++ 3 files changed, 105 insertions(+), 1 deletion(-) create mode 100644 archivers/twitter_archiver.py diff --git a/.gitignore b/.gitignore index 141d9f5..5db8d9d 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,5 @@ service_account.json __pycache__/ ._* anu.html +geckodriver.log + diff --git a/archivers/__init__.py b/archivers/__init__.py index e6c4ba6..f4e439f 100644 --- a/archivers/__init__.py +++ b/archivers/__init__.py @@ -3,4 +3,5 @@ from .base_archiver import * from .telegram_archiver import * from .tiktok_archiver import * from .wayback_archiver import * -from .youtubedl_archiver import * \ No newline at end of file +from .youtubedl_archiver import * +from .twitter_archiver import * \ No newline at end of file diff --git a/archivers/twitter_archiver.py b/archivers/twitter_archiver.py new file mode 100644 index 0000000..cfe96fb --- /dev/null +++ b/archivers/twitter_archiver.py @@ -0,0 +1,101 @@ +from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo +from loguru import logger +import requests +from urllib.parse import urlparse + +from .base_archiver import Archiver, ArchiveResult + + +class TwitterArchiver(Archiver): + name = "twitter" + + def download(self, url, check_if_exists=False): + if 'twitter.com' != self.get_netloc(url): + return False + + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36' + } + + tweet_id = url.split('/') + if 'status' in tweet_id: + i = tweet_id.index('status') + tweet_id = tweet_id[i+1] + else: + return False + + scr = TwitterTweetScraper(tweet_id) + + try: + tweet = next(scr.get_items()) + except: + logger.warning('wah wah') + return False + + if tweet.media is None: + return False + + archived_media = [] + + for media in tweet.media: + if type(media) == Video: + variant = max( + [v for v in media.variants if v.bitrate], key=lambda v: v.bitrate) + media_url = variant.url + elif type(media) == Gif: + media_url = media.variants[0].url + elif type(media) == Photo: + media_url = media.fullUrl + else: + logger.warning(f"Could not get media URL of {media}") + media_url = None + + if media_url is not None: + path = urlparse(media_url).path + key = self.get_key(path.replace("/", "_")) + if '.' not in path: + key += '.jpg' + + filename = 'tmp/' + key + + d = requests.get(media_url, headers=headers) + with open(filename, 'wb') as f: + f.write(d.content) + + self.storage.upload(filename, key) + hash = self.get_hash(filename) + + archived_media.append((self.storage.get_cdn_url(key), hash)) + + page = f'''{url} + +

Archived media from tweet

+

{url}