Twitter archiver improvements

This commit is contained in:
msramalho
2022-06-14 20:55:43 +02:00
parent bd753b27ed
commit 2be539d39e

View File

@@ -1,6 +1,8 @@
from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
from loguru import logger import html
from urllib.parse import urlparse from urllib.parse import urlparse
from loguru import logger
from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
from .base_archiver import Archiver, ArchiveResult from .base_archiver import Archiver, ArchiveResult
@@ -11,6 +13,7 @@ class TwitterArchiver(Archiver):
def download(self, url, check_if_exists=False): def download(self, url, check_if_exists=False):
if 'twitter.com' != self.get_netloc(url): if 'twitter.com' != self.get_netloc(url):
logger.debug(f'{url=} is not from twitter')
return False return False
tweet_id = urlparse(url).path.split('/') tweet_id = urlparse(url).path.split('/')
@@ -18,6 +21,7 @@ class TwitterArchiver(Archiver):
i = tweet_id.index('status') i = tweet_id.index('status')
tweet_id = tweet_id[i + 1] tweet_id = tweet_id[i + 1]
else: else:
logger.debug(f'{url=} does not contain "status"')
return False return False
scr = TwitterTweetScraper(tweet_id) scr = TwitterTweetScraper(tweet_id)
@@ -29,8 +33,10 @@ class TwitterArchiver(Archiver):
return False return False
if tweet.media is None: if tweet.media is None:
logger.trace(f'No media found') logger.debug(f'No media found, archiving tweet text only')
return False screenshot = self.get_screenshot(url)
page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(tweet.json()))
return ArchiveResult(status="success", cdn_url=page_cdn, title=tweet.content, timestamp=tweet.date, hash=page_hash, screenshot=screenshot)
urls = [] urls = []
@@ -50,4 +56,4 @@ class TwitterArchiver(Archiver):
screenshot = self.get_screenshot(url) screenshot = self.get_screenshot(url)
return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date) return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date, title=tweet.content)