diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index 1e2c20b..7a61ea7 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -9,6 +9,7 @@ import hashlib from selenium.common.exceptions import TimeoutException from loguru import logger import time +import requests from storages import Storage from utils import mkdir_if_not_exists @@ -43,6 +44,55 @@ class Archiver(ABC): def get_netloc(self, url): return urlparse(url).netloc + def generate_media_page(self, urls, url, object): + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36' + } + + page = f'''{url} + +

Archived media from {self.name}

+

{url}

{self.name} object data:

{object}" + page += f"" + + page_key = self.get_key(urlparse(url).path.replace("/", "_") + ".html") + page_filename = 'tmp/' + page_key + page_cdn = self.storage.get_cdn_url(page_key) + + with open(page_filename, "w") as f: + f.write(page) + + page_hash = self.get_hash(page_filename) + + self.storage.upload(page_filename, page_key, extra_args={ + 'ACL': 'public-read', 'ContentType': 'text/html'}) + + return (page_cdn, page_hash, thumbnail) def get_key(self, filename): """ @@ -52,6 +102,11 @@ class Archiver(ABC): _id, extension = os.path.splitext(tail) # returns [filename, .ext] if 'unknown_video' in _id: _id = _id.replace('unknown_video', 'jpg') + + # long filenames can cause problems, so trim them if necessary + if len(_id) > 128: + _id = _id[-128:] + return f'{self.name}_{_id}{extension}' def get_hash(self, filename): @@ -127,7 +182,8 @@ class Archiver(ABC): thumb_index = key_folder + 'index.html' - self.storage.upload(index_fname, thumb_index, extra_args={'ACL': 'public-read', 'ContentType': 'text/html'}) + self.storage.upload(index_fname, thumb_index, extra_args={ + 'ACL': 'public-read', 'ContentType': 'text/html'}) shutil.rmtree(thumbnails_folder) thumb_index_cdn_url = self.storage.get_cdn_url(thumb_index) diff --git a/archivers/telegram_archiver.py b/archivers/telegram_archiver.py index d6207df..8ca0dac 100644 --- a/archivers/telegram_archiver.py +++ b/archivers/telegram_archiver.py @@ -1,6 +1,9 @@ import os import requests from bs4 import BeautifulSoup +from loguru import logger +import re +import html from .base_archiver import Archiver, ArchiveResult @@ -24,12 +27,24 @@ class TelegramArchiver(Archiver): if url[-8:] != "?embed=1": url += "?embed=1" + screenshot = self.get_screenshot(url) + t = requests.get(url, headers=headers) s = BeautifulSoup(t.content, 'html.parser') video = s.find("video") if video is None: - return False # could not find video + logger.warning("could not find video") + image_tags = s.find_all(class_="js-message_photo") + + images = [] + for im in image_tags: + urls = [u.replace("'", "") for u in re.findall('url\((.*?)\)', im['style'])] + images += urls + + page_cdn, page_hash, thumbnail = self.generate_media_page(images, url, html.escape(str(t.content))) + + return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=s.find_all('time')[1].get('datetime')) video_url = video.get('src') video_id = video_url.split('/')[-1].split('?')[0] @@ -50,7 +65,6 @@ class TelegramArchiver(Archiver): self.storage.upload(filename, key) hash = self.get_hash(filename) - screenshot = self.get_screenshot(url) # extract duration from HTML duration = s.find_all('time')[0].contents[0] diff --git a/archivers/twitter_archiver.py b/archivers/twitter_archiver.py index 629c901..49ab5ae 100644 --- a/archivers/twitter_archiver.py +++ b/archivers/twitter_archiver.py @@ -13,10 +13,6 @@ class TwitterArchiver(Archiver): if 'twitter.com' != self.get_netloc(url): return False - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36' - } - tweet_id = urlparse(url).path.split('/') if 'status' in tweet_id: i = tweet_id.index('status') @@ -35,67 +31,22 @@ class TwitterArchiver(Archiver): if tweet.media is None: return False - archived_media = [] + urls = [] for media in tweet.media: if type(media) == Video: variant = max( [v for v in media.variants if v.bitrate], key=lambda v: v.bitrate) - media_url = variant.url + urls.append(variant.url) elif type(media) == Gif: - media_url = media.variants[0].url + urls.append(media.variants[0].url) elif type(media) == Photo: - media_url = media.fullUrl + urls.append(media.fullUrl) else: logger.warning(f"Could not get media URL of {media}") - media_url = None - if media_url is not None: - path = urlparse(media_url).path - key = self.get_key(path.replace("/", "_")) - if '.' not in path: - key += '.jpg' - - filename = 'tmp/' + key - - d = requests.get(media_url, headers=headers) - with open(filename, 'wb') as f: - f.write(d.content) - - self.storage.upload(filename, key) - hash = self.get_hash(filename) - - archived_media.append((self.storage.get_cdn_url(key), hash)) - - page = f'''{url} - -

Archived media from tweet

-

{url}