diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index 1e2c20b..7a61ea7 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -9,6 +9,7 @@ import hashlib from selenium.common.exceptions import TimeoutException from loguru import logger import time +import requests from storages import Storage from utils import mkdir_if_not_exists @@ -43,6 +44,55 @@ class Archiver(ABC): def get_netloc(self, url): return urlparse(url).netloc + def generate_media_page(self, urls, url, object): + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36' + } + + page = f'''
{object}"
+ page += f""
+
+ page_key = self.get_key(urlparse(url).path.replace("/", "_") + ".html")
+ page_filename = 'tmp/' + page_key
+ page_cdn = self.storage.get_cdn_url(page_key)
+
+ with open(page_filename, "w", encoding="utf-8") as f:  # explicit encoding: do not depend on the platform default when writing HTML
+     f.write(page)
+
+ page_hash = self.get_hash(page_filename)
+
+ self.storage.upload(page_filename, page_key, extra_args={
+ 'ACL': 'public-read', 'ContentType': 'text/html'})
+
+ return (page_cdn, page_hash, thumbnail)
def get_key(self, filename):
"""
@@ -52,6 +102,11 @@ class Archiver(ABC):
_id, extension = os.path.splitext(tail) # returns [filename, .ext]
if 'unknown_video' in _id:
_id = _id.replace('unknown_video', 'jpg')
+
+ # long filenames can cause problems, so trim them if necessary
+ if len(_id) > 128:
+ _id = _id[-128:]
+
return f'{self.name}_{_id}{extension}'
def get_hash(self, filename):
@@ -127,7 +182,8 @@ class Archiver(ABC):
thumb_index = key_folder + 'index.html'
- self.storage.upload(index_fname, thumb_index, extra_args={'ACL': 'public-read', 'ContentType': 'text/html'})
+ self.storage.upload(index_fname, thumb_index, extra_args={
+ 'ACL': 'public-read', 'ContentType': 'text/html'})
shutil.rmtree(thumbnails_folder)
thumb_index_cdn_url = self.storage.get_cdn_url(thumb_index)
diff --git a/archivers/telegram_archiver.py b/archivers/telegram_archiver.py
index d6207df..8ca0dac 100644
--- a/archivers/telegram_archiver.py
+++ b/archivers/telegram_archiver.py
@@ -1,6 +1,9 @@
import os
import requests
from bs4 import BeautifulSoup
+from loguru import logger
+import re
+import html
from .base_archiver import Archiver, ArchiveResult
@@ -24,12 +27,24 @@ class TelegramArchiver(Archiver):
if url[-8:] != "?embed=1":
url += "?embed=1"
+ screenshot = self.get_screenshot(url)
+
t = requests.get(url, headers=headers)
s = BeautifulSoup(t.content, 'html.parser')
video = s.find("video")
if video is None:
- return False # could not find video
+ logger.warning("could not find video")
+ image_tags = s.find_all(class_="js-message_photo")
+
+ images = []
+ for im in image_tags:
+     images += [u.replace("'", "") for u in re.findall(r'url\((.*?)\)', im.get('style', ''))]
+ # embed the decoded response text (str() of bytes would store the b'...' repr); non-ASCII becomes numeric character references
+ page_text = html.escape(t.text).encode('ascii', 'xmlcharrefreplace').decode('ascii')
+ page_cdn, page_hash, thumbnail = self.generate_media_page(images, url, page_text)
+
+ return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=s.find_all('time')[1].get('datetime'))
video_url = video.get('src')
video_id = video_url.split('/')[-1].split('?')[0]
@@ -50,7 +65,6 @@ class TelegramArchiver(Archiver):
self.storage.upload(filename, key)
hash = self.get_hash(filename)
- screenshot = self.get_screenshot(url)
# extract duration from HTML
duration = s.find_all('time')[0].contents[0]
diff --git a/archivers/twitter_archiver.py b/archivers/twitter_archiver.py
index 629c901..49ab5ae 100644
--- a/archivers/twitter_archiver.py
+++ b/archivers/twitter_archiver.py
@@ -13,10 +13,6 @@ class TwitterArchiver(Archiver):
if 'twitter.com' != self.get_netloc(url):
return False
- headers = {
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
- }
-
tweet_id = urlparse(url).path.split('/')
if 'status' in tweet_id:
i = tweet_id.index('status')
@@ -35,67 +31,22 @@ class TwitterArchiver(Archiver):
if tweet.media is None:
return False
- archived_media = []
+ urls = []
for media in tweet.media:
if type(media) == Video:
variant = max(
[v for v in media.variants if v.bitrate], key=lambda v: v.bitrate)
- media_url = variant.url
+ urls.append(variant.url)
elif type(media) == Gif:
- media_url = media.variants[0].url
+ urls.append(media.variants[0].url)
elif type(media) == Photo:
- media_url = media.fullUrl
+ urls.append(media.fullUrl)
else:
logger.warning(f"Could not get media URL of {media}")
- media_url = None
- if media_url is not None:
- path = urlparse(media_url).path
- key = self.get_key(path.replace("/", "_"))
- if '.' not in path:
- key += '.jpg'
-
- filename = 'tmp/' + key
-
- d = requests.get(media_url, headers=headers)
- with open(filename, 'wb') as f:
- f.write(d.content)
-
- self.storage.upload(filename, key)
- hash = self.get_hash(filename)
-
- archived_media.append((self.storage.get_cdn_url(key), hash))
-
- page = f'''{tweet.json()}"
- page += f""
-
- page_key = self.get_key(urlparse(url).path.replace("/", "_") + ".html")
- page_filename = 'tmp/' + key
- page_cdn = self.storage.get_cdn_url(page_key)
-
- with open(page_filename, "w") as f:
- f.write(page)
-
- page_hash = self.get_hash(page_filename)
-
- self.storage.upload(page_filename, page_key, extra_args={
- 'ACL': 'public-read', 'ContentType': 'text/html'})
+ page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, tweet.json())
screenshot = self.get_screenshot(url)
- if (len(archived_media) > 0):
- thumbnail = archived_media[0][0]
- else:
- thumbnail = None
-
- return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail)
-
+ return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date)