From 4b423dfc34ec82c6f7678c59fe5f193f1bb7f028 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 27 Jun 2022 14:36:58 +0200 Subject: [PATCH] fix telethon exception --- archivers/base_archiver.py | 17 ++++++++--------- archivers/telethon_archiver.py | 23 +++++++++++++++++++++-- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index 70be680..815d31e 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -93,16 +93,19 @@ class Archiver(ABC): return mime.split("/")[0] return "" - # eg images in a tweet save to cloud storage + def download_from_url(self, url, to_filename): + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36' + } + d = requests.get(url, headers=headers) + with open(to_filename, 'wb') as f: + f.write(d.content) def generate_media_page(self, urls, url, object): """ For a list of media urls, fetch them, upload them and call self.generate_media_page_html with them """ - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36' - } thumbnail = None uploaded_media = [] @@ -110,11 +113,7 @@ class Archiver(ABC): key = self._get_key_from_url(media_url, ".jpg") filename = os.path.join(Storage.TMP_FOLDER, key) - - d = requests.get(media_url, headers=headers) - with open(filename, 'wb') as f: - f.write(d.content) - + self.download_from_url(media_url, filename) self.storage.upload(filename, key) hash = self.get_hash(filename) cdn_url = self.storage.get_cdn_url(key) diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py index c360634..f35e323 100644 --- a/archivers/telethon_archiver.py +++ b/archivers/telethon_archiver.py @@ -82,19 +82,38 @@ class TelethonArchiver(Archiver): cdn_url = self.storage.get_cdn_url(key) return ArchiveResult(status='already archived', cdn_url=cdn_url, title=post.message, timestamp=post.date, screenshot=screenshot) + key_thumb, thumb_index = None, None group_id = post.grouped_id if post.grouped_id is not None else post.id uploaded_media = [] message = post.message - for i, mp in enumerate(media_posts): + for mp in media_posts: if len(mp.message) > len(message): message = mp.message + + # media can also be in entities + if mp.entities: + other_media_urls = [e.url for e in mp.entities if hasattr(e, "url") and e.url and self._guess_file_type(e.url) in ["video", "image"]] + logger.debug(f"Got {len(other_media_urls)} other medial urls from {mp.id=}: {other_media_urls}") + for om_url in other_media_urls: + filename = os.path.join(Storage.TMP_FOLDER, f'{chat}_{group_id}_{self._get_key_from_url(om_url)}') + self.download_from_url(om_url, filename) + key = filename.split(Storage.TMP_FOLDER)[1] + self.storage.upload(filename, key) + hash = self.get_hash(filename) + cdn_url = self.storage.get_cdn_url(key) + uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash}) + filename_dest = os.path.join(Storage.TMP_FOLDER, f'{chat}_{group_id}', str(mp.id)) filename = self.client.download_media(mp.media, filename_dest) + if not filename: + logger.debug(f"Empty media found, skipping {str(mp)=}") + continue + key = filename.split(Storage.TMP_FOLDER)[1] self.storage.upload(filename, key) hash = self.get_hash(filename) cdn_url = self.storage.get_cdn_url(key) uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash}) - if i == 0: + if key_thumb is None: key_thumb, thumb_index = self.get_thumbnails(filename, key) os.remove(filename)