From 516db483d6d1c93d89e21f2f0e70f5bf65690629 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Wed, 16 Mar 2022 19:51:02 +0100
Subject: [PATCH] telethon archiver working for 0,1,1+ media objects

---
Review notes (ignored by `git am`, not part of the commit message):
 * Line breaks and indentation were restored from a whitespace-collapsed
   copy. The six blank context lines of the last hunk were placed so that the
   hunk counts add up; verify them (and trailing whitespace on removed lines)
   against the real pre-image before applying.
 * Fix-ups applied to added lines:
     - skip the None placeholders that get_messages(ids=[...]) returns for
       ids that do not exist when collecting a media group;
     - return False when the requested post itself does not exist;
     - compare caption lengths (len vs len) instead of int vs str, and use
       the longest caption as the title of a grouped result;
     - replace the leftover debug print with logger.debug.
 * The post-image blob id on the index line predates these fix-ups.

 archivers/telethon_archiver.py | 86 ++++++++++++++++++++++++----------
 1 file changed, 63 insertions(+), 23 deletions(-)

diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py
index 00f5999..f02276c 100644
--- a/archivers/telethon_archiver.py
+++ b/archivers/telethon_archiver.py
@@ -1,10 +1,9 @@
 import os
-import requests
-from bs4 import BeautifulSoup
-from loguru import logger
 import re
 import html
 from dataclasses import dataclass
+from urllib.parse import urlparse
+from loguru import logger
 
 from storages import Storage
 from .base_archiver import Archiver, ArchiveResult
@@ -25,6 +24,24 @@ class TelethonArchiver(Archiver):
         super().__init__(storage, driver)
         self.client = TelegramClient("./anon", config.api_id, config.api_hash)
 
+    def _get_media_posts_in_group(self, chat, original_post, max_amp=10):
+        """
+        Searches for Telegram posts that are part of the same group of uploads
+        The search is conducted around the id of the original post with an amplitude
+        of `max_amp` both ways; ids with no message come back as None and are skipped
+        Returns a list of [post] where each post has media and is in the same grouped_id
+        """
+        if original_post.grouped_id is None:
+            return [original_post] if original_post.media is not None else []
+
+        search_ids = list(range(original_post.id - max_amp, original_post.id + max_amp + 1))
+        posts = self.client.get_messages(chat, ids=search_ids)
+        media = []
+        for post in posts:
+            if post is not None and post.grouped_id == original_post.grouped_id and post.media is not None:
+                media.append(post)
+        return media
+
     def download(self, url, check_if_exists=False):
         # detect URLs that we definitely cannot handle
         matches = self.link_pattern.findall(url)
@@ -36,34 +53,57 @@ class TelethonArchiver(Archiver):
 
         with self.client.start():
             matches = list(matches[0])
-            chat, post_id = matches[-2], matches[-1]
-
+            chat, post_id = matches[1], matches[2]
+
             post_id = int(post_id)
 
-            post = self.client.get_messages(chat, ids=post_id)
+            try:
+                post = self.client.get_messages(chat, ids=post_id)
+            except ValueError as e:
+                logger.warning(f'Could not fetch telegram {url} possibly it\'s private: {e}')
+                return False
+            if post is None:
+                logger.warning(f'Could not fetch telegram {url}: post not found')
+                return False
 
-            if post.media is not None:
-                key = f'{chat}_{post_id}'
-                filename = 'tmp/' + key
+            media_posts = self._get_media_posts_in_group(chat, post)
 
-                filename = self.client.download_media(post.media, filename)
-                key += os.path.splitext(filename)[1]  # add the extension to the key
+            logger.debug(f'found {len(media_posts)} media post(s) for {url}')
+            if len(media_posts) > 1:
+                key = self.get_html_key(url)
                 cdn_url = self.storage.get_cdn_url(key)
-                hash = self.get_hash(filename)
+
                 if check_if_exists and self.storage.exists(key):
                     status = 'already archived'
-                else:
-                    self.storage.upload(filename, key)
+                    return ArchiveResult(status='already archived', cdn_url=cdn_url, title=post.message, timestamp=post.date, screenshot=screenshot)
+                group_id = post.grouped_id if post.grouped_id is not None else post.id
+                uploaded_media = []
+                message = post.message
+                for mp in media_posts:
+                    if len(mp.message) > len(message): message = mp.message
+                    filename = self.client.download_media(mp.media, f'tmp/{chat}_{group_id}/{mp.id}')
+                    key = filename.split('tmp/')[1]
+                    self.storage.upload(filename, key)
+                    hash = self.get_hash(filename)
+                    cdn_url = self.storage.get_cdn_url(key)
+                    uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash})
+                    os.remove(filename)
+
+                page_cdn, page_hash, _ = self.generate_media_page_html(url, uploaded_media, html.escape(str(post)))
+
+                return ArchiveResult(status=status, cdn_url=page_cdn, title=message, timestamp=post.date, hash=page_hash, screenshot=screenshot)
+            elif len(media_posts) == 1:
+                key = self.get_key(f'{chat}_{post_id}')
+                filename = self.client.download_media(post.media, f'tmp/{key}')
+                key = filename.split('tmp/')[1].replace(" ", "")
+                self.storage.upload(filename, key)
+                hash = self.get_hash(filename)
+                cdn_url = self.storage.get_cdn_url(key)
 
                 key_thumb, thumb_index = self.get_thumbnails(filename, key)
                 os.remove(filename)
-                return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, title=post.message, timestamp=post.date, hash=hash, screenshot=screenshot)
-            else:
-                return ArchiveResult(status="success", cdn_url=cdn_url, title=post.message, timestamp=post.date, hash=hash, screenshot=screenshot)
 
-    def get_post_channel_and_id_from_url(self, url):
-        parts = url.split('t.me/')[1]
-        if parts.startswith('s/'):
-            parts = parts.split('s/')[1]
-        channel_info = parts.split('/')
-        return channel_info[0], channel_info[1]
+                return ArchiveResult(status=status, cdn_url=cdn_url, title=post.message, thumbnail=key_thumb, thumbnail_index=thumb_index, timestamp=post.date, hash=hash, screenshot=screenshot)
+
+            page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(str(post)))
+            return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=post.date, hash=page_hash, screenshot=screenshot)