telethon archiver working for 0,1,1+ media objects

This commit is contained in:
msramalho
2022-03-16 19:51:02 +01:00
parent c2ae382a4e
commit 516db483d6

View File

@@ -1,10 +1,9 @@
import os
import requests
from bs4 import BeautifulSoup
from loguru import logger
import re
import html
from dataclasses import dataclass
from urllib.parse import urlparse
from loguru import logger
from storages import Storage
from .base_archiver import Archiver, ArchiveResult
@@ -25,6 +24,24 @@ class TelethonArchiver(Archiver):
super().__init__(storage, driver)
self.client = TelegramClient("./anon", config.api_id, config.api_hash)
def _get_media_posts_in_group(self, chat, original_post, max_amp=10):
"""
Searches for Telegram posts that are part of the same group of uploads
The search is conducted around the id of the original post with an amplitude
of `max_amp` both ways
Returns a list of [post] where each post has media and is in the same grouped_id
"""
if original_post.grouped_id is None:
return [original_post] if original_post.media is not None else []
search_ids = [i for i in range(original_post.id - max_amp, original_post.id + max_amp + 1)]
posts = self.client.get_messages(chat, ids=search_ids)
media = []
for post in posts:
if post.grouped_id == original_post.grouped_id and post.media is not None:
media.append(post)
return media
def download(self, url, check_if_exists=False):
# detect URLs that we definitely cannot handle
matches = self.link_pattern.findall(url)
@@ -36,34 +53,54 @@ class TelethonArchiver(Archiver):
with self.client.start():
matches = list(matches[0])
chat, post_id = matches[-2], matches[-1]
chat, post_id = matches[1], matches[2]
post_id = int(post_id)
post = self.client.get_messages(chat, ids=post_id)
try:
post = self.client.get_messages(chat, ids=post_id)
except ValueError as e:
logger.warning(f'Could not fetch telegram {url} possibly it\'s private: {e}')
return False
if post.media is not None:
key = f'{chat}_{post_id}'
filename = 'tmp/' + key
media_posts = self._get_media_posts_in_group(chat, post)
filename = self.client.download_media(post.media, filename)
key += os.path.splitext(filename)[1] # add the extension to the key
print(len(media_posts))
if len(media_posts) > 1:
key = self.get_html_key(url)
cdn_url = self.storage.get_cdn_url(key)
hash = self.get_hash(filename)
if check_if_exists and self.storage.exists(key):
status = 'already archived'
else:
self.storage.upload(filename, key)
return ArchiveResult(status='already archived', cdn_url=cdn_url, title=post.message, timestamp=post.date, screenshot=screenshot)
group_id = post.grouped_id if post.grouped_id is not None else post.id
uploaded_media = []
message = post.message
for mp in media_posts:
if len(mp.message) > message: message = mp.message
filename = self.client.download_media(mp.media, f'tmp/{chat}_{group_id}/{mp.id}')
key = filename.split('tmp/')[1]
self.storage.upload(filename, key)
hash = self.get_hash(filename)
cdn_url = self.storage.get_cdn_url(key)
uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash})
os.remove(filename)
page_cdn, page_hash, _ = self.generate_media_page_html(url, uploaded_media, html.escape(str(post)))
return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=post.date, hash=page_hash, screenshot=screenshot)
elif len(media_posts) == 1:
key = self.get_key(f'{chat}_{post_id}')
filename = self.client.download_media(post.media, f'tmp/{key}')
key = filename.split('tmp/')[1].replace(" ", "")
self.storage.upload(filename, key)
hash = self.get_hash(filename)
cdn_url = self.storage.get_cdn_url(key)
key_thumb, thumb_index = self.get_thumbnails(filename, key)
os.remove(filename)
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, title=post.message, timestamp=post.date, hash=hash, screenshot=screenshot)
else:
return ArchiveResult(status="success", cdn_url=cdn_url, title=post.message, timestamp=post.date, hash=hash, screenshot=screenshot)
def get_post_channel_and_id_from_url(self, url):
    """
    Extract the channel name and the post id from a t.me URL.

    Both `t.me/<channel>/<id>` and the preview form `t.me/s/<channel>/<id>`
    are accepted.

    Args:
        url: the Telegram post URL; must contain `t.me/`.

    Returns:
        A `(channel, post_id)` tuple of strings.

    Raises:
        IndexError: if `url` has no `t.me/` part or no `/<id>` segment.
    """
    parts = url.split('t.me/')[1]
    if parts.startswith('s/'):
        # Drop only the leading preview prefix. Splitting on 's/' would also
        # cut channel names that end in "s" (e.g. "news/123").
        parts = parts[len('s/'):]
    channel_info = parts.split('/')
    return channel_info[0], channel_info[1]
return ArchiveResult(status=status, cdn_url=cdn_url, title=post.message, thumbnail=key_thumb, thumbnail_index=thumb_index, timestamp=post.date, hash=hash, screenshot=screenshot)
page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(str(post)))
return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=post.date, hash=page_hash, screenshot=screenshot)