More manifests, base modules and rename from archiver to extractor.

This commit is contained in:
erinhmclark
2025-01-23 16:40:48 +00:00
parent 9db26cdfc2
commit 1274a1b231
93 changed files with 378 additions and 238 deletions

View File

@@ -0,0 +1,24 @@
# Module manifest for the Telegram extractor: one dict literal holding the
# module's display name, its type list, whether setup is required, the Python
# packages it depends on, and a Markdown description.
{
    "name": "Telegram Extractor",
    "type": ["extractor"],
    # No credentials are needed; the description below says the extractor works
    # without login.
    "requires_setup": False,
    # Python packages listed as external dependencies; these are the same
    # third-party names the extractor module imports.
    "external_dependencies": {
        "python": [
            "requests",
            "bs4",
            "loguru",
        ],
    },
    # NOTE(review): the description still refers to `telethon_archiver`, while
    # the extractor class docstring calls the sibling module
    # `telethon_extractor` -- presumably this name should follow the
    # archiver -> extractor rename; confirm and update the text.
    "description": """
The `TelegramExtractor` retrieves publicly available media content from Telegram message links without requiring login credentials.
It processes URLs to fetch images and videos embedded in Telegram messages, ensuring a structured output using `Metadata`
and `Media` objects. Recommended for scenarios where login-based archiving is not viable, although `telethon_archiver`
is advised for more comprehensive functionality.
### Features
- Extracts images and videos from public Telegram message links (`t.me`).
- Processes HTML content of messages to retrieve embedded media.
- Sets structured metadata, including timestamps, content, and media details.
- Does not require user authentication for Telegram.
""",
}

View File

@@ -0,0 +1,70 @@
import html
import re
from urllib.parse import urlsplit, urlunsplit

import requests
from bs4 import BeautifulSoup
from loguru import logger

from auto_archiver.base_modules import Extractor
from auto_archiver.core import Metadata, Media
class TelegramExtractor(Extractor):
    """
    Extractor for telegram that does not require login, but the
    telethon_extractor is much more advised.

    The post URL is requested with ``embed=1`` in its query string and the
    returned HTML is scraped for a ``<video>`` element or, when there is none,
    for photo wrappers. A result is only returned if at least one image or one
    video is found.
    """
    name = "telegram_extractor"

    def __init__(self, config: dict) -> None:
        super().__init__(config)

    def download(self, item: Metadata) -> Metadata:
        """
        Fetch the media of a ``t.me`` post.

        Args:
            item: metadata of the URL being processed; only its URL and
                ``netloc`` are read here.

        Returns:
            A new ``Metadata`` marked successful with ``"telegram"`` and
            holding the page content, the timestamp (when present) and the
            downloaded media, or ``False`` when the URL is not on ``t.me`` or
            no usable image/video was found.

        Raises:
            requests.RequestException: if the page cannot be fetched
                (including when the request times out).
        """
        url = item.get_url()
        # detect URLs that we definitely cannot handle
        if 't.me' != item.netloc:
            return False

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
        }
        # A timeout keeps an unresponsive server from blocking forever.
        response = requests.get(self._embed_url(url), headers=headers, timeout=30)
        soup = BeautifulSoup(response.content, 'html.parser')

        result = Metadata()
        # Store the decoded page text; str() of the raw bytes would store the
        # Python ``b'...'`` repr with escaped newlines and non-ASCII bytes.
        result.set_content(html.escape(response.text))

        # The first <time> tag is not necessarily the one carrying the
        # timestamp, so take the first tag that has a ``datetime`` attribute.
        time_tags = soup.find_all('time')
        timestamp = next(
            (tag.get('datetime') for tag in time_tags if tag.get('datetime')),
            None,
        )
        if timestamp:
            result.set_timestamp(timestamp)

        video = soup.find("video")
        if video is None:
            logger.warning("could not find video")
            image_urls = self._image_urls(soup)
            if not image_urls:
                return False
            for img_url in image_urls:
                result.add_media(Media(self.download_from_url(img_url)))
        else:
            video_url = video.get('src')
            if not video_url:
                # Nothing to download; report "not handled" rather than
                # passing None on to download_from_url.
                logger.warning("found a video element without a src attribute")
                return False
            m_video = Media(self.download_from_url(video_url))
            duration = self._video_duration(time_tags)
            if duration is not None:
                m_video.set("duration", duration)
            result.add_media(m_video)

        return result.success("telegram")

    @staticmethod
    def _embed_url(url: str) -> str:
        """Return *url* with ``embed=1`` present in its query string.

        Existing query parameters and any fragment are preserved, so
        ``...?single`` becomes ``...?single&embed=1`` instead of the invalid
        ``...?single?embed=1``.
        """
        parts = urlsplit(url)
        params = parts.query.split("&") if parts.query else []
        if "embed=1" not in params:
            params.append("embed=1")
        return urlunsplit(parts._replace(query="&".join(params)))

    @staticmethod
    def _image_urls(soup) -> list:
        """Collect the ``url(...)`` targets from the photo wrappers' inline styles."""
        urls = []
        for wrap in soup.find_all(class_="tgme_widget_message_photo_wrap"):
            # A wrapper without a style attribute simply contributes nothing.
            style = wrap.get('style') or ''
            urls += [u.replace("'", "") for u in re.findall(r'url\((.*?)\)', style)]
        return urls

    @staticmethod
    def _parse_duration(text: str) -> float:
        """Convert ``"ss"``, ``"mm:ss"`` or ``"hh:mm:ss"`` into seconds.

        Raises:
            ValueError: if any colon-separated field is not a number.
        """
        seconds = 0.0
        for field in text.strip().split(":"):
            seconds = seconds * 60 + float(field)
        return seconds

    def _video_duration(self, time_tags):
        """Return the video duration in seconds, or ``None`` if unavailable.

        The duration is read from the first ``<time>`` tag that has no
        ``datetime`` attribute; tags with that attribute carry the timestamp,
        and their visible text must not be mistaken for a duration.
        """
        label = next(
            (tag for tag in time_tags if tag.get('datetime') is None),
            None,
        )
        if label is None:
            return None
        try:
            return self._parse_duration(label.get_text())
        except ValueError:
            # Best effort only: a missing duration must not fail the download.
            logger.debug(f"could not parse video duration from {label.get_text()!r}")
            return None