mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 21:28:29 +03:00
More manifests, base modules and rename from archiver to extractor.
This commit is contained in:
24
src/auto_archiver/modules/telegram_extractor/__manifest__.py
Normal file
24
src/auto_archiver/modules/telegram_extractor/__manifest__.py
Normal file
@@ -0,0 +1,24 @@
|
||||
{
|
||||
"name": "Telegram Extractor",
|
||||
"type": ["extractor"],
|
||||
"requires_setup": False,
|
||||
"external_dependencies": {
|
||||
"python": [
|
||||
"requests",
|
||||
"bs4",
|
||||
"loguru",
|
||||
],
|
||||
},
|
||||
"description": """
|
||||
The `TelegramExtractor` retrieves publicly available media content from Telegram message links without requiring login credentials.
|
||||
It processes URLs to fetch images and videos embedded in Telegram messages, ensuring a structured output using `Metadata`
|
||||
and `Media` objects. Recommended for scenarios where login-based archiving is not viable, although `telethon_archiver`
|
||||
is advised for more comprehensive functionality.
|
||||
|
||||
### Features
|
||||
- Extracts images and videos from public Telegram message links (`t.me`).
|
||||
- Processes HTML content of messages to retrieve embedded media.
|
||||
- Sets structured metadata, including timestamps, content, and media details.
|
||||
- Does not require user authentication for Telegram.
|
||||
""",
|
||||
}
|
||||
@@ -0,0 +1,70 @@
|
||||
import requests, re, html
|
||||
from bs4 import BeautifulSoup
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.base_modules import Extractor
|
||||
from auto_archiver.core import Metadata, Media
|
||||
|
||||
|
||||
class TelegramExtractor(Extractor):
|
||||
"""
|
||||
Extractor for telegram that does not require login, but the telethon_extractor is much more advised,
|
||||
will only return if at least one image or one video is found
|
||||
"""
|
||||
name = "telegram_extractor"
|
||||
|
||||
def __init__(self, config: dict) -> None:
|
||||
super().__init__(config)
|
||||
|
||||
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
url = item.get_url()
|
||||
# detect URLs that we definitely cannot handle
|
||||
if 't.me' != item.netloc:
|
||||
return False
|
||||
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
|
||||
}
|
||||
|
||||
# TODO: check if we can do this more resilient to variable URLs
|
||||
if url[-8:] != "?embed=1":
|
||||
url += "?embed=1"
|
||||
|
||||
t = requests.get(url, headers=headers)
|
||||
s = BeautifulSoup(t.content, 'html.parser')
|
||||
|
||||
result = Metadata()
|
||||
result.set_content(html.escape(str(t.content)))
|
||||
if (timestamp := (s.find_all('time') or [{}])[0].get('datetime')):
|
||||
result.set_timestamp(timestamp)
|
||||
|
||||
video = s.find("video")
|
||||
if video is None:
|
||||
logger.warning("could not find video")
|
||||
image_tags = s.find_all(class_="tgme_widget_message_photo_wrap")
|
||||
|
||||
image_urls = []
|
||||
for im in image_tags:
|
||||
urls = [u.replace("'", "") for u in re.findall(r'url\((.*?)\)', im['style'])]
|
||||
image_urls += urls
|
||||
|
||||
if not len(image_urls): return False
|
||||
for img_url in image_urls:
|
||||
result.add_media(Media(self.download_from_url(img_url)))
|
||||
else:
|
||||
video_url = video.get('src')
|
||||
m_video = Media(self.download_from_url(video_url))
|
||||
# extract duration from HTML
|
||||
try:
|
||||
duration = s.find_all('time')[0].contents[0]
|
||||
if ':' in duration:
|
||||
duration = float(duration.split(
|
||||
':')[0]) * 60 + float(duration.split(':')[1])
|
||||
else:
|
||||
duration = float(duration)
|
||||
m_video.set("duration", duration)
|
||||
except: pass
|
||||
result.add_media(m_video)
|
||||
|
||||
return result.success("telegram")
|
||||
Reference in New Issue
Block a user