More manifests, base modules and rename from archiver to extractor.

2026-06-12 21:28:29 +03:00 · 2025-01-23 16:40:48 +00:00
parent 9db26cdfc2
commit 1274a1b231
93 changed files with 378 additions and 238 deletions
--- a/src/auto_archiver/modules/telegram_extractor/__init__.py
+++ b/src/auto_archiver/modules/telegram_extractor/__init__.py
--- a/src/auto_archiver/modules/telegram_extractor/manifest.py
+++ b/src/auto_archiver/modules/telegram_extractor/manifest.py
@@ -0,0 +1,26 @@
+# Module manifest for the Telegram extractor: display name, module type,
+# setup requirement, Python dependencies and a Markdown description.
+{
+    "name": "Telegram Extractor",
+    "type": ["extractor"],
+    "requires_setup": False,
+    "external_dependencies": {
+        "python": [
+            "requests",
+            "bs4",
+            "loguru",
+        ],
+    },
+    "description": """ 
+        The `TelegramExtractor` retrieves publicly available media content from Telegram message links without requiring login credentials. 
+        It processes URLs to fetch images and videos embedded in Telegram messages, ensuring a structured output using `Metadata` 
+        and `Media` objects. Recommended for scenarios where login-based archiving is not viable, although `telethon_extractor` 
+        is advised for more comprehensive functionality.
+        
+        ### Features
+- Extracts images and videos from public Telegram message links (`t.me`).
+- Processes HTML content of messages to retrieve embedded media.
+- Sets structured metadata, including timestamps, content, and media details.
+- Does not require user authentication for Telegram.
+    """,
+}
--- a/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py
+++ b/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py
@@ -0,0 +1,104 @@
+import requests, re, html
+from bs4 import BeautifulSoup
+from loguru import logger
+
+from auto_archiver.base_modules import Extractor
+from auto_archiver.core import Metadata, Media
+
+
+class TelegramExtractor(Extractor):
+    """
+    Extractor for public Telegram (`t.me`) message links that does not require login,
+    but the telethon_extractor is much more advised.
+
+    It requests the embeddable page of a message and archives its video or, when
+    there is no video, its photos; it only succeeds if at least one is found.
+    """
+    name = "telegram_extractor"
+
+    def __init__(self, config: dict) -> None:
+        super().__init__(config)
+
+    def download(self, item: Metadata) -> Metadata:
+        """
+        Archive the media of the Telegram message that `item` points to.
+
+        Returns a new `Metadata` marked as a "telegram" success, carrying the
+        escaped page HTML, the message timestamp when the page exposes one, and
+        the downloaded media. Returns `False` when the URL is not on `t.me`, the
+        request to Telegram fails, or neither a video nor a photo is found.
+        """
+        url = item.get_url()
+        # detect URLs that we definitely cannot handle
+        if 't.me' != item.netloc:
+            return False
+
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
+        }
+
+        # Request the embed variant of the page. A fragment is never sent to the
+        # server, so drop it, and extend an existing query string with '&'
+        # rather than appending a second '?'.
+        url = url.partition('#')[0]
+        if 'embed=1' not in url.partition('?')[2].split('&'):
+            url += ('&' if '?' in url else '?') + 'embed=1'
+
+        try:
+            # the timeout keeps a stalled connection from blocking forever
+            t = requests.get(url, headers=headers, timeout=30)
+        except requests.RequestException as e:
+            logger.error(f"could not fetch {url}: {e}")
+            return False
+        s = BeautifulSoup(t.content, 'html.parser')
+
+        result = Metadata()
+        # t.text is the decoded page; str(t.content) would store the "b'...'" repr of the bytes
+        result.set_content(html.escape(t.text))
+        # the page can hold several <time> tags; only one with a `datetime` attribute is a date
+        dated = s.find('time', attrs={'datetime': True})
+        if dated is not None and (timestamp := dated.get('datetime')):
+            result.set_timestamp(timestamp)
+
+        video = s.find("video")
+        if video is None:
+            logger.warning("could not find video")
+            image_urls = []
+            for im in s.find_all(class_="tgme_widget_message_photo_wrap"):
+                # photo URLs are the url(...) values of the inline style; a wrapper
+                # without a style attribute simply contributes nothing
+                found = re.findall(r'url\((.*?)\)', im.get('style', ''))
+                image_urls += [u.strip().strip('\'"') for u in found]
+
+            if not image_urls:
+                return False
+            for img_url in image_urls:
+                result.add_media(Media(self.download_from_url(img_url)))
+        else:
+            video_url = video.get('src')
+            if not video_url:
+                logger.warning("video tag has no src")
+                return False
+            m_video = Media(self.download_from_url(video_url))
+            duration = self._video_duration(s)
+            if duration is not None:
+                m_video.set("duration", duration)
+            result.add_media(m_video)
+
+        return result.success("telegram")
+
+    @staticmethod
+    def _video_duration(soup):
+        """Best-effort: seconds parsed from the first <time> tag that is not a date, else None."""
+        for tag in soup.find_all('time'):
+            if tag.get('datetime') is not None:
+                continue  # this tag holds the message date, not a duration
+            try:
+                seconds = 0.0
+                # fold left to right so 'ss', 'mm:ss' and 'h:mm:ss' all work
+                for part in tag.get_text(strip=True).split(':'):
+                    seconds = seconds * 60 + float(part)
+                return seconds
+            except ValueError:
+                return None
+        return None