From b1d8475a0359299eccac2fe24cd5b2c1ba21de21 Mon Sep 17 00:00:00 2001
From: JustAnotherArchivist <JustAnotherArchivist@users.noreply.github.com>
Date: Thu, 1 Oct 2020 18:29:08 +0000
Subject: [PATCH] Fix link extraction on Telegram

---
 snscrape/modules/telegram.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py
index 94546d4..9524cc3 100644
--- a/snscrape/modules/telegram.py
+++ b/snscrape/modules/telegram.py
@@ -68,7 +68,17 @@ class TelegramChannelScraper(snscrape.base.Scraper):
 			message = post.find('div', class_ = 'tgme_widget_message_text')
 			if message:
 				content = message.text
-				outlinks = [urllib.parse.urljoin(pageUrl, link['href']) for link in post.find_all('a') if not link.text.startswith('@') and link['href'].startswith('https://t.me/')]
+				outlinks = []
+				for link in post.find_all('a'):
+					if any(x in link.parent.attrs.get('class', []) for x in ('tgme_widget_message_user', 'tgme_widget_message_author')):
+						# Author links at the top (avatar and name)
+						continue
+					if link['href'] == f'https://t.me/{post["data-post"]}':
+						# Generic filter of links to the post itself, catches videos, photos, and the date link
+						continue
+					href = urllib.parse.urljoin(pageUrl, link['href'])
+					if href not in outlinks:
+						outlinks.append(href)
 				outlinksss = ' '.join(outlinks)
 			else:
 				content = None