Fix link extraction on Telegram

2026-06-08 02:28:29 +03:00 · 2020-10-01 18:29:08 +00:00
parent 3d3faf80bf
commit b1d8475a03
1 changed files with 11 additions and 1 deletions
--- a/snscrape/modules/telegram.py
+++ b/snscrape/modules/telegram.py
@@ -68,7 +68,17 @@ class TelegramChannelScraper(snscrape.base.Scraper):
 			message = post.find('div', class_ = 'tgme_widget_message_text')
 			if message:
 				content = message.text
-				outlinks = [urllib.parse.urljoin(pageUrl, link['href']) for link in post.find_all('a') if not link.text.startswith('@') and link['href'].startswith('https://t.me/')]
+				outlinks = []
+				for link in post.find_all('a'):
+					if any(x in link.parent.attrs.get('class', []) for x in ('tgme_widget_message_user', 'tgme_widget_message_author')):
+						# Author links at the top (avatar and name)
+						continue
+					if link['href'] == f'https://t.me/{post["data-post"]}':
+						# Generic filter of links to the post itself, catches videos, photos, and the date link
+						continue
+					href = urllib.parse.urljoin(pageUrl, link['href'])
+					if href not in outlinks:
+						outlinks.append(href)
 				outlinksss = ' '.join(outlinks)
 			else:
 				content = None