From b1d8475a0359299eccac2fe24cd5b2c1ba21de21 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Thu, 1 Oct 2020 18:29:08 +0000 Subject: [PATCH] Fix link extraction on Telegram --- snscrape/modules/telegram.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py index 94546d4..9524cc3 100644 --- a/snscrape/modules/telegram.py +++ b/snscrape/modules/telegram.py @@ -68,7 +68,17 @@ class TelegramChannelScraper(snscrape.base.Scraper): message = post.find('div', class_ = 'tgme_widget_message_text') if message: content = message.text - outlinks = [urllib.parse.urljoin(pageUrl, link['href']) for link in post.find_all('a') if not link.text.startswith('@') and link['href'].startswith('https://t.me/')] + outlinks = [] + for link in post.find_all('a'): + if any(x in link.parent.attrs.get('class', []) for x in ('tgme_widget_message_user', 'tgme_widget_message_author')): + # Author links at the top (avatar and name) + continue + if link['href'] == f'https://t.me/{post["data-post"]}': + # Generic filter of links to the post itself, catches videos, photos, and the date link + continue + href = urllib.parse.urljoin(pageUrl, link['href']) + if href not in outlinks: + outlinks.append(href) outlinksss = ' '.join(outlinks) else: content = None