mirror of
https://github.com/bellingcat/snscrape.git
synced 2026-06-13 04:48:28 +03:00
Fix link extraction on Telegram
This commit is contained in:
@@ -68,7 +68,17 @@ class TelegramChannelScraper(snscrape.base.Scraper):
|
|||||||
message = post.find('div', class_ = 'tgme_widget_message_text')
|
message = post.find('div', class_ = 'tgme_widget_message_text')
|
||||||
if message:
|
if message:
|
||||||
content = message.text
|
content = message.text
|
||||||
outlinks = [urllib.parse.urljoin(pageUrl, link['href']) for link in post.find_all('a') if not link.text.startswith('@') and link['href'].startswith('https://t.me/')]
|
outlinks = []
|
||||||
|
for link in post.find_all('a'):
|
||||||
|
if any(x in link.parent.attrs.get('class', []) for x in ('tgme_widget_message_user', 'tgme_widget_message_author')):
|
||||||
|
# Author links at the top (avatar and name)
|
||||||
|
continue
|
||||||
|
if link['href'] == f'https://t.me/{post["data-post"]}':
|
||||||
|
# Generic filter of links to the post itself, catches videos, photos, and the date link
|
||||||
|
continue
|
||||||
|
href = urllib.parse.urljoin(pageUrl, link['href'])
|
||||||
|
if href not in outlinks:
|
||||||
|
outlinks.append(href)
|
||||||
outlinksss = ' '.join(outlinks)
|
outlinksss = ' '.join(outlinks)
|
||||||
else:
|
else:
|
||||||
content = None
|
content = None
|
||||||
|
|||||||
Reference in New Issue
Block a user