From b1a7b9607f5294ec3f2a51fbfa3468e8f21df50b Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Wed, 7 Oct 2020 01:27:26 +0000 Subject: [PATCH] Skip individual Telegram photo/video links --- snscrape/modules/telegram.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py index 8ecbb32..5e83917 100644 --- a/snscrape/modules/telegram.py +++ b/snscrape/modules/telegram.py @@ -1,12 +1,14 @@ import bs4 import datetime import logging +import re import snscrape.base import typing import urllib.parse logger = logging.getLogger(__name__) +_SINGLE_MEDIA_LINK_PATTERN = re.compile(r'^https://t\.me/[^/]+/\d+\?single$') class LinkPreview(typing.NamedTuple): @@ -89,6 +91,9 @@ class TelegramChannelScraper(snscrape.base.Scraper): if link['href'] == rawUrl or link['href'] == url: # Generic filter of links to the post itself, catches videos, photos, and the date link continue + if _SINGLE_MEDIA_LINK_PATTERN.match(link['href']): + # Individual photo or video link + continue href = urllib.parse.urljoin(pageUrl, link['href']) if href not in outlinks: outlinks.append(href)