mirror of
https://github.com/bellingcat/snscrape.git
synced 2026-06-08 02:28:29 +03:00
Skip individual Telegram photo/video links
This commit is contained in:
@@ -1,12 +1,14 @@
|
||||
import bs4
|
||||
import datetime
|
||||
import logging
|
||||
import re
|
||||
import snscrape.base
|
||||
import typing
|
||||
import urllib.parse
|
||||
|
||||
|
||||
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
||||
# Matches URLs of the exact form https://t.me/<segment>/<digits>?single, where
# <segment> is one non-empty path component containing no slash. The scheme is
# fixed to https and the query string must be exactly "single". The scraper
# treats hrefs matching this pattern as links to an individual photo or video
# and leaves them out of the collected outlinks.
_SINGLE_MEDIA_LINK_PATTERN = re.compile(r'^https://t\.me/[^/]+/\d+\?single$')
|
||||
|
||||
|
||||
class LinkPreview(typing.NamedTuple):
|
||||
@@ -89,6 +91,9 @@ class TelegramChannelScraper(snscrape.base.Scraper):
|
||||
if link['href'] == rawUrl or link['href'] == url:
|
||||
# Generic filter of links to the post itself, catches videos, photos, and the date link
|
||||
continue
|
||||
if _SINGLE_MEDIA_LINK_PATTERN.match(link['href']):
|
||||
# Individual photo or video link
|
||||
continue
|
||||
href = urllib.parse.urljoin(pageUrl, link['href'])
|
||||
if href not in outlinks:
|
||||
outlinks.append(href)
|
||||
|
||||
Reference in New Issue
Block a user