mirror of
https://github.com/bellingcat/snscrape.git
synced 2026-06-12 20:38:29 +03:00
Skip individual Telegram photo/video links
This commit is contained in:
@@ -1,12 +1,14 @@
|
|||||||
import bs4
|
import bs4
|
||||||
import datetime
|
import datetime
|
||||||
import logging
|
import logging
|
||||||
|
import re
|
||||||
import snscrape.base
|
import snscrape.base
|
||||||
import typing
|
import typing
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
_SINGLE_MEDIA_LINK_PATTERN = re.compile(r'^https://t\.me/[^/]+/\d+\?single$')
|
||||||
|
|
||||||
|
|
||||||
class LinkPreview(typing.NamedTuple):
|
class LinkPreview(typing.NamedTuple):
|
||||||
@@ -89,6 +91,9 @@ class TelegramChannelScraper(snscrape.base.Scraper):
|
|||||||
if link['href'] == rawUrl or link['href'] == url:
|
if link['href'] == rawUrl or link['href'] == url:
|
||||||
# Generic filter of links to the post itself, catches videos, photos, and the date link
|
# Generic filter of links to the post itself, catches videos, photos, and the date link
|
||||||
continue
|
continue
|
||||||
|
if _SINGLE_MEDIA_LINK_PATTERN.match(link['href']):
|
||||||
|
# Individual photo or video link
|
||||||
|
continue
|
||||||
href = urllib.parse.urljoin(pageUrl, link['href'])
|
href = urllib.parse.urljoin(pageUrl, link['href'])
|
||||||
if href not in outlinks:
|
if href not in outlinks:
|
||||||
outlinks.append(href)
|
outlinks.append(href)
|
||||||
|
|||||||
Reference in New Issue
Block a user