Skip individual Telegram photo/video links

This commit is contained in:
JustAnotherArchivist
2020-10-07 01:27:26 +00:00
parent 119e53d07c
commit b1a7b9607f

View File

@@ -1,12 +1,14 @@
 import bs4
 import datetime
 import logging
+import re
 import snscrape.base
 import typing
 import urllib.parse
 logger = logging.getLogger(__name__)
+_SINGLE_MEDIA_LINK_PATTERN = re.compile(r'^https://t\.me/[^/]+/\d+\?single$')
 class LinkPreview(typing.NamedTuple):
@@ -89,6 +91,9 @@ class TelegramChannelScraper(snscrape.base.Scraper):
 if link['href'] == rawUrl or link['href'] == url:
 # Generic filter of links to the post itself, catches videos, photos, and the date link
 continue
+if _SINGLE_MEDIA_LINK_PATTERN.match(link['href']):
+# Individual photo or video link
+continue
 href = urllib.parse.urljoin(pageUrl, link['href'])
 if href not in outlinks:
 outlinks.append(href)