mirror of
https://github.com/bellingcat/snscrape.git
synced 2026-06-12 20:38:29 +03:00
Made the Telegram scraper stop returning full channel info for the forwarded_from attribute (only the channel username is set now); fixed video edge cases.
This commit is contained in:
@@ -27,9 +27,9 @@ class LinkPreview:
|
|||||||
@dataclasses.dataclass
|
@dataclasses.dataclass
|
||||||
class Channel(snscrape.base.Entity):
|
class Channel(snscrape.base.Entity):
|
||||||
username: str
|
username: str
|
||||||
title: str
|
title: typing.Optional[str] = None
|
||||||
verified: bool
|
verified: typing.Optional[bool] = None
|
||||||
photo: str
|
photo: typing.Optional[str] = None
|
||||||
description: typing.Optional[str] = None
|
description: typing.Optional[str] = None
|
||||||
members: typing.Optional[int] = None
|
members: typing.Optional[int] = None
|
||||||
photos: typing.Optional[snscrape.base.IntWithGranularity] = None
|
photos: typing.Optional[snscrape.base.IntWithGranularity] = None
|
||||||
@@ -123,14 +123,18 @@ class TelegramChannelScraper(snscrape.base.Scraper):
|
|||||||
content = message.get_text(separator="\n")
|
content = message.get_text(separator="\n")
|
||||||
|
|
||||||
for video_player in post.find_all('a', {'class': 'tgme_widget_message_video_player'}):
|
for video_player in post.find_all('a', {'class': 'tgme_widget_message_video_player'}):
|
||||||
|
iTag = video_player.find('i')
|
||||||
style = video_player.find('i')['style']
|
if iTag is None:
|
||||||
videoThumbnailUrl = re.findall('url\(\'(.*?)\'\)', style)
|
videoUrl = None
|
||||||
videoTag = video_player.find('video')
|
videoThumbnailUrl = None
|
||||||
if videoTag is None:
|
|
||||||
videoUrl = None
|
|
||||||
else:
|
else:
|
||||||
videoUrl = videoTag['src']
|
style = iTag['style']
|
||||||
|
videoThumbnailUrl = re.findall('url\(\'(.*?)\'\)', style)[0]
|
||||||
|
videoTag = video_player.find('video')
|
||||||
|
if videoTag is None:
|
||||||
|
videoUrl = None
|
||||||
|
else:
|
||||||
|
videoUrl = videoTag['src']
|
||||||
mKwargs = {
|
mKwargs = {
|
||||||
'thumbnailUrl': videoThumbnailUrl,
|
'thumbnailUrl': videoThumbnailUrl,
|
||||||
'url': videoUrl,
|
'url': videoUrl,
|
||||||
@@ -146,8 +150,7 @@ class TelegramChannelScraper(snscrape.base.Scraper):
|
|||||||
if (forward_tag := post.find('a', class_ = 'tgme_widget_message_forwarded_from_name')):
|
if (forward_tag := post.find('a', class_ = 'tgme_widget_message_forwarded_from_name')):
|
||||||
forwardedUrl = forward_tag['href']
|
forwardedUrl = forward_tag['href']
|
||||||
forwardedName = forwardedUrl.split('t.me/')[1].split('/')[0]
|
forwardedName = forwardedUrl.split('t.me/')[1].split('/')[0]
|
||||||
forwardedChannelScraper = TelegramChannelScraper(name = forwardedName)
|
forwarded = Channel(username = forwardedName)
|
||||||
forwarded = forwardedChannelScraper._get_entity()
|
|
||||||
|
|
||||||
outlinks = []
|
outlinks = []
|
||||||
for link in post.find_all('a'):
|
for link in post.find_all('a'):
|
||||||
@@ -213,7 +216,7 @@ class TelegramChannelScraper(snscrape.base.Scraper):
|
|||||||
if not pageLink:
|
if not pageLink:
|
||||||
break
|
break
|
||||||
nextPageUrl = urllib.parse.urljoin(r.url, pageLink['href'])
|
nextPageUrl = urllib.parse.urljoin(r.url, pageLink['href'])
|
||||||
r = self._get(nextPageUrl, headers = self._headers)
|
r = self._get(nextPageUrl, headers = self._headers, responseOkCallback = telegramResponseOkCallback)
|
||||||
if r.status_code != 200:
|
if r.status_code != 200:
|
||||||
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
||||||
soup = bs4.BeautifulSoup(r.text, 'lxml')
|
soup = bs4.BeautifulSoup(r.text, 'lxml')
|
||||||
@@ -279,4 +282,12 @@ def parse_num(s):
|
|||||||
elif s.endswith('K'):
|
elif s.endswith('K'):
|
||||||
return int(float(s[:-1]) * 1000), 10 ** (3 if '.' not in s else 3 - len(s[:-1].split('.')[1]))
|
return int(float(s[:-1]) * 1000), 10 ** (3 if '.' not in s else 3 - len(s[:-1].split('.')[1]))
|
||||||
else:
|
else:
|
||||||
return int(s), 1
|
return int(s), 1
|
||||||
|
|
||||||
|
def telegramResponseOkCallback(r):
|
||||||
|
if r.status_code == 200:
|
||||||
|
return (True, None)
|
||||||
|
elif r.status_code // 100 == 5:
|
||||||
|
return (False, f'status code: {r.status_code}')
|
||||||
|
else:
|
||||||
|
return (False, None)
|
||||||
Reference in New Issue
Block a user