Made the Telegram scraper return only the channel username, rather than the full channel info, for the forwarded_from attribute; fixed video edge cases.

This commit is contained in:
Tristan Lee
2022-04-17 03:55:37 -05:00
parent 4e59638e7c
commit babcddda19

View File

@@ -27,9 +27,9 @@ class LinkPreview:
@dataclasses.dataclass @dataclasses.dataclass
class Channel(snscrape.base.Entity): class Channel(snscrape.base.Entity):
username: str username: str
title: str title: typing.Optional[str] = None
verified: bool verified: typing.Optional[bool] = None
photo: str photo: typing.Optional[str] = None
description: typing.Optional[str] = None description: typing.Optional[str] = None
members: typing.Optional[int] = None members: typing.Optional[int] = None
photos: typing.Optional[snscrape.base.IntWithGranularity] = None photos: typing.Optional[snscrape.base.IntWithGranularity] = None
@@ -123,14 +123,18 @@ class TelegramChannelScraper(snscrape.base.Scraper):
content = message.get_text(separator="\n") content = message.get_text(separator="\n")
for video_player in post.find_all('a', {'class': 'tgme_widget_message_video_player'}): for video_player in post.find_all('a', {'class': 'tgme_widget_message_video_player'}):
iTag = video_player.find('i')
style = video_player.find('i')['style'] if iTag is None:
videoThumbnailUrl = re.findall('url\(\'(.*?)\'\)', style) videoUrl = None
videoTag = video_player.find('video') videoThumbnailUrl = None
if videoTag is None:
videoUrl = None
else: else:
videoUrl = videoTag['src'] style = iTag['style']
videoThumbnailUrl = re.findall('url\(\'(.*?)\'\)', style)[0]
videoTag = video_player.find('video')
if videoTag is None:
videoUrl = None
else:
videoUrl = videoTag['src']
mKwargs = { mKwargs = {
'thumbnailUrl': videoThumbnailUrl, 'thumbnailUrl': videoThumbnailUrl,
'url': videoUrl, 'url': videoUrl,
@@ -146,8 +150,7 @@ class TelegramChannelScraper(snscrape.base.Scraper):
if (forward_tag := post.find('a', class_ = 'tgme_widget_message_forwarded_from_name')): if (forward_tag := post.find('a', class_ = 'tgme_widget_message_forwarded_from_name')):
forwardedUrl = forward_tag['href'] forwardedUrl = forward_tag['href']
forwardedName = forwardedUrl.split('t.me/')[1].split('/')[0] forwardedName = forwardedUrl.split('t.me/')[1].split('/')[0]
forwardedChannelScraper = TelegramChannelScraper(name = forwardedName) forwarded = Channel(username = forwardedName)
forwarded = forwardedChannelScraper._get_entity()
outlinks = [] outlinks = []
for link in post.find_all('a'): for link in post.find_all('a'):
@@ -213,7 +216,7 @@ class TelegramChannelScraper(snscrape.base.Scraper):
if not pageLink: if not pageLink:
break break
nextPageUrl = urllib.parse.urljoin(r.url, pageLink['href']) nextPageUrl = urllib.parse.urljoin(r.url, pageLink['href'])
r = self._get(nextPageUrl, headers = self._headers) r = self._get(nextPageUrl, headers = self._headers, responseOkCallback = telegramResponseOkCallback)
if r.status_code != 200: if r.status_code != 200:
raise snscrape.base.ScraperException(f'Got status code {r.status_code}') raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
soup = bs4.BeautifulSoup(r.text, 'lxml') soup = bs4.BeautifulSoup(r.text, 'lxml')
@@ -279,4 +282,12 @@ def parse_num(s):
elif s.endswith('K'): elif s.endswith('K'):
return int(float(s[:-1]) * 1000), 10 ** (3 if '.' not in s else 3 - len(s[:-1].split('.')[1])) return int(float(s[:-1]) * 1000), 10 ** (3 if '.' not in s else 3 - len(s[:-1].split('.')[1]))
else: else:
return int(s), 1 return int(s), 1
def telegramResponseOkCallback(r):
    """Classify a response by its HTTP status code.

    Returns a ``(ok, message)`` tuple:

    * ``(True, None)`` when the status code is exactly 200;
    * ``(False, 'status code: <code>')`` when the status code is in the
      5xx range;
    * ``(False, None)`` for every other status code.

    ``r`` only needs to expose a ``status_code`` attribute.
    """
    status = r.status_code
    if status == 200:
        return (True, None)
    # Only server-side (5xx) failures carry a descriptive message; any
    # other non-200 status is reported as a failure without one.
    message = f'status code: {r.status_code}' if status // 100 == 5 else None
    return (False, message)