From babcddda19a0eca7d7413464666cbeb145d76f2d Mon Sep 17 00:00:00 2001
From: Tristan Lee <tristan@bellingcat.com>
Date: Sun, 17 Apr 2022 03:55:37 -0500
Subject: [PATCH] made Telegram scraper not return full channel info for
 forwarded_from attribute; fixed video edge cases.

---
 snscrape/modules/telegram.py | 39 +++++++++++++++++++++++-------------
 1 file changed, 25 insertions(+), 14 deletions(-)

diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py
index 19aa22e..565322c 100644
--- a/snscrape/modules/telegram.py
+++ b/snscrape/modules/telegram.py
@@ -27,9 +27,9 @@ class LinkPreview:
 @dataclasses.dataclass
 class Channel(snscrape.base.Entity):
 	username: str
-	title: str
-	verified: bool
-	photo: str
+	title: typing.Optional[str] = None
+	verified: typing.Optional[bool] = None
+	photo: typing.Optional[str] = None
 	description: typing.Optional[str] = None
 	members: typing.Optional[int] = None
 	photos: typing.Optional[snscrape.base.IntWithGranularity] = None
@@ -123,14 +123,18 @@ class TelegramChannelScraper(snscrape.base.Scraper):
 				content = message.get_text(separator="\n")
 
 				for video_player in post.find_all('a', {'class': 'tgme_widget_message_video_player'}):
-
-					style = video_player.find('i')['style']
-					videoThumbnailUrl = re.findall('url\(\'(.*?)\'\)', style)
-					videoTag = video_player.find('video')
-					if videoTag is None:
-						videoUrl = None
+					iTag = video_player.find('i')
+					if iTag is None:
+						videoUrl = None 
+						videoThumbnailUrl = None
 					else:
-						videoUrl = videoTag['src']
+						style = iTag['style']
+						videoThumbnailUrl = re.findall('url\(\'(.*?)\'\)', style)[0]
+						videoTag = video_player.find('video')
+						if videoTag is None:
+							videoUrl = None
+						else:
+							videoUrl = videoTag['src']
 					mKwargs = {
 						'thumbnailUrl': videoThumbnailUrl,
 						'url': videoUrl,
@@ -146,8 +150,7 @@ class TelegramChannelScraper(snscrape.base.Scraper):
 				if (forward_tag := post.find('a', class_ = 'tgme_widget_message_forwarded_from_name')):
 					forwardedUrl = forward_tag['href']
 					forwardedName = forwardedUrl.split('t.me/')[1].split('/')[0]
-					forwardedChannelScraper = TelegramChannelScraper(name = forwardedName)
-					forwarded = forwardedChannelScraper._get_entity()
+					forwarded = Channel(username = forwardedName)
 
 				outlinks = []
 				for link in post.find_all('a'):
@@ -213,7 +216,7 @@ class TelegramChannelScraper(snscrape.base.Scraper):
 			if not pageLink:
 				break
 			nextPageUrl = urllib.parse.urljoin(r.url, pageLink['href'])
-			r = self._get(nextPageUrl, headers = self._headers)
+			r = self._get(nextPageUrl, headers = self._headers, responseOkCallback = telegramResponseOkCallback)
 			if r.status_code != 200:
 				raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
 			soup = bs4.BeautifulSoup(r.text, 'lxml')
@@ -279,4 +282,12 @@ def parse_num(s):
 	elif s.endswith('K'):
 		return int(float(s[:-1]) * 1000), 10 ** (3 if '.' not in s else 3 - len(s[:-1].split('.')[1]))
 	else:
-		return int(s), 1
\ No newline at end of file
+		return int(s), 1
+
+def telegramResponseOkCallback(r):
+	if r.status_code == 200:
+		return (True, None)
+	elif r.status_code // 100 == 5:
+		return (False, f'status code: {r.status_code}')
+	else:
+		return (False, None)
\ No newline at end of file