Merge pull request #7 from bellingcat/more-tg-info

More tg info
Merge pull request #6 from bellingcat/add-vk-user
2026-06-11 03:48:29 +03:00 · 2022-07-05 08:29:09 -07:00 · 2022-07-05 08:28:01 -07:00 · 2022-07-05 08:25:20 -07:00 · 2022-07-05 10:23:26 -05:00 · 2022-07-05 10:21:59 -05:00
6 changed files with 845 additions and 155 deletions
--- a/snscrape/_cli.py
+++ b/snscrape/_cli.py
@@ -133,12 +133,22 @@ def _dump_stack_and_locals(trace, exc = None):
 		fp.write('Stack:\n')
 		for frameRecord in trace:
 			fp.write(f'  File "{frameRecord.filename}", line {frameRecord.lineno}, in {frameRecord.function}\n')
-			for line in frameRecord.code_context:
+			if frameRecord.code_context is not None:
-				fp.write(f'    {line.strip()}\n')
+				for line in frameRecord.code_context:
 					fp.write(f'    {line.strip()}\n')
 		fp.write('\n')
-		for frameRecord in trace:
+		modules = [inspect.getmodule(frameRecord[0]) for frameRecord in trace]
-			module = inspect.getmodule(frameRecord[0])
+		for i, (module, frameRecord) in enumerate(zip(modules, trace)):
 			if module is None:
 				# Module-less frame, e.g. dataclass.__init__
 				for j in reversed(range(i)):
 					if modules[j] is not None:
 						break
 				else:
 					# No previous module scope
 					continue
 				module = modules[j]
 			if not module.__name__.startswith('snscrape.') and module.__name__ != 'snscrape':
 				continue
 			locals_ = frameRecord[0].f_locals
--- a/snscrape/base.py
+++ b/snscrape/base.py
@@ -163,16 +163,19 @@ class Scraper:
 		return self._get_entity()
 	def _request(self, method, url, params = None, data = None, headers = None, timeout = 10, responseOkCallback = None, allowRedirects = True, proxies = None):
-		proxies = proxies or self._proxies
+		proxies = proxies or self._proxies or {}
 		for attempt in range(self._retries + 1):
 			# The request is newly prepared on each retry because of potential cookie updates.
 			req = self._session.prepare_request(requests.Request(method, url, params = params, data = data, headers = headers))
 			environmentSettings = self._session.merge_environment_settings(req.url, proxies, None, None, None)
 			logger.info(f'Retrieving {req.url}')
 			logger.debug(f'... with headers: {headers!r}')
 			if data:
 				logger.debug(f'... with data: {data!r}')
 			if environmentSettings:
 				logger.debug(f'... with environmentSettings: {environmentSettings!r}')
 			try:
-				r = self._session.send(req, allow_redirects = allowRedirects, timeout = timeout, proxies = proxies)
+				r = self._session.send(req, allow_redirects = allowRedirects, timeout = timeout, **environmentSettings)
 			except requests.exceptions.RequestException as exc:
 				if attempt < self._retries:
 					retrying = ', retrying'
@@ -226,7 +229,7 @@ class Scraper:
 	@classmethod
 	def _cli_from_args(cls, args):
-		return cls._construct(args)
+		return cls._cli_construct(args)
 	@classmethod
 	def _cli_construct(cls, argparseArgs, *args, **kwargs):
--- a/snscrape/modules/reddit.py
+++ b/snscrape/modules/reddit.py
@@ -20,7 +20,7 @@ _logger = logging.getLogger(__name__)
@dataclasses.dataclass
 class Submission(snscrape.base.Item):
 	author: typing.Optional[str] # E.g. submission hf7k6
-	created: datetime.datetime
+	date: datetime.datetime
 	id: str
 	link: typing.Optional[str]
 	selftext: typing.Optional[str]
@@ -28,6 +28,8 @@ class Submission(snscrape.base.Item):
 	title: str
 	url: str
 	created = snscrape.base._DeprecatedProperty('created', lambda self: self.date, 'date')
 	def __str__(self):
 		return self.url
@@ -36,12 +38,14 @@ class Submission(snscrape.base.Item):
 class Comment(snscrape.base.Item):
 	author: typing.Optional[str]
 	body: str
-	created: datetime.datetime
+	date: datetime.datetime
 	id: str
 	parentId: typing.Optional[str]
 	subreddit: typing.Optional[str]
 	url: str
 	created = snscrape.base._DeprecatedProperty('created', lambda self: self.date, 'date')
 	def __str__(self):
 		return self.url
@@ -111,7 +115,7 @@ class _RedditPushshiftScraper(snscrape.base.Scraper):
 		kwargs = {
 			'author': d.get('author'),
-			'created': datetime.datetime.fromtimestamp(d['created_utc'], datetime.timezone.utc),
+			'date': datetime.datetime.fromtimestamp(d['created_utc'], datetime.timezone.utc),
 			'url': f'https://old.reddit.com{permalink}',
 			'subreddit': d.get('subreddit'),
 		}
@@ -192,7 +196,7 @@ class _RedditPushshiftSearchScraper(_RedditPushshiftScraper):
 		while True:
 			# Return newer first; if both have the same creation datetime, return the comment first
-			if tipSubmission.created > tipComment.created:
+			if tipSubmission.date > tipComment.date:
 				yield tipSubmission
 				try:
 					tipSubmission = next(submissionsIter)
--- a/snscrape/modules/telegram.py
+++ b/snscrape/modules/telegram.py
@@ -9,7 +9,6 @@ import re
 import snscrape.base
 import typing
 import urllib.parse
 import base64
 _logger = logging.getLogger(__name__)
 _SINGLE_MEDIA_LINK_PATTERN = re.compile(r'^https://t\.me/[^/]+/\d+\?single$')
@@ -57,7 +56,7 @@ class TelegramPost(snscrape.base.Item):
 	forwarded: typing.Optional['Channel'] = None
 	forwardedUrl: typing.Optional[str] = None
 	media: typing.Optional[typing.List['Medium']] = None
-	views: typing.Optional[int] = None
+	views: typing.Optional[snscrape.base.IntWithGranularity] = None
 	linkPreview: typing.Optional[LinkPreview] = None
 	outlinksss = snscrape.base._DeprecatedProperty('outlinksss', lambda self: ' '.join(self.outlinks), 'outlinks')
@@ -176,7 +175,7 @@ class TelegramChannelScraper(snscrape.base.Scraper):
 			for voicePlayer in post.find_all('a', {'class': 'tgme_widget_message_voice_player'}):
 				audioUrl = voicePlayer.find('audio')['src']
 				durationStr = voicePlayer.find('time').text
-				duration = durationStrToSeconds(durationStr)
+				duration = _durationStrToSeconds(durationStr)
 				barHeights = [float(s['style'].split(':')[-1].strip(';%')) for s in voicePlayer.find('div', {'class': 'bar'}).find_all('s')]
 				media.append(VoiceMessage(url = audioUrl, duration = duration, bars = barHeights))
@@ -201,7 +200,7 @@ class TelegramChannelScraper(snscrape.base.Scraper):
 				else:
 					cls = Video
 					durationStr = videoPlayer.find('time').text
-					mKwargs['duration'] = durationStrToSeconds(durationStr)
+					mKwargs['duration'] = _durationStrToSeconds(durationStr)
 				media.append(cls(**mKwargs))
 			linkPreview = None
@@ -224,7 +223,12 @@ class TelegramChannelScraper(snscrape.base.Scraper):
 					outlinks.remove(kwargs['href'])
 			viewsSpan = post.find('span', class_ = 'tgme_widget_message_views')
-			views = None if viewsSpan is None else parse_num(viewsSpan.text)
+			views = None if viewsSpan is None else _parse_num(viewsSpan.text)
 			outlinks = outlinks if outlinks else None
 			media = media if media else None
 			mentions = mentions if mentions else None
 			hashtags = hashtags if hashtags else None
 			yield TelegramPost(url = url, date = date, content = content, outlinks = outlinks, mentions = mentions, hashtags = hashtags, linkPreview = linkPreview, media = media, forwarded = forwarded, forwardedUrl = forwardedUrl, views = views)
@@ -253,7 +257,7 @@ class TelegramChannelScraper(snscrape.base.Scraper):
 				else:
 					break
 			nextPageUrl = urllib.parse.urljoin(r.url, pageLink['href'])
-			r = self._get(nextPageUrl, headers = self._headers, responseOkCallback = telegramResponseOkCallback)
+			r = self._get(nextPageUrl, headers = self._headers, responseOkCallback = _telegramResponseOkCallback)
 			if r.status_code != 200:
 				raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
 			soup = bs4.BeautifulSoup(r.text, 'lxml')
@@ -266,8 +270,12 @@ class TelegramChannelScraper(snscrape.base.Scraper):
 			raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
 		soup = bs4.BeautifulSoup(r.text, 'lxml')
 		membersDiv = soup.find('div', class_ = 'tgme_page_extra')
-		if membersDiv.text.endswith((' members', ' subscribers')):
+		if membersDiv.text.split(',')[0].endswith((' members', ' subscribers')):
-			kwargs['members'] = int(''.join(membersDiv.text.split(' ')[:-1]))
+			membersStr = ''.join(membersDiv.text.split(',')[0].split(' ')[:-1])
 			if membersStr == 'no':
 				kwargs['members'] = 0
 			else:
 				kwargs['members'] = int(membersStr)
 		photoImg = soup.find('img', class_ = 'tgme_page_photo_image')
 		if photoImg is not None:
 			kwargs['photo'] = photoImg.attrs['src']
@@ -294,7 +302,7 @@ class TelegramChannelScraper(snscrape.base.Scraper):
 			kwargs['description'] = descriptionDiv.text
 		for div in channelInfoDiv.find_all('div', class_ = 'tgme_channel_info_counter'):
-			value, granularity = parse_num(div.find('span', class_ = 'counter_value').text)
+			value, granularity = _parse_num(div.find('span', class_ = 'counter_value').text)
 			type_ = div.find('span', class_ = 'counter_type').text
 			if type_ == 'members':
 				# Already extracted more accurately from /channel, skip
@@ -312,7 +320,7 @@ class TelegramChannelScraper(snscrape.base.Scraper):
 	def _cli_from_args(cls, args):
 		return cls._cli_construct(args, args.channel)
-def parse_num(s):
+def _parse_num(s):
 	s = s.replace(' ', '')
 	if s.endswith('M'):
 		return int(float(s[:-1]) * 1e6), 10 ** (6 if '.' not in s else 6 - len(s[:-1].split('.')[1]))
@@ -320,11 +328,11 @@ def parse_num(s):
 		return int(float(s[:-1]) * 1000), 10 ** (3 if '.' not in s else 3 - len(s[:-1].split('.')[1]))
 	return int(s), 1
-def durationStrToSeconds(durationStr):
+def _durationStrToSeconds(durationStr):
 	durationList = durationStr.split(':')
-	return sum([int(s) * int(g) for s, g in zip([1, 60, 360], reversed(durationList))])
+	return sum([int(s) * int(g) for s, g in zip([1, 60, 3600], reversed(durationList))])
-def telegramResponseOkCallback(r):
+def _telegramResponseOkCallback(r):
 	if r.status_code == 200:
 		return (True, None)
 	return (False, f'{r.status_code=}')
--- a/snscrape/modules/twitter.py
+++ b/snscrape/modules/twitter.py
--- a/snscrape/modules/vkontakte.py
+++ b/snscrape/modules/vkontakte.py
@@ -32,17 +32,41 @@ _logger = logging.getLogger(__name__)
 _months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
 _datePattern = re.compile(r'^(?P<date>today'
                                  r'|yesterday'
-                                  r'|(?P<day1>\d+)\s+(?P<month1>' + '|'.join(_months) + ')(\s+(?P<year1>\d{4}))?'
+                                  r'|(?P<day1>\d+)\s+(?P<month1>' + '|'.join(_months) + r')(\s+(?P<year1>\d{4}))?'
                                  r'|(?P<month2>' + '|'.join(_months) + r')\s+(?P<day2>\d+),\s+(?P<year2>\d{4})'
                           ')'
                          r'\s+at\s+(?P<hour>\d+):(?P<minute>\d+)\s+(?P<ampm>[ap]m)$')
@dataclasses.dataclass
 class User(snscrape.base.Entity):
 	username: str
 	name: str
 	verified: bool
 	description: typing.Optional[str] = None
 	websites: typing.Optional[typing.List[str]] = None
 	followers: typing.Optional[snscrape.base.IntWithGranularity] = None
 	posts: typing.Optional[snscrape.base.IntWithGranularity] = None
 	photos: typing.Optional[snscrape.base.IntWithGranularity] = None
 	tags: typing.Optional[snscrape.base.IntWithGranularity] = None
 	following: typing.Optional[snscrape.base.IntWithGranularity] = None
 	followersGranularity = snscrape.base._DeprecatedProperty('followersGranularity', lambda self: self.followers.granularity, 'followers.granularity')
 	postsGranularity = snscrape.base._DeprecatedProperty('postsGranularity', lambda self: self.posts.granularity, 'posts.granularity')
 	photosGranularity = snscrape.base._DeprecatedProperty('photosGranularity', lambda self: self.photos.granularity, 'photos.granularity')
 	tagsGranularity = snscrape.base._DeprecatedProperty('tagsGranularity', lambda self: self.tags.granularity, 'tags.granularity')
 	followingGranularity = snscrape.base._DeprecatedProperty('followingGranularity', lambda self: self.following.granularity, 'following.granularity')
 	def __str__(self):
 		return f'https://vk.com/{self.username}'
@dataclasses.dataclass
 class VKontaktePost(snscrape.base.Item):
 	url: str
 	date: typing.Optional[typing.Union[datetime.datetime, datetime.date]]
 	content: str
 	user: User
 	outlinks: typing.Optional[typing.List[str]] = None
 	photos: typing.Optional[typing.List['Photo']] = None
 	video: typing.Optional['Video'] = None
@@ -74,29 +98,6 @@ class Video:
 	thumbUrl: str
@dataclasses.dataclass
 class User(snscrape.base.Entity):
 	username: str
 	name: str
 	verified: bool
 	description: typing.Optional[str] = None
 	websites: typing.Optional[typing.List[str]] = None
 	followers: typing.Optional[snscrape.base.IntWithGranularity] = None
 	posts: typing.Optional[snscrape.base.IntWithGranularity] = None
 	photos: typing.Optional[snscrape.base.IntWithGranularity] = None
 	tags: typing.Optional[snscrape.base.IntWithGranularity] = None
 	following: typing.Optional[snscrape.base.IntWithGranularity] = None
 	followersGranularity = snscrape.base._DeprecatedProperty('followersGranularity', lambda self: self.followers.granularity, 'followers.granularity')
 	postsGranularity = snscrape.base._DeprecatedProperty('postsGranularity', lambda self: self.posts.granularity, 'posts.granularity')
 	photosGranularity = snscrape.base._DeprecatedProperty('photosGranularity', lambda self: self.photos.granularity, 'photos.granularity')
 	tagsGranularity = snscrape.base._DeprecatedProperty('tagsGranularity', lambda self: self.tags.granularity, 'tags.granularity')
 	followingGranularity = snscrape.base._DeprecatedProperty('followingGranularity', lambda self: self.following.granularity, 'following.granularity')
 	def __str__(self):
 		return f'https://vk.com/{self.username}'
 class VKontakteUserScraper(snscrape.base.Scraper):
 	name = 'vkontakte-user'
@@ -177,15 +178,11 @@ class VKontakteUserScraper(snscrape.base.Scraper):
 					continue
 				if 'data-video' in a.attrs:
 					# Video
 					if 'data-link-attr' in a.attrs:
 						hrefUrl = urllib.parse.unquote(a.attrs['data-link-attr'].split('to=')[1].split('&')[0])
 					else:
 						hrefUrl = f'https://vk.com{a["href"]}'
 					video = Video(
 						id = a['data-video'],
 						list = a['data-list'],
 						duration = int(a['data-duration']),
-						url = hrefUrl,
+						url = f'https://vk.com{a["href"]}',
 						thumbUrl = a['style'][(begin := a['style'].find('background-image: url(') + 22) : a['style'].find(')', begin)],
 					)
 					continue
@@ -216,14 +213,24 @@ class VKontakteUserScraper(snscrape.base.Scraper):
 				photoUrl = f'https://vk.com{a["href"]}' if 'href' in a.attrs and a['href'].startswith('/photo') and a['href'][6:].strip('0123456789-_') == '' else None
 				photos.append(Photo(variants = photoVariants, url = photoUrl))
 		quotedPost = self._post_div_to_item(quoteDiv, isCopy = True) if (quoteDiv := post.find('div', class_ = 'copy_quote')) else None
 		authorHeading = post.find('h5', class_ = ['post_author', 'copy_post_author'])
 		authorLink = authorHeading.find('a', class_ = ['author', 'copy_author'])
 		username = authorLink['href'].split('/')[-1]
 		name = authorLink.text
 		if authorHeading.find('div', class_ = 'page_verified') is not None:
 			verified = True
 		else:
 			verified = False
 		user = User(username = username, name = name, verified = verified)
 		return VKontaktePost(
-		  url = url,
+			url = url,
-		  date = self._date_span_to_date(dateSpan),
+			date = self._date_span_to_date(dateSpan),
-		  content = textDiv.text if textDiv else None,
+			content = textDiv.text if textDiv else None,
-		  outlinks = outlinks or None,
+			user = user,
-		  photos = photos or None,
+			outlinks = outlinks or None,
-		  video = video or None,
+			photos = photos or None,
-		  quotedPost = quotedPost,
+			video = video or None,
 			quotedPost = quotedPost,
 		 )
 	def _soup_to_items(self, soup):
@@ -380,6 +387,13 @@ class VKontakteUserScraper(snscrape.base.Scraper):
 		if (followersDiv := soup.find('div', id = 'public_followers')):
 			if (topDiv := followersDiv.find('div', class_ = 'header_top')) and topDiv.find('span', class_ = 'header_label').text == 'Followers':
 				kwargs['followers'] = snscrape.base.IntWithGranularity(*parse_num(topDiv.find('span', class_ = 'header_count').text))
 		# On community groups, this is where followers are listed
 		elif (followersDiv := soup.find('div', class_ = 'group_friends_text')):
 			kwargs['followers'] = snscrape.base.IntWithGranularity(*parse_num(followersDiv.find('span', class_ = 'group_friends_count').text))
 		# On public groups, this is where followers are listed
 		elif (followersDiv := soup.find('div', id = 'group_followers')):
 			if (topDiv := followersDiv.find('div', class_ = 'header_top')) and topDiv.find('span', class_ = 'header_label').text == 'Members':
 				kwargs['followers'] = snscrape.base.IntWithGranularity(*parse_num(topDiv.find('span', class_ = 'header_count').text))
 		return User(**kwargs)
@@ -389,4 +403,4 @@ class VKontakteUserScraper(snscrape.base.Scraper):
 	@classmethod
 	def _cli_from_args(cls, args):
-		return cls._cli_construct(args, args.username)
+		return cls._cli_construct(args, args.username)
Author	SHA1	Message	Date
Tristan Lee	40b8d9f267	Merge pull request #7 from bellingcat/more-tg-info: More tg info	2022-07-05 08:29:09 -07:00
Tristan Lee	fdc40f7411	Merge pull request #6 from bellingcat/add-vk-user: added User dataclass as argument to VKontaktePost dataclass	2022-07-05 08:28:01 -07:00
Tristan Lee	82351800d6	Merge pull request #5 from JustAnotherArchivist/master: merge upstream	2022-07-05 08:25:20 -07:00
Tristan Lee	73f10a4f24	fixed edge case where channel with no members fails _get_entity	2022-07-05 10:23:26 -05:00
Tristan Lee	cb429909d0	added User dataclass as argument to VKontaktePost dataclass	2022-07-05 10:21:59 -05:00
JustAnotherArchivist	d72b51953f	Fix missing r prefix on string with regex backslashes	2022-06-24 23:12:50 +00:00
Tristan Lee	056cd6215c	incorporated requested changes from maintainer, removed modifications to VK module	2022-06-23 15:47:18 -05:00
JustAnotherArchivist	d5b406bc1b	Update API parameters to what Twitter currently uses. The `count` reduction does not affect anything as Twitter ignores that parameter now. Cf. #481	2022-06-23 19:50:17 +00:00
Tristan Lee	56e4232083	fixed typo	2022-06-23 11:51:13 -05:00
JustAnotherArchivist	50899c01f3	Fix crash on malformed guest token cache file. Fixes #494	2022-06-16 17:12:04 +00:00
JustAnotherArchivist	bcad6923c2	Rename Tweet.content to rawContent and User.description to renderedDescription for consistency. Closes #479	2022-06-14 00:35:02 +00:00
JustAnotherArchivist	0d361685ff	Fix AttributeError crash on scrapers using the default CLI constructor. Introduced by `267b7d0e`. Fixes #483	2022-06-01 17:35:38 +00:00
JustAnotherArchivist	530f4fa122	Fix KeyErrors on display_url and expanded_url for certain users with broken profile links. Fixes #480	2022-05-29 17:23:43 +00:00
JustAnotherArchivist	dc6bc9bf9d	Refactor how links on Twitter are handled. All links in text (tweets, profile descriptions, and profile links) are now represented by TextLink objects, which contain all relevant information: the displayed text (if available), the URL, the short t.co URL, and the indices in the text at which it appears. Closes #478	2022-05-29 07:16:04 +00:00
JustAnotherArchivist	01cf6a09b3	Fix type of description URL objects	2022-05-29 05:08:23 +00:00
JustAnotherArchivist	ef7c4fad3e	Fix AttributeError for DescriptionURL on from-import	2022-05-29 05:08:23 +00:00
Tristan Lee	65723f10ff	fixed merge	2022-05-25 06:47:47 -05:00
Tristan Lee	07a5f6fd7d	merged master into more-tg-info to update upstream PR	2022-05-25 01:18:48 -05:00
Tristan Lee	0822a9c354	Merge pull request #4 from JustAnotherArchivist/master: upstream merge	2022-05-24 23:10:38 -07:00
JustAnotherArchivist	faeffe2603	Merge pull request #474 from GeraniumKF/GeraniumKF-reddit-since-crash: Fix crash using --since with Reddit	2022-05-23 23:06:16 +00:00
Geranium	e3bdc02a7c	Reddit: deprecate 'created' property for 'date'. This fixes a crash when using --since with the Reddit scraper, as the CLI code expects items to have a date property.	2022-05-23 23:31:44 +01:00
Tristan Lee	e2d922301e	forgot to save modified twitter.py module	2022-05-09 09:37:36 -05:00
Tristan Lee	b13e62eb5d	Merge branch 'JustAnotherArchivist-master'	2022-05-09 09:35:35 -05:00
Tristan Lee	f38513503d	fixed merge conflicts	2022-05-09 09:35:19 -05:00
Tristan Lee	0a4bd39ca6	Merge pull request #2 from bellingcat/telegram-media: Implemented JustAnotherArchivist's requested changes to Telegram scraper from PR	2022-05-09 07:23:39 -07:00
JustAnotherArchivist	ed3ea944d1	Fix newsletter issue cards without an issue description. Fixes #456	2022-04-16 19:44:36 +00:00
JustAnotherArchivist	e7a6d38a5f	Add support for community_details cards	2022-04-15 20:07:01 +00:00
JustAnotherArchivist	6c50eee31b	Fix proxies not being applied correctly due to missing merge with environment settings. Fixes #447	2022-04-15 19:23:54 +00:00
JustAnotherArchivist	5103a33afa	Fix t.co card URL replacement on retweets. Fixes #411	2022-04-15 03:18:45 +00:00
JustAnotherArchivist	247bd82d79	Refactor to tweetId variable	2022-04-15 03:14:29 +00:00
JustAnotherArchivist	5fc67f2bcf	Add support for 'message me' cards	2022-04-15 02:52:37 +00:00
JustAnotherArchivist	65e7d8bd24	Fix warning on card URL translation to include the tweet ID	2022-04-15 02:52:03 +00:00
JustAnotherArchivist	3870282a42	Fix broadcast and event card crashes	2022-04-12 20:53:38 +00:00
JustAnotherArchivist	7c0fcdec43	Fix Periscope card crashes	2022-04-12 18:29:51 +00:00
JustAnotherArchivist	9af1f19034	Properly support all card types. Fixes #407	2022-04-12 18:11:26 +00:00
JustAnotherArchivist	5fc3c0e290	Fix crash in locals dumping on module-less frames	2022-04-12 18:03:36 +00:00
Logan Williams	b8efce2a12	Clean up unnecessary imports	2022-03-08 15:10:15 +01:00