mirror of
https://github.com/bellingcat/snscrape.git
synced 2026-06-09 10:58:28 +03:00
Compare commits
37 Commits
telegram-m
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
40b8d9f267 | ||
|
|
fdc40f7411 | ||
|
|
82351800d6 | ||
|
|
73f10a4f24 | ||
|
|
cb429909d0 | ||
|
|
d72b51953f | ||
|
|
056cd6215c | ||
|
|
d5b406bc1b | ||
|
|
56e4232083 | ||
|
|
50899c01f3 | ||
|
|
bcad6923c2 | ||
|
|
0d361685ff | ||
|
|
530f4fa122 | ||
|
|
dc6bc9bf9d | ||
|
|
01cf6a09b3 | ||
|
|
ef7c4fad3e | ||
|
|
65723f10ff | ||
|
|
07a5f6fd7d | ||
|
|
0822a9c354 | ||
|
|
faeffe2603 | ||
|
|
e3bdc02a7c | ||
|
|
e2d922301e | ||
|
|
b13e62eb5d | ||
|
|
f38513503d | ||
|
|
0a4bd39ca6 | ||
|
|
ed3ea944d1 | ||
|
|
e7a6d38a5f | ||
|
|
6c50eee31b | ||
|
|
5103a33afa | ||
|
|
247bd82d79 | ||
|
|
5fc67f2bcf | ||
|
|
65e7d8bd24 | ||
|
|
3870282a42 | ||
|
|
7c0fcdec43 | ||
|
|
9af1f19034 | ||
|
|
5fc3c0e290 | ||
|
|
b8efce2a12 |
@@ -133,12 +133,22 @@ def _dump_stack_and_locals(trace, exc = None):
|
||||
fp.write('Stack:\n')
|
||||
for frameRecord in trace:
|
||||
fp.write(f' File "{frameRecord.filename}", line {frameRecord.lineno}, in {frameRecord.function}\n')
|
||||
for line in frameRecord.code_context:
|
||||
fp.write(f' {line.strip()}\n')
|
||||
if frameRecord.code_context is not None:
|
||||
for line in frameRecord.code_context:
|
||||
fp.write(f' {line.strip()}\n')
|
||||
fp.write('\n')
|
||||
|
||||
for frameRecord in trace:
|
||||
module = inspect.getmodule(frameRecord[0])
|
||||
modules = [inspect.getmodule(frameRecord[0]) for frameRecord in trace]
|
||||
for i, (module, frameRecord) in enumerate(zip(modules, trace)):
|
||||
if module is None:
|
||||
# Module-less frame, e.g. dataclass.__init__
|
||||
for j in reversed(range(i)):
|
||||
if modules[j] is not None:
|
||||
break
|
||||
else:
|
||||
# No previous module scope
|
||||
continue
|
||||
module = modules[j]
|
||||
if not module.__name__.startswith('snscrape.') and module.__name__ != 'snscrape':
|
||||
continue
|
||||
locals_ = frameRecord[0].f_locals
|
||||
|
||||
@@ -163,16 +163,19 @@ class Scraper:
|
||||
return self._get_entity()
|
||||
|
||||
def _request(self, method, url, params = None, data = None, headers = None, timeout = 10, responseOkCallback = None, allowRedirects = True, proxies = None):
|
||||
proxies = proxies or self._proxies
|
||||
proxies = proxies or self._proxies or {}
|
||||
for attempt in range(self._retries + 1):
|
||||
# The request is newly prepared on each retry because of potential cookie updates.
|
||||
req = self._session.prepare_request(requests.Request(method, url, params = params, data = data, headers = headers))
|
||||
environmentSettings = self._session.merge_environment_settings(req.url, proxies, None, None, None)
|
||||
logger.info(f'Retrieving {req.url}')
|
||||
logger.debug(f'... with headers: {headers!r}')
|
||||
if data:
|
||||
logger.debug(f'... with data: {data!r}')
|
||||
if environmentSettings:
|
||||
logger.debug(f'... with environmentSettings: {environmentSettings!r}')
|
||||
try:
|
||||
r = self._session.send(req, allow_redirects = allowRedirects, timeout = timeout, proxies = proxies)
|
||||
r = self._session.send(req, allow_redirects = allowRedirects, timeout = timeout, **environmentSettings)
|
||||
except requests.exceptions.RequestException as exc:
|
||||
if attempt < self._retries:
|
||||
retrying = ', retrying'
|
||||
@@ -226,7 +229,7 @@ class Scraper:
|
||||
|
||||
@classmethod
|
||||
def _cli_from_args(cls, args):
|
||||
return cls._construct(args)
|
||||
return cls._cli_construct(args)
|
||||
|
||||
@classmethod
|
||||
def _cli_construct(cls, argparseArgs, *args, **kwargs):
|
||||
|
||||
@@ -20,7 +20,7 @@ _logger = logging.getLogger(__name__)
|
||||
@dataclasses.dataclass
|
||||
class Submission(snscrape.base.Item):
|
||||
author: typing.Optional[str] # E.g. submission hf7k6
|
||||
created: datetime.datetime
|
||||
date: datetime.datetime
|
||||
id: str
|
||||
link: typing.Optional[str]
|
||||
selftext: typing.Optional[str]
|
||||
@@ -28,6 +28,8 @@ class Submission(snscrape.base.Item):
|
||||
title: str
|
||||
url: str
|
||||
|
||||
created = snscrape.base._DeprecatedProperty('created', lambda self: self.date, 'date')
|
||||
|
||||
def __str__(self):
|
||||
return self.url
|
||||
|
||||
@@ -36,12 +38,14 @@ class Submission(snscrape.base.Item):
|
||||
class Comment(snscrape.base.Item):
|
||||
author: typing.Optional[str]
|
||||
body: str
|
||||
created: datetime.datetime
|
||||
date: datetime.datetime
|
||||
id: str
|
||||
parentId: typing.Optional[str]
|
||||
subreddit: typing.Optional[str]
|
||||
url: str
|
||||
|
||||
created = snscrape.base._DeprecatedProperty('created', lambda self: self.date, 'date')
|
||||
|
||||
def __str__(self):
|
||||
return self.url
|
||||
|
||||
@@ -111,7 +115,7 @@ class _RedditPushshiftScraper(snscrape.base.Scraper):
|
||||
|
||||
kwargs = {
|
||||
'author': d.get('author'),
|
||||
'created': datetime.datetime.fromtimestamp(d['created_utc'], datetime.timezone.utc),
|
||||
'date': datetime.datetime.fromtimestamp(d['created_utc'], datetime.timezone.utc),
|
||||
'url': f'https://old.reddit.com{permalink}',
|
||||
'subreddit': d.get('subreddit'),
|
||||
}
|
||||
@@ -192,7 +196,7 @@ class _RedditPushshiftSearchScraper(_RedditPushshiftScraper):
|
||||
|
||||
while True:
|
||||
# Return newer first; if both have the same creation datetime, return the comment first
|
||||
if tipSubmission.created > tipComment.created:
|
||||
if tipSubmission.date > tipComment.date:
|
||||
yield tipSubmission
|
||||
try:
|
||||
tipSubmission = next(submissionsIter)
|
||||
|
||||
@@ -9,7 +9,6 @@ import re
|
||||
import snscrape.base
|
||||
import typing
|
||||
import urllib.parse
|
||||
import base64
|
||||
|
||||
_logger = logging.getLogger(__name__)
|
||||
_SINGLE_MEDIA_LINK_PATTERN = re.compile(r'^https://t\.me/[^/]+/\d+\?single$')
|
||||
@@ -57,7 +56,7 @@ class TelegramPost(snscrape.base.Item):
|
||||
forwarded: typing.Optional['Channel'] = None
|
||||
forwardedUrl: typing.Optional[str] = None
|
||||
media: typing.Optional[typing.List['Medium']] = None
|
||||
views: typing.Optional[int] = None
|
||||
views: typing.Optional[snscrape.base.IntWithGranularity] = None
|
||||
linkPreview: typing.Optional[LinkPreview] = None
|
||||
|
||||
outlinksss = snscrape.base._DeprecatedProperty('outlinksss', lambda self: ' '.join(self.outlinks), 'outlinks')
|
||||
@@ -176,7 +175,7 @@ class TelegramChannelScraper(snscrape.base.Scraper):
|
||||
for voicePlayer in post.find_all('a', {'class': 'tgme_widget_message_voice_player'}):
|
||||
audioUrl = voicePlayer.find('audio')['src']
|
||||
durationStr = voicePlayer.find('time').text
|
||||
duration = durationStrToSeconds(durationStr)
|
||||
duration = _durationStrToSeconds(durationStr)
|
||||
barHeights = [float(s['style'].split(':')[-1].strip(';%')) for s in voicePlayer.find('div', {'class': 'bar'}).find_all('s')]
|
||||
|
||||
media.append(VoiceMessage(url = audioUrl, duration = duration, bars = barHeights))
|
||||
@@ -201,7 +200,7 @@ class TelegramChannelScraper(snscrape.base.Scraper):
|
||||
else:
|
||||
cls = Video
|
||||
durationStr = videoPlayer.find('time').text
|
||||
mKwargs['duration'] = durationStrToSeconds(durationStr)
|
||||
mKwargs['duration'] = _durationStrToSeconds(durationStr)
|
||||
media.append(cls(**mKwargs))
|
||||
|
||||
linkPreview = None
|
||||
@@ -224,7 +223,12 @@ class TelegramChannelScraper(snscrape.base.Scraper):
|
||||
outlinks.remove(kwargs['href'])
|
||||
|
||||
viewsSpan = post.find('span', class_ = 'tgme_widget_message_views')
|
||||
views = None if viewsSpan is None else parse_num(viewsSpan.text)
|
||||
views = None if viewsSpan is None else _parse_num(viewsSpan.text)
|
||||
|
||||
outlinks = outlinks if outlinks else None
|
||||
media = media if media else None
|
||||
mentions = mentions if mentions else None
|
||||
hashtags = hashtags if hashtags else None
|
||||
|
||||
yield TelegramPost(url = url, date = date, content = content, outlinks = outlinks, mentions = mentions, hashtags = hashtags, linkPreview = linkPreview, media = media, forwarded = forwarded, forwardedUrl = forwardedUrl, views = views)
|
||||
|
||||
@@ -253,7 +257,7 @@ class TelegramChannelScraper(snscrape.base.Scraper):
|
||||
else:
|
||||
break
|
||||
nextPageUrl = urllib.parse.urljoin(r.url, pageLink['href'])
|
||||
r = self._get(nextPageUrl, headers = self._headers, responseOkCallback = telegramResponseOkCallback)
|
||||
r = self._get(nextPageUrl, headers = self._headers, responseOkCallback = _telegramResponseOkCallback)
|
||||
if r.status_code != 200:
|
||||
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
||||
soup = bs4.BeautifulSoup(r.text, 'lxml')
|
||||
@@ -266,8 +270,12 @@ class TelegramChannelScraper(snscrape.base.Scraper):
|
||||
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
||||
soup = bs4.BeautifulSoup(r.text, 'lxml')
|
||||
membersDiv = soup.find('div', class_ = 'tgme_page_extra')
|
||||
if membersDiv.text.endswith((' members', ' subscribers')):
|
||||
kwargs['members'] = int(''.join(membersDiv.text.split(' ')[:-1]))
|
||||
if membersDiv.text.split(',')[0].endswith((' members', ' subscribers')):
|
||||
membersStr = ''.join(membersDiv.text.split(',')[0].split(' ')[:-1])
|
||||
if membersStr == 'no':
|
||||
kwargs['members'] = 0
|
||||
else:
|
||||
kwargs['members'] = int(membersStr)
|
||||
photoImg = soup.find('img', class_ = 'tgme_page_photo_image')
|
||||
if photoImg is not None:
|
||||
kwargs['photo'] = photoImg.attrs['src']
|
||||
@@ -294,7 +302,7 @@ class TelegramChannelScraper(snscrape.base.Scraper):
|
||||
kwargs['description'] = descriptionDiv.text
|
||||
|
||||
for div in channelInfoDiv.find_all('div', class_ = 'tgme_channel_info_counter'):
|
||||
value, granularity = parse_num(div.find('span', class_ = 'counter_value').text)
|
||||
value, granularity = _parse_num(div.find('span', class_ = 'counter_value').text)
|
||||
type_ = div.find('span', class_ = 'counter_type').text
|
||||
if type_ == 'members':
|
||||
# Already extracted more accurately from /channel, skip
|
||||
@@ -312,7 +320,7 @@ class TelegramChannelScraper(snscrape.base.Scraper):
|
||||
def _cli_from_args(cls, args):
|
||||
return cls._cli_construct(args, args.channel)
|
||||
|
||||
def parse_num(s):
|
||||
def _parse_num(s):
|
||||
s = s.replace(' ', '')
|
||||
if s.endswith('M'):
|
||||
return int(float(s[:-1]) * 1e6), 10 ** (6 if '.' not in s else 6 - len(s[:-1].split('.')[1]))
|
||||
@@ -320,11 +328,11 @@ def parse_num(s):
|
||||
return int(float(s[:-1]) * 1000), 10 ** (3 if '.' not in s else 3 - len(s[:-1].split('.')[1]))
|
||||
return int(s), 1
|
||||
|
||||
def durationStrToSeconds(durationStr):
|
||||
def _durationStrToSeconds(durationStr):
|
||||
durationList = durationStr.split(':')
|
||||
return sum([int(s) * int(g) for s, g in zip([1, 60, 360], reversed(durationList))])
|
||||
return sum([int(s) * int(g) for s, g in zip([1, 60, 3600], reversed(durationList))])
|
||||
|
||||
def telegramResponseOkCallback(r):
|
||||
def _telegramResponseOkCallback(r):
|
||||
if r.status_code == 200:
|
||||
return (True, None)
|
||||
return (False, f'{r.status_code=}')
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -32,17 +32,41 @@ _logger = logging.getLogger(__name__)
|
||||
_months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
|
||||
_datePattern = re.compile(r'^(?P<date>today'
|
||||
r'|yesterday'
|
||||
r'|(?P<day1>\d+)\s+(?P<month1>' + '|'.join(_months) + ')(\s+(?P<year1>\d{4}))?'
|
||||
r'|(?P<day1>\d+)\s+(?P<month1>' + '|'.join(_months) + r')(\s+(?P<year1>\d{4}))?'
|
||||
r'|(?P<month2>' + '|'.join(_months) + r')\s+(?P<day2>\d+),\s+(?P<year2>\d{4})'
|
||||
')'
|
||||
r'\s+at\s+(?P<hour>\d+):(?P<minute>\d+)\s+(?P<ampm>[ap]m)$')
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
class User(snscrape.base.Entity):
|
||||
username: str
|
||||
name: str
|
||||
verified: bool
|
||||
description: typing.Optional[str] = None
|
||||
websites: typing.Optional[typing.List[str]] = None
|
||||
followers: typing.Optional[snscrape.base.IntWithGranularity] = None
|
||||
posts: typing.Optional[snscrape.base.IntWithGranularity] = None
|
||||
photos: typing.Optional[snscrape.base.IntWithGranularity] = None
|
||||
tags: typing.Optional[snscrape.base.IntWithGranularity] = None
|
||||
following: typing.Optional[snscrape.base.IntWithGranularity] = None
|
||||
|
||||
followersGranularity = snscrape.base._DeprecatedProperty('followersGranularity', lambda self: self.followers.granularity, 'followers.granularity')
|
||||
postsGranularity = snscrape.base._DeprecatedProperty('postsGranularity', lambda self: self.posts.granularity, 'posts.granularity')
|
||||
photosGranularity = snscrape.base._DeprecatedProperty('photosGranularity', lambda self: self.photos.granularity, 'photos.granularity')
|
||||
tagsGranularity = snscrape.base._DeprecatedProperty('tagsGranularity', lambda self: self.tags.granularity, 'tags.granularity')
|
||||
followingGranularity = snscrape.base._DeprecatedProperty('followingGranularity', lambda self: self.following.granularity, 'following.granularity')
|
||||
|
||||
def __str__(self):
|
||||
return f'https://vk.com/{self.username}'
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
class VKontaktePost(snscrape.base.Item):
|
||||
url: str
|
||||
date: typing.Optional[typing.Union[datetime.datetime, datetime.date]]
|
||||
content: str
|
||||
user: User
|
||||
outlinks: typing.Optional[typing.List[str]] = None
|
||||
photos: typing.Optional[typing.List['Photo']] = None
|
||||
video: typing.Optional['Video'] = None
|
||||
@@ -74,29 +98,6 @@ class Video:
|
||||
thumbUrl: str
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
class User(snscrape.base.Entity):
|
||||
username: str
|
||||
name: str
|
||||
verified: bool
|
||||
description: typing.Optional[str] = None
|
||||
websites: typing.Optional[typing.List[str]] = None
|
||||
followers: typing.Optional[snscrape.base.IntWithGranularity] = None
|
||||
posts: typing.Optional[snscrape.base.IntWithGranularity] = None
|
||||
photos: typing.Optional[snscrape.base.IntWithGranularity] = None
|
||||
tags: typing.Optional[snscrape.base.IntWithGranularity] = None
|
||||
following: typing.Optional[snscrape.base.IntWithGranularity] = None
|
||||
|
||||
followersGranularity = snscrape.base._DeprecatedProperty('followersGranularity', lambda self: self.followers.granularity, 'followers.granularity')
|
||||
postsGranularity = snscrape.base._DeprecatedProperty('postsGranularity', lambda self: self.posts.granularity, 'posts.granularity')
|
||||
photosGranularity = snscrape.base._DeprecatedProperty('photosGranularity', lambda self: self.photos.granularity, 'photos.granularity')
|
||||
tagsGranularity = snscrape.base._DeprecatedProperty('tagsGranularity', lambda self: self.tags.granularity, 'tags.granularity')
|
||||
followingGranularity = snscrape.base._DeprecatedProperty('followingGranularity', lambda self: self.following.granularity, 'following.granularity')
|
||||
|
||||
def __str__(self):
|
||||
return f'https://vk.com/{self.username}'
|
||||
|
||||
|
||||
class VKontakteUserScraper(snscrape.base.Scraper):
|
||||
name = 'vkontakte-user'
|
||||
|
||||
@@ -177,15 +178,11 @@ class VKontakteUserScraper(snscrape.base.Scraper):
|
||||
continue
|
||||
if 'data-video' in a.attrs:
|
||||
# Video
|
||||
if 'data-link-attr' in a.attrs:
|
||||
hrefUrl = urllib.parse.unquote(a.attrs['data-link-attr'].split('to=')[1].split('&')[0])
|
||||
else:
|
||||
hrefUrl = f'https://vk.com{a["href"]}'
|
||||
video = Video(
|
||||
id = a['data-video'],
|
||||
list = a['data-list'],
|
||||
duration = int(a['data-duration']),
|
||||
url = hrefUrl,
|
||||
url = f'https://vk.com{a["href"]}',
|
||||
thumbUrl = a['style'][(begin := a['style'].find('background-image: url(') + 22) : a['style'].find(')', begin)],
|
||||
)
|
||||
continue
|
||||
@@ -216,14 +213,24 @@ class VKontakteUserScraper(snscrape.base.Scraper):
|
||||
photoUrl = f'https://vk.com{a["href"]}' if 'href' in a.attrs and a['href'].startswith('/photo') and a['href'][6:].strip('0123456789-_') == '' else None
|
||||
photos.append(Photo(variants = photoVariants, url = photoUrl))
|
||||
quotedPost = self._post_div_to_item(quoteDiv, isCopy = True) if (quoteDiv := post.find('div', class_ = 'copy_quote')) else None
|
||||
authorHeading = post.find('h5', class_ = ['post_author', 'copy_post_author'])
|
||||
authorLink = authorHeading.find('a', class_ = ['author', 'copy_author'])
|
||||
username = authorLink['href'].split('/')[-1]
|
||||
name = authorLink.text
|
||||
if authorHeading.find('div', class_ = 'page_verified') is not None:
|
||||
verified = True
|
||||
else:
|
||||
verified = False
|
||||
user = User(username = username, name = name, verified = verified)
|
||||
return VKontaktePost(
|
||||
url = url,
|
||||
date = self._date_span_to_date(dateSpan),
|
||||
content = textDiv.text if textDiv else None,
|
||||
outlinks = outlinks or None,
|
||||
photos = photos or None,
|
||||
video = video or None,
|
||||
quotedPost = quotedPost,
|
||||
url = url,
|
||||
date = self._date_span_to_date(dateSpan),
|
||||
content = textDiv.text if textDiv else None,
|
||||
user = user,
|
||||
outlinks = outlinks or None,
|
||||
photos = photos or None,
|
||||
video = video or None,
|
||||
quotedPost = quotedPost,
|
||||
)
|
||||
|
||||
def _soup_to_items(self, soup):
|
||||
@@ -380,6 +387,13 @@ class VKontakteUserScraper(snscrape.base.Scraper):
|
||||
if (followersDiv := soup.find('div', id = 'public_followers')):
|
||||
if (topDiv := followersDiv.find('div', class_ = 'header_top')) and topDiv.find('span', class_ = 'header_label').text == 'Followers':
|
||||
kwargs['followers'] = snscrape.base.IntWithGranularity(*parse_num(topDiv.find('span', class_ = 'header_count').text))
|
||||
# On community groups, this is where followers are listed
|
||||
elif (followersDiv := soup.find('div', class_ = 'group_friends_text')):
|
||||
kwargs['followers'] = snscrape.base.IntWithGranularity(*parse_num(followersDiv.find('span', class_ = 'group_friends_count').text))
|
||||
# On public groups, this is where followers are listed
|
||||
elif (followersDiv := soup.find('div', id = 'group_followers')):
|
||||
if (topDiv := followersDiv.find('div', class_ = 'header_top')) and topDiv.find('span', class_ = 'header_label').text == 'Members':
|
||||
kwargs['followers'] = snscrape.base.IntWithGranularity(*parse_num(topDiv.find('span', class_ = 'header_count').text))
|
||||
|
||||
return User(**kwargs)
|
||||
|
||||
@@ -389,4 +403,4 @@ class VKontakteUserScraper(snscrape.base.Scraper):
|
||||
|
||||
@classmethod
|
||||
def _cli_from_args(cls, args):
|
||||
return cls._cli_construct(args, args.username)
|
||||
return cls._cli_construct(args, args.username)
|
||||
Reference in New Issue
Block a user