mirror of
https://github.com/bellingcat/snscrape.git
synced 2026-06-11 03:48:29 +03:00
Compare commits
37 Commits
telegram-m
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
40b8d9f267 | ||
|
|
fdc40f7411 | ||
|
|
82351800d6 | ||
|
|
73f10a4f24 | ||
|
|
cb429909d0 | ||
|
|
d72b51953f | ||
|
|
056cd6215c | ||
|
|
d5b406bc1b | ||
|
|
56e4232083 | ||
|
|
50899c01f3 | ||
|
|
bcad6923c2 | ||
|
|
0d361685ff | ||
|
|
530f4fa122 | ||
|
|
dc6bc9bf9d | ||
|
|
01cf6a09b3 | ||
|
|
ef7c4fad3e | ||
|
|
65723f10ff | ||
|
|
07a5f6fd7d | ||
|
|
0822a9c354 | ||
|
|
faeffe2603 | ||
|
|
e3bdc02a7c | ||
|
|
e2d922301e | ||
|
|
b13e62eb5d | ||
|
|
f38513503d | ||
|
|
0a4bd39ca6 | ||
|
|
ed3ea944d1 | ||
|
|
e7a6d38a5f | ||
|
|
6c50eee31b | ||
|
|
5103a33afa | ||
|
|
247bd82d79 | ||
|
|
5fc67f2bcf | ||
|
|
65e7d8bd24 | ||
|
|
3870282a42 | ||
|
|
7c0fcdec43 | ||
|
|
9af1f19034 | ||
|
|
5fc3c0e290 | ||
|
|
b8efce2a12 |
@@ -133,12 +133,22 @@ def _dump_stack_and_locals(trace, exc = None):
|
|||||||
fp.write('Stack:\n')
|
fp.write('Stack:\n')
|
||||||
for frameRecord in trace:
|
for frameRecord in trace:
|
||||||
fp.write(f' File "{frameRecord.filename}", line {frameRecord.lineno}, in {frameRecord.function}\n')
|
fp.write(f' File "{frameRecord.filename}", line {frameRecord.lineno}, in {frameRecord.function}\n')
|
||||||
for line in frameRecord.code_context:
|
if frameRecord.code_context is not None:
|
||||||
fp.write(f' {line.strip()}\n')
|
for line in frameRecord.code_context:
|
||||||
|
fp.write(f' {line.strip()}\n')
|
||||||
fp.write('\n')
|
fp.write('\n')
|
||||||
|
|
||||||
for frameRecord in trace:
|
modules = [inspect.getmodule(frameRecord[0]) for frameRecord in trace]
|
||||||
module = inspect.getmodule(frameRecord[0])
|
for i, (module, frameRecord) in enumerate(zip(modules, trace)):
|
||||||
|
if module is None:
|
||||||
|
# Module-less frame, e.g. dataclass.__init__
|
||||||
|
for j in reversed(range(i)):
|
||||||
|
if modules[j] is not None:
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
# No previous module scope
|
||||||
|
continue
|
||||||
|
module = modules[j]
|
||||||
if not module.__name__.startswith('snscrape.') and module.__name__ != 'snscrape':
|
if not module.__name__.startswith('snscrape.') and module.__name__ != 'snscrape':
|
||||||
continue
|
continue
|
||||||
locals_ = frameRecord[0].f_locals
|
locals_ = frameRecord[0].f_locals
|
||||||
|
|||||||
@@ -163,16 +163,19 @@ class Scraper:
|
|||||||
return self._get_entity()
|
return self._get_entity()
|
||||||
|
|
||||||
def _request(self, method, url, params = None, data = None, headers = None, timeout = 10, responseOkCallback = None, allowRedirects = True, proxies = None):
|
def _request(self, method, url, params = None, data = None, headers = None, timeout = 10, responseOkCallback = None, allowRedirects = True, proxies = None):
|
||||||
proxies = proxies or self._proxies
|
proxies = proxies or self._proxies or {}
|
||||||
for attempt in range(self._retries + 1):
|
for attempt in range(self._retries + 1):
|
||||||
# The request is newly prepared on each retry because of potential cookie updates.
|
# The request is newly prepared on each retry because of potential cookie updates.
|
||||||
req = self._session.prepare_request(requests.Request(method, url, params = params, data = data, headers = headers))
|
req = self._session.prepare_request(requests.Request(method, url, params = params, data = data, headers = headers))
|
||||||
|
environmentSettings = self._session.merge_environment_settings(req.url, proxies, None, None, None)
|
||||||
logger.info(f'Retrieving {req.url}')
|
logger.info(f'Retrieving {req.url}')
|
||||||
logger.debug(f'... with headers: {headers!r}')
|
logger.debug(f'... with headers: {headers!r}')
|
||||||
if data:
|
if data:
|
||||||
logger.debug(f'... with data: {data!r}')
|
logger.debug(f'... with data: {data!r}')
|
||||||
|
if environmentSettings:
|
||||||
|
logger.debug(f'... with environmentSettings: {environmentSettings!r}')
|
||||||
try:
|
try:
|
||||||
r = self._session.send(req, allow_redirects = allowRedirects, timeout = timeout, proxies = proxies)
|
r = self._session.send(req, allow_redirects = allowRedirects, timeout = timeout, **environmentSettings)
|
||||||
except requests.exceptions.RequestException as exc:
|
except requests.exceptions.RequestException as exc:
|
||||||
if attempt < self._retries:
|
if attempt < self._retries:
|
||||||
retrying = ', retrying'
|
retrying = ', retrying'
|
||||||
@@ -226,7 +229,7 @@ class Scraper:
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _cli_from_args(cls, args):
|
def _cli_from_args(cls, args):
|
||||||
return cls._construct(args)
|
return cls._cli_construct(args)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _cli_construct(cls, argparseArgs, *args, **kwargs):
|
def _cli_construct(cls, argparseArgs, *args, **kwargs):
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ _logger = logging.getLogger(__name__)
|
|||||||
@dataclasses.dataclass
|
@dataclasses.dataclass
|
||||||
class Submission(snscrape.base.Item):
|
class Submission(snscrape.base.Item):
|
||||||
author: typing.Optional[str] # E.g. submission hf7k6
|
author: typing.Optional[str] # E.g. submission hf7k6
|
||||||
created: datetime.datetime
|
date: datetime.datetime
|
||||||
id: str
|
id: str
|
||||||
link: typing.Optional[str]
|
link: typing.Optional[str]
|
||||||
selftext: typing.Optional[str]
|
selftext: typing.Optional[str]
|
||||||
@@ -28,6 +28,8 @@ class Submission(snscrape.base.Item):
|
|||||||
title: str
|
title: str
|
||||||
url: str
|
url: str
|
||||||
|
|
||||||
|
created = snscrape.base._DeprecatedProperty('created', lambda self: self.date, 'date')
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return self.url
|
return self.url
|
||||||
|
|
||||||
@@ -36,12 +38,14 @@ class Submission(snscrape.base.Item):
|
|||||||
class Comment(snscrape.base.Item):
|
class Comment(snscrape.base.Item):
|
||||||
author: typing.Optional[str]
|
author: typing.Optional[str]
|
||||||
body: str
|
body: str
|
||||||
created: datetime.datetime
|
date: datetime.datetime
|
||||||
id: str
|
id: str
|
||||||
parentId: typing.Optional[str]
|
parentId: typing.Optional[str]
|
||||||
subreddit: typing.Optional[str]
|
subreddit: typing.Optional[str]
|
||||||
url: str
|
url: str
|
||||||
|
|
||||||
|
created = snscrape.base._DeprecatedProperty('created', lambda self: self.date, 'date')
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return self.url
|
return self.url
|
||||||
|
|
||||||
@@ -111,7 +115,7 @@ class _RedditPushshiftScraper(snscrape.base.Scraper):
|
|||||||
|
|
||||||
kwargs = {
|
kwargs = {
|
||||||
'author': d.get('author'),
|
'author': d.get('author'),
|
||||||
'created': datetime.datetime.fromtimestamp(d['created_utc'], datetime.timezone.utc),
|
'date': datetime.datetime.fromtimestamp(d['created_utc'], datetime.timezone.utc),
|
||||||
'url': f'https://old.reddit.com{permalink}',
|
'url': f'https://old.reddit.com{permalink}',
|
||||||
'subreddit': d.get('subreddit'),
|
'subreddit': d.get('subreddit'),
|
||||||
}
|
}
|
||||||
@@ -192,7 +196,7 @@ class _RedditPushshiftSearchScraper(_RedditPushshiftScraper):
|
|||||||
|
|
||||||
while True:
|
while True:
|
||||||
# Return newer first; if both have the same creation datetime, return the comment first
|
# Return newer first; if both have the same creation datetime, return the comment first
|
||||||
if tipSubmission.created > tipComment.created:
|
if tipSubmission.date > tipComment.date:
|
||||||
yield tipSubmission
|
yield tipSubmission
|
||||||
try:
|
try:
|
||||||
tipSubmission = next(submissionsIter)
|
tipSubmission = next(submissionsIter)
|
||||||
|
|||||||
@@ -9,7 +9,6 @@ import re
|
|||||||
import snscrape.base
|
import snscrape.base
|
||||||
import typing
|
import typing
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
import base64
|
|
||||||
|
|
||||||
_logger = logging.getLogger(__name__)
|
_logger = logging.getLogger(__name__)
|
||||||
_SINGLE_MEDIA_LINK_PATTERN = re.compile(r'^https://t\.me/[^/]+/\d+\?single$')
|
_SINGLE_MEDIA_LINK_PATTERN = re.compile(r'^https://t\.me/[^/]+/\d+\?single$')
|
||||||
@@ -57,7 +56,7 @@ class TelegramPost(snscrape.base.Item):
|
|||||||
forwarded: typing.Optional['Channel'] = None
|
forwarded: typing.Optional['Channel'] = None
|
||||||
forwardedUrl: typing.Optional[str] = None
|
forwardedUrl: typing.Optional[str] = None
|
||||||
media: typing.Optional[typing.List['Medium']] = None
|
media: typing.Optional[typing.List['Medium']] = None
|
||||||
views: typing.Optional[int] = None
|
views: typing.Optional[snscrape.base.IntWithGranularity] = None
|
||||||
linkPreview: typing.Optional[LinkPreview] = None
|
linkPreview: typing.Optional[LinkPreview] = None
|
||||||
|
|
||||||
outlinksss = snscrape.base._DeprecatedProperty('outlinksss', lambda self: ' '.join(self.outlinks), 'outlinks')
|
outlinksss = snscrape.base._DeprecatedProperty('outlinksss', lambda self: ' '.join(self.outlinks), 'outlinks')
|
||||||
@@ -176,7 +175,7 @@ class TelegramChannelScraper(snscrape.base.Scraper):
|
|||||||
for voicePlayer in post.find_all('a', {'class': 'tgme_widget_message_voice_player'}):
|
for voicePlayer in post.find_all('a', {'class': 'tgme_widget_message_voice_player'}):
|
||||||
audioUrl = voicePlayer.find('audio')['src']
|
audioUrl = voicePlayer.find('audio')['src']
|
||||||
durationStr = voicePlayer.find('time').text
|
durationStr = voicePlayer.find('time').text
|
||||||
duration = durationStrToSeconds(durationStr)
|
duration = _durationStrToSeconds(durationStr)
|
||||||
barHeights = [float(s['style'].split(':')[-1].strip(';%')) for s in voicePlayer.find('div', {'class': 'bar'}).find_all('s')]
|
barHeights = [float(s['style'].split(':')[-1].strip(';%')) for s in voicePlayer.find('div', {'class': 'bar'}).find_all('s')]
|
||||||
|
|
||||||
media.append(VoiceMessage(url = audioUrl, duration = duration, bars = barHeights))
|
media.append(VoiceMessage(url = audioUrl, duration = duration, bars = barHeights))
|
||||||
@@ -201,7 +200,7 @@ class TelegramChannelScraper(snscrape.base.Scraper):
|
|||||||
else:
|
else:
|
||||||
cls = Video
|
cls = Video
|
||||||
durationStr = videoPlayer.find('time').text
|
durationStr = videoPlayer.find('time').text
|
||||||
mKwargs['duration'] = durationStrToSeconds(durationStr)
|
mKwargs['duration'] = _durationStrToSeconds(durationStr)
|
||||||
media.append(cls(**mKwargs))
|
media.append(cls(**mKwargs))
|
||||||
|
|
||||||
linkPreview = None
|
linkPreview = None
|
||||||
@@ -224,7 +223,12 @@ class TelegramChannelScraper(snscrape.base.Scraper):
|
|||||||
outlinks.remove(kwargs['href'])
|
outlinks.remove(kwargs['href'])
|
||||||
|
|
||||||
viewsSpan = post.find('span', class_ = 'tgme_widget_message_views')
|
viewsSpan = post.find('span', class_ = 'tgme_widget_message_views')
|
||||||
views = None if viewsSpan is None else parse_num(viewsSpan.text)
|
views = None if viewsSpan is None else _parse_num(viewsSpan.text)
|
||||||
|
|
||||||
|
outlinks = outlinks if outlinks else None
|
||||||
|
media = media if media else None
|
||||||
|
mentions = mentions if mentions else None
|
||||||
|
hashtags = hashtags if hashtags else None
|
||||||
|
|
||||||
yield TelegramPost(url = url, date = date, content = content, outlinks = outlinks, mentions = mentions, hashtags = hashtags, linkPreview = linkPreview, media = media, forwarded = forwarded, forwardedUrl = forwardedUrl, views = views)
|
yield TelegramPost(url = url, date = date, content = content, outlinks = outlinks, mentions = mentions, hashtags = hashtags, linkPreview = linkPreview, media = media, forwarded = forwarded, forwardedUrl = forwardedUrl, views = views)
|
||||||
|
|
||||||
@@ -253,7 +257,7 @@ class TelegramChannelScraper(snscrape.base.Scraper):
|
|||||||
else:
|
else:
|
||||||
break
|
break
|
||||||
nextPageUrl = urllib.parse.urljoin(r.url, pageLink['href'])
|
nextPageUrl = urllib.parse.urljoin(r.url, pageLink['href'])
|
||||||
r = self._get(nextPageUrl, headers = self._headers, responseOkCallback = telegramResponseOkCallback)
|
r = self._get(nextPageUrl, headers = self._headers, responseOkCallback = _telegramResponseOkCallback)
|
||||||
if r.status_code != 200:
|
if r.status_code != 200:
|
||||||
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
||||||
soup = bs4.BeautifulSoup(r.text, 'lxml')
|
soup = bs4.BeautifulSoup(r.text, 'lxml')
|
||||||
@@ -266,8 +270,12 @@ class TelegramChannelScraper(snscrape.base.Scraper):
|
|||||||
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
||||||
soup = bs4.BeautifulSoup(r.text, 'lxml')
|
soup = bs4.BeautifulSoup(r.text, 'lxml')
|
||||||
membersDiv = soup.find('div', class_ = 'tgme_page_extra')
|
membersDiv = soup.find('div', class_ = 'tgme_page_extra')
|
||||||
if membersDiv.text.endswith((' members', ' subscribers')):
|
if membersDiv.text.split(',')[0].endswith((' members', ' subscribers')):
|
||||||
kwargs['members'] = int(''.join(membersDiv.text.split(' ')[:-1]))
|
membersStr = ''.join(membersDiv.text.split(',')[0].split(' ')[:-1])
|
||||||
|
if membersStr == 'no':
|
||||||
|
kwargs['members'] = 0
|
||||||
|
else:
|
||||||
|
kwargs['members'] = int(membersStr)
|
||||||
photoImg = soup.find('img', class_ = 'tgme_page_photo_image')
|
photoImg = soup.find('img', class_ = 'tgme_page_photo_image')
|
||||||
if photoImg is not None:
|
if photoImg is not None:
|
||||||
kwargs['photo'] = photoImg.attrs['src']
|
kwargs['photo'] = photoImg.attrs['src']
|
||||||
@@ -294,7 +302,7 @@ class TelegramChannelScraper(snscrape.base.Scraper):
|
|||||||
kwargs['description'] = descriptionDiv.text
|
kwargs['description'] = descriptionDiv.text
|
||||||
|
|
||||||
for div in channelInfoDiv.find_all('div', class_ = 'tgme_channel_info_counter'):
|
for div in channelInfoDiv.find_all('div', class_ = 'tgme_channel_info_counter'):
|
||||||
value, granularity = parse_num(div.find('span', class_ = 'counter_value').text)
|
value, granularity = _parse_num(div.find('span', class_ = 'counter_value').text)
|
||||||
type_ = div.find('span', class_ = 'counter_type').text
|
type_ = div.find('span', class_ = 'counter_type').text
|
||||||
if type_ == 'members':
|
if type_ == 'members':
|
||||||
# Already extracted more accurately from /channel, skip
|
# Already extracted more accurately from /channel, skip
|
||||||
@@ -312,7 +320,7 @@ class TelegramChannelScraper(snscrape.base.Scraper):
|
|||||||
def _cli_from_args(cls, args):
|
def _cli_from_args(cls, args):
|
||||||
return cls._cli_construct(args, args.channel)
|
return cls._cli_construct(args, args.channel)
|
||||||
|
|
||||||
def parse_num(s):
|
def _parse_num(s):
|
||||||
s = s.replace(' ', '')
|
s = s.replace(' ', '')
|
||||||
if s.endswith('M'):
|
if s.endswith('M'):
|
||||||
return int(float(s[:-1]) * 1e6), 10 ** (6 if '.' not in s else 6 - len(s[:-1].split('.')[1]))
|
return int(float(s[:-1]) * 1e6), 10 ** (6 if '.' not in s else 6 - len(s[:-1].split('.')[1]))
|
||||||
@@ -320,11 +328,11 @@ def parse_num(s):
|
|||||||
return int(float(s[:-1]) * 1000), 10 ** (3 if '.' not in s else 3 - len(s[:-1].split('.')[1]))
|
return int(float(s[:-1]) * 1000), 10 ** (3 if '.' not in s else 3 - len(s[:-1].split('.')[1]))
|
||||||
return int(s), 1
|
return int(s), 1
|
||||||
|
|
||||||
def durationStrToSeconds(durationStr):
|
def _durationStrToSeconds(durationStr):
|
||||||
durationList = durationStr.split(':')
|
durationList = durationStr.split(':')
|
||||||
return sum([int(s) * int(g) for s, g in zip([1, 60, 360], reversed(durationList))])
|
return sum([int(s) * int(g) for s, g in zip([1, 60, 3600], reversed(durationList))])
|
||||||
|
|
||||||
def telegramResponseOkCallback(r):
|
def _telegramResponseOkCallback(r):
|
||||||
if r.status_code == 200:
|
if r.status_code == 200:
|
||||||
return (True, None)
|
return (True, None)
|
||||||
return (False, f'{r.status_code=}')
|
return (False, f'{r.status_code=}')
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -32,17 +32,41 @@ _logger = logging.getLogger(__name__)
|
|||||||
_months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
|
_months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
|
||||||
_datePattern = re.compile(r'^(?P<date>today'
|
_datePattern = re.compile(r'^(?P<date>today'
|
||||||
r'|yesterday'
|
r'|yesterday'
|
||||||
r'|(?P<day1>\d+)\s+(?P<month1>' + '|'.join(_months) + ')(\s+(?P<year1>\d{4}))?'
|
r'|(?P<day1>\d+)\s+(?P<month1>' + '|'.join(_months) + r')(\s+(?P<year1>\d{4}))?'
|
||||||
r'|(?P<month2>' + '|'.join(_months) + r')\s+(?P<day2>\d+),\s+(?P<year2>\d{4})'
|
r'|(?P<month2>' + '|'.join(_months) + r')\s+(?P<day2>\d+),\s+(?P<year2>\d{4})'
|
||||||
')'
|
')'
|
||||||
r'\s+at\s+(?P<hour>\d+):(?P<minute>\d+)\s+(?P<ampm>[ap]m)$')
|
r'\s+at\s+(?P<hour>\d+):(?P<minute>\d+)\s+(?P<ampm>[ap]m)$')
|
||||||
|
|
||||||
|
|
||||||
|
@dataclasses.dataclass
|
||||||
|
class User(snscrape.base.Entity):
|
||||||
|
username: str
|
||||||
|
name: str
|
||||||
|
verified: bool
|
||||||
|
description: typing.Optional[str] = None
|
||||||
|
websites: typing.Optional[typing.List[str]] = None
|
||||||
|
followers: typing.Optional[snscrape.base.IntWithGranularity] = None
|
||||||
|
posts: typing.Optional[snscrape.base.IntWithGranularity] = None
|
||||||
|
photos: typing.Optional[snscrape.base.IntWithGranularity] = None
|
||||||
|
tags: typing.Optional[snscrape.base.IntWithGranularity] = None
|
||||||
|
following: typing.Optional[snscrape.base.IntWithGranularity] = None
|
||||||
|
|
||||||
|
followersGranularity = snscrape.base._DeprecatedProperty('followersGranularity', lambda self: self.followers.granularity, 'followers.granularity')
|
||||||
|
postsGranularity = snscrape.base._DeprecatedProperty('postsGranularity', lambda self: self.posts.granularity, 'posts.granularity')
|
||||||
|
photosGranularity = snscrape.base._DeprecatedProperty('photosGranularity', lambda self: self.photos.granularity, 'photos.granularity')
|
||||||
|
tagsGranularity = snscrape.base._DeprecatedProperty('tagsGranularity', lambda self: self.tags.granularity, 'tags.granularity')
|
||||||
|
followingGranularity = snscrape.base._DeprecatedProperty('followingGranularity', lambda self: self.following.granularity, 'following.granularity')
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
return f'https://vk.com/{self.username}'
|
||||||
|
|
||||||
|
|
||||||
@dataclasses.dataclass
|
@dataclasses.dataclass
|
||||||
class VKontaktePost(snscrape.base.Item):
|
class VKontaktePost(snscrape.base.Item):
|
||||||
url: str
|
url: str
|
||||||
date: typing.Optional[typing.Union[datetime.datetime, datetime.date]]
|
date: typing.Optional[typing.Union[datetime.datetime, datetime.date]]
|
||||||
content: str
|
content: str
|
||||||
|
user: User
|
||||||
outlinks: typing.Optional[typing.List[str]] = None
|
outlinks: typing.Optional[typing.List[str]] = None
|
||||||
photos: typing.Optional[typing.List['Photo']] = None
|
photos: typing.Optional[typing.List['Photo']] = None
|
||||||
video: typing.Optional['Video'] = None
|
video: typing.Optional['Video'] = None
|
||||||
@@ -74,29 +98,6 @@ class Video:
|
|||||||
thumbUrl: str
|
thumbUrl: str
|
||||||
|
|
||||||
|
|
||||||
@dataclasses.dataclass
|
|
||||||
class User(snscrape.base.Entity):
|
|
||||||
username: str
|
|
||||||
name: str
|
|
||||||
verified: bool
|
|
||||||
description: typing.Optional[str] = None
|
|
||||||
websites: typing.Optional[typing.List[str]] = None
|
|
||||||
followers: typing.Optional[snscrape.base.IntWithGranularity] = None
|
|
||||||
posts: typing.Optional[snscrape.base.IntWithGranularity] = None
|
|
||||||
photos: typing.Optional[snscrape.base.IntWithGranularity] = None
|
|
||||||
tags: typing.Optional[snscrape.base.IntWithGranularity] = None
|
|
||||||
following: typing.Optional[snscrape.base.IntWithGranularity] = None
|
|
||||||
|
|
||||||
followersGranularity = snscrape.base._DeprecatedProperty('followersGranularity', lambda self: self.followers.granularity, 'followers.granularity')
|
|
||||||
postsGranularity = snscrape.base._DeprecatedProperty('postsGranularity', lambda self: self.posts.granularity, 'posts.granularity')
|
|
||||||
photosGranularity = snscrape.base._DeprecatedProperty('photosGranularity', lambda self: self.photos.granularity, 'photos.granularity')
|
|
||||||
tagsGranularity = snscrape.base._DeprecatedProperty('tagsGranularity', lambda self: self.tags.granularity, 'tags.granularity')
|
|
||||||
followingGranularity = snscrape.base._DeprecatedProperty('followingGranularity', lambda self: self.following.granularity, 'following.granularity')
|
|
||||||
|
|
||||||
def __str__(self):
|
|
||||||
return f'https://vk.com/{self.username}'
|
|
||||||
|
|
||||||
|
|
||||||
class VKontakteUserScraper(snscrape.base.Scraper):
|
class VKontakteUserScraper(snscrape.base.Scraper):
|
||||||
name = 'vkontakte-user'
|
name = 'vkontakte-user'
|
||||||
|
|
||||||
@@ -177,15 +178,11 @@ class VKontakteUserScraper(snscrape.base.Scraper):
|
|||||||
continue
|
continue
|
||||||
if 'data-video' in a.attrs:
|
if 'data-video' in a.attrs:
|
||||||
# Video
|
# Video
|
||||||
if 'data-link-attr' in a.attrs:
|
|
||||||
hrefUrl = urllib.parse.unquote(a.attrs['data-link-attr'].split('to=')[1].split('&')[0])
|
|
||||||
else:
|
|
||||||
hrefUrl = f'https://vk.com{a["href"]}'
|
|
||||||
video = Video(
|
video = Video(
|
||||||
id = a['data-video'],
|
id = a['data-video'],
|
||||||
list = a['data-list'],
|
list = a['data-list'],
|
||||||
duration = int(a['data-duration']),
|
duration = int(a['data-duration']),
|
||||||
url = hrefUrl,
|
url = f'https://vk.com{a["href"]}',
|
||||||
thumbUrl = a['style'][(begin := a['style'].find('background-image: url(') + 22) : a['style'].find(')', begin)],
|
thumbUrl = a['style'][(begin := a['style'].find('background-image: url(') + 22) : a['style'].find(')', begin)],
|
||||||
)
|
)
|
||||||
continue
|
continue
|
||||||
@@ -216,14 +213,24 @@ class VKontakteUserScraper(snscrape.base.Scraper):
|
|||||||
photoUrl = f'https://vk.com{a["href"]}' if 'href' in a.attrs and a['href'].startswith('/photo') and a['href'][6:].strip('0123456789-_') == '' else None
|
photoUrl = f'https://vk.com{a["href"]}' if 'href' in a.attrs and a['href'].startswith('/photo') and a['href'][6:].strip('0123456789-_') == '' else None
|
||||||
photos.append(Photo(variants = photoVariants, url = photoUrl))
|
photos.append(Photo(variants = photoVariants, url = photoUrl))
|
||||||
quotedPost = self._post_div_to_item(quoteDiv, isCopy = True) if (quoteDiv := post.find('div', class_ = 'copy_quote')) else None
|
quotedPost = self._post_div_to_item(quoteDiv, isCopy = True) if (quoteDiv := post.find('div', class_ = 'copy_quote')) else None
|
||||||
|
authorHeading = post.find('h5', class_ = ['post_author', 'copy_post_author'])
|
||||||
|
authorLink = authorHeading.find('a', class_ = ['author', 'copy_author'])
|
||||||
|
username = authorLink['href'].split('/')[-1]
|
||||||
|
name = authorLink.text
|
||||||
|
if authorHeading.find('div', class_ = 'page_verified') is not None:
|
||||||
|
verified = True
|
||||||
|
else:
|
||||||
|
verified = False
|
||||||
|
user = User(username = username, name = name, verified = verified)
|
||||||
return VKontaktePost(
|
return VKontaktePost(
|
||||||
url = url,
|
url = url,
|
||||||
date = self._date_span_to_date(dateSpan),
|
date = self._date_span_to_date(dateSpan),
|
||||||
content = textDiv.text if textDiv else None,
|
content = textDiv.text if textDiv else None,
|
||||||
outlinks = outlinks or None,
|
user = user,
|
||||||
photos = photos or None,
|
outlinks = outlinks or None,
|
||||||
video = video or None,
|
photos = photos or None,
|
||||||
quotedPost = quotedPost,
|
video = video or None,
|
||||||
|
quotedPost = quotedPost,
|
||||||
)
|
)
|
||||||
|
|
||||||
def _soup_to_items(self, soup):
|
def _soup_to_items(self, soup):
|
||||||
@@ -380,6 +387,13 @@ class VKontakteUserScraper(snscrape.base.Scraper):
|
|||||||
if (followersDiv := soup.find('div', id = 'public_followers')):
|
if (followersDiv := soup.find('div', id = 'public_followers')):
|
||||||
if (topDiv := followersDiv.find('div', class_ = 'header_top')) and topDiv.find('span', class_ = 'header_label').text == 'Followers':
|
if (topDiv := followersDiv.find('div', class_ = 'header_top')) and topDiv.find('span', class_ = 'header_label').text == 'Followers':
|
||||||
kwargs['followers'] = snscrape.base.IntWithGranularity(*parse_num(topDiv.find('span', class_ = 'header_count').text))
|
kwargs['followers'] = snscrape.base.IntWithGranularity(*parse_num(topDiv.find('span', class_ = 'header_count').text))
|
||||||
|
# On community groups, this is where followers are listed
|
||||||
|
elif (followersDiv := soup.find('div', class_ = 'group_friends_text')):
|
||||||
|
kwargs['followers'] = snscrape.base.IntWithGranularity(*parse_num(followersDiv.find('span', class_ = 'group_friends_count').text))
|
||||||
|
# On public groups, this is where followers are listed
|
||||||
|
elif (followersDiv := soup.find('div', id = 'group_followers')):
|
||||||
|
if (topDiv := followersDiv.find('div', class_ = 'header_top')) and topDiv.find('span', class_ = 'header_label').text == 'Members':
|
||||||
|
kwargs['followers'] = snscrape.base.IntWithGranularity(*parse_num(topDiv.find('span', class_ = 'header_count').text))
|
||||||
|
|
||||||
return User(**kwargs)
|
return User(**kwargs)
|
||||||
|
|
||||||
@@ -389,4 +403,4 @@ class VKontakteUserScraper(snscrape.base.Scraper):
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _cli_from_args(cls, args):
|
def _cli_from_args(cls, args):
|
||||||
return cls._cli_construct(args, args.username)
|
return cls._cli_construct(args, args.username)
|
||||||
Reference in New Issue
Block a user