37 Commits

Author SHA1 Message Date
Tristan Lee
40b8d9f267 Merge pull request #7 from bellingcat/more-tg-info
More tg info
2022-07-05 08:29:09 -07:00
Tristan Lee
fdc40f7411 Merge pull request #6 from bellingcat/add-vk-user
added User dataclass as argument to VKontaktePost dataclass
2022-07-05 08:28:01 -07:00
Tristan Lee
82351800d6 Merge pull request #5 from JustAnotherArchivist/master
merge upstream
2022-07-05 08:25:20 -07:00
Tristan Lee
73f10a4f24 fixed edge case where channel with no members fails _get_entity 2022-07-05 10:23:26 -05:00
Tristan Lee
cb429909d0 added User dataclass as argument to VKontaktePost dataclass 2022-07-05 10:21:59 -05:00
JustAnotherArchivist
d72b51953f Fix missing r prefix on string with regex backslashes 2022-06-24 23:12:50 +00:00
Tristan Lee
056cd6215c incorporated requested changes from maintainer, removed modifications to VK module 2022-06-23 15:47:18 -05:00
JustAnotherArchivist
d5b406bc1b Update API parameters to what Twitter currently uses
The `count` reduction does not affect anything as Twitter ignores that parameter now. Cf. #481
2022-06-23 19:50:17 +00:00
Tristan Lee
56e4232083 fixed typo 2022-06-23 11:51:13 -05:00
JustAnotherArchivist
50899c01f3 Fix crash on malformed guest token cache file
Fixes #494
2022-06-16 17:12:04 +00:00
JustAnotherArchivist
bcad6923c2 Rename Tweet.content to rawContent and User.description to renderedDescription for consistency
Closes #479
2022-06-14 00:35:02 +00:00
JustAnotherArchivist
0d361685ff Fix AttributeError crash on scrapers using the default CLI constructor
Introduced by 267b7d0e

Fixes #483
2022-06-01 17:35:38 +00:00
JustAnotherArchivist
530f4fa122 Fix KeyErrors on display_url and expanded_url for certain users with broken profile links
Fixes #480
2022-05-29 17:23:43 +00:00
JustAnotherArchivist
dc6bc9bf9d Refactor how links on Twitter are handled
All links in text (tweets, profile descriptions, and profile links) are now represented by TextLink objects, which contain all relevant information: the displayed text (if available), the URL, the short t.co URL, and the indices in the text at which it appears.

Closes #478
2022-05-29 07:16:04 +00:00
JustAnotherArchivist
01cf6a09b3 Fix type of description URL objects 2022-05-29 05:08:23 +00:00
JustAnotherArchivist
ef7c4fad3e Fix AttributeError for DescriptionURL on from-import 2022-05-29 05:08:23 +00:00
Tristan Lee
65723f10ff fixed merge 2022-05-25 06:47:47 -05:00
Tristan Lee
07a5f6fd7d merged master into more-tg-info to update upstream PR 2022-05-25 01:18:48 -05:00
Tristan Lee
0822a9c354 Merge pull request #4 from JustAnotherArchivist/master
upstream merge
2022-05-24 23:10:38 -07:00
JustAnotherArchivist
faeffe2603 Merge pull request #474 from GeraniumKF/GeraniumKF-reddit-since-crash
Fix crash using --since with Reddit
2022-05-23 23:06:16 +00:00
Geranium
e3bdc02a7c Reddit: deprecate 'created' property for 'date'
This fixes a crash when using --since with the Reddit scraper,
as the CLI code expects items to have a date property.
2022-05-23 23:31:44 +01:00
Tristan Lee
e2d922301e forgot to save modified twitter.py module 2022-05-09 09:37:36 -05:00
Tristan Lee
b13e62eb5d Merge branch 'JustAnotherArchivist-master' 2022-05-09 09:35:35 -05:00
Tristan Lee
f38513503d fixed merge conflicts 2022-05-09 09:35:19 -05:00
Tristan Lee
0a4bd39ca6 Merge pull request #2 from bellingcat/telegram-media
Implemented JustAnotherArchivist's requested changes to Telegram scraper from PR
2022-05-09 07:23:39 -07:00
JustAnotherArchivist
ed3ea944d1 Fix newsletter issue cards without an issue description
Fixes #456
2022-04-16 19:44:36 +00:00
JustAnotherArchivist
e7a6d38a5f Add support for community_details cards 2022-04-15 20:07:01 +00:00
JustAnotherArchivist
6c50eee31b Fix proxies not being applied correctly due to missing merge with environment settings
Fixes #447
2022-04-15 19:23:54 +00:00
JustAnotherArchivist
5103a33afa Fix t.co card URL replacement on retweets
Fixes #411
2022-04-15 03:18:45 +00:00
JustAnotherArchivist
247bd82d79 Refactor to tweetId variable 2022-04-15 03:14:29 +00:00
JustAnotherArchivist
5fc67f2bcf Add support for 'message me' cards 2022-04-15 02:52:37 +00:00
JustAnotherArchivist
65e7d8bd24 Fix warning on card URL translation to include the tweet ID 2022-04-15 02:52:03 +00:00
JustAnotherArchivist
3870282a42 Fix broadcast and event card crashes 2022-04-12 20:53:38 +00:00
JustAnotherArchivist
7c0fcdec43 Fix Periscope card crashes 2022-04-12 18:29:51 +00:00
JustAnotherArchivist
9af1f19034 Properly support all card types
Fixes #407
2022-04-12 18:11:26 +00:00
JustAnotherArchivist
5fc3c0e290 Fix crash in locals dumping on module-less frames 2022-04-12 18:03:36 +00:00
Logan Williams
b8efce2a12 Clean up unnecessary imports 2022-03-08 15:10:15 +01:00
6 changed files with 845 additions and 155 deletions

View File

@@ -133,12 +133,22 @@ def _dump_stack_and_locals(trace, exc = None):
fp.write('Stack:\n')
for frameRecord in trace:
fp.write(f' File "{frameRecord.filename}", line {frameRecord.lineno}, in {frameRecord.function}\n')
for line in frameRecord.code_context:
fp.write(f' {line.strip()}\n')
if frameRecord.code_context is not None:
for line in frameRecord.code_context:
fp.write(f' {line.strip()}\n')
fp.write('\n')
for frameRecord in trace:
module = inspect.getmodule(frameRecord[0])
modules = [inspect.getmodule(frameRecord[0]) for frameRecord in trace]
for i, (module, frameRecord) in enumerate(zip(modules, trace)):
if module is None:
# Module-less frame, e.g. dataclass.__init__
for j in reversed(range(i)):
if modules[j] is not None:
break
else:
# No previous module scope
continue
module = modules[j]
if not module.__name__.startswith('snscrape.') and module.__name__ != 'snscrape':
continue
locals_ = frameRecord[0].f_locals

View File

@@ -163,16 +163,19 @@ class Scraper:
return self._get_entity()
def _request(self, method, url, params = None, data = None, headers = None, timeout = 10, responseOkCallback = None, allowRedirects = True, proxies = None):
proxies = proxies or self._proxies
proxies = proxies or self._proxies or {}
for attempt in range(self._retries + 1):
# The request is newly prepared on each retry because of potential cookie updates.
req = self._session.prepare_request(requests.Request(method, url, params = params, data = data, headers = headers))
environmentSettings = self._session.merge_environment_settings(req.url, proxies, None, None, None)
logger.info(f'Retrieving {req.url}')
logger.debug(f'... with headers: {headers!r}')
if data:
logger.debug(f'... with data: {data!r}')
if environmentSettings:
logger.debug(f'... with environmentSettings: {environmentSettings!r}')
try:
r = self._session.send(req, allow_redirects = allowRedirects, timeout = timeout, proxies = proxies)
r = self._session.send(req, allow_redirects = allowRedirects, timeout = timeout, **environmentSettings)
except requests.exceptions.RequestException as exc:
if attempt < self._retries:
retrying = ', retrying'
@@ -226,7 +229,7 @@ class Scraper:
@classmethod
def _cli_from_args(cls, args):
return cls._construct(args)
return cls._cli_construct(args)
@classmethod
def _cli_construct(cls, argparseArgs, *args, **kwargs):

View File

@@ -20,7 +20,7 @@ _logger = logging.getLogger(__name__)
@dataclasses.dataclass
class Submission(snscrape.base.Item):
author: typing.Optional[str] # E.g. submission hf7k6
created: datetime.datetime
date: datetime.datetime
id: str
link: typing.Optional[str]
selftext: typing.Optional[str]
@@ -28,6 +28,8 @@ class Submission(snscrape.base.Item):
title: str
url: str
created = snscrape.base._DeprecatedProperty('created', lambda self: self.date, 'date')
def __str__(self):
return self.url
@@ -36,12 +38,14 @@ class Submission(snscrape.base.Item):
class Comment(snscrape.base.Item):
author: typing.Optional[str]
body: str
created: datetime.datetime
date: datetime.datetime
id: str
parentId: typing.Optional[str]
subreddit: typing.Optional[str]
url: str
created = snscrape.base._DeprecatedProperty('created', lambda self: self.date, 'date')
def __str__(self):
return self.url
@@ -111,7 +115,7 @@ class _RedditPushshiftScraper(snscrape.base.Scraper):
kwargs = {
'author': d.get('author'),
'created': datetime.datetime.fromtimestamp(d['created_utc'], datetime.timezone.utc),
'date': datetime.datetime.fromtimestamp(d['created_utc'], datetime.timezone.utc),
'url': f'https://old.reddit.com{permalink}',
'subreddit': d.get('subreddit'),
}
@@ -192,7 +196,7 @@ class _RedditPushshiftSearchScraper(_RedditPushshiftScraper):
while True:
# Return newer first; if both have the same creation datetime, return the comment first
if tipSubmission.created > tipComment.created:
if tipSubmission.date > tipComment.date:
yield tipSubmission
try:
tipSubmission = next(submissionsIter)

View File

@@ -9,7 +9,6 @@ import re
import snscrape.base
import typing
import urllib.parse
import base64
_logger = logging.getLogger(__name__)
_SINGLE_MEDIA_LINK_PATTERN = re.compile(r'^https://t\.me/[^/]+/\d+\?single$')
@@ -57,7 +56,7 @@ class TelegramPost(snscrape.base.Item):
forwarded: typing.Optional['Channel'] = None
forwardedUrl: typing.Optional[str] = None
media: typing.Optional[typing.List['Medium']] = None
views: typing.Optional[int] = None
views: typing.Optional[snscrape.base.IntWithGranularity] = None
linkPreview: typing.Optional[LinkPreview] = None
outlinksss = snscrape.base._DeprecatedProperty('outlinksss', lambda self: ' '.join(self.outlinks), 'outlinks')
@@ -176,7 +175,7 @@ class TelegramChannelScraper(snscrape.base.Scraper):
for voicePlayer in post.find_all('a', {'class': 'tgme_widget_message_voice_player'}):
audioUrl = voicePlayer.find('audio')['src']
durationStr = voicePlayer.find('time').text
duration = durationStrToSeconds(durationStr)
duration = _durationStrToSeconds(durationStr)
barHeights = [float(s['style'].split(':')[-1].strip(';%')) for s in voicePlayer.find('div', {'class': 'bar'}).find_all('s')]
media.append(VoiceMessage(url = audioUrl, duration = duration, bars = barHeights))
@@ -201,7 +200,7 @@ class TelegramChannelScraper(snscrape.base.Scraper):
else:
cls = Video
durationStr = videoPlayer.find('time').text
mKwargs['duration'] = durationStrToSeconds(durationStr)
mKwargs['duration'] = _durationStrToSeconds(durationStr)
media.append(cls(**mKwargs))
linkPreview = None
@@ -224,7 +223,12 @@ class TelegramChannelScraper(snscrape.base.Scraper):
outlinks.remove(kwargs['href'])
viewsSpan = post.find('span', class_ = 'tgme_widget_message_views')
views = None if viewsSpan is None else parse_num(viewsSpan.text)
views = None if viewsSpan is None else _parse_num(viewsSpan.text)
outlinks = outlinks if outlinks else None
media = media if media else None
mentions = mentions if mentions else None
hashtags = hashtags if hashtags else None
yield TelegramPost(url = url, date = date, content = content, outlinks = outlinks, mentions = mentions, hashtags = hashtags, linkPreview = linkPreview, media = media, forwarded = forwarded, forwardedUrl = forwardedUrl, views = views)
@@ -253,7 +257,7 @@ class TelegramChannelScraper(snscrape.base.Scraper):
else:
break
nextPageUrl = urllib.parse.urljoin(r.url, pageLink['href'])
r = self._get(nextPageUrl, headers = self._headers, responseOkCallback = telegramResponseOkCallback)
r = self._get(nextPageUrl, headers = self._headers, responseOkCallback = _telegramResponseOkCallback)
if r.status_code != 200:
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
soup = bs4.BeautifulSoup(r.text, 'lxml')
@@ -266,8 +270,12 @@ class TelegramChannelScraper(snscrape.base.Scraper):
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
soup = bs4.BeautifulSoup(r.text, 'lxml')
membersDiv = soup.find('div', class_ = 'tgme_page_extra')
if membersDiv.text.endswith((' members', ' subscribers')):
kwargs['members'] = int(''.join(membersDiv.text.split(' ')[:-1]))
if membersDiv.text.split(',')[0].endswith((' members', ' subscribers')):
membersStr = ''.join(membersDiv.text.split(',')[0].split(' ')[:-1])
if membersStr == 'no':
kwargs['members'] = 0
else:
kwargs['members'] = int(membersStr)
photoImg = soup.find('img', class_ = 'tgme_page_photo_image')
if photoImg is not None:
kwargs['photo'] = photoImg.attrs['src']
@@ -294,7 +302,7 @@ class TelegramChannelScraper(snscrape.base.Scraper):
kwargs['description'] = descriptionDiv.text
for div in channelInfoDiv.find_all('div', class_ = 'tgme_channel_info_counter'):
value, granularity = parse_num(div.find('span', class_ = 'counter_value').text)
value, granularity = _parse_num(div.find('span', class_ = 'counter_value').text)
type_ = div.find('span', class_ = 'counter_type').text
if type_ == 'members':
# Already extracted more accurately from /channel, skip
@@ -312,7 +320,7 @@ class TelegramChannelScraper(snscrape.base.Scraper):
def _cli_from_args(cls, args):
return cls._cli_construct(args, args.channel)
def parse_num(s):
def _parse_num(s):
s = s.replace(' ', '')
if s.endswith('M'):
return int(float(s[:-1]) * 1e6), 10 ** (6 if '.' not in s else 6 - len(s[:-1].split('.')[1]))
@@ -320,11 +328,11 @@ def parse_num(s):
return int(float(s[:-1]) * 1000), 10 ** (3 if '.' not in s else 3 - len(s[:-1].split('.')[1]))
return int(s), 1
def durationStrToSeconds(durationStr):
def _durationStrToSeconds(durationStr):
durationList = durationStr.split(':')
return sum([int(s) * int(g) for s, g in zip([1, 60, 360], reversed(durationList))])
return sum([int(s) * int(g) for s, g in zip([1, 60, 3600], reversed(durationList))])
def telegramResponseOkCallback(r):
def _telegramResponseOkCallback(r):
if r.status_code == 200:
return (True, None)
return (False, f'{r.status_code=}')

File diff suppressed because it is too large. Load Diff

View File

@@ -32,17 +32,41 @@ _logger = logging.getLogger(__name__)
_months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
_datePattern = re.compile(r'^(?P<date>today'
r'|yesterday'
r'|(?P<day1>\d+)\s+(?P<month1>' + '|'.join(_months) + ')(\s+(?P<year1>\d{4}))?'
r'|(?P<day1>\d+)\s+(?P<month1>' + '|'.join(_months) + r')(\s+(?P<year1>\d{4}))?'
r'|(?P<month2>' + '|'.join(_months) + r')\s+(?P<day2>\d+),\s+(?P<year2>\d{4})'
')'
r'\s+at\s+(?P<hour>\d+):(?P<minute>\d+)\s+(?P<ampm>[ap]m)$')
@dataclasses.dataclass
class User(snscrape.base.Entity):
username: str
name: str
verified: bool
description: typing.Optional[str] = None
websites: typing.Optional[typing.List[str]] = None
followers: typing.Optional[snscrape.base.IntWithGranularity] = None
posts: typing.Optional[snscrape.base.IntWithGranularity] = None
photos: typing.Optional[snscrape.base.IntWithGranularity] = None
tags: typing.Optional[snscrape.base.IntWithGranularity] = None
following: typing.Optional[snscrape.base.IntWithGranularity] = None
followersGranularity = snscrape.base._DeprecatedProperty('followersGranularity', lambda self: self.followers.granularity, 'followers.granularity')
postsGranularity = snscrape.base._DeprecatedProperty('postsGranularity', lambda self: self.posts.granularity, 'posts.granularity')
photosGranularity = snscrape.base._DeprecatedProperty('photosGranularity', lambda self: self.photos.granularity, 'photos.granularity')
tagsGranularity = snscrape.base._DeprecatedProperty('tagsGranularity', lambda self: self.tags.granularity, 'tags.granularity')
followingGranularity = snscrape.base._DeprecatedProperty('followingGranularity', lambda self: self.following.granularity, 'following.granularity')
def __str__(self):
return f'https://vk.com/{self.username}'
@dataclasses.dataclass
class VKontaktePost(snscrape.base.Item):
url: str
date: typing.Optional[typing.Union[datetime.datetime, datetime.date]]
content: str
user: User
outlinks: typing.Optional[typing.List[str]] = None
photos: typing.Optional[typing.List['Photo']] = None
video: typing.Optional['Video'] = None
@@ -74,29 +98,6 @@ class Video:
thumbUrl: str
@dataclasses.dataclass
class User(snscrape.base.Entity):
username: str
name: str
verified: bool
description: typing.Optional[str] = None
websites: typing.Optional[typing.List[str]] = None
followers: typing.Optional[snscrape.base.IntWithGranularity] = None
posts: typing.Optional[snscrape.base.IntWithGranularity] = None
photos: typing.Optional[snscrape.base.IntWithGranularity] = None
tags: typing.Optional[snscrape.base.IntWithGranularity] = None
following: typing.Optional[snscrape.base.IntWithGranularity] = None
followersGranularity = snscrape.base._DeprecatedProperty('followersGranularity', lambda self: self.followers.granularity, 'followers.granularity')
postsGranularity = snscrape.base._DeprecatedProperty('postsGranularity', lambda self: self.posts.granularity, 'posts.granularity')
photosGranularity = snscrape.base._DeprecatedProperty('photosGranularity', lambda self: self.photos.granularity, 'photos.granularity')
tagsGranularity = snscrape.base._DeprecatedProperty('tagsGranularity', lambda self: self.tags.granularity, 'tags.granularity')
followingGranularity = snscrape.base._DeprecatedProperty('followingGranularity', lambda self: self.following.granularity, 'following.granularity')
def __str__(self):
return f'https://vk.com/{self.username}'
class VKontakteUserScraper(snscrape.base.Scraper):
name = 'vkontakte-user'
@@ -177,15 +178,11 @@ class VKontakteUserScraper(snscrape.base.Scraper):
continue
if 'data-video' in a.attrs:
# Video
if 'data-link-attr' in a.attrs:
hrefUrl = urllib.parse.unquote(a.attrs['data-link-attr'].split('to=')[1].split('&')[0])
else:
hrefUrl = f'https://vk.com{a["href"]}'
video = Video(
id = a['data-video'],
list = a['data-list'],
duration = int(a['data-duration']),
url = hrefUrl,
url = f'https://vk.com{a["href"]}',
thumbUrl = a['style'][(begin := a['style'].find('background-image: url(') + 22) : a['style'].find(')', begin)],
)
continue
@@ -216,14 +213,24 @@ class VKontakteUserScraper(snscrape.base.Scraper):
photoUrl = f'https://vk.com{a["href"]}' if 'href' in a.attrs and a['href'].startswith('/photo') and a['href'][6:].strip('0123456789-_') == '' else None
photos.append(Photo(variants = photoVariants, url = photoUrl))
quotedPost = self._post_div_to_item(quoteDiv, isCopy = True) if (quoteDiv := post.find('div', class_ = 'copy_quote')) else None
authorHeading = post.find('h5', class_ = ['post_author', 'copy_post_author'])
authorLink = authorHeading.find('a', class_ = ['author', 'copy_author'])
username = authorLink['href'].split('/')[-1]
name = authorLink.text
if authorHeading.find('div', class_ = 'page_verified') is not None:
verified = True
else:
verified = False
user = User(username = username, name = name, verified = verified)
return VKontaktePost(
url = url,
date = self._date_span_to_date(dateSpan),
content = textDiv.text if textDiv else None,
outlinks = outlinks or None,
photos = photos or None,
video = video or None,
quotedPost = quotedPost,
url = url,
date = self._date_span_to_date(dateSpan),
content = textDiv.text if textDiv else None,
user = user,
outlinks = outlinks or None,
photos = photos or None,
video = video or None,
quotedPost = quotedPost,
)
def _soup_to_items(self, soup):
@@ -380,6 +387,13 @@ class VKontakteUserScraper(snscrape.base.Scraper):
if (followersDiv := soup.find('div', id = 'public_followers')):
if (topDiv := followersDiv.find('div', class_ = 'header_top')) and topDiv.find('span', class_ = 'header_label').text == 'Followers':
kwargs['followers'] = snscrape.base.IntWithGranularity(*parse_num(topDiv.find('span', class_ = 'header_count').text))
# On community groups, this is where followers are listed
elif (followersDiv := soup.find('div', class_ = 'group_friends_text')):
kwargs['followers'] = snscrape.base.IntWithGranularity(*parse_num(followersDiv.find('span', class_ = 'group_friends_count').text))
# On public groups, this is where followers are listed
elif (followersDiv := soup.find('div', id = 'group_followers')):
if (topDiv := followersDiv.find('div', class_ = 'header_top')) and topDiv.find('span', class_ = 'header_label').text == 'Members':
kwargs['followers'] = snscrape.base.IntWithGranularity(*parse_num(topDiv.find('span', class_ = 'header_count').text))
return User(**kwargs)
@@ -389,4 +403,4 @@ class VKontakteUserScraper(snscrape.base.Scraper):
@classmethod
def _cli_from_args(cls, args):
return cls._cli_construct(args, args.username)
return cls._cli_construct(args, args.username)