mirror of
https://github.com/bellingcat/snscrape.git
synced 2026-06-08 02:28:29 +03:00
Use more assignment expressions where appropriate
This commit is contained in:
@@ -254,13 +254,11 @@ def main():
|
||||
|
||||
i = 0
|
||||
with _dump_locals_on_exception():
|
||||
if args.withEntity:
|
||||
entity = scraper.entity
|
||||
if entity:
|
||||
if args.jsonl:
|
||||
print(json.dumps(namedtuple_to_dict_recursive(entity), default = json_serialise_datetime))
|
||||
else:
|
||||
print(entity)
|
||||
if args.withEntity and (entity := scraper.entity):
|
||||
if args.jsonl:
|
||||
print(json.dumps(namedtuple_to_dict_recursive(entity), default = json_serialise_datetime))
|
||||
else:
|
||||
print(entity)
|
||||
if args.maxResults == 0:
|
||||
logger.info('Exiting after 0 results')
|
||||
return
|
||||
|
||||
@@ -129,8 +129,7 @@ class FacebookCommonScraper(snscrape.base.Scraper):
|
||||
dirtyUrl = urllib.parse.urljoin(baseUrl, href)
|
||||
cleanUrl = self._clean_url(dirtyUrl)
|
||||
date = datetime.datetime.fromtimestamp(int(entry.find('abbr', class_ = '_5ptz')['data-utime']), datetime.timezone.utc)
|
||||
contentDiv = entry.find('div', class_ = '_5pbx')
|
||||
if contentDiv:
|
||||
if (contentDiv := entry.find('div', class_ = '_5pbx')):
|
||||
content = contentDiv.text
|
||||
else:
|
||||
content = None
|
||||
@@ -180,9 +179,8 @@ class FacebookUserAndCommunityScraper(FacebookCommonScraper):
|
||||
logger.warning('User does not exist')
|
||||
return
|
||||
yield from self._soup_to_items(soup, self._baseUrl, 'user')
|
||||
nextPageLink = soup.find('a', ajaxify = nextPageLinkPattern)
|
||||
|
||||
while nextPageLink:
|
||||
while (nextPageLink := soup.find('a', ajaxify = nextPageLinkPattern)):
|
||||
logger.info('Retrieving next page')
|
||||
|
||||
# The web app sends a bunch of additional parameters. Most of them would be easy to add, but there's also __dyn, which is a compressed list of the "modules" loaded in the browser.
|
||||
@@ -200,7 +198,6 @@ class FacebookUserAndCommunityScraper(FacebookCommonScraper):
|
||||
assert '__html' in response['domops'][0][3]
|
||||
soup = bs4.BeautifulSoup(response['domops'][0][3]['__html'], 'lxml')
|
||||
yield from self._soup_to_items(soup, self._baseUrl, 'user')
|
||||
nextPageLink = soup.find('a', ajaxify = nextPageLinkPattern)
|
||||
|
||||
@classmethod
|
||||
def setup_parser(cls, subparser):
|
||||
@@ -337,8 +334,7 @@ class FacebookGroupScraper(FacebookCommonScraper):
|
||||
yield from self._soup_to_items(codeSoup, baseUrl, 'group')
|
||||
|
||||
# Pagination
|
||||
data = pageletDataPattern.search(r.text).group(0)[pageletDataPrefixLength:]
|
||||
while True:
|
||||
while (data := pageletDataPattern.search(r.text).group(0)[pageletDataPrefixLength:]):
|
||||
# As on the user profile pages, the web app sends a lot of additional parameters, but those all seem to be unnecessary (although some change the response format, e.g. from JSON to HTML)
|
||||
r = self._get(
|
||||
f'https://www.facebook.com/ajax/pagelet/generic.php/GroupEntstreamPagelet',
|
||||
@@ -353,7 +349,6 @@ class FacebookGroupScraper(FacebookCommonScraper):
|
||||
break
|
||||
soup = bs4.BeautifulSoup(obj['payload'], 'lxml')
|
||||
yield from self._soup_to_items(soup, baseUrl, 'group')
|
||||
data = pageletDataPattern.search(r.text).group(0)[pageletDataPrefixLength:]
|
||||
|
||||
@classmethod
|
||||
def setup_parser(cls, subparser):
|
||||
|
||||
@@ -154,8 +154,7 @@ class TelegramChannelScraper(snscrape.base.Scraper):
|
||||
# If there are no posts, fall back to the channel info div, although that should never happen due to the 'Channel created' entry.
|
||||
logger.warning('Could not find a post; extracting username from channel info div, which may not be capitalised correctly')
|
||||
kwargs['username'] = channelInfoDiv.find('div', class_ = 'tgme_channel_info_header_username').text[1:] # Remove @
|
||||
descriptionDiv = channelInfoDiv.find('div', class_ = 'tgme_channel_info_description')
|
||||
if descriptionDiv:
|
||||
if (descriptionDiv := channelInfoDiv.find('div', class_ = 'tgme_channel_info_description')):
|
||||
kwargs['description'] = descriptionDiv.text
|
||||
|
||||
def parse_num(s):
|
||||
|
||||
@@ -121,19 +121,17 @@ class TwitterOldDesignScraper(snscrape.base.Scraper):
|
||||
url = f'https://twitter.com/{username}/status/{tweetID}'
|
||||
|
||||
date = None
|
||||
timestampA = tweet.find('a', 'tweet-timestamp')
|
||||
if timestampA:
|
||||
if (timestampA := tweet.find('a', 'tweet-timestamp')):
|
||||
timestampSpan = timestampA.find('span', '_timestamp')
|
||||
if timestampSpan and timestampSpan.has_attr('data-time'):
|
||||
date = datetime.datetime.fromtimestamp(int(timestampSpan['data-time']), datetime.timezone.utc)
|
||||
if not date:
|
||||
logger.warning(f'Failed to extract date for {url}')
|
||||
|
||||
contentP = tweet.find('p', 'tweet-text')
|
||||
content = None
|
||||
outlinks = []
|
||||
tcooutlinks = []
|
||||
if contentP:
|
||||
if (contentP := tweet.find('p', 'tweet-text')):
|
||||
content = contentP.text
|
||||
for a in contentP.find_all('a'):
|
||||
if a.has_attr('href') and not a['href'].startswith('/') and (not a.has_attr('class') or 'u-hidden' not in a['class']):
|
||||
@@ -144,8 +142,7 @@ class TwitterOldDesignScraper(snscrape.base.Scraper):
|
||||
tcooutlinks.append(a['href'])
|
||||
else:
|
||||
logger.warning(f'Failed to extract content for {url}')
|
||||
card = tweet.find('div', 'card2')
|
||||
if card and 'has-autoplayable-media' not in card['class']:
|
||||
if (card := tweet.find('div', 'card2')) and 'has-autoplayable-media' not in card['class']:
|
||||
for div in card.find_all('div'):
|
||||
if div.has_attr('data-card-url'):
|
||||
outlinks.append(div['data-card-url'])
|
||||
@@ -177,8 +174,7 @@ class TwitterAPIScraper(snscrape.base.Scraper):
|
||||
return
|
||||
logger.info('Retrieving guest token')
|
||||
r = self._get(self._baseUrl if url is None else url, headers = {'User-Agent': self._userAgent})
|
||||
match = re.search(r'document\.cookie = decodeURIComponent\("gt=(\d+); Max-Age=10800; Domain=\.twitter\.com; Path=/; Secure"\);', r.text)
|
||||
if match:
|
||||
if (match := re.search(r'document\.cookie = decodeURIComponent\("gt=(\d+); Max-Age=10800; Domain=\.twitter\.com; Path=/; Secure"\);', r.text)):
|
||||
logger.debug('Found guest token in HTML')
|
||||
self._guestToken = match.group(1)
|
||||
if 'gt' in r.cookies:
|
||||
|
||||
@@ -87,8 +87,7 @@ class VKontakteUserScraper(snscrape.base.Scraper):
|
||||
logger.warning('Private profile')
|
||||
return
|
||||
|
||||
profileDeleted = soup.find('h5', class_ = 'profile_deleted_text')
|
||||
if profileDeleted:
|
||||
if (profileDeleted := soup.find('h5', class_ = 'profile_deleted_text')):
|
||||
# Unclear what this state represents, so just log website text.
|
||||
logger.warning(profileDeleted.text)
|
||||
return
|
||||
@@ -166,12 +165,10 @@ class VKontakteUserScraper(snscrape.base.Scraper):
|
||||
kwargs['name'] = nameH1.text
|
||||
kwargs['verified'] = bool(nameH1.find('div', class_ = 'page_verified'))
|
||||
|
||||
descriptionDiv = soup.find('div', id = 'page_current_info')
|
||||
if descriptionDiv:
|
||||
if (descriptionDiv := soup.find('div', id = 'page_current_info')):
|
||||
kwargs['description'] = descriptionDiv.text
|
||||
|
||||
infoDiv = soup.find('div', id = 'page_info_wrap')
|
||||
if infoDiv:
|
||||
if (infoDiv := soup.find('div', id = 'page_info_wrap')):
|
||||
websites = []
|
||||
for rowDiv in infoDiv.find_all('div', class_ = ['profile_info_row', 'group_info_row']):
|
||||
if 'profile_info_row' in rowDiv['class']:
|
||||
@@ -197,8 +194,7 @@ class VKontakteUserScraper(snscrape.base.Scraper):
|
||||
else:
|
||||
return int(s.replace(',', '')), 1
|
||||
|
||||
countsDiv = soup.find('div', class_ = 'counts_module')
|
||||
if countsDiv:
|
||||
if (countsDiv := soup.find('div', class_ = 'counts_module')):
|
||||
for a in countsDiv.find_all('a', class_ = 'page_counter'):
|
||||
count, granularity = parse_num(a.find('div', class_ = 'count').text)
|
||||
label = a.find('div', class_ = 'label').text
|
||||
@@ -207,17 +203,13 @@ class VKontakteUserScraper(snscrape.base.Scraper):
|
||||
if label in ('followers', 'posts', 'photos', 'tags'):
|
||||
kwargs[label], kwargs[f'{label}Granularity'] = count, granularity
|
||||
|
||||
idolsDiv = soup.find('div', id = 'profile_idols')
|
||||
if idolsDiv:
|
||||
topDiv = idolsDiv.find('div', class_ = 'header_top')
|
||||
if topDiv and topDiv.find('span', class_ = 'header_label').text == 'Following':
|
||||
if (idolsDiv := soup.find('div', id = 'profile_idols')):
|
||||
if (topDiv := idolsDiv.find('div', class_ = 'header_top')) and topDiv.find('span', class_ = 'header_label').text == 'Following':
|
||||
kwargs['following'], kwargs['followingGranularity'] = parse_num(topDiv.find('span', class_ = 'header_count').text)
|
||||
|
||||
# On public pages, this is where followers are listed
|
||||
followersDiv = soup.find('div', id = 'public_followers')
|
||||
if followersDiv:
|
||||
topDiv = followersDiv.find('div', class_ = 'header_top')
|
||||
if topDiv and topDiv.find('span', class_ = 'header_label').text == 'Followers':
|
||||
if (followersDiv := soup.find('div', id = 'public_followers')):
|
||||
if (topDiv := followersDiv.find('div', class_ = 'header_top')) and topDiv.find('span', class_ = 'header_label').text == 'Followers':
|
||||
kwargs['followers'], kwargs['followersGranularity'] = parse_num(topDiv.find('span', class_ = 'header_count').text)
|
||||
|
||||
return User(**kwargs)
|
||||
|
||||
Reference in New Issue
Block a user