Use more assignment expressions where appropriate

This commit is contained in:
JustAnotherArchivist
2020-10-01 21:41:44 +00:00
parent 8b68f1a8af
commit a70b361176
5 changed files with 21 additions and 41 deletions

View File

@@ -254,13 +254,11 @@ def main():
i = 0
with _dump_locals_on_exception():
if args.withEntity:
entity = scraper.entity
if entity:
if args.jsonl:
print(json.dumps(namedtuple_to_dict_recursive(entity), default = json_serialise_datetime))
else:
print(entity)
if args.withEntity and (entity := scraper.entity):
if args.jsonl:
print(json.dumps(namedtuple_to_dict_recursive(entity), default = json_serialise_datetime))
else:
print(entity)
if args.maxResults == 0:
logger.info('Exiting after 0 results')
return

View File

@@ -129,8 +129,7 @@ class FacebookCommonScraper(snscrape.base.Scraper):
dirtyUrl = urllib.parse.urljoin(baseUrl, href)
cleanUrl = self._clean_url(dirtyUrl)
date = datetime.datetime.fromtimestamp(int(entry.find('abbr', class_ = '_5ptz')['data-utime']), datetime.timezone.utc)
contentDiv = entry.find('div', class_ = '_5pbx')
if contentDiv:
if (contentDiv := entry.find('div', class_ = '_5pbx')):
content = contentDiv.text
else:
content = None
@@ -180,9 +179,8 @@ class FacebookUserAndCommunityScraper(FacebookCommonScraper):
logger.warning('User does not exist')
return
yield from self._soup_to_items(soup, self._baseUrl, 'user')
nextPageLink = soup.find('a', ajaxify = nextPageLinkPattern)
while nextPageLink:
while (nextPageLink := soup.find('a', ajaxify = nextPageLinkPattern)):
logger.info('Retrieving next page')
# The web app sends a bunch of additional parameters. Most of them would be easy to add, but there's also __dyn, which is a compressed list of the "modules" loaded in the browser.
@@ -200,7 +198,6 @@ class FacebookUserAndCommunityScraper(FacebookCommonScraper):
assert '__html' in response['domops'][0][3]
soup = bs4.BeautifulSoup(response['domops'][0][3]['__html'], 'lxml')
yield from self._soup_to_items(soup, self._baseUrl, 'user')
nextPageLink = soup.find('a', ajaxify = nextPageLinkPattern)
@classmethod
def setup_parser(cls, subparser):
@@ -337,8 +334,7 @@ class FacebookGroupScraper(FacebookCommonScraper):
yield from self._soup_to_items(codeSoup, baseUrl, 'group')
# Pagination
data = pageletDataPattern.search(r.text).group(0)[pageletDataPrefixLength:]
while True:
while (data := pageletDataPattern.search(r.text).group(0)[pageletDataPrefixLength:]):
# As on the user profile pages, the web app sends a lot of additional parameters, but those all seem to be unnecessary (although some change the response format, e.g. from JSON to HTML)
r = self._get(
f'https://www.facebook.com/ajax/pagelet/generic.php/GroupEntstreamPagelet',
@@ -353,7 +349,6 @@ class FacebookGroupScraper(FacebookCommonScraper):
break
soup = bs4.BeautifulSoup(obj['payload'], 'lxml')
yield from self._soup_to_items(soup, baseUrl, 'group')
data = pageletDataPattern.search(r.text).group(0)[pageletDataPrefixLength:]
@classmethod
def setup_parser(cls, subparser):

View File

@@ -154,8 +154,7 @@ class TelegramChannelScraper(snscrape.base.Scraper):
# If there are no posts, fall back to the channel info div, although that should never happen due to the 'Channel created' entry.
logger.warning('Could not find a post; extracting username from channel info div, which may not be capitalised correctly')
kwargs['username'] = channelInfoDiv.find('div', class_ = 'tgme_channel_info_header_username').text[1:] # Remove @
descriptionDiv = channelInfoDiv.find('div', class_ = 'tgme_channel_info_description')
if descriptionDiv:
if (descriptionDiv := channelInfoDiv.find('div', class_ = 'tgme_channel_info_description')):
kwargs['description'] = descriptionDiv.text
def parse_num(s):

View File

@@ -121,19 +121,17 @@ class TwitterOldDesignScraper(snscrape.base.Scraper):
url = f'https://twitter.com/{username}/status/{tweetID}'
date = None
timestampA = tweet.find('a', 'tweet-timestamp')
if timestampA:
if (timestampA := tweet.find('a', 'tweet-timestamp')):
timestampSpan = timestampA.find('span', '_timestamp')
if timestampSpan and timestampSpan.has_attr('data-time'):
date = datetime.datetime.fromtimestamp(int(timestampSpan['data-time']), datetime.timezone.utc)
if not date:
logger.warning(f'Failed to extract date for {url}')
contentP = tweet.find('p', 'tweet-text')
content = None
outlinks = []
tcooutlinks = []
if contentP:
if (contentP := tweet.find('p', 'tweet-text')):
content = contentP.text
for a in contentP.find_all('a'):
if a.has_attr('href') and not a['href'].startswith('/') and (not a.has_attr('class') or 'u-hidden' not in a['class']):
@@ -144,8 +142,7 @@ class TwitterOldDesignScraper(snscrape.base.Scraper):
tcooutlinks.append(a['href'])
else:
logger.warning(f'Failed to extract content for {url}')
card = tweet.find('div', 'card2')
if card and 'has-autoplayable-media' not in card['class']:
if (card := tweet.find('div', 'card2')) and 'has-autoplayable-media' not in card['class']:
for div in card.find_all('div'):
if div.has_attr('data-card-url'):
outlinks.append(div['data-card-url'])
@@ -177,8 +174,7 @@ class TwitterAPIScraper(snscrape.base.Scraper):
return
logger.info('Retrieving guest token')
r = self._get(self._baseUrl if url is None else url, headers = {'User-Agent': self._userAgent})
match = re.search(r'document\.cookie = decodeURIComponent\("gt=(\d+); Max-Age=10800; Domain=\.twitter\.com; Path=/; Secure"\);', r.text)
if match:
if (match := re.search(r'document\.cookie = decodeURIComponent\("gt=(\d+); Max-Age=10800; Domain=\.twitter\.com; Path=/; Secure"\);', r.text)):
logger.debug('Found guest token in HTML')
self._guestToken = match.group(1)
if 'gt' in r.cookies:

View File

@@ -87,8 +87,7 @@ class VKontakteUserScraper(snscrape.base.Scraper):
logger.warning('Private profile')
return
profileDeleted = soup.find('h5', class_ = 'profile_deleted_text')
if profileDeleted:
if (profileDeleted := soup.find('h5', class_ = 'profile_deleted_text')):
# Unclear what this state represents, so just log website text.
logger.warning(profileDeleted.text)
return
@@ -166,12 +165,10 @@ class VKontakteUserScraper(snscrape.base.Scraper):
kwargs['name'] = nameH1.text
kwargs['verified'] = bool(nameH1.find('div', class_ = 'page_verified'))
descriptionDiv = soup.find('div', id = 'page_current_info')
if descriptionDiv:
if (descriptionDiv := soup.find('div', id = 'page_current_info')):
kwargs['description'] = descriptionDiv.text
infoDiv = soup.find('div', id = 'page_info_wrap')
if infoDiv:
if (infoDiv := soup.find('div', id = 'page_info_wrap')):
websites = []
for rowDiv in infoDiv.find_all('div', class_ = ['profile_info_row', 'group_info_row']):
if 'profile_info_row' in rowDiv['class']:
@@ -197,8 +194,7 @@ class VKontakteUserScraper(snscrape.base.Scraper):
else:
return int(s.replace(',', '')), 1
countsDiv = soup.find('div', class_ = 'counts_module')
if countsDiv:
if (countsDiv := soup.find('div', class_ = 'counts_module')):
for a in countsDiv.find_all('a', class_ = 'page_counter'):
count, granularity = parse_num(a.find('div', class_ = 'count').text)
label = a.find('div', class_ = 'label').text
@@ -207,17 +203,13 @@ class VKontakteUserScraper(snscrape.base.Scraper):
if label in ('followers', 'posts', 'photos', 'tags'):
kwargs[label], kwargs[f'{label}Granularity'] = count, granularity
idolsDiv = soup.find('div', id = 'profile_idols')
if idolsDiv:
topDiv = idolsDiv.find('div', class_ = 'header_top')
if topDiv and topDiv.find('span', class_ = 'header_label').text == 'Following':
if (idolsDiv := soup.find('div', id = 'profile_idols')):
if (topDiv := idolsDiv.find('div', class_ = 'header_top')) and topDiv.find('span', class_ = 'header_label').text == 'Following':
kwargs['following'], kwargs['followingGranularity'] = parse_num(topDiv.find('span', class_ = 'header_count').text)
# On public pages, this is where followers are listed
followersDiv = soup.find('div', id = 'public_followers')
if followersDiv:
topDiv = followersDiv.find('div', class_ = 'header_top')
if topDiv and topDiv.find('span', class_ = 'header_label').text == 'Followers':
if (followersDiv := soup.find('div', id = 'public_followers')):
if (topDiv := followersDiv.find('div', class_ = 'header_top')) and topDiv.find('span', class_ = 'header_label').text == 'Followers':
kwargs['followers'], kwargs['followersGranularity'] = parse_num(topDiv.find('span', class_ = 'header_count').text)
return User(**kwargs)