diff --git a/snscrape/cli.py b/snscrape/cli.py index 350e7da..c279aa7 100644 --- a/snscrape/cli.py +++ b/snscrape/cli.py @@ -254,13 +254,11 @@ def main(): i = 0 with _dump_locals_on_exception(): - if args.withEntity: - entity = scraper.entity - if entity: - if args.jsonl: - print(json.dumps(namedtuple_to_dict_recursive(entity), default = json_serialise_datetime)) - else: - print(entity) + if args.withEntity and (entity := scraper.entity): + if args.jsonl: + print(json.dumps(namedtuple_to_dict_recursive(entity), default = json_serialise_datetime)) + else: + print(entity) if args.maxResults == 0: logger.info('Exiting after 0 results') return diff --git a/snscrape/modules/facebook.py b/snscrape/modules/facebook.py index 00f8297..d821a80 100644 --- a/snscrape/modules/facebook.py +++ b/snscrape/modules/facebook.py @@ -129,8 +129,7 @@ class FacebookCommonScraper(snscrape.base.Scraper): dirtyUrl = urllib.parse.urljoin(baseUrl, href) cleanUrl = self._clean_url(dirtyUrl) date = datetime.datetime.fromtimestamp(int(entry.find('abbr', class_ = '_5ptz')['data-utime']), datetime.timezone.utc) - contentDiv = entry.find('div', class_ = '_5pbx') - if contentDiv: + if (contentDiv := entry.find('div', class_ = '_5pbx')): content = contentDiv.text else: content = None @@ -180,9 +179,8 @@ class FacebookUserAndCommunityScraper(FacebookCommonScraper): logger.warning('User does not exist') return yield from self._soup_to_items(soup, self._baseUrl, 'user') - nextPageLink = soup.find('a', ajaxify = nextPageLinkPattern) - while nextPageLink: + while (nextPageLink := soup.find('a', ajaxify = nextPageLinkPattern)): logger.info('Retrieving next page') # The web app sends a bunch of additional parameters. Most of them would be easy to add, but there's also __dyn, which is a compressed list of the "modules" loaded in the browser. @@ -200,7 +198,6 @@ class FacebookUserAndCommunityScraper(FacebookCommonScraper): assert '__html' in response['domops'][0][3] soup = bs4.BeautifulSoup(response['domops'][0][3]['__html'], 'lxml') yield from self._soup_to_items(soup, self._baseUrl, 'user') - nextPageLink = soup.find('a', ajaxify = nextPageLinkPattern) @classmethod def setup_parser(cls, subparser): @@ -337,8 +334,7 @@ class FacebookGroupScraper(FacebookCommonScraper): yield from self._soup_to_items(codeSoup, baseUrl, 'group') # Pagination - data = pageletDataPattern.search(r.text).group(0)[pageletDataPrefixLength:] - while True: + while (data := pageletDataPattern.search(r.text).group(0)[pageletDataPrefixLength:]): # As on the user profile pages, the web app sends a lot of additional parameters, but those all seem to be unnecessary (although some change the response format, e.g. from JSON to HTML) r = self._get( f'https://www.facebook.com/ajax/pagelet/generic.php/GroupEntstreamPagelet', @@ -353,7 +349,6 @@ class FacebookGroupScraper(FacebookCommonScraper): break soup = bs4.BeautifulSoup(obj['payload'], 'lxml') yield from self._soup_to_items(soup, baseUrl, 'group') - data = pageletDataPattern.search(r.text).group(0)[pageletDataPrefixLength:] @classmethod def setup_parser(cls, subparser): diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py index d272795..fc84e95 100644 --- a/snscrape/modules/telegram.py +++ b/snscrape/modules/telegram.py @@ -154,8 +154,7 @@ class TelegramChannelScraper(snscrape.base.Scraper): # If there are no posts, fall back to the channel info div, although that should never happen due to the 'Channel created' entry. logger.warning('Could not find a post; extracting username from channel info div, which may not be capitalised correctly') kwargs['username'] = channelInfoDiv.find('div', class_ = 'tgme_channel_info_header_username').text[1:] # Remove @ - descriptionDiv = channelInfoDiv.find('div', class_ = 'tgme_channel_info_description') - if descriptionDiv: + if (descriptionDiv := channelInfoDiv.find('div', class_ = 'tgme_channel_info_description')): kwargs['description'] = descriptionDiv.text def parse_num(s): diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 8f536cf..f5f7bae 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -121,19 +121,17 @@ class TwitterOldDesignScraper(snscrape.base.Scraper): url = f'https://twitter.com/{username}/status/{tweetID}' date = None - timestampA = tweet.find('a', 'tweet-timestamp') - if timestampA: + if (timestampA := tweet.find('a', 'tweet-timestamp')): timestampSpan = timestampA.find('span', '_timestamp') if timestampSpan and timestampSpan.has_attr('data-time'): date = datetime.datetime.fromtimestamp(int(timestampSpan['data-time']), datetime.timezone.utc) if not date: logger.warning(f'Failed to extract date for {url}') - contentP = tweet.find('p', 'tweet-text') content = None outlinks = [] tcooutlinks = [] - if contentP: + if (contentP := tweet.find('p', 'tweet-text')): content = contentP.text for a in contentP.find_all('a'): if a.has_attr('href') and not a['href'].startswith('/') and (not a.has_attr('class') or 'u-hidden' not in a['class']): @@ -144,8 +142,7 @@ class TwitterOldDesignScraper(snscrape.base.Scraper): tcooutlinks.append(a['href']) else: logger.warning(f'Failed to extract content for {url}') - card = tweet.find('div', 'card2') - if card and 'has-autoplayable-media' not in card['class']: + if (card := tweet.find('div', 'card2')) and 'has-autoplayable-media' not in card['class']: for div in card.find_all('div'): if div.has_attr('data-card-url'): outlinks.append(div['data-card-url']) @@ -177,8 +174,7 @@ class TwitterAPIScraper(snscrape.base.Scraper): return logger.info('Retrieving guest token') r = self._get(self._baseUrl if url is None else url, headers = {'User-Agent': self._userAgent}) - match = re.search(r'document\.cookie = decodeURIComponent\("gt=(\d+); Max-Age=10800; Domain=\.twitter\.com; Path=/; Secure"\);', r.text) - if match: + if (match := re.search(r'document\.cookie = decodeURIComponent\("gt=(\d+); Max-Age=10800; Domain=\.twitter\.com; Path=/; Secure"\);', r.text)): logger.debug('Found guest token in HTML') self._guestToken = match.group(1) if 'gt' in r.cookies: diff --git a/snscrape/modules/vkontakte.py b/snscrape/modules/vkontakte.py index b07051d..2684be1 100644 --- a/snscrape/modules/vkontakte.py +++ b/snscrape/modules/vkontakte.py @@ -87,8 +87,7 @@ class VKontakteUserScraper(snscrape.base.Scraper): logger.warning('Private profile') return - profileDeleted = soup.find('h5', class_ = 'profile_deleted_text') - if profileDeleted: + if (profileDeleted := soup.find('h5', class_ = 'profile_deleted_text')): # Unclear what this state represents, so just log website text. logger.warning(profileDeleted.text) return @@ -166,12 +165,10 @@ class VKontakteUserScraper(snscrape.base.Scraper): kwargs['name'] = nameH1.text kwargs['verified'] = bool(nameH1.find('div', class_ = 'page_verified')) - descriptionDiv = soup.find('div', id = 'page_current_info') - if descriptionDiv: + if (descriptionDiv := soup.find('div', id = 'page_current_info')): kwargs['description'] = descriptionDiv.text - infoDiv = soup.find('div', id = 'page_info_wrap') - if infoDiv: + if (infoDiv := soup.find('div', id = 'page_info_wrap')): websites = [] for rowDiv in infoDiv.find_all('div', class_ = ['profile_info_row', 'group_info_row']): if 'profile_info_row' in rowDiv['class']: @@ -197,8 +194,7 @@ class VKontakteUserScraper(snscrape.base.Scraper): else: return int(s.replace(',', '')), 1 - countsDiv = soup.find('div', class_ = 'counts_module') - if countsDiv: + if (countsDiv := soup.find('div', class_ = 'counts_module')): for a in countsDiv.find_all('a', class_ = 'page_counter'): count, granularity = parse_num(a.find('div', class_ = 'count').text) label = a.find('div', class_ = 'label').text @@ -207,17 +203,13 @@ class VKontakteUserScraper(snscrape.base.Scraper): if label in ('followers', 'posts', 'photos', 'tags'): kwargs[label], kwargs[f'{label}Granularity'] = count, granularity - idolsDiv = soup.find('div', id = 'profile_idols') - if idolsDiv: - topDiv = idolsDiv.find('div', class_ = 'header_top') - if topDiv and topDiv.find('span', class_ = 'header_label').text == 'Following': + if (idolsDiv := soup.find('div', id = 'profile_idols')): + if (topDiv := idolsDiv.find('div', class_ = 'header_top')) and topDiv.find('span', class_ = 'header_label').text == 'Following': kwargs['following'], kwargs['followingGranularity'] = parse_num(topDiv.find('span', class_ = 'header_count').text) # On public pages, this is where followers are listed - followersDiv = soup.find('div', id = 'public_followers') - if followersDiv: - topDiv = followersDiv.find('div', class_ = 'header_top') - if topDiv and topDiv.find('span', class_ = 'header_label').text == 'Followers': + if (followersDiv := soup.find('div', id = 'public_followers')): + if (topDiv := followersDiv.find('div', class_ = 'header_top')) and topDiv.find('span', class_ = 'header_label').text == 'Followers': kwargs['followers'], kwargs['followersGranularity'] = parse_num(topDiv.find('span', class_ = 'header_count').text) return User(**kwargs)