diff --git a/snscrape/modules/facebook.py b/snscrape/modules/facebook.py index 0725767..ebefce0 100644 --- a/snscrape/modules/facebook.py +++ b/snscrape/modules/facebook.py @@ -141,8 +141,7 @@ class FacebookUserScraper(FacebookCommonScraper): logger.warning('User does not exist') return elif r.status_code != 200: - logger.error('Got status code {r.status_code}') - return + raise snscrape.base.ScraperException(f'Got status code {r.status_code}') soup = bs4.BeautifulSoup(r.text, 'lxml') yield from self._soup_to_items(soup, baseUrl, 'user') nextPageLink = soup.find('a', ajaxify = nextPageLinkPattern) @@ -154,8 +153,7 @@ class FacebookUserScraper(FacebookCommonScraper): # Reproducing that would be difficult to get right, especially as Facebook's codebase evolves, so it's just not sent at all here. r = self._get(urllib.parse.urljoin(baseUrl, nextPageLink.get('ajaxify')) + '&__a=1', headers = headers) if r.status_code != 200: - logger.error(f'Got status code {r.status_code}') - return + raise snscrape.base.ScraperException(f'Got status code {r.status_code}') response = json.loads(spuriousForLoopPattern.sub('', r.text)) assert 'domops' in response assert len(response['domops']) == 1 @@ -197,12 +195,10 @@ class FacebookGroupScraper(FacebookCommonScraper): logger.warning('Group does not exist') return elif r.status_code != 200: - logger.error('Got status code {r.status_code}') - return + raise snscrape.base.ScraperException(f'Got status code {r.status_code}') if 'content:{pagelet_group_mall:{container_id:"' not in r.text: - logger.error('Code container ID marker not found (does the group exist?)') - return + raise snscrape.base.ScraperException('Code container ID marker not found (does the group exist?)') soup = bs4.BeautifulSoup(r.text, 'lxml') @@ -212,9 +208,9 @@ class FacebookGroupScraper(FacebookCommonScraper): codeContainerId = r.text[codeContainerIdPos : r.text.index('"', codeContainerIdPos)] codeContainer = soup.find('code', id = codeContainerId) if not codeContainer: 
- raise RuntimeError('Code container not found') + raise snscrape.base.ScraperException('Code container not found') if type(codeContainer.string) is not bs4.element.Comment: - raise RuntimeError('Code container does not contain a comment') + raise snscrape.base.ScraperException('Code container does not contain a comment') codeSoup = bs4.BeautifulSoup(codeContainer.string, 'lxml') yield from self._soup_to_items(codeSoup, baseUrl, 'group') @@ -228,7 +224,7 @@ class FacebookGroupScraper(FacebookCommonScraper): headers = headers, ) if r.status_code != 200: - raise RuntimeError(f'Got status code {r.status_code}') + raise snscrape.base.ScraperException(f'Got status code {r.status_code}') obj = json.loads(spuriousForLoopPattern.sub('', r.text)) if obj['payload'] == '': # End of pagination diff --git a/snscrape/modules/instagram.py b/snscrape/modules/instagram.py index 31642e7..912f76e 100644 --- a/snscrape/modules/instagram.py +++ b/snscrape/modules/instagram.py @@ -109,8 +109,7 @@ class InstagramCommonScraper(snscrape.base.Scraper): logger.warning(f'{self._mode} does not exist') return elif r.status_code != 200: - logger.error(f'Got status code {r.status_code}') - return + raise snscrape.base.ScraperException(f'Got status code {r.status_code}') response = r._snscrape_json_obj rhxGis = response['rhx_gis'] if 'rhx_gis' in response else '' if response['entry_data'][self._pageName][0]['graphql'][self._responseContainer][self._edgeXToMedia]['count'] == 0: @@ -133,8 +132,7 @@ class InstagramCommonScraper(snscrape.base.Scraper): r = self._get(f'https://www.instagram.com/graphql/query/?query_hash={self._queryHash}&variables={variables}', headers = headers, responseOkCallback = self._check_json_callback) if r.status_code != 200: - logger.error(f'Got status code {r.status_code}') - return + raise snscrape.base.ScraperException(f'Got status code {r.status_code}') response = r._snscrape_json_obj if not response['data'][self._responseContainer][self._edgeXToMedia]['edges']: diff 
--git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py index 4d739be..426e61b 100644 --- a/snscrape/modules/telegram.py +++ b/snscrape/modules/telegram.py @@ -49,8 +49,7 @@ class TelegramChannelScraper(snscrape.base.Scraper): while True: r = self._get(nextPageUrl, headers = headers) if r.status_code != 200: - logger.error(f'Got status code {r.status_code}') - return + raise snscrape.base.ScraperException(f'Got status code {r.status_code}') soup = bs4.BeautifulSoup(r.text, 'lxml') yield from self._soup_to_items(soup, nextPageUrl) pageLink = soup.find('a', attrs = {'class': 'tme_messages_more', 'data-before': True}) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 4c2601a..50da535 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -100,7 +100,7 @@ class TwitterSearchScraper(TwitterCommonScraper): r = self._get(self._baseUrl, headers = {'User-Agent': self._userAgent}) match = re.search(r'document\.cookie = decodeURIComponent\("gt=(\d+);', r.text) if not match: - raise RuntimeError('Unable to find guest token') + raise snscrape.base.ScraperException('Unable to find guest token') return match.group(1) def _check_scroll_response(self, r): @@ -166,8 +166,7 @@ class TwitterSearchScraper(TwitterCommonScraper): try: obj = r.json() except json.JSONDecodeError as e: - logger.error(f'Received invalid JSON from Twitter: {e!s}') - raise RuntimeError('Received invalid JSON from Twitter') from e + raise snscrape.base.ScraperException('Received invalid JSON from Twitter') from e # No data format test, just a hard and loud crash if anything's wrong :-) newCursor = None @@ -185,7 +184,7 @@ class TwitterSearchScraper(TwitterCommonScraper): elif 'tombstone' in entry['content']['item']['content'] and 'tweet' in entry['content']['item']['content']['tombstone']: tweet = obj['globalObjects']['tweets'][entry['content']['item']['content']['tombstone']['tweet']['id']] else: - raise RuntimeError(f'Unable to handle entry 
{entry["entryId"]!r}') + raise snscrape.base.ScraperException(f'Unable to handle entry {entry["entryId"]!r}') tweetID = tweet['id'] content = tweet['full_text'] username = obj['globalObjects']['users'][tweet['user_id_str']]['screen_name'] @@ -340,7 +339,7 @@ class TwitterListMembersScraper(TwitterCommonScraper): soup = bs4.BeautifulSoup(r.text, 'lxml') container = soup.find('div', 'stream-container') if not container: - raise RuntimeError('Unable to find container') + raise snscrape.base.ScraperException('Unable to find container') items = container.find_all('li', 'js-stream-item') if not items: logger.warning('Empty list') diff --git a/snscrape/modules/vkontakte.py b/snscrape/modules/vkontakte.py index bb34278..d89f7af 100644 --- a/snscrape/modules/vkontakte.py +++ b/snscrape/modules/vkontakte.py @@ -43,23 +43,22 @@ class VKontakteUserScraper(snscrape.base.Scraper): logger.info('Retrieving initial data') r = self._get(baseUrl, headers = headers) if r.status_code == 404: - logger.error('Wall does not exist') + logger.warning('Wall does not exist') return elif r.status_code != 200: - logger.error(f'Got status code {r.status_code}') - return + raise snscrape.base.ScraperException(f'Got status code {r.status_code}') # VK sends windows-1251-encoded data, but Requests's decoding doesn't seem to work correctly and causes lxml to choke, so we need to pass the binary content and the encoding explicitly. soup = bs4.BeautifulSoup(r.content, 'lxml', from_encoding = r.encoding) if soup.find('div', class_ = 'profile_closed_wall_dummy'): - logger.error('Private profile') + logger.warning('Private profile') return profileDeleted = soup.find('h5', class_ = 'profile_deleted_text') if profileDeleted: # Unclear what this state represents, so just log website text. 
- logger.error(profileDeleted.text) + logger.warning(profileDeleted.text) return newestPost = soup.find('div', class_ = 'post') @@ -84,16 +83,14 @@ class VKontakteUserScraper(snscrape.base.Scraper): headers = headers ) if r.status_code != 200: - logger.error(f'Got status code {r.status_code}') - return + raise snscrape.base.ScraperException(f'Got status code {r.status_code}') # Convert to JSON and read the HTML payload. Note that this implicitly converts the data to a Python string (i.e., Unicode), away from a windows-1251-encoded bytes. posts = r.json()['payload'][1][0] if posts.startswith('
'): # Reached the end break if not posts.startswith('