Mirror of https://github.com/bellingcat/snscrape.git (synced 2026-06-12 04:18:29 +03:00)
Consistently raise ScraperException on fatal errors
This commit is contained in:
@@ -141,8 +141,7 @@ class FacebookUserScraper(FacebookCommonScraper):
 logger.warning('User does not exist')
 return
 elif r.status_code != 200:
-logger.error('Got status code {r.status_code}')
-return
+raise snscrape.base.ScraperException('Got status code {r.status_code}')
 soup = bs4.BeautifulSoup(r.text, 'lxml')
 yield from self._soup_to_items(soup, baseUrl, 'user')
 nextPageLink = soup.find('a', ajaxify = nextPageLinkPattern)
@@ -154,8 +153,7 @@ class FacebookUserScraper(FacebookCommonScraper):
 # Reproducing that would be difficult to get right, especially as Facebook's codebase evolves, so it's just not sent at all here.
 r = self._get(urllib.parse.urljoin(baseUrl, nextPageLink.get('ajaxify')) + '&__a=1', headers = headers)
 if r.status_code != 200:
-logger.error(f'Got status code {r.status_code}')
-return
+raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
 response = json.loads(spuriousForLoopPattern.sub('', r.text))
 assert 'domops' in response
 assert len(response['domops']) == 1
@@ -197,12 +195,10 @@ class FacebookGroupScraper(FacebookCommonScraper):
 logger.warning('Group does not exist')
 return
 elif r.status_code != 200:
-logger.error('Got status code {r.status_code}')
-return
+raise snscrape.base.ScraperException('Got status code {r.status_code}')
 
 if 'content:{pagelet_group_mall:{container_id:"' not in r.text:
-logger.error('Code container ID marker not found (does the group exist?)')
-return
+raise snscrape.base.ScraperException('Code container ID marker not found (does the group exist?)')
 
 soup = bs4.BeautifulSoup(r.text, 'lxml')
 
@@ -212,9 +208,9 @@ class FacebookGroupScraper(FacebookCommonScraper):
 codeContainerId = r.text[codeContainerIdPos : r.text.index('"', codeContainerIdPos)]
 codeContainer = soup.find('code', id = codeContainerId)
 if not codeContainer:
-raise RuntimeError('Code container not found')
+raise snscrape.base.ScraperException('Code container not found')
 if type(codeContainer.string) is not bs4.element.Comment:
-raise RuntimeError('Code container does not contain a comment')
+raise snscrape.base.ScraperException('Code container does not contain a comment')
 codeSoup = bs4.BeautifulSoup(codeContainer.string, 'lxml')
 yield from self._soup_to_items(codeSoup, baseUrl, 'group')
 
@@ -228,7 +224,7 @@ class FacebookGroupScraper(FacebookCommonScraper):
 headers = headers,
 )
 if r.status_code != 200:
-raise RuntimeError(f'Got status code {r.status_code}')
+raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
 obj = json.loads(spuriousForLoopPattern.sub('', r.text))
 if obj['payload'] == '':
 # End of pagination
|
||||
@@ -109,8 +109,7 @@ class InstagramCommonScraper(snscrape.base.Scraper):
 logger.warning(f'{self._mode} does not exist')
 return
 elif r.status_code != 200:
-logger.error(f'Got status code {r.status_code}')
-return
+raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
 response = r._snscrape_json_obj
 rhxGis = response['rhx_gis'] if 'rhx_gis' in response else ''
 if response['entry_data'][self._pageName][0]['graphql'][self._responseContainer][self._edgeXToMedia]['count'] == 0:
@@ -133,8 +132,7 @@ class InstagramCommonScraper(snscrape.base.Scraper):
 r = self._get(f'https://www.instagram.com/graphql/query/?query_hash={self._queryHash}&variables={variables}', headers = headers, responseOkCallback = self._check_json_callback)
 
 if r.status_code != 200:
-logger.error(f'Got status code {r.status_code}')
-return
+raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
 
 response = r._snscrape_json_obj
 if not response['data'][self._responseContainer][self._edgeXToMedia]['edges']:
|
||||
@@ -49,8 +49,7 @@ class TelegramChannelScraper(snscrape.base.Scraper):
 while True:
 r = self._get(nextPageUrl, headers = headers)
 if r.status_code != 200:
-logger.error(f'Got status code {r.status_code}')
-return
+raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
 soup = bs4.BeautifulSoup(r.text, 'lxml')
 yield from self._soup_to_items(soup, nextPageUrl)
 pageLink = soup.find('a', attrs = {'class': 'tme_messages_more', 'data-before': True})
|
||||
@@ -100,7 +100,7 @@ class TwitterSearchScraper(TwitterCommonScraper):
 r = self._get(self._baseUrl, headers = {'User-Agent': self._userAgent})
 match = re.search(r'document\.cookie = decodeURIComponent\("gt=(\d+);', r.text)
 if not match:
-raise RuntimeError('Unable to find guest token')
+raise snscrape.base.ScraperException('Unable to find guest token')
 return match.group(1)
 
 def _check_scroll_response(self, r):
@@ -166,8 +166,7 @@ class TwitterSearchScraper(TwitterCommonScraper):
 try:
 obj = r.json()
 except json.JSONDecodeError as e:
-logger.error(f'Received invalid JSON from Twitter: {e!s}')
-raise RuntimeError('Received invalid JSON from Twitter') from e
+raise snscrape.base.ScraperException('Received invalid JSON from Twitter') from e
 
 # No data format test, just a hard and loud crash if anything's wrong :-)
 newCursor = None
@@ -185,7 +184,7 @@ class TwitterSearchScraper(TwitterCommonScraper):
 elif 'tombstone' in entry['content']['item']['content'] and 'tweet' in entry['content']['item']['content']['tombstone']:
 tweet = obj['globalObjects']['tweets'][entry['content']['item']['content']['tombstone']['tweet']['id']]
 else:
-raise RuntimeError(f'Unable to handle entry {entry["entryId"]!r}')
+raise snscrape.base.ScraperException(f'Unable to handle entry {entry["entryId"]!r}')
 tweetID = tweet['id']
 content = tweet['full_text']
 username = obj['globalObjects']['users'][tweet['user_id_str']]['screen_name']
@@ -340,7 +339,7 @@ class TwitterListMembersScraper(TwitterCommonScraper):
 soup = bs4.BeautifulSoup(r.text, 'lxml')
 container = soup.find('div', 'stream-container')
 if not container:
-raise RuntimeError('Unable to find container')
+raise snscrape.base.ScraperException('Unable to find container')
 items = container.find_all('li', 'js-stream-item')
 if not items:
 logger.warning('Empty list')
|
||||
@@ -43,23 +43,22 @@ class VKontakteUserScraper(snscrape.base.Scraper):
 logger.info('Retrieving initial data')
 r = self._get(baseUrl, headers = headers)
 if r.status_code == 404:
-logger.error('Wall does not exist')
+logger.warning('Wall does not exist')
 return
 elif r.status_code != 200:
-logger.error(f'Got status code {r.status_code}')
-return
+raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
 
 # VK sends windows-1251-encoded data, but Requests's decoding doesn't seem to work correctly and causes lxml to choke, so we need to pass the binary content and the encoding explicitly.
 soup = bs4.BeautifulSoup(r.content, 'lxml', from_encoding = r.encoding)
 
 if soup.find('div', class_ = 'profile_closed_wall_dummy'):
-logger.error('Private profile')
+logger.warning('Private profile')
 return
 
 profileDeleted = soup.find('h5', class_ = 'profile_deleted_text')
 if profileDeleted:
 # Unclear what this state represents, so just log website text.
-logger.error(profileDeleted.text)
+logger.warning(profileDeleted.text)
 return
 
 newestPost = soup.find('div', class_ = 'post')
@@ -84,16 +83,14 @@ class VKontakteUserScraper(snscrape.base.Scraper):
 headers = headers
 )
 if r.status_code != 200:
-logger.error(f'Got status code {r.status_code}')
-return
+raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
 # Convert to JSON and read the HTML payload. Note that this implicitly converts the data to a Python string (i.e., Unicode), away from a windows-1251-encoded bytes.
 posts = r.json()['payload'][1][0]
 if posts.startswith('<div class="page_block no_posts">'):
 # Reached the end
 break
 if not posts.startswith('<div id="post'):
-logger.error(f'Got an unknown response: {posts[:200]!r}...')
-break
+raise snscrape.base.ScraperException(f'Got an unknown response: {posts[:200]!r}...')
 soup = bs4.BeautifulSoup(posts, 'lxml')
 yield from self._soup_to_items(soup, baseUrl)
 
|
||||
Reference in New Issue
Block a user