Consistently raise ScraperException on fatal errors

This commit is contained in:
JustAnotherArchivist
2020-05-30 00:40:04 +00:00
parent dbe4c5ce55
commit 1459245258
5 changed files with 20 additions and 31 deletions

View File

@@ -141,8 +141,7 @@ class FacebookUserScraper(FacebookCommonScraper):
logger.warning('User does not exist')
return
elif r.status_code != 200:
logger.error('Got status code {r.status_code}')
return
raise snscrape.base.ScraperException('Got status code {r.status_code}')
soup = bs4.BeautifulSoup(r.text, 'lxml')
yield from self._soup_to_items(soup, baseUrl, 'user')
nextPageLink = soup.find('a', ajaxify = nextPageLinkPattern)
@@ -154,8 +153,7 @@ class FacebookUserScraper(FacebookCommonScraper):
# Reproducing that would be difficult to get right, especially as Facebook's codebase evolves, so it's just not sent at all here.
r = self._get(urllib.parse.urljoin(baseUrl, nextPageLink.get('ajaxify')) + '&__a=1', headers = headers)
if r.status_code != 200:
logger.error(f'Got status code {r.status_code}')
return
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
response = json.loads(spuriousForLoopPattern.sub('', r.text))
assert 'domops' in response
assert len(response['domops']) == 1
@@ -197,12 +195,10 @@ class FacebookGroupScraper(FacebookCommonScraper):
logger.warning('Group does not exist')
return
elif r.status_code != 200:
logger.error('Got status code {r.status_code}')
return
raise snscrape.base.ScraperException('Got status code {r.status_code}')
if 'content:{pagelet_group_mall:{container_id:"' not in r.text:
logger.error('Code container ID marker not found (does the group exist?)')
return
raise snscrape.base.ScraperException('Code container ID marker not found (does the group exist?)')
soup = bs4.BeautifulSoup(r.text, 'lxml')
@@ -212,9 +208,9 @@ class FacebookGroupScraper(FacebookCommonScraper):
codeContainerId = r.text[codeContainerIdPos : r.text.index('"', codeContainerIdPos)]
codeContainer = soup.find('code', id = codeContainerId)
if not codeContainer:
raise RuntimeError('Code container not found')
raise snscrape.base.ScraperException('Code container not found')
if type(codeContainer.string) is not bs4.element.Comment:
raise RuntimeError('Code container does not contain a comment')
raise snscrape.base.ScraperException('Code container does not contain a comment')
codeSoup = bs4.BeautifulSoup(codeContainer.string, 'lxml')
yield from self._soup_to_items(codeSoup, baseUrl, 'group')
@@ -228,7 +224,7 @@ class FacebookGroupScraper(FacebookCommonScraper):
headers = headers,
)
if r.status_code != 200:
raise RuntimeError(f'Got status code {r.status_code}')
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
obj = json.loads(spuriousForLoopPattern.sub('', r.text))
if obj['payload'] == '':
# End of pagination

View File

@@ -109,8 +109,7 @@ class InstagramCommonScraper(snscrape.base.Scraper):
logger.warning(f'{self._mode} does not exist')
return
elif r.status_code != 200:
logger.error(f'Got status code {r.status_code}')
return
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
response = r._snscrape_json_obj
rhxGis = response['rhx_gis'] if 'rhx_gis' in response else ''
if response['entry_data'][self._pageName][0]['graphql'][self._responseContainer][self._edgeXToMedia]['count'] == 0:
@@ -133,8 +132,7 @@ class InstagramCommonScraper(snscrape.base.Scraper):
r = self._get(f'https://www.instagram.com/graphql/query/?query_hash={self._queryHash}&variables={variables}', headers = headers, responseOkCallback = self._check_json_callback)
if r.status_code != 200:
logger.error(f'Got status code {r.status_code}')
return
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
response = r._snscrape_json_obj
if not response['data'][self._responseContainer][self._edgeXToMedia]['edges']:

View File

@@ -49,8 +49,7 @@ class TelegramChannelScraper(snscrape.base.Scraper):
while True:
r = self._get(nextPageUrl, headers = headers)
if r.status_code != 200:
logger.error(f'Got status code {r.status_code}')
return
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
soup = bs4.BeautifulSoup(r.text, 'lxml')
yield from self._soup_to_items(soup, nextPageUrl)
pageLink = soup.find('a', attrs = {'class': 'tme_messages_more', 'data-before': True})

View File

@@ -100,7 +100,7 @@ class TwitterSearchScraper(TwitterCommonScraper):
r = self._get(self._baseUrl, headers = {'User-Agent': self._userAgent})
match = re.search(r'document\.cookie = decodeURIComponent\("gt=(\d+);', r.text)
if not match:
raise RuntimeError('Unable to find guest token')
raise snscrape.base.ScraperException('Unable to find guest token')
return match.group(1)
def _check_scroll_response(self, r):
@@ -166,8 +166,7 @@ class TwitterSearchScraper(TwitterCommonScraper):
try:
obj = r.json()
except json.JSONDecodeError as e:
logger.error(f'Received invalid JSON from Twitter: {e!s}')
raise RuntimeError('Received invalid JSON from Twitter') from e
raise snscrape.base.ScraperException('Received invalid JSON from Twitter') from e
# No data format test, just a hard and loud crash if anything's wrong :-)
newCursor = None
@@ -185,7 +184,7 @@ class TwitterSearchScraper(TwitterCommonScraper):
elif 'tombstone' in entry['content']['item']['content'] and 'tweet' in entry['content']['item']['content']['tombstone']:
tweet = obj['globalObjects']['tweets'][entry['content']['item']['content']['tombstone']['tweet']['id']]
else:
raise RuntimeError(f'Unable to handle entry {entry["entryId"]!r}')
raise snscrape.base.ScraperException(f'Unable to handle entry {entry["entryId"]!r}')
tweetID = tweet['id']
content = tweet['full_text']
username = obj['globalObjects']['users'][tweet['user_id_str']]['screen_name']
@@ -340,7 +339,7 @@ class TwitterListMembersScraper(TwitterCommonScraper):
soup = bs4.BeautifulSoup(r.text, 'lxml')
container = soup.find('div', 'stream-container')
if not container:
raise RuntimeError('Unable to find container')
raise snscrape.base.ScraperException('Unable to find container')
items = container.find_all('li', 'js-stream-item')
if not items:
logger.warning('Empty list')

View File

@@ -43,23 +43,22 @@ class VKontakteUserScraper(snscrape.base.Scraper):
logger.info('Retrieving initial data')
r = self._get(baseUrl, headers = headers)
if r.status_code == 404:
logger.error('Wall does not exist')
logger.warning('Wall does not exist')
return
elif r.status_code != 200:
logger.error(f'Got status code {r.status_code}')
return
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
# VK sends windows-1251-encoded data, but Requests's decoding doesn't seem to work correctly and causes lxml to choke, so we need to pass the binary content and the encoding explicitly.
soup = bs4.BeautifulSoup(r.content, 'lxml', from_encoding = r.encoding)
if soup.find('div', class_ = 'profile_closed_wall_dummy'):
logger.error('Private profile')
logger.warning('Private profile')
return
profileDeleted = soup.find('h5', class_ = 'profile_deleted_text')
if profileDeleted:
# Unclear what this state represents, so just log website text.
logger.error(profileDeleted.text)
logger.warning(profileDeleted.text)
return
newestPost = soup.find('div', class_ = 'post')
@@ -84,16 +83,14 @@ class VKontakteUserScraper(snscrape.base.Scraper):
headers = headers
)
if r.status_code != 200:
logger.error(f'Got status code {r.status_code}')
return
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
# Convert to JSON and read the HTML payload. Note that this implicitly converts the data to a Python string (i.e., Unicode), away from windows-1251-encoded bytes.
posts = r.json()['payload'][1][0]
if posts.startswith('<div class="page_block no_posts">'):
# Reached the end
break
if not posts.startswith('<div id="post'):
logger.error(f'Got an unknown response: {posts[:200]!r}...')
break
raise snscrape.base.ScraperException(f'Got an unknown response: {posts[:200]!r}...')
soup = bs4.BeautifulSoup(posts, 'lxml')
yield from self._soup_to_items(soup, baseUrl)