Better log output for temporary failures (fixes #2)

This commit is contained in:
JustAnotherArchivist
2018-10-01 03:24:29 +02:00
parent 4d0350e541
commit d0d3c8b2a6
2 changed files with 27 additions and 7 deletions

View File

@@ -59,11 +59,32 @@ class Scraper:
logger.debug(f'... with data: {data!r}')
try:
r = self._session.send(req, timeout = timeout)
if responseOkCallback is None or responseOkCallback(r):
logger.debug(f'{req.url} retrieved successfully')
return r
except requests.exceptions.RequestException as exc:
logger.error(f'Error retrieving {url}: {exc!r}')
if attempt < self._retries:
retrying = ', retrying'
level = logging.WARNING
else:
retrying = ''
level = logging.ERROR
logger.log(level, f'Error retrieving {req.url}: {exc!r}{retrying}')
else:
if responseOkCallback is not None:
success, msg = responseOkCallback(r)
else:
success, msg = (True, None)
msg = f': {msg}' if msg else ''
if success:
logger.debug(f'{req.url} retrieved successfully{msg}')
return r
else:
if attempt < self._retries:
retrying = ', retrying'
level = logging.WARNING
else:
retrying = ''
level = logging.ERROR
logger.log(level, f'Error retrieving {req.url}{msg}{retrying}')
if attempt < self._retries:
sleepTime = 1.0 * 2**attempt # exponential backoff: sleep 1 second after first attempt, 2 after second, 4 after third, etc.
logger.info(f'Waiting {sleepTime:.0f} seconds')

View File

@@ -27,9 +27,8 @@ class TwitterSearchScraper(snscrape.base.Scraper):
def _check_json_callback(self, r):
if r.headers['content-type'] != 'application/json;charset=utf-8':
logger.error(f'Content type of {r.url} is not JSON')
return False
return True
return False, f'content type is not JSON'
return True, None
def get_items(self):
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}