From d0d3c8b2a66f6244b38ee76d0cb8da72d406031b Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Mon, 1 Oct 2018 03:24:29 +0200 Subject: [PATCH] Better log output for temporary failures (fixes #2) --- snscrape/base.py | 29 +++++++++++++++++++++++++---- snscrape/modules/twitter.py | 5 ++--- 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/snscrape/base.py b/snscrape/base.py index a4a6f21..443efc2 100644 --- a/snscrape/base.py +++ b/snscrape/base.py @@ -59,11 +59,32 @@ class Scraper: logger.debug(f'... with data: {data!r}') try: r = self._session.send(req, timeout = timeout) - if responseOkCallback is None or responseOkCallback(r): - logger.debug(f'{req.url} retrieved successfully') - return r except requests.exceptions.RequestException as exc: - logger.error(f'Error retrieving {url}: {exc!r}') + if attempt < self._retries: + retrying = ', retrying' + level = logging.WARNING + else: + retrying = '' + level = logging.ERROR + logger.log(level, f'Error retrieving {req.url}: {exc!r}{retrying}') + else: + if responseOkCallback is not None: + success, msg = responseOkCallback(r) + else: + success, msg = (True, None) + msg = f': {msg}' if msg else '' + + if success: + logger.debug(f'{req.url} retrieved successfully{msg}') + return r + else: + if attempt < self._retries: + retrying = ', retrying' + level = logging.WARNING + else: + retrying = '' + level = logging.ERROR + logger.log(level, f'Error retrieving {req.url}{msg}{retrying}') if attempt < self._retries: sleepTime = 1.0 * 2**attempt # exponential backoff: sleep 1 second after first attempt, 2 after second, 4 after third, etc. logger.info(f'Waiting {sleepTime:.0f} seconds') diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index 8a8436c..30e64c2 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -27,9 +27,8 @@ class TwitterSearchScraper(snscrape.base.Scraper): def _check_json_callback(self, r): if r.headers['content-type'] != 'application/json;charset=utf-8': - logger.error(f'Content type of {r.url} is not JSON') - return False - return True + return False, f'content type is not JSON' + return True, None def get_items(self): headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}