5 Commits

Author SHA1 Message Date
JustAnotherArchivist
cfb1c9a2aa Version 0.1.3 2018-10-01 03:26:22 +02:00
JustAnotherArchivist
d0d3c8b2a6 Better log output for temporary failures (fixes #2) 2018-10-01 03:24:29 +02:00
JustAnotherArchivist
4d0350e541 Disable "quality filter" on Twitter (fixes #3) 2018-10-01 02:51:33 +02:00
JustAnotherArchivist
d17aa15bcb Version 0.1.2 2018-09-11 12:44:07 +02:00
JustAnotherArchivist
d1ef280d6e Fix snscrape.modules not getting installed 2018-09-11 12:43:10 +02:00
3 changed files with 31 additions and 10 deletions

View File

@@ -3,7 +3,7 @@ import setuptools
setuptools.setup(
name = 'snscrape',
-version = '0.1.1',
+version = '0.1.3',
description = 'A social networking service scraper',
author = 'JustAnotherArchivist',
url = 'https://github.com/JustAnotherArchivist/snscrape',
@@ -12,7 +12,7 @@ setuptools.setup(
'License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)',
'Programming Language :: Python :: 3.6',
],
-packages = ['snscrape'],
+packages = ['snscrape', 'snscrape.modules'],
install_requires = ['requests', 'lxml', 'beautifulsoup4'],
entry_points = {
'console_scripts': [

View File

@@ -59,11 +59,32 @@ class Scraper:
logger.debug(f'... with data: {data!r}')
try:
r = self._session.send(req, timeout = timeout)
-if responseOkCallback is None or responseOkCallback(r):
-logger.debug(f'{req.url} retrieved successfully')
-return r
except requests.exceptions.RequestException as exc:
-logger.error(f'Error retrieving {url}: {exc!r}')
+if attempt < self._retries:
+retrying = ', retrying'
+level = logging.WARNING
+else:
+retrying = ''
+level = logging.ERROR
+logger.log(level, f'Error retrieving {req.url}: {exc!r}{retrying}')
+else:
+if responseOkCallback is not None:
+success, msg = responseOkCallback(r)
+else:
+success, msg = (True, None)
+msg = f': {msg}' if msg else ''
+if success:
+logger.debug(f'{req.url} retrieved successfully{msg}')
+return r
+else:
+if attempt < self._retries:
+retrying = ', retrying'
+level = logging.WARNING
+else:
+retrying = ''
+level = logging.ERROR
+logger.log(level, f'Error retrieving {req.url}{msg}{retrying}')
if attempt < self._retries:
sleepTime = 1.0 * 2**attempt # exponential backoff: sleep 1 second after first attempt, 2 after second, 4 after third, etc.
logger.info(f'Waiting {sleepTime:.0f} seconds')

View File

@@ -27,16 +27,15 @@ class TwitterSearchScraper(snscrape.base.Scraper):
def _check_json_callback(self, r):
if r.headers['content-type'] != 'application/json;charset=utf-8':
-logger.error(f'Content type of {r.url} is not JSON')
-return False
-return True
+return False, f'content type is not JSON'
+return True, None
def get_items(self):
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
# First page
logger.info(f'Retrieving search page for {self._query}')
-r = self._get('https://twitter.com/search', params = {'f': 'tweets', 'vertical': 'default', 'lang': 'en', 'q': self._query, 'src': 'typd'}, headers = headers)
+r = self._get('https://twitter.com/search', params = {'f': 'tweets', 'vertical': 'default', 'lang': 'en', 'q': self._query, 'src': 'typd', 'qf': 'off'}, headers = headers)
feed = self._get_feed_from_html(r.text)
if not feed:
@@ -57,6 +56,7 @@ class TwitterSearchScraper(snscrape.base.Scraper):
'include_entities': '1',
'reset_error_state': 'false',
'src': 'typd',
+'qf': 'off',
'max_position': maxPosition,
},
headers = headers,