mirror of
https://github.com/bellingcat/snscrape.git
synced 2026-06-11 20:08:29 +03:00
Compare commits
7 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| | cfb1c9a2aa | | |
| | d0d3c8b2a6 | | |
| | 4d0350e541 | | |
| | d17aa15bcb | | |
| | d1ef280d6e | | |
| | 2823272e0b | | |
| | 540f557002 | | |
6
setup.py
6
setup.py
@@ -3,16 +3,16 @@ import setuptools
|
|||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name = 'snscrape',
|
name = 'snscrape',
|
||||||
version = '0.1',
|
version = '0.1.3',
|
||||||
description = 'A social networking service scraper',
|
description = 'A social networking service scraper',
|
||||||
author = 'JustAnotherArchivist',
|
author = 'JustAnotherArchivist',
|
||||||
url = 'https://github.com/JustAnotherArchivist/snscrape',
|
url = 'https://github.com/JustAnotherArchivist/snscrape',
|
||||||
classifiers = [
|
classifiers = [
|
||||||
'Development Status :: 4 - Beta',
|
'Development Status :: 4 - Beta',
|
||||||
'License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)'.
|
'License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)',
|
||||||
'Programming Language :: Python :: 3.6',
|
'Programming Language :: Python :: 3.6',
|
||||||
],
|
],
|
||||||
packages = ['snscrape'],
|
packages = ['snscrape', 'snscrape.modules'],
|
||||||
install_requires = ['requests', 'lxml', 'beautifulsoup4'],
|
install_requires = ['requests', 'lxml', 'beautifulsoup4'],
|
||||||
entry_points = {
|
entry_points = {
|
||||||
'console_scripts': [
|
'console_scripts': [
|
||||||
|
|||||||
@@ -59,11 +59,32 @@ class Scraper:
|
|||||||
logger.debug(f'... with data: {data!r}')
|
logger.debug(f'... with data: {data!r}')
|
||||||
try:
|
try:
|
||||||
r = self._session.send(req, timeout = timeout)
|
r = self._session.send(req, timeout = timeout)
|
||||||
if responseOkCallback is None or responseOkCallback(r):
|
|
||||||
logger.debug(f'{req.url} retrieved successfully')
|
|
||||||
return r
|
|
||||||
except requests.exceptions.RequestException as exc:
|
except requests.exceptions.RequestException as exc:
|
||||||
logger.error(f'Error retrieving {url}: {exc!r}')
|
if attempt < self._retries:
|
||||||
|
retrying = ', retrying'
|
||||||
|
level = logging.WARNING
|
||||||
|
else:
|
||||||
|
retrying = ''
|
||||||
|
level = logging.ERROR
|
||||||
|
logger.log(level, f'Error retrieving {req.url}: {exc!r}{retrying}')
|
||||||
|
else:
|
||||||
|
if responseOkCallback is not None:
|
||||||
|
success, msg = responseOkCallback(r)
|
||||||
|
else:
|
||||||
|
success, msg = (True, None)
|
||||||
|
msg = f': {msg}' if msg else ''
|
||||||
|
|
||||||
|
if success:
|
||||||
|
logger.debug(f'{req.url} retrieved successfully{msg}')
|
||||||
|
return r
|
||||||
|
else:
|
||||||
|
if attempt < self._retries:
|
||||||
|
retrying = ', retrying'
|
||||||
|
level = logging.WARNING
|
||||||
|
else:
|
||||||
|
retrying = ''
|
||||||
|
level = logging.ERROR
|
||||||
|
logger.log(level, f'Error retrieving {req.url}{msg}{retrying}')
|
||||||
if attempt < self._retries:
|
if attempt < self._retries:
|
||||||
sleepTime = 1.0 * 2**attempt # exponential backoff: sleep 1 second after first attempt, 2 after second, 4 after third, etc.
|
sleepTime = 1.0 * 2**attempt # exponential backoff: sleep 1 second after first attempt, 2 after second, 4 after third, etc.
|
||||||
logger.info(f'Waiting {sleepTime:.0f} seconds')
|
logger.info(f'Waiting {sleepTime:.0f} seconds')
|
||||||
|
|||||||
@@ -27,16 +27,15 @@ class TwitterSearchScraper(snscrape.base.Scraper):
|
|||||||
|
|
||||||
def _check_json_callback(self, r):
|
def _check_json_callback(self, r):
|
||||||
if r.headers['content-type'] != 'application/json;charset=utf-8':
|
if r.headers['content-type'] != 'application/json;charset=utf-8':
|
||||||
logger.error(f'Content type of {r.url} is not JSON')
|
return False, f'content type is not JSON'
|
||||||
return False
|
return True, None
|
||||||
return True
|
|
||||||
|
|
||||||
def get_items(self):
|
def get_items(self):
|
||||||
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
|
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
|
||||||
|
|
||||||
# First page
|
# First page
|
||||||
logger.info(f'Retrieving search page for {self._query}')
|
logger.info(f'Retrieving search page for {self._query}')
|
||||||
r = self._get('https://twitter.com/search', params = {'f': 'tweets', 'vertical': 'default', 'lang': 'en', 'q': self._query, 'src': 'typd'}, headers = headers)
|
r = self._get('https://twitter.com/search', params = {'f': 'tweets', 'vertical': 'default', 'lang': 'en', 'q': self._query, 'src': 'typd', 'qf': 'off'}, headers = headers)
|
||||||
|
|
||||||
feed = self._get_feed_from_html(r.text)
|
feed = self._get_feed_from_html(r.text)
|
||||||
if not feed:
|
if not feed:
|
||||||
@@ -57,6 +56,7 @@ class TwitterSearchScraper(snscrape.base.Scraper):
|
|||||||
'include_entities': '1',
|
'include_entities': '1',
|
||||||
'reset_error_state': 'false',
|
'reset_error_state': 'false',
|
||||||
'src': 'typd',
|
'src': 'typd',
|
||||||
|
'qf': 'off',
|
||||||
'max_position': maxPosition,
|
'max_position': maxPosition,
|
||||||
},
|
},
|
||||||
headers = headers,
|
headers = headers,
|
||||||
|
|||||||
Reference in New Issue
Block a user