mirror of
https://github.com/bellingcat/snscrape.git
synced 2026-06-11 20:08:29 +03:00
Compare commits
10 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7b967ff82a | ||
|
|
90f9598ecc | ||
|
|
7b3c7deb28 | ||
|
|
040a11656c | ||
|
|
1459245258 | ||
|
|
dbe4c5ce55 | ||
|
|
80491ecc2c | ||
|
|
1a71b58101 | ||
|
|
0ce37a69d4 | ||
|
|
722bfd5f7c |
@@ -3,9 +3,8 @@ snscrape is a scraper for social networking services (SNS). It scrapes things li
|
|||||||
|
|
||||||
The following services are currently supported:
|
The following services are currently supported:
|
||||||
* Facebook: user profiles and groups
|
* Facebook: user profiles and groups
|
||||||
* Gab: user profile posts, media, and comments
|
|
||||||
* Google+: user profiles
|
|
||||||
* Instagram: user profiles, hashtags, and locations
|
* Instagram: user profiles, hashtags, and locations
|
||||||
|
* Telegram: channels
|
||||||
* Twitter: user profiles, hashtags, searches, threads, and lists (members as well as posts)
|
* Twitter: user profiles, hashtags, searches, threads, and lists (members as well as posts)
|
||||||
* VKontakte: user profiles
|
* VKontakte: user profiles
|
||||||
|
|
||||||
|
|||||||
@@ -93,13 +93,19 @@ def _dump_locals_on_exception():
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
trace = inspect.trace()
|
trace = inspect.trace()
|
||||||
if len(trace) >= 2:
|
if len(trace) >= 2:
|
||||||
name = _dump_stack_and_locals(trace[1:])
|
name = _dump_stack_and_locals(trace[1:], exc = e)
|
||||||
logger.fatal(f'Dumped stack and locals to {name}')
|
logger.fatal(f'Dumped stack and locals to {name}')
|
||||||
raise
|
raise
|
||||||
|
|
||||||
|
|
||||||
def _dump_stack_and_locals(trace):
|
def _dump_stack_and_locals(trace, exc = None):
|
||||||
with tempfile.NamedTemporaryFile('w', prefix = 'snscrape_locals_', delete = False) as fp:
|
with tempfile.NamedTemporaryFile('w', prefix = 'snscrape_locals_', delete = False) as fp:
|
||||||
|
if exc is not None:
|
||||||
|
fp.write('Exception:\n')
|
||||||
|
fp.write(f' {type(exc).__module__}.{type(exc).__name__}: {exc!s}\n')
|
||||||
|
fp.write(f' args: {exc.args!r}\n')
|
||||||
|
fp.write('\n')
|
||||||
|
|
||||||
fp.write('Stack:\n')
|
fp.write('Stack:\n')
|
||||||
for frameRecord in trace:
|
for frameRecord in trace:
|
||||||
fp.write(f' File "{frameRecord.filename}", line {frameRecord.lineno}, in {frameRecord.function}\n')
|
fp.write(f' File "{frameRecord.filename}", line {frameRecord.lineno}, in {frameRecord.function}\n')
|
||||||
|
|||||||
@@ -141,8 +141,7 @@ class FacebookUserScraper(FacebookCommonScraper):
|
|||||||
logger.warning('User does not exist')
|
logger.warning('User does not exist')
|
||||||
return
|
return
|
||||||
elif r.status_code != 200:
|
elif r.status_code != 200:
|
||||||
logger.error('Got status code {r.status_code}')
|
# Must be an f-string; otherwise the literal text "{r.status_code}" is raised instead of the actual status code.
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
||||||
return
|
|
||||||
soup = bs4.BeautifulSoup(r.text, 'lxml')
|
soup = bs4.BeautifulSoup(r.text, 'lxml')
|
||||||
yield from self._soup_to_items(soup, baseUrl, 'user')
|
yield from self._soup_to_items(soup, baseUrl, 'user')
|
||||||
nextPageLink = soup.find('a', ajaxify = nextPageLinkPattern)
|
nextPageLink = soup.find('a', ajaxify = nextPageLinkPattern)
|
||||||
@@ -154,8 +153,7 @@ class FacebookUserScraper(FacebookCommonScraper):
|
|||||||
# Reproducing that would be difficult to get right, especially as Facebook's codebase evolves, so it's just not sent at all here.
|
# Reproducing that would be difficult to get right, especially as Facebook's codebase evolves, so it's just not sent at all here.
|
||||||
r = self._get(urllib.parse.urljoin(baseUrl, nextPageLink.get('ajaxify')) + '&__a=1', headers = headers)
|
r = self._get(urllib.parse.urljoin(baseUrl, nextPageLink.get('ajaxify')) + '&__a=1', headers = headers)
|
||||||
if r.status_code != 200:
|
if r.status_code != 200:
|
||||||
logger.error(f'Got status code {r.status_code}')
|
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
||||||
return
|
|
||||||
response = json.loads(spuriousForLoopPattern.sub('', r.text))
|
response = json.loads(spuriousForLoopPattern.sub('', r.text))
|
||||||
assert 'domops' in response
|
assert 'domops' in response
|
||||||
assert len(response['domops']) == 1
|
assert len(response['domops']) == 1
|
||||||
@@ -197,12 +195,10 @@ class FacebookGroupScraper(FacebookCommonScraper):
|
|||||||
logger.warning('Group does not exist')
|
logger.warning('Group does not exist')
|
||||||
return
|
return
|
||||||
elif r.status_code != 200:
|
elif r.status_code != 200:
|
||||||
logger.error('Got status code {r.status_code}')
|
# Must be an f-string; otherwise the literal text "{r.status_code}" is raised instead of the actual status code.
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
||||||
return
|
|
||||||
|
|
||||||
if 'content:{pagelet_group_mall:{container_id:"' not in r.text:
|
if 'content:{pagelet_group_mall:{container_id:"' not in r.text:
|
||||||
logger.error('Code container ID marker not found (does the group exist?)')
|
raise snscrape.base.ScraperException('Code container ID marker not found (does the group exist?)')
|
||||||
return
|
|
||||||
|
|
||||||
soup = bs4.BeautifulSoup(r.text, 'lxml')
|
soup = bs4.BeautifulSoup(r.text, 'lxml')
|
||||||
|
|
||||||
@@ -212,9 +208,9 @@ class FacebookGroupScraper(FacebookCommonScraper):
|
|||||||
codeContainerId = r.text[codeContainerIdPos : r.text.index('"', codeContainerIdPos)]
|
codeContainerId = r.text[codeContainerIdPos : r.text.index('"', codeContainerIdPos)]
|
||||||
codeContainer = soup.find('code', id = codeContainerId)
|
codeContainer = soup.find('code', id = codeContainerId)
|
||||||
if not codeContainer:
|
if not codeContainer:
|
||||||
raise RuntimeError('Code container not found')
|
raise snscrape.base.ScraperException('Code container not found')
|
||||||
if type(codeContainer.string) is not bs4.element.Comment:
|
if type(codeContainer.string) is not bs4.element.Comment:
|
||||||
raise RuntimeError('Code container does not contain a comment')
|
raise snscrape.base.ScraperException('Code container does not contain a comment')
|
||||||
codeSoup = bs4.BeautifulSoup(codeContainer.string, 'lxml')
|
codeSoup = bs4.BeautifulSoup(codeContainer.string, 'lxml')
|
||||||
yield from self._soup_to_items(codeSoup, baseUrl, 'group')
|
yield from self._soup_to_items(codeSoup, baseUrl, 'group')
|
||||||
|
|
||||||
@@ -228,7 +224,7 @@ class FacebookGroupScraper(FacebookCommonScraper):
|
|||||||
headers = headers,
|
headers = headers,
|
||||||
)
|
)
|
||||||
if r.status_code != 200:
|
if r.status_code != 200:
|
||||||
raise RuntimeError(f'Got status code {r.status_code}')
|
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
||||||
obj = json.loads(spuriousForLoopPattern.sub('', r.text))
|
obj = json.loads(spuriousForLoopPattern.sub('', r.text))
|
||||||
if obj['payload'] == '':
|
if obj['payload'] == '':
|
||||||
# End of pagination
|
# End of pagination
|
||||||
|
|||||||
@@ -1,115 +0,0 @@
|
|||||||
import datetime
|
|
||||||
import json
|
|
||||||
import logging
|
|
||||||
import snscrape.base
|
|
||||||
import time
|
|
||||||
import typing
|
|
||||||
import urllib.parse
|
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
class GabPost(typing.NamedTuple, snscrape.base.Item):
|
|
||||||
url: str
|
|
||||||
date: datetime.datetime
|
|
||||||
content: str
|
|
||||||
|
|
||||||
def __str__(self):
|
|
||||||
return self.url
|
|
||||||
|
|
||||||
|
|
||||||
class GabUserCommonScraper(snscrape.base.Scraper):
|
|
||||||
def __init__(self, mode, username, **kwargs):
|
|
||||||
super().__init__(**kwargs)
|
|
||||||
if mode not in ('posts', 'comments', 'media'):
|
|
||||||
raise ValueError('Invalid mode')
|
|
||||||
self._mode = mode
|
|
||||||
self._username = username
|
|
||||||
if mode == 'posts':
|
|
||||||
self._baseUrl = f'https://gab.com/api/feed/{username}'
|
|
||||||
self._beforeGlue = '?'
|
|
||||||
elif mode == 'comments':
|
|
||||||
self._baseUrl = f'https://gab.com/api/feed/{username}/comments?includes=post.conversation_parent'
|
|
||||||
self._beforeGlue = '&'
|
|
||||||
elif mode == 'media':
|
|
||||||
self._baseUrl = f'https://gab.com/api/feed/{username}/media'
|
|
||||||
self._beforeGlue = '?'
|
|
||||||
|
|
||||||
def _response_to_items(self, response):
|
|
||||||
yielded = set()
|
|
||||||
for post in response['data']:
|
|
||||||
if post['post']['id'] not in yielded:
|
|
||||||
yield GabPost(
|
|
||||||
url = f'https://gab.com/{post["post"]["user"]["username"]}/posts/{post["post"]["id"]}',
|
|
||||||
date = datetime.datetime.strptime(post['post']['created_at'].replace('-', '', 2).replace(':', ''), '%Y%m%dT%H%M%S%z'),
|
|
||||||
content = post['post']['body'],
|
|
||||||
)
|
|
||||||
yielded.add(post['post']['id'])
|
|
||||||
|
|
||||||
def get_items(self):
|
|
||||||
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0', 'Accept-Language': 'en-US,en;q=0.5'}
|
|
||||||
|
|
||||||
logger.info('Retrieving initial data')
|
|
||||||
r = self._get(self._baseUrl, headers = headers)
|
|
||||||
if r.status_code == 404:
|
|
||||||
logger.error('User does not exist')
|
|
||||||
return
|
|
||||||
elif r.status_code != 200:
|
|
||||||
logger.error(f'Got status code {r.status_code}')
|
|
||||||
return
|
|
||||||
|
|
||||||
response = json.loads(r.text)
|
|
||||||
if not response['data']:
|
|
||||||
logger.error('User has no posts')
|
|
||||||
return
|
|
||||||
yield from self._response_to_items(response)
|
|
||||||
if self._mode == 'posts':
|
|
||||||
before = response['data'][-1]['published_at']
|
|
||||||
elif self._mode in ('comments', 'media'):
|
|
||||||
before = 30
|
|
||||||
|
|
||||||
while True:
|
|
||||||
logger.info('Retrieving next page')
|
|
||||||
r = self._get(f'{self._baseUrl}{self._beforeGlue}before={before}', headers = headers)
|
|
||||||
if r.status_code != 200:
|
|
||||||
logger.error(f'Got status code {r.status_code}')
|
|
||||||
return
|
|
||||||
response = json.loads(r.text)
|
|
||||||
yield from self._response_to_items(response)
|
|
||||||
if response['no-more'] or not response['data']:
|
|
||||||
# Last page
|
|
||||||
return
|
|
||||||
if self._mode == 'posts':
|
|
||||||
before = response['data'][-1]['published_at']
|
|
||||||
elif self._mode in ('comments', 'media'):
|
|
||||||
before += 30
|
|
||||||
time.sleep(1) # Gab's API is pretty quick but doesn't like being hammered...
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def setup_parser(cls, subparser):
|
|
||||||
subparser.add_argument('username', help = 'A Gab username')
|
|
||||||
|
|
||||||
|
|
||||||
class GabUserPostsScraper(GabUserCommonScraper):
|
|
||||||
name = 'gab-user'
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_args(cls, args):
|
|
||||||
return cls('posts', args.username, retries = args.retries)
|
|
||||||
|
|
||||||
|
|
||||||
class GabUserCommentsScraper(GabUserCommonScraper):
|
|
||||||
name = 'gab-user-comments'
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_args(cls, args):
|
|
||||||
return cls('comments', args.username, retries = args.retries)
|
|
||||||
|
|
||||||
|
|
||||||
class GabUserMediaScraper(GabUserCommonScraper):
|
|
||||||
name = 'gab-user-media'
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_args(cls, args):
|
|
||||||
return cls('media', args.username, retries = args.retries)
|
|
||||||
@@ -1,102 +0,0 @@
|
|||||||
import datetime
|
|
||||||
import itertools
|
|
||||||
import json
|
|
||||||
import logging
|
|
||||||
import re
|
|
||||||
import snscrape.base
|
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
class GooglePlusUserScraper(snscrape.base.Scraper):
|
|
||||||
name = 'googleplus-user'
|
|
||||||
|
|
||||||
def __init__(self, user, **kwargs):
|
|
||||||
super().__init__(**kwargs)
|
|
||||||
self._user = user
|
|
||||||
|
|
||||||
def get_items(self):
|
|
||||||
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
|
|
||||||
|
|
||||||
logger.info('Retrieving initial data')
|
|
||||||
r = self._get(f'https://plus.google.com/{self._user}', headers = headers)
|
|
||||||
if r.status_code == 404:
|
|
||||||
logger.warning('User does not exist')
|
|
||||||
return
|
|
||||||
elif r.status_code != 200:
|
|
||||||
logger.error(f'Got status code {r.status_code}')
|
|
||||||
return
|
|
||||||
|
|
||||||
# Global data; only needed for the session ID
|
|
||||||
#TODO: Make this more robust somehow
|
|
||||||
match = re.search(r'''(['"])FdrFJe\1\s*:\s*(['"])(?P<sid>.*?)\2''', r.text)
|
|
||||||
if not match:
|
|
||||||
logger.error('Unable to find session ID')
|
|
||||||
return
|
|
||||||
sid = match.group('sid')
|
|
||||||
|
|
||||||
# Page data
|
|
||||||
# As of 2018-05-18, the much simpler regex r'''<script[^>]*>AF_initDataCallback\(\{key: 'ds:6',.*?return (.*?)\}\}\);</script>''' would work also, but this is more generic and less likely to break:
|
|
||||||
match = re.search(r'''<script[^>]*>\s*(?:.*?)\s*\(\s*\{(?:|.*?,)\s*key\s*:\s*(['"])ds:6\1\s*,.*?,\s*data\s*:\s*function\s*\(\s*\)\s*\{\s*return\s*(?P<data>.*?)\}\s*\}\s*\)\s*;\s*</script>''', r.text, re.DOTALL)
|
|
||||||
if not match:
|
|
||||||
logger.error('Unable to extract data')
|
|
||||||
return
|
|
||||||
jsonData = match.group('data')
|
|
||||||
response = json.loads(jsonData)
|
|
||||||
if response[0][7] is None:
|
|
||||||
logger.info('User has no posts')
|
|
||||||
return
|
|
||||||
for postObj in response[0][7]:
|
|
||||||
yield snscrape.base.URLItem(f'https://plus.google.com/{postObj[6]["33558957"][21]}')
|
|
||||||
cursor = response[0][1] # 'ADSJ_x'
|
|
||||||
if cursor is None:
|
|
||||||
# No further pages
|
|
||||||
return
|
|
||||||
baseDate = datetime.datetime.utcnow()
|
|
||||||
baseSeconds = baseDate.hour * 3600 + baseDate.minute * 60 + baseDate.second
|
|
||||||
userid = response[1] # Alternatively and more ugly: response[0][7][0][6]['33558957'][16]
|
|
||||||
|
|
||||||
for counter in itertools.count(start = 2):
|
|
||||||
logger.info('Retrieving next page')
|
|
||||||
reqid = 1 + baseSeconds + int(1e5) * counter
|
|
||||||
r = self._post(
|
|
||||||
f'https://plus.google.com/_/PlusAppUi/data?ds.extension=74333095&f.sid={sid}&hl=en-US&soc-app=199&soc-platform=1&soc-device=1&_reqid={reqid}&rt=c',
|
|
||||||
data = [('f.req', '[[[74333095,[{"74333095":["' + cursor + '","' + userid + '"]}],null,null,0]]]'), ('', '')],
|
|
||||||
headers = headers
|
|
||||||
)
|
|
||||||
if r.status_code != 200:
|
|
||||||
logger.error(f'Got status code {r.status_code}')
|
|
||||||
return
|
|
||||||
|
|
||||||
# As if everything up to here wasn't terrible already, this is where it gets *really* bad.
|
|
||||||
# The API contains a few junk characters at the beginning, apparently as an anti-CSRF measure.
|
|
||||||
# The remainder is effectively a self-made chunked transfer encoding but with decimal digits and including everything except the digits themselves in the chunk size.
|
|
||||||
# It sucks.
|
|
||||||
# Each chunk is actually one JSON object; you'd think that we can just read the first one and parse that, but there are some quirks that make this difficult.
|
|
||||||
# I was unable to figure out what the "chunk size" actually covers exactly; the response is UTF-8 encoded, but the chunk size matches neither the binary nor the decoded length.
|
|
||||||
# Enter the awful workaround: strip away the initial chunk size, then parse the beginning of the remaining data using a parser that doesn't care if there's junk after the JSON.
|
|
||||||
|
|
||||||
garbage = r.text
|
|
||||||
assert garbage[:6] == ")]}'\n\n" # anti-CSRF and two newlines
|
|
||||||
data = []
|
|
||||||
pos = 6
|
|
||||||
while garbage[pos].isdigit() or garbage[pos].isspace(): # Also strip leading whitespace
|
|
||||||
pos += 1
|
|
||||||
response = json.JSONDecoder().raw_decode(''.join(garbage[pos:]))[0] # Parses only the first structure in the data stream without throwing an error about the extra data at the end
|
|
||||||
|
|
||||||
for postObj in response[0][2]['74333095'][0][7]:
|
|
||||||
yield snscrape.base.URLItem(f'https://plus.google.com/{postObj[6]["33558957"][21]}')
|
|
||||||
|
|
||||||
cursor = response[0][2]['74333095'][0][1]
|
|
||||||
|
|
||||||
if cursor is None:
|
|
||||||
break
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def setup_parser(cls, subparser):
|
|
||||||
subparser.add_argument('user', help = 'A Google Plus username (with leading "+") or numeric ID')
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_args(cls, args):
|
|
||||||
return cls(args.user, retries = args.retries)
|
|
||||||
@@ -109,8 +109,9 @@ class InstagramCommonScraper(snscrape.base.Scraper):
|
|||||||
logger.warning(f'{self._mode} does not exist')
|
logger.warning(f'{self._mode} does not exist')
|
||||||
return
|
return
|
||||||
elif r.status_code != 200:
|
elif r.status_code != 200:
|
||||||
logger.error(f'Got status code {r.status_code}')
|
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
||||||
return
|
elif r.url.startswith('https://www.instagram.com/accounts/login/'):
|
||||||
|
raise snscrape.base.ScraperException('Redirected to login page')
|
||||||
response = r._snscrape_json_obj
|
response = r._snscrape_json_obj
|
||||||
rhxGis = response['rhx_gis'] if 'rhx_gis' in response else ''
|
rhxGis = response['rhx_gis'] if 'rhx_gis' in response else ''
|
||||||
if response['entry_data'][self._pageName][0]['graphql'][self._responseContainer][self._edgeXToMedia]['count'] == 0:
|
if response['entry_data'][self._pageName][0]['graphql'][self._responseContainer][self._edgeXToMedia]['count'] == 0:
|
||||||
@@ -133,8 +134,7 @@ class InstagramCommonScraper(snscrape.base.Scraper):
|
|||||||
r = self._get(f'https://www.instagram.com/graphql/query/?query_hash={self._queryHash}&variables={variables}', headers = headers, responseOkCallback = self._check_json_callback)
|
r = self._get(f'https://www.instagram.com/graphql/query/?query_hash={self._queryHash}&variables={variables}', headers = headers, responseOkCallback = self._check_json_callback)
|
||||||
|
|
||||||
if r.status_code != 200:
|
if r.status_code != 200:
|
||||||
logger.error(f'Got status code {r.status_code}')
|
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
||||||
return
|
|
||||||
|
|
||||||
response = r._snscrape_json_obj
|
response = r._snscrape_json_obj
|
||||||
if not response['data'][self._responseContainer][self._edgeXToMedia]['edges']:
|
if not response['data'][self._responseContainer][self._edgeXToMedia]['edges']:
|
||||||
|
|||||||
66
snscrape/modules/telegram.py
Normal file
66
snscrape/modules/telegram.py
Normal file
@@ -0,0 +1,66 @@
|
|||||||
|
import bs4
|
||||||
|
import datetime
|
||||||
|
import logging
|
||||||
|
import snscrape.base
|
||||||
|
import typing
|
||||||
|
import urllib.parse
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class TelegramPost(typing.NamedTuple, snscrape.base.Item):
    """A single post scraped from a Telegram channel page.

    ``str(post)`` returns the post's URL.
    """

    # URL of the post.
    url: str
    # Timestamp of the post.
    date: datetime.datetime
    # Text of the message; None when the post has no text.
    content: typing.Optional[str]
    # Links collected from the post.
    outlinks: typing.List[str]
    # The entries of ``outlinks`` joined into a single space-separated string.
    outlinksss: str

    def __str__(self):
        return self.url
|
||||||
|
|
||||||
|
|
||||||
|
class TelegramChannelScraper(snscrape.base.Scraper):
    """Scraper for the public web preview (``https://t.me/s/<channel>``) of a Telegram channel."""

    name = 'telegram-channel'

    def __init__(self, name, **kwargs):
        """Store the channel name; all other keyword arguments go to the base scraper."""
        super().__init__(**kwargs)
        self._name = name

    def _soup_to_items(self, soup, pageUrl):
        """Yield a TelegramPost for every message widget found in ``soup``.

        ``pageUrl`` is the URL the soup was fetched from; it is used as the base
        for resolving link targets. Posts are yielded in the reverse of their
        order on the page.
        """
        posts = soup.find_all('div', attrs = {'class': 'tgme_widget_message', 'data-post': True})
        for post in reversed(posts):
            timeTag = post.find('div', class_ = 'tgme_widget_message_footer').find('a', class_ = 'tgme_widget_message_date').find('time', datetime = True)
            # Drop the two dashes of the date and every colon (including the one in the
            # UTC offset) so that the value matches the compact format string below.
            date = datetime.datetime.strptime(timeTag['datetime'].replace('-', '', 2).replace(':', ''), '%Y%m%dT%H%M%S%z')
            message = post.find('div', class_ = 'tgme_widget_message_text')
            if message:
                content = message.text
                # Only consider anchors that actually carry an href; indexing an
                # anchor without one would raise KeyError and abort the whole scrape.
                outlinks = [
                    urllib.parse.urljoin(pageUrl, link['href'])
                    for link in post.find_all('a', href = True)
                    if not link.text.startswith('@') and link['href'].startswith('https://t.me/')
                ]
                outlinksss = ' '.join(outlinks)
            else:
                # Post without a text element: no content and no links are reported.
                content = None
                outlinks = []
                outlinksss = ''
            yield TelegramPost(url = f'https://t.me/s/{post["data-post"]}', date = date, content = content, outlinks = outlinks, outlinksss = outlinksss)

    def get_items(self):
        """Yield the channel's posts page by page.

        Follows the "more messages" link of each page until none is present.

        Raises:
            snscrape.base.ScraperException: if a page request does not return status 200.
        """
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}

        nextPageUrl = f'https://t.me/s/{self._name}'
        while True:
            r = self._get(nextPageUrl, headers = headers)
            if r.status_code != 200:
                raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
            soup = bs4.BeautifulSoup(r.text, 'lxml')
            yield from self._soup_to_items(soup, nextPageUrl)
            pageLink = soup.find('a', attrs = {'class': 'tme_messages_more', 'data-before': True})
            if not pageLink:
                # No pagination link on this page: nothing further to fetch.
                break
            nextPageUrl = urllib.parse.urljoin(nextPageUrl, pageLink['href'])

    @classmethod
    def setup_parser(cls, subparser):
        """Register the positional ``channel`` argument on the given subparser."""
        subparser.add_argument('channel', help = 'A channel name')

    @classmethod
    def from_args(cls, args):
        """Build a scraper from parsed arguments (``args.channel`` and ``args.retries``)."""
        return cls(args.channel, retries = args.retries)
|
||||||
@@ -5,6 +5,7 @@ import random
|
|||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
import snscrape.base
|
import snscrape.base
|
||||||
|
import time
|
||||||
import typing
|
import typing
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
|
|
||||||
@@ -98,10 +99,14 @@ class TwitterSearchScraper(TwitterCommonScraper):
|
|||||||
def _get_guest_token(self):
|
def _get_guest_token(self):
|
||||||
logger.info(f'Retrieving guest token from search page')
|
logger.info(f'Retrieving guest token from search page')
|
||||||
r = self._get(self._baseUrl, headers = {'User-Agent': self._userAgent})
|
r = self._get(self._baseUrl, headers = {'User-Agent': self._userAgent})
|
||||||
match = re.search(r'document\.cookie = decodeURIComponent\("gt=(\d+);', r.text)
|
match = re.search(r'document\.cookie = decodeURIComponent\("gt=(\d+); Max-Age=10800; Domain=\.twitter\.com; Path=/; Secure"\);', r.text)
|
||||||
if not match:
|
if match:
|
||||||
raise RuntimeError('Unable to find guest token')
|
logger.debug('Found guest token in HTML')
|
||||||
return match.group(1)
|
return match.group(1)
|
||||||
|
if 'gt' in r.cookies:
|
||||||
|
logger.debug('Found guest token in cookies')
|
||||||
|
return r.cookies['gt']
|
||||||
|
raise snscrape.base.ScraperException('Unable to find guest token')
|
||||||
|
|
||||||
def _check_scroll_response(self, r):
|
def _check_scroll_response(self, r):
|
||||||
if r.status_code == 429:
|
if r.status_code == 429:
|
||||||
@@ -124,6 +129,7 @@ class TwitterSearchScraper(TwitterCommonScraper):
|
|||||||
while True:
|
while True:
|
||||||
if not guestToken:
|
if not guestToken:
|
||||||
guestToken = self._get_guest_token()
|
guestToken = self._get_guest_token()
|
||||||
|
self._session.cookies.set('gt', guestToken, domain = '.twitter.com', path = '/', secure = True, expires = time.time() + 10800)
|
||||||
headers['x-guest-token'] = guestToken
|
headers['x-guest-token'] = guestToken
|
||||||
|
|
||||||
logger.info(f'Retrieving scroll page {cursor}')
|
logger.info(f'Retrieving scroll page {cursor}')
|
||||||
@@ -162,12 +168,13 @@ class TwitterSearchScraper(TwitterCommonScraper):
|
|||||||
r = self._get('https://api.twitter.com/2/search/adaptive.json', params = params, headers = headers, responseOkCallback = self._check_scroll_response)
|
r = self._get('https://api.twitter.com/2/search/adaptive.json', params = params, headers = headers, responseOkCallback = self._check_scroll_response)
|
||||||
if r.status_code == 429:
|
if r.status_code == 429:
|
||||||
guestToken = None
|
guestToken = None
|
||||||
|
del self._session.cookies['gt']
|
||||||
|
del headers['x-guest-token']
|
||||||
continue
|
continue
|
||||||
try:
|
try:
|
||||||
obj = r.json()
|
obj = r.json()
|
||||||
except json.JSONDecodeError as e:
|
except json.JSONDecodeError as e:
|
||||||
logger.error(f'Received invalid JSON from Twitter: {e!s}')
|
raise snscrape.base.ScraperException('Received invalid JSON from Twitter') from e
|
||||||
raise RuntimeError('Received invalid JSON from Twitter') from e
|
|
||||||
|
|
||||||
# No data format test, just a hard and loud crash if anything's wrong :-)
|
# No data format test, just a hard and loud crash if anything's wrong :-)
|
||||||
newCursor = None
|
newCursor = None
|
||||||
@@ -180,7 +187,12 @@ class TwitterSearchScraper(TwitterCommonScraper):
|
|||||||
continue
|
continue
|
||||||
for entry in entries:
|
for entry in entries:
|
||||||
if entry['entryId'].startswith('sq-I-t-'):
|
if entry['entryId'].startswith('sq-I-t-'):
|
||||||
tweet = obj['globalObjects']['tweets'][entry['content']['item']['content']['tweet']['id']]
|
if 'tweet' in entry['content']['item']['content']:
|
||||||
|
tweet = obj['globalObjects']['tweets'][entry['content']['item']['content']['tweet']['id']]
|
||||||
|
elif 'tombstone' in entry['content']['item']['content'] and 'tweet' in entry['content']['item']['content']['tombstone']:
|
||||||
|
tweet = obj['globalObjects']['tweets'][entry['content']['item']['content']['tombstone']['tweet']['id']]
|
||||||
|
else:
|
||||||
|
raise snscrape.base.ScraperException(f'Unable to handle entry {entry["entryId"]!r}')
|
||||||
tweetID = tweet['id']
|
tweetID = tweet['id']
|
||||||
content = tweet['full_text']
|
content = tweet['full_text']
|
||||||
username = obj['globalObjects']['users'][tweet['user_id_str']]['screen_name']
|
username = obj['globalObjects']['users'][tweet['user_id_str']]['screen_name']
|
||||||
@@ -335,7 +347,7 @@ class TwitterListMembersScraper(TwitterCommonScraper):
|
|||||||
soup = bs4.BeautifulSoup(r.text, 'lxml')
|
soup = bs4.BeautifulSoup(r.text, 'lxml')
|
||||||
container = soup.find('div', 'stream-container')
|
container = soup.find('div', 'stream-container')
|
||||||
if not container:
|
if not container:
|
||||||
raise RuntimeError('Unable to find container')
|
raise snscrape.base.ScraperException('Unable to find container')
|
||||||
items = container.find_all('li', 'js-stream-item')
|
items = container.find_all('li', 'js-stream-item')
|
||||||
if not items:
|
if not items:
|
||||||
logger.warning('Empty list')
|
logger.warning('Empty list')
|
||||||
|
|||||||
@@ -43,23 +43,22 @@ class VKontakteUserScraper(snscrape.base.Scraper):
|
|||||||
logger.info('Retrieving initial data')
|
logger.info('Retrieving initial data')
|
||||||
r = self._get(baseUrl, headers = headers)
|
r = self._get(baseUrl, headers = headers)
|
||||||
if r.status_code == 404:
|
if r.status_code == 404:
|
||||||
logger.error('Wall does not exist')
|
logger.warning('Wall does not exist')
|
||||||
return
|
return
|
||||||
elif r.status_code != 200:
|
elif r.status_code != 200:
|
||||||
logger.error(f'Got status code {r.status_code}')
|
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
||||||
return
|
|
||||||
|
|
||||||
# VK sends windows-1251-encoded data, but Requests's decoding doesn't seem to work correctly and causes lxml to choke, so we need to pass the binary content and the encoding explicitly.
|
# VK sends windows-1251-encoded data, but Requests's decoding doesn't seem to work correctly and causes lxml to choke, so we need to pass the binary content and the encoding explicitly.
|
||||||
soup = bs4.BeautifulSoup(r.content, 'lxml', from_encoding = r.encoding)
|
soup = bs4.BeautifulSoup(r.content, 'lxml', from_encoding = r.encoding)
|
||||||
|
|
||||||
if soup.find('div', class_ = 'profile_closed_wall_dummy'):
|
if soup.find('div', class_ = 'profile_closed_wall_dummy'):
|
||||||
logger.error('Private profile')
|
logger.warning('Private profile')
|
||||||
return
|
return
|
||||||
|
|
||||||
profileDeleted = soup.find('h5', class_ = 'profile_deleted_text')
|
profileDeleted = soup.find('h5', class_ = 'profile_deleted_text')
|
||||||
if profileDeleted:
|
if profileDeleted:
|
||||||
# Unclear what this state represents, so just log website text.
|
# Unclear what this state represents, so just log website text.
|
||||||
logger.error(profileDeleted.text)
|
logger.warning(profileDeleted.text)
|
||||||
return
|
return
|
||||||
|
|
||||||
newestPost = soup.find('div', class_ = 'post')
|
newestPost = soup.find('div', class_ = 'post')
|
||||||
@@ -84,16 +83,14 @@ class VKontakteUserScraper(snscrape.base.Scraper):
|
|||||||
headers = headers
|
headers = headers
|
||||||
)
|
)
|
||||||
if r.status_code != 200:
|
if r.status_code != 200:
|
||||||
logger.error(f'Got status code {r.status_code}')
|
raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
|
||||||
return
|
|
||||||
# Convert to JSON and read the HTML payload. Note that this implicitly converts the data to a Python string (i.e., Unicode), away from a windows-1251-encoded bytes.
|
# Convert to JSON and read the HTML payload. Note that this implicitly converts the data to a Python string (i.e., Unicode), away from a windows-1251-encoded bytes.
|
||||||
posts = r.json()['payload'][1][0]
|
posts = r.json()['payload'][1][0]
|
||||||
if posts.startswith('<div class="page_block no_posts">'):
|
if posts.startswith('<div class="page_block no_posts">'):
|
||||||
# Reached the end
|
# Reached the end
|
||||||
break
|
break
|
||||||
if not posts.startswith('<div id="post'):
|
if not posts.startswith('<div id="post'):
|
||||||
logger.error(f'Got an unknown response: {posts[:200]!r}...')
|
raise snscrape.base.ScraperException(f'Got an unknown response: {posts[:200]!r}...')
|
||||||
break
|
|
||||||
soup = bs4.BeautifulSoup(posts, 'lxml')
|
soup = bs4.BeautifulSoup(posts, 'lxml')
|
||||||
yield from self._soup_to_items(soup, baseUrl)
|
yield from self._soup_to_items(soup, baseUrl)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user