mirror of
https://github.com/bellingcat/snscrape.git
synced 2026-06-11 20:08:29 +03:00
Fix Instagram
Instagram dropped the max_id parameter, so it is no longer possible to page through a user's posts that way. Switch to the GraphQL endpoint instead, which is also what the website uses in the browser.
This commit is contained in:
@@ -13,33 +13,52 @@ class InstagramUserScraper(socialmediascraper.base.Scraper):
|
||||
super().__init__(**kwargs)
|
||||
self._username = username
|
||||
|
||||
def _response_to_items(self, response, username):
    """Yield one URLItem per post contained in a single page of timeline data.

    response: the object that holds the 'user' key. get_items passes
        response['graphql'] for the initial profile request and
        response['data'] for the GraphQL pagination requests.
    username: the account name as reported by Instagram; it is only used
        for the taken-by query parameter of the generated post URLs.
    """
    for edge in response['user']['edge_owner_to_timeline_media']['edges']:
        code = edge['node']['shortcode']
        yield socialmediascraper.base.URLItem(f'https://www.instagram.com/p/{code}/?taken-by={username}') #TODO: Do we want the taken-by parameter in here?
|
||||
|
||||
def get_items(self):
    """Yield a URLItem for every post of the user, newest page first.

    The first page comes from the profile's ?__a=1 JSON endpoint; further
    pages are fetched from the GraphQL query endpoint using the user ID and
    the end_cursor of the previous page. The generator stops silently (after
    logging) when the user does not exist, has no posts, exposes no edges,
    or when a request returns a non-200 status code.
    """
    # Imported locally so this method does not depend on a file-level import.
    import urllib.parse

    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

    logger.info('Retrieving initial data')
    r = self._get(f'https://www.instagram.com/{self._username}/?__a=1', headers = headers)
    if r.status_code == 404:
        logger.warning('User does not exist')
        return
    elif r.status_code != 200:
        logger.error(f'Got status code {r.status_code}')
        return
    response = json.loads(r.text)
    timeline = response['graphql']['user']['edge_owner_to_timeline_media']
    if timeline['count'] == 0:
        logger.info('User has no posts')
        return
    if not timeline['edges']:
        # A non-zero post count without any visible edges is reported as a private account.
        logger.warning('Private account')
        return
    userID = response['graphql']['user']['id']
    username = response['graphql']['user']['username'] # Might have different capitalisation than self._username
    yield from self._response_to_items(response['graphql'], username)
    if not timeline['page_info']['has_next_page']:
        return
    endCursor = timeline['page_info']['end_cursor']

    # Cf. https://stackoverflow.com/questions/49265339/instagram-a-1-url-doesnt-allow-max-id and https://github.com/rarcega/instagram-scraper
    while True:
        logger.info(f'Retrieving endCursor = {endCursor!r}')
        # Serialise the variables as compact JSON and percent-encode them, so that
        # characters such as '+', '=' or '&' in the cursor cannot corrupt the query string.
        variables = json.dumps({'id': str(userID), 'first': 12, 'after': endCursor}, separators = (',', ':'))
        r = self._get(f'https://www.instagram.com/graphql/query/?query_hash=42323d64886122307be10013ad2dcc44&variables={urllib.parse.quote(variables, safe = "")}', headers = headers)

        #TODO: Handle 404 (HTML)
        if r.status_code != 200:
            logger.error(f'Got status code {r.status_code}')
            return

        response = json.loads(r.text)
        timeline = response['data']['user']['edge_owner_to_timeline_media']
        if not timeline['edges']:
            return
        yield from self._response_to_items(response['data'], username)
        if not timeline['page_info']['has_next_page']:
            return
        endCursor = timeline['page_info']['end_cursor']
|
||||
|
||||
@classmethod
|
||||
def setup_parser(cls, subparser):
|
||||
|
||||
Reference in New Issue
Block a user