Fix Instagram

Instagram dropped the max_id parameter, so it is no longer possible to iterate over the posts so easily. Switch to GraphQL instead, which is what's used in the browser as well.
This commit is contained in:
JustAnotherArchivist
2018-04-25 22:11:10 +02:00
parent 14831d4137
commit 5a084af85c

View File

@@ -13,33 +13,52 @@ class InstagramUserScraper(socialmediascraper.base.Scraper):
super().__init__(**kwargs)
self._username = username
def _response_to_items(self, response):
username = response['user']['username'] # Might have different capitalisation than self._username
for node in response['user']['media']['nodes']:
code = node['code']
def _response_to_items(self, response, username):
for node in response['user']['edge_owner_to_timeline_media']['edges']:
code = node['node']['shortcode']
yield socialmediascraper.base.URLItem(f'https://www.instagram.com/p/{code}/?taken-by={username}') #TODO: Do we want the taken-by parameter in here?
def get_items(self):
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
maxID = None
logger.info('Retrieving initial data')
r = self._get(f'https://www.instagram.com/{self._username}/?__a=1', headers = headers)
if r.status_code == 404:
logger.warning('User does not exist')
return
elif r.status_code != 200:
logger.error(f'Got status code {r.status_code}')
return
response = json.loads(r.text)
if response['graphql']['user']['edge_owner_to_timeline_media']['count'] == 0:
logger.info('User has no posts')
return
if not response['graphql']['user']['edge_owner_to_timeline_media']['edges']:
logger.warning('Private account')
return
userID = response['graphql']['user']['id']
username = response['graphql']['user']['username'] # Might have different capitalisation than self._username
yield from self._response_to_items(response['graphql'], username)
if not response['graphql']['user']['edge_owner_to_timeline_media']['page_info']['has_next_page']:
return
endCursor = response['graphql']['user']['edge_owner_to_timeline_media']['page_info']['end_cursor']
# Cf. https://stackoverflow.com/questions/49265339/instagram-a-1-url-doesnt-allow-max-id and https://github.com/rarcega/instagram-scraper
while True:
logger.info(f'Retrieving max_id = {maxID!r}')
if maxID is None:
url = f'https://www.instagram.com/{self._username}/?__a=1'
else:
url = f'https://www.instagram.com/{self._username}/?__a=1&max_id={maxID}'
r = self._get(url, headers = headers)
logger.info(f'Retrieving endCursor = {endCursor!r}')
r = self._get(f'https://www.instagram.com/graphql/query/?query_hash=42323d64886122307be10013ad2dcc44&variables={{"id":"{userID}","first":12,"after":"{endCursor}"}}', headers = headers)
#TODO: Handle 404 (HTML)
if r.status_code != 200:
logger.error(f'Got status code {r.status_code}')
return
response = json.loads(r.text)
if not response['user']['media']['nodes']:
if not response['data']['user']['edge_owner_to_timeline_media']['edges']:
return
yield from self._response_to_items(response)
maxID = response['user']['media']['nodes'][-1]['id']
yield from self._response_to_items(response['data'], username)
if not response['data']['user']['edge_owner_to_timeline_media']['page_info']['has_next_page']:
return
endCursor = response['data']['user']['edge_owner_to_timeline_media']['page_info']['end_cursor']
@classmethod
def setup_parser(cls, subparser):