From 5a084af85c5bb920521f8b7ac6ae5d58171a1903 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Wed, 25 Apr 2018 22:11:10 +0200 Subject: [PATCH] Fix Instagram Instagram dropped the max_id parameter, so it is no longer possible to iterate over the posts so easily. Switch to GraphQL instead, which is what's used in the browser as well. --- socialmediascraper/modules/instagram.py | 51 +++++++++++++++++-------- 1 file changed, 35 insertions(+), 16 deletions(-) diff --git a/socialmediascraper/modules/instagram.py b/socialmediascraper/modules/instagram.py index 7f97174..6d8186a 100644 --- a/socialmediascraper/modules/instagram.py +++ b/socialmediascraper/modules/instagram.py @@ -13,33 +13,52 @@ class InstagramUserScraper(socialmediascraper.base.Scraper): super().__init__(**kwargs) self._username = username - def _response_to_items(self, response): - username = response['user']['username'] # Might have different capitalisation than self._username - - for node in response['user']['media']['nodes']: - code = node['code'] + def _response_to_items(self, response, username): + for node in response['user']['edge_owner_to_timeline_media']['edges']: + code = node['node']['shortcode'] yield socialmediascraper.base.URLItem(f'https://www.instagram.com/p/{code}/?taken-by={username}') #TODO: Do we want the taken-by parameter in here? def get_items(self): headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'} - maxID = None + logger.info('Retrieving initial data') + r = self._get(f'https://www.instagram.com/{self._username}/?__a=1', headers = headers) + if r.status_code == 404: + logger.warning('User does not exist') + return + elif r.status_code != 200: + logger.error(f'Got status code {r.status_code}') + return + response = json.loads(r.text) + if response['graphql']['user']['edge_owner_to_timeline_media']['count'] == 0: + logger.info('User has no posts') + return + if not response['graphql']['user']['edge_owner_to_timeline_media']['edges']: + logger.warning('Private account') + return + userID = response['graphql']['user']['id'] + username = response['graphql']['user']['username'] # Might have different capitalisation than self._username + yield from self._response_to_items(response['graphql'], username) + if not response['graphql']['user']['edge_owner_to_timeline_media']['page_info']['has_next_page']: + return + endCursor = response['graphql']['user']['edge_owner_to_timeline_media']['page_info']['end_cursor'] + # Cf. https://stackoverflow.com/questions/49265339/instagram-a-1-url-doesnt-allow-max-id and https://github.com/rarcega/instagram-scraper while True: - logger.info(f'Retrieving max_id = {maxID!r}') - if maxID is None: - url = f'https://www.instagram.com/{self._username}/?__a=1' - else: - url = f'https://www.instagram.com/{self._username}/?__a=1&max_id={maxID}' - r = self._get(url, headers = headers) + logger.info(f'Retrieving endCursor = {endCursor!r}') + r = self._get(f'https://www.instagram.com/graphql/query/?query_hash=42323d64886122307be10013ad2dcc44&variables={{"id":"{userID}","first":12,"after":"{endCursor}"}}', headers = headers) - #TODO: Handle 404 (HTML) + if r.status_code != 200: + logger.error(f'Got status code {r.status_code}') + return response = json.loads(r.text) - if not response['user']['media']['nodes']: + if not response['data']['user']['edge_owner_to_timeline_media']['edges']: return - yield from self._response_to_items(response) - maxID = response['user']['media']['nodes'][-1]['id'] + yield from self._response_to_items(response['data'], username) + if not response['data']['user']['edge_owner_to_timeline_media']['page_info']['has_next_page']: + return + endCursor = response['data']['user']['edge_owner_to_timeline_media']['page_info']['end_cursor'] @classmethod def setup_parser(cls, subparser):