From 7b967ff82a4c37f6d846a47e66a82d6c54443936 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Wed, 8 Jul 2020 22:07:18 +0000 Subject: [PATCH] Twitter reverted their guest token change (90f9598e) --- snscrape/modules/twitter.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/snscrape/modules/twitter.py b/snscrape/modules/twitter.py index a00fe10..ae42a48 100644 --- a/snscrape/modules/twitter.py +++ b/snscrape/modules/twitter.py @@ -5,6 +5,7 @@ import random import logging import re import snscrape.base +import time import typing import urllib.parse @@ -98,9 +99,14 @@ class TwitterSearchScraper(TwitterCommonScraper): def _get_guest_token(self): logger.info(f'Retrieving guest token from search page') r = self._get(self._baseUrl, headers = {'User-Agent': self._userAgent}) - if 'gt' not in r.cookies: - raise snscrape.base.ScraperException("Twitter didn't set the cookie") - return r.cookies['gt'] + match = re.search(r'document\.cookie = decodeURIComponent\("gt=(\d+); Max-Age=10800; Domain=\.twitter\.com; Path=/; Secure"\);', r.text) + if match: + logger.debug('Found guest token in HTML') + return match.group(1) + if 'gt' in r.cookies: + logger.debug('Found guest token in cookies') + return r.cookies['gt'] + raise snscrape.base.ScraperException('Unable to find guest token') def _check_scroll_response(self, r): if r.status_code == 429: @@ -123,6 +129,7 @@ class TwitterSearchScraper(TwitterCommonScraper): while True: if not guestToken: guestToken = self._get_guest_token() + self._session.cookies.set('gt', guestToken, domain = '.twitter.com', path = '/', secure = True, expires = time.time() + 10800) headers['x-guest-token'] = guestToken logger.info(f'Retrieving scroll page {cursor}') @@ -162,6 +169,7 @@ class TwitterSearchScraper(TwitterCommonScraper): if r.status_code == 429: guestToken = None del self._session.cookies['gt'] + del headers['x-guest-token'] continue try: obj = r.json()