From c4a5715e183f9dc088e87918e11bd5ed3b415773 Mon Sep 17 00:00:00 2001
From: JustAnotherArchivist
Date: Fri, 16 Oct 2020 01:20:50 +0000
Subject: [PATCH] Fix Facebook user and community scrapers

Facebook is redirecting the previous user agent to the mobile site; use current Firefox ESR instead.
---
 snscrape/modules/facebook.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/snscrape/modules/facebook.py b/snscrape/modules/facebook.py
index ffb4fc7..f22b29a 100644
--- a/snscrape/modules/facebook.py
+++ b/snscrape/modules/facebook.py
@@ -162,7 +162,7 @@ class FacebookUserAndCommunityScraper(FacebookCommonScraper):
 	def __init__(self, username, **kwargs):
 		super().__init__(**kwargs)
 		self._username = username
-		self._headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36', 'Accept-Language': 'en-US,en;q=0.5'}
+		self._headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:78.0) Gecko/20100101 Firefox/78.0', 'Accept-Language': 'en-US,en;q=0.5'}
 		self._initialPage = None
 		self._initialPageSoup = None
 
@@ -177,8 +177,6 @@ class FacebookUserAndCommunityScraper(FacebookCommonScraper):
 		return self._initialPage, self._initialPageSoup
 
 	def get_items(self):
-		headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36', 'Accept-Language': 'en-US,en;q=0.5'}
-
 		nextPageLinkPattern = re.compile(r'^/pages_reaction_units/more/\?page_id=')
 		spuriousForLoopPattern = re.compile(r'^for \(;;\);')
 
@@ -193,7 +191,7 @@ class FacebookUserAndCommunityScraper(FacebookCommonScraper):
 
 			# The web app sends a bunch of additional parameters. Most of them would be easy to add, but there's also __dyn, which is a compressed list of the "modules" loaded in the browser.
 			# Reproducing that would be difficult to get right, especially as Facebook's codebase evolves, so it's just not sent at all here.
-			r = self._get(urllib.parse.urljoin(self._baseUrl, nextPageLink.get('ajaxify')) + '&__a=1', headers = headers)
+			r = self._get(urllib.parse.urljoin(self._baseUrl, nextPageLink.get('ajaxify')) + '&__a=1', headers = self._headers)
 			if r.status_code != 200:
 				raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
 			response = json.loads(spuriousForLoopPattern.sub('', r.text))