From 7001983556f3bade9b0f842d3fa094829182ea21 Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Thu, 16 May 2019 23:17:46 +0000 Subject: [PATCH] Skip timeline entries that don't have a link (fixes #36) --- snscrape/modules/facebook.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/snscrape/modules/facebook.py b/snscrape/modules/facebook.py index 13339ba..edc86f0 100644 --- a/snscrape/modules/facebook.py +++ b/snscrape/modules/facebook.py @@ -57,6 +57,13 @@ class FacebookUserScraper(snscrape.base.Scraper): def _soup_to_items(self, soup, baseUrl): for entry in soup.find_all('div', class_ = '_5pcr'): # also class 'fbUserContent' in 2017 and 'userContentWrapper' in 2019 entryA = entry.find('a', class_ = '_5pcq') # There can be more than one, e.g. when a post is shared by another user, but the first one is always the one of this entry. + if not entryA: + mediaSetA = entry.find('a', class_ = '_17z-') + if mediaSetA and mediaSetA.has_attr('href'): + logger.warning(f'Ignoring link-less media set: {mediaSetA["href"]}') + else: + logger.warning(f'Ignoring entry without a link after {cleanUrl}') + continue href = entryA.get('href') if not any(x in href for x in ('/posts/', '/photos/', '/videos/', '/permalink.php?', '/events/', '/notes/')): if href != '#' or 'new photo' not in entry.text or 'to the album' not in entry.text: