From f8d812f799a3d2c5e2cf4bb6c892cff5a58d6c9f Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Thu, 18 Apr 2019 04:22:47 +0200 Subject: [PATCH] Include permalink.php, events, and notes (fixes #32) --- snscrape/modules/facebook.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/snscrape/modules/facebook.py b/snscrape/modules/facebook.py index efe7231..e4e28d4 100644 --- a/snscrape/modules/facebook.py +++ b/snscrape/modules/facebook.py @@ -22,8 +22,10 @@ class FacebookUserScraper(snscrape.base.Scraper): for entry in soup.find_all('div', class_ = '_5pcr'): # also class 'fbUserContent' in 2017 and 'userContentWrapper' in 2019 entryA = entry.find('a', class_ = '_5pcq') # There can be more than one, e.g. when a post is shared by another user, but the first one is always the one of this entry. href = entryA.get('href') - if not ('/posts/' in href or '/photos/' in href or '/videos/' in href): - logger.debug(f'Ignoring odd link: {href}') + if not any(x in href for x in ('/posts/', '/photos/', '/videos/', '/permalink.php?', '/events/', '/notes/')): + if href != '#' or 'new photo' not in entry.text or 'to the album' not in entry.text: + # Don't print a warning if it's a "User added 5 new photos to the album"-type entry, which doesn't have a permalink. + logger.warning(f'Ignoring odd link: {href}') continue link = urllib.parse.urljoin(baseUrl, href) if link not in yielded: