From 30f945897a99eed5d736bfc7a5aafa30d3f70d45 Mon Sep 17 00:00:00 2001
From: JustAnotherArchivist
Date: Tue, 11 Aug 2020 20:48:14 +0000
Subject: [PATCH] Clean Facebook group post URLs

Most of the time, the URLs are already clean, but occasionally, Facebook includes tracking parameters (__xts__[0], __tn__)...
---
 snscrape/modules/facebook.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/snscrape/modules/facebook.py b/snscrape/modules/facebook.py
index b6a43ae..9af31fa 100644
--- a/snscrape/modules/facebook.py
+++ b/snscrape/modules/facebook.py
@@ -41,7 +41,7 @@ class FacebookCommonScraper(snscrape.base.Scraper):
 			if setVal.rstrip('0123456789').endswith('.a.'):
 				setVal = f'a.{setVal.rsplit(".", 1)[1]}'
 			clean = (u.scheme, u.netloc, u.path, urllib.parse.urlencode((('set', setVal),)), '')
-		elif u.path.split('/')[2] == 'posts' or u.path.startswith('/events/') or u.path.startswith('/notes/'):
+		elif u.path.split('/')[2] == 'posts' or u.path.startswith('/events/') or u.path.startswith('/notes/') or u.path.split('/')[1:4:2] == ['groups', 'permalink']:
 			# No manipulation of the path needed, but strip the query string
 			clean = (u.scheme, u.netloc, u.path, '', '')
 		elif u.path.split('/')[2] in ('photos', 'videos'):