Clean Facebook group post URLs

Most of the time, the URLs are already clean, but occasionally, Facebook includes tracking parameters (__xts__[0], __tn__)...
This commit is contained in:
JustAnotherArchivist
2020-08-11 20:48:14 +00:00
parent eee5794ff9
commit 30f945897a

View File

@@ -41,7 +41,7 @@ class FacebookCommonScraper(snscrape.base.Scraper):
if setVal.rstrip('0123456789').endswith('.a.'):
setVal = f'a.{setVal.rsplit(".", 1)[1]}'
clean = (u.scheme, u.netloc, u.path, urllib.parse.urlencode((('set', setVal),)), '')
-elif u.path.split('/')[2] == 'posts' or u.path.startswith('/events/') or u.path.startswith('/notes/'):
+elif u.path.split('/')[2] == 'posts' or u.path.startswith('/events/') or u.path.startswith('/notes/') or u.path.split('/')[1:4:2] == ['groups', 'permalink']:
# No manipulation of the path needed, but strip the query string
clean = (u.scheme, u.netloc, u.path, '', '')
elif u.path.split('/')[2] in ('photos', 'videos'):