From f3d9dc91c64d31c8b7174311f905fa1e891ddd49 Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Tue, 1 Mar 2022 14:13:04 -0600 Subject: [PATCH] changed URL parsing to use urllib --- cisticola/scraper/base.py | 4 ++-- cisticola/scraper/gettr.py | 6 +++--- cisticola/scraper/odysee.py | 3 ++- cisticola/scraper/rumble.py | 5 +++-- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index 8b0bb90..c9e3fb7 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -4,6 +4,7 @@ import requests import os import boto3 from io import BytesIO +from urllib.parse import urlparse from loguru import logger class Scraper: @@ -28,8 +29,7 @@ class Scraper: return self.__version__ def url_to_key(self, url: str, content_type: str) -> str: - key = url.split('/')[-1] - key = key.split('?')[0] + key = urlparse(url).path.split('/')[-1] return key def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]: diff --git a/cisticola/scraper/gettr.py b/cisticola/scraper/gettr.py index 3471f25..66ec977 100644 --- a/cisticola/scraper/gettr.py +++ b/cisticola/scraper/gettr.py @@ -6,6 +6,8 @@ from typing import Generator, Tuple from gogettr import PublicClient import ffmpeg import tempfile +from urllib.parse import urlparse + class GettrScraper(cisticola.scraper.base.Scraper): """An implementation of a Scraper for Gettr, using gogettr library""" __version__ = "GettrScraper 0.0.1" @@ -68,8 +70,6 @@ class GettrScraper(cisticola.scraper.base.Scraper): with tempfile.NamedTemporaryFile(suffix = ext) as temp_file: - ydl_opts = {} - ( ffmpeg .input(url) @@ -81,6 +81,6 @@ class GettrScraper(cisticola.scraper.base.Scraper): blob = temp_file.read() if key is None: - key = url.split('/')[-2] + ext + key = urlparse(url).path.split('/')[-2] + ext return blob, content_type, key \ No newline at end of file diff --git a/cisticola/scraper/odysee.py b/cisticola/scraper/odysee.py index 1d9d4e5..2876a66 100644 --- a/cisticola/scraper/odysee.py +++ b/cisticola/scraper/odysee.py @@ -4,6 +4,7 @@ from datetime import datetime import json from typing import Generator from polyphemus.base import OdyseeChannel +from urllib.parse import urlparse class OdyseeScraper(cisticola.scraper.base.Scraper): """An implementation of a Scraper for Odysee, using polyphemus library""" @@ -61,7 +62,7 @@ class OdyseeScraper(cisticola.scraper.base.Scraper): return True def url_to_key(self, url: str, content_type: str) -> str: - key = url.split('/')[-2] + key = urlparse(url).path.split('/')[-2] ext = content_type.split('/')[-1] return f'{key}.{ext}' \ No newline at end of file diff --git a/cisticola/scraper/rumble.py b/cisticola/scraper/rumble.py index 98e7386..620dcc0 100644 --- a/cisticola/scraper/rumble.py +++ b/cisticola/scraper/rumble.py @@ -9,7 +9,8 @@ import tempfile import requests from bs4 import BeautifulSoup import youtube_dl -import json +import json +from urllib.parse import urlparse BASE_URL = 'https://rumble.com' @@ -82,7 +83,7 @@ class RumbleScraper(cisticola.scraper.base.Scraper): blob = f.read() if key is None: - key = url.split('/')[-2] + ext + key = urlparse(url).path.split('/')[-2] + ext return blob, content_type, key