Changed URL parsing to use urllib.parse.urlparse

2026-06-08 03:18:34 +03:00 · 2022-03-01 14:13:04 -06:00
parent ee4d64750b
commit f3d9dc91c6
4 changed files with 10 additions and 8 deletions
--- a/cisticola/scraper/base.py
+++ b/cisticola/scraper/base.py
@@ -4,6 +4,7 @@ import requests
 import os
 import boto3
 from io import BytesIO
+from urllib.parse import urlparse
 from loguru import logger

 class Scraper:
@@ -28,8 +29,7 @@ class Scraper:
        return self.__version__

    def url_to_key(self, url: str, content_type: str) -> str:
-        key = url.split('/')[-1]
-        key = key.split('?')[0]
+        key = urlparse(url).path.split('/')[-1]
        return key 

    def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
--- a/cisticola/scraper/gettr.py
+++ b/cisticola/scraper/gettr.py
@@ -6,6 +6,8 @@ from typing import Generator, Tuple
 from gogettr import PublicClient
 import ffmpeg
 import tempfile
+from urllib.parse import urlparse
+
 class GettrScraper(cisticola.scraper.base.Scraper):
    """An implementation of a Scraper for Gettr, using gogettr library"""
    __version__ = "GettrScraper 0.0.1"
@@ -68,8 +70,6 @@ class GettrScraper(cisticola.scraper.base.Scraper):

        with tempfile.NamedTemporaryFile(suffix = ext) as temp_file:
            
-            ydl_opts = {}
-
            (
                ffmpeg
                .input(url)
@@ -81,6 +81,6 @@ class GettrScraper(cisticola.scraper.base.Scraper):
            blob = temp_file.read()

        if key is None:
-            key = url.split('/')[-2] + ext
+            key = urlparse(url).path.split('/')[-2] + ext

        return blob, content_type, key
--- a/cisticola/scraper/odysee.py
+++ b/cisticola/scraper/odysee.py
@@ -4,6 +4,7 @@ from datetime import datetime
 import json
 from typing import Generator
 from polyphemus.base import OdyseeChannel
+from urllib.parse import urlparse

 class OdyseeScraper(cisticola.scraper.base.Scraper):
    """An implementation of a Scraper for Odysee, using polyphemus library"""
@@ -61,7 +62,7 @@ class OdyseeScraper(cisticola.scraper.base.Scraper):
            return True

    def url_to_key(self, url: str, content_type: str) -> str:
-        key = url.split('/')[-2]
+        key = urlparse(url).path.split('/')[-2]
        ext = content_type.split('/')[-1]

        return f'{key}.{ext}'
--- a/cisticola/scraper/rumble.py
+++ b/cisticola/scraper/rumble.py
@@ -9,7 +9,8 @@ import tempfile
 import requests
 from bs4 import BeautifulSoup
 import youtube_dl
-import json 
+import json
+from urllib.parse import urlparse

 BASE_URL = 'https://rumble.com'

@@ -82,7 +83,7 @@ class RumbleScraper(cisticola.scraper.base.Scraper):
                    blob = f.read()

        if key is None:
-            key = url.split('/')[-2] + ext
+            key = urlparse(url).path.split('/')[-2] + ext

        return blob, content_type, key