Changed URL parsing to use urllib.parse.urlparse

This commit is contained in:
Tristan Lee
2022-03-01 14:13:04 -06:00
parent ee4d64750b
commit f3d9dc91c6
4 changed files with 10 additions and 8 deletions

View File

@@ -4,6 +4,7 @@ import requests
import os import os
import boto3 import boto3
from io import BytesIO from io import BytesIO
from urllib.parse import urlparse
from loguru import logger from loguru import logger
class Scraper: class Scraper:
@@ -28,8 +29,7 @@ class Scraper:
return self.__version__ return self.__version__
def url_to_key(self, url: str, content_type: str) -> str: def url_to_key(self, url: str, content_type: str) -> str:
key = url.split('/')[-1] key = urlparse(url).path.split('/')[-1]
key = key.split('?')[0]
return key return key
def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]: def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:

View File

@@ -6,6 +6,8 @@ from typing import Generator, Tuple
from gogettr import PublicClient from gogettr import PublicClient
import ffmpeg import ffmpeg
import tempfile import tempfile
from urllib.parse import urlparse
class GettrScraper(cisticola.scraper.base.Scraper): class GettrScraper(cisticola.scraper.base.Scraper):
"""An implementation of a Scraper for Gettr, using gogettr library""" """An implementation of a Scraper for Gettr, using gogettr library"""
__version__ = "GettrScraper 0.0.1" __version__ = "GettrScraper 0.0.1"
@@ -68,8 +70,6 @@ class GettrScraper(cisticola.scraper.base.Scraper):
with tempfile.NamedTemporaryFile(suffix = ext) as temp_file: with tempfile.NamedTemporaryFile(suffix = ext) as temp_file:
ydl_opts = {}
( (
ffmpeg ffmpeg
.input(url) .input(url)
@@ -81,6 +81,6 @@ class GettrScraper(cisticola.scraper.base.Scraper):
blob = temp_file.read() blob = temp_file.read()
if key is None: if key is None:
key = url.split('/')[-2] + ext key = urlparse(url).path.split('/')[-2] + ext
return blob, content_type, key return blob, content_type, key

View File

@@ -4,6 +4,7 @@ from datetime import datetime
import json import json
from typing import Generator from typing import Generator
from polyphemus.base import OdyseeChannel from polyphemus.base import OdyseeChannel
from urllib.parse import urlparse
class OdyseeScraper(cisticola.scraper.base.Scraper): class OdyseeScraper(cisticola.scraper.base.Scraper):
"""An implementation of a Scraper for Odysee, using polyphemus library""" """An implementation of a Scraper for Odysee, using polyphemus library"""
@@ -61,7 +62,7 @@ class OdyseeScraper(cisticola.scraper.base.Scraper):
return True return True
def url_to_key(self, url: str, content_type: str) -> str: def url_to_key(self, url: str, content_type: str) -> str:
key = url.split('/')[-2] key = urlparse(url).path.split('/')[-2]
ext = content_type.split('/')[-1] ext = content_type.split('/')[-1]
return f'{key}.{ext}' return f'{key}.{ext}'

View File

@@ -9,7 +9,8 @@ import tempfile
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import youtube_dl import youtube_dl
import json import json
from urllib.parse import urlparse
BASE_URL = 'https://rumble.com' BASE_URL = 'https://rumble.com'
@@ -82,7 +83,7 @@ class RumbleScraper(cisticola.scraper.base.Scraper):
blob = f.read() blob = f.read()
if key is None: if key is None:
key = url.split('/')[-2] + ext key = urlparse(url).path.split('/')[-2] + ext
return blob, content_type, key return blob, content_type, key