changed URL parsing to use urllib

This commit is contained in:
Tristan Lee
2022-03-01 14:13:04 -06:00
parent ee4d64750b
commit f3d9dc91c6
4 changed files with 10 additions and 8 deletions

View File

@@ -4,6 +4,7 @@ import requests
import os
import boto3
from io import BytesIO
from urllib.parse import urlparse
from loguru import logger
class Scraper:
@@ -28,8 +29,7 @@ class Scraper:
return self.__version__
def url_to_key(self, url: str, content_type: str) -> str:
    """Derive a key from the final path segment of ``url``.

    The URL is parsed with ``urllib.parse.urlparse`` so that the query
    string and fragment are excluded before the path is split. This keeps
    a ``/`` or ``?`` inside the query from leaking into the key.

    Args:
        url: URL of the resource.
        content_type: Not used by this implementation.

    Returns:
        The last ``/``-separated component of the URL path. This is an
        empty string when the path is empty or ends with ``/``.
    """
    return urlparse(url).path.split('/')[-1]
def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:

View File

@@ -6,6 +6,8 @@ from typing import Generator, Tuple
from gogettr import PublicClient
import ffmpeg
import tempfile
from urllib.parse import urlparse
class GettrScraper(cisticola.scraper.base.Scraper):
"""An implementation of a Scraper for Gettr, using gogettr library"""
__version__ = "GettrScraper 0.0.1"
@@ -68,8 +70,6 @@ class GettrScraper(cisticola.scraper.base.Scraper):
with tempfile.NamedTemporaryFile(suffix = ext) as temp_file:
ydl_opts = {}
(
ffmpeg
.input(url)
@@ -81,6 +81,6 @@ class GettrScraper(cisticola.scraper.base.Scraper):
blob = temp_file.read()
if key is None:
key = url.split('/')[-2] + ext
key = urlparse(url).path.split('/')[-2] + ext
return blob, content_type, key

View File

@@ -4,6 +4,7 @@ from datetime import datetime
import json
from typing import Generator
from polyphemus.base import OdyseeChannel
from urllib.parse import urlparse
class OdyseeScraper(cisticola.scraper.base.Scraper):
"""An implementation of a Scraper for Odysee, using polyphemus library"""
@@ -61,7 +62,7 @@ class OdyseeScraper(cisticola.scraper.base.Scraper):
return True
def url_to_key(self, url: str, content_type: str) -> str:
    """Build a key from the URL's second-to-last path segment and the MIME subtype.

    The URL is parsed with ``urllib.parse.urlparse`` so that only the path
    is split. A ``/`` inside the query string therefore cannot shift which
    segment is selected.

    Args:
        url: URL of the resource. Its path must contain at least one ``/``.
        content_type: Content type string. The text after its last ``/``
            is used as the file extension.

    Returns:
        ``"<segment>.<ext>"``, where ``segment`` is the second-to-last
        ``/``-separated component of the URL path.

    Raises:
        IndexError: If the URL path has fewer than two ``/``-separated
            components (for example, an empty path).
    """
    segment = urlparse(url).path.split('/')[-2]
    ext = content_type.split('/')[-1]
    return f'{segment}.{ext}'

View File

@@ -9,7 +9,8 @@ import tempfile
import requests
from bs4 import BeautifulSoup
import youtube_dl
import json
import json
from urllib.parse import urlparse
BASE_URL = 'https://rumble.com'
@@ -82,7 +83,7 @@ class RumbleScraper(cisticola.scraper.base.Scraper):
blob = f.read()
if key is None:
key = url.split('/')[-2] + ext
key = urlparse(url).path.split('/')[-2] + ext
return blob, content_type, key