mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-08 03:18:34 +03:00
changed URL parsing to use urllib
This commit is contained in:
@@ -4,6 +4,7 @@ import requests
|
||||
import os
|
||||
import boto3
|
||||
from io import BytesIO
|
||||
from urllib.parse import urlparse
|
||||
from loguru import logger
|
||||
|
||||
class Scraper:
|
||||
@@ -28,8 +29,7 @@ class Scraper:
|
||||
return self.__version__
|
||||
|
||||
def url_to_key(self, url: str, content_type: str) -> str:
    """Derive a storage key from the last path segment of ``url``.

    The URL is parsed with ``urllib.parse.urlparse`` so that only the path
    component is considered; a query string or fragment (even one that
    itself contains ``/``) never ends up in the key.

    Args:
        url: URL of the media file.
        content_type: Not used by this implementation; kept in the
            signature so overriding implementations can use it.

    Returns:
        The text after the final ``/`` of the URL path. This is an empty
        string when the path is empty or ends with ``/``.
    """
    # Only the parsed path is split; the earlier manual '/' and '?'
    # splitting was fully superseded by this and has been dropped.
    path = urlparse(url).path
    return path.split('/')[-1]
|
||||
|
||||
def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
|
||||
|
||||
@@ -6,6 +6,8 @@ from typing import Generator, Tuple
|
||||
from gogettr import PublicClient
|
||||
import ffmpeg
|
||||
import tempfile
|
||||
from urllib.parse import urlparse
|
||||
|
||||
class GettrScraper(cisticola.scraper.base.Scraper):
|
||||
"""An implementation of a Scraper for Gettr, using gogettr library"""
|
||||
__version__ = "GettrScraper 0.0.1"
|
||||
@@ -68,8 +70,6 @@ class GettrScraper(cisticola.scraper.base.Scraper):
|
||||
|
||||
with tempfile.NamedTemporaryFile(suffix = ext) as temp_file:
|
||||
|
||||
ydl_opts = {}
|
||||
|
||||
(
|
||||
ffmpeg
|
||||
.input(url)
|
||||
@@ -81,6 +81,6 @@ class GettrScraper(cisticola.scraper.base.Scraper):
|
||||
blob = temp_file.read()
|
||||
|
||||
if key is None:
|
||||
key = url.split('/')[-2] + ext
|
||||
key = urlparse(url).path.split('/')[-2] + ext
|
||||
|
||||
return blob, content_type, key
|
||||
@@ -4,6 +4,7 @@ from datetime import datetime
|
||||
import json
|
||||
from typing import Generator
|
||||
from polyphemus.base import OdyseeChannel
|
||||
from urllib.parse import urlparse
|
||||
|
||||
class OdyseeScraper(cisticola.scraper.base.Scraper):
|
||||
"""An implementation of a Scraper for Odysee, using polyphemus library"""
|
||||
@@ -61,7 +62,7 @@ class OdyseeScraper(cisticola.scraper.base.Scraper):
|
||||
return True
|
||||
|
||||
def url_to_key(self, url: str, content_type: str) -> str:
    """Build a storage key from a URL path segment and a MIME subtype.

    The key stem is the second-to-last segment of the URL *path* (query
    string and fragment are ignored because the URL is parsed with
    ``urllib.parse.urlparse``). The extension is whatever follows the last
    ``/`` in ``content_type``.

    Args:
        url: URL of the media file.
        content_type: MIME type string such as ``"video/mp4"``.

    Returns:
        ``"<segment>.<subtype>"``.

    Raises:
        IndexError: If the URL path contains no ``/`` separator, so there
            is no second-to-last segment.
    """
    # Split only the parsed path; the earlier raw ``url.split('/')`` was
    # fully superseded by this and has been dropped.
    stem = urlparse(url).path.split('/')[-2]
    ext = content_type.split('/')[-1]

    return f'{stem}.{ext}'
|
||||
@@ -9,7 +9,8 @@ import tempfile
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import youtube_dl
|
||||
import json
|
||||
import json
|
||||
from urllib.parse import urlparse
|
||||
|
||||
BASE_URL = 'https://rumble.com'
|
||||
|
||||
@@ -82,7 +83,7 @@ class RumbleScraper(cisticola.scraper.base.Scraper):
|
||||
blob = f.read()
|
||||
|
||||
if key is None:
|
||||
key = url.split('/')[-2] + ext
|
||||
key = urlparse(url).path.split('/')[-2] + ext
|
||||
|
||||
return blob, content_type, key
|
||||
|
||||
|
||||
Reference in New Issue
Block a user