Mirror of https://github.com/bellingcat/cisticola.git (synced 2026-06-13 05:48:33 +03:00)
changed URL parsing to use urllib
This commit is contained in:
@@ -4,6 +4,7 @@ import requests
|
|||||||
import os
|
import os
|
||||||
import boto3
|
import boto3
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
from urllib.parse import urlparse
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
class Scraper:
|
class Scraper:
|
||||||
@@ -28,8 +29,7 @@ class Scraper:
|
|||||||
return self.__version__
|
return self.__version__
|
||||||
|
|
||||||
def url_to_key(self, url: str, content_type: str) -> str:
|
def url_to_key(self, url: str, content_type: str) -> str:
|
||||||
key = url.split('/')[-1]
|
key = urlparse(url).path.split('/')[-1]
|
||||||
key = key.split('?')[0]
|
|
||||||
return key
|
return key
|
||||||
|
|
||||||
def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
|
def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
|
||||||
|
|||||||
@@ -6,6 +6,8 @@ from typing import Generator, Tuple
|
|||||||
from gogettr import PublicClient
|
from gogettr import PublicClient
|
||||||
import ffmpeg
|
import ffmpeg
|
||||||
import tempfile
|
import tempfile
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
class GettrScraper(cisticola.scraper.base.Scraper):
|
class GettrScraper(cisticola.scraper.base.Scraper):
|
||||||
"""An implementation of a Scraper for Gettr, using gogettr library"""
|
"""An implementation of a Scraper for Gettr, using gogettr library"""
|
||||||
__version__ = "GettrScraper 0.0.1"
|
__version__ = "GettrScraper 0.0.1"
|
||||||
@@ -68,8 +70,6 @@ class GettrScraper(cisticola.scraper.base.Scraper):
|
|||||||
|
|
||||||
with tempfile.NamedTemporaryFile(suffix = ext) as temp_file:
|
with tempfile.NamedTemporaryFile(suffix = ext) as temp_file:
|
||||||
|
|
||||||
ydl_opts = {}
|
|
||||||
|
|
||||||
(
|
(
|
||||||
ffmpeg
|
ffmpeg
|
||||||
.input(url)
|
.input(url)
|
||||||
@@ -81,6 +81,6 @@ class GettrScraper(cisticola.scraper.base.Scraper):
|
|||||||
blob = temp_file.read()
|
blob = temp_file.read()
|
||||||
|
|
||||||
if key is None:
|
if key is None:
|
||||||
key = url.split('/')[-2] + ext
|
key = urlparse(url).path.split('/')[-2] + ext
|
||||||
|
|
||||||
return blob, content_type, key
|
return blob, content_type, key
|
||||||
@@ -4,6 +4,7 @@ from datetime import datetime
|
|||||||
import json
|
import json
|
||||||
from typing import Generator
|
from typing import Generator
|
||||||
from polyphemus.base import OdyseeChannel
|
from polyphemus.base import OdyseeChannel
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
class OdyseeScraper(cisticola.scraper.base.Scraper):
|
class OdyseeScraper(cisticola.scraper.base.Scraper):
|
||||||
"""An implementation of a Scraper for Odysee, using polyphemus library"""
|
"""An implementation of a Scraper for Odysee, using polyphemus library"""
|
||||||
@@ -61,7 +62,7 @@ class OdyseeScraper(cisticola.scraper.base.Scraper):
|
|||||||
return True
|
return True
|
||||||
|
|
||||||
def url_to_key(self, url: str, content_type: str) -> str:
|
def url_to_key(self, url: str, content_type: str) -> str:
|
||||||
key = url.split('/')[-2]
|
key = urlparse(url).path.split('/')[-2]
|
||||||
ext = content_type.split('/')[-1]
|
ext = content_type.split('/')[-1]
|
||||||
|
|
||||||
return f'{key}.{ext}'
|
return f'{key}.{ext}'
|
||||||
@@ -9,7 +9,8 @@ import tempfile
|
|||||||
import requests
|
import requests
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
import youtube_dl
|
import youtube_dl
|
||||||
import json
|
import json
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
BASE_URL = 'https://rumble.com'
|
BASE_URL = 'https://rumble.com'
|
||||||
|
|
||||||
@@ -82,7 +83,7 @@ class RumbleScraper(cisticola.scraper.base.Scraper):
|
|||||||
blob = f.read()
|
blob = f.read()
|
||||||
|
|
||||||
if key is None:
|
if key is None:
|
||||||
key = url.split('/')[-2] + ext
|
key = urlparse(url).path.split('/')[-2] + ext
|
||||||
|
|
||||||
return blob, content_type, key
|
return blob, content_type, key
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user