mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-08 03:18:34 +03:00
Fixed various bugs related to archived URL creation and media downloading. Things seem to work well now.
This commit is contained in:
@@ -6,7 +6,8 @@ import boto3
|
||||
from io import BytesIO
|
||||
from urllib.parse import urlparse
|
||||
from loguru import logger
|
||||
|
||||
import ffmpeg
|
||||
import tempfile
|
||||
class Scraper:
|
||||
__version__ = "Scraper 0.0.0"
|
||||
|
||||
@@ -55,6 +56,28 @@ class Scraper:
|
||||
|
||||
return blob, content_type, key
|
||||
|
||||
def m3u8_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
    """Fetch the stream at ``url`` through ffmpeg and return it as MP4 bytes.

    ffmpeg writes its output to a named temporary file (video codec set to
    ``copy``), which is then read back into memory.

    Args:
        url: Location handed to ffmpeg as its input.
        key: Storage key for the media. When ``None``, the key is derived
            with ``self.url_to_key``.

    Returns:
        A ``(blob, content_type, key)`` tuple, where ``content_type`` is
        always ``'video/mp4'``.
    """
    content_type = 'video/mp4'
    # Derive the temp-file suffix from the MIME subtype so ffmpeg picks the
    # matching container from the output file name.
    suffix = '.' + content_type.split('/')[-1]

    with tempfile.NamedTemporaryFile(suffix=suffix) as mp4_file:
        stream = ffmpeg.input(url)
        stream = stream.output(mp4_file.name, vcodec='copy')
        stream = stream.global_args('-loglevel', 'error')
        # The temp file already exists on disk, so ffmpeg must be allowed
        # to overwrite it.
        stream.run(overwrite_output=True)

        # Read what ffmpeg wrote, starting from the beginning of the file.
        mp4_file.seek(0)
        blob = mp4_file.read()

    if key is None:
        key = self.url_to_key(url=url, content_type=content_type)

    return blob, content_type, key
|
||||
|
||||
def archive_media(self, blob: bytes, content_type: str, key: str) -> str:
|
||||
|
||||
filename = self.__version__.replace(' ', '_') + '/' + key
|
||||
|
||||
@@ -4,8 +4,6 @@ from datetime import datetime
|
||||
import json
|
||||
from typing import Generator, Tuple
|
||||
from gogettr import PublicClient
|
||||
import ffmpeg
|
||||
import tempfile
|
||||
from urllib.parse import urlparse
|
||||
|
||||
class GettrScraper(cisticola.scraper.base.Scraper):
|
||||
@@ -63,24 +61,7 @@ class GettrScraper(cisticola.scraper.base.Scraper):
|
||||
if channel.platform == "Gettr" and GettrScraper.get_username_from_url(channel.url) is not None:
|
||||
return True
|
||||
|
||||
def m3u8_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
|
||||
|
||||
content_type = 'video/mp4'
|
||||
def url_to_key(self, url: str, content_type: str) -> str:
|
||||
ext = '.' + content_type.split('/')[-1]
|
||||
|
||||
with tempfile.NamedTemporaryFile(suffix = ext) as temp_file:
|
||||
|
||||
(
|
||||
ffmpeg
|
||||
.input(url)
|
||||
.output(temp_file.name, vcodec='copy')
|
||||
.global_args('-loglevel', 'error')
|
||||
.run(overwrite_output=True))
|
||||
|
||||
temp_file.seek(0)
|
||||
blob = temp_file.read()
|
||||
|
||||
if key is None:
|
||||
key = urlparse(url).path.split('/')[-2] + ext
|
||||
|
||||
return blob, content_type, key
|
||||
key = urlparse(url).path.split('/')[-2] + ext
|
||||
return key
|
||||
@@ -5,6 +5,7 @@ import json
|
||||
from typing import Generator
|
||||
from polyphemus.base import OdyseeChannel
|
||||
from urllib.parse import urlparse
|
||||
import requests
|
||||
|
||||
class OdyseeScraper(cisticola.scraper.base.Scraper):
|
||||
"""An implementation of a Scraper for Odysee, using polyphemus library"""
|
||||
@@ -29,7 +30,14 @@ class OdyseeScraper(cisticola.scraper.base.Scraper):
|
||||
|
||||
archived_urls = {}
|
||||
url = video.info['streaming_url']
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
|
||||
# Check if file is a video file or an m3u8 file
|
||||
r = requests.head(url)
|
||||
if r.headers['Content-Type'] == 'text/html; charset=utf-8':
|
||||
media_blob, content_type, key = self.m3u8_url_to_blob(url)
|
||||
else:
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
|
||||
archived_url = self.archive_media(media_blob, content_type, key)
|
||||
archived_urls[url] = archived_url
|
||||
|
||||
@@ -55,7 +63,7 @@ class OdyseeScraper(cisticola.scraper.base.Scraper):
|
||||
date=datetime.fromtimestamp(comment.info['created']),
|
||||
date_archived=datetime.now(),
|
||||
raw_data=json.dumps(comment.info),
|
||||
archived_urls=archived_urls)
|
||||
archived_urls={})
|
||||
|
||||
def can_handle(self, channel):
|
||||
if channel.platform == "Odysee" and OdyseeScraper.get_username_from_url(channel.url) is not None:
|
||||
|
||||
@@ -4,7 +4,7 @@ from datetime import datetime, timezone
|
||||
from typing import Generator
|
||||
import snscrape.modules
|
||||
from loguru import logger
|
||||
|
||||
from urllib.parse import urlparse, parse_qs
|
||||
|
||||
class TwitterScraper(cisticola.scraper.base.Scraper):
|
||||
"""An implementation of a Scraper for Twitter, using snscrape library"""
|
||||
@@ -58,3 +58,16 @@ class TwitterScraper(cisticola.scraper.base.Scraper):
|
||||
def can_handle(self, channel):
|
||||
if channel.platform == "Twitter" and channel.platform_id:
|
||||
return True
|
||||
|
||||
def url_to_key(self, url: str, content_type: str) -> str:
    """Build the storage key for a Twitter media URL.

    The key is the last path segment of ``url``. When the query string
    declares ``format=jpg`` a ``.jpg`` extension is appended; in every other
    case the segment is used unchanged (paths such as ``.../clip.mp4``
    already carry their extension).

    Args:
        url: Media URL to derive the key from.
        content_type: Not used by this implementation; accepted so the
            signature matches the other ``url_to_key`` implementations.

    Returns:
        The key string for the media item.
    """
    parsed_url = urlparse(url)
    formats = parse_qs(parsed_url.query).get('format', [])

    # TODO might require additional statements for other media formats
    if 'jpg' in formats:
        ext = '.jpg'
    else:
        # Default to no extra extension. Previously `ext` was only bound for
        # `format=jpg` and `.mp4` URLs, so any other URL (e.g. `format=png`)
        # raised UnboundLocalError below.
        ext = ''

    return parsed_url.path.split('/')[-1] + ext
|
||||
Reference in New Issue
Block a user