From 75240bb060b7fac806fde0d832579d385f335277 Mon Sep 17 00:00:00 2001
From: Tristan Lee
Date: Tue, 1 Mar 2022 15:58:18 -0600
Subject: [PATCH] fixed various bugs related to archived URL creation and
 media downloading. Things seem to work well now

---
Review notes (reviewer commentary placed after the three-dash line; it is not
part of the commit message or of the diff):
 * This file had lost its line breaks. The layout below was rebuilt from the
   hunk headers and the +/- markers; indentation and the placement of blank
   context lines are inferred. Confirm with `git apply --check` against the
   pre-image blobs (c9e3fb7, 66ec977, 2876a66, e833ec3) before relying on it.
 * The From: header above carries no e-mail address; restore it before
   feeding this file to `git am`.
 * twitter.py url_to_key(): `ext` is never assigned when the URL is neither
   format=jpg nor a path ending in .mp4, so `key = ... + ext` raises
   UnboundLocalError for any other media type (see the TODO in that hunk).
 * odysee.py: `r.headers['Content-Type']` raises KeyError when the header is
   absent, and the exact comparison with 'text/html; charset=utf-8' will not
   match other spellings of the same content type.

 cisticola/scraper/base.py    | 25 ++++++++++++++++++++++++-
 cisticola/scraper/gettr.py   | 25 +++----------------------
 cisticola/scraper/odysee.py  | 12 ++++++++++--
 cisticola/scraper/twitter.py | 15 ++++++++++++++-
 4 files changed, 51 insertions(+), 26 deletions(-)

diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py
index c9e3fb7..465b1f7 100644
--- a/cisticola/scraper/base.py
+++ b/cisticola/scraper/base.py
@@ -6,7 +6,8 @@ import boto3
 from io import BytesIO
 from urllib.parse import urlparse
 from loguru import logger
-
+import ffmpeg
+import tempfile
 
 class Scraper:
     __version__ = "Scraper 0.0.0"
@@ -55,6 +56,28 @@ class Scraper:
 
         return blob, content_type, key
 
+    def m3u8_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
+
+        content_type = 'video/mp4'
+        ext = '.' + content_type.split('/')[-1]
+
+        with tempfile.NamedTemporaryFile(suffix = ext) as temp_file:
+
+            (
+            ffmpeg
+            .input(url)
+            .output(temp_file.name, vcodec='copy')
+            .global_args('-loglevel', 'error')
+            .run(overwrite_output=True))
+
+            temp_file.seek(0)
+            blob = temp_file.read()
+
+        if key is None:
+            key = self.url_to_key(url = url, content_type = content_type)
+
+        return blob, content_type, key
+
     def archive_media(self, blob: bytes, content_type: str, key: str) -> str:
 
         filename = self.__version__.replace(' ', '_') + '/' + key
diff --git a/cisticola/scraper/gettr.py b/cisticola/scraper/gettr.py
index 66ec977..cdcb6cf 100644
--- a/cisticola/scraper/gettr.py
+++ b/cisticola/scraper/gettr.py
@@ -4,8 +4,6 @@ from datetime import datetime
 import json
 from typing import Generator, Tuple
 from gogettr import PublicClient
-import ffmpeg
-import tempfile
 from urllib.parse import urlparse
 
 class GettrScraper(cisticola.scraper.base.Scraper):
@@ -63,24 +61,7 @@ class GettrScraper(cisticola.scraper.base.Scraper):
         if channel.platform == "Gettr" and GettrScraper.get_username_from_url(channel.url) is not None:
             return True
 
-    def m3u8_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
-
-        content_type = 'video/mp4'
+    def url_to_key(self, url: str, content_type: str) -> str:
         ext = '.' + content_type.split('/')[-1]
-
-        with tempfile.NamedTemporaryFile(suffix = ext) as temp_file:
-
-            (
-            ffmpeg
-            .input(url)
-            .output(temp_file.name, vcodec='copy')
-            .global_args('-loglevel', 'error')
-            .run(overwrite_output=True))
-
-            temp_file.seek(0)
-            blob = temp_file.read()
-
-        if key is None:
-            key = urlparse(url).path.split('/')[-2] + ext
-
-        return blob, content_type, key
\ No newline at end of file
+        key = urlparse(url).path.split('/')[-2] + ext
+        return key
\ No newline at end of file
diff --git a/cisticola/scraper/odysee.py b/cisticola/scraper/odysee.py
index 2876a66..fc0c3da 100644
--- a/cisticola/scraper/odysee.py
+++ b/cisticola/scraper/odysee.py
@@ -5,6 +5,7 @@ import json
 from typing import Generator
 from polyphemus.base import OdyseeChannel
 from urllib.parse import urlparse
+import requests
 
 class OdyseeScraper(cisticola.scraper.base.Scraper):
     """An implementation of a Scraper for Odysee, using polyphemus library"""
@@ -29,7 +30,14 @@ class OdyseeScraper(cisticola.scraper.base.Scraper):
 
             archived_urls = {}
             url = video.info['streaming_url']
-            media_blob, content_type, key = self.url_to_blob(url)
+
+            # Check if file is a video file or an m3u8 file
+            r = requests.head(url)
+            if r.headers['Content-Type'] == 'text/html; charset=utf-8':
+                media_blob, content_type, key = self.m3u8_url_to_blob(url)
+            else:
+                media_blob, content_type, key = self.url_to_blob(url)
+
             archived_url = self.archive_media(media_blob, content_type, key)
             archived_urls[url] = archived_url
 
@@ -55,7 +63,7 @@ class OdyseeScraper(cisticola.scraper.base.Scraper):
                     date=datetime.fromtimestamp(comment.info['created']),
                     date_archived=datetime.now(),
                     raw_data=json.dumps(comment.info),
-                    archived_urls=archived_urls)
+                    archived_urls={})
 
     def can_handle(self, channel):
         if channel.platform == "Odysee" and OdyseeScraper.get_username_from_url(channel.url) is not None:
diff --git a/cisticola/scraper/twitter.py b/cisticola/scraper/twitter.py
index e833ec3..e36aab1 100644
--- a/cisticola/scraper/twitter.py
+++ b/cisticola/scraper/twitter.py
@@ -4,7 +4,7 @@ from datetime import datetime, timezone
 from typing import Generator
 import snscrape.modules
 from loguru import logger
-
+from urllib.parse import urlparse, parse_qs
 
 class TwitterScraper(cisticola.scraper.base.Scraper):
     """An implementation of a Scraper for Twitter, using snscrape library"""
@@ -58,3 +58,16 @@ class TwitterScraper(cisticola.scraper.base.Scraper):
     def can_handle(self, channel):
         if channel.platform == "Twitter" and channel.platform_id:
             return True
+
+    def url_to_key(self, url: str, content_type: str) -> str:
+        parsed_url = urlparse(url)
+        queries = parse_qs(parsed_url.query)
+
+        # TODO might require additional statements for other media formats
+        if 'jpg' in queries.get('format', []):
+            ext = '.jpg'
+        elif parsed_url.path.endswith('.mp4'):
+            ext = ''
+
+        key = parsed_url.path.split('/')[-1] + ext
+        return key
\ No newline at end of file