Fixed various bugs related to archived URL creation and media downloading. Things seem to work well now.

This commit is contained in:
Tristan Lee
2022-03-01 15:58:18 -06:00
parent f3d9dc91c6
commit 75240bb060
4 changed files with 51 additions and 26 deletions

View File

@@ -6,7 +6,8 @@ import boto3
from io import BytesIO
from urllib.parse import urlparse
from loguru import logger
import ffmpeg
import tempfile
class Scraper:
__version__ = "Scraper 0.0.0"
@@ -55,6 +56,28 @@ class Scraper:
return blob, content_type, key
def m3u8_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
    """Fetch the stream at ``url`` with ffmpeg and return it as MP4 bytes.

    ffmpeg writes its output to a named temporary file (video codec copied,
    log level limited to errors); the file is then read back into memory.

    Args:
        url: Location passed to ffmpeg as its input.
        key: Storage key to return. When ``None``, the key is obtained from
            ``self.url_to_key`` using ``url`` and the MP4 content type.

    Returns:
        A ``(blob, content_type, key)`` tuple, where ``content_type`` is
        always ``'video/mp4'``.
    """
    content_type = 'video/mp4'
    suffix = '.' + content_type.split('/')[-1]

    with tempfile.NamedTemporaryFile(suffix=suffix) as output_file:
        # ffmpeg writes to the file by name, so it must be allowed to
        # overwrite the (empty) file the context manager already created.
        stream = ffmpeg.input(url)
        stream = stream.output(output_file.name, vcodec='copy')
        stream = stream.global_args('-loglevel', 'error')
        stream.run(overwrite_output=True)

        # Rewind before reading the finished output through our own handle.
        output_file.seek(0)
        blob = output_file.read()

    if key is None:
        key = self.url_to_key(url=url, content_type=content_type)

    return blob, content_type, key
def archive_media(self, blob: bytes, content_type: str, key: str) -> str:
filename = self.__version__.replace(' ', '_') + '/' + key

View File

@@ -4,8 +4,6 @@ from datetime import datetime
import json
from typing import Generator, Tuple
from gogettr import PublicClient
import ffmpeg
import tempfile
from urllib.parse import urlparse
class GettrScraper(cisticola.scraper.base.Scraper):
@@ -63,24 +61,7 @@ class GettrScraper(cisticola.scraper.base.Scraper):
if channel.platform == "Gettr" and GettrScraper.get_username_from_url(channel.url) is not None:
return True
def m3u8_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
content_type = 'video/mp4'
def url_to_key(self, url: str, content_type: str) -> str:
ext = '.' + content_type.split('/')[-1]
with tempfile.NamedTemporaryFile(suffix = ext) as temp_file:
(
ffmpeg
.input(url)
.output(temp_file.name, vcodec='copy')
.global_args('-loglevel', 'error')
.run(overwrite_output=True))
temp_file.seek(0)
blob = temp_file.read()
if key is None:
key = urlparse(url).path.split('/')[-2] + ext
return blob, content_type, key
key = urlparse(url).path.split('/')[-2] + ext
return key

View File

@@ -5,6 +5,7 @@ import json
from typing import Generator
from polyphemus.base import OdyseeChannel
from urllib.parse import urlparse
import requests
class OdyseeScraper(cisticola.scraper.base.Scraper):
"""An implementation of a Scraper for Odysee, using polyphemus library"""
@@ -29,7 +30,14 @@ class OdyseeScraper(cisticola.scraper.base.Scraper):
archived_urls = {}
url = video.info['streaming_url']
media_blob, content_type, key = self.url_to_blob(url)
# Check if file is a video file or an m3u8 file
r = requests.head(url)
if r.headers['Content-Type'] == 'text/html; charset=utf-8':
media_blob, content_type, key = self.m3u8_url_to_blob(url)
else:
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_media(media_blob, content_type, key)
archived_urls[url] = archived_url
@@ -55,7 +63,7 @@ class OdyseeScraper(cisticola.scraper.base.Scraper):
date=datetime.fromtimestamp(comment.info['created']),
date_archived=datetime.now(),
raw_data=json.dumps(comment.info),
archived_urls=archived_urls)
archived_urls={})
def can_handle(self, channel):
if channel.platform == "Odysee" and OdyseeScraper.get_username_from_url(channel.url) is not None:

View File

@@ -4,7 +4,7 @@ from datetime import datetime, timezone
from typing import Generator
import snscrape.modules
from loguru import logger
from urllib.parse import urlparse, parse_qs
class TwitterScraper(cisticola.scraper.base.Scraper):
"""An implementation of a Scraper for Twitter, using snscrape library"""
@@ -58,3 +58,16 @@ class TwitterScraper(cisticola.scraper.base.Scraper):
def can_handle(self, channel):
    """Report whether this scraper applies to ``channel``.

    Returns ``True`` when the channel's platform is ``"Twitter"`` and its
    ``platform_id`` is truthy; otherwise falls through and returns ``None``.
    """
    on_twitter = channel.platform == "Twitter"
    if on_twitter and channel.platform_id:
        return True
def url_to_key(self, url: str, content_type: str) -> str:
    """Build a storage key (file name) from a media URL.

    The key is the last path segment of ``url`` plus an extension taken from
    the ``format`` query parameter when one is present (e.g.
    ``.../media/abc?format=jpg`` -> ``abc.jpg``). URLs without a ``format``
    parameter, and paths already ending in ``.mp4``, get no extra extension.

    Args:
        url: Media URL to derive the key from.
        content_type: Unused here; accepted so the signature matches the
            ``url_to_key(url=..., content_type=...)`` call convention.

    Returns:
        The derived key.
    """
    parsed_url = urlparse(url)
    formats = parse_qs(parsed_url.query).get('format', [])

    if 'jpg' in formats:
        ext = '.jpg'
    elif parsed_url.path.endswith('.mp4') or not formats:
        # The path segment is used as-is. Previously any URL reaching this
        # point without an ".mp4" path left ``ext`` unbound and crashed.
        ext = ''
    else:
        # Other image formats (png, webp, ...) are named by the query string.
        ext = '.' + formats[0]

    return parsed_url.path.split('/')[-1] + ext