Fixed various bugs related to archived URL creation and media downloading. Things seem to work well now.

2026-06-08 03:18:34 +03:00 · 2022-03-01 15:58:18 -06:00
parent f3d9dc91c6
commit 75240bb060
4 changed files with 51 additions and 26 deletions
--- a/cisticola/scraper/base.py
+++ b/cisticola/scraper/base.py
@@ -6,7 +6,8 @@ import boto3
 from io import BytesIO
 from urllib.parse import urlparse
 from loguru import logger
-
+import ffmpeg
+import tempfile
 class Scraper:
    __version__ = "Scraper 0.0.0"

@@ -55,6 +56,28 @@ class Scraper:

        return blob, content_type, key

+    def m3u8_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
+        
+        content_type = 'video/mp4'
+        ext = '.' + content_type.split('/')[-1]
+
+        with tempfile.NamedTemporaryFile(suffix = ext) as temp_file:
+            
+            (
+                ffmpeg
+                .input(url)
+                .output(temp_file.name, vcodec='copy')
+                .global_args('-loglevel', 'error')
+                .run(overwrite_output=True))
+            
+            temp_file.seek(0)
+            blob = temp_file.read()
+
+        if key is None:
+            key = self.url_to_key(url = url, content_type = content_type)
+
+        return blob, content_type, key
+
    def archive_media(self, blob: bytes, content_type: str, key: str) -> str:

        filename = self.__version__.replace(' ', '_') + '/' + key
--- a/cisticola/scraper/gettr.py
+++ b/cisticola/scraper/gettr.py
@@ -4,8 +4,6 @@ from datetime import datetime
 import json
 from typing import Generator, Tuple
 from gogettr import PublicClient
-import ffmpeg
-import tempfile
 from urllib.parse import urlparse

 class GettrScraper(cisticola.scraper.base.Scraper):
@@ -63,24 +61,7 @@ class GettrScraper(cisticola.scraper.base.Scraper):
        if channel.platform == "Gettr" and GettrScraper.get_username_from_url(channel.url) is not None:
            return True

-    def m3u8_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
-        
-        content_type = 'video/mp4'
+    def url_to_key(self, url: str, content_type: str) -> str:
        ext = '.' + content_type.split('/')[-1]
-
-        with tempfile.NamedTemporaryFile(suffix = ext) as temp_file:
-            
-            (
-                ffmpeg
-                .input(url)
-                .output(temp_file.name, vcodec='copy')
-                .global_args('-loglevel', 'error')
-                .run(overwrite_output=True))
-            
-            temp_file.seek(0)
-            blob = temp_file.read()
-
-        if key is None:
-            key = urlparse(url).path.split('/')[-2] + ext
-
-        return blob, content_type, key
+        key = urlparse(url).path.split('/')[-2] + ext
+        return key 
--- a/cisticola/scraper/odysee.py
+++ b/cisticola/scraper/odysee.py
@@ -5,6 +5,7 @@ import json
 from typing import Generator
 from polyphemus.base import OdyseeChannel
 from urllib.parse import urlparse
+import requests

 class OdyseeScraper(cisticola.scraper.base.Scraper):
    """An implementation of a Scraper for Odysee, using polyphemus library"""
@@ -29,7 +30,14 @@ class OdyseeScraper(cisticola.scraper.base.Scraper):

            archived_urls = {}
            url = video.info['streaming_url']
-            media_blob, content_type, key = self.url_to_blob(url)
+
+            # Check if file is a video file or an m3u8 file
+            r = requests.head(url)
+            if r.headers['Content-Type'] == 'text/html; charset=utf-8':
+                media_blob, content_type, key = self.m3u8_url_to_blob(url)
+            else:
+                media_blob, content_type, key = self.url_to_blob(url)
+
            archived_url = self.archive_media(media_blob, content_type, key)
            archived_urls[url] = archived_url

@@ -55,7 +63,7 @@ class OdyseeScraper(cisticola.scraper.base.Scraper):
                    date=datetime.fromtimestamp(comment.info['created']),
                    date_archived=datetime.now(),
                    raw_data=json.dumps(comment.info),
-                    archived_urls=archived_urls)
+                    archived_urls={})

    def can_handle(self, channel):
        if channel.platform == "Odysee" and OdyseeScraper.get_username_from_url(channel.url) is not None:
--- a/cisticola/scraper/twitter.py
+++ b/cisticola/scraper/twitter.py
@@ -4,7 +4,7 @@ from datetime import datetime, timezone
 from typing import Generator
 import snscrape.modules
 from loguru import logger
-
+from urllib.parse import urlparse, parse_qs

 class TwitterScraper(cisticola.scraper.base.Scraper):
    """An implementation of a Scraper for Twitter, using snscrape library"""
@@ -58,3 +58,16 @@ class TwitterScraper(cisticola.scraper.base.Scraper):
    def can_handle(self, channel):
        if channel.platform == "Twitter" and channel.platform_id:
            return True
+
+    def url_to_key(self, url: str, content_type: str) -> str:
+        parsed_url = urlparse(url)
+        queries = parse_qs(parsed_url.query)
+
+        # TODO might require additional statements for other media formats
+        if 'jpg' in queries.get('format', []):
+            ext = '.jpg'
+        elif parsed_url.path.endswith('.mp4'):
+            ext = ''
+
+        key = parsed_url.path.split('/')[-1] + ext
+        return key