From 75240bb060b7fac806fde0d832579d385f335277 Mon Sep 17 00:00:00 2001
From: Tristan Lee <tristan@bellingcat.com>
Date: Tue, 1 Mar 2022 15:58:18 -0600
Subject: [PATCH] Fix various bugs related to archived URL creation and media
 downloading; things seem to work well now

---
 cisticola/scraper/base.py    | 25 ++++++++++++++++++++++++-
 cisticola/scraper/gettr.py   | 25 +++----------------------
 cisticola/scraper/odysee.py  | 12 ++++++++++--
 cisticola/scraper/twitter.py | 15 ++++++++++++++-
 4 files changed, 51 insertions(+), 26 deletions(-)

diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py
index c9e3fb7..465b1f7 100644
--- a/cisticola/scraper/base.py
+++ b/cisticola/scraper/base.py
@@ -6,7 +6,8 @@ import boto3
 from io import BytesIO
 from urllib.parse import urlparse
 from loguru import logger
-
+import ffmpeg
+import tempfile
 class Scraper:
     __version__ = "Scraper 0.0.0"
 
@@ -55,6 +56,28 @@ class Scraper:
 
         return blob, content_type, key
 
+    def m3u8_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
+        # Save the media at `url` to an MP4 file via ffmpeg; return (blob, content_type, key).
+        content_type = 'video/mp4'
+        ext = '.' + content_type.split('/')[-1]
+        # The temporary file is removed automatically when the `with` block exits.
+        with tempfile.NamedTemporaryFile(suffix = ext) as temp_file:
+            # vcodec='copy' copies the video stream instead of re-encoding; only errors are logged.
+            (
+                ffmpeg
+                .input(url)
+                .output(temp_file.name, vcodec='copy')
+                .global_args('-loglevel', 'error')
+                .run(overwrite_output=True))
+            # ffmpeg wrote to the file by name; rewind our own handle and read the bytes back.
+            temp_file.seek(0)
+            blob = temp_file.read()
+        # Derive the key from the URL (via url_to_key) unless the caller supplied one.
+        if key is None:
+            key = self.url_to_key(url = url, content_type = content_type)
+
+        return blob, content_type, key
+
     def archive_media(self, blob: bytes, content_type: str, key: str) -> str:
 
         filename = self.__version__.replace(' ', '_') + '/' + key
diff --git a/cisticola/scraper/gettr.py b/cisticola/scraper/gettr.py
index 66ec977..cdcb6cf 100644
--- a/cisticola/scraper/gettr.py
+++ b/cisticola/scraper/gettr.py
@@ -4,8 +4,6 @@ from datetime import datetime
 import json
 from typing import Generator, Tuple
 from gogettr import PublicClient
-import ffmpeg
-import tempfile
 from urllib.parse import urlparse
 
 class GettrScraper(cisticola.scraper.base.Scraper):
@@ -63,24 +61,7 @@ class GettrScraper(cisticola.scraper.base.Scraper):
         if channel.platform == "Gettr" and GettrScraper.get_username_from_url(channel.url) is not None:
             return True
 
-    def m3u8_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
-        
-        content_type = 'video/mp4'
+    def url_to_key(self, url: str, content_type: str) -> str:
         ext = '.' + content_type.split('/')[-1]
-
-        with tempfile.NamedTemporaryFile(suffix = ext) as temp_file:
-            
-            (
-                ffmpeg
-                .input(url)
-                .output(temp_file.name, vcodec='copy')
-                .global_args('-loglevel', 'error')
-                .run(overwrite_output=True))
-            
-            temp_file.seek(0)
-            blob = temp_file.read()
-
-        if key is None:
-            key = urlparse(url).path.split('/')[-2] + ext
-
-        return blob, content_type, key
\ No newline at end of file
+        key = urlparse(url).path.split('/')[-2] + ext
+        return key 
\ No newline at end of file
diff --git a/cisticola/scraper/odysee.py b/cisticola/scraper/odysee.py
index 2876a66..fc0c3da 100644
--- a/cisticola/scraper/odysee.py
+++ b/cisticola/scraper/odysee.py
@@ -5,6 +5,7 @@ import json
 from typing import Generator
 from polyphemus.base import OdyseeChannel
 from urllib.parse import urlparse
+import requests
 
 class OdyseeScraper(cisticola.scraper.base.Scraper):
     """An implementation of a Scraper for Odysee, using polyphemus library"""
@@ -29,7 +30,14 @@ class OdyseeScraper(cisticola.scraper.base.Scraper):
 
             archived_urls = {}
             url = video.info['streaming_url']
-            media_blob, content_type, key = self.url_to_blob(url)
+
+            # Check if file is a video file or an m3u8 file
+            r = requests.head(url)
+            if r.headers['Content-Type'] == 'text/html; charset=utf-8':
+                media_blob, content_type, key = self.m3u8_url_to_blob(url)
+            else:
+                media_blob, content_type, key = self.url_to_blob(url)
+
             archived_url = self.archive_media(media_blob, content_type, key)
             archived_urls[url] = archived_url
 
@@ -55,7 +63,7 @@ class OdyseeScraper(cisticola.scraper.base.Scraper):
                     date=datetime.fromtimestamp(comment.info['created']),
                     date_archived=datetime.now(),
                     raw_data=json.dumps(comment.info),
-                    archived_urls=archived_urls)
+                    archived_urls={})
 
     def can_handle(self, channel):
         if channel.platform == "Odysee" and OdyseeScraper.get_username_from_url(channel.url) is not None:
diff --git a/cisticola/scraper/twitter.py b/cisticola/scraper/twitter.py
index e833ec3..e36aab1 100644
--- a/cisticola/scraper/twitter.py
+++ b/cisticola/scraper/twitter.py
@@ -4,7 +4,7 @@ from datetime import datetime, timezone
 from typing import Generator
 import snscrape.modules
 from loguru import logger
-
+from urllib.parse import urlparse, parse_qs
 
 class TwitterScraper(cisticola.scraper.base.Scraper):
     """An implementation of a Scraper for Twitter, using snscrape library"""
@@ -58,3 +58,16 @@ class TwitterScraper(cisticola.scraper.base.Scraper):
     def can_handle(self, channel):
         if channel.platform == "Twitter" and channel.platform_id:
             return True
+
+    def url_to_key(self, url: str, content_type: str) -> str:
+        """Return the archive key for `url`: its path basename plus an extension."""
+        parsed_url = urlparse(url)
+        formats = parse_qs(parsed_url.query).get('format', [])
+
+        # Twitter images name their type in the query (``?format=jpg``), whereas
+        # paths such as ``.../clip.mp4`` already end in it. Defaulting to '' keeps
+        # ``ext`` bound for every URL (it was unset for anything but jpg/mp4).
+        ext = '.' + formats[0] if formats else ''
+
+        key = parsed_url.path.split('/')[-1] + ext
+        return key
\ No newline at end of file