From 965bf1e2dc5013ee5865be4885973be872cedb97 Mon Sep 17 00:00:00 2001
From: Tristan Lee <tristan@bellingcat.com>
Date: Fri, 11 Mar 2022 17:19:52 -0600
Subject: [PATCH] Add YouTube scraper; switch from youtube-dl to yt-dlp for
 faster YouTube video downloads

---
 Pipfile                        |  2 +-
 cisticola/scraper/__init__.py  |  3 +-
 cisticola/scraper/base.py      |  8 ++--
 cisticola/scraper/rumble.py    |  2 +-
 cisticola/scraper/vkontakte.py |  2 +-
 cisticola/scraper/youtube.py   | 79 ++++++++++++++++++++++++++++++++++
 tests/conftest.py              | 18 +++++++-
 tests/scraper/youtube.py       | 16 +++++++
 8 files changed, 121 insertions(+), 9 deletions(-)
 create mode 100644 cisticola/scraper/youtube.py
 create mode 100644 tests/scraper/youtube.py

diff --git a/Pipfile b/Pipfile
index 5f86225..0337328 100644
--- a/Pipfile
+++ b/Pipfile
@@ -16,7 +16,7 @@ snscrape = {git = "https://github.com/bellingcat/snscrape.git"}
 ffmpeg-python = "*"
 polyphemus = {git = "https://github.com/bellingcat/polyphemus.git"}
 garc = "*"
-youtube-dl = "*" 
+yt-dlp = "*" 
 telethon = "*"
 pytesseract = "*"
 pyexiftool = {git = "https://github.com/smarnach/pyexiftool.git"}
diff --git a/cisticola/scraper/__init__.py b/cisticola/scraper/__init__.py
index 92a3e7a..f5240d3 100644
--- a/cisticola/scraper/__init__.py
+++ b/cisticola/scraper/__init__.py
@@ -8,4 +8,5 @@ from .rumble import RumbleScraper
 from .telegram_snscrape import TelegramSnscrapeScraper
 from .telegram_telethon import TelegramTelethonScraper
 from .twitter import TwitterScraper
-from .vkontakte import VkontakteScraper
\ No newline at end of file
+from .vkontakte import VkontakteScraper
+from .youtube import YoutubeScraper
\ No newline at end of file
diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py
index 9666faf..d7f69a1 100644
--- a/cisticola/scraper/base.py
+++ b/cisticola/scraper/base.py
@@ -8,7 +8,7 @@ import boto3
 from loguru import logger
 import ffmpeg
 from sqlalchemy.orm import sessionmaker
-import youtube_dl
+import yt_dlp
 
 from cisticola.base import Channel, ScraperResult, mapper_registry
 from cisticola.utils import make_request
@@ -70,7 +70,7 @@ class Scraper:
 
         return blob, content_type, key
 
-    def youtubedl_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
+    def ytdlp_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
         
         content_type = 'video/mp4'
 
@@ -82,13 +82,13 @@ class Scraper:
                 "noplaylist": True,
                 'quiet': True,
                 "verbose": False,}
-            ydl = youtube_dl.YoutubeDL(ydl_opts)
+            ydl = yt_dlp.YoutubeDL(ydl_opts)
 
             try:
                 meta = ydl.extract_info(
                     url,
                     download=True,)
-            except youtube_dl.utils.DownloadError as e:
+            except yt_dlp.utils.DownloadError as e:
                 raise e
             else:
                 video_id = meta["id"]
diff --git a/cisticola/scraper/rumble.py b/cisticola/scraper/rumble.py
index f0cdefb..8546d6e 100644
--- a/cisticola/scraper/rumble.py
+++ b/cisticola/scraper/rumble.py
@@ -34,7 +34,7 @@ class RumbleScraper(Scraper):
 
                 url = post['media_url']
 
-                media_blob, content_type, key = self.youtubedl_url_to_blob(url)
+                media_blob, content_type, key = self.ytdlp_url_to_blob(url)
                 archived_url = self.archive_blob(media_blob, content_type, key)
                 archived_urls[post['media_url']] = archived_url
 
diff --git a/cisticola/scraper/vkontakte.py b/cisticola/scraper/vkontakte.py
index 7535324..7ca5659 100644
--- a/cisticola/scraper/vkontakte.py
+++ b/cisticola/scraper/vkontakte.py
@@ -51,7 +51,7 @@ class VkontakteScraper(Scraper):
 
                 if post.video:
                     url = post.video.url
-                    media_blob, content_type, key = self.youtubedl_url_to_blob(url)
+                    media_blob, content_type, key = self.ytdlp_url_to_blob(url)
                     archived_url = self.archive_blob(media_blob, content_type, key)
                     archived_urls[url] = archived_url
 
diff --git a/cisticola/scraper/youtube.py b/cisticola/scraper/youtube.py
new file mode 100644
index 0000000..20ae6a3
--- /dev/null
+++ b/cisticola/scraper/youtube.py
@@ -0,0 +1,79 @@
+from datetime import datetime, timezone
+import json
+from typing import Generator
+import tempfile
+
+import yt_dlp
+
+from cisticola.base import Channel, ScraperResult
+from cisticola.scraper import Scraper
+
+class YoutubeScraper(Scraper):
+    """An implementation of a Scraper for Youtube, using yt-dlp"""
+    __version__ = "YoutubeScraper 0.0.1"
+
+    def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
+        """Yield a ScraperResult for every video on the channel newer than `since`.
+
+        Args:
+            channel: Channel whose `url` is handed to yt-dlp.
+            since: Previous result whose `date` is the cut-off; None keeps all videos.
+            archive_media: If True, download each video and archive it.
+
+        Raises:
+            yt_dlp.utils.DownloadError: If yt-dlp fails to extract the channel.
+        """
+        content_type = 'video/mp4'
+
+        if since is None:
+            since_date = None
+            start_date = None
+        else:
+            since_date = since.date
+            if since_date.tzinfo is None:
+                # Upload dates below are UTC-aware; a naive stored date is taken
+                # to be UTC so the comparison cannot raise TypeError.
+                since_date = since_date.replace(tzinfo=timezone.utc)
+            start_date = since.date.strftime('%Y%m%d')
+
+        with tempfile.TemporaryDirectory() as temp_dir:
+            ydl_opts = {
+                "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
+                "merge_output_format": "mp4",
+                "outtmpl": f"{temp_dir}/%(id)s.%(ext)s",
+                # Day-granular pre-filter handed to yt-dlp; the precise cut-off
+                # against `since_date` is applied per video below.
+                "daterange": yt_dlp.utils.DateRange(start=start_date)}
+
+            # A yt_dlp.utils.DownloadError raised here propagates to the caller.
+            meta = yt_dlp.YoutubeDL(ydl_opts).extract_info(channel.url, download=archive_media)
+
+            for video in meta['entries']:
+                upload_date = datetime.strptime(video['upload_date'], '%Y%m%d').replace(tzinfo=timezone.utc)
+                if since_date is not None and upload_date <= since_date:
+                    continue
+
+                archived_urls = {}
+                video_id = video["id"]
+
+                if archive_media:
+                    key = f"{video_id}.{video['ext']}"
+                    with open(f"{temp_dir}/{key}", "rb") as f:
+                        media_blob = f.read()
+
+                    # Archive each blob exactly once, keyed by the video's page URL.
+                    archived_urls[video['webpage_url']] = self.archive_blob(media_blob, content_type, key)
+
+                yield ScraperResult(
+                    scraper=self.__version__,
+                    platform="Youtube",
+                    channel=channel.id,
+                    platform_id=video_id,
+                    date=upload_date,
+                    date_archived=datetime.now(timezone.utc),
+                    raw_data=json.dumps(video, default = str),
+                    archived_urls=archived_urls)
+
+    def can_handle(self, channel):
+        """Return True when `channel` is a Youtube channel with a non-empty URL."""
+        return channel.platform == "Youtube" and bool(channel.url)
\ No newline at end of file
diff --git a/tests/conftest.py b/tests/conftest.py
index 26bd92b..338fcb9 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -126,6 +126,21 @@ VKONTAKTE_CHANNEL_KWARGS = {
     'chat': False,
     'notes': ''}
 
+YOUTUBE_CHANNEL_KWARGS = {  # keyword arguments for building the Youtube test Channel
+    'id': 7,
+    'name': 'AnEs87 (test)',
+    'platform_id': 'UCP6exBqGoxGLv_pM9Dxk2pA',
+    'category': 'test',
+    'followers': None,
+    'platform': 'Youtube',
+    'url': 'https://www.youtube.com/channel/UCP6exBqGoxGLv_pM9Dxk2pA',
+    'screenname': 'AnEs87',
+    'country': 'SV',
+    'influencer': None,
+    'public': True,
+    'chat': False,
+    'notes': ''}
+
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
 
 @pytest.fixture(scope='package')
@@ -158,6 +173,7 @@ def channel_kwargs():
         'rumble' : RUMBLE_CHANNEL_KWARGS,
         'telegram' : TELEGRAM_CHANNEL_KWARGS,
         'twitter' : TWITTER_CHANNEL_KWARGS,
-        'vkontakte' : VKONTAKTE_CHANNEL_KWARGS}
+        'vkontakte' : VKONTAKTE_CHANNEL_KWARGS,
+        'youtube' : YOUTUBE_CHANNEL_KWARGS}
 
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
\ No newline at end of file
diff --git a/tests/scraper/youtube.py b/tests/scraper/youtube.py
new file mode 100644
index 0000000..9d14760
--- /dev/null
+++ b/tests/scraper/youtube.py
@@ -0,0 +1,16 @@
+from cisticola.base import Channel
+from cisticola.scraper import YoutubeScraper
+
+def test_scrape_youtube_channel_no_media(controller, channel_kwargs):
+    """Scrape the Youtube test channel with media archiving switched off."""
+    youtube_channel = Channel(**channel_kwargs['youtube'])
+    controller.register_scraper(scraper=YoutubeScraper())
+    controller.scrape_channels(channels=[youtube_channel], archive_media=False)
+
+def test_scrape_youtube_channel(controller, channel_kwargs):
+    """Reset the controller's DB, then scrape the Youtube test channel with media archiving on."""
+    controller.reset_db()
+
+    youtube_channel = Channel(**channel_kwargs['youtube'])
+    controller.register_scraper(scraper=YoutubeScraper())
+    controller.scrape_channels(channels=[youtube_channel], archive_media=True)