Added a YouTube scraper; moved from the official youtube-dl repo to yt-dlp because its download speed for YouTube videos is much better

2026-06-11 12:58:33 +03:00 · 2022-03-11 17:19:52 -06:00
parent 821c39004b
commit 965bf1e2dc
8 changed files with 121 additions and 9 deletions
--- a/2
+++ b/2
@@ -16,7 +16,7 @@ snscrape = {git = "https://github.com/bellingcat/snscrape.git"}
 ffmpeg-python = "*"
 polyphemus = {git = "https://github.com/bellingcat/polyphemus.git"}
 garc = "*"
-youtube-dl = "*" 
+yt-dlp = "*" 
 telethon = "*"
 pytesseract = "*"
 pyexiftool = {git = "https://github.com/smarnach/pyexiftool.git"}
--- a/cisticola/scraper/__init__.py
+++ b/cisticola/scraper/__init__.py
@@ -8,4 +8,5 @@ from .rumble import RumbleScraper
 from .telegram_snscrape import TelegramSnscrapeScraper
 from .telegram_telethon import TelegramTelethonScraper
 from .twitter import TwitterScraper
-from .vkontakte import VkontakteScraper
+from .vkontakte import VkontakteScraper
+from .youtube import YoutubeScraper
--- a/cisticola/scraper/base.py
+++ b/cisticola/scraper/base.py
@@ -8,7 +8,7 @@ import boto3
 from loguru import logger
 import ffmpeg
 from sqlalchemy.orm import sessionmaker
-import youtube_dl
+import yt_dlp

 from cisticola.base import Channel, ScraperResult, mapper_registry
 from cisticola.utils import make_request
@@ -70,7 +70,7 @@ class Scraper:

        return blob, content_type, key

-    def youtubedl_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
+    def ytdlp_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
        
        content_type = 'video/mp4'

@@ -82,13 +82,13 @@ class Scraper:
                "noplaylist": True,
                'quiet': True,
                "verbose": False,}
-            ydl = youtube_dl.YoutubeDL(ydl_opts)
+            ydl = yt_dlp.YoutubeDL(ydl_opts)

            try:
                meta = ydl.extract_info(
                    url,
                    download=True,)
-            except youtube_dl.utils.DownloadError as e:
+            except yt_dlp.utils.DownloadError as e:
                raise e
            else:
                video_id = meta["id"]
--- a/cisticola/scraper/rumble.py
+++ b/cisticola/scraper/rumble.py
@@ -34,7 +34,7 @@ class RumbleScraper(Scraper):

                url = post['media_url']

-                media_blob, content_type, key = self.youtubedl_url_to_blob(url)
+                media_blob, content_type, key = self.ytdlp_url_to_blob(url)
                archived_url = self.archive_blob(media_blob, content_type, key)
                archived_urls[post['media_url']] = archived_url

--- a/cisticola/scraper/vkontakte.py
+++ b/cisticola/scraper/vkontakte.py
@@ -51,7 +51,7 @@ class VkontakteScraper(Scraper):

                if post.video:
                    url = post.video.url
-                    media_blob, content_type, key = self.youtubedl_url_to_blob(url)
+                    media_blob, content_type, key = self.ytdlp_url_to_blob(url)
                    archived_url = self.archive_blob(media_blob, content_type, key)
                    archived_urls[url] = archived_url

--- a/cisticola/scraper/youtube.py
+++ b/cisticola/scraper/youtube.py
@@ -0,0 +1,79 @@
+from datetime import datetime, timezone
+import json
+from typing import Generator
+import tempfile
+
+import yt_dlp
+
+from cisticola.base import Channel, ScraperResult
+from cisticola.scraper import Scraper
+
+class YoutubeScraper(Scraper):
+    """An implementation of a Scraper for Youtube, using yt-dlp"""
+    __version__ = "YoutubeScraper 0.0.1"
+
+    def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
+        """Yield one ScraperResult per video of `channel` uploaded after `since`.
+
+        Videos whose upload date is not later than `since.date` are skipped;
+        with `since=None` every video of the channel is returned. When
+        `archive_media` is true each video file is downloaded to a temporary
+        directory and uploaded through `archive_blob()`.
+        """
+        content_type = 'video/mp4'
+
+        if since is None:
+            since_date = datetime.min.replace(tzinfo=timezone.utc)
+            start_date = None
+        else:
+            # Results are yielded with UTC-aware dates; treat a naive value as
+            # UTC so the comparison below never mixes naive and aware datetimes.
+            since_date = since.date
+            if since_date.tzinfo is None:
+                since_date = since_date.replace(tzinfo=timezone.utc)
+            start_date = since.date.strftime('%Y%m%d')
+
+        with tempfile.TemporaryDirectory() as temp_dir:
+            # "daterange" makes yt-dlp download only videos uploaded on or after
+            # start_date; the strict per-video cut-off is applied in the loop.
+            ydl_opts = {
+                "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
+                "merge_output_format": "mp4",
+                "outtmpl": f"{temp_dir}/%(id)s.%(ext)s",
+                "daterange": yt_dlp.utils.DateRange(start=start_date)}
+
+            meta = yt_dlp.YoutubeDL(ydl_opts).extract_info(
+                channel.url,
+                download=archive_media)
+
+            for video in meta['entries']:
+                # upload_date has day granularity only (YYYYMMDD).
+                upload_date = datetime.strptime(
+                    video['upload_date'], '%Y%m%d').replace(tzinfo=timezone.utc)
+                if upload_date <= since_date:
+                    continue
+
+                archived_urls = {}
+                video_id = video["id"]
+
+                if archive_media:
+                    key = f"{video_id}.{video['ext']}"
+                    with open(f"{temp_dir}/{key}", "rb") as f:
+                        media_blob = f.read()
+                    # Upload once and index the archived copy by its page URL.
+                    archived_urls[video['webpage_url']] = self.archive_blob(
+                        media_blob, content_type, key)
+
+                yield ScraperResult(
+                    scraper=self.__version__,
+                    platform="Youtube",
+                    channel=channel.id,
+                    platform_id=video_id,
+                    date=upload_date,
+                    date_archived=datetime.now(timezone.utc),
+                    raw_data=json.dumps(video, default=str),
+                    archived_urls=archived_urls)
+
+    def can_handle(self, channel):
+        """Return True if `channel` is a Youtube channel with a non-empty URL."""
+        return channel.platform == "Youtube" and bool(channel.url)
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -126,6 +126,21 @@ VKONTAKTE_CHANNEL_KWARGS = {
    'chat': False,
    'notes': ''}

+YOUTUBE_CHANNEL_KWARGS = {  # keyword arguments for Channel(**...) in the Youtube scraper tests
+    'id': 7,
+    'name': 'AnEs87 (test)',
+    'platform_id': 'UCP6exBqGoxGLv_pM9Dxk2pA',
+    'category': 'test',
+    'followers': None,
+    'platform': 'Youtube',
+    'url': 'https://www.youtube.com/channel/UCP6exBqGoxGLv_pM9Dxk2pA',
+    'screenname': 'AnEs87',
+    'country': 'SV',
+    'influencer': None,
+    'public': True,
+    'chat': False,
+    'notes': ''}
+
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

@pytest.fixture(scope='package')
@@ -158,6 +173,7 @@ def channel_kwargs():
        'rumble' : RUMBLE_CHANNEL_KWARGS,
        'telegram' : TELEGRAM_CHANNEL_KWARGS,
        'twitter' : TWITTER_CHANNEL_KWARGS,
-        'vkontakte' : VKONTAKTE_CHANNEL_KWARGS}
+        'vkontakte' : VKONTAKTE_CHANNEL_KWARGS,
+        'youtube' : YOUTUBE_CHANNEL_KWARGS}

 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
--- a/tests/scraper/youtube.py
+++ b/tests/scraper/youtube.py
@@ -0,0 +1,16 @@
+from cisticola.base import Channel
+from cisticola.scraper import YoutubeScraper
+
+def test_scrape_youtube_channel_no_media(controller, channel_kwargs):
+    """Scrape the Youtube test channel with media archiving switched off."""
+    youtube_channel = Channel(**channel_kwargs['youtube'])
+    controller.register_scraper(scraper=YoutubeScraper())
+    controller.scrape_channels(channels=[youtube_channel], archive_media=False)
+
+def test_scrape_youtube_channel(controller, channel_kwargs):
+    """Call reset_db(), then scrape the Youtube test channel with media archiving on."""
+    controller.reset_db()
+
+    youtube_channel = Channel(**channel_kwargs['youtube'])
+    controller.register_scraper(scraper=YoutubeScraper())
+    controller.scrape_channels(channels=[youtube_channel], archive_media=True)