From 965bf1e2dc5013ee5865be4885973be872cedb97 Mon Sep 17 00:00:00 2001
From: Tristan Lee
Date: Fri, 11 Mar 2022 17:19:52 -0600
Subject: [PATCH] added youtube scraper, moved from official youtube-dl repo to using yt-dlp because download speed for youtube videos is much better

---
 Pipfile                        |  2 +-
 cisticola/scraper/__init__.py  |  3 +-
 cisticola/scraper/base.py      |  8 ++--
 cisticola/scraper/rumble.py    |  2 +-
 cisticola/scraper/vkontakte.py |  2 +-
 cisticola/scraper/youtube.py   | 84 ++++++++++++++++++++++++++++++++++
 tests/conftest.py              | 18 +++++++-
 tests/scraper/youtube.py       | 16 +++++++
 8 files changed, 126 insertions(+), 9 deletions(-)
 create mode 100644 cisticola/scraper/youtube.py
 create mode 100644 tests/scraper/youtube.py

diff --git a/Pipfile b/Pipfile
index 5f86225..0337328 100644
--- a/Pipfile
+++ b/Pipfile
@@ -16,7 +16,7 @@ snscrape = {git = "https://github.com/bellingcat/snscrape.git"}
 ffmpeg-python = "*"
 polyphemus = {git = "https://github.com/bellingcat/polyphemus.git"}
 garc = "*"
-youtube-dl = "*"
+yt-dlp = "*"
 telethon = "*"
 pytesseract = "*"
 pyexiftool = {git = "https://github.com/smarnach/pyexiftool.git"}
diff --git a/cisticola/scraper/__init__.py b/cisticola/scraper/__init__.py
index 92a3e7a..f5240d3 100644
--- a/cisticola/scraper/__init__.py
+++ b/cisticola/scraper/__init__.py
@@ -8,4 +8,5 @@ from .rumble import RumbleScraper
 from .telegram_snscrape import TelegramSnscrapeScraper
 from .telegram_telethon import TelegramTelethonScraper
 from .twitter import TwitterScraper
-from .vkontakte import VkontakteScraper
\ No newline at end of file
+from .vkontakte import VkontakteScraper
+from .youtube import YoutubeScraper
\ No newline at end of file
diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py
index 9666faf..d7f69a1 100644
--- a/cisticola/scraper/base.py
+++ b/cisticola/scraper/base.py
@@ -8,7 +8,7 @@ import boto3
 from loguru import logger
 import ffmpeg
 from sqlalchemy.orm import sessionmaker
-import youtube_dl
+import yt_dlp
 
 from cisticola.base import Channel, ScraperResult, mapper_registry
 from cisticola.utils import make_request
@@ -70,7 +70,7 @@ class Scraper:
 
         return blob, content_type, key
 
-    def youtubedl_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
+    def ytdlp_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
 
         content_type = 'video/mp4'
 
@@ -82,13 +82,13 @@ class Scraper:
                 "noplaylist": True,
                 'quiet': True,
                 "verbose": False,}
-            ydl = youtube_dl.YoutubeDL(ydl_opts)
+            ydl = yt_dlp.YoutubeDL(ydl_opts)
 
             try:
                 meta = ydl.extract_info(
                     url,
                     download=True,)
-            except youtube_dl.utils.DownloadError as e:
+            except yt_dlp.utils.DownloadError as e:
                 raise e
             else:
                 video_id = meta["id"]
diff --git a/cisticola/scraper/rumble.py b/cisticola/scraper/rumble.py
index f0cdefb..8546d6e 100644
--- a/cisticola/scraper/rumble.py
+++ b/cisticola/scraper/rumble.py
@@ -34,7 +34,7 @@ class RumbleScraper(Scraper):
 
                 url = post['media_url']
 
-                media_blob, content_type, key = self.youtubedl_url_to_blob(url)
+                media_blob, content_type, key = self.ytdlp_url_to_blob(url)
                 archived_url = self.archive_blob(media_blob, content_type, key)
                 archived_urls[post['media_url']] = archived_url
 
diff --git a/cisticola/scraper/vkontakte.py b/cisticola/scraper/vkontakte.py
index 7535324..7ca5659 100644
--- a/cisticola/scraper/vkontakte.py
+++ b/cisticola/scraper/vkontakte.py
@@ -51,7 +51,7 @@ class VkontakteScraper(Scraper):
                 if post.video:
                     url = post.video.url
 
-                    media_blob, content_type, key = self.youtubedl_url_to_blob(url)
+                    media_blob, content_type, key = self.ytdlp_url_to_blob(url)
                     archived_url = self.archive_blob(media_blob, content_type, key)
                     archived_urls[url] = archived_url
 
diff --git a/cisticola/scraper/youtube.py b/cisticola/scraper/youtube.py
new file mode 100644
index 0000000..20ae6a3
--- /dev/null
+++ b/cisticola/scraper/youtube.py
@@ -0,0 +1,84 @@
+from datetime import datetime, timezone
+import json
+from typing import Generator
+import tempfile
+
+import yt_dlp
+
+from cisticola.base import Channel, ScraperResult
+from cisticola.scraper import Scraper
+
+class YoutubeScraper(Scraper):
+    """An implementation of a Scraper for Youtube, using yt-dlp"""
+    __version__ = "YoutubeScraper 0.0.1"
+
+    def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
+        """Yield a ScraperResult for each video on `channel` uploaded after `since.date`.
+
+        If `archive_media` is True, each video is downloaded with yt-dlp and
+        archived with `archive_blob`; otherwise only metadata is extracted.
+        """
+
+        content_type = 'video/mp4'
+
+        if since is None:
+            # Aware lower bound, comparable with the UTC upload dates below
+            since_date = datetime.min.replace(tzinfo=timezone.utc)
+            start_date = None
+        else:
+            since_date = since.date
+            if since_date.tzinfo is None:
+                # Naive and aware datetimes cannot be compared; assume UTC
+                since_date = since_date.replace(tzinfo=timezone.utc)
+            start_date = since.date.strftime('%Y%m%d')
+
+        with tempfile.TemporaryDirectory() as temp_dir:
+
+            daterange = yt_dlp.utils.DateRange(start = start_date)
+
+            ydl_opts = {
+                "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
+                "merge_output_format": "mp4",
+                "outtmpl": f"{temp_dir}/%(id)s.%(ext)s",
+                "daterange" : daterange}
+
+            ydl = yt_dlp.YoutubeDL(ydl_opts)
+
+            # yt_dlp.utils.DownloadError propagates to the caller
+            meta = ydl.extract_info(
+                channel.url,
+                download=archive_media)
+
+            for video in meta['entries']:
+
+                # upload_date has day resolution (YYYYMMDD); treat it as midnight UTC
+                date = datetime.strptime(video['upload_date'], '%Y%m%d').replace(tzinfo=timezone.utc)
+                if date <= since_date:
+                    continue
+
+                archived_urls = {}
+                video_id = video["id"]
+
+                if archive_media:
+
+                    key = f"{video_id}.{video['ext']}"
+
+                    with open(f"{temp_dir}/{key}", "rb") as f:
+                        media_blob = f.read()
+
+                    # Upload each video exactly once
+                    archived_urls[video['webpage_url']] = self.archive_blob(media_blob, content_type, key)
+
+                yield ScraperResult(
+                    scraper=self.__version__,
+                    platform="Youtube",
+                    channel=channel.id,
+                    platform_id=video_id,
+                    date=date,
+                    date_archived=datetime.now(timezone.utc),
+                    raw_data=json.dumps(video, default = str),
+                    archived_urls=archived_urls)
+
+    def can_handle(self, channel):
+        """Return True if `channel` is a Youtube channel with a URL."""
+        return channel.platform == "Youtube" and bool(channel.url)
\ No newline at end of file
diff --git a/tests/conftest.py b/tests/conftest.py
index 26bd92b..338fcb9 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -126,6 +126,21 @@ VKONTAKTE_CHANNEL_KWARGS = {
     'chat': False,
     'notes': ''}
 
+YOUTUBE_CHANNEL_KWARGS = {
+    'id': 7,
+    'name': 'AnEs87 (test)',
+    'platform_id': 'UCP6exBqGoxGLv_pM9Dxk2pA',
+    'category': 'test',
+    'followers': None,
+    'platform': 'Youtube',
+    'url': 'https://www.youtube.com/channel/UCP6exBqGoxGLv_pM9Dxk2pA',
+    'screenname': 'AnEs87',
+    'country': 'SV',
+    'influencer': None,
+    'public': True,
+    'chat': False,
+    'notes': ''}
+
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
 
 @pytest.fixture(scope='package')
@@ -158,6 +173,7 @@ def channel_kwargs():
         'rumble' : RUMBLE_CHANNEL_KWARGS,
         'telegram' : TELEGRAM_CHANNEL_KWARGS,
         'twitter' : TWITTER_CHANNEL_KWARGS,
-        'vkontakte' : VKONTAKTE_CHANNEL_KWARGS}
+        'vkontakte' : VKONTAKTE_CHANNEL_KWARGS,
+        'youtube' : YOUTUBE_CHANNEL_KWARGS}
 
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
\ No newline at end of file
diff --git a/tests/scraper/youtube.py b/tests/scraper/youtube.py
new file mode 100644
index 0000000..9d14760
--- /dev/null
+++ b/tests/scraper/youtube.py
@@ -0,0 +1,16 @@
+from cisticola.base import Channel
+from cisticola.scraper import YoutubeScraper
+
+def test_scrape_youtube_channel_no_media(controller, channel_kwargs):
+
+    channels = [Channel(**channel_kwargs['youtube'])]
+    controller.register_scraper(scraper = YoutubeScraper())
+    controller.scrape_channels(channels = channels, archive_media = False)
+
+def test_scrape_youtube_channel(controller, channel_kwargs):
+
+    controller.reset_db()
+
+    channels = [Channel(**channel_kwargs['youtube'])]
+    controller.register_scraper(scraper = YoutubeScraper())
+    controller.scrape_channels(channels = channels, archive_media = True)