Added a YouTube scraper and switched from the official youtube-dl package to yt-dlp, because yt-dlp's download speed for YouTube videos is much better.

This commit is contained in:
Tristan Lee
2022-03-11 17:19:52 -06:00
parent 821c39004b
commit 965bf1e2dc
8 changed files with 121 additions and 9 deletions

View File

@@ -16,7 +16,7 @@ snscrape = {git = "https://github.com/bellingcat/snscrape.git"}
ffmpeg-python = "*"
polyphemus = {git = "https://github.com/bellingcat/polyphemus.git"}
garc = "*"
youtube-dl = "*"
yt-dlp = "*"
telethon = "*"
pytesseract = "*"
pyexiftool = {git = "https://github.com/smarnach/pyexiftool.git"}

View File

@@ -8,4 +8,5 @@ from .rumble import RumbleScraper
from .telegram_snscrape import TelegramSnscrapeScraper
from .telegram_telethon import TelegramTelethonScraper
from .twitter import TwitterScraper
from .vkontakte import VkontakteScraper
from .vkontakte import VkontakteScraper
from .youtube import YoutubeScraper

View File

@@ -8,7 +8,7 @@ import boto3
from loguru import logger
import ffmpeg
from sqlalchemy.orm import sessionmaker
import youtube_dl
import yt_dlp
from cisticola.base import Channel, ScraperResult, mapper_registry
from cisticola.utils import make_request
@@ -70,7 +70,7 @@ class Scraper:
return blob, content_type, key
def youtubedl_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
def ytdlp_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
content_type = 'video/mp4'
@@ -82,13 +82,13 @@ class Scraper:
"noplaylist": True,
'quiet': True,
"verbose": False,}
ydl = youtube_dl.YoutubeDL(ydl_opts)
ydl = yt_dlp.YoutubeDL(ydl_opts)
try:
meta = ydl.extract_info(
url,
download=True,)
except youtube_dl.utils.DownloadError as e:
except yt_dlp.utils.DownloadError as e:
raise e
else:
video_id = meta["id"]

View File

@@ -34,7 +34,7 @@ class RumbleScraper(Scraper):
url = post['media_url']
media_blob, content_type, key = self.youtubedl_url_to_blob(url)
media_blob, content_type, key = self.ytdlp_url_to_blob(url)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[post['media_url']] = archived_url

View File

@@ -51,7 +51,7 @@ class VkontakteScraper(Scraper):
if post.video:
url = post.video.url
media_blob, content_type, key = self.youtubedl_url_to_blob(url)
media_blob, content_type, key = self.ytdlp_url_to_blob(url)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[url] = archived_url

View File

@@ -0,0 +1,79 @@
from datetime import datetime, timezone
import json
from typing import Generator
import tempfile
import yt_dlp
from cisticola.base import Channel, ScraperResult
from cisticola.scraper import Scraper
class YoutubeScraper(Scraper):
    """An implementation of a Scraper for Youtube, using yt-dlp."""

    __version__ = "YoutubeScraper 0.0.1"

    def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
        """Yield one ScraperResult per video of ``channel`` newer than ``since``.

        Parameters
        ----------
        channel:
            Channel whose ``url`` is handed to yt-dlp.
        since:
            Previously scraped result; only videos whose upload date is
            strictly later than ``since.date`` are yielded. ``None`` yields
            every video yt-dlp returns.
        archive_media:
            When true, yt-dlp downloads each video into a temporary
            directory and the file is archived through ``archive_blob``.

        Raises
        ------
        yt_dlp.utils.DownloadError
            Propagated unchanged from ``extract_info``.
        """
        content_type = 'video/mp4'
        date_format = '%Y%m%d'

        if since is None:
            since_date = datetime.min
            start_date = None
        else:
            since_date = since.date
            # Upload dates are parsed below as naive datetimes. Comparing them
            # with an aware ``since.date`` would raise TypeError, so normalise
            # an aware value to naive UTC first.
            if since_date.tzinfo is not None:
                since_date = since_date.astimezone(timezone.utc).replace(tzinfo=None)
            start_date = since_date.strftime(date_format)

        with tempfile.TemporaryDirectory() as temp_dir:
            ydl_opts = {
                "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
                "merge_output_format": "mp4",
                "outtmpl": f"{temp_dir}/%(id)s.%(ext)s",
                "daterange": yt_dlp.utils.DateRange(start=start_date)}
            ydl = yt_dlp.YoutubeDL(ydl_opts)

            meta = ydl.extract_info(channel.url, download=archive_media)

            for video in meta['entries']:
                # Parse once; reused for both the filter and the result.
                uploaded = datetime.strptime(video['upload_date'], date_format)
                if not since_date < uploaded:
                    continue

                video_id = video["id"]
                archived_urls = {}

                if archive_media:
                    key = f"{video_id}.{video['ext']}"
                    with open(f"{temp_dir}/{key}", "rb") as f:
                        media_blob = f.read()
                    # Archive exactly once per video (the blob was previously
                    # uploaded twice).
                    archived_urls[video['webpage_url']] = self.archive_blob(
                        media_blob, content_type, key)

                yield ScraperResult(
                    scraper=self.__version__,
                    platform="Youtube",
                    channel=channel.id,
                    platform_id=video_id,
                    date=uploaded.replace(tzinfo=timezone.utc),
                    date_archived=datetime.now(timezone.utc),
                    raw_data=json.dumps(video, default=str),
                    archived_urls=archived_urls)

    def can_handle(self, channel):
        """Return True when ``channel`` is a Youtube channel with a URL set."""
        return channel.platform == "Youtube" and bool(channel.url)

View File

@@ -126,6 +126,21 @@ VKONTAKTE_CHANNEL_KWARGS = {
'chat': False,
'notes': ''}
# Keyword arguments describing the Youtube test channel; exposed to tests
# under the 'youtube' key of the channel_kwargs mapping.
YOUTUBE_CHANNEL_KWARGS = {
'id': 7,
'name': 'AnEs87 (test)',
'platform_id': 'UCP6exBqGoxGLv_pM9Dxk2pA',
'category': 'test',
'followers': None,
'platform': 'Youtube',
'url': 'https://www.youtube.com/channel/UCP6exBqGoxGLv_pM9Dxk2pA',
'screenname': 'AnEs87',
'country': 'SV',
'influencer': None,
'public': True,
'chat': False,
'notes': ''}
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
@pytest.fixture(scope='package')
@@ -158,6 +173,7 @@ def channel_kwargs():
'rumble' : RUMBLE_CHANNEL_KWARGS,
'telegram' : TELEGRAM_CHANNEL_KWARGS,
'twitter' : TWITTER_CHANNEL_KWARGS,
'vkontakte' : VKONTAKTE_CHANNEL_KWARGS}
'vkontakte' : VKONTAKTE_CHANNEL_KWARGS,
'youtube' : YOUTUBE_CHANNEL_KWARGS}
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

16
tests/scraper/youtube.py Normal file
View File

@@ -0,0 +1,16 @@
from cisticola.base import Channel
from cisticola.scraper import YoutubeScraper
def test_scrape_youtube_channel_no_media(controller, channel_kwargs):
    """Scrape the Youtube test channel with media archiving disabled."""
    youtube_channel = Channel(**channel_kwargs['youtube'])
    scraper = YoutubeScraper()

    controller.register_scraper(scraper=scraper)
    controller.scrape_channels(channels=[youtube_channel], archive_media=False)
def test_scrape_youtube_channel(controller, channel_kwargs):
    """Reset the database, then scrape the Youtube test channel with media archiving enabled."""
    controller.reset_db()

    youtube_channel = Channel(**channel_kwargs['youtube'])
    scraper = YoutubeScraper()

    controller.register_scraper(scraper=scraper)
    controller.scrape_channels(channels=[youtube_channel], archive_media=True)