mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-11 12:58:33 +03:00
Added a YouTube scraper and switched from the official youtube-dl package to yt-dlp, because its download speed for YouTube videos is much better
This commit is contained in:
2
Pipfile
2
Pipfile
@@ -16,7 +16,7 @@ snscrape = {git = "https://github.com/bellingcat/snscrape.git"}
|
||||
ffmpeg-python = "*"
|
||||
polyphemus = {git = "https://github.com/bellingcat/polyphemus.git"}
|
||||
garc = "*"
|
||||
youtube-dl = "*"
|
||||
yt-dlp = "*"
|
||||
telethon = "*"
|
||||
pytesseract = "*"
|
||||
pyexiftool = {git = "https://github.com/smarnach/pyexiftool.git"}
|
||||
|
||||
@@ -8,4 +8,5 @@ from .rumble import RumbleScraper
|
||||
from .telegram_snscrape import TelegramSnscrapeScraper
|
||||
from .telegram_telethon import TelegramTelethonScraper
|
||||
from .twitter import TwitterScraper
|
||||
from .vkontakte import VkontakteScraper
|
||||
from .vkontakte import VkontakteScraper
|
||||
from .youtube import YoutubeScraper
|
||||
@@ -8,7 +8,7 @@ import boto3
|
||||
from loguru import logger
|
||||
import ffmpeg
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
import youtube_dl
|
||||
import yt_dlp
|
||||
|
||||
from cisticola.base import Channel, ScraperResult, mapper_registry
|
||||
from cisticola.utils import make_request
|
||||
@@ -70,7 +70,7 @@ class Scraper:
|
||||
|
||||
return blob, content_type, key
|
||||
|
||||
def youtubedl_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
|
||||
def ytdlp_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
|
||||
|
||||
content_type = 'video/mp4'
|
||||
|
||||
@@ -82,13 +82,13 @@ class Scraper:
|
||||
"noplaylist": True,
|
||||
'quiet': True,
|
||||
"verbose": False,}
|
||||
ydl = youtube_dl.YoutubeDL(ydl_opts)
|
||||
ydl = yt_dlp.YoutubeDL(ydl_opts)
|
||||
|
||||
try:
|
||||
meta = ydl.extract_info(
|
||||
url,
|
||||
download=True,)
|
||||
except youtube_dl.utils.DownloadError as e:
|
||||
except yt_dlp.utils.DownloadError as e:
|
||||
raise e
|
||||
else:
|
||||
video_id = meta["id"]
|
||||
|
||||
@@ -34,7 +34,7 @@ class RumbleScraper(Scraper):
|
||||
|
||||
url = post['media_url']
|
||||
|
||||
media_blob, content_type, key = self.youtubedl_url_to_blob(url)
|
||||
media_blob, content_type, key = self.ytdlp_url_to_blob(url)
|
||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||
archived_urls[post['media_url']] = archived_url
|
||||
|
||||
|
||||
@@ -51,7 +51,7 @@ class VkontakteScraper(Scraper):
|
||||
|
||||
if post.video:
|
||||
url = post.video.url
|
||||
media_blob, content_type, key = self.youtubedl_url_to_blob(url)
|
||||
media_blob, content_type, key = self.ytdlp_url_to_blob(url)
|
||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||
archived_urls[url] = archived_url
|
||||
|
||||
|
||||
79
cisticola/scraper/youtube.py
Normal file
79
cisticola/scraper/youtube.py
Normal file
@@ -0,0 +1,79 @@
|
||||
from datetime import datetime, timezone
|
||||
import json
|
||||
from typing import Generator
|
||||
import tempfile
|
||||
|
||||
import yt_dlp
|
||||
|
||||
from cisticola.base import Channel, ScraperResult
|
||||
from cisticola.scraper import Scraper
|
||||
|
||||
class YoutubeScraper(Scraper):
    """An implementation of a Scraper for Youtube, using yt-dlp."""

    __version__ = "YoutubeScraper 0.0.1"

    @staticmethod
    def _as_utc(moment: datetime) -> datetime:
        """Return ``moment`` as a timezone-aware UTC datetime.

        Naive values are assumed to already be expressed in UTC, which is how
        this scraper stamps the ``date`` of the results it yields.
        """
        if moment.tzinfo is None:
            return moment.replace(tzinfo=timezone.utc)
        return moment.astimezone(timezone.utc)

    @staticmethod
    def _upload_datetime(video: dict) -> datetime:
        """Parse a video's ``upload_date`` (``YYYYMMDD``) as midnight UTC."""
        parsed = datetime.strptime(video['upload_date'], '%Y%m%d')
        return parsed.replace(tzinfo=timezone.utc)

    def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
        """Yield one ScraperResult per video found at ``channel.url``.

        Parameters
        ----------
        channel:
            Channel whose ``url`` is handed to yt-dlp and whose ``id`` is
            recorded on every result.
        since:
            Previously scraped result. When given, only videos whose upload
            date is strictly later than ``since.date`` are yielded.
        archive_media:
            When True, yt-dlp downloads each video into a temporary directory
            and the file is archived through ``self.archive_blob``; when
            False only metadata is extracted.

        Raises
        ------
        yt_dlp.utils.DownloadError
            Propagated unchanged from ``YoutubeDL.extract_info``.
        """
        content_type = 'video/mp4'

        if since is None:
            since_date = None
            start_date = None
        else:
            # Normalise to aware UTC so the comparison with upload dates below
            # cannot raise TypeError by mixing naive and aware datetimes.
            since_date = self._as_utc(since.date)
            start_date = since_date.strftime('%Y%m%d')

        # The directory must outlive the loop: downloaded files are read back
        # from it while results are being yielded.
        with tempfile.TemporaryDirectory() as temp_dir:

            daterange = yt_dlp.utils.DateRange(start=start_date)

            ydl_opts = {
                "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
                "merge_output_format": "mp4",
                "outtmpl": f"{temp_dir}/%(id)s.%(ext)s",
                "daterange": daterange}

            ydl = yt_dlp.YoutubeDL(ydl_opts)

            meta = ydl.extract_info(
                channel.url,
                download=archive_media)

            for video in meta['entries']:

                upload_date = self._upload_datetime(video)

                # NOTE(review): presumably yt-dlp still lists entries rejected
                # by ``daterange`` in the metadata, hence this explicit filter
                # -- confirm against the yt-dlp documentation.
                if since_date is not None and not since_date < upload_date:
                    continue

                archived_urls = {}
                video_id = video["id"]
                video_ext = video["ext"]

                if archive_media:

                    # File name follows the "outtmpl" pattern configured above.
                    key = f"{video_id}.{video_ext}"

                    with open(f"{temp_dir}/{key}", "rb") as f:
                        media_blob = f.read()

                    # Archive exactly once per video.
                    archived_url = self.archive_blob(media_blob, content_type, key)
                    archived_urls[video['webpage_url']] = archived_url

                yield ScraperResult(
                    scraper=self.__version__,
                    platform="Youtube",
                    channel=channel.id,
                    platform_id=video_id,
                    date=upload_date,
                    date_archived=datetime.now(timezone.utc),
                    raw_data=json.dumps(video, default=str),
                    archived_urls=archived_urls)

    def can_handle(self, channel):
        """Return True when ``channel`` is a Youtube channel with a non-empty URL."""
        return channel.platform == "Youtube" and bool(channel.url)
|
||||
@@ -126,6 +126,21 @@ VKONTAKTE_CHANNEL_KWARGS = {
|
||||
'chat': False,
|
||||
'notes': ''}
|
||||
|
||||
# Keyword arguments for building the Youtube test Channel used by the scraper tests.
YOUTUBE_CHANNEL_KWARGS = {
    'id': 7,
    'name': 'AnEs87 (test)',
    'platform_id': 'UCP6exBqGoxGLv_pM9Dxk2pA',
    'category': 'test',
    'followers': None,
    'platform': 'Youtube',
    'url': 'https://www.youtube.com/channel/UCP6exBqGoxGLv_pM9Dxk2pA',
    'screenname': 'AnEs87',
    'country': 'SV',
    'influencer': None,
    'public': True,
    'chat': False,
    'notes': '',
}
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
@pytest.fixture(scope='package')
|
||||
@@ -158,6 +173,7 @@ def channel_kwargs():
|
||||
'rumble' : RUMBLE_CHANNEL_KWARGS,
|
||||
'telegram' : TELEGRAM_CHANNEL_KWARGS,
|
||||
'twitter' : TWITTER_CHANNEL_KWARGS,
|
||||
'vkontakte' : VKONTAKTE_CHANNEL_KWARGS}
|
||||
'vkontakte' : VKONTAKTE_CHANNEL_KWARGS,
|
||||
'youtube' : YOUTUBE_CHANNEL_KWARGS}
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
16
tests/scraper/youtube.py
Normal file
16
tests/scraper/youtube.py
Normal file
@@ -0,0 +1,16 @@
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import YoutubeScraper
|
||||
|
||||
def test_scrape_youtube_channel_no_media(controller, channel_kwargs):
    """Scrape the Youtube test channel with media archiving disabled.

    Smoke test: it makes no assertions and passes when nothing raises.
    """
    youtube_channel = Channel(**channel_kwargs['youtube'])
    controller.register_scraper(scraper=YoutubeScraper())
    controller.scrape_channels(channels=[youtube_channel], archive_media=False)
|
||||
|
||||
def test_scrape_youtube_channel(controller, channel_kwargs):
    """Scrape the Youtube test channel with media archiving enabled.

    The controller's database is reset first. Smoke test: it makes no
    assertions and passes when nothing raises.
    """
    controller.reset_db()

    youtube_channel = Channel(**channel_kwargs['youtube'])
    controller.register_scraper(scraper=YoutubeScraper())
    controller.scrape_channels(channels=[youtube_channel], archive_media=True)
|
||||
Reference in New Issue
Block a user