Merge branch 'main' into odysee-refactor

This commit is contained in:
Tristan Lee
2022-04-12 23:26:18 -05:00
8 changed files with 100 additions and 868 deletions

2
.gitignore vendored
View File

@@ -9,9 +9,11 @@ docs/source/_*
*.db *.db
.env .env
*.session *.session
*.session-journal
service_account.json service_account.json
.vscode/ .vscode/
*.log *.log
*.lock
# Unit test / coverage reports # Unit test / coverage reports
reports reports

874
Pipfile.lock generated

File diff suppressed because it is too large Load Diff

10
app.py
View File

@@ -13,6 +13,9 @@ from cisticola.scraper import (
VkontakteScraper, VkontakteScraper,
TelegramTelethonScraper, TelegramTelethonScraper,
GettrScraper, GettrScraper,
BitchuteScraper,
YoutubeScraper,
RumbleScraper,
) )
@@ -92,7 +95,12 @@ def get_scraper_controller():
controller = ScraperController() controller = ScraperController()
controller.connect_to_db(engine) controller.connect_to_db(engine)
scrapers = [TelegramTelethonScraper(), VkontakteScraper(), GettrScraper()] scrapers = [VkontakteScraper(),
TelegramTelethonScraper(),
GettrScraper(),
BitchuteScraper(),
RumbleScraper(),
YoutubeScraper()]
controller.register_scrapers(scrapers) controller.register_scrapers(scrapers)

View File

@@ -10,7 +10,8 @@ from loguru import logger
import ffmpeg import ffmpeg
from sqlalchemy.orm import sessionmaker from sqlalchemy.orm import sessionmaker
import yt_dlp import yt_dlp
from sqlalchemy.sql.expression import func from sqlalchemy.sql.expression import func
from pathlib import Path
from cisticola.base import Channel, ScraperResult, mapper_registry from cisticola.base import Channel, ScraperResult, mapper_registry
from cisticola.utils import make_request from cisticola.utils import make_request
@@ -181,13 +182,20 @@ class Scraper:
content_type = 'video/mp4' content_type = 'video/mp4'
with tempfile.TemporaryDirectory() as temp_dir: with tempfile.TemporaryDirectory() as temp_dir:
cookiefile = Path(temp_dir)/self.cookiefilename
with open(cookiefile, 'w') as f:
f.write(self.cookiestring)
ydl_opts = { ydl_opts = {
"format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best", "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
"merge_output_format": "mp4", "merge_output_format": "mp4",
"outtmpl": f"{temp_dir}/%(id)s.%(ext)s", "outtmpl": f"{temp_dir}/%(id)s.%(ext)s",
"noplaylist": True, "noplaylist": True,
'quiet': True, "quiet": True,
"verbose": False,} "verbose": False,
"retries": 5,
"cookiefile": cookiefile}
ydl = yt_dlp.YoutubeDL(ydl_opts) ydl = yt_dlp.YoutubeDL(ydl_opts)
try: try:
@@ -421,7 +429,7 @@ class ScraperController:
# this query is really slow (~2.5 minutes) because of the shuffle. shuffling is so that multiple media archivers could work # this query is really slow (~2.5 minutes) because of the shuffle. shuffling is so that multiple media archivers could work
# simultaneously with low risk of collision (at least while the number of unarchived items is very large) # simultaneously with low risk of collision (at least while the number of unarchived items is very large)
posts = session.query(ScraperResult).where(ScraperResult.media_archived == None).order_by(func.random()).limit(10000).all() posts = session.query(ScraperResult).where(ScraperResult.media_archived == None).order_by(func.random()).limit(4000).all()
logger.info(f"Found {len(posts)} posts without media. Archiving now") logger.info(f"Found {len(posts)} posts without media. Archiving now")

View File

@@ -16,7 +16,7 @@ from cisticola.scraper.base import Scraper
class BitchuteScraper(Scraper): class BitchuteScraper(Scraper):
"""An implementation of a Scraper for Bitchute, using classes from the 4cat """An implementation of a Scraper for Bitchute, using classes from the 4cat
library""" library"""
__version__ = "BitchuteScraper 0.0.0" __version__ = "BitchuteScraper 0.0.1"
def get_username_from_url(self, url): def get_username_from_url(self, url):
username = url.split('bitchute.com/channel/')[-1].strip('/') username = url.split('bitchute.com/channel/')[-1].strip('/')

View File

@@ -5,6 +5,7 @@ from urllib.parse import urlparse
from loguru import logger from loguru import logger
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import os
from cisticola.base import Channel, ScraperResult, RawChannelInfo from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper import Scraper, make_request from cisticola.scraper import Scraper, make_request
@@ -13,7 +14,10 @@ BASE_URL = 'https://rumble.com'
class RumbleScraper(Scraper): class RumbleScraper(Scraper):
"""An implementation of a Scraper for Rumble, using custom functions""" """An implementation of a Scraper for Rumble, using custom functions"""
__version__ = "RumbleScraper 0.0.0" __version__ = "RumbleScraper 0.0.1"
cookiestring = os.environ["YOUTUBE_COOKIESTRING"].replace(r'\n', '\n').replace(r'\t', '\t')
cookiefilename = 'cookiefile.txt'
@logger.catch @logger.catch
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:

View File

@@ -13,7 +13,10 @@ from cisticola.scraper import Scraper
class YoutubeScraper(Scraper): class YoutubeScraper(Scraper):
"""An implementation of a Scraper for Youtube, using youtube-dl""" """An implementation of a Scraper for Youtube, using youtube-dl"""
__version__ = "YoutubeScraper 0.0.0" __version__ = "YoutubeScraper 0.0.1"
cookiestring = os.environ["YOUTUBE_COOKIESTRING"].replace(r'\n', '\n').replace(r'\t', '\t')
cookiefilename = 'cookiefile.txt'
@logger.catch @logger.catch
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
@@ -29,13 +32,21 @@ class YoutubeScraper(Scraper):
with tempfile.TemporaryDirectory() as temp_dir: with tempfile.TemporaryDirectory() as temp_dir:
cookiefile = Path(temp_dir)/self.cookiefilename
with open(cookiefile, 'w') as f:
f.write(self.cookiestring)
daterange = yt_dlp.utils.DateRange(start = start_date) daterange = yt_dlp.utils.DateRange(start = start_date)
ydl_opts = { ydl_opts = {
"format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best", "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
"merge_output_format": "mp4", "merge_output_format": "mp4",
"outtmpl": f"{temp_dir}/%(id)s.%(ext)s", "outtmpl": f"{temp_dir}/%(id)s.%(ext)s",
"daterange" : daterange} "daterange" : daterange,
"quiet": True,
"verbose": False,
"retries": 5,
"cookiefile": cookiefile}
ydl = yt_dlp.YoutubeDL(ydl_opts) ydl = yt_dlp.YoutubeDL(ydl_opts)
@@ -92,10 +103,18 @@ class YoutubeScraper(Scraper):
with tempfile.TemporaryDirectory() as temp_dir: with tempfile.TemporaryDirectory() as temp_dir:
cookiefile = Path(temp_dir)/self.cookiefilename
with open(cookiefile, 'w') as f:
f.write(self.cookiestring)
ydl_opts = { ydl_opts = {
"format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best", "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
"merge_output_format": "mp4", "merge_output_format": "mp4",
"outtmpl": f"{temp_dir}/%(id)s.%(ext)s"} "outtmpl": f"{temp_dir}/%(id)s.%(ext)s",
"quiet": True,
"verbose": False,
"retries": 5,
"cookiefile": cookiefile}
ydl = yt_dlp.YoutubeDL(ydl_opts) ydl = yt_dlp.YoutubeDL(ydl_opts)
@@ -104,7 +123,7 @@ class YoutubeScraper(Scraper):
except yt_dlp.utils.DownloadError as e: except yt_dlp.utils.DownloadError as e:
raise e raise e
files = os.listdir(temp_dir) files = [file for file in os.listdir(temp_dir) if file != self.cookiefilename]
if len(files) != 1: if len(files) != 1:
logger.warning(f'{len(files)} files downloaded for video: {url}') logger.warning(f'{len(files)} files downloaded for video: {url}')
key = files[0] key = files[0]
@@ -120,7 +139,12 @@ class YoutubeScraper(Scraper):
return result return result
def get_profile(self, channel: Channel) -> RawChannelInfo: def get_profile(self, channel: Channel) -> RawChannelInfo:
ydl_opts = {}
ydl_opts = {
"quiet": True,
"verbose": False,
"retries": 5}
ydl = yt_dlp.YoutubeDL(ydl_opts) ydl = yt_dlp.YoutubeDL(ydl_opts)
meta = None meta = None

View File

@@ -31,21 +31,15 @@ def make_request(url, headers = None, max_retries = 5, break_codes = None):
r = None r = None
for n_retries in range(max_retries): try:
try: r = request_until_200(
r = request_until_200( url = url,
url = url, headers = headers,
headers = headers, max_retries = max_retries,
max_retries = max_retries, break_codes = break_codes)
break_codes = break_codes) logger.debug(f"Request for url: {url} succeeded")
logger.debug(f"Request for url: {url} succeeded on attempt: {n_retries}/{max_retries}") except Exception as e:
except Exception as e: logger.warning(f"Request for url: {url} raised exception: [{e}]")
logger.warning(f"Request for url: {url} raised exception: [{e}] on attempt: {n_retries}/{max_retries}")
continue
else:
break
else:
logger.error(f"Request for url: {url} failed after {max_retries} attempts")
return r return r