mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-13 05:48:33 +03:00
Merge branch 'main' into odysee-refactor
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -9,9 +9,11 @@ docs/source/_*
|
|||||||
*.db
|
*.db
|
||||||
.env
|
.env
|
||||||
*.session
|
*.session
|
||||||
|
*.session-journal
|
||||||
service_account.json
|
service_account.json
|
||||||
.vscode/
|
.vscode/
|
||||||
*.log
|
*.log
|
||||||
|
*.lock
|
||||||
|
|
||||||
# Unit test / coverage reports
|
# Unit test / coverage reports
|
||||||
reports
|
reports
|
||||||
|
|||||||
874
Pipfile.lock
generated
874
Pipfile.lock
generated
File diff suppressed because it is too large
Load Diff
10
app.py
10
app.py
@@ -13,6 +13,9 @@ from cisticola.scraper import (
|
|||||||
VkontakteScraper,
|
VkontakteScraper,
|
||||||
TelegramTelethonScraper,
|
TelegramTelethonScraper,
|
||||||
GettrScraper,
|
GettrScraper,
|
||||||
|
BitchuteScraper,
|
||||||
|
YoutubeScraper,
|
||||||
|
RumbleScraper,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -92,7 +95,12 @@ def get_scraper_controller():
|
|||||||
controller = ScraperController()
|
controller = ScraperController()
|
||||||
controller.connect_to_db(engine)
|
controller.connect_to_db(engine)
|
||||||
|
|
||||||
scrapers = [TelegramTelethonScraper(), VkontakteScraper(), GettrScraper()]
|
scrapers = [VkontakteScraper(),
|
||||||
|
TelegramTelethonScraper(),
|
||||||
|
GettrScraper(),
|
||||||
|
BitchuteScraper(),
|
||||||
|
RumbleScraper(),
|
||||||
|
YoutubeScraper()]
|
||||||
|
|
||||||
controller.register_scrapers(scrapers)
|
controller.register_scrapers(scrapers)
|
||||||
|
|
||||||
|
|||||||
@@ -10,7 +10,8 @@ from loguru import logger
|
|||||||
import ffmpeg
|
import ffmpeg
|
||||||
from sqlalchemy.orm import sessionmaker
|
from sqlalchemy.orm import sessionmaker
|
||||||
import yt_dlp
|
import yt_dlp
|
||||||
from sqlalchemy.sql.expression import func
|
from sqlalchemy.sql.expression import func
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
from cisticola.base import Channel, ScraperResult, mapper_registry
|
from cisticola.base import Channel, ScraperResult, mapper_registry
|
||||||
from cisticola.utils import make_request
|
from cisticola.utils import make_request
|
||||||
@@ -181,13 +182,20 @@ class Scraper:
|
|||||||
content_type = 'video/mp4'
|
content_type = 'video/mp4'
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as temp_dir:
|
with tempfile.TemporaryDirectory() as temp_dir:
|
||||||
|
cookiefile = Path(temp_dir)/self.cookiefilename
|
||||||
|
with open(cookiefile, 'w') as f:
|
||||||
|
f.write(self.cookiestring)
|
||||||
|
|
||||||
ydl_opts = {
|
ydl_opts = {
|
||||||
"format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
|
"format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
|
||||||
"merge_output_format": "mp4",
|
"merge_output_format": "mp4",
|
||||||
"outtmpl": f"{temp_dir}/%(id)s.%(ext)s",
|
"outtmpl": f"{temp_dir}/%(id)s.%(ext)s",
|
||||||
"noplaylist": True,
|
"noplaylist": True,
|
||||||
'quiet': True,
|
"quiet": True,
|
||||||
"verbose": False,}
|
"verbose": False,
|
||||||
|
"retries": 5,
|
||||||
|
"cookiefile": cookiefile}
|
||||||
|
|
||||||
ydl = yt_dlp.YoutubeDL(ydl_opts)
|
ydl = yt_dlp.YoutubeDL(ydl_opts)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -421,7 +429,7 @@ class ScraperController:
|
|||||||
|
|
||||||
# this query is really slow (~2.5 minutes) because of the shuffle. shuffling is so that multiple media archivers could work
|
# this query is really slow (~2.5 minutes) because of the shuffle. shuffling is so that multiple media archivers could work
|
||||||
# simultaneously with low risk of collision (at least while the number of unarchived items is very large)
|
# simultaneously with low risk of collision (at least while the number of unarchived items is very large)
|
||||||
posts = session.query(ScraperResult).where(ScraperResult.media_archived == None).order_by(func.random()).limit(10000).all()
|
posts = session.query(ScraperResult).where(ScraperResult.media_archived == None).order_by(func.random()).limit(4000).all()
|
||||||
|
|
||||||
logger.info(f"Found {len(posts)} posts without media. Archiving now")
|
logger.info(f"Found {len(posts)} posts without media. Archiving now")
|
||||||
|
|
||||||
|
|||||||
@@ -16,7 +16,7 @@ from cisticola.scraper.base import Scraper
|
|||||||
class BitchuteScraper(Scraper):
|
class BitchuteScraper(Scraper):
|
||||||
"""An implementation of a Scraper for Bitchute, using classes from the 4cat
|
"""An implementation of a Scraper for Bitchute, using classes from the 4cat
|
||||||
library"""
|
library"""
|
||||||
__version__ = "BitchuteScraper 0.0.0"
|
__version__ = "BitchuteScraper 0.0.1"
|
||||||
|
|
||||||
def get_username_from_url(self, url):
|
def get_username_from_url(self, url):
|
||||||
username = url.split('bitchute.com/channel/')[-1].strip('/')
|
username = url.split('bitchute.com/channel/')[-1].strip('/')
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ from urllib.parse import urlparse
|
|||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
import os
|
||||||
|
|
||||||
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
||||||
from cisticola.scraper import Scraper, make_request
|
from cisticola.scraper import Scraper, make_request
|
||||||
@@ -13,7 +14,10 @@ BASE_URL = 'https://rumble.com'
|
|||||||
|
|
||||||
class RumbleScraper(Scraper):
|
class RumbleScraper(Scraper):
|
||||||
"""An implementation of a Scraper for Rumble, using custom functions"""
|
"""An implementation of a Scraper for Rumble, using custom functions"""
|
||||||
__version__ = "RumbleScraper 0.0.0"
|
__version__ = "RumbleScraper 0.0.1"
|
||||||
|
|
||||||
|
cookiestring = os.environ["YOUTUBE_COOKIESTRING"].replace(r'\n', '\n').replace(r'\t', '\t')
|
||||||
|
cookiefilename = 'cookiefile.txt'
|
||||||
|
|
||||||
@logger.catch
|
@logger.catch
|
||||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||||
|
|||||||
@@ -13,7 +13,10 @@ from cisticola.scraper import Scraper
|
|||||||
|
|
||||||
class YoutubeScraper(Scraper):
|
class YoutubeScraper(Scraper):
|
||||||
"""An implementation of a Scraper for Youtube, using youtube-dl"""
|
"""An implementation of a Scraper for Youtube, using youtube-dl"""
|
||||||
__version__ = "YoutubeScraper 0.0.0"
|
__version__ = "YoutubeScraper 0.0.1"
|
||||||
|
|
||||||
|
cookiestring = os.environ["YOUTUBE_COOKIESTRING"].replace(r'\n', '\n').replace(r'\t', '\t')
|
||||||
|
cookiefilename = 'cookiefile.txt'
|
||||||
|
|
||||||
@logger.catch
|
@logger.catch
|
||||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||||
@@ -29,13 +32,21 @@ class YoutubeScraper(Scraper):
|
|||||||
|
|
||||||
with tempfile.TemporaryDirectory() as temp_dir:
|
with tempfile.TemporaryDirectory() as temp_dir:
|
||||||
|
|
||||||
|
cookiefile = Path(temp_dir)/self.cookiefilename
|
||||||
|
with open(cookiefile, 'w') as f:
|
||||||
|
f.write(self.cookiestring)
|
||||||
|
|
||||||
daterange = yt_dlp.utils.DateRange(start = start_date)
|
daterange = yt_dlp.utils.DateRange(start = start_date)
|
||||||
|
|
||||||
ydl_opts = {
|
ydl_opts = {
|
||||||
"format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
|
"format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
|
||||||
"merge_output_format": "mp4",
|
"merge_output_format": "mp4",
|
||||||
"outtmpl": f"{temp_dir}/%(id)s.%(ext)s",
|
"outtmpl": f"{temp_dir}/%(id)s.%(ext)s",
|
||||||
"daterange" : daterange}
|
"daterange" : daterange,
|
||||||
|
"quiet": True,
|
||||||
|
"verbose": False,
|
||||||
|
"retries": 5,
|
||||||
|
"cookiefile": cookiefile}
|
||||||
|
|
||||||
ydl = yt_dlp.YoutubeDL(ydl_opts)
|
ydl = yt_dlp.YoutubeDL(ydl_opts)
|
||||||
|
|
||||||
@@ -92,10 +103,18 @@ class YoutubeScraper(Scraper):
|
|||||||
|
|
||||||
with tempfile.TemporaryDirectory() as temp_dir:
|
with tempfile.TemporaryDirectory() as temp_dir:
|
||||||
|
|
||||||
|
cookiefile = Path(temp_dir)/self.cookiefilename
|
||||||
|
with open(cookiefile, 'w') as f:
|
||||||
|
f.write(self.cookiestring)
|
||||||
|
|
||||||
ydl_opts = {
|
ydl_opts = {
|
||||||
"format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
|
"format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
|
||||||
"merge_output_format": "mp4",
|
"merge_output_format": "mp4",
|
||||||
"outtmpl": f"{temp_dir}/%(id)s.%(ext)s"}
|
"outtmpl": f"{temp_dir}/%(id)s.%(ext)s",
|
||||||
|
"quiet": True,
|
||||||
|
"verbose": False,
|
||||||
|
"retries": 5,
|
||||||
|
"cookiefile": cookiefile}
|
||||||
|
|
||||||
ydl = yt_dlp.YoutubeDL(ydl_opts)
|
ydl = yt_dlp.YoutubeDL(ydl_opts)
|
||||||
|
|
||||||
@@ -104,7 +123,7 @@ class YoutubeScraper(Scraper):
|
|||||||
except yt_dlp.utils.DownloadError as e:
|
except yt_dlp.utils.DownloadError as e:
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
files = os.listdir(temp_dir)
|
files = [file for file in os.listdir(temp_dir) if file != self.cookiefilename]
|
||||||
if len(files) != 1:
|
if len(files) != 1:
|
||||||
logger.warning(f'{len(files)} files downloaded for video: {url}')
|
logger.warning(f'{len(files)} files downloaded for video: {url}')
|
||||||
key = files[0]
|
key = files[0]
|
||||||
@@ -120,7 +139,12 @@ class YoutubeScraper(Scraper):
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
def get_profile(self, channel: Channel) -> RawChannelInfo:
|
def get_profile(self, channel: Channel) -> RawChannelInfo:
|
||||||
ydl_opts = {}
|
|
||||||
|
ydl_opts = {
|
||||||
|
"quiet": True,
|
||||||
|
"verbose": False,
|
||||||
|
"retries": 5}
|
||||||
|
|
||||||
ydl = yt_dlp.YoutubeDL(ydl_opts)
|
ydl = yt_dlp.YoutubeDL(ydl_opts)
|
||||||
|
|
||||||
meta = None
|
meta = None
|
||||||
|
|||||||
@@ -31,21 +31,15 @@ def make_request(url, headers = None, max_retries = 5, break_codes = None):
|
|||||||
|
|
||||||
r = None
|
r = None
|
||||||
|
|
||||||
for n_retries in range(max_retries):
|
try:
|
||||||
try:
|
r = request_until_200(
|
||||||
r = request_until_200(
|
url = url,
|
||||||
url = url,
|
headers = headers,
|
||||||
headers = headers,
|
max_retries = max_retries,
|
||||||
max_retries = max_retries,
|
break_codes = break_codes)
|
||||||
break_codes = break_codes)
|
logger.debug(f"Request for url: {url} succeeded")
|
||||||
logger.debug(f"Request for url: {url} succeeded on attempt: {n_retries}/{max_retries}")
|
except Exception as e:
|
||||||
except Exception as e:
|
logger.warning(f"Request for url: {url} raised exception: [{e}]")
|
||||||
logger.warning(f"Request for url: {url} raised exception: [{e}] on attempt: {n_retries}/{max_retries}")
|
|
||||||
continue
|
|
||||||
else:
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
logger.error(f"Request for url: {url} failed after {max_retries} attempts")
|
|
||||||
|
|
||||||
return r
|
return r
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user