Merge pull request #46 from bellingcat/next-release

Release 2022-04-12
This commit is contained in:
Logan Williams
2022-04-12 14:59:01 +02:00
committed by GitHub
7 changed files with 99 additions and 875 deletions

874
Pipfile.lock generated

File diff suppressed because it is too large. (Load Diff)

10
app.py
View File

@@ -13,6 +13,9 @@ from cisticola.scraper import (
VkontakteScraper,
TelegramTelethonScraper,
GettrScraper,
BitchuteScraper,
YoutubeScraper,
RumbleScraper,
)
@@ -92,7 +95,12 @@ def get_scraper_controller():
controller = ScraperController()
controller.connect_to_db(engine)
scrapers = [TelegramTelethonScraper(), VkontakteScraper(), GettrScraper()]
scrapers = [VkontakteScraper(),
TelegramTelethonScraper(),
GettrScraper(),
BitchuteScraper(),
RumbleScraper(),
YoutubeScraper()]
controller.register_scrapers(scrapers)

View File

@@ -10,7 +10,8 @@ from loguru import logger
import ffmpeg
from sqlalchemy.orm import sessionmaker
import yt_dlp
from sqlalchemy.sql.expression import func
from sqlalchemy.sql.expression import func
from pathlib import Path
from cisticola.base import Channel, ScraperResult, mapper_registry
from cisticola.utils import make_request
@@ -181,13 +182,20 @@ class Scraper:
content_type = 'video/mp4'
with tempfile.TemporaryDirectory() as temp_dir:
cookiefile = Path(temp_dir)/self.cookiefilename
with open(cookiefile, 'w') as f:
f.write(self.cookiestring)
ydl_opts = {
"format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
"merge_output_format": "mp4",
"outtmpl": f"{temp_dir}/%(id)s.%(ext)s",
"noplaylist": True,
'quiet': True,
"verbose": False,}
"quiet": True,
"verbose": False,
"retries": 5,
"cookiefile": cookiefile}
ydl = yt_dlp.YoutubeDL(ydl_opts)
try:

View File

@@ -16,7 +16,7 @@ from cisticola.scraper.base import Scraper
class BitchuteScraper(Scraper):
"""An implementation of a Scraper for Bitchute, using classes from the 4cat
library"""
__version__ = "BitchuteScraper 0.0.0"
__version__ = "BitchuteScraper 0.0.1"
def get_username_from_url(self, url):
username = url.split('bitchute.com/channel/')[-1].strip('/')
@@ -184,7 +184,7 @@ def request_from_bitchute(session, method, url, headers=None, data=None):
raise NotImplemented()
if request.status_code >= 300:
raise ValueError("Response %i from BitChut for URL %s, need to retry" % (request.status_code, url))
raise ValueError("Response %i from BitChute for URL %s, need to retry" % (request.status_code, url))
response = request.json()
return response
@@ -421,14 +421,8 @@ def get_videos_user(session, user, csrftoken, detail):
post_data = {"csrfmiddlewaretoken": csrftoken, "name": "", "offset": str(offset)}
try:
request = session.post(url, data=post_data, headers=headers)
if request.status_code != 200:
raise ConnectionError()
response = request.json()
response = request_from_bitchute(session, "POST", url, headers=headers, data=post_data)
except (json.JSONDecodeError, requests.RequestException, ConnectionError) as e:
raise ValueError('FALSE')
soup = BeautifulSoup(response["html"], 'html.parser')
videos = soup.select(".channel-videos-container")
comments = []

View File

@@ -5,6 +5,7 @@ from urllib.parse import urlparse
from loguru import logger
from bs4 import BeautifulSoup
import os
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper import Scraper, make_request
@@ -13,7 +14,10 @@ BASE_URL = 'https://rumble.com'
class RumbleScraper(Scraper):
"""An implementation of a Scraper for Rumble, using custom functions"""
__version__ = "RumbleScraper 0.0.0"
__version__ = "RumbleScraper 0.0.1"
cookiestring = os.environ["YOUTUBE_COOKIESTRING"].replace(r'\n', '\n').replace(r'\t', '\t')
cookiefilename = 'cookiefile.txt'
@logger.catch
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:

View File

@@ -13,7 +13,10 @@ from cisticola.scraper import Scraper
class YoutubeScraper(Scraper):
"""An implementation of a Scraper for Youtube, using youtube-dl"""
__version__ = "YoutubeScraper 0.0.0"
__version__ = "YoutubeScraper 0.0.1"
cookiestring = os.environ["YOUTUBE_COOKIESTRING"].replace(r'\n', '\n').replace(r'\t', '\t')
cookiefilename = 'cookiefile.txt'
@logger.catch
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
@@ -29,13 +32,21 @@ class YoutubeScraper(Scraper):
with tempfile.TemporaryDirectory() as temp_dir:
cookiefile = Path(temp_dir)/self.cookiefilename
with open(cookiefile, 'w') as f:
f.write(self.cookiestring)
daterange = yt_dlp.utils.DateRange(start = start_date)
ydl_opts = {
"format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
"merge_output_format": "mp4",
"outtmpl": f"{temp_dir}/%(id)s.%(ext)s",
"daterange" : daterange}
"daterange" : daterange,
"quiet": True,
"verbose": False,
"retries": 5,
"cookiefile": cookiefile}
ydl = yt_dlp.YoutubeDL(ydl_opts)
@@ -92,10 +103,18 @@ class YoutubeScraper(Scraper):
with tempfile.TemporaryDirectory() as temp_dir:
cookiefile = Path(temp_dir)/self.cookiefilename
with open(cookiefile, 'w') as f:
f.write(self.cookiestring)
ydl_opts = {
"format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
"merge_output_format": "mp4",
"outtmpl": f"{temp_dir}/%(id)s.%(ext)s"}
"outtmpl": f"{temp_dir}/%(id)s.%(ext)s",
"quiet": True,
"verbose": False,
"retries": 5,
"cookiefile": cookiefile}
ydl = yt_dlp.YoutubeDL(ydl_opts)
@@ -104,7 +123,7 @@ class YoutubeScraper(Scraper):
except yt_dlp.utils.DownloadError as e:
raise e
files = os.listdir(temp_dir)
files = [file for file in os.listdir(temp_dir) if file != self.cookiefilename]
if len(files) != 1:
logger.warning(f'{len(files)} files downloaded for video: {url}')
key = files[0]
@@ -120,7 +139,12 @@ class YoutubeScraper(Scraper):
return result
def get_profile(self, channel: Channel) -> RawChannelInfo:
ydl_opts = {}
ydl_opts = {
"quiet": True,
"verbose": False,
"retries": 5}
ydl = yt_dlp.YoutubeDL(ydl_opts)
meta = None

View File

@@ -31,21 +31,15 @@ def make_request(url, headers = None, max_retries = 5, break_codes = None):
r = None
for n_retries in range(max_retries):
try:
r = request_until_200(
url = url,
headers = headers,
max_retries = max_retries,
break_codes = break_codes)
logger.debug(f"Request for url: {url} succeeded on attempt: {n_retries}/{max_retries}")
except Exception as e:
logger.warning(f"Request for url: {url} raised exception: [{e}] on attempt: {n_retries}/{max_retries}")
continue
else:
break
else:
logger.error(f"Request for url: {url} failed after {max_retries} attempts")
try:
r = request_until_200(
url = url,
headers = headers,
max_retries = max_retries,
break_codes = break_codes)
logger.debug(f"Request for url: {url} succeeded")
except Exception as e:
logger.warning(f"Request for url: {url} raised exception: [{e}]")
return r