mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-12 21:38:33 +03:00
added prototype Rumble scraper
This commit is contained in:
2
Pipfile
2
Pipfile
@@ -16,7 +16,7 @@ snscrape = {git = "https://github.com/bellingcat/snscrape.git"}
|
||||
ffmpeg-python = "*"
|
||||
polyphemus = {git = "https://github.com/bellingcat/polyphemus.git"}
|
||||
garc = "*"
|
||||
|
||||
youtube-dl = "*"
|
||||
[dev-packages]
|
||||
|
||||
[requires]
|
||||
|
||||
24
Pipfile.lock
generated
24
Pipfile.lock
generated
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "08623c70f7bb2da863def501ebdc6b0b2afab9865ef9e457b3137b8020314507"
|
||||
"sha256": "ceba738b6c5ca9afd5fa79490cffde53d97ea7ec86034e0f31ecbc54fd418055"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {
|
||||
@@ -49,19 +49,19 @@
|
||||
},
|
||||
"boto3": {
|
||||
"hashes": [
|
||||
"sha256:9b6903fe9cc92d2f6111db28675263f1ab45adbcf1483025c82a304ce7790b71",
|
||||
"sha256:f2ce641957c1782e382548ced4a447189e45851bbe58c1f6752ff2b661527de7"
|
||||
"sha256:32080e2d956b222f36b76f8fec532ec237ddb4a935dd1c9bb79c759fbe4a5868",
|
||||
"sha256:bd7c71274e9257596879f99cff3d0f531b801e567e509b5e3d613bd2033a7279"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==1.21.8"
|
||||
"version": "==1.21.9"
|
||||
},
|
||||
"botocore": {
|
||||
"hashes": [
|
||||
"sha256:9fbc5c57b31850c51c87abc3e166ed4e0f343665bec4e1a0ff814fbc9704642c",
|
||||
"sha256:a5431d806dc75fb1844463d921759fcd8d387674443af8d7fd0867f296b02759"
|
||||
"sha256:8d41deb25e585b0d7b6ee8547990d5e95562f1dc5d3127af58459450b25c13c7",
|
||||
"sha256:c44758c487df7a357c4a103d959962d78e225d1ab6c9eeda4c77f79a410ccd19"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==1.24.8"
|
||||
"version": "==1.24.9"
|
||||
},
|
||||
"bs4": {
|
||||
"hashes": [
|
||||
@@ -763,9 +763,17 @@
|
||||
"sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed",
|
||||
"sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4.0'",
|
||||
"version": "==1.26.8"
|
||||
},
|
||||
"youtube-dl": {
|
||||
"hashes": [
|
||||
"sha256:bc59e86c5d15d887ac590454511f08ce2c47698d5a82c27bfe27b5d814bbaed2",
|
||||
"sha256:f1336d5de68647e0364a47b3c0712578e59ec76f02048ff5c50ef1c69d79cd55"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2021.12.17"
|
||||
},
|
||||
"zipp": {
|
||||
"hashes": [
|
||||
"sha256:9f50f446828eb9d45b267433fd3e9da8d801f614129124863f9c51ebceafb87d",
|
||||
|
||||
@@ -2,9 +2,8 @@ import cisticola.base
|
||||
import cisticola.scraper.base
|
||||
from datetime import datetime
|
||||
import json
|
||||
from typing import Generator, Tuple
|
||||
from typing import Generator
|
||||
from garc import Garc
|
||||
import tempfile
|
||||
|
||||
class GabScraper(cisticola.scraper.base.Scraper):
|
||||
"""An implementation of a Scraper for Gab, using GARC library"""
|
||||
@@ -22,7 +21,7 @@ class GabScraper(cisticola.scraper.base.Scraper):
|
||||
scraper = client.userposts(username)
|
||||
|
||||
for post in scraper:
|
||||
if since is not None and datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")).replace(tzinfo = None) <= since.date:
|
||||
if since is not None and datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")) <= since.date:
|
||||
break
|
||||
|
||||
media_urls = []
|
||||
|
||||
@@ -63,12 +63,13 @@ class GettrScraper(cisticola.scraper.base.Scraper):
|
||||
|
||||
def m3u8_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
|
||||
|
||||
# Using mkv might be more robust: https://stackoverflow.com/a/42871067
|
||||
content_type = 'video/mp4'
|
||||
ext = '.' + content_type.split('/')[-1]
|
||||
|
||||
with tempfile.NamedTemporaryFile(suffix = ext) as temp_file:
|
||||
|
||||
ydl_opts = {}
|
||||
|
||||
(
|
||||
ffmpeg
|
||||
.input(url)
|
||||
|
||||
143
cisticola/scraper/rumble.py
Normal file
143
cisticola/scraper/rumble.py
Normal file
@@ -0,0 +1,143 @@
|
||||
from concurrent.futures import process
|
||||
import cisticola.base
|
||||
import cisticola.scraper.base
|
||||
from datetime import datetime
|
||||
import json
|
||||
from typing import Generator, Tuple
|
||||
import tempfile
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import youtube_dl
|
||||
import json
|
||||
|
||||
# Root URL of the Rumble website; prepended to the channel path and to the
# relative video links found on channel listing pages.
BASE_URL = 'https://rumble.com'
|
||||
|
||||
class RumbleScraper(cisticola.scraper.base.Scraper):
    """An implementation of a Scraper for Rumble, using custom functions.

    Video listings come from the module-level ``get_channel_videos``
    generator, and each video file is downloaded with youtube_dl before being
    handed to ``archive_media``.
    """
    __version__ = "RumbleScraper 0.0.1"

    @staticmethod
    def get_username_from_url(url):
        """Return the part of ``url`` that follows ``https://rumble.com/c/``.

        Raises:
            IndexError: if ``url`` does not contain that channel prefix.
        """
        return url.split('https://rumble.com/c/')[1]

    def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]:
        """Yield one ScraperResult per video of ``channel``, in listing order.

        Args:
            channel: Channel whose ``url`` has the form
                ``https://rumble.com/c/<name>``.
            since: Optional previous result. Iteration stops at the first
                video whose date is not later than ``since.date``.
                NOTE(review): this relies on the listing being newest-first
                and on ``since.date`` being a naive datetime -- confirm.
        """
        username = RumbleScraper.get_username_from_url(channel.url)

        for post in get_channel_videos(username):
            # 'datetime' is the only timestamp present in the scraped post
            # dict (there is no 'cdate' key), so it is used both for the
            # cut-off comparison and for the stored date. The UTC offset is
            # dropped (not converted) so the value is a naive datetime.
            post_date = datetime.fromisoformat(post['datetime']).replace(tzinfo=None)

            if since is not None and post_date <= since.date:
                break

            media_url = post['media_url']

            media_blob, content_type, key = self.url_to_blob(media_url)
            # archive_media is not defined in this class; presumably it is
            # provided by the Scraper base class.
            archived_url = self.archive_media(media_blob, content_type, key)

            yield cisticola.base.ScraperResult(
                scraper=self.__version__,
                platform="Rumble",
                channel=channel.id,
                # Second-to-last path segment of the embed URL.
                platform_id=media_url.split('/')[-2],
                date=post_date,
                date_archived=datetime.now(),
                raw_data=json.dumps(post),
                archived_urls={media_url: archived_url})

    def can_handle(self, channel):
        """Return True if ``channel`` is a Rumble channel with a usable URL.

        Returns False (instead of raising) when the URL is missing or does
        not contain the ``https://rumble.com/c/`` prefix.
        """
        if channel.platform != "Rumble":
            return False

        try:
            return RumbleScraper.get_username_from_url(channel.url) is not None
        except (IndexError, AttributeError):
            # IndexError: prefix not found; AttributeError: url is not a str.
            return False

    def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
        """Download the video at ``url`` and return ``(blob, content_type, key)``.

        Args:
            url: URL of the video, passed to youtube_dl.
            key: Optional storage key. When omitted it is built from the
                second-to-last path segment of ``url`` plus ``.mp4``.

        Raises:
            youtube_dl.utils.DownloadError: propagated unchanged when the
                download fails.
        """
        content_type = 'video/mp4'
        ext = '.' + content_type.split('/')[-1]

        # The download goes to a throwaway directory that is removed as soon
        # as the file content has been read into memory.
        with tempfile.TemporaryDirectory() as temp_dir:
            ydl_opts = {
                "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
                "merge_output_format": "mp4",
                "outtmpl": f"{temp_dir}/%(id)s.%(ext)s",
                "noplaylist": True,
                'quiet': True,
                "verbose": False,
            }
            ydl = youtube_dl.YoutubeDL(ydl_opts)

            meta = ydl.extract_info(url, download=True)

            # Rebuild the output path from the same fields used in "outtmpl".
            video_id = meta["id"]
            video_ext = meta["ext"]

            with open(f"{temp_dir}/{video_id}.{video_ext}", "rb") as f:
                blob = f.read()

        if key is None:
            key = url.split('/')[-2] + ext

        return blob, content_type, key
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
def get_media_url(url):
    """Fetch the video page at ``url`` and return its embed URL.

    The embed URL is read from the ``embedUrl`` field of the first entry in
    the page's first ``application/ld+json`` script tag.
    """
    response = requests.get(url)
    page = BeautifulSoup(response.content, features='lxml')

    ld_json_tag = page.find('script', {'type': 'application/ld+json'})
    metadata = json.loads(ld_json_tag.text)

    return metadata[0]['embedUrl']
|
||||
|
||||
def process_video(video):
    """Build the info dict for one video entry of a channel listing page.

    ``video`` is a parsed listing element supporting ``find``. The returned
    dict has the keys ``title``, ``thumbnail``, ``link``, ``views``,
    ``rumbles``, ``duration``, ``datetime`` and ``media_url``, in that order.
    ``rumbles`` defaults to ``'0'`` when the entry has no rumbles element.
    """
    rumbles_tag = video.find('span', {'class': 'video-item--rumbles'})
    rumbles = '0' if rumbles_tag is None else rumbles_tag['data-value']

    title = video.find('h3').text
    thumbnail = video.find('img')['src']
    # Listing links are relative, so they are made absolute here.
    link = BASE_URL + video.find('a', href=True)['href']
    views = video.find('span', {'class': 'video-item--views'})['data-value']
    duration = video.find('span', {'class': 'video-item--duration'})['data-value']
    published = video.find('time')['datetime']

    info = {
        'title': title,
        'thumbnail': thumbnail,
        'link': link,
        'views': views,
        'rumbles': rumbles,
        'duration': duration,
        'datetime': published,
    }

    # The embed URL is not on the listing page; it needs one extra request
    # to the video's own page.
    info['media_url'] = get_media_url(link)

    return info
|
||||
|
||||
def get_channel_videos(channel):
    """Yield an info dict (see ``process_video``) for each video of a channel.

    Listing pages ``{BASE_URL}/c/{channel}?page=N`` are requested one at a
    time, starting at page 1, and only as the generator is consumed.

    Pagination ends when a page returns HTTP 404 or contains no video
    entries, so the loop cannot run forever.

    Raises:
        requests.HTTPError: if a listing page returns an error status other
            than 404.
    """
    channel_url = f'{BASE_URL}/c/{channel}?page='
    page = 1

    while True:
        r = requests.get(channel_url + str(page))

        # A 404 marks the end of the listing.
        if r.status_code == 404:
            break

        # Any other error status (rate limiting, server error, ...) is
        # surfaced instead of being parsed as an empty page and retried on
        # the next page number indefinitely.
        r.raise_for_status()

        soup = BeautifulSoup(r.content, features='lxml')
        video_list = soup.find_all('li', {'class': 'video-listing-entry'})

        # A page without entries also ends pagination.
        if not video_list:
            break

        for video in video_list:
            yield process_video(video)

        page += 1
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
9
test.py
9
test.py
@@ -5,6 +5,7 @@ import cisticola.scraper.gettr
|
||||
import cisticola.scraper.bitchute
|
||||
import cisticola.scraper.odysee
|
||||
import cisticola.scraper.gab
|
||||
import cisticola.scraper.rumble
|
||||
|
||||
from sqlalchemy import create_engine
|
||||
|
||||
@@ -37,6 +38,11 @@ test_channels = [
|
||||
id=6, name="Capt. Marc Simon (test)", platform_id='marc_capt',
|
||||
category="test", followers=None, platform="Gab",
|
||||
url="https://gab.com/marc_capt", screenname='marc_capt', country="CA",
|
||||
influencer=None, public=True, chat=False, notes=""),
|
||||
cisticola.base.Channel(
|
||||
id=7, name="we are uploading videos wow products and problem solving products.please share like and subscribe our channelwe are uploading videos wow products and problem solving products.please share like and subscribe our channel", platform_id='c-916305',
|
||||
category="test", followers=None, platform="Rumble",
|
||||
url="https://rumble.com/c/c-916305", screenname='we are uploading', country="CA",
|
||||
influencer=None, public=True, chat=False, notes="")]
|
||||
|
||||
|
||||
@@ -60,6 +66,9 @@ controller.register_scraper(odysee)
|
||||
gab = cisticola.scraper.gab.GabScraper()
|
||||
controller.register_scraper(gab)
|
||||
|
||||
rumble = cisticola.scraper.rumble.RumbleScraper()
|
||||
controller.register_scraper(rumble)
|
||||
|
||||
engine = create_engine('sqlite:///test3.db')
|
||||
controller.connect_to_db(engine)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user