added prototype Rumble scraper

This commit is contained in:
Tristan Lee
2022-02-28 18:38:33 -06:00
parent bc840e631d
commit ee4d64750b
6 changed files with 173 additions and 13 deletions

View File

@@ -16,7 +16,7 @@ snscrape = {git = "https://github.com/bellingcat/snscrape.git"}
ffmpeg-python = "*"
polyphemus = {git = "https://github.com/bellingcat/polyphemus.git"}
garc = "*"
youtube-dl = "*"
[dev-packages]
[requires]

24
Pipfile.lock generated
View File

@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "08623c70f7bb2da863def501ebdc6b0b2afab9865ef9e457b3137b8020314507"
"sha256": "ceba738b6c5ca9afd5fa79490cffde53d97ea7ec86034e0f31ecbc54fd418055"
},
"pipfile-spec": 6,
"requires": {
@@ -49,19 +49,19 @@
},
"boto3": {
"hashes": [
"sha256:9b6903fe9cc92d2f6111db28675263f1ab45adbcf1483025c82a304ce7790b71",
"sha256:f2ce641957c1782e382548ced4a447189e45851bbe58c1f6752ff2b661527de7"
"sha256:32080e2d956b222f36b76f8fec532ec237ddb4a935dd1c9bb79c759fbe4a5868",
"sha256:bd7c71274e9257596879f99cff3d0f531b801e567e509b5e3d613bd2033a7279"
],
"index": "pypi",
"version": "==1.21.8"
"version": "==1.21.9"
},
"botocore": {
"hashes": [
"sha256:9fbc5c57b31850c51c87abc3e166ed4e0f343665bec4e1a0ff814fbc9704642c",
"sha256:a5431d806dc75fb1844463d921759fcd8d387674443af8d7fd0867f296b02759"
"sha256:8d41deb25e585b0d7b6ee8547990d5e95562f1dc5d3127af58459450b25c13c7",
"sha256:c44758c487df7a357c4a103d959962d78e225d1ab6c9eeda4c77f79a410ccd19"
],
"markers": "python_version >= '3.6'",
"version": "==1.24.8"
"version": "==1.24.9"
},
"bs4": {
"hashes": [
@@ -763,9 +763,17 @@
"sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed",
"sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4.0'",
"version": "==1.26.8"
},
"youtube-dl": {
"hashes": [
"sha256:bc59e86c5d15d887ac590454511f08ce2c47698d5a82c27bfe27b5d814bbaed2",
"sha256:f1336d5de68647e0364a47b3c0712578e59ec76f02048ff5c50ef1c69d79cd55"
],
"index": "pypi",
"version": "==2021.12.17"
},
"zipp": {
"hashes": [
"sha256:9f50f446828eb9d45b267433fd3e9da8d801f614129124863f9c51ebceafb87d",

View File

@@ -2,9 +2,8 @@ import cisticola.base
import cisticola.scraper.base
from datetime import datetime
import json
from typing import Generator, Tuple
from typing import Generator
from garc import Garc
import tempfile
class GabScraper(cisticola.scraper.base.Scraper):
"""An implementation of a Scraper for Gab, using GARC library"""
@@ -22,7 +21,7 @@ class GabScraper(cisticola.scraper.base.Scraper):
scraper = client.userposts(username)
for post in scraper:
if since is not None and datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")).replace(tzinfo = None) <= since.date:
if since is not None and datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")) <= since.date:
break
media_urls = []

View File

@@ -63,12 +63,13 @@ class GettrScraper(cisticola.scraper.base.Scraper):
def m3u8_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
# Using mkv might be more robust: https://stackoverflow.com/a/42871067
content_type = 'video/mp4'
ext = '.' + content_type.split('/')[-1]
with tempfile.NamedTemporaryFile(suffix = ext) as temp_file:
ydl_opts = {}
(
ffmpeg
.input(url)

143
cisticola/scraper/rumble.py Normal file
View File

@@ -0,0 +1,143 @@
from concurrent.futures import process
import cisticola.base
import cisticola.scraper.base
from datetime import datetime
import json
from typing import Generator, Tuple
import tempfile
import requests
from bs4 import BeautifulSoup
import youtube_dl
import json
BASE_URL = 'https://rumble.com'
class RumbleScraper(cisticola.scraper.base.Scraper):
    """An implementation of a Scraper for Rumble, using custom functions"""
    __version__ = "RumbleScraper 0.0.1"

    @staticmethod
    def get_username_from_url(url):
        """Return the channel name from a ``https://rumble.com/c/<name>`` URL.

        Returns ``None`` when ``url`` is empty, does not contain the channel
        prefix, or has nothing after it, so that callers (``can_handle``) can
        test the result instead of hitting an ``IndexError``.
        """
        if not url:
            return None
        parts = url.split('https://rumble.com/c/')
        if len(parts) < 2 or not parts[1]:
            return None
        return parts[1]

    def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]:
        """Yield a ScraperResult for every video of ``channel``.

        Each video's media is downloaded with ``url_to_blob`` and handed to
        ``self.archive_media``; the resulting archive URL is stored in the
        result keyed by the original media URL.

        Iteration stops at the first video whose date is not newer than
        ``since.date`` (when ``since`` is given).
        NOTE(review): this assumes the channel listing is ordered newest
        first - confirm against the live site.

        Raises:
            ValueError: if ``channel.url`` is not a Rumble channel URL.
        """
        username = RumbleScraper.get_username_from_url(channel.url)
        if username is None:
            raise ValueError(f"Not a Rumble channel URL: {channel.url!r}")

        for post in get_channel_videos(username):
            # The scraped post dict carries its timestamp under 'datetime'
            # (ISO 8601 string); there is no 'cdate' key, so the cut-off check
            # must use the same parsed value that is stored in the result.
            post_date = datetime.fromisoformat(post['datetime']).replace(tzinfo=None)
            if since is not None and post_date <= since.date:
                break

            media_url = post['media_url']
            media_blob, content_type, key = self.url_to_blob(media_url)
            archived_url = self.archive_media(media_blob, content_type, key)
            archived_urls = {media_url: archived_url}

            yield cisticola.base.ScraperResult(
                scraper=self.__version__,
                platform="Rumble",
                channel=channel.id,
                # Second-to-last path segment of the media URL; presumably the
                # video id of an embed URL ending in '/' - TODO confirm.
                platform_id=media_url.split('/')[-2],
                date=post_date,
                date_archived=datetime.now(),
                raw_data=json.dumps(post),
                archived_urls=archived_urls)

    def can_handle(self, channel):
        """Return True when ``channel`` is a Rumble channel with a usable URL."""
        return (
            channel.platform == "Rumble"
            and RumbleScraper.get_username_from_url(channel.url) is not None)

    def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
        """Download the video at ``url`` with youtube-dl and return its bytes.

        Returns a ``(blob, content_type, key)`` tuple. ``content_type`` is
        always ``'video/mp4'``. When ``key`` is not supplied it is built from
        the second-to-last path segment of ``url`` plus ``'.mp4'``.

        Raises:
            youtube_dl.utils.DownloadError: propagated unchanged when the
                download fails.
        """
        content_type = 'video/mp4'
        ext = '.' + content_type.split('/')[-1]

        # The download lands in a throw-away directory that is removed as soon
        # as the file has been read into memory.
        with tempfile.TemporaryDirectory() as temp_dir:
            ydl_opts = {
                "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
                "merge_output_format": "mp4",
                "outtmpl": f"{temp_dir}/%(id)s.%(ext)s",
                "noplaylist": True,
                'quiet': True,
                "verbose": False,
            }
            ydl = youtube_dl.YoutubeDL(ydl_opts)
            meta = ydl.extract_info(url, download=True)

            # Rebuild the output path from the same id/ext fields used by the
            # 'outtmpl' template above.
            video_id = meta["id"]
            video_ext = meta["ext"]
            with open(f"{temp_dir}/{video_id}.{video_ext}", "rb") as f:
                blob = f.read()

        if key is None:
            key = url.split('/')[-2] + ext
        return blob, content_type, key
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
def get_media_url(url):
    """Fetch a Rumble video page and return the ``embedUrl`` from its metadata.

    The page is expected to contain a ``<script type="application/ld+json">``
    tag whose JSON content is a list; the ``embedUrl`` of the first entry is
    returned.
    """
    response = requests.get(url)
    page = BeautifulSoup(response.content, features = 'lxml')
    metadata_tag = page.find('script', {'type':'application/ld+json'})
    metadata = json.loads(metadata_tag.text)
    return metadata[0]['embedUrl']
def process_video(video):
    """Convert one video listing element into a dict of its metadata.

    ``video`` is a parsed HTML element exposing ``find``. The returned dict
    holds the title, thumbnail, link, views, rumbles, duration and datetime
    read from the element, plus the ``media_url`` resolved from the video's
    own page via ``get_media_url``.
    """
    # The rumbles counter is optional in the listing; fall back to '0'.
    rumbles_tag = video.find('span', {'class' : 'video-item--rumbles'})
    rumbles = '0' if rumbles_tag is None else rumbles_tag['data-value']

    title = video.find('h3').text
    thumbnail = video.find('img')['src']
    link = BASE_URL + video.find('a', href = True)['href']
    views = video.find('span', {'class' : 'video-item--views'})['data-value']
    duration = video.find('span', {'class' : 'video-item--duration'})['data-value']
    published = video.find('time')['datetime']

    # Key order is kept stable because the dict is serialised downstream.
    info = {
        'title' : title,
        'thumbnail' : thumbnail,
        'link' : link,
        'views' : views,
        'rumbles' : rumbles,
        'duration' : duration,
        'datetime' : published,
    }
    info['media_url'] = get_media_url(link)
    return info
def get_channel_videos(channel):
    """Yield one info dict (see ``process_video``) per video of a channel.

    Listing pages ``{BASE_URL}/c/<channel>?page=N`` are requested starting at
    page 1. Pagination ends when a page answers with HTTP 404 or contains no
    video entries; the latter guard prevents an endless request loop when the
    site answers past-the-end pages with something other than a 404.

    Raises:
        requests.HTTPError: for any HTTP error status other than 404.
    """
    channel_url = f'{BASE_URL}/c/{channel}?page='
    page = 1
    while True:
        r = requests.get(channel_url + str(page))
        if r.status_code == 404:
            break
        # Any other error status would otherwise be parsed as an empty page.
        r.raise_for_status()

        soup = BeautifulSoup(r.content, features = 'lxml')
        video_list = soup.find_all('li', {'class' : 'video-listing-entry'})
        if not video_list:
            break

        for video in video_list:
            yield process_video(video)
        page += 1
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

View File

@@ -5,6 +5,7 @@ import cisticola.scraper.gettr
import cisticola.scraper.bitchute
import cisticola.scraper.odysee
import cisticola.scraper.gab
import cisticola.scraper.rumble
from sqlalchemy import create_engine
@@ -37,6 +38,11 @@ test_channels = [
id=6, name="Capt. Marc Simon (test)", platform_id='marc_capt',
category="test", followers=None, platform="Gab",
url="https://gab.com/marc_capt", screenname='marc_capt', country="CA",
influencer=None, public=True, chat=False, notes=""),
cisticola.base.Channel(
id=7, name="we are uploading videos wow products and problem solving products.please share like and subscribe our channelwe are uploading videos wow products and problem solving products.please share like and subscribe our channel", platform_id='c-916305',
category="test", followers=None, platform="Rumble",
url="https://rumble.com/c/c-916305", screenname='we are uploading', country="CA",
influencer=None, public=True, chat=False, notes="")]
@@ -60,6 +66,9 @@ controller.register_scraper(odysee)
gab = cisticola.scraper.gab.GabScraper()
controller.register_scraper(gab)
rumble = cisticola.scraper.rumble.RumbleScraper()
controller.register_scraper(rumble)
engine = create_engine('sqlite:///test3.db')
controller.connect_to_db(engine)