added prototype Rumble scraper

2026-06-12 21:38:33 +03:00 · 2022-02-28 18:38:33 -06:00
parent bc840e631d
commit ee4d64750b
6 changed files with 173 additions and 13 deletions
--- a/2
+++ b/2
@@ -16,7 +16,7 @@ snscrape = {git = "https://github.com/bellingcat/snscrape.git"}
 ffmpeg-python = "*"
 polyphemus = {git = "https://github.com/bellingcat/polyphemus.git"}
 garc = "*"
-
+youtube-dl = "*" 
 [dev-packages]

 [requires]
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,7 +1,7 @@
 {
    "_meta": {
        "hash": {
-            "sha256": "08623c70f7bb2da863def501ebdc6b0b2afab9865ef9e457b3137b8020314507"
+            "sha256": "ceba738b6c5ca9afd5fa79490cffde53d97ea7ec86034e0f31ecbc54fd418055"
        },
        "pipfile-spec": 6,
        "requires": {
@@ -49,19 +49,19 @@
        },
        "boto3": {
            "hashes": [
-                "sha256:9b6903fe9cc92d2f6111db28675263f1ab45adbcf1483025c82a304ce7790b71",
-                "sha256:f2ce641957c1782e382548ced4a447189e45851bbe58c1f6752ff2b661527de7"
+                "sha256:32080e2d956b222f36b76f8fec532ec237ddb4a935dd1c9bb79c759fbe4a5868",
+                "sha256:bd7c71274e9257596879f99cff3d0f531b801e567e509b5e3d613bd2033a7279"
            ],
            "index": "pypi",
-            "version": "==1.21.8"
+            "version": "==1.21.9"
        },
        "botocore": {
            "hashes": [
-                "sha256:9fbc5c57b31850c51c87abc3e166ed4e0f343665bec4e1a0ff814fbc9704642c",
-                "sha256:a5431d806dc75fb1844463d921759fcd8d387674443af8d7fd0867f296b02759"
+                "sha256:8d41deb25e585b0d7b6ee8547990d5e95562f1dc5d3127af58459450b25c13c7",
+                "sha256:c44758c487df7a357c4a103d959962d78e225d1ab6c9eeda4c77f79a410ccd19"
            ],
            "markers": "python_version >= '3.6'",
-            "version": "==1.24.8"
+            "version": "==1.24.9"
        },
        "bs4": {
            "hashes": [
@@ -763,9 +763,17 @@
                "sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed",
                "sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c"
            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4.0'",
            "version": "==1.26.8"
        },
+        "youtube-dl": {
+            "hashes": [
+                "sha256:bc59e86c5d15d887ac590454511f08ce2c47698d5a82c27bfe27b5d814bbaed2",
+                "sha256:f1336d5de68647e0364a47b3c0712578e59ec76f02048ff5c50ef1c69d79cd55"
+            ],
+            "index": "pypi",
+            "version": "==2021.12.17"
+        },
        "zipp": {
            "hashes": [
                "sha256:9f50f446828eb9d45b267433fd3e9da8d801f614129124863f9c51ebceafb87d",
--- a/cisticola/scraper/gab.py
+++ b/cisticola/scraper/gab.py
@@ -2,9 +2,8 @@ import cisticola.base
 import cisticola.scraper.base
 from datetime import datetime
 import json
-from typing import Generator, Tuple
+from typing import Generator
 from garc import Garc
-import tempfile

 class GabScraper(cisticola.scraper.base.Scraper):
    """An implementation of a Scraper for Gab, using GARC library"""
@@ -22,7 +21,7 @@ class GabScraper(cisticola.scraper.base.Scraper):
        scraper = client.userposts(username)

        for post in scraper:
-            if since is not None and datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")).replace(tzinfo = None) <= since.date:
+            if since is not None and datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")) <= since.date:
                break

            media_urls = []
--- a/cisticola/scraper/gettr.py
+++ b/cisticola/scraper/gettr.py
@@ -63,12 +63,13 @@ class GettrScraper(cisticola.scraper.base.Scraper):

    def m3u8_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
        
-        # Using mkv might be more robust: https://stackoverflow.com/a/42871067
        content_type = 'video/mp4'
        ext = '.' + content_type.split('/')[-1]

        with tempfile.NamedTemporaryFile(suffix = ext) as temp_file:
            
+            ydl_opts = {}
+
            (
                ffmpeg
                .input(url)
--- a/cisticola/scraper/rumble.py
+++ b/cisticola/scraper/rumble.py
@@ -0,0 +1,143 @@
+from concurrent.futures import process
+import cisticola.base
+import cisticola.scraper.base
+from datetime import datetime
+import json
+from typing import Generator, Tuple
+import tempfile
+
+import requests
+from bs4 import BeautifulSoup
+import youtube_dl
+import json 
+
+BASE_URL = 'https://rumble.com'
+
+class RumbleScraper(cisticola.scraper.base.Scraper):
+    """An implementation of a Scraper for Rumble, using custom functions.
+
+    Channel listings are read with the module-level ``get_channel_videos``
+    generator; the media behind each listing is downloaded with youtube-dl
+    and handed to ``self.archive_media``.
+    """
+    __version__ = "RumbleScraper 0.0.1"
+
+    @staticmethod
+    def get_username_from_url(url):
+        """Return the channel name that follows ``https://rumble.com/c/`` in `url`.
+
+        Returns None when `url` is empty, is not a Rumble channel URL, or has
+        nothing after the prefix, so that ``can_handle`` can reject the channel
+        instead of crashing with an IndexError.
+        """
+        if not url:
+            return None
+
+        parts = url.split('https://rumble.com/c/')
+        if len(parts) < 2:
+            return None
+
+        return parts[1] or None
+
+    def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]:
+        """Yield one ScraperResult per video listed on `channel`.
+
+        Iteration stops at the first video whose date is not later than
+        ``since.date`` (when `since` is given).
+        NOTE(review): this assumes the listing is ordered newest first and that
+        ``since.date`` is a naive datetime - confirm against the controller.
+
+        Raises:
+            ValueError: if ``channel.url`` is not a Rumble channel URL.
+        """
+        username = RumbleScraper.get_username_from_url(channel.url)
+        if username is None:
+            raise ValueError(f"Not a Rumble channel URL: {channel.url!r}")
+
+        for post in get_channel_videos(username):
+            # The listing dict carries its timestamp under 'datetime' (there is
+            # no 'cdate' key), so the same parsed value serves both the cut-off
+            # test and the stored date. The UTC offset is dropped, not converted.
+            posted = datetime.fromisoformat(post['datetime']).replace(tzinfo=None)
+
+            if since is not None and posted <= since.date:
+                break
+
+            url = post['media_url']
+
+            media_blob, content_type, key = self.url_to_blob(url)
+            archived_urls = {url: self.archive_media(media_blob, content_type, key)}
+
+            yield cisticola.base.ScraperResult(
+                scraper=self.__version__,
+                platform="Rumble",
+                channel=channel.id,
+                # Second-to-last path segment of the embed URL.
+                platform_id=url.split('/')[-2],
+                date=posted,
+                date_archived=datetime.now(),
+                raw_data=json.dumps(post),
+                archived_urls=archived_urls)
+
+    def can_handle(self, channel):
+        """Return True if `channel` is a Rumble channel with a parseable URL, else False."""
+        return (channel.platform == "Rumble"
+                and RumbleScraper.get_username_from_url(channel.url) is not None)
+
+    def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
+        """Download the video at `url` with youtube-dl and return it as bytes.
+
+        Returns:
+            A ``(blob, content_type, key)`` tuple. `content_type` is always
+            ``'video/mp4'``; when `key` is not supplied it is built from the
+            second-to-last path segment of `url` plus ``'.mp4'``.
+
+        Raises:
+            youtube_dl.utils.DownloadError: propagated unchanged if the
+                download fails.
+        """
+        content_type = 'video/mp4'
+        ext = '.' + content_type.split('/')[-1]
+
+        # The download lands in a throwaway directory that is removed as soon
+        # as the file has been read back into memory.
+        with tempfile.TemporaryDirectory() as temp_dir:
+            ydl_opts = {
+                "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
+                "merge_output_format": "mp4",
+                "outtmpl": f"{temp_dir}/%(id)s.%(ext)s",
+                "noplaylist": True,
+                'quiet': True,
+                "verbose": False,}
+            ydl = youtube_dl.YoutubeDL(ydl_opts)
+
+            meta = ydl.extract_info(
+                url,
+                download=True,)
+
+            # Rebuild the output path from the same fields as the "outtmpl" template.
+            video_id = meta["id"]
+            video_ext = meta["ext"]
+
+            with open(f"{temp_dir}/{video_id}.{video_ext}", "rb") as f:
+                blob = f.read()
+
+        if key is None:
+            key = url.split('/')[-2] + ext
+
+        return blob, content_type, key
+
+#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
+
+def get_media_url(url):
+    """Fetch the page at `url` and return the 'embedUrl' of its first JSON-LD entry.
+
+    The page is expected to contain a ``<script type="application/ld+json">``
+    tag whose JSON payload is a list.
+    """
+    response = requests.get(url)
+    page = BeautifulSoup(response.content, features = 'lxml')
+
+    ld_json_tag = page.find('script', {'type':'application/ld+json'})
+    metadata = json.loads(ld_json_tag.text)
+
+    return metadata[0]['embedUrl']
+
+def process_video(video):
+    """Build a metadata dict for one video listing element.
+
+    `video` is a BeautifulSoup element for a single listing entry. The
+    returned dict holds the title, thumbnail, absolute link, view count,
+    rumble count ('0' when the element has no rumbles span), duration,
+    datetime string, and the media URL resolved by ``get_media_url``.
+    """
+    def span_value(css_class):
+        # Read the 'data-value' attribute of the span with the given class.
+        return video.find('span', {'class' : css_class})['data-value']
+
+    # The rumbles span is optional, so it is looked up without indexing first.
+    rumbles_tag = video.find('span', {'class' : 'video-item--rumbles'})
+    rumbles = '0' if rumbles_tag is None else rumbles_tag['data-value']
+
+    info = {}
+    info['title'] = video.find('h3').text
+    info['thumbnail'] = video.find('img')['src']
+    info['link'] = BASE_URL + video.find('a', href = True)['href']
+    info['views'] = span_value('video-item--views')
+    info['rumbles'] = rumbles
+    info['duration'] = span_value('video-item--duration')
+    info['datetime'] = video.find('time')['datetime']
+
+    # Resolving the media URL costs one extra HTTP request per video.
+    info['media_url'] = get_media_url(info['link'])
+
+    return info
+
+def get_channel_videos(channel):
+    """Yield a metadata dict (see ``process_video``) for each video of `channel`.
+
+    Pages ``{BASE_URL}/c/{channel}?page=N`` are requested for N = 1, 2, ...
+    until one of them answers 404 or contains no listing entries.
+
+    Raises:
+        requests.HTTPError: if a page answers with an error status other
+            than 404.
+    """
+    page = 1
+    channel_url = f'{BASE_URL}/c/{channel}?page='
+
+    while True:
+        url = channel_url + str(page)
+        r = requests.get(url)
+
+        # A 404 is treated as the end of the listing.
+        if r.status_code == 404:
+            break
+
+        # Any other error status (rate limiting, server error) would otherwise
+        # be parsed as an empty page and the loop would keep requesting pages.
+        r.raise_for_status()
+
+        soup = BeautifulSoup(r.content, features = 'lxml')
+
+        video_list = soup.find_all('li', {'class' : 'video-listing-entry'})
+
+        # A successful page without entries also ends the listing; without
+        # this check the loop would never terminate on such a response.
+        if not video_list:
+            break
+
+        for video in video_list:
+            yield process_video(video)
+
+        page += 1
+
+#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
--- a/test.py
+++ b/test.py
@@ -5,6 +5,7 @@ import cisticola.scraper.gettr
 import cisticola.scraper.bitchute
 import cisticola.scraper.odysee
 import cisticola.scraper.gab
+import cisticola.scraper.rumble

 from sqlalchemy import create_engine

@@ -37,6 +38,11 @@ test_channels = [
                                    id=6, name="Capt. Marc Simon (test)", platform_id='marc_capt',
                                    category="test", followers=None, platform="Gab",
                                    url="https://gab.com/marc_capt", screenname='marc_capt', country="CA",
+                                    influencer=None, public=True, chat=False, notes=""),
+                cisticola.base.Channel(
+                                    id=7, name="we are uploading videos wow products and problem solving products.please share like and subscribe our channelwe are uploading videos wow products and problem solving products.please share like and subscribe our channel", platform_id='c-916305',
+                                    category="test", followers=None, platform="Rumble",
+                                    url="https://rumble.com/c/c-916305", screenname='we are uploading', country="CA",
                                    influencer=None, public=True, chat=False, notes="")]


@@ -60,6 +66,9 @@ controller.register_scraper(odysee)
 gab = cisticola.scraper.gab.GabScraper()
 controller.register_scraper(gab)

+rumble = cisticola.scraper.rumble.RumbleScraper()
+controller.register_scraper(rumble)
+
 engine = create_engine('sqlite:///test3.db')
 controller.connect_to_db(engine)