diff --git a/Pipfile b/Pipfile index d2c3af8..7ea75cd 100644 --- a/Pipfile +++ b/Pipfile @@ -16,7 +16,7 @@ snscrape = {git = "https://github.com/bellingcat/snscrape.git"} ffmpeg-python = "*" polyphemus = {git = "https://github.com/bellingcat/polyphemus.git"} garc = "*" - +youtube-dl = "*" [dev-packages] [requires] diff --git a/Pipfile.lock b/Pipfile.lock index e9c95cf..c66dfed 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "08623c70f7bb2da863def501ebdc6b0b2afab9865ef9e457b3137b8020314507" + "sha256": "ceba738b6c5ca9afd5fa79490cffde53d97ea7ec86034e0f31ecbc54fd418055" }, "pipfile-spec": 6, "requires": { @@ -49,19 +49,19 @@ }, "boto3": { "hashes": [ - "sha256:9b6903fe9cc92d2f6111db28675263f1ab45adbcf1483025c82a304ce7790b71", - "sha256:f2ce641957c1782e382548ced4a447189e45851bbe58c1f6752ff2b661527de7" + "sha256:32080e2d956b222f36b76f8fec532ec237ddb4a935dd1c9bb79c759fbe4a5868", + "sha256:bd7c71274e9257596879f99cff3d0f531b801e567e509b5e3d613bd2033a7279" ], "index": "pypi", - "version": "==1.21.8" + "version": "==1.21.9" }, "botocore": { "hashes": [ - "sha256:9fbc5c57b31850c51c87abc3e166ed4e0f343665bec4e1a0ff814fbc9704642c", - "sha256:a5431d806dc75fb1844463d921759fcd8d387674443af8d7fd0867f296b02759" + "sha256:8d41deb25e585b0d7b6ee8547990d5e95562f1dc5d3127af58459450b25c13c7", + "sha256:c44758c487df7a357c4a103d959962d78e225d1ab6c9eeda4c77f79a410ccd19" ], "markers": "python_version >= '3.6'", - "version": "==1.24.8" + "version": "==1.24.9" }, "bs4": { "hashes": [ @@ -763,9 +763,17 @@ "sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed", "sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'", + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4.0'", "version": "==1.26.8" }, + "youtube-dl": { + "hashes": [ + 
"sha256:bc59e86c5d15d887ac590454511f08ce2c47698d5a82c27bfe27b5d814bbaed2", + "sha256:f1336d5de68647e0364a47b3c0712578e59ec76f02048ff5c50ef1c69d79cd55" + ], + "index": "pypi", + "version": "==2021.12.17" + }, "zipp": { "hashes": [ "sha256:9f50f446828eb9d45b267433fd3e9da8d801f614129124863f9c51ebceafb87d", diff --git a/cisticola/scraper/gab.py b/cisticola/scraper/gab.py index 27a0924..e406078 100644 --- a/cisticola/scraper/gab.py +++ b/cisticola/scraper/gab.py @@ -2,9 +2,8 @@ import cisticola.base import cisticola.scraper.base from datetime import datetime import json -from typing import Generator, Tuple +from typing import Generator from garc import Garc -import tempfile class GabScraper(cisticola.scraper.base.Scraper): """An implementation of a Scraper for Gab, using GARC library""" @@ -22,7 +21,7 @@ class GabScraper(cisticola.scraper.base.Scraper): scraper = client.userposts(username) for post in scraper: - if since is not None and datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")).replace(tzinfo = None) <= since.date: + if since is not None and datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")) <= since.date: break media_urls = [] diff --git a/cisticola/scraper/gettr.py b/cisticola/scraper/gettr.py index 2e59e3c..3471f25 100644 --- a/cisticola/scraper/gettr.py +++ b/cisticola/scraper/gettr.py @@ -63,12 +63,13 @@ class GettrScraper(cisticola.scraper.base.Scraper): def m3u8_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]: - # Using mkv might be more robust: https://stackoverflow.com/a/42871067 content_type = 'video/mp4' ext = '.' 
"""Scraper for Rumble channels.

Channel listing pages are parsed with BeautifulSoup to discover videos, and
the video files themselves are downloaded with youtube-dl.
"""

from concurrent.futures import process  # NOTE(review): appears unused -- confirm and drop
from datetime import datetime
import itertools
import json
import tempfile
from typing import Generator, Tuple

from bs4 import BeautifulSoup
import requests
import youtube_dl

import cisticola.base
import cisticola.scraper.base

BASE_URL = 'https://rumble.com'

# Seconds to wait for Rumble to answer an HTTP request before giving up, so
# that a stalled connection cannot hang the scraper indefinitely.
REQUEST_TIMEOUT = 30


class RumbleScraper(cisticola.scraper.base.Scraper):
    """An implementation of a Scraper for Rumble, using custom functions"""
    __version__ = "RumbleScraper 0.0.1"

    @staticmethod
    def get_username_from_url(url):
        """Return the channel name from a ``https://rumble.com/c/<name>`` URL.

        Returns ``None`` when ``url`` does not contain the channel prefix or
        when nothing follows it, so callers can test the result against
        ``None`` instead of catching ``IndexError``.
        """
        parts = url.split(f'{BASE_URL}/c/')
        if len(parts) < 2 or not parts[1]:
            return None

        return parts[1]

    def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]:
        """Yield one ``ScraperResult`` per video of ``channel``.

        Videos are visited in the order the channel pages list them.
        Iteration stops at the first video whose date is not later than
        ``since.date`` (when ``since`` is given), which assumes the listing
        is newest-first.

        For each yielded video the media file is downloaded and archived via
        ``self.archive_media``.
        """
        username = RumbleScraper.get_username_from_url(channel.url)

        for post in get_channel_videos(username):
            # The same naive datetime is used for the cut-off comparison and
            # for the stored ``date`` so the two can never disagree.
            # NOTE(review): the UTC offset is discarded, not converted --
            # confirm this matches how other scrapers store dates.
            post_date = datetime.fromisoformat(post['datetime']).replace(tzinfo=None)

            if since is not None and post_date <= since.date:
                break

            media_url = post['media_url']
            media_blob, content_type, key = self.url_to_blob(media_url)
            archived_urls = {
                media_url: self.archive_media(media_blob, content_type, key),
            }

            yield cisticola.base.ScraperResult(
                scraper=self.__version__,
                platform="Rumble",
                channel=channel.id,
                platform_id=media_url.split('/')[-2],
                date=post_date,
                date_archived=datetime.now(),
                raw_data=json.dumps(post),
                archived_urls=archived_urls)

    def can_handle(self, channel):
        """Return ``True`` if ``channel`` is a Rumble channel with a usable URL."""
        return (
            channel.platform == "Rumble"
            and RumbleScraper.get_username_from_url(channel.url) is not None
        )

    def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
        """Download the video at ``url`` and return ``(blob, content_type, key)``.

        The video is fetched with youtube-dl into a temporary directory that
        is removed before returning. ``content_type`` is always
        ``'video/mp4'``. When ``key`` is not given it is built from the
        second-to-last path segment of ``url`` plus the ``.mp4`` extension.

        Raises:
            youtube_dl.utils.DownloadError: if youtube-dl cannot fetch the
                video.
        """
        content_type = 'video/mp4'
        ext = '.' + content_type.split('/')[-1]

        with tempfile.TemporaryDirectory() as temp_dir:
            ydl_opts = {
                "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
                "merge_output_format": "mp4",
                "outtmpl": f"{temp_dir}/%(id)s.%(ext)s",
                "noplaylist": True,
                "quiet": True,
                "verbose": False,
            }

            with youtube_dl.YoutubeDL(ydl_opts) as ydl:
                meta = ydl.extract_info(url, download=True)

            # Mirrors the "outtmpl" pattern above to locate the downloaded file.
            video_path = f"{temp_dir}/{meta['id']}.{meta['ext']}"
            with open(video_path, "rb") as f:
                blob = f.read()

        if key is None:
            key = url.split('/')[-2] + ext

        return blob, content_type, key


# --------------------------------------------------------------------------- #
# Page-parsing helpers
# --------------------------------------------------------------------------- #

def get_media_url(url):
    """Return the ``embedUrl`` from the JSON-LD metadata of a video page.

    Raises:
        requests.HTTPError: if the page request returns an error status.
        ValueError: if the page has no ``application/ld+json`` script tag.
    """
    r = requests.get(url, timeout=REQUEST_TIMEOUT)
    r.raise_for_status()

    soup = BeautifulSoup(r.content, features='lxml')
    metadata_tag = soup.find('script', {'type': 'application/ld+json'})
    if metadata_tag is None:
        raise ValueError(f'No application/ld+json metadata found at {url}')

    metadata = json.loads(metadata_tag.text)

    return metadata[0]['embedUrl']


def _data_value(video, css_class, default=None):
    """Return the ``data-value`` of the ``<span>`` with ``css_class``.

    Returns ``default`` when the span, or its ``data-value`` attribute, is
    missing from ``video``.
    """
    span = video.find('span', {'class': css_class})
    if span is None:
        return default

    return span.get('data-value', default)


def process_video(video):
    """Build the info dict for one video entry of a channel listing.

    ``video`` is the parsed listing element. The returned dict holds the
    keys ``title``, ``thumbnail``, ``link``, ``views``, ``rumbles``,
    ``duration``, ``datetime`` and ``media_url``. Fetching ``media_url``
    costs one extra HTTP request to the video page.
    """
    info = {
        'title': video.find('h3').text,
        'thumbnail': video.find('img')['src'],
        'link': BASE_URL + video.find('a', href=True)['href'],
        'views': _data_value(video, 'video-item--views'),
        'rumbles': _data_value(video, 'video-item--rumbles', default='0'),
        'duration': _data_value(video, 'video-item--duration'),
        'datetime': video.find('time')['datetime'],
    }

    info['media_url'] = get_media_url(info['link'])

    return info


def get_channel_videos(channel):
    """Yield an info dict (see ``process_video``) for every video of ``channel``.

    Listing pages are requested one by one starting at page 1. Iteration ends
    when a page returns HTTP 404 or contains no video entries; without the
    second condition a channel whose out-of-range pages return 200 would
    never terminate.

    Raises:
        requests.HTTPError: if a page returns an error status other than 404.
    """
    for page in itertools.count(1):
        r = requests.get(f'{BASE_URL}/c/{channel}?page={page}', timeout=REQUEST_TIMEOUT)

        if r.status_code == 404:
            break
        r.raise_for_status()

        soup = BeautifulSoup(r.content, features='lxml')
        video_list = soup.find_all('li', {'class': 'video-listing-entry'})

        if not video_list:
            break

        for video in video_list:
            yield process_video(video)