diff --git a/Pipfile b/Pipfile index 533ee24..2f4187e 100644 --- a/Pipfile +++ b/Pipfile @@ -13,6 +13,7 @@ dateparser = "*" sphinx = "*" boto3 = "*" snscrape = {git = "https://github.com/bellingcat/snscrape.git"} +ffmpeg-python = "*" [dev-packages] diff --git a/Pipfile.lock b/Pipfile.lock index 8a11934..0bdf685 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "d3ee112521273c2b0b9df074b4eb9a20649a2854bfffa433171749019acf8561" + "sha256": "f4f00b78a16b39eeb122566ec4cc6bf2dfeae044ae95a281e352e00850c74cc6" }, "pipfile-spec": 6, "requires": { @@ -41,19 +41,19 @@ }, "boto3": { "hashes": [ - "sha256:0e8d4d814f94031947035a4c2bb2c23832d5de941a6a492fb85794a02bafc44d", - "sha256:95d9b5b6fe3383fbf8f33d58f62258d3b3ea138d4369017031339b60fd5b8887" + "sha256:8f59383fe578ac9107466a464d7198933e5332d85a4790f2e01cf24a4a7f635b", + "sha256:af92931f65e33e7450c3389c6cc6ab6b3e2f619697ea5566aacc0f16f3b21f61" ], "index": "pypi", - "version": "==1.21.6" + "version": "==1.21.7" }, "botocore": { "hashes": [ - "sha256:359b9ea3870a1f8264113cb0b1216baa94bf1e8cee5d5d8af63a2e7ca6e7b33c", - "sha256:69aaa5a78ac7371f573e463be51fb962213c42fab08ef82380e84b9a87386949" + "sha256:5d1a2a2ac72461bbaa79317b3e4cb72c7ebb315aef184d90f72ec1f6dba0ca6c", + "sha256:a34118bfadc02903ab404148822fe5a6de7a3bb58943f1a6a19cc8b0446d2a50" ], "markers": "python_version >= '3.6'", - "version": "==1.24.6" + "version": "==1.24.7" }, "bs4": { "hashes": [ @@ -101,6 +101,14 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", "version": "==0.17.1" }, + "ffmpeg-python": { + "hashes": [ + "sha256:65225db34627c578ef0e11c8b1eb528bb35e024752f6f10b78c011f6f64c4127", + "sha256:ac441a0404e053f8b6a1113a77c0f452f1cfc62f6344a769475ffdc0f56c23c5" + ], + "index": "pypi", + "version": "==0.2.0" + }, "filelock": { "hashes": [ "sha256:9cd540a9352e432c7246a48fe4e8712b10acb1df2ad1f30e8c070b82ae1fed85", @@ -109,6 +117,13 @@ "markers": "python_version >= '3.7'", "version": "==3.6.0" }, + "future": { + "hashes": [ + "sha256:b1bead90b70cf6ec3f0710ae53a525360fa360d306a86583adc6bf83a4db537d" + ], + "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==0.18.2" + }, "gogettr": { "hashes": [ "sha256:9f5c90e3b1befe6eb561d4bca9ca124faddbe5787d8b429f02703c68dd51d255", @@ -175,7 +190,7 @@ "sha256:fa877ca7f6b48054f847b61d6fa7bed5cebb663ebc55e018fda12db09dcc664c", "sha256:fdcec0b8399108577ec290f55551d926d9a1fa6cad45882093a7a07ac5ec147b" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "markers": "python_version >= '3' and platform_machine == 'aarch64' or (platform_machine == 'ppc64le' or (platform_machine == 'x86_64' or (platform_machine == 'amd64' or (platform_machine == 'AMD64' or (platform_machine == 'win32' or platform_machine == 'WIN32')))))", "version": "==1.1.2" }, "idna": { @@ -474,9 +489,7 @@ "version": "==2022.1.18" }, "requests": { - "extras": [ - "socks" - ], + "extras": [], "hashes": [ "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61", "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d" @@ -486,11 +499,11 @@ }, "s3transfer": { "hashes": [ - "sha256:25c140f5c66aa79e1ac60be50dcd45ddc59e83895f062a3aab263b870102911f", - "sha256:69d264d3e760e569b78aaa0f22c97e955891cd22e32b10c51f784eeda4d9d10a" + "sha256:7a6f4c4d1fdb9a2b640244008e142cbc2cd3ae34b386584ef044dd0f27101971", + "sha256:95c58c194ce657a5f4fb0b9e60a84968c808888aed628cd98ab8771fe1db98ed" ], "markers": "python_version >= '3.6'", - "version": "==0.5.1" + "version": "==0.5.2" }, "six": { "hashes": [ diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index a41a0b2..9712a97 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -1,4 +1,4 @@ -from typing import Generator +from typing import Generator, Tuple import cisticola.base import requests import os @@ -24,7 +24,8 @@ class Scraper: def __str__(self): return self.__version__ - def archive_media(self, url: str, key: str = None) -> str: + def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]: + n_retries = 0 r = requests.get(url) @@ -38,13 +39,16 @@ class Scraper: return url blob = r.content - content_type = r.headers.get('Content-Type') if key is None: key = url.split('/')[-1] key = key.split('?')[0] + return blob, content_type, key + + def archive_media(self, blob: bytes, content_type: str, key: str) -> str: + filename = self.__version__.replace(' ', '_') + '/' + key self.s3_client.upload_fileobj(BytesIO(blob), Bucket=os.getenv( diff --git a/cisticola/scraper/bitchute.py b/cisticola/scraper/bitchute.py index c5c9f66..8063713 100644 --- a/cisticola/scraper/bitchute.py +++ b/cisticola/scraper/bitchute.py @@ -11,7 +11,7 @@ from bs4 import BeautifulSoup import cisticola.base -class BitchuteScraper(cisticola.scraper.Scraper): +class BitchuteScraper(cisticola.scraper.base.Scraper): """An implementation of a Scraper for Bitchute, using classes from the 4cat library""" __version__ = "BitchuteScraper 0.0.1" @@ -34,28 +34,33 @@ class BitchuteScraper(cisticola.scraper.Scraper): # Don't scrape comment information #TODO implement framework for processing and storing comments - detail = 'basic' + detail = 'comments' - posts = [] username = BitchuteScraper.get_username_from_url(channel.url) scraper = get_videos_user(session, username, csrftoken, detail) - for i, post in enumerate(scraper): + for post in scraper: - if since is not None and post['timestamp'] <= since.date_archived.timestamp(): - print( f'\n\nBREAK ON VIDEO: {i}\n\n') + if since is not None and datetime.fromtimestamp(post['timestamp']) <= since.date: break - posts.append(cisticola.base.ScraperResult( + archived_urls = {} + + if 'video_url' in post: + url = post['video_url'] + media_blob, content_type, key = self.url_to_blob(url) + archived_url = self.archive_media(media_blob, content_type, key) + archived_urls[url] = archived_url + + yield cisticola.base.ScraperResult( scraper=self.__version__, platform="Bitchute", channel=channel.id, platform_id=post['id'], date=datetime.fromtimestamp(post['timestamp']), date_archived=datetime.now(), - raw_data=json.dumps(post))) - - return posts + raw_data=json.dumps(post), + archived_urls=archived_urls) def can_handle(self, channel): if channel.platform == "Bitchute" and BitchuteScraper.get_username_from_url(channel.url) is not None: diff --git a/cisticola/scraper/gettr.py b/cisticola/scraper/gettr.py index 9a52a69..2e59e3c 100644 --- a/cisticola/scraper/gettr.py +++ b/cisticola/scraper/gettr.py @@ -2,9 +2,10 @@ import cisticola.base import cisticola.scraper.base from datetime import datetime import json -from typing import Generator +from typing import Generator, Tuple from gogettr import PublicClient - +import ffmpeg +import tempfile class GettrScraper(cisticola.scraper.base.Scraper): """An implementation of a Scraper for Gettr, using gogettr library""" __version__ = "GettrScraper 0.0.1" @@ -30,16 +31,20 @@ class GettrScraper(cisticola.scraper.base.Scraper): if 'imgs' in post: for img in post['imgs']: url = "https://media.gettr.com/" + img - archived_url = self.archive_media(url) + media_blob, content_type, key = self.url_to_blob(url) + archived_url = self.archive_media(media_blob, content_type, key) archived_urls[img] = archived_url if 'main' in post: - archived_url = self.archive_media("https://media.gettr.com/" + post['main']) + url = "https://media.gettr.com/" + post['main'] + media_blob, content_type, key = self.url_to_blob(url) + archived_url = self.archive_media(media_blob, content_type, key) archived_urls[post['main']] = archived_url - # TODO this is just archiving the playlist file, not the actual video if 'vid' in post: - archived_url = self.archive_media("https://media.gettr.com/" + post['vid']) + url = "https://media.gettr.com/" + post['vid'] + media_blob, content_type, key = self.m3u8_url_to_blob(url) + archived_url = self.archive_media(media_blob, content_type, key) archived_urls[post['vid']] = archived_url yield cisticola.base.ScraperResult( @@ -55,3 +60,26 @@ class GettrScraper(cisticola.scraper.base.Scraper): def can_handle(self, channel): if channel.platform == "Gettr" and GettrScraper.get_username_from_url(channel.url) is not None: return True + + def m3u8_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]: + + # Using mkv might be more robust: https://stackoverflow.com/a/42871067 + content_type = 'video/mp4' + ext = '.' + content_type.split('/')[-1] + + with tempfile.NamedTemporaryFile(suffix = ext) as temp_file: + + ( + ffmpeg + .input(url) + .output(temp_file.name, vcodec='copy') + .global_args('-loglevel', 'error') + .run(overwrite_output=True)) + + temp_file.seek(0) + blob = temp_file.read() + + if key is None: + key = url.split('/')[-2] + ext + + return blob, content_type, key \ No newline at end of file diff --git a/cisticola/scraper/telegram_snscrape.py b/cisticola/scraper/telegram_snscrape.py index bba9276..83b15b1 100644 --- a/cisticola/scraper/telegram_snscrape.py +++ b/cisticola/scraper/telegram_snscrape.py @@ -25,12 +25,14 @@ class TelegramSnscrapeScraper(cisticola.scraper.base.Scraper): archived_urls = {} for image_url in post.images: - archive_url = self.archive_media(image_url) - archived_urls[image_url] = archive_url + media_blob, content_type, key = self.url_to_blob(image_url) + archived_url = self.archive_media(media_blob, content_type, key) + archived_urls[image_url] = archived_url if post.video: - video_archive_url = self.archive_media(post.video) - archived_urls[post.video] = video_archive_url + media_blob, content_type, key = self.url_to_blob(post.video) + archived_url = self.archive_media(media_blob, content_type, key) + archived_urls[post.video] = archived_url yield cisticola.base.ScraperResult( scraper=self.__version__, diff --git a/cisticola/scraper/twitter.py b/cisticola/scraper/twitter.py index a43365c..e833ec3 100644 --- a/cisticola/scraper/twitter.py +++ b/cisticola/scraper/twitter.py @@ -41,7 +41,8 @@ class TwitterScraper(cisticola.scraper.base.Scraper): url = None if url is not None: - archived_url = self.archive_media(url) + media_blob, content_type, key = self.url_to_blob(url) + archived_url = self.archive_media(media_blob, content_type, key) archived_urls[url] = archived_url yield cisticola.base.ScraperResult( diff --git a/test.py b/test.py index 9c60fb0..1270f8b 100644 --- a/test.py +++ b/test.py @@ -2,6 +2,7 @@ import cisticola import cisticola.scraper.telegram_snscrape import cisticola.scraper.twitter import cisticola.scraper.gettr +import cisticola.scraper.bitchute from sqlalchemy import create_engine @@ -20,10 +21,11 @@ test_channels = [ category="qanon", followers=None, platform="Gettr", url="https://www.gettr.com/user/lizardrepublic", screenname="lizardrepublic", country="US", influencer=None, public=True, chat=False, notes=""), - cisticola.base.Channel(id=3, name="Patriot Front", platform_id='OVv9QZL4sEsC', - category="nazi", followers=None, platform="Bitchute", - url="https://www.bitchute.com/channel/OVv9QZL4sEsC/", screenname=None, country="US", - influencer=None, public=True, chat=False, notes=""),] + cisticola.base.Channel( + id=4, name="bestonlinejewelrystoresusa@gmail.com", platform_id='bestonlinejewelrystoresusagmailcom', + category="spam", followers=None, platform="Bitchute", + url="https://www.bitchute.com/channel/bestonlinejewelrystoresusagmailcom/", screenname=None, country="US", + influencer=None, public=True, chat=False, notes=""),] controller = cisticola.ScraperController() @@ -37,6 +39,9 @@ controller.register_scraper(telegram) gettr = cisticola.scraper.gettr.GettrScraper() controller.register_scraper(gettr) +bitchute = cisticola.scraper.bitchute.BitchuteScraper() +controller.register_scraper(gettr) + engine = create_engine('sqlite:///test3.db') controller.connect_to_db(engine)