From ef83cc4b0a1ee70eaef247516f405f51f3933708 Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Fri, 25 Feb 2022 13:43:30 -0600 Subject: [PATCH 1/9] converted bitchute to yield, got video archiving working on bitchute and gettr, added url_to_blob method that downloads media bytes blob from url and converted archive_media to take in the media bytes blob instead of the media url. --- Pipfile | 1 + Pipfile.lock | 41 +++++++++++++++++--------- cisticola/scraper/base.py | 10 +++++-- cisticola/scraper/bitchute.py | 25 +++++++++------- cisticola/scraper/gettr.py | 40 +++++++++++++++++++++---- cisticola/scraper/telegram_snscrape.py | 10 ++++--- cisticola/scraper/twitter.py | 3 +- test.py | 13 +++++--- 8 files changed, 101 insertions(+), 42 deletions(-) diff --git a/Pipfile b/Pipfile index 533ee24..2f4187e 100644 --- a/Pipfile +++ b/Pipfile @@ -13,6 +13,7 @@ dateparser = "*" sphinx = "*" boto3 = "*" snscrape = {git = "https://github.com/bellingcat/snscrape.git"} +ffmpeg-python = "*" [dev-packages] diff --git a/Pipfile.lock b/Pipfile.lock index 8a11934..0bdf685 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "d3ee112521273c2b0b9df074b4eb9a20649a2854bfffa433171749019acf8561" + "sha256": "f4f00b78a16b39eeb122566ec4cc6bf2dfeae044ae95a281e352e00850c74cc6" }, "pipfile-spec": 6, "requires": { @@ -41,19 +41,19 @@ }, "boto3": { "hashes": [ - "sha256:0e8d4d814f94031947035a4c2bb2c23832d5de941a6a492fb85794a02bafc44d", - "sha256:95d9b5b6fe3383fbf8f33d58f62258d3b3ea138d4369017031339b60fd5b8887" + "sha256:8f59383fe578ac9107466a464d7198933e5332d85a4790f2e01cf24a4a7f635b", + "sha256:af92931f65e33e7450c3389c6cc6ab6b3e2f619697ea5566aacc0f16f3b21f61" ], "index": "pypi", - "version": "==1.21.6" + "version": "==1.21.7" }, "botocore": { "hashes": [ - "sha256:359b9ea3870a1f8264113cb0b1216baa94bf1e8cee5d5d8af63a2e7ca6e7b33c", - "sha256:69aaa5a78ac7371f573e463be51fb962213c42fab08ef82380e84b9a87386949" + "sha256:5d1a2a2ac72461bbaa79317b3e4cb72c7ebb315aef184d90f72ec1f6dba0ca6c", + "sha256:a34118bfadc02903ab404148822fe5a6de7a3bb58943f1a6a19cc8b0446d2a50" ], "markers": "python_version >= '3.6'", - "version": "==1.24.6" + "version": "==1.24.7" }, "bs4": { "hashes": [ @@ -101,6 +101,14 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", "version": "==0.17.1" }, + "ffmpeg-python": { + "hashes": [ + "sha256:65225db34627c578ef0e11c8b1eb528bb35e024752f6f10b78c011f6f64c4127", + "sha256:ac441a0404e053f8b6a1113a77c0f452f1cfc62f6344a769475ffdc0f56c23c5" + ], + "index": "pypi", + "version": "==0.2.0" + }, "filelock": { "hashes": [ "sha256:9cd540a9352e432c7246a48fe4e8712b10acb1df2ad1f30e8c070b82ae1fed85", @@ -109,6 +117,13 @@ "markers": "python_version >= '3.7'", "version": "==3.6.0" }, + "future": { + "hashes": [ + "sha256:b1bead90b70cf6ec3f0710ae53a525360fa360d306a86583adc6bf83a4db537d" + ], + "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==0.18.2" + }, "gogettr": { "hashes": [ "sha256:9f5c90e3b1befe6eb561d4bca9ca124faddbe5787d8b429f02703c68dd51d255", @@ -175,7 +190,7 @@ "sha256:fa877ca7f6b48054f847b61d6fa7bed5cebb663ebc55e018fda12db09dcc664c", "sha256:fdcec0b8399108577ec290f55551d926d9a1fa6cad45882093a7a07ac5ec147b" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "markers": "python_version >= '3' and platform_machine == 'aarch64' or (platform_machine == 'ppc64le' or (platform_machine == 'x86_64' or (platform_machine == 'amd64' or (platform_machine == 'AMD64' or (platform_machine == 'win32' or platform_machine == 'WIN32')))))", "version": "==1.1.2" }, "idna": { @@ -474,9 +489,7 @@ "version": "==2022.1.18" }, "requests": { - "extras": [ - "socks" - ], + "extras": [], "hashes": [ "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61", "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d" @@ -486,11 +499,11 @@ }, "s3transfer": { "hashes": [ - "sha256:25c140f5c66aa79e1ac60be50dcd45ddc59e83895f062a3aab263b870102911f", - "sha256:69d264d3e760e569b78aaa0f22c97e955891cd22e32b10c51f784eeda4d9d10a" + "sha256:7a6f4c4d1fdb9a2b640244008e142cbc2cd3ae34b386584ef044dd0f27101971", + "sha256:95c58c194ce657a5f4fb0b9e60a84968c808888aed628cd98ab8771fe1db98ed" ], "markers": "python_version >= '3.6'", - "version": "==0.5.1" + "version": "==0.5.2" }, "six": { "hashes": [ diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index a41a0b2..9712a97 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -1,4 +1,4 @@ -from typing import Generator +from typing import Generator, Tuple import cisticola.base import requests import os @@ -24,7 +24,8 @@ class Scraper: def __str__(self): return self.__version__ - def archive_media(self, url: str, key: str = None) -> str: + def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]: + n_retries = 0 r = requests.get(url) @@ -38,13 +39,16 @@ class Scraper: return url blob = r.content - content_type = r.headers.get('Content-Type') if key is None: key = url.split('/')[-1] key = key.split('?')[0] + return blob, content_type, key + + def archive_media(self, blob: bytes, content_type: str, key: str) -> str: + filename = self.__version__.replace(' ', '_') + '/' + key self.s3_client.upload_fileobj(BytesIO(blob), Bucket=os.getenv( diff --git a/cisticola/scraper/bitchute.py b/cisticola/scraper/bitchute.py index c5c9f66..8063713 100644 --- a/cisticola/scraper/bitchute.py +++ b/cisticola/scraper/bitchute.py @@ -11,7 +11,7 @@ from bs4 import BeautifulSoup import cisticola.base -class BitchuteScraper(cisticola.scraper.Scraper): +class BitchuteScraper(cisticola.scraper.base.Scraper): """An implementation of a Scraper for Bitchute, using classes from the 4cat library""" __version__ = "BitchuteScraper 0.0.1" @@ -34,28 +34,33 @@ class BitchuteScraper(cisticola.scraper.Scraper): # Don't scrape comment information #TODO implement framework for processing and storing comments - detail = 'basic' + detail = 'comments' - posts = [] username = BitchuteScraper.get_username_from_url(channel.url) scraper = get_videos_user(session, username, csrftoken, detail) - for i, post in enumerate(scraper): + for post in scraper: - if since is not None and post['timestamp'] <= since.date_archived.timestamp(): - print( f'\n\nBREAK ON VIDEO: {i}\n\n') + if since is not None and datetime.fromtimestamp(post['timestamp']) <= since.date: break - posts.append(cisticola.base.ScraperResult( + archived_urls = {} + + if 'video_url' in post: + url = post['video_url'] + media_blob, content_type, key = self.url_to_blob(url) + archived_url = self.archive_media(media_blob, content_type, key) + archived_urls[url] = archived_url + + yield cisticola.base.ScraperResult( scraper=self.__version__, platform="Bitchute", channel=channel.id, platform_id=post['id'], date=datetime.fromtimestamp(post['timestamp']), date_archived=datetime.now(), - raw_data=json.dumps(post))) - - return posts + raw_data=json.dumps(post), + archived_urls=archived_urls) def can_handle(self, channel): if channel.platform == "Bitchute" and BitchuteScraper.get_username_from_url(channel.url) is not None: diff --git a/cisticola/scraper/gettr.py b/cisticola/scraper/gettr.py index 9a52a69..2e59e3c 100644 --- a/cisticola/scraper/gettr.py +++ b/cisticola/scraper/gettr.py @@ -2,9 +2,10 @@ import cisticola.base import cisticola.scraper.base from datetime import datetime import json -from typing import Generator +from typing import Generator, Tuple from gogettr import PublicClient - +import ffmpeg +import tempfile class GettrScraper(cisticola.scraper.base.Scraper): """An implementation of a Scraper for Gettr, using gogettr library""" __version__ = "GettrScraper 0.0.1" @@ -30,16 +31,20 @@ class GettrScraper(cisticola.scraper.base.Scraper): if 'imgs' in post: for img in post['imgs']: url = "https://media.gettr.com/" + img - archived_url = self.archive_media(url) + media_blob, content_type, key = self.url_to_blob(url) + archived_url = self.archive_media(media_blob, content_type, key) archived_urls[img] = archived_url if 'main' in post: - archived_url = self.archive_media("https://media.gettr.com/" + post['main']) + url = "https://media.gettr.com/" + post['main'] + media_blob, content_type, key = self.url_to_blob(url) + archived_url = self.archive_media(media_blob, content_type, key) archived_urls[post['main']] = archived_url - # TODO this is just archiving the playlist file, not the actual video if 'vid' in post: - archived_url = self.archive_media("https://media.gettr.com/" + post['vid']) + url = "https://media.gettr.com/" + post['vid'] + media_blob, content_type, key = self.m3u8_url_to_blob(url) + archived_url = self.archive_media(media_blob, content_type, key) archived_urls[post['vid']] = archived_url yield cisticola.base.ScraperResult( @@ -55,3 +60,26 @@ class GettrScraper(cisticola.scraper.base.Scraper): def can_handle(self, channel): if channel.platform == "Gettr" and GettrScraper.get_username_from_url(channel.url) is not None: return True + + def m3u8_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]: + + # Using mkv might be more robust: https://stackoverflow.com/a/42871067 + content_type = 'video/mp4' + ext = '.' + content_type.split('/')[-1] + + with tempfile.NamedTemporaryFile(suffix = ext) as temp_file: + + ( + ffmpeg + .input(url) + .output(temp_file.name, vcodec='copy') + .global_args('-loglevel', 'error') + .run(overwrite_output=True)) + + temp_file.seek(0) + blob = temp_file.read() + + if key is None: + key = url.split('/')[-2] + ext + + return blob, content_type, key \ No newline at end of file diff --git a/cisticola/scraper/telegram_snscrape.py b/cisticola/scraper/telegram_snscrape.py index bba9276..83b15b1 100644 --- a/cisticola/scraper/telegram_snscrape.py +++ b/cisticola/scraper/telegram_snscrape.py @@ -25,12 +25,14 @@ class TelegramSnscrapeScraper(cisticola.scraper.base.Scraper): archived_urls = {} for image_url in post.images: - archive_url = self.archive_media(image_url) - archived_urls[image_url] = archive_url + media_blob, content_type, key = self.url_to_blob(image_url) + archived_url = self.archive_media(media_blob, content_type, key) + archived_urls[image_url] = archived_url if post.video: - video_archive_url = self.archive_media(post.video) - archived_urls[post.video] = video_archive_url + media_blob, content_type, key = self.url_to_blob(post.video) + archived_url = self.archive_media(media_blob, content_type, key) + archived_urls[post.video] = archived_url yield cisticola.base.ScraperResult( scraper=self.__version__, diff --git a/cisticola/scraper/twitter.py b/cisticola/scraper/twitter.py index a43365c..e833ec3 100644 --- a/cisticola/scraper/twitter.py +++ b/cisticola/scraper/twitter.py @@ -41,7 +41,8 @@ class TwitterScraper(cisticola.scraper.base.Scraper): url = None if url is not None: - archived_url = self.archive_media(url) + media_blob, content_type, key = self.url_to_blob(url) + archived_url = self.archive_media(media_blob, content_type, key) archived_urls[url] = archived_url yield cisticola.base.ScraperResult( diff --git a/test.py b/test.py index 9c60fb0..1270f8b 100644 --- a/test.py +++ b/test.py @@ -2,6 +2,7 @@ import cisticola import cisticola.scraper.telegram_snscrape import cisticola.scraper.twitter import cisticola.scraper.gettr +import cisticola.scraper.bitchute from sqlalchemy import create_engine @@ -20,10 +21,11 @@ test_channels = [ category="qanon", followers=None, platform="Gettr", url="https://www.gettr.com/user/lizardrepublic", screenname="lizardrepublic", country="US", influencer=None, public=True, chat=False, notes=""), - cisticola.base.Channel(id=3, name="Patriot Front", platform_id='OVv9QZL4sEsC', - category="nazi", followers=None, platform="Bitchute", - url="https://www.bitchute.com/channel/OVv9QZL4sEsC/", screenname=None, country="US", - influencer=None, public=True, chat=False, notes=""),] + cisticola.base.Channel( + id=4, name="bestonlinejewelrystoresusa@gmail.com", platform_id='bestonlinejewelrystoresusagmailcom', + category="spam", followers=None, platform="Bitchute", + url="https://www.bitchute.com/channel/bestonlinejewelrystoresusagmailcom/", screenname=None, country="US", + influencer=None, public=True, chat=False, notes=""),] controller = cisticola.ScraperController() @@ -37,6 +39,9 @@ controller.register_scraper(telegram) gettr = cisticola.scraper.gettr.GettrScraper() controller.register_scraper(gettr) +bitchute = cisticola.scraper.bitchute.BitchuteScraper() +controller.register_scraper(gettr) + engine = create_engine('sqlite:///test3.db') controller.connect_to_db(engine) From 47dad8fb00489a875f8d5b90d17b75a272166f76 Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Fri, 25 Feb 2022 20:28:00 -0600 Subject: [PATCH 2/9] added odysee scraper, minor refactoring of url_to_blob method (added url_to_key method that can be overridden by child classes while still using the parent url_to_blob method) and changed test file to include only channels with a relatively small number of posts, to make testing faster --- Pipfile | 1 + Pipfile.lock | 74 +++++++++++++++++++++++++++++++++---- cisticola/scraper/base.py | 8 +++- cisticola/scraper/odysee.py | 53 ++++++++++++++++++++++++++ test.py | 29 ++++++++++----- 5 files changed, 145 insertions(+), 20 deletions(-) create mode 100644 cisticola/scraper/odysee.py diff --git a/Pipfile b/Pipfile index 2f4187e..3f094c5 100644 --- a/Pipfile +++ b/Pipfile @@ -14,6 +14,7 @@ sphinx = "*" boto3 = "*" snscrape = {git = "https://github.com/bellingcat/snscrape.git"} ffmpeg-python = "*" +polyphemus = {git = "https://github.com/bellingcat/polyphemus.git"} [dev-packages] diff --git a/Pipfile.lock b/Pipfile.lock index 0bdf685..75b94d7 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "f4f00b78a16b39eeb122566ec4cc6bf2dfeae044ae95a281e352e00850c74cc6" + "sha256": "263a7825d8113518c7a0690d5f69526cabe2dfa6ea572bb39cbe5d26495e619c" }, "pipfile-spec": 6, "requires": { @@ -41,19 +41,19 @@ }, "boto3": { "hashes": [ - "sha256:8f59383fe578ac9107466a464d7198933e5332d85a4790f2e01cf24a4a7f635b", - "sha256:af92931f65e33e7450c3389c6cc6ab6b3e2f619697ea5566aacc0f16f3b21f61" + "sha256:9b6903fe9cc92d2f6111db28675263f1ab45adbcf1483025c82a304ce7790b71", + "sha256:f2ce641957c1782e382548ced4a447189e45851bbe58c1f6752ff2b661527de7" ], "index": "pypi", - "version": "==1.21.7" + "version": "==1.21.8" }, "botocore": { "hashes": [ - "sha256:5d1a2a2ac72461bbaa79317b3e4cb72c7ebb315aef184d90f72ec1f6dba0ca6c", - "sha256:a34118bfadc02903ab404148822fe5a6de7a3bb58943f1a6a19cc8b0446d2a50" + "sha256:9fbc5c57b31850c51c87abc3e166ed4e0f343665bec4e1a0ff814fbc9704642c", + "sha256:a5431d806dc75fb1844463d921759fcd8d387674443af8d7fd0867f296b02759" ], "markers": "python_version >= '3.6'", - "version": "==1.24.7" + "version": "==1.24.8" }, "bs4": { "hashes": [ @@ -354,6 +354,31 @@ "markers": "python_version >= '3.7'", "version": "==2.1.0" }, + "numpy": { + "hashes": [ + "sha256:03ae5850619abb34a879d5f2d4bb4dcd025d6d8fb72f5e461dae84edccfe129f", + "sha256:076aee5a3763d41da6bef9565fdf3cb987606f567cd8b104aded2b38b7b47abf", + "sha256:0b536b6840e84c1c6a410f3a5aa727821e6108f3454d81a5cd5900999ef04f89", + "sha256:15efb7b93806d438e3bc590ca8ef2f953b0ce4f86f337ef4559d31ec6cf9d7dd", + "sha256:168259b1b184aa83a514f307352c25c56af111c269ffc109d9704e81f72e764b", + "sha256:2638389562bda1635b564490d76713695ff497242a83d9b684d27bb4a6cc9d7a", + "sha256:3556c5550de40027d3121ebbb170f61bbe19eb639c7ad0c7b482cd9b560cd23b", + "sha256:4a176959b6e7e00b5a0d6f549a479f869829bfd8150282c590deee6d099bbb6e", + "sha256:515a8b6edbb904594685da6e176ac9fbea8f73a5ebae947281de6613e27f1956", + "sha256:55535c7c2f61e2b2fc817c5cbe1af7cb907c7f011e46ae0a52caa4be1f19afe2", + "sha256:59153979d60f5bfe9e4c00e401e24dfe0469ef8da6d68247439d3278f30a180f", + "sha256:60cb8e5933193a3cc2912ee29ca331e9c15b2da034f76159b7abc520b3d1233a", + "sha256:6767ad399e9327bfdbaa40871be4254d1995f4a3ca3806127f10cec778bd9896", + "sha256:76a4f9bce0278becc2da7da3b8ef854bed41a991f4226911a24a9711baad672c", + "sha256:8cf33634b60c9cef346663a222d9841d3bbbc0a2f00221d6bcfd0d993d5543f6", + "sha256:94dd11d9f13ea1be17bac39c1942f527cbf7065f94953cf62dfe805653da2f8f", + "sha256:aafa46b5a39a27aca566198d3312fb3bde95ce9677085efd02c86f7ef6be4ec7", + "sha256:badca914580eb46385e7f7e4e426fea6de0a37b9e06bec252e481ae7ec287082", + "sha256:d76a26c5118c4d96e264acc9e3242d72e1a2b92e739807b3b69d8d47684b6677" + ], + "markers": "python_version < '3.10' and platform_machine != 'aarch64' and platform_machine != 'arm64'", + "version": "==1.22.2" + }, "packaging": { "hashes": [ "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb", @@ -362,6 +387,37 @@ "markers": "python_version >= '3.6'", "version": "==21.3" }, + "pandas": { + "hashes": [ + "sha256:0259cd11e7e6125aaea3af823b80444f3adad6149ff4c97fef760093598b3e34", + "sha256:04dd15d9db538470900c851498e532ef28d4e56bfe72c9523acb32042de43dfb", + "sha256:0b1a13f647e4209ed7dbb5da3497891d0045da9785327530ab696417ef478f84", + "sha256:19f7c632436b1b4f84615c3b127bbd7bc603db95e3d4332ed259dc815c9aaa26", + "sha256:1b384516dbb4e6aae30e3464c2e77c563da5980440fbdfbd0968e3942f8f9d70", + "sha256:1d85d5f6be66dfd6d1d8d13b9535e342a2214260f1852654b19fa4d7b8d1218b", + "sha256:2e5a7a1e0ecaac652326af627a3eca84886da9e667d68286866d4e33f6547caf", + "sha256:3129a35d9dad1d80c234dd78f8f03141b914395d23f97cf92a366dcd19f8f8bf", + "sha256:358b0bc98a5ff067132d23bf7a2242ee95db9ea5b7bbc401cf79205f11502fd3", + "sha256:3dfb32ed50122fe8c5e7f2b8d97387edd742cc78f9ec36f007ee126cd3720907", + "sha256:4e1176f45981c8ccc8161bc036916c004ca51037a7ed73f2d2a9857e6dbe654f", + "sha256:508c99debccd15790d526ce6b1624b97a5e1e4ca5b871319fb0ebfd46b8f4dad", + "sha256:6105af6533f8b63a43ea9f08a2ede04e8f43e49daef0209ab0d30352bcf08bee", + "sha256:6d6ad1da00c7cc7d8dd1559a6ba59ba3973be6b15722d49738b2be0977eb8a0c", + "sha256:7ea47ba1d6f359680130bd29af497333be6110de8f4c35b9211eec5a5a9630fa", + "sha256:8db93ec98ac7cb5f8ac1420c10f5e3c43533153f253fe7fb6d891cf5aa2b80d2", + "sha256:96e9ece5759f9b47ae43794b6359bbc54805d76e573b161ae770c1ea59393106", + "sha256:bbb15ad79050e8b8d39ec40dd96a30cd09b886a2ae8848d0df1abba4d5502a67", + "sha256:c614001129b2a5add5e3677c3a213a9e6fd376204cb8d17c04e84ff7dfc02a73", + "sha256:e6a7bbbb7950063bfc942f8794bc3e31697c020a14f1cd8905fc1d28ec674a01", + "sha256:f02e85e6d832be37d7f16cf6ac8bb26b519ace3e5f3235564a91c7f658ab2a43" + ], + "markers": "python_version >= '3.8'", + "version": "==1.4.1" + }, + "polyphemus": { + "git": "https://github.com/bellingcat/polyphemus.git", + "ref": "72ea0a63de4b40bf8038dfdb26cbbab87ba86da9" + }, "pygments": { "hashes": [ "sha256:44238f1b60a76d78fc8ca0528ee429702aae011c265fe6a8dd8b63049ae41c65", @@ -489,7 +545,9 @@ "version": "==2022.1.18" }, "requests": { - "extras": [], + "extras": [ + "socks" + ], "hashes": [ "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61", "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d" diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index 9712a97..7a7865e 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -24,6 +24,11 @@ class Scraper: def __str__(self): return self.__version__ + def url_to_key(self, url: str, content_type: str) -> str: + key = url.split('/')[-1] + key = key.split('?')[0] + return key + def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]: n_retries = 0 @@ -42,8 +47,7 @@ class Scraper: content_type = r.headers.get('Content-Type') if key is None: - key = url.split('/')[-1] - key = key.split('?')[0] + key = self.url_to_key(url, content_type) return blob, content_type, key diff --git a/cisticola/scraper/odysee.py b/cisticola/scraper/odysee.py new file mode 100644 index 0000000..dc496b2 --- /dev/null +++ b/cisticola/scraper/odysee.py @@ -0,0 +1,53 @@ +import cisticola.base +import cisticola.scraper.base +from datetime import datetime +import json +from typing import Generator +from polyphemus.base import OdyseeChannel + +class OdyseeScraper(cisticola.scraper.base.Scraper): + """An implementation of a Scraper for Odysee, using polyphemus library""" + __version__ = "OdyseeScraper 0.0.1" + + def get_username_from_url(url): + + username = url.split('odysee.com/')[-1].strip('@').split(':')[0] + + return username + + def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]: + + username = OdyseeScraper.get_username_from_url(channel.url) + odysee_channel = OdyseeChannel(channel_name = username) + + all_videos = odysee_channel.get_all_videos() + + for video in all_videos: + if since is not None and datetime.fromtimestamp(video['created']) <= since.date: + break + + archived_urls = {} + url = video.info['streaming_url'] + media_blob, content_type, key = self.url_to_blob(url) + archived_url = self.archive_media(media_blob, content_type, key) + archived_urls[url] = archived_url + + yield cisticola.base.ScraperResult( + scraper=self.__version__, + platform="Odysee", + channel=channel.id, + platform_id=video.info['claim_id'], + date=datetime.fromtimestamp(video.info['created']), + date_archived=datetime.now(), + raw_data=json.dumps(video.info), + archived_urls=archived_urls) + + def can_handle(self, channel): + if channel.platform == "Odysee" and OdyseeScraper.get_username_from_url(channel.url) is not None: + return True + + def url_to_key(self, url: str, content_type: str) -> str: + key = url.split('/')[-2] + ext = content_type.split('/')[-1] + + return f'{key}.{ext}' \ No newline at end of file diff --git a/test.py b/test.py index 1270f8b..eef8e29 100644 --- a/test.py +++ b/test.py @@ -3,6 +3,7 @@ import cisticola.scraper.telegram_snscrape import cisticola.scraper.twitter import cisticola.scraper.gettr import cisticola.scraper.bitchute +import cisticola.scraper.odysee from sqlalchemy import create_engine @@ -13,19 +14,24 @@ test_channels = [ url="https://twitter.com/obtusatum", screenname="obtusatum", country="US", influencer=None, public=True, chat=False, notes=""), - cisticola.base.Channel(id=1, name="JQHN SPARTAN", platform_id=-1001181961026, - category="qanon", followers=None, platform="Telegram", - url="https://t.me/jqhnspartan", screenname="jqhnspartan", country="FR", - influencer="JQNH SPARTAN", public=True, chat=False, notes=""), - cisticola.base.Channel(id=2, name="LizardRepublic", platform_id='lizardrepublic', - category="qanon", followers=None, platform="Gettr", + cisticola.base.Channel(id=1, name="South West Ohio Proud Boys (test)", platform_id=-1001276612436, + category="test", followers=None, platform="Telegram", + url="https://t.me/SouthwestOhioPB", screenname="SouthwestOhioPB", country="US", + influencer=None, public=True, chat=False, notes=""), + cisticola.base.Channel(id=2, name="LizardRepublic (test)", platform_id='lizardrepublic', + category="test", followers=None, platform="Gettr", url="https://www.gettr.com/user/lizardrepublic", screenname="lizardrepublic", country="US", influencer=None, public=True, chat=False, notes=""), cisticola.base.Channel( - id=4, name="bestonlinejewelrystoresusa@gmail.com", platform_id='bestonlinejewelrystoresusagmailcom', - category="spam", followers=None, platform="Bitchute", + id=4, name="bestonlinejewelrystoresusa@gmail.com (test)", platform_id='bestonlinejewelrystoresusagmailcom', + category="test", followers=None, platform="Bitchute", url="https://www.bitchute.com/channel/bestonlinejewelrystoresusagmailcom/", screenname=None, country="US", - influencer=None, public=True, chat=False, notes=""),] + influencer=None, public=True, chat=False, notes=""), + cisticola.base.Channel( + id=5, name="Mak1n' Bacon (test)", platform_id='Mak1nBacon', + category="test", followers=None, platform="Odysee", + url="https://odysee.com/@Mak1nBacon", screenname='Mak1nBacon', country="US", + influencer=None, public=True, chat=False, notes="")] controller = cisticola.ScraperController() @@ -40,7 +46,10 @@ gettr = cisticola.scraper.gettr.GettrScraper() controller.register_scraper(gettr) bitchute = cisticola.scraper.bitchute.BitchuteScraper() -controller.register_scraper(gettr) +controller.register_scraper(bitchute) + +odysee = cisticola.scraper.odysee.OdyseeScraper() +controller.register_scraper(odysee) engine = create_engine('sqlite:///test3.db') controller.connect_to_db(engine) From 7a257ea9f51749849a715f566323f17ee537b6f2 Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Mon, 28 Feb 2022 09:15:09 -0600 Subject: [PATCH 3/9] included comments in Odysee scraper --- Pipfile.lock | 10 +++++----- cisticola/scraper/odysee.py | 14 ++++++++++++++ 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/Pipfile.lock b/Pipfile.lock index 75b94d7..4690b31 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -211,11 +211,11 @@ }, "importlib-metadata": { "hashes": [ - "sha256:175f4ee440a0317f6e8d81b7f8d4869f93316170a65ad2b007d2929186c8052c", - "sha256:e0bc84ff355328a4adfc5240c4f211e0ab386f80aa640d1b11f0618a1d282094" + "sha256:b36ffa925fe3139b2f6ff11d6925ffd4fa7bc47870165e3ac260ac7b4f91e6ac", + "sha256:d16e8c1deb60de41b8e8ed21c1a7b947b0bc62fab7e1d470bcdf331cea2e6735" ], "markers": "python_version < '3.10'", - "version": "==4.11.1" + "version": "==4.11.2" }, "jinja2": { "hashes": [ @@ -416,7 +416,7 @@ }, "polyphemus": { "git": "https://github.com/bellingcat/polyphemus.git", - "ref": "72ea0a63de4b40bf8038dfdb26cbbab87ba86da9" + "ref": "18b89f19ecdd32e7dc8b5564b258a67165e680ca" }, "pygments": { "hashes": [ @@ -709,7 +709,7 @@ "sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed", "sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'", + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4.0'", "version": "==1.26.8" }, "zipp": { diff --git a/cisticola/scraper/odysee.py b/cisticola/scraper/odysee.py index dc496b2..1d9d4e5 100644 --- a/cisticola/scraper/odysee.py +++ b/cisticola/scraper/odysee.py @@ -32,6 +32,8 @@ class OdyseeScraper(cisticola.scraper.base.Scraper): archived_url = self.archive_media(media_blob, content_type, key) archived_urls[url] = archived_url + all_comments = video.get_all_comments() + yield cisticola.base.ScraperResult( scraper=self.__version__, platform="Odysee", @@ -42,6 +44,18 @@ class OdyseeScraper(cisticola.scraper.base.Scraper): raw_data=json.dumps(video.info), archived_urls=archived_urls) + for comment in all_comments: + + yield cisticola.base.ScraperResult( + scraper=self.__version__, + platform="Odysee", + channel=channel.id, + platform_id=comment.info['claim_id'], + date=datetime.fromtimestamp(comment.info['created']), + date_archived=datetime.now(), + raw_data=json.dumps(comment.info), + archived_urls=archived_urls) + def can_handle(self, channel): if channel.platform == "Odysee" and OdyseeScraper.get_username_from_url(channel.url) is not None: return True From bc840e631d34f6a3ebf013af5e221d8ac1c4fcb7 Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Mon, 28 Feb 2022 12:11:21 -0600 Subject: [PATCH 4/9] added Gab scraper --- Pipfile | 1 + Pipfile.lock | 58 +++++++++++++++++++++++++++++++++-- cisticola/scraper/base.py | 8 +++-- cisticola/scraper/bitchute.py | 2 +- cisticola/scraper/gab.py | 53 ++++++++++++++++++++++++++++++++ test.py | 9 ++++++ 6 files changed, 126 insertions(+), 5 deletions(-) create mode 100644 cisticola/scraper/gab.py diff --git a/Pipfile b/Pipfile index 3f094c5..d2c3af8 100644 --- a/Pipfile +++ b/Pipfile @@ -15,6 +15,7 @@ boto3 = "*" snscrape = {git = "https://github.com/bellingcat/snscrape.git"} ffmpeg-python = "*" polyphemus = {git = "https://github.com/bellingcat/polyphemus.git"} +garc = "*" [dev-packages] diff --git a/Pipfile.lock b/Pipfile.lock index 4690b31..e9c95cf 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "263a7825d8113518c7a0690d5f69526cabe2dfa6ea572bb39cbe5d26495e619c" + "sha256": "08623c70f7bb2da863def501ebdc6b0b2afab9865ef9e457b3137b8020314507" }, "pipfile-spec": 6, "requires": { @@ -23,6 +23,14 @@ ], "version": "==0.7.12" }, + "attrs": { + "hashes": [ + "sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4", + "sha256:626ba8234211db98e869df76230a137c4c40a12d72445c45d5f5b716f076e2fd" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==21.4.0" + }, "babel": { "hashes": [ "sha256:ab49e12b91d937cd11f0b67cb259a57ab4ad2b59ac7a3b41d6c06c0ac5b0def9", @@ -124,6 +132,13 @@ "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==0.18.2" }, + "garc": { + "hashes": [ + "sha256:6f1da8ccdb30b165b8d9247314b73d1002f60381480e61fdbf108dc9abf3c216" + ], + "index": "pypi", + "version": "==2.1" + }, "gogettr": { "hashes": [ "sha256:9f5c90e3b1befe6eb561d4bca9ca124faddbe5787d8b429f02703c68dd51d255", @@ -217,6 +232,13 @@ "markers": "python_version < '3.10'", "version": "==4.11.2" }, + "iniconfig": { + "hashes": [ + "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3", + "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32" + ], + "version": "==1.1.1" + }, "jinja2": { "hashes": [ "sha256:077ce6014f7b40d03b47d1f1ca4b0fc8328a692bd284016f806ed0eaca390ad8", @@ -414,10 +436,26 @@ "markers": "python_version >= '3.8'", "version": "==1.4.1" }, + "pluggy": { + "hashes": [ + "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159", + "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3" + ], + "markers": "python_version >= '3.6'", + "version": "==1.0.0" + }, "polyphemus": { "git": "https://github.com/bellingcat/polyphemus.git", "ref": "18b89f19ecdd32e7dc8b5564b258a67165e680ca" }, + "py": { + "hashes": [ + "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719", + "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==1.11.0" + }, "pygments": { "hashes": [ "sha256:44238f1b60a76d78fc8ca0528ee429702aae011c265fe6a8dd8b63049ae41c65", @@ -442,6 +480,14 @@ ], "version": "==1.7.1" }, + "pytest": { + "hashes": [ + "sha256:9ce3ff477af913ecf6321fe337b93a2c0dcf2a0a1439c43f5452112c1e4280db", + "sha256:e30905a0c131d3d94b89624a1cc5afec3e0ba2fbdb151867d8e0ebd49850f171" + ], + "markers": "python_version >= '3.6'", + "version": "==7.0.1" + }, "python-dateutil": { "hashes": [ "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86", @@ -688,6 +734,14 @@ "index": "pypi", "version": "==1.4.31" }, + "tomli": { + "hashes": [ + "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc", + "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f" + ], + "markers": "python_version >= '3.7'", + "version": "==2.0.1" + }, "tzdata": { "hashes": [ "sha256:3eee491e22ebfe1e5cfcc97a4137cd70f092ce59144d81f8924a844de05ba8f5", @@ -709,7 +763,7 @@ "sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed", "sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4.0'", + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'", "version": "==1.26.8" }, "zipp": { diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index 7a7865e..8b0bb90 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -19,6 +19,9 @@ class Scraper: 'DO_SPACES_KEY'), aws_secret_access_key=os.getenv('DO_SPACES_SECRET')) + self.headers = { + 'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0'} + pass def __str__(self): @@ -32,12 +35,13 @@ class Scraper: def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]: n_retries = 0 - r = requests.get(url) + + r = requests.get(url, headers = self.headers) while r.status_code != 200 and n_retries < 5: logger.warning(f"{n_retries}/5: Request for {url} failed") n_retries += 1 - r = requests.get(url) + r = requests.get(url, headers = self.headers) if r.status_code != 200: logger.error(f"Could not fetch URL {url}") diff --git a/cisticola/scraper/bitchute.py b/cisticola/scraper/bitchute.py index 8063713..9e7f202 100644 --- a/cisticola/scraper/bitchute.py +++ b/cisticola/scraper/bitchute.py @@ -26,7 +26,7 @@ class BitchuteScraper(cisticola.scraper.base.Scraper): def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]: session = requests.Session() - session.headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0" + session.headers.update(self.headers) request = session.get("https://www.bitchute.com/search") csrftoken = BeautifulSoup(request.text, 'html.parser').findAll( "input", {"name": "csrfmiddlewaretoken"})[0].get("value") diff --git a/cisticola/scraper/gab.py b/cisticola/scraper/gab.py new file mode 100644 index 0000000..27a0924 --- /dev/null +++ b/cisticola/scraper/gab.py @@ -0,0 +1,53 @@ +import cisticola.base +import cisticola.scraper.base +from datetime import datetime +import json +from typing import Generator, Tuple +from garc import Garc +import tempfile + +class GabScraper(cisticola.scraper.base.Scraper): + """An implementation of a Scraper for Gab, using GARC library""" + __version__ = "GabScraper 0.0.1" + + def get_username_from_url(url): + username = url.split('https://gab.com/')[-1] + + return username + + def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]: + client = Garc(profile = 'main') + username = GabScraper.get_username_from_url(channel.url) + + scraper = client.userposts(username) + + for post in scraper: + if since is not None and datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")).replace(tzinfo = None) <= since.date: + break + + media_urls = [] + archived_urls = {} + + media_urls.extend([p['url'] for p in post['media_attachments']]) + + if post.get('repost') is not None: + media_urls.extend([p['url'] for p in post['repost']['media_attachments']]) + + for url in media_urls: + media_blob, content_type, key = self.url_to_blob(url) + archived_url = self.archive_media(media_blob, content_type, key) + archived_urls[url] = archived_url + + yield cisticola.base.ScraperResult( + scraper=self.__version__, + platform="Gab", + channel=channel.id, + platform_id=post['id'], + date=datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")).replace(tzinfo = None), + date_archived=datetime.now(), + raw_data=json.dumps(post), + archived_urls=archived_urls) + + def can_handle(self, channel): + if channel.platform == "Gab" and GabScraper.get_username_from_url(channel.url) is not None: + return True \ No newline at end of file diff --git a/test.py b/test.py index eef8e29..dfcb2cd 100644 --- a/test.py +++ b/test.py @@ -4,6 +4,7 @@ import cisticola.scraper.twitter import cisticola.scraper.gettr import cisticola.scraper.bitchute import cisticola.scraper.odysee +import cisticola.scraper.gab from sqlalchemy import create_engine @@ -31,6 +32,11 @@ test_channels = [ id=5, name="Mak1n' Bacon (test)", platform_id='Mak1nBacon', category="test", followers=None, platform="Odysee", url="https://odysee.com/@Mak1nBacon", screenname='Mak1nBacon', country="US", + influencer=None, public=True, chat=False, notes=""), + cisticola.base.Channel( + id=6, name="Capt. Marc Simon (test)", platform_id='marc_capt', + category="test", followers=None, platform="Gab", + url="https://gab.com/marc_capt", screenname='marc_capt', country="CA", influencer=None, public=True, chat=False, notes="")] @@ -51,6 +57,9 @@ controller.register_scraper(bitchute) odysee = cisticola.scraper.odysee.OdyseeScraper() controller.register_scraper(odysee) +gab = cisticola.scraper.gab.GabScraper() +controller.register_scraper(gab) + engine = create_engine('sqlite:///test3.db') controller.connect_to_db(engine) From ee4d64750b5005559fe144b2450a79e658511bd8 Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Mon, 28 Feb 2022 18:38:33 -0600 Subject: [PATCH 5/9] added prototype Rumble scraper --- Pipfile | 2 +- Pipfile.lock | 24 ++++-- cisticola/scraper/gab.py | 5 +- cisticola/scraper/gettr.py | 3 +- cisticola/scraper/rumble.py | 143 ++++++++++++++++++++++++++++++++++++ test.py | 9 +++ 6 files changed, 173 insertions(+), 13 deletions(-) create mode 100644 cisticola/scraper/rumble.py diff --git a/Pipfile b/Pipfile index d2c3af8..7ea75cd 100644 --- a/Pipfile +++ b/Pipfile @@ -16,7 +16,7 @@ snscrape = {git = "https://github.com/bellingcat/snscrape.git"} ffmpeg-python = "*" polyphemus = {git = "https://github.com/bellingcat/polyphemus.git"} garc = "*" - +youtube-dl = "*" [dev-packages] [requires] diff --git a/Pipfile.lock b/Pipfile.lock index e9c95cf..c66dfed 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "08623c70f7bb2da863def501ebdc6b0b2afab9865ef9e457b3137b8020314507" + "sha256": "ceba738b6c5ca9afd5fa79490cffde53d97ea7ec86034e0f31ecbc54fd418055" }, "pipfile-spec": 6, "requires": { @@ -49,19 +49,19 @@ }, "boto3": { "hashes": [ - "sha256:9b6903fe9cc92d2f6111db28675263f1ab45adbcf1483025c82a304ce7790b71", - "sha256:f2ce641957c1782e382548ced4a447189e45851bbe58c1f6752ff2b661527de7" + "sha256:32080e2d956b222f36b76f8fec532ec237ddb4a935dd1c9bb79c759fbe4a5868", + "sha256:bd7c71274e9257596879f99cff3d0f531b801e567e509b5e3d613bd2033a7279" ], "index": "pypi", - "version": "==1.21.8" + "version": "==1.21.9" }, "botocore": { "hashes": [ - "sha256:9fbc5c57b31850c51c87abc3e166ed4e0f343665bec4e1a0ff814fbc9704642c", - "sha256:a5431d806dc75fb1844463d921759fcd8d387674443af8d7fd0867f296b02759" + "sha256:8d41deb25e585b0d7b6ee8547990d5e95562f1dc5d3127af58459450b25c13c7", + "sha256:c44758c487df7a357c4a103d959962d78e225d1ab6c9eeda4c77f79a410ccd19" ], "markers": "python_version >= '3.6'", - "version": "==1.24.8" + "version": "==1.24.9" }, "bs4": { "hashes": [ @@ -763,9 +763,17 @@ "sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed", "sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'", + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4.0'", "version": "==1.26.8" }, + "youtube-dl": { + "hashes": [ + "sha256:bc59e86c5d15d887ac590454511f08ce2c47698d5a82c27bfe27b5d814bbaed2", + "sha256:f1336d5de68647e0364a47b3c0712578e59ec76f02048ff5c50ef1c69d79cd55" + ], + "index": "pypi", + "version": "==2021.12.17" + }, "zipp": { "hashes": [ "sha256:9f50f446828eb9d45b267433fd3e9da8d801f614129124863f9c51ebceafb87d", diff --git a/cisticola/scraper/gab.py b/cisticola/scraper/gab.py index 27a0924..e406078 100644 --- a/cisticola/scraper/gab.py +++ b/cisticola/scraper/gab.py @@ -2,9 +2,8 @@ import cisticola.base import cisticola.scraper.base from datetime import datetime import json -from typing import Generator, Tuple +from typing import Generator from garc import Garc -import tempfile class GabScraper(cisticola.scraper.base.Scraper): """An implementation of a Scraper for Gab, using GARC library""" @@ -22,7 +21,7 @@ class GabScraper(cisticola.scraper.base.Scraper): scraper = client.userposts(username) for post in scraper: - if since is not None and datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")).replace(tzinfo = None) <= since.date: + if since is not None and datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")) <= since.date: break media_urls = [] diff --git a/cisticola/scraper/gettr.py b/cisticola/scraper/gettr.py index 2e59e3c..3471f25 100644 --- a/cisticola/scraper/gettr.py +++ b/cisticola/scraper/gettr.py @@ -63,12 +63,13 @@ class GettrScraper(cisticola.scraper.base.Scraper): def m3u8_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]: - # Using mkv might be more robust: https://stackoverflow.com/a/42871067 content_type = 'video/mp4' ext = '.' + content_type.split('/')[-1] with tempfile.NamedTemporaryFile(suffix = ext) as temp_file: + ydl_opts = {} + ( ffmpeg .input(url) diff --git a/cisticola/scraper/rumble.py b/cisticola/scraper/rumble.py new file mode 100644 index 0000000..98e7386 --- /dev/null +++ b/cisticola/scraper/rumble.py @@ -0,0 +1,143 @@ +from concurrent.futures import process +import cisticola.base +import cisticola.scraper.base +from datetime import datetime +import json +from typing import Generator, Tuple +import tempfile + +import requests +from bs4 import BeautifulSoup +import youtube_dl +import json + +BASE_URL = 'https://rumble.com' + +class RumbleScraper(cisticola.scraper.base.Scraper): + """An implementation of a Scraper for Rumble, using custom functions""" + __version__ = "RumbleScraper 0.0.1" + + def get_username_from_url(url): + username = url.split('https://rumble.com/c/')[1] + + return username + + def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]: + + username = RumbleScraper.get_username_from_url(channel.url) + scraper = get_channel_videos(username) + + for post in scraper: + if since is not None and datetime.fromtimestamp(post['cdate']*0.001) <= since.date: + break + + archived_urls = {} + + url = post['media_url'] + + media_blob, content_type, key = self.url_to_blob(url) + archived_url = self.archive_media(media_blob, content_type, key) + archived_urls[post['media_url']] = archived_url + + yield cisticola.base.ScraperResult( + scraper=self.__version__, + platform="Rumble", + channel=channel.id, + platform_id=post['media_url'].split('/')[-2], + date=datetime.fromisoformat(post['datetime']).replace(tzinfo=None), + date_archived=datetime.now(), + raw_data=json.dumps(post), + archived_urls=archived_urls) + + def can_handle(self, channel): + if channel.platform == "Rumble" and RumbleScraper.get_username_from_url(channel.url) is not None: + return True + + def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]: + + content_type = 'video/mp4' + ext = '.' + content_type.split('/')[-1] + + with tempfile.TemporaryDirectory() as temp_dir: + ydl_opts = { + "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best", + "merge_output_format": "mp4", + "outtmpl": f"{temp_dir}/%(id)s.%(ext)s", + "noplaylist": True, + 'quiet': True, + "verbose": False,} + ydl = youtube_dl.YoutubeDL(ydl_opts) + + try: + meta = ydl.extract_info( + url, + download=True,) + except youtube_dl.utils.DownloadError as e: + raise e + else: + video_id = meta["id"] + video_ext = meta["ext"] + + with open(f"{temp_dir}/{video_id}.{video_ext}", "rb") as f: + blob = f.read() + + if key is None: + key = url.split('/')[-2] + ext + + return blob, content_type, key + +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# + +def get_media_url(url): + + r = requests.get(url) + soup = BeautifulSoup(r.content, features = 'lxml') + + script = json.loads(''.join(soup.find('script', {'type':'application/ld+json'}).text)) + media_url = script[0]['embedUrl'] + + return media_url + +def process_video(video): + + rumble_soup = video.find('span', {'class' : 'video-item--rumbles'}) + if rumble_soup is None: + rumbles = '0' + else: + rumbles = rumble_soup['data-value'] + + info = { + 'title' : video.find('h3').text, + 'thumbnail' : video.find('img')['src'], + 'link' : BASE_URL + video.find('a', href = True)['href'], + 'views' : video.find('span', {'class' : 'video-item--views'})['data-value'], + 'rumbles' : rumbles, + 'duration' : video.find('span', {'class' : 'video-item--duration'})['data-value'], + 'datetime' : video.find('time')['datetime']} + + info['media_url'] = get_media_url(info['link']) + + return info + +def get_channel_videos(channel): + + page = 1 + channel_url = f'{BASE_URL}/c/{channel}?page=' + + while True: + url = channel_url + str(page) + r = requests.get(url) + + if r.status_code == 404: + break + + soup = BeautifulSoup(r.content, features = 'lxml') + + video_list = soup.find_all('li', {'class' : 'video-listing-entry'}) + + for video in video_list: + yield process_video(video) + + page += 1 + +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# \ No newline at end of file diff --git a/test.py b/test.py index dfcb2cd..d3bda39 100644 --- a/test.py +++ b/test.py @@ -5,6 +5,7 @@ import cisticola.scraper.gettr import cisticola.scraper.bitchute import cisticola.scraper.odysee import cisticola.scraper.gab +import cisticola.scraper.rumble from sqlalchemy import create_engine @@ -37,6 +38,11 @@ test_channels = [ id=6, name="Capt. Marc Simon (test)", platform_id='marc_capt', category="test", followers=None, platform="Gab", url="https://gab.com/marc_capt", screenname='marc_capt', country="CA", + influencer=None, public=True, chat=False, notes=""), + cisticola.base.Channel( + id=7, name="we are uploading videos wow products and problem solving products.please share like and subscribe our channelwe are uploading videos wow products and problem solving products.please share like and subscribe our channel", platform_id='c-916305', + category="test", followers=None, platform="Rumble", + url="https://rumble.com/c/c-916305", screenname='we are uploading', country="CA", influencer=None, public=True, chat=False, notes="")] @@ -60,6 +66,9 @@ controller.register_scraper(odysee) gab = cisticola.scraper.gab.GabScraper() controller.register_scraper(gab) +rumble = cisticola.scraper.rumble.RumbleScraper() +controller.register_scraper(rumble) + engine = create_engine('sqlite:///test3.db') controller.connect_to_db(engine) From f3d9dc91c64d31c8b7174311f905fa1e891ddd49 Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Tue, 1 Mar 2022 14:13:04 -0600 Subject: [PATCH 6/9] changed URL parsing to use urllib --- cisticola/scraper/base.py | 4 ++-- cisticola/scraper/gettr.py | 6 +++--- cisticola/scraper/odysee.py | 3 ++- cisticola/scraper/rumble.py | 5 +++-- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index 8b0bb90..c9e3fb7 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -4,6 +4,7 @@ import requests import os import boto3 from io import BytesIO +from urllib.parse import urlparse from loguru import logger class Scraper: @@ -28,8 +29,7 @@ class Scraper: return self.__version__ def url_to_key(self, url: str, content_type: str) -> str: - key = url.split('/')[-1] - key = key.split('?')[0] + key = urlparse(url).path.split('/')[-1] return key def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]: diff --git a/cisticola/scraper/gettr.py b/cisticola/scraper/gettr.py index 3471f25..66ec977 100644 --- a/cisticola/scraper/gettr.py +++ b/cisticola/scraper/gettr.py @@ -6,6 +6,8 @@ from typing import Generator, Tuple from gogettr import PublicClient import ffmpeg import tempfile +from urllib.parse import urlparse + class GettrScraper(cisticola.scraper.base.Scraper): """An implementation of a Scraper for Gettr, using gogettr library""" __version__ = "GettrScraper 0.0.1" @@ -68,8 +70,6 @@ class GettrScraper(cisticola.scraper.base.Scraper): with tempfile.NamedTemporaryFile(suffix = ext) as temp_file: - ydl_opts = {} - ( ffmpeg .input(url) @@ -81,6 +81,6 @@ class GettrScraper(cisticola.scraper.base.Scraper): blob = temp_file.read() if key is None: - key = url.split('/')[-2] + ext + key = urlparse(url).path.split('/')[-2] + ext return blob, content_type, key \ No newline at end of file diff --git a/cisticola/scraper/odysee.py b/cisticola/scraper/odysee.py index 1d9d4e5..2876a66 100644 --- a/cisticola/scraper/odysee.py +++ b/cisticola/scraper/odysee.py @@ -4,6 +4,7 @@ from datetime import datetime import json from typing import Generator from polyphemus.base import OdyseeChannel +from urllib.parse import urlparse class OdyseeScraper(cisticola.scraper.base.Scraper): """An implementation of a Scraper for Odysee, using polyphemus library""" @@ -61,7 +62,7 @@ class OdyseeScraper(cisticola.scraper.base.Scraper): return True def url_to_key(self, url: str, content_type: str) -> str: - key = url.split('/')[-2] + key = urlparse(url).path.split('/')[-2] ext = content_type.split('/')[-1] return f'{key}.{ext}' \ No newline at end of file diff --git a/cisticola/scraper/rumble.py b/cisticola/scraper/rumble.py index 98e7386..620dcc0 100644 --- a/cisticola/scraper/rumble.py +++ b/cisticola/scraper/rumble.py @@ -9,7 +9,8 @@ import tempfile import requests from bs4 import BeautifulSoup import youtube_dl -import json +import json +from urllib.parse import urlparse BASE_URL = 'https://rumble.com' @@ -82,7 +83,7 @@ class RumbleScraper(cisticola.scraper.base.Scraper): blob = f.read() if key is None: - key = url.split('/')[-2] + ext + key = urlparse(url).path.split('/')[-2] + ext return blob, content_type, key From 75240bb060b7fac806fde0d832579d385f335277 Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Tue, 1 Mar 2022 15:58:18 -0600 Subject: [PATCH 7/9] fixed various bugs related to archived URL creation and media downloading. Things seem to work well now --- cisticola/scraper/base.py | 25 ++++++++++++++++++++++++- cisticola/scraper/gettr.py | 25 +++---------------------- cisticola/scraper/odysee.py | 12 ++++++++++-- cisticola/scraper/twitter.py | 15 ++++++++++++++- 4 files changed, 51 insertions(+), 26 deletions(-) diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index c9e3fb7..465b1f7 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -6,7 +6,8 @@ import boto3 from io import BytesIO from urllib.parse import urlparse from loguru import logger - +import ffmpeg +import tempfile class Scraper: __version__ = "Scraper 0.0.0" @@ -55,6 +56,28 @@ class Scraper: return blob, content_type, key + def m3u8_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]: + + content_type = 'video/mp4' + ext = '.' + content_type.split('/')[-1] + + with tempfile.NamedTemporaryFile(suffix = ext) as temp_file: + + ( + ffmpeg + .input(url) + .output(temp_file.name, vcodec='copy') + .global_args('-loglevel', 'error') + .run(overwrite_output=True)) + + temp_file.seek(0) + blob = temp_file.read() + + if key is None: + key = self.url_to_key(url = url, content_type = content_type) + + return blob, content_type, key + def archive_media(self, blob: bytes, content_type: str, key: str) -> str: filename = self.__version__.replace(' ', '_') + '/' + key diff --git a/cisticola/scraper/gettr.py b/cisticola/scraper/gettr.py index 66ec977..cdcb6cf 100644 --- a/cisticola/scraper/gettr.py +++ b/cisticola/scraper/gettr.py @@ -4,8 +4,6 @@ from datetime import datetime import json from typing import Generator, Tuple from gogettr import PublicClient -import ffmpeg -import tempfile from urllib.parse import urlparse class GettrScraper(cisticola.scraper.base.Scraper): @@ -63,24 +61,7 @@ class GettrScraper(cisticola.scraper.base.Scraper): if channel.platform == "Gettr" and GettrScraper.get_username_from_url(channel.url) is not None: return True - def m3u8_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]: - - content_type = 'video/mp4' + def url_to_key(self, url: str, content_type: str) -> str: ext = '.' + content_type.split('/')[-1] - - with tempfile.NamedTemporaryFile(suffix = ext) as temp_file: - - ( - ffmpeg - .input(url) - .output(temp_file.name, vcodec='copy') - .global_args('-loglevel', 'error') - .run(overwrite_output=True)) - - temp_file.seek(0) - blob = temp_file.read() - - if key is None: - key = urlparse(url).path.split('/')[-2] + ext - - return blob, content_type, key \ No newline at end of file + key = urlparse(url).path.split('/')[-2] + ext + return key \ No newline at end of file diff --git a/cisticola/scraper/odysee.py b/cisticola/scraper/odysee.py index 2876a66..fc0c3da 100644 --- a/cisticola/scraper/odysee.py +++ b/cisticola/scraper/odysee.py @@ -5,6 +5,7 @@ import json from typing import Generator from polyphemus.base import OdyseeChannel from urllib.parse import urlparse +import requests class OdyseeScraper(cisticola.scraper.base.Scraper): """An implementation of a Scraper for Odysee, using polyphemus library""" @@ -29,7 +30,14 @@ class OdyseeScraper(cisticola.scraper.base.Scraper): archived_urls = {} url = video.info['streaming_url'] - media_blob, content_type, key = self.url_to_blob(url) + + # Check if file is a video file or an m3u8 file + r = requests.head(url) + if r.headers['Content-Type'] == 'text/html; charset=utf-8': + media_blob, content_type, key = self.m3u8_url_to_blob(url) + else: + media_blob, content_type, key = self.url_to_blob(url) + archived_url = self.archive_media(media_blob, content_type, key) archived_urls[url] = archived_url @@ -55,7 +63,7 @@ class OdyseeScraper(cisticola.scraper.base.Scraper): date=datetime.fromtimestamp(comment.info['created']), date_archived=datetime.now(), raw_data=json.dumps(comment.info), - archived_urls=archived_urls) + archived_urls={}) def can_handle(self, channel): if channel.platform == "Odysee" and OdyseeScraper.get_username_from_url(channel.url) is not None: diff --git a/cisticola/scraper/twitter.py b/cisticola/scraper/twitter.py index e833ec3..e36aab1 100644 --- a/cisticola/scraper/twitter.py +++ b/cisticola/scraper/twitter.py @@ -4,7 +4,7 @@ from datetime import datetime, timezone from typing import Generator import snscrape.modules from loguru import logger - +from urllib.parse import urlparse, parse_qs class TwitterScraper(cisticola.scraper.base.Scraper): """An implementation of a Scraper for Twitter, using snscrape library""" @@ -58,3 +58,16 @@ class TwitterScraper(cisticola.scraper.base.Scraper): def can_handle(self, channel): if channel.platform == "Twitter" and channel.platform_id: return True + + def url_to_key(self, url: str, content_type: str) -> str: + parsed_url = urlparse(url) + queries = parse_qs(parsed_url.query) + + # TODO might require additional statements for other media formats + if 'jpg' in queries.get('format', []): + ext = '.jpg' + elif parsed_url.path.endswith('.mp4'): + ext = '' + + key = parsed_url.path.split('/')[-1] + ext + return key \ No newline at end of file From c21e43ddfaa6e6cbd8fa1473941fe90112abe9b3 Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Fri, 4 Mar 2022 10:55:54 -0600 Subject: [PATCH 8/9] refactored import structure --- cisticola/__init__.py | 74 +------ cisticola/base.py | 5 +- cisticola/examples/russian_telegram_ingest.py | 131 +++++++++++++ cisticola/scraper/__init__.py | 8 + cisticola/scraper/base.py | 87 ++++++++- cisticola/scraper/bitchute.py | 12 +- cisticola/scraper/gab.py | 11 +- cisticola/scraper/gettr.py | 13 +- cisticola/scraper/odysee.py | 16 +- cisticola/scraper/rumble.py | 15 +- cisticola/scraper/telegram_snscrape.py | 13 +- cisticola/scraper/twitter.py | 24 +-- cisticola/transformer/__init__.py | 18 +- cisticola/transformer/base.py | 16 ++ cisticola/transformer/twitter.py | 10 +- test.py | 181 +++++++++++------- 16 files changed, 418 insertions(+), 216 deletions(-) create mode 100644 cisticola/examples/russian_telegram_ingest.py create mode 100644 cisticola/transformer/base.py diff --git a/cisticola/__init__.py b/cisticola/__init__.py index 2b873cc..505680b 100644 --- a/cisticola/__init__.py +++ b/cisticola/__init__.py @@ -1,71 +1,3 @@ -from typing import List -import cisticola.base -import cisticola.scraper.base -from sqlalchemy.orm import sessionmaker -from loguru import logger - - -class ScraperController: - """Registers scrapers, uses them to generate ScraperResults. Synchronizes - everything with database via ORM.""" - - def __init__(self): - self.scrapers = [] - self.session = None - self.mapper_registry = None - - def register_scraper(self, scraper: cisticola.scraper.base.Scraper): - self.scrapers.append(scraper) - - def scrape_channels(self, channels: List[cisticola.base.Channel]): - if self.session is None: - logger.error("No DB session") - return - - for channel in channels: - handled = False - - for scraper in self.scrapers: - if scraper.can_handle(channel): - session = self.session() - handled = True - added = 0 - - # get most recent post - session = self.session() - rows = session.query(cisticola.base.ScraperResult).where( - cisticola.base.ScraperResult.channel == channel.id).order_by( - cisticola.base.ScraperResult.date.desc()).limit(1).all() - - if len(rows) == 1: - since = rows[0] - else: - since = None - - posts = scraper.get_posts(channel, since=since) - - for post in posts: - session.add(post) - added += 1 - - session.commit() - logger.info( - f"{scraper} found {added} new posts from {channel}") - break - - if not handled: - logger.warning(f"No handler found for Channel {channel}") - - def connect_to_db(self, engine): - # create tables - cisticola.base.mapper_registry.metadata.create_all(bind=engine) - - self.session = sessionmaker() - self.session.configure(bind=engine) - - -class ETLController: - """This class will transform the raw_data tables into a format more conducive to analysis.""" - - def __init__(self): - pass +from . import base +from . import scraper +from . import transformer diff --git a/cisticola/base.py b/cisticola/base.py index 03a1641..97a18df 100644 --- a/cisticola/base.py +++ b/cisticola/base.py @@ -1,11 +1,12 @@ +from typing import List from dataclasses import dataclass from datetime import datetime + from sqlalchemy.orm import registry from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey mapper_registry = registry() - @dataclass class ScraperResult: """A minimally processed result from a scraper""" @@ -84,4 +85,4 @@ analysis_table = Table('analysis', mapper_registry.metadata, Column('author_username', String) ) -mapper_registry.map_imperatively(TransformedResult, analysis_table) +mapper_registry.map_imperatively(TransformedResult, analysis_table) \ No newline at end of file diff --git a/cisticola/examples/russian_telegram_ingest.py b/cisticola/examples/russian_telegram_ingest.py new file mode 100644 index 0000000..88affbc --- /dev/null +++ b/cisticola/examples/russian_telegram_ingest.py @@ -0,0 +1,131 @@ +from sqlalchemy import create_engine + +from cisticola.base import Channel +from cisticola.scraper import ( + ScraperController, + TelegramSnscrapeScraper) + +test_channels = [ + Channel( + id=0, + name="QAnon Россия", + platform_id=-1001319637748, + category="Qanon", + followers=94048, + platform="Telegram", + url="https://t.me/qanonrus", + screenname="qanonrus", + country="RU", + influencer=None, + public=True, + chat=False, + notes=""), + Channel( + id=1, + name="The Great Awakening | Q", + platform_id=-1001325597521, + category="Qanon", + followers=5715, + platform="Telegram", + url="https://t.me/greatawakin", + screenname="greatawakin", + country="RU", + influencer=None, + public=True, + chat=False, + notes=""), + Channel( + id=2, + name="Великое Пробуждение", + platform_id=-1001285898079, + category="Qanon", + followers=5861, + platform="Telegram", + url="https://t.me/greatawakeningrus", + screenname="greatawakeningrus", + country="RU", + influencer=None, + public=True, + chat=False, + notes=""), + Channel( + id=3, + name="T🕊Редакция Президент Гордон🕊", + platform_id=-1001101170442, + category="Qanon", + followers=5743, + platform="Telegram", + url="https://t.me/prezidentgordonteam", + screenname="prezidentgordonteam", + country="RU", + influencer=None, + public=True, + chat=False, + notes=""), + Channel( + id=4, + name="ПРОЕКТ АВРОРА", + platform_id=-1001279171101, + category="Qanon", + followers=5930, + platform="Telegram", + url="https://t.me/project_aurora", + screenname="project_aurora", + country="RU", + influencer=None, + public=True, + chat=False, + notes=""), + Channel( + id=5, + name="Сон Разума", + platform_id=-1001202338312, + category="Qanon", + followers=27099, + platform="Telegram", + url="https://t.me/error_288", + screenname="error_288", + country="RU", + influencer=None, + public=True, + chat=False, + notes=""), + Channel( + id=6, + name="Пробуждающий Мир - официальный канал", + platform_id=-1001492521207, + category="Qanon", + followers=19097, + platform="Telegram", + url="https://t.me/promirru", + screenname="promirru", + country="RU", + influencer=None, + public=True, + chat=False, + notes=""), + Channel( + id=7, + name="ЦЕЛЬНОЗОР", + platform_id=-1001642737506, + category="Qanon", + followers=13654, + platform="Telegram", + url="https://t.me/tselnozor", + screenname="tselnozor", + country="RU", + influencer=None, + public=True, + chat=False, + notes=""),] + +controller = ScraperController() + +telegram = TelegramSnscrapeScraper() +controller.register_scraper(telegram) + +engine = create_engine('sqlite:///russian_telegram.db') +controller.connect_to_db(engine) + +controller.scrape_channels(test_channels) + diff --git a/cisticola/scraper/__init__.py b/cisticola/scraper/__init__.py index e69de29..a41b781 100644 --- a/cisticola/scraper/__init__.py +++ b/cisticola/scraper/__init__.py @@ -0,0 +1,8 @@ +from .base import Scraper, ScraperController +from .bitchute import BitchuteScraper +from .gab import GabScraper +from .gettr import GettrScraper +from .odysee import OdyseeScraper +from .rumble import RumbleScraper +from .telegram_snscrape import TelegramSnscrapeScraper +from .twitter import TwitterScraper \ No newline at end of file diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index 465b1f7..934f365 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -1,13 +1,17 @@ -from typing import Generator, Tuple -import cisticola.base -import requests +from typing import Generator, Tuple, List import os -import boto3 from io import BytesIO from urllib.parse import urlparse +import tempfile + +import requests +import boto3 from loguru import logger import ffmpeg -import tempfile +from sqlalchemy.orm import sessionmaker + +from cisticola.base import Channel, ScraperResult, mapper_registry + class Scraper: __version__ = "Scraper 0.0.0" @@ -89,8 +93,77 @@ class Scraper: return archived_url - def can_handle(self, channel: cisticola.base.Channel) -> bool: + def can_handle(self, channel: Channel) -> bool: pass - def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]: + def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]: + pass + + +class ScraperController: + """Registers scrapers, uses them to generate ScraperResults. Synchronizes + everything with database via ORM.""" + + def __init__(self): + self.scrapers = [] + self.session = None + self.mapper_registry = None + + def register_scraper(self, scraper: Scraper): + self.scrapers.append(scraper) + + def register_scrapers(self, scraper: List[Scraper]): + self.scrapers.extend(scraper) + + def scrape_channels(self, channels: List[Channel]): + if self.session is None: + logger.error("No DB session") + return + + for channel in channels: + handled = False + + for scraper in self.scrapers: + if scraper.can_handle(channel): + session = self.session() + handled = True + added = 0 + + # get most recent post + session = self.session() + rows = session.query(ScraperResult).where( + ScraperResult.channel == channel.id).order_by( + ScraperResult.date.desc()).limit(1).all() + + if len(rows) == 1: + since = rows[0] + else: + since = None + + posts = scraper.get_posts(channel, since=since) + + for post in posts: + session.add(post) + added += 1 + + session.commit() + logger.info( + f"{scraper} found {added} new posts from {channel}") + break + + if not handled: + logger.warning(f"No handler found for Channel {channel}") + + def connect_to_db(self, engine): + # create tables + mapper_registry.metadata.create_all(bind=engine) + + self.session = sessionmaker() + self.session.configure(bind=engine) + + +class ETLController: + """This class will transform the raw_data tables into a format more conducive to analysis.""" + + def __init__(self): pass diff --git a/cisticola/scraper/bitchute.py b/cisticola/scraper/bitchute.py index 9e7f202..736f6c2 100644 --- a/cisticola/scraper/bitchute.py +++ b/cisticola/scraper/bitchute.py @@ -9,9 +9,9 @@ from typing import Generator import requests from bs4 import BeautifulSoup -import cisticola.base - -class BitchuteScraper(cisticola.scraper.base.Scraper): +from cisticola.base import Channel, ScraperResult +from cisticola.scraper.base import Scraper +class BitchuteScraper(Scraper): """An implementation of a Scraper for Bitchute, using classes from the 4cat library""" __version__ = "BitchuteScraper 0.0.1" @@ -23,7 +23,7 @@ class BitchuteScraper(cisticola.scraper.base.Scraper): return username - def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]: + def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]: session = requests.Session() session.headers.update(self.headers) @@ -32,8 +32,6 @@ class BitchuteScraper(cisticola.scraper.base.Scraper): "input", {"name": "csrfmiddlewaretoken"})[0].get("value") time.sleep(0.25) - # Don't scrape comment information - #TODO implement framework for processing and storing comments detail = 'comments' username = BitchuteScraper.get_username_from_url(channel.url) @@ -52,7 +50,7 @@ class BitchuteScraper(cisticola.scraper.base.Scraper): archived_url = self.archive_media(media_blob, content_type, key) archived_urls[url] = archived_url - yield cisticola.base.ScraperResult( + yield ScraperResult( scraper=self.__version__, platform="Bitchute", channel=channel.id, diff --git a/cisticola/scraper/gab.py b/cisticola/scraper/gab.py index e406078..b3cd5e4 100644 --- a/cisticola/scraper/gab.py +++ b/cisticola/scraper/gab.py @@ -1,11 +1,12 @@ -import cisticola.base -import cisticola.scraper.base from datetime import datetime import json from typing import Generator + from garc import Garc -class GabScraper(cisticola.scraper.base.Scraper): +from cisticola.base import Channel, ScraperResult +from cisticola.scraper.base import Scraper +class GabScraper(Scraper): """An implementation of a Scraper for Gab, using GARC library""" __version__ = "GabScraper 0.0.1" @@ -14,7 +15,7 @@ class GabScraper(cisticola.scraper.base.Scraper): return username - def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]: + def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]: client = Garc(profile = 'main') username = GabScraper.get_username_from_url(channel.url) @@ -37,7 +38,7 @@ class GabScraper(cisticola.scraper.base.Scraper): archived_url = self.archive_media(media_blob, content_type, key) archived_urls[url] = archived_url - yield cisticola.base.ScraperResult( + yield ScraperResult( scraper=self.__version__, platform="Gab", channel=channel.id, diff --git a/cisticola/scraper/gettr.py b/cisticola/scraper/gettr.py index cdcb6cf..4a1c206 100644 --- a/cisticola/scraper/gettr.py +++ b/cisticola/scraper/gettr.py @@ -1,12 +1,13 @@ -import cisticola.base -import cisticola.scraper.base from datetime import datetime import json from typing import Generator, Tuple -from gogettr import PublicClient from urllib.parse import urlparse -class GettrScraper(cisticola.scraper.base.Scraper): +from gogettr import PublicClient + +from cisticola.base import Channel, ScraperResult +from cisticola.scraper.base import Scraper +class GettrScraper(Scraper): """An implementation of a Scraper for Gettr, using gogettr library""" __version__ = "GettrScraper 0.0.1" @@ -17,7 +18,7 @@ class GettrScraper(cisticola.scraper.base.Scraper): return username - def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]: + def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]: client = PublicClient() username = GettrScraper.get_username_from_url(channel.url) scraper = client.user_activity(username=username, type="posts") @@ -47,7 +48,7 @@ class GettrScraper(cisticola.scraper.base.Scraper): archived_url = self.archive_media(media_blob, content_type, key) archived_urls[post['vid']] = archived_url - yield cisticola.base.ScraperResult( + yield ScraperResult( scraper=self.__version__, platform="Gettr", channel=channel.id, diff --git a/cisticola/scraper/odysee.py b/cisticola/scraper/odysee.py index fc0c3da..bd1d3aa 100644 --- a/cisticola/scraper/odysee.py +++ b/cisticola/scraper/odysee.py @@ -1,13 +1,15 @@ -import cisticola.base -import cisticola.scraper.base from datetime import datetime import json from typing import Generator -from polyphemus.base import OdyseeChannel from urllib.parse import urlparse + +from polyphemus.base import OdyseeChannel import requests -class OdyseeScraper(cisticola.scraper.base.Scraper): +from cisticola.base import Channel, ScraperResult +from cisticola.scraper.base import Scraper + +class OdyseeScraper(Scraper): """An implementation of a Scraper for Odysee, using polyphemus library""" __version__ = "OdyseeScraper 0.0.1" @@ -17,7 +19,7 @@ class OdyseeScraper(cisticola.scraper.base.Scraper): return username - def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]: + def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]: username = OdyseeScraper.get_username_from_url(channel.url) odysee_channel = OdyseeChannel(channel_name = username) @@ -43,7 +45,7 @@ class OdyseeScraper(cisticola.scraper.base.Scraper): all_comments = video.get_all_comments() - yield cisticola.base.ScraperResult( + yield ScraperResult( scraper=self.__version__, platform="Odysee", channel=channel.id, @@ -55,7 +57,7 @@ class OdyseeScraper(cisticola.scraper.base.Scraper): for comment in all_comments: - yield cisticola.base.ScraperResult( + yield ScraperResult( scraper=self.__version__, platform="Odysee", channel=channel.id, diff --git a/cisticola/scraper/rumble.py b/cisticola/scraper/rumble.py index 620dcc0..c0e5b92 100644 --- a/cisticola/scraper/rumble.py +++ b/cisticola/scraper/rumble.py @@ -1,20 +1,19 @@ -from concurrent.futures import process -import cisticola.base -import cisticola.scraper.base from datetime import datetime import json from typing import Generator, Tuple import tempfile +from urllib.parse import urlparse import requests from bs4 import BeautifulSoup import youtube_dl -import json -from urllib.parse import urlparse + +from cisticola.base import Channel, ScraperResult +from cisticola.scraper.base import Scraper BASE_URL = 'https://rumble.com' -class RumbleScraper(cisticola.scraper.base.Scraper): +class RumbleScraper(Scraper): """An implementation of a Scraper for Rumble, using custom functions""" __version__ = "RumbleScraper 0.0.1" @@ -23,7 +22,7 @@ class RumbleScraper(cisticola.scraper.base.Scraper): return username - def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]: + def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]: username = RumbleScraper.get_username_from_url(channel.url) scraper = get_channel_videos(username) @@ -40,7 +39,7 @@ class RumbleScraper(cisticola.scraper.base.Scraper): archived_url = self.archive_media(media_blob, content_type, key) archived_urls[post['media_url']] = archived_url - yield cisticola.base.ScraperResult( + yield ScraperResult( scraper=self.__version__, platform="Rumble", channel=channel.id, diff --git a/cisticola/scraper/telegram_snscrape.py b/cisticola/scraper/telegram_snscrape.py index 83b15b1..a176538 100644 --- a/cisticola/scraper/telegram_snscrape.py +++ b/cisticola/scraper/telegram_snscrape.py @@ -1,18 +1,19 @@ -import cisticola.base -import cisticola.scraper.base from typing import Generator -import snscrape.modules from datetime import datetime, timezone +import snscrape.modules -class TelegramSnscrapeScraper(cisticola.scraper.base.Scraper): +from cisticola.base import Channel, ScraperResult +from cisticola.scraper.base import Scraper + +class TelegramSnscrapeScraper(Scraper): __version__ = "TelegramSnscrapeScraper 0.0.1" def can_handle(self, channel): if channel.platform == "Telegram" and channel.public and not channel.chat: return True - def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]: + def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]: scr = snscrape.modules.telegram.TelegramChannelScraper( channel.screenname) @@ -34,7 +35,7 @@ class TelegramSnscrapeScraper(cisticola.scraper.base.Scraper): archived_url = self.archive_media(media_blob, content_type, key) archived_urls[post.video] = archived_url - yield cisticola.base.ScraperResult( + yield ScraperResult( scraper=self.__version__, platform="Telegram", channel=channel.id, diff --git a/cisticola/scraper/twitter.py b/cisticola/scraper/twitter.py index e36aab1..de72de2 100644 --- a/cisticola/scraper/twitter.py +++ b/cisticola/scraper/twitter.py @@ -1,17 +1,19 @@ -import cisticola.base -import cisticola.scraper.base from datetime import datetime, timezone from typing import Generator -import snscrape.modules -from loguru import logger from urllib.parse import urlparse, parse_qs -class TwitterScraper(cisticola.scraper.base.Scraper): +from snscrape.modules.twitter import TwitterProfileScraper, Video, Gif, Photo +from loguru import logger + +from cisticola.base import Channel, ScraperResult +from cisticola.scraper.base import Scraper + +class TwitterScraper(Scraper): """An implementation of a Scraper for Twitter, using snscrape library""" __version__ = "TwitterScraper 0.0.1" - def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]: - scraper = snscrape.modules.twitter.TwitterProfileScraper(channel.platform_id) + def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]: + scraper = TwitterProfileScraper(channel.platform_id) first = True @@ -28,13 +30,13 @@ class TwitterScraper(cisticola.scraper.base.Scraper): if tweet.media: for media in tweet.media: - if type(media) == snscrape.modules.twitter.Video: + if type(media) == Video: variant = max( [v for v in media.variants if v.bitrate], key=lambda v: v.bitrate) url = variant.url - elif type(media) == snscrape.modules.twitter.Gif: + elif type(media) == Gif: url = media.variants[0].url - elif type(media) == snscrape.modules.twitter.Photo: + elif type(media) == Photo: url = media.fullUrl else: logger.warning(f"Could not get media URL of {media}") @@ -45,7 +47,7 @@ class TwitterScraper(cisticola.scraper.base.Scraper): archived_url = self.archive_media(media_blob, content_type, key) archived_urls[url] = archived_url - yield cisticola.base.ScraperResult( + yield ScraperResult( scraper=self.__version__, platform="Twitter", channel=channel.id, diff --git a/cisticola/transformer/__init__.py b/cisticola/transformer/__init__.py index 1512950..e3a4b49 100644 --- a/cisticola/transformer/__init__.py +++ b/cisticola/transformer/__init__.py @@ -1,16 +1,2 @@ -import cisticola.base - -class Transformer: - """Interface class for transformers""" - - __version__ = "Transformer 0.0.0" - - def __init__(self): - pass - - def can_handle(data: cisticola.base.ScraperResult) -> bool: - pass - - def transform(data: cisticola.base.ScraperResult) -> cisticola.base.TransformedResult: - pass - +from . import base +from .twitter import TwitterTransformer \ No newline at end of file diff --git a/cisticola/transformer/base.py b/cisticola/transformer/base.py new file mode 100644 index 0000000..8005b4a --- /dev/null +++ b/cisticola/transformer/base.py @@ -0,0 +1,16 @@ +from cisticola.base import ScraperResult, TransformedResult + +class Transformer: + """Interface class for transformers""" + + __version__ = "Transformer 0.0.0" + + def __init__(self): + pass + + def can_handle(data: ScraperResult) -> bool: + pass + + def transform(data: ScraperResult) -> TransformedResult: + pass + diff --git a/cisticola/transformer/twitter.py b/cisticola/transformer/twitter.py index c79e01f..866a9fb 100644 --- a/cisticola/transformer/twitter.py +++ b/cisticola/transformer/twitter.py @@ -1,17 +1,17 @@ -import cisticola.transformer -import cisticola.base import json +from cisticola.transformer.base import Transformer +from cisticola.base import ScraperResult, TransformedResult -class TwitterTransformer(cisticola.transformer.Transformer): +class TwitterTransformer(Transformer): """A Twitter specific ScraperResult, with a method ETL/transforming""" __version__ = "TwitterTransformer 0.0.1" - def transform(self, data: cisticola.base.ScraperResult) -> cisticola.base.TransformedResult: + def transform(self, data: ScraperResult) -> TransformedResult: raw = json.loads(data.raw_data) - transformed = cisticola.base.TransformedResult( + transformed = TransformedResult( raw_id=data.id, scraper=data.scraper, transformer=self.__version__, diff --git a/test.py b/test.py index d3bda39..aec6b22 100644 --- a/test.py +++ b/test.py @@ -1,76 +1,127 @@ -import cisticola -import cisticola.scraper.telegram_snscrape -import cisticola.scraper.twitter -import cisticola.scraper.gettr -import cisticola.scraper.bitchute -import cisticola.scraper.odysee -import cisticola.scraper.gab -import cisticola.scraper.rumble - from sqlalchemy import create_engine +from cisticola.base import Channel +from cisticola.scraper import ( + ScraperController, + BitchuteScraper, + GabScraper, + GettrScraper, + OdyseeScraper, + RumbleScraper, + TelegramSnscrapeScraper, + TwitterScraper) test_channels = [ - cisticola.base.Channel(id=0, name="Logan Williams (test)", platform_id=891729132, - category="test", followers=None, platform="Twitter", - url="https://twitter.com/obtusatum", screenname="obtusatum", country="US", - influencer=None, public=True, chat=False, - notes=""), - cisticola.base.Channel(id=1, name="South West Ohio Proud Boys (test)", platform_id=-1001276612436, - category="test", followers=None, platform="Telegram", - url="https://t.me/SouthwestOhioPB", screenname="SouthwestOhioPB", country="US", - influencer=None, public=True, chat=False, notes=""), - cisticola.base.Channel(id=2, name="LizardRepublic (test)", platform_id='lizardrepublic', - category="test", followers=None, platform="Gettr", - url="https://www.gettr.com/user/lizardrepublic", screenname="lizardrepublic", country="US", - influencer=None, public=True, chat=False, notes=""), - cisticola.base.Channel( - id=4, name="bestonlinejewelrystoresusa@gmail.com (test)", platform_id='bestonlinejewelrystoresusagmailcom', - category="test", followers=None, platform="Bitchute", - url="https://www.bitchute.com/channel/bestonlinejewelrystoresusagmailcom/", screenname=None, country="US", - influencer=None, public=True, chat=False, notes=""), - cisticola.base.Channel( - id=5, name="Mak1n' Bacon (test)", platform_id='Mak1nBacon', - category="test", followers=None, platform="Odysee", - url="https://odysee.com/@Mak1nBacon", screenname='Mak1nBacon', country="US", - influencer=None, public=True, chat=False, notes=""), - cisticola.base.Channel( - id=6, name="Capt. Marc Simon (test)", platform_id='marc_capt', - category="test", followers=None, platform="Gab", - url="https://gab.com/marc_capt", screenname='marc_capt', country="CA", - influencer=None, public=True, chat=False, notes=""), - cisticola.base.Channel( - id=7, name="we are uploading videos wow products and problem solving products.please share like and subscribe our channelwe are uploading videos wow products and problem solving products.please share like and subscribe our channel", platform_id='c-916305', - category="test", followers=None, platform="Rumble", - url="https://rumble.com/c/c-916305", screenname='we are uploading', country="CA", - influencer=None, public=True, chat=False, notes="")] + Channel( + id=0, + name="Logan Williams (test)", + platform_id=891729132, + category="test", + followers=None, + platform="Twitter", + url="https://twitter.com/obtusatum", + screenname="obtusatum", + country="US", + influencer=None, + public=True, + chat=False, + notes=""), + Channel( + id=1, + name="South West Ohio Proud Boys (test)", + platform_id=-1001276612436, + category="test", + followers=None, + platform="Telegram", + url="https://t.me/SouthwestOhioPB", + screenname="SouthwestOhioPB", + country="US", + influencer=None, + public=True, + chat=False, + notes=""), + Channel( + id=2, + name="LizardRepublic (test)", + platform_id='lizardrepublic', + category="test", + followers=None, + platform="Gettr", + url="https://www.gettr.com/user/lizardrepublic", + screenname="lizardrepublic", + country="US", + influencer=None, + public=True, + chat=False, + notes=""), + Channel( + id=4, + name="bestonlinejewelrystoresusa@gmail.com (test)", platform_id='bestonlinejewelrystoresusagmailcom', + category="test", + followers=None, + platform="Bitchute", + url="https://www.bitchute.com/channel/bestonlinejewelrystoresusagmailcom/", screenname=None, + country="US", + influencer=None, + public=True, + chat=False, + notes=""), + Channel( + id=5, + name="Mak1n' Bacon (test)", + platform_id='Mak1nBacon', + category="test", + followers=None, + platform="Odysee", + url="https://odysee.com/@Mak1nBacon", + screenname='Mak1nBacon', + country="US", + influencer=None, + public=True, + chat=False, + notes=""), + Channel( + id=6, + name="Capt. Marc Simon (test)", + platform_id='marc_capt', + category="test", + followers=None, + platform="Gab", + url="https://gab.com/marc_capt", + screenname='marc_capt', + country="CA", + influencer=None, + public=True, + chat=False, + notes=""), + Channel( + id=7, + name="we are uploading videos wow products and problem solving products.please share like and subscribe our channelwe are uploading videos wow products and problem solving products.please share like and subscribe our channel", platform_id='c-916305', + category="test", + followers=None, + platform="Rumble", + url="https://rumble.com/c/c-916305", + screenname='we are uploading', + country="CA", + influencer=None, + public=True, + chat=False, + notes="")] +controller = ScraperController() -controller = cisticola.ScraperController() +scrapers = [ + BitchuteScraper(), + GabScraper(), + GettrScraper(), + OdyseeScraper(), + RumbleScraper(), + TelegramSnscrapeScraper(), + TwitterScraper()] -twitter = cisticola.scraper.twitter.TwitterScraper() -controller.register_scraper(twitter) - -telegram = cisticola.scraper.telegram_snscrape.TelegramSnscrapeScraper() -controller.register_scraper(telegram) - -gettr = cisticola.scraper.gettr.GettrScraper() -controller.register_scraper(gettr) - -bitchute = cisticola.scraper.bitchute.BitchuteScraper() -controller.register_scraper(bitchute) - -odysee = cisticola.scraper.odysee.OdyseeScraper() -controller.register_scraper(odysee) - -gab = cisticola.scraper.gab.GabScraper() -controller.register_scraper(gab) - -rumble = cisticola.scraper.rumble.RumbleScraper() -controller.register_scraper(rumble) +controller.register_scrapers(scrapers) engine = create_engine('sqlite:///test3.db') controller.connect_to_db(engine) -controller.scrape_channels(test_channels) - +controller.scrape_channels(test_channels) \ No newline at end of file From cd5f68e9e5ff5c168cd2e058d587a179fc88975f Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Fri, 4 Mar 2022 12:36:09 -0600 Subject: [PATCH 9/9] added basic unit tests --- .gitignore | 16 +- Pipfile | 5 + Pipfile.lock | 307 +++++++++++++----- cisticola/scraper/bitchute.py | 2 - .../russian_telegram_ingest.py | 0 pytest.ini | 13 + tests/__init__.py | 0 tests/conftest.py | 147 +++++++++ tests/scraper/bitchute.py | 8 + tests/scraper/gab.py | 8 + tests/scraper/gettr.py | 8 + tests/scraper/odysee.py | 8 + tests/scraper/rumble.py | 8 + tests/scraper/telegram_snscrape.py | 8 + tests/scraper/twitter.py | 8 + 15 files changed, 457 insertions(+), 89 deletions(-) rename {cisticola/examples => examples}/russian_telegram_ingest.py (100%) create mode 100644 pytest.ini create mode 100644 tests/__init__.py create mode 100644 tests/conftest.py create mode 100644 tests/scraper/bitchute.py create mode 100644 tests/scraper/gab.py create mode 100644 tests/scraper/gettr.py create mode 100644 tests/scraper/odysee.py create mode 100644 tests/scraper/rumble.py create mode 100644 tests/scraper/telegram_snscrape.py create mode 100644 tests/scraper/twitter.py diff --git a/.gitignore b/.gitignore index b107dd0..1f00762 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,17 @@ -.DS_Store +# Sphinx documentation +docs/build/ +docs/source/_* + +# Miscellaneous files +**/.DS_Store *.pyc *.ipynb *.db -docs/build/ -docs/source/_* .env + +# Unit test / coverage reports +reports +.coverage +.cache +.pytest_cache/ +cover/ \ No newline at end of file diff --git a/Pipfile b/Pipfile index 7ea75cd..c8eabfc 100644 --- a/Pipfile +++ b/Pipfile @@ -17,7 +17,12 @@ ffmpeg-python = "*" polyphemus = {git = "https://github.com/bellingcat/polyphemus.git"} garc = "*" youtube-dl = "*" + [dev-packages] +pytest = "*" +pytest-cov = "*" +pytest-html = "*" +pytest-metadata = "*" [requires] python_version = "3.9" diff --git a/Pipfile.lock b/Pipfile.lock index c66dfed..76b4c3f 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "ceba738b6c5ca9afd5fa79490cffde53d97ea7ec86034e0f31ecbc54fd418055" + "sha256": "ea2a1f1dff68fa0bd30dab06553e913f467c3b1399388b97f0ed913ab74c6e85" }, "pipfile-spec": 6, "requires": { @@ -49,19 +49,19 @@ }, "boto3": { "hashes": [ - "sha256:32080e2d956b222f36b76f8fec532ec237ddb4a935dd1c9bb79c759fbe4a5868", - "sha256:bd7c71274e9257596879f99cff3d0f531b801e567e509b5e3d613bd2033a7279" + "sha256:75709628320cea8ce137975dc33b75213c2e4f6e7cd09e55290de7245e2c79e2", + "sha256:c92ec20a670721b5a1bc013b305a84db2b7f9c716653b3056ce7e2fbd2a180ef" ], "index": "pypi", - "version": "==1.21.9" + "version": "==1.21.12" }, "botocore": { "hashes": [ - "sha256:8d41deb25e585b0d7b6ee8547990d5e95562f1dc5d3127af58459450b25c13c7", - "sha256:c44758c487df7a357c4a103d959962d78e225d1ab6c9eeda4c77f79a410ccd19" + "sha256:0174999a04b0a2e42457106093ace9b36fa94772a442d9bcf60750263d1d073e", + "sha256:0cd7395311a3fef4aad8df8f511b4f7d221c24ae30934bd5c03458b0fc096d0c" ], "markers": "python_version >= '3.6'", - "version": "==1.24.9" + "version": "==1.24.12" }, "bs4": { "hashes": [ @@ -446,7 +446,7 @@ }, "polyphemus": { "git": "https://github.com/bellingcat/polyphemus.git", - "ref": "18b89f19ecdd32e7dc8b5564b258a67165e680ca" + "ref": "8506fd43770661cdcf92c5cac2356cba74778834" }, "py": { "hashes": [ @@ -513,82 +513,83 @@ }, "regex": { "hashes": [ - "sha256:04611cc0f627fc4a50bc4a9a2e6178a974c6a6a4aa9c1cca921635d2c47b9c87", - "sha256:0b5d6f9aed3153487252d00a18e53f19b7f52a1651bc1d0c4b5844bc286dfa52", - "sha256:0d2f5c3f7057530afd7b739ed42eb04f1011203bc5e4663e1e1d01bb50f813e3", - "sha256:11772be1eb1748e0e197a40ffb82fb8fd0d6914cd147d841d9703e2bef24d288", - "sha256:1333b3ce73269f986b1fa4d5d395643810074dc2de5b9d262eb258daf37dc98f", - "sha256:16f81025bb3556eccb0681d7946e2b35ff254f9f888cff7d2120e8826330315c", - "sha256:1a171eaac36a08964d023eeff740b18a415f79aeb212169080c170ec42dd5184", - "sha256:1d6301f5288e9bdca65fab3de6b7de17362c5016d6bf8ee4ba4cbe833b2eda0f", - "sha256:1e031899cb2bc92c0cf4d45389eff5b078d1936860a1be3aa8c94fa25fb46ed8", - "sha256:1f8c0ae0a0de4e19fddaaff036f508db175f6f03db318c80bbc239a1def62d02", - "sha256:2245441445099411b528379dee83e56eadf449db924648e5feb9b747473f42e3", - "sha256:22709d701e7037e64dae2a04855021b62efd64a66c3ceed99dfd684bfef09e38", - "sha256:24c89346734a4e4d60ecf9b27cac4c1fee3431a413f7aa00be7c4d7bbacc2c4d", - "sha256:25716aa70a0d153cd844fe861d4f3315a6ccafce22b39d8aadbf7fcadff2b633", - "sha256:2dacb3dae6b8cc579637a7b72f008bff50a94cde5e36e432352f4ca57b9e54c4", - "sha256:34316bf693b1d2d29c087ee7e4bb10cdfa39da5f9c50fa15b07489b4ab93a1b5", - "sha256:36b2d700a27e168fa96272b42d28c7ac3ff72030c67b32f37c05616ebd22a202", - "sha256:37978254d9d00cda01acc1997513f786b6b971e57b778fbe7c20e30ae81a97f3", - "sha256:38289f1690a7e27aacd049e420769b996826f3728756859420eeee21cc857118", - "sha256:385ccf6d011b97768a640e9d4de25412204fbe8d6b9ae39ff115d4ff03f6fe5d", - "sha256:3c7ea86b9ca83e30fa4d4cd0eaf01db3ebcc7b2726a25990966627e39577d729", - "sha256:49810f907dfe6de8da5da7d2b238d343e6add62f01a15d03e2195afc180059ed", - "sha256:519c0b3a6fbb68afaa0febf0d28f6c4b0a1074aefc484802ecb9709faf181607", - "sha256:51f02ca184518702975b56affde6c573ebad4e411599005ce4468b1014b4786c", - "sha256:552a39987ac6655dad4bf6f17dd2b55c7b0c6e949d933b8846d2e312ee80005a", - "sha256:596f5ae2eeddb79b595583c2e0285312b2783b0ec759930c272dbf02f851ff75", - "sha256:6014038f52b4b2ac1fa41a58d439a8a00f015b5c0735a0cd4b09afe344c94899", - "sha256:61ebbcd208d78658b09e19c78920f1ad38936a0aa0f9c459c46c197d11c580a0", - "sha256:6213713ac743b190ecbf3f316d6e41d099e774812d470422b3a0f137ea635832", - "sha256:637e27ea1ebe4a561db75a880ac659ff439dec7f55588212e71700bb1ddd5af9", - "sha256:6aa427c55a0abec450bca10b64446331b5ca8f79b648531138f357569705bc4a", - "sha256:6ca45359d7a21644793de0e29de497ef7f1ae7268e346c4faf87b421fea364e6", - "sha256:6db1b52c6f2c04fafc8da17ea506608e6be7086715dab498570c3e55e4f8fbd1", - "sha256:752e7ddfb743344d447367baa85bccd3629c2c3940f70506eb5f01abce98ee68", - "sha256:760c54ad1b8a9b81951030a7e8e7c3ec0964c1cb9fee585a03ff53d9e531bb8e", - "sha256:768632fd8172ae03852e3245f11c8a425d95f65ff444ce46b3e673ae5b057b74", - "sha256:7a0b9f6a1a15d494b35f25ed07abda03209fa76c33564c09c9e81d34f4b919d7", - "sha256:7e070d3aef50ac3856f2ef5ec7214798453da878bb5e5a16c16a61edf1817cc3", - "sha256:7e12949e5071c20ec49ef00c75121ed2b076972132fc1913ddf5f76cae8d10b4", - "sha256:7e26eac9e52e8ce86f915fd33380f1b6896a2b51994e40bb094841e5003429b4", - "sha256:85ffd6b1cb0dfb037ede50ff3bef80d9bf7fa60515d192403af6745524524f3b", - "sha256:8618d9213a863c468a865e9d2ec50221015f7abf52221bc927152ef26c484b4c", - "sha256:8acef4d8a4353f6678fd1035422a937c2170de58a2b29f7da045d5249e934101", - "sha256:8d2f355a951f60f0843f2368b39970e4667517e54e86b1508e76f92b44811a8a", - "sha256:90b6840b6448203228a9d8464a7a0d99aa8fa9f027ef95fe230579abaf8a6ee1", - "sha256:9187500d83fd0cef4669385cbb0961e227a41c0c9bc39219044e35810793edf7", - "sha256:93c20777a72cae8620203ac11c4010365706062aa13aaedd1a21bb07adbb9d5d", - "sha256:93cce7d422a0093cfb3606beae38a8e47a25232eea0f292c878af580a9dc7605", - "sha256:94c623c331a48a5ccc7d25271399aff29729fa202c737ae3b4b28b89d2b0976d", - "sha256:97f32dc03a8054a4c4a5ab5d761ed4861e828b2c200febd4e46857069a483916", - "sha256:9a2bf98ac92f58777c0fafc772bf0493e67fcf677302e0c0a630ee517a43b949", - "sha256:a602bdc8607c99eb5b391592d58c92618dcd1537fdd87df1813f03fed49957a6", - "sha256:a9d24b03daf7415f78abc2d25a208f234e2c585e5e6f92f0204d2ab7b9ab48e3", - "sha256:abfcb0ef78df0ee9df4ea81f03beea41849340ce33a4c4bd4dbb99e23ec781b6", - "sha256:b013f759cd69cb0a62de954d6d2096d648bc210034b79b1881406b07ed0a83f9", - "sha256:b02e3e72665cd02afafb933453b0c9f6c59ff6e3708bd28d0d8580450e7e88af", - "sha256:b52cc45e71657bc4743a5606d9023459de929b2a198d545868e11898ba1c3f59", - "sha256:ba37f11e1d020969e8a779c06b4af866ffb6b854d7229db63c5fdddfceaa917f", - "sha256:bb804c7d0bfbd7e3f33924ff49757de9106c44e27979e2492819c16972ec0da2", - "sha256:bf594cc7cc9d528338d66674c10a5b25e3cde7dd75c3e96784df8f371d77a298", - "sha256:c38baee6bdb7fe1b110b6b3aaa555e6e872d322206b7245aa39572d3fc991ee4", - "sha256:c73d2166e4b210b73d1429c4f1ca97cea9cc090e5302df2a7a0a96ce55373f1c", - "sha256:c9099bf89078675c372339011ccfc9ec310310bf6c292b413c013eb90ffdcafc", - "sha256:cf0db26a1f76aa6b3aa314a74b8facd586b7a5457d05b64f8082a62c9c49582a", - "sha256:d19a34f8a3429bd536996ad53597b805c10352a8561d8382e05830df389d2b43", - "sha256:da80047524eac2acf7c04c18ac7a7da05a9136241f642dd2ed94269ef0d0a45a", - "sha256:de2923886b5d3214be951bc2ce3f6b8ac0d6dfd4a0d0e2a4d2e5523d8046fdfb", - "sha256:defa0652696ff0ba48c8aff5a1fac1eef1ca6ac9c660b047fc8e7623c4eb5093", - "sha256:e54a1eb9fd38f2779e973d2f8958fd575b532fe26013405d1afb9ee2374e7ab8", - "sha256:e5c31d70a478b0ca22a9d2d76d520ae996214019d39ed7dd93af872c7f301e52", - "sha256:ebaeb93f90c0903233b11ce913a7cb8f6ee069158406e056f884854c737d2442", - "sha256:ecfe51abf7f045e0b9cdde71ca9e153d11238679ef7b5da6c82093874adf3338", - "sha256:f99112aed4fb7cee00c7f77e8b964a9b10f69488cdff626ffd797d02e2e4484f", - "sha256:fd914db437ec25bfa410f8aa0aa2f3ba87cdfc04d9919d608d02330947afaeab" + "sha256:0008650041531d0eadecc96a73d37c2dc4821cf51b0766e374cb4f1ddc4e1c14", + "sha256:03299b0bcaa7824eb7c0ebd7ef1e3663302d1b533653bfe9dc7e595d453e2ae9", + "sha256:06b1df01cf2aef3a9790858af524ae2588762c8a90e784ba00d003f045306204", + "sha256:09b4b6ccc61d4119342b26246ddd5a04accdeebe36bdfe865ad87a0784efd77f", + "sha256:0be0c34a39e5d04a62fd5342f0886d0e57592a4f4993b3f9d257c1f688b19737", + "sha256:0d96eec8550fd2fd26f8e675f6d8b61b159482ad8ffa26991b894ed5ee19038b", + "sha256:0eb0e2845e81bdea92b8281a3969632686502565abf4a0b9e4ab1471c863d8f3", + "sha256:13bbf0c9453c6d16e5867bda7f6c0c7cff1decf96c5498318bb87f8136d2abd4", + "sha256:17e51ad1e6131c496b58d317bc9abec71f44eb1957d32629d06013a21bc99cac", + "sha256:1977bb64264815d3ef016625adc9df90e6d0e27e76260280c63eca993e3f455f", + "sha256:1e30762ddddb22f7f14c4f59c34d3addabc789216d813b0f3e2788d7bcf0cf29", + "sha256:1e73652057473ad3e6934944af090852a02590c349357b79182c1b681da2c772", + "sha256:20e6a27959f162f979165e496add0d7d56d7038237092d1aba20b46de79158f1", + "sha256:286ff9ec2709d56ae7517040be0d6c502642517ce9937ab6d89b1e7d0904f863", + "sha256:297c42ede2c81f0cb6f34ea60b5cf6dc965d97fa6936c11fc3286019231f0d66", + "sha256:320c2f4106962ecea0f33d8d31b985d3c185757c49c1fb735501515f963715ed", + "sha256:35ed2f3c918a00b109157428abfc4e8d1ffabc37c8f9abc5939ebd1e95dabc47", + "sha256:3d146e5591cb67c5e836229a04723a30af795ef9b70a0bbd913572e14b7b940f", + "sha256:42bb37e2b2d25d958c25903f6125a41aaaa1ed49ca62c103331f24b8a459142f", + "sha256:42d6007722d46bd2c95cce700181570b56edc0dcbadbfe7855ec26c3f2d7e008", + "sha256:43eba5c46208deedec833663201752e865feddc840433285fbadee07b84b464d", + "sha256:452519bc4c973e961b1620c815ea6dd8944a12d68e71002be5a7aff0a8361571", + "sha256:4b9c16a807b17b17c4fa3a1d8c242467237be67ba92ad24ff51425329e7ae3d0", + "sha256:5510932596a0f33399b7fff1bd61c59c977f2b8ee987b36539ba97eb3513584a", + "sha256:55820bc631684172b9b56a991d217ec7c2e580d956591dc2144985113980f5a3", + "sha256:57484d39447f94967e83e56db1b1108c68918c44ab519b8ecfc34b790ca52bf7", + "sha256:58ba41e462653eaf68fc4a84ec4d350b26a98d030be1ab24aba1adcc78ffe447", + "sha256:5bc5f921be39ccb65fdda741e04b2555917a4bced24b4df14eddc7569be3b493", + "sha256:5dcc4168536c8f68654f014a3db49b6b4a26b226f735708be2054314ed4964f4", + "sha256:5f92a7cdc6a0ae2abd184e8dfd6ef2279989d24c85d2c85d0423206284103ede", + "sha256:67250b36edfa714ba62dc62d3f238e86db1065fccb538278804790f578253640", + "sha256:6df070a986fc064d865c381aecf0aaff914178fdf6874da2f2387e82d93cc5bd", + "sha256:729aa8ca624c42f309397c5fc9e21db90bf7e2fdd872461aabdbada33de9063c", + "sha256:72bc3a5effa5974be6d965ed8301ac1e869bc18425c8a8fac179fbe7876e3aee", + "sha256:74d86e8924835f863c34e646392ef39039405f6ce52956d8af16497af4064a30", + "sha256:79e5af1ff258bc0fe0bdd6f69bc4ae33935a898e3cbefbbccf22e88a27fa053b", + "sha256:7b103dffb9f6a47ed7ffdf352b78cfe058b1777617371226c1894e1be443afec", + "sha256:83f03f0bd88c12e63ca2d024adeee75234d69808b341e88343b0232329e1f1a1", + "sha256:86d7a68fa53688e1f612c3246044157117403c7ce19ebab7d02daf45bd63913e", + "sha256:878c626cbca3b649e14e972c14539a01191d79e58934e3f3ef4a9e17f90277f8", + "sha256:878f5d649ba1db9f52cc4ef491f7dba2d061cdc48dd444c54260eebc0b1729b9", + "sha256:87bc01226cd288f0bd9a4f9f07bf6827134dc97a96c22e2d28628e824c8de231", + "sha256:8babb2b5751105dc0aef2a2e539f4ba391e738c62038d8cb331c710f6b0f3da7", + "sha256:91e0f7e7be77250b808a5f46d90bf0032527d3c032b2131b63dee54753a4d729", + "sha256:9557545c10d52c845f270b665b52a6a972884725aa5cf12777374e18f2ea8960", + "sha256:9ccb0a4ab926016867260c24c192d9df9586e834f5db83dfa2c8fffb3a6e5056", + "sha256:9d828c5987d543d052b53c579a01a52d96b86f937b1777bbfe11ef2728929357", + "sha256:9efa41d1527b366c88f265a227b20bcec65bda879962e3fc8a2aee11e81266d7", + "sha256:aaf5317c961d93c1a200b9370fb1c6b6836cc7144fef3e5a951326912bf1f5a3", + "sha256:ab69b4fe09e296261377d209068d52402fb85ef89dc78a9ac4a29a895f4e24a7", + "sha256:ad397bc7d51d69cb07ef89e44243f971a04ce1dca9bf24c992c362406c0c6573", + "sha256:ae17fc8103f3b63345709d3e9654a274eee1c6072592aec32b026efd401931d0", + "sha256:af4d8cc28e4c7a2f6a9fed544228c567340f8258b6d7ea815b62a72817bbd178", + "sha256:b22ff939a8856a44f4822da38ef4868bd3a9ade22bb6d9062b36957c850e404f", + "sha256:b549d851f91a4efb3e65498bd4249b1447ab6035a9972f7fc215eb1f59328834", + "sha256:be319f4eb400ee567b722e9ea63d5b2bb31464e3cf1b016502e3ee2de4f86f5c", + "sha256:c0446b2871335d5a5e9fcf1462f954586b09a845832263db95059dcd01442015", + "sha256:c68d2c04f7701a418ec2e5631b7f3552efc32f6bcc1739369c6eeb1af55f62e0", + "sha256:c87ac58b9baaf50b6c1b81a18d20eda7e2883aa9a4fb4f1ca70f2e443bfcdc57", + "sha256:caa2734ada16a44ae57b229d45091f06e30a9a52ace76d7574546ab23008c635", + "sha256:cb34c2d66355fb70ae47b5595aafd7218e59bb9c00ad8cc3abd1406ca5874f07", + "sha256:cb3652bbe6720786b9137862205986f3ae54a09dec8499a995ed58292bdf77c2", + "sha256:cf668f26604e9f7aee9f8eaae4ca07a948168af90b96be97a4b7fa902a6d2ac1", + "sha256:d326ff80ed531bf2507cba93011c30fff2dd51454c85f55df0f59f2030b1687b", + "sha256:d6c2441538e4fadd4291c8420853431a229fcbefc1bf521810fbc2629d8ae8c2", + "sha256:d6ecfd1970b3380a569d7b3ecc5dd70dba295897418ed9e31ec3c16a5ab099a5", + "sha256:e5602a9b5074dcacc113bba4d2f011d2748f50e3201c8139ac5b68cf2a76bd8b", + "sha256:ef806f684f17dbd6263d72a54ad4073af42b42effa3eb42b877e750c24c76f86", + "sha256:f3356afbb301ec34a500b8ba8b47cba0b44ed4641c306e1dd981a08b416170b5", + "sha256:f6f7ee2289176cb1d2c59a24f50900f8b9580259fa9f1a739432242e7d254f93", + "sha256:f7e8f1ee28e0a05831c92dc1c0c1c94af5289963b7cf09eca5b5e3ce4f8c91b0", + "sha256:f8169ec628880bdbca67082a9196e2106060a4a5cbd486ac51881a4df805a36f", + "sha256:fbc88d3ba402b5d041d204ec2449c4078898f89c4a6e6f0ed1c1a510ef1e221d", + "sha256:fbd3fe37353c62fd0eb19fb76f78aa693716262bcd5f9c14bb9e5aca4b3f0dc4" ], - "version": "==2022.1.18" + "markers": "python_version >= '3.6'", + "version": "==2022.3.2" }, "requests": { "extras": [ @@ -783,5 +784,143 @@ "version": "==3.7.0" } }, - "develop": {} + "develop": { + "attrs": { + "hashes": [ + "sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4", + "sha256:626ba8234211db98e869df76230a137c4c40a12d72445c45d5f5b716f076e2fd" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==21.4.0" + }, + "coverage": { + "extras": [ + "toml" + ], + "hashes": [ + "sha256:03e2a7826086b91ef345ff18742ee9fc47a6839ccd517061ef8fa1976e652ce9", + "sha256:07e6db90cd9686c767dcc593dff16c8c09f9814f5e9c51034066cad3373b914d", + "sha256:18d520c6860515a771708937d2f78f63cc47ab3b80cb78e86573b0a760161faf", + "sha256:1ebf730d2381158ecf3dfd4453fbca0613e16eaa547b4170e2450c9707665ce7", + "sha256:21b7745788866028adeb1e0eca3bf1101109e2dc58456cb49d2d9b99a8c516e6", + "sha256:26e2deacd414fc2f97dd9f7676ee3eaecd299ca751412d89f40bc01557a6b1b4", + "sha256:2c6dbb42f3ad25760010c45191e9757e7dce981cbfb90e42feef301d71540059", + "sha256:2fea046bfb455510e05be95e879f0e768d45c10c11509e20e06d8fcaa31d9e39", + "sha256:34626a7eee2a3da12af0507780bb51eb52dca0e1751fd1471d0810539cefb536", + "sha256:37d1141ad6b2466a7b53a22e08fe76994c2d35a5b6b469590424a9953155afac", + "sha256:46191097ebc381fbf89bdce207a6c107ac4ec0890d8d20f3360345ff5976155c", + "sha256:4dd8bafa458b5c7d061540f1ee9f18025a68e2d8471b3e858a9dad47c8d41903", + "sha256:4e21876082ed887baed0146fe222f861b5815455ada3b33b890f4105d806128d", + "sha256:58303469e9a272b4abdb9e302a780072c0633cdcc0165db7eec0f9e32f901e05", + "sha256:5ca5aeb4344b30d0bec47481536b8ba1181d50dbe783b0e4ad03c95dc1296684", + "sha256:68353fe7cdf91f109fc7d474461b46e7f1f14e533e911a2a2cbb8b0fc8613cf1", + "sha256:6f89d05e028d274ce4fa1a86887b071ae1755082ef94a6740238cd7a8178804f", + "sha256:7a15dc0a14008f1da3d1ebd44bdda3e357dbabdf5a0b5034d38fcde0b5c234b7", + "sha256:8bdde1177f2311ee552f47ae6e5aa7750c0e3291ca6b75f71f7ffe1f1dab3dca", + "sha256:8ce257cac556cb03be4a248d92ed36904a59a4a5ff55a994e92214cde15c5bad", + "sha256:8cf5cfcb1521dc3255d845d9dca3ff204b3229401994ef8d1984b32746bb45ca", + "sha256:8fbbdc8d55990eac1b0919ca69eb5a988a802b854488c34b8f37f3e2025fa90d", + "sha256:9548f10d8be799551eb3a9c74bbf2b4934ddb330e08a73320123c07f95cc2d92", + "sha256:96f8a1cb43ca1422f36492bebe63312d396491a9165ed3b9231e778d43a7fca4", + "sha256:9b27d894748475fa858f9597c0ee1d4829f44683f3813633aaf94b19cb5453cf", + "sha256:9baff2a45ae1f17c8078452e9e5962e518eab705e50a0aa8083733ea7d45f3a6", + "sha256:a2a8b8bcc399edb4347a5ca8b9b87e7524c0967b335fbb08a83c8421489ddee1", + "sha256:acf53bc2cf7282ab9b8ba346746afe703474004d9e566ad164c91a7a59f188a4", + "sha256:b0be84e5a6209858a1d3e8d1806c46214e867ce1b0fd32e4ea03f4bd8b2e3359", + "sha256:b31651d018b23ec463e95cf10070d0b2c548aa950a03d0b559eaa11c7e5a6fa3", + "sha256:b78e5afb39941572209f71866aa0b206c12f0109835aa0d601e41552f9b3e620", + "sha256:c76aeef1b95aff3905fb2ae2d96e319caca5b76fa41d3470b19d4e4a3a313512", + "sha256:dd035edafefee4d573140a76fdc785dc38829fe5a455c4bb12bac8c20cfc3d69", + "sha256:dd6fe30bd519694b356cbfcaca9bd5c1737cddd20778c6a581ae20dc8c04def2", + "sha256:e5f4e1edcf57ce94e5475fe09e5afa3e3145081318e5fd1a43a6b4539a97e518", + "sha256:ec6bc7fe73a938933d4178c9b23c4e0568e43e220aef9472c4f6044bfc6dd0f0", + "sha256:f1555ea6d6da108e1999b2463ea1003fe03f29213e459145e70edbaf3e004aaa", + "sha256:f5fa5803f47e095d7ad8443d28b01d48c0359484fec1b9d8606d0e3282084bc4", + "sha256:f7331dbf301b7289013175087636bbaf5b2405e57259dd2c42fdcc9fcc47325e", + "sha256:f9987b0354b06d4df0f4d3e0ec1ae76d7ce7cbca9a2f98c25041eb79eec766f1", + "sha256:fd9e830e9d8d89b20ab1e5af09b32d33e1a08ef4c4e14411e559556fd788e6b2" + ], + "markers": "python_version >= '3.7'", + "version": "==6.3.2" + }, + "iniconfig": { + "hashes": [ + "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3", + "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32" + ], + "version": "==1.1.1" + }, + "packaging": { + "hashes": [ + "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb", + "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522" + ], + "markers": "python_version >= '3.6'", + "version": "==21.3" + }, + "pluggy": { + "hashes": [ + "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159", + "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3" + ], + "markers": "python_version >= '3.6'", + "version": "==1.0.0" + }, + "py": { + "hashes": [ + "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719", + "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==1.11.0" + }, + "pyparsing": { + "hashes": [ + "sha256:18ee9022775d270c55187733956460083db60b37d0d0fb357445f3094eed3eea", + "sha256:a6c06a88f252e6c322f65faf8f418b16213b51bdfaece0524c1c1bc30c63c484" + ], + "markers": "python_version >= '3.6'", + "version": "==3.0.7" + }, + "pytest": { + "hashes": [ + "sha256:9ce3ff477af913ecf6321fe337b93a2c0dcf2a0a1439c43f5452112c1e4280db", + "sha256:e30905a0c131d3d94b89624a1cc5afec3e0ba2fbdb151867d8e0ebd49850f171" + ], + "markers": "python_version >= '3.6'", + "version": "==7.0.1" + }, + "pytest-cov": { + "hashes": [ + "sha256:578d5d15ac4a25e5f961c938b85a05b09fdaae9deef3bb6de9a6e766622ca7a6", + "sha256:e7f0f5b1617d2210a2cabc266dfe2f4c75a8d32fb89eafb7ad9d06f6d076d470" + ], + "index": "pypi", + "version": "==3.0.0" + }, + "pytest-html": { + "hashes": [ + "sha256:3ee1cf319c913d19fe53aeb0bc400e7b0bc2dbeb477553733db1dad12eb75ee3", + "sha256:b7f82f123936a3f4d2950bc993c2c1ca09ce262c9ae12f9ac763a2401380b455" + ], + "index": "pypi", + "version": "==3.1.1" + }, + "pytest-metadata": { + "hashes": [ + "sha256:576055b8336dd4a9006dd2a47615f76f2f8c30ab12b1b1c039d99e834583523f", + "sha256:71b506d49d34e539cc3cfdb7ce2c5f072bea5c953320002c95968e0238f8ecf1" + ], + "index": "pypi", + "version": "==1.11.0" + }, + "tomli": { + "hashes": [ + "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc", + "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f" + ], + "markers": "python_version >= '3.7'", + "version": "==2.0.1" + } + } } diff --git a/cisticola/scraper/bitchute.py b/cisticola/scraper/bitchute.py index 736f6c2..ce839f1 100644 --- a/cisticola/scraper/bitchute.py +++ b/cisticola/scraper/bitchute.py @@ -16,8 +16,6 @@ class BitchuteScraper(Scraper): library""" __version__ = "BitchuteScraper 0.0.1" - # TODO snscrape should be able to scrape from user ID alone, but there is - # currently a bug/other issue, so it is extracting the username from URL def get_username_from_url(url): username = url.split('bitchute.com/channel/')[-1].strip('/') diff --git a/cisticola/examples/russian_telegram_ingest.py b/examples/russian_telegram_ingest.py similarity index 100% rename from cisticola/examples/russian_telegram_ingest.py rename to examples/russian_telegram_ingest.py diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..844d239 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,13 @@ +[pytest] +minversion = + 6.0.2 +testpaths = + tests/ +python_files = + *.py +addopts = + -vvv + --cov='cisticola' + --cov-report html:reports/coverage + --html='reports/tests.html' + --self-contained-html \ No newline at end of file diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..547d02f --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,147 @@ +import pytest + +from sqlalchemy import create_engine + +from cisticola.scraper import ScraperController + +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# + +BITCHUTE_CHANNEL_KWARGS = { + 'id': 0, + 'name': 'bestonlinejewelrystoresusa@gmail.com (test)', + 'platform_id': 'bestonlinejewelrystoresusagmailcom', + 'category': 'test', + 'followers': None, + 'platform': 'Bitchute', + 'url': 'https://www.bitchute.com/channel/bestonlinejewelrystoresusagmailcom/', + 'screenname': None, + 'country': 'US', + 'influencer': None, + 'public': True, + 'chat': False, + 'notes': ''} + +GAB_CHANNEL_KWARGS = { + 'id': 1, + 'name': 'Capt. Marc Simon (test)', + 'platform_id': 'marc_capt', + 'category': 'test', + 'followers': None, + 'platform': 'Gab', + 'url': 'https://gab.com/marc_capt', + 'screenname': 'marc_capt', + 'country': 'CA', + 'influencer': None, + 'public': True, + 'chat': False, + 'notes': ''} + +GETTR_CHANNEL_KWARGS = { + 'id': 2, + 'name': 'LizardRepublic (test)', + 'platform_id': 'lizardrepublic', + 'category': 'test', + 'followers': None, + 'platform': 'Gettr', + 'url': 'https://www.gettr.com/user/lizardrepublic', + 'screenname': 'lizardrepublic', + 'country': 'US', + 'influencer': None, + 'public': True, + 'chat': False, + 'notes': ''} + +ODYSEE_CHANNEL_KWARGS = { + 'id': 3, + 'name': "Mak1n' Bacon (test)", + 'platform_id': 'Mak1nBacon', + 'category': 'test', + 'followers': None, + 'platform': 'Odysee', + 'url': 'https://odysee.com/@Mak1nBacon', + 'screenname': 'Mak1nBacon', + 'country': 'US', + 'influencer': None, + 'public': True, + 'chat': False, + 'notes': ''} + +RUMBLE_CHANNEL_KWARGS = { + 'id': 4, + 'name': 'we are uploading videos wow products', + 'platform_id': 'c-916305', + 'category': 'test', + 'followers': None, + 'platform': 'Rumble', + 'url': 'https://rumble.com/c/c-916305', + 'screenname': 'we are uploading', + 'country': 'CA', + 'influencer': None, + 'public': True, + 'chat': False, + 'notes': ''} + +TELEGRAM_SNSCRAPE_CHANNEL_KWARGS = { + 'id': 5, + 'name': 'South West Ohio Proud Boys (test)', + 'platform_id': -1001276612436, + 'category': 'test', + 'followers': None, + 'platform': 'Telegram', + 'url': 'https://t.me/SouthwestOhioPB', + 'screenname': 'SouthwestOhioPB', + 'country': 'US', + 'influencer': None, + 'public': True, + 'chat': False, + 'notes': ''} + +TWITTER_CHANNEL_KWARGS = { + 'id': 5, + 'name': 'Logan Williams (test)', + 'platform_id': 891729132, + 'category': 'test', + 'followers': None, + 'platform': 'Twitter', + 'url': 'https://twitter.com/obtusatum', + 'screenname': 'obtusatum', + 'country': 'US', + 'influencer': None, + 'public': True, + 'chat': False, + 'notes': ''} + +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# + +@pytest.fixture(scope='package') +def controller(tmpdir_factory): + + """Initialize ScraperController and SQLite database file to be used for all + tests in the package. + """ + + file = tmpdir_factory.mktemp('test_data').join('test.db') + engine = create_engine(f'sqlite:///{file}') + + scraper_controller = ScraperController() + scraper_controller.connect_to_db(engine) + + return scraper_controller + +@pytest.fixture(scope='package') +def channel_kwargs(): + + """Define keyword arguments to use for defining test channels for each + platform to be scraped. + """ + + return { + 'bitchute' : BITCHUTE_CHANNEL_KWARGS, + 'gab' : GAB_CHANNEL_KWARGS, + 'gettr' : GETTR_CHANNEL_KWARGS, + 'odysee' : ODYSEE_CHANNEL_KWARGS, + 'rumble' : RUMBLE_CHANNEL_KWARGS, + 'telegram_snscrape' : TELEGRAM_SNSCRAPE_CHANNEL_KWARGS, + 'twitter' : TWITTER_CHANNEL_KWARGS} + +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# \ No newline at end of file diff --git a/tests/scraper/bitchute.py b/tests/scraper/bitchute.py new file mode 100644 index 0000000..83883d8 --- /dev/null +++ b/tests/scraper/bitchute.py @@ -0,0 +1,8 @@ +from cisticola.base import Channel +from cisticola.scraper import BitchuteScraper + +def test_scrape_bitchute_channel(controller, channel_kwargs): + + channels = [Channel(**channel_kwargs['bitchute'])] + controller.register_scraper(BitchuteScraper()) + controller.scrape_channels(channels) diff --git a/tests/scraper/gab.py b/tests/scraper/gab.py new file mode 100644 index 0000000..a360af3 --- /dev/null +++ b/tests/scraper/gab.py @@ -0,0 +1,8 @@ +from cisticola.base import Channel +from cisticola.scraper import GabScraper + +def test_scrape_gab_channel(controller, channel_kwargs): + + channels = [Channel(**channel_kwargs['gab'])] + controller.register_scraper(GabScraper()) + controller.scrape_channels(channels) diff --git a/tests/scraper/gettr.py b/tests/scraper/gettr.py new file mode 100644 index 0000000..ac08db7 --- /dev/null +++ b/tests/scraper/gettr.py @@ -0,0 +1,8 @@ +from cisticola.base import Channel +from cisticola.scraper import GettrScraper + +def test_scrape_gettr_channel(controller, channel_kwargs): + + channels = [Channel(**channel_kwargs['gettr'])] + controller.register_scraper(GettrScraper()) + controller.scrape_channels(channels) diff --git a/tests/scraper/odysee.py b/tests/scraper/odysee.py new file mode 100644 index 0000000..c13d08d --- /dev/null +++ b/tests/scraper/odysee.py @@ -0,0 +1,8 @@ +from cisticola.base import Channel +from cisticola.scraper import OdyseeScraper + +def test_scrape_odysee_channel(controller, channel_kwargs): + + channels = [Channel(**channel_kwargs['odysee'])] + controller.register_scraper(OdyseeScraper()) + controller.scrape_channels(channels) diff --git a/tests/scraper/rumble.py b/tests/scraper/rumble.py new file mode 100644 index 0000000..8c00aa5 --- /dev/null +++ b/tests/scraper/rumble.py @@ -0,0 +1,8 @@ +from cisticola.base import Channel +from cisticola.scraper import RumbleScraper + +def test_scrape_rumble_channel(controller, channel_kwargs): + + channels = [Channel(**channel_kwargs['rumble'])] + controller.register_scraper(RumbleScraper()) + controller.scrape_channels(channels) diff --git a/tests/scraper/telegram_snscrape.py b/tests/scraper/telegram_snscrape.py new file mode 100644 index 0000000..077f1bb --- /dev/null +++ b/tests/scraper/telegram_snscrape.py @@ -0,0 +1,8 @@ +from cisticola.base import Channel +from cisticola.scraper import TelegramSnscrapeScraper + +def test_scrape_telegram_snscrape_channel(controller, channel_kwargs): + + channels = [Channel(**channel_kwargs['telegram_snscrape'])] + controller.register_scraper(TelegramSnscrapeScraper()) + controller.scrape_channels(channels) diff --git a/tests/scraper/twitter.py b/tests/scraper/twitter.py new file mode 100644 index 0000000..5c22b62 --- /dev/null +++ b/tests/scraper/twitter.py @@ -0,0 +1,8 @@ +from cisticola.base import Channel +from cisticola.scraper import TwitterScraper + +def test_scrape_twitter_channel(controller, channel_kwargs): + + channels = [Channel(**channel_kwargs['twitter'])] + controller.register_scraper(TwitterScraper()) + controller.scrape_channels(channels)