From 47dad8fb00489a875f8d5b90d17b75a272166f76 Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Fri, 25 Feb 2022 20:28:00 -0600 Subject: [PATCH] added odysee scraper, minor refactoring of url_to_blob method (added url_to_key method that can be overridden by child classes while still using the parent url_to_blob method) and changed test file to include only channels with a relatively small number of posts, to make testing faster --- Pipfile | 1 + Pipfile.lock | 74 +++++++++++++++++++++++++++++++++---- cisticola/scraper/base.py | 8 +++- cisticola/scraper/odysee.py | 53 ++++++++++++++++++++++++++ test.py | 29 ++++++++++----- 5 files changed, 145 insertions(+), 20 deletions(-) create mode 100644 cisticola/scraper/odysee.py diff --git a/Pipfile b/Pipfile index 2f4187e..3f094c5 100644 --- a/Pipfile +++ b/Pipfile @@ -14,6 +14,7 @@ sphinx = "*" boto3 = "*" snscrape = {git = "https://github.com/bellingcat/snscrape.git"} ffmpeg-python = "*" +polyphemus = {git = "https://github.com/bellingcat/polyphemus.git"} [dev-packages] diff --git a/Pipfile.lock b/Pipfile.lock index 0bdf685..75b94d7 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "f4f00b78a16b39eeb122566ec4cc6bf2dfeae044ae95a281e352e00850c74cc6" + "sha256": "263a7825d8113518c7a0690d5f69526cabe2dfa6ea572bb39cbe5d26495e619c" }, "pipfile-spec": 6, "requires": { @@ -41,19 +41,19 @@ }, "boto3": { "hashes": [ - "sha256:8f59383fe578ac9107466a464d7198933e5332d85a4790f2e01cf24a4a7f635b", - "sha256:af92931f65e33e7450c3389c6cc6ab6b3e2f619697ea5566aacc0f16f3b21f61" + "sha256:9b6903fe9cc92d2f6111db28675263f1ab45adbcf1483025c82a304ce7790b71", + "sha256:f2ce641957c1782e382548ced4a447189e45851bbe58c1f6752ff2b661527de7" ], "index": "pypi", - "version": "==1.21.7" + "version": "==1.21.8" }, "botocore": { "hashes": [ - "sha256:5d1a2a2ac72461bbaa79317b3e4cb72c7ebb315aef184d90f72ec1f6dba0ca6c", - "sha256:a34118bfadc02903ab404148822fe5a6de7a3bb58943f1a6a19cc8b0446d2a50" + "sha256:9fbc5c57b31850c51c87abc3e166ed4e0f343665bec4e1a0ff814fbc9704642c", + "sha256:a5431d806dc75fb1844463d921759fcd8d387674443af8d7fd0867f296b02759" ], "markers": "python_version >= '3.6'", - "version": "==1.24.7" + "version": "==1.24.8" }, "bs4": { "hashes": [ @@ -354,6 +354,31 @@ "markers": "python_version >= '3.7'", "version": "==2.1.0" }, + "numpy": { + "hashes": [ + "sha256:03ae5850619abb34a879d5f2d4bb4dcd025d6d8fb72f5e461dae84edccfe129f", + "sha256:076aee5a3763d41da6bef9565fdf3cb987606f567cd8b104aded2b38b7b47abf", + "sha256:0b536b6840e84c1c6a410f3a5aa727821e6108f3454d81a5cd5900999ef04f89", + "sha256:15efb7b93806d438e3bc590ca8ef2f953b0ce4f86f337ef4559d31ec6cf9d7dd", + "sha256:168259b1b184aa83a514f307352c25c56af111c269ffc109d9704e81f72e764b", + "sha256:2638389562bda1635b564490d76713695ff497242a83d9b684d27bb4a6cc9d7a", + "sha256:3556c5550de40027d3121ebbb170f61bbe19eb639c7ad0c7b482cd9b560cd23b", + "sha256:4a176959b6e7e00b5a0d6f549a479f869829bfd8150282c590deee6d099bbb6e", + "sha256:515a8b6edbb904594685da6e176ac9fbea8f73a5ebae947281de6613e27f1956", + "sha256:55535c7c2f61e2b2fc817c5cbe1af7cb907c7f011e46ae0a52caa4be1f19afe2", + "sha256:59153979d60f5bfe9e4c00e401e24dfe0469ef8da6d68247439d3278f30a180f", + "sha256:60cb8e5933193a3cc2912ee29ca331e9c15b2da034f76159b7abc520b3d1233a", + "sha256:6767ad399e9327bfdbaa40871be4254d1995f4a3ca3806127f10cec778bd9896", + "sha256:76a4f9bce0278becc2da7da3b8ef854bed41a991f4226911a24a9711baad672c", + "sha256:8cf33634b60c9cef346663a222d9841d3bbbc0a2f00221d6bcfd0d993d5543f6", + "sha256:94dd11d9f13ea1be17bac39c1942f527cbf7065f94953cf62dfe805653da2f8f", + "sha256:aafa46b5a39a27aca566198d3312fb3bde95ce9677085efd02c86f7ef6be4ec7", + "sha256:badca914580eb46385e7f7e4e426fea6de0a37b9e06bec252e481ae7ec287082", + "sha256:d76a26c5118c4d96e264acc9e3242d72e1a2b92e739807b3b69d8d47684b6677" + ], + "markers": "python_version < '3.10' and platform_machine != 'aarch64' and platform_machine != 'arm64'", + "version": "==1.22.2" + }, "packaging": { "hashes": [ "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb", @@ -362,6 +387,37 @@ "markers": "python_version >= '3.6'", "version": "==21.3" }, + "pandas": { + "hashes": [ + "sha256:0259cd11e7e6125aaea3af823b80444f3adad6149ff4c97fef760093598b3e34", + "sha256:04dd15d9db538470900c851498e532ef28d4e56bfe72c9523acb32042de43dfb", + "sha256:0b1a13f647e4209ed7dbb5da3497891d0045da9785327530ab696417ef478f84", + "sha256:19f7c632436b1b4f84615c3b127bbd7bc603db95e3d4332ed259dc815c9aaa26", + "sha256:1b384516dbb4e6aae30e3464c2e77c563da5980440fbdfbd0968e3942f8f9d70", + "sha256:1d85d5f6be66dfd6d1d8d13b9535e342a2214260f1852654b19fa4d7b8d1218b", + "sha256:2e5a7a1e0ecaac652326af627a3eca84886da9e667d68286866d4e33f6547caf", + "sha256:3129a35d9dad1d80c234dd78f8f03141b914395d23f97cf92a366dcd19f8f8bf", + "sha256:358b0bc98a5ff067132d23bf7a2242ee95db9ea5b7bbc401cf79205f11502fd3", + "sha256:3dfb32ed50122fe8c5e7f2b8d97387edd742cc78f9ec36f007ee126cd3720907", + "sha256:4e1176f45981c8ccc8161bc036916c004ca51037a7ed73f2d2a9857e6dbe654f", + "sha256:508c99debccd15790d526ce6b1624b97a5e1e4ca5b871319fb0ebfd46b8f4dad", + "sha256:6105af6533f8b63a43ea9f08a2ede04e8f43e49daef0209ab0d30352bcf08bee", + "sha256:6d6ad1da00c7cc7d8dd1559a6ba59ba3973be6b15722d49738b2be0977eb8a0c", + "sha256:7ea47ba1d6f359680130bd29af497333be6110de8f4c35b9211eec5a5a9630fa", + "sha256:8db93ec98ac7cb5f8ac1420c10f5e3c43533153f253fe7fb6d891cf5aa2b80d2", + "sha256:96e9ece5759f9b47ae43794b6359bbc54805d76e573b161ae770c1ea59393106", + "sha256:bbb15ad79050e8b8d39ec40dd96a30cd09b886a2ae8848d0df1abba4d5502a67", + "sha256:c614001129b2a5add5e3677c3a213a9e6fd376204cb8d17c04e84ff7dfc02a73", + "sha256:e6a7bbbb7950063bfc942f8794bc3e31697c020a14f1cd8905fc1d28ec674a01", + "sha256:f02e85e6d832be37d7f16cf6ac8bb26b519ace3e5f3235564a91c7f658ab2a43" + ], + "markers": "python_version >= '3.8'", + "version": "==1.4.1" + }, + "polyphemus": { + "git": "https://github.com/bellingcat/polyphemus.git", + "ref": "72ea0a63de4b40bf8038dfdb26cbbab87ba86da9" + }, "pygments": { "hashes": [ "sha256:44238f1b60a76d78fc8ca0528ee429702aae011c265fe6a8dd8b63049ae41c65", @@ -489,7 +545,9 @@ "version": "==2022.1.18" }, "requests": { - "extras": [], + "extras": [ + "socks" + ], "hashes": [ "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61", "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d" diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index 9712a97..7a7865e 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -24,6 +24,11 @@ class Scraper: def __str__(self): return self.__version__ + def url_to_key(self, url: str, content_type: str) -> str: + key = url.split('/')[-1] + key = key.split('?')[0] + return key + def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]: n_retries = 0 @@ -42,8 +47,7 @@ class Scraper: content_type = r.headers.get('Content-Type') if key is None: - key = url.split('/')[-1] - key = key.split('?')[0] + key = self.url_to_key(url, content_type) return blob, content_type, key diff --git a/cisticola/scraper/odysee.py b/cisticola/scraper/odysee.py new file mode 100644 index 0000000..dc496b2 --- /dev/null +++ b/cisticola/scraper/odysee.py @@ -0,0 +1,53 @@ +import cisticola.base +import cisticola.scraper.base +from datetime import datetime +import json +from typing import Generator +from polyphemus.base import OdyseeChannel + +class OdyseeScraper(cisticola.scraper.base.Scraper): + """An implementation of a Scraper for Odysee, using polyphemus library""" + __version__ = "OdyseeScraper 0.0.1" + + def get_username_from_url(url): + + username = url.split('odysee.com/')[-1].strip('@').split(':')[0] + + return username + + def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]: + + username = OdyseeScraper.get_username_from_url(channel.url) + odysee_channel = OdyseeChannel(channel_name = username) + + all_videos = odysee_channel.get_all_videos() + + for video in all_videos: + if since is not None and datetime.fromtimestamp(video['created']) <= since.date: + break + + archived_urls = {} + url = video.info['streaming_url'] + media_blob, content_type, key = self.url_to_blob(url) + archived_url = self.archive_media(media_blob, content_type, key) + archived_urls[url] = archived_url + + yield cisticola.base.ScraperResult( + scraper=self.__version__, + platform="Odysee", + channel=channel.id, + platform_id=video.info['claim_id'], + date=datetime.fromtimestamp(video.info['created']), + date_archived=datetime.now(), + raw_data=json.dumps(video.info), + archived_urls=archived_urls) + + def can_handle(self, channel): + if channel.platform == "Odysee" and OdyseeScraper.get_username_from_url(channel.url) is not None: + return True + + def url_to_key(self, url: str, content_type: str) -> str: + key = url.split('/')[-2] + ext = content_type.split('/')[-1] + + return f'{key}.{ext}' \ No newline at end of file diff --git a/test.py b/test.py index 1270f8b..eef8e29 100644 --- a/test.py +++ b/test.py @@ -3,6 +3,7 @@ import cisticola.scraper.telegram_snscrape import cisticola.scraper.twitter import cisticola.scraper.gettr import cisticola.scraper.bitchute +import cisticola.scraper.odysee from sqlalchemy import create_engine @@ -13,19 +14,24 @@ test_channels = [ url="https://twitter.com/obtusatum", screenname="obtusatum", country="US", influencer=None, public=True, chat=False, notes=""), - cisticola.base.Channel(id=1, name="JQHN SPARTAN", platform_id=-1001181961026, - category="qanon", followers=None, platform="Telegram", - url="https://t.me/jqhnspartan", screenname="jqhnspartan", country="FR", - influencer="JQNH SPARTAN", public=True, chat=False, notes=""), - cisticola.base.Channel(id=2, name="LizardRepublic", platform_id='lizardrepublic', - category="qanon", followers=None, platform="Gettr", + cisticola.base.Channel(id=1, name="South West Ohio Proud Boys (test)", platform_id=-1001276612436, + category="test", followers=None, platform="Telegram", + url="https://t.me/SouthwestOhioPB", screenname="SouthwestOhioPB", country="US", + influencer=None, public=True, chat=False, notes=""), + cisticola.base.Channel(id=2, name="LizardRepublic (test)", platform_id='lizardrepublic', + category="test", followers=None, platform="Gettr", url="https://www.gettr.com/user/lizardrepublic", screenname="lizardrepublic", country="US", influencer=None, public=True, chat=False, notes=""), cisticola.base.Channel( - id=4, name="bestonlinejewelrystoresusa@gmail.com", platform_id='bestonlinejewelrystoresusagmailcom', - category="spam", followers=None, platform="Bitchute", + id=4, name="bestonlinejewelrystoresusa@gmail.com (test)", platform_id='bestonlinejewelrystoresusagmailcom', + category="test", followers=None, platform="Bitchute", url="https://www.bitchute.com/channel/bestonlinejewelrystoresusagmailcom/", screenname=None, country="US", - influencer=None, public=True, chat=False, notes=""),] + influencer=None, public=True, chat=False, notes=""), + cisticola.base.Channel( + id=5, name="Mak1n' Bacon (test)", platform_id='Mak1nBacon', + category="test", followers=None, platform="Odysee", + url="https://odysee.com/@Mak1nBacon", screenname='Mak1nBacon', country="US", + influencer=None, public=True, chat=False, notes="")] controller = cisticola.ScraperController() @@ -40,7 +46,10 @@ gettr = cisticola.scraper.gettr.GettrScraper() controller.register_scraper(gettr) bitchute = cisticola.scraper.bitchute.BitchuteScraper() -controller.register_scraper(gettr) +controller.register_scraper(bitchute) + +odysee = cisticola.scraper.odysee.OdyseeScraper() +controller.register_scraper(odysee) engine = create_engine('sqlite:///test3.db') controller.connect_to_db(engine)