From 4668d4df1125d156abcd0aa2c3c7820d368c7270 Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Fri, 18 Feb 2022 10:13:37 -0600 Subject: [PATCH] implemented Gettr scraper --- Pipfile | 1 + Pipfile.lock | 24 ++++++++++++++++++----- cisticola/scraper/gettr.py | 40 ++++++++++++++++++++++++++++++++++++++ test.py | 6 +++++- 4 files changed, 65 insertions(+), 6 deletions(-) create mode 100644 cisticola/scraper/gettr.py diff --git a/Pipfile b/Pipfile index e5b3ba5..2e6227d 100644 --- a/Pipfile +++ b/Pipfile @@ -7,6 +7,7 @@ name = "pypi" sqlalchemy = "*" snscrape = "*" loguru = "*" +gogettr = "*" [dev-packages] diff --git a/Pipfile.lock b/Pipfile.lock index 9e1aef9..2743f87 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "8d3e1f9f21d8d86b560a55f660d1eb089287706e81fc8d21ba66f52827861b73" + "sha256": "e335358892de4b581de211099e214f370f8cfd1f86b2cd2b3f0ea6d2d43313bb" }, "pipfile-spec": 6, "requires": { @@ -39,6 +39,14 @@ "markers": "python_version >= '3'", "version": "==2.0.12" }, + "click": { + "hashes": [ + "sha256:353f466495adaeb40b6b5f592f9f91cb22372351c84caeb068132442a4518ef3", + "sha256:410e932b050f5eed773c4cda94de75971c89cdb3155a72a0831139a79e5ecb5b" + ], + "markers": "python_version >= '3.6'", + "version": "==8.0.3" + }, "filelock": { "hashes": [ "sha256:9cd540a9352e432c7246a48fe4e8712b10acb1df2ad1f30e8c070b82ae1fed85", @@ -47,6 +55,14 @@ "markers": "python_version >= '3.7'", "version": "==3.6.0" }, + "gogettr": { + "hashes": [ + "sha256:9f5c90e3b1befe6eb561d4bca9ca124faddbe5787d8b429f02703c68dd51d255", + "sha256:eebdd76e5bb67e3848905f9d6841a9a897a44944edf2b6e1a5a1e5e8fb2110c1" + ], + "index": "pypi", + "version": "==0.8.0" + }, "greenlet": { "hashes": [ "sha256:0051c6f1f27cb756ffc0ffbac7d2cd48cb0362ac1736871399a739b2885134d3", @@ -105,7 +121,7 @@ "sha256:fa877ca7f6b48054f847b61d6fa7bed5cebb663ebc55e018fda12db09dcc664c", "sha256:fdcec0b8399108577ec290f55551d926d9a1fa6cad45882093a7a07ac5ec147b" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "markers": "python_version >= '3' and platform_machine == 'aarch64' or (platform_machine == 'ppc64le' or (platform_machine == 'x86_64' or (platform_machine == 'amd64' or (platform_machine == 'AMD64' or (platform_machine == 'win32' or platform_machine == 'WIN32')))))", "version": "==1.1.2" }, "idna": { @@ -200,9 +216,7 @@ "version": "==1.7.1" }, "requests": { - "extras": [ - "socks" - ], + "extras": [], "hashes": [ "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61", "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d" diff --git a/cisticola/scraper/gettr.py b/cisticola/scraper/gettr.py new file mode 100644 index 0000000..ca777c8 --- /dev/null +++ b/cisticola/scraper/gettr.py @@ -0,0 +1,40 @@ +import cisticola.base +from datetime import datetime +import json +from typing import List +from gogettr import PublicClient + +class GettrScraper(cisticola.scraper.Scraper): + """An implementation of a Scraper for Gettr, using gogettr library""" + __version__ = "GettrScraper 0.0.1" + + def get_username_from_url(url): + username = url.split("gettr.com/user/")[1] + if len(username.split("/")) > 1: + return None + + return username + + def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]: + posts = [] + client = PublicClient() + username = GettrScraper.get_username_from_url(channel.url) + scraper = client.user_activity(username=username, type="posts") + + for post in scraper: + if since is not None and post['cdate'] <= int(since.date_archived.timestamp()): + break + + posts.append(cisticola.base.ScraperResult(scraper=self.__version__, + platform="Gettr", + channel=username, + platform_id=post['_id'], + date=datetime.fromtimestamp(post['cdate']/1000.), + date_archived=datetime.now(), + raw_data=json.dumps(post))) + + return posts + + def can_handle(self, channel): + if channel.platform == "Gettr" and GettrScraper.get_username_from_url(channel.url) is not None: + return True diff --git a/test.py b/test.py index a31a081..0d8eb18 100644 --- a/test.py +++ b/test.py @@ -15,7 +15,11 @@ test_channels = [cisticola.base.Channel(id=0, name="Logan Williams (test)", plat cisticola.base.Channel(id=1, name="JQHN SPARTAN", platform_id=-1001181961026, category="qanon", followers=None, platform="Telegram", url="https://t.me/jqhnspartan", country="FR", - influencer="JQNH SPARTAN", public=True, chat=False, notes="")] + influencer="JQNH SPARTAN", public=True, chat=False, notes=""), + cisticola.base.Channel(id=2, name="LizardRepublic", platform_id='lizardrepublic', + category="qanon", followers=None, platform="Gettr", + url="https://www.gettr.com/user/lizardrepublic", country="US", + influencer=None, public=True, chat=False, notes=""),] controller = cisticola.ScraperController()