From 0b1c175dd9579f67c597bcd3d63db3b7220251ac Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Thu, 24 Feb 2022 20:25:14 +0100 Subject: [PATCH] Modify GettrScraper to yield results, archive media (videos incomplete) --- cisticola/base.py | 2 +- cisticola/scraper/gettr.py | 32 ++++++++++++++++++++++++-------- test.py | 9 +++++++-- 3 files changed, 32 insertions(+), 11 deletions(-) diff --git a/cisticola/base.py b/cisticola/base.py index d2913e2..03a1641 100644 --- a/cisticola/base.py +++ b/cisticola/base.py @@ -12,7 +12,7 @@ class ScraperResult: scraper: str platform: str - channel: int + channel: int #TODO there is probably a way of making this a Channel object foreign key platform_id: str date: datetime raw_data: str diff --git a/cisticola/scraper/gettr.py b/cisticola/scraper/gettr.py index c656549..5ae7d96 100644 --- a/cisticola/scraper/gettr.py +++ b/cisticola/scraper/gettr.py @@ -1,10 +1,11 @@ import cisticola.base +import cisticola.scraper.base from datetime import datetime import json from typing import List from gogettr import PublicClient -class GettrScraper(cisticola.scraper.Scraper): +class GettrScraper(cisticola.scraper.base.Scraper): """An implementation of a Scraper for Gettr, using gogettr library""" __version__ = "GettrScraper 0.0.1" @@ -16,25 +17,40 @@ class GettrScraper(cisticola.scraper.Scraper): return username def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]: - posts = [] client = PublicClient() username = GettrScraper.get_username_from_url(channel.url) scraper = client.user_activity(username=username, type="posts") for post in scraper: - if since is not None and post['cdate'] <= int(since.date_archived.timestamp()): + if since is not None and datetime.fromtimestamp(post['cdate']*0.001) <= since.date: break - posts.append(cisticola.base.ScraperResult( + archived_urls = {} + + if 'imgs' in post: + for img in post['imgs']: + url = "https://media.gettr.com/" + img + archived_url = self.archive_media(url) + archived_urls[img] = archived_url + + if 'main' in post: + archived_url = self.archive_media("https://media.gettr.com/" + post['main']) + archived_urls[post['main']] = archived_url + + # TODO this is just archiving the playlist file, not the actual video + if 'vid' in post: + archived_url = self.archive_media("https://media.gettr.com/" + post['vid']) + archived_urls[post['vid']] = archived_url + + yield cisticola.base.ScraperResult( scraper=self.__version__, platform="Gettr", - channel=username, + channel=channel.id, platform_id=post['_id'], date=datetime.fromtimestamp(post['cdate']/1000.), date_archived=datetime.now(), - raw_data=json.dumps(post))) - - return posts + raw_data=json.dumps(post), + archived_urls=archived_urls) def can_handle(self, channel): if channel.platform == "Gettr" and GettrScraper.get_username_from_url(channel.url) is not None: diff --git a/test.py b/test.py index 0e4a6e0..9c60fb0 100644 --- a/test.py +++ b/test.py @@ -1,11 +1,13 @@ import cisticola import cisticola.scraper.telegram_snscrape import cisticola.scraper.twitter +import cisticola.scraper.gettr from sqlalchemy import create_engine -test_channels = [cisticola.base.Channel(id=0, name="Logan Williams (test)", platform_id=891729132, +test_channels = [ + cisticola.base.Channel(id=0, name="Logan Williams (test)", platform_id=891729132, category="test", followers=None, platform="Twitter", url="https://twitter.com/obtusatum", screenname="obtusatum", country="US", influencer=None, public=True, chat=False, @@ -32,7 +34,10 @@ controller.register_scraper(twitter) telegram = cisticola.scraper.telegram_snscrape.TelegramSnscrapeScraper() controller.register_scraper(telegram) -engine = create_engine('sqlite:///test.db') +gettr = cisticola.scraper.gettr.GettrScraper() +controller.register_scraper(gettr) + +engine = create_engine('sqlite:///test3.db') controller.connect_to_db(engine) controller.scrape_channels(test_channels)