Modify GettrScraper to yield results and archive media (video archiving incomplete: only the playlist file is saved)

2026-06-12 13:28:34 +03:00 · 2022-02-24 20:25:14 +01:00
parent 456d592792
commit 0b1c175dd9
3 changed files with 32 additions and 11 deletions
--- a/cisticola/base.py
+++ b/cisticola/base.py
@@ -12,7 +12,7 @@ class ScraperResult:

    scraper: str
    platform: str
-    channel: int
+    channel: int #TODO there is probably a way of making this a Channel object foreign key
    platform_id: str
    date: datetime
    raw_data: str
--- a/cisticola/scraper/gettr.py
+++ b/cisticola/scraper/gettr.py
@@ -1,10 +1,11 @@
 import cisticola.base
+import cisticola.scraper.base
 from datetime import datetime
 import json
 from typing import List
 from gogettr import PublicClient

-class GettrScraper(cisticola.scraper.Scraper):
+class GettrScraper(cisticola.scraper.base.Scraper):
    """An implementation of a Scraper for Gettr, using gogettr library"""
    __version__ = "GettrScraper 0.0.1"

@@ -16,25 +17,40 @@ class GettrScraper(cisticola.scraper.Scraper):
        return username

    def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]:
-        posts = []
        client = PublicClient()
        username = GettrScraper.get_username_from_url(channel.url)
        scraper = client.user_activity(username=username, type="posts")

        for post in scraper:
-            if since is not None and post['cdate'] <= int(since.date_archived.timestamp()):
+            if since is not None and datetime.fromtimestamp(post['cdate']*0.001) <= since.date:
                break

-            posts.append(cisticola.base.ScraperResult(
+            archived_urls = {}
+
+            if 'imgs' in post:
+                for img in post['imgs']:
+                    url = "https://media.gettr.com/" + img
+                    archived_url = self.archive_media(url)
+                    archived_urls[img] = archived_url
+
+            if 'main' in post:
+                archived_url = self.archive_media("https://media.gettr.com/" + post['main'])
+                archived_urls[post['main']] = archived_url
+
+            # TODO this is just archiving the playlist file, not the actual video
+            if 'vid' in post:
+                archived_url = self.archive_media("https://media.gettr.com/" + post['vid'])
+                archived_urls[post['vid']] = archived_url
+
+            yield cisticola.base.ScraperResult(
                scraper=self.__version__,
                platform="Gettr",
-                channel=username,
+                channel=channel.id,
                platform_id=post['_id'],
                date=datetime.fromtimestamp(post['cdate']/1000.),
                date_archived=datetime.now(),
-                raw_data=json.dumps(post)))
-
-        return posts
+                raw_data=json.dumps(post),
+                archived_urls=archived_urls)

    def can_handle(self, channel):
        if channel.platform == "Gettr" and GettrScraper.get_username_from_url(channel.url) is not None:
--- a/test.py
+++ b/test.py
@@ -1,11 +1,13 @@
 import cisticola
 import cisticola.scraper.telegram_snscrape
 import cisticola.scraper.twitter
+import cisticola.scraper.gettr

 from sqlalchemy import create_engine


-test_channels = [cisticola.base.Channel(id=0, name="Logan Williams (test)", platform_id=891729132,
+test_channels = [
+    cisticola.base.Channel(id=0, name="Logan Williams (test)", platform_id=891729132,
                                   category="test", followers=None, platform="Twitter",
                                   url="https://twitter.com/obtusatum", screenname="obtusatum", country="US",
                                   influencer=None, public=True, chat=False,
@@ -32,7 +34,10 @@ controller.register_scraper(twitter)
 telegram = cisticola.scraper.telegram_snscrape.TelegramSnscrapeScraper()
 controller.register_scraper(telegram)

-engine = create_engine('sqlite:///test.db')
+gettr = cisticola.scraper.gettr.GettrScraper()
+controller.register_scraper(gettr)
+
+engine = create_engine('sqlite:///test3.db')
 controller.connect_to_db(engine)

 controller.scrape_channels(test_channels)