Modify GettrScraper to yield results, archive media (videos incomplete)

This commit is contained in:
Logan Williams
2022-02-24 20:25:14 +01:00
parent 456d592792
commit 0b1c175dd9
3 changed files with 32 additions and 11 deletions

View File

@@ -12,7 +12,7 @@ class ScraperResult:
scraper: str
platform: str
channel: int
channel: int #TODO there is probably a way of making this a Channel object foreign key
platform_id: str
date: datetime
raw_data: str

View File

@@ -1,10 +1,11 @@
import json
from datetime import datetime
from typing import Iterator, List, Optional

from gogettr import PublicClient

import cisticola.base
import cisticola.scraper.base
class GettrScraper(cisticola.scraper.Scraper):
class GettrScraper(cisticola.scraper.base.Scraper):
"""An implementation of a Scraper for Gettr, using gogettr library"""
__version__ = "GettrScraper 0.0.1"
@@ -16,25 +17,40 @@ class GettrScraper(cisticola.scraper.Scraper):
return username
def get_posts(self, channel: cisticola.base.Channel, since: Optional[cisticola.base.ScraperResult] = None) -> Iterator[cisticola.base.ScraperResult]:
    """Yield one ScraperResult per post of the Gettr user behind `channel`.

    Posts are consumed in the order the gogettr client produces them, and
    iteration stops at the first post whose creation date is not later than
    `since.date`. Each post's media references are archived through
    `self.archive_media` before the result is yielded.

    Args:
        channel: Channel whose `url` identifies the Gettr user and whose
            `id` is stored on every result.
        since: Optional previously scraped result; when given, scraping
            stops at the first post that is not newer than `since.date`.

    Yields:
        A ScraperResult per post, carrying the JSON-serialised raw post and
        a mapping from each media path to its archived URL.
    """
    media_base = "https://media.gettr.com/"

    client = PublicClient()
    username = GettrScraper.get_username_from_url(channel.url)
    scraper = client.user_activity(username=username, type="posts")

    for post in scraper:
        # `cdate` is treated as epoch milliseconds. Convert it once so the
        # cut-off test and the stored date are always the same value.
        # NOTE(review): fromtimestamp() returns a naive local-time datetime;
        # this presumably matches how `since.date` was stored -- confirm.
        post_date = datetime.fromtimestamp(post['cdate'] / 1000.)

        # NOTE(review): breaking here (instead of skipping) assumes the client
        # returns posts newest-first -- confirm against gogettr.
        if since is not None and post_date <= since.date:
            break

        # Collect every media path attached to the post, in the original
        # order: images, then the main image, then the video. Missing, null
        # or empty fields are ignored.
        media_paths = list(post.get('imgs') or [])
        if post.get('main'):
            media_paths.append(post['main'])
        # TODO this is just archiving the playlist file, not the actual video
        if post.get('vid'):
            media_paths.append(post['vid'])

        archived_urls = {}
        for path in media_paths:
            archived_urls[path] = self.archive_media(media_base + path)

        yield cisticola.base.ScraperResult(
            scraper=self.__version__,
            platform="Gettr",
            channel=channel.id,
            platform_id=post['_id'],
            date=post_date,
            date_archived=datetime.now(),
            raw_data=json.dumps(post),
            archived_urls=archived_urls)
def can_handle(self, channel):
if channel.platform == "Gettr" and GettrScraper.get_username_from_url(channel.url) is not None:

View File

@@ -1,11 +1,13 @@
import cisticola
import cisticola.scraper.telegram_snscrape
import cisticola.scraper.twitter
import cisticola.scraper.gettr
from sqlalchemy import create_engine
test_channels = [cisticola.base.Channel(id=0, name="Logan Williams (test)", platform_id=891729132,
test_channels = [
cisticola.base.Channel(id=0, name="Logan Williams (test)", platform_id=891729132,
category="test", followers=None, platform="Twitter",
url="https://twitter.com/obtusatum", screenname="obtusatum", country="US",
influencer=None, public=True, chat=False,
@@ -32,7 +34,10 @@ controller.register_scraper(twitter)
telegram = cisticola.scraper.telegram_snscrape.TelegramSnscrapeScraper()
controller.register_scraper(telegram)
engine = create_engine('sqlite:///test.db')
gettr = cisticola.scraper.gettr.GettrScraper()
controller.register_scraper(gettr)
engine = create_engine('sqlite:///test3.db')
controller.connect_to_db(engine)
controller.scrape_channels(test_channels)