Files
cisticola/cisticola/scraper/gettr.py
Tristan Lee 8ecb904249 merged main
2022-04-01 02:05:25 -05:00

85 lines
3.0 KiB
Python

from datetime import datetime, timezone
import json
from typing import Generator
from urllib.parse import urlparse
from loguru import logger
from gogettr import PublicClient
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper
class GettrScraper(Scraper):
    """An implementation of a Scraper for Gettr, using the gogettr library."""

    __version__ = "GettrScraper 0.0.1"

    # Prefix prepended to the media paths found in a post payload.
    _MEDIA_BASE_URL = "https://media.gettr.com/"
    # URL fragment that immediately precedes the username in a profile URL.
    _USER_URL_MARKER = "gettr.com/user/"

    def get_username_from_url(self, url):
        """Extract the Gettr username from a profile URL.

        Args:
            url: Channel URL, expected to contain ``gettr.com/user/<username>``.

        Returns:
            The username, or ``None`` when ``url`` is empty, does not contain
            the ``gettr.com/user/`` marker, has nothing after the marker, or
            has additional path segments after the username.
        """
        if not url:
            return None

        parts = url.split(self._USER_URL_MARKER)
        if len(parts) < 2:
            # Marker missing: not a Gettr user URL (previously an IndexError).
            return None

        username = parts[1]
        if not username or "/" in username:
            return None
        return username

    def _collect_media_urls(self, post):
        """Return a dict mapping each media URL referenced by ``post`` to None.

        Reads the ``imgs`` (iterable of paths), ``main`` and ``ovid`` (single
        paths) entries of the post, in that order; missing or empty entries
        are skipped. The ``None`` values are placeholders that the caller
        replaces with archived URLs.
        NOTE(review): ``main``/``ovid`` presumably hold the main image and the
        original video path; confirm against the Gettr API payload.
        """
        media_paths = list(post.get('imgs') or ())
        media_paths.extend(post[key] for key in ('main', 'ovid') if post.get(key))
        # A dict (rather than a list) de-duplicates URLs while keeping order.
        return {self._MEDIA_BASE_URL + path: None for path in media_paths}

    @logger.catch
    def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
        """Yield the posts of a Gettr channel as ``ScraperResult`` objects.

        Posts are consumed in the order returned by gogettr's
        ``user_activity``; iteration stops at the first post whose date is not
        later than ``since.date``.

        Args:
            channel: Channel whose ``url`` identifies the Gettr user.
            since: Optional previous result; posts dated at or before
                ``since.date`` (and everything after them in the feed) are
                not yielded.
            archive_media: When true, every media URL of a post is fetched
                with ``url_to_blob`` and stored with ``archive_blob``.

        Yields:
            One ``ScraperResult`` per post. ``archived_urls`` maps each media
            URL to its archived URL, or to ``None`` when ``archive_media`` is
            false.

        Raises:
            ValueError: If no username can be extracted from ``channel.url``.
                Because of ``@logger.catch`` the error is logged by loguru.
        """
        username = self.get_username_from_url(channel.url)
        if username is None:
            raise ValueError(f"Cannot extract a Gettr username from URL: {channel.url!r}")

        client = PublicClient()
        for post in client.user_activity(username=username, type="posts"):
            # 'cdate' is divided by 1000 before use, i.e. treated as epoch
            # milliseconds. Computed once so the value compared against
            # `since.date` is exactly the value stored in the result.
            # NOTE(review): this is a naive local-time datetime while
            # `date_archived` is timezone-aware UTC; left unchanged because
            # `since.date` is compared against it. Confirm the intended zone.
            posted_at = datetime.fromtimestamp(post['cdate'] / 1000.)
            if since is not None and posted_at <= since.date:
                break

            archived_urls = self._collect_media_urls(post)
            if archive_media:
                # Only values of existing keys are reassigned, so mutating the
                # dict while iterating over it is safe.
                for url in archived_urls:
                    media_blob, content_type, key = self.url_to_blob(url)
                    archived_urls[url] = self.archive_blob(media_blob, content_type, key)

            yield ScraperResult(
                scraper=self.__version__,
                platform="Gettr",
                channel=channel.id,
                platform_id=post['_id'],
                date=posted_at,
                date_archived=datetime.now(timezone.utc),
                raw_posts=json.dumps(post),
                archived_urls=archived_urls,
                media_archived=archive_media)

    def can_handle(self, channel):
        """Return True if ``channel`` is a Gettr channel with a parsable user URL."""
        return (
            channel.platform == "Gettr"
            and self.get_username_from_url(channel.url) is not None
        )

    def url_to_key(self, url: str, content_type: str) -> str:
        """Build a storage key for a media URL.

        The key is the second-to-last segment of the URL path followed by an
        extension taken from the subtype of ``content_type`` (for example
        ``image/jpeg`` gives ``.jpeg``). Content-type parameters such as
        ``; charset=...`` are ignored.

        Raises:
            IndexError: If the URL path has fewer than two segments.
        """
        # Drop any "; param=value" suffix so it cannot leak into the extension.
        mime_type = content_type.split(';')[0].strip()
        ext = '.' + mime_type.split('/')[-1]
        key = urlparse(url).path.split('/')[-2] + ext
        return key

    def get_profile(self, channel: Channel) -> RawChannelInfo:
        """Fetch the profile of a Gettr channel.

        Args:
            channel: Channel whose ``url`` identifies the Gettr user.

        Returns:
            A ``RawChannelInfo`` whose ``raw_data`` is the JSON-serialised
            result of gogettr's ``user_info``.

        Raises:
            ValueError: If no username can be extracted from ``channel.url``.
        """
        username = self.get_username_from_url(channel.url)
        if username is None:
            raise ValueError(f"Cannot extract a Gettr username from URL: {channel.url!r}")

        client = PublicClient()
        profile = client.user_info(username)

        return RawChannelInfo(scraper=self.__version__,
            platform=channel.platform,
            channel=channel.id,
            raw_data=json.dumps(profile),
            date_archived=datetime.now(timezone.utc))