Mirror of https://github.com/bellingcat/cisticola.git (synced 2026-06-12 13:28:34 +03:00)
Modify GettrScraper to yield results, archive media (videos incomplete)
This commit is contained in:
@@ -12,7 +12,7 @@ class ScraperResult:
|
||||
|
||||
scraper: str
|
||||
platform: str
|
||||
channel: int
|
||||
channel: int #TODO there is probably a way of making this a Channel object foreign key
|
||||
platform_id: str
|
||||
date: datetime
|
||||
raw_data: str
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
import cisticola.base
|
||||
import cisticola.scraper.base
|
||||
from datetime import datetime
|
||||
import json
|
||||
from typing import List
|
||||
from gogettr import PublicClient
|
||||
|
||||
class GettrScraper(cisticola.scraper.Scraper):
|
||||
class GettrScraper(cisticola.scraper.base.Scraper):
|
||||
"""An implementation of a Scraper for Gettr, using gogettr library"""
|
||||
__version__ = "GettrScraper 0.0.1"
|
||||
|
||||
@@ -16,25 +17,40 @@ class GettrScraper(cisticola.scraper.Scraper):
|
||||
return username
|
||||
|
||||
def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]:
|
||||
posts = []
|
||||
client = PublicClient()
|
||||
username = GettrScraper.get_username_from_url(channel.url)
|
||||
scraper = client.user_activity(username=username, type="posts")
|
||||
|
||||
for post in scraper:
|
||||
if since is not None and post['cdate'] <= int(since.date_archived.timestamp()):
|
||||
if since is not None and datetime.fromtimestamp(post['cdate']*0.001) <= since.date:
|
||||
break
|
||||
|
||||
posts.append(cisticola.base.ScraperResult(
|
||||
archived_urls = {}
|
||||
|
||||
if 'imgs' in post:
|
||||
for img in post['imgs']:
|
||||
url = "https://media.gettr.com/" + img
|
||||
archived_url = self.archive_media(url)
|
||||
archived_urls[img] = archived_url
|
||||
|
||||
if 'main' in post:
|
||||
archived_url = self.archive_media("https://media.gettr.com/" + post['main'])
|
||||
archived_urls[post['main']] = archived_url
|
||||
|
||||
# TODO this is just archiving the playlist file, not the actual video
|
||||
if 'vid' in post:
|
||||
archived_url = self.archive_media("https://media.gettr.com/" + post['vid'])
|
||||
archived_urls[post['vid']] = archived_url
|
||||
|
||||
yield cisticola.base.ScraperResult(
|
||||
scraper=self.__version__,
|
||||
platform="Gettr",
|
||||
channel=username,
|
||||
channel=channel.id,
|
||||
platform_id=post['_id'],
|
||||
date=datetime.fromtimestamp(post['cdate']/1000.),
|
||||
date_archived=datetime.now(),
|
||||
raw_data=json.dumps(post)))
|
||||
|
||||
return posts
|
||||
raw_data=json.dumps(post),
|
||||
archived_urls=archived_urls)
|
||||
|
||||
def can_handle(self, channel):
|
||||
if channel.platform == "Gettr" and GettrScraper.get_username_from_url(channel.url) is not None:
|
||||
|
||||
9
test.py
@@ -1,11 +1,13 @@
|
||||
import cisticola
|
||||
import cisticola.scraper.telegram_snscrape
|
||||
import cisticola.scraper.twitter
|
||||
import cisticola.scraper.gettr
|
||||
|
||||
from sqlalchemy import create_engine
|
||||
|
||||
|
||||
test_channels = [cisticola.base.Channel(id=0, name="Logan Williams (test)", platform_id=891729132,
|
||||
test_channels = [
|
||||
cisticola.base.Channel(id=0, name="Logan Williams (test)", platform_id=891729132,
|
||||
category="test", followers=None, platform="Twitter",
|
||||
url="https://twitter.com/obtusatum", screenname="obtusatum", country="US",
|
||||
influencer=None, public=True, chat=False,
|
||||
@@ -32,7 +34,10 @@ controller.register_scraper(twitter)
|
||||
telegram = cisticola.scraper.telegram_snscrape.TelegramSnscrapeScraper()
|
||||
controller.register_scraper(telegram)
|
||||
|
||||
engine = create_engine('sqlite:///test.db')
|
||||
gettr = cisticola.scraper.gettr.GettrScraper()
|
||||
controller.register_scraper(gettr)
|
||||
|
||||
engine = create_engine('sqlite:///test3.db')
|
||||
controller.connect_to_db(engine)
|
||||
|
||||
controller.scrape_channels(test_channels)
|
||||
|
||||
Reference in New Issue
Block a user