mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-12 21:38:33 +03:00
114 lines
4.1 KiB
Python
114 lines
4.1 KiB
Python
from datetime import datetime, timezone, date
|
|
import json
|
|
from typing import Generator
|
|
import os
|
|
|
|
from gabber.client import Client, GAB_API_BASE_URL
|
|
|
|
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
|
from cisticola.scraper.base import Scraper
|
|
|
|
class GabScraper(Scraper):
    """An implementation of a Scraper for Gab, using the gabber library.

    Two kinds of channel URL are distinguished: group URLs, whose
    second-to-last path component is ``groups`` and whose last component is
    the numeric group id, and account URLs of the form
    ``https://gab.com/<username>``.

    The gabber ``Client`` is created with credentials read from the
    ``GAB_USER`` and ``GAB_PASS`` environment variables.
    """

    __version__ = "GabScraper 0.0.2"

    def get_username_from_url(self, url):
        """Return the part of ``url`` that follows ``https://gab.com/``.

        A trailing slash is removed so that ``https://gab.com/name/`` and
        ``https://gab.com/name`` give the same username. If the prefix is
        absent, the whole (slash-stripped) string is returned.
        """
        username = url.split('https://gab.com/')[-1].rstrip('/')

        return username

    def get_group_id_from_url(self, url):
        """Return the last path component of ``url`` as an integer group id.

        A trailing slash is ignored. Raises ``ValueError`` when the last
        component is not an integer.
        """
        group_id = int(url.rstrip('/').split('/')[-1])

        return group_id

    @staticmethod
    def _is_group_url(url):
        """Return True when the second-to-last path component is ``groups``."""
        parts = url.rstrip('/').split('/')
        # The length check avoids an IndexError on a URL with no slash.
        return len(parts) >= 2 and parts[-2] == 'groups'

    @staticmethod
    def _make_client():
        """Build a gabber client from the GAB_USER / GAB_PASS environment."""
        return Client(
            username=os.environ['GAB_USER'],
            password=os.environ['GAB_PASS'],
            threads=25)

    @staticmethod
    def _to_utc(value):
        """Return ``value`` as an aware datetime in UTC.

        Naive datetimes are labelled as UTC; aware datetimes are converted,
        so a non-UTC offset is honoured rather than overwritten.
        """
        if value.tzinfo is None:
            return value.replace(tzinfo=timezone.utc)
        return value.astimezone(timezone.utc)

    @classmethod
    def _post_date(cls, post):
        """Parse ``post['created_at']`` (ISO 8601) into a UTC datetime."""
        # ``fromisoformat`` on older Pythons rejects the ``Z`` suffix, so it
        # is rewritten as an explicit offset first.
        created_at = post['created_at'].replace("Z", "+00:00")
        return cls._to_utc(datetime.fromisoformat(created_at))

    @staticmethod
    def _collect_media_urls(post):
        """List the media URLs of ``post`` followed by those of its reblog.

        Video attachments contribute their ``source_mp4`` value; every other
        attachment contributes its ``url``. A missing or null
        ``media_attachments`` entry is treated as an empty list.
        """
        attachments = list(post.get('media_attachments') or [])

        reblog = post.get('reblog')
        if reblog is not None:
            attachments.extend(reblog.get('media_attachments') or [])

        return [
            attachment['source_mp4'] if attachment.get('type') == 'video' else attachment['url']
            for attachment in attachments
        ]

    def _fetch_account(self, client, url):
        """Fetch the decoded JSON account record for the username in ``url``."""
        username = self.get_username_from_url(url)
        return client._get(GAB_API_BASE_URL + f"/account_by_username/{username}").json()

    def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
        """Yield a ``ScraperResult`` for each post of ``channel``.

        Parameters
        ----------
        channel:
            Channel whose ``url`` selects either a Gab group or a Gab account.
        since:
            Optional previous result. Iteration stops at the first post whose
            creation date is not later than ``since.date``.
        archive_media:
            When true, every media attachment of the post (and of its reblog,
            if any) is fetched with ``url_to_blob`` and stored with
            ``archive_blob``; the mapping from original to archived URL is
            attached to the result.
        """
        client = self._make_client()

        if self._is_group_url(channel.url):
            group_id = self.get_group_id_from_url(url=channel.url)
            scraper = client.pull_group_posts(
                id=group_id,
                depth=float('inf'))
        else:
            account = self._fetch_account(client, channel.url)
            user_id = int(account['id'])

            scraper = client.pull_statuses(
                id=user_id,
                created_after=date.min,
                replies=False)

        # Loop-invariant: normalise the cut-off date once.
        since_date = None if since is None else self._to_utc(since.date)

        for post in scraper:
            post_date = self._post_date(post)

            # NOTE(review): stopping at the first old post assumes the client
            # yields posts newest-first -- confirm against gabber.
            if since_date is not None and post_date <= since_date:
                break

            archived_urls = {}

            if archive_media:
                for url in self._collect_media_urls(post):
                    media_blob, content_type, key = self.url_to_blob(url)
                    archived_urls[url] = self.archive_blob(media_blob, content_type, key)

            yield ScraperResult(
                scraper=self.__version__,
                platform="Gab",
                channel=channel.id,
                platform_id=post['id'],
                date=post_date,
                date_archived=datetime.now(timezone.utc),
                raw_posts=json.dumps(post),
                archived_urls=archived_urls,
                media_archived=archive_media)

    def can_handle(self, channel: Channel) -> bool:
        """Return True when ``channel`` is on the "Gab" platform, else False."""
        # Always return a real bool; the negative path previously fell
        # through and returned None.
        return channel.platform == "Gab" and self.get_username_from_url(channel.url) is not None

    def get_profile(self, channel: Channel) -> RawChannelInfo:
        """Return the raw profile of ``channel`` wrapped in ``RawChannelInfo``.

        Group URLs are resolved with ``client.pull_group``; account URLs are
        resolved through the ``account_by_username`` endpoint. The record is
        stored JSON-encoded in ``raw_data``.
        """
        client = self._make_client()

        if self._is_group_url(channel.url):
            group_id = self.get_group_id_from_url(url=channel.url)
            profile = client.pull_group(id=group_id)
        else:
            profile = self._fetch_account(client, channel.url)

        return RawChannelInfo(scraper=self.__version__,
            platform=channel.platform,
            channel=channel.id,
            raw_data=json.dumps(profile),
            date_archived=datetime.now(timezone.utc))
|