mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-08 03:18:34 +03:00
added capability to scrape Gab group posts
This commit is contained in:
@@ -17,21 +17,34 @@ class GabScraper(Scraper):
|
||||
|
||||
return username
|
||||
|
||||
def get_group_id_from_url(self, url):
    """Return the integer Gab group id found in the last path segment of ``url``.

    Example: ``https://gab.com/groups/10001`` -> ``10001``.

    Args:
        url: Group URL whose final path segment is the numeric group id.

    Returns:
        The group id as an ``int``.

    Raises:
        ValueError: If the final path segment is not an integer.
    """
    # Discard any query string / fragment, then trailing slashes, so that
    # ".../groups/10001/" and ".../groups/10001?x=1" parse the same as the
    # bare ".../groups/10001" form instead of failing on int('').
    path = url.split('?', 1)[0].split('#', 1)[0].rstrip('/')
    last_segment = path.rsplit('/', 1)[-1]
    return int(last_segment)
|
||||
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
client = Client(
|
||||
username = os.environ['GAB_USER'],
|
||||
password = os.environ['GAB_PASS'],
|
||||
threads = 25)
|
||||
|
||||
username = self.get_username_from_url(channel.url)
|
||||
if channel.url.split('/')[-2] == 'groups':
|
||||
|
||||
result = client._get(GAB_API_BASE_URL + f"/account_by_username/{username}").json()
|
||||
user_id = int(result['id'])
|
||||
group_id = self.get_group_id_from_url(url = channel.url)
|
||||
scraper = client.pull_group_posts(
|
||||
id = group_id,
|
||||
depth = float('inf'))
|
||||
else:
|
||||
|
||||
scraper = client.pull_statuses(
|
||||
id = user_id,
|
||||
created_after = date.min,
|
||||
replies = False)
|
||||
username = self.get_username_from_url(channel.url)
|
||||
|
||||
result = client._get(GAB_API_BASE_URL + f"/account_by_username/{username}").json()
|
||||
user_id = int(result['id'])
|
||||
|
||||
scraper = client.pull_statuses(
|
||||
id = user_id,
|
||||
created_after = date.min,
|
||||
replies = False)
|
||||
|
||||
for post in scraper:
|
||||
if since is not None and datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")).replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
|
||||
@@ -82,8 +95,15 @@ class GabScraper(Scraper):
|
||||
password = os.environ['GAB_PASS'],
|
||||
threads = 25)
|
||||
|
||||
username = self.get_username_from_url(channel.url)
|
||||
if channel.url.split('/')[-2] == 'groups':
|
||||
|
||||
profile = client._get(GAB_API_BASE_URL + f"/account_by_username/{username}").json()
|
||||
group_id = self.get_group_id_from_url(url = channel.url)
|
||||
profile = client.pull_group(id = group_id)
|
||||
|
||||
else:
|
||||
|
||||
username = self.get_username_from_url(channel.url)
|
||||
|
||||
profile = client._get(GAB_API_BASE_URL + f"/account_by_username/{username}").json()
|
||||
|
||||
return profile
|
||||
@@ -33,6 +33,19 @@ GAB_CHANNEL_KWARGS = {
|
||||
'chat': False,
|
||||
'notes': ''}
|
||||
|
||||
# Channel keyword arguments describing a Gab *group* (as opposed to a user
# account); exposed to the tests under the 'gab_group' key of channel_kwargs.
GAB_GROUP_KWARGS = {
    'name': 'iran group (test)',
    'platform_id': 10001,
    'category': 'test',
    'platform': 'Gab',
    'url': 'https://gab.com/groups/10001',
    'screenname': 'iran group',
    'country': 'IR',
    'influencer': None,
    'public': True,
    'chat': True,
    'notes': '',
}
|
||||
|
||||
GETTR_CHANNEL_KWARGS = {
|
||||
'name': 'LizardRepublic (test)',
|
||||
'platform_id': 'lizardrepublic',
|
||||
@@ -178,6 +191,7 @@ def channel_kwargs():
|
||||
return {
|
||||
'bitchute' : BITCHUTE_CHANNEL_KWARGS,
|
||||
'gab' : GAB_CHANNEL_KWARGS,
|
||||
'gab_group' : GAB_GROUP_KWARGS,
|
||||
'gettr' : GETTR_CHANNEL_KWARGS,
|
||||
'instagram' : INSTAGRAM_CHANNEL_KWARGS,
|
||||
'odysee' : ODYSEE_CHANNEL_KWARGS,
|
||||
|
||||
@@ -23,4 +23,26 @@ def test_scrape_gab_profile(channel_kwargs):
|
||||
|
||||
scraper = GabScraper()
|
||||
channel = Channel(**channel_kwargs['gab'])
|
||||
scraper.get_profile(channel=channel)
|
||||
|
||||
def test_scrape_gab_group_no_media(controller, channel_kwargs):
    """Scrape the Gab test group through the controller with media archiving off."""
    group_channel = Channel(**channel_kwargs['gab_group'])
    controller.register_scraper(scraper=GabScraper())
    controller.scrape_channels(channels=[group_channel], archive_media=False)
|
||||
|
||||
@pytest.mark.media
def test_scrape_gab_group(controller, channel_kwargs):
    """Scrape the Gab test group through the controller with media archiving on."""
    # Call reset_db() before anything else, as the original test does.
    controller.reset_db()

    group_channel = Channel(**channel_kwargs['gab_group'])
    controller.register_scraper(scraper=GabScraper())
    controller.scrape_channels(channels=[group_channel], archive_media=True)
|
||||
|
||||
@pytest.mark.profile
def test_scrape_gab_group_profile(channel_kwargs):
    """Fetch the profile of the Gab test group; passes if no exception is raised."""
    group_channel = Channel(**channel_kwargs['gab_group'])
    gab_scraper = GabScraper()
    gab_scraper.get_profile(channel=group_channel)
|
||||
Reference in New Issue
Block a user