added capability to scrape Gab group posts

This commit is contained in:
Tristan Lee
2022-03-30 09:11:07 -05:00
parent 1f99e52436
commit b7871b060d
3 changed files with 65 additions and 9 deletions

View File

@@ -17,21 +17,34 @@ class GabScraper(Scraper):
return username
def get_group_id_from_url(self, url):
    """Return the numeric group id found at the end of a group URL.

    The id is taken from the last non-empty segment of the URL path, so a
    trailing slash, query string or fragment does not break parsing
    (e.g. ``https://gab.com/groups/10001/`` -> ``10001``).

    Args:
        url: Group URL whose final path segment is the numeric id.

    Returns:
        The group id as an ``int``.

    Raises:
        ValueError: If the final path segment is not an integer.
    """
    # Local import keeps this method self-contained.
    from urllib.parse import urlsplit

    # Only the path matters; dropping trailing slashes avoids int('').
    path = urlsplit(url).path.rstrip('/')
    last_segment = path.rsplit('/', 1)[-1]
    return int(last_segment)
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
client = Client(
username = os.environ['GAB_USER'],
password = os.environ['GAB_PASS'],
threads = 25)
username = self.get_username_from_url(channel.url)
if channel.url.split('/')[-2] == 'groups':
result = client._get(GAB_API_BASE_URL + f"/account_by_username/{username}").json()
user_id = int(result['id'])
group_id = self.get_group_id_from_url(url = channel.url)
scraper = client.pull_group_posts(
id = group_id,
depth = float('inf'))
else:
scraper = client.pull_statuses(
id = user_id,
created_after = date.min,
replies = False)
username = self.get_username_from_url(channel.url)
result = client._get(GAB_API_BASE_URL + f"/account_by_username/{username}").json()
user_id = int(result['id'])
scraper = client.pull_statuses(
id = user_id,
created_after = date.min,
replies = False)
for post in scraper:
if since is not None and datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")).replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
@@ -82,8 +95,15 @@ class GabScraper(Scraper):
password = os.environ['GAB_PASS'],
threads = 25)
username = self.get_username_from_url(channel.url)
if channel.url.split('/')[-2] == 'groups':
profile = client._get(GAB_API_BASE_URL + f"/account_by_username/{username}").json()
group_id = self.get_group_id_from_url(url = channel.url)
profile = client.pull_group(id = group_id)
else:
username = self.get_username_from_url(channel.url)
profile = client._get(GAB_API_BASE_URL + f"/account_by_username/{username}").json()
return profile

View File

@@ -33,6 +33,19 @@ GAB_CHANNEL_KWARGS = {
'chat': False,
'notes': ''}
# Keyword arguments used to build the Gab *group* test channel.
# The url ends in the same numeric id that is stored in 'platform_id'.
GAB_GROUP_KWARGS = {
    'name': 'iran group (test)',
    'platform_id': 10001,
    'category': 'test',
    'platform': 'Gab',
    'url': 'https://gab.com/groups/10001',
    'screenname': 'iran group',
    'country': 'IR',
    'influencer': None,
    'public': True,
    'chat': True,
    'notes': '',
}
GETTR_CHANNEL_KWARGS = {
'name': 'LizardRepublic (test)',
'platform_id': 'lizardrepublic',
@@ -178,6 +191,7 @@ def channel_kwargs():
return {
'bitchute' : BITCHUTE_CHANNEL_KWARGS,
'gab' : GAB_CHANNEL_KWARGS,
'gab_group' : GAB_GROUP_KWARGS,
'gettr' : GETTR_CHANNEL_KWARGS,
'instagram' : INSTAGRAM_CHANNEL_KWARGS,
'odysee' : ODYSEE_CHANNEL_KWARGS,

View File

@@ -23,4 +23,26 @@ def test_scrape_gab_profile(channel_kwargs):
scraper = GabScraper()
channel = Channel(**channel_kwargs['gab'])
scraper.get_profile(channel=channel)
def test_scrape_gab_group_no_media(controller, channel_kwargs):
    """Scrape the Gab test group with media archiving switched off."""
    gab_group = Channel(**channel_kwargs['gab_group'])
    controller.register_scraper(scraper=GabScraper())
    # archive_media=False: posts are scraped, media is not archived.
    controller.scrape_channels(channels=[gab_group], archive_media=False)
@pytest.mark.media
def test_scrape_gab_group(controller, channel_kwargs):
    """Scrape the Gab test group with media archiving switched on."""
    # Reset the controller's database before scraping.
    controller.reset_db()
    gab_group = Channel(**channel_kwargs['gab_group'])
    controller.register_scraper(scraper=GabScraper())
    controller.scrape_channels(channels=[gab_group], archive_media=True)
@pytest.mark.profile
def test_scrape_gab_group_profile(channel_kwargs):
    """Request the profile of the Gab test group through GabScraper."""
    gab = GabScraper()
    group_channel = Channel(**channel_kwargs['gab_group'])
    # Only checks that the call completes; the result is not inspected.
    gab.get_profile(channel=group_channel)