mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-08 03:18:34 +03:00
added transformer for Gettr
This commit is contained in:
@@ -237,12 +237,12 @@ class Post:
|
||||
|
||||
# replace is here in order to prevent catastrophic backtracking
|
||||
urls = re.findall(URL_REGEX, self.content.replace("::::::::", ""))
|
||||
self.outlinks = urls
|
||||
self.outlinks += urls
|
||||
|
||||
HASHTAG_REGEX = r"(?:^|\s)[##]{1}(\w+)"
|
||||
|
||||
hashtags = re.findall(HASHTAG_REGEX, self.content)
|
||||
self.hashtags = hashtags
|
||||
self.hashtags += hashtags
|
||||
|
||||
# regex patterns for finding crypto addresses
|
||||
BTC_REGEX = r'\b(bc(0([ac-hj-np-z02-9]{39}|[ac-hj-np-z02-9]{59})|1[ac-hj-np-z02-9]{8,87})|[13][a-km-zA-HJ-NP-Z1-9]{25,35})\b'
|
||||
|
||||
@@ -14,7 +14,7 @@ BASE_URL = 'https://rumble.com'
|
||||
|
||||
class RumbleScraper(Scraper):
|
||||
"""An implementation of a Scraper for Rumble, using custom functions"""
|
||||
__version__ = "RumbleScraper 0.0.1"
|
||||
__version__ = "RumbleScraper 0.0.2"
|
||||
|
||||
cookiestring = os.environ["YOUTUBE_COOKIESTRING"].replace(r'\n', '\n').replace(r'\t', '\t')
|
||||
cookiefilename = 'cookiefile.txt'
|
||||
|
||||
@@ -3,3 +3,4 @@ from .twitter import TwitterTransformer
|
||||
from .bitchute import BitchuteTransformer
|
||||
from .telegram_telethon import TelegramTelethonTransformer
|
||||
from .rumble import RumbleTransformer
|
||||
from .gettr import GettrTransformer
|
||||
|
||||
78
cisticola/transformer/gettr.py
Normal file
78
cisticola/transformer/gettr.py
Normal file
@@ -0,0 +1,78 @@
|
||||
import json
|
||||
from loguru import logger
|
||||
from typing import Generator, Union, Callable
|
||||
import dateutil.parser
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from cisticola.transformer.base import Transformer
|
||||
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
|
||||
|
||||
class GettrTransformer(Transformer):
    """Transformer that turns raw Gettr scraper output into ``ChannelInfo``
    and ``Post`` records.

    Both transform methods decode ``data.raw_data`` as JSON, build a single
    record from it and hand that record to the ``insert`` callback.
    """

    __version__ = "GettrTransformer 0.0.1"

    def can_handle(self, data: ScraperResult) -> bool:
        """Return True when ``data`` was produced by a ``GettrScraper``.

        ``data.scraper`` is split on spaces and only its first token is
        compared, so any trailing text (presumably a version number) is
        ignored.
        """
        return data.scraper.split(' ')[0] == "GettrScraper"

    def transform_info(self, data: RawChannelInfo, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
        """Build a ``ChannelInfo`` from raw Gettr channel data and insert it.

        Args:
            data: Raw channel info whose ``raw_data`` is a JSON document.
            insert: Callback that receives the constructed ``ChannelInfo``.
            session: Accepted for interface compatibility; not used here.

        Raises:
            KeyError: If a field read with ``raw[...]`` below is missing.

        NOTE(review): the signature is annotated as a generator, but this
        method never yields and therefore returns ``None``.
        """
        raw = json.loads(data.raw_data)

        # ``cdate`` is divided by 1000 before conversion (i.e. it is treated
        # as milliseconds). The timestamp is made UTC-aware so the result does
        # not depend on the host's local timezone and matches the UTC-aware
        # ``date_transformed`` set below.
        date_created = datetime.fromtimestamp(raw['cdate'] / 1000.0, tz=timezone.utc)

        transformed = ChannelInfo(
            raw_channel_info_id=data.id,
            channel=data.channel,
            platform_id=raw['_id'],
            platform=data.platform,
            scraper=data.scraper,
            transformer=self.__version__,
            screenname=raw['username'],
            name=raw['nickname'],
            description=raw['dsc'],
            description_url=raw['website'],
            description_location=raw['location'],
            followers=raw['flg'],
            following=raw['flw'],
            # Any truthy ``infl`` value marks the channel as verified.
            verified=bool(raw.get('infl')),
            date_created=date_created,
            date_archived=data.date_archived,
            date_transformed=datetime.now(timezone.utc),
        )

        insert(transformed)

    def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
        """Build a ``Post`` from a raw Gettr scraper result and insert it.

        When the activity's ``action`` is ``"shares_pst"``, the activity's
        ``uid`` is stored as ``forwarded_from``; otherwise ``forwarded_from``
        is ``None``.

        Args:
            data: Scraper result whose ``raw_data`` is a JSON document.
            insert: Callback that receives the constructed ``Post``.
            session: Accepted for interface compatibility; not used here.

        Raises:
            KeyError: If a field read with ``raw[...]`` below is missing.

        NOTE(review): the signature is annotated as a generator, but this
        method never yields and therefore returns ``None``.
        """
        raw = json.loads(data.raw_data)
        activity = raw["activity"]

        if activity["action"] == "shares_pst":
            forwarded_from = activity["uid"]
        else:
            forwarded_from = None

        # UTC-aware for the same reason as in ``transform_info``: a naive
        # ``fromtimestamp`` would silently use the host's local timezone.
        posted_at = datetime.fromtimestamp(activity["cdate"] / 1000.0, tz=timezone.utc)

        # ``prevsrc`` is optional; keep it as the only outlink when it is
        # present and non-empty.
        preview_source = raw.get("prevsrc")
        outlinks = [preview_source] if preview_source else []

        transformed = Post(
            raw_id=data.id,
            platform_id=raw["_id"],
            scraper=data.scraper,
            transformer=self.__version__,
            platform=data.platform,
            channel=data.channel,
            date=posted_at,
            date_archived=data.date_archived,
            date_transformed=datetime.now(timezone.utc),
            url="https://www.gettr.com/post/" + raw["_id"],
            content=raw.get("txt", ""),
            author_id=raw["receiver_id"],
            author_username=raw["uid"],
            hashtags=raw.get("htgs", []),
            outlinks=outlinks,
            forwarded_from=forwarded_from,
        )

        insert(transformed)

        # TODO: media processing is not wired up yet.
        # media = self.process_media(raw, transformed.id, data)
        # for m in media:
        #     insert(m)
|
||||
34
tests/transformer/gettr.py
Normal file
34
tests/transformer/gettr.py
Normal file
@@ -0,0 +1,34 @@
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import GettrScraper
|
||||
from cisticola.transformer import GettrTransformer
|
||||
from cisticola.base import Post, Media
|
||||
|
||||
@pytest.mark.media
def test_scrape_etl_gettr(engine, controller, etl_controller, channel_kwargs):
    """End-to-end check: scrape the configured Gettr channel, run the Gettr
    transformer over the results, and verify the stored posts.
    """
    controller.reset_db()

    channels = [Channel(**channel_kwargs['gettr'])]
    controller.register_scraper(scraper=GettrScraper())
    controller.scrape_channels(channels=channels, archive_media=True)

    etl_controller.register_transformer(GettrTransformer())
    etl_controller.transform_all_untransformed()
    etl_controller.transform_all_untransformed_info()

    sessionfactory = sessionmaker()
    sessionfactory.configure(bind=engine)
    session = sessionfactory()

    # Always release the session (and its connection/transaction), even when
    # an assertion fails, so it cannot interfere with later tests.
    try:
        posts = session.query(Post).all()
        media = session.query(Media).all()

        assert len(posts) == 23
        # assert len(media) == 0

        # NOTE(review): the query has no ORDER BY, so ``posts[-1]`` depends on
        # the order the database happens to return rows in — confirm this is
        # stable enough for the backend under test.
        assert 'Nigerian gender studies' in posts[-1].content
        # assert json.loads(media[0].exif)['Composite:ImageSize'] == "826 728"
    finally:
        session.close()
|
||||
Reference in New Issue
Block a user