diff --git a/cisticola/base.py b/cisticola/base.py
index 50bf0c7..50101b7 100644
--- a/cisticola/base.py
+++ b/cisticola/base.py
@@ -237,12 +237,12 @@ class Post:
         # replace is here in order to prevent catastrophic backtracking
         urls = re.findall(URL_REGEX, self.content.replace("::::::::", ""))
 
-        self.outlinks = urls
+        self.outlinks += urls
 
         HASHTAG_REGEX = r"(?:^|\s)[##]{1}(\w+)"
         hashtags = re.findall(HASHTAG_REGEX, self.content)
 
-        self.hashtags = hashtags
+        self.hashtags += hashtags
 
         # regex patterns for finding crypto addresses
         BTC_REGEX = r'\b(bc(0([ac-hj-np-z02-9]{39}|[ac-hj-np-z02-9]{59})|1[ac-hj-np-z02-9]{8,87})|[13][a-km-zA-HJ-NP-Z1-9]{25,35})\b'
diff --git a/cisticola/scraper/rumble.py b/cisticola/scraper/rumble.py
index ad87247..fe7adb3 100644
--- a/cisticola/scraper/rumble.py
+++ b/cisticola/scraper/rumble.py
@@ -14,7 +14,7 @@ BASE_URL = 'https://rumble.com'
 class RumbleScraper(Scraper):
     """An implementation of a Scraper for Rumble, using custom functions"""
 
-    __version__ = "RumbleScraper 0.0.1"
+    __version__ = "RumbleScraper 0.0.2"
 
     cookiestring = os.environ["YOUTUBE_COOKIESTRING"].replace(r'\n', '\n').replace(r'\t', '\t')
     cookiefilename = 'cookiefile.txt'
diff --git a/cisticola/transformer/__init__.py b/cisticola/transformer/__init__.py
index 77f9f0b..37df764 100644
--- a/cisticola/transformer/__init__.py
+++ b/cisticola/transformer/__init__.py
@@ -3,3 +3,4 @@ from .twitter import TwitterTransformer
 from .bitchute import BitchuteTransformer
 from .telegram_telethon import TelegramTelethonTransformer
 from .rumble import RumbleTransformer
+from .gettr import GettrTransformer
diff --git a/cisticola/transformer/gettr.py b/cisticola/transformer/gettr.py
new file mode 100644
index 0000000..aff1264
--- /dev/null
+++ b/cisticola/transformer/gettr.py
@@ -0,0 +1,90 @@
+import json
+from loguru import logger
+from typing import Generator, Union, Callable
+import dateutil.parser
+from datetime import datetime, timezone
+
+from cisticola.transformer.base import Transformer
+from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
+
+class GettrTransformer(Transformer):
+    """Transform raw Gettr scraper output into ChannelInfo and Post records."""
+
+    __version__ = "GettrTransformer 0.0.1"
+
+    def can_handle(self, data: ScraperResult) -> bool:
+        """Return True if ``data`` was produced by a GettrScraper, whatever its version."""
+        # Only the first space-separated token of the scraper string is compared.
+        return data.scraper.split(' ')[0] == "GettrScraper"
+
+    def transform_info(self, data: RawChannelInfo, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
+        """Build a ChannelInfo from a raw Gettr profile and hand it to ``insert``.
+
+        ``data.raw_data`` must hold the profile as a JSON object. Nothing is
+        yielded despite the return annotation; the record is delivered only
+        through ``insert``. ``session`` is not used by this method.
+        """
+        raw = json.loads(data.raw_data)
+
+        transformed = ChannelInfo(
+            raw_channel_info_id=data.id,
+            channel=data.channel,
+            platform_id=raw['_id'],
+            platform=data.platform,
+            scraper=data.scraper,
+            transformer=self.__version__,
+            screenname=raw['username'],
+            name=raw['nickname'],
+            description=raw['dsc'],
+            description_url=raw['website'],
+            description_location=raw['location'],
+            followers=raw['flg'],
+            following=raw['flw'],
+            verified=bool(raw.get('infl')),
+            # cdate is scaled by 1/1000 (presumably epoch milliseconds -- confirm);
+            # tz is explicit so the value does not depend on the host's local timezone.
+            date_created=datetime.fromtimestamp(raw['cdate'] / 1000.0, tz=timezone.utc),
+            date_archived=data.date_archived,
+            date_transformed=datetime.now(timezone.utc)
+        )
+
+        insert(transformed)
+
+    def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
+        """Build a Post from one raw Gettr post and hand it to ``insert``.
+
+        ``data.raw_data`` must hold the post as a JSON object. Nothing is
+        yielded despite the return annotation; the record is delivered only
+        through ``insert``. ``session`` is not used by this method.
+        """
+        raw = json.loads(data.raw_data)
+        activity = raw["activity"]
+
+        # A "shares_pst" activity is stored as forwarded from the activity's uid.
+        forwarded_from = activity["uid"] if activity["action"] == "shares_pst" else None
+
+        transformed = Post(
+            raw_id=data.id,
+            platform_id=raw["_id"],
+            scraper=data.scraper,
+            transformer=self.__version__,
+            platform=data.platform,
+            channel=data.channel,
+            # Explicit tz keeps the post date independent of the host's local timezone.
+            date=datetime.fromtimestamp(activity["cdate"] / 1000.0, tz=timezone.utc),
+            date_archived=data.date_archived,
+            date_transformed=datetime.now(timezone.utc),
+            url="https://www.gettr.com/post/" + raw["_id"],
+            # `or` also covers keys that are present but null, so Post always gets a str/list.
+            content=raw.get("txt") or "",
+            author_id=raw["receiver_id"],
+            author_username=raw["uid"],
+            hashtags=raw.get("htgs") or [],
+            outlinks=list(filter(None, [raw.get("prevsrc")])),
+            forwarded_from=forwarded_from)
+
+        insert(transformed)
+
+        # media = self.process_media(raw, transformed.id, data)
+        # for m in media:
+        #     insert(m)
diff --git a/tests/transformer/gettr.py b/tests/transformer/gettr.py
new file mode 100644
index 0000000..ef37b67
--- /dev/null
+++ b/tests/transformer/gettr.py
@@ -0,0 +1,34 @@
+from sqlalchemy.orm import sessionmaker
+import json
+
+import pytest
+
+from cisticola.base import Channel
+from cisticola.scraper import GettrScraper
+from cisticola.transformer import GettrTransformer
+from cisticola.base import Post, Media
+
+@pytest.mark.media
+def test_scrape_etl_gettr(engine, controller, etl_controller, channel_kwargs):
+    controller.reset_db()
+
+    channels = [Channel(**channel_kwargs['gettr'])]
+    controller.register_scraper(scraper = GettrScraper())
+    controller.scrape_channels(channels = channels, archive_media = True)
+
+    etl_controller.register_transformer(GettrTransformer())
+    etl_controller.transform_all_untransformed()
+    etl_controller.transform_all_untransformed_info()
+
+    sessionfactory = sessionmaker()
+    sessionfactory.configure(bind=engine)
+    session = sessionfactory()
+
+    posts = session.query(Post).all()
+    media = session.query(Media).all()
+
+    assert len(posts) == 23
+    # assert len(media) == 0
+
+    assert 'Nigerian gender studies' in posts[-1].content
+    # assert json.loads(media[0].exif)['Composite:ImageSize'] == "826 728"
\ No newline at end of file