added transformer for Gettr

This commit is contained in:
Tristan Lee
2022-05-20 02:22:34 -05:00
parent 591f1986e8
commit f4072183be
5 changed files with 116 additions and 3 deletions

View File

@@ -237,12 +237,12 @@ class Post:
# replace is here in order to prevent catastrophic backtracking
urls = re.findall(URL_REGEX, self.content.replace("::::::::", ""))
self.outlinks = urls
self.outlinks += urls
HASHTAG_REGEX = r"(?:^|\s)[#]{1}(\w+)"
hashtags = re.findall(HASHTAG_REGEX, self.content)
self.hashtags = hashtags
self.hashtags += hashtags
# regex patterns for finding crypto addresses
BTC_REGEX = r'\b(bc(0([ac-hj-np-z02-9]{39}|[ac-hj-np-z02-9]{59})|1[ac-hj-np-z02-9]{8,87})|[13][a-km-zA-HJ-NP-Z1-9]{25,35})\b'

View File

@@ -14,7 +14,7 @@ BASE_URL = 'https://rumble.com'
class RumbleScraper(Scraper):
"""An implementation of a Scraper for Rumble, using custom functions"""
__version__ = "RumbleScraper 0.0.1"
__version__ = "RumbleScraper 0.0.2"
cookiestring = os.environ["YOUTUBE_COOKIESTRING"].replace(r'\n', '\n').replace(r'\t', '\t')
cookiefilename = 'cookiefile.txt'

View File

@@ -3,3 +3,4 @@ from .twitter import TwitterTransformer
from .bitchute import BitchuteTransformer
from .telegram_telethon import TelegramTelethonTransformer
from .rumble import RumbleTransformer
from .gettr import GettrTransformer

View File

@@ -0,0 +1,78 @@
import json
from loguru import logger
from typing import Generator, Union, Callable
import dateutil.parser
from datetime import datetime, timezone
from cisticola.transformer.base import Transformer
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
class GettrTransformer(Transformer):
    """Transformer that turns raw Gettr scraper output into structured records.

    Raw channel info becomes a ``ChannelInfo`` and each raw post becomes a
    ``Post``. Records are handed to the caller-supplied ``insert`` callable
    rather than yielded.
    """

    __version__ = "GettrTransformer 0.0.1"

    @staticmethod
    def _from_epoch_ms(milliseconds) -> datetime:
        """Convert a Unix timestamp in milliseconds to a UTC-aware datetime.

        ``datetime.fromtimestamp`` without ``tz`` returns a naive datetime in
        the host's local zone, which would make stored dates depend on where
        the transformer runs. Passing ``tz=timezone.utc`` keeps the result
        unambiguous and consistent with ``date_transformed``.
        """
        return datetime.fromtimestamp(milliseconds / 1000.0, tz=timezone.utc)

    def can_handle(self, data: ScraperResult) -> bool:
        """Return True when ``data`` was produced by a ``GettrScraper``.

        Only the first space-separated token of ``data.scraper`` is compared,
        so the version part of the scraper string is ignored.
        """
        return data.scraper.split(' ')[0] == "GettrScraper"

    def transform_info(self, data: RawChannelInfo, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
        """Build a ``ChannelInfo`` from raw Gettr profile JSON and insert it.

        Args:
            data: Raw channel info whose ``raw_data`` is a JSON document.
            insert: Callable that receives the constructed ``ChannelInfo``.
            session: Not used by this implementation.

        Raises:
            KeyError: If a directly indexed profile key is missing.

        Note:
            Despite the annotation, this method contains no ``yield`` and
            returns ``None``; the record is delivered through ``insert``.
        """
        raw = json.loads(data.raw_data)

        # NOTE(review): 'dsc', 'website' and 'location' are indexed directly and
        # raise KeyError when absent -- confirm the scraper always supplies them.
        channel_info = ChannelInfo(
            raw_channel_info_id=data.id,
            channel=data.channel,
            platform_id=raw['_id'],
            platform=data.platform,
            scraper=data.scraper,
            transformer=self.__version__,
            screenname=raw['username'],
            name=raw['nickname'],
            description=raw['dsc'],
            description_url=raw['website'],
            description_location=raw['location'],
            followers=raw['flg'],
            following=raw['flw'],
            # Any truthy 'infl' value marks the account as verified.
            verified=bool(raw.get('infl')),
            date_created=self._from_epoch_ms(raw['cdate']),
            date_archived=data.date_archived,
            date_transformed=datetime.now(timezone.utc),
        )

        insert(channel_info)

    def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
        """Build a ``Post`` from a raw Gettr post JSON document and insert it.

        Args:
            data: Scraper result whose ``raw_data`` is a JSON document.
            insert: Callable that receives the constructed ``Post``.
            session: Not used by this implementation.

        Raises:
            KeyError: If a directly indexed key is missing.

        Note:
            Despite the annotation, this method contains no ``yield`` and
            returns ``None``; the record is delivered through ``insert``.
        """
        raw = json.loads(data.raw_data)
        activity = raw["activity"]

        # A "shares_pst" activity is recorded as a forward, attributed to the
        # uid stored on the activity.
        forwarded_from = activity["uid"] if activity["action"] == "shares_pst" else None

        post = Post(
            raw_id=data.id,
            platform_id=raw["_id"],
            scraper=data.scraper,
            transformer=self.__version__,
            platform=data.platform,
            channel=data.channel,
            date=self._from_epoch_ms(activity["cdate"]),
            date_archived=data.date_archived,
            date_transformed=datetime.now(timezone.utc),
            url="https://www.gettr.com/post/" + raw["_id"],
            content=raw.get("txt", ""),
            author_id=raw["receiver_id"],
            author_username=raw["uid"],
            hashtags=raw.get("htgs", []),
            # "prevsrc" becomes the only outlink; a missing/empty value gives [].
            outlinks=list(filter(None, [raw.get("prevsrc")])),
            forwarded_from=forwarded_from,
        )

        insert(post)

        # Media extraction is not enabled yet:
        # media = self.process_media(raw, transformed.id, data)
        # for m in media:
        #     insert(m)

View File

@@ -0,0 +1,34 @@
from sqlalchemy.orm import sessionmaker
import json
import pytest
from cisticola.base import Channel
from cisticola.scraper import GettrScraper
from cisticola.transformer import GettrTransformer
from cisticola.base import Post, Media
@pytest.mark.media
def test_scrape_etl_gettr(engine, controller, etl_controller, channel_kwargs):
    """Scrape the configured Gettr channel, run the ETL step, and check the posts.

    The database is reset first, so the assertions only see rows produced by
    this run.
    """
    controller.reset_db()

    channels = [Channel(**channel_kwargs['gettr'])]
    controller.register_scraper(scraper=GettrScraper())
    controller.scrape_channels(channels=channels, archive_media=True)

    etl_controller.register_transformer(GettrTransformer())
    etl_controller.transform_all_untransformed()
    etl_controller.transform_all_untransformed_info()

    session = sessionmaker(bind=engine)()
    try:
        posts = session.query(Post).all()
        # Queried so the media assertions below can be re-enabled later.
        media = session.query(Media).all()

        assert len(posts) == 23
        # assert len(media) == 0

        assert 'Nigerian gender studies' in posts[-1].content
        # assert json.loads(media[0].exif)['Composite:ImageSize'] == "826 728"
    finally:
        # Release the connection whether the assertions pass or fail.
        session.close()