Merge pull request #62 from bellingcat/other-transformer-fixes

Fixed broken channel_info transformers, added Telegram post transformer fields
This commit is contained in:
Logan Williams
2022-06-24 11:00:50 +02:00
committed by GitHub
18 changed files with 438 additions and 86 deletions

15
app.py
View File

@@ -14,7 +14,14 @@ from cisticola.scraper import (
BitchuteScraper,
RumbleScraper,
)
from cisticola.transformer import (ETLController, TelegramTelethonTransformer)
from cisticola.transformer import (
ETLController,
TelegramTelethonTransformer,
GettrTransformer,
RumbleTransformer,
BitchuteTransformer,
VkontakteTransformer)
from sync_with_gsheet import sync_channels
def get_db_session():
@@ -49,7 +56,11 @@ def get_transformer_controller():
controller = ETLController()
controller.connect_to_db(engine)
transformers = [TelegramTelethonTransformer()]
transformers = [VkontakteTransformer(),
TelegramTelethonTransformer(),
GettrTransformer(),
BitchuteTransformer(),
RumbleTransformer()]
controller.register_transformers(transformers)

View File

@@ -229,20 +229,40 @@ class Post:
#: The ID of the Channel that the post was forwarded or quoted from
forwarded_from: int = None
#: The ID of the Post that this Post is a reply to or reblog of
#: The ID of the Post that this Post is a reply to
reply_to: int = None
#: Other users mentioned in the post
mentions: list = field(default_factory=list)
#: Number of positive post reactions (e.g. likes, favorites, rumbles, upvotes, etc.)
likes: int = None
#: Number of times the post was forwarded/retweeted/shared
forwards: int = None
#: Number of times the post was viewed
views: int = None
#: Video title, if post is a video
video_title: str = None
#: Video duration in seconds, if post is a video
video_duration: int = None
def hydrate(self):
URL_REGEX = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/
?(?!@)))"""
# replace is here in order to prevent catastrophic backtracking
urls = re.findall(URL_REGEX, self.content.replace("::::::::", ""))
self.outlinks += urls
self.outlinks = list(set(outlink for outlink in self.outlinks))
HASHTAG_REGEX = r"(?:^|\s)[#]{1}(\w+)"
hashtags = re.findall(HASHTAG_REGEX, self.content)
self.hashtags += hashtags
self.hashtags = list(set(hashtag.lower() for hashtag in self.hashtags))
# regex patterns for finding crypto addresses
BTC_REGEX = r'\b(bc(0([ac-hj-np-z02-9]{39}|[ac-hj-np-z02-9]{59})|1[ac-hj-np-z02-9]{8,87})|[13][a-km-zA-HJ-NP-Z1-9]{25,35})\b'
@@ -398,7 +418,6 @@ channel_info_table = Table('channel_info', mapper_registry.metadata,
Column('channel', Integer, ForeignKey('channels.id'), index=True),
Column('platform_id', String),
Column('scraper', String),
Column('platform', String),
Column('transformer', String),
Column('platform', String),
Column('screenname', String),
@@ -434,7 +453,7 @@ post_table = Table('posts', mapper_registry.metadata,
Column('id', Integer, primary_key=True,
autoincrement=True),
Column('raw_id', Integer, ForeignKey('raw_posts.id'), index=True),
Column('platform_id', Integer, index=True),
Column('platform_id', String, index=True),
Column('scraper', String),
Column('transformer', String),
Column('platform', String),
@@ -452,6 +471,12 @@ post_table = Table('posts', mapper_registry.metadata,
Column('cryptocurrency_addresses', JSON),
Column('hashtags', JSON),
Column('outlinks', JSON),
Column('mentions', JSON),
Column('likes', Integer),
Column('forwards', Integer),
Column('views', Integer),
Column('video_title', String),
Column('video_duration', Integer),
Column('detected_language', String),
Column('normalized_content', String)
)

View File

@@ -105,7 +105,7 @@ class BitchuteScraper(Scraper):
profile = {
'description' : description_soup.text.strip(),
'description_links' : [a['href'] for a in description_soup.find_all('a', href = True)],
'created': parse_created(re.sub(r'\s', ' ', info_list[0].text.split('Created')[1].strip('. '))),
'created': re.sub(r'\s', ' ', info_list[0].text.split('Created')[1].strip('. ')),
'videos' : int(info_list[1].text.split('videos')[0].strip()),
'owner_url' : soup.find('p', {'class' : 'owner'}).find('a', href = True)['href'],
'owner_name' : owner_name,
@@ -483,16 +483,4 @@ def decode_cfemail(cfemail):
for i in range(2, len(cfemail)-1, 2):
email += chr(int(cfemail[i:i+2], 16)^k)
return email
#---------------------------------------------------------------------------#
def parse_created(created):
period_list = ['year', 'month', 'week', 'day']
periods = [period.strip() for period in created.split('ago')[0].strip().split(',')]
_kwargs = {period : int(number) for period, number in dict(reversed(p.split(' ')) for p in periods).items()}
kwargs = {(k + 's' if k in period_list else k) : v for k, v in _kwargs.items()}
return datetime.now() - relativedelta(**kwargs)
return email

View File

@@ -166,13 +166,7 @@ class TelegramTelethonScraper(Scraper):
@logger.catch
def get_profile(self, channel: Channel) -> RawChannelInfo:
username = TelegramTelethonScraper.get_channel_identifier(channel)
api_id = os.environ['TELEGRAM_API_ID']
api_hash = os.environ['TELEGRAM_API_HASH']
phone = os.environ['TELEGRAM_PHONE']
with TelegramClient(phone, api_id, api_hash) as client:
full_channel = client(GetFullChannelRequest(channel = username))
full_channel = self.client(GetFullChannelRequest(channel = username))
profile = full_channel.to_dict()
return RawChannelInfo(scraper=self.__version__,

View File

@@ -4,3 +4,4 @@ from .bitchute import BitchuteTransformer
from .telegram_telethon import TelegramTelethonTransformer
from .rumble import RumbleTransformer
from .gettr import GettrTransformer
from .vkontakte import VkontakteTransformer

View File

@@ -96,7 +96,7 @@ class ETLController:
# This is using some adhoc unique constraints that might be worth formalizing at some point
if type(obj) == Channel:
instance = session.query(Channel).filter_by(url=obj.url, platform_id=str(obj.platform_id), platform=obj.platform).first()
instance = session.query(Channel).filter_by(url=obj.url, platform_id=str(obj.platform_id or '') or obj.platform_id, platform=obj.platform).first()
elif type(obj) == Post:
instance = None
@@ -133,6 +133,8 @@ class ETLController:
obj.hydrate()
session.add(obj)
session.flush()
logger.trace(f"Inserted new object {obj}")
return obj
@@ -169,8 +171,8 @@ class ETLController:
session.commit()
break
if handled == False:
logger.warning(f"No Transformer could handle ID {result.id} with platform {result.platform} ({result.date})")
if handled == False:
logger.warning(f"No Transformer could handle ID {result.id} with platform {result.platform} ({result.date})")
@logger.catch(reraise=True)
def transform_all_untransformed(self, hydrate: bool = True):

View File

@@ -2,7 +2,7 @@ import json
from loguru import logger
from typing import Generator, Union, Callable
from datetime import datetime, timezone
import dateutil.parser
from dateutil.relativedelta import relativedelta
from bs4 import BeautifulSoup
@@ -12,7 +12,7 @@ from cisticola.base import RawChannelInfo, ScraperResult, Post, Image, Video, Me
class BitchuteTransformer(Transformer):
"""A Bitchute specific ScraperResult, with a method ETL/transforming"""
__version__ = "BitchuteTransformer 0.0.1"
__version__ = "BitchuteTransformer 0.0.2"
def can_handle(self, data: ScraperResult) -> bool:
scraper = data.scraper.split(' ')
@@ -49,7 +49,7 @@ class BitchuteTransformer(Transformer):
followers=raw['subscribers'],
following=-1, # does not exist for Bitchute
verified=False, # does not exist for Bitchute
date_created=dateutil.parser.parse(raw['created']),
date_created=parse_created(raw['created'], data.date_archived),
date_archived=data.date_archived,
date_transformed=datetime.now(timezone.utc)
)
@@ -59,8 +59,32 @@ class BitchuteTransformer(Transformer):
def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
raw = json.loads(data.raw_data)
soup = BeautifulSoup(raw['body'], features = 'html.parser')
content = soup.find_all('p')[-1].text
if raw['category'] == 'comment':
if raw['parent_id'] is None:
reply_to_id = raw['thread_id']
else:
reply_to_id = raw['parent_id']
post = session.query(Post).filter_by(channel=data.channel, platform_id=reply_to_id).first()
if post is None:
if raw['parent_id'] is not None:
# this block is for comments whose parent_ids correspond to deleted comments
post = session.query(Post).filter_by(channel=data.channel, platform_id=raw['thread_id']).first()
if post is None:
reply_to = -1
else:
reply_to = post.id
else:
reply_to = -1
else:
reply_to = post.id
content = raw['body'].strip()
else:
reply_to = -1
soup = BeautifulSoup(raw['body'], features = 'html.parser')
soup.find('div', {'class': 'teaser'}).decompose()
soup.find('span', {'class': 'more'}).decompose()
soup.find('span', {'class': 'less hidden'}).decompose()
content = soup.text.strip()
transformed = Post(
raw_id=data.id,
@@ -72,9 +96,41 @@ class BitchuteTransformer(Transformer):
date=data.date,
date_archived=data.date_archived,
date_transformed=datetime.now(timezone.utc),
url=raw['url'],
url=raw['url'] if raw['url'] else None,
content=content,
author_id=raw['author_id'],
author_username=raw['author'])
author_username=raw['author'],
reply_to=reply_to,
hashtags = list(filter(None, [h.strip('#') for h in raw['hashtags'].split(',')])),
likes = raw['likes'],
views = int(raw['views']) if raw.get('views') else None,
video_title = raw['subject'],
video_duration = _parse_duration_str(raw['length']))
transformed = insert(transformed)
transformed = insert(transformed)
session.flush()
def parse_created(created: str, date_archived: datetime) -> datetime:
    """Convert a created string (e.g. ``"1 year, 10 months ago"``) to a datetime
    object relative to the specified ``date_archived``.

    Parameters
    ----------
    created: str
        Either an ISO-8601 timestamp or a comma separated list of
        ``"<number> <unit>"`` pairs followed by ``ago``.
    date_archived: datetime
        The moment the string was scraped; relative offsets are subtracted
        from it.

    Returns
    -------
    datetime
        The parsed timestamp (ISO input) or ``date_archived`` minus the
        described offset (relative input).

    Raises
    ------
    ValueError
        If ``created`` is neither an ISO timestamp nor a well formed
        relative description.
    """
    try:
        # handle case where `created` string has already been parsed into a datetime
        return datetime.fromisoformat(created)
    except ValueError:
        pass

    # relativedelta treats singular keywords (``hour=1``) as absolute
    # replacements and plural ones (``hours=1``) as offsets, so every
    # singular unit has to be pluralised before it is passed on.
    singular_units = ('year', 'month', 'week', 'day', 'hour', 'minute', 'second')

    offsets = {}
    for period in created.split('ago')[0].split(','):
        # str.split() without an argument tolerates repeated or surrounding whitespace
        number, unit = period.split()
        if unit in singular_units:
            unit += 's'
        offsets[unit] = int(number)

    return date_archived - relativedelta(**offsets)
def _parse_duration_str(duration_str: str) -> int:
"""Convert duration string (e.g. '2:27:04') to the number of seconds (e.g. 8824).
"""
if not duration_str:
return None
else:
duration_list = duration_str.split(':')
return sum([int(s) * int(g) for s, g in zip([1, 60, 3600], reversed(duration_list))])

View File

@@ -3,6 +3,9 @@ from loguru import logger
from typing import Generator, Union, Callable
import dateutil.parser
from datetime import datetime, timezone
from sqlalchemy import func
from gogettr import PublicClient
from gogettr.api import GettrApiError
from cisticola.transformer.base import Transformer
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
@@ -31,28 +34,68 @@ class GettrTransformer(Transformer):
transformer=self.__version__,
screenname=raw['username'],
name=raw['nickname'],
description=raw['dsc'],
description_url=raw['website'],
description_location=raw['location'],
followers=raw['flg'],
following=raw['flw'],
description=raw.get('dsc'),
description_url=raw.get('website'),
description_location=raw.get('location'),
followers=int(raw['flg']),
following=int(raw['flw']),
verified=True if raw.get('infl') else False,
date_created=datetime.fromtimestamp(raw['cdate']*0.001),
date_created=datetime.fromtimestamp(int(raw['cdate'])*0.001),
date_archived=data.date_archived,
date_transformed=datetime.now(timezone.utc)
)
transformed = insert(transformed)
def _get_channel_id(self, username: str, category: str, insert: Callable, session):
    """Return the database ID of the Gettr channel called ``username``.

    The channel is looked up case-insensitively by screenname among Gettr
    channels. If it is not stored yet, its profile is requested from the
    public Gettr API and a new ``Channel`` is inserted; when the API call
    fails with ``GettrApiError`` a placeholder channel carrying only the
    username is inserted instead.

    Parameters
    ----------
    username: str
        Gettr username to resolve.
    category: str
        Category assigned to a newly created channel.
    insert: Callable
        Callback that persists an object and returns the stored instance.
    session:
        Database session used for the lookup.

    Returns
    -------
    The ``id`` of the existing or newly inserted channel.
    """
    # Query.filter() only accepts positional SQL expressions; passing
    # ``platform='Gettr'`` as a keyword raises TypeError, so the platform
    # restriction is written as a column comparison.
    channel = session.query(Channel).filter(
        func.lower(Channel.screenname) == func.lower(username),
        Channel.platform == 'Gettr',
    ).first()

    if channel is None:
        try:
            client = PublicClient()
            profile = client.user_info(username.lower())
            screenname = profile.get('_id')

            channel = Channel(
                name=profile.get('nickname'),
                platform_id=screenname,
                platform='Gettr',
                url="https://gettr.com/user/" + screenname,
                screenname=screenname,
                category=category,
                source=self.__version__,
            )
        except GettrApiError:
            # Keep a stub so the reference can still be stored.
            channel = Channel(
                name=None,
                platform_id=None,
                platform='Gettr',
                url=None,
                screenname=username,
                category=category,
                source=self.__version__,
                notes='GettrApiError'
            )

        channel = insert(channel)

    return channel.id
def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
raw = json.loads(data.raw_data)
if raw["activity"]["action"] == "shares_pst":
forwarded_from = raw["activity"]["uid"]
forwarded_from = self._get_channel_id(
username = str(raw["activity"]["uid"]), category = 'forwarded', insert = insert, session = session)
else:
forwarded_from = None
mentions = []
for mentioned_user in raw.get("utgs", []):
mentioned_id = self._get_channel_id(
username = mentioned_user, category = 'mentioned', insert = insert, session = session)
mentions.append(mentioned_id)
transformed = Post(
raw_id=data.id,
platform_id=raw["_id"],
@@ -69,7 +112,12 @@ class GettrTransformer(Transformer):
author_username=raw["uid"],
hashtags=raw.get("htgs", []),
outlinks = list(filter(None, [raw.get("prevsrc")])),
forwarded_from = forwarded_from)
forwarded_from = forwarded_from,
mentions = mentions,
likes = raw.get('lkbpst'),
forwards = raw.get("shbpst"),
views = raw.get('vfpst')
)
insert(transformed)

View File

@@ -3,6 +3,7 @@ from loguru import logger
from typing import Generator, Union, Callable
import dateutil.parser
from datetime import datetime, timezone
from sqlalchemy import func, JSON, String, cast, text
from cisticola.transformer.base import Transformer
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
@@ -22,19 +23,30 @@ class RumbleTransformer(Transformer):
def transform_info(self, data: RawChannelInfo, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
raw = json.loads(data.raw_data)
if 'id' not in raw:
# The first version of the Rumble ChannelInfo scraper didn't return
# the platform_id, so this is a workaround.
channel = session.query(RawChannelInfo).filter(text("raw_channel_info.raw_data::jsonb ->> 'name'=:name"), RawChannelInfo.platform == 'Rumble').params(name=raw['name']).order_by(RawChannelInfo.date_archived.desc()).first()
if channel is None:
platform_id = None
else:
platform_id = json.loads(channel.raw_data)['id']
else:
platform_id = raw['id']
transformed = ChannelInfo(
raw_channel_info_id=data.id,
channel=data.channel,
platform_id=raw['id'],
platform_id=platform_id,
platform=data.platform,
scraper=data.scraper,
transformer=self.__version__,
screenname=raw['id'],
screenname=platform_id,
name=raw['name'],
description='', # does not exist for Rumble
description_url='', # does not exist for Rumble
description_location='', # does not exist for Rumble
followers=raw['subscribers'],
followers=_process_number(raw['subscribers']),
following=-1, # does not exist for Rumble
verified=raw['verified'],
date_created=None, # does not exist for Rumble
@@ -61,10 +73,35 @@ class RumbleTransformer(Transformer):
url=raw['link'],
content=raw['content'],
author_id=raw['author_id'],
author_username=raw['author_name'])
author_username=raw['author_name'],
views = _process_number(raw.get('views')),
likes = _process_number(raw.get('rumbles')),
video_title = raw['title'],
video_duration=_parse_duration_str(raw['duration']))
insert(transformed)
# media = self.process_media(raw, transformed.id, data)
# for m in media:
# insert(m)
# insert(m)
def _process_number(s):
if s is None:
return None
else:
s = s.replace(' ', '')
if s.endswith('M'):
return int(float(s[:-1]) * 1e6)
elif s.endswith('K'):
return int(float(s[:-1]) * 1000)
return int(s)
def _parse_duration_str(duration_str: str) -> int:
"""Convert duration string (e.g. '2:27:04') to the number of seconds (e.g. 8824).
"""
if not duration_str:
return None
else:
duration_list = duration_str.split(':')
return sum([int(s) * int(g) for s, g in zip([1, 60, 3600], reversed(duration_list))])

View File

@@ -8,15 +8,19 @@ import requests
import time
from telethon.sync import TelegramClient
from telethon.errors.rpcerrorlist import ChannelPrivateError, ChannelInvalidError
from telethon.tl import types
from telethon.helpers import add_surrogate
import os
from datetime import datetime, timezone
from sqlalchemy import func
from cisticola.transformer.base import Transformer
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
class TelegramTelethonTransformer(Transformer):
__version__ = 'TelegramTelethonTransformer 0.0.2'
__version__ = 'TelegramTelethonTransformer 0.0.3'
bad_channels = {}
@@ -34,8 +38,10 @@ class TelegramTelethonTransformer(Transformer):
try:
with TelegramClient("transform.session", api_id, api_hash) as client:
data = client.get_entity(channel_id)
return (data.username, data.title, "")
if isinstance(data, types.User):
return (data.username, str(data.first_name or "") + " " + str(data.last_name or ""), "")
else:
return (data.username, data.title, "")
except ChannelPrivateError:
logger.info("ChannelPrivateError")
return ("", "", "ChannelPrivateError")
@@ -125,7 +131,7 @@ class TelegramTelethonTransformer(Transformer):
fwd_from = None
if raw['fwd_from'] and raw['fwd_from']['from_id'] and 'channel_id' in raw['fwd_from']['from_id']:
channel = session.query(Channel).filter_by(platform_id=str(raw['fwd_from']['from_id']['channel_id'])).first()
channel = session.query(Channel).filter_by(platform_id=str(raw['fwd_from']['from_id']['channel_id']), platform = 'Telegram').first()
if channel is None:
(screenname, name, notes) = self.get_screenname_from_id(raw['fwd_from']['from_id']['channel_id'])
@@ -154,13 +160,50 @@ class TelegramTelethonTransformer(Transformer):
reply_to = None
if raw['reply_to']:
reply_to_id = raw['reply_to']['reply_to_msg_id']
reply_to_id = str(raw['reply_to']['reply_to_msg_id'])
post = session.query(Post).filter_by(channel=data.channel, platform_id=reply_to_id).first()
if post is None:
reply_to = -1
else:
reply_to = post.id
mentions = []
for mention_entity in [entity for entity in raw['entities'] if entity['_'] == 'MessageEntityMention']:
offset = mention_entity['offset']
length = mention_entity['length']
screenname = add_surrogate(raw['message'])[offset:offset+length].strip('@').strip()
channel = session.query(Channel).filter(func.lower(Channel.screenname)==func.lower(screenname)).first()
if channel is None:
channel = Channel(
name = None,
platform_id = None,
platform = 'Telegram',
url="https://t.me/s/" + screenname,
screenname=screenname,
category='mentioned',
source=self.__version__,
)
channel = insert(channel)
logger.info(f"Added {channel}")
mentions.append(channel.id)
channel = session.query(Channel).filter_by(id=int(data.channel)).first()
if channel is not None and channel.url:
url = channel.url.strip('/') + f"/{raw['id']}"
author_username = channel.screenname
else:
url = ""
author_username = ""
transformed = Post(
raw_id = data.id,
platform_id = raw['id'],
@@ -171,24 +214,47 @@ class TelegramTelethonTransformer(Transformer):
date=dateutil.parser.parse(raw['date']),
date_archived=data.date_archived,
date_transformed=datetime.now(timezone.utc),
url="",
content=raw['message'],
author_id=raw['post_author'],
author_username="",
url=url,
content=add_markdown_links(raw),
author_id=raw.get('peer_id', {}).get('channel_id'),
author_username=author_username,
forwarded_from=fwd_from,
reply_to=reply_to
reply_to=reply_to,
mentions = mentions,
forwards = raw.get('forwards'),
views = raw.get('views')
)
transformed = insert(transformed)
for k in data.archived_urls:
if data.archived_urls[k]:
archived_url = data.archived_urls[k]
ext = archived_url.split('.')[-1]
# for k in data.archived_urls:
# if data.archived_urls[k]:
# archived_url = data.archived_urls[k]
# ext = archived_url.split('.')[-1]
if ext == 'mp4' or ext == 'mov' or ext == 'avi' or ext =='mkv':
insert(Video(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))
else:
insert(Image(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))
# if ext == 'mp4' or ext == 'mov' or ext == 'avi' or ext =='mkv':
# insert(Video(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))
# else:
# insert(Image(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))
def add_markdown_links(raw_post):
    """Rewrite the text-URL entities of a raw Telegram post as markdown links.

    Every ``MessageEntityTextUrl`` entity in ``raw_post['entities']`` is
    replaced in the message by ``[text](url)``. Whitespace at either edge of
    the linked span is kept outside the brackets, in its original position.

    Parameters
    ----------
    raw_post: dict
        Raw post with a ``'message'`` string and an ``'entities'`` list whose
        items carry ``'_'``, ``'offset'``, ``'length'`` and, for text URLs,
        ``'url'``.

    Returns
    -------
    The message with markdown links inserted, or the unchanged message when
    it has no text-URL entities.
    """
    message = raw_post['message']
    links = [entity for entity in raw_post['entities']
             if entity['_'] == 'MessageEntityTextUrl']
    if not links:
        return message

    # Telegram expresses entity offsets and lengths in UTF-16 code units, so
    # the splicing is done on the UTF-16 encoding (two bytes per unit) rather
    # than on Python code points, which differ as soon as an emoji appears.
    encoded = message.encode('utf-16-le', 'surrogatepass')

    # Working from the last link to the first keeps the offsets of the links
    # that are still to be processed valid, so no running offset is needed.
    for link in sorted(links, key=lambda entity: entity['offset'], reverse=True):
        start = 2 * link['offset']
        end = start + 2 * link['length']
        span = encoded[start:end].decode('utf-16-le', 'surrogatepass')

        without_leading = span.lstrip()
        leading = span[:len(span) - len(without_leading)]
        label = without_leading.rstrip()
        trailing = without_leading[len(label):]

        replacement = f"{leading}[{label}]({link['url']}){trailing}"
        encoded = (encoded[:start]
                   + replacement.encode('utf-16-le', 'surrogatepass')
                   + encoded[end:])

    return encoded.decode('utf-16-le', 'surrogatepass')

View File

@@ -0,0 +1,73 @@
import json
from loguru import logger
from typing import Generator, Union, Callable
import dateutil.parser
from datetime import datetime, timezone
from sqlalchemy import func
from cisticola.transformer.base import Transformer
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
class VkontakteTransformer(Transformer):
    """Transformer that converts raw Vkontakte scraper output into ``ChannelInfo`` and ``Post`` objects."""

    __version__ = "VkontakteTransformer 0.0.1"

    def can_handle(self, data: ScraperResult) -> bool:
        """Return True when ``data`` was produced by a ``VkontakteScraper`` (any version)."""
        # The scraper field is "<ScraperName> <version>"; only the name matters here.
        scraper_name = data.scraper.split(' ')[0]
        return scraper_name == "VkontakteScraper"

    def transform_info(self, data: RawChannelInfo, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
        """Build a ``ChannelInfo`` from the JSON in ``data.raw_data`` and hand it to ``insert``."""
        raw = json.loads(data.raw_data)

        channel_info = ChannelInfo(
            raw_channel_info_id=data.id,
            channel=data.channel,
            platform_id=raw['username'],
            platform=data.platform,
            scraper=data.scraper,
            transformer=self.__version__,
            screenname=raw['username'],
            name=raw['name'],
            description=raw.get('description'),
            description_url=raw.get('websites'),
            description_location=None,
            # A missing or falsy follower count is stored as None.
            followers=int(raw['followers']) if raw['followers'] else None,
            following=-1,
            verified=raw['verified'],
            date_archived=data.date_archived,
            date_created=None,
            date_transformed=datetime.now(timezone.utc)
        )

        insert(channel_info)

    def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
        """Build a ``Post`` from the JSON in ``data.raw_data`` and hand it to ``insert``."""
        raw = json.loads(data.raw_data)

        post = Post(
            raw_id=data.id,
            platform_id=data.platform_id,
            scraper=data.scraper,
            transformer=self.__version__,
            platform=data.platform,
            channel=data.channel,
            date=data.date,
            date_archived=data.date_archived,
            date_transformed=datetime.now(timezone.utc),
            url=raw['url'],
            # Falsy content is normalised to an empty string.
            content=raw['content'] or '',
            author_id=None,
            author_username=None,
            # Drop empty entries; a falsy outlinks value becomes an empty list.
            outlinks=[link for link in raw["outlinks"] if link] if raw['outlinks'] else [],
        )

        insert(post)

        # Media extraction is currently disabled:
        # media = self.process_media(raw, transformed.id, data)
        # for m in media:
        #     insert(m)

View File

@@ -18,7 +18,8 @@ BITCHUTE_CHANNEL_KWARGS = {
'influencer': None,
'public': True,
'chat': False,
'notes': ''}
'notes': '',
'source': 'researcher'}
GAB_CHANNEL_KWARGS = {
'name': 'Capt. Marc Simon (test)',
@@ -31,7 +32,8 @@ GAB_CHANNEL_KWARGS = {
'influencer': None,
'public': True,
'chat': False,
'notes': ''}
'notes': '',
'source': 'researcher'}
GAB_GROUP_KWARGS = {
'name': 'iran group (test)',
@@ -44,7 +46,8 @@ GAB_GROUP_KWARGS = {
'influencer': None,
'public': True,
'chat': True,
'notes': ''}
'notes': '',
'source': 'researcher'}
GETTR_CHANNEL_KWARGS = {
'name': 'LizardRepublic (test)',
@@ -57,7 +60,8 @@ GETTR_CHANNEL_KWARGS = {
'influencer': None,
'public': True,
'chat': False,
'notes': ''}
'notes': '',
'source': 'researcher'}
INSTAGRAM_CHANNEL_KWARGS = {
'name': 'borland.88 (test)',
@@ -70,7 +74,8 @@ INSTAGRAM_CHANNEL_KWARGS = {
'influencer': None,
'public': True,
'chat': False,
'notes': ''}
'notes': '',
'source': 'researcher'}
ODYSEE_CHANNEL_KWARGS = {
'name': "Mak1n' Bacon (test)",
@@ -83,7 +88,8 @@ ODYSEE_CHANNEL_KWARGS = {
'influencer': None,
'public': True,
'chat': False,
'notes': ''}
'notes': '',
'source': 'researcher'}
RUMBLE_CHANNEL_KWARGS = {
'name': 'we are uploading videos wow products',
@@ -96,7 +102,8 @@ RUMBLE_CHANNEL_KWARGS = {
'influencer': None,
'public': True,
'chat': False,
'notes': ''}
'notes': '',
'source': 'researcher'}
TELEGRAM_CHANNEL_KWARGS = {
'name': 'South West Ohio Proud Boys (test)',
@@ -109,8 +116,9 @@ TELEGRAM_CHANNEL_KWARGS = {
'influencer': None,
'public': True,
'chat': False,
'notes': ''}
'notes': '',
'source': 'researcher'}
TWITTER_CHANNEL_KWARGS = {
'name': 'L Weber (test)',
'platform_id': 1424979017749442595,
@@ -122,7 +130,8 @@ TWITTER_CHANNEL_KWARGS = {
'influencer': None,
'public': True,
'chat': False,
'notes': ''}
'notes': '',
'source': 'researcher'}
VKONTAKTE_CHANNEL_KWARGS = {
'name': 'Wwg1wgA (test)',
@@ -135,7 +144,8 @@ VKONTAKTE_CHANNEL_KWARGS = {
'influencer': None,
'public': True,
'chat': False,
'notes': ''}
'notes': '',
'source': 'researcher'}
YOUTUBE_CHANNEL_KWARGS = {
'name': 'AnEs87 (test)',
@@ -148,7 +158,8 @@ YOUTUBE_CHANNEL_KWARGS = {
'influencer': None,
'public': True,
'chat': False,
'notes': ''}
'notes': '',
'source': 'researcher'}
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

View File

@@ -15,7 +15,8 @@ def test_scrape_etl_bitchute(engine, controller, etl_controller, channel_kwargs)
channels = [Channel(**channel_kwargs['bitchute'])]
controller.register_scraper(scraper = BitchuteScraper())
controller.scrape_channels(channels = channels, archive_media = True)
controller.scrape_all_channel_info()
etl_controller.register_transformer(BitchuteTransformer())
etl_controller.transform_all_untransformed()
etl_controller.transform_all_untransformed_info()

View File

@@ -15,6 +15,7 @@ def test_scrape_etl_gettr(engine, controller, etl_controller, channel_kwargs):
channels = [Channel(**channel_kwargs['gettr'])]
controller.register_scraper(scraper = GettrScraper())
controller.scrape_channels(channels = channels, archive_media = True)
controller.scrape_all_channel_info()
etl_controller.register_transformer(GettrTransformer())
etl_controller.transform_all_untransformed()

View File

@@ -15,7 +15,8 @@ def test_scrape_etl_rumble(engine, controller, etl_controller, channel_kwargs):
channels = [Channel(**channel_kwargs['rumble'])]
controller.register_scraper(scraper = RumbleScraper())
controller.scrape_channels(channels = channels, archive_media = True)
controller.scrape_all_channel_info()
etl_controller.register_transformer(RumbleTransformer())
etl_controller.transform_all_untransformed()
etl_controller.transform_all_untransformed_info()

View File

@@ -15,6 +15,7 @@ def test_scrape_etl_telegram_telethon(engine, controller, etl_controller, channe
channels = [Channel(**channel_kwargs['telegram'])]
controller.register_scraper(scraper = TelegramTelethonScraper())
controller.scrape_channels(channels = channels, archive_media = True)
controller.scrape_all_channel_info()
etl_controller.register_transformer(TelegramTelethonTransformer())
etl_controller.transform_all_untransformed()
@@ -28,7 +29,7 @@ def test_scrape_etl_telegram_telethon(engine, controller, etl_controller, channe
media = session.query(Media).all()
assert len(posts) == 19
assert len(media) == 13
# assert len(media) == 13
assert posts[16].content == "Taking pre-orders now"
assert json.loads(media[0].exif)['Composite:ImageSize'] == "1028 1280"
# assert json.loads(media[0].exif)['Composite:ImageSize'] == "1028 1280"

View File

@@ -15,6 +15,7 @@ def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs):
channels = [Channel(**channel_kwargs['twitter'])]
controller.register_scraper(scraper = TwitterScraper())
controller.scrape_channels(channels = channels, archive_media = True)
controller.scrape_all_channel_info()
etl_controller.register_transformer(TwitterTransformer())
etl_controller.transform_all_untransformed()
@@ -28,7 +29,7 @@ def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs):
media = session.query(Media).all()
assert len(posts) == 12
assert len(media) == 4
assert len(media) == 8
assert posts[2].content == "BARN"
assert json.loads(media[0].exif)['Composite:ImageSize'] == "826 728"

View File

@@ -0,0 +1,35 @@
from sqlalchemy.orm import sessionmaker
import json
import pytest
from cisticola.base import Channel
from cisticola.scraper import VkontakteScraper
from cisticola.transformer import VkontakteTransformer
from cisticola.base import Post, Media
@pytest.mark.media
def test_scrape_etl_vkontakte(engine, controller, etl_controller, channel_kwargs):
    """Scrape the Vkontakte test channel, transform the results and check the stored posts."""
    # Start from an empty database, then scrape posts and channel info.
    controller.reset_db()
    vk_channel = Channel(**channel_kwargs['vkontakte'])
    controller.register_scraper(scraper=VkontakteScraper())
    controller.scrape_channels(channels=[vk_channel], archive_media=True)
    controller.scrape_all_channel_info()

    # Run both the post and the channel-info transformation.
    etl_controller.register_transformer(VkontakteTransformer())
    etl_controller.transform_all_untransformed()
    etl_controller.transform_all_untransformed_info()

    make_session = sessionmaker()
    make_session.configure(bind=engine)
    db = make_session()

    posts = db.query(Post).all()
    media = db.query(Media).all()

    assert len(posts) == 23
    # assert len(media) == 0

    assert 'Nigerian gender studies' in posts[-1].content
    # assert json.loads(media[0].exif)['Composite:ImageSize'] == "826 728"