diff --git a/app.py b/app.py index 0ede270..9d848f8 100644 --- a/app.py +++ b/app.py @@ -14,7 +14,14 @@ from cisticola.scraper import ( BitchuteScraper, RumbleScraper, ) -from cisticola.transformer import (ETLController, TelegramTelethonTransformer) +from cisticola.transformer import ( + ETLController, + TelegramTelethonTransformer, + GettrTransformer, + RumbleTransformer, + BitchuteTransformer, + VkontakteTransformer) + from sync_with_gsheet import sync_channels def get_db_session(): @@ -49,7 +56,11 @@ def get_transformer_controller(): controller = ETLController() controller.connect_to_db(engine) - transformers = [TelegramTelethonTransformer()] + transformers = [VkontakteTransformer(), + TelegramTelethonTransformer(), + GettrTransformer(), + BitchuteTransformer(), + RumbleTransformer()] controller.register_transformers(transformers) diff --git a/cisticola/base.py b/cisticola/base.py index 3c6651a..28ff9f8 100644 --- a/cisticola/base.py +++ b/cisticola/base.py @@ -229,20 +229,40 @@ class Post: #: The ID of the Channel that the post was forwarded or quoted from forwarded_from: int = None - #: The ID of the Post that this Post is a reply to or reblog of + #: The ID of the Post that this Post is a reply to reply_to: int = None + #: Other users mentioned in the post + mentions: list = field(default_factory=list) + + #: Number of positive post reactions (e.g. likes, favorites, rumbles, upvotes, etc.) + likes: int = None + + #: Number of times the post was forwarded/retweeted/shared + forwards: int = None + + #: Number of times the post was viewed + views: int = None + + #: Video title, if post is a video + video_title: str = None + + #: Video duration in seconds, if post is a video + video_duration: int = None + def hydrate(self): URL_REGEX = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’])|(?:(? RawChannelInfo: username = TelegramTelethonScraper.get_channel_identifier(channel) - - api_id = os.environ['TELEGRAM_API_ID'] - api_hash = os.environ['TELEGRAM_API_HASH'] - phone = os.environ['TELEGRAM_PHONE'] - - with TelegramClient(phone, api_id, api_hash) as client: - full_channel = client(GetFullChannelRequest(channel = username)) + full_channel = self.client(GetFullChannelRequest(channel = username)) profile = full_channel.to_dict() return RawChannelInfo(scraper=self.__version__, diff --git a/cisticola/transformer/__init__.py b/cisticola/transformer/__init__.py index 37df764..48f96bd 100644 --- a/cisticola/transformer/__init__.py +++ b/cisticola/transformer/__init__.py @@ -4,3 +4,4 @@ from .bitchute import BitchuteTransformer from .telegram_telethon import TelegramTelethonTransformer from .rumble import RumbleTransformer from .gettr import GettrTransformer +from .vkontakte import VkontakteTransformer diff --git a/cisticola/transformer/base.py b/cisticola/transformer/base.py index 053de19..32ed37c 100644 --- a/cisticola/transformer/base.py +++ b/cisticola/transformer/base.py @@ -96,7 +96,7 @@ class ETLController: # This is using some adhoc unique constraints that might be worth formalizing at some point if type(obj) == Channel: - instance = session.query(Channel).filter_by(url=obj.url, platform_id=str(obj.platform_id), platform=obj.platform).first() + instance = session.query(Channel).filter_by(url=obj.url, platform_id=str(obj.platform_id or '') or obj.platform_id, platform=obj.platform).first() elif type(obj) == Post: instance = None @@ -133,6 +133,8 @@ class ETLController: obj.hydrate() session.add(obj) + session.flush() + logger.trace(f"Inserted new object {obj}") return obj @@ -169,8 +171,8 @@ class ETLController: session.commit() break - if handled == False: - logger.warning(f"No Transformer could handle ID {result.id} with platform {result.platform} ({result.date})") + if handled == False: + logger.warning(f"No Transformer could handle ID {result.id} with platform {result.platform} ({result.date})") @logger.catch(reraise=True) def transform_all_untransformed(self, hydrate: bool = True): diff --git a/cisticola/transformer/bitchute.py b/cisticola/transformer/bitchute.py index 19fac56..c8dff36 100644 --- a/cisticola/transformer/bitchute.py +++ b/cisticola/transformer/bitchute.py @@ -2,7 +2,7 @@ import json from loguru import logger from typing import Generator, Union, Callable from datetime import datetime, timezone -import dateutil.parser +from dateutil.relativedelta import relativedelta from bs4 import BeautifulSoup @@ -12,7 +12,7 @@ from cisticola.base import RawChannelInfo, ScraperResult, Post, Image, Video, Me class BitchuteTransformer(Transformer): """A Bitchute specific ScraperResult, with a method ETL/transforming""" - __version__ = "BitchuteTransformer 0.0.1" + __version__ = "BitchuteTransformer 0.0.2" def can_handle(self, data: ScraperResult) -> bool: scraper = data.scraper.split(' ') @@ -49,7 +49,7 @@ class BitchuteTransformer(Transformer): followers=raw['subscribers'], following=-1, # does not exist for Bitchute verified=False, # does not exist for Bitchute - date_created=dateutil.parser.parse(raw['created']), + date_created=parse_created(raw['created'], data.date_archived), date_archived=data.date_archived, date_transformed=datetime.now(timezone.utc) ) @@ -59,8 +59,32 @@ class BitchuteTransformer(Transformer): def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]: raw = json.loads(data.raw_data) - soup = BeautifulSoup(raw['body'], features = 'html.parser') - content = soup.find_all('p')[-1].text + if raw['category'] == 'comment': + if raw['parent_id'] is None: + reply_to_id = raw['thread_id'] + else: + reply_to_id = raw['parent_id'] + post = session.query(Post).filter_by(channel=data.channel, platform_id=reply_to_id).first() + if post is None: + if raw['parent_id'] is not None: + # this block is for comments whose parent_ids correspond to deleted comments + post = session.query(Post).filter_by(channel=data.channel, platform_id=raw['thread_id']).first() + if post is None: + reply_to = -1 + else: + reply_to = post.id + else: + reply_to = -1 + else: + reply_to = post.id + content = raw['body'].strip() + else: + reply_to = -1 + soup = BeautifulSoup(raw['body'], features = 'html.parser') + soup.find('div', {'class': 'teaser'}).decompose() + soup.find('span', {'class': 'more'}).decompose() + soup.find('span', {'class': 'less hidden'}).decompose() + content = soup.text.strip() transformed = Post( raw_id=data.id, @@ -72,9 +96,41 @@ class BitchuteTransformer(Transformer): date=data.date, date_archived=data.date_archived, date_transformed=datetime.now(timezone.utc), - url=raw['url'], + url=raw['url'] if raw['url'] else None, content=content, author_id=raw['author_id'], - author_username=raw['author']) + author_username=raw['author'], + reply_to=reply_to, + hashtags = list(filter(None, [h.strip('#') for h in raw['hashtags'].split(',')])), + likes = raw['likes'], + views = int(raw['views']) if raw.get('views') else None, + video_title = raw['subject'], + video_duration = _parse_duration_str(raw['length'])) - transformed = insert(transformed) \ No newline at end of file + transformed = insert(transformed) + session.flush() + +def parse_created(created: str, date_archived: datetime) -> datetime: + """Convert a created string (e.g. ``"1 year, 10 months ago"``) to a datetime + object relative to the specified ``date_archived``. + """ + try: + # handle case where `created` string has already been parsed into a datetime + return datetime.fromisoformat(created) + except ValueError: + period_list = ['year', 'month', 'week', 'day'] + + periods = [period.strip() for period in created.split('ago')[0].strip().split(',')] + _kwargs = {period : int(number) for period, number in dict(reversed(p.split(' ')) for p in periods).items()} + kwargs = {(k + 's' if k in period_list else k) : v for k, v in _kwargs.items()} + + return date_archived - relativedelta(**kwargs) + +def _parse_duration_str(duration_str: str) -> int: + """Convert duration string (e.g. '2:27:04') to the number of seconds (e.g. 8824). + """ + if not duration_str: + return None + else: + duration_list = duration_str.split(':') + return sum([int(s) * int(g) for s, g in zip([1, 60, 3600], reversed(duration_list))]) \ No newline at end of file diff --git a/cisticola/transformer/gettr.py b/cisticola/transformer/gettr.py index aff1264..0e0a4d0 100644 --- a/cisticola/transformer/gettr.py +++ b/cisticola/transformer/gettr.py @@ -3,6 +3,9 @@ from loguru import logger from typing import Generator, Union, Callable import dateutil.parser from datetime import datetime, timezone +from sqlalchemy import func +from gogettr import PublicClient +from gogettr.api import GettrApiError from cisticola.transformer.base import Transformer from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel @@ -31,28 +34,68 @@ class GettrTransformer(Transformer): transformer=self.__version__, screenname=raw['username'], name=raw['nickname'], - description=raw['dsc'], - description_url=raw['website'], - description_location=raw['location'], - followers=raw['flg'], - following=raw['flw'], + description=raw.get('dsc'), + description_url=raw.get('website'), + description_location=raw.get('location'), + followers=int(raw['flg']), + following=int(raw['flw']), verified=True if raw.get('infl') else False, - date_created=datetime.fromtimestamp(raw['cdate']*0.001), + date_created=datetime.fromtimestamp(int(raw['cdate'])*0.001), date_archived=data.date_archived, date_transformed=datetime.now(timezone.utc) ) transformed = insert(transformed) + def _get_channel_id(self, username: str, category: str, insert: Callable, session): + + channel = session.query(Channel).filter(func.lower(Channel.screenname)==func.lower(username), platform = 'Gettr').first() + + if channel is None: + try: + client = PublicClient() + profile = client.user_info(username.lower()) + screenname = profile.get('_id') + channel = Channel( + name=profile.get('nickname'), + platform_id=screenname, + platform='Gettr', + url="https://gettr.com/user/" + screenname, + screenname=screenname, + category=category, + source=self.__version__, + ) + except GettrApiError: + channel = Channel( + name = None, + platform_id = None, + platform = 'Gettr', + url = None, + screenname=username, + category=category, + source=self.__version__, + notes='GettrApiError' + ) + + channel = insert(channel) + + return channel.id def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]: raw = json.loads(data.raw_data) if raw["activity"]["action"] == "shares_pst": - forwarded_from = raw["activity"]["uid"] + forwarded_from = self._get_channel_id( + username = str(raw["activity"]["uid"]), category = 'forwarded', insert = insert, session = session) else: forwarded_from = None + mentions = [] + for mentioned_user in raw.get("utgs", []): + mentioned_id = self._get_channel_id( + username = mentioned_user, category = 'mentioned', insert = insert, session = session) + mentions.append(mentioned_id) + transformed = Post( raw_id=data.id, platform_id=raw["_id"], @@ -69,7 +112,12 @@ class GettrTransformer(Transformer): author_username=raw["uid"], hashtags=raw.get("htgs", []), outlinks = list(filter(None, [raw.get("prevsrc")])), - forwarded_from = forwarded_from) + forwarded_from = forwarded_from, + mentions = mentions, + likes = raw.get('lkbpst'), + forwards = raw.get("shbpst"), + views = raw.get('vfpst') + ) insert(transformed) diff --git a/cisticola/transformer/rumble.py b/cisticola/transformer/rumble.py index 91ef244..79289f5 100644 --- a/cisticola/transformer/rumble.py +++ b/cisticola/transformer/rumble.py @@ -3,6 +3,7 @@ from loguru import logger from typing import Generator, Union, Callable import dateutil.parser from datetime import datetime, timezone +from sqlalchemy import func, JSON, String, cast, text from cisticola.transformer.base import Transformer from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel @@ -22,19 +23,30 @@ class RumbleTransformer(Transformer): def transform_info(self, data: RawChannelInfo, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]: raw = json.loads(data.raw_data) + if 'id' not in raw: + # The first version of the Rumble ChannelInfo scraper didn't return + # the platform_id, so this is a workaround. + channel = session.query(RawChannelInfo).filter(text("raw_channel_info.raw_data::jsonb ->> 'name'=:name"), RawChannelInfo.platform == 'Rumble').params(name=raw['name']).order_by(RawChannelInfo.date_archived.desc()).first() + if channel is None: + platform_id = None + else: + platform_id = json.loads(channel.raw_data)['id'] + else: + platform_id = raw['id'] + transformed = ChannelInfo( raw_channel_info_id=data.id, channel=data.channel, - platform_id=raw['id'], + platform_id=platform_id, platform=data.platform, scraper=data.scraper, transformer=self.__version__, - screenname=raw['id'], + screenname=platform_id, name=raw['name'], description='', # does not exist for Rumble description_url='', # does not exist for Rumble description_location='', # does not exist for Rumble - followers=raw['subscribers'], + followers=_process_number(raw['subscribers']), following=-1, # does not exist for Rumble verified=raw['verified'], date_created=None, # does not exist for Rumble @@ -61,10 +73,35 @@ class RumbleTransformer(Transformer): url=raw['link'], content=raw['content'], author_id=raw['author_id'], - author_username=raw['author_name']) + author_username=raw['author_name'], + views = _process_number(raw.get('views')), + likes = _process_number(raw.get('rumbles')), + video_title = raw['title'], + video_duration=_parse_duration_str(raw['duration'])) insert(transformed) # media = self.process_media(raw, transformed.id, data) # for m in media: - # insert(m) \ No newline at end of file + # insert(m) + +def _process_number(s): + + if s is None: + return None + else: + s = s.replace(' ', '') + if s.endswith('M'): + return int(float(s[:-1]) * 1e6) + elif s.endswith('K'): + return int(float(s[:-1]) * 1000) + return int(s) + +def _parse_duration_str(duration_str: str) -> int: + """Convert duration string (e.g. '2:27:04') to the number of seconds (e.g. 8824). + """ + if not duration_str: + return None + else: + duration_list = duration_str.split(':') + return sum([int(s) * int(g) for s, g in zip([1, 60, 3600], reversed(duration_list))]) \ No newline at end of file diff --git a/cisticola/transformer/telegram_telethon.py b/cisticola/transformer/telegram_telethon.py index d0a8cee..b5326e2 100644 --- a/cisticola/transformer/telegram_telethon.py +++ b/cisticola/transformer/telegram_telethon.py @@ -8,15 +8,19 @@ import requests import time from telethon.sync import TelegramClient from telethon.errors.rpcerrorlist import ChannelPrivateError, ChannelInvalidError +from telethon.tl import types +from telethon.helpers import add_surrogate + import os from datetime import datetime, timezone +from sqlalchemy import func from cisticola.transformer.base import Transformer from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel class TelegramTelethonTransformer(Transformer): - __version__ = 'TelegramTelethonTransformer 0.0.2' + __version__ = 'TelegramTelethonTransformer 0.0.3' bad_channels = {} @@ -34,8 +38,10 @@ class TelegramTelethonTransformer(Transformer): try: with TelegramClient("transform.session", api_id, api_hash) as client: data = client.get_entity(channel_id) - - return (data.username, data.title, "") + if isinstance(data, types.User): + return (data.username, str(data.first_name or "") + " " + str(data.last_name or ""), "") + else: + return (data.username, data.title, "") except ChannelPrivateError: logger.info("ChannelPrivateError") return ("", "", "ChannelPrivateError") @@ -125,7 +131,7 @@ class TelegramTelethonTransformer(Transformer): fwd_from = None if raw['fwd_from'] and raw['fwd_from']['from_id'] and 'channel_id' in raw['fwd_from']['from_id']: - channel = session.query(Channel).filter_by(platform_id=str(raw['fwd_from']['from_id']['channel_id'])).first() + channel = session.query(Channel).filter_by(platform_id=str(raw['fwd_from']['from_id']['channel_id']), platform = 'Telegram').first() if channel is None: (screenname, name, notes) = self.get_screenname_from_id(raw['fwd_from']['from_id']['channel_id']) @@ -154,13 +160,50 @@ class TelegramTelethonTransformer(Transformer): reply_to = None if raw['reply_to']: - reply_to_id = raw['reply_to']['reply_to_msg_id'] + reply_to_id = str(raw['reply_to']['reply_to_msg_id']) post = session.query(Post).filter_by(channel=data.channel, platform_id=reply_to_id).first() if post is None: reply_to = -1 else: reply_to = post.id + mentions = [] + + for mention_entity in [entity for entity in raw['entities'] if entity['_'] == 'MessageEntityMention']: + + offset = mention_entity['offset'] + length = mention_entity['length'] + + screenname = add_surrogate(raw['message'])[offset:offset+length].strip('@').strip() + + channel = session.query(Channel).filter(func.lower(Channel.screenname)==func.lower(screenname)).first() + + if channel is None: + + channel = Channel( + name = None, + platform_id = None, + platform = 'Telegram', + url="https://t.me/s/" + screenname, + screenname=screenname, + category='mentioned', + source=self.__version__, + ) + + channel = insert(channel) + logger.info(f"Added {channel}") + + mentions.append(channel.id) + + channel = session.query(Channel).filter_by(id=int(data.channel)).first() + + if channel is not None and channel.url: + url = channel.url.strip('/') + f"/{raw['id']}" + author_username = channel.screenname + else: + url = "" + author_username = "" + transformed = Post( raw_id = data.id, platform_id = raw['id'], @@ -171,24 +214,47 @@ class TelegramTelethonTransformer(Transformer): date=dateutil.parser.parse(raw['date']), date_archived=data.date_archived, date_transformed=datetime.now(timezone.utc), - url="", - content=raw['message'], - author_id=raw['post_author'], - author_username="", + url=url, + content=add_markdown_links(raw), + author_id=raw.get('peer_id', {}).get('channel_id'), + author_username=author_username, forwarded_from=fwd_from, - reply_to=reply_to + reply_to=reply_to, + mentions = mentions, + forwards = raw.get('forwards'), + views = raw.get('views') ) transformed = insert(transformed) - for k in data.archived_urls: - if data.archived_urls[k]: - archived_url = data.archived_urls[k] - ext = archived_url.split('.')[-1] + # for k in data.archived_urls: + # if data.archived_urls[k]: + # archived_url = data.archived_urls[k] + # ext = archived_url.split('.')[-1] - if ext == 'mp4' or ext == 'mov' or ext == 'avi' or ext =='mkv': - insert(Video(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k)) - else: - insert(Image(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k)) + # if ext == 'mp4' or ext == 'mov' or ext == 'avi' or ext =='mkv': + # insert(Video(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k)) + # else: + # insert(Image(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k)) - \ No newline at end of file +def add_markdown_links(raw_post): + + global_offset = 0 + transformed_content = raw_post['message'] + links = [entity for entity in raw_post['entities'] if entity['_'] == 'MessageEntityTextUrl'] + + for link in links: + offset = global_offset + link['offset'] + length = link['length'] + url = link['url'] + + before_link = transformed_content[:offset] + link_text = f"[{transformed_content[offset:offset+length].strip()}]" + trailing_whitespace = ''.join([c for c in transformed_content[offset:offset+length] if c.isspace()]) + link_href = f"({url})" + after_link = transformed_content[offset+length:] + + transformed_content = before_link + link_text + link_href + trailing_whitespace + after_link + global_offset += (4 + len(url)) + + return transformed_content \ No newline at end of file diff --git a/cisticola/transformer/vkontakte.py b/cisticola/transformer/vkontakte.py new file mode 100644 index 0000000..2351972 --- /dev/null +++ b/cisticola/transformer/vkontakte.py @@ -0,0 +1,73 @@ +import json +from loguru import logger +from typing import Generator, Union, Callable +import dateutil.parser +from datetime import datetime, timezone +from sqlalchemy import func + +from cisticola.transformer.base import Transformer +from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel + +class VkontakteTransformer(Transformer): + """A Vkontakte specific ScraperResult, with a method ETL/transforming""" + + __version__ = "VkontakteTransformer 0.0.1" + + def can_handle(self, data: ScraperResult) -> bool: + scraper = data.scraper.split(' ') + if scraper[0] == "VkontakteScraper": + return True + + return False + + def transform_info(self, data: RawChannelInfo, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]: + raw = json.loads(data.raw_data) + + transformed = ChannelInfo( + raw_channel_info_id=data.id, + channel=data.channel, + platform_id=raw['username'], + platform=data.platform, + scraper=data.scraper, + transformer=self.__version__, + screenname=raw['username'], + name=raw['name'], + description=raw.get('description'), + description_url=raw.get('websites'), + description_location=None, + followers=int(raw['followers']) if raw['followers'] else None, + following=-1, + verified=raw['verified'], + date_archived=data.date_archived, + date_created=None, + date_transformed=datetime.now(timezone.utc) + ) + + transformed = insert(transformed) + + + def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]: + raw = json.loads(data.raw_data) + + transformed = Post( + raw_id=data.id, + platform_id=data.platform_id, + scraper=data.scraper, + transformer=self.__version__, + platform=data.platform, + channel=data.channel, + date=data.date, + date_archived=data.date_archived, + date_transformed=datetime.now(timezone.utc), + url=raw['url'], + content=raw['content'] if raw['content'] else '', + author_id = None, + author_username=None, + outlinks =list(filter(None, raw["outlinks"])) if raw['outlinks'] else [], + ) + + insert(transformed) + + # media = self.process_media(raw, transformed.id, data) + # for m in media: + # insert(m) \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py index 684c15d..27e6180 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -18,7 +18,8 @@ BITCHUTE_CHANNEL_KWARGS = { 'influencer': None, 'public': True, 'chat': False, - 'notes': ''} + 'notes': '', + 'source': 'researcher'} GAB_CHANNEL_KWARGS = { 'name': 'Capt. Marc Simon (test)', @@ -31,7 +32,8 @@ GAB_CHANNEL_KWARGS = { 'influencer': None, 'public': True, 'chat': False, - 'notes': ''} + 'notes': '', + 'source': 'researcher'} GAB_GROUP_KWARGS = { 'name': 'iran group (test)', @@ -44,7 +46,8 @@ GAB_GROUP_KWARGS = { 'influencer': None, 'public': True, 'chat': True, - 'notes': ''} + 'notes': '', + 'source': 'researcher'} GETTR_CHANNEL_KWARGS = { 'name': 'LizardRepublic (test)', @@ -57,7 +60,8 @@ GETTR_CHANNEL_KWARGS = { 'influencer': None, 'public': True, 'chat': False, - 'notes': ''} + 'notes': '', + 'source': 'researcher'} INSTAGRAM_CHANNEL_KWARGS = { 'name': 'borland.88 (test)', @@ -70,7 +74,8 @@ INSTAGRAM_CHANNEL_KWARGS = { 'influencer': None, 'public': True, 'chat': False, - 'notes': ''} + 'notes': '', + 'source': 'researcher'} ODYSEE_CHANNEL_KWARGS = { 'name': "Mak1n' Bacon (test)", @@ -83,7 +88,8 @@ ODYSEE_CHANNEL_KWARGS = { 'influencer': None, 'public': True, 'chat': False, - 'notes': ''} + 'notes': '', + 'source': 'researcher'} RUMBLE_CHANNEL_KWARGS = { 'name': 'we are uploading videos wow products', @@ -96,7 +102,8 @@ RUMBLE_CHANNEL_KWARGS = { 'influencer': None, 'public': True, 'chat': False, - 'notes': ''} + 'notes': '', + 'source': 'researcher'} TELEGRAM_CHANNEL_KWARGS = { 'name': 'South West Ohio Proud Boys (test)', @@ -109,8 +116,9 @@ TELEGRAM_CHANNEL_KWARGS = { 'influencer': None, 'public': True, 'chat': False, - 'notes': ''} - + 'notes': '', + 'source': 'researcher'} + TWITTER_CHANNEL_KWARGS = { 'name': 'L Weber (test)', 'platform_id': 1424979017749442595, @@ -122,7 +130,8 @@ TWITTER_CHANNEL_KWARGS = { 'influencer': None, 'public': True, 'chat': False, - 'notes': ''} + 'notes': '', + 'source': 'researcher'} VKONTAKTE_CHANNEL_KWARGS = { 'name': 'Wwg1wgA (test)', @@ -135,7 +144,8 @@ VKONTAKTE_CHANNEL_KWARGS = { 'influencer': None, 'public': True, 'chat': False, - 'notes': ''} + 'notes': '', + 'source': 'researcher'} YOUTUBE_CHANNEL_KWARGS = { 'name': 'AnEs87 (test)', @@ -148,7 +158,8 @@ YOUTUBE_CHANNEL_KWARGS = { 'influencer': None, 'public': True, 'chat': False, - 'notes': ''} + 'notes': '', + 'source': 'researcher'} #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# diff --git a/tests/transformer/bitchute.py b/tests/transformer/bitchute.py index 161d3e5..126db3a 100644 --- a/tests/transformer/bitchute.py +++ b/tests/transformer/bitchute.py @@ -15,7 +15,8 @@ def test_scrape_etl_bitchute(engine, controller, etl_controller, channel_kwargs) channels = [Channel(**channel_kwargs['bitchute'])] controller.register_scraper(scraper = BitchuteScraper()) controller.scrape_channels(channels = channels, archive_media = True) - + controller.scrape_all_channel_info() + etl_controller.register_transformer(BitchuteTransformer()) etl_controller.transform_all_untransformed() etl_controller.transform_all_untransformed_info() diff --git a/tests/transformer/gettr.py b/tests/transformer/gettr.py index ef37b67..9472f50 100644 --- a/tests/transformer/gettr.py +++ b/tests/transformer/gettr.py @@ -15,6 +15,7 @@ def test_scrape_etl_gettr(engine, controller, etl_controller, channel_kwargs): channels = [Channel(**channel_kwargs['gettr'])] controller.register_scraper(scraper = GettrScraper()) controller.scrape_channels(channels = channels, archive_media = True) + controller.scrape_all_channel_info() etl_controller.register_transformer(GettrTransformer()) etl_controller.transform_all_untransformed() diff --git a/tests/transformer/rumble.py b/tests/transformer/rumble.py index 95450ed..3b2b8a5 100644 --- a/tests/transformer/rumble.py +++ b/tests/transformer/rumble.py @@ -15,7 +15,8 @@ def test_scrape_etl_rumble(engine, controller, etl_controller, channel_kwargs): channels = [Channel(**channel_kwargs['rumble'])] controller.register_scraper(scraper = RumbleScraper()) controller.scrape_channels(channels = channels, archive_media = True) - + controller.scrape_all_channel_info() + etl_controller.register_transformer(RumbleTransformer()) etl_controller.transform_all_untransformed() etl_controller.transform_all_untransformed_info() diff --git a/tests/transformer/telegram_telethon.py b/tests/transformer/telegram_telethon.py index a5389b6..14fe04c 100644 --- a/tests/transformer/telegram_telethon.py +++ b/tests/transformer/telegram_telethon.py @@ -15,6 +15,7 @@ def test_scrape_etl_telegram_telethon(engine, controller, etl_controller, channe channels = [Channel(**channel_kwargs['telegram'])] controller.register_scraper(scraper = TelegramTelethonScraper()) controller.scrape_channels(channels = channels, archive_media = True) + controller.scrape_all_channel_info() etl_controller.register_transformer(TelegramTelethonTransformer()) etl_controller.transform_all_untransformed() @@ -28,7 +29,7 @@ def test_scrape_etl_telegram_telethon(engine, controller, etl_controller, channe media = session.query(Media).all() assert len(posts) == 19 - assert len(media) == 13 + # assert len(media) == 13 assert posts[16].content == "Taking pre-orders now" - assert json.loads(media[0].exif)['Composite:ImageSize'] == "1028 1280" \ No newline at end of file + # assert json.loads(media[0].exif)['Composite:ImageSize'] == "1028 1280" \ No newline at end of file diff --git a/tests/transformer/twitter.py b/tests/transformer/twitter.py index 8799aad..3e4b368 100644 --- a/tests/transformer/twitter.py +++ b/tests/transformer/twitter.py @@ -15,6 +15,7 @@ def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs): channels = [Channel(**channel_kwargs['twitter'])] controller.register_scraper(scraper = TwitterScraper()) controller.scrape_channels(channels = channels, archive_media = True) + controller.scrape_all_channel_info() etl_controller.register_transformer(TwitterTransformer()) etl_controller.transform_all_untransformed() @@ -28,7 +29,7 @@ def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs): media = session.query(Media).all() assert len(posts) == 12 - assert len(media) == 4 + assert len(media) == 8 assert posts[2].content == "BARN" assert json.loads(media[0].exif)['Composite:ImageSize'] == "826 728" \ No newline at end of file diff --git a/tests/transformer/vkontakte.py b/tests/transformer/vkontakte.py new file mode 100644 index 0000000..ea26b62 --- /dev/null +++ b/tests/transformer/vkontakte.py @@ -0,0 +1,35 @@ +from sqlalchemy.orm import sessionmaker +import json + +import pytest + +from cisticola.base import Channel +from cisticola.scraper import VkontakteScraper +from cisticola.transformer import VkontakteTransformer +from cisticola.base import Post, Media + +@pytest.mark.media +def test_scrape_etl_vkontakte(engine, controller, etl_controller, channel_kwargs): + controller.reset_db() + + channels = [Channel(**channel_kwargs['vkontakte'])] + controller.register_scraper(scraper = VkontakteScraper()) + controller.scrape_channels(channels = channels, archive_media = True) + controller.scrape_all_channel_info() + + etl_controller.register_transformer(VkontakteTransformer()) + etl_controller.transform_all_untransformed() + etl_controller.transform_all_untransformed_info() + + sessionfactory = sessionmaker() + sessionfactory.configure(bind=engine) + session = sessionfactory() + + posts = session.query(Post).all() + media = session.query(Media).all() + + assert len(posts) == 23 + # assert len(media) == 0 + + assert 'Nigerian gender studies' in posts[-1].content + # assert json.loads(media[0].exif)['Composite:ImageSize'] == "826 728" \ No newline at end of file