From a2a7882f1cbc2d34764a0f52cb9a60316cfc85d6 Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Mon, 13 Jun 2022 13:42:33 -0500 Subject: [PATCH] fixed Gettr and Bitchute info transformers, added missing or incorrect TelegramTransformer fields, added Telegram mentions to the transformer. --- cisticola/base.py | 5 +- cisticola/scraper/bitchute.py | 16 +--- cisticola/scraper/telegram_telethon.py | 8 +- cisticola/transformer/base.py | 4 +- cisticola/transformer/bitchute.py | 24 +++++- cisticola/transformer/gettr.py | 12 +-- cisticola/transformer/telegram_telethon.py | 98 ++++++++++++++++++---- tests/conftest.py | 35 +++++--- tests/transformer/bitchute.py | 3 +- tests/transformer/gettr.py | 1 + tests/transformer/rumble.py | 3 +- tests/transformer/telegram_telethon.py | 5 +- tests/transformer/twitter.py | 3 +- 13 files changed, 150 insertions(+), 67 deletions(-) diff --git a/cisticola/base.py b/cisticola/base.py index 3c6651a..c72edd1 100644 --- a/cisticola/base.py +++ b/cisticola/base.py @@ -232,6 +232,9 @@ class Post: #: The ID of the Post that this Post is a reply to or reblog of reply_to: int = None + #: Other users mentioned in the post + mentions: list = field(default_factory=list) + def hydrate(self): URL_REGEX = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’])|(?:(? RawChannelInfo: username = TelegramTelethonScraper.get_channel_identifier(channel) - - api_id = os.environ['TELEGRAM_API_ID'] - api_hash = os.environ['TELEGRAM_API_HASH'] - phone = os.environ['TELEGRAM_PHONE'] - - with TelegramClient(phone, api_id, api_hash) as client: - full_channel = client(GetFullChannelRequest(channel = username)) + full_channel = self.client(GetFullChannelRequest(channel = username)) profile = full_channel.to_dict() return RawChannelInfo(scraper=self.__version__, diff --git a/cisticola/transformer/base.py b/cisticola/transformer/base.py index 053de19..c2e52a7 100644 --- a/cisticola/transformer/base.py +++ b/cisticola/transformer/base.py @@ -96,7 +96,7 @@ class ETLController: # This is using some adhoc unique constraints that might be worth formalizing at some point if type(obj) == Channel: - instance = session.query(Channel).filter_by(url=obj.url, platform_id=str(obj.platform_id), platform=obj.platform).first() + instance = session.query(Channel).filter_by(url=obj.url, platform_id=str(obj.platform_id or '') or obj.platform_id, platform=obj.platform).first() elif type(obj) == Post: instance = None @@ -133,6 +133,8 @@ class ETLController: obj.hydrate() session.add(obj) + session.flush() + logger.trace(f"Inserted new object {obj}") return obj diff --git a/cisticola/transformer/bitchute.py b/cisticola/transformer/bitchute.py index 19fac56..96c8b53 100644 --- a/cisticola/transformer/bitchute.py +++ b/cisticola/transformer/bitchute.py @@ -2,7 +2,7 @@ import json from loguru import logger from typing import Generator, Union, Callable from datetime import datetime, timezone -import dateutil.parser +from dateutil.relativedelta import relativedelta from bs4 import BeautifulSoup @@ -12,7 +12,7 @@ from cisticola.base import RawChannelInfo, ScraperResult, Post, Image, Video, Me class BitchuteTransformer(Transformer): """A Bitchute specific ScraperResult, with a method ETL/transforming""" - __version__ = "BitchuteTransformer 0.0.1" + __version__ = "BitchuteTransformer 0.0.2" def can_handle(self, data: ScraperResult) -> bool: scraper = data.scraper.split(' ') @@ -49,7 +49,7 @@ class BitchuteTransformer(Transformer): followers=raw['subscribers'], following=-1, # does not exist for Bitchute verified=False, # does not exist for Bitchute - date_created=dateutil.parser.parse(raw['created']), + date_created=parse_created(raw['created'], data.date_archived), date_archived=data.date_archived, date_transformed=datetime.now(timezone.utc) ) @@ -77,4 +77,20 @@ class BitchuteTransformer(Transformer): author_id=raw['author_id'], author_username=raw['author']) - transformed = insert(transformed) \ No newline at end of file + transformed = insert(transformed) + +def parse_created(created: str, date_archived: datetime) -> datetime: + """Convert a created string (e.g. ``"1 year, 10 months ago"``) to a datetime + object relative to the specified ``date_archived``. + """ + try: + # handle case where `created` string has already been parsed into a datetime + return datetime.fromisoformat(created) + except ValueError: + period_list = ['year', 'month', 'week', 'day'] + + periods = [period.strip() for period in created.split('ago')[0].strip().split(',')] + _kwargs = {period : int(number) for period, number in dict(reversed(p.split(' ')) for p in periods).items()} + kwargs = {(k + 's' if k in period_list else k) : v for k, v in _kwargs.items()} + + return date_archived - relativedelta(**kwargs) \ No newline at end of file diff --git a/cisticola/transformer/gettr.py b/cisticola/transformer/gettr.py index aff1264..603c05c 100644 --- a/cisticola/transformer/gettr.py +++ b/cisticola/transformer/gettr.py @@ -31,13 +31,13 @@ class GettrTransformer(Transformer): transformer=self.__version__, screenname=raw['username'], name=raw['nickname'], - description=raw['dsc'], - description_url=raw['website'], - description_location=raw['location'], - followers=raw['flg'], - following=raw['flw'], + description=raw.get('dsc'), + description_url=raw.get('website'), + description_location=raw.get('location'), + followers=int(raw['flg']), + following=int(raw['flw']), verified=True if raw.get('infl') else False, - date_created=datetime.fromtimestamp(raw['cdate']*0.001), + date_created=datetime.fromtimestamp(int(raw['cdate'])*0.001), date_archived=data.date_archived, date_transformed=datetime.now(timezone.utc) ) diff --git a/cisticola/transformer/telegram_telethon.py b/cisticola/transformer/telegram_telethon.py index d0a8cee..0840cd7 100644 --- a/cisticola/transformer/telegram_telethon.py +++ b/cisticola/transformer/telegram_telethon.py @@ -8,15 +8,19 @@ import requests import time from telethon.sync import TelegramClient from telethon.errors.rpcerrorlist import ChannelPrivateError, ChannelInvalidError +from telethon.tl import types +from telethon.helpers import add_surrogate + import os from datetime import datetime, timezone +from sqlalchemy import func from cisticola.transformer.base import Transformer from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel class TelegramTelethonTransformer(Transformer): - __version__ = 'TelegramTelethonTransformer 0.0.2' + __version__ = 'TelegramTelethonTransformer 0.0.3' bad_channels = {} @@ -34,8 +38,10 @@ class TelegramTelethonTransformer(Transformer): try: with TelegramClient("transform.session", api_id, api_hash) as client: data = client.get_entity(channel_id) - - return (data.username, data.title, "") + if isinstance(data, types.User): + return (data.username, str(data.first_name or "") + " " + str(data.last_name or ""), "") + else: + return (data.username, data.title, "") except ChannelPrivateError: logger.info("ChannelPrivateError") return ("", "", "ChannelPrivateError") @@ -161,6 +167,43 @@ class TelegramTelethonTransformer(Transformer): else: reply_to = post.id + mentions = [] + + for mention_entity in [entity for entity in raw['entities'] if entity['_'] == 'MessageEntityMention']: + + offset = mention_entity['offset'] + length = mention_entity['length'] + + screenname = add_surrogate(raw['message'])[offset:offset+length].strip('@').strip() + + channel = session.query(Channel).filter(func.lower(Channel.screenname)==func.lower(screenname)).first() + + if channel is None: + + channel = Channel( + name = None, + platform_id = None, + platform = 'Telegram', + url="https://t.me/s/" + screenname, + screenname=screenname, + category='mentioned', + source=self.__version__, + ) + + channel = insert(channel) + logger.info(f"Added {channel}") + + mentions.append(channel.id) + + channel = session.query(Channel).filter_by(id=int(data.channel)).first() + + if channel is not None: + url = channel.url.strip('/') + f"/{raw['id']}" + author_username = channel.screenname + else: + url = "" + author_username = "" + transformed = Post( raw_id = data.id, platform_id = raw['id'], @@ -171,24 +214,45 @@ class TelegramTelethonTransformer(Transformer): date=dateutil.parser.parse(raw['date']), date_archived=data.date_archived, date_transformed=datetime.now(timezone.utc), - url="", - content=raw['message'], - author_id=raw['post_author'], - author_username="", + url=url, + content=add_markdown_links(raw), + author_id=raw.get('peer_id', {}).get('channel_id'), + author_username=author_username, forwarded_from=fwd_from, - reply_to=reply_to + reply_to=reply_to, + mentions = mentions ) transformed = insert(transformed) - for k in data.archived_urls: - if data.archived_urls[k]: - archived_url = data.archived_urls[k] - ext = archived_url.split('.')[-1] + # for k in data.archived_urls: + # if data.archived_urls[k]: + # archived_url = data.archived_urls[k] + # ext = archived_url.split('.')[-1] - if ext == 'mp4' or ext == 'mov' or ext == 'avi' or ext =='mkv': - insert(Video(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k)) - else: - insert(Image(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k)) + # if ext == 'mp4' or ext == 'mov' or ext == 'avi' or ext =='mkv': + # insert(Video(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k)) + # else: + # insert(Image(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k)) - \ No newline at end of file +def add_markdown_links(raw_post): + + global_offset = 0 + transformed_content = raw_post['message'] + links = [entity for entity in raw_post['entities'] if entity['_'] == 'MessageEntityTextUrl'] + + for link in links: + offset = global_offset + link['offset'] + length = link['length'] + url = link['url'] + + before_link = transformed_content[:offset] + link_text = f"[{transformed_content[offset:offset+length].strip()}]" + trailing_whitespace = ''.join([c for c in transformed_content[offset:offset+length] if c.isspace()]) + link_href = f"({url})" + after_link = transformed_content[offset+length:] + + transformed_content = before_link + link_text + link_href + trailing_whitespace + after_link + global_offset += (4 + len(url)) + + return transformed_content \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py index 684c15d..27e6180 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -18,7 +18,8 @@ BITCHUTE_CHANNEL_KWARGS = { 'influencer': None, 'public': True, 'chat': False, - 'notes': ''} + 'notes': '', + 'source': 'researcher'} GAB_CHANNEL_KWARGS = { 'name': 'Capt. Marc Simon (test)', @@ -31,7 +32,8 @@ GAB_CHANNEL_KWARGS = { 'influencer': None, 'public': True, 'chat': False, - 'notes': ''} + 'notes': '', + 'source': 'researcher'} GAB_GROUP_KWARGS = { 'name': 'iran group (test)', @@ -44,7 +46,8 @@ GAB_GROUP_KWARGS = { 'influencer': None, 'public': True, 'chat': True, - 'notes': ''} + 'notes': '', + 'source': 'researcher'} GETTR_CHANNEL_KWARGS = { 'name': 'LizardRepublic (test)', @@ -57,7 +60,8 @@ GETTR_CHANNEL_KWARGS = { 'influencer': None, 'public': True, 'chat': False, - 'notes': ''} + 'notes': '', + 'source': 'researcher'} INSTAGRAM_CHANNEL_KWARGS = { 'name': 'borland.88 (test)', @@ -70,7 +74,8 @@ INSTAGRAM_CHANNEL_KWARGS = { 'influencer': None, 'public': True, 'chat': False, - 'notes': ''} + 'notes': '', + 'source': 'researcher'} ODYSEE_CHANNEL_KWARGS = { 'name': "Mak1n' Bacon (test)", @@ -83,7 +88,8 @@ ODYSEE_CHANNEL_KWARGS = { 'influencer': None, 'public': True, 'chat': False, - 'notes': ''} + 'notes': '', + 'source': 'researcher'} RUMBLE_CHANNEL_KWARGS = { 'name': 'we are uploading videos wow products', @@ -96,7 +102,8 @@ RUMBLE_CHANNEL_KWARGS = { 'influencer': None, 'public': True, 'chat': False, - 'notes': ''} + 'notes': '', + 'source': 'researcher'} TELEGRAM_CHANNEL_KWARGS = { 'name': 'South West Ohio Proud Boys (test)', @@ -109,8 +116,9 @@ TELEGRAM_CHANNEL_KWARGS = { 'influencer': None, 'public': True, 'chat': False, - 'notes': ''} - + 'notes': '', + 'source': 'researcher'} + TWITTER_CHANNEL_KWARGS = { 'name': 'L Weber (test)', 'platform_id': 1424979017749442595, @@ -122,7 +130,8 @@ TWITTER_CHANNEL_KWARGS = { 'influencer': None, 'public': True, 'chat': False, - 'notes': ''} + 'notes': '', + 'source': 'researcher'} VKONTAKTE_CHANNEL_KWARGS = { 'name': 'Wwg1wgA (test)', @@ -135,7 +144,8 @@ VKONTAKTE_CHANNEL_KWARGS = { 'influencer': None, 'public': True, 'chat': False, - 'notes': ''} + 'notes': '', + 'source': 'researcher'} YOUTUBE_CHANNEL_KWARGS = { 'name': 'AnEs87 (test)', @@ -148,7 +158,8 @@ YOUTUBE_CHANNEL_KWARGS = { 'influencer': None, 'public': True, 'chat': False, - 'notes': ''} + 'notes': '', + 'source': 'researcher'} #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# diff --git a/tests/transformer/bitchute.py b/tests/transformer/bitchute.py index 161d3e5..126db3a 100644 --- a/tests/transformer/bitchute.py +++ b/tests/transformer/bitchute.py @@ -15,7 +15,8 @@ def test_scrape_etl_bitchute(engine, controller, etl_controller, channel_kwargs) channels = [Channel(**channel_kwargs['bitchute'])] controller.register_scraper(scraper = BitchuteScraper()) controller.scrape_channels(channels = channels, archive_media = True) - + controller.scrape_all_channel_info() + etl_controller.register_transformer(BitchuteTransformer()) etl_controller.transform_all_untransformed() etl_controller.transform_all_untransformed_info() diff --git a/tests/transformer/gettr.py b/tests/transformer/gettr.py index ef37b67..9472f50 100644 --- a/tests/transformer/gettr.py +++ b/tests/transformer/gettr.py @@ -15,6 +15,7 @@ def test_scrape_etl_gettr(engine, controller, etl_controller, channel_kwargs): channels = [Channel(**channel_kwargs['gettr'])] controller.register_scraper(scraper = GettrScraper()) controller.scrape_channels(channels = channels, archive_media = True) + controller.scrape_all_channel_info() etl_controller.register_transformer(GettrTransformer()) etl_controller.transform_all_untransformed() diff --git a/tests/transformer/rumble.py b/tests/transformer/rumble.py index 95450ed..3b2b8a5 100644 --- a/tests/transformer/rumble.py +++ b/tests/transformer/rumble.py @@ -15,7 +15,8 @@ def test_scrape_etl_rumble(engine, controller, etl_controller, channel_kwargs): channels = [Channel(**channel_kwargs['rumble'])] controller.register_scraper(scraper = RumbleScraper()) controller.scrape_channels(channels = channels, archive_media = True) - + controller.scrape_all_channel_info() + etl_controller.register_transformer(RumbleTransformer()) etl_controller.transform_all_untransformed() etl_controller.transform_all_untransformed_info() diff --git a/tests/transformer/telegram_telethon.py b/tests/transformer/telegram_telethon.py index a5389b6..14fe04c 100644 --- a/tests/transformer/telegram_telethon.py +++ b/tests/transformer/telegram_telethon.py @@ -15,6 +15,7 @@ def test_scrape_etl_telegram_telethon(engine, controller, etl_controller, channe channels = [Channel(**channel_kwargs['telegram'])] controller.register_scraper(scraper = TelegramTelethonScraper()) controller.scrape_channels(channels = channels, archive_media = True) + controller.scrape_all_channel_info() etl_controller.register_transformer(TelegramTelethonTransformer()) etl_controller.transform_all_untransformed() @@ -28,7 +29,7 @@ def test_scrape_etl_telegram_telethon(engine, controller, etl_controller, channe media = session.query(Media).all() assert len(posts) == 19 - assert len(media) == 13 + # assert len(media) == 13 assert posts[16].content == "Taking pre-orders now" - assert json.loads(media[0].exif)['Composite:ImageSize'] == "1028 1280" \ No newline at end of file + # assert json.loads(media[0].exif)['Composite:ImageSize'] == "1028 1280" \ No newline at end of file diff --git a/tests/transformer/twitter.py b/tests/transformer/twitter.py index 8799aad..3e4b368 100644 --- a/tests/transformer/twitter.py +++ b/tests/transformer/twitter.py @@ -15,6 +15,7 @@ def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs): channels = [Channel(**channel_kwargs['twitter'])] controller.register_scraper(scraper = TwitterScraper()) controller.scrape_channels(channels = channels, archive_media = True) + controller.scrape_all_channel_info() etl_controller.register_transformer(TwitterTransformer()) etl_controller.transform_all_untransformed() @@ -28,7 +29,7 @@ def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs): media = session.query(Media).all() assert len(posts) == 12 - assert len(media) == 4 + assert len(media) == 8 assert posts[2].content == "BARN" assert json.loads(media[0].exif)['Composite:ImageSize'] == "826 728" \ No newline at end of file