Fixed the Gettr and Bitchute info transformers, added missing TelegramTransformer fields and corrected incorrect ones, and added Telegram mentions to the transformer.

This commit is contained in:
Tristan Lee
2022-06-13 13:42:33 -05:00
parent 6e962de244
commit a2a7882f1c
13 changed files with 150 additions and 67 deletions

View File

@@ -232,6 +232,9 @@ class Post:
#: The ID of the Post that this Post is a reply to or reblog of
reply_to: int = None
#: Other users mentioned in the post
mentions: list = field(default_factory=list)
def hydrate(self):
URL_REGEX = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/
?(?!@)))"""
@@ -398,7 +401,6 @@ channel_info_table = Table('channel_info', mapper_registry.metadata,
Column('channel', Integer, ForeignKey('channels.id'), index=True),
Column('platform_id', String),
Column('scraper', String),
Column('platform', String),
Column('transformer', String),
Column('platform', String),
Column('screenname', String),
@@ -452,6 +454,7 @@ post_table = Table('posts', mapper_registry.metadata,
Column('cryptocurrency_addresses', JSON),
Column('hashtags', JSON),
Column('outlinks', JSON),
Column('mentions', JSON),
Column('detected_language', String),
Column('normalized_content', String)
)

View File

@@ -105,7 +105,7 @@ class BitchuteScraper(Scraper):
profile = {
'description' : description_soup.text.strip(),
'description_links' : [a['href'] for a in description_soup.find_all('a', href = True)],
'created': parse_created(re.sub(r'\s', ' ', info_list[0].text.split('Created')[1].strip('. '))),
'created': re.sub(r'\s', ' ', info_list[0].text.split('Created')[1].strip('. ')),
'videos' : int(info_list[1].text.split('videos')[0].strip()),
'owner_url' : soup.find('p', {'class' : 'owner'}).find('a', href = True)['href'],
'owner_name' : owner_name,
@@ -483,16 +483,4 @@ def decode_cfemail(cfemail):
for i in range(2, len(cfemail)-1, 2):
email += chr(int(cfemail[i:i+2], 16)^k)
return email
#---------------------------------------------------------------------------#
def parse_created(created):
    """Convert a relative age string into an approximate creation datetime.

    ``created`` is expected to look like ``"1 year, 10 months ago"``: a
    comma-separated list of ``"<number> <unit>"`` periods, optionally followed
    by ``"ago"``. The periods are subtracted from the current local time.

    Raises ``ValueError`` if a period is not exactly ``"<number> <unit>"`` or
    if the number is not an integer.
    """
    # relativedelta treats singular keywords (``hour=1``) as absolute
    # replacements and plural keywords (``hours=1``) as relative offsets, so
    # every recognised singular unit is pluralised before it is passed on.
    relative_units = ('year', 'month', 'week', 'day', 'hour', 'minute', 'second')

    kwargs = {}
    for period in created.split('ago')[0].split(','):
        # split() without an argument tolerates repeated or unusual whitespace
        parts = period.split()
        if len(parts) != 2:
            raise ValueError(
                f"Cannot parse period {period!r} in created string {created!r}")
        number, unit = parts
        if unit in relative_units:
            unit += 's'
        kwargs[unit] = int(number)

    return datetime.now() - relativedelta(**kwargs)
return email

View File

@@ -166,13 +166,7 @@ class TelegramTelethonScraper(Scraper):
@logger.catch
def get_profile(self, channel: Channel) -> RawChannelInfo:
username = TelegramTelethonScraper.get_channel_identifier(channel)
api_id = os.environ['TELEGRAM_API_ID']
api_hash = os.environ['TELEGRAM_API_HASH']
phone = os.environ['TELEGRAM_PHONE']
with TelegramClient(phone, api_id, api_hash) as client:
full_channel = client(GetFullChannelRequest(channel = username))
full_channel = self.client(GetFullChannelRequest(channel = username))
profile = full_channel.to_dict()
return RawChannelInfo(scraper=self.__version__,

View File

@@ -96,7 +96,7 @@ class ETLController:
# This relies on some ad hoc unique constraints that might be worth formalizing at some point
if type(obj) == Channel:
instance = session.query(Channel).filter_by(url=obj.url, platform_id=str(obj.platform_id), platform=obj.platform).first()
instance = session.query(Channel).filter_by(url=obj.url, platform_id=str(obj.platform_id or '') or obj.platform_id, platform=obj.platform).first()
elif type(obj) == Post:
instance = None
@@ -133,6 +133,8 @@ class ETLController:
obj.hydrate()
session.add(obj)
session.flush()
logger.trace(f"Inserted new object {obj}")
return obj

View File

@@ -2,7 +2,7 @@ import json
from loguru import logger
from typing import Generator, Union, Callable
from datetime import datetime, timezone
import dateutil.parser
from dateutil.relativedelta import relativedelta
from bs4 import BeautifulSoup
@@ -12,7 +12,7 @@ from cisticola.base import RawChannelInfo, ScraperResult, Post, Image, Video, Me
class BitchuteTransformer(Transformer):
"""A Bitchute specific ScraperResult, with a method ETL/transforming"""
__version__ = "BitchuteTransformer 0.0.1"
__version__ = "BitchuteTransformer 0.0.2"
def can_handle(self, data: ScraperResult) -> bool:
scraper = data.scraper.split(' ')
@@ -49,7 +49,7 @@ class BitchuteTransformer(Transformer):
followers=raw['subscribers'],
following=-1, # does not exist for Bitchute
verified=False, # does not exist for Bitchute
date_created=dateutil.parser.parse(raw['created']),
date_created=parse_created(raw['created'], data.date_archived),
date_archived=data.date_archived,
date_transformed=datetime.now(timezone.utc)
)
@@ -77,4 +77,20 @@ class BitchuteTransformer(Transformer):
author_id=raw['author_id'],
author_username=raw['author'])
transformed = insert(transformed)
transformed = insert(transformed)
def parse_created(created: str, date_archived: datetime) -> datetime:
    """Convert a created string (e.g. ``"1 year, 10 months ago"``) to a datetime
    object relative to the specified ``date_archived``.

    If ``created`` is already an ISO-8601 datetime string it is parsed and
    returned directly. Otherwise it is treated as a comma-separated list of
    ``"<number> <unit>"`` periods, optionally followed by ``"ago"``, and those
    periods are subtracted from ``date_archived``.

    Raises ``ValueError`` if a period is not exactly ``"<number> <unit>"`` or
    if the number is not an integer.
    """
    try:
        # handle case where `created` string has already been parsed into a datetime
        return datetime.fromisoformat(created)
    except ValueError:
        pass

    # relativedelta treats singular keywords (``hour=1``) as absolute
    # replacements and plural keywords (``hours=1``) as relative offsets, so
    # every recognised singular unit is pluralised before it is passed on.
    relative_units = ('year', 'month', 'week', 'day', 'hour', 'minute', 'second')

    kwargs = {}
    for period in created.split('ago')[0].split(','):
        # split() without an argument tolerates repeated or unusual whitespace
        parts = period.split()
        if len(parts) != 2:
            raise ValueError(
                f"Cannot parse period {period!r} in created string {created!r}")
        number, unit = parts
        if unit in relative_units:
            unit += 's'
        kwargs[unit] = int(number)

    return date_archived - relativedelta(**kwargs)

View File

@@ -31,13 +31,13 @@ class GettrTransformer(Transformer):
transformer=self.__version__,
screenname=raw['username'],
name=raw['nickname'],
description=raw['dsc'],
description_url=raw['website'],
description_location=raw['location'],
followers=raw['flg'],
following=raw['flw'],
description=raw.get('dsc'),
description_url=raw.get('website'),
description_location=raw.get('location'),
followers=int(raw['flg']),
following=int(raw['flw']),
verified=True if raw.get('infl') else False,
date_created=datetime.fromtimestamp(raw['cdate']*0.001),
date_created=datetime.fromtimestamp(int(raw['cdate'])*0.001),
date_archived=data.date_archived,
date_transformed=datetime.now(timezone.utc)
)

View File

@@ -8,15 +8,19 @@ import requests
import time
from telethon.sync import TelegramClient
from telethon.errors.rpcerrorlist import ChannelPrivateError, ChannelInvalidError
from telethon.tl import types
from telethon.helpers import add_surrogate
import os
from datetime import datetime, timezone
from sqlalchemy import func
from cisticola.transformer.base import Transformer
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
class TelegramTelethonTransformer(Transformer):
__version__ = 'TelegramTelethonTransformer 0.0.2'
__version__ = 'TelegramTelethonTransformer 0.0.3'
bad_channels = {}
@@ -34,8 +38,10 @@ class TelegramTelethonTransformer(Transformer):
try:
with TelegramClient("transform.session", api_id, api_hash) as client:
data = client.get_entity(channel_id)
return (data.username, data.title, "")
if isinstance(data, types.User):
return (data.username, str(data.first_name or "") + " " + str(data.last_name or ""), "")
else:
return (data.username, data.title, "")
except ChannelPrivateError:
logger.info("ChannelPrivateError")
return ("", "", "ChannelPrivateError")
@@ -161,6 +167,43 @@ class TelegramTelethonTransformer(Transformer):
else:
reply_to = post.id
mentions = []
for mention_entity in [entity for entity in raw['entities'] if entity['_'] == 'MessageEntityMention']:
offset = mention_entity['offset']
length = mention_entity['length']
screenname = add_surrogate(raw['message'])[offset:offset+length].strip('@').strip()
channel = session.query(Channel).filter(func.lower(Channel.screenname)==func.lower(screenname)).first()
if channel is None:
channel = Channel(
name = None,
platform_id = None,
platform = 'Telegram',
url="https://t.me/s/" + screenname,
screenname=screenname,
category='mentioned',
source=self.__version__,
)
channel = insert(channel)
logger.info(f"Added {channel}")
mentions.append(channel.id)
channel = session.query(Channel).filter_by(id=int(data.channel)).first()
if channel is not None:
url = channel.url.strip('/') + f"/{raw['id']}"
author_username = channel.screenname
else:
url = ""
author_username = ""
transformed = Post(
raw_id = data.id,
platform_id = raw['id'],
@@ -171,24 +214,45 @@ class TelegramTelethonTransformer(Transformer):
date=dateutil.parser.parse(raw['date']),
date_archived=data.date_archived,
date_transformed=datetime.now(timezone.utc),
url="",
content=raw['message'],
author_id=raw['post_author'],
author_username="",
url=url,
content=add_markdown_links(raw),
author_id=raw.get('peer_id', {}).get('channel_id'),
author_username=author_username,
forwarded_from=fwd_from,
reply_to=reply_to
reply_to=reply_to,
mentions = mentions
)
transformed = insert(transformed)
for k in data.archived_urls:
if data.archived_urls[k]:
archived_url = data.archived_urls[k]
ext = archived_url.split('.')[-1]
# for k in data.archived_urls:
# if data.archived_urls[k]:
# archived_url = data.archived_urls[k]
# ext = archived_url.split('.')[-1]
if ext == 'mp4' or ext == 'mov' or ext == 'avi' or ext =='mkv':
insert(Video(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))
else:
insert(Image(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))
# if ext == 'mp4' or ext == 'mov' or ext == 'avi' or ext =='mkv':
# insert(Video(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))
# else:
# insert(Image(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))
def add_markdown_links(raw_post):
    """Return the post's message with its text-URL entities rendered as Markdown.

    ``raw_post`` is a dict with a ``'message'`` string and an ``'entities'``
    list. Every entity whose ``'_'`` is ``'MessageEntityTextUrl'`` is rewritten
    in the message as ``[text](url)``, using the entity's ``'offset'``,
    ``'length'`` and ``'url'``. Whitespace at the edges of the linked span is
    kept outside the brackets, in its original position. A span that is entirely
    whitespace yields ``[](url)`` followed by that whitespace.

    Offsets and lengths are interpreted as UTF-16 code units, matching the
    ``add_surrogate`` handling used for mention entities, so characters outside
    the Basic Multilingual Plane (e.g. emoji) do not shift later links.

    The message is returned unchanged when there are no text-URL entities.
    """
    message = raw_post['message']
    links = sorted(
        (entity for entity in (raw_post.get('entities') or [])
         if entity['_'] == 'MessageEntityTextUrl'),
        key=lambda entity: entity['offset'],
    )
    if not links:
        return message

    # Two bytes per UTF-16 code unit, so an entity offset maps to byte 2*offset.
    encoded = message.encode('utf-16-le', 'surrogatepass')

    def segment(start, end):
        return encoded[2 * start:2 * end].decode('utf-16-le', 'surrogatepass')

    pieces = []
    cursor = 0  # position in the ORIGINAL message, so no running offset is needed
    for link in links:
        start = link['offset']
        end = start + link['length']
        if start < cursor:
            # Overlaps a link that was already emitted; skip it rather than
            # duplicate text.
            continue

        span = segment(start, end)
        label = span.strip()
        if label:
            lead_len = len(span) - len(span.lstrip())
            leading = span[:lead_len]
            trailing = span[lead_len + len(label):]
        else:
            leading, trailing = '', span

        pieces.append(segment(cursor, start))
        pieces.append(f"{leading}[{label}]({link['url']}){trailing}")
        cursor = end

    pieces.append(segment(cursor, len(encoded) // 2))
    return ''.join(pieces)

View File

@@ -18,7 +18,8 @@ BITCHUTE_CHANNEL_KWARGS = {
'influencer': None,
'public': True,
'chat': False,
'notes': ''}
'notes': '',
'source': 'researcher'}
GAB_CHANNEL_KWARGS = {
'name': 'Capt. Marc Simon (test)',
@@ -31,7 +32,8 @@ GAB_CHANNEL_KWARGS = {
'influencer': None,
'public': True,
'chat': False,
'notes': ''}
'notes': '',
'source': 'researcher'}
GAB_GROUP_KWARGS = {
'name': 'iran group (test)',
@@ -44,7 +46,8 @@ GAB_GROUP_KWARGS = {
'influencer': None,
'public': True,
'chat': True,
'notes': ''}
'notes': '',
'source': 'researcher'}
GETTR_CHANNEL_KWARGS = {
'name': 'LizardRepublic (test)',
@@ -57,7 +60,8 @@ GETTR_CHANNEL_KWARGS = {
'influencer': None,
'public': True,
'chat': False,
'notes': ''}
'notes': '',
'source': 'researcher'}
INSTAGRAM_CHANNEL_KWARGS = {
'name': 'borland.88 (test)',
@@ -70,7 +74,8 @@ INSTAGRAM_CHANNEL_KWARGS = {
'influencer': None,
'public': True,
'chat': False,
'notes': ''}
'notes': '',
'source': 'researcher'}
ODYSEE_CHANNEL_KWARGS = {
'name': "Mak1n' Bacon (test)",
@@ -83,7 +88,8 @@ ODYSEE_CHANNEL_KWARGS = {
'influencer': None,
'public': True,
'chat': False,
'notes': ''}
'notes': '',
'source': 'researcher'}
RUMBLE_CHANNEL_KWARGS = {
'name': 'we are uploading videos wow products',
@@ -96,7 +102,8 @@ RUMBLE_CHANNEL_KWARGS = {
'influencer': None,
'public': True,
'chat': False,
'notes': ''}
'notes': '',
'source': 'researcher'}
TELEGRAM_CHANNEL_KWARGS = {
'name': 'South West Ohio Proud Boys (test)',
@@ -109,8 +116,9 @@ TELEGRAM_CHANNEL_KWARGS = {
'influencer': None,
'public': True,
'chat': False,
'notes': ''}
'notes': '',
'source': 'researcher'}
TWITTER_CHANNEL_KWARGS = {
'name': 'L Weber (test)',
'platform_id': 1424979017749442595,
@@ -122,7 +130,8 @@ TWITTER_CHANNEL_KWARGS = {
'influencer': None,
'public': True,
'chat': False,
'notes': ''}
'notes': '',
'source': 'researcher'}
VKONTAKTE_CHANNEL_KWARGS = {
'name': 'Wwg1wgA (test)',
@@ -135,7 +144,8 @@ VKONTAKTE_CHANNEL_KWARGS = {
'influencer': None,
'public': True,
'chat': False,
'notes': ''}
'notes': '',
'source': 'researcher'}
YOUTUBE_CHANNEL_KWARGS = {
'name': 'AnEs87 (test)',
@@ -148,7 +158,8 @@ YOUTUBE_CHANNEL_KWARGS = {
'influencer': None,
'public': True,
'chat': False,
'notes': ''}
'notes': '',
'source': 'researcher'}
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

View File

@@ -15,7 +15,8 @@ def test_scrape_etl_bitchute(engine, controller, etl_controller, channel_kwargs)
channels = [Channel(**channel_kwargs['bitchute'])]
controller.register_scraper(scraper = BitchuteScraper())
controller.scrape_channels(channels = channels, archive_media = True)
controller.scrape_all_channel_info()
etl_controller.register_transformer(BitchuteTransformer())
etl_controller.transform_all_untransformed()
etl_controller.transform_all_untransformed_info()

View File

@@ -15,6 +15,7 @@ def test_scrape_etl_gettr(engine, controller, etl_controller, channel_kwargs):
channels = [Channel(**channel_kwargs['gettr'])]
controller.register_scraper(scraper = GettrScraper())
controller.scrape_channels(channels = channels, archive_media = True)
controller.scrape_all_channel_info()
etl_controller.register_transformer(GettrTransformer())
etl_controller.transform_all_untransformed()

View File

@@ -15,7 +15,8 @@ def test_scrape_etl_rumble(engine, controller, etl_controller, channel_kwargs):
channels = [Channel(**channel_kwargs['rumble'])]
controller.register_scraper(scraper = RumbleScraper())
controller.scrape_channels(channels = channels, archive_media = True)
controller.scrape_all_channel_info()
etl_controller.register_transformer(RumbleTransformer())
etl_controller.transform_all_untransformed()
etl_controller.transform_all_untransformed_info()

View File

@@ -15,6 +15,7 @@ def test_scrape_etl_telegram_telethon(engine, controller, etl_controller, channe
channels = [Channel(**channel_kwargs['telegram'])]
controller.register_scraper(scraper = TelegramTelethonScraper())
controller.scrape_channels(channels = channels, archive_media = True)
controller.scrape_all_channel_info()
etl_controller.register_transformer(TelegramTelethonTransformer())
etl_controller.transform_all_untransformed()
@@ -28,7 +29,7 @@ def test_scrape_etl_telegram_telethon(engine, controller, etl_controller, channe
media = session.query(Media).all()
assert len(posts) == 19
assert len(media) == 13
# assert len(media) == 13
assert posts[16].content == "Taking pre-orders now"
assert json.loads(media[0].exif)['Composite:ImageSize'] == "1028 1280"
# assert json.loads(media[0].exif)['Composite:ImageSize'] == "1028 1280"

View File

@@ -15,6 +15,7 @@ def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs):
channels = [Channel(**channel_kwargs['twitter'])]
controller.register_scraper(scraper = TwitterScraper())
controller.scrape_channels(channels = channels, archive_media = True)
controller.scrape_all_channel_info()
etl_controller.register_transformer(TwitterTransformer())
etl_controller.transform_all_untransformed()
@@ -28,7 +29,7 @@ def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs):
media = session.query(Media).all()
assert len(posts) == 12
assert len(media) == 4
assert len(media) == 8
assert posts[2].content == "BARN"
assert json.loads(media[0].exif)['Composite:ImageSize'] == "826 728"