diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index 88073ec..888a64f 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -357,7 +357,7 @@ class ScraperController: # This will sort the channels by the least recently scraped. most_recently_archived = session.query(func.max(RawChannelInfo.date_archived).label("date"), RawChannelInfo.channel.label("channel")).group_by(RawChannelInfo.channel).subquery() channels = session.query(Channel).\ - where(Channel.source=='researcher').\ + where(Channel.category=='imported').\ outerjoin(most_recently_archived, Channel.id == most_recently_archived.c.channel).\ order_by(nullsfirst(most_recently_archived.c.date.asc())).all() diff --git a/cisticola/scraper/telegram_telethon.py b/cisticola/scraper/telegram_telethon.py index 7212e91..7d4b8fc 100644 --- a/cisticola/scraper/telegram_telethon.py +++ b/cisticola/scraper/telegram_telethon.py @@ -7,12 +7,17 @@ from pathlib import Path import time import pickle +import requests +from bs4 import BeautifulSoup + from sqlalchemy import func from loguru import logger from telethon.sync import TelegramClient from telethon.tl.functions.channels import GetFullChannelRequest from telethon.tl import types +from snscrape.modules.telegram import TelegramChannelScraper + from cisticola.base import Channel, ScraperResult, RawChannelInfo from cisticola.scraper.base import Scraper @@ -31,7 +36,7 @@ class TelegramTelethonScraper(Scraper): phone = os.environ['TELEGRAM_PHONE'] # set up a persistent client for Telethon - self.client = TelegramClient(phone, api_id, api_hash) + self.client = TelegramClient('transform.session', api_id, api_hash) self.client.connect() def __del__(self): @@ -222,14 +227,64 @@ class TelegramTelethonScraper(Scraper): archived_urls=archived_urls, media_archived=datetime.now(timezone.utc) if archive_media else None) + def get_full_channel_tgstat(self, channel): + + username = TelegramTelethonScraper.get_username_from_url(channel.url) + url = f'https://tgstat.com/channel/@{username}/stat' + r = requests.get(url, headers = { + 'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0'}) + if 'Channel not found' in r.text: + raise ValueError(f'Channel information not archived') + + soup = BeautifulSoup(r.content, features = 'lxml') + + raw_data = { + 'full_chat': { + 'id': channel.platform_id, + 'about': soup.find('div', class_ = 'col-12 col-sm-7 col-md-8 col-lg-6').text.strip().split('\n')[-1].strip(), + 'participants_count': int(soup.find('h2', class_ = 'text-dark').text.strip().replace(' ', '')) + }, + 'chats': [{ + 'username':soup.find('a', {'target': '_blank'}).text.strip().strip('@'), + 'title': soup.find('h1').text.strip(), + 'date': None, + }], + } + + return raw_data + + def get_full_channel_snscrape(self, channel): + username = TelegramTelethonScraper.get_username_from_url(channel.url) + scraper = TelegramChannelScraper(name = username) + entity = scraper._get_entity() + raw_data = { + 'full_chat': { + 'id': channel.platform_id, + 'about': entity.description, + 'participants_count': entity.members + }, + 'chats': [{ + 'username': entity.username, + 'title': entity.title, + 'date': None, + }], + } + return raw_data + @logger.catch def get_profile(self, channel: Channel) -> RawChannelInfo: username = TelegramTelethonScraper.get_channel_identifier(channel) - full_channel = self.client(GetFullChannelRequest(channel = username)) - profile = full_channel.to_dict() + try: + full_channel = self.client(GetFullChannelRequest(channel = username)) + profile = full_channel.to_dict() + except: + try: + profile = self.get_full_channel_snscrape(channel) + except: + profile = self.get_full_channel_tgstat(channel) return RawChannelInfo(scraper=self.__version__, platform=channel.platform, channel=channel.id, raw_data=json.dumps(profile, default=str), - date_archived=datetime.now(timezone.utc)) + date_archived=datetime.now(timezone.utc)) \ No newline at end of file diff --git a/cisticola/transformer/base.py b/cisticola/transformer/base.py index 97313da..c487abd 100644 --- a/cisticola/transformer/base.py +++ b/cisticola/transformer/base.py @@ -201,13 +201,13 @@ class ETLController: logger.trace(f"{transformer} is handling raw info result {result.id} ({result.date_archived})") handled = True - transformer.transform_info(result, lambda obj: self.insert_or_select(obj, session, False), session) + transformer.transform_info(result, lambda obj: insert_or_select(obj, session, False), session) session.commit() break - if handled == False: - logger.warning(f"No Transformer could handle raw channel info ID {result.id} with platform {result.platform} ({result.date_archived})") + if handled == False: + logger.warning(f"No Transformer could handle raw channel info ID {result.id} with platform {result.platform} ({result.date_archived})") @logger.catch(reraise=True) diff --git a/cisticola/transformer/telegram_telethon.py b/cisticola/transformer/telegram_telethon.py index 52c3a47..ca685dc 100644 --- a/cisticola/transformer/telegram_telethon.py +++ b/cisticola/transformer/telegram_telethon.py @@ -32,7 +32,7 @@ class TelegramTelethonTransformer(Transformer): phone = os.environ['TELEGRAM_PHONE'] # set up a persistent client for Telethon - self.client = TelegramClient(phone, api_id, api_hash) + self.client = TelegramClient('transform.session', api_id, api_hash) self.client.connect() def can_handle(self, data: ScraperResult) -> bool: @@ -122,7 +122,7 @@ class TelegramTelethonTransformer(Transformer): followers=raw['full_chat']['participants_count'], following=-1, # does not exist for Telegram verified=False, #does not exist for Telegram - date_created=dateutil.parser.parse(chat_raw['date']), + date_created=dateutil.parser.parse(chat_raw['date']) if chat_raw['date'] else None, date_archived=data.date_archived, date_transformed=datetime.now(timezone.utc) )