Added fallback options (snscrape, then TGStat) for fetching a channel profile when Telethon GetFullChannelRequest fails

This commit is contained in:
Tristan Lee
2022-07-01 03:19:31 -05:00
parent fb2a6e77cc
commit dcf7e77446
4 changed files with 65 additions and 10 deletions

View File

@@ -357,7 +357,7 @@ class ScraperController:
# This will sort the channels by the least recently scraped.
most_recently_archived = session.query(func.max(RawChannelInfo.date_archived).label("date"), RawChannelInfo.channel.label("channel")).group_by(RawChannelInfo.channel).subquery()
channels = session.query(Channel).\
where(Channel.source=='researcher').\
where(Channel.category=='imported').\
outerjoin(most_recently_archived, Channel.id == most_recently_archived.c.channel).\
order_by(nullsfirst(most_recently_archived.c.date.asc())).all()

View File

@@ -7,12 +7,17 @@ from pathlib import Path
import time
import pickle
import requests
from bs4 import BeautifulSoup
from sqlalchemy import func
from loguru import logger
from telethon.sync import TelegramClient
from telethon.tl.functions.channels import GetFullChannelRequest
from telethon.tl import types
from snscrape.modules.telegram import TelegramChannelScraper
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper
@@ -31,7 +36,7 @@ class TelegramTelethonScraper(Scraper):
phone = os.environ['TELEGRAM_PHONE']
# set up a persistent client for Telethon
self.client = TelegramClient(phone, api_id, api_hash)
self.client = TelegramClient('transform.session', api_id, api_hash)
self.client.connect()
def __del__(self):
@@ -222,14 +227,64 @@ class TelegramTelethonScraper(Scraper):
archived_urls=archived_urls,
media_archived=datetime.now(timezone.utc) if archive_media else None)
def get_full_channel_tgstat(self, channel, timeout=30):
    """Build a channel profile dict by scraping the channel's TGStat stats page.

    The result uses the same ``full_chat`` / ``chats`` layout that
    ``get_full_channel_snscrape`` produces, so either one can stand in for
    the dict obtained from Telethon's ``GetFullChannelRequest``.

    Args:
        channel: Channel whose ``url`` yields the Telegram username and whose
            ``platform_id`` is copied into the result unchanged.
        timeout: Seconds to wait for TGStat before giving up. Without a
            timeout, ``requests`` may block forever on an unresponsive host.

    Returns:
        dict with ``full_chat`` (``id``, ``about``, ``participants_count``)
        and ``chats`` (a single entry with ``username``, ``title``, ``date``).

    Raises:
        ValueError: If TGStat reports the channel as not found, if the page
            does not contain the expected elements, or if the subscriber
            count is not an integer.
        requests.RequestException: On network failure or timeout.
    """
    username = TelegramTelethonScraper.get_username_from_url(channel.url)
    url = f'https://tgstat.com/channel/@{username}/stat'

    # TGStat is requested with a browser-like User-Agent.
    r = requests.get(
        url,
        headers={
            'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0'},
        timeout=timeout)

    if 'Channel not found' in r.text:
        raise ValueError('Channel information not archived')

    soup = BeautifulSoup(r.content, features='lxml')

    # Look everything up first so a changed layout (or a captcha / rate-limit
    # page) produces one descriptive error instead of an AttributeError on None.
    about_tag = soup.find('div', class_='col-12 col-sm-7 col-md-8 col-lg-6')
    members_tag = soup.find('h2', class_='text-dark')
    username_tag = soup.find('a', {'target': '_blank'})
    title_tag = soup.find('h1')

    missing = [
        label
        for label, tag in (
            ('about', about_tag),
            ('participants_count', members_tag),
            ('username', username_tag),
            ('title', title_tag),
        )
        if tag is None
    ]
    if missing:
        raise ValueError(
            f'Unexpected TGStat page for {url} (HTTP {r.status_code}); '
            f'missing: {", ".join(missing)}')

    raw_data = {
        'full_chat': {
            'id': channel.platform_id,
            # The description is taken from the last line of the header block.
            'about': about_tag.text.strip().split('\n')[-1].strip(),
            # The count is rendered with spaces as thousands separators.
            'participants_count': int(members_tag.text.strip().replace(' ', '')),
        },
        'chats': [{
            'username': username_tag.text.strip().strip('@'),
            'title': title_tag.text.strip(),
            # No creation date is extracted from the TGStat page.
            'date': None,
        }],
    }

    return raw_data
def get_full_channel_snscrape(self, channel):
    """Build a channel profile dict from snscrape's Telegram channel entity.

    The channel username is derived from ``channel.url`` and the entity is
    fetched through ``TelegramChannelScraper``. The result has a ``full_chat``
    section (``id``, ``about``, ``participants_count``) and a one-element
    ``chats`` list (``username``, ``title``, ``date``).

    Args:
        channel: Channel providing ``url`` and ``platform_id``.

    Returns:
        dict with the ``full_chat`` and ``chats`` keys described above.
    """
    channel_name = TelegramTelethonScraper.get_username_from_url(channel.url)
    # NOTE(review): _get_entity is a private snscrape method; confirm it is
    # still available when upgrading snscrape.
    info = TelegramChannelScraper(name=channel_name)._get_entity()

    full_chat = {
        'id': channel.platform_id,
        'about': info.description,
        'participants_count': info.members,
    }
    chat = {
        'username': info.username,
        'title': info.title,
        # No creation date is read from the entity here.
        'date': None,
    }

    return {'full_chat': full_chat, 'chats': [chat]}
@logger.catch
def get_profile(self, channel: Channel) -> RawChannelInfo:
    """Fetch the channel's profile and wrap it in a ``RawChannelInfo``.

    Sources are tried in order, each one only if the previous raised:

    1. Telethon ``GetFullChannelRequest`` (result converted with ``to_dict``)
    2. ``get_full_channel_snscrape``
    3. ``get_full_channel_tgstat``

    Args:
        channel: Channel to look up.

    Returns:
        RawChannelInfo whose ``raw_data`` is the JSON-serialized profile dict.

    NOTE(review): the ``@logger.catch`` decorator is used without
    ``reraise=True``; presumably an error from the last fallback is logged
    and the call returns ``None`` -- confirm callers handle that.
    """
    username = TelegramTelethonScraper.get_channel_identifier(channel)

    # The request is issued exactly once, inside the try, so that a failure
    # actually reaches the fallbacks. `except Exception` (rather than a bare
    # `except:`) lets KeyboardInterrupt / SystemExit propagate.
    try:
        full_channel = self.client(GetFullChannelRequest(channel=username))
        profile = full_channel.to_dict()
    except Exception as telethon_error:
        logger.warning(
            f"GetFullChannelRequest failed for {username} ({telethon_error!r}); "
            f"falling back to snscrape")
        try:
            profile = self.get_full_channel_snscrape(channel)
        except Exception as snscrape_error:
            logger.warning(
                f"snscrape profile lookup failed for {username} ({snscrape_error!r}); "
                f"falling back to TGStat")
            profile = self.get_full_channel_tgstat(channel)

    # default=str stringifies values json cannot encode natively.
    return RawChannelInfo(scraper=self.__version__,
                          platform=channel.platform,
                          channel=channel.id,
                          raw_data=json.dumps(profile, default=str),
                          date_archived=datetime.now(timezone.utc))

View File

@@ -201,13 +201,13 @@ class ETLController:
logger.trace(f"{transformer} is handling raw info result {result.id} ({result.date_archived})")
handled = True
transformer.transform_info(result, lambda obj: self.insert_or_select(obj, session, False), session)
transformer.transform_info(result, lambda obj: insert_or_select(obj, session, False), session)
session.commit()
break
if handled == False:
logger.warning(f"No Transformer could handle raw channel info ID {result.id} with platform {result.platform} ({result.date_archived})")
if handled == False:
logger.warning(f"No Transformer could handle raw channel info ID {result.id} with platform {result.platform} ({result.date_archived})")
@logger.catch(reraise=True)

View File

@@ -32,7 +32,7 @@ class TelegramTelethonTransformer(Transformer):
phone = os.environ['TELEGRAM_PHONE']
# set up a persistent client for Telethon
self.client = TelegramClient(phone, api_id, api_hash)
self.client = TelegramClient('transform.session', api_id, api_hash)
self.client.connect()
def can_handle(self, data: ScraperResult) -> bool:
@@ -122,7 +122,7 @@ class TelegramTelethonTransformer(Transformer):
followers=raw['full_chat']['participants_count'],
following=-1, # does not exist for Telegram
verified=False, #does not exist for Telegram
date_created=dateutil.parser.parse(chat_raw['date']),
date_created=dateutil.parser.parse(chat_raw['date']) if chat_raw['date'] else None,
date_archived=data.date_archived,
date_transformed=datetime.now(timezone.utc)
)