mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-11 12:58:33 +03:00
Added fallback options for when Telethon's GetFullChannelRequest fails
This commit is contained in:
@@ -357,7 +357,7 @@ class ScraperController:
|
||||
# This will sort the channels by the least recently scraped.
|
||||
most_recently_archived = session.query(func.max(RawChannelInfo.date_archived).label("date"), RawChannelInfo.channel.label("channel")).group_by(RawChannelInfo.channel).subquery()
|
||||
channels = session.query(Channel).\
|
||||
where(Channel.source=='researcher').\
|
||||
where(Channel.category=='imported').\
|
||||
outerjoin(most_recently_archived, Channel.id == most_recently_archived.c.channel).\
|
||||
order_by(nullsfirst(most_recently_archived.c.date.asc())).all()
|
||||
|
||||
|
||||
@@ -7,12 +7,17 @@ from pathlib import Path
|
||||
import time
|
||||
import pickle
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from sqlalchemy import func
|
||||
from loguru import logger
|
||||
from telethon.sync import TelegramClient
|
||||
from telethon.tl.functions.channels import GetFullChannelRequest
|
||||
from telethon.tl import types
|
||||
|
||||
from snscrape.modules.telegram import TelegramChannelScraper
|
||||
|
||||
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
||||
from cisticola.scraper.base import Scraper
|
||||
|
||||
@@ -31,7 +36,7 @@ class TelegramTelethonScraper(Scraper):
|
||||
phone = os.environ['TELEGRAM_PHONE']
|
||||
|
||||
# set up a persistent client for Telethon
|
||||
self.client = TelegramClient(phone, api_id, api_hash)
|
||||
self.client = TelegramClient('transform.session', api_id, api_hash)
|
||||
self.client.connect()
|
||||
|
||||
def __del__(self):
|
||||
@@ -222,14 +227,64 @@ class TelegramTelethonScraper(Scraper):
|
||||
archived_urls=archived_urls,
|
||||
media_archived=datetime.now(timezone.utc) if archive_media else None)
|
||||
|
||||
def get_full_channel_tgstat(self, channel):
    """Build a minimal channel profile by scraping the channel's tgstat.com page.

    The result uses the ``full_chat`` / ``chats`` layout, filled with the
    fields that can be read from the tgstat statistics page.

    Args:
        channel: Channel whose ``url`` yields the Telegram username and
            whose ``platform_id`` is copied into the result.

    Returns:
        dict with ``full_chat`` (``id``, ``about``, ``participants_count``)
        and a one-element ``chats`` list (``username``, ``title``, ``date``).
        ``date`` is always ``None``; it is not read from the page.

    Raises:
        ValueError: if the page says the channel was not found, an expected
            element is missing from the page, or the participant count is
            not an integer.
        requests.RequestException: on network failure or timeout.
    """
    username = TelegramTelethonScraper.get_username_from_url(channel.url)
    url = f'https://tgstat.com/channel/@{username}/stat'

    # Without a timeout, requests waits indefinitely on a stalled connection.
    r = requests.get(
        url,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0'},
        timeout=30)
    if 'Channel not found' in r.text:
        raise ValueError('Channel information not archived')

    soup = BeautifulSoup(r.content, features='lxml')

    def text_of(node, selector):
        # find() returns None when nothing matches; fail with a readable
        # message instead of an AttributeError on ``None.text``.
        if node is None:
            raise ValueError(
                f'tgstat page {url} has no element matching {selector}')
        return node.text.strip()

    about_block = text_of(
        soup.find('div', class_='col-12 col-sm-7 col-md-8 col-lg-6'),
        'div.col-12.col-sm-7.col-md-8.col-lg-6')
    count_text = text_of(soup.find('h2', class_='text-dark'), 'h2.text-dark')
    page_username = text_of(
        soup.find('a', {'target': '_blank'}), 'a[target=_blank]')
    title = text_of(soup.find('h1'), 'h1')

    raw_data = {
        'full_chat': {
            'id': channel.platform_id,
            # Only the last line of the block is used as the description.
            'about': about_block.split('\n')[-1].strip(),
            # Drop every kind of whitespace (not just ASCII spaces) used as
            # a digit-group separator before converting.
            'participants_count': int(''.join(count_text.split())),
        },
        'chats': [{
            'username': page_username.strip('@'),
            'title': title,
            'date': None,
        }],
    }

    return raw_data
|
||||
|
||||
def get_full_channel_snscrape(self, channel):
    """Build a minimal channel profile from snscrape's Telegram channel entity.

    Args:
        channel: Channel whose ``url`` yields the Telegram username and
            whose ``platform_id`` is copied into the result.

    Returns:
        dict with ``full_chat`` (``id``, ``about``, ``participants_count``)
        and a one-element ``chats`` list (``username``, ``title``, ``date``).
        ``date`` is always ``None``.
    """
    handle = TelegramTelethonScraper.get_username_from_url(channel.url)
    # NOTE: _get_entity() is a private snscrape method (leading underscore).
    entity = TelegramChannelScraper(name=handle)._get_entity()

    full_chat = {
        'id': channel.platform_id,
        'about': entity.description,
        'participants_count': entity.members,
    }
    chat = {
        'username': entity.username,
        'title': entity.title,
        'date': None,
    }
    return {'full_chat': full_chat, 'chats': [chat]}
|
||||
|
||||
@logger.catch
def get_profile(self, channel: Channel) -> RawChannelInfo:
    """Fetch the channel's profile and wrap it in a ``RawChannelInfo``.

    Sources are tried in order, moving on only when the previous one raises:

    1. Telethon ``GetFullChannelRequest`` through ``self.client``.
    2. ``get_full_channel_snscrape``.
    3. ``get_full_channel_tgstat``.

    Args:
        channel: Channel to look up; its ``platform`` and ``id`` are copied
            into the returned record.

    Returns:
        RawChannelInfo whose ``raw_data`` is the profile dict serialised as
        JSON (values that are not JSON-native are stringified via
        ``default=str``).

    Note:
        If the last fallback also fails, its exception leaves this body and
        is handled by the ``@logger.catch`` decorator.
    """
    username = TelegramTelethonScraper.get_channel_identifier(channel)

    # Catch Exception rather than using a bare ``except`` so that
    # KeyboardInterrupt / SystemExit still stop the scraper, and log each
    # failure so it is visible why a fallback source was used.
    try:
        full_channel = self.client(GetFullChannelRequest(channel=username))
        profile = full_channel.to_dict()
    except Exception as telethon_error:
        logger.warning(
            f"GetFullChannelRequest failed for {username} ({telethon_error!r}); falling back to snscrape")
        try:
            profile = self.get_full_channel_snscrape(channel)
        except Exception as snscrape_error:
            logger.warning(
                f"snscrape lookup failed for {username} ({snscrape_error!r}); falling back to tgstat")
            profile = self.get_full_channel_tgstat(channel)

    return RawChannelInfo(scraper=self.__version__,
                          platform=channel.platform,
                          channel=channel.id,
                          raw_data=json.dumps(profile, default=str),
                          date_archived=datetime.now(timezone.utc))
|
||||
@@ -201,13 +201,13 @@ class ETLController:
|
||||
logger.trace(f"{transformer} is handling raw info result {result.id} ({result.date_archived})")
|
||||
handled = True
|
||||
|
||||
transformer.transform_info(result, lambda obj: self.insert_or_select(obj, session, False), session)
|
||||
transformer.transform_info(result, lambda obj: insert_or_select(obj, session, False), session)
|
||||
|
||||
session.commit()
|
||||
break
|
||||
|
||||
if handled == False:
|
||||
logger.warning(f"No Transformer could handle raw channel info ID {result.id} with platform {result.platform} ({result.date_archived})")
|
||||
if handled == False:
|
||||
logger.warning(f"No Transformer could handle raw channel info ID {result.id} with platform {result.platform} ({result.date_archived})")
|
||||
|
||||
|
||||
@logger.catch(reraise=True)
|
||||
|
||||
@@ -32,7 +32,7 @@ class TelegramTelethonTransformer(Transformer):
|
||||
phone = os.environ['TELEGRAM_PHONE']
|
||||
|
||||
# set up a persistent client for Telethon
|
||||
self.client = TelegramClient(phone, api_id, api_hash)
|
||||
self.client = TelegramClient('transform.session', api_id, api_hash)
|
||||
self.client.connect()
|
||||
|
||||
def can_handle(self, data: ScraperResult) -> bool:
|
||||
@@ -122,7 +122,7 @@ class TelegramTelethonTransformer(Transformer):
|
||||
followers=raw['full_chat']['participants_count'],
|
||||
following=-1, # does not exist for Telegram
|
||||
verified=False, #does not exist for Telegram
|
||||
date_created=dateutil.parser.parse(chat_raw['date']),
|
||||
date_created=dateutil.parser.parse(chat_raw['date']) if chat_raw['date'] else None,
|
||||
date_archived=data.date_archived,
|
||||
date_transformed=datetime.now(timezone.utc)
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user