From 92d4839b5ecf1dd6b6ab0924778ed73a6c71fab4 Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Thu, 9 Jun 2022 10:01:27 +0200 Subject: [PATCH] Revise Telethon scraper to use the same client connection --- cisticola/scraper/telegram_telethon.py | 97 ++++++++++++-------------- 1 file changed, 46 insertions(+), 51 deletions(-) diff --git a/cisticola/scraper/telegram_telethon.py b/cisticola/scraper/telegram_telethon.py index 6dd9d23..1616a9a 100644 --- a/cisticola/scraper/telegram_telethon.py +++ b/cisticola/scraper/telegram_telethon.py @@ -19,6 +19,22 @@ MEDIA_TYPES = ['photo', 'video', 'document', 'webpage'] class TelegramTelethonScraper(Scraper): """An implementation of a Scraper for Telegram, using Telethon library""" __version__ = "TelegramTelethonScraper 0.0.2" + client = None + + def __init__(self): + super().__init__() + + api_id = os.environ['TELEGRAM_API_ID'] + api_hash = os.environ['TELEGRAM_API_HASH'] + phone = os.environ['TELEGRAM_PHONE'] + + # set up a persistent client for Telethon + self.client = TelegramClient(phone, api_id, api_hash) + self.client.connect() + + def __del__(self): + logger.info("Disconnecting Telethon client") + self.client.disconnect() def get_username_from_url(url): username = url.split('https://t.me/')[1] @@ -39,18 +55,10 @@ class TelegramTelethonScraper(Scraper): return identifier @logger.catch - def archive_files(self, result: ScraperResult, client : TelegramClient = None) -> ScraperResult: + def archive_files(self, result: ScraperResult) -> ScraperResult: if len(result.archived_urls.keys()) == 0: return result - if client is None: - api_id = os.environ['TELEGRAM_API_ID'] - api_hash = os.environ['TELEGRAM_API_HASH'] - phone = os.environ['TELEGRAM_PHONE'] - - with TelegramClient(phone, api_id, api_hash) as client: - return self.archive_files(result, client) - if len(list(result.archived_urls.keys())) != 1: logger.warning(f"Expected 1 key in archived_urls, found {result.archived_keys}") else: @@ -59,11 +67,11 @@ class TelegramTelethonScraper(Scraper): if result.archived_urls[key] is None: raw = json.loads(result.raw_data) - message = client.get_messages(raw['peer_id']['channel_id'], ids=[raw['id']]) + message = self.client.get_messages(raw['peer_id']['channel_id'], ids=[raw['id']]) blob = None if len(message) > 0: - blob, output_file_with_ext = self.archive_post_media(message[0], client) + blob, output_file_with_ext = self.archive_post_media(message[0]) else: logger.warning("No message retrieved") @@ -83,19 +91,11 @@ class TelegramTelethonScraper(Scraper): return result - def archive_post_media(self, post : types.Message, client : TelegramClient = None): + def archive_post_media(self, post : types.Message): if post.media is None: logger.debug("No media for post") return None, None - if client is None: - api_id = os.environ['TELEGRAM_API_ID'] - api_hash = os.environ['TELEGRAM_API_HASH'] - phone = os.environ['TELEGRAM_PHONE'] - - with TelegramClient(phone, api_id, api_hash) as client: - return self.archive_post_media(post, client=client) - if type(post.media) == types.MessageMediaDocument: if post.media.document.size/(1024*1024) > 50: logger.info(f"Skipping archive of large {type(post.media)} with size {post.media.document.size/(1024*1024)} MB") @@ -110,7 +110,7 @@ class TelegramTelethonScraper(Scraper): with tempfile.TemporaryDirectory() as temp_dir: output_file = Path(temp_dir, key) - client.download_media(post.media, output_file) + self.client.download_media(post.media, output_file) if len(os.listdir(temp_dir)) == 0: logger.warning(f"No file present. Could not archive {post.media}") @@ -131,42 +131,37 @@ class TelegramTelethonScraper(Scraper): def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: username = TelegramTelethonScraper.get_channel_identifier(channel) - api_id = os.environ['TELEGRAM_API_ID'] - api_hash = os.environ['TELEGRAM_API_HASH'] - phone = os.environ['TELEGRAM_PHONE'] + for post in self.client.iter_messages(username): + post_url = f'{channel.url}/{post.id}' - with TelegramClient(phone, api_id, api_hash) as client: - for post in client.iter_messages(username): - post_url = f'{channel.url}/{post.id}' + logger.trace(f"Archiving post {post_url} from {post.date}") - logger.trace(f"Archiving post {post_url} from {post.date}") + if since is not None and post.date.replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc): + logger.info(f'Timestamp of post {post} is earlier than the previous archived timestamp {post.date.replace(tzinfo=timezone.utc)}') + break - if since is not None and post.date.replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc): - logger.info(f'Timestamp of post {post} is earlier than the previous archived timestamp {post.date.replace(tzinfo=timezone.utc)}') - break + archived_urls = {} - archived_urls = {} + if post.media is not None: + archived_urls[post_url] = None - if post.media is not None: - archived_urls[post_url] = None + # if archive_media: + # blob, output_file_with_ext = self.archive_post_media(post, client) + # if blob is not None: + # # TODO specify Content-Type + # archived_url = self.archive_blob(blob = blob, content_type = '', key = output_file_with_ext) + # archived_urls[post_url] = archived_url - # if archive_media: - # blob, output_file_with_ext = self.archive_post_media(post, client) - # if blob is not None: - # # TODO specify Content-Type - # archived_url = self.archive_blob(blob = blob, content_type = '', key = output_file_with_ext) - # archived_urls[post_url] = archived_url - - yield ScraperResult( - scraper=self.__version__, - platform="Telegram", - channel=channel.id, - platform_id=post_url, - date=post.date.replace(tzinfo=timezone.utc), - date_archived=datetime.now(timezone.utc), - raw_data=json.dumps(post.to_dict(), default=str), - archived_urls=archived_urls, - media_archived=datetime.now(timezone.utc) if archive_media else None) + yield ScraperResult( + scraper=self.__version__, + platform="Telegram", + channel=channel.id, + platform_id=post_url, + date=post.date.replace(tzinfo=timezone.utc), + date_archived=datetime.now(timezone.utc), + raw_data=json.dumps(post.to_dict(), default=str), + archived_urls=archived_urls, + media_archived=datetime.now(timezone.utc) if archive_media else None) @logger.catch def get_profile(self, channel: Channel) -> RawChannelInfo: