From 92d4839b5ecf1dd6b6ab0924778ed73a6c71fab4 Mon Sep 17 00:00:00 2001
From: Logan Williams <logan.williams@alum.mit.edu>
Date: Thu, 9 Jun 2022 10:01:27 +0200
Subject: [PATCH] Revise Telethon scraper to use the same client connection

---
 cisticola/scraper/telegram_telethon.py | 97 ++++++++++++--------------
 1 file changed, 46 insertions(+), 51 deletions(-)

diff --git a/cisticola/scraper/telegram_telethon.py b/cisticola/scraper/telegram_telethon.py
index 6dd9d23..1616a9a 100644
--- a/cisticola/scraper/telegram_telethon.py
+++ b/cisticola/scraper/telegram_telethon.py
@@ -19,6 +19,22 @@ MEDIA_TYPES = ['photo', 'video', 'document', 'webpage']
 class TelegramTelethonScraper(Scraper):
     """An implementation of a Scraper for Telegram, using Telethon library"""
     __version__ = "TelegramTelethonScraper 0.0.2"
+    client = None
+
+    def __init__(self):
+        super().__init__()
+
+        api_id = os.environ['TELEGRAM_API_ID']
+        api_hash = os.environ['TELEGRAM_API_HASH']
+        phone = os.environ['TELEGRAM_PHONE']
+
+        # set up a persistent client for Telethon
+        self.client =  TelegramClient(phone, api_id, api_hash)
+        self.client.connect()
+
+    def __del__(self):
+        logger.info("Disconnecting Telethon client")
+        self.client.disconnect()
 
     def get_username_from_url(url):
         username = url.split('https://t.me/')[1]
@@ -39,18 +55,10 @@ class TelegramTelethonScraper(Scraper):
         return identifier
 
     @logger.catch
-    def archive_files(self, result: ScraperResult, client : TelegramClient = None) -> ScraperResult:
+    def archive_files(self, result: ScraperResult) -> ScraperResult:
         if len(result.archived_urls.keys()) == 0:
             return result
 
-        if client is None:
-            api_id = os.environ['TELEGRAM_API_ID']
-            api_hash = os.environ['TELEGRAM_API_HASH']
-            phone = os.environ['TELEGRAM_PHONE']
-
-            with TelegramClient(phone, api_id, api_hash) as client:
-                return self.archive_files(result, client)
-
         if len(list(result.archived_urls.keys())) != 1:
             logger.warning(f"Expected 1 key in archived_urls, found {result.archived_keys}")
         else:
@@ -59,11 +67,11 @@ class TelegramTelethonScraper(Scraper):
             if result.archived_urls[key] is None:
                 raw = json.loads(result.raw_data)
                     
-                message = client.get_messages(raw['peer_id']['channel_id'], ids=[raw['id']])
+                message = self.client.get_messages(raw['peer_id']['channel_id'], ids=[raw['id']])
 
                 blob = None
                 if len(message) > 0:
-                    blob, output_file_with_ext = self.archive_post_media(message[0], client)
+                    blob, output_file_with_ext = self.archive_post_media(message[0])
                 else:
                     logger.warning("No message retrieved")
 
@@ -83,19 +91,11 @@ class TelegramTelethonScraper(Scraper):
             
         return result
 
-    def archive_post_media(self, post : types.Message, client : TelegramClient = None):
+    def archive_post_media(self, post : types.Message):
         if post.media is None:
             logger.debug("No media for post")
             return None, None
         
-        if client is None:
-            api_id = os.environ['TELEGRAM_API_ID']
-            api_hash = os.environ['TELEGRAM_API_HASH']
-            phone = os.environ['TELEGRAM_PHONE']
-
-            with TelegramClient(phone, api_id, api_hash) as client:
-                return self.archive_post_media(post, client=client)
-
         if type(post.media) == types.MessageMediaDocument:
             if post.media.document.size/(1024*1024) > 50:
                 logger.info(f"Skipping archive of large {type(post.media)} with size {post.media.document.size/(1024*1024)} MB")
@@ -110,7 +110,7 @@ class TelegramTelethonScraper(Scraper):
         with tempfile.TemporaryDirectory() as temp_dir:
             output_file = Path(temp_dir, key)
 
-            client.download_media(post.media, output_file)
+            self.client.download_media(post.media, output_file)
 
             if len(os.listdir(temp_dir)) == 0:
                 logger.warning(f"No file present. Could not archive {post.media}")
@@ -131,42 +131,37 @@ class TelegramTelethonScraper(Scraper):
     def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
         username = TelegramTelethonScraper.get_channel_identifier(channel)
 
-        api_id = os.environ['TELEGRAM_API_ID']
-        api_hash = os.environ['TELEGRAM_API_HASH']
-        phone = os.environ['TELEGRAM_PHONE']
+        for post in self.client.iter_messages(username):
+            post_url = f'{channel.url}/{post.id}'
 
-        with TelegramClient(phone, api_id, api_hash) as client:
-            for post in client.iter_messages(username):
-                post_url = f'{channel.url}/{post.id}'
+            logger.trace(f"Archiving post {post_url} from {post.date}")
 
-                logger.trace(f"Archiving post {post_url} from {post.date}")
+            if since is not None and post.date.replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
+                logger.info(f'Timestamp of post {post} is earlier than the previous archived timestamp {post.date.replace(tzinfo=timezone.utc)}')
+                break
 
-                if since is not None and post.date.replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
-                    logger.info(f'Timestamp of post {post} is earlier than the previous archived timestamp {post.date.replace(tzinfo=timezone.utc)}')
-                    break
+            archived_urls = {}
 
-                archived_urls = {}
+            if post.media is not None:                    
+                archived_urls[post_url] = None
 
-                if post.media is not None:                    
-                    archived_urls[post_url] = None
+                # if archive_media:
+                #     blob, output_file_with_ext = self.archive_post_media(post, client)
+                #     if blob is not None:
+                #         # TODO specify Content-Type
+                #         archived_url = self.archive_blob(blob = blob, content_type = '', key = output_file_with_ext)
+                #         archived_urls[post_url] = archived_url
 
-                    # if archive_media:
-                    #     blob, output_file_with_ext = self.archive_post_media(post, client)
-                    #     if blob is not None:
-                    #         # TODO specify Content-Type
-                    #         archived_url = self.archive_blob(blob = blob, content_type = '', key = output_file_with_ext)
-                    #         archived_urls[post_url] = archived_url
-
-                yield ScraperResult(
-                    scraper=self.__version__,
-                    platform="Telegram",
-                    channel=channel.id,
-                    platform_id=post_url,
-                    date=post.date.replace(tzinfo=timezone.utc),
-                    date_archived=datetime.now(timezone.utc),
-                    raw_data=json.dumps(post.to_dict(), default=str),
-                    archived_urls=archived_urls,
-                    media_archived=datetime.now(timezone.utc) if archive_media else None)
+            yield ScraperResult(
+                scraper=self.__version__,
+                platform="Telegram",
+                channel=channel.id,
+                platform_id=post_url,
+                date=post.date.replace(tzinfo=timezone.utc),
+                date_archived=datetime.now(timezone.utc),
+                raw_data=json.dumps(post.to_dict(), default=str),
+                archived_urls=archived_urls,
+                media_archived=datetime.now(timezone.utc) if archive_media else None)
 
     @logger.catch
     def get_profile(self, channel: Channel) -> RawChannelInfo: