merged main

2026-06-29 21:48:39 +03:00 · 2022-03-28 20:22:34 -05:00
parent 16870d7daa 5d6473e946
commit ea40ea2640
7 changed files with 205 additions and 121 deletions
--- a/cisticola/scraper/base.py
+++ b/cisticola/scraper/base.py
@@ -234,6 +234,17 @@ class Scraper:

        return archived_url

+    def archive_files(self, result: ScraperResult) -> ScraperResult:
+        for url in result.archived_urls:
+            if result.archived_urls[url] is None:
+                media_blob, content_type, key = self.url_to_blob(url)
+                archived_url = self.archive_blob(media_blob, content_type, key)
+                result.archived_urls[url] = archived_url
+
+        result.media_archived = True
+        return result
+
+
    def can_handle(self, channel: Channel) -> bool:
        """Whether or not the scraper can scrape the specified channel.

@@ -353,6 +364,38 @@ class ScraperController:
            if not handled:
                logger.warning(f"No handler found for Channel {channel}")

+    @logger.catch(reraise = True)
+    def archive_unarchived_media(self):
+        if self.session is None:
+            logger.error("No DB session")
+            return
+
+        session = self.session()
+
+        posts = session.query(ScraperResult).where(ScraperResult.media_archived == False).all()
+
+        logger.info(f"Found {len(posts)} posts without media. Archiving now")
+
+        for post in posts:
+            handled = False
+
+            for scraper in self.scrapers:
+                if scraper.__version__ == post.scraper:
+                    handled = True
+                    logger.info(f"{scraper} is archiving media for {post}")
+                    post = scraper.archive_files(post)
+
+                    if post:
+                        session.query(ScraperResult).where(ScraperResult.id == post.id).update({'archived_urls': post.archived_urls, 'media_archived': True})
+                        session.commit()
+
+                    break
+            
+            if not handled:
+                logger.warning(f"No handler found for post scraped with {post.scraper}")
+
+        session.commit()
+
    def connect_to_db(self, engine):
        """Connect the specified SQLAlchemy engine to the controller.
        """
--- a/cisticola/scraper/telegram_snscrape.py
+++ b/cisticola/scraper/telegram_snscrape.py
@@ -30,19 +30,17 @@ class TelegramSnscrapeScraper(Scraper):

            archived_urls = {}

+            for image_url in post.images:
+                archived_urls[image_url] = None
+
+            if post.video:
+                archived_urls[post.video] = None
+
            if archive_media:
-
-                for image_url in post.images:
-                    logger.debug(f'Archiving image: {image_url}')
-                    media_blob, content_type, key = self.url_to_blob(image_url)
+                for url in archived_urls:
+                    media_blob, content_type, key = self.url_to_blob(url)
                    archived_url = self.archive_blob(media_blob, content_type, key)
-                    archived_urls[image_url] = archived_url
-
-                if post.video:
-                    logger.debug(f'Archiving video: {post.video}')
-                    media_blob, content_type, key = self.url_to_blob(post.video)
-                    archived_url = self.archive_blob(media_blob, content_type, key)
-                    archived_urls[post.video] = archived_url
+                    archived_urls[url] = archived_url

            yield ScraperResult(
                scraper=self.__version__,
@@ -52,7 +50,8 @@ class TelegramSnscrapeScraper(Scraper):
                date=post.date,
                date_archived=datetime.now(timezone.utc),
                raw_data=post.json(),
-                archived_urls=archived_urls
+                archived_urls=archived_urls,
+                media_archived=archive_media
            )

    def get_profile(self, channel: Channel) -> dict:
--- a/cisticola/scraper/telegram_telethon.py
+++ b/cisticola/scraper/telegram_telethon.py
@@ -4,10 +4,12 @@ import os
 import json
 import tempfile
 from pathlib import Path
+import time

 from loguru import logger
 from telethon.sync import TelegramClient 
 from telethon.tl.functions.channels import GetFullChannelRequest
+from telethon.tl import types

 from cisticola.base import Channel, ScraperResult
 from cisticola.scraper.base import Scraper
@@ -24,12 +26,80 @@ class TelegramTelethonScraper(Scraper):
            username = username.split('s/')[1]
        return username

+    def archive_files(self, result: ScraperResult, client : TelegramClient = None) -> ScraperResult:
+        if len(result.archived_urls.keys()) == 0:
+            return result
+
+        if client is None:
+            api_id = os.environ['TELEGRAM_API_ID']
+            api_hash = os.environ['TELEGRAM_API_HASH']
+            phone = os.environ['TELEGRAM_PHONE']
+
+            with TelegramClient(phone, api_id, api_hash) as client:
+                return self.archive_files(result, client)
+
+        if len(list(result.archived_urls.keys())) != 1:
+            logger.warning(f"Expected 1 key in archived_urls, found {result.archived_keys}")
+        else:
+            key = list(result.archived_urls.keys())[0]
+
+            if result.archived_urls[key] is None:
+                raw = json.loads(result.raw_data)
+                    
+                message = client.get_messages(raw['peer_id']['channel_id'], ids=[raw['id']])
+
+                blob = None
+                if len(message) > 0:
+                    blob, output_file_with_ext = self.archive_post_media(message[0], client)
+                else:
+                    logger.warning("No message retrieved")
+
+                if blob is not None:
+                    # TODO specify Content-Type
+                    archived_url = self.archive_blob(blob = blob, content_type = '', key = output_file_with_ext)
+                    result.archived_urls[key] = archived_url
+                    return result
+                else:
+                    logger.warning("Downloaded blob was None")
+            
+        result.media_archived = True
+        return result
+
+    def archive_post_media(self, post : types.Message, client : TelegramClient = None):
+        logger.debug(f"Archiving post {post}")
+        
+        if post.media is None:
+            return None, None
+        
+        logger.debug(f"Archiving media {post.media}")
+
+        if client is None:
+            api_id = os.environ['TELEGRAM_API_ID']
+            api_hash = os.environ['TELEGRAM_API_HASH']
+            phone = os.environ['TELEGRAM_PHONE']
+
+            with TelegramClient(phone, api_id, api_hash) as client:
+                return self.archive_post_media(post, client=client)
+
+        key = f'{post.peer_id.channel_id}_{post.id}'
+
+        with tempfile.TemporaryDirectory() as temp_dir:
+            output_file = Path(temp_dir, key)
+
+            client.download_media(post.media, output_file)
+
+            output_file_with_ext = os.listdir(temp_dir)[0]
+            filename = Path(temp_dir, output_file_with_ext)
+            
+            with open(filename, 'rb') as f:
+                blob = f.read()
+                return (blob, output_file_with_ext)
+
    def can_handle(self, channel):
        if channel.platform == "Telegram" and channel.public and not channel.chat:
            return True

    def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
-
        username = self.get_username_from_url(channel.url)

        api_id = os.environ['TELEGRAM_API_ID']
@@ -37,34 +107,27 @@ class TelegramTelethonScraper(Scraper):
        phone = os.environ['TELEGRAM_PHONE']

        with TelegramClient(phone, api_id, api_hash) as client:
-
            for post in client.iter_messages(username):
+                post_url = f'{channel.url}/{post.id}'
+
+                logger.info(f"Archiving post {post_url} from {post.date}")

                if since is not None and post.date.replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
                    logger.info(f'Timestamp of post {post} is earlier than the previous archived timestamp {post.date.replace(tzinfo=timezone.utc)}')
                    break

-                post_url = f'{channel.url}/{post.id}'
-                key = f'{username}_{post.id}'
-
                archived_urls = {}
+                logger.info(f"Archiving post {post_url}")

-                if archive_media:
+                if post.media is not None:                    
+                    archived_urls[post_url] = None

-                    if post.media is not None:
-                        with tempfile.TemporaryDirectory() as temp_dir:
-                            output_file = Path(temp_dir, key)
-                            client.download_media(post.media, output_file)
-
-                            output_file_with_ext = os.listdir(temp_dir)[0]
-                            filename = Path(temp_dir, output_file_with_ext)
-                            
-                            with open(filename, 'rb') as f:
-                                blob = f.read()
-                        
-                        # TODO specify Content-Type
-                        archived_url = self.archive_blob(blob = blob, content_type = '', key = output_file_with_ext)
-                        archived_urls[post_url] = archived_url
+                    if archive_media:
+                        blob, output_file_with_ext = self.archive_post_media(post, client)
+                        if blob is not None:
+                            # TODO specify Content-Type
+                            archived_url = self.archive_blob(blob = blob, content_type = '', key = output_file_with_ext)
+                            archived_urls[post_url] = archived_url

                yield ScraperResult(
                    scraper=self.__version__,
@@ -74,7 +137,8 @@ class TelegramTelethonScraper(Scraper):
                    date=post.date.replace(tzinfo=timezone.utc),
                    date_archived=datetime.now(timezone.utc),
                    raw_data=json.dumps(post.to_dict(), default=str),
-                    archived_urls=archived_urls)
+                    archived_urls=archived_urls,
+                    media_archived=archive_media))

    def get_profile(self, channel: Channel) -> dict:

@@ -88,4 +152,4 @@ class TelegramTelethonScraper(Scraper):
            full_channel = client(GetFullChannelRequest(channel = username))
        profile = full_channel.__dict__

-        return profile
+        return profile
--- a/cisticola/scraper/twitter.py
+++ b/cisticola/scraper/twitter.py
@@ -28,31 +28,33 @@ class TwitterScraper(Scraper):

            archived_urls = {}

-            if archive_media:
-                media_list = []
-                if tweet.media:
-                    media_list += tweet.media
+            media_list = []
+            if tweet.media:
+                media_list += tweet.media

-                if tweet.retweetedTweet and tweet.retweetedTweet.media:
-                    media_list += tweet.retweetedTweet.media
+            if tweet.retweetedTweet and tweet.retweetedTweet.media:
+                media_list += tweet.retweetedTweet.media

-                if tweet.quotedTweet and tweet.quotedTweet.media:
-                    media_list += tweet.quotedTweet.media
+            if tweet.quotedTweet and tweet.quotedTweet.media:
+                media_list += tweet.quotedTweet.media

-                for media in media_list:
-                    if type(media) == Video:
-                        variant = max(
-                            [v for v in media.variants if v.bitrate], key=lambda v: v.bitrate)
-                        url = variant.url
-                    elif type(media) == Gif:
-                        url = media.variants[0].url
-                    elif type(media) == Photo:
-                        url = media.fullUrl
-                    else:
-                        logger.warning(f"Could not get media URL of {media}")
-                        url = None
+            for media in media_list:
+                if type(media) == Video:
+                    variant = max(
+                        [v for v in media.variants if v.bitrate], key=lambda v: v.bitrate)
+                    url = variant.url
+                elif type(media) == Gif:
+                    url = media.variants[0].url
+                elif type(media) == Photo:
+                    url = media.fullUrl
+                else:
+                    logger.warning(f"Could not get media URL of {media}")
+                    url = None

-                    if url is not None and url not in archived_urls:
+                if url is not None and url not in archived_urls:
+                    archived_urls[url] = None
+
+                    if archive_media:
                        media_blob, content_type, key = self.url_to_blob(url)
                        archived_url = self.archive_blob(media_blob, content_type, key)
                        archived_urls[url] = archived_url