From a80dbddbbc2e8ced29041795c6002b70bdd0f683 Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Mon, 28 Mar 2022 11:42:15 +0200 Subject: [PATCH] Add snscrape delayed media archiving support; add explicit bool --- cisticola/base.py | 6 +++++- cisticola/scraper/base.py | 9 ++++++--- cisticola/scraper/telegram_snscrape.py | 23 +++++++++++------------ cisticola/scraper/telegram_telethon.py | 4 +++- 4 files changed, 25 insertions(+), 17 deletions(-) diff --git a/cisticola/base.py b/cisticola/base.py index decaab8..ff7f136 100644 --- a/cisticola/base.py +++ b/cisticola/base.py @@ -41,6 +41,9 @@ class ScraperResult: #: Dict in which the keys are the original media URLs from the post, and the corresponding values are the URLs of the archived media files. archived_urls: dict + + #: Has the media in this post been archived? + media_archived: bool @dataclass class Channel: @@ -228,7 +231,8 @@ raw_data_table = Table('raw_data', mapper_registry.metadata, Column('date', DateTime), Column('raw_data', String), Column('date_archived', DateTime), - Column('archived_urls', JSON)) + Column('archived_urls', JSON), + Column('media_archived', Boolean)) channel_table = Table('channels', mapper_registry.metadata, Column('id', Integer, primary_key=True, autoincrement=True), diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index 9fb2029..c887ee1 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -241,6 +241,7 @@ class Scraper: archived_url = self.archive_blob(media_blob, content_type, key) result.archived_urls[url] = archived_url + result.media_archived = True return result @@ -371,7 +372,7 @@ class ScraperController: session = self.session() - posts = session.query(ScraperResult).filter(ScraperResult.archived_urls.like("%null%")).all() + posts = session.query(ScraperResult).where(ScraperResult.media_archived == False).all() logger.info(f"Found {len(posts)} posts without media. Archiving now") @@ -384,8 +385,10 @@ class ScraperController: logger.info(f"{scraper} is archiving media for {post}") post = scraper.archive_files(post) - session.query(ScraperResult).where(ScraperResult.id == post.id).update({'archived_urls': post.archived_urls}) - session.commit() + if post: + session.query(ScraperResult).where(ScraperResult.id == post.id).update({'archived_urls': post.archived_urls, 'media_archived': True}) + session.commit() + break if not handled: diff --git a/cisticola/scraper/telegram_snscrape.py b/cisticola/scraper/telegram_snscrape.py index ec5b292..a0b758b 100644 --- a/cisticola/scraper/telegram_snscrape.py +++ b/cisticola/scraper/telegram_snscrape.py @@ -30,19 +30,17 @@ class TelegramSnscrapeScraper(Scraper): archived_urls = {} + for image_url in post.images: + archived_urls[image_url] = None + + if post.video: + archived_urls[post.video] = None + if archive_media: - - for image_url in post.images: - logger.debug(f'Archiving image: {image_url}') - media_blob, content_type, key = self.url_to_blob(image_url) + for url in archived_urls: + media_blob, content_type, key = self.url_to_blob(url) archived_url = self.archive_blob(media_blob, content_type, key) - archived_urls[image_url] = archived_url - - if post.video: - logger.debug(f'Archiving video: {post.video}') - media_blob, content_type, key = self.url_to_blob(post.video) - archived_url = self.archive_blob(media_blob, content_type, key) - archived_urls[post.video] = archived_url + archived_urls[url] = archived_url yield ScraperResult( scraper=self.__version__, @@ -52,5 +50,6 @@ class TelegramSnscrapeScraper(Scraper): date=post.date, date_archived=datetime.now(timezone.utc), raw_data=post.json(), - archived_urls=archived_urls + archived_urls=archived_urls, + media_archived=archive_media ) diff --git a/cisticola/scraper/telegram_telethon.py b/cisticola/scraper/telegram_telethon.py index 34532dc..e9740c9 100644 --- a/cisticola/scraper/telegram_telethon.py +++ b/cisticola/scraper/telegram_telethon.py @@ -61,6 +61,7 @@ class TelegramTelethonScraper(Scraper): else: logger.warning("Downloaded blob was None") + result.media_archived = True return result def archive_post_media(self, post : types.Message, client : TelegramClient = None): @@ -135,4 +136,5 @@ class TelegramTelethonScraper(Scraper): date=post.date.replace(tzinfo=timezone.utc), date_archived=datetime.now(timezone.utc), raw_data=json.dumps(post.to_dict(), default=str), - archived_urls=archived_urls) + archived_urls=archived_urls, + media_archived=archive_media)