From 38e01040787adac1266488fe9b70c0a86ebec0b8 Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Thu, 14 Apr 2022 10:43:27 +0000 Subject: [PATCH] Separate logging; limit Telegram archive file size --- app.py | 5 ++++- cisticola/scraper/base.py | 3 ++- cisticola/scraper/telegram_telethon.py | 7 ++++++- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/app.py b/app.py index 3a7bc29..2367575 100644 --- a/app.py +++ b/app.py @@ -152,7 +152,6 @@ def init_db(): if __name__ == "__main__": logger.remove() logger.add(sys.stdout, level="DEBUG", catch=True) - logger.add("logs/cisticola.log", level="TRACE", rotation="100 MB") parser = argparse.ArgumentParser(description="Cisticola command line tools") parser.add_argument( @@ -172,12 +171,16 @@ if __name__ == "__main__": if args.command == "init-db": init_db() elif args.command == "sync-channels": + logger.add("logs/sync-channels.log", level="TRACE", rotation="100 MB") sync_channels(args) elif args.command == "scrape-channels": + logger.add("logs/scrape-channels.log", level="TRACE", rotation="100 MB") scrape_channels(args) elif args.command == "archive-media": + logger.add("logs/archive-media.log", level="TRACE", rotation="100 MB") archive_media(args) elif args.command == "channel-info": + logger.add("logs/channel-info.log", level="TRACE", rotation="100 MB") scrape_channel_info(args) else: logger.error(f"Unrecognized command {args.command}") diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index 2e19c1f..973fc55 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -449,7 +449,8 @@ class ScraperController: handled = False for scraper in self.scrapers: - if scraper.__version__ == post.scraper: + # compare major versions + if scraper.__version__.split('.')[0] == post.scraper.split('.')[0]: handled = True logger.debug(f"{scraper} is archiving media for ID {post.id}") post = scraper.archive_files(post) diff --git a/cisticola/scraper/telegram_telethon.py b/cisticola/scraper/telegram_telethon.py index 03ac0d6..3091fda 100644 --- a/cisticola/scraper/telegram_telethon.py +++ b/cisticola/scraper/telegram_telethon.py @@ -18,7 +18,7 @@ MEDIA_TYPES = ['photo', 'video', 'document', 'webpage'] class TelegramTelethonScraper(Scraper): """An implementation of a Scraper for Telegram, using Telethon library""" - __version__ = "TelegramTelethonScraper 0.0.1" + __version__ = "TelegramTelethonScraper 0.0.2" def get_username_from_url(self, url): username = url.split('https://t.me/')[1] @@ -62,6 +62,7 @@ class TelegramTelethonScraper(Scraper): result.media_archived = datetime.now(timezone.utc) else: logger.warning("Downloaded blob was None") + result.archived_urls = {} result.media_archived = datetime.now(timezone.utc) return result @@ -80,6 +81,10 @@ class TelegramTelethonScraper(Scraper): return self.archive_post_media(post, client=client) if type(post.media) == types.MessageMediaDocument: + if post.media.document.size/(1024*1024) > 50: + logger.info(f"Skipping archive of large {type(post.media)} with size {post.media.document.size/(1024*1024)} MB") + return None, None + logger.debug(f"Archiving {type(post.media)} with size {post.media.document.size/(1024*1024)} MB") else: logger.debug(f"Archiving {type(post.media)}")