From 1ac8d6c603321b0c75e238604afff84557482e10 Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Wed, 13 Apr 2022 10:38:08 +0000 Subject: [PATCH 1/2] Close sessions; sort channel info by least recently archived --- cisticola/scraper/base.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index 65ede8b..2e19c1f 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -12,8 +12,9 @@ from sqlalchemy.orm import sessionmaker import yt_dlp from sqlalchemy.sql.expression import func from pathlib import Path +from sqlalchemy import nullsfirst -from cisticola.base import Channel, ScraperResult, mapper_registry +from cisticola.base import Channel, RawChannelInfo, ScraperResult, mapper_registry from cisticola.utils import make_request class Scraper: @@ -340,6 +341,8 @@ class ScraperController: channels = session.query(Channel).where(Channel.source=='researcher').all() + session.close() + return self.scrape_channels(channels, archive_media=archive_media) def scrape_all_channel_info(self): @@ -349,8 +352,15 @@ class ScraperController: session = self.session() - channels = session.query(Channel).where(Channel.source=='researcher').all() + # Because of rate limiting, we may not be able to successfully scrape info for all of these channels. + # This will sort the channels by the least recently scraped. 
+ most_recently_archived = session.query(func.max(RawChannelInfo.date_archived).label("date"), RawChannelInfo.channel.label("channel")).group_by(RawChannelInfo.channel).subquery() + channels = session.query(Channel).\ + where(Channel.source=='researcher').\ + outerjoin(most_recently_archived, Channel.id == most_recently_archived.c.channel).\ + order_by(nullsfirst(most_recently_archived.c.date.asc())).all() + session.close() return self.scrape_channel_info(channels) def scrape_channels(self, channels: List[Channel], archive_media: bool = True): @@ -419,6 +429,8 @@ class ScraperController: if not handled: logger.warning(f"No handler found for Channel {channel}") + session.close() + @logger.catch(reraise = True) def archive_unarchived_media(self): if self.session is None: @@ -452,6 +464,7 @@ class ScraperController: logger.warning(f"No handler found for post scraped with {post.scraper}") session.commit() + session.close() @logger.catch(reraise = True) def scrape_channel_info(self, channels: List[Channel]): @@ -470,6 +483,7 @@ class ScraperController: logger.error("No DB session") return + session = self.session() for channel in channels: handled = False @@ -479,10 +493,13 @@ class ScraperController: handled = True # get most recent post - session = self.session() try: info = scraper.get_profile(channel) + if info is None: + logger.warning(f"No info returned for {channel}") + break + session.add(info) session.commit() @@ -495,6 +512,8 @@ class ScraperController: if not handled: logger.warning(f"No handler found for Channel {channel}") + session.close() + def connect_to_db(self, engine): """Connect the specified SQLAlchemy engine to the controller. 
""" From 38e01040787adac1266488fe9b70c0a86ebec0b8 Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Thu, 14 Apr 2022 10:43:27 +0000 Subject: [PATCH 2/2] Separate logging; limit Telegram archive file size --- app.py | 5 ++++- cisticola/scraper/base.py | 3 ++- cisticola/scraper/telegram_telethon.py | 7 ++++++- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/app.py b/app.py index 3a7bc29..2367575 100644 --- a/app.py +++ b/app.py @@ -152,7 +152,6 @@ def init_db(): if __name__ == "__main__": logger.remove() logger.add(sys.stdout, level="DEBUG", catch=True) - logger.add("logs/cisticola.log", level="TRACE", rotation="100 MB") parser = argparse.ArgumentParser(description="Cisticola command line tools") parser.add_argument( @@ -172,12 +171,16 @@ if __name__ == "__main__": if args.command == "init-db": init_db() elif args.command == "sync-channels": + logger.add("logs/sync-channels.log", level="TRACE", rotation="100 MB") sync_channels(args) elif args.command == "scrape-channels": + logger.add("logs/scrape-channels.log", level="TRACE", rotation="100 MB") scrape_channels(args) elif args.command == "archive-media": + logger.add("logs/archive-media.log", level="TRACE", rotation="100 MB") archive_media(args) elif args.command == "channel-info": + logger.add("logs/channel-info.log", level="TRACE", rotation="100 MB") scrape_channel_info(args) else: logger.error(f"Unrecognized command {args.command}") diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index 2e19c1f..973fc55 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -449,7 +449,8 @@ class ScraperController: handled = False for scraper in self.scrapers: - if scraper.__version__ == post.scraper: + # compare major versions + if scraper.__version__.split('.')[0] == post.scraper.split('.')[0]: handled = True logger.debug(f"{scraper} is archiving media for ID {post.id}") post = scraper.archive_files(post) diff --git a/cisticola/scraper/telegram_telethon.py 
b/cisticola/scraper/telegram_telethon.py index 03ac0d6..3091fda 100644 --- a/cisticola/scraper/telegram_telethon.py +++ b/cisticola/scraper/telegram_telethon.py @@ -18,7 +18,7 @@ MEDIA_TYPES = ['photo', 'video', 'document', 'webpage'] class TelegramTelethonScraper(Scraper): """An implementation of a Scraper for Telegram, using Telethon library""" - __version__ = "TelegramTelethonScraper 0.0.1" + __version__ = "TelegramTelethonScraper 0.0.2" def get_username_from_url(self, url): username = url.split('https://t.me/')[1] @@ -62,6 +62,7 @@ class TelegramTelethonScraper(Scraper): result.media_archived = datetime.now(timezone.utc) else: logger.warning("Downloaded blob was None") + result.archived_urls = {} result.media_archived = datetime.now(timezone.utc) return result @@ -80,6 +81,10 @@ class TelegramTelethonScraper(Scraper): return self.archive_post_media(post, client=client) if type(post.media) == types.MessageMediaDocument: + if post.media.document.size/(1024*1024) > 50: + logger.info(f"Skipping archive of large {type(post.media)} with size {post.media.document.size/(1024*1024)} MB") + return None, None + logger.debug(f"Archiving {type(post.media)} with size {post.media.document.size/(1024*1024)} MB") else: logger.debug(f"Archiving {type(post.media)}")