diff --git a/cisticola/__init__.py b/cisticola/__init__.py index e87aa19..5bac90b 100644 --- a/cisticola/__init__.py +++ b/cisticola/__init__.py @@ -4,6 +4,7 @@ import cisticola.scraper.base from sqlalchemy.orm import sessionmaker from loguru import logger +MAX_POSTS = 10 class ScraperController: """Registers scrapers, uses them to generate ScraperResults. Synchronizes @@ -27,6 +28,10 @@ class ScraperController: for scraper in self.scrapers: if scraper.can_handle(channel): + session = self.session() + handled = True + added = 0 + # get most recent post session = self.session() rows = session.query(cisticola.base.ScraperResult).order_by( @@ -38,20 +43,20 @@ class ScraperController: since = None posts = scraper.get_posts(channel, since=since) - handled = True + for post in posts: + session.add(post) + added += 1 + if added >= MAX_POSTS: + break + + session.commit() logger.info( - f"{scraper} found {len(posts)} new posts from {channel}") + f"{scraper} found {added} new posts from {channel}") break if not handled: logger.warning(f"No handler found for Channel {channel}") - else: - session = self.session() - session.bulk_save_objects(posts) - session.commit() - - logger.info(f"Added {len(posts)} entries to database") def connect_to_db(self, engine): # create tables diff --git a/cisticola/scraper/telegram_snscrape.py b/cisticola/scraper/telegram_snscrape.py index 8bd8f15..c76910f 100644 --- a/cisticola/scraper/telegram_snscrape.py +++ b/cisticola/scraper/telegram_snscrape.py @@ -13,16 +13,12 @@ class TelegramSnscrapeScraper(cisticola.scraper.base.Scraper): return True def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None): - posts = [] scr = snscrape.modules.telegram.TelegramChannelScraper( channel.screenname) g = scr.get_items() for post in g: - if (len(posts)) >= 10: - break - if since is not None and post.date.replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc): break @@ -36,7 +32,7 @@ class TelegramSnscrapeScraper(cisticola.scraper.base.Scraper): video_archive_url = self.archive_media(post.video) archived_urls[post.video] = video_archive_url - posts.append(cisticola.base.ScraperResult( + yield cisticola.base.ScraperResult( scraper=self.__version__, platform="Telegram", channel=channel.id, @@ -45,6 +41,4 @@ class TelegramSnscrapeScraper(cisticola.scraper.base.Scraper): date_archived=datetime.now(timezone.utc), raw_data=post.json(), archived_urls=archived_urls - )) - - return posts + ) diff --git a/cisticola/scraper/twitter.py b/cisticola/scraper/twitter.py index bb85f48..41287ab 100644 --- a/cisticola/scraper/twitter.py +++ b/cisticola/scraper/twitter.py @@ -20,22 +20,19 @@ class TwitterScraper(cisticola.scraper.base.Scraper): return username def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]: - posts = [] scraper = snscrape.modules.twitter.TwitterProfileScraper( TwitterScraper.get_username_from_url(channel.url)) first = True for tweet in scraper.get_items(): - if len(posts) >= 10: - break - if since is not None and tweet.date.replace(tzinfo=timezone.utc) <= since.date_archived.replace(tzinfo=timezone.utc): # with TwitterProfileScraper, the first tweet could be an old pinned tweet if first: first = False continue else: + print('too far') break archived_urls = {} @@ -58,7 +55,7 @@ class TwitterScraper(cisticola.scraper.base.Scraper): archived_url = self.archive_media(url) archived_urls[url] = archived_url - posts.append(cisticola.base.ScraperResult( + yield cisticola.base.ScraperResult( scraper=self.__version__, platform="Twitter", channel=channel.id, @@ -66,9 +63,7 @@ class TwitterScraper(cisticola.scraper.base.Scraper): date=tweet.date, date_archived=datetime.now(), raw_data=tweet.json(), - archived_urls=archived_urls)) - - return posts + archived_urls=archived_urls) def can_handle(self, channel): if channel.platform == "Twitter" and TwitterScraper.get_username_from_url(channel.url) is not None: