yield data rather than returning a list

2026-06-08 03:18:34 +03:00 · 2022-02-24 18:58:08 +01:00
parent d163e6b3d9
commit d159c09aa4
3 changed files with 18 additions and 24 deletions
--- a/cisticola/init.py
+++ b/cisticola/init.py
@@ -4,6 +4,7 @@ import cisticola.scraper.base
 from sqlalchemy.orm import sessionmaker
 from loguru import logger

+MAX_POSTS = 10

 class ScraperController:
    """Registers scrapers, uses them to generate ScraperResults. Synchronizes
@@ -27,6 +28,10 @@ class ScraperController:

            for scraper in self.scrapers:
                if scraper.can_handle(channel):
+                    session = self.session()
+                    handled = True
+                    added = 0
+
                    # get most recent post
                    session = self.session()
                    rows = session.query(cisticola.base.ScraperResult).order_by(
@@ -38,20 +43,20 @@ class ScraperController:
                        since = None

                    posts = scraper.get_posts(channel, since=since)
-                    handled = True

+                    for post in posts:
+                        session.add(post)
+                        added += 1
+                        if added >= MAX_POSTS:
+                            break
+
+                    session.commit()
                    logger.info(
-                        f"{scraper} found {len(posts)} new posts from {channel}")
+                        f"{scraper} found {added} new posts from {channel}")
                    break

            if not handled:
                logger.warning(f"No handler found for Channel {channel}")
-            else:
-                session = self.session()
-                session.bulk_save_objects(posts)
-                session.commit()
-
-                logger.info(f"Added {len(posts)} entries to database")

    def connect_to_db(self, engine):
        # create tables
--- a/cisticola/scraper/telegram_snscrape.py
+++ b/cisticola/scraper/telegram_snscrape.py
@@ -13,16 +13,12 @@ class TelegramSnscrapeScraper(cisticola.scraper.base.Scraper):
            return True

    def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None):
-        posts = []
        scr = snscrape.modules.telegram.TelegramChannelScraper(
            channel.screenname)

        g = scr.get_items()

        for post in g:
-            if (len(posts)) >= 10:
-                break
-            
            if since is not None and post.date.replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
                break

@@ -36,7 +32,7 @@ class TelegramSnscrapeScraper(cisticola.scraper.base.Scraper):
                video_archive_url = self.archive_media(post.video)
                archived_urls[post.video] = video_archive_url

-            posts.append(cisticola.base.ScraperResult(
+            yield cisticola.base.ScraperResult(
                scraper=self.__version__,
                platform="Telegram",
                channel=channel.id,
@@ -45,6 +41,4 @@ class TelegramSnscrapeScraper(cisticola.scraper.base.Scraper):
                date_archived=datetime.now(timezone.utc),
                raw_data=post.json(),
                archived_urls=archived_urls
-            ))
-
-        return posts
+            )
--- a/cisticola/scraper/twitter.py
+++ b/cisticola/scraper/twitter.py
@@ -20,22 +20,19 @@ class TwitterScraper(cisticola.scraper.base.Scraper):
        return username

    def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]:
-        posts = []
        scraper = snscrape.modules.twitter.TwitterProfileScraper(
            TwitterScraper.get_username_from_url(channel.url))

        first = True

        for tweet in scraper.get_items():
-            if len(posts) >= 10:
-                break
-
            if since is not None and tweet.date.replace(tzinfo=timezone.utc) <= since.date_archived.replace(tzinfo=timezone.utc):
                # with TwitterProfileScraper, the first tweet could be an old pinned tweet
                if first:
                    first = False
                    continue
                else:
+                    print('too far')
                    break

            archived_urls = {}
@@ -58,7 +55,7 @@ class TwitterScraper(cisticola.scraper.base.Scraper):
                        archived_url = self.archive_media(url)
                        archived_urls[url] = archived_url

-            posts.append(cisticola.base.ScraperResult(
+            yield cisticola.base.ScraperResult(
                scraper=self.__version__,
                platform="Twitter",
                channel=channel.id,
@@ -66,9 +63,7 @@ class TwitterScraper(cisticola.scraper.base.Scraper):
                date=tweet.date,
                date_archived=datetime.now(),
                raw_data=tweet.json(),
-                archived_urls=archived_urls))
-
-        return posts
+                archived_urls=archived_urls)

    def can_handle(self, channel):
        if channel.platform == "Twitter" and TwitterScraper.get_username_from_url(channel.url) is not None: