got Telegram scraper tests all working

2026-06-08 03:18:34 +03:00 · 2023-08-01 10:46:50 -05:00
parent 249f411a1d
commit bd67806ed2
3 changed files with 44 additions and 31 deletions
--- a/cisticola/scraper/base.py
+++ b/cisticola/scraper/base.py
@@ -454,6 +454,39 @@ class ScraperController:

        session.close()

+    def archive_unarchived_media_batch(self, session = None, chronological=False):
+        if session is None:
+            session = self.session()
+        if chronological:
+            posts = session.query(ScraperResult).where(ScraperResult.media_archived == None).where(ScraperResult.id >= 0).order_by(ScraperResult.date.desc()).limit(5000).all()
+        else:
+            # NOTE: this query is really slow (~2.5 minutes) because ordering by func.random() shuffles the rows. The shuffle lets several
+            # media archivers run simultaneously with a low risk of collision (at least while the number of unarchived items is very large).
+            posts = session.query(ScraperResult).where(ScraperResult.media_archived == None).order_by(func.random()).limit(5000).all()
+
+        logger.info(f"Found {len(posts)} posts without media. Archiving now")
+
+        for post in posts:
+            handled = False
+
+            for scraper in self.scrapers:
+                # a scraper handles a post when the text before the first '.' in its __version__ equals that of post.scraper
+                if post.scraper is not None and scraper.__version__.split('.')[0] == post.scraper.split('.')[0]:
+                    handled = True
+                    logger.debug(f"{scraper} is archiving media for ID {post.id}")
+                    post = scraper.archive_files(post)
+
+                    if post:
+                        session.query(ScraperResult).where(ScraperResult.id == post.id).update({'archived_urls': post.archived_urls, 'media_archived': post.media_archived})
+                        session.commit()
+
+                    break
+            
+            if not handled:
+                logger.warning(f"No handler found for post scraped with {post.scraper}")
+
+        session.commit()
+                
    @logger.catch(reraise = True)
    def archive_unarchived_media(self, chronological=False):
        if self.session is None:
@@ -463,35 +496,10 @@ class ScraperController:
        session = self.session()

        while True:
-            if chronological:
-                posts = session.query(ScraperResult).where(ScraperResult.media_archived == None).where(ScraperResult.id >= 0).order_by(ScraperResult.date.desc()).limit(5000).all()
-            else:
-                # this query is really slow (~2.5 minutes) because of the shuffle. shuffling is so that multiple media archivers could work
-                # simultaneously with low risk of collision (at least while the number of unarchived items is very large)
-                posts = session.query(ScraperResult).where(ScraperResult.media_archived == None).order_by(func.random()).limit(5000).all()
-
-            logger.info(f"Found {len(posts)} posts without media. Archiving now")
-
-            for post in posts:
-                handled = False
-
-                for scraper in self.scrapers:
-                    # compare major versions
-                    if post.scraper is not None and scraper.__version__.split('.')[0] == post.scraper.split('.')[0]:
-                        handled = True
-                        logger.debug(f"{scraper} is archiving media for ID {post.id}")
-                        post = scraper.archive_files(post)
-
-                        if post:
-                            session.query(ScraperResult).where(ScraperResult.id == post.id).update({'archived_urls': post.archived_urls, 'media_archived': post.media_archived})
-                            session.commit()
-
-                        break
-                
-                if not handled:
-                    logger.warning(f"No handler found for post scraped with {post.scraper}")
-
-            session.commit()
+            # # DEBUG
+            # assert 0
+            self.archive_unarchived_media_batch(session=session, chronological=chronological)
+            
            
        session.close()

--- a/docs/source/quickstart.rst
+++ b/docs/source/quickstart.rst
@@ -87,6 +87,8 @@ To run the test suite without archiving media (which can take a long time), run

    pipenv run pytest -m "not media"

+To see the logging output from a test run, add the ``--capture=no`` flag (or its shorthand, ``-s``) to the ``pytest`` command.
+
 Examples
 --------

--- a/tests/scraper/telegram_telethon.py
+++ b/tests/scraper/telegram_telethon.py
@@ -15,7 +15,7 @@ def test_scrape_telegram_telethon_channel_no_media(controller, channel_kwargs):
@pytest.mark.unarchived
 def test_scrape_telegram_telethon_unarchived_media(controller):

-    controller.archive_unarchived_media()
+    controller.archive_unarchived_media_batch()

@pytest.mark.media
 def test_scrape_telegram_telethon_channel(controller, channel_kwargs):
@@ -28,7 +28,10 @@ def test_scrape_telegram_telethon_channel(controller, channel_kwargs):
    controller.scrape_channels(channels = channels, archive_media = True)

@pytest.mark.profile
-def test_scrape_telegram_telethon_profile(channel_kwargs):
+def test_scrape_telegram_telethon_profile(controller, channel_kwargs):
+
+    controller.reset_db()
+    controller.remove_all_scrapers()

    scraper = TelegramTelethonScraper()
    channel = Channel(**channel_kwargs['telegram'])