diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py
index a246023..b220550 100644
--- a/cisticola/scraper/base.py
+++ b/cisticola/scraper/base.py
@@ -454,6 +454,56 @@ class ScraperController:
 
         session.close()
 
+    def archive_unarchived_media_batch(self, session = None, chronological=False):
+        """Archive media for a single batch of up to 5000 posts whose ``media_archived`` is unset.
+
+        Each post is handed to the first registered scraper whose ``__version__`` matches
+        the post's ``scraper`` field up to the first ``.``; the result is committed per post.
+
+        :param session: database session to use. If None, one is created from
+            ``self.session`` and closed again before this method returns.
+        :param chronological: if True, take posts with ``id >= 0`` ordered by date
+            descending; otherwise take posts in random order.
+        """
+        owns_session = session is None
+        if owns_session:
+            session = self.session()
+
+        try:
+            if chronological:
+                posts = session.query(ScraperResult).where(ScraperResult.media_archived == None).where(ScraperResult.id >= 0).order_by(ScraperResult.date.desc()).limit(5000).all()
+            else:
+                # this query is really slow (~2.5 minutes) because of the shuffle. shuffling is so that multiple media archivers could work
+                # simultaneously with low risk of collision (at least while the number of unarchived items is very large)
+                posts = session.query(ScraperResult).where(ScraperResult.media_archived == None).order_by(func.random()).limit(5000).all()
+
+            logger.info(f"Found {len(posts)} posts without media. Archiving now")
+
+            for post in posts:
+                handled = False
+
+                for scraper in self.scrapers:
+                    # compare major versions
+                    if post.scraper is not None and scraper.__version__.split('.')[0] == post.scraper.split('.')[0]:
+                        handled = True
+                        logger.debug(f"{scraper} is archiving media for ID {post.id}")
+                        post = scraper.archive_files(post)
+
+                        if post:
+                            session.query(ScraperResult).where(ScraperResult.id == post.id).update({'archived_urls': post.archived_urls, 'media_archived': post.media_archived})
+                            session.commit()
+
+                        break
+
+                if not handled:
+                    logger.warning(f"No handler found for post scraped with {post.scraper}")
+
+            session.commit()
+        finally:
+            # close only a session opened here; a caller-supplied session stays open for reuse
+            if owns_session:
+                session.close()
+
     @logger.catch(reraise = True)
     def archive_unarchived_media(self, chronological=False):
         if self.session is None:
@@ -463,35 +513,8 @@ class ScraperController:
         session = self.session()
 
         while True:
-            if chronological:
-                posts = session.query(ScraperResult).where(ScraperResult.media_archived == None).where(ScraperResult.id >= 0).order_by(ScraperResult.date.desc()).limit(5000).all()
-            else:
-                # this query is really slow (~2.5 minutes) because of the shuffle. shuffling is so that multiple media archivers could work
-                # simultaneously with low risk of collision (at least while the number of unarchived items is very large)
-                posts = session.query(ScraperResult).where(ScraperResult.media_archived == None).order_by(func.random()).limit(5000).all()
-
-            logger.info(f"Found {len(posts)} posts without media. Archiving now")
-
-            for post in posts:
-                handled = False
-
-                for scraper in self.scrapers:
-                    # compare major versions
-                    if post.scraper is not None and scraper.__version__.split('.')[0] == post.scraper.split('.')[0]:
-                        handled = True
-                        logger.debug(f"{scraper} is archiving media for ID {post.id}")
-                        post = scraper.archive_files(post)
-
-                        if post:
-                            session.query(ScraperResult).where(ScraperResult.id == post.id).update({'archived_urls': post.archived_urls, 'media_archived': post.media_archived})
-                            session.commit()
-
-                        break
-
-                if not handled:
-                    logger.warning(f"No handler found for post scraped with {post.scraper}")
-
-            session.commit()
+            self.archive_unarchived_media_batch(session=session, chronological=chronological)
+
         session.close()
 
 
diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst
index 4dd87ce..c86d956 100644
--- a/docs/source/quickstart.rst
+++ b/docs/source/quickstart.rst
@@ -87,6 +87,8 @@ To run the test suite without archiving media (which can take a long time), run
 
     pipenv run pytest -m "not media"
 
+To see the logging output from a test run, add the ``--capture=no`` flag to the command.
+
 Examples
 --------
 
diff --git a/tests/scraper/telegram_telethon.py b/tests/scraper/telegram_telethon.py
index 8dbe9ff..f1f9be2 100644
--- a/tests/scraper/telegram_telethon.py
+++ b/tests/scraper/telegram_telethon.py
@@ -15,7 +15,7 @@ def test_scrape_telegram_telethon_channel_no_media(controller, channel_kwargs):
 
 @pytest.mark.unarchived
 def test_scrape_telegram_telethon_unarchived_media(controller):
-    controller.archive_unarchived_media()
+    controller.archive_unarchived_media_batch()
 
 @pytest.mark.media
 def test_scrape_telegram_telethon_channel(controller, channel_kwargs):
@@ -28,7 +28,10 @@ def test_scrape_telegram_telethon_channel(controller, channel_kwargs):
     controller.scrape_channels(channels = channels, archive_media = True)
 
 @pytest.mark.profile
-def test_scrape_telegram_telethon_profile(channel_kwargs):
+def test_scrape_telegram_telethon_profile(controller, channel_kwargs):
+
+    controller.reset_db()
+    controller.remove_all_scrapers()
     scraper = TelegramTelethonScraper()
     channel = Channel(**channel_kwargs['telegram'])
 