got Telegram scraper tests all working

This commit is contained in:
Tristan Lee
2023-08-01 10:46:50 -05:00
parent 249f411a1d
commit bd67806ed2
3 changed files with 44 additions and 31 deletions

View File

@@ -454,6 +454,39 @@ class ScraperController:
session.close()
def archive_unarchived_media_batch(self, session=None, chronological=False):
    """Archive media for a single batch of at most 5000 unarchived posts.

    Selects ``ScraperResult`` rows whose ``media_archived`` column is NULL,
    hands each one to the first registered scraper whose major version
    matches the version string stored on the post, and writes the resulting
    ``archived_urls`` / ``media_archived`` values back to the database.

    Args:
        session: An open database session to use. When ``None``, a session
            is created from ``self.session()`` and closed again before this
            method returns; a caller-supplied session is left open.
        chronological: When true, take posts with ``id >= 0`` ordered by
            ``date`` descending. Otherwise take a random sample.
    """
    # Only close what we opened; a caller-supplied session stays the caller's.
    owns_session = session is None
    if owns_session:
        session = self.session()
    try:
        # `== None` is deliberate: SQLAlchemy turns it into an IS NULL test.
        unarchived = session.query(ScraperResult).where(ScraperResult.media_archived == None)  # noqa: E711
        if chronological:
            posts = unarchived.where(ScraperResult.id >= 0).order_by(ScraperResult.date.desc()).limit(5000).all()
        else:
            # this query is really slow (~2.5 minutes) because of the shuffle. shuffling is so that multiple media archivers could work
            # simultaneously with low risk of collision (at least while the number of unarchived items is very large)
            posts = unarchived.order_by(func.random()).limit(5000).all()

        logger.info(f"Found {len(posts)} posts without media. Archiving now")

        for post in posts:
            handled = False
            for scraper in self.scrapers:
                # compare major versions
                if post.scraper is not None and scraper.__version__.split('.')[0] == post.scraper.split('.')[0]:
                    handled = True
                    logger.debug(f"{scraper} is archiving media for ID {post.id}")
                    # Keep the result in its own name: archive_files() may return a
                    # falsy value, and `post` is still needed for the log below.
                    archived = scraper.archive_files(post)
                    if archived:
                        session.query(ScraperResult).where(ScraperResult.id == archived.id).update({'archived_urls': archived.archived_urls, 'media_archived': archived.media_archived})
                        session.commit()
                    # The first scraper with a matching major version handles the post.
                    break
            if not handled:
                logger.warning(f"No handler found for post scraped with {post.scraper}")
        session.commit()
    finally:
        if owns_session:
            session.close()
@logger.catch(reraise = True)
def archive_unarchived_media(self, chronological=False):
if self.session is None:
@@ -463,35 +496,10 @@ class ScraperController:
session = self.session()
while True:
if chronological:
posts = session.query(ScraperResult).where(ScraperResult.media_archived == None).where(ScraperResult.id >= 0).order_by(ScraperResult.date.desc()).limit(5000).all()
else:
# this query is really slow (~2.5 minutes) because of the shuffle. shuffling is so that multiple media archivers could work
# simultaneously with low risk of collision (at least while the number of unarchived items is very large)
posts = session.query(ScraperResult).where(ScraperResult.media_archived == None).order_by(func.random()).limit(5000).all()
logger.info(f"Found {len(posts)} posts without media. Archiving now")
for post in posts:
handled = False
for scraper in self.scrapers:
# compare major versions
if post.scraper is not None and scraper.__version__.split('.')[0] == post.scraper.split('.')[0]:
handled = True
logger.debug(f"{scraper} is archiving media for ID {post.id}")
post = scraper.archive_files(post)
if post:
session.query(ScraperResult).where(ScraperResult.id == post.id).update({'archived_urls': post.archived_urls, 'media_archived': post.media_archived})
session.commit()
break
if not handled:
logger.warning(f"No handler found for post scraped with {post.scraper}")
session.commit()
# # DEBUG
# assert 0
# `self` is already bound by the attribute lookup; passing it again positionally
# would fill the `session` parameter and clash with the `session=` keyword
# (TypeError: got multiple values for argument 'session').
self.archive_unarchived_media_batch(session=session, chronological=chronological)
session.close()

View File

@@ -87,6 +87,8 @@ To run the test suite without archiving media (which can take a long time), run
pipenv run pytest -m "not media"
To see the logging output from a test run, add the ``--capture=no`` flag to the command.
Examples
--------

View File

@@ -15,7 +15,7 @@ def test_scrape_telegram_telethon_channel_no_media(controller, channel_kwargs):
@pytest.mark.unarchived
def test_scrape_telegram_telethon_unarchived_media(controller):
controller.archive_unarchived_media()
controller.archive_unarchived_media_batch()
@pytest.mark.media
def test_scrape_telegram_telethon_channel(controller, channel_kwargs):
@@ -28,7 +28,10 @@ def test_scrape_telegram_telethon_channel(controller, channel_kwargs):
controller.scrape_channels(channels = channels, archive_media = True)
@pytest.mark.profile
def test_scrape_telegram_telethon_profile(channel_kwargs):
def test_scrape_telegram_telethon_profile(controller, channel_kwargs):
controller.reset_db()
controller.remove_all_scrapers()
scraper = TelegramTelethonScraper()
channel = Channel(**channel_kwargs['telegram'])