mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-08 03:18:34 +03:00
got Telegram scraper tests all working
This commit is contained in:
@@ -454,6 +454,39 @@ class ScraperController:
|
||||
|
||||
session.close()
|
||||
|
||||
def archive_unarchived_media_batch(self, session = None, chronological=False):
    """Archive media for one batch of at most 5000 posts not yet archived.

    Selects ``ScraperResult`` rows whose ``media_archived`` is NULL, hands
    each one to the first scraper in ``self.scrapers`` whose version prefix
    matches the post's ``scraper`` field, and writes the resulting
    ``archived_urls`` / ``media_archived`` values back to the database.

    Parameters
    ----------
    session
        Database session to use. If ``None``, a session is created with
        ``self.session()`` and closed before this method returns. A session
        supplied by the caller is left open for the caller to close.
    chronological : bool
        If ``True``, take rows with ``id >= 0`` ordered by
        ``ScraperResult.date`` descending. Otherwise take a random sample.
    """
    # Only close the session if this call created it; the caller keeps
    # ownership of a session it passed in.
    owns_session = session is None
    if owns_session:
        session = self.session()

    try:
        # `== None` is intentional: the ORM turns it into an IS NULL test.
        pending = session.query(ScraperResult).where(ScraperResult.media_archived == None)  # noqa: E711

        if chronological:
            pending = pending.where(ScraperResult.id >= 0).order_by(ScraperResult.date.desc())
        else:
            # this query is really slow (~2.5 minutes) because of the shuffle. shuffling is so that multiple media archivers could work
            # simultaneously with low risk of collision (at least while the number of unarchived items is very large)
            pending = pending.order_by(func.random())

        posts = pending.limit(5000).all()

        logger.info(f"Found {len(posts)} posts without media. Archiving now")

        for post in posts:
            handled = False

            for scraper in self.scrapers:
                # compare major versions (the text before the first '.')
                if post.scraper is not None and scraper.__version__.split('.')[0] == post.scraper.split('.')[0]:
                    handled = True
                    logger.debug(f"{scraper} is archiving media for ID {post.id}")

                    # Kept under a separate name so the loop variable `post`
                    # is not overwritten by a possibly falsy return value.
                    archived = scraper.archive_files(post)

                    if archived:
                        session.query(ScraperResult).where(ScraperResult.id == archived.id).update({'archived_urls': archived.archived_urls, 'media_archived': archived.media_archived})
                        # Commit per post so finished work is persisted
                        # before moving on to the next one.
                        session.commit()

                    # Only the first matching scraper handles a post.
                    break

            if not handled:
                logger.warning(f"No handler found for post scraped with {post.scraper}")

        session.commit()
    finally:
        if owns_session:
            session.close()
|
||||
|
||||
@logger.catch(reraise = True)
|
||||
def archive_unarchived_media(self, chronological=False):
|
||||
if self.session is None:
|
||||
@@ -463,35 +496,10 @@ class ScraperController:
|
||||
session = self.session()
|
||||
|
||||
while True:
|
||||
if chronological:
|
||||
posts = session.query(ScraperResult).where(ScraperResult.media_archived == None).where(ScraperResult.id >= 0).order_by(ScraperResult.date.desc()).limit(5000).all()
|
||||
else:
|
||||
# this query is really slow (~2.5 minutes) because of the shuffle. shuffling is so that multiple media archivers could work
|
||||
# simultaneously with low risk of collision (at least while the number of unarchived items is very large)
|
||||
posts = session.query(ScraperResult).where(ScraperResult.media_archived == None).order_by(func.random()).limit(5000).all()
|
||||
|
||||
logger.info(f"Found {len(posts)} posts without media. Archiving now")
|
||||
|
||||
for post in posts:
|
||||
handled = False
|
||||
|
||||
for scraper in self.scrapers:
|
||||
# compare major versions
|
||||
if post.scraper is not None and scraper.__version__.split('.')[0] == post.scraper.split('.')[0]:
|
||||
handled = True
|
||||
logger.debug(f"{scraper} is archiving media for ID {post.id}")
|
||||
post = scraper.archive_files(post)
|
||||
|
||||
if post:
|
||||
session.query(ScraperResult).where(ScraperResult.id == post.id).update({'archived_urls': post.archived_urls, 'media_archived': post.media_archived})
|
||||
session.commit()
|
||||
|
||||
break
|
||||
|
||||
if not handled:
|
||||
logger.warning(f"No handler found for post scraped with {post.scraper}")
|
||||
|
||||
session.commit()
|
||||
# # DEBUG
|
||||
# assert 0
|
||||
self.archive_unarchived_media_batch(self, session=session, chronological=chronological)
|
||||
|
||||
|
||||
session.close()
|
||||
|
||||
|
||||
@@ -87,6 +87,8 @@ To run the test suite without archiving media (which can take a long time), run
|
||||
|
||||
pipenv run pytest -m "not media"
|
||||
|
||||
To see the logging output from a test run, add the ``--capture=no`` flag to the command.
|
||||
|
||||
Examples
|
||||
--------
|
||||
|
||||
|
||||
@@ -15,7 +15,7 @@ def test_scrape_telegram_telethon_channel_no_media(controller, channel_kwargs):
|
||||
@pytest.mark.unarchived
|
||||
def test_scrape_telegram_telethon_unarchived_media(controller):
|
||||
|
||||
controller.archive_unarchived_media()
|
||||
controller.archive_unarchived_media_batch()
|
||||
|
||||
@pytest.mark.media
|
||||
def test_scrape_telegram_telethon_channel(controller, channel_kwargs):
|
||||
@@ -28,7 +28,10 @@ def test_scrape_telegram_telethon_channel(controller, channel_kwargs):
|
||||
controller.scrape_channels(channels = channels, archive_media = True)
|
||||
|
||||
@pytest.mark.profile
|
||||
def test_scrape_telegram_telethon_profile(channel_kwargs):
|
||||
def test_scrape_telegram_telethon_profile(controller, channel_kwargs):
|
||||
|
||||
controller.reset_db()
|
||||
controller.remove_all_scrapers()
|
||||
|
||||
scraper = TelegramTelethonScraper()
|
||||
channel = Channel(**channel_kwargs['telegram'])
|
||||
|
||||
Reference in New Issue
Block a user