mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-13 13:58:33 +03:00
Implement media archiving after the initial scrape for Twitter and Telethon
This commit is contained in:
@@ -234,6 +234,16 @@ class Scraper:
|
||||
|
||||
return archived_url
|
||||
|
||||
def archive_files(self, result: ScraperResult) -> ScraperResult:
|
||||
for url in result.archived_urls:
|
||||
if result.archived_urls[url] is None:
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||
result.archived_urls[url] = archived_url
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def can_handle(self, channel: Channel) -> bool:
|
||||
"""Whether or not the scraper can scrape the specified channel.
|
||||
|
||||
@@ -353,6 +363,36 @@ class ScraperController:
|
||||
if not handled:
|
||||
logger.warning(f"No handler found for Channel {channel}")
|
||||
|
||||
@logger.catch(reraise = True)
|
||||
def archive_unarchived_media(self):
|
||||
if self.session is None:
|
||||
logger.error("No DB session")
|
||||
return
|
||||
|
||||
session = self.session()
|
||||
|
||||
posts = session.query(ScraperResult).filter(ScraperResult.archived_urls.like("%null%")).all()
|
||||
|
||||
logger.info(f"Found {len(posts)} posts without media. Archiving now")
|
||||
|
||||
for post in posts:
|
||||
handled = False
|
||||
|
||||
for scraper in self.scrapers:
|
||||
if scraper.__version__ == post.scraper:
|
||||
handled = True
|
||||
logger.info(f"{scraper} is archiving media for {post}")
|
||||
post = scraper.archive_files(post)
|
||||
|
||||
session.query(ScraperResult).where(ScraperResult.id == post.id).update({'archived_urls': post.archived_urls})
|
||||
session.commit()
|
||||
break
|
||||
|
||||
if not handled:
|
||||
logger.warning(f"No handler found for post scraped with {post.scraper}")
|
||||
|
||||
session.commit()
|
||||
|
||||
def connect_to_db(self, engine):
|
||||
"""Connect the specified SQLAlchemy engine to the controller.
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user