mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-13 05:48:33 +03:00
Add snscrape delayed media archiving support; add explicit bool
This commit is contained in:
@@ -41,6 +41,9 @@ class ScraperResult:
|
|||||||
|
|
||||||
#: Dict in which the keys are the original media URLs from the post, and the corresponding values are the URLs of the archived media files.
|
#: Dict in which the keys are the original media URLs from the post, and the corresponding values are the URLs of the archived media files.
|
||||||
archived_urls: dict
|
archived_urls: dict
|
||||||
|
|
||||||
|
#: Has the media in this post been archived?
|
||||||
|
media_archived: bool
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Channel:
|
class Channel:
|
||||||
@@ -228,7 +231,8 @@ raw_data_table = Table('raw_data', mapper_registry.metadata,
|
|||||||
Column('date', DateTime),
|
Column('date', DateTime),
|
||||||
Column('raw_data', String),
|
Column('raw_data', String),
|
||||||
Column('date_archived', DateTime),
|
Column('date_archived', DateTime),
|
||||||
Column('archived_urls', JSON))
|
Column('archived_urls', JSON),
|
||||||
|
Column('media_archived', Boolean))
|
||||||
|
|
||||||
channel_table = Table('channels', mapper_registry.metadata,
|
channel_table = Table('channels', mapper_registry.metadata,
|
||||||
Column('id', Integer, primary_key=True, autoincrement=True),
|
Column('id', Integer, primary_key=True, autoincrement=True),
|
||||||
|
|||||||
@@ -241,6 +241,7 @@ class Scraper:
|
|||||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||||
result.archived_urls[url] = archived_url
|
result.archived_urls[url] = archived_url
|
||||||
|
|
||||||
|
result.media_archived = True
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
@@ -371,7 +372,7 @@ class ScraperController:
|
|||||||
|
|
||||||
session = self.session()
|
session = self.session()
|
||||||
|
|
||||||
posts = session.query(ScraperResult).filter(ScraperResult.archived_urls.like("%null%")).all()
|
posts = session.query(ScraperResult).where(ScraperResult.media_archived == False).all()
|
||||||
|
|
||||||
logger.info(f"Found {len(posts)} posts without media. Archiving now")
|
logger.info(f"Found {len(posts)} posts without media. Archiving now")
|
||||||
|
|
||||||
@@ -384,8 +385,10 @@ class ScraperController:
|
|||||||
logger.info(f"{scraper} is archiving media for {post}")
|
logger.info(f"{scraper} is archiving media for {post}")
|
||||||
post = scraper.archive_files(post)
|
post = scraper.archive_files(post)
|
||||||
|
|
||||||
session.query(ScraperResult).where(ScraperResult.id == post.id).update({'archived_urls': post.archived_urls})
|
if post:
|
||||||
session.commit()
|
session.query(ScraperResult).where(ScraperResult.id == post.id).update({'archived_urls': post.archived_urls, 'media_archived': True})
|
||||||
|
session.commit()
|
||||||
|
|
||||||
break
|
break
|
||||||
|
|
||||||
if not handled:
|
if not handled:
|
||||||
|
|||||||
@@ -30,19 +30,17 @@ class TelegramSnscrapeScraper(Scraper):
|
|||||||
|
|
||||||
archived_urls = {}
|
archived_urls = {}
|
||||||
|
|
||||||
|
for image_url in post.images:
|
||||||
|
archived_urls[image_url] = None
|
||||||
|
|
||||||
|
if post.video:
|
||||||
|
archived_urls[post.video] = None
|
||||||
|
|
||||||
if archive_media:
|
if archive_media:
|
||||||
|
for url in archived_urls:
|
||||||
for image_url in post.images:
|
media_blob, content_type, key = self.url_to_blob(url)
|
||||||
logger.debug(f'Archiving image: {image_url}')
|
|
||||||
media_blob, content_type, key = self.url_to_blob(image_url)
|
|
||||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||||
archived_urls[image_url] = archived_url
|
archived_urls[url] = archived_url
|
||||||
|
|
||||||
if post.video:
|
|
||||||
logger.debug(f'Archiving video: {post.video}')
|
|
||||||
media_blob, content_type, key = self.url_to_blob(post.video)
|
|
||||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
|
||||||
archived_urls[post.video] = archived_url
|
|
||||||
|
|
||||||
yield ScraperResult(
|
yield ScraperResult(
|
||||||
scraper=self.__version__,
|
scraper=self.__version__,
|
||||||
@@ -52,5 +50,6 @@ class TelegramSnscrapeScraper(Scraper):
|
|||||||
date=post.date,
|
date=post.date,
|
||||||
date_archived=datetime.now(timezone.utc),
|
date_archived=datetime.now(timezone.utc),
|
||||||
raw_data=post.json(),
|
raw_data=post.json(),
|
||||||
archived_urls=archived_urls
|
archived_urls=archived_urls,
|
||||||
|
media_archived=archive_media
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -61,6 +61,7 @@ class TelegramTelethonScraper(Scraper):
|
|||||||
else:
|
else:
|
||||||
logger.warning("Downloaded blob was None")
|
logger.warning("Downloaded blob was None")
|
||||||
|
|
||||||
|
result.media_archived = True
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def archive_post_media(self, post : types.Message, client : TelegramClient = None):
|
def archive_post_media(self, post : types.Message, client : TelegramClient = None):
|
||||||
@@ -135,4 +136,5 @@ class TelegramTelethonScraper(Scraper):
|
|||||||
date=post.date.replace(tzinfo=timezone.utc),
|
date=post.date.replace(tzinfo=timezone.utc),
|
||||||
date_archived=datetime.now(timezone.utc),
|
date_archived=datetime.now(timezone.utc),
|
||||||
raw_data=json.dumps(post.to_dict(), default=str),
|
raw_data=json.dumps(post.to_dict(), default=str),
|
||||||
archived_urls=archived_urls)
|
archived_urls=archived_urls,
|
||||||
|
media_archived=archive_media)
|
||||||
|
|||||||
Reference in New Issue
Block a user