Add snscrape delayed media archiving support; add explicit `media_archived` bool

This commit is contained in:
Logan Williams
2022-03-28 11:42:15 +02:00
parent 63fdae9f1b
commit a80dbddbbc
4 changed files with 25 additions and 17 deletions

View File

@@ -41,6 +41,9 @@ class ScraperResult:
#: Dict in which the keys are the original media URLs from the post, and the corresponding values are the URLs of the archived media files. #: Dict in which the keys are the original media URLs from the post, and the corresponding values are the URLs of the archived media files.
archived_urls: dict archived_urls: dict
#: Has the media in this post been archived?
media_archived: bool
@dataclass @dataclass
class Channel: class Channel:
@@ -228,7 +231,8 @@ raw_data_table = Table('raw_data', mapper_registry.metadata,
Column('date', DateTime), Column('date', DateTime),
Column('raw_data', String), Column('raw_data', String),
Column('date_archived', DateTime), Column('date_archived', DateTime),
Column('archived_urls', JSON)) Column('archived_urls', JSON),
Column('media_archived', Boolean))
channel_table = Table('channels', mapper_registry.metadata, channel_table = Table('channels', mapper_registry.metadata,
Column('id', Integer, primary_key=True, autoincrement=True), Column('id', Integer, primary_key=True, autoincrement=True),

View File

@@ -241,6 +241,7 @@ class Scraper:
archived_url = self.archive_blob(media_blob, content_type, key) archived_url = self.archive_blob(media_blob, content_type, key)
result.archived_urls[url] = archived_url result.archived_urls[url] = archived_url
result.media_archived = True
return result return result
@@ -371,7 +372,7 @@ class ScraperController:
session = self.session() session = self.session()
posts = session.query(ScraperResult).filter(ScraperResult.archived_urls.like("%null%")).all() posts = session.query(ScraperResult).where(ScraperResult.media_archived == False).all()
logger.info(f"Found {len(posts)} posts without media. Archiving now") logger.info(f"Found {len(posts)} posts without media. Archiving now")
@@ -384,8 +385,10 @@ class ScraperController:
logger.info(f"{scraper} is archiving media for {post}") logger.info(f"{scraper} is archiving media for {post}")
post = scraper.archive_files(post) post = scraper.archive_files(post)
session.query(ScraperResult).where(ScraperResult.id == post.id).update({'archived_urls': post.archived_urls}) if post:
session.commit() session.query(ScraperResult).where(ScraperResult.id == post.id).update({'archived_urls': post.archived_urls, 'media_archived': True})
session.commit()
break break
if not handled: if not handled:

View File

@@ -30,19 +30,17 @@ class TelegramSnscrapeScraper(Scraper):
archived_urls = {} archived_urls = {}
for image_url in post.images:
archived_urls[image_url] = None
if post.video:
archived_urls[post.video] = None
if archive_media: if archive_media:
for url in archived_urls:
for image_url in post.images: media_blob, content_type, key = self.url_to_blob(url)
logger.debug(f'Archiving image: {image_url}')
media_blob, content_type, key = self.url_to_blob(image_url)
archived_url = self.archive_blob(media_blob, content_type, key) archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[image_url] = archived_url archived_urls[url] = archived_url
if post.video:
logger.debug(f'Archiving video: {post.video}')
media_blob, content_type, key = self.url_to_blob(post.video)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[post.video] = archived_url
yield ScraperResult( yield ScraperResult(
scraper=self.__version__, scraper=self.__version__,
@@ -52,5 +50,6 @@ class TelegramSnscrapeScraper(Scraper):
date=post.date, date=post.date,
date_archived=datetime.now(timezone.utc), date_archived=datetime.now(timezone.utc),
raw_data=post.json(), raw_data=post.json(),
archived_urls=archived_urls archived_urls=archived_urls,
media_archived=archive_media
) )

View File

@@ -61,6 +61,7 @@ class TelegramTelethonScraper(Scraper):
else: else:
logger.warning("Downloaded blob was None") logger.warning("Downloaded blob was None")
result.media_archived = True
return result return result
def archive_post_media(self, post : types.Message, client : TelegramClient = None): def archive_post_media(self, post : types.Message, client : TelegramClient = None):
@@ -135,4 +136,5 @@ class TelegramTelethonScraper(Scraper):
date=post.date.replace(tzinfo=timezone.utc), date=post.date.replace(tzinfo=timezone.utc),
date_archived=datetime.now(timezone.utc), date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post.to_dict(), default=str), raw_data=json.dumps(post.to_dict(), default=str),
archived_urls=archived_urls) archived_urls=archived_urls,
media_archived=archive_media)