Add snscrape delayed media archiving support; add explicit `media_archived` bool

This commit is contained in:
Logan Williams
2022-03-28 11:42:15 +02:00
parent 63fdae9f1b
commit a80dbddbbc
4 changed files with 25 additions and 17 deletions

View File

@@ -41,6 +41,9 @@ class ScraperResult:
#: Dict in which the keys are the original media URLs from the post, and the corresponding values are the URLs of the archived media files.
archived_urls: dict
#: Has the media in this post been archived?
media_archived: bool
@dataclass
class Channel:
@@ -228,7 +231,8 @@ raw_data_table = Table('raw_data', mapper_registry.metadata,
Column('date', DateTime),
Column('raw_data', String),
Column('date_archived', DateTime),
Column('archived_urls', JSON))
Column('archived_urls', JSON),
Column('media_archived', Boolean))
channel_table = Table('channels', mapper_registry.metadata,
Column('id', Integer, primary_key=True, autoincrement=True),

View File

@@ -241,6 +241,7 @@ class Scraper:
archived_url = self.archive_blob(media_blob, content_type, key)
result.archived_urls[url] = archived_url
result.media_archived = True
return result
@@ -371,7 +372,7 @@ class ScraperController:
session = self.session()
posts = session.query(ScraperResult).filter(ScraperResult.archived_urls.like("%null%")).all()
posts = session.query(ScraperResult).where(ScraperResult.media_archived == False).all()
logger.info(f"Found {len(posts)} posts without media. Archiving now")
@@ -384,8 +385,10 @@ class ScraperController:
logger.info(f"{scraper} is archiving media for {post}")
post = scraper.archive_files(post)
session.query(ScraperResult).where(ScraperResult.id == post.id).update({'archived_urls': post.archived_urls})
session.commit()
if post:
session.query(ScraperResult).where(ScraperResult.id == post.id).update({'archived_urls': post.archived_urls, 'media_archived': True})
session.commit()
break
if not handled:

View File

@@ -30,19 +30,17 @@ class TelegramSnscrapeScraper(Scraper):
archived_urls = {}
for image_url in post.images:
archived_urls[image_url] = None
if post.video:
archived_urls[post.video] = None
if archive_media:
for image_url in post.images:
logger.debug(f'Archiving image: {image_url}')
media_blob, content_type, key = self.url_to_blob(image_url)
for url in archived_urls:
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[image_url] = archived_url
if post.video:
logger.debug(f'Archiving video: {post.video}')
media_blob, content_type, key = self.url_to_blob(post.video)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[post.video] = archived_url
archived_urls[url] = archived_url
yield ScraperResult(
scraper=self.__version__,
@@ -52,5 +50,6 @@ class TelegramSnscrapeScraper(Scraper):
date=post.date,
date_archived=datetime.now(timezone.utc),
raw_data=post.json(),
archived_urls=archived_urls
archived_urls=archived_urls,
media_archived=archive_media
)

View File

@@ -61,6 +61,7 @@ class TelegramTelethonScraper(Scraper):
else:
logger.warning("Downloaded blob was None")
result.media_archived = True
return result
def archive_post_media(self, post : types.Message, client : TelegramClient = None):
@@ -135,4 +136,5 @@ class TelegramTelethonScraper(Scraper):
date=post.date.replace(tzinfo=timezone.utc),
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post.to_dict(), default=str),
archived_urls=archived_urls)
archived_urls=archived_urls,
media_archived=archive_media)