Record archived media URLs in a dictionary

This commit is contained in:
Logan Williams
2022-02-24 17:35:24 +01:00
parent a87cfd570a
commit 214287b7a8
4 changed files with 15 additions and 16 deletions

View File

@@ -1,7 +1,7 @@
from dataclasses import dataclass
from datetime import datetime
from sqlalchemy.orm import registry
from sqlalchemy import Table, Column, Integer, String, DateTime, ForeignKey
from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey
mapper_registry = registry()
@@ -17,6 +17,7 @@ class ScraperResult:
date: datetime
raw_data: str
date_archived: datetime
archived_urls: dict
raw_data_table = Table('raw_data', mapper_registry.metadata,
@@ -28,7 +29,8 @@ raw_data_table = Table('raw_data', mapper_registry.metadata,
Column('platform_id', String),
Column('date', DateTime),
Column('raw_data', String),
Column('date_archived', DateTime))
Column('date_archived', DateTime),
Column('archived_urls', JSON))
mapper_registry.map_imperatively(ScraperResult, raw_data_table)

View File

@@ -24,7 +24,7 @@ class Scraper:
def __str__(self):
return self.__version__
def archive_media(self, url: str) -> str:
def archive_media(self, url: str, key: str = None) -> str:
n_retries = 0
r = requests.get(url)
@@ -39,8 +39,9 @@ class Scraper:
blob = r.content
key = url.split('/')[-1]
key = key.split('?')[0]
if key is None:
key = url.split('/')[-1]
key = key.split('?')[0]
filename = self.__version__.replace(' ', '_') + '/' + key

View File

@@ -1,4 +1,3 @@
import cisticola.base
import cisticola.scraper.base
from typing import List
@@ -24,15 +23,15 @@ class TelegramSnscrapeScraper(cisticola.scraper.base.Scraper):
if since is not None and post.date.replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
break
raw_data = post.json()
archived_urls = {}
for image_url in post.images:
archive_url = self.archive_media(image_url)
raw_data = raw_data.replace(image_url, archive_url)
archived_urls[image_url] = archive_url
if post.video:
video_archive_url = self.archive_media(post.video)
raw_data = raw_data.replace(post.video, video_archive_url)
archived_urls[post.video] = video_archive_url
posts.append(cisticola.base.ScraperResult(
scraper=self.__version__,
@@ -40,8 +39,9 @@ class TelegramSnscrapeScraper(cisticola.scraper.base.Scraper):
channel=channel.id,
platform_id=post.url,
date=post.date,
date_archived=datetime.now(),
raw_data=raw_data
date_archived=datetime.now(timezone.utc),
raw_data=post.json(),
archived_urls=archived_urls
))
return posts

View File

@@ -1,7 +1,3 @@
# TODO/TODECIDE:
# should 'username' be a part of the Channel definition somehow?
# still need to do some planning for handling media
import cisticola
import cisticola.scraper.telegram_snscrape
@@ -35,7 +31,7 @@ controller = cisticola.ScraperController()
scraper = cisticola.scraper.telegram_snscrape.TelegramSnscrapeScraper()
controller.register_scraper(scraper)
engine = create_engine('sqlite:///test3.db')
engine = create_engine('sqlite:///test4.db')
controller.connect_to_db(engine)
controller.scrape_channels(test_channels)