diff --git a/cisticola/base.py b/cisticola/base.py index da811df..d2913e2 100644 --- a/cisticola/base.py +++ b/cisticola/base.py @@ -1,7 +1,7 @@ from dataclasses import dataclass from datetime import datetime from sqlalchemy.orm import registry -from sqlalchemy import Table, Column, Integer, String, DateTime, ForeignKey +from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey mapper_registry = registry() @@ -17,6 +17,7 @@ class ScraperResult: date: datetime raw_data: str date_archived: datetime + archived_urls: dict raw_data_table = Table('raw_data', mapper_registry.metadata, @@ -28,7 +29,8 @@ raw_data_table = Table('raw_data', mapper_registry.metadata, Column('platform_id', String), Column('date', DateTime), Column('raw_data', String), - Column('date_archived', DateTime)) + Column('date_archived', DateTime), + Column('archived_urls', JSON)) mapper_registry.map_imperatively(ScraperResult, raw_data_table) diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index 477ce7a..3a399ec 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -24,7 +24,7 @@ class Scraper: def __str__(self): return self.__version__ - def archive_media(self, url: str) -> str: + def archive_media(self, url: str, key: str = None) -> str: n_retries = 0 r = requests.get(url) @@ -39,8 +39,9 @@ class Scraper: blob = r.content - key = url.split('/')[-1] - key = key.split('?')[0] + if key is None: + key = url.split('/')[-1] + key = key.split('?')[0] filename = self.__version__.replace(' ', '_') + '/' + key diff --git a/cisticola/scraper/telegram_snscrape.py b/cisticola/scraper/telegram_snscrape.py index 5752d7d..6ea7fd7 100644 --- a/cisticola/scraper/telegram_snscrape.py +++ b/cisticola/scraper/telegram_snscrape.py @@ -1,4 +1,3 @@ - import cisticola.base import cisticola.scraper.base from typing import List @@ -24,15 +23,15 @@ class TelegramSnscrapeScraper(cisticola.scraper.base.Scraper): if since is not None and post.date.replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc): break - raw_data = post.json() + archived_urls = {} for image_url in post.images: archive_url = self.archive_media(image_url) - raw_data = raw_data.replace(image_url, archive_url) + archived_urls[image_url] = archive_url if post.video: video_archive_url = self.archive_media(post.video) - raw_data = raw_data.replace(post.video, video_archive_url) + archived_urls[post.video] = video_archive_url posts.append(cisticola.base.ScraperResult( scraper=self.__version__, @@ -40,8 +39,9 @@ class TelegramSnscrapeScraper(cisticola.scraper.base.Scraper): channel=channel.id, platform_id=post.url, date=post.date, - date_archived=datetime.now(), - raw_data=raw_data + date_archived=datetime.now(timezone.utc), + raw_data=post.json(), + archived_urls=archived_urls )) return posts diff --git a/test.py b/test.py index 1885487..c3e4d72 100644 --- a/test.py +++ b/test.py @@ -1,7 +1,3 @@ -# TODO/TODECIDE: -# should 'username' be a part of the Channel definition somehow? -# still need to do some planning for handling media - import cisticola import cisticola.scraper.telegram_snscrape @@ -35,7 +31,7 @@ controller = cisticola.ScraperController() scraper = cisticola.scraper.telegram_snscrape.TelegramSnscrapeScraper() controller.register_scraper(scraper) -engine = create_engine('sqlite:///test3.db') +engine = create_engine('sqlite:///test4.db') controller.connect_to_db(engine) controller.scrape_channels(test_channels)