From a82ec15f0ec28a74e6c09043d916e314152cfa30 Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Sun, 3 Apr 2022 12:02:27 +0200 Subject: [PATCH] Change archived_media to be timestamp for all scrapers --- app.py | 11 +++++++++-- cisticola/base.py | 4 ++-- cisticola/scraper/base.py | 5 ++++- cisticola/scraper/bitchute.py | 4 ++-- cisticola/scraper/gab.py | 4 ++-- cisticola/scraper/gettr.py | 4 ++-- cisticola/scraper/instagram.py | 8 ++++---- cisticola/scraper/odysee.py | 10 +++++----- cisticola/scraper/rumble.py | 6 +++--- cisticola/scraper/telegram_snscrape.py | 4 ++-- cisticola/scraper/telegram_telethon.py | 4 ++-- cisticola/scraper/twitter.py | 4 ++-- cisticola/scraper/vkontakte.py | 2 +- cisticola/scraper/youtube.py | 6 +++--- cisticola/transformer/bitchute.py | 4 ++-- cisticola/transformer/twitter.py | 2 +- 16 files changed, 46 insertions(+), 36 deletions(-) diff --git a/app.py b/app.py index 9b87873..d5ff286 100644 --- a/app.py +++ b/app.py @@ -11,7 +11,10 @@ from cisticola.base import Channel, mapper_registry from cisticola.scraper import ( ScraperController, VkontakteScraper, - TelegramTelethonScraper) + TelegramTelethonScraper, + GettrScraper, + OdyseeScraper, + RumbleScraper) def sync_channels(args): logger.info("Synchronizing channels") @@ -78,7 +81,11 @@ def get_scraper_controller(): scrapers = [ TelegramTelethonScraper(), - VkontakteScraper()] + VkontakteScraper(), + GettrScraper(), + OdyseeScraper(), + RumbleScraper() + ] controller.register_scrapers(scrapers) diff --git a/cisticola/base.py b/cisticola/base.py index b53ed35..bcc4618 100644 --- a/cisticola/base.py +++ b/cisticola/base.py @@ -34,7 +34,7 @@ class ScraperResult: date: datetime #: JSON dump of dict that contains all data scraped for the post. - raw_posts: str + raw_data: str #: Datetime (relative to UTC) that the scraped post was archived at. date_archived: datetime @@ -249,7 +249,7 @@ raw_posts_table = Table('raw_posts', mapper_registry.metadata, Column('channel', Integer, ForeignKey('channels.id')), Column('platform_id', String), Column('date', DateTime), - Column('raw_posts', String), + Column('raw_data', String), Column('date_archived', DateTime), Column('archived_urls', JSON), Column('media_archived', DateTime)) diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index 8fa212b..51c5f8e 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -257,7 +257,7 @@ class Scraper: archived_url = self.archive_blob(media_blob, content_type, key) result.archived_urls[url] = archived_url - result.media_archived = True + result.media_archived = datetime.now(timezone.utc) return result def can_handle(self, channel: Channel) -> bool: @@ -402,6 +402,9 @@ class ScraperController: session.commit() added += 1 + if added >= 200: + break + session.commit() logger.info( f"{scraper} found {added} new posts from {channel}") diff --git a/cisticola/scraper/bitchute.py b/cisticola/scraper/bitchute.py index 5afa4e1..fda24ff 100644 --- a/cisticola/scraper/bitchute.py +++ b/cisticola/scraper/bitchute.py @@ -62,9 +62,9 @@ class BitchuteScraper(Scraper): platform_id=post['id'], date=datetime.fromtimestamp(post['timestamp']), date_archived=datetime.now(timezone.utc), - raw_posts=json.dumps(post), + raw_data=json.dumps(post), archived_urls=archived_urls, - media_archived=archive_media) + media_archived=datetime.now(timezone.utc) if archive_media else None) def can_handle(self, channel): if channel.platform == "Bitchute" and self.get_username_from_url(channel.url) is not None: diff --git a/cisticola/scraper/gab.py b/cisticola/scraper/gab.py index ab1cdf3..2f25655 100644 --- a/cisticola/scraper/gab.py +++ b/cisticola/scraper/gab.py @@ -81,9 +81,9 @@ class GabScraper(Scraper): platform_id=post['id'], date=datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")).replace(tzinfo=timezone.utc), date_archived=datetime.now(timezone.utc), - raw_posts=json.dumps(post), + raw_data=json.dumps(post), archived_urls=archived_urls, - media_archived=archive_media) + media_archived=datetime.now(timezone.utc) if archive_media else None) def can_handle(self, channel: Channel) -> bool: if channel.platform == "Gab" and self.get_username_from_url(channel.url) is not None: diff --git a/cisticola/scraper/gettr.py b/cisticola/scraper/gettr.py index f785771..6f28a75 100644 --- a/cisticola/scraper/gettr.py +++ b/cisticola/scraper/gettr.py @@ -59,9 +59,9 @@ class GettrScraper(Scraper): platform_id=post['_id'], date=datetime.fromtimestamp(post['cdate']/1000.), date_archived=datetime.now(timezone.utc), - raw_posts=json.dumps(post), + raw_data=json.dumps(post), archived_urls=archived_urls, - media_archived=archive_media) + media_archived=datetime.now(timezone.utc) if archive_media else None) def can_handle(self, channel): if channel.platform == "Gettr" and self.get_username_from_url(channel.url) is not None: diff --git a/cisticola/scraper/instagram.py b/cisticola/scraper/instagram.py index 4dbc205..1f25cbb 100644 --- a/cisticola/scraper/instagram.py +++ b/cisticola/scraper/instagram.py @@ -66,9 +66,9 @@ class InstagramScraper(Scraper): platform_id=post.mediaid, date=post.date_utc, date_archived=datetime.now(timezone.utc), - raw_posts=json.dumps(post._asdict(), default=str), + raw_data=json.dumps(post._asdict(), default=str), archived_urls=archived_urls, - media_archived=archive_media) + media_archived=datetime.now(timezone.utc) if archive_media else None) for comment in post.get_comments(): @@ -83,9 +83,9 @@ class InstagramScraper(Scraper): platform_id=post.mediaid, date=comment.created_at_utc, date_archived=datetime.now(timezone.utc), - raw_posts=json.dumps(comment_dict, default=str), + raw_data=json.dumps(comment_dict, default=str), archived_urls={}, - media_archived=True) + media_archived=datetime.now(timezone.utc)) def can_handle(self, channel): if channel.platform == "Instagram" and self.get_username_from_url(channel.url) is not None: diff --git a/cisticola/scraper/odysee.py b/cisticola/scraper/odysee.py index 0f7a3fe..25788fe 100644 --- a/cisticola/scraper/odysee.py +++ b/cisticola/scraper/odysee.py @@ -62,9 +62,9 @@ class OdyseeScraper(Scraper): platform_id=video.info['claim_id'], date=datetime.fromtimestamp(video.info['created']), date_archived=datetime.now(timezone.utc), - raw_posts=json.dumps(video.info), + raw_data=json.dumps(video.info), archived_urls=archived_urls, - media_archived=archive_media) + media_archived=datetime.now(timezone.utc) if archive_media else None) for comment in all_comments: @@ -75,9 +75,9 @@ class OdyseeScraper(Scraper): platform_id=comment.info['claim_id'], date=datetime.fromtimestamp(comment.info['created']), date_archived=datetime.now(), - raw_posts=json.dumps(comment.info), + raw_data=json.dumps(comment.info), archived_urls={}, - media_archived=True) + media_archived=datetime.now(timezone.utc)) def archive_files(self, result: ScraperResult) -> ScraperResult: for url in result.archived_urls: @@ -91,7 +91,7 @@ class OdyseeScraper(Scraper): archived_url = self.archive_blob(media_blob, content_type, key) result.archived_urls[url] = archived_url - result.media_archived = True + result.media_archived = datetime.now(timezone.utc) return result def can_handle(self, channel): diff --git a/cisticola/scraper/rumble.py b/cisticola/scraper/rumble.py index 37f1b53..c75c947 100644 --- a/cisticola/scraper/rumble.py +++ b/cisticola/scraper/rumble.py @@ -41,9 +41,9 @@ class RumbleScraper(Scraper): platform_id=post['media_url'].split('/')[-2], date=post['datetime'].replace(tzinfo=timezone.utc), date_archived=datetime.now(timezone.utc), - raw_posts=json.dumps(post, default = str), + raw_data=json.dumps(post, default = str), archived_urls=archived_urls, - media_archived=archive_media) + media_archived=datetime.now(timezone.utc) if archive_media else None) def url_to_key(self, url: str, content_type: str) -> str: ext = '.' + content_type.split('/')[-1] @@ -57,7 +57,7 @@ class RumbleScraper(Scraper): archived_url = self.archive_blob(media_blob, content_type, key) result.archived_urls[url] = archived_url - result.media_archived = True + result.media_archived = datetime.now(timezone.utc) return result def can_handle(self, channel): diff --git a/cisticola/scraper/telegram_snscrape.py b/cisticola/scraper/telegram_snscrape.py index 9b91203..6593917 100644 --- a/cisticola/scraper/telegram_snscrape.py +++ b/cisticola/scraper/telegram_snscrape.py @@ -50,9 +50,9 @@ class TelegramSnscrapeScraper(Scraper): platform_id=post.url, date=post.date, date_archived=datetime.now(timezone.utc), - raw_posts=post.json(), + raw_data=post.json(), archived_urls=archived_urls, - media_archived=archive_media + media_archived=datetime.now(timezone.utc) if archive_media else None ) def get_profile(self, channel: Channel) -> RawChannelInfo: diff --git a/cisticola/scraper/telegram_telethon.py b/cisticola/scraper/telegram_telethon.py index 6c7eb16..a2d0b7d 100644 --- a/cisticola/scraper/telegram_telethon.py +++ b/cisticola/scraper/telegram_telethon.py @@ -44,7 +44,7 @@ class TelegramTelethonScraper(Scraper): key = list(result.archived_urls.keys())[0] if result.archived_urls[key] is None: - raw = json.loads(result.raw_posts) + raw = json.loads(result.raw_data) message = client.get_messages(raw['peer_id']['channel_id'], ids=[raw['id']]) @@ -144,7 +144,7 @@ class TelegramTelethonScraper(Scraper): platform_id=post_url, date=post.date.replace(tzinfo=timezone.utc), date_archived=datetime.now(timezone.utc), - raw_posts=json.dumps(post.to_dict(), default=str), + raw_data=json.dumps(post.to_dict(), default=str), archived_urls=archived_urls, media_archived=datetime.now(timezone.utc) if archive_media else None) diff --git a/cisticola/scraper/twitter.py b/cisticola/scraper/twitter.py index a361252..b59aaf6 100644 --- a/cisticola/scraper/twitter.py +++ b/cisticola/scraper/twitter.py @@ -72,9 +72,9 @@ class TwitterScraper(Scraper): platform_id=tweet.id, date=tweet.date, date_archived=datetime.now(timezone.utc), - raw_posts=tweet.json(), + raw_data=tweet.json(), archived_urls=archived_urls, - media_archived=archive_media) + media_archived=datetime.now(timezone.utc) if archive_media else None) def can_handle(self, channel): if channel.platform == "Twitter" and (channel.platform_id or channel.screenname): diff --git a/cisticola/scraper/vkontakte.py b/cisticola/scraper/vkontakte.py index 7b735da..cdce5b2 100644 --- a/cisticola/scraper/vkontakte.py +++ b/cisticola/scraper/vkontakte.py @@ -69,7 +69,7 @@ class VkontakteScraper(Scraper): platform_id=post.url.split('/')[-1], date=datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc), date_archived=datetime.now(timezone.utc), - raw_posts=post.json(), + raw_data=post.json(), archived_urls=archived_urls, media_archived=datetime.now(timezone.utc) if archive_media else None) diff --git a/cisticola/scraper/youtube.py b/cisticola/scraper/youtube.py index 6b14d98..445b8f4 100644 --- a/cisticola/scraper/youtube.py +++ b/cisticola/scraper/youtube.py @@ -75,9 +75,9 @@ class YoutubeScraper(Scraper): platform_id=video_id, date=datetime.strptime(video['upload_date'], '%Y%m%d').replace(tzinfo=timezone.utc), date_archived=datetime.now(timezone.utc), - raw_posts=json.dumps(video, default = str), + raw_data=json.dumps(video, default = str), archived_urls=archived_urls, - media_archived=archive_media) + media_archived=datetime.now(timezone.utc) if archive_media else None) def can_handle(self, channel): if channel.platform == "Youtube" and channel.url: @@ -115,7 +115,7 @@ class YoutubeScraper(Scraper): archived_url = self.archive_blob(media_blob, content_type, key) result.archived_urls[url] = archived_url - result.media_archived = True + result.media_archived = datetime.now(timezone.utc) return result def get_profile(self, channel: Channel) -> RawChannelInfo: diff --git a/cisticola/transformer/bitchute.py b/cisticola/transformer/bitchute.py index 61b327d..d0c5fe0 100644 --- a/cisticola/transformer/bitchute.py +++ b/cisticola/transformer/bitchute.py @@ -20,7 +20,7 @@ class BitchuteTransformer(Transformer): return False def transform_media(self, data: ScraperResult, transformed: Post) -> Generator[Media, None, None]: - raw = json.loads(data.raw_posts) + raw = json.loads(data.raw_data) orig = raw['video_url'] new = data.archived_urls[orig] @@ -30,7 +30,7 @@ class BitchuteTransformer(Transformer): yield m def transform(self, data: ScraperResult) -> Post: - raw = json.loads(data.raw_posts) + raw = json.loads(data.raw_data) soup = BeautifulSoup(raw['body'], features = 'html.parser') content = soup.find_all('p')[-1].text diff --git a/cisticola/transformer/twitter.py b/cisticola/transformer/twitter.py index 8fa2e68..85ada05 100644 --- a/cisticola/transformer/twitter.py +++ b/cisticola/transformer/twitter.py @@ -47,7 +47,7 @@ class TwitterTransformer(Transformer): def transform(self, data: ScraperResult, insert: Callable) -> Generator[Union[Post, Channel, Media], None, None]: - raw = json.loads(data.raw_posts) + raw = json.loads(data.raw_data) transformed = Post( raw_id=data.id,