From d1f9dd0e012745a8bf3e6b675b11376136b98d1e Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Tue, 12 Apr 2022 12:57:04 +0000 Subject: [PATCH 1/2] Limit max # of archived files per session --- .gitignore | 2 ++ cisticola/scraper/base.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 65dd3d7..65fab11 100644 --- a/.gitignore +++ b/.gitignore @@ -9,9 +9,11 @@ docs/source/_* *.db .env *.session +*.session-journal service_account.json .vscode/ *.log +*.lock # Unit test / coverage reports reports diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index 0144557..fcb9277 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -421,7 +421,7 @@ class ScraperController: # this query is really slow (~2.5 minutes) because of the shuffle. shuffling is so that multiple media archivers could work # simultaneously with low risk of collision (at least while the number of unarchived items is very large) - posts = session.query(ScraperResult).where(ScraperResult.media_archived == None).order_by(func.random()).limit(10000).all() + posts = session.query(ScraperResult).where(ScraperResult.media_archived == None).order_by(func.random()).limit(4000).all() logger.info(f"Found {len(posts)} posts without media. Archiving now") From a0dbe7d92b8f1eb05a005cc88d6c5a6163fe4c06 Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Wed, 13 Apr 2022 10:10:29 +0200 Subject: [PATCH 2/2] Catch errors in channel info --- cisticola/scraper/bitchute.py | 1 + cisticola/scraper/gab.py | 1 + cisticola/scraper/gettr.py | 1 + cisticola/scraper/instagram.py | 1 + cisticola/scraper/odysee.py | 1 + cisticola/scraper/rumble.py | 1 + cisticola/scraper/telegram_snscrape.py | 1 + cisticola/scraper/telegram_telethon.py | 1 + cisticola/scraper/twitter.py | 1 + cisticola/scraper/vkontakte.py | 1 + cisticola/scraper/youtube.py | 1 + 11 files changed, 11 insertions(+) diff --git a/cisticola/scraper/bitchute.py b/cisticola/scraper/bitchute.py index 1271dfb..c0cedc9 100644 --- a/cisticola/scraper/bitchute.py +++ b/cisticola/scraper/bitchute.py @@ -70,6 +70,7 @@ class BitchuteScraper(Scraper): if channel.platform == "Bitchute" and self.get_username_from_url(channel.url) is not None: return True + @logger.catch def get_profile(self, channel: Channel) -> RawChannelInfo: base_url = channel.url diff --git a/cisticola/scraper/gab.py b/cisticola/scraper/gab.py index 4a0fb51..5602489 100644 --- a/cisticola/scraper/gab.py +++ b/cisticola/scraper/gab.py @@ -89,6 +89,7 @@ class GabScraper(Scraper): if channel.platform == "Gab" and self.get_username_from_url(channel.url) is not None: return True + @logger.catch def get_profile(self, channel: Channel) -> RawChannelInfo: client = Client( diff --git a/cisticola/scraper/gettr.py b/cisticola/scraper/gettr.py index 6f28a75..1fa0599 100644 --- a/cisticola/scraper/gettr.py +++ b/cisticola/scraper/gettr.py @@ -72,6 +72,7 @@ class GettrScraper(Scraper): key = urlparse(url).path.split('/')[-2] + ext return key + @logger.catch def get_profile(self, channel: Channel) -> RawChannelInfo: client = PublicClient() username = self.get_username_from_url(channel.url) diff --git a/cisticola/scraper/instagram.py b/cisticola/scraper/instagram.py index 3eab56c..3eca83e 100644 --- a/cisticola/scraper/instagram.py +++ b/cisticola/scraper/instagram.py @@ -91,6 +91,7 @@ class InstagramScraper(Scraper): if channel.platform == "Instagram" and self.get_username_from_url(channel.url) is not None: return True + @logger.catch def get_profile(self, channel: Channel) -> RawChannelInfo: username = self.get_username_from_url(channel.url) diff --git a/cisticola/scraper/odysee.py b/cisticola/scraper/odysee.py index 0823e8e..5bc2b6e 100644 --- a/cisticola/scraper/odysee.py +++ b/cisticola/scraper/odysee.py @@ -105,6 +105,7 @@ class OdyseeScraper(Scraper): return f'{key}.{ext}' + @logger.catch def get_profile(self, channel: Channel) -> RawChannelInfo: username = self.get_username_from_url(channel.url) diff --git a/cisticola/scraper/rumble.py b/cisticola/scraper/rumble.py index 4fba420..0c7177f 100644 --- a/cisticola/scraper/rumble.py +++ b/cisticola/scraper/rumble.py @@ -69,6 +69,7 @@ class RumbleScraper(Scraper): if channel.platform == "Rumble" and channel.url is not None: return True + @logger.catch def get_profile(self, channel: Channel) -> RawChannelInfo: profile = get_channel_profile(url = channel.url) diff --git a/cisticola/scraper/telegram_snscrape.py b/cisticola/scraper/telegram_snscrape.py index 4dd3f44..c31f458 100644 --- a/cisticola/scraper/telegram_snscrape.py +++ b/cisticola/scraper/telegram_snscrape.py @@ -55,6 +55,7 @@ class TelegramSnscrapeScraper(Scraper): media_archived=datetime.now(timezone.utc) if archive_media else None ) + @logger.catch def get_profile(self, channel: Channel) -> RawChannelInfo: scr = snscrape.modules.telegram.TelegramChannelScraper( diff --git a/cisticola/scraper/telegram_telethon.py b/cisticola/scraper/telegram_telethon.py index 426ab4e..03ac0d6 100644 --- a/cisticola/scraper/telegram_telethon.py +++ b/cisticola/scraper/telegram_telethon.py @@ -149,6 +149,7 @@ class TelegramTelethonScraper(Scraper): archived_urls=archived_urls, media_archived=datetime.now(timezone.utc) if archive_media else None) + @logger.catch def get_profile(self, channel: Channel) -> RawChannelInfo: username = channel.screenname if username is None: diff --git a/cisticola/scraper/twitter.py b/cisticola/scraper/twitter.py index ebbdb95..3ccef23 100644 --- a/cisticola/scraper/twitter.py +++ b/cisticola/scraper/twitter.py @@ -97,6 +97,7 @@ class TwitterScraper(Scraper): key = parsed_url.path.split('/')[-1] + ext return key + @logger.catch def get_profile(self, channel: Channel) -> RawChannelInfo: scraper = TwitterUserScraper(channel.screenname) diff --git a/cisticola/scraper/vkontakte.py b/cisticola/scraper/vkontakte.py index cf427ba..3ef1648 100644 --- a/cisticola/scraper/vkontakte.py +++ b/cisticola/scraper/vkontakte.py @@ -103,6 +103,7 @@ class VkontakteScraper(Scraper): return key + @logger.catch def get_profile(self, channel: Channel) -> RawChannelInfo: username = self.get_username_from_url(channel.url) diff --git a/cisticola/scraper/youtube.py b/cisticola/scraper/youtube.py index 40a122d..f1d8455 100644 --- a/cisticola/scraper/youtube.py +++ b/cisticola/scraper/youtube.py @@ -138,6 +138,7 @@ class YoutubeScraper(Scraper): result.media_archived = datetime.now(timezone.utc) return result + @logger.catch def get_profile(self, channel: Channel) -> RawChannelInfo: ydl_opts = {