diff --git a/Pipfile b/Pipfile index c913ecb..34fa842 100644 --- a/Pipfile +++ b/Pipfile @@ -25,6 +25,7 @@ gabber = {git = "https://github.com/stanfordio/gabber.git"} psycopg2-binary = "*" tqdm = "*" ratelimit = "*" +pytz = "*" [dev-packages] pytest = "*" diff --git a/Pipfile.lock b/Pipfile.lock index 2d95b71..4629c08 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "b9fc02f3ecaa2199480c4fcba30f02780860dfbc2e10c026889c78f639709fb4" + "sha256": "89ac092ac8c8321f199f199da0c0867803a44b080538a43e1a57ae7713683616" }, "pipfile-spec": 6, "requires": { @@ -773,7 +773,7 @@ "sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7", "sha256:e68985985296d9a66a881eb3193b0906246245294a881e7c8afe623866ac6a5c" ], - "markers": "python_version < '3.9'", + "index": "pypi", "version": "==2022.1" }, "pytz-deprecation-shim": { @@ -872,7 +872,9 @@ "version": "==2022.3.2" }, "requests": { - "extras": [], + "extras": [ + "socks" + ], "hashes": [ "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61", "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d" @@ -1387,11 +1389,13 @@ "sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7", "sha256:e68985985296d9a66a881eb3193b0906246245294a881e7c8afe623866ac6a5c" ], - "markers": "python_version < '3.9'", + "index": "pypi", "version": "==2022.1" }, "requests": { - "extras": [], + "extras": [ + "socks" + ], "hashes": [ "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61", "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d" @@ -1475,7 +1479,7 @@ "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc", "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f" ], - "markers": "python_version >= '3.7'", + "markers": "python_full_version < '3.11.0'", "version": "==2.0.1" }, "typing-extensions": { diff --git a/app.py b/app.py index a55faab..570bf71 100644 --- a/app.py +++ b/app.py @@ -5,6 +5,8 @@ from sqlalchemy import create_engine, func from sqlalchemy.orm import sessionmaker import os import time +import sys +import telethon.errors.rpcerrorlist from cisticola.base import Channel, RawChannelInfo, mapper_registry from cisticola.scraper import ( @@ -112,6 +114,8 @@ def init_db(): mapper_registry.metadata.create_all(bind=engine) if __name__ == '__main__': + logger.remove() + logger.add(sys.stdout, level="DEBUG", catch=True) logger.add("./test.log", level="TRACE") parser = argparse.ArgumentParser(description = 'Cisticola command line tools') @@ -121,6 +125,7 @@ if __name__ == '__main__': args = parser.parse_args() + if args.command == 'init-db': init_db() elif args.command == 'sync-channels': diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index ccbfa39..e08bf8e 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -275,6 +275,7 @@ class Scraper: raise NotImplementedError + @logger.catch def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: """Scrape all posts from the specified Channel. @@ -341,7 +342,6 @@ class ScraperController: return self.scrape_channel_info(channels) - @logger.catch(reraise = True) def scrape_channels(self, channels: List[Channel], archive_media: bool = True): """Scrape all posts for all specified channels. @@ -388,6 +388,9 @@ class ScraperController: session.add(post) added += 1 + if added > 100: + break + session.commit() logger.info( f"{scraper} found {added} new posts from {channel}") diff --git a/cisticola/scraper/bitchute.py b/cisticola/scraper/bitchute.py index f318b1e..5afa4e1 100644 --- a/cisticola/scraper/bitchute.py +++ b/cisticola/scraper/bitchute.py @@ -8,6 +8,7 @@ from typing import Generator import requests from bs4 import BeautifulSoup +from loguru import logger from cisticola.base import Channel, ScraperResult, RawChannelInfo from cisticola.scraper.base import Scraper @@ -22,6 +23,7 @@ class BitchuteScraper(Scraper): return username + @logger.catch def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: session = requests.Session() diff --git a/cisticola/scraper/gab.py b/cisticola/scraper/gab.py index 36baf67..ab1cdf3 100644 --- a/cisticola/scraper/gab.py +++ b/cisticola/scraper/gab.py @@ -2,6 +2,7 @@ from datetime import datetime, timezone, date import json from typing import Generator import os +from loguru import logger from gabber.client import Client, GAB_API_BASE_URL @@ -22,6 +23,7 @@ class GabScraper(Scraper): return group_id + @logger.catch def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: client = Client( username = os.environ['GAB_USER'], diff --git a/cisticola/scraper/gettr.py b/cisticola/scraper/gettr.py index 43bc095..89ed35f 100644 --- a/cisticola/scraper/gettr.py +++ b/cisticola/scraper/gettr.py @@ -2,6 +2,7 @@ from datetime import datetime, timezone import json from typing import Generator from urllib.parse import urlparse +from loguru import logger from gogettr import PublicClient @@ -19,6 +20,7 @@ class GettrScraper(Scraper): return username + @logger.catch def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: client = PublicClient() username = self.get_username_from_url(channel.url) diff --git a/cisticola/scraper/instagram.py b/cisticola/scraper/instagram.py index 435d69d..4dbc205 100644 --- a/cisticola/scraper/instagram.py +++ b/cisticola/scraper/instagram.py @@ -25,6 +25,7 @@ class InstagramScraper(Scraper): username = url.split(BASE_URL)[1].strip('/') return username + @logger.catch def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: username = self.get_username_from_url(channel.url) diff --git a/cisticola/scraper/odysee.py b/cisticola/scraper/odysee.py index 020a2ba..0f7a3fe 100644 --- a/cisticola/scraper/odysee.py +++ b/cisticola/scraper/odysee.py @@ -25,6 +25,7 @@ class OdyseeScraper(Scraper): return username + @logger.catch def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: username = self.get_username_from_url(channel.url) diff --git a/cisticola/scraper/rumble.py b/cisticola/scraper/rumble.py index 2a4d968..737be05 100644 --- a/cisticola/scraper/rumble.py +++ b/cisticola/scraper/rumble.py @@ -2,6 +2,7 @@ from datetime import datetime, timezone import json from typing import Generator from urllib.parse import urlparse +from loguru import logger from bs4 import BeautifulSoup @@ -14,6 +15,7 @@ class RumbleScraper(Scraper): """An implementation of a Scraper for Rumble, using custom functions""" __version__ = "RumbleScraper 0.0.1" + @logger.catch def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: scraper = get_channel_videos(channel.url) diff --git a/cisticola/scraper/telegram_snscrape.py b/cisticola/scraper/telegram_snscrape.py index e272db5..9b91203 100644 --- a/cisticola/scraper/telegram_snscrape.py +++ b/cisticola/scraper/telegram_snscrape.py @@ -15,6 +15,7 @@ class TelegramSnscrapeScraper(Scraper): if channel.platform == "Telegram" and channel.public and not channel.chat: return True + @logger.catch def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: scr = snscrape.modules.telegram.TelegramChannelScraper( channel.screenname) diff --git a/cisticola/scraper/telegram_telethon.py b/cisticola/scraper/telegram_telethon.py index fb9a58f..c27f8c8 100644 --- a/cisticola/scraper/telegram_telethon.py +++ b/cisticola/scraper/telegram_telethon.py @@ -105,6 +105,7 @@ class TelegramTelethonScraper(Scraper): if channel.platform == "Telegram" and channel.public: return True + @logger.catch def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: username = channel.screenname if username is None: diff --git a/cisticola/scraper/twitter.py b/cisticola/scraper/twitter.py index 1d00d53..a361252 100644 --- a/cisticola/scraper/twitter.py +++ b/cisticola/scraper/twitter.py @@ -12,6 +12,7 @@ class TwitterScraper(Scraper): """An implementation of a Scraper for Twitter, using snscrape library""" __version__ = "TwitterScraper 0.0.1" + @logger.catch def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: if channel.platform_id: identifier = int(channel.platform_id) diff --git a/cisticola/scraper/vkontakte.py b/cisticola/scraper/vkontakte.py index 5e3d5d3..f36ac12 100644 --- a/cisticola/scraper/vkontakte.py +++ b/cisticola/scraper/vkontakte.py @@ -20,6 +20,7 @@ class VkontakteScraper(Scraper): return username + @logger.catch def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: username = self.get_username_from_url(channel.url) diff --git a/cisticola/scraper/youtube.py b/cisticola/scraper/youtube.py index f937d24..6b14d98 100644 --- a/cisticola/scraper/youtube.py +++ b/cisticola/scraper/youtube.py @@ -15,6 +15,7 @@ class YoutubeScraper(Scraper): """An implementation of a Scraper for Youtube, using youtube-dl""" __version__ = "YoutubeScraper 0.0.1" + @logger.catch def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: content_type = 'video/mp4'