From 63633617d2daf348809317e549389fe135b9c24e Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Sat, 2 Apr 2022 18:34:14 +0000 Subject: [PATCH] Configure with Telethon and VK only --- app.py | 17 +++++------------ cisticola/base.py | 6 +++--- cisticola/scraper/base.py | 7 +++---- cisticola/scraper/telegram_telethon.py | 4 ++-- cisticola/scraper/vkontakte.py | 8 ++++---- 5 files changed, 17 insertions(+), 25 deletions(-) diff --git a/app.py b/app.py index 570bf71..9b87873 100644 --- a/app.py +++ b/app.py @@ -6,19 +6,12 @@ from sqlalchemy.orm import sessionmaker import os import time import sys -import telethon.errors.rpcerrorlist -from cisticola.base import Channel, RawChannelInfo, mapper_registry +from cisticola.base import Channel, mapper_registry from cisticola.scraper import ( ScraperController, - BitchuteScraper, - GabScraper, - GettrScraper, - OdyseeScraper, - RumbleScraper, - TelegramSnscrapeScraper, - TelegramTelethonScraper, - TwitterScraper) + VkontakteScraper, + TelegramTelethonScraper) def sync_channels(args): logger.info("Synchronizing channels") @@ -52,7 +45,7 @@ def sync_channels(args): if c['platform_id'] != '': platform_id = c['platform_id'] - channel = session.query(Channel).filter_by(platform_id=platform_id, platform=c['platform'], url=c['url']).first() + channel = session.query(Channel).filter_by(platform_id=str(platform_id), platform=c['platform'], url=c['url']).first() if not channel: channel = Channel(**c, source='researcher') @@ -85,7 +78,7 @@ def get_scraper_controller(): scrapers = [ TelegramTelethonScraper(), - TwitterScraper()] + VkontakteScraper()] controller.register_scrapers(scrapers) diff --git a/cisticola/base.py b/cisticola/base.py index 37c897b..b53ed35 100644 --- a/cisticola/base.py +++ b/cisticola/base.py @@ -42,8 +42,8 @@ class ScraperResult: #: Dict in which the keys are the original media URLs from the post, and the corresponding values are the URLs of the archived media files. archived_urls: dict - #: Has the media in this post been archived? - media_archived: bool + #: What date was the media archived? (None if not archived) + media_archived: datetime @dataclass class Channel: @@ -252,7 +252,7 @@ raw_posts_table = Table('raw_posts', mapper_registry.metadata, Column('raw_posts', String), Column('date_archived', DateTime), Column('archived_urls', JSON), - Column('media_archived', Boolean)) + Column('media_archived', DateTime)) raw_channel_info_table = Table('raw_channel_info', mapper_registry.metadata, Column('id', Integer, primary_key=True), diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index 0762c16..f246605 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -9,6 +9,7 @@ from loguru import logger import ffmpeg from sqlalchemy.orm import sessionmaker import yt_dlp +from sqlalchemy.sql.expression import func from cisticola.base import Channel, ScraperResult, mapper_registry from cisticola.utils import make_request @@ -397,11 +398,9 @@ class ScraperController: for post in posts: session.add(post) + session.commit() added += 1 - if added > 100: - break - session.commit() logger.info( f"{scraper} found {added} new posts from {channel}") @@ -418,7 +417,7 @@ class ScraperController: session = self.session() - posts = session.query(ScraperResult).where(ScraperResult.media_archived == False).all() + posts = session.query(ScraperResult).where(ScraperResult.media_archived == False).order_by(func.random()).all() logger.info(f"Found {len(posts)} posts without media. Archiving now") diff --git a/cisticola/scraper/telegram_telethon.py b/cisticola/scraper/telegram_telethon.py index c27f8c8..6c7eb16 100644 --- a/cisticola/scraper/telegram_telethon.py +++ b/cisticola/scraper/telegram_telethon.py @@ -62,7 +62,7 @@ class TelegramTelethonScraper(Scraper): else: logger.warning("Downloaded blob was None") - result.media_archived = True + result.media_archived = datetime.now(timezone.utc) return result def archive_post_media(self, post : types.Message, client : TelegramClient = None): @@ -146,7 +146,7 @@ class TelegramTelethonScraper(Scraper): date_archived=datetime.now(timezone.utc), raw_posts=json.dumps(post.to_dict(), default=str), archived_urls=archived_urls, - media_archived=archive_media) + media_archived=datetime.now(timezone.utc) if archive_media else None) def get_profile(self, channel: Channel) -> RawChannelInfo: username = channel.screenname diff --git a/cisticola/scraper/vkontakte.py b/cisticola/scraper/vkontakte.py index f36ac12..7b735da 100644 --- a/cisticola/scraper/vkontakte.py +++ b/cisticola/scraper/vkontakte.py @@ -64,14 +64,14 @@ class VkontakteScraper(Scraper): yield ScraperResult( scraper=self.__version__, - platform="Vkontatke", + platform="VK", channel=channel.id, platform_id=post.url.split('/')[-1], date=datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc), date_archived=datetime.now(timezone.utc), raw_posts=post.json(), archived_urls=archived_urls, - media_archived=archive_media) + media_archived=datetime.now(timezone.utc) if archive_media else None) def archive_files(self, result: ScraperResult) -> ScraperResult: for url in result.archived_urls: @@ -84,12 +84,12 @@ class VkontakteScraper(Scraper): archived_url = self.archive_blob(media_blob, content_type, key) result.archived_urls[url] = archived_url - result.media_archived = True + result.media_archived = datetime.now(timezone.utc) return result def can_handle(self, channel): - if channel.platform == "Vkontakte" and channel.platform_id: + if channel.platform == "VK": return True def url_to_key(self, url: str, content_type: str) -> str: