Configure with Telethon and VK only

This commit is contained in:
Logan Williams
2022-04-02 18:34:14 +00:00
parent 0099558c68
commit 63633617d2
5 changed files with 17 additions and 25 deletions

17
app.py
View File

@@ -6,19 +6,12 @@ from sqlalchemy.orm import sessionmaker
import os
import time
import sys
import telethon.errors.rpcerrorlist
from cisticola.base import Channel, RawChannelInfo, mapper_registry
from cisticola.base import Channel, mapper_registry
from cisticola.scraper import (
ScraperController,
BitchuteScraper,
GabScraper,
GettrScraper,
OdyseeScraper,
RumbleScraper,
TelegramSnscrapeScraper,
TelegramTelethonScraper,
TwitterScraper)
VkontakteScraper,
TelegramTelethonScraper)
def sync_channels(args):
logger.info("Synchronizing channels")
@@ -52,7 +45,7 @@ def sync_channels(args):
if c['platform_id'] != '':
platform_id = c['platform_id']
channel = session.query(Channel).filter_by(platform_id=platform_id, platform=c['platform'], url=c['url']).first()
channel = session.query(Channel).filter_by(platform_id=str(platform_id), platform=c['platform'], url=c['url']).first()
if not channel:
channel = Channel(**c, source='researcher')
@@ -85,7 +78,7 @@ def get_scraper_controller():
scrapers = [
TelegramTelethonScraper(),
TwitterScraper()]
VkontakteScraper()]
controller.register_scrapers(scrapers)

View File

@@ -42,8 +42,8 @@ class ScraperResult:
#: Dict in which the keys are the original media URLs from the post, and the corresponding values are the URLs of the archived media files.
archived_urls: dict
#: Has the media in this post been archived?
media_archived: bool
#: What date was the media archived? (None if not archived)
media_archived: datetime
@dataclass
class Channel:
@@ -252,7 +252,7 @@ raw_posts_table = Table('raw_posts', mapper_registry.metadata,
Column('raw_posts', String),
Column('date_archived', DateTime),
Column('archived_urls', JSON),
Column('media_archived', Boolean))
Column('media_archived', DateTime))
raw_channel_info_table = Table('raw_channel_info', mapper_registry.metadata,
Column('id', Integer, primary_key=True),

View File

@@ -9,6 +9,7 @@ from loguru import logger
import ffmpeg
from sqlalchemy.orm import sessionmaker
import yt_dlp
from sqlalchemy.sql.expression import func
from cisticola.base import Channel, ScraperResult, mapper_registry
from cisticola.utils import make_request
@@ -397,11 +398,9 @@ class ScraperController:
for post in posts:
session.add(post)
session.commit()
added += 1
if added > 100:
break
session.commit()
logger.info(
f"{scraper} found {added} new posts from {channel}")
@@ -418,7 +417,7 @@ class ScraperController:
session = self.session()
# Load the stored posts whose media is still flagged as not archived.
# NOTE(review): elsewhere in this change `media_archived` becomes a DateTime
# column that is set to None when media has not been archived. A SQL
# comparison against false does not match NULL rows, so this filter
# presumably needs `ScraperResult.media_archived.is_(None)` -- TODO confirm.
posts = session.query(ScraperResult).where(ScraperResult.media_archived == False).all()
posts = session.query(ScraperResult).where(ScraperResult.media_archived == False).order_by(func.random()).all()
logger.info(f"Found {len(posts)} posts without media. Archiving now")

View File

@@ -62,7 +62,7 @@ class TelegramTelethonScraper(Scraper):
else:
logger.warning("Downloaded blob was None")
result.media_archived = True
result.media_archived = datetime.now(timezone.utc)
return result
def archive_post_media(self, post : types.Message, client : TelegramClient = None):
@@ -146,7 +146,7 @@ class TelegramTelethonScraper(Scraper):
date_archived=datetime.now(timezone.utc),
raw_posts=json.dumps(post.to_dict(), default=str),
archived_urls=archived_urls,
media_archived=archive_media)
media_archived=datetime.now(timezone.utc) if archive_media else None)
def get_profile(self, channel: Channel) -> RawChannelInfo:
username = channel.screenname

View File

@@ -64,14 +64,14 @@ class VkontakteScraper(Scraper):
yield ScraperResult(
scraper=self.__version__,
platform="Vkontatke",
platform="VK",
channel=channel.id,
platform_id=post.url.split('/')[-1],
date=datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc),
date_archived=datetime.now(timezone.utc),
raw_posts=post.json(),
archived_urls=archived_urls,
media_archived=archive_media)
media_archived=datetime.now(timezone.utc) if archive_media else None)
def archive_files(self, result: ScraperResult) -> ScraperResult:
for url in result.archived_urls:
@@ -84,12 +84,12 @@ class VkontakteScraper(Scraper):
archived_url = self.archive_blob(media_blob, content_type, key)
result.archived_urls[url] = archived_url
result.media_archived = True
result.media_archived = datetime.now(timezone.utc)
return result
def can_handle(self, channel):
if channel.platform == "Vkontakte" and channel.platform_id:
if channel.platform == "VK":
return True
def url_to_key(self, url: str, content_type: str) -> str: