mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-08 03:18:34 +03:00
Configure with Telethon and VK only
This commit is contained in:
17
app.py
17
app.py
@@ -6,19 +6,12 @@ from sqlalchemy.orm import sessionmaker
|
||||
import os
|
||||
import time
|
||||
import sys
|
||||
import telethon.errors.rpcerrorlist
|
||||
|
||||
from cisticola.base import Channel, RawChannelInfo, mapper_registry
|
||||
from cisticola.base import Channel, mapper_registry
|
||||
from cisticola.scraper import (
|
||||
ScraperController,
|
||||
BitchuteScraper,
|
||||
GabScraper,
|
||||
GettrScraper,
|
||||
OdyseeScraper,
|
||||
RumbleScraper,
|
||||
TelegramSnscrapeScraper,
|
||||
TelegramTelethonScraper,
|
||||
TwitterScraper)
|
||||
VkontakteScraper,
|
||||
TelegramTelethonScraper)
|
||||
|
||||
def sync_channels(args):
|
||||
logger.info("Synchronizing channels")
|
||||
@@ -52,7 +45,7 @@ def sync_channels(args):
|
||||
if c['platform_id'] != '':
|
||||
platform_id = c['platform_id']
|
||||
|
||||
channel = session.query(Channel).filter_by(platform_id=platform_id, platform=c['platform'], url=c['url']).first()
|
||||
channel = session.query(Channel).filter_by(platform_id=str(platform_id), platform=c['platform'], url=c['url']).first()
|
||||
|
||||
if not channel:
|
||||
channel = Channel(**c, source='researcher')
|
||||
@@ -85,7 +78,7 @@ def get_scraper_controller():
|
||||
|
||||
scrapers = [
|
||||
TelegramTelethonScraper(),
|
||||
TwitterScraper()]
|
||||
VkontakteScraper()]
|
||||
|
||||
controller.register_scrapers(scrapers)
|
||||
|
||||
|
||||
@@ -42,8 +42,8 @@ class ScraperResult:
|
||||
#: Dict in which the keys are the original media URLs from the post, and the corresponding values are the URLs of the archived media files.
|
||||
archived_urls: dict
|
||||
|
||||
#: Has the media in this post been archived?
|
||||
media_archived: bool
|
||||
#: What date was the media archived? (None if not archived)
|
||||
media_archived: datetime
|
||||
|
||||
@dataclass
|
||||
class Channel:
|
||||
@@ -252,7 +252,7 @@ raw_posts_table = Table('raw_posts', mapper_registry.metadata,
|
||||
Column('raw_posts', String),
|
||||
Column('date_archived', DateTime),
|
||||
Column('archived_urls', JSON),
|
||||
Column('media_archived', Boolean))
|
||||
Column('media_archived', DateTime))
|
||||
|
||||
raw_channel_info_table = Table('raw_channel_info', mapper_registry.metadata,
|
||||
Column('id', Integer, primary_key=True),
|
||||
|
||||
@@ -9,6 +9,7 @@ from loguru import logger
|
||||
import ffmpeg
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
import yt_dlp
|
||||
from sqlalchemy.sql.expression import func
|
||||
|
||||
from cisticola.base import Channel, ScraperResult, mapper_registry
|
||||
from cisticola.utils import make_request
|
||||
@@ -397,11 +398,9 @@ class ScraperController:
|
||||
|
||||
for post in posts:
|
||||
session.add(post)
|
||||
session.commit()
|
||||
added += 1
|
||||
|
||||
if added > 100:
|
||||
break
|
||||
|
||||
session.commit()
|
||||
logger.info(
|
||||
f"{scraper} found {added} new posts from {channel}")
|
||||
@@ -418,7 +417,7 @@ class ScraperController:
|
||||
|
||||
session = self.session()
|
||||
|
||||
posts = session.query(ScraperResult).where(ScraperResult.media_archived == False).all()
|
||||
posts = session.query(ScraperResult).where(ScraperResult.media_archived == False).order_by(func.random()).all()
|
||||
|
||||
logger.info(f"Found {len(posts)} posts without media. Archiving now")
|
||||
|
||||
|
||||
@@ -62,7 +62,7 @@ class TelegramTelethonScraper(Scraper):
|
||||
else:
|
||||
logger.warning("Downloaded blob was None")
|
||||
|
||||
result.media_archived = True
|
||||
result.media_archived = datetime.now(timezone.utc)
|
||||
return result
|
||||
|
||||
def archive_post_media(self, post : types.Message, client : TelegramClient = None):
|
||||
@@ -146,7 +146,7 @@ class TelegramTelethonScraper(Scraper):
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_posts=json.dumps(post.to_dict(), default=str),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=archive_media)
|
||||
media_archived=datetime.now(timezone.utc) if archive_media else None)
|
||||
|
||||
def get_profile(self, channel: Channel) -> RawChannelInfo:
|
||||
username = channel.screenname
|
||||
|
||||
@@ -64,14 +64,14 @@ class VkontakteScraper(Scraper):
|
||||
|
||||
yield ScraperResult(
|
||||
scraper=self.__version__,
|
||||
platform="Vkontatke",
|
||||
platform="VK",
|
||||
channel=channel.id,
|
||||
platform_id=post.url.split('/')[-1],
|
||||
date=datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc),
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_posts=post.json(),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=archive_media)
|
||||
media_archived=datetime.now(timezone.utc) if archive_media else None)
|
||||
|
||||
def archive_files(self, result: ScraperResult) -> ScraperResult:
|
||||
for url in result.archived_urls:
|
||||
@@ -84,12 +84,12 @@ class VkontakteScraper(Scraper):
|
||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||
result.archived_urls[url] = archived_url
|
||||
|
||||
result.media_archived = True
|
||||
result.media_archived = datetime.now(timezone.utc)
|
||||
return result
|
||||
|
||||
|
||||
def can_handle(self, channel):
|
||||
if channel.platform == "Vkontakte" and channel.platform_id:
|
||||
if channel.platform == "VK":
|
||||
return True
|
||||
|
||||
def url_to_key(self, url: str, content_type: str) -> str:
|
||||
|
||||
Reference in New Issue
Block a user