# Mirror of https://github.com/bellingcat/cisticola.git
# Synced 2026-06-12 05:18:33 +03:00 (71 lines, 2.6 KiB, Python)
from typing import Generator
|
|
from datetime import datetime, timezone
|
|
import json
|
|
import snscrape.modules
|
|
from loguru import logger
|
|
|
|
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
|
from cisticola.scraper.base import Scraper
|
|
|
|
class TelegramSnscrapeScraper(Scraper):
    """An implementation of a Scraper for Telegram, using snscrape library.

    Both ``get_posts`` and ``get_profile`` are wrapped in ``@logger.catch``,
    so exceptions raised inside them are reported through loguru.
    NOTE(review): with loguru's default ``reraise=False`` the exception is
    not propagated to the caller -- confirm this best-effort behavior is
    what callers expect.
    """

    __version__ = "TelegramSnscrapeScraper 0.0.0"

    def can_handle(self, channel):
        """Return True if ``channel`` is a public, non-chat Telegram channel.

        Always returns a real ``bool`` (previously the negative case fell
        through and returned ``None``).
        """
        return bool(
            channel.platform == "Telegram"
            and channel.public
            and not channel.chat
        )

    @logger.catch
    def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
        """Yield a ScraperResult for each post of ``channel``.

        Args:
            channel: Channel whose ``screenname`` is passed to snscrape's
                ``TelegramChannelScraper`` and whose ``id`` is stored on
                every result.
            since: Previously archived result. Iteration stops at the first
                post whose date is not later than ``since.date``. ``None``
                disables the cut-off.
            archive_media: When true, every image/video URL of a post is
                fetched with ``self.url_to_blob`` and stored with
                ``self.archive_blob``; otherwise the URLs are recorded with
                a ``None`` archive location.

        Yields:
            One ScraperResult per processed post.
        """
        scr = snscrape.modules.telegram.TelegramChannelScraper(
            channel.screenname)

        # tzinfo is forced to UTC on both sides of the comparison below so
        # naive and aware datetimes are never mixed. The cut-off does not
        # change between posts, so compute it once.
        since_date = None
        if since is not None:
            since_date = since.date.replace(tzinfo=timezone.utc)

        for post in scr.get_items():
            if since_date is not None and post.date.replace(tzinfo=timezone.utc) <= since_date:
                # Report the cut-off timestamp (the previous code printed
                # the post's own date here).
                # NOTE(review): breaking assumes get_items() yields posts
                # newest-first -- confirm against snscrape.
                logger.info(f'Timestamp of post {post} is earlier than the previous archived timestamp {since_date}')
                break

            logger.info(f'Processing post {post}')

            # Map every media URL (images first, then videos) to its
            # archived location; None until (and unless) it is archived.
            archived_urls = dict.fromkeys([*post.images, *post.videos])

            if archive_media:
                # Only values of existing keys are reassigned, so iterating
                # the dict while updating it is safe.
                for url in archived_urls:
                    media_blob, content_type, key = self.url_to_blob(url)
                    archived_urls[url] = self.archive_blob(
                        media_blob, content_type, key)

            yield ScraperResult(
                scraper=self.__version__,
                platform="Telegram",
                channel=channel.id,
                platform_id=post.url,
                date=post.date,
                date_archived=datetime.now(timezone.utc),
                raw_data=post.json(),
                archived_urls=archived_urls,
                media_archived=datetime.now(timezone.utc) if archive_media else None
            )

    @logger.catch
    def get_profile(self, channel: Channel) -> RawChannelInfo:
        """Return the raw profile information of ``channel``.

        The profile is the attribute dictionary of the entity returned by
        snscrape's ``TelegramChannelScraper._get_entity()``, serialized
        with ``json.dumps``.
        """
        scr = snscrape.modules.telegram.TelegramChannelScraper(
            channel.screenname)

        # NOTE(review): _get_entity() is a private snscrape API; presumably
        # it can return None for an unknown channel, in which case the
        # attribute access below raises -- verify.
        profile = scr._get_entity().__dict__

        return RawChannelInfo(scraper=self.__version__,
                              platform=channel.platform,
                              channel=channel.id,
                              raw_data=json.dumps(profile),
                              date_archived=datetime.now(timezone.utc))