mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-08 03:18:34 +03:00
Add Telegram channel scraper
This commit is contained in:
58
cisticola/scraper/base.py
Normal file
58
cisticola/scraper/base.py
Normal file
@@ -0,0 +1,58 @@
|
||||
from typing import List
|
||||
import cisticola.base
|
||||
import requests
|
||||
import os
|
||||
import boto3
|
||||
from io import BytesIO
|
||||
from loguru import logger
|
||||
|
||||
class Scraper:
    """Base class for channel scrapers, with media archiving to object storage.

    Creating an instance builds a boto3 S3 client pointed at a DigitalOcean
    Spaces endpoint. The client is configured from the ``DO_SPACES_REGION``,
    ``DO_SPACES_KEY`` and ``DO_SPACES_SECRET`` environment variables.
    ``archive_media`` additionally reads ``DO_BUCKET`` and ``DO_URL``.
    """

    __version__ = "Scraper 0.0.0"

    # Number of re-requests made after the first failed download attempt.
    MAX_RETRIES = 5

    # Seconds passed to ``requests.get(timeout=...)`` so a stalled server
    # cannot hang a scrape indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        region = os.getenv('DO_SPACES_REGION')
        self.s3_client = boto3.client(
            's3',
            region_name=region,
            endpoint_url='https://{}.digitaloceanspaces.com'.format(region),
            aws_access_key_id=os.getenv('DO_SPACES_KEY'),
            aws_secret_access_key=os.getenv('DO_SPACES_SECRET'),
        )

    def __str__(self):
        """Return the scraper's name-and-version string."""
        return self.__version__

    @staticmethod
    def _object_key(url: str) -> str:
        """Return the last path segment of *url*, ignoring any query string.

        The query string is removed first so that a ``/`` inside it cannot be
        mistaken for a path separator.
        """
        return url.split('?', 1)[0].split('/')[-1]

    @staticmethod
    def _content_type(headers, key: str) -> str:
        """Choose the MIME type to store alongside an archived object.

        Preference order: the ``Content-Type`` entry of *headers*, then a
        guess from the extension of *key*, then ``application/octet-stream``.
        """
        # Imported here so this class does not depend on a change to the
        # module's import block.
        import mimetypes

        declared = headers.get('Content-Type')
        if declared:
            return declared
        guessed, _ = mimetypes.guess_type(key)
        return guessed or 'application/octet-stream'

    def archive_media(self, url: str) -> str:
        """Download *url* and upload the bytes to the configured bucket.

        The download is attempted once and then retried up to
        ``MAX_RETRIES`` times; a non-200 status or a ``requests`` exception
        counts as a failed attempt. The object is stored publicly readable
        under ``<version with spaces replaced by underscores>/<key>``.

        Args:
            url: Address of the media file to archive.

        Returns:
            The archived URL (``DO_URL`` + ``/`` + object name), or *url*
            unchanged when the download never succeeded.
        """
        response = None
        for attempt in range(self.MAX_RETRIES + 1):
            try:
                response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            except requests.RequestException as exc:
                # Network-level failures are retried like bad status codes
                # instead of aborting the whole scrape.
                response = None
                logger.warning(
                    f"{attempt}/{self.MAX_RETRIES}: Request for {url} raised {exc!r}")
                continue

            if response.status_code == 200:
                break
            logger.warning(
                f"{attempt}/{self.MAX_RETRIES}: Request for {url} failed")

        if response is None or response.status_code != 200:
            logger.error(f"Could not fetch URL {url}")
            return url

        key = self._object_key(url)
        filename = self.__version__.replace(' ', '_') + '/' + key

        # Videos are archived through this method too, so the stored content
        # type is derived from the response rather than fixed to JPEG.
        self.s3_client.upload_fileobj(
            BytesIO(response.content),
            Bucket=os.getenv('DO_BUCKET'),
            Key=filename,
            ExtraArgs={
                'ACL': 'public-read',
                'ContentType': self._content_type(response.headers, key),
            },
        )

        return os.getenv('DO_URL') + '/' + filename

    def can_handle(self, channel: "cisticola.base.Channel") -> bool:
        """Report whether this scraper can process *channel*.

        The base class handles nothing; subclasses are expected to override.
        """
        return False

    def get_posts(self, channel: "cisticola.base.Channel", since: "cisticola.base.ScraperResult" = None) -> "List[cisticola.base.ScraperResult]":
        """Return the posts scraped from *channel*, optionally only those after *since*.

        The base class scrapes nothing and returns an empty list; subclasses
        are expected to override.
        """
        return []
|
||||
47
cisticola/scraper/telegram_snscrape.py
Normal file
47
cisticola/scraper/telegram_snscrape.py
Normal file
@@ -0,0 +1,47 @@
|
||||
|
||||
import cisticola.base
|
||||
import cisticola.scraper.base
|
||||
from typing import List
|
||||
import snscrape.modules
|
||||
from datetime import datetime, timezone
|
||||
|
||||
|
||||
class TelegramSnscrapeScraper(cisticola.scraper.base.Scraper):
    """Scraper for public, non-chat Telegram channels, built on snscrape."""

    __version__ = "TelegramSnscrapeScraper 0.0.1"

    @staticmethod
    def _as_utc(moment):
        """Return *moment* as a timezone-aware UTC datetime.

        Naive values are taken to already be in UTC; aware values are
        converted, so a non-UTC offset is respected rather than overwritten.
        """
        if moment.tzinfo is None:
            return moment.replace(tzinfo=timezone.utc)
        return moment.astimezone(timezone.utc)

    def can_handle(self, channel):
        """Return True only for public Telegram channels that are not chats."""
        return bool(
            channel.platform == "Telegram"
            and channel.public
            and not channel.chat
        )

    def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None):
        """Scrape posts from *channel*, archiving their images and video.

        Each media URL found on a post is passed to ``archive_media`` and
        replaced by the returned URL inside the post's JSON text.

        Args:
            channel: Channel whose ``screenname`` is scraped and whose ``id``
                is recorded on every result.
            since: When given, iteration stops at the first post dated at or
                before ``since.date``.

        Returns:
            A list of ``cisticola.base.ScraperResult``, in the order the
            scraper yielded the posts.
        """
        cutoff = None if since is None else self._as_utc(since.date)
        scr = snscrape.modules.telegram.TelegramChannelScraper(
            channel.screenname)

        posts = []
        for post in scr.get_items():
            # NOTE(review): breaking here assumes items arrive newest first;
            # confirm against the snscrape version in use.
            if cutoff is not None and self._as_utc(post.date) <= cutoff:
                break

            raw_data = post.json()

            # `or []` tolerates a post whose images attribute is None.
            for image_url in post.images or []:
                archive_url = self.archive_media(image_url)
                raw_data = raw_data.replace(image_url, archive_url)

            if post.video:
                video_archive_url = self.archive_media(post.video)
                raw_data = raw_data.replace(post.video, video_archive_url)

            posts.append(cisticola.base.ScraperResult(
                scraper=self.__version__,
                platform="Telegram",
                channel=channel.id,
                platform_id=post.url,
                date=post.date,
                # Recorded in UTC to match how post dates are compared above.
                date_archived=datetime.now(timezone.utc),
                raw_data=raw_data,
            ))

        return posts
|
||||
Reference in New Issue
Block a user