From a87cfd570a01590115b4cf6fe0e1a74a040b2f18 Mon Sep 17 00:00:00 2001
From: Logan Williams
Date: Thu, 24 Feb 2022 16:37:13 +0100
Subject: [PATCH] Add Telegram channel scraper

---
 cisticola/scraper/base.py              | 102 +++++++++++++++++++++++++
 cisticola/scraper/telegram_snscrape.py |  67 ++++++++++++++++
 2 files changed, 169 insertions(+)
 create mode 100644 cisticola/scraper/base.py
 create mode 100644 cisticola/scraper/telegram_snscrape.py

diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py
new file mode 100644
index 0000000..477ce7a
--- /dev/null
+++ b/cisticola/scraper/base.py
@@ -0,0 +1,102 @@
+import mimetypes
+import os
+from io import BytesIO
+from typing import List, Optional
+
+import boto3
+import requests
+from loguru import logger
+
+import cisticola.base
+
+
+class Scraper:
+    """Base scraper: defines the scraper interface and archives media.
+
+    Media is uploaded to a DigitalOcean Space configured through the
+    DO_SPACES_REGION, DO_SPACES_KEY, DO_SPACES_SECRET, DO_BUCKET and DO_URL
+    environment variables.
+    """
+
+    __version__ = "Scraper 0.0.0"
+
+    # Retries after the first failed download attempt.
+    MAX_RETRIES = 5
+    # Seconds before a download attempt is abandoned, so one dead host
+    # cannot hang the whole scrape.
+    REQUEST_TIMEOUT = 30
+
+    def __init__(self):
+        region = os.getenv('DO_SPACES_REGION')
+        self.s3_client = boto3.client(
+            's3',
+            region_name=region,
+            endpoint_url='https://{}.digitaloceanspaces.com'.format(region),
+            aws_access_key_id=os.getenv('DO_SPACES_KEY'),
+            aws_secret_access_key=os.getenv('DO_SPACES_SECRET'))
+
+    def __str__(self):
+        return self.__version__
+
+    def _fetch(self, url: str) -> Optional[requests.Response]:
+        """Return a 200 response for ``url``, or None once retries run out."""
+        for attempt in range(self.MAX_RETRIES + 1):
+            try:
+                r = requests.get(url, timeout=self.REQUEST_TIMEOUT)
+            except requests.RequestException as e:
+                # Network errors are retried like bad status codes.
+                logger.warning(f"Request for {url} raised {e!r}")
+            else:
+                if r.status_code == 200:
+                    return r
+
+            if attempt < self.MAX_RETRIES:
+                logger.warning(
+                    f"{attempt}/{self.MAX_RETRIES}: Request for {url} failed")
+
+        return None
+
+    def archive_media(self, url: str) -> str:
+        """Upload the media at ``url`` to the bucket and return its new URL.
+
+        If the download keeps failing, the original ``url`` is returned
+        unchanged.
+        """
+        r = self._fetch(url)
+        if r is None:
+            logger.error(f"Could not fetch URL {url}")
+            return url
+
+        # Object name: last path segment of the URL without its query string.
+        key = url.split('/')[-1].split('?')[0]
+        filename = self.__version__.replace(' ', '_') + '/' + key
+
+        # Use the real media type; a fixed 'image/jpeg' mislabels videos.
+        content_type = (r.headers.get('Content-Type')
+                        or mimetypes.guess_type(key)[0]
+                        or 'application/octet-stream')
+
+        self.s3_client.upload_fileobj(
+            BytesIO(r.content),
+            Bucket=os.getenv('DO_BUCKET'),
+            Key=filename,
+            ExtraArgs={'ACL': 'public-read', 'ContentType': content_type})
+
+        return os.getenv('DO_URL') + '/' + filename
+
+    def can_handle(self, channel: cisticola.base.Channel) -> bool:
+        """Return True if this scraper can scrape ``channel``.
+
+        The base class handles nothing; subclasses override this.
+        """
+        return False
+
+    def get_posts(self, channel: cisticola.base.Channel,
+                  since: Optional[cisticola.base.ScraperResult] = None
+                  ) -> List[cisticola.base.ScraperResult]:
+        """Return posts scraped from ``channel``.
+
+        Subclasses override this; ``since``, when given, is the result whose
+        date marks where scraping may stop. The base class returns no posts.
+        """
+        return []
diff --git a/cisticola/scraper/telegram_snscrape.py b/cisticola/scraper/telegram_snscrape.py
new file mode 100644
index 0000000..5752d7d
--- /dev/null
+++ b/cisticola/scraper/telegram_snscrape.py
@@ -0,0 +1,67 @@
+from datetime import datetime, timezone
+from typing import List, Optional
+
+import snscrape.modules
+
+import cisticola.base
+import cisticola.scraper.base
+
+
+class TelegramSnscrapeScraper(cisticola.scraper.base.Scraper):
+    """Scraper for public Telegram channels, built on snscrape."""
+
+    __version__ = "TelegramSnscrapeScraper 0.0.1"
+
+    def can_handle(self, channel: cisticola.base.Channel) -> bool:
+        """Return True for public Telegram channels that are not chats."""
+        # bool() so a non-match is False rather than None, as annotated.
+        return bool(channel.platform == "Telegram"
+                    and channel.public
+                    and not channel.chat)
+
+    def get_posts(self, channel: cisticola.base.Channel,
+                  since: Optional[cisticola.base.ScraperResult] = None
+                  ) -> List[cisticola.base.ScraperResult]:
+        """Return posts from ``channel`` with their media archived.
+
+        Scraping stops at the first post dated at or before ``since.date``;
+        this presumably relies on snscrape yielding newest posts first.
+        """
+        posts = []
+        scr = snscrape.modules.telegram.TelegramChannelScraper(
+            channel.screenname)
+
+        # Dates are compared as if both were UTC, whatever tzinfo they carry.
+        cutoff = None
+        if since is not None:
+            cutoff = since.date.replace(tzinfo=timezone.utc)
+
+        for post in scr.get_items():
+            if (cutoff is not None
+                    and post.date.replace(tzinfo=timezone.utc) <= cutoff):
+                break
+
+            raw_data = post.json()
+
+            # Point the stored JSON at the archived copies of the media.
+            for image_url in post.images:
+                archive_url = self.archive_media(image_url)
+                raw_data = raw_data.replace(image_url, archive_url)
+
+            if post.video:
+                video_archive_url = self.archive_media(post.video)
+                raw_data = raw_data.replace(post.video, video_archive_url)
+
+            posts.append(cisticola.base.ScraperResult(
+                scraper=self.__version__,
+                platform="Telegram",
+                channel=channel.id,
+                platform_id=post.url,
+                date=post.date,
+                # NOTE(review): naive local time, unlike the UTC comparison
+                # above -- confirm which timezone consumers expect.
+                date_archived=datetime.now(),
+                raw_data=raw_data
+            ))
+
+        return posts