Add Telegram channel scraper

This commit is contained in:
Logan Williams
2022-02-24 16:37:13 +01:00
parent 6092e4caa5
commit a87cfd570a
2 changed files with 105 additions and 0 deletions

58
cisticola/scraper/base.py Normal file
View File

@@ -0,0 +1,58 @@
from typing import List
import cisticola.base
import requests
import os
import boto3
from io import BytesIO
from loguru import logger
class Scraper:
    """Base scraper that can archive remote media to an S3-compatible bucket.

    The S3 client is configured from the environment variables
    ``DO_SPACES_REGION``, ``DO_SPACES_KEY`` and ``DO_SPACES_SECRET`` and
    points at a ``digitaloceanspaces.com`` endpoint. Uploads additionally
    read ``DO_BUCKET`` (target bucket) and ``DO_URL`` (public base URL).

    ``can_handle`` and ``get_posts`` are placeholders here; presumably
    concrete scrapers subclass this and override them -- confirm against
    the subclasses.
    """

    __version__ = "Scraper 0.0.0"

    # Extra download attempts made after the first one fails.
    _MAX_RETRIES = 5
    # Per-request timeout in seconds, so a stalled server cannot hang the
    # scraper indefinitely (requests has no timeout by default).
    _REQUEST_TIMEOUT = 30

    def __init__(self):
        """Create the S3 client from the ``DO_SPACES_*`` environment variables."""
        region = os.getenv('DO_SPACES_REGION')
        self.s3_client = boto3.client(
            's3',
            region_name=region,
            endpoint_url='https://{}.digitaloceanspaces.com'.format(region),
            aws_access_key_id=os.getenv('DO_SPACES_KEY'),
            aws_secret_access_key=os.getenv('DO_SPACES_SECRET'),
        )

    def __str__(self):
        """Return the scraper's version string."""
        return self.__version__

    @staticmethod
    def _media_key(url: str) -> str:
        """Return the last path segment of *url*, without query or fragment.

        The query string and fragment are removed *before* splitting on
        ``/`` because either of them may itself contain slashes. The result
        is an empty string when the path ends in ``/``.
        """
        path = url.split('?', 1)[0].split('#', 1)[0]
        return path.split('/')[-1]

    def _storage_path(self, key: str) -> str:
        """Return the bucket key for *key*: ``<version with underscores>/<key>``."""
        return self.__version__.replace(' ', '_') + '/' + key

    def archive_media(self, url: str, content_type: str = 'image/jpeg') -> str:
        """Download *url* and upload its content to the configured bucket.

        Args:
            url: Location of the media file to archive.
            content_type: ``ContentType`` stored with the uploaded object.
                Defaults to ``'image/jpeg'``.

        Returns:
            The public URL of the archived copy (``DO_URL`` + ``/`` + bucket
            key). If no file name can be derived from *url*, or the download
            still fails after all retries, the original *url* is returned
            unchanged.

        Errors raised by the upload itself are not caught here. Note that
        the bucket key depends only on the scraper version and the file
        name, so two URLs sharing a file name map to the same object.
        """
        key = self._media_key(url)
        if not key:
            # Without a file name the upload would target the bare version
            # prefix, so skip archiving and keep the original link.
            logger.error(f"Could not derive a file name from URL {url}")
            return url

        response = None
        for attempt in range(self._MAX_RETRIES + 1):
            try:
                response = requests.get(url, timeout=self._REQUEST_TIMEOUT)
            except requests.RequestException as exc:
                # Connection errors and timeouts count as a failed attempt
                # rather than aborting the whole scrape.
                response = None
                logger.warning(
                    f"{attempt}/{self._MAX_RETRIES}: Request for {url} failed: {exc}")
                continue
            if response.status_code == 200:
                break
            logger.warning(
                f"{attempt}/{self._MAX_RETRIES}: Request for {url} failed")

        if response is None or response.status_code != 200:
            logger.error(f"Could not fetch URL {url}")
            return url

        filename = self._storage_path(key)
        self.s3_client.upload_fileobj(
            BytesIO(response.content),
            Bucket=os.getenv('DO_BUCKET'),
            Key=filename,
            ExtraArgs={'ACL': 'public-read', 'ContentType': content_type},
        )
        return os.getenv('DO_URL') + '/' + filename

    def can_handle(self, channel: "cisticola.base.Channel") -> bool:
        """Placeholder: does nothing and returns ``None`` in this base class.

        Presumably subclasses override this to report whether they can
        scrape *channel* -- confirm against the concrete scrapers.
        """

    def get_posts(self, channel: "cisticola.base.Channel", since: "cisticola.base.ScraperResult" = None) -> List["cisticola.base.ScraperResult"]:
        """Placeholder: does nothing and returns ``None`` in this base class.

        Presumably subclasses override this to return the posts of
        *channel*, with *since* marking an earlier result to resume from --
        confirm against the concrete scrapers.
        """