added wrapper for requests that retries after encountering exception

This commit is contained in:
Tristan Lee
2022-03-07 13:28:33 -06:00
parent 253a9bea49
commit 506fb54a53
7 changed files with 143 additions and 69 deletions

View File

@@ -1,3 +1,4 @@
from .utils import make_request
from .base import Scraper, ScraperController
from .bitchute import BitchuteScraper
from .gab import GabScraper

View File

@@ -11,19 +11,18 @@ import ffmpeg
from sqlalchemy.orm import sessionmaker
from cisticola.base import Channel, ScraperResult, mapper_registry
from cisticola.scraper import make_request
class Scraper:
__version__ = "Scraper 0.0.0"
def __init__(self):
self.s3_client = boto3.client('s3',
region_name=os.getenv(
'DO_SPACES_REGION'),
region_name=os.environ['DO_SPACES_REGION'],
endpoint_url='https://{}.digitaloceanspaces.com'.format(
os.getenv('DO_SPACES_REGION')),
aws_access_key_id=os.getenv(
'DO_SPACES_KEY'),
aws_secret_access_key=os.getenv('DO_SPACES_SECRET'))
os.environ['DO_SPACES_REGION']),
aws_access_key_id=os.environ['DO_SPACES_KEY'],
aws_secret_access_key=os.environ['DO_SPACES_SECRET'])
self.headers = {
'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0'}
@@ -39,18 +38,7 @@ class Scraper:
def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
n_retries = 0
r = requests.get(url, headers = self.headers)
while r.status_code != 200 and n_retries < 5:
logger.warning(f"{n_retries}/5: Request for {url} failed")
n_retries += 1
r = requests.get(url, headers = self.headers)
if r.status_code != 200:
logger.error(f"Could not fetch URL {url}")
return url
r = make_request(url, headers = self.headers)
blob = r.content
content_type = r.headers.get('Content-Type')
@@ -86,18 +74,18 @@ class Scraper:
filename = self.__version__.replace(' ', '_') + '/' + key
self.s3_client.upload_fileobj(BytesIO(blob), Bucket=os.getenv(
'DO_BUCKET'), Key=filename, ExtraArgs={'ACL': 'public-read', 'ContentType': content_type})
self.s3_client.upload_fileobj(BytesIO(blob), Bucket=os.environ[
'DO_BUCKET'], Key=filename, ExtraArgs={'ACL': 'public-read', 'ContentType': content_type})
archived_url = os.getenv('DO_URL') + '/' + filename
archived_url = os.environ['DO_URL'] + '/' + filename
return archived_url
def can_handle(self, channel: Channel) -> bool:
pass
raise NotImplementedError
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
pass
raise NotImplementedError
class ScraperController:

View File

@@ -435,7 +435,7 @@ def get_about(user):
about = {
'description' : description_soup.text,
'description_links' : [a['href'] for a in description_soup.find_all('a', href = True)],
'created': re.sub('\s', ' ', info_list[0].text.split('Created')[1].strip('. ')),
'created': re.sub(r'\s', ' ', info_list[0].text.split('Created')[1].strip('. ')),
'videos' : int(info_list[1].text.split('videos')[0].strip()),
'owner_url' : soup.find('p', {'class' : 'owner'}).find('a', href = True)['href'],
'owner_name' : soup.find('p', {'class' : 'owner'}).text,

View File

@@ -9,7 +9,7 @@ from bs4 import BeautifulSoup
import youtube_dl
from cisticola.base import Channel, ScraperResult
from cisticola.scraper.base import Scraper
from cisticola.scraper import Scraper, make_request
BASE_URL = 'https://rumble.com'
@@ -90,7 +90,7 @@ class RumbleScraper(Scraper):
def get_media_url(url):
r = requests.get(url)
r = make_request(url = url)
soup = BeautifulSoup(r.content, features = 'lxml')
script = json.loads(''.join(soup.find('script', {'type':'application/ld+json'}).text))
@@ -126,7 +126,7 @@ def get_channel_videos(channel):
while True:
url = channel_url + str(page)
r = requests.get(url)
r = make_request(url = url, break_codes = [404])
if r.status_code == 404:
break

View File

@@ -2,6 +2,7 @@ from typing import Generator
from datetime import datetime, timezone
import snscrape.modules
from loguru import logger
from cisticola.base import Channel, ScraperResult
from cisticola.scraper.base import Scraper
@@ -21,16 +22,21 @@ class TelegramSnscrapeScraper(Scraper):
for post in g:
if since is not None and post.date.replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
logger.info(f'Timestamp of post {post} is earlier than the previous archived timestamp {post.date.replace(tzinfo=timezone.utc)}')
break
logger.info(f'Processing post {post}')
archived_urls = {}
for image_url in post.images:
logger.debug(f'Archiving image: {image_url}')
media_blob, content_type, key = self.url_to_blob(image_url)
archived_url = self.archive_media(media_blob, content_type, key)
archived_urls[image_url] = archived_url
if post.video:
logger.debug(f'Archiving video: {post.video}')
media_blob, content_type, key = self.url_to_blob(post.video)
archived_url = self.archive_media(media_blob, content_type, key)
archived_urls[post.video] = archived_url

View File

@@ -0,0 +1,72 @@
import requests
from loguru import logger
def make_request(url, headers = None, max_retries = 5, break_codes = None):
    """Fetch `url`, retrying when the request raises a transient exception.

    Each attempt delegates to `request_until_200`, which itself re-requests
    the URL while the status code is unacceptable and raises `ValueError`
    once it gives up.

    Parameters
    ----------
    url : str
        URL of content that is being requested
    headers : dict or None
        Dictionary of key-value pairs for request headers
    max_retries : int
        Maximum number of attempts made by this function. The same value is
        forwarded to `request_until_200`. If it is less than 1, no request is
        made and None is returned.
    break_codes : list or None
        List of acceptable status codes that indicate that the request should
        not be retried further. Useful if, for example, a `404` is expected at
        some point to terminate a loop, and we don't want to retry to get the
        404-ed page multiple times.

    Returns
    -------
    requests.Response or None
        Response from the request, or None if the request could not be
        completed. Callers must check for None before using the result.
    """

    if break_codes is None:
        break_codes = []

    for attempt in range(1, max_retries + 1):
        try:
            r = request_until_200(
                url = url,
                headers = headers,
                max_retries = max_retries,
                break_codes = break_codes)
        except ValueError as e:
            # `request_until_200` raises ValueError only after it has already
            # spent its own retries on an unacceptable status code. Starting
            # that whole cycle again would multiply the number of requests
            # (roughly max_retries * (max_retries + 1)) without any new
            # information, so give up immediately instead.
            # NOTE(review): requests' malformed-URL errors also derive from
            # ValueError and are equally pointless to retry -- confirm.
            logger.error(f"Request for url: {url} failed: [{e}]")
            return None
        except Exception as e:
            # Anything else (connection reset, DNS failure, ...) may be
            # transient, so log it and move on to the next attempt.
            logger.warning(f"Request for url: {url} raised exception: [{e}] on attempt: {attempt}/{max_retries}")
        else:
            logger.debug(f"Request for url: {url} succeeded on attempt: {attempt}/{max_retries}")
            return r

    # Every attempt raised (or max_retries < 1 and nothing was attempted).
    logger.error(f"Request for url: {url} failed after {max_retries} attempts")
    return None
def request_until_200(url, headers = None, max_retries = 5, break_codes = None):
    """Request `url` until an acceptable status code is returned.

    One initial GET request is made; while the response status is not
    acceptable, the request is repeated up to `max_retries` more times.
    Exceptions raised by `requests.get` itself are not caught here.

    Parameters
    ----------
    url : str
        URL of content that is being requested
    headers : dict or None
        Dictionary of key-value pairs for request headers
    max_retries : int
        Maximum number of times to repeat the request after the initial one
    break_codes : list or None
        Status codes, in addition to `200`, that are accepted and stop
        further retries

    Returns
    -------
    requests.Response
        The first response whose status code is `200` or in `break_codes`.

    Raises
    ------
    ValueError
        If the last response still has an unacceptable status code.
    """

    # Build a fresh list so the caller's `break_codes` is never modified and
    # any iterable of codes is accepted; 200 always counts as success.
    accepted_codes = [200] if break_codes is None else list(break_codes) + [200]

    n_retries = 0
    r = requests.get(url, headers = headers)

    # Honour the caller-supplied `max_retries` rather than a fixed count, so
    # the loop bound agrees with what the log and error messages report.
    while r.status_code not in accepted_codes and n_retries < max_retries:
        logger.warning(f"Request for url: {url} returned status: {r.status_code} on attempt: {n_retries}/{max_retries}")
        n_retries += 1
        r = requests.get(url, headers = headers)

    if r.status_code not in accepted_codes:
        raise ValueError(f"Request for url: {url} failed with status: {r.status_code} after {max_retries} attempts")

    return r

View File

@@ -1,53 +1,60 @@
import sys
from sqlalchemy import create_engine
from loguru import logger
from cisticola.base import Channel
from cisticola.scraper import (
ScraperController,
TelegramSnscrapeScraper)
logger.remove()
logger.add(sys.stderr, level="INFO")
logger.add("../russian_telegram_ingest.log", level = "INFO")
test_channels = [
Channel(
id=0,
name="QAnon Россия",
platform_id=-1001319637748,
category="Qanon",
followers=94048,
platform="Telegram",
url="https://t.me/qanonrus",
screenname="qanonrus",
country="RU",
influencer=None,
public=True,
chat=False,
notes=""),
Channel(
id=1,
name="The Great Awakening | Q",
platform_id=-1001325597521,
category="Qanon",
followers=5715,
platform="Telegram",
url="https://t.me/greatawakin",
screenname="greatawakin",
country="RU",
influencer=None,
public=True,
chat=False,
notes=""),
Channel(
id=2,
name="Великое Пробуждение",
platform_id=-1001285898079,
category="Qanon",
followers=5861,
platform="Telegram",
url="https://t.me/greatawakeningrus",
screenname="greatawakeningrus",
country="RU",
influencer=None,
public=True,
chat=False,
notes=""),
# Channel(
# id=0,
# name="QAnon Россия",
# platform_id=-1001319637748,
# category="Qanon",
# followers=94048,
# platform="Telegram",
# url="https://t.me/qanonrus",
# screenname="qanonrus",
# country="RU",
# influencer=None,
# public=True,
# chat=False,
# notes=""),
# Channel(
# id=1,
# name="The Great Awakening | Q",
# platform_id=-1001325597521,
# category="Qanon",
# followers=5715,
# platform="Telegram",
# url="https://t.me/greatawakin",
# screenname="greatawakin",
# country="RU",
# influencer=None,
# public=True,
# chat=False,
# notes=""),
# Channel(
# id=2,
# name="Великое Пробуждение",
# platform_id=-1001285898079,
# category="Qanon",
# followers=5861,
# platform="Telegram",
# url="https://t.me/greatawakeningrus",
# screenname="greatawakeningrus",
# country="RU",
# influencer=None,
# public=True,
# chat=False,
# notes=""),
Channel(
id=3,
name="T🕊Редакция Президент Гордон🕊",