added wrapper for requests that retries after encountering exception

2026-06-12 05:18:33 +03:00 · 2022-03-07 13:28:33 -06:00
parent 253a9bea49
commit 506fb54a53
7 changed files with 143 additions and 69 deletions
--- a/cisticola/scraper/init.py
+++ b/cisticola/scraper/init.py
@@ -1,3 +1,4 @@
+from .utils import make_request
 from .base import Scraper, ScraperController
 from .bitchute import BitchuteScraper
 from .gab import GabScraper 
--- a/cisticola/scraper/base.py
+++ b/cisticola/scraper/base.py
@@ -11,19 +11,18 @@ import ffmpeg
 from sqlalchemy.orm import sessionmaker

 from cisticola.base import Channel, ScraperResult, mapper_registry
+from cisticola.scraper import make_request

 class Scraper:
    __version__ = "Scraper 0.0.0"

    def __init__(self):
        self.s3_client = boto3.client('s3',
-                                      region_name=os.getenv(
-                                          'DO_SPACES_REGION'),
+                                      region_name=os.environ['DO_SPACES_REGION'],
                                      endpoint_url='https://{}.digitaloceanspaces.com'.format(
-                                          os.getenv('DO_SPACES_REGION')),
-                                      aws_access_key_id=os.getenv(
-                                          'DO_SPACES_KEY'),
-                                      aws_secret_access_key=os.getenv('DO_SPACES_SECRET'))
+                                          os.environ['DO_SPACES_REGION']),
+                                      aws_access_key_id=os.environ['DO_SPACES_KEY'],
+                                      aws_secret_access_key=os.environ['DO_SPACES_SECRET'])

        self.headers = {
            'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0'}
@@ -39,18 +38,7 @@ class Scraper:

    def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:

-        n_retries = 0
-
-        r = requests.get(url, headers = self.headers)
-
-        while r.status_code != 200 and n_retries < 5:
-            logger.warning(f"{n_retries}/5: Request for {url} failed")
-            n_retries += 1
-            r = requests.get(url, headers = self.headers)
-
-        if r.status_code != 200:
-            logger.error(f"Could not fetch URL {url}")
-            return url
+        r = make_request(url, headers = self.headers)

        blob = r.content
        content_type = r.headers.get('Content-Type')
@@ -86,18 +74,18 @@ class Scraper:

        filename = self.__version__.replace(' ', '_') + '/' + key

-        self.s3_client.upload_fileobj(BytesIO(blob), Bucket=os.getenv(
-            'DO_BUCKET'), Key=filename, ExtraArgs={'ACL': 'public-read', 'ContentType': content_type})
+        self.s3_client.upload_fileobj(BytesIO(blob), Bucket=os.environ[
+            'DO_BUCKET'], Key=filename, ExtraArgs={'ACL': 'public-read', 'ContentType': content_type})

-        archived_url = os.getenv('DO_URL') + '/' + filename
+        archived_url = os.environ['DO_URL'] + '/' + filename

        return archived_url

    def can_handle(self, channel: Channel) -> bool:
-        pass
+        raise NotImplementedError

    def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
-        pass
+        raise NotImplementedError


 class ScraperController:
--- a/cisticola/scraper/bitchute.py
+++ b/cisticola/scraper/bitchute.py
@@ -435,7 +435,7 @@ def get_about(user):
    about = {
        'description' : description_soup.text,
        'description_links' : [a['href'] for a in description_soup.find_all('a', href = True)],
-        'created': re.sub('\s', ' ', info_list[0].text.split('Created')[1].strip('. ')),
+        'created': re.sub(r'\s', ' ', info_list[0].text.split('Created')[1].strip('. ')),
        'videos' : int(info_list[1].text.split('videos')[0].strip()),
        'owner_url' : soup.find('p', {'class' : 'owner'}).find('a', href = True)['href'],
        'owner_name' : soup.find('p', {'class' : 'owner'}).text,
--- a/cisticola/scraper/rumble.py
+++ b/cisticola/scraper/rumble.py
@@ -9,7 +9,7 @@ from bs4 import BeautifulSoup
 import youtube_dl

 from cisticola.base import Channel, ScraperResult
-from cisticola.scraper.base import Scraper
+from cisticola.scraper import Scraper, make_request

 BASE_URL = 'https://rumble.com'

@@ -90,7 +90,7 @@ class RumbleScraper(Scraper):

 def get_media_url(url):
    
-    r = requests.get(url)
+    r = make_request(url = url)
    soup = BeautifulSoup(r.content, features = 'lxml')
    
    script = json.loads(''.join(soup.find('script', {'type':'application/ld+json'}).text))
@@ -126,7 +126,7 @@ def get_channel_videos(channel):

    while True:
        url = channel_url + str(page)
-        r = requests.get(url)
+        r = make_request(url = url, break_codes = [404])

        if r.status_code == 404:
            break
--- a/cisticola/scraper/telegram_snscrape.py
+++ b/cisticola/scraper/telegram_snscrape.py
@@ -2,6 +2,7 @@ from typing import Generator
 from datetime import datetime, timezone

 import snscrape.modules
+from loguru import logger

 from cisticola.base import Channel, ScraperResult
 from cisticola.scraper.base import Scraper
@@ -21,16 +22,21 @@ class TelegramSnscrapeScraper(Scraper):

        for post in g:
            if since is not None and post.date.replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
+                logger.info(f'Timestamp of post {post} is earlier than the previous archived timestamp {post.date.replace(tzinfo=timezone.utc)}')
                break

+            logger.info(f'Processing post {post}')
+
            archived_urls = {}

            for image_url in post.images:
+                logger.debug(f'Archiving image: {image_url}')
                media_blob, content_type, key = self.url_to_blob(image_url)
                archived_url = self.archive_media(media_blob, content_type, key)
                archived_urls[image_url] = archived_url

            if post.video:
+                logger.debug(f'Archiving video: {post.video}')
                media_blob, content_type, key = self.url_to_blob(post.video)
                archived_url = self.archive_media(media_blob, content_type, key)
                archived_urls[post.video] = archived_url
--- a/cisticola/scraper/utils.py
+++ b/cisticola/scraper/utils.py
@@ -0,0 +1,72 @@
+import requests
+from loguru import logger
+
+def make_request(url, headers = None, max_retries = 5, break_codes = None):
+    """Fetch `url` via `request_until_200`, retrying whenever an exception is raised.
+
+    Parameters
+    ----------
+    url : str
+        URL of content that is being requested
+    headers : dict or None
+        Dictionary of key-value pairs for request headers
+    max_retries : int
+        Maximum number of attempts; also forwarded to `request_until_200`.
+    break_codes : list or None
+        List of acceptable status codes that indicate that the request should
+        not be retried further. Useful if, for example, a `404` is expected at
+        some point to terminate a loop, and we don't want to retry to get the
+        404-ed page multiple times.
+
+    Returns
+    -------
+    requests.Response or None
+        Response from the request, or None if no attempt succeeded.
+    """
+
+    if break_codes is None:
+        break_codes = []
+
+    r = None
+
+    # for/else: the trailing `else` runs only if no attempt reached `break`.
+    for n_retries in range(max_retries):
+        try:
+            r = request_until_200(
+                url = url, 
+                headers = headers, 
+                max_retries = max_retries,
+                break_codes = break_codes)
+            logger.debug(f"Request for url: {url} succeeded on attempt: {n_retries}/{max_retries}")
+        except Exception as e:
+            logger.warning(f"Request for url: {url} raised exception: [{e}] on attempt: {n_retries}/{max_retries}")
+            continue 
+        else:
+            break 
+    else:
+        logger.error(f"Request for url: {url} failed after {max_retries} attempts")
+
+    return r
+
+def request_until_200(url, headers = None, max_retries = 5, break_codes = None):
+    """Request `url` until the status is 200 or one of `break_codes`.
+
+    Makes one initial request plus up to `max_retries` retries with `headers`.
+    Returns the `requests.Response`; raises `ValueError` if the last status
+    is still neither 200 nor in `break_codes`.
+    """
+    break_codes = [200] if break_codes is None else break_codes + [200]
+
+    n_retries = 0
+    r = requests.get(url, headers = headers)
+
+    # Bound retries by `max_retries` so the limit matches the log/error messages.
+    while r.status_code not in break_codes and n_retries < max_retries:
+        logger.warning(f"Request for url: {url} returned status: {r.status_code} on attempt: {n_retries}/{max_retries}")
+        n_retries += 1
+        r = requests.get(url, headers = headers)
+
+    if r.status_code not in break_codes:
+        raise ValueError(f"Request for url: {url} failed with status: {r.status_code} after {max_retries} attempts")
+
+    return r
--- a/examples/russian_telegram_ingest.py
+++ b/examples/russian_telegram_ingest.py
@@ -1,53 +1,60 @@
+import sys
+
 from sqlalchemy import create_engine
+from loguru import logger

 from cisticola.base import Channel
 from cisticola.scraper import (
    ScraperController,
    TelegramSnscrapeScraper)

+logger.remove()
+logger.add(sys.stderr, level="INFO")
+logger.add("../russian_telegram_ingest.log", level = "INFO")
+
 test_channels = [
-    Channel(
-        id=0, 
-        name="QAnon Россия", 
-        platform_id=-1001319637748,
-        category="Qanon", 
-        followers=94048, 
-        platform="Telegram",
-        url="https://t.me/qanonrus", 
-        screenname="qanonrus", 
-        country="RU",
-        influencer=None, 
-        public=True, 
-        chat=False,
-        notes=""),
-    Channel(
-        id=1, 
-        name="The Great Awakening | Q", 
-        platform_id=-1001325597521,
-        category="Qanon", 
-        followers=5715,
-        platform="Telegram",
-        url="https://t.me/greatawakin", 
-        screenname="greatawakin", 
-        country="RU",
-        influencer=None, 
-        public=True, 
-        chat=False, 
-        notes=""),
-    Channel(
-        id=2, 
-        name="Великое Пробуждение", 
-        platform_id=-1001285898079,
-        category="Qanon", 
-        followers=5861, 
-        platform="Telegram",
-        url="https://t.me/greatawakeningrus", 
-        screenname="greatawakeningrus", 
-        country="RU",
-        influencer=None, 
-        public=True, 
-        chat=False, 
-        notes=""),
+    # Channel(
+    #     id=0, 
+    #     name="QAnon Россия", 
+    #     platform_id=-1001319637748,
+    #     category="Qanon", 
+    #     followers=94048, 
+    #     platform="Telegram",
+    #     url="https://t.me/qanonrus", 
+    #     screenname="qanonrus", 
+    #     country="RU",
+    #     influencer=None, 
+    #     public=True, 
+    #     chat=False,
+    #     notes=""),
+    # Channel(
+    #     id=1, 
+    #     name="The Great Awakening | Q", 
+    #     platform_id=-1001325597521,
+    #     category="Qanon", 
+    #     followers=5715,
+    #     platform="Telegram",
+    #     url="https://t.me/greatawakin", 
+    #     screenname="greatawakin", 
+    #     country="RU",
+    #     influencer=None, 
+    #     public=True, 
+    #     chat=False, 
+    #     notes=""),
+    # Channel(
+    #     id=2, 
+    #     name="Великое Пробуждение", 
+    #     platform_id=-1001285898079,
+    #     category="Qanon", 
+    #     followers=5861, 
+    #     platform="Telegram",
+    #     url="https://t.me/greatawakeningrus", 
+    #     screenname="greatawakeningrus", 
+    #     country="RU",
+    #     influencer=None, 
+    #     public=True, 
+    #     chat=False, 
+    #     notes=""),
    Channel(
        id=3, 
        name="T🕊Редакция Президент Гордон🕊",