mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-12 05:18:33 +03:00
added wrapper for requests that retries after encountering exception
This commit is contained in:
@@ -1,3 +1,4 @@
|
||||
from .utils import make_request
|
||||
from .base import Scraper, ScraperController
|
||||
from .bitchute import BitchuteScraper
|
||||
from .gab import GabScraper
|
||||
|
||||
@@ -11,19 +11,18 @@ import ffmpeg
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
|
||||
from cisticola.base import Channel, ScraperResult, mapper_registry
|
||||
from cisticola.scraper import make_request
|
||||
|
||||
class Scraper:
|
||||
__version__ = "Scraper 0.0.0"
|
||||
|
||||
def __init__(self):
|
||||
self.s3_client = boto3.client('s3',
|
||||
region_name=os.getenv(
|
||||
'DO_SPACES_REGION'),
|
||||
region_name=os.environ['DO_SPACES_REGION'],
|
||||
endpoint_url='https://{}.digitaloceanspaces.com'.format(
|
||||
os.getenv('DO_SPACES_REGION')),
|
||||
aws_access_key_id=os.getenv(
|
||||
'DO_SPACES_KEY'),
|
||||
aws_secret_access_key=os.getenv('DO_SPACES_SECRET'))
|
||||
os.environ['DO_SPACES_REGION']),
|
||||
aws_access_key_id=os.environ['DO_SPACES_KEY'],
|
||||
aws_secret_access_key=os.environ['DO_SPACES_SECRET'])
|
||||
|
||||
self.headers = {
|
||||
'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0'}
|
||||
@@ -39,18 +38,7 @@ class Scraper:
|
||||
|
||||
def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
|
||||
|
||||
n_retries = 0
|
||||
|
||||
r = requests.get(url, headers = self.headers)
|
||||
|
||||
while r.status_code != 200 and n_retries < 5:
|
||||
logger.warning(f"{n_retries}/5: Request for {url} failed")
|
||||
n_retries += 1
|
||||
r = requests.get(url, headers = self.headers)
|
||||
|
||||
if r.status_code != 200:
|
||||
logger.error(f"Could not fetch URL {url}")
|
||||
return url
|
||||
r = make_request(url, headers = self.headers)
|
||||
|
||||
blob = r.content
|
||||
content_type = r.headers.get('Content-Type')
|
||||
@@ -86,18 +74,18 @@ class Scraper:
|
||||
|
||||
filename = self.__version__.replace(' ', '_') + '/' + key
|
||||
|
||||
self.s3_client.upload_fileobj(BytesIO(blob), Bucket=os.getenv(
|
||||
'DO_BUCKET'), Key=filename, ExtraArgs={'ACL': 'public-read', 'ContentType': content_type})
|
||||
self.s3_client.upload_fileobj(BytesIO(blob), Bucket=os.environ[
|
||||
'DO_BUCKET'], Key=filename, ExtraArgs={'ACL': 'public-read', 'ContentType': content_type})
|
||||
|
||||
archived_url = os.getenv('DO_URL') + '/' + filename
|
||||
archived_url = os.environ['DO_URL'] + '/' + filename
|
||||
|
||||
return archived_url
|
||||
|
||||
def can_handle(self, channel: Channel) -> bool:
|
||||
pass
|
||||
raise NotImplementedError
|
||||
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
|
||||
pass
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
class ScraperController:
|
||||
|
||||
@@ -435,7 +435,7 @@ def get_about(user):
|
||||
about = {
|
||||
'description' : description_soup.text,
|
||||
'description_links' : [a['href'] for a in description_soup.find_all('a', href = True)],
|
||||
'created': re.sub('\s', ' ', info_list[0].text.split('Created')[1].strip('. ')),
|
||||
'created': re.sub(r'\s', ' ', info_list[0].text.split('Created')[1].strip('. ')),
|
||||
'videos' : int(info_list[1].text.split('videos')[0].strip()),
|
||||
'owner_url' : soup.find('p', {'class' : 'owner'}).find('a', href = True)['href'],
|
||||
'owner_name' : soup.find('p', {'class' : 'owner'}).text,
|
||||
|
||||
@@ -9,7 +9,7 @@ from bs4 import BeautifulSoup
|
||||
import youtube_dl
|
||||
|
||||
from cisticola.base import Channel, ScraperResult
|
||||
from cisticola.scraper.base import Scraper
|
||||
from cisticola.scraper import Scraper, make_request
|
||||
|
||||
BASE_URL = 'https://rumble.com'
|
||||
|
||||
@@ -90,7 +90,7 @@ class RumbleScraper(Scraper):
|
||||
|
||||
def get_media_url(url):
|
||||
|
||||
r = requests.get(url)
|
||||
r = make_request(url = url)
|
||||
soup = BeautifulSoup(r.content, features = 'lxml')
|
||||
|
||||
script = json.loads(''.join(soup.find('script', {'type':'application/ld+json'}).text))
|
||||
@@ -126,7 +126,7 @@ def get_channel_videos(channel):
|
||||
|
||||
while True:
|
||||
url = channel_url + str(page)
|
||||
r = requests.get(url)
|
||||
r = make_request(url = url, break_codes = [404])
|
||||
|
||||
if r.status_code == 404:
|
||||
break
|
||||
|
||||
@@ -2,6 +2,7 @@ from typing import Generator
|
||||
from datetime import datetime, timezone
|
||||
|
||||
import snscrape.modules
|
||||
from loguru import logger
|
||||
|
||||
from cisticola.base import Channel, ScraperResult
|
||||
from cisticola.scraper.base import Scraper
|
||||
@@ -21,16 +22,21 @@ class TelegramSnscrapeScraper(Scraper):
|
||||
|
||||
for post in g:
|
||||
if since is not None and post.date.replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
|
||||
logger.info(f'Timestamp of post {post} is earlier than the previous archived timestamp {post.date.replace(tzinfo=timezone.utc)}')
|
||||
break
|
||||
|
||||
logger.info(f'Processing post {post}')
|
||||
|
||||
archived_urls = {}
|
||||
|
||||
for image_url in post.images:
|
||||
logger.debug(f'Archiving image: {image_url}')
|
||||
media_blob, content_type, key = self.url_to_blob(image_url)
|
||||
archived_url = self.archive_media(media_blob, content_type, key)
|
||||
archived_urls[image_url] = archived_url
|
||||
|
||||
if post.video:
|
||||
logger.debug(f'Archiving video: {post.video}')
|
||||
media_blob, content_type, key = self.url_to_blob(post.video)
|
||||
archived_url = self.archive_media(media_blob, content_type, key)
|
||||
archived_urls[post.video] = archived_url
|
||||
|
||||
72
cisticola/scraper/utils.py
Normal file
72
cisticola/scraper/utils.py
Normal file
@@ -0,0 +1,72 @@
|
||||
import requests
|
||||
from loguru import logger
|
||||
|
||||
def make_request(url, headers = None, max_retries = 5, break_codes = None):
    """Fetch `url`, retrying whenever the underlying request raises.

    Each attempt delegates to `request_until_200`; any exception raised by
    that call is logged as a warning and the next attempt is started.

    Parameters
    ----------
    url : str
        URL of content that is being requested
    headers : dict or None
        Dictionary of key-value pairs for request headers
    max_retries : int
        Maximum number of attempts made here; the same value is forwarded
        to `request_until_200`
    break_codes : list or None
        Status codes, in addition to 200, that are accepted without further
        retrying. Useful when, for example, a `404` is expected to terminate
        a pagination loop and re-requesting the 404-ed page is pointless.

    Returns
    -------
    requests.Response or None
        Response from the first attempt that did not raise, or None if every
        attempt raised.
    """
    accepted = [] if break_codes is None else break_codes

    for attempt in range(max_retries):
        try:
            response = request_until_200(
                url = url,
                headers = headers,
                max_retries = max_retries,
                break_codes = accepted)
            logger.debug(f"Request for url: {url} succeeded on attempt: {attempt}/{max_retries}")
            return response
        except Exception as exc:
            # Deliberately broad: any failure (network error, bad status
            # reported as ValueError, ...) triggers another attempt.
            logger.warning(f"Request for url: {url} raised exception: [{exc}] on attempt: {attempt}/{max_retries}")

    # Reached only when no attempt returned a response.
    logger.error(f"Request for url: {url} failed after {max_retries} attempts")
    return None
|
||||
|
||||
def request_until_200(url, headers = None, max_retries = 5, break_codes = None):
    """Request `url`, retrying until an accepted status code is returned.

    Parameters
    ----------
    url : str
        URL of content that is being requested
    headers : dict or None
        Dictionary of key-value pairs for request headers
    max_retries : int
        Maximum number of times the request is repeated after the initial
        attempt returned a status code that is not accepted
    break_codes : iterable of int or None
        Status codes, in addition to 200, that are accepted and stop the
        retry loop immediately

    Returns
    -------
    requests.Response
        The first response whose status code is 200 or in `break_codes`.

    Raises
    ------
    ValueError
        If no accepted status code was returned once the retries are
        exhausted.
    """
    # 200 is always accepted; building a new set leaves the caller's
    # `break_codes` untouched and accepts any iterable (list, tuple, set).
    accepted_codes = {200}
    if break_codes is not None:
        accepted_codes.update(break_codes)

    n_retries = 0
    r = requests.get(url, headers = headers)

    # Bound the loop by `max_retries` rather than a hard-coded 5, so the
    # parameter (and the counts shown in the messages) is actually honoured.
    while r.status_code not in accepted_codes and n_retries < max_retries:
        logger.warning(f"Request for url: {url} returned status: {r.status_code} on attempt: {n_retries}/{max_retries}")
        n_retries += 1
        r = requests.get(url, headers = headers)

    if r.status_code not in accepted_codes:
        raise ValueError(f"Request for url: {url} failed with status: {r.status_code} after {max_retries} attempts")

    return r
|
||||
@@ -1,53 +1,60 @@
|
||||
import sys
|
||||
|
||||
from sqlalchemy import create_engine
|
||||
from loguru import logger
|
||||
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import (
|
||||
ScraperController,
|
||||
TelegramSnscrapeScraper)
|
||||
|
||||
logger.remove()
|
||||
logger.add(sys.stderr, level="INFO")
|
||||
logger.add("../russian_telegram_ingest.log", level = "INFO")
|
||||
|
||||
test_channels = [
|
||||
Channel(
|
||||
id=0,
|
||||
name="QAnon Россия",
|
||||
platform_id=-1001319637748,
|
||||
category="Qanon",
|
||||
followers=94048,
|
||||
platform="Telegram",
|
||||
url="https://t.me/qanonrus",
|
||||
screenname="qanonrus",
|
||||
country="RU",
|
||||
influencer=None,
|
||||
public=True,
|
||||
chat=False,
|
||||
notes=""),
|
||||
Channel(
|
||||
id=1,
|
||||
name="The Great Awakening | Q",
|
||||
platform_id=-1001325597521,
|
||||
category="Qanon",
|
||||
followers=5715,
|
||||
platform="Telegram",
|
||||
url="https://t.me/greatawakin",
|
||||
screenname="greatawakin",
|
||||
country="RU",
|
||||
influencer=None,
|
||||
public=True,
|
||||
chat=False,
|
||||
notes=""),
|
||||
Channel(
|
||||
id=2,
|
||||
name="Великое Пробуждение",
|
||||
platform_id=-1001285898079,
|
||||
category="Qanon",
|
||||
followers=5861,
|
||||
platform="Telegram",
|
||||
url="https://t.me/greatawakeningrus",
|
||||
screenname="greatawakeningrus",
|
||||
country="RU",
|
||||
influencer=None,
|
||||
public=True,
|
||||
chat=False,
|
||||
notes=""),
|
||||
# Channel(
|
||||
# id=0,
|
||||
# name="QAnon Россия",
|
||||
# platform_id=-1001319637748,
|
||||
# category="Qanon",
|
||||
# followers=94048,
|
||||
# platform="Telegram",
|
||||
# url="https://t.me/qanonrus",
|
||||
# screenname="qanonrus",
|
||||
# country="RU",
|
||||
# influencer=None,
|
||||
# public=True,
|
||||
# chat=False,
|
||||
# notes=""),
|
||||
# Channel(
|
||||
# id=1,
|
||||
# name="The Great Awakening | Q",
|
||||
# platform_id=-1001325597521,
|
||||
# category="Qanon",
|
||||
# followers=5715,
|
||||
# platform="Telegram",
|
||||
# url="https://t.me/greatawakin",
|
||||
# screenname="greatawakin",
|
||||
# country="RU",
|
||||
# influencer=None,
|
||||
# public=True,
|
||||
# chat=False,
|
||||
# notes=""),
|
||||
# Channel(
|
||||
# id=2,
|
||||
# name="Великое Пробуждение",
|
||||
# platform_id=-1001285898079,
|
||||
# category="Qanon",
|
||||
# followers=5861,
|
||||
# platform="Telegram",
|
||||
# url="https://t.me/greatawakeningrus",
|
||||
# screenname="greatawakeningrus",
|
||||
# country="RU",
|
||||
# influencer=None,
|
||||
# public=True,
|
||||
# chat=False,
|
||||
# notes=""),
|
||||
Channel(
|
||||
id=3,
|
||||
name="T🕊Редакция Президент Гордон🕊",
|
||||
Reference in New Issue
Block a user