From 067e6d89542fede8a7b4f89d86f37732652fcda0 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 8 Jun 2022 13:39:52 +0200 Subject: [PATCH] retry mechanism --- archivers/base_archiver.py | 41 +++++++++++++++++++++++++++++++++-- archivers/wayback_archiver.py | 3 ++- auto_archive.py | 16 ++++++++++---- 3 files changed, 53 insertions(+), 7 deletions(-) diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index d3145e5..4258cca 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -1,7 +1,8 @@ -import os, datetime, shutil, hashlib, time, requests +import os, datetime, shutil, hashlib, time, requests, re from dataclasses import dataclass from abc import ABC, abstractmethod from urllib.parse import urlparse +from random import randrange import ffmpeg from loguru import logger @@ -27,6 +28,7 @@ class ArchiveResult: class Archiver(ABC): name = "default" + retry_regex = r"retrying at (\d+)$" def __init__(self, storage: Storage, driver): self.storage = storage @@ -95,7 +97,7 @@ class Archiver(ABC): key = self.get_key(path.replace("/", "_")) if '.' not in path: key += '.jpg' - + filename = os.path.join(Storage.TMP_FOLDER, key) d = requests.get(media_url, headers=headers) @@ -226,3 +228,38 @@ class Archiver(ABC): thumb_index_cdn_url = self.storage.get_cdn_url(thumb_index) return (key_thumb, thumb_index_cdn_url) + + def signal_retry_in(self, min_seconds=1800, max_seconds=7200): + """ + sets state to retry in random between (min_seconds, max_seconds) + """ + now = datetime.datetime.now().timestamp() + retry_at = int(now + randrange(min_seconds, max_seconds)) + logger.debug(f"signaling {retry_at=}") + return ArchiveResult(status=f'retrying at {retry_at}') + + def is_retry(status): + return re.search(Archiver.retry_regex, status) is not None + + def should_retry_from_status(status): + """ + checks status against message in signal_retry_in + returns true if enough time has elapsed, false otherwise + """ + match = re.search(Archiver.retry_regex, status) + if match: + retry_at = int(match.group(1)) + now = datetime.datetime.now().timestamp() + should_retry = now >= retry_at + logger.debug(f"{should_retry=} as {now=} >= {retry_at=}") + return should_retry + return False + + def remove_retry(status): + """ + transforms the status from retry into something else + """ + new_status = re.sub(Archiver.retry_regex, "failed: too many retries", status, 0) + logger.debug(f"removing retry message at {status=}, got {new_status=}") + return new_status + diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py index 75bf50b..6f04725 100644 --- a/archivers/wayback_archiver.py +++ b/archivers/wayback_archiver.py @@ -63,7 +63,8 @@ class WaybackArchiver(Archiver): status_json = status_r.json() if status_json['status'] != 'success': - # TODO: if "please try again" in str(status_json).lower() then this can be retried in the future + if "please try again" in str(status_json).lower(): + return self.signal_retry_in() return ArchiveResult(status='Internet Archive failed: ' + str(status_json)) archive_url = f"https://web.archive.org/web/{status_json['timestamp']}/{status_json['original_url']}" diff --git a/auto_archive.py b/auto_archive.py index 75b105d..a8382b7 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -1,13 +1,15 @@ -import os, datetime, shutil, traceback +import os, datetime, shutil, traceback, random from loguru import logger from slugify import slugify -from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, WaybackArchiver, ArchiveResult +from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, WaybackArchiver, ArchiveResult, Archiver from utils import GWorksheet, mkdir_if_not_exists, expand_url from configs import Config from storages import Storage +random.seed() + def update_sheet(gw, row, result: ArchiveResult): cell_updates = [] @@ -72,7 +74,10 @@ def process_sheet(c: Config): original_status = gw.get_cell(row, 'status') status = gw.get_cell(row, 'status', fresh=original_status in ['', None] and url != '') - if url == '' or status not in ['', None]: continue + is_retry = False + if url == '' or status not in ['', None]: + is_retry = Archiver.should_retry_from_status(status) + if not is_retry: continue # All checks done - archival process starts here gw.set_cell(row, 'status', 'Archive in progress') @@ -85,9 +90,9 @@ def process_sheet(c: Config): # order matters, first to succeed excludes remaining active_archivers = [ TelethonArchiver(storage, c.webdriver, c.telegram_config), - TelegramArchiver(storage, c.webdriver), TiktokArchiver(storage, c.webdriver), YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie), + TelegramArchiver(storage, c.webdriver), TwitterArchiver(storage, c.webdriver), WaybackArchiver(storage, c.webdriver, c.wayback_config) ] @@ -113,6 +118,9 @@ def process_sheet(c: Config): if success: logger.success(f'{archiver.name} succeeded on {row=}, {url=}') break + # only 1 retry possible for now + if is_retry and Archiver.is_retry(result.status): + result.status = Archiver.remove_retry(result.status) logger.warning(f'{archiver.name} did not succeed on {row=}, final status: {result.status}') if result: