From 067e6d89542fede8a7b4f89d86f37732652fcda0 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Wed, 8 Jun 2022 13:39:52 +0200
Subject: [PATCH] retry mechanism

---
 archivers/base_archiver.py    | 41 +++++++++++++++++++++++++++++++++--
 archivers/wayback_archiver.py |  3 ++-
 auto_archive.py               | 16 ++++++++++----
 3 files changed, 53 insertions(+), 7 deletions(-)

diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py
index d3145e5..4258cca 100644
--- a/archivers/base_archiver.py
+++ b/archivers/base_archiver.py
@@ -1,7 +1,8 @@
-import os, datetime, shutil, hashlib, time, requests
+import os, datetime, shutil, hashlib, time, requests, re
 from dataclasses import dataclass
 from abc import ABC, abstractmethod
 from urllib.parse import urlparse
+from random import randrange
 
 import ffmpeg
 from loguru import logger
@@ -27,6 +28,7 @@ class ArchiveResult:
 
 class Archiver(ABC):
     name = "default"
+    retry_regex = r"retrying at (\d+)$"
 
     def __init__(self, storage: Storage, driver):
         self.storage = storage
@@ -95,7 +97,7 @@ class Archiver(ABC):
             key = self.get_key(path.replace("/", "_"))
             if '.' not in path:
                 key += '.jpg'
-                
+
             filename = os.path.join(Storage.TMP_FOLDER, key)
 
             d = requests.get(media_url, headers=headers)
@@ -226,3 +228,56 @@ class Archiver(ABC):
         thumb_index_cdn_url = self.storage.get_cdn_url(thumb_index)
 
         return (key_thumb, thumb_index_cdn_url)
+
+    def signal_retry_in(self, min_seconds=1800, max_seconds=7200):
+        """
+        returns an ArchiveResult whose status signals a retry at a random
+        unix timestamp in [now + min_seconds, now + max_seconds)
+
+        the status text must keep matching retry_regex, which the static
+        helpers below use to recognise and parse it
+        """
+        now = datetime.datetime.now().timestamp()
+        # randrange excludes max_seconds and needs min_seconds < max_seconds
+        retry_at = int(now + randrange(min_seconds, max_seconds))
+        logger.debug(f"signaling {retry_at=}")
+        return ArchiveResult(status=f'retrying at {retry_at}')
+
+    @staticmethod
+    def is_retry(status):
+        """
+        returns True if status contains the retry message from signal_retry_in
+        """
+        # non-string statuses (e.g. None) cannot be a retry message
+        if not isinstance(status, str):
+            return False
+        return re.search(Archiver.retry_regex, status) is not None
+
+    @staticmethod
+    def should_retry_from_status(status):
+        """
+        checks status against the message built in signal_retry_in
+        returns True if enough time has elapsed, False otherwise
+        (including when status is None or has no retry message)
+        """
+        # status may come straight from a sheet cell; re.search raises TypeError on None
+        if not isinstance(status, str):
+            return False
+        match = re.search(Archiver.retry_regex, status)
+        if match is None:
+            return False
+        retry_at = int(match.group(1))
+        now = datetime.datetime.now().timestamp()
+        should_retry = now >= retry_at
+        logger.debug(f"{should_retry=} as {now=} >= {retry_at=}")
+        return should_retry
+
+    @staticmethod
+    def remove_retry(status):
+        """
+        replaces the retry message in status with a final failure message
+        a status without a retry message is returned unchanged
+        """
+        new_status = re.sub(Archiver.retry_regex, "failed: too many retries", status)
+        logger.debug(f"removing retry message at {status=}, got {new_status=}")
+        return new_status
diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py
index 75bf50b..6f04725 100644
--- a/archivers/wayback_archiver.py
+++ b/archivers/wayback_archiver.py
@@ -63,7 +63,8 @@ class WaybackArchiver(Archiver):
 
         status_json = status_r.json()
         if status_json['status'] != 'success':
-            # TODO: if "please try again" in str(status_json).lower() then this can be retried in the future
+            if "please try again" in str(status_json).lower():
+                return self.signal_retry_in()
             return ArchiveResult(status='Internet Archive failed: ' + str(status_json))
 
         archive_url = f"https://web.archive.org/web/{status_json['timestamp']}/{status_json['original_url']}"
diff --git a/auto_archive.py b/auto_archive.py
index 75b105d..a8382b7 100644
--- a/auto_archive.py
+++ b/auto_archive.py
@@ -1,13 +1,15 @@
-import os, datetime, shutil, traceback
+import os, datetime, shutil, traceback, random
 
 from loguru import logger
 from slugify import slugify
 
-from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, WaybackArchiver, ArchiveResult
+from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, WaybackArchiver, ArchiveResult, Archiver
 from utils import GWorksheet, mkdir_if_not_exists, expand_url
 from configs import Config
 from storages import Storage
 
+random.seed()
+
 
 def update_sheet(gw, row, result: ArchiveResult):
     cell_updates = []
@@ -72,7 +74,10 @@ def process_sheet(c: Config):
             original_status = gw.get_cell(row, 'status')
             status = gw.get_cell(row, 'status', fresh=original_status in ['', None] and url != '')
 
-            if url == '' or status not in ['', None]: continue
+            is_retry = False
+            if url == '' or status not in ['', None]:
+                is_retry = Archiver.should_retry_from_status(status)
+                if not is_retry: continue
 
             # All checks done - archival process starts here
             gw.set_cell(row, 'status', 'Archive in progress')
@@ -85,9 +90,9 @@ def process_sheet(c: Config):
             # order matters, first to succeed excludes remaining
             active_archivers = [
                 TelethonArchiver(storage, c.webdriver, c.telegram_config),
-                TelegramArchiver(storage, c.webdriver),
                 TiktokArchiver(storage, c.webdriver),
                 YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie),
+                TelegramArchiver(storage, c.webdriver),
                 TwitterArchiver(storage, c.webdriver),
                 WaybackArchiver(storage, c.webdriver, c.wayback_config)
             ]
@@ -113,6 +118,9 @@ def process_sheet(c: Config):
                     if success:
                         logger.success(f'{archiver.name} succeeded on {row=}, {url=}')
                         break
+                    # only 1 retry possible for now
+                    if is_retry and Archiver.is_retry(result.status):
+                        result.status = Archiver.remove_retry(result.status)
                     logger.warning(f'{archiver.name} did not succeed on {row=}, final status: {result.status}')
 
             if result: