From bd5146ac3e9f8295ebcc01659dc052788e64eb91 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 8 Jun 2022 18:17:25 +0200 Subject: [PATCH] bug fixes --- archivers/base_archiver.py | 4 ++-- archivers/telethon_archiver.py | 2 +- archivers/wayback_archiver.py | 18 +++++++++++------- configs/config.py | 2 +- 4 files changed, 15 insertions(+), 11 deletions(-) diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index 4258cca..b3d872b 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -229,14 +229,14 @@ class Archiver(ABC): return (key_thumb, thumb_index_cdn_url) - def signal_retry_in(self, min_seconds=1800, max_seconds=7200): + def signal_retry_in(self, min_seconds=1800, max_seconds=7200, **kwargs): """ sets state to retry in random between (min_seconds, max_seconds) """ now = datetime.datetime.now().timestamp() retry_at = int(now + randrange(min_seconds, max_seconds)) logger.debug(f"signaling {retry_at=}") - return ArchiveResult(status=f'retrying at {retry_at}') + return ArchiveResult(status=f'retrying at {retry_at}', **kwargs) def is_retry(status): return re.search(Archiver.retry_regex, status) is not None diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py index 95bf288..18996d8 100644 --- a/archivers/telethon_archiver.py +++ b/archivers/telethon_archiver.py @@ -80,7 +80,7 @@ class TelethonArchiver(Archiver): message = post.message for mp in media_posts: if len(mp.message) > len(message): message = mp.message - filename_dest = os.path.join(Storage.TMP_FOLDER, f'{chat}_{group_id}', mp.id) + filename_dest = os.path.join(Storage.TMP_FOLDER, f'{chat}_{group_id}', str(mp.id)) filename = self.client.download_media(mp.media, filename_dest) key = filename.split(Storage.TMP_FOLDER)[1] self.storage.upload(filename, key) diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py index 6f04725..700194d 100644 --- a/archivers/wayback_archiver.py +++ b/archivers/wayback_archiver.py @@ -27,6 +27,7 @@ class WaybackArchiver(Archiver): if req.status_code == 200: return self.if_archived_return_with_screenshot(url, archive_url, req=req, status='already archived') + screenshot = self.get_screenshot(url) logger.debug(f"POSTing {url=} to web.archive.org") ia_headers = { "Accept": "application/json", @@ -36,11 +37,13 @@ class WaybackArchiver(Archiver): if r.status_code != 200: logger.warning(f"Internet archive failed with status of {r.status_code}") - return ArchiveResult(status="Internet archive failed") + return ArchiveResult(status="Internet archive failed", screenshot=screenshot) if 'job_id' not in r.json() and 'message' in r.json(): + if "please try again" in str(r.json()).lower(): + return self.signal_retry_in(screenshot=screenshot) logger.warning(f"Internet archive failed json \n {r.json()}") - return ArchiveResult(status=f"Internet archive failed: {r.json()['message']}") + return ArchiveResult(status=f"Internet archive failed: {r.json()['message']}", screenshot=screenshot) job_id = r.json()['job_id'] logger.debug(f"GETting status for {job_id=} on {url=}") @@ -59,18 +62,19 @@ class WaybackArchiver(Archiver): retries += 1 if status_r.status_code != 200: - return ArchiveResult(status="Internet archive failed") + return ArchiveResult(status="Internet archive failed", screenshot=screenshot) status_json = status_r.json() if status_json['status'] != 'success': + logger.info(f'please try again" in str(status_json).lower(): {("please try again" in str(status_json).lower())}') if "please try again" in str(status_json).lower(): - return self.signal_retry_in() - return ArchiveResult(status='Internet Archive failed: ' + str(status_json)) + return self.signal_retry_in(screenshot=screenshot) + return ArchiveResult(status='Internet Archive failed: ' + str(status_json), screenshot=screenshot) archive_url = f"https://web.archive.org/web/{status_json['timestamp']}/{status_json['original_url']}" return self.if_archived_return_with_screenshot(archive_url) - def if_archived_return_with_screenshot(self, url, archive_url, req=None, status='success'): + def if_archived_return_with_screenshot(self, url, archive_url, screenshot=None, req=None, status='success'): try: if req is None: req = requests.get(archive_url) @@ -80,6 +84,6 @@ class WaybackArchiver(Archiver): title = 'Could not get title' except: title = "Could not get title" - screenshot = self.get_screenshot(url) + screenshot = screenshot or self.get_screenshot(url) self.seen_urls[url] = ArchiveResult(status=status, cdn_url=archive_url, title=title, screenshot=screenshot) return self.seen_urls[url] diff --git a/configs/config.py b/configs/config.py index 0370020..70b2046 100644 --- a/configs/config.py +++ b/configs/config.py @@ -220,9 +220,9 @@ class Config: "selenium_config": asdict(self.selenium_config), "selenium_webdriver": self.webdriver != None, "s3_config": hasattr(self, "s3_config"), + "s3_private": getattr_or(getattr(self, "s3_config", {}), "private", None), "gd_config": hasattr(self, "gd_config"), "local_config": hasattr(self, "local_config"), - "s3_private": getattr_or(getattr(self, "s3_config", {}), "private", None), "wayback_config": self.wayback_config != None, "telegram_config": self.telegram_config != None, "gsheets_client": self.gsheets_client != None,