mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-08 03:18:28 +03:00
bug fixes
This commit is contained in:
@@ -229,14 +229,14 @@ class Archiver(ABC):
|
||||
|
||||
return (key_thumb, thumb_index_cdn_url)
|
||||
|
||||
def signal_retry_in(self, min_seconds=1800, max_seconds=7200):
|
||||
def signal_retry_in(self, min_seconds=1800, max_seconds=7200, **kwargs):
|
||||
"""
|
||||
sets state to retry in random between (min_seconds, max_seconds)
|
||||
"""
|
||||
now = datetime.datetime.now().timestamp()
|
||||
retry_at = int(now + randrange(min_seconds, max_seconds))
|
||||
logger.debug(f"signaling {retry_at=}")
|
||||
return ArchiveResult(status=f'retrying at {retry_at}')
|
||||
return ArchiveResult(status=f'retrying at {retry_at}', **kwargs)
|
||||
|
||||
def is_retry(status):
|
||||
return re.search(Archiver.retry_regex, status) is not None
|
||||
|
||||
@@ -80,7 +80,7 @@ class TelethonArchiver(Archiver):
|
||||
message = post.message
|
||||
for mp in media_posts:
|
||||
if len(mp.message) > len(message): message = mp.message
|
||||
filename_dest = os.path.join(Storage.TMP_FOLDER, f'{chat}_{group_id}', mp.id)
|
||||
filename_dest = os.path.join(Storage.TMP_FOLDER, f'{chat}_{group_id}', str(mp.id))
|
||||
filename = self.client.download_media(mp.media, filename_dest)
|
||||
key = filename.split(Storage.TMP_FOLDER)[1]
|
||||
self.storage.upload(filename, key)
|
||||
|
||||
@@ -27,6 +27,7 @@ class WaybackArchiver(Archiver):
|
||||
if req.status_code == 200:
|
||||
return self.if_archived_return_with_screenshot(url, archive_url, req=req, status='already archived')
|
||||
|
||||
screenshot = self.get_screenshot(url)
|
||||
logger.debug(f"POSTing {url=} to web.archive.org")
|
||||
ia_headers = {
|
||||
"Accept": "application/json",
|
||||
@@ -36,11 +37,13 @@ class WaybackArchiver(Archiver):
|
||||
|
||||
if r.status_code != 200:
|
||||
logger.warning(f"Internet archive failed with status of {r.status_code}")
|
||||
return ArchiveResult(status="Internet archive failed")
|
||||
return ArchiveResult(status="Internet archive failed", screenshot=screenshot)
|
||||
|
||||
if 'job_id' not in r.json() and 'message' in r.json():
|
||||
if "please try again" in str(r.json()).lower():
|
||||
return self.signal_retry_in(screenshot=screenshot)
|
||||
logger.warning(f"Internet archive failed json \n {r.json()}")
|
||||
return ArchiveResult(status=f"Internet archive failed: {r.json()['message']}")
|
||||
return ArchiveResult(status=f"Internet archive failed: {r.json()['message']}", screenshot=screenshot)
|
||||
|
||||
job_id = r.json()['job_id']
|
||||
logger.debug(f"GETting status for {job_id=} on {url=}")
|
||||
@@ -59,18 +62,19 @@ class WaybackArchiver(Archiver):
|
||||
retries += 1
|
||||
|
||||
if status_r.status_code != 200:
|
||||
return ArchiveResult(status="Internet archive failed")
|
||||
return ArchiveResult(status="Internet archive failed", screenshot=screenshot)
|
||||
|
||||
status_json = status_r.json()
|
||||
if status_json['status'] != 'success':
|
||||
logger.info(f'please try again" in str(status_json).lower(): {("please try again" in str(status_json).lower())}')
|
||||
if "please try again" in str(status_json).lower():
|
||||
return self.signal_retry_in()
|
||||
return ArchiveResult(status='Internet Archive failed: ' + str(status_json))
|
||||
return self.signal_retry_in(screenshot=screenshot)
|
||||
return ArchiveResult(status='Internet Archive failed: ' + str(status_json), screenshot=screenshot)
|
||||
|
||||
archive_url = f"https://web.archive.org/web/{status_json['timestamp']}/{status_json['original_url']}"
|
||||
return self.if_archived_return_with_screenshot(archive_url)
|
||||
|
||||
def if_archived_return_with_screenshot(self, url, archive_url, req=None, status='success'):
|
||||
def if_archived_return_with_screenshot(self, url, archive_url, screenshot=None, req=None, status='success'):
|
||||
try:
|
||||
if req is None:
|
||||
req = requests.get(archive_url)
|
||||
@@ -80,6 +84,6 @@ class WaybackArchiver(Archiver):
|
||||
title = 'Could not get title'
|
||||
except:
|
||||
title = "Could not get title"
|
||||
screenshot = self.get_screenshot(url)
|
||||
screenshot = screenshot or self.get_screenshot(url)
|
||||
self.seen_urls[url] = ArchiveResult(status=status, cdn_url=archive_url, title=title, screenshot=screenshot)
|
||||
return self.seen_urls[url]
|
||||
|
||||
@@ -220,9 +220,9 @@ class Config:
|
||||
"selenium_config": asdict(self.selenium_config),
|
||||
"selenium_webdriver": self.webdriver != None,
|
||||
"s3_config": hasattr(self, "s3_config"),
|
||||
"s3_private": getattr_or(getattr(self, "s3_config", {}), "private", None),
|
||||
"gd_config": hasattr(self, "gd_config"),
|
||||
"local_config": hasattr(self, "local_config"),
|
||||
"s3_private": getattr_or(getattr(self, "s3_config", {}), "private", None),
|
||||
"wayback_config": self.wayback_config != None,
|
||||
"telegram_config": self.telegram_config != None,
|
||||
"gsheets_client": self.gsheets_client != None,
|
||||
|
||||
Reference in New Issue
Block a user