mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-11 20:58:29 +03:00
refactor
This commit is contained in:
@@ -35,6 +35,9 @@ class Archiver(ABC):
|
||||
def __str__(self):
|
||||
return self.__class__.__name__
|
||||
|
||||
def __repr__(self):
|
||||
return self.__str__()
|
||||
|
||||
@abstractmethod
|
||||
def download(self, url, check_if_exists=False): pass
|
||||
|
||||
@@ -134,6 +137,7 @@ class Archiver(ABC):
|
||||
return hash.hexdigest()
|
||||
|
||||
def get_screenshot(self, url):
|
||||
logger.debug(f"getting screenshot for {url=}")
|
||||
key = self.get_key(urlparse(url).path.replace(
|
||||
"/", "_") + datetime.datetime.utcnow().isoformat().replace(" ", "_") + ".png")
|
||||
filename = Storage.TMP_FOLDER + key
|
||||
|
||||
@@ -18,8 +18,8 @@ class TiktokArchiver(Archiver):
|
||||
try:
|
||||
info = tiktok_downloader.info_post(url)
|
||||
key = self.get_key(f'{info.id}.mp4')
|
||||
cdn_url = self.storage.get_cdn_url(key)
|
||||
filename = Storage.TMP_FOLDER + key
|
||||
logger.info(f'found video {key=}')
|
||||
|
||||
if check_if_exists and self.storage.exists(key):
|
||||
status = 'already archived'
|
||||
@@ -28,13 +28,15 @@ class TiktokArchiver(Archiver):
|
||||
|
||||
if len(media) <= 0:
|
||||
if status == 'already archived':
|
||||
return ArchiveResult(status='Could not download media, but already archived', cdn_url=cdn_url)
|
||||
return ArchiveResult(status='Could not download media, but already archived', cdn_url=self.storage.get_cdn_url(key))
|
||||
else:
|
||||
return ArchiveResult(status='Could not download media')
|
||||
|
||||
logger.info(f'downloading video {key=}')
|
||||
media[0].download(filename)
|
||||
|
||||
if status != 'already archived':
|
||||
logger.info(f'uploading video {key=}')
|
||||
self.storage.upload(filename, key)
|
||||
|
||||
try:
|
||||
@@ -50,6 +52,7 @@ class TiktokArchiver(Archiver):
|
||||
try: os.remove(filename)
|
||||
except FileNotFoundError:
|
||||
logger.info(f'tmp file not found thus not deleted {filename}')
|
||||
cdn_url = self.storage.get_cdn_url(key)
|
||||
|
||||
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb,
|
||||
thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat(),
|
||||
|
||||
@@ -8,26 +8,31 @@ from .base_archiver import Archiver, ArchiveResult
|
||||
from configs import WaybackConfig
|
||||
|
||||
|
||||
|
||||
class WaybackArchiver(Archiver):
|
||||
name = "wayback"
|
||||
|
||||
def __init__(self, storage: Storage, driver, config: WaybackConfig):
|
||||
super(WaybackArchiver, self).__init__(storage, driver)
|
||||
self.config = config
|
||||
# TODO: this logic should live at the auto-archiver level
|
||||
self.seen_urls = {}
|
||||
|
||||
def download(self, url, check_if_exists=False):
|
||||
if check_if_exists and url in self.seen_urls:
|
||||
return self.seen_urls[url]
|
||||
if check_if_exists:
|
||||
if url in self.seen_urls: return self.seen_urls[url]
|
||||
|
||||
logger.debug(f"checking if {url=} already on archive.org")
|
||||
archive_url = f"https://web.archive.org/web/{url}"
|
||||
req = requests.get(archive_url)
|
||||
if req.status_code == 200:
|
||||
return self.if_archived_return_with_screenshot(url, archive_url, req=req, status='already archived')
|
||||
|
||||
logger.debug(f"POSTing {url=} to web.archive.org")
|
||||
ia_headers = {
|
||||
"Accept": "application/json",
|
||||
"Authorization": f"LOW {self.config.key}:{self.config.secret}"
|
||||
}
|
||||
|
||||
r = requests.post(
|
||||
'https://web.archive.org/save/', headers=ia_headers, data={'url': url})
|
||||
r = requests.post('https://web.archive.org/save/', headers=ia_headers, data={'url': url})
|
||||
|
||||
if r.status_code != 200:
|
||||
logger.warning(f"Internet archive failed with status of {r.status_code}")
|
||||
@@ -38,47 +43,41 @@ class WaybackArchiver(Archiver):
|
||||
return ArchiveResult(status=f"Internet archive failed: {r.json()['message']}")
|
||||
|
||||
job_id = r.json()['job_id']
|
||||
|
||||
status_r = requests.get('https://web.archive.org/save/status/' + job_id, headers=ia_headers)
|
||||
|
||||
logger.debug(f"GETting status for {job_id=} on {url=}")
|
||||
status_r = requests.get(f'https://web.archive.org/save/status/{job_id}', headers=ia_headers)
|
||||
retries = 0
|
||||
|
||||
# TODO: make the job queue parallel -> consider propagation of results back to sheet though
|
||||
# wait 90-120 seconds for the archive job to finish
|
||||
while (status_r.status_code != 200 or status_r.json()['status'] == 'pending') and retries < 30:
|
||||
time.sleep(3)
|
||||
|
||||
try:
|
||||
status_r = requests.get(
|
||||
'https://web.archive.org/save/status/' + job_id, headers=ia_headers)
|
||||
logger.debug(f"GETting status for {job_id=} on {url=} [{retries=}]")
|
||||
status_r = requests.get(f'https://web.archive.org/save/status/{job_id}', headers=ia_headers)
|
||||
except:
|
||||
time.sleep(1)
|
||||
|
||||
retries += 1
|
||||
|
||||
if status_r.status_code != 200:
|
||||
return ArchiveResult(status="Internet archive failed")
|
||||
|
||||
status_json = status_r.json()
|
||||
|
||||
if status_json['status'] != 'success':
|
||||
return ArchiveResult(status='Internet Archive failed: ' + str(status_json))
|
||||
|
||||
archive_url = 'https://web.archive.org/web/' + \
|
||||
status_json['timestamp'] + '/' + status_json['original_url']
|
||||
archive_url = f"https://web.archive.org/web/{status_json['timestamp']}/{status_json['original_url']}"
|
||||
return self.if_archived_return_with_screenshot(archive_url)
|
||||
|
||||
def if_archived_return_with_screenshot(self, url, archive_url, req=None, status='success'):
|
||||
try:
|
||||
r = requests.get(archive_url)
|
||||
|
||||
parsed = BeautifulSoup(r.content, 'html.parser')
|
||||
|
||||
if req is None:
|
||||
req = requests.get(archive_url)
|
||||
parsed = BeautifulSoup(req.content, 'html.parser')
|
||||
title = parsed.find_all('title')[0].text
|
||||
|
||||
if title == 'Wayback Machine':
|
||||
title = 'Could not get title'
|
||||
except:
|
||||
title = "Could not get title"
|
||||
|
||||
screenshot = self.get_screenshot(url)
|
||||
result = ArchiveResult(status='success', cdn_url=archive_url, title=title, screenshot=screenshot)
|
||||
self.seen_urls[url] = result
|
||||
return result
|
||||
self.seen_urls[url] = ArchiveResult(status=status, cdn_url=archive_url, title=title, screenshot=screenshot)
|
||||
return self.seen_urls[url]
|
||||
|
||||
Reference in New Issue
Block a user