diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index d523e3f..687e762 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -6,7 +6,6 @@ from dataclasses import dataclass from abc import ABC, abstractmethod from urllib.parse import urlparse import hashlib -from loguru import logger import time import requests @@ -43,18 +42,41 @@ class Archiver(ABC): def get_netloc(self, url): return urlparse(url).netloc - def generate_media_page(self, urls, url, object): - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36' - } + def get_html_key(self, url): + return self.get_key(urlparse(url).path.replace("/", "_") + ".html") + def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None): page = f'''{url}

Archived media from {self.name}

{url}

{self.name} object data:

{object}" + page += f"" + + page_key = self.get_key(urlparse(url).path.replace("/", "_") + ".html") + page_filename = 'tmp/' + page_key + page_cdn = self.storage.get_cdn_url(page_key) + + with open(page_filename, "w") as f: + f.write(page) + + page_hash = self.get_hash(page_filename) + + self.storage.upload(page_filename, page_key, extra_args={ + 'ACL': 'public-read', 'ContentType': 'text/html'}) + return (page_cdn, page_hash, thumbnail) + + def generate_media_page(self, urls, url, object): + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36' + } + + thumbnail = None + uploaded_media = [] for media_url in urls: path = urlparse(media_url).path key = self.get_key(path.replace("/", "_")) @@ -73,26 +95,9 @@ class Archiver(ABC): if thumbnail is None: thumbnail = cdn_url + uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash}) - page += f'''
  • {media_url}: {hash}
  • ''' - - page += f"

    {self.name} object data:

    {object}" - page += f"" - - page_key = self.get_key(urlparse(url).path.replace("/", "_") + ".html") - page_filename = 'tmp/' + page_key - page_cdn = self.storage.get_cdn_url(page_key) - - with open(page_filename, "w") as f: - f.write(page) - - page_hash = self.get_hash(page_filename) - - self.storage.upload(page_filename, page_key, extra_args={ - 'ACL': 'public-read', 'ContentType': 'text/html'}) - - return (page_cdn, page_hash, thumbnail) - + return self.generate_media_page_html(url, uploaded_media, object, thumbnail=thumbnail) def get_key(self, filename): """ returns a key in the format "[archiverName]_[filename]" includes extension