From c2ae382a4e60782d073ea53616f4a6dd0a7361eb Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 16 Mar 2022 19:50:44 +0100 Subject: [PATCH] isolates html page generation logic so it can be reused --- archivers/base_archiver.py | 55 +++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 25 deletions(-) diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index d523e3f..687e762 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -6,7 +6,6 @@ from dataclasses import dataclass from abc import ABC, abstractmethod from urllib.parse import urlparse import hashlib -from loguru import logger import time import requests @@ -43,18 +42,41 @@ class Archiver(ABC): def get_netloc(self, url): return urlparse(url).netloc - def generate_media_page(self, urls, url, object): - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36' - } + def get_html_key(self, url): + return self.get_key(urlparse(url).path.replace("/", "_") + ".html") + def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None): page = f'''
{object}"
+ page += f""
+
+ page_key = self.get_key(urlparse(url).path.replace("/", "_") + ".html")
+ page_filename = 'tmp/' + page_key
+ page_cdn = self.storage.get_cdn_url(page_key)
+
+ with open(page_filename, "w") as f:
+ f.write(page)
+
+ page_hash = self.get_hash(page_filename)
+
+ self.storage.upload(page_filename, page_key, extra_args={
+ 'ACL': 'public-read', 'ContentType': 'text/html'})
+ return (page_cdn, page_hash, thumbnail)
+
+ def generate_media_page(self, urls, url, object):
+ headers = {
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
+ }
+
+ thumbnail = None
+ uploaded_media = []
for media_url in urls:
path = urlparse(media_url).path
key = self.get_key(path.replace("/", "_"))
@@ -73,26 +95,9 @@ class Archiver(ABC):
if thumbnail is None:
thumbnail = cdn_url
+ uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash})
- page += f'''{object}"
- page += f"