From c4efa6e597287a5d1142ec30ad76346c9b59d0da Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 21 Jun 2022 15:39:13 +0200 Subject: [PATCH] adding thumbnails --- Pipfile.lock | 11 ++++------- archivers/base_archiver.py | 7 ++++--- archivers/vk_archiver.py | 22 +++++++++++++--------- 3 files changed, 21 insertions(+), 19 deletions(-) diff --git a/Pipfile.lock b/Pipfile.lock index 9cd48b3..77fc8f0 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1038,10 +1038,7 @@ "version": "==4.1.1" }, "urllib3": { - "extras": [ - "secure", - "socks" - ], + "extras": [], "hashes": [ "sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14", "sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e" @@ -1058,11 +1055,11 @@ }, "vk-url-scraper": { "hashes": [ - "sha256:1747e926dfa5f802b4960347db0d5f7425f69838d1444d2bbee6b5b168524e43", - "sha256:7539df9de4f6c70db303efc52557582eae7fc3c85b34dc7137e75d4928598078" + "sha256:181c8a4b69e395a68bdf00e3dc1717e5218960c9fda6e90eea9633ff26fc9257", + "sha256:9cfc6bc3d7259f18508c3822955efac21ff9bad5bd886010b10f098ea10ad551" ], "index": "pypi", - "version": "==0.2.4" + "version": "==0.3.2" }, "websockets": { "hashes": [ diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index 7e64078..3813c65 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -197,8 +197,9 @@ class Archiver(ABC): return self.storage.get_cdn_url(key) def get_thumbnails(self, filename, key, duration=None): - thumbnails_folder = filename.split('.')[0] + '/' - key_folder = key.split('.')[0] + '/' + thumbnails_folder = os.path.splitext(filename)[0] + os.path.sep + key_folder = key.split('.')[0] + os.path.sep + logger.info(f"{filename=} {thumbnails_folder=} {key_folder=} ") mkdir_if_not_exists(thumbnails_folder) @@ -222,7 +223,7 @@ class Archiver(ABC): for fname in thumbnails: if fname[-3:] == 'jpg': thumbnail_filename = thumbnails_folder + fname - key = key_folder + 
fname + key = os.path.join(key_folder, fname) self.storage.upload(thumbnail_filename, key) cdn_url = self.storage.get_cdn_url(key) diff --git a/archivers/vk_archiver.py b/archivers/vk_archiver.py index 8d4b195..c448367 100644 --- a/archivers/vk_archiver.py +++ b/archivers/vk_archiver.py @@ -1,4 +1,4 @@ -import re, json, mimetypes +import re, json, mimetypes, os from loguru import logger from vk_url_scraper import VkScraper, DateTimeEncoder @@ -28,10 +28,10 @@ class VkArchiver(Archiver): return False key = self.get_html_key(url) - if check_if_exists and self.storage.exists(key): - screenshot = self.get_screenshot(url) - cdn_url = self.storage.get_cdn_url(key) - return ArchiveResult(status="already archived", cdn_url=cdn_url, screenshot=screenshot) + # if check_if_exists and self.storage.exists(key): + # screenshot = self.get_screenshot(url) + # cdn_url = self.storage.get_cdn_url(key) + # return ArchiveResult(status="already archived", cdn_url=cdn_url, screenshot=screenshot) results = self.vks.scrape(url) # some urls can contain multiple wall/photo/... 
parts and all will be fetched if len(results) == 0: @@ -49,7 +49,7 @@ class VkArchiver(Archiver): urls_found.extend(attachments) # we don't call generate_media_page which downloads urls because it cannot download vk video urls - thumbnail = None + thumbnail, thumbnail_index = None, None uploaded_media = [] filenames = self.vks.download_media(results, Storage.TMP_FOLDER) for filename in filenames: @@ -58,12 +58,16 @@ class VkArchiver(Archiver): hash = self.get_hash(filename) cdn_url = self.storage.get_cdn_url(key) try: - if mimetypes.guess_type(filename)[0].split("/")[0] == "image" and thumbnail is None: + _type = mimetypes.guess_type(filename)[0].split("/")[0] + if _type == "image" and thumbnail is None: thumbnail = cdn_url - except: pass + if _type == "video" and (thumbnail is None or thumbnail_index is None): + thumbnail, thumbnail_index = self.get_thumbnails(filename, key) + except Exception as e: + logger.warning(f"failed to get thumb for {filename=} with {e=}") uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash}) page_cdn, page_hash, thumbnail = self.generate_media_page_html(url, uploaded_media, textual_output, thumbnail=thumbnail) # # if multiple wall/photos/videos are present the screenshot will only grab the 1st screenshot = self.get_screenshot(url) - return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=datetime, title=title) + return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, thumbnail_index=thumbnail_index, timestamp=datetime, title=title)