From 9bcca427a0b59b4fe7cd7addea2b6309305285cd Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 2 Feb 2023 12:41:06 +0000 Subject: [PATCH] wacz in gsheets --- src/auto_archiver/databases/gsheet_db.py | 15 +++++---------- src/auto_archiver/enrichers/wacz_enricher.py | 3 +++ 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/src/auto_archiver/databases/gsheet_db.py b/src/auto_archiver/databases/gsheet_db.py index 8721725..8c711e8 100644 --- a/src/auto_archiver/databases/gsheet_db.py +++ b/src/auto_archiver/databases/gsheet_db.py @@ -1,5 +1,6 @@ from typing import Union, Tuple import datetime +from urllib.parse import quote # from metadata import Metadata from loguru import logger @@ -45,7 +46,7 @@ class GsheetsDb(Database): def done(self, item: Metadata) -> None: """archival result ready - should be saved to DB""" - logger.success(f"DONE {item}") + logger.success(f"DONE {item.get_url()}") gw, row = self._retrieve_gsheet(item) # self._safe_status_update(item, 'done') @@ -68,16 +69,10 @@ class GsheetsDb(Database): batch_if_valid('timestamp', item.get_timestamp()) if (screenshot := item.get_media_by_id("screenshot")): batch_if_valid('screenshot', "\n".join(screenshot.urls)) - # batch_if_valid('status', item.status) - # TODO: AFTER ENRICHMENTS - # batch_if_valid('hash', media.hash) - # batch_if_valid('thumbnail', result.thumbnail, f'=IMAGE("{result.thumbnail}")') - # batch_if_valid('thumbnail_index', result.thumbnail_index) - # batch_if_valid('duration', result.duration, str(result.duration)) - # if result.wacz is not None: - # batch_if_valid('wacz', result.wacz) - # batch_if_valid('replaywebpage', f'https://replayweb.page/?source={quote(result.wacz)}#view=pages&url={quote(url)}') + if (browsertrix := item.get_media_by_id("browsertrix")): + batch_if_valid('wacz', "\n".join(browsertrix.urls)) + batch_if_valid('replaywebpage', "\n".join([f'https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}' for wacz in browsertrix.urls])) gw.batch_set_cell(cell_updates) diff --git a/src/auto_archiver/enrichers/wacz_enricher.py b/src/auto_archiver/enrichers/wacz_enricher.py index 49bc6a8..9b69d91 100644 --- a/src/auto_archiver/enrichers/wacz_enricher.py +++ b/src/auto_archiver/enrichers/wacz_enricher.py @@ -20,6 +20,9 @@ class WaczEnricher(Enricher): return { "profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."}, "timeout": {"default": 90, "help": "timeout for WACZ generation in seconds"}, + # #TODO: make WACZ conditional because it is not useful for some URLs and takes a long time + # "enrich_if_success": {"default": True, + # "help": "if False will not enrich when a previous archiver has worked successfully."}, } def enrich(self, to_enrich: Metadata) -> bool: