wacz in gsheets

2026-06-13 05:38:29 +03:00 · 2023-02-02 12:41:06 +00:00
parent 77a8c290f7
commit 9bcca427a0
2 changed files with 8 additions and 10 deletions
--- a/src/auto_archiver/databases/gsheet_db.py
+++ b/src/auto_archiver/databases/gsheet_db.py
@@ -1,5 +1,6 @@
 from typing import Union, Tuple
 import datetime
+from urllib.parse import quote
 # from metadata import Metadata
 from loguru import logger
@@ -45,7 +46,7 @@ class GsheetsDb(Database):
    def done(self, item: Metadata) -> None:
        """archival result ready - should be saved to DB"""
-        logger.success(f"DONE {item}")
+        logger.success(f"DONE {item.get_url()}")
        gw, row = self._retrieve_gsheet(item)
        # self._safe_status_update(item, 'done')
@@ -68,16 +69,10 @@ class GsheetsDb(Database):
        batch_if_valid('timestamp', item.get_timestamp())
        if (screenshot := item.get_media_by_id("screenshot")):
            batch_if_valid('screenshot', "\n".join(screenshot.urls))
        # batch_if_valid('status', item.status)
-        # TODO: AFTER ENRICHMENTS
+        if (browsertrix := item.get_media_by_id("browsertrix")):
-        # batch_if_valid('hash', media.hash)
+            batch_if_valid('wacz', "\n".join(browsertrix.urls))
-        # batch_if_valid('thumbnail', result.thumbnail, f'=IMAGE("{result.thumbnail}")')
+            batch_if_valid('replaywebpage', "\n".join([f'https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}' for wacz in browsertrix.urls]))
        # batch_if_valid('thumbnail_index', result.thumbnail_index)
        # batch_if_valid('duration', result.duration, str(result.duration))
        # if result.wacz is not None:
        #     batch_if_valid('wacz', result.wacz)
        #     batch_if_valid('replaywebpage', f'https://replayweb.page/?source={quote(result.wacz)}#view=pages&url={quote(url)}')
        gw.batch_set_cell(cell_updates)
--- a/src/auto_archiver/enrichers/wacz_enricher.py
+++ b/src/auto_archiver/enrichers/wacz_enricher.py
@@ -20,6 +20,9 @@ class WaczEnricher(Enricher):
        return {
            "profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."},
            "timeout": {"default": 90, "help": "timeout for WACZ generation in seconds"},
+            # #TODO: make WACZ conditional because it is not useful for some URLs and takes a long time
+            # "enrich_if_success": {"default": True, 
+            # "help": "if False will not enrich when a previous archiver has worked successfully."},
        }
    def enrich(self, to_enrich: Metadata) -> bool: