mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-13 05:38:29 +03:00
wacz in gsheets
This commit is contained in:
@@ -1,5 +1,6 @@
|
|||||||
from typing import Union, Tuple
|
from typing import Union, Tuple
|
||||||
import datetime
|
import datetime
|
||||||
|
from urllib.parse import quote
|
||||||
|
|
||||||
# from metadata import Metadata
|
# from metadata import Metadata
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
@@ -45,7 +46,7 @@ class GsheetsDb(Database):
|
|||||||
|
|
||||||
def done(self, item: Metadata) -> None:
|
def done(self, item: Metadata) -> None:
|
||||||
"""archival result ready - should be saved to DB"""
|
"""archival result ready - should be saved to DB"""
|
||||||
logger.success(f"DONE {item}")
|
logger.success(f"DONE {item.get_url()}")
|
||||||
gw, row = self._retrieve_gsheet(item)
|
gw, row = self._retrieve_gsheet(item)
|
||||||
# self._safe_status_update(item, 'done')
|
# self._safe_status_update(item, 'done')
|
||||||
|
|
||||||
@@ -68,16 +69,10 @@ class GsheetsDb(Database):
|
|||||||
batch_if_valid('timestamp', item.get_timestamp())
|
batch_if_valid('timestamp', item.get_timestamp())
|
||||||
if (screenshot := item.get_media_by_id("screenshot")):
|
if (screenshot := item.get_media_by_id("screenshot")):
|
||||||
batch_if_valid('screenshot', "\n".join(screenshot.urls))
|
batch_if_valid('screenshot', "\n".join(screenshot.urls))
|
||||||
# batch_if_valid('status', item.status)
|
|
||||||
|
|
||||||
# TODO: AFTER ENRICHMENTS
|
if (browsertrix := item.get_media_by_id("browsertrix")):
|
||||||
# batch_if_valid('hash', media.hash)
|
batch_if_valid('wacz', "\n".join(browsertrix.urls))
|
||||||
# batch_if_valid('thumbnail', result.thumbnail, f'=IMAGE("{result.thumbnail}")')
|
batch_if_valid('replaywebpage', "\n".join([f'https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}' for wacz in browsertrix.urls]))
|
||||||
# batch_if_valid('thumbnail_index', result.thumbnail_index)
|
|
||||||
# batch_if_valid('duration', result.duration, str(result.duration))
|
|
||||||
# if result.wacz is not None:
|
|
||||||
# batch_if_valid('wacz', result.wacz)
|
|
||||||
# batch_if_valid('replaywebpage', f'https://replayweb.page/?source={quote(result.wacz)}#view=pages&url={quote(url)}')
|
|
||||||
|
|
||||||
gw.batch_set_cell(cell_updates)
|
gw.batch_set_cell(cell_updates)
|
||||||
|
|
||||||
|
|||||||
@@ -20,6 +20,9 @@ class WaczEnricher(Enricher):
|
|||||||
return {
|
return {
|
||||||
"profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."},
|
"profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."},
|
||||||
"timeout": {"default": 90, "help": "timeout for WACZ generation in seconds"},
|
"timeout": {"default": 90, "help": "timeout for WACZ generation in seconds"},
|
||||||
|
# #TODO: make WACZ conditional because it is not useful for some URLs and takes a long time
|
||||||
|
# "enrich_if_success": {"default": True,
|
||||||
|
# "help": "if False will not enrich when a previous archiver has worked successfully."},
|
||||||
}
|
}
|
||||||
|
|
||||||
def enrich(self, to_enrich: Metadata) -> bool:
|
def enrich(self, to_enrich: Metadata) -> bool:
|
||||||
|
|||||||
Reference in New Issue
Block a user