From b3adc5603a44ad29af12c91b78cbec53cfa330c3 Mon Sep 17 00:00:00 2001
From: Dave Mateer
Date: Tue, 17 Jun 2025 09:51:19 +0100
Subject: [PATCH] metadata.json hardcode in storage. add new metadata_json_enricher. log level change in orchestrator

---
Review notes (free text in the area between the three-dash separator and the
diffstat; not part of the commit message or of the change itself):

* Layout: the line breaks and indentation in this patch were reconstructed
  from a copy whose whitespace had been collapsed. Hunk line counts match
  their headers, but the leading whitespace on context lines and inside the
  manifest "description" string is a best guess - confirm against the target
  tree before applying.
* storage.py: the new `filename.endswith('metadata')` test runs before the
  `filename_generator` branches, so any media whose name ends in "metadata"
  is given the fixed name 'metadata' regardless of the configured generator.
  The in-code TODO already asks whether this should become a config option.
* __manifest__.py: every entry under "configs" is commented out and describes
  screenshot / webdriver / PDF options (width, height, http_proxy,
  save_to_pdf, print_options); it looks copied from another module and could
  probably be reduced to an empty dict.
* metadata_json_enricher.py: `f"metadata.json"` has no placeholders, and
  `__init__` only calls `super().__init__()`.
* __init__.py and metadata_json_enricher.py are both added without a trailing
  newline.
* orchestrator.py: the final "Processed N URL(s)" message moves from the
  success level to the info level.

 src/auto_archiver/core/orchestrator.py        |  2 +-
 src/auto_archiver/core/storage.py             |  7 +++-
 .../metadata_json_enricher/__init__.py        |  1 +
 .../metadata_json_enricher/__manifest__.py    | 37 +++++++++++++++++++
 .../metadata_json_enricher.py                 | 21 +++++++++++
 5 files changed, 66 insertions(+), 2 deletions(-)
 create mode 100644 src/auto_archiver/modules/metadata_json_enricher/__init__.py
 create mode 100644 src/auto_archiver/modules/metadata_json_enricher/__manifest__.py
 create mode 100644 src/auto_archiver/modules/metadata_json_enricher/metadata_json_enricher.py

diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py
index 00065e2..b45afc0 100644
--- a/src/auto_archiver/core/orchestrator.py
+++ b/src/auto_archiver/core/orchestrator.py
@@ -541,7 +541,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
                 yield self.feed_item(item)
                 url_count += 1
 
-        logger.success(f"Processed {url_count} URL(s)")
+        logger.info(f"Processed {url_count} URL(s)")
         self.cleanup()
 
     def feed_item(self, item: Metadata) -> Metadata:
diff --git a/src/auto_archiver/core/storage.py b/src/auto_archiver/core/storage.py
index 3205f5a..feb3d5f 100644
--- a/src/auto_archiver/core/storage.py
+++ b/src/auto_archiver/core/storage.py
@@ -100,7 +100,12 @@ class Storage(BaseModule):
 
         # Handle filename_generator logic
         filename_generator = self.filename_generator
-        if filename_generator == "random":
+        # DM 9th Jun 25 - special case for metadata.json file in metadata_json_enricher
+        # where we want the filename to remain metadata.json
+        # TODO - should this be a config option to keep the original filename? Is it useful anywhere else?
+        if filename.endswith('metadata'):
+            filename = 'metadata'
+        elif filename_generator == "random":
             filename = random_str(24)
         elif filename_generator == "static":
             # load the hash_enricher module
diff --git a/src/auto_archiver/modules/metadata_json_enricher/__init__.py b/src/auto_archiver/modules/metadata_json_enricher/__init__.py
new file mode 100644
index 0000000..4eed90b
--- /dev/null
+++ b/src/auto_archiver/modules/metadata_json_enricher/__init__.py
@@ -0,0 +1 @@
+from .metadata_json_enricher import MetadataJsonEnricher
\ No newline at end of file
diff --git a/src/auto_archiver/modules/metadata_json_enricher/__manifest__.py b/src/auto_archiver/modules/metadata_json_enricher/__manifest__.py
new file mode 100644
index 0000000..b737b16
--- /dev/null
+++ b/src/auto_archiver/modules/metadata_json_enricher/__manifest__.py
@@ -0,0 +1,37 @@
+{
+    "name": "Metadata JSON Enricher",
+    "type": ["enricher"],
+    "requires_setup": True,
+    "dependencies": {
+        "python": ["loguru"],
+    },
+    "configs": {
+        # "width": {"default": 1280, "type": "int", "help": "width of the screenshots"},
+        # "height": {"default": 1024, "type": "int", "help": "height of the screenshots"},
+        # "timeout": {"default": 60, "type": "int", "help": "timeout for taking the screenshot"},
+        # "sleep_before_screenshot": {
+        #     "default": 4,
+        #     "type": "int",
+        #     "help": "seconds to wait for the pages to load before taking screenshot",
+        # },
+        # "http_proxy": {
+        #     "default": "",
+        #     "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port",
+        # },
+        # "save_to_pdf": {
+        #     "default": False,
+        #     "type": "bool",
+        #     "help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter",
+        # },
+        # "print_options": {
+        #     "default": {},
+        #     "help": "options to pass to the pdf printer, in JSON format. See https://www.selenium.dev/documentation/webdriver/interactions/print_page/ for more information",
+        #     "type": "json_loader",
+        # },
+    },
+    "description": """
+
+    Writes all the metadata to a json file so can be parsed by other tools.
+
+    """,
+}
diff --git a/src/auto_archiver/modules/metadata_json_enricher/metadata_json_enricher.py b/src/auto_archiver/modules/metadata_json_enricher/metadata_json_enricher.py
new file mode 100644
index 0000000..312f922
--- /dev/null
+++ b/src/auto_archiver/modules/metadata_json_enricher/metadata_json_enricher.py
@@ -0,0 +1,21 @@
+import json
+from loguru import logger
+import os
+
+from auto_archiver.core import Enricher
+from auto_archiver.core import Media, Metadata
+
+class MetadataJsonEnricher(Enricher):
+    def __init__(self):
+        super().__init__()
+
+    def enrich(self, to_enrich: Metadata) -> None:
+        url = to_enrich.get_url()
+
+        logger.debug(f"Metadata JSON Enricher for {url=}")
+
+        item_path = os.path.join(self.tmp_dir, f"metadata.json")
+        with open(item_path, mode="w", encoding="utf-8") as outf:
+            json.dump(to_enrich.to_dict(), outf, indent=4, default=str)
+
+        to_enrich.add_media(Media(filename=item_path), id="metadata_json")
\ No newline at end of file