From 2f1a07abbf3a5ab3cba763f8c05dc51bd8d5bd66 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Tue, 17 Jun 2025 16:06:04 +0100 Subject: [PATCH] renaming and code improvements to json_e richer --- .../modules/json_enricher/__init__.py | 1 + .../modules/json_enricher/__manifest__.py | 16 ++++++++ .../json_enricher.py} | 14 +++---- .../metadata_json_enricher/__init__.py | 1 - .../metadata_json_enricher/__manifest__.py | 37 ------------------- 5 files changed, 23 insertions(+), 46 deletions(-) create mode 100644 src/auto_archiver/modules/json_enricher/__init__.py create mode 100644 src/auto_archiver/modules/json_enricher/__manifest__.py rename src/auto_archiver/modules/{metadata_json_enricher/metadata_json_enricher.py => json_enricher/json_enricher.py} (63%) delete mode 100644 src/auto_archiver/modules/metadata_json_enricher/__init__.py delete mode 100644 src/auto_archiver/modules/metadata_json_enricher/__manifest__.py diff --git a/src/auto_archiver/modules/json_enricher/__init__.py b/src/auto_archiver/modules/json_enricher/__init__.py new file mode 100644 index 0000000..5e0a04f --- /dev/null +++ b/src/auto_archiver/modules/json_enricher/__init__.py @@ -0,0 +1 @@ +from .json_enricher import JsonEnricher diff --git a/src/auto_archiver/modules/json_enricher/__manifest__.py b/src/auto_archiver/modules/json_enricher/__manifest__.py new file mode 100644 index 0000000..b80f493 --- /dev/null +++ b/src/auto_archiver/modules/json_enricher/__manifest__.py @@ -0,0 +1,16 @@ +{ + "name": "JSON Enricher", + "type": ["enricher"], + "requires_setup": True, + "dependencies": { + "python": ["loguru"], + }, + "configs": {}, + "description": """ + + Writes all archiving process metadata to a JSON file so it can be parsed by other tools. As this is an Enricher, it will not contain the final stored URLs. + + WARNING: The resulting JSON may reveal sensitive information about the computer and settings in which the archiving process was run. + + """, +} diff --git a/src/auto_archiver/modules/metadata_json_enricher/metadata_json_enricher.py b/src/auto_archiver/modules/json_enricher/json_enricher.py similarity index 63% rename from src/auto_archiver/modules/metadata_json_enricher/metadata_json_enricher.py rename to src/auto_archiver/modules/json_enricher/json_enricher.py index 312f922..b0900b6 100644 --- a/src/auto_archiver/modules/metadata_json_enricher/metadata_json_enricher.py +++ b/src/auto_archiver/modules/json_enricher/json_enricher.py @@ -5,17 +5,15 @@ import os from auto_archiver.core import Enricher from auto_archiver.core import Media, Metadata -class MetadataJsonEnricher(Enricher): - def __init__(self): - super().__init__() +class JsonEnricher(Enricher): def enrich(self, to_enrich: Metadata) -> None: url = to_enrich.get_url() - logger.debug(f"Metadata JSON Enricher for {url=}") + logger.debug(f"JSON Enricher for {url=}") - item_path = os.path.join(self.tmp_dir, f"metadata.json") + item_path = os.path.join(self.tmp_dir, "metadata.json") with open(item_path, mode="w", encoding="utf-8") as outf: - json.dump(to_enrich.to_dict(), outf, indent=4, default=str) - - to_enrich.add_media(Media(filename=item_path), id="metadata_json") \ No newline at end of file + json.dump(to_enrich.to_dict(), outf, indent=4, default=str, ensure_ascii=False) + + to_enrich.add_media(Media(filename=item_path), id="metadata_json") diff --git a/src/auto_archiver/modules/metadata_json_enricher/__init__.py b/src/auto_archiver/modules/metadata_json_enricher/__init__.py deleted file mode 100644 index 4eed90b..0000000 --- a/src/auto_archiver/modules/metadata_json_enricher/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .metadata_json_enricher import MetadataJsonEnricher \ No newline at end of file diff --git a/src/auto_archiver/modules/metadata_json_enricher/__manifest__.py b/src/auto_archiver/modules/metadata_json_enricher/__manifest__.py deleted file mode 100644 index b737b16..0000000 --- a/src/auto_archiver/modules/metadata_json_enricher/__manifest__.py +++ /dev/null @@ -1,37 +0,0 @@ -{ - "name": "Metadata JSON Enricher", - "type": ["enricher"], - "requires_setup": True, - "dependencies": { - "python": ["loguru"], - }, - "configs": { - # "width": {"default": 1280, "type": "int", "help": "width of the screenshots"}, - # "height": {"default": 1024, "type": "int", "help": "height of the screenshots"}, - # "timeout": {"default": 60, "type": "int", "help": "timeout for taking the screenshot"}, - # "sleep_before_screenshot": { - # "default": 4, - # "type": "int", - # "help": "seconds to wait for the pages to load before taking screenshot", - # }, - # "http_proxy": { - # "default": "", - # "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port", - # }, - # "save_to_pdf": { - # "default": False, - # "type": "bool", - # "help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter", - # }, - # "print_options": { - # "default": {}, - # "help": "options to pass to the pdf printer, in JSON format. See https://www.selenium.dev/documentation/webdriver/interactions/print_page/ for more information", - # "type": "json_loader", - # }, - }, - "description": """ - - Writes all the metadata to a json file so can be parsed by other tools. - - """, -}