renaming and code improvements to json_enricher

This commit is contained in:
msramalho
2025-06-17 16:06:04 +01:00
parent 664ee8d037
commit 2f1a07abbf
5 changed files with 23 additions and 46 deletions

View File

@@ -0,0 +1 @@
from .json_enricher import JsonEnricher

View File

@@ -0,0 +1,16 @@
{
"name": "JSON Enricher",
"type": ["enricher"],
"requires_setup": True,
"dependencies": {
"python": ["loguru"],
},
"configs": {},
"description": """
Writes all archiving process metadata to a JSON file so it can be parsed by other tools. As this is an Enricher, it will not contain the final stored URLs.
WARNING: The resulting JSON may reveal sensitive information about the computer and settings in which the archiving process was run.
""",
}

View File

@@ -5,17 +5,15 @@ import os
from auto_archiver.core import Enricher
from auto_archiver.core import Media, Metadata
class MetadataJsonEnricher(Enricher):
def __init__(self):
super().__init__()
class JsonEnricher(Enricher):
def enrich(self, to_enrich: Metadata) -> None:
url = to_enrich.get_url()
logger.debug(f"Metadata JSON Enricher for {url=}")
logger.debug(f"JSON Enricher for {url=}")
item_path = os.path.join(self.tmp_dir, f"metadata.json")
item_path = os.path.join(self.tmp_dir, "metadata.json")
with open(item_path, mode="w", encoding="utf-8") as outf:
json.dump(to_enrich.to_dict(), outf, indent=4, default=str)
to_enrich.add_media(Media(filename=item_path), id="metadata_json")
json.dump(to_enrich.to_dict(), outf, indent=4, default=str, ensure_ascii=False)
to_enrich.add_media(Media(filename=item_path), id="metadata_json")

View File

@@ -1 +0,0 @@
from .metadata_json_enricher import MetadataJsonEnricher

View File

@@ -1,37 +0,0 @@
{
"name": "Metadata JSON Enricher",
"type": ["enricher"],
"requires_setup": True,
"dependencies": {
"python": ["loguru"],
},
"configs": {
# "width": {"default": 1280, "type": "int", "help": "width of the screenshots"},
# "height": {"default": 1024, "type": "int", "help": "height of the screenshots"},
# "timeout": {"default": 60, "type": "int", "help": "timeout for taking the screenshot"},
# "sleep_before_screenshot": {
# "default": 4,
# "type": "int",
# "help": "seconds to wait for the pages to load before taking screenshot",
# },
# "http_proxy": {
# "default": "",
# "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port",
# },
# "save_to_pdf": {
# "default": False,
# "type": "bool",
# "help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter",
# },
# "print_options": {
# "default": {},
# "help": "options to pass to the pdf printer, in JSON format. See https://www.selenium.dev/documentation/webdriver/interactions/print_page/ for more information",
# "type": "json_loader",
# },
},
"description": """
Writes all the metadata to a json file so can be parsed by other tools.
""",
}