mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-07 19:08:30 +03:00
renaming and code improvements to json_enricher
This commit is contained in:
1
src/auto_archiver/modules/json_enricher/__init__.py
Normal file
1
src/auto_archiver/modules/json_enricher/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
from .json_enricher import JsonEnricher
|
||||
16
src/auto_archiver/modules/json_enricher/__manifest__.py
Normal file
16
src/auto_archiver/modules/json_enricher/__manifest__.py
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"name": "JSON Enricher",
|
||||
"type": ["enricher"],
|
||||
"requires_setup": True,
|
||||
"dependencies": {
|
||||
"python": ["loguru"],
|
||||
},
|
||||
"configs": {},
|
||||
"description": """
|
||||
|
||||
Writes all archiving process metadata to a JSON file so it can be parsed by other tools. As this is an Enricher, it will not contain the final stored URLs.
|
||||
|
||||
WARNING: The resulting JSON may reveal sensitive information about the computer and settings in which the archiving process was run.
|
||||
|
||||
""",
|
||||
}
|
||||
@@ -5,17 +5,15 @@ import os
|
||||
from auto_archiver.core import Enricher
|
||||
from auto_archiver.core import Media, Metadata
|
||||
|
||||
class MetadataJsonEnricher(Enricher):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
class JsonEnricher(Enricher):
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
url = to_enrich.get_url()
|
||||
|
||||
logger.debug(f"Metadata JSON Enricher for {url=}")
|
||||
logger.debug(f"JSON Enricher for {url=}")
|
||||
|
||||
item_path = os.path.join(self.tmp_dir, f"metadata.json")
|
||||
item_path = os.path.join(self.tmp_dir, "metadata.json")
|
||||
with open(item_path, mode="w", encoding="utf-8") as outf:
|
||||
json.dump(to_enrich.to_dict(), outf, indent=4, default=str)
|
||||
|
||||
to_enrich.add_media(Media(filename=item_path), id="metadata_json")
|
||||
json.dump(to_enrich.to_dict(), outf, indent=4, default=str, ensure_ascii=False)
|
||||
|
||||
to_enrich.add_media(Media(filename=item_path), id="metadata_json")
|
||||
@@ -1 +0,0 @@
|
||||
from .metadata_json_enricher import MetadataJsonEnricher
|
||||
@@ -1,37 +0,0 @@
|
||||
{
|
||||
"name": "Metadata JSON Enricher",
|
||||
"type": ["enricher"],
|
||||
"requires_setup": True,
|
||||
"dependencies": {
|
||||
"python": ["loguru"],
|
||||
},
|
||||
"configs": {
|
||||
# "width": {"default": 1280, "type": "int", "help": "width of the screenshots"},
|
||||
# "height": {"default": 1024, "type": "int", "help": "height of the screenshots"},
|
||||
# "timeout": {"default": 60, "type": "int", "help": "timeout for taking the screenshot"},
|
||||
# "sleep_before_screenshot": {
|
||||
# "default": 4,
|
||||
# "type": "int",
|
||||
# "help": "seconds to wait for the pages to load before taking screenshot",
|
||||
# },
|
||||
# "http_proxy": {
|
||||
# "default": "",
|
||||
# "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port",
|
||||
# },
|
||||
# "save_to_pdf": {
|
||||
# "default": False,
|
||||
# "type": "bool",
|
||||
# "help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter",
|
||||
# },
|
||||
# "print_options": {
|
||||
# "default": {},
|
||||
# "help": "options to pass to the pdf printer, in JSON format. See https://www.selenium.dev/documentation/webdriver/interactions/print_page/ for more information",
|
||||
# "type": "json_loader",
|
||||
# },
|
||||
},
|
||||
"description": """
|
||||
|
||||
Writes all the metadata to a json file so can be parsed by other tools.
|
||||
|
||||
""",
|
||||
}
|
||||
Reference in New Issue
Block a user