More manifests, base modules and rename from archiver to extractor.

This commit is contained in:
erinhmclark
2025-01-23 16:40:48 +00:00
parent 9db26cdfc2
commit 1274a1b231
93 changed files with 378 additions and 238 deletions

View File

@@ -0,0 +1,13 @@
m = {
"name": "HTML Formatter",
"type": ["formatter"],
"requires_setup": False,
"external_dependencies": {
"python": ["loguru", "jinja2"],
"bin": [""]
},
"configs": {
"detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"}
},
"description": """ """,
}

View File

@@ -0,0 +1,99 @@
from __future__ import annotations
from dataclasses import dataclass
import mimetypes, os, pathlib
from jinja2 import Environment, FileSystemLoader
from urllib.parse import quote
from loguru import logger
import json
import base64
from auto_archiver.version import __version__
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.base_modules import Formatter
from auto_archiver.modules.hash_enricher import HashEnricher
from auto_archiver.utils.misc import random_str
@dataclass
class HtmlFormatter(Formatter):
name = "html_formatter"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
self.environment = Environment(loader=FileSystemLoader(os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/")), autoescape=True)
# JinjaHelper class static methods are added as filters
self.environment.filters.update({
k: v.__func__ for k, v in JinjaHelpers.__dict__.items() if isinstance(v, staticmethod)
})
self.template = self.environment.get_template("html_template.html")
# @staticmethod
# def configs() -> dict:
# return {
# "detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"}
# }
def format(self, item: Metadata) -> Media:
url = item.get_url()
if item.is_empty():
logger.debug(f"[SKIP] FORMAT there is no media or metadata to format: {url=}")
return
content = self.template.render(
url=url,
title=item.get_title(),
media=item.media,
metadata=item.metadata,
version=__version__
)
html_path = os.path.join(ArchivingContext.get_tmp_dir(), f"formatted{random_str(24)}.html")
with open(html_path, mode="w", encoding="utf-8") as outf:
outf.write(content)
final_media = Media(filename=html_path, _mimetype="text/html")
he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": 1.6e7}})
if len(hd := he.calculate_hash(final_media.filename)):
final_media.set("hash", f"{he.algorithm}:{hd}")
return final_media
# JINJA helper filters
class JinjaHelpers:
@staticmethod
def is_list(v) -> bool:
return isinstance(v, list)
@staticmethod
def is_video(s: str) -> bool:
m = mimetypes.guess_type(s)[0]
return "video" in (m or "")
@staticmethod
def is_image(s: str) -> bool:
m = mimetypes.guess_type(s)[0]
return "image" in (m or "")
@staticmethod
def is_audio(s: str) -> bool:
m = mimetypes.guess_type(s)[0]
return "audio" in (m or "")
@staticmethod
def is_media(v) -> bool:
return isinstance(v, Media)
@staticmethod
def get_extension(filename: str) -> str:
return os.path.splitext(filename)[1]
@staticmethod
def quote(s: str) -> str:
return quote(s)
@staticmethod
def json_dump_b64(d: dict) -> str:
j = json.dumps(d, indent=4, default=str)
return base64.b64encode(j.encode()).decode()

View File

@@ -0,0 +1,332 @@
{# templates/results.html #}
{% import 'macros.html' as macros %}
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300italic,700,700italic">
<title>{{ url }}</title>
<style>
html {
font-family: 'Roboto', sans-serif;
}
table {
table-layout: fixed;
width: 90%;
}
table td {
word-wrap: break-word;
overflow-wrap: break-word;
padding: 5px;
}
table,
th,
td {
margin: auto;
border: 1px solid;
border-collapse: collapse;
vertical-align: top;
}
table.metadata td:first-child {
text-align: center;
}
table.content td:nth-child(2),
.center {
text-align: center;
}
.copy:hover {
background: aliceblue;
cursor: copy;
}
#notification {
position: fixed;
right: 20px;
top: 20px;
background: aquamarine;
box-shadow: 6px 8px 5px 0px #000000;
padding: 10px;
font-size: large;
display: none;
}
img,
video {
filter: gray;
-webkit-filter: grayscale(1);
filter: grayscale(1);
}
/* Disable grayscale on hover */
/* img:hover,
video:hover {
-webkit-filter: grayscale(0);
filter: none;
} */
.collapsible {
background-color: #777;
color: white;
cursor: pointer;
padding: 5px;
margin: 10px;
width: 100%;
border: none;
text-align: left;
outline: none;
font-size: 15px;
}
.active,
.collapsible:hover {
background-color: #555;
}
.collapsible-content {
padding: 0 18px;
display: none;
overflow: hidden;
background-color: #f1f1f1;
}
.pem-certificate, .text-preview {
text-align: left;
font-size: small;
}
.text-preview{
padding-left: 10px;
padding-right: 10px;
white-space: pre-wrap;
}
</style>
</head>
<body>
<div id="notification"></div>
<h2>Archived media for <span class="copy">{{ url }}</span> - <a href="{{ url }}">open</a></h2>
{% if title | string | length > 0 %}
<p><b>title:</b> '<span class="copy">{{ title }}</span>'</p>
{% endif %}
<h2 class="center">content {{ media | length }} item(s)</h2>
<form class="center">
<label>
<input type="checkbox" id="safe-media-view" checked>
Safe Media View
</label>
</form>
<table class="content">
<tr>
<th>about</th>
<th>files and preview</th>
</tr>
<tbody>
{% for m in media %}
<tr>
<td>
{{ macros.display_recursive(m, true) }}
</td>
<td>
{{ macros.display_media(m, true, url) }}
</td>
</tr>
{% endfor %}
</tbody>
</table>
<h2 class="center">metadata</h2>
<table class="metadata">
<tr>
<th>key</th>
<th>value</th>
</tr>
{% for key in metadata %}
<tr>
<td>{{ key }}</td>
<td>
{% if metadata[key] is mapping %}
<div class="center copy" copy-value64='{{metadata[key] | json_dump_b64}}'>Copy as JSON</div>
{% endif %}
{{ macros.copy_urlize(metadata[key]) }}
</td>
</tr>
{% endfor %}
</table>
<p class="center">Made with <a href="https://github.com/bellingcat/auto-archiver">bellingcat/auto-archiver</a>
v{{ version }}</p>
</body>
<script src="https://cdnjs.cloudflare.com/ajax/libs/forge/0.10.0/forge.min.js"></script>
<script defer>
// partial decode of SSL certificates
function decodeCertificate(sslCert) {
var cert = forge.pki.certificateFromPem(sslCert);
return `SSL CERTIFICATE PREVIEW:<br/><ul>
<li><b>Subject:</b> <span class="copy">${cert.subject.attributes.map(attr => `${attr.shortName}: ${attr.value}`).join(", ")}</span></li>
<li><b>Issuer:</b> <span class="copy">${cert.issuer.attributes.map(attr => `${attr.shortName}: ${attr.value}`).join(", ")}</span></li>
<li><b>Valid From:</b> <span class="copy">${cert.validity.notBefore}</span></li>
<li><b>Valid To:</b> <span class="copy">${cert.validity.notAfter}</span></li>
<li><b>Serial Number:</b> <span class="copy">${cert.serialNumber}</span></li>
</ul>`;
}
async function run() {
let setupFunctions = [
previewCertificates,
previewText,
enableCopyLogic,
enableCollapsibleLogic,
setupSafeView
];
setupFunctions.forEach(async f => {
try {
await f();
} catch (e) {
console.error(`Error in ${f.name}: ${e}`);
}
});
}
async function previewCertificates() {
await Promise.all(
Array.from(document.querySelectorAll(".pem-certificate")).map(async el => {
let certificate = await (await fetch(el.getAttribute("pem"))).text();
el.innerHTML = decodeCertificate(certificate);
let cyberChefUrl =
`https://gchq.github.io/CyberChef/#recipe=Parse_X.509_certificate('PEM')&input=${btoa(certificate)}`;
// create a new anchor with this url and append after the code
let a = document.createElement("a");
a.href = cyberChefUrl;
a.textContent = "Full certificate details";
el.parentElement.appendChild(a);
})
);
console.log("certificate preview done");
}
async function previewText() {
await Promise.all(
Array.from(document.querySelectorAll(".text-preview")).map(async el => {
let textContent = await (await fetch(el.getAttribute("url"))).text();
el.textContent = textContent;
})
);
console.log("text preview done");
}
// notification logic
const notification = document.getElementById("notification");
function showNotification(message, miliseconds) {
notification.style.display = "block";
notification.innerText = message;
setTimeout(() => {
notification.style.display = "none";
notification.innerText = "";
}, miliseconds || 1000)
}
// copy logic
async function enableCopyLogic() {
await Promise.all(
Array.from(document.querySelectorAll(".copy")).map(el => {
el.onclick = () => {
document.execCommand("copy");
}
el.addEventListener("copy", (e) => {
e.preventDefault();
if (e.clipboardData) {
if (el.hasAttribute("copy-value")) {
e.clipboardData.setData("text/plain", el.getAttribute("copy-value"));
} else if (el.hasAttribute("copy-value64")) {
// TODO: figure out how to decode unicode chars into utf-8
e.clipboardData.setData("text/plain", new String(atob(el.getAttribute(
"copy-value64"))));
} else {
e.clipboardData.setData("text/plain", el.textContent);
}
console.log(e.clipboardData.getData("text"))
showNotification("copied!")
}
})
})
)
console.log("copy logic enabled");
}
// collapsibles
async function enableCollapsibleLogic() {
let coll = document.getElementsByClassName("collapsible");
for (let i = 0; i < coll.length; i++) {
await new Promise(resolve => {
coll[i].addEventListener("click", function () {
this.classList.toggle("active");
// let content = this.nextElementSibling;
let content = this.parentElement.querySelector(".collapsible-content");
if (content.style.display === "block") {
content.style.display = "none";
} else {
content.style.display = "block";
}
});
resolve();
})
}
console.log("collapsible logic enabled");
}
async function setupSafeView() {
// logic for enabled/disabled greyscale
// Get references to the checkboxes and images/videos
const safeImageViewCheckbox = document.getElementById('safe-media-view');
const visualPreviews = document.querySelectorAll('img, video,embed');
// Function to toggle grayscale effect
function toggleGrayscale() {
visualPreviews.forEach(element => {
if (safeImageViewCheckbox.checked) {
// Enable grayscale effect
element.style.filter = 'grayscale(1)';
element.style.webkitFilter = 'grayscale(1)';
} else {
// Disable grayscale effect
element.style.filter = 'none';
element.style.webkitFilter = 'none';
}
});
}
// Add event listener to the checkbox to trigger the toggleGrayscale function
safeImageViewCheckbox.addEventListener('change', toggleGrayscale);
// Handle the hover effect using JavaScript
visualPreviews.forEach(element => {
element.addEventListener('mouseenter', () => {
// Disable grayscale effect on hover
element.style.filter = 'none';
element.style.webkitFilter = 'none';
});
element.addEventListener('mouseleave', () => {
// Re-enable grayscale effect if checkbox is checked
if (safeImageViewCheckbox.checked) {
element.style.filter = 'grayscale(1)';
element.style.webkitFilter = 'grayscale(1)';
}
});
});
toggleGrayscale();
console.log("grayscale logic enabled");
}
run();
</script>
</html>

View File

@@ -0,0 +1,151 @@
{% macro display_media(m, links, main_url) -%}
{% for url in m.urls %}
{% if url | length == 0 %}
No URL available for {{ m.key }}.
{% elif 'http://' in url or 'https://' in url or url.startswith('/') %}
{% if 'image' in m.mimetype %}
<div>
<a href="{{ url }}">
<img src="{{ url }}" style="max-height:400px;max-width:400px;"></img>
</a>
<div>
Reverse Image Search:&nbsp;
<a href="https://www.google.com/searchbyimage?sbisrc=4chanx&image_url={{ url | quote }}&safe=off">Google</a>,&nbsp;
<a href="https://lens.google.com/uploadbyurl?url={{ url | quote }}">Google Lens</a>,&nbsp;
<a href="https://yandex.ru/images/touch/search?rpt=imageview&url={{ url | quote }}">Yandex</a>,&nbsp;
<a href="https://www.bing.com/images/search?view=detailv2&iss=sbi&form=SBIVSP&sbisrc=UrlPaste&q=imgurl:{{ url | quote }}">Bing</a>,&nbsp;
<a href="https://www.tineye.com/search/?url={{ url | quote }}">Tineye</a>
</div>
<div>
Image Forensics:&nbsp;
<a href="https://fotoforensics.com/?url={{ url | quote }}">FotoForensics</a>,&nbsp;
<a href="https://mever.iti.gr/forensics/?image={{ url }}">Media Verification Assistant</a>
</div>
<p></p>
</div>
{% elif 'video' in m.mimetype %}
<div>
<video src="{{ url }}" controls style="max-height:400px;max-width:600px;">
Your browser does not support the video element.
</video>
</div>
{% elif 'application/pdf' in m.mimetype %}
<div>
<embed src="{{ url }}" width="100%" height="400px"/>
</div>
{% elif 'audio' in m.mimetype %}
<div>
<audio controls>
<source src="{{ url }}" type="{{ m.mimetype }}">
Your browser does not support the audio element.
</audio>
</div>
{% elif m.filename | get_extension == ".wacz" %}
<a href="https://replayweb.page/?source={{ url | quote }}#view=pages&url={{ main_url }}">replayweb</a>
{% elif m.filename | get_extension == ".pem" %}
<code class="pem-certificate" pem="{{url}}"></code>
{% elif 'text' in m.mimetype %}
<div>PREVIEW:<br/><code><pre class="text-preview" url="{{url}}"></pre></code></div>
{% else %}
No preview available for <code>{{ m.key }}</code>.
{% endif %}
{% else %}
{{ m.url | urlize }}
{% endif %}
{% if links %}
<a href="{{ url }}">open</a> or
<a href="{{ url }}" download="">download</a> or
{{ copy_urlize(url, "copy") }}
<br>
{% endif %}
{% endfor %}
{%- endmacro -%}
{% macro copy_urlize(val, href_text) -%}
{% if val | is_list %}
{% for item in val %}
{{ copy_urlize(item) }}
{% endfor %}
{% elif val is mapping %}
<ul>
{% for key in val %}
<li>
<b>{{ key }}:</b> {{ copy_urlize(val[key]) }}
</li>
{% endfor %}
</ul>
{% else %}
{% if href_text | length == 0 %}
<span class="copy">{{ val | string | urlize }}</span>
{% else %}
<span class="copy" copy-value="{{val}}">{{ href_text | string | urlize }}</span>
{% endif %}
{% endif %}
{%- endmacro -%}
{% macro display_recursive(prop, skip_display) -%}
{% if prop is mapping %}
<div class="center copy" copy-value64='{{prop | json_dump_b64}}'>Copy as JSON</div>
<ul>
{% for subprop in prop %}
<li>
<b>{{ subprop }}:</b>
{{ display_recursive(prop[subprop]) }}
</li>
{% endfor %}
</ul>
{% elif prop | is_list %}
{% for item in prop %}
<li>
{{ display_recursive(item) }}
</li>
{% endfor %}
{% elif prop | is_media %}
{% if not skip_display %}
{{ display_media(prop, true) }}
{% endif %}
<ul>
<li><b>key:</b> <span class="copy">{{ prop.key }}</span></li>
<li><b>type:</b> <span class="copy">{{ prop.mimetype }}</span></li>
{% for subprop in prop.properties %}
{% if prop.properties[subprop] | is_list %}
<p></p>
<div>
<b class="collapsible" title="expand">{{ subprop }} ({{ prop.properties[subprop] | length }}):</b>
<p></p>
<div class="collapsible-content">
{% for subsubprop in prop.properties[subprop] %}
{{ display_recursive(subsubprop) }}
{% endfor %}
</div>
</div>
<p></p>
{% elif prop.properties[subprop] | string | length > 1 %}
<li><b>{{ subprop }}:</b> {{ copy_urlize(prop.properties[subprop]) }}</li>
{% endif %}
{% endfor %}
</ul>
{% else %}
{{ copy_urlize(prop) }}
{% endif %}
{%- endmacro -%}